Commit 08bafc0341f2f7920e9045bc32c40299cac8c21b

Authored by Keith Mannthey
Committed by Jens Axboe
1 parent 7c239517d9

block: Suppress Buffer I/O errors when SCSI REQ_QUIET flag set

Allow the SCSI request REQ_QUIET flag to be propagated to the buffer
(file system) layer. The basic idea is to pass the flag from the SCSI
request to the bio (block I/O) and then to the buffer layer, which can
then suppress needless printks.
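
For illustration, here is a minimal sketch of the buffer-layer half of
the idea, i.e. mirroring the bio flag onto the buffer_head and consulting
it before logging. quiet_error() is the helper referenced later in this
message; BH_Quiet and bh_note_quiet() are names assumed for the sketch:

/* Sketch: when a bio completes, carry its quiet flag over to the
 * buffer_head so the buffer error path can see it. */
static void bh_note_quiet(struct buffer_head *bh, struct bio *bio)
{
	if (unlikely(test_bit(BIO_QUIET, &bio->bi_flags)))
		set_bit(BH_Quiet, &bh->b_state);
}

/* Sketch: decide whether a "Buffer I/O error" printk should be emitted;
 * stay quiet if the flag is set or we are being rate limited. */
static int quiet_error(struct buffer_head *bh)
{
	if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
		return 0;
	return 1;
}

Callers in the buffer layer would then do
"if (!quiet_error(bh)) buffer_io_error(bh);" instead of printing the
error unconditionally.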

This patch declutters the kernel log by removing the 40-50 (per LUN)
buffer I/O error messages seen during a boot in my multipath setup.
Without this patch there is a good chance that any real errors will be
missed in the "noise" in the logs.

During boot I see blocks of messages like
"
__ratelimit: 211 callbacks suppressed
Buffer I/O error on device sdm, logical block 5242879
Buffer I/O error on device sdm, logical block 5242879
Buffer I/O error on device sdm, logical block 5242847
Buffer I/O error on device sdm, logical block 1
Buffer I/O error on device sdm, logical block 5242878
Buffer I/O error on device sdm, logical block 5242879
Buffer I/O error on device sdm, logical block 5242879
Buffer I/O error on device sdm, logical block 5242879
Buffer I/O error on device sdm, logical block 5242879
Buffer I/O error on device sdm, logical block 5242872
"
in my logs.

My disk environment is multipath Fibre Channel using the SCSI_DH_RDAC
code and multipathd. This topology includes an "active" and a "ghost"
path for each LUN. I/Os to the "ghost" path will never complete, and the
SCSI layer, via the RDAC SCSI device handler code, quickly returns the
I/Os issued to these paths and sets the REQ_QUIET flag to suppress the
SCSI layer messages.
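
The device-handler end of this chain only needs to tag the request before
failing it. As a rough sketch (the prep_fn shape, get_rdac_data() and the
state names are assumptions for illustration, not a quote of the
scsi_dh_rdac code):

static int rdac_prep_fn(struct scsi_device *sdev, struct request *req)
{
	struct rdac_dh_data *h = get_rdac_data(sdev);	/* assumed helper */

	if (h->state != RDAC_STATE_ACTIVE) {
		/* Quietly fail I/O sent down the passive ("ghost") path. */
		req->cmd_flags |= REQ_QUIET;
		return BLKPREP_KILL;
	}
	return BLKPREP_OK;
}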

I want to extend the QUIET behavior to the buffer (file system) layer so
that it handles these errors as well. I have been running this patch for
a while now on several boxes without issue. A few runs of bonnie++ show
no noticeable difference in performance in my setup.

Thanks to John Stultz for the quiet_error finalization.

Submitted-by:  Keith Mannthey <kmannth@us.ibm.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

Showing 4 changed files with 20 additions and 4 deletions. Inline diff of block/blk-core.c:

1 /* 1 /*
2 * Copyright (C) 1991, 1992 Linus Torvalds 2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics 3 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics
4 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE 4 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de> 5 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
6 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> 6 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au>
7 * - July2000 7 * - July2000
8 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001 8 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
9 */ 9 */
10 10
11 /* 11 /*
12 * This handles all read/write requests to block devices 12 * This handles all read/write requests to block devices
13 */ 13 */
14 #include <linux/kernel.h> 14 #include <linux/kernel.h>
15 #include <linux/module.h> 15 #include <linux/module.h>
16 #include <linux/backing-dev.h> 16 #include <linux/backing-dev.h>
17 #include <linux/bio.h> 17 #include <linux/bio.h>
18 #include <linux/blkdev.h> 18 #include <linux/blkdev.h>
19 #include <linux/highmem.h> 19 #include <linux/highmem.h>
20 #include <linux/mm.h> 20 #include <linux/mm.h>
21 #include <linux/kernel_stat.h> 21 #include <linux/kernel_stat.h>
22 #include <linux/string.h> 22 #include <linux/string.h>
23 #include <linux/init.h> 23 #include <linux/init.h>
24 #include <linux/completion.h> 24 #include <linux/completion.h>
25 #include <linux/slab.h> 25 #include <linux/slab.h>
26 #include <linux/swap.h> 26 #include <linux/swap.h>
27 #include <linux/writeback.h> 27 #include <linux/writeback.h>
28 #include <linux/task_io_accounting_ops.h> 28 #include <linux/task_io_accounting_ops.h>
29 #include <linux/blktrace_api.h> 29 #include <linux/blktrace_api.h>
30 #include <linux/fault-inject.h> 30 #include <linux/fault-inject.h>
31 #include <trace/block.h> 31 #include <trace/block.h>
32 32
33 #include "blk.h" 33 #include "blk.h"
34 34
35 DEFINE_TRACE(block_plug); 35 DEFINE_TRACE(block_plug);
36 DEFINE_TRACE(block_unplug_io); 36 DEFINE_TRACE(block_unplug_io);
37 DEFINE_TRACE(block_unplug_timer); 37 DEFINE_TRACE(block_unplug_timer);
38 DEFINE_TRACE(block_getrq); 38 DEFINE_TRACE(block_getrq);
39 DEFINE_TRACE(block_sleeprq); 39 DEFINE_TRACE(block_sleeprq);
40 DEFINE_TRACE(block_rq_requeue); 40 DEFINE_TRACE(block_rq_requeue);
41 DEFINE_TRACE(block_bio_backmerge); 41 DEFINE_TRACE(block_bio_backmerge);
42 DEFINE_TRACE(block_bio_frontmerge); 42 DEFINE_TRACE(block_bio_frontmerge);
43 DEFINE_TRACE(block_bio_queue); 43 DEFINE_TRACE(block_bio_queue);
44 DEFINE_TRACE(block_rq_complete); 44 DEFINE_TRACE(block_rq_complete);
45 DEFINE_TRACE(block_remap); /* Also used in drivers/md/dm.c */ 45 DEFINE_TRACE(block_remap); /* Also used in drivers/md/dm.c */
46 EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap); 46 EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap);
47 47
48 static int __make_request(struct request_queue *q, struct bio *bio); 48 static int __make_request(struct request_queue *q, struct bio *bio);
49 49
50 /* 50 /*
51 * For the allocated request tables 51 * For the allocated request tables
52 */ 52 */
53 static struct kmem_cache *request_cachep; 53 static struct kmem_cache *request_cachep;
54 54
55 /* 55 /*
56 * For queue allocation 56 * For queue allocation
57 */ 57 */
58 struct kmem_cache *blk_requestq_cachep; 58 struct kmem_cache *blk_requestq_cachep;
59 59
60 /* 60 /*
61 * Controlling structure to kblockd 61 * Controlling structure to kblockd
62 */ 62 */
63 static struct workqueue_struct *kblockd_workqueue; 63 static struct workqueue_struct *kblockd_workqueue;
64 64
65 static void drive_stat_acct(struct request *rq, int new_io) 65 static void drive_stat_acct(struct request *rq, int new_io)
66 { 66 {
67 struct hd_struct *part; 67 struct hd_struct *part;
68 int rw = rq_data_dir(rq); 68 int rw = rq_data_dir(rq);
69 int cpu; 69 int cpu;
70 70
71 if (!blk_fs_request(rq) || !rq->rq_disk) 71 if (!blk_fs_request(rq) || !rq->rq_disk)
72 return; 72 return;
73 73
74 cpu = part_stat_lock(); 74 cpu = part_stat_lock();
75 part = disk_map_sector_rcu(rq->rq_disk, rq->sector); 75 part = disk_map_sector_rcu(rq->rq_disk, rq->sector);
76 76
77 if (!new_io) 77 if (!new_io)
78 part_stat_inc(cpu, part, merges[rw]); 78 part_stat_inc(cpu, part, merges[rw]);
79 else { 79 else {
80 part_round_stats(cpu, part); 80 part_round_stats(cpu, part);
81 part_inc_in_flight(part); 81 part_inc_in_flight(part);
82 } 82 }
83 83
84 part_stat_unlock(); 84 part_stat_unlock();
85 } 85 }
86 86
87 void blk_queue_congestion_threshold(struct request_queue *q) 87 void blk_queue_congestion_threshold(struct request_queue *q)
88 { 88 {
89 int nr; 89 int nr;
90 90
91 nr = q->nr_requests - (q->nr_requests / 8) + 1; 91 nr = q->nr_requests - (q->nr_requests / 8) + 1;
92 if (nr > q->nr_requests) 92 if (nr > q->nr_requests)
93 nr = q->nr_requests; 93 nr = q->nr_requests;
94 q->nr_congestion_on = nr; 94 q->nr_congestion_on = nr;
95 95
96 nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1; 96 nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
97 if (nr < 1) 97 if (nr < 1)
98 nr = 1; 98 nr = 1;
99 q->nr_congestion_off = nr; 99 q->nr_congestion_off = nr;
100 } 100 }
101 101
102 /** 102 /**
103 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info 103 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
104 * @bdev: device 104 * @bdev: device
105 * 105 *
106 * Locates the passed device's request queue and returns the address of its 106 * Locates the passed device's request queue and returns the address of its
107 * backing_dev_info 107 * backing_dev_info
108 * 108 *
109 * Will return NULL if the request queue cannot be located. 109 * Will return NULL if the request queue cannot be located.
110 */ 110 */
111 struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) 111 struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
112 { 112 {
113 struct backing_dev_info *ret = NULL; 113 struct backing_dev_info *ret = NULL;
114 struct request_queue *q = bdev_get_queue(bdev); 114 struct request_queue *q = bdev_get_queue(bdev);
115 115
116 if (q) 116 if (q)
117 ret = &q->backing_dev_info; 117 ret = &q->backing_dev_info;
118 return ret; 118 return ret;
119 } 119 }
120 EXPORT_SYMBOL(blk_get_backing_dev_info); 120 EXPORT_SYMBOL(blk_get_backing_dev_info);
121 121
122 void blk_rq_init(struct request_queue *q, struct request *rq) 122 void blk_rq_init(struct request_queue *q, struct request *rq)
123 { 123 {
124 memset(rq, 0, sizeof(*rq)); 124 memset(rq, 0, sizeof(*rq));
125 125
126 INIT_LIST_HEAD(&rq->queuelist); 126 INIT_LIST_HEAD(&rq->queuelist);
127 INIT_LIST_HEAD(&rq->timeout_list); 127 INIT_LIST_HEAD(&rq->timeout_list);
128 rq->cpu = -1; 128 rq->cpu = -1;
129 rq->q = q; 129 rq->q = q;
130 rq->sector = rq->hard_sector = (sector_t) -1; 130 rq->sector = rq->hard_sector = (sector_t) -1;
131 INIT_HLIST_NODE(&rq->hash); 131 INIT_HLIST_NODE(&rq->hash);
132 RB_CLEAR_NODE(&rq->rb_node); 132 RB_CLEAR_NODE(&rq->rb_node);
133 rq->cmd = rq->__cmd; 133 rq->cmd = rq->__cmd;
134 rq->tag = -1; 134 rq->tag = -1;
135 rq->ref_count = 1; 135 rq->ref_count = 1;
136 } 136 }
137 EXPORT_SYMBOL(blk_rq_init); 137 EXPORT_SYMBOL(blk_rq_init);
138 138
139 static void req_bio_endio(struct request *rq, struct bio *bio, 139 static void req_bio_endio(struct request *rq, struct bio *bio,
140 unsigned int nbytes, int error) 140 unsigned int nbytes, int error)
141 { 141 {
142 struct request_queue *q = rq->q; 142 struct request_queue *q = rq->q;
143 143
144 if (&q->bar_rq != rq) { 144 if (&q->bar_rq != rq) {
145 if (error) 145 if (error)
146 clear_bit(BIO_UPTODATE, &bio->bi_flags); 146 clear_bit(BIO_UPTODATE, &bio->bi_flags);
147 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 147 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
148 error = -EIO; 148 error = -EIO;
149 149
150 if (unlikely(nbytes > bio->bi_size)) { 150 if (unlikely(nbytes > bio->bi_size)) {
151 printk(KERN_ERR "%s: want %u bytes done, %u left\n", 151 printk(KERN_ERR "%s: want %u bytes done, %u left\n",
152 __func__, nbytes, bio->bi_size); 152 __func__, nbytes, bio->bi_size);
153 nbytes = bio->bi_size; 153 nbytes = bio->bi_size;
154 } 154 }
155 155
156 if (unlikely(rq->cmd_flags & REQ_QUIET))
157 set_bit(BIO_QUIET, &bio->bi_flags);
158
156 bio->bi_size -= nbytes; 159 bio->bi_size -= nbytes;
157 bio->bi_sector += (nbytes >> 9); 160 bio->bi_sector += (nbytes >> 9);
158 161
159 if (bio_integrity(bio)) 162 if (bio_integrity(bio))
160 bio_integrity_advance(bio, nbytes); 163 bio_integrity_advance(bio, nbytes);
161 164
162 if (bio->bi_size == 0) 165 if (bio->bi_size == 0)
163 bio_endio(bio, error); 166 bio_endio(bio, error);
164 } else { 167 } else {
165 168
166 /* 169 /*
167 * Okay, this is the barrier request in progress, just 170 * Okay, this is the barrier request in progress, just
168 * record the error; 171 * record the error;
169 */ 172 */
170 if (error && !q->orderr) 173 if (error && !q->orderr)
171 q->orderr = error; 174 q->orderr = error;
172 } 175 }
173 } 176 }
174 177
175 void blk_dump_rq_flags(struct request *rq, char *msg) 178 void blk_dump_rq_flags(struct request *rq, char *msg)
176 { 179 {
177 int bit; 180 int bit;
178 181
179 printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg, 182 printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg,
180 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type, 183 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type,
181 rq->cmd_flags); 184 rq->cmd_flags);
182 185
183 printk(KERN_INFO " sector %llu, nr/cnr %lu/%u\n", 186 printk(KERN_INFO " sector %llu, nr/cnr %lu/%u\n",
184 (unsigned long long)rq->sector, 187 (unsigned long long)rq->sector,
185 rq->nr_sectors, 188 rq->nr_sectors,
186 rq->current_nr_sectors); 189 rq->current_nr_sectors);
187 printk(KERN_INFO " bio %p, biotail %p, buffer %p, data %p, len %u\n", 190 printk(KERN_INFO " bio %p, biotail %p, buffer %p, data %p, len %u\n",
188 rq->bio, rq->biotail, 191 rq->bio, rq->biotail,
189 rq->buffer, rq->data, 192 rq->buffer, rq->data,
190 rq->data_len); 193 rq->data_len);
191 194
192 if (blk_pc_request(rq)) { 195 if (blk_pc_request(rq)) {
193 printk(KERN_INFO " cdb: "); 196 printk(KERN_INFO " cdb: ");
194 for (bit = 0; bit < BLK_MAX_CDB; bit++) 197 for (bit = 0; bit < BLK_MAX_CDB; bit++)
195 printk("%02x ", rq->cmd[bit]); 198 printk("%02x ", rq->cmd[bit]);
196 printk("\n"); 199 printk("\n");
197 } 200 }
198 } 201 }
199 EXPORT_SYMBOL(blk_dump_rq_flags); 202 EXPORT_SYMBOL(blk_dump_rq_flags);
200 203
201 /* 204 /*
202 * "plug" the device if there are no outstanding requests: this will 205 * "plug" the device if there are no outstanding requests: this will
203 * force the transfer to start only after we have put all the requests 206 * force the transfer to start only after we have put all the requests
204 * on the list. 207 * on the list.
205 * 208 *
206 * This is called with interrupts off and no requests on the queue and 209 * This is called with interrupts off and no requests on the queue and
207 * with the queue lock held. 210 * with the queue lock held.
208 */ 211 */
209 void blk_plug_device(struct request_queue *q) 212 void blk_plug_device(struct request_queue *q)
210 { 213 {
211 WARN_ON(!irqs_disabled()); 214 WARN_ON(!irqs_disabled());
212 215
213 /* 216 /*
214 * don't plug a stopped queue, it must be paired with blk_start_queue() 217 * don't plug a stopped queue, it must be paired with blk_start_queue()
215 * which will restart the queueing 218 * which will restart the queueing
216 */ 219 */
217 if (blk_queue_stopped(q)) 220 if (blk_queue_stopped(q))
218 return; 221 return;
219 222
220 if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) { 223 if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) {
221 mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); 224 mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
222 trace_block_plug(q); 225 trace_block_plug(q);
223 } 226 }
224 } 227 }
225 EXPORT_SYMBOL(blk_plug_device); 228 EXPORT_SYMBOL(blk_plug_device);
226 229
227 /** 230 /**
228 * blk_plug_device_unlocked - plug a device without queue lock held 231 * blk_plug_device_unlocked - plug a device without queue lock held
229 * @q: The &struct request_queue to plug 232 * @q: The &struct request_queue to plug
230 * 233 *
231 * Description: 234 * Description:
232 * Like @blk_plug_device(), but grabs the queue lock and disables 235 * Like @blk_plug_device(), but grabs the queue lock and disables
233 * interrupts. 236 * interrupts.
234 **/ 237 **/
235 void blk_plug_device_unlocked(struct request_queue *q) 238 void blk_plug_device_unlocked(struct request_queue *q)
236 { 239 {
237 unsigned long flags; 240 unsigned long flags;
238 241
239 spin_lock_irqsave(q->queue_lock, flags); 242 spin_lock_irqsave(q->queue_lock, flags);
240 blk_plug_device(q); 243 blk_plug_device(q);
241 spin_unlock_irqrestore(q->queue_lock, flags); 244 spin_unlock_irqrestore(q->queue_lock, flags);
242 } 245 }
243 EXPORT_SYMBOL(blk_plug_device_unlocked); 246 EXPORT_SYMBOL(blk_plug_device_unlocked);
244 247
245 /* 248 /*
246 * remove the queue from the plugged list, if present. called with 249 * remove the queue from the plugged list, if present. called with
247 * queue lock held and interrupts disabled. 250 * queue lock held and interrupts disabled.
248 */ 251 */
249 int blk_remove_plug(struct request_queue *q) 252 int blk_remove_plug(struct request_queue *q)
250 { 253 {
251 WARN_ON(!irqs_disabled()); 254 WARN_ON(!irqs_disabled());
252 255
253 if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q)) 256 if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q))
254 return 0; 257 return 0;
255 258
256 del_timer(&q->unplug_timer); 259 del_timer(&q->unplug_timer);
257 return 1; 260 return 1;
258 } 261 }
259 EXPORT_SYMBOL(blk_remove_plug); 262 EXPORT_SYMBOL(blk_remove_plug);
260 263
261 /* 264 /*
262 * remove the plug and let it rip.. 265 * remove the plug and let it rip..
263 */ 266 */
264 void __generic_unplug_device(struct request_queue *q) 267 void __generic_unplug_device(struct request_queue *q)
265 { 268 {
266 if (unlikely(blk_queue_stopped(q))) 269 if (unlikely(blk_queue_stopped(q)))
267 return; 270 return;
268 271
269 if (!blk_remove_plug(q)) 272 if (!blk_remove_plug(q))
270 return; 273 return;
271 274
272 q->request_fn(q); 275 q->request_fn(q);
273 } 276 }
274 277
275 /** 278 /**
276 * generic_unplug_device - fire a request queue 279 * generic_unplug_device - fire a request queue
277 * @q: The &struct request_queue in question 280 * @q: The &struct request_queue in question
278 * 281 *
279 * Description: 282 * Description:
280 * Linux uses plugging to build bigger requests queues before letting 283 * Linux uses plugging to build bigger requests queues before letting
281 * the device have at them. If a queue is plugged, the I/O scheduler 284 * the device have at them. If a queue is plugged, the I/O scheduler
282 * is still adding and merging requests on the queue. Once the queue 285 * is still adding and merging requests on the queue. Once the queue
283 * gets unplugged, the request_fn defined for the queue is invoked and 286 * gets unplugged, the request_fn defined for the queue is invoked and
284 * transfers started. 287 * transfers started.
285 **/ 288 **/
286 void generic_unplug_device(struct request_queue *q) 289 void generic_unplug_device(struct request_queue *q)
287 { 290 {
288 if (blk_queue_plugged(q)) { 291 if (blk_queue_plugged(q)) {
289 spin_lock_irq(q->queue_lock); 292 spin_lock_irq(q->queue_lock);
290 __generic_unplug_device(q); 293 __generic_unplug_device(q);
291 spin_unlock_irq(q->queue_lock); 294 spin_unlock_irq(q->queue_lock);
292 } 295 }
293 } 296 }
294 EXPORT_SYMBOL(generic_unplug_device); 297 EXPORT_SYMBOL(generic_unplug_device);
295 298
296 static void blk_backing_dev_unplug(struct backing_dev_info *bdi, 299 static void blk_backing_dev_unplug(struct backing_dev_info *bdi,
297 struct page *page) 300 struct page *page)
298 { 301 {
299 struct request_queue *q = bdi->unplug_io_data; 302 struct request_queue *q = bdi->unplug_io_data;
300 303
301 blk_unplug(q); 304 blk_unplug(q);
302 } 305 }
303 306
304 void blk_unplug_work(struct work_struct *work) 307 void blk_unplug_work(struct work_struct *work)
305 { 308 {
306 struct request_queue *q = 309 struct request_queue *q =
307 container_of(work, struct request_queue, unplug_work); 310 container_of(work, struct request_queue, unplug_work);
308 311
309 trace_block_unplug_io(q); 312 trace_block_unplug_io(q);
310 q->unplug_fn(q); 313 q->unplug_fn(q);
311 } 314 }
312 315
313 void blk_unplug_timeout(unsigned long data) 316 void blk_unplug_timeout(unsigned long data)
314 { 317 {
315 struct request_queue *q = (struct request_queue *)data; 318 struct request_queue *q = (struct request_queue *)data;
316 319
317 trace_block_unplug_timer(q); 320 trace_block_unplug_timer(q);
318 kblockd_schedule_work(q, &q->unplug_work); 321 kblockd_schedule_work(q, &q->unplug_work);
319 } 322 }
320 323
321 void blk_unplug(struct request_queue *q) 324 void blk_unplug(struct request_queue *q)
322 { 325 {
323 /* 326 /*
324 * devices don't necessarily have an ->unplug_fn defined 327 * devices don't necessarily have an ->unplug_fn defined
325 */ 328 */
326 if (q->unplug_fn) { 329 if (q->unplug_fn) {
327 trace_block_unplug_io(q); 330 trace_block_unplug_io(q);
328 q->unplug_fn(q); 331 q->unplug_fn(q);
329 } 332 }
330 } 333 }
331 EXPORT_SYMBOL(blk_unplug); 334 EXPORT_SYMBOL(blk_unplug);
332 335
333 static void blk_invoke_request_fn(struct request_queue *q) 336 static void blk_invoke_request_fn(struct request_queue *q)
334 { 337 {
335 if (unlikely(blk_queue_stopped(q))) 338 if (unlikely(blk_queue_stopped(q)))
336 return; 339 return;
337 340
338 /* 341 /*
339 * one level of recursion is ok and is much faster than kicking 342 * one level of recursion is ok and is much faster than kicking
340 * the unplug handling 343 * the unplug handling
341 */ 344 */
342 if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { 345 if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
343 q->request_fn(q); 346 q->request_fn(q);
344 queue_flag_clear(QUEUE_FLAG_REENTER, q); 347 queue_flag_clear(QUEUE_FLAG_REENTER, q);
345 } else { 348 } else {
346 queue_flag_set(QUEUE_FLAG_PLUGGED, q); 349 queue_flag_set(QUEUE_FLAG_PLUGGED, q);
347 kblockd_schedule_work(q, &q->unplug_work); 350 kblockd_schedule_work(q, &q->unplug_work);
348 } 351 }
349 } 352 }
350 353
351 /** 354 /**
352 * blk_start_queue - restart a previously stopped queue 355 * blk_start_queue - restart a previously stopped queue
353 * @q: The &struct request_queue in question 356 * @q: The &struct request_queue in question
354 * 357 *
355 * Description: 358 * Description:
356 * blk_start_queue() will clear the stop flag on the queue, and call 359 * blk_start_queue() will clear the stop flag on the queue, and call
357 * the request_fn for the queue if it was in a stopped state when 360 * the request_fn for the queue if it was in a stopped state when
358 * entered. Also see blk_stop_queue(). Queue lock must be held. 361 * entered. Also see blk_stop_queue(). Queue lock must be held.
359 **/ 362 **/
360 void blk_start_queue(struct request_queue *q) 363 void blk_start_queue(struct request_queue *q)
361 { 364 {
362 WARN_ON(!irqs_disabled()); 365 WARN_ON(!irqs_disabled());
363 366
364 queue_flag_clear(QUEUE_FLAG_STOPPED, q); 367 queue_flag_clear(QUEUE_FLAG_STOPPED, q);
365 blk_invoke_request_fn(q); 368 blk_invoke_request_fn(q);
366 } 369 }
367 EXPORT_SYMBOL(blk_start_queue); 370 EXPORT_SYMBOL(blk_start_queue);
368 371
369 /** 372 /**
370 * blk_stop_queue - stop a queue 373 * blk_stop_queue - stop a queue
371 * @q: The &struct request_queue in question 374 * @q: The &struct request_queue in question
372 * 375 *
373 * Description: 376 * Description:
374 * The Linux block layer assumes that a block driver will consume all 377 * The Linux block layer assumes that a block driver will consume all
375 * entries on the request queue when the request_fn strategy is called. 378 * entries on the request queue when the request_fn strategy is called.
376 * Often this will not happen, because of hardware limitations (queue 379 * Often this will not happen, because of hardware limitations (queue
377 * depth settings). If a device driver gets a 'queue full' response, 380 * depth settings). If a device driver gets a 'queue full' response,
378 * or if it simply chooses not to queue more I/O at one point, it can 381 * or if it simply chooses not to queue more I/O at one point, it can
379 * call this function to prevent the request_fn from being called until 382 * call this function to prevent the request_fn from being called until
380 * the driver has signalled it's ready to go again. This happens by calling 383 * the driver has signalled it's ready to go again. This happens by calling
381 * blk_start_queue() to restart queue operations. Queue lock must be held. 384 * blk_start_queue() to restart queue operations. Queue lock must be held.
382 **/ 385 **/
383 void blk_stop_queue(struct request_queue *q) 386 void blk_stop_queue(struct request_queue *q)
384 { 387 {
385 blk_remove_plug(q); 388 blk_remove_plug(q);
386 queue_flag_set(QUEUE_FLAG_STOPPED, q); 389 queue_flag_set(QUEUE_FLAG_STOPPED, q);
387 } 390 }
388 EXPORT_SYMBOL(blk_stop_queue); 391 EXPORT_SYMBOL(blk_stop_queue);
389 392
390 /** 393 /**
391 * blk_sync_queue - cancel any pending callbacks on a queue 394 * blk_sync_queue - cancel any pending callbacks on a queue
392 * @q: the queue 395 * @q: the queue
393 * 396 *
394 * Description: 397 * Description:
395 * The block layer may perform asynchronous callback activity 398 * The block layer may perform asynchronous callback activity
396 * on a queue, such as calling the unplug function after a timeout. 399 * on a queue, such as calling the unplug function after a timeout.
397 * A block device may call blk_sync_queue to ensure that any 400 * A block device may call blk_sync_queue to ensure that any
398 * such activity is cancelled, thus allowing it to release resources 401 * such activity is cancelled, thus allowing it to release resources
399 * that the callbacks might use. The caller must already have made sure 402 * that the callbacks might use. The caller must already have made sure
400 * that its ->make_request_fn will not re-add plugging prior to calling 403 * that its ->make_request_fn will not re-add plugging prior to calling
401 * this function. 404 * this function.
402 * 405 *
403 */ 406 */
404 void blk_sync_queue(struct request_queue *q) 407 void blk_sync_queue(struct request_queue *q)
405 { 408 {
406 del_timer_sync(&q->unplug_timer); 409 del_timer_sync(&q->unplug_timer);
407 del_timer_sync(&q->timeout); 410 del_timer_sync(&q->timeout);
408 kblockd_flush_work(&q->unplug_work); 411 kblockd_flush_work(&q->unplug_work);
409 } 412 }
410 EXPORT_SYMBOL(blk_sync_queue); 413 EXPORT_SYMBOL(blk_sync_queue);
411 414
412 /** 415 /**
413 * __blk_run_queue - run a single device queue 416 * __blk_run_queue - run a single device queue
414 * @q: The queue to run 417 * @q: The queue to run
415 * 418 *
416 * Description: 419 * Description:
417 * See @blk_run_queue. This variant must be called with the queue lock 420 * See @blk_run_queue. This variant must be called with the queue lock
418 * held and interrupts disabled. 421 * held and interrupts disabled.
419 * 422 *
420 */ 423 */
421 void __blk_run_queue(struct request_queue *q) 424 void __blk_run_queue(struct request_queue *q)
422 { 425 {
423 blk_remove_plug(q); 426 blk_remove_plug(q);
424 427
425 /* 428 /*
426 * Only recurse once to avoid overrunning the stack, let the unplug 429 * Only recurse once to avoid overrunning the stack, let the unplug
427 * handling reinvoke the handler shortly if we already got there. 430 * handling reinvoke the handler shortly if we already got there.
428 */ 431 */
429 if (!elv_queue_empty(q)) 432 if (!elv_queue_empty(q))
430 blk_invoke_request_fn(q); 433 blk_invoke_request_fn(q);
431 } 434 }
432 EXPORT_SYMBOL(__blk_run_queue); 435 EXPORT_SYMBOL(__blk_run_queue);
433 436
434 /** 437 /**
435 * blk_run_queue - run a single device queue 438 * blk_run_queue - run a single device queue
436 * @q: The queue to run 439 * @q: The queue to run
437 * 440 *
438 * Description: 441 * Description:
439 * Invoke request handling on this queue, if it has pending work to do. 442 * Invoke request handling on this queue, if it has pending work to do.
440 * May be used to restart queueing when a request has completed. Also 443 * May be used to restart queueing when a request has completed. Also
441 * See @blk_start_queueing. 444 * See @blk_start_queueing.
442 * 445 *
443 */ 446 */
444 void blk_run_queue(struct request_queue *q) 447 void blk_run_queue(struct request_queue *q)
445 { 448 {
446 unsigned long flags; 449 unsigned long flags;
447 450
448 spin_lock_irqsave(q->queue_lock, flags); 451 spin_lock_irqsave(q->queue_lock, flags);
449 __blk_run_queue(q); 452 __blk_run_queue(q);
450 spin_unlock_irqrestore(q->queue_lock, flags); 453 spin_unlock_irqrestore(q->queue_lock, flags);
451 } 454 }
452 EXPORT_SYMBOL(blk_run_queue); 455 EXPORT_SYMBOL(blk_run_queue);
453 456
454 void blk_put_queue(struct request_queue *q) 457 void blk_put_queue(struct request_queue *q)
455 { 458 {
456 kobject_put(&q->kobj); 459 kobject_put(&q->kobj);
457 } 460 }
458 461
459 void blk_cleanup_queue(struct request_queue *q) 462 void blk_cleanup_queue(struct request_queue *q)
460 { 463 {
461 /* 464 /*
462 * We know we have process context here, so we can be a little 465 * We know we have process context here, so we can be a little
463 * cautious and ensure that pending block actions on this device 466 * cautious and ensure that pending block actions on this device
464 * are done before moving on. Going into this function, we should 467 * are done before moving on. Going into this function, we should
465 * not have processes doing IO to this device. 468 * not have processes doing IO to this device.
466 */ 469 */
467 blk_sync_queue(q); 470 blk_sync_queue(q);
468 471
469 mutex_lock(&q->sysfs_lock); 472 mutex_lock(&q->sysfs_lock);
470 queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); 473 queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
471 mutex_unlock(&q->sysfs_lock); 474 mutex_unlock(&q->sysfs_lock);
472 475
473 if (q->elevator) 476 if (q->elevator)
474 elevator_exit(q->elevator); 477 elevator_exit(q->elevator);
475 478
476 blk_put_queue(q); 479 blk_put_queue(q);
477 } 480 }
478 EXPORT_SYMBOL(blk_cleanup_queue); 481 EXPORT_SYMBOL(blk_cleanup_queue);
479 482
480 static int blk_init_free_list(struct request_queue *q) 483 static int blk_init_free_list(struct request_queue *q)
481 { 484 {
482 struct request_list *rl = &q->rq; 485 struct request_list *rl = &q->rq;
483 486
484 rl->count[READ] = rl->count[WRITE] = 0; 487 rl->count[READ] = rl->count[WRITE] = 0;
485 rl->starved[READ] = rl->starved[WRITE] = 0; 488 rl->starved[READ] = rl->starved[WRITE] = 0;
486 rl->elvpriv = 0; 489 rl->elvpriv = 0;
487 init_waitqueue_head(&rl->wait[READ]); 490 init_waitqueue_head(&rl->wait[READ]);
488 init_waitqueue_head(&rl->wait[WRITE]); 491 init_waitqueue_head(&rl->wait[WRITE]);
489 492
490 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, 493 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
491 mempool_free_slab, request_cachep, q->node); 494 mempool_free_slab, request_cachep, q->node);
492 495
493 if (!rl->rq_pool) 496 if (!rl->rq_pool)
494 return -ENOMEM; 497 return -ENOMEM;
495 498
496 return 0; 499 return 0;
497 } 500 }
498 501
499 struct request_queue *blk_alloc_queue(gfp_t gfp_mask) 502 struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
500 { 503 {
501 return blk_alloc_queue_node(gfp_mask, -1); 504 return blk_alloc_queue_node(gfp_mask, -1);
502 } 505 }
503 EXPORT_SYMBOL(blk_alloc_queue); 506 EXPORT_SYMBOL(blk_alloc_queue);
504 507
505 struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) 508 struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
506 { 509 {
507 struct request_queue *q; 510 struct request_queue *q;
508 int err; 511 int err;
509 512
510 q = kmem_cache_alloc_node(blk_requestq_cachep, 513 q = kmem_cache_alloc_node(blk_requestq_cachep,
511 gfp_mask | __GFP_ZERO, node_id); 514 gfp_mask | __GFP_ZERO, node_id);
512 if (!q) 515 if (!q)
513 return NULL; 516 return NULL;
514 517
515 q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug; 518 q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
516 q->backing_dev_info.unplug_io_data = q; 519 q->backing_dev_info.unplug_io_data = q;
517 err = bdi_init(&q->backing_dev_info); 520 err = bdi_init(&q->backing_dev_info);
518 if (err) { 521 if (err) {
519 kmem_cache_free(blk_requestq_cachep, q); 522 kmem_cache_free(blk_requestq_cachep, q);
520 return NULL; 523 return NULL;
521 } 524 }
522 525
523 init_timer(&q->unplug_timer); 526 init_timer(&q->unplug_timer);
524 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); 527 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
525 INIT_LIST_HEAD(&q->timeout_list); 528 INIT_LIST_HEAD(&q->timeout_list);
526 INIT_WORK(&q->unplug_work, blk_unplug_work); 529 INIT_WORK(&q->unplug_work, blk_unplug_work);
527 530
528 kobject_init(&q->kobj, &blk_queue_ktype); 531 kobject_init(&q->kobj, &blk_queue_ktype);
529 532
530 mutex_init(&q->sysfs_lock); 533 mutex_init(&q->sysfs_lock);
531 spin_lock_init(&q->__queue_lock); 534 spin_lock_init(&q->__queue_lock);
532 535
533 return q; 536 return q;
534 } 537 }
535 EXPORT_SYMBOL(blk_alloc_queue_node); 538 EXPORT_SYMBOL(blk_alloc_queue_node);
536 539
537 /** 540 /**
538 * blk_init_queue - prepare a request queue for use with a block device 541 * blk_init_queue - prepare a request queue for use with a block device
539 * @rfn: The function to be called to process requests that have been 542 * @rfn: The function to be called to process requests that have been
540 * placed on the queue. 543 * placed on the queue.
541 * @lock: Request queue spin lock 544 * @lock: Request queue spin lock
542 * 545 *
543 * Description: 546 * Description:
544 * If a block device wishes to use the standard request handling procedures, 547 * If a block device wishes to use the standard request handling procedures,
545 * which sorts requests and coalesces adjacent requests, then it must 548 * which sorts requests and coalesces adjacent requests, then it must
546 * call blk_init_queue(). The function @rfn will be called when there 549 * call blk_init_queue(). The function @rfn will be called when there
547 * are requests on the queue that need to be processed. If the device 550 * are requests on the queue that need to be processed. If the device
548 * supports plugging, then @rfn may not be called immediately when requests 551 * supports plugging, then @rfn may not be called immediately when requests
549 * are available on the queue, but may be called at some time later instead. 552 * are available on the queue, but may be called at some time later instead.
550 * Plugged queues are generally unplugged when a buffer belonging to one 553 * Plugged queues are generally unplugged when a buffer belonging to one
551 * of the requests on the queue is needed, or due to memory pressure. 554 * of the requests on the queue is needed, or due to memory pressure.
552 * 555 *
553 * @rfn is not required, or even expected, to remove all requests off the 556 * @rfn is not required, or even expected, to remove all requests off the
554 * queue, but only as many as it can handle at a time. If it does leave 557 * queue, but only as many as it can handle at a time. If it does leave
555 * requests on the queue, it is responsible for arranging that the requests 558 * requests on the queue, it is responsible for arranging that the requests
556 * get dealt with eventually. 559 * get dealt with eventually.
557 * 560 *
558 * The queue spin lock must be held while manipulating the requests on the 561 * The queue spin lock must be held while manipulating the requests on the
559 * request queue; this lock will be taken also from interrupt context, so irq 562 * request queue; this lock will be taken also from interrupt context, so irq
560 * disabling is needed for it. 563 * disabling is needed for it.
561 * 564 *
562 * Function returns a pointer to the initialized request queue, or %NULL if 565 * Function returns a pointer to the initialized request queue, or %NULL if
563 * it didn't succeed. 566 * it didn't succeed.
564 * 567 *
565 * Note: 568 * Note:
566 * blk_init_queue() must be paired with a blk_cleanup_queue() call 569 * blk_init_queue() must be paired with a blk_cleanup_queue() call
567 * when the block device is deactivated (such as at module unload). 570 * when the block device is deactivated (such as at module unload).
568 **/ 571 **/
569 572
570 struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) 573 struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
571 { 574 {
572 return blk_init_queue_node(rfn, lock, -1); 575 return blk_init_queue_node(rfn, lock, -1);
573 } 576 }
574 EXPORT_SYMBOL(blk_init_queue); 577 EXPORT_SYMBOL(blk_init_queue);
575 578
576 struct request_queue * 579 struct request_queue *
577 blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) 580 blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
578 { 581 {
579 struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id); 582 struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id);
580 583
581 if (!q) 584 if (!q)
582 return NULL; 585 return NULL;
583 586
584 q->node = node_id; 587 q->node = node_id;
585 if (blk_init_free_list(q)) { 588 if (blk_init_free_list(q)) {
586 kmem_cache_free(blk_requestq_cachep, q); 589 kmem_cache_free(blk_requestq_cachep, q);
587 return NULL; 590 return NULL;
588 } 591 }
589 592
590 /* 593 /*
591 * if caller didn't supply a lock, they get per-queue locking with 594 * if caller didn't supply a lock, they get per-queue locking with
592 * our embedded lock 595 * our embedded lock
593 */ 596 */
594 if (!lock) 597 if (!lock)
595 lock = &q->__queue_lock; 598 lock = &q->__queue_lock;
596 599
597 q->request_fn = rfn; 600 q->request_fn = rfn;
598 q->prep_rq_fn = NULL; 601 q->prep_rq_fn = NULL;
599 q->unplug_fn = generic_unplug_device; 602 q->unplug_fn = generic_unplug_device;
600 q->queue_flags = (1 << QUEUE_FLAG_CLUSTER | 603 q->queue_flags = (1 << QUEUE_FLAG_CLUSTER |
601 1 << QUEUE_FLAG_STACKABLE); 604 1 << QUEUE_FLAG_STACKABLE);
602 q->queue_lock = lock; 605 q->queue_lock = lock;
603 606
604 blk_queue_segment_boundary(q, BLK_SEG_BOUNDARY_MASK); 607 blk_queue_segment_boundary(q, BLK_SEG_BOUNDARY_MASK);
605 608
606 blk_queue_make_request(q, __make_request); 609 blk_queue_make_request(q, __make_request);
607 blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE); 610 blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE);
608 611
609 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); 612 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
610 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); 613 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
611 614
612 q->sg_reserved_size = INT_MAX; 615 q->sg_reserved_size = INT_MAX;
613 616
614 blk_set_cmd_filter_defaults(&q->cmd_filter); 617 blk_set_cmd_filter_defaults(&q->cmd_filter);
615 618
616 /* 619 /*
617 * all done 620 * all done
618 */ 621 */
619 if (!elevator_init(q, NULL)) { 622 if (!elevator_init(q, NULL)) {
620 blk_queue_congestion_threshold(q); 623 blk_queue_congestion_threshold(q);
621 return q; 624 return q;
622 } 625 }
623 626
624 blk_put_queue(q); 627 blk_put_queue(q);
625 return NULL; 628 return NULL;
626 } 629 }
627 EXPORT_SYMBOL(blk_init_queue_node); 630 EXPORT_SYMBOL(blk_init_queue_node);
628 631
629 int blk_get_queue(struct request_queue *q) 632 int blk_get_queue(struct request_queue *q)
630 { 633 {
631 if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { 634 if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
632 kobject_get(&q->kobj); 635 kobject_get(&q->kobj);
633 return 0; 636 return 0;
634 } 637 }
635 638
636 return 1; 639 return 1;
637 } 640 }
638 641
639 static inline void blk_free_request(struct request_queue *q, struct request *rq) 642 static inline void blk_free_request(struct request_queue *q, struct request *rq)
640 { 643 {
641 if (rq->cmd_flags & REQ_ELVPRIV) 644 if (rq->cmd_flags & REQ_ELVPRIV)
642 elv_put_request(q, rq); 645 elv_put_request(q, rq);
643 mempool_free(rq, q->rq.rq_pool); 646 mempool_free(rq, q->rq.rq_pool);
644 } 647 }
645 648
646 static struct request * 649 static struct request *
647 blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask) 650 blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask)
648 { 651 {
649 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); 652 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
650 653
651 if (!rq) 654 if (!rq)
652 return NULL; 655 return NULL;
653 656
654 blk_rq_init(q, rq); 657 blk_rq_init(q, rq);
655 658
656 rq->cmd_flags = rw | REQ_ALLOCED; 659 rq->cmd_flags = rw | REQ_ALLOCED;
657 660
658 if (priv) { 661 if (priv) {
659 if (unlikely(elv_set_request(q, rq, gfp_mask))) { 662 if (unlikely(elv_set_request(q, rq, gfp_mask))) {
660 mempool_free(rq, q->rq.rq_pool); 663 mempool_free(rq, q->rq.rq_pool);
661 return NULL; 664 return NULL;
662 } 665 }
663 rq->cmd_flags |= REQ_ELVPRIV; 666 rq->cmd_flags |= REQ_ELVPRIV;
664 } 667 }
665 668
666 return rq; 669 return rq;
667 } 670 }
668 671
669 /* 672 /*
670 * ioc_batching returns true if the ioc is a valid batching request and 673 * ioc_batching returns true if the ioc is a valid batching request and
671 * should be given priority access to a request. 674 * should be given priority access to a request.
672 */ 675 */
673 static inline int ioc_batching(struct request_queue *q, struct io_context *ioc) 676 static inline int ioc_batching(struct request_queue *q, struct io_context *ioc)
674 { 677 {
675 if (!ioc) 678 if (!ioc)
676 return 0; 679 return 0;
677 680
678 /* 681 /*
679 * Make sure the process is able to allocate at least 1 request 682 * Make sure the process is able to allocate at least 1 request
680 * even if the batch times out, otherwise we could theoretically 683 * even if the batch times out, otherwise we could theoretically
681 * lose wakeups. 684 * lose wakeups.
682 */ 685 */
683 return ioc->nr_batch_requests == q->nr_batching || 686 return ioc->nr_batch_requests == q->nr_batching ||
684 (ioc->nr_batch_requests > 0 687 (ioc->nr_batch_requests > 0
685 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME)); 688 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
686 } 689 }
687 690
688 /* 691 /*
689 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This 692 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This
690 * will cause the process to be a "batcher" on all queues in the system. This 693 * will cause the process to be a "batcher" on all queues in the system. This
691 * is the behaviour we want though - once it gets a wakeup it should be given 694 * is the behaviour we want though - once it gets a wakeup it should be given
692 * a nice run. 695 * a nice run.
693 */ 696 */
694 static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) 697 static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
695 { 698 {
696 if (!ioc || ioc_batching(q, ioc)) 699 if (!ioc || ioc_batching(q, ioc))
697 return; 700 return;
698 701
699 ioc->nr_batch_requests = q->nr_batching; 702 ioc->nr_batch_requests = q->nr_batching;
700 ioc->last_waited = jiffies; 703 ioc->last_waited = jiffies;
701 } 704 }
702 705
703 static void __freed_request(struct request_queue *q, int rw) 706 static void __freed_request(struct request_queue *q, int rw)
704 { 707 {
705 struct request_list *rl = &q->rq; 708 struct request_list *rl = &q->rq;
706 709
707 if (rl->count[rw] < queue_congestion_off_threshold(q)) 710 if (rl->count[rw] < queue_congestion_off_threshold(q))
708 blk_clear_queue_congested(q, rw); 711 blk_clear_queue_congested(q, rw);
709 712
710 if (rl->count[rw] + 1 <= q->nr_requests) { 713 if (rl->count[rw] + 1 <= q->nr_requests) {
711 if (waitqueue_active(&rl->wait[rw])) 714 if (waitqueue_active(&rl->wait[rw]))
712 wake_up(&rl->wait[rw]); 715 wake_up(&rl->wait[rw]);
713 716
714 blk_clear_queue_full(q, rw); 717 blk_clear_queue_full(q, rw);
715 } 718 }
716 } 719 }
717 720
718 /* 721 /*
719 * A request has just been released. Account for it, update the full and 722 * A request has just been released. Account for it, update the full and
720 * congestion status, wake up any waiters. Called under q->queue_lock. 723 * congestion status, wake up any waiters. Called under q->queue_lock.
721 */ 724 */
722 static void freed_request(struct request_queue *q, int rw, int priv) 725 static void freed_request(struct request_queue *q, int rw, int priv)
723 { 726 {
724 struct request_list *rl = &q->rq; 727 struct request_list *rl = &q->rq;
725 728
726 rl->count[rw]--; 729 rl->count[rw]--;
727 if (priv) 730 if (priv)
728 rl->elvpriv--; 731 rl->elvpriv--;
729 732
730 __freed_request(q, rw); 733 __freed_request(q, rw);
731 734
732 if (unlikely(rl->starved[rw ^ 1])) 735 if (unlikely(rl->starved[rw ^ 1]))
733 __freed_request(q, rw ^ 1); 736 __freed_request(q, rw ^ 1);
734 } 737 }
735 738
736 #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist) 739 #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist)
737 /* 740 /*
738 * Get a free request, queue_lock must be held. 741 * Get a free request, queue_lock must be held.
739 * Returns NULL on failure, with queue_lock held. 742 * Returns NULL on failure, with queue_lock held.
740 * Returns !NULL on success, with queue_lock *not held*. 743 * Returns !NULL on success, with queue_lock *not held*.
741 */ 744 */
742 static struct request *get_request(struct request_queue *q, int rw_flags, 745 static struct request *get_request(struct request_queue *q, int rw_flags,
743 struct bio *bio, gfp_t gfp_mask) 746 struct bio *bio, gfp_t gfp_mask)
744 { 747 {
745 struct request *rq = NULL; 748 struct request *rq = NULL;
746 struct request_list *rl = &q->rq; 749 struct request_list *rl = &q->rq;
747 struct io_context *ioc = NULL; 750 struct io_context *ioc = NULL;
748 const int rw = rw_flags & 0x01; 751 const int rw = rw_flags & 0x01;
749 int may_queue, priv; 752 int may_queue, priv;
750 753
751 may_queue = elv_may_queue(q, rw_flags); 754 may_queue = elv_may_queue(q, rw_flags);
752 if (may_queue == ELV_MQUEUE_NO) 755 if (may_queue == ELV_MQUEUE_NO)
753 goto rq_starved; 756 goto rq_starved;
754 757
755 if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) { 758 if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) {
756 if (rl->count[rw]+1 >= q->nr_requests) { 759 if (rl->count[rw]+1 >= q->nr_requests) {
757 ioc = current_io_context(GFP_ATOMIC, q->node); 760 ioc = current_io_context(GFP_ATOMIC, q->node);
758 /* 761 /*
759 * The queue will fill after this allocation, so set 762 * The queue will fill after this allocation, so set
760 * it as full, and mark this process as "batching". 763 * it as full, and mark this process as "batching".
761 * This process will be allowed to complete a batch of 764 * This process will be allowed to complete a batch of
762 * requests, others will be blocked. 765 * requests, others will be blocked.
763 */ 766 */
764 if (!blk_queue_full(q, rw)) { 767 if (!blk_queue_full(q, rw)) {
765 ioc_set_batching(q, ioc); 768 ioc_set_batching(q, ioc);
766 blk_set_queue_full(q, rw); 769 blk_set_queue_full(q, rw);
767 } else { 770 } else {
768 if (may_queue != ELV_MQUEUE_MUST 771 if (may_queue != ELV_MQUEUE_MUST
769 && !ioc_batching(q, ioc)) { 772 && !ioc_batching(q, ioc)) {
770 /* 773 /*
771 * The queue is full and the allocating 774 * The queue is full and the allocating
772 * process is not a "batcher", and not 775 * process is not a "batcher", and not
773 * exempted by the IO scheduler 776 * exempted by the IO scheduler
774 */ 777 */
775 goto out; 778 goto out;
776 } 779 }
777 } 780 }
778 } 781 }
779 blk_set_queue_congested(q, rw); 782 blk_set_queue_congested(q, rw);
780 } 783 }
781 784
782 /* 785 /*
783 * Only allow batching queuers to allocate up to 50% over the defined 786 * Only allow batching queuers to allocate up to 50% over the defined
784 * limit of requests, otherwise we could have thousands of requests 787 * limit of requests, otherwise we could have thousands of requests
785 * allocated with any setting of ->nr_requests 788 * allocated with any setting of ->nr_requests
786 */ 789 */
787 if (rl->count[rw] >= (3 * q->nr_requests / 2)) 790 if (rl->count[rw] >= (3 * q->nr_requests / 2))
788 goto out; 791 goto out;
789 792
790 rl->count[rw]++; 793 rl->count[rw]++;
791 rl->starved[rw] = 0; 794 rl->starved[rw] = 0;
792 795
793 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); 796 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
794 if (priv) 797 if (priv)
795 rl->elvpriv++; 798 rl->elvpriv++;
796 799
797 spin_unlock_irq(q->queue_lock); 800 spin_unlock_irq(q->queue_lock);
798 801
799 rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); 802 rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
800 if (unlikely(!rq)) { 803 if (unlikely(!rq)) {
801 /* 804 /*
802 * Allocation failed presumably due to memory. Undo anything 805 * Allocation failed presumably due to memory. Undo anything
803 * we might have messed up. 806 * we might have messed up.
804 * 807 *
805 * Allocating task should really be put onto the front of the 808 * Allocating task should really be put onto the front of the
806 * wait queue, but this is pretty rare. 809 * wait queue, but this is pretty rare.
807 */ 810 */
808 spin_lock_irq(q->queue_lock); 811 spin_lock_irq(q->queue_lock);
809 freed_request(q, rw, priv); 812 freed_request(q, rw, priv);
810 813
811 /* 814 /*
812 * in the very unlikely event that allocation failed and no 815 * in the very unlikely event that allocation failed and no
813 * requests for this direction was pending, mark us starved 816 * requests for this direction was pending, mark us starved
814 * so that freeing of a request in the other direction will 817 * so that freeing of a request in the other direction will
815 * notice us. another possible fix would be to split the 818 * notice us. another possible fix would be to split the
816 * rq mempool into READ and WRITE 819 * rq mempool into READ and WRITE
817 */ 820 */
818 rq_starved: 821 rq_starved:
819 if (unlikely(rl->count[rw] == 0)) 822 if (unlikely(rl->count[rw] == 0))
820 rl->starved[rw] = 1; 823 rl->starved[rw] = 1;
821 824
822 goto out; 825 goto out;
823 } 826 }
824 827
825 /* 828 /*
826 * ioc may be NULL here, and ioc_batching will be false. That's 829 * ioc may be NULL here, and ioc_batching will be false. That's
827 * OK, if the queue is under the request limit then requests need 830 * OK, if the queue is under the request limit then requests need
828 * not count toward the nr_batch_requests limit. There will always 831 * not count toward the nr_batch_requests limit. There will always
829 * be some limit enforced by BLK_BATCH_TIME. 832 * be some limit enforced by BLK_BATCH_TIME.
830 */ 833 */
831 if (ioc_batching(q, ioc)) 834 if (ioc_batching(q, ioc))
832 ioc->nr_batch_requests--; 835 ioc->nr_batch_requests--;
833 836
834 trace_block_getrq(q, bio, rw); 837 trace_block_getrq(q, bio, rw);
835 out: 838 out:
836 return rq; 839 return rq;
837 } 840 }
838 841
839 /* 842 /*
840 * No available requests for this queue, unplug the device and wait for some 843 * No available requests for this queue, unplug the device and wait for some
841 * requests to become available. 844 * requests to become available.
842 * 845 *
843 * Called with q->queue_lock held, and returns with it unlocked. 846 * Called with q->queue_lock held, and returns with it unlocked.
844 */ 847 */
845 static struct request *get_request_wait(struct request_queue *q, int rw_flags, 848 static struct request *get_request_wait(struct request_queue *q, int rw_flags,
846 struct bio *bio) 849 struct bio *bio)
847 { 850 {
848 const int rw = rw_flags & 0x01; 851 const int rw = rw_flags & 0x01;
849 struct request *rq; 852 struct request *rq;
850 853
851 rq = get_request(q, rw_flags, bio, GFP_NOIO); 854 rq = get_request(q, rw_flags, bio, GFP_NOIO);
852 while (!rq) { 855 while (!rq) {
853 DEFINE_WAIT(wait); 856 DEFINE_WAIT(wait);
854 struct io_context *ioc; 857 struct io_context *ioc;
855 struct request_list *rl = &q->rq; 858 struct request_list *rl = &q->rq;
856 859
857 prepare_to_wait_exclusive(&rl->wait[rw], &wait, 860 prepare_to_wait_exclusive(&rl->wait[rw], &wait,
858 TASK_UNINTERRUPTIBLE); 861 TASK_UNINTERRUPTIBLE);
859 862
860 trace_block_sleeprq(q, bio, rw); 863 trace_block_sleeprq(q, bio, rw);
861 864
862 __generic_unplug_device(q); 865 __generic_unplug_device(q);
863 spin_unlock_irq(q->queue_lock); 866 spin_unlock_irq(q->queue_lock);
864 io_schedule(); 867 io_schedule();
865 868
866 /* 869 /*
867 * After sleeping, we become a "batching" process and 870 * After sleeping, we become a "batching" process and
868 * will be able to allocate at least one request, and 871 * will be able to allocate at least one request, and
869 * up to a big batch of them for a small period time. 872 * up to a big batch of them for a small period time.
870 * See ioc_batching, ioc_set_batching 873 * See ioc_batching, ioc_set_batching
871 */ 874 */
872 ioc = current_io_context(GFP_NOIO, q->node); 875 ioc = current_io_context(GFP_NOIO, q->node);
873 ioc_set_batching(q, ioc); 876 ioc_set_batching(q, ioc);
874 877
875 spin_lock_irq(q->queue_lock); 878 spin_lock_irq(q->queue_lock);
876 finish_wait(&rl->wait[rw], &wait); 879 finish_wait(&rl->wait[rw], &wait);
877 880
878 rq = get_request(q, rw_flags, bio, GFP_NOIO); 881 rq = get_request(q, rw_flags, bio, GFP_NOIO);
879 }; 882 };
880 883
881 return rq; 884 return rq;
882 } 885 }
883 886
884 struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) 887 struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
885 { 888 {
886 struct request *rq; 889 struct request *rq;
887 890
888 BUG_ON(rw != READ && rw != WRITE); 891 BUG_ON(rw != READ && rw != WRITE);
889 892
890 spin_lock_irq(q->queue_lock); 893 spin_lock_irq(q->queue_lock);
891 if (gfp_mask & __GFP_WAIT) { 894 if (gfp_mask & __GFP_WAIT) {
892 rq = get_request_wait(q, rw, NULL); 895 rq = get_request_wait(q, rw, NULL);
893 } else { 896 } else {
894 rq = get_request(q, rw, NULL, gfp_mask); 897 rq = get_request(q, rw, NULL, gfp_mask);
895 if (!rq) 898 if (!rq)
896 spin_unlock_irq(q->queue_lock); 899 spin_unlock_irq(q->queue_lock);
897 } 900 }
898 /* q->queue_lock is unlocked at this point */ 901 /* q->queue_lock is unlocked at this point */
899 902
900 return rq; 903 return rq;
901 } 904 }
902 EXPORT_SYMBOL(blk_get_request); 905 EXPORT_SYMBOL(blk_get_request);
903 906
904 /** 907 /**
905 * blk_start_queueing - initiate dispatch of requests to device 908 * blk_start_queueing - initiate dispatch of requests to device
906 * @q: request queue to kick into gear 909 * @q: request queue to kick into gear
907 * 910 *
908 * This is basically a helper to remove the need to know whether a queue 911 * This is basically a helper to remove the need to know whether a queue
909 * is plugged or not if someone just wants to initiate dispatch of requests 912 * is plugged or not if someone just wants to initiate dispatch of requests
910 * for this queue. Should be used to start queueing on a device outside 913 * for this queue. Should be used to start queueing on a device outside
911 * of ->request_fn() context. Also see @blk_run_queue. 914 * of ->request_fn() context. Also see @blk_run_queue.
912 * 915 *
913 * The queue lock must be held with interrupts disabled. 916 * The queue lock must be held with interrupts disabled.
914 */ 917 */
915 void blk_start_queueing(struct request_queue *q) 918 void blk_start_queueing(struct request_queue *q)
916 { 919 {
917 if (!blk_queue_plugged(q)) { 920 if (!blk_queue_plugged(q)) {
918 if (unlikely(blk_queue_stopped(q))) 921 if (unlikely(blk_queue_stopped(q)))
919 return; 922 return;
920 q->request_fn(q); 923 q->request_fn(q);
921 } else 924 } else
922 __generic_unplug_device(q); 925 __generic_unplug_device(q);
923 } 926 }
924 EXPORT_SYMBOL(blk_start_queueing); 927 EXPORT_SYMBOL(blk_start_queueing);
925 928
926 /** 929 /**
927 * blk_requeue_request - put a request back on queue 930 * blk_requeue_request - put a request back on queue
928 * @q: request queue where request should be inserted 931 * @q: request queue where request should be inserted
929 * @rq: request to be inserted 932 * @rq: request to be inserted
930 * 933 *
931 * Description: 934 * Description:
932 * Drivers often keep queueing requests until the hardware cannot accept 935 * Drivers often keep queueing requests until the hardware cannot accept
933 * more, when that condition happens we need to put the request back 936 * more, when that condition happens we need to put the request back
934 * on the queue. Must be called with queue lock held. 937 * on the queue. Must be called with queue lock held.
935 */ 938 */
936 void blk_requeue_request(struct request_queue *q, struct request *rq) 939 void blk_requeue_request(struct request_queue *q, struct request *rq)
937 { 940 {
938 blk_delete_timer(rq); 941 blk_delete_timer(rq);
939 blk_clear_rq_complete(rq); 942 blk_clear_rq_complete(rq);
940 trace_block_rq_requeue(q, rq); 943 trace_block_rq_requeue(q, rq);
941 944
942 if (blk_rq_tagged(rq)) 945 if (blk_rq_tagged(rq))
943 blk_queue_end_tag(q, rq); 946 blk_queue_end_tag(q, rq);
944 947
945 elv_requeue_request(q, rq); 948 elv_requeue_request(q, rq);
946 } 949 }
947 EXPORT_SYMBOL(blk_requeue_request); 950 EXPORT_SYMBOL(blk_requeue_request);
948 951
949 /** 952 /**
950 * blk_insert_request - insert a special request into a request queue 953 * blk_insert_request - insert a special request into a request queue
951 * @q: request queue where request should be inserted 954 * @q: request queue where request should be inserted
952 * @rq: request to be inserted 955 * @rq: request to be inserted
953 * @at_head: insert request at head or tail of queue 956 * @at_head: insert request at head or tail of queue
954 * @data: private data 957 * @data: private data
955 * 958 *
956 * Description: 959 * Description:
957 * Many block devices need to execute commands asynchronously, so they don't 960 * Many block devices need to execute commands asynchronously, so they don't
958 * block the whole kernel from preemption during request execution. This is 961 * block the whole kernel from preemption during request execution. This is
959 * accomplished normally by inserting artificial requests tagged as 962 * accomplished normally by inserting artificial requests tagged as
960 * REQ_TYPE_SPECIAL into the corresponding request queue, and letting them 963 * REQ_TYPE_SPECIAL into the corresponding request queue, and letting them
961 * be scheduled for actual execution by the request queue. 964 * be scheduled for actual execution by the request queue.
962 * 965 *
963 * We have the option of inserting at the head or the tail of the queue. 966 * We have the option of inserting at the head or the tail of the queue.
964 * Typically we use the tail for new ioctls and so forth. We use the head 967 * Typically we use the tail for new ioctls and so forth. We use the head
965 * of the queue for things like a QUEUE_FULL message from a device, or a 968 * of the queue for things like a QUEUE_FULL message from a device, or a
966 * host that is unable to accept a particular command. 969 * host that is unable to accept a particular command.
967 */ 970 */
968 void blk_insert_request(struct request_queue *q, struct request *rq, 971 void blk_insert_request(struct request_queue *q, struct request *rq,
969 int at_head, void *data) 972 int at_head, void *data)
970 { 973 {
971 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 974 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
972 unsigned long flags; 975 unsigned long flags;
973 976
974 /* 977 /*
975 * tell I/O scheduler that this isn't a regular read/write (ie it 978 * tell I/O scheduler that this isn't a regular read/write (ie it
976 * must not attempt merges on this) and that it acts as a soft 979 * must not attempt merges on this) and that it acts as a soft
977 * barrier 980 * barrier
978 */ 981 */
979 rq->cmd_type = REQ_TYPE_SPECIAL; 982 rq->cmd_type = REQ_TYPE_SPECIAL;
980 rq->cmd_flags |= REQ_SOFTBARRIER; 983 rq->cmd_flags |= REQ_SOFTBARRIER;
981 984
982 rq->special = data; 985 rq->special = data;
983 986
984 spin_lock_irqsave(q->queue_lock, flags); 987 spin_lock_irqsave(q->queue_lock, flags);
985 988
986 /* 989 /*
987 * If command is tagged, release the tag 990 * If command is tagged, release the tag
988 */ 991 */
989 if (blk_rq_tagged(rq)) 992 if (blk_rq_tagged(rq))
990 blk_queue_end_tag(q, rq); 993 blk_queue_end_tag(q, rq);
991 994
992 drive_stat_acct(rq, 1); 995 drive_stat_acct(rq, 1);
993 __elv_add_request(q, rq, where, 0); 996 __elv_add_request(q, rq, where, 0);
994 blk_start_queueing(q); 997 blk_start_queueing(q);
995 spin_unlock_irqrestore(q->queue_lock, flags); 998 spin_unlock_irqrestore(q->queue_lock, flags);
996 } 999 }
997 EXPORT_SYMBOL(blk_insert_request); 1000 EXPORT_SYMBOL(blk_insert_request);
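A hedged sketch of the pattern described in the comment above: a driver allocates a request, then lets blk_insert_request() tag it REQ_TYPE_SPECIAL and push it to the head of the queue (e.g. after a QUEUE_FULL-style condition). Names prefixed example_ are invented for illustration.

/* illustrative sketch -- not part of this change */
static int example_issue_special(struct request_queue *q, void *drv_data)
{
	struct request *rq;

	/* may sleep; must not be called with the queue lock held */
	rq = blk_get_request(q, READ, GFP_KERNEL);
	if (!rq)
		return -ENOMEM;

	/* marks rq REQ_TYPE_SPECIAL, stores drv_data in rq->special,
	 * queues it at the head and kicks the queue */
	blk_insert_request(q, rq, 1, drv_data);
	return 0;
}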
998 1001
999 /* 1002 /*
1000 * add-request adds a request to the linked list. 1003 * add-request adds a request to the linked list.
1001 * queue lock is held and interrupts disabled, as we muck with the 1004 * queue lock is held and interrupts disabled, as we muck with the
1002 * request queue list. 1005 * request queue list.
1003 */ 1006 */
1004 static inline void add_request(struct request_queue *q, struct request *req) 1007 static inline void add_request(struct request_queue *q, struct request *req)
1005 { 1008 {
1006 drive_stat_acct(req, 1); 1009 drive_stat_acct(req, 1);
1007 1010
1008 /* 1011 /*
1009 * elevator indicated where it wants this request to be 1012 * elevator indicated where it wants this request to be
1010 * inserted at elevator_merge time 1013 * inserted at elevator_merge time
1011 */ 1014 */
1012 __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); 1015 __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
1013 } 1016 }
1014 1017
1015 static void part_round_stats_single(int cpu, struct hd_struct *part, 1018 static void part_round_stats_single(int cpu, struct hd_struct *part,
1016 unsigned long now) 1019 unsigned long now)
1017 { 1020 {
1018 if (now == part->stamp) 1021 if (now == part->stamp)
1019 return; 1022 return;
1020 1023
1021 if (part->in_flight) { 1024 if (part->in_flight) {
1022 __part_stat_add(cpu, part, time_in_queue, 1025 __part_stat_add(cpu, part, time_in_queue,
1023 part->in_flight * (now - part->stamp)); 1026 part->in_flight * (now - part->stamp));
1024 __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); 1027 __part_stat_add(cpu, part, io_ticks, (now - part->stamp));
1025 } 1028 }
1026 part->stamp = now; 1029 part->stamp = now;
1027 } 1030 }
1028 1031
1029 /** 1032 /**
1030 * part_round_stats() - Round off the performance stats on a struct disk_stats. 1033 * part_round_stats() - Round off the performance stats on a struct disk_stats.
1031 * @cpu: cpu number for stats access 1034 * @cpu: cpu number for stats access
1032 * @part: target partition 1035 * @part: target partition
1033 * 1036 *
1034 * The average IO queue length and utilisation statistics are maintained 1037 * The average IO queue length and utilisation statistics are maintained
1035 * by observing the current state of the queue length and the amount of 1038 * by observing the current state of the queue length and the amount of
1036 * time it has been in this state for. 1039 * time it has been in this state for.
1037 * 1040 *
1038 * Normally, that accounting is done on IO completion, but that can result 1041 * Normally, that accounting is done on IO completion, but that can result
1039 * in more than a second's worth of IO being accounted for within any one 1042 * in more than a second's worth of IO being accounted for within any one
1040 * second, leading to >100% utilisation. To deal with that, we call this 1043 * second, leading to >100% utilisation. To deal with that, we call this
1041 * function to do a round-off before returning the results when reading 1044 * function to do a round-off before returning the results when reading
1042 * /proc/diskstats. This accounts immediately for all queue usage up to 1045 * /proc/diskstats. This accounts immediately for all queue usage up to
1043 * the current jiffies and restarts the counters again. 1046 * the current jiffies and restarts the counters again.
1044 */ 1047 */
1045 void part_round_stats(int cpu, struct hd_struct *part) 1048 void part_round_stats(int cpu, struct hd_struct *part)
1046 { 1049 {
1047 unsigned long now = jiffies; 1050 unsigned long now = jiffies;
1048 1051
1049 if (part->partno) 1052 if (part->partno)
1050 part_round_stats_single(cpu, &part_to_disk(part)->part0, now); 1053 part_round_stats_single(cpu, &part_to_disk(part)->part0, now);
1051 part_round_stats_single(cpu, part, now); 1054 part_round_stats_single(cpu, part, now);
1052 } 1055 }
1053 EXPORT_SYMBOL_GPL(part_round_stats); 1056 EXPORT_SYMBOL_GPL(part_round_stats);
1054 1057
1055 /* 1058 /*
1056 * queue lock must be held 1059 * queue lock must be held
1057 */ 1060 */
1058 void __blk_put_request(struct request_queue *q, struct request *req) 1061 void __blk_put_request(struct request_queue *q, struct request *req)
1059 { 1062 {
1060 if (unlikely(!q)) 1063 if (unlikely(!q))
1061 return; 1064 return;
1062 if (unlikely(--req->ref_count)) 1065 if (unlikely(--req->ref_count))
1063 return; 1066 return;
1064 1067
1065 elv_completed_request(q, req); 1068 elv_completed_request(q, req);
1066 1069
1067 /* 1070 /*
1068 * Request may not have originated from ll_rw_blk. If not, 1071 * Request may not have originated from ll_rw_blk. If not,
1069 * it didn't come out of our reserved rq pools 1072 * it didn't come out of our reserved rq pools
1070 */ 1073 */
1071 if (req->cmd_flags & REQ_ALLOCED) { 1074 if (req->cmd_flags & REQ_ALLOCED) {
1072 int rw = rq_data_dir(req); 1075 int rw = rq_data_dir(req);
1073 int priv = req->cmd_flags & REQ_ELVPRIV; 1076 int priv = req->cmd_flags & REQ_ELVPRIV;
1074 1077
1075 BUG_ON(!list_empty(&req->queuelist)); 1078 BUG_ON(!list_empty(&req->queuelist));
1076 BUG_ON(!hlist_unhashed(&req->hash)); 1079 BUG_ON(!hlist_unhashed(&req->hash));
1077 1080
1078 blk_free_request(q, req); 1081 blk_free_request(q, req);
1079 freed_request(q, rw, priv); 1082 freed_request(q, rw, priv);
1080 } 1083 }
1081 } 1084 }
1082 EXPORT_SYMBOL_GPL(__blk_put_request); 1085 EXPORT_SYMBOL_GPL(__blk_put_request);
1083 1086
1084 void blk_put_request(struct request *req) 1087 void blk_put_request(struct request *req)
1085 { 1088 {
1086 unsigned long flags; 1089 unsigned long flags;
1087 struct request_queue *q = req->q; 1090 struct request_queue *q = req->q;
1088 1091
1089 spin_lock_irqsave(q->queue_lock, flags); 1092 spin_lock_irqsave(q->queue_lock, flags);
1090 __blk_put_request(q, req); 1093 __blk_put_request(q, req);
1091 spin_unlock_irqrestore(q->queue_lock, flags); 1094 spin_unlock_irqrestore(q->queue_lock, flags);
1092 } 1095 }
1093 EXPORT_SYMBOL(blk_put_request); 1096 EXPORT_SYMBOL(blk_put_request);
1094 1097
1095 void init_request_from_bio(struct request *req, struct bio *bio) 1098 void init_request_from_bio(struct request *req, struct bio *bio)
1096 { 1099 {
1097 req->cpu = bio->bi_comp_cpu; 1100 req->cpu = bio->bi_comp_cpu;
1098 req->cmd_type = REQ_TYPE_FS; 1101 req->cmd_type = REQ_TYPE_FS;
1099 1102
1100 /* 1103 /*
1101 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST) 1104 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
1102 */ 1105 */
1103 if (bio_rw_ahead(bio)) 1106 if (bio_rw_ahead(bio))
1104 req->cmd_flags |= (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | 1107 req->cmd_flags |= (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT |
1105 REQ_FAILFAST_DRIVER); 1108 REQ_FAILFAST_DRIVER);
1106 if (bio_failfast_dev(bio)) 1109 if (bio_failfast_dev(bio))
1107 req->cmd_flags |= REQ_FAILFAST_DEV; 1110 req->cmd_flags |= REQ_FAILFAST_DEV;
1108 if (bio_failfast_transport(bio)) 1111 if (bio_failfast_transport(bio))
1109 req->cmd_flags |= REQ_FAILFAST_TRANSPORT; 1112 req->cmd_flags |= REQ_FAILFAST_TRANSPORT;
1110 if (bio_failfast_driver(bio)) 1113 if (bio_failfast_driver(bio))
1111 req->cmd_flags |= REQ_FAILFAST_DRIVER; 1114 req->cmd_flags |= REQ_FAILFAST_DRIVER;
1112 1115
1113 /* 1116 /*
1114 * REQ_BARRIER implies no merging, but let's make it explicit 1117 * REQ_BARRIER implies no merging, but let's make it explicit
1115 */ 1118 */
1116 if (unlikely(bio_discard(bio))) { 1119 if (unlikely(bio_discard(bio))) {
1117 req->cmd_flags |= REQ_DISCARD; 1120 req->cmd_flags |= REQ_DISCARD;
1118 if (bio_barrier(bio)) 1121 if (bio_barrier(bio))
1119 req->cmd_flags |= REQ_SOFTBARRIER; 1122 req->cmd_flags |= REQ_SOFTBARRIER;
1120 req->q->prepare_discard_fn(req->q, req); 1123 req->q->prepare_discard_fn(req->q, req);
1121 } else if (unlikely(bio_barrier(bio))) 1124 } else if (unlikely(bio_barrier(bio)))
1122 req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE); 1125 req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
1123 1126
1124 if (bio_sync(bio)) 1127 if (bio_sync(bio))
1125 req->cmd_flags |= REQ_RW_SYNC; 1128 req->cmd_flags |= REQ_RW_SYNC;
1126 if (bio_rw_meta(bio)) 1129 if (bio_rw_meta(bio))
1127 req->cmd_flags |= REQ_RW_META; 1130 req->cmd_flags |= REQ_RW_META;
1128 1131
1129 req->errors = 0; 1132 req->errors = 0;
1130 req->hard_sector = req->sector = bio->bi_sector; 1133 req->hard_sector = req->sector = bio->bi_sector;
1131 req->ioprio = bio_prio(bio); 1134 req->ioprio = bio_prio(bio);
1132 req->start_time = jiffies; 1135 req->start_time = jiffies;
1133 blk_rq_bio_prep(req->q, req, bio); 1136 blk_rq_bio_prep(req->q, req, bio);
1134 } 1137 }
1135 1138
1136 static int __make_request(struct request_queue *q, struct bio *bio) 1139 static int __make_request(struct request_queue *q, struct bio *bio)
1137 { 1140 {
1138 struct request *req; 1141 struct request *req;
1139 int el_ret, nr_sectors, barrier, discard, err; 1142 int el_ret, nr_sectors, barrier, discard, err;
1140 const unsigned short prio = bio_prio(bio); 1143 const unsigned short prio = bio_prio(bio);
1141 const int sync = bio_sync(bio); 1144 const int sync = bio_sync(bio);
1142 int rw_flags; 1145 int rw_flags;
1143 1146
1144 nr_sectors = bio_sectors(bio); 1147 nr_sectors = bio_sectors(bio);
1145 1148
1146 /* 1149 /*
1147 * low level driver can indicate that it wants pages above a 1150 * low level driver can indicate that it wants pages above a
1148 * certain limit bounced to low memory (ie for highmem, or even 1151 * certain limit bounced to low memory (ie for highmem, or even
1149 * ISA dma in theory) 1152 * ISA dma in theory)
1150 */ 1153 */
1151 blk_queue_bounce(q, &bio); 1154 blk_queue_bounce(q, &bio);
1152 1155
1153 barrier = bio_barrier(bio); 1156 barrier = bio_barrier(bio);
1154 if (unlikely(barrier) && bio_has_data(bio) && 1157 if (unlikely(barrier) && bio_has_data(bio) &&
1155 (q->next_ordered == QUEUE_ORDERED_NONE)) { 1158 (q->next_ordered == QUEUE_ORDERED_NONE)) {
1156 err = -EOPNOTSUPP; 1159 err = -EOPNOTSUPP;
1157 goto end_io; 1160 goto end_io;
1158 } 1161 }
1159 1162
1160 discard = bio_discard(bio); 1163 discard = bio_discard(bio);
1161 if (unlikely(discard) && !q->prepare_discard_fn) { 1164 if (unlikely(discard) && !q->prepare_discard_fn) {
1162 err = -EOPNOTSUPP; 1165 err = -EOPNOTSUPP;
1163 goto end_io; 1166 goto end_io;
1164 } 1167 }
1165 1168
1166 spin_lock_irq(q->queue_lock); 1169 spin_lock_irq(q->queue_lock);
1167 1170
1168 if (unlikely(barrier) || elv_queue_empty(q)) 1171 if (unlikely(barrier) || elv_queue_empty(q))
1169 goto get_rq; 1172 goto get_rq;
1170 1173
1171 el_ret = elv_merge(q, &req, bio); 1174 el_ret = elv_merge(q, &req, bio);
1172 switch (el_ret) { 1175 switch (el_ret) {
1173 case ELEVATOR_BACK_MERGE: 1176 case ELEVATOR_BACK_MERGE:
1174 BUG_ON(!rq_mergeable(req)); 1177 BUG_ON(!rq_mergeable(req));
1175 1178
1176 if (!ll_back_merge_fn(q, req, bio)) 1179 if (!ll_back_merge_fn(q, req, bio))
1177 break; 1180 break;
1178 1181
1179 trace_block_bio_backmerge(q, bio); 1182 trace_block_bio_backmerge(q, bio);
1180 1183
1181 req->biotail->bi_next = bio; 1184 req->biotail->bi_next = bio;
1182 req->biotail = bio; 1185 req->biotail = bio;
1183 req->nr_sectors = req->hard_nr_sectors += nr_sectors; 1186 req->nr_sectors = req->hard_nr_sectors += nr_sectors;
1184 req->ioprio = ioprio_best(req->ioprio, prio); 1187 req->ioprio = ioprio_best(req->ioprio, prio);
1185 if (!blk_rq_cpu_valid(req)) 1188 if (!blk_rq_cpu_valid(req))
1186 req->cpu = bio->bi_comp_cpu; 1189 req->cpu = bio->bi_comp_cpu;
1187 drive_stat_acct(req, 0); 1190 drive_stat_acct(req, 0);
1188 if (!attempt_back_merge(q, req)) 1191 if (!attempt_back_merge(q, req))
1189 elv_merged_request(q, req, el_ret); 1192 elv_merged_request(q, req, el_ret);
1190 goto out; 1193 goto out;
1191 1194
1192 case ELEVATOR_FRONT_MERGE: 1195 case ELEVATOR_FRONT_MERGE:
1193 BUG_ON(!rq_mergeable(req)); 1196 BUG_ON(!rq_mergeable(req));
1194 1197
1195 if (!ll_front_merge_fn(q, req, bio)) 1198 if (!ll_front_merge_fn(q, req, bio))
1196 break; 1199 break;
1197 1200
1198 trace_block_bio_frontmerge(q, bio); 1201 trace_block_bio_frontmerge(q, bio);
1199 1202
1200 bio->bi_next = req->bio; 1203 bio->bi_next = req->bio;
1201 req->bio = bio; 1204 req->bio = bio;
1202 1205
1203 /* 1206 /*
1204 * may not be valid. if the low level driver said 1207 * may not be valid. if the low level driver said
1205 * it didn't need a bounce buffer then it better 1208 * it didn't need a bounce buffer then it better
1206 * not touch req->buffer either... 1209 * not touch req->buffer either...
1207 */ 1210 */
1208 req->buffer = bio_data(bio); 1211 req->buffer = bio_data(bio);
1209 req->current_nr_sectors = bio_cur_sectors(bio); 1212 req->current_nr_sectors = bio_cur_sectors(bio);
1210 req->hard_cur_sectors = req->current_nr_sectors; 1213 req->hard_cur_sectors = req->current_nr_sectors;
1211 req->sector = req->hard_sector = bio->bi_sector; 1214 req->sector = req->hard_sector = bio->bi_sector;
1212 req->nr_sectors = req->hard_nr_sectors += nr_sectors; 1215 req->nr_sectors = req->hard_nr_sectors += nr_sectors;
1213 req->ioprio = ioprio_best(req->ioprio, prio); 1216 req->ioprio = ioprio_best(req->ioprio, prio);
1214 if (!blk_rq_cpu_valid(req)) 1217 if (!blk_rq_cpu_valid(req))
1215 req->cpu = bio->bi_comp_cpu; 1218 req->cpu = bio->bi_comp_cpu;
1216 drive_stat_acct(req, 0); 1219 drive_stat_acct(req, 0);
1217 if (!attempt_front_merge(q, req)) 1220 if (!attempt_front_merge(q, req))
1218 elv_merged_request(q, req, el_ret); 1221 elv_merged_request(q, req, el_ret);
1219 goto out; 1222 goto out;
1220 1223
1221 /* ELV_NO_MERGE: elevator says don't/can't merge. */ 1224 /* ELV_NO_MERGE: elevator says don't/can't merge. */
1222 default: 1225 default:
1223 ; 1226 ;
1224 } 1227 }
1225 1228
1226 get_rq: 1229 get_rq:
1227 /* 1230 /*
1228 * This sync check and mask will be re-done in init_request_from_bio(), 1231 * This sync check and mask will be re-done in init_request_from_bio(),
1229 * but we need to set it earlier to expose the sync flag to the 1232 * but we need to set it earlier to expose the sync flag to the
1230 * rq allocator and io schedulers. 1233 * rq allocator and io schedulers.
1231 */ 1234 */
1232 rw_flags = bio_data_dir(bio); 1235 rw_flags = bio_data_dir(bio);
1233 if (sync) 1236 if (sync)
1234 rw_flags |= REQ_RW_SYNC; 1237 rw_flags |= REQ_RW_SYNC;
1235 1238
1236 /* 1239 /*
1237 * Grab a free request. This might sleep but cannot fail. 1240 * Grab a free request. This might sleep but cannot fail.
1238 * Returns with the queue unlocked. 1241 * Returns with the queue unlocked.
1239 */ 1242 */
1240 req = get_request_wait(q, rw_flags, bio); 1243 req = get_request_wait(q, rw_flags, bio);
1241 1244
1242 /* 1245 /*
1243 * After dropping the lock and possibly sleeping here, our request 1246 * After dropping the lock and possibly sleeping here, our request
1244 * may now be mergeable after it had proven unmergeable (above). 1247 * may now be mergeable after it had proven unmergeable (above).
1245 * We don't worry about that case for efficiency. It won't happen 1248 * We don't worry about that case for efficiency. It won't happen
1246 * often, and the elevators are able to handle it. 1249 * often, and the elevators are able to handle it.
1247 */ 1250 */
1248 init_request_from_bio(req, bio); 1251 init_request_from_bio(req, bio);
1249 1252
1250 spin_lock_irq(q->queue_lock); 1253 spin_lock_irq(q->queue_lock);
1251 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || 1254 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
1252 bio_flagged(bio, BIO_CPU_AFFINE)) 1255 bio_flagged(bio, BIO_CPU_AFFINE))
1253 req->cpu = blk_cpu_to_group(smp_processor_id()); 1256 req->cpu = blk_cpu_to_group(smp_processor_id());
1254 if (elv_queue_empty(q)) 1257 if (elv_queue_empty(q))
1255 blk_plug_device(q); 1258 blk_plug_device(q);
1256 add_request(q, req); 1259 add_request(q, req);
1257 out: 1260 out:
1258 if (sync) 1261 if (sync)
1259 __generic_unplug_device(q); 1262 __generic_unplug_device(q);
1260 spin_unlock_irq(q->queue_lock); 1263 spin_unlock_irq(q->queue_lock);
1261 return 0; 1264 return 0;
1262 1265
1263 end_io: 1266 end_io:
1264 bio_endio(bio, err); 1267 bio_endio(bio, err);
1265 return 0; 1268 return 0;
1266 } 1269 }
1267 1270
1268 /* 1271 /*
1269 * If bio->bi_dev is a partition, remap the location 1272 * If bio->bi_dev is a partition, remap the location
1270 */ 1273 */
1271 static inline void blk_partition_remap(struct bio *bio) 1274 static inline void blk_partition_remap(struct bio *bio)
1272 { 1275 {
1273 struct block_device *bdev = bio->bi_bdev; 1276 struct block_device *bdev = bio->bi_bdev;
1274 1277
1275 if (bio_sectors(bio) && bdev != bdev->bd_contains) { 1278 if (bio_sectors(bio) && bdev != bdev->bd_contains) {
1276 struct hd_struct *p = bdev->bd_part; 1279 struct hd_struct *p = bdev->bd_part;
1277 1280
1278 bio->bi_sector += p->start_sect; 1281 bio->bi_sector += p->start_sect;
1279 bio->bi_bdev = bdev->bd_contains; 1282 bio->bi_bdev = bdev->bd_contains;
1280 1283
1281 trace_block_remap(bdev_get_queue(bio->bi_bdev), bio, 1284 trace_block_remap(bdev_get_queue(bio->bi_bdev), bio,
1282 bdev->bd_dev, bio->bi_sector, 1285 bdev->bd_dev, bio->bi_sector,
1283 bio->bi_sector - p->start_sect); 1286 bio->bi_sector - p->start_sect);
1284 } 1287 }
1285 } 1288 }
1286 1289
1287 static void handle_bad_sector(struct bio *bio) 1290 static void handle_bad_sector(struct bio *bio)
1288 { 1291 {
1289 char b[BDEVNAME_SIZE]; 1292 char b[BDEVNAME_SIZE];
1290 1293
1291 printk(KERN_INFO "attempt to access beyond end of device\n"); 1294 printk(KERN_INFO "attempt to access beyond end of device\n");
1292 printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n", 1295 printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n",
1293 bdevname(bio->bi_bdev, b), 1296 bdevname(bio->bi_bdev, b),
1294 bio->bi_rw, 1297 bio->bi_rw,
1295 (unsigned long long)bio->bi_sector + bio_sectors(bio), 1298 (unsigned long long)bio->bi_sector + bio_sectors(bio),
1296 (long long)(bio->bi_bdev->bd_inode->i_size >> 9)); 1299 (long long)(bio->bi_bdev->bd_inode->i_size >> 9));
1297 1300
1298 set_bit(BIO_EOF, &bio->bi_flags); 1301 set_bit(BIO_EOF, &bio->bi_flags);
1299 } 1302 }
1300 1303
1301 #ifdef CONFIG_FAIL_MAKE_REQUEST 1304 #ifdef CONFIG_FAIL_MAKE_REQUEST
1302 1305
1303 static DECLARE_FAULT_ATTR(fail_make_request); 1306 static DECLARE_FAULT_ATTR(fail_make_request);
1304 1307
1305 static int __init setup_fail_make_request(char *str) 1308 static int __init setup_fail_make_request(char *str)
1306 { 1309 {
1307 return setup_fault_attr(&fail_make_request, str); 1310 return setup_fault_attr(&fail_make_request, str);
1308 } 1311 }
1309 __setup("fail_make_request=", setup_fail_make_request); 1312 __setup("fail_make_request=", setup_fail_make_request);
1310 1313
1311 static int should_fail_request(struct bio *bio) 1314 static int should_fail_request(struct bio *bio)
1312 { 1315 {
1313 struct hd_struct *part = bio->bi_bdev->bd_part; 1316 struct hd_struct *part = bio->bi_bdev->bd_part;
1314 1317
1315 if (part_to_disk(part)->part0.make_it_fail || part->make_it_fail) 1318 if (part_to_disk(part)->part0.make_it_fail || part->make_it_fail)
1316 return should_fail(&fail_make_request, bio->bi_size); 1319 return should_fail(&fail_make_request, bio->bi_size);
1317 1320
1318 return 0; 1321 return 0;
1319 } 1322 }
1320 1323
1321 static int __init fail_make_request_debugfs(void) 1324 static int __init fail_make_request_debugfs(void)
1322 { 1325 {
1323 return init_fault_attr_dentries(&fail_make_request, 1326 return init_fault_attr_dentries(&fail_make_request,
1324 "fail_make_request"); 1327 "fail_make_request");
1325 } 1328 }
1326 1329
1327 late_initcall(fail_make_request_debugfs); 1330 late_initcall(fail_make_request_debugfs);
1328 1331
1329 #else /* CONFIG_FAIL_MAKE_REQUEST */ 1332 #else /* CONFIG_FAIL_MAKE_REQUEST */
1330 1333
1331 static inline int should_fail_request(struct bio *bio) 1334 static inline int should_fail_request(struct bio *bio)
1332 { 1335 {
1333 return 0; 1336 return 0;
1334 } 1337 }
1335 1338
1336 #endif /* CONFIG_FAIL_MAKE_REQUEST */ 1339 #endif /* CONFIG_FAIL_MAKE_REQUEST */
1337 1340
1338 /* 1341 /*
1339 * Check whether this bio extends beyond the end of the device. 1342 * Check whether this bio extends beyond the end of the device.
1340 */ 1343 */
1341 static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) 1344 static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
1342 { 1345 {
1343 sector_t maxsector; 1346 sector_t maxsector;
1344 1347
1345 if (!nr_sectors) 1348 if (!nr_sectors)
1346 return 0; 1349 return 0;
1347 1350
1348 /* Test device or partition size, when known. */ 1351 /* Test device or partition size, when known. */
1349 maxsector = bio->bi_bdev->bd_inode->i_size >> 9; 1352 maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
1350 if (maxsector) { 1353 if (maxsector) {
1351 sector_t sector = bio->bi_sector; 1354 sector_t sector = bio->bi_sector;
1352 1355
1353 if (maxsector < nr_sectors || maxsector - nr_sectors < sector) { 1356 if (maxsector < nr_sectors || maxsector - nr_sectors < sector) {
1354 /* 1357 /*
1355 * This may well happen - the kernel calls bread() 1358 * This may well happen - the kernel calls bread()
1356 * without checking the size of the device, e.g., when 1359 * without checking the size of the device, e.g., when
1357 * mounting a device. 1360 * mounting a device.
1358 */ 1361 */
1359 handle_bad_sector(bio); 1362 handle_bad_sector(bio);
1360 return 1; 1363 return 1;
1361 } 1364 }
1362 } 1365 }
1363 1366
1364 return 0; 1367 return 0;
1365 } 1368 }
1366 1369
1367 /** 1370 /**
1368 * generic_make_request - hand a buffer to its device driver for I/O 1371 * generic_make_request - hand a buffer to its device driver for I/O
1369 * @bio: The bio describing the location in memory and on the device. 1372 * @bio: The bio describing the location in memory and on the device.
1370 * 1373 *
1371 * generic_make_request() is used to make I/O requests of block 1374 * generic_make_request() is used to make I/O requests of block
1372 * devices. It is passed a &struct bio, which describes the I/O that needs 1375 * devices. It is passed a &struct bio, which describes the I/O that needs
1373 * to be done. 1376 * to be done.
1374 * 1377 *
1375 * generic_make_request() does not return any status. The 1378 * generic_make_request() does not return any status. The
1376 * success/failure status of the request, along with notification of 1379 * success/failure status of the request, along with notification of
1377 * completion, is delivered asynchronously through the bio->bi_end_io 1380 * completion, is delivered asynchronously through the bio->bi_end_io
1378 * function described (one day) elsewhere. 1381 * function described (one day) elsewhere.
1379 * 1382 *
1380 * The caller of generic_make_request must make sure that bi_io_vec 1383 * The caller of generic_make_request must make sure that bi_io_vec
1381 * are set to describe the memory buffer, and that bi_dev and bi_sector are 1384 * are set to describe the memory buffer, and that bi_dev and bi_sector are
1382 * set to describe the device address, and the 1385 * set to describe the device address, and the
1383 * bi_end_io and optionally bi_private are set to describe how 1386 * bi_end_io and optionally bi_private are set to describe how
1384 * completion notification should be signaled. 1387 * completion notification should be signaled.
1385 * 1388 *
1386 * generic_make_request and the drivers it calls may use bi_next if this 1389 * generic_make_request and the drivers it calls may use bi_next if this
1387 * bio happens to be merged with someone else, and may change bi_dev and 1390 * bio happens to be merged with someone else, and may change bi_dev and
1388 * bi_sector for remaps as it sees fit. So the values of these fields 1391 * bi_sector for remaps as it sees fit. So the values of these fields
1389 * should NOT be depended on after the call to generic_make_request. 1392 * should NOT be depended on after the call to generic_make_request.
1390 */ 1393 */
1391 static inline void __generic_make_request(struct bio *bio) 1394 static inline void __generic_make_request(struct bio *bio)
1392 { 1395 {
1393 struct request_queue *q; 1396 struct request_queue *q;
1394 sector_t old_sector; 1397 sector_t old_sector;
1395 int ret, nr_sectors = bio_sectors(bio); 1398 int ret, nr_sectors = bio_sectors(bio);
1396 dev_t old_dev; 1399 dev_t old_dev;
1397 int err = -EIO; 1400 int err = -EIO;
1398 1401
1399 might_sleep(); 1402 might_sleep();
1400 1403
1401 if (bio_check_eod(bio, nr_sectors)) 1404 if (bio_check_eod(bio, nr_sectors))
1402 goto end_io; 1405 goto end_io;
1403 1406
1404 /* 1407 /*
1405 * Resolve the mapping until finished. (drivers are 1408 * Resolve the mapping until finished. (drivers are
1406 * still free to implement/resolve their own stacking 1409 * still free to implement/resolve their own stacking
1407 * by explicitly returning 0) 1410 * by explicitly returning 0)
1408 * 1411 *
1409 * NOTE: we don't repeat the blk_size check for each new device. 1412 * NOTE: we don't repeat the blk_size check for each new device.
1410 * Stacking drivers are expected to know what they are doing. 1413 * Stacking drivers are expected to know what they are doing.
1411 */ 1414 */
1412 old_sector = -1; 1415 old_sector = -1;
1413 old_dev = 0; 1416 old_dev = 0;
1414 do { 1417 do {
1415 char b[BDEVNAME_SIZE]; 1418 char b[BDEVNAME_SIZE];
1416 1419
1417 q = bdev_get_queue(bio->bi_bdev); 1420 q = bdev_get_queue(bio->bi_bdev);
1418 if (!q) { 1421 if (!q) {
1419 printk(KERN_ERR 1422 printk(KERN_ERR
1420 "generic_make_request: Trying to access " 1423 "generic_make_request: Trying to access "
1421 "nonexistent block-device %s (%Lu)\n", 1424 "nonexistent block-device %s (%Lu)\n",
1422 bdevname(bio->bi_bdev, b), 1425 bdevname(bio->bi_bdev, b),
1423 (long long) bio->bi_sector); 1426 (long long) bio->bi_sector);
1424 end_io: 1427 end_io:
1425 bio_endio(bio, err); 1428 bio_endio(bio, err);
1426 break; 1429 break;
1427 } 1430 }
1428 1431
1429 if (unlikely(nr_sectors > q->max_hw_sectors)) { 1432 if (unlikely(nr_sectors > q->max_hw_sectors)) {
1430 printk(KERN_ERR "bio too big device %s (%u > %u)\n", 1433 printk(KERN_ERR "bio too big device %s (%u > %u)\n",
1431 bdevname(bio->bi_bdev, b), 1434 bdevname(bio->bi_bdev, b),
1432 bio_sectors(bio), 1435 bio_sectors(bio),
1433 q->max_hw_sectors); 1436 q->max_hw_sectors);
1434 goto end_io; 1437 goto end_io;
1435 } 1438 }
1436 1439
1437 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) 1440 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
1438 goto end_io; 1441 goto end_io;
1439 1442
1440 if (should_fail_request(bio)) 1443 if (should_fail_request(bio))
1441 goto end_io; 1444 goto end_io;
1442 1445
1443 /* 1446 /*
1444 * If this device has partitions, remap block n 1447 * If this device has partitions, remap block n
1445 * of partition p to block n+start(p) of the disk. 1448 * of partition p to block n+start(p) of the disk.
1446 */ 1449 */
1447 blk_partition_remap(bio); 1450 blk_partition_remap(bio);
1448 1451
1449 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) 1452 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
1450 goto end_io; 1453 goto end_io;
1451 1454
1452 if (old_sector != -1) 1455 if (old_sector != -1)
1453 trace_block_remap(q, bio, old_dev, bio->bi_sector, 1456 trace_block_remap(q, bio, old_dev, bio->bi_sector,
1454 old_sector); 1457 old_sector);
1455 1458
1456 trace_block_bio_queue(q, bio); 1459 trace_block_bio_queue(q, bio);
1457 1460
1458 old_sector = bio->bi_sector; 1461 old_sector = bio->bi_sector;
1459 old_dev = bio->bi_bdev->bd_dev; 1462 old_dev = bio->bi_bdev->bd_dev;
1460 1463
1461 if (bio_check_eod(bio, nr_sectors)) 1464 if (bio_check_eod(bio, nr_sectors))
1462 goto end_io; 1465 goto end_io;
1463 if ((bio_empty_barrier(bio) && !q->prepare_flush_fn) || 1466 if ((bio_empty_barrier(bio) && !q->prepare_flush_fn) ||
1464 (bio_discard(bio) && !q->prepare_discard_fn)) { 1467 (bio_discard(bio) && !q->prepare_discard_fn)) {
1465 err = -EOPNOTSUPP; 1468 err = -EOPNOTSUPP;
1466 goto end_io; 1469 goto end_io;
1467 } 1470 }
1468 1471
1469 ret = q->make_request_fn(q, bio); 1472 ret = q->make_request_fn(q, bio);
1470 } while (ret); 1473 } while (ret);
1471 } 1474 }
1472 1475
1473 /* 1476 /*
1474 * We only want one ->make_request_fn to be active at a time, 1477 * We only want one ->make_request_fn to be active at a time,
1475 * else stack usage with stacked devices could be a problem. 1478 * else stack usage with stacked devices could be a problem.
1476 * So use current->bio_{list,tail} to keep a list of requests 1479 * So use current->bio_{list,tail} to keep a list of requests
1477 * submitted by a make_request_fn function. 1480 * submitted by a make_request_fn function.
1478 * current->bio_tail is also used as a flag to say if 1481 * current->bio_tail is also used as a flag to say if
1479 * generic_make_request is currently active in this task or not. 1482 * generic_make_request is currently active in this task or not.
1480 * If it is NULL, then no make_request is active. If it is non-NULL, 1483 * If it is NULL, then no make_request is active. If it is non-NULL,
1481 * then a make_request is active, and new requests should be added 1484 * then a make_request is active, and new requests should be added
1482 * at the tail 1485 * at the tail
1483 */ 1486 */
1484 void generic_make_request(struct bio *bio) 1487 void generic_make_request(struct bio *bio)
1485 { 1488 {
1486 if (current->bio_tail) { 1489 if (current->bio_tail) {
1487 /* make_request is active */ 1490 /* make_request is active */
1488 *(current->bio_tail) = bio; 1491 *(current->bio_tail) = bio;
1489 bio->bi_next = NULL; 1492 bio->bi_next = NULL;
1490 current->bio_tail = &bio->bi_next; 1493 current->bio_tail = &bio->bi_next;
1491 return; 1494 return;
1492 } 1495 }
1493 /* following loop may be a bit non-obvious, and so deserves some 1496 /* following loop may be a bit non-obvious, and so deserves some
1494 * explanation. 1497 * explanation.
1495 * Before entering the loop, bio->bi_next is NULL (as all callers 1498 * Before entering the loop, bio->bi_next is NULL (as all callers
1496 * ensure that) so we have a list with a single bio. 1499 * ensure that) so we have a list with a single bio.
1497 * We pretend that we have just taken it off a longer list, so 1500 * We pretend that we have just taken it off a longer list, so
1498 * we assign bio_list to the next (which is NULL) and bio_tail 1501 * we assign bio_list to the next (which is NULL) and bio_tail
1499 * to &bio_list, thus initialising the bio_list of new bios to be 1502 * to &bio_list, thus initialising the bio_list of new bios to be
1500 * added. __generic_make_request may indeed add some more bios 1503 * added. __generic_make_request may indeed add some more bios
1501 * through a recursive call to generic_make_request. If it 1504 * through a recursive call to generic_make_request. If it
1502 * did, we find a non-NULL value in bio_list and re-enter the loop 1505 * did, we find a non-NULL value in bio_list and re-enter the loop
1503 * from the top. In this case we really did just take the bio 1506 * from the top. In this case we really did just take the bio
1504 * of the top of the list (no pretending) and so fixup bio_list and 1507 * of the top of the list (no pretending) and so fixup bio_list and
1505 * bio_tail or bi_next, and call into __generic_make_request again. 1508 * bio_tail or bi_next, and call into __generic_make_request again.
1506 * 1509 *
1507 * The loop was structured like this to make only one call to 1510 * The loop was structured like this to make only one call to
1508 * __generic_make_request (which is important as it is large and 1511 * __generic_make_request (which is important as it is large and
1509 * inlined) and to keep the structure simple. 1512 * inlined) and to keep the structure simple.
1510 */ 1513 */
1511 BUG_ON(bio->bi_next); 1514 BUG_ON(bio->bi_next);
1512 do { 1515 do {
1513 current->bio_list = bio->bi_next; 1516 current->bio_list = bio->bi_next;
1514 if (bio->bi_next == NULL) 1517 if (bio->bi_next == NULL)
1515 current->bio_tail = &current->bio_list; 1518 current->bio_tail = &current->bio_list;
1516 else 1519 else
1517 bio->bi_next = NULL; 1520 bio->bi_next = NULL;
1518 __generic_make_request(bio); 1521 __generic_make_request(bio);
1519 bio = current->bio_list; 1522 bio = current->bio_list;
1520 } while (bio); 1523 } while (bio);
1521 current->bio_tail = NULL; /* deactivate */ 1524 current->bio_tail = NULL; /* deactivate */
1522 } 1525 }
1523 EXPORT_SYMBOL(generic_make_request); 1526 EXPORT_SYMBOL(generic_make_request);
1524 1527
1525 /** 1528 /**
1526 * submit_bio - submit a bio to the block device layer for I/O 1529 * submit_bio - submit a bio to the block device layer for I/O
1527 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) 1530 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
1528 * @bio: The &struct bio which describes the I/O 1531 * @bio: The &struct bio which describes the I/O
1529 * 1532 *
1530 * submit_bio() is very similar in purpose to generic_make_request(), and 1533 * submit_bio() is very similar in purpose to generic_make_request(), and
1531 * uses that function to do most of the work. Both are fairly rough 1534 * uses that function to do most of the work. Both are fairly rough
1532 * interfaces; @bio must be presetup and ready for I/O. 1535 * interfaces; @bio must be presetup and ready for I/O.
1533 * 1536 *
1534 */ 1537 */
1535 void submit_bio(int rw, struct bio *bio) 1538 void submit_bio(int rw, struct bio *bio)
1536 { 1539 {
1537 int count = bio_sectors(bio); 1540 int count = bio_sectors(bio);
1538 1541
1539 bio->bi_rw |= rw; 1542 bio->bi_rw |= rw;
1540 1543
1541 /* 1544 /*
1542 * If it's a regular read/write or a barrier with data attached, 1545 * If it's a regular read/write or a barrier with data attached,
1543 * go through the normal accounting stuff before submission. 1546 * go through the normal accounting stuff before submission.
1544 */ 1547 */
1545 if (bio_has_data(bio)) { 1548 if (bio_has_data(bio)) {
1546 if (rw & WRITE) { 1549 if (rw & WRITE) {
1547 count_vm_events(PGPGOUT, count); 1550 count_vm_events(PGPGOUT, count);
1548 } else { 1551 } else {
1549 task_io_account_read(bio->bi_size); 1552 task_io_account_read(bio->bi_size);
1550 count_vm_events(PGPGIN, count); 1553 count_vm_events(PGPGIN, count);
1551 } 1554 }
1552 1555
1553 if (unlikely(block_dump)) { 1556 if (unlikely(block_dump)) {
1554 char b[BDEVNAME_SIZE]; 1557 char b[BDEVNAME_SIZE];
1555 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", 1558 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
1556 current->comm, task_pid_nr(current), 1559 current->comm, task_pid_nr(current),
1557 (rw & WRITE) ? "WRITE" : "READ", 1560 (rw & WRITE) ? "WRITE" : "READ",
1558 (unsigned long long)bio->bi_sector, 1561 (unsigned long long)bio->bi_sector,
1559 bdevname(bio->bi_bdev, b)); 1562 bdevname(bio->bi_bdev, b));
1560 } 1563 }
1561 } 1564 }
1562 1565
1563 generic_make_request(bio); 1566 generic_make_request(bio);
1564 } 1567 }
1565 EXPORT_SYMBOL(submit_bio); 1568 EXPORT_SYMBOL(submit_bio);
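For illustration only: a minimal read using the fields generic_make_request() documents as mandatory -- bi_bdev, bi_sector, bi_io_vec (via bio_add_page()) and bi_end_io -- handed off through submit_bio(). The example_ names are hypothetical and error handling is kept minimal; the usual <linux/bio.h>/<linux/blkdev.h> includes are assumed.

/* illustrative sketch -- not part of this change */
static void example_end_io(struct bio *bio, int error)
{
	/* completion runs asynchronously; error is 0 or a negative errno */
	bio_put(bio);
}

static int example_read_sector(struct block_device *bdev, sector_t sector,
			       struct page *page)
{
	struct bio *bio = bio_alloc(GFP_KERNEL, 1);

	if (!bio)
		return -ENOMEM;
	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio->bi_end_io = example_end_io;
	if (bio_add_page(bio, page, 512, 0) != 512) {
		bio_put(bio);
		return -EIO;
	}
	submit_bio(READ, bio);
	return 0;
}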
1566 1569
1567 /** 1570 /**
1568 * blk_rq_check_limits - Helper function to check a request for the queue limit 1571 * blk_rq_check_limits - Helper function to check a request for the queue limit
1569 * @q: the queue 1572 * @q: the queue
1570 * @rq: the request being checked 1573 * @rq: the request being checked
1571 * 1574 *
1572 * Description: 1575 * Description:
1573 * @rq may have been made based on weaker limitations of upper-level queues 1576 * @rq may have been made based on weaker limitations of upper-level queues
1574 * in request stacking drivers, and it may violate the limitation of @q. 1577 * in request stacking drivers, and it may violate the limitation of @q.
1575 * Since the block layer and the underlying device driver trust @rq 1578 * Since the block layer and the underlying device driver trust @rq
1576 * after it is inserted to @q, it should be checked against @q before 1579 * after it is inserted to @q, it should be checked against @q before
1577 * the insertion using this generic function. 1580 * the insertion using this generic function.
1578 * 1581 *
1579 * This function should also be useful for request stacking drivers 1582 * This function should also be useful for request stacking drivers
1580 * in some cases below, so export this function. 1583 * in some cases below, so export this function.
1581 * Request stacking drivers like request-based dm may change the queue 1584 * Request stacking drivers like request-based dm may change the queue
1582 * limits while requests are in the queue (e.g. dm's table swapping). 1585 * limits while requests are in the queue (e.g. dm's table swapping).
1583 * Such request stacking drivers should check those requests against 1586 * Such request stacking drivers should check those requests against
1584 * the new queue limits again when they dispatch those requests, 1587 * the new queue limits again when they dispatch those requests,
1585 * although such checks are also done against the old queue limits 1588 * although such checks are also done against the old queue limits
1586 * when submitting requests. 1589 * when submitting requests.
1587 */ 1590 */
1588 int blk_rq_check_limits(struct request_queue *q, struct request *rq) 1591 int blk_rq_check_limits(struct request_queue *q, struct request *rq)
1589 { 1592 {
1590 if (rq->nr_sectors > q->max_sectors || 1593 if (rq->nr_sectors > q->max_sectors ||
1591 rq->data_len > q->max_hw_sectors << 9) { 1594 rq->data_len > q->max_hw_sectors << 9) {
1592 printk(KERN_ERR "%s: over max size limit.\n", __func__); 1595 printk(KERN_ERR "%s: over max size limit.\n", __func__);
1593 return -EIO; 1596 return -EIO;
1594 } 1597 }
1595 1598
1596 /* 1599 /*
1597 * queue's settings related to segment counting like q->bounce_pfn 1600 * queue's settings related to segment counting like q->bounce_pfn
1598 * may differ from that of other stacking queues. 1601 * may differ from that of other stacking queues.
1599 * Recalculate it to check the request correctly on this queue's 1602 * Recalculate it to check the request correctly on this queue's
1600 * limitation. 1603 * limitation.
1601 */ 1604 */
1602 blk_recalc_rq_segments(rq); 1605 blk_recalc_rq_segments(rq);
1603 if (rq->nr_phys_segments > q->max_phys_segments || 1606 if (rq->nr_phys_segments > q->max_phys_segments ||
1604 rq->nr_phys_segments > q->max_hw_segments) { 1607 rq->nr_phys_segments > q->max_hw_segments) {
1605 printk(KERN_ERR "%s: over max segments limit.\n", __func__); 1608 printk(KERN_ERR "%s: over max segments limit.\n", __func__);
1606 return -EIO; 1609 return -EIO;
1607 } 1610 }
1608 1611
1609 return 0; 1612 return 0;
1610 } 1613 }
1611 EXPORT_SYMBOL_GPL(blk_rq_check_limits); 1614 EXPORT_SYMBOL_GPL(blk_rq_check_limits);
1612 1615
1613 /** 1616 /**
1614 * blk_insert_cloned_request - Helper for stacking drivers to submit a request 1617 * blk_insert_cloned_request - Helper for stacking drivers to submit a request
1615 * @q: the queue to submit the request 1618 * @q: the queue to submit the request
1616 * @rq: the request being queued 1619 * @rq: the request being queued
1617 */ 1620 */
1618 int blk_insert_cloned_request(struct request_queue *q, struct request *rq) 1621 int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
1619 { 1622 {
1620 unsigned long flags; 1623 unsigned long flags;
1621 1624
1622 if (blk_rq_check_limits(q, rq)) 1625 if (blk_rq_check_limits(q, rq))
1623 return -EIO; 1626 return -EIO;
1624 1627
1625 #ifdef CONFIG_FAIL_MAKE_REQUEST 1628 #ifdef CONFIG_FAIL_MAKE_REQUEST
1626 if (rq->rq_disk && rq->rq_disk->part0.make_it_fail && 1629 if (rq->rq_disk && rq->rq_disk->part0.make_it_fail &&
1627 should_fail(&fail_make_request, blk_rq_bytes(rq))) 1630 should_fail(&fail_make_request, blk_rq_bytes(rq)))
1628 return -EIO; 1631 return -EIO;
1629 #endif 1632 #endif
1630 1633
1631 spin_lock_irqsave(q->queue_lock, flags); 1634 spin_lock_irqsave(q->queue_lock, flags);
1632 1635
1633 /* 1636 /*
1634 * Submitting request must be dequeued before calling this function 1637 * Submitting request must be dequeued before calling this function
1635 * because it will be linked to another request_queue 1638 * because it will be linked to another request_queue
1636 */ 1639 */
1637 BUG_ON(blk_queued_rq(rq)); 1640 BUG_ON(blk_queued_rq(rq));
1638 1641
1639 drive_stat_acct(rq, 1); 1642 drive_stat_acct(rq, 1);
1640 __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0); 1643 __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);
1641 1644
1642 spin_unlock_irqrestore(q->queue_lock, flags); 1645 spin_unlock_irqrestore(q->queue_lock, flags);
1643 1646
1644 return 0; 1647 return 0;
1645 } 1648 }
1646 EXPORT_SYMBOL_GPL(blk_insert_cloned_request); 1649 EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
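A hedged sketch of how a request-stacking driver (request-based dm is the example the comments above give) might hand an already-prepared clone to a lower-level queue; clone setup and retry policy are elided, and the function name is invented.

/* illustrative sketch -- not part of this change */
static int example_dispatch_clone(struct request_queue *lower_q,
				  struct request *clone)
{
	int ret;

	/* re-checks the clone against lower_q's limits, then queues it */
	ret = blk_insert_cloned_request(lower_q, clone);
	if (ret)
		printk(KERN_ERR "example: clone rejected by lower queue\n");
	return ret;
}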
1647 1650
1648 /** 1651 /**
1649 * blkdev_dequeue_request - dequeue request and start timeout timer 1652 * blkdev_dequeue_request - dequeue request and start timeout timer
1650 * @req: request to dequeue 1653 * @req: request to dequeue
1651 * 1654 *
1652 * Dequeue @req and start timeout timer on it. This hands off the 1655 * Dequeue @req and start timeout timer on it. This hands off the
1653 * request to the driver. 1656 * request to the driver.
1654 * 1657 *
1655 * Block internal functions which don't want to start timer should 1658 * Block internal functions which don't want to start timer should
1656 * call elv_dequeue_request(). 1659 * call elv_dequeue_request().
1657 */ 1660 */
1658 void blkdev_dequeue_request(struct request *req) 1661 void blkdev_dequeue_request(struct request *req)
1659 { 1662 {
1660 elv_dequeue_request(req->q, req); 1663 elv_dequeue_request(req->q, req);
1661 1664
1662 /* 1665 /*
1663 * We are now handing the request to the hardware, add the 1666 * We are now handing the request to the hardware, add the
1664 * timeout handler. 1667 * timeout handler.
1665 */ 1668 */
1666 blk_add_timer(req); 1669 blk_add_timer(req);
1667 } 1670 }
1668 EXPORT_SYMBOL(blkdev_dequeue_request); 1671 EXPORT_SYMBOL(blkdev_dequeue_request);
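A minimal, hypothetical ->request_fn() loop showing where blkdev_dequeue_request() sits: the request comes off the queue (which also arms its timeout) immediately before being handed to the hardware. elv_next_request() is the peek helper of this kernel era; the hardware hand-off itself is left as a comment.

/* illustrative sketch -- not part of this change; runs with q->queue_lock held */
static void example_request_fn(struct request_queue *q)
{
	struct request *rq;

	while ((rq = elv_next_request(q)) != NULL) {
		blkdev_dequeue_request(rq);	/* starts the timeout timer */
		/* hand rq to the hardware here; complete it later with
		 * blk_end_request() from the completion path */
	}
}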
1669 1672
1670 /** 1673 /**
1671 * __end_that_request_first - end I/O on a request 1674 * __end_that_request_first - end I/O on a request
1672 * @req: the request being processed 1675 * @req: the request being processed
1673 * @error: %0 for success, < %0 for error 1676 * @error: %0 for success, < %0 for error
1674 * @nr_bytes: number of bytes to complete 1677 * @nr_bytes: number of bytes to complete
1675 * 1678 *
1676 * Description: 1679 * Description:
1677 * Ends I/O on a number of bytes attached to @req, and sets it up 1680 * Ends I/O on a number of bytes attached to @req, and sets it up
1678 * for the next range of segments (if any) in the cluster. 1681 * for the next range of segments (if any) in the cluster.
1679 * 1682 *
1680 * Return: 1683 * Return:
1681 * %0 - we are done with this request, call end_that_request_last() 1684 * %0 - we are done with this request, call end_that_request_last()
1682 * %1 - still buffers pending for this request 1685 * %1 - still buffers pending for this request
1683 **/ 1686 **/
1684 static int __end_that_request_first(struct request *req, int error, 1687 static int __end_that_request_first(struct request *req, int error,
1685 int nr_bytes) 1688 int nr_bytes)
1686 { 1689 {
1687 int total_bytes, bio_nbytes, next_idx = 0; 1690 int total_bytes, bio_nbytes, next_idx = 0;
1688 struct bio *bio; 1691 struct bio *bio;
1689 1692
1690 trace_block_rq_complete(req->q, req); 1693 trace_block_rq_complete(req->q, req);
1691 1694
1692 /* 1695 /*
1693 * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual 1696 * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual
1694 * sense key with us all the way through 1697 * sense key with us all the way through
1695 */ 1698 */
1696 if (!blk_pc_request(req)) 1699 if (!blk_pc_request(req))
1697 req->errors = 0; 1700 req->errors = 0;
1698 1701
1699 if (error && (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))) { 1702 if (error && (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))) {
1700 printk(KERN_ERR "end_request: I/O error, dev %s, sector %llu\n", 1703 printk(KERN_ERR "end_request: I/O error, dev %s, sector %llu\n",
1701 req->rq_disk ? req->rq_disk->disk_name : "?", 1704 req->rq_disk ? req->rq_disk->disk_name : "?",
1702 (unsigned long long)req->sector); 1705 (unsigned long long)req->sector);
1703 } 1706 }
1704 1707
1705 if (blk_fs_request(req) && req->rq_disk) { 1708 if (blk_fs_request(req) && req->rq_disk) {
1706 const int rw = rq_data_dir(req); 1709 const int rw = rq_data_dir(req);
1707 struct hd_struct *part; 1710 struct hd_struct *part;
1708 int cpu; 1711 int cpu;
1709 1712
1710 cpu = part_stat_lock(); 1713 cpu = part_stat_lock();
1711 part = disk_map_sector_rcu(req->rq_disk, req->sector); 1714 part = disk_map_sector_rcu(req->rq_disk, req->sector);
1712 part_stat_add(cpu, part, sectors[rw], nr_bytes >> 9); 1715 part_stat_add(cpu, part, sectors[rw], nr_bytes >> 9);
1713 part_stat_unlock(); 1716 part_stat_unlock();
1714 } 1717 }
1715 1718
1716 total_bytes = bio_nbytes = 0; 1719 total_bytes = bio_nbytes = 0;
1717 while ((bio = req->bio) != NULL) { 1720 while ((bio = req->bio) != NULL) {
1718 int nbytes; 1721 int nbytes;
1719 1722
1720 /* 1723 /*
1721 * For an empty barrier request, the low level driver must 1724 * For an empty barrier request, the low level driver must
1722 * store a potential error location in ->sector. We pass 1725 * store a potential error location in ->sector. We pass
1723 * that back up in ->bi_sector. 1726 * that back up in ->bi_sector.
1724 */ 1727 */
1725 if (blk_empty_barrier(req)) 1728 if (blk_empty_barrier(req))
1726 bio->bi_sector = req->sector; 1729 bio->bi_sector = req->sector;
1727 1730
1728 if (nr_bytes >= bio->bi_size) { 1731 if (nr_bytes >= bio->bi_size) {
1729 req->bio = bio->bi_next; 1732 req->bio = bio->bi_next;
1730 nbytes = bio->bi_size; 1733 nbytes = bio->bi_size;
1731 req_bio_endio(req, bio, nbytes, error); 1734 req_bio_endio(req, bio, nbytes, error);
1732 next_idx = 0; 1735 next_idx = 0;
1733 bio_nbytes = 0; 1736 bio_nbytes = 0;
1734 } else { 1737 } else {
1735 int idx = bio->bi_idx + next_idx; 1738 int idx = bio->bi_idx + next_idx;
1736 1739
1737 if (unlikely(bio->bi_idx >= bio->bi_vcnt)) { 1740 if (unlikely(bio->bi_idx >= bio->bi_vcnt)) {
1738 blk_dump_rq_flags(req, "__end_that"); 1741 blk_dump_rq_flags(req, "__end_that");
1739 printk(KERN_ERR "%s: bio idx %d >= vcnt %d\n", 1742 printk(KERN_ERR "%s: bio idx %d >= vcnt %d\n",
1740 __func__, bio->bi_idx, bio->bi_vcnt); 1743 __func__, bio->bi_idx, bio->bi_vcnt);
1741 break; 1744 break;
1742 } 1745 }
1743 1746
1744 nbytes = bio_iovec_idx(bio, idx)->bv_len; 1747 nbytes = bio_iovec_idx(bio, idx)->bv_len;
1745 BIO_BUG_ON(nbytes > bio->bi_size); 1748 BIO_BUG_ON(nbytes > bio->bi_size);
1746 1749
1747 /* 1750 /*
1748 * not a complete bvec done 1751 * not a complete bvec done
1749 */ 1752 */
1750 if (unlikely(nbytes > nr_bytes)) { 1753 if (unlikely(nbytes > nr_bytes)) {
1751 bio_nbytes += nr_bytes; 1754 bio_nbytes += nr_bytes;
1752 total_bytes += nr_bytes; 1755 total_bytes += nr_bytes;
1753 break; 1756 break;
1754 } 1757 }
1755 1758
1756 /* 1759 /*
1757 * advance to the next vector 1760 * advance to the next vector
1758 */ 1761 */
1759 next_idx++; 1762 next_idx++;
1760 bio_nbytes += nbytes; 1763 bio_nbytes += nbytes;
1761 } 1764 }
1762 1765
1763 total_bytes += nbytes; 1766 total_bytes += nbytes;
1764 nr_bytes -= nbytes; 1767 nr_bytes -= nbytes;
1765 1768
1766 bio = req->bio; 1769 bio = req->bio;
1767 if (bio) { 1770 if (bio) {
1768 /* 1771 /*
1769 * end more in this run, or just return 'not-done' 1772 * end more in this run, or just return 'not-done'
1770 */ 1773 */
1771 if (unlikely(nr_bytes <= 0)) 1774 if (unlikely(nr_bytes <= 0))
1772 break; 1775 break;
1773 } 1776 }
1774 } 1777 }
1775 1778
1776 /* 1779 /*
1777 * completely done 1780 * completely done
1778 */ 1781 */
1779 if (!req->bio) 1782 if (!req->bio)
1780 return 0; 1783 return 0;
1781 1784
1782 /* 1785 /*
1783 * if the request wasn't completed, update state 1786 * if the request wasn't completed, update state
1784 */ 1787 */
1785 if (bio_nbytes) { 1788 if (bio_nbytes) {
1786 req_bio_endio(req, bio, bio_nbytes, error); 1789 req_bio_endio(req, bio, bio_nbytes, error);
1787 bio->bi_idx += next_idx; 1790 bio->bi_idx += next_idx;
1788 bio_iovec(bio)->bv_offset += nr_bytes; 1791 bio_iovec(bio)->bv_offset += nr_bytes;
1789 bio_iovec(bio)->bv_len -= nr_bytes; 1792 bio_iovec(bio)->bv_len -= nr_bytes;
1790 } 1793 }
1791 1794
1792 blk_recalc_rq_sectors(req, total_bytes >> 9); 1795 blk_recalc_rq_sectors(req, total_bytes >> 9);
1793 blk_recalc_rq_segments(req); 1796 blk_recalc_rq_segments(req);
1794 return 1; 1797 return 1;
1795 } 1798 }
1796 1799
1797 /* 1800 /*
1798 * queue lock must be held 1801 * queue lock must be held
1799 */ 1802 */
1800 static void end_that_request_last(struct request *req, int error) 1803 static void end_that_request_last(struct request *req, int error)
1801 { 1804 {
1802 struct gendisk *disk = req->rq_disk; 1805 struct gendisk *disk = req->rq_disk;
1803 1806
1804 if (blk_rq_tagged(req)) 1807 if (blk_rq_tagged(req))
1805 blk_queue_end_tag(req->q, req); 1808 blk_queue_end_tag(req->q, req);
1806 1809
1807 if (blk_queued_rq(req)) 1810 if (blk_queued_rq(req))
1808 elv_dequeue_request(req->q, req); 1811 elv_dequeue_request(req->q, req);
1809 1812
1810 if (unlikely(laptop_mode) && blk_fs_request(req)) 1813 if (unlikely(laptop_mode) && blk_fs_request(req))
1811 laptop_io_completion(); 1814 laptop_io_completion();
1812 1815
1813 blk_delete_timer(req); 1816 blk_delete_timer(req);
1814 1817
1815 /* 1818 /*
1816 * Account IO completion. bar_rq isn't accounted as a normal 1819 * Account IO completion. bar_rq isn't accounted as a normal
1817 * IO on queueing nor completion. Accounting the containing 1820 * IO on queueing nor completion. Accounting the containing
1818 * request is enough. 1821 * request is enough.
1819 */ 1822 */
1820 if (disk && blk_fs_request(req) && req != &req->q->bar_rq) { 1823 if (disk && blk_fs_request(req) && req != &req->q->bar_rq) {
1821 unsigned long duration = jiffies - req->start_time; 1824 unsigned long duration = jiffies - req->start_time;
1822 const int rw = rq_data_dir(req); 1825 const int rw = rq_data_dir(req);
1823 struct hd_struct *part; 1826 struct hd_struct *part;
1824 int cpu; 1827 int cpu;
1825 1828
1826 cpu = part_stat_lock(); 1829 cpu = part_stat_lock();
1827 part = disk_map_sector_rcu(disk, req->sector); 1830 part = disk_map_sector_rcu(disk, req->sector);
1828 1831
1829 part_stat_inc(cpu, part, ios[rw]); 1832 part_stat_inc(cpu, part, ios[rw]);
1830 part_stat_add(cpu, part, ticks[rw], duration); 1833 part_stat_add(cpu, part, ticks[rw], duration);
1831 part_round_stats(cpu, part); 1834 part_round_stats(cpu, part);
1832 part_dec_in_flight(part); 1835 part_dec_in_flight(part);
1833 1836
1834 part_stat_unlock(); 1837 part_stat_unlock();
1835 } 1838 }
1836 1839
1837 if (req->end_io) 1840 if (req->end_io)
1838 req->end_io(req, error); 1841 req->end_io(req, error);
1839 else { 1842 else {
1840 if (blk_bidi_rq(req)) 1843 if (blk_bidi_rq(req))
1841 __blk_put_request(req->next_rq->q, req->next_rq); 1844 __blk_put_request(req->next_rq->q, req->next_rq);
1842 1845
1843 __blk_put_request(req->q, req); 1846 __blk_put_request(req->q, req);
1844 } 1847 }
1845 } 1848 }
1846 1849
1847 /** 1850 /**
1848 * blk_rq_bytes - Returns bytes left to complete in the entire request 1851 * blk_rq_bytes - Returns bytes left to complete in the entire request
1849 * @rq: the request being processed 1852 * @rq: the request being processed
1850 **/ 1853 **/
1851 unsigned int blk_rq_bytes(struct request *rq) 1854 unsigned int blk_rq_bytes(struct request *rq)
1852 { 1855 {
1853 if (blk_fs_request(rq)) 1856 if (blk_fs_request(rq))
1854 return rq->hard_nr_sectors << 9; 1857 return rq->hard_nr_sectors << 9;
1855 1858
1856 return rq->data_len; 1859 return rq->data_len;
1857 } 1860 }
1858 EXPORT_SYMBOL_GPL(blk_rq_bytes); 1861 EXPORT_SYMBOL_GPL(blk_rq_bytes);
1859 1862
1860 /** 1863 /**
1861 * blk_rq_cur_bytes - Returns bytes left to complete in the current segment 1864 * blk_rq_cur_bytes - Returns bytes left to complete in the current segment
1862 * @rq: the request being processed 1865 * @rq: the request being processed
1863 **/ 1866 **/
1864 unsigned int blk_rq_cur_bytes(struct request *rq) 1867 unsigned int blk_rq_cur_bytes(struct request *rq)
1865 { 1868 {
1866 if (blk_fs_request(rq)) 1869 if (blk_fs_request(rq))
1867 return rq->current_nr_sectors << 9; 1870 return rq->current_nr_sectors << 9;
1868 1871
1869 if (rq->bio) 1872 if (rq->bio)
1870 return rq->bio->bi_size; 1873 return rq->bio->bi_size;
1871 1874
1872 return rq->data_len; 1875 return rq->data_len;
1873 } 1876 }
1874 EXPORT_SYMBOL_GPL(blk_rq_cur_bytes); 1877 EXPORT_SYMBOL_GPL(blk_rq_cur_bytes);
1875 1878
1876 /** 1879 /**
1877 * end_request - end I/O on the current segment of the request 1880 * end_request - end I/O on the current segment of the request
1878 * @req: the request being processed 1881 * @req: the request being processed
1879 * @uptodate: error value or %0/%1 uptodate flag 1882 * @uptodate: error value or %0/%1 uptodate flag
1880 * 1883 *
1881 * Description: 1884 * Description:
1882 * Ends I/O on the current segment of a request. If that is the only 1885 * Ends I/O on the current segment of a request. If that is the only
1883 * remaining segment, the request is also completed and freed. 1886 * remaining segment, the request is also completed and freed.
1884 * 1887 *
1885 * This is a remnant of how older block drivers handled I/O completions. 1888 * This is a remnant of how older block drivers handled I/O completions.
1886 * Modern drivers typically end I/O on the full request in one go, unless 1889 * Modern drivers typically end I/O on the full request in one go, unless
1887 * they have a residual value to account for. For that case this function 1890 * they have a residual value to account for. For that case this function
1888 * isn't really useful, unless the residual just happens to be the 1891 * isn't really useful, unless the residual just happens to be the
1889 * full current segment. In other words, don't use this function in new 1892 * full current segment. In other words, don't use this function in new
1890 * code. Use blk_end_request() or __blk_end_request() to end a request. 1893 * code. Use blk_end_request() or __blk_end_request() to end a request.
1891 **/ 1894 **/
1892 void end_request(struct request *req, int uptodate) 1895 void end_request(struct request *req, int uptodate)
1893 { 1896 {
1894 int error = 0; 1897 int error = 0;
1895 1898
1896 if (uptodate <= 0) 1899 if (uptodate <= 0)
1897 error = uptodate ? uptodate : -EIO; 1900 error = uptodate ? uptodate : -EIO;
1898 1901
1899 __blk_end_request(req, error, req->hard_cur_sectors << 9); 1902 __blk_end_request(req, error, req->hard_cur_sectors << 9);
1900 } 1903 }
1901 EXPORT_SYMBOL(end_request); 1904 EXPORT_SYMBOL(end_request);
1902 1905
1903 static int end_that_request_data(struct request *rq, int error, 1906 static int end_that_request_data(struct request *rq, int error,
1904 unsigned int nr_bytes, unsigned int bidi_bytes) 1907 unsigned int nr_bytes, unsigned int bidi_bytes)
1905 { 1908 {
1906 if (rq->bio) { 1909 if (rq->bio) {
1907 if (__end_that_request_first(rq, error, nr_bytes)) 1910 if (__end_that_request_first(rq, error, nr_bytes))
1908 return 1; 1911 return 1;
1909 1912
1910 /* Bidi request must be completed as a whole */ 1913 /* Bidi request must be completed as a whole */
1911 if (blk_bidi_rq(rq) && 1914 if (blk_bidi_rq(rq) &&
1912 __end_that_request_first(rq->next_rq, error, bidi_bytes)) 1915 __end_that_request_first(rq->next_rq, error, bidi_bytes))
1913 return 1; 1916 return 1;
1914 } 1917 }
1915 1918
1916 return 0; 1919 return 0;
1917 } 1920 }
1918 1921
1919 /** 1922 /**
1920 * blk_end_io - Generic end_io function to complete a request. 1923 * blk_end_io - Generic end_io function to complete a request.
1921 * @rq: the request being processed 1924 * @rq: the request being processed
1922 * @error: %0 for success, < %0 for error 1925 * @error: %0 for success, < %0 for error
1923 * @nr_bytes: number of bytes to complete @rq 1926 * @nr_bytes: number of bytes to complete @rq
1924 * @bidi_bytes: number of bytes to complete @rq->next_rq 1927 * @bidi_bytes: number of bytes to complete @rq->next_rq
1925 * @drv_callback: function called between completion of bios in the request 1928 * @drv_callback: function called between completion of bios in the request
1926 * and completion of the request. 1929 * and completion of the request.
1927 * If the callback returns non %0, this helper returns without 1930 * If the callback returns non %0, this helper returns without
1928 * completion of the request. 1931 * completion of the request.
1929 * 1932 *
1930 * Description: 1933 * Description:
1931 * Ends I/O on a number of bytes attached to @rq and @rq->next_rq. 1934 * Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
1932 * If @rq has leftover, sets it up for the next range of segments. 1935 * If @rq has leftover, sets it up for the next range of segments.
1933 * 1936 *
1934 * Return: 1937 * Return:
1935 * %0 - we are done with this request 1938 * %0 - we are done with this request
1936 * %1 - this request is not freed yet, it still has pending buffers. 1939 * %1 - this request is not freed yet, it still has pending buffers.
1937 **/ 1940 **/
1938 static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes, 1941 static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
1939 unsigned int bidi_bytes, 1942 unsigned int bidi_bytes,
1940 int (drv_callback)(struct request *)) 1943 int (drv_callback)(struct request *))
1941 { 1944 {
1942 struct request_queue *q = rq->q; 1945 struct request_queue *q = rq->q;
1943 unsigned long flags = 0UL; 1946 unsigned long flags = 0UL;
1944 1947
1945 if (end_that_request_data(rq, error, nr_bytes, bidi_bytes)) 1948 if (end_that_request_data(rq, error, nr_bytes, bidi_bytes))
1946 return 1; 1949 return 1;
1947 1950
1948 /* Special feature for tricky drivers */ 1951 /* Special feature for tricky drivers */
1949 if (drv_callback && drv_callback(rq)) 1952 if (drv_callback && drv_callback(rq))
1950 return 1; 1953 return 1;
1951 1954
1952 add_disk_randomness(rq->rq_disk); 1955 add_disk_randomness(rq->rq_disk);
1953 1956
1954 spin_lock_irqsave(q->queue_lock, flags); 1957 spin_lock_irqsave(q->queue_lock, flags);
1955 end_that_request_last(rq, error); 1958 end_that_request_last(rq, error);
1956 spin_unlock_irqrestore(q->queue_lock, flags); 1959 spin_unlock_irqrestore(q->queue_lock, flags);
1957 1960
1958 return 0; 1961 return 0;
1959 } 1962 }
1960 1963
1961 /** 1964 /**
1962 * blk_end_request - Helper function for drivers to complete the request. 1965 * blk_end_request - Helper function for drivers to complete the request.
1963 * @rq: the request being processed 1966 * @rq: the request being processed
1964 * @error: %0 for success, < %0 for error 1967 * @error: %0 for success, < %0 for error
1965 * @nr_bytes: number of bytes to complete 1968 * @nr_bytes: number of bytes to complete
1966 * 1969 *
1967 * Description: 1970 * Description:
1968 * Ends I/O on a number of bytes attached to @rq. 1971 * Ends I/O on a number of bytes attached to @rq.
1969 * If @rq has leftover, sets it up for the next range of segments. 1972 * If @rq has leftover, sets it up for the next range of segments.
1970 * 1973 *
1971 * Return: 1974 * Return:
1972 * %0 - we are done with this request 1975 * %0 - we are done with this request
1973 * %1 - still buffers pending for this request 1976 * %1 - still buffers pending for this request
1974 **/ 1977 **/
1975 int blk_end_request(struct request *rq, int error, unsigned int nr_bytes) 1978 int blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
1976 { 1979 {
1977 return blk_end_io(rq, error, nr_bytes, 0, NULL); 1980 return blk_end_io(rq, error, nr_bytes, 0, NULL);
1978 } 1981 }
1979 EXPORT_SYMBOL_GPL(blk_end_request); 1982 EXPORT_SYMBOL_GPL(blk_end_request);
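For context only: a minimal sketch of how a low-level driver's completion path might call this helper. The mydrv_* names, the cmd structure and its fields are hypothetical and not part of this patch.

static void mydrv_complete_cmd(struct mydrv_cmd *cmd, int error)
{
	/*
	 * Complete as many bytes as the hardware reported. If the
	 * request still has bytes pending, blk_end_request() returns 1
	 * and the request stays around for the residual.
	 */
	if (blk_end_request(cmd->rq, error, cmd->bytes_done))
		mydrv_handle_residual(cmd);	/* hypothetical */
}

blk_end_request() takes the queue lock itself, which is why it can be called like this from a completion context; __blk_end_request() below is the variant for callers that already hold the lock.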
1980 1983
1981 /** 1984 /**
1982 * __blk_end_request - Helper function for drivers to complete the request. 1985 * __blk_end_request - Helper function for drivers to complete the request.
1983 * @rq: the request being processed 1986 * @rq: the request being processed
1984 * @error: %0 for success, < %0 for error 1987 * @error: %0 for success, < %0 for error
1985 * @nr_bytes: number of bytes to complete 1988 * @nr_bytes: number of bytes to complete
1986 * 1989 *
1987 * Description: 1990 * Description:
1988 * Must be called with queue lock held unlike blk_end_request(). 1991 * Must be called with queue lock held unlike blk_end_request().
1989 * 1992 *
1990 * Return: 1993 * Return:
1991 * %0 - we are done with this request 1994 * %0 - we are done with this request
1992 * %1 - still buffers pending for this request 1995 * %1 - still buffers pending for this request
1993 **/ 1996 **/
1994 int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes) 1997 int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
1995 { 1998 {
1996 if (rq->bio && __end_that_request_first(rq, error, nr_bytes)) 1999 if (rq->bio && __end_that_request_first(rq, error, nr_bytes))
1997 return 1; 2000 return 1;
1998 2001
1999 add_disk_randomness(rq->rq_disk); 2002 add_disk_randomness(rq->rq_disk);
2000 2003
2001 end_that_request_last(rq, error); 2004 end_that_request_last(rq, error);
2002 2005
2003 return 0; 2006 return 0;
2004 } 2007 }
2005 EXPORT_SYMBOL_GPL(__blk_end_request); 2008 EXPORT_SYMBOL_GPL(__blk_end_request);
2006 2009
2007 /** 2010 /**
2008 * blk_end_bidi_request - Helper function for drivers to complete bidi request. 2011 * blk_end_bidi_request - Helper function for drivers to complete bidi request.
2009 * @rq: the bidi request being processed 2012 * @rq: the bidi request being processed
2010 * @error: %0 for success, < %0 for error 2013 * @error: %0 for success, < %0 for error
2011 * @nr_bytes: number of bytes to complete @rq 2014 * @nr_bytes: number of bytes to complete @rq
2012 * @bidi_bytes: number of bytes to complete @rq->next_rq 2015 * @bidi_bytes: number of bytes to complete @rq->next_rq
2013 * 2016 *
2014 * Description: 2017 * Description:
2015 * Ends I/O on a number of bytes attached to @rq and @rq->next_rq. 2018 * Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
2016 * 2019 *
2017 * Return: 2020 * Return:
2018 * %0 - we are done with this request 2021 * %0 - we are done with this request
2019 * %1 - still buffers pending for this request 2022 * %1 - still buffers pending for this request
2020 **/ 2023 **/
2021 int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, 2024 int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes,
2022 unsigned int bidi_bytes) 2025 unsigned int bidi_bytes)
2023 { 2026 {
2024 return blk_end_io(rq, error, nr_bytes, bidi_bytes, NULL); 2027 return blk_end_io(rq, error, nr_bytes, bidi_bytes, NULL);
2025 } 2028 }
2026 EXPORT_SYMBOL_GPL(blk_end_bidi_request); 2029 EXPORT_SYMBOL_GPL(blk_end_bidi_request);
2027 2030
2028 /** 2031 /**
2029 * blk_update_request - Special helper function for request stacking drivers 2032 * blk_update_request - Special helper function for request stacking drivers
2030 * @rq: the request being processed 2033 * @rq: the request being processed
2031 * @error: %0 for success, < %0 for error 2034 * @error: %0 for success, < %0 for error
2032 * @nr_bytes: number of bytes to complete @rq 2035 * @nr_bytes: number of bytes to complete @rq
2033 * 2036 *
2034 * Description: 2037 * Description:
2035 * Ends I/O on a number of bytes attached to @rq, but doesn't complete 2038 * Ends I/O on a number of bytes attached to @rq, but doesn't complete
2036 * the request structure even if @rq doesn't have leftover. 2039 * the request structure even if @rq doesn't have leftover.
2037 * If @rq has leftover, sets it up for the next range of segments. 2040 * If @rq has leftover, sets it up for the next range of segments.
2038 * 2041 *
2039 * This special helper function is only for request stacking drivers 2042 * This special helper function is only for request stacking drivers
2040 * (e.g. request-based dm) so that they can handle partial completion. 2043 * (e.g. request-based dm) so that they can handle partial completion.
2041 * Actual device drivers should use blk_end_request instead. 2044 * Actual device drivers should use blk_end_request instead.
2042 */ 2045 */
2043 void blk_update_request(struct request *rq, int error, unsigned int nr_bytes) 2046 void blk_update_request(struct request *rq, int error, unsigned int nr_bytes)
2044 { 2047 {
2045 if (!end_that_request_data(rq, error, nr_bytes, 0)) { 2048 if (!end_that_request_data(rq, error, nr_bytes, 0)) {
2046 /* 2049 /*
2047 * These members are not updated in end_that_request_data() 2050 * These members are not updated in end_that_request_data()
2048 * when all bios are completed. 2051 * when all bios are completed.
2049 * Update them so that the request stacking driver can find 2052 * Update them so that the request stacking driver can find
2050 * how many bytes remain in the request later. 2053 * how many bytes remain in the request later.
2051 */ 2054 */
2052 rq->nr_sectors = rq->hard_nr_sectors = 0; 2055 rq->nr_sectors = rq->hard_nr_sectors = 0;
2053 rq->current_nr_sectors = rq->hard_cur_sectors = 0; 2056 rq->current_nr_sectors = rq->hard_cur_sectors = 0;
2054 } 2057 }
2055 } 2058 }
2056 EXPORT_SYMBOL_GPL(blk_update_request); 2059 EXPORT_SYMBOL_GPL(blk_update_request);
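A rough sketch of the intended caller: a request stacking driver mirroring the progress of a cloned request back onto the original without freeing it. The names here are illustrative and not taken from the request-based dm code.

static void stacking_advance_orig(struct request *orig, int error,
				  unsigned int bytes_done)
{
	/* account bytes_done against the original request, but keep the
	 * request structure alive even when everything has completed */
	blk_update_request(orig, error, bytes_done);

	/* blk_rq_bytes() now reports what is still outstanding */
	if (!blk_rq_bytes(orig))
		pr_debug("original request fully transferred\n");
}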
2057 2060
2058 /** 2061 /**
2059 * blk_end_request_callback - Special helper function for tricky drivers 2062 * blk_end_request_callback - Special helper function for tricky drivers
2060 * @rq: the request being processed 2063 * @rq: the request being processed
2061 * @error: %0 for success, < %0 for error 2064 * @error: %0 for success, < %0 for error
2062 * @nr_bytes: number of bytes to complete 2065 * @nr_bytes: number of bytes to complete
2063 * @drv_callback: function called between completion of bios in the request 2066 * @drv_callback: function called between completion of bios in the request
2064 * and completion of the request. 2067 * and completion of the request.
2065 * If the callback returns non %0, this helper returns without 2068 * If the callback returns non %0, this helper returns without
2066 * completion of the request. 2069 * completion of the request.
2067 * 2070 *
2068 * Description: 2071 * Description:
2069 * Ends I/O on a number of bytes attached to @rq. 2072 * Ends I/O on a number of bytes attached to @rq.
2070 * If @rq has leftover, sets it up for the next range of segments. 2073 * If @rq has leftover, sets it up for the next range of segments.
2071 * 2074 *
2072 * This special helper function is used only for existing tricky drivers. 2075 * This special helper function is used only for existing tricky drivers.
2073 * (e.g. cdrom_newpc_intr() of ide-cd) 2076 * (e.g. cdrom_newpc_intr() of ide-cd)
2074 * This interface will be removed when such drivers are rewritten. 2077 * This interface will be removed when such drivers are rewritten.
2075 * Don't use this interface in other places anymore. 2078 * Don't use this interface in other places anymore.
2076 * 2079 *
2077 * Return: 2080 * Return:
2078 * %0 - we are done with this request 2081 * %0 - we are done with this request
2079 * %1 - this request is not freed yet. 2082 * %1 - this request is not freed yet.
2080 * this request still has pending buffers or 2083 * this request still has pending buffers or
2081 * the driver doesn't want to finish this request yet. 2084 * the driver doesn't want to finish this request yet.
2082 **/ 2085 **/
2083 int blk_end_request_callback(struct request *rq, int error, 2086 int blk_end_request_callback(struct request *rq, int error,
2084 unsigned int nr_bytes, 2087 unsigned int nr_bytes,
2085 int (drv_callback)(struct request *)) 2088 int (drv_callback)(struct request *))
2086 { 2089 {
2087 return blk_end_io(rq, error, nr_bytes, 0, drv_callback); 2090 return blk_end_io(rq, error, nr_bytes, 0, drv_callback);
2088 } 2091 }
2089 EXPORT_SYMBOL_GPL(blk_end_request_callback); 2092 EXPORT_SYMBOL_GPL(blk_end_request_callback);

2090 2093
2091 void blk_rq_bio_prep(struct request_queue *q, struct request *rq, 2094 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
2092 struct bio *bio) 2095 struct bio *bio)
2093 { 2096 {
2094 /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw, and 2097 /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw, and
2095 we want BIO_RW_AHEAD (bit 1) to imply REQ_FAILFAST (bit 1). */ 2098 we want BIO_RW_AHEAD (bit 1) to imply REQ_FAILFAST (bit 1). */
2096 rq->cmd_flags |= (bio->bi_rw & 3); 2099 rq->cmd_flags |= (bio->bi_rw & 3);
2097 2100
2098 if (bio_has_data(bio)) { 2101 if (bio_has_data(bio)) {
2099 rq->nr_phys_segments = bio_phys_segments(q, bio); 2102 rq->nr_phys_segments = bio_phys_segments(q, bio);
2100 rq->buffer = bio_data(bio); 2103 rq->buffer = bio_data(bio);
2101 } 2104 }
2102 rq->current_nr_sectors = bio_cur_sectors(bio); 2105 rq->current_nr_sectors = bio_cur_sectors(bio);
2103 rq->hard_cur_sectors = rq->current_nr_sectors; 2106 rq->hard_cur_sectors = rq->current_nr_sectors;
2104 rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio); 2107 rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
2105 rq->data_len = bio->bi_size; 2108 rq->data_len = bio->bi_size;
2106 2109
2107 rq->bio = rq->biotail = bio; 2110 rq->bio = rq->biotail = bio;
2108 2111
2109 if (bio->bi_bdev) 2112 if (bio->bi_bdev)
2110 rq->rq_disk = bio->bi_bdev->bd_disk; 2113 rq->rq_disk = bio->bi_bdev->bd_disk;
2111 } 2114 }
2112 2115
2113 /** 2116 /**
2114 * blk_lld_busy - Check if underlying low-level drivers of a device are busy 2117 * blk_lld_busy - Check if underlying low-level drivers of a device are busy
2115 * @q : the queue of the device being checked 2118 * @q : the queue of the device being checked
2116 * 2119 *
2117 * Description: 2120 * Description:
2118 * Check if underlying low-level drivers of a device are busy. 2121 * Check if underlying low-level drivers of a device are busy.
2119 * If the drivers want to export their busy state, they must set their own 2122 * If the drivers want to export their busy state, they must set their own
2120 * exporting function using blk_queue_lld_busy() first. 2123 * exporting function using blk_queue_lld_busy() first.
2121 * 2124 *
2122 * Basically, this function is used only by request stacking drivers 2125 * Basically, this function is used only by request stacking drivers
2123 * to stop dispatching requests to underlying devices when underlying 2126 * to stop dispatching requests to underlying devices when underlying
2124 * devices are busy. This behavior allows more I/O merging on the queue 2127 * devices are busy. This behavior allows more I/O merging on the queue
2125 * of the request stacking driver and prevents I/O throughput regression 2128 * of the request stacking driver and prevents I/O throughput regression
2126 * on burst I/O load. 2129 * on burst I/O load.
2127 * 2130 *
2128 * Return: 2131 * Return:
2129 * 0 - Not busy (The request stacking driver should dispatch request) 2132 * 0 - Not busy (The request stacking driver should dispatch request)
2130 * 1 - Busy (The request stacking driver should stop dispatching request) 2133 * 1 - Busy (The request stacking driver should stop dispatching request)
2131 */ 2134 */
2132 int blk_lld_busy(struct request_queue *q) 2135 int blk_lld_busy(struct request_queue *q)
2133 { 2136 {
2134 if (q->lld_busy_fn) 2137 if (q->lld_busy_fn)
2135 return q->lld_busy_fn(q); 2138 return q->lld_busy_fn(q);
2136 2139
2137 return 0; 2140 return 0;
2138 } 2141 }
2139 EXPORT_SYMBOL_GPL(blk_lld_busy); 2142 EXPORT_SYMBOL_GPL(blk_lld_busy);
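A sketch of both sides of the hook, assuming the blk_queue_lld_busy() setter named in the comment above; the lower_* and stacking_* names and the congestion predicate are invented for illustration.

static int lower_lld_busy(struct request_queue *q)
{
	struct lower_dev *dev = q->queuedata;	/* hypothetical private data */

	return lower_dev_congested(dev);	/* hypothetical predicate */
}

static void lower_init_queue(struct request_queue *q)
{
	/* export the busy state so stacking drivers can query it */
	blk_queue_lld_busy(q, lower_lld_busy);
}

static int stacking_should_dispatch(struct request_queue *lower_q)
{
	/* hold requests back while the underlying device reports busy */
	return !blk_lld_busy(lower_q);
}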
2140 2143
2141 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) 2144 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
2142 { 2145 {
2143 return queue_work(kblockd_workqueue, work); 2146 return queue_work(kblockd_workqueue, work);
2144 } 2147 }
2145 EXPORT_SYMBOL(kblockd_schedule_work); 2148 EXPORT_SYMBOL(kblockd_schedule_work);
2146 2149
2147 void kblockd_flush_work(struct work_struct *work) 2150 void kblockd_flush_work(struct work_struct *work)
2148 { 2151 {
2149 cancel_work_sync(work); 2152 cancel_work_sync(work);
2150 } 2153 }
2151 EXPORT_SYMBOL(kblockd_flush_work); 2154 EXPORT_SYMBOL(kblockd_flush_work);
2152 2155
2153 int __init blk_dev_init(void) 2156 int __init blk_dev_init(void)
2154 { 2157 {
2155 kblockd_workqueue = create_workqueue("kblockd"); 2158 kblockd_workqueue = create_workqueue("kblockd");
2156 if (!kblockd_workqueue) 2159 if (!kblockd_workqueue)
2157 panic("Failed to create kblockd\n"); 2160 panic("Failed to create kblockd\n");
2158 2161
2159 request_cachep = kmem_cache_create("blkdev_requests", 2162 request_cachep = kmem_cache_create("blkdev_requests",
2160 sizeof(struct request), 0, SLAB_PANIC, NULL); 2163 sizeof(struct request), 0, SLAB_PANIC, NULL);
2161 2164
2162 blk_requestq_cachep = kmem_cache_create("blkdev_queue", 2165 blk_requestq_cachep = kmem_cache_create("blkdev_queue",
2163 sizeof(struct request_queue), 0, SLAB_PANIC, NULL); 2166 sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
2164 2167
2165 return 0; 2168 return 0;
2166 } 2169 }
2167 2170
2168 2171
1 /* 1 /*
2 * linux/fs/buffer.c 2 * linux/fs/buffer.c
3 * 3 *
4 * Copyright (C) 1991, 1992, 2002 Linus Torvalds 4 * Copyright (C) 1991, 1992, 2002 Linus Torvalds
5 */ 5 */
6 6
7 /* 7 /*
8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
9 * 9 *
10 * Removed a lot of unnecessary code and simplified things now that 10 * Removed a lot of unnecessary code and simplified things now that
11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96 11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
12 * 12 *
13 * Speed up hash, lru, and free list operations. Use gfp() for allocating 13 * Speed up hash, lru, and free list operations. Use gfp() for allocating
14 * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM 14 * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
15 * 15 *
16 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK 16 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
17 * 17 *
18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> 18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
19 */ 19 */
20 20
21 #include <linux/kernel.h> 21 #include <linux/kernel.h>
22 #include <linux/syscalls.h> 22 #include <linux/syscalls.h>
23 #include <linux/fs.h> 23 #include <linux/fs.h>
24 #include <linux/mm.h> 24 #include <linux/mm.h>
25 #include <linux/percpu.h> 25 #include <linux/percpu.h>
26 #include <linux/slab.h> 26 #include <linux/slab.h>
27 #include <linux/capability.h> 27 #include <linux/capability.h>
28 #include <linux/blkdev.h> 28 #include <linux/blkdev.h>
29 #include <linux/file.h> 29 #include <linux/file.h>
30 #include <linux/quotaops.h> 30 #include <linux/quotaops.h>
31 #include <linux/highmem.h> 31 #include <linux/highmem.h>
32 #include <linux/module.h> 32 #include <linux/module.h>
33 #include <linux/writeback.h> 33 #include <linux/writeback.h>
34 #include <linux/hash.h> 34 #include <linux/hash.h>
35 #include <linux/suspend.h> 35 #include <linux/suspend.h>
36 #include <linux/buffer_head.h> 36 #include <linux/buffer_head.h>
37 #include <linux/task_io_accounting_ops.h> 37 #include <linux/task_io_accounting_ops.h>
38 #include <linux/bio.h> 38 #include <linux/bio.h>
39 #include <linux/notifier.h> 39 #include <linux/notifier.h>
40 #include <linux/cpu.h> 40 #include <linux/cpu.h>
41 #include <linux/bitops.h> 41 #include <linux/bitops.h>
42 #include <linux/mpage.h> 42 #include <linux/mpage.h>
43 #include <linux/bit_spinlock.h> 43 #include <linux/bit_spinlock.h>
44 44
45 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); 45 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
46 46
47 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) 47 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
48 48
49 inline void 49 inline void
50 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) 50 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
51 { 51 {
52 bh->b_end_io = handler; 52 bh->b_end_io = handler;
53 bh->b_private = private; 53 bh->b_private = private;
54 } 54 }
55 55
56 static int sync_buffer(void *word) 56 static int sync_buffer(void *word)
57 { 57 {
58 struct block_device *bd; 58 struct block_device *bd;
59 struct buffer_head *bh 59 struct buffer_head *bh
60 = container_of(word, struct buffer_head, b_state); 60 = container_of(word, struct buffer_head, b_state);
61 61
62 smp_mb(); 62 smp_mb();
63 bd = bh->b_bdev; 63 bd = bh->b_bdev;
64 if (bd) 64 if (bd)
65 blk_run_address_space(bd->bd_inode->i_mapping); 65 blk_run_address_space(bd->bd_inode->i_mapping);
66 io_schedule(); 66 io_schedule();
67 return 0; 67 return 0;
68 } 68 }
69 69
70 void __lock_buffer(struct buffer_head *bh) 70 void __lock_buffer(struct buffer_head *bh)
71 { 71 {
72 wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer, 72 wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
73 TASK_UNINTERRUPTIBLE); 73 TASK_UNINTERRUPTIBLE);
74 } 74 }
75 EXPORT_SYMBOL(__lock_buffer); 75 EXPORT_SYMBOL(__lock_buffer);
76 76
77 void unlock_buffer(struct buffer_head *bh) 77 void unlock_buffer(struct buffer_head *bh)
78 { 78 {
79 clear_bit_unlock(BH_Lock, &bh->b_state); 79 clear_bit_unlock(BH_Lock, &bh->b_state);
80 smp_mb__after_clear_bit(); 80 smp_mb__after_clear_bit();
81 wake_up_bit(&bh->b_state, BH_Lock); 81 wake_up_bit(&bh->b_state, BH_Lock);
82 } 82 }
83 83
84 /* 84 /*
85 * Block until a buffer comes unlocked. This doesn't stop it 85 * Block until a buffer comes unlocked. This doesn't stop it
86 * from becoming locked again - you have to lock it yourself 86 * from becoming locked again - you have to lock it yourself
87 * if you want to preserve its state. 87 * if you want to preserve its state.
88 */ 88 */
89 void __wait_on_buffer(struct buffer_head * bh) 89 void __wait_on_buffer(struct buffer_head * bh)
90 { 90 {
91 wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE); 91 wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
92 } 92 }
93 93
94 static void 94 static void
95 __clear_page_buffers(struct page *page) 95 __clear_page_buffers(struct page *page)
96 { 96 {
97 ClearPagePrivate(page); 97 ClearPagePrivate(page);
98 set_page_private(page, 0); 98 set_page_private(page, 0);
99 page_cache_release(page); 99 page_cache_release(page);
100 } 100 }
101 101
102
103 static int quiet_error(struct buffer_head *bh)
104 {
105 if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
106 return 0;
107 return 1;
108 }
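quiet_error() only reads BH_Quiet; the flag has to be set when a quiet bio completes. The hunk that does this in the bh end_io path is elsewhere in the patch and not shown here, but the propagation looks roughly like the following sketch (the function name is hypothetical, BIO_QUIET being the bio-side flag this patch introduces):

static void example_bh_end_io(struct bio *bio, int err)
{
	struct buffer_head *bh = bio->bi_private;

	/* carry the bio's "quiet" marking over to the buffer head so
	 * that buffer_io_error() stays silent for this bh */
	if (bio_flagged(bio, BIO_QUIET))
		set_bit(BH_Quiet, &bh->b_state);

	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
	bio_put(bio);
}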
109
110
102 static void buffer_io_error(struct buffer_head *bh) 111 static void buffer_io_error(struct buffer_head *bh)
103 { 112 {
104 char b[BDEVNAME_SIZE]; 113 char b[BDEVNAME_SIZE];
105
106 printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n", 114 printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
107 bdevname(bh->b_bdev, b), 115 bdevname(bh->b_bdev, b),
108 (unsigned long long)bh->b_blocknr); 116 (unsigned long long)bh->b_blocknr);
109 } 117 }
110 118
111 /* 119 /*
112 * End-of-IO handler helper function which does not touch the bh after 120 * End-of-IO handler helper function which does not touch the bh after
113 * unlocking it. 121 * unlocking it.
114 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but 122 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
115 * a race there is benign: unlock_buffer() only uses the bh's address for 123 * a race there is benign: unlock_buffer() only uses the bh's address for
116 * hashing after unlocking the buffer, so it doesn't actually touch the bh 124 * hashing after unlocking the buffer, so it doesn't actually touch the bh
117 * itself. 125 * itself.
118 */ 126 */
119 static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) 127 static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
120 { 128 {
121 if (uptodate) { 129 if (uptodate) {
122 set_buffer_uptodate(bh); 130 set_buffer_uptodate(bh);
123 } else { 131 } else {
124 /* This happens, due to failed READA attempts. */ 132 /* This happens, due to failed READA attempts. */
125 clear_buffer_uptodate(bh); 133 clear_buffer_uptodate(bh);
126 } 134 }
127 unlock_buffer(bh); 135 unlock_buffer(bh);
128 } 136 }
129 137
130 /* 138 /*
131 * Default synchronous end-of-IO handler. Just mark it up-to-date and 139 * Default synchronous end-of-IO handler. Just mark it up-to-date and
132 * unlock the buffer. This is what ll_rw_block uses too. 140 * unlock the buffer. This is what ll_rw_block uses too.
133 */ 141 */
134 void end_buffer_read_sync(struct buffer_head *bh, int uptodate) 142 void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
135 { 143 {
136 __end_buffer_read_notouch(bh, uptodate); 144 __end_buffer_read_notouch(bh, uptodate);
137 put_bh(bh); 145 put_bh(bh);
138 } 146 }
139 147
140 void end_buffer_write_sync(struct buffer_head *bh, int uptodate) 148 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
141 { 149 {
142 char b[BDEVNAME_SIZE]; 150 char b[BDEVNAME_SIZE];
143 151
144 if (uptodate) { 152 if (uptodate) {
145 set_buffer_uptodate(bh); 153 set_buffer_uptodate(bh);
146 } else { 154 } else {
147 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) { 155 if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) {
148 buffer_io_error(bh); 156 buffer_io_error(bh);
149 printk(KERN_WARNING "lost page write due to " 157 printk(KERN_WARNING "lost page write due to "
150 "I/O error on %s\n", 158 "I/O error on %s\n",
151 bdevname(bh->b_bdev, b)); 159 bdevname(bh->b_bdev, b));
152 } 160 }
153 set_buffer_write_io_error(bh); 161 set_buffer_write_io_error(bh);
154 clear_buffer_uptodate(bh); 162 clear_buffer_uptodate(bh);
155 } 163 }
156 unlock_buffer(bh); 164 unlock_buffer(bh);
157 put_bh(bh); 165 put_bh(bh);
158 } 166 }
159 167
160 /* 168 /*
161 * Write out and wait upon all the dirty data associated with a block 169 * Write out and wait upon all the dirty data associated with a block
162 * device via its mapping. Does not take the superblock lock. 170 * device via its mapping. Does not take the superblock lock.
163 */ 171 */
164 int sync_blockdev(struct block_device *bdev) 172 int sync_blockdev(struct block_device *bdev)
165 { 173 {
166 int ret = 0; 174 int ret = 0;
167 175
168 if (bdev) 176 if (bdev)
169 ret = filemap_write_and_wait(bdev->bd_inode->i_mapping); 177 ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
170 return ret; 178 return ret;
171 } 179 }
172 EXPORT_SYMBOL(sync_blockdev); 180 EXPORT_SYMBOL(sync_blockdev);
173 181
174 /* 182 /*
175 * Write out and wait upon all dirty data associated with this 183 * Write out and wait upon all dirty data associated with this
176 * device. Filesystem data as well as the underlying block 184 * device. Filesystem data as well as the underlying block
177 * device. Takes the superblock lock. 185 * device. Takes the superblock lock.
178 */ 186 */
179 int fsync_bdev(struct block_device *bdev) 187 int fsync_bdev(struct block_device *bdev)
180 { 188 {
181 struct super_block *sb = get_super(bdev); 189 struct super_block *sb = get_super(bdev);
182 if (sb) { 190 if (sb) {
183 int res = fsync_super(sb); 191 int res = fsync_super(sb);
184 drop_super(sb); 192 drop_super(sb);
185 return res; 193 return res;
186 } 194 }
187 return sync_blockdev(bdev); 195 return sync_blockdev(bdev);
188 } 196 }
189 197
190 /** 198 /**
191 * freeze_bdev -- lock a filesystem and force it into a consistent state 199 * freeze_bdev -- lock a filesystem and force it into a consistent state
192 * @bdev: blockdevice to lock 200 * @bdev: blockdevice to lock
193 * 201 *
194 * This takes the block device bd_mount_sem to make sure no new mounts 202 * This takes the block device bd_mount_sem to make sure no new mounts
195 * happen on bdev until thaw_bdev() is called. 203 * happen on bdev until thaw_bdev() is called.
196 * If a superblock is found on this device, we take the s_umount semaphore 204 * If a superblock is found on this device, we take the s_umount semaphore
197 * on it to make sure nobody unmounts until the snapshot creation is done. 205 * on it to make sure nobody unmounts until the snapshot creation is done.
198 */ 206 */
199 struct super_block *freeze_bdev(struct block_device *bdev) 207 struct super_block *freeze_bdev(struct block_device *bdev)
200 { 208 {
201 struct super_block *sb; 209 struct super_block *sb;
202 210
203 down(&bdev->bd_mount_sem); 211 down(&bdev->bd_mount_sem);
204 sb = get_super(bdev); 212 sb = get_super(bdev);
205 if (sb && !(sb->s_flags & MS_RDONLY)) { 213 if (sb && !(sb->s_flags & MS_RDONLY)) {
206 sb->s_frozen = SB_FREEZE_WRITE; 214 sb->s_frozen = SB_FREEZE_WRITE;
207 smp_wmb(); 215 smp_wmb();
208 216
209 __fsync_super(sb); 217 __fsync_super(sb);
210 218
211 sb->s_frozen = SB_FREEZE_TRANS; 219 sb->s_frozen = SB_FREEZE_TRANS;
212 smp_wmb(); 220 smp_wmb();
213 221
214 sync_blockdev(sb->s_bdev); 222 sync_blockdev(sb->s_bdev);
215 223
216 if (sb->s_op->write_super_lockfs) 224 if (sb->s_op->write_super_lockfs)
217 sb->s_op->write_super_lockfs(sb); 225 sb->s_op->write_super_lockfs(sb);
218 } 226 }
219 227
220 sync_blockdev(bdev); 228 sync_blockdev(bdev);
221 return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ 229 return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
222 } 230 }
223 EXPORT_SYMBOL(freeze_bdev); 231 EXPORT_SYMBOL(freeze_bdev);
224 232
225 /** 233 /**
226 * thaw_bdev -- unlock filesystem 234 * thaw_bdev -- unlock filesystem
227 * @bdev: blockdevice to unlock 235 * @bdev: blockdevice to unlock
228 * @sb: associated superblock 236 * @sb: associated superblock
229 * 237 *
230 * Unlocks the filesystem and marks it writeable again after freeze_bdev(). 238 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
231 */ 239 */
232 void thaw_bdev(struct block_device *bdev, struct super_block *sb) 240 void thaw_bdev(struct block_device *bdev, struct super_block *sb)
233 { 241 {
234 if (sb) { 242 if (sb) {
235 BUG_ON(sb->s_bdev != bdev); 243 BUG_ON(sb->s_bdev != bdev);
236 244
237 if (sb->s_op->unlockfs) 245 if (sb->s_op->unlockfs)
238 sb->s_op->unlockfs(sb); 246 sb->s_op->unlockfs(sb);
239 sb->s_frozen = SB_UNFROZEN; 247 sb->s_frozen = SB_UNFROZEN;
240 smp_wmb(); 248 smp_wmb();
241 wake_up(&sb->s_wait_unfrozen); 249 wake_up(&sb->s_wait_unfrozen);
242 drop_super(sb); 250 drop_super(sb);
243 } 251 }
244 252
245 up(&bdev->bd_mount_sem); 253 up(&bdev->bd_mount_sem);
246 } 254 }
247 EXPORT_SYMBOL(thaw_bdev); 255 EXPORT_SYMBOL(thaw_bdev);
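Usage of the pair is symmetric; a snapshot implementation would bracket its work roughly as below (take_snapshot() is a placeholder, not a real kernel function):

static void snapshot_blockdev(struct block_device *bdev)
{
	struct super_block *sb;

	sb = freeze_bdev(bdev);		/* blocks new mounts, syncs the fs */
	take_snapshot(bdev);		/* hypothetical snapshot operation */
	thaw_bdev(bdev, sb);		/* marks the fs writeable again */
}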
248 256
249 /* 257 /*
250 * Various filesystems appear to want __find_get_block to be non-blocking. 258 * Various filesystems appear to want __find_get_block to be non-blocking.
251 * But it's the page lock which protects the buffers. To get around this, 259 * But it's the page lock which protects the buffers. To get around this,
252 * we get exclusion from try_to_free_buffers with the blockdev mapping's 260 * we get exclusion from try_to_free_buffers with the blockdev mapping's
253 * private_lock. 261 * private_lock.
254 * 262 *
255 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention 263 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
256 * may be quite high. This code could TryLock the page, and if that 264 * may be quite high. This code could TryLock the page, and if that
257 * succeeds, there is no need to take private_lock. (But if 265 * succeeds, there is no need to take private_lock. (But if
258 * private_lock is contended then so is mapping->tree_lock). 266 * private_lock is contended then so is mapping->tree_lock).
259 */ 267 */
260 static struct buffer_head * 268 static struct buffer_head *
261 __find_get_block_slow(struct block_device *bdev, sector_t block) 269 __find_get_block_slow(struct block_device *bdev, sector_t block)
262 { 270 {
263 struct inode *bd_inode = bdev->bd_inode; 271 struct inode *bd_inode = bdev->bd_inode;
264 struct address_space *bd_mapping = bd_inode->i_mapping; 272 struct address_space *bd_mapping = bd_inode->i_mapping;
265 struct buffer_head *ret = NULL; 273 struct buffer_head *ret = NULL;
266 pgoff_t index; 274 pgoff_t index;
267 struct buffer_head *bh; 275 struct buffer_head *bh;
268 struct buffer_head *head; 276 struct buffer_head *head;
269 struct page *page; 277 struct page *page;
270 int all_mapped = 1; 278 int all_mapped = 1;
271 279
272 index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits); 280 index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
273 page = find_get_page(bd_mapping, index); 281 page = find_get_page(bd_mapping, index);
274 if (!page) 282 if (!page)
275 goto out; 283 goto out;
276 284
277 spin_lock(&bd_mapping->private_lock); 285 spin_lock(&bd_mapping->private_lock);
278 if (!page_has_buffers(page)) 286 if (!page_has_buffers(page))
279 goto out_unlock; 287 goto out_unlock;
280 head = page_buffers(page); 288 head = page_buffers(page);
281 bh = head; 289 bh = head;
282 do { 290 do {
283 if (bh->b_blocknr == block) { 291 if (bh->b_blocknr == block) {
284 ret = bh; 292 ret = bh;
285 get_bh(bh); 293 get_bh(bh);
286 goto out_unlock; 294 goto out_unlock;
287 } 295 }
288 if (!buffer_mapped(bh)) 296 if (!buffer_mapped(bh))
289 all_mapped = 0; 297 all_mapped = 0;
290 bh = bh->b_this_page; 298 bh = bh->b_this_page;
291 } while (bh != head); 299 } while (bh != head);
292 300
293 /* we might be here because some of the buffers on this page are 301 /* we might be here because some of the buffers on this page are
294 * not mapped. This is due to various races between 302 * not mapped. This is due to various races between
295 * file io on the block device and getblk. It gets dealt with 303 * file io on the block device and getblk. It gets dealt with
296 * elsewhere, don't buffer_error if we had some unmapped buffers 304 * elsewhere, don't buffer_error if we had some unmapped buffers
297 */ 305 */
298 if (all_mapped) { 306 if (all_mapped) {
299 printk("__find_get_block_slow() failed. " 307 printk("__find_get_block_slow() failed. "
300 "block=%llu, b_blocknr=%llu\n", 308 "block=%llu, b_blocknr=%llu\n",
301 (unsigned long long)block, 309 (unsigned long long)block,
302 (unsigned long long)bh->b_blocknr); 310 (unsigned long long)bh->b_blocknr);
303 printk("b_state=0x%08lx, b_size=%zu\n", 311 printk("b_state=0x%08lx, b_size=%zu\n",
304 bh->b_state, bh->b_size); 312 bh->b_state, bh->b_size);
305 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits); 313 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
306 } 314 }
307 out_unlock: 315 out_unlock:
308 spin_unlock(&bd_mapping->private_lock); 316 spin_unlock(&bd_mapping->private_lock);
309 page_cache_release(page); 317 page_cache_release(page);
310 out: 318 out:
311 return ret; 319 return ret;
312 } 320 }
313 321
314 /* If invalidate_buffers() will trash dirty buffers, it means some kind 322 /* If invalidate_buffers() will trash dirty buffers, it means some kind
315 of fs corruption is going on. Trashing dirty data always implies losing 323 of fs corruption is going on. Trashing dirty data always implies losing
316 information that was supposed to be just stored on the physical layer 324 information that was supposed to be just stored on the physical layer
317 by the user. 325 by the user.
318 326
319 Thus invalidate_buffers in general usage is not allowed to trash 327 Thus invalidate_buffers in general usage is not allowed to trash
320 dirty buffers. For example, ioctl(BLKFLSBUF) expects dirty data to 328 dirty buffers. For example, ioctl(BLKFLSBUF) expects dirty data to
321 be preserved. These buffers are simply skipped. 329 be preserved. These buffers are simply skipped.
322 330
323 We also skip buffers which are still in use. For example this can 331 We also skip buffers which are still in use. For example this can
324 happen if a userspace program is reading the block device. 332 happen if a userspace program is reading the block device.
325 333
326 NOTE: In the case where the user removed a removable-media disk even if 334 NOTE: In the case where the user removed a removable-media disk even if
327 there's still dirty data not synced on disk (due to a bug in the device driver 335 there's still dirty data not synced on disk (due to a bug in the device driver
328 or due to an error by the user), by not destroying the dirty buffers we could 336 or due to an error by the user), by not destroying the dirty buffers we could
329 generate corruption also on the next media inserted, thus a parameter is 337 generate corruption also on the next media inserted, thus a parameter is
330 necessary to handle this case in the safest way possible (trying 338 necessary to handle this case in the safest way possible (trying
331 to not corrupt also the new disk inserted with the data belonging to 339 to not corrupt also the new disk inserted with the data belonging to
332 the old now corrupted disk). Also for the ramdisk the natural thing 340 the old now corrupted disk). Also for the ramdisk the natural thing
333 to do in order to release the ramdisk memory is to destroy dirty buffers. 341 to do in order to release the ramdisk memory is to destroy dirty buffers.
334 342
335 These are two special cases. Normal usage implies that the device driver 343 These are two special cases. Normal usage implies that the device driver
336 issues a sync on the device (without waiting for I/O completion) and 344 issues a sync on the device (without waiting for I/O completion) and
337 then an invalidate_buffers call that doesn't trash dirty buffers. 345 then an invalidate_buffers call that doesn't trash dirty buffers.
338 346
339 For handling cache coherency with the blkdev pagecache the 'update' case 347 For handling cache coherency with the blkdev pagecache the 'update' case
340 has been introduced. It is needed to re-read from disk any pinned 348 has been introduced. It is needed to re-read from disk any pinned
341 buffer. NOTE: re-reading from disk is destructive so we can do it only 349 buffer. NOTE: re-reading from disk is destructive so we can do it only
342 when we assume nobody is changing the buffercache under our I/O and when 350 when we assume nobody is changing the buffercache under our I/O and when
343 we think the disk contains more recent information than the buffercache. 351 we think the disk contains more recent information than the buffercache.
344 The update == 1 pass marks the buffers we need to update, the update == 2 352 The update == 1 pass marks the buffers we need to update, the update == 2
345 pass does the actual I/O. */ 353 pass does the actual I/O. */
346 void invalidate_bdev(struct block_device *bdev) 354 void invalidate_bdev(struct block_device *bdev)
347 { 355 {
348 struct address_space *mapping = bdev->bd_inode->i_mapping; 356 struct address_space *mapping = bdev->bd_inode->i_mapping;
349 357
350 if (mapping->nrpages == 0) 358 if (mapping->nrpages == 0)
351 return; 359 return;
352 360
353 invalidate_bh_lrus(); 361 invalidate_bh_lrus();
354 invalidate_mapping_pages(mapping, 0, -1); 362 invalidate_mapping_pages(mapping, 0, -1);
355 } 363 }
356 364
357 /* 365 /*
358 * Kick pdflush then try to free up some ZONE_NORMAL memory. 366 * Kick pdflush then try to free up some ZONE_NORMAL memory.
359 */ 367 */
360 static void free_more_memory(void) 368 static void free_more_memory(void)
361 { 369 {
362 struct zone *zone; 370 struct zone *zone;
363 int nid; 371 int nid;
364 372
365 wakeup_pdflush(1024); 373 wakeup_pdflush(1024);
366 yield(); 374 yield();
367 375
368 for_each_online_node(nid) { 376 for_each_online_node(nid) {
369 (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS), 377 (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
370 gfp_zone(GFP_NOFS), NULL, 378 gfp_zone(GFP_NOFS), NULL,
371 &zone); 379 &zone);
372 if (zone) 380 if (zone)
373 try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, 381 try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
374 GFP_NOFS); 382 GFP_NOFS);
375 } 383 }
376 } 384 }
377 385
378 /* 386 /*
379 * I/O completion handler for block_read_full_page() - pages 387 * I/O completion handler for block_read_full_page() - pages
380 * which come unlocked at the end of I/O. 388 * which come unlocked at the end of I/O.
381 */ 389 */
382 static void end_buffer_async_read(struct buffer_head *bh, int uptodate) 390 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
383 { 391 {
384 unsigned long flags; 392 unsigned long flags;
385 struct buffer_head *first; 393 struct buffer_head *first;
386 struct buffer_head *tmp; 394 struct buffer_head *tmp;
387 struct page *page; 395 struct page *page;
388 int page_uptodate = 1; 396 int page_uptodate = 1;
389 397
390 BUG_ON(!buffer_async_read(bh)); 398 BUG_ON(!buffer_async_read(bh));
391 399
392 page = bh->b_page; 400 page = bh->b_page;
393 if (uptodate) { 401 if (uptodate) {
394 set_buffer_uptodate(bh); 402 set_buffer_uptodate(bh);
395 } else { 403 } else {
396 clear_buffer_uptodate(bh); 404 clear_buffer_uptodate(bh);
397 if (printk_ratelimit()) 405 if (!quiet_error(bh))
398 buffer_io_error(bh); 406 buffer_io_error(bh);
399 SetPageError(page); 407 SetPageError(page);
400 } 408 }
401 409
402 /* 410 /*
403 * Be _very_ careful from here on. Bad things can happen if 411 * Be _very_ careful from here on. Bad things can happen if
404 * two buffer heads end IO at almost the same time and both 412 * two buffer heads end IO at almost the same time and both
405 * decide that the page is now completely done. 413 * decide that the page is now completely done.
406 */ 414 */
407 first = page_buffers(page); 415 first = page_buffers(page);
408 local_irq_save(flags); 416 local_irq_save(flags);
409 bit_spin_lock(BH_Uptodate_Lock, &first->b_state); 417 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
410 clear_buffer_async_read(bh); 418 clear_buffer_async_read(bh);
411 unlock_buffer(bh); 419 unlock_buffer(bh);
412 tmp = bh; 420 tmp = bh;
413 do { 421 do {
414 if (!buffer_uptodate(tmp)) 422 if (!buffer_uptodate(tmp))
415 page_uptodate = 0; 423 page_uptodate = 0;
416 if (buffer_async_read(tmp)) { 424 if (buffer_async_read(tmp)) {
417 BUG_ON(!buffer_locked(tmp)); 425 BUG_ON(!buffer_locked(tmp));
418 goto still_busy; 426 goto still_busy;
419 } 427 }
420 tmp = tmp->b_this_page; 428 tmp = tmp->b_this_page;
421 } while (tmp != bh); 429 } while (tmp != bh);
422 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); 430 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
423 local_irq_restore(flags); 431 local_irq_restore(flags);
424 432
425 /* 433 /*
426 * If none of the buffers had errors and they are all 434 * If none of the buffers had errors and they are all
427 * uptodate then we can set the page uptodate. 435 * uptodate then we can set the page uptodate.
428 */ 436 */
429 if (page_uptodate && !PageError(page)) 437 if (page_uptodate && !PageError(page))
430 SetPageUptodate(page); 438 SetPageUptodate(page);
431 unlock_page(page); 439 unlock_page(page);
432 return; 440 return;
433 441
434 still_busy: 442 still_busy:
435 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); 443 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
436 local_irq_restore(flags); 444 local_irq_restore(flags);
437 return; 445 return;
438 } 446 }
439 447
440 /* 448 /*
441 * Completion handler for block_write_full_page() - pages which are unlocked 449 * Completion handler for block_write_full_page() - pages which are unlocked
442 * during I/O, and which have PageWriteback cleared upon I/O completion. 450 * during I/O, and which have PageWriteback cleared upon I/O completion.
443 */ 451 */
444 static void end_buffer_async_write(struct buffer_head *bh, int uptodate) 452 static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
445 { 453 {
446 char b[BDEVNAME_SIZE]; 454 char b[BDEVNAME_SIZE];
447 unsigned long flags; 455 unsigned long flags;
448 struct buffer_head *first; 456 struct buffer_head *first;
449 struct buffer_head *tmp; 457 struct buffer_head *tmp;
450 struct page *page; 458 struct page *page;
451 459
452 BUG_ON(!buffer_async_write(bh)); 460 BUG_ON(!buffer_async_write(bh));
453 461
454 page = bh->b_page; 462 page = bh->b_page;
455 if (uptodate) { 463 if (uptodate) {
456 set_buffer_uptodate(bh); 464 set_buffer_uptodate(bh);
457 } else { 465 } else {
458 if (printk_ratelimit()) { 466 if (!quiet_error(bh)) {
459 buffer_io_error(bh); 467 buffer_io_error(bh);
460 printk(KERN_WARNING "lost page write due to " 468 printk(KERN_WARNING "lost page write due to "
461 "I/O error on %s\n", 469 "I/O error on %s\n",
462 bdevname(bh->b_bdev, b)); 470 bdevname(bh->b_bdev, b));
463 } 471 }
464 set_bit(AS_EIO, &page->mapping->flags); 472 set_bit(AS_EIO, &page->mapping->flags);
465 set_buffer_write_io_error(bh); 473 set_buffer_write_io_error(bh);
466 clear_buffer_uptodate(bh); 474 clear_buffer_uptodate(bh);
467 SetPageError(page); 475 SetPageError(page);
468 } 476 }
469 477
470 first = page_buffers(page); 478 first = page_buffers(page);
471 local_irq_save(flags); 479 local_irq_save(flags);
472 bit_spin_lock(BH_Uptodate_Lock, &first->b_state); 480 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
473 481
474 clear_buffer_async_write(bh); 482 clear_buffer_async_write(bh);
475 unlock_buffer(bh); 483 unlock_buffer(bh);
476 tmp = bh->b_this_page; 484 tmp = bh->b_this_page;
477 while (tmp != bh) { 485 while (tmp != bh) {
478 if (buffer_async_write(tmp)) { 486 if (buffer_async_write(tmp)) {
479 BUG_ON(!buffer_locked(tmp)); 487 BUG_ON(!buffer_locked(tmp));
480 goto still_busy; 488 goto still_busy;
481 } 489 }
482 tmp = tmp->b_this_page; 490 tmp = tmp->b_this_page;
483 } 491 }
484 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); 492 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
485 local_irq_restore(flags); 493 local_irq_restore(flags);
486 end_page_writeback(page); 494 end_page_writeback(page);
487 return; 495 return;
488 496
489 still_busy: 497 still_busy:
490 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); 498 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
491 local_irq_restore(flags); 499 local_irq_restore(flags);
492 return; 500 return;
493 } 501 }
494 502
495 /* 503 /*
496 * If a page's buffers are under async readin (end_buffer_async_read 504 * If a page's buffers are under async readin (end_buffer_async_read
497 * completion) then there is a possibility that another thread of 505 * completion) then there is a possibility that another thread of
498 * control could lock one of the buffers after it has completed 506 * control could lock one of the buffers after it has completed
499 * but while some of the other buffers have not completed. This 507 * but while some of the other buffers have not completed. This
500 * locked buffer would confuse end_buffer_async_read() into not unlocking 508 * locked buffer would confuse end_buffer_async_read() into not unlocking
501 * the page. So the absence of BH_Async_Read tells end_buffer_async_read() 509 * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
502 * that this buffer is not under async I/O. 510 * that this buffer is not under async I/O.
503 * 511 *
504 * The page comes unlocked when it has no locked buffer_async buffers 512 * The page comes unlocked when it has no locked buffer_async buffers
505 * left. 513 * left.
506 * 514 *
507 * PageLocked prevents anyone from starting new async I/O reads against any of 515 * PageLocked prevents anyone from starting new async I/O reads against any of
508 * the buffers. 516 * the buffers.
509 * 517 *
510 * PageWriteback is used to prevent simultaneous writeout of the same 518 * PageWriteback is used to prevent simultaneous writeout of the same
511 * page. 519 * page.
512 * 520 *
513 * PageLocked prevents anyone from starting writeback of a page which is 521 * PageLocked prevents anyone from starting writeback of a page which is
514 * under read I/O (PageWriteback is only ever set against a locked page). 522 * under read I/O (PageWriteback is only ever set against a locked page).
515 */ 523 */
516 static void mark_buffer_async_read(struct buffer_head *bh) 524 static void mark_buffer_async_read(struct buffer_head *bh)
517 { 525 {
518 bh->b_end_io = end_buffer_async_read; 526 bh->b_end_io = end_buffer_async_read;
519 set_buffer_async_read(bh); 527 set_buffer_async_read(bh);
520 } 528 }
521 529
522 void mark_buffer_async_write(struct buffer_head *bh) 530 void mark_buffer_async_write(struct buffer_head *bh)
523 { 531 {
524 bh->b_end_io = end_buffer_async_write; 532 bh->b_end_io = end_buffer_async_write;
525 set_buffer_async_write(bh); 533 set_buffer_async_write(bh);
526 } 534 }
527 EXPORT_SYMBOL(mark_buffer_async_write); 535 EXPORT_SYMBOL(mark_buffer_async_write);
528 536
529 537
530 /* 538 /*
531 * fs/buffer.c contains helper functions for buffer-backed address space's 539 * fs/buffer.c contains helper functions for buffer-backed address space's
532 * fsync functions. A common requirement for buffer-based filesystems is 540 * fsync functions. A common requirement for buffer-based filesystems is
533 * that certain data from the backing blockdev needs to be written out for 541 * that certain data from the backing blockdev needs to be written out for
534 * a successful fsync(). For example, ext2 indirect blocks need to be 542 * a successful fsync(). For example, ext2 indirect blocks need to be
535 * written back and waited upon before fsync() returns. 543 * written back and waited upon before fsync() returns.
536 * 544 *
537 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(), 545 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
538 * inode_has_buffers() and invalidate_inode_buffers() are provided for the 546 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
539 * management of a list of dependent buffers at ->i_mapping->private_list. 547 * management of a list of dependent buffers at ->i_mapping->private_list.
540 * 548 *
541 * Locking is a little subtle: try_to_free_buffers() will remove buffers 549 * Locking is a little subtle: try_to_free_buffers() will remove buffers
542 * from their controlling inode's queue when they are being freed. But 550 * from their controlling inode's queue when they are being freed. But
543 * try_to_free_buffers() will be operating against the *blockdev* mapping 551 * try_to_free_buffers() will be operating against the *blockdev* mapping
544 * at the time, not against the S_ISREG file which depends on those buffers. 552 * at the time, not against the S_ISREG file which depends on those buffers.
545 * So the locking for private_list is via the private_lock in the address_space 553 * So the locking for private_list is via the private_lock in the address_space
546 * which backs the buffers. Which is different from the address_space 554 * which backs the buffers. Which is different from the address_space
547 * against which the buffers are listed. So for a particular address_space, 555 * against which the buffers are listed. So for a particular address_space,
548 * mapping->private_lock does *not* protect mapping->private_list! In fact, 556 * mapping->private_lock does *not* protect mapping->private_list! In fact,
549 * mapping->private_list will always be protected by the backing blockdev's 557 * mapping->private_list will always be protected by the backing blockdev's
550 * ->private_lock. 558 * ->private_lock.
551 * 559 *
552 * Which introduces a requirement: all buffers on an address_space's 560 * Which introduces a requirement: all buffers on an address_space's
553 * ->private_list must be from the same address_space: the blockdev's. 561 * ->private_list must be from the same address_space: the blockdev's.
554 * 562 *
555 * address_spaces which do not place buffers at ->private_list via these 563 * address_spaces which do not place buffers at ->private_list via these
556 * utility functions are free to use private_lock and private_list for 564 * utility functions are free to use private_lock and private_list for
557 * whatever they want. The only requirement is that list_empty(private_list) 565 * whatever they want. The only requirement is that list_empty(private_list)
558 * be true at clear_inode() time. 566 * be true at clear_inode() time.
559 * 567 *
560 * FIXME: clear_inode should not call invalidate_inode_buffers(). The 568 * FIXME: clear_inode should not call invalidate_inode_buffers(). The
561 * filesystems should do that. invalidate_inode_buffers() should just go 569 * filesystems should do that. invalidate_inode_buffers() should just go
562 * BUG_ON(!list_empty). 570 * BUG_ON(!list_empty).
563 * 571 *
564 * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should 572 * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
565 * take an address_space, not an inode. And it should be called 573 * take an address_space, not an inode. And it should be called
566 * mark_buffer_dirty_fsync() to clearly define why those buffers are being 574 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
567 * queued up. 575 * queued up.
568 * 576 *
569 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the 577 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
570 * list if it is already on a list. Because if the buffer is on a list, 578 * list if it is already on a list. Because if the buffer is on a list,
571 * it *must* already be on the right one. If not, the filesystem is being 579 * it *must* already be on the right one. If not, the filesystem is being
572 * silly. This will save a ton of locking. But first we have to ensure 580 * silly. This will save a ton of locking. But first we have to ensure
573 * that buffers are taken *off* the old inode's list when they are freed 581 * that buffers are taken *off* the old inode's list when they are freed
574 * (presumably in truncate). That requires careful auditing of all 582 * (presumably in truncate). That requires careful auditing of all
575 * filesystems (do it inside bforget()). It could also be done by bringing 583 * filesystems (do it inside bforget()). It could also be done by bringing
576 * b_inode back. 584 * b_inode back.
577 */ 585 */
578 586
579 /* 587 /*
580 * The buffer's backing address_space's private_lock must be held 588 * The buffer's backing address_space's private_lock must be held
581 */ 589 */
582 static void __remove_assoc_queue(struct buffer_head *bh) 590 static void __remove_assoc_queue(struct buffer_head *bh)
583 { 591 {
584 list_del_init(&bh->b_assoc_buffers); 592 list_del_init(&bh->b_assoc_buffers);
585 WARN_ON(!bh->b_assoc_map); 593 WARN_ON(!bh->b_assoc_map);
586 if (buffer_write_io_error(bh)) 594 if (buffer_write_io_error(bh))
587 set_bit(AS_EIO, &bh->b_assoc_map->flags); 595 set_bit(AS_EIO, &bh->b_assoc_map->flags);
588 bh->b_assoc_map = NULL; 596 bh->b_assoc_map = NULL;
589 } 597 }
590 598
591 int inode_has_buffers(struct inode *inode) 599 int inode_has_buffers(struct inode *inode)
592 { 600 {
593 return !list_empty(&inode->i_data.private_list); 601 return !list_empty(&inode->i_data.private_list);
594 } 602 }
595 603
596 /* 604 /*
597 * osync is designed to support O_SYNC io. It waits synchronously for 605 * osync is designed to support O_SYNC io. It waits synchronously for
598 * all already-submitted IO to complete, but does not queue any new 606 * all already-submitted IO to complete, but does not queue any new
599 * writes to the disk. 607 * writes to the disk.
600 * 608 *
601 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as 609 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
602 * you dirty the buffers, and then use osync_inode_buffers to wait for 610 * you dirty the buffers, and then use osync_inode_buffers to wait for
603 * completion. Any other dirty buffers which are not yet queued for 611 * completion. Any other dirty buffers which are not yet queued for
604 * write will not be flushed to disk by the osync. 612 * write will not be flushed to disk by the osync.
605 */ 613 */
606 static int osync_buffers_list(spinlock_t *lock, struct list_head *list) 614 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
607 { 615 {
608 struct buffer_head *bh; 616 struct buffer_head *bh;
609 struct list_head *p; 617 struct list_head *p;
610 int err = 0; 618 int err = 0;
611 619
612 spin_lock(lock); 620 spin_lock(lock);
613 repeat: 621 repeat:
614 list_for_each_prev(p, list) { 622 list_for_each_prev(p, list) {
615 bh = BH_ENTRY(p); 623 bh = BH_ENTRY(p);
616 if (buffer_locked(bh)) { 624 if (buffer_locked(bh)) {
617 get_bh(bh); 625 get_bh(bh);
618 spin_unlock(lock); 626 spin_unlock(lock);
619 wait_on_buffer(bh); 627 wait_on_buffer(bh);
620 if (!buffer_uptodate(bh)) 628 if (!buffer_uptodate(bh))
621 err = -EIO; 629 err = -EIO;
622 brelse(bh); 630 brelse(bh);
623 spin_lock(lock); 631 spin_lock(lock);
624 goto repeat; 632 goto repeat;
625 } 633 }
626 } 634 }
627 spin_unlock(lock); 635 spin_unlock(lock);
628 return err; 636 return err;
629 } 637 }
630 638
631 /** 639 /**
632 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers 640 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
633 * @mapping: the mapping which wants those buffers written 641 * @mapping: the mapping which wants those buffers written
634 * 642 *
635 * Starts I/O against the buffers at mapping->private_list, and waits upon 643 * Starts I/O against the buffers at mapping->private_list, and waits upon
636 * that I/O. 644 * that I/O.
637 * 645 *
638 * Basically, this is a convenience function for fsync(). 646 * Basically, this is a convenience function for fsync().
639 * @mapping is a file or directory which needs those buffers to be written for 647 * @mapping is a file or directory which needs those buffers to be written for
640 * a successful fsync(). 648 * a successful fsync().
641 */ 649 */
642 int sync_mapping_buffers(struct address_space *mapping) 650 int sync_mapping_buffers(struct address_space *mapping)
643 { 651 {
644 struct address_space *buffer_mapping = mapping->assoc_mapping; 652 struct address_space *buffer_mapping = mapping->assoc_mapping;
645 653
646 if (buffer_mapping == NULL || list_empty(&mapping->private_list)) 654 if (buffer_mapping == NULL || list_empty(&mapping->private_list))
647 return 0; 655 return 0;
648 656
649 return fsync_buffers_list(&buffer_mapping->private_lock, 657 return fsync_buffers_list(&buffer_mapping->private_lock,
650 &mapping->private_list); 658 &mapping->private_list);
651 } 659 }
652 EXPORT_SYMBOL(sync_mapping_buffers); 660 EXPORT_SYMBOL(sync_mapping_buffers);
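A minimal usage sketch (illustrative only, not part of this change): a simple filesystem's ->fsync() can be little more than a call to sync_mapping_buffers() on the inode's mapping. The examplefs_fsync name and the 2.6-era fsync prototype it fills are assumptions.

static int examplefs_fsync(struct file *file, struct dentry *dentry,
			   int datasync)
{
	struct inode *inode = dentry->d_inode;

	/* write out & wait upon everything on inode->i_mapping->private_list */
	return sync_mapping_buffers(inode->i_mapping);
}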
653 661
654 /* 662 /*
655 * Called when we've recently written block `bblock', and it is known that 663 * Called when we've recently written block `bblock', and it is known that
656 * `bblock' was for a buffer_boundary() buffer. This means that the block at 664 * `bblock' was for a buffer_boundary() buffer. This means that the block at
657 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's 665 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
658 * dirty, schedule it for IO. So that indirects merge nicely with their data. 666 * dirty, schedule it for IO. So that indirects merge nicely with their data.
659 */ 667 */
660 void write_boundary_block(struct block_device *bdev, 668 void write_boundary_block(struct block_device *bdev,
661 sector_t bblock, unsigned blocksize) 669 sector_t bblock, unsigned blocksize)
662 { 670 {
663 struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); 671 struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
664 if (bh) { 672 if (bh) {
665 if (buffer_dirty(bh)) 673 if (buffer_dirty(bh))
666 ll_rw_block(WRITE, 1, &bh); 674 ll_rw_block(WRITE, 1, &bh);
667 put_bh(bh); 675 put_bh(bh);
668 } 676 }
669 } 677 }
670 678
671 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) 679 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
672 { 680 {
673 struct address_space *mapping = inode->i_mapping; 681 struct address_space *mapping = inode->i_mapping;
674 struct address_space *buffer_mapping = bh->b_page->mapping; 682 struct address_space *buffer_mapping = bh->b_page->mapping;
675 683
676 mark_buffer_dirty(bh); 684 mark_buffer_dirty(bh);
677 if (!mapping->assoc_mapping) { 685 if (!mapping->assoc_mapping) {
678 mapping->assoc_mapping = buffer_mapping; 686 mapping->assoc_mapping = buffer_mapping;
679 } else { 687 } else {
680 BUG_ON(mapping->assoc_mapping != buffer_mapping); 688 BUG_ON(mapping->assoc_mapping != buffer_mapping);
681 } 689 }
682 if (!bh->b_assoc_map) { 690 if (!bh->b_assoc_map) {
683 spin_lock(&buffer_mapping->private_lock); 691 spin_lock(&buffer_mapping->private_lock);
684 list_move_tail(&bh->b_assoc_buffers, 692 list_move_tail(&bh->b_assoc_buffers,
685 &mapping->private_list); 693 &mapping->private_list);
686 bh->b_assoc_map = mapping; 694 bh->b_assoc_map = mapping;
687 spin_unlock(&buffer_mapping->private_lock); 695 spin_unlock(&buffer_mapping->private_lock);
688 } 696 }
689 } 697 }
690 EXPORT_SYMBOL(mark_buffer_dirty_inode); 698 EXPORT_SYMBOL(mark_buffer_dirty_inode);
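A hedged sketch of the intended calling pattern: after modifying a metadata buffer on behalf of an inode, associate it with that inode so a later fsync() of the inode writes it. examplefs_update_indirect and the little-endian slot layout are hypothetical.

static void examplefs_update_indirect(struct inode *inode,
				      struct buffer_head *bh,
				      int slot, u32 new_block)
{
	__le32 *table = (__le32 *)bh->b_data;

	lock_buffer(bh);
	table[slot] = cpu_to_le32(new_block);	/* hypothetical on-disk format */
	unlock_buffer(bh);

	/* dirty the buffer and queue it on inode->i_mapping's private_list */
	mark_buffer_dirty_inode(bh, inode);
}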
691 699
692 /* 700 /*
693 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode 701 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
694 * dirty. 702 * dirty.
695 * 703 *
696 * If warn is true, then emit a warning if the page is not uptodate and has 704 * If warn is true, then emit a warning if the page is not uptodate and has
697 * not been truncated. 705 * not been truncated.
698 */ 706 */
699 static int __set_page_dirty(struct page *page, 707 static int __set_page_dirty(struct page *page,
700 struct address_space *mapping, int warn) 708 struct address_space *mapping, int warn)
701 { 709 {
702 if (unlikely(!mapping)) 710 if (unlikely(!mapping))
703 return !TestSetPageDirty(page); 711 return !TestSetPageDirty(page);
704 712
705 if (TestSetPageDirty(page)) 713 if (TestSetPageDirty(page))
706 return 0; 714 return 0;
707 715
708 spin_lock_irq(&mapping->tree_lock); 716 spin_lock_irq(&mapping->tree_lock);
709 if (page->mapping) { /* Race with truncate? */ 717 if (page->mapping) { /* Race with truncate? */
710 WARN_ON_ONCE(warn && !PageUptodate(page)); 718 WARN_ON_ONCE(warn && !PageUptodate(page));
711 719
712 if (mapping_cap_account_dirty(mapping)) { 720 if (mapping_cap_account_dirty(mapping)) {
713 __inc_zone_page_state(page, NR_FILE_DIRTY); 721 __inc_zone_page_state(page, NR_FILE_DIRTY);
714 __inc_bdi_stat(mapping->backing_dev_info, 722 __inc_bdi_stat(mapping->backing_dev_info,
715 BDI_RECLAIMABLE); 723 BDI_RECLAIMABLE);
716 task_io_account_write(PAGE_CACHE_SIZE); 724 task_io_account_write(PAGE_CACHE_SIZE);
717 } 725 }
718 radix_tree_tag_set(&mapping->page_tree, 726 radix_tree_tag_set(&mapping->page_tree,
719 page_index(page), PAGECACHE_TAG_DIRTY); 727 page_index(page), PAGECACHE_TAG_DIRTY);
720 } 728 }
721 spin_unlock_irq(&mapping->tree_lock); 729 spin_unlock_irq(&mapping->tree_lock);
722 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 730 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
723 731
724 return 1; 732 return 1;
725 } 733 }
726 734
727 /* 735 /*
728 * Add a page to the dirty page list. 736 * Add a page to the dirty page list.
729 * 737 *
730 * It is a sad fact of life that this function is called from several places 738 * It is a sad fact of life that this function is called from several places
731 * deeply under spinlocking. It may not sleep. 739 * deeply under spinlocking. It may not sleep.
732 * 740 *
733 * If the page has buffers, the uptodate buffers are set dirty, to preserve 741 * If the page has buffers, the uptodate buffers are set dirty, to preserve
734 * dirty-state coherency between the page and the buffers. If the page does 742 * dirty-state coherency between the page and the buffers. If the page does
735 * not have buffers then when they are later attached they will all be set 743 * not have buffers then when they are later attached they will all be set
736 * dirty. 744 * dirty.
737 * 745 *
738 * The buffers are dirtied before the page is dirtied. There's a small race 746 * The buffers are dirtied before the page is dirtied. There's a small race
739 * window in which a writepage caller may see the page cleanness but not the 747 * window in which a writepage caller may see the page cleanness but not the
740 * buffer dirtiness. That's fine. If this code were to set the page dirty 748 * buffer dirtiness. That's fine. If this code were to set the page dirty
741 * before the buffers, a concurrent writepage caller could clear the page dirty 749 * before the buffers, a concurrent writepage caller could clear the page dirty
742 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean 750 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
743 * page on the dirty page list. 751 * page on the dirty page list.
744 * 752 *
745 * We use private_lock to lock against try_to_free_buffers while using the 753 * We use private_lock to lock against try_to_free_buffers while using the
746 * page's buffer list. Also use this to protect against clean buffers being 754 * page's buffer list. Also use this to protect against clean buffers being
747 * added to the page after it was set dirty. 755 * added to the page after it was set dirty.
748 * 756 *
749 * FIXME: may need to call ->reservepage here as well. That's rather up to the 757 * FIXME: may need to call ->reservepage here as well. That's rather up to the
750 * address_space though. 758 * address_space though.
751 */ 759 */
752 int __set_page_dirty_buffers(struct page *page) 760 int __set_page_dirty_buffers(struct page *page)
753 { 761 {
754 struct address_space *mapping = page_mapping(page); 762 struct address_space *mapping = page_mapping(page);
755 763
756 if (unlikely(!mapping)) 764 if (unlikely(!mapping))
757 return !TestSetPageDirty(page); 765 return !TestSetPageDirty(page);
758 766
759 spin_lock(&mapping->private_lock); 767 spin_lock(&mapping->private_lock);
760 if (page_has_buffers(page)) { 768 if (page_has_buffers(page)) {
761 struct buffer_head *head = page_buffers(page); 769 struct buffer_head *head = page_buffers(page);
762 struct buffer_head *bh = head; 770 struct buffer_head *bh = head;
763 771
764 do { 772 do {
765 set_buffer_dirty(bh); 773 set_buffer_dirty(bh);
766 bh = bh->b_this_page; 774 bh = bh->b_this_page;
767 } while (bh != head); 775 } while (bh != head);
768 } 776 }
769 spin_unlock(&mapping->private_lock); 777 spin_unlock(&mapping->private_lock);
770 778
771 return __set_page_dirty(page, mapping, 1); 779 return __set_page_dirty(page, mapping, 1);
772 } 780 }
773 EXPORT_SYMBOL(__set_page_dirty_buffers); 781 EXPORT_SYMBOL(__set_page_dirty_buffers);
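For block-backed filesystems this function is typically just plugged into the address_space_operations, so that dirtying a page dirties its buffers as described above. A hedged sketch, with the other methods elided and examplefs_aops hypothetical:

static const struct address_space_operations examplefs_aops = {
	.set_page_dirty	= __set_page_dirty_buffers,
	/* .readpage, .writepage, ... */
};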
774 782
775 /* 783 /*
776 * Write out and wait upon a list of buffers. 784 * Write out and wait upon a list of buffers.
777 * 785 *
778 * We have conflicting pressures: we want to make sure that all 786 * We have conflicting pressures: we want to make sure that all
779 * initially dirty buffers get waited on, but that any subsequently 787 * initially dirty buffers get waited on, but that any subsequently
780 * dirtied buffers don't. After all, we don't want fsync to last 788 * dirtied buffers don't. After all, we don't want fsync to last
781 * forever if somebody is actively writing to the file. 789 * forever if somebody is actively writing to the file.
782 * 790 *
783 * Do this in two main stages: first we copy dirty buffers to a 791 * Do this in two main stages: first we copy dirty buffers to a
784 * temporary inode list, queueing the writes as we go. Then we clean 792 * temporary inode list, queueing the writes as we go. Then we clean
785 * up, waiting for those writes to complete. 793 * up, waiting for those writes to complete.
786 * 794 *
787 * During this second stage, any subsequent updates to the file may end 795 * During this second stage, any subsequent updates to the file may end
788 * up refiling the buffer on the original inode's dirty list again, so 796 * up refiling the buffer on the original inode's dirty list again, so
789 * there is a chance we will end up with a buffer queued for write but 797 * there is a chance we will end up with a buffer queued for write but
790 * not yet completed on that list. So, as a final cleanup we go through 798 * not yet completed on that list. So, as a final cleanup we go through
791 * the osync code to catch these locked, dirty buffers without requeuing 799 * the osync code to catch these locked, dirty buffers without requeuing
792 * any newly dirty buffers for write. 800 * any newly dirty buffers for write.
793 */ 801 */
794 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) 802 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
795 { 803 {
796 struct buffer_head *bh; 804 struct buffer_head *bh;
797 struct list_head tmp; 805 struct list_head tmp;
798 struct address_space *mapping; 806 struct address_space *mapping;
799 int err = 0, err2; 807 int err = 0, err2;
800 808
801 INIT_LIST_HEAD(&tmp); 809 INIT_LIST_HEAD(&tmp);
802 810
803 spin_lock(lock); 811 spin_lock(lock);
804 while (!list_empty(list)) { 812 while (!list_empty(list)) {
805 bh = BH_ENTRY(list->next); 813 bh = BH_ENTRY(list->next);
806 mapping = bh->b_assoc_map; 814 mapping = bh->b_assoc_map;
807 __remove_assoc_queue(bh); 815 __remove_assoc_queue(bh);
808 /* Avoid race with mark_buffer_dirty_inode() which does 816 /* Avoid race with mark_buffer_dirty_inode() which does
809 * a lockless check and we rely on seeing the dirty bit */ 817 * a lockless check and we rely on seeing the dirty bit */
810 smp_mb(); 818 smp_mb();
811 if (buffer_dirty(bh) || buffer_locked(bh)) { 819 if (buffer_dirty(bh) || buffer_locked(bh)) {
812 list_add(&bh->b_assoc_buffers, &tmp); 820 list_add(&bh->b_assoc_buffers, &tmp);
813 bh->b_assoc_map = mapping; 821 bh->b_assoc_map = mapping;
814 if (buffer_dirty(bh)) { 822 if (buffer_dirty(bh)) {
815 get_bh(bh); 823 get_bh(bh);
816 spin_unlock(lock); 824 spin_unlock(lock);
817 /* 825 /*
818 * Ensure any pending I/O completes so that 826 * Ensure any pending I/O completes so that
819 * ll_rw_block() actually writes the current 827 * ll_rw_block() actually writes the current
820 * contents - it is a noop if I/O is still in 828 * contents - it is a noop if I/O is still in
821 * flight on potentially older contents. 829 * flight on potentially older contents.
822 */ 830 */
823 ll_rw_block(SWRITE_SYNC, 1, &bh); 831 ll_rw_block(SWRITE_SYNC, 1, &bh);
824 brelse(bh); 832 brelse(bh);
825 spin_lock(lock); 833 spin_lock(lock);
826 } 834 }
827 } 835 }
828 } 836 }
829 837
830 while (!list_empty(&tmp)) { 838 while (!list_empty(&tmp)) {
831 bh = BH_ENTRY(tmp.prev); 839 bh = BH_ENTRY(tmp.prev);
832 get_bh(bh); 840 get_bh(bh);
833 mapping = bh->b_assoc_map; 841 mapping = bh->b_assoc_map;
834 __remove_assoc_queue(bh); 842 __remove_assoc_queue(bh);
835 /* Avoid race with mark_buffer_dirty_inode() which does 843 /* Avoid race with mark_buffer_dirty_inode() which does
836 * a lockless check and we rely on seeing the dirty bit */ 844 * a lockless check and we rely on seeing the dirty bit */
837 smp_mb(); 845 smp_mb();
838 if (buffer_dirty(bh)) { 846 if (buffer_dirty(bh)) {
839 list_add(&bh->b_assoc_buffers, 847 list_add(&bh->b_assoc_buffers,
840 &mapping->private_list); 848 &mapping->private_list);
841 bh->b_assoc_map = mapping; 849 bh->b_assoc_map = mapping;
842 } 850 }
843 spin_unlock(lock); 851 spin_unlock(lock);
844 wait_on_buffer(bh); 852 wait_on_buffer(bh);
845 if (!buffer_uptodate(bh)) 853 if (!buffer_uptodate(bh))
846 err = -EIO; 854 err = -EIO;
847 brelse(bh); 855 brelse(bh);
848 spin_lock(lock); 856 spin_lock(lock);
849 } 857 }
850 858
851 spin_unlock(lock); 859 spin_unlock(lock);
852 err2 = osync_buffers_list(lock, list); 860 err2 = osync_buffers_list(lock, list);
853 if (err) 861 if (err)
854 return err; 862 return err;
855 else 863 else
856 return err2; 864 return err2;
857 } 865 }
858 866
859 /* 867 /*
860 * Invalidate any and all dirty buffers on a given inode. We are 868 * Invalidate any and all dirty buffers on a given inode. We are
861 * probably unmounting the fs, but that doesn't mean we have already 869 * probably unmounting the fs, but that doesn't mean we have already
862 * done a sync(). Just drop the buffers from the inode list. 870 * done a sync(). Just drop the buffers from the inode list.
863 * 871 *
864 * NOTE: we take the inode's blockdev's mapping's private_lock. Which 872 * NOTE: we take the inode's blockdev's mapping's private_lock. Which
865 * assumes that all the buffers are against the blockdev. Not true 873 * assumes that all the buffers are against the blockdev. Not true
866 * for reiserfs. 874 * for reiserfs.
867 */ 875 */
868 void invalidate_inode_buffers(struct inode *inode) 876 void invalidate_inode_buffers(struct inode *inode)
869 { 877 {
870 if (inode_has_buffers(inode)) { 878 if (inode_has_buffers(inode)) {
871 struct address_space *mapping = &inode->i_data; 879 struct address_space *mapping = &inode->i_data;
872 struct list_head *list = &mapping->private_list; 880 struct list_head *list = &mapping->private_list;
873 struct address_space *buffer_mapping = mapping->assoc_mapping; 881 struct address_space *buffer_mapping = mapping->assoc_mapping;
874 882
875 spin_lock(&buffer_mapping->private_lock); 883 spin_lock(&buffer_mapping->private_lock);
876 while (!list_empty(list)) 884 while (!list_empty(list))
877 __remove_assoc_queue(BH_ENTRY(list->next)); 885 __remove_assoc_queue(BH_ENTRY(list->next));
878 spin_unlock(&buffer_mapping->private_lock); 886 spin_unlock(&buffer_mapping->private_lock);
879 } 887 }
880 } 888 }
881 EXPORT_SYMBOL(invalidate_inode_buffers); 889 EXPORT_SYMBOL(invalidate_inode_buffers);
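A hedged sketch of a direct caller: a filesystem tearing down an inode can drop any still-associated buffers itself, which is what clear_inode() currently does on its behalf per the FIXME near the top of this section. examplefs_delete_inode is hypothetical.

static void examplefs_delete_inode(struct inode *inode)
{
	/* drop everything queued on inode->i_data.private_list */
	invalidate_inode_buffers(inode);
	/* ... release the on-disk blocks ... */
	clear_inode(inode);
}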
882 890
883 /* 891 /*
884 * Remove any clean buffers from the inode's buffer list. This is called 892 * Remove any clean buffers from the inode's buffer list. This is called
885 * when we're trying to free the inode itself. Those buffers can pin it. 893 * when we're trying to free the inode itself. Those buffers can pin it.
886 * 894 *
887 * Returns true if all buffers were removed. 895 * Returns true if all buffers were removed.
888 */ 896 */
889 int remove_inode_buffers(struct inode *inode) 897 int remove_inode_buffers(struct inode *inode)
890 { 898 {
891 int ret = 1; 899 int ret = 1;
892 900
893 if (inode_has_buffers(inode)) { 901 if (inode_has_buffers(inode)) {
894 struct address_space *mapping = &inode->i_data; 902 struct address_space *mapping = &inode->i_data;
895 struct list_head *list = &mapping->private_list; 903 struct list_head *list = &mapping->private_list;
896 struct address_space *buffer_mapping = mapping->assoc_mapping; 904 struct address_space *buffer_mapping = mapping->assoc_mapping;
897 905
898 spin_lock(&buffer_mapping->private_lock); 906 spin_lock(&buffer_mapping->private_lock);
899 while (!list_empty(list)) { 907 while (!list_empty(list)) {
900 struct buffer_head *bh = BH_ENTRY(list->next); 908 struct buffer_head *bh = BH_ENTRY(list->next);
901 if (buffer_dirty(bh)) { 909 if (buffer_dirty(bh)) {
902 ret = 0; 910 ret = 0;
903 break; 911 break;
904 } 912 }
905 __remove_assoc_queue(bh); 913 __remove_assoc_queue(bh);
906 } 914 }
907 spin_unlock(&buffer_mapping->private_lock); 915 spin_unlock(&buffer_mapping->private_lock);
908 } 916 }
909 return ret; 917 return ret;
910 } 918 }
911 919
912 /* 920 /*
913 * Create the appropriate buffers when given a page for data area and 921 * Create the appropriate buffers when given a page for data area and
914 * the size of each buffer.. Use the bh->b_this_page linked list to 922 * the size of each buffer.. Use the bh->b_this_page linked list to
915 * follow the buffers created. Return NULL if unable to create more 923 * follow the buffers created. Return NULL if unable to create more
916 * buffers. 924 * buffers.
917 * 925 *
918 * The retry flag is used to differentiate async IO (paging, swapping) 926 * The retry flag is used to differentiate async IO (paging, swapping)
919 * which may not fail from ordinary buffer allocations. 927 * which may not fail from ordinary buffer allocations.
920 */ 928 */
921 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, 929 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
922 int retry) 930 int retry)
923 { 931 {
924 struct buffer_head *bh, *head; 932 struct buffer_head *bh, *head;
925 long offset; 933 long offset;
926 934
927 try_again: 935 try_again:
928 head = NULL; 936 head = NULL;
929 offset = PAGE_SIZE; 937 offset = PAGE_SIZE;
930 while ((offset -= size) >= 0) { 938 while ((offset -= size) >= 0) {
931 bh = alloc_buffer_head(GFP_NOFS); 939 bh = alloc_buffer_head(GFP_NOFS);
932 if (!bh) 940 if (!bh)
933 goto no_grow; 941 goto no_grow;
934 942
935 bh->b_bdev = NULL; 943 bh->b_bdev = NULL;
936 bh->b_this_page = head; 944 bh->b_this_page = head;
937 bh->b_blocknr = -1; 945 bh->b_blocknr = -1;
938 head = bh; 946 head = bh;
939 947
940 bh->b_state = 0; 948 bh->b_state = 0;
941 atomic_set(&bh->b_count, 0); 949 atomic_set(&bh->b_count, 0);
942 bh->b_private = NULL; 950 bh->b_private = NULL;
943 bh->b_size = size; 951 bh->b_size = size;
944 952
945 /* Link the buffer to its page */ 953 /* Link the buffer to its page */
946 set_bh_page(bh, page, offset); 954 set_bh_page(bh, page, offset);
947 955
948 init_buffer(bh, NULL, NULL); 956 init_buffer(bh, NULL, NULL);
949 } 957 }
950 return head; 958 return head;
951 /* 959 /*
952 * In case anything failed, we just free everything we got. 960 * In case anything failed, we just free everything we got.
953 */ 961 */
954 no_grow: 962 no_grow:
955 if (head) { 963 if (head) {
956 do { 964 do {
957 bh = head; 965 bh = head;
958 head = head->b_this_page; 966 head = head->b_this_page;
959 free_buffer_head(bh); 967 free_buffer_head(bh);
960 } while (head); 968 } while (head);
961 } 969 }
962 970
963 /* 971 /*
964 * Return failure for non-async IO requests. Async IO requests 972 * Return failure for non-async IO requests. Async IO requests
965 * are not allowed to fail, so we have to wait until buffer heads 973 * are not allowed to fail, so we have to wait until buffer heads
966 * become available. But we don't want tasks sleeping with 974 * become available. But we don't want tasks sleeping with
967 * partially complete buffers, so all were released above. 975 * partially complete buffers, so all were released above.
968 */ 976 */
969 if (!retry) 977 if (!retry)
970 return NULL; 978 return NULL;
971 979
972 /* We're _really_ low on memory. Now we just 980 /* We're _really_ low on memory. Now we just
973 * wait for old buffer heads to become free due to 981 * wait for old buffer heads to become free due to
974 * finishing IO. Since this is an async request and 982 * finishing IO. Since this is an async request and
975 * the reserve list is empty, we're sure there are 983 * the reserve list is empty, we're sure there are
976 * async buffer heads in use. 984 * async buffer heads in use.
977 */ 985 */
978 free_more_memory(); 986 free_more_memory();
979 goto try_again; 987 goto try_again;
980 } 988 }
981 EXPORT_SYMBOL_GPL(alloc_page_buffers); 989 EXPORT_SYMBOL_GPL(alloc_page_buffers);
982 990
983 static inline void 991 static inline void
984 link_dev_buffers(struct page *page, struct buffer_head *head) 992 link_dev_buffers(struct page *page, struct buffer_head *head)
985 { 993 {
986 struct buffer_head *bh, *tail; 994 struct buffer_head *bh, *tail;
987 995
988 bh = head; 996 bh = head;
989 do { 997 do {
990 tail = bh; 998 tail = bh;
991 bh = bh->b_this_page; 999 bh = bh->b_this_page;
992 } while (bh); 1000 } while (bh);
993 tail->b_this_page = head; 1001 tail->b_this_page = head;
994 attach_page_buffers(page, head); 1002 attach_page_buffers(page, head);
995 } 1003 }
996 1004
997 /* 1005 /*
998 * Initialise the state of a blockdev page's buffers. 1006 * Initialise the state of a blockdev page's buffers.
999 */ 1007 */
1000 static void 1008 static void
1001 init_page_buffers(struct page *page, struct block_device *bdev, 1009 init_page_buffers(struct page *page, struct block_device *bdev,
1002 sector_t block, int size) 1010 sector_t block, int size)
1003 { 1011 {
1004 struct buffer_head *head = page_buffers(page); 1012 struct buffer_head *head = page_buffers(page);
1005 struct buffer_head *bh = head; 1013 struct buffer_head *bh = head;
1006 int uptodate = PageUptodate(page); 1014 int uptodate = PageUptodate(page);
1007 1015
1008 do { 1016 do {
1009 if (!buffer_mapped(bh)) { 1017 if (!buffer_mapped(bh)) {
1010 init_buffer(bh, NULL, NULL); 1018 init_buffer(bh, NULL, NULL);
1011 bh->b_bdev = bdev; 1019 bh->b_bdev = bdev;
1012 bh->b_blocknr = block; 1020 bh->b_blocknr = block;
1013 if (uptodate) 1021 if (uptodate)
1014 set_buffer_uptodate(bh); 1022 set_buffer_uptodate(bh);
1015 set_buffer_mapped(bh); 1023 set_buffer_mapped(bh);
1016 } 1024 }
1017 block++; 1025 block++;
1018 bh = bh->b_this_page; 1026 bh = bh->b_this_page;
1019 } while (bh != head); 1027 } while (bh != head);
1020 } 1028 }
1021 1029
1022 /* 1030 /*
1023 * Create the page-cache page that contains the requested block. 1031 * Create the page-cache page that contains the requested block.
1024 * 1032 *
1025 * This is used purely for blockdev mappings. 1033 * This is used purely for blockdev mappings.
1026 */ 1034 */
1027 static struct page * 1035 static struct page *
1028 grow_dev_page(struct block_device *bdev, sector_t block, 1036 grow_dev_page(struct block_device *bdev, sector_t block,
1029 pgoff_t index, int size) 1037 pgoff_t index, int size)
1030 { 1038 {
1031 struct inode *inode = bdev->bd_inode; 1039 struct inode *inode = bdev->bd_inode;
1032 struct page *page; 1040 struct page *page;
1033 struct buffer_head *bh; 1041 struct buffer_head *bh;
1034 1042
1035 page = find_or_create_page(inode->i_mapping, index, 1043 page = find_or_create_page(inode->i_mapping, index,
1036 (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE); 1044 (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
1037 if (!page) 1045 if (!page)
1038 return NULL; 1046 return NULL;
1039 1047
1040 BUG_ON(!PageLocked(page)); 1048 BUG_ON(!PageLocked(page));
1041 1049
1042 if (page_has_buffers(page)) { 1050 if (page_has_buffers(page)) {
1043 bh = page_buffers(page); 1051 bh = page_buffers(page);
1044 if (bh->b_size == size) { 1052 if (bh->b_size == size) {
1045 init_page_buffers(page, bdev, block, size); 1053 init_page_buffers(page, bdev, block, size);
1046 return page; 1054 return page;
1047 } 1055 }
1048 if (!try_to_free_buffers(page)) 1056 if (!try_to_free_buffers(page))
1049 goto failed; 1057 goto failed;
1050 } 1058 }
1051 1059
1052 /* 1060 /*
1053 * Allocate some buffers for this page 1061 * Allocate some buffers for this page
1054 */ 1062 */
1055 bh = alloc_page_buffers(page, size, 0); 1063 bh = alloc_page_buffers(page, size, 0);
1056 if (!bh) 1064 if (!bh)
1057 goto failed; 1065 goto failed;
1058 1066
1059 /* 1067 /*
1060 * Link the page to the buffers and initialise them. Take the 1068 * Link the page to the buffers and initialise them. Take the
1061 * lock to be atomic wrt __find_get_block(), which does not 1069 * lock to be atomic wrt __find_get_block(), which does not
1062 * run under the page lock. 1070 * run under the page lock.
1063 */ 1071 */
1064 spin_lock(&inode->i_mapping->private_lock); 1072 spin_lock(&inode->i_mapping->private_lock);
1065 link_dev_buffers(page, bh); 1073 link_dev_buffers(page, bh);
1066 init_page_buffers(page, bdev, block, size); 1074 init_page_buffers(page, bdev, block, size);
1067 spin_unlock(&inode->i_mapping->private_lock); 1075 spin_unlock(&inode->i_mapping->private_lock);
1068 return page; 1076 return page;
1069 1077
1070 failed: 1078 failed:
1071 BUG(); 1079 BUG();
1072 unlock_page(page); 1080 unlock_page(page);
1073 page_cache_release(page); 1081 page_cache_release(page);
1074 return NULL; 1082 return NULL;
1075 } 1083 }
1076 1084
1077 /* 1085 /*
1078 * Create buffers for the specified block device block's page. If 1086 * Create buffers for the specified block device block's page. If
1079 * that page was dirty, the buffers are set dirty also. 1087 * that page was dirty, the buffers are set dirty also.
1080 */ 1088 */
1081 static int 1089 static int
1082 grow_buffers(struct block_device *bdev, sector_t block, int size) 1090 grow_buffers(struct block_device *bdev, sector_t block, int size)
1083 { 1091 {
1084 struct page *page; 1092 struct page *page;
1085 pgoff_t index; 1093 pgoff_t index;
1086 int sizebits; 1094 int sizebits;
1087 1095
1088 sizebits = -1; 1096 sizebits = -1;
1089 do { 1097 do {
1090 sizebits++; 1098 sizebits++;
1091 } while ((size << sizebits) < PAGE_SIZE); 1099 } while ((size << sizebits) < PAGE_SIZE);
1092 1100
1093 index = block >> sizebits; 1101 index = block >> sizebits;
1094 1102
1095 /* 1103 /*
1096 * Check for a block which wants to lie outside our maximum possible 1104 * Check for a block which wants to lie outside our maximum possible
1097 * pagecache index. (this comparison is done using sector_t types). 1105 * pagecache index. (this comparison is done using sector_t types).
1098 */ 1106 */
1099 if (unlikely(index != block >> sizebits)) { 1107 if (unlikely(index != block >> sizebits)) {
1100 char b[BDEVNAME_SIZE]; 1108 char b[BDEVNAME_SIZE];
1101 1109
1102 printk(KERN_ERR "%s: requested out-of-range block %llu for " 1110 printk(KERN_ERR "%s: requested out-of-range block %llu for "
1103 "device %s\n", 1111 "device %s\n",
1104 __func__, (unsigned long long)block, 1112 __func__, (unsigned long long)block,
1105 bdevname(bdev, b)); 1113 bdevname(bdev, b));
1106 return -EIO; 1114 return -EIO;
1107 } 1115 }
1108 block = index << sizebits; 1116 block = index << sizebits;
1109 /* Create a page with the proper size buffers.. */ 1117 /* Create a page with the proper size buffers.. */
1110 page = grow_dev_page(bdev, block, index, size); 1118 page = grow_dev_page(bdev, block, index, size);
1111 if (!page) 1119 if (!page)
1112 return 0; 1120 return 0;
1113 unlock_page(page); 1121 unlock_page(page);
1114 page_cache_release(page); 1122 page_cache_release(page);
1115 return 1; 1123 return 1;
1116 } 1124 }
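A hedged worked example of the index arithmetic in grow_buffers(), assuming PAGE_SIZE is 4096 and a 1024-byte block size:

/*
 * size == 1024, PAGE_SIZE == 4096  =>  sizebits == 2 (4 blocks per page)
 * block 4102:  index = 4102 >> 2 = 1025
 *              block is rounded down to 1025 << 2 = 4100,
 *              so grow_dev_page() builds the page covering blocks 4100-4103.
 */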
1117 1125
1118 static struct buffer_head * 1126 static struct buffer_head *
1119 __getblk_slow(struct block_device *bdev, sector_t block, int size) 1127 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1120 { 1128 {
1121 /* Size must be multiple of hard sectorsize */ 1129 /* Size must be multiple of hard sectorsize */
1122 if (unlikely(size & (bdev_hardsect_size(bdev)-1) || 1130 if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
1123 (size < 512 || size > PAGE_SIZE))) { 1131 (size < 512 || size > PAGE_SIZE))) {
1124 printk(KERN_ERR "getblk(): invalid block size %d requested\n", 1132 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1125 size); 1133 size);
1126 printk(KERN_ERR "hardsect size: %d\n", 1134 printk(KERN_ERR "hardsect size: %d\n",
1127 bdev_hardsect_size(bdev)); 1135 bdev_hardsect_size(bdev));
1128 1136
1129 dump_stack(); 1137 dump_stack();
1130 return NULL; 1138 return NULL;
1131 } 1139 }
1132 1140
1133 for (;;) { 1141 for (;;) {
1134 struct buffer_head * bh; 1142 struct buffer_head * bh;
1135 int ret; 1143 int ret;
1136 1144
1137 bh = __find_get_block(bdev, block, size); 1145 bh = __find_get_block(bdev, block, size);
1138 if (bh) 1146 if (bh)
1139 return bh; 1147 return bh;
1140 1148
1141 ret = grow_buffers(bdev, block, size); 1149 ret = grow_buffers(bdev, block, size);
1142 if (ret < 0) 1150 if (ret < 0)
1143 return NULL; 1151 return NULL;
1144 if (ret == 0) 1152 if (ret == 0)
1145 free_more_memory(); 1153 free_more_memory();
1146 } 1154 }
1147 } 1155 }
1148 1156
1149 /* 1157 /*
1150 * The relationship between dirty buffers and dirty pages: 1158 * The relationship between dirty buffers and dirty pages:
1151 * 1159 *
1152 * Whenever a page has any dirty buffers, the page's dirty bit is set, and 1160 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1153 * the page is tagged dirty in its radix tree. 1161 * the page is tagged dirty in its radix tree.
1154 * 1162 *
1155 * At all times, the dirtiness of the buffers represents the dirtiness of 1163 * At all times, the dirtiness of the buffers represents the dirtiness of
1156 * subsections of the page. If the page has buffers, the page dirty bit is 1164 * subsections of the page. If the page has buffers, the page dirty bit is
1157 * merely a hint about the true dirty state. 1165 * merely a hint about the true dirty state.
1158 * 1166 *
1159 * When a page is set dirty in its entirety, all its buffers are marked dirty 1167 * When a page is set dirty in its entirety, all its buffers are marked dirty
1160 * (if the page has buffers). 1168 * (if the page has buffers).
1161 * 1169 *
1162 * When a buffer is marked dirty, its page is dirtied, but the page's other 1170 * When a buffer is marked dirty, its page is dirtied, but the page's other
1163 * buffers are not. 1171 * buffers are not.
1164 * 1172 *
1165 * Also. When blockdev buffers are explicitly read with bread(), they 1173 * Also. When blockdev buffers are explicitly read with bread(), they
1166 * individually become uptodate. But their backing page remains not 1174 * individually become uptodate. But their backing page remains not
1167 * uptodate - even if all of its buffers are uptodate. A subsequent 1175 * uptodate - even if all of its buffers are uptodate. A subsequent
1168 * block_read_full_page() against that page will discover all the uptodate 1176 * block_read_full_page() against that page will discover all the uptodate
1169 * buffers, will set the page uptodate and will perform no I/O. 1177 * buffers, will set the page uptodate and will perform no I/O.
1170 */ 1178 */
1171 1179
1172 /** 1180 /**
1173 * mark_buffer_dirty - mark a buffer_head as needing writeout 1181 * mark_buffer_dirty - mark a buffer_head as needing writeout
1174 * @bh: the buffer_head to mark dirty 1182 * @bh: the buffer_head to mark dirty
1175 * 1183 *
1176 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its 1184 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1177 * backing page dirty, then tag the page as dirty in its address_space's radix 1185 * backing page dirty, then tag the page as dirty in its address_space's radix
1178 * tree and then attach the address_space's inode to its superblock's dirty 1186 * tree and then attach the address_space's inode to its superblock's dirty
1179 * inode list. 1187 * inode list.
1180 * 1188 *
1181 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, 1189 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1182 * mapping->tree_lock and the global inode_lock. 1190 * mapping->tree_lock and the global inode_lock.
1183 */ 1191 */
1184 void mark_buffer_dirty(struct buffer_head *bh) 1192 void mark_buffer_dirty(struct buffer_head *bh)
1185 { 1193 {
1186 WARN_ON_ONCE(!buffer_uptodate(bh)); 1194 WARN_ON_ONCE(!buffer_uptodate(bh));
1187 1195
1188 /* 1196 /*
1189 * Very *carefully* optimize the it-is-already-dirty case. 1197 * Very *carefully* optimize the it-is-already-dirty case.
1190 * 1198 *
1191 * Don't let the final "is it dirty" escape to before we 1199 * Don't let the final "is it dirty" escape to before we
1192 * perhaps modified the buffer. 1200 * perhaps modified the buffer.
1193 */ 1201 */
1194 if (buffer_dirty(bh)) { 1202 if (buffer_dirty(bh)) {
1195 smp_mb(); 1203 smp_mb();
1196 if (buffer_dirty(bh)) 1204 if (buffer_dirty(bh))
1197 return; 1205 return;
1198 } 1206 }
1199 1207
1200 if (!test_set_buffer_dirty(bh)) 1208 if (!test_set_buffer_dirty(bh))
1201 __set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0); 1209 __set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0);
1202 } 1210 }
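A hedged sketch of the modify-then-dirty pattern mark_buffer_dirty() is built for; the examplefs_super_block layout and its label field are hypothetical.

struct examplefs_super_block {			/* hypothetical on-disk layout */
	char	s_label[16];
};

static void examplefs_set_label(struct buffer_head *sb_bh, const char *label)
{
	struct examplefs_super_block *es =
		(struct examplefs_super_block *)sb_bh->b_data;

	/* modify the buffer contents first ... */
	strncpy(es->s_label, label, sizeof(es->s_label));
	/* ... then dirty the buffer, its page, the radix tree and the inode */
	mark_buffer_dirty(sb_bh);
}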
1203 1211
1204 /* 1212 /*
1205 * Decrement a buffer_head's reference count. If all buffers against a page 1213 * Decrement a buffer_head's reference count. If all buffers against a page
1206 * have zero reference count, are clean and unlocked, and if the page is clean 1214 * have zero reference count, are clean and unlocked, and if the page is clean
1207 * and unlocked then try_to_free_buffers() may strip the buffers from the page 1215 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1208 * in preparation for freeing it (sometimes, rarely, buffers are removed from 1216 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1209 * a page but it ends up not being freed, and buffers may later be reattached). 1217 * a page but it ends up not being freed, and buffers may later be reattached).
1210 */ 1218 */
1211 void __brelse(struct buffer_head * buf) 1219 void __brelse(struct buffer_head * buf)
1212 { 1220 {
1213 if (atomic_read(&buf->b_count)) { 1221 if (atomic_read(&buf->b_count)) {
1214 put_bh(buf); 1222 put_bh(buf);
1215 return; 1223 return;
1216 } 1224 }
1217 WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n"); 1225 WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1218 } 1226 }
1219 1227
1220 /* 1228 /*
1221 * bforget() is like brelse(), except it discards any 1229 * bforget() is like brelse(), except it discards any
1222 * potentially dirty data. 1230 * potentially dirty data.
1223 */ 1231 */
1224 void __bforget(struct buffer_head *bh) 1232 void __bforget(struct buffer_head *bh)
1225 { 1233 {
1226 clear_buffer_dirty(bh); 1234 clear_buffer_dirty(bh);
1227 if (bh->b_assoc_map) { 1235 if (bh->b_assoc_map) {
1228 struct address_space *buffer_mapping = bh->b_page->mapping; 1236 struct address_space *buffer_mapping = bh->b_page->mapping;
1229 1237
1230 spin_lock(&buffer_mapping->private_lock); 1238 spin_lock(&buffer_mapping->private_lock);
1231 list_del_init(&bh->b_assoc_buffers); 1239 list_del_init(&bh->b_assoc_buffers);
1232 bh->b_assoc_map = NULL; 1240 bh->b_assoc_map = NULL;
1233 spin_unlock(&buffer_mapping->private_lock); 1241 spin_unlock(&buffer_mapping->private_lock);
1234 } 1242 }
1235 __brelse(bh); 1243 __brelse(bh);
1236 } 1244 }
1237 1245
1238 static struct buffer_head *__bread_slow(struct buffer_head *bh) 1246 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1239 { 1247 {
1240 lock_buffer(bh); 1248 lock_buffer(bh);
1241 if (buffer_uptodate(bh)) { 1249 if (buffer_uptodate(bh)) {
1242 unlock_buffer(bh); 1250 unlock_buffer(bh);
1243 return bh; 1251 return bh;
1244 } else { 1252 } else {
1245 get_bh(bh); 1253 get_bh(bh);
1246 bh->b_end_io = end_buffer_read_sync; 1254 bh->b_end_io = end_buffer_read_sync;
1247 submit_bh(READ, bh); 1255 submit_bh(READ, bh);
1248 wait_on_buffer(bh); 1256 wait_on_buffer(bh);
1249 if (buffer_uptodate(bh)) 1257 if (buffer_uptodate(bh))
1250 return bh; 1258 return bh;
1251 } 1259 }
1252 brelse(bh); 1260 brelse(bh);
1253 return NULL; 1261 return NULL;
1254 } 1262 }
1255 1263
1256 /* 1264 /*
1257 * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block(). 1265 * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
1258 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their 1266 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1259 * refcount elevated by one when they're in an LRU. A buffer can only appear 1267 * refcount elevated by one when they're in an LRU. A buffer can only appear
1260 * once in a particular CPU's LRU. A single buffer can be present in multiple 1268 * once in a particular CPU's LRU. A single buffer can be present in multiple
1261 * CPU's LRUs at the same time. 1269 * CPU's LRUs at the same time.
1262 * 1270 *
1263 * This is a transparent caching front-end to sb_bread(), sb_getblk() and 1271 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1264 * sb_find_get_block(). 1272 * sb_find_get_block().
1265 * 1273 *
1266 * The LRUs themselves only need locking against invalidate_bh_lrus. We use 1274 * The LRUs themselves only need locking against invalidate_bh_lrus. We use
1267 * a local interrupt disable for that. 1275 * a local interrupt disable for that.
1268 */ 1276 */
1269 1277
1270 #define BH_LRU_SIZE 8 1278 #define BH_LRU_SIZE 8
1271 1279
1272 struct bh_lru { 1280 struct bh_lru {
1273 struct buffer_head *bhs[BH_LRU_SIZE]; 1281 struct buffer_head *bhs[BH_LRU_SIZE];
1274 }; 1282 };
1275 1283
1276 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }}; 1284 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1277 1285
1278 #ifdef CONFIG_SMP 1286 #ifdef CONFIG_SMP
1279 #define bh_lru_lock() local_irq_disable() 1287 #define bh_lru_lock() local_irq_disable()
1280 #define bh_lru_unlock() local_irq_enable() 1288 #define bh_lru_unlock() local_irq_enable()
1281 #else 1289 #else
1282 #define bh_lru_lock() preempt_disable() 1290 #define bh_lru_lock() preempt_disable()
1283 #define bh_lru_unlock() preempt_enable() 1291 #define bh_lru_unlock() preempt_enable()
1284 #endif 1292 #endif
1285 1293
1286 static inline void check_irqs_on(void) 1294 static inline void check_irqs_on(void)
1287 { 1295 {
1288 #ifdef irqs_disabled 1296 #ifdef irqs_disabled
1289 BUG_ON(irqs_disabled()); 1297 BUG_ON(irqs_disabled());
1290 #endif 1298 #endif
1291 } 1299 }
1292 1300
1293 /* 1301 /*
1294 * The LRU management algorithm is dopey-but-simple. Sorry. 1302 * The LRU management algorithm is dopey-but-simple. Sorry.
1295 */ 1303 */
1296 static void bh_lru_install(struct buffer_head *bh) 1304 static void bh_lru_install(struct buffer_head *bh)
1297 { 1305 {
1298 struct buffer_head *evictee = NULL; 1306 struct buffer_head *evictee = NULL;
1299 struct bh_lru *lru; 1307 struct bh_lru *lru;
1300 1308
1301 check_irqs_on(); 1309 check_irqs_on();
1302 bh_lru_lock(); 1310 bh_lru_lock();
1303 lru = &__get_cpu_var(bh_lrus); 1311 lru = &__get_cpu_var(bh_lrus);
1304 if (lru->bhs[0] != bh) { 1312 if (lru->bhs[0] != bh) {
1305 struct buffer_head *bhs[BH_LRU_SIZE]; 1313 struct buffer_head *bhs[BH_LRU_SIZE];
1306 int in; 1314 int in;
1307 int out = 0; 1315 int out = 0;
1308 1316
1309 get_bh(bh); 1317 get_bh(bh);
1310 bhs[out++] = bh; 1318 bhs[out++] = bh;
1311 for (in = 0; in < BH_LRU_SIZE; in++) { 1319 for (in = 0; in < BH_LRU_SIZE; in++) {
1312 struct buffer_head *bh2 = lru->bhs[in]; 1320 struct buffer_head *bh2 = lru->bhs[in];
1313 1321
1314 if (bh2 == bh) { 1322 if (bh2 == bh) {
1315 __brelse(bh2); 1323 __brelse(bh2);
1316 } else { 1324 } else {
1317 if (out >= BH_LRU_SIZE) { 1325 if (out >= BH_LRU_SIZE) {
1318 BUG_ON(evictee != NULL); 1326 BUG_ON(evictee != NULL);
1319 evictee = bh2; 1327 evictee = bh2;
1320 } else { 1328 } else {
1321 bhs[out++] = bh2; 1329 bhs[out++] = bh2;
1322 } 1330 }
1323 } 1331 }
1324 } 1332 }
1325 while (out < BH_LRU_SIZE) 1333 while (out < BH_LRU_SIZE)
1326 bhs[out++] = NULL; 1334 bhs[out++] = NULL;
1327 memcpy(lru->bhs, bhs, sizeof(bhs)); 1335 memcpy(lru->bhs, bhs, sizeof(bhs));
1328 } 1336 }
1329 bh_lru_unlock(); 1337 bh_lru_unlock();
1330 1338
1331 if (evictee) 1339 if (evictee)
1332 __brelse(evictee); 1340 __brelse(evictee);
1333 } 1341 }
1334 1342
1335 /* 1343 /*
1336 * Look up the bh in this cpu's LRU. If it's there, move it to the head. 1344 * Look up the bh in this cpu's LRU. If it's there, move it to the head.
1337 */ 1345 */
1338 static struct buffer_head * 1346 static struct buffer_head *
1339 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) 1347 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1340 { 1348 {
1341 struct buffer_head *ret = NULL; 1349 struct buffer_head *ret = NULL;
1342 struct bh_lru *lru; 1350 struct bh_lru *lru;
1343 unsigned int i; 1351 unsigned int i;
1344 1352
1345 check_irqs_on(); 1353 check_irqs_on();
1346 bh_lru_lock(); 1354 bh_lru_lock();
1347 lru = &__get_cpu_var(bh_lrus); 1355 lru = &__get_cpu_var(bh_lrus);
1348 for (i = 0; i < BH_LRU_SIZE; i++) { 1356 for (i = 0; i < BH_LRU_SIZE; i++) {
1349 struct buffer_head *bh = lru->bhs[i]; 1357 struct buffer_head *bh = lru->bhs[i];
1350 1358
1351 if (bh && bh->b_bdev == bdev && 1359 if (bh && bh->b_bdev == bdev &&
1352 bh->b_blocknr == block && bh->b_size == size) { 1360 bh->b_blocknr == block && bh->b_size == size) {
1353 if (i) { 1361 if (i) {
1354 while (i) { 1362 while (i) {
1355 lru->bhs[i] = lru->bhs[i - 1]; 1363 lru->bhs[i] = lru->bhs[i - 1];
1356 i--; 1364 i--;
1357 } 1365 }
1358 lru->bhs[0] = bh; 1366 lru->bhs[0] = bh;
1359 } 1367 }
1360 get_bh(bh); 1368 get_bh(bh);
1361 ret = bh; 1369 ret = bh;
1362 break; 1370 break;
1363 } 1371 }
1364 } 1372 }
1365 bh_lru_unlock(); 1373 bh_lru_unlock();
1366 return ret; 1374 return ret;
1367 } 1375 }
1368 1376
1369 /* 1377 /*
1370 * Perform a pagecache lookup for the matching buffer. If it's there, refresh 1378 * Perform a pagecache lookup for the matching buffer. If it's there, refresh
1371 * it in the LRU and mark it as accessed. If it is not present then return 1379 * it in the LRU and mark it as accessed. If it is not present then return
1372 * NULL 1380 * NULL
1373 */ 1381 */
1374 struct buffer_head * 1382 struct buffer_head *
1375 __find_get_block(struct block_device *bdev, sector_t block, unsigned size) 1383 __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1376 { 1384 {
1377 struct buffer_head *bh = lookup_bh_lru(bdev, block, size); 1385 struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1378 1386
1379 if (bh == NULL) { 1387 if (bh == NULL) {
1380 bh = __find_get_block_slow(bdev, block); 1388 bh = __find_get_block_slow(bdev, block);
1381 if (bh) 1389 if (bh)
1382 bh_lru_install(bh); 1390 bh_lru_install(bh);
1383 } 1391 }
1384 if (bh) 1392 if (bh)
1385 touch_buffer(bh); 1393 touch_buffer(bh);
1386 return bh; 1394 return bh;
1387 } 1395 }
1388 EXPORT_SYMBOL(__find_get_block); 1396 EXPORT_SYMBOL(__find_get_block);
1389 1397
1390 /* 1398 /*
1391 * __getblk will locate (and, if necessary, create) the buffer_head 1399 * __getblk will locate (and, if necessary, create) the buffer_head
1392 * which corresponds to the passed block_device, block and size. The 1400 * which corresponds to the passed block_device, block and size. The
1393 * returned buffer has its reference count incremented. 1401 * returned buffer has its reference count incremented.
1394 * 1402 *
1395 * __getblk() cannot fail - it just keeps trying. If you pass it an 1403 * __getblk() cannot fail - it just keeps trying. If you pass it an
1396 * illegal block number, __getblk() will happily return a buffer_head 1404 * illegal block number, __getblk() will happily return a buffer_head
1397 * which represents the non-existent block. Very weird. 1405 * which represents the non-existent block. Very weird.
1398 * 1406 *
1399 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() 1407 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1400 * attempt is failing. FIXME, perhaps? 1408 * attempt is failing. FIXME, perhaps?
1401 */ 1409 */
1402 struct buffer_head * 1410 struct buffer_head *
1403 __getblk(struct block_device *bdev, sector_t block, unsigned size) 1411 __getblk(struct block_device *bdev, sector_t block, unsigned size)
1404 { 1412 {
1405 struct buffer_head *bh = __find_get_block(bdev, block, size); 1413 struct buffer_head *bh = __find_get_block(bdev, block, size);
1406 1414
1407 might_sleep(); 1415 might_sleep();
1408 if (bh == NULL) 1416 if (bh == NULL)
1409 bh = __getblk_slow(bdev, block, size); 1417 bh = __getblk_slow(bdev, block, size);
1410 return bh; 1418 return bh;
1411 } 1419 }
1412 EXPORT_SYMBOL(__getblk); 1420 EXPORT_SYMBOL(__getblk);
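A hedged sketch of a common __getblk() pattern: a block that is about to be completely overwritten needs no prior read, so the buffer can be filled, marked uptodate and dirtied directly. The helper name and parameters are illustrative.

static void examplefs_zero_block(struct block_device *bdev,
				 sector_t block, unsigned size)
{
	struct buffer_head *bh = __getblk(bdev, block, size);

	/* __getblk() cannot fail, so no NULL check is needed */
	lock_buffer(bh);
	memset(bh->b_data, 0, bh->b_size);
	set_buffer_uptodate(bh);
	unlock_buffer(bh);
	mark_buffer_dirty(bh);
	brelse(bh);
}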
1413 1421
1414 /* 1422 /*
1415 * Do async read-ahead on a buffer.. 1423 * Do async read-ahead on a buffer..
1416 */ 1424 */
1417 void __breadahead(struct block_device *bdev, sector_t block, unsigned size) 1425 void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1418 { 1426 {
1419 struct buffer_head *bh = __getblk(bdev, block, size); 1427 struct buffer_head *bh = __getblk(bdev, block, size);
1420 if (likely(bh)) { 1428 if (likely(bh)) {
1421 ll_rw_block(READA, 1, &bh); 1429 ll_rw_block(READA, 1, &bh);
1422 brelse(bh); 1430 brelse(bh);
1423 } 1431 }
1424 } 1432 }
1425 EXPORT_SYMBOL(__breadahead); 1433 EXPORT_SYMBOL(__breadahead);
1426 1434
1427 /** 1435 /**
1428 * __bread() - reads a specified block and returns the bh 1436 * __bread() - reads a specified block and returns the bh
1429 * @bdev: the block_device to read from 1437 * @bdev: the block_device to read from
1430 * @block: number of block 1438 * @block: number of block
1431 * @size: size (in bytes) to read 1439 * @size: size (in bytes) to read
1432 * 1440 *
1433 * Reads a specified block, and returns buffer head that contains it. 1441 * Reads a specified block, and returns buffer head that contains it.
1434 * It returns NULL if the block was unreadable. 1442 * It returns NULL if the block was unreadable.
1435 */ 1443 */
1436 struct buffer_head * 1444 struct buffer_head *
1437 __bread(struct block_device *bdev, sector_t block, unsigned size) 1445 __bread(struct block_device *bdev, sector_t block, unsigned size)
1438 { 1446 {
1439 struct buffer_head *bh = __getblk(bdev, block, size); 1447 struct buffer_head *bh = __getblk(bdev, block, size);
1440 1448
1441 if (likely(bh) && !buffer_uptodate(bh)) 1449 if (likely(bh) && !buffer_uptodate(bh))
1442 bh = __bread_slow(bh); 1450 bh = __bread_slow(bh);
1443 return bh; 1451 return bh;
1444 } 1452 }
1445 EXPORT_SYMBOL(__bread); 1453 EXPORT_SYMBOL(__bread);
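A hedged sketch of a synchronous metadata read with __bread(); the block number, the 1024-byte block size and the way b_data is consumed are assumptions.

static int examplefs_check_block_one(struct block_device *bdev)
{
	struct buffer_head *bh;

	bh = __bread(bdev, 1, 1024);	/* block 1, 1024-byte blocks */
	if (!bh)
		return -EIO;		/* the block was unreadable */
	/* ... inspect bh->b_data ... */
	brelse(bh);
	return 0;
}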
1446 1454
1447 /* 1455 /*
1448 * invalidate_bh_lrus() is called rarely - but not only at unmount. 1456 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1449 * This doesn't race because it runs in each cpu either in irq 1457 * This doesn't race because it runs in each cpu either in irq
1450 * or with preempt disabled. 1458 * or with preempt disabled.
1451 */ 1459 */
1452 static void invalidate_bh_lru(void *arg) 1460 static void invalidate_bh_lru(void *arg)
1453 { 1461 {
1454 struct bh_lru *b = &get_cpu_var(bh_lrus); 1462 struct bh_lru *b = &get_cpu_var(bh_lrus);
1455 int i; 1463 int i;
1456 1464
1457 for (i = 0; i < BH_LRU_SIZE; i++) { 1465 for (i = 0; i < BH_LRU_SIZE; i++) {
1458 brelse(b->bhs[i]); 1466 brelse(b->bhs[i]);
1459 b->bhs[i] = NULL; 1467 b->bhs[i] = NULL;
1460 } 1468 }
1461 put_cpu_var(bh_lrus); 1469 put_cpu_var(bh_lrus);
1462 } 1470 }
1463 1471
1464 void invalidate_bh_lrus(void) 1472 void invalidate_bh_lrus(void)
1465 { 1473 {
1466 on_each_cpu(invalidate_bh_lru, NULL, 1); 1474 on_each_cpu(invalidate_bh_lru, NULL, 1);
1467 } 1475 }
1468 EXPORT_SYMBOL_GPL(invalidate_bh_lrus); 1476 EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1469 1477
1470 void set_bh_page(struct buffer_head *bh, 1478 void set_bh_page(struct buffer_head *bh,
1471 struct page *page, unsigned long offset) 1479 struct page *page, unsigned long offset)
1472 { 1480 {
1473 bh->b_page = page; 1481 bh->b_page = page;
1474 BUG_ON(offset >= PAGE_SIZE); 1482 BUG_ON(offset >= PAGE_SIZE);
1475 if (PageHighMem(page)) 1483 if (PageHighMem(page))
1476 /* 1484 /*
1477 * This catches illegal uses and preserves the offset: 1485 * This catches illegal uses and preserves the offset:
1478 */ 1486 */
1479 bh->b_data = (char *)(0 + offset); 1487 bh->b_data = (char *)(0 + offset);
1480 else 1488 else
1481 bh->b_data = page_address(page) + offset; 1489 bh->b_data = page_address(page) + offset;
1482 } 1490 }
1483 EXPORT_SYMBOL(set_bh_page); 1491 EXPORT_SYMBOL(set_bh_page);
1484 1492
1485 /* 1493 /*
1486 * Called when truncating a buffer on a page completely. 1494 * Called when truncating a buffer on a page completely.
1487 */ 1495 */
1488 static void discard_buffer(struct buffer_head * bh) 1496 static void discard_buffer(struct buffer_head * bh)
1489 { 1497 {
1490 lock_buffer(bh); 1498 lock_buffer(bh);
1491 clear_buffer_dirty(bh); 1499 clear_buffer_dirty(bh);
1492 bh->b_bdev = NULL; 1500 bh->b_bdev = NULL;
1493 clear_buffer_mapped(bh); 1501 clear_buffer_mapped(bh);
1494 clear_buffer_req(bh); 1502 clear_buffer_req(bh);
1495 clear_buffer_new(bh); 1503 clear_buffer_new(bh);
1496 clear_buffer_delay(bh); 1504 clear_buffer_delay(bh);
1497 clear_buffer_unwritten(bh); 1505 clear_buffer_unwritten(bh);
1498 unlock_buffer(bh); 1506 unlock_buffer(bh);
1499 } 1507 }
1500 1508
1501 /** 1509 /**
1502 * block_invalidatepage - invalidate part or all of a buffer-backed page 1510 * block_invalidatepage - invalidate part or all of a buffer-backed page
1503 * 1511 *
1504 * @page: the page which is affected 1512 * @page: the page which is affected
1505 * @offset: the index of the truncation point 1513 * @offset: the index of the truncation point
1506 * 1514 *
1507 * block_invalidatepage() is called when all or part of the page has become 1515 * block_invalidatepage() is called when all or part of the page has become
1508 * invalidated by a truncate operation. 1516 * invalidated by a truncate operation.
1509 * 1517 *
1510 * block_invalidatepage() does not have to release all buffers, but it must 1518 * block_invalidatepage() does not have to release all buffers, but it must
1511 * ensure that no dirty buffer is left outside @offset and that no I/O 1519 * ensure that no dirty buffer is left outside @offset and that no I/O
1512 * is underway against any of the blocks which are outside the truncation 1520 * is underway against any of the blocks which are outside the truncation
1513 * point. Because the caller is about to free (and possibly reuse) those 1521 * point. Because the caller is about to free (and possibly reuse) those
1514 * blocks on-disk. 1522 * blocks on-disk.
1515 */ 1523 */
1516 void block_invalidatepage(struct page *page, unsigned long offset) 1524 void block_invalidatepage(struct page *page, unsigned long offset)
1517 { 1525 {
1518 struct buffer_head *head, *bh, *next; 1526 struct buffer_head *head, *bh, *next;
1519 unsigned int curr_off = 0; 1527 unsigned int curr_off = 0;
1520 1528
1521 BUG_ON(!PageLocked(page)); 1529 BUG_ON(!PageLocked(page));
1522 if (!page_has_buffers(page)) 1530 if (!page_has_buffers(page))
1523 goto out; 1531 goto out;
1524 1532
1525 head = page_buffers(page); 1533 head = page_buffers(page);
1526 bh = head; 1534 bh = head;
1527 do { 1535 do {
1528 unsigned int next_off = curr_off + bh->b_size; 1536 unsigned int next_off = curr_off + bh->b_size;
1529 next = bh->b_this_page; 1537 next = bh->b_this_page;
1530 1538
1531 /* 1539 /*
1532 * is this block fully invalidated? 1540 * is this block fully invalidated?
1533 */ 1541 */
1534 if (offset <= curr_off) 1542 if (offset <= curr_off)
1535 discard_buffer(bh); 1543 discard_buffer(bh);
1536 curr_off = next_off; 1544 curr_off = next_off;
1537 bh = next; 1545 bh = next;
1538 } while (bh != head); 1546 } while (bh != head);
1539 1547
1540 /* 1548 /*
1541 * We release buffers only if the entire page is being invalidated. 1549 * We release buffers only if the entire page is being invalidated.
1542 * The get_block cached value has been unconditionally invalidated, 1550 * The get_block cached value has been unconditionally invalidated,
1543 * so real IO is not possible anymore. 1551 * so real IO is not possible anymore.
1544 */ 1552 */
1545 if (offset == 0) 1553 if (offset == 0)
1546 try_to_release_page(page, 0); 1554 try_to_release_page(page, 0);
1547 out: 1555 out:
1548 return; 1556 return;
1549 } 1557 }
1550 EXPORT_SYMBOL(block_invalidatepage); 1558 EXPORT_SYMBOL(block_invalidatepage);
1551 1559
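For context, block_invalidatepage() is the fallback the truncate path uses when a filesystem does not install its own ->invalidatepage operation. The dispatch in mm/truncate.c of this kernel generation looks roughly like the sketch below; it is paraphrased here for orientation only and is not part of this diff.

/* paraphrased sketch of the mm/truncate.c dispatch; details may differ */
void do_invalidatepage(struct page *page, unsigned long offset)
{
	void (*invalidatepage)(struct page *, unsigned long);

	invalidatepage = page->mapping->a_ops->invalidatepage;
#ifdef CONFIG_BLOCK
	if (!invalidatepage)
		invalidatepage = block_invalidatepage;
#endif
	if (invalidatepage)
		(*invalidatepage)(page, offset);
}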
1552 /* 1560 /*
1553 * We attach and possibly dirty the buffers atomically wrt 1561 * We attach and possibly dirty the buffers atomically wrt
1554 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers 1562 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
1555 * is already excluded via the page lock. 1563 * is already excluded via the page lock.
1556 */ 1564 */
1557 void create_empty_buffers(struct page *page, 1565 void create_empty_buffers(struct page *page,
1558 unsigned long blocksize, unsigned long b_state) 1566 unsigned long blocksize, unsigned long b_state)
1559 { 1567 {
1560 struct buffer_head *bh, *head, *tail; 1568 struct buffer_head *bh, *head, *tail;
1561 1569
1562 head = alloc_page_buffers(page, blocksize, 1); 1570 head = alloc_page_buffers(page, blocksize, 1);
1563 bh = head; 1571 bh = head;
1564 do { 1572 do {
1565 bh->b_state |= b_state; 1573 bh->b_state |= b_state;
1566 tail = bh; 1574 tail = bh;
1567 bh = bh->b_this_page; 1575 bh = bh->b_this_page;
1568 } while (bh); 1576 } while (bh);
1569 tail->b_this_page = head; 1577 tail->b_this_page = head;
1570 1578
1571 spin_lock(&page->mapping->private_lock); 1579 spin_lock(&page->mapping->private_lock);
1572 if (PageUptodate(page) || PageDirty(page)) { 1580 if (PageUptodate(page) || PageDirty(page)) {
1573 bh = head; 1581 bh = head;
1574 do { 1582 do {
1575 if (PageDirty(page)) 1583 if (PageDirty(page))
1576 set_buffer_dirty(bh); 1584 set_buffer_dirty(bh);
1577 if (PageUptodate(page)) 1585 if (PageUptodate(page))
1578 set_buffer_uptodate(bh); 1586 set_buffer_uptodate(bh);
1579 bh = bh->b_this_page; 1587 bh = bh->b_this_page;
1580 } while (bh != head); 1588 } while (bh != head);
1581 } 1589 }
1582 attach_page_buffers(page, head); 1590 attach_page_buffers(page, head);
1583 spin_unlock(&page->mapping->private_lock); 1591 spin_unlock(&page->mapping->private_lock);
1584 } 1592 }
1585 EXPORT_SYMBOL(create_empty_buffers); 1593 EXPORT_SYMBOL(create_empty_buffers);
1586 1594
1587 /* 1595 /*
1588 * We are taking a block for data and we don't want any output from any 1596 * We are taking a block for data and we don't want any output from any
1589 * buffer-cache aliases starting from return from that function and 1597 * buffer-cache aliases starting from return from that function and
1590 * until the moment when something will explicitly mark the buffer 1598 * until the moment when something will explicitly mark the buffer
1591 * dirty (hopefully that will not happen until we free that block ;-) 1599 * dirty (hopefully that will not happen until we free that block ;-)
1592 * We don't even need to mark it not-uptodate - nobody can expect 1600 * We don't even need to mark it not-uptodate - nobody can expect
1593 * anything from a newly allocated buffer anyway. We used to use 1601 * anything from a newly allocated buffer anyway. We used to use
1594 * unmap_buffer() for such invalidation, but that was wrong. We definitely 1602 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1595 * don't want to mark the alias unmapped, for example - it would confuse 1603 * don't want to mark the alias unmapped, for example - it would confuse
1596 * anyone who might pick it with bread() afterwards... 1604 * anyone who might pick it with bread() afterwards...
1597 * 1605 *
1598 * Also, note that bforget() doesn't lock the buffer, so there can 1606 * Also, note that bforget() doesn't lock the buffer, so there can
1599 * be writeout I/O going on against recently-freed buffers. We don't 1607 * be writeout I/O going on against recently-freed buffers. We don't
1600 * wait on that I/O in bforget() - it's more efficient to wait on the I/O 1608 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1601 * only if we really need to. That happens here. 1609 * only if we really need to. That happens here.
1602 */ 1610 */
1603 void unmap_underlying_metadata(struct block_device *bdev, sector_t block) 1611 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1604 { 1612 {
1605 struct buffer_head *old_bh; 1613 struct buffer_head *old_bh;
1606 1614
1607 might_sleep(); 1615 might_sleep();
1608 1616
1609 old_bh = __find_get_block_slow(bdev, block); 1617 old_bh = __find_get_block_slow(bdev, block);
1610 if (old_bh) { 1618 if (old_bh) {
1611 clear_buffer_dirty(old_bh); 1619 clear_buffer_dirty(old_bh);
1612 wait_on_buffer(old_bh); 1620 wait_on_buffer(old_bh);
1613 clear_buffer_req(old_bh); 1621 clear_buffer_req(old_bh);
1614 __brelse(old_bh); 1622 __brelse(old_bh);
1615 } 1623 }
1616 } 1624 }
1617 EXPORT_SYMBOL(unmap_underlying_metadata); 1625 EXPORT_SYMBOL(unmap_underlying_metadata);
1618 1626
1619 /* 1627 /*
1620 * NOTE! All mapped/uptodate combinations are valid: 1628 * NOTE! All mapped/uptodate combinations are valid:
1621 * 1629 *
1622 * Mapped Uptodate Meaning 1630 * Mapped Uptodate Meaning
1623 * 1631 *
1624 * No No "unknown" - must do get_block() 1632 * No No "unknown" - must do get_block()
1625 * No Yes "hole" - zero-filled 1633 * No Yes "hole" - zero-filled
1626 * Yes No "allocated" - allocated on disk, not read in 1634 * Yes No "allocated" - allocated on disk, not read in
1627 * Yes Yes "valid" - allocated and up-to-date in memory. 1635 * Yes Yes "valid" - allocated and up-to-date in memory.
1628 * 1636 *
1629 * "Dirty" is valid only with the last case (mapped+uptodate). 1637 * "Dirty" is valid only with the last case (mapped+uptodate).
1630 */ 1638 */
1631 1639
1632 /* 1640 /*
1633 * While block_write_full_page is writing back the dirty buffers under 1641 * While block_write_full_page is writing back the dirty buffers under
1634 * the page lock, whoever dirtied the buffers may decide to clean them 1642 * the page lock, whoever dirtied the buffers may decide to clean them
1635 * again at any time. We handle that by only looking at the buffer 1643 * again at any time. We handle that by only looking at the buffer
1636 * state inside lock_buffer(). 1644 * state inside lock_buffer().
1637 * 1645 *
1638 * If block_write_full_page() is called for regular writeback 1646 * If block_write_full_page() is called for regular writeback
1639 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a 1647 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1640 * locked buffer. This can only happen if someone has written the buffer 1648 * locked buffer. This can only happen if someone has written the buffer
1641 * directly, with submit_bh(). At the address_space level PageWriteback 1649 * directly, with submit_bh(). At the address_space level PageWriteback
1642 * prevents this contention from occurring. 1650 * prevents this contention from occurring.
1643 */ 1651 */
1644 static int __block_write_full_page(struct inode *inode, struct page *page, 1652 static int __block_write_full_page(struct inode *inode, struct page *page,
1645 get_block_t *get_block, struct writeback_control *wbc) 1653 get_block_t *get_block, struct writeback_control *wbc)
1646 { 1654 {
1647 int err; 1655 int err;
1648 sector_t block; 1656 sector_t block;
1649 sector_t last_block; 1657 sector_t last_block;
1650 struct buffer_head *bh, *head; 1658 struct buffer_head *bh, *head;
1651 const unsigned blocksize = 1 << inode->i_blkbits; 1659 const unsigned blocksize = 1 << inode->i_blkbits;
1652 int nr_underway = 0; 1660 int nr_underway = 0;
1653 1661
1654 BUG_ON(!PageLocked(page)); 1662 BUG_ON(!PageLocked(page));
1655 1663
1656 last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; 1664 last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1657 1665
1658 if (!page_has_buffers(page)) { 1666 if (!page_has_buffers(page)) {
1659 create_empty_buffers(page, blocksize, 1667 create_empty_buffers(page, blocksize,
1660 (1 << BH_Dirty)|(1 << BH_Uptodate)); 1668 (1 << BH_Dirty)|(1 << BH_Uptodate));
1661 } 1669 }
1662 1670
1663 /* 1671 /*
1664 * Be very careful. We have no exclusion from __set_page_dirty_buffers 1672 * Be very careful. We have no exclusion from __set_page_dirty_buffers
1665 * here, and the (potentially unmapped) buffers may become dirty at 1673 * here, and the (potentially unmapped) buffers may become dirty at
1666 * any time. If a buffer becomes dirty here after we've inspected it 1674 * any time. If a buffer becomes dirty here after we've inspected it
1667 * then we just miss that fact, and the page stays dirty. 1675 * then we just miss that fact, and the page stays dirty.
1668 * 1676 *
1669 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; 1677 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1670 * handle that here by just cleaning them. 1678 * handle that here by just cleaning them.
1671 */ 1679 */
1672 1680
1673 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 1681 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1674 head = page_buffers(page); 1682 head = page_buffers(page);
1675 bh = head; 1683 bh = head;
1676 1684
1677 /* 1685 /*
1678 * Get all the dirty buffers mapped to disk addresses and 1686 * Get all the dirty buffers mapped to disk addresses and
1679 * handle any aliases from the underlying blockdev's mapping. 1687 * handle any aliases from the underlying blockdev's mapping.
1680 */ 1688 */
1681 do { 1689 do {
1682 if (block > last_block) { 1690 if (block > last_block) {
1683 /* 1691 /*
1684 * mapped buffers outside i_size will occur, because 1692 * mapped buffers outside i_size will occur, because
1685 * this page can be outside i_size when there is a 1693 * this page can be outside i_size when there is a
1686 * truncate in progress. 1694 * truncate in progress.
1687 */ 1695 */
1688 /* 1696 /*
1689 * The buffer was zeroed by block_write_full_page() 1697 * The buffer was zeroed by block_write_full_page()
1690 */ 1698 */
1691 clear_buffer_dirty(bh); 1699 clear_buffer_dirty(bh);
1692 set_buffer_uptodate(bh); 1700 set_buffer_uptodate(bh);
1693 } else if ((!buffer_mapped(bh) || buffer_delay(bh)) && 1701 } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1694 buffer_dirty(bh)) { 1702 buffer_dirty(bh)) {
1695 WARN_ON(bh->b_size != blocksize); 1703 WARN_ON(bh->b_size != blocksize);
1696 err = get_block(inode, block, bh, 1); 1704 err = get_block(inode, block, bh, 1);
1697 if (err) 1705 if (err)
1698 goto recover; 1706 goto recover;
1699 clear_buffer_delay(bh); 1707 clear_buffer_delay(bh);
1700 if (buffer_new(bh)) { 1708 if (buffer_new(bh)) {
1701 /* blockdev mappings never come here */ 1709 /* blockdev mappings never come here */
1702 clear_buffer_new(bh); 1710 clear_buffer_new(bh);
1703 unmap_underlying_metadata(bh->b_bdev, 1711 unmap_underlying_metadata(bh->b_bdev,
1704 bh->b_blocknr); 1712 bh->b_blocknr);
1705 } 1713 }
1706 } 1714 }
1707 bh = bh->b_this_page; 1715 bh = bh->b_this_page;
1708 block++; 1716 block++;
1709 } while (bh != head); 1717 } while (bh != head);
1710 1718
1711 do { 1719 do {
1712 if (!buffer_mapped(bh)) 1720 if (!buffer_mapped(bh))
1713 continue; 1721 continue;
1714 /* 1722 /*
1715 * If it's a fully non-blocking write attempt and we cannot 1723 * If it's a fully non-blocking write attempt and we cannot
1716 * lock the buffer then redirty the page. Note that this can 1724 * lock the buffer then redirty the page. Note that this can
1717 * potentially cause a busy-wait loop from pdflush and kswapd 1725 * potentially cause a busy-wait loop from pdflush and kswapd
1718 * activity, but those code paths have their own higher-level 1726 * activity, but those code paths have their own higher-level
1719 * throttling. 1727 * throttling.
1720 */ 1728 */
1721 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { 1729 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1722 lock_buffer(bh); 1730 lock_buffer(bh);
1723 } else if (!trylock_buffer(bh)) { 1731 } else if (!trylock_buffer(bh)) {
1724 redirty_page_for_writepage(wbc, page); 1732 redirty_page_for_writepage(wbc, page);
1725 continue; 1733 continue;
1726 } 1734 }
1727 if (test_clear_buffer_dirty(bh)) { 1735 if (test_clear_buffer_dirty(bh)) {
1728 mark_buffer_async_write(bh); 1736 mark_buffer_async_write(bh);
1729 } else { 1737 } else {
1730 unlock_buffer(bh); 1738 unlock_buffer(bh);
1731 } 1739 }
1732 } while ((bh = bh->b_this_page) != head); 1740 } while ((bh = bh->b_this_page) != head);
1733 1741
1734 /* 1742 /*
1735 * The page and its buffers are protected by PageWriteback(), so we can 1743 * The page and its buffers are protected by PageWriteback(), so we can
1736 * drop the bh refcounts early. 1744 * drop the bh refcounts early.
1737 */ 1745 */
1738 BUG_ON(PageWriteback(page)); 1746 BUG_ON(PageWriteback(page));
1739 set_page_writeback(page); 1747 set_page_writeback(page);
1740 1748
1741 do { 1749 do {
1742 struct buffer_head *next = bh->b_this_page; 1750 struct buffer_head *next = bh->b_this_page;
1743 if (buffer_async_write(bh)) { 1751 if (buffer_async_write(bh)) {
1744 submit_bh(WRITE, bh); 1752 submit_bh(WRITE, bh);
1745 nr_underway++; 1753 nr_underway++;
1746 } 1754 }
1747 bh = next; 1755 bh = next;
1748 } while (bh != head); 1756 } while (bh != head);
1749 unlock_page(page); 1757 unlock_page(page);
1750 1758
1751 err = 0; 1759 err = 0;
1752 done: 1760 done:
1753 if (nr_underway == 0) { 1761 if (nr_underway == 0) {
1754 /* 1762 /*
1755 * The page was marked dirty, but the buffers were 1763 * The page was marked dirty, but the buffers were
1756 * clean. Someone wrote them back by hand with 1764 * clean. Someone wrote them back by hand with
1757 * ll_rw_block/submit_bh. A rare case. 1765 * ll_rw_block/submit_bh. A rare case.
1758 */ 1766 */
1759 end_page_writeback(page); 1767 end_page_writeback(page);
1760 1768
1761 /* 1769 /*
1762 * The page and buffer_heads can be released at any time from 1770 * The page and buffer_heads can be released at any time from
1763 * here on. 1771 * here on.
1764 */ 1772 */
1765 } 1773 }
1766 return err; 1774 return err;
1767 1775
1768 recover: 1776 recover:
1769 /* 1777 /*
1770 * ENOSPC, or some other error. We may already have added some 1778 * ENOSPC, or some other error. We may already have added some
1771 * blocks to the file, so we need to write these out to avoid 1779 * blocks to the file, so we need to write these out to avoid
1772 * exposing stale data. 1780 * exposing stale data.
1773 * The page is currently locked and not marked for writeback 1781 * The page is currently locked and not marked for writeback
1774 */ 1782 */
1775 bh = head; 1783 bh = head;
1776 /* Recovery: lock and submit the mapped buffers */ 1784 /* Recovery: lock and submit the mapped buffers */
1777 do { 1785 do {
1778 if (buffer_mapped(bh) && buffer_dirty(bh) && 1786 if (buffer_mapped(bh) && buffer_dirty(bh) &&
1779 !buffer_delay(bh)) { 1787 !buffer_delay(bh)) {
1780 lock_buffer(bh); 1788 lock_buffer(bh);
1781 mark_buffer_async_write(bh); 1789 mark_buffer_async_write(bh);
1782 } else { 1790 } else {
1783 /* 1791 /*
1784 * The buffer may have been set dirty during 1792 * The buffer may have been set dirty during
1785 * attachment to a dirty page. 1793 * attachment to a dirty page.
1786 */ 1794 */
1787 clear_buffer_dirty(bh); 1795 clear_buffer_dirty(bh);
1788 } 1796 }
1789 } while ((bh = bh->b_this_page) != head); 1797 } while ((bh = bh->b_this_page) != head);
1790 SetPageError(page); 1798 SetPageError(page);
1791 BUG_ON(PageWriteback(page)); 1799 BUG_ON(PageWriteback(page));
1792 mapping_set_error(page->mapping, err); 1800 mapping_set_error(page->mapping, err);
1793 set_page_writeback(page); 1801 set_page_writeback(page);
1794 do { 1802 do {
1795 struct buffer_head *next = bh->b_this_page; 1803 struct buffer_head *next = bh->b_this_page;
1796 if (buffer_async_write(bh)) { 1804 if (buffer_async_write(bh)) {
1797 clear_buffer_dirty(bh); 1805 clear_buffer_dirty(bh);
1798 submit_bh(WRITE, bh); 1806 submit_bh(WRITE, bh);
1799 nr_underway++; 1807 nr_underway++;
1800 } 1808 }
1801 bh = next; 1809 bh = next;
1802 } while (bh != head); 1810 } while (bh != head);
1803 unlock_page(page); 1811 unlock_page(page);
1804 goto done; 1812 goto done;
1805 } 1813 }
1806 1814
1807 /* 1815 /*
1808 * If a page has any new buffers, zero them out here, and mark them uptodate 1816 * If a page has any new buffers, zero them out here, and mark them uptodate
1809 * and dirty so they'll be written out (in order to prevent uninitialised 1817 * and dirty so they'll be written out (in order to prevent uninitialised
1810 * block data from leaking). And clear the new bit. 1818 * block data from leaking). And clear the new bit.
1811 */ 1819 */
1812 void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) 1820 void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1813 { 1821 {
1814 unsigned int block_start, block_end; 1822 unsigned int block_start, block_end;
1815 struct buffer_head *head, *bh; 1823 struct buffer_head *head, *bh;
1816 1824
1817 BUG_ON(!PageLocked(page)); 1825 BUG_ON(!PageLocked(page));
1818 if (!page_has_buffers(page)) 1826 if (!page_has_buffers(page))
1819 return; 1827 return;
1820 1828
1821 bh = head = page_buffers(page); 1829 bh = head = page_buffers(page);
1822 block_start = 0; 1830 block_start = 0;
1823 do { 1831 do {
1824 block_end = block_start + bh->b_size; 1832 block_end = block_start + bh->b_size;
1825 1833
1826 if (buffer_new(bh)) { 1834 if (buffer_new(bh)) {
1827 if (block_end > from && block_start < to) { 1835 if (block_end > from && block_start < to) {
1828 if (!PageUptodate(page)) { 1836 if (!PageUptodate(page)) {
1829 unsigned start, size; 1837 unsigned start, size;
1830 1838
1831 start = max(from, block_start); 1839 start = max(from, block_start);
1832 size = min(to, block_end) - start; 1840 size = min(to, block_end) - start;
1833 1841
1834 zero_user(page, start, size); 1842 zero_user(page, start, size);
1835 set_buffer_uptodate(bh); 1843 set_buffer_uptodate(bh);
1836 } 1844 }
1837 1845
1838 clear_buffer_new(bh); 1846 clear_buffer_new(bh);
1839 mark_buffer_dirty(bh); 1847 mark_buffer_dirty(bh);
1840 } 1848 }
1841 } 1849 }
1842 1850
1843 block_start = block_end; 1851 block_start = block_end;
1844 bh = bh->b_this_page; 1852 bh = bh->b_this_page;
1845 } while (bh != head); 1853 } while (bh != head);
1846 } 1854 }
1847 EXPORT_SYMBOL(page_zero_new_buffers); 1855 EXPORT_SYMBOL(page_zero_new_buffers);
1848 1856
1849 static int __block_prepare_write(struct inode *inode, struct page *page, 1857 static int __block_prepare_write(struct inode *inode, struct page *page,
1850 unsigned from, unsigned to, get_block_t *get_block) 1858 unsigned from, unsigned to, get_block_t *get_block)
1851 { 1859 {
1852 unsigned block_start, block_end; 1860 unsigned block_start, block_end;
1853 sector_t block; 1861 sector_t block;
1854 int err = 0; 1862 int err = 0;
1855 unsigned blocksize, bbits; 1863 unsigned blocksize, bbits;
1856 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; 1864 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1857 1865
1858 BUG_ON(!PageLocked(page)); 1866 BUG_ON(!PageLocked(page));
1859 BUG_ON(from > PAGE_CACHE_SIZE); 1867 BUG_ON(from > PAGE_CACHE_SIZE);
1860 BUG_ON(to > PAGE_CACHE_SIZE); 1868 BUG_ON(to > PAGE_CACHE_SIZE);
1861 BUG_ON(from > to); 1869 BUG_ON(from > to);
1862 1870
1863 blocksize = 1 << inode->i_blkbits; 1871 blocksize = 1 << inode->i_blkbits;
1864 if (!page_has_buffers(page)) 1872 if (!page_has_buffers(page))
1865 create_empty_buffers(page, blocksize, 0); 1873 create_empty_buffers(page, blocksize, 0);
1866 head = page_buffers(page); 1874 head = page_buffers(page);
1867 1875
1868 bbits = inode->i_blkbits; 1876 bbits = inode->i_blkbits;
1869 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); 1877 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1870 1878
1871 for(bh = head, block_start = 0; bh != head || !block_start; 1879 for(bh = head, block_start = 0; bh != head || !block_start;
1872 block++, block_start=block_end, bh = bh->b_this_page) { 1880 block++, block_start=block_end, bh = bh->b_this_page) {
1873 block_end = block_start + blocksize; 1881 block_end = block_start + blocksize;
1874 if (block_end <= from || block_start >= to) { 1882 if (block_end <= from || block_start >= to) {
1875 if (PageUptodate(page)) { 1883 if (PageUptodate(page)) {
1876 if (!buffer_uptodate(bh)) 1884 if (!buffer_uptodate(bh))
1877 set_buffer_uptodate(bh); 1885 set_buffer_uptodate(bh);
1878 } 1886 }
1879 continue; 1887 continue;
1880 } 1888 }
1881 if (buffer_new(bh)) 1889 if (buffer_new(bh))
1882 clear_buffer_new(bh); 1890 clear_buffer_new(bh);
1883 if (!buffer_mapped(bh)) { 1891 if (!buffer_mapped(bh)) {
1884 WARN_ON(bh->b_size != blocksize); 1892 WARN_ON(bh->b_size != blocksize);
1885 err = get_block(inode, block, bh, 1); 1893 err = get_block(inode, block, bh, 1);
1886 if (err) 1894 if (err)
1887 break; 1895 break;
1888 if (buffer_new(bh)) { 1896 if (buffer_new(bh)) {
1889 unmap_underlying_metadata(bh->b_bdev, 1897 unmap_underlying_metadata(bh->b_bdev,
1890 bh->b_blocknr); 1898 bh->b_blocknr);
1891 if (PageUptodate(page)) { 1899 if (PageUptodate(page)) {
1892 clear_buffer_new(bh); 1900 clear_buffer_new(bh);
1893 set_buffer_uptodate(bh); 1901 set_buffer_uptodate(bh);
1894 mark_buffer_dirty(bh); 1902 mark_buffer_dirty(bh);
1895 continue; 1903 continue;
1896 } 1904 }
1897 if (block_end > to || block_start < from) 1905 if (block_end > to || block_start < from)
1898 zero_user_segments(page, 1906 zero_user_segments(page,
1899 to, block_end, 1907 to, block_end,
1900 block_start, from); 1908 block_start, from);
1901 continue; 1909 continue;
1902 } 1910 }
1903 } 1911 }
1904 if (PageUptodate(page)) { 1912 if (PageUptodate(page)) {
1905 if (!buffer_uptodate(bh)) 1913 if (!buffer_uptodate(bh))
1906 set_buffer_uptodate(bh); 1914 set_buffer_uptodate(bh);
1907 continue; 1915 continue;
1908 } 1916 }
1909 if (!buffer_uptodate(bh) && !buffer_delay(bh) && 1917 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1910 !buffer_unwritten(bh) && 1918 !buffer_unwritten(bh) &&
1911 (block_start < from || block_end > to)) { 1919 (block_start < from || block_end > to)) {
1912 ll_rw_block(READ, 1, &bh); 1920 ll_rw_block(READ, 1, &bh);
1913 *wait_bh++=bh; 1921 *wait_bh++=bh;
1914 } 1922 }
1915 } 1923 }
1916 /* 1924 /*
1917 * If we issued read requests - let them complete. 1925 * If we issued read requests - let them complete.
1918 */ 1926 */
1919 while(wait_bh > wait) { 1927 while(wait_bh > wait) {
1920 wait_on_buffer(*--wait_bh); 1928 wait_on_buffer(*--wait_bh);
1921 if (!buffer_uptodate(*wait_bh)) 1929 if (!buffer_uptodate(*wait_bh))
1922 err = -EIO; 1930 err = -EIO;
1923 } 1931 }
1924 if (unlikely(err)) 1932 if (unlikely(err))
1925 page_zero_new_buffers(page, from, to); 1933 page_zero_new_buffers(page, from, to);
1926 return err; 1934 return err;
1927 } 1935 }
1928 1936
1929 static int __block_commit_write(struct inode *inode, struct page *page, 1937 static int __block_commit_write(struct inode *inode, struct page *page,
1930 unsigned from, unsigned to) 1938 unsigned from, unsigned to)
1931 { 1939 {
1932 unsigned block_start, block_end; 1940 unsigned block_start, block_end;
1933 int partial = 0; 1941 int partial = 0;
1934 unsigned blocksize; 1942 unsigned blocksize;
1935 struct buffer_head *bh, *head; 1943 struct buffer_head *bh, *head;
1936 1944
1937 blocksize = 1 << inode->i_blkbits; 1945 blocksize = 1 << inode->i_blkbits;
1938 1946
1939 for(bh = head = page_buffers(page), block_start = 0; 1947 for(bh = head = page_buffers(page), block_start = 0;
1940 bh != head || !block_start; 1948 bh != head || !block_start;
1941 block_start=block_end, bh = bh->b_this_page) { 1949 block_start=block_end, bh = bh->b_this_page) {
1942 block_end = block_start + blocksize; 1950 block_end = block_start + blocksize;
1943 if (block_end <= from || block_start >= to) { 1951 if (block_end <= from || block_start >= to) {
1944 if (!buffer_uptodate(bh)) 1952 if (!buffer_uptodate(bh))
1945 partial = 1; 1953 partial = 1;
1946 } else { 1954 } else {
1947 set_buffer_uptodate(bh); 1955 set_buffer_uptodate(bh);
1948 mark_buffer_dirty(bh); 1956 mark_buffer_dirty(bh);
1949 } 1957 }
1950 clear_buffer_new(bh); 1958 clear_buffer_new(bh);
1951 } 1959 }
1952 1960
1953 /* 1961 /*
1954 * If this is a partial write which happened to make all buffers 1962 * If this is a partial write which happened to make all buffers
1955 * uptodate then we can optimize away a bogus readpage() for 1963 * uptodate then we can optimize away a bogus readpage() for
1956 * the next read(). Here we 'discover' whether the page went 1964 * the next read(). Here we 'discover' whether the page went
1957 * uptodate as a result of this (potentially partial) write. 1965 * uptodate as a result of this (potentially partial) write.
1958 */ 1966 */
1959 if (!partial) 1967 if (!partial)
1960 SetPageUptodate(page); 1968 SetPageUptodate(page);
1961 return 0; 1969 return 0;
1962 } 1970 }
1963 1971
1964 /* 1972 /*
1965 * block_write_begin takes care of the basic task of block allocation and 1973 * block_write_begin takes care of the basic task of block allocation and
1966 * bringing partial write blocks uptodate first. 1974 * bringing partial write blocks uptodate first.
1967 * 1975 *
1968 * If *pagep is not NULL, then block_write_begin uses the locked page 1976 * If *pagep is not NULL, then block_write_begin uses the locked page
1969 * at *pagep rather than allocating its own. In this case, the page will 1977 * at *pagep rather than allocating its own. In this case, the page will
1970 * not be unlocked or deallocated on failure. 1978 * not be unlocked or deallocated on failure.
1971 */ 1979 */
1972 int block_write_begin(struct file *file, struct address_space *mapping, 1980 int block_write_begin(struct file *file, struct address_space *mapping,
1973 loff_t pos, unsigned len, unsigned flags, 1981 loff_t pos, unsigned len, unsigned flags,
1974 struct page **pagep, void **fsdata, 1982 struct page **pagep, void **fsdata,
1975 get_block_t *get_block) 1983 get_block_t *get_block)
1976 { 1984 {
1977 struct inode *inode = mapping->host; 1985 struct inode *inode = mapping->host;
1978 int status = 0; 1986 int status = 0;
1979 struct page *page; 1987 struct page *page;
1980 pgoff_t index; 1988 pgoff_t index;
1981 unsigned start, end; 1989 unsigned start, end;
1982 int ownpage = 0; 1990 int ownpage = 0;
1983 1991
1984 index = pos >> PAGE_CACHE_SHIFT; 1992 index = pos >> PAGE_CACHE_SHIFT;
1985 start = pos & (PAGE_CACHE_SIZE - 1); 1993 start = pos & (PAGE_CACHE_SIZE - 1);
1986 end = start + len; 1994 end = start + len;
1987 1995
1988 page = *pagep; 1996 page = *pagep;
1989 if (page == NULL) { 1997 if (page == NULL) {
1990 ownpage = 1; 1998 ownpage = 1;
1991 page = __grab_cache_page(mapping, index); 1999 page = __grab_cache_page(mapping, index);
1992 if (!page) { 2000 if (!page) {
1993 status = -ENOMEM; 2001 status = -ENOMEM;
1994 goto out; 2002 goto out;
1995 } 2003 }
1996 *pagep = page; 2004 *pagep = page;
1997 } else 2005 } else
1998 BUG_ON(!PageLocked(page)); 2006 BUG_ON(!PageLocked(page));
1999 2007
2000 status = __block_prepare_write(inode, page, start, end, get_block); 2008 status = __block_prepare_write(inode, page, start, end, get_block);
2001 if (unlikely(status)) { 2009 if (unlikely(status)) {
2002 ClearPageUptodate(page); 2010 ClearPageUptodate(page);
2003 2011
2004 if (ownpage) { 2012 if (ownpage) {
2005 unlock_page(page); 2013 unlock_page(page);
2006 page_cache_release(page); 2014 page_cache_release(page);
2007 *pagep = NULL; 2015 *pagep = NULL;
2008 2016
2009 /* 2017 /*
2010 * prepare_write() may have instantiated a few blocks 2018 * prepare_write() may have instantiated a few blocks
2011 * outside i_size. Trim these off again. Don't need 2019 * outside i_size. Trim these off again. Don't need
2012 * i_size_read because we hold i_mutex. 2020 * i_size_read because we hold i_mutex.
2013 */ 2021 */
2014 if (pos + len > inode->i_size) 2022 if (pos + len > inode->i_size)
2015 vmtruncate(inode, inode->i_size); 2023 vmtruncate(inode, inode->i_size);
2016 } 2024 }
2017 goto out; 2025 goto out;
2018 } 2026 }
2019 2027
2020 out: 2028 out:
2021 return status; 2029 return status;
2022 } 2030 }
2023 EXPORT_SYMBOL(block_write_begin); 2031 EXPORT_SYMBOL(block_write_begin);
2024 2032
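A typical user of block_write_begin() is a filesystem ->write_begin method that simply supplies its own block-mapping callback. A minimal sketch, assuming a hypothetical foo_get_block() with the usual get_block_t signature:

/* foo_get_block() is a hypothetical get_block_t supplied by the filesystem */
static int foo_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	/* NULL asks block_write_begin() to find and lock the page itself */
	*pagep = NULL;
	return block_write_begin(file, mapping, pos, len, flags,
				 pagep, fsdata, foo_get_block);
}

Setting *pagep to NULL lets the helper allocate and lock the page; a caller that already holds a locked page passes it in instead, as the comment above notes.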
2025 int block_write_end(struct file *file, struct address_space *mapping, 2033 int block_write_end(struct file *file, struct address_space *mapping,
2026 loff_t pos, unsigned len, unsigned copied, 2034 loff_t pos, unsigned len, unsigned copied,
2027 struct page *page, void *fsdata) 2035 struct page *page, void *fsdata)
2028 { 2036 {
2029 struct inode *inode = mapping->host; 2037 struct inode *inode = mapping->host;
2030 unsigned start; 2038 unsigned start;
2031 2039
2032 start = pos & (PAGE_CACHE_SIZE - 1); 2040 start = pos & (PAGE_CACHE_SIZE - 1);
2033 2041
2034 if (unlikely(copied < len)) { 2042 if (unlikely(copied < len)) {
2035 /* 2043 /*
2036 * The buffers that were written will now be uptodate, so we 2044 * The buffers that were written will now be uptodate, so we
2037 * don't have to worry about a readpage reading them and 2045 * don't have to worry about a readpage reading them and
2038 * overwriting a partial write. However if we have encountered 2046 * overwriting a partial write. However if we have encountered
2039 * a short write and only partially written into a buffer, it 2047 * a short write and only partially written into a buffer, it
2040 * will not be marked uptodate, so a readpage might come in and 2048 * will not be marked uptodate, so a readpage might come in and
2041 * destroy our partial write. 2049 * destroy our partial write.
2042 * 2050 *
2043 * Do the simplest thing, and just treat any short write to a 2051 * Do the simplest thing, and just treat any short write to a
2044 * non uptodate page as a zero-length write, and force the 2052 * non uptodate page as a zero-length write, and force the
2045 * caller to redo the whole thing. 2053 * caller to redo the whole thing.
2046 */ 2054 */
2047 if (!PageUptodate(page)) 2055 if (!PageUptodate(page))
2048 copied = 0; 2056 copied = 0;
2049 2057
2050 page_zero_new_buffers(page, start+copied, start+len); 2058 page_zero_new_buffers(page, start+copied, start+len);
2051 } 2059 }
2052 flush_dcache_page(page); 2060 flush_dcache_page(page);
2053 2061
2054 /* This could be a short (even 0-length) commit */ 2062 /* This could be a short (even 0-length) commit */
2055 __block_commit_write(inode, page, start, start+copied); 2063 __block_commit_write(inode, page, start, start+copied);
2056 2064
2057 return copied; 2065 return copied;
2058 } 2066 }
2059 EXPORT_SYMBOL(block_write_end); 2067 EXPORT_SYMBOL(block_write_end);
2060 2068
2061 int generic_write_end(struct file *file, struct address_space *mapping, 2069 int generic_write_end(struct file *file, struct address_space *mapping,
2062 loff_t pos, unsigned len, unsigned copied, 2070 loff_t pos, unsigned len, unsigned copied,
2063 struct page *page, void *fsdata) 2071 struct page *page, void *fsdata)
2064 { 2072 {
2065 struct inode *inode = mapping->host; 2073 struct inode *inode = mapping->host;
2066 int i_size_changed = 0; 2074 int i_size_changed = 0;
2067 2075
2068 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 2076 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2069 2077
2070 /* 2078 /*
2071 * No need to use i_size_read() here, the i_size 2079 * No need to use i_size_read() here, the i_size
2072 * cannot change under us because we hold i_mutex. 2080 * cannot change under us because we hold i_mutex.
2073 * 2081 *
2074 * But it's important to update i_size while still holding page lock: 2082 * But it's important to update i_size while still holding page lock:
2075 * page writeout could otherwise come in and zero beyond i_size. 2083 * page writeout could otherwise come in and zero beyond i_size.
2076 */ 2084 */
2077 if (pos+copied > inode->i_size) { 2085 if (pos+copied > inode->i_size) {
2078 i_size_write(inode, pos+copied); 2086 i_size_write(inode, pos+copied);
2079 i_size_changed = 1; 2087 i_size_changed = 1;
2080 } 2088 }
2081 2089
2082 unlock_page(page); 2090 unlock_page(page);
2083 page_cache_release(page); 2091 page_cache_release(page);
2084 2092
2085 /* 2093 /*
2086 * Don't mark the inode dirty under page lock. First, it unnecessarily 2094 * Don't mark the inode dirty under page lock. First, it unnecessarily
2087 * makes the holding time of page lock longer. Second, it forces lock 2095 * makes the holding time of page lock longer. Second, it forces lock
2088 * ordering of page lock and transaction start for journaling 2096 * ordering of page lock and transaction start for journaling
2089 * filesystems. 2097 * filesystems.
2090 */ 2098 */
2091 if (i_size_changed) 2099 if (i_size_changed)
2092 mark_inode_dirty(inode); 2100 mark_inode_dirty(inode);
2093 2101
2094 return copied; 2102 return copied;
2095 } 2103 }
2096 EXPORT_SYMBOL(generic_write_end); 2104 EXPORT_SYMBOL(generic_write_end);
2097 2105
2098 /* 2106 /*
2099 * block_is_partially_uptodate checks whether buffers within a page are 2107 * block_is_partially_uptodate checks whether buffers within a page are
2100 * uptodate or not. 2108 * uptodate or not.
2101 * 2109 *
2102 * Returns true if all buffers which correspond to a file portion 2110 * Returns true if all buffers which correspond to a file portion
2103 * we want to read are uptodate. 2111 * we want to read are uptodate.
2104 */ 2112 */
2105 int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, 2113 int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
2106 unsigned long from) 2114 unsigned long from)
2107 { 2115 {
2108 struct inode *inode = page->mapping->host; 2116 struct inode *inode = page->mapping->host;
2109 unsigned block_start, block_end, blocksize; 2117 unsigned block_start, block_end, blocksize;
2110 unsigned to; 2118 unsigned to;
2111 struct buffer_head *bh, *head; 2119 struct buffer_head *bh, *head;
2112 int ret = 1; 2120 int ret = 1;
2113 2121
2114 if (!page_has_buffers(page)) 2122 if (!page_has_buffers(page))
2115 return 0; 2123 return 0;
2116 2124
2117 blocksize = 1 << inode->i_blkbits; 2125 blocksize = 1 << inode->i_blkbits;
2118 to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count); 2126 to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
2119 to = from + to; 2127 to = from + to;
2120 if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) 2128 if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
2121 return 0; 2129 return 0;
2122 2130
2123 head = page_buffers(page); 2131 head = page_buffers(page);
2124 bh = head; 2132 bh = head;
2125 block_start = 0; 2133 block_start = 0;
2126 do { 2134 do {
2127 block_end = block_start + blocksize; 2135 block_end = block_start + blocksize;
2128 if (block_end > from && block_start < to) { 2136 if (block_end > from && block_start < to) {
2129 if (!buffer_uptodate(bh)) { 2137 if (!buffer_uptodate(bh)) {
2130 ret = 0; 2138 ret = 0;
2131 break; 2139 break;
2132 } 2140 }
2133 if (block_end >= to) 2141 if (block_end >= to)
2134 break; 2142 break;
2135 } 2143 }
2136 block_start = block_end; 2144 block_start = block_end;
2137 bh = bh->b_this_page; 2145 bh = bh->b_this_page;
2138 } while (bh != head); 2146 } while (bh != head);
2139 2147
2140 return ret; 2148 return ret;
2141 } 2149 }
2142 EXPORT_SYMBOL(block_is_partially_uptodate); 2150 EXPORT_SYMBOL(block_is_partially_uptodate);
2143 2151
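Because block_is_partially_uptodate() takes the standard (page, read_descriptor_t, offset) arguments, a filesystem can plug it straight into its address_space_operations. In the sketch below the foo_* entries are hypothetical stand-ins for the filesystem's own methods (a delegating foo_readpage is sketched after block_read_full_page below):

/* foo_readpage/foo_writepage/foo_write_begin are hypothetical callbacks */
static const struct address_space_operations foo_aops = {
	.readpage		= foo_readpage,
	.writepage		= foo_writepage,
	.write_begin		= foo_write_begin,
	.write_end		= generic_write_end,
	.is_partially_uptodate	= block_is_partially_uptodate,
};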
2144 /* 2152 /*
2145 * Generic "read page" function for block devices that have the normal 2153 * Generic "read page" function for block devices that have the normal
2146 * get_block functionality. This is most of the block device filesystems. 2154 * get_block functionality. This is most of the block device filesystems.
2147 * Reads the page asynchronously --- the unlock_buffer() and 2155 * Reads the page asynchronously --- the unlock_buffer() and
2148 * set/clear_buffer_uptodate() functions propagate buffer state into the 2156 * set/clear_buffer_uptodate() functions propagate buffer state into the
2149 * page struct once IO has completed. 2157 * page struct once IO has completed.
2150 */ 2158 */
2151 int block_read_full_page(struct page *page, get_block_t *get_block) 2159 int block_read_full_page(struct page *page, get_block_t *get_block)
2152 { 2160 {
2153 struct inode *inode = page->mapping->host; 2161 struct inode *inode = page->mapping->host;
2154 sector_t iblock, lblock; 2162 sector_t iblock, lblock;
2155 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; 2163 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2156 unsigned int blocksize; 2164 unsigned int blocksize;
2157 int nr, i; 2165 int nr, i;
2158 int fully_mapped = 1; 2166 int fully_mapped = 1;
2159 2167
2160 BUG_ON(!PageLocked(page)); 2168 BUG_ON(!PageLocked(page));
2161 blocksize = 1 << inode->i_blkbits; 2169 blocksize = 1 << inode->i_blkbits;
2162 if (!page_has_buffers(page)) 2170 if (!page_has_buffers(page))
2163 create_empty_buffers(page, blocksize, 0); 2171 create_empty_buffers(page, blocksize, 0);
2164 head = page_buffers(page); 2172 head = page_buffers(page);
2165 2173
2166 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 2174 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2167 lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits; 2175 lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2168 bh = head; 2176 bh = head;
2169 nr = 0; 2177 nr = 0;
2170 i = 0; 2178 i = 0;
2171 2179
2172 do { 2180 do {
2173 if (buffer_uptodate(bh)) 2181 if (buffer_uptodate(bh))
2174 continue; 2182 continue;
2175 2183
2176 if (!buffer_mapped(bh)) { 2184 if (!buffer_mapped(bh)) {
2177 int err = 0; 2185 int err = 0;
2178 2186
2179 fully_mapped = 0; 2187 fully_mapped = 0;
2180 if (iblock < lblock) { 2188 if (iblock < lblock) {
2181 WARN_ON(bh->b_size != blocksize); 2189 WARN_ON(bh->b_size != blocksize);
2182 err = get_block(inode, iblock, bh, 0); 2190 err = get_block(inode, iblock, bh, 0);
2183 if (err) 2191 if (err)
2184 SetPageError(page); 2192 SetPageError(page);
2185 } 2193 }
2186 if (!buffer_mapped(bh)) { 2194 if (!buffer_mapped(bh)) {
2187 zero_user(page, i * blocksize, blocksize); 2195 zero_user(page, i * blocksize, blocksize);
2188 if (!err) 2196 if (!err)
2189 set_buffer_uptodate(bh); 2197 set_buffer_uptodate(bh);
2190 continue; 2198 continue;
2191 } 2199 }
2192 /* 2200 /*
2193 * get_block() might have updated the buffer 2201 * get_block() might have updated the buffer
2194 * synchronously 2202 * synchronously
2195 */ 2203 */
2196 if (buffer_uptodate(bh)) 2204 if (buffer_uptodate(bh))
2197 continue; 2205 continue;
2198 } 2206 }
2199 arr[nr++] = bh; 2207 arr[nr++] = bh;
2200 } while (i++, iblock++, (bh = bh->b_this_page) != head); 2208 } while (i++, iblock++, (bh = bh->b_this_page) != head);
2201 2209
2202 if (fully_mapped) 2210 if (fully_mapped)
2203 SetPageMappedToDisk(page); 2211 SetPageMappedToDisk(page);
2204 2212
2205 if (!nr) { 2213 if (!nr) {
2206 /* 2214 /*
2207 * All buffers are uptodate - we can set the page uptodate 2215 * All buffers are uptodate - we can set the page uptodate
2208 * as well. But not if get_block() returned an error. 2216 * as well. But not if get_block() returned an error.
2209 */ 2217 */
2210 if (!PageError(page)) 2218 if (!PageError(page))
2211 SetPageUptodate(page); 2219 SetPageUptodate(page);
2212 unlock_page(page); 2220 unlock_page(page);
2213 return 0; 2221 return 0;
2214 } 2222 }
2215 2223
2216 /* Stage two: lock the buffers */ 2224 /* Stage two: lock the buffers */
2217 for (i = 0; i < nr; i++) { 2225 for (i = 0; i < nr; i++) {
2218 bh = arr[i]; 2226 bh = arr[i];
2219 lock_buffer(bh); 2227 lock_buffer(bh);
2220 mark_buffer_async_read(bh); 2228 mark_buffer_async_read(bh);
2221 } 2229 }
2222 2230
2223 /* 2231 /*
2224 * Stage 3: start the IO. Check for uptodateness 2232 * Stage 3: start the IO. Check for uptodateness
2225 * inside the buffer lock in case another process reading 2233 * inside the buffer lock in case another process reading
2226 * the underlying blockdev brought it uptodate (the sct fix). 2234 * the underlying blockdev brought it uptodate (the sct fix).
2227 */ 2235 */
2228 for (i = 0; i < nr; i++) { 2236 for (i = 0; i < nr; i++) {
2229 bh = arr[i]; 2237 bh = arr[i];
2230 if (buffer_uptodate(bh)) 2238 if (buffer_uptodate(bh))
2231 end_buffer_async_read(bh, 1); 2239 end_buffer_async_read(bh, 1);
2232 else 2240 else
2233 submit_bh(READ, bh); 2241 submit_bh(READ, bh);
2234 } 2242 }
2235 return 0; 2243 return 0;
2236 } 2244 }
2237 2245
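In practice block_read_full_page() serves as the body of a filesystem's ->readpage; a minimal sketch, again assuming the hypothetical foo_get_block():

/* foo_get_block() is a hypothetical get_block_t for this sketch */
static int foo_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, foo_get_block);
}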
2238 /* utility function for filesystems that need to do work on expanding 2246 /* utility function for filesystems that need to do work on expanding
2239 * truncates. Uses filesystem pagecache writes to allow the filesystem to 2247 * truncates. Uses filesystem pagecache writes to allow the filesystem to
2240 * deal with the hole. 2248 * deal with the hole.
2241 */ 2249 */
2242 int generic_cont_expand_simple(struct inode *inode, loff_t size) 2250 int generic_cont_expand_simple(struct inode *inode, loff_t size)
2243 { 2251 {
2244 struct address_space *mapping = inode->i_mapping; 2252 struct address_space *mapping = inode->i_mapping;
2245 struct page *page; 2253 struct page *page;
2246 void *fsdata; 2254 void *fsdata;
2247 unsigned long limit; 2255 unsigned long limit;
2248 int err; 2256 int err;
2249 2257
2250 err = -EFBIG; 2258 err = -EFBIG;
2251 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; 2259 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2252 if (limit != RLIM_INFINITY && size > (loff_t)limit) { 2260 if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2253 send_sig(SIGXFSZ, current, 0); 2261 send_sig(SIGXFSZ, current, 0);
2254 goto out; 2262 goto out;
2255 } 2263 }
2256 if (size > inode->i_sb->s_maxbytes) 2264 if (size > inode->i_sb->s_maxbytes)
2257 goto out; 2265 goto out;
2258 2266
2259 err = pagecache_write_begin(NULL, mapping, size, 0, 2267 err = pagecache_write_begin(NULL, mapping, size, 0,
2260 AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND, 2268 AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2261 &page, &fsdata); 2269 &page, &fsdata);
2262 if (err) 2270 if (err)
2263 goto out; 2271 goto out;
2264 2272
2265 err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata); 2273 err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2266 BUG_ON(err > 0); 2274 BUG_ON(err > 0);
2267 2275
2268 out: 2276 out:
2269 return err; 2277 return err;
2270 } 2278 }
2271 2279
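A plausible caller is a filesystem ->setattr method that must materialize the new tail before moving i_size on an expanding truncate; a sketch under that assumption, with foo_setattr hypothetical and the inode_* helpers following the 2.6.2x VFS conventions:

/* hypothetical ->setattr for a filesystem that cannot leave holes */
static int foo_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	/* write zeroes out to the new size before i_size is raised */
	if ((attr->ia_valid & ATTR_SIZE) &&
	    attr->ia_size > i_size_read(inode)) {
		error = generic_cont_expand_simple(inode, attr->ia_size);
		if (error)
			return error;
	}
	return inode_setattr(inode, attr);
}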
2272 static int cont_expand_zero(struct file *file, struct address_space *mapping, 2280 static int cont_expand_zero(struct file *file, struct address_space *mapping,
2273 loff_t pos, loff_t *bytes) 2281 loff_t pos, loff_t *bytes)
2274 { 2282 {
2275 struct inode *inode = mapping->host; 2283 struct inode *inode = mapping->host;
2276 unsigned blocksize = 1 << inode->i_blkbits; 2284 unsigned blocksize = 1 << inode->i_blkbits;
2277 struct page *page; 2285 struct page *page;
2278 void *fsdata; 2286 void *fsdata;
2279 pgoff_t index, curidx; 2287 pgoff_t index, curidx;
2280 loff_t curpos; 2288 loff_t curpos;
2281 unsigned zerofrom, offset, len; 2289 unsigned zerofrom, offset, len;
2282 int err = 0; 2290 int err = 0;
2283 2291
2284 index = pos >> PAGE_CACHE_SHIFT; 2292 index = pos >> PAGE_CACHE_SHIFT;
2285 offset = pos & ~PAGE_CACHE_MASK; 2293 offset = pos & ~PAGE_CACHE_MASK;
2286 2294
2287 while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) { 2295 while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
2288 zerofrom = curpos & ~PAGE_CACHE_MASK; 2296 zerofrom = curpos & ~PAGE_CACHE_MASK;
2289 if (zerofrom & (blocksize-1)) { 2297 if (zerofrom & (blocksize-1)) {
2290 *bytes |= (blocksize-1); 2298 *bytes |= (blocksize-1);
2291 (*bytes)++; 2299 (*bytes)++;
2292 } 2300 }
2293 len = PAGE_CACHE_SIZE - zerofrom; 2301 len = PAGE_CACHE_SIZE - zerofrom;
2294 2302
2295 err = pagecache_write_begin(file, mapping, curpos, len, 2303 err = pagecache_write_begin(file, mapping, curpos, len,
2296 AOP_FLAG_UNINTERRUPTIBLE, 2304 AOP_FLAG_UNINTERRUPTIBLE,
2297 &page, &fsdata); 2305 &page, &fsdata);
2298 if (err) 2306 if (err)
2299 goto out; 2307 goto out;
2300 zero_user(page, zerofrom, len); 2308 zero_user(page, zerofrom, len);
2301 err = pagecache_write_end(file, mapping, curpos, len, len, 2309 err = pagecache_write_end(file, mapping, curpos, len, len,
2302 page, fsdata); 2310 page, fsdata);
2303 if (err < 0) 2311 if (err < 0)
2304 goto out; 2312 goto out;
2305 BUG_ON(err != len); 2313 BUG_ON(err != len);
2306 err = 0; 2314 err = 0;
2307 2315
2308 balance_dirty_pages_ratelimited(mapping); 2316 balance_dirty_pages_ratelimited(mapping);
2309 } 2317 }
2310 2318
2311 /* page covers the boundary, find the boundary offset */ 2319 /* page covers the boundary, find the boundary offset */
2312 if (index == curidx) { 2320 if (index == curidx) {
2313 zerofrom = curpos & ~PAGE_CACHE_MASK; 2321 zerofrom = curpos & ~PAGE_CACHE_MASK;
2314 /* if we will expand the thing last block will be filled */ 2322 /* if we will expand the thing last block will be filled */
2315 if (offset <= zerofrom) { 2323 if (offset <= zerofrom) {
2316 goto out; 2324 goto out;
2317 } 2325 }
2318 if (zerofrom & (blocksize-1)) { 2326 if (zerofrom & (blocksize-1)) {
2319 *bytes |= (blocksize-1); 2327 *bytes |= (blocksize-1);
2320 (*bytes)++; 2328 (*bytes)++;
2321 } 2329 }
2322 len = offset - zerofrom; 2330 len = offset - zerofrom;
2323 2331
2324 err = pagecache_write_begin(file, mapping, curpos, len, 2332 err = pagecache_write_begin(file, mapping, curpos, len,
2325 AOP_FLAG_UNINTERRUPTIBLE, 2333 AOP_FLAG_UNINTERRUPTIBLE,
2326 &page, &fsdata); 2334 &page, &fsdata);
2327 if (err) 2335 if (err)
2328 goto out; 2336 goto out;
2329 zero_user(page, zerofrom, len); 2337 zero_user(page, zerofrom, len);
2330 err = pagecache_write_end(file, mapping, curpos, len, len, 2338 err = pagecache_write_end(file, mapping, curpos, len, len,
2331 page, fsdata); 2339 page, fsdata);
2332 if (err < 0) 2340 if (err < 0)
2333 goto out; 2341 goto out;
2334 BUG_ON(err != len); 2342 BUG_ON(err != len);
2335 err = 0; 2343 err = 0;
2336 } 2344 }
2337 out: 2345 out:
2338 return err; 2346 return err;
2339 } 2347 }
2340 2348
2341 /* 2349 /*
2342 * For moronic filesystems that do not allow holes in files. 2350 * For moronic filesystems that do not allow holes in files.
2343 * We may have to extend the file. 2351 * We may have to extend the file.
2344 */ 2352 */
2345 int cont_write_begin(struct file *file, struct address_space *mapping, 2353 int cont_write_begin(struct file *file, struct address_space *mapping,
2346 loff_t pos, unsigned len, unsigned flags, 2354 loff_t pos, unsigned len, unsigned flags,
2347 struct page **pagep, void **fsdata, 2355 struct page **pagep, void **fsdata,
2348 get_block_t *get_block, loff_t *bytes) 2356 get_block_t *get_block, loff_t *bytes)
2349 { 2357 {
2350 struct inode *inode = mapping->host; 2358 struct inode *inode = mapping->host;
2351 unsigned blocksize = 1 << inode->i_blkbits; 2359 unsigned blocksize = 1 << inode->i_blkbits;
2352 unsigned zerofrom; 2360 unsigned zerofrom;
2353 int err; 2361 int err;
2354 2362
2355 err = cont_expand_zero(file, mapping, pos, bytes); 2363 err = cont_expand_zero(file, mapping, pos, bytes);
2356 if (err) 2364 if (err)
2357 goto out; 2365 goto out;
2358 2366
2359 zerofrom = *bytes & ~PAGE_CACHE_MASK; 2367 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2360 if (pos+len > *bytes && zerofrom & (blocksize-1)) { 2368 if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2361 *bytes |= (blocksize-1); 2369 *bytes |= (blocksize-1);
2362 (*bytes)++; 2370 (*bytes)++;
2363 } 2371 }
2364 2372
2365 *pagep = NULL; 2373 *pagep = NULL;
2366 err = block_write_begin(file, mapping, pos, len, 2374 err = block_write_begin(file, mapping, pos, len,
2367 flags, pagep, fsdata, get_block); 2375 flags, pagep, fsdata, get_block);
2368 out: 2376 out:
2369 return err; 2377 return err;
2370 } 2378 }
2371 2379
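A filesystem in that situation keeps a per-inode count of how many bytes have been materialized on disk and hands its address to cont_write_begin(); a sketch, with foo_inode_info, FOO_I() and foo_get_block() all hypothetical:

/* hypothetical per-inode bookkeeping for a hole-less filesystem */
struct foo_inode_info {
	loff_t		mmu_private;	/* bytes initialised on disk */
	struct inode	vfs_inode;
};

static inline struct foo_inode_info *FOO_I(struct inode *inode)
{
	return container_of(inode, struct foo_inode_info, vfs_inode);
}

static int foo_cont_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	*pagep = NULL;
	return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
				foo_get_block, &FOO_I(mapping->host)->mmu_private);
}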
2372 int block_prepare_write(struct page *page, unsigned from, unsigned to, 2380 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2373 get_block_t *get_block) 2381 get_block_t *get_block)
2374 { 2382 {
2375 struct inode *inode = page->mapping->host; 2383 struct inode *inode = page->mapping->host;
2376 int err = __block_prepare_write(inode, page, from, to, get_block); 2384 int err = __block_prepare_write(inode, page, from, to, get_block);
2377 if (err) 2385 if (err)
2378 ClearPageUptodate(page); 2386 ClearPageUptodate(page);
2379 return err; 2387 return err;
2380 } 2388 }
2381 2389
2382 int block_commit_write(struct page *page, unsigned from, unsigned to) 2390 int block_commit_write(struct page *page, unsigned from, unsigned to)
2383 { 2391 {
2384 struct inode *inode = page->mapping->host; 2392 struct inode *inode = page->mapping->host;
2385 __block_commit_write(inode,page,from,to); 2393 __block_commit_write(inode,page,from,to);
2386 return 0; 2394 return 0;
2387 } 2395 }
2388 2396
2389 /* 2397 /*
2390 * block_page_mkwrite() is not allowed to change the file size as it gets 2398 * block_page_mkwrite() is not allowed to change the file size as it gets
2391 * called from a page fault handler when a page is first dirtied. Hence we must 2399 * called from a page fault handler when a page is first dirtied. Hence we must
2392 * be careful to check for EOF conditions here. We set the page up correctly 2400 * be careful to check for EOF conditions here. We set the page up correctly
2393 * for a written page which means we get ENOSPC checking when writing into 2401 * for a written page which means we get ENOSPC checking when writing into
2394 * holes and correct delalloc and unwritten extent mapping on filesystems that 2402 * holes and correct delalloc and unwritten extent mapping on filesystems that
2395 * support these features. 2403 * support these features.
2396 * 2404 *
2397 * We are not allowed to take the i_mutex here so we have to play games to 2405 * We are not allowed to take the i_mutex here so we have to play games to
2398 * protect against truncate races as the page could now be beyond EOF. Because 2406 * protect against truncate races as the page could now be beyond EOF. Because
2399 * vmtruncate() writes the inode size before removing pages, once we have the 2407 * vmtruncate() writes the inode size before removing pages, once we have the
2400 * page lock we can determine safely if the page is beyond EOF. If it is not 2408 * page lock we can determine safely if the page is beyond EOF. If it is not
2401 * beyond EOF, then the page is guaranteed safe against truncation until we 2409 * beyond EOF, then the page is guaranteed safe against truncation until we
2402 * unlock the page. 2410 * unlock the page.
2403 */ 2411 */
2404 int 2412 int
2405 block_page_mkwrite(struct vm_area_struct *vma, struct page *page, 2413 block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
2406 get_block_t get_block) 2414 get_block_t get_block)
2407 { 2415 {
2408 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 2416 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
2409 unsigned long end; 2417 unsigned long end;
2410 loff_t size; 2418 loff_t size;
2411 int ret = -EINVAL; 2419 int ret = -EINVAL;
2412 2420
2413 lock_page(page); 2421 lock_page(page);
2414 size = i_size_read(inode); 2422 size = i_size_read(inode);
2415 if ((page->mapping != inode->i_mapping) || 2423 if ((page->mapping != inode->i_mapping) ||
2416 (page_offset(page) > size)) { 2424 (page_offset(page) > size)) {
2417 /* page got truncated out from underneath us */ 2425 /* page got truncated out from underneath us */
2418 goto out_unlock; 2426 goto out_unlock;
2419 } 2427 }
2420 2428
2421 /* page is wholly or partially inside EOF */ 2429 /* page is wholly or partially inside EOF */
2422 if (((page->index + 1) << PAGE_CACHE_SHIFT) > size) 2430 if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
2423 end = size & ~PAGE_CACHE_MASK; 2431 end = size & ~PAGE_CACHE_MASK;
2424 else 2432 else
2425 end = PAGE_CACHE_SIZE; 2433 end = PAGE_CACHE_SIZE;
2426 2434
2427 ret = block_prepare_write(page, 0, end, get_block); 2435 ret = block_prepare_write(page, 0, end, get_block);
2428 if (!ret) 2436 if (!ret)
2429 ret = block_commit_write(page, 0, end); 2437 ret = block_commit_write(page, 0, end);
2430 2438
2431 out_unlock: 2439 out_unlock:
2432 unlock_page(page); 2440 unlock_page(page);
2433 return ret; 2441 return ret;
2434 } 2442 }
2435 2443
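The expected caller is a filesystem's mmap path: the ->page_mkwrite handler of its vm_operations passes the faulting page here together with its get_block callback. A sketch under the 2.6.2x vm_operations layout, with foo_get_block() hypothetical:

/* hypothetical wiring of block_page_mkwrite() into a file's vm_ops */
static int foo_page_mkwrite(struct vm_area_struct *vma, struct page *page)
{
	return block_page_mkwrite(vma, page, foo_get_block);
}

static struct vm_operations_struct foo_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= foo_page_mkwrite,
};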
2436 /* 2444 /*
2437 * nobh_write_begin()'s prereads are special: the buffer_heads are freed 2445 * nobh_write_begin()'s prereads are special: the buffer_heads are freed
2438 * immediately, while under the page lock. So it needs a special end_io 2446 * immediately, while under the page lock. So it needs a special end_io
2439 * handler which does not touch the bh after unlocking it. 2447 * handler which does not touch the bh after unlocking it.
2440 */ 2448 */
2441 static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) 2449 static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2442 { 2450 {
2443 __end_buffer_read_notouch(bh, uptodate); 2451 __end_buffer_read_notouch(bh, uptodate);
2444 } 2452 }
2445 2453
2446 /* 2454 /*
2447 * Attach the singly-linked list of buffers created by nobh_write_begin, to 2455 * Attach the singly-linked list of buffers created by nobh_write_begin, to
2448 * the page (converting it to circular linked list and taking care of page 2456 * the page (converting it to circular linked list and taking care of page
2449 * dirty races). 2457 * dirty races).
2450 */ 2458 */
2451 static void attach_nobh_buffers(struct page *page, struct buffer_head *head) 2459 static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2452 { 2460 {
2453 struct buffer_head *bh; 2461 struct buffer_head *bh;
2454 2462
2455 BUG_ON(!PageLocked(page)); 2463 BUG_ON(!PageLocked(page));
2456 2464
2457 spin_lock(&page->mapping->private_lock); 2465 spin_lock(&page->mapping->private_lock);
2458 bh = head; 2466 bh = head;
2459 do { 2467 do {
2460 if (PageDirty(page)) 2468 if (PageDirty(page))
2461 set_buffer_dirty(bh); 2469 set_buffer_dirty(bh);
2462 if (!bh->b_this_page) 2470 if (!bh->b_this_page)
2463 bh->b_this_page = head; 2471 bh->b_this_page = head;
2464 bh = bh->b_this_page; 2472 bh = bh->b_this_page;
2465 } while (bh != head); 2473 } while (bh != head);
2466 attach_page_buffers(page, head); 2474 attach_page_buffers(page, head);
2467 spin_unlock(&page->mapping->private_lock); 2475 spin_unlock(&page->mapping->private_lock);
2468 } 2476 }
2469 2477
2470 /* 2478 /*
2471 * On entry, the page is fully not uptodate. 2479 * On entry, the page is fully not uptodate.
2472 * On exit the page is fully uptodate in the areas outside (from,to) 2480 * On exit the page is fully uptodate in the areas outside (from,to)
2473 */ 2481 */
2474 int nobh_write_begin(struct file *file, struct address_space *mapping, 2482 int nobh_write_begin(struct file *file, struct address_space *mapping,
2475 loff_t pos, unsigned len, unsigned flags, 2483 loff_t pos, unsigned len, unsigned flags,
2476 struct page **pagep, void **fsdata, 2484 struct page **pagep, void **fsdata,
2477 get_block_t *get_block) 2485 get_block_t *get_block)
2478 { 2486 {
2479 struct inode *inode = mapping->host; 2487 struct inode *inode = mapping->host;
2480 const unsigned blkbits = inode->i_blkbits; 2488 const unsigned blkbits = inode->i_blkbits;
2481 const unsigned blocksize = 1 << blkbits; 2489 const unsigned blocksize = 1 << blkbits;
2482 struct buffer_head *head, *bh; 2490 struct buffer_head *head, *bh;
2483 struct page *page; 2491 struct page *page;
2484 pgoff_t index; 2492 pgoff_t index;
2485 unsigned from, to; 2493 unsigned from, to;
2486 unsigned block_in_page; 2494 unsigned block_in_page;
2487 unsigned block_start, block_end; 2495 unsigned block_start, block_end;
2488 sector_t block_in_file; 2496 sector_t block_in_file;
2489 int nr_reads = 0; 2497 int nr_reads = 0;
2490 int ret = 0; 2498 int ret = 0;
2491 int is_mapped_to_disk = 1; 2499 int is_mapped_to_disk = 1;
2492 2500
2493 index = pos >> PAGE_CACHE_SHIFT; 2501 index = pos >> PAGE_CACHE_SHIFT;
2494 from = pos & (PAGE_CACHE_SIZE - 1); 2502 from = pos & (PAGE_CACHE_SIZE - 1);
2495 to = from + len; 2503 to = from + len;
2496 2504
2497 page = __grab_cache_page(mapping, index); 2505 page = __grab_cache_page(mapping, index);
2498 if (!page) 2506 if (!page)
2499 return -ENOMEM; 2507 return -ENOMEM;
2500 *pagep = page; 2508 *pagep = page;
2501 *fsdata = NULL; 2509 *fsdata = NULL;
2502 2510
2503 if (page_has_buffers(page)) { 2511 if (page_has_buffers(page)) {
2504 unlock_page(page); 2512 unlock_page(page);
2505 page_cache_release(page); 2513 page_cache_release(page);
2506 *pagep = NULL; 2514 *pagep = NULL;
2507 return block_write_begin(file, mapping, pos, len, flags, pagep, 2515 return block_write_begin(file, mapping, pos, len, flags, pagep,
2508 fsdata, get_block); 2516 fsdata, get_block);
2509 } 2517 }
2510 2518
2511 if (PageMappedToDisk(page)) 2519 if (PageMappedToDisk(page))
2512 return 0; 2520 return 0;
2513 2521
2514 /* 2522 /*
2515 * Allocate buffers so that we can keep track of state, and potentially 2523 * Allocate buffers so that we can keep track of state, and potentially
2516 * attach them to the page if an error occurs. In the common case of 2524 * attach them to the page if an error occurs. In the common case of
2517 * no error, they will just be freed again without ever being attached 2525 * no error, they will just be freed again without ever being attached
2518 * to the page (which is all OK, because we're under the page lock). 2526 * to the page (which is all OK, because we're under the page lock).
2519 * 2527 *
2520 * Be careful: the buffer linked list is a NULL terminated one, rather 2528 * Be careful: the buffer linked list is a NULL terminated one, rather
2521 * than the circular one we're used to. 2529 * than the circular one we're used to.
2522 */ 2530 */
2523 head = alloc_page_buffers(page, blocksize, 0); 2531 head = alloc_page_buffers(page, blocksize, 0);
2524 if (!head) { 2532 if (!head) {
2525 ret = -ENOMEM; 2533 ret = -ENOMEM;
2526 goto out_release; 2534 goto out_release;
2527 } 2535 }
2528 2536
2529 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); 2537 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2530 2538
2531 /* 2539 /*
2532 * We loop across all blocks in the page, whether or not they are 2540 * We loop across all blocks in the page, whether or not they are
2533 * part of the affected region. This is so we can discover if the 2541 * part of the affected region. This is so we can discover if the
2534 * page is fully mapped-to-disk. 2542 * page is fully mapped-to-disk.
2535 */ 2543 */
2536 for (block_start = 0, block_in_page = 0, bh = head; 2544 for (block_start = 0, block_in_page = 0, bh = head;
2537 block_start < PAGE_CACHE_SIZE; 2545 block_start < PAGE_CACHE_SIZE;
2538 block_in_page++, block_start += blocksize, bh = bh->b_this_page) { 2546 block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2539 int create; 2547 int create;
2540 2548
2541 block_end = block_start + blocksize; 2549 block_end = block_start + blocksize;
2542 bh->b_state = 0; 2550 bh->b_state = 0;
2543 create = 1; 2551 create = 1;
2544 if (block_start >= to) 2552 if (block_start >= to)
2545 create = 0; 2553 create = 0;
2546 ret = get_block(inode, block_in_file + block_in_page, 2554 ret = get_block(inode, block_in_file + block_in_page,
2547 bh, create); 2555 bh, create);
2548 if (ret) 2556 if (ret)
2549 goto failed; 2557 goto failed;
2550 if (!buffer_mapped(bh)) 2558 if (!buffer_mapped(bh))
2551 is_mapped_to_disk = 0; 2559 is_mapped_to_disk = 0;
2552 if (buffer_new(bh)) 2560 if (buffer_new(bh))
2553 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); 2561 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2554 if (PageUptodate(page)) { 2562 if (PageUptodate(page)) {
2555 set_buffer_uptodate(bh); 2563 set_buffer_uptodate(bh);
2556 continue; 2564 continue;
2557 } 2565 }
2558 if (buffer_new(bh) || !buffer_mapped(bh)) { 2566 if (buffer_new(bh) || !buffer_mapped(bh)) {
2559 zero_user_segments(page, block_start, from, 2567 zero_user_segments(page, block_start, from,
2560 to, block_end); 2568 to, block_end);
2561 continue; 2569 continue;
2562 } 2570 }
2563 if (buffer_uptodate(bh)) 2571 if (buffer_uptodate(bh))
2564 continue; /* reiserfs does this */ 2572 continue; /* reiserfs does this */
2565 if (block_start < from || block_end > to) { 2573 if (block_start < from || block_end > to) {
2566 lock_buffer(bh); 2574 lock_buffer(bh);
2567 bh->b_end_io = end_buffer_read_nobh; 2575 bh->b_end_io = end_buffer_read_nobh;
2568 submit_bh(READ, bh); 2576 submit_bh(READ, bh);
2569 nr_reads++; 2577 nr_reads++;
2570 } 2578 }
2571 } 2579 }
2572 2580
2573 if (nr_reads) { 2581 if (nr_reads) {
2574 /* 2582 /*
2575 * The page is locked, so these buffers are protected from 2583 * The page is locked, so these buffers are protected from
2576 * any VM or truncate activity. Hence we don't need to care 2584 * any VM or truncate activity. Hence we don't need to care
2577 * for the buffer_head refcounts. 2585 * for the buffer_head refcounts.
2578 */ 2586 */
2579 for (bh = head; bh; bh = bh->b_this_page) { 2587 for (bh = head; bh; bh = bh->b_this_page) {
2580 wait_on_buffer(bh); 2588 wait_on_buffer(bh);
2581 if (!buffer_uptodate(bh)) 2589 if (!buffer_uptodate(bh))
2582 ret = -EIO; 2590 ret = -EIO;
2583 } 2591 }
2584 if (ret) 2592 if (ret)
2585 goto failed; 2593 goto failed;
2586 } 2594 }
2587 2595
2588 if (is_mapped_to_disk) 2596 if (is_mapped_to_disk)
2589 SetPageMappedToDisk(page); 2597 SetPageMappedToDisk(page);
2590 2598
2591 *fsdata = head; /* to be released by nobh_write_end */ 2599 *fsdata = head; /* to be released by nobh_write_end */
2592 2600
2593 return 0; 2601 return 0;
2594 2602
2595 failed: 2603 failed:
2596 BUG_ON(!ret); 2604 BUG_ON(!ret);
2597 /* 2605 /*
2598 * Error recovery is a bit difficult. We need to zero out blocks that 2606 * Error recovery is a bit difficult. We need to zero out blocks that
2599 * were newly allocated, and dirty them to ensure they get written out. 2607 * were newly allocated, and dirty them to ensure they get written out.
2600 * Buffers need to be attached to the page at this point, otherwise 2608 * Buffers need to be attached to the page at this point, otherwise
2601 * the handling of potential IO errors during writeout would be hard 2609 * the handling of potential IO errors during writeout would be hard
2602 * (could try doing synchronous writeout, but what if that fails too?) 2610 * (could try doing synchronous writeout, but what if that fails too?)
2603 */ 2611 */
2604 attach_nobh_buffers(page, head); 2612 attach_nobh_buffers(page, head);
2605 page_zero_new_buffers(page, from, to); 2613 page_zero_new_buffers(page, from, to);
2606 2614
2607 out_release: 2615 out_release:
2608 unlock_page(page); 2616 unlock_page(page);
2609 page_cache_release(page); 2617 page_cache_release(page);
2610 *pagep = NULL; 2618 *pagep = NULL;
2611 2619
2612 if (pos + len > inode->i_size) 2620 if (pos + len > inode->i_size)
2613 vmtruncate(inode, inode->i_size); 2621 vmtruncate(inode, inode->i_size);
2614 2622
2615 return ret; 2623 return ret;
2616 } 2624 }
2617 EXPORT_SYMBOL(nobh_write_begin); 2625 EXPORT_SYMBOL(nobh_write_begin);
2618 2626
2619 int nobh_write_end(struct file *file, struct address_space *mapping, 2627 int nobh_write_end(struct file *file, struct address_space *mapping,
2620 loff_t pos, unsigned len, unsigned copied, 2628 loff_t pos, unsigned len, unsigned copied,
2621 struct page *page, void *fsdata) 2629 struct page *page, void *fsdata)
2622 { 2630 {
2623 struct inode *inode = page->mapping->host; 2631 struct inode *inode = page->mapping->host;
2624 struct buffer_head *head = fsdata; 2632 struct buffer_head *head = fsdata;
2625 struct buffer_head *bh; 2633 struct buffer_head *bh;
2626 BUG_ON(fsdata != NULL && page_has_buffers(page)); 2634 BUG_ON(fsdata != NULL && page_has_buffers(page));
2627 2635
2628 if (unlikely(copied < len) && !page_has_buffers(page)) 2636 if (unlikely(copied < len) && !page_has_buffers(page))
2629 attach_nobh_buffers(page, head); 2637 attach_nobh_buffers(page, head);
2630 if (page_has_buffers(page)) 2638 if (page_has_buffers(page))
2631 return generic_write_end(file, mapping, pos, len, 2639 return generic_write_end(file, mapping, pos, len,
2632 copied, page, fsdata); 2640 copied, page, fsdata);
2633 2641
2634 SetPageUptodate(page); 2642 SetPageUptodate(page);
2635 set_page_dirty(page); 2643 set_page_dirty(page);
2636 if (pos+copied > inode->i_size) { 2644 if (pos+copied > inode->i_size) {
2637 i_size_write(inode, pos+copied); 2645 i_size_write(inode, pos+copied);
2638 mark_inode_dirty(inode); 2646 mark_inode_dirty(inode);
2639 } 2647 }
2640 2648
2641 unlock_page(page); 2649 unlock_page(page);
2642 page_cache_release(page); 2650 page_cache_release(page);
2643 2651
2644 while (head) { 2652 while (head) {
2645 bh = head; 2653 bh = head;
2646 head = head->b_this_page; 2654 head = head->b_this_page;
2647 free_buffer_head(bh); 2655 free_buffer_head(bh);
2648 } 2656 }
2649 2657
2650 return copied; 2658 return copied;
2651 } 2659 }
2652 EXPORT_SYMBOL(nobh_write_end); 2660 EXPORT_SYMBOL(nobh_write_end);
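
A minimal sketch (editor's illustration, not part of this patch) of how a filesystem typically wires the nobh helpers above into its address_space_operations. The myfs_ names and myfs_get_block are hypothetical; the signatures follow the definitions shown in this hunk.

#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>

/* hypothetical filesystem block-mapping callback (a get_block_t) */
extern int myfs_get_block(struct inode *inode, sector_t iblock,
			  struct buffer_head *bh_result, int create);

static int myfs_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len, unsigned flags,
			    struct page **pagep, void **fsdata)
{
	/* delegate the buffer-less write preparation to the generic helper */
	return nobh_write_begin(file, mapping, pos, len, flags, pagep,
				fsdata, myfs_get_block);
}

static int myfs_writepage(struct page *page, struct writeback_control *wbc)
{
	return nobh_writepage(page, myfs_get_block, wbc);
}

static const struct address_space_operations myfs_aops = {
	.write_begin	= myfs_write_begin,
	.write_end	= nobh_write_end,	/* matches the ->write_end signature directly */
	.writepage	= myfs_writepage,
};
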
2653 2661
2654 /* 2662 /*
2655 * nobh_writepage() - based on block_write_full_page() except 2663 * nobh_writepage() - based on block_write_full_page() except
2656 * that it tries to operate without attaching bufferheads to 2664 * that it tries to operate without attaching bufferheads to
2657 * the page. 2665 * the page.
2658 */ 2666 */
2659 int nobh_writepage(struct page *page, get_block_t *get_block, 2667 int nobh_writepage(struct page *page, get_block_t *get_block,
2660 struct writeback_control *wbc) 2668 struct writeback_control *wbc)
2661 { 2669 {
2662 struct inode * const inode = page->mapping->host; 2670 struct inode * const inode = page->mapping->host;
2663 loff_t i_size = i_size_read(inode); 2671 loff_t i_size = i_size_read(inode);
2664 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; 2672 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2665 unsigned offset; 2673 unsigned offset;
2666 int ret; 2674 int ret;
2667 2675
2668 /* Is the page fully inside i_size? */ 2676 /* Is the page fully inside i_size? */
2669 if (page->index < end_index) 2677 if (page->index < end_index)
2670 goto out; 2678 goto out;
2671 2679
2672 /* Is the page fully outside i_size? (truncate in progress) */ 2680 /* Is the page fully outside i_size? (truncate in progress) */
2673 offset = i_size & (PAGE_CACHE_SIZE-1); 2681 offset = i_size & (PAGE_CACHE_SIZE-1);
2674 if (page->index >= end_index+1 || !offset) { 2682 if (page->index >= end_index+1 || !offset) {
2675 /* 2683 /*
2676 * The page may have dirty, unmapped buffers. For example, 2684 * The page may have dirty, unmapped buffers. For example,
2677 * they may have been added in ext3_writepage(). Make them 2685 * they may have been added in ext3_writepage(). Make them
2678 * freeable here, so the page does not leak. 2686 * freeable here, so the page does not leak.
2679 */ 2687 */
2680 #if 0 2688 #if 0
2681 /* Not really sure about this - do we need this ? */ 2689 /* Not really sure about this - do we need this ? */
2682 if (page->mapping->a_ops->invalidatepage) 2690 if (page->mapping->a_ops->invalidatepage)
2683 page->mapping->a_ops->invalidatepage(page, offset); 2691 page->mapping->a_ops->invalidatepage(page, offset);
2684 #endif 2692 #endif
2685 unlock_page(page); 2693 unlock_page(page);
2686 return 0; /* don't care */ 2694 return 0; /* don't care */
2687 } 2695 }
2688 2696
2689 /* 2697 /*
2690 * The page straddles i_size. It must be zeroed out on each and every 2698 * The page straddles i_size. It must be zeroed out on each and every
2691 * writepage invocation because it may be mmapped. "A file is mapped 2699 * writepage invocation because it may be mmapped. "A file is mapped
2692 * in multiples of the page size. For a file that is not a multiple of 2700 * in multiples of the page size. For a file that is not a multiple of
2693 * the page size, the remaining memory is zeroed when mapped, and 2701 * the page size, the remaining memory is zeroed when mapped, and
2694 * writes to that region are not written out to the file." 2702 * writes to that region are not written out to the file."
2695 */ 2703 */
2696 zero_user_segment(page, offset, PAGE_CACHE_SIZE); 2704 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2697 out: 2705 out:
2698 ret = mpage_writepage(page, get_block, wbc); 2706 ret = mpage_writepage(page, get_block, wbc);
2699 if (ret == -EAGAIN) 2707 if (ret == -EAGAIN)
2700 ret = __block_write_full_page(inode, page, get_block, wbc); 2708 ret = __block_write_full_page(inode, page, get_block, wbc);
2701 return ret; 2709 return ret;
2702 } 2710 }
2703 EXPORT_SYMBOL(nobh_writepage); 2711 EXPORT_SYMBOL(nobh_writepage);
2704 2712
2705 int nobh_truncate_page(struct address_space *mapping, 2713 int nobh_truncate_page(struct address_space *mapping,
2706 loff_t from, get_block_t *get_block) 2714 loff_t from, get_block_t *get_block)
2707 { 2715 {
2708 pgoff_t index = from >> PAGE_CACHE_SHIFT; 2716 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2709 unsigned offset = from & (PAGE_CACHE_SIZE-1); 2717 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2710 unsigned blocksize; 2718 unsigned blocksize;
2711 sector_t iblock; 2719 sector_t iblock;
2712 unsigned length, pos; 2720 unsigned length, pos;
2713 struct inode *inode = mapping->host; 2721 struct inode *inode = mapping->host;
2714 struct page *page; 2722 struct page *page;
2715 struct buffer_head map_bh; 2723 struct buffer_head map_bh;
2716 int err; 2724 int err;
2717 2725
2718 blocksize = 1 << inode->i_blkbits; 2726 blocksize = 1 << inode->i_blkbits;
2719 length = offset & (blocksize - 1); 2727 length = offset & (blocksize - 1);
2720 2728
2721 /* Block boundary? Nothing to do */ 2729 /* Block boundary? Nothing to do */
2722 if (!length) 2730 if (!length)
2723 return 0; 2731 return 0;
2724 2732
2725 length = blocksize - length; 2733 length = blocksize - length;
2726 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 2734 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2727 2735
2728 page = grab_cache_page(mapping, index); 2736 page = grab_cache_page(mapping, index);
2729 err = -ENOMEM; 2737 err = -ENOMEM;
2730 if (!page) 2738 if (!page)
2731 goto out; 2739 goto out;
2732 2740
2733 if (page_has_buffers(page)) { 2741 if (page_has_buffers(page)) {
2734 has_buffers: 2742 has_buffers:
2735 unlock_page(page); 2743 unlock_page(page);
2736 page_cache_release(page); 2744 page_cache_release(page);
2737 return block_truncate_page(mapping, from, get_block); 2745 return block_truncate_page(mapping, from, get_block);
2738 } 2746 }
2739 2747
2740 /* Find the buffer that contains "offset" */ 2748 /* Find the buffer that contains "offset" */
2741 pos = blocksize; 2749 pos = blocksize;
2742 while (offset >= pos) { 2750 while (offset >= pos) {
2743 iblock++; 2751 iblock++;
2744 pos += blocksize; 2752 pos += blocksize;
2745 } 2753 }
2746 2754
2747 err = get_block(inode, iblock, &map_bh, 0); 2755 err = get_block(inode, iblock, &map_bh, 0);
2748 if (err) 2756 if (err)
2749 goto unlock; 2757 goto unlock;
2750 /* unmapped? It's a hole - nothing to do */ 2758 /* unmapped? It's a hole - nothing to do */
2751 if (!buffer_mapped(&map_bh)) 2759 if (!buffer_mapped(&map_bh))
2752 goto unlock; 2760 goto unlock;
2753 2761
2754 /* Ok, it's mapped. Make sure it's up-to-date */ 2762 /* Ok, it's mapped. Make sure it's up-to-date */
2755 if (!PageUptodate(page)) { 2763 if (!PageUptodate(page)) {
2756 err = mapping->a_ops->readpage(NULL, page); 2764 err = mapping->a_ops->readpage(NULL, page);
2757 if (err) { 2765 if (err) {
2758 page_cache_release(page); 2766 page_cache_release(page);
2759 goto out; 2767 goto out;
2760 } 2768 }
2761 lock_page(page); 2769 lock_page(page);
2762 if (!PageUptodate(page)) { 2770 if (!PageUptodate(page)) {
2763 err = -EIO; 2771 err = -EIO;
2764 goto unlock; 2772 goto unlock;
2765 } 2773 }
2766 if (page_has_buffers(page)) 2774 if (page_has_buffers(page))
2767 goto has_buffers; 2775 goto has_buffers;
2768 } 2776 }
2769 zero_user(page, offset, length); 2777 zero_user(page, offset, length);
2770 set_page_dirty(page); 2778 set_page_dirty(page);
2771 err = 0; 2779 err = 0;
2772 2780
2773 unlock: 2781 unlock:
2774 unlock_page(page); 2782 unlock_page(page);
2775 page_cache_release(page); 2783 page_cache_release(page);
2776 out: 2784 out:
2777 return err; 2785 return err;
2778 } 2786 }
2779 EXPORT_SYMBOL(nobh_truncate_page); 2787 EXPORT_SYMBOL(nobh_truncate_page);
2780 2788
2781 int block_truncate_page(struct address_space *mapping, 2789 int block_truncate_page(struct address_space *mapping,
2782 loff_t from, get_block_t *get_block) 2790 loff_t from, get_block_t *get_block)
2783 { 2791 {
2784 pgoff_t index = from >> PAGE_CACHE_SHIFT; 2792 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2785 unsigned offset = from & (PAGE_CACHE_SIZE-1); 2793 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2786 unsigned blocksize; 2794 unsigned blocksize;
2787 sector_t iblock; 2795 sector_t iblock;
2788 unsigned length, pos; 2796 unsigned length, pos;
2789 struct inode *inode = mapping->host; 2797 struct inode *inode = mapping->host;
2790 struct page *page; 2798 struct page *page;
2791 struct buffer_head *bh; 2799 struct buffer_head *bh;
2792 int err; 2800 int err;
2793 2801
2794 blocksize = 1 << inode->i_blkbits; 2802 blocksize = 1 << inode->i_blkbits;
2795 length = offset & (blocksize - 1); 2803 length = offset & (blocksize - 1);
2796 2804
2797 /* Block boundary? Nothing to do */ 2805 /* Block boundary? Nothing to do */
2798 if (!length) 2806 if (!length)
2799 return 0; 2807 return 0;
2800 2808
2801 length = blocksize - length; 2809 length = blocksize - length;
2802 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 2810 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2803 2811
2804 page = grab_cache_page(mapping, index); 2812 page = grab_cache_page(mapping, index);
2805 err = -ENOMEM; 2813 err = -ENOMEM;
2806 if (!page) 2814 if (!page)
2807 goto out; 2815 goto out;
2808 2816
2809 if (!page_has_buffers(page)) 2817 if (!page_has_buffers(page))
2810 create_empty_buffers(page, blocksize, 0); 2818 create_empty_buffers(page, blocksize, 0);
2811 2819
2812 /* Find the buffer that contains "offset" */ 2820 /* Find the buffer that contains "offset" */
2813 bh = page_buffers(page); 2821 bh = page_buffers(page);
2814 pos = blocksize; 2822 pos = blocksize;
2815 while (offset >= pos) { 2823 while (offset >= pos) {
2816 bh = bh->b_this_page; 2824 bh = bh->b_this_page;
2817 iblock++; 2825 iblock++;
2818 pos += blocksize; 2826 pos += blocksize;
2819 } 2827 }
2820 2828
2821 err = 0; 2829 err = 0;
2822 if (!buffer_mapped(bh)) { 2830 if (!buffer_mapped(bh)) {
2823 WARN_ON(bh->b_size != blocksize); 2831 WARN_ON(bh->b_size != blocksize);
2824 err = get_block(inode, iblock, bh, 0); 2832 err = get_block(inode, iblock, bh, 0);
2825 if (err) 2833 if (err)
2826 goto unlock; 2834 goto unlock;
2827 /* unmapped? It's a hole - nothing to do */ 2835 /* unmapped? It's a hole - nothing to do */
2828 if (!buffer_mapped(bh)) 2836 if (!buffer_mapped(bh))
2829 goto unlock; 2837 goto unlock;
2830 } 2838 }
2831 2839
2832 /* Ok, it's mapped. Make sure it's up-to-date */ 2840 /* Ok, it's mapped. Make sure it's up-to-date */
2833 if (PageUptodate(page)) 2841 if (PageUptodate(page))
2834 set_buffer_uptodate(bh); 2842 set_buffer_uptodate(bh);
2835 2843
2836 if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { 2844 if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2837 err = -EIO; 2845 err = -EIO;
2838 ll_rw_block(READ, 1, &bh); 2846 ll_rw_block(READ, 1, &bh);
2839 wait_on_buffer(bh); 2847 wait_on_buffer(bh);
2840 /* Uhhuh. Read error. Complain and punt. */ 2848 /* Uhhuh. Read error. Complain and punt. */
2841 if (!buffer_uptodate(bh)) 2849 if (!buffer_uptodate(bh))
2842 goto unlock; 2850 goto unlock;
2843 } 2851 }
2844 2852
2845 zero_user(page, offset, length); 2853 zero_user(page, offset, length);
2846 mark_buffer_dirty(bh); 2854 mark_buffer_dirty(bh);
2847 err = 0; 2855 err = 0;
2848 2856
2849 unlock: 2857 unlock:
2850 unlock_page(page); 2858 unlock_page(page);
2851 page_cache_release(page); 2859 page_cache_release(page);
2852 out: 2860 out:
2853 return err; 2861 return err;
2854 } 2862 }
2855 2863
2856 /* 2864 /*
2857 * The generic ->writepage function for buffer-backed address_spaces 2865 * The generic ->writepage function for buffer-backed address_spaces
2858 */ 2866 */
2859 int block_write_full_page(struct page *page, get_block_t *get_block, 2867 int block_write_full_page(struct page *page, get_block_t *get_block,
2860 struct writeback_control *wbc) 2868 struct writeback_control *wbc)
2861 { 2869 {
2862 struct inode * const inode = page->mapping->host; 2870 struct inode * const inode = page->mapping->host;
2863 loff_t i_size = i_size_read(inode); 2871 loff_t i_size = i_size_read(inode);
2864 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; 2872 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2865 unsigned offset; 2873 unsigned offset;
2866 2874
2867 /* Is the page fully inside i_size? */ 2875 /* Is the page fully inside i_size? */
2868 if (page->index < end_index) 2876 if (page->index < end_index)
2869 return __block_write_full_page(inode, page, get_block, wbc); 2877 return __block_write_full_page(inode, page, get_block, wbc);
2870 2878
2871 /* Is the page fully outside i_size? (truncate in progress) */ 2879 /* Is the page fully outside i_size? (truncate in progress) */
2872 offset = i_size & (PAGE_CACHE_SIZE-1); 2880 offset = i_size & (PAGE_CACHE_SIZE-1);
2873 if (page->index >= end_index+1 || !offset) { 2881 if (page->index >= end_index+1 || !offset) {
2874 /* 2882 /*
2875 * The page may have dirty, unmapped buffers. For example, 2883 * The page may have dirty, unmapped buffers. For example,
2876 * they may have been added in ext3_writepage(). Make them 2884 * they may have been added in ext3_writepage(). Make them
2877 * freeable here, so the page does not leak. 2885 * freeable here, so the page does not leak.
2878 */ 2886 */
2879 do_invalidatepage(page, 0); 2887 do_invalidatepage(page, 0);
2880 unlock_page(page); 2888 unlock_page(page);
2881 return 0; /* don't care */ 2889 return 0; /* don't care */
2882 } 2890 }
2883 2891
2884 /* 2892 /*
2885 * The page straddles i_size. It must be zeroed out on each and every 2893 * The page straddles i_size. It must be zeroed out on each and every
2886 * writepage invocation because it may be mmapped. "A file is mapped 2894 * writepage invocation because it may be mmapped. "A file is mapped
2887 * in multiples of the page size. For a file that is not a multiple of 2895 * in multiples of the page size. For a file that is not a multiple of
2888 * the page size, the remaining memory is zeroed when mapped, and 2896 * the page size, the remaining memory is zeroed when mapped, and
2889 * writes to that region are not written out to the file." 2897 * writes to that region are not written out to the file."
2890 */ 2898 */
2891 zero_user_segment(page, offset, PAGE_CACHE_SIZE); 2899 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2892 return __block_write_full_page(inode, page, get_block, wbc); 2900 return __block_write_full_page(inode, page, get_block, wbc);
2893 } 2901 }
2894 2902
2895 sector_t generic_block_bmap(struct address_space *mapping, sector_t block, 2903 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2896 get_block_t *get_block) 2904 get_block_t *get_block)
2897 { 2905 {
2898 struct buffer_head tmp; 2906 struct buffer_head tmp;
2899 struct inode *inode = mapping->host; 2907 struct inode *inode = mapping->host;
2900 tmp.b_state = 0; 2908 tmp.b_state = 0;
2901 tmp.b_blocknr = 0; 2909 tmp.b_blocknr = 0;
2902 tmp.b_size = 1 << inode->i_blkbits; 2910 tmp.b_size = 1 << inode->i_blkbits;
2903 get_block(inode, block, &tmp, 0); 2911 get_block(inode, block, &tmp, 0);
2904 return tmp.b_blocknr; 2912 return tmp.b_blocknr;
2905 } 2913 }
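
A minimal usage sketch (editor's note, not part of the patch): generic_block_bmap() is normally exposed through a one-line wrapper installed as the ->bmap address_space operation. myfs_get_block is again a hypothetical get_block_t callback.

static sector_t myfs_bmap(struct address_space *mapping, sector_t block)
{
	/* hypothetical wrapper around the generic helper defined above */
	return generic_block_bmap(mapping, block, myfs_get_block);
}
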
2906 2914
2907 static void end_bio_bh_io_sync(struct bio *bio, int err) 2915 static void end_bio_bh_io_sync(struct bio *bio, int err)
2908 { 2916 {
2909 struct buffer_head *bh = bio->bi_private; 2917 struct buffer_head *bh = bio->bi_private;
2910 2918
2911 if (err == -EOPNOTSUPP) { 2919 if (err == -EOPNOTSUPP) {
2912 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); 2920 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2913 set_bit(BH_Eopnotsupp, &bh->b_state); 2921 set_bit(BH_Eopnotsupp, &bh->b_state);
2914 } 2922 }
2923
2924 if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
2925 set_bit(BH_Quiet, &bh->b_state);
2915 2926
2916 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags)); 2927 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2917 bio_put(bio); 2928 bio_put(bio);
2918 } 2929 }
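
The BH_Quiet bit propagated in end_bio_bh_io_sync() above is what a buffer-head completion handler can test before logging. A hedged sketch (editor's illustration; this helper is hypothetical and not part of this hunk):

static void myfs_end_buffer_read(struct buffer_head *bh, int uptodate)
{
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		/* stay quiet when the lower layers asked for it via BH_Quiet */
		if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
			printk(KERN_WARNING "myfs: read error, logical block %llu\n",
			       (unsigned long long)bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
	put_bh(bh);
}
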
2919 2930
2920 int submit_bh(int rw, struct buffer_head * bh) 2931 int submit_bh(int rw, struct buffer_head * bh)
2921 { 2932 {
2922 struct bio *bio; 2933 struct bio *bio;
2923 int ret = 0; 2934 int ret = 0;
2924 2935
2925 BUG_ON(!buffer_locked(bh)); 2936 BUG_ON(!buffer_locked(bh));
2926 BUG_ON(!buffer_mapped(bh)); 2937 BUG_ON(!buffer_mapped(bh));
2927 BUG_ON(!bh->b_end_io); 2938 BUG_ON(!bh->b_end_io);
2928 2939
2929 /* 2940 /*
2930 * Mask in barrier bit for a write (could be either a WRITE or a 2941 * Mask in barrier bit for a write (could be either a WRITE or a
2931 * WRITE_SYNC 2942 * WRITE_SYNC
2932 */ 2943 */
2933 if (buffer_ordered(bh) && (rw & WRITE)) 2944 if (buffer_ordered(bh) && (rw & WRITE))
2934 rw |= WRITE_BARRIER; 2945 rw |= WRITE_BARRIER;
2935 2946
2936 /* 2947 /*
2937 * Only clear out a write error when rewriting 2948 * Only clear out a write error when rewriting
2938 */ 2949 */
2939 if (test_set_buffer_req(bh) && (rw & WRITE)) 2950 if (test_set_buffer_req(bh) && (rw & WRITE))
2940 clear_buffer_write_io_error(bh); 2951 clear_buffer_write_io_error(bh);
2941 2952
2942 /* 2953 /*
2943 * from here on down, it's all bio -- do the initial mapping, 2954 * from here on down, it's all bio -- do the initial mapping,
2944 * submit_bio -> generic_make_request may further map this bio around 2955 * submit_bio -> generic_make_request may further map this bio around
2945 */ 2956 */
2946 bio = bio_alloc(GFP_NOIO, 1); 2957 bio = bio_alloc(GFP_NOIO, 1);
2947 2958
2948 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 2959 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2949 bio->bi_bdev = bh->b_bdev; 2960 bio->bi_bdev = bh->b_bdev;
2950 bio->bi_io_vec[0].bv_page = bh->b_page; 2961 bio->bi_io_vec[0].bv_page = bh->b_page;
2951 bio->bi_io_vec[0].bv_len = bh->b_size; 2962 bio->bi_io_vec[0].bv_len = bh->b_size;
2952 bio->bi_io_vec[0].bv_offset = bh_offset(bh); 2963 bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2953 2964
2954 bio->bi_vcnt = 1; 2965 bio->bi_vcnt = 1;
2955 bio->bi_idx = 0; 2966 bio->bi_idx = 0;
2956 bio->bi_size = bh->b_size; 2967 bio->bi_size = bh->b_size;
2957 2968
2958 bio->bi_end_io = end_bio_bh_io_sync; 2969 bio->bi_end_io = end_bio_bh_io_sync;
2959 bio->bi_private = bh; 2970 bio->bi_private = bh;
2960 2971
2961 bio_get(bio); 2972 bio_get(bio);
2962 submit_bio(rw, bio); 2973 submit_bio(rw, bio);
2963 2974
2964 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 2975 if (bio_flagged(bio, BIO_EOPNOTSUPP))
2965 ret = -EOPNOTSUPP; 2976 ret = -EOPNOTSUPP;
2966 2977
2967 bio_put(bio); 2978 bio_put(bio);
2968 return ret; 2979 return ret;
2969 } 2980 }
2970 2981
2971 /** 2982 /**
2972 * ll_rw_block: low-level access to block devices (DEPRECATED) 2983 * ll_rw_block: low-level access to block devices (DEPRECATED)
2973 * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead) 2984 * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
2974 * @nr: number of &struct buffer_heads in the array 2985 * @nr: number of &struct buffer_heads in the array
2975 * @bhs: array of pointers to &struct buffer_head 2986 * @bhs: array of pointers to &struct buffer_head
2976 * 2987 *
2977 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and 2988 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2978 * requests an I/O operation on them, either a %READ or a %WRITE. The third 2989 * requests an I/O operation on them, either a %READ or a %WRITE. The third
2979 * %SWRITE is like %WRITE only we make sure that the *current* data in buffers 2990 * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
2980 * are sent to disk. The fourth %READA option is described in the documentation 2991 * are sent to disk. The fourth %READA option is described in the documentation
2981 * for generic_make_request() which ll_rw_block() calls. 2992 * for generic_make_request() which ll_rw_block() calls.
2982 * 2993 *
2983 * This function drops any buffer that it cannot get a lock on (with the 2994 * This function drops any buffer that it cannot get a lock on (with the
2984 * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be 2995 * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
2985 * clean when doing a write request, and any buffer that appears to be 2996 * clean when doing a write request, and any buffer that appears to be
2986 * up-to-date when doing a read request. Further it marks as clean buffers that 2997 * up-to-date when doing a read request. Further it marks as clean buffers that
2987 * are processed for writing (the buffer cache won't assume that they are 2998 * are processed for writing (the buffer cache won't assume that they are
2988 * actually clean until the buffer gets unlocked). 2999 * actually clean until the buffer gets unlocked).
2989 * 3000 *
2990 * ll_rw_block sets b_end_io to simple completion handler that marks 3001 * ll_rw_block sets b_end_io to simple completion handler that marks
2991 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes 3002 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
2992 * any waiters. 3003 * any waiters.
2993 * 3004 *
2994 * All of the buffers must be for the same device, and must also be a 3005 * All of the buffers must be for the same device, and must also be a
2995 * multiple of the current approved size for the device. 3006 * multiple of the current approved size for the device.
2996 */ 3007 */
2997 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) 3008 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2998 { 3009 {
2999 int i; 3010 int i;
3000 3011
3001 for (i = 0; i < nr; i++) { 3012 for (i = 0; i < nr; i++) {
3002 struct buffer_head *bh = bhs[i]; 3013 struct buffer_head *bh = bhs[i];
3003 3014
3004 if (rw == SWRITE || rw == SWRITE_SYNC) 3015 if (rw == SWRITE || rw == SWRITE_SYNC)
3005 lock_buffer(bh); 3016 lock_buffer(bh);
3006 else if (!trylock_buffer(bh)) 3017 else if (!trylock_buffer(bh))
3007 continue; 3018 continue;
3008 3019
3009 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) { 3020 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) {
3010 if (test_clear_buffer_dirty(bh)) { 3021 if (test_clear_buffer_dirty(bh)) {
3011 bh->b_end_io = end_buffer_write_sync; 3022 bh->b_end_io = end_buffer_write_sync;
3012 get_bh(bh); 3023 get_bh(bh);
3013 if (rw == SWRITE_SYNC) 3024 if (rw == SWRITE_SYNC)
3014 submit_bh(WRITE_SYNC, bh); 3025 submit_bh(WRITE_SYNC, bh);
3015 else 3026 else
3016 submit_bh(WRITE, bh); 3027 submit_bh(WRITE, bh);
3017 continue; 3028 continue;
3018 } 3029 }
3019 } else { 3030 } else {
3020 if (!buffer_uptodate(bh)) { 3031 if (!buffer_uptodate(bh)) {
3021 bh->b_end_io = end_buffer_read_sync; 3032 bh->b_end_io = end_buffer_read_sync;
3022 get_bh(bh); 3033 get_bh(bh);
3023 submit_bh(rw, bh); 3034 submit_bh(rw, bh);
3024 continue; 3035 continue;
3025 } 3036 }
3026 } 3037 }
3027 unlock_buffer(bh); 3038 unlock_buffer(bh);
3028 } 3039 }
3029 } 3040 }
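
A minimal usage sketch for ll_rw_block() (editor's illustration; the helper is hypothetical): start reads on a batch of buffer heads, then wait for and check each one.

static int myfs_read_bhs(struct buffer_head *bhs[], int nr)
{
	int i;

	ll_rw_block(READ, nr, bhs);		/* already-uptodate buffers are skipped */
	for (i = 0; i < nr; i++) {
		wait_on_buffer(bhs[i]);
		if (!buffer_uptodate(bhs[i]))
			return -EIO;		/* the read failed on this buffer */
	}
	return 0;
}
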
3030 3041
3031 /* 3042 /*
3032 * For a data-integrity writeout, we need to wait upon any in-progress I/O 3043 * For a data-integrity writeout, we need to wait upon any in-progress I/O
3033 * and then start new I/O and then wait upon it. The caller must have a ref on 3044 * and then start new I/O and then wait upon it. The caller must have a ref on
3034 * the buffer_head. 3045 * the buffer_head.
3035 */ 3046 */
3036 int sync_dirty_buffer(struct buffer_head *bh) 3047 int sync_dirty_buffer(struct buffer_head *bh)
3037 { 3048 {
3038 int ret = 0; 3049 int ret = 0;
3039 3050
3040 WARN_ON(atomic_read(&bh->b_count) < 1); 3051 WARN_ON(atomic_read(&bh->b_count) < 1);
3041 lock_buffer(bh); 3052 lock_buffer(bh);
3042 if (test_clear_buffer_dirty(bh)) { 3053 if (test_clear_buffer_dirty(bh)) {
3043 get_bh(bh); 3054 get_bh(bh);
3044 bh->b_end_io = end_buffer_write_sync; 3055 bh->b_end_io = end_buffer_write_sync;
3045 ret = submit_bh(WRITE_SYNC, bh); 3056 ret = submit_bh(WRITE_SYNC, bh);
3046 wait_on_buffer(bh); 3057 wait_on_buffer(bh);
3047 if (buffer_eopnotsupp(bh)) { 3058 if (buffer_eopnotsupp(bh)) {
3048 clear_buffer_eopnotsupp(bh); 3059 clear_buffer_eopnotsupp(bh);
3049 ret = -EOPNOTSUPP; 3060 ret = -EOPNOTSUPP;
3050 } 3061 }
3051 if (!ret && !buffer_uptodate(bh)) 3062 if (!ret && !buffer_uptodate(bh))
3052 ret = -EIO; 3063 ret = -EIO;
3053 } else { 3064 } else {
3054 unlock_buffer(bh); 3065 unlock_buffer(bh);
3055 } 3066 }
3056 return ret; 3067 return ret;
3057 } 3068 }
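
A minimal usage sketch for sync_dirty_buffer() (editor's illustration; the helper is hypothetical): dirty a metadata buffer and wait for it to reach disk.

static int myfs_sync_super(struct buffer_head *sb_bh)
{
	lock_buffer(sb_bh);
	/* ... update the on-disk structure in sb_bh->b_data ... */
	unlock_buffer(sb_bh);

	mark_buffer_dirty(sb_bh);
	return sync_dirty_buffer(sb_bh);	/* submits WRITE_SYNC and waits for completion */
}
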
3058 3069
3059 /* 3070 /*
3060 * try_to_free_buffers() checks if all the buffers on this particular page 3071 * try_to_free_buffers() checks if all the buffers on this particular page
3061 * are unused, and releases them if so. 3072 * are unused, and releases them if so.
3062 * 3073 *
3063 * Exclusion against try_to_free_buffers may be obtained by either 3074 * Exclusion against try_to_free_buffers may be obtained by either
3064 * locking the page or by holding its mapping's private_lock. 3075 * locking the page or by holding its mapping's private_lock.
3065 * 3076 *
3066 * If the page is dirty but all the buffers are clean then we need to 3077 * If the page is dirty but all the buffers are clean then we need to
3067 * be sure to mark the page clean as well. This is because the page 3078 * be sure to mark the page clean as well. This is because the page
3068 * may be against a block device, and a later reattachment of buffers 3079 * may be against a block device, and a later reattachment of buffers
3069 * to a dirty page will set *all* buffers dirty. Which would corrupt 3080 * to a dirty page will set *all* buffers dirty. Which would corrupt
3070 * filesystem data on the same device. 3081 * filesystem data on the same device.
3071 * 3082 *
3072 * The same applies to regular filesystem pages: if all the buffers are 3083 * The same applies to regular filesystem pages: if all the buffers are
3073 * clean then we set the page clean and proceed. To do that, we require 3084 * clean then we set the page clean and proceed. To do that, we require
3074 * total exclusion from __set_page_dirty_buffers(). That is obtained with 3085 * total exclusion from __set_page_dirty_buffers(). That is obtained with
3075 * private_lock. 3086 * private_lock.
3076 * 3087 *
3077 * try_to_free_buffers() is non-blocking. 3088 * try_to_free_buffers() is non-blocking.
3078 */ 3089 */
3079 static inline int buffer_busy(struct buffer_head *bh) 3090 static inline int buffer_busy(struct buffer_head *bh)
3080 { 3091 {
3081 return atomic_read(&bh->b_count) | 3092 return atomic_read(&bh->b_count) |
3082 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); 3093 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3083 } 3094 }
3084 3095
3085 static int 3096 static int
3086 drop_buffers(struct page *page, struct buffer_head **buffers_to_free) 3097 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3087 { 3098 {
3088 struct buffer_head *head = page_buffers(page); 3099 struct buffer_head *head = page_buffers(page);
3089 struct buffer_head *bh; 3100 struct buffer_head *bh;
3090 3101
3091 bh = head; 3102 bh = head;
3092 do { 3103 do {
3093 if (buffer_write_io_error(bh) && page->mapping) 3104 if (buffer_write_io_error(bh) && page->mapping)
3094 set_bit(AS_EIO, &page->mapping->flags); 3105 set_bit(AS_EIO, &page->mapping->flags);
3095 if (buffer_busy(bh)) 3106 if (buffer_busy(bh))
3096 goto failed; 3107 goto failed;
3097 bh = bh->b_this_page; 3108 bh = bh->b_this_page;
3098 } while (bh != head); 3109 } while (bh != head);
3099 3110
3100 do { 3111 do {
3101 struct buffer_head *next = bh->b_this_page; 3112 struct buffer_head *next = bh->b_this_page;
3102 3113
3103 if (bh->b_assoc_map) 3114 if (bh->b_assoc_map)
3104 __remove_assoc_queue(bh); 3115 __remove_assoc_queue(bh);
3105 bh = next; 3116 bh = next;
3106 } while (bh != head); 3117 } while (bh != head);
3107 *buffers_to_free = head; 3118 *buffers_to_free = head;
3108 __clear_page_buffers(page); 3119 __clear_page_buffers(page);
3109 return 1; 3120 return 1;
3110 failed: 3121 failed:
3111 return 0; 3122 return 0;
3112 } 3123 }
3113 3124
3114 int try_to_free_buffers(struct page *page) 3125 int try_to_free_buffers(struct page *page)
3115 { 3126 {
3116 struct address_space * const mapping = page->mapping; 3127 struct address_space * const mapping = page->mapping;
3117 struct buffer_head *buffers_to_free = NULL; 3128 struct buffer_head *buffers_to_free = NULL;
3118 int ret = 0; 3129 int ret = 0;
3119 3130
3120 BUG_ON(!PageLocked(page)); 3131 BUG_ON(!PageLocked(page));
3121 if (PageWriteback(page)) 3132 if (PageWriteback(page))
3122 return 0; 3133 return 0;
3123 3134
3124 if (mapping == NULL) { /* can this still happen? */ 3135 if (mapping == NULL) { /* can this still happen? */
3125 ret = drop_buffers(page, &buffers_to_free); 3136 ret = drop_buffers(page, &buffers_to_free);
3126 goto out; 3137 goto out;
3127 } 3138 }
3128 3139
3129 spin_lock(&mapping->private_lock); 3140 spin_lock(&mapping->private_lock);
3130 ret = drop_buffers(page, &buffers_to_free); 3141 ret = drop_buffers(page, &buffers_to_free);
3131 3142
3132 /* 3143 /*
3133 * If the filesystem writes its buffers by hand (eg ext3) 3144 * If the filesystem writes its buffers by hand (eg ext3)
3134 * then we can have clean buffers against a dirty page. We 3145 * then we can have clean buffers against a dirty page. We
3135 * clean the page here; otherwise the VM will never notice 3146 * clean the page here; otherwise the VM will never notice
3136 * that the filesystem did any IO at all. 3147 * that the filesystem did any IO at all.
3137 * 3148 *
3138 * Also, during truncate, discard_buffer will have marked all 3149 * Also, during truncate, discard_buffer will have marked all
3139 * the page's buffers clean. We discover that here and clean 3150 * the page's buffers clean. We discover that here and clean
3140 * the page also. 3151 * the page also.
3141 * 3152 *
3142 * private_lock must be held over this entire operation in order 3153 * private_lock must be held over this entire operation in order
3143 * to synchronise against __set_page_dirty_buffers and prevent the 3154 * to synchronise against __set_page_dirty_buffers and prevent the
3144 * dirty bit from being lost. 3155 * dirty bit from being lost.
3145 */ 3156 */
3146 if (ret) 3157 if (ret)
3147 cancel_dirty_page(page, PAGE_CACHE_SIZE); 3158 cancel_dirty_page(page, PAGE_CACHE_SIZE);
3148 spin_unlock(&mapping->private_lock); 3159 spin_unlock(&mapping->private_lock);
3149 out: 3160 out:
3150 if (buffers_to_free) { 3161 if (buffers_to_free) {
3151 struct buffer_head *bh = buffers_to_free; 3162 struct buffer_head *bh = buffers_to_free;
3152 3163
3153 do { 3164 do {
3154 struct buffer_head *next = bh->b_this_page; 3165 struct buffer_head *next = bh->b_this_page;
3155 free_buffer_head(bh); 3166 free_buffer_head(bh);
3156 bh = next; 3167 bh = next;
3157 } while (bh != buffers_to_free); 3168 } while (bh != buffers_to_free);
3158 } 3169 }
3159 return ret; 3170 return ret;
3160 } 3171 }
3161 EXPORT_SYMBOL(try_to_free_buffers); 3172 EXPORT_SYMBOL(try_to_free_buffers);
3162 3173
3163 void block_sync_page(struct page *page) 3174 void block_sync_page(struct page *page)
3164 { 3175 {
3165 struct address_space *mapping; 3176 struct address_space *mapping;
3166 3177
3167 smp_mb(); 3178 smp_mb();
3168 mapping = page_mapping(page); 3179 mapping = page_mapping(page);
3169 if (mapping) 3180 if (mapping)
3170 blk_run_backing_dev(mapping->backing_dev_info, page); 3181 blk_run_backing_dev(mapping->backing_dev_info, page);
3171 } 3182 }
3172 3183
3173 /* 3184 /*
3174 * There are no bdflush tunables left. But distributions are 3185 * There are no bdflush tunables left. But distributions are
3175 * still running obsolete flush daemons, so we terminate them here. 3186 * still running obsolete flush daemons, so we terminate them here.
3176 * 3187 *
3177 * Use of bdflush() is deprecated and will be removed in a future kernel. 3188 * Use of bdflush() is deprecated and will be removed in a future kernel.
3178 * The `pdflush' kernel threads fully replace bdflush daemons and this call. 3189 * The `pdflush' kernel threads fully replace bdflush daemons and this call.
3179 */ 3190 */
3180 asmlinkage long sys_bdflush(int func, long data) 3191 asmlinkage long sys_bdflush(int func, long data)
3181 { 3192 {
3182 static int msg_count; 3193 static int msg_count;
3183 3194
3184 if (!capable(CAP_SYS_ADMIN)) 3195 if (!capable(CAP_SYS_ADMIN))
3185 return -EPERM; 3196 return -EPERM;
3186 3197
3187 if (msg_count < 5) { 3198 if (msg_count < 5) {
3188 msg_count++; 3199 msg_count++;
3189 printk(KERN_INFO 3200 printk(KERN_INFO
3190 "warning: process `%s' used the obsolete bdflush" 3201 "warning: process `%s' used the obsolete bdflush"
3191 " system call\n", current->comm); 3202 " system call\n", current->comm);
3192 printk(KERN_INFO "Fix your initscripts?\n"); 3203 printk(KERN_INFO "Fix your initscripts?\n");
3193 } 3204 }
3194 3205
3195 if (func == 1) 3206 if (func == 1)
3196 do_exit(0); 3207 do_exit(0);
3197 return 0; 3208 return 0;
3198 } 3209 }
3199 3210
3200 /* 3211 /*
3201 * Buffer-head allocation 3212 * Buffer-head allocation
3202 */ 3213 */
3203 static struct kmem_cache *bh_cachep; 3214 static struct kmem_cache *bh_cachep;
3204 3215
3205 /* 3216 /*
3206 * Once the number of bh's in the machine exceeds this level, we start 3217 * Once the number of bh's in the machine exceeds this level, we start
3207 * stripping them in writeback. 3218 * stripping them in writeback.
3208 */ 3219 */
3209 static int max_buffer_heads; 3220 static int max_buffer_heads;
3210 3221
3211 int buffer_heads_over_limit; 3222 int buffer_heads_over_limit;
3212 3223
3213 struct bh_accounting { 3224 struct bh_accounting {
3214 int nr; /* Number of live bh's */ 3225 int nr; /* Number of live bh's */
3215 int ratelimit; /* Limit cacheline bouncing */ 3226 int ratelimit; /* Limit cacheline bouncing */
3216 }; 3227 };
3217 3228
3218 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0}; 3229 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3219 3230
3220 static void recalc_bh_state(void) 3231 static void recalc_bh_state(void)
3221 { 3232 {
3222 int i; 3233 int i;
3223 int tot = 0; 3234 int tot = 0;
3224 3235
3225 if (__get_cpu_var(bh_accounting).ratelimit++ < 4096) 3236 if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
3226 return; 3237 return;
3227 __get_cpu_var(bh_accounting).ratelimit = 0; 3238 __get_cpu_var(bh_accounting).ratelimit = 0;
3228 for_each_online_cpu(i) 3239 for_each_online_cpu(i)
3229 tot += per_cpu(bh_accounting, i).nr; 3240 tot += per_cpu(bh_accounting, i).nr;
3230 buffer_heads_over_limit = (tot > max_buffer_heads); 3241 buffer_heads_over_limit = (tot > max_buffer_heads);
3231 } 3242 }
3232 3243
3233 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) 3244 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3234 { 3245 {
3235 struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags); 3246 struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
3236 if (ret) { 3247 if (ret) {
3237 INIT_LIST_HEAD(&ret->b_assoc_buffers); 3248 INIT_LIST_HEAD(&ret->b_assoc_buffers);
3238 get_cpu_var(bh_accounting).nr++; 3249 get_cpu_var(bh_accounting).nr++;
3239 recalc_bh_state(); 3250 recalc_bh_state();
3240 put_cpu_var(bh_accounting); 3251 put_cpu_var(bh_accounting);
3241 } 3252 }
3242 return ret; 3253 return ret;
3243 } 3254 }
3244 EXPORT_SYMBOL(alloc_buffer_head); 3255 EXPORT_SYMBOL(alloc_buffer_head);
3245 3256
3246 void free_buffer_head(struct buffer_head *bh) 3257 void free_buffer_head(struct buffer_head *bh)
3247 { 3258 {
3248 BUG_ON(!list_empty(&bh->b_assoc_buffers)); 3259 BUG_ON(!list_empty(&bh->b_assoc_buffers));
3249 kmem_cache_free(bh_cachep, bh); 3260 kmem_cache_free(bh_cachep, bh);
3250 get_cpu_var(bh_accounting).nr--; 3261 get_cpu_var(bh_accounting).nr--;
3251 recalc_bh_state(); 3262 recalc_bh_state();
3252 put_cpu_var(bh_accounting); 3263 put_cpu_var(bh_accounting);
3253 } 3264 }
3254 EXPORT_SYMBOL(free_buffer_head); 3265 EXPORT_SYMBOL(free_buffer_head);
3255 3266
3256 static void buffer_exit_cpu(int cpu) 3267 static void buffer_exit_cpu(int cpu)
3257 { 3268 {
3258 int i; 3269 int i;
3259 struct bh_lru *b = &per_cpu(bh_lrus, cpu); 3270 struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3260 3271
3261 for (i = 0; i < BH_LRU_SIZE; i++) { 3272 for (i = 0; i < BH_LRU_SIZE; i++) {
3262 brelse(b->bhs[i]); 3273 brelse(b->bhs[i]);
3263 b->bhs[i] = NULL; 3274 b->bhs[i] = NULL;
3264 } 3275 }
3265 get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr; 3276 get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
3266 per_cpu(bh_accounting, cpu).nr = 0; 3277 per_cpu(bh_accounting, cpu).nr = 0;
3267 put_cpu_var(bh_accounting); 3278 put_cpu_var(bh_accounting);
3268 } 3279 }
3269 3280
3270 static int buffer_cpu_notify(struct notifier_block *self, 3281 static int buffer_cpu_notify(struct notifier_block *self,
3271 unsigned long action, void *hcpu) 3282 unsigned long action, void *hcpu)
3272 { 3283 {
3273 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) 3284 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
3274 buffer_exit_cpu((unsigned long)hcpu); 3285 buffer_exit_cpu((unsigned long)hcpu);
3275 return NOTIFY_OK; 3286 return NOTIFY_OK;
3276 } 3287 }
3277 3288
3278 /** 3289 /**
3279 * bh_uptodate_or_lock - Test whether the buffer is uptodate 3290 * bh_uptodate_or_lock - Test whether the buffer is uptodate
3280 * @bh: struct buffer_head 3291 * @bh: struct buffer_head
3281 * 3292 *
3282 * Return true if the buffer is up-to-date and false, 3293 * Return true if the buffer is up-to-date and false,
3283 * with the buffer locked, if not. 3294 * with the buffer locked, if not.
3284 */ 3295 */
3285 int bh_uptodate_or_lock(struct buffer_head *bh) 3296 int bh_uptodate_or_lock(struct buffer_head *bh)
3286 { 3297 {
3287 if (!buffer_uptodate(bh)) { 3298 if (!buffer_uptodate(bh)) {
3288 lock_buffer(bh); 3299 lock_buffer(bh);
3289 if (!buffer_uptodate(bh)) 3300 if (!buffer_uptodate(bh))
3290 return 0; 3301 return 0;
3291 unlock_buffer(bh); 3302 unlock_buffer(bh);
3292 } 3303 }
3293 return 1; 3304 return 1;
3294 } 3305 }
3295 EXPORT_SYMBOL(bh_uptodate_or_lock); 3306 EXPORT_SYMBOL(bh_uptodate_or_lock);
3296 3307
3297 /** 3308 /**
3298 * bh_submit_read - Submit a locked buffer for reading 3309 * bh_submit_read - Submit a locked buffer for reading
3299 * @bh: struct buffer_head 3310 * @bh: struct buffer_head
3300 * 3311 *
3301 * Returns zero on success and -EIO on error. 3312 * Returns zero on success and -EIO on error.
3302 */ 3313 */
3303 int bh_submit_read(struct buffer_head *bh) 3314 int bh_submit_read(struct buffer_head *bh)
3304 { 3315 {
3305 BUG_ON(!buffer_locked(bh)); 3316 BUG_ON(!buffer_locked(bh));
3306 3317
3307 if (buffer_uptodate(bh)) { 3318 if (buffer_uptodate(bh)) {
3308 unlock_buffer(bh); 3319 unlock_buffer(bh);
3309 return 0; 3320 return 0;
3310 } 3321 }
3311 3322
3312 get_bh(bh); 3323 get_bh(bh);
3313 bh->b_end_io = end_buffer_read_sync; 3324 bh->b_end_io = end_buffer_read_sync;
3314 submit_bh(READ, bh); 3325 submit_bh(READ, bh);
3315 wait_on_buffer(bh); 3326 wait_on_buffer(bh);
3316 if (buffer_uptodate(bh)) 3327 if (buffer_uptodate(bh))
3317 return 0; 3328 return 0;
3318 return -EIO; 3329 return -EIO;
3319 } 3330 }
3320 EXPORT_SYMBOL(bh_submit_read); 3331 EXPORT_SYMBOL(bh_submit_read);
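
A minimal sketch (editor's illustration; the wrapper name is hypothetical) of the pairing these two helpers are documented for: skip the read if the buffer is already uptodate, otherwise submit it with the buffer still locked.

static int myfs_read_block(struct buffer_head *bh)
{
	if (bh_uptodate_or_lock(bh))
		return 0;		/* already uptodate, nothing to submit */
	return bh_submit_read(bh);	/* bh is locked here; submits READ and waits */
}
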
3321 3332
3322 static void 3333 static void
3323 init_buffer_head(void *data) 3334 init_buffer_head(void *data)
3324 { 3335 {
3325 struct buffer_head *bh = data; 3336 struct buffer_head *bh = data;
3326 3337
3327 memset(bh, 0, sizeof(*bh)); 3338 memset(bh, 0, sizeof(*bh));
3328 INIT_LIST_HEAD(&bh->b_assoc_buffers); 3339 INIT_LIST_HEAD(&bh->b_assoc_buffers);
3329 } 3340 }
3330 3341
3331 void __init buffer_init(void) 3342 void __init buffer_init(void)
3332 { 3343 {
3333 int nrpages; 3344 int nrpages;
3334 3345
3335 bh_cachep = kmem_cache_create("buffer_head", 3346 bh_cachep = kmem_cache_create("buffer_head",
3336 sizeof(struct buffer_head), 0, 3347 sizeof(struct buffer_head), 0,
3337 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| 3348 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3338 SLAB_MEM_SPREAD), 3349 SLAB_MEM_SPREAD),
3339 init_buffer_head); 3350 init_buffer_head);
3340 3351
3341 /* 3352 /*
3342 * Limit the bh occupancy to 10% of ZONE_NORMAL 3353 * Limit the bh occupancy to 10% of ZONE_NORMAL
3343 */ 3354 */
3344 nrpages = (nr_free_buffer_pages() * 10) / 100; 3355 nrpages = (nr_free_buffer_pages() * 10) / 100;
3345 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)); 3356 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3346 hotcpu_notifier(buffer_cpu_notify, 0); 3357 hotcpu_notifier(buffer_cpu_notify, 0);
3347 } 3358 }
3348 3359
3349 EXPORT_SYMBOL(__bforget); 3360 EXPORT_SYMBOL(__bforget);
3350 EXPORT_SYMBOL(__brelse); 3361 EXPORT_SYMBOL(__brelse);
3351 EXPORT_SYMBOL(__wait_on_buffer); 3362 EXPORT_SYMBOL(__wait_on_buffer);
3352 EXPORT_SYMBOL(block_commit_write); 3363 EXPORT_SYMBOL(block_commit_write);
3353 EXPORT_SYMBOL(block_prepare_write); 3364 EXPORT_SYMBOL(block_prepare_write);
3354 EXPORT_SYMBOL(block_page_mkwrite); 3365 EXPORT_SYMBOL(block_page_mkwrite);
3355 EXPORT_SYMBOL(block_read_full_page); 3366 EXPORT_SYMBOL(block_read_full_page);
3356 EXPORT_SYMBOL(block_sync_page); 3367 EXPORT_SYMBOL(block_sync_page);
3357 EXPORT_SYMBOL(block_truncate_page); 3368 EXPORT_SYMBOL(block_truncate_page);
3358 EXPORT_SYMBOL(block_write_full_page); 3369 EXPORT_SYMBOL(block_write_full_page);
3359 EXPORT_SYMBOL(cont_write_begin); 3370 EXPORT_SYMBOL(cont_write_begin);
3360 EXPORT_SYMBOL(end_buffer_read_sync); 3371 EXPORT_SYMBOL(end_buffer_read_sync);
3361 EXPORT_SYMBOL(end_buffer_write_sync); 3372 EXPORT_SYMBOL(end_buffer_write_sync);
3362 EXPORT_SYMBOL(file_fsync); 3373 EXPORT_SYMBOL(file_fsync);
3363 EXPORT_SYMBOL(fsync_bdev); 3374 EXPORT_SYMBOL(fsync_bdev);
3364 EXPORT_SYMBOL(generic_block_bmap); 3375 EXPORT_SYMBOL(generic_block_bmap);
3365 EXPORT_SYMBOL(generic_cont_expand_simple); 3376 EXPORT_SYMBOL(generic_cont_expand_simple);
3366 EXPORT_SYMBOL(init_buffer); 3377 EXPORT_SYMBOL(init_buffer);
3367 EXPORT_SYMBOL(invalidate_bdev); 3378 EXPORT_SYMBOL(invalidate_bdev);
3368 EXPORT_SYMBOL(ll_rw_block); 3379 EXPORT_SYMBOL(ll_rw_block);
3369 EXPORT_SYMBOL(mark_buffer_dirty); 3380 EXPORT_SYMBOL(mark_buffer_dirty);
3370 EXPORT_SYMBOL(submit_bh); 3381 EXPORT_SYMBOL(submit_bh);
3371 EXPORT_SYMBOL(sync_dirty_buffer); 3382 EXPORT_SYMBOL(sync_dirty_buffer);
3372 EXPORT_SYMBOL(unlock_buffer); 3383 EXPORT_SYMBOL(unlock_buffer);
1 /* 1 /*
2 * 2.5 block I/O model 2 * 2.5 block I/O model
3 * 3 *
4 * Copyright (C) 2001 Jens Axboe <axboe@suse.de> 4 * Copyright (C) 2001 Jens Axboe <axboe@suse.de>
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as 7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation. 8 * published by the Free Software Foundation.
9 * 9 *
10 * This program is distributed in the hope that it will be useful, 10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 12
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details. 14 * GNU General Public License for more details.
15 * 15 *
16 * You should have received a copy of the GNU General Public Licens 16 * You should have received a copy of the GNU General Public Licens
17 * along with this program; if not, write to the Free Software 17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- 18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
19 */ 19 */
20 #ifndef __LINUX_BIO_H 20 #ifndef __LINUX_BIO_H
21 #define __LINUX_BIO_H 21 #define __LINUX_BIO_H
22 22
23 #include <linux/highmem.h> 23 #include <linux/highmem.h>
24 #include <linux/mempool.h> 24 #include <linux/mempool.h>
25 #include <linux/ioprio.h> 25 #include <linux/ioprio.h>
26 26
27 #ifdef CONFIG_BLOCK 27 #ifdef CONFIG_BLOCK
28 28
29 #include <asm/io.h> 29 #include <asm/io.h>
30 30
31 #define BIO_DEBUG 31 #define BIO_DEBUG
32 32
33 #ifdef BIO_DEBUG 33 #ifdef BIO_DEBUG
34 #define BIO_BUG_ON BUG_ON 34 #define BIO_BUG_ON BUG_ON
35 #else 35 #else
36 #define BIO_BUG_ON 36 #define BIO_BUG_ON
37 #endif 37 #endif
38 38
39 #define BIO_MAX_PAGES 256 39 #define BIO_MAX_PAGES 256
40 #define BIO_MAX_SIZE (BIO_MAX_PAGES << PAGE_CACHE_SHIFT) 40 #define BIO_MAX_SIZE (BIO_MAX_PAGES << PAGE_CACHE_SHIFT)
41 #define BIO_MAX_SECTORS (BIO_MAX_SIZE >> 9) 41 #define BIO_MAX_SECTORS (BIO_MAX_SIZE >> 9)
42 42
43 /* 43 /*
44 * was unsigned short, but we might as well be ready for > 64kB I/O pages 44 * was unsigned short, but we might as well be ready for > 64kB I/O pages
45 */ 45 */
46 struct bio_vec { 46 struct bio_vec {
47 struct page *bv_page; 47 struct page *bv_page;
48 unsigned int bv_len; 48 unsigned int bv_len;
49 unsigned int bv_offset; 49 unsigned int bv_offset;
50 }; 50 };
51 51
52 struct bio_set; 52 struct bio_set;
53 struct bio; 53 struct bio;
54 struct bio_integrity_payload; 54 struct bio_integrity_payload;
55 typedef void (bio_end_io_t) (struct bio *, int); 55 typedef void (bio_end_io_t) (struct bio *, int);
56 typedef void (bio_destructor_t) (struct bio *); 56 typedef void (bio_destructor_t) (struct bio *);
57 57
58 /* 58 /*
59 * main unit of I/O for the block layer and lower layers (ie drivers and 59 * main unit of I/O for the block layer and lower layers (ie drivers and
60 * stacking drivers) 60 * stacking drivers)
61 */ 61 */
62 struct bio { 62 struct bio {
63 sector_t bi_sector; /* device address in 512 byte 63 sector_t bi_sector; /* device address in 512 byte
64 sectors */ 64 sectors */
65 struct bio *bi_next; /* request queue link */ 65 struct bio *bi_next; /* request queue link */
66 struct block_device *bi_bdev; 66 struct block_device *bi_bdev;
67 unsigned long bi_flags; /* status, command, etc */ 67 unsigned long bi_flags; /* status, command, etc */
68 unsigned long bi_rw; /* bottom bits READ/WRITE, 68 unsigned long bi_rw; /* bottom bits READ/WRITE,
69 * top bits priority 69 * top bits priority
70 */ 70 */
71 71
72 unsigned short bi_vcnt; /* how many bio_vec's */ 72 unsigned short bi_vcnt; /* how many bio_vec's */
73 unsigned short bi_idx; /* current index into bvl_vec */ 73 unsigned short bi_idx; /* current index into bvl_vec */
74 74
75 /* Number of segments in this BIO after 75 /* Number of segments in this BIO after
76 * physical address coalescing is performed. 76 * physical address coalescing is performed.
77 */ 77 */
78 unsigned int bi_phys_segments; 78 unsigned int bi_phys_segments;
79 79
80 unsigned int bi_size; /* residual I/O count */ 80 unsigned int bi_size; /* residual I/O count */
81 81
82 /* 82 /*
83 * To keep track of the max segment size, we account for the 83 * To keep track of the max segment size, we account for the
84 * sizes of the first and last mergeable segments in this bio. 84 * sizes of the first and last mergeable segments in this bio.
85 */ 85 */
86 unsigned int bi_seg_front_size; 86 unsigned int bi_seg_front_size;
87 unsigned int bi_seg_back_size; 87 unsigned int bi_seg_back_size;
88 88
89 unsigned int bi_max_vecs; /* max bvl_vecs we can hold */ 89 unsigned int bi_max_vecs; /* max bvl_vecs we can hold */
90 90
91 unsigned int bi_comp_cpu; /* completion CPU */ 91 unsigned int bi_comp_cpu; /* completion CPU */
92 92
93 struct bio_vec *bi_io_vec; /* the actual vec list */ 93 struct bio_vec *bi_io_vec; /* the actual vec list */
94 94
95 bio_end_io_t *bi_end_io; 95 bio_end_io_t *bi_end_io;
96 atomic_t bi_cnt; /* pin count */ 96 atomic_t bi_cnt; /* pin count */
97 97
98 void *bi_private; 98 void *bi_private;
99 #if defined(CONFIG_BLK_DEV_INTEGRITY) 99 #if defined(CONFIG_BLK_DEV_INTEGRITY)
100 struct bio_integrity_payload *bi_integrity; /* data integrity */ 100 struct bio_integrity_payload *bi_integrity; /* data integrity */
101 #endif 101 #endif
102 102
103 bio_destructor_t *bi_destructor; /* destructor */ 103 bio_destructor_t *bi_destructor; /* destructor */
104 }; 104 };
105 105
106 /* 106 /*
107 * bio flags 107 * bio flags
108 */ 108 */
109 #define BIO_UPTODATE 0 /* ok after I/O completion */ 109 #define BIO_UPTODATE 0 /* ok after I/O completion */
110 #define BIO_RW_BLOCK 1 /* RW_AHEAD set, and read/write would block */ 110 #define BIO_RW_BLOCK 1 /* RW_AHEAD set, and read/write would block */
111 #define BIO_EOF 2 /* out-of-bounds error */ 111 #define BIO_EOF 2 /* out-of-bounds error */
112 #define BIO_SEG_VALID 3 /* bi_phys_segments valid */ 112 #define BIO_SEG_VALID 3 /* bi_phys_segments valid */
113 #define BIO_CLONED 4 /* doesn't own data */ 113 #define BIO_CLONED 4 /* doesn't own data */
114 #define BIO_BOUNCED 5 /* bio is a bounce bio */ 114 #define BIO_BOUNCED 5 /* bio is a bounce bio */
115 #define BIO_USER_MAPPED 6 /* contains user pages */ 115 #define BIO_USER_MAPPED 6 /* contains user pages */
116 #define BIO_EOPNOTSUPP 7 /* not supported */ 116 #define BIO_EOPNOTSUPP 7 /* not supported */
117 #define BIO_CPU_AFFINE 8 /* complete bio on same CPU as submitted */ 117 #define BIO_CPU_AFFINE 8 /* complete bio on same CPU as submitted */
118 #define BIO_NULL_MAPPED 9 /* contains invalid user pages */ 118 #define BIO_NULL_MAPPED 9 /* contains invalid user pages */
119 #define BIO_FS_INTEGRITY 10 /* fs owns integrity data, not block layer */ 119 #define BIO_FS_INTEGRITY 10 /* fs owns integrity data, not block layer */
120 #define BIO_QUIET 11 /* Make BIO Quiet */
120 #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) 121 #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag)))
121 122
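
The new BIO_QUIET bit above, together with the existing bio_flagged() helper, is all a completion path needs to decide whether to stay silent. Below is a minimal sketch, not code from this patch: the function names are made up, and marking the bio with set_bit() is an assumption about how a submitter might set the flag.

#include <linux/bio.h>
#include <linux/bitops.h>
#include <linux/kernel.h>

/* Hypothetical helpers, for illustration only -- not part of this patch. */
static void example_quiet_end_io(struct bio *bio, int error)
{
	/* Honour BIO_QUIET: skip the error printk for expected failures. */
	if (error && !bio_flagged(bio, BIO_QUIET))
		printk(KERN_ERR "example: I/O error %d at sector %llu\n",
		       error, (unsigned long long)bio->bi_sector);
	bio_put(bio);
}

static void example_submit_quiet(struct bio *bio, int rw)
{
	set_bit(BIO_QUIET, &bio->bi_flags);	/* assumed way of marking the bio */
	bio->bi_end_io = example_quiet_end_io;
	submit_bio(rw, bio);
}
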
122 /* 123 /*
123 * top 4 bits of bio flags indicate the pool this bio came from 124 * top 4 bits of bio flags indicate the pool this bio came from
124 */ 125 */
125 #define BIO_POOL_BITS (4) 126 #define BIO_POOL_BITS (4)
126 #define BIO_POOL_OFFSET (BITS_PER_LONG - BIO_POOL_BITS) 127 #define BIO_POOL_OFFSET (BITS_PER_LONG - BIO_POOL_BITS)
127 #define BIO_POOL_MASK (1UL << BIO_POOL_OFFSET) 128 #define BIO_POOL_MASK (1UL << BIO_POOL_OFFSET)
128 #define BIO_POOL_IDX(bio) ((bio)->bi_flags >> BIO_POOL_OFFSET) 129 #define BIO_POOL_IDX(bio) ((bio)->bi_flags >> BIO_POOL_OFFSET)
129 130
130 /* 131 /*
131 * bio bi_rw flags 132 * bio bi_rw flags
132 * 133 *
133 * bit 0 -- data direction 134 * bit 0 -- data direction
134 * If not set, bio is a read from device. If set, it's a write to device. 135 * If not set, bio is a read from device. If set, it's a write to device.
135 * bit 1 -- rw-ahead when set 136 * bit 1 -- rw-ahead when set
136 * bit 2 -- barrier 137 * bit 2 -- barrier
137 * Insert a serialization point in the IO queue, forcing previously 138 * Insert a serialization point in the IO queue, forcing previously
138 * submitted IO to be completed before this one is issued. 139 * submitted IO to be completed before this one is issued.
139 * bit 3 -- synchronous I/O hint: the block layer will unplug immediately 140 * bit 3 -- synchronous I/O hint: the block layer will unplug immediately
140 * Note that this does NOT indicate that the IO itself is sync, just 141 * Note that this does NOT indicate that the IO itself is sync, just
141 * that the block layer will not postpone issue of this IO by plugging. 142 * that the block layer will not postpone issue of this IO by plugging.
142 * bit 4 -- metadata request 143 * bit 4 -- metadata request
143 * Used for tracing to differentiate metadata and data IO. May also 144 * Used for tracing to differentiate metadata and data IO. May also
144 * get some preferential treatment in the IO scheduler 145 * get some preferential treatment in the IO scheduler
145 * bit 5 -- discard sectors 146 * bit 5 -- discard sectors
146 * Informs the lower level device that this range of sectors is no longer 147 * Informs the lower level device that this range of sectors is no longer
147 * used by the file system and may thus be freed by the device. Used 148 * used by the file system and may thus be freed by the device. Used
148 * for flash based storage. 149 * for flash based storage.
149 * bit 6 -- fail fast device errors 150 * bit 6 -- fail fast device errors
150 * bit 7 -- fail fast transport errors 151 * bit 7 -- fail fast transport errors
151 * bit 8 -- fail fast driver errors 152 * bit 8 -- fail fast driver errors
152 * Don't want driver retries for any fast fail whatever the reason. 153 * Don't want driver retries for any fast fail whatever the reason.
153 */ 154 */
154 #define BIO_RW 0 /* Must match RW in req flags (blkdev.h) */ 155 #define BIO_RW 0 /* Must match RW in req flags (blkdev.h) */
155 #define BIO_RW_AHEAD 1 /* Must match FAILFAST in req flags */ 156 #define BIO_RW_AHEAD 1 /* Must match FAILFAST in req flags */
156 #define BIO_RW_BARRIER 2 157 #define BIO_RW_BARRIER 2
157 #define BIO_RW_SYNC 3 158 #define BIO_RW_SYNC 3
158 #define BIO_RW_META 4 159 #define BIO_RW_META 4
159 #define BIO_RW_DISCARD 5 160 #define BIO_RW_DISCARD 5
160 #define BIO_RW_FAILFAST_DEV 6 161 #define BIO_RW_FAILFAST_DEV 6
161 #define BIO_RW_FAILFAST_TRANSPORT 7 162 #define BIO_RW_FAILFAST_TRANSPORT 7
162 #define BIO_RW_FAILFAST_DRIVER 8 163 #define BIO_RW_FAILFAST_DRIVER 8
163 164
164 /* 165 /*
165 * upper 16 bits of bi_rw define the io priority of this bio 166 * upper 16 bits of bi_rw define the io priority of this bio
166 */ 167 */
167 #define BIO_PRIO_SHIFT (8 * sizeof(unsigned long) - IOPRIO_BITS) 168 #define BIO_PRIO_SHIFT (8 * sizeof(unsigned long) - IOPRIO_BITS)
168 #define bio_prio(bio) ((bio)->bi_rw >> BIO_PRIO_SHIFT) 169 #define bio_prio(bio) ((bio)->bi_rw >> BIO_PRIO_SHIFT)
169 #define bio_prio_valid(bio) ioprio_valid(bio_prio(bio)) 170 #define bio_prio_valid(bio) ioprio_valid(bio_prio(bio))
170 171
171 #define bio_set_prio(bio, prio) do { \ 172 #define bio_set_prio(bio, prio) do { \
172 WARN_ON(prio >= (1 << IOPRIO_BITS)); \ 173 WARN_ON(prio >= (1 << IOPRIO_BITS)); \
173 (bio)->bi_rw &= ((1UL << BIO_PRIO_SHIFT) - 1); \ 174 (bio)->bi_rw &= ((1UL << BIO_PRIO_SHIFT) - 1); \
174 (bio)->bi_rw |= ((unsigned long) (prio) << BIO_PRIO_SHIFT); \ 175 (bio)->bi_rw |= ((unsigned long) (prio) << BIO_PRIO_SHIFT); \
175 } while (0) 176 } while (0)
176 177
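
As the comment above notes, the I/O priority lives in the top bits of bi_rw. A short illustration of tagging a bio and reading the priority back, assuming the usual IOPRIO_* helpers from linux/ioprio.h; the function name and the chosen class/level are made up:

#include <linux/bio.h>
#include <linux/ioprio.h>

/* Illustration only: give a bio a best-effort priority and read it back. */
static void example_tag_priority(struct bio *bio)
{
	bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4));

	if (bio_prio_valid(bio))
		pr_debug("bio ioprio class %d, data %d\n",
			 IOPRIO_PRIO_CLASS(bio_prio(bio)),
			 IOPRIO_PRIO_DATA(bio_prio(bio)));
}
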
177 /* 178 /*
178 * various member access, note that bio_data should of course not be used 179 * various member access, note that bio_data should of course not be used
179 * on highmem page vectors 180 * on highmem page vectors
180 */ 181 */
181 #define bio_iovec_idx(bio, idx) (&((bio)->bi_io_vec[(idx)])) 182 #define bio_iovec_idx(bio, idx) (&((bio)->bi_io_vec[(idx)]))
182 #define bio_iovec(bio) bio_iovec_idx((bio), (bio)->bi_idx) 183 #define bio_iovec(bio) bio_iovec_idx((bio), (bio)->bi_idx)
183 #define bio_page(bio) bio_iovec((bio))->bv_page 184 #define bio_page(bio) bio_iovec((bio))->bv_page
184 #define bio_offset(bio) bio_iovec((bio))->bv_offset 185 #define bio_offset(bio) bio_iovec((bio))->bv_offset
185 #define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_idx) 186 #define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_idx)
186 #define bio_sectors(bio) ((bio)->bi_size >> 9) 187 #define bio_sectors(bio) ((bio)->bi_size >> 9)
187 #define bio_barrier(bio) ((bio)->bi_rw & (1 << BIO_RW_BARRIER)) 188 #define bio_barrier(bio) ((bio)->bi_rw & (1 << BIO_RW_BARRIER))
188 #define bio_sync(bio) ((bio)->bi_rw & (1 << BIO_RW_SYNC)) 189 #define bio_sync(bio) ((bio)->bi_rw & (1 << BIO_RW_SYNC))
189 #define bio_failfast_dev(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_DEV)) 190 #define bio_failfast_dev(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_DEV))
190 #define bio_failfast_transport(bio) \ 191 #define bio_failfast_transport(bio) \
191 ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_TRANSPORT)) 192 ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_TRANSPORT))
192 #define bio_failfast_driver(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_DRIVER)) 193 #define bio_failfast_driver(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_DRIVER))
193 #define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD)) 194 #define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD))
194 #define bio_rw_meta(bio) ((bio)->bi_rw & (1 << BIO_RW_META)) 195 #define bio_rw_meta(bio) ((bio)->bi_rw & (1 << BIO_RW_META))
195 #define bio_discard(bio) ((bio)->bi_rw & (1 << BIO_RW_DISCARD)) 196 #define bio_discard(bio) ((bio)->bi_rw & (1 << BIO_RW_DISCARD))
196 #define bio_empty_barrier(bio) (bio_barrier(bio) && !bio_has_data(bio) && !bio_discard(bio)) 197 #define bio_empty_barrier(bio) (bio_barrier(bio) && !bio_has_data(bio) && !bio_discard(bio))
197 198
198 static inline unsigned int bio_cur_sectors(struct bio *bio) 199 static inline unsigned int bio_cur_sectors(struct bio *bio)
199 { 200 {
200 if (bio->bi_vcnt) 201 if (bio->bi_vcnt)
201 return bio_iovec(bio)->bv_len >> 9; 202 return bio_iovec(bio)->bv_len >> 9;
202 else /* dataless requests such as discard */ 203 else /* dataless requests such as discard */
203 return bio->bi_size >> 9; 204 return bio->bi_size >> 9;
204 } 205 }
205 206
206 static inline void *bio_data(struct bio *bio) 207 static inline void *bio_data(struct bio *bio)
207 { 208 {
208 if (bio->bi_vcnt) 209 if (bio->bi_vcnt)
209 return page_address(bio_page(bio)) + bio_offset(bio); 210 return page_address(bio_page(bio)) + bio_offset(bio);
210 211
211 return NULL; 212 return NULL;
212 } 213 }
213 214
214 /* 215 /*
215 * will die 216 * will die
216 */ 217 */
217 #define bio_to_phys(bio) (page_to_phys(bio_page((bio))) + (unsigned long) bio_offset((bio))) 218 #define bio_to_phys(bio) (page_to_phys(bio_page((bio))) + (unsigned long) bio_offset((bio)))
218 #define bvec_to_phys(bv) (page_to_phys((bv)->bv_page) + (unsigned long) (bv)->bv_offset) 219 #define bvec_to_phys(bv) (page_to_phys((bv)->bv_page) + (unsigned long) (bv)->bv_offset)
219 220
220 /* 221 /*
221 * queues that have highmem support enabled may still need to revert to 222 * queues that have highmem support enabled may still need to revert to
222 * PIO transfers occasionally and thus map high pages temporarily. For 223 * PIO transfers occasionally and thus map high pages temporarily. For
223 * permanent PIO fall back, user is probably better off disabling highmem 224 * permanent PIO fall back, user is probably better off disabling highmem
224 * I/O completely on that queue (see ide-dma for example) 225 * I/O completely on that queue (see ide-dma for example)
225 */ 226 */
226 #define __bio_kmap_atomic(bio, idx, kmtype) \ 227 #define __bio_kmap_atomic(bio, idx, kmtype) \
227 (kmap_atomic(bio_iovec_idx((bio), (idx))->bv_page, kmtype) + \ 228 (kmap_atomic(bio_iovec_idx((bio), (idx))->bv_page, kmtype) + \
228 bio_iovec_idx((bio), (idx))->bv_offset) 229 bio_iovec_idx((bio), (idx))->bv_offset)
229 230
230 #define __bio_kunmap_atomic(addr, kmtype) kunmap_atomic(addr, kmtype) 231 #define __bio_kunmap_atomic(addr, kmtype) kunmap_atomic(addr, kmtype)
231 232
232 /* 233 /*
233 * merge helpers etc 234 * merge helpers etc
234 */ 235 */
235 236
236 #define __BVEC_END(bio) bio_iovec_idx((bio), (bio)->bi_vcnt - 1) 237 #define __BVEC_END(bio) bio_iovec_idx((bio), (bio)->bi_vcnt - 1)
237 #define __BVEC_START(bio) bio_iovec_idx((bio), (bio)->bi_idx) 238 #define __BVEC_START(bio) bio_iovec_idx((bio), (bio)->bi_idx)
238 239
239 /* Default implementation of BIOVEC_PHYS_MERGEABLE */ 240 /* Default implementation of BIOVEC_PHYS_MERGEABLE */
240 #define __BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ 241 #define __BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
241 ((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) 242 ((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2)))
242 243
243 /* 244 /*
244 * allow arch override, for eg virtualized architectures (put in asm/io.h) 245 * allow arch override, for eg virtualized architectures (put in asm/io.h)
245 */ 246 */
246 #ifndef BIOVEC_PHYS_MERGEABLE 247 #ifndef BIOVEC_PHYS_MERGEABLE
247 #define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ 248 #define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
248 __BIOVEC_PHYS_MERGEABLE(vec1, vec2) 249 __BIOVEC_PHYS_MERGEABLE(vec1, vec2)
249 #endif 250 #endif
250 251
251 #define __BIO_SEG_BOUNDARY(addr1, addr2, mask) \ 252 #define __BIO_SEG_BOUNDARY(addr1, addr2, mask) \
252 (((addr1) | (mask)) == (((addr2) - 1) | (mask))) 253 (((addr1) | (mask)) == (((addr2) - 1) | (mask)))
253 #define BIOVEC_SEG_BOUNDARY(q, b1, b2) \ 254 #define BIOVEC_SEG_BOUNDARY(q, b1, b2) \
254 __BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, (q)->seg_boundary_mask) 255 __BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, (q)->seg_boundary_mask)
255 #define BIO_SEG_BOUNDARY(q, b1, b2) \ 256 #define BIO_SEG_BOUNDARY(q, b1, b2) \
256 BIOVEC_SEG_BOUNDARY((q), __BVEC_END((b1)), __BVEC_START((b2))) 257 BIOVEC_SEG_BOUNDARY((q), __BVEC_END((b1)), __BVEC_START((b2)))
257 258
258 #define bio_io_error(bio) bio_endio((bio), -EIO) 259 #define bio_io_error(bio) bio_endio((bio), -EIO)
259 260
260 /* 261 /*
261 * drivers should not use the __ version unless they _really_ want to 262 * drivers should not use the __ version unless they _really_ want to
262 * run through the entire bio and not just pending pieces 263 * run through the entire bio and not just pending pieces
263 */ 264 */
264 #define __bio_for_each_segment(bvl, bio, i, start_idx) \ 265 #define __bio_for_each_segment(bvl, bio, i, start_idx) \
265 for (bvl = bio_iovec_idx((bio), (start_idx)), i = (start_idx); \ 266 for (bvl = bio_iovec_idx((bio), (start_idx)), i = (start_idx); \
266 i < (bio)->bi_vcnt; \ 267 i < (bio)->bi_vcnt; \
267 bvl++, i++) 268 bvl++, i++)
268 269
269 #define bio_for_each_segment(bvl, bio, i) \ 270 #define bio_for_each_segment(bvl, bio, i) \
270 __bio_for_each_segment(bvl, bio, i, (bio)->bi_idx) 271 __bio_for_each_segment(bvl, bio, i, (bio)->bi_idx)
271 272
272 /* 273 /*
273 * get a reference to a bio, so it won't disappear. the intended use is 274 * get a reference to a bio, so it won't disappear. the intended use is
274 * something like: 275 * something like:
275 * 276 *
276 * bio_get(bio); 277 * bio_get(bio);
277 * submit_bio(rw, bio); 278 * submit_bio(rw, bio);
278 * if (bio->bi_flags ...) 279 * if (bio->bi_flags ...)
279 * do_something 280 * do_something
280 * bio_put(bio); 281 * bio_put(bio);
281 * 282 *
282 * without the bio_get(), it could potentially complete I/O before submit_bio 283 * without the bio_get(), it could potentially complete I/O before submit_bio
283 * returns. and then bio would be freed memory when if (bio->bi_flags ...) 284 * returns. and then bio would be freed memory when if (bio->bi_flags ...)
284 * runs 285 * runs
285 */ 286 */
286 #define bio_get(bio) atomic_inc(&(bio)->bi_cnt) 287 #define bio_get(bio) atomic_inc(&(bio)->bi_cnt)
287 288
288 #if defined(CONFIG_BLK_DEV_INTEGRITY) 289 #if defined(CONFIG_BLK_DEV_INTEGRITY)
289 /* 290 /*
290 * bio integrity payload 291 * bio integrity payload
291 */ 292 */
292 struct bio_integrity_payload { 293 struct bio_integrity_payload {
293 struct bio *bip_bio; /* parent bio */ 294 struct bio *bip_bio; /* parent bio */
294 struct bio_vec *bip_vec; /* integrity data vector */ 295 struct bio_vec *bip_vec; /* integrity data vector */
295 296
296 sector_t bip_sector; /* virtual start sector */ 297 sector_t bip_sector; /* virtual start sector */
297 298
298 void *bip_buf; /* generated integrity data */ 299 void *bip_buf; /* generated integrity data */
299 bio_end_io_t *bip_end_io; /* saved I/O completion fn */ 300 bio_end_io_t *bip_end_io; /* saved I/O completion fn */
300 301
301 int bip_error; /* saved I/O error */ 302 int bip_error; /* saved I/O error */
302 unsigned int bip_size; 303 unsigned int bip_size;
303 304
304 unsigned short bip_pool; /* pool the ivec came from */ 305 unsigned short bip_pool; /* pool the ivec came from */
305 unsigned short bip_vcnt; /* # of integrity bio_vecs */ 306 unsigned short bip_vcnt; /* # of integrity bio_vecs */
306 unsigned short bip_idx; /* current bip_vec index */ 307 unsigned short bip_idx; /* current bip_vec index */
307 308
308 struct work_struct bip_work; /* I/O completion */ 309 struct work_struct bip_work; /* I/O completion */
309 }; 310 };
310 #endif /* CONFIG_BLK_DEV_INTEGRITY */ 311 #endif /* CONFIG_BLK_DEV_INTEGRITY */
311 312
312 /* 313 /*
313 * A bio_pair is used when we need to split a bio. 314 * A bio_pair is used when we need to split a bio.
314 * This can only happen for a bio that refers to just one 315 * This can only happen for a bio that refers to just one
315 * page of data, and in the unusual situation when the 316 * page of data, and in the unusual situation when the
316 * page crosses a chunk/device boundary 317 * page crosses a chunk/device boundary
317 * 318 *
318 * The address of the master bio is stored in bio1.bi_private 319 * The address of the master bio is stored in bio1.bi_private
319 * The address of the pool the pair was allocated from is stored 320 * The address of the pool the pair was allocated from is stored
320 * in bio2.bi_private 321 * in bio2.bi_private
321 */ 322 */
322 struct bio_pair { 323 struct bio_pair {
323 struct bio bio1, bio2; 324 struct bio bio1, bio2;
324 struct bio_vec bv1, bv2; 325 struct bio_vec bv1, bv2;
325 #if defined(CONFIG_BLK_DEV_INTEGRITY) 326 #if defined(CONFIG_BLK_DEV_INTEGRITY)
326 struct bio_integrity_payload bip1, bip2; 327 struct bio_integrity_payload bip1, bip2;
327 struct bio_vec iv1, iv2; 328 struct bio_vec iv1, iv2;
328 #endif 329 #endif
329 atomic_t cnt; 330 atomic_t cnt;
330 int error; 331 int error;
331 }; 332 };
332 extern struct bio_pair *bio_split(struct bio *bi, int first_sectors); 333 extern struct bio_pair *bio_split(struct bio *bi, int first_sectors);
333 extern void bio_pair_release(struct bio_pair *dbio); 334 extern void bio_pair_release(struct bio_pair *dbio);
334 335
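
bio_split() and bio_pair_release() above implement the single-page split described in the comment. A sketch of the usual pattern, loosely modelled on how striping drivers consume it; the boundary handling and function name are illustrative assumptions, not code from this file:

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Illustration only: split a one-page bio that straddles a chunk boundary. */
static void example_split_and_resubmit(struct bio *bio, sector_t boundary)
{
	struct bio_pair *bp = bio_split(bio, boundary - bio->bi_sector);

	if (!bp)
		return;				/* split failed; caller must handle it */

	generic_make_request(&bp->bio1);	/* sectors below the boundary */
	generic_make_request(&bp->bio2);	/* sectors above the boundary */
	bio_pair_release(bp);			/* drop our reference on the pair */
}
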
335 extern struct bio_set *bioset_create(int, int); 336 extern struct bio_set *bioset_create(int, int);
336 extern void bioset_free(struct bio_set *); 337 extern void bioset_free(struct bio_set *);
337 338
338 extern struct bio *bio_alloc(gfp_t, int); 339 extern struct bio *bio_alloc(gfp_t, int);
339 extern struct bio *bio_kmalloc(gfp_t, int); 340 extern struct bio *bio_kmalloc(gfp_t, int);
340 extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); 341 extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
341 extern void bio_put(struct bio *); 342 extern void bio_put(struct bio *);
342 extern void bio_free(struct bio *, struct bio_set *); 343 extern void bio_free(struct bio *, struct bio_set *);
343 344
344 extern void bio_endio(struct bio *, int); 345 extern void bio_endio(struct bio *, int);
345 struct request_queue; 346 struct request_queue;
346 extern int bio_phys_segments(struct request_queue *, struct bio *); 347 extern int bio_phys_segments(struct request_queue *, struct bio *);
347 348
348 extern void __bio_clone(struct bio *, struct bio *); 349 extern void __bio_clone(struct bio *, struct bio *);
349 extern struct bio *bio_clone(struct bio *, gfp_t); 350 extern struct bio *bio_clone(struct bio *, gfp_t);
350 351
351 extern void bio_init(struct bio *); 352 extern void bio_init(struct bio *);
352 353
353 extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int); 354 extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int);
354 extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, 355 extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
355 unsigned int, unsigned int); 356 unsigned int, unsigned int);
356 extern int bio_get_nr_vecs(struct block_device *); 357 extern int bio_get_nr_vecs(struct block_device *);
357 extern sector_t bio_sector_offset(struct bio *, unsigned short, unsigned int); 358 extern sector_t bio_sector_offset(struct bio *, unsigned short, unsigned int);
358 extern struct bio *bio_map_user(struct request_queue *, struct block_device *, 359 extern struct bio *bio_map_user(struct request_queue *, struct block_device *,
359 unsigned long, unsigned int, int, gfp_t); 360 unsigned long, unsigned int, int, gfp_t);
360 struct sg_iovec; 361 struct sg_iovec;
361 struct rq_map_data; 362 struct rq_map_data;
362 extern struct bio *bio_map_user_iov(struct request_queue *, 363 extern struct bio *bio_map_user_iov(struct request_queue *,
363 struct block_device *, 364 struct block_device *,
364 struct sg_iovec *, int, int, gfp_t); 365 struct sg_iovec *, int, int, gfp_t);
365 extern void bio_unmap_user(struct bio *); 366 extern void bio_unmap_user(struct bio *);
366 extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int, 367 extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int,
367 gfp_t); 368 gfp_t);
368 extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int, 369 extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int,
369 gfp_t, int); 370 gfp_t, int);
370 extern void bio_set_pages_dirty(struct bio *bio); 371 extern void bio_set_pages_dirty(struct bio *bio);
371 extern void bio_check_pages_dirty(struct bio *bio); 372 extern void bio_check_pages_dirty(struct bio *bio);
372 extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *, 373 extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *,
373 unsigned long, unsigned int, int, gfp_t); 374 unsigned long, unsigned int, int, gfp_t);
374 extern struct bio *bio_copy_user_iov(struct request_queue *, 375 extern struct bio *bio_copy_user_iov(struct request_queue *,
375 struct rq_map_data *, struct sg_iovec *, 376 struct rq_map_data *, struct sg_iovec *,
376 int, int, gfp_t); 377 int, int, gfp_t);
377 extern int bio_uncopy_user(struct bio *); 378 extern int bio_uncopy_user(struct bio *);
378 void zero_fill_bio(struct bio *bio); 379 void zero_fill_bio(struct bio *bio);
379 extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *); 380 extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *);
380 extern unsigned int bvec_nr_vecs(unsigned short idx); 381 extern unsigned int bvec_nr_vecs(unsigned short idx);
381 382
382 /* 383 /*
383 * Allow queuer to specify a completion CPU for this bio 384 * Allow queuer to specify a completion CPU for this bio
384 */ 385 */
385 static inline void bio_set_completion_cpu(struct bio *bio, unsigned int cpu) 386 static inline void bio_set_completion_cpu(struct bio *bio, unsigned int cpu)
386 { 387 {
387 bio->bi_comp_cpu = cpu; 388 bio->bi_comp_cpu = cpu;
388 } 389 }
389 390
390 /* 391 /*
391 * bio_set is used to allow other portions of the IO system to 392 * bio_set is used to allow other portions of the IO system to
392 * allocate their own private memory pools for bio and iovec structures. 393 * allocate their own private memory pools for bio and iovec structures.
393 * These memory pools in turn all allocate from the bio_slab 394 * These memory pools in turn all allocate from the bio_slab
394 * and the bvec_slabs[]. 395 * and the bvec_slabs[].
395 */ 396 */
396 #define BIO_POOL_SIZE 2 397 #define BIO_POOL_SIZE 2
397 #define BIOVEC_NR_POOLS 6 398 #define BIOVEC_NR_POOLS 6
398 399
399 struct bio_set { 400 struct bio_set {
400 mempool_t *bio_pool; 401 mempool_t *bio_pool;
401 #if defined(CONFIG_BLK_DEV_INTEGRITY) 402 #if defined(CONFIG_BLK_DEV_INTEGRITY)
402 mempool_t *bio_integrity_pool; 403 mempool_t *bio_integrity_pool;
403 #endif 404 #endif
404 mempool_t *bvec_pools[BIOVEC_NR_POOLS]; 405 mempool_t *bvec_pools[BIOVEC_NR_POOLS];
405 }; 406 };
406 407
407 struct biovec_slab { 408 struct biovec_slab {
408 int nr_vecs; 409 int nr_vecs;
409 char *name; 410 char *name;
410 struct kmem_cache *slab; 411 struct kmem_cache *slab;
411 }; 412 };
412 413
413 extern struct bio_set *fs_bio_set; 414 extern struct bio_set *fs_bio_set;
414 415
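
Stacking drivers typically keep a private bio_set, as the comment above describes, so their allocations do not compete with fs_bio_set. A minimal sketch of creating one and allocating from it; the names, pool sizes, and the meaning assumed for bioset_create()'s two arguments are illustrative:

#include <linux/bio.h>
#include <linux/errno.h>

static struct bio_set *example_bs;	/* driver-private pool, illustration only */

static int example_pool_init(void)
{
	example_bs = bioset_create(BIO_POOL_SIZE, 2);
	return example_bs ? 0 : -ENOMEM;
}

static struct bio *example_alloc_bio(void)
{
	/* GFP_NOIO: this may run on the writeback path */
	return bio_alloc_bioset(GFP_NOIO, 1, example_bs);
}

static void example_pool_exit(void)
{
	bioset_free(example_bs);
}
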
415 /* 416 /*
416 * a small number of entries is fine, not going to be performance critical. 417 * a small number of entries is fine, not going to be performance critical.
417 * basically we just need to survive 418 * basically we just need to survive
418 */ 419 */
419 #define BIO_SPLIT_ENTRIES 2 420 #define BIO_SPLIT_ENTRIES 2
420 421
421 #ifdef CONFIG_HIGHMEM 422 #ifdef CONFIG_HIGHMEM
422 /* 423 /*
423 * remember to add offset! and never ever reenable interrupts between a 424 * remember to add offset! and never ever reenable interrupts between a
424 * bvec_kmap_irq and bvec_kunmap_irq!! 425 * bvec_kmap_irq and bvec_kunmap_irq!!
425 * 426 *
426 * This function MUST be inlined - it plays with the CPU interrupt flags. 427 * This function MUST be inlined - it plays with the CPU interrupt flags.
427 */ 428 */
428 static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags) 429 static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
429 { 430 {
430 unsigned long addr; 431 unsigned long addr;
431 432
432 /* 433 /*
433 * might not be a highmem page, but the preempt/irq count 434 * might not be a highmem page, but the preempt/irq count
434 * balancing is a lot nicer this way 435 * balancing is a lot nicer this way
435 */ 436 */
436 local_irq_save(*flags); 437 local_irq_save(*flags);
437 addr = (unsigned long) kmap_atomic(bvec->bv_page, KM_BIO_SRC_IRQ); 438 addr = (unsigned long) kmap_atomic(bvec->bv_page, KM_BIO_SRC_IRQ);
438 439
439 BUG_ON(addr & ~PAGE_MASK); 440 BUG_ON(addr & ~PAGE_MASK);
440 441
441 return (char *) addr + bvec->bv_offset; 442 return (char *) addr + bvec->bv_offset;
442 } 443 }
443 444
444 static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) 445 static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
445 { 446 {
446 unsigned long ptr = (unsigned long) buffer & PAGE_MASK; 447 unsigned long ptr = (unsigned long) buffer & PAGE_MASK;
447 448
448 kunmap_atomic((void *) ptr, KM_BIO_SRC_IRQ); 449 kunmap_atomic((void *) ptr, KM_BIO_SRC_IRQ);
449 local_irq_restore(*flags); 450 local_irq_restore(*flags);
450 } 451 }
451 452
452 #else 453 #else
453 #define bvec_kmap_irq(bvec, flags) (page_address((bvec)->bv_page) + (bvec)->bv_offset) 454 #define bvec_kmap_irq(bvec, flags) (page_address((bvec)->bv_page) + (bvec)->bv_offset)
454 #define bvec_kunmap_irq(buf, flags) do { *(flags) = 0; } while (0) 455 #define bvec_kunmap_irq(buf, flags) do { *(flags) = 0; } while (0)
455 #endif 456 #endif
456 457
457 static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx, 458 static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx,
458 unsigned long *flags) 459 unsigned long *flags)
459 { 460 {
460 return bvec_kmap_irq(bio_iovec_idx(bio, idx), flags); 461 return bvec_kmap_irq(bio_iovec_idx(bio, idx), flags);
461 } 462 }
462 #define __bio_kunmap_irq(buf, flags) bvec_kunmap_irq(buf, flags) 463 #define __bio_kunmap_irq(buf, flags) bvec_kunmap_irq(buf, flags)
463 464
464 #define bio_kmap_irq(bio, flags) \ 465 #define bio_kmap_irq(bio, flags) \
465 __bio_kmap_irq((bio), (bio)->bi_idx, (flags)) 466 __bio_kmap_irq((bio), (bio)->bi_idx, (flags))
466 #define bio_kunmap_irq(buf,flags) __bio_kunmap_irq(buf, flags) 467 #define bio_kunmap_irq(buf,flags) __bio_kunmap_irq(buf, flags)
467 468
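
The kmap_irq helpers above exist for the occasional PIO fallback on highmem queues. A short illustration of the copy pattern the comment warns about, keeping interrupts disabled between map and unmap; the function name is made up:

#include <linux/bio.h>
#include <linux/string.h>

/* Illustration only: copy the current segment of a bio into a bounce buffer. */
static void example_copy_current_segment(struct bio *bio, void *dst)
{
	unsigned long flags;
	char *src = bio_kmap_irq(bio, &flags);

	memcpy(dst, src, bio_iovec(bio)->bv_len);
	bio_kunmap_irq(src, &flags);	/* IRQs stay off between map and unmap */
}
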
468 /* 469 /*
469 * Check whether this bio carries any data or not. A NULL bio is allowed. 470 * Check whether this bio carries any data or not. A NULL bio is allowed.
470 */ 471 */
471 static inline int bio_has_data(struct bio *bio) 472 static inline int bio_has_data(struct bio *bio)
472 { 473 {
473 return bio && bio->bi_io_vec != NULL; 474 return bio && bio->bi_io_vec != NULL;
474 } 475 }
475 476
476 #if defined(CONFIG_BLK_DEV_INTEGRITY) 477 #if defined(CONFIG_BLK_DEV_INTEGRITY)
477 478
478 #define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)])) 479 #define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)]))
479 #define bip_vec(bip) bip_vec_idx(bip, 0) 480 #define bip_vec(bip) bip_vec_idx(bip, 0)
480 481
481 #define __bip_for_each_vec(bvl, bip, i, start_idx) \ 482 #define __bip_for_each_vec(bvl, bip, i, start_idx) \
482 for (bvl = bip_vec_idx((bip), (start_idx)), i = (start_idx); \ 483 for (bvl = bip_vec_idx((bip), (start_idx)), i = (start_idx); \
483 i < (bip)->bip_vcnt; \ 484 i < (bip)->bip_vcnt; \
484 bvl++, i++) 485 bvl++, i++)
485 486
486 #define bip_for_each_vec(bvl, bip, i) \ 487 #define bip_for_each_vec(bvl, bip, i) \
487 __bip_for_each_vec(bvl, bip, i, (bip)->bip_idx) 488 __bip_for_each_vec(bvl, bip, i, (bip)->bip_idx)
488 489
489 #define bio_integrity(bio) (bio->bi_integrity != NULL) 490 #define bio_integrity(bio) (bio->bi_integrity != NULL)
490 491
491 extern struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *, gfp_t, unsigned int, struct bio_set *); 492 extern struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *, gfp_t, unsigned int, struct bio_set *);
492 extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int); 493 extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int);
493 extern void bio_integrity_free(struct bio *, struct bio_set *); 494 extern void bio_integrity_free(struct bio *, struct bio_set *);
494 extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int); 495 extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int);
495 extern int bio_integrity_enabled(struct bio *bio); 496 extern int bio_integrity_enabled(struct bio *bio);
496 extern int bio_integrity_set_tag(struct bio *, void *, unsigned int); 497 extern int bio_integrity_set_tag(struct bio *, void *, unsigned int);
497 extern int bio_integrity_get_tag(struct bio *, void *, unsigned int); 498 extern int bio_integrity_get_tag(struct bio *, void *, unsigned int);
498 extern int bio_integrity_prep(struct bio *); 499 extern int bio_integrity_prep(struct bio *);
499 extern void bio_integrity_endio(struct bio *, int); 500 extern void bio_integrity_endio(struct bio *, int);
500 extern void bio_integrity_advance(struct bio *, unsigned int); 501 extern void bio_integrity_advance(struct bio *, unsigned int);
501 extern void bio_integrity_trim(struct bio *, unsigned int, unsigned int); 502 extern void bio_integrity_trim(struct bio *, unsigned int, unsigned int);
502 extern void bio_integrity_split(struct bio *, struct bio_pair *, int); 503 extern void bio_integrity_split(struct bio *, struct bio_pair *, int);
503 extern int bio_integrity_clone(struct bio *, struct bio *, struct bio_set *); 504 extern int bio_integrity_clone(struct bio *, struct bio *, struct bio_set *);
504 extern int bioset_integrity_create(struct bio_set *, int); 505 extern int bioset_integrity_create(struct bio_set *, int);
505 extern void bioset_integrity_free(struct bio_set *); 506 extern void bioset_integrity_free(struct bio_set *);
506 extern void bio_integrity_init_slab(void); 507 extern void bio_integrity_init_slab(void);
507 508
508 #else /* CONFIG_BLK_DEV_INTEGRITY */ 509 #else /* CONFIG_BLK_DEV_INTEGRITY */
509 510
510 #define bio_integrity(a) (0) 511 #define bio_integrity(a) (0)
511 #define bioset_integrity_create(a, b) (0) 512 #define bioset_integrity_create(a, b) (0)
512 #define bio_integrity_prep(a) (0) 513 #define bio_integrity_prep(a) (0)
513 #define bio_integrity_enabled(a) (0) 514 #define bio_integrity_enabled(a) (0)
514 #define bio_integrity_clone(a, b, c) (0) 515 #define bio_integrity_clone(a, b, c) (0)
515 #define bioset_integrity_free(a) do { } while (0) 516 #define bioset_integrity_free(a) do { } while (0)
516 #define bio_integrity_free(a, b) do { } while (0) 517 #define bio_integrity_free(a, b) do { } while (0)
517 #define bio_integrity_endio(a, b) do { } while (0) 518 #define bio_integrity_endio(a, b) do { } while (0)
518 #define bio_integrity_advance(a, b) do { } while (0) 519 #define bio_integrity_advance(a, b) do { } while (0)
519 #define bio_integrity_trim(a, b, c) do { } while (0) 520 #define bio_integrity_trim(a, b, c) do { } while (0)
520 #define bio_integrity_split(a, b, c) do { } while (0) 521 #define bio_integrity_split(a, b, c) do { } while (0)
521 #define bio_integrity_set_tag(a, b, c) do { } while (0) 522 #define bio_integrity_set_tag(a, b, c) do { } while (0)
522 #define bio_integrity_get_tag(a, b, c) do { } while (0) 523 #define bio_integrity_get_tag(a, b, c) do { } while (0)
523 #define bio_integrity_init_slab(a) do { } while (0) 524 #define bio_integrity_init_slab(a) do { } while (0)
524 525
525 #endif /* CONFIG_BLK_DEV_INTEGRITY */ 526 #endif /* CONFIG_BLK_DEV_INTEGRITY */
526 527
527 #endif /* CONFIG_BLOCK */ 528 #endif /* CONFIG_BLOCK */
528 #endif /* __LINUX_BIO_H */ 529 #endif /* __LINUX_BIO_H */
529 530
include/linux/buffer_head.h
1 /* 1 /*
2 * include/linux/buffer_head.h 2 * include/linux/buffer_head.h
3 * 3 *
4 * Everything to do with buffer_heads. 4 * Everything to do with buffer_heads.
5 */ 5 */
6 6
7 #ifndef _LINUX_BUFFER_HEAD_H 7 #ifndef _LINUX_BUFFER_HEAD_H
8 #define _LINUX_BUFFER_HEAD_H 8 #define _LINUX_BUFFER_HEAD_H
9 9
10 #include <linux/types.h> 10 #include <linux/types.h>
11 #include <linux/fs.h> 11 #include <linux/fs.h>
12 #include <linux/linkage.h> 12 #include <linux/linkage.h>
13 #include <linux/pagemap.h> 13 #include <linux/pagemap.h>
14 #include <linux/wait.h> 14 #include <linux/wait.h>
15 #include <asm/atomic.h> 15 #include <asm/atomic.h>
16 16
17 #ifdef CONFIG_BLOCK 17 #ifdef CONFIG_BLOCK
18 18
19 enum bh_state_bits { 19 enum bh_state_bits {
20 BH_Uptodate, /* Contains valid data */ 20 BH_Uptodate, /* Contains valid data */
21 BH_Dirty, /* Is dirty */ 21 BH_Dirty, /* Is dirty */
22 BH_Lock, /* Is locked */ 22 BH_Lock, /* Is locked */
23 BH_Req, /* Has been submitted for I/O */ 23 BH_Req, /* Has been submitted for I/O */
24 BH_Uptodate_Lock,/* Used by the first bh in a page, to serialise 24 BH_Uptodate_Lock,/* Used by the first bh in a page, to serialise
25 * IO completion of other buffers in the page 25 * IO completion of other buffers in the page
26 */ 26 */
27 27
28 BH_Mapped, /* Has a disk mapping */ 28 BH_Mapped, /* Has a disk mapping */
29 BH_New, /* Disk mapping was newly created by get_block */ 29 BH_New, /* Disk mapping was newly created by get_block */
30 BH_Async_Read, /* Is under end_buffer_async_read I/O */ 30 BH_Async_Read, /* Is under end_buffer_async_read I/O */
31 BH_Async_Write, /* Is under end_buffer_async_write I/O */ 31 BH_Async_Write, /* Is under end_buffer_async_write I/O */
32 BH_Delay, /* Buffer is not yet allocated on disk */ 32 BH_Delay, /* Buffer is not yet allocated on disk */
33 BH_Boundary, /* Block is followed by a discontiguity */ 33 BH_Boundary, /* Block is followed by a discontiguity */
34 BH_Write_EIO, /* I/O error on write */ 34 BH_Write_EIO, /* I/O error on write */
35 BH_Ordered, /* ordered write */ 35 BH_Ordered, /* ordered write */
36 BH_Eopnotsupp, /* operation not supported (barrier) */ 36 BH_Eopnotsupp, /* operation not supported (barrier) */
37 BH_Unwritten, /* Buffer is allocated on disk but not written */ 37 BH_Unwritten, /* Buffer is allocated on disk but not written */
38 BH_Quiet, /* Buffer Error Printks to be quiet */
38 39
39 BH_PrivateStart,/* not a state bit, but the first bit available 40 BH_PrivateStart,/* not a state bit, but the first bit available
40 * for private allocation by other entities 41 * for private allocation by other entities
41 */ 42 */
42 }; 43 };
43 44
44 #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512) 45 #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
45 46
46 struct page; 47 struct page;
47 struct buffer_head; 48 struct buffer_head;
48 struct address_space; 49 struct address_space;
49 typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate); 50 typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate);
50 51
51 /* 52 /*
52 * Historically, a buffer_head was used to map a single block 53 * Historically, a buffer_head was used to map a single block
53 * within a page, and of course as the unit of I/O through the 54 * within a page, and of course as the unit of I/O through the
54 * filesystem and block layers. Nowadays the basic I/O unit 55 * filesystem and block layers. Nowadays the basic I/O unit
55 * is the bio, and buffer_heads are used for extracting block 56 * is the bio, and buffer_heads are used for extracting block
56 * mappings (via a get_block_t call), for tracking state within 57 * mappings (via a get_block_t call), for tracking state within
57 * a page (via a page_mapping) and for wrapping bio submission 58 * a page (via a page_mapping) and for wrapping bio submission
58 * for backward compatibility reasons (e.g. submit_bh). 59 * for backward compatibility reasons (e.g. submit_bh).
59 */ 60 */
60 struct buffer_head { 61 struct buffer_head {
61 unsigned long b_state; /* buffer state bitmap (see above) */ 62 unsigned long b_state; /* buffer state bitmap (see above) */
62 struct buffer_head *b_this_page;/* circular list of page's buffers */ 63 struct buffer_head *b_this_page;/* circular list of page's buffers */
63 struct page *b_page; /* the page this bh is mapped to */ 64 struct page *b_page; /* the page this bh is mapped to */
64 65
65 sector_t b_blocknr; /* start block number */ 66 sector_t b_blocknr; /* start block number */
66 size_t b_size; /* size of mapping */ 67 size_t b_size; /* size of mapping */
67 char *b_data; /* pointer to data within the page */ 68 char *b_data; /* pointer to data within the page */
68 69
69 struct block_device *b_bdev; 70 struct block_device *b_bdev;
70 bh_end_io_t *b_end_io; /* I/O completion */ 71 bh_end_io_t *b_end_io; /* I/O completion */
71 void *b_private; /* reserved for b_end_io */ 72 void *b_private; /* reserved for b_end_io */
72 struct list_head b_assoc_buffers; /* associated with another mapping */ 73 struct list_head b_assoc_buffers; /* associated with another mapping */
73 struct address_space *b_assoc_map; /* mapping this buffer is 74 struct address_space *b_assoc_map; /* mapping this buffer is
74 associated with */ 75 associated with */
75 atomic_t b_count; /* users using this buffer_head */ 76 atomic_t b_count; /* users using this buffer_head */
76 }; 77 };
77 78
78 /* 79 /*
79 * macro tricks to expand the set_buffer_foo(), clear_buffer_foo() 80 * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
80 * and buffer_foo() functions. 81 * and buffer_foo() functions.
81 */ 82 */
82 #define BUFFER_FNS(bit, name) \ 83 #define BUFFER_FNS(bit, name) \
83 static inline void set_buffer_##name(struct buffer_head *bh) \ 84 static inline void set_buffer_##name(struct buffer_head *bh) \
84 { \ 85 { \
85 set_bit(BH_##bit, &(bh)->b_state); \ 86 set_bit(BH_##bit, &(bh)->b_state); \
86 } \ 87 } \
87 static inline void clear_buffer_##name(struct buffer_head *bh) \ 88 static inline void clear_buffer_##name(struct buffer_head *bh) \
88 { \ 89 { \
89 clear_bit(BH_##bit, &(bh)->b_state); \ 90 clear_bit(BH_##bit, &(bh)->b_state); \
90 } \ 91 } \
91 static inline int buffer_##name(const struct buffer_head *bh) \ 92 static inline int buffer_##name(const struct buffer_head *bh) \
92 { \ 93 { \
93 return test_bit(BH_##bit, &(bh)->b_state); \ 94 return test_bit(BH_##bit, &(bh)->b_state); \
94 } 95 }
95 96
96 /* 97 /*
97 * test_set_buffer_foo() and test_clear_buffer_foo() 98 * test_set_buffer_foo() and test_clear_buffer_foo()
98 */ 99 */
99 #define TAS_BUFFER_FNS(bit, name) \ 100 #define TAS_BUFFER_FNS(bit, name) \
100 static inline int test_set_buffer_##name(struct buffer_head *bh) \ 101 static inline int test_set_buffer_##name(struct buffer_head *bh) \
101 { \ 102 { \
102 return test_and_set_bit(BH_##bit, &(bh)->b_state); \ 103 return test_and_set_bit(BH_##bit, &(bh)->b_state); \
103 } \ 104 } \
104 static inline int test_clear_buffer_##name(struct buffer_head *bh) \ 105 static inline int test_clear_buffer_##name(struct buffer_head *bh) \
105 { \ 106 { \
106 return test_and_clear_bit(BH_##bit, &(bh)->b_state); \ 107 return test_and_clear_bit(BH_##bit, &(bh)->b_state); \
107 } \ 108 } \
108 109
109 /* 110 /*
110 * Emit the buffer bitops functions. Note that there are also functions 111 * Emit the buffer bitops functions. Note that there are also functions
111 * of the form "mark_buffer_foo()". These are higher-level functions which 112 * of the form "mark_buffer_foo()". These are higher-level functions which
112 * do something in addition to setting a b_state bit. 113 * do something in addition to setting a b_state bit.
113 */ 114 */
114 BUFFER_FNS(Uptodate, uptodate) 115 BUFFER_FNS(Uptodate, uptodate)
115 BUFFER_FNS(Dirty, dirty) 116 BUFFER_FNS(Dirty, dirty)
116 TAS_BUFFER_FNS(Dirty, dirty) 117 TAS_BUFFER_FNS(Dirty, dirty)
117 BUFFER_FNS(Lock, locked) 118 BUFFER_FNS(Lock, locked)
118 BUFFER_FNS(Req, req) 119 BUFFER_FNS(Req, req)
119 TAS_BUFFER_FNS(Req, req) 120 TAS_BUFFER_FNS(Req, req)
120 BUFFER_FNS(Mapped, mapped) 121 BUFFER_FNS(Mapped, mapped)
121 BUFFER_FNS(New, new) 122 BUFFER_FNS(New, new)
122 BUFFER_FNS(Async_Read, async_read) 123 BUFFER_FNS(Async_Read, async_read)
123 BUFFER_FNS(Async_Write, async_write) 124 BUFFER_FNS(Async_Write, async_write)
124 BUFFER_FNS(Delay, delay) 125 BUFFER_FNS(Delay, delay)
125 BUFFER_FNS(Boundary, boundary) 126 BUFFER_FNS(Boundary, boundary)
126 BUFFER_FNS(Write_EIO, write_io_error) 127 BUFFER_FNS(Write_EIO, write_io_error)
127 BUFFER_FNS(Ordered, ordered) 128 BUFFER_FNS(Ordered, ordered)
128 BUFFER_FNS(Eopnotsupp, eopnotsupp) 129 BUFFER_FNS(Eopnotsupp, eopnotsupp)
129 BUFFER_FNS(Unwritten, unwritten) 130 BUFFER_FNS(Unwritten, unwritten)
130 131
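
For reference, this is what the BUFFER_FNS() macro above would generate for the new BH_Quiet bit if an accessor were emitted for it. This hunk only adds the state bit itself; the expansion below is shown purely for illustration of the macro mechanism:

/* Equivalent expansion of BUFFER_FNS(Quiet, quiet) -- illustration only. */
static inline void set_buffer_quiet(struct buffer_head *bh)
{
	set_bit(BH_Quiet, &bh->b_state);
}
static inline void clear_buffer_quiet(struct buffer_head *bh)
{
	clear_bit(BH_Quiet, &bh->b_state);
}
static inline int buffer_quiet(const struct buffer_head *bh)
{
	return test_bit(BH_Quiet, &bh->b_state);
}
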
131 #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) 132 #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)
132 #define touch_buffer(bh) mark_page_accessed(bh->b_page) 133 #define touch_buffer(bh) mark_page_accessed(bh->b_page)
133 134
134 /* If we *know* page->private refers to buffer_heads */ 135 /* If we *know* page->private refers to buffer_heads */
135 #define page_buffers(page) \ 136 #define page_buffers(page) \
136 ({ \ 137 ({ \
137 BUG_ON(!PagePrivate(page)); \ 138 BUG_ON(!PagePrivate(page)); \
138 ((struct buffer_head *)page_private(page)); \ 139 ((struct buffer_head *)page_private(page)); \
139 }) 140 })
140 #define page_has_buffers(page) PagePrivate(page) 141 #define page_has_buffers(page) PagePrivate(page)
141 142
142 /* 143 /*
143 * Declarations 144 * Declarations
144 */ 145 */
145 146
146 void mark_buffer_dirty(struct buffer_head *bh); 147 void mark_buffer_dirty(struct buffer_head *bh);
147 void init_buffer(struct buffer_head *, bh_end_io_t *, void *); 148 void init_buffer(struct buffer_head *, bh_end_io_t *, void *);
148 void set_bh_page(struct buffer_head *bh, 149 void set_bh_page(struct buffer_head *bh,
149 struct page *page, unsigned long offset); 150 struct page *page, unsigned long offset);
150 int try_to_free_buffers(struct page *); 151 int try_to_free_buffers(struct page *);
151 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, 152 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
152 int retry); 153 int retry);
153 void create_empty_buffers(struct page *, unsigned long, 154 void create_empty_buffers(struct page *, unsigned long,
154 unsigned long b_state); 155 unsigned long b_state);
155 void end_buffer_read_sync(struct buffer_head *bh, int uptodate); 156 void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
156 void end_buffer_write_sync(struct buffer_head *bh, int uptodate); 157 void end_buffer_write_sync(struct buffer_head *bh, int uptodate);
157 158
158 /* Things to do with buffers at mapping->private_list */ 159 /* Things to do with buffers at mapping->private_list */
159 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode); 160 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode);
160 int inode_has_buffers(struct inode *); 161 int inode_has_buffers(struct inode *);
161 void invalidate_inode_buffers(struct inode *); 162 void invalidate_inode_buffers(struct inode *);
162 int remove_inode_buffers(struct inode *inode); 163 int remove_inode_buffers(struct inode *inode);
163 int sync_mapping_buffers(struct address_space *mapping); 164 int sync_mapping_buffers(struct address_space *mapping);
164 void unmap_underlying_metadata(struct block_device *bdev, sector_t block); 165 void unmap_underlying_metadata(struct block_device *bdev, sector_t block);
165 166
166 void mark_buffer_async_write(struct buffer_head *bh); 167 void mark_buffer_async_write(struct buffer_head *bh);
167 void invalidate_bdev(struct block_device *); 168 void invalidate_bdev(struct block_device *);
168 int sync_blockdev(struct block_device *bdev); 169 int sync_blockdev(struct block_device *bdev);
169 void __wait_on_buffer(struct buffer_head *); 170 void __wait_on_buffer(struct buffer_head *);
170 wait_queue_head_t *bh_waitq_head(struct buffer_head *bh); 171 wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
171 int fsync_bdev(struct block_device *); 172 int fsync_bdev(struct block_device *);
172 struct super_block *freeze_bdev(struct block_device *); 173 struct super_block *freeze_bdev(struct block_device *);
173 void thaw_bdev(struct block_device *, struct super_block *); 174 void thaw_bdev(struct block_device *, struct super_block *);
174 int fsync_super(struct super_block *); 175 int fsync_super(struct super_block *);
175 int fsync_no_super(struct block_device *); 176 int fsync_no_super(struct block_device *);
176 struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block, 177 struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block,
177 unsigned size); 178 unsigned size);
178 struct buffer_head *__getblk(struct block_device *bdev, sector_t block, 179 struct buffer_head *__getblk(struct block_device *bdev, sector_t block,
179 unsigned size); 180 unsigned size);
180 void __brelse(struct buffer_head *); 181 void __brelse(struct buffer_head *);
181 void __bforget(struct buffer_head *); 182 void __bforget(struct buffer_head *);
182 void __breadahead(struct block_device *, sector_t block, unsigned int size); 183 void __breadahead(struct block_device *, sector_t block, unsigned int size);
183 struct buffer_head *__bread(struct block_device *, sector_t block, unsigned size); 184 struct buffer_head *__bread(struct block_device *, sector_t block, unsigned size);
184 void invalidate_bh_lrus(void); 185 void invalidate_bh_lrus(void);
185 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); 186 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
186 void free_buffer_head(struct buffer_head * bh); 187 void free_buffer_head(struct buffer_head * bh);
187 void unlock_buffer(struct buffer_head *bh); 188 void unlock_buffer(struct buffer_head *bh);
188 void __lock_buffer(struct buffer_head *bh); 189 void __lock_buffer(struct buffer_head *bh);
189 void ll_rw_block(int, int, struct buffer_head * bh[]); 190 void ll_rw_block(int, int, struct buffer_head * bh[]);
190 int sync_dirty_buffer(struct buffer_head *bh); 191 int sync_dirty_buffer(struct buffer_head *bh);
191 int submit_bh(int, struct buffer_head *); 192 int submit_bh(int, struct buffer_head *);
192 void write_boundary_block(struct block_device *bdev, 193 void write_boundary_block(struct block_device *bdev,
193 sector_t bblock, unsigned blocksize); 194 sector_t bblock, unsigned blocksize);
194 int bh_uptodate_or_lock(struct buffer_head *bh); 195 int bh_uptodate_or_lock(struct buffer_head *bh);
195 int bh_submit_read(struct buffer_head *bh); 196 int bh_submit_read(struct buffer_head *bh);
196 197
197 extern int buffer_heads_over_limit; 198 extern int buffer_heads_over_limit;
198 199
199 /* 200 /*
200 * Generic address_space_operations implementations for buffer_head-backed 201 * Generic address_space_operations implementations for buffer_head-backed
201 * address_spaces. 202 * address_spaces.
202 */ 203 */
203 void block_invalidatepage(struct page *page, unsigned long offset); 204 void block_invalidatepage(struct page *page, unsigned long offset);
204 int block_write_full_page(struct page *page, get_block_t *get_block, 205 int block_write_full_page(struct page *page, get_block_t *get_block,
205 struct writeback_control *wbc); 206 struct writeback_control *wbc);
206 int block_read_full_page(struct page*, get_block_t*); 207 int block_read_full_page(struct page*, get_block_t*);
207 int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, 208 int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
208 unsigned long from); 209 unsigned long from);
209 int block_write_begin(struct file *, struct address_space *, 210 int block_write_begin(struct file *, struct address_space *,
210 loff_t, unsigned, unsigned, 211 loff_t, unsigned, unsigned,
211 struct page **, void **, get_block_t*); 212 struct page **, void **, get_block_t*);
212 int block_write_end(struct file *, struct address_space *, 213 int block_write_end(struct file *, struct address_space *,
213 loff_t, unsigned, unsigned, 214 loff_t, unsigned, unsigned,
214 struct page *, void *); 215 struct page *, void *);
215 int generic_write_end(struct file *, struct address_space *, 216 int generic_write_end(struct file *, struct address_space *,
216 loff_t, unsigned, unsigned, 217 loff_t, unsigned, unsigned,
217 struct page *, void *); 218 struct page *, void *);
218 void page_zero_new_buffers(struct page *page, unsigned from, unsigned to); 219 void page_zero_new_buffers(struct page *page, unsigned from, unsigned to);
219 int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*); 220 int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*);
220 int cont_write_begin(struct file *, struct address_space *, loff_t, 221 int cont_write_begin(struct file *, struct address_space *, loff_t,
221 unsigned, unsigned, struct page **, void **, 222 unsigned, unsigned, struct page **, void **,
222 get_block_t *, loff_t *); 223 get_block_t *, loff_t *);
223 int generic_cont_expand_simple(struct inode *inode, loff_t size); 224 int generic_cont_expand_simple(struct inode *inode, loff_t size);
224 int block_commit_write(struct page *page, unsigned from, unsigned to); 225 int block_commit_write(struct page *page, unsigned from, unsigned to);
225 int block_page_mkwrite(struct vm_area_struct *vma, struct page *page, 226 int block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
226 get_block_t get_block); 227 get_block_t get_block);
227 void block_sync_page(struct page *); 228 void block_sync_page(struct page *);
228 sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); 229 sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
229 int block_truncate_page(struct address_space *, loff_t, get_block_t *); 230 int block_truncate_page(struct address_space *, loff_t, get_block_t *);
230 int file_fsync(struct file *, struct dentry *, int); 231 int file_fsync(struct file *, struct dentry *, int);
231 int nobh_write_begin(struct file *, struct address_space *, 232 int nobh_write_begin(struct file *, struct address_space *,
232 loff_t, unsigned, unsigned, 233 loff_t, unsigned, unsigned,
233 struct page **, void **, get_block_t*); 234 struct page **, void **, get_block_t*);
234 int nobh_write_end(struct file *, struct address_space *, 235 int nobh_write_end(struct file *, struct address_space *,
235 loff_t, unsigned, unsigned, 236 loff_t, unsigned, unsigned,
236 struct page *, void *); 237 struct page *, void *);
237 int nobh_truncate_page(struct address_space *, loff_t, get_block_t *); 238 int nobh_truncate_page(struct address_space *, loff_t, get_block_t *);
238 int nobh_writepage(struct page *page, get_block_t *get_block, 239 int nobh_writepage(struct page *page, get_block_t *get_block,
239 struct writeback_control *wbc); 240 struct writeback_control *wbc);
240 241
241 void buffer_init(void); 242 void buffer_init(void);
242 243
243 /* 244 /*
244 * inline definitions 245 * inline definitions
245 */ 246 */
246 247
247 static inline void attach_page_buffers(struct page *page, 248 static inline void attach_page_buffers(struct page *page,
248 struct buffer_head *head) 249 struct buffer_head *head)
249 { 250 {
250 page_cache_get(page); 251 page_cache_get(page);
251 SetPagePrivate(page); 252 SetPagePrivate(page);
252 set_page_private(page, (unsigned long)head); 253 set_page_private(page, (unsigned long)head);
253 } 254 }
254 255
255 static inline void get_bh(struct buffer_head *bh) 256 static inline void get_bh(struct buffer_head *bh)
256 { 257 {
257 atomic_inc(&bh->b_count); 258 atomic_inc(&bh->b_count);
258 } 259 }
259 260
260 static inline void put_bh(struct buffer_head *bh) 261 static inline void put_bh(struct buffer_head *bh)
261 { 262 {
262 smp_mb__before_atomic_dec(); 263 smp_mb__before_atomic_dec();
263 atomic_dec(&bh->b_count); 264 atomic_dec(&bh->b_count);
264 } 265 }
265 266
266 static inline void brelse(struct buffer_head *bh) 267 static inline void brelse(struct buffer_head *bh)
267 { 268 {
268 if (bh) 269 if (bh)
269 __brelse(bh); 270 __brelse(bh);
270 } 271 }
271 272
272 static inline void bforget(struct buffer_head *bh) 273 static inline void bforget(struct buffer_head *bh)
273 { 274 {
274 if (bh) 275 if (bh)
275 __bforget(bh); 276 __bforget(bh);
276 } 277 }
277 278
278 static inline struct buffer_head * 279 static inline struct buffer_head *
279 sb_bread(struct super_block *sb, sector_t block) 280 sb_bread(struct super_block *sb, sector_t block)
280 { 281 {
281 return __bread(sb->s_bdev, block, sb->s_blocksize); 282 return __bread(sb->s_bdev, block, sb->s_blocksize);
282 } 283 }
283 284
284 static inline void 285 static inline void
285 sb_breadahead(struct super_block *sb, sector_t block) 286 sb_breadahead(struct super_block *sb, sector_t block)
286 { 287 {
287 __breadahead(sb->s_bdev, block, sb->s_blocksize); 288 __breadahead(sb->s_bdev, block, sb->s_blocksize);
288 } 289 }
289 290
290 static inline struct buffer_head * 291 static inline struct buffer_head *
291 sb_getblk(struct super_block *sb, sector_t block) 292 sb_getblk(struct super_block *sb, sector_t block)
292 { 293 {
293 return __getblk(sb->s_bdev, block, sb->s_blocksize); 294 return __getblk(sb->s_bdev, block, sb->s_blocksize);
294 } 295 }
295 296
296 static inline struct buffer_head * 297 static inline struct buffer_head *
297 sb_find_get_block(struct super_block *sb, sector_t block) 298 sb_find_get_block(struct super_block *sb, sector_t block)
298 { 299 {
299 return __find_get_block(sb->s_bdev, block, sb->s_blocksize); 300 return __find_get_block(sb->s_bdev, block, sb->s_blocksize);
300 } 301 }
301 302
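
The sb_* wrappers above cover the common metadata access patterns. For completeness, the classic read-check-release sequence a filesystem would use with them; purely illustrative, the function name is made up:

#include <linux/buffer_head.h>

/* Illustration only: read one metadata block and release the reference. */
static int example_read_block(struct super_block *sb, sector_t block)
{
	struct buffer_head *bh = sb_bread(sb, block);

	if (!bh)
		return -EIO;	/* read failed or buffer never became uptodate */

	/* ... inspect bh->b_data here ... */

	brelse(bh);		/* drop the reference taken by sb_bread() */
	return 0;
}
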
302 static inline void 303 static inline void
303 map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block) 304 map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block)
304 { 305 {
305 set_buffer_mapped(bh); 306 set_buffer_mapped(bh);
306 bh->b_bdev = sb->s_bdev; 307 bh->b_bdev = sb->s_bdev;
307 bh->b_blocknr = block; 308 bh->b_blocknr = block;
308 bh->b_size = sb->s_blocksize; 309 bh->b_size = sb->s_blocksize;
309 } 310 }
310 311
311 /* 312 /*
312 * Calling wait_on_buffer() for a zero-ref buffer is illegal, so we call into 313 * Calling wait_on_buffer() for a zero-ref buffer is illegal, so we call into
313 * __wait_on_buffer() just to trip a debug check. Because debug code in inline 314 * __wait_on_buffer() just to trip a debug check. Because debug code in inline
314 * functions is bloaty. 315 * functions is bloaty.
315 */ 316 */
316 static inline void wait_on_buffer(struct buffer_head *bh) 317 static inline void wait_on_buffer(struct buffer_head *bh)
317 { 318 {
318 might_sleep(); 319 might_sleep();
319 if (buffer_locked(bh) || atomic_read(&bh->b_count) == 0) 320 if (buffer_locked(bh) || atomic_read(&bh->b_count) == 0)
320 __wait_on_buffer(bh); 321 __wait_on_buffer(bh);
321 } 322 }
322 323
323 static inline int trylock_buffer(struct buffer_head *bh) 324 static inline int trylock_buffer(struct buffer_head *bh)
324 { 325 {
325 return likely(!test_and_set_bit_lock(BH_Lock, &bh->b_state)); 326 return likely(!test_and_set_bit_lock(BH_Lock, &bh->b_state));
326 } 327 }
327 328
328 static inline void lock_buffer(struct buffer_head *bh) 329 static inline void lock_buffer(struct buffer_head *bh)
329 { 330 {
330 might_sleep(); 331 might_sleep();
331 if (!trylock_buffer(bh)) 332 if (!trylock_buffer(bh))
332 __lock_buffer(bh); 333 __lock_buffer(bh);
333 } 334 }
334 335
335 extern int __set_page_dirty_buffers(struct page *page); 336 extern int __set_page_dirty_buffers(struct page *page);
336 337
337 #else /* CONFIG_BLOCK */ 338 #else /* CONFIG_BLOCK */
338 339
339 static inline void buffer_init(void) {} 340 static inline void buffer_init(void) {}
340 static inline int try_to_free_buffers(struct page *page) { return 1; } 341 static inline int try_to_free_buffers(struct page *page) { return 1; }
341 static inline int sync_blockdev(struct block_device *bdev) { return 0; } 342 static inline int sync_blockdev(struct block_device *bdev) { return 0; }
342 static inline int inode_has_buffers(struct inode *inode) { return 0; } 343 static inline int inode_has_buffers(struct inode *inode) { return 0; }
343 static inline void invalidate_inode_buffers(struct inode *inode) {} 344 static inline void invalidate_inode_buffers(struct inode *inode) {}
344 static inline int remove_inode_buffers(struct inode *inode) { return 1; } 345 static inline int remove_inode_buffers(struct inode *inode) { return 1; }
345 static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } 346 static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; }
346 static inline void invalidate_bdev(struct block_device *bdev) {} 347 static inline void invalidate_bdev(struct block_device *bdev) {}
347 348
348 349
349 #endif /* CONFIG_BLOCK */ 350 #endif /* CONFIG_BLOCK */
350 #endif /* _LINUX_BUFFER_HEAD_H */ 351 #endif /* _LINUX_BUFFER_HEAD_H */
351 352
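
The hunks above only add the flag definitions; the code that consults them lives elsewhere. As a hedged sketch of the idea, a buffer-layer error path can check BH_Quiet before printing, roughly like the helper below. This is an illustration only, not the patch's own code, and the helper names are assumptions:

#include <linux/buffer_head.h>
#include <linux/kernel.h>

/*
 * Illustration only: decide whether a "Buffer I/O error" style message should
 * be printed for this buffer_head. Quiet buffers and ratelimited callers stay
 * silent.
 */
static int example_quiet_error(struct buffer_head *bh)
{
	if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
		return 0;	/* not quiet and within the ratelimit: do print */
	return 1;		/* stay silent */
}

static void example_report_write_error(struct buffer_head *bh)
{
	char b[BDEVNAME_SIZE];

	if (!example_quiet_error(bh))
		printk(KERN_WARNING "lost page write due to I/O error on %s\n",
		       bdevname(bh->b_bdev, b));
}
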