Commit 29a814d2ee0e43c2980f33f91c1311ec06c0aa35

Authored by Alex Tomas
Committed by Theodore Ts'o
1 parent 87c89c232c

vfs: add hooks for ext4's delayed allocation support

Export mpage_bio_submit() and __mpage_writepage() for the benefit of
ext4's delayed allocation support. Also change __block_write_full_page
so that if a buffer has the BH_Delay flag set, it calls get_block() to
get the physical block allocated, just as in the !BH_Mapped case.

Signed-off-by: Alex Tomas <alex@clusterfs.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

Showing 3 changed files with 20 additions and 11 deletions
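
On the buffer.c side, the change is the BH_Delay handling in
__block_write_full_page(). A hedged paraphrase of the new check, not the
literal hunk (the surrounding loop, the block counter and the recover label
are abbreviated from context), looks roughly like this:

	/*
	 * Sketch of the behaviour described in the commit message: a dirty
	 * buffer marked BH_Delay is now passed to get_block() with
	 * create == 1, so the physical block is allocated at writepage
	 * time, just as an unmapped (!BH_Mapped) dirty buffer already was.
	 */
	if (buffer_dirty(bh) && (!buffer_mapped(bh) || buffer_delay(bh))) {
		err = get_block(inode, block, bh, 1);
		if (err)
			goto recover;	/* existing error path in that function */
	}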

1 /* 1 /*
2 * linux/fs/buffer.c 2 * linux/fs/buffer.c
3 * 3 *
4 * Copyright (C) 1991, 1992, 2002 Linus Torvalds 4 * Copyright (C) 1991, 1992, 2002 Linus Torvalds
5 */ 5 */
6 6
7 /* 7 /*
8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
9 * 9 *
10 * Removed a lot of unnecessary code and simplified things now that 10 * Removed a lot of unnecessary code and simplified things now that
11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96 11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
12 * 12 *
13 * Speed up hash, lru, and free list operations. Use gfp() for allocating 13 * Speed up hash, lru, and free list operations. Use gfp() for allocating
14 * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM 14 * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
15 * 15 *
16 * Added 32k buffer block sizes - these are required older ARM systems. - RMK 16 * Added 32k buffer block sizes - these are required older ARM systems. - RMK
17 * 17 *
18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> 18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
19 */ 19 */
20 20
21 #include <linux/kernel.h> 21 #include <linux/kernel.h>
22 #include <linux/syscalls.h> 22 #include <linux/syscalls.h>
23 #include <linux/fs.h> 23 #include <linux/fs.h>
24 #include <linux/mm.h> 24 #include <linux/mm.h>
25 #include <linux/percpu.h> 25 #include <linux/percpu.h>
26 #include <linux/slab.h> 26 #include <linux/slab.h>
27 #include <linux/capability.h> 27 #include <linux/capability.h>
28 #include <linux/blkdev.h> 28 #include <linux/blkdev.h>
29 #include <linux/file.h> 29 #include <linux/file.h>
30 #include <linux/quotaops.h> 30 #include <linux/quotaops.h>
31 #include <linux/highmem.h> 31 #include <linux/highmem.h>
32 #include <linux/module.h> 32 #include <linux/module.h>
33 #include <linux/writeback.h> 33 #include <linux/writeback.h>
34 #include <linux/hash.h> 34 #include <linux/hash.h>
35 #include <linux/suspend.h> 35 #include <linux/suspend.h>
36 #include <linux/buffer_head.h> 36 #include <linux/buffer_head.h>
37 #include <linux/task_io_accounting_ops.h> 37 #include <linux/task_io_accounting_ops.h>
38 #include <linux/bio.h> 38 #include <linux/bio.h>
39 #include <linux/notifier.h> 39 #include <linux/notifier.h>
40 #include <linux/cpu.h> 40 #include <linux/cpu.h>
41 #include <linux/bitops.h> 41 #include <linux/bitops.h>
42 #include <linux/mpage.h> 42 #include <linux/mpage.h>
43 #include <linux/bit_spinlock.h> 43 #include <linux/bit_spinlock.h>
44 44
45 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); 45 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
46 46
47 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) 47 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
48 48
49 inline void 49 inline void
50 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) 50 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
51 { 51 {
52 bh->b_end_io = handler; 52 bh->b_end_io = handler;
53 bh->b_private = private; 53 bh->b_private = private;
54 } 54 }
55 55
56 static int sync_buffer(void *word) 56 static int sync_buffer(void *word)
57 { 57 {
58 struct block_device *bd; 58 struct block_device *bd;
59 struct buffer_head *bh 59 struct buffer_head *bh
60 = container_of(word, struct buffer_head, b_state); 60 = container_of(word, struct buffer_head, b_state);
61 61
62 smp_mb(); 62 smp_mb();
63 bd = bh->b_bdev; 63 bd = bh->b_bdev;
64 if (bd) 64 if (bd)
65 blk_run_address_space(bd->bd_inode->i_mapping); 65 blk_run_address_space(bd->bd_inode->i_mapping);
66 io_schedule(); 66 io_schedule();
67 return 0; 67 return 0;
68 } 68 }
69 69
70 void __lock_buffer(struct buffer_head *bh) 70 void __lock_buffer(struct buffer_head *bh)
71 { 71 {
72 wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer, 72 wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
73 TASK_UNINTERRUPTIBLE); 73 TASK_UNINTERRUPTIBLE);
74 } 74 }
75 EXPORT_SYMBOL(__lock_buffer); 75 EXPORT_SYMBOL(__lock_buffer);
76 76
77 void unlock_buffer(struct buffer_head *bh) 77 void unlock_buffer(struct buffer_head *bh)
78 { 78 {
79 smp_mb__before_clear_bit(); 79 smp_mb__before_clear_bit();
80 clear_buffer_locked(bh); 80 clear_buffer_locked(bh);
81 smp_mb__after_clear_bit(); 81 smp_mb__after_clear_bit();
82 wake_up_bit(&bh->b_state, BH_Lock); 82 wake_up_bit(&bh->b_state, BH_Lock);
83 } 83 }
84 84
85 /* 85 /*
86 * Block until a buffer comes unlocked. This doesn't stop it 86 * Block until a buffer comes unlocked. This doesn't stop it
87 * from becoming locked again - you have to lock it yourself 87 * from becoming locked again - you have to lock it yourself
88 * if you want to preserve its state. 88 * if you want to preserve its state.
89 */ 89 */
90 void __wait_on_buffer(struct buffer_head * bh) 90 void __wait_on_buffer(struct buffer_head * bh)
91 { 91 {
92 wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE); 92 wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
93 } 93 }
94 94
95 static void 95 static void
96 __clear_page_buffers(struct page *page) 96 __clear_page_buffers(struct page *page)
97 { 97 {
98 ClearPagePrivate(page); 98 ClearPagePrivate(page);
99 set_page_private(page, 0); 99 set_page_private(page, 0);
100 page_cache_release(page); 100 page_cache_release(page);
101 } 101 }
102 102
103 static void buffer_io_error(struct buffer_head *bh) 103 static void buffer_io_error(struct buffer_head *bh)
104 { 104 {
105 char b[BDEVNAME_SIZE]; 105 char b[BDEVNAME_SIZE];
106 106
107 printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n", 107 printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
108 bdevname(bh->b_bdev, b), 108 bdevname(bh->b_bdev, b),
109 (unsigned long long)bh->b_blocknr); 109 (unsigned long long)bh->b_blocknr);
110 } 110 }
111 111
112 /* 112 /*
113 * End-of-IO handler helper function which does not touch the bh after 113 * End-of-IO handler helper function which does not touch the bh after
114 * unlocking it. 114 * unlocking it.
115 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but 115 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
116 * a race there is benign: unlock_buffer() only use the bh's address for 116 * a race there is benign: unlock_buffer() only use the bh's address for
117 * hashing after unlocking the buffer, so it doesn't actually touch the bh 117 * hashing after unlocking the buffer, so it doesn't actually touch the bh
118 * itself. 118 * itself.
119 */ 119 */
120 static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) 120 static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
121 { 121 {
122 if (uptodate) { 122 if (uptodate) {
123 set_buffer_uptodate(bh); 123 set_buffer_uptodate(bh);
124 } else { 124 } else {
125 /* This happens, due to failed READA attempts. */ 125 /* This happens, due to failed READA attempts. */
126 clear_buffer_uptodate(bh); 126 clear_buffer_uptodate(bh);
127 } 127 }
128 unlock_buffer(bh); 128 unlock_buffer(bh);
129 } 129 }
130 130
131 /* 131 /*
132 * Default synchronous end-of-IO handler.. Just mark it up-to-date and 132 * Default synchronous end-of-IO handler.. Just mark it up-to-date and
133 * unlock the buffer. This is what ll_rw_block uses too. 133 * unlock the buffer. This is what ll_rw_block uses too.
134 */ 134 */
135 void end_buffer_read_sync(struct buffer_head *bh, int uptodate) 135 void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
136 { 136 {
137 __end_buffer_read_notouch(bh, uptodate); 137 __end_buffer_read_notouch(bh, uptodate);
138 put_bh(bh); 138 put_bh(bh);
139 } 139 }
140 140
141 void end_buffer_write_sync(struct buffer_head *bh, int uptodate) 141 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
142 { 142 {
143 char b[BDEVNAME_SIZE]; 143 char b[BDEVNAME_SIZE];
144 144
145 if (uptodate) { 145 if (uptodate) {
146 set_buffer_uptodate(bh); 146 set_buffer_uptodate(bh);
147 } else { 147 } else {
148 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) { 148 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
149 buffer_io_error(bh); 149 buffer_io_error(bh);
150 printk(KERN_WARNING "lost page write due to " 150 printk(KERN_WARNING "lost page write due to "
151 "I/O error on %s\n", 151 "I/O error on %s\n",
152 bdevname(bh->b_bdev, b)); 152 bdevname(bh->b_bdev, b));
153 } 153 }
154 set_buffer_write_io_error(bh); 154 set_buffer_write_io_error(bh);
155 clear_buffer_uptodate(bh); 155 clear_buffer_uptodate(bh);
156 } 156 }
157 unlock_buffer(bh); 157 unlock_buffer(bh);
158 put_bh(bh); 158 put_bh(bh);
159 } 159 }
160 160
161 /* 161 /*
162 * Write out and wait upon all the dirty data associated with a block 162 * Write out and wait upon all the dirty data associated with a block
163 * device via its mapping. Does not take the superblock lock. 163 * device via its mapping. Does not take the superblock lock.
164 */ 164 */
165 int sync_blockdev(struct block_device *bdev) 165 int sync_blockdev(struct block_device *bdev)
166 { 166 {
167 int ret = 0; 167 int ret = 0;
168 168
169 if (bdev) 169 if (bdev)
170 ret = filemap_write_and_wait(bdev->bd_inode->i_mapping); 170 ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
171 return ret; 171 return ret;
172 } 172 }
173 EXPORT_SYMBOL(sync_blockdev); 173 EXPORT_SYMBOL(sync_blockdev);
174 174
175 /* 175 /*
176 * Write out and wait upon all dirty data associated with this 176 * Write out and wait upon all dirty data associated with this
177 * device. Filesystem data as well as the underlying block 177 * device. Filesystem data as well as the underlying block
178 * device. Takes the superblock lock. 178 * device. Takes the superblock lock.
179 */ 179 */
180 int fsync_bdev(struct block_device *bdev) 180 int fsync_bdev(struct block_device *bdev)
181 { 181 {
182 struct super_block *sb = get_super(bdev); 182 struct super_block *sb = get_super(bdev);
183 if (sb) { 183 if (sb) {
184 int res = fsync_super(sb); 184 int res = fsync_super(sb);
185 drop_super(sb); 185 drop_super(sb);
186 return res; 186 return res;
187 } 187 }
188 return sync_blockdev(bdev); 188 return sync_blockdev(bdev);
189 } 189 }
190 190
191 /** 191 /**
192 * freeze_bdev -- lock a filesystem and force it into a consistent state 192 * freeze_bdev -- lock a filesystem and force it into a consistent state
193 * @bdev: blockdevice to lock 193 * @bdev: blockdevice to lock
194 * 194 *
195 * This takes the block device bd_mount_sem to make sure no new mounts 195 * This takes the block device bd_mount_sem to make sure no new mounts
196 * happen on bdev until thaw_bdev() is called. 196 * happen on bdev until thaw_bdev() is called.
197 * If a superblock is found on this device, we take the s_umount semaphore 197 * If a superblock is found on this device, we take the s_umount semaphore
198 * on it to make sure nobody unmounts until the snapshot creation is done. 198 * on it to make sure nobody unmounts until the snapshot creation is done.
199 */ 199 */
200 struct super_block *freeze_bdev(struct block_device *bdev) 200 struct super_block *freeze_bdev(struct block_device *bdev)
201 { 201 {
202 struct super_block *sb; 202 struct super_block *sb;
203 203
204 down(&bdev->bd_mount_sem); 204 down(&bdev->bd_mount_sem);
205 sb = get_super(bdev); 205 sb = get_super(bdev);
206 if (sb && !(sb->s_flags & MS_RDONLY)) { 206 if (sb && !(sb->s_flags & MS_RDONLY)) {
207 sb->s_frozen = SB_FREEZE_WRITE; 207 sb->s_frozen = SB_FREEZE_WRITE;
208 smp_wmb(); 208 smp_wmb();
209 209
210 __fsync_super(sb); 210 __fsync_super(sb);
211 211
212 sb->s_frozen = SB_FREEZE_TRANS; 212 sb->s_frozen = SB_FREEZE_TRANS;
213 smp_wmb(); 213 smp_wmb();
214 214
215 sync_blockdev(sb->s_bdev); 215 sync_blockdev(sb->s_bdev);
216 216
217 if (sb->s_op->write_super_lockfs) 217 if (sb->s_op->write_super_lockfs)
218 sb->s_op->write_super_lockfs(sb); 218 sb->s_op->write_super_lockfs(sb);
219 } 219 }
220 220
221 sync_blockdev(bdev); 221 sync_blockdev(bdev);
222 return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ 222 return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
223 } 223 }
224 EXPORT_SYMBOL(freeze_bdev); 224 EXPORT_SYMBOL(freeze_bdev);
225 225
226 /** 226 /**
227 * thaw_bdev -- unlock filesystem 227 * thaw_bdev -- unlock filesystem
228 * @bdev: blockdevice to unlock 228 * @bdev: blockdevice to unlock
229 * @sb: associated superblock 229 * @sb: associated superblock
230 * 230 *
231 * Unlocks the filesystem and marks it writeable again after freeze_bdev(). 231 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
232 */ 232 */
233 void thaw_bdev(struct block_device *bdev, struct super_block *sb) 233 void thaw_bdev(struct block_device *bdev, struct super_block *sb)
234 { 234 {
235 if (sb) { 235 if (sb) {
236 BUG_ON(sb->s_bdev != bdev); 236 BUG_ON(sb->s_bdev != bdev);
237 237
238 if (sb->s_op->unlockfs) 238 if (sb->s_op->unlockfs)
239 sb->s_op->unlockfs(sb); 239 sb->s_op->unlockfs(sb);
240 sb->s_frozen = SB_UNFROZEN; 240 sb->s_frozen = SB_UNFROZEN;
241 smp_wmb(); 241 smp_wmb();
242 wake_up(&sb->s_wait_unfrozen); 242 wake_up(&sb->s_wait_unfrozen);
243 drop_super(sb); 243 drop_super(sb);
244 } 244 }
245 245
246 up(&bdev->bd_mount_sem); 246 up(&bdev->bd_mount_sem);
247 } 247 }
248 EXPORT_SYMBOL(thaw_bdev); 248 EXPORT_SYMBOL(thaw_bdev);
249 249
250 /* 250 /*
251 * Various filesystems appear to want __find_get_block to be non-blocking. 251 * Various filesystems appear to want __find_get_block to be non-blocking.
252 * But it's the page lock which protects the buffers. To get around this, 252 * But it's the page lock which protects the buffers. To get around this,
253 * we get exclusion from try_to_free_buffers with the blockdev mapping's 253 * we get exclusion from try_to_free_buffers with the blockdev mapping's
254 * private_lock. 254 * private_lock.
255 * 255 *
256 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention 256 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
257 * may be quite high. This code could TryLock the page, and if that 257 * may be quite high. This code could TryLock the page, and if that
258 * succeeds, there is no need to take private_lock. (But if 258 * succeeds, there is no need to take private_lock. (But if
259 * private_lock is contended then so is mapping->tree_lock). 259 * private_lock is contended then so is mapping->tree_lock).
260 */ 260 */
261 static struct buffer_head * 261 static struct buffer_head *
262 __find_get_block_slow(struct block_device *bdev, sector_t block) 262 __find_get_block_slow(struct block_device *bdev, sector_t block)
263 { 263 {
264 struct inode *bd_inode = bdev->bd_inode; 264 struct inode *bd_inode = bdev->bd_inode;
265 struct address_space *bd_mapping = bd_inode->i_mapping; 265 struct address_space *bd_mapping = bd_inode->i_mapping;
266 struct buffer_head *ret = NULL; 266 struct buffer_head *ret = NULL;
267 pgoff_t index; 267 pgoff_t index;
268 struct buffer_head *bh; 268 struct buffer_head *bh;
269 struct buffer_head *head; 269 struct buffer_head *head;
270 struct page *page; 270 struct page *page;
271 int all_mapped = 1; 271 int all_mapped = 1;
272 272
273 index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits); 273 index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
274 page = find_get_page(bd_mapping, index); 274 page = find_get_page(bd_mapping, index);
275 if (!page) 275 if (!page)
276 goto out; 276 goto out;
277 277
278 spin_lock(&bd_mapping->private_lock); 278 spin_lock(&bd_mapping->private_lock);
279 if (!page_has_buffers(page)) 279 if (!page_has_buffers(page))
280 goto out_unlock; 280 goto out_unlock;
281 head = page_buffers(page); 281 head = page_buffers(page);
282 bh = head; 282 bh = head;
283 do { 283 do {
284 if (bh->b_blocknr == block) { 284 if (bh->b_blocknr == block) {
285 ret = bh; 285 ret = bh;
286 get_bh(bh); 286 get_bh(bh);
287 goto out_unlock; 287 goto out_unlock;
288 } 288 }
289 if (!buffer_mapped(bh)) 289 if (!buffer_mapped(bh))
290 all_mapped = 0; 290 all_mapped = 0;
291 bh = bh->b_this_page; 291 bh = bh->b_this_page;
292 } while (bh != head); 292 } while (bh != head);
293 293
294 /* we might be here because some of the buffers on this page are 294 /* we might be here because some of the buffers on this page are
295 * not mapped. This is due to various races between 295 * not mapped. This is due to various races between
296 * file io on the block device and getblk. It gets dealt with 296 * file io on the block device and getblk. It gets dealt with
297 * elsewhere, don't buffer_error if we had some unmapped buffers 297 * elsewhere, don't buffer_error if we had some unmapped buffers
298 */ 298 */
299 if (all_mapped) { 299 if (all_mapped) {
300 printk("__find_get_block_slow() failed. " 300 printk("__find_get_block_slow() failed. "
301 "block=%llu, b_blocknr=%llu\n", 301 "block=%llu, b_blocknr=%llu\n",
302 (unsigned long long)block, 302 (unsigned long long)block,
303 (unsigned long long)bh->b_blocknr); 303 (unsigned long long)bh->b_blocknr);
304 printk("b_state=0x%08lx, b_size=%zu\n", 304 printk("b_state=0x%08lx, b_size=%zu\n",
305 bh->b_state, bh->b_size); 305 bh->b_state, bh->b_size);
306 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits); 306 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
307 } 307 }
308 out_unlock: 308 out_unlock:
309 spin_unlock(&bd_mapping->private_lock); 309 spin_unlock(&bd_mapping->private_lock);
310 page_cache_release(page); 310 page_cache_release(page);
311 out: 311 out:
312 return ret; 312 return ret;
313 } 313 }
314 314
315 /* If invalidate_buffers() will trash dirty buffers, it means some kind 315 /* If invalidate_buffers() will trash dirty buffers, it means some kind
316 of fs corruption is going on. Trashing dirty data always imply losing 316 of fs corruption is going on. Trashing dirty data always imply losing
317 information that was supposed to be just stored on the physical layer 317 information that was supposed to be just stored on the physical layer
318 by the user. 318 by the user.
319 319
320 Thus invalidate_buffers in general usage is not allwowed to trash 320 Thus invalidate_buffers in general usage is not allwowed to trash
321 dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to 321 dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
322 be preserved. These buffers are simply skipped. 322 be preserved. These buffers are simply skipped.
323 323
324 We also skip buffers which are still in use. For example this can 324 We also skip buffers which are still in use. For example this can
325 happen if a userspace program is reading the block device. 325 happen if a userspace program is reading the block device.
326 326
327 NOTE: In the case where the user removed a removable-media-disk even if 327 NOTE: In the case where the user removed a removable-media-disk even if
328 there's still dirty data not synced on disk (due a bug in the device driver 328 there's still dirty data not synced on disk (due a bug in the device driver
329 or due an error of the user), by not destroying the dirty buffers we could 329 or due an error of the user), by not destroying the dirty buffers we could
330 generate corruption also on the next media inserted, thus a parameter is 330 generate corruption also on the next media inserted, thus a parameter is
331 necessary to handle this case in the most safe way possible (trying 331 necessary to handle this case in the most safe way possible (trying
332 to not corrupt also the new disk inserted with the data belonging to 332 to not corrupt also the new disk inserted with the data belonging to
333 the old now corrupted disk). Also for the ramdisk the natural thing 333 the old now corrupted disk). Also for the ramdisk the natural thing
334 to do in order to release the ramdisk memory is to destroy dirty buffers. 334 to do in order to release the ramdisk memory is to destroy dirty buffers.
335 335
336 These are two special cases. Normal usage imply the device driver 336 These are two special cases. Normal usage imply the device driver
337 to issue a sync on the device (without waiting I/O completion) and 337 to issue a sync on the device (without waiting I/O completion) and
338 then an invalidate_buffers call that doesn't trash dirty buffers. 338 then an invalidate_buffers call that doesn't trash dirty buffers.
339 339
340 For handling cache coherency with the blkdev pagecache the 'update' case 340 For handling cache coherency with the blkdev pagecache the 'update' case
341 is been introduced. It is needed to re-read from disk any pinned 341 is been introduced. It is needed to re-read from disk any pinned
342 buffer. NOTE: re-reading from disk is destructive so we can do it only 342 buffer. NOTE: re-reading from disk is destructive so we can do it only
343 when we assume nobody is changing the buffercache under our I/O and when 343 when we assume nobody is changing the buffercache under our I/O and when
344 we think the disk contains more recent information than the buffercache. 344 we think the disk contains more recent information than the buffercache.
345 The update == 1 pass marks the buffers we need to update, the update == 2 345 The update == 1 pass marks the buffers we need to update, the update == 2
346 pass does the actual I/O. */ 346 pass does the actual I/O. */
347 void invalidate_bdev(struct block_device *bdev) 347 void invalidate_bdev(struct block_device *bdev)
348 { 348 {
349 struct address_space *mapping = bdev->bd_inode->i_mapping; 349 struct address_space *mapping = bdev->bd_inode->i_mapping;
350 350
351 if (mapping->nrpages == 0) 351 if (mapping->nrpages == 0)
352 return; 352 return;
353 353
354 invalidate_bh_lrus(); 354 invalidate_bh_lrus();
355 invalidate_mapping_pages(mapping, 0, -1); 355 invalidate_mapping_pages(mapping, 0, -1);
356 } 356 }
357 357
358 /* 358 /*
359 * Kick pdflush then try to free up some ZONE_NORMAL memory. 359 * Kick pdflush then try to free up some ZONE_NORMAL memory.
360 */ 360 */
361 static void free_more_memory(void) 361 static void free_more_memory(void)
362 { 362 {
363 struct zone *zone; 363 struct zone *zone;
364 int nid; 364 int nid;
365 365
366 wakeup_pdflush(1024); 366 wakeup_pdflush(1024);
367 yield(); 367 yield();
368 368
369 for_each_online_node(nid) { 369 for_each_online_node(nid) {
370 (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS), 370 (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
371 gfp_zone(GFP_NOFS), NULL, 371 gfp_zone(GFP_NOFS), NULL,
372 &zone); 372 &zone);
373 if (zone) 373 if (zone)
374 try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, 374 try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
375 GFP_NOFS); 375 GFP_NOFS);
376 } 376 }
377 } 377 }
378 378
379 /* 379 /*
380 * I/O completion handler for block_read_full_page() - pages 380 * I/O completion handler for block_read_full_page() - pages
381 * which come unlocked at the end of I/O. 381 * which come unlocked at the end of I/O.
382 */ 382 */
383 static void end_buffer_async_read(struct buffer_head *bh, int uptodate) 383 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
384 { 384 {
385 unsigned long flags; 385 unsigned long flags;
386 struct buffer_head *first; 386 struct buffer_head *first;
387 struct buffer_head *tmp; 387 struct buffer_head *tmp;
388 struct page *page; 388 struct page *page;
389 int page_uptodate = 1; 389 int page_uptodate = 1;
390 390
391 BUG_ON(!buffer_async_read(bh)); 391 BUG_ON(!buffer_async_read(bh));
392 392
393 page = bh->b_page; 393 page = bh->b_page;
394 if (uptodate) { 394 if (uptodate) {
395 set_buffer_uptodate(bh); 395 set_buffer_uptodate(bh);
396 } else { 396 } else {
397 clear_buffer_uptodate(bh); 397 clear_buffer_uptodate(bh);
398 if (printk_ratelimit()) 398 if (printk_ratelimit())
399 buffer_io_error(bh); 399 buffer_io_error(bh);
400 SetPageError(page); 400 SetPageError(page);
401 } 401 }
402 402
403 /* 403 /*
404 * Be _very_ careful from here on. Bad things can happen if 404 * Be _very_ careful from here on. Bad things can happen if
405 * two buffer heads end IO at almost the same time and both 405 * two buffer heads end IO at almost the same time and both
406 * decide that the page is now completely done. 406 * decide that the page is now completely done.
407 */ 407 */
408 first = page_buffers(page); 408 first = page_buffers(page);
409 local_irq_save(flags); 409 local_irq_save(flags);
410 bit_spin_lock(BH_Uptodate_Lock, &first->b_state); 410 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
411 clear_buffer_async_read(bh); 411 clear_buffer_async_read(bh);
412 unlock_buffer(bh); 412 unlock_buffer(bh);
413 tmp = bh; 413 tmp = bh;
414 do { 414 do {
415 if (!buffer_uptodate(tmp)) 415 if (!buffer_uptodate(tmp))
416 page_uptodate = 0; 416 page_uptodate = 0;
417 if (buffer_async_read(tmp)) { 417 if (buffer_async_read(tmp)) {
418 BUG_ON(!buffer_locked(tmp)); 418 BUG_ON(!buffer_locked(tmp));
419 goto still_busy; 419 goto still_busy;
420 } 420 }
421 tmp = tmp->b_this_page; 421 tmp = tmp->b_this_page;
422 } while (tmp != bh); 422 } while (tmp != bh);
423 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); 423 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
424 local_irq_restore(flags); 424 local_irq_restore(flags);
425 425
426 /* 426 /*
427 * If none of the buffers had errors and they are all 427 * If none of the buffers had errors and they are all
428 * uptodate then we can set the page uptodate. 428 * uptodate then we can set the page uptodate.
429 */ 429 */
430 if (page_uptodate && !PageError(page)) 430 if (page_uptodate && !PageError(page))
431 SetPageUptodate(page); 431 SetPageUptodate(page);
432 unlock_page(page); 432 unlock_page(page);
433 return; 433 return;
434 434
435 still_busy: 435 still_busy:
436 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); 436 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
437 local_irq_restore(flags); 437 local_irq_restore(flags);
438 return; 438 return;
439 } 439 }
440 440
441 /* 441 /*
442 * Completion handler for block_write_full_page() - pages which are unlocked 442 * Completion handler for block_write_full_page() - pages which are unlocked
443 * during I/O, and which have PageWriteback cleared upon I/O completion. 443 * during I/O, and which have PageWriteback cleared upon I/O completion.
444 */ 444 */
445 static void end_buffer_async_write(struct buffer_head *bh, int uptodate) 445 static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
446 { 446 {
447 char b[BDEVNAME_SIZE]; 447 char b[BDEVNAME_SIZE];
448 unsigned long flags; 448 unsigned long flags;
449 struct buffer_head *first; 449 struct buffer_head *first;
450 struct buffer_head *tmp; 450 struct buffer_head *tmp;
451 struct page *page; 451 struct page *page;
452 452
453 BUG_ON(!buffer_async_write(bh)); 453 BUG_ON(!buffer_async_write(bh));
454 454
455 page = bh->b_page; 455 page = bh->b_page;
456 if (uptodate) { 456 if (uptodate) {
457 set_buffer_uptodate(bh); 457 set_buffer_uptodate(bh);
458 } else { 458 } else {
459 if (printk_ratelimit()) { 459 if (printk_ratelimit()) {
460 buffer_io_error(bh); 460 buffer_io_error(bh);
461 printk(KERN_WARNING "lost page write due to " 461 printk(KERN_WARNING "lost page write due to "
462 "I/O error on %s\n", 462 "I/O error on %s\n",
463 bdevname(bh->b_bdev, b)); 463 bdevname(bh->b_bdev, b));
464 } 464 }
465 set_bit(AS_EIO, &page->mapping->flags); 465 set_bit(AS_EIO, &page->mapping->flags);
466 set_buffer_write_io_error(bh); 466 set_buffer_write_io_error(bh);
467 clear_buffer_uptodate(bh); 467 clear_buffer_uptodate(bh);
468 SetPageError(page); 468 SetPageError(page);
469 } 469 }
470 470
471 first = page_buffers(page); 471 first = page_buffers(page);
472 local_irq_save(flags); 472 local_irq_save(flags);
473 bit_spin_lock(BH_Uptodate_Lock, &first->b_state); 473 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
474 474
475 clear_buffer_async_write(bh); 475 clear_buffer_async_write(bh);
476 unlock_buffer(bh); 476 unlock_buffer(bh);
477 tmp = bh->b_this_page; 477 tmp = bh->b_this_page;
478 while (tmp != bh) { 478 while (tmp != bh) {
479 if (buffer_async_write(tmp)) { 479 if (buffer_async_write(tmp)) {
480 BUG_ON(!buffer_locked(tmp)); 480 BUG_ON(!buffer_locked(tmp));
481 goto still_busy; 481 goto still_busy;
482 } 482 }
483 tmp = tmp->b_this_page; 483 tmp = tmp->b_this_page;
484 } 484 }
485 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); 485 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
486 local_irq_restore(flags); 486 local_irq_restore(flags);
487 end_page_writeback(page); 487 end_page_writeback(page);
488 return; 488 return;
489 489
490 still_busy: 490 still_busy:
491 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); 491 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
492 local_irq_restore(flags); 492 local_irq_restore(flags);
493 return; 493 return;
494 } 494 }
495 495
496 /* 496 /*
497 * If a page's buffers are under async readin (end_buffer_async_read 497 * If a page's buffers are under async readin (end_buffer_async_read
498 * completion) then there is a possibility that another thread of 498 * completion) then there is a possibility that another thread of
499 * control could lock one of the buffers after it has completed 499 * control could lock one of the buffers after it has completed
500 * but while some of the other buffers have not completed. This 500 * but while some of the other buffers have not completed. This
501 * locked buffer would confuse end_buffer_async_read() into not unlocking 501 * locked buffer would confuse end_buffer_async_read() into not unlocking
502 * the page. So the absence of BH_Async_Read tells end_buffer_async_read() 502 * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
503 * that this buffer is not under async I/O. 503 * that this buffer is not under async I/O.
504 * 504 *
505 * The page comes unlocked when it has no locked buffer_async buffers 505 * The page comes unlocked when it has no locked buffer_async buffers
506 * left. 506 * left.
507 * 507 *
508 * PageLocked prevents anyone starting new async I/O reads any of 508 * PageLocked prevents anyone starting new async I/O reads any of
509 * the buffers. 509 * the buffers.
510 * 510 *
511 * PageWriteback is used to prevent simultaneous writeout of the same 511 * PageWriteback is used to prevent simultaneous writeout of the same
512 * page. 512 * page.
513 * 513 *
514 * PageLocked prevents anyone from starting writeback of a page which is 514 * PageLocked prevents anyone from starting writeback of a page which is
515 * under read I/O (PageWriteback is only ever set against a locked page). 515 * under read I/O (PageWriteback is only ever set against a locked page).
516 */ 516 */
517 static void mark_buffer_async_read(struct buffer_head *bh) 517 static void mark_buffer_async_read(struct buffer_head *bh)
518 { 518 {
519 bh->b_end_io = end_buffer_async_read; 519 bh->b_end_io = end_buffer_async_read;
520 set_buffer_async_read(bh); 520 set_buffer_async_read(bh);
521 } 521 }
522 522
523 void mark_buffer_async_write(struct buffer_head *bh) 523 void mark_buffer_async_write(struct buffer_head *bh)
524 { 524 {
525 bh->b_end_io = end_buffer_async_write; 525 bh->b_end_io = end_buffer_async_write;
526 set_buffer_async_write(bh); 526 set_buffer_async_write(bh);
527 } 527 }
528 EXPORT_SYMBOL(mark_buffer_async_write); 528 EXPORT_SYMBOL(mark_buffer_async_write);
529 529
530 530
531 /* 531 /*
532 * fs/buffer.c contains helper functions for buffer-backed address space's 532 * fs/buffer.c contains helper functions for buffer-backed address space's
533 * fsync functions. A common requirement for buffer-based filesystems is 533 * fsync functions. A common requirement for buffer-based filesystems is
534 * that certain data from the backing blockdev needs to be written out for 534 * that certain data from the backing blockdev needs to be written out for
535 * a successful fsync(). For example, ext2 indirect blocks need to be 535 * a successful fsync(). For example, ext2 indirect blocks need to be
536 * written back and waited upon before fsync() returns. 536 * written back and waited upon before fsync() returns.
537 * 537 *
538 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(), 538 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
539 * inode_has_buffers() and invalidate_inode_buffers() are provided for the 539 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
540 * management of a list of dependent buffers at ->i_mapping->private_list. 540 * management of a list of dependent buffers at ->i_mapping->private_list.
541 * 541 *
542 * Locking is a little subtle: try_to_free_buffers() will remove buffers 542 * Locking is a little subtle: try_to_free_buffers() will remove buffers
543 * from their controlling inode's queue when they are being freed. But 543 * from their controlling inode's queue when they are being freed. But
544 * try_to_free_buffers() will be operating against the *blockdev* mapping 544 * try_to_free_buffers() will be operating against the *blockdev* mapping
545 * at the time, not against the S_ISREG file which depends on those buffers. 545 * at the time, not against the S_ISREG file which depends on those buffers.
546 * So the locking for private_list is via the private_lock in the address_space 546 * So the locking for private_list is via the private_lock in the address_space
547 * which backs the buffers. Which is different from the address_space 547 * which backs the buffers. Which is different from the address_space
548 * against which the buffers are listed. So for a particular address_space, 548 * against which the buffers are listed. So for a particular address_space,
549 * mapping->private_lock does *not* protect mapping->private_list! In fact, 549 * mapping->private_lock does *not* protect mapping->private_list! In fact,
550 * mapping->private_list will always be protected by the backing blockdev's 550 * mapping->private_list will always be protected by the backing blockdev's
551 * ->private_lock. 551 * ->private_lock.
552 * 552 *
553 * Which introduces a requirement: all buffers on an address_space's 553 * Which introduces a requirement: all buffers on an address_space's
554 * ->private_list must be from the same address_space: the blockdev's. 554 * ->private_list must be from the same address_space: the blockdev's.
555 * 555 *
556 * address_spaces which do not place buffers at ->private_list via these 556 * address_spaces which do not place buffers at ->private_list via these
557 * utility functions are free to use private_lock and private_list for 557 * utility functions are free to use private_lock and private_list for
558 * whatever they want. The only requirement is that list_empty(private_list) 558 * whatever they want. The only requirement is that list_empty(private_list)
559 * be true at clear_inode() time. 559 * be true at clear_inode() time.
560 * 560 *
561 * FIXME: clear_inode should not call invalidate_inode_buffers(). The 561 * FIXME: clear_inode should not call invalidate_inode_buffers(). The
562 * filesystems should do that. invalidate_inode_buffers() should just go 562 * filesystems should do that. invalidate_inode_buffers() should just go
563 * BUG_ON(!list_empty). 563 * BUG_ON(!list_empty).
564 * 564 *
565 * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should 565 * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
566 * take an address_space, not an inode. And it should be called 566 * take an address_space, not an inode. And it should be called
567 * mark_buffer_dirty_fsync() to clearly define why those buffers are being 567 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
568 * queued up. 568 * queued up.
569 * 569 *
570 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the 570 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
571 * list if it is already on a list. Because if the buffer is on a list, 571 * list if it is already on a list. Because if the buffer is on a list,
572 * it *must* already be on the right one. If not, the filesystem is being 572 * it *must* already be on the right one. If not, the filesystem is being
573 * silly. This will save a ton of locking. But first we have to ensure 573 * silly. This will save a ton of locking. But first we have to ensure
574 * that buffers are taken *off* the old inode's list when they are freed 574 * that buffers are taken *off* the old inode's list when they are freed
575 * (presumably in truncate). That requires careful auditing of all 575 * (presumably in truncate). That requires careful auditing of all
576 * filesystems (do it inside bforget()). It could also be done by bringing 576 * filesystems (do it inside bforget()). It could also be done by bringing
577 * b_inode back. 577 * b_inode back.
578 */ 578 */
579 579
580 /* 580 /*
581 * The buffer's backing address_space's private_lock must be held 581 * The buffer's backing address_space's private_lock must be held
582 */ 582 */
583 static inline void __remove_assoc_queue(struct buffer_head *bh) 583 static inline void __remove_assoc_queue(struct buffer_head *bh)
584 { 584 {
585 list_del_init(&bh->b_assoc_buffers); 585 list_del_init(&bh->b_assoc_buffers);
586 WARN_ON(!bh->b_assoc_map); 586 WARN_ON(!bh->b_assoc_map);
587 if (buffer_write_io_error(bh)) 587 if (buffer_write_io_error(bh))
588 set_bit(AS_EIO, &bh->b_assoc_map->flags); 588 set_bit(AS_EIO, &bh->b_assoc_map->flags);
589 bh->b_assoc_map = NULL; 589 bh->b_assoc_map = NULL;
590 } 590 }
591 591
592 int inode_has_buffers(struct inode *inode) 592 int inode_has_buffers(struct inode *inode)
593 { 593 {
594 return !list_empty(&inode->i_data.private_list); 594 return !list_empty(&inode->i_data.private_list);
595 } 595 }
596 596
597 /* 597 /*
598 * osync is designed to support O_SYNC io. It waits synchronously for 598 * osync is designed to support O_SYNC io. It waits synchronously for
599 * all already-submitted IO to complete, but does not queue any new 599 * all already-submitted IO to complete, but does not queue any new
600 * writes to the disk. 600 * writes to the disk.
601 * 601 *
602 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as 602 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
603 * you dirty the buffers, and then use osync_inode_buffers to wait for 603 * you dirty the buffers, and then use osync_inode_buffers to wait for
604 * completion. Any other dirty buffers which are not yet queued for 604 * completion. Any other dirty buffers which are not yet queued for
605 * write will not be flushed to disk by the osync. 605 * write will not be flushed to disk by the osync.
606 */ 606 */
607 static int osync_buffers_list(spinlock_t *lock, struct list_head *list) 607 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
608 { 608 {
609 struct buffer_head *bh; 609 struct buffer_head *bh;
610 struct list_head *p; 610 struct list_head *p;
611 int err = 0; 611 int err = 0;
612 612
613 spin_lock(lock); 613 spin_lock(lock);
614 repeat: 614 repeat:
615 list_for_each_prev(p, list) { 615 list_for_each_prev(p, list) {
616 bh = BH_ENTRY(p); 616 bh = BH_ENTRY(p);
617 if (buffer_locked(bh)) { 617 if (buffer_locked(bh)) {
618 get_bh(bh); 618 get_bh(bh);
619 spin_unlock(lock); 619 spin_unlock(lock);
620 wait_on_buffer(bh); 620 wait_on_buffer(bh);
621 if (!buffer_uptodate(bh)) 621 if (!buffer_uptodate(bh))
622 err = -EIO; 622 err = -EIO;
623 brelse(bh); 623 brelse(bh);
624 spin_lock(lock); 624 spin_lock(lock);
625 goto repeat; 625 goto repeat;
626 } 626 }
627 } 627 }
628 spin_unlock(lock); 628 spin_unlock(lock);
629 return err; 629 return err;
630 } 630 }
631 631
632 /** 632 /**
633 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers 633 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
634 * @mapping: the mapping which wants those buffers written 634 * @mapping: the mapping which wants those buffers written
635 * 635 *
636 * Starts I/O against the buffers at mapping->private_list, and waits upon 636 * Starts I/O against the buffers at mapping->private_list, and waits upon
637 * that I/O. 637 * that I/O.
638 * 638 *
639 * Basically, this is a convenience function for fsync(). 639 * Basically, this is a convenience function for fsync().
640 * @mapping is a file or directory which needs those buffers to be written for 640 * @mapping is a file or directory which needs those buffers to be written for
641 * a successful fsync(). 641 * a successful fsync().
642 */ 642 */
643 int sync_mapping_buffers(struct address_space *mapping) 643 int sync_mapping_buffers(struct address_space *mapping)
644 { 644 {
645 struct address_space *buffer_mapping = mapping->assoc_mapping; 645 struct address_space *buffer_mapping = mapping->assoc_mapping;
646 646
647 if (buffer_mapping == NULL || list_empty(&mapping->private_list)) 647 if (buffer_mapping == NULL || list_empty(&mapping->private_list))
648 return 0; 648 return 0;
649 649
650 return fsync_buffers_list(&buffer_mapping->private_lock, 650 return fsync_buffers_list(&buffer_mapping->private_lock,
651 &mapping->private_list); 651 &mapping->private_list);
652 } 652 }
653 EXPORT_SYMBOL(sync_mapping_buffers); 653 EXPORT_SYMBOL(sync_mapping_buffers);
654 654
655 /* 655 /*
656 * Called when we've recently written block `bblock', and it is known that 656 * Called when we've recently written block `bblock', and it is known that
657 * `bblock' was for a buffer_boundary() buffer. This means that the block at 657 * `bblock' was for a buffer_boundary() buffer. This means that the block at
658 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's 658 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
659 * dirty, schedule it for IO. So that indirects merge nicely with their data. 659 * dirty, schedule it for IO. So that indirects merge nicely with their data.
660 */ 660 */
661 void write_boundary_block(struct block_device *bdev, 661 void write_boundary_block(struct block_device *bdev,
662 sector_t bblock, unsigned blocksize) 662 sector_t bblock, unsigned blocksize)
663 { 663 {
664 struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); 664 struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
665 if (bh) { 665 if (bh) {
666 if (buffer_dirty(bh)) 666 if (buffer_dirty(bh))
667 ll_rw_block(WRITE, 1, &bh); 667 ll_rw_block(WRITE, 1, &bh);
668 put_bh(bh); 668 put_bh(bh);
669 } 669 }
670 } 670 }
671 671
672 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) 672 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
673 { 673 {
674 struct address_space *mapping = inode->i_mapping; 674 struct address_space *mapping = inode->i_mapping;
675 struct address_space *buffer_mapping = bh->b_page->mapping; 675 struct address_space *buffer_mapping = bh->b_page->mapping;
676 676
677 mark_buffer_dirty(bh); 677 mark_buffer_dirty(bh);
678 if (!mapping->assoc_mapping) { 678 if (!mapping->assoc_mapping) {
679 mapping->assoc_mapping = buffer_mapping; 679 mapping->assoc_mapping = buffer_mapping;
680 } else { 680 } else {
681 BUG_ON(mapping->assoc_mapping != buffer_mapping); 681 BUG_ON(mapping->assoc_mapping != buffer_mapping);
682 } 682 }
683 if (!bh->b_assoc_map) { 683 if (!bh->b_assoc_map) {
684 spin_lock(&buffer_mapping->private_lock); 684 spin_lock(&buffer_mapping->private_lock);
685 list_move_tail(&bh->b_assoc_buffers, 685 list_move_tail(&bh->b_assoc_buffers,
686 &mapping->private_list); 686 &mapping->private_list);
687 bh->b_assoc_map = mapping; 687 bh->b_assoc_map = mapping;
688 spin_unlock(&buffer_mapping->private_lock); 688 spin_unlock(&buffer_mapping->private_lock);
689 } 689 }
690 } 690 }
691 EXPORT_SYMBOL(mark_buffer_dirty_inode); 691 EXPORT_SYMBOL(mark_buffer_dirty_inode);
692 692
693 /* 693 /*
694 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode 694 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
695 * dirty. 695 * dirty.
696 * 696 *
697 * If warn is true, then emit a warning if the page is not uptodate and has 697 * If warn is true, then emit a warning if the page is not uptodate and has
698 * not been truncated. 698 * not been truncated.
699 */ 699 */
700 static int __set_page_dirty(struct page *page, 700 static int __set_page_dirty(struct page *page,
701 struct address_space *mapping, int warn) 701 struct address_space *mapping, int warn)
702 { 702 {
703 if (unlikely(!mapping)) 703 if (unlikely(!mapping))
704 return !TestSetPageDirty(page); 704 return !TestSetPageDirty(page);
705 705
706 if (TestSetPageDirty(page)) 706 if (TestSetPageDirty(page))
707 return 0; 707 return 0;
708 708
709 write_lock_irq(&mapping->tree_lock); 709 write_lock_irq(&mapping->tree_lock);
710 if (page->mapping) { /* Race with truncate? */ 710 if (page->mapping) { /* Race with truncate? */
711 WARN_ON_ONCE(warn && !PageUptodate(page)); 711 WARN_ON_ONCE(warn && !PageUptodate(page));
712 712
713 if (mapping_cap_account_dirty(mapping)) { 713 if (mapping_cap_account_dirty(mapping)) {
714 __inc_zone_page_state(page, NR_FILE_DIRTY); 714 __inc_zone_page_state(page, NR_FILE_DIRTY);
715 __inc_bdi_stat(mapping->backing_dev_info, 715 __inc_bdi_stat(mapping->backing_dev_info,
716 BDI_RECLAIMABLE); 716 BDI_RECLAIMABLE);
717 task_io_account_write(PAGE_CACHE_SIZE); 717 task_io_account_write(PAGE_CACHE_SIZE);
718 } 718 }
719 radix_tree_tag_set(&mapping->page_tree, 719 radix_tree_tag_set(&mapping->page_tree,
720 page_index(page), PAGECACHE_TAG_DIRTY); 720 page_index(page), PAGECACHE_TAG_DIRTY);
721 } 721 }
722 write_unlock_irq(&mapping->tree_lock); 722 write_unlock_irq(&mapping->tree_lock);
723 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 723 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
724 724
725 return 1; 725 return 1;
726 } 726 }
727 727
728 /* 728 /*
729 * Add a page to the dirty page list. 729 * Add a page to the dirty page list.
730 * 730 *
731 * It is a sad fact of life that this function is called from several places 731 * It is a sad fact of life that this function is called from several places
732 * deeply under spinlocking. It may not sleep. 732 * deeply under spinlocking. It may not sleep.
733 * 733 *
734 * If the page has buffers, the uptodate buffers are set dirty, to preserve 734 * If the page has buffers, the uptodate buffers are set dirty, to preserve
735 * dirty-state coherency between the page and the buffers. It the page does 735 * dirty-state coherency between the page and the buffers. It the page does
736 * not have buffers then when they are later attached they will all be set 736 * not have buffers then when they are later attached they will all be set
737 * dirty. 737 * dirty.
738 * 738 *
739 * The buffers are dirtied before the page is dirtied. There's a small race 739 * The buffers are dirtied before the page is dirtied. There's a small race
740 * window in which a writepage caller may see the page cleanness but not the 740 * window in which a writepage caller may see the page cleanness but not the
741 * buffer dirtiness. That's fine. If this code were to set the page dirty 741 * buffer dirtiness. That's fine. If this code were to set the page dirty
742 * before the buffers, a concurrent writepage caller could clear the page dirty 742 * before the buffers, a concurrent writepage caller could clear the page dirty
743 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean 743 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
744 * page on the dirty page list. 744 * page on the dirty page list.
745 * 745 *
746 * We use private_lock to lock against try_to_free_buffers while using the 746 * We use private_lock to lock against try_to_free_buffers while using the
747 * page's buffer list. Also use this to protect against clean buffers being 747 * page's buffer list. Also use this to protect against clean buffers being
748 * added to the page after it was set dirty. 748 * added to the page after it was set dirty.
749 * 749 *
750 * FIXME: may need to call ->reservepage here as well. That's rather up to the 750 * FIXME: may need to call ->reservepage here as well. That's rather up to the
751 * address_space though. 751 * address_space though.
752 */ 752 */
753 int __set_page_dirty_buffers(struct page *page) 753 int __set_page_dirty_buffers(struct page *page)
754 { 754 {
755 struct address_space *mapping = page_mapping(page); 755 struct address_space *mapping = page_mapping(page);
756 756
757 if (unlikely(!mapping)) 757 if (unlikely(!mapping))
758 return !TestSetPageDirty(page); 758 return !TestSetPageDirty(page);
759 759
760 spin_lock(&mapping->private_lock); 760 spin_lock(&mapping->private_lock);
761 if (page_has_buffers(page)) { 761 if (page_has_buffers(page)) {
762 struct buffer_head *head = page_buffers(page); 762 struct buffer_head *head = page_buffers(page);
763 struct buffer_head *bh = head; 763 struct buffer_head *bh = head;
764 764
765 do { 765 do {
766 set_buffer_dirty(bh); 766 set_buffer_dirty(bh);
767 bh = bh->b_this_page; 767 bh = bh->b_this_page;
768 } while (bh != head); 768 } while (bh != head);
769 } 769 }
770 spin_unlock(&mapping->private_lock); 770 spin_unlock(&mapping->private_lock);
771 771
772 return __set_page_dirty(page, mapping, 1); 772 return __set_page_dirty(page, mapping, 1);
773 } 773 }
774 EXPORT_SYMBOL(__set_page_dirty_buffers); 774 EXPORT_SYMBOL(__set_page_dirty_buffers);
775 775
776 /* 776 /*
777 * Write out and wait upon a list of buffers. 777 * Write out and wait upon a list of buffers.
778 * 778 *
779 * We have conflicting pressures: we want to make sure that all 779 * We have conflicting pressures: we want to make sure that all
780 * initially dirty buffers get waited on, but that any subsequently 780 * initially dirty buffers get waited on, but that any subsequently
781 * dirtied buffers don't. After all, we don't want fsync to last 781 * dirtied buffers don't. After all, we don't want fsync to last
782 * forever if somebody is actively writing to the file. 782 * forever if somebody is actively writing to the file.
783 * 783 *
784 * Do this in two main stages: first we copy dirty buffers to a 784 * Do this in two main stages: first we copy dirty buffers to a
785 * temporary inode list, queueing the writes as we go. Then we clean 785 * temporary inode list, queueing the writes as we go. Then we clean
786 * up, waiting for those writes to complete. 786 * up, waiting for those writes to complete.
787 * 787 *
788 * During this second stage, any subsequent updates to the file may end 788 * During this second stage, any subsequent updates to the file may end
789 * up refiling the buffer on the original inode's dirty list again, so 789 * up refiling the buffer on the original inode's dirty list again, so
790 * there is a chance we will end up with a buffer queued for write but 790 * there is a chance we will end up with a buffer queued for write but
791 * not yet completed on that list. So, as a final cleanup we go through 791 * not yet completed on that list. So, as a final cleanup we go through
792 * the osync code to catch these locked, dirty buffers without requeuing 792 * the osync code to catch these locked, dirty buffers without requeuing
793 * any newly dirty buffers for write. 793 * any newly dirty buffers for write.
794 */ 794 */
795 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) 795 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
796 { 796 {
797 struct buffer_head *bh; 797 struct buffer_head *bh;
798 struct list_head tmp; 798 struct list_head tmp;
799 struct address_space *mapping; 799 struct address_space *mapping;
800 int err = 0, err2; 800 int err = 0, err2;
801 801
802 INIT_LIST_HEAD(&tmp); 802 INIT_LIST_HEAD(&tmp);
803 803
804 spin_lock(lock); 804 spin_lock(lock);
805 while (!list_empty(list)) { 805 while (!list_empty(list)) {
806 bh = BH_ENTRY(list->next); 806 bh = BH_ENTRY(list->next);
807 mapping = bh->b_assoc_map; 807 mapping = bh->b_assoc_map;
808 __remove_assoc_queue(bh); 808 __remove_assoc_queue(bh);
809 /* Avoid race with mark_buffer_dirty_inode() which does 809 /* Avoid race with mark_buffer_dirty_inode() which does
810 * a lockless check and we rely on seeing the dirty bit */ 810 * a lockless check and we rely on seeing the dirty bit */
811 smp_mb(); 811 smp_mb();
812 if (buffer_dirty(bh) || buffer_locked(bh)) { 812 if (buffer_dirty(bh) || buffer_locked(bh)) {
813 list_add(&bh->b_assoc_buffers, &tmp); 813 list_add(&bh->b_assoc_buffers, &tmp);
814 bh->b_assoc_map = mapping; 814 bh->b_assoc_map = mapping;
815 if (buffer_dirty(bh)) { 815 if (buffer_dirty(bh)) {
816 get_bh(bh); 816 get_bh(bh);
817 spin_unlock(lock); 817 spin_unlock(lock);
818 /* 818 /*
819 * Ensure any pending I/O completes so that 819 * Ensure any pending I/O completes so that
820 * ll_rw_block() actually writes the current 820 * ll_rw_block() actually writes the current
821 * contents - it is a noop if I/O is still in 821 * contents - it is a noop if I/O is still in
822 * flight on potentially older contents. 822 * flight on potentially older contents.
823 */ 823 */
824 ll_rw_block(SWRITE_SYNC, 1, &bh); 824 ll_rw_block(SWRITE_SYNC, 1, &bh);
825 brelse(bh); 825 brelse(bh);
826 spin_lock(lock); 826 spin_lock(lock);
827 } 827 }
828 } 828 }
829 } 829 }
830 830
831 while (!list_empty(&tmp)) { 831 while (!list_empty(&tmp)) {
832 bh = BH_ENTRY(tmp.prev); 832 bh = BH_ENTRY(tmp.prev);
833 get_bh(bh); 833 get_bh(bh);
834 mapping = bh->b_assoc_map; 834 mapping = bh->b_assoc_map;
835 __remove_assoc_queue(bh); 835 __remove_assoc_queue(bh);
836 /* Avoid race with mark_buffer_dirty_inode() which does 836 /* Avoid race with mark_buffer_dirty_inode() which does
837 * a lockless check and we rely on seeing the dirty bit */ 837 * a lockless check and we rely on seeing the dirty bit */
838 smp_mb(); 838 smp_mb();
839 if (buffer_dirty(bh)) { 839 if (buffer_dirty(bh)) {
840 list_add(&bh->b_assoc_buffers, 840 list_add(&bh->b_assoc_buffers,
841 &mapping->private_list); 841 &mapping->private_list);
842 bh->b_assoc_map = mapping; 842 bh->b_assoc_map = mapping;
843 } 843 }
844 spin_unlock(lock); 844 spin_unlock(lock);
845 wait_on_buffer(bh); 845 wait_on_buffer(bh);
846 if (!buffer_uptodate(bh)) 846 if (!buffer_uptodate(bh))
847 err = -EIO; 847 err = -EIO;
848 brelse(bh); 848 brelse(bh);
849 spin_lock(lock); 849 spin_lock(lock);
850 } 850 }
851 851
852 spin_unlock(lock); 852 spin_unlock(lock);
853 err2 = osync_buffers_list(lock, list); 853 err2 = osync_buffers_list(lock, list);
854 if (err) 854 if (err)
855 return err; 855 return err;
856 else 856 else
857 return err2; 857 return err2;
858 } 858 }
859 859
860 /* 860 /*
861 * Invalidate any and all dirty buffers on a given inode. We are 861 * Invalidate any and all dirty buffers on a given inode. We are
862 * probably unmounting the fs, but that doesn't mean we have already 862 * probably unmounting the fs, but that doesn't mean we have already
863 * done a sync(). Just drop the buffers from the inode list. 863 * done a sync(). Just drop the buffers from the inode list.
864 * 864 *
865 * NOTE: we take the inode's blockdev's mapping's private_lock. Which 865 * NOTE: we take the inode's blockdev's mapping's private_lock. Which
866 * assumes that all the buffers are against the blockdev. Not true 866 * assumes that all the buffers are against the blockdev. Not true
867 * for reiserfs. 867 * for reiserfs.
868 */ 868 */
869 void invalidate_inode_buffers(struct inode *inode) 869 void invalidate_inode_buffers(struct inode *inode)
870 { 870 {
871 if (inode_has_buffers(inode)) { 871 if (inode_has_buffers(inode)) {
872 struct address_space *mapping = &inode->i_data; 872 struct address_space *mapping = &inode->i_data;
873 struct list_head *list = &mapping->private_list; 873 struct list_head *list = &mapping->private_list;
874 struct address_space *buffer_mapping = mapping->assoc_mapping; 874 struct address_space *buffer_mapping = mapping->assoc_mapping;
875 875
876 spin_lock(&buffer_mapping->private_lock); 876 spin_lock(&buffer_mapping->private_lock);
877 while (!list_empty(list)) 877 while (!list_empty(list))
878 __remove_assoc_queue(BH_ENTRY(list->next)); 878 __remove_assoc_queue(BH_ENTRY(list->next));
879 spin_unlock(&buffer_mapping->private_lock); 879 spin_unlock(&buffer_mapping->private_lock);
880 } 880 }
881 } 881 }
882 882
883 /* 883 /*
884 * Remove any clean buffers from the inode's buffer list. This is called 884 * Remove any clean buffers from the inode's buffer list. This is called
885 * when we're trying to free the inode itself. Those buffers can pin it. 885 * when we're trying to free the inode itself. Those buffers can pin it.
886 * 886 *
887 * Returns true if all buffers were removed. 887 * Returns true if all buffers were removed.
888 */ 888 */
889 int remove_inode_buffers(struct inode *inode) 889 int remove_inode_buffers(struct inode *inode)
890 { 890 {
891 int ret = 1; 891 int ret = 1;
892 892
893 if (inode_has_buffers(inode)) { 893 if (inode_has_buffers(inode)) {
894 struct address_space *mapping = &inode->i_data; 894 struct address_space *mapping = &inode->i_data;
895 struct list_head *list = &mapping->private_list; 895 struct list_head *list = &mapping->private_list;
896 struct address_space *buffer_mapping = mapping->assoc_mapping; 896 struct address_space *buffer_mapping = mapping->assoc_mapping;
897 897
898 spin_lock(&buffer_mapping->private_lock); 898 spin_lock(&buffer_mapping->private_lock);
899 while (!list_empty(list)) { 899 while (!list_empty(list)) {
900 struct buffer_head *bh = BH_ENTRY(list->next); 900 struct buffer_head *bh = BH_ENTRY(list->next);
901 if (buffer_dirty(bh)) { 901 if (buffer_dirty(bh)) {
902 ret = 0; 902 ret = 0;
903 break; 903 break;
904 } 904 }
905 __remove_assoc_queue(bh); 905 __remove_assoc_queue(bh);
906 } 906 }
907 spin_unlock(&buffer_mapping->private_lock); 907 spin_unlock(&buffer_mapping->private_lock);
908 } 908 }
909 return ret; 909 return ret;
910 } 910 }
911 911
912 /* 912 /*
913 * Create the appropriate buffers when given a page for data area and 913 * Create the appropriate buffers when given a page for data area and
914 * the size of each buffer.. Use the bh->b_this_page linked list to 914 * the size of each buffer.. Use the bh->b_this_page linked list to
915 * follow the buffers created. Return NULL if unable to create more 915 * follow the buffers created. Return NULL if unable to create more
916 * buffers. 916 * buffers.
917 * 917 *
918 * The retry flag is used to differentiate async IO (paging, swapping), 918 * The retry flag is used to differentiate async IO (paging, swapping),
919 * which may not fail, from ordinary buffer allocations. 919 * which may not fail, from ordinary buffer allocations.
920 */ 920 */
921 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, 921 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
922 int retry) 922 int retry)
923 { 923 {
924 struct buffer_head *bh, *head; 924 struct buffer_head *bh, *head;
925 long offset; 925 long offset;
926 926
927 try_again: 927 try_again:
928 head = NULL; 928 head = NULL;
929 offset = PAGE_SIZE; 929 offset = PAGE_SIZE;
930 while ((offset -= size) >= 0) { 930 while ((offset -= size) >= 0) {
931 bh = alloc_buffer_head(GFP_NOFS); 931 bh = alloc_buffer_head(GFP_NOFS);
932 if (!bh) 932 if (!bh)
933 goto no_grow; 933 goto no_grow;
934 934
935 bh->b_bdev = NULL; 935 bh->b_bdev = NULL;
936 bh->b_this_page = head; 936 bh->b_this_page = head;
937 bh->b_blocknr = -1; 937 bh->b_blocknr = -1;
938 head = bh; 938 head = bh;
939 939
940 bh->b_state = 0; 940 bh->b_state = 0;
941 atomic_set(&bh->b_count, 0); 941 atomic_set(&bh->b_count, 0);
942 bh->b_private = NULL; 942 bh->b_private = NULL;
943 bh->b_size = size; 943 bh->b_size = size;
944 944
945 /* Link the buffer to its page */ 945 /* Link the buffer to its page */
946 set_bh_page(bh, page, offset); 946 set_bh_page(bh, page, offset);
947 947
948 init_buffer(bh, NULL, NULL); 948 init_buffer(bh, NULL, NULL);
949 } 949 }
950 return head; 950 return head;
951 /* 951 /*
952 * In case anything failed, we just free everything we got. 952 * In case anything failed, we just free everything we got.
953 */ 953 */
954 no_grow: 954 no_grow:
955 if (head) { 955 if (head) {
956 do { 956 do {
957 bh = head; 957 bh = head;
958 head = head->b_this_page; 958 head = head->b_this_page;
959 free_buffer_head(bh); 959 free_buffer_head(bh);
960 } while (head); 960 } while (head);
961 } 961 }
962 962
963 /* 963 /*
964 * Return failure for non-async IO requests. Async IO requests 964 * Return failure for non-async IO requests. Async IO requests
965 * are not allowed to fail, so we have to wait until buffer heads 965 * are not allowed to fail, so we have to wait until buffer heads
966 * become available. But we don't want tasks sleeping with 966 * become available. But we don't want tasks sleeping with
967 * partially complete buffers, so all were released above. 967 * partially complete buffers, so all were released above.
968 */ 968 */
969 if (!retry) 969 if (!retry)
970 return NULL; 970 return NULL;
971 971
972 /* We're _really_ low on memory. Now we just 972 /* We're _really_ low on memory. Now we just
973 * wait for old buffer heads to become free due to 973 * wait for old buffer heads to become free due to
974 * finishing IO. Since this is an async request and 974 * finishing IO. Since this is an async request and
975 * the reserve list is empty, we're sure there are 975 * the reserve list is empty, we're sure there are
976 * async buffer heads in use. 976 * async buffer heads in use.
977 */ 977 */
978 free_more_memory(); 978 free_more_memory();
979 goto try_again; 979 goto try_again;
980 } 980 }
981 EXPORT_SYMBOL_GPL(alloc_page_buffers); 981 EXPORT_SYMBOL_GPL(alloc_page_buffers);
982 982
983 static inline void 983 static inline void
984 link_dev_buffers(struct page *page, struct buffer_head *head) 984 link_dev_buffers(struct page *page, struct buffer_head *head)
985 { 985 {
986 struct buffer_head *bh, *tail; 986 struct buffer_head *bh, *tail;
987 987
988 bh = head; 988 bh = head;
989 do { 989 do {
990 tail = bh; 990 tail = bh;
991 bh = bh->b_this_page; 991 bh = bh->b_this_page;
992 } while (bh); 992 } while (bh);
993 tail->b_this_page = head; 993 tail->b_this_page = head;
994 attach_page_buffers(page, head); 994 attach_page_buffers(page, head);
995 } 995 }
996 996
997 /* 997 /*
998 * Initialise the state of a blockdev page's buffers. 998 * Initialise the state of a blockdev page's buffers.
999 */ 999 */
1000 static void 1000 static void
1001 init_page_buffers(struct page *page, struct block_device *bdev, 1001 init_page_buffers(struct page *page, struct block_device *bdev,
1002 sector_t block, int size) 1002 sector_t block, int size)
1003 { 1003 {
1004 struct buffer_head *head = page_buffers(page); 1004 struct buffer_head *head = page_buffers(page);
1005 struct buffer_head *bh = head; 1005 struct buffer_head *bh = head;
1006 int uptodate = PageUptodate(page); 1006 int uptodate = PageUptodate(page);
1007 1007
1008 do { 1008 do {
1009 if (!buffer_mapped(bh)) { 1009 if (!buffer_mapped(bh)) {
1010 init_buffer(bh, NULL, NULL); 1010 init_buffer(bh, NULL, NULL);
1011 bh->b_bdev = bdev; 1011 bh->b_bdev = bdev;
1012 bh->b_blocknr = block; 1012 bh->b_blocknr = block;
1013 if (uptodate) 1013 if (uptodate)
1014 set_buffer_uptodate(bh); 1014 set_buffer_uptodate(bh);
1015 set_buffer_mapped(bh); 1015 set_buffer_mapped(bh);
1016 } 1016 }
1017 block++; 1017 block++;
1018 bh = bh->b_this_page; 1018 bh = bh->b_this_page;
1019 } while (bh != head); 1019 } while (bh != head);
1020 } 1020 }
1021 1021
1022 /* 1022 /*
1023 * Create the page-cache page that contains the requested block. 1023 * Create the page-cache page that contains the requested block.
1024 * 1024 *
1025 * This is used purely for blockdev mappings. 1025 * This is used purely for blockdev mappings.
1026 */ 1026 */
1027 static struct page * 1027 static struct page *
1028 grow_dev_page(struct block_device *bdev, sector_t block, 1028 grow_dev_page(struct block_device *bdev, sector_t block,
1029 pgoff_t index, int size) 1029 pgoff_t index, int size)
1030 { 1030 {
1031 struct inode *inode = bdev->bd_inode; 1031 struct inode *inode = bdev->bd_inode;
1032 struct page *page; 1032 struct page *page;
1033 struct buffer_head *bh; 1033 struct buffer_head *bh;
1034 1034
1035 page = find_or_create_page(inode->i_mapping, index, 1035 page = find_or_create_page(inode->i_mapping, index,
1036 (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE); 1036 (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
1037 if (!page) 1037 if (!page)
1038 return NULL; 1038 return NULL;
1039 1039
1040 BUG_ON(!PageLocked(page)); 1040 BUG_ON(!PageLocked(page));
1041 1041
1042 if (page_has_buffers(page)) { 1042 if (page_has_buffers(page)) {
1043 bh = page_buffers(page); 1043 bh = page_buffers(page);
1044 if (bh->b_size == size) { 1044 if (bh->b_size == size) {
1045 init_page_buffers(page, bdev, block, size); 1045 init_page_buffers(page, bdev, block, size);
1046 return page; 1046 return page;
1047 } 1047 }
1048 if (!try_to_free_buffers(page)) 1048 if (!try_to_free_buffers(page))
1049 goto failed; 1049 goto failed;
1050 } 1050 }
1051 1051
1052 /* 1052 /*
1053 * Allocate some buffers for this page 1053 * Allocate some buffers for this page
1054 */ 1054 */
1055 bh = alloc_page_buffers(page, size, 0); 1055 bh = alloc_page_buffers(page, size, 0);
1056 if (!bh) 1056 if (!bh)
1057 goto failed; 1057 goto failed;
1058 1058
1059 /* 1059 /*
1060 * Link the page to the buffers and initialise them. Take the 1060 * Link the page to the buffers and initialise them. Take the
1061 * lock to be atomic wrt __find_get_block(), which does not 1061 * lock to be atomic wrt __find_get_block(), which does not
1062 * run under the page lock. 1062 * run under the page lock.
1063 */ 1063 */
1064 spin_lock(&inode->i_mapping->private_lock); 1064 spin_lock(&inode->i_mapping->private_lock);
1065 link_dev_buffers(page, bh); 1065 link_dev_buffers(page, bh);
1066 init_page_buffers(page, bdev, block, size); 1066 init_page_buffers(page, bdev, block, size);
1067 spin_unlock(&inode->i_mapping->private_lock); 1067 spin_unlock(&inode->i_mapping->private_lock);
1068 return page; 1068 return page;
1069 1069
1070 failed: 1070 failed:
1071 BUG(); 1071 BUG();
1072 unlock_page(page); 1072 unlock_page(page);
1073 page_cache_release(page); 1073 page_cache_release(page);
1074 return NULL; 1074 return NULL;
1075 } 1075 }
1076 1076
1077 /* 1077 /*
1078 * Create buffers for the specified block device block's page. If 1078 * Create buffers for the specified block device block's page. If
1079 * that page was dirty, the buffers are set dirty also. 1079 * that page was dirty, the buffers are set dirty also.
1080 */ 1080 */
1081 static int 1081 static int
1082 grow_buffers(struct block_device *bdev, sector_t block, int size) 1082 grow_buffers(struct block_device *bdev, sector_t block, int size)
1083 { 1083 {
1084 struct page *page; 1084 struct page *page;
1085 pgoff_t index; 1085 pgoff_t index;
1086 int sizebits; 1086 int sizebits;
1087 1087
1088 sizebits = -1; 1088 sizebits = -1;
1089 do { 1089 do {
1090 sizebits++; 1090 sizebits++;
1091 } while ((size << sizebits) < PAGE_SIZE); 1091 } while ((size << sizebits) < PAGE_SIZE);
1092 1092
1093 index = block >> sizebits; 1093 index = block >> sizebits;
1094 1094
1095 /* 1095 /*
1096 * Check for a block which wants to lie outside our maximum possible 1096 * Check for a block which wants to lie outside our maximum possible
1097 * pagecache index. (this comparison is done using sector_t types). 1097 * pagecache index. (this comparison is done using sector_t types).
1098 */ 1098 */
1099 if (unlikely(index != block >> sizebits)) { 1099 if (unlikely(index != block >> sizebits)) {
1100 char b[BDEVNAME_SIZE]; 1100 char b[BDEVNAME_SIZE];
1101 1101
1102 printk(KERN_ERR "%s: requested out-of-range block %llu for " 1102 printk(KERN_ERR "%s: requested out-of-range block %llu for "
1103 "device %s\n", 1103 "device %s\n",
1104 __func__, (unsigned long long)block, 1104 __func__, (unsigned long long)block,
1105 bdevname(bdev, b)); 1105 bdevname(bdev, b));
1106 return -EIO; 1106 return -EIO;
1107 } 1107 }
1108 block = index << sizebits; 1108 block = index << sizebits;
1109 /* Create a page with the proper size buffers.. */ 1109 /* Create a page with the proper size buffers.. */
1110 page = grow_dev_page(bdev, block, index, size); 1110 page = grow_dev_page(bdev, block, index, size);
1111 if (!page) 1111 if (!page)
1112 return 0; 1112 return 0;
1113 unlock_page(page); 1113 unlock_page(page);
1114 page_cache_release(page); 1114 page_cache_release(page);
1115 return 1; 1115 return 1;
1116 } 1116 }
1117 1117
1118 static struct buffer_head * 1118 static struct buffer_head *
1119 __getblk_slow(struct block_device *bdev, sector_t block, int size) 1119 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1120 { 1120 {
1121 /* Size must be multiple of hard sectorsize */ 1121 /* Size must be multiple of hard sectorsize */
1122 if (unlikely(size & (bdev_hardsect_size(bdev)-1) || 1122 if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
1123 (size < 512 || size > PAGE_SIZE))) { 1123 (size < 512 || size > PAGE_SIZE))) {
1124 printk(KERN_ERR "getblk(): invalid block size %d requested\n", 1124 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1125 size); 1125 size);
1126 printk(KERN_ERR "hardsect size: %d\n", 1126 printk(KERN_ERR "hardsect size: %d\n",
1127 bdev_hardsect_size(bdev)); 1127 bdev_hardsect_size(bdev));
1128 1128
1129 dump_stack(); 1129 dump_stack();
1130 return NULL; 1130 return NULL;
1131 } 1131 }
1132 1132
1133 for (;;) { 1133 for (;;) {
1134 struct buffer_head * bh; 1134 struct buffer_head * bh;
1135 int ret; 1135 int ret;
1136 1136
1137 bh = __find_get_block(bdev, block, size); 1137 bh = __find_get_block(bdev, block, size);
1138 if (bh) 1138 if (bh)
1139 return bh; 1139 return bh;
1140 1140
1141 ret = grow_buffers(bdev, block, size); 1141 ret = grow_buffers(bdev, block, size);
1142 if (ret < 0) 1142 if (ret < 0)
1143 return NULL; 1143 return NULL;
1144 if (ret == 0) 1144 if (ret == 0)
1145 free_more_memory(); 1145 free_more_memory();
1146 } 1146 }
1147 } 1147 }
1148 1148
1149 /* 1149 /*
1150 * The relationship between dirty buffers and dirty pages: 1150 * The relationship between dirty buffers and dirty pages:
1151 * 1151 *
1152 * Whenever a page has any dirty buffers, the page's dirty bit is set, and 1152 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1153 * the page is tagged dirty in its radix tree. 1153 * the page is tagged dirty in its radix tree.
1154 * 1154 *
1155 * At all times, the dirtiness of the buffers represents the dirtiness of 1155 * At all times, the dirtiness of the buffers represents the dirtiness of
1156 * subsections of the page. If the page has buffers, the page dirty bit is 1156 * subsections of the page. If the page has buffers, the page dirty bit is
1157 * merely a hint about the true dirty state. 1157 * merely a hint about the true dirty state.
1158 * 1158 *
1159 * When a page is set dirty in its entirety, all its buffers are marked dirty 1159 * When a page is set dirty in its entirety, all its buffers are marked dirty
1160 * (if the page has buffers). 1160 * (if the page has buffers).
1161 * 1161 *
1162 * When a buffer is marked dirty, its page is dirtied, but the page's other 1162 * When a buffer is marked dirty, its page is dirtied, but the page's other
1163 * buffers are not. 1163 * buffers are not.
1164 * 1164 *
1165 * Also. When blockdev buffers are explicitly read with bread(), they 1165 * Also. When blockdev buffers are explicitly read with bread(), they
1166 * individually become uptodate. But their backing page remains not 1166 * individually become uptodate. But their backing page remains not
1167 * uptodate - even if all of its buffers are uptodate. A subsequent 1167 * uptodate - even if all of its buffers are uptodate. A subsequent
1168 * block_read_full_page() against that page will discover all the uptodate 1168 * block_read_full_page() against that page will discover all the uptodate
1169 * buffers, will set the page uptodate and will perform no I/O. 1169 * buffers, will set the page uptodate and will perform no I/O.
1170 */ 1170 */
1171 1171
1172 /** 1172 /**
1173 * mark_buffer_dirty - mark a buffer_head as needing writeout 1173 * mark_buffer_dirty - mark a buffer_head as needing writeout
1174 * @bh: the buffer_head to mark dirty 1174 * @bh: the buffer_head to mark dirty
1175 * 1175 *
1176 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its 1176 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1177 * backing page dirty, then tag the page as dirty in its address_space's radix 1177 * backing page dirty, then tag the page as dirty in its address_space's radix
1178 * tree and then attach the address_space's inode to its superblock's dirty 1178 * tree and then attach the address_space's inode to its superblock's dirty
1179 * inode list. 1179 * inode list.
1180 * 1180 *
1181 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, 1181 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1182 * mapping->tree_lock and the global inode_lock. 1182 * mapping->tree_lock and the global inode_lock.
1183 */ 1183 */
1184 void mark_buffer_dirty(struct buffer_head *bh) 1184 void mark_buffer_dirty(struct buffer_head *bh)
1185 { 1185 {
1186 WARN_ON_ONCE(!buffer_uptodate(bh)); 1186 WARN_ON_ONCE(!buffer_uptodate(bh));
1187 1187
1188 /* 1188 /*
1189 * Very *carefully* optimize the it-is-already-dirty case. 1189 * Very *carefully* optimize the it-is-already-dirty case.
1190 * 1190 *
1191 * Don't let the final "is it dirty" escape to before we 1191 * Don't let the final "is it dirty" escape to before we
1192 * perhaps modified the buffer. 1192 * perhaps modified the buffer.
1193 */ 1193 */
1194 if (buffer_dirty(bh)) { 1194 if (buffer_dirty(bh)) {
1195 smp_mb(); 1195 smp_mb();
1196 if (buffer_dirty(bh)) 1196 if (buffer_dirty(bh))
1197 return; 1197 return;
1198 } 1198 }
1199 1199
1200 if (!test_set_buffer_dirty(bh)) 1200 if (!test_set_buffer_dirty(bh))
1201 __set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0); 1201 __set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0);
1202 } 1202 }
1203 1203
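For context on how callers use this: a filesystem typically takes a reference with sb_getblk()/__getblk() (no I/O), updates the buffer contents, and then calls mark_buffer_dirty() so writeback picks the block up later. A minimal sketch follows; the examplefs name, the helper, and the choice of error handling are assumptions, and only the buffer_head calls are the real API.

#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/string.h>

/* Minimal sketch (hypothetical examplefs helper): overwrite one block in
 * the page cache and leave it to writeback, relying on mark_buffer_dirty()
 * to dirty the buffer, its page and the owning inode as described above. */
static int examplefs_zero_block(struct super_block *sb, sector_t blocknr)
{
        struct buffer_head *bh = sb_getblk(sb, blocknr);        /* no I/O here */

        if (!bh)
                return -ENOMEM;
        lock_buffer(bh);
        memset(bh->b_data, 0, bh->b_size);
        set_buffer_uptodate(bh);                /* contents are now fully valid */
        unlock_buffer(bh);
        mark_buffer_dirty(bh);
        brelse(bh);                             /* drop the sb_getblk() reference */
        return 0;
}
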
1204 /* 1204 /*
1205 * Decrement a buffer_head's reference count. If all buffers against a page 1205 * Decrement a buffer_head's reference count. If all buffers against a page
1206 * have zero reference count, are clean and unlocked, and if the page is clean 1206 * have zero reference count, are clean and unlocked, and if the page is clean
1207 * and unlocked then try_to_free_buffers() may strip the buffers from the page 1207 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1208 * in preparation for freeing it (sometimes, rarely, buffers are removed from 1208 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1209 * a page but it ends up not being freed, and buffers may later be reattached). 1209 * a page but it ends up not being freed, and buffers may later be reattached).
1210 */ 1210 */
1211 void __brelse(struct buffer_head * buf) 1211 void __brelse(struct buffer_head * buf)
1212 { 1212 {
1213 if (atomic_read(&buf->b_count)) { 1213 if (atomic_read(&buf->b_count)) {
1214 put_bh(buf); 1214 put_bh(buf);
1215 return; 1215 return;
1216 } 1216 }
1217 printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n"); 1217 printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1218 WARN_ON(1); 1218 WARN_ON(1);
1219 } 1219 }
1220 1220
1221 /* 1221 /*
1222 * bforget() is like brelse(), except it discards any 1222 * bforget() is like brelse(), except it discards any
1223 * potentially dirty data. 1223 * potentially dirty data.
1224 */ 1224 */
1225 void __bforget(struct buffer_head *bh) 1225 void __bforget(struct buffer_head *bh)
1226 { 1226 {
1227 clear_buffer_dirty(bh); 1227 clear_buffer_dirty(bh);
1228 if (bh->b_assoc_map) { 1228 if (bh->b_assoc_map) {
1229 struct address_space *buffer_mapping = bh->b_page->mapping; 1229 struct address_space *buffer_mapping = bh->b_page->mapping;
1230 1230
1231 spin_lock(&buffer_mapping->private_lock); 1231 spin_lock(&buffer_mapping->private_lock);
1232 list_del_init(&bh->b_assoc_buffers); 1232 list_del_init(&bh->b_assoc_buffers);
1233 bh->b_assoc_map = NULL; 1233 bh->b_assoc_map = NULL;
1234 spin_unlock(&buffer_mapping->private_lock); 1234 spin_unlock(&buffer_mapping->private_lock);
1235 } 1235 }
1236 __brelse(bh); 1236 __brelse(bh);
1237 } 1237 }
1238 1238
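A short usage contrast for the two release paths: brelse() only drops a reference, while bforget() also discards any dirty data first, which is what a filesystem wants when the block is about to be freed on disk anyway. The examplefs name and helper below are assumptions; sb_find_get_block() and bforget() are the real API.

#include <linux/buffer_head.h>

/* Minimal sketch: discard the cached buffer for a block that the (hypothetical)
 * examplefs is about to free on disk, so its dirty data is never written back. */
static void examplefs_drop_freed_block(struct super_block *sb, sector_t blocknr)
{
        struct buffer_head *bh = sb_find_get_block(sb, blocknr);

        if (bh)
                bforget(bh);    /* clears the dirty bit and drops our reference */
}
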
1239 static struct buffer_head *__bread_slow(struct buffer_head *bh) 1239 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1240 { 1240 {
1241 lock_buffer(bh); 1241 lock_buffer(bh);
1242 if (buffer_uptodate(bh)) { 1242 if (buffer_uptodate(bh)) {
1243 unlock_buffer(bh); 1243 unlock_buffer(bh);
1244 return bh; 1244 return bh;
1245 } else { 1245 } else {
1246 get_bh(bh); 1246 get_bh(bh);
1247 bh->b_end_io = end_buffer_read_sync; 1247 bh->b_end_io = end_buffer_read_sync;
1248 submit_bh(READ, bh); 1248 submit_bh(READ, bh);
1249 wait_on_buffer(bh); 1249 wait_on_buffer(bh);
1250 if (buffer_uptodate(bh)) 1250 if (buffer_uptodate(bh))
1251 return bh; 1251 return bh;
1252 } 1252 }
1253 brelse(bh); 1253 brelse(bh);
1254 return NULL; 1254 return NULL;
1255 } 1255 }
1256 1256
1257 /* 1257 /*
1258 * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block(). 1258 * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
1259 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their 1259 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1260 * refcount elevated by one when they're in an LRU. A buffer can only appear 1260 * refcount elevated by one when they're in an LRU. A buffer can only appear
1261 * once in a particular CPU's LRU. A single buffer can be present in multiple 1261 * once in a particular CPU's LRU. A single buffer can be present in multiple
1262 * CPU's LRUs at the same time. 1262 * CPU's LRUs at the same time.
1263 * 1263 *
1264 * This is a transparent caching front-end to sb_bread(), sb_getblk() and 1264 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1265 * sb_find_get_block(). 1265 * sb_find_get_block().
1266 * 1266 *
1267 * The LRUs themselves only need locking against invalidate_bh_lrus. We use 1267 * The LRUs themselves only need locking against invalidate_bh_lrus. We use
1268 * a local interrupt disable for that. 1268 * a local interrupt disable for that.
1269 */ 1269 */
1270 1270
1271 #define BH_LRU_SIZE 8 1271 #define BH_LRU_SIZE 8
1272 1272
1273 struct bh_lru { 1273 struct bh_lru {
1274 struct buffer_head *bhs[BH_LRU_SIZE]; 1274 struct buffer_head *bhs[BH_LRU_SIZE];
1275 }; 1275 };
1276 1276
1277 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }}; 1277 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1278 1278
1279 #ifdef CONFIG_SMP 1279 #ifdef CONFIG_SMP
1280 #define bh_lru_lock() local_irq_disable() 1280 #define bh_lru_lock() local_irq_disable()
1281 #define bh_lru_unlock() local_irq_enable() 1281 #define bh_lru_unlock() local_irq_enable()
1282 #else 1282 #else
1283 #define bh_lru_lock() preempt_disable() 1283 #define bh_lru_lock() preempt_disable()
1284 #define bh_lru_unlock() preempt_enable() 1284 #define bh_lru_unlock() preempt_enable()
1285 #endif 1285 #endif
1286 1286
1287 static inline void check_irqs_on(void) 1287 static inline void check_irqs_on(void)
1288 { 1288 {
1289 #ifdef irqs_disabled 1289 #ifdef irqs_disabled
1290 BUG_ON(irqs_disabled()); 1290 BUG_ON(irqs_disabled());
1291 #endif 1291 #endif
1292 } 1292 }
1293 1293
1294 /* 1294 /*
1295 * The LRU management algorithm is dopey-but-simple. Sorry. 1295 * The LRU management algorithm is dopey-but-simple. Sorry.
1296 */ 1296 */
1297 static void bh_lru_install(struct buffer_head *bh) 1297 static void bh_lru_install(struct buffer_head *bh)
1298 { 1298 {
1299 struct buffer_head *evictee = NULL; 1299 struct buffer_head *evictee = NULL;
1300 struct bh_lru *lru; 1300 struct bh_lru *lru;
1301 1301
1302 check_irqs_on(); 1302 check_irqs_on();
1303 bh_lru_lock(); 1303 bh_lru_lock();
1304 lru = &__get_cpu_var(bh_lrus); 1304 lru = &__get_cpu_var(bh_lrus);
1305 if (lru->bhs[0] != bh) { 1305 if (lru->bhs[0] != bh) {
1306 struct buffer_head *bhs[BH_LRU_SIZE]; 1306 struct buffer_head *bhs[BH_LRU_SIZE];
1307 int in; 1307 int in;
1308 int out = 0; 1308 int out = 0;
1309 1309
1310 get_bh(bh); 1310 get_bh(bh);
1311 bhs[out++] = bh; 1311 bhs[out++] = bh;
1312 for (in = 0; in < BH_LRU_SIZE; in++) { 1312 for (in = 0; in < BH_LRU_SIZE; in++) {
1313 struct buffer_head *bh2 = lru->bhs[in]; 1313 struct buffer_head *bh2 = lru->bhs[in];
1314 1314
1315 if (bh2 == bh) { 1315 if (bh2 == bh) {
1316 __brelse(bh2); 1316 __brelse(bh2);
1317 } else { 1317 } else {
1318 if (out >= BH_LRU_SIZE) { 1318 if (out >= BH_LRU_SIZE) {
1319 BUG_ON(evictee != NULL); 1319 BUG_ON(evictee != NULL);
1320 evictee = bh2; 1320 evictee = bh2;
1321 } else { 1321 } else {
1322 bhs[out++] = bh2; 1322 bhs[out++] = bh2;
1323 } 1323 }
1324 } 1324 }
1325 } 1325 }
1326 while (out < BH_LRU_SIZE) 1326 while (out < BH_LRU_SIZE)
1327 bhs[out++] = NULL; 1327 bhs[out++] = NULL;
1328 memcpy(lru->bhs, bhs, sizeof(bhs)); 1328 memcpy(lru->bhs, bhs, sizeof(bhs));
1329 } 1329 }
1330 bh_lru_unlock(); 1330 bh_lru_unlock();
1331 1331
1332 if (evictee) 1332 if (evictee)
1333 __brelse(evictee); 1333 __brelse(evictee);
1334 } 1334 }
1335 1335
1336 /* 1336 /*
1337 * Look up the bh in this cpu's LRU. If it's there, move it to the head. 1337 * Look up the bh in this cpu's LRU. If it's there, move it to the head.
1338 */ 1338 */
1339 static struct buffer_head * 1339 static struct buffer_head *
1340 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) 1340 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1341 { 1341 {
1342 struct buffer_head *ret = NULL; 1342 struct buffer_head *ret = NULL;
1343 struct bh_lru *lru; 1343 struct bh_lru *lru;
1344 unsigned int i; 1344 unsigned int i;
1345 1345
1346 check_irqs_on(); 1346 check_irqs_on();
1347 bh_lru_lock(); 1347 bh_lru_lock();
1348 lru = &__get_cpu_var(bh_lrus); 1348 lru = &__get_cpu_var(bh_lrus);
1349 for (i = 0; i < BH_LRU_SIZE; i++) { 1349 for (i = 0; i < BH_LRU_SIZE; i++) {
1350 struct buffer_head *bh = lru->bhs[i]; 1350 struct buffer_head *bh = lru->bhs[i];
1351 1351
1352 if (bh && bh->b_bdev == bdev && 1352 if (bh && bh->b_bdev == bdev &&
1353 bh->b_blocknr == block && bh->b_size == size) { 1353 bh->b_blocknr == block && bh->b_size == size) {
1354 if (i) { 1354 if (i) {
1355 while (i) { 1355 while (i) {
1356 lru->bhs[i] = lru->bhs[i - 1]; 1356 lru->bhs[i] = lru->bhs[i - 1];
1357 i--; 1357 i--;
1358 } 1358 }
1359 lru->bhs[0] = bh; 1359 lru->bhs[0] = bh;
1360 } 1360 }
1361 get_bh(bh); 1361 get_bh(bh);
1362 ret = bh; 1362 ret = bh;
1363 break; 1363 break;
1364 } 1364 }
1365 } 1365 }
1366 bh_lru_unlock(); 1366 bh_lru_unlock();
1367 return ret; 1367 return ret;
1368 } 1368 }
1369 1369
1370 /* 1370 /*
1371 * Perform a pagecache lookup for the matching buffer. If it's there, refresh 1371 * Perform a pagecache lookup for the matching buffer. If it's there, refresh
1372 * it in the LRU and mark it as accessed. If it is not present then return 1372 * it in the LRU and mark it as accessed. If it is not present then return
1373 * NULL 1373 * NULL
1374 */ 1374 */
1375 struct buffer_head * 1375 struct buffer_head *
1376 __find_get_block(struct block_device *bdev, sector_t block, unsigned size) 1376 __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1377 { 1377 {
1378 struct buffer_head *bh = lookup_bh_lru(bdev, block, size); 1378 struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1379 1379
1380 if (bh == NULL) { 1380 if (bh == NULL) {
1381 bh = __find_get_block_slow(bdev, block); 1381 bh = __find_get_block_slow(bdev, block);
1382 if (bh) 1382 if (bh)
1383 bh_lru_install(bh); 1383 bh_lru_install(bh);
1384 } 1384 }
1385 if (bh) 1385 if (bh)
1386 touch_buffer(bh); 1386 touch_buffer(bh);
1387 return bh; 1387 return bh;
1388 } 1388 }
1389 EXPORT_SYMBOL(__find_get_block); 1389 EXPORT_SYMBOL(__find_get_block);
1390 1390
1391 /* 1391 /*
1392 * __getblk will locate (and, if necessary, create) the buffer_head 1392 * __getblk will locate (and, if necessary, create) the buffer_head
1393 * which corresponds to the passed block_device, block and size. The 1393 * which corresponds to the passed block_device, block and size. The
1394 * returned buffer has its reference count incremented. 1394 * returned buffer has its reference count incremented.
1395 * 1395 *
1396 * __getblk() cannot fail - it just keeps trying. If you pass it an 1396 * __getblk() cannot fail - it just keeps trying. If you pass it an
1397 * illegal block number, __getblk() will happily return a buffer_head 1397 * illegal block number, __getblk() will happily return a buffer_head
1398 * which represents the non-existent block. Very weird. 1398 * which represents the non-existent block. Very weird.
1399 * 1399 *
1400 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() 1400 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1401 * attempt is failing. FIXME, perhaps? 1401 * attempt is failing. FIXME, perhaps?
1402 */ 1402 */
1403 struct buffer_head * 1403 struct buffer_head *
1404 __getblk(struct block_device *bdev, sector_t block, unsigned size) 1404 __getblk(struct block_device *bdev, sector_t block, unsigned size)
1405 { 1405 {
1406 struct buffer_head *bh = __find_get_block(bdev, block, size); 1406 struct buffer_head *bh = __find_get_block(bdev, block, size);
1407 1407
1408 might_sleep(); 1408 might_sleep();
1409 if (bh == NULL) 1409 if (bh == NULL)
1410 bh = __getblk_slow(bdev, block, size); 1410 bh = __getblk_slow(bdev, block, size);
1411 return bh; 1411 return bh;
1412 } 1412 }
1413 EXPORT_SYMBOL(__getblk); 1413 EXPORT_SYMBOL(__getblk);
1414 1414
1415 /* 1415 /*
1416 * Do async read-ahead on a buffer.. 1416 * Do async read-ahead on a buffer..
1417 */ 1417 */
1418 void __breadahead(struct block_device *bdev, sector_t block, unsigned size) 1418 void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1419 { 1419 {
1420 struct buffer_head *bh = __getblk(bdev, block, size); 1420 struct buffer_head *bh = __getblk(bdev, block, size);
1421 if (likely(bh)) { 1421 if (likely(bh)) {
1422 ll_rw_block(READA, 1, &bh); 1422 ll_rw_block(READA, 1, &bh);
1423 brelse(bh); 1423 brelse(bh);
1424 } 1424 }
1425 } 1425 }
1426 EXPORT_SYMBOL(__breadahead); 1426 EXPORT_SYMBOL(__breadahead);
1427 1427
1428 /** 1428 /**
1429 * __bread() - reads a specified block and returns the bh 1429 * __bread() - reads a specified block and returns the bh
1430 * @bdev: the block_device to read from 1430 * @bdev: the block_device to read from
1431 * @block: number of block 1431 * @block: number of block
1432 * @size: size (in bytes) to read 1432 * @size: size (in bytes) to read
1433 * 1433 *
1434 * Reads a specified block, and returns buffer head that contains it. 1434 * Reads a specified block, and returns buffer head that contains it.
1435 * It returns NULL if the block was unreadable. 1435 * It returns NULL if the block was unreadable.
1436 */ 1436 */
1437 struct buffer_head * 1437 struct buffer_head *
1438 __bread(struct block_device *bdev, sector_t block, unsigned size) 1438 __bread(struct block_device *bdev, sector_t block, unsigned size)
1439 { 1439 {
1440 struct buffer_head *bh = __getblk(bdev, block, size); 1440 struct buffer_head *bh = __getblk(bdev, block, size);
1441 1441
1442 if (likely(bh) && !buffer_uptodate(bh)) 1442 if (likely(bh) && !buffer_uptodate(bh))
1443 bh = __bread_slow(bh); 1443 bh = __bread_slow(bh);
1444 return bh; 1444 return bh;
1445 } 1445 }
1446 EXPORT_SYMBOL(__bread); 1446 EXPORT_SYMBOL(__bread);
1447 1447
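As a read-side usage sketch: __bread() returns a referenced, up-to-date buffer_head (or NULL if the block was unreadable) and the caller drops the reference with brelse(). The block number 0 and the 4096-byte block size below are illustrative assumptions, as is the example_* name.

#include <linux/kernel.h>
#include <linux/buffer_head.h>

/* Minimal sketch: read the first block of a block device and dump the
 * first few bytes of its contents. */
static int example_peek_first_block(struct block_device *bdev)
{
        struct buffer_head *bh = __bread(bdev, 0, 4096);

        if (!bh)
                return -EIO;            /* block was unreadable */
        print_hex_dump_bytes("blk0: ", DUMP_PREFIX_OFFSET, bh->b_data, 16);
        brelse(bh);                     /* release the reference __bread() took */
        return 0;
}
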
1448 /* 1448 /*
1449 * invalidate_bh_lrus() is called rarely - but not only at unmount. 1449 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1450 * This doesn't race because it runs in each cpu either in irq 1450 * This doesn't race because it runs in each cpu either in irq
1451 * or with preempt disabled. 1451 * or with preempt disabled.
1452 */ 1452 */
1453 static void invalidate_bh_lru(void *arg) 1453 static void invalidate_bh_lru(void *arg)
1454 { 1454 {
1455 struct bh_lru *b = &get_cpu_var(bh_lrus); 1455 struct bh_lru *b = &get_cpu_var(bh_lrus);
1456 int i; 1456 int i;
1457 1457
1458 for (i = 0; i < BH_LRU_SIZE; i++) { 1458 for (i = 0; i < BH_LRU_SIZE; i++) {
1459 brelse(b->bhs[i]); 1459 brelse(b->bhs[i]);
1460 b->bhs[i] = NULL; 1460 b->bhs[i] = NULL;
1461 } 1461 }
1462 put_cpu_var(bh_lrus); 1462 put_cpu_var(bh_lrus);
1463 } 1463 }
1464 1464
1465 void invalidate_bh_lrus(void) 1465 void invalidate_bh_lrus(void)
1466 { 1466 {
1467 on_each_cpu(invalidate_bh_lru, NULL, 1, 1); 1467 on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
1468 } 1468 }
1469 EXPORT_SYMBOL_GPL(invalidate_bh_lrus); 1469 EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1470 1470
1471 void set_bh_page(struct buffer_head *bh, 1471 void set_bh_page(struct buffer_head *bh,
1472 struct page *page, unsigned long offset) 1472 struct page *page, unsigned long offset)
1473 { 1473 {
1474 bh->b_page = page; 1474 bh->b_page = page;
1475 BUG_ON(offset >= PAGE_SIZE); 1475 BUG_ON(offset >= PAGE_SIZE);
1476 if (PageHighMem(page)) 1476 if (PageHighMem(page))
1477 /* 1477 /*
1478 * This catches illegal uses and preserves the offset: 1478 * This catches illegal uses and preserves the offset:
1479 */ 1479 */
1480 bh->b_data = (char *)(0 + offset); 1480 bh->b_data = (char *)(0 + offset);
1481 else 1481 else
1482 bh->b_data = page_address(page) + offset; 1482 bh->b_data = page_address(page) + offset;
1483 } 1483 }
1484 EXPORT_SYMBOL(set_bh_page); 1484 EXPORT_SYMBOL(set_bh_page);
1485 1485
1486 /* 1486 /*
1487 * Called when truncating a buffer on a page completely. 1487 * Called when truncating a buffer on a page completely.
1488 */ 1488 */
1489 static void discard_buffer(struct buffer_head * bh) 1489 static void discard_buffer(struct buffer_head * bh)
1490 { 1490 {
1491 lock_buffer(bh); 1491 lock_buffer(bh);
1492 clear_buffer_dirty(bh); 1492 clear_buffer_dirty(bh);
1493 bh->b_bdev = NULL; 1493 bh->b_bdev = NULL;
1494 clear_buffer_mapped(bh); 1494 clear_buffer_mapped(bh);
1495 clear_buffer_req(bh); 1495 clear_buffer_req(bh);
1496 clear_buffer_new(bh); 1496 clear_buffer_new(bh);
1497 clear_buffer_delay(bh); 1497 clear_buffer_delay(bh);
1498 clear_buffer_unwritten(bh); 1498 clear_buffer_unwritten(bh);
1499 unlock_buffer(bh); 1499 unlock_buffer(bh);
1500 } 1500 }
1501 1501
1502 /** 1502 /**
1503 * block_invalidatepage - invalidate part or all of a buffer-backed page 1503 * block_invalidatepage - invalidate part or all of a buffer-backed page
1504 * 1504 *
1505 * @page: the page which is affected 1505 * @page: the page which is affected
1506 * @offset: the index of the truncation point 1506 * @offset: the index of the truncation point
1507 * 1507 *
1508 * block_invalidatepage() is called when all or part of the page has become 1508 * block_invalidatepage() is called when all or part of the page has become
1509 * invalidated by a truncate operation. 1509 * invalidated by a truncate operation.
1510 * 1510 *
1511 * block_invalidatepage() does not have to release all buffers, but it must 1511 * block_invalidatepage() does not have to release all buffers, but it must
1512 * ensure that no dirty buffer is left outside @offset and that no I/O 1512 * ensure that no dirty buffer is left outside @offset and that no I/O
1513 * is underway against any of the blocks which are outside the truncation 1513 * is underway against any of the blocks which are outside the truncation
1514 * point. Because the caller is about to free (and possibly reuse) those 1514 * point. Because the caller is about to free (and possibly reuse) those
1515 * blocks on-disk. 1515 * blocks on-disk.
1516 */ 1516 */
1517 void block_invalidatepage(struct page *page, unsigned long offset) 1517 void block_invalidatepage(struct page *page, unsigned long offset)
1518 { 1518 {
1519 struct buffer_head *head, *bh, *next; 1519 struct buffer_head *head, *bh, *next;
1520 unsigned int curr_off = 0; 1520 unsigned int curr_off = 0;
1521 1521
1522 BUG_ON(!PageLocked(page)); 1522 BUG_ON(!PageLocked(page));
1523 if (!page_has_buffers(page)) 1523 if (!page_has_buffers(page))
1524 goto out; 1524 goto out;
1525 1525
1526 head = page_buffers(page); 1526 head = page_buffers(page);
1527 bh = head; 1527 bh = head;
1528 do { 1528 do {
1529 unsigned int next_off = curr_off + bh->b_size; 1529 unsigned int next_off = curr_off + bh->b_size;
1530 next = bh->b_this_page; 1530 next = bh->b_this_page;
1531 1531
1532 /* 1532 /*
1533 * is this block fully invalidated? 1533 * is this block fully invalidated?
1534 */ 1534 */
1535 if (offset <= curr_off) 1535 if (offset <= curr_off)
1536 discard_buffer(bh); 1536 discard_buffer(bh);
1537 curr_off = next_off; 1537 curr_off = next_off;
1538 bh = next; 1538 bh = next;
1539 } while (bh != head); 1539 } while (bh != head);
1540 1540
1541 /* 1541 /*
1542 * We release buffers only if the entire page is being invalidated. 1542 * We release buffers only if the entire page is being invalidated.
1543 * The get_block cached value has been unconditionally invalidated, 1543 * The get_block cached value has been unconditionally invalidated,
1544 * so real IO is not possible anymore. 1544 * so real IO is not possible anymore.
1545 */ 1545 */
1546 if (offset == 0) 1546 if (offset == 0)
1547 try_to_release_page(page, 0); 1547 try_to_release_page(page, 0);
1548 out: 1548 out:
1549 return; 1549 return;
1550 } 1550 }
1551 EXPORT_SYMBOL(block_invalidatepage); 1551 EXPORT_SYMBOL(block_invalidatepage);
1552 1552
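To show where these exported helpers usually end up, here is a sketch of how a simple block-based filesystem might wire them into its address_space_operations, handing its own get_block callback to the generic read and write paths. All examplefs_* names are hypothetical (one possible examplefs_get_block is sketched after __block_write_full_page below); block_read_full_page, block_write_full_page, block_sync_page and block_invalidatepage are the real helpers from fs/buffer.c.

#include <linux/fs.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>

/* Hypothetical filesystem callback; see the delayed-allocation sketch below. */
int examplefs_get_block(struct inode *, sector_t, struct buffer_head *, int);

static int examplefs_readpage(struct file *file, struct page *page)
{
        return block_read_full_page(page, examplefs_get_block);
}

static int examplefs_writepage(struct page *page, struct writeback_control *wbc)
{
        return block_write_full_page(page, examplefs_get_block, wbc);
}

static const struct address_space_operations examplefs_aops = {
        .readpage       = examplefs_readpage,
        .writepage      = examplefs_writepage,
        .sync_page      = block_sync_page,
        .invalidatepage = block_invalidatepage,
};
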
1553 /* 1553 /*
1554 * We attach and possibly dirty the buffers atomically wrt 1554 * We attach and possibly dirty the buffers atomically wrt
1555 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers 1555 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
1556 * is already excluded via the page lock. 1556 * is already excluded via the page lock.
1557 */ 1557 */
1558 void create_empty_buffers(struct page *page, 1558 void create_empty_buffers(struct page *page,
1559 unsigned long blocksize, unsigned long b_state) 1559 unsigned long blocksize, unsigned long b_state)
1560 { 1560 {
1561 struct buffer_head *bh, *head, *tail; 1561 struct buffer_head *bh, *head, *tail;
1562 1562
1563 head = alloc_page_buffers(page, blocksize, 1); 1563 head = alloc_page_buffers(page, blocksize, 1);
1564 bh = head; 1564 bh = head;
1565 do { 1565 do {
1566 bh->b_state |= b_state; 1566 bh->b_state |= b_state;
1567 tail = bh; 1567 tail = bh;
1568 bh = bh->b_this_page; 1568 bh = bh->b_this_page;
1569 } while (bh); 1569 } while (bh);
1570 tail->b_this_page = head; 1570 tail->b_this_page = head;
1571 1571
1572 spin_lock(&page->mapping->private_lock); 1572 spin_lock(&page->mapping->private_lock);
1573 if (PageUptodate(page) || PageDirty(page)) { 1573 if (PageUptodate(page) || PageDirty(page)) {
1574 bh = head; 1574 bh = head;
1575 do { 1575 do {
1576 if (PageDirty(page)) 1576 if (PageDirty(page))
1577 set_buffer_dirty(bh); 1577 set_buffer_dirty(bh);
1578 if (PageUptodate(page)) 1578 if (PageUptodate(page))
1579 set_buffer_uptodate(bh); 1579 set_buffer_uptodate(bh);
1580 bh = bh->b_this_page; 1580 bh = bh->b_this_page;
1581 } while (bh != head); 1581 } while (bh != head);
1582 } 1582 }
1583 attach_page_buffers(page, head); 1583 attach_page_buffers(page, head);
1584 spin_unlock(&page->mapping->private_lock); 1584 spin_unlock(&page->mapping->private_lock);
1585 } 1585 }
1586 EXPORT_SYMBOL(create_empty_buffers); 1586 EXPORT_SYMBOL(create_empty_buffers);
1587 1587
1588 /* 1588 /*
1589 * We are taking a block for data and we don't want any output from any 1589 * We are taking a block for data and we don't want any output from any
1590 * buffer-cache aliases starting from return from that function and 1590 * buffer-cache aliases starting from return from that function and
1591 * until the moment when something will explicitly mark the buffer 1591 * until the moment when something will explicitly mark the buffer
1592 * dirty (hopefully that will not happen until we free that block ;-) 1592 * dirty (hopefully that will not happen until we free that block ;-)
1593 * We don't even need to mark it not-uptodate - nobody can expect 1593 * We don't even need to mark it not-uptodate - nobody can expect
1594 * anything from a newly allocated buffer anyway. We used to use 1594 * anything from a newly allocated buffer anyway. We used to use
1595 * unmap_buffer() for such invalidation, but that was wrong. We definitely 1595 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1596 * don't want to mark the alias unmapped, for example - it would confuse 1596 * don't want to mark the alias unmapped, for example - it would confuse
1597 * anyone who might pick it with bread() afterwards... 1597 * anyone who might pick it with bread() afterwards...
1598 * 1598 *
1599 * Also.. Note that bforget() doesn't lock the buffer. So there can 1599 * Also.. Note that bforget() doesn't lock the buffer. So there can
1600 * be writeout I/O going on against recently-freed buffers. We don't 1600 * be writeout I/O going on against recently-freed buffers. We don't
1601 * wait on that I/O in bforget() - it's more efficient to wait on the I/O 1601 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1602 * only if we really need to. That happens here. 1602 * only if we really need to. That happens here.
1603 */ 1603 */
1604 void unmap_underlying_metadata(struct block_device *bdev, sector_t block) 1604 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1605 { 1605 {
1606 struct buffer_head *old_bh; 1606 struct buffer_head *old_bh;
1607 1607
1608 might_sleep(); 1608 might_sleep();
1609 1609
1610 old_bh = __find_get_block_slow(bdev, block); 1610 old_bh = __find_get_block_slow(bdev, block);
1611 if (old_bh) { 1611 if (old_bh) {
1612 clear_buffer_dirty(old_bh); 1612 clear_buffer_dirty(old_bh);
1613 wait_on_buffer(old_bh); 1613 wait_on_buffer(old_bh);
1614 clear_buffer_req(old_bh); 1614 clear_buffer_req(old_bh);
1615 __brelse(old_bh); 1615 __brelse(old_bh);
1616 } 1616 }
1617 } 1617 }
1618 EXPORT_SYMBOL(unmap_underlying_metadata); 1618 EXPORT_SYMBOL(unmap_underlying_metadata);
1619 1619
1620 /* 1620 /*
1621 * NOTE! All mapped/uptodate combinations are valid: 1621 * NOTE! All mapped/uptodate combinations are valid:
1622 * 1622 *
1623 * Mapped Uptodate Meaning 1623 * Mapped Uptodate Meaning
1624 * 1624 *
1625 * No No "unknown" - must do get_block() 1625 * No No "unknown" - must do get_block()
1626 * No Yes "hole" - zero-filled 1626 * No Yes "hole" - zero-filled
1627 * Yes No "allocated" - allocated on disk, not read in 1627 * Yes No "allocated" - allocated on disk, not read in
1628 * Yes Yes "valid" - allocated and up-to-date in memory. 1628 * Yes Yes "valid" - allocated and up-to-date in memory.
1629 * 1629 *
1630 * "Dirty" is valid only with the last case (mapped+uptodate). 1630 * "Dirty" is valid only with the last case (mapped+uptodate).
1631 */ 1631 */
1632 1632
1633 /* 1633 /*
1634 * While block_write_full_page is writing back the dirty buffers under 1634 * While block_write_full_page is writing back the dirty buffers under
1635 * the page lock, whoever dirtied the buffers may decide to clean them 1635 * the page lock, whoever dirtied the buffers may decide to clean them
1636 * again at any time. We handle that by only looking at the buffer 1636 * again at any time. We handle that by only looking at the buffer
1637 * state inside lock_buffer(). 1637 * state inside lock_buffer().
1638 * 1638 *
1639 * If block_write_full_page() is called for regular writeback 1639 * If block_write_full_page() is called for regular writeback
1640 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a 1640 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1641 * locked buffer. This only can happen if someone has written the buffer 1641 * locked buffer. This only can happen if someone has written the buffer
1642 * directly, with submit_bh(). At the address_space level PageWriteback 1642 * directly, with submit_bh(). At the address_space level PageWriteback
1643 * prevents this contention from occurring. 1643 * prevents this contention from occurring.
1644 */ 1644 */
1645 static int __block_write_full_page(struct inode *inode, struct page *page, 1645 static int __block_write_full_page(struct inode *inode, struct page *page,
1646 get_block_t *get_block, struct writeback_control *wbc) 1646 get_block_t *get_block, struct writeback_control *wbc)
1647 { 1647 {
1648 int err; 1648 int err;
1649 sector_t block; 1649 sector_t block;
1650 sector_t last_block; 1650 sector_t last_block;
1651 struct buffer_head *bh, *head; 1651 struct buffer_head *bh, *head;
1652 const unsigned blocksize = 1 << inode->i_blkbits; 1652 const unsigned blocksize = 1 << inode->i_blkbits;
1653 int nr_underway = 0; 1653 int nr_underway = 0;
1654 1654
1655 BUG_ON(!PageLocked(page)); 1655 BUG_ON(!PageLocked(page));
1656 1656
1657 last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; 1657 last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1658 1658
1659 if (!page_has_buffers(page)) { 1659 if (!page_has_buffers(page)) {
1660 create_empty_buffers(page, blocksize, 1660 create_empty_buffers(page, blocksize,
1661 (1 << BH_Dirty)|(1 << BH_Uptodate)); 1661 (1 << BH_Dirty)|(1 << BH_Uptodate));
1662 } 1662 }
1663 1663
1664 /* 1664 /*
1665 * Be very careful. We have no exclusion from __set_page_dirty_buffers 1665 * Be very careful. We have no exclusion from __set_page_dirty_buffers
1666 * here, and the (potentially unmapped) buffers may become dirty at 1666 * here, and the (potentially unmapped) buffers may become dirty at
1667 * any time. If a buffer becomes dirty here after we've inspected it 1667 * any time. If a buffer becomes dirty here after we've inspected it
1668 * then we just miss that fact, and the page stays dirty. 1668 * then we just miss that fact, and the page stays dirty.
1669 * 1669 *
1670 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; 1670 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1671 * handle that here by just cleaning them. 1671 * handle that here by just cleaning them.
1672 */ 1672 */
1673 1673
1674 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 1674 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1675 head = page_buffers(page); 1675 head = page_buffers(page);
1676 bh = head; 1676 bh = head;
1677 1677
1678 /* 1678 /*
1679 * Get all the dirty buffers mapped to disk addresses and 1679 * Get all the dirty buffers mapped to disk addresses and
1680 * handle any aliases from the underlying blockdev's mapping. 1680 * handle any aliases from the underlying blockdev's mapping.
1681 */ 1681 */
1682 do { 1682 do {
1683 if (block > last_block) { 1683 if (block > last_block) {
1684 /* 1684 /*
1685 * mapped buffers outside i_size will occur, because 1685 * mapped buffers outside i_size will occur, because
1686 * this page can be outside i_size when there is a 1686 * this page can be outside i_size when there is a
1687 * truncate in progress. 1687 * truncate in progress.
1688 */ 1688 */
1689 /* 1689 /*
1690 * The buffer was zeroed by block_write_full_page() 1690 * The buffer was zeroed by block_write_full_page()
1691 */ 1691 */
1692 clear_buffer_dirty(bh); 1692 clear_buffer_dirty(bh);
1693 set_buffer_uptodate(bh); 1693 set_buffer_uptodate(bh);
1694 } else if (!buffer_mapped(bh) && buffer_dirty(bh)) { 1694 } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1695 buffer_dirty(bh)) {
1695 WARN_ON(bh->b_size != blocksize); 1696 WARN_ON(bh->b_size != blocksize);
1696 err = get_block(inode, block, bh, 1); 1697 err = get_block(inode, block, bh, 1);
1697 if (err) 1698 if (err)
1698 goto recover; 1699 goto recover;
1700 clear_buffer_delay(bh);
1699 if (buffer_new(bh)) { 1701 if (buffer_new(bh)) {
1700 /* blockdev mappings never come here */ 1702 /* blockdev mappings never come here */
1701 clear_buffer_new(bh); 1703 clear_buffer_new(bh);
1702 unmap_underlying_metadata(bh->b_bdev, 1704 unmap_underlying_metadata(bh->b_bdev,
1703 bh->b_blocknr); 1705 bh->b_blocknr);
1704 } 1706 }
1705 } 1707 }
1706 bh = bh->b_this_page; 1708 bh = bh->b_this_page;
1707 block++; 1709 block++;
1708 } while (bh != head); 1710 } while (bh != head);
1709 1711
1710 do { 1712 do {
1711 if (!buffer_mapped(bh)) 1713 if (!buffer_mapped(bh))
1712 continue; 1714 continue;
1713 /* 1715 /*
1714 * If it's a fully non-blocking write attempt and we cannot 1716 * If it's a fully non-blocking write attempt and we cannot
1715 * lock the buffer then redirty the page. Note that this can 1717 * lock the buffer then redirty the page. Note that this can
1716 * potentially cause a busy-wait loop from pdflush and kswapd 1718 * potentially cause a busy-wait loop from pdflush and kswapd
1717 * activity, but those code paths have their own higher-level 1719 * activity, but those code paths have their own higher-level
1718 * throttling. 1720 * throttling.
1719 */ 1721 */
1720 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { 1722 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1721 lock_buffer(bh); 1723 lock_buffer(bh);
1722 } else if (test_set_buffer_locked(bh)) { 1724 } else if (test_set_buffer_locked(bh)) {
1723 redirty_page_for_writepage(wbc, page); 1725 redirty_page_for_writepage(wbc, page);
1724 continue; 1726 continue;
1725 } 1727 }
1726 if (test_clear_buffer_dirty(bh)) { 1728 if (test_clear_buffer_dirty(bh)) {
1727 mark_buffer_async_write(bh); 1729 mark_buffer_async_write(bh);
1728 } else { 1730 } else {
1729 unlock_buffer(bh); 1731 unlock_buffer(bh);
1730 } 1732 }
1731 } while ((bh = bh->b_this_page) != head); 1733 } while ((bh = bh->b_this_page) != head);
1732 1734
1733 /* 1735 /*
1734 * The page and its buffers are protected by PageWriteback(), so we can 1736 * The page and its buffers are protected by PageWriteback(), so we can
1735 * drop the bh refcounts early. 1737 * drop the bh refcounts early.
1736 */ 1738 */
1737 BUG_ON(PageWriteback(page)); 1739 BUG_ON(PageWriteback(page));
1738 set_page_writeback(page); 1740 set_page_writeback(page);
1739 1741
1740 do { 1742 do {
1741 struct buffer_head *next = bh->b_this_page; 1743 struct buffer_head *next = bh->b_this_page;
1742 if (buffer_async_write(bh)) { 1744 if (buffer_async_write(bh)) {
1743 submit_bh(WRITE, bh); 1745 submit_bh(WRITE, bh);
1744 nr_underway++; 1746 nr_underway++;
1745 } 1747 }
1746 bh = next; 1748 bh = next;
1747 } while (bh != head); 1749 } while (bh != head);
1748 unlock_page(page); 1750 unlock_page(page);
1749 1751
1750 err = 0; 1752 err = 0;
1751 done: 1753 done:
1752 if (nr_underway == 0) { 1754 if (nr_underway == 0) {
1753 /* 1755 /*
1754 * The page was marked dirty, but the buffers were 1756 * The page was marked dirty, but the buffers were
1755 * clean. Someone wrote them back by hand with 1757 * clean. Someone wrote them back by hand with
1756 * ll_rw_block/submit_bh. A rare case. 1758 * ll_rw_block/submit_bh. A rare case.
1757 */ 1759 */
1758 end_page_writeback(page); 1760 end_page_writeback(page);
1759 1761
1760 /* 1762 /*
1761 * The page and buffer_heads can be released at any time from 1763 * The page and buffer_heads can be released at any time from
1762 * here on. 1764 * here on.
1763 */ 1765 */
1764 } 1766 }
1765 return err; 1767 return err;
1766 1768
1767 recover: 1769 recover:
1768 /* 1770 /*
1769 * ENOSPC, or some other error. We may already have added some 1771 * ENOSPC, or some other error. We may already have added some
1770 * blocks to the file, so we need to write these out to avoid 1772 * blocks to the file, so we need to write these out to avoid
1771 * exposing stale data. 1773 * exposing stale data.
1772 * The page is currently locked and not marked for writeback 1774 * The page is currently locked and not marked for writeback
1773 */ 1775 */
1774 bh = head; 1776 bh = head;
1775 /* Recovery: lock and submit the mapped buffers */ 1777 /* Recovery: lock and submit the mapped buffers */
1776 do { 1778 do {
1777 if (buffer_mapped(bh) && buffer_dirty(bh)) { 1779 if (buffer_mapped(bh) && buffer_dirty(bh) &&
1780 !buffer_delay(bh)) {
1778 lock_buffer(bh); 1781 lock_buffer(bh);
1779 mark_buffer_async_write(bh); 1782 mark_buffer_async_write(bh);
1780 } else { 1783 } else {
1781 /* 1784 /*
1782 * The buffer may have been set dirty during 1785 * The buffer may have been set dirty during
1783 * attachment to a dirty page. 1786 * attachment to a dirty page.
1784 */ 1787 */
1785 clear_buffer_dirty(bh); 1788 clear_buffer_dirty(bh);
1786 } 1789 }
1787 } while ((bh = bh->b_this_page) != head); 1790 } while ((bh = bh->b_this_page) != head);
1788 SetPageError(page); 1791 SetPageError(page);
1789 BUG_ON(PageWriteback(page)); 1792 BUG_ON(PageWriteback(page));
1790 mapping_set_error(page->mapping, err); 1793 mapping_set_error(page->mapping, err);
1791 set_page_writeback(page); 1794 set_page_writeback(page);
1792 do { 1795 do {
1793 struct buffer_head *next = bh->b_this_page; 1796 struct buffer_head *next = bh->b_this_page;
1794 if (buffer_async_write(bh)) { 1797 if (buffer_async_write(bh)) {
1795 clear_buffer_dirty(bh); 1798 clear_buffer_dirty(bh);
1796 submit_bh(WRITE, bh); 1799 submit_bh(WRITE, bh);
1797 nr_underway++; 1800 nr_underway++;
1798 } 1801 }
1799 bh = next; 1802 bh = next;
1800 } while (bh != head); 1803 } while (bh != head);
1801 unlock_page(page); 1804 unlock_page(page);
1802 goto done; 1805 goto done;
1803 } 1806 }
1804 1807
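The BH_Delay case added to __block_write_full_page above is the hook for delayed allocation: a filesystem's write path can mark a dirty buffer BH_Delay instead of allocating a block, and writeback now calls get_block() with create == 1 for that buffer and then clears the delay bit. A minimal sketch of a matching get_block callback follows; examplefs_lookup_block() and examplefs_alloc_block() are purely hypothetical mapping/allocator helpers, with 0 standing in for "no block".

#include <linux/fs.h>
#include <linux/buffer_head.h>

/* Hypothetical helpers standing in for the filesystem's block map and
 * allocator; the buffer-state handling is the point of the sketch. */
extern sector_t examplefs_lookup_block(struct inode *inode, sector_t iblock);
extern sector_t examplefs_alloc_block(struct inode *inode, sector_t iblock);

int examplefs_get_block(struct inode *inode, sector_t iblock,
                        struct buffer_head *bh_result, int create)
{
        sector_t phys = examplefs_lookup_block(inode, iblock);

        if (phys)
                goto map;
        if (!create)
                return 0;       /* read of a hole: leave the buffer unmapped */

        /* A delayed (BH_Delay) buffer reaching writeback lands here:
         * allocate the on-disk block only now. */
        phys = examplefs_alloc_block(inode, iblock);
        if (!phys)
                return -ENOSPC;
        set_buffer_new(bh_result);      /* caller unmaps any metadata alias */
map:
        map_bh(bh_result, inode->i_sb, phys);   /* BH_Mapped + bdev + blocknr */
        return 0;
}
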
1805 /* 1808 /*
1806 * If a page has any new buffers, zero them out here, and mark them uptodate 1809 * If a page has any new buffers, zero them out here, and mark them uptodate
1807 * and dirty so they'll be written out (in order to prevent uninitialised 1810 * and dirty so they'll be written out (in order to prevent uninitialised
1808 * block data from leaking). And clear the new bit. 1811 * block data from leaking). And clear the new bit.
1809 */ 1812 */
1810 void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) 1813 void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1811 { 1814 {
1812 unsigned int block_start, block_end; 1815 unsigned int block_start, block_end;
1813 struct buffer_head *head, *bh; 1816 struct buffer_head *head, *bh;
1814 1817
1815 BUG_ON(!PageLocked(page)); 1818 BUG_ON(!PageLocked(page));
1816 if (!page_has_buffers(page)) 1819 if (!page_has_buffers(page))
1817 return; 1820 return;
1818 1821
1819 bh = head = page_buffers(page); 1822 bh = head = page_buffers(page);
1820 block_start = 0; 1823 block_start = 0;
1821 do { 1824 do {
1822 block_end = block_start + bh->b_size; 1825 block_end = block_start + bh->b_size;
1823 1826
1824 if (buffer_new(bh)) { 1827 if (buffer_new(bh)) {
1825 if (block_end > from && block_start < to) { 1828 if (block_end > from && block_start < to) {
1826 if (!PageUptodate(page)) { 1829 if (!PageUptodate(page)) {
1827 unsigned start, size; 1830 unsigned start, size;
1828 1831
1829 start = max(from, block_start); 1832 start = max(from, block_start);
1830 size = min(to, block_end) - start; 1833 size = min(to, block_end) - start;
1831 1834
1832 zero_user(page, start, size); 1835 zero_user(page, start, size);
1833 set_buffer_uptodate(bh); 1836 set_buffer_uptodate(bh);
1834 } 1837 }
1835 1838
1836 clear_buffer_new(bh); 1839 clear_buffer_new(bh);
1837 mark_buffer_dirty(bh); 1840 mark_buffer_dirty(bh);
1838 } 1841 }
1839 } 1842 }
1840 1843
1841 block_start = block_end; 1844 block_start = block_end;
1842 bh = bh->b_this_page; 1845 bh = bh->b_this_page;
1843 } while (bh != head); 1846 } while (bh != head);
1844 } 1847 }
1845 EXPORT_SYMBOL(page_zero_new_buffers); 1848 EXPORT_SYMBOL(page_zero_new_buffers);
1846 1849
1847 static int __block_prepare_write(struct inode *inode, struct page *page, 1850 static int __block_prepare_write(struct inode *inode, struct page *page,
1848 unsigned from, unsigned to, get_block_t *get_block) 1851 unsigned from, unsigned to, get_block_t *get_block)
1849 { 1852 {
1850 unsigned block_start, block_end; 1853 unsigned block_start, block_end;
1851 sector_t block; 1854 sector_t block;
1852 int err = 0; 1855 int err = 0;
1853 unsigned blocksize, bbits; 1856 unsigned blocksize, bbits;
1854 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; 1857 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1855 1858
1856 BUG_ON(!PageLocked(page)); 1859 BUG_ON(!PageLocked(page));
1857 BUG_ON(from > PAGE_CACHE_SIZE); 1860 BUG_ON(from > PAGE_CACHE_SIZE);
1858 BUG_ON(to > PAGE_CACHE_SIZE); 1861 BUG_ON(to > PAGE_CACHE_SIZE);
1859 BUG_ON(from > to); 1862 BUG_ON(from > to);
1860 1863
1861 blocksize = 1 << inode->i_blkbits; 1864 blocksize = 1 << inode->i_blkbits;
1862 if (!page_has_buffers(page)) 1865 if (!page_has_buffers(page))
1863 create_empty_buffers(page, blocksize, 0); 1866 create_empty_buffers(page, blocksize, 0);
1864 head = page_buffers(page); 1867 head = page_buffers(page);
1865 1868
1866 bbits = inode->i_blkbits; 1869 bbits = inode->i_blkbits;
1867 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); 1870 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1868 1871
1869 for(bh = head, block_start = 0; bh != head || !block_start; 1872 for(bh = head, block_start = 0; bh != head || !block_start;
1870 block++, block_start=block_end, bh = bh->b_this_page) { 1873 block++, block_start=block_end, bh = bh->b_this_page) {
1871 block_end = block_start + blocksize; 1874 block_end = block_start + blocksize;
1872 if (block_end <= from || block_start >= to) { 1875 if (block_end <= from || block_start >= to) {
1873 if (PageUptodate(page)) { 1876 if (PageUptodate(page)) {
1874 if (!buffer_uptodate(bh)) 1877 if (!buffer_uptodate(bh))
1875 set_buffer_uptodate(bh); 1878 set_buffer_uptodate(bh);
1876 } 1879 }
1877 continue; 1880 continue;
1878 } 1881 }
1879 if (buffer_new(bh)) 1882 if (buffer_new(bh))
1880 clear_buffer_new(bh); 1883 clear_buffer_new(bh);
1881 if (!buffer_mapped(bh)) { 1884 if (!buffer_mapped(bh)) {
1882 WARN_ON(bh->b_size != blocksize); 1885 WARN_ON(bh->b_size != blocksize);
1883 err = get_block(inode, block, bh, 1); 1886 err = get_block(inode, block, bh, 1);
1884 if (err) 1887 if (err)
1885 break; 1888 break;
1886 if (buffer_new(bh)) { 1889 if (buffer_new(bh)) {
1887 unmap_underlying_metadata(bh->b_bdev, 1890 unmap_underlying_metadata(bh->b_bdev,
1888 bh->b_blocknr); 1891 bh->b_blocknr);
1889 if (PageUptodate(page)) { 1892 if (PageUptodate(page)) {
1890 clear_buffer_new(bh); 1893 clear_buffer_new(bh);
1891 set_buffer_uptodate(bh); 1894 set_buffer_uptodate(bh);
1892 mark_buffer_dirty(bh); 1895 mark_buffer_dirty(bh);
1893 continue; 1896 continue;
1894 } 1897 }
1895 if (block_end > to || block_start < from) 1898 if (block_end > to || block_start < from)
1896 zero_user_segments(page, 1899 zero_user_segments(page,
1897 to, block_end, 1900 to, block_end,
1898 block_start, from); 1901 block_start, from);
1899 continue; 1902 continue;
1900 } 1903 }
1901 } 1904 }
1902 if (PageUptodate(page)) { 1905 if (PageUptodate(page)) {
1903 if (!buffer_uptodate(bh)) 1906 if (!buffer_uptodate(bh))
1904 set_buffer_uptodate(bh); 1907 set_buffer_uptodate(bh);
1905 continue; 1908 continue;
1906 } 1909 }
1907 if (!buffer_uptodate(bh) && !buffer_delay(bh) && 1910 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1908 !buffer_unwritten(bh) && 1911 !buffer_unwritten(bh) &&
1909 (block_start < from || block_end > to)) { 1912 (block_start < from || block_end > to)) {
1910 ll_rw_block(READ, 1, &bh); 1913 ll_rw_block(READ, 1, &bh);
1911 *wait_bh++=bh; 1914 *wait_bh++=bh;
1912 } 1915 }
1913 } 1916 }
1914 /* 1917 /*
1915 * If we issued read requests - let them complete. 1918 * If we issued read requests - let them complete.
1916 */ 1919 */
1917 while(wait_bh > wait) { 1920 while(wait_bh > wait) {
1918 wait_on_buffer(*--wait_bh); 1921 wait_on_buffer(*--wait_bh);
1919 if (!buffer_uptodate(*wait_bh)) 1922 if (!buffer_uptodate(*wait_bh))
1920 err = -EIO; 1923 err = -EIO;
1921 } 1924 }
1922 if (unlikely(err)) 1925 if (unlikely(err))
1923 page_zero_new_buffers(page, from, to); 1926 page_zero_new_buffers(page, from, to);
1924 return err; 1927 return err;
1925 } 1928 }
1926 1929
1927 static int __block_commit_write(struct inode *inode, struct page *page, 1930 static int __block_commit_write(struct inode *inode, struct page *page,
1928 unsigned from, unsigned to) 1931 unsigned from, unsigned to)
1929 { 1932 {
1930 unsigned block_start, block_end; 1933 unsigned block_start, block_end;
1931 int partial = 0; 1934 int partial = 0;
1932 unsigned blocksize; 1935 unsigned blocksize;
1933 struct buffer_head *bh, *head; 1936 struct buffer_head *bh, *head;
1934 1937
1935 blocksize = 1 << inode->i_blkbits; 1938 blocksize = 1 << inode->i_blkbits;
1936 1939
1937 for(bh = head = page_buffers(page), block_start = 0; 1940 for(bh = head = page_buffers(page), block_start = 0;
1938 bh != head || !block_start; 1941 bh != head || !block_start;
1939 block_start=block_end, bh = bh->b_this_page) { 1942 block_start=block_end, bh = bh->b_this_page) {
1940 block_end = block_start + blocksize; 1943 block_end = block_start + blocksize;
1941 if (block_end <= from || block_start >= to) { 1944 if (block_end <= from || block_start >= to) {
1942 if (!buffer_uptodate(bh)) 1945 if (!buffer_uptodate(bh))
1943 partial = 1; 1946 partial = 1;
1944 } else { 1947 } else {
1945 set_buffer_uptodate(bh); 1948 set_buffer_uptodate(bh);
1946 mark_buffer_dirty(bh); 1949 mark_buffer_dirty(bh);
1947 } 1950 }
1948 clear_buffer_new(bh); 1951 clear_buffer_new(bh);
1949 } 1952 }
1950 1953
1951 /* 1954 /*
1952 * If this is a partial write which happened to make all buffers 1955 * If this is a partial write which happened to make all buffers
1953 * uptodate then we can optimize away a bogus readpage() for 1956 * uptodate then we can optimize away a bogus readpage() for
1954 * the next read(). Here we 'discover' whether the page went 1957 * the next read(). Here we 'discover' whether the page went
1955 * uptodate as a result of this (potentially partial) write. 1958 * uptodate as a result of this (potentially partial) write.
1956 */ 1959 */
1957 if (!partial) 1960 if (!partial)
1958 SetPageUptodate(page); 1961 SetPageUptodate(page);
1959 return 0; 1962 return 0;
1960 } 1963 }
1961 1964
1962 /* 1965 /*
1963 * block_write_begin takes care of the basic task of block allocation and 1966 * block_write_begin takes care of the basic task of block allocation and
1964 * bringing partial write blocks uptodate first. 1967 * bringing partial write blocks uptodate first.
1965 * 1968 *
1966 * If *pagep is not NULL, then block_write_begin uses the locked page 1969 * If *pagep is not NULL, then block_write_begin uses the locked page
1967 * at *pagep rather than allocating its own. In this case, the page will 1970 * at *pagep rather than allocating its own. In this case, the page will
1968 * not be unlocked or deallocated on failure. 1971 * not be unlocked or deallocated on failure.
1969 */ 1972 */
1970 int block_write_begin(struct file *file, struct address_space *mapping, 1973 int block_write_begin(struct file *file, struct address_space *mapping,
1971 loff_t pos, unsigned len, unsigned flags, 1974 loff_t pos, unsigned len, unsigned flags,
1972 struct page **pagep, void **fsdata, 1975 struct page **pagep, void **fsdata,
1973 get_block_t *get_block) 1976 get_block_t *get_block)
1974 { 1977 {
1975 struct inode *inode = mapping->host; 1978 struct inode *inode = mapping->host;
1976 int status = 0; 1979 int status = 0;
1977 struct page *page; 1980 struct page *page;
1978 pgoff_t index; 1981 pgoff_t index;
1979 unsigned start, end; 1982 unsigned start, end;
1980 int ownpage = 0; 1983 int ownpage = 0;
1981 1984
1982 index = pos >> PAGE_CACHE_SHIFT; 1985 index = pos >> PAGE_CACHE_SHIFT;
1983 start = pos & (PAGE_CACHE_SIZE - 1); 1986 start = pos & (PAGE_CACHE_SIZE - 1);
1984 end = start + len; 1987 end = start + len;
1985 1988
1986 page = *pagep; 1989 page = *pagep;
1987 if (page == NULL) { 1990 if (page == NULL) {
1988 ownpage = 1; 1991 ownpage = 1;
1989 page = __grab_cache_page(mapping, index); 1992 page = __grab_cache_page(mapping, index);
1990 if (!page) { 1993 if (!page) {
1991 status = -ENOMEM; 1994 status = -ENOMEM;
1992 goto out; 1995 goto out;
1993 } 1996 }
1994 *pagep = page; 1997 *pagep = page;
1995 } else 1998 } else
1996 BUG_ON(!PageLocked(page)); 1999 BUG_ON(!PageLocked(page));
1997 2000
1998 status = __block_prepare_write(inode, page, start, end, get_block); 2001 status = __block_prepare_write(inode, page, start, end, get_block);
1999 if (unlikely(status)) { 2002 if (unlikely(status)) {
2000 ClearPageUptodate(page); 2003 ClearPageUptodate(page);
2001 2004
2002 if (ownpage) { 2005 if (ownpage) {
2003 unlock_page(page); 2006 unlock_page(page);
2004 page_cache_release(page); 2007 page_cache_release(page);
2005 *pagep = NULL; 2008 *pagep = NULL;
2006 2009
2007 /* 2010 /*
2008 * prepare_write() may have instantiated a few blocks 2011 * prepare_write() may have instantiated a few blocks
2009 * outside i_size. Trim these off again. Don't need 2012 * outside i_size. Trim these off again. Don't need
2010 * i_size_read because we hold i_mutex. 2013 * i_size_read because we hold i_mutex.
2011 */ 2014 */
2012 if (pos + len > inode->i_size) 2015 if (pos + len > inode->i_size)
2013 vmtruncate(inode, inode->i_size); 2016 vmtruncate(inode, inode->i_size);
2014 } 2017 }
2015 goto out; 2018 goto out;
2016 } 2019 }
2017 2020
2018 out: 2021 out:
2019 return status; 2022 return status;
2020 } 2023 }
2021 EXPORT_SYMBOL(block_write_begin); 2024 EXPORT_SYMBOL(block_write_begin);
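A filesystem typically wraps block_write_begin() directly in its ->write_begin address_space operation, passing NULL in *pagep so the helper grabs and locks the page itself. A minimal sketch under that assumption; myfs_write_begin and myfs_get_block are hypothetical names, not part of this patch:

	static int myfs_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
	{
		*pagep = NULL;	/* let block_write_begin allocate and lock the page */
		return block_write_begin(file, mapping, pos, len, flags,
					 pagep, fsdata, myfs_get_block);
	}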
2022 2025
2023 int block_write_end(struct file *file, struct address_space *mapping, 2026 int block_write_end(struct file *file, struct address_space *mapping,
2024 loff_t pos, unsigned len, unsigned copied, 2027 loff_t pos, unsigned len, unsigned copied,
2025 struct page *page, void *fsdata) 2028 struct page *page, void *fsdata)
2026 { 2029 {
2027 struct inode *inode = mapping->host; 2030 struct inode *inode = mapping->host;
2028 unsigned start; 2031 unsigned start;
2029 2032
2030 start = pos & (PAGE_CACHE_SIZE - 1); 2033 start = pos & (PAGE_CACHE_SIZE - 1);
2031 2034
2032 if (unlikely(copied < len)) { 2035 if (unlikely(copied < len)) {
2033 /* 2036 /*
2034 * The buffers that were written will now be uptodate, so we 2037 * The buffers that were written will now be uptodate, so we
2035 * don't have to worry about a readpage reading them and 2038 * don't have to worry about a readpage reading them and
2036 * overwriting a partial write. However if we have encountered 2039 * overwriting a partial write. However if we have encountered
2037 * a short write and only partially written into a buffer, it 2040 * a short write and only partially written into a buffer, it
2038 * will not be marked uptodate, so a readpage might come in and 2041 * will not be marked uptodate, so a readpage might come in and
2039 * destroy our partial write. 2042 * destroy our partial write.
2040 * 2043 *
2041 * Do the simplest thing, and just treat any short write to a 2044 * Do the simplest thing, and just treat any short write to a
2042 * non uptodate page as a zero-length write, and force the 2045 * non uptodate page as a zero-length write, and force the
2043 * caller to redo the whole thing. 2046 * caller to redo the whole thing.
2044 */ 2047 */
2045 if (!PageUptodate(page)) 2048 if (!PageUptodate(page))
2046 copied = 0; 2049 copied = 0;
2047 2050
2048 page_zero_new_buffers(page, start+copied, start+len); 2051 page_zero_new_buffers(page, start+copied, start+len);
2049 } 2052 }
2050 flush_dcache_page(page); 2053 flush_dcache_page(page);
2051 2054
2052 /* This could be a short (even 0-length) commit */ 2055 /* This could be a short (even 0-length) commit */
2053 __block_commit_write(inode, page, start, start+copied); 2056 __block_commit_write(inode, page, start, start+copied);
2054 2057
2055 return copied; 2058 return copied;
2056 } 2059 }
2057 EXPORT_SYMBOL(block_write_end); 2060 EXPORT_SYMBOL(block_write_end);
2058 2061
2059 int generic_write_end(struct file *file, struct address_space *mapping, 2062 int generic_write_end(struct file *file, struct address_space *mapping,
2060 loff_t pos, unsigned len, unsigned copied, 2063 loff_t pos, unsigned len, unsigned copied,
2061 struct page *page, void *fsdata) 2064 struct page *page, void *fsdata)
2062 { 2065 {
2063 struct inode *inode = mapping->host; 2066 struct inode *inode = mapping->host;
2064 int i_size_changed = 0; 2067 int i_size_changed = 0;
2065 2068
2066 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 2069 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2067 2070
2068 /* 2071 /*
2069 * No need to use i_size_read() here, the i_size 2072 * No need to use i_size_read() here, the i_size
2070 * cannot change under us because we hold i_mutex. 2073 * cannot change under us because we hold i_mutex.
2071 * 2074 *
2072 * But it's important to update i_size while still holding page lock: 2075 * But it's important to update i_size while still holding page lock:
2073 * page writeout could otherwise come in and zero beyond i_size. 2076 * page writeout could otherwise come in and zero beyond i_size.
2074 */ 2077 */
2075 if (pos+copied > inode->i_size) { 2078 if (pos+copied > inode->i_size) {
2076 i_size_write(inode, pos+copied); 2079 i_size_write(inode, pos+copied);
2077 i_size_changed = 1; 2080 i_size_changed = 1;
2078 } 2081 }
2079 2082
2080 unlock_page(page); 2083 unlock_page(page);
2081 page_cache_release(page); 2084 page_cache_release(page);
2082 2085
2083 /* 2086 /*
2084 * Don't mark the inode dirty under page lock. First, it unnecessarily 2087 * Don't mark the inode dirty under page lock. First, it unnecessarily
2085 * makes the holding time of page lock longer. Second, it forces lock 2088 * makes the holding time of page lock longer. Second, it forces lock
2086 * ordering of page lock and transaction start for journaling 2089 * ordering of page lock and transaction start for journaling
2087 * filesystems. 2090 * filesystems.
2088 */ 2091 */
2089 if (i_size_changed) 2092 if (i_size_changed)
2090 mark_inode_dirty(inode); 2093 mark_inode_dirty(inode);
2091 2094
2092 return copied; 2095 return copied;
2093 } 2096 }
2094 EXPORT_SYMBOL(generic_write_end); 2097 EXPORT_SYMBOL(generic_write_end);
2095 2098
2096 /* 2099 /*
2097 * Generic "read page" function for block devices that have the normal 2100 * Generic "read page" function for block devices that have the normal
2098 * get_block functionality. This is most of the block device filesystems. 2101 * get_block functionality. This is most of the block device filesystems.
2099 * Reads the page asynchronously --- the unlock_buffer() and 2102 * Reads the page asynchronously --- the unlock_buffer() and
2100 * set/clear_buffer_uptodate() functions propagate buffer state into the 2103 * set/clear_buffer_uptodate() functions propagate buffer state into the
2101 * page struct once IO has completed. 2104 * page struct once IO has completed.
2102 */ 2105 */
2103 int block_read_full_page(struct page *page, get_block_t *get_block) 2106 int block_read_full_page(struct page *page, get_block_t *get_block)
2104 { 2107 {
2105 struct inode *inode = page->mapping->host; 2108 struct inode *inode = page->mapping->host;
2106 sector_t iblock, lblock; 2109 sector_t iblock, lblock;
2107 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; 2110 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2108 unsigned int blocksize; 2111 unsigned int blocksize;
2109 int nr, i; 2112 int nr, i;
2110 int fully_mapped = 1; 2113 int fully_mapped = 1;
2111 2114
2112 BUG_ON(!PageLocked(page)); 2115 BUG_ON(!PageLocked(page));
2113 blocksize = 1 << inode->i_blkbits; 2116 blocksize = 1 << inode->i_blkbits;
2114 if (!page_has_buffers(page)) 2117 if (!page_has_buffers(page))
2115 create_empty_buffers(page, blocksize, 0); 2118 create_empty_buffers(page, blocksize, 0);
2116 head = page_buffers(page); 2119 head = page_buffers(page);
2117 2120
2118 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 2121 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2119 lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits; 2122 lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2120 bh = head; 2123 bh = head;
2121 nr = 0; 2124 nr = 0;
2122 i = 0; 2125 i = 0;
2123 2126
2124 do { 2127 do {
2125 if (buffer_uptodate(bh)) 2128 if (buffer_uptodate(bh))
2126 continue; 2129 continue;
2127 2130
2128 if (!buffer_mapped(bh)) { 2131 if (!buffer_mapped(bh)) {
2129 int err = 0; 2132 int err = 0;
2130 2133
2131 fully_mapped = 0; 2134 fully_mapped = 0;
2132 if (iblock < lblock) { 2135 if (iblock < lblock) {
2133 WARN_ON(bh->b_size != blocksize); 2136 WARN_ON(bh->b_size != blocksize);
2134 err = get_block(inode, iblock, bh, 0); 2137 err = get_block(inode, iblock, bh, 0);
2135 if (err) 2138 if (err)
2136 SetPageError(page); 2139 SetPageError(page);
2137 } 2140 }
2138 if (!buffer_mapped(bh)) { 2141 if (!buffer_mapped(bh)) {
2139 zero_user(page, i * blocksize, blocksize); 2142 zero_user(page, i * blocksize, blocksize);
2140 if (!err) 2143 if (!err)
2141 set_buffer_uptodate(bh); 2144 set_buffer_uptodate(bh);
2142 continue; 2145 continue;
2143 } 2146 }
2144 /* 2147 /*
2145 * get_block() might have updated the buffer 2148 * get_block() might have updated the buffer
2146 * synchronously 2149 * synchronously
2147 */ 2150 */
2148 if (buffer_uptodate(bh)) 2151 if (buffer_uptodate(bh))
2149 continue; 2152 continue;
2150 } 2153 }
2151 arr[nr++] = bh; 2154 arr[nr++] = bh;
2152 } while (i++, iblock++, (bh = bh->b_this_page) != head); 2155 } while (i++, iblock++, (bh = bh->b_this_page) != head);
2153 2156
2154 if (fully_mapped) 2157 if (fully_mapped)
2155 SetPageMappedToDisk(page); 2158 SetPageMappedToDisk(page);
2156 2159
2157 if (!nr) { 2160 if (!nr) {
2158 /* 2161 /*
2159 * All buffers are uptodate - we can set the page uptodate 2162 * All buffers are uptodate - we can set the page uptodate
2160 * as well. But not if get_block() returned an error. 2163 * as well. But not if get_block() returned an error.
2161 */ 2164 */
2162 if (!PageError(page)) 2165 if (!PageError(page))
2163 SetPageUptodate(page); 2166 SetPageUptodate(page);
2164 unlock_page(page); 2167 unlock_page(page);
2165 return 0; 2168 return 0;
2166 } 2169 }
2167 2170
2168 /* Stage two: lock the buffers */ 2171 /* Stage two: lock the buffers */
2169 for (i = 0; i < nr; i++) { 2172 for (i = 0; i < nr; i++) {
2170 bh = arr[i]; 2173 bh = arr[i];
2171 lock_buffer(bh); 2174 lock_buffer(bh);
2172 mark_buffer_async_read(bh); 2175 mark_buffer_async_read(bh);
2173 } 2176 }
2174 2177
2175 /* 2178 /*
2176 * Stage 3: start the IO. Check for uptodateness 2179 * Stage 3: start the IO. Check for uptodateness
2177 * inside the buffer lock in case another process reading 2180 * inside the buffer lock in case another process reading
2178 * the underlying blockdev brought it uptodate (the sct fix). 2181 * the underlying blockdev brought it uptodate (the sct fix).
2179 */ 2182 */
2180 for (i = 0; i < nr; i++) { 2183 for (i = 0; i < nr; i++) {
2181 bh = arr[i]; 2184 bh = arr[i];
2182 if (buffer_uptodate(bh)) 2185 if (buffer_uptodate(bh))
2183 end_buffer_async_read(bh, 1); 2186 end_buffer_async_read(bh, 1);
2184 else 2187 else
2185 submit_bh(READ, bh); 2188 submit_bh(READ, bh);
2186 } 2189 }
2187 return 0; 2190 return 0;
2188 } 2191 }
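Since block_read_full_page() takes the filesystem's get_block directly, the usual ->readpage hook is a one-line wrapper; a hedged sketch with a hypothetical myfs_get_block:

	static int myfs_readpage(struct file *file, struct page *page)
	{
		return block_read_full_page(page, myfs_get_block);
	}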
2189 2192
2190 /* utility function for filesystems that need to do work on expanding 2193 /* utility function for filesystems that need to do work on expanding
2191 * truncates. Uses filesystem pagecache writes to allow the filesystem to 2194 * truncates. Uses filesystem pagecache writes to allow the filesystem to
2192 * deal with the hole. 2195 * deal with the hole.
2193 */ 2196 */
2194 int generic_cont_expand_simple(struct inode *inode, loff_t size) 2197 int generic_cont_expand_simple(struct inode *inode, loff_t size)
2195 { 2198 {
2196 struct address_space *mapping = inode->i_mapping; 2199 struct address_space *mapping = inode->i_mapping;
2197 struct page *page; 2200 struct page *page;
2198 void *fsdata; 2201 void *fsdata;
2199 unsigned long limit; 2202 unsigned long limit;
2200 int err; 2203 int err;
2201 2204
2202 err = -EFBIG; 2205 err = -EFBIG;
2203 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; 2206 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2204 if (limit != RLIM_INFINITY && size > (loff_t)limit) { 2207 if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2205 send_sig(SIGXFSZ, current, 0); 2208 send_sig(SIGXFSZ, current, 0);
2206 goto out; 2209 goto out;
2207 } 2210 }
2208 if (size > inode->i_sb->s_maxbytes) 2211 if (size > inode->i_sb->s_maxbytes)
2209 goto out; 2212 goto out;
2210 2213
2211 err = pagecache_write_begin(NULL, mapping, size, 0, 2214 err = pagecache_write_begin(NULL, mapping, size, 0,
2212 AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND, 2215 AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2213 &page, &fsdata); 2216 &page, &fsdata);
2214 if (err) 2217 if (err)
2215 goto out; 2218 goto out;
2216 2219
2217 err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata); 2220 err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2218 BUG_ON(err > 0); 2221 BUG_ON(err > 0);
2219 2222
2220 out: 2223 out:
2221 return err; 2224 return err;
2222 } 2225 }
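A filesystem that needs to extend the file on a size-increasing setattr can call this helper before committing the new attributes. A rough sketch only, with permission checks omitted; myfs_setattr is a hypothetical name:

	static int myfs_setattr(struct dentry *dentry, struct iattr *attr)
	{
		struct inode *inode = dentry->d_inode;
		int err = 0;

		if ((attr->ia_valid & ATTR_SIZE) &&
		    attr->ia_size > i_size_read(inode))
			err = generic_cont_expand_simple(inode, attr->ia_size);
		if (err)
			return err;
		return inode_setattr(inode, attr);
	}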
2223 2226
2224 static int cont_expand_zero(struct file *file, struct address_space *mapping, 2227 static int cont_expand_zero(struct file *file, struct address_space *mapping,
2225 loff_t pos, loff_t *bytes) 2228 loff_t pos, loff_t *bytes)
2226 { 2229 {
2227 struct inode *inode = mapping->host; 2230 struct inode *inode = mapping->host;
2228 unsigned blocksize = 1 << inode->i_blkbits; 2231 unsigned blocksize = 1 << inode->i_blkbits;
2229 struct page *page; 2232 struct page *page;
2230 void *fsdata; 2233 void *fsdata;
2231 pgoff_t index, curidx; 2234 pgoff_t index, curidx;
2232 loff_t curpos; 2235 loff_t curpos;
2233 unsigned zerofrom, offset, len; 2236 unsigned zerofrom, offset, len;
2234 int err = 0; 2237 int err = 0;
2235 2238
2236 index = pos >> PAGE_CACHE_SHIFT; 2239 index = pos >> PAGE_CACHE_SHIFT;
2237 offset = pos & ~PAGE_CACHE_MASK; 2240 offset = pos & ~PAGE_CACHE_MASK;
2238 2241
2239 while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) { 2242 while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
2240 zerofrom = curpos & ~PAGE_CACHE_MASK; 2243 zerofrom = curpos & ~PAGE_CACHE_MASK;
2241 if (zerofrom & (blocksize-1)) { 2244 if (zerofrom & (blocksize-1)) {
2242 *bytes |= (blocksize-1); 2245 *bytes |= (blocksize-1);
2243 (*bytes)++; 2246 (*bytes)++;
2244 } 2247 }
2245 len = PAGE_CACHE_SIZE - zerofrom; 2248 len = PAGE_CACHE_SIZE - zerofrom;
2246 2249
2247 err = pagecache_write_begin(file, mapping, curpos, len, 2250 err = pagecache_write_begin(file, mapping, curpos, len,
2248 AOP_FLAG_UNINTERRUPTIBLE, 2251 AOP_FLAG_UNINTERRUPTIBLE,
2249 &page, &fsdata); 2252 &page, &fsdata);
2250 if (err) 2253 if (err)
2251 goto out; 2254 goto out;
2252 zero_user(page, zerofrom, len); 2255 zero_user(page, zerofrom, len);
2253 err = pagecache_write_end(file, mapping, curpos, len, len, 2256 err = pagecache_write_end(file, mapping, curpos, len, len,
2254 page, fsdata); 2257 page, fsdata);
2255 if (err < 0) 2258 if (err < 0)
2256 goto out; 2259 goto out;
2257 BUG_ON(err != len); 2260 BUG_ON(err != len);
2258 err = 0; 2261 err = 0;
2259 2262
2260 balance_dirty_pages_ratelimited(mapping); 2263 balance_dirty_pages_ratelimited(mapping);
2261 } 2264 }
2262 2265
2263 /* page covers the boundary, find the boundary offset */ 2266 /* page covers the boundary, find the boundary offset */
2264 if (index == curidx) { 2267 if (index == curidx) {
2265 zerofrom = curpos & ~PAGE_CACHE_MASK; 2268 zerofrom = curpos & ~PAGE_CACHE_MASK;
2266 /* if we are going to expand the file, the last block will be filled */ 2269 /* if we are going to expand the file, the last block will be filled */
2267 if (offset <= zerofrom) { 2270 if (offset <= zerofrom) {
2268 goto out; 2271 goto out;
2269 } 2272 }
2270 if (zerofrom & (blocksize-1)) { 2273 if (zerofrom & (blocksize-1)) {
2271 *bytes |= (blocksize-1); 2274 *bytes |= (blocksize-1);
2272 (*bytes)++; 2275 (*bytes)++;
2273 } 2276 }
2274 len = offset - zerofrom; 2277 len = offset - zerofrom;
2275 2278
2276 err = pagecache_write_begin(file, mapping, curpos, len, 2279 err = pagecache_write_begin(file, mapping, curpos, len,
2277 AOP_FLAG_UNINTERRUPTIBLE, 2280 AOP_FLAG_UNINTERRUPTIBLE,
2278 &page, &fsdata); 2281 &page, &fsdata);
2279 if (err) 2282 if (err)
2280 goto out; 2283 goto out;
2281 zero_user(page, zerofrom, len); 2284 zero_user(page, zerofrom, len);
2282 err = pagecache_write_end(file, mapping, curpos, len, len, 2285 err = pagecache_write_end(file, mapping, curpos, len, len,
2283 page, fsdata); 2286 page, fsdata);
2284 if (err < 0) 2287 if (err < 0)
2285 goto out; 2288 goto out;
2286 BUG_ON(err != len); 2289 BUG_ON(err != len);
2287 err = 0; 2290 err = 0;
2288 } 2291 }
2289 out: 2292 out:
2290 return err; 2293 return err;
2291 } 2294 }
2292 2295
2293 /* 2296 /*
2294 * For moronic filesystems that do not allow holes in files. 2297 * For moronic filesystems that do not allow holes in files.
2295 * We may have to extend the file. 2298 * We may have to extend the file.
2296 */ 2299 */
2297 int cont_write_begin(struct file *file, struct address_space *mapping, 2300 int cont_write_begin(struct file *file, struct address_space *mapping,
2298 loff_t pos, unsigned len, unsigned flags, 2301 loff_t pos, unsigned len, unsigned flags,
2299 struct page **pagep, void **fsdata, 2302 struct page **pagep, void **fsdata,
2300 get_block_t *get_block, loff_t *bytes) 2303 get_block_t *get_block, loff_t *bytes)
2301 { 2304 {
2302 struct inode *inode = mapping->host; 2305 struct inode *inode = mapping->host;
2303 unsigned blocksize = 1 << inode->i_blkbits; 2306 unsigned blocksize = 1 << inode->i_blkbits;
2304 unsigned zerofrom; 2307 unsigned zerofrom;
2305 int err; 2308 int err;
2306 2309
2307 err = cont_expand_zero(file, mapping, pos, bytes); 2310 err = cont_expand_zero(file, mapping, pos, bytes);
2308 if (err) 2311 if (err)
2309 goto out; 2312 goto out;
2310 2313
2311 zerofrom = *bytes & ~PAGE_CACHE_MASK; 2314 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2312 if (pos+len > *bytes && zerofrom & (blocksize-1)) { 2315 if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2313 *bytes |= (blocksize-1); 2316 *bytes |= (blocksize-1);
2314 (*bytes)++; 2317 (*bytes)++;
2315 } 2318 }
2316 2319
2317 *pagep = NULL; 2320 *pagep = NULL;
2318 err = block_write_begin(file, mapping, pos, len, 2321 err = block_write_begin(file, mapping, pos, len,
2319 flags, pagep, fsdata, get_block); 2322 flags, pagep, fsdata, get_block);
2320 out: 2323 out:
2321 return err; 2324 return err;
2322 } 2325 }
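The *bytes cookie is a per-inode loff_t the filesystem keeps, recording how far the file has been allocated and zeroed so far; cont_expand_zero() advances it as it fills the gap. A hedged sketch for such a filesystem, where NOHOLEFS_I(inode)->i_allocated, noholefs_write_begin and noholefs_get_block are all hypothetical:

	static int noholefs_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
	{
		*pagep = NULL;
		return cont_write_begin(file, mapping, pos, len, flags,
					pagep, fsdata, noholefs_get_block,
					&NOHOLEFS_I(mapping->host)->i_allocated);
	}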
2323 2326
2324 int block_prepare_write(struct page *page, unsigned from, unsigned to, 2327 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2325 get_block_t *get_block) 2328 get_block_t *get_block)
2326 { 2329 {
2327 struct inode *inode = page->mapping->host; 2330 struct inode *inode = page->mapping->host;
2328 int err = __block_prepare_write(inode, page, from, to, get_block); 2331 int err = __block_prepare_write(inode, page, from, to, get_block);
2329 if (err) 2332 if (err)
2330 ClearPageUptodate(page); 2333 ClearPageUptodate(page);
2331 return err; 2334 return err;
2332 } 2335 }
2333 2336
2334 int block_commit_write(struct page *page, unsigned from, unsigned to) 2337 int block_commit_write(struct page *page, unsigned from, unsigned to)
2335 { 2338 {
2336 struct inode *inode = page->mapping->host; 2339 struct inode *inode = page->mapping->host;
2337 __block_commit_write(inode,page,from,to); 2340 __block_commit_write(inode,page,from,to);
2338 return 0; 2341 return 0;
2339 } 2342 }
2340 2343
2341 /* 2344 /*
2342 * block_page_mkwrite() is not allowed to change the file size as it gets 2345 * block_page_mkwrite() is not allowed to change the file size as it gets
2343 * called from a page fault handler when a page is first dirtied. Hence we must 2346 * called from a page fault handler when a page is first dirtied. Hence we must
2344 * be careful to check for EOF conditions here. We set the page up correctly 2347 * be careful to check for EOF conditions here. We set the page up correctly
2345 * for a written page which means we get ENOSPC checking when writing into 2348 * for a written page which means we get ENOSPC checking when writing into
2346 * holes and correct delalloc and unwritten extent mapping on filesystems that 2349 * holes and correct delalloc and unwritten extent mapping on filesystems that
2347 * support these features. 2350 * support these features.
2348 * 2351 *
2349 * We are not allowed to take the i_mutex here so we have to play games to 2352 * We are not allowed to take the i_mutex here so we have to play games to
2350 * protect against truncate races as the page could now be beyond EOF. Because 2353 * protect against truncate races as the page could now be beyond EOF. Because
2351 * vmtruncate() writes the inode size before removing pages, once we have the 2354 * vmtruncate() writes the inode size before removing pages, once we have the
2352 * page lock we can determine safely if the page is beyond EOF. If it is not 2355 * page lock we can determine safely if the page is beyond EOF. If it is not
2353 * beyond EOF, then the page is guaranteed safe against truncation until we 2356 * beyond EOF, then the page is guaranteed safe against truncation until we
2354 * unlock the page. 2357 * unlock the page.
2355 */ 2358 */
2356 int 2359 int
2357 block_page_mkwrite(struct vm_area_struct *vma, struct page *page, 2360 block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
2358 get_block_t get_block) 2361 get_block_t get_block)
2359 { 2362 {
2360 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 2363 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
2361 unsigned long end; 2364 unsigned long end;
2362 loff_t size; 2365 loff_t size;
2363 int ret = -EINVAL; 2366 int ret = -EINVAL;
2364 2367
2365 lock_page(page); 2368 lock_page(page);
2366 size = i_size_read(inode); 2369 size = i_size_read(inode);
2367 if ((page->mapping != inode->i_mapping) || 2370 if ((page->mapping != inode->i_mapping) ||
2368 (page_offset(page) > size)) { 2371 (page_offset(page) > size)) {
2369 /* page got truncated out from underneath us */ 2372 /* page got truncated out from underneath us */
2370 goto out_unlock; 2373 goto out_unlock;
2371 } 2374 }
2372 2375
2373 /* page is wholly or partially inside EOF */ 2376 /* page is wholly or partially inside EOF */
2374 if (((page->index + 1) << PAGE_CACHE_SHIFT) > size) 2377 if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
2375 end = size & ~PAGE_CACHE_MASK; 2378 end = size & ~PAGE_CACHE_MASK;
2376 else 2379 else
2377 end = PAGE_CACHE_SIZE; 2380 end = PAGE_CACHE_SIZE;
2378 2381
2379 ret = block_prepare_write(page, 0, end, get_block); 2382 ret = block_prepare_write(page, 0, end, get_block);
2380 if (!ret) 2383 if (!ret)
2381 ret = block_commit_write(page, 0, end); 2384 ret = block_commit_write(page, 0, end);
2382 2385
2383 out_unlock: 2386 out_unlock:
2384 unlock_page(page); 2387 unlock_page(page);
2385 return ret; 2388 return ret;
2386 } 2389 }
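At this point in the tree ->page_mkwrite still receives the faulting page directly, so a filesystem can hand it straight to block_page_mkwrite() from its vm_operations. A sketch under that assumption; the myfs_* names are hypothetical:

	static int myfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
	{
		return block_page_mkwrite(vma, page, myfs_get_block);
	}

	static struct vm_operations_struct myfs_file_vm_ops = {
		.fault		= filemap_fault,
		.page_mkwrite	= myfs_page_mkwrite,
	};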
2387 2390
2388 /* 2391 /*
2389 * nobh_write_begin()'s prereads are special: the buffer_heads are freed 2392 * nobh_write_begin()'s prereads are special: the buffer_heads are freed
2390 * immediately, while under the page lock. So it needs a special end_io 2393 * immediately, while under the page lock. So it needs a special end_io
2391 * handler which does not touch the bh after unlocking it. 2394 * handler which does not touch the bh after unlocking it.
2392 */ 2395 */
2393 static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) 2396 static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2394 { 2397 {
2395 __end_buffer_read_notouch(bh, uptodate); 2398 __end_buffer_read_notouch(bh, uptodate);
2396 } 2399 }
2397 2400
2398 /* 2401 /*
2399 * Attach the singly-linked list of buffers created by nobh_write_begin to 2402 * Attach the singly-linked list of buffers created by nobh_write_begin to
2400 * the page (converting it to a circular linked list and taking care of page 2403 * the page (converting it to a circular linked list and taking care of page
2401 * dirty races). 2404 * dirty races).
2402 */ 2405 */
2403 static void attach_nobh_buffers(struct page *page, struct buffer_head *head) 2406 static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2404 { 2407 {
2405 struct buffer_head *bh; 2408 struct buffer_head *bh;
2406 2409
2407 BUG_ON(!PageLocked(page)); 2410 BUG_ON(!PageLocked(page));
2408 2411
2409 spin_lock(&page->mapping->private_lock); 2412 spin_lock(&page->mapping->private_lock);
2410 bh = head; 2413 bh = head;
2411 do { 2414 do {
2412 if (PageDirty(page)) 2415 if (PageDirty(page))
2413 set_buffer_dirty(bh); 2416 set_buffer_dirty(bh);
2414 if (!bh->b_this_page) 2417 if (!bh->b_this_page)
2415 bh->b_this_page = head; 2418 bh->b_this_page = head;
2416 bh = bh->b_this_page; 2419 bh = bh->b_this_page;
2417 } while (bh != head); 2420 } while (bh != head);
2418 attach_page_buffers(page, head); 2421 attach_page_buffers(page, head);
2419 spin_unlock(&page->mapping->private_lock); 2422 spin_unlock(&page->mapping->private_lock);
2420 } 2423 }
2421 2424
2422 /* 2425 /*
2423 * On entry, no part of the page is uptodate. 2426 * On entry, no part of the page is uptodate.
2424 * On exit the page is fully uptodate in the areas outside (from,to) 2427 * On exit the page is fully uptodate in the areas outside (from,to)
2425 */ 2428 */
2426 int nobh_write_begin(struct file *file, struct address_space *mapping, 2429 int nobh_write_begin(struct file *file, struct address_space *mapping,
2427 loff_t pos, unsigned len, unsigned flags, 2430 loff_t pos, unsigned len, unsigned flags,
2428 struct page **pagep, void **fsdata, 2431 struct page **pagep, void **fsdata,
2429 get_block_t *get_block) 2432 get_block_t *get_block)
2430 { 2433 {
2431 struct inode *inode = mapping->host; 2434 struct inode *inode = mapping->host;
2432 const unsigned blkbits = inode->i_blkbits; 2435 const unsigned blkbits = inode->i_blkbits;
2433 const unsigned blocksize = 1 << blkbits; 2436 const unsigned blocksize = 1 << blkbits;
2434 struct buffer_head *head, *bh; 2437 struct buffer_head *head, *bh;
2435 struct page *page; 2438 struct page *page;
2436 pgoff_t index; 2439 pgoff_t index;
2437 unsigned from, to; 2440 unsigned from, to;
2438 unsigned block_in_page; 2441 unsigned block_in_page;
2439 unsigned block_start, block_end; 2442 unsigned block_start, block_end;
2440 sector_t block_in_file; 2443 sector_t block_in_file;
2441 int nr_reads = 0; 2444 int nr_reads = 0;
2442 int ret = 0; 2445 int ret = 0;
2443 int is_mapped_to_disk = 1; 2446 int is_mapped_to_disk = 1;
2444 2447
2445 index = pos >> PAGE_CACHE_SHIFT; 2448 index = pos >> PAGE_CACHE_SHIFT;
2446 from = pos & (PAGE_CACHE_SIZE - 1); 2449 from = pos & (PAGE_CACHE_SIZE - 1);
2447 to = from + len; 2450 to = from + len;
2448 2451
2449 page = __grab_cache_page(mapping, index); 2452 page = __grab_cache_page(mapping, index);
2450 if (!page) 2453 if (!page)
2451 return -ENOMEM; 2454 return -ENOMEM;
2452 *pagep = page; 2455 *pagep = page;
2453 *fsdata = NULL; 2456 *fsdata = NULL;
2454 2457
2455 if (page_has_buffers(page)) { 2458 if (page_has_buffers(page)) {
2456 unlock_page(page); 2459 unlock_page(page);
2457 page_cache_release(page); 2460 page_cache_release(page);
2458 *pagep = NULL; 2461 *pagep = NULL;
2459 return block_write_begin(file, mapping, pos, len, flags, pagep, 2462 return block_write_begin(file, mapping, pos, len, flags, pagep,
2460 fsdata, get_block); 2463 fsdata, get_block);
2461 } 2464 }
2462 2465
2463 if (PageMappedToDisk(page)) 2466 if (PageMappedToDisk(page))
2464 return 0; 2467 return 0;
2465 2468
2466 /* 2469 /*
2467 * Allocate buffers so that we can keep track of state, and potentially 2470 * Allocate buffers so that we can keep track of state, and potentially
2468 * attach them to the page if an error occurs. In the common case of 2471 * attach them to the page if an error occurs. In the common case of
2469 * no error, they will just be freed again without ever being attached 2472 * no error, they will just be freed again without ever being attached
2470 * to the page (which is all OK, because we're under the page lock). 2473 * to the page (which is all OK, because we're under the page lock).
2471 * 2474 *
2472 * Be careful: the buffer linked list is a NULL terminated one, rather 2475 * Be careful: the buffer linked list is a NULL terminated one, rather
2473 * than the circular one we're used to. 2476 * than the circular one we're used to.
2474 */ 2477 */
2475 head = alloc_page_buffers(page, blocksize, 0); 2478 head = alloc_page_buffers(page, blocksize, 0);
2476 if (!head) { 2479 if (!head) {
2477 ret = -ENOMEM; 2480 ret = -ENOMEM;
2478 goto out_release; 2481 goto out_release;
2479 } 2482 }
2480 2483
2481 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); 2484 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2482 2485
2483 /* 2486 /*
2484 * We loop across all blocks in the page, whether or not they are 2487 * We loop across all blocks in the page, whether or not they are
2485 * part of the affected region. This is so we can discover if the 2488 * part of the affected region. This is so we can discover if the
2486 * page is fully mapped-to-disk. 2489 * page is fully mapped-to-disk.
2487 */ 2490 */
2488 for (block_start = 0, block_in_page = 0, bh = head; 2491 for (block_start = 0, block_in_page = 0, bh = head;
2489 block_start < PAGE_CACHE_SIZE; 2492 block_start < PAGE_CACHE_SIZE;
2490 block_in_page++, block_start += blocksize, bh = bh->b_this_page) { 2493 block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2491 int create; 2494 int create;
2492 2495
2493 block_end = block_start + blocksize; 2496 block_end = block_start + blocksize;
2494 bh->b_state = 0; 2497 bh->b_state = 0;
2495 create = 1; 2498 create = 1;
2496 if (block_start >= to) 2499 if (block_start >= to)
2497 create = 0; 2500 create = 0;
2498 ret = get_block(inode, block_in_file + block_in_page, 2501 ret = get_block(inode, block_in_file + block_in_page,
2499 bh, create); 2502 bh, create);
2500 if (ret) 2503 if (ret)
2501 goto failed; 2504 goto failed;
2502 if (!buffer_mapped(bh)) 2505 if (!buffer_mapped(bh))
2503 is_mapped_to_disk = 0; 2506 is_mapped_to_disk = 0;
2504 if (buffer_new(bh)) 2507 if (buffer_new(bh))
2505 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); 2508 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2506 if (PageUptodate(page)) { 2509 if (PageUptodate(page)) {
2507 set_buffer_uptodate(bh); 2510 set_buffer_uptodate(bh);
2508 continue; 2511 continue;
2509 } 2512 }
2510 if (buffer_new(bh) || !buffer_mapped(bh)) { 2513 if (buffer_new(bh) || !buffer_mapped(bh)) {
2511 zero_user_segments(page, block_start, from, 2514 zero_user_segments(page, block_start, from,
2512 to, block_end); 2515 to, block_end);
2513 continue; 2516 continue;
2514 } 2517 }
2515 if (buffer_uptodate(bh)) 2518 if (buffer_uptodate(bh))
2516 continue; /* reiserfs does this */ 2519 continue; /* reiserfs does this */
2517 if (block_start < from || block_end > to) { 2520 if (block_start < from || block_end > to) {
2518 lock_buffer(bh); 2521 lock_buffer(bh);
2519 bh->b_end_io = end_buffer_read_nobh; 2522 bh->b_end_io = end_buffer_read_nobh;
2520 submit_bh(READ, bh); 2523 submit_bh(READ, bh);
2521 nr_reads++; 2524 nr_reads++;
2522 } 2525 }
2523 } 2526 }
2524 2527
2525 if (nr_reads) { 2528 if (nr_reads) {
2526 /* 2529 /*
2527 * The page is locked, so these buffers are protected from 2530 * The page is locked, so these buffers are protected from
2528 * any VM or truncate activity. Hence we don't need to care 2531 * any VM or truncate activity. Hence we don't need to care
2529 * for the buffer_head refcounts. 2532 * for the buffer_head refcounts.
2530 */ 2533 */
2531 for (bh = head; bh; bh = bh->b_this_page) { 2534 for (bh = head; bh; bh = bh->b_this_page) {
2532 wait_on_buffer(bh); 2535 wait_on_buffer(bh);
2533 if (!buffer_uptodate(bh)) 2536 if (!buffer_uptodate(bh))
2534 ret = -EIO; 2537 ret = -EIO;
2535 } 2538 }
2536 if (ret) 2539 if (ret)
2537 goto failed; 2540 goto failed;
2538 } 2541 }
2539 2542
2540 if (is_mapped_to_disk) 2543 if (is_mapped_to_disk)
2541 SetPageMappedToDisk(page); 2544 SetPageMappedToDisk(page);
2542 2545
2543 *fsdata = head; /* to be released by nobh_write_end */ 2546 *fsdata = head; /* to be released by nobh_write_end */
2544 2547
2545 return 0; 2548 return 0;
2546 2549
2547 failed: 2550 failed:
2548 BUG_ON(!ret); 2551 BUG_ON(!ret);
2549 /* 2552 /*
2550 * Error recovery is a bit difficult. We need to zero out blocks that 2553 * Error recovery is a bit difficult. We need to zero out blocks that
2551 * were newly allocated, and dirty them to ensure they get written out. 2554 * were newly allocated, and dirty them to ensure they get written out.
2552 * Buffers need to be attached to the page at this point, otherwise 2555 * Buffers need to be attached to the page at this point, otherwise
2553 * the handling of potential IO errors during writeout would be hard 2556 * the handling of potential IO errors during writeout would be hard
2554 * (could try doing synchronous writeout, but what if that fails too?) 2557 * (could try doing synchronous writeout, but what if that fails too?)
2555 */ 2558 */
2556 attach_nobh_buffers(page, head); 2559 attach_nobh_buffers(page, head);
2557 page_zero_new_buffers(page, from, to); 2560 page_zero_new_buffers(page, from, to);
2558 2561
2559 out_release: 2562 out_release:
2560 unlock_page(page); 2563 unlock_page(page);
2561 page_cache_release(page); 2564 page_cache_release(page);
2562 *pagep = NULL; 2565 *pagep = NULL;
2563 2566
2564 if (pos + len > inode->i_size) 2567 if (pos + len > inode->i_size)
2565 vmtruncate(inode, inode->i_size); 2568 vmtruncate(inode, inode->i_size);
2566 2569
2567 return ret; 2570 return ret;
2568 } 2571 }
2569 EXPORT_SYMBOL(nobh_write_begin); 2572 EXPORT_SYMBOL(nobh_write_begin);
2570 2573
2571 int nobh_write_end(struct file *file, struct address_space *mapping, 2574 int nobh_write_end(struct file *file, struct address_space *mapping,
2572 loff_t pos, unsigned len, unsigned copied, 2575 loff_t pos, unsigned len, unsigned copied,
2573 struct page *page, void *fsdata) 2576 struct page *page, void *fsdata)
2574 { 2577 {
2575 struct inode *inode = page->mapping->host; 2578 struct inode *inode = page->mapping->host;
2576 struct buffer_head *head = fsdata; 2579 struct buffer_head *head = fsdata;
2577 struct buffer_head *bh; 2580 struct buffer_head *bh;
2578 BUG_ON(fsdata != NULL && page_has_buffers(page)); 2581 BUG_ON(fsdata != NULL && page_has_buffers(page));
2579 2582
2580 if (unlikely(copied < len) && !page_has_buffers(page)) 2583 if (unlikely(copied < len) && !page_has_buffers(page))
2581 attach_nobh_buffers(page, head); 2584 attach_nobh_buffers(page, head);
2582 if (page_has_buffers(page)) 2585 if (page_has_buffers(page))
2583 return generic_write_end(file, mapping, pos, len, 2586 return generic_write_end(file, mapping, pos, len,
2584 copied, page, fsdata); 2587 copied, page, fsdata);
2585 2588
2586 SetPageUptodate(page); 2589 SetPageUptodate(page);
2587 set_page_dirty(page); 2590 set_page_dirty(page);
2588 if (pos+copied > inode->i_size) { 2591 if (pos+copied > inode->i_size) {
2589 i_size_write(inode, pos+copied); 2592 i_size_write(inode, pos+copied);
2590 mark_inode_dirty(inode); 2593 mark_inode_dirty(inode);
2591 } 2594 }
2592 2595
2593 unlock_page(page); 2596 unlock_page(page);
2594 page_cache_release(page); 2597 page_cache_release(page);
2595 2598
2596 while (head) { 2599 while (head) {
2597 bh = head; 2600 bh = head;
2598 head = head->b_this_page; 2601 head = head->b_this_page;
2599 free_buffer_head(bh); 2602 free_buffer_head(bh);
2600 } 2603 }
2601 2604
2602 return copied; 2605 return copied;
2603 } 2606 }
2604 EXPORT_SYMBOL(nobh_write_end); 2607 EXPORT_SYMBOL(nobh_write_end);
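Because nobh_write_end()'s signature matches ->write_end exactly, a filesystem running in nobh mode only needs a thin ->write_begin wrapper and can plug nobh_write_end in unchanged. A hedged sketch, with hypothetical myfs_* names:

	static int myfs_nobh_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
	{
		return nobh_write_begin(file, mapping, pos, len, flags,
					pagep, fsdata, myfs_get_block);
	}

	/* in the nobh address_space_operations: .write_end = nobh_write_end */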
2605 2608
2606 /* 2609 /*
2607 * nobh_writepage() - based on block_write_full_page() except 2610 * nobh_writepage() - based on block_write_full_page() except
2608 * that it tries to operate without attaching bufferheads to 2611 * that it tries to operate without attaching bufferheads to
2609 * the page. 2612 * the page.
2610 */ 2613 */
2611 int nobh_writepage(struct page *page, get_block_t *get_block, 2614 int nobh_writepage(struct page *page, get_block_t *get_block,
2612 struct writeback_control *wbc) 2615 struct writeback_control *wbc)
2613 { 2616 {
2614 struct inode * const inode = page->mapping->host; 2617 struct inode * const inode = page->mapping->host;
2615 loff_t i_size = i_size_read(inode); 2618 loff_t i_size = i_size_read(inode);
2616 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; 2619 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2617 unsigned offset; 2620 unsigned offset;
2618 int ret; 2621 int ret;
2619 2622
2620 /* Is the page fully inside i_size? */ 2623 /* Is the page fully inside i_size? */
2621 if (page->index < end_index) 2624 if (page->index < end_index)
2622 goto out; 2625 goto out;
2623 2626
2624 /* Is the page fully outside i_size? (truncate in progress) */ 2627 /* Is the page fully outside i_size? (truncate in progress) */
2625 offset = i_size & (PAGE_CACHE_SIZE-1); 2628 offset = i_size & (PAGE_CACHE_SIZE-1);
2626 if (page->index >= end_index+1 || !offset) { 2629 if (page->index >= end_index+1 || !offset) {
2627 /* 2630 /*
2628 * The page may have dirty, unmapped buffers. For example, 2631 * The page may have dirty, unmapped buffers. For example,
2629 * they may have been added in ext3_writepage(). Make them 2632 * they may have been added in ext3_writepage(). Make them
2630 * freeable here, so the page does not leak. 2633 * freeable here, so the page does not leak.
2631 */ 2634 */
2632 #if 0 2635 #if 0
2633 /* Not really sure about this - do we need this ? */ 2636 /* Not really sure about this - do we need this ? */
2634 if (page->mapping->a_ops->invalidatepage) 2637 if (page->mapping->a_ops->invalidatepage)
2635 page->mapping->a_ops->invalidatepage(page, offset); 2638 page->mapping->a_ops->invalidatepage(page, offset);
2636 #endif 2639 #endif
2637 unlock_page(page); 2640 unlock_page(page);
2638 return 0; /* don't care */ 2641 return 0; /* don't care */
2639 } 2642 }
2640 2643
2641 /* 2644 /*
2642 * The page straddles i_size. It must be zeroed out on each and every 2645 * The page straddles i_size. It must be zeroed out on each and every
2643 * writepage invocation because it may be mmapped. "A file is mapped 2646 * writepage invocation because it may be mmapped. "A file is mapped
2644 * in multiples of the page size. For a file that is not a multiple of 2647 * in multiples of the page size. For a file that is not a multiple of
2645 * the page size, the remaining memory is zeroed when mapped, and 2648 * the page size, the remaining memory is zeroed when mapped, and
2646 * writes to that region are not written out to the file." 2649 * writes to that region are not written out to the file."
2647 */ 2650 */
2648 zero_user_segment(page, offset, PAGE_CACHE_SIZE); 2651 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2649 out: 2652 out:
2650 ret = mpage_writepage(page, get_block, wbc); 2653 ret = mpage_writepage(page, get_block, wbc);
2651 if (ret == -EAGAIN) 2654 if (ret == -EAGAIN)
2652 ret = __block_write_full_page(inode, page, get_block, wbc); 2655 ret = __block_write_full_page(inode, page, get_block, wbc);
2653 return ret; 2656 return ret;
2654 } 2657 }
2655 EXPORT_SYMBOL(nobh_writepage); 2658 EXPORT_SYMBOL(nobh_writepage);
2656 2659
2657 int nobh_truncate_page(struct address_space *mapping, 2660 int nobh_truncate_page(struct address_space *mapping,
2658 loff_t from, get_block_t *get_block) 2661 loff_t from, get_block_t *get_block)
2659 { 2662 {
2660 pgoff_t index = from >> PAGE_CACHE_SHIFT; 2663 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2661 unsigned offset = from & (PAGE_CACHE_SIZE-1); 2664 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2662 unsigned blocksize; 2665 unsigned blocksize;
2663 sector_t iblock; 2666 sector_t iblock;
2664 unsigned length, pos; 2667 unsigned length, pos;
2665 struct inode *inode = mapping->host; 2668 struct inode *inode = mapping->host;
2666 struct page *page; 2669 struct page *page;
2667 struct buffer_head map_bh; 2670 struct buffer_head map_bh;
2668 int err; 2671 int err;
2669 2672
2670 blocksize = 1 << inode->i_blkbits; 2673 blocksize = 1 << inode->i_blkbits;
2671 length = offset & (blocksize - 1); 2674 length = offset & (blocksize - 1);
2672 2675
2673 /* Block boundary? Nothing to do */ 2676 /* Block boundary? Nothing to do */
2674 if (!length) 2677 if (!length)
2675 return 0; 2678 return 0;
2676 2679
2677 length = blocksize - length; 2680 length = blocksize - length;
2678 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 2681 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2679 2682
2680 page = grab_cache_page(mapping, index); 2683 page = grab_cache_page(mapping, index);
2681 err = -ENOMEM; 2684 err = -ENOMEM;
2682 if (!page) 2685 if (!page)
2683 goto out; 2686 goto out;
2684 2687
2685 if (page_has_buffers(page)) { 2688 if (page_has_buffers(page)) {
2686 has_buffers: 2689 has_buffers:
2687 unlock_page(page); 2690 unlock_page(page);
2688 page_cache_release(page); 2691 page_cache_release(page);
2689 return block_truncate_page(mapping, from, get_block); 2692 return block_truncate_page(mapping, from, get_block);
2690 } 2693 }
2691 2694
2692 /* Find the buffer that contains "offset" */ 2695 /* Find the buffer that contains "offset" */
2693 pos = blocksize; 2696 pos = blocksize;
2694 while (offset >= pos) { 2697 while (offset >= pos) {
2695 iblock++; 2698 iblock++;
2696 pos += blocksize; 2699 pos += blocksize;
2697 } 2700 }
2698 2701
2699 err = get_block(inode, iblock, &map_bh, 0); 2702 err = get_block(inode, iblock, &map_bh, 0);
2700 if (err) 2703 if (err)
2701 goto unlock; 2704 goto unlock;
2702 /* unmapped? It's a hole - nothing to do */ 2705 /* unmapped? It's a hole - nothing to do */
2703 if (!buffer_mapped(&map_bh)) 2706 if (!buffer_mapped(&map_bh))
2704 goto unlock; 2707 goto unlock;
2705 2708
2706 /* Ok, it's mapped. Make sure it's up-to-date */ 2709 /* Ok, it's mapped. Make sure it's up-to-date */
2707 if (!PageUptodate(page)) { 2710 if (!PageUptodate(page)) {
2708 err = mapping->a_ops->readpage(NULL, page); 2711 err = mapping->a_ops->readpage(NULL, page);
2709 if (err) { 2712 if (err) {
2710 page_cache_release(page); 2713 page_cache_release(page);
2711 goto out; 2714 goto out;
2712 } 2715 }
2713 lock_page(page); 2716 lock_page(page);
2714 if (!PageUptodate(page)) { 2717 if (!PageUptodate(page)) {
2715 err = -EIO; 2718 err = -EIO;
2716 goto unlock; 2719 goto unlock;
2717 } 2720 }
2718 if (page_has_buffers(page)) 2721 if (page_has_buffers(page))
2719 goto has_buffers; 2722 goto has_buffers;
2720 } 2723 }
2721 zero_user(page, offset, length); 2724 zero_user(page, offset, length);
2722 set_page_dirty(page); 2725 set_page_dirty(page);
2723 err = 0; 2726 err = 0;
2724 2727
2725 unlock: 2728 unlock:
2726 unlock_page(page); 2729 unlock_page(page);
2727 page_cache_release(page); 2730 page_cache_release(page);
2728 out: 2731 out:
2729 return err; 2732 return err;
2730 } 2733 }
2731 EXPORT_SYMBOL(nobh_truncate_page); 2734 EXPORT_SYMBOL(nobh_truncate_page);
2732 2735
2733 int block_truncate_page(struct address_space *mapping, 2736 int block_truncate_page(struct address_space *mapping,
2734 loff_t from, get_block_t *get_block) 2737 loff_t from, get_block_t *get_block)
2735 { 2738 {
2736 pgoff_t index = from >> PAGE_CACHE_SHIFT; 2739 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2737 unsigned offset = from & (PAGE_CACHE_SIZE-1); 2740 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2738 unsigned blocksize; 2741 unsigned blocksize;
2739 sector_t iblock; 2742 sector_t iblock;
2740 unsigned length, pos; 2743 unsigned length, pos;
2741 struct inode *inode = mapping->host; 2744 struct inode *inode = mapping->host;
2742 struct page *page; 2745 struct page *page;
2743 struct buffer_head *bh; 2746 struct buffer_head *bh;
2744 int err; 2747 int err;
2745 2748
2746 blocksize = 1 << inode->i_blkbits; 2749 blocksize = 1 << inode->i_blkbits;
2747 length = offset & (blocksize - 1); 2750 length = offset & (blocksize - 1);
2748 2751
2749 /* Block boundary? Nothing to do */ 2752 /* Block boundary? Nothing to do */
2750 if (!length) 2753 if (!length)
2751 return 0; 2754 return 0;
2752 2755
2753 length = blocksize - length; 2756 length = blocksize - length;
2754 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 2757 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2755 2758
2756 page = grab_cache_page(mapping, index); 2759 page = grab_cache_page(mapping, index);
2757 err = -ENOMEM; 2760 err = -ENOMEM;
2758 if (!page) 2761 if (!page)
2759 goto out; 2762 goto out;
2760 2763
2761 if (!page_has_buffers(page)) 2764 if (!page_has_buffers(page))
2762 create_empty_buffers(page, blocksize, 0); 2765 create_empty_buffers(page, blocksize, 0);
2763 2766
2764 /* Find the buffer that contains "offset" */ 2767 /* Find the buffer that contains "offset" */
2765 bh = page_buffers(page); 2768 bh = page_buffers(page);
2766 pos = blocksize; 2769 pos = blocksize;
2767 while (offset >= pos) { 2770 while (offset >= pos) {
2768 bh = bh->b_this_page; 2771 bh = bh->b_this_page;
2769 iblock++; 2772 iblock++;
2770 pos += blocksize; 2773 pos += blocksize;
2771 } 2774 }
2772 2775
2773 err = 0; 2776 err = 0;
2774 if (!buffer_mapped(bh)) { 2777 if (!buffer_mapped(bh)) {
2775 WARN_ON(bh->b_size != blocksize); 2778 WARN_ON(bh->b_size != blocksize);
2776 err = get_block(inode, iblock, bh, 0); 2779 err = get_block(inode, iblock, bh, 0);
2777 if (err) 2780 if (err)
2778 goto unlock; 2781 goto unlock;
2779 /* unmapped? It's a hole - nothing to do */ 2782 /* unmapped? It's a hole - nothing to do */
2780 if (!buffer_mapped(bh)) 2783 if (!buffer_mapped(bh))
2781 goto unlock; 2784 goto unlock;
2782 } 2785 }
2783 2786
2784 /* Ok, it's mapped. Make sure it's up-to-date */ 2787 /* Ok, it's mapped. Make sure it's up-to-date */
2785 if (PageUptodate(page)) 2788 if (PageUptodate(page))
2786 set_buffer_uptodate(bh); 2789 set_buffer_uptodate(bh);
2787 2790
2788 if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { 2791 if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2789 err = -EIO; 2792 err = -EIO;
2790 ll_rw_block(READ, 1, &bh); 2793 ll_rw_block(READ, 1, &bh);
2791 wait_on_buffer(bh); 2794 wait_on_buffer(bh);
2792 /* Uhhuh. Read error. Complain and punt. */ 2795 /* Uhhuh. Read error. Complain and punt. */
2793 if (!buffer_uptodate(bh)) 2796 if (!buffer_uptodate(bh))
2794 goto unlock; 2797 goto unlock;
2795 } 2798 }
2796 2799
2797 zero_user(page, offset, length); 2800 zero_user(page, offset, length);
2798 mark_buffer_dirty(bh); 2801 mark_buffer_dirty(bh);
2799 err = 0; 2802 err = 0;
2800 2803
2801 unlock: 2804 unlock:
2802 unlock_page(page); 2805 unlock_page(page);
2803 page_cache_release(page); 2806 page_cache_release(page);
2804 out: 2807 out:
2805 return err; 2808 return err;
2806 } 2809 }
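block_truncate_page() is normally called from the filesystem's truncate path with the new i_size, so that the tail of a now-partial last block is zeroed before the blocks past EOF are released. A rough sketch only; myfs_truncate and myfs_get_block are hypothetical, and the block-freeing step is filesystem specific:

	static void myfs_truncate(struct inode *inode)
	{
		/* zero the partial tail block at the new EOF */
		block_truncate_page(inode->i_mapping, inode->i_size,
				    myfs_get_block);

		/* filesystem-specific freeing of blocks beyond i_size goes here */
	}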
2807 2810
2808 /* 2811 /*
2809 * The generic ->writepage function for buffer-backed address_spaces 2812 * The generic ->writepage function for buffer-backed address_spaces
2810 */ 2813 */
2811 int block_write_full_page(struct page *page, get_block_t *get_block, 2814 int block_write_full_page(struct page *page, get_block_t *get_block,
2812 struct writeback_control *wbc) 2815 struct writeback_control *wbc)
2813 { 2816 {
2814 struct inode * const inode = page->mapping->host; 2817 struct inode * const inode = page->mapping->host;
2815 loff_t i_size = i_size_read(inode); 2818 loff_t i_size = i_size_read(inode);
2816 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; 2819 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2817 unsigned offset; 2820 unsigned offset;
2818 2821
2819 /* Is the page fully inside i_size? */ 2822 /* Is the page fully inside i_size? */
2820 if (page->index < end_index) 2823 if (page->index < end_index)
2821 return __block_write_full_page(inode, page, get_block, wbc); 2824 return __block_write_full_page(inode, page, get_block, wbc);
2822 2825
2823 /* Is the page fully outside i_size? (truncate in progress) */ 2826 /* Is the page fully outside i_size? (truncate in progress) */
2824 offset = i_size & (PAGE_CACHE_SIZE-1); 2827 offset = i_size & (PAGE_CACHE_SIZE-1);
2825 if (page->index >= end_index+1 || !offset) { 2828 if (page->index >= end_index+1 || !offset) {
2826 /* 2829 /*
2827 * The page may have dirty, unmapped buffers. For example, 2830 * The page may have dirty, unmapped buffers. For example,
2828 * they may have been added in ext3_writepage(). Make them 2831 * they may have been added in ext3_writepage(). Make them
2829 * freeable here, so the page does not leak. 2832 * freeable here, so the page does not leak.
2830 */ 2833 */
2831 do_invalidatepage(page, 0); 2834 do_invalidatepage(page, 0);
2832 unlock_page(page); 2835 unlock_page(page);
2833 return 0; /* don't care */ 2836 return 0; /* don't care */
2834 } 2837 }
2835 2838
2836 /* 2839 /*
2837 * The page straddles i_size. It must be zeroed out on each and every 2840 * The page straddles i_size. It must be zeroed out on each and every
2838 * writepage invocation because it may be mmapped. "A file is mapped 2841 * writepage invocation because it may be mmapped. "A file is mapped
2839 * in multiples of the page size. For a file that is not a multiple of 2842 * in multiples of the page size. For a file that is not a multiple of
2840 * the page size, the remaining memory is zeroed when mapped, and 2843 * the page size, the remaining memory is zeroed when mapped, and
2841 * writes to that region are not written out to the file." 2844 * writes to that region are not written out to the file."
2842 */ 2845 */
2843 zero_user_segment(page, offset, PAGE_CACHE_SIZE); 2846 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2844 return __block_write_full_page(inode, page, get_block, wbc); 2847 return __block_write_full_page(inode, page, get_block, wbc);
2845 } 2848 }
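For reference, this is how a conventional filesystem wires block_write_full_page() into its ->writepage; a hedged sketch, again using the hypothetical myfs_get_block():

    #include <linux/buffer_head.h>
    #include <linux/writeback.h>

    extern int myfs_get_block(struct inode *inode, sector_t iblock,
    			  struct buffer_head *bh_result, int create);

    static int myfs_writepage(struct page *page, struct writeback_control *wbc)
    {
    	/* per-block disk mapping is delegated to the fs's get_block */
    	return block_write_full_page(page, myfs_get_block, wbc);
    }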
2846 2849
2847 sector_t generic_block_bmap(struct address_space *mapping, sector_t block, 2850 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2848 get_block_t *get_block) 2851 get_block_t *get_block)
2849 { 2852 {
2850 struct buffer_head tmp; 2853 struct buffer_head tmp;
2851 struct inode *inode = mapping->host; 2854 struct inode *inode = mapping->host;
2852 tmp.b_state = 0; 2855 tmp.b_state = 0;
2853 tmp.b_blocknr = 0; 2856 tmp.b_blocknr = 0;
2854 tmp.b_size = 1 << inode->i_blkbits; 2857 tmp.b_size = 1 << inode->i_blkbits;
2855 get_block(inode, block, &tmp, 0); 2858 get_block(inode, block, &tmp, 0);
2856 return tmp.b_blocknr; 2859 return tmp.b_blocknr;
2857 } 2860 }
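generic_block_bmap() is what normally backs the FIBMAP ioctl via the ->bmap address-space operation; a one-line wrapper is all a filesystem needs. A sketch, with the hypothetical myfs_get_block() as before:

    #include <linux/buffer_head.h>

    extern int myfs_get_block(struct inode *, sector_t, struct buffer_head *, int);

    static sector_t myfs_bmap(struct address_space *mapping, sector_t block)
    {
    	/* returns 0 for holes/unmapped blocks, per the helper above */
    	return generic_block_bmap(mapping, block, myfs_get_block);
    }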
2858 2861
2859 static void end_bio_bh_io_sync(struct bio *bio, int err) 2862 static void end_bio_bh_io_sync(struct bio *bio, int err)
2860 { 2863 {
2861 struct buffer_head *bh = bio->bi_private; 2864 struct buffer_head *bh = bio->bi_private;
2862 2865
2863 if (err == -EOPNOTSUPP) { 2866 if (err == -EOPNOTSUPP) {
2864 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); 2867 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2865 set_bit(BH_Eopnotsupp, &bh->b_state); 2868 set_bit(BH_Eopnotsupp, &bh->b_state);
2866 } 2869 }
2867 2870
2868 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags)); 2871 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2869 bio_put(bio); 2872 bio_put(bio);
2870 } 2873 }
2871 2874
2872 int submit_bh(int rw, struct buffer_head * bh) 2875 int submit_bh(int rw, struct buffer_head * bh)
2873 { 2876 {
2874 struct bio *bio; 2877 struct bio *bio;
2875 int ret = 0; 2878 int ret = 0;
2876 2879
2877 BUG_ON(!buffer_locked(bh)); 2880 BUG_ON(!buffer_locked(bh));
2878 BUG_ON(!buffer_mapped(bh)); 2881 BUG_ON(!buffer_mapped(bh));
2879 BUG_ON(!bh->b_end_io); 2882 BUG_ON(!bh->b_end_io);
2880 2883
2881 if (buffer_ordered(bh) && (rw == WRITE)) 2884 if (buffer_ordered(bh) && (rw == WRITE))
2882 rw = WRITE_BARRIER; 2885 rw = WRITE_BARRIER;
2883 2886
2884 /* 2887 /*
2885 * Only clear out a write error when rewriting, should this 2888 * Only clear out a write error when rewriting, should this
2886 * include WRITE_SYNC as well? 2889 * include WRITE_SYNC as well?
2887 */ 2890 */
2888 if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER)) 2891 if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER))
2889 clear_buffer_write_io_error(bh); 2892 clear_buffer_write_io_error(bh);
2890 2893
2891 /* 2894 /*
2892 * from here on down, it's all bio -- do the initial mapping, 2895 * from here on down, it's all bio -- do the initial mapping,
2893 * submit_bio -> generic_make_request may further map this bio around 2896 * submit_bio -> generic_make_request may further map this bio around
2894 */ 2897 */
2895 bio = bio_alloc(GFP_NOIO, 1); 2898 bio = bio_alloc(GFP_NOIO, 1);
2896 2899
2897 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 2900 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2898 bio->bi_bdev = bh->b_bdev; 2901 bio->bi_bdev = bh->b_bdev;
2899 bio->bi_io_vec[0].bv_page = bh->b_page; 2902 bio->bi_io_vec[0].bv_page = bh->b_page;
2900 bio->bi_io_vec[0].bv_len = bh->b_size; 2903 bio->bi_io_vec[0].bv_len = bh->b_size;
2901 bio->bi_io_vec[0].bv_offset = bh_offset(bh); 2904 bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2902 2905
2903 bio->bi_vcnt = 1; 2906 bio->bi_vcnt = 1;
2904 bio->bi_idx = 0; 2907 bio->bi_idx = 0;
2905 bio->bi_size = bh->b_size; 2908 bio->bi_size = bh->b_size;
2906 2909
2907 bio->bi_end_io = end_bio_bh_io_sync; 2910 bio->bi_end_io = end_bio_bh_io_sync;
2908 bio->bi_private = bh; 2911 bio->bi_private = bh;
2909 2912
2910 bio_get(bio); 2913 bio_get(bio);
2911 submit_bio(rw, bio); 2914 submit_bio(rw, bio);
2912 2915
2913 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 2916 if (bio_flagged(bio, BIO_EOPNOTSUPP))
2914 ret = -EOPNOTSUPP; 2917 ret = -EOPNOTSUPP;
2915 2918
2916 bio_put(bio); 2919 bio_put(bio);
2917 return ret; 2920 return ret;
2918 } 2921 }
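A hedged example of driving submit_bh() directly for a synchronous read; this is essentially the pattern bh_submit_read() implements near the end of this file:

    #include <linux/buffer_head.h>

    static int myfs_read_bh_sync(struct buffer_head *bh)
    {
    	if (buffer_uptodate(bh))
    		return 0;

    	lock_buffer(bh);
    	if (buffer_uptodate(bh)) {		/* lost a race with another reader */
    		unlock_buffer(bh);
    		return 0;
    	}
    	get_bh(bh);				/* reference consumed by the end_io handler */
    	bh->b_end_io = end_buffer_read_sync;	/* unlocks the buffer and drops that ref */
    	submit_bh(READ, bh);
    	wait_on_buffer(bh);
    	return buffer_uptodate(bh) ? 0 : -EIO;
    }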
2919 2922
2920 /** 2923 /**
2921 * ll_rw_block: low-level access to block devices (DEPRECATED) 2924 * ll_rw_block: low-level access to block devices (DEPRECATED)
2922 * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead) 2925 * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
2923 * @nr: number of &struct buffer_heads in the array 2926 * @nr: number of &struct buffer_heads in the array
2924 * @bhs: array of pointers to &struct buffer_head 2927 * @bhs: array of pointers to &struct buffer_head
2925 * 2928 *
2926 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and 2929 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2927 * requests an I/O operation on them, either a %READ or a %WRITE. The third 2930 * requests an I/O operation on them, either a %READ or a %WRITE. The third
2928 * %SWRITE is like %WRITE only we make sure that the *current* data in buffers 2931 * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
2929 * are sent to disk. The fourth %READA option is described in the documentation 2932 * are sent to disk. The fourth %READA option is described in the documentation
2930 * for generic_make_request() which ll_rw_block() calls. 2933 * for generic_make_request() which ll_rw_block() calls.
2931 * 2934 *
2932 * This function drops any buffer that it cannot get a lock on (with the 2935 * This function drops any buffer that it cannot get a lock on (with the
2933 * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be 2936 * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
2934 * clean when doing a write request, and any buffer that appears to be 2937 * clean when doing a write request, and any buffer that appears to be
2935 * up-to-date when doing a read request. Further, it marks as clean buffers that 2938 * up-to-date when doing a read request. Further, it marks as clean buffers that
2936 * are processed for writing (the buffer cache won't assume that they are 2939 * are processed for writing (the buffer cache won't assume that they are
2937 * actually clean until the buffer gets unlocked). 2940 * actually clean until the buffer gets unlocked).
2938 * 2941 *
2939 * ll_rw_block sets b_end_io to a simple completion handler that marks 2942 * ll_rw_block sets b_end_io to a simple completion handler that marks
2940 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes 2943 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
2941 * any waiters. 2944 * any waiters.
2942 * 2945 *
2943 * All of the buffers must be for the same device, and must also be a 2946 * All of the buffers must be for the same device, and must also be a
2944 * multiple of the current approved size for the device. 2947 * multiple of the current approved size for the device.
2945 */ 2948 */
2946 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) 2949 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2947 { 2950 {
2948 int i; 2951 int i;
2949 2952
2950 for (i = 0; i < nr; i++) { 2953 for (i = 0; i < nr; i++) {
2951 struct buffer_head *bh = bhs[i]; 2954 struct buffer_head *bh = bhs[i];
2952 2955
2953 if (rw == SWRITE || rw == SWRITE_SYNC) 2956 if (rw == SWRITE || rw == SWRITE_SYNC)
2954 lock_buffer(bh); 2957 lock_buffer(bh);
2955 else if (test_set_buffer_locked(bh)) 2958 else if (test_set_buffer_locked(bh))
2956 continue; 2959 continue;
2957 2960
2958 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) { 2961 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) {
2959 if (test_clear_buffer_dirty(bh)) { 2962 if (test_clear_buffer_dirty(bh)) {
2960 bh->b_end_io = end_buffer_write_sync; 2963 bh->b_end_io = end_buffer_write_sync;
2961 get_bh(bh); 2964 get_bh(bh);
2962 if (rw == SWRITE_SYNC) 2965 if (rw == SWRITE_SYNC)
2963 submit_bh(WRITE_SYNC, bh); 2966 submit_bh(WRITE_SYNC, bh);
2964 else 2967 else
2965 submit_bh(WRITE, bh); 2968 submit_bh(WRITE, bh);
2966 continue; 2969 continue;
2967 } 2970 }
2968 } else { 2971 } else {
2969 if (!buffer_uptodate(bh)) { 2972 if (!buffer_uptodate(bh)) {
2970 bh->b_end_io = end_buffer_read_sync; 2973 bh->b_end_io = end_buffer_read_sync;
2971 get_bh(bh); 2974 get_bh(bh);
2972 submit_bh(rw, bh); 2975 submit_bh(rw, bh);
2973 continue; 2976 continue;
2974 } 2977 }
2975 } 2978 }
2976 unlock_buffer(bh); 2979 unlock_buffer(bh);
2977 } 2980 }
2978 } 2981 }
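A hedged usage sketch for ll_rw_block(): kick off reads on a small cluster of metadata buffers and wait only for the one needed right away. Buffers that are already up to date, or that someone else holds locked, are silently skipped, so the call must not be relied on for data-integrity reads:

    #include <linux/buffer_head.h>

    static struct buffer_head *myfs_bread_cluster(struct super_block *sb,
    					      sector_t block)
    {
    	struct buffer_head *bhs[4];
    	int i;

    	for (i = 0; i < 4; i++)
    		bhs[i] = sb_getblk(sb, block + i);

    	ll_rw_block(READ, 4, bhs);	/* asynchronous; uptodate/locked bhs skipped */

    	for (i = 1; i < 4; i++)
    		brelse(bhs[i]);		/* read-ahead only; the I/O holds its own ref */

    	wait_on_buffer(bhs[0]);		/* the caller needs just the first block */
    	if (!buffer_uptodate(bhs[0])) {
    		brelse(bhs[0]);
    		return NULL;
    	}
    	return bhs[0];
    }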
2979 2982
2980 /* 2983 /*
2981 * For a data-integrity writeout, we need to wait upon any in-progress I/O 2984 * For a data-integrity writeout, we need to wait upon any in-progress I/O
2982 * and then start new I/O and then wait upon it. The caller must have a ref on 2985 * and then start new I/O and then wait upon it. The caller must have a ref on
2983 * the buffer_head. 2986 * the buffer_head.
2984 */ 2987 */
2985 int sync_dirty_buffer(struct buffer_head *bh) 2988 int sync_dirty_buffer(struct buffer_head *bh)
2986 { 2989 {
2987 int ret = 0; 2990 int ret = 0;
2988 2991
2989 WARN_ON(atomic_read(&bh->b_count) < 1); 2992 WARN_ON(atomic_read(&bh->b_count) < 1);
2990 lock_buffer(bh); 2993 lock_buffer(bh);
2991 if (test_clear_buffer_dirty(bh)) { 2994 if (test_clear_buffer_dirty(bh)) {
2992 get_bh(bh); 2995 get_bh(bh);
2993 bh->b_end_io = end_buffer_write_sync; 2996 bh->b_end_io = end_buffer_write_sync;
2994 ret = submit_bh(WRITE_SYNC, bh); 2997 ret = submit_bh(WRITE_SYNC, bh);
2995 wait_on_buffer(bh); 2998 wait_on_buffer(bh);
2996 if (buffer_eopnotsupp(bh)) { 2999 if (buffer_eopnotsupp(bh)) {
2997 clear_buffer_eopnotsupp(bh); 3000 clear_buffer_eopnotsupp(bh);
2998 ret = -EOPNOTSUPP; 3001 ret = -EOPNOTSUPP;
2999 } 3002 }
3000 if (!ret && !buffer_uptodate(bh)) 3003 if (!ret && !buffer_uptodate(bh))
3001 ret = -EIO; 3004 ret = -EIO;
3002 } else { 3005 } else {
3003 unlock_buffer(bh); 3006 unlock_buffer(bh);
3004 } 3007 }
3005 return ret; 3008 return ret;
3006 } 3009 }
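Typical caller pattern for sync_dirty_buffer(), for instance when a filesystem must push an updated superblock to disk before proceeding; a hedged sketch in which sbh is assumed to have come from sb_bread() at mount time:

    #include <linux/buffer_head.h>

    static int myfs_commit_super(struct buffer_head *sbh)
    {
    	mark_buffer_dirty(sbh);
    	return sync_dirty_buffer(sbh);	/* waits for the write; -EIO on failure */
    }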
3007 3010
3008 /* 3011 /*
3009 * try_to_free_buffers() checks if all the buffers on this particular page 3012 * try_to_free_buffers() checks if all the buffers on this particular page
3010 * are unused, and releases them if so. 3013 * are unused, and releases them if so.
3011 * 3014 *
3012 * Exclusion against try_to_free_buffers may be obtained by either 3015 * Exclusion against try_to_free_buffers may be obtained by either
3013 * locking the page or by holding its mapping's private_lock. 3016 * locking the page or by holding its mapping's private_lock.
3014 * 3017 *
3015 * If the page is dirty but all the buffers are clean then we need to 3018 * If the page is dirty but all the buffers are clean then we need to
3016 * be sure to mark the page clean as well. This is because the page 3019 * be sure to mark the page clean as well. This is because the page
3017 * may be against a block device, and a later reattachment of buffers 3020 * may be against a block device, and a later reattachment of buffers
3018 * to a dirty page will set *all* buffers dirty. Which would corrupt 3021 * to a dirty page will set *all* buffers dirty. Which would corrupt
3019 * filesystem data on the same device. 3022 * filesystem data on the same device.
3020 * 3023 *
3021 * The same applies to regular filesystem pages: if all the buffers are 3024 * The same applies to regular filesystem pages: if all the buffers are
3022 * clean then we set the page clean and proceed. To do that, we require 3025 * clean then we set the page clean and proceed. To do that, we require
3023 * total exclusion from __set_page_dirty_buffers(). That is obtained with 3026 * total exclusion from __set_page_dirty_buffers(). That is obtained with
3024 * private_lock. 3027 * private_lock.
3025 * 3028 *
3026 * try_to_free_buffers() is non-blocking. 3029 * try_to_free_buffers() is non-blocking.
3027 */ 3030 */
3028 static inline int buffer_busy(struct buffer_head *bh) 3031 static inline int buffer_busy(struct buffer_head *bh)
3029 { 3032 {
3030 return atomic_read(&bh->b_count) | 3033 return atomic_read(&bh->b_count) |
3031 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); 3034 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3032 } 3035 }
3033 3036
3034 static int 3037 static int
3035 drop_buffers(struct page *page, struct buffer_head **buffers_to_free) 3038 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3036 { 3039 {
3037 struct buffer_head *head = page_buffers(page); 3040 struct buffer_head *head = page_buffers(page);
3038 struct buffer_head *bh; 3041 struct buffer_head *bh;
3039 3042
3040 bh = head; 3043 bh = head;
3041 do { 3044 do {
3042 if (buffer_write_io_error(bh) && page->mapping) 3045 if (buffer_write_io_error(bh) && page->mapping)
3043 set_bit(AS_EIO, &page->mapping->flags); 3046 set_bit(AS_EIO, &page->mapping->flags);
3044 if (buffer_busy(bh)) 3047 if (buffer_busy(bh))
3045 goto failed; 3048 goto failed;
3046 bh = bh->b_this_page; 3049 bh = bh->b_this_page;
3047 } while (bh != head); 3050 } while (bh != head);
3048 3051
3049 do { 3052 do {
3050 struct buffer_head *next = bh->b_this_page; 3053 struct buffer_head *next = bh->b_this_page;
3051 3054
3052 if (bh->b_assoc_map) 3055 if (bh->b_assoc_map)
3053 __remove_assoc_queue(bh); 3056 __remove_assoc_queue(bh);
3054 bh = next; 3057 bh = next;
3055 } while (bh != head); 3058 } while (bh != head);
3056 *buffers_to_free = head; 3059 *buffers_to_free = head;
3057 __clear_page_buffers(page); 3060 __clear_page_buffers(page);
3058 return 1; 3061 return 1;
3059 failed: 3062 failed:
3060 return 0; 3063 return 0;
3061 } 3064 }
3062 3065
3063 int try_to_free_buffers(struct page *page) 3066 int try_to_free_buffers(struct page *page)
3064 { 3067 {
3065 struct address_space * const mapping = page->mapping; 3068 struct address_space * const mapping = page->mapping;
3066 struct buffer_head *buffers_to_free = NULL; 3069 struct buffer_head *buffers_to_free = NULL;
3067 int ret = 0; 3070 int ret = 0;
3068 3071
3069 BUG_ON(!PageLocked(page)); 3072 BUG_ON(!PageLocked(page));
3070 if (PageWriteback(page)) 3073 if (PageWriteback(page))
3071 return 0; 3074 return 0;
3072 3075
3073 if (mapping == NULL) { /* can this still happen? */ 3076 if (mapping == NULL) { /* can this still happen? */
3074 ret = drop_buffers(page, &buffers_to_free); 3077 ret = drop_buffers(page, &buffers_to_free);
3075 goto out; 3078 goto out;
3076 } 3079 }
3077 3080
3078 spin_lock(&mapping->private_lock); 3081 spin_lock(&mapping->private_lock);
3079 ret = drop_buffers(page, &buffers_to_free); 3082 ret = drop_buffers(page, &buffers_to_free);
3080 3083
3081 /* 3084 /*
3082 * If the filesystem writes its buffers by hand (eg ext3) 3085 * If the filesystem writes its buffers by hand (eg ext3)
3083 * then we can have clean buffers against a dirty page. We 3086 * then we can have clean buffers against a dirty page. We
3084 * clean the page here; otherwise the VM will never notice 3087 * clean the page here; otherwise the VM will never notice
3085 * that the filesystem did any IO at all. 3088 * that the filesystem did any IO at all.
3086 * 3089 *
3087 * Also, during truncate, discard_buffer will have marked all 3090 * Also, during truncate, discard_buffer will have marked all
3088 * the page's buffers clean. We discover that here and clean 3091 * the page's buffers clean. We discover that here and clean
3089 * the page also. 3092 * the page also.
3090 * 3093 *
3091 * private_lock must be held over this entire operation in order 3094 * private_lock must be held over this entire operation in order
3092 * to synchronise against __set_page_dirty_buffers and prevent the 3095 * to synchronise against __set_page_dirty_buffers and prevent the
3093 * dirty bit from being lost. 3096 * dirty bit from being lost.
3094 */ 3097 */
3095 if (ret) 3098 if (ret)
3096 cancel_dirty_page(page, PAGE_CACHE_SIZE); 3099 cancel_dirty_page(page, PAGE_CACHE_SIZE);
3097 spin_unlock(&mapping->private_lock); 3100 spin_unlock(&mapping->private_lock);
3098 out: 3101 out:
3099 if (buffers_to_free) { 3102 if (buffers_to_free) {
3100 struct buffer_head *bh = buffers_to_free; 3103 struct buffer_head *bh = buffers_to_free;
3101 3104
3102 do { 3105 do {
3103 struct buffer_head *next = bh->b_this_page; 3106 struct buffer_head *next = bh->b_this_page;
3104 free_buffer_head(bh); 3107 free_buffer_head(bh);
3105 bh = next; 3108 bh = next;
3106 } while (bh != buffers_to_free); 3109 } while (bh != buffers_to_free);
3107 } 3110 }
3108 return ret; 3111 return ret;
3109 } 3112 }
3110 EXPORT_SYMBOL(try_to_free_buffers); 3113 EXPORT_SYMBOL(try_to_free_buffers);
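A filesystem that needs to veto buffer stripping does so from its ->releasepage before falling back to try_to_free_buffers(); a speculative sketch in which PageChecked() stands in for whatever private "still in use" state the filesystem tracks:

    #include <linux/buffer_head.h>
    #include <linux/pagemap.h>

    static int myfs_releasepage(struct page *page, gfp_t gfp)
    {
    	if (PageChecked(page))		/* hypothetical: page still pinned by the fs */
    		return 0;		/* refuse; the VM will try again later */
    	return try_to_free_buffers(page);
    }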
3111 3114
3112 void block_sync_page(struct page *page) 3115 void block_sync_page(struct page *page)
3113 { 3116 {
3114 struct address_space *mapping; 3117 struct address_space *mapping;
3115 3118
3116 smp_mb(); 3119 smp_mb();
3117 mapping = page_mapping(page); 3120 mapping = page_mapping(page);
3118 if (mapping) 3121 if (mapping)
3119 blk_run_backing_dev(mapping->backing_dev_info, page); 3122 blk_run_backing_dev(mapping->backing_dev_info, page);
3120 } 3123 }
3121 3124
3122 /* 3125 /*
3123 * There are no bdflush tunables left. But distributions are 3126 * There are no bdflush tunables left. But distributions are
3124 * still running obsolete flush daemons, so we terminate them here. 3127 * still running obsolete flush daemons, so we terminate them here.
3125 * 3128 *
3126 * Use of bdflush() is deprecated and will be removed in a future kernel. 3129 * Use of bdflush() is deprecated and will be removed in a future kernel.
3127 * The `pdflush' kernel threads fully replace bdflush daemons and this call. 3130 * The `pdflush' kernel threads fully replace bdflush daemons and this call.
3128 */ 3131 */
3129 asmlinkage long sys_bdflush(int func, long data) 3132 asmlinkage long sys_bdflush(int func, long data)
3130 { 3133 {
3131 static int msg_count; 3134 static int msg_count;
3132 3135
3133 if (!capable(CAP_SYS_ADMIN)) 3136 if (!capable(CAP_SYS_ADMIN))
3134 return -EPERM; 3137 return -EPERM;
3135 3138
3136 if (msg_count < 5) { 3139 if (msg_count < 5) {
3137 msg_count++; 3140 msg_count++;
3138 printk(KERN_INFO 3141 printk(KERN_INFO
3139 "warning: process `%s' used the obsolete bdflush" 3142 "warning: process `%s' used the obsolete bdflush"
3140 " system call\n", current->comm); 3143 " system call\n", current->comm);
3141 printk(KERN_INFO "Fix your initscripts?\n"); 3144 printk(KERN_INFO "Fix your initscripts?\n");
3142 } 3145 }
3143 3146
3144 if (func == 1) 3147 if (func == 1)
3145 do_exit(0); 3148 do_exit(0);
3146 return 0; 3149 return 0;
3147 } 3150 }
3148 3151
3149 /* 3152 /*
3150 * Buffer-head allocation 3153 * Buffer-head allocation
3151 */ 3154 */
3152 static struct kmem_cache *bh_cachep; 3155 static struct kmem_cache *bh_cachep;
3153 3156
3154 /* 3157 /*
3155 * Once the number of bh's in the machine exceeds this level, we start 3158 * Once the number of bh's in the machine exceeds this level, we start
3156 * stripping them in writeback. 3159 * stripping them in writeback.
3157 */ 3160 */
3158 static int max_buffer_heads; 3161 static int max_buffer_heads;
3159 3162
3160 int buffer_heads_over_limit; 3163 int buffer_heads_over_limit;
3161 3164
3162 struct bh_accounting { 3165 struct bh_accounting {
3163 int nr; /* Number of live bh's */ 3166 int nr; /* Number of live bh's */
3164 int ratelimit; /* Limit cacheline bouncing */ 3167 int ratelimit; /* Limit cacheline bouncing */
3165 }; 3168 };
3166 3169
3167 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0}; 3170 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3168 3171
3169 static void recalc_bh_state(void) 3172 static void recalc_bh_state(void)
3170 { 3173 {
3171 int i; 3174 int i;
3172 int tot = 0; 3175 int tot = 0;
3173 3176
3174 if (__get_cpu_var(bh_accounting).ratelimit++ < 4096) 3177 if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
3175 return; 3178 return;
3176 __get_cpu_var(bh_accounting).ratelimit = 0; 3179 __get_cpu_var(bh_accounting).ratelimit = 0;
3177 for_each_online_cpu(i) 3180 for_each_online_cpu(i)
3178 tot += per_cpu(bh_accounting, i).nr; 3181 tot += per_cpu(bh_accounting, i).nr;
3179 buffer_heads_over_limit = (tot > max_buffer_heads); 3182 buffer_heads_over_limit = (tot > max_buffer_heads);
3180 } 3183 }
3181 3184
3182 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) 3185 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3183 { 3186 {
3184 struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags); 3187 struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
3185 if (ret) { 3188 if (ret) {
3186 INIT_LIST_HEAD(&ret->b_assoc_buffers); 3189 INIT_LIST_HEAD(&ret->b_assoc_buffers);
3187 get_cpu_var(bh_accounting).nr++; 3190 get_cpu_var(bh_accounting).nr++;
3188 recalc_bh_state(); 3191 recalc_bh_state();
3189 put_cpu_var(bh_accounting); 3192 put_cpu_var(bh_accounting);
3190 } 3193 }
3191 return ret; 3194 return ret;
3192 } 3195 }
3193 EXPORT_SYMBOL(alloc_buffer_head); 3196 EXPORT_SYMBOL(alloc_buffer_head);
3194 3197
3195 void free_buffer_head(struct buffer_head *bh) 3198 void free_buffer_head(struct buffer_head *bh)
3196 { 3199 {
3197 BUG_ON(!list_empty(&bh->b_assoc_buffers)); 3200 BUG_ON(!list_empty(&bh->b_assoc_buffers));
3198 kmem_cache_free(bh_cachep, bh); 3201 kmem_cache_free(bh_cachep, bh);
3199 get_cpu_var(bh_accounting).nr--; 3202 get_cpu_var(bh_accounting).nr--;
3200 recalc_bh_state(); 3203 recalc_bh_state();
3201 put_cpu_var(bh_accounting); 3204 put_cpu_var(bh_accounting);
3202 } 3205 }
3203 EXPORT_SYMBOL(free_buffer_head); 3206 EXPORT_SYMBOL(free_buffer_head);
3204 3207
3205 static void buffer_exit_cpu(int cpu) 3208 static void buffer_exit_cpu(int cpu)
3206 { 3209 {
3207 int i; 3210 int i;
3208 struct bh_lru *b = &per_cpu(bh_lrus, cpu); 3211 struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3209 3212
3210 for (i = 0; i < BH_LRU_SIZE; i++) { 3213 for (i = 0; i < BH_LRU_SIZE; i++) {
3211 brelse(b->bhs[i]); 3214 brelse(b->bhs[i]);
3212 b->bhs[i] = NULL; 3215 b->bhs[i] = NULL;
3213 } 3216 }
3214 get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr; 3217 get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
3215 per_cpu(bh_accounting, cpu).nr = 0; 3218 per_cpu(bh_accounting, cpu).nr = 0;
3216 put_cpu_var(bh_accounting); 3219 put_cpu_var(bh_accounting);
3217 } 3220 }
3218 3221
3219 static int buffer_cpu_notify(struct notifier_block *self, 3222 static int buffer_cpu_notify(struct notifier_block *self,
3220 unsigned long action, void *hcpu) 3223 unsigned long action, void *hcpu)
3221 { 3224 {
3222 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) 3225 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
3223 buffer_exit_cpu((unsigned long)hcpu); 3226 buffer_exit_cpu((unsigned long)hcpu);
3224 return NOTIFY_OK; 3227 return NOTIFY_OK;
3225 } 3228 }
3226 3229
3227 /** 3230 /**
3228 * bh_uptodate_or_lock - Test whether the buffer is uptodate 3231 * bh_uptodate_or_lock - Test whether the buffer is uptodate
3229 * @bh: struct buffer_head 3232 * @bh: struct buffer_head
3230 * 3233 *
3231 * Return true if the buffer is up-to-date and false, 3234 * Return true if the buffer is up-to-date and false,
3232 * with the buffer locked, if not. 3235 * with the buffer locked, if not.
3233 */ 3236 */
3234 int bh_uptodate_or_lock(struct buffer_head *bh) 3237 int bh_uptodate_or_lock(struct buffer_head *bh)
3235 { 3238 {
3236 if (!buffer_uptodate(bh)) { 3239 if (!buffer_uptodate(bh)) {
3237 lock_buffer(bh); 3240 lock_buffer(bh);
3238 if (!buffer_uptodate(bh)) 3241 if (!buffer_uptodate(bh))
3239 return 0; 3242 return 0;
3240 unlock_buffer(bh); 3243 unlock_buffer(bh);
3241 } 3244 }
3242 return 1; 3245 return 1;
3243 } 3246 }
3244 EXPORT_SYMBOL(bh_uptodate_or_lock); 3247 EXPORT_SYMBOL(bh_uptodate_or_lock);
3245 3248
3246 /** 3249 /**
3247 * bh_submit_read - Submit a locked buffer for reading 3250 * bh_submit_read - Submit a locked buffer for reading
3248 * @bh: struct buffer_head 3251 * @bh: struct buffer_head
3249 * 3252 *
3250 * Returns zero on success and -EIO on error. 3253 * Returns zero on success and -EIO on error.
3251 */ 3254 */
3252 int bh_submit_read(struct buffer_head *bh) 3255 int bh_submit_read(struct buffer_head *bh)
3253 { 3256 {
3254 BUG_ON(!buffer_locked(bh)); 3257 BUG_ON(!buffer_locked(bh));
3255 3258
3256 if (buffer_uptodate(bh)) { 3259 if (buffer_uptodate(bh)) {
3257 unlock_buffer(bh); 3260 unlock_buffer(bh);
3258 return 0; 3261 return 0;
3259 } 3262 }
3260 3263
3261 get_bh(bh); 3264 get_bh(bh);
3262 bh->b_end_io = end_buffer_read_sync; 3265 bh->b_end_io = end_buffer_read_sync;
3263 submit_bh(READ, bh); 3266 submit_bh(READ, bh);
3264 wait_on_buffer(bh); 3267 wait_on_buffer(bh);
3265 if (buffer_uptodate(bh)) 3268 if (buffer_uptodate(bh))
3266 return 0; 3269 return 0;
3267 return -EIO; 3270 return -EIO;
3268 } 3271 }
3269 EXPORT_SYMBOL(bh_submit_read); 3272 EXPORT_SYMBOL(bh_submit_read);
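The two helpers above are designed to be used as a pair: bh_uptodate_or_lock() either reports the buffer as already valid or returns it locked, ready to be handed to bh_submit_read(). A short sketch:

    #include <linux/buffer_head.h>

    static int myfs_read_bh(struct buffer_head *bh)
    {
    	if (bh_uptodate_or_lock(bh))
    		return 0;		/* already up to date; buffer left unlocked */
    	return bh_submit_read(bh);	/* we hold the lock: submit, wait, 0 or -EIO */
    }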
3270 3273
3271 static void 3274 static void
3272 init_buffer_head(struct kmem_cache *cachep, void *data) 3275 init_buffer_head(struct kmem_cache *cachep, void *data)
3273 { 3276 {
3274 struct buffer_head *bh = data; 3277 struct buffer_head *bh = data;
3275 3278
3276 memset(bh, 0, sizeof(*bh)); 3279 memset(bh, 0, sizeof(*bh));
3277 INIT_LIST_HEAD(&bh->b_assoc_buffers); 3280 INIT_LIST_HEAD(&bh->b_assoc_buffers);
3278 } 3281 }
3279 3282
3280 void __init buffer_init(void) 3283 void __init buffer_init(void)
3281 { 3284 {
3282 int nrpages; 3285 int nrpages;
3283 3286
3284 bh_cachep = kmem_cache_create("buffer_head", 3287 bh_cachep = kmem_cache_create("buffer_head",
3285 sizeof(struct buffer_head), 0, 3288 sizeof(struct buffer_head), 0,
3286 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| 3289 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3287 SLAB_MEM_SPREAD), 3290 SLAB_MEM_SPREAD),
3288 init_buffer_head); 3291 init_buffer_head);
3289 3292
3290 /* 3293 /*
3291 * Limit the bh occupancy to 10% of ZONE_NORMAL 3294 * Limit the bh occupancy to 10% of ZONE_NORMAL
3292 */ 3295 */
3293 nrpages = (nr_free_buffer_pages() * 10) / 100; 3296 nrpages = (nr_free_buffer_pages() * 10) / 100;
3294 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)); 3297 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3295 hotcpu_notifier(buffer_cpu_notify, 0); 3298 hotcpu_notifier(buffer_cpu_notify, 0);
3296 } 3299 }
3297 3300
3298 EXPORT_SYMBOL(__bforget); 3301 EXPORT_SYMBOL(__bforget);
3299 EXPORT_SYMBOL(__brelse); 3302 EXPORT_SYMBOL(__brelse);
3300 EXPORT_SYMBOL(__wait_on_buffer); 3303 EXPORT_SYMBOL(__wait_on_buffer);
3301 EXPORT_SYMBOL(block_commit_write); 3304 EXPORT_SYMBOL(block_commit_write);
3302 EXPORT_SYMBOL(block_prepare_write); 3305 EXPORT_SYMBOL(block_prepare_write);
3303 EXPORT_SYMBOL(block_page_mkwrite); 3306 EXPORT_SYMBOL(block_page_mkwrite);
3304 EXPORT_SYMBOL(block_read_full_page); 3307 EXPORT_SYMBOL(block_read_full_page);
3305 EXPORT_SYMBOL(block_sync_page); 3308 EXPORT_SYMBOL(block_sync_page);
3306 EXPORT_SYMBOL(block_truncate_page); 3309 EXPORT_SYMBOL(block_truncate_page);
3307 EXPORT_SYMBOL(block_write_full_page); 3310 EXPORT_SYMBOL(block_write_full_page);
3308 EXPORT_SYMBOL(cont_write_begin); 3311 EXPORT_SYMBOL(cont_write_begin);
3309 EXPORT_SYMBOL(end_buffer_read_sync); 3312 EXPORT_SYMBOL(end_buffer_read_sync);
3310 EXPORT_SYMBOL(end_buffer_write_sync); 3313 EXPORT_SYMBOL(end_buffer_write_sync);
3311 EXPORT_SYMBOL(file_fsync); 3314 EXPORT_SYMBOL(file_fsync);
3312 EXPORT_SYMBOL(fsync_bdev); 3315 EXPORT_SYMBOL(fsync_bdev);
3313 EXPORT_SYMBOL(generic_block_bmap); 3316 EXPORT_SYMBOL(generic_block_bmap);
3314 EXPORT_SYMBOL(generic_cont_expand_simple); 3317 EXPORT_SYMBOL(generic_cont_expand_simple);
3315 EXPORT_SYMBOL(init_buffer); 3318 EXPORT_SYMBOL(init_buffer);
3316 EXPORT_SYMBOL(invalidate_bdev); 3319 EXPORT_SYMBOL(invalidate_bdev);
3317 EXPORT_SYMBOL(ll_rw_block); 3320 EXPORT_SYMBOL(ll_rw_block);
3318 EXPORT_SYMBOL(mark_buffer_dirty); 3321 EXPORT_SYMBOL(mark_buffer_dirty);
3319 EXPORT_SYMBOL(submit_bh); 3322 EXPORT_SYMBOL(submit_bh);
3320 EXPORT_SYMBOL(sync_dirty_buffer); 3323 EXPORT_SYMBOL(sync_dirty_buffer);
3321 EXPORT_SYMBOL(unlock_buffer); 3324 EXPORT_SYMBOL(unlock_buffer);
3322 3325
1 /* 1 /*
2 * fs/mpage.c 2 * fs/mpage.c
3 * 3 *
4 * Copyright (C) 2002, Linus Torvalds. 4 * Copyright (C) 2002, Linus Torvalds.
5 * 5 *
6 * Contains functions related to preparing and submitting BIOs which contain 6 * Contains functions related to preparing and submitting BIOs which contain
7 * multiple pagecache pages. 7 * multiple pagecache pages.
8 * 8 *
9 * 15May2002 akpm@zip.com.au 9 * 15May2002 akpm@zip.com.au
10 * Initial version 10 * Initial version
11 * 27Jun2002 axboe@suse.de 11 * 27Jun2002 axboe@suse.de
12 * use bio_add_page() to build bio's just the right size 12 * use bio_add_page() to build bio's just the right size
13 */ 13 */
14 14
15 #include <linux/kernel.h> 15 #include <linux/kernel.h>
16 #include <linux/module.h> 16 #include <linux/module.h>
17 #include <linux/mm.h> 17 #include <linux/mm.h>
18 #include <linux/kdev_t.h> 18 #include <linux/kdev_t.h>
19 #include <linux/bio.h> 19 #include <linux/bio.h>
20 #include <linux/fs.h> 20 #include <linux/fs.h>
21 #include <linux/buffer_head.h> 21 #include <linux/buffer_head.h>
22 #include <linux/blkdev.h> 22 #include <linux/blkdev.h>
23 #include <linux/highmem.h> 23 #include <linux/highmem.h>
24 #include <linux/prefetch.h> 24 #include <linux/prefetch.h>
25 #include <linux/mpage.h> 25 #include <linux/mpage.h>
26 #include <linux/writeback.h> 26 #include <linux/writeback.h>
27 #include <linux/backing-dev.h> 27 #include <linux/backing-dev.h>
28 #include <linux/pagevec.h> 28 #include <linux/pagevec.h>
29 29
30 /* 30 /*
31 * I/O completion handler for multipage BIOs. 31 * I/O completion handler for multipage BIOs.
32 * 32 *
33 * The mpage code never puts partial pages into a BIO (except for end-of-file). 33 * The mpage code never puts partial pages into a BIO (except for end-of-file).
34 * If a page does not map to a contiguous run of blocks then it simply falls 34 * If a page does not map to a contiguous run of blocks then it simply falls
35 * back to block_read_full_page(). 35 * back to block_read_full_page().
36 * 36 *
37 * Why is this? If a page's completion depends on a number of different BIOs 37 * Why is this? If a page's completion depends on a number of different BIOs
38 * which can complete in any order (or at the same time) then determining the 38 * which can complete in any order (or at the same time) then determining the
39 * status of that page is hard. See end_buffer_async_read() for the details. 39 * status of that page is hard. See end_buffer_async_read() for the details.
40 * There is no point in duplicating all that complexity. 40 * There is no point in duplicating all that complexity.
41 */ 41 */
42 static void mpage_end_io_read(struct bio *bio, int err) 42 static void mpage_end_io_read(struct bio *bio, int err)
43 { 43 {
44 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 44 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
45 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 45 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
46 46
47 do { 47 do {
48 struct page *page = bvec->bv_page; 48 struct page *page = bvec->bv_page;
49 49
50 if (--bvec >= bio->bi_io_vec) 50 if (--bvec >= bio->bi_io_vec)
51 prefetchw(&bvec->bv_page->flags); 51 prefetchw(&bvec->bv_page->flags);
52 52
53 if (uptodate) { 53 if (uptodate) {
54 SetPageUptodate(page); 54 SetPageUptodate(page);
55 } else { 55 } else {
56 ClearPageUptodate(page); 56 ClearPageUptodate(page);
57 SetPageError(page); 57 SetPageError(page);
58 } 58 }
59 unlock_page(page); 59 unlock_page(page);
60 } while (bvec >= bio->bi_io_vec); 60 } while (bvec >= bio->bi_io_vec);
61 bio_put(bio); 61 bio_put(bio);
62 } 62 }
63 63
64 static void mpage_end_io_write(struct bio *bio, int err) 64 static void mpage_end_io_write(struct bio *bio, int err)
65 { 65 {
66 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 66 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
67 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 67 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
68 68
69 do { 69 do {
70 struct page *page = bvec->bv_page; 70 struct page *page = bvec->bv_page;
71 71
72 if (--bvec >= bio->bi_io_vec) 72 if (--bvec >= bio->bi_io_vec)
73 prefetchw(&bvec->bv_page->flags); 73 prefetchw(&bvec->bv_page->flags);
74 74
75 if (!uptodate){ 75 if (!uptodate){
76 SetPageError(page); 76 SetPageError(page);
77 if (page->mapping) 77 if (page->mapping)
78 set_bit(AS_EIO, &page->mapping->flags); 78 set_bit(AS_EIO, &page->mapping->flags);
79 } 79 }
80 end_page_writeback(page); 80 end_page_writeback(page);
81 } while (bvec >= bio->bi_io_vec); 81 } while (bvec >= bio->bi_io_vec);
82 bio_put(bio); 82 bio_put(bio);
83 } 83 }
84 84
85 static struct bio *mpage_bio_submit(int rw, struct bio *bio) 85 struct bio *mpage_bio_submit(int rw, struct bio *bio)
86 { 86 {
87 bio->bi_end_io = mpage_end_io_read; 87 bio->bi_end_io = mpage_end_io_read;
88 if (rw == WRITE) 88 if (rw == WRITE)
89 bio->bi_end_io = mpage_end_io_write; 89 bio->bi_end_io = mpage_end_io_write;
90 submit_bio(rw, bio); 90 submit_bio(rw, bio);
91 return NULL; 91 return NULL;
92 } 92 }
93 EXPORT_SYMBOL(mpage_bio_submit);
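mpage_bio_submit() is made non-static and exported here, and __mpage_writepage() further down similarly loses its static qualifier, so that code outside fs/mpage.c can drive the mpage write path itself. A hedged sketch of what such an external caller might look like, closely mirroring mpage_writepages(); it assumes struct mpage_data and the new prototypes become visible through a shared header, and myfs_get_block() is hypothetical:

    #include <linux/mpage.h>
    #include <linux/writeback.h>
    #include <linux/buffer_head.h>

    extern int myfs_get_block(struct inode *, sector_t, struct buffer_head *, int);

    static int myfs_writepages(struct address_space *mapping,
    			   struct writeback_control *wbc)
    {
    	struct mpage_data mpd = {
    		.bio			= NULL,
    		.last_block_in_bio	= 0,
    		.get_block		= myfs_get_block,
    		.use_writepage		= 1,
    	};
    	int ret;

    	ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
    	if (mpd.bio)
    		mpage_bio_submit(WRITE, mpd.bio);	/* flush the final partial bio */
    	return ret;
    }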
93 94
94 static struct bio * 95 static struct bio *
95 mpage_alloc(struct block_device *bdev, 96 mpage_alloc(struct block_device *bdev,
96 sector_t first_sector, int nr_vecs, 97 sector_t first_sector, int nr_vecs,
97 gfp_t gfp_flags) 98 gfp_t gfp_flags)
98 { 99 {
99 struct bio *bio; 100 struct bio *bio;
100 101
101 bio = bio_alloc(gfp_flags, nr_vecs); 102 bio = bio_alloc(gfp_flags, nr_vecs);
102 103
103 if (bio == NULL && (current->flags & PF_MEMALLOC)) { 104 if (bio == NULL && (current->flags & PF_MEMALLOC)) {
104 while (!bio && (nr_vecs /= 2)) 105 while (!bio && (nr_vecs /= 2))
105 bio = bio_alloc(gfp_flags, nr_vecs); 106 bio = bio_alloc(gfp_flags, nr_vecs);
106 } 107 }
107 108
108 if (bio) { 109 if (bio) {
109 bio->bi_bdev = bdev; 110 bio->bi_bdev = bdev;
110 bio->bi_sector = first_sector; 111 bio->bi_sector = first_sector;
111 } 112 }
112 return bio; 113 return bio;
113 } 114 }
114 115
115 /* 116 /*
116 * support function for mpage_readpages. The fs supplied get_block might 117 * support function for mpage_readpages. The fs supplied get_block might
117 * return an up to date buffer. This is used to map that buffer into 118 * return an up to date buffer. This is used to map that buffer into
118 * the page, which allows readpage to avoid triggering a duplicate call 119 * the page, which allows readpage to avoid triggering a duplicate call
119 * to get_block. 120 * to get_block.
120 * 121 *
121 * The idea is to avoid adding buffers to pages that don't already have 122 * The idea is to avoid adding buffers to pages that don't already have
122 * them. So when the buffer is up to date and the page size == block size, 123 * them. So when the buffer is up to date and the page size == block size,
123 * this marks the page up to date instead of adding new buffers. 124 * this marks the page up to date instead of adding new buffers.
124 */ 125 */
125 static void 126 static void
126 map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) 127 map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block)
127 { 128 {
128 struct inode *inode = page->mapping->host; 129 struct inode *inode = page->mapping->host;
129 struct buffer_head *page_bh, *head; 130 struct buffer_head *page_bh, *head;
130 int block = 0; 131 int block = 0;
131 132
132 if (!page_has_buffers(page)) { 133 if (!page_has_buffers(page)) {
133 /* 134 /*
134 * don't make any buffers if there is only one buffer on 135 * don't make any buffers if there is only one buffer on
135 * the page and the page just needs to be set up to date 136 * the page and the page just needs to be set up to date
136 */ 137 */
137 if (inode->i_blkbits == PAGE_CACHE_SHIFT && 138 if (inode->i_blkbits == PAGE_CACHE_SHIFT &&
138 buffer_uptodate(bh)) { 139 buffer_uptodate(bh)) {
139 SetPageUptodate(page); 140 SetPageUptodate(page);
140 return; 141 return;
141 } 142 }
142 create_empty_buffers(page, 1 << inode->i_blkbits, 0); 143 create_empty_buffers(page, 1 << inode->i_blkbits, 0);
143 } 144 }
144 head = page_buffers(page); 145 head = page_buffers(page);
145 page_bh = head; 146 page_bh = head;
146 do { 147 do {
147 if (block == page_block) { 148 if (block == page_block) {
148 page_bh->b_state = bh->b_state; 149 page_bh->b_state = bh->b_state;
149 page_bh->b_bdev = bh->b_bdev; 150 page_bh->b_bdev = bh->b_bdev;
150 page_bh->b_blocknr = bh->b_blocknr; 151 page_bh->b_blocknr = bh->b_blocknr;
151 break; 152 break;
152 } 153 }
153 page_bh = page_bh->b_this_page; 154 page_bh = page_bh->b_this_page;
154 block++; 155 block++;
155 } while (page_bh != head); 156 } while (page_bh != head);
156 } 157 }
157 158
158 /* 159 /*
159 * This is the worker routine which does all the work of mapping the disk 160 * This is the worker routine which does all the work of mapping the disk
160 * blocks and constructs largest possible bios, submits them for IO if the 161 * blocks and constructs largest possible bios, submits them for IO if the
161 * blocks are not contiguous on the disk. 162 * blocks are not contiguous on the disk.
162 * 163 *
163 * We pass a buffer_head back and forth and use its buffer_mapped() flag to 164 * We pass a buffer_head back and forth and use its buffer_mapped() flag to
164 * represent the validity of its disk mapping and to decide when to do the next 165 * represent the validity of its disk mapping and to decide when to do the next
165 * get_block() call. 166 * get_block() call.
166 */ 167 */
167 static struct bio * 168 static struct bio *
168 do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, 169 do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
169 sector_t *last_block_in_bio, struct buffer_head *map_bh, 170 sector_t *last_block_in_bio, struct buffer_head *map_bh,
170 unsigned long *first_logical_block, get_block_t get_block) 171 unsigned long *first_logical_block, get_block_t get_block)
171 { 172 {
172 struct inode *inode = page->mapping->host; 173 struct inode *inode = page->mapping->host;
173 const unsigned blkbits = inode->i_blkbits; 174 const unsigned blkbits = inode->i_blkbits;
174 const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; 175 const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
175 const unsigned blocksize = 1 << blkbits; 176 const unsigned blocksize = 1 << blkbits;
176 sector_t block_in_file; 177 sector_t block_in_file;
177 sector_t last_block; 178 sector_t last_block;
178 sector_t last_block_in_file; 179 sector_t last_block_in_file;
179 sector_t blocks[MAX_BUF_PER_PAGE]; 180 sector_t blocks[MAX_BUF_PER_PAGE];
180 unsigned page_block; 181 unsigned page_block;
181 unsigned first_hole = blocks_per_page; 182 unsigned first_hole = blocks_per_page;
182 struct block_device *bdev = NULL; 183 struct block_device *bdev = NULL;
183 int length; 184 int length;
184 int fully_mapped = 1; 185 int fully_mapped = 1;
185 unsigned nblocks; 186 unsigned nblocks;
186 unsigned relative_block; 187 unsigned relative_block;
187 188
188 if (page_has_buffers(page)) 189 if (page_has_buffers(page))
189 goto confused; 190 goto confused;
190 191
191 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); 192 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
192 last_block = block_in_file + nr_pages * blocks_per_page; 193 last_block = block_in_file + nr_pages * blocks_per_page;
193 last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; 194 last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
194 if (last_block > last_block_in_file) 195 if (last_block > last_block_in_file)
195 last_block = last_block_in_file; 196 last_block = last_block_in_file;
196 page_block = 0; 197 page_block = 0;
197 198
198 /* 199 /*
199 * Map blocks using the result from the previous get_blocks call first. 200 * Map blocks using the result from the previous get_blocks call first.
200 */ 201 */
201 nblocks = map_bh->b_size >> blkbits; 202 nblocks = map_bh->b_size >> blkbits;
202 if (buffer_mapped(map_bh) && block_in_file > *first_logical_block && 203 if (buffer_mapped(map_bh) && block_in_file > *first_logical_block &&
203 block_in_file < (*first_logical_block + nblocks)) { 204 block_in_file < (*first_logical_block + nblocks)) {
204 unsigned map_offset = block_in_file - *first_logical_block; 205 unsigned map_offset = block_in_file - *first_logical_block;
205 unsigned last = nblocks - map_offset; 206 unsigned last = nblocks - map_offset;
206 207
207 for (relative_block = 0; ; relative_block++) { 208 for (relative_block = 0; ; relative_block++) {
208 if (relative_block == last) { 209 if (relative_block == last) {
209 clear_buffer_mapped(map_bh); 210 clear_buffer_mapped(map_bh);
210 break; 211 break;
211 } 212 }
212 if (page_block == blocks_per_page) 213 if (page_block == blocks_per_page)
213 break; 214 break;
214 blocks[page_block] = map_bh->b_blocknr + map_offset + 215 blocks[page_block] = map_bh->b_blocknr + map_offset +
215 relative_block; 216 relative_block;
216 page_block++; 217 page_block++;
217 block_in_file++; 218 block_in_file++;
218 } 219 }
219 bdev = map_bh->b_bdev; 220 bdev = map_bh->b_bdev;
220 } 221 }
221 222
222 /* 223 /*
223 * Then do more get_blocks calls until we are done with this page. 224 * Then do more get_blocks calls until we are done with this page.
224 */ 225 */
225 map_bh->b_page = page; 226 map_bh->b_page = page;
226 while (page_block < blocks_per_page) { 227 while (page_block < blocks_per_page) {
227 map_bh->b_state = 0; 228 map_bh->b_state = 0;
228 map_bh->b_size = 0; 229 map_bh->b_size = 0;
229 230
230 if (block_in_file < last_block) { 231 if (block_in_file < last_block) {
231 map_bh->b_size = (last_block-block_in_file) << blkbits; 232 map_bh->b_size = (last_block-block_in_file) << blkbits;
232 if (get_block(inode, block_in_file, map_bh, 0)) 233 if (get_block(inode, block_in_file, map_bh, 0))
233 goto confused; 234 goto confused;
234 *first_logical_block = block_in_file; 235 *first_logical_block = block_in_file;
235 } 236 }
236 237
237 if (!buffer_mapped(map_bh)) { 238 if (!buffer_mapped(map_bh)) {
238 fully_mapped = 0; 239 fully_mapped = 0;
239 if (first_hole == blocks_per_page) 240 if (first_hole == blocks_per_page)
240 first_hole = page_block; 241 first_hole = page_block;
241 page_block++; 242 page_block++;
242 block_in_file++; 243 block_in_file++;
243 clear_buffer_mapped(map_bh); 244 clear_buffer_mapped(map_bh);
244 continue; 245 continue;
245 } 246 }
246 247
247 /* some filesystems will copy data into the page during 248 /* some filesystems will copy data into the page during
248 * the get_block call, in which case we don't want to 249 * the get_block call, in which case we don't want to
249 * read it again. map_buffer_to_page copies the data 250 * read it again. map_buffer_to_page copies the data
250 * we just collected from get_block into the page's buffers 251 * we just collected from get_block into the page's buffers
251 * so readpage doesn't have to repeat the get_block call 252 * so readpage doesn't have to repeat the get_block call
252 */ 253 */
253 if (buffer_uptodate(map_bh)) { 254 if (buffer_uptodate(map_bh)) {
254 map_buffer_to_page(page, map_bh, page_block); 255 map_buffer_to_page(page, map_bh, page_block);
255 goto confused; 256 goto confused;
256 } 257 }
257 258
258 if (first_hole != blocks_per_page) 259 if (first_hole != blocks_per_page)
259 goto confused; /* hole -> non-hole */ 260 goto confused; /* hole -> non-hole */
260 261
261 /* Contiguous blocks? */ 262 /* Contiguous blocks? */
262 if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1) 263 if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1)
263 goto confused; 264 goto confused;
264 nblocks = map_bh->b_size >> blkbits; 265 nblocks = map_bh->b_size >> blkbits;
265 for (relative_block = 0; ; relative_block++) { 266 for (relative_block = 0; ; relative_block++) {
266 if (relative_block == nblocks) { 267 if (relative_block == nblocks) {
267 clear_buffer_mapped(map_bh); 268 clear_buffer_mapped(map_bh);
268 break; 269 break;
269 } else if (page_block == blocks_per_page) 270 } else if (page_block == blocks_per_page)
270 break; 271 break;
271 blocks[page_block] = map_bh->b_blocknr+relative_block; 272 blocks[page_block] = map_bh->b_blocknr+relative_block;
272 page_block++; 273 page_block++;
273 block_in_file++; 274 block_in_file++;
274 } 275 }
275 bdev = map_bh->b_bdev; 276 bdev = map_bh->b_bdev;
276 } 277 }
277 278
278 if (first_hole != blocks_per_page) { 279 if (first_hole != blocks_per_page) {
279 zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE); 280 zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE);
280 if (first_hole == 0) { 281 if (first_hole == 0) {
281 SetPageUptodate(page); 282 SetPageUptodate(page);
282 unlock_page(page); 283 unlock_page(page);
283 goto out; 284 goto out;
284 } 285 }
285 } else if (fully_mapped) { 286 } else if (fully_mapped) {
286 SetPageMappedToDisk(page); 287 SetPageMappedToDisk(page);
287 } 288 }
288 289
289 /* 290 /*
290 * This page will go to BIO. Do we need to send this BIO off first? 291 * This page will go to BIO. Do we need to send this BIO off first?
291 */ 292 */
292 if (bio && (*last_block_in_bio != blocks[0] - 1)) 293 if (bio && (*last_block_in_bio != blocks[0] - 1))
293 bio = mpage_bio_submit(READ, bio); 294 bio = mpage_bio_submit(READ, bio);
294 295
295 alloc_new: 296 alloc_new:
296 if (bio == NULL) { 297 if (bio == NULL) {
297 bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), 298 bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
298 min_t(int, nr_pages, bio_get_nr_vecs(bdev)), 299 min_t(int, nr_pages, bio_get_nr_vecs(bdev)),
299 GFP_KERNEL); 300 GFP_KERNEL);
300 if (bio == NULL) 301 if (bio == NULL)
301 goto confused; 302 goto confused;
302 } 303 }
303 304
304 length = first_hole << blkbits; 305 length = first_hole << blkbits;
305 if (bio_add_page(bio, page, length, 0) < length) { 306 if (bio_add_page(bio, page, length, 0) < length) {
306 bio = mpage_bio_submit(READ, bio); 307 bio = mpage_bio_submit(READ, bio);
307 goto alloc_new; 308 goto alloc_new;
308 } 309 }
309 310
310 if (buffer_boundary(map_bh) || (first_hole != blocks_per_page)) 311 if (buffer_boundary(map_bh) || (first_hole != blocks_per_page))
311 bio = mpage_bio_submit(READ, bio); 312 bio = mpage_bio_submit(READ, bio);
312 else 313 else
313 *last_block_in_bio = blocks[blocks_per_page - 1]; 314 *last_block_in_bio = blocks[blocks_per_page - 1];
314 out: 315 out:
315 return bio; 316 return bio;
316 317
317 confused: 318 confused:
318 if (bio) 319 if (bio)
319 bio = mpage_bio_submit(READ, bio); 320 bio = mpage_bio_submit(READ, bio);
320 if (!PageUptodate(page)) 321 if (!PageUptodate(page))
321 block_read_full_page(page, get_block); 322 block_read_full_page(page, get_block);
322 else 323 else
323 unlock_page(page); 324 unlock_page(page);
324 goto out; 325 goto out;
325 } 326 }
326 327
327 /** 328 /**
328 * mpage_readpages - populate an address space with some pages & start reads against them 329 * mpage_readpages - populate an address space with some pages & start reads against them
329 * @mapping: the address_space 330 * @mapping: the address_space
330 * @pages: The address of a list_head which contains the target pages. These 331 * @pages: The address of a list_head which contains the target pages. These
331 * pages have their ->index populated and are otherwise uninitialised. 332 * pages have their ->index populated and are otherwise uninitialised.
332 * The page at @pages->prev has the lowest file offset, and reads should be 333 * The page at @pages->prev has the lowest file offset, and reads should be
333 * issued in @pages->prev to @pages->next order. 334 * issued in @pages->prev to @pages->next order.
334 * @nr_pages: The number of pages at *@pages 335 * @nr_pages: The number of pages at *@pages
335 * @get_block: The filesystem's block mapper function. 336 * @get_block: The filesystem's block mapper function.
336 * 337 *
337 * This function walks the pages and the blocks within each page, building and 338 * This function walks the pages and the blocks within each page, building and
338 * emitting large BIOs. 339 * emitting large BIOs.
339 * 340 *
340 * If anything unusual happens, such as: 341 * If anything unusual happens, such as:
341 * 342 *
342 * - encountering a page which has buffers 343 * - encountering a page which has buffers
343 * - encountering a page which has a non-hole after a hole 344 * - encountering a page which has a non-hole after a hole
344 * - encountering a page with non-contiguous blocks 345 * - encountering a page with non-contiguous blocks
345 * 346 *
346 * then this code just gives up and calls the buffer_head-based read function. 347 * then this code just gives up and calls the buffer_head-based read function.
347 * It does handle a page which has holes at the end - that is a common case: 348 * It does handle a page which has holes at the end - that is a common case:
348 * the end-of-file on blocksize < PAGE_CACHE_SIZE setups. 349 * the end-of-file on blocksize < PAGE_CACHE_SIZE setups.
349 * 350 *
350 * BH_Boundary explanation: 351 * BH_Boundary explanation:
351 * 352 *
352 * There is a problem. The mpage read code assembles several pages, gets all 353 * There is a problem. The mpage read code assembles several pages, gets all
353 * their disk mappings, and then submits them all. That's fine, but obtaining 354 * their disk mappings, and then submits them all. That's fine, but obtaining
354 * the disk mappings may require I/O. Reads of indirect blocks, for example. 355 * the disk mappings may require I/O. Reads of indirect blocks, for example.
355 * 356 *
356 * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be 357 * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be
357 * submitted in the following order: 358 * submitted in the following order:
358 * 12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16 359 * 12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16
359 * 360 *
360 * because the indirect block has to be read to get the mappings of blocks 361 * because the indirect block has to be read to get the mappings of blocks
361 * 13,14,15,16. Obviously, this impacts performance. 362 * 13,14,15,16. Obviously, this impacts performance.
362 * 363 *
363 * So what we do is to allow the filesystem's get_block() function to set 364 * So what we do is to allow the filesystem's get_block() function to set
364 * BH_Boundary when it maps block 11. BH_Boundary says: mapping of the block 365 * BH_Boundary when it maps block 11. BH_Boundary says: mapping of the block
365 * after this one will require I/O against a block which is probably close to 366 * after this one will require I/O against a block which is probably close to
366 * this one. So you should push what I/O you have currently accumulated. 367 * this one. So you should push what I/O you have currently accumulated.
367 * 368 *
368 * This all causes the disk requests to be issued in the correct order. 369 * This all causes the disk requests to be issued in the correct order.
369 */ 370 */
370 int 371 int
371 mpage_readpages(struct address_space *mapping, struct list_head *pages, 372 mpage_readpages(struct address_space *mapping, struct list_head *pages,
372 unsigned nr_pages, get_block_t get_block) 373 unsigned nr_pages, get_block_t get_block)
373 { 374 {
374 struct bio *bio = NULL; 375 struct bio *bio = NULL;
375 unsigned page_idx; 376 unsigned page_idx;
376 sector_t last_block_in_bio = 0; 377 sector_t last_block_in_bio = 0;
377 struct buffer_head map_bh; 378 struct buffer_head map_bh;
378 unsigned long first_logical_block = 0; 379 unsigned long first_logical_block = 0;
379 380
380 clear_buffer_mapped(&map_bh); 381 clear_buffer_mapped(&map_bh);
381 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 382 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
382 struct page *page = list_entry(pages->prev, struct page, lru); 383 struct page *page = list_entry(pages->prev, struct page, lru);
383 384
384 prefetchw(&page->flags); 385 prefetchw(&page->flags);
385 list_del(&page->lru); 386 list_del(&page->lru);
386 if (!add_to_page_cache_lru(page, mapping, 387 if (!add_to_page_cache_lru(page, mapping,
387 page->index, GFP_KERNEL)) { 388 page->index, GFP_KERNEL)) {
388 bio = do_mpage_readpage(bio, page, 389 bio = do_mpage_readpage(bio, page,
389 nr_pages - page_idx, 390 nr_pages - page_idx,
390 &last_block_in_bio, &map_bh, 391 &last_block_in_bio, &map_bh,
391 &first_logical_block, 392 &first_logical_block,
392 get_block); 393 get_block);
393 } 394 }
394 page_cache_release(page); 395 page_cache_release(page);
395 } 396 }
396 BUG_ON(!list_empty(pages)); 397 BUG_ON(!list_empty(pages));
397 if (bio) 398 if (bio)
398 mpage_bio_submit(READ, bio); 399 mpage_bio_submit(READ, bio);
399 return 0; 400 return 0;
400 } 401 }
401 EXPORT_SYMBOL(mpage_readpages); 402 EXPORT_SYMBOL(mpage_readpages);
402 403
403 /* 404 /*
404 * This isn't called much at all 405 * This isn't called much at all
405 */ 406 */
406 int mpage_readpage(struct page *page, get_block_t get_block) 407 int mpage_readpage(struct page *page, get_block_t get_block)
407 { 408 {
408 struct bio *bio = NULL; 409 struct bio *bio = NULL;
409 sector_t last_block_in_bio = 0; 410 sector_t last_block_in_bio = 0;
410 struct buffer_head map_bh; 411 struct buffer_head map_bh;
411 unsigned long first_logical_block = 0; 412 unsigned long first_logical_block = 0;
412 413
413 clear_buffer_mapped(&map_bh); 414 clear_buffer_mapped(&map_bh);
414 bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio, 415 bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio,
415 &map_bh, &first_logical_block, get_block); 416 &map_bh, &first_logical_block, get_block);
416 if (bio) 417 if (bio)
417 mpage_bio_submit(READ, bio); 418 mpage_bio_submit(READ, bio);
418 return 0; 419 return 0;
419 } 420 }
420 EXPORT_SYMBOL(mpage_readpage); 421 EXPORT_SYMBOL(mpage_readpage);
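For completeness, the conventional wiring of mpage_readpage() and mpage_readpages() into a filesystem's address_space_operations (hypothetical myfs_get_block() again):

    #include <linux/mpage.h>
    #include <linux/buffer_head.h>

    extern int myfs_get_block(struct inode *, sector_t, struct buffer_head *, int);

    static int myfs_readpage(struct file *file, struct page *page)
    {
    	return mpage_readpage(page, myfs_get_block);
    }

    static int myfs_readpages(struct file *file, struct address_space *mapping,
    			  struct list_head *pages, unsigned nr_pages)
    {
    	return mpage_readpages(mapping, pages, nr_pages, myfs_get_block);
    }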
421 422
422 /* 423 /*
423 * Writing is not so simple. 424 * Writing is not so simple.
424 * 425 *
425 * If the page has buffers then they will be used for obtaining the disk 426 * If the page has buffers then they will be used for obtaining the disk
426 * mapping. We only support pages which are fully mapped-and-dirty, with a 427 * mapping. We only support pages which are fully mapped-and-dirty, with a
427 * special case for pages which are unmapped at the end: end-of-file. 428 * special case for pages which are unmapped at the end: end-of-file.
428 * 429 *
429 * If the page has no buffers (preferred) then the page is mapped here. 430 * If the page has no buffers (preferred) then the page is mapped here.
430 * 431 *
431 * If all blocks are found to be contiguous then the page can go into the 432 * If all blocks are found to be contiguous then the page can go into the
432 * BIO. Otherwise fall back to the mapping's writepage(). 433 * BIO. Otherwise fall back to the mapping's writepage().
433 * 434 *
434 * FIXME: This code wants an estimate of how many pages are still to be 435 * FIXME: This code wants an estimate of how many pages are still to be
435 * written, so it can intelligently allocate a suitably-sized BIO. For now, 436 * written, so it can intelligently allocate a suitably-sized BIO. For now,
436 * just allocate full-size (16-page) BIOs. 437 * just allocate full-size (16-page) BIOs.
437 */ 438 */
438 struct mpage_data {
439 struct bio *bio;
440 sector_t last_block_in_bio;
441 get_block_t *get_block;
442 unsigned use_writepage;
443 };
444 439
445 static int __mpage_writepage(struct page *page, struct writeback_control *wbc, 440 int __mpage_writepage(struct page *page, struct writeback_control *wbc,
446 void *data) 441 void *data)
447 { 442 {
448 struct mpage_data *mpd = data; 443 struct mpage_data *mpd = data;
449 struct bio *bio = mpd->bio; 444 struct bio *bio = mpd->bio;
450 struct address_space *mapping = page->mapping; 445 struct address_space *mapping = page->mapping;
451 struct inode *inode = page->mapping->host; 446 struct inode *inode = page->mapping->host;
452 const unsigned blkbits = inode->i_blkbits; 447 const unsigned blkbits = inode->i_blkbits;
453 unsigned long end_index; 448 unsigned long end_index;
454 const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; 449 const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
455 sector_t last_block; 450 sector_t last_block;
456 sector_t block_in_file; 451 sector_t block_in_file;
457 sector_t blocks[MAX_BUF_PER_PAGE]; 452 sector_t blocks[MAX_BUF_PER_PAGE];
458 unsigned page_block; 453 unsigned page_block;
459 unsigned first_unmapped = blocks_per_page; 454 unsigned first_unmapped = blocks_per_page;
460 struct block_device *bdev = NULL; 455 struct block_device *bdev = NULL;
461 int boundary = 0; 456 int boundary = 0;
462 sector_t boundary_block = 0; 457 sector_t boundary_block = 0;
463 struct block_device *boundary_bdev = NULL; 458 struct block_device *boundary_bdev = NULL;
464 int length; 459 int length;
465 struct buffer_head map_bh; 460 struct buffer_head map_bh;
466 loff_t i_size = i_size_read(inode); 461 loff_t i_size = i_size_read(inode);
467 int ret = 0; 462 int ret = 0;
468 463
469 if (page_has_buffers(page)) { 464 if (page_has_buffers(page)) {
470 struct buffer_head *head = page_buffers(page); 465 struct buffer_head *head = page_buffers(page);
471 struct buffer_head *bh = head; 466 struct buffer_head *bh = head;
472 467
473 /* If they're all mapped and dirty, do it */ 468 /* If they're all mapped and dirty, do it */
474 page_block = 0; 469 page_block = 0;
475 do { 470 do {
476 BUG_ON(buffer_locked(bh)); 471 BUG_ON(buffer_locked(bh));
477 if (!buffer_mapped(bh)) { 472 if (!buffer_mapped(bh)) {
478 /* 473 /*
479 * unmapped dirty buffers are created by 474 * unmapped dirty buffers are created by
480 * __set_page_dirty_buffers -> mmapped data 475 * __set_page_dirty_buffers -> mmapped data
481 */ 476 */
482 if (buffer_dirty(bh)) 477 if (buffer_dirty(bh))
483 goto confused; 478 goto confused;
484 if (first_unmapped == blocks_per_page) 479 if (first_unmapped == blocks_per_page)
485 first_unmapped = page_block; 480 first_unmapped = page_block;
486 continue; 481 continue;
487 } 482 }
488 483
489 if (first_unmapped != blocks_per_page) 484 if (first_unmapped != blocks_per_page)
490 goto confused; /* hole -> non-hole */ 485 goto confused; /* hole -> non-hole */
491 486
492 if (!buffer_dirty(bh) || !buffer_uptodate(bh)) 487 if (!buffer_dirty(bh) || !buffer_uptodate(bh))
493 goto confused; 488 goto confused;
494 if (page_block) { 489 if (page_block) {
495 if (bh->b_blocknr != blocks[page_block-1] + 1) 490 if (bh->b_blocknr != blocks[page_block-1] + 1)
496 goto confused; 491 goto confused;
497 } 492 }
498 blocks[page_block++] = bh->b_blocknr; 493 blocks[page_block++] = bh->b_blocknr;
499 boundary = buffer_boundary(bh); 494 boundary = buffer_boundary(bh);
500 if (boundary) { 495 if (boundary) {
501 boundary_block = bh->b_blocknr; 496 boundary_block = bh->b_blocknr;
502 boundary_bdev = bh->b_bdev; 497 boundary_bdev = bh->b_bdev;
503 } 498 }
504 bdev = bh->b_bdev; 499 bdev = bh->b_bdev;
505 } while ((bh = bh->b_this_page) != head); 500 } while ((bh = bh->b_this_page) != head);
506 501
507 if (first_unmapped) 502 if (first_unmapped)
508 goto page_is_mapped; 503 goto page_is_mapped;
509 504
510 /* 505 /*
511 * Page has buffers, but they are all unmapped. The page was 506 * Page has buffers, but they are all unmapped. The page was
512 * created by pagein or read over a hole which was handled by 507 * created by pagein or read over a hole which was handled by
513 * block_read_full_page(). If this address_space is also 508 * block_read_full_page(). If this address_space is also
514 * using mpage_readpages then this can rarely happen. 509 * using mpage_readpages then this can rarely happen.
515 */ 510 */
516 goto confused; 511 goto confused;
517 } 512 }
518 513
519 /* 514 /*
520 * The page has no buffers: map it to disk 515 * The page has no buffers: map it to disk
521 */ 516 */
522 BUG_ON(!PageUptodate(page)); 517 BUG_ON(!PageUptodate(page));
523 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); 518 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
524 last_block = (i_size - 1) >> blkbits; 519 last_block = (i_size - 1) >> blkbits;
525 map_bh.b_page = page; 520 map_bh.b_page = page;
526 for (page_block = 0; page_block < blocks_per_page; ) { 521 for (page_block = 0; page_block < blocks_per_page; ) {
527 522
528 map_bh.b_state = 0; 523 map_bh.b_state = 0;
529 map_bh.b_size = 1 << blkbits; 524 map_bh.b_size = 1 << blkbits;
530 if (mpd->get_block(inode, block_in_file, &map_bh, 1)) 525 if (mpd->get_block(inode, block_in_file, &map_bh, 1))
531 goto confused; 526 goto confused;
532 if (buffer_new(&map_bh)) 527 if (buffer_new(&map_bh))
533 unmap_underlying_metadata(map_bh.b_bdev, 528 unmap_underlying_metadata(map_bh.b_bdev,
534 map_bh.b_blocknr); 529 map_bh.b_blocknr);
535 if (buffer_boundary(&map_bh)) { 530 if (buffer_boundary(&map_bh)) {
536 boundary_block = map_bh.b_blocknr; 531 boundary_block = map_bh.b_blocknr;
537 boundary_bdev = map_bh.b_bdev; 532 boundary_bdev = map_bh.b_bdev;
538 } 533 }
539 if (page_block) { 534 if (page_block) {
540 if (map_bh.b_blocknr != blocks[page_block-1] + 1) 535 if (map_bh.b_blocknr != blocks[page_block-1] + 1)
541 goto confused; 536 goto confused;
542 } 537 }
543 blocks[page_block++] = map_bh.b_blocknr; 538 blocks[page_block++] = map_bh.b_blocknr;
544 boundary = buffer_boundary(&map_bh); 539 boundary = buffer_boundary(&map_bh);
545 bdev = map_bh.b_bdev; 540 bdev = map_bh.b_bdev;
546 if (block_in_file == last_block) 541 if (block_in_file == last_block)
547 break; 542 break;
548 block_in_file++; 543 block_in_file++;
549 } 544 }
550 BUG_ON(page_block == 0); 545 BUG_ON(page_block == 0);
551 546
552 first_unmapped = page_block; 547 first_unmapped = page_block;
553 548
554 page_is_mapped: 549 page_is_mapped:
555 end_index = i_size >> PAGE_CACHE_SHIFT; 550 end_index = i_size >> PAGE_CACHE_SHIFT;
556 if (page->index >= end_index) { 551 if (page->index >= end_index) {
557 /* 552 /*
558 * The page straddles i_size. It must be zeroed out on each 553 * The page straddles i_size. It must be zeroed out on each
559 * and every writepage invocation because it may be mmapped. 554 * and every writepage invocation because it may be mmapped.
560 * "A file is mapped in multiples of the page size. For a file 555 * "A file is mapped in multiples of the page size. For a file
561 * that is not a multiple of the page size, the remaining memory 556 * that is not a multiple of the page size, the remaining memory
562 * is zeroed when mapped, and writes to that region are not 557 * is zeroed when mapped, and writes to that region are not
563 * written out to the file." 558 * written out to the file."
564 */ 559 */
565 unsigned offset = i_size & (PAGE_CACHE_SIZE - 1); 560 unsigned offset = i_size & (PAGE_CACHE_SIZE - 1);
566 561
567 if (page->index > end_index || !offset) 562 if (page->index > end_index || !offset)
568 goto confused; 563 goto confused;
569 zero_user_segment(page, offset, PAGE_CACHE_SIZE); 564 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
570 } 565 }
571 566
572 /* 567 /*
573 * This page will go to BIO. Do we need to send this BIO off first? 568 * This page will go to BIO. Do we need to send this BIO off first?
574 */ 569 */
575 if (bio && mpd->last_block_in_bio != blocks[0] - 1) 570 if (bio && mpd->last_block_in_bio != blocks[0] - 1)
576 bio = mpage_bio_submit(WRITE, bio); 571 bio = mpage_bio_submit(WRITE, bio);
577 572
578 alloc_new: 573 alloc_new:
579 if (bio == NULL) { 574 if (bio == NULL) {
580 bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), 575 bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
581 bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH); 576 bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH);
582 if (bio == NULL) 577 if (bio == NULL)
583 goto confused; 578 goto confused;
584 } 579 }
585 580
586 /* 581 /*
587 * Must try to add the page before marking the buffer clean or 582 * Must try to add the page before marking the buffer clean or
588 * the confused fail path above (OOM) will be very confused when 583 * the confused fail path above (OOM) will be very confused when
589 * it finds all bh marked clean (i.e. it will not write anything) 584 * it finds all bh marked clean (i.e. it will not write anything)
590 */ 585 */
591 length = first_unmapped << blkbits; 586 length = first_unmapped << blkbits;
592 if (bio_add_page(bio, page, length, 0) < length) { 587 if (bio_add_page(bio, page, length, 0) < length) {
593 bio = mpage_bio_submit(WRITE, bio); 588 bio = mpage_bio_submit(WRITE, bio);
594 goto alloc_new; 589 goto alloc_new;
595 } 590 }
596 591
597 /* 592 /*
598 * OK, we have our BIO, so we can now mark the buffers clean. Make 593 * OK, we have our BIO, so we can now mark the buffers clean. Make
599 * sure to only clean buffers which we know we'll be writing. 594 * sure to only clean buffers which we know we'll be writing.
600 */ 595 */
601 if (page_has_buffers(page)) { 596 if (page_has_buffers(page)) {
602 struct buffer_head *head = page_buffers(page); 597 struct buffer_head *head = page_buffers(page);
603 struct buffer_head *bh = head; 598 struct buffer_head *bh = head;
604 unsigned buffer_counter = 0; 599 unsigned buffer_counter = 0;
605 600
606 do { 601 do {
607 if (buffer_counter++ == first_unmapped) 602 if (buffer_counter++ == first_unmapped)
608 break; 603 break;
609 clear_buffer_dirty(bh); 604 clear_buffer_dirty(bh);
610 bh = bh->b_this_page; 605 bh = bh->b_this_page;
611 } while (bh != head); 606 } while (bh != head);
612 607
613 /* 608 /*
614 * we cannot drop the bh if the page is not uptodate 609 * we cannot drop the bh if the page is not uptodate
615 * or a concurrent readpage would fail to serialize with the bh 610 * or a concurrent readpage would fail to serialize with the bh
616 * and it would read from disk before we reach the platter. 611 * and it would read from disk before we reach the platter.
617 */ 612 */
618 if (buffer_heads_over_limit && PageUptodate(page)) 613 if (buffer_heads_over_limit && PageUptodate(page))
619 try_to_free_buffers(page); 614 try_to_free_buffers(page);
620 } 615 }
621 616
622 BUG_ON(PageWriteback(page)); 617 BUG_ON(PageWriteback(page));
623 set_page_writeback(page); 618 set_page_writeback(page);
624 unlock_page(page); 619 unlock_page(page);
625 if (boundary || (first_unmapped != blocks_per_page)) { 620 if (boundary || (first_unmapped != blocks_per_page)) {
626 bio = mpage_bio_submit(WRITE, bio); 621 bio = mpage_bio_submit(WRITE, bio);
627 if (boundary_block) { 622 if (boundary_block) {
628 write_boundary_block(boundary_bdev, 623 write_boundary_block(boundary_bdev,
629 boundary_block, 1 << blkbits); 624 boundary_block, 1 << blkbits);
630 } 625 }
631 } else { 626 } else {
632 mpd->last_block_in_bio = blocks[blocks_per_page - 1]; 627 mpd->last_block_in_bio = blocks[blocks_per_page - 1];
633 } 628 }
634 goto out; 629 goto out;
635 630
636 confused: 631 confused:
637 if (bio) 632 if (bio)
638 bio = mpage_bio_submit(WRITE, bio); 633 bio = mpage_bio_submit(WRITE, bio);
639 634
640 if (mpd->use_writepage) { 635 if (mpd->use_writepage) {
641 ret = mapping->a_ops->writepage(page, wbc); 636 ret = mapping->a_ops->writepage(page, wbc);
642 } else { 637 } else {
643 ret = -EAGAIN; 638 ret = -EAGAIN;
644 goto out; 639 goto out;
645 } 640 }
646 /* 641 /*
647 * The caller has a ref on the inode, so *mapping is stable 642 * The caller has a ref on the inode, so *mapping is stable
648 */ 643 */
649 mapping_set_error(mapping, ret); 644 mapping_set_error(mapping, ret);
650 out: 645 out:
651 mpd->bio = bio; 646 mpd->bio = bio;
652 return ret; 647 return ret;
653 } 648 }
649 EXPORT_SYMBOL(__mpage_writepage);
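With struct mpage_data, mpage_bio_submit() and __mpage_writepage() now declared in include/linux/mpage.h (see the hunk below) and __mpage_writepage() exported just above, a filesystem can drive this writepage worker from its own writeback loop instead of going through mpage_writepages(). A minimal sketch of that pattern, mirroring what mpage_writepages() itself does further down; myfs_da_writepages and myfs_get_block are placeholder names:

#include <linux/fs.h>
#include <linux/writeback.h>
#include <linux/mpage.h>

extern int myfs_get_block(struct inode *inode, sector_t iblock,
			  struct buffer_head *bh_result, int create);

static int myfs_da_writepages(struct address_space *mapping,
			      struct writeback_control *wbc)
{
	struct mpage_data mpd = {
		.bio			= NULL,
		.last_block_in_bio	= 0,
		.get_block		= myfs_get_block,
		.use_writepage		= 1,	/* fall back to ->writepage on "confused" pages */
	};
	int ret;

	ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
	if (mpd.bio)
		mpage_bio_submit(WRITE, mpd.bio);	/* flush the last partially filled BIO */
	return ret;
}

Because the caller owns the mpage_data, it is free to choose a different get_block per invocation or to add its own processing around the write_cache_pages() call.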
654 650
655 /** 651 /**
656 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them 652 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
657 * @mapping: address space structure to write 653 * @mapping: address space structure to write
658 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 654 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
659 * @get_block: the filesystem's block mapper function. 655 * @get_block: the filesystem's block mapper function.
660 * If this is NULL then use a_ops->writepage. Otherwise, go 656 * If this is NULL then use a_ops->writepage. Otherwise, go
661 * direct-to-BIO. 657 * direct-to-BIO.
662 * 658 *
663 * This is a library function, which implements the writepages() 659 * This is a library function, which implements the writepages()
664 * address_space_operation. 660 * address_space_operation.
665 * 661 *
666 * If a page is already under I/O, generic_writepages() skips it, even 662 * If a page is already under I/O, generic_writepages() skips it, even
667 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 663 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
668 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 664 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
669 * and msync() need to guarantee that all the data which was dirty at the time 665 * and msync() need to guarantee that all the data which was dirty at the time
670 * the call was made get new I/O started against them. If wbc->sync_mode is 666 * the call was made get new I/O started against them. If wbc->sync_mode is
671 * WB_SYNC_ALL then we were called for data integrity and we must wait for 667 * WB_SYNC_ALL then we were called for data integrity and we must wait for
672 * existing IO to complete. 668 * existing IO to complete.
673 */ 669 */
674 int 670 int
675 mpage_writepages(struct address_space *mapping, 671 mpage_writepages(struct address_space *mapping,
676 struct writeback_control *wbc, get_block_t get_block) 672 struct writeback_control *wbc, get_block_t get_block)
677 { 673 {
678 int ret; 674 int ret;
679 675
680 if (!get_block) 676 if (!get_block)
681 ret = generic_writepages(mapping, wbc); 677 ret = generic_writepages(mapping, wbc);
682 else { 678 else {
683 struct mpage_data mpd = { 679 struct mpage_data mpd = {
684 .bio = NULL, 680 .bio = NULL,
685 .last_block_in_bio = 0, 681 .last_block_in_bio = 0,
686 .get_block = get_block, 682 .get_block = get_block,
687 .use_writepage = 1, 683 .use_writepage = 1,
688 }; 684 };
689 685
690 ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); 686 ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
691 if (mpd.bio) 687 if (mpd.bio)
692 mpage_bio_submit(WRITE, mpd.bio); 688 mpage_bio_submit(WRITE, mpd.bio);
693 } 689 }
694 return ret; 690 return ret;
695 } 691 }
696 EXPORT_SYMBOL(mpage_writepages); 692 EXPORT_SYMBOL(mpage_writepages);
697 693
698 int mpage_writepage(struct page *page, get_block_t get_block, 694 int mpage_writepage(struct page *page, get_block_t get_block,
699 struct writeback_control *wbc) 695 struct writeback_control *wbc)
700 { 696 {
701 struct mpage_data mpd = { 697 struct mpage_data mpd = {
702 .bio = NULL, 698 .bio = NULL,
703 .last_block_in_bio = 0, 699 .last_block_in_bio = 0,
704 .get_block = get_block, 700 .get_block = get_block,
705 .use_writepage = 0, 701 .use_writepage = 0,
706 }; 702 };
707 int ret = __mpage_writepage(page, wbc, &mpd); 703 int ret = __mpage_writepage(page, wbc, &mpd);
708 if (mpd.bio) 704 if (mpd.bio)
709 mpage_bio_submit(WRITE, mpd.bio); 705 mpage_bio_submit(WRITE, mpd.bio);
710 return ret; 706 return ret;
711 } 707 }
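For completeness, the conventional way the write helpers are wired into a filesystem's address_space_operations, building on the read-side sketch after mpage_readpage() above. The myfs_* names are placeholders, and block_write_full_page() stands in for whatever the filesystem uses as its ->writepage; ->writepage is still needed even when ->writepages goes direct-to-BIO, because __mpage_writepage() falls back to it (the "confused" path) for pages it cannot send as one contiguous BIO:

#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/mpage.h>

extern int myfs_get_block(struct inode *inode, sector_t iblock,
			  struct buffer_head *bh_result, int create);

static int myfs_writepage(struct page *page, struct writeback_control *wbc)
{
	/* buffer_head based fallback for pages the mpage path cannot handle */
	return block_write_full_page(page, myfs_get_block, wbc);
}

static int myfs_writepages(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	return mpage_writepages(mapping, wbc, myfs_get_block);
}

static const struct address_space_operations myfs_aops = {
	/* .readpage / .readpages as in the earlier read-side sketch */
	.writepage	= myfs_writepage,
	.writepages	= myfs_writepages,
};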
include/linux/mpage.h
1 /* 1 /*
2 * include/linux/mpage.h 2 * include/linux/mpage.h
3 * 3 *
4 * Contains declarations related to preparing and submitting BIOs which contain 4 * Contains declarations related to preparing and submitting BIOs which contain
5 * multiple pagecache pages. 5 * multiple pagecache pages.
6 */ 6 */
7 7
8 /* 8 /*
9 * (And no, it doesn't do the #ifdef __MPAGE_H thing, and it doesn't do 9 * (And no, it doesn't do the #ifdef __MPAGE_H thing, and it doesn't do
10 * nested includes. Get it right in the .c file). 10 * nested includes. Get it right in the .c file).
11 */ 11 */
12 #ifdef CONFIG_BLOCK 12 #ifdef CONFIG_BLOCK
13 13
14 struct mpage_data {
15 struct bio *bio;
16 sector_t last_block_in_bio;
17 get_block_t *get_block;
18 unsigned use_writepage;
19 };
20
14 struct writeback_control; 21 struct writeback_control;
15 22
23 struct bio *mpage_bio_submit(int rw, struct bio *bio);
16 int mpage_readpages(struct address_space *mapping, struct list_head *pages, 24 int mpage_readpages(struct address_space *mapping, struct list_head *pages,
17 unsigned nr_pages, get_block_t get_block); 25 unsigned nr_pages, get_block_t get_block);
18 int mpage_readpage(struct page *page, get_block_t get_block); 26 int mpage_readpage(struct page *page, get_block_t get_block);
27 int __mpage_writepage(struct page *page, struct writeback_control *wbc,
28 void *data);
19 int mpage_writepages(struct address_space *mapping, 29 int mpage_writepages(struct address_space *mapping,
20 struct writeback_control *wbc, get_block_t get_block); 30 struct writeback_control *wbc, get_block_t get_block);
21 int mpage_writepage(struct page *page, get_block_t *get_block, 31 int mpage_writepage(struct page *page, get_block_t *get_block,
22 struct writeback_control *wbc); 32 struct writeback_control *wbc);
23 33
24 #endif 34 #endif
25 35