Commit e0e851cf30f1a9bd2e2a7624e9810378d6a2b072
Committed by
Linus Torvalds
1 parent
fc5cd582e9
Exists in
master
and in
20 other branches
[PATCH] reiserfs: reiserfs hang and performance fix for data=journal mode
In data=journal mode, reiserfs writepage needs to make sure not to trigger transactions while being run under PF_MEMALLOC. This patch makes sure to redirty the page instead of forcing a transaction start in this case. Also, calling filemap_fdata* in order to trigger io on the block device can cause lock inversions on the page lock. Instead, do simple batching from flush_commit_list. Signed-off-by: Chris Mason <mason@suse.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Showing 2 changed files with 21 additions and 5 deletions Inline Diff
fs/reiserfs/inode.c
1 | /* | 1 | /* |
2 | * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README | 2 | * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README |
3 | */ | 3 | */ |
4 | 4 | ||
5 | #include <linux/config.h> | 5 | #include <linux/config.h> |
6 | #include <linux/time.h> | 6 | #include <linux/time.h> |
7 | #include <linux/fs.h> | 7 | #include <linux/fs.h> |
8 | #include <linux/reiserfs_fs.h> | 8 | #include <linux/reiserfs_fs.h> |
9 | #include <linux/reiserfs_acl.h> | 9 | #include <linux/reiserfs_acl.h> |
10 | #include <linux/reiserfs_xattr.h> | 10 | #include <linux/reiserfs_xattr.h> |
11 | #include <linux/smp_lock.h> | 11 | #include <linux/smp_lock.h> |
12 | #include <linux/pagemap.h> | 12 | #include <linux/pagemap.h> |
13 | #include <linux/highmem.h> | 13 | #include <linux/highmem.h> |
14 | #include <asm/uaccess.h> | 14 | #include <asm/uaccess.h> |
15 | #include <asm/unaligned.h> | 15 | #include <asm/unaligned.h> |
16 | #include <linux/buffer_head.h> | 16 | #include <linux/buffer_head.h> |
17 | #include <linux/mpage.h> | 17 | #include <linux/mpage.h> |
18 | #include <linux/writeback.h> | 18 | #include <linux/writeback.h> |
19 | #include <linux/quotaops.h> | 19 | #include <linux/quotaops.h> |
20 | 20 | ||
21 | extern int reiserfs_default_io_size; /* default io size defined in super.c */ | 21 | extern int reiserfs_default_io_size; /* default io size defined in super.c */ |
22 | 22 | ||
23 | static int reiserfs_commit_write(struct file *f, struct page *page, | 23 | static int reiserfs_commit_write(struct file *f, struct page *page, |
24 | unsigned from, unsigned to); | 24 | unsigned from, unsigned to); |
25 | static int reiserfs_prepare_write(struct file *f, struct page *page, | 25 | static int reiserfs_prepare_write(struct file *f, struct page *page, |
26 | unsigned from, unsigned to); | 26 | unsigned from, unsigned to); |
27 | 27 | ||
28 | void reiserfs_delete_inode(struct inode *inode) | 28 | void reiserfs_delete_inode(struct inode *inode) |
29 | { | 29 | { |
30 | /* We need blocks for transaction + (user+group) quota update (possibly delete) */ | 30 | /* We need blocks for transaction + (user+group) quota update (possibly delete) */ |
31 | int jbegin_count = | 31 | int jbegin_count = |
32 | JOURNAL_PER_BALANCE_CNT * 2 + | 32 | JOURNAL_PER_BALANCE_CNT * 2 + |
33 | 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb); | 33 | 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb); |
34 | struct reiserfs_transaction_handle th; | 34 | struct reiserfs_transaction_handle th; |
35 | int err; | 35 | int err; |
36 | 36 | ||
37 | truncate_inode_pages(&inode->i_data, 0); | 37 | truncate_inode_pages(&inode->i_data, 0); |
38 | 38 | ||
39 | reiserfs_write_lock(inode->i_sb); | 39 | reiserfs_write_lock(inode->i_sb); |
40 | 40 | ||
41 | /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */ | 41 | /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */ |
42 | if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ | 42 | if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ |
43 | mutex_lock(&inode->i_mutex); | 43 | mutex_lock(&inode->i_mutex); |
44 | 44 | ||
45 | reiserfs_delete_xattrs(inode); | 45 | reiserfs_delete_xattrs(inode); |
46 | 46 | ||
47 | if (journal_begin(&th, inode->i_sb, jbegin_count)) { | 47 | if (journal_begin(&th, inode->i_sb, jbegin_count)) { |
48 | mutex_unlock(&inode->i_mutex); | 48 | mutex_unlock(&inode->i_mutex); |
49 | goto out; | 49 | goto out; |
50 | } | 50 | } |
51 | reiserfs_update_inode_transaction(inode); | 51 | reiserfs_update_inode_transaction(inode); |
52 | 52 | ||
53 | err = reiserfs_delete_object(&th, inode); | 53 | err = reiserfs_delete_object(&th, inode); |
54 | 54 | ||
55 | /* Do quota update inside a transaction for journaled quotas. We must do that | 55 | /* Do quota update inside a transaction for journaled quotas. We must do that |
56 | * after delete_object so that quota updates go into the same transaction as | 56 | * after delete_object so that quota updates go into the same transaction as |
57 | * stat data deletion */ | 57 | * stat data deletion */ |
58 | if (!err) | 58 | if (!err) |
59 | DQUOT_FREE_INODE(inode); | 59 | DQUOT_FREE_INODE(inode); |
60 | 60 | ||
61 | if (journal_end(&th, inode->i_sb, jbegin_count)) { | 61 | if (journal_end(&th, inode->i_sb, jbegin_count)) { |
62 | mutex_unlock(&inode->i_mutex); | 62 | mutex_unlock(&inode->i_mutex); |
63 | goto out; | 63 | goto out; |
64 | } | 64 | } |
65 | 65 | ||
66 | mutex_unlock(&inode->i_mutex); | 66 | mutex_unlock(&inode->i_mutex); |
67 | 67 | ||
68 | /* check return value from reiserfs_delete_object after | 68 | /* check return value from reiserfs_delete_object after |
69 | * ending the transaction | 69 | * ending the transaction |
70 | */ | 70 | */ |
71 | if (err) | 71 | if (err) |
72 | goto out; | 72 | goto out; |
73 | 73 | ||
74 | /* all items of file are deleted, so we can remove "save" link */ | 74 | /* all items of file are deleted, so we can remove "save" link */ |
75 | remove_save_link(inode, 0 /* not truncate */ ); /* we can't do anything | 75 | remove_save_link(inode, 0 /* not truncate */ ); /* we can't do anything |
76 | * about an error here */ | 76 | * about an error here */ |
77 | } else { | 77 | } else { |
78 | /* no object items are in the tree */ | 78 | /* no object items are in the tree */ |
79 | ; | 79 | ; |
80 | } | 80 | } |
81 | out: | 81 | out: |
82 | clear_inode(inode); /* note this must go after the journal_end to prevent deadlock */ | 82 | clear_inode(inode); /* note this must go after the journal_end to prevent deadlock */ |
83 | inode->i_blocks = 0; | 83 | inode->i_blocks = 0; |
84 | reiserfs_write_unlock(inode->i_sb); | 84 | reiserfs_write_unlock(inode->i_sb); |
85 | } | 85 | } |
86 | 86 | ||
87 | static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid, | 87 | static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid, |
88 | __u32 objectid, loff_t offset, int type, int length) | 88 | __u32 objectid, loff_t offset, int type, int length) |
89 | { | 89 | { |
90 | key->version = version; | 90 | key->version = version; |
91 | 91 | ||
92 | key->on_disk_key.k_dir_id = dirid; | 92 | key->on_disk_key.k_dir_id = dirid; |
93 | key->on_disk_key.k_objectid = objectid; | 93 | key->on_disk_key.k_objectid = objectid; |
94 | set_cpu_key_k_offset(key, offset); | 94 | set_cpu_key_k_offset(key, offset); |
95 | set_cpu_key_k_type(key, type); | 95 | set_cpu_key_k_type(key, type); |
96 | key->key_length = length; | 96 | key->key_length = length; |
97 | } | 97 | } |
98 | 98 | ||
99 | /* take base of inode_key (it comes from inode always) (dirid, objectid) and version from an inode, set | 99 | /* take base of inode_key (it comes from inode always) (dirid, objectid) and version from an inode, set |
100 | offset and type of key */ | 100 | offset and type of key */ |
101 | void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset, | 101 | void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset, |
102 | int type, int length) | 102 | int type, int length) |
103 | { | 103 | { |
104 | _make_cpu_key(key, get_inode_item_key_version(inode), | 104 | _make_cpu_key(key, get_inode_item_key_version(inode), |
105 | le32_to_cpu(INODE_PKEY(inode)->k_dir_id), | 105 | le32_to_cpu(INODE_PKEY(inode)->k_dir_id), |
106 | le32_to_cpu(INODE_PKEY(inode)->k_objectid), offset, type, | 106 | le32_to_cpu(INODE_PKEY(inode)->k_objectid), offset, type, |
107 | length); | 107 | length); |
108 | } | 108 | } |
109 | 109 | ||
110 | // | 110 | // |
111 | // when key is 0, do not set version and short key | 111 | // when key is 0, do not set version and short key |
112 | // | 112 | // |
113 | inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key, | 113 | inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key, |
114 | int version, | 114 | int version, |
115 | loff_t offset, int type, int length, | 115 | loff_t offset, int type, int length, |
116 | int entry_count /*or ih_free_space */ ) | 116 | int entry_count /*or ih_free_space */ ) |
117 | { | 117 | { |
118 | if (key) { | 118 | if (key) { |
119 | ih->ih_key.k_dir_id = cpu_to_le32(key->on_disk_key.k_dir_id); | 119 | ih->ih_key.k_dir_id = cpu_to_le32(key->on_disk_key.k_dir_id); |
120 | ih->ih_key.k_objectid = | 120 | ih->ih_key.k_objectid = |
121 | cpu_to_le32(key->on_disk_key.k_objectid); | 121 | cpu_to_le32(key->on_disk_key.k_objectid); |
122 | } | 122 | } |
123 | put_ih_version(ih, version); | 123 | put_ih_version(ih, version); |
124 | set_le_ih_k_offset(ih, offset); | 124 | set_le_ih_k_offset(ih, offset); |
125 | set_le_ih_k_type(ih, type); | 125 | set_le_ih_k_type(ih, type); |
126 | put_ih_item_len(ih, length); | 126 | put_ih_item_len(ih, length); |
127 | /* set_ih_free_space (ih, 0); */ | 127 | /* set_ih_free_space (ih, 0); */ |
128 | // for directory items it is entry count, for directs and stat | 128 | // for directory items it is entry count, for directs and stat |
129 | // datas - 0xffff, for indirects - 0 | 129 | // datas - 0xffff, for indirects - 0 |
130 | put_ih_entry_count(ih, entry_count); | 130 | put_ih_entry_count(ih, entry_count); |
131 | } | 131 | } |
132 | 132 | ||
133 | // | 133 | // |
134 | // FIXME: we might cache recently accessed indirect item | 134 | // FIXME: we might cache recently accessed indirect item |
135 | 135 | ||
136 | // Ugh. Not too eager for that.... | 136 | // Ugh. Not too eager for that.... |
137 | // I cut the code until such time as I see a convincing argument (benchmark). | 137 | // I cut the code until such time as I see a convincing argument (benchmark). |
138 | // I don't want a bloated inode struct..., and I don't like code complexity.... | 138 | // I don't want a bloated inode struct..., and I don't like code complexity.... |
139 | 139 | ||
140 | /* cutting the code is fine, since it really isn't in use yet and is easy | 140 | /* cutting the code is fine, since it really isn't in use yet and is easy |
141 | ** to add back in. But, Vladimir has a really good idea here. Think | 141 | ** to add back in. But, Vladimir has a really good idea here. Think |
142 | ** about what happens for reading a file. For each page, | 142 | ** about what happens for reading a file. For each page, |
143 | ** The VFS layer calls reiserfs_readpage, who searches the tree to find | 143 | ** The VFS layer calls reiserfs_readpage, who searches the tree to find |
144 | ** an indirect item. This indirect item has X number of pointers, where | 144 | ** an indirect item. This indirect item has X number of pointers, where |
145 | ** X is a big number if we've done the block allocation right. But, | 145 | ** X is a big number if we've done the block allocation right. But, |
146 | ** we only use one or two of these pointers during each call to readpage, | 146 | ** we only use one or two of these pointers during each call to readpage, |
147 | ** needlessly researching again later on. | 147 | ** needlessly researching again later on. |
148 | ** | 148 | ** |
149 | ** The size of the cache could be dynamic based on the size of the file. | 149 | ** The size of the cache could be dynamic based on the size of the file. |
150 | ** | 150 | ** |
151 | ** I'd also like to see us cache the location the stat data item, since | 151 | ** I'd also like to see us cache the location the stat data item, since |
152 | ** we are needlessly researching for that frequently. | 152 | ** we are needlessly researching for that frequently. |
153 | ** | 153 | ** |
154 | ** --chris | 154 | ** --chris |
155 | */ | 155 | */ |
156 | 156 | ||
157 | /* If this page has a file tail in it, and | 157 | /* If this page has a file tail in it, and |
158 | ** it was read in by get_block_create_0, the page data is valid, | 158 | ** it was read in by get_block_create_0, the page data is valid, |
159 | ** but tail is still sitting in a direct item, and we can't write to | 159 | ** but tail is still sitting in a direct item, and we can't write to |
160 | ** it. So, look through this page, and check all the mapped buffers | 160 | ** it. So, look through this page, and check all the mapped buffers |
161 | ** to make sure they have valid block numbers. Any that don't need | 161 | ** to make sure they have valid block numbers. Any that don't need |
162 | ** to be unmapped, so that block_prepare_write will correctly call | 162 | ** to be unmapped, so that block_prepare_write will correctly call |
163 | ** reiserfs_get_block to convert the tail into an unformatted node | 163 | ** reiserfs_get_block to convert the tail into an unformatted node |
164 | */ | 164 | */ |
165 | static inline void fix_tail_page_for_writing(struct page *page) | 165 | static inline void fix_tail_page_for_writing(struct page *page) |
166 | { | 166 | { |
167 | struct buffer_head *head, *next, *bh; | 167 | struct buffer_head *head, *next, *bh; |
168 | 168 | ||
169 | if (page && page_has_buffers(page)) { | 169 | if (page && page_has_buffers(page)) { |
170 | head = page_buffers(page); | 170 | head = page_buffers(page); |
171 | bh = head; | 171 | bh = head; |
172 | do { | 172 | do { |
173 | next = bh->b_this_page; | 173 | next = bh->b_this_page; |
174 | if (buffer_mapped(bh) && bh->b_blocknr == 0) { | 174 | if (buffer_mapped(bh) && bh->b_blocknr == 0) { |
175 | reiserfs_unmap_buffer(bh); | 175 | reiserfs_unmap_buffer(bh); |
176 | } | 176 | } |
177 | bh = next; | 177 | bh = next; |
178 | } while (bh != head); | 178 | } while (bh != head); |
179 | } | 179 | } |
180 | } | 180 | } |
181 | 181 | ||
182 | /* reiserfs_get_block does not need to allocate a block only if it has been | 182 | /* reiserfs_get_block does not need to allocate a block only if it has been |
183 | done already or non-hole position has been found in the indirect item */ | 183 | done already or non-hole position has been found in the indirect item */ |
184 | static inline int allocation_needed(int retval, b_blocknr_t allocated, | 184 | static inline int allocation_needed(int retval, b_blocknr_t allocated, |
185 | struct item_head *ih, | 185 | struct item_head *ih, |
186 | __le32 * item, int pos_in_item) | 186 | __le32 * item, int pos_in_item) |
187 | { | 187 | { |
188 | if (allocated) | 188 | if (allocated) |
189 | return 0; | 189 | return 0; |
190 | if (retval == POSITION_FOUND && is_indirect_le_ih(ih) && | 190 | if (retval == POSITION_FOUND && is_indirect_le_ih(ih) && |
191 | get_block_num(item, pos_in_item)) | 191 | get_block_num(item, pos_in_item)) |
192 | return 0; | 192 | return 0; |
193 | return 1; | 193 | return 1; |
194 | } | 194 | } |
195 | 195 | ||
196 | static inline int indirect_item_found(int retval, struct item_head *ih) | 196 | static inline int indirect_item_found(int retval, struct item_head *ih) |
197 | { | 197 | { |
198 | return (retval == POSITION_FOUND) && is_indirect_le_ih(ih); | 198 | return (retval == POSITION_FOUND) && is_indirect_le_ih(ih); |
199 | } | 199 | } |
200 | 200 | ||
201 | static inline void set_block_dev_mapped(struct buffer_head *bh, | 201 | static inline void set_block_dev_mapped(struct buffer_head *bh, |
202 | b_blocknr_t block, struct inode *inode) | 202 | b_blocknr_t block, struct inode *inode) |
203 | { | 203 | { |
204 | map_bh(bh, inode->i_sb, block); | 204 | map_bh(bh, inode->i_sb, block); |
205 | } | 205 | } |
206 | 206 | ||
207 | // | 207 | // |
208 | // files which were created in the earlier version can not be longer, | 208 | // files which were created in the earlier version can not be longer, |
209 | // than 2 gb | 209 | // than 2 gb |
210 | // | 210 | // |
211 | static int file_capable(struct inode *inode, long block) | 211 | static int file_capable(struct inode *inode, long block) |
212 | { | 212 | { |
213 | if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 || // it is new file. | 213 | if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 || // it is new file. |
214 | block < (1 << (31 - inode->i_sb->s_blocksize_bits))) // old file, but 'block' is inside of 2gb | 214 | block < (1 << (31 - inode->i_sb->s_blocksize_bits))) // old file, but 'block' is inside of 2gb |
215 | return 1; | 215 | return 1; |
216 | 216 | ||
217 | return 0; | 217 | return 0; |
218 | } | 218 | } |
219 | 219 | ||
220 | /*static*/ int restart_transaction(struct reiserfs_transaction_handle *th, | 220 | /*static*/ int restart_transaction(struct reiserfs_transaction_handle *th, |
221 | struct inode *inode, struct path *path) | 221 | struct inode *inode, struct path *path) |
222 | { | 222 | { |
223 | struct super_block *s = th->t_super; | 223 | struct super_block *s = th->t_super; |
224 | int len = th->t_blocks_allocated; | 224 | int len = th->t_blocks_allocated; |
225 | int err; | 225 | int err; |
226 | 226 | ||
227 | BUG_ON(!th->t_trans_id); | 227 | BUG_ON(!th->t_trans_id); |
228 | BUG_ON(!th->t_refcount); | 228 | BUG_ON(!th->t_refcount); |
229 | 229 | ||
230 | /* we cannot restart while nested */ | 230 | /* we cannot restart while nested */ |
231 | if (th->t_refcount > 1) { | 231 | if (th->t_refcount > 1) { |
232 | return 0; | 232 | return 0; |
233 | } | 233 | } |
234 | pathrelse(path); | 234 | pathrelse(path); |
235 | reiserfs_update_sd(th, inode); | 235 | reiserfs_update_sd(th, inode); |
236 | err = journal_end(th, s, len); | 236 | err = journal_end(th, s, len); |
237 | if (!err) { | 237 | if (!err) { |
238 | err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6); | 238 | err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6); |
239 | if (!err) | 239 | if (!err) |
240 | reiserfs_update_inode_transaction(inode); | 240 | reiserfs_update_inode_transaction(inode); |
241 | } | 241 | } |
242 | return err; | 242 | return err; |
243 | } | 243 | } |
244 | 244 | ||
245 | // it is called by get_block when create == 0. Returns block number | 245 | // it is called by get_block when create == 0. Returns block number |
246 | // for 'block'-th logical block of file. When it hits direct item it | 246 | // for 'block'-th logical block of file. When it hits direct item it |
247 | // returns 0 (being called from bmap) or read direct item into piece | 247 | // returns 0 (being called from bmap) or read direct item into piece |
248 | // of page (bh_result) | 248 | // of page (bh_result) |
249 | 249 | ||
250 | // Please improve the english/clarity in the comment above, as it is | 250 | // Please improve the english/clarity in the comment above, as it is |
251 | // hard to understand. | 251 | // hard to understand. |
252 | 252 | ||
253 | static int _get_block_create_0(struct inode *inode, long block, | 253 | static int _get_block_create_0(struct inode *inode, long block, |
254 | struct buffer_head *bh_result, int args) | 254 | struct buffer_head *bh_result, int args) |
255 | { | 255 | { |
256 | INITIALIZE_PATH(path); | 256 | INITIALIZE_PATH(path); |
257 | struct cpu_key key; | 257 | struct cpu_key key; |
258 | struct buffer_head *bh; | 258 | struct buffer_head *bh; |
259 | struct item_head *ih, tmp_ih; | 259 | struct item_head *ih, tmp_ih; |
260 | int fs_gen; | 260 | int fs_gen; |
261 | int blocknr; | 261 | int blocknr; |
262 | char *p = NULL; | 262 | char *p = NULL; |
263 | int chars; | 263 | int chars; |
264 | int ret; | 264 | int ret; |
265 | int result; | 265 | int result; |
266 | int done = 0; | 266 | int done = 0; |
267 | unsigned long offset; | 267 | unsigned long offset; |
268 | 268 | ||
269 | // prepare the key to look for the 'block'-th block of file | 269 | // prepare the key to look for the 'block'-th block of file |
270 | make_cpu_key(&key, inode, | 270 | make_cpu_key(&key, inode, |
271 | (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY, | 271 | (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY, |
272 | 3); | 272 | 3); |
273 | 273 | ||
274 | research: | 274 | research: |
275 | result = search_for_position_by_key(inode->i_sb, &key, &path); | 275 | result = search_for_position_by_key(inode->i_sb, &key, &path); |
276 | if (result != POSITION_FOUND) { | 276 | if (result != POSITION_FOUND) { |
277 | pathrelse(&path); | 277 | pathrelse(&path); |
278 | if (p) | 278 | if (p) |
279 | kunmap(bh_result->b_page); | 279 | kunmap(bh_result->b_page); |
280 | if (result == IO_ERROR) | 280 | if (result == IO_ERROR) |
281 | return -EIO; | 281 | return -EIO; |
282 | // We do not return -ENOENT if there is a hole but page is uptodate, because it means | 282 | // We do not return -ENOENT if there is a hole but page is uptodate, because it means |
283 | // That there is some MMAPED data associated with it that is yet to be written to disk. | 283 | // That there is some MMAPED data associated with it that is yet to be written to disk. |
284 | if ((args & GET_BLOCK_NO_HOLE) | 284 | if ((args & GET_BLOCK_NO_HOLE) |
285 | && !PageUptodate(bh_result->b_page)) { | 285 | && !PageUptodate(bh_result->b_page)) { |
286 | return -ENOENT; | 286 | return -ENOENT; |
287 | } | 287 | } |
288 | return 0; | 288 | return 0; |
289 | } | 289 | } |
290 | // | 290 | // |
291 | bh = get_last_bh(&path); | 291 | bh = get_last_bh(&path); |
292 | ih = get_ih(&path); | 292 | ih = get_ih(&path); |
293 | if (is_indirect_le_ih(ih)) { | 293 | if (is_indirect_le_ih(ih)) { |
294 | __le32 *ind_item = (__le32 *) B_I_PITEM(bh, ih); | 294 | __le32 *ind_item = (__le32 *) B_I_PITEM(bh, ih); |
295 | 295 | ||
296 | /* FIXME: here we could cache indirect item or part of it in | 296 | /* FIXME: here we could cache indirect item or part of it in |
297 | the inode to avoid search_by_key in case of subsequent | 297 | the inode to avoid search_by_key in case of subsequent |
298 | access to file */ | 298 | access to file */ |
299 | blocknr = get_block_num(ind_item, path.pos_in_item); | 299 | blocknr = get_block_num(ind_item, path.pos_in_item); |
300 | ret = 0; | 300 | ret = 0; |
301 | if (blocknr) { | 301 | if (blocknr) { |
302 | map_bh(bh_result, inode->i_sb, blocknr); | 302 | map_bh(bh_result, inode->i_sb, blocknr); |
303 | if (path.pos_in_item == | 303 | if (path.pos_in_item == |
304 | ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) { | 304 | ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) { |
305 | set_buffer_boundary(bh_result); | 305 | set_buffer_boundary(bh_result); |
306 | } | 306 | } |
307 | } else | 307 | } else |
308 | // We do not return -ENOENT if there is a hole but page is uptodate, because it means | 308 | // We do not return -ENOENT if there is a hole but page is uptodate, because it means |
309 | // That there is some MMAPED data associated with it that is yet to be written to disk. | 309 | // That there is some MMAPED data associated with it that is yet to be written to disk. |
310 | if ((args & GET_BLOCK_NO_HOLE) | 310 | if ((args & GET_BLOCK_NO_HOLE) |
311 | && !PageUptodate(bh_result->b_page)) { | 311 | && !PageUptodate(bh_result->b_page)) { |
312 | ret = -ENOENT; | 312 | ret = -ENOENT; |
313 | } | 313 | } |
314 | 314 | ||
315 | pathrelse(&path); | 315 | pathrelse(&path); |
316 | if (p) | 316 | if (p) |
317 | kunmap(bh_result->b_page); | 317 | kunmap(bh_result->b_page); |
318 | return ret; | 318 | return ret; |
319 | } | 319 | } |
320 | // requested data are in direct item(s) | 320 | // requested data are in direct item(s) |
321 | if (!(args & GET_BLOCK_READ_DIRECT)) { | 321 | if (!(args & GET_BLOCK_READ_DIRECT)) { |
322 | // we are called by bmap. FIXME: we can not map block of file | 322 | // we are called by bmap. FIXME: we can not map block of file |
323 | // when it is stored in direct item(s) | 323 | // when it is stored in direct item(s) |
324 | pathrelse(&path); | 324 | pathrelse(&path); |
325 | if (p) | 325 | if (p) |
326 | kunmap(bh_result->b_page); | 326 | kunmap(bh_result->b_page); |
327 | return -ENOENT; | 327 | return -ENOENT; |
328 | } | 328 | } |
329 | 329 | ||
330 | /* if we've got a direct item, and the buffer or page was uptodate, | 330 | /* if we've got a direct item, and the buffer or page was uptodate, |
331 | ** we don't want to pull data off disk again. skip to the | 331 | ** we don't want to pull data off disk again. skip to the |
332 | ** end, where we map the buffer and return | 332 | ** end, where we map the buffer and return |
333 | */ | 333 | */ |
334 | if (buffer_uptodate(bh_result)) { | 334 | if (buffer_uptodate(bh_result)) { |
335 | goto finished; | 335 | goto finished; |
336 | } else | 336 | } else |
337 | /* | 337 | /* |
338 | ** grab_tail_page can trigger calls to reiserfs_get_block on up to date | 338 | ** grab_tail_page can trigger calls to reiserfs_get_block on up to date |
339 | ** pages without any buffers. If the page is up to date, we don't want | 339 | ** pages without any buffers. If the page is up to date, we don't want |
340 | ** read old data off disk. Set the up to date bit on the buffer instead | 340 | ** read old data off disk. Set the up to date bit on the buffer instead |
341 | ** and jump to the end | 341 | ** and jump to the end |
342 | */ | 342 | */ |
343 | if (!bh_result->b_page || PageUptodate(bh_result->b_page)) { | 343 | if (!bh_result->b_page || PageUptodate(bh_result->b_page)) { |
344 | set_buffer_uptodate(bh_result); | 344 | set_buffer_uptodate(bh_result); |
345 | goto finished; | 345 | goto finished; |
346 | } | 346 | } |
347 | // read file tail into part of page | 347 | // read file tail into part of page |
348 | offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1); | 348 | offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1); |
349 | fs_gen = get_generation(inode->i_sb); | 349 | fs_gen = get_generation(inode->i_sb); |
350 | copy_item_head(&tmp_ih, ih); | 350 | copy_item_head(&tmp_ih, ih); |
351 | 351 | ||
352 | /* we only want to kmap if we are reading the tail into the page. | 352 | /* we only want to kmap if we are reading the tail into the page. |
353 | ** this is not the common case, so we don't kmap until we are | 353 | ** this is not the common case, so we don't kmap until we are |
354 | ** sure we need to. But, this means the item might move if | 354 | ** sure we need to. But, this means the item might move if |
355 | ** kmap schedules | 355 | ** kmap schedules |
356 | */ | 356 | */ |
357 | if (!p) { | 357 | if (!p) { |
358 | p = (char *)kmap(bh_result->b_page); | 358 | p = (char *)kmap(bh_result->b_page); |
359 | if (fs_changed(fs_gen, inode->i_sb) | 359 | if (fs_changed(fs_gen, inode->i_sb) |
360 | && item_moved(&tmp_ih, &path)) { | 360 | && item_moved(&tmp_ih, &path)) { |
361 | goto research; | 361 | goto research; |
362 | } | 362 | } |
363 | } | 363 | } |
364 | p += offset; | 364 | p += offset; |
365 | memset(p, 0, inode->i_sb->s_blocksize); | 365 | memset(p, 0, inode->i_sb->s_blocksize); |
366 | do { | 366 | do { |
367 | if (!is_direct_le_ih(ih)) { | 367 | if (!is_direct_le_ih(ih)) { |
368 | BUG(); | 368 | BUG(); |
369 | } | 369 | } |
370 | /* make sure we don't read more bytes than actually exist in | 370 | /* make sure we don't read more bytes than actually exist in |
371 | ** the file. This can happen in odd cases where i_size isn't | 371 | ** the file. This can happen in odd cases where i_size isn't |
372 | ** correct, and when direct item padding results in a few | 372 | ** correct, and when direct item padding results in a few |
373 | ** extra bytes at the end of the direct item | 373 | ** extra bytes at the end of the direct item |
374 | */ | 374 | */ |
375 | if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size) | 375 | if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size) |
376 | break; | 376 | break; |
377 | if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) { | 377 | if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) { |
378 | chars = | 378 | chars = |
379 | inode->i_size - (le_ih_k_offset(ih) - 1) - | 379 | inode->i_size - (le_ih_k_offset(ih) - 1) - |
380 | path.pos_in_item; | 380 | path.pos_in_item; |
381 | done = 1; | 381 | done = 1; |
382 | } else { | 382 | } else { |
383 | chars = ih_item_len(ih) - path.pos_in_item; | 383 | chars = ih_item_len(ih) - path.pos_in_item; |
384 | } | 384 | } |
385 | memcpy(p, B_I_PITEM(bh, ih) + path.pos_in_item, chars); | 385 | memcpy(p, B_I_PITEM(bh, ih) + path.pos_in_item, chars); |
386 | 386 | ||
387 | if (done) | 387 | if (done) |
388 | break; | 388 | break; |
389 | 389 | ||
390 | p += chars; | 390 | p += chars; |
391 | 391 | ||
392 | if (PATH_LAST_POSITION(&path) != (B_NR_ITEMS(bh) - 1)) | 392 | if (PATH_LAST_POSITION(&path) != (B_NR_ITEMS(bh) - 1)) |
393 | // we done, if read direct item is not the last item of | 393 | // we done, if read direct item is not the last item of |
394 | // node FIXME: we could try to check right delimiting key | 394 | // node FIXME: we could try to check right delimiting key |
395 | // to see whether direct item continues in the right | 395 | // to see whether direct item continues in the right |
396 | // neighbor or rely on i_size | 396 | // neighbor or rely on i_size |
397 | break; | 397 | break; |
398 | 398 | ||
399 | // update key to look for the next piece | 399 | // update key to look for the next piece |
400 | set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + chars); | 400 | set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + chars); |
401 | result = search_for_position_by_key(inode->i_sb, &key, &path); | 401 | result = search_for_position_by_key(inode->i_sb, &key, &path); |
402 | if (result != POSITION_FOUND) | 402 | if (result != POSITION_FOUND) |
403 | // i/o error most likely | 403 | // i/o error most likely |
404 | break; | 404 | break; |
405 | bh = get_last_bh(&path); | 405 | bh = get_last_bh(&path); |
406 | ih = get_ih(&path); | 406 | ih = get_ih(&path); |
407 | } while (1); | 407 | } while (1); |
408 | 408 | ||
409 | flush_dcache_page(bh_result->b_page); | 409 | flush_dcache_page(bh_result->b_page); |
410 | kunmap(bh_result->b_page); | 410 | kunmap(bh_result->b_page); |
411 | 411 | ||
412 | finished: | 412 | finished: |
413 | pathrelse(&path); | 413 | pathrelse(&path); |
414 | 414 | ||
415 | if (result == IO_ERROR) | 415 | if (result == IO_ERROR) |
416 | return -EIO; | 416 | return -EIO; |
417 | 417 | ||
418 | /* this buffer has valid data, but isn't valid for io. mapping it to | 418 | /* this buffer has valid data, but isn't valid for io. mapping it to |
419 | * block #0 tells the rest of reiserfs it just has a tail in it | 419 | * block #0 tells the rest of reiserfs it just has a tail in it |
420 | */ | 420 | */ |
421 | map_bh(bh_result, inode->i_sb, 0); | 421 | map_bh(bh_result, inode->i_sb, 0); |
422 | set_buffer_uptodate(bh_result); | 422 | set_buffer_uptodate(bh_result); |
423 | return 0; | 423 | return 0; |
424 | } | 424 | } |
425 | 425 | ||
// Map a logical file block to a device block for bmap()-style callers.
// This is called to create a file map, so _get_block_create_0 is told NOT
// to read direct items (a tail stored inline in the tree has no device
// block to report).
static int reiserfs_bmap(struct inode *inode, sector_t block,
			 struct buffer_head *bh_result, int create)
{
	/* refuse blocks beyond what this inode's item format can address */
	if (!file_capable(inode, block))
		return -EFBIG;

	reiserfs_write_lock(inode->i_sb);
	/* do not read the direct item; 'create' is deliberately ignored —
	 * bmap must never allocate */
	_get_block_create_0(inode, block, bh_result, 0);
	reiserfs_write_unlock(inode->i_sb);
	return 0;
}
440 | 440 | ||
/* special version of get_block that is only used by grab_tail_page right
** now. It is sent to block_prepare_write, and when you try to get a
** block past the end of the file (or a block from a hole) it returns
** -ENOENT instead of a valid buffer. block_prepare_write expects to
** be able to do i/o on the buffers returned, unless an error value
** is also returned.
**
** So, this allows block_prepare_write to be used for reading a single block
** in a page. Where it does not produce a valid page for holes, or past the
** end of the file. This turns out to be exactly what we need for reading
** tails for conversion.
**
** The point of the wrapper is forcing a certain value for create, even
** though the VFS layer is calling this function with create==1. If you
** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block,
** don't use this function.
*/
static int reiserfs_get_block_create_0(struct inode *inode, sector_t block,
				       struct buffer_head *bh_result,
				       int create)
{
	/* force GET_BLOCK_NO_HOLE regardless of the caller's create flag */
	return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE);
}
464 | 464 | ||
465 | /* This is special helper for reiserfs_get_block in case we are executing | 465 | /* This is special helper for reiserfs_get_block in case we are executing |
466 | direct_IO request. */ | 466 | direct_IO request. */ |
467 | static int reiserfs_get_blocks_direct_io(struct inode *inode, | 467 | static int reiserfs_get_blocks_direct_io(struct inode *inode, |
468 | sector_t iblock, | 468 | sector_t iblock, |
469 | unsigned long max_blocks, | 469 | unsigned long max_blocks, |
470 | struct buffer_head *bh_result, | 470 | struct buffer_head *bh_result, |
471 | int create) | 471 | int create) |
472 | { | 472 | { |
473 | int ret; | 473 | int ret; |
474 | 474 | ||
475 | bh_result->b_page = NULL; | 475 | bh_result->b_page = NULL; |
476 | 476 | ||
477 | /* We set the b_size before reiserfs_get_block call since it is | 477 | /* We set the b_size before reiserfs_get_block call since it is |
478 | referenced in convert_tail_for_hole() that may be called from | 478 | referenced in convert_tail_for_hole() that may be called from |
479 | reiserfs_get_block() */ | 479 | reiserfs_get_block() */ |
480 | bh_result->b_size = (1 << inode->i_blkbits); | 480 | bh_result->b_size = (1 << inode->i_blkbits); |
481 | 481 | ||
482 | ret = reiserfs_get_block(inode, iblock, bh_result, | 482 | ret = reiserfs_get_block(inode, iblock, bh_result, |
483 | create | GET_BLOCK_NO_DANGLE); | 483 | create | GET_BLOCK_NO_DANGLE); |
484 | if (ret) | 484 | if (ret) |
485 | goto out; | 485 | goto out; |
486 | 486 | ||
487 | /* don't allow direct io onto tail pages */ | 487 | /* don't allow direct io onto tail pages */ |
488 | if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { | 488 | if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { |
489 | /* make sure future calls to the direct io funcs for this offset | 489 | /* make sure future calls to the direct io funcs for this offset |
490 | ** in the file fail by unmapping the buffer | 490 | ** in the file fail by unmapping the buffer |
491 | */ | 491 | */ |
492 | clear_buffer_mapped(bh_result); | 492 | clear_buffer_mapped(bh_result); |
493 | ret = -EINVAL; | 493 | ret = -EINVAL; |
494 | } | 494 | } |
495 | /* Possible unpacked tail. Flush the data before pages have | 495 | /* Possible unpacked tail. Flush the data before pages have |
496 | disappeared */ | 496 | disappeared */ |
497 | if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) { | 497 | if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) { |
498 | int err; | 498 | int err; |
499 | lock_kernel(); | 499 | lock_kernel(); |
500 | err = reiserfs_commit_for_inode(inode); | 500 | err = reiserfs_commit_for_inode(inode); |
501 | REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; | 501 | REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; |
502 | unlock_kernel(); | 502 | unlock_kernel(); |
503 | if (err < 0) | 503 | if (err < 0) |
504 | ret = err; | 504 | ret = err; |
505 | } | 505 | } |
506 | out: | 506 | out: |
507 | return ret; | 507 | return ret; |
508 | } | 508 | } |
509 | 509 | ||
/*
** helper function for when reiserfs_get_block is called for a hole
** but the file tail is still in a direct item
** bh_result is the buffer head for the hole
** tail_offset is the offset of the start of the tail in the file
**
** This calls prepare_write, which will start a new transaction
** you should not be in a transaction, or have any paths held when you
** call this.
**
** Returns 0 on success, -EIO for a misaligned tail_offset, -ENOMEM if
** the tail page cannot be grabbed, or the error from prepare/commit_write.
*/
static int convert_tail_for_hole(struct inode *inode,
				 struct buffer_head *bh_result,
				 loff_t tail_offset)
{
	unsigned long index;
	unsigned long tail_end;
	unsigned long tail_start;
	struct page *tail_page;
	struct page *hole_page = bh_result->b_page;
	int retval = 0;

	/* reiserfs file offsets are 1-based, so a valid tail start must sit
	** exactly one past a block boundary */
	if ((tail_offset & (bh_result->b_size - 1)) != 1)
		return -EIO;

	/* always try to read until the end of the block */
	tail_start = tail_offset & (PAGE_CACHE_SIZE - 1);
	tail_end = (tail_start | (bh_result->b_size - 1)) + 1;

	index = tail_offset >> PAGE_CACHE_SHIFT;
	/* hole_page can be zero in case of direct_io, we are sure
	   that we cannot get here if we write with O_DIRECT into
	   tail page */
	if (!hole_page || index != hole_page->index) {
		/* the tail lives on a different page than the hole; grab
		** (and lock) it from the page cache */
		tail_page = grab_cache_page(inode->i_mapping, index);
		retval = -ENOMEM;
		if (!tail_page) {
			goto out;
		}
	} else {
		/* hole and tail share a page, which the caller already holds */
		tail_page = hole_page;
	}

	/* we don't have to make sure the conversion did not happen while
	** we were locking the page because anyone that could convert
	** must first take i_mutex.
	**
	** We must fix the tail page for writing because it might have buffers
	** that are mapped, but have a block number of 0. This indicates tail
	** data that has been read directly into the page, and block_prepare_write
	** won't trigger a get_block in this case.
	*/
	fix_tail_page_for_writing(tail_page);
	retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end);
	if (retval)
		goto unlock;

	/* tail conversion might change the data in the page */
	flush_dcache_page(tail_page);

	retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end);

      unlock:
	/* only release the page if we grabbed it ourselves above; the
	** caller owns hole_page's lock and reference */
	if (tail_page != hole_page) {
		unlock_page(tail_page);
		page_cache_release(tail_page);
	}
      out:
	return retval;
}
579 | 579 | ||
580 | static inline int _allocate_block(struct reiserfs_transaction_handle *th, | 580 | static inline int _allocate_block(struct reiserfs_transaction_handle *th, |
581 | long block, | 581 | long block, |
582 | struct inode *inode, | 582 | struct inode *inode, |
583 | b_blocknr_t * allocated_block_nr, | 583 | b_blocknr_t * allocated_block_nr, |
584 | struct path *path, int flags) | 584 | struct path *path, int flags) |
585 | { | 585 | { |
586 | BUG_ON(!th->t_trans_id); | 586 | BUG_ON(!th->t_trans_id); |
587 | 587 | ||
588 | #ifdef REISERFS_PREALLOCATE | 588 | #ifdef REISERFS_PREALLOCATE |
589 | if (!(flags & GET_BLOCK_NO_IMUX)) { | 589 | if (!(flags & GET_BLOCK_NO_IMUX)) { |
590 | return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr, | 590 | return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr, |
591 | path, block); | 591 | path, block); |
592 | } | 592 | } |
593 | #endif | 593 | #endif |
594 | return reiserfs_new_unf_blocknrs(th, inode, allocated_block_nr, path, | 594 | return reiserfs_new_unf_blocknrs(th, inode, allocated_block_nr, path, |
595 | block); | 595 | block); |
596 | } | 596 | } |
597 | 597 | ||
598 | int reiserfs_get_block(struct inode *inode, sector_t block, | 598 | int reiserfs_get_block(struct inode *inode, sector_t block, |
599 | struct buffer_head *bh_result, int create) | 599 | struct buffer_head *bh_result, int create) |
600 | { | 600 | { |
601 | int repeat, retval = 0; | 601 | int repeat, retval = 0; |
602 | b_blocknr_t allocated_block_nr = 0; // b_blocknr_t is (unsigned) 32 bit int | 602 | b_blocknr_t allocated_block_nr = 0; // b_blocknr_t is (unsigned) 32 bit int |
603 | INITIALIZE_PATH(path); | 603 | INITIALIZE_PATH(path); |
604 | int pos_in_item; | 604 | int pos_in_item; |
605 | struct cpu_key key; | 605 | struct cpu_key key; |
606 | struct buffer_head *bh, *unbh = NULL; | 606 | struct buffer_head *bh, *unbh = NULL; |
607 | struct item_head *ih, tmp_ih; | 607 | struct item_head *ih, tmp_ih; |
608 | __le32 *item; | 608 | __le32 *item; |
609 | int done; | 609 | int done; |
610 | int fs_gen; | 610 | int fs_gen; |
611 | struct reiserfs_transaction_handle *th = NULL; | 611 | struct reiserfs_transaction_handle *th = NULL; |
612 | /* space reserved in transaction batch: | 612 | /* space reserved in transaction batch: |
613 | . 3 balancings in direct->indirect conversion | 613 | . 3 balancings in direct->indirect conversion |
614 | . 1 block involved into reiserfs_update_sd() | 614 | . 1 block involved into reiserfs_update_sd() |
615 | XXX in practically impossible worst case direct2indirect() | 615 | XXX in practically impossible worst case direct2indirect() |
616 | can incur (much) more than 3 balancings. | 616 | can incur (much) more than 3 balancings. |
617 | quota update for user, group */ | 617 | quota update for user, group */ |
618 | int jbegin_count = | 618 | int jbegin_count = |
619 | JOURNAL_PER_BALANCE_CNT * 3 + 1 + | 619 | JOURNAL_PER_BALANCE_CNT * 3 + 1 + |
620 | 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb); | 620 | 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb); |
621 | int version; | 621 | int version; |
622 | int dangle = 1; | 622 | int dangle = 1; |
623 | loff_t new_offset = | 623 | loff_t new_offset = |
624 | (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1; | 624 | (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1; |
625 | 625 | ||
626 | /* bad.... */ | 626 | /* bad.... */ |
627 | reiserfs_write_lock(inode->i_sb); | 627 | reiserfs_write_lock(inode->i_sb); |
628 | version = get_inode_item_key_version(inode); | 628 | version = get_inode_item_key_version(inode); |
629 | 629 | ||
630 | if (block < 0) { | 630 | if (block < 0) { |
631 | reiserfs_write_unlock(inode->i_sb); | 631 | reiserfs_write_unlock(inode->i_sb); |
632 | return -EIO; | 632 | return -EIO; |
633 | } | 633 | } |
634 | 634 | ||
635 | if (!file_capable(inode, block)) { | 635 | if (!file_capable(inode, block)) { |
636 | reiserfs_write_unlock(inode->i_sb); | 636 | reiserfs_write_unlock(inode->i_sb); |
637 | return -EFBIG; | 637 | return -EFBIG; |
638 | } | 638 | } |
639 | 639 | ||
640 | /* if !create, we aren't changing the FS, so we don't need to | 640 | /* if !create, we aren't changing the FS, so we don't need to |
641 | ** log anything, so we don't need to start a transaction | 641 | ** log anything, so we don't need to start a transaction |
642 | */ | 642 | */ |
643 | if (!(create & GET_BLOCK_CREATE)) { | 643 | if (!(create & GET_BLOCK_CREATE)) { |
644 | int ret; | 644 | int ret; |
645 | /* find number of block-th logical block of the file */ | 645 | /* find number of block-th logical block of the file */ |
646 | ret = _get_block_create_0(inode, block, bh_result, | 646 | ret = _get_block_create_0(inode, block, bh_result, |
647 | create | GET_BLOCK_READ_DIRECT); | 647 | create | GET_BLOCK_READ_DIRECT); |
648 | reiserfs_write_unlock(inode->i_sb); | 648 | reiserfs_write_unlock(inode->i_sb); |
649 | return ret; | 649 | return ret; |
650 | } | 650 | } |
651 | /* | 651 | /* |
652 | * if we're already in a transaction, make sure to close | 652 | * if we're already in a transaction, make sure to close |
653 | * any new transactions we start in this func | 653 | * any new transactions we start in this func |
654 | */ | 654 | */ |
655 | if ((create & GET_BLOCK_NO_DANGLE) || | 655 | if ((create & GET_BLOCK_NO_DANGLE) || |
656 | reiserfs_transaction_running(inode->i_sb)) | 656 | reiserfs_transaction_running(inode->i_sb)) |
657 | dangle = 0; | 657 | dangle = 0; |
658 | 658 | ||
659 | /* If file is of such a size, that it might have a tail and tails are enabled | 659 | /* If file is of such a size, that it might have a tail and tails are enabled |
660 | ** we should mark it as possibly needing tail packing on close | 660 | ** we should mark it as possibly needing tail packing on close |
661 | */ | 661 | */ |
662 | if ((have_large_tails(inode->i_sb) | 662 | if ((have_large_tails(inode->i_sb) |
663 | && inode->i_size < i_block_size(inode) * 4) | 663 | && inode->i_size < i_block_size(inode) * 4) |
664 | || (have_small_tails(inode->i_sb) | 664 | || (have_small_tails(inode->i_sb) |
665 | && inode->i_size < i_block_size(inode))) | 665 | && inode->i_size < i_block_size(inode))) |
666 | REISERFS_I(inode)->i_flags |= i_pack_on_close_mask; | 666 | REISERFS_I(inode)->i_flags |= i_pack_on_close_mask; |
667 | 667 | ||
668 | /* set the key of the first byte in the 'block'-th block of file */ | 668 | /* set the key of the first byte in the 'block'-th block of file */ |
669 | make_cpu_key(&key, inode, new_offset, TYPE_ANY, 3 /*key length */ ); | 669 | make_cpu_key(&key, inode, new_offset, TYPE_ANY, 3 /*key length */ ); |
670 | if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) { | 670 | if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) { |
671 | start_trans: | 671 | start_trans: |
672 | th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count); | 672 | th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count); |
673 | if (!th) { | 673 | if (!th) { |
674 | retval = -ENOMEM; | 674 | retval = -ENOMEM; |
675 | goto failure; | 675 | goto failure; |
676 | } | 676 | } |
677 | reiserfs_update_inode_transaction(inode); | 677 | reiserfs_update_inode_transaction(inode); |
678 | } | 678 | } |
679 | research: | 679 | research: |
680 | 680 | ||
681 | retval = search_for_position_by_key(inode->i_sb, &key, &path); | 681 | retval = search_for_position_by_key(inode->i_sb, &key, &path); |
682 | if (retval == IO_ERROR) { | 682 | if (retval == IO_ERROR) { |
683 | retval = -EIO; | 683 | retval = -EIO; |
684 | goto failure; | 684 | goto failure; |
685 | } | 685 | } |
686 | 686 | ||
687 | bh = get_last_bh(&path); | 687 | bh = get_last_bh(&path); |
688 | ih = get_ih(&path); | 688 | ih = get_ih(&path); |
689 | item = get_item(&path); | 689 | item = get_item(&path); |
690 | pos_in_item = path.pos_in_item; | 690 | pos_in_item = path.pos_in_item; |
691 | 691 | ||
692 | fs_gen = get_generation(inode->i_sb); | 692 | fs_gen = get_generation(inode->i_sb); |
693 | copy_item_head(&tmp_ih, ih); | 693 | copy_item_head(&tmp_ih, ih); |
694 | 694 | ||
695 | if (allocation_needed | 695 | if (allocation_needed |
696 | (retval, allocated_block_nr, ih, item, pos_in_item)) { | 696 | (retval, allocated_block_nr, ih, item, pos_in_item)) { |
697 | /* we have to allocate block for the unformatted node */ | 697 | /* we have to allocate block for the unformatted node */ |
698 | if (!th) { | 698 | if (!th) { |
699 | pathrelse(&path); | 699 | pathrelse(&path); |
700 | goto start_trans; | 700 | goto start_trans; |
701 | } | 701 | } |
702 | 702 | ||
703 | repeat = | 703 | repeat = |
704 | _allocate_block(th, block, inode, &allocated_block_nr, | 704 | _allocate_block(th, block, inode, &allocated_block_nr, |
705 | &path, create); | 705 | &path, create); |
706 | 706 | ||
707 | if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) { | 707 | if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) { |
708 | /* restart the transaction to give the journal a chance to free | 708 | /* restart the transaction to give the journal a chance to free |
709 | ** some blocks. releases the path, so we have to go back to | 709 | ** some blocks. releases the path, so we have to go back to |
710 | ** research if we succeed on the second try | 710 | ** research if we succeed on the second try |
711 | */ | 711 | */ |
712 | SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1; | 712 | SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1; |
713 | retval = restart_transaction(th, inode, &path); | 713 | retval = restart_transaction(th, inode, &path); |
714 | if (retval) | 714 | if (retval) |
715 | goto failure; | 715 | goto failure; |
716 | repeat = | 716 | repeat = |
717 | _allocate_block(th, block, inode, | 717 | _allocate_block(th, block, inode, |
718 | &allocated_block_nr, NULL, create); | 718 | &allocated_block_nr, NULL, create); |
719 | 719 | ||
720 | if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) { | 720 | if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) { |
721 | goto research; | 721 | goto research; |
722 | } | 722 | } |
723 | if (repeat == QUOTA_EXCEEDED) | 723 | if (repeat == QUOTA_EXCEEDED) |
724 | retval = -EDQUOT; | 724 | retval = -EDQUOT; |
725 | else | 725 | else |
726 | retval = -ENOSPC; | 726 | retval = -ENOSPC; |
727 | goto failure; | 727 | goto failure; |
728 | } | 728 | } |
729 | 729 | ||
730 | if (fs_changed(fs_gen, inode->i_sb) | 730 | if (fs_changed(fs_gen, inode->i_sb) |
731 | && item_moved(&tmp_ih, &path)) { | 731 | && item_moved(&tmp_ih, &path)) { |
732 | goto research; | 732 | goto research; |
733 | } | 733 | } |
734 | } | 734 | } |
735 | 735 | ||
736 | if (indirect_item_found(retval, ih)) { | 736 | if (indirect_item_found(retval, ih)) { |
737 | b_blocknr_t unfm_ptr; | 737 | b_blocknr_t unfm_ptr; |
738 | /* 'block'-th block is in the file already (there is | 738 | /* 'block'-th block is in the file already (there is |
739 | corresponding cell in some indirect item). But it may be | 739 | corresponding cell in some indirect item). But it may be |
740 | zero unformatted node pointer (hole) */ | 740 | zero unformatted node pointer (hole) */ |
741 | unfm_ptr = get_block_num(item, pos_in_item); | 741 | unfm_ptr = get_block_num(item, pos_in_item); |
742 | if (unfm_ptr == 0) { | 742 | if (unfm_ptr == 0) { |
743 | /* use allocated block to plug the hole */ | 743 | /* use allocated block to plug the hole */ |
744 | reiserfs_prepare_for_journal(inode->i_sb, bh, 1); | 744 | reiserfs_prepare_for_journal(inode->i_sb, bh, 1); |
745 | if (fs_changed(fs_gen, inode->i_sb) | 745 | if (fs_changed(fs_gen, inode->i_sb) |
746 | && item_moved(&tmp_ih, &path)) { | 746 | && item_moved(&tmp_ih, &path)) { |
747 | reiserfs_restore_prepared_buffer(inode->i_sb, | 747 | reiserfs_restore_prepared_buffer(inode->i_sb, |
748 | bh); | 748 | bh); |
749 | goto research; | 749 | goto research; |
750 | } | 750 | } |
751 | set_buffer_new(bh_result); | 751 | set_buffer_new(bh_result); |
752 | if (buffer_dirty(bh_result) | 752 | if (buffer_dirty(bh_result) |
753 | && reiserfs_data_ordered(inode->i_sb)) | 753 | && reiserfs_data_ordered(inode->i_sb)) |
754 | reiserfs_add_ordered_list(inode, bh_result); | 754 | reiserfs_add_ordered_list(inode, bh_result); |
755 | put_block_num(item, pos_in_item, allocated_block_nr); | 755 | put_block_num(item, pos_in_item, allocated_block_nr); |
756 | unfm_ptr = allocated_block_nr; | 756 | unfm_ptr = allocated_block_nr; |
757 | journal_mark_dirty(th, inode->i_sb, bh); | 757 | journal_mark_dirty(th, inode->i_sb, bh); |
758 | reiserfs_update_sd(th, inode); | 758 | reiserfs_update_sd(th, inode); |
759 | } | 759 | } |
760 | set_block_dev_mapped(bh_result, unfm_ptr, inode); | 760 | set_block_dev_mapped(bh_result, unfm_ptr, inode); |
761 | pathrelse(&path); | 761 | pathrelse(&path); |
762 | retval = 0; | 762 | retval = 0; |
763 | if (!dangle && th) | 763 | if (!dangle && th) |
764 | retval = reiserfs_end_persistent_transaction(th); | 764 | retval = reiserfs_end_persistent_transaction(th); |
765 | 765 | ||
766 | reiserfs_write_unlock(inode->i_sb); | 766 | reiserfs_write_unlock(inode->i_sb); |
767 | 767 | ||
768 | /* the item was found, so new blocks were not added to the file | 768 | /* the item was found, so new blocks were not added to the file |
769 | ** there is no need to make sure the inode is updated with this | 769 | ** there is no need to make sure the inode is updated with this |
770 | ** transaction | 770 | ** transaction |
771 | */ | 771 | */ |
772 | return retval; | 772 | return retval; |
773 | } | 773 | } |
774 | 774 | ||
775 | if (!th) { | 775 | if (!th) { |
776 | pathrelse(&path); | 776 | pathrelse(&path); |
777 | goto start_trans; | 777 | goto start_trans; |
778 | } | 778 | } |
779 | 779 | ||
780 | /* desired position is not found or is in the direct item. We have | 780 | /* desired position is not found or is in the direct item. We have |
781 | to append file with holes up to 'block'-th block converting | 781 | to append file with holes up to 'block'-th block converting |
782 | direct items to indirect one if necessary */ | 782 | direct items to indirect one if necessary */ |
783 | done = 0; | 783 | done = 0; |
784 | do { | 784 | do { |
785 | if (is_statdata_le_ih(ih)) { | 785 | if (is_statdata_le_ih(ih)) { |
786 | __le32 unp = 0; | 786 | __le32 unp = 0; |
787 | struct cpu_key tmp_key; | 787 | struct cpu_key tmp_key; |
788 | 788 | ||
789 | /* indirect item has to be inserted */ | 789 | /* indirect item has to be inserted */ |
790 | make_le_item_head(&tmp_ih, &key, version, 1, | 790 | make_le_item_head(&tmp_ih, &key, version, 1, |
791 | TYPE_INDIRECT, UNFM_P_SIZE, | 791 | TYPE_INDIRECT, UNFM_P_SIZE, |
792 | 0 /* free_space */ ); | 792 | 0 /* free_space */ ); |
793 | 793 | ||
794 | if (cpu_key_k_offset(&key) == 1) { | 794 | if (cpu_key_k_offset(&key) == 1) { |
795 | /* we are going to add 'block'-th block to the file. Use | 795 | /* we are going to add 'block'-th block to the file. Use |
796 | allocated block for that */ | 796 | allocated block for that */ |
797 | unp = cpu_to_le32(allocated_block_nr); | 797 | unp = cpu_to_le32(allocated_block_nr); |
798 | set_block_dev_mapped(bh_result, | 798 | set_block_dev_mapped(bh_result, |
799 | allocated_block_nr, inode); | 799 | allocated_block_nr, inode); |
800 | set_buffer_new(bh_result); | 800 | set_buffer_new(bh_result); |
801 | done = 1; | 801 | done = 1; |
802 | } | 802 | } |
803 | tmp_key = key; // ;) | 803 | tmp_key = key; // ;) |
804 | set_cpu_key_k_offset(&tmp_key, 1); | 804 | set_cpu_key_k_offset(&tmp_key, 1); |
805 | PATH_LAST_POSITION(&path)++; | 805 | PATH_LAST_POSITION(&path)++; |
806 | 806 | ||
807 | retval = | 807 | retval = |
808 | reiserfs_insert_item(th, &path, &tmp_key, &tmp_ih, | 808 | reiserfs_insert_item(th, &path, &tmp_key, &tmp_ih, |
809 | inode, (char *)&unp); | 809 | inode, (char *)&unp); |
810 | if (retval) { | 810 | if (retval) { |
811 | reiserfs_free_block(th, inode, | 811 | reiserfs_free_block(th, inode, |
812 | allocated_block_nr, 1); | 812 | allocated_block_nr, 1); |
813 | goto failure; // retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST | 813 | goto failure; // retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST |
814 | } | 814 | } |
815 | //mark_tail_converted (inode); | 815 | //mark_tail_converted (inode); |
816 | } else if (is_direct_le_ih(ih)) { | 816 | } else if (is_direct_le_ih(ih)) { |
817 | /* direct item has to be converted */ | 817 | /* direct item has to be converted */ |
818 | loff_t tail_offset; | 818 | loff_t tail_offset; |
819 | 819 | ||
820 | tail_offset = | 820 | tail_offset = |
821 | ((le_ih_k_offset(ih) - | 821 | ((le_ih_k_offset(ih) - |
822 | 1) & ~(inode->i_sb->s_blocksize - 1)) + 1; | 822 | 1) & ~(inode->i_sb->s_blocksize - 1)) + 1; |
823 | if (tail_offset == cpu_key_k_offset(&key)) { | 823 | if (tail_offset == cpu_key_k_offset(&key)) { |
824 | /* direct item we just found fits into block we have | 824 | /* direct item we just found fits into block we have |
825 | to map. Convert it into unformatted node: use | 825 | to map. Convert it into unformatted node: use |
826 | bh_result for the conversion */ | 826 | bh_result for the conversion */ |
827 | set_block_dev_mapped(bh_result, | 827 | set_block_dev_mapped(bh_result, |
828 | allocated_block_nr, inode); | 828 | allocated_block_nr, inode); |
829 | unbh = bh_result; | 829 | unbh = bh_result; |
830 | done = 1; | 830 | done = 1; |
831 | } else { | 831 | } else { |
832 | /* we have to padd file tail stored in direct item(s) | 832 | /* we have to padd file tail stored in direct item(s) |
833 | up to block size and convert it to unformatted | 833 | up to block size and convert it to unformatted |
834 | node. FIXME: this should also get into page cache */ | 834 | node. FIXME: this should also get into page cache */ |
835 | 835 | ||
836 | pathrelse(&path); | 836 | pathrelse(&path); |
837 | /* | 837 | /* |
838 | * ugly, but we can only end the transaction if | 838 | * ugly, but we can only end the transaction if |
839 | * we aren't nested | 839 | * we aren't nested |
840 | */ | 840 | */ |
841 | BUG_ON(!th->t_refcount); | 841 | BUG_ON(!th->t_refcount); |
842 | if (th->t_refcount == 1) { | 842 | if (th->t_refcount == 1) { |
843 | retval = | 843 | retval = |
844 | reiserfs_end_persistent_transaction | 844 | reiserfs_end_persistent_transaction |
845 | (th); | 845 | (th); |
846 | th = NULL; | 846 | th = NULL; |
847 | if (retval) | 847 | if (retval) |
848 | goto failure; | 848 | goto failure; |
849 | } | 849 | } |
850 | 850 | ||
851 | retval = | 851 | retval = |
852 | convert_tail_for_hole(inode, bh_result, | 852 | convert_tail_for_hole(inode, bh_result, |
853 | tail_offset); | 853 | tail_offset); |
854 | if (retval) { | 854 | if (retval) { |
855 | if (retval != -ENOSPC) | 855 | if (retval != -ENOSPC) |
856 | reiserfs_warning(inode->i_sb, | 856 | reiserfs_warning(inode->i_sb, |
857 | "clm-6004: convert tail failed inode %lu, error %d", | 857 | "clm-6004: convert tail failed inode %lu, error %d", |
858 | inode->i_ino, | 858 | inode->i_ino, |
859 | retval); | 859 | retval); |
860 | if (allocated_block_nr) { | 860 | if (allocated_block_nr) { |
861 | /* the bitmap, the super, and the stat data == 3 */ | 861 | /* the bitmap, the super, and the stat data == 3 */ |
862 | if (!th) | 862 | if (!th) |
863 | th = reiserfs_persistent_transaction(inode->i_sb, 3); | 863 | th = reiserfs_persistent_transaction(inode->i_sb, 3); |
864 | if (th) | 864 | if (th) |
865 | reiserfs_free_block(th, | 865 | reiserfs_free_block(th, |
866 | inode, | 866 | inode, |
867 | allocated_block_nr, | 867 | allocated_block_nr, |
868 | 1); | 868 | 1); |
869 | } | 869 | } |
870 | goto failure; | 870 | goto failure; |
871 | } | 871 | } |
872 | goto research; | 872 | goto research; |
873 | } | 873 | } |
874 | retval = | 874 | retval = |
875 | direct2indirect(th, inode, &path, unbh, | 875 | direct2indirect(th, inode, &path, unbh, |
876 | tail_offset); | 876 | tail_offset); |
877 | if (retval) { | 877 | if (retval) { |
878 | reiserfs_unmap_buffer(unbh); | 878 | reiserfs_unmap_buffer(unbh); |
879 | reiserfs_free_block(th, inode, | 879 | reiserfs_free_block(th, inode, |
880 | allocated_block_nr, 1); | 880 | allocated_block_nr, 1); |
881 | goto failure; | 881 | goto failure; |
882 | } | 882 | } |
883 | /* it is important the set_buffer_uptodate is done after | 883 | /* it is important the set_buffer_uptodate is done after |
884 | ** the direct2indirect. The buffer might contain valid | 884 | ** the direct2indirect. The buffer might contain valid |
885 | ** data newer than the data on disk (read by readpage, changed, | 885 | ** data newer than the data on disk (read by readpage, changed, |
886 | ** and then sent here by writepage). direct2indirect needs | 886 | ** and then sent here by writepage). direct2indirect needs |
887 | ** to know if unbh was already up to date, so it can decide | 887 | ** to know if unbh was already up to date, so it can decide |
888 | ** if the data in unbh needs to be replaced with data from | 888 | ** if the data in unbh needs to be replaced with data from |
889 | ** the disk | 889 | ** the disk |
890 | */ | 890 | */ |
891 | set_buffer_uptodate(unbh); | 891 | set_buffer_uptodate(unbh); |
892 | 892 | ||
893 | /* unbh->b_page == NULL in case of DIRECT_IO request, this means | 893 | /* unbh->b_page == NULL in case of DIRECT_IO request, this means |
894 | buffer will disappear shortly, so it should not be added to | 894 | buffer will disappear shortly, so it should not be added to |
895 | */ | 895 | */ |
896 | if (unbh->b_page) { | 896 | if (unbh->b_page) { |
897 | /* we've converted the tail, so we must | 897 | /* we've converted the tail, so we must |
898 | ** flush unbh before the transaction commits | 898 | ** flush unbh before the transaction commits |
899 | */ | 899 | */ |
900 | reiserfs_add_tail_list(inode, unbh); | 900 | reiserfs_add_tail_list(inode, unbh); |
901 | 901 | ||
902 | /* mark it dirty now to prevent commit_write from adding | 902 | /* mark it dirty now to prevent commit_write from adding |
903 | ** this buffer to the inode's dirty buffer list | 903 | ** this buffer to the inode's dirty buffer list |
904 | */ | 904 | */ |
905 | /* | 905 | /* |
906 | * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty(). | 906 | * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty(). |
907 | * It's still atomic, but it sets the page dirty too, | 907 | * It's still atomic, but it sets the page dirty too, |
908 | * which makes it eligible for writeback at any time by the | 908 | * which makes it eligible for writeback at any time by the |
909 | * VM (which was also the case with __mark_buffer_dirty()) | 909 | * VM (which was also the case with __mark_buffer_dirty()) |
910 | */ | 910 | */ |
911 | mark_buffer_dirty(unbh); | 911 | mark_buffer_dirty(unbh); |
912 | } | 912 | } |
913 | } else { | 913 | } else { |
914 | /* append indirect item with holes if needed, when appending | 914 | /* append indirect item with holes if needed, when appending |
915 | pointer to 'block'-th block use block, which is already | 915 | pointer to 'block'-th block use block, which is already |
916 | allocated */ | 916 | allocated */ |
917 | struct cpu_key tmp_key; | 917 | struct cpu_key tmp_key; |
918 | unp_t unf_single = 0; // We use this in case we need to allocate only | 918 | unp_t unf_single = 0; // We use this in case we need to allocate only |
919 | // one block which is a fastpath | 919 | // one block which is a fastpath |
920 | unp_t *un; | 920 | unp_t *un; |
921 | __u64 max_to_insert = | 921 | __u64 max_to_insert = |
922 | MAX_ITEM_LEN(inode->i_sb->s_blocksize) / | 922 | MAX_ITEM_LEN(inode->i_sb->s_blocksize) / |
923 | UNFM_P_SIZE; | 923 | UNFM_P_SIZE; |
924 | __u64 blocks_needed; | 924 | __u64 blocks_needed; |
925 | 925 | ||
926 | RFALSE(pos_in_item != ih_item_len(ih) / UNFM_P_SIZE, | 926 | RFALSE(pos_in_item != ih_item_len(ih) / UNFM_P_SIZE, |
927 | "vs-804: invalid position for append"); | 927 | "vs-804: invalid position for append"); |
928 | /* indirect item has to be appended, set up key of that position */ | 928 | /* indirect item has to be appended, set up key of that position */ |
929 | make_cpu_key(&tmp_key, inode, | 929 | make_cpu_key(&tmp_key, inode, |
930 | le_key_k_offset(version, | 930 | le_key_k_offset(version, |
931 | &(ih->ih_key)) + | 931 | &(ih->ih_key)) + |
932 | op_bytes_number(ih, | 932 | op_bytes_number(ih, |
933 | inode->i_sb->s_blocksize), | 933 | inode->i_sb->s_blocksize), |
934 | //pos_in_item * inode->i_sb->s_blocksize, | 934 | //pos_in_item * inode->i_sb->s_blocksize, |
935 | TYPE_INDIRECT, 3); // key type is unimportant | 935 | TYPE_INDIRECT, 3); // key type is unimportant |
936 | 936 | ||
937 | blocks_needed = | 937 | blocks_needed = |
938 | 1 + | 938 | 1 + |
939 | ((cpu_key_k_offset(&key) - | 939 | ((cpu_key_k_offset(&key) - |
940 | cpu_key_k_offset(&tmp_key)) >> inode->i_sb-> | 940 | cpu_key_k_offset(&tmp_key)) >> inode->i_sb-> |
941 | s_blocksize_bits); | 941 | s_blocksize_bits); |
942 | RFALSE(blocks_needed < 0, "green-805: invalid offset"); | 942 | RFALSE(blocks_needed < 0, "green-805: invalid offset"); |
943 | 943 | ||
944 | if (blocks_needed == 1) { | 944 | if (blocks_needed == 1) { |
945 | un = &unf_single; | 945 | un = &unf_single; |
946 | } else { | 946 | } else { |
947 | un = kmalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_ATOMIC); // We need to avoid scheduling. | 947 | un = kmalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_ATOMIC); // We need to avoid scheduling. |
948 | if (!un) { | 948 | if (!un) { |
949 | un = &unf_single; | 949 | un = &unf_single; |
950 | blocks_needed = 1; | 950 | blocks_needed = 1; |
951 | max_to_insert = 0; | 951 | max_to_insert = 0; |
952 | } else | 952 | } else |
953 | memset(un, 0, | 953 | memset(un, 0, |
954 | UNFM_P_SIZE * min(blocks_needed, | 954 | UNFM_P_SIZE * min(blocks_needed, |
955 | max_to_insert)); | 955 | max_to_insert)); |
956 | } | 956 | } |
957 | if (blocks_needed <= max_to_insert) { | 957 | if (blocks_needed <= max_to_insert) { |
958 | /* we are going to add target block to the file. Use allocated | 958 | /* we are going to add target block to the file. Use allocated |
959 | block for that */ | 959 | block for that */ |
960 | un[blocks_needed - 1] = | 960 | un[blocks_needed - 1] = |
961 | cpu_to_le32(allocated_block_nr); | 961 | cpu_to_le32(allocated_block_nr); |
962 | set_block_dev_mapped(bh_result, | 962 | set_block_dev_mapped(bh_result, |
963 | allocated_block_nr, inode); | 963 | allocated_block_nr, inode); |
964 | set_buffer_new(bh_result); | 964 | set_buffer_new(bh_result); |
965 | done = 1; | 965 | done = 1; |
966 | } else { | 966 | } else { |
967 | /* paste hole to the indirect item */ | 967 | /* paste hole to the indirect item */ |
968 | /* If kmalloc failed, max_to_insert becomes zero and it means we | 968 | /* If kmalloc failed, max_to_insert becomes zero and it means we |
969 | only have space for one block */ | 969 | only have space for one block */ |
970 | blocks_needed = | 970 | blocks_needed = |
971 | max_to_insert ? max_to_insert : 1; | 971 | max_to_insert ? max_to_insert : 1; |
972 | } | 972 | } |
973 | retval = | 973 | retval = |
974 | reiserfs_paste_into_item(th, &path, &tmp_key, inode, | 974 | reiserfs_paste_into_item(th, &path, &tmp_key, inode, |
975 | (char *)un, | 975 | (char *)un, |
976 | UNFM_P_SIZE * | 976 | UNFM_P_SIZE * |
977 | blocks_needed); | 977 | blocks_needed); |
978 | 978 | ||
979 | if (blocks_needed != 1) | 979 | if (blocks_needed != 1) |
980 | kfree(un); | 980 | kfree(un); |
981 | 981 | ||
982 | if (retval) { | 982 | if (retval) { |
983 | reiserfs_free_block(th, inode, | 983 | reiserfs_free_block(th, inode, |
984 | allocated_block_nr, 1); | 984 | allocated_block_nr, 1); |
985 | goto failure; | 985 | goto failure; |
986 | } | 986 | } |
987 | if (!done) { | 987 | if (!done) { |
988 | /* We need to mark new file size in case this function will be | 988 | /* We need to mark new file size in case this function will be |
989 | interrupted/aborted later on. And we may do this only for | 989 | interrupted/aborted later on. And we may do this only for |
990 | holes. */ | 990 | holes. */ |
991 | inode->i_size += | 991 | inode->i_size += |
992 | inode->i_sb->s_blocksize * blocks_needed; | 992 | inode->i_sb->s_blocksize * blocks_needed; |
993 | } | 993 | } |
994 | } | 994 | } |
995 | 995 | ||
996 | if (done == 1) | 996 | if (done == 1) |
997 | break; | 997 | break; |
998 | 998 | ||
999 | /* this loop could log more blocks than we had originally asked | 999 | /* this loop could log more blocks than we had originally asked |
1000 | ** for. So, we have to allow the transaction to end if it is | 1000 | ** for. So, we have to allow the transaction to end if it is |
1001 | ** too big or too full. Update the inode so things are | 1001 | ** too big or too full. Update the inode so things are |
1002 | ** consistent if we crash before the function returns | 1002 | ** consistent if we crash before the function returns |
1003 | ** | 1003 | ** |
1004 | ** release the path so that anybody waiting on the path before | 1004 | ** release the path so that anybody waiting on the path before |
1005 | ** ending their transaction will be able to continue. | 1005 | ** ending their transaction will be able to continue. |
1006 | */ | 1006 | */ |
1007 | if (journal_transaction_should_end(th, th->t_blocks_allocated)) { | 1007 | if (journal_transaction_should_end(th, th->t_blocks_allocated)) { |
1008 | retval = restart_transaction(th, inode, &path); | 1008 | retval = restart_transaction(th, inode, &path); |
1009 | if (retval) | 1009 | if (retval) |
1010 | goto failure; | 1010 | goto failure; |
1011 | } | 1011 | } |
1012 | /* inserting indirect pointers for a hole can take a | 1012 | /* inserting indirect pointers for a hole can take a |
1013 | ** long time. reschedule if needed | 1013 | ** long time. reschedule if needed |
1014 | */ | 1014 | */ |
1015 | cond_resched(); | 1015 | cond_resched(); |
1016 | 1016 | ||
1017 | retval = search_for_position_by_key(inode->i_sb, &key, &path); | 1017 | retval = search_for_position_by_key(inode->i_sb, &key, &path); |
1018 | if (retval == IO_ERROR) { | 1018 | if (retval == IO_ERROR) { |
1019 | retval = -EIO; | 1019 | retval = -EIO; |
1020 | goto failure; | 1020 | goto failure; |
1021 | } | 1021 | } |
1022 | if (retval == POSITION_FOUND) { | 1022 | if (retval == POSITION_FOUND) { |
1023 | reiserfs_warning(inode->i_sb, | 1023 | reiserfs_warning(inode->i_sb, |
1024 | "vs-825: reiserfs_get_block: " | 1024 | "vs-825: reiserfs_get_block: " |
1025 | "%K should not be found", &key); | 1025 | "%K should not be found", &key); |
1026 | retval = -EEXIST; | 1026 | retval = -EEXIST; |
1027 | if (allocated_block_nr) | 1027 | if (allocated_block_nr) |
1028 | reiserfs_free_block(th, inode, | 1028 | reiserfs_free_block(th, inode, |
1029 | allocated_block_nr, 1); | 1029 | allocated_block_nr, 1); |
1030 | pathrelse(&path); | 1030 | pathrelse(&path); |
1031 | goto failure; | 1031 | goto failure; |
1032 | } | 1032 | } |
1033 | bh = get_last_bh(&path); | 1033 | bh = get_last_bh(&path); |
1034 | ih = get_ih(&path); | 1034 | ih = get_ih(&path); |
1035 | item = get_item(&path); | 1035 | item = get_item(&path); |
1036 | pos_in_item = path.pos_in_item; | 1036 | pos_in_item = path.pos_in_item; |
1037 | } while (1); | 1037 | } while (1); |
1038 | 1038 | ||
1039 | retval = 0; | 1039 | retval = 0; |
1040 | 1040 | ||
1041 | failure: | 1041 | failure: |
1042 | if (th && (!dangle || (retval && !th->t_trans_id))) { | 1042 | if (th && (!dangle || (retval && !th->t_trans_id))) { |
1043 | int err; | 1043 | int err; |
1044 | if (th->t_trans_id) | 1044 | if (th->t_trans_id) |
1045 | reiserfs_update_sd(th, inode); | 1045 | reiserfs_update_sd(th, inode); |
1046 | err = reiserfs_end_persistent_transaction(th); | 1046 | err = reiserfs_end_persistent_transaction(th); |
1047 | if (err) | 1047 | if (err) |
1048 | retval = err; | 1048 | retval = err; |
1049 | } | 1049 | } |
1050 | 1050 | ||
1051 | reiserfs_write_unlock(inode->i_sb); | 1051 | reiserfs_write_unlock(inode->i_sb); |
1052 | reiserfs_check_path(&path); | 1052 | reiserfs_check_path(&path); |
1053 | return retval; | 1053 | return retval; |
1054 | } | 1054 | } |
1055 | 1055 | ||
1056 | static int | 1056 | static int |
1057 | reiserfs_readpages(struct file *file, struct address_space *mapping, | 1057 | reiserfs_readpages(struct file *file, struct address_space *mapping, |
1058 | struct list_head *pages, unsigned nr_pages) | 1058 | struct list_head *pages, unsigned nr_pages) |
1059 | { | 1059 | { |
1060 | return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block); | 1060 | return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block); |
1061 | } | 1061 | } |
1062 | 1062 | ||
1063 | /* Compute real number of used bytes by file | 1063 | /* Compute real number of used bytes by file |
1064 | * Following three functions can go away when we'll have enough space in stat item | 1064 | * Following three functions can go away when we'll have enough space in stat item |
1065 | */ | 1065 | */ |
1066 | static int real_space_diff(struct inode *inode, int sd_size) | 1066 | static int real_space_diff(struct inode *inode, int sd_size) |
1067 | { | 1067 | { |
1068 | int bytes; | 1068 | int bytes; |
1069 | loff_t blocksize = inode->i_sb->s_blocksize; | 1069 | loff_t blocksize = inode->i_sb->s_blocksize; |
1070 | 1070 | ||
1071 | if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) | 1071 | if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) |
1072 | return sd_size; | 1072 | return sd_size; |
1073 | 1073 | ||
1074 | /* End of file is also in full block with indirect reference, so round | 1074 | /* End of file is also in full block with indirect reference, so round |
1075 | ** up to the next block. | 1075 | ** up to the next block. |
1076 | ** | 1076 | ** |
1077 | ** there is just no way to know if the tail is actually packed | 1077 | ** there is just no way to know if the tail is actually packed |
1078 | ** on the file, so we have to assume it isn't. When we pack the | 1078 | ** on the file, so we have to assume it isn't. When we pack the |
1079 | ** tail, we add 4 bytes to pretend there really is an unformatted | 1079 | ** tail, we add 4 bytes to pretend there really is an unformatted |
1080 | ** node pointer | 1080 | ** node pointer |
1081 | */ | 1081 | */ |
1082 | bytes = | 1082 | bytes = |
1083 | ((inode->i_size + | 1083 | ((inode->i_size + |
1084 | (blocksize - 1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE + | 1084 | (blocksize - 1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE + |
1085 | sd_size; | 1085 | sd_size; |
1086 | return bytes; | 1086 | return bytes; |
1087 | } | 1087 | } |
1088 | 1088 | ||
1089 | static inline loff_t to_real_used_space(struct inode *inode, ulong blocks, | 1089 | static inline loff_t to_real_used_space(struct inode *inode, ulong blocks, |
1090 | int sd_size) | 1090 | int sd_size) |
1091 | { | 1091 | { |
1092 | if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) { | 1092 | if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) { |
1093 | return inode->i_size + | 1093 | return inode->i_size + |
1094 | (loff_t) (real_space_diff(inode, sd_size)); | 1094 | (loff_t) (real_space_diff(inode, sd_size)); |
1095 | } | 1095 | } |
1096 | return ((loff_t) real_space_diff(inode, sd_size)) + | 1096 | return ((loff_t) real_space_diff(inode, sd_size)) + |
1097 | (((loff_t) blocks) << 9); | 1097 | (((loff_t) blocks) << 9); |
1098 | } | 1098 | } |
1099 | 1099 | ||
1100 | /* Compute number of blocks used by file in ReiserFS counting */ | 1100 | /* Compute number of blocks used by file in ReiserFS counting */ |
1101 | static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size) | 1101 | static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size) |
1102 | { | 1102 | { |
1103 | loff_t bytes = inode_get_bytes(inode); | 1103 | loff_t bytes = inode_get_bytes(inode); |
1104 | loff_t real_space = real_space_diff(inode, sd_size); | 1104 | loff_t real_space = real_space_diff(inode, sd_size); |
1105 | 1105 | ||
1106 | /* keeps fsck and non-quota versions of reiserfs happy */ | 1106 | /* keeps fsck and non-quota versions of reiserfs happy */ |
1107 | if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) { | 1107 | if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) { |
1108 | bytes += (loff_t) 511; | 1108 | bytes += (loff_t) 511; |
1109 | } | 1109 | } |
1110 | 1110 | ||
1111 | /* files from before the quota patch might i_blocks such that | 1111 | /* files from before the quota patch might i_blocks such that |
1112 | ** bytes < real_space. Deal with that here to prevent it from | 1112 | ** bytes < real_space. Deal with that here to prevent it from |
1113 | ** going negative. | 1113 | ** going negative. |
1114 | */ | 1114 | */ |
1115 | if (bytes < real_space) | 1115 | if (bytes < real_space) |
1116 | return 0; | 1116 | return 0; |
1117 | return (bytes - real_space) >> 9; | 1117 | return (bytes - real_space) >> 9; |
1118 | } | 1118 | } |
1119 | 1119 | ||
1120 | // | 1120 | // |
1121 | // BAD: new directories have stat data of new type and all other items | 1121 | // BAD: new directories have stat data of new type and all other items |
1122 | // of old type. Version stored in the inode says about body items, so | 1122 | // of old type. Version stored in the inode says about body items, so |
1123 | // in update_stat_data we can not rely on inode, but have to check | 1123 | // in update_stat_data we can not rely on inode, but have to check |
1124 | // item version directly | 1124 | // item version directly |
1125 | // | 1125 | // |
1126 | 1126 | ||
// called by read_locked_inode
//
// Populate a freshly-read in-core inode from the stat data item that
// `path` points at, handling both the old (v1) and new (v2) on-disk
// stat data formats, then releases the path and wires up the
// inode/file/address-space operations by file type.
static void init_inode(struct inode *inode, struct path *path)
{
	struct buffer_head *bh;
	struct item_head *ih;
	__u32 rdev;
	//int version = ITEM_VERSION_1;

	bh = PATH_PLAST_BUFFER(path);
	ih = PATH_PITEM_HEAD(path);

	/* the item's key becomes the inode's on-disk identity */
	copy_key(INODE_PKEY(inode), &(ih->ih_key));
	inode->i_blksize = reiserfs_default_io_size;

	/* reset all reiserfs-private per-inode state */
	INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list));
	REISERFS_I(inode)->i_flags = 0;
	REISERFS_I(inode)->i_prealloc_block = 0;
	REISERFS_I(inode)->i_prealloc_count = 0;
	REISERFS_I(inode)->i_trans_id = 0;
	REISERFS_I(inode)->i_jl = NULL;
	REISERFS_I(inode)->i_acl_access = NULL;
	REISERFS_I(inode)->i_acl_default = NULL;
	init_rwsem(&REISERFS_I(inode)->xattr_sem);

	if (stat_data_v1(ih)) {
		/* old-format (3.5) stat data */
		struct stat_data_v1 *sd =
		    (struct stat_data_v1 *)B_I_PITEM(bh, ih);
		unsigned long blocks;

		set_inode_item_key_version(inode, KEY_FORMAT_3_5);
		set_inode_sd_version(inode, STAT_DATA_V1);
		inode->i_mode = sd_v1_mode(sd);
		inode->i_nlink = sd_v1_nlink(sd);
		inode->i_uid = sd_v1_uid(sd);
		inode->i_gid = sd_v1_gid(sd);
		inode->i_size = sd_v1_size(sd);
		inode->i_atime.tv_sec = sd_v1_atime(sd);
		inode->i_mtime.tv_sec = sd_v1_mtime(sd);
		inode->i_ctime.tv_sec = sd_v1_ctime(sd);
		/* v1 stat data has only second resolution timestamps */
		inode->i_atime.tv_nsec = 0;
		inode->i_ctime.tv_nsec = 0;
		inode->i_mtime.tv_nsec = 0;

		inode->i_blocks = sd_v1_blocks(sd);
		inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
		/* upper bound on sane block count, derived from i_size */
		blocks = (inode->i_size + 511) >> 9;
		blocks = _ROUND_UP(blocks, inode->i_sb->s_blocksize >> 9);
		if (inode->i_blocks > blocks) {
			// there was a bug in <=3.5.23 when i_blocks could take negative
			// values. Starting from 3.5.17 this value could even be stored in
			// stat data. For such files we set i_blocks based on file
			// size. Just 2 notes: this can be wrong for sparce files. On-disk value will be
			// only updated if file's inode will ever change
			inode->i_blocks = blocks;
		}

		rdev = sd_v1_rdev(sd);
		REISERFS_I(inode)->i_first_direct_byte =
		    sd_v1_first_direct_byte(sd);
		/* an early bug in the quota code can give us an odd number for the
		 ** block count. This is incorrect, fix it here.
		 */
		if (inode->i_blocks & 1) {
			inode->i_blocks++;
		}
		inode_set_bytes(inode,
				to_real_used_space(inode, inode->i_blocks,
						   SD_V1_SIZE));
		/* nopack is initially zero for v1 objects. For v2 objects,
		   nopack is initialised from sd_attrs */
		REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
	} else {
		// new stat data found, but object may have old items
		// (directories and symlinks)
		struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih);

		inode->i_mode = sd_v2_mode(sd);
		inode->i_nlink = sd_v2_nlink(sd);
		inode->i_uid = sd_v2_uid(sd);
		inode->i_size = sd_v2_size(sd);
		inode->i_gid = sd_v2_gid(sd);
		inode->i_mtime.tv_sec = sd_v2_mtime(sd);
		inode->i_atime.tv_sec = sd_v2_atime(sd);
		inode->i_ctime.tv_sec = sd_v2_ctime(sd);
		inode->i_ctime.tv_nsec = 0;
		inode->i_mtime.tv_nsec = 0;
		inode->i_atime.tv_nsec = 0;
		inode->i_blocks = sd_v2_blocks(sd);
		rdev = sd_v2_rdev(sd);
		/* device nodes store rdev where regular files store the
		 * generation, so derive generation from the key instead */
		if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
			inode->i_generation =
			    le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
		else
			inode->i_generation = sd_v2_generation(sd);

		/* directories and symlinks keep old-format body items even
		 * with new stat data (see comment above this function) */
		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
			set_inode_item_key_version(inode, KEY_FORMAT_3_5);
		else
			set_inode_item_key_version(inode, KEY_FORMAT_3_6);
		REISERFS_I(inode)->i_first_direct_byte = 0;
		set_inode_sd_version(inode, STAT_DATA_V2);
		inode_set_bytes(inode,
				to_real_used_space(inode, inode->i_blocks,
						   SD_V2_SIZE));
		/* read persistent inode attributes from sd and initalise
		   generic inode flags from them */
		REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd);
		sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode);
	}

	pathrelse(path);
	if (S_ISREG(inode->i_mode)) {
		inode->i_op = &reiserfs_file_inode_operations;
		inode->i_fop = &reiserfs_file_operations;
		inode->i_mapping->a_ops = &reiserfs_address_space_operations;
	} else if (S_ISDIR(inode->i_mode)) {
		inode->i_op = &reiserfs_dir_inode_operations;
		inode->i_fop = &reiserfs_dir_operations;
	} else if (S_ISLNK(inode->i_mode)) {
		inode->i_op = &reiserfs_symlink_inode_operations;
		inode->i_mapping->a_ops = &reiserfs_address_space_operations;
	} else {
		inode->i_blocks = 0;
		inode->i_op = &reiserfs_special_inode_operations;
		init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
	}
}
1254 | 1254 | ||
1255 | // update new stat data with inode fields | 1255 | // update new stat data with inode fields |
1256 | static void inode2sd(void *sd, struct inode *inode, loff_t size) | 1256 | static void inode2sd(void *sd, struct inode *inode, loff_t size) |
1257 | { | 1257 | { |
1258 | struct stat_data *sd_v2 = (struct stat_data *)sd; | 1258 | struct stat_data *sd_v2 = (struct stat_data *)sd; |
1259 | __u16 flags; | 1259 | __u16 flags; |
1260 | 1260 | ||
1261 | set_sd_v2_mode(sd_v2, inode->i_mode); | 1261 | set_sd_v2_mode(sd_v2, inode->i_mode); |
1262 | set_sd_v2_nlink(sd_v2, inode->i_nlink); | 1262 | set_sd_v2_nlink(sd_v2, inode->i_nlink); |
1263 | set_sd_v2_uid(sd_v2, inode->i_uid); | 1263 | set_sd_v2_uid(sd_v2, inode->i_uid); |
1264 | set_sd_v2_size(sd_v2, size); | 1264 | set_sd_v2_size(sd_v2, size); |
1265 | set_sd_v2_gid(sd_v2, inode->i_gid); | 1265 | set_sd_v2_gid(sd_v2, inode->i_gid); |
1266 | set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec); | 1266 | set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec); |
1267 | set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec); | 1267 | set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec); |
1268 | set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec); | 1268 | set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec); |
1269 | set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE)); | 1269 | set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE)); |
1270 | if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) | 1270 | if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) |
1271 | set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev)); | 1271 | set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev)); |
1272 | else | 1272 | else |
1273 | set_sd_v2_generation(sd_v2, inode->i_generation); | 1273 | set_sd_v2_generation(sd_v2, inode->i_generation); |
1274 | flags = REISERFS_I(inode)->i_attrs; | 1274 | flags = REISERFS_I(inode)->i_attrs; |
1275 | i_attrs_to_sd_attrs(inode, &flags); | 1275 | i_attrs_to_sd_attrs(inode, &flags); |
1276 | set_sd_v2_attrs(sd_v2, flags); | 1276 | set_sd_v2_attrs(sd_v2, flags); |
1277 | } | 1277 | } |
1278 | 1278 | ||
1279 | // used to copy inode's fields to old stat data | 1279 | // used to copy inode's fields to old stat data |
1280 | static void inode2sd_v1(void *sd, struct inode *inode, loff_t size) | 1280 | static void inode2sd_v1(void *sd, struct inode *inode, loff_t size) |
1281 | { | 1281 | { |
1282 | struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd; | 1282 | struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd; |
1283 | 1283 | ||
1284 | set_sd_v1_mode(sd_v1, inode->i_mode); | 1284 | set_sd_v1_mode(sd_v1, inode->i_mode); |
1285 | set_sd_v1_uid(sd_v1, inode->i_uid); | 1285 | set_sd_v1_uid(sd_v1, inode->i_uid); |
1286 | set_sd_v1_gid(sd_v1, inode->i_gid); | 1286 | set_sd_v1_gid(sd_v1, inode->i_gid); |
1287 | set_sd_v1_nlink(sd_v1, inode->i_nlink); | 1287 | set_sd_v1_nlink(sd_v1, inode->i_nlink); |
1288 | set_sd_v1_size(sd_v1, size); | 1288 | set_sd_v1_size(sd_v1, size); |
1289 | set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec); | 1289 | set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec); |
1290 | set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec); | 1290 | set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec); |
1291 | set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec); | 1291 | set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec); |
1292 | 1292 | ||
1293 | if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) | 1293 | if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) |
1294 | set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev)); | 1294 | set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev)); |
1295 | else | 1295 | else |
1296 | set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE)); | 1296 | set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE)); |
1297 | 1297 | ||
1298 | // Sigh. i_first_direct_byte is back | 1298 | // Sigh. i_first_direct_byte is back |
1299 | set_sd_v1_first_direct_byte(sd_v1, | 1299 | set_sd_v1_first_direct_byte(sd_v1, |
1300 | REISERFS_I(inode)->i_first_direct_byte); | 1300 | REISERFS_I(inode)->i_first_direct_byte); |
1301 | } | 1301 | } |
1302 | 1302 | ||
1303 | /* NOTE, you must prepare the buffer head before sending it here, | 1303 | /* NOTE, you must prepare the buffer head before sending it here, |
1304 | ** and then log it after the call | 1304 | ** and then log it after the call |
1305 | */ | 1305 | */ |
1306 | static void update_stat_data(struct path *path, struct inode *inode, | 1306 | static void update_stat_data(struct path *path, struct inode *inode, |
1307 | loff_t size) | 1307 | loff_t size) |
1308 | { | 1308 | { |
1309 | struct buffer_head *bh; | 1309 | struct buffer_head *bh; |
1310 | struct item_head *ih; | 1310 | struct item_head *ih; |
1311 | 1311 | ||
1312 | bh = PATH_PLAST_BUFFER(path); | 1312 | bh = PATH_PLAST_BUFFER(path); |
1313 | ih = PATH_PITEM_HEAD(path); | 1313 | ih = PATH_PITEM_HEAD(path); |
1314 | 1314 | ||
1315 | if (!is_statdata_le_ih(ih)) | 1315 | if (!is_statdata_le_ih(ih)) |
1316 | reiserfs_panic(inode->i_sb, | 1316 | reiserfs_panic(inode->i_sb, |
1317 | "vs-13065: update_stat_data: key %k, found item %h", | 1317 | "vs-13065: update_stat_data: key %k, found item %h", |
1318 | INODE_PKEY(inode), ih); | 1318 | INODE_PKEY(inode), ih); |
1319 | 1319 | ||
1320 | if (stat_data_v1(ih)) { | 1320 | if (stat_data_v1(ih)) { |
1321 | // path points to old stat data | 1321 | // path points to old stat data |
1322 | inode2sd_v1(B_I_PITEM(bh, ih), inode, size); | 1322 | inode2sd_v1(B_I_PITEM(bh, ih), inode, size); |
1323 | } else { | 1323 | } else { |
1324 | inode2sd(B_I_PITEM(bh, ih), inode, size); | 1324 | inode2sd(B_I_PITEM(bh, ih), inode, size); |
1325 | } | 1325 | } |
1326 | 1326 | ||
1327 | return; | 1327 | return; |
1328 | } | 1328 | } |
1329 | 1329 | ||
/* Write the in-core inode's stat data (with the given size) back into the
** tree and mark the containing buffer dirty in the running transaction.
** Must be called with a transaction open (th->t_trans_id != 0).  Failures
** are logged and swallowed; the function returns void.
*/
void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
			     struct inode *inode, loff_t size)
{
	struct cpu_key key;
	INITIALIZE_PATH(path);
	struct buffer_head *bh;
	int fs_gen;
	struct item_head *ih, tmp_ih;
	int retval;

	BUG_ON(!th->t_trans_id);

	make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3);	//key type is unimportant

	for (;;) {
		int pos;
		/* look for the object's stat data */
		retval = search_item(inode->i_sb, &key, &path);
		if (retval == IO_ERROR) {
			reiserfs_warning(inode->i_sb,
					 "vs-13050: reiserfs_update_sd: "
					 "i/o failure occurred trying to update %K stat data",
					 &key);
			return;
		}
		if (retval == ITEM_NOT_FOUND) {
			pos = PATH_LAST_POSITION(&path);
			pathrelse(&path);
			/* missing stat data is expected for an object being
			 * deleted; warn only if the inode is still linked */
			if (inode->i_nlink == 0) {
				/*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found"); */
				return;
			}
			reiserfs_warning(inode->i_sb,
					 "vs-13060: reiserfs_update_sd: "
					 "stat data of object %k (nlink == %d) not found (pos %d)",
					 INODE_PKEY(inode), inode->i_nlink,
					 pos);
			reiserfs_check_path(&path);
			return;
		}

		/* sigh, prepare_for_journal might schedule.  When it schedules the
		 ** FS might change.  We have to detect that, and loop back to the
		 ** search if the stat data item has moved
		 */
		bh = get_last_bh(&path);
		ih = get_ih(&path);
		copy_item_head(&tmp_ih, ih);
		fs_gen = get_generation(inode->i_sb);
		reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
		if (fs_changed(fs_gen, inode->i_sb)
		    && item_moved(&tmp_ih, &path)) {
			reiserfs_restore_prepared_buffer(inode->i_sb, bh);
			continue;	/* Stat_data item has been moved after scheduling. */
		}
		break;
	}
	/* bh is still journal-prepared here; update the item and log it */
	update_stat_data(&path, inode, size);
	journal_mark_dirty(th, th->t_super, bh);
	pathrelse(&path);
	return;
}
1392 | 1392 | ||
/* reiserfs_read_locked_inode is called to read the inode off disk, and it
** does a make_bad_inode when things go wrong.  But, we need to make sure
** and clear the key in the private portion of the inode, otherwise a
** corresponding iput might try to delete whatever object the inode last
** represented.
*/
static void reiserfs_make_bad_inode(struct inode *inode)
{
	/* zero the on-disk key first so iput can't act on the old object */
	memset(INODE_PKEY(inode), 0, KEY_SIZE);
	make_bad_inode(inode);
}
1404 | 1404 | ||
1405 | // | 1405 | // |
1406 | // initially this function was derived from minix or ext2's analog and | 1406 | // initially this function was derived from minix or ext2's analog and |
1407 | // evolved as the prototype did | 1407 | // evolved as the prototype did |
1408 | // | 1408 | // |
1409 | 1409 | ||
1410 | int reiserfs_init_locked_inode(struct inode *inode, void *p) | 1410 | int reiserfs_init_locked_inode(struct inode *inode, void *p) |
1411 | { | 1411 | { |
1412 | struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p; | 1412 | struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p; |
1413 | inode->i_ino = args->objectid; | 1413 | inode->i_ino = args->objectid; |
1414 | INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid); | 1414 | INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid); |
1415 | return 0; | 1415 | return 0; |
1416 | } | 1416 | } |
1417 | 1417 | ||
/* looks for stat data in the tree, and fills up the fields of in-core
   inode stat data fields.  On lookup failure the inode is turned into a
   bad inode (key cleared) rather than returning an error code. */
void reiserfs_read_locked_inode(struct inode *inode,
				struct reiserfs_iget_args *args)
{
	INITIALIZE_PATH(path_to_sd);
	struct cpu_key key;
	unsigned long dirino;
	int retval;

	dirino = args->dirid;

	/* set version 1, version 2 could be used too, because stat data
	   key is the same in both versions */
	key.version = KEY_FORMAT_3_5;
	key.on_disk_key.k_dir_id = dirino;
	key.on_disk_key.k_objectid = inode->i_ino;
	key.on_disk_key.k_offset = 0;
	key.on_disk_key.k_type = 0;

	/* look for the object's stat data */
	retval = search_item(inode->i_sb, &key, &path_to_sd);
	if (retval == IO_ERROR) {
		reiserfs_warning(inode->i_sb,
				 "vs-13070: reiserfs_read_locked_inode: "
				 "i/o failure occurred trying to find stat data of %K",
				 &key);
		reiserfs_make_bad_inode(inode);
		return;
	}
	if (retval != ITEM_FOUND) {
		/* a stale NFS handle can trigger this without it being an error */
		pathrelse(&path_to_sd);
		reiserfs_make_bad_inode(inode);
		/* nlink = 0 so callers treat the object as already gone */
		inode->i_nlink = 0;
		return;
	}

	init_inode(inode, &path_to_sd);

	/* It is possible that knfsd is trying to access inode of a file
	   that is being removed from the disk by some other thread. As we
	   update sd on unlink all that is required is to check for nlink
	   here. This bug was first found by Sizif when debugging
	   SquidNG/Butterfly, forgotten, and found again after Philippe
	   Gramoulle <philippe.gramoulle@mmania.com> reproduced it.

	   More logical fix would require changes in fs/inode.c:iput() to
	   remove inode from hash-table _after_ fs cleaned disk stuff up and
	   in iget() to return NULL if I_FREEING inode is found in
	   hash-table. */
	/* Currently there is one place where it's ok to meet inode with
	   nlink==0: processing of open-unlinked and half-truncated files
	   during mount (fs/reiserfs/super.c:finish_unfinished()). */
	if ((inode->i_nlink == 0) &&
	    !REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) {
		reiserfs_warning(inode->i_sb,
				 "vs-13075: reiserfs_read_locked_inode: "
				 "dead inode read from disk %K. "
				 "This is likely to be race with knfsd. Ignore",
				 &key);
		reiserfs_make_bad_inode(inode);
	}

	reiserfs_check_path(&path_to_sd);	/* init inode should be relsing */

}
1485 | 1485 | ||
1486 | /** | 1486 | /** |
1487 | * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked(). | 1487 | * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked(). |
1488 | * | 1488 | * |
1489 | * @inode: inode from hash table to check | 1489 | * @inode: inode from hash table to check |
1490 | * @opaque: "cookie" passed to iget5_locked(). This is &reiserfs_iget_args. | 1490 | * @opaque: "cookie" passed to iget5_locked(). This is &reiserfs_iget_args. |
1491 | * | 1491 | * |
1492 | * This function is called by iget5_locked() to distinguish reiserfs inodes | 1492 | * This function is called by iget5_locked() to distinguish reiserfs inodes |
1493 | * having the same inode numbers. Such inodes can only exist due to some | 1493 | * having the same inode numbers. Such inodes can only exist due to some |
1494 | * error condition. One of them should be bad. Inodes with identical | 1494 | * error condition. One of them should be bad. Inodes with identical |
1495 | * inode numbers (objectids) are distinguished by parent directory ids. | 1495 | * inode numbers (objectids) are distinguished by parent directory ids. |
1496 | * | 1496 | * |
1497 | */ | 1497 | */ |
1498 | int reiserfs_find_actor(struct inode *inode, void *opaque) | 1498 | int reiserfs_find_actor(struct inode *inode, void *opaque) |
1499 | { | 1499 | { |
1500 | struct reiserfs_iget_args *args; | 1500 | struct reiserfs_iget_args *args; |
1501 | 1501 | ||
1502 | args = opaque; | 1502 | args = opaque; |
1503 | /* args is already in CPU order */ | 1503 | /* args is already in CPU order */ |
1504 | return (inode->i_ino == args->objectid) && | 1504 | return (inode->i_ino == args->objectid) && |
1505 | (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid); | 1505 | (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid); |
1506 | } | 1506 | } |
1507 | 1507 | ||
1508 | struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key) | 1508 | struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key) |
1509 | { | 1509 | { |
1510 | struct inode *inode; | 1510 | struct inode *inode; |
1511 | struct reiserfs_iget_args args; | 1511 | struct reiserfs_iget_args args; |
1512 | 1512 | ||
1513 | args.objectid = key->on_disk_key.k_objectid; | 1513 | args.objectid = key->on_disk_key.k_objectid; |
1514 | args.dirid = key->on_disk_key.k_dir_id; | 1514 | args.dirid = key->on_disk_key.k_dir_id; |
1515 | inode = iget5_locked(s, key->on_disk_key.k_objectid, | 1515 | inode = iget5_locked(s, key->on_disk_key.k_objectid, |
1516 | reiserfs_find_actor, reiserfs_init_locked_inode, | 1516 | reiserfs_find_actor, reiserfs_init_locked_inode, |
1517 | (void *)(&args)); | 1517 | (void *)(&args)); |
1518 | if (!inode) | 1518 | if (!inode) |
1519 | return ERR_PTR(-ENOMEM); | 1519 | return ERR_PTR(-ENOMEM); |
1520 | 1520 | ||
1521 | if (inode->i_state & I_NEW) { | 1521 | if (inode->i_state & I_NEW) { |
1522 | reiserfs_read_locked_inode(inode, &args); | 1522 | reiserfs_read_locked_inode(inode, &args); |
1523 | unlock_new_inode(inode); | 1523 | unlock_new_inode(inode); |
1524 | } | 1524 | } |
1525 | 1525 | ||
1526 | if (comp_short_keys(INODE_PKEY(inode), key) || is_bad_inode(inode)) { | 1526 | if (comp_short_keys(INODE_PKEY(inode), key) || is_bad_inode(inode)) { |
1527 | /* either due to i/o error or a stale NFS handle */ | 1527 | /* either due to i/o error or a stale NFS handle */ |
1528 | iput(inode); | 1528 | iput(inode); |
1529 | inode = NULL; | 1529 | inode = NULL; |
1530 | } | 1530 | } |
1531 | return inode; | 1531 | return inode; |
1532 | } | 1532 | } |
1533 | 1533 | ||
1534 | struct dentry *reiserfs_get_dentry(struct super_block *sb, void *vobjp) | 1534 | struct dentry *reiserfs_get_dentry(struct super_block *sb, void *vobjp) |
1535 | { | 1535 | { |
1536 | __u32 *data = vobjp; | 1536 | __u32 *data = vobjp; |
1537 | struct cpu_key key; | 1537 | struct cpu_key key; |
1538 | struct dentry *result; | 1538 | struct dentry *result; |
1539 | struct inode *inode; | 1539 | struct inode *inode; |
1540 | 1540 | ||
1541 | key.on_disk_key.k_objectid = data[0]; | 1541 | key.on_disk_key.k_objectid = data[0]; |
1542 | key.on_disk_key.k_dir_id = data[1]; | 1542 | key.on_disk_key.k_dir_id = data[1]; |
1543 | reiserfs_write_lock(sb); | 1543 | reiserfs_write_lock(sb); |
1544 | inode = reiserfs_iget(sb, &key); | 1544 | inode = reiserfs_iget(sb, &key); |
1545 | if (inode && !IS_ERR(inode) && data[2] != 0 && | 1545 | if (inode && !IS_ERR(inode) && data[2] != 0 && |
1546 | data[2] != inode->i_generation) { | 1546 | data[2] != inode->i_generation) { |
1547 | iput(inode); | 1547 | iput(inode); |
1548 | inode = NULL; | 1548 | inode = NULL; |
1549 | } | 1549 | } |
1550 | reiserfs_write_unlock(sb); | 1550 | reiserfs_write_unlock(sb); |
1551 | if (!inode) | 1551 | if (!inode) |
1552 | inode = ERR_PTR(-ESTALE); | 1552 | inode = ERR_PTR(-ESTALE); |
1553 | if (IS_ERR(inode)) | 1553 | if (IS_ERR(inode)) |
1554 | return ERR_PTR(PTR_ERR(inode)); | 1554 | return ERR_PTR(PTR_ERR(inode)); |
1555 | result = d_alloc_anon(inode); | 1555 | result = d_alloc_anon(inode); |
1556 | if (!result) { | 1556 | if (!result) { |
1557 | iput(inode); | 1557 | iput(inode); |
1558 | return ERR_PTR(-ENOMEM); | 1558 | return ERR_PTR(-ENOMEM); |
1559 | } | 1559 | } |
1560 | return result; | 1560 | return result; |
1561 | } | 1561 | } |
1562 | 1562 | ||
1563 | struct dentry *reiserfs_decode_fh(struct super_block *sb, __u32 * data, | 1563 | struct dentry *reiserfs_decode_fh(struct super_block *sb, __u32 * data, |
1564 | int len, int fhtype, | 1564 | int len, int fhtype, |
1565 | int (*acceptable) (void *contect, | 1565 | int (*acceptable) (void *contect, |
1566 | struct dentry * de), | 1566 | struct dentry * de), |
1567 | void *context) | 1567 | void *context) |
1568 | { | 1568 | { |
1569 | __u32 obj[3], parent[3]; | 1569 | __u32 obj[3], parent[3]; |
1570 | 1570 | ||
1571 | /* fhtype happens to reflect the number of u32s encoded. | 1571 | /* fhtype happens to reflect the number of u32s encoded. |
1572 | * due to a bug in earlier code, fhtype might indicate there | 1572 | * due to a bug in earlier code, fhtype might indicate there |
1573 | * are more u32s then actually fitted. | 1573 | * are more u32s then actually fitted. |
1574 | * so if fhtype seems to be more than len, reduce fhtype. | 1574 | * so if fhtype seems to be more than len, reduce fhtype. |
1575 | * Valid types are: | 1575 | * Valid types are: |
1576 | * 2 - objectid + dir_id - legacy support | 1576 | * 2 - objectid + dir_id - legacy support |
1577 | * 3 - objectid + dir_id + generation | 1577 | * 3 - objectid + dir_id + generation |
1578 | * 4 - objectid + dir_id + objectid and dirid of parent - legacy | 1578 | * 4 - objectid + dir_id + objectid and dirid of parent - legacy |
1579 | * 5 - objectid + dir_id + generation + objectid and dirid of parent | 1579 | * 5 - objectid + dir_id + generation + objectid and dirid of parent |
1580 | * 6 - as above plus generation of directory | 1580 | * 6 - as above plus generation of directory |
1581 | * 6 does not fit in NFSv2 handles | 1581 | * 6 does not fit in NFSv2 handles |
1582 | */ | 1582 | */ |
1583 | if (fhtype > len) { | 1583 | if (fhtype > len) { |
1584 | if (fhtype != 6 || len != 5) | 1584 | if (fhtype != 6 || len != 5) |
1585 | reiserfs_warning(sb, | 1585 | reiserfs_warning(sb, |
1586 | "nfsd/reiserfs, fhtype=%d, len=%d - odd", | 1586 | "nfsd/reiserfs, fhtype=%d, len=%d - odd", |
1587 | fhtype, len); | 1587 | fhtype, len); |
1588 | fhtype = 5; | 1588 | fhtype = 5; |
1589 | } | 1589 | } |
1590 | 1590 | ||
1591 | obj[0] = data[0]; | 1591 | obj[0] = data[0]; |
1592 | obj[1] = data[1]; | 1592 | obj[1] = data[1]; |
1593 | if (fhtype == 3 || fhtype >= 5) | 1593 | if (fhtype == 3 || fhtype >= 5) |
1594 | obj[2] = data[2]; | 1594 | obj[2] = data[2]; |
1595 | else | 1595 | else |
1596 | obj[2] = 0; /* generation number */ | 1596 | obj[2] = 0; /* generation number */ |
1597 | 1597 | ||
1598 | if (fhtype >= 4) { | 1598 | if (fhtype >= 4) { |
1599 | parent[0] = data[fhtype >= 5 ? 3 : 2]; | 1599 | parent[0] = data[fhtype >= 5 ? 3 : 2]; |
1600 | parent[1] = data[fhtype >= 5 ? 4 : 3]; | 1600 | parent[1] = data[fhtype >= 5 ? 4 : 3]; |
1601 | if (fhtype == 6) | 1601 | if (fhtype == 6) |
1602 | parent[2] = data[5]; | 1602 | parent[2] = data[5]; |
1603 | else | 1603 | else |
1604 | parent[2] = 0; | 1604 | parent[2] = 0; |
1605 | } | 1605 | } |
1606 | return sb->s_export_op->find_exported_dentry(sb, obj, | 1606 | return sb->s_export_op->find_exported_dentry(sb, obj, |
1607 | fhtype < 4 ? NULL : parent, | 1607 | fhtype < 4 ? NULL : parent, |
1608 | acceptable, context); | 1608 | acceptable, context); |
1609 | } | 1609 | } |
1610 | 1610 | ||
1611 | int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp, | 1611 | int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp, |
1612 | int need_parent) | 1612 | int need_parent) |
1613 | { | 1613 | { |
1614 | struct inode *inode = dentry->d_inode; | 1614 | struct inode *inode = dentry->d_inode; |
1615 | int maxlen = *lenp; | 1615 | int maxlen = *lenp; |
1616 | 1616 | ||
1617 | if (maxlen < 3) | 1617 | if (maxlen < 3) |
1618 | return 255; | 1618 | return 255; |
1619 | 1619 | ||
1620 | data[0] = inode->i_ino; | 1620 | data[0] = inode->i_ino; |
1621 | data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); | 1621 | data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); |
1622 | data[2] = inode->i_generation; | 1622 | data[2] = inode->i_generation; |
1623 | *lenp = 3; | 1623 | *lenp = 3; |
1624 | /* no room for directory info? return what we've stored so far */ | 1624 | /* no room for directory info? return what we've stored so far */ |
1625 | if (maxlen < 5 || !need_parent) | 1625 | if (maxlen < 5 || !need_parent) |
1626 | return 3; | 1626 | return 3; |
1627 | 1627 | ||
1628 | spin_lock(&dentry->d_lock); | 1628 | spin_lock(&dentry->d_lock); |
1629 | inode = dentry->d_parent->d_inode; | 1629 | inode = dentry->d_parent->d_inode; |
1630 | data[3] = inode->i_ino; | 1630 | data[3] = inode->i_ino; |
1631 | data[4] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); | 1631 | data[4] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); |
1632 | *lenp = 5; | 1632 | *lenp = 5; |
1633 | if (maxlen >= 6) { | 1633 | if (maxlen >= 6) { |
1634 | data[5] = inode->i_generation; | 1634 | data[5] = inode->i_generation; |
1635 | *lenp = 6; | 1635 | *lenp = 6; |
1636 | } | 1636 | } |
1637 | spin_unlock(&dentry->d_lock); | 1637 | spin_unlock(&dentry->d_lock); |
1638 | return *lenp; | 1638 | return *lenp; |
1639 | } | 1639 | } |
1640 | 1640 | ||
1641 | /* looks for stat data, then copies fields to it, marks the buffer | 1641 | /* looks for stat data, then copies fields to it, marks the buffer |
1642 | containing stat data as dirty */ | 1642 | containing stat data as dirty */ |
1643 | /* reiserfs inodes are never really dirty, since the dirty inode call | 1643 | /* reiserfs inodes are never really dirty, since the dirty inode call |
1644 | ** always logs them. This call allows the VFS inode marking routines | 1644 | ** always logs them. This call allows the VFS inode marking routines |
1645 | ** to properly mark inodes for datasync and such, but only actually | 1645 | ** to properly mark inodes for datasync and such, but only actually |
1646 | ** does something when called for a synchronous update. | 1646 | ** does something when called for a synchronous update. |
1647 | */ | 1647 | */ |
1648 | int reiserfs_write_inode(struct inode *inode, int do_sync) | 1648 | int reiserfs_write_inode(struct inode *inode, int do_sync) |
1649 | { | 1649 | { |
1650 | struct reiserfs_transaction_handle th; | 1650 | struct reiserfs_transaction_handle th; |
1651 | int jbegin_count = 1; | 1651 | int jbegin_count = 1; |
1652 | 1652 | ||
1653 | if (inode->i_sb->s_flags & MS_RDONLY) | 1653 | if (inode->i_sb->s_flags & MS_RDONLY) |
1654 | return -EROFS; | 1654 | return -EROFS; |
1655 | /* memory pressure can sometimes initiate write_inode calls with sync == 1, | 1655 | /* memory pressure can sometimes initiate write_inode calls with sync == 1, |
1656 | ** these cases are just when the system needs ram, not when the | 1656 | ** these cases are just when the system needs ram, not when the |
1657 | ** inode needs to reach disk for safety, and they can safely be | 1657 | ** inode needs to reach disk for safety, and they can safely be |
1658 | ** ignored because the altered inode has already been logged. | 1658 | ** ignored because the altered inode has already been logged. |
1659 | */ | 1659 | */ |
1660 | if (do_sync && !(current->flags & PF_MEMALLOC)) { | 1660 | if (do_sync && !(current->flags & PF_MEMALLOC)) { |
1661 | reiserfs_write_lock(inode->i_sb); | 1661 | reiserfs_write_lock(inode->i_sb); |
1662 | if (!journal_begin(&th, inode->i_sb, jbegin_count)) { | 1662 | if (!journal_begin(&th, inode->i_sb, jbegin_count)) { |
1663 | reiserfs_update_sd(&th, inode); | 1663 | reiserfs_update_sd(&th, inode); |
1664 | journal_end_sync(&th, inode->i_sb, jbegin_count); | 1664 | journal_end_sync(&th, inode->i_sb, jbegin_count); |
1665 | } | 1665 | } |
1666 | reiserfs_write_unlock(inode->i_sb); | 1666 | reiserfs_write_unlock(inode->i_sb); |
1667 | } | 1667 | } |
1668 | return 0; | 1668 | return 0; |
1669 | } | 1669 | } |
1670 | 1670 | ||
1671 | /* stat data of new object is inserted already, this inserts the item | 1671 | /* stat data of new object is inserted already, this inserts the item |
1672 | containing "." and ".." entries */ | 1672 | containing "." and ".." entries */ |
1673 | static int reiserfs_new_directory(struct reiserfs_transaction_handle *th, | 1673 | static int reiserfs_new_directory(struct reiserfs_transaction_handle *th, |
1674 | struct inode *inode, | 1674 | struct inode *inode, |
1675 | struct item_head *ih, struct path *path, | 1675 | struct item_head *ih, struct path *path, |
1676 | struct inode *dir) | 1676 | struct inode *dir) |
1677 | { | 1677 | { |
1678 | struct super_block *sb = th->t_super; | 1678 | struct super_block *sb = th->t_super; |
1679 | char empty_dir[EMPTY_DIR_SIZE]; | 1679 | char empty_dir[EMPTY_DIR_SIZE]; |
1680 | char *body = empty_dir; | 1680 | char *body = empty_dir; |
1681 | struct cpu_key key; | 1681 | struct cpu_key key; |
1682 | int retval; | 1682 | int retval; |
1683 | 1683 | ||
1684 | BUG_ON(!th->t_trans_id); | 1684 | BUG_ON(!th->t_trans_id); |
1685 | 1685 | ||
1686 | _make_cpu_key(&key, KEY_FORMAT_3_5, le32_to_cpu(ih->ih_key.k_dir_id), | 1686 | _make_cpu_key(&key, KEY_FORMAT_3_5, le32_to_cpu(ih->ih_key.k_dir_id), |
1687 | le32_to_cpu(ih->ih_key.k_objectid), DOT_OFFSET, | 1687 | le32_to_cpu(ih->ih_key.k_objectid), DOT_OFFSET, |
1688 | TYPE_DIRENTRY, 3 /*key length */ ); | 1688 | TYPE_DIRENTRY, 3 /*key length */ ); |
1689 | 1689 | ||
1690 | /* compose item head for new item. Directories consist of items of | 1690 | /* compose item head for new item. Directories consist of items of |
1691 | old type (ITEM_VERSION_1). Do not set key (second arg is 0), it | 1691 | old type (ITEM_VERSION_1). Do not set key (second arg is 0), it |
1692 | is done by reiserfs_new_inode */ | 1692 | is done by reiserfs_new_inode */ |
1693 | if (old_format_only(sb)) { | 1693 | if (old_format_only(sb)) { |
1694 | make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, | 1694 | make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, |
1695 | TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2); | 1695 | TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2); |
1696 | 1696 | ||
1697 | make_empty_dir_item_v1(body, ih->ih_key.k_dir_id, | 1697 | make_empty_dir_item_v1(body, ih->ih_key.k_dir_id, |
1698 | ih->ih_key.k_objectid, | 1698 | ih->ih_key.k_objectid, |
1699 | INODE_PKEY(dir)->k_dir_id, | 1699 | INODE_PKEY(dir)->k_dir_id, |
1700 | INODE_PKEY(dir)->k_objectid); | 1700 | INODE_PKEY(dir)->k_objectid); |
1701 | } else { | 1701 | } else { |
1702 | make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, | 1702 | make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, |
1703 | TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2); | 1703 | TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2); |
1704 | 1704 | ||
1705 | make_empty_dir_item(body, ih->ih_key.k_dir_id, | 1705 | make_empty_dir_item(body, ih->ih_key.k_dir_id, |
1706 | ih->ih_key.k_objectid, | 1706 | ih->ih_key.k_objectid, |
1707 | INODE_PKEY(dir)->k_dir_id, | 1707 | INODE_PKEY(dir)->k_dir_id, |
1708 | INODE_PKEY(dir)->k_objectid); | 1708 | INODE_PKEY(dir)->k_objectid); |
1709 | } | 1709 | } |
1710 | 1710 | ||
1711 | /* look for place in the tree for new item */ | 1711 | /* look for place in the tree for new item */ |
1712 | retval = search_item(sb, &key, path); | 1712 | retval = search_item(sb, &key, path); |
1713 | if (retval == IO_ERROR) { | 1713 | if (retval == IO_ERROR) { |
1714 | reiserfs_warning(sb, "vs-13080: reiserfs_new_directory: " | 1714 | reiserfs_warning(sb, "vs-13080: reiserfs_new_directory: " |
1715 | "i/o failure occurred creating new directory"); | 1715 | "i/o failure occurred creating new directory"); |
1716 | return -EIO; | 1716 | return -EIO; |
1717 | } | 1717 | } |
1718 | if (retval == ITEM_FOUND) { | 1718 | if (retval == ITEM_FOUND) { |
1719 | pathrelse(path); | 1719 | pathrelse(path); |
1720 | reiserfs_warning(sb, "vs-13070: reiserfs_new_directory: " | 1720 | reiserfs_warning(sb, "vs-13070: reiserfs_new_directory: " |
1721 | "object with this key exists (%k)", | 1721 | "object with this key exists (%k)", |
1722 | &(ih->ih_key)); | 1722 | &(ih->ih_key)); |
1723 | return -EEXIST; | 1723 | return -EEXIST; |
1724 | } | 1724 | } |
1725 | 1725 | ||
1726 | /* insert item, that is empty directory item */ | 1726 | /* insert item, that is empty directory item */ |
1727 | return reiserfs_insert_item(th, path, &key, ih, inode, body); | 1727 | return reiserfs_insert_item(th, path, &key, ih, inode, body); |
1728 | } | 1728 | } |
1729 | 1729 | ||
1730 | /* stat data of object has been inserted, this inserts the item | 1730 | /* stat data of object has been inserted, this inserts the item |
1731 | containing the body of symlink */ | 1731 | containing the body of symlink */ |
1732 | static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct inode *inode, /* Inode of symlink */ | 1732 | static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct inode *inode, /* Inode of symlink */ |
1733 | struct item_head *ih, | 1733 | struct item_head *ih, |
1734 | struct path *path, const char *symname, | 1734 | struct path *path, const char *symname, |
1735 | int item_len) | 1735 | int item_len) |
1736 | { | 1736 | { |
1737 | struct super_block *sb = th->t_super; | 1737 | struct super_block *sb = th->t_super; |
1738 | struct cpu_key key; | 1738 | struct cpu_key key; |
1739 | int retval; | 1739 | int retval; |
1740 | 1740 | ||
1741 | BUG_ON(!th->t_trans_id); | 1741 | BUG_ON(!th->t_trans_id); |
1742 | 1742 | ||
1743 | _make_cpu_key(&key, KEY_FORMAT_3_5, | 1743 | _make_cpu_key(&key, KEY_FORMAT_3_5, |
1744 | le32_to_cpu(ih->ih_key.k_dir_id), | 1744 | le32_to_cpu(ih->ih_key.k_dir_id), |
1745 | le32_to_cpu(ih->ih_key.k_objectid), | 1745 | le32_to_cpu(ih->ih_key.k_objectid), |
1746 | 1, TYPE_DIRECT, 3 /*key length */ ); | 1746 | 1, TYPE_DIRECT, 3 /*key length */ ); |
1747 | 1747 | ||
1748 | make_le_item_head(ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len, | 1748 | make_le_item_head(ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len, |
1749 | 0 /*free_space */ ); | 1749 | 0 /*free_space */ ); |
1750 | 1750 | ||
1751 | /* look for place in the tree for new item */ | 1751 | /* look for place in the tree for new item */ |
1752 | retval = search_item(sb, &key, path); | 1752 | retval = search_item(sb, &key, path); |
1753 | if (retval == IO_ERROR) { | 1753 | if (retval == IO_ERROR) { |
1754 | reiserfs_warning(sb, "vs-13080: reiserfs_new_symlinik: " | 1754 | reiserfs_warning(sb, "vs-13080: reiserfs_new_symlinik: " |
1755 | "i/o failure occurred creating new symlink"); | 1755 | "i/o failure occurred creating new symlink"); |
1756 | return -EIO; | 1756 | return -EIO; |
1757 | } | 1757 | } |
1758 | if (retval == ITEM_FOUND) { | 1758 | if (retval == ITEM_FOUND) { |
1759 | pathrelse(path); | 1759 | pathrelse(path); |
1760 | reiserfs_warning(sb, "vs-13080: reiserfs_new_symlink: " | 1760 | reiserfs_warning(sb, "vs-13080: reiserfs_new_symlink: " |
1761 | "object with this key exists (%k)", | 1761 | "object with this key exists (%k)", |
1762 | &(ih->ih_key)); | 1762 | &(ih->ih_key)); |
1763 | return -EEXIST; | 1763 | return -EEXIST; |
1764 | } | 1764 | } |
1765 | 1765 | ||
1766 | /* insert item, that is body of symlink */ | 1766 | /* insert item, that is body of symlink */ |
1767 | return reiserfs_insert_item(th, path, &key, ih, inode, symname); | 1767 | return reiserfs_insert_item(th, path, &key, ih, inode, symname); |
1768 | } | 1768 | } |
1769 | 1769 | ||
/* Inserts the stat data of a freshly created object into the tree, and
   then calls reiserfs_new_directory (to insert ".", ".." items if the new
   object is a directory) or reiserfs_new_symlink (to insert the symlink
   body if the new object is a symlink) or nothing (if the new object is a
   regular file).

   th      - active transaction handle; on any failure the transaction is
             ended here and th->t_trans_id is zeroed so the caller cannot
             reuse the handle
   dir     - parent directory (used for packing locality, generation,
             inherited attrs/ACLs); must be non-NULL with i_nlink > 0
   mode    - file type + permission bits of the new object
   symname - symlink target (only used when S_ISLNK(mode))
   i_size  - 0 for regular files, EMPTY_DIR_SIZE for dirs,
             strlen(symname) for symlinks
   dentry  - dentry of the new object (passed through to ACL inheritance)
   inode   - partially set up in-core inode; uid/gid must already be set
             by the caller (quota init depends on them)

   NOTE! uid and gid must already be set in the inode.  If we return
   non-zero due to an error, we have to drop the quota previously allocated
   for the fresh inode.  This can only be done outside a transaction, so
   if we return non-zero, we also end the transaction.

   Returns 0 on success or a negative errno; on failure the inode has been
   made bad and iput() already called on it. */
int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
		       struct inode *dir, int mode, const char *symname,
		       /* 0 for regular, EMTRY_DIR_SIZE for dirs,
		          strlen (symname) for symlinks) */
		       loff_t i_size, struct dentry *dentry,
		       struct inode *inode)
{
	struct super_block *sb;
	INITIALIZE_PATH(path_to_key);
	struct cpu_key key;
	struct item_head ih;
	struct stat_data sd;
	int retval;
	int err;

	BUG_ON(!th->t_trans_id);

	/* charge the new inode against the owner's quota first; on failure
	 * nothing has been allocated yet, so only the transaction needs
	 * ending (out_end_trans, not out_bad_inode) */
	if (DQUOT_ALLOC_INODE(inode)) {
		err = -EDQUOT;
		goto out_end_trans;
	}
	if (!dir || !dir->i_nlink) {
		/* refuse to create anything in an unlinked directory */
		err = -EPERM;
		goto out_bad_inode;
	}

	sb = dir->i_sb;

	/* item head of new item */
	ih.ih_key.k_dir_id = reiserfs_choose_packing(dir);
	ih.ih_key.k_objectid = cpu_to_le32(reiserfs_get_unused_objectid(th));
	if (!ih.ih_key.k_objectid) {
		/* objectid map exhausted / could not be grown */
		err = -ENOMEM;
		goto out_bad_inode;
	}
	if (old_format_only(sb))
		/* not a perfect generation count, as object ids can be reused, but
		 ** this is as good as reiserfs can do right now.
		 ** note that the private part of inode isn't filled in yet, we have
		 ** to use the directory.
		 */
		inode->i_generation = le32_to_cpu(INODE_PKEY(dir)->k_objectid);
	else
#if defined( USE_INODE_GENERATION_COUNTER )
		inode->i_generation =
		    le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation);
#else
		inode->i_generation = ++event;
#endif

	/* fill stat data */
	/* directories start with 2 links ("." plus the parent's entry) */
	inode->i_nlink = (S_ISDIR(mode) ? 2 : 1);

	/* uid and gid must already be set by the caller for quota init */

	/* symlink cannot be immutable or append only, right? */
	if (S_ISLNK(inode->i_mode))
		inode->i_flags &= ~(S_IMMUTABLE | S_APPEND);

	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
	inode->i_size = i_size;
	inode->i_blocks = 0;
	inode->i_bytes = 0;
	/* symlinks are stored as direct items from byte 1; everything else
	 * starts with no direct (tail) bytes at all */
	REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 :
	    U32_MAX /*NO_BYTES_IN_DIRECT_ITEM */ ;

	INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list));
	REISERFS_I(inode)->i_flags = 0;
	REISERFS_I(inode)->i_prealloc_block = 0;
	REISERFS_I(inode)->i_prealloc_count = 0;
	REISERFS_I(inode)->i_trans_id = 0;
	REISERFS_I(inode)->i_jl = NULL;
	/* inherit the inheritable attribute bits from the parent dir */
	REISERFS_I(inode)->i_attrs =
	    REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
	sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode);
	REISERFS_I(inode)->i_acl_access = NULL;
	REISERFS_I(inode)->i_acl_default = NULL;
	init_rwsem(&REISERFS_I(inode)->xattr_sem);

	/* old (3.5) filesystems get v1 stat data, new ones get v2 */
	if (old_format_only(sb))
		make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
				  TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
	else
		make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
				  TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);

	/* key to search for correct place for new stat data */
	_make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id),
		      le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET,
		      TYPE_STAT_DATA, 3 /*key length */ );

	/* find proper place for inserting of stat data */
	retval = search_item(sb, &key, &path_to_key);
	if (retval == IO_ERROR) {
		err = -EIO;
		goto out_bad_inode;
	}
	if (retval == ITEM_FOUND) {
		pathrelse(&path_to_key);
		err = -EEXIST;
		goto out_bad_inode;
	}
	if (old_format_only(sb)) {
		if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) {
			pathrelse(&path_to_key);
			/* i_uid or i_gid is too big to be stored in stat data v3.5 */
			err = -EINVAL;
			goto out_bad_inode;
		}
		inode2sd_v1(&sd, inode, inode->i_size);
	} else {
		inode2sd(&sd, inode, inode->i_size);
	}
	// these do not go to on-disk stat data
	inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
	inode->i_blksize = reiserfs_default_io_size;

	// store in in-core inode the key of stat data and version all
	// object items will have (directory items will have old offset
	// format, other new objects will consist of new items)
	memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
	if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode))
		set_inode_item_key_version(inode, KEY_FORMAT_3_5);
	else
		set_inode_item_key_version(inode, KEY_FORMAT_3_6);
	if (old_format_only(sb))
		set_inode_sd_version(inode, STAT_DATA_V1);
	else
		set_inode_sd_version(inode, STAT_DATA_V2);

	/* insert the stat data into the tree */
#ifdef DISPLACE_NEW_PACKING_LOCALITIES
	if (REISERFS_I(dir)->new_packing_locality)
		th->displace_new_blocks = 1;
#endif
	retval =
	    reiserfs_insert_item(th, &path_to_key, &key, &ih, inode,
				 (char *)(&sd));
	if (retval) {
		err = retval;
		reiserfs_check_path(&path_to_key);
		goto out_bad_inode;
	}
#ifdef DISPLACE_NEW_PACKING_LOCALITIES
	if (!th->displace_new_blocks)
		REISERFS_I(dir)->new_packing_locality = 0;
#endif
	if (S_ISDIR(mode)) {
		/* insert item with "." and ".." */
		retval =
		    reiserfs_new_directory(th, inode, &ih, &path_to_key, dir);
	}

	if (S_ISLNK(mode)) {
		/* insert body of symlink */
		if (!old_format_only(sb))
			i_size = ROUND_UP(i_size);
		retval =
		    reiserfs_new_symlink(th, inode, &ih, &path_to_key, symname,
					 i_size);
	}
	if (retval) {
		/* stat data is on disk now, so the full teardown path
		 * (out_inserted_sd) is needed, and the transaction must be
		 * ended before iput can run */
		err = retval;
		reiserfs_check_path(&path_to_key);
		journal_end(th, th->t_super, th->t_blocks_allocated);
		goto out_inserted_sd;
	}

	/* XXX CHECK THIS */
	if (reiserfs_posixacl(inode->i_sb)) {
		retval = reiserfs_inherit_default_acl(dir, dentry, inode);
		if (retval) {
			err = retval;
			reiserfs_check_path(&path_to_key);
			journal_end(th, th->t_super, th->t_blocks_allocated);
			goto out_inserted_sd;
		}
	} else if (inode->i_sb->s_flags & MS_POSIXACL) {
		/* mount flags and compiled-in feature set disagree */
		reiserfs_warning(inode->i_sb, "ACLs aren't enabled in the fs, "
				 "but vfs thinks they are!");
	} else if (is_reiserfs_priv_object(dir)) {
		/* objects under the xattr root are internal to reiserfs */
		reiserfs_mark_inode_private(inode);
	}

	insert_inode_hash(inode);
	reiserfs_update_sd(th, inode);
	reiserfs_check_path(&path_to_key);

	return 0;

/* it looks like you can easily compress these two goto targets into
 * one.  Keeping it like this doesn't actually hurt anything, and they
 * are place holders for what the quota code actually needs.
 */
      out_bad_inode:
	/* Invalidate the object, nothing was inserted yet */
	INODE_PKEY(inode)->k_objectid = 0;

	/* Quota change must be inside a transaction for journaling */
	DQUOT_FREE_INODE(inode);

      out_end_trans:
	journal_end(th, th->t_super, th->t_blocks_allocated);
	/* Drop can be outside and it needs more credits so it's better to have it outside */
	DQUOT_DROP(inode);
	inode->i_flags |= S_NOQUOTA;
	make_bad_inode(inode);

      out_inserted_sd:
	inode->i_nlink = 0;
	th->t_trans_id = 0;	/* so the caller can't use this handle later */

	/* If we were inheriting an ACL, we need to release the lock so that
	 * iput doesn't deadlock in reiserfs_delete_xattrs. The locking
	 * code really needs to be reworked, but this will take care of it
	 * for now. -jeffm */
	if (REISERFS_I(dir)->i_acl_default && !IS_ERR(REISERFS_I(dir)->i_acl_default)) {
		reiserfs_write_unlock_xattrs(dir->i_sb);
		iput(inode);
		reiserfs_write_lock_xattrs(dir->i_sb);
	} else
		iput(inode);
	return err;
}
2003 | 2003 | ||
/*
** finds the tail page in the page cache,
** reads the last block in.
**
** On success, page_result is set to a locked, pinned page, and bh_result
** is set to an up to date buffer for the last block in the file. returns 0.
** The caller owns the page lock and reference and must unlock_page() +
** page_cache_release() when done.
**
** tail conversion is not done, so bh_result might not be valid for writing
** check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before
** trying to write the block.
**
** on failure, nonzero is returned, page_result and bh_result are untouched.
*/
static int grab_tail_page(struct inode *p_s_inode,
			  struct page **page_result,
			  struct buffer_head **bh_result)
{

	/* we want the page with the last byte in the file,
	 ** not the page that will hold the next byte for appending
	 */
	unsigned long index = (p_s_inode->i_size - 1) >> PAGE_CACHE_SHIFT;
	unsigned long pos = 0;
	unsigned long start = 0;
	unsigned long blocksize = p_s_inode->i_sb->s_blocksize;
	unsigned long offset = (p_s_inode->i_size) & (PAGE_CACHE_SIZE - 1);
	struct buffer_head *bh;
	struct buffer_head *head;
	struct page *page;
	int error;

	/* we know that we are only called with inode->i_size > 0.
	 ** we also know that a file tail can never be as big as a block
	 ** If i_size % blocksize == 0, our file is currently block aligned
	 ** and it won't need converting or zeroing after a truncate.
	 */
	if ((offset & (blocksize - 1)) == 0) {
		return -ENOENT;
	}
	/* returns the page locked with an elevated refcount, or NULL */
	page = grab_cache_page(p_s_inode->i_mapping, index);
	error = -ENOMEM;
	if (!page) {
		goto out;
	}
	/* start within the page of the last block in the file */
	start = (offset / blocksize) * blocksize;

	/* maps (and reads in, if needed) the buffers covering the tail;
	 * create_0 means no new blocks are allocated here */
	error = block_prepare_write(page, start, offset,
				    reiserfs_get_block_create_0);
	if (error)
		goto unlock;

	/* walk the circular buffer list to the buffer at offset 'start' */
	head = page_buffers(page);
	bh = head;
	do {
		if (pos >= start) {
			break;
		}
		bh = bh->b_this_page;
		pos += blocksize;
	} while (bh != head);

	if (!buffer_uptodate(bh)) {
		/* note, this should never happen, prepare_write should
		 ** be taking care of this for us. If the buffer isn't up to date,
		 ** I've screwed up the code to find the buffer, or the code to
		 ** call prepare_write
		 */
		reiserfs_warning(p_s_inode->i_sb,
				 "clm-6000: error reading block %lu on dev %s",
				 bh->b_blocknr,
				 reiserfs_bdevname(p_s_inode->i_sb));
		error = -EIO;
		goto unlock;
	}
	/* success: page stays locked and referenced for the caller */
	*bh_result = bh;
	*page_result = page;

      out:
	return error;

      unlock:
	unlock_page(page);
	page_cache_release(page);
	return error;
}
2090 | 2090 | ||
/*
** vfs version of truncate file.  Must NOT be called with
** a transaction already started.
**
** p_s_inode         - inode being truncated; i_size already holds the
**                     new size when we get here
** update_timestamps - nonzero for a real truncate (adds a "save" link so
**                     a crash mid-truncate can be recovered on reboot);
**                     zero when called from inode eviction paths
**
** Returns 0 on success or a negative errno.
**
** some code taken from block_truncate_page
*/
int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps)
{
	struct reiserfs_transaction_handle th;
	/* we want the offset for the first byte after the end of the file */
	unsigned long offset = p_s_inode->i_size & (PAGE_CACHE_SIZE - 1);
	unsigned blocksize = p_s_inode->i_sb->s_blocksize;
	unsigned length;
	struct page *page = NULL;
	int error;
	struct buffer_head *bh = NULL;
	int err2;

	reiserfs_write_lock(p_s_inode->i_sb);

	if (p_s_inode->i_size > 0) {
		if ((error = grab_tail_page(p_s_inode, &page, &bh))) {
			// -ENOENT means we truncated past the end of the file, 
			// and get_block_create_0 could not find a block to read in,
			// which is ok.
			if (error != -ENOENT)
				reiserfs_warning(p_s_inode->i_sb,
						 "clm-6001: grab_tail_page failed %d",
						 error);
			/* proceed without a tail page; zeroing is skipped */
			page = NULL;
			bh = NULL;
		}
	}

	/* so, if page != NULL, we have a buffer head for the offset at 
	 ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0, 
	 ** then we have an unformatted node.  Otherwise, we have a direct item, 
	 ** and no zeroing is required on disk.  We zero after the truncate, 
	 ** because the truncate might pack the item anyway 
	 ** (it will unmap bh if it packs).
	 */
	/* it is enough to reserve space in transaction for 2 balancings:
	   one for "save" link adding and another for the first
	   cut_from_item. 1 is for update_sd */
	error = journal_begin(&th, p_s_inode->i_sb,
			      JOURNAL_PER_BALANCE_CNT * 2 + 1);
	if (error)
		goto out;
	reiserfs_update_inode_transaction(p_s_inode);
	if (update_timestamps)
		/* we are doing real truncate: if the system crashes before the last
		   transaction of truncating gets committed - on reboot the file
		   either appears truncated properly or not truncated at all */
		add_save_link(&th, p_s_inode, 1);
	err2 = reiserfs_do_truncate(&th, p_s_inode, page, update_timestamps);
	/* always end the transaction, even if the truncate itself failed */
	error =
	    journal_end(&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1);
	if (error)
		goto out;

	/* check reiserfs_do_truncate after ending the transaction */
	if (err2) {
		error = err2;
		goto out;
	}
	
	if (update_timestamps) {
		/* truncate committed; the crash-recovery link can go away */
		error = remove_save_link(p_s_inode, 1 /* truncate */ );
		if (error)
			goto out;
	}

	if (page) {
		length = offset & (blocksize - 1);
		/* if we are not on a block boundary */
		if (length) {
			char *kaddr;

			/* zero the part of the last block past EOF so stale
			 * data never leaks back in via mmap/read */
			length = blocksize - length;
			kaddr = kmap_atomic(page, KM_USER0);
			memset(kaddr + offset, 0, length);
			flush_dcache_page(page);
			kunmap_atomic(kaddr, KM_USER0);
			if (buffer_mapped(bh) && bh->b_blocknr != 0) {
				/* unformatted node survived the truncate:
				 * push the zeroes to disk */
				mark_buffer_dirty(bh);
			}
		}
		/* release the lock/ref grab_tail_page handed us */
		unlock_page(page);
		page_cache_release(page);
	}

	reiserfs_write_unlock(p_s_inode->i_sb);
	return 0;
      out:
	if (page) {
		unlock_page(page);
		page_cache_release(page);
	}
	reiserfs_write_unlock(p_s_inode->i_sb);
	return error;
}
2192 | 2192 | ||
2193 | static int map_block_for_writepage(struct inode *inode, | 2193 | static int map_block_for_writepage(struct inode *inode, |
2194 | struct buffer_head *bh_result, | 2194 | struct buffer_head *bh_result, |
2195 | unsigned long block) | 2195 | unsigned long block) |
2196 | { | 2196 | { |
2197 | struct reiserfs_transaction_handle th; | 2197 | struct reiserfs_transaction_handle th; |
2198 | int fs_gen; | 2198 | int fs_gen; |
2199 | struct item_head tmp_ih; | 2199 | struct item_head tmp_ih; |
2200 | struct item_head *ih; | 2200 | struct item_head *ih; |
2201 | struct buffer_head *bh; | 2201 | struct buffer_head *bh; |
2202 | __le32 *item; | 2202 | __le32 *item; |
2203 | struct cpu_key key; | 2203 | struct cpu_key key; |
2204 | INITIALIZE_PATH(path); | 2204 | INITIALIZE_PATH(path); |
2205 | int pos_in_item; | 2205 | int pos_in_item; |
2206 | int jbegin_count = JOURNAL_PER_BALANCE_CNT; | 2206 | int jbegin_count = JOURNAL_PER_BALANCE_CNT; |
2207 | loff_t byte_offset = ((loff_t)block << inode->i_sb->s_blocksize_bits)+1; | 2207 | loff_t byte_offset = ((loff_t)block << inode->i_sb->s_blocksize_bits)+1; |
2208 | int retval; | 2208 | int retval; |
2209 | int use_get_block = 0; | 2209 | int use_get_block = 0; |
2210 | int bytes_copied = 0; | 2210 | int bytes_copied = 0; |
2211 | int copy_size; | 2211 | int copy_size; |
2212 | int trans_running = 0; | 2212 | int trans_running = 0; |
2213 | 2213 | ||
2214 | /* catch places below that try to log something without starting a trans */ | 2214 | /* catch places below that try to log something without starting a trans */ |
2215 | th.t_trans_id = 0; | 2215 | th.t_trans_id = 0; |
2216 | 2216 | ||
2217 | if (!buffer_uptodate(bh_result)) { | 2217 | if (!buffer_uptodate(bh_result)) { |
2218 | return -EIO; | 2218 | return -EIO; |
2219 | } | 2219 | } |
2220 | 2220 | ||
2221 | kmap(bh_result->b_page); | 2221 | kmap(bh_result->b_page); |
2222 | start_over: | 2222 | start_over: |
2223 | reiserfs_write_lock(inode->i_sb); | 2223 | reiserfs_write_lock(inode->i_sb); |
2224 | make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3); | 2224 | make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3); |
2225 | 2225 | ||
2226 | research: | 2226 | research: |
2227 | retval = search_for_position_by_key(inode->i_sb, &key, &path); | 2227 | retval = search_for_position_by_key(inode->i_sb, &key, &path); |
2228 | if (retval != POSITION_FOUND) { | 2228 | if (retval != POSITION_FOUND) { |
2229 | use_get_block = 1; | 2229 | use_get_block = 1; |
2230 | goto out; | 2230 | goto out; |
2231 | } | 2231 | } |
2232 | 2232 | ||
2233 | bh = get_last_bh(&path); | 2233 | bh = get_last_bh(&path); |
2234 | ih = get_ih(&path); | 2234 | ih = get_ih(&path); |
2235 | item = get_item(&path); | 2235 | item = get_item(&path); |
2236 | pos_in_item = path.pos_in_item; | 2236 | pos_in_item = path.pos_in_item; |
2237 | 2237 | ||
2238 | /* we've found an unformatted node */ | 2238 | /* we've found an unformatted node */ |
2239 | if (indirect_item_found(retval, ih)) { | 2239 | if (indirect_item_found(retval, ih)) { |
2240 | if (bytes_copied > 0) { | 2240 | if (bytes_copied > 0) { |
2241 | reiserfs_warning(inode->i_sb, | 2241 | reiserfs_warning(inode->i_sb, |
2242 | "clm-6002: bytes_copied %d", | 2242 | "clm-6002: bytes_copied %d", |
2243 | bytes_copied); | 2243 | bytes_copied); |
2244 | } | 2244 | } |
2245 | if (!get_block_num(item, pos_in_item)) { | 2245 | if (!get_block_num(item, pos_in_item)) { |
2246 | /* crap, we are writing to a hole */ | 2246 | /* crap, we are writing to a hole */ |
2247 | use_get_block = 1; | 2247 | use_get_block = 1; |
2248 | goto out; | 2248 | goto out; |
2249 | } | 2249 | } |
2250 | set_block_dev_mapped(bh_result, | 2250 | set_block_dev_mapped(bh_result, |
2251 | get_block_num(item, pos_in_item), inode); | 2251 | get_block_num(item, pos_in_item), inode); |
2252 | } else if (is_direct_le_ih(ih)) { | 2252 | } else if (is_direct_le_ih(ih)) { |
2253 | char *p; | 2253 | char *p; |
2254 | p = page_address(bh_result->b_page); | 2254 | p = page_address(bh_result->b_page); |
2255 | p += (byte_offset - 1) & (PAGE_CACHE_SIZE - 1); | 2255 | p += (byte_offset - 1) & (PAGE_CACHE_SIZE - 1); |
2256 | copy_size = ih_item_len(ih) - pos_in_item; | 2256 | copy_size = ih_item_len(ih) - pos_in_item; |
2257 | 2257 | ||
2258 | fs_gen = get_generation(inode->i_sb); | 2258 | fs_gen = get_generation(inode->i_sb); |
2259 | copy_item_head(&tmp_ih, ih); | 2259 | copy_item_head(&tmp_ih, ih); |
2260 | 2260 | ||
2261 | if (!trans_running) { | 2261 | if (!trans_running) { |
2262 | /* vs-3050 is gone, no need to drop the path */ | 2262 | /* vs-3050 is gone, no need to drop the path */ |
2263 | retval = journal_begin(&th, inode->i_sb, jbegin_count); | 2263 | retval = journal_begin(&th, inode->i_sb, jbegin_count); |
2264 | if (retval) | 2264 | if (retval) |
2265 | goto out; | 2265 | goto out; |
2266 | reiserfs_update_inode_transaction(inode); | 2266 | reiserfs_update_inode_transaction(inode); |
2267 | trans_running = 1; | 2267 | trans_running = 1; |
2268 | if (fs_changed(fs_gen, inode->i_sb) | 2268 | if (fs_changed(fs_gen, inode->i_sb) |
2269 | && item_moved(&tmp_ih, &path)) { | 2269 | && item_moved(&tmp_ih, &path)) { |
2270 | reiserfs_restore_prepared_buffer(inode->i_sb, | 2270 | reiserfs_restore_prepared_buffer(inode->i_sb, |
2271 | bh); | 2271 | bh); |
2272 | goto research; | 2272 | goto research; |
2273 | } | 2273 | } |
2274 | } | 2274 | } |
2275 | 2275 | ||
2276 | reiserfs_prepare_for_journal(inode->i_sb, bh, 1); | 2276 | reiserfs_prepare_for_journal(inode->i_sb, bh, 1); |
2277 | 2277 | ||
2278 | if (fs_changed(fs_gen, inode->i_sb) | 2278 | if (fs_changed(fs_gen, inode->i_sb) |
2279 | && item_moved(&tmp_ih, &path)) { | 2279 | && item_moved(&tmp_ih, &path)) { |
2280 | reiserfs_restore_prepared_buffer(inode->i_sb, bh); | 2280 | reiserfs_restore_prepared_buffer(inode->i_sb, bh); |
2281 | goto research; | 2281 | goto research; |
2282 | } | 2282 | } |
2283 | 2283 | ||
2284 | memcpy(B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied, | 2284 | memcpy(B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied, |
2285 | copy_size); | 2285 | copy_size); |
2286 | 2286 | ||
2287 | journal_mark_dirty(&th, inode->i_sb, bh); | 2287 | journal_mark_dirty(&th, inode->i_sb, bh); |
2288 | bytes_copied += copy_size; | 2288 | bytes_copied += copy_size; |
2289 | set_block_dev_mapped(bh_result, 0, inode); | 2289 | set_block_dev_mapped(bh_result, 0, inode); |
2290 | 2290 | ||
2291 | /* are there still bytes left? */ | 2291 | /* are there still bytes left? */ |
2292 | if (bytes_copied < bh_result->b_size && | 2292 | if (bytes_copied < bh_result->b_size && |
2293 | (byte_offset + bytes_copied) < inode->i_size) { | 2293 | (byte_offset + bytes_copied) < inode->i_size) { |
2294 | set_cpu_key_k_offset(&key, | 2294 | set_cpu_key_k_offset(&key, |
2295 | cpu_key_k_offset(&key) + | 2295 | cpu_key_k_offset(&key) + |
2296 | copy_size); | 2296 | copy_size); |
2297 | goto research; | 2297 | goto research; |
2298 | } | 2298 | } |
2299 | } else { | 2299 | } else { |
2300 | reiserfs_warning(inode->i_sb, | 2300 | reiserfs_warning(inode->i_sb, |
2301 | "clm-6003: bad item inode %lu, device %s", | 2301 | "clm-6003: bad item inode %lu, device %s", |
2302 | inode->i_ino, reiserfs_bdevname(inode->i_sb)); | 2302 | inode->i_ino, reiserfs_bdevname(inode->i_sb)); |
2303 | retval = -EIO; | 2303 | retval = -EIO; |
2304 | goto out; | 2304 | goto out; |
2305 | } | 2305 | } |
2306 | retval = 0; | 2306 | retval = 0; |
2307 | 2307 | ||
2308 | out: | 2308 | out: |
2309 | pathrelse(&path); | 2309 | pathrelse(&path); |
2310 | if (trans_running) { | 2310 | if (trans_running) { |
2311 | int err = journal_end(&th, inode->i_sb, jbegin_count); | 2311 | int err = journal_end(&th, inode->i_sb, jbegin_count); |
2312 | if (err) | 2312 | if (err) |
2313 | retval = err; | 2313 | retval = err; |
2314 | trans_running = 0; | 2314 | trans_running = 0; |
2315 | } | 2315 | } |
2316 | reiserfs_write_unlock(inode->i_sb); | 2316 | reiserfs_write_unlock(inode->i_sb); |
2317 | 2317 | ||
2318 | /* this is where we fill in holes in the file. */ | 2318 | /* this is where we fill in holes in the file. */ |
2319 | if (use_get_block) { | 2319 | if (use_get_block) { |
2320 | retval = reiserfs_get_block(inode, block, bh_result, | 2320 | retval = reiserfs_get_block(inode, block, bh_result, |
2321 | GET_BLOCK_CREATE | GET_BLOCK_NO_IMUX | 2321 | GET_BLOCK_CREATE | GET_BLOCK_NO_IMUX |
2322 | | GET_BLOCK_NO_DANGLE); | 2322 | | GET_BLOCK_NO_DANGLE); |
2323 | if (!retval) { | 2323 | if (!retval) { |
2324 | if (!buffer_mapped(bh_result) | 2324 | if (!buffer_mapped(bh_result) |
2325 | || bh_result->b_blocknr == 0) { | 2325 | || bh_result->b_blocknr == 0) { |
2326 | /* get_block failed to find a mapped unformatted node. */ | 2326 | /* get_block failed to find a mapped unformatted node. */ |
2327 | use_get_block = 0; | 2327 | use_get_block = 0; |
2328 | goto start_over; | 2328 | goto start_over; |
2329 | } | 2329 | } |
2330 | } | 2330 | } |
2331 | } | 2331 | } |
2332 | kunmap(bh_result->b_page); | 2332 | kunmap(bh_result->b_page); |
2333 | 2333 | ||
2334 | if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { | 2334 | if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { |
2335 | /* we've copied data from the page into the direct item, so the | 2335 | /* we've copied data from the page into the direct item, so the |
2336 | * buffer in the page is now clean, mark it to reflect that. | 2336 | * buffer in the page is now clean, mark it to reflect that. |
2337 | */ | 2337 | */ |
2338 | lock_buffer(bh_result); | 2338 | lock_buffer(bh_result); |
2339 | clear_buffer_dirty(bh_result); | 2339 | clear_buffer_dirty(bh_result); |
2340 | unlock_buffer(bh_result); | 2340 | unlock_buffer(bh_result); |
2341 | } | 2341 | } |
2342 | return retval; | 2342 | return retval; |
2343 | } | 2343 | } |
2344 | 2344 | ||
2345 | /* | 2345 | /* |
2346 | * mason@suse.com: updated in 2.5.54 to follow the same general io | 2346 | * mason@suse.com: updated in 2.5.54 to follow the same general io |
2347 | * start/recovery path as __block_write_full_page, along with special | 2347 | * start/recovery path as __block_write_full_page, along with special |
2348 | * code to handle reiserfs tails. | 2348 | * code to handle reiserfs tails. |
2349 | */ | 2349 | */ |
2350 | static int reiserfs_write_full_page(struct page *page, | 2350 | static int reiserfs_write_full_page(struct page *page, |
2351 | struct writeback_control *wbc) | 2351 | struct writeback_control *wbc) |
2352 | { | 2352 | { |
2353 | struct inode *inode = page->mapping->host; | 2353 | struct inode *inode = page->mapping->host; |
2354 | unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT; | 2354 | unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT; |
2355 | int error = 0; | 2355 | int error = 0; |
2356 | unsigned long block; | 2356 | unsigned long block; |
2357 | struct buffer_head *head, *bh; | 2357 | struct buffer_head *head, *bh; |
2358 | int partial = 0; | 2358 | int partial = 0; |
2359 | int nr = 0; | 2359 | int nr = 0; |
2360 | int checked = PageChecked(page); | 2360 | int checked = PageChecked(page); |
2361 | struct reiserfs_transaction_handle th; | 2361 | struct reiserfs_transaction_handle th; |
2362 | struct super_block *s = inode->i_sb; | 2362 | struct super_block *s = inode->i_sb; |
2363 | int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize; | 2363 | int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize; |
2364 | th.t_trans_id = 0; | 2364 | th.t_trans_id = 0; |
2365 | 2365 | ||
2366 | /* no logging allowed when nonblocking or from PF_MEMALLOC */ | ||
2367 | if (checked && (current->flags & PF_MEMALLOC)) { | ||
2368 | redirty_page_for_writepage(wbc, page); | ||
2369 | unlock_page(page); | ||
2370 | return 0; | ||
2371 | } | ||
2372 | |||
2366 | /* The page dirty bit is cleared before writepage is called, which | 2373 | /* The page dirty bit is cleared before writepage is called, which |
2367 | * means we have to tell create_empty_buffers to make dirty buffers | 2374 | * means we have to tell create_empty_buffers to make dirty buffers |
2368 | * The page really should be up to date at this point, so tossing | 2375 | * The page really should be up to date at this point, so tossing |
2369 | * in the BH_Uptodate is just a sanity check. | 2376 | * in the BH_Uptodate is just a sanity check. |
2370 | */ | 2377 | */ |
2371 | if (!page_has_buffers(page)) { | 2378 | if (!page_has_buffers(page)) { |
2372 | create_empty_buffers(page, s->s_blocksize, | 2379 | create_empty_buffers(page, s->s_blocksize, |
2373 | (1 << BH_Dirty) | (1 << BH_Uptodate)); | 2380 | (1 << BH_Dirty) | (1 << BH_Uptodate)); |
2374 | } | 2381 | } |
2375 | head = page_buffers(page); | 2382 | head = page_buffers(page); |
2376 | 2383 | ||
2377 | /* last page in the file, zero out any contents past the | 2384 | /* last page in the file, zero out any contents past the |
2378 | ** last byte in the file | 2385 | ** last byte in the file |
2379 | */ | 2386 | */ |
2380 | if (page->index >= end_index) { | 2387 | if (page->index >= end_index) { |
2381 | char *kaddr; | 2388 | char *kaddr; |
2382 | unsigned last_offset; | 2389 | unsigned last_offset; |
2383 | 2390 | ||
2384 | last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1); | 2391 | last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1); |
2385 | /* no file contents in this page */ | 2392 | /* no file contents in this page */ |
2386 | if (page->index >= end_index + 1 || !last_offset) { | 2393 | if (page->index >= end_index + 1 || !last_offset) { |
2387 | unlock_page(page); | 2394 | unlock_page(page); |
2388 | return 0; | 2395 | return 0; |
2389 | } | 2396 | } |
2390 | kaddr = kmap_atomic(page, KM_USER0); | 2397 | kaddr = kmap_atomic(page, KM_USER0); |
2391 | memset(kaddr + last_offset, 0, PAGE_CACHE_SIZE - last_offset); | 2398 | memset(kaddr + last_offset, 0, PAGE_CACHE_SIZE - last_offset); |
2392 | flush_dcache_page(page); | 2399 | flush_dcache_page(page); |
2393 | kunmap_atomic(kaddr, KM_USER0); | 2400 | kunmap_atomic(kaddr, KM_USER0); |
2394 | } | 2401 | } |
2395 | bh = head; | 2402 | bh = head; |
2396 | block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits); | 2403 | block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits); |
2397 | /* first map all the buffers, logging any direct items we find */ | 2404 | /* first map all the buffers, logging any direct items we find */ |
2398 | do { | 2405 | do { |
2399 | if ((checked || buffer_dirty(bh)) && (!buffer_mapped(bh) || | 2406 | if ((checked || buffer_dirty(bh)) && (!buffer_mapped(bh) || |
2400 | (buffer_mapped(bh) | 2407 | (buffer_mapped(bh) |
2401 | && bh->b_blocknr == | 2408 | && bh->b_blocknr == |
2402 | 0))) { | 2409 | 0))) { |
2403 | /* not mapped yet, or it points to a direct item, search | 2410 | /* not mapped yet, or it points to a direct item, search |
2404 | * the btree for the mapping info, and log any direct | 2411 | * the btree for the mapping info, and log any direct |
2405 | * items found | 2412 | * items found |
2406 | */ | 2413 | */ |
2407 | if ((error = map_block_for_writepage(inode, bh, block))) { | 2414 | if ((error = map_block_for_writepage(inode, bh, block))) { |
2408 | goto fail; | 2415 | goto fail; |
2409 | } | 2416 | } |
2410 | } | 2417 | } |
2411 | bh = bh->b_this_page; | 2418 | bh = bh->b_this_page; |
2412 | block++; | 2419 | block++; |
2413 | } while (bh != head); | 2420 | } while (bh != head); |
2414 | 2421 | ||
2415 | /* | 2422 | /* |
2416 | * we start the transaction after map_block_for_writepage, | 2423 | * we start the transaction after map_block_for_writepage, |
2417 | * because it can create holes in the file (an unbounded operation). | 2424 | * because it can create holes in the file (an unbounded operation). |
2418 | * starting it here, we can make a reliable estimate for how many | 2425 | * starting it here, we can make a reliable estimate for how many |
2419 | * blocks we're going to log | 2426 | * blocks we're going to log |
2420 | */ | 2427 | */ |
2421 | if (checked) { | 2428 | if (checked) { |
2422 | ClearPageChecked(page); | 2429 | ClearPageChecked(page); |
2423 | reiserfs_write_lock(s); | 2430 | reiserfs_write_lock(s); |
2424 | error = journal_begin(&th, s, bh_per_page + 1); | 2431 | error = journal_begin(&th, s, bh_per_page + 1); |
2425 | if (error) { | 2432 | if (error) { |
2426 | reiserfs_write_unlock(s); | 2433 | reiserfs_write_unlock(s); |
2427 | goto fail; | 2434 | goto fail; |
2428 | } | 2435 | } |
2429 | reiserfs_update_inode_transaction(inode); | 2436 | reiserfs_update_inode_transaction(inode); |
2430 | } | 2437 | } |
2431 | /* now go through and lock any dirty buffers on the page */ | 2438 | /* now go through and lock any dirty buffers on the page */ |
2432 | do { | 2439 | do { |
2433 | get_bh(bh); | 2440 | get_bh(bh); |
2434 | if (!buffer_mapped(bh)) | 2441 | if (!buffer_mapped(bh)) |
2435 | continue; | 2442 | continue; |
2436 | if (buffer_mapped(bh) && bh->b_blocknr == 0) | 2443 | if (buffer_mapped(bh) && bh->b_blocknr == 0) |
2437 | continue; | 2444 | continue; |
2438 | 2445 | ||
2439 | if (checked) { | 2446 | if (checked) { |
2440 | reiserfs_prepare_for_journal(s, bh, 1); | 2447 | reiserfs_prepare_for_journal(s, bh, 1); |
2441 | journal_mark_dirty(&th, s, bh); | 2448 | journal_mark_dirty(&th, s, bh); |
2442 | continue; | 2449 | continue; |
2443 | } | 2450 | } |
2444 | /* from this point on, we know the buffer is mapped to a | 2451 | /* from this point on, we know the buffer is mapped to a |
2445 | * real block and not a direct item | 2452 | * real block and not a direct item |
2446 | */ | 2453 | */ |
2447 | if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { | 2454 | if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { |
2448 | lock_buffer(bh); | 2455 | lock_buffer(bh); |
2449 | } else { | 2456 | } else { |
2450 | if (test_set_buffer_locked(bh)) { | 2457 | if (test_set_buffer_locked(bh)) { |
2451 | redirty_page_for_writepage(wbc, page); | 2458 | redirty_page_for_writepage(wbc, page); |
2452 | continue; | 2459 | continue; |
2453 | } | 2460 | } |
2454 | } | 2461 | } |
2455 | if (test_clear_buffer_dirty(bh)) { | 2462 | if (test_clear_buffer_dirty(bh)) { |
2456 | mark_buffer_async_write(bh); | 2463 | mark_buffer_async_write(bh); |
2457 | } else { | 2464 | } else { |
2458 | unlock_buffer(bh); | 2465 | unlock_buffer(bh); |
2459 | } | 2466 | } |
2460 | } while ((bh = bh->b_this_page) != head); | 2467 | } while ((bh = bh->b_this_page) != head); |
2461 | 2468 | ||
2462 | if (checked) { | 2469 | if (checked) { |
2463 | error = journal_end(&th, s, bh_per_page + 1); | 2470 | error = journal_end(&th, s, bh_per_page + 1); |
2464 | reiserfs_write_unlock(s); | 2471 | reiserfs_write_unlock(s); |
2465 | if (error) | 2472 | if (error) |
2466 | goto fail; | 2473 | goto fail; |
2467 | } | 2474 | } |
2468 | BUG_ON(PageWriteback(page)); | 2475 | BUG_ON(PageWriteback(page)); |
2469 | set_page_writeback(page); | 2476 | set_page_writeback(page); |
2470 | unlock_page(page); | 2477 | unlock_page(page); |
2471 | 2478 | ||
2472 | /* | 2479 | /* |
2473 | * since any buffer might be the only dirty buffer on the page, | 2480 | * since any buffer might be the only dirty buffer on the page, |
2474 | * the first submit_bh can bring the page out of writeback. | 2481 | * the first submit_bh can bring the page out of writeback. |
2475 | * be careful with the buffers. | 2482 | * be careful with the buffers. |
2476 | */ | 2483 | */ |
2477 | do { | 2484 | do { |
2478 | struct buffer_head *next = bh->b_this_page; | 2485 | struct buffer_head *next = bh->b_this_page; |
2479 | if (buffer_async_write(bh)) { | 2486 | if (buffer_async_write(bh)) { |
2480 | submit_bh(WRITE, bh); | 2487 | submit_bh(WRITE, bh); |
2481 | nr++; | 2488 | nr++; |
2482 | } | 2489 | } |
2483 | put_bh(bh); | 2490 | put_bh(bh); |
2484 | bh = next; | 2491 | bh = next; |
2485 | } while (bh != head); | 2492 | } while (bh != head); |
2486 | 2493 | ||
2487 | error = 0; | 2494 | error = 0; |
2488 | done: | 2495 | done: |
2489 | if (nr == 0) { | 2496 | if (nr == 0) { |
2490 | /* | 2497 | /* |
2491 | * if this page only had a direct item, it is very possible for | 2498 | * if this page only had a direct item, it is very possible for |
2492 | * no io to be required without there being an error. Or, | 2499 | * no io to be required without there being an error. Or, |
2493 | * someone else could have locked them and sent them down the | 2500 | * someone else could have locked them and sent them down the |
2494 | * pipe without locking the page | 2501 | * pipe without locking the page |
2495 | */ | 2502 | */ |
2496 | bh = head; | 2503 | bh = head; |
2497 | do { | 2504 | do { |
2498 | if (!buffer_uptodate(bh)) { | 2505 | if (!buffer_uptodate(bh)) { |
2499 | partial = 1; | 2506 | partial = 1; |
2500 | break; | 2507 | break; |
2501 | } | 2508 | } |
2502 | bh = bh->b_this_page; | 2509 | bh = bh->b_this_page; |
2503 | } while (bh != head); | 2510 | } while (bh != head); |
2504 | if (!partial) | 2511 | if (!partial) |
2505 | SetPageUptodate(page); | 2512 | SetPageUptodate(page); |
2506 | end_page_writeback(page); | 2513 | end_page_writeback(page); |
2507 | } | 2514 | } |
2508 | return error; | 2515 | return error; |
2509 | 2516 | ||
2510 | fail: | 2517 | fail: |
2511 | /* catches various errors, we need to make sure any valid dirty blocks | 2518 | /* catches various errors, we need to make sure any valid dirty blocks |
2512 | * get to the media. The page is currently locked and not marked for | 2519 | * get to the media. The page is currently locked and not marked for |
2513 | * writeback | 2520 | * writeback |
2514 | */ | 2521 | */ |
2515 | ClearPageUptodate(page); | 2522 | ClearPageUptodate(page); |
2516 | bh = head; | 2523 | bh = head; |
2517 | do { | 2524 | do { |
2518 | get_bh(bh); | 2525 | get_bh(bh); |
2519 | if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) { | 2526 | if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) { |
2520 | lock_buffer(bh); | 2527 | lock_buffer(bh); |
2521 | mark_buffer_async_write(bh); | 2528 | mark_buffer_async_write(bh); |
2522 | } else { | 2529 | } else { |
2523 | /* | 2530 | /* |
2524 | * clear any dirty bits that might have come from getting | 2531 | * clear any dirty bits that might have come from getting |
2525 | * attached to a dirty page | 2532 | * attached to a dirty page |
2526 | */ | 2533 | */ |
2527 | clear_buffer_dirty(bh); | 2534 | clear_buffer_dirty(bh); |
2528 | } | 2535 | } |
2529 | bh = bh->b_this_page; | 2536 | bh = bh->b_this_page; |
2530 | } while (bh != head); | 2537 | } while (bh != head); |
2531 | SetPageError(page); | 2538 | SetPageError(page); |
2532 | BUG_ON(PageWriteback(page)); | 2539 | BUG_ON(PageWriteback(page)); |
2533 | set_page_writeback(page); | 2540 | set_page_writeback(page); |
2534 | unlock_page(page); | 2541 | unlock_page(page); |
2535 | do { | 2542 | do { |
2536 | struct buffer_head *next = bh->b_this_page; | 2543 | struct buffer_head *next = bh->b_this_page; |
2537 | if (buffer_async_write(bh)) { | 2544 | if (buffer_async_write(bh)) { |
2538 | clear_buffer_dirty(bh); | 2545 | clear_buffer_dirty(bh); |
2539 | submit_bh(WRITE, bh); | 2546 | submit_bh(WRITE, bh); |
2540 | nr++; | 2547 | nr++; |
2541 | } | 2548 | } |
2542 | put_bh(bh); | 2549 | put_bh(bh); |
2543 | bh = next; | 2550 | bh = next; |
2544 | } while (bh != head); | 2551 | } while (bh != head); |
2545 | goto done; | 2552 | goto done; |
2546 | } | 2553 | } |
2547 | 2554 | ||
2548 | static int reiserfs_readpage(struct file *f, struct page *page) | 2555 | static int reiserfs_readpage(struct file *f, struct page *page) |
2549 | { | 2556 | { |
2550 | return block_read_full_page(page, reiserfs_get_block); | 2557 | return block_read_full_page(page, reiserfs_get_block); |
2551 | } | 2558 | } |
2552 | 2559 | ||
2553 | static int reiserfs_writepage(struct page *page, struct writeback_control *wbc) | 2560 | static int reiserfs_writepage(struct page *page, struct writeback_control *wbc) |
2554 | { | 2561 | { |
2555 | struct inode *inode = page->mapping->host; | 2562 | struct inode *inode = page->mapping->host; |
2556 | reiserfs_wait_on_write_block(inode->i_sb); | 2563 | reiserfs_wait_on_write_block(inode->i_sb); |
2557 | return reiserfs_write_full_page(page, wbc); | 2564 | return reiserfs_write_full_page(page, wbc); |
2558 | } | 2565 | } |
2559 | 2566 | ||
2560 | static int reiserfs_prepare_write(struct file *f, struct page *page, | 2567 | static int reiserfs_prepare_write(struct file *f, struct page *page, |
2561 | unsigned from, unsigned to) | 2568 | unsigned from, unsigned to) |
2562 | { | 2569 | { |
2563 | struct inode *inode = page->mapping->host; | 2570 | struct inode *inode = page->mapping->host; |
2564 | int ret; | 2571 | int ret; |
2565 | int old_ref = 0; | 2572 | int old_ref = 0; |
2566 | 2573 | ||
2567 | reiserfs_wait_on_write_block(inode->i_sb); | 2574 | reiserfs_wait_on_write_block(inode->i_sb); |
2568 | fix_tail_page_for_writing(page); | 2575 | fix_tail_page_for_writing(page); |
2569 | if (reiserfs_transaction_running(inode->i_sb)) { | 2576 | if (reiserfs_transaction_running(inode->i_sb)) { |
2570 | struct reiserfs_transaction_handle *th; | 2577 | struct reiserfs_transaction_handle *th; |
2571 | th = (struct reiserfs_transaction_handle *)current-> | 2578 | th = (struct reiserfs_transaction_handle *)current-> |
2572 | journal_info; | 2579 | journal_info; |
2573 | BUG_ON(!th->t_refcount); | 2580 | BUG_ON(!th->t_refcount); |
2574 | BUG_ON(!th->t_trans_id); | 2581 | BUG_ON(!th->t_trans_id); |
2575 | old_ref = th->t_refcount; | 2582 | old_ref = th->t_refcount; |
2576 | th->t_refcount++; | 2583 | th->t_refcount++; |
2577 | } | 2584 | } |
2578 | 2585 | ||
2579 | ret = block_prepare_write(page, from, to, reiserfs_get_block); | 2586 | ret = block_prepare_write(page, from, to, reiserfs_get_block); |
2580 | if (ret && reiserfs_transaction_running(inode->i_sb)) { | 2587 | if (ret && reiserfs_transaction_running(inode->i_sb)) { |
2581 | struct reiserfs_transaction_handle *th = current->journal_info; | 2588 | struct reiserfs_transaction_handle *th = current->journal_info; |
2582 | /* this gets a little ugly. If reiserfs_get_block returned an | 2589 | /* this gets a little ugly. If reiserfs_get_block returned an |
2583 | * error and left a transacstion running, we've got to close it, | 2590 | * error and left a transacstion running, we've got to close it, |
2584 | * and we've got to free handle if it was a persistent transaction. | 2591 | * and we've got to free handle if it was a persistent transaction. |
2585 | * | 2592 | * |
2586 | * But, if we had nested into an existing transaction, we need | 2593 | * But, if we had nested into an existing transaction, we need |
2587 | * to just drop the ref count on the handle. | 2594 | * to just drop the ref count on the handle. |
2588 | * | 2595 | * |
2589 | * If old_ref == 0, the transaction is from reiserfs_get_block, | 2596 | * If old_ref == 0, the transaction is from reiserfs_get_block, |
2590 | * and it was a persistent trans. Otherwise, it was nested above. | 2597 | * and it was a persistent trans. Otherwise, it was nested above. |
2591 | */ | 2598 | */ |
2592 | if (th->t_refcount > old_ref) { | 2599 | if (th->t_refcount > old_ref) { |
2593 | if (old_ref) | 2600 | if (old_ref) |
2594 | th->t_refcount--; | 2601 | th->t_refcount--; |
2595 | else { | 2602 | else { |
2596 | int err; | 2603 | int err; |
2597 | reiserfs_write_lock(inode->i_sb); | 2604 | reiserfs_write_lock(inode->i_sb); |
2598 | err = reiserfs_end_persistent_transaction(th); | 2605 | err = reiserfs_end_persistent_transaction(th); |
2599 | reiserfs_write_unlock(inode->i_sb); | 2606 | reiserfs_write_unlock(inode->i_sb); |
2600 | if (err) | 2607 | if (err) |
2601 | ret = err; | 2608 | ret = err; |
2602 | } | 2609 | } |
2603 | } | 2610 | } |
2604 | } | 2611 | } |
2605 | return ret; | 2612 | return ret; |
2606 | 2613 | ||
2607 | } | 2614 | } |
2608 | 2615 | ||
2609 | static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block) | 2616 | static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block) |
2610 | { | 2617 | { |
2611 | return generic_block_bmap(as, block, reiserfs_bmap); | 2618 | return generic_block_bmap(as, block, reiserfs_bmap); |
2612 | } | 2619 | } |
2613 | 2620 | ||
2614 | static int reiserfs_commit_write(struct file *f, struct page *page, | 2621 | static int reiserfs_commit_write(struct file *f, struct page *page, |
2615 | unsigned from, unsigned to) | 2622 | unsigned from, unsigned to) |
2616 | { | 2623 | { |
2617 | struct inode *inode = page->mapping->host; | 2624 | struct inode *inode = page->mapping->host; |
2618 | loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + to; | 2625 | loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + to; |
2619 | int ret = 0; | 2626 | int ret = 0; |
2620 | int update_sd = 0; | 2627 | int update_sd = 0; |
2621 | struct reiserfs_transaction_handle *th = NULL; | 2628 | struct reiserfs_transaction_handle *th = NULL; |
2622 | 2629 | ||
2623 | reiserfs_wait_on_write_block(inode->i_sb); | 2630 | reiserfs_wait_on_write_block(inode->i_sb); |
2624 | if (reiserfs_transaction_running(inode->i_sb)) { | 2631 | if (reiserfs_transaction_running(inode->i_sb)) { |
2625 | th = current->journal_info; | 2632 | th = current->journal_info; |
2626 | } | 2633 | } |
2627 | reiserfs_commit_page(inode, page, from, to); | 2634 | reiserfs_commit_page(inode, page, from, to); |
2628 | 2635 | ||
2629 | /* generic_commit_write does this for us, but does not update the | 2636 | /* generic_commit_write does this for us, but does not update the |
2630 | ** transaction tracking stuff when the size changes. So, we have | 2637 | ** transaction tracking stuff when the size changes. So, we have |
2631 | ** to do the i_size updates here. | 2638 | ** to do the i_size updates here. |
2632 | */ | 2639 | */ |
2633 | if (pos > inode->i_size) { | 2640 | if (pos > inode->i_size) { |
2634 | struct reiserfs_transaction_handle myth; | 2641 | struct reiserfs_transaction_handle myth; |
2635 | reiserfs_write_lock(inode->i_sb); | 2642 | reiserfs_write_lock(inode->i_sb); |
2636 | /* If the file have grown beyond the border where it | 2643 | /* If the file have grown beyond the border where it |
2637 | can have a tail, unmark it as needing a tail | 2644 | can have a tail, unmark it as needing a tail |
2638 | packing */ | 2645 | packing */ |
2639 | if ((have_large_tails(inode->i_sb) | 2646 | if ((have_large_tails(inode->i_sb) |
2640 | && inode->i_size > i_block_size(inode) * 4) | 2647 | && inode->i_size > i_block_size(inode) * 4) |
2641 | || (have_small_tails(inode->i_sb) | 2648 | || (have_small_tails(inode->i_sb) |
2642 | && inode->i_size > i_block_size(inode))) | 2649 | && inode->i_size > i_block_size(inode))) |
2643 | REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; | 2650 | REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; |
2644 | 2651 | ||
2645 | ret = journal_begin(&myth, inode->i_sb, 1); | 2652 | ret = journal_begin(&myth, inode->i_sb, 1); |
2646 | if (ret) { | 2653 | if (ret) { |
2647 | reiserfs_write_unlock(inode->i_sb); | 2654 | reiserfs_write_unlock(inode->i_sb); |
2648 | goto journal_error; | 2655 | goto journal_error; |
2649 | } | 2656 | } |
2650 | reiserfs_update_inode_transaction(inode); | 2657 | reiserfs_update_inode_transaction(inode); |
2651 | inode->i_size = pos; | 2658 | inode->i_size = pos; |
2652 | /* | 2659 | /* |
2653 | * this will just nest into our transaction. It's important | 2660 | * this will just nest into our transaction. It's important |
2654 | * to use mark_inode_dirty so the inode gets pushed around on the | 2661 | * to use mark_inode_dirty so the inode gets pushed around on the |
2655 | * dirty lists, and so that O_SYNC works as expected | 2662 | * dirty lists, and so that O_SYNC works as expected |
2656 | */ | 2663 | */ |
2657 | mark_inode_dirty(inode); | 2664 | mark_inode_dirty(inode); |
2658 | reiserfs_update_sd(&myth, inode); | 2665 | reiserfs_update_sd(&myth, inode); |
2659 | update_sd = 1; | 2666 | update_sd = 1; |
2660 | ret = journal_end(&myth, inode->i_sb, 1); | 2667 | ret = journal_end(&myth, inode->i_sb, 1); |
2661 | reiserfs_write_unlock(inode->i_sb); | 2668 | reiserfs_write_unlock(inode->i_sb); |
2662 | if (ret) | 2669 | if (ret) |
2663 | goto journal_error; | 2670 | goto journal_error; |
2664 | } | 2671 | } |
2665 | if (th) { | 2672 | if (th) { |
2666 | reiserfs_write_lock(inode->i_sb); | 2673 | reiserfs_write_lock(inode->i_sb); |
2667 | if (!update_sd) | 2674 | if (!update_sd) |
2668 | mark_inode_dirty(inode); | 2675 | mark_inode_dirty(inode); |
2669 | ret = reiserfs_end_persistent_transaction(th); | 2676 | ret = reiserfs_end_persistent_transaction(th); |
2670 | reiserfs_write_unlock(inode->i_sb); | 2677 | reiserfs_write_unlock(inode->i_sb); |
2671 | if (ret) | 2678 | if (ret) |
2672 | goto out; | 2679 | goto out; |
2673 | } | 2680 | } |
2674 | 2681 | ||
2675 | out: | 2682 | out: |
2676 | return ret; | 2683 | return ret; |
2677 | 2684 | ||
2678 | journal_error: | 2685 | journal_error: |
2679 | if (th) { | 2686 | if (th) { |
2680 | reiserfs_write_lock(inode->i_sb); | 2687 | reiserfs_write_lock(inode->i_sb); |
2681 | if (!update_sd) | 2688 | if (!update_sd) |
2682 | reiserfs_update_sd(th, inode); | 2689 | reiserfs_update_sd(th, inode); |
2683 | ret = reiserfs_end_persistent_transaction(th); | 2690 | ret = reiserfs_end_persistent_transaction(th); |
2684 | reiserfs_write_unlock(inode->i_sb); | 2691 | reiserfs_write_unlock(inode->i_sb); |
2685 | } | 2692 | } |
2686 | 2693 | ||
2687 | return ret; | 2694 | return ret; |
2688 | } | 2695 | } |
2689 | 2696 | ||
2690 | void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode) | 2697 | void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode) |
2691 | { | 2698 | { |
2692 | if (reiserfs_attrs(inode->i_sb)) { | 2699 | if (reiserfs_attrs(inode->i_sb)) { |
2693 | if (sd_attrs & REISERFS_SYNC_FL) | 2700 | if (sd_attrs & REISERFS_SYNC_FL) |
2694 | inode->i_flags |= S_SYNC; | 2701 | inode->i_flags |= S_SYNC; |
2695 | else | 2702 | else |
2696 | inode->i_flags &= ~S_SYNC; | 2703 | inode->i_flags &= ~S_SYNC; |
2697 | if (sd_attrs & REISERFS_IMMUTABLE_FL) | 2704 | if (sd_attrs & REISERFS_IMMUTABLE_FL) |
2698 | inode->i_flags |= S_IMMUTABLE; | 2705 | inode->i_flags |= S_IMMUTABLE; |
2699 | else | 2706 | else |
2700 | inode->i_flags &= ~S_IMMUTABLE; | 2707 | inode->i_flags &= ~S_IMMUTABLE; |
2701 | if (sd_attrs & REISERFS_APPEND_FL) | 2708 | if (sd_attrs & REISERFS_APPEND_FL) |
2702 | inode->i_flags |= S_APPEND; | 2709 | inode->i_flags |= S_APPEND; |
2703 | else | 2710 | else |
2704 | inode->i_flags &= ~S_APPEND; | 2711 | inode->i_flags &= ~S_APPEND; |
2705 | if (sd_attrs & REISERFS_NOATIME_FL) | 2712 | if (sd_attrs & REISERFS_NOATIME_FL) |
2706 | inode->i_flags |= S_NOATIME; | 2713 | inode->i_flags |= S_NOATIME; |
2707 | else | 2714 | else |
2708 | inode->i_flags &= ~S_NOATIME; | 2715 | inode->i_flags &= ~S_NOATIME; |
2709 | if (sd_attrs & REISERFS_NOTAIL_FL) | 2716 | if (sd_attrs & REISERFS_NOTAIL_FL) |
2710 | REISERFS_I(inode)->i_flags |= i_nopack_mask; | 2717 | REISERFS_I(inode)->i_flags |= i_nopack_mask; |
2711 | else | 2718 | else |
2712 | REISERFS_I(inode)->i_flags &= ~i_nopack_mask; | 2719 | REISERFS_I(inode)->i_flags &= ~i_nopack_mask; |
2713 | } | 2720 | } |
2714 | } | 2721 | } |
2715 | 2722 | ||
2716 | void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs) | 2723 | void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs) |
2717 | { | 2724 | { |
2718 | if (reiserfs_attrs(inode->i_sb)) { | 2725 | if (reiserfs_attrs(inode->i_sb)) { |
2719 | if (inode->i_flags & S_IMMUTABLE) | 2726 | if (inode->i_flags & S_IMMUTABLE) |
2720 | *sd_attrs |= REISERFS_IMMUTABLE_FL; | 2727 | *sd_attrs |= REISERFS_IMMUTABLE_FL; |
2721 | else | 2728 | else |
2722 | *sd_attrs &= ~REISERFS_IMMUTABLE_FL; | 2729 | *sd_attrs &= ~REISERFS_IMMUTABLE_FL; |
2723 | if (inode->i_flags & S_SYNC) | 2730 | if (inode->i_flags & S_SYNC) |
2724 | *sd_attrs |= REISERFS_SYNC_FL; | 2731 | *sd_attrs |= REISERFS_SYNC_FL; |
2725 | else | 2732 | else |
2726 | *sd_attrs &= ~REISERFS_SYNC_FL; | 2733 | *sd_attrs &= ~REISERFS_SYNC_FL; |
2727 | if (inode->i_flags & S_NOATIME) | 2734 | if (inode->i_flags & S_NOATIME) |
2728 | *sd_attrs |= REISERFS_NOATIME_FL; | 2735 | *sd_attrs |= REISERFS_NOATIME_FL; |
2729 | else | 2736 | else |
2730 | *sd_attrs &= ~REISERFS_NOATIME_FL; | 2737 | *sd_attrs &= ~REISERFS_NOATIME_FL; |
2731 | if (REISERFS_I(inode)->i_flags & i_nopack_mask) | 2738 | if (REISERFS_I(inode)->i_flags & i_nopack_mask) |
2732 | *sd_attrs |= REISERFS_NOTAIL_FL; | 2739 | *sd_attrs |= REISERFS_NOTAIL_FL; |
2733 | else | 2740 | else |
2734 | *sd_attrs &= ~REISERFS_NOTAIL_FL; | 2741 | *sd_attrs &= ~REISERFS_NOTAIL_FL; |
2735 | } | 2742 | } |
2736 | } | 2743 | } |
2737 | 2744 | ||
2738 | /* decide if this buffer needs to stay around for data logging or ordered | 2745 | /* decide if this buffer needs to stay around for data logging or ordered |
2739 | ** write purposes | 2746 | ** write purposes |
2740 | */ | 2747 | */ |
2741 | static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) | 2748 | static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) |
2742 | { | 2749 | { |
2743 | int ret = 1; | 2750 | int ret = 1; |
2744 | struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); | 2751 | struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); |
2745 | 2752 | ||
2746 | lock_buffer(bh); | 2753 | lock_buffer(bh); |
2747 | spin_lock(&j->j_dirty_buffers_lock); | 2754 | spin_lock(&j->j_dirty_buffers_lock); |
2748 | if (!buffer_mapped(bh)) { | 2755 | if (!buffer_mapped(bh)) { |
2749 | goto free_jh; | 2756 | goto free_jh; |
2750 | } | 2757 | } |
2751 | /* the page is locked, and the only places that log a data buffer | 2758 | /* the page is locked, and the only places that log a data buffer |
2752 | * also lock the page. | 2759 | * also lock the page. |
2753 | */ | 2760 | */ |
2754 | if (reiserfs_file_data_log(inode)) { | 2761 | if (reiserfs_file_data_log(inode)) { |
2755 | /* | 2762 | /* |
2756 | * very conservative, leave the buffer pinned if | 2763 | * very conservative, leave the buffer pinned if |
2757 | * anyone might need it. | 2764 | * anyone might need it. |
2758 | */ | 2765 | */ |
2759 | if (buffer_journaled(bh) || buffer_journal_dirty(bh)) { | 2766 | if (buffer_journaled(bh) || buffer_journal_dirty(bh)) { |
2760 | ret = 0; | 2767 | ret = 0; |
2761 | } | 2768 | } |
2762 | } else if (buffer_dirty(bh)) { | 2769 | } else if (buffer_dirty(bh)) { |
2763 | struct reiserfs_journal_list *jl; | 2770 | struct reiserfs_journal_list *jl; |
2764 | struct reiserfs_jh *jh = bh->b_private; | 2771 | struct reiserfs_jh *jh = bh->b_private; |
2765 | 2772 | ||
2766 | /* why is this safe? | 2773 | /* why is this safe? |
2767 | * reiserfs_setattr updates i_size in the on disk | 2774 | * reiserfs_setattr updates i_size in the on disk |
2768 | * stat data before allowing vmtruncate to be called. | 2775 | * stat data before allowing vmtruncate to be called. |
2769 | * | 2776 | * |
2770 | * If buffer was put onto the ordered list for this | 2777 | * If buffer was put onto the ordered list for this |
2771 | * transaction, we know for sure either this transaction | 2778 | * transaction, we know for sure either this transaction |
2772 | * or an older one already has updated i_size on disk, | 2779 | * or an older one already has updated i_size on disk, |
2773 | * and this ordered data won't be referenced in the file | 2780 | * and this ordered data won't be referenced in the file |
2774 | * if we crash. | 2781 | * if we crash. |
2775 | * | 2782 | * |
2776 | * if the buffer was put onto the ordered list for an older | 2783 | * if the buffer was put onto the ordered list for an older |
2777 | * transaction, we need to leave it around | 2784 | * transaction, we need to leave it around |
2778 | */ | 2785 | */ |
2779 | if (jh && (jl = jh->jl) | 2786 | if (jh && (jl = jh->jl) |
2780 | && jl != SB_JOURNAL(inode->i_sb)->j_current_jl) | 2787 | && jl != SB_JOURNAL(inode->i_sb)->j_current_jl) |
2781 | ret = 0; | 2788 | ret = 0; |
2782 | } | 2789 | } |
2783 | free_jh: | 2790 | free_jh: |
2784 | if (ret && bh->b_private) { | 2791 | if (ret && bh->b_private) { |
2785 | reiserfs_free_jh(bh); | 2792 | reiserfs_free_jh(bh); |
2786 | } | 2793 | } |
2787 | spin_unlock(&j->j_dirty_buffers_lock); | 2794 | spin_unlock(&j->j_dirty_buffers_lock); |
2788 | unlock_buffer(bh); | 2795 | unlock_buffer(bh); |
2789 | return ret; | 2796 | return ret; |
2790 | } | 2797 | } |
2791 | 2798 | ||
2792 | /* clm -- taken from fs/buffer.c:block_invalidate_page */ | 2799 | /* clm -- taken from fs/buffer.c:block_invalidate_page */ |
2793 | static int reiserfs_invalidatepage(struct page *page, unsigned long offset) | 2800 | static int reiserfs_invalidatepage(struct page *page, unsigned long offset) |
2794 | { | 2801 | { |
2795 | struct buffer_head *head, *bh, *next; | 2802 | struct buffer_head *head, *bh, *next; |
2796 | struct inode *inode = page->mapping->host; | 2803 | struct inode *inode = page->mapping->host; |
2797 | unsigned int curr_off = 0; | 2804 | unsigned int curr_off = 0; |
2798 | int ret = 1; | 2805 | int ret = 1; |
2799 | 2806 | ||
2800 | BUG_ON(!PageLocked(page)); | 2807 | BUG_ON(!PageLocked(page)); |
2801 | 2808 | ||
2802 | if (offset == 0) | 2809 | if (offset == 0) |
2803 | ClearPageChecked(page); | 2810 | ClearPageChecked(page); |
2804 | 2811 | ||
2805 | if (!page_has_buffers(page)) | 2812 | if (!page_has_buffers(page)) |
2806 | goto out; | 2813 | goto out; |
2807 | 2814 | ||
2808 | head = page_buffers(page); | 2815 | head = page_buffers(page); |
2809 | bh = head; | 2816 | bh = head; |
2810 | do { | 2817 | do { |
2811 | unsigned int next_off = curr_off + bh->b_size; | 2818 | unsigned int next_off = curr_off + bh->b_size; |
2812 | next = bh->b_this_page; | 2819 | next = bh->b_this_page; |
2813 | 2820 | ||
2814 | /* | 2821 | /* |
2815 | * is this block fully invalidated? | 2822 | * is this block fully invalidated? |
2816 | */ | 2823 | */ |
2817 | if (offset <= curr_off) { | 2824 | if (offset <= curr_off) { |
2818 | if (invalidatepage_can_drop(inode, bh)) | 2825 | if (invalidatepage_can_drop(inode, bh)) |
2819 | reiserfs_unmap_buffer(bh); | 2826 | reiserfs_unmap_buffer(bh); |
2820 | else | 2827 | else |
2821 | ret = 0; | 2828 | ret = 0; |
2822 | } | 2829 | } |
2823 | curr_off = next_off; | 2830 | curr_off = next_off; |
2824 | bh = next; | 2831 | bh = next; |
2825 | } while (bh != head); | 2832 | } while (bh != head); |
2826 | 2833 | ||
2827 | /* | 2834 | /* |
2828 | * We release buffers only if the entire page is being invalidated. | 2835 | * We release buffers only if the entire page is being invalidated. |
2829 | * The get_block cached value has been unconditionally invalidated, | 2836 | * The get_block cached value has been unconditionally invalidated, |
2830 | * so real IO is not possible anymore. | 2837 | * so real IO is not possible anymore. |
2831 | */ | 2838 | */ |
2832 | if (!offset && ret) | 2839 | if (!offset && ret) |
2833 | ret = try_to_release_page(page, 0); | 2840 | ret = try_to_release_page(page, 0); |
2834 | out: | 2841 | out: |
2835 | return ret; | 2842 | return ret; |
2836 | } | 2843 | } |
2837 | 2844 | ||
2838 | static int reiserfs_set_page_dirty(struct page *page) | 2845 | static int reiserfs_set_page_dirty(struct page *page) |
2839 | { | 2846 | { |
2840 | struct inode *inode = page->mapping->host; | 2847 | struct inode *inode = page->mapping->host; |
2841 | if (reiserfs_file_data_log(inode)) { | 2848 | if (reiserfs_file_data_log(inode)) { |
2842 | SetPageChecked(page); | 2849 | SetPageChecked(page); |
2843 | return __set_page_dirty_nobuffers(page); | 2850 | return __set_page_dirty_nobuffers(page); |
2844 | } | 2851 | } |
2845 | return __set_page_dirty_buffers(page); | 2852 | return __set_page_dirty_buffers(page); |
2846 | } | 2853 | } |
2847 | 2854 | ||
2848 | /* | 2855 | /* |
2849 | * Returns 1 if the page's buffers were dropped. The page is locked. | 2856 | * Returns 1 if the page's buffers were dropped. The page is locked. |
2850 | * | 2857 | * |
2851 | * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads | 2858 | * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads |
2852 | * in the buffers at page_buffers(page). | 2859 | * in the buffers at page_buffers(page). |
2853 | * | 2860 | * |
2854 | * even in -o notail mode, we can't be sure an old mount without -o notail | 2861 | * even in -o notail mode, we can't be sure an old mount without -o notail |
2855 | * didn't create files with tails. | 2862 | * didn't create files with tails. |
2856 | */ | 2863 | */ |
2857 | static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags) | 2864 | static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags) |
2858 | { | 2865 | { |
2859 | struct inode *inode = page->mapping->host; | 2866 | struct inode *inode = page->mapping->host; |
2860 | struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); | 2867 | struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); |
2861 | struct buffer_head *head; | 2868 | struct buffer_head *head; |
2862 | struct buffer_head *bh; | 2869 | struct buffer_head *bh; |
2863 | int ret = 1; | 2870 | int ret = 1; |
2864 | 2871 | ||
2865 | WARN_ON(PageChecked(page)); | 2872 | WARN_ON(PageChecked(page)); |
2866 | spin_lock(&j->j_dirty_buffers_lock); | 2873 | spin_lock(&j->j_dirty_buffers_lock); |
2867 | head = page_buffers(page); | 2874 | head = page_buffers(page); |
2868 | bh = head; | 2875 | bh = head; |
2869 | do { | 2876 | do { |
2870 | if (bh->b_private) { | 2877 | if (bh->b_private) { |
2871 | if (!buffer_dirty(bh) && !buffer_locked(bh)) { | 2878 | if (!buffer_dirty(bh) && !buffer_locked(bh)) { |
2872 | reiserfs_free_jh(bh); | 2879 | reiserfs_free_jh(bh); |
2873 | } else { | 2880 | } else { |
2874 | ret = 0; | 2881 | ret = 0; |
2875 | break; | 2882 | break; |
2876 | } | 2883 | } |
2877 | } | 2884 | } |
2878 | bh = bh->b_this_page; | 2885 | bh = bh->b_this_page; |
2879 | } while (bh != head); | 2886 | } while (bh != head); |
2880 | if (ret) | 2887 | if (ret) |
2881 | ret = try_to_free_buffers(page); | 2888 | ret = try_to_free_buffers(page); |
2882 | spin_unlock(&j->j_dirty_buffers_lock); | 2889 | spin_unlock(&j->j_dirty_buffers_lock); |
2883 | return ret; | 2890 | return ret; |
2884 | } | 2891 | } |
2885 | 2892 | ||
2886 | /* We thank Mingming Cao for helping us understand in great detail what | 2893 | /* We thank Mingming Cao for helping us understand in great detail what |
2887 | to do in this section of the code. */ | 2894 | to do in this section of the code. */ |
2888 | static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb, | 2895 | static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb, |
2889 | const struct iovec *iov, loff_t offset, | 2896 | const struct iovec *iov, loff_t offset, |
2890 | unsigned long nr_segs) | 2897 | unsigned long nr_segs) |
2891 | { | 2898 | { |
2892 | struct file *file = iocb->ki_filp; | 2899 | struct file *file = iocb->ki_filp; |
2893 | struct inode *inode = file->f_mapping->host; | 2900 | struct inode *inode = file->f_mapping->host; |
2894 | 2901 | ||
2895 | return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, | 2902 | return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, |
2896 | offset, nr_segs, | 2903 | offset, nr_segs, |
2897 | reiserfs_get_blocks_direct_io, NULL); | 2904 | reiserfs_get_blocks_direct_io, NULL); |
2898 | } | 2905 | } |
2899 | 2906 | ||
2900 | int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) | 2907 | int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) |
2901 | { | 2908 | { |
2902 | struct inode *inode = dentry->d_inode; | 2909 | struct inode *inode = dentry->d_inode; |
2903 | int error; | 2910 | int error; |
2904 | unsigned int ia_valid = attr->ia_valid; | 2911 | unsigned int ia_valid = attr->ia_valid; |
2905 | reiserfs_write_lock(inode->i_sb); | 2912 | reiserfs_write_lock(inode->i_sb); |
2906 | if (attr->ia_valid & ATTR_SIZE) { | 2913 | if (attr->ia_valid & ATTR_SIZE) { |
2907 | /* version 2 items will be caught by the s_maxbytes check | 2914 | /* version 2 items will be caught by the s_maxbytes check |
2908 | ** done for us in vmtruncate | 2915 | ** done for us in vmtruncate |
2909 | */ | 2916 | */ |
2910 | if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 && | 2917 | if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 && |
2911 | attr->ia_size > MAX_NON_LFS) { | 2918 | attr->ia_size > MAX_NON_LFS) { |
2912 | error = -EFBIG; | 2919 | error = -EFBIG; |
2913 | goto out; | 2920 | goto out; |
2914 | } | 2921 | } |
2915 | /* fill in hole pointers in the expanding truncate case. */ | 2922 | /* fill in hole pointers in the expanding truncate case. */ |
2916 | if (attr->ia_size > inode->i_size) { | 2923 | if (attr->ia_size > inode->i_size) { |
2917 | error = generic_cont_expand(inode, attr->ia_size); | 2924 | error = generic_cont_expand(inode, attr->ia_size); |
2918 | if (REISERFS_I(inode)->i_prealloc_count > 0) { | 2925 | if (REISERFS_I(inode)->i_prealloc_count > 0) { |
2919 | int err; | 2926 | int err; |
2920 | struct reiserfs_transaction_handle th; | 2927 | struct reiserfs_transaction_handle th; |
2921 | /* we're changing at most 2 bitmaps, inode + super */ | 2928 | /* we're changing at most 2 bitmaps, inode + super */ |
2922 | err = journal_begin(&th, inode->i_sb, 4); | 2929 | err = journal_begin(&th, inode->i_sb, 4); |
2923 | if (!err) { | 2930 | if (!err) { |
2924 | reiserfs_discard_prealloc(&th, inode); | 2931 | reiserfs_discard_prealloc(&th, inode); |
2925 | err = journal_end(&th, inode->i_sb, 4); | 2932 | err = journal_end(&th, inode->i_sb, 4); |
2926 | } | 2933 | } |
2927 | if (err) | 2934 | if (err) |
2928 | error = err; | 2935 | error = err; |
2929 | } | 2936 | } |
2930 | if (error) | 2937 | if (error) |
2931 | goto out; | 2938 | goto out; |
2932 | } | 2939 | } |
2933 | } | 2940 | } |
2934 | 2941 | ||
2935 | if ((((attr->ia_valid & ATTR_UID) && (attr->ia_uid & ~0xffff)) || | 2942 | if ((((attr->ia_valid & ATTR_UID) && (attr->ia_uid & ~0xffff)) || |
2936 | ((attr->ia_valid & ATTR_GID) && (attr->ia_gid & ~0xffff))) && | 2943 | ((attr->ia_valid & ATTR_GID) && (attr->ia_gid & ~0xffff))) && |
2937 | (get_inode_sd_version(inode) == STAT_DATA_V1)) { | 2944 | (get_inode_sd_version(inode) == STAT_DATA_V1)) { |
2938 | /* stat data of format v3.5 has 16 bit uid and gid */ | 2945 | /* stat data of format v3.5 has 16 bit uid and gid */ |
2939 | error = -EINVAL; | 2946 | error = -EINVAL; |
2940 | goto out; | 2947 | goto out; |
2941 | } | 2948 | } |
2942 | 2949 | ||
2943 | error = inode_change_ok(inode, attr); | 2950 | error = inode_change_ok(inode, attr); |
2944 | if (!error) { | 2951 | if (!error) { |
2945 | if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || | 2952 | if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || |
2946 | (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { | 2953 | (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { |
2947 | error = reiserfs_chown_xattrs(inode, attr); | 2954 | error = reiserfs_chown_xattrs(inode, attr); |
2948 | 2955 | ||
2949 | if (!error) { | 2956 | if (!error) { |
2950 | struct reiserfs_transaction_handle th; | 2957 | struct reiserfs_transaction_handle th; |
2951 | int jbegin_count = | 2958 | int jbegin_count = |
2952 | 2 * | 2959 | 2 * |
2953 | (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) + | 2960 | (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) + |
2954 | REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) + | 2961 | REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) + |
2955 | 2; | 2962 | 2; |
2956 | 2963 | ||
2957 | /* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */ | 2964 | /* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */ |
2958 | error = | 2965 | error = |
2959 | journal_begin(&th, inode->i_sb, | 2966 | journal_begin(&th, inode->i_sb, |
2960 | jbegin_count); | 2967 | jbegin_count); |
2961 | if (error) | 2968 | if (error) |
2962 | goto out; | 2969 | goto out; |
2963 | error = | 2970 | error = |
2964 | DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0; | 2971 | DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0; |
2965 | if (error) { | 2972 | if (error) { |
2966 | journal_end(&th, inode->i_sb, | 2973 | journal_end(&th, inode->i_sb, |
2967 | jbegin_count); | 2974 | jbegin_count); |
2968 | goto out; | 2975 | goto out; |
2969 | } | 2976 | } |
2970 | /* Update corresponding info in inode so that everything is in | 2977 | /* Update corresponding info in inode so that everything is in |
2971 | * one transaction */ | 2978 | * one transaction */ |
2972 | if (attr->ia_valid & ATTR_UID) | 2979 | if (attr->ia_valid & ATTR_UID) |
2973 | inode->i_uid = attr->ia_uid; | 2980 | inode->i_uid = attr->ia_uid; |
2974 | if (attr->ia_valid & ATTR_GID) | 2981 | if (attr->ia_valid & ATTR_GID) |
2975 | inode->i_gid = attr->ia_gid; | 2982 | inode->i_gid = attr->ia_gid; |
2976 | mark_inode_dirty(inode); | 2983 | mark_inode_dirty(inode); |
2977 | error = | 2984 | error = |
2978 | journal_end(&th, inode->i_sb, jbegin_count); | 2985 | journal_end(&th, inode->i_sb, jbegin_count); |
2979 | } | 2986 | } |
2980 | } | 2987 | } |
2981 | if (!error) | 2988 | if (!error) |
2982 | error = inode_setattr(inode, attr); | 2989 | error = inode_setattr(inode, attr); |
2983 | } | 2990 | } |
2984 | 2991 | ||
2985 | if (!error && reiserfs_posixacl(inode->i_sb)) { | 2992 | if (!error && reiserfs_posixacl(inode->i_sb)) { |
2986 | if (attr->ia_valid & ATTR_MODE) | 2993 | if (attr->ia_valid & ATTR_MODE) |
2987 | error = reiserfs_acl_chmod(inode); | 2994 | error = reiserfs_acl_chmod(inode); |
2988 | } | 2995 | } |
2989 | 2996 | ||
2990 | out: | 2997 | out: |
2991 | reiserfs_write_unlock(inode->i_sb); | 2998 | reiserfs_write_unlock(inode->i_sb); |
2992 | return error; | 2999 | return error; |
2993 | } | 3000 | } |
2994 | 3001 | ||
2995 | struct address_space_operations reiserfs_address_space_operations = { | 3002 | struct address_space_operations reiserfs_address_space_operations = { |
2996 | .writepage = reiserfs_writepage, | 3003 | .writepage = reiserfs_writepage, |
2997 | .readpage = reiserfs_readpage, | 3004 | .readpage = reiserfs_readpage, |
2998 | .readpages = reiserfs_readpages, | 3005 | .readpages = reiserfs_readpages, |
2999 | .releasepage = reiserfs_releasepage, | 3006 | .releasepage = reiserfs_releasepage, |
3000 | .invalidatepage = reiserfs_invalidatepage, | 3007 | .invalidatepage = reiserfs_invalidatepage, |
3001 | .sync_page = block_sync_page, | 3008 | .sync_page = block_sync_page, |
3002 | .prepare_write = reiserfs_prepare_write, | 3009 | .prepare_write = reiserfs_prepare_write, |
3003 | .commit_write = reiserfs_commit_write, | 3010 | .commit_write = reiserfs_commit_write, |
3004 | .bmap = reiserfs_aop_bmap, | 3011 | .bmap = reiserfs_aop_bmap, |
3005 | .direct_IO = reiserfs_direct_IO, | 3012 | .direct_IO = reiserfs_direct_IO, |
3006 | .set_page_dirty = reiserfs_set_page_dirty, | 3013 | .set_page_dirty = reiserfs_set_page_dirty, |
3007 | }; | 3014 | }; |
3008 | 3015 |
fs/reiserfs/journal.c
1 | /* | 1 | /* |
2 | ** Write ahead logging implementation copyright Chris Mason 2000 | 2 | ** Write ahead logging implementation copyright Chris Mason 2000 |
3 | ** | 3 | ** |
4 | ** The background commits make this code very interelated, and | 4 | ** The background commits make this code very interelated, and |
5 | ** overly complex. I need to rethink things a bit....The major players: | 5 | ** overly complex. I need to rethink things a bit....The major players: |
6 | ** | 6 | ** |
7 | ** journal_begin -- call with the number of blocks you expect to log. | 7 | ** journal_begin -- call with the number of blocks you expect to log. |
8 | ** If the current transaction is too | 8 | ** If the current transaction is too |
9 | ** old, it will block until the current transaction is | 9 | ** old, it will block until the current transaction is |
10 | ** finished, and then start a new one. | 10 | ** finished, and then start a new one. |
11 | ** Usually, your transaction will get joined in with | 11 | ** Usually, your transaction will get joined in with |
12 | ** previous ones for speed. | 12 | ** previous ones for speed. |
13 | ** | 13 | ** |
14 | ** journal_join -- same as journal_begin, but won't block on the current | 14 | ** journal_join -- same as journal_begin, but won't block on the current |
15 | ** transaction regardless of age. Don't ever call | 15 | ** transaction regardless of age. Don't ever call |
16 | ** this. Ever. There are only two places it should be | 16 | ** this. Ever. There are only two places it should be |
17 | ** called from, and they are both inside this file. | 17 | ** called from, and they are both inside this file. |
18 | ** | 18 | ** |
19 | ** journal_mark_dirty -- adds blocks into this transaction. clears any flags | 19 | ** journal_mark_dirty -- adds blocks into this transaction. clears any flags |
20 | ** that might make them get sent to disk | 20 | ** that might make them get sent to disk |
21 | ** and then marks them BH_JDirty. Puts the buffer head | 21 | ** and then marks them BH_JDirty. Puts the buffer head |
22 | ** into the current transaction hash. | 22 | ** into the current transaction hash. |
23 | ** | 23 | ** |
24 | ** journal_end -- if the current transaction is batchable, it does nothing | 24 | ** journal_end -- if the current transaction is batchable, it does nothing |
25 | ** otherwise, it could do an async/synchronous commit, or | 25 | ** otherwise, it could do an async/synchronous commit, or |
26 | ** a full flush of all log and real blocks in the | 26 | ** a full flush of all log and real blocks in the |
27 | ** transaction. | 27 | ** transaction. |
28 | ** | 28 | ** |
29 | ** flush_old_commits -- if the current transaction is too old, it is ended and | 29 | ** flush_old_commits -- if the current transaction is too old, it is ended and |
30 | ** commit blocks are sent to disk. Forces commit blocks | 30 | ** commit blocks are sent to disk. Forces commit blocks |
31 | ** to disk for all backgrounded commits that have been | 31 | ** to disk for all backgrounded commits that have been |
32 | ** around too long. | 32 | ** around too long. |
33 | ** -- Note, if you call this as an immediate flush from | 33 | ** -- Note, if you call this as an immediate flush from |
34 | ** from within kupdate, it will ignore the immediate flag | 34 | ** from within kupdate, it will ignore the immediate flag |
35 | */ | 35 | */ |
36 | 36 | ||
37 | #include <linux/config.h> | 37 | #include <linux/config.h> |
38 | #include <asm/uaccess.h> | 38 | #include <asm/uaccess.h> |
39 | #include <asm/system.h> | 39 | #include <asm/system.h> |
40 | 40 | ||
41 | #include <linux/time.h> | 41 | #include <linux/time.h> |
42 | #include <asm/semaphore.h> | 42 | #include <asm/semaphore.h> |
43 | 43 | ||
44 | #include <linux/vmalloc.h> | 44 | #include <linux/vmalloc.h> |
45 | #include <linux/reiserfs_fs.h> | 45 | #include <linux/reiserfs_fs.h> |
46 | 46 | ||
47 | #include <linux/kernel.h> | 47 | #include <linux/kernel.h> |
48 | #include <linux/errno.h> | 48 | #include <linux/errno.h> |
49 | #include <linux/fcntl.h> | 49 | #include <linux/fcntl.h> |
50 | #include <linux/stat.h> | 50 | #include <linux/stat.h> |
51 | #include <linux/string.h> | 51 | #include <linux/string.h> |
52 | #include <linux/smp_lock.h> | 52 | #include <linux/smp_lock.h> |
53 | #include <linux/buffer_head.h> | 53 | #include <linux/buffer_head.h> |
54 | #include <linux/workqueue.h> | 54 | #include <linux/workqueue.h> |
55 | #include <linux/writeback.h> | 55 | #include <linux/writeback.h> |
56 | #include <linux/blkdev.h> | 56 | #include <linux/blkdev.h> |
57 | 57 | ||
58 | /* gets a struct reiserfs_journal_list * from a list head */ | 58 | /* gets a struct reiserfs_journal_list * from a list head */ |
59 | #define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ | 59 | #define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ |
60 | j_list)) | 60 | j_list)) |
61 | #define JOURNAL_WORK_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ | 61 | #define JOURNAL_WORK_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ |
62 | j_working_list)) | 62 | j_working_list)) |
63 | 63 | ||
64 | /* the number of mounted filesystems. This is used to decide when to | 64 | /* the number of mounted filesystems. This is used to decide when to |
65 | ** start and kill the commit workqueue | 65 | ** start and kill the commit workqueue |
66 | */ | 66 | */ |
67 | static int reiserfs_mounted_fs_count; | 67 | static int reiserfs_mounted_fs_count; |
68 | 68 | ||
69 | static struct workqueue_struct *commit_wq; | 69 | static struct workqueue_struct *commit_wq; |
70 | 70 | ||
71 | #define JOURNAL_TRANS_HALF 1018 /* must be correct to keep the desc and commit | 71 | #define JOURNAL_TRANS_HALF 1018 /* must be correct to keep the desc and commit |
72 | structs at 4k */ | 72 | structs at 4k */ |
73 | #define BUFNR 64 /*read ahead */ | 73 | #define BUFNR 64 /*read ahead */ |
74 | 74 | ||
75 | /* cnode stat bits. Move these into reiserfs_fs.h */ | 75 | /* cnode stat bits. Move these into reiserfs_fs.h */ |
76 | 76 | ||
77 | #define BLOCK_FREED 2 /* this block was freed, and can't be written. */ | 77 | #define BLOCK_FREED 2 /* this block was freed, and can't be written. */ |
78 | #define BLOCK_FREED_HOLDER 3 /* this block was freed during this transaction, and can't be written */ | 78 | #define BLOCK_FREED_HOLDER 3 /* this block was freed during this transaction, and can't be written */ |
79 | 79 | ||
80 | #define BLOCK_NEEDS_FLUSH 4 /* used in flush_journal_list */ | 80 | #define BLOCK_NEEDS_FLUSH 4 /* used in flush_journal_list */ |
81 | #define BLOCK_DIRTIED 5 | 81 | #define BLOCK_DIRTIED 5 |
82 | 82 | ||
83 | /* journal list state bits */ | 83 | /* journal list state bits */ |
84 | #define LIST_TOUCHED 1 | 84 | #define LIST_TOUCHED 1 |
85 | #define LIST_DIRTY 2 | 85 | #define LIST_DIRTY 2 |
86 | #define LIST_COMMIT_PENDING 4 /* someone will commit this list */ | 86 | #define LIST_COMMIT_PENDING 4 /* someone will commit this list */ |
87 | 87 | ||
88 | /* flags for do_journal_end */ | 88 | /* flags for do_journal_end */ |
89 | #define FLUSH_ALL 1 /* flush commit and real blocks */ | 89 | #define FLUSH_ALL 1 /* flush commit and real blocks */ |
90 | #define COMMIT_NOW 2 /* end and commit this transaction */ | 90 | #define COMMIT_NOW 2 /* end and commit this transaction */ |
91 | #define WAIT 4 /* wait for the log blocks to hit the disk */ | 91 | #define WAIT 4 /* wait for the log blocks to hit the disk */ |
92 | 92 | ||
93 | static int do_journal_end(struct reiserfs_transaction_handle *, | 93 | static int do_journal_end(struct reiserfs_transaction_handle *, |
94 | struct super_block *, unsigned long nblocks, | 94 | struct super_block *, unsigned long nblocks, |
95 | int flags); | 95 | int flags); |
96 | static int flush_journal_list(struct super_block *s, | 96 | static int flush_journal_list(struct super_block *s, |
97 | struct reiserfs_journal_list *jl, int flushall); | 97 | struct reiserfs_journal_list *jl, int flushall); |
98 | static int flush_commit_list(struct super_block *s, | 98 | static int flush_commit_list(struct super_block *s, |
99 | struct reiserfs_journal_list *jl, int flushall); | 99 | struct reiserfs_journal_list *jl, int flushall); |
100 | static int can_dirty(struct reiserfs_journal_cnode *cn); | 100 | static int can_dirty(struct reiserfs_journal_cnode *cn); |
101 | static int journal_join(struct reiserfs_transaction_handle *th, | 101 | static int journal_join(struct reiserfs_transaction_handle *th, |
102 | struct super_block *p_s_sb, unsigned long nblocks); | 102 | struct super_block *p_s_sb, unsigned long nblocks); |
103 | static int release_journal_dev(struct super_block *super, | 103 | static int release_journal_dev(struct super_block *super, |
104 | struct reiserfs_journal *journal); | 104 | struct reiserfs_journal *journal); |
105 | static int dirty_one_transaction(struct super_block *s, | 105 | static int dirty_one_transaction(struct super_block *s, |
106 | struct reiserfs_journal_list *jl); | 106 | struct reiserfs_journal_list *jl); |
107 | static void flush_async_commits(void *p); | 107 | static void flush_async_commits(void *p); |
108 | static void queue_log_writer(struct super_block *s); | 108 | static void queue_log_writer(struct super_block *s); |
109 | 109 | ||
110 | /* values for join in do_journal_begin_r */ | 110 | /* values for join in do_journal_begin_r */ |
111 | enum { | 111 | enum { |
112 | JBEGIN_REG = 0, /* regular journal begin */ | 112 | JBEGIN_REG = 0, /* regular journal begin */ |
113 | JBEGIN_JOIN = 1, /* join the running transaction if at all possible */ | 113 | JBEGIN_JOIN = 1, /* join the running transaction if at all possible */ |
114 | JBEGIN_ABORT = 2, /* called from cleanup code, ignores aborted flag */ | 114 | JBEGIN_ABORT = 2, /* called from cleanup code, ignores aborted flag */ |
115 | }; | 115 | }; |
116 | 116 | ||
117 | static int do_journal_begin_r(struct reiserfs_transaction_handle *th, | 117 | static int do_journal_begin_r(struct reiserfs_transaction_handle *th, |
118 | struct super_block *p_s_sb, | 118 | struct super_block *p_s_sb, |
119 | unsigned long nblocks, int join); | 119 | unsigned long nblocks, int join); |
120 | 120 | ||
/* Zero the running transaction's cnode hash table (JOURNAL_HASH_SIZE
** buckets of cnode pointers) so a fresh transaction starts with no
** blocks hashed.
*/
static void init_journal_hash(struct super_block *p_s_sb)
{
	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
	memset(journal->j_hash_table, 0,
	       JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *));
}
127 | 127 | ||
128 | /* | 128 | /* |
129 | ** clears BH_Dirty and sticks the buffer on the clean list. Called because I can't allow refile_buffer to | 129 | ** clears BH_Dirty and sticks the buffer on the clean list. Called because I can't allow refile_buffer to |
130 | ** make schedule happen after I've freed a block. Look at remove_from_transaction and journal_mark_freed for | 130 | ** make schedule happen after I've freed a block. Look at remove_from_transaction and journal_mark_freed for |
131 | ** more details. | 131 | ** more details. |
132 | */ | 132 | */ |
133 | static int reiserfs_clean_and_file_buffer(struct buffer_head *bh) | 133 | static int reiserfs_clean_and_file_buffer(struct buffer_head *bh) |
134 | { | 134 | { |
135 | if (bh) { | 135 | if (bh) { |
136 | clear_buffer_dirty(bh); | 136 | clear_buffer_dirty(bh); |
137 | clear_buffer_journal_test(bh); | 137 | clear_buffer_journal_test(bh); |
138 | } | 138 | } |
139 | return 0; | 139 | return 0; |
140 | } | 140 | } |
141 | 141 | ||
142 | static void disable_barrier(struct super_block *s) | 142 | static void disable_barrier(struct super_block *s) |
143 | { | 143 | { |
144 | REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_BARRIER_FLUSH); | 144 | REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_BARRIER_FLUSH); |
145 | printk("reiserfs: disabling flush barriers on %s\n", | 145 | printk("reiserfs: disabling flush barriers on %s\n", |
146 | reiserfs_bdevname(s)); | 146 | reiserfs_bdevname(s)); |
147 | } | 147 | } |
148 | 148 | ||
149 | static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block | 149 | static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block |
150 | *p_s_sb) | 150 | *p_s_sb) |
151 | { | 151 | { |
152 | struct reiserfs_bitmap_node *bn; | 152 | struct reiserfs_bitmap_node *bn; |
153 | static int id; | 153 | static int id; |
154 | 154 | ||
155 | bn = kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS); | 155 | bn = kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS); |
156 | if (!bn) { | 156 | if (!bn) { |
157 | return NULL; | 157 | return NULL; |
158 | } | 158 | } |
159 | bn->data = kzalloc(p_s_sb->s_blocksize, GFP_NOFS); | 159 | bn->data = kzalloc(p_s_sb->s_blocksize, GFP_NOFS); |
160 | if (!bn->data) { | 160 | if (!bn->data) { |
161 | kfree(bn); | 161 | kfree(bn); |
162 | return NULL; | 162 | return NULL; |
163 | } | 163 | } |
164 | bn->id = id++; | 164 | bn->id = id++; |
165 | INIT_LIST_HEAD(&bn->list); | 165 | INIT_LIST_HEAD(&bn->list); |
166 | return bn; | 166 | return bn; |
167 | } | 167 | } |
168 | 168 | ||
/*
** Hand out a zeroed bitmap node.  Nodes are recycled through
** journal->j_bitmap_nodes; when that cache is empty we allocate a
** fresh one, yielding and retrying forever — this path is not allowed
** to fail.
*/
static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *p_s_sb)
{
	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
	struct reiserfs_bitmap_node *bn = NULL;
	struct list_head *entry = journal->j_bitmap_nodes.next;

	journal->j_used_bitmap_nodes++;
      repeat:

	if (entry != &journal->j_bitmap_nodes) {
		/* reuse a cached node; its data block may be stale and
		** must be re-zeroed before use */
		bn = list_entry(entry, struct reiserfs_bitmap_node, list);
		list_del(entry);
		memset(bn->data, 0, p_s_sb->s_blocksize);
		journal->j_free_bitmap_nodes--;
		return bn;
	}
	/* cache was empty: allocate, looping until it succeeds.
	** NOTE(review): 'entry' is not re-read after the retry, so the
	** loop never re-checks the free list — confirm this is safe
	** under the locking in effect here (BKL era). */
	bn = allocate_bitmap_node(p_s_sb);
	if (!bn) {
		yield();
		goto repeat;
	}
	return bn;
}
192 | static inline void free_bitmap_node(struct super_block *p_s_sb, | 192 | static inline void free_bitmap_node(struct super_block *p_s_sb, |
193 | struct reiserfs_bitmap_node *bn) | 193 | struct reiserfs_bitmap_node *bn) |
194 | { | 194 | { |
195 | struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); | 195 | struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); |
196 | journal->j_used_bitmap_nodes--; | 196 | journal->j_used_bitmap_nodes--; |
197 | if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) { | 197 | if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) { |
198 | kfree(bn->data); | 198 | kfree(bn->data); |
199 | kfree(bn); | 199 | kfree(bn); |
200 | } else { | 200 | } else { |
201 | list_add(&bn->list, &journal->j_bitmap_nodes); | 201 | list_add(&bn->list, &journal->j_bitmap_nodes); |
202 | journal->j_free_bitmap_nodes++; | 202 | journal->j_free_bitmap_nodes++; |
203 | } | 203 | } |
204 | } | 204 | } |
205 | 205 | ||
206 | static void allocate_bitmap_nodes(struct super_block *p_s_sb) | 206 | static void allocate_bitmap_nodes(struct super_block *p_s_sb) |
207 | { | 207 | { |
208 | int i; | 208 | int i; |
209 | struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); | 209 | struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); |
210 | struct reiserfs_bitmap_node *bn = NULL; | 210 | struct reiserfs_bitmap_node *bn = NULL; |
211 | for (i = 0; i < REISERFS_MIN_BITMAP_NODES; i++) { | 211 | for (i = 0; i < REISERFS_MIN_BITMAP_NODES; i++) { |
212 | bn = allocate_bitmap_node(p_s_sb); | 212 | bn = allocate_bitmap_node(p_s_sb); |
213 | if (bn) { | 213 | if (bn) { |
214 | list_add(&bn->list, &journal->j_bitmap_nodes); | 214 | list_add(&bn->list, &journal->j_bitmap_nodes); |
215 | journal->j_free_bitmap_nodes++; | 215 | journal->j_free_bitmap_nodes++; |
216 | } else { | 216 | } else { |
217 | break; // this is ok, we'll try again when more are needed | 217 | break; // this is ok, we'll try again when more are needed |
218 | } | 218 | } |
219 | } | 219 | } |
220 | } | 220 | } |
221 | 221 | ||
222 | static int set_bit_in_list_bitmap(struct super_block *p_s_sb, int block, | 222 | static int set_bit_in_list_bitmap(struct super_block *p_s_sb, int block, |
223 | struct reiserfs_list_bitmap *jb) | 223 | struct reiserfs_list_bitmap *jb) |
224 | { | 224 | { |
225 | int bmap_nr = block / (p_s_sb->s_blocksize << 3); | 225 | int bmap_nr = block / (p_s_sb->s_blocksize << 3); |
226 | int bit_nr = block % (p_s_sb->s_blocksize << 3); | 226 | int bit_nr = block % (p_s_sb->s_blocksize << 3); |
227 | 227 | ||
228 | if (!jb->bitmaps[bmap_nr]) { | 228 | if (!jb->bitmaps[bmap_nr]) { |
229 | jb->bitmaps[bmap_nr] = get_bitmap_node(p_s_sb); | 229 | jb->bitmaps[bmap_nr] = get_bitmap_node(p_s_sb); |
230 | } | 230 | } |
231 | set_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data); | 231 | set_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data); |
232 | return 0; | 232 | return 0; |
233 | } | 233 | } |
234 | 234 | ||
235 | static void cleanup_bitmap_list(struct super_block *p_s_sb, | 235 | static void cleanup_bitmap_list(struct super_block *p_s_sb, |
236 | struct reiserfs_list_bitmap *jb) | 236 | struct reiserfs_list_bitmap *jb) |
237 | { | 237 | { |
238 | int i; | 238 | int i; |
239 | if (jb->bitmaps == NULL) | 239 | if (jb->bitmaps == NULL) |
240 | return; | 240 | return; |
241 | 241 | ||
242 | for (i = 0; i < SB_BMAP_NR(p_s_sb); i++) { | 242 | for (i = 0; i < SB_BMAP_NR(p_s_sb); i++) { |
243 | if (jb->bitmaps[i]) { | 243 | if (jb->bitmaps[i]) { |
244 | free_bitmap_node(p_s_sb, jb->bitmaps[i]); | 244 | free_bitmap_node(p_s_sb, jb->bitmaps[i]); |
245 | jb->bitmaps[i] = NULL; | 245 | jb->bitmaps[i] = NULL; |
246 | } | 246 | } |
247 | } | 247 | } |
248 | } | 248 | } |
249 | 249 | ||
250 | /* | 250 | /* |
251 | ** only call this on FS unmount. | 251 | ** only call this on FS unmount. |
252 | */ | 252 | */ |
253 | static int free_list_bitmaps(struct super_block *p_s_sb, | 253 | static int free_list_bitmaps(struct super_block *p_s_sb, |
254 | struct reiserfs_list_bitmap *jb_array) | 254 | struct reiserfs_list_bitmap *jb_array) |
255 | { | 255 | { |
256 | int i; | 256 | int i; |
257 | struct reiserfs_list_bitmap *jb; | 257 | struct reiserfs_list_bitmap *jb; |
258 | for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { | 258 | for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { |
259 | jb = jb_array + i; | 259 | jb = jb_array + i; |
260 | jb->journal_list = NULL; | 260 | jb->journal_list = NULL; |
261 | cleanup_bitmap_list(p_s_sb, jb); | 261 | cleanup_bitmap_list(p_s_sb, jb); |
262 | vfree(jb->bitmaps); | 262 | vfree(jb->bitmaps); |
263 | jb->bitmaps = NULL; | 263 | jb->bitmaps = NULL; |
264 | } | 264 | } |
265 | return 0; | 265 | return 0; |
266 | } | 266 | } |
267 | 267 | ||
268 | static int free_bitmap_nodes(struct super_block *p_s_sb) | 268 | static int free_bitmap_nodes(struct super_block *p_s_sb) |
269 | { | 269 | { |
270 | struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); | 270 | struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); |
271 | struct list_head *next = journal->j_bitmap_nodes.next; | 271 | struct list_head *next = journal->j_bitmap_nodes.next; |
272 | struct reiserfs_bitmap_node *bn; | 272 | struct reiserfs_bitmap_node *bn; |
273 | 273 | ||
274 | while (next != &journal->j_bitmap_nodes) { | 274 | while (next != &journal->j_bitmap_nodes) { |
275 | bn = list_entry(next, struct reiserfs_bitmap_node, list); | 275 | bn = list_entry(next, struct reiserfs_bitmap_node, list); |
276 | list_del(next); | 276 | list_del(next); |
277 | kfree(bn->data); | 277 | kfree(bn->data); |
278 | kfree(bn); | 278 | kfree(bn); |
279 | next = journal->j_bitmap_nodes.next; | 279 | next = journal->j_bitmap_nodes.next; |
280 | journal->j_free_bitmap_nodes--; | 280 | journal->j_free_bitmap_nodes--; |
281 | } | 281 | } |
282 | 282 | ||
283 | return 0; | 283 | return 0; |
284 | } | 284 | } |
285 | 285 | ||
286 | /* | 286 | /* |
287 | ** get memory for JOURNAL_NUM_BITMAPS worth of bitmaps. | 287 | ** get memory for JOURNAL_NUM_BITMAPS worth of bitmaps. |
288 | ** jb_array is the array to be filled in. | 288 | ** jb_array is the array to be filled in. |
289 | */ | 289 | */ |
290 | int reiserfs_allocate_list_bitmaps(struct super_block *p_s_sb, | 290 | int reiserfs_allocate_list_bitmaps(struct super_block *p_s_sb, |
291 | struct reiserfs_list_bitmap *jb_array, | 291 | struct reiserfs_list_bitmap *jb_array, |
292 | int bmap_nr) | 292 | int bmap_nr) |
293 | { | 293 | { |
294 | int i; | 294 | int i; |
295 | int failed = 0; | 295 | int failed = 0; |
296 | struct reiserfs_list_bitmap *jb; | 296 | struct reiserfs_list_bitmap *jb; |
297 | int mem = bmap_nr * sizeof(struct reiserfs_bitmap_node *); | 297 | int mem = bmap_nr * sizeof(struct reiserfs_bitmap_node *); |
298 | 298 | ||
299 | for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { | 299 | for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { |
300 | jb = jb_array + i; | 300 | jb = jb_array + i; |
301 | jb->journal_list = NULL; | 301 | jb->journal_list = NULL; |
302 | jb->bitmaps = vmalloc(mem); | 302 | jb->bitmaps = vmalloc(mem); |
303 | if (!jb->bitmaps) { | 303 | if (!jb->bitmaps) { |
304 | reiserfs_warning(p_s_sb, | 304 | reiserfs_warning(p_s_sb, |
305 | "clm-2000, unable to allocate bitmaps for journal lists"); | 305 | "clm-2000, unable to allocate bitmaps for journal lists"); |
306 | failed = 1; | 306 | failed = 1; |
307 | break; | 307 | break; |
308 | } | 308 | } |
309 | memset(jb->bitmaps, 0, mem); | 309 | memset(jb->bitmaps, 0, mem); |
310 | } | 310 | } |
311 | if (failed) { | 311 | if (failed) { |
312 | free_list_bitmaps(p_s_sb, jb_array); | 312 | free_list_bitmaps(p_s_sb, jb_array); |
313 | return -1; | 313 | return -1; |
314 | } | 314 | } |
315 | return 0; | 315 | return 0; |
316 | } | 316 | } |
317 | 317 | ||
318 | /* | 318 | /* |
319 | ** find an available list bitmap. If you can't find one, flush a commit list | 319 | ** find an available list bitmap. If you can't find one, flush a commit list |
320 | ** and try again | 320 | ** and try again |
321 | */ | 321 | */ |
/*
** Find an available list bitmap for journal list 'jl'.  Cycles through
** the JOURNAL_NUM_BITMAPS slots (up to three full passes, advancing
** j_list_bitmap_index round-robin).  A slot still owned by another
** journal list has that list's commit flushed, which should release
** the bitmap; if even that fails, NULL is returned.
*/
static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *p_s_sb,
						    struct reiserfs_journal_list
						    *jl)
{
	int i, j;
	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
	struct reiserfs_list_bitmap *jb = NULL;

	for (j = 0; j < (JOURNAL_NUM_BITMAPS * 3); j++) {
		i = journal->j_list_bitmap_index;
		journal->j_list_bitmap_index = (i + 1) % JOURNAL_NUM_BITMAPS;
		jb = journal->j_list_bitmap + i;
		if (journal->j_list_bitmap[i].journal_list) {
			/* busy slot: flushing the owning list's commit
			** clears journal_list when it completes */
			flush_commit_list(p_s_sb,
					  journal->j_list_bitmap[i].
					  journal_list, 1);
			if (!journal->j_list_bitmap[i].journal_list) {
				break;
			}
		} else {
			break;
		}
	}
	if (jb->journal_list) {	/* double check to make sure if flushed correctly */
		return NULL;
	}
	jb->journal_list = jl;
	return jb;
}
351 | 351 | ||
352 | /* | 352 | /* |
353 | ** allocates a new chunk of X nodes, and links them all together as a list. | 353 | ** allocates a new chunk of X nodes, and links them all together as a list. |
354 | ** Uses the cnode->next and cnode->prev pointers | 354 | ** Uses the cnode->next and cnode->prev pointers |
355 | ** returns NULL on failure | 355 | ** returns NULL on failure |
356 | */ | 356 | */ |
357 | static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes) | 357 | static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes) |
358 | { | 358 | { |
359 | struct reiserfs_journal_cnode *head; | 359 | struct reiserfs_journal_cnode *head; |
360 | int i; | 360 | int i; |
361 | if (num_cnodes <= 0) { | 361 | if (num_cnodes <= 0) { |
362 | return NULL; | 362 | return NULL; |
363 | } | 363 | } |
364 | head = vmalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode)); | 364 | head = vmalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode)); |
365 | if (!head) { | 365 | if (!head) { |
366 | return NULL; | 366 | return NULL; |
367 | } | 367 | } |
368 | memset(head, 0, num_cnodes * sizeof(struct reiserfs_journal_cnode)); | 368 | memset(head, 0, num_cnodes * sizeof(struct reiserfs_journal_cnode)); |
369 | head[0].prev = NULL; | 369 | head[0].prev = NULL; |
370 | head[0].next = head + 1; | 370 | head[0].next = head + 1; |
371 | for (i = 1; i < num_cnodes; i++) { | 371 | for (i = 1; i < num_cnodes; i++) { |
372 | head[i].prev = head + (i - 1); | 372 | head[i].prev = head + (i - 1); |
373 | head[i].next = head + (i + 1); /* if last one, overwrite it after the if */ | 373 | head[i].next = head + (i + 1); /* if last one, overwrite it after the if */ |
374 | } | 374 | } |
375 | head[num_cnodes - 1].next = NULL; | 375 | head[num_cnodes - 1].next = NULL; |
376 | return head; | 376 | return head; |
377 | } | 377 | } |
378 | 378 | ||
379 | /* | 379 | /* |
380 | ** pulls a cnode off the free list, or returns NULL on failure | 380 | ** pulls a cnode off the free list, or returns NULL on failure |
381 | */ | 381 | */ |
382 | static struct reiserfs_journal_cnode *get_cnode(struct super_block *p_s_sb) | 382 | static struct reiserfs_journal_cnode *get_cnode(struct super_block *p_s_sb) |
383 | { | 383 | { |
384 | struct reiserfs_journal_cnode *cn; | 384 | struct reiserfs_journal_cnode *cn; |
385 | struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); | 385 | struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); |
386 | 386 | ||
387 | reiserfs_check_lock_depth(p_s_sb, "get_cnode"); | 387 | reiserfs_check_lock_depth(p_s_sb, "get_cnode"); |
388 | 388 | ||
389 | if (journal->j_cnode_free <= 0) { | 389 | if (journal->j_cnode_free <= 0) { |
390 | return NULL; | 390 | return NULL; |
391 | } | 391 | } |
392 | journal->j_cnode_used++; | 392 | journal->j_cnode_used++; |
393 | journal->j_cnode_free--; | 393 | journal->j_cnode_free--; |
394 | cn = journal->j_cnode_free_list; | 394 | cn = journal->j_cnode_free_list; |
395 | if (!cn) { | 395 | if (!cn) { |
396 | return cn; | 396 | return cn; |
397 | } | 397 | } |
398 | if (cn->next) { | 398 | if (cn->next) { |
399 | cn->next->prev = NULL; | 399 | cn->next->prev = NULL; |
400 | } | 400 | } |
401 | journal->j_cnode_free_list = cn->next; | 401 | journal->j_cnode_free_list = cn->next; |
402 | memset(cn, 0, sizeof(struct reiserfs_journal_cnode)); | 402 | memset(cn, 0, sizeof(struct reiserfs_journal_cnode)); |
403 | return cn; | 403 | return cn; |
404 | } | 404 | } |
405 | 405 | ||
406 | /* | 406 | /* |
407 | ** returns a cnode to the free list | 407 | ** returns a cnode to the free list |
408 | */ | 408 | */ |
409 | static void free_cnode(struct super_block *p_s_sb, | 409 | static void free_cnode(struct super_block *p_s_sb, |
410 | struct reiserfs_journal_cnode *cn) | 410 | struct reiserfs_journal_cnode *cn) |
411 | { | 411 | { |
412 | struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); | 412 | struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); |
413 | 413 | ||
414 | reiserfs_check_lock_depth(p_s_sb, "free_cnode"); | 414 | reiserfs_check_lock_depth(p_s_sb, "free_cnode"); |
415 | 415 | ||
416 | journal->j_cnode_used--; | 416 | journal->j_cnode_used--; |
417 | journal->j_cnode_free++; | 417 | journal->j_cnode_free++; |
418 | /* memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ; */ | 418 | /* memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ; */ |
419 | cn->next = journal->j_cnode_free_list; | 419 | cn->next = journal->j_cnode_free_list; |
420 | if (journal->j_cnode_free_list) { | 420 | if (journal->j_cnode_free_list) { |
421 | journal->j_cnode_free_list->prev = cn; | 421 | journal->j_cnode_free_list->prev = cn; |
422 | } | 422 | } |
423 | cn->prev = NULL; /* not needed with the memset, but I might kill the memset, and forget to do this */ | 423 | cn->prev = NULL; /* not needed with the memset, but I might kill the memset, and forget to do this */ |
424 | journal->j_cnode_free_list = cn; | 424 | journal->j_cnode_free_list = cn; |
425 | } | 425 | } |
426 | 426 | ||
/* Clear both journal-prepare state bits on a buffer: it is no longer
** prepared for logging and needs no dirty-bit restore.
*/
static void clear_prepared_bits(struct buffer_head *bh)
{
	clear_buffer_journal_prepared(bh);
	clear_buffer_journal_restore_dirty(bh);
}
432 | 432 | ||
433 | /* utility function to force a BUG if it is called without the big | 433 | /* utility function to force a BUG if it is called without the big |
434 | ** kernel lock held. caller is the string printed just before calling BUG() | 434 | ** kernel lock held. caller is the string printed just before calling BUG() |
435 | */ | 435 | */ |
/* utility function to force a BUG if it is called without the big
** kernel lock held.  caller is the string printed just before calling BUG()
** On SMP, current->lock_depth < 0 means the BKL is not held by this
** task; the check is compiled out on UP builds.
*/
void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
{
#ifdef CONFIG_SMP
	if (current->lock_depth < 0) {
		reiserfs_panic(sb, "%s called without kernel lock held",
			       caller);
	}
#else
	;
#endif
}
447 | 447 | ||
448 | /* return a cnode with same dev, block number and size in table, or null if not found */ | 448 | /* return a cnode with same dev, block number and size in table, or null if not found */ |
449 | static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct | 449 | static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct |
450 | super_block | 450 | super_block |
451 | *sb, | 451 | *sb, |
452 | struct | 452 | struct |
453 | reiserfs_journal_cnode | 453 | reiserfs_journal_cnode |
454 | **table, | 454 | **table, |
455 | long bl) | 455 | long bl) |
456 | { | 456 | { |
457 | struct reiserfs_journal_cnode *cn; | 457 | struct reiserfs_journal_cnode *cn; |
458 | cn = journal_hash(table, sb, bl); | 458 | cn = journal_hash(table, sb, bl); |
459 | while (cn) { | 459 | while (cn) { |
460 | if (cn->blocknr == bl && cn->sb == sb) | 460 | if (cn->blocknr == bl && cn->sb == sb) |
461 | return cn; | 461 | return cn; |
462 | cn = cn->hnext; | 462 | cn = cn->hnext; |
463 | } | 463 | } |
464 | return (struct reiserfs_journal_cnode *)0; | 464 | return (struct reiserfs_journal_cnode *)0; |
465 | } | 465 | } |
466 | 466 | ||
467 | /* | 467 | /* |
468 | ** this actually means 'can this block be reallocated yet?'. If you set search_all, a block can only be allocated | 468 | ** this actually means 'can this block be reallocated yet?'. If you set search_all, a block can only be allocated |
469 | ** if it is not in the current transaction, was not freed by the current transaction, and has no chance of ever | 469 | ** if it is not in the current transaction, was not freed by the current transaction, and has no chance of ever |
470 | ** being overwritten by a replay after crashing. | 470 | ** being overwritten by a replay after crashing. |
471 | ** | 471 | ** |
472 | ** If you don't set search_all, a block can only be allocated if it is not in the current transaction. Since deleting | 472 | ** If you don't set search_all, a block can only be allocated if it is not in the current transaction. Since deleting |
473 | ** a block removes it from the current transaction, this case should never happen. If you don't set search_all, make | 473 | ** a block removes it from the current transaction, this case should never happen. If you don't set search_all, make |
474 | ** sure you never write the block without logging it. | 474 | ** sure you never write the block without logging it. |
475 | ** | 475 | ** |
476 | ** next_zero_bit is a suggestion about the next block to try for find_forward. | 476 | ** next_zero_bit is a suggestion about the next block to try for find_forward. |
477 | ** when bl is rejected because it is set in a journal list bitmap, we search | 477 | ** when bl is rejected because it is set in a journal list bitmap, we search |
478 | ** for the next zero bit in the bitmap that rejected bl. Then, we return that | 478 | ** for the next zero bit in the bitmap that rejected bl. Then, we return that |
479 | ** through next_zero_bit for find_forward to try. | 479 | ** through next_zero_bit for find_forward to try. |
480 | ** | 480 | ** |
481 | ** Just because we return something in next_zero_bit does not mean we won't | 481 | ** Just because we return something in next_zero_bit does not mean we won't |
482 | ** reject it on the next call to reiserfs_in_journal | 482 | ** reject it on the next call to reiserfs_in_journal |
483 | ** | 483 | ** |
484 | */ | 484 | */ |
/* Return 1 if block (bmap_nr, bit_nr) is still owned by the journal
** and must not be reallocated yet; 0 if it is safe for reuse.  With
** search_all set, every journal list bitmap and the old-transaction
** hash are consulted; without it only the current-transaction hash is
** checked (and a hit there is a BUG — freeing removes blocks from the
** running transaction).  On a bitmap rejection, *next_zero_bit is set
** to a suggested next block for find_forward to try; the suggestion
** may itself be rejected on the next call.
*/
int reiserfs_in_journal(struct super_block *p_s_sb,
			int bmap_nr, int bit_nr, int search_all,
			b_blocknr_t * next_zero_bit)
{
	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
	struct reiserfs_journal_cnode *cn;
	struct reiserfs_list_bitmap *jb;
	int i;
	unsigned long bl;

	*next_zero_bit = 0;	/* always start this at zero. */

	PROC_INFO_INC(p_s_sb, journal.in_journal);
	/* If we aren't doing a search_all, this is a metablock, and it will be logged before use.
	** if we crash before the transaction that freed it commits, this transaction won't
	** have committed either, and the block will never be written
	*/
	if (search_all) {
		for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
			PROC_INFO_INC(p_s_sb, journal.in_journal_bitmap);
			jb = journal->j_list_bitmap + i;
			if (jb->journal_list && jb->bitmaps[bmap_nr] &&
			    test_bit(bit_nr,
				     (unsigned long *)jb->bitmaps[bmap_nr]->
				     data)) {
				/* rejected: hand find_forward the next clear
				** bit in the bitmap that rejected us */
				*next_zero_bit =
				    find_next_zero_bit((unsigned long *)
						       (jb->bitmaps[bmap_nr]->
							data),
						       p_s_sb->s_blocksize << 3,
						       bit_nr + 1);
				return 1;
			}
		}
	}

	bl = bmap_nr * (p_s_sb->s_blocksize << 3) + bit_nr;
	/* is it in any old transactions? */
	if (search_all
	    && (cn =
		get_journal_hash_dev(p_s_sb, journal->j_list_hash_table, bl))) {
		return 1;
	}

	/* is it in the current transaction.  This should never happen */
	if ((cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, bl))) {
		BUG();
		return 1;
	}

	PROC_INFO_INC(p_s_sb, journal.in_journal_reusable);
	/* safe for reuse */
	return 0;
}
539 | 539 | ||
540 | /* insert cn into table | 540 | /* insert cn into table |
541 | */ | 541 | */ |
542 | static inline void insert_journal_hash(struct reiserfs_journal_cnode **table, | 542 | static inline void insert_journal_hash(struct reiserfs_journal_cnode **table, |
543 | struct reiserfs_journal_cnode *cn) | 543 | struct reiserfs_journal_cnode *cn) |
544 | { | 544 | { |
545 | struct reiserfs_journal_cnode *cn_orig; | 545 | struct reiserfs_journal_cnode *cn_orig; |
546 | 546 | ||
547 | cn_orig = journal_hash(table, cn->sb, cn->blocknr); | 547 | cn_orig = journal_hash(table, cn->sb, cn->blocknr); |
548 | cn->hnext = cn_orig; | 548 | cn->hnext = cn_orig; |
549 | cn->hprev = NULL; | 549 | cn->hprev = NULL; |
550 | if (cn_orig) { | 550 | if (cn_orig) { |
551 | cn_orig->hprev = cn; | 551 | cn_orig->hprev = cn; |
552 | } | 552 | } |
553 | journal_hash(table, cn->sb, cn->blocknr) = cn; | 553 | journal_hash(table, cn->sb, cn->blocknr) = cn; |
554 | } | 554 | } |
555 | 555 | ||
556 | /* lock the current transaction */ | 556 | /* lock the current transaction */ |
557 | static inline void lock_journal(struct super_block *p_s_sb) | 557 | static inline void lock_journal(struct super_block *p_s_sb) |
558 | { | 558 | { |
559 | PROC_INFO_INC(p_s_sb, journal.lock_journal); | 559 | PROC_INFO_INC(p_s_sb, journal.lock_journal); |
560 | down(&SB_JOURNAL(p_s_sb)->j_lock); | 560 | down(&SB_JOURNAL(p_s_sb)->j_lock); |
561 | } | 561 | } |
562 | 562 | ||
563 | /* unlock the current transaction */ | 563 | /* unlock the current transaction */ |
564 | static inline void unlock_journal(struct super_block *p_s_sb) | 564 | static inline void unlock_journal(struct super_block *p_s_sb) |
565 | { | 565 | { |
566 | up(&SB_JOURNAL(p_s_sb)->j_lock); | 566 | up(&SB_JOURNAL(p_s_sb)->j_lock); |
567 | } | 567 | } |
568 | 568 | ||
569 | static inline void get_journal_list(struct reiserfs_journal_list *jl) | 569 | static inline void get_journal_list(struct reiserfs_journal_list *jl) |
570 | { | 570 | { |
571 | jl->j_refcount++; | 571 | jl->j_refcount++; |
572 | } | 572 | } |
573 | 573 | ||
574 | static inline void put_journal_list(struct super_block *s, | 574 | static inline void put_journal_list(struct super_block *s, |
575 | struct reiserfs_journal_list *jl) | 575 | struct reiserfs_journal_list *jl) |
576 | { | 576 | { |
577 | if (jl->j_refcount < 1) { | 577 | if (jl->j_refcount < 1) { |
578 | reiserfs_panic(s, "trans id %lu, refcount at %d", | 578 | reiserfs_panic(s, "trans id %lu, refcount at %d", |
579 | jl->j_trans_id, jl->j_refcount); | 579 | jl->j_trans_id, jl->j_refcount); |
580 | } | 580 | } |
581 | if (--jl->j_refcount == 0) | 581 | if (--jl->j_refcount == 0) |
582 | kfree(jl); | 582 | kfree(jl); |
583 | } | 583 | } |
584 | 584 | ||
585 | /* | 585 | /* |
586 | ** this used to be much more involved, and I'm keeping it just in case things get ugly again. | 586 | ** this used to be much more involved, and I'm keeping it just in case things get ugly again. |
587 | ** it gets called by flush_commit_list, and cleans up any data stored about blocks freed during a | 587 | ** it gets called by flush_commit_list, and cleans up any data stored about blocks freed during a |
588 | ** transaction. | 588 | ** transaction. |
589 | */ | 589 | */ |
590 | static void cleanup_freed_for_journal_list(struct super_block *p_s_sb, | 590 | static void cleanup_freed_for_journal_list(struct super_block *p_s_sb, |
591 | struct reiserfs_journal_list *jl) | 591 | struct reiserfs_journal_list *jl) |
592 | { | 592 | { |
593 | 593 | ||
594 | struct reiserfs_list_bitmap *jb = jl->j_list_bitmap; | 594 | struct reiserfs_list_bitmap *jb = jl->j_list_bitmap; |
595 | if (jb) { | 595 | if (jb) { |
596 | cleanup_bitmap_list(p_s_sb, jb); | 596 | cleanup_bitmap_list(p_s_sb, jb); |
597 | } | 597 | } |
598 | jl->j_list_bitmap->journal_list = NULL; | 598 | jl->j_list_bitmap->journal_list = NULL; |
599 | jl->j_list_bitmap = NULL; | 599 | jl->j_list_bitmap = NULL; |
600 | } | 600 | } |
601 | 601 | ||
602 | static int journal_list_still_alive(struct super_block *s, | 602 | static int journal_list_still_alive(struct super_block *s, |
603 | unsigned long trans_id) | 603 | unsigned long trans_id) |
604 | { | 604 | { |
605 | struct reiserfs_journal *journal = SB_JOURNAL(s); | 605 | struct reiserfs_journal *journal = SB_JOURNAL(s); |
606 | struct list_head *entry = &journal->j_journal_list; | 606 | struct list_head *entry = &journal->j_journal_list; |
607 | struct reiserfs_journal_list *jl; | 607 | struct reiserfs_journal_list *jl; |
608 | 608 | ||
609 | if (!list_empty(entry)) { | 609 | if (!list_empty(entry)) { |
610 | jl = JOURNAL_LIST_ENTRY(entry->next); | 610 | jl = JOURNAL_LIST_ENTRY(entry->next); |
611 | if (jl->j_trans_id <= trans_id) { | 611 | if (jl->j_trans_id <= trans_id) { |
612 | return 1; | 612 | return 1; |
613 | } | 613 | } |
614 | } | 614 | } |
615 | return 0; | 615 | return 0; |
616 | } | 616 | } |
617 | 617 | ||
618 | static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) | 618 | static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) |
619 | { | 619 | { |
620 | char b[BDEVNAME_SIZE]; | 620 | char b[BDEVNAME_SIZE]; |
621 | 621 | ||
622 | if (buffer_journaled(bh)) { | 622 | if (buffer_journaled(bh)) { |
623 | reiserfs_warning(NULL, | 623 | reiserfs_warning(NULL, |
624 | "clm-2084: pinned buffer %lu:%s sent to disk", | 624 | "clm-2084: pinned buffer %lu:%s sent to disk", |
625 | bh->b_blocknr, bdevname(bh->b_bdev, b)); | 625 | bh->b_blocknr, bdevname(bh->b_bdev, b)); |
626 | } | 626 | } |
627 | if (uptodate) | 627 | if (uptodate) |
628 | set_buffer_uptodate(bh); | 628 | set_buffer_uptodate(bh); |
629 | else | 629 | else |
630 | clear_buffer_uptodate(bh); | 630 | clear_buffer_uptodate(bh); |
631 | unlock_buffer(bh); | 631 | unlock_buffer(bh); |
632 | put_bh(bh); | 632 | put_bh(bh); |
633 | } | 633 | } |
634 | 634 | ||
635 | static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate) | 635 | static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate) |
636 | { | 636 | { |
637 | if (uptodate) | 637 | if (uptodate) |
638 | set_buffer_uptodate(bh); | 638 | set_buffer_uptodate(bh); |
639 | else | 639 | else |
640 | clear_buffer_uptodate(bh); | 640 | clear_buffer_uptodate(bh); |
641 | unlock_buffer(bh); | 641 | unlock_buffer(bh); |
642 | put_bh(bh); | 642 | put_bh(bh); |
643 | } | 643 | } |
644 | 644 | ||
645 | static void submit_logged_buffer(struct buffer_head *bh) | 645 | static void submit_logged_buffer(struct buffer_head *bh) |
646 | { | 646 | { |
647 | get_bh(bh); | 647 | get_bh(bh); |
648 | bh->b_end_io = reiserfs_end_buffer_io_sync; | 648 | bh->b_end_io = reiserfs_end_buffer_io_sync; |
649 | clear_buffer_journal_new(bh); | 649 | clear_buffer_journal_new(bh); |
650 | clear_buffer_dirty(bh); | 650 | clear_buffer_dirty(bh); |
651 | if (!test_clear_buffer_journal_test(bh)) | 651 | if (!test_clear_buffer_journal_test(bh)) |
652 | BUG(); | 652 | BUG(); |
653 | if (!buffer_uptodate(bh)) | 653 | if (!buffer_uptodate(bh)) |
654 | BUG(); | 654 | BUG(); |
655 | submit_bh(WRITE, bh); | 655 | submit_bh(WRITE, bh); |
656 | } | 656 | } |
657 | 657 | ||
658 | static void submit_ordered_buffer(struct buffer_head *bh) | 658 | static void submit_ordered_buffer(struct buffer_head *bh) |
659 | { | 659 | { |
660 | get_bh(bh); | 660 | get_bh(bh); |
661 | bh->b_end_io = reiserfs_end_ordered_io; | 661 | bh->b_end_io = reiserfs_end_ordered_io; |
662 | clear_buffer_dirty(bh); | 662 | clear_buffer_dirty(bh); |
663 | if (!buffer_uptodate(bh)) | 663 | if (!buffer_uptodate(bh)) |
664 | BUG(); | 664 | BUG(); |
665 | submit_bh(WRITE, bh); | 665 | submit_bh(WRITE, bh); |
666 | } | 666 | } |
667 | 667 | ||
668 | static int submit_barrier_buffer(struct buffer_head *bh) | 668 | static int submit_barrier_buffer(struct buffer_head *bh) |
669 | { | 669 | { |
670 | get_bh(bh); | 670 | get_bh(bh); |
671 | bh->b_end_io = reiserfs_end_ordered_io; | 671 | bh->b_end_io = reiserfs_end_ordered_io; |
672 | clear_buffer_dirty(bh); | 672 | clear_buffer_dirty(bh); |
673 | if (!buffer_uptodate(bh)) | 673 | if (!buffer_uptodate(bh)) |
674 | BUG(); | 674 | BUG(); |
675 | return submit_bh(WRITE_BARRIER, bh); | 675 | return submit_bh(WRITE_BARRIER, bh); |
676 | } | 676 | } |
677 | 677 | ||
678 | static void check_barrier_completion(struct super_block *s, | 678 | static void check_barrier_completion(struct super_block *s, |
679 | struct buffer_head *bh) | 679 | struct buffer_head *bh) |
680 | { | 680 | { |
681 | if (buffer_eopnotsupp(bh)) { | 681 | if (buffer_eopnotsupp(bh)) { |
682 | clear_buffer_eopnotsupp(bh); | 682 | clear_buffer_eopnotsupp(bh); |
683 | disable_barrier(s); | 683 | disable_barrier(s); |
684 | set_buffer_uptodate(bh); | 684 | set_buffer_uptodate(bh); |
685 | set_buffer_dirty(bh); | 685 | set_buffer_dirty(bh); |
686 | sync_dirty_buffer(bh); | 686 | sync_dirty_buffer(bh); |
687 | } | 687 | } |
688 | } | 688 | } |
689 | 689 | ||
690 | #define CHUNK_SIZE 32 | 690 | #define CHUNK_SIZE 32 |
691 | struct buffer_chunk { | 691 | struct buffer_chunk { |
692 | struct buffer_head *bh[CHUNK_SIZE]; | 692 | struct buffer_head *bh[CHUNK_SIZE]; |
693 | int nr; | 693 | int nr; |
694 | }; | 694 | }; |
695 | 695 | ||
696 | static void write_chunk(struct buffer_chunk *chunk) | 696 | static void write_chunk(struct buffer_chunk *chunk) |
697 | { | 697 | { |
698 | int i; | 698 | int i; |
699 | get_fs_excl(); | 699 | get_fs_excl(); |
700 | for (i = 0; i < chunk->nr; i++) { | 700 | for (i = 0; i < chunk->nr; i++) { |
701 | submit_logged_buffer(chunk->bh[i]); | 701 | submit_logged_buffer(chunk->bh[i]); |
702 | } | 702 | } |
703 | chunk->nr = 0; | 703 | chunk->nr = 0; |
704 | put_fs_excl(); | 704 | put_fs_excl(); |
705 | } | 705 | } |
706 | 706 | ||
707 | static void write_ordered_chunk(struct buffer_chunk *chunk) | 707 | static void write_ordered_chunk(struct buffer_chunk *chunk) |
708 | { | 708 | { |
709 | int i; | 709 | int i; |
710 | get_fs_excl(); | 710 | get_fs_excl(); |
711 | for (i = 0; i < chunk->nr; i++) { | 711 | for (i = 0; i < chunk->nr; i++) { |
712 | submit_ordered_buffer(chunk->bh[i]); | 712 | submit_ordered_buffer(chunk->bh[i]); |
713 | } | 713 | } |
714 | chunk->nr = 0; | 714 | chunk->nr = 0; |
715 | put_fs_excl(); | 715 | put_fs_excl(); |
716 | } | 716 | } |
717 | 717 | ||
718 | static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh, | 718 | static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh, |
719 | spinlock_t * lock, void (fn) (struct buffer_chunk *)) | 719 | spinlock_t * lock, void (fn) (struct buffer_chunk *)) |
720 | { | 720 | { |
721 | int ret = 0; | 721 | int ret = 0; |
722 | if (chunk->nr >= CHUNK_SIZE) | 722 | if (chunk->nr >= CHUNK_SIZE) |
723 | BUG(); | 723 | BUG(); |
724 | chunk->bh[chunk->nr++] = bh; | 724 | chunk->bh[chunk->nr++] = bh; |
725 | if (chunk->nr >= CHUNK_SIZE) { | 725 | if (chunk->nr >= CHUNK_SIZE) { |
726 | ret = 1; | 726 | ret = 1; |
727 | if (lock) | 727 | if (lock) |
728 | spin_unlock(lock); | 728 | spin_unlock(lock); |
729 | fn(chunk); | 729 | fn(chunk); |
730 | if (lock) | 730 | if (lock) |
731 | spin_lock(lock); | 731 | spin_lock(lock); |
732 | } | 732 | } |
733 | return ret; | 733 | return ret; |
734 | } | 734 | } |
735 | 735 | ||
736 | static atomic_t nr_reiserfs_jh = ATOMIC_INIT(0); | 736 | static atomic_t nr_reiserfs_jh = ATOMIC_INIT(0); |
737 | static struct reiserfs_jh *alloc_jh(void) | 737 | static struct reiserfs_jh *alloc_jh(void) |
738 | { | 738 | { |
739 | struct reiserfs_jh *jh; | 739 | struct reiserfs_jh *jh; |
740 | while (1) { | 740 | while (1) { |
741 | jh = kmalloc(sizeof(*jh), GFP_NOFS); | 741 | jh = kmalloc(sizeof(*jh), GFP_NOFS); |
742 | if (jh) { | 742 | if (jh) { |
743 | atomic_inc(&nr_reiserfs_jh); | 743 | atomic_inc(&nr_reiserfs_jh); |
744 | return jh; | 744 | return jh; |
745 | } | 745 | } |
746 | yield(); | 746 | yield(); |
747 | } | 747 | } |
748 | } | 748 | } |
749 | 749 | ||
750 | /* | 750 | /* |
751 | * we want to free the jh when the buffer has been written | 751 | * we want to free the jh when the buffer has been written |
752 | * and waited on | 752 | * and waited on |
753 | */ | 753 | */ |
754 | void reiserfs_free_jh(struct buffer_head *bh) | 754 | void reiserfs_free_jh(struct buffer_head *bh) |
755 | { | 755 | { |
756 | struct reiserfs_jh *jh; | 756 | struct reiserfs_jh *jh; |
757 | 757 | ||
758 | jh = bh->b_private; | 758 | jh = bh->b_private; |
759 | if (jh) { | 759 | if (jh) { |
760 | bh->b_private = NULL; | 760 | bh->b_private = NULL; |
761 | jh->bh = NULL; | 761 | jh->bh = NULL; |
762 | list_del_init(&jh->list); | 762 | list_del_init(&jh->list); |
763 | kfree(jh); | 763 | kfree(jh); |
764 | if (atomic_read(&nr_reiserfs_jh) <= 0) | 764 | if (atomic_read(&nr_reiserfs_jh) <= 0) |
765 | BUG(); | 765 | BUG(); |
766 | atomic_dec(&nr_reiserfs_jh); | 766 | atomic_dec(&nr_reiserfs_jh); |
767 | put_bh(bh); | 767 | put_bh(bh); |
768 | } | 768 | } |
769 | } | 769 | } |
770 | 770 | ||
771 | static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh, | 771 | static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh, |
772 | int tail) | 772 | int tail) |
773 | { | 773 | { |
774 | struct reiserfs_jh *jh; | 774 | struct reiserfs_jh *jh; |
775 | 775 | ||
776 | if (bh->b_private) { | 776 | if (bh->b_private) { |
777 | spin_lock(&j->j_dirty_buffers_lock); | 777 | spin_lock(&j->j_dirty_buffers_lock); |
778 | if (!bh->b_private) { | 778 | if (!bh->b_private) { |
779 | spin_unlock(&j->j_dirty_buffers_lock); | 779 | spin_unlock(&j->j_dirty_buffers_lock); |
780 | goto no_jh; | 780 | goto no_jh; |
781 | } | 781 | } |
782 | jh = bh->b_private; | 782 | jh = bh->b_private; |
783 | list_del_init(&jh->list); | 783 | list_del_init(&jh->list); |
784 | } else { | 784 | } else { |
785 | no_jh: | 785 | no_jh: |
786 | get_bh(bh); | 786 | get_bh(bh); |
787 | jh = alloc_jh(); | 787 | jh = alloc_jh(); |
788 | spin_lock(&j->j_dirty_buffers_lock); | 788 | spin_lock(&j->j_dirty_buffers_lock); |
789 | /* buffer must be locked for __add_jh, should be able to have | 789 | /* buffer must be locked for __add_jh, should be able to have |
790 | * two adds at the same time | 790 | * two adds at the same time |
791 | */ | 791 | */ |
792 | if (bh->b_private) | 792 | if (bh->b_private) |
793 | BUG(); | 793 | BUG(); |
794 | jh->bh = bh; | 794 | jh->bh = bh; |
795 | bh->b_private = jh; | 795 | bh->b_private = jh; |
796 | } | 796 | } |
797 | jh->jl = j->j_current_jl; | 797 | jh->jl = j->j_current_jl; |
798 | if (tail) | 798 | if (tail) |
799 | list_add_tail(&jh->list, &jh->jl->j_tail_bh_list); | 799 | list_add_tail(&jh->list, &jh->jl->j_tail_bh_list); |
800 | else { | 800 | else { |
801 | list_add_tail(&jh->list, &jh->jl->j_bh_list); | 801 | list_add_tail(&jh->list, &jh->jl->j_bh_list); |
802 | } | 802 | } |
803 | spin_unlock(&j->j_dirty_buffers_lock); | 803 | spin_unlock(&j->j_dirty_buffers_lock); |
804 | return 0; | 804 | return 0; |
805 | } | 805 | } |
806 | 806 | ||
807 | int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh) | 807 | int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh) |
808 | { | 808 | { |
809 | return __add_jh(SB_JOURNAL(inode->i_sb), bh, 1); | 809 | return __add_jh(SB_JOURNAL(inode->i_sb), bh, 1); |
810 | } | 810 | } |
811 | int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh) | 811 | int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh) |
812 | { | 812 | { |
813 | return __add_jh(SB_JOURNAL(inode->i_sb), bh, 0); | 813 | return __add_jh(SB_JOURNAL(inode->i_sb), bh, 0); |
814 | } | 814 | } |
815 | 815 | ||
816 | #define JH_ENTRY(l) list_entry((l), struct reiserfs_jh, list) | 816 | #define JH_ENTRY(l) list_entry((l), struct reiserfs_jh, list) |
817 | static int write_ordered_buffers(spinlock_t * lock, | 817 | static int write_ordered_buffers(spinlock_t * lock, |
818 | struct reiserfs_journal *j, | 818 | struct reiserfs_journal *j, |
819 | struct reiserfs_journal_list *jl, | 819 | struct reiserfs_journal_list *jl, |
820 | struct list_head *list) | 820 | struct list_head *list) |
821 | { | 821 | { |
822 | struct buffer_head *bh; | 822 | struct buffer_head *bh; |
823 | struct reiserfs_jh *jh; | 823 | struct reiserfs_jh *jh; |
824 | int ret = j->j_errno; | 824 | int ret = j->j_errno; |
825 | struct buffer_chunk chunk; | 825 | struct buffer_chunk chunk; |
826 | struct list_head tmp; | 826 | struct list_head tmp; |
827 | INIT_LIST_HEAD(&tmp); | 827 | INIT_LIST_HEAD(&tmp); |
828 | 828 | ||
829 | chunk.nr = 0; | 829 | chunk.nr = 0; |
830 | spin_lock(lock); | 830 | spin_lock(lock); |
831 | while (!list_empty(list)) { | 831 | while (!list_empty(list)) { |
832 | jh = JH_ENTRY(list->next); | 832 | jh = JH_ENTRY(list->next); |
833 | bh = jh->bh; | 833 | bh = jh->bh; |
834 | get_bh(bh); | 834 | get_bh(bh); |
835 | if (test_set_buffer_locked(bh)) { | 835 | if (test_set_buffer_locked(bh)) { |
836 | if (!buffer_dirty(bh)) { | 836 | if (!buffer_dirty(bh)) { |
837 | list_del_init(&jh->list); | 837 | list_del_init(&jh->list); |
838 | list_add(&jh->list, &tmp); | 838 | list_add(&jh->list, &tmp); |
839 | goto loop_next; | 839 | goto loop_next; |
840 | } | 840 | } |
841 | spin_unlock(lock); | 841 | spin_unlock(lock); |
842 | if (chunk.nr) | 842 | if (chunk.nr) |
843 | write_ordered_chunk(&chunk); | 843 | write_ordered_chunk(&chunk); |
844 | wait_on_buffer(bh); | 844 | wait_on_buffer(bh); |
845 | cond_resched(); | 845 | cond_resched(); |
846 | spin_lock(lock); | 846 | spin_lock(lock); |
847 | goto loop_next; | 847 | goto loop_next; |
848 | } | 848 | } |
849 | if (buffer_dirty(bh)) { | 849 | if (buffer_dirty(bh)) { |
850 | list_del_init(&jh->list); | 850 | list_del_init(&jh->list); |
851 | list_add(&jh->list, &tmp); | 851 | list_add(&jh->list, &tmp); |
852 | add_to_chunk(&chunk, bh, lock, write_ordered_chunk); | 852 | add_to_chunk(&chunk, bh, lock, write_ordered_chunk); |
853 | } else { | 853 | } else { |
854 | reiserfs_free_jh(bh); | 854 | reiserfs_free_jh(bh); |
855 | unlock_buffer(bh); | 855 | unlock_buffer(bh); |
856 | } | 856 | } |
857 | loop_next: | 857 | loop_next: |
858 | put_bh(bh); | 858 | put_bh(bh); |
859 | cond_resched_lock(lock); | 859 | cond_resched_lock(lock); |
860 | } | 860 | } |
861 | if (chunk.nr) { | 861 | if (chunk.nr) { |
862 | spin_unlock(lock); | 862 | spin_unlock(lock); |
863 | write_ordered_chunk(&chunk); | 863 | write_ordered_chunk(&chunk); |
864 | spin_lock(lock); | 864 | spin_lock(lock); |
865 | } | 865 | } |
866 | while (!list_empty(&tmp)) { | 866 | while (!list_empty(&tmp)) { |
867 | jh = JH_ENTRY(tmp.prev); | 867 | jh = JH_ENTRY(tmp.prev); |
868 | bh = jh->bh; | 868 | bh = jh->bh; |
869 | get_bh(bh); | 869 | get_bh(bh); |
870 | reiserfs_free_jh(bh); | 870 | reiserfs_free_jh(bh); |
871 | 871 | ||
872 | if (buffer_locked(bh)) { | 872 | if (buffer_locked(bh)) { |
873 | spin_unlock(lock); | 873 | spin_unlock(lock); |
874 | wait_on_buffer(bh); | 874 | wait_on_buffer(bh); |
875 | spin_lock(lock); | 875 | spin_lock(lock); |
876 | } | 876 | } |
877 | if (!buffer_uptodate(bh)) { | 877 | if (!buffer_uptodate(bh)) { |
878 | ret = -EIO; | 878 | ret = -EIO; |
879 | } | 879 | } |
880 | /* ugly interaction with invalidatepage here. | 880 | /* ugly interaction with invalidatepage here. |
881 | * reiserfs_invalidate_page will pin any buffer that has a valid | 881 | * reiserfs_invalidate_page will pin any buffer that has a valid |
882 | * journal head from an older transaction. If someone else sets | 882 | * journal head from an older transaction. If someone else sets |
883 | * our buffer dirty after we write it in the first loop, and | 883 | * our buffer dirty after we write it in the first loop, and |
884 | * then someone truncates the page away, nobody will ever write | 884 | * then someone truncates the page away, nobody will ever write |
885 | * the buffer. We're safe if we write the page one last time | 885 | * the buffer. We're safe if we write the page one last time |
886 | * after freeing the journal header. | 886 | * after freeing the journal header. |
887 | */ | 887 | */ |
888 | if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) { | 888 | if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) { |
889 | spin_unlock(lock); | 889 | spin_unlock(lock); |
890 | ll_rw_block(WRITE, 1, &bh); | 890 | ll_rw_block(WRITE, 1, &bh); |
891 | spin_lock(lock); | 891 | spin_lock(lock); |
892 | } | 892 | } |
893 | put_bh(bh); | 893 | put_bh(bh); |
894 | cond_resched_lock(lock); | 894 | cond_resched_lock(lock); |
895 | } | 895 | } |
896 | spin_unlock(lock); | 896 | spin_unlock(lock); |
897 | return ret; | 897 | return ret; |
898 | } | 898 | } |
899 | 899 | ||
900 | static int flush_older_commits(struct super_block *s, | 900 | static int flush_older_commits(struct super_block *s, |
901 | struct reiserfs_journal_list *jl) | 901 | struct reiserfs_journal_list *jl) |
902 | { | 902 | { |
903 | struct reiserfs_journal *journal = SB_JOURNAL(s); | 903 | struct reiserfs_journal *journal = SB_JOURNAL(s); |
904 | struct reiserfs_journal_list *other_jl; | 904 | struct reiserfs_journal_list *other_jl; |
905 | struct reiserfs_journal_list *first_jl; | 905 | struct reiserfs_journal_list *first_jl; |
906 | struct list_head *entry; | 906 | struct list_head *entry; |
907 | unsigned long trans_id = jl->j_trans_id; | 907 | unsigned long trans_id = jl->j_trans_id; |
908 | unsigned long other_trans_id; | 908 | unsigned long other_trans_id; |
909 | unsigned long first_trans_id; | 909 | unsigned long first_trans_id; |
910 | 910 | ||
911 | find_first: | 911 | find_first: |
912 | /* | 912 | /* |
913 | * first we walk backwards to find the oldest uncommitted transation | 913 | * first we walk backwards to find the oldest uncommitted transation |
914 | */ | 914 | */ |
915 | first_jl = jl; | 915 | first_jl = jl; |
916 | entry = jl->j_list.prev; | 916 | entry = jl->j_list.prev; |
917 | while (1) { | 917 | while (1) { |
918 | other_jl = JOURNAL_LIST_ENTRY(entry); | 918 | other_jl = JOURNAL_LIST_ENTRY(entry); |
919 | if (entry == &journal->j_journal_list || | 919 | if (entry == &journal->j_journal_list || |
920 | atomic_read(&other_jl->j_older_commits_done)) | 920 | atomic_read(&other_jl->j_older_commits_done)) |
921 | break; | 921 | break; |
922 | 922 | ||
923 | first_jl = other_jl; | 923 | first_jl = other_jl; |
924 | entry = other_jl->j_list.prev; | 924 | entry = other_jl->j_list.prev; |
925 | } | 925 | } |
926 | 926 | ||
927 | /* if we didn't find any older uncommitted transactions, return now */ | 927 | /* if we didn't find any older uncommitted transactions, return now */ |
928 | if (first_jl == jl) { | 928 | if (first_jl == jl) { |
929 | return 0; | 929 | return 0; |
930 | } | 930 | } |
931 | 931 | ||
932 | first_trans_id = first_jl->j_trans_id; | 932 | first_trans_id = first_jl->j_trans_id; |
933 | 933 | ||
934 | entry = &first_jl->j_list; | 934 | entry = &first_jl->j_list; |
935 | while (1) { | 935 | while (1) { |
936 | other_jl = JOURNAL_LIST_ENTRY(entry); | 936 | other_jl = JOURNAL_LIST_ENTRY(entry); |
937 | other_trans_id = other_jl->j_trans_id; | 937 | other_trans_id = other_jl->j_trans_id; |
938 | 938 | ||
939 | if (other_trans_id < trans_id) { | 939 | if (other_trans_id < trans_id) { |
940 | if (atomic_read(&other_jl->j_commit_left) != 0) { | 940 | if (atomic_read(&other_jl->j_commit_left) != 0) { |
941 | flush_commit_list(s, other_jl, 0); | 941 | flush_commit_list(s, other_jl, 0); |
942 | 942 | ||
943 | /* list we were called with is gone, return */ | 943 | /* list we were called with is gone, return */ |
944 | if (!journal_list_still_alive(s, trans_id)) | 944 | if (!journal_list_still_alive(s, trans_id)) |
945 | return 1; | 945 | return 1; |
946 | 946 | ||
947 | /* the one we just flushed is gone, this means all | 947 | /* the one we just flushed is gone, this means all |
948 | * older lists are also gone, so first_jl is no longer | 948 | * older lists are also gone, so first_jl is no longer |
949 | * valid either. Go back to the beginning. | 949 | * valid either. Go back to the beginning. |
950 | */ | 950 | */ |
951 | if (!journal_list_still_alive | 951 | if (!journal_list_still_alive |
952 | (s, other_trans_id)) { | 952 | (s, other_trans_id)) { |
953 | goto find_first; | 953 | goto find_first; |
954 | } | 954 | } |
955 | } | 955 | } |
956 | entry = entry->next; | 956 | entry = entry->next; |
957 | if (entry == &journal->j_journal_list) | 957 | if (entry == &journal->j_journal_list) |
958 | return 0; | 958 | return 0; |
959 | } else { | 959 | } else { |
960 | return 0; | 960 | return 0; |
961 | } | 961 | } |
962 | } | 962 | } |
963 | return 0; | 963 | return 0; |
964 | } | 964 | } |
965 | int reiserfs_async_progress_wait(struct super_block *s) | 965 | int reiserfs_async_progress_wait(struct super_block *s) |
966 | { | 966 | { |
967 | DEFINE_WAIT(wait); | 967 | DEFINE_WAIT(wait); |
968 | struct reiserfs_journal *j = SB_JOURNAL(s); | 968 | struct reiserfs_journal *j = SB_JOURNAL(s); |
969 | if (atomic_read(&j->j_async_throttle)) | 969 | if (atomic_read(&j->j_async_throttle)) |
970 | blk_congestion_wait(WRITE, HZ / 10); | 970 | blk_congestion_wait(WRITE, HZ / 10); |
971 | return 0; | 971 | return 0; |
972 | } | 972 | } |
973 | 973 | ||
974 | /* | 974 | /* |
975 | ** if this journal list still has commit blocks unflushed, send them to disk. | 975 | ** if this journal list still has commit blocks unflushed, send them to disk. |
976 | ** | 976 | ** |
977 | ** log areas must be flushed in order (transaction 2 can't commit before transaction 1) | 977 | ** log areas must be flushed in order (transaction 2 can't commit before transaction 1) |
978 | ** Before the commit block can by written, every other log block must be safely on disk | 978 | ** Before the commit block can by written, every other log block must be safely on disk |
979 | ** | 979 | ** |
980 | */ | 980 | */ |
981 | static int flush_commit_list(struct super_block *s, | 981 | static int flush_commit_list(struct super_block *s, |
982 | struct reiserfs_journal_list *jl, int flushall) | 982 | struct reiserfs_journal_list *jl, int flushall) |
983 | { | 983 | { |
984 | int i; | 984 | int i; |
985 | int bn; | 985 | int bn; |
986 | struct buffer_head *tbh = NULL; | 986 | struct buffer_head *tbh = NULL; |
987 | unsigned long trans_id = jl->j_trans_id; | 987 | unsigned long trans_id = jl->j_trans_id; |
988 | struct reiserfs_journal *journal = SB_JOURNAL(s); | 988 | struct reiserfs_journal *journal = SB_JOURNAL(s); |
989 | int barrier = 0; | 989 | int barrier = 0; |
990 | int retval = 0; | 990 | int retval = 0; |
991 | int write_len; | ||
991 | 992 | ||
992 | reiserfs_check_lock_depth(s, "flush_commit_list"); | 993 | reiserfs_check_lock_depth(s, "flush_commit_list"); |
993 | 994 | ||
994 | if (atomic_read(&jl->j_older_commits_done)) { | 995 | if (atomic_read(&jl->j_older_commits_done)) { |
995 | return 0; | 996 | return 0; |
996 | } | 997 | } |
997 | 998 | ||
998 | get_fs_excl(); | 999 | get_fs_excl(); |
999 | 1000 | ||
1000 | /* before we can put our commit blocks on disk, we have to make sure everyone older than | 1001 | /* before we can put our commit blocks on disk, we have to make sure everyone older than |
1001 | ** us is on disk too | 1002 | ** us is on disk too |
1002 | */ | 1003 | */ |
1003 | BUG_ON(jl->j_len <= 0); | 1004 | BUG_ON(jl->j_len <= 0); |
1004 | BUG_ON(trans_id == journal->j_trans_id); | 1005 | BUG_ON(trans_id == journal->j_trans_id); |
1005 | 1006 | ||
1006 | get_journal_list(jl); | 1007 | get_journal_list(jl); |
1007 | if (flushall) { | 1008 | if (flushall) { |
1008 | if (flush_older_commits(s, jl) == 1) { | 1009 | if (flush_older_commits(s, jl) == 1) { |
1009 | /* list disappeared during flush_older_commits. return */ | 1010 | /* list disappeared during flush_older_commits. return */ |
1010 | goto put_jl; | 1011 | goto put_jl; |
1011 | } | 1012 | } |
1012 | } | 1013 | } |
1013 | 1014 | ||
1014 | /* make sure nobody is trying to flush this one at the same time */ | 1015 | /* make sure nobody is trying to flush this one at the same time */ |
1015 | down(&jl->j_commit_lock); | 1016 | down(&jl->j_commit_lock); |
1016 | if (!journal_list_still_alive(s, trans_id)) { | 1017 | if (!journal_list_still_alive(s, trans_id)) { |
1017 | up(&jl->j_commit_lock); | 1018 | up(&jl->j_commit_lock); |
1018 | goto put_jl; | 1019 | goto put_jl; |
1019 | } | 1020 | } |
1020 | BUG_ON(jl->j_trans_id == 0); | 1021 | BUG_ON(jl->j_trans_id == 0); |
1021 | 1022 | ||
1022 | /* this commit is done, exit */ | 1023 | /* this commit is done, exit */ |
1023 | if (atomic_read(&(jl->j_commit_left)) <= 0) { | 1024 | if (atomic_read(&(jl->j_commit_left)) <= 0) { |
1024 | if (flushall) { | 1025 | if (flushall) { |
1025 | atomic_set(&(jl->j_older_commits_done), 1); | 1026 | atomic_set(&(jl->j_older_commits_done), 1); |
1026 | } | 1027 | } |
1027 | up(&jl->j_commit_lock); | 1028 | up(&jl->j_commit_lock); |
1028 | goto put_jl; | 1029 | goto put_jl; |
1029 | } | 1030 | } |
1030 | 1031 | ||
1031 | if (!list_empty(&jl->j_bh_list)) { | 1032 | if (!list_empty(&jl->j_bh_list)) { |
1032 | unlock_kernel(); | 1033 | unlock_kernel(); |
1033 | write_ordered_buffers(&journal->j_dirty_buffers_lock, | 1034 | write_ordered_buffers(&journal->j_dirty_buffers_lock, |
1034 | journal, jl, &jl->j_bh_list); | 1035 | journal, jl, &jl->j_bh_list); |
1035 | lock_kernel(); | 1036 | lock_kernel(); |
1036 | } | 1037 | } |
1037 | BUG_ON(!list_empty(&jl->j_bh_list)); | 1038 | BUG_ON(!list_empty(&jl->j_bh_list)); |
1038 | /* | 1039 | /* |
1039 | * for the description block and all the log blocks, submit any buffers | 1040 | * for the description block and all the log blocks, submit any buffers |
1040 | * that haven't already reached the disk | 1041 | * that haven't already reached the disk. Try to write at least 256 |
1042 | * log blocks. later on, we will only wait on blocks that correspond | ||
1043 | * to this transaction, but while we're unplugging we might as well | ||
1044 | * get a chunk of data on there. | ||
1041 | */ | 1045 | */ |
1042 | atomic_inc(&journal->j_async_throttle); | 1046 | atomic_inc(&journal->j_async_throttle); |
1043 | for (i = 0; i < (jl->j_len + 1); i++) { | 1047 | write_len = jl->j_len + 1; |
1048 | if (write_len < 256) | ||
1049 | write_len = 256; | ||
1050 | for (i = 0 ; i < write_len ; i++) { | ||
1044 | bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) % | 1051 | bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) % |
1045 | SB_ONDISK_JOURNAL_SIZE(s); | 1052 | SB_ONDISK_JOURNAL_SIZE(s); |
1046 | tbh = journal_find_get_block(s, bn); | 1053 | tbh = journal_find_get_block(s, bn); |
1047 | if (buffer_dirty(tbh)) /* redundant, ll_rw_block() checks */ | 1054 | if (tbh) { |
1048 | ll_rw_block(SWRITE, 1, &tbh); | 1055 | if (buffer_dirty(tbh)) |
1049 | put_bh(tbh); | 1056 | ll_rw_block(WRITE, 1, &tbh) ; |
1057 | put_bh(tbh) ; | ||
1058 | } | ||
1050 | } | 1059 | } |
1051 | atomic_dec(&journal->j_async_throttle); | 1060 | atomic_dec(&journal->j_async_throttle); |
1052 | 1061 | ||
1053 | /* We're skipping the commit if there's an error */ | 1062 | /* We're skipping the commit if there's an error */ |
1054 | if (retval || reiserfs_is_journal_aborted(journal)) | 1063 | if (retval || reiserfs_is_journal_aborted(journal)) |
1055 | barrier = 0; | 1064 | barrier = 0; |
1056 | 1065 | ||
1057 | /* wait on everything written so far before writing the commit | 1066 | /* wait on everything written so far before writing the commit |
1058 | * if we are in barrier mode, send the commit down now | 1067 | * if we are in barrier mode, send the commit down now |
1059 | */ | 1068 | */ |
1060 | barrier = reiserfs_barrier_flush(s); | 1069 | barrier = reiserfs_barrier_flush(s); |
1061 | if (barrier) { | 1070 | if (barrier) { |
1062 | int ret; | 1071 | int ret; |
1063 | lock_buffer(jl->j_commit_bh); | 1072 | lock_buffer(jl->j_commit_bh); |
1064 | ret = submit_barrier_buffer(jl->j_commit_bh); | 1073 | ret = submit_barrier_buffer(jl->j_commit_bh); |
1065 | if (ret == -EOPNOTSUPP) { | 1074 | if (ret == -EOPNOTSUPP) { |
1066 | set_buffer_uptodate(jl->j_commit_bh); | 1075 | set_buffer_uptodate(jl->j_commit_bh); |
1067 | disable_barrier(s); | 1076 | disable_barrier(s); |
1068 | barrier = 0; | 1077 | barrier = 0; |
1069 | } | 1078 | } |
1070 | } | 1079 | } |
1071 | for (i = 0; i < (jl->j_len + 1); i++) { | 1080 | for (i = 0; i < (jl->j_len + 1); i++) { |
1072 | bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + | 1081 | bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + |
1073 | (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s); | 1082 | (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s); |
1074 | tbh = journal_find_get_block(s, bn); | 1083 | tbh = journal_find_get_block(s, bn); |
1075 | wait_on_buffer(tbh); | 1084 | wait_on_buffer(tbh); |
1076 | // since we're using ll_rw_blk above, it might have skipped over | 1085 | // since we're using ll_rw_blk above, it might have skipped over |
1077 | // a locked buffer. Double check here | 1086 | // a locked buffer. Double check here |
1078 | // | 1087 | // |
1079 | if (buffer_dirty(tbh)) /* redundant, sync_dirty_buffer() checks */ | 1088 | if (buffer_dirty(tbh)) /* redundant, sync_dirty_buffer() checks */ |
1080 | sync_dirty_buffer(tbh); | 1089 | sync_dirty_buffer(tbh); |
1081 | if (unlikely(!buffer_uptodate(tbh))) { | 1090 | if (unlikely(!buffer_uptodate(tbh))) { |
1082 | #ifdef CONFIG_REISERFS_CHECK | 1091 | #ifdef CONFIG_REISERFS_CHECK |
1083 | reiserfs_warning(s, "journal-601, buffer write failed"); | 1092 | reiserfs_warning(s, "journal-601, buffer write failed"); |
1084 | #endif | 1093 | #endif |
1085 | retval = -EIO; | 1094 | retval = -EIO; |
1086 | } | 1095 | } |
1087 | put_bh(tbh); /* once for journal_find_get_block */ | 1096 | put_bh(tbh); /* once for journal_find_get_block */ |
1088 | put_bh(tbh); /* once due to original getblk in do_journal_end */ | 1097 | put_bh(tbh); /* once due to original getblk in do_journal_end */ |
1089 | atomic_dec(&(jl->j_commit_left)); | 1098 | atomic_dec(&(jl->j_commit_left)); |
1090 | } | 1099 | } |
1091 | 1100 | ||
1092 | BUG_ON(atomic_read(&(jl->j_commit_left)) != 1); | 1101 | BUG_ON(atomic_read(&(jl->j_commit_left)) != 1); |
1093 | 1102 | ||
1094 | if (!barrier) { | 1103 | if (!barrier) { |
1095 | /* If there was a write error in the journal - we can't commit | 1104 | /* If there was a write error in the journal - we can't commit |
1096 | * this transaction - it will be invalid and, if successful, | 1105 | * this transaction - it will be invalid and, if successful, |
1097 | * will just end up propogating the write error out to | 1106 | * will just end up propogating the write error out to |
1098 | * the file system. */ | 1107 | * the file system. */ |
1099 | if (likely(!retval && !reiserfs_is_journal_aborted (journal))) { | 1108 | if (likely(!retval && !reiserfs_is_journal_aborted (journal))) { |
1100 | if (buffer_dirty(jl->j_commit_bh)) | 1109 | if (buffer_dirty(jl->j_commit_bh)) |
1101 | BUG(); | 1110 | BUG(); |
1102 | mark_buffer_dirty(jl->j_commit_bh) ; | 1111 | mark_buffer_dirty(jl->j_commit_bh) ; |
1103 | sync_dirty_buffer(jl->j_commit_bh) ; | 1112 | sync_dirty_buffer(jl->j_commit_bh) ; |
1104 | } | 1113 | } |
1105 | } else | 1114 | } else |
1106 | wait_on_buffer(jl->j_commit_bh); | 1115 | wait_on_buffer(jl->j_commit_bh); |
1107 | 1116 | ||
1108 | check_barrier_completion(s, jl->j_commit_bh); | 1117 | check_barrier_completion(s, jl->j_commit_bh); |
1109 | 1118 | ||
1110 | /* If there was a write error in the journal - we can't commit this | 1119 | /* If there was a write error in the journal - we can't commit this |
1111 | * transaction - it will be invalid and, if successful, will just end | 1120 | * transaction - it will be invalid and, if successful, will just end |
1112 | * up propogating the write error out to the filesystem. */ | 1121 | * up propogating the write error out to the filesystem. */ |
1113 | if (unlikely(!buffer_uptodate(jl->j_commit_bh))) { | 1122 | if (unlikely(!buffer_uptodate(jl->j_commit_bh))) { |
1114 | #ifdef CONFIG_REISERFS_CHECK | 1123 | #ifdef CONFIG_REISERFS_CHECK |
1115 | reiserfs_warning(s, "journal-615: buffer write failed"); | 1124 | reiserfs_warning(s, "journal-615: buffer write failed"); |
1116 | #endif | 1125 | #endif |
1117 | retval = -EIO; | 1126 | retval = -EIO; |
1118 | } | 1127 | } |
1119 | bforget(jl->j_commit_bh); | 1128 | bforget(jl->j_commit_bh); |
1120 | if (journal->j_last_commit_id != 0 && | 1129 | if (journal->j_last_commit_id != 0 && |
1121 | (jl->j_trans_id - journal->j_last_commit_id) != 1) { | 1130 | (jl->j_trans_id - journal->j_last_commit_id) != 1) { |
1122 | reiserfs_warning(s, "clm-2200: last commit %lu, current %lu", | 1131 | reiserfs_warning(s, "clm-2200: last commit %lu, current %lu", |
1123 | journal->j_last_commit_id, jl->j_trans_id); | 1132 | journal->j_last_commit_id, jl->j_trans_id); |
1124 | } | 1133 | } |
1125 | journal->j_last_commit_id = jl->j_trans_id; | 1134 | journal->j_last_commit_id = jl->j_trans_id; |
1126 | 1135 | ||
1127 | /* now, every commit block is on the disk. It is safe to allow blocks freed during this transaction to be reallocated */ | 1136 | /* now, every commit block is on the disk. It is safe to allow blocks freed during this transaction to be reallocated */ |
1128 | cleanup_freed_for_journal_list(s, jl); | 1137 | cleanup_freed_for_journal_list(s, jl); |
1129 | 1138 | ||
1130 | retval = retval ? retval : journal->j_errno; | 1139 | retval = retval ? retval : journal->j_errno; |
1131 | 1140 | ||
1132 | /* mark the metadata dirty */ | 1141 | /* mark the metadata dirty */ |
1133 | if (!retval) | 1142 | if (!retval) |
1134 | dirty_one_transaction(s, jl); | 1143 | dirty_one_transaction(s, jl); |
1135 | atomic_dec(&(jl->j_commit_left)); | 1144 | atomic_dec(&(jl->j_commit_left)); |
1136 | 1145 | ||
1137 | if (flushall) { | 1146 | if (flushall) { |
1138 | atomic_set(&(jl->j_older_commits_done), 1); | 1147 | atomic_set(&(jl->j_older_commits_done), 1); |
1139 | } | 1148 | } |
1140 | up(&jl->j_commit_lock); | 1149 | up(&jl->j_commit_lock); |
1141 | put_jl: | 1150 | put_jl: |
1142 | put_journal_list(s, jl); | 1151 | put_journal_list(s, jl); |
1143 | 1152 | ||
1144 | if (retval) | 1153 | if (retval) |
1145 | reiserfs_abort(s, retval, "Journal write error in %s", | 1154 | reiserfs_abort(s, retval, "Journal write error in %s", |
1146 | __FUNCTION__); | 1155 | __FUNCTION__); |
1147 | put_fs_excl(); | 1156 | put_fs_excl(); |
1148 | return retval; | 1157 | return retval; |
1149 | } | 1158 | } |
1150 | 1159 | ||
1151 | /* | 1160 | /* |
1152 | ** flush_journal_list frequently needs to find a newer transaction for a given block. This does that, or | 1161 | ** flush_journal_list frequently needs to find a newer transaction for a given block. This does that, or |
1153 | ** returns NULL if it can't find anything | 1162 | ** returns NULL if it can't find anything |
1154 | */ | 1163 | */ |
1155 | static struct reiserfs_journal_list *find_newer_jl_for_cn(struct | 1164 | static struct reiserfs_journal_list *find_newer_jl_for_cn(struct |
1156 | reiserfs_journal_cnode | 1165 | reiserfs_journal_cnode |
1157 | *cn) | 1166 | *cn) |
1158 | { | 1167 | { |
1159 | struct super_block *sb = cn->sb; | 1168 | struct super_block *sb = cn->sb; |
1160 | b_blocknr_t blocknr = cn->blocknr; | 1169 | b_blocknr_t blocknr = cn->blocknr; |
1161 | 1170 | ||
1162 | cn = cn->hprev; | 1171 | cn = cn->hprev; |
1163 | while (cn) { | 1172 | while (cn) { |
1164 | if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist) { | 1173 | if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist) { |
1165 | return cn->jlist; | 1174 | return cn->jlist; |
1166 | } | 1175 | } |
1167 | cn = cn->hprev; | 1176 | cn = cn->hprev; |
1168 | } | 1177 | } |
1169 | return NULL; | 1178 | return NULL; |
1170 | } | 1179 | } |
1171 | 1180 | ||
1172 | static void remove_journal_hash(struct super_block *, | 1181 | static void remove_journal_hash(struct super_block *, |
1173 | struct reiserfs_journal_cnode **, | 1182 | struct reiserfs_journal_cnode **, |
1174 | struct reiserfs_journal_list *, unsigned long, | 1183 | struct reiserfs_journal_list *, unsigned long, |
1175 | int); | 1184 | int); |
1176 | 1185 | ||
/*
** once all the real blocks have been flushed, it is safe to remove them from the
** journal list for this transaction. Aside from freeing the cnode, this also allows the
** block to be reallocated for data blocks if it had been deleted.
*/
static void remove_all_from_journal_list(struct super_block *p_s_sb,
					 struct reiserfs_journal_list *jl,
					 int debug)
{
	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
	struct reiserfs_journal_cnode *cn, *last;
	cn = jl->j_realblock;

	/* which is better, to lock once around the whole loop, or
	** to lock for each call to remove_journal_hash?
	*/
	while (cn) {
		/* a blocknr of 0 means this cnode is no longer in the hash */
		if (cn->blocknr != 0) {
			if (debug) {
				reiserfs_warning(p_s_sb,
						 "block %u, bh is %d, state %ld",
						 cn->blocknr, cn->bh ? 1 : 0,
						 cn->state);
			}
			cn->state = 0;
			remove_journal_hash(p_s_sb, journal->j_list_hash_table,
					    jl, cn->blocknr, 1);
		}
		/* grab ->next before freeing the node we just processed */
		last = cn;
		cn = cn->next;
		free_cnode(p_s_sb, last);
	}
	/* the whole realblock chain has been freed */
	jl->j_realblock = NULL;
}
1211 | 1220 | ||
/*
** if this timestamp is greater than the timestamp we wrote last to the header block, write it to the header block.
** once this is done, I can safely say the log area for this transaction won't ever be replayed, and I can start
** releasing blocks in this transaction for reuse as data blocks.
** called by flush_journal_list, before it calls remove_all_from_journal_list
**
** Returns 0 on success, -EIO if the journal is aborted or the header
** write fails.
*/
static int _update_journal_header_block(struct super_block *p_s_sb,
					unsigned long offset,
					unsigned long trans_id)
{
	struct reiserfs_journal_header *jh;
	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);

	if (reiserfs_is_journal_aborted(journal))
		return -EIO;

	/* only ever move the header forward; older trans ids are ignored */
	if (trans_id >= journal->j_last_flush_trans_id) {
		/* wait for any in-flight write of the header buffer first */
		if (buffer_locked((journal->j_header_bh))) {
			wait_on_buffer((journal->j_header_bh));
			if (unlikely(!buffer_uptodate(journal->j_header_bh))) {
#ifdef CONFIG_REISERFS_CHECK
				reiserfs_warning(p_s_sb,
						 "journal-699: buffer write failed");
#endif
				return -EIO;
			}
		}
		journal->j_last_flush_trans_id = trans_id;
		journal->j_first_unflushed_offset = offset;
		/* copy the new replay start point into the on-disk header */
		jh = (struct reiserfs_journal_header *)(journal->j_header_bh->
							b_data);
		jh->j_last_flush_trans_id = cpu_to_le32(trans_id);
		jh->j_first_unflushed_offset = cpu_to_le32(offset);
		jh->j_mount_id = cpu_to_le32(journal->j_mount_id);

		if (reiserfs_barrier_flush(p_s_sb)) {
			int ret;
			lock_buffer(journal->j_header_bh);
			ret = submit_barrier_buffer(journal->j_header_bh);
			if (ret == -EOPNOTSUPP) {
				/* device cannot do barriers: mark the buffer
				 * uptodate again, disable barriers for this
				 * fs and fall back to a plain sync write */
				set_buffer_uptodate(journal->j_header_bh);
				disable_barrier(p_s_sb);
				goto sync;
			}
			wait_on_buffer(journal->j_header_bh);
			check_barrier_completion(p_s_sb, journal->j_header_bh);
		} else {
		      sync:
			set_buffer_dirty(journal->j_header_bh);
			sync_dirty_buffer(journal->j_header_bh);
		}
		if (!buffer_uptodate(journal->j_header_bh)) {
			reiserfs_warning(p_s_sb,
					 "journal-837: IO error during journal replay");
			return -EIO;
		}
	}
	return 0;
}
1272 | 1281 | ||
/* Thin wrapper around _update_journal_header_block(). */
static int update_journal_header_block(struct super_block *p_s_sb,
				       unsigned long offset,
				       unsigned long trans_id)
{
	return _update_journal_header_block(p_s_sb, offset, trans_id);
}
1279 | 1288 | ||
/*
** flush any and all journal lists older than you are
** can only be called from flush_journal_list
**
** Returns 0.  Restarts from the head of j_journal_list after every flush
** because flush_journal_list() removes the flushed entry from the list.
*/
static int flush_older_journal_lists(struct super_block *p_s_sb,
				     struct reiserfs_journal_list *jl)
{
	struct list_head *entry;
	struct reiserfs_journal_list *other_jl;
	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
	unsigned long trans_id = jl->j_trans_id;

	/* we know we are the only ones flushing things, no extra race
	 * protection is required.
	 */
      restart:
	entry = journal->j_journal_list.next;
	/* Did we wrap? */
	if (entry == &journal->j_journal_list)
		return 0;
	other_jl = JOURNAL_LIST_ENTRY(entry);
	/* the list head is the oldest transaction; stop once we reach our
	 * own trans_id or anything newer */
	if (other_jl->j_trans_id < trans_id) {
		BUG_ON(other_jl->j_refcount <= 0);
		/* do not flush all */
		flush_journal_list(p_s_sb, other_jl, 0);

		/* other_jl is now deleted from the list */
		goto restart;
	}
	return 0;
}
1311 | 1320 | ||
1312 | static void del_from_work_list(struct super_block *s, | 1321 | static void del_from_work_list(struct super_block *s, |
1313 | struct reiserfs_journal_list *jl) | 1322 | struct reiserfs_journal_list *jl) |
1314 | { | 1323 | { |
1315 | struct reiserfs_journal *journal = SB_JOURNAL(s); | 1324 | struct reiserfs_journal *journal = SB_JOURNAL(s); |
1316 | if (!list_empty(&jl->j_working_list)) { | 1325 | if (!list_empty(&jl->j_working_list)) { |
1317 | list_del_init(&jl->j_working_list); | 1326 | list_del_init(&jl->j_working_list); |
1318 | journal->j_num_work_lists--; | 1327 | journal->j_num_work_lists--; |
1319 | } | 1328 | } |
1320 | } | 1329 | } |
1321 | 1330 | ||
1322 | /* flush a journal list, both commit and real blocks | 1331 | /* flush a journal list, both commit and real blocks |
1323 | ** | 1332 | ** |
1324 | ** always set flushall to 1, unless you are calling from inside | 1333 | ** always set flushall to 1, unless you are calling from inside |
1325 | ** flush_journal_list | 1334 | ** flush_journal_list |
1326 | ** | 1335 | ** |
1327 | ** IMPORTANT. This can only be called while there are no journal writers, | 1336 | ** IMPORTANT. This can only be called while there are no journal writers, |
1328 | ** and the journal is locked. That means it can only be called from | 1337 | ** and the journal is locked. That means it can only be called from |
1329 | ** do_journal_end, or by journal_release | 1338 | ** do_journal_end, or by journal_release |
1330 | */ | 1339 | */ |
1331 | static int flush_journal_list(struct super_block *s, | 1340 | static int flush_journal_list(struct super_block *s, |
1332 | struct reiserfs_journal_list *jl, int flushall) | 1341 | struct reiserfs_journal_list *jl, int flushall) |
1333 | { | 1342 | { |
1334 | struct reiserfs_journal_list *pjl; | 1343 | struct reiserfs_journal_list *pjl; |
1335 | struct reiserfs_journal_cnode *cn, *last; | 1344 | struct reiserfs_journal_cnode *cn, *last; |
1336 | int count; | 1345 | int count; |
1337 | int was_jwait = 0; | 1346 | int was_jwait = 0; |
1338 | int was_dirty = 0; | 1347 | int was_dirty = 0; |
1339 | struct buffer_head *saved_bh; | 1348 | struct buffer_head *saved_bh; |
1340 | unsigned long j_len_saved = jl->j_len; | 1349 | unsigned long j_len_saved = jl->j_len; |
1341 | struct reiserfs_journal *journal = SB_JOURNAL(s); | 1350 | struct reiserfs_journal *journal = SB_JOURNAL(s); |
1342 | int err = 0; | 1351 | int err = 0; |
1343 | 1352 | ||
1344 | BUG_ON(j_len_saved <= 0); | 1353 | BUG_ON(j_len_saved <= 0); |
1345 | 1354 | ||
1346 | if (atomic_read(&journal->j_wcount) != 0) { | 1355 | if (atomic_read(&journal->j_wcount) != 0) { |
1347 | reiserfs_warning(s, | 1356 | reiserfs_warning(s, |
1348 | "clm-2048: flush_journal_list called with wcount %d", | 1357 | "clm-2048: flush_journal_list called with wcount %d", |
1349 | atomic_read(&journal->j_wcount)); | 1358 | atomic_read(&journal->j_wcount)); |
1350 | } | 1359 | } |
1351 | BUG_ON(jl->j_trans_id == 0); | 1360 | BUG_ON(jl->j_trans_id == 0); |
1352 | 1361 | ||
1353 | /* if flushall == 0, the lock is already held */ | 1362 | /* if flushall == 0, the lock is already held */ |
1354 | if (flushall) { | 1363 | if (flushall) { |
1355 | down(&journal->j_flush_sem); | 1364 | down(&journal->j_flush_sem); |
1356 | } else if (!down_trylock(&journal->j_flush_sem)) { | 1365 | } else if (!down_trylock(&journal->j_flush_sem)) { |
1357 | BUG(); | 1366 | BUG(); |
1358 | } | 1367 | } |
1359 | 1368 | ||
1360 | count = 0; | 1369 | count = 0; |
1361 | if (j_len_saved > journal->j_trans_max) { | 1370 | if (j_len_saved > journal->j_trans_max) { |
1362 | reiserfs_panic(s, | 1371 | reiserfs_panic(s, |
1363 | "journal-715: flush_journal_list, length is %lu, trans id %lu\n", | 1372 | "journal-715: flush_journal_list, length is %lu, trans id %lu\n", |
1364 | j_len_saved, jl->j_trans_id); | 1373 | j_len_saved, jl->j_trans_id); |
1365 | return 0; | 1374 | return 0; |
1366 | } | 1375 | } |
1367 | 1376 | ||
1368 | get_fs_excl(); | 1377 | get_fs_excl(); |
1369 | 1378 | ||
1370 | /* if all the work is already done, get out of here */ | 1379 | /* if all the work is already done, get out of here */ |
1371 | if (atomic_read(&(jl->j_nonzerolen)) <= 0 && | 1380 | if (atomic_read(&(jl->j_nonzerolen)) <= 0 && |
1372 | atomic_read(&(jl->j_commit_left)) <= 0) { | 1381 | atomic_read(&(jl->j_commit_left)) <= 0) { |
1373 | goto flush_older_and_return; | 1382 | goto flush_older_and_return; |
1374 | } | 1383 | } |
1375 | 1384 | ||
1376 | /* start by putting the commit list on disk. This will also flush | 1385 | /* start by putting the commit list on disk. This will also flush |
1377 | ** the commit lists of any olders transactions | 1386 | ** the commit lists of any olders transactions |
1378 | */ | 1387 | */ |
1379 | flush_commit_list(s, jl, 1); | 1388 | flush_commit_list(s, jl, 1); |
1380 | 1389 | ||
1381 | if (!(jl->j_state & LIST_DIRTY) | 1390 | if (!(jl->j_state & LIST_DIRTY) |
1382 | && !reiserfs_is_journal_aborted(journal)) | 1391 | && !reiserfs_is_journal_aborted(journal)) |
1383 | BUG(); | 1392 | BUG(); |
1384 | 1393 | ||
1385 | /* are we done now? */ | 1394 | /* are we done now? */ |
1386 | if (atomic_read(&(jl->j_nonzerolen)) <= 0 && | 1395 | if (atomic_read(&(jl->j_nonzerolen)) <= 0 && |
1387 | atomic_read(&(jl->j_commit_left)) <= 0) { | 1396 | atomic_read(&(jl->j_commit_left)) <= 0) { |
1388 | goto flush_older_and_return; | 1397 | goto flush_older_and_return; |
1389 | } | 1398 | } |
1390 | 1399 | ||
1391 | /* loop through each cnode, see if we need to write it, | 1400 | /* loop through each cnode, see if we need to write it, |
1392 | ** or wait on a more recent transaction, or just ignore it | 1401 | ** or wait on a more recent transaction, or just ignore it |
1393 | */ | 1402 | */ |
1394 | if (atomic_read(&(journal->j_wcount)) != 0) { | 1403 | if (atomic_read(&(journal->j_wcount)) != 0) { |
1395 | reiserfs_panic(s, | 1404 | reiserfs_panic(s, |
1396 | "journal-844: panic journal list is flushing, wcount is not 0\n"); | 1405 | "journal-844: panic journal list is flushing, wcount is not 0\n"); |
1397 | } | 1406 | } |
1398 | cn = jl->j_realblock; | 1407 | cn = jl->j_realblock; |
1399 | while (cn) { | 1408 | while (cn) { |
1400 | was_jwait = 0; | 1409 | was_jwait = 0; |
1401 | was_dirty = 0; | 1410 | was_dirty = 0; |
1402 | saved_bh = NULL; | 1411 | saved_bh = NULL; |
1403 | /* blocknr of 0 is no longer in the hash, ignore it */ | 1412 | /* blocknr of 0 is no longer in the hash, ignore it */ |
1404 | if (cn->blocknr == 0) { | 1413 | if (cn->blocknr == 0) { |
1405 | goto free_cnode; | 1414 | goto free_cnode; |
1406 | } | 1415 | } |
1407 | 1416 | ||
1408 | /* This transaction failed commit. Don't write out to the disk */ | 1417 | /* This transaction failed commit. Don't write out to the disk */ |
1409 | if (!(jl->j_state & LIST_DIRTY)) | 1418 | if (!(jl->j_state & LIST_DIRTY)) |
1410 | goto free_cnode; | 1419 | goto free_cnode; |
1411 | 1420 | ||
1412 | pjl = find_newer_jl_for_cn(cn); | 1421 | pjl = find_newer_jl_for_cn(cn); |
1413 | /* the order is important here. We check pjl to make sure we | 1422 | /* the order is important here. We check pjl to make sure we |
1414 | ** don't clear BH_JDirty_wait if we aren't the one writing this | 1423 | ** don't clear BH_JDirty_wait if we aren't the one writing this |
1415 | ** block to disk | 1424 | ** block to disk |
1416 | */ | 1425 | */ |
1417 | if (!pjl && cn->bh) { | 1426 | if (!pjl && cn->bh) { |
1418 | saved_bh = cn->bh; | 1427 | saved_bh = cn->bh; |
1419 | 1428 | ||
1420 | /* we do this to make sure nobody releases the buffer while | 1429 | /* we do this to make sure nobody releases the buffer while |
1421 | ** we are working with it | 1430 | ** we are working with it |
1422 | */ | 1431 | */ |
1423 | get_bh(saved_bh); | 1432 | get_bh(saved_bh); |
1424 | 1433 | ||
1425 | if (buffer_journal_dirty(saved_bh)) { | 1434 | if (buffer_journal_dirty(saved_bh)) { |
1426 | BUG_ON(!can_dirty(cn)); | 1435 | BUG_ON(!can_dirty(cn)); |
1427 | was_jwait = 1; | 1436 | was_jwait = 1; |
1428 | was_dirty = 1; | 1437 | was_dirty = 1; |
1429 | } else if (can_dirty(cn)) { | 1438 | } else if (can_dirty(cn)) { |
1430 | /* everything with !pjl && jwait should be writable */ | 1439 | /* everything with !pjl && jwait should be writable */ |
1431 | BUG(); | 1440 | BUG(); |
1432 | } | 1441 | } |
1433 | } | 1442 | } |
1434 | 1443 | ||
1435 | /* if someone has this block in a newer transaction, just make | 1444 | /* if someone has this block in a newer transaction, just make |
1436 | ** sure they are commited, and don't try writing it to disk | 1445 | ** sure they are commited, and don't try writing it to disk |
1437 | */ | 1446 | */ |
1438 | if (pjl) { | 1447 | if (pjl) { |
1439 | if (atomic_read(&pjl->j_commit_left)) | 1448 | if (atomic_read(&pjl->j_commit_left)) |
1440 | flush_commit_list(s, pjl, 1); | 1449 | flush_commit_list(s, pjl, 1); |
1441 | goto free_cnode; | 1450 | goto free_cnode; |
1442 | } | 1451 | } |
1443 | 1452 | ||
1444 | /* bh == NULL when the block got to disk on its own, OR, | 1453 | /* bh == NULL when the block got to disk on its own, OR, |
1445 | ** the block got freed in a future transaction | 1454 | ** the block got freed in a future transaction |
1446 | */ | 1455 | */ |
1447 | if (saved_bh == NULL) { | 1456 | if (saved_bh == NULL) { |
1448 | goto free_cnode; | 1457 | goto free_cnode; |
1449 | } | 1458 | } |
1450 | 1459 | ||
1451 | /* this should never happen. kupdate_one_transaction has this list | 1460 | /* this should never happen. kupdate_one_transaction has this list |
1452 | ** locked while it works, so we should never see a buffer here that | 1461 | ** locked while it works, so we should never see a buffer here that |
1453 | ** is not marked JDirty_wait | 1462 | ** is not marked JDirty_wait |
1454 | */ | 1463 | */ |
1455 | if ((!was_jwait) && !buffer_locked(saved_bh)) { | 1464 | if ((!was_jwait) && !buffer_locked(saved_bh)) { |
1456 | reiserfs_warning(s, | 1465 | reiserfs_warning(s, |
1457 | "journal-813: BAD! buffer %llu %cdirty %cjwait, " | 1466 | "journal-813: BAD! buffer %llu %cdirty %cjwait, " |
1458 | "not in a newer tranasction", | 1467 | "not in a newer tranasction", |
1459 | (unsigned long long)saved_bh-> | 1468 | (unsigned long long)saved_bh-> |
1460 | b_blocknr, was_dirty ? ' ' : '!', | 1469 | b_blocknr, was_dirty ? ' ' : '!', |
1461 | was_jwait ? ' ' : '!'); | 1470 | was_jwait ? ' ' : '!'); |
1462 | } | 1471 | } |
1463 | if (was_dirty) { | 1472 | if (was_dirty) { |
1464 | /* we inc again because saved_bh gets decremented at free_cnode */ | 1473 | /* we inc again because saved_bh gets decremented at free_cnode */ |
1465 | get_bh(saved_bh); | 1474 | get_bh(saved_bh); |
1466 | set_bit(BLOCK_NEEDS_FLUSH, &cn->state); | 1475 | set_bit(BLOCK_NEEDS_FLUSH, &cn->state); |
1467 | lock_buffer(saved_bh); | 1476 | lock_buffer(saved_bh); |
1468 | BUG_ON(cn->blocknr != saved_bh->b_blocknr); | 1477 | BUG_ON(cn->blocknr != saved_bh->b_blocknr); |
1469 | if (buffer_dirty(saved_bh)) | 1478 | if (buffer_dirty(saved_bh)) |
1470 | submit_logged_buffer(saved_bh); | 1479 | submit_logged_buffer(saved_bh); |
1471 | else | 1480 | else |
1472 | unlock_buffer(saved_bh); | 1481 | unlock_buffer(saved_bh); |
1473 | count++; | 1482 | count++; |
1474 | } else { | 1483 | } else { |
1475 | reiserfs_warning(s, | 1484 | reiserfs_warning(s, |
1476 | "clm-2082: Unable to flush buffer %llu in %s", | 1485 | "clm-2082: Unable to flush buffer %llu in %s", |
1477 | (unsigned long long)saved_bh-> | 1486 | (unsigned long long)saved_bh-> |
1478 | b_blocknr, __FUNCTION__); | 1487 | b_blocknr, __FUNCTION__); |
1479 | } | 1488 | } |
1480 | free_cnode: | 1489 | free_cnode: |
1481 | last = cn; | 1490 | last = cn; |
1482 | cn = cn->next; | 1491 | cn = cn->next; |
1483 | if (saved_bh) { | 1492 | if (saved_bh) { |
1484 | /* we incremented this to keep others from taking the buffer head away */ | 1493 | /* we incremented this to keep others from taking the buffer head away */ |
1485 | put_bh(saved_bh); | 1494 | put_bh(saved_bh); |
1486 | if (atomic_read(&(saved_bh->b_count)) < 0) { | 1495 | if (atomic_read(&(saved_bh->b_count)) < 0) { |
1487 | reiserfs_warning(s, | 1496 | reiserfs_warning(s, |
1488 | "journal-945: saved_bh->b_count < 0"); | 1497 | "journal-945: saved_bh->b_count < 0"); |
1489 | } | 1498 | } |
1490 | } | 1499 | } |
1491 | } | 1500 | } |
1492 | if (count > 0) { | 1501 | if (count > 0) { |
1493 | cn = jl->j_realblock; | 1502 | cn = jl->j_realblock; |
1494 | while (cn) { | 1503 | while (cn) { |
1495 | if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) { | 1504 | if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) { |
1496 | if (!cn->bh) { | 1505 | if (!cn->bh) { |
1497 | reiserfs_panic(s, | 1506 | reiserfs_panic(s, |
1498 | "journal-1011: cn->bh is NULL\n"); | 1507 | "journal-1011: cn->bh is NULL\n"); |
1499 | } | 1508 | } |
1500 | wait_on_buffer(cn->bh); | 1509 | wait_on_buffer(cn->bh); |
1501 | if (!cn->bh) { | 1510 | if (!cn->bh) { |
1502 | reiserfs_panic(s, | 1511 | reiserfs_panic(s, |
1503 | "journal-1012: cn->bh is NULL\n"); | 1512 | "journal-1012: cn->bh is NULL\n"); |
1504 | } | 1513 | } |
1505 | if (unlikely(!buffer_uptodate(cn->bh))) { | 1514 | if (unlikely(!buffer_uptodate(cn->bh))) { |
1506 | #ifdef CONFIG_REISERFS_CHECK | 1515 | #ifdef CONFIG_REISERFS_CHECK |
1507 | reiserfs_warning(s, | 1516 | reiserfs_warning(s, |
1508 | "journal-949: buffer write failed\n"); | 1517 | "journal-949: buffer write failed\n"); |
1509 | #endif | 1518 | #endif |
1510 | err = -EIO; | 1519 | err = -EIO; |
1511 | } | 1520 | } |
1512 | /* note, we must clear the JDirty_wait bit after the up to date | 1521 | /* note, we must clear the JDirty_wait bit after the up to date |
1513 | ** check, otherwise we race against our flushpage routine | 1522 | ** check, otherwise we race against our flushpage routine |
1514 | */ | 1523 | */ |
1515 | BUG_ON(!test_clear_buffer_journal_dirty | 1524 | BUG_ON(!test_clear_buffer_journal_dirty |
1516 | (cn->bh)); | 1525 | (cn->bh)); |
1517 | 1526 | ||
1518 | /* undo the inc from journal_mark_dirty */ | 1527 | /* undo the inc from journal_mark_dirty */ |
1519 | put_bh(cn->bh); | 1528 | put_bh(cn->bh); |
1520 | brelse(cn->bh); | 1529 | brelse(cn->bh); |
1521 | } | 1530 | } |
1522 | cn = cn->next; | 1531 | cn = cn->next; |
1523 | } | 1532 | } |
1524 | } | 1533 | } |
1525 | 1534 | ||
1526 | if (err) | 1535 | if (err) |
1527 | reiserfs_abort(s, -EIO, | 1536 | reiserfs_abort(s, -EIO, |
1528 | "Write error while pushing transaction to disk in %s", | 1537 | "Write error while pushing transaction to disk in %s", |
1529 | __FUNCTION__); | 1538 | __FUNCTION__); |
1530 | flush_older_and_return: | 1539 | flush_older_and_return: |
1531 | 1540 | ||
1532 | /* before we can update the journal header block, we _must_ flush all | 1541 | /* before we can update the journal header block, we _must_ flush all |
1533 | ** real blocks from all older transactions to disk. This is because | 1542 | ** real blocks from all older transactions to disk. This is because |
1534 | ** once the header block is updated, this transaction will not be | 1543 | ** once the header block is updated, this transaction will not be |
1535 | ** replayed after a crash | 1544 | ** replayed after a crash |
1536 | */ | 1545 | */ |
1537 | if (flushall) { | 1546 | if (flushall) { |
1538 | flush_older_journal_lists(s, jl); | 1547 | flush_older_journal_lists(s, jl); |
1539 | } | 1548 | } |
1540 | 1549 | ||
1541 | err = journal->j_errno; | 1550 | err = journal->j_errno; |
1542 | /* before we can remove everything from the hash tables for this | 1551 | /* before we can remove everything from the hash tables for this |
1543 | ** transaction, we must make sure it can never be replayed | 1552 | ** transaction, we must make sure it can never be replayed |
1544 | ** | 1553 | ** |
1545 | ** since we are only called from do_journal_end, we know for sure there | 1554 | ** since we are only called from do_journal_end, we know for sure there |
1546 | ** are no allocations going on while we are flushing journal lists. So, | 1555 | ** are no allocations going on while we are flushing journal lists. So, |
1547 | ** we only need to update the journal header block for the last list | 1556 | ** we only need to update the journal header block for the last list |
1548 | ** being flushed | 1557 | ** being flushed |
1549 | */ | 1558 | */ |
1550 | if (!err && flushall) { | 1559 | if (!err && flushall) { |
1551 | err = | 1560 | err = |
1552 | update_journal_header_block(s, | 1561 | update_journal_header_block(s, |
1553 | (jl->j_start + jl->j_len + | 1562 | (jl->j_start + jl->j_len + |
1554 | 2) % SB_ONDISK_JOURNAL_SIZE(s), | 1563 | 2) % SB_ONDISK_JOURNAL_SIZE(s), |
1555 | jl->j_trans_id); | 1564 | jl->j_trans_id); |
1556 | if (err) | 1565 | if (err) |
1557 | reiserfs_abort(s, -EIO, | 1566 | reiserfs_abort(s, -EIO, |
1558 | "Write error while updating journal header in %s", | 1567 | "Write error while updating journal header in %s", |
1559 | __FUNCTION__); | 1568 | __FUNCTION__); |
1560 | } | 1569 | } |
1561 | remove_all_from_journal_list(s, jl, 0); | 1570 | remove_all_from_journal_list(s, jl, 0); |
1562 | list_del_init(&jl->j_list); | 1571 | list_del_init(&jl->j_list); |
1563 | journal->j_num_lists--; | 1572 | journal->j_num_lists--; |
1564 | del_from_work_list(s, jl); | 1573 | del_from_work_list(s, jl); |
1565 | 1574 | ||
1566 | if (journal->j_last_flush_id != 0 && | 1575 | if (journal->j_last_flush_id != 0 && |
1567 | (jl->j_trans_id - journal->j_last_flush_id) != 1) { | 1576 | (jl->j_trans_id - journal->j_last_flush_id) != 1) { |
1568 | reiserfs_warning(s, "clm-2201: last flush %lu, current %lu", | 1577 | reiserfs_warning(s, "clm-2201: last flush %lu, current %lu", |
1569 | journal->j_last_flush_id, jl->j_trans_id); | 1578 | journal->j_last_flush_id, jl->j_trans_id); |
1570 | } | 1579 | } |
1571 | journal->j_last_flush_id = jl->j_trans_id; | 1580 | journal->j_last_flush_id = jl->j_trans_id; |
1572 | 1581 | ||
1573 | /* not strictly required since we are freeing the list, but it should | 1582 | /* not strictly required since we are freeing the list, but it should |
1574 | * help find code using dead lists later on | 1583 | * help find code using dead lists later on |
1575 | */ | 1584 | */ |
1576 | jl->j_len = 0; | 1585 | jl->j_len = 0; |
1577 | atomic_set(&(jl->j_nonzerolen), 0); | 1586 | atomic_set(&(jl->j_nonzerolen), 0); |
1578 | jl->j_start = 0; | 1587 | jl->j_start = 0; |
1579 | jl->j_realblock = NULL; | 1588 | jl->j_realblock = NULL; |
1580 | jl->j_commit_bh = NULL; | 1589 | jl->j_commit_bh = NULL; |
1581 | jl->j_trans_id = 0; | 1590 | jl->j_trans_id = 0; |
1582 | jl->j_state = 0; | 1591 | jl->j_state = 0; |
1583 | put_journal_list(s, jl); | 1592 | put_journal_list(s, jl); |
1584 | if (flushall) | 1593 | if (flushall) |
1585 | up(&journal->j_flush_sem); | 1594 | up(&journal->j_flush_sem); |
1586 | put_fs_excl(); | 1595 | put_fs_excl(); |
1587 | return err; | 1596 | return err; |
1588 | } | 1597 | } |
1589 | 1598 | ||
/* queue every dirty, flushable real block of transaction jl into *chunk
** (add_to_chunk is handed write_chunk as its flush callback).  Returns the
** number of buffers queued; 0 if the list is empty or already written out.
*/
static int write_one_transaction(struct super_block *s,
				 struct reiserfs_journal_list *jl,
				 struct buffer_chunk *chunk)
{
	struct reiserfs_journal_cnode *cn;
	int ret = 0;		/* count of buffers added to the chunk */

	jl->j_state |= LIST_TOUCHED;
	del_from_work_list(s, jl);
	if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) {
		return 0;
	}

	cn = jl->j_realblock;
	while (cn) {
		/* if the blocknr == 0, this has been cleared from the hash,
		** skip it
		*/
		if (cn->blocknr == 0) {
			goto next;
		}
		if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) {
			struct buffer_head *tmp_bh;
			/* we can race against journal_mark_freed when we try
			 * to lock_buffer(cn->bh), so we have to inc the buffer
			 * count, and recheck things after locking
			 */
			tmp_bh = cn->bh;
			get_bh(tmp_bh);
			lock_buffer(tmp_bh);
			/* recheck under the buffer lock: journal_mark_freed may
			 * have cleared cn->bh or the dirty state meanwhile
			 */
			if (cn->bh && can_dirty(cn) && buffer_dirty(tmp_bh)) {
				if (!buffer_journal_dirty(tmp_bh) ||
				    buffer_journal_prepared(tmp_bh))
					BUG();
				add_to_chunk(chunk, tmp_bh, NULL, write_chunk);
				ret++;
			} else {
				/* note, cn->bh might be null now */
				unlock_buffer(tmp_bh);
			}
			put_bh(tmp_bh);
		}
	      next:
		cn = cn->next;
		cond_resched();
	}
	return ret;
}
1638 | 1647 | ||
/* used by flush_commit_list
** walk the real blocks of transaction jl and mark the flushable ones dirty
** so normal writeback can send them to disk.
*/
static int dirty_one_transaction(struct super_block *s,
				 struct reiserfs_journal_list *jl)
{
	struct reiserfs_journal_cnode *cn;
	struct reiserfs_journal_list *pjl;
	int ret = 0;	/* always 0 in this body; kept for the caller's
			 * error-return convention */

	jl->j_state |= LIST_DIRTY;
	cn = jl->j_realblock;
	while (cn) {
		/* look for a more recent transaction that logged this
		** buffer.  Only the most recent transaction with a buffer in
		** it is allowed to send that buffer to disk
		*/
		pjl = find_newer_jl_for_cn(cn);
		if (!pjl && cn->blocknr && cn->bh
		    && buffer_journal_dirty(cn->bh)) {
			BUG_ON(!can_dirty(cn));
			/* if the buffer is prepared, it will either be logged
			 * or restored.  If restored, we need to make sure
			 * it actually gets marked dirty
			 */
			clear_buffer_journal_new(cn->bh);
			if (buffer_journal_prepared(cn->bh)) {
				set_buffer_journal_restore_dirty(cn->bh);
			} else {
				set_buffer_journal_test(cn->bh);
				mark_buffer_dirty(cn->bh);
			}
		}
		cn = cn->next;
	}
	return ret;
}
1674 | 1683 | ||
/* batch-write dirty real blocks from a run of consecutive journal lists,
** starting at jl.  Processes up to num_trans lists when num_trans is
** non-zero, otherwise keeps going until num_blocks buffers have been
** queued.  Serialized against other flushers by j_flush_sem.
**
** NOTE(review): next_jl and next_trans_id are not read or written in this
** body -- presumably reserved for callers/future use; confirm before
** relying on them.
*/
static int kupdate_transactions(struct super_block *s,
				struct reiserfs_journal_list *jl,
				struct reiserfs_journal_list **next_jl,
				unsigned long *next_trans_id,
				int num_blocks, int num_trans)
{
	int ret = 0;
	int written = 0;		/* buffers queued so far */
	int transactions_flushed = 0;	/* lists processed so far */
	unsigned long orig_trans_id = jl->j_trans_id;
	struct buffer_chunk chunk;
	struct list_head *entry;
	struct reiserfs_journal *journal = SB_JOURNAL(s);
	chunk.nr = 0;

	down(&journal->j_flush_sem);
	/* the starting list may have been flushed and freed before we got
	 * the semaphore; bail if so
	 */
	if (!journal_list_still_alive(s, orig_trans_id)) {
		goto done;
	}

	/* we've got j_flush_sem held, nobody is going to delete any
	 * of these lists out from underneath us
	 */
	while ((num_trans && transactions_flushed < num_trans) ||
	       (!num_trans && written < num_blocks)) {

		/* skip lists that are empty, already picked up, still
		 * committing, or not yet marked dirty
		 */
		if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) ||
		    atomic_read(&jl->j_commit_left)
		    || !(jl->j_state & LIST_DIRTY)) {
			del_from_work_list(s, jl);
			break;
		}
		ret = write_one_transaction(s, jl, &chunk);

		if (ret < 0)
			goto done;
		transactions_flushed++;
		written += ret;
		entry = jl->j_list.next;

		/* did we wrap? */
		if (entry == &journal->j_journal_list) {
			break;
		}
		jl = JOURNAL_LIST_ENTRY(entry);

		/* don't bother with older transactions */
		if (jl->j_trans_id <= orig_trans_id)
			break;
	}
	/* flush any buffers still batched in the chunk */
	if (chunk.nr) {
		write_chunk(&chunk);
	}

      done:
	up(&journal->j_flush_sem);
	return ret;
}
1733 | 1742 | ||
/* for o_sync and fsync heavy applications, they tend to use
** all the journal list slots with tiny transactions.  These
** trigger lots and lots of calls to update the header block, which
** adds seeks and slows things down.
**
** This function tries to clear out a large chunk of the journal lists
** at once, which makes everything faster since only the newest journal
** list updates the header block
*/
static int flush_used_journal_lists(struct super_block *s,
				    struct reiserfs_journal_list *jl)
{
	unsigned long len = 0;	/* total nonzero blocks spanned so far */
	unsigned long cur_len;
	int ret;
	int i;
	int limit = 256;	/* max blocks to span; raised for data=journal */
	struct reiserfs_journal_list *tjl;
	struct reiserfs_journal_list *flush_jl;
	unsigned long trans_id;
	struct reiserfs_journal *journal = SB_JOURNAL(s);

	flush_jl = tjl = jl;

	/* in data logging mode, try harder to flush a lot of blocks */
	if (reiserfs_data_log(s))
		limit = 1024;
	/* flush for 256 transactions or limit blocks, whichever comes first */
	for (i = 0; i < 256 && len < limit; i++) {
		/* stop at lists still committing, or older than jl */
		if (atomic_read(&tjl->j_commit_left) ||
		    tjl->j_trans_id < jl->j_trans_id) {
			break;
		}
		cur_len = atomic_read(&tjl->j_nonzerolen);
		if (cur_len > 0) {
			/* let kupdate_transactions pick this list up again */
			tjl->j_state &= ~LIST_TOUCHED;
		}
		len += cur_len;
		flush_jl = tjl;
		if (tjl->j_list.next == &journal->j_journal_list)
			break;
		tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next);
	}
	/* try to find a group of blocks we can flush across all the
	** transactions, but only bother if we've actually spanned
	** across multiple lists
	*/
	if (flush_jl != jl) {
		/* NOTE(review): ret is assigned but never checked -- this
		 * writeback appears best-effort, with errors presumably
		 * surfacing via journal abort inside flush_journal_list.
		 * Confirm before depending on error propagation here.
		 */
		ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
	}
	flush_journal_list(s, flush_jl, 1);
	return 0;
}
1787 | 1796 | ||
1788 | /* | 1797 | /* |
1789 | ** removes any nodes in table with name block and dev as bh. | 1798 | ** removes any nodes in table with name block and dev as bh. |
1790 | ** only touchs the hnext and hprev pointers. | 1799 | ** only touchs the hnext and hprev pointers. |
1791 | */ | 1800 | */ |
1792 | void remove_journal_hash(struct super_block *sb, | 1801 | void remove_journal_hash(struct super_block *sb, |
1793 | struct reiserfs_journal_cnode **table, | 1802 | struct reiserfs_journal_cnode **table, |
1794 | struct reiserfs_journal_list *jl, | 1803 | struct reiserfs_journal_list *jl, |
1795 | unsigned long block, int remove_freed) | 1804 | unsigned long block, int remove_freed) |
1796 | { | 1805 | { |
1797 | struct reiserfs_journal_cnode *cur; | 1806 | struct reiserfs_journal_cnode *cur; |
1798 | struct reiserfs_journal_cnode **head; | 1807 | struct reiserfs_journal_cnode **head; |
1799 | 1808 | ||
1800 | head = &(journal_hash(table, sb, block)); | 1809 | head = &(journal_hash(table, sb, block)); |
1801 | if (!head) { | 1810 | if (!head) { |
1802 | return; | 1811 | return; |
1803 | } | 1812 | } |
1804 | cur = *head; | 1813 | cur = *head; |
1805 | while (cur) { | 1814 | while (cur) { |
1806 | if (cur->blocknr == block && cur->sb == sb | 1815 | if (cur->blocknr == block && cur->sb == sb |
1807 | && (jl == NULL || jl == cur->jlist) | 1816 | && (jl == NULL || jl == cur->jlist) |
1808 | && (!test_bit(BLOCK_FREED, &cur->state) || remove_freed)) { | 1817 | && (!test_bit(BLOCK_FREED, &cur->state) || remove_freed)) { |
1809 | if (cur->hnext) { | 1818 | if (cur->hnext) { |
1810 | cur->hnext->hprev = cur->hprev; | 1819 | cur->hnext->hprev = cur->hprev; |
1811 | } | 1820 | } |
1812 | if (cur->hprev) { | 1821 | if (cur->hprev) { |
1813 | cur->hprev->hnext = cur->hnext; | 1822 | cur->hprev->hnext = cur->hnext; |
1814 | } else { | 1823 | } else { |
1815 | *head = cur->hnext; | 1824 | *head = cur->hnext; |
1816 | } | 1825 | } |
1817 | cur->blocknr = 0; | 1826 | cur->blocknr = 0; |
1818 | cur->sb = NULL; | 1827 | cur->sb = NULL; |
1819 | cur->state = 0; | 1828 | cur->state = 0; |
1820 | if (cur->bh && cur->jlist) /* anybody who clears the cur->bh will also dec the nonzerolen */ | 1829 | if (cur->bh && cur->jlist) /* anybody who clears the cur->bh will also dec the nonzerolen */ |
1821 | atomic_dec(&(cur->jlist->j_nonzerolen)); | 1830 | atomic_dec(&(cur->jlist->j_nonzerolen)); |
1822 | cur->bh = NULL; | 1831 | cur->bh = NULL; |
1823 | cur->jlist = NULL; | 1832 | cur->jlist = NULL; |
1824 | } | 1833 | } |
1825 | cur = cur->hnext; | 1834 | cur = cur->hnext; |
1826 | } | 1835 | } |
1827 | } | 1836 | } |
1828 | 1837 | ||
1829 | static void free_journal_ram(struct super_block *p_s_sb) | 1838 | static void free_journal_ram(struct super_block *p_s_sb) |
1830 | { | 1839 | { |
1831 | struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); | 1840 | struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); |
1832 | kfree(journal->j_current_jl); | 1841 | kfree(journal->j_current_jl); |
1833 | journal->j_num_lists--; | 1842 | journal->j_num_lists--; |
1834 | 1843 | ||
1835 | vfree(journal->j_cnode_free_orig); | 1844 | vfree(journal->j_cnode_free_orig); |
1836 | free_list_bitmaps(p_s_sb, journal->j_list_bitmap); | 1845 | free_list_bitmaps(p_s_sb, journal->j_list_bitmap); |
1837 | free_bitmap_nodes(p_s_sb); /* must be after free_list_bitmaps */ | 1846 | free_bitmap_nodes(p_s_sb); /* must be after free_list_bitmaps */ |
1838 | if (journal->j_header_bh) { | 1847 | if (journal->j_header_bh) { |
1839 | brelse(journal->j_header_bh); | 1848 | brelse(journal->j_header_bh); |
1840 | } | 1849 | } |
1841 | /* j_header_bh is on the journal dev, make sure not to release the journal | 1850 | /* j_header_bh is on the journal dev, make sure not to release the journal |
1842 | * dev until we brelse j_header_bh | 1851 | * dev until we brelse j_header_bh |
1843 | */ | 1852 | */ |
1844 | release_journal_dev(p_s_sb, journal); | 1853 | release_journal_dev(p_s_sb, journal); |
1845 | vfree(journal); | 1854 | vfree(journal); |
1846 | } | 1855 | } |
1847 | 1856 | ||
/*
** call on unmount.  Only set error to 1 if you haven't made your way out
** of read_super() yet.  Any other caller must keep error at 0.
**
** flushes outstanding transactions (when error == 0 and not read-only),
** drains the commit workqueue, and frees all journal ram.  Always
** returns 0.
*/
static int do_journal_release(struct reiserfs_transaction_handle *th,
			      struct super_block *p_s_sb, int error)
{
	struct reiserfs_transaction_handle myth;
	int flushed = 0;	/* NOTE(review): set but never read below --
				 * appears vestigial; confirm before removing */
	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);

	/* we only want to flush out transactions if we were called with error == 0
	 */
	if (!error && !(p_s_sb->s_flags & MS_RDONLY)) {
		/* end the current trans */
		BUG_ON(!th->t_trans_id);
		do_journal_end(th, p_s_sb, 10, FLUSH_ALL);

		/* make sure something gets logged to force our way into the flush code */
		if (!journal_join(&myth, p_s_sb, 1)) {
			reiserfs_prepare_for_journal(p_s_sb,
						     SB_BUFFER_WITH_SB(p_s_sb),
						     1);
			journal_mark_dirty(&myth, p_s_sb,
					   SB_BUFFER_WITH_SB(p_s_sb));
			do_journal_end(&myth, p_s_sb, 1, FLUSH_ALL);
			flushed = 1;
		}
	}

	/* this also catches errors during the do_journal_end above */
	if (!error && reiserfs_is_journal_aborted(journal)) {
		memset(&myth, 0, sizeof(myth));
		if (!journal_join_abort(&myth, p_s_sb, 1)) {
			reiserfs_prepare_for_journal(p_s_sb,
						     SB_BUFFER_WITH_SB(p_s_sb),
						     1);
			journal_mark_dirty(&myth, p_s_sb,
					   SB_BUFFER_WITH_SB(p_s_sb));
			do_journal_end(&myth, p_s_sb, 1, FLUSH_ALL);
		}
	}

	reiserfs_mounted_fs_count--;
	/* wait for all commits to finish */
	cancel_delayed_work(&SB_JOURNAL(p_s_sb)->j_work);
	flush_workqueue(commit_wq);
	if (!reiserfs_mounted_fs_count) {
		/* last reiserfs mount gone: tear down the shared workqueue */
		destroy_workqueue(commit_wq);
		commit_wq = NULL;
	}

	free_journal_ram(p_s_sb);

	return 0;
}
1904 | 1913 | ||
/*
** call on unmount.  flush all journal trans, release all alloc'd ram
*/
int journal_release(struct reiserfs_transaction_handle *th,
		    struct super_block *p_s_sb)
{
	/* error == 0: flush outstanding transactions before freeing */
	return do_journal_release(th, p_s_sb, 0);
}
1913 | 1922 | ||
/*
** only call from an error condition inside reiserfs_read_super!
*/
int journal_release_error(struct reiserfs_transaction_handle *th,
			  struct super_block *p_s_sb)
{
	/* error == 1: skip the transaction flush, just free journal ram */
	return do_journal_release(th, p_s_sb, 1);
}
1922 | 1931 | ||
1923 | /* compares description block with commit block. returns 1 if they differ, 0 if they are the same */ | 1932 | /* compares description block with commit block. returns 1 if they differ, 0 if they are the same */ |
1924 | static int journal_compare_desc_commit(struct super_block *p_s_sb, | 1933 | static int journal_compare_desc_commit(struct super_block *p_s_sb, |
1925 | struct reiserfs_journal_desc *desc, | 1934 | struct reiserfs_journal_desc *desc, |
1926 | struct reiserfs_journal_commit *commit) | 1935 | struct reiserfs_journal_commit *commit) |
1927 | { | 1936 | { |
1928 | if (get_commit_trans_id(commit) != get_desc_trans_id(desc) || | 1937 | if (get_commit_trans_id(commit) != get_desc_trans_id(desc) || |
1929 | get_commit_trans_len(commit) != get_desc_trans_len(desc) || | 1938 | get_commit_trans_len(commit) != get_desc_trans_len(desc) || |
1930 | get_commit_trans_len(commit) > SB_JOURNAL(p_s_sb)->j_trans_max || | 1939 | get_commit_trans_len(commit) > SB_JOURNAL(p_s_sb)->j_trans_max || |
1931 | get_commit_trans_len(commit) <= 0) { | 1940 | get_commit_trans_len(commit) <= 0) { |
1932 | return 1; | 1941 | return 1; |
1933 | } | 1942 | } |
1934 | return 0; | 1943 | return 0; |
1935 | } | 1944 | } |
1936 | 1945 | ||
1937 | /* returns 0 if it did not find a description block | 1946 | /* returns 0 if it did not find a description block |
1938 | ** returns -1 if it found a corrupt commit block | 1947 | ** returns -1 if it found a corrupt commit block |
1939 | ** returns 1 if both desc and commit were valid | 1948 | ** returns 1 if both desc and commit were valid |
1940 | */ | 1949 | */ |
1941 | static int journal_transaction_is_valid(struct super_block *p_s_sb, | 1950 | static int journal_transaction_is_valid(struct super_block *p_s_sb, |
1942 | struct buffer_head *d_bh, | 1951 | struct buffer_head *d_bh, |
1943 | unsigned long *oldest_invalid_trans_id, | 1952 | unsigned long *oldest_invalid_trans_id, |
1944 | unsigned long *newest_mount_id) | 1953 | unsigned long *newest_mount_id) |
1945 | { | 1954 | { |
1946 | struct reiserfs_journal_desc *desc; | 1955 | struct reiserfs_journal_desc *desc; |
1947 | struct reiserfs_journal_commit *commit; | 1956 | struct reiserfs_journal_commit *commit; |
1948 | struct buffer_head *c_bh; | 1957 | struct buffer_head *c_bh; |
1949 | unsigned long offset; | 1958 | unsigned long offset; |
1950 | 1959 | ||
1951 | if (!d_bh) | 1960 | if (!d_bh) |
1952 | return 0; | 1961 | return 0; |
1953 | 1962 | ||
1954 | desc = (struct reiserfs_journal_desc *)d_bh->b_data; | 1963 | desc = (struct reiserfs_journal_desc *)d_bh->b_data; |
1955 | if (get_desc_trans_len(desc) > 0 | 1964 | if (get_desc_trans_len(desc) > 0 |
1956 | && !memcmp(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8)) { | 1965 | && !memcmp(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8)) { |
1957 | if (oldest_invalid_trans_id && *oldest_invalid_trans_id | 1966 | if (oldest_invalid_trans_id && *oldest_invalid_trans_id |
1958 | && get_desc_trans_id(desc) > *oldest_invalid_trans_id) { | 1967 | && get_desc_trans_id(desc) > *oldest_invalid_trans_id) { |
1959 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, | 1968 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, |
1960 | "journal-986: transaction " | 1969 | "journal-986: transaction " |
1961 | "is valid returning because trans_id %d is greater than " | 1970 | "is valid returning because trans_id %d is greater than " |
1962 | "oldest_invalid %lu", | 1971 | "oldest_invalid %lu", |
1963 | get_desc_trans_id(desc), | 1972 | get_desc_trans_id(desc), |
1964 | *oldest_invalid_trans_id); | 1973 | *oldest_invalid_trans_id); |
1965 | return 0; | 1974 | return 0; |
1966 | } | 1975 | } |
1967 | if (newest_mount_id | 1976 | if (newest_mount_id |
1968 | && *newest_mount_id > get_desc_mount_id(desc)) { | 1977 | && *newest_mount_id > get_desc_mount_id(desc)) { |
1969 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, | 1978 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, |
1970 | "journal-1087: transaction " | 1979 | "journal-1087: transaction " |
1971 | "is valid returning because mount_id %d is less than " | 1980 | "is valid returning because mount_id %d is less than " |
1972 | "newest_mount_id %lu", | 1981 | "newest_mount_id %lu", |
1973 | get_desc_mount_id(desc), | 1982 | get_desc_mount_id(desc), |
1974 | *newest_mount_id); | 1983 | *newest_mount_id); |
1975 | return -1; | 1984 | return -1; |
1976 | } | 1985 | } |
1977 | if (get_desc_trans_len(desc) > SB_JOURNAL(p_s_sb)->j_trans_max) { | 1986 | if (get_desc_trans_len(desc) > SB_JOURNAL(p_s_sb)->j_trans_max) { |
1978 | reiserfs_warning(p_s_sb, | 1987 | reiserfs_warning(p_s_sb, |
1979 | "journal-2018: Bad transaction length %d encountered, ignoring transaction", | 1988 | "journal-2018: Bad transaction length %d encountered, ignoring transaction", |
1980 | get_desc_trans_len(desc)); | 1989 | get_desc_trans_len(desc)); |
1981 | return -1; | 1990 | return -1; |
1982 | } | 1991 | } |
1983 | offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb); | 1992 | offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb); |
1984 | 1993 | ||
1985 | /* ok, we have a journal description block, lets see if the transaction was valid */ | 1994 | /* ok, we have a journal description block, lets see if the transaction was valid */ |
1986 | c_bh = | 1995 | c_bh = |
1987 | journal_bread(p_s_sb, | 1996 | journal_bread(p_s_sb, |
1988 | SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + | 1997 | SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + |
1989 | ((offset + get_desc_trans_len(desc) + | 1998 | ((offset + get_desc_trans_len(desc) + |
1990 | 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))); | 1999 | 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))); |
1991 | if (!c_bh) | 2000 | if (!c_bh) |
1992 | return 0; | 2001 | return 0; |
1993 | commit = (struct reiserfs_journal_commit *)c_bh->b_data; | 2002 | commit = (struct reiserfs_journal_commit *)c_bh->b_data; |
1994 | if (journal_compare_desc_commit(p_s_sb, desc, commit)) { | 2003 | if (journal_compare_desc_commit(p_s_sb, desc, commit)) { |
1995 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, | 2004 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, |
1996 | "journal_transaction_is_valid, commit offset %ld had bad " | 2005 | "journal_transaction_is_valid, commit offset %ld had bad " |
1997 | "time %d or length %d", | 2006 | "time %d or length %d", |
1998 | c_bh->b_blocknr - | 2007 | c_bh->b_blocknr - |
1999 | SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), | 2008 | SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), |
2000 | get_commit_trans_id(commit), | 2009 | get_commit_trans_id(commit), |
2001 | get_commit_trans_len(commit)); | 2010 | get_commit_trans_len(commit)); |
2002 | brelse(c_bh); | 2011 | brelse(c_bh); |
2003 | if (oldest_invalid_trans_id) { | 2012 | if (oldest_invalid_trans_id) { |
2004 | *oldest_invalid_trans_id = | 2013 | *oldest_invalid_trans_id = |
2005 | get_desc_trans_id(desc); | 2014 | get_desc_trans_id(desc); |
2006 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, | 2015 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, |
2007 | "journal-1004: " | 2016 | "journal-1004: " |
2008 | "transaction_is_valid setting oldest invalid trans_id " | 2017 | "transaction_is_valid setting oldest invalid trans_id " |
2009 | "to %d", | 2018 | "to %d", |
2010 | get_desc_trans_id(desc)); | 2019 | get_desc_trans_id(desc)); |
2011 | } | 2020 | } |
2012 | return -1; | 2021 | return -1; |
2013 | } | 2022 | } |
2014 | brelse(c_bh); | 2023 | brelse(c_bh); |
2015 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, | 2024 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, |
2016 | "journal-1006: found valid " | 2025 | "journal-1006: found valid " |
2017 | "transaction start offset %llu, len %d id %d", | 2026 | "transaction start offset %llu, len %d id %d", |
2018 | d_bh->b_blocknr - | 2027 | d_bh->b_blocknr - |
2019 | SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), | 2028 | SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), |
2020 | get_desc_trans_len(desc), | 2029 | get_desc_trans_len(desc), |
2021 | get_desc_trans_id(desc)); | 2030 | get_desc_trans_id(desc)); |
2022 | return 1; | 2031 | return 1; |
2023 | } else { | 2032 | } else { |
2024 | return 0; | 2033 | return 0; |
2025 | } | 2034 | } |
2026 | } | 2035 | } |
2027 | 2036 | ||
/* Drop one reference on each buffer_head in an array of heads. */
static void brelse_array(struct buffer_head **heads, int num)
{
	int n = 0;

	while (n < num) {
		brelse(heads[n]);
		n++;
	}
}
2035 | 2044 | ||
/*
** Given the start, and values for the oldest acceptable transactions,
** this either reads in and replays a transaction, or returns because the
** transaction is invalid, or too old.
**
** Returns 0 when the transaction was replayed, 1 when it was skipped
** (unreadable descriptor, too old, wrong mount id, or no matching commit
** block), and -1 on allocation failure or a replay failure that requires
** fsck.
*/
static int journal_read_transaction(struct super_block *p_s_sb,
				    unsigned long cur_dblock,
				    unsigned long oldest_start,
				    unsigned long oldest_trans_id,
				    unsigned long newest_mount_id)
{
	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
	struct reiserfs_journal_desc *desc;
	struct reiserfs_journal_commit *commit;
	unsigned long trans_id = 0;
	struct buffer_head *c_bh;	/* commit block of this transaction */
	struct buffer_head *d_bh;	/* descriptor block of this transaction */
	struct buffer_head **log_blocks = NULL;	/* copies living in the log area */
	struct buffer_head **real_blocks = NULL;	/* home locations on the fs */
	unsigned long trans_offset;
	int i;
	int trans_half;

	d_bh = journal_bread(p_s_sb, cur_dblock);
	if (!d_bh)
		return 1;
	desc = (struct reiserfs_journal_desc *)d_bh->b_data;
	trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb);
	reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1037: "
		       "journal_read_transaction, offset %llu, len %d mount_id %d",
		       d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
		       get_desc_trans_len(desc), get_desc_mount_id(desc));
	/* skip transactions older than the oldest one the caller will replay */
	if (get_desc_trans_id(desc) < oldest_trans_id) {
		reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1039: "
			       "journal_read_trans skipping because %lu is too old",
			       cur_dblock -
			       SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb));
		brelse(d_bh);
		return 1;
	}
	/* skip transactions left over from an earlier mount of this fs */
	if (get_desc_mount_id(desc) != newest_mount_id) {
		reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1146: "
			       "journal_read_trans skipping because %d is != "
			       "newest_mount_id %lu", get_desc_mount_id(desc),
			       newest_mount_id);
		brelse(d_bh);
		return 1;
	}
	/* the commit block sits right after the trans_len logged data blocks,
	 * modulo the on-disk journal size (the log wraps) */
	c_bh = journal_bread(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
			     ((trans_offset + get_desc_trans_len(desc) + 1) %
			      SB_ONDISK_JOURNAL_SIZE(p_s_sb)));
	if (!c_bh) {
		brelse(d_bh);
		return 1;
	}
	commit = (struct reiserfs_journal_commit *)c_bh->b_data;
	/* desc/commit mismatch means the transaction was never fully committed */
	if (journal_compare_desc_commit(p_s_sb, desc, commit)) {
		reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
			       "journal_read_transaction, "
			       "commit offset %llu had bad time %d or length %d",
			       c_bh->b_blocknr -
			       SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
			       get_commit_trans_id(commit),
			       get_commit_trans_len(commit));
		brelse(c_bh);
		brelse(d_bh);
		return 1;
	}
	trans_id = get_desc_trans_id(desc);
	/* now we know we've got a good transaction, and it was inside the valid time ranges */
	log_blocks = kmalloc(get_desc_trans_len(desc) *
			     sizeof(struct buffer_head *), GFP_NOFS);
	real_blocks = kmalloc(get_desc_trans_len(desc) *
			      sizeof(struct buffer_head *), GFP_NOFS);
	if (!log_blocks || !real_blocks) {
		brelse(c_bh);
		brelse(d_bh);
		kfree(log_blocks);
		kfree(real_blocks);
		reiserfs_warning(p_s_sb,
				 "journal-1169: kmalloc failed, unable to mount FS");
		return -1;
	}
	/* get all the buffer heads */
	trans_half = journal_trans_half(p_s_sb->s_blocksize);
	for (i = 0; i < get_desc_trans_len(desc); i++) {
		log_blocks[i] =
		    journal_getblk(p_s_sb,
				   SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
				   (trans_offset + 1 +
				    i) % SB_ONDISK_JOURNAL_SIZE(p_s_sb));
		/* real-block numbers are split across the transaction: the
		 * first trans_half live in the descriptor, the remainder in
		 * the commit block */
		if (i < trans_half) {
			real_blocks[i] =
			    sb_getblk(p_s_sb,
				      le32_to_cpu(desc->j_realblock[i]));
		} else {
			real_blocks[i] =
			    sb_getblk(p_s_sb,
				      le32_to_cpu(commit->
						  j_realblock[i - trans_half]));
		}
		if (real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(p_s_sb)) {
			reiserfs_warning(p_s_sb,
					 "journal-1207: REPLAY FAILURE fsck required! Block to replay is outside of filesystem");
			goto abort_replay;
		}
		/* make sure we don't try to replay onto log or reserved area */
		if (is_block_in_log_or_reserved_area
		    (p_s_sb, real_blocks[i]->b_blocknr)) {
			reiserfs_warning(p_s_sb,
					 "journal-1204: REPLAY FAILURE fsck required! Trying to replay onto a log block");
			/* note: i blocks of each array were acquired so far,
			 * the label is shared by both failure cases above */
		      abort_replay:
			brelse_array(log_blocks, i);
			brelse_array(real_blocks, i);
			brelse(c_bh);
			brelse(d_bh);
			kfree(log_blocks);
			kfree(real_blocks);
			return -1;
		}
	}
	/* read in the log blocks, memcpy to the corresponding real block */
	ll_rw_block(READ, get_desc_trans_len(desc), log_blocks);
	for (i = 0; i < get_desc_trans_len(desc); i++) {
		wait_on_buffer(log_blocks[i]);
		if (!buffer_uptodate(log_blocks[i])) {
			reiserfs_warning(p_s_sb,
					 "journal-1212: REPLAY FAILURE fsck required! buffer write failed");
			/* blocks [0, i) were already brelse'd below; release
			 * the untouched tail of log_blocks and all real_blocks */
			brelse_array(log_blocks + i,
				     get_desc_trans_len(desc) - i);
			brelse_array(real_blocks, get_desc_trans_len(desc));
			brelse(c_bh);
			brelse(d_bh);
			kfree(log_blocks);
			kfree(real_blocks);
			return -1;
		}
		memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data,
		       real_blocks[i]->b_size);
		set_buffer_uptodate(real_blocks[i]);
		brelse(log_blocks[i]);
	}
	/* flush out the real blocks */
	for (i = 0; i < get_desc_trans_len(desc); i++) {
		set_buffer_dirty(real_blocks[i]);
		ll_rw_block(SWRITE, 1, real_blocks + i);
	}
	for (i = 0; i < get_desc_trans_len(desc); i++) {
		wait_on_buffer(real_blocks[i]);
		if (!buffer_uptodate(real_blocks[i])) {
			reiserfs_warning(p_s_sb,
					 "journal-1226: REPLAY FAILURE, fsck required! buffer write failed");
			brelse_array(real_blocks + i,
				     get_desc_trans_len(desc) - i);
			brelse(c_bh);
			brelse(d_bh);
			kfree(log_blocks);
			kfree(real_blocks);
			return -1;
		}
		brelse(real_blocks[i]);
	}
	/* advance past this transaction: descriptor + data blocks + commit,
	 * wrapping around the on-disk journal size */
	cur_dblock =
	    SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
	    ((trans_offset + get_desc_trans_len(desc) +
	      2) % SB_ONDISK_JOURNAL_SIZE(p_s_sb));
	reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
		       "journal-1095: setting journal " "start to offset %ld",
		       cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb));

	/* init starting values for the first transaction, in case this is the last transaction to be replayed. */
	journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb);
	journal->j_last_flush_trans_id = trans_id;
	journal->j_trans_id = trans_id + 1;
	brelse(c_bh);
	brelse(d_bh);
	kfree(log_blocks);
	kfree(real_blocks);
	return 0;
}
2216 | 2225 | ||
2217 | /* This function reads blocks starting from block and to max_block of bufsize | 2226 | /* This function reads blocks starting from block and to max_block of bufsize |
2218 | size (but no more than BUFNR blocks at a time). This proved to improve | 2227 | size (but no more than BUFNR blocks at a time). This proved to improve |
2219 | mounting speed on self-rebuilding raid5 arrays at least. | 2228 | mounting speed on self-rebuilding raid5 arrays at least. |
2220 | Right now it is only used from journal code. But later we might use it | 2229 | Right now it is only used from journal code. But later we might use it |
2221 | from other places. | 2230 | from other places. |
2222 | Note: Do not use journal_getblk/sb_getblk functions here! */ | 2231 | Note: Do not use journal_getblk/sb_getblk functions here! */ |
2223 | static struct buffer_head *reiserfs_breada(struct block_device *dev, int block, | 2232 | static struct buffer_head *reiserfs_breada(struct block_device *dev, int block, |
2224 | int bufsize, unsigned int max_block) | 2233 | int bufsize, unsigned int max_block) |
2225 | { | 2234 | { |
2226 | struct buffer_head *bhlist[BUFNR]; | 2235 | struct buffer_head *bhlist[BUFNR]; |
2227 | unsigned int blocks = BUFNR; | 2236 | unsigned int blocks = BUFNR; |
2228 | struct buffer_head *bh; | 2237 | struct buffer_head *bh; |
2229 | int i, j; | 2238 | int i, j; |
2230 | 2239 | ||
2231 | bh = __getblk(dev, block, bufsize); | 2240 | bh = __getblk(dev, block, bufsize); |
2232 | if (buffer_uptodate(bh)) | 2241 | if (buffer_uptodate(bh)) |
2233 | return (bh); | 2242 | return (bh); |
2234 | 2243 | ||
2235 | if (block + BUFNR > max_block) { | 2244 | if (block + BUFNR > max_block) { |
2236 | blocks = max_block - block; | 2245 | blocks = max_block - block; |
2237 | } | 2246 | } |
2238 | bhlist[0] = bh; | 2247 | bhlist[0] = bh; |
2239 | j = 1; | 2248 | j = 1; |
2240 | for (i = 1; i < blocks; i++) { | 2249 | for (i = 1; i < blocks; i++) { |
2241 | bh = __getblk(dev, block + i, bufsize); | 2250 | bh = __getblk(dev, block + i, bufsize); |
2242 | if (buffer_uptodate(bh)) { | 2251 | if (buffer_uptodate(bh)) { |
2243 | brelse(bh); | 2252 | brelse(bh); |
2244 | break; | 2253 | break; |
2245 | } else | 2254 | } else |
2246 | bhlist[j++] = bh; | 2255 | bhlist[j++] = bh; |
2247 | } | 2256 | } |
2248 | ll_rw_block(READ, j, bhlist); | 2257 | ll_rw_block(READ, j, bhlist); |
2249 | for (i = 1; i < j; i++) | 2258 | for (i = 1; i < j; i++) |
2250 | brelse(bhlist[i]); | 2259 | brelse(bhlist[i]); |
2251 | bh = bhlist[0]; | 2260 | bh = bhlist[0]; |
2252 | wait_on_buffer(bh); | 2261 | wait_on_buffer(bh); |
2253 | if (buffer_uptodate(bh)) | 2262 | if (buffer_uptodate(bh)) |
2254 | return bh; | 2263 | return bh; |
2255 | brelse(bh); | 2264 | brelse(bh); |
2256 | return NULL; | 2265 | return NULL; |
2257 | } | 2266 | } |
2258 | 2267 | ||
2259 | /* | 2268 | /* |
2260 | ** read and replay the log | 2269 | ** read and replay the log |
2261 | ** on a clean unmount, the journal header's next unflushed pointer will be to an invalid | 2270 | ** on a clean unmount, the journal header's next unflushed pointer will be to an invalid |
2262 | ** transaction. This tests that before finding all the transactions in the log, which makes normal mount times fast. | 2271 | ** transaction. This tests that before finding all the transactions in the log, which makes normal mount times fast. |
2263 | ** | 2272 | ** |
2264 | ** After a crash, this starts with the next unflushed transaction, and replays until it finds one too old, or invalid. | 2273 | ** After a crash, this starts with the next unflushed transaction, and replays until it finds one too old, or invalid. |
2265 | ** | 2274 | ** |
2266 | ** On exit, it sets things up so the first transaction will work correctly. | 2275 | ** On exit, it sets things up so the first transaction will work correctly. |
2267 | */ | 2276 | */ |
2268 | static int journal_read(struct super_block *p_s_sb) | 2277 | static int journal_read(struct super_block *p_s_sb) |
2269 | { | 2278 | { |
2270 | struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); | 2279 | struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); |
2271 | struct reiserfs_journal_desc *desc; | 2280 | struct reiserfs_journal_desc *desc; |
2272 | unsigned long oldest_trans_id = 0; | 2281 | unsigned long oldest_trans_id = 0; |
2273 | unsigned long oldest_invalid_trans_id = 0; | 2282 | unsigned long oldest_invalid_trans_id = 0; |
2274 | time_t start; | 2283 | time_t start; |
2275 | unsigned long oldest_start = 0; | 2284 | unsigned long oldest_start = 0; |
2276 | unsigned long cur_dblock = 0; | 2285 | unsigned long cur_dblock = 0; |
2277 | unsigned long newest_mount_id = 9; | 2286 | unsigned long newest_mount_id = 9; |
2278 | struct buffer_head *d_bh; | 2287 | struct buffer_head *d_bh; |
2279 | struct reiserfs_journal_header *jh; | 2288 | struct reiserfs_journal_header *jh; |
2280 | int valid_journal_header = 0; | 2289 | int valid_journal_header = 0; |
2281 | int replay_count = 0; | 2290 | int replay_count = 0; |
2282 | int continue_replay = 1; | 2291 | int continue_replay = 1; |
2283 | int ret; | 2292 | int ret; |
2284 | char b[BDEVNAME_SIZE]; | 2293 | char b[BDEVNAME_SIZE]; |
2285 | 2294 | ||
2286 | cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb); | 2295 | cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb); |
2287 | reiserfs_info(p_s_sb, "checking transaction log (%s)\n", | 2296 | reiserfs_info(p_s_sb, "checking transaction log (%s)\n", |
2288 | bdevname(journal->j_dev_bd, b)); | 2297 | bdevname(journal->j_dev_bd, b)); |
2289 | start = get_seconds(); | 2298 | start = get_seconds(); |
2290 | 2299 | ||
2291 | /* step 1, read in the journal header block. Check the transaction it says | 2300 | /* step 1, read in the journal header block. Check the transaction it says |
2292 | ** is the first unflushed, and if that transaction is not valid, | 2301 | ** is the first unflushed, and if that transaction is not valid, |
2293 | ** replay is done | 2302 | ** replay is done |
2294 | */ | 2303 | */ |
2295 | journal->j_header_bh = journal_bread(p_s_sb, | 2304 | journal->j_header_bh = journal_bread(p_s_sb, |
2296 | SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) | 2305 | SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) |
2297 | + SB_ONDISK_JOURNAL_SIZE(p_s_sb)); | 2306 | + SB_ONDISK_JOURNAL_SIZE(p_s_sb)); |
2298 | if (!journal->j_header_bh) { | 2307 | if (!journal->j_header_bh) { |
2299 | return 1; | 2308 | return 1; |
2300 | } | 2309 | } |
2301 | jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data); | 2310 | jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data); |
2302 | if (le32_to_cpu(jh->j_first_unflushed_offset) >= 0 && | 2311 | if (le32_to_cpu(jh->j_first_unflushed_offset) >= 0 && |
2303 | le32_to_cpu(jh->j_first_unflushed_offset) < | 2312 | le32_to_cpu(jh->j_first_unflushed_offset) < |
2304 | SB_ONDISK_JOURNAL_SIZE(p_s_sb) | 2313 | SB_ONDISK_JOURNAL_SIZE(p_s_sb) |
2305 | && le32_to_cpu(jh->j_last_flush_trans_id) > 0) { | 2314 | && le32_to_cpu(jh->j_last_flush_trans_id) > 0) { |
2306 | oldest_start = | 2315 | oldest_start = |
2307 | SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + | 2316 | SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + |
2308 | le32_to_cpu(jh->j_first_unflushed_offset); | 2317 | le32_to_cpu(jh->j_first_unflushed_offset); |
2309 | oldest_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1; | 2318 | oldest_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1; |
2310 | newest_mount_id = le32_to_cpu(jh->j_mount_id); | 2319 | newest_mount_id = le32_to_cpu(jh->j_mount_id); |
2311 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, | 2320 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, |
2312 | "journal-1153: found in " | 2321 | "journal-1153: found in " |
2313 | "header: first_unflushed_offset %d, last_flushed_trans_id " | 2322 | "header: first_unflushed_offset %d, last_flushed_trans_id " |
2314 | "%lu", le32_to_cpu(jh->j_first_unflushed_offset), | 2323 | "%lu", le32_to_cpu(jh->j_first_unflushed_offset), |
2315 | le32_to_cpu(jh->j_last_flush_trans_id)); | 2324 | le32_to_cpu(jh->j_last_flush_trans_id)); |
2316 | valid_journal_header = 1; | 2325 | valid_journal_header = 1; |
2317 | 2326 | ||
2318 | /* now, we try to read the first unflushed offset. If it is not valid, | 2327 | /* now, we try to read the first unflushed offset. If it is not valid, |
2319 | ** there is nothing more we can do, and it makes no sense to read | 2328 | ** there is nothing more we can do, and it makes no sense to read |
2320 | ** through the whole log. | 2329 | ** through the whole log. |
2321 | */ | 2330 | */ |
2322 | d_bh = | 2331 | d_bh = |
2323 | journal_bread(p_s_sb, | 2332 | journal_bread(p_s_sb, |
2324 | SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + | 2333 | SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + |
2325 | le32_to_cpu(jh->j_first_unflushed_offset)); | 2334 | le32_to_cpu(jh->j_first_unflushed_offset)); |
2326 | ret = journal_transaction_is_valid(p_s_sb, d_bh, NULL, NULL); | 2335 | ret = journal_transaction_is_valid(p_s_sb, d_bh, NULL, NULL); |
2327 | if (!ret) { | 2336 | if (!ret) { |
2328 | continue_replay = 0; | 2337 | continue_replay = 0; |
2329 | } | 2338 | } |
2330 | brelse(d_bh); | 2339 | brelse(d_bh); |
2331 | goto start_log_replay; | 2340 | goto start_log_replay; |
2332 | } | 2341 | } |
2333 | 2342 | ||
2334 | if (continue_replay && bdev_read_only(p_s_sb->s_bdev)) { | 2343 | if (continue_replay && bdev_read_only(p_s_sb->s_bdev)) { |
2335 | reiserfs_warning(p_s_sb, | 2344 | reiserfs_warning(p_s_sb, |
2336 | "clm-2076: device is readonly, unable to replay log"); | 2345 | "clm-2076: device is readonly, unable to replay log"); |
2337 | return -1; | 2346 | return -1; |
2338 | } | 2347 | } |
2339 | 2348 | ||
2340 | /* ok, there are transactions that need to be replayed. start with the first log block, find | 2349 | /* ok, there are transactions that need to be replayed. start with the first log block, find |
2341 | ** all the valid transactions, and pick out the oldest. | 2350 | ** all the valid transactions, and pick out the oldest. |
2342 | */ | 2351 | */ |
2343 | while (continue_replay | 2352 | while (continue_replay |
2344 | && cur_dblock < | 2353 | && cur_dblock < |
2345 | (SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + | 2354 | (SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + |
2346 | SB_ONDISK_JOURNAL_SIZE(p_s_sb))) { | 2355 | SB_ONDISK_JOURNAL_SIZE(p_s_sb))) { |
2347 | /* Note that it is required for blocksize of primary fs device and journal | 2356 | /* Note that it is required for blocksize of primary fs device and journal |
2348 | device to be the same */ | 2357 | device to be the same */ |
2349 | d_bh = | 2358 | d_bh = |
2350 | reiserfs_breada(journal->j_dev_bd, cur_dblock, | 2359 | reiserfs_breada(journal->j_dev_bd, cur_dblock, |
2351 | p_s_sb->s_blocksize, | 2360 | p_s_sb->s_blocksize, |
2352 | SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + | 2361 | SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + |
2353 | SB_ONDISK_JOURNAL_SIZE(p_s_sb)); | 2362 | SB_ONDISK_JOURNAL_SIZE(p_s_sb)); |
2354 | ret = | 2363 | ret = |
2355 | journal_transaction_is_valid(p_s_sb, d_bh, | 2364 | journal_transaction_is_valid(p_s_sb, d_bh, |
2356 | &oldest_invalid_trans_id, | 2365 | &oldest_invalid_trans_id, |
2357 | &newest_mount_id); | 2366 | &newest_mount_id); |
2358 | if (ret == 1) { | 2367 | if (ret == 1) { |
2359 | desc = (struct reiserfs_journal_desc *)d_bh->b_data; | 2368 | desc = (struct reiserfs_journal_desc *)d_bh->b_data; |
2360 | if (oldest_start == 0) { /* init all oldest_ values */ | 2369 | if (oldest_start == 0) { /* init all oldest_ values */ |
2361 | oldest_trans_id = get_desc_trans_id(desc); | 2370 | oldest_trans_id = get_desc_trans_id(desc); |
2362 | oldest_start = d_bh->b_blocknr; | 2371 | oldest_start = d_bh->b_blocknr; |
2363 | newest_mount_id = get_desc_mount_id(desc); | 2372 | newest_mount_id = get_desc_mount_id(desc); |
2364 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, | 2373 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, |
2365 | "journal-1179: Setting " | 2374 | "journal-1179: Setting " |
2366 | "oldest_start to offset %llu, trans_id %lu", | 2375 | "oldest_start to offset %llu, trans_id %lu", |
2367 | oldest_start - | 2376 | oldest_start - |
2368 | SB_ONDISK_JOURNAL_1st_BLOCK | 2377 | SB_ONDISK_JOURNAL_1st_BLOCK |
2369 | (p_s_sb), oldest_trans_id); | 2378 | (p_s_sb), oldest_trans_id); |
2370 | } else if (oldest_trans_id > get_desc_trans_id(desc)) { | 2379 | } else if (oldest_trans_id > get_desc_trans_id(desc)) { |
2371 | /* one we just read was older */ | 2380 | /* one we just read was older */ |
2372 | oldest_trans_id = get_desc_trans_id(desc); | 2381 | oldest_trans_id = get_desc_trans_id(desc); |
2373 | oldest_start = d_bh->b_blocknr; | 2382 | oldest_start = d_bh->b_blocknr; |
2374 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, | 2383 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, |
2375 | "journal-1180: Resetting " | 2384 | "journal-1180: Resetting " |
2376 | "oldest_start to offset %lu, trans_id %lu", | 2385 | "oldest_start to offset %lu, trans_id %lu", |
2377 | oldest_start - | 2386 | oldest_start - |
2378 | SB_ONDISK_JOURNAL_1st_BLOCK | 2387 | SB_ONDISK_JOURNAL_1st_BLOCK |
2379 | (p_s_sb), oldest_trans_id); | 2388 | (p_s_sb), oldest_trans_id); |
2380 | } | 2389 | } |
2381 | if (newest_mount_id < get_desc_mount_id(desc)) { | 2390 | if (newest_mount_id < get_desc_mount_id(desc)) { |
2382 | newest_mount_id = get_desc_mount_id(desc); | 2391 | newest_mount_id = get_desc_mount_id(desc); |
2383 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, | 2392 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, |
2384 | "journal-1299: Setting " | 2393 | "journal-1299: Setting " |
2385 | "newest_mount_id to %d", | 2394 | "newest_mount_id to %d", |
2386 | get_desc_mount_id(desc)); | 2395 | get_desc_mount_id(desc)); |
2387 | } | 2396 | } |
2388 | cur_dblock += get_desc_trans_len(desc) + 2; | 2397 | cur_dblock += get_desc_trans_len(desc) + 2; |
2389 | } else { | 2398 | } else { |
2390 | cur_dblock++; | 2399 | cur_dblock++; |
2391 | } | 2400 | } |
2392 | brelse(d_bh); | 2401 | brelse(d_bh); |
2393 | } | 2402 | } |
2394 | 2403 | ||
2395 | start_log_replay: | 2404 | start_log_replay: |
2396 | cur_dblock = oldest_start; | 2405 | cur_dblock = oldest_start; |
2397 | if (oldest_trans_id) { | 2406 | if (oldest_trans_id) { |
2398 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, | 2407 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, |
2399 | "journal-1206: Starting replay " | 2408 | "journal-1206: Starting replay " |
2400 | "from offset %llu, trans_id %lu", | 2409 | "from offset %llu, trans_id %lu", |
2401 | cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), | 2410 | cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), |
2402 | oldest_trans_id); | 2411 | oldest_trans_id); |
2403 | 2412 | ||
2404 | } | 2413 | } |
2405 | replay_count = 0; | 2414 | replay_count = 0; |
2406 | while (continue_replay && oldest_trans_id > 0) { | 2415 | while (continue_replay && oldest_trans_id > 0) { |
2407 | ret = | 2416 | ret = |
2408 | journal_read_transaction(p_s_sb, cur_dblock, oldest_start, | 2417 | journal_read_transaction(p_s_sb, cur_dblock, oldest_start, |
2409 | oldest_trans_id, newest_mount_id); | 2418 | oldest_trans_id, newest_mount_id); |
2410 | if (ret < 0) { | 2419 | if (ret < 0) { |
2411 | return ret; | 2420 | return ret; |
2412 | } else if (ret != 0) { | 2421 | } else if (ret != 0) { |
2413 | break; | 2422 | break; |
2414 | } | 2423 | } |
2415 | cur_dblock = | 2424 | cur_dblock = |
2416 | SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + journal->j_start; | 2425 | SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + journal->j_start; |
2417 | replay_count++; | 2426 | replay_count++; |
2418 | if (cur_dblock == oldest_start) | 2427 | if (cur_dblock == oldest_start) |
2419 | break; | 2428 | break; |
2420 | } | 2429 | } |
2421 | 2430 | ||
2422 | if (oldest_trans_id == 0) { | 2431 | if (oldest_trans_id == 0) { |
2423 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, | 2432 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, |
2424 | "journal-1225: No valid " "transactions found"); | 2433 | "journal-1225: No valid " "transactions found"); |
2425 | } | 2434 | } |
2426 | /* j_start does not get set correctly if we don't replay any transactions. | 2435 | /* j_start does not get set correctly if we don't replay any transactions. |
2427 | ** if we had a valid journal_header, set j_start to the first unflushed transaction value, | 2436 | ** if we had a valid journal_header, set j_start to the first unflushed transaction value, |
2428 | ** copy the trans_id from the header | 2437 | ** copy the trans_id from the header |
2429 | */ | 2438 | */ |
2430 | if (valid_journal_header && replay_count == 0) { | 2439 | if (valid_journal_header && replay_count == 0) { |
2431 | journal->j_start = le32_to_cpu(jh->j_first_unflushed_offset); | 2440 | journal->j_start = le32_to_cpu(jh->j_first_unflushed_offset); |
2432 | journal->j_trans_id = | 2441 | journal->j_trans_id = |
2433 | le32_to_cpu(jh->j_last_flush_trans_id) + 1; | 2442 | le32_to_cpu(jh->j_last_flush_trans_id) + 1; |
2434 | journal->j_last_flush_trans_id = | 2443 | journal->j_last_flush_trans_id = |
2435 | le32_to_cpu(jh->j_last_flush_trans_id); | 2444 | le32_to_cpu(jh->j_last_flush_trans_id); |
2436 | journal->j_mount_id = le32_to_cpu(jh->j_mount_id) + 1; | 2445 | journal->j_mount_id = le32_to_cpu(jh->j_mount_id) + 1; |
2437 | } else { | 2446 | } else { |
2438 | journal->j_mount_id = newest_mount_id + 1; | 2447 | journal->j_mount_id = newest_mount_id + 1; |
2439 | } | 2448 | } |
2440 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1299: Setting " | 2449 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1299: Setting " |
2441 | "newest_mount_id to %lu", journal->j_mount_id); | 2450 | "newest_mount_id to %lu", journal->j_mount_id); |
2442 | journal->j_first_unflushed_offset = journal->j_start; | 2451 | journal->j_first_unflushed_offset = journal->j_start; |
2443 | if (replay_count > 0) { | 2452 | if (replay_count > 0) { |
2444 | reiserfs_info(p_s_sb, | 2453 | reiserfs_info(p_s_sb, |
2445 | "replayed %d transactions in %lu seconds\n", | 2454 | "replayed %d transactions in %lu seconds\n", |
2446 | replay_count, get_seconds() - start); | 2455 | replay_count, get_seconds() - start); |
2447 | } | 2456 | } |
2448 | if (!bdev_read_only(p_s_sb->s_bdev) && | 2457 | if (!bdev_read_only(p_s_sb->s_bdev) && |
2449 | _update_journal_header_block(p_s_sb, journal->j_start, | 2458 | _update_journal_header_block(p_s_sb, journal->j_start, |
2450 | journal->j_last_flush_trans_id)) { | 2459 | journal->j_last_flush_trans_id)) { |
2451 | /* replay failed, caller must call free_journal_ram and abort | 2460 | /* replay failed, caller must call free_journal_ram and abort |
2452 | ** the mount | 2461 | ** the mount |
2453 | */ | 2462 | */ |
2454 | return -1; | 2463 | return -1; |
2455 | } | 2464 | } |
2456 | return 0; | 2465 | return 0; |
2457 | } | 2466 | } |
2458 | 2467 | ||
2459 | static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s) | 2468 | static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s) |
2460 | { | 2469 | { |
2461 | struct reiserfs_journal_list *jl; | 2470 | struct reiserfs_journal_list *jl; |
2462 | jl = kzalloc(sizeof(struct reiserfs_journal_list), | 2471 | jl = kzalloc(sizeof(struct reiserfs_journal_list), |
2463 | GFP_NOFS | __GFP_NOFAIL); | 2472 | GFP_NOFS | __GFP_NOFAIL); |
2464 | INIT_LIST_HEAD(&jl->j_list); | 2473 | INIT_LIST_HEAD(&jl->j_list); |
2465 | INIT_LIST_HEAD(&jl->j_working_list); | 2474 | INIT_LIST_HEAD(&jl->j_working_list); |
2466 | INIT_LIST_HEAD(&jl->j_tail_bh_list); | 2475 | INIT_LIST_HEAD(&jl->j_tail_bh_list); |
2467 | INIT_LIST_HEAD(&jl->j_bh_list); | 2476 | INIT_LIST_HEAD(&jl->j_bh_list); |
2468 | sema_init(&jl->j_commit_lock, 1); | 2477 | sema_init(&jl->j_commit_lock, 1); |
2469 | SB_JOURNAL(s)->j_num_lists++; | 2478 | SB_JOURNAL(s)->j_num_lists++; |
2470 | get_journal_list(jl); | 2479 | get_journal_list(jl); |
2471 | return jl; | 2480 | return jl; |
2472 | } | 2481 | } |
2473 | 2482 | ||
2474 | static void journal_list_init(struct super_block *p_s_sb) | 2483 | static void journal_list_init(struct super_block *p_s_sb) |
2475 | { | 2484 | { |
2476 | SB_JOURNAL(p_s_sb)->j_current_jl = alloc_journal_list(p_s_sb); | 2485 | SB_JOURNAL(p_s_sb)->j_current_jl = alloc_journal_list(p_s_sb); |
2477 | } | 2486 | } |
2478 | 2487 | ||
2479 | static int release_journal_dev(struct super_block *super, | 2488 | static int release_journal_dev(struct super_block *super, |
2480 | struct reiserfs_journal *journal) | 2489 | struct reiserfs_journal *journal) |
2481 | { | 2490 | { |
2482 | int result; | 2491 | int result; |
2483 | 2492 | ||
2484 | result = 0; | 2493 | result = 0; |
2485 | 2494 | ||
2486 | if (journal->j_dev_file != NULL) { | 2495 | if (journal->j_dev_file != NULL) { |
2487 | result = filp_close(journal->j_dev_file, NULL); | 2496 | result = filp_close(journal->j_dev_file, NULL); |
2488 | journal->j_dev_file = NULL; | 2497 | journal->j_dev_file = NULL; |
2489 | journal->j_dev_bd = NULL; | 2498 | journal->j_dev_bd = NULL; |
2490 | } else if (journal->j_dev_bd != NULL) { | 2499 | } else if (journal->j_dev_bd != NULL) { |
2491 | result = blkdev_put(journal->j_dev_bd); | 2500 | result = blkdev_put(journal->j_dev_bd); |
2492 | journal->j_dev_bd = NULL; | 2501 | journal->j_dev_bd = NULL; |
2493 | } | 2502 | } |
2494 | 2503 | ||
2495 | if (result != 0) { | 2504 | if (result != 0) { |
2496 | reiserfs_warning(super, | 2505 | reiserfs_warning(super, |
2497 | "sh-457: release_journal_dev: Cannot release journal device: %i", | 2506 | "sh-457: release_journal_dev: Cannot release journal device: %i", |
2498 | result); | 2507 | result); |
2499 | } | 2508 | } |
2500 | return result; | 2509 | return result; |
2501 | } | 2510 | } |
2502 | 2511 | ||
/*
 * Open the block device holding the journal.
 *
 * Two paths:
 *  - no "jdev" mount option: open by device number (the on-disk journal
 *    device field if set, otherwise the fs device itself) and store the
 *    handle in journal->j_dev_bd;
 *  - "jdev=<path>" given: open the path as a file, verify it is a block
 *    device, and keep both journal->j_dev_file and journal->j_dev_bd.
 *
 * Returns 0 on success or a negative errno; on failure both device
 * pointers are left NULL so release_journal_dev() is safe to call.
 */
static int journal_init_dev(struct super_block *super,
			    struct reiserfs_journal *journal,
			    const char *jdev_name)
{
	int result;
	dev_t jdev;
	int blkdev_mode = FMODE_READ | FMODE_WRITE;
	char b[BDEVNAME_SIZE];

	result = 0;

	journal->j_dev_bd = NULL;
	journal->j_dev_file = NULL;
	/* journal on a separate device if the superblock records one,
	 * otherwise it lives on the fs device */
	jdev = SB_ONDISK_JOURNAL_DEVICE(super) ?
	    new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev;

	/* read-only fs: don't ask for write access to the journal device */
	if (bdev_read_only(super->s_bdev))
		blkdev_mode = FMODE_READ;

	/* there is no "jdev" option and journal is on separate device */
	if ((!jdev_name || !jdev_name[0])) {
		journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode);
		if (IS_ERR(journal->j_dev_bd)) {
			result = PTR_ERR(journal->j_dev_bd);
			journal->j_dev_bd = NULL;
			reiserfs_warning(super, "sh-458: journal_init_dev: "
					 "cannot init journal device '%s': %i",
					 __bdevname(jdev, b), result);
			return result;
		} else if (jdev != super->s_dev)
			/* only reset the blocksize when the journal is not
			 * sharing the fs device */
			set_blocksize(journal->j_dev_bd, super->s_blocksize);
		return 0;
	}

	/* "jdev" path given: open it and make sure it really is a blkdev */
	journal->j_dev_file = filp_open(jdev_name, 0, 0);
	if (!IS_ERR(journal->j_dev_file)) {
		struct inode *jdev_inode = journal->j_dev_file->f_mapping->host;
		if (!S_ISBLK(jdev_inode->i_mode)) {
			reiserfs_warning(super, "journal_init_dev: '%s' is "
					 "not a block device", jdev_name);
			result = -ENOTBLK;
			release_journal_dev(super, journal);
		} else {
			/* ok */
			journal->j_dev_bd = I_BDEV(jdev_inode);
			set_blocksize(journal->j_dev_bd, super->s_blocksize);
			reiserfs_info(super,
				      "journal_init_dev: journal device: %s\n",
				      bdevname(journal->j_dev_bd, b));
		}
	} else {
		result = PTR_ERR(journal->j_dev_file);
		journal->j_dev_file = NULL;
		reiserfs_warning(super,
				 "journal_init_dev: Cannot open '%s': %i",
				 jdev_name, result);
	}
	return result;
}
2562 | 2571 | ||
2563 | /* | 2572 | /* |
2564 | ** must be called once on fs mount. calls journal_read for you | 2573 | ** must be called once on fs mount. calls journal_read for you |
2565 | */ | 2574 | */ |
2566 | int journal_init(struct super_block *p_s_sb, const char *j_dev_name, | 2575 | int journal_init(struct super_block *p_s_sb, const char *j_dev_name, |
2567 | int old_format, unsigned int commit_max_age) | 2576 | int old_format, unsigned int commit_max_age) |
2568 | { | 2577 | { |
2569 | int num_cnodes = SB_ONDISK_JOURNAL_SIZE(p_s_sb) * 2; | 2578 | int num_cnodes = SB_ONDISK_JOURNAL_SIZE(p_s_sb) * 2; |
2570 | struct buffer_head *bhjh; | 2579 | struct buffer_head *bhjh; |
2571 | struct reiserfs_super_block *rs; | 2580 | struct reiserfs_super_block *rs; |
2572 | struct reiserfs_journal_header *jh; | 2581 | struct reiserfs_journal_header *jh; |
2573 | struct reiserfs_journal *journal; | 2582 | struct reiserfs_journal *journal; |
2574 | struct reiserfs_journal_list *jl; | 2583 | struct reiserfs_journal_list *jl; |
2575 | char b[BDEVNAME_SIZE]; | 2584 | char b[BDEVNAME_SIZE]; |
2576 | 2585 | ||
2577 | journal = SB_JOURNAL(p_s_sb) = vmalloc(sizeof(struct reiserfs_journal)); | 2586 | journal = SB_JOURNAL(p_s_sb) = vmalloc(sizeof(struct reiserfs_journal)); |
2578 | if (!journal) { | 2587 | if (!journal) { |
2579 | reiserfs_warning(p_s_sb, | 2588 | reiserfs_warning(p_s_sb, |
2580 | "journal-1256: unable to get memory for journal structure"); | 2589 | "journal-1256: unable to get memory for journal structure"); |
2581 | return 1; | 2590 | return 1; |
2582 | } | 2591 | } |
2583 | memset(journal, 0, sizeof(struct reiserfs_journal)); | 2592 | memset(journal, 0, sizeof(struct reiserfs_journal)); |
2584 | INIT_LIST_HEAD(&journal->j_bitmap_nodes); | 2593 | INIT_LIST_HEAD(&journal->j_bitmap_nodes); |
2585 | INIT_LIST_HEAD(&journal->j_prealloc_list); | 2594 | INIT_LIST_HEAD(&journal->j_prealloc_list); |
2586 | INIT_LIST_HEAD(&journal->j_working_list); | 2595 | INIT_LIST_HEAD(&journal->j_working_list); |
2587 | INIT_LIST_HEAD(&journal->j_journal_list); | 2596 | INIT_LIST_HEAD(&journal->j_journal_list); |
2588 | journal->j_persistent_trans = 0; | 2597 | journal->j_persistent_trans = 0; |
2589 | if (reiserfs_allocate_list_bitmaps(p_s_sb, | 2598 | if (reiserfs_allocate_list_bitmaps(p_s_sb, |
2590 | journal->j_list_bitmap, | 2599 | journal->j_list_bitmap, |
2591 | SB_BMAP_NR(p_s_sb))) | 2600 | SB_BMAP_NR(p_s_sb))) |
2592 | goto free_and_return; | 2601 | goto free_and_return; |
2593 | allocate_bitmap_nodes(p_s_sb); | 2602 | allocate_bitmap_nodes(p_s_sb); |
2594 | 2603 | ||
2595 | /* reserved for journal area support */ | 2604 | /* reserved for journal area support */ |
2596 | SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb) = (old_format ? | 2605 | SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb) = (old_format ? |
2597 | REISERFS_OLD_DISK_OFFSET_IN_BYTES | 2606 | REISERFS_OLD_DISK_OFFSET_IN_BYTES |
2598 | / p_s_sb->s_blocksize + | 2607 | / p_s_sb->s_blocksize + |
2599 | SB_BMAP_NR(p_s_sb) + | 2608 | SB_BMAP_NR(p_s_sb) + |
2600 | 1 : | 2609 | 1 : |
2601 | REISERFS_DISK_OFFSET_IN_BYTES / | 2610 | REISERFS_DISK_OFFSET_IN_BYTES / |
2602 | p_s_sb->s_blocksize + 2); | 2611 | p_s_sb->s_blocksize + 2); |
2603 | 2612 | ||
2604 | /* Sanity check to see is the standard journal fitting withing first bitmap | 2613 | /* Sanity check to see is the standard journal fitting withing first bitmap |
2605 | (actual for small blocksizes) */ | 2614 | (actual for small blocksizes) */ |
2606 | if (!SB_ONDISK_JOURNAL_DEVICE(p_s_sb) && | 2615 | if (!SB_ONDISK_JOURNAL_DEVICE(p_s_sb) && |
2607 | (SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb) + | 2616 | (SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb) + |
2608 | SB_ONDISK_JOURNAL_SIZE(p_s_sb) > p_s_sb->s_blocksize * 8)) { | 2617 | SB_ONDISK_JOURNAL_SIZE(p_s_sb) > p_s_sb->s_blocksize * 8)) { |
2609 | reiserfs_warning(p_s_sb, | 2618 | reiserfs_warning(p_s_sb, |
2610 | "journal-1393: journal does not fit for area " | 2619 | "journal-1393: journal does not fit for area " |
2611 | "addressed by first of bitmap blocks. It starts at " | 2620 | "addressed by first of bitmap blocks. It starts at " |
2612 | "%u and its size is %u. Block size %ld", | 2621 | "%u and its size is %u. Block size %ld", |
2613 | SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb), | 2622 | SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb), |
2614 | SB_ONDISK_JOURNAL_SIZE(p_s_sb), | 2623 | SB_ONDISK_JOURNAL_SIZE(p_s_sb), |
2615 | p_s_sb->s_blocksize); | 2624 | p_s_sb->s_blocksize); |
2616 | goto free_and_return; | 2625 | goto free_and_return; |
2617 | } | 2626 | } |
2618 | 2627 | ||
2619 | if (journal_init_dev(p_s_sb, journal, j_dev_name) != 0) { | 2628 | if (journal_init_dev(p_s_sb, journal, j_dev_name) != 0) { |
2620 | reiserfs_warning(p_s_sb, | 2629 | reiserfs_warning(p_s_sb, |
2621 | "sh-462: unable to initialize jornal device"); | 2630 | "sh-462: unable to initialize jornal device"); |
2622 | goto free_and_return; | 2631 | goto free_and_return; |
2623 | } | 2632 | } |
2624 | 2633 | ||
2625 | rs = SB_DISK_SUPER_BLOCK(p_s_sb); | 2634 | rs = SB_DISK_SUPER_BLOCK(p_s_sb); |
2626 | 2635 | ||
2627 | /* read journal header */ | 2636 | /* read journal header */ |
2628 | bhjh = journal_bread(p_s_sb, | 2637 | bhjh = journal_bread(p_s_sb, |
2629 | SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + | 2638 | SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + |
2630 | SB_ONDISK_JOURNAL_SIZE(p_s_sb)); | 2639 | SB_ONDISK_JOURNAL_SIZE(p_s_sb)); |
2631 | if (!bhjh) { | 2640 | if (!bhjh) { |
2632 | reiserfs_warning(p_s_sb, | 2641 | reiserfs_warning(p_s_sb, |
2633 | "sh-459: unable to read journal header"); | 2642 | "sh-459: unable to read journal header"); |
2634 | goto free_and_return; | 2643 | goto free_and_return; |
2635 | } | 2644 | } |
2636 | jh = (struct reiserfs_journal_header *)(bhjh->b_data); | 2645 | jh = (struct reiserfs_journal_header *)(bhjh->b_data); |
2637 | 2646 | ||
2638 | /* make sure that journal matches to the super block */ | 2647 | /* make sure that journal matches to the super block */ |
2639 | if (is_reiserfs_jr(rs) | 2648 | if (is_reiserfs_jr(rs) |
2640 | && (le32_to_cpu(jh->jh_journal.jp_journal_magic) != | 2649 | && (le32_to_cpu(jh->jh_journal.jp_journal_magic) != |
2641 | sb_jp_journal_magic(rs))) { | 2650 | sb_jp_journal_magic(rs))) { |
2642 | reiserfs_warning(p_s_sb, | 2651 | reiserfs_warning(p_s_sb, |
2643 | "sh-460: journal header magic %x " | 2652 | "sh-460: journal header magic %x " |
2644 | "(device %s) does not match to magic found in super " | 2653 | "(device %s) does not match to magic found in super " |
2645 | "block %x", jh->jh_journal.jp_journal_magic, | 2654 | "block %x", jh->jh_journal.jp_journal_magic, |
2646 | bdevname(journal->j_dev_bd, b), | 2655 | bdevname(journal->j_dev_bd, b), |
2647 | sb_jp_journal_magic(rs)); | 2656 | sb_jp_journal_magic(rs)); |
2648 | brelse(bhjh); | 2657 | brelse(bhjh); |
2649 | goto free_and_return; | 2658 | goto free_and_return; |
2650 | } | 2659 | } |
2651 | 2660 | ||
2652 | journal->j_trans_max = le32_to_cpu(jh->jh_journal.jp_journal_trans_max); | 2661 | journal->j_trans_max = le32_to_cpu(jh->jh_journal.jp_journal_trans_max); |
2653 | journal->j_max_batch = le32_to_cpu(jh->jh_journal.jp_journal_max_batch); | 2662 | journal->j_max_batch = le32_to_cpu(jh->jh_journal.jp_journal_max_batch); |
2654 | journal->j_max_commit_age = | 2663 | journal->j_max_commit_age = |
2655 | le32_to_cpu(jh->jh_journal.jp_journal_max_commit_age); | 2664 | le32_to_cpu(jh->jh_journal.jp_journal_max_commit_age); |
2656 | journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE; | 2665 | journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE; |
2657 | 2666 | ||
2658 | if (journal->j_trans_max) { | 2667 | if (journal->j_trans_max) { |
2659 | /* make sure these parameters are available, assign it if they are not */ | 2668 | /* make sure these parameters are available, assign it if they are not */ |
2660 | __u32 initial = journal->j_trans_max; | 2669 | __u32 initial = journal->j_trans_max; |
2661 | __u32 ratio = 1; | 2670 | __u32 ratio = 1; |
2662 | 2671 | ||
2663 | if (p_s_sb->s_blocksize < 4096) | 2672 | if (p_s_sb->s_blocksize < 4096) |
2664 | ratio = 4096 / p_s_sb->s_blocksize; | 2673 | ratio = 4096 / p_s_sb->s_blocksize; |
2665 | 2674 | ||
2666 | if (SB_ONDISK_JOURNAL_SIZE(p_s_sb) / journal->j_trans_max < | 2675 | if (SB_ONDISK_JOURNAL_SIZE(p_s_sb) / journal->j_trans_max < |
2667 | JOURNAL_MIN_RATIO) | 2676 | JOURNAL_MIN_RATIO) |
2668 | journal->j_trans_max = | 2677 | journal->j_trans_max = |
2669 | SB_ONDISK_JOURNAL_SIZE(p_s_sb) / JOURNAL_MIN_RATIO; | 2678 | SB_ONDISK_JOURNAL_SIZE(p_s_sb) / JOURNAL_MIN_RATIO; |
2670 | if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio) | 2679 | if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio) |
2671 | journal->j_trans_max = | 2680 | journal->j_trans_max = |
2672 | JOURNAL_TRANS_MAX_DEFAULT / ratio; | 2681 | JOURNAL_TRANS_MAX_DEFAULT / ratio; |
2673 | if (journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio) | 2682 | if (journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio) |
2674 | journal->j_trans_max = | 2683 | journal->j_trans_max = |
2675 | JOURNAL_TRANS_MIN_DEFAULT / ratio; | 2684 | JOURNAL_TRANS_MIN_DEFAULT / ratio; |
2676 | 2685 | ||
2677 | if (journal->j_trans_max != initial) | 2686 | if (journal->j_trans_max != initial) |
2678 | reiserfs_warning(p_s_sb, | 2687 | reiserfs_warning(p_s_sb, |
2679 | "sh-461: journal_init: wrong transaction max size (%u). Changed to %u", | 2688 | "sh-461: journal_init: wrong transaction max size (%u). Changed to %u", |
2680 | initial, journal->j_trans_max); | 2689 | initial, journal->j_trans_max); |
2681 | 2690 | ||
2682 | journal->j_max_batch = journal->j_trans_max * | 2691 | journal->j_max_batch = journal->j_trans_max * |
2683 | JOURNAL_MAX_BATCH_DEFAULT / JOURNAL_TRANS_MAX_DEFAULT; | 2692 | JOURNAL_MAX_BATCH_DEFAULT / JOURNAL_TRANS_MAX_DEFAULT; |
2684 | } | 2693 | } |
2685 | 2694 | ||
2686 | if (!journal->j_trans_max) { | 2695 | if (!journal->j_trans_max) { |
2687 | /*we have the file system was created by old version of mkreiserfs | 2696 | /*we have the file system was created by old version of mkreiserfs |
2688 | so this field contains zero value */ | 2697 | so this field contains zero value */ |
2689 | journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT; | 2698 | journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT; |
2690 | journal->j_max_batch = JOURNAL_MAX_BATCH_DEFAULT; | 2699 | journal->j_max_batch = JOURNAL_MAX_BATCH_DEFAULT; |
2691 | journal->j_max_commit_age = JOURNAL_MAX_COMMIT_AGE; | 2700 | journal->j_max_commit_age = JOURNAL_MAX_COMMIT_AGE; |
2692 | 2701 | ||
2693 | /* for blocksize >= 4096 - max transaction size is 1024. For block size < 4096 | 2702 | /* for blocksize >= 4096 - max transaction size is 1024. For block size < 4096 |
2694 | trans max size is decreased proportionally */ | 2703 | trans max size is decreased proportionally */ |
2695 | if (p_s_sb->s_blocksize < 4096) { | 2704 | if (p_s_sb->s_blocksize < 4096) { |
2696 | journal->j_trans_max /= (4096 / p_s_sb->s_blocksize); | 2705 | journal->j_trans_max /= (4096 / p_s_sb->s_blocksize); |
2697 | journal->j_max_batch = (journal->j_trans_max) * 9 / 10; | 2706 | journal->j_max_batch = (journal->j_trans_max) * 9 / 10; |
2698 | } | 2707 | } |
2699 | } | 2708 | } |
2700 | 2709 | ||
2701 | journal->j_default_max_commit_age = journal->j_max_commit_age; | 2710 | journal->j_default_max_commit_age = journal->j_max_commit_age; |
2702 | 2711 | ||
2703 | if (commit_max_age != 0) { | 2712 | if (commit_max_age != 0) { |
2704 | journal->j_max_commit_age = commit_max_age; | 2713 | journal->j_max_commit_age = commit_max_age; |
2705 | journal->j_max_trans_age = commit_max_age; | 2714 | journal->j_max_trans_age = commit_max_age; |
2706 | } | 2715 | } |
2707 | 2716 | ||
2708 | reiserfs_info(p_s_sb, "journal params: device %s, size %u, " | 2717 | reiserfs_info(p_s_sb, "journal params: device %s, size %u, " |
2709 | "journal first block %u, max trans len %u, max batch %u, " | 2718 | "journal first block %u, max trans len %u, max batch %u, " |
2710 | "max commit age %u, max trans age %u\n", | 2719 | "max commit age %u, max trans age %u\n", |
2711 | bdevname(journal->j_dev_bd, b), | 2720 | bdevname(journal->j_dev_bd, b), |
2712 | SB_ONDISK_JOURNAL_SIZE(p_s_sb), | 2721 | SB_ONDISK_JOURNAL_SIZE(p_s_sb), |
2713 | SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), | 2722 | SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), |
2714 | journal->j_trans_max, | 2723 | journal->j_trans_max, |
2715 | journal->j_max_batch, | 2724 | journal->j_max_batch, |
2716 | journal->j_max_commit_age, journal->j_max_trans_age); | 2725 | journal->j_max_commit_age, journal->j_max_trans_age); |
2717 | 2726 | ||
2718 | brelse(bhjh); | 2727 | brelse(bhjh); |
2719 | 2728 | ||
2720 | journal->j_list_bitmap_index = 0; | 2729 | journal->j_list_bitmap_index = 0; |
2721 | journal_list_init(p_s_sb); | 2730 | journal_list_init(p_s_sb); |
2722 | 2731 | ||
2723 | memset(journal->j_list_hash_table, 0, | 2732 | memset(journal->j_list_hash_table, 0, |
2724 | JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)); | 2733 | JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)); |
2725 | 2734 | ||
2726 | INIT_LIST_HEAD(&journal->j_dirty_buffers); | 2735 | INIT_LIST_HEAD(&journal->j_dirty_buffers); |
2727 | spin_lock_init(&journal->j_dirty_buffers_lock); | 2736 | spin_lock_init(&journal->j_dirty_buffers_lock); |
2728 | 2737 | ||
2729 | journal->j_start = 0; | 2738 | journal->j_start = 0; |
2730 | journal->j_len = 0; | 2739 | journal->j_len = 0; |
2731 | journal->j_len_alloc = 0; | 2740 | journal->j_len_alloc = 0; |
2732 | atomic_set(&(journal->j_wcount), 0); | 2741 | atomic_set(&(journal->j_wcount), 0); |
2733 | atomic_set(&(journal->j_async_throttle), 0); | 2742 | atomic_set(&(journal->j_async_throttle), 0); |
2734 | journal->j_bcount = 0; | 2743 | journal->j_bcount = 0; |
2735 | journal->j_trans_start_time = 0; | 2744 | journal->j_trans_start_time = 0; |
2736 | journal->j_last = NULL; | 2745 | journal->j_last = NULL; |
2737 | journal->j_first = NULL; | 2746 | journal->j_first = NULL; |
2738 | init_waitqueue_head(&(journal->j_join_wait)); | 2747 | init_waitqueue_head(&(journal->j_join_wait)); |
2739 | sema_init(&journal->j_lock, 1); | 2748 | sema_init(&journal->j_lock, 1); |
2740 | sema_init(&journal->j_flush_sem, 1); | 2749 | sema_init(&journal->j_flush_sem, 1); |
2741 | 2750 | ||
2742 | journal->j_trans_id = 10; | 2751 | journal->j_trans_id = 10; |
2743 | journal->j_mount_id = 10; | 2752 | journal->j_mount_id = 10; |
2744 | journal->j_state = 0; | 2753 | journal->j_state = 0; |
2745 | atomic_set(&(journal->j_jlock), 0); | 2754 | atomic_set(&(journal->j_jlock), 0); |
2746 | journal->j_cnode_free_list = allocate_cnodes(num_cnodes); | 2755 | journal->j_cnode_free_list = allocate_cnodes(num_cnodes); |
2747 | journal->j_cnode_free_orig = journal->j_cnode_free_list; | 2756 | journal->j_cnode_free_orig = journal->j_cnode_free_list; |
2748 | journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0; | 2757 | journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0; |
2749 | journal->j_cnode_used = 0; | 2758 | journal->j_cnode_used = 0; |
2750 | journal->j_must_wait = 0; | 2759 | journal->j_must_wait = 0; |
2751 | 2760 | ||
2752 | if (journal->j_cnode_free == 0) { | 2761 | if (journal->j_cnode_free == 0) { |
2753 | reiserfs_warning(p_s_sb, "journal-2004: Journal cnode memory " | 2762 | reiserfs_warning(p_s_sb, "journal-2004: Journal cnode memory " |
2754 | "allocation failed (%ld bytes). Journal is " | 2763 | "allocation failed (%ld bytes). Journal is " |
2755 | "too large for available memory. Usually " | 2764 | "too large for available memory. Usually " |
2756 | "this is due to a journal that is too large.", | 2765 | "this is due to a journal that is too large.", |
2757 | sizeof (struct reiserfs_journal_cnode) * num_cnodes); | 2766 | sizeof (struct reiserfs_journal_cnode) * num_cnodes); |
2758 | goto free_and_return; | 2767 | goto free_and_return; |
2759 | } | 2768 | } |
2760 | 2769 | ||
2761 | init_journal_hash(p_s_sb); | 2770 | init_journal_hash(p_s_sb); |
2762 | jl = journal->j_current_jl; | 2771 | jl = journal->j_current_jl; |
2763 | jl->j_list_bitmap = get_list_bitmap(p_s_sb, jl); | 2772 | jl->j_list_bitmap = get_list_bitmap(p_s_sb, jl); |
2764 | if (!jl->j_list_bitmap) { | 2773 | if (!jl->j_list_bitmap) { |
2765 | reiserfs_warning(p_s_sb, | 2774 | reiserfs_warning(p_s_sb, |
2766 | "journal-2005, get_list_bitmap failed for journal list 0"); | 2775 | "journal-2005, get_list_bitmap failed for journal list 0"); |
2767 | goto free_and_return; | 2776 | goto free_and_return; |
2768 | } | 2777 | } |
2769 | if (journal_read(p_s_sb) < 0) { | 2778 | if (journal_read(p_s_sb) < 0) { |
2770 | reiserfs_warning(p_s_sb, "Replay Failure, unable to mount"); | 2779 | reiserfs_warning(p_s_sb, "Replay Failure, unable to mount"); |
2771 | goto free_and_return; | 2780 | goto free_and_return; |
2772 | } | 2781 | } |
2773 | 2782 | ||
2774 | reiserfs_mounted_fs_count++; | 2783 | reiserfs_mounted_fs_count++; |
2775 | if (reiserfs_mounted_fs_count <= 1) | 2784 | if (reiserfs_mounted_fs_count <= 1) |
2776 | commit_wq = create_workqueue("reiserfs"); | 2785 | commit_wq = create_workqueue("reiserfs"); |
2777 | 2786 | ||
2778 | INIT_WORK(&journal->j_work, flush_async_commits, p_s_sb); | 2787 | INIT_WORK(&journal->j_work, flush_async_commits, p_s_sb); |
2779 | return 0; | 2788 | return 0; |
2780 | free_and_return: | 2789 | free_and_return: |
2781 | free_journal_ram(p_s_sb); | 2790 | free_journal_ram(p_s_sb); |
2782 | return 1; | 2791 | return 1; |
2783 | } | 2792 | } |
2784 | 2793 | ||
2785 | /* | 2794 | /* |
2786 | ** test for a polite end of the current transaction. Used by file_write, and should | 2795 | ** test for a polite end of the current transaction. Used by file_write, and should |
2787 | ** be used by delete to make sure they don't write more than can fit inside a single | 2796 | ** be used by delete to make sure they don't write more than can fit inside a single |
2788 | ** transaction | 2797 | ** transaction |
2789 | */ | 2798 | */ |
2790 | int journal_transaction_should_end(struct reiserfs_transaction_handle *th, | 2799 | int journal_transaction_should_end(struct reiserfs_transaction_handle *th, |
2791 | int new_alloc) | 2800 | int new_alloc) |
2792 | { | 2801 | { |
2793 | struct reiserfs_journal *journal = SB_JOURNAL(th->t_super); | 2802 | struct reiserfs_journal *journal = SB_JOURNAL(th->t_super); |
2794 | time_t now = get_seconds(); | 2803 | time_t now = get_seconds(); |
2795 | /* cannot restart while nested */ | 2804 | /* cannot restart while nested */ |
2796 | BUG_ON(!th->t_trans_id); | 2805 | BUG_ON(!th->t_trans_id); |
2797 | if (th->t_refcount > 1) | 2806 | if (th->t_refcount > 1) |
2798 | return 0; | 2807 | return 0; |
2799 | if (journal->j_must_wait > 0 || | 2808 | if (journal->j_must_wait > 0 || |
2800 | (journal->j_len_alloc + new_alloc) >= journal->j_max_batch || | 2809 | (journal->j_len_alloc + new_alloc) >= journal->j_max_batch || |
2801 | atomic_read(&(journal->j_jlock)) || | 2810 | atomic_read(&(journal->j_jlock)) || |
2802 | (now - journal->j_trans_start_time) > journal->j_max_trans_age || | 2811 | (now - journal->j_trans_start_time) > journal->j_max_trans_age || |
2803 | journal->j_cnode_free < (journal->j_trans_max * 3)) { | 2812 | journal->j_cnode_free < (journal->j_trans_max * 3)) { |
2804 | return 1; | 2813 | return 1; |
2805 | } | 2814 | } |
2806 | return 0; | 2815 | return 0; |
2807 | } | 2816 | } |
2808 | 2817 | ||
2809 | /* this must be called inside a transaction, and requires the | 2818 | /* this must be called inside a transaction, and requires the |
2810 | ** kernel_lock to be held | 2819 | ** kernel_lock to be held |
2811 | */ | 2820 | */ |
2812 | void reiserfs_block_writes(struct reiserfs_transaction_handle *th) | 2821 | void reiserfs_block_writes(struct reiserfs_transaction_handle *th) |
2813 | { | 2822 | { |
2814 | struct reiserfs_journal *journal = SB_JOURNAL(th->t_super); | 2823 | struct reiserfs_journal *journal = SB_JOURNAL(th->t_super); |
2815 | BUG_ON(!th->t_trans_id); | 2824 | BUG_ON(!th->t_trans_id); |
2816 | journal->j_must_wait = 1; | 2825 | journal->j_must_wait = 1; |
2817 | set_bit(J_WRITERS_BLOCKED, &journal->j_state); | 2826 | set_bit(J_WRITERS_BLOCKED, &journal->j_state); |
2818 | return; | 2827 | return; |
2819 | } | 2828 | } |
2820 | 2829 | ||
2821 | /* this must be called without a transaction started, and does not | 2830 | /* this must be called without a transaction started, and does not |
2822 | ** require BKL | 2831 | ** require BKL |
2823 | */ | 2832 | */ |
2824 | void reiserfs_allow_writes(struct super_block *s) | 2833 | void reiserfs_allow_writes(struct super_block *s) |
2825 | { | 2834 | { |
2826 | struct reiserfs_journal *journal = SB_JOURNAL(s); | 2835 | struct reiserfs_journal *journal = SB_JOURNAL(s); |
2827 | clear_bit(J_WRITERS_BLOCKED, &journal->j_state); | 2836 | clear_bit(J_WRITERS_BLOCKED, &journal->j_state); |
2828 | wake_up(&journal->j_join_wait); | 2837 | wake_up(&journal->j_join_wait); |
2829 | } | 2838 | } |
2830 | 2839 | ||
2831 | /* this must be called without a transaction started, and does not | 2840 | /* this must be called without a transaction started, and does not |
2832 | ** require BKL | 2841 | ** require BKL |
2833 | */ | 2842 | */ |
2834 | void reiserfs_wait_on_write_block(struct super_block *s) | 2843 | void reiserfs_wait_on_write_block(struct super_block *s) |
2835 | { | 2844 | { |
2836 | struct reiserfs_journal *journal = SB_JOURNAL(s); | 2845 | struct reiserfs_journal *journal = SB_JOURNAL(s); |
2837 | wait_event(journal->j_join_wait, | 2846 | wait_event(journal->j_join_wait, |
2838 | !test_bit(J_WRITERS_BLOCKED, &journal->j_state)); | 2847 | !test_bit(J_WRITERS_BLOCKED, &journal->j_state)); |
2839 | } | 2848 | } |
2840 | 2849 | ||
/* Park the calling task on j_join_wait for at most one wakeup cycle.
 * Used to throttle writers while a commit is in progress; woken by
 * wake_queued_writers(). */
static void queue_log_writer(struct super_block *s)
{
	wait_queue_t wait;
	struct reiserfs_journal *journal = SB_JOURNAL(s);
	set_bit(J_WRITERS_QUEUED, &journal->j_state);

	/*
	 * we don't want to use wait_event here because
	 * we only want to wait once.
	 */
	init_waitqueue_entry(&wait, current);
	add_wait_queue(&journal->j_join_wait, &wait);
	set_current_state(TASK_UNINTERRUPTIBLE);
	/* re-check the flag after we are on the queue: if a waker already
	 * cleared it between our set_bit above and here, skip the sleep so
	 * the wakeup is not lost */
	if (test_bit(J_WRITERS_QUEUED, &journal->j_state))
		schedule();
	current->state = TASK_RUNNING;
	remove_wait_queue(&journal->j_join_wait, &wait);
}
2859 | 2868 | ||
2860 | static void wake_queued_writers(struct super_block *s) | 2869 | static void wake_queued_writers(struct super_block *s) |
2861 | { | 2870 | { |
2862 | struct reiserfs_journal *journal = SB_JOURNAL(s); | 2871 | struct reiserfs_journal *journal = SB_JOURNAL(s); |
2863 | if (test_and_clear_bit(J_WRITERS_QUEUED, &journal->j_state)) | 2872 | if (test_and_clear_bit(J_WRITERS_QUEUED, &journal->j_state)) |
2864 | wake_up(&journal->j_join_wait); | 2873 | wake_up(&journal->j_join_wait); |
2865 | } | 2874 | } |
2866 | 2875 | ||
/* Give transaction trans_id a chance to collect more writers before it is
 * committed.  Returns once the transaction has ended, or once the join
 * count (j_bcount) stops growing between passes. */
static void let_transaction_grow(struct super_block *sb, unsigned long trans_id)
{
	struct reiserfs_journal *journal = SB_JOURNAL(sb);
	unsigned long bcount = journal->j_bcount;
	while (1) {
		schedule_timeout_uninterruptible(1);
		/* mark the running list so others know a commit is wanted */
		journal->j_current_jl->j_state |= LIST_COMMIT_PENDING;
		/* sleep while writers are still active in this transaction */
		while ((atomic_read(&journal->j_wcount) > 0 ||
			atomic_read(&journal->j_jlock)) &&
		       journal->j_trans_id == trans_id) {
			queue_log_writer(sb);
		}
		/* transaction ended underneath us: nothing left to wait for */
		if (journal->j_trans_id != trans_id)
			break;
		/* no new joiners since the last pass: it has stopped growing */
		if (bcount == journal->j_bcount)
			break;
		bcount = journal->j_bcount;
	}
}
2886 | 2895 | ||
/* join == true if you must join an existing transaction.
** join == false if you can deal with waiting for others to finish
**
** this will block until the transaction is joinable. send the number of blocks you
** expect to use in nblocks.
**
** Returns 0 on success with th initialized and j_wcount bumped; on
** failure th is zeroed (t_super restored) and a negative errno or the
** journal's j_errno is returned.
*/
static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
			      struct super_block *p_s_sb, unsigned long nblocks,
			      int join)
{
	time_t now = get_seconds();
	int old_trans_id;
	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
	struct reiserfs_transaction_handle myth;
	int sched_count = 0;
	int retval;

	reiserfs_check_lock_depth(p_s_sb, "journal_begin");
	if (nblocks > journal->j_trans_max)
		BUG();

	PROC_INFO_INC(p_s_sb, journal.journal_being);
	/* set here for journal_join */
	th->t_refcount = 1;
	th->t_super = p_s_sb;

      relock:
	lock_journal(p_s_sb);
	/* an aborted journal only admits JBEGIN_ABORT callers */
	if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted(journal)) {
		unlock_journal(p_s_sb);
		retval = journal->j_errno;
		goto out_fail;
	}
	journal->j_bcount++;

	if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) {
		unlock_journal(p_s_sb);
		reiserfs_wait_on_write_block(p_s_sb);
		PROC_INFO_INC(p_s_sb, journal.journal_relock_writers);
		goto relock;
	}
	now = get_seconds();

	/* if there is no room in the journal OR
	** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning
	** we don't sleep if there aren't other writers
	*/

	if ((!join && journal->j_must_wait > 0) ||
	    (!join
	     && (journal->j_len_alloc + nblocks + 2) >= journal->j_max_batch)
	    || (!join && atomic_read(&journal->j_wcount) > 0
		&& journal->j_trans_start_time > 0
		&& (now - journal->j_trans_start_time) >
		journal->j_max_trans_age) || (!join
					      && atomic_read(&journal->j_jlock))
	    || (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) {

		old_trans_id = journal->j_trans_id;
		unlock_journal(p_s_sb);	/* allow others to finish this transaction */

		/* batch is full but mostly unused (j_len well below the
		 * allocation): with many writers, just wait for a commit
		 * instead of joining */
		if (!join && (journal->j_len_alloc + nblocks + 2) >=
		    journal->j_max_batch &&
		    ((journal->j_len + nblocks + 2) * 100) <
		    (journal->j_len_alloc * 75)) {
			if (atomic_read(&journal->j_wcount) > 10) {
				sched_count++;
				queue_log_writer(p_s_sb);
				goto relock;
			}
		}
		/* don't mess with joining the transaction if all we have to do is
		 * wait for someone else to do a commit
		 */
		if (atomic_read(&journal->j_jlock)) {
			while (journal->j_trans_id == old_trans_id &&
			       atomic_read(&journal->j_jlock)) {
				queue_log_writer(p_s_sb);
			}
			goto relock;
		}
		/* join with a throw-away handle just to push the current
		 * transaction to an end */
		retval = journal_join(&myth, p_s_sb, 1);
		if (retval)
			goto out_fail;

		/* someone might have ended the transaction while we joined */
		if (old_trans_id != journal->j_trans_id) {
			retval = do_journal_end(&myth, p_s_sb, 1, 0);
		} else {
			retval = do_journal_end(&myth, p_s_sb, 1, COMMIT_NOW);
		}

		if (retval)
			goto out_fail;

		PROC_INFO_INC(p_s_sb, journal.journal_relock_wcount);
		goto relock;
	}
	/* we are the first writer, set trans_id */
	if (journal->j_trans_start_time == 0) {
		journal->j_trans_start_time = get_seconds();
	}
	atomic_inc(&(journal->j_wcount));
	journal->j_len_alloc += nblocks;
	th->t_blocks_logged = 0;
	th->t_blocks_allocated = nblocks;
	th->t_trans_id = journal->j_trans_id;
	unlock_journal(p_s_sb);
	INIT_LIST_HEAD(&th->t_list);
	get_fs_excl();
	return 0;

      out_fail:
	memset(th, 0, sizeof(*th));
	/* Re-set th->t_super, so we can properly keep track of how many
	 * persistent transactions there are. We need to do this so if this
	 * call is part of a failed restart_transaction, we can free it later */
	th->t_super = p_s_sb;
	return retval;
}
3007 | 3016 | ||
3008 | struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct | 3017 | struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct |
3009 | super_block | 3018 | super_block |
3010 | *s, | 3019 | *s, |
3011 | int nblocks) | 3020 | int nblocks) |
3012 | { | 3021 | { |
3013 | int ret; | 3022 | int ret; |
3014 | struct reiserfs_transaction_handle *th; | 3023 | struct reiserfs_transaction_handle *th; |
3015 | 3024 | ||
3016 | /* if we're nesting into an existing transaction. It will be | 3025 | /* if we're nesting into an existing transaction. It will be |
3017 | ** persistent on its own | 3026 | ** persistent on its own |
3018 | */ | 3027 | */ |
3019 | if (reiserfs_transaction_running(s)) { | 3028 | if (reiserfs_transaction_running(s)) { |
3020 | th = current->journal_info; | 3029 | th = current->journal_info; |
3021 | th->t_refcount++; | 3030 | th->t_refcount++; |
3022 | if (th->t_refcount < 2) { | 3031 | if (th->t_refcount < 2) { |
3023 | BUG(); | 3032 | BUG(); |
3024 | } | 3033 | } |
3025 | return th; | 3034 | return th; |
3026 | } | 3035 | } |
3027 | th = kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS); | 3036 | th = kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS); |
3028 | if (!th) | 3037 | if (!th) |
3029 | return NULL; | 3038 | return NULL; |
3030 | ret = journal_begin(th, s, nblocks); | 3039 | ret = journal_begin(th, s, nblocks); |
3031 | if (ret) { | 3040 | if (ret) { |
3032 | kfree(th); | 3041 | kfree(th); |
3033 | return NULL; | 3042 | return NULL; |
3034 | } | 3043 | } |
3035 | 3044 | ||
3036 | SB_JOURNAL(s)->j_persistent_trans++; | 3045 | SB_JOURNAL(s)->j_persistent_trans++; |
3037 | return th; | 3046 | return th; |
3038 | } | 3047 | } |
3039 | 3048 | ||
3040 | int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th) | 3049 | int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th) |
3041 | { | 3050 | { |
3042 | struct super_block *s = th->t_super; | 3051 | struct super_block *s = th->t_super; |
3043 | int ret = 0; | 3052 | int ret = 0; |
3044 | if (th->t_trans_id) | 3053 | if (th->t_trans_id) |
3045 | ret = journal_end(th, th->t_super, th->t_blocks_allocated); | 3054 | ret = journal_end(th, th->t_super, th->t_blocks_allocated); |
3046 | else | 3055 | else |
3047 | ret = -EIO; | 3056 | ret = -EIO; |
3048 | if (th->t_refcount == 0) { | 3057 | if (th->t_refcount == 0) { |
3049 | SB_JOURNAL(s)->j_persistent_trans--; | 3058 | SB_JOURNAL(s)->j_persistent_trans--; |
3050 | kfree(th); | 3059 | kfree(th); |
3051 | } | 3060 | } |
3052 | return ret; | 3061 | return ret; |
3053 | } | 3062 | } |
3054 | 3063 | ||
3055 | static int journal_join(struct reiserfs_transaction_handle *th, | 3064 | static int journal_join(struct reiserfs_transaction_handle *th, |
3056 | struct super_block *p_s_sb, unsigned long nblocks) | 3065 | struct super_block *p_s_sb, unsigned long nblocks) |
3057 | { | 3066 | { |
3058 | struct reiserfs_transaction_handle *cur_th = current->journal_info; | 3067 | struct reiserfs_transaction_handle *cur_th = current->journal_info; |
3059 | 3068 | ||
3060 | /* this keeps do_journal_end from NULLing out the current->journal_info | 3069 | /* this keeps do_journal_end from NULLing out the current->journal_info |
3061 | ** pointer | 3070 | ** pointer |
3062 | */ | 3071 | */ |
3063 | th->t_handle_save = cur_th; | 3072 | th->t_handle_save = cur_th; |
3064 | if (cur_th && cur_th->t_refcount > 1) { | 3073 | if (cur_th && cur_th->t_refcount > 1) { |
3065 | BUG(); | 3074 | BUG(); |
3066 | } | 3075 | } |
3067 | return do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_JOIN); | 3076 | return do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_JOIN); |
3068 | } | 3077 | } |
3069 | 3078 | ||
3070 | int journal_join_abort(struct reiserfs_transaction_handle *th, | 3079 | int journal_join_abort(struct reiserfs_transaction_handle *th, |
3071 | struct super_block *p_s_sb, unsigned long nblocks) | 3080 | struct super_block *p_s_sb, unsigned long nblocks) |
3072 | { | 3081 | { |
3073 | struct reiserfs_transaction_handle *cur_th = current->journal_info; | 3082 | struct reiserfs_transaction_handle *cur_th = current->journal_info; |
3074 | 3083 | ||
3075 | /* this keeps do_journal_end from NULLing out the current->journal_info | 3084 | /* this keeps do_journal_end from NULLing out the current->journal_info |
3076 | ** pointer | 3085 | ** pointer |
3077 | */ | 3086 | */ |
3078 | th->t_handle_save = cur_th; | 3087 | th->t_handle_save = cur_th; |
3079 | if (cur_th && cur_th->t_refcount > 1) { | 3088 | if (cur_th && cur_th->t_refcount > 1) { |
3080 | BUG(); | 3089 | BUG(); |
3081 | } | 3090 | } |
3082 | return do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_ABORT); | 3091 | return do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_ABORT); |
3083 | } | 3092 | } |
3084 | 3093 | ||
/* Public transaction begin.  Handles nesting: if this task already has a
 * running handle on the same filesystem, just bump its refcount and copy
 * it into th; otherwise install th as current->journal_info and start a
 * regular (JBEGIN_REG) transaction.  Returns 0 or an error from
 * do_journal_begin_r. */
int journal_begin(struct reiserfs_transaction_handle *th,
		  struct super_block *p_s_sb, unsigned long nblocks)
{
	struct reiserfs_transaction_handle *cur_th = current->journal_info;
	int ret;

	th->t_handle_save = NULL;
	if (cur_th) {
		/* we are nesting into the current transaction */
		if (cur_th->t_super == p_s_sb) {
			BUG_ON(!cur_th->t_refcount);
			cur_th->t_refcount++;
			memcpy(th, cur_th, sizeof(*th));
			if (th->t_refcount <= 1)
				reiserfs_warning(p_s_sb,
						 "BAD: refcount <= 1, but journal_info != 0");
			return 0;
		} else {
			/* we've ended up with a handle from a different filesystem.
			** save it and restore on journal_end. This should never
			** really happen...
			*/
			reiserfs_warning(p_s_sb,
					 "clm-2100: nesting info a different FS");
			th->t_handle_save = current->journal_info;
			current->journal_info = th;
		}
	} else {
		current->journal_info = th;
	}
	ret = do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_REG);
	if (current->journal_info != th)
		BUG();

	/* I guess this boils down to being the reciprocal of clm-2100 above.
	 * If do_journal_begin_r fails, we need to put it back, since journal_end
	 * won't be called to do it. */
	if (ret)
		current->journal_info = th->t_handle_save;
	else
		BUG_ON(!th->t_refcount);

	return ret;
}
3129 | 3138 | ||
/*
** puts bh into the current transaction. If it was already there, reorders removes the
** old pointers from the hash, and puts new ones in (to make sure replay happen in the right order).
**
** if it was dirty, cleans and files onto the clean list. I can't let it be dirty again until the
** transaction is committed.
**
** if j_len, is bigger than j_len_alloc, it pushes j_len_alloc to 10 + j_len.
**
** Returns 0 on success, 1 if no writer holds the transaction open
** (j_wcount <= 0).  Panics on handle/transaction mismatch or journal
** overflow.
*/
int journal_mark_dirty(struct reiserfs_transaction_handle *th,
		       struct super_block *p_s_sb, struct buffer_head *bh)
{
	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
	struct reiserfs_journal_cnode *cn = NULL;
	int count_already_incd = 0;
	int prepared = 0;
	BUG_ON(!th->t_trans_id);

	PROC_INFO_INC(p_s_sb, journal.mark_dirty);
	/* the handle must belong to the transaction that is still running */
	if (th->t_trans_id != journal->j_trans_id) {
		reiserfs_panic(th->t_super,
			       "journal-1577: handle trans id %ld != current trans id %ld\n",
			       th->t_trans_id, journal->j_trans_id);
	}

	p_s_sb->s_dirt = 1;

	prepared = test_clear_buffer_journal_prepared(bh);
	clear_buffer_journal_restore_dirty(bh);
	/* already in this transaction, we are done */
	if (buffer_journaled(bh)) {
		PROC_INFO_INC(p_s_sb, journal.mark_dirty_already);
		return 0;
	}

	/* this must be turned into a panic instead of a warning. We can't allow
	** a dirty or journal_dirty or locked buffer to be logged, as some changes
	** could get to disk too early. NOT GOOD.
	*/
	if (!prepared || buffer_dirty(bh)) {
		reiserfs_warning(p_s_sb, "journal-1777: buffer %llu bad state "
				 "%cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT",
				 (unsigned long long)bh->b_blocknr,
				 prepared ? ' ' : '!',
				 buffer_locked(bh) ? ' ' : '!',
				 buffer_dirty(bh) ? ' ' : '!',
				 buffer_journal_dirty(bh) ? ' ' : '!');
	}

	if (atomic_read(&(journal->j_wcount)) <= 0) {
		reiserfs_warning(p_s_sb,
				 "journal-1409: journal_mark_dirty returning because j_wcount was %d",
				 atomic_read(&(journal->j_wcount)));
		return 1;
	}
	/* this error means I've screwed up, and we've overflowed the transaction.
	** Nothing can be done here, except make the FS readonly or panic.
	*/
	if (journal->j_len >= journal->j_trans_max) {
		reiserfs_panic(th->t_super,
			       "journal-1413: journal_mark_dirty: j_len (%lu) is too big\n",
			       journal->j_len);
	}

	/* already journal-dirty: the bh reference was taken on a previous
	 * mark, so don't grab another one below */
	if (buffer_journal_dirty(bh)) {
		count_already_incd = 1;
		PROC_INFO_INC(p_s_sb, journal.mark_dirty_notjournal);
		clear_buffer_journal_dirty(bh);
	}

	if (journal->j_len > journal->j_len_alloc) {
		journal->j_len_alloc = journal->j_len + JOURNAL_PER_BALANCE_CNT;
	}

	set_buffer_journaled(bh);

	/* now put this guy on the end */
	if (!cn) {
		cn = get_cnode(p_s_sb);
		if (!cn) {
			reiserfs_panic(p_s_sb, "get_cnode failed!\n");
		}

		/* handle exhausted its reservation: extend it and the
		 * journal's allocation together */
		if (th->t_blocks_logged == th->t_blocks_allocated) {
			th->t_blocks_allocated += JOURNAL_PER_BALANCE_CNT;
			journal->j_len_alloc += JOURNAL_PER_BALANCE_CNT;
		}
		th->t_blocks_logged++;
		journal->j_len++;

		cn->bh = bh;
		cn->blocknr = bh->b_blocknr;
		cn->sb = p_s_sb;
		cn->jlist = NULL;
		insert_journal_hash(journal->j_hash_table, cn);
		if (!count_already_incd) {
			get_bh(bh);
		}
	}
	/* link the cnode onto the tail of the running transaction's list */
	cn->next = NULL;
	cn->prev = journal->j_last;
	cn->bh = bh;
	if (journal->j_last) {
		journal->j_last->next = cn;
		journal->j_last = cn;
	} else {
		journal->j_first = cn;
		journal->j_last = cn;
	}
	return 0;
}
3241 | 3250 | ||
/* Public transaction end.  Drops one reference on th; only when the last
 * reference goes away is do_journal_end() actually called.  Returns 0 on
 * a nested close, -EIO for a handle with no live transaction, otherwise
 * the do_journal_end result. */
int journal_end(struct reiserfs_transaction_handle *th,
		struct super_block *p_s_sb, unsigned long nblocks)
{
	if (!current->journal_info && th->t_refcount > 1)
		reiserfs_warning(p_s_sb, "REISER-NESTING: th NULL, refcount %d",
				 th->t_refcount);

	if (!th->t_trans_id) {
		WARN_ON(1);
		return -EIO;
	}

	th->t_refcount--;
	if (th->t_refcount > 0) {
		struct reiserfs_transaction_handle *cur_th =
		    current->journal_info;

		/* we aren't allowed to close a nested transaction on a different
		** filesystem from the one in the task struct
		*/
		if (cur_th->t_super != th->t_super)
			BUG();

		/* copy the (possibly updated) handle state back into the
		 * task's master handle before discarding th */
		if (th != cur_th) {
			memcpy(current->journal_info, th, sizeof(*th));
			th->t_trans_id = 0;
		}
		return 0;
	} else {
		return do_journal_end(th, p_s_sb, nblocks, 0);
	}
}
3274 | 3283 | ||
3275 | /* removes from the current transaction, relsing and descrementing any counters. | 3284 | /* removes from the current transaction, relsing and descrementing any counters. |
3276 | ** also files the removed buffer directly onto the clean list | 3285 | ** also files the removed buffer directly onto the clean list |
3277 | ** | 3286 | ** |
3278 | ** called by journal_mark_freed when a block has been deleted | 3287 | ** called by journal_mark_freed when a block has been deleted |
3279 | ** | 3288 | ** |
3280 | ** returns 1 if it cleaned and relsed the buffer. 0 otherwise | 3289 | ** returns 1 if it cleaned and relsed the buffer. 0 otherwise |
3281 | */ | 3290 | */ |
3282 | static int remove_from_transaction(struct super_block *p_s_sb, | 3291 | static int remove_from_transaction(struct super_block *p_s_sb, |
3283 | b_blocknr_t blocknr, int already_cleaned) | 3292 | b_blocknr_t blocknr, int already_cleaned) |
3284 | { | 3293 | { |
3285 | struct buffer_head *bh; | 3294 | struct buffer_head *bh; |
3286 | struct reiserfs_journal_cnode *cn; | 3295 | struct reiserfs_journal_cnode *cn; |
3287 | struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); | 3296 | struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); |
3288 | int ret = 0; | 3297 | int ret = 0; |
3289 | 3298 | ||
3290 | cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, blocknr); | 3299 | cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, blocknr); |
3291 | if (!cn || !cn->bh) { | 3300 | if (!cn || !cn->bh) { |
3292 | return ret; | 3301 | return ret; |
3293 | } | 3302 | } |
3294 | bh = cn->bh; | 3303 | bh = cn->bh; |
3295 | if (cn->prev) { | 3304 | if (cn->prev) { |
3296 | cn->prev->next = cn->next; | 3305 | cn->prev->next = cn->next; |
3297 | } | 3306 | } |
3298 | if (cn->next) { | 3307 | if (cn->next) { |
3299 | cn->next->prev = cn->prev; | 3308 | cn->next->prev = cn->prev; |
3300 | } | 3309 | } |
3301 | if (cn == journal->j_first) { | 3310 | if (cn == journal->j_first) { |
3302 | journal->j_first = cn->next; | 3311 | journal->j_first = cn->next; |
3303 | } | 3312 | } |
3304 | if (cn == journal->j_last) { | 3313 | if (cn == journal->j_last) { |
3305 | journal->j_last = cn->prev; | 3314 | journal->j_last = cn->prev; |
3306 | } | 3315 | } |
3307 | if (bh) | 3316 | if (bh) |
3308 | remove_journal_hash(p_s_sb, journal->j_hash_table, NULL, | 3317 | remove_journal_hash(p_s_sb, journal->j_hash_table, NULL, |
3309 | bh->b_blocknr, 0); | 3318 | bh->b_blocknr, 0); |
3310 | clear_buffer_journaled(bh); /* don't log this one */ | 3319 | clear_buffer_journaled(bh); /* don't log this one */ |
3311 | 3320 | ||
3312 | if (!already_cleaned) { | 3321 | if (!already_cleaned) { |
3313 | clear_buffer_journal_dirty(bh); | 3322 | clear_buffer_journal_dirty(bh); |
3314 | clear_buffer_dirty(bh); | 3323 | clear_buffer_dirty(bh); |
3315 | clear_buffer_journal_test(bh); | 3324 | clear_buffer_journal_test(bh); |
3316 | put_bh(bh); | 3325 | put_bh(bh); |
3317 | if (atomic_read(&(bh->b_count)) < 0) { | 3326 | if (atomic_read(&(bh->b_count)) < 0) { |
3318 | reiserfs_warning(p_s_sb, | 3327 | reiserfs_warning(p_s_sb, |
3319 | "journal-1752: remove from trans, b_count < 0"); | 3328 | "journal-1752: remove from trans, b_count < 0"); |
3320 | } | 3329 | } |
3321 | ret = 1; | 3330 | ret = 1; |
3322 | } | 3331 | } |
3323 | journal->j_len--; | 3332 | journal->j_len--; |
3324 | journal->j_len_alloc--; | 3333 | journal->j_len_alloc--; |
3325 | free_cnode(p_s_sb, cn); | 3334 | free_cnode(p_s_sb, cn); |
3326 | return ret; | 3335 | return ret; |
3327 | } | 3336 | } |
3328 | 3337 | ||
3329 | /* | 3338 | /* |
3330 | ** for any cnode in a journal list, it can only be dirtied of all the | 3339 | ** for any cnode in a journal list, it can only be dirtied of all the |
3331 | ** transactions that include it are commited to disk. | 3340 | ** transactions that include it are commited to disk. |
3332 | ** this checks through each transaction, and returns 1 if you are allowed to dirty, | 3341 | ** this checks through each transaction, and returns 1 if you are allowed to dirty, |
3333 | ** and 0 if you aren't | 3342 | ** and 0 if you aren't |
3334 | ** | 3343 | ** |
3335 | ** it is called by dirty_journal_list, which is called after flush_commit_list has gotten all the log | 3344 | ** it is called by dirty_journal_list, which is called after flush_commit_list has gotten all the log |
3336 | ** blocks for a given transaction on disk | 3345 | ** blocks for a given transaction on disk |
3337 | ** | 3346 | ** |
3338 | */ | 3347 | */ |
3339 | static int can_dirty(struct reiserfs_journal_cnode *cn) | 3348 | static int can_dirty(struct reiserfs_journal_cnode *cn) |
3340 | { | 3349 | { |
3341 | struct super_block *sb = cn->sb; | 3350 | struct super_block *sb = cn->sb; |
3342 | b_blocknr_t blocknr = cn->blocknr; | 3351 | b_blocknr_t blocknr = cn->blocknr; |
3343 | struct reiserfs_journal_cnode *cur = cn->hprev; | 3352 | struct reiserfs_journal_cnode *cur = cn->hprev; |
3344 | int can_dirty = 1; | 3353 | int can_dirty = 1; |
3345 | 3354 | ||
3346 | /* first test hprev. These are all newer than cn, so any node here | 3355 | /* first test hprev. These are all newer than cn, so any node here |
3347 | ** with the same block number and dev means this node can't be sent | 3356 | ** with the same block number and dev means this node can't be sent |
3348 | ** to disk right now. | 3357 | ** to disk right now. |
3349 | */ | 3358 | */ |
3350 | while (cur && can_dirty) { | 3359 | while (cur && can_dirty) { |
3351 | if (cur->jlist && cur->bh && cur->blocknr && cur->sb == sb && | 3360 | if (cur->jlist && cur->bh && cur->blocknr && cur->sb == sb && |
3352 | cur->blocknr == blocknr) { | 3361 | cur->blocknr == blocknr) { |
3353 | can_dirty = 0; | 3362 | can_dirty = 0; |
3354 | } | 3363 | } |
3355 | cur = cur->hprev; | 3364 | cur = cur->hprev; |
3356 | } | 3365 | } |
3357 | /* then test hnext. These are all older than cn. As long as they | 3366 | /* then test hnext. These are all older than cn. As long as they |
3358 | ** are committed to the log, it is safe to write cn to disk | 3367 | ** are committed to the log, it is safe to write cn to disk |
3359 | */ | 3368 | */ |
3360 | cur = cn->hnext; | 3369 | cur = cn->hnext; |
3361 | while (cur && can_dirty) { | 3370 | while (cur && can_dirty) { |
3362 | if (cur->jlist && cur->jlist->j_len > 0 && | 3371 | if (cur->jlist && cur->jlist->j_len > 0 && |
3363 | atomic_read(&(cur->jlist->j_commit_left)) > 0 && cur->bh && | 3372 | atomic_read(&(cur->jlist->j_commit_left)) > 0 && cur->bh && |
3364 | cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) { | 3373 | cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) { |
3365 | can_dirty = 0; | 3374 | can_dirty = 0; |
3366 | } | 3375 | } |
3367 | cur = cur->hnext; | 3376 | cur = cur->hnext; |
3368 | } | 3377 | } |
3369 | return can_dirty; | 3378 | return can_dirty; |
3370 | } | 3379 | } |
3371 | 3380 | ||
3372 | /* syncs the commit blocks, but does not force the real buffers to disk | 3381 | /* syncs the commit blocks, but does not force the real buffers to disk |
3373 | ** will wait until the current transaction is done/commited before returning | 3382 | ** will wait until the current transaction is done/commited before returning |
3374 | */ | 3383 | */ |
3375 | int journal_end_sync(struct reiserfs_transaction_handle *th, | 3384 | int journal_end_sync(struct reiserfs_transaction_handle *th, |
3376 | struct super_block *p_s_sb, unsigned long nblocks) | 3385 | struct super_block *p_s_sb, unsigned long nblocks) |
3377 | { | 3386 | { |
3378 | struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); | 3387 | struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); |
3379 | 3388 | ||
3380 | BUG_ON(!th->t_trans_id); | 3389 | BUG_ON(!th->t_trans_id); |
3381 | /* you can sync while nested, very, very bad */ | 3390 | /* you can sync while nested, very, very bad */ |
3382 | if (th->t_refcount > 1) { | 3391 | if (th->t_refcount > 1) { |
3383 | BUG(); | 3392 | BUG(); |
3384 | } | 3393 | } |
3385 | if (journal->j_len == 0) { | 3394 | if (journal->j_len == 0) { |
3386 | reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), | 3395 | reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), |
3387 | 1); | 3396 | 1); |
3388 | journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)); | 3397 | journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)); |
3389 | } | 3398 | } |
3390 | return do_journal_end(th, p_s_sb, nblocks, COMMIT_NOW | WAIT); | 3399 | return do_journal_end(th, p_s_sb, nblocks, COMMIT_NOW | WAIT); |
3391 | } | 3400 | } |
3392 | 3401 | ||
3393 | /* | 3402 | /* |
3394 | ** writeback the pending async commits to disk | 3403 | ** writeback the pending async commits to disk |
3395 | */ | 3404 | */ |
3396 | static void flush_async_commits(void *p) | 3405 | static void flush_async_commits(void *p) |
3397 | { | 3406 | { |
3398 | struct super_block *p_s_sb = p; | 3407 | struct super_block *p_s_sb = p; |
3399 | struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); | 3408 | struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); |
3400 | struct reiserfs_journal_list *jl; | 3409 | struct reiserfs_journal_list *jl; |
3401 | struct list_head *entry; | 3410 | struct list_head *entry; |
3402 | 3411 | ||
3403 | lock_kernel(); | 3412 | lock_kernel(); |
3404 | if (!list_empty(&journal->j_journal_list)) { | 3413 | if (!list_empty(&journal->j_journal_list)) { |
3405 | /* last entry is the youngest, commit it and you get everything */ | 3414 | /* last entry is the youngest, commit it and you get everything */ |
3406 | entry = journal->j_journal_list.prev; | 3415 | entry = journal->j_journal_list.prev; |
3407 | jl = JOURNAL_LIST_ENTRY(entry); | 3416 | jl = JOURNAL_LIST_ENTRY(entry); |
3408 | flush_commit_list(p_s_sb, jl, 1); | 3417 | flush_commit_list(p_s_sb, jl, 1); |
3409 | } | 3418 | } |
3410 | unlock_kernel(); | 3419 | unlock_kernel(); |
3411 | /* | 3420 | /* |
3412 | * this is a little racey, but there's no harm in missing | 3421 | * this is a little racey, but there's no harm in missing |
3413 | * the filemap_fdata_write | 3422 | * the filemap_fdata_write |
3414 | */ | 3423 | */ |
3415 | if (!atomic_read(&journal->j_async_throttle) | 3424 | if (!atomic_read(&journal->j_async_throttle) |
3416 | && !reiserfs_is_journal_aborted(journal)) { | 3425 | && !reiserfs_is_journal_aborted(journal)) { |
3417 | atomic_inc(&journal->j_async_throttle); | 3426 | atomic_inc(&journal->j_async_throttle); |
3418 | filemap_fdatawrite(p_s_sb->s_bdev->bd_inode->i_mapping); | 3427 | filemap_fdatawrite(p_s_sb->s_bdev->bd_inode->i_mapping); |
3419 | atomic_dec(&journal->j_async_throttle); | 3428 | atomic_dec(&journal->j_async_throttle); |
3420 | } | 3429 | } |
3421 | } | 3430 | } |
3422 | 3431 | ||
/*
** flushes any old transactions to disk
** ends the current transaction if it is too old
**
** Returns p_s_sb->s_dirt so the caller (kreiserfsd) can tell whether the
** superblock is still dirty after the flush attempt.
*/
int reiserfs_flush_old_commits(struct super_block *p_s_sb)
{
	time_t now;
	struct reiserfs_transaction_handle th;
	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);

	now = get_seconds();
	/* safety check so we don't flush while we are replaying the log during
	 * mount
	 */
	if (list_empty(&journal->j_journal_list)) {
		return 0;
	}

	/* check the current transaction.  If there are no writers, and it is
	 * too old, finish it, and force the commit blocks to disk
	 */
	if (atomic_read(&journal->j_wcount) <= 0 &&
	    journal->j_trans_start_time > 0 &&
	    journal->j_len > 0 &&
	    (now - journal->j_trans_start_time) > journal->j_max_trans_age) {
		/* join the running transaction; on failure there is nothing
		 * useful to do, so the stale transaction is simply left for a
		 * later pass */
		if (!journal_join(&th, p_s_sb, 1)) {
			/* log the super block buffer so the (possibly empty)
			 * join has something to commit */
			reiserfs_prepare_for_journal(p_s_sb,
						     SB_BUFFER_WITH_SB(p_s_sb),
						     1);
			journal_mark_dirty(&th, p_s_sb,
					   SB_BUFFER_WITH_SB(p_s_sb));

			/* we're only being called from kreiserfsd, it makes no sense to do
			 ** an async commit so that kreiserfsd can do it later
			 */
			do_journal_end(&th, p_s_sb, 1, COMMIT_NOW | WAIT);
		}
	}
	return p_s_sb->s_dirt;
}
3463 | 3472 | ||
/*
** returns 0 if do_journal_end should return right away, returns 1 if do_journal_end should finish the commit
**
** if the current transaction is too old, but still has writers, this will wait on j_join_wait until all
** the writers are done.  By the time it wakes up, the transaction it was called has already ended, so it just
** flushes the commit list and returns 0.
**
** Won't batch when flush or commit_now is set.  Also won't batch when others are waiting on j_join_wait.
**
** Note, we can't allow the journal_end to proceed while there are still writers in the log.
*/
static int check_journal_end(struct reiserfs_transaction_handle *th,
			     struct super_block *p_s_sb, unsigned long nblocks,
			     int flags)
{

	time_t now;
	int flush = flags & FLUSH_ALL;
	int commit_now = flags & COMMIT_NOW;
	int wait_on_commit = flags & WAIT;
	struct reiserfs_journal_list *jl;
	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);

	BUG_ON(!th->t_trans_id);

	/* the handle must belong to the transaction that is currently open */
	if (th->t_trans_id != journal->j_trans_id) {
		reiserfs_panic(th->t_super,
			       "journal-1577: handle trans id %ld != current trans id %ld\n",
			       th->t_trans_id, journal->j_trans_id);
	}

	/* give back the log blocks this handle reserved but never logged */
	journal->j_len_alloc -= (th->t_blocks_allocated - th->t_blocks_logged);
	if (atomic_read(&(journal->j_wcount)) > 0) {	/* <= 0 is allowed.  unmounting might not call begin */
		atomic_dec(&(journal->j_wcount));
	}

	/* BUG, deal with case where j_len is 0, but people previously freed blocks need to be released
	 ** will be dealt with by next transaction that actually writes something, but should be taken
	 ** care of in this trans
	 */
	if (journal->j_len == 0) {
		BUG();
	}
	/* if wcount > 0, and we are called to with flush or commit_now,
	 ** we wait on j_join_wait.  We will wake up when the last writer has
	 ** finished the transaction, and started it on its way to the disk.
	 ** Then, we flush the commit or journal list, and just return 0
	 ** because the rest of journal end was already done for this transaction.
	 */
	if (atomic_read(&(journal->j_wcount)) > 0) {
		if (flush || commit_now) {
			unsigned trans_id;

			jl = journal->j_current_jl;
			trans_id = jl->j_trans_id;
			if (wait_on_commit)
				jl->j_state |= LIST_COMMIT_PENDING;
			/* j_jlock keeps new writers out while this transaction
			 * is pushed toward commit */
			atomic_set(&(journal->j_jlock), 1);
			if (flush) {
				journal->j_next_full_flush = 1;
			}
			unlock_journal(p_s_sb);

			/* sleep while the current transaction is still j_jlocked */
			while (journal->j_trans_id == trans_id) {
				if (atomic_read(&journal->j_jlock)) {
					queue_log_writer(p_s_sb);
				} else {
					/* someone dropped j_jlock; retake it
					 * (under the journal lock) if our
					 * transaction is still the open one */
					lock_journal(p_s_sb);
					if (journal->j_trans_id == trans_id) {
						atomic_set(&(journal->j_jlock),
							   1);
					}
					unlock_journal(p_s_sb);
				}
			}
			/* the transaction we joined must have ended by now */
			if (journal->j_trans_id == trans_id) {
				BUG();
			}
			if (commit_now
			    && journal_list_still_alive(p_s_sb, trans_id)
			    && wait_on_commit) {
				flush_commit_list(p_s_sb, jl, 1);
			}
			return 0;
		}
		unlock_journal(p_s_sb);
		return 0;
	}

	/* deal with old transactions where we are the last writers */
	now = get_seconds();
	if ((now - journal->j_trans_start_time) > journal->j_max_trans_age) {
		commit_now = 1;
		journal->j_next_async_flush = 1;
	}
	/* don't batch when someone is waiting on j_join_wait */
	/* don't batch when syncing the commit or flushing the whole trans */
	if (!(journal->j_must_wait > 0) && !(atomic_read(&(journal->j_jlock)))
	    && !flush && !commit_now && (journal->j_len < journal->j_max_batch)
	    && journal->j_len_alloc < journal->j_max_batch
	    && journal->j_cnode_free > (journal->j_trans_max * 3)) {
		/* batching: leave the transaction open for more writers */
		journal->j_bcount++;
		unlock_journal(p_s_sb);
		return 0;
	}

	if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(p_s_sb)) {
		reiserfs_panic(p_s_sb,
			       "journal-003: journal_end: j_start (%ld) is too high\n",
			       journal->j_start);
	}
	return 1;
}
3578 | 3587 | ||
/*
** Does all the work that makes deleting blocks safe.
** when deleting a block mark BH_JNew, just remove it from the current transaction, clean it's buffer_head and move on.
**
** otherwise:
** set a bit for the block in the journal bitmap.  That will prevent it from being allocated for unformatted nodes
** before this transaction has finished.
**
** mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers.  That will prevent any old transactions with
** this block from trying to flush to the real location.  Since we aren't removing the cnode from the journal_list_hash,
** the block can't be reallocated yet.
**
** Then remove it from the current transaction, decrementing any counters and filing it on the clean list.
*/
int journal_mark_freed(struct reiserfs_transaction_handle *th,
		       struct super_block *p_s_sb, b_blocknr_t blocknr)
{
	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
	struct reiserfs_journal_cnode *cn = NULL;
	struct buffer_head *bh = NULL;
	struct reiserfs_list_bitmap *jb = NULL;
	int cleaned = 0;
	BUG_ON(!th->t_trans_id);

	cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, blocknr);
	if (cn && cn->bh) {
		bh = cn->bh;
		get_bh(bh);	/* balanced by the put_bh at the bottom */
	}
	/* if it is journal new, we just remove it from this transaction */
	if (bh && buffer_journal_new(bh)) {
		clear_buffer_journal_new(bh);
		clear_prepared_bits(bh);
		reiserfs_clean_and_file_buffer(bh);
		cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned);
	} else {
		/* set the bit for this block in the journal bitmap for this transaction */
		jb = journal->j_current_jl->j_list_bitmap;
		if (!jb) {
			reiserfs_panic(p_s_sb,
				       "journal-1702: journal_mark_freed, journal_list_bitmap is NULL\n");
		}
		set_bit_in_list_bitmap(p_s_sb, blocknr, jb);

		/* Note, the entire while loop is not allowed to schedule.  */

		if (bh) {
			clear_prepared_bits(bh);
			reiserfs_clean_and_file_buffer(bh);
		}
		cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned);

		/* find all older transactions with this block, make sure they don't try to write it out */
		cn = get_journal_hash_dev(p_s_sb, journal->j_list_hash_table,
					  blocknr);
		while (cn) {
			if (p_s_sb == cn->sb && blocknr == cn->blocknr) {
				set_bit(BLOCK_FREED, &cn->state);
				if (cn->bh) {
					/* only drop the buffer's dirty state
					 * and reference once across all the
					 * matching cnodes */
					if (!cleaned) {
						/* remove_from_transaction will brelse the buffer if it was
						 ** in the current trans
						 */
						clear_buffer_journal_dirty(cn->
									   bh);
						clear_buffer_dirty(cn->bh);
						clear_buffer_journal_test(cn->
									  bh);
						cleaned = 1;
						put_bh(cn->bh);
						if (atomic_read
						    (&(cn->bh->b_count)) < 0) {
							reiserfs_warning(p_s_sb,
									 "journal-2138: cn->bh->b_count < 0");
						}
					}
					if (cn->jlist) {	/* since we are clearing the bh, we MUST dec nonzerolen */
						atomic_dec(&
							   (cn->jlist->
							    j_nonzerolen));
					}
					cn->bh = NULL;
				}
			}
			cn = cn->hnext;
		}
	}

	if (bh) {
		put_bh(bh);	/* get_hash grabs the buffer */
		if (atomic_read(&(bh->b_count)) < 0) {
			reiserfs_warning(p_s_sb,
					 "journal-2165: bh->b_count < 0");
		}
	}
	return 0;
}
3676 | 3685 | ||
3677 | void reiserfs_update_inode_transaction(struct inode *inode) | 3686 | void reiserfs_update_inode_transaction(struct inode *inode) |
3678 | { | 3687 | { |
3679 | struct reiserfs_journal *journal = SB_JOURNAL(inode->i_sb); | 3688 | struct reiserfs_journal *journal = SB_JOURNAL(inode->i_sb); |
3680 | REISERFS_I(inode)->i_jl = journal->j_current_jl; | 3689 | REISERFS_I(inode)->i_jl = journal->j_current_jl; |
3681 | REISERFS_I(inode)->i_trans_id = journal->j_trans_id; | 3690 | REISERFS_I(inode)->i_trans_id = journal->j_trans_id; |
3682 | } | 3691 | } |
3683 | 3692 | ||
/*
 * returns -1 on error, 0 if no commits/barriers were done and 1
 * if a transaction was actually committed and the barrier was done
 */
static int __commit_trans_jl(struct inode *inode, unsigned long id,
			     struct reiserfs_journal_list *jl)
{
	struct reiserfs_transaction_handle th;
	struct super_block *sb = inode->i_sb;
	struct reiserfs_journal *journal = SB_JOURNAL(sb);
	int ret = 0;

	/* is it from the current transaction, or from an unknown transaction? */
	if (id == journal->j_trans_id) {
		jl = journal->j_current_jl;
		/* try to let other writers come in and grow this transaction */
		let_transaction_grow(sb, id);
		if (journal->j_trans_id != id) {
			/* the transaction we cared about already closed while
			 * we let it grow; fall through to the commit-only path.
			 * NOTE: this goto jumps into the else branch below. */
			goto flush_commit_only;
		}

		ret = journal_begin(&th, sb, 1);
		if (ret)
			return ret;

		/* someone might have ended this transaction while we joined */
		if (journal->j_trans_id != id) {
			/* dirty the superblock buffer so the (now stale) handle
			 * still has something to log before we end it */
			reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
						     1);
			journal_mark_dirty(&th, sb, SB_BUFFER_WITH_SB(sb));
			ret = journal_end(&th, sb, 1);
			goto flush_commit_only;
		}

		/* still the same transaction: force a synchronous commit */
		ret = journal_end_sync(&th, sb, 1);
		if (!ret)
			ret = 1;

	} else {
		/* this gets tricky, we have to make sure the journal list in
		 * the inode still exists. We know the list is still around
		 * if we've got a larger transaction id than the oldest list
		 */
	      flush_commit_only:
		if (journal_list_still_alive(inode->i_sb, id)) {
			/*
			 * we only set ret to 1 when we know for sure
			 * the barrier hasn't been started yet on the commit
			 * block.
			 */
			if (atomic_read(&jl->j_commit_left) > 1)
				ret = 1;
			flush_commit_list(sb, jl, 1);
			if (journal->j_errno)
				ret = journal->j_errno;
		}
	}
	/* otherwise the list is gone, and long since committed */
	return ret;
}
3744 | 3753 | ||
3745 | int reiserfs_commit_for_inode(struct inode *inode) | 3754 | int reiserfs_commit_for_inode(struct inode *inode) |
3746 | { | 3755 | { |
3747 | unsigned long id = REISERFS_I(inode)->i_trans_id; | 3756 | unsigned long id = REISERFS_I(inode)->i_trans_id; |
3748 | struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl; | 3757 | struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl; |
3749 | 3758 | ||
3750 | /* for the whole inode, assume unset id means it was | 3759 | /* for the whole inode, assume unset id means it was |
3751 | * changed in the current transaction. More conservative | 3760 | * changed in the current transaction. More conservative |
3752 | */ | 3761 | */ |
3753 | if (!id || !jl) { | 3762 | if (!id || !jl) { |
3754 | reiserfs_update_inode_transaction(inode); | 3763 | reiserfs_update_inode_transaction(inode); |
3755 | id = REISERFS_I(inode)->i_trans_id; | 3764 | id = REISERFS_I(inode)->i_trans_id; |
3756 | /* jl will be updated in __commit_trans_jl */ | 3765 | /* jl will be updated in __commit_trans_jl */ |
3757 | } | 3766 | } |
3758 | 3767 | ||
3759 | return __commit_trans_jl(inode, id, jl); | 3768 | return __commit_trans_jl(inode, id, jl); |
3760 | } | 3769 | } |
3761 | 3770 | ||
3762 | void reiserfs_restore_prepared_buffer(struct super_block *p_s_sb, | 3771 | void reiserfs_restore_prepared_buffer(struct super_block *p_s_sb, |
3763 | struct buffer_head *bh) | 3772 | struct buffer_head *bh) |
3764 | { | 3773 | { |
3765 | struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); | 3774 | struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); |
3766 | PROC_INFO_INC(p_s_sb, journal.restore_prepared); | 3775 | PROC_INFO_INC(p_s_sb, journal.restore_prepared); |
3767 | if (!bh) { | 3776 | if (!bh) { |
3768 | return; | 3777 | return; |
3769 | } | 3778 | } |
3770 | if (test_clear_buffer_journal_restore_dirty(bh) && | 3779 | if (test_clear_buffer_journal_restore_dirty(bh) && |
3771 | buffer_journal_dirty(bh)) { | 3780 | buffer_journal_dirty(bh)) { |
3772 | struct reiserfs_journal_cnode *cn; | 3781 | struct reiserfs_journal_cnode *cn; |
3773 | cn = get_journal_hash_dev(p_s_sb, | 3782 | cn = get_journal_hash_dev(p_s_sb, |
3774 | journal->j_list_hash_table, | 3783 | journal->j_list_hash_table, |
3775 | bh->b_blocknr); | 3784 | bh->b_blocknr); |
3776 | if (cn && can_dirty(cn)) { | 3785 | if (cn && can_dirty(cn)) { |
3777 | set_buffer_journal_test(bh); | 3786 | set_buffer_journal_test(bh); |
3778 | mark_buffer_dirty(bh); | 3787 | mark_buffer_dirty(bh); |
3779 | } | 3788 | } |
3780 | } | 3789 | } |
3781 | clear_buffer_journal_prepared(bh); | 3790 | clear_buffer_journal_prepared(bh); |
3782 | } | 3791 | } |
3783 | 3792 | ||
3784 | extern struct tree_balance *cur_tb; | 3793 | extern struct tree_balance *cur_tb; |
3785 | /* | 3794 | /* |
3786 | ** before we can change a metadata block, we have to make sure it won't | 3795 | ** before we can change a metadata block, we have to make sure it won't |
3787 | ** be written to disk while we are altering it. So, we must: | 3796 | ** be written to disk while we are altering it. So, we must: |
3788 | ** clean it | 3797 | ** clean it |
3789 | ** wait on it. | 3798 | ** wait on it. |
3790 | ** | 3799 | ** |
3791 | */ | 3800 | */ |
int reiserfs_prepare_for_journal(struct super_block *p_s_sb,
				 struct buffer_head *bh, int wait)
{
	PROC_INFO_INC(p_s_sb, journal.prepare);

	/* try to grab the buffer lock without sleeping first */
	if (test_set_buffer_locked(bh)) {
		if (!wait)
			return 0;	/* busy and caller won't block */
		lock_buffer(bh);
	}
	set_buffer_journal_prepared(bh);
	/* clear the dirty bit while we hold the lock; if the buffer was
	 * journal-dirty, remember that so the dirty state can be restored
	 * later by reiserfs_restore_prepared_buffer() */
	if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh)) {
		clear_buffer_journal_test(bh);
		set_buffer_journal_restore_dirty(bh);
	}
	unlock_buffer(bh);
	return 1;
}
3810 | 3819 | ||
3811 | static void flush_old_journal_lists(struct super_block *s) | 3820 | static void flush_old_journal_lists(struct super_block *s) |
3812 | { | 3821 | { |
3813 | struct reiserfs_journal *journal = SB_JOURNAL(s); | 3822 | struct reiserfs_journal *journal = SB_JOURNAL(s); |
3814 | struct reiserfs_journal_list *jl; | 3823 | struct reiserfs_journal_list *jl; |
3815 | struct list_head *entry; | 3824 | struct list_head *entry; |
3816 | time_t now = get_seconds(); | 3825 | time_t now = get_seconds(); |
3817 | 3826 | ||
3818 | while (!list_empty(&journal->j_journal_list)) { | 3827 | while (!list_empty(&journal->j_journal_list)) { |
3819 | entry = journal->j_journal_list.next; | 3828 | entry = journal->j_journal_list.next; |
3820 | jl = JOURNAL_LIST_ENTRY(entry); | 3829 | jl = JOURNAL_LIST_ENTRY(entry); |
3821 | /* this check should always be run, to send old lists to disk */ | 3830 | /* this check should always be run, to send old lists to disk */ |
3822 | if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4))) { | 3831 | if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4))) { |
3823 | flush_used_journal_lists(s, jl); | 3832 | flush_used_journal_lists(s, jl); |
3824 | } else { | 3833 | } else { |
3825 | break; | 3834 | break; |
3826 | } | 3835 | } |
3827 | } | 3836 | } |
3828 | } | 3837 | } |
3829 | 3838 | ||
3830 | /* | 3839 | /* |
3831 | ** long and ugly. If flush, will not return until all commit | 3840 | ** long and ugly. If flush, will not return until all commit |
3832 | ** blocks and all real buffers in the trans are on disk. | 3841 | ** blocks and all real buffers in the trans are on disk. |
3833 | ** If no_async, won't return until all commit blocks are on disk. | 3842 | ** If no_async, won't return until all commit blocks are on disk. |
3834 | ** | 3843 | ** |
3835 | ** keep reading, there are comments as you go along | 3844 | ** keep reading, there are comments as you go along |
3836 | ** | 3845 | ** |
3837 | ** If the journal is aborted, we just clean up. Things like flushing | 3846 | ** If the journal is aborted, we just clean up. Things like flushing |
3838 | ** journal lists, etc just won't happen. | 3847 | ** journal lists, etc just won't happen. |
3839 | */ | 3848 | */ |
3840 | static int do_journal_end(struct reiserfs_transaction_handle *th, | 3849 | static int do_journal_end(struct reiserfs_transaction_handle *th, |
3841 | struct super_block *p_s_sb, unsigned long nblocks, | 3850 | struct super_block *p_s_sb, unsigned long nblocks, |
3842 | int flags) | 3851 | int flags) |
3843 | { | 3852 | { |
3844 | struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); | 3853 | struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); |
3845 | struct reiserfs_journal_cnode *cn, *next, *jl_cn; | 3854 | struct reiserfs_journal_cnode *cn, *next, *jl_cn; |
3846 | struct reiserfs_journal_cnode *last_cn = NULL; | 3855 | struct reiserfs_journal_cnode *last_cn = NULL; |
3847 | struct reiserfs_journal_desc *desc; | 3856 | struct reiserfs_journal_desc *desc; |
3848 | struct reiserfs_journal_commit *commit; | 3857 | struct reiserfs_journal_commit *commit; |
3849 | struct buffer_head *c_bh; /* commit bh */ | 3858 | struct buffer_head *c_bh; /* commit bh */ |
3850 | struct buffer_head *d_bh; /* desc bh */ | 3859 | struct buffer_head *d_bh; /* desc bh */ |
3851 | int cur_write_start = 0; /* start index of current log write */ | 3860 | int cur_write_start = 0; /* start index of current log write */ |
3852 | int old_start; | 3861 | int old_start; |
3853 | int i; | 3862 | int i; |
3854 | int flush = flags & FLUSH_ALL; | 3863 | int flush = flags & FLUSH_ALL; |
3855 | int wait_on_commit = flags & WAIT; | 3864 | int wait_on_commit = flags & WAIT; |
3856 | struct reiserfs_journal_list *jl, *temp_jl; | 3865 | struct reiserfs_journal_list *jl, *temp_jl; |
3857 | struct list_head *entry, *safe; | 3866 | struct list_head *entry, *safe; |
3858 | unsigned long jindex; | 3867 | unsigned long jindex; |
3859 | unsigned long commit_trans_id; | 3868 | unsigned long commit_trans_id; |
3860 | int trans_half; | 3869 | int trans_half; |
3861 | 3870 | ||
3862 | BUG_ON(th->t_refcount > 1); | 3871 | BUG_ON(th->t_refcount > 1); |
3863 | BUG_ON(!th->t_trans_id); | 3872 | BUG_ON(!th->t_trans_id); |
3864 | 3873 | ||
3865 | put_fs_excl(); | 3874 | put_fs_excl(); |
3866 | current->journal_info = th->t_handle_save; | 3875 | current->journal_info = th->t_handle_save; |
3867 | reiserfs_check_lock_depth(p_s_sb, "journal end"); | 3876 | reiserfs_check_lock_depth(p_s_sb, "journal end"); |
3868 | if (journal->j_len == 0) { | 3877 | if (journal->j_len == 0) { |
3869 | reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), | 3878 | reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), |
3870 | 1); | 3879 | 1); |
3871 | journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)); | 3880 | journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)); |
3872 | } | 3881 | } |
3873 | 3882 | ||
3874 | lock_journal(p_s_sb); | 3883 | lock_journal(p_s_sb); |
3875 | if (journal->j_next_full_flush) { | 3884 | if (journal->j_next_full_flush) { |
3876 | flags |= FLUSH_ALL; | 3885 | flags |= FLUSH_ALL; |
3877 | flush = 1; | 3886 | flush = 1; |
3878 | } | 3887 | } |
3879 | if (journal->j_next_async_flush) { | 3888 | if (journal->j_next_async_flush) { |
3880 | flags |= COMMIT_NOW | WAIT; | 3889 | flags |= COMMIT_NOW | WAIT; |
3881 | wait_on_commit = 1; | 3890 | wait_on_commit = 1; |
3882 | } | 3891 | } |
3883 | 3892 | ||
3884 | /* check_journal_end locks the journal, and unlocks if it does not return 1 | 3893 | /* check_journal_end locks the journal, and unlocks if it does not return 1 |
3885 | ** it tells us if we should continue with the journal_end, or just return | 3894 | ** it tells us if we should continue with the journal_end, or just return |
3886 | */ | 3895 | */ |
3887 | if (!check_journal_end(th, p_s_sb, nblocks, flags)) { | 3896 | if (!check_journal_end(th, p_s_sb, nblocks, flags)) { |
3888 | p_s_sb->s_dirt = 1; | 3897 | p_s_sb->s_dirt = 1; |
3889 | wake_queued_writers(p_s_sb); | 3898 | wake_queued_writers(p_s_sb); |
3890 | reiserfs_async_progress_wait(p_s_sb); | 3899 | reiserfs_async_progress_wait(p_s_sb); |
3891 | goto out; | 3900 | goto out; |
3892 | } | 3901 | } |
3893 | 3902 | ||
3894 | /* check_journal_end might set these, check again */ | 3903 | /* check_journal_end might set these, check again */ |
3895 | if (journal->j_next_full_flush) { | 3904 | if (journal->j_next_full_flush) { |
3896 | flush = 1; | 3905 | flush = 1; |
3897 | } | 3906 | } |
3898 | 3907 | ||
3899 | /* | 3908 | /* |
3900 | ** j must wait means we have to flush the log blocks, and the real blocks for | 3909 | ** j must wait means we have to flush the log blocks, and the real blocks for |
3901 | ** this transaction | 3910 | ** this transaction |
3902 | */ | 3911 | */ |
3903 | if (journal->j_must_wait > 0) { | 3912 | if (journal->j_must_wait > 0) { |
3904 | flush = 1; | 3913 | flush = 1; |
3905 | } | 3914 | } |
3906 | #ifdef REISERFS_PREALLOCATE | 3915 | #ifdef REISERFS_PREALLOCATE |
3907 | /* quota ops might need to nest, setup the journal_info pointer for them | 3916 | /* quota ops might need to nest, setup the journal_info pointer for them |
3908 | * and raise the refcount so that it is > 0. */ | 3917 | * and raise the refcount so that it is > 0. */ |
3909 | current->journal_info = th; | 3918 | current->journal_info = th; |
3910 | th->t_refcount++; | 3919 | th->t_refcount++; |
3911 | reiserfs_discard_all_prealloc(th); /* it should not involve new blocks into | 3920 | reiserfs_discard_all_prealloc(th); /* it should not involve new blocks into |
3912 | * the transaction */ | 3921 | * the transaction */ |
3913 | th->t_refcount--; | 3922 | th->t_refcount--; |
3914 | current->journal_info = th->t_handle_save; | 3923 | current->journal_info = th->t_handle_save; |
3915 | #endif | 3924 | #endif |
3916 | 3925 | ||
3917 | /* setup description block */ | 3926 | /* setup description block */ |
3918 | d_bh = | 3927 | d_bh = |
3919 | journal_getblk(p_s_sb, | 3928 | journal_getblk(p_s_sb, |
3920 | SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + | 3929 | SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + |
3921 | journal->j_start); | 3930 | journal->j_start); |
3922 | set_buffer_uptodate(d_bh); | 3931 | set_buffer_uptodate(d_bh); |
3923 | desc = (struct reiserfs_journal_desc *)(d_bh)->b_data; | 3932 | desc = (struct reiserfs_journal_desc *)(d_bh)->b_data; |
3924 | memset(d_bh->b_data, 0, d_bh->b_size); | 3933 | memset(d_bh->b_data, 0, d_bh->b_size); |
3925 | memcpy(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8); | 3934 | memcpy(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8); |
3926 | set_desc_trans_id(desc, journal->j_trans_id); | 3935 | set_desc_trans_id(desc, journal->j_trans_id); |
3927 | 3936 | ||
3928 | /* setup commit block. Don't write (keep it clean too) this one until after everyone else is written */ | 3937 | /* setup commit block. Don't write (keep it clean too) this one until after everyone else is written */ |
3929 | c_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + | 3938 | c_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + |
3930 | ((journal->j_start + journal->j_len + | 3939 | ((journal->j_start + journal->j_len + |
3931 | 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))); | 3940 | 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))); |
3932 | commit = (struct reiserfs_journal_commit *)c_bh->b_data; | 3941 | commit = (struct reiserfs_journal_commit *)c_bh->b_data; |
3933 | memset(c_bh->b_data, 0, c_bh->b_size); | 3942 | memset(c_bh->b_data, 0, c_bh->b_size); |
3934 | set_commit_trans_id(commit, journal->j_trans_id); | 3943 | set_commit_trans_id(commit, journal->j_trans_id); |
3935 | set_buffer_uptodate(c_bh); | 3944 | set_buffer_uptodate(c_bh); |
3936 | 3945 | ||
3937 | /* init this journal list */ | 3946 | /* init this journal list */ |
3938 | jl = journal->j_current_jl; | 3947 | jl = journal->j_current_jl; |
3939 | 3948 | ||
3940 | /* we lock the commit before doing anything because | 3949 | /* we lock the commit before doing anything because |
3941 | * we want to make sure nobody tries to run flush_commit_list until | 3950 | * we want to make sure nobody tries to run flush_commit_list until |
3942 | * the new transaction is fully setup, and we've already flushed the | 3951 | * the new transaction is fully setup, and we've already flushed the |
3943 | * ordered bh list | 3952 | * ordered bh list |
3944 | */ | 3953 | */ |
3945 | down(&jl->j_commit_lock); | 3954 | down(&jl->j_commit_lock); |
3946 | 3955 | ||
3947 | /* save the transaction id in case we need to commit it later */ | 3956 | /* save the transaction id in case we need to commit it later */ |
3948 | commit_trans_id = jl->j_trans_id; | 3957 | commit_trans_id = jl->j_trans_id; |
3949 | 3958 | ||
3950 | atomic_set(&jl->j_older_commits_done, 0); | 3959 | atomic_set(&jl->j_older_commits_done, 0); |
3951 | jl->j_trans_id = journal->j_trans_id; | 3960 | jl->j_trans_id = journal->j_trans_id; |
3952 | jl->j_timestamp = journal->j_trans_start_time; | 3961 | jl->j_timestamp = journal->j_trans_start_time; |
3953 | jl->j_commit_bh = c_bh; | 3962 | jl->j_commit_bh = c_bh; |
3954 | jl->j_start = journal->j_start; | 3963 | jl->j_start = journal->j_start; |
3955 | jl->j_len = journal->j_len; | 3964 | jl->j_len = journal->j_len; |
3956 | atomic_set(&jl->j_nonzerolen, journal->j_len); | 3965 | atomic_set(&jl->j_nonzerolen, journal->j_len); |
3957 | atomic_set(&jl->j_commit_left, journal->j_len + 2); | 3966 | atomic_set(&jl->j_commit_left, journal->j_len + 2); |
3958 | jl->j_realblock = NULL; | 3967 | jl->j_realblock = NULL; |
3959 | 3968 | ||
3960 | /* The ENTIRE FOR LOOP MUST not cause schedule to occur. | 3969 | /* The ENTIRE FOR LOOP MUST not cause schedule to occur. |
3961 | ** for each real block, add it to the journal list hash, | 3970 | ** for each real block, add it to the journal list hash, |
3962 | ** copy into real block index array in the commit or desc block | 3971 | ** copy into real block index array in the commit or desc block |
3963 | */ | 3972 | */ |
3964 | trans_half = journal_trans_half(p_s_sb->s_blocksize); | 3973 | trans_half = journal_trans_half(p_s_sb->s_blocksize); |
3965 | for (i = 0, cn = journal->j_first; cn; cn = cn->next, i++) { | 3974 | for (i = 0, cn = journal->j_first; cn; cn = cn->next, i++) { |
3966 | if (buffer_journaled(cn->bh)) { | 3975 | if (buffer_journaled(cn->bh)) { |
3967 | jl_cn = get_cnode(p_s_sb); | 3976 | jl_cn = get_cnode(p_s_sb); |
3968 | if (!jl_cn) { | 3977 | if (!jl_cn) { |
3969 | reiserfs_panic(p_s_sb, | 3978 | reiserfs_panic(p_s_sb, |
3970 | "journal-1676, get_cnode returned NULL\n"); | 3979 | "journal-1676, get_cnode returned NULL\n"); |
3971 | } | 3980 | } |
3972 | if (i == 0) { | 3981 | if (i == 0) { |
3973 | jl->j_realblock = jl_cn; | 3982 | jl->j_realblock = jl_cn; |
3974 | } | 3983 | } |
3975 | jl_cn->prev = last_cn; | 3984 | jl_cn->prev = last_cn; |
3976 | jl_cn->next = NULL; | 3985 | jl_cn->next = NULL; |
3977 | if (last_cn) { | 3986 | if (last_cn) { |
3978 | last_cn->next = jl_cn; | 3987 | last_cn->next = jl_cn; |
3979 | } | 3988 | } |
3980 | last_cn = jl_cn; | 3989 | last_cn = jl_cn; |
3981 | /* make sure the block we are trying to log is not a block | 3990 | /* make sure the block we are trying to log is not a block |
3982 | of journal or reserved area */ | 3991 | of journal or reserved area */ |
3983 | 3992 | ||
3984 | if (is_block_in_log_or_reserved_area | 3993 | if (is_block_in_log_or_reserved_area |
3985 | (p_s_sb, cn->bh->b_blocknr)) { | 3994 | (p_s_sb, cn->bh->b_blocknr)) { |
3986 | reiserfs_panic(p_s_sb, | 3995 | reiserfs_panic(p_s_sb, |
3987 | "journal-2332: Trying to log block %lu, which is a log block\n", | 3996 | "journal-2332: Trying to log block %lu, which is a log block\n", |
3988 | cn->bh->b_blocknr); | 3997 | cn->bh->b_blocknr); |
3989 | } | 3998 | } |
3990 | jl_cn->blocknr = cn->bh->b_blocknr; | 3999 | jl_cn->blocknr = cn->bh->b_blocknr; |
3991 | jl_cn->state = 0; | 4000 | jl_cn->state = 0; |
3992 | jl_cn->sb = p_s_sb; | 4001 | jl_cn->sb = p_s_sb; |
3993 | jl_cn->bh = cn->bh; | 4002 | jl_cn->bh = cn->bh; |
3994 | jl_cn->jlist = jl; | 4003 | jl_cn->jlist = jl; |
3995 | insert_journal_hash(journal->j_list_hash_table, jl_cn); | 4004 | insert_journal_hash(journal->j_list_hash_table, jl_cn); |
3996 | if (i < trans_half) { | 4005 | if (i < trans_half) { |
3997 | desc->j_realblock[i] = | 4006 | desc->j_realblock[i] = |
3998 | cpu_to_le32(cn->bh->b_blocknr); | 4007 | cpu_to_le32(cn->bh->b_blocknr); |
3999 | } else { | 4008 | } else { |
4000 | commit->j_realblock[i - trans_half] = | 4009 | commit->j_realblock[i - trans_half] = |
4001 | cpu_to_le32(cn->bh->b_blocknr); | 4010 | cpu_to_le32(cn->bh->b_blocknr); |
4002 | } | 4011 | } |
4003 | } else { | 4012 | } else { |
4004 | i--; | 4013 | i--; |
4005 | } | 4014 | } |
4006 | } | 4015 | } |
4007 | set_desc_trans_len(desc, journal->j_len); | 4016 | set_desc_trans_len(desc, journal->j_len); |
4008 | set_desc_mount_id(desc, journal->j_mount_id); | 4017 | set_desc_mount_id(desc, journal->j_mount_id); |
4009 | set_desc_trans_id(desc, journal->j_trans_id); | 4018 | set_desc_trans_id(desc, journal->j_trans_id); |
4010 | set_commit_trans_len(commit, journal->j_len); | 4019 | set_commit_trans_len(commit, journal->j_len); |
4011 | 4020 | ||
4012 | /* special check in case all buffers in the journal were marked for not logging */ | 4021 | /* special check in case all buffers in the journal were marked for not logging */ |
4013 | if (journal->j_len == 0) { | 4022 | if (journal->j_len == 0) { |
4014 | BUG(); | 4023 | BUG(); |
4015 | } | 4024 | } |
4016 | 4025 | ||
4017 | /* we're about to dirty all the log blocks, mark the description block | 4026 | /* we're about to dirty all the log blocks, mark the description block |
4018 | * dirty now too. Don't mark the commit block dirty until all the | 4027 | * dirty now too. Don't mark the commit block dirty until all the |
4019 | * others are on disk | 4028 | * others are on disk |
4020 | */ | 4029 | */ |
4021 | mark_buffer_dirty(d_bh); | 4030 | mark_buffer_dirty(d_bh); |
4022 | 4031 | ||
4023 | /* first data block is j_start + 1, so add one to cur_write_start wherever you use it */ | 4032 | /* first data block is j_start + 1, so add one to cur_write_start wherever you use it */ |
4024 | cur_write_start = journal->j_start; | 4033 | cur_write_start = journal->j_start; |
4025 | cn = journal->j_first; | 4034 | cn = journal->j_first; |
4026 | jindex = 1; /* start at one so we don't get the desc again */ | 4035 | jindex = 1; /* start at one so we don't get the desc again */ |
4027 | while (cn) { | 4036 | while (cn) { |
4028 | clear_buffer_journal_new(cn->bh); | 4037 | clear_buffer_journal_new(cn->bh); |
4029 | /* copy all the real blocks into log area. dirty log blocks */ | 4038 | /* copy all the real blocks into log area. dirty log blocks */ |
4030 | if (buffer_journaled(cn->bh)) { | 4039 | if (buffer_journaled(cn->bh)) { |
4031 | struct buffer_head *tmp_bh; | 4040 | struct buffer_head *tmp_bh; |
4032 | char *addr; | 4041 | char *addr; |
4033 | struct page *page; | 4042 | struct page *page; |
4034 | tmp_bh = | 4043 | tmp_bh = |
4035 | journal_getblk(p_s_sb, | 4044 | journal_getblk(p_s_sb, |
4036 | SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + | 4045 | SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + |
4037 | ((cur_write_start + | 4046 | ((cur_write_start + |
4038 | jindex) % | 4047 | jindex) % |
4039 | SB_ONDISK_JOURNAL_SIZE(p_s_sb))); | 4048 | SB_ONDISK_JOURNAL_SIZE(p_s_sb))); |
4040 | set_buffer_uptodate(tmp_bh); | 4049 | set_buffer_uptodate(tmp_bh); |
4041 | page = cn->bh->b_page; | 4050 | page = cn->bh->b_page; |
4042 | addr = kmap(page); | 4051 | addr = kmap(page); |
4043 | memcpy(tmp_bh->b_data, | 4052 | memcpy(tmp_bh->b_data, |
4044 | addr + offset_in_page(cn->bh->b_data), | 4053 | addr + offset_in_page(cn->bh->b_data), |
4045 | cn->bh->b_size); | 4054 | cn->bh->b_size); |
4046 | kunmap(page); | 4055 | kunmap(page); |
4047 | mark_buffer_dirty(tmp_bh); | 4056 | mark_buffer_dirty(tmp_bh); |
4048 | jindex++; | 4057 | jindex++; |
4049 | set_buffer_journal_dirty(cn->bh); | 4058 | set_buffer_journal_dirty(cn->bh); |
4050 | clear_buffer_journaled(cn->bh); | 4059 | clear_buffer_journaled(cn->bh); |
4051 | } else { | 4060 | } else { |
4052 | /* JDirty cleared sometime during transaction. don't log this one */ | 4061 | /* JDirty cleared sometime during transaction. don't log this one */ |
4053 | reiserfs_warning(p_s_sb, | 4062 | reiserfs_warning(p_s_sb, |
4054 | "journal-2048: do_journal_end: BAD, buffer in journal hash, but not JDirty!"); | 4063 | "journal-2048: do_journal_end: BAD, buffer in journal hash, but not JDirty!"); |
4055 | brelse(cn->bh); | 4064 | brelse(cn->bh); |
4056 | } | 4065 | } |
4057 | next = cn->next; | 4066 | next = cn->next; |
4058 | free_cnode(p_s_sb, cn); | 4067 | free_cnode(p_s_sb, cn); |
4059 | cn = next; | 4068 | cn = next; |
4060 | cond_resched(); | 4069 | cond_resched(); |
4061 | } | 4070 | } |
4062 | 4071 | ||
4063 | /* we are done with both the c_bh and d_bh, but | 4072 | /* we are done with both the c_bh and d_bh, but |
4064 | ** c_bh must be written after all other commit blocks, | 4073 | ** c_bh must be written after all other commit blocks, |
4065 | ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1. | 4074 | ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1. |
4066 | */ | 4075 | */ |
4067 | 4076 | ||
4068 | journal->j_current_jl = alloc_journal_list(p_s_sb); | 4077 | journal->j_current_jl = alloc_journal_list(p_s_sb); |
4069 | 4078 | ||
4070 | /* now it is safe to insert this transaction on the main list */ | 4079 | /* now it is safe to insert this transaction on the main list */ |
4071 | list_add_tail(&jl->j_list, &journal->j_journal_list); | 4080 | list_add_tail(&jl->j_list, &journal->j_journal_list); |
4072 | list_add_tail(&jl->j_working_list, &journal->j_working_list); | 4081 | list_add_tail(&jl->j_working_list, &journal->j_working_list); |
4073 | journal->j_num_work_lists++; | 4082 | journal->j_num_work_lists++; |
4074 | 4083 | ||
4075 | /* reset journal values for the next transaction */ | 4084 | /* reset journal values for the next transaction */ |
4076 | old_start = journal->j_start; | 4085 | old_start = journal->j_start; |
4077 | journal->j_start = | 4086 | journal->j_start = |
4078 | (journal->j_start + journal->j_len + | 4087 | (journal->j_start + journal->j_len + |
4079 | 2) % SB_ONDISK_JOURNAL_SIZE(p_s_sb); | 4088 | 2) % SB_ONDISK_JOURNAL_SIZE(p_s_sb); |
4080 | atomic_set(&(journal->j_wcount), 0); | 4089 | atomic_set(&(journal->j_wcount), 0); |
4081 | journal->j_bcount = 0; | 4090 | journal->j_bcount = 0; |
4082 | journal->j_last = NULL; | 4091 | journal->j_last = NULL; |
4083 | journal->j_first = NULL; | 4092 | journal->j_first = NULL; |
4084 | journal->j_len = 0; | 4093 | journal->j_len = 0; |
4085 | journal->j_trans_start_time = 0; | 4094 | journal->j_trans_start_time = 0; |
4086 | journal->j_trans_id++; | 4095 | journal->j_trans_id++; |
4087 | journal->j_current_jl->j_trans_id = journal->j_trans_id; | 4096 | journal->j_current_jl->j_trans_id = journal->j_trans_id; |
4088 | journal->j_must_wait = 0; | 4097 | journal->j_must_wait = 0; |
4089 | journal->j_len_alloc = 0; | 4098 | journal->j_len_alloc = 0; |
4090 | journal->j_next_full_flush = 0; | 4099 | journal->j_next_full_flush = 0; |
4091 | journal->j_next_async_flush = 0; | 4100 | journal->j_next_async_flush = 0; |
4092 | init_journal_hash(p_s_sb); | 4101 | init_journal_hash(p_s_sb); |
4093 | 4102 | ||
4094 | // make sure reiserfs_add_jh sees the new current_jl before we | 4103 | // make sure reiserfs_add_jh sees the new current_jl before we |
4095 | // write out the tails | 4104 | // write out the tails |
4096 | smp_mb(); | 4105 | smp_mb(); |
4097 | 4106 | ||
4098 | /* tail conversion targets have to hit the disk before we end the | 4107 | /* tail conversion targets have to hit the disk before we end the |
4099 | * transaction. Otherwise a later transaction might repack the tail | 4108 | * transaction. Otherwise a later transaction might repack the tail |
4100 | * before this transaction commits, leaving the data block unflushed and | 4109 | * before this transaction commits, leaving the data block unflushed and |
4101 | * clean, if we crash before the later transaction commits, the data block | 4110 | * clean, if we crash before the later transaction commits, the data block |
4102 | * is lost. | 4111 | * is lost. |
4103 | */ | 4112 | */ |
4104 | if (!list_empty(&jl->j_tail_bh_list)) { | 4113 | if (!list_empty(&jl->j_tail_bh_list)) { |
4105 | unlock_kernel(); | 4114 | unlock_kernel(); |
4106 | write_ordered_buffers(&journal->j_dirty_buffers_lock, | 4115 | write_ordered_buffers(&journal->j_dirty_buffers_lock, |
4107 | journal, jl, &jl->j_tail_bh_list); | 4116 | journal, jl, &jl->j_tail_bh_list); |
4108 | lock_kernel(); | 4117 | lock_kernel(); |
4109 | } | 4118 | } |
4110 | if (!list_empty(&jl->j_tail_bh_list)) | 4119 | if (!list_empty(&jl->j_tail_bh_list)) |
4111 | BUG(); | 4120 | BUG(); |
4112 | up(&jl->j_commit_lock); | 4121 | up(&jl->j_commit_lock); |
4113 | 4122 | ||
4114 | /* honor the flush wishes from the caller, simple commits can | 4123 | /* honor the flush wishes from the caller, simple commits can |
4115 | ** be done outside the journal lock, they are done below | 4124 | ** be done outside the journal lock, they are done below |
4116 | ** | 4125 | ** |
4117 | ** if we don't flush the commit list right now, we put it into | 4126 | ** if we don't flush the commit list right now, we put it into |
4118 | ** the work queue so the people waiting on the async progress work | 4127 | ** the work queue so the people waiting on the async progress work |
4119 | ** queue don't wait for this proc to flush journal lists and such. | 4128 | ** queue don't wait for this proc to flush journal lists and such. |
4120 | */ | 4129 | */ |
4121 | if (flush) { | 4130 | if (flush) { |
4122 | flush_commit_list(p_s_sb, jl, 1); | 4131 | flush_commit_list(p_s_sb, jl, 1); |
4123 | flush_journal_list(p_s_sb, jl, 1); | 4132 | flush_journal_list(p_s_sb, jl, 1); |
4124 | } else if (!(jl->j_state & LIST_COMMIT_PENDING)) | 4133 | } else if (!(jl->j_state & LIST_COMMIT_PENDING)) |
4125 | queue_delayed_work(commit_wq, &journal->j_work, HZ / 10); | 4134 | queue_delayed_work(commit_wq, &journal->j_work, HZ / 10); |
4126 | 4135 | ||
4127 | /* if the next transaction has any chance of wrapping, flush | 4136 | /* if the next transaction has any chance of wrapping, flush |
4128 | ** transactions that might get overwritten. If any journal lists are very | 4137 | ** transactions that might get overwritten. If any journal lists are very |
4129 | ** old flush them as well. | 4138 | ** old flush them as well. |
4130 | */ | 4139 | */ |
4131 | first_jl: | 4140 | first_jl: |
4132 | list_for_each_safe(entry, safe, &journal->j_journal_list) { | 4141 | list_for_each_safe(entry, safe, &journal->j_journal_list) { |
4133 | temp_jl = JOURNAL_LIST_ENTRY(entry); | 4142 | temp_jl = JOURNAL_LIST_ENTRY(entry); |
4134 | if (journal->j_start <= temp_jl->j_start) { | 4143 | if (journal->j_start <= temp_jl->j_start) { |
4135 | if ((journal->j_start + journal->j_trans_max + 1) >= | 4144 | if ((journal->j_start + journal->j_trans_max + 1) >= |
4136 | temp_jl->j_start) { | 4145 | temp_jl->j_start) { |
4137 | flush_used_journal_lists(p_s_sb, temp_jl); | 4146 | flush_used_journal_lists(p_s_sb, temp_jl); |
4138 | goto first_jl; | 4147 | goto first_jl; |
4139 | } else if ((journal->j_start + | 4148 | } else if ((journal->j_start + |
4140 | journal->j_trans_max + 1) < | 4149 | journal->j_trans_max + 1) < |
4141 | SB_ONDISK_JOURNAL_SIZE(p_s_sb)) { | 4150 | SB_ONDISK_JOURNAL_SIZE(p_s_sb)) { |
4142 | /* if we don't cross into the next transaction and we don't | 4151 | /* if we don't cross into the next transaction and we don't |
4143 | * wrap, there is no way we can overlap any later transactions | 4152 | * wrap, there is no way we can overlap any later transactions |
4144 | * break now | 4153 | * break now |
4145 | */ | 4154 | */ |
4146 | break; | 4155 | break; |
4147 | } | 4156 | } |
4148 | } else if ((journal->j_start + | 4157 | } else if ((journal->j_start + |
4149 | journal->j_trans_max + 1) > | 4158 | journal->j_trans_max + 1) > |
4150 | SB_ONDISK_JOURNAL_SIZE(p_s_sb)) { | 4159 | SB_ONDISK_JOURNAL_SIZE(p_s_sb)) { |
4151 | if (((journal->j_start + journal->j_trans_max + 1) % | 4160 | if (((journal->j_start + journal->j_trans_max + 1) % |
4152 | SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >= | 4161 | SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >= |
4153 | temp_jl->j_start) { | 4162 | temp_jl->j_start) { |
4154 | flush_used_journal_lists(p_s_sb, temp_jl); | 4163 | flush_used_journal_lists(p_s_sb, temp_jl); |
4155 | goto first_jl; | 4164 | goto first_jl; |
4156 | } else { | 4165 | } else { |
4157 | /* we don't overlap anything from out start to the end of the | 4166 | /* we don't overlap anything from out start to the end of the |
4158 | * log, and our wrapped portion doesn't overlap anything at | 4167 | * log, and our wrapped portion doesn't overlap anything at |
4159 | * the start of the log. We can break | 4168 | * the start of the log. We can break |
4160 | */ | 4169 | */ |
4161 | break; | 4170 | break; |
4162 | } | 4171 | } |
4163 | } | 4172 | } |
4164 | } | 4173 | } |
4165 | flush_old_journal_lists(p_s_sb); | 4174 | flush_old_journal_lists(p_s_sb); |
4166 | 4175 | ||
4167 | journal->j_current_jl->j_list_bitmap = | 4176 | journal->j_current_jl->j_list_bitmap = |
4168 | get_list_bitmap(p_s_sb, journal->j_current_jl); | 4177 | get_list_bitmap(p_s_sb, journal->j_current_jl); |
4169 | 4178 | ||
4170 | if (!(journal->j_current_jl->j_list_bitmap)) { | 4179 | if (!(journal->j_current_jl->j_list_bitmap)) { |
4171 | reiserfs_panic(p_s_sb, | 4180 | reiserfs_panic(p_s_sb, |
4172 | "journal-1996: do_journal_end, could not get a list bitmap\n"); | 4181 | "journal-1996: do_journal_end, could not get a list bitmap\n"); |
4173 | } | 4182 | } |
4174 | 4183 | ||
4175 | atomic_set(&(journal->j_jlock), 0); | 4184 | atomic_set(&(journal->j_jlock), 0); |
4176 | unlock_journal(p_s_sb); | 4185 | unlock_journal(p_s_sb); |
4177 | /* wake up any body waiting to join. */ | 4186 | /* wake up any body waiting to join. */ |
4178 | clear_bit(J_WRITERS_QUEUED, &journal->j_state); | 4187 | clear_bit(J_WRITERS_QUEUED, &journal->j_state); |
4179 | wake_up(&(journal->j_join_wait)); | 4188 | wake_up(&(journal->j_join_wait)); |
4180 | 4189 | ||
4181 | if (!flush && wait_on_commit && | 4190 | if (!flush && wait_on_commit && |
4182 | journal_list_still_alive(p_s_sb, commit_trans_id)) { | 4191 | journal_list_still_alive(p_s_sb, commit_trans_id)) { |
4183 | flush_commit_list(p_s_sb, jl, 1); | 4192 | flush_commit_list(p_s_sb, jl, 1); |
4184 | } | 4193 | } |
4185 | out: | 4194 | out: |
4186 | reiserfs_check_lock_depth(p_s_sb, "journal end2"); | 4195 | reiserfs_check_lock_depth(p_s_sb, "journal end2"); |
4187 | 4196 | ||
4188 | memset(th, 0, sizeof(*th)); | 4197 | memset(th, 0, sizeof(*th)); |
4189 | /* Re-set th->t_super, so we can properly keep track of how many | 4198 | /* Re-set th->t_super, so we can properly keep track of how many |
4190 | * persistent transactions there are. We need to do this so if this | 4199 | * persistent transactions there are. We need to do this so if this |
4191 | * call is part of a failed restart_transaction, we can free it later */ | 4200 | * call is part of a failed restart_transaction, we can free it later */ |
4192 | th->t_super = p_s_sb; | 4201 | th->t_super = p_s_sb; |
4193 | 4202 | ||
4194 | return journal->j_errno; | 4203 | return journal->j_errno; |
4195 | } | 4204 | } |
4196 | 4205 | ||
4197 | static void __reiserfs_journal_abort_hard(struct super_block *sb) | 4206 | static void __reiserfs_journal_abort_hard(struct super_block *sb) |
4198 | { | 4207 | { |
4199 | struct reiserfs_journal *journal = SB_JOURNAL(sb); | 4208 | struct reiserfs_journal *journal = SB_JOURNAL(sb); |
4200 | if (test_bit(J_ABORTED, &journal->j_state)) | 4209 | if (test_bit(J_ABORTED, &journal->j_state)) |
4201 | return; | 4210 | return; |
4202 | 4211 | ||
4203 | printk(KERN_CRIT "REISERFS: Aborting journal for filesystem on %s\n", | 4212 | printk(KERN_CRIT "REISERFS: Aborting journal for filesystem on %s\n", |
4204 | reiserfs_bdevname(sb)); | 4213 | reiserfs_bdevname(sb)); |
4205 | 4214 | ||
4206 | sb->s_flags |= MS_RDONLY; | 4215 | sb->s_flags |= MS_RDONLY; |
4207 | set_bit(J_ABORTED, &journal->j_state); | 4216 | set_bit(J_ABORTED, &journal->j_state); |
4208 | 4217 | ||
4209 | #ifdef CONFIG_REISERFS_CHECK | 4218 | #ifdef CONFIG_REISERFS_CHECK |
4210 | dump_stack(); | 4219 | dump_stack(); |
4211 | #endif | 4220 | #endif |
4212 | } | 4221 | } |
4213 | 4222 | ||
4214 | static void __reiserfs_journal_abort_soft(struct super_block *sb, int errno) | 4223 | static void __reiserfs_journal_abort_soft(struct super_block *sb, int errno) |
4215 | { | 4224 | { |
4216 | struct reiserfs_journal *journal = SB_JOURNAL(sb); | 4225 | struct reiserfs_journal *journal = SB_JOURNAL(sb); |
4217 | if (test_bit(J_ABORTED, &journal->j_state)) | 4226 | if (test_bit(J_ABORTED, &journal->j_state)) |
4218 | return; | 4227 | return; |
4219 | 4228 | ||
4220 | if (!journal->j_errno) | 4229 | if (!journal->j_errno) |
4221 | journal->j_errno = errno; | 4230 | journal->j_errno = errno; |
4222 | 4231 | ||
4223 | __reiserfs_journal_abort_hard(sb); | 4232 | __reiserfs_journal_abort_hard(sb); |
4224 | } | 4233 | } |
4225 | 4234 | ||
/*
 * Public entry point for aborting the reiserfs journal.  Records @errno
 * as the journal's error (if none is set yet) and forces the filesystem
 * read-only via the soft-abort path.
 *
 * Fix: the original wrote "return __reiserfs_journal_abort_soft(...);",
 * returning an expression from a void function — a C constraint
 * violation (C11 6.8.6.4) that only compiles as a GCC extension.
 * The call and the (implicit) return are now separate.
 */
void reiserfs_journal_abort(struct super_block *sb, int errno)
{
	__reiserfs_journal_abort_soft(sb, errno);
}
4230 | 4239 |