Commit 3fe58f30b4fc3f8a9084b035a02bc0c67bee8d00

Authored by Christoph Hellwig
Committed by Ben Myers
1 parent 983d09ffe3

xfs: add CRC checks for quota blocks

Use the reserved space in struct xfs_dqblk to store a UUID and a CRC
for the quota blocks.

[dchinner@redhat.com] Add an LSN field and update for the current verifier
infrastructure.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
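
For context, the commit message refers to fields carved out of the previously
reserved tail of the on-disk dquot record. The sketch below is reconstructed
from that description (the actual definition is in the quota header changed by
this commit, presumably xfs_quota.h, which is not the file shown here), and the
kernel types xfs_disk_dquot_t, uuid_t, __be32 and __be64 are assumed from
context:

	/* Sketch: on-disk per-dquot record with the CRC-era fields. */
	typedef struct xfs_dqblk {
		xfs_disk_dquot_t  dd_diskdq;  /* portion also kept incore */
		char              dd_fill[4]; /* padding */

		/* Only written/verified on filesystems with the CRC feature bit: */
		__be32            dd_crc;     /* checksum of the whole record */
		__be64            dd_lsn;     /* LSN of the last modification */
		uuid_t            dd_uuid;    /* filesystem UUID, matched against sb_uuid */
	} xfs_dqblk_t;

This is consistent with the verifier hunks below, which checksum each record
over sizeof(struct xfs_dqblk) bytes with the CRC stored at
offsetof(struct xfs_dqblk, dd_crc) and compare dd_uuid against
mp->m_sb.sb_uuid.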

Showing 5 changed files with 141 additions and 17 deletions

1 /* 1 /*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2003 Silicon Graphics, Inc.
3 * All Rights Reserved. 3 * All Rights Reserved.
4 * 4 *
5 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as 6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * This program is distributed in the hope that it would be useful, 9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18 #include "xfs.h" 18 #include "xfs.h"
19 #include "xfs_fs.h" 19 #include "xfs_fs.h"
20 #include "xfs_bit.h" 20 #include "xfs_bit.h"
21 #include "xfs_log.h" 21 #include "xfs_log.h"
22 #include "xfs_trans.h" 22 #include "xfs_trans.h"
23 #include "xfs_sb.h" 23 #include "xfs_sb.h"
24 #include "xfs_ag.h" 24 #include "xfs_ag.h"
25 #include "xfs_alloc.h" 25 #include "xfs_alloc.h"
26 #include "xfs_quota.h" 26 #include "xfs_quota.h"
27 #include "xfs_mount.h" 27 #include "xfs_mount.h"
28 #include "xfs_bmap_btree.h" 28 #include "xfs_bmap_btree.h"
29 #include "xfs_inode.h" 29 #include "xfs_inode.h"
30 #include "xfs_bmap.h" 30 #include "xfs_bmap.h"
31 #include "xfs_rtalloc.h" 31 #include "xfs_rtalloc.h"
32 #include "xfs_error.h" 32 #include "xfs_error.h"
33 #include "xfs_itable.h" 33 #include "xfs_itable.h"
34 #include "xfs_attr.h" 34 #include "xfs_attr.h"
35 #include "xfs_buf_item.h" 35 #include "xfs_buf_item.h"
36 #include "xfs_trans_space.h" 36 #include "xfs_trans_space.h"
37 #include "xfs_trans_priv.h" 37 #include "xfs_trans_priv.h"
38 #include "xfs_qm.h" 38 #include "xfs_qm.h"
39 #include "xfs_cksum.h"
39 #include "xfs_trace.h" 40 #include "xfs_trace.h"
40 41
41 /* 42 /*
42 * Lock order: 43 * Lock order:
43 * 44 *
44 * ip->i_lock 45 * ip->i_lock
45 * qi->qi_tree_lock 46 * qi->qi_tree_lock
46 * dquot->q_qlock (xfs_dqlock() and friends) 47 * dquot->q_qlock (xfs_dqlock() and friends)
47 * dquot->q_flush (xfs_dqflock() and friends) 48 * dquot->q_flush (xfs_dqflock() and friends)
48 * qi->qi_lru_lock 49 * qi->qi_lru_lock
49 * 50 *
50 * If two dquots need to be locked the order is user before group/project, 51 * If two dquots need to be locked the order is user before group/project,
51 * otherwise by the lowest id first, see xfs_dqlock2. 52 * otherwise by the lowest id first, see xfs_dqlock2.
52 */ 53 */
53 54
54 #ifdef DEBUG 55 #ifdef DEBUG
55 xfs_buftarg_t *xfs_dqerror_target; 56 xfs_buftarg_t *xfs_dqerror_target;
56 int xfs_do_dqerror; 57 int xfs_do_dqerror;
57 int xfs_dqreq_num; 58 int xfs_dqreq_num;
58 int xfs_dqerror_mod = 33; 59 int xfs_dqerror_mod = 33;
59 #endif 60 #endif
60 61
61 struct kmem_zone *xfs_qm_dqtrxzone; 62 struct kmem_zone *xfs_qm_dqtrxzone;
62 static struct kmem_zone *xfs_qm_dqzone; 63 static struct kmem_zone *xfs_qm_dqzone;
63 64
64 static struct lock_class_key xfs_dquot_other_class; 65 static struct lock_class_key xfs_dquot_other_class;
65 66
66 /* 67 /*
67 * This is called to free all the memory associated with a dquot 68 * This is called to free all the memory associated with a dquot
68 */ 69 */
69 void 70 void
70 xfs_qm_dqdestroy( 71 xfs_qm_dqdestroy(
71 xfs_dquot_t *dqp) 72 xfs_dquot_t *dqp)
72 { 73 {
73 ASSERT(list_empty(&dqp->q_lru)); 74 ASSERT(list_empty(&dqp->q_lru));
74 75
75 mutex_destroy(&dqp->q_qlock); 76 mutex_destroy(&dqp->q_qlock);
76 kmem_zone_free(xfs_qm_dqzone, dqp); 77 kmem_zone_free(xfs_qm_dqzone, dqp);
77 78
78 XFS_STATS_DEC(xs_qm_dquot); 79 XFS_STATS_DEC(xs_qm_dquot);
79 } 80 }
80 81
81 /* 82 /*
82 * If default limits are in force, push them into the dquot now. 83 * If default limits are in force, push them into the dquot now.
83 * We overwrite the dquot limits only if they are zero and this 84 * We overwrite the dquot limits only if they are zero and this
84 * is not the root dquot. 85 * is not the root dquot.
85 */ 86 */
86 void 87 void
87 xfs_qm_adjust_dqlimits( 88 xfs_qm_adjust_dqlimits(
88 struct xfs_mount *mp, 89 struct xfs_mount *mp,
89 struct xfs_dquot *dq) 90 struct xfs_dquot *dq)
90 { 91 {
91 struct xfs_quotainfo *q = mp->m_quotainfo; 92 struct xfs_quotainfo *q = mp->m_quotainfo;
92 struct xfs_disk_dquot *d = &dq->q_core; 93 struct xfs_disk_dquot *d = &dq->q_core;
93 int prealloc = 0; 94 int prealloc = 0;
94 95
95 ASSERT(d->d_id); 96 ASSERT(d->d_id);
96 97
97 if (q->qi_bsoftlimit && !d->d_blk_softlimit) { 98 if (q->qi_bsoftlimit && !d->d_blk_softlimit) {
98 d->d_blk_softlimit = cpu_to_be64(q->qi_bsoftlimit); 99 d->d_blk_softlimit = cpu_to_be64(q->qi_bsoftlimit);
99 prealloc = 1; 100 prealloc = 1;
100 } 101 }
101 if (q->qi_bhardlimit && !d->d_blk_hardlimit) { 102 if (q->qi_bhardlimit && !d->d_blk_hardlimit) {
102 d->d_blk_hardlimit = cpu_to_be64(q->qi_bhardlimit); 103 d->d_blk_hardlimit = cpu_to_be64(q->qi_bhardlimit);
103 prealloc = 1; 104 prealloc = 1;
104 } 105 }
105 if (q->qi_isoftlimit && !d->d_ino_softlimit) 106 if (q->qi_isoftlimit && !d->d_ino_softlimit)
106 d->d_ino_softlimit = cpu_to_be64(q->qi_isoftlimit); 107 d->d_ino_softlimit = cpu_to_be64(q->qi_isoftlimit);
107 if (q->qi_ihardlimit && !d->d_ino_hardlimit) 108 if (q->qi_ihardlimit && !d->d_ino_hardlimit)
108 d->d_ino_hardlimit = cpu_to_be64(q->qi_ihardlimit); 109 d->d_ino_hardlimit = cpu_to_be64(q->qi_ihardlimit);
109 if (q->qi_rtbsoftlimit && !d->d_rtb_softlimit) 110 if (q->qi_rtbsoftlimit && !d->d_rtb_softlimit)
110 d->d_rtb_softlimit = cpu_to_be64(q->qi_rtbsoftlimit); 111 d->d_rtb_softlimit = cpu_to_be64(q->qi_rtbsoftlimit);
111 if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit) 112 if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit)
112 d->d_rtb_hardlimit = cpu_to_be64(q->qi_rtbhardlimit); 113 d->d_rtb_hardlimit = cpu_to_be64(q->qi_rtbhardlimit);
113 114
114 if (prealloc) 115 if (prealloc)
115 xfs_dquot_set_prealloc_limits(dq); 116 xfs_dquot_set_prealloc_limits(dq);
116 } 117 }
117 118
118 /* 119 /*
119 * Check the limits and timers of a dquot and start or reset timers 120 * Check the limits and timers of a dquot and start or reset timers
120 * if necessary. 121 * if necessary.
121 * This gets called even when quota enforcement is OFF, which makes our 122 * This gets called even when quota enforcement is OFF, which makes our
122 * life a little less complicated. (We just don't reject any quota 123 * life a little less complicated. (We just don't reject any quota
123 * reservations in that case, when enforcement is off). 124 * reservations in that case, when enforcement is off).
124 * We also return 0 as the values of the timers in Q_GETQUOTA calls, when 125 * We also return 0 as the values of the timers in Q_GETQUOTA calls, when
125 * enforcement's off. 126 * enforcement's off.
126 * In contrast, warnings are a little different in that they don't 127 * In contrast, warnings are a little different in that they don't
127 * 'automatically' get started when limits get exceeded. They do 128 * 'automatically' get started when limits get exceeded. They do
128 * get reset to zero, however, when we find the count to be under 129 * get reset to zero, however, when we find the count to be under
129 * the soft limit (they are only ever set non-zero via userspace). 130 * the soft limit (they are only ever set non-zero via userspace).
130 */ 131 */
131 void 132 void
132 xfs_qm_adjust_dqtimers( 133 xfs_qm_adjust_dqtimers(
133 xfs_mount_t *mp, 134 xfs_mount_t *mp,
134 xfs_disk_dquot_t *d) 135 xfs_disk_dquot_t *d)
135 { 136 {
136 ASSERT(d->d_id); 137 ASSERT(d->d_id);
137 138
138 #ifdef DEBUG 139 #ifdef DEBUG
139 if (d->d_blk_hardlimit) 140 if (d->d_blk_hardlimit)
140 ASSERT(be64_to_cpu(d->d_blk_softlimit) <= 141 ASSERT(be64_to_cpu(d->d_blk_softlimit) <=
141 be64_to_cpu(d->d_blk_hardlimit)); 142 be64_to_cpu(d->d_blk_hardlimit));
142 if (d->d_ino_hardlimit) 143 if (d->d_ino_hardlimit)
143 ASSERT(be64_to_cpu(d->d_ino_softlimit) <= 144 ASSERT(be64_to_cpu(d->d_ino_softlimit) <=
144 be64_to_cpu(d->d_ino_hardlimit)); 145 be64_to_cpu(d->d_ino_hardlimit));
145 if (d->d_rtb_hardlimit) 146 if (d->d_rtb_hardlimit)
146 ASSERT(be64_to_cpu(d->d_rtb_softlimit) <= 147 ASSERT(be64_to_cpu(d->d_rtb_softlimit) <=
147 be64_to_cpu(d->d_rtb_hardlimit)); 148 be64_to_cpu(d->d_rtb_hardlimit));
148 #endif 149 #endif
149 150
150 if (!d->d_btimer) { 151 if (!d->d_btimer) {
151 if ((d->d_blk_softlimit && 152 if ((d->d_blk_softlimit &&
152 (be64_to_cpu(d->d_bcount) > 153 (be64_to_cpu(d->d_bcount) >
153 be64_to_cpu(d->d_blk_softlimit))) || 154 be64_to_cpu(d->d_blk_softlimit))) ||
154 (d->d_blk_hardlimit && 155 (d->d_blk_hardlimit &&
155 (be64_to_cpu(d->d_bcount) > 156 (be64_to_cpu(d->d_bcount) >
156 be64_to_cpu(d->d_blk_hardlimit)))) { 157 be64_to_cpu(d->d_blk_hardlimit)))) {
157 d->d_btimer = cpu_to_be32(get_seconds() + 158 d->d_btimer = cpu_to_be32(get_seconds() +
158 mp->m_quotainfo->qi_btimelimit); 159 mp->m_quotainfo->qi_btimelimit);
159 } else { 160 } else {
160 d->d_bwarns = 0; 161 d->d_bwarns = 0;
161 } 162 }
162 } else { 163 } else {
163 if ((!d->d_blk_softlimit || 164 if ((!d->d_blk_softlimit ||
164 (be64_to_cpu(d->d_bcount) <= 165 (be64_to_cpu(d->d_bcount) <=
165 be64_to_cpu(d->d_blk_softlimit))) && 166 be64_to_cpu(d->d_blk_softlimit))) &&
166 (!d->d_blk_hardlimit || 167 (!d->d_blk_hardlimit ||
167 (be64_to_cpu(d->d_bcount) <= 168 (be64_to_cpu(d->d_bcount) <=
168 be64_to_cpu(d->d_blk_hardlimit)))) { 169 be64_to_cpu(d->d_blk_hardlimit)))) {
169 d->d_btimer = 0; 170 d->d_btimer = 0;
170 } 171 }
171 } 172 }
172 173
173 if (!d->d_itimer) { 174 if (!d->d_itimer) {
174 if ((d->d_ino_softlimit && 175 if ((d->d_ino_softlimit &&
175 (be64_to_cpu(d->d_icount) > 176 (be64_to_cpu(d->d_icount) >
176 be64_to_cpu(d->d_ino_softlimit))) || 177 be64_to_cpu(d->d_ino_softlimit))) ||
177 (d->d_ino_hardlimit && 178 (d->d_ino_hardlimit &&
178 (be64_to_cpu(d->d_icount) > 179 (be64_to_cpu(d->d_icount) >
179 be64_to_cpu(d->d_ino_hardlimit)))) { 180 be64_to_cpu(d->d_ino_hardlimit)))) {
180 d->d_itimer = cpu_to_be32(get_seconds() + 181 d->d_itimer = cpu_to_be32(get_seconds() +
181 mp->m_quotainfo->qi_itimelimit); 182 mp->m_quotainfo->qi_itimelimit);
182 } else { 183 } else {
183 d->d_iwarns = 0; 184 d->d_iwarns = 0;
184 } 185 }
185 } else { 186 } else {
186 if ((!d->d_ino_softlimit || 187 if ((!d->d_ino_softlimit ||
187 (be64_to_cpu(d->d_icount) <= 188 (be64_to_cpu(d->d_icount) <=
188 be64_to_cpu(d->d_ino_softlimit))) && 189 be64_to_cpu(d->d_ino_softlimit))) &&
189 (!d->d_ino_hardlimit || 190 (!d->d_ino_hardlimit ||
190 (be64_to_cpu(d->d_icount) <= 191 (be64_to_cpu(d->d_icount) <=
191 be64_to_cpu(d->d_ino_hardlimit)))) { 192 be64_to_cpu(d->d_ino_hardlimit)))) {
192 d->d_itimer = 0; 193 d->d_itimer = 0;
193 } 194 }
194 } 195 }
195 196
196 if (!d->d_rtbtimer) { 197 if (!d->d_rtbtimer) {
197 if ((d->d_rtb_softlimit && 198 if ((d->d_rtb_softlimit &&
198 (be64_to_cpu(d->d_rtbcount) > 199 (be64_to_cpu(d->d_rtbcount) >
199 be64_to_cpu(d->d_rtb_softlimit))) || 200 be64_to_cpu(d->d_rtb_softlimit))) ||
200 (d->d_rtb_hardlimit && 201 (d->d_rtb_hardlimit &&
201 (be64_to_cpu(d->d_rtbcount) > 202 (be64_to_cpu(d->d_rtbcount) >
202 be64_to_cpu(d->d_rtb_hardlimit)))) { 203 be64_to_cpu(d->d_rtb_hardlimit)))) {
203 d->d_rtbtimer = cpu_to_be32(get_seconds() + 204 d->d_rtbtimer = cpu_to_be32(get_seconds() +
204 mp->m_quotainfo->qi_rtbtimelimit); 205 mp->m_quotainfo->qi_rtbtimelimit);
205 } else { 206 } else {
206 d->d_rtbwarns = 0; 207 d->d_rtbwarns = 0;
207 } 208 }
208 } else { 209 } else {
209 if ((!d->d_rtb_softlimit || 210 if ((!d->d_rtb_softlimit ||
210 (be64_to_cpu(d->d_rtbcount) <= 211 (be64_to_cpu(d->d_rtbcount) <=
211 be64_to_cpu(d->d_rtb_softlimit))) && 212 be64_to_cpu(d->d_rtb_softlimit))) &&
212 (!d->d_rtb_hardlimit || 213 (!d->d_rtb_hardlimit ||
213 (be64_to_cpu(d->d_rtbcount) <= 214 (be64_to_cpu(d->d_rtbcount) <=
214 be64_to_cpu(d->d_rtb_hardlimit)))) { 215 be64_to_cpu(d->d_rtb_hardlimit)))) {
215 d->d_rtbtimer = 0; 216 d->d_rtbtimer = 0;
216 } 217 }
217 } 218 }
218 } 219 }
219 220
220 /* 221 /*
221 * initialize a buffer full of dquots and log the whole thing 222 * initialize a buffer full of dquots and log the whole thing
222 */ 223 */
223 STATIC void 224 STATIC void
224 xfs_qm_init_dquot_blk( 225 xfs_qm_init_dquot_blk(
225 xfs_trans_t *tp, 226 xfs_trans_t *tp,
226 xfs_mount_t *mp, 227 xfs_mount_t *mp,
227 xfs_dqid_t id, 228 xfs_dqid_t id,
228 uint type, 229 uint type,
229 xfs_buf_t *bp) 230 xfs_buf_t *bp)
230 { 231 {
231 struct xfs_quotainfo *q = mp->m_quotainfo; 232 struct xfs_quotainfo *q = mp->m_quotainfo;
232 xfs_dqblk_t *d; 233 xfs_dqblk_t *d;
233 int curid, i; 234 int curid, i;
234 235
235 ASSERT(tp); 236 ASSERT(tp);
236 ASSERT(xfs_buf_islocked(bp)); 237 ASSERT(xfs_buf_islocked(bp));
237 238
238 d = bp->b_addr; 239 d = bp->b_addr;
239 240
240 /* 241 /*
241 * ID of the first dquot in the block - id's are zero based. 242 * ID of the first dquot in the block - id's are zero based.
242 */ 243 */
243 curid = id - (id % q->qi_dqperchunk); 244 curid = id - (id % q->qi_dqperchunk);
244 ASSERT(curid >= 0); 245 ASSERT(curid >= 0);
245 memset(d, 0, BBTOB(q->qi_dqchunklen)); 246 memset(d, 0, BBTOB(q->qi_dqchunklen));
246 for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) { 247 for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) {
247 d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); 248 d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
248 d->dd_diskdq.d_version = XFS_DQUOT_VERSION; 249 d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
249 d->dd_diskdq.d_id = cpu_to_be32(curid); 250 d->dd_diskdq.d_id = cpu_to_be32(curid);
250 d->dd_diskdq.d_flags = type; 251 d->dd_diskdq.d_flags = type;
252 if (xfs_sb_version_hascrc(&mp->m_sb))
253 uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
251 } 254 }
252 255
253 xfs_trans_dquot_buf(tp, bp, 256 xfs_trans_dquot_buf(tp, bp,
254 (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF : 257 (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF :
255 ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF : 258 ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF :
256 XFS_BLF_GDQUOT_BUF))); 259 XFS_BLF_GDQUOT_BUF)));
257 xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); 260 xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
258 } 261 }
259 262
260 /* 263 /*
261 * Initialize the dynamic speculative preallocation thresholds. The lo/hi 264 * Initialize the dynamic speculative preallocation thresholds. The lo/hi
262 * watermarks correspond to the soft and hard limits by default. If a soft limit 265 * watermarks correspond to the soft and hard limits by default. If a soft limit
263 * is not specified, we use 95% of the hard limit. 266 * is not specified, we use 95% of the hard limit.
264 */ 267 */
265 void 268 void
266 xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp) 269 xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp)
267 { 270 {
268 __uint64_t space; 271 __uint64_t space;
269 272
270 dqp->q_prealloc_hi_wmark = be64_to_cpu(dqp->q_core.d_blk_hardlimit); 273 dqp->q_prealloc_hi_wmark = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
271 dqp->q_prealloc_lo_wmark = be64_to_cpu(dqp->q_core.d_blk_softlimit); 274 dqp->q_prealloc_lo_wmark = be64_to_cpu(dqp->q_core.d_blk_softlimit);
272 if (!dqp->q_prealloc_lo_wmark) { 275 if (!dqp->q_prealloc_lo_wmark) {
273 dqp->q_prealloc_lo_wmark = dqp->q_prealloc_hi_wmark; 276 dqp->q_prealloc_lo_wmark = dqp->q_prealloc_hi_wmark;
274 do_div(dqp->q_prealloc_lo_wmark, 100); 277 do_div(dqp->q_prealloc_lo_wmark, 100);
275 dqp->q_prealloc_lo_wmark *= 95; 278 dqp->q_prealloc_lo_wmark *= 95;
276 } 279 }
277 280
278 space = dqp->q_prealloc_hi_wmark; 281 space = dqp->q_prealloc_hi_wmark;
279 282
280 do_div(space, 100); 283 do_div(space, 100);
281 dqp->q_low_space[XFS_QLOWSP_1_PCNT] = space; 284 dqp->q_low_space[XFS_QLOWSP_1_PCNT] = space;
282 dqp->q_low_space[XFS_QLOWSP_3_PCNT] = space * 3; 285 dqp->q_low_space[XFS_QLOWSP_3_PCNT] = space * 3;
283 dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5; 286 dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5;
284 } 287 }
285 288
286 static void 289 STATIC void
290 xfs_dquot_buf_calc_crc(
291 struct xfs_mount *mp,
292 struct xfs_buf *bp)
293 {
294 struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
295 int i;
296
297 if (!xfs_sb_version_hascrc(&mp->m_sb))
298 return;
299
300 for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++, d++) {
301 xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
302 offsetof(struct xfs_dqblk, dd_crc));
303 }
304 }
305
306 STATIC bool
307 xfs_dquot_buf_verify_crc(
308 struct xfs_mount *mp,
309 struct xfs_buf *bp)
310 {
311 struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
312 int ndquots;
313 int i;
314
315 if (!xfs_sb_version_hascrc(&mp->m_sb))
316 return true;
317
318 /*
319 * if we are in log recovery, the quota subsystem has not been
320 * initialised so we have no quotainfo structure. In that case, we need
321 * to manually calculate the number of dquots in the buffer.
322 */
323 if (mp->m_quotainfo)
324 ndquots = mp->m_quotainfo->qi_dqperchunk;
325 else
326 ndquots = xfs_qm_calc_dquots_per_chunk(mp,
327 XFS_BB_TO_FSB(mp, bp->b_length));
328
329 for (i = 0; i < ndquots; i++, d++) {
330 if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
331 offsetof(struct xfs_dqblk, dd_crc)))
332 return false;
333 if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid))
334 return false;
335 }
336
337 return true;
338 }
339
340 STATIC bool
287 xfs_dquot_buf_verify( 341 xfs_dquot_buf_verify(
342 struct xfs_mount *mp,
288 struct xfs_buf *bp) 343 struct xfs_buf *bp)
289 { 344 {
290 struct xfs_mount *mp = bp->b_target->bt_mount;
291 struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; 345 struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
292 struct xfs_disk_dquot *ddq;
293 xfs_dqid_t id = 0; 346 xfs_dqid_t id = 0;
347 int ndquots;
294 int i; 348 int i;
295 349
296 /* 350 /*
351 * if we are in log recovery, the quota subsystem has not been
352 * initialised so we have no quotainfo structure. In that case, we need
353 * to manually calculate the number of dquots in the buffer.
354 */
355 if (mp->m_quotainfo)
356 ndquots = mp->m_quotainfo->qi_dqperchunk;
357 else
358 ndquots = xfs_qm_calc_dquots_per_chunk(mp, bp->b_length);
359
360 /*
297 * On the first read of the buffer, verify that each dquot is valid. 361 * On the first read of the buffer, verify that each dquot is valid.
298 * We don't know what the id of the dquot is supposed to be, just that 362 * We don't know what the id of the dquot is supposed to be, just that
299 * they should be increasing monotonically within the buffer. If the 363 * they should be increasing monotonically within the buffer. If the
300 * first id is corrupt, then it will fail on the second dquot in the 364 * first id is corrupt, then it will fail on the second dquot in the
301 * buffer so corruptions could point to the wrong dquot in this case. 365 * buffer so corruptions could point to the wrong dquot in this case.
302 */ 366 */
303 for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { 367 for (i = 0; i < ndquots; i++) {
304 int error; 368 struct xfs_disk_dquot *ddq;
369 int error;
305 370
306 ddq = &d[i].dd_diskdq; 371 ddq = &d[i].dd_diskdq;
307 372
308 if (i == 0) 373 if (i == 0)
309 id = be32_to_cpu(ddq->d_id); 374 id = be32_to_cpu(ddq->d_id);
310 375
311 error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, 376 error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
312 "xfs_dquot_read_verify"); 377 "xfs_dquot_buf_verify");
313 if (error) { 378 if (error)
314 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d); 379 return false;
315 xfs_buf_ioerror(bp, EFSCORRUPTED);
316 break;
317 }
318 } 380 }
381 return true;
319 } 382 }
320 383
321 static void 384 static void
322 xfs_dquot_buf_read_verify( 385 xfs_dquot_buf_read_verify(
323 struct xfs_buf *bp) 386 struct xfs_buf *bp)
324 { 387 {
325 xfs_dquot_buf_verify(bp); 388 struct xfs_mount *mp = bp->b_target->bt_mount;
389
390 if (!xfs_dquot_buf_verify_crc(mp, bp) || !xfs_dquot_buf_verify(mp, bp)) {
391 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
392 xfs_buf_ioerror(bp, EFSCORRUPTED);
393 }
326 } 394 }
327 395
328 void 396 void
329 xfs_dquot_buf_write_verify( 397 xfs_dquot_buf_write_verify(
330 struct xfs_buf *bp) 398 struct xfs_buf *bp)
331 { 399 {
332 xfs_dquot_buf_verify(bp); 400 struct xfs_mount *mp = bp->b_target->bt_mount;
401
402 if (!xfs_dquot_buf_verify(mp, bp)) {
403 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
404 xfs_buf_ioerror(bp, EFSCORRUPTED);
405 return;
406 }
407 xfs_dquot_buf_calc_crc(mp, bp);
333 } 408 }
334 409
335 const struct xfs_buf_ops xfs_dquot_buf_ops = { 410 const struct xfs_buf_ops xfs_dquot_buf_ops = {
336 .verify_read = xfs_dquot_buf_read_verify, 411 .verify_read = xfs_dquot_buf_read_verify,
337 .verify_write = xfs_dquot_buf_write_verify, 412 .verify_write = xfs_dquot_buf_write_verify,
338 }; 413 };
339 414
340 /* 415 /*
341 * Allocate a block and fill it with dquots. 416 * Allocate a block and fill it with dquots.
342 * This is called when the bmapi finds a hole. 417 * This is called when the bmapi finds a hole.
343 */ 418 */
344 STATIC int 419 STATIC int
345 xfs_qm_dqalloc( 420 xfs_qm_dqalloc(
346 xfs_trans_t **tpp, 421 xfs_trans_t **tpp,
347 xfs_mount_t *mp, 422 xfs_mount_t *mp,
348 xfs_dquot_t *dqp, 423 xfs_dquot_t *dqp,
349 xfs_inode_t *quotip, 424 xfs_inode_t *quotip,
350 xfs_fileoff_t offset_fsb, 425 xfs_fileoff_t offset_fsb,
351 xfs_buf_t **O_bpp) 426 xfs_buf_t **O_bpp)
352 { 427 {
353 xfs_fsblock_t firstblock; 428 xfs_fsblock_t firstblock;
354 xfs_bmap_free_t flist; 429 xfs_bmap_free_t flist;
355 xfs_bmbt_irec_t map; 430 xfs_bmbt_irec_t map;
356 int nmaps, error, committed; 431 int nmaps, error, committed;
357 xfs_buf_t *bp; 432 xfs_buf_t *bp;
358 xfs_trans_t *tp = *tpp; 433 xfs_trans_t *tp = *tpp;
359 434
360 ASSERT(tp != NULL); 435 ASSERT(tp != NULL);
361 436
362 trace_xfs_dqalloc(dqp); 437 trace_xfs_dqalloc(dqp);
363 438
364 /* 439 /*
365 * Initialize the bmap freelist prior to calling bmapi code. 440 * Initialize the bmap freelist prior to calling bmapi code.
366 */ 441 */
367 xfs_bmap_init(&flist, &firstblock); 442 xfs_bmap_init(&flist, &firstblock);
368 xfs_ilock(quotip, XFS_ILOCK_EXCL); 443 xfs_ilock(quotip, XFS_ILOCK_EXCL);
369 /* 444 /*
370 * Return if this type of quotas is turned off while we didn't 445 * Return if this type of quotas is turned off while we didn't
371 * have an inode lock 446 * have an inode lock
372 */ 447 */
373 if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { 448 if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
374 xfs_iunlock(quotip, XFS_ILOCK_EXCL); 449 xfs_iunlock(quotip, XFS_ILOCK_EXCL);
375 return (ESRCH); 450 return (ESRCH);
376 } 451 }
377 452
378 xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); 453 xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
379 nmaps = 1; 454 nmaps = 1;
380 error = xfs_bmapi_write(tp, quotip, offset_fsb, 455 error = xfs_bmapi_write(tp, quotip, offset_fsb,
381 XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 456 XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
382 &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp), 457 &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp),
383 &map, &nmaps, &flist); 458 &map, &nmaps, &flist);
384 if (error) 459 if (error)
385 goto error0; 460 goto error0;
386 ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); 461 ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
387 ASSERT(nmaps == 1); 462 ASSERT(nmaps == 1);
388 ASSERT((map.br_startblock != DELAYSTARTBLOCK) && 463 ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
389 (map.br_startblock != HOLESTARTBLOCK)); 464 (map.br_startblock != HOLESTARTBLOCK));
390 465
391 /* 466 /*
392 * Keep track of the blkno to save a lookup later 467 * Keep track of the blkno to save a lookup later
393 */ 468 */
394 dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); 469 dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
395 470
396 /* now we can just get the buffer (there's nothing to read yet) */ 471 /* now we can just get the buffer (there's nothing to read yet) */
397 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, 472 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
398 dqp->q_blkno, 473 dqp->q_blkno,
399 mp->m_quotainfo->qi_dqchunklen, 474 mp->m_quotainfo->qi_dqchunklen,
400 0); 475 0);
401 476
402 error = xfs_buf_geterror(bp); 477 error = xfs_buf_geterror(bp);
403 if (error) 478 if (error)
404 goto error1; 479 goto error1;
405 bp->b_ops = &xfs_dquot_buf_ops; 480 bp->b_ops = &xfs_dquot_buf_ops;
406 481
407 /* 482 /*
408 * Make a chunk of dquots out of this buffer and log 483 * Make a chunk of dquots out of this buffer and log
409 * the entire thing. 484 * the entire thing.
410 */ 485 */
411 xfs_qm_init_dquot_blk(tp, mp, be32_to_cpu(dqp->q_core.d_id), 486 xfs_qm_init_dquot_blk(tp, mp, be32_to_cpu(dqp->q_core.d_id),
412 dqp->dq_flags & XFS_DQ_ALLTYPES, bp); 487 dqp->dq_flags & XFS_DQ_ALLTYPES, bp);
413 488
414 /* 489 /*
415 * xfs_bmap_finish() may commit the current transaction and 490 * xfs_bmap_finish() may commit the current transaction and
416 * start a second transaction if the freelist is not empty. 491 * start a second transaction if the freelist is not empty.
417 * 492 *
418 * Since we still want to modify this buffer, we need to 493 * Since we still want to modify this buffer, we need to
419 * ensure that the buffer is not released on commit of 494 * ensure that the buffer is not released on commit of
420 * the first transaction and ensure the buffer is added to the 495 * the first transaction and ensure the buffer is added to the
421 * second transaction. 496 * second transaction.
422 * 497 *
423 * If there is only one transaction then don't stop the buffer 498 * If there is only one transaction then don't stop the buffer
424 * from being released when it commits later on. 499 * from being released when it commits later on.
425 */ 500 */
426 501
427 xfs_trans_bhold(tp, bp); 502 xfs_trans_bhold(tp, bp);
428 503
429 if ((error = xfs_bmap_finish(tpp, &flist, &committed))) { 504 if ((error = xfs_bmap_finish(tpp, &flist, &committed))) {
430 goto error1; 505 goto error1;
431 } 506 }
432 507
433 if (committed) { 508 if (committed) {
434 tp = *tpp; 509 tp = *tpp;
435 xfs_trans_bjoin(tp, bp); 510 xfs_trans_bjoin(tp, bp);
436 } else { 511 } else {
437 xfs_trans_bhold_release(tp, bp); 512 xfs_trans_bhold_release(tp, bp);
438 } 513 }
439 514
440 *O_bpp = bp; 515 *O_bpp = bp;
441 return 0; 516 return 0;
442 517
443 error1: 518 error1:
444 xfs_bmap_cancel(&flist); 519 xfs_bmap_cancel(&flist);
445 error0: 520 error0:
446 xfs_iunlock(quotip, XFS_ILOCK_EXCL); 521 xfs_iunlock(quotip, XFS_ILOCK_EXCL);
447 522
448 return (error); 523 return (error);
449 } 524 }
450 STATIC int 525 STATIC int
451 xfs_qm_dqrepair( 526 xfs_qm_dqrepair(
452 struct xfs_mount *mp, 527 struct xfs_mount *mp,
453 struct xfs_trans *tp, 528 struct xfs_trans *tp,
454 struct xfs_dquot *dqp, 529 struct xfs_dquot *dqp,
455 xfs_dqid_t firstid, 530 xfs_dqid_t firstid,
456 struct xfs_buf **bpp) 531 struct xfs_buf **bpp)
457 { 532 {
458 int error; 533 int error;
459 struct xfs_disk_dquot *ddq; 534 struct xfs_disk_dquot *ddq;
460 struct xfs_dqblk *d; 535 struct xfs_dqblk *d;
461 int i; 536 int i;
462 537
463 /* 538 /*
464 * Read the buffer without verification so we get the corrupted 539 * Read the buffer without verification so we get the corrupted
465 * buffer returned to us. make sure we verify it on write, though. 540 * buffer returned to us. make sure we verify it on write, though.
466 */ 541 */
467 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, 542 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno,
468 mp->m_quotainfo->qi_dqchunklen, 543 mp->m_quotainfo->qi_dqchunklen,
469 0, bpp, NULL); 544 0, bpp, NULL);
470 545
471 if (error) { 546 if (error) {
472 ASSERT(*bpp == NULL); 547 ASSERT(*bpp == NULL);
473 return XFS_ERROR(error); 548 return XFS_ERROR(error);
474 } 549 }
475 (*bpp)->b_ops = &xfs_dquot_buf_ops; 550 (*bpp)->b_ops = &xfs_dquot_buf_ops;
476 551
477 ASSERT(xfs_buf_islocked(*bpp)); 552 ASSERT(xfs_buf_islocked(*bpp));
478 d = (struct xfs_dqblk *)(*bpp)->b_addr; 553 d = (struct xfs_dqblk *)(*bpp)->b_addr;
479 554
480 /* Do the actual repair of dquots in this buffer */ 555 /* Do the actual repair of dquots in this buffer */
481 for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { 556 for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) {
482 ddq = &d[i].dd_diskdq; 557 ddq = &d[i].dd_diskdq;
483 error = xfs_qm_dqcheck(mp, ddq, firstid + i, 558 error = xfs_qm_dqcheck(mp, ddq, firstid + i,
484 dqp->dq_flags & XFS_DQ_ALLTYPES, 559 dqp->dq_flags & XFS_DQ_ALLTYPES,
485 XFS_QMOPT_DQREPAIR, "xfs_qm_dqrepair"); 560 XFS_QMOPT_DQREPAIR, "xfs_qm_dqrepair");
486 if (error) { 561 if (error) {
487 /* repair failed, we're screwed */ 562 /* repair failed, we're screwed */
488 xfs_trans_brelse(tp, *bpp); 563 xfs_trans_brelse(tp, *bpp);
489 return XFS_ERROR(EIO); 564 return XFS_ERROR(EIO);
490 } 565 }
491 } 566 }
492 567
493 return 0; 568 return 0;
494 } 569 }
495 570
496 /* 571 /*
497 * Maps a dquot to the buffer containing its on-disk version. 572 * Maps a dquot to the buffer containing its on-disk version.
498 * This returns a ptr to the buffer containing the on-disk dquot 573 * This returns a ptr to the buffer containing the on-disk dquot
499 * in the bpp param, and a ptr to the on-disk dquot within that buffer 574 * in the bpp param, and a ptr to the on-disk dquot within that buffer
500 */ 575 */
501 STATIC int 576 STATIC int
502 xfs_qm_dqtobp( 577 xfs_qm_dqtobp(
503 xfs_trans_t **tpp, 578 xfs_trans_t **tpp,
504 xfs_dquot_t *dqp, 579 xfs_dquot_t *dqp,
505 xfs_disk_dquot_t **O_ddpp, 580 xfs_disk_dquot_t **O_ddpp,
506 xfs_buf_t **O_bpp, 581 xfs_buf_t **O_bpp,
507 uint flags) 582 uint flags)
508 { 583 {
509 xfs_bmbt_irec_t map; 584 xfs_bmbt_irec_t map;
510 int nmaps = 1, error; 585 int nmaps = 1, error;
511 xfs_buf_t *bp; 586 xfs_buf_t *bp;
512 xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp); 587 xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp);
513 xfs_mount_t *mp = dqp->q_mount; 588 xfs_mount_t *mp = dqp->q_mount;
514 xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); 589 xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
515 xfs_trans_t *tp = (tpp ? *tpp : NULL); 590 xfs_trans_t *tp = (tpp ? *tpp : NULL);
516 591
517 dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; 592 dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
518 593
519 xfs_ilock(quotip, XFS_ILOCK_SHARED); 594 xfs_ilock(quotip, XFS_ILOCK_SHARED);
520 if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { 595 if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
521 /* 596 /*
522 * Return if this type of quotas is turned off while we 597 * Return if this type of quotas is turned off while we
523 * didn't have the quota inode lock. 598 * didn't have the quota inode lock.
524 */ 599 */
525 xfs_iunlock(quotip, XFS_ILOCK_SHARED); 600 xfs_iunlock(quotip, XFS_ILOCK_SHARED);
526 return ESRCH; 601 return ESRCH;
527 } 602 }
528 603
529 /* 604 /*
530 * Find the block map; no allocations yet 605 * Find the block map; no allocations yet
531 */ 606 */
532 error = xfs_bmapi_read(quotip, dqp->q_fileoffset, 607 error = xfs_bmapi_read(quotip, dqp->q_fileoffset,
533 XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0); 608 XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0);
534 609
535 xfs_iunlock(quotip, XFS_ILOCK_SHARED); 610 xfs_iunlock(quotip, XFS_ILOCK_SHARED);
536 if (error) 611 if (error)
537 return error; 612 return error;
538 613
539 ASSERT(nmaps == 1); 614 ASSERT(nmaps == 1);
540 ASSERT(map.br_blockcount == 1); 615 ASSERT(map.br_blockcount == 1);
541 616
542 /* 617 /*
543 * Offset of dquot in the (fixed sized) dquot chunk. 618 * Offset of dquot in the (fixed sized) dquot chunk.
544 */ 619 */
545 dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) * 620 dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
546 sizeof(xfs_dqblk_t); 621 sizeof(xfs_dqblk_t);
547 622
548 ASSERT(map.br_startblock != DELAYSTARTBLOCK); 623 ASSERT(map.br_startblock != DELAYSTARTBLOCK);
549 if (map.br_startblock == HOLESTARTBLOCK) { 624 if (map.br_startblock == HOLESTARTBLOCK) {
550 /* 625 /*
551 * We don't allocate unless we're asked to 626 * We don't allocate unless we're asked to
552 */ 627 */
553 if (!(flags & XFS_QMOPT_DQALLOC)) 628 if (!(flags & XFS_QMOPT_DQALLOC))
554 return ENOENT; 629 return ENOENT;
555 630
556 ASSERT(tp); 631 ASSERT(tp);
557 error = xfs_qm_dqalloc(tpp, mp, dqp, quotip, 632 error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
558 dqp->q_fileoffset, &bp); 633 dqp->q_fileoffset, &bp);
559 if (error) 634 if (error)
560 return error; 635 return error;
561 tp = *tpp; 636 tp = *tpp;
562 } else { 637 } else {
563 trace_xfs_dqtobp_read(dqp); 638 trace_xfs_dqtobp_read(dqp);
564 639
565 /* 640 /*
566 * store the blkno etc so that we don't have to do the 641 * store the blkno etc so that we don't have to do the
567 * mapping all the time 642 * mapping all the time
568 */ 643 */
569 dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); 644 dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
570 645
571 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 646 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
572 dqp->q_blkno, 647 dqp->q_blkno,
573 mp->m_quotainfo->qi_dqchunklen, 648 mp->m_quotainfo->qi_dqchunklen,
574 0, &bp, &xfs_dquot_buf_ops); 649 0, &bp, &xfs_dquot_buf_ops);
575 650
576 if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) { 651 if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) {
577 xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff * 652 xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff *
578 mp->m_quotainfo->qi_dqperchunk; 653 mp->m_quotainfo->qi_dqperchunk;
579 ASSERT(bp == NULL); 654 ASSERT(bp == NULL);
580 error = xfs_qm_dqrepair(mp, tp, dqp, firstid, &bp); 655 error = xfs_qm_dqrepair(mp, tp, dqp, firstid, &bp);
581 } 656 }
582 657
583 if (error) { 658 if (error) {
584 ASSERT(bp == NULL); 659 ASSERT(bp == NULL);
585 return XFS_ERROR(error); 660 return XFS_ERROR(error);
586 } 661 }
587 } 662 }
588 663
589 ASSERT(xfs_buf_islocked(bp)); 664 ASSERT(xfs_buf_islocked(bp));
590 *O_bpp = bp; 665 *O_bpp = bp;
591 *O_ddpp = bp->b_addr + dqp->q_bufoffset; 666 *O_ddpp = bp->b_addr + dqp->q_bufoffset;
592 667
593 return (0); 668 return (0);
594 } 669 }
595 670
596 671
597 /* 672 /*
598 * Read in the ondisk dquot using dqtobp() then copy it to an incore version, 673 * Read in the ondisk dquot using dqtobp() then copy it to an incore version,
599 * and release the buffer immediately. 674 * and release the buffer immediately.
600 * 675 *
601 * If XFS_QMOPT_DQALLOC is set, allocate a dquot on disk if needed. 676 * If XFS_QMOPT_DQALLOC is set, allocate a dquot on disk if needed.
602 */ 677 */
603 int 678 int
604 xfs_qm_dqread( 679 xfs_qm_dqread(
605 struct xfs_mount *mp, 680 struct xfs_mount *mp,
606 xfs_dqid_t id, 681 xfs_dqid_t id,
607 uint type, 682 uint type,
608 uint flags, 683 uint flags,
609 struct xfs_dquot **O_dqpp) 684 struct xfs_dquot **O_dqpp)
610 { 685 {
611 struct xfs_dquot *dqp; 686 struct xfs_dquot *dqp;
612 struct xfs_disk_dquot *ddqp; 687 struct xfs_disk_dquot *ddqp;
613 struct xfs_buf *bp; 688 struct xfs_buf *bp;
614 struct xfs_trans *tp = NULL; 689 struct xfs_trans *tp = NULL;
615 int error; 690 int error;
616 int cancelflags = 0; 691 int cancelflags = 0;
617 692
618 693
619 dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP); 694 dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP);
620 695
621 dqp->dq_flags = type; 696 dqp->dq_flags = type;
622 dqp->q_core.d_id = cpu_to_be32(id); 697 dqp->q_core.d_id = cpu_to_be32(id);
623 dqp->q_mount = mp; 698 dqp->q_mount = mp;
624 INIT_LIST_HEAD(&dqp->q_lru); 699 INIT_LIST_HEAD(&dqp->q_lru);
625 mutex_init(&dqp->q_qlock); 700 mutex_init(&dqp->q_qlock);
626 init_waitqueue_head(&dqp->q_pinwait); 701 init_waitqueue_head(&dqp->q_pinwait);
627 702
628 /* 703 /*
629 * Because we want to use a counting completion, complete 704 * Because we want to use a counting completion, complete
630 * the flush completion once to allow a single access to 705 * the flush completion once to allow a single access to
631 * the flush completion without blocking. 706 * the flush completion without blocking.
632 */ 707 */
633 init_completion(&dqp->q_flush); 708 init_completion(&dqp->q_flush);
634 complete(&dqp->q_flush); 709 complete(&dqp->q_flush);
635 710
636 /* 711 /*
637 * Make sure group quotas have a different lock class than user 712 * Make sure group quotas have a different lock class than user
638 * quotas. 713 * quotas.
639 */ 714 */
640 if (!(type & XFS_DQ_USER)) 715 if (!(type & XFS_DQ_USER))
641 lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class); 716 lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class);
642 717
643 XFS_STATS_INC(xs_qm_dquot); 718 XFS_STATS_INC(xs_qm_dquot);
644 719
645 trace_xfs_dqread(dqp); 720 trace_xfs_dqread(dqp);
646 721
647 if (flags & XFS_QMOPT_DQALLOC) { 722 if (flags & XFS_QMOPT_DQALLOC) {
648 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); 723 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
649 error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp), 724 error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp),
650 XFS_QM_DQALLOC_LOG_RES(mp), 0, 725 XFS_QM_DQALLOC_LOG_RES(mp), 0,
651 XFS_TRANS_PERM_LOG_RES, 726 XFS_TRANS_PERM_LOG_RES,
652 XFS_WRITE_LOG_COUNT); 727 XFS_WRITE_LOG_COUNT);
653 if (error) 728 if (error)
654 goto error1; 729 goto error1;
655 cancelflags = XFS_TRANS_RELEASE_LOG_RES; 730 cancelflags = XFS_TRANS_RELEASE_LOG_RES;
656 } 731 }
657 732
658 /* 733 /*
659 * get a pointer to the on-disk dquot and the buffer containing it 734 * get a pointer to the on-disk dquot and the buffer containing it
660 * dqp already knows its own type (GROUP/USER). 735 * dqp already knows its own type (GROUP/USER).
661 */ 736 */
662 error = xfs_qm_dqtobp(&tp, dqp, &ddqp, &bp, flags); 737 error = xfs_qm_dqtobp(&tp, dqp, &ddqp, &bp, flags);
663 if (error) { 738 if (error) {
664 /* 739 /*
665 * This can happen if quotas got turned off (ESRCH), 740 * This can happen if quotas got turned off (ESRCH),
666 * or if the dquot didn't exist on disk and we ask to 741 * or if the dquot didn't exist on disk and we ask to
667 * allocate (ENOENT). 742 * allocate (ENOENT).
668 */ 743 */
669 trace_xfs_dqread_fail(dqp); 744 trace_xfs_dqread_fail(dqp);
670 cancelflags |= XFS_TRANS_ABORT; 745 cancelflags |= XFS_TRANS_ABORT;
671 goto error1; 746 goto error1;
672 } 747 }
673 748
674 /* copy everything from disk dquot to the incore dquot */ 749 /* copy everything from disk dquot to the incore dquot */
675 memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t)); 750 memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t));
676 xfs_qm_dquot_logitem_init(dqp); 751 xfs_qm_dquot_logitem_init(dqp);
677 752
678 /* 753 /*
679 * Reservation counters are defined as reservation plus current usage 754 * Reservation counters are defined as reservation plus current usage
680 * to avoid having to add every time. 755 * to avoid having to add every time.
681 */ 756 */
682 dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount); 757 dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount);
683 dqp->q_res_icount = be64_to_cpu(ddqp->d_icount); 758 dqp->q_res_icount = be64_to_cpu(ddqp->d_icount);
684 dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount); 759 dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount);
685 760
686 /* initialize the dquot speculative prealloc thresholds */ 761 /* initialize the dquot speculative prealloc thresholds */
687 xfs_dquot_set_prealloc_limits(dqp); 762 xfs_dquot_set_prealloc_limits(dqp);
688 763
689 /* Mark the buf so that this will stay incore a little longer */ 764 /* Mark the buf so that this will stay incore a little longer */
690 xfs_buf_set_ref(bp, XFS_DQUOT_REF); 765 xfs_buf_set_ref(bp, XFS_DQUOT_REF);
691 766
692 /* 767 /*
693 * We got the buffer with a xfs_trans_read_buf() (in dqtobp()) 768 * We got the buffer with a xfs_trans_read_buf() (in dqtobp())
694 * So we need to release with xfs_trans_brelse(). 769 * So we need to release with xfs_trans_brelse().
695 * The strategy here is identical to that of inodes; we lock 770 * The strategy here is identical to that of inodes; we lock
696 * the dquot in xfs_qm_dqget() before making it accessible to 771 * the dquot in xfs_qm_dqget() before making it accessible to
697 * others. This is because dquots, like inodes, need a good level of 772 * others. This is because dquots, like inodes, need a good level of
698 * concurrency, and we don't want to take locks on the entire buffers 773 * concurrency, and we don't want to take locks on the entire buffers
699 * for dquot accesses. 774 * for dquot accesses.
700 * Note also that the dquot buffer may even be dirty at this point, if 775 * Note also that the dquot buffer may even be dirty at this point, if
701 * this particular dquot was repaired. We still aren't afraid to 776 * this particular dquot was repaired. We still aren't afraid to
702 * brelse it because we have the changes incore. 777 * brelse it because we have the changes incore.
703 */ 778 */
704 ASSERT(xfs_buf_islocked(bp)); 779 ASSERT(xfs_buf_islocked(bp));
705 xfs_trans_brelse(tp, bp); 780 xfs_trans_brelse(tp, bp);
706 781
707 if (tp) { 782 if (tp) {
708 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 783 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
709 if (error) 784 if (error)
710 goto error0; 785 goto error0;
711 } 786 }
712 787
713 *O_dqpp = dqp; 788 *O_dqpp = dqp;
714 return error; 789 return error;
715 790
716 error1: 791 error1:
717 if (tp) 792 if (tp)
718 xfs_trans_cancel(tp, cancelflags); 793 xfs_trans_cancel(tp, cancelflags);
719 error0: 794 error0:
720 xfs_qm_dqdestroy(dqp); 795 xfs_qm_dqdestroy(dqp);
721 *O_dqpp = NULL; 796 *O_dqpp = NULL;
722 return error; 797 return error;
723 } 798 }
724 799
725 /* 800 /*
726 * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a 801 * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a
727 * locked dquot, doing an allocation (if requested) as needed. 802 * locked dquot, doing an allocation (if requested) as needed.
728 * When both an inode and an id are given, the inode's id takes precedence. 803 * When both an inode and an id are given, the inode's id takes precedence.
729 * That is, if the id changes while we don't hold the ilock inside this 804 * That is, if the id changes while we don't hold the ilock inside this
730 * function, the new dquot is returned, not necessarily the one requested 805 * function, the new dquot is returned, not necessarily the one requested
731 * in the id argument. 806 * in the id argument.
732 */ 807 */
733 int 808 int
734 xfs_qm_dqget( 809 xfs_qm_dqget(
735 xfs_mount_t *mp, 810 xfs_mount_t *mp,
736 xfs_inode_t *ip, /* locked inode (optional) */ 811 xfs_inode_t *ip, /* locked inode (optional) */
737 xfs_dqid_t id, /* uid/projid/gid depending on type */ 812 xfs_dqid_t id, /* uid/projid/gid depending on type */
738 uint type, /* XFS_DQ_USER/XFS_DQ_PROJ/XFS_DQ_GROUP */ 813 uint type, /* XFS_DQ_USER/XFS_DQ_PROJ/XFS_DQ_GROUP */
739 uint flags, /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */ 814 uint flags, /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */
740 xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */ 815 xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */
741 { 816 {
742 struct xfs_quotainfo *qi = mp->m_quotainfo; 817 struct xfs_quotainfo *qi = mp->m_quotainfo;
743 struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type); 818 struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type);
744 struct xfs_dquot *dqp; 819 struct xfs_dquot *dqp;
745 int error; 820 int error;
746 821
747 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 822 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
748 if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) || 823 if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) ||
749 (! XFS_IS_PQUOTA_ON(mp) && type == XFS_DQ_PROJ) || 824 (! XFS_IS_PQUOTA_ON(mp) && type == XFS_DQ_PROJ) ||
750 (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) { 825 (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) {
751 return (ESRCH); 826 return (ESRCH);
752 } 827 }
753 828
754 #ifdef DEBUG 829 #ifdef DEBUG
755 if (xfs_do_dqerror) { 830 if (xfs_do_dqerror) {
756 if ((xfs_dqerror_target == mp->m_ddev_targp) && 831 if ((xfs_dqerror_target == mp->m_ddev_targp) &&
757 (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) { 832 (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) {
758 xfs_debug(mp, "Returning error in dqget"); 833 xfs_debug(mp, "Returning error in dqget");
759 return (EIO); 834 return (EIO);
760 } 835 }
761 } 836 }
762 837
763 ASSERT(type == XFS_DQ_USER || 838 ASSERT(type == XFS_DQ_USER ||
764 type == XFS_DQ_PROJ || 839 type == XFS_DQ_PROJ ||
765 type == XFS_DQ_GROUP); 840 type == XFS_DQ_GROUP);
766 if (ip) { 841 if (ip) {
767 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 842 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
768 ASSERT(xfs_inode_dquot(ip, type) == NULL); 843 ASSERT(xfs_inode_dquot(ip, type) == NULL);
769 } 844 }
770 #endif 845 #endif
771 846
772 restart: 847 restart:
773 mutex_lock(&qi->qi_tree_lock); 848 mutex_lock(&qi->qi_tree_lock);
774 dqp = radix_tree_lookup(tree, id); 849 dqp = radix_tree_lookup(tree, id);
775 if (dqp) { 850 if (dqp) {
776 xfs_dqlock(dqp); 851 xfs_dqlock(dqp);
777 if (dqp->dq_flags & XFS_DQ_FREEING) { 852 if (dqp->dq_flags & XFS_DQ_FREEING) {
778 xfs_dqunlock(dqp); 853 xfs_dqunlock(dqp);
779 mutex_unlock(&qi->qi_tree_lock); 854 mutex_unlock(&qi->qi_tree_lock);
780 trace_xfs_dqget_freeing(dqp); 855 trace_xfs_dqget_freeing(dqp);
781 delay(1); 856 delay(1);
782 goto restart; 857 goto restart;
783 } 858 }
784 859
785 dqp->q_nrefs++; 860 dqp->q_nrefs++;
786 mutex_unlock(&qi->qi_tree_lock); 861 mutex_unlock(&qi->qi_tree_lock);
787 862
788 trace_xfs_dqget_hit(dqp); 863 trace_xfs_dqget_hit(dqp);
789 XFS_STATS_INC(xs_qm_dqcachehits); 864 XFS_STATS_INC(xs_qm_dqcachehits);
790 *O_dqpp = dqp; 865 *O_dqpp = dqp;
791 return 0; 866 return 0;
792 } 867 }
793 mutex_unlock(&qi->qi_tree_lock); 868 mutex_unlock(&qi->qi_tree_lock);
794 XFS_STATS_INC(xs_qm_dqcachemisses); 869 XFS_STATS_INC(xs_qm_dqcachemisses);
795 870
796 /* 871 /*
797 * Dquot cache miss. We don't want to keep the inode lock across 872 * Dquot cache miss. We don't want to keep the inode lock across
798 * a (potential) disk read. Also we don't want to deal with the lock 873 * a (potential) disk read. Also we don't want to deal with the lock
799 * ordering between quotainode and this inode. OTOH, dropping the inode 874 * ordering between quotainode and this inode. OTOH, dropping the inode
800 * lock here means dealing with a chown that can happen before 875 * lock here means dealing with a chown that can happen before
801 * we re-acquire the lock. 876 * we re-acquire the lock.
802 */ 877 */
803 if (ip) 878 if (ip)
804 xfs_iunlock(ip, XFS_ILOCK_EXCL); 879 xfs_iunlock(ip, XFS_ILOCK_EXCL);
805 880
806 error = xfs_qm_dqread(mp, id, type, flags, &dqp); 881 error = xfs_qm_dqread(mp, id, type, flags, &dqp);
807 882
808 if (ip) 883 if (ip)
809 xfs_ilock(ip, XFS_ILOCK_EXCL); 884 xfs_ilock(ip, XFS_ILOCK_EXCL);
810 885
811 if (error) 886 if (error)
812 return error; 887 return error;
813 888
814 if (ip) { 889 if (ip) {
815 /* 890 /*
816 * A dquot could be attached to this inode by now, since 891 * A dquot could be attached to this inode by now, since
817 * we had dropped the ilock. 892 * we had dropped the ilock.
818 */ 893 */
819 if (xfs_this_quota_on(mp, type)) { 894 if (xfs_this_quota_on(mp, type)) {
820 struct xfs_dquot *dqp1; 895 struct xfs_dquot *dqp1;
821 896
822 dqp1 = xfs_inode_dquot(ip, type); 897 dqp1 = xfs_inode_dquot(ip, type);
823 if (dqp1) { 898 if (dqp1) {
824 xfs_qm_dqdestroy(dqp); 899 xfs_qm_dqdestroy(dqp);
825 dqp = dqp1; 900 dqp = dqp1;
826 xfs_dqlock(dqp); 901 xfs_dqlock(dqp);
827 goto dqret; 902 goto dqret;
828 } 903 }
829 } else { 904 } else {
830 /* inode stays locked on return */ 905 /* inode stays locked on return */
831 xfs_qm_dqdestroy(dqp); 906 xfs_qm_dqdestroy(dqp);
832 return XFS_ERROR(ESRCH); 907 return XFS_ERROR(ESRCH);
833 } 908 }
834 } 909 }
835 910
836 mutex_lock(&qi->qi_tree_lock); 911 mutex_lock(&qi->qi_tree_lock);
837 error = -radix_tree_insert(tree, id, dqp); 912 error = -radix_tree_insert(tree, id, dqp);
838 if (unlikely(error)) { 913 if (unlikely(error)) {
839 WARN_ON(error != EEXIST); 914 WARN_ON(error != EEXIST);
840 915
841 /* 916 /*
842 * Duplicate found. Just throw away the new dquot and start 917 * Duplicate found. Just throw away the new dquot and start
843 * over. 918 * over.
844 */ 919 */
845 mutex_unlock(&qi->qi_tree_lock); 920 mutex_unlock(&qi->qi_tree_lock);
846 trace_xfs_dqget_dup(dqp); 921 trace_xfs_dqget_dup(dqp);
847 xfs_qm_dqdestroy(dqp); 922 xfs_qm_dqdestroy(dqp);
848 XFS_STATS_INC(xs_qm_dquot_dups); 923 XFS_STATS_INC(xs_qm_dquot_dups);
849 goto restart; 924 goto restart;
850 } 925 }
851 926
852 /* 927 /*
853 * We return a locked dquot to the caller, with a reference taken 928 * We return a locked dquot to the caller, with a reference taken
854 */ 929 */
855 xfs_dqlock(dqp); 930 xfs_dqlock(dqp);
856 dqp->q_nrefs = 1; 931 dqp->q_nrefs = 1;
857 932
858 qi->qi_dquots++; 933 qi->qi_dquots++;
859 mutex_unlock(&qi->qi_tree_lock); 934 mutex_unlock(&qi->qi_tree_lock);
860 935
861 dqret: 936 dqret:
862 ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); 937 ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
863 trace_xfs_dqget_miss(dqp); 938 trace_xfs_dqget_miss(dqp);
864 *O_dqpp = dqp; 939 *O_dqpp = dqp;
865 return (0); 940 return (0);
866 } 941 }
867 942
868 943
869 STATIC void 944 STATIC void
870 xfs_qm_dqput_final( 945 xfs_qm_dqput_final(
871 struct xfs_dquot *dqp) 946 struct xfs_dquot *dqp)
872 { 947 {
873 struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; 948 struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo;
874 struct xfs_dquot *gdqp; 949 struct xfs_dquot *gdqp;
875 950
876 trace_xfs_dqput_free(dqp); 951 trace_xfs_dqput_free(dqp);
877 952
878 mutex_lock(&qi->qi_lru_lock); 953 mutex_lock(&qi->qi_lru_lock);
879 if (list_empty(&dqp->q_lru)) { 954 if (list_empty(&dqp->q_lru)) {
880 list_add_tail(&dqp->q_lru, &qi->qi_lru_list); 955 list_add_tail(&dqp->q_lru, &qi->qi_lru_list);
881 qi->qi_lru_count++; 956 qi->qi_lru_count++;
882 XFS_STATS_INC(xs_qm_dquot_unused); 957 XFS_STATS_INC(xs_qm_dquot_unused);
883 } 958 }
884 mutex_unlock(&qi->qi_lru_lock); 959 mutex_unlock(&qi->qi_lru_lock);
885 960
886 /* 961 /*
887 * If we just added a udquot to the freelist, then we want to release 962 * If we just added a udquot to the freelist, then we want to release
888 * the gdquot reference that it (probably) has. Otherwise it'll keep 963 * the gdquot reference that it (probably) has. Otherwise it'll keep
889 * the gdquot from getting reclaimed. 964 * the gdquot from getting reclaimed.
890 */ 965 */
891 gdqp = dqp->q_gdquot; 966 gdqp = dqp->q_gdquot;
892 if (gdqp) { 967 if (gdqp) {
893 xfs_dqlock(gdqp); 968 xfs_dqlock(gdqp);
894 dqp->q_gdquot = NULL; 969 dqp->q_gdquot = NULL;
895 } 970 }
896 xfs_dqunlock(dqp); 971 xfs_dqunlock(dqp);
897 972
898 /* 973 /*
899 * If we had a group quota hint, release it now. 974 * If we had a group quota hint, release it now.
900 */ 975 */
901 if (gdqp) 976 if (gdqp)
902 xfs_qm_dqput(gdqp); 977 xfs_qm_dqput(gdqp);
903 } 978 }
904 979
905 /* 980 /*
906 * Release a reference to the dquot (decrement ref-count) and unlock it. 981 * Release a reference to the dquot (decrement ref-count) and unlock it.
907 * 982 *
908 * If there is a group quota attached to this dquot, carefully release that 983 * If there is a group quota attached to this dquot, carefully release that
909 * too without tripping over deadlocks'n'stuff. 984 * too without tripping over deadlocks'n'stuff.
910 */ 985 */
911 void 986 void
912 xfs_qm_dqput( 987 xfs_qm_dqput(
913 struct xfs_dquot *dqp) 988 struct xfs_dquot *dqp)
914 { 989 {
915 ASSERT(dqp->q_nrefs > 0); 990 ASSERT(dqp->q_nrefs > 0);
916 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 991 ASSERT(XFS_DQ_IS_LOCKED(dqp));
917 992
918 trace_xfs_dqput(dqp); 993 trace_xfs_dqput(dqp);
919 994
920 if (--dqp->q_nrefs > 0) 995 if (--dqp->q_nrefs > 0)
921 xfs_dqunlock(dqp); 996 xfs_dqunlock(dqp);
922 else 997 else
923 xfs_qm_dqput_final(dqp); 998 xfs_qm_dqput_final(dqp);
924 } 999 }
925 1000
926 /* 1001 /*
927 * Release a dquot. Flush it if dirty, then dqput() it. 1002 * Release a dquot. Flush it if dirty, then dqput() it.
928 * dquot must not be locked. 1003 * dquot must not be locked.
929 */ 1004 */
930 void 1005 void
931 xfs_qm_dqrele( 1006 xfs_qm_dqrele(
932 xfs_dquot_t *dqp) 1007 xfs_dquot_t *dqp)
933 { 1008 {
934 if (!dqp) 1009 if (!dqp)
935 return; 1010 return;
936 1011
937 trace_xfs_dqrele(dqp); 1012 trace_xfs_dqrele(dqp);
938 1013
939 xfs_dqlock(dqp); 1014 xfs_dqlock(dqp);
940 /* 1015 /*
941 * We don't care to flush it if the dquot is dirty here. 1016 * We don't care to flush it if the dquot is dirty here.
942 * That will create stutters that we want to avoid. 1017 * That will create stutters that we want to avoid.
943 * Instead we do a delayed write when we try to reclaim 1018 * Instead we do a delayed write when we try to reclaim
944 * a dirty dquot. Also xfs_sync will take part of the burden... 1019 * a dirty dquot. Also xfs_sync will take part of the burden...
945 */ 1020 */
946 xfs_qm_dqput(dqp); 1021 xfs_qm_dqput(dqp);
947 } 1022 }
948 1023
949 /* 1024 /*
950 * This is the dquot flushing I/O completion routine. It is called 1025 * This is the dquot flushing I/O completion routine. It is called
951 * from interrupt level when the buffer containing the dquot is 1026 * from interrupt level when the buffer containing the dquot is
952 * flushed to disk. It is responsible for removing the dquot logitem 1027 * flushed to disk. It is responsible for removing the dquot logitem
953 * from the AIL if it has not been re-logged, and unlocking the dquot's 1028 * from the AIL if it has not been re-logged, and unlocking the dquot's
954 * flush lock. This behavior is very similar to that of inodes.. 1029 * flush lock. This behavior is very similar to that of inodes..
955 */ 1030 */
956 STATIC void 1031 STATIC void
957 xfs_qm_dqflush_done( 1032 xfs_qm_dqflush_done(
958 struct xfs_buf *bp, 1033 struct xfs_buf *bp,
959 struct xfs_log_item *lip) 1034 struct xfs_log_item *lip)
960 { 1035 {
961 xfs_dq_logitem_t *qip = (struct xfs_dq_logitem *)lip; 1036 xfs_dq_logitem_t *qip = (struct xfs_dq_logitem *)lip;
962 xfs_dquot_t *dqp = qip->qli_dquot; 1037 xfs_dquot_t *dqp = qip->qli_dquot;
963 struct xfs_ail *ailp = lip->li_ailp; 1038 struct xfs_ail *ailp = lip->li_ailp;
964 1039
965 /* 1040 /*
966 * We only want to pull the item from the AIL if its 1041 * We only want to pull the item from the AIL if its
967 * location in the log has not changed since we started the flush. 1042 * location in the log has not changed since we started the flush.
968 * Thus, we only bother if the dquot's lsn has 1043 * Thus, we only bother if the dquot's lsn has
969 * not changed. First we check the lsn outside the lock 1044 * not changed. First we check the lsn outside the lock
970 * since it's cheaper, and then we recheck while 1045 * since it's cheaper, and then we recheck while
971 * holding the lock before removing the dquot from the AIL. 1046 * holding the lock before removing the dquot from the AIL.
972 */ 1047 */
973 if ((lip->li_flags & XFS_LI_IN_AIL) && 1048 if ((lip->li_flags & XFS_LI_IN_AIL) &&
974 lip->li_lsn == qip->qli_flush_lsn) { 1049 lip->li_lsn == qip->qli_flush_lsn) {
975 1050
976 /* xfs_trans_ail_delete() drops the AIL lock. */ 1051 /* xfs_trans_ail_delete() drops the AIL lock. */
977 spin_lock(&ailp->xa_lock); 1052 spin_lock(&ailp->xa_lock);
978 if (lip->li_lsn == qip->qli_flush_lsn) 1053 if (lip->li_lsn == qip->qli_flush_lsn)
979 xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); 1054 xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
980 else 1055 else
981 spin_unlock(&ailp->xa_lock); 1056 spin_unlock(&ailp->xa_lock);
982 } 1057 }
983 1058
984 /* 1059 /*
985 * Release the dq's flush lock since we're done with it. 1060 * Release the dq's flush lock since we're done with it.
986 */ 1061 */
987 xfs_dqfunlock(dqp); 1062 xfs_dqfunlock(dqp);
988 } 1063 }
989 1064
990 /* 1065 /*
991 * Write a modified dquot to disk. 1066 * Write a modified dquot to disk.
992 * The dquot must be locked and the flush lock too taken by caller. 1067 * The dquot must be locked and the flush lock too taken by caller.
993 * The flush lock will not be unlocked until the dquot reaches the disk, 1068 * The flush lock will not be unlocked until the dquot reaches the disk,
994 * but the dquot is free to be unlocked and modified by the caller 1069 * but the dquot is free to be unlocked and modified by the caller
995 * in the interim. Dquot is still locked on return. This behavior is 1070 * in the interim. Dquot is still locked on return. This behavior is
996 * identical to that of inodes. 1071 * identical to that of inodes.
997 */ 1072 */
998 int 1073 int
999 xfs_qm_dqflush( 1074 xfs_qm_dqflush(
1000 struct xfs_dquot *dqp, 1075 struct xfs_dquot *dqp,
1001 struct xfs_buf **bpp) 1076 struct xfs_buf **bpp)
1002 { 1077 {
1003 struct xfs_mount *mp = dqp->q_mount; 1078 struct xfs_mount *mp = dqp->q_mount;
1004 struct xfs_buf *bp; 1079 struct xfs_buf *bp;
1005 struct xfs_disk_dquot *ddqp; 1080 struct xfs_disk_dquot *ddqp;
1006 int error; 1081 int error;
1007 1082
1008 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 1083 ASSERT(XFS_DQ_IS_LOCKED(dqp));
1009 ASSERT(!completion_done(&dqp->q_flush)); 1084 ASSERT(!completion_done(&dqp->q_flush));
1010 1085
1011 trace_xfs_dqflush(dqp); 1086 trace_xfs_dqflush(dqp);
1012 1087
1013 *bpp = NULL; 1088 *bpp = NULL;
1014 1089
1015 xfs_qm_dqunpin_wait(dqp); 1090 xfs_qm_dqunpin_wait(dqp);
1016 1091
1017 /* 1092 /*
1018 * This may have been unpinned because the filesystem is shutting 1093 * This may have been unpinned because the filesystem is shutting
1019 * down forcibly. If that's the case we must not write this dquot 1094 * down forcibly. If that's the case we must not write this dquot
1020 * to disk, because the log record didn't make it to disk. 1095 * to disk, because the log record didn't make it to disk.
1021 * 1096 *
1022 * We also have to remove the log item from the AIL in this case, 1097 * We also have to remove the log item from the AIL in this case,
1023 * as we wait for an empty AIL as part of the unmount process. 1098 * as we wait for an empty AIL as part of the unmount process.
1024 */ 1099 */
1025 if (XFS_FORCED_SHUTDOWN(mp)) { 1100 if (XFS_FORCED_SHUTDOWN(mp)) {
1026 struct xfs_log_item *lip = &dqp->q_logitem.qli_item; 1101 struct xfs_log_item *lip = &dqp->q_logitem.qli_item;
1027 dqp->dq_flags &= ~XFS_DQ_DIRTY; 1102 dqp->dq_flags &= ~XFS_DQ_DIRTY;
1028 1103
1029 spin_lock(&mp->m_ail->xa_lock); 1104 spin_lock(&mp->m_ail->xa_lock);
1030 if (lip->li_flags & XFS_LI_IN_AIL) 1105 if (lip->li_flags & XFS_LI_IN_AIL)
1031 xfs_trans_ail_delete(mp->m_ail, lip, 1106 xfs_trans_ail_delete(mp->m_ail, lip,
1032 SHUTDOWN_CORRUPT_INCORE); 1107 SHUTDOWN_CORRUPT_INCORE);
1033 else 1108 else
1034 spin_unlock(&mp->m_ail->xa_lock); 1109 spin_unlock(&mp->m_ail->xa_lock);
1035 error = XFS_ERROR(EIO); 1110 error = XFS_ERROR(EIO);
1036 goto out_unlock; 1111 goto out_unlock;
1037 } 1112 }
1038 1113
1039 /* 1114 /*
1040 * Get the buffer containing the on-disk dquot 1115 * Get the buffer containing the on-disk dquot
1041 */ 1116 */
1042 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, 1117 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
1043 mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL); 1118 mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL);
1044 if (error) 1119 if (error)
1045 goto out_unlock; 1120 goto out_unlock;
1046 1121
1047 /* 1122 /*
1048 * Calculate the location of the dquot inside the buffer. 1123 * Calculate the location of the dquot inside the buffer.
1049 */ 1124 */
1050 ddqp = bp->b_addr + dqp->q_bufoffset; 1125 ddqp = bp->b_addr + dqp->q_bufoffset;
1051 1126
1052 /* 1127 /*
1053 * A simple sanity check in case we got a corrupted dquot.. 1128 * A simple sanity check in case we got a corrupted dquot..
1054 */ 1129 */
1055 error = xfs_qm_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0, 1130 error = xfs_qm_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0,
1056 XFS_QMOPT_DOWARN, "dqflush (incore copy)"); 1131 XFS_QMOPT_DOWARN, "dqflush (incore copy)");
1057 if (error) { 1132 if (error) {
1058 xfs_buf_relse(bp); 1133 xfs_buf_relse(bp);
1059 xfs_dqfunlock(dqp); 1134 xfs_dqfunlock(dqp);
1060 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 1135 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1061 return XFS_ERROR(EIO); 1136 return XFS_ERROR(EIO);
1062 } 1137 }
1063 1138
1064 /* This is the only portion of data that needs to persist */ 1139 /* This is the only portion of data that needs to persist */
1065 memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t)); 1140 memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t));
1066 1141
1067 /* 1142 /*
1068 * Clear the dirty field and remember the flush lsn for later use. 1143 * Clear the dirty field and remember the flush lsn for later use.
1069 */ 1144 */
1070 dqp->dq_flags &= ~XFS_DQ_DIRTY; 1145 dqp->dq_flags &= ~XFS_DQ_DIRTY;
1071 1146
1072 xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn, 1147 xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn,
1073 &dqp->q_logitem.qli_item.li_lsn); 1148 &dqp->q_logitem.qli_item.li_lsn);
1149
1150 /*
1151 * copy the lsn into the on-disk dquot now while we have the in memory
1152 * dquot here. This can't be done later in the write verifier as we
1153 * can't get access to the log item at that point in time.
1154 */
1155 if (xfs_sb_version_hascrc(&mp->m_sb)) {
1156 struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddqp;
1157
1158 dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn);
1159 }
1074 1160
1075 /* 1161 /*
1076 * Attach an iodone routine so that we can remove this dquot from the 1162 * Attach an iodone routine so that we can remove this dquot from the
1077 * AIL and release the flush lock once the dquot is synced to disk. 1163 * AIL and release the flush lock once the dquot is synced to disk.
1078 */ 1164 */
1079 xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done, 1165 xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done,
1080 &dqp->q_logitem.qli_item); 1166 &dqp->q_logitem.qli_item);
1081 1167
1082 /* 1168 /*
1083 * If the buffer is pinned then push on the log so we won't 1169 * If the buffer is pinned then push on the log so we won't
1084 * get stuck waiting in the write for too long. 1170 * get stuck waiting in the write for too long.
1085 */ 1171 */
1086 if (xfs_buf_ispinned(bp)) { 1172 if (xfs_buf_ispinned(bp)) {
1087 trace_xfs_dqflush_force(dqp); 1173 trace_xfs_dqflush_force(dqp);
1088 xfs_log_force(mp, 0); 1174 xfs_log_force(mp, 0);
1089 } 1175 }
1090 1176
1091 trace_xfs_dqflush_done(dqp); 1177 trace_xfs_dqflush_done(dqp);
1092 *bpp = bp; 1178 *bpp = bp;
1093 return 0; 1179 return 0;
1094 1180
1095 out_unlock: 1181 out_unlock:
1096 xfs_dqfunlock(dqp); 1182 xfs_dqfunlock(dqp);
1097 return XFS_ERROR(EIO); 1183 return XFS_ERROR(EIO);
1098 } 1184 }
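
With the LSN stamped into each on-disk dquot above, every struct xfs_dqblk on a CRC-enabled filesystem carries its own self-describing metadata (CRC, UUID, LSN). A minimal sketch of the matching per-dqblk check on the read side, assuming the generic xfs_verify_cksum()/uuid_equal() helpers and the new dd_crc/dd_uuid field names, with the per-buffer dquot count supplied by the caller (illustrative only, not the exact routine added by this patch):

static bool
example_dquot_blk_verify(
	struct xfs_mount	*mp,
	struct xfs_buf		*bp,
	int			ndquots)	/* dquots per chunk, assumed supplied */
{
	struct xfs_dqblk	*d = (struct xfs_dqblk *)bp->b_addr;
	int			i;

	/* Non-CRC filesystems have nothing extra to verify. */
	if (!xfs_sb_version_hascrc(&mp->m_sb))
		return true;

	/* Every xfs_dqblk in the buffer must pass both checks. */
	for (i = 0; i < ndquots; i++, d++) {
		if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
				      offsetof(struct xfs_dqblk, dd_crc)))
			return false;
		if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid))
			return false;
	}
	return true;
}

Checking (and, on the write side, recalculating) the checksum per dquot rather than per buffer keeps each dquot independently recoverable during log replay, which is why the LSN must be stamped here in the flush path, where the log item is still reachable.
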
1099 1185
1100 /* 1186 /*
1101 * Lock two xfs_dquot structures. 1187 * Lock two xfs_dquot structures.
1102 * 1188 *
1103 * To avoid deadlocks we always lock the quota structure with 1189 * To avoid deadlocks we always lock the quota structure with
1104 * the lower id first. 1190 * the lower id first.
1105 */ 1191 */
1106 void 1192 void
1107 xfs_dqlock2( 1193 xfs_dqlock2(
1108 xfs_dquot_t *d1, 1194 xfs_dquot_t *d1,
1109 xfs_dquot_t *d2) 1195 xfs_dquot_t *d2)
1110 { 1196 {
1111 if (d1 && d2) { 1197 if (d1 && d2) {
1112 ASSERT(d1 != d2); 1198 ASSERT(d1 != d2);
1113 if (be32_to_cpu(d1->q_core.d_id) > 1199 if (be32_to_cpu(d1->q_core.d_id) >
1114 be32_to_cpu(d2->q_core.d_id)) { 1200 be32_to_cpu(d2->q_core.d_id)) {
1115 mutex_lock(&d2->q_qlock); 1201 mutex_lock(&d2->q_qlock);
1116 mutex_lock_nested(&d1->q_qlock, XFS_QLOCK_NESTED); 1202 mutex_lock_nested(&d1->q_qlock, XFS_QLOCK_NESTED);
1117 } else { 1203 } else {
1118 mutex_lock(&d1->q_qlock); 1204 mutex_lock(&d1->q_qlock);
1119 mutex_lock_nested(&d2->q_qlock, XFS_QLOCK_NESTED); 1205 mutex_lock_nested(&d2->q_qlock, XFS_QLOCK_NESTED);
1120 } 1206 }
1121 } else if (d1) { 1207 } else if (d1) {
1122 mutex_lock(&d1->q_qlock); 1208 mutex_lock(&d1->q_qlock);
1123 } else if (d2) { 1209 } else if (d2) {
1124 mutex_lock(&d2->q_qlock); 1210 mutex_lock(&d2->q_qlock);
1125 } 1211 }
1126 } 1212 }
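
The id-ordered locking means callers never have to think about argument order. A hypothetical caller that locks an inode's user and group dquots together (i_udquot/i_gdquot are the usual struct xfs_inode fields; either pointer may be NULL) could look like:

	xfs_dqlock2(ip->i_udquot, ip->i_gdquot);	/* lower-id dquot locked first */

	/* ... adjust or transfer counters while both dquots are held ... */

	if (ip->i_gdquot)
		xfs_dqunlock(ip->i_gdquot);
	if (ip->i_udquot)
		xfs_dqunlock(ip->i_udquot);
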
1127 1213
1128 int __init 1214 int __init
1129 xfs_qm_init(void) 1215 xfs_qm_init(void)
1130 { 1216 {
1131 xfs_qm_dqzone = 1217 xfs_qm_dqzone =
1132 kmem_zone_init(sizeof(struct xfs_dquot), "xfs_dquot"); 1218 kmem_zone_init(sizeof(struct xfs_dquot), "xfs_dquot");
1133 if (!xfs_qm_dqzone) 1219 if (!xfs_qm_dqzone)
1134 goto out; 1220 goto out;
1135 1221
1136 xfs_qm_dqtrxzone = 1222 xfs_qm_dqtrxzone =
1137 kmem_zone_init(sizeof(struct xfs_dquot_acct), "xfs_dqtrx"); 1223 kmem_zone_init(sizeof(struct xfs_dquot_acct), "xfs_dqtrx");
1138 if (!xfs_qm_dqtrxzone) 1224 if (!xfs_qm_dqtrxzone)
1139 goto out_free_dqzone; 1225 goto out_free_dqzone;
1140 1226
1141 return 0; 1227 return 0;
1142 1228
1143 out_free_dqzone: 1229 out_free_dqzone:
1144 kmem_zone_destroy(xfs_qm_dqzone); 1230 kmem_zone_destroy(xfs_qm_dqzone);
1145 out: 1231 out:
1146 return -ENOMEM; 1232 return -ENOMEM;
1147 } 1233 }
1148 1234
1149 void 1235 void
1150 xfs_qm_exit(void) 1236 xfs_qm_exit(void)
fs/xfs/xfs_log_recover.c
1 /* 1 /*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved. 3 * All Rights Reserved.
4 * 4 *
5 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as 6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * This program is distributed in the hope that it would be useful, 9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18 #include "xfs.h" 18 #include "xfs.h"
19 #include "xfs_fs.h" 19 #include "xfs_fs.h"
20 #include "xfs_types.h" 20 #include "xfs_types.h"
21 #include "xfs_bit.h" 21 #include "xfs_bit.h"
22 #include "xfs_log.h" 22 #include "xfs_log.h"
23 #include "xfs_inum.h" 23 #include "xfs_inum.h"
24 #include "xfs_trans.h" 24 #include "xfs_trans.h"
25 #include "xfs_sb.h" 25 #include "xfs_sb.h"
26 #include "xfs_ag.h" 26 #include "xfs_ag.h"
27 #include "xfs_mount.h" 27 #include "xfs_mount.h"
28 #include "xfs_error.h" 28 #include "xfs_error.h"
29 #include "xfs_bmap_btree.h" 29 #include "xfs_bmap_btree.h"
30 #include "xfs_alloc_btree.h" 30 #include "xfs_alloc_btree.h"
31 #include "xfs_ialloc_btree.h" 31 #include "xfs_ialloc_btree.h"
32 #include "xfs_btree.h" 32 #include "xfs_btree.h"
33 #include "xfs_dinode.h" 33 #include "xfs_dinode.h"
34 #include "xfs_inode.h" 34 #include "xfs_inode.h"
35 #include "xfs_inode_item.h" 35 #include "xfs_inode_item.h"
36 #include "xfs_alloc.h" 36 #include "xfs_alloc.h"
37 #include "xfs_ialloc.h" 37 #include "xfs_ialloc.h"
38 #include "xfs_log_priv.h" 38 #include "xfs_log_priv.h"
39 #include "xfs_buf_item.h" 39 #include "xfs_buf_item.h"
40 #include "xfs_log_recover.h" 40 #include "xfs_log_recover.h"
41 #include "xfs_extfree_item.h" 41 #include "xfs_extfree_item.h"
42 #include "xfs_trans_priv.h" 42 #include "xfs_trans_priv.h"
43 #include "xfs_quota.h" 43 #include "xfs_quota.h"
44 #include "xfs_utils.h" 44 #include "xfs_utils.h"
45 #include "xfs_cksum.h" 45 #include "xfs_cksum.h"
46 #include "xfs_trace.h" 46 #include "xfs_trace.h"
47 #include "xfs_icache.h" 47 #include "xfs_icache.h"
48 48
49 STATIC int 49 STATIC int
50 xlog_find_zeroed( 50 xlog_find_zeroed(
51 struct xlog *, 51 struct xlog *,
52 xfs_daddr_t *); 52 xfs_daddr_t *);
53 STATIC int 53 STATIC int
54 xlog_clear_stale_blocks( 54 xlog_clear_stale_blocks(
55 struct xlog *, 55 struct xlog *,
56 xfs_lsn_t); 56 xfs_lsn_t);
57 #if defined(DEBUG) 57 #if defined(DEBUG)
58 STATIC void 58 STATIC void
59 xlog_recover_check_summary( 59 xlog_recover_check_summary(
60 struct xlog *); 60 struct xlog *);
61 #else 61 #else
62 #define xlog_recover_check_summary(log) 62 #define xlog_recover_check_summary(log)
63 #endif 63 #endif
64 64
65 /* 65 /*
66 * This structure is used during recovery to record the buf log items which 66 * This structure is used during recovery to record the buf log items which
67 * have been canceled and should not be replayed. 67 * have been canceled and should not be replayed.
68 */ 68 */
69 struct xfs_buf_cancel { 69 struct xfs_buf_cancel {
70 xfs_daddr_t bc_blkno; 70 xfs_daddr_t bc_blkno;
71 uint bc_len; 71 uint bc_len;
72 int bc_refcount; 72 int bc_refcount;
73 struct list_head bc_list; 73 struct list_head bc_list;
74 }; 74 };
75 75
76 /* 76 /*
77 * Sector aligned buffer routines for buffer create/read/write/access 77 * Sector aligned buffer routines for buffer create/read/write/access
78 */ 78 */
79 79
80 /* 80 /*
81 * Verify the given count of basic blocks is valid number of blocks 81 * Verify the given count of basic blocks is valid number of blocks
82 * to specify for an operation involving the given XFS log buffer. 82 * to specify for an operation involving the given XFS log buffer.
83 * Returns nonzero if the count is valid, 0 otherwise. 83 * Returns nonzero if the count is valid, 0 otherwise.
84 */ 84 */
85 85
86 static inline int 86 static inline int
87 xlog_buf_bbcount_valid( 87 xlog_buf_bbcount_valid(
88 struct xlog *log, 88 struct xlog *log,
89 int bbcount) 89 int bbcount)
90 { 90 {
91 return bbcount > 0 && bbcount <= log->l_logBBsize; 91 return bbcount > 0 && bbcount <= log->l_logBBsize;
92 } 92 }
93 93
94 /* 94 /*
95 * Allocate a buffer to hold log data. The buffer needs to be able 95 * Allocate a buffer to hold log data. The buffer needs to be able
96 * to map to a range of nbblks basic blocks at any valid (basic 96 * to map to a range of nbblks basic blocks at any valid (basic
97 * block) offset within the log. 97 * block) offset within the log.
98 */ 98 */
99 STATIC xfs_buf_t * 99 STATIC xfs_buf_t *
100 xlog_get_bp( 100 xlog_get_bp(
101 struct xlog *log, 101 struct xlog *log,
102 int nbblks) 102 int nbblks)
103 { 103 {
104 struct xfs_buf *bp; 104 struct xfs_buf *bp;
105 105
106 if (!xlog_buf_bbcount_valid(log, nbblks)) { 106 if (!xlog_buf_bbcount_valid(log, nbblks)) {
107 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", 107 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
108 nbblks); 108 nbblks);
109 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 109 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
110 return NULL; 110 return NULL;
111 } 111 }
112 112
113 /* 113 /*
114 * We do log I/O in units of log sectors (a power-of-2 114 * We do log I/O in units of log sectors (a power-of-2
115 * multiple of the basic block size), so we round up the 115 * multiple of the basic block size), so we round up the
116 * requested size to accommodate the basic blocks required 116 * requested size to accommodate the basic blocks required
117 * for complete log sectors. 117 * for complete log sectors.
118 * 118 *
119 * In addition, the buffer may be used for a non-sector- 119 * In addition, the buffer may be used for a non-sector-
120 * aligned block offset, in which case an I/O of the 120 * aligned block offset, in which case an I/O of the
121 * requested size could extend beyond the end of the 121 * requested size could extend beyond the end of the
122 * buffer. If the requested size is only 1 basic block it 122 * buffer. If the requested size is only 1 basic block it
123 * will never straddle a sector boundary, so this won't be 123 * will never straddle a sector boundary, so this won't be
124 * an issue. Nor will this be a problem if the log I/O is 124 * an issue. Nor will this be a problem if the log I/O is
125 * done in basic blocks (sector size 1). But otherwise we 125 * done in basic blocks (sector size 1). But otherwise we
126 * extend the buffer by one extra log sector to ensure 126 * extend the buffer by one extra log sector to ensure
127 * there's space to accommodate this possibility. 127 * there's space to accommodate this possibility.
128 */ 128 */
129 if (nbblks > 1 && log->l_sectBBsize > 1) 129 if (nbblks > 1 && log->l_sectBBsize > 1)
130 nbblks += log->l_sectBBsize; 130 nbblks += log->l_sectBBsize;
131 nbblks = round_up(nbblks, log->l_sectBBsize); 131 nbblks = round_up(nbblks, log->l_sectBBsize);
132 132
133 bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0); 133 bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0);
134 if (bp) 134 if (bp)
135 xfs_buf_unlock(bp); 135 xfs_buf_unlock(bp);
136 return bp; 136 return bp;
137 } 137 }
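
To make the sizing concrete: on a log with 4k sectors (l_sectBBsize = 8), a request for nbblks = 10 basic blocks is first padded by one sector to 18 and then rounded up to 24 basic blocks, so a read that starts part-way into a sector still fits entirely within the buffer (example numbers only).
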
138 138
139 STATIC void 139 STATIC void
140 xlog_put_bp( 140 xlog_put_bp(
141 xfs_buf_t *bp) 141 xfs_buf_t *bp)
142 { 142 {
143 xfs_buf_free(bp); 143 xfs_buf_free(bp);
144 } 144 }
145 145
146 /* 146 /*
147 * Return the address of the start of the given block number's data 147 * Return the address of the start of the given block number's data
148 * in a log buffer. The buffer covers a log sector-aligned region. 148 * in a log buffer. The buffer covers a log sector-aligned region.
149 */ 149 */
150 STATIC xfs_caddr_t 150 STATIC xfs_caddr_t
151 xlog_align( 151 xlog_align(
152 struct xlog *log, 152 struct xlog *log,
153 xfs_daddr_t blk_no, 153 xfs_daddr_t blk_no,
154 int nbblks, 154 int nbblks,
155 struct xfs_buf *bp) 155 struct xfs_buf *bp)
156 { 156 {
157 xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1); 157 xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
158 158
159 ASSERT(offset + nbblks <= bp->b_length); 159 ASSERT(offset + nbblks <= bp->b_length);
160 return bp->b_addr + BBTOB(offset); 160 return bp->b_addr + BBTOB(offset);
161 } 161 }
162 162
163 163
164 /* 164 /*
165 * nbblks should be uint, but oh well. Just want to catch that 32-bit length. 165 * nbblks should be uint, but oh well. Just want to catch that 32-bit length.
166 */ 166 */
167 STATIC int 167 STATIC int
168 xlog_bread_noalign( 168 xlog_bread_noalign(
169 struct xlog *log, 169 struct xlog *log,
170 xfs_daddr_t blk_no, 170 xfs_daddr_t blk_no,
171 int nbblks, 171 int nbblks,
172 struct xfs_buf *bp) 172 struct xfs_buf *bp)
173 { 173 {
174 int error; 174 int error;
175 175
176 if (!xlog_buf_bbcount_valid(log, nbblks)) { 176 if (!xlog_buf_bbcount_valid(log, nbblks)) {
177 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", 177 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
178 nbblks); 178 nbblks);
179 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 179 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
180 return EFSCORRUPTED; 180 return EFSCORRUPTED;
181 } 181 }
182 182
183 blk_no = round_down(blk_no, log->l_sectBBsize); 183 blk_no = round_down(blk_no, log->l_sectBBsize);
184 nbblks = round_up(nbblks, log->l_sectBBsize); 184 nbblks = round_up(nbblks, log->l_sectBBsize);
185 185
186 ASSERT(nbblks > 0); 186 ASSERT(nbblks > 0);
187 ASSERT(nbblks <= bp->b_length); 187 ASSERT(nbblks <= bp->b_length);
188 188
189 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); 189 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
190 XFS_BUF_READ(bp); 190 XFS_BUF_READ(bp);
191 bp->b_io_length = nbblks; 191 bp->b_io_length = nbblks;
192 bp->b_error = 0; 192 bp->b_error = 0;
193 193
194 xfsbdstrat(log->l_mp, bp); 194 xfsbdstrat(log->l_mp, bp);
195 error = xfs_buf_iowait(bp); 195 error = xfs_buf_iowait(bp);
196 if (error) 196 if (error)
197 xfs_buf_ioerror_alert(bp, __func__); 197 xfs_buf_ioerror_alert(bp, __func__);
198 return error; 198 return error;
199 } 199 }
200 200
201 STATIC int 201 STATIC int
202 xlog_bread( 202 xlog_bread(
203 struct xlog *log, 203 struct xlog *log,
204 xfs_daddr_t blk_no, 204 xfs_daddr_t blk_no,
205 int nbblks, 205 int nbblks,
206 struct xfs_buf *bp, 206 struct xfs_buf *bp,
207 xfs_caddr_t *offset) 207 xfs_caddr_t *offset)
208 { 208 {
209 int error; 209 int error;
210 210
211 error = xlog_bread_noalign(log, blk_no, nbblks, bp); 211 error = xlog_bread_noalign(log, blk_no, nbblks, bp);
212 if (error) 212 if (error)
213 return error; 213 return error;
214 214
215 *offset = xlog_align(log, blk_no, nbblks, bp); 215 *offset = xlog_align(log, blk_no, nbblks, bp);
216 return 0; 216 return 0;
217 } 217 }
218 218
219 /* 219 /*
220 * Read at an offset into the buffer. Returns with the buffer in its original 220 * Read at an offset into the buffer. Returns with the buffer in its original
221 * state regardless of the result of the read. 221 * state regardless of the result of the read.
222 */ 222 */
223 STATIC int 223 STATIC int
224 xlog_bread_offset( 224 xlog_bread_offset(
225 struct xlog *log, 225 struct xlog *log,
226 xfs_daddr_t blk_no, /* block to read from */ 226 xfs_daddr_t blk_no, /* block to read from */
227 int nbblks, /* blocks to read */ 227 int nbblks, /* blocks to read */
228 struct xfs_buf *bp, 228 struct xfs_buf *bp,
229 xfs_caddr_t offset) 229 xfs_caddr_t offset)
230 { 230 {
231 xfs_caddr_t orig_offset = bp->b_addr; 231 xfs_caddr_t orig_offset = bp->b_addr;
232 int orig_len = BBTOB(bp->b_length); 232 int orig_len = BBTOB(bp->b_length);
233 int error, error2; 233 int error, error2;
234 234
235 error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks)); 235 error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks));
236 if (error) 236 if (error)
237 return error; 237 return error;
238 238
239 error = xlog_bread_noalign(log, blk_no, nbblks, bp); 239 error = xlog_bread_noalign(log, blk_no, nbblks, bp);
240 240
241 /* must reset buffer pointer even on error */ 241 /* must reset buffer pointer even on error */
242 error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len); 242 error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len);
243 if (error) 243 if (error)
244 return error; 244 return error;
245 return error2; 245 return error2;
246 } 246 }
247 247
248 /* 248 /*
249 * Write out the buffer at the given block for the given number of blocks. 249 * Write out the buffer at the given block for the given number of blocks.
250 * The buffer is kept locked across the write and is returned locked. 250 * The buffer is kept locked across the write and is returned locked.
251 * This can only be used for synchronous log writes. 251 * This can only be used for synchronous log writes.
252 */ 252 */
253 STATIC int 253 STATIC int
254 xlog_bwrite( 254 xlog_bwrite(
255 struct xlog *log, 255 struct xlog *log,
256 xfs_daddr_t blk_no, 256 xfs_daddr_t blk_no,
257 int nbblks, 257 int nbblks,
258 struct xfs_buf *bp) 258 struct xfs_buf *bp)
259 { 259 {
260 int error; 260 int error;
261 261
262 if (!xlog_buf_bbcount_valid(log, nbblks)) { 262 if (!xlog_buf_bbcount_valid(log, nbblks)) {
263 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", 263 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
264 nbblks); 264 nbblks);
265 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 265 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
266 return EFSCORRUPTED; 266 return EFSCORRUPTED;
267 } 267 }
268 268
269 blk_no = round_down(blk_no, log->l_sectBBsize); 269 blk_no = round_down(blk_no, log->l_sectBBsize);
270 nbblks = round_up(nbblks, log->l_sectBBsize); 270 nbblks = round_up(nbblks, log->l_sectBBsize);
271 271
272 ASSERT(nbblks > 0); 272 ASSERT(nbblks > 0);
273 ASSERT(nbblks <= bp->b_length); 273 ASSERT(nbblks <= bp->b_length);
274 274
275 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); 275 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
276 XFS_BUF_ZEROFLAGS(bp); 276 XFS_BUF_ZEROFLAGS(bp);
277 xfs_buf_hold(bp); 277 xfs_buf_hold(bp);
278 xfs_buf_lock(bp); 278 xfs_buf_lock(bp);
279 bp->b_io_length = nbblks; 279 bp->b_io_length = nbblks;
280 bp->b_error = 0; 280 bp->b_error = 0;
281 281
282 error = xfs_bwrite(bp); 282 error = xfs_bwrite(bp);
283 if (error) 283 if (error)
284 xfs_buf_ioerror_alert(bp, __func__); 284 xfs_buf_ioerror_alert(bp, __func__);
285 xfs_buf_relse(bp); 285 xfs_buf_relse(bp);
286 return error; 286 return error;
287 } 287 }
288 288
289 #ifdef DEBUG 289 #ifdef DEBUG
290 /* 290 /*
291 * dump debug superblock and log record information 291 * dump debug superblock and log record information
292 */ 292 */
293 STATIC void 293 STATIC void
294 xlog_header_check_dump( 294 xlog_header_check_dump(
295 xfs_mount_t *mp, 295 xfs_mount_t *mp,
296 xlog_rec_header_t *head) 296 xlog_rec_header_t *head)
297 { 297 {
298 xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d\n", 298 xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d\n",
299 __func__, &mp->m_sb.sb_uuid, XLOG_FMT); 299 __func__, &mp->m_sb.sb_uuid, XLOG_FMT);
300 xfs_debug(mp, " log : uuid = %pU, fmt = %d\n", 300 xfs_debug(mp, " log : uuid = %pU, fmt = %d\n",
301 &head->h_fs_uuid, be32_to_cpu(head->h_fmt)); 301 &head->h_fs_uuid, be32_to_cpu(head->h_fmt));
302 } 302 }
303 #else 303 #else
304 #define xlog_header_check_dump(mp, head) 304 #define xlog_header_check_dump(mp, head)
305 #endif 305 #endif
306 306
307 /* 307 /*
308 * check log record header for recovery 308 * check log record header for recovery
309 */ 309 */
310 STATIC int 310 STATIC int
311 xlog_header_check_recover( 311 xlog_header_check_recover(
312 xfs_mount_t *mp, 312 xfs_mount_t *mp,
313 xlog_rec_header_t *head) 313 xlog_rec_header_t *head)
314 { 314 {
315 ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)); 315 ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
316 316
317 /* 317 /*
318 * IRIX doesn't write the h_fmt field and leaves it zeroed 318 * IRIX doesn't write the h_fmt field and leaves it zeroed
319 * (XLOG_FMT_UNKNOWN). This stops us from trying to recover 319 * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
320 * a dirty log created in IRIX. 320 * a dirty log created in IRIX.
321 */ 321 */
322 if (unlikely(head->h_fmt != cpu_to_be32(XLOG_FMT))) { 322 if (unlikely(head->h_fmt != cpu_to_be32(XLOG_FMT))) {
323 xfs_warn(mp, 323 xfs_warn(mp,
324 "dirty log written in incompatible format - can't recover"); 324 "dirty log written in incompatible format - can't recover");
325 xlog_header_check_dump(mp, head); 325 xlog_header_check_dump(mp, head);
326 XFS_ERROR_REPORT("xlog_header_check_recover(1)", 326 XFS_ERROR_REPORT("xlog_header_check_recover(1)",
327 XFS_ERRLEVEL_HIGH, mp); 327 XFS_ERRLEVEL_HIGH, mp);
328 return XFS_ERROR(EFSCORRUPTED); 328 return XFS_ERROR(EFSCORRUPTED);
329 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { 329 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
330 xfs_warn(mp, 330 xfs_warn(mp,
331 "dirty log entry has mismatched uuid - can't recover"); 331 "dirty log entry has mismatched uuid - can't recover");
332 xlog_header_check_dump(mp, head); 332 xlog_header_check_dump(mp, head);
333 XFS_ERROR_REPORT("xlog_header_check_recover(2)", 333 XFS_ERROR_REPORT("xlog_header_check_recover(2)",
334 XFS_ERRLEVEL_HIGH, mp); 334 XFS_ERRLEVEL_HIGH, mp);
335 return XFS_ERROR(EFSCORRUPTED); 335 return XFS_ERROR(EFSCORRUPTED);
336 } 336 }
337 return 0; 337 return 0;
338 } 338 }
339 339
340 /* 340 /*
341 * read the head block of the log and check the header 341 * read the head block of the log and check the header
342 */ 342 */
343 STATIC int 343 STATIC int
344 xlog_header_check_mount( 344 xlog_header_check_mount(
345 xfs_mount_t *mp, 345 xfs_mount_t *mp,
346 xlog_rec_header_t *head) 346 xlog_rec_header_t *head)
347 { 347 {
348 ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)); 348 ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
349 349
350 if (uuid_is_nil(&head->h_fs_uuid)) { 350 if (uuid_is_nil(&head->h_fs_uuid)) {
351 /* 351 /*
352 * IRIX doesn't write the h_fs_uuid or h_fmt fields. If 352 * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
353 * h_fs_uuid is nil, we assume this log was last mounted 353 * h_fs_uuid is nil, we assume this log was last mounted
354 * by IRIX and continue. 354 * by IRIX and continue.
355 */ 355 */
356 xfs_warn(mp, "nil uuid in log - IRIX style log"); 356 xfs_warn(mp, "nil uuid in log - IRIX style log");
357 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { 357 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
358 xfs_warn(mp, "log has mismatched uuid - can't recover"); 358 xfs_warn(mp, "log has mismatched uuid - can't recover");
359 xlog_header_check_dump(mp, head); 359 xlog_header_check_dump(mp, head);
360 XFS_ERROR_REPORT("xlog_header_check_mount", 360 XFS_ERROR_REPORT("xlog_header_check_mount",
361 XFS_ERRLEVEL_HIGH, mp); 361 XFS_ERRLEVEL_HIGH, mp);
362 return XFS_ERROR(EFSCORRUPTED); 362 return XFS_ERROR(EFSCORRUPTED);
363 } 363 }
364 return 0; 364 return 0;
365 } 365 }
366 366
367 STATIC void 367 STATIC void
368 xlog_recover_iodone( 368 xlog_recover_iodone(
369 struct xfs_buf *bp) 369 struct xfs_buf *bp)
370 { 370 {
371 if (bp->b_error) { 371 if (bp->b_error) {
372 /* 372 /*
373 * We're not going to bother about retrying 373 * We're not going to bother about retrying
374 * this during recovery. One strike! 374 * this during recovery. One strike!
375 */ 375 */
376 xfs_buf_ioerror_alert(bp, __func__); 376 xfs_buf_ioerror_alert(bp, __func__);
377 xfs_force_shutdown(bp->b_target->bt_mount, 377 xfs_force_shutdown(bp->b_target->bt_mount,
378 SHUTDOWN_META_IO_ERROR); 378 SHUTDOWN_META_IO_ERROR);
379 } 379 }
380 bp->b_iodone = NULL; 380 bp->b_iodone = NULL;
381 xfs_buf_ioend(bp, 0); 381 xfs_buf_ioend(bp, 0);
382 } 382 }
383 383
384 /* 384 /*
385 * This routine finds (to an approximation) the first block in the physical 385 * This routine finds (to an approximation) the first block in the physical
386 * log which contains the given cycle. It uses a binary search algorithm. 386 * log which contains the given cycle. It uses a binary search algorithm.
387 * Note that the algorithm can not be perfect because the disk will not 387 * Note that the algorithm can not be perfect because the disk will not
388 * necessarily be perfect. 388 * necessarily be perfect.
389 */ 389 */
390 STATIC int 390 STATIC int
391 xlog_find_cycle_start( 391 xlog_find_cycle_start(
392 struct xlog *log, 392 struct xlog *log,
393 struct xfs_buf *bp, 393 struct xfs_buf *bp,
394 xfs_daddr_t first_blk, 394 xfs_daddr_t first_blk,
395 xfs_daddr_t *last_blk, 395 xfs_daddr_t *last_blk,
396 uint cycle) 396 uint cycle)
397 { 397 {
398 xfs_caddr_t offset; 398 xfs_caddr_t offset;
399 xfs_daddr_t mid_blk; 399 xfs_daddr_t mid_blk;
400 xfs_daddr_t end_blk; 400 xfs_daddr_t end_blk;
401 uint mid_cycle; 401 uint mid_cycle;
402 int error; 402 int error;
403 403
404 end_blk = *last_blk; 404 end_blk = *last_blk;
405 mid_blk = BLK_AVG(first_blk, end_blk); 405 mid_blk = BLK_AVG(first_blk, end_blk);
406 while (mid_blk != first_blk && mid_blk != end_blk) { 406 while (mid_blk != first_blk && mid_blk != end_blk) {
407 error = xlog_bread(log, mid_blk, 1, bp, &offset); 407 error = xlog_bread(log, mid_blk, 1, bp, &offset);
408 if (error) 408 if (error)
409 return error; 409 return error;
410 mid_cycle = xlog_get_cycle(offset); 410 mid_cycle = xlog_get_cycle(offset);
411 if (mid_cycle == cycle) 411 if (mid_cycle == cycle)
412 end_blk = mid_blk; /* last_half_cycle == mid_cycle */ 412 end_blk = mid_blk; /* last_half_cycle == mid_cycle */
413 else 413 else
414 first_blk = mid_blk; /* first_half_cycle == mid_cycle */ 414 first_blk = mid_blk; /* first_half_cycle == mid_cycle */
415 mid_blk = BLK_AVG(first_blk, end_blk); 415 mid_blk = BLK_AVG(first_blk, end_blk);
416 } 416 }
417 ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) || 417 ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
418 (mid_blk == end_blk && mid_blk-1 == first_blk)); 418 (mid_blk == end_blk && mid_blk-1 == first_blk));
419 419
420 *last_blk = end_blk; 420 *last_blk = end_blk;
421 421
422 return 0; 422 return 0;
423 } 423 }
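
A worked example with made-up cycle numbers: if blocks 0-7 hold cycles 3 3 3 3 2 2 2 2 and the routine is called with first_blk = 0, *last_blk = 7 and cycle = 2, successive midpoints 3, 5 and 4 move first_blk past the cycle-3 region and end_blk down through the cycle-2 region, so *last_blk converges on block 4, the first block of the new cycle; callers then verify a window around that point because, as noted above, the on-disk state is only approximately ordered.
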
424 424
425 /* 425 /*
426 * Check that a range of blocks does not contain stop_on_cycle_no. 426 * Check that a range of blocks does not contain stop_on_cycle_no.
427 * Fill in *new_blk with the block offset where such a block is 427 * Fill in *new_blk with the block offset where such a block is
428 * found, or with -1 (an invalid block number) if there is no such 428 * found, or with -1 (an invalid block number) if there is no such
429 * block in the range. The scan needs to occur from front to back 429 * block in the range. The scan needs to occur from front to back
430 * and the pointer into the region must be updated since a later 430 * and the pointer into the region must be updated since a later
431 * routine will need to perform another test. 431 * routine will need to perform another test.
432 */ 432 */
433 STATIC int 433 STATIC int
434 xlog_find_verify_cycle( 434 xlog_find_verify_cycle(
435 struct xlog *log, 435 struct xlog *log,
436 xfs_daddr_t start_blk, 436 xfs_daddr_t start_blk,
437 int nbblks, 437 int nbblks,
438 uint stop_on_cycle_no, 438 uint stop_on_cycle_no,
439 xfs_daddr_t *new_blk) 439 xfs_daddr_t *new_blk)
440 { 440 {
441 xfs_daddr_t i, j; 441 xfs_daddr_t i, j;
442 uint cycle; 442 uint cycle;
443 xfs_buf_t *bp; 443 xfs_buf_t *bp;
444 xfs_daddr_t bufblks; 444 xfs_daddr_t bufblks;
445 xfs_caddr_t buf = NULL; 445 xfs_caddr_t buf = NULL;
446 int error = 0; 446 int error = 0;
447 447
448 /* 448 /*
449 * Greedily allocate a buffer big enough to handle the full 449 * Greedily allocate a buffer big enough to handle the full
450 * range of basic blocks we'll be examining. If that fails, 450 * range of basic blocks we'll be examining. If that fails,
451 * try a smaller size. We need to be able to read at least 451 * try a smaller size. We need to be able to read at least
452 * a log sector, or we're out of luck. 452 * a log sector, or we're out of luck.
453 */ 453 */
454 bufblks = 1 << ffs(nbblks); 454 bufblks = 1 << ffs(nbblks);
455 while (bufblks > log->l_logBBsize) 455 while (bufblks > log->l_logBBsize)
456 bufblks >>= 1; 456 bufblks >>= 1;
457 while (!(bp = xlog_get_bp(log, bufblks))) { 457 while (!(bp = xlog_get_bp(log, bufblks))) {
458 bufblks >>= 1; 458 bufblks >>= 1;
459 if (bufblks < log->l_sectBBsize) 459 if (bufblks < log->l_sectBBsize)
460 return ENOMEM; 460 return ENOMEM;
461 } 461 }
462 462
463 for (i = start_blk; i < start_blk + nbblks; i += bufblks) { 463 for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
464 int bcount; 464 int bcount;
465 465
466 bcount = min(bufblks, (start_blk + nbblks - i)); 466 bcount = min(bufblks, (start_blk + nbblks - i));
467 467
468 error = xlog_bread(log, i, bcount, bp, &buf); 468 error = xlog_bread(log, i, bcount, bp, &buf);
469 if (error) 469 if (error)
470 goto out; 470 goto out;
471 471
472 for (j = 0; j < bcount; j++) { 472 for (j = 0; j < bcount; j++) {
473 cycle = xlog_get_cycle(buf); 473 cycle = xlog_get_cycle(buf);
474 if (cycle == stop_on_cycle_no) { 474 if (cycle == stop_on_cycle_no) {
475 *new_blk = i+j; 475 *new_blk = i+j;
476 goto out; 476 goto out;
477 } 477 }
478 478
479 buf += BBSIZE; 479 buf += BBSIZE;
480 } 480 }
481 } 481 }
482 482
483 *new_blk = -1; 483 *new_blk = -1;
484 484
485 out: 485 out:
486 xlog_put_bp(bp); 486 xlog_put_bp(bp);
487 return error; 487 return error;
488 } 488 }
489 489
490 /* 490 /*
491 * Potentially backup over partial log record write. 491 * Potentially backup over partial log record write.
492 * 492 *
493 * In the typical case, last_blk is the number of the block directly after 493 * In the typical case, last_blk is the number of the block directly after
494 * a good log record. Therefore, we subtract one to get the block number 494 * a good log record. Therefore, we subtract one to get the block number
495 * of the last block in the given buffer. extra_bblks contains the number 495 * of the last block in the given buffer. extra_bblks contains the number
496 * of blocks we would have read on a previous read. This happens when the 496 * of blocks we would have read on a previous read. This happens when the
497 * last log record is split over the end of the physical log. 497 * last log record is split over the end of the physical log.
498 * 498 *
499 * extra_bblks is the number of blocks potentially verified on a previous 499 * extra_bblks is the number of blocks potentially verified on a previous
500 * call to this routine. 500 * call to this routine.
501 */ 501 */
502 STATIC int 502 STATIC int
503 xlog_find_verify_log_record( 503 xlog_find_verify_log_record(
504 struct xlog *log, 504 struct xlog *log,
505 xfs_daddr_t start_blk, 505 xfs_daddr_t start_blk,
506 xfs_daddr_t *last_blk, 506 xfs_daddr_t *last_blk,
507 int extra_bblks) 507 int extra_bblks)
508 { 508 {
509 xfs_daddr_t i; 509 xfs_daddr_t i;
510 xfs_buf_t *bp; 510 xfs_buf_t *bp;
511 xfs_caddr_t offset = NULL; 511 xfs_caddr_t offset = NULL;
512 xlog_rec_header_t *head = NULL; 512 xlog_rec_header_t *head = NULL;
513 int error = 0; 513 int error = 0;
514 int smallmem = 0; 514 int smallmem = 0;
515 int num_blks = *last_blk - start_blk; 515 int num_blks = *last_blk - start_blk;
516 int xhdrs; 516 int xhdrs;
517 517
518 ASSERT(start_blk != 0 || *last_blk != start_blk); 518 ASSERT(start_blk != 0 || *last_blk != start_blk);
519 519
520 if (!(bp = xlog_get_bp(log, num_blks))) { 520 if (!(bp = xlog_get_bp(log, num_blks))) {
521 if (!(bp = xlog_get_bp(log, 1))) 521 if (!(bp = xlog_get_bp(log, 1)))
522 return ENOMEM; 522 return ENOMEM;
523 smallmem = 1; 523 smallmem = 1;
524 } else { 524 } else {
525 error = xlog_bread(log, start_blk, num_blks, bp, &offset); 525 error = xlog_bread(log, start_blk, num_blks, bp, &offset);
526 if (error) 526 if (error)
527 goto out; 527 goto out;
528 offset += ((num_blks - 1) << BBSHIFT); 528 offset += ((num_blks - 1) << BBSHIFT);
529 } 529 }
530 530
531 for (i = (*last_blk) - 1; i >= 0; i--) { 531 for (i = (*last_blk) - 1; i >= 0; i--) {
532 if (i < start_blk) { 532 if (i < start_blk) {
533 /* valid log record not found */ 533 /* valid log record not found */
534 xfs_warn(log->l_mp, 534 xfs_warn(log->l_mp,
535 "Log inconsistent (didn't find previous header)"); 535 "Log inconsistent (didn't find previous header)");
536 ASSERT(0); 536 ASSERT(0);
537 error = XFS_ERROR(EIO); 537 error = XFS_ERROR(EIO);
538 goto out; 538 goto out;
539 } 539 }
540 540
541 if (smallmem) { 541 if (smallmem) {
542 error = xlog_bread(log, i, 1, bp, &offset); 542 error = xlog_bread(log, i, 1, bp, &offset);
543 if (error) 543 if (error)
544 goto out; 544 goto out;
545 } 545 }
546 546
547 head = (xlog_rec_header_t *)offset; 547 head = (xlog_rec_header_t *)offset;
548 548
549 if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) 549 if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
550 break; 550 break;
551 551
552 if (!smallmem) 552 if (!smallmem)
553 offset -= BBSIZE; 553 offset -= BBSIZE;
554 } 554 }
555 555
556 /* 556 /*
557 * We hit the beginning of the physical log & still no header. Return 557 * We hit the beginning of the physical log & still no header. Return
558 * to caller. If caller can handle a return of -1, then this routine 558 * to caller. If caller can handle a return of -1, then this routine
559 * will be called again for the end of the physical log. 559 * will be called again for the end of the physical log.
560 */ 560 */
561 if (i == -1) { 561 if (i == -1) {
562 error = -1; 562 error = -1;
563 goto out; 563 goto out;
564 } 564 }
565 565
566 /* 566 /*
567 * We have the final block of the good log (the first block 567 * We have the final block of the good log (the first block
568 * of the log record _before_ the head). So we check the uuid. 568 * of the log record _before_ the head). So we check the uuid.
569 */ 569 */
570 if ((error = xlog_header_check_mount(log->l_mp, head))) 570 if ((error = xlog_header_check_mount(log->l_mp, head)))
571 goto out; 571 goto out;
572 572
573 /* 573 /*
574 * We may have found a log record header before we expected one. 574 * We may have found a log record header before we expected one.
575 * last_blk will be the 1st block # with a given cycle #. We may end 575 * last_blk will be the 1st block # with a given cycle #. We may end
576 * up reading an entire log record. In this case, we don't want to 576 * up reading an entire log record. In this case, we don't want to
577 * reset last_blk. Only when last_blk points in the middle of a log 577 * reset last_blk. Only when last_blk points in the middle of a log
578 * record do we update last_blk. 578 * record do we update last_blk.
579 */ 579 */
580 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 580 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
581 uint h_size = be32_to_cpu(head->h_size); 581 uint h_size = be32_to_cpu(head->h_size);
582 582
583 xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE; 583 xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
584 if (h_size % XLOG_HEADER_CYCLE_SIZE) 584 if (h_size % XLOG_HEADER_CYCLE_SIZE)
585 xhdrs++; 585 xhdrs++;
586 } else { 586 } else {
587 xhdrs = 1; 587 xhdrs = 1;
588 } 588 }
589 589
590 if (*last_blk - i + extra_bblks != 590 if (*last_blk - i + extra_bblks !=
591 BTOBB(be32_to_cpu(head->h_len)) + xhdrs) 591 BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
592 *last_blk = i; 592 *last_blk = i;
593 593
594 out: 594 out:
595 xlog_put_bp(bp); 595 xlog_put_bp(bp);
596 return error; 596 return error;
597 } 597 }
598 598
599 /* 599 /*
600 * Head is defined to be the point of the log where the next log write 600 * Head is defined to be the point of the log where the next log write
601 * could go. This means that incomplete LR writes at the end are 601 * could go. This means that incomplete LR writes at the end are
602 * eliminated when calculating the head. We aren't guaranteed that previous 602 * eliminated when calculating the head. We aren't guaranteed that previous
603 * LR have complete transactions. We only know that a cycle number of 603 * LR have complete transactions. We only know that a cycle number of
604 * current cycle number -1 won't be present in the log if we start writing 604 * current cycle number -1 won't be present in the log if we start writing
605 * from our current block number. 605 * from our current block number.
606 * 606 *
607 * last_blk contains the block number of the first block with a given 607 * last_blk contains the block number of the first block with a given
608 * cycle number. 608 * cycle number.
609 * 609 *
610 * Return: zero if normal, non-zero if error. 610 * Return: zero if normal, non-zero if error.
611 */ 611 */
612 STATIC int 612 STATIC int
613 xlog_find_head( 613 xlog_find_head(
614 struct xlog *log, 614 struct xlog *log,
615 xfs_daddr_t *return_head_blk) 615 xfs_daddr_t *return_head_blk)
616 { 616 {
617 xfs_buf_t *bp; 617 xfs_buf_t *bp;
618 xfs_caddr_t offset; 618 xfs_caddr_t offset;
619 xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk; 619 xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk;
620 int num_scan_bblks; 620 int num_scan_bblks;
621 uint first_half_cycle, last_half_cycle; 621 uint first_half_cycle, last_half_cycle;
622 uint stop_on_cycle; 622 uint stop_on_cycle;
623 int error, log_bbnum = log->l_logBBsize; 623 int error, log_bbnum = log->l_logBBsize;
624 624
625 /* Is the end of the log device zeroed? */ 625 /* Is the end of the log device zeroed? */
626 if ((error = xlog_find_zeroed(log, &first_blk)) == -1) { 626 if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
627 *return_head_blk = first_blk; 627 *return_head_blk = first_blk;
628 628
629 /* Is the whole lot zeroed? */ 629 /* Is the whole lot zeroed? */
630 if (!first_blk) { 630 if (!first_blk) {
631 /* Linux XFS shouldn't generate totally zeroed logs - 631 /* Linux XFS shouldn't generate totally zeroed logs -
632 * mkfs etc write a dummy unmount record to a fresh 632 * mkfs etc write a dummy unmount record to a fresh
633 * log so we can store the uuid in there 633 * log so we can store the uuid in there
634 */ 634 */
635 xfs_warn(log->l_mp, "totally zeroed log"); 635 xfs_warn(log->l_mp, "totally zeroed log");
636 } 636 }
637 637
638 return 0; 638 return 0;
639 } else if (error) { 639 } else if (error) {
640 xfs_warn(log->l_mp, "empty log check failed"); 640 xfs_warn(log->l_mp, "empty log check failed");
641 return error; 641 return error;
642 } 642 }
643 643
644 first_blk = 0; /* get cycle # of 1st block */ 644 first_blk = 0; /* get cycle # of 1st block */
645 bp = xlog_get_bp(log, 1); 645 bp = xlog_get_bp(log, 1);
646 if (!bp) 646 if (!bp)
647 return ENOMEM; 647 return ENOMEM;
648 648
649 error = xlog_bread(log, 0, 1, bp, &offset); 649 error = xlog_bread(log, 0, 1, bp, &offset);
650 if (error) 650 if (error)
651 goto bp_err; 651 goto bp_err;
652 652
653 first_half_cycle = xlog_get_cycle(offset); 653 first_half_cycle = xlog_get_cycle(offset);
654 654
655 last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */ 655 last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */
656 error = xlog_bread(log, last_blk, 1, bp, &offset); 656 error = xlog_bread(log, last_blk, 1, bp, &offset);
657 if (error) 657 if (error)
658 goto bp_err; 658 goto bp_err;
659 659
660 last_half_cycle = xlog_get_cycle(offset); 660 last_half_cycle = xlog_get_cycle(offset);
661 ASSERT(last_half_cycle != 0); 661 ASSERT(last_half_cycle != 0);
662 662
663 /* 663 /*
664 * If the 1st half cycle number is equal to the last half cycle number, 664 * If the 1st half cycle number is equal to the last half cycle number,
665 * then the entire log is stamped with the same cycle number. In this 665 * then the entire log is stamped with the same cycle number. In this
666 * case, head_blk can't be set to zero (which makes sense). The below 666 * case, head_blk can't be set to zero (which makes sense). The below
667 * math doesn't work out properly with head_blk equal to zero. Instead, 667 * math doesn't work out properly with head_blk equal to zero. Instead,
668 * we set it to log_bbnum which is an invalid block number, but this 668 * we set it to log_bbnum which is an invalid block number, but this
669 * value makes the math correct. If head_blk doesn't change through 669 * value makes the math correct. If head_blk doesn't change through
670 * all the tests below, *head_blk is set to zero at the very end rather 670 * all the tests below, *head_blk is set to zero at the very end rather
671 * than log_bbnum. In a sense, log_bbnum and zero are the same block 671 * than log_bbnum. In a sense, log_bbnum and zero are the same block
672 * in a circular file. 672 * in a circular file.
673 */ 673 */
674 if (first_half_cycle == last_half_cycle) { 674 if (first_half_cycle == last_half_cycle) {
675 /* 675 /*
676 * In this case we believe that the entire log should have 676 * In this case we believe that the entire log should have
677 * cycle number last_half_cycle. We need to scan backwards 677 * cycle number last_half_cycle. We need to scan backwards
678 * from the end verifying that there are no holes still 678 * from the end verifying that there are no holes still
679 * containing last_half_cycle - 1. If we find such a hole, 679 * containing last_half_cycle - 1. If we find such a hole,
680 * then the start of that hole will be the new head. The 680 * then the start of that hole will be the new head. The
681 * simple case looks like 681 * simple case looks like
682 * x | x ... | x - 1 | x 682 * x | x ... | x - 1 | x
683 * Another case that fits this picture would be 683 * Another case that fits this picture would be
684 * x | x + 1 | x ... | x 684 * x | x + 1 | x ... | x
685 * In this case the head really is somewhere at the end of the 685 * In this case the head really is somewhere at the end of the
686 * log, as one of the latest writes at the beginning was 686 * log, as one of the latest writes at the beginning was
687 * incomplete. 687 * incomplete.
688 * One more case is 688 * One more case is
689 * x | x + 1 | x ... | x - 1 | x 689 * x | x + 1 | x ... | x - 1 | x
690 * This is really the combination of the above two cases, and 690 * This is really the combination of the above two cases, and
691 * the head has to end up at the start of the x-1 hole at the 691 * the head has to end up at the start of the x-1 hole at the
692 * end of the log. 692 * end of the log.
693 * 693 *
694 * In the 256k log case, we will read from the beginning to the 694 * In the 256k log case, we will read from the beginning to the
695 * end of the log and search for cycle numbers equal to x-1. 695 * end of the log and search for cycle numbers equal to x-1.
696 * We don't worry about the x+1 blocks that we encounter, 696 * We don't worry about the x+1 blocks that we encounter,
697 * because we know that they cannot be the head since the log 697 * because we know that they cannot be the head since the log
698 * started with x. 698 * started with x.
699 */ 699 */
700 head_blk = log_bbnum; 700 head_blk = log_bbnum;
701 stop_on_cycle = last_half_cycle - 1; 701 stop_on_cycle = last_half_cycle - 1;
702 } else { 702 } else {
703 /* 703 /*
704 * In this case we want to find the first block with cycle 704 * In this case we want to find the first block with cycle
705 * number matching last_half_cycle. We expect the log to be 705 * number matching last_half_cycle. We expect the log to be
706 * some variation on 706 * some variation on
707 * x + 1 ... | x ... | x 707 * x + 1 ... | x ... | x
708 * The first block with cycle number x (last_half_cycle) will 708 * The first block with cycle number x (last_half_cycle) will
709 * be where the new head belongs. First we do a binary search 709 * be where the new head belongs. First we do a binary search
710 * for the first occurrence of last_half_cycle. The binary 710 * for the first occurrence of last_half_cycle. The binary
711 * search may not be totally accurate, so then we scan back 711 * search may not be totally accurate, so then we scan back
712 * from there looking for occurrences of last_half_cycle before 712 * from there looking for occurrences of last_half_cycle before
713 * us. If that backwards scan wraps around the beginning of 713 * us. If that backwards scan wraps around the beginning of
714 * the log, then we look for occurrences of last_half_cycle - 1 714 * the log, then we look for occurrences of last_half_cycle - 1
715 * at the end of the log. The cases we're looking for look 715 * at the end of the log. The cases we're looking for look
716 * like 716 * like
717 * v binary search stopped here 717 * v binary search stopped here
718 * x + 1 ... | x | x + 1 | x ... | x 718 * x + 1 ... | x | x + 1 | x ... | x
719 * ^ but we want to locate this spot 719 * ^ but we want to locate this spot
720 * or 720 * or
721 * <---------> less than scan distance 721 * <---------> less than scan distance
722 * x + 1 ... | x ... | x - 1 | x 722 * x + 1 ... | x ... | x - 1 | x
723 * ^ we want to locate this spot 723 * ^ we want to locate this spot
724 */ 724 */
725 stop_on_cycle = last_half_cycle; 725 stop_on_cycle = last_half_cycle;
726 if ((error = xlog_find_cycle_start(log, bp, first_blk, 726 if ((error = xlog_find_cycle_start(log, bp, first_blk,
727 &head_blk, last_half_cycle))) 727 &head_blk, last_half_cycle)))
728 goto bp_err; 728 goto bp_err;
729 } 729 }
730 730
731 /* 731 /*
732 * Now validate the answer. Scan back some number of maximum possible 732 * Now validate the answer. Scan back some number of maximum possible
733 * blocks and make sure each one has the expected cycle number. The 733 * blocks and make sure each one has the expected cycle number. The
734 * maximum is determined by the total possible amount of buffering 734 * maximum is determined by the total possible amount of buffering
735 * in the in-core log. The following number can be made tighter if 735 * in the in-core log. The following number can be made tighter if
736 * we actually look at the block size of the filesystem. 736 * we actually look at the block size of the filesystem.
737 */ 737 */
738 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log); 738 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
739 if (head_blk >= num_scan_bblks) { 739 if (head_blk >= num_scan_bblks) {
740 /* 740 /*
741 * We are guaranteed that the entire check can be performed 741 * We are guaranteed that the entire check can be performed
742 * in one buffer. 742 * in one buffer.
743 */ 743 */
744 start_blk = head_blk - num_scan_bblks; 744 start_blk = head_blk - num_scan_bblks;
745 if ((error = xlog_find_verify_cycle(log, 745 if ((error = xlog_find_verify_cycle(log,
746 start_blk, num_scan_bblks, 746 start_blk, num_scan_bblks,
747 stop_on_cycle, &new_blk))) 747 stop_on_cycle, &new_blk)))
748 goto bp_err; 748 goto bp_err;
749 if (new_blk != -1) 749 if (new_blk != -1)
750 head_blk = new_blk; 750 head_blk = new_blk;
751 } else { /* need to read 2 parts of log */ 751 } else { /* need to read 2 parts of log */
752 /* 752 /*
753 * We are going to scan backwards in the log in two parts. 753 * We are going to scan backwards in the log in two parts.
754 * First we scan the physical end of the log. In this part 754 * First we scan the physical end of the log. In this part
755 * of the log, we are looking for blocks with cycle number 755 * of the log, we are looking for blocks with cycle number
756 * last_half_cycle - 1. 756 * last_half_cycle - 1.
757 * If we find one, then we know that the log starts there, as 757 * If we find one, then we know that the log starts there, as
758 * we've found a hole that didn't get written in going around 758 * we've found a hole that didn't get written in going around
759 * the end of the physical log. The simple case for this is 759 * the end of the physical log. The simple case for this is
760 * x + 1 ... | x ... | x - 1 | x 760 * x + 1 ... | x ... | x - 1 | x
761 * <---------> less than scan distance 761 * <---------> less than scan distance
762 * If all of the blocks at the end of the log have cycle number 762 * If all of the blocks at the end of the log have cycle number
763 * last_half_cycle, then we check the blocks at the start of 763 * last_half_cycle, then we check the blocks at the start of
764 * the log looking for occurrences of last_half_cycle. If we 764 * the log looking for occurrences of last_half_cycle. If we
765 * find one, then our current estimate for the location of the 765 * find one, then our current estimate for the location of the
766 * first occurrence of last_half_cycle is wrong and we move 766 * first occurrence of last_half_cycle is wrong and we move
767 * back to the hole we've found. This case looks like 767 * back to the hole we've found. This case looks like
768 * x + 1 ... | x | x + 1 | x ... 768 * x + 1 ... | x | x + 1 | x ...
769 * ^ binary search stopped here 769 * ^ binary search stopped here
770 * Another case we need to handle that only occurs in 256k 770 * Another case we need to handle that only occurs in 256k
771 * logs is 771 * logs is
772 * x + 1 ... | x ... | x+1 | x ... 772 * x + 1 ... | x ... | x+1 | x ...
773 * ^ binary search stops here 773 * ^ binary search stops here
774 * In a 256k log, the scan at the end of the log will see the 774 * In a 256k log, the scan at the end of the log will see the
775 * x + 1 blocks. We need to skip past those since that is 775 * x + 1 blocks. We need to skip past those since that is
776 * certainly not the head of the log. By searching for 776 * certainly not the head of the log. By searching for
777 * last_half_cycle-1 we accomplish that. 777 * last_half_cycle-1 we accomplish that.
778 */ 778 */
779 ASSERT(head_blk <= INT_MAX && 779 ASSERT(head_blk <= INT_MAX &&
780 (xfs_daddr_t) num_scan_bblks >= head_blk); 780 (xfs_daddr_t) num_scan_bblks >= head_blk);
781 start_blk = log_bbnum - (num_scan_bblks - head_blk); 781 start_blk = log_bbnum - (num_scan_bblks - head_blk);
782 if ((error = xlog_find_verify_cycle(log, start_blk, 782 if ((error = xlog_find_verify_cycle(log, start_blk,
783 num_scan_bblks - (int)head_blk, 783 num_scan_bblks - (int)head_blk,
784 (stop_on_cycle - 1), &new_blk))) 784 (stop_on_cycle - 1), &new_blk)))
785 goto bp_err; 785 goto bp_err;
786 if (new_blk != -1) { 786 if (new_blk != -1) {
787 head_blk = new_blk; 787 head_blk = new_blk;
788 goto validate_head; 788 goto validate_head;
789 } 789 }
790 790
791 /* 791 /*
792 * Scan beginning of log now. The last part of the physical 792 * Scan beginning of log now. The last part of the physical
793 * log is good. This scan needs to verify that it doesn't find 793 * log is good. This scan needs to verify that it doesn't find
794 * the last_half_cycle. 794 * the last_half_cycle.
795 */ 795 */
796 start_blk = 0; 796 start_blk = 0;
797 ASSERT(head_blk <= INT_MAX); 797 ASSERT(head_blk <= INT_MAX);
798 if ((error = xlog_find_verify_cycle(log, 798 if ((error = xlog_find_verify_cycle(log,
799 start_blk, (int)head_blk, 799 start_blk, (int)head_blk,
800 stop_on_cycle, &new_blk))) 800 stop_on_cycle, &new_blk)))
801 goto bp_err; 801 goto bp_err;
802 if (new_blk != -1) 802 if (new_blk != -1)
803 head_blk = new_blk; 803 head_blk = new_blk;
804 } 804 }
805 805
806 validate_head: 806 validate_head:
807 /* 807 /*
808 * Now we need to make sure head_blk is not pointing to a block in 808 * Now we need to make sure head_blk is not pointing to a block in
809 * the middle of a log record. 809 * the middle of a log record.
810 */ 810 */
811 num_scan_bblks = XLOG_REC_SHIFT(log); 811 num_scan_bblks = XLOG_REC_SHIFT(log);
812 if (head_blk >= num_scan_bblks) { 812 if (head_blk >= num_scan_bblks) {
813 start_blk = head_blk - num_scan_bblks; /* don't read head_blk */ 813 start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
814 814
815 /* start ptr at last block ptr before head_blk */ 815 /* start ptr at last block ptr before head_blk */
816 if ((error = xlog_find_verify_log_record(log, start_blk, 816 if ((error = xlog_find_verify_log_record(log, start_blk,
817 &head_blk, 0)) == -1) { 817 &head_blk, 0)) == -1) {
818 error = XFS_ERROR(EIO); 818 error = XFS_ERROR(EIO);
819 goto bp_err; 819 goto bp_err;
820 } else if (error) 820 } else if (error)
821 goto bp_err; 821 goto bp_err;
822 } else { 822 } else {
823 start_blk = 0; 823 start_blk = 0;
824 ASSERT(head_blk <= INT_MAX); 824 ASSERT(head_blk <= INT_MAX);
825 if ((error = xlog_find_verify_log_record(log, start_blk, 825 if ((error = xlog_find_verify_log_record(log, start_blk,
826 &head_blk, 0)) == -1) { 826 &head_blk, 0)) == -1) {
827 /* We hit the beginning of the log during our search */ 827 /* We hit the beginning of the log during our search */
828 start_blk = log_bbnum - (num_scan_bblks - head_blk); 828 start_blk = log_bbnum - (num_scan_bblks - head_blk);
829 new_blk = log_bbnum; 829 new_blk = log_bbnum;
830 ASSERT(start_blk <= INT_MAX && 830 ASSERT(start_blk <= INT_MAX &&
831 (xfs_daddr_t) log_bbnum-start_blk >= 0); 831 (xfs_daddr_t) log_bbnum-start_blk >= 0);
832 ASSERT(head_blk <= INT_MAX); 832 ASSERT(head_blk <= INT_MAX);
833 if ((error = xlog_find_verify_log_record(log, 833 if ((error = xlog_find_verify_log_record(log,
834 start_blk, &new_blk, 834 start_blk, &new_blk,
835 (int)head_blk)) == -1) { 835 (int)head_blk)) == -1) {
836 error = XFS_ERROR(EIO); 836 error = XFS_ERROR(EIO);
837 goto bp_err; 837 goto bp_err;
838 } else if (error) 838 } else if (error)
839 goto bp_err; 839 goto bp_err;
840 if (new_blk != log_bbnum) 840 if (new_blk != log_bbnum)
841 head_blk = new_blk; 841 head_blk = new_blk;
842 } else if (error) 842 } else if (error)
843 goto bp_err; 843 goto bp_err;
844 } 844 }
845 845
846 xlog_put_bp(bp); 846 xlog_put_bp(bp);
847 if (head_blk == log_bbnum) 847 if (head_blk == log_bbnum)
848 *return_head_blk = 0; 848 *return_head_blk = 0;
849 else 849 else
850 *return_head_blk = head_blk; 850 *return_head_blk = head_blk;
851 /* 851 /*
852 * When returning here, we have a good block number. Bad block 852 * When returning here, we have a good block number. Bad block
853 * means that during a previous crash, we didn't have a clean break 853 * means that during a previous crash, we didn't have a clean break
854 * from cycle number N to cycle number N-1. In this case, we need 854 * from cycle number N to cycle number N-1. In this case, we need
855 * to find the first block with cycle number N-1. 855 * to find the first block with cycle number N-1.
856 */ 856 */
857 return 0; 857 return 0;
858 858
859 bp_err: 859 bp_err:
860 xlog_put_bp(bp); 860 xlog_put_bp(bp);
861 861
862 if (error) 862 if (error)
863 xfs_warn(log->l_mp, "failed to find log head"); 863 xfs_warn(log->l_mp, "failed to find log head");
864 return error; 864 return error;
865 } 865 }
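
For readers following the recovery path, the cycle-number search above is easier to picture with a concrete toy. Every basic block in the log carries the cycle number it was last written under, and the head is where the newest cycle stops and the older one begins. The standalone sketch below (toy_find_head and its inputs are invented for illustration) shows only the binary-search idea; it deliberately ignores the torn-write holes and 256k-log cases that xlog_find_verify_cycle() and xlog_find_verify_log_record() exist to catch.

#include <stdio.h>
#include <stdint.h>

/*
 * Given the cycle number of every basic block in a circular log, find the
 * first block whose cycle number has not been bumped yet -- the head.
 * Assumes the simple case: one clean boundary between the two cycles.
 */
static int toy_find_head(const uint32_t *cycle, int nblocks)
{
	uint32_t first = cycle[0];
	uint32_t last = cycle[nblocks - 1];
	int lo = 0, hi = nblocks - 1;

	if (first == last)
		return nblocks;	/* one cycle everywhere: head at physical end */

	/* binary search for the first block still carrying the older cycle */
	while (lo < hi) {
		int mid = lo + (hi - lo) / 2;

		if (cycle[mid] == first)
			lo = mid + 1;
		else
			hi = mid;
	}
	return lo;
}

int main(void)
{
	/* cycle 5 has wrapped partway over the blocks still holding cycle 4 */
	uint32_t cycle[] = { 5, 5, 5, 4, 4, 4, 4, 4 };

	printf("head at block %d\n",
	       toy_find_head(cycle, (int)(sizeof(cycle) / sizeof(cycle[0]))));
	return 0;
}

The verification scans in the real code then walk a bounded window around this candidate, because concurrent log writes mean the cycle boundary is not guaranteed to be a single clean step.
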
866 866
867 /* 867 /*
868 * Find the sync block number or the tail of the log. 868 * Find the sync block number or the tail of the log.
869 * 869 *
870 * This will be the block number of the last record to have its 870 * This will be the block number of the last record to have its
871 * associated buffers synced to disk. Every log record header has 871 * associated buffers synced to disk. Every log record header has
872 * a sync lsn embedded in it. LSNs hold block numbers, so it is easy 872 * a sync lsn embedded in it. LSNs hold block numbers, so it is easy
873 * to get a sync block number. The only concern is to figure out which 873 * to get a sync block number. The only concern is to figure out which
874 * log record header to believe. 874 * log record header to believe.
875 * 875 *
876 * The following algorithm uses the log record header with the largest 876 * The following algorithm uses the log record header with the largest
877 * lsn. The entire log record does not need to be valid. We only care 877 * lsn. The entire log record does not need to be valid. We only care
878 * that the header is valid. 878 * that the header is valid.
879 * 879 *
880 * We could speed up search by using current head_blk buffer, but it is not 880 * We could speed up search by using current head_blk buffer, but it is not
881 * available. 881 * available.
882 */ 882 */
883 STATIC int 883 STATIC int
884 xlog_find_tail( 884 xlog_find_tail(
885 struct xlog *log, 885 struct xlog *log,
886 xfs_daddr_t *head_blk, 886 xfs_daddr_t *head_blk,
887 xfs_daddr_t *tail_blk) 887 xfs_daddr_t *tail_blk)
888 { 888 {
889 xlog_rec_header_t *rhead; 889 xlog_rec_header_t *rhead;
890 xlog_op_header_t *op_head; 890 xlog_op_header_t *op_head;
891 xfs_caddr_t offset = NULL; 891 xfs_caddr_t offset = NULL;
892 xfs_buf_t *bp; 892 xfs_buf_t *bp;
893 int error, i, found; 893 int error, i, found;
894 xfs_daddr_t umount_data_blk; 894 xfs_daddr_t umount_data_blk;
895 xfs_daddr_t after_umount_blk; 895 xfs_daddr_t after_umount_blk;
896 xfs_lsn_t tail_lsn; 896 xfs_lsn_t tail_lsn;
897 int hblks; 897 int hblks;
898 898
899 found = 0; 899 found = 0;
900 900
901 /* 901 /*
902 * Find previous log record 902 * Find previous log record
903 */ 903 */
904 if ((error = xlog_find_head(log, head_blk))) 904 if ((error = xlog_find_head(log, head_blk)))
905 return error; 905 return error;
906 906
907 bp = xlog_get_bp(log, 1); 907 bp = xlog_get_bp(log, 1);
908 if (!bp) 908 if (!bp)
909 return ENOMEM; 909 return ENOMEM;
910 if (*head_blk == 0) { /* special case */ 910 if (*head_blk == 0) { /* special case */
911 error = xlog_bread(log, 0, 1, bp, &offset); 911 error = xlog_bread(log, 0, 1, bp, &offset);
912 if (error) 912 if (error)
913 goto done; 913 goto done;
914 914
915 if (xlog_get_cycle(offset) == 0) { 915 if (xlog_get_cycle(offset) == 0) {
916 *tail_blk = 0; 916 *tail_blk = 0;
917 /* leave all other log inited values alone */ 917 /* leave all other log inited values alone */
918 goto done; 918 goto done;
919 } 919 }
920 } 920 }
921 921
922 /* 922 /*
923 * Search backwards looking for log record header block 923 * Search backwards looking for log record header block
924 */ 924 */
925 ASSERT(*head_blk < INT_MAX); 925 ASSERT(*head_blk < INT_MAX);
926 for (i = (int)(*head_blk) - 1; i >= 0; i--) { 926 for (i = (int)(*head_blk) - 1; i >= 0; i--) {
927 error = xlog_bread(log, i, 1, bp, &offset); 927 error = xlog_bread(log, i, 1, bp, &offset);
928 if (error) 928 if (error)
929 goto done; 929 goto done;
930 930
931 if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { 931 if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
932 found = 1; 932 found = 1;
933 break; 933 break;
934 } 934 }
935 } 935 }
936 /* 936 /*
937 * If we haven't found the log record header block, start looking 937 * If we haven't found the log record header block, start looking
938 * again from the end of the physical log. XXXmiken: There should be 938 * again from the end of the physical log. XXXmiken: There should be
939 * a check here to make sure we didn't search more than N blocks in 939 * a check here to make sure we didn't search more than N blocks in
940 * the previous code. 940 * the previous code.
941 */ 941 */
942 if (!found) { 942 if (!found) {
943 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { 943 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
944 error = xlog_bread(log, i, 1, bp, &offset); 944 error = xlog_bread(log, i, 1, bp, &offset);
945 if (error) 945 if (error)
946 goto done; 946 goto done;
947 947
948 if (*(__be32 *)offset == 948 if (*(__be32 *)offset ==
949 cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { 949 cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
950 found = 2; 950 found = 2;
951 break; 951 break;
952 } 952 }
953 } 953 }
954 } 954 }
955 if (!found) { 955 if (!found) {
956 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); 956 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
957 ASSERT(0); 957 ASSERT(0);
958 return XFS_ERROR(EIO); 958 return XFS_ERROR(EIO);
959 } 959 }
960 960
961 /* find blk_no of tail of log */ 961 /* find blk_no of tail of log */
962 rhead = (xlog_rec_header_t *)offset; 962 rhead = (xlog_rec_header_t *)offset;
963 *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn)); 963 *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
964 964
965 /* 965 /*
966 * Reset log values according to the state of the log when we 966 * Reset log values according to the state of the log when we
967 * crashed. In the case where head_blk == 0, we bump curr_cycle 967 * crashed. In the case where head_blk == 0, we bump curr_cycle
968 * one because the next write starts a new cycle rather than 968 * one because the next write starts a new cycle rather than
969 * continuing the cycle of the last good log record. At this 969 * continuing the cycle of the last good log record. At this
970 * point we have guaranteed that all partial log records have been 970 * point we have guaranteed that all partial log records have been
971 * accounted for. Therefore, we know that the last good log record 971 * accounted for. Therefore, we know that the last good log record
972 * written was complete and ended exactly on the end boundary 972 * written was complete and ended exactly on the end boundary
973 * of the physical log. 973 * of the physical log.
974 */ 974 */
975 log->l_prev_block = i; 975 log->l_prev_block = i;
976 log->l_curr_block = (int)*head_blk; 976 log->l_curr_block = (int)*head_blk;
977 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); 977 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
978 if (found == 2) 978 if (found == 2)
979 log->l_curr_cycle++; 979 log->l_curr_cycle++;
980 atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); 980 atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
981 atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); 981 atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
982 xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle, 982 xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
983 BBTOB(log->l_curr_block)); 983 BBTOB(log->l_curr_block));
984 xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle, 984 xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
985 BBTOB(log->l_curr_block)); 985 BBTOB(log->l_curr_block));
986 986
987 /* 987 /*
988 * Look for unmount record. If we find it, then we know there 988 * Look for unmount record. If we find it, then we know there
989 * was a clean unmount. Since 'i' could be the last block in 989 * was a clean unmount. Since 'i' could be the last block in
990 * the physical log, we convert to a log block before comparing 990 * the physical log, we convert to a log block before comparing
991 * to the head_blk. 991 * to the head_blk.
992 * 992 *
993 * Save the current tail lsn to use to pass to 993 * Save the current tail lsn to use to pass to
994 * xlog_clear_stale_blocks() below. We won't want to clear the 994 * xlog_clear_stale_blocks() below. We won't want to clear the
995 * unmount record if there is one, so we pass the lsn of the 995 * unmount record if there is one, so we pass the lsn of the
996 * unmount record rather than the block after it. 996 * unmount record rather than the block after it.
997 */ 997 */
998 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 998 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
999 int h_size = be32_to_cpu(rhead->h_size); 999 int h_size = be32_to_cpu(rhead->h_size);
1000 int h_version = be32_to_cpu(rhead->h_version); 1000 int h_version = be32_to_cpu(rhead->h_version);
1001 1001
1002 if ((h_version & XLOG_VERSION_2) && 1002 if ((h_version & XLOG_VERSION_2) &&
1003 (h_size > XLOG_HEADER_CYCLE_SIZE)) { 1003 (h_size > XLOG_HEADER_CYCLE_SIZE)) {
1004 hblks = h_size / XLOG_HEADER_CYCLE_SIZE; 1004 hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
1005 if (h_size % XLOG_HEADER_CYCLE_SIZE) 1005 if (h_size % XLOG_HEADER_CYCLE_SIZE)
1006 hblks++; 1006 hblks++;
1007 } else { 1007 } else {
1008 hblks = 1; 1008 hblks = 1;
1009 } 1009 }
1010 } else { 1010 } else {
1011 hblks = 1; 1011 hblks = 1;
1012 } 1012 }
1013 after_umount_blk = (i + hblks + (int) 1013 after_umount_blk = (i + hblks + (int)
1014 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; 1014 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
1015 tail_lsn = atomic64_read(&log->l_tail_lsn); 1015 tail_lsn = atomic64_read(&log->l_tail_lsn);
1016 if (*head_blk == after_umount_blk && 1016 if (*head_blk == after_umount_blk &&
1017 be32_to_cpu(rhead->h_num_logops) == 1) { 1017 be32_to_cpu(rhead->h_num_logops) == 1) {
1018 umount_data_blk = (i + hblks) % log->l_logBBsize; 1018 umount_data_blk = (i + hblks) % log->l_logBBsize;
1019 error = xlog_bread(log, umount_data_blk, 1, bp, &offset); 1019 error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
1020 if (error) 1020 if (error)
1021 goto done; 1021 goto done;
1022 1022
1023 op_head = (xlog_op_header_t *)offset; 1023 op_head = (xlog_op_header_t *)offset;
1024 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { 1024 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
1025 /* 1025 /*
1026 * Set tail and last sync so that newly written 1026 * Set tail and last sync so that newly written
1027 * log records will point recovery to after the 1027 * log records will point recovery to after the
1028 * current unmount record. 1028 * current unmount record.
1029 */ 1029 */
1030 xlog_assign_atomic_lsn(&log->l_tail_lsn, 1030 xlog_assign_atomic_lsn(&log->l_tail_lsn,
1031 log->l_curr_cycle, after_umount_blk); 1031 log->l_curr_cycle, after_umount_blk);
1032 xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1032 xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
1033 log->l_curr_cycle, after_umount_blk); 1033 log->l_curr_cycle, after_umount_blk);
1034 *tail_blk = after_umount_blk; 1034 *tail_blk = after_umount_blk;
1035 1035
1036 /* 1036 /*
1037 * Note that the unmount was clean. If the unmount 1037 * Note that the unmount was clean. If the unmount
1038 * was not clean, we need to know this to rebuild the 1038 * was not clean, we need to know this to rebuild the
1039 * superblock counters from the perag headers if we 1039 * superblock counters from the perag headers if we
1040 * have a filesystem using non-persistent counters. 1040 * have a filesystem using non-persistent counters.
1041 */ 1041 */
1042 log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN; 1042 log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
1043 } 1043 }
1044 } 1044 }
1045 1045
1046 /* 1046 /*
1047 * Make sure that there are no blocks in front of the head 1047 * Make sure that there are no blocks in front of the head
1048 * with the same cycle number as the head. This can happen 1048 * with the same cycle number as the head. This can happen
1049 * because we allow multiple outstanding log writes concurrently, 1049 * because we allow multiple outstanding log writes concurrently,
1050 * and the later writes might make it out before earlier ones. 1050 * and the later writes might make it out before earlier ones.
1051 * 1051 *
1052 * We use the lsn from before modifying it so that we'll never 1052 * We use the lsn from before modifying it so that we'll never
1053 * overwrite the unmount record after a clean unmount. 1053 * overwrite the unmount record after a clean unmount.
1054 * 1054 *
1055 * Do this only if we are going to recover the filesystem 1055 * Do this only if we are going to recover the filesystem
1056 * 1056 *
1057 * NOTE: This used to say "if (!readonly)" 1057 * NOTE: This used to say "if (!readonly)"
1058 * However on Linux, we can & do recover a read-only filesystem. 1058 * However on Linux, we can & do recover a read-only filesystem.
1059 * We only skip recovery if NORECOVERY is specified on mount, 1059 * We only skip recovery if NORECOVERY is specified on mount,
1060 * in which case we would not be here. 1060 * in which case we would not be here.
1061 * 1061 *
1062 * But... if the -device- itself is readonly, just skip this. 1062 * But... if the -device- itself is readonly, just skip this.
1063 * We can't recover this device anyway, so it won't matter. 1063 * We can't recover this device anyway, so it won't matter.
1064 */ 1064 */
1065 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) 1065 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
1066 error = xlog_clear_stale_blocks(log, tail_lsn); 1066 error = xlog_clear_stale_blocks(log, tail_lsn);
1067 1067
1068 done: 1068 done:
1069 xlog_put_bp(bp); 1069 xlog_put_bp(bp);
1070 1070
1071 if (error) 1071 if (error)
1072 xfs_warn(log->l_mp, "failed to locate log tail"); 1072 xfs_warn(log->l_mp, "failed to locate log tail");
1073 return error; 1073 return error;
1074 } 1074 }
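
The reason BLOCK_LSN(rhead->h_tail_lsn) above yields the tail block directly is that an LSN packs a cycle number into the high 32 bits and a basic-block number into the low 32 bits. A minimal sketch of that packing follows (toy_* names are invented; only the bit layout mirrors what xlog_assign_lsn(), CYCLE_LSN() and BLOCK_LSN() express). The hblks computation above is the related detail for v2 logs: the header can span several sectors, so its size is rounded up to whole XLOG_HEADER_CYCLE_SIZE units before stepping past it.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t toy_lsn_t;

/* cycle in the high half, basic-block number in the low half */
static toy_lsn_t toy_assign_lsn(uint32_t cycle, uint32_t block)
{
	return ((uint64_t)cycle << 32) | block;
}

static uint32_t toy_cycle_lsn(toy_lsn_t lsn) { return (uint32_t)(lsn >> 32); }
static uint32_t toy_block_lsn(toy_lsn_t lsn) { return (uint32_t)lsn; }

int main(void)
{
	toy_lsn_t tail = toy_assign_lsn(7, 1234);

	/* the block recovery starts from is just the low half of the LSN */
	printf("cycle %u, block %u\n", toy_cycle_lsn(tail), toy_block_lsn(tail));
	return 0;
}
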
1075 1075
1076 /* 1076 /*
1077 * Is the log zeroed at all? 1077 * Is the log zeroed at all?
1078 * 1078 *
1079 * The last binary search should be changed to perform an X block read 1079 * The last binary search should be changed to perform an X block read
1080 * once X becomes small enough. You can then search linearly through 1080 * once X becomes small enough. You can then search linearly through
1081 * the X blocks. This will cut down on the number of reads we need to do. 1081 * the X blocks. This will cut down on the number of reads we need to do.
1082 * 1082 *
1083 * If the log is partially zeroed, this routine will pass back the blkno 1083 * If the log is partially zeroed, this routine will pass back the blkno
1084 * of the first block with cycle number 0. It won't have a complete LR 1084 * of the first block with cycle number 0. It won't have a complete LR
1085 * preceding it. 1085 * preceding it.
1086 * 1086 *
1087 * Return: 1087 * Return:
1088 * 0 => the log is completely written to 1088 * 0 => the log is completely written to
1089 * -1 => use *blk_no as the first block of the log 1089 * -1 => use *blk_no as the first block of the log
1090 * >0 => error has occurred 1090 * >0 => error has occurred
1091 */ 1091 */
1092 STATIC int 1092 STATIC int
1093 xlog_find_zeroed( 1093 xlog_find_zeroed(
1094 struct xlog *log, 1094 struct xlog *log,
1095 xfs_daddr_t *blk_no) 1095 xfs_daddr_t *blk_no)
1096 { 1096 {
1097 xfs_buf_t *bp; 1097 xfs_buf_t *bp;
1098 xfs_caddr_t offset; 1098 xfs_caddr_t offset;
1099 uint first_cycle, last_cycle; 1099 uint first_cycle, last_cycle;
1100 xfs_daddr_t new_blk, last_blk, start_blk; 1100 xfs_daddr_t new_blk, last_blk, start_blk;
1101 xfs_daddr_t num_scan_bblks; 1101 xfs_daddr_t num_scan_bblks;
1102 int error, log_bbnum = log->l_logBBsize; 1102 int error, log_bbnum = log->l_logBBsize;
1103 1103
1104 *blk_no = 0; 1104 *blk_no = 0;
1105 1105
1106 /* check totally zeroed log */ 1106 /* check totally zeroed log */
1107 bp = xlog_get_bp(log, 1); 1107 bp = xlog_get_bp(log, 1);
1108 if (!bp) 1108 if (!bp)
1109 return ENOMEM; 1109 return ENOMEM;
1110 error = xlog_bread(log, 0, 1, bp, &offset); 1110 error = xlog_bread(log, 0, 1, bp, &offset);
1111 if (error) 1111 if (error)
1112 goto bp_err; 1112 goto bp_err;
1113 1113
1114 first_cycle = xlog_get_cycle(offset); 1114 first_cycle = xlog_get_cycle(offset);
1115 if (first_cycle == 0) { /* completely zeroed log */ 1115 if (first_cycle == 0) { /* completely zeroed log */
1116 *blk_no = 0; 1116 *blk_no = 0;
1117 xlog_put_bp(bp); 1117 xlog_put_bp(bp);
1118 return -1; 1118 return -1;
1119 } 1119 }
1120 1120
1121 /* check partially zeroed log */ 1121 /* check partially zeroed log */
1122 error = xlog_bread(log, log_bbnum-1, 1, bp, &offset); 1122 error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
1123 if (error) 1123 if (error)
1124 goto bp_err; 1124 goto bp_err;
1125 1125
1126 last_cycle = xlog_get_cycle(offset); 1126 last_cycle = xlog_get_cycle(offset);
1127 if (last_cycle != 0) { /* log completely written to */ 1127 if (last_cycle != 0) { /* log completely written to */
1128 xlog_put_bp(bp); 1128 xlog_put_bp(bp);
1129 return 0; 1129 return 0;
1130 } else if (first_cycle != 1) { 1130 } else if (first_cycle != 1) {
1131 /* 1131 /*
1132 * If the cycle of the last block is zero, the cycle of 1132 * If the cycle of the last block is zero, the cycle of
1133 * the first block must be 1. If it's not, maybe we're 1133 * the first block must be 1. If it's not, maybe we're
1134 * not looking at a log... Bail out. 1134 * not looking at a log... Bail out.
1135 */ 1135 */
1136 xfs_warn(log->l_mp, 1136 xfs_warn(log->l_mp,
1137 "Log inconsistent or not a log (last==0, first!=1)"); 1137 "Log inconsistent or not a log (last==0, first!=1)");
1138 return XFS_ERROR(EINVAL); 1138 return XFS_ERROR(EINVAL);
1139 } 1139 }
1140 1140
1141 /* we have a partially zeroed log */ 1141 /* we have a partially zeroed log */
1142 last_blk = log_bbnum-1; 1142 last_blk = log_bbnum-1;
1143 if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0))) 1143 if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
1144 goto bp_err; 1144 goto bp_err;
1145 1145
1146 /* 1146 /*
1147 * Validate the answer. Because there is no way to guarantee that 1147 * Validate the answer. Because there is no way to guarantee that
1148 * the entire log is made up of log records which are the same size, 1148 * the entire log is made up of log records which are the same size,
1149 * we scan over the defined maximum blocks. At this point, the maximum 1149 * we scan over the defined maximum blocks. At this point, the maximum
1150 * is not chosen to mean anything special. XXXmiken 1150 * is not chosen to mean anything special. XXXmiken
1151 */ 1151 */
1152 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log); 1152 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
1153 ASSERT(num_scan_bblks <= INT_MAX); 1153 ASSERT(num_scan_bblks <= INT_MAX);
1154 1154
1155 if (last_blk < num_scan_bblks) 1155 if (last_blk < num_scan_bblks)
1156 num_scan_bblks = last_blk; 1156 num_scan_bblks = last_blk;
1157 start_blk = last_blk - num_scan_bblks; 1157 start_blk = last_blk - num_scan_bblks;
1158 1158
1159 /* 1159 /*
1160 * We search for any instances of cycle number 0 that occur before 1160 * We search for any instances of cycle number 0 that occur before
1161 * our current estimate of the head. What we're trying to detect is 1161 * our current estimate of the head. What we're trying to detect is
1162 * 1 ... | 0 | 1 | 0... 1162 * 1 ... | 0 | 1 | 0...
1163 * ^ binary search ends here 1163 * ^ binary search ends here
1164 */ 1164 */
1165 if ((error = xlog_find_verify_cycle(log, start_blk, 1165 if ((error = xlog_find_verify_cycle(log, start_blk,
1166 (int)num_scan_bblks, 0, &new_blk))) 1166 (int)num_scan_bblks, 0, &new_blk)))
1167 goto bp_err; 1167 goto bp_err;
1168 if (new_blk != -1) 1168 if (new_blk != -1)
1169 last_blk = new_blk; 1169 last_blk = new_blk;
1170 1170
1171 /* 1171 /*
1172 * Potentially backup over partial log record write. We don't need 1172 * Potentially backup over partial log record write. We don't need
1173 * to search the end of the log because we know it is zero. 1173 * to search the end of the log because we know it is zero.
1174 */ 1174 */
1175 if ((error = xlog_find_verify_log_record(log, start_blk, 1175 if ((error = xlog_find_verify_log_record(log, start_blk,
1176 &last_blk, 0)) == -1) { 1176 &last_blk, 0)) == -1) {
1177 error = XFS_ERROR(EIO); 1177 error = XFS_ERROR(EIO);
1178 goto bp_err; 1178 goto bp_err;
1179 } else if (error) 1179 } else if (error)
1180 goto bp_err; 1180 goto bp_err;
1181 1181
1182 *blk_no = last_blk; 1182 *blk_no = last_blk;
1183 bp_err: 1183 bp_err:
1184 xlog_put_bp(bp); 1184 xlog_put_bp(bp);
1185 if (error) 1185 if (error)
1186 return error; 1186 return error;
1187 return -1; 1187 return -1;
1188 } 1188 }
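
The return convention of xlog_find_zeroed() is unusual enough to be worth spelling out: 0 means the log is completely written, -1 means "use *blk_no as the head", and a positive value is an error. The caller-side sketch below interprets such a tri-state result; toy_find_zeroed() is an invented stand-in, not the kernel interface.

#include <stdio.h>
#include <stdint.h>

typedef int64_t toy_daddr_t;

/* pretend the scan found the log zeroed from block 4096 onwards */
static int toy_find_zeroed(toy_daddr_t *blk_no)
{
	*blk_no = 4096;
	return -1;
}

int main(void)
{
	toy_daddr_t first_zeroed;
	int ret = toy_find_zeroed(&first_zeroed);

	if (ret > 0)
		printf("error %d while scanning the log\n", ret);
	else if (ret == -1)
		printf("log partially zeroed, head at block %lld\n",
		       (long long)first_zeroed);
	else
		printf("log completely written, no zeroed region\n");
	return 0;
}
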
1189 1189
1190 /* 1190 /*
1191 * These are simple subroutines used by xlog_clear_stale_blocks() below 1191 * These are simple subroutines used by xlog_clear_stale_blocks() below
1192 * to initialize a buffer full of empty log record headers and write 1192 * to initialize a buffer full of empty log record headers and write
1193 * them into the log. 1193 * them into the log.
1194 */ 1194 */
1195 STATIC void 1195 STATIC void
1196 xlog_add_record( 1196 xlog_add_record(
1197 struct xlog *log, 1197 struct xlog *log,
1198 xfs_caddr_t buf, 1198 xfs_caddr_t buf,
1199 int cycle, 1199 int cycle,
1200 int block, 1200 int block,
1201 int tail_cycle, 1201 int tail_cycle,
1202 int tail_block) 1202 int tail_block)
1203 { 1203 {
1204 xlog_rec_header_t *recp = (xlog_rec_header_t *)buf; 1204 xlog_rec_header_t *recp = (xlog_rec_header_t *)buf;
1205 1205
1206 memset(buf, 0, BBSIZE); 1206 memset(buf, 0, BBSIZE);
1207 recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); 1207 recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
1208 recp->h_cycle = cpu_to_be32(cycle); 1208 recp->h_cycle = cpu_to_be32(cycle);
1209 recp->h_version = cpu_to_be32( 1209 recp->h_version = cpu_to_be32(
1210 xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1); 1210 xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
1211 recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block)); 1211 recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
1212 recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block)); 1212 recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
1213 recp->h_fmt = cpu_to_be32(XLOG_FMT); 1213 recp->h_fmt = cpu_to_be32(XLOG_FMT);
1214 memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t)); 1214 memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
1215 } 1215 }
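
What xlog_add_record() produces is a 512-byte sector that contains nothing but a record header: valid magic, the (old) cycle number, and LSNs that point recovery back at the tail. A userspace sketch of that stamping is below. The struct layout is a cut-down invention and only the magic value matches the real XLOG_HEADER_MAGIC_NUM; htobe32()/htobe64() assume a Linux/glibc <endian.h>.

#include <endian.h>		/* htobe32()/htobe64(): Linux/glibc specific */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TOY_BBSIZE		512
#define TOY_HEADER_MAGIC	0xFEEDbabe	/* same value as XLOG_HEADER_MAGIC_NUM */

/* illustrative header layout -- not the real struct xlog_rec_header */
struct toy_rec_header {
	uint32_t h_magicno;
	uint32_t h_cycle;
	uint32_t h_version;
	uint32_t h_len;
	uint64_t h_lsn;
	uint64_t h_tail_lsn;
};

union toy_sector {
	struct toy_rec_header	hdr;
	char			pad[TOY_BBSIZE];
};

static void toy_add_record(union toy_sector *sec, int cycle, int block,
			   int tail_cycle, int tail_block)
{
	memset(sec, 0, sizeof(*sec));		/* empty record: header only */
	sec->hdr.h_magicno = htobe32(TOY_HEADER_MAGIC);
	sec->hdr.h_cycle = htobe32((uint32_t)cycle);
	sec->hdr.h_version = htobe32(2);
	sec->hdr.h_lsn = htobe64(((uint64_t)cycle << 32) | (uint32_t)block);
	sec->hdr.h_tail_lsn = htobe64(((uint64_t)tail_cycle << 32) |
				      (uint32_t)tail_block);
}

int main(void)
{
	union toy_sector sec;

	/* stamp block 700 of cycle 8, pointing back at a tail at block 128 */
	toy_add_record(&sec, 8, 700, 8, 128);
	printf("magic %#x\n", be32toh(sec.hdr.h_magicno));
	return 0;
}
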
1216 1216
1217 STATIC int 1217 STATIC int
1218 xlog_write_log_records( 1218 xlog_write_log_records(
1219 struct xlog *log, 1219 struct xlog *log,
1220 int cycle, 1220 int cycle,
1221 int start_block, 1221 int start_block,
1222 int blocks, 1222 int blocks,
1223 int tail_cycle, 1223 int tail_cycle,
1224 int tail_block) 1224 int tail_block)
1225 { 1225 {
1226 xfs_caddr_t offset; 1226 xfs_caddr_t offset;
1227 xfs_buf_t *bp; 1227 xfs_buf_t *bp;
1228 int balign, ealign; 1228 int balign, ealign;
1229 int sectbb = log->l_sectBBsize; 1229 int sectbb = log->l_sectBBsize;
1230 int end_block = start_block + blocks; 1230 int end_block = start_block + blocks;
1231 int bufblks; 1231 int bufblks;
1232 int error = 0; 1232 int error = 0;
1233 int i, j = 0; 1233 int i, j = 0;
1234 1234
1235 /* 1235 /*
1236 * Greedily allocate a buffer big enough to handle the full 1236 * Greedily allocate a buffer big enough to handle the full
1237 * range of basic blocks to be written. If that fails, try 1237 * range of basic blocks to be written. If that fails, try
1238 * a smaller size. We need to be able to write at least a 1238 * a smaller size. We need to be able to write at least a
1239 * log sector, or we're out of luck. 1239 * log sector, or we're out of luck.
1240 */ 1240 */
1241 bufblks = 1 << ffs(blocks); 1241 bufblks = 1 << ffs(blocks);
1242 while (bufblks > log->l_logBBsize) 1242 while (bufblks > log->l_logBBsize)
1243 bufblks >>= 1; 1243 bufblks >>= 1;
1244 while (!(bp = xlog_get_bp(log, bufblks))) { 1244 while (!(bp = xlog_get_bp(log, bufblks))) {
1245 bufblks >>= 1; 1245 bufblks >>= 1;
1246 if (bufblks < sectbb) 1246 if (bufblks < sectbb)
1247 return ENOMEM; 1247 return ENOMEM;
1248 } 1248 }
1249 1249
1250 /* We may need to do a read at the start to fill in part of 1250 /* We may need to do a read at the start to fill in part of
1251 * the buffer in the starting sector not covered by the first 1251 * the buffer in the starting sector not covered by the first
1252 * write below. 1252 * write below.
1253 */ 1253 */
1254 balign = round_down(start_block, sectbb); 1254 balign = round_down(start_block, sectbb);
1255 if (balign != start_block) { 1255 if (balign != start_block) {
1256 error = xlog_bread_noalign(log, start_block, 1, bp); 1256 error = xlog_bread_noalign(log, start_block, 1, bp);
1257 if (error) 1257 if (error)
1258 goto out_put_bp; 1258 goto out_put_bp;
1259 1259
1260 j = start_block - balign; 1260 j = start_block - balign;
1261 } 1261 }
1262 1262
1263 for (i = start_block; i < end_block; i += bufblks) { 1263 for (i = start_block; i < end_block; i += bufblks) {
1264 int bcount, endcount; 1264 int bcount, endcount;
1265 1265
1266 bcount = min(bufblks, end_block - start_block); 1266 bcount = min(bufblks, end_block - start_block);
1267 endcount = bcount - j; 1267 endcount = bcount - j;
1268 1268
1269 /* We may need to do a read at the end to fill in part of 1269 /* We may need to do a read at the end to fill in part of
1270 * the buffer in the final sector not covered by the write. 1270 * the buffer in the final sector not covered by the write.
1271 * If this is the same sector as the above read, skip it. 1271 * If this is the same sector as the above read, skip it.
1272 */ 1272 */
1273 ealign = round_down(end_block, sectbb); 1273 ealign = round_down(end_block, sectbb);
1274 if (j == 0 && (start_block + endcount > ealign)) { 1274 if (j == 0 && (start_block + endcount > ealign)) {
1275 offset = bp->b_addr + BBTOB(ealign - start_block); 1275 offset = bp->b_addr + BBTOB(ealign - start_block);
1276 error = xlog_bread_offset(log, ealign, sectbb, 1276 error = xlog_bread_offset(log, ealign, sectbb,
1277 bp, offset); 1277 bp, offset);
1278 if (error) 1278 if (error)
1279 break; 1279 break;
1280 1280
1281 } 1281 }
1282 1282
1283 offset = xlog_align(log, start_block, endcount, bp); 1283 offset = xlog_align(log, start_block, endcount, bp);
1284 for (; j < endcount; j++) { 1284 for (; j < endcount; j++) {
1285 xlog_add_record(log, offset, cycle, i+j, 1285 xlog_add_record(log, offset, cycle, i+j,
1286 tail_cycle, tail_block); 1286 tail_cycle, tail_block);
1287 offset += BBSIZE; 1287 offset += BBSIZE;
1288 } 1288 }
1289 error = xlog_bwrite(log, start_block, endcount, bp); 1289 error = xlog_bwrite(log, start_block, endcount, bp);
1290 if (error) 1290 if (error)
1291 break; 1291 break;
1292 start_block += endcount; 1292 start_block += endcount;
1293 j = 0; 1293 j = 0;
1294 } 1294 }
1295 1295
1296 out_put_bp: 1296 out_put_bp:
1297 xlog_put_bp(bp); 1297 xlog_put_bp(bp);
1298 return error; 1298 return error;
1299 } 1299 }
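
The balign/ealign bookkeeping above is a read-modify-write for the log sectors at either end of the range: if the stamped range does not start or end on a sector boundary, the partial first and last sectors are read first so the surrounding blocks are not clobbered. A worked example of the arithmetic (the names are local to this illustration):

#include <stdio.h>

#define toy_round_down(x, y)	((x) - ((x) % (y)))

int main(void)
{
	int sectbb = 8;			/* log sector size in basic blocks */
	int start_block = 13, blocks = 20;
	int end_block = start_block + blocks;		/* 33 */

	int balign = toy_round_down(start_block, sectbb);	/* 8  */
	int j = start_block - balign;				/* 5 leading blocks to preserve */
	int ealign = toy_round_down(end_block, sectbb);		/* 32 */

	printf("read sector at %d, skip %d blocks, re-read tail sector at %d\n",
	       balign, j, ealign);
	return 0;
}
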
1300 1300
1301 /* 1301 /*
1302 * This routine is called to blow away any incomplete log writes out 1302 * This routine is called to blow away any incomplete log writes out
1303 * in front of the log head. We do this so that we won't become confused 1303 * in front of the log head. We do this so that we won't become confused
1304 * if we come up, write only a little bit more, and then crash again. 1304 * if we come up, write only a little bit more, and then crash again.
1305 * If we leave the partial log records out there, this situation could 1305 * If we leave the partial log records out there, this situation could
1306 * cause us to think those partial writes are valid blocks since they 1306 * cause us to think those partial writes are valid blocks since they
1307 * have the current cycle number. We get rid of them by overwriting them 1307 * have the current cycle number. We get rid of them by overwriting them
1308 * with empty log records with the old cycle number rather than the 1308 * with empty log records with the old cycle number rather than the
1309 * current one. 1309 * current one.
1310 * 1310 *
1311 * The tail lsn is passed in rather than taken from 1311 * The tail lsn is passed in rather than taken from
1312 * the log so that we will not write over the unmount record after a 1312 * the log so that we will not write over the unmount record after a
1313 * clean unmount in a 512 block log. Doing so would leave the log without 1313 * clean unmount in a 512 block log. Doing so would leave the log without
1314 * any valid log records in it until a new one was written. If we crashed 1314 * any valid log records in it until a new one was written. If we crashed
1315 * during that time we would not be able to recover. 1315 * during that time we would not be able to recover.
1316 */ 1316 */
1317 STATIC int 1317 STATIC int
1318 xlog_clear_stale_blocks( 1318 xlog_clear_stale_blocks(
1319 struct xlog *log, 1319 struct xlog *log,
1320 xfs_lsn_t tail_lsn) 1320 xfs_lsn_t tail_lsn)
1321 { 1321 {
1322 int tail_cycle, head_cycle; 1322 int tail_cycle, head_cycle;
1323 int tail_block, head_block; 1323 int tail_block, head_block;
1324 int tail_distance, max_distance; 1324 int tail_distance, max_distance;
1325 int distance; 1325 int distance;
1326 int error; 1326 int error;
1327 1327
1328 tail_cycle = CYCLE_LSN(tail_lsn); 1328 tail_cycle = CYCLE_LSN(tail_lsn);
1329 tail_block = BLOCK_LSN(tail_lsn); 1329 tail_block = BLOCK_LSN(tail_lsn);
1330 head_cycle = log->l_curr_cycle; 1330 head_cycle = log->l_curr_cycle;
1331 head_block = log->l_curr_block; 1331 head_block = log->l_curr_block;
1332 1332
1333 /* 1333 /*
1334 * Figure out the distance between the new head of the log 1334 * Figure out the distance between the new head of the log
1335 * and the tail. We want to write over any blocks beyond the 1335 * and the tail. We want to write over any blocks beyond the
1336 * head that we may have written just before the crash, but 1336 * head that we may have written just before the crash, but
1337 * we don't want to overwrite the tail of the log. 1337 * we don't want to overwrite the tail of the log.
1338 */ 1338 */
1339 if (head_cycle == tail_cycle) { 1339 if (head_cycle == tail_cycle) {
1340 /* 1340 /*
1341 * The tail is behind the head in the physical log, 1341 * The tail is behind the head in the physical log,
1342 * so the distance from the head to the tail is the 1342 * so the distance from the head to the tail is the
1343 * distance from the head to the end of the log plus 1343 * distance from the head to the end of the log plus
1344 * the distance from the beginning of the log to the 1344 * the distance from the beginning of the log to the
1345 * tail. 1345 * tail.
1346 */ 1346 */
1347 if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) { 1347 if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
1348 XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)", 1348 XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
1349 XFS_ERRLEVEL_LOW, log->l_mp); 1349 XFS_ERRLEVEL_LOW, log->l_mp);
1350 return XFS_ERROR(EFSCORRUPTED); 1350 return XFS_ERROR(EFSCORRUPTED);
1351 } 1351 }
1352 tail_distance = tail_block + (log->l_logBBsize - head_block); 1352 tail_distance = tail_block + (log->l_logBBsize - head_block);
1353 } else { 1353 } else {
1354 /* 1354 /*
1355 * The head is behind the tail in the physical log, 1355 * The head is behind the tail in the physical log,
1356 * so the distance from the head to the tail is just 1356 * so the distance from the head to the tail is just
1357 * the tail block minus the head block. 1357 * the tail block minus the head block.
1358 */ 1358 */
1359 if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){ 1359 if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
1360 XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)", 1360 XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
1361 XFS_ERRLEVEL_LOW, log->l_mp); 1361 XFS_ERRLEVEL_LOW, log->l_mp);
1362 return XFS_ERROR(EFSCORRUPTED); 1362 return XFS_ERROR(EFSCORRUPTED);
1363 } 1363 }
1364 tail_distance = tail_block - head_block; 1364 tail_distance = tail_block - head_block;
1365 } 1365 }
1366 1366
1367 /* 1367 /*
1368 * If the head is right up against the tail, we can't clear 1368 * If the head is right up against the tail, we can't clear
1369 * anything. 1369 * anything.
1370 */ 1370 */
1371 if (tail_distance <= 0) { 1371 if (tail_distance <= 0) {
1372 ASSERT(tail_distance == 0); 1372 ASSERT(tail_distance == 0);
1373 return 0; 1373 return 0;
1374 } 1374 }
1375 1375
1376 max_distance = XLOG_TOTAL_REC_SHIFT(log); 1376 max_distance = XLOG_TOTAL_REC_SHIFT(log);
1377 /* 1377 /*
1378 * Take the smaller of the maximum amount of outstanding I/O 1378 * Take the smaller of the maximum amount of outstanding I/O
1379 * we could have and the distance to the tail to clear out. 1379 * we could have and the distance to the tail to clear out.
1380 * We take the smaller so that we don't overwrite the tail and 1380 * We take the smaller so that we don't overwrite the tail and
1381 * we don't waste all day writing from the head to the tail 1381 * we don't waste all day writing from the head to the tail
1382 * for no reason. 1382 * for no reason.
1383 */ 1383 */
1384 max_distance = MIN(max_distance, tail_distance); 1384 max_distance = MIN(max_distance, tail_distance);
1385 1385
1386 if ((head_block + max_distance) <= log->l_logBBsize) { 1386 if ((head_block + max_distance) <= log->l_logBBsize) {
1387 /* 1387 /*
1388 * We can stomp all the blocks we need to without 1388 * We can stomp all the blocks we need to without
1389 * wrapping around the end of the log. Just do it 1389 * wrapping around the end of the log. Just do it
1390 * in a single write. Use the cycle number of the 1390 * in a single write. Use the cycle number of the
1391 * current cycle minus one so that the log will look like: 1391 * current cycle minus one so that the log will look like:
1392 * n ... | n - 1 ... 1392 * n ... | n - 1 ...
1393 */ 1393 */
1394 error = xlog_write_log_records(log, (head_cycle - 1), 1394 error = xlog_write_log_records(log, (head_cycle - 1),
1395 head_block, max_distance, tail_cycle, 1395 head_block, max_distance, tail_cycle,
1396 tail_block); 1396 tail_block);
1397 if (error) 1397 if (error)
1398 return error; 1398 return error;
1399 } else { 1399 } else {
1400 /* 1400 /*
1401 * We need to wrap around the end of the physical log in 1401 * We need to wrap around the end of the physical log in
1402 * order to clear all the blocks. Do it in two separate 1402 * order to clear all the blocks. Do it in two separate
1403 * I/Os. The first write should be from the head to the 1403 * I/Os. The first write should be from the head to the
1404 * end of the physical log, and it should use the current 1404 * end of the physical log, and it should use the current
1405 * cycle number minus one just like above. 1405 * cycle number minus one just like above.
1406 */ 1406 */
1407 distance = log->l_logBBsize - head_block; 1407 distance = log->l_logBBsize - head_block;
1408 error = xlog_write_log_records(log, (head_cycle - 1), 1408 error = xlog_write_log_records(log, (head_cycle - 1),
1409 head_block, distance, tail_cycle, 1409 head_block, distance, tail_cycle,
1410 tail_block); 1410 tail_block);
1411 1411
1412 if (error) 1412 if (error)
1413 return error; 1413 return error;
1414 1414
1415 /* 1415 /*
1416 * Now write the blocks at the start of the physical log. 1416 * Now write the blocks at the start of the physical log.
1417 * This writes the remainder of the blocks we want to clear. 1417 * This writes the remainder of the blocks we want to clear.
1418 * It uses the current cycle number since we're now on the 1418 * It uses the current cycle number since we're now on the
1419 * same cycle as the head so that we get: 1419 * same cycle as the head so that we get:
1420 * n ... n ... | n - 1 ... 1420 * n ... n ... | n - 1 ...
1421 * ^^^^^ blocks we're writing 1421 * ^^^^^ blocks we're writing
1422 */ 1422 */
1423 distance = max_distance - (log->l_logBBsize - head_block); 1423 distance = max_distance - (log->l_logBBsize - head_block);
1424 error = xlog_write_log_records(log, head_cycle, 0, distance, 1424 error = xlog_write_log_records(log, head_cycle, 0, distance,
1425 tail_cycle, tail_block); 1425 tail_cycle, tail_block);
1426 if (error) 1426 if (error)
1427 return error; 1427 return error;
1428 } 1428 }
1429 1429
1430 return 0; 1430 return 0;
1431 } 1431 }
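
The tail_distance computation above has exactly two arithmetic cases, depending on whether the head has already wrapped onto the next cycle. A minimal sketch, assuming a circular log of logbb basic blocks (not the kernel function, just the arithmetic):

#include <stdio.h>

static int toy_tail_distance(int head_cycle, int head_block,
			     int tail_cycle, int tail_block, int logbb)
{
	if (head_cycle == tail_cycle)
		/* tail is physically behind the head: wrap through the end */
		return tail_block + (logbb - head_block);
	/* head already wrapped onto the next cycle: simple difference */
	return tail_block - head_block;
}

int main(void)
{
	printf("same cycle:   %d\n", toy_tail_distance(9, 700, 9, 100, 1024));
	printf("head wrapped: %d\n", toy_tail_distance(10, 100, 9, 700, 1024));
	return 0;
}

The smaller of this distance and the maximum possible amount of in-flight log I/O bounds how many blocks get overwritten, which is why the routine never risks stomping on the tail.
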
1432 1432
1433 /****************************************************************************** 1433 /******************************************************************************
1434 * 1434 *
1435 * Log recover routines 1435 * Log recover routines
1436 * 1436 *
1437 ****************************************************************************** 1437 ******************************************************************************
1438 */ 1438 */
1439 1439
1440 STATIC xlog_recover_t * 1440 STATIC xlog_recover_t *
1441 xlog_recover_find_tid( 1441 xlog_recover_find_tid(
1442 struct hlist_head *head, 1442 struct hlist_head *head,
1443 xlog_tid_t tid) 1443 xlog_tid_t tid)
1444 { 1444 {
1445 xlog_recover_t *trans; 1445 xlog_recover_t *trans;
1446 1446
1447 hlist_for_each_entry(trans, head, r_list) { 1447 hlist_for_each_entry(trans, head, r_list) {
1448 if (trans->r_log_tid == tid) 1448 if (trans->r_log_tid == tid)
1449 return trans; 1449 return trans;
1450 } 1450 }
1451 return NULL; 1451 return NULL;
1452 } 1452 }
1453 1453
1454 STATIC void 1454 STATIC void
1455 xlog_recover_new_tid( 1455 xlog_recover_new_tid(
1456 struct hlist_head *head, 1456 struct hlist_head *head,
1457 xlog_tid_t tid, 1457 xlog_tid_t tid,
1458 xfs_lsn_t lsn) 1458 xfs_lsn_t lsn)
1459 { 1459 {
1460 xlog_recover_t *trans; 1460 xlog_recover_t *trans;
1461 1461
1462 trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP); 1462 trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1463 trans->r_log_tid = tid; 1463 trans->r_log_tid = tid;
1464 trans->r_lsn = lsn; 1464 trans->r_lsn = lsn;
1465 INIT_LIST_HEAD(&trans->r_itemq); 1465 INIT_LIST_HEAD(&trans->r_itemq);
1466 1466
1467 INIT_HLIST_NODE(&trans->r_list); 1467 INIT_HLIST_NODE(&trans->r_list);
1468 hlist_add_head(&trans->r_list, head); 1468 hlist_add_head(&trans->r_list, head);
1469 } 1469 }
1470 1470
1471 STATIC void 1471 STATIC void
1472 xlog_recover_add_item( 1472 xlog_recover_add_item(
1473 struct list_head *head) 1473 struct list_head *head)
1474 { 1474 {
1475 xlog_recover_item_t *item; 1475 xlog_recover_item_t *item;
1476 1476
1477 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); 1477 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
1478 INIT_LIST_HEAD(&item->ri_list); 1478 INIT_LIST_HEAD(&item->ri_list);
1479 list_add_tail(&item->ri_list, head); 1479 list_add_tail(&item->ri_list, head);
1480 } 1480 }
1481 1481
1482 STATIC int 1482 STATIC int
1483 xlog_recover_add_to_cont_trans( 1483 xlog_recover_add_to_cont_trans(
1484 struct xlog *log, 1484 struct xlog *log,
1485 struct xlog_recover *trans, 1485 struct xlog_recover *trans,
1486 xfs_caddr_t dp, 1486 xfs_caddr_t dp,
1487 int len) 1487 int len)
1488 { 1488 {
1489 xlog_recover_item_t *item; 1489 xlog_recover_item_t *item;
1490 xfs_caddr_t ptr, old_ptr; 1490 xfs_caddr_t ptr, old_ptr;
1491 int old_len; 1491 int old_len;
1492 1492
1493 if (list_empty(&trans->r_itemq)) { 1493 if (list_empty(&trans->r_itemq)) {
1494 /* finish copying rest of trans header */ 1494 /* finish copying rest of trans header */
1495 xlog_recover_add_item(&trans->r_itemq); 1495 xlog_recover_add_item(&trans->r_itemq);
1496 ptr = (xfs_caddr_t) &trans->r_theader + 1496 ptr = (xfs_caddr_t) &trans->r_theader +
1497 sizeof(xfs_trans_header_t) - len; 1497 sizeof(xfs_trans_header_t) - len;
1498 memcpy(ptr, dp, len); /* d, s, l */ 1498 memcpy(ptr, dp, len); /* d, s, l */
1499 return 0; 1499 return 0;
1500 } 1500 }
1501 /* take the tail entry */ 1501 /* take the tail entry */
1502 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); 1502 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1503 1503
1504 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; 1504 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
1505 old_len = item->ri_buf[item->ri_cnt-1].i_len; 1505 old_len = item->ri_buf[item->ri_cnt-1].i_len;
1506 1506
1507 ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP); 1507 ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP);
1508 memcpy(&ptr[old_len], dp, len); /* d, s, l */ 1508 memcpy(&ptr[old_len], dp, len); /* d, s, l */
1509 item->ri_buf[item->ri_cnt-1].i_len += len; 1509 item->ri_buf[item->ri_cnt-1].i_len += len;
1510 item->ri_buf[item->ri_cnt-1].i_addr = ptr; 1510 item->ri_buf[item->ri_cnt-1].i_addr = ptr;
1511 trace_xfs_log_recover_item_add_cont(log, trans, item, 0); 1511 trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
1512 return 0; 1512 return 0;
1513 } 1513 }
1514 1514
1515 /* 1515 /*
1516 * The next region to add is the start of a new region. It could be 1516 * The next region to add is the start of a new region. It could be
1517 * a whole region or it could be the first part of a new region. Because 1517 * a whole region or it could be the first part of a new region. Because
1518 * of this, the assumption here is that the type and size fields of all 1518 * of this, the assumption here is that the type and size fields of all
1519 * format structures fit into the first 32 bits of the structure. 1519 * format structures fit into the first 32 bits of the structure.
1520 * 1520 *
1521 * This works because all regions must be 32 bit aligned. Therefore, we 1521 * This works because all regions must be 32 bit aligned. Therefore, we
1522 * either have both fields or we have neither field. In the case we have 1522 * either have both fields or we have neither field. In the case we have
1523 * neither field, the data part of the region is zero length. We only have 1523 * neither field, the data part of the region is zero length. We only have
1524 * a log_op_header and can throw away the header since a new one will appear 1524 * a log_op_header and can throw away the header since a new one will appear
1525 * later. If we have at least 4 bytes, then we can determine how many regions 1525 * later. If we have at least 4 bytes, then we can determine how many regions
1526 * will appear in the current log item. 1526 * will appear in the current log item.
1527 */ 1527 */
1528 STATIC int 1528 STATIC int
1529 xlog_recover_add_to_trans( 1529 xlog_recover_add_to_trans(
1530 struct xlog *log, 1530 struct xlog *log,
1531 struct xlog_recover *trans, 1531 struct xlog_recover *trans,
1532 xfs_caddr_t dp, 1532 xfs_caddr_t dp,
1533 int len) 1533 int len)
1534 { 1534 {
1535 xfs_inode_log_format_t *in_f; /* any will do */ 1535 xfs_inode_log_format_t *in_f; /* any will do */
1536 xlog_recover_item_t *item; 1536 xlog_recover_item_t *item;
1537 xfs_caddr_t ptr; 1537 xfs_caddr_t ptr;
1538 1538
1539 if (!len) 1539 if (!len)
1540 return 0; 1540 return 0;
1541 if (list_empty(&trans->r_itemq)) { 1541 if (list_empty(&trans->r_itemq)) {
1542 /* we need to catch log corruptions here */ 1542 /* we need to catch log corruptions here */
1543 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { 1543 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
1544 xfs_warn(log->l_mp, "%s: bad header magic number", 1544 xfs_warn(log->l_mp, "%s: bad header magic number",
1545 __func__); 1545 __func__);
1546 ASSERT(0); 1546 ASSERT(0);
1547 return XFS_ERROR(EIO); 1547 return XFS_ERROR(EIO);
1548 } 1548 }
1549 if (len == sizeof(xfs_trans_header_t)) 1549 if (len == sizeof(xfs_trans_header_t))
1550 xlog_recover_add_item(&trans->r_itemq); 1550 xlog_recover_add_item(&trans->r_itemq);
1551 memcpy(&trans->r_theader, dp, len); /* d, s, l */ 1551 memcpy(&trans->r_theader, dp, len); /* d, s, l */
1552 return 0; 1552 return 0;
1553 } 1553 }
1554 1554
1555 ptr = kmem_alloc(len, KM_SLEEP); 1555 ptr = kmem_alloc(len, KM_SLEEP);
1556 memcpy(ptr, dp, len); 1556 memcpy(ptr, dp, len);
1557 in_f = (xfs_inode_log_format_t *)ptr; 1557 in_f = (xfs_inode_log_format_t *)ptr;
1558 1558
1559 /* take the tail entry */ 1559 /* take the tail entry */
1560 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); 1560 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1561 if (item->ri_total != 0 && 1561 if (item->ri_total != 0 &&
1562 item->ri_total == item->ri_cnt) { 1562 item->ri_total == item->ri_cnt) {
1563 /* tail item is in use, get a new one */ 1563 /* tail item is in use, get a new one */
1564 xlog_recover_add_item(&trans->r_itemq); 1564 xlog_recover_add_item(&trans->r_itemq);
1565 item = list_entry(trans->r_itemq.prev, 1565 item = list_entry(trans->r_itemq.prev,
1566 xlog_recover_item_t, ri_list); 1566 xlog_recover_item_t, ri_list);
1567 } 1567 }
1568 1568
1569 if (item->ri_total == 0) { /* first region to be added */ 1569 if (item->ri_total == 0) { /* first region to be added */
1570 if (in_f->ilf_size == 0 || 1570 if (in_f->ilf_size == 0 ||
1571 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) { 1571 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
1572 xfs_warn(log->l_mp, 1572 xfs_warn(log->l_mp,
1573 "bad number of regions (%d) in inode log format", 1573 "bad number of regions (%d) in inode log format",
1574 in_f->ilf_size); 1574 in_f->ilf_size);
1575 ASSERT(0); 1575 ASSERT(0);
1576 return XFS_ERROR(EIO); 1576 return XFS_ERROR(EIO);
1577 } 1577 }
1578 1578
1579 item->ri_total = in_f->ilf_size; 1579 item->ri_total = in_f->ilf_size;
1580 item->ri_buf = 1580 item->ri_buf =
1581 kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), 1581 kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
1582 KM_SLEEP); 1582 KM_SLEEP);
1583 } 1583 }
1584 ASSERT(item->ri_total > item->ri_cnt); 1584 ASSERT(item->ri_total > item->ri_cnt);
1585 /* Description region is ri_buf[0] */ 1585 /* Description region is ri_buf[0] */
1586 item->ri_buf[item->ri_cnt].i_addr = ptr; 1586 item->ri_buf[item->ri_cnt].i_addr = ptr;
1587 item->ri_buf[item->ri_cnt].i_len = len; 1587 item->ri_buf[item->ri_cnt].i_len = len;
1588 item->ri_cnt++; 1588 item->ri_cnt++;
1589 trace_xfs_log_recover_item_add(log, trans, item, 0); 1589 trace_xfs_log_recover_item_add(log, trans, item, 0);
1590 return 0; 1590 return 0;
1591 } 1591 }
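
The accumulation pattern above boils down to: the first region of an item carries a format header whose size field announces how many regions the item will eventually have, and later regions are appended until that count is reached. The simplified model below illustrates that flow with invented types and limits; it is not the kernel's xlog_recover_item handling.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define TOY_MAX_REGIONS	64

struct toy_region {
	void	*addr;
	int	len;
};

struct toy_item {
	int			total;	/* regions promised by the format header */
	int			cnt;	/* regions received so far */
	struct toy_region	buf[TOY_MAX_REGIONS];
};

/* returns 0 on success, -1 on a corrupt region count */
static int toy_add_region(struct toy_item *item, const void *dp, int len,
			  int announced_regions)
{
	void *copy;

	if (item->total == 0) {		/* first region: validate its header */
		if (announced_regions <= 0 ||
		    announced_regions > TOY_MAX_REGIONS)
			return -1;
		item->total = announced_regions;
	}
	if (item->cnt >= item->total)
		return -1;

	copy = malloc(len);
	if (!copy)
		return -1;
	memcpy(copy, dp, len);
	item->buf[item->cnt].addr = copy;
	item->buf[item->cnt].len = len;
	item->cnt++;
	return 0;
}

int main(void)
{
	struct toy_item item = { 0 };
	char payload[16] = "format header";

	if (toy_add_region(&item, payload, sizeof(payload), 2) == 0 &&
	    toy_add_region(&item, payload, sizeof(payload), 2) == 0)
		printf("item complete: %d/%d regions\n", item.cnt, item.total);
	return 0;
}
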
1592 1592
1593 /* 1593 /*
1594 * Sort the log items in the transaction. Cancelled buffers need 1594 * Sort the log items in the transaction. Cancelled buffers need
1595 * to be put first so they are processed before any items that might 1595 * to be put first so they are processed before any items that might
1596 * modify the buffers. If they are cancelled, then the modifications 1596 * modify the buffers. If they are cancelled, then the modifications
1597 * don't need to be replayed. 1597 * don't need to be replayed.
1598 */ 1598 */
1599 STATIC int 1599 STATIC int
1600 xlog_recover_reorder_trans( 1600 xlog_recover_reorder_trans(
1601 struct xlog *log, 1601 struct xlog *log,
1602 struct xlog_recover *trans, 1602 struct xlog_recover *trans,
1603 int pass) 1603 int pass)
1604 { 1604 {
1605 xlog_recover_item_t *item, *n; 1605 xlog_recover_item_t *item, *n;
1606 LIST_HEAD(sort_list); 1606 LIST_HEAD(sort_list);
1607 1607
1608 list_splice_init(&trans->r_itemq, &sort_list); 1608 list_splice_init(&trans->r_itemq, &sort_list);
1609 list_for_each_entry_safe(item, n, &sort_list, ri_list) { 1609 list_for_each_entry_safe(item, n, &sort_list, ri_list) {
1610 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; 1610 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
1611 1611
1612 switch (ITEM_TYPE(item)) { 1612 switch (ITEM_TYPE(item)) {
1613 case XFS_LI_BUF: 1613 case XFS_LI_BUF:
1614 if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { 1614 if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1615 trace_xfs_log_recover_item_reorder_head(log, 1615 trace_xfs_log_recover_item_reorder_head(log,
1616 trans, item, pass); 1616 trans, item, pass);
1617 list_move(&item->ri_list, &trans->r_itemq); 1617 list_move(&item->ri_list, &trans->r_itemq);
1618 break; 1618 break;
1619 } 1619 }
1620 case XFS_LI_INODE: 1620 case XFS_LI_INODE:
1621 case XFS_LI_DQUOT: 1621 case XFS_LI_DQUOT:
1622 case XFS_LI_QUOTAOFF: 1622 case XFS_LI_QUOTAOFF:
1623 case XFS_LI_EFD: 1623 case XFS_LI_EFD:
1624 case XFS_LI_EFI: 1624 case XFS_LI_EFI:
1625 trace_xfs_log_recover_item_reorder_tail(log, 1625 trace_xfs_log_recover_item_reorder_tail(log,
1626 trans, item, pass); 1626 trans, item, pass);
1627 list_move_tail(&item->ri_list, &trans->r_itemq); 1627 list_move_tail(&item->ri_list, &trans->r_itemq);
1628 break; 1628 break;
1629 default: 1629 default:
1630 xfs_warn(log->l_mp, 1630 xfs_warn(log->l_mp,
1631 "%s: unrecognized type of log operation", 1631 "%s: unrecognized type of log operation",
1632 __func__); 1632 __func__);
1633 ASSERT(0); 1633 ASSERT(0);
1634 return XFS_ERROR(EIO); 1634 return XFS_ERROR(EIO);
1635 } 1635 }
1636 } 1636 }
1637 ASSERT(list_empty(&sort_list)); 1637 ASSERT(list_empty(&sort_list));
1638 return 0; 1638 return 0;
1639 } 1639 }
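
The goal stated in the comment above is an ordering guarantee: a record that cancels a buffer must be seen before any record that would replay modifications to that buffer. The toy below shows that intent as a stable partition of items; it does not reproduce the exact list_move()/list_move_tail() mechanics of the kernel routine.

#include <stdio.h>

struct toy_item {
	const char	*name;
	int		is_cancel;
};

int main(void)
{
	struct toy_item in[] = {
		{ "inode update", 0 },
		{ "buf cancel",   1 },
		{ "buf data",     0 },
		{ "dquot",        0 },
	};
	const int n = sizeof(in) / sizeof(in[0]);
	struct toy_item out[sizeof(in) / sizeof(in[0])];
	int k = 0, i;

	for (i = 0; i < n; i++)		/* cancels first, original order kept */
		if (in[i].is_cancel)
			out[k++] = in[i];
	for (i = 0; i < n; i++)
		if (!in[i].is_cancel)
			out[k++] = in[i];

	for (i = 0; i < n; i++)
		printf("%d: %s\n", i, out[i].name);
	return 0;
}
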
1640 1640
1641 /* 1641 /*
1642 * Build up the table of buf cancel records so that we don't replay 1642 * Build up the table of buf cancel records so that we don't replay
1643 * cancelled data in the second pass. For buffer records that are 1643 * cancelled data in the second pass. For buffer records that are
1644 * not cancel records, there is nothing to do here so we just return. 1644 * not cancel records, there is nothing to do here so we just return.
1645 * 1645 *
1646 * If we get a cancel record which is already in the table, this indicates 1646 * If we get a cancel record which is already in the table, this indicates
1647 * that the buffer was cancelled multiple times. In order to ensure 1647 * that the buffer was cancelled multiple times. In order to ensure
1648 * that during pass 2 we keep the record in the table until we reach its 1648 * that during pass 2 we keep the record in the table until we reach its
1649 * last occurrence in the log, we keep a reference count in the cancel 1649 * last occurrence in the log, we keep a reference count in the cancel
1650 * record in the table to tell us how many times we expect to see this 1650 * record in the table to tell us how many times we expect to see this
1651 * record during the second pass. 1651 * record during the second pass.
1652 */ 1652 */
1653 STATIC int 1653 STATIC int
1654 xlog_recover_buffer_pass1( 1654 xlog_recover_buffer_pass1(
1655 struct xlog *log, 1655 struct xlog *log,
1656 struct xlog_recover_item *item) 1656 struct xlog_recover_item *item)
1657 { 1657 {
1658 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; 1658 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
1659 struct list_head *bucket; 1659 struct list_head *bucket;
1660 struct xfs_buf_cancel *bcp; 1660 struct xfs_buf_cancel *bcp;
1661 1661
1662 /* 1662 /*
1663 * If this isn't a cancel buffer item, then just return. 1663 * If this isn't a cancel buffer item, then just return.
1664 */ 1664 */
1665 if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { 1665 if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1666 trace_xfs_log_recover_buf_not_cancel(log, buf_f); 1666 trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1667 return 0; 1667 return 0;
1668 } 1668 }
1669 1669
1670 /* 1670 /*
1671 * Insert an xfs_buf_cancel record into the hash table of them. 1671 * Insert an xfs_buf_cancel record into the hash table of them.
1672 * If there is already an identical record, bump its reference count. 1672 * If there is already an identical record, bump its reference count.
1673 */ 1673 */
1674 bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno); 1674 bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
1675 list_for_each_entry(bcp, bucket, bc_list) { 1675 list_for_each_entry(bcp, bucket, bc_list) {
1676 if (bcp->bc_blkno == buf_f->blf_blkno && 1676 if (bcp->bc_blkno == buf_f->blf_blkno &&
1677 bcp->bc_len == buf_f->blf_len) { 1677 bcp->bc_len == buf_f->blf_len) {
1678 bcp->bc_refcount++; 1678 bcp->bc_refcount++;
1679 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); 1679 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
1680 return 0; 1680 return 0;
1681 } 1681 }
1682 } 1682 }
1683 1683
1684 bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP); 1684 bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
1685 bcp->bc_blkno = buf_f->blf_blkno; 1685 bcp->bc_blkno = buf_f->blf_blkno;
1686 bcp->bc_len = buf_f->blf_len; 1686 bcp->bc_len = buf_f->blf_len;
1687 bcp->bc_refcount = 1; 1687 bcp->bc_refcount = 1;
1688 list_add_tail(&bcp->bc_list, bucket); 1688 list_add_tail(&bcp->bc_list, bucket);
1689 1689
1690 trace_xfs_log_recover_buf_cancel_add(log, buf_f); 1690 trace_xfs_log_recover_buf_cancel_add(log, buf_f);
1691 return 0; 1691 return 0;
1692 } 1692 }
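As a rough illustration of the table the comment above describes, the cancel records are assumed to be hashed into a small array of list heads keyed by disk block number, which is what the XLOG_BUF_CANCEL_BUCKET lookups in this function suggest. The helper below is an editor's sketch under that assumption, not part of the patch; only l_buf_cancel_table and XLOG_BC_TABLE_SIZE are taken from the surrounding code.

static inline struct list_head *
example_buf_cancel_bucket(
	struct xlog	*log,
	xfs_daddr_t	blkno)
{
	/* hash the block number into one of the cancel-table buckets */
	return log->l_buf_cancel_table +
		((__uint64_t)blkno % XLOG_BC_TABLE_SIZE);
}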
1693 1693
1694 /* 1694 /*
1695 * Check to see whether the buffer being recovered has a corresponding 1695 * Check to see whether the buffer being recovered has a corresponding
1696 * entry in the buffer cancel record table. If it does then return 1 1696 * entry in the buffer cancel record table. If it does then return 1
1697 * so that it will be cancelled, otherwise return 0. If the buffer is 1697 * so that it will be cancelled, otherwise return 0. If the buffer is
1698 * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement 1698 * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement
1699 * the refcount on the entry in the table and remove it from the table 1699 * the refcount on the entry in the table and remove it from the table
1700 * if this is the last reference. 1700 * if this is the last reference.
1701 * 1701 *
1702 * We remove the cancel record from the table when we encounter its 1702 * We remove the cancel record from the table when we encounter its
1703 * last occurrence in the log so that if the same buffer is re-used 1703 * last occurrence in the log so that if the same buffer is re-used
1704 * again after its last cancellation we actually replay the changes 1704 * again after its last cancellation we actually replay the changes
1705 * made at that point. 1705 * made at that point.
1706 */ 1706 */
1707 STATIC int 1707 STATIC int
1708 xlog_check_buffer_cancelled( 1708 xlog_check_buffer_cancelled(
1709 struct xlog *log, 1709 struct xlog *log,
1710 xfs_daddr_t blkno, 1710 xfs_daddr_t blkno,
1711 uint len, 1711 uint len,
1712 ushort flags) 1712 ushort flags)
1713 { 1713 {
1714 struct list_head *bucket; 1714 struct list_head *bucket;
1715 struct xfs_buf_cancel *bcp; 1715 struct xfs_buf_cancel *bcp;
1716 1716
1717 if (log->l_buf_cancel_table == NULL) { 1717 if (log->l_buf_cancel_table == NULL) {
1718 /* 1718 /*
1719 * There is nothing in the table built in pass one, 1719 * There is nothing in the table built in pass one,
1720 * so this buffer must not be cancelled. 1720 * so this buffer must not be cancelled.
1721 */ 1721 */
1722 ASSERT(!(flags & XFS_BLF_CANCEL)); 1722 ASSERT(!(flags & XFS_BLF_CANCEL));
1723 return 0; 1723 return 0;
1724 } 1724 }
1725 1725
1726 /* 1726 /*
1727 * Search for an entry in the cancel table that matches our buffer. 1727 * Search for an entry in the cancel table that matches our buffer.
1728 */ 1728 */
1729 bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); 1729 bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
1730 list_for_each_entry(bcp, bucket, bc_list) { 1730 list_for_each_entry(bcp, bucket, bc_list) {
1731 if (bcp->bc_blkno == blkno && bcp->bc_len == len) 1731 if (bcp->bc_blkno == blkno && bcp->bc_len == len)
1732 goto found; 1732 goto found;
1733 } 1733 }
1734 1734
1735 /* 1735 /*
1736 * We didn't find a corresponding entry in the table, so return 0 so 1736 * We didn't find a corresponding entry in the table, so return 0 so
1737 * that the buffer is NOT cancelled. 1737 * that the buffer is NOT cancelled.
1738 */ 1738 */
1739 ASSERT(!(flags & XFS_BLF_CANCEL)); 1739 ASSERT(!(flags & XFS_BLF_CANCEL));
1740 return 0; 1740 return 0;
1741 1741
1742 found: 1742 found:
1743 /* 1743 /*
1744 * We've got a match, so return 1 so that the recovery of this buffer 1744 * We've got a match, so return 1 so that the recovery of this buffer
1745 * is cancelled. If this buffer is actually a buffer cancel log 1745 * is cancelled. If this buffer is actually a buffer cancel log
1746 * item, then decrement the refcount on the one in the table and 1746 * item, then decrement the refcount on the one in the table and
1747 * remove it if this is the last reference. 1747 * remove it if this is the last reference.
1748 */ 1748 */
1749 if (flags & XFS_BLF_CANCEL) { 1749 if (flags & XFS_BLF_CANCEL) {
1750 if (--bcp->bc_refcount == 0) { 1750 if (--bcp->bc_refcount == 0) {
1751 list_del(&bcp->bc_list); 1751 list_del(&bcp->bc_list);
1752 kmem_free(bcp); 1752 kmem_free(bcp);
1753 } 1753 }
1754 } 1754 }
1755 return 1; 1755 return 1;
1756 } 1756 }
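The expected caller pattern in pass 2 (it mirrors the real call sites further below) is simply to skip replay whenever this helper returns 1. A minimal sketch, with the wrapper name being illustrative only:

static int
example_skip_if_cancelled(
	struct xlog		*log,
	xfs_buf_log_format_t	*buf_f)
{
	/* returns 1 when the buffer must not be replayed in pass 2 */
	return xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
					   buf_f->blf_len, buf_f->blf_flags);
}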
1757 1757
1758 /* 1758 /*
1759 * Perform recovery for a buffer full of inodes. In these buffers, the only 1759 * Perform recovery for a buffer full of inodes. In these buffers, the only
1760 * data which should be recovered is that which corresponds to the 1760 * data which should be recovered is that which corresponds to the
1761 * di_next_unlinked pointers in the on disk inode structures. The rest of the 1761 * di_next_unlinked pointers in the on disk inode structures. The rest of the
1762 * data for the inodes is always logged through the inodes themselves rather 1762 * data for the inodes is always logged through the inodes themselves rather
1763 * than the inode buffer and is recovered in xlog_recover_inode_pass2(). 1763 * than the inode buffer and is recovered in xlog_recover_inode_pass2().
1764 * 1764 *
1765 * The only time when buffers full of inodes are fully recovered is when the 1765 * The only time when buffers full of inodes are fully recovered is when the
1766 * buffer is full of newly allocated inodes. In this case the buffer will 1766 * buffer is full of newly allocated inodes. In this case the buffer will
1767 * not be marked as an inode buffer and so will be sent to 1767 * not be marked as an inode buffer and so will be sent to
1768 * xlog_recover_do_reg_buffer() below during recovery. 1768 * xlog_recover_do_reg_buffer() below during recovery.
1769 */ 1769 */
1770 STATIC int 1770 STATIC int
1771 xlog_recover_do_inode_buffer( 1771 xlog_recover_do_inode_buffer(
1772 struct xfs_mount *mp, 1772 struct xfs_mount *mp,
1773 xlog_recover_item_t *item, 1773 xlog_recover_item_t *item,
1774 struct xfs_buf *bp, 1774 struct xfs_buf *bp,
1775 xfs_buf_log_format_t *buf_f) 1775 xfs_buf_log_format_t *buf_f)
1776 { 1776 {
1777 int i; 1777 int i;
1778 int item_index = 0; 1778 int item_index = 0;
1779 int bit = 0; 1779 int bit = 0;
1780 int nbits = 0; 1780 int nbits = 0;
1781 int reg_buf_offset = 0; 1781 int reg_buf_offset = 0;
1782 int reg_buf_bytes = 0; 1782 int reg_buf_bytes = 0;
1783 int next_unlinked_offset; 1783 int next_unlinked_offset;
1784 int inodes_per_buf; 1784 int inodes_per_buf;
1785 xfs_agino_t *logged_nextp; 1785 xfs_agino_t *logged_nextp;
1786 xfs_agino_t *buffer_nextp; 1786 xfs_agino_t *buffer_nextp;
1787 1787
1788 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); 1788 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
1789 1789
1790 inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog; 1790 inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog;
1791 for (i = 0; i < inodes_per_buf; i++) { 1791 for (i = 0; i < inodes_per_buf; i++) {
1792 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + 1792 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
1793 offsetof(xfs_dinode_t, di_next_unlinked); 1793 offsetof(xfs_dinode_t, di_next_unlinked);
1794 1794
1795 while (next_unlinked_offset >= 1795 while (next_unlinked_offset >=
1796 (reg_buf_offset + reg_buf_bytes)) { 1796 (reg_buf_offset + reg_buf_bytes)) {
1797 /* 1797 /*
1798 * The next di_next_unlinked field is beyond 1798 * The next di_next_unlinked field is beyond
1799 * the current logged region. Find the next 1799 * the current logged region. Find the next
1800 * logged region that contains or is beyond 1800 * logged region that contains or is beyond
1801 * the current di_next_unlinked field. 1801 * the current di_next_unlinked field.
1802 */ 1802 */
1803 bit += nbits; 1803 bit += nbits;
1804 bit = xfs_next_bit(buf_f->blf_data_map, 1804 bit = xfs_next_bit(buf_f->blf_data_map,
1805 buf_f->blf_map_size, bit); 1805 buf_f->blf_map_size, bit);
1806 1806
1807 /* 1807 /*
1808 * If there are no more logged regions in the 1808 * If there are no more logged regions in the
1809 * buffer, then we're done. 1809 * buffer, then we're done.
1810 */ 1810 */
1811 if (bit == -1) 1811 if (bit == -1)
1812 return 0; 1812 return 0;
1813 1813
1814 nbits = xfs_contig_bits(buf_f->blf_data_map, 1814 nbits = xfs_contig_bits(buf_f->blf_data_map,
1815 buf_f->blf_map_size, bit); 1815 buf_f->blf_map_size, bit);
1816 ASSERT(nbits > 0); 1816 ASSERT(nbits > 0);
1817 reg_buf_offset = bit << XFS_BLF_SHIFT; 1817 reg_buf_offset = bit << XFS_BLF_SHIFT;
1818 reg_buf_bytes = nbits << XFS_BLF_SHIFT; 1818 reg_buf_bytes = nbits << XFS_BLF_SHIFT;
1819 item_index++; 1819 item_index++;
1820 } 1820 }
1821 1821
1822 /* 1822 /*
1823 * If the current logged region starts after the current 1823 * If the current logged region starts after the current
1824 * di_next_unlinked field, then move on to the next 1824 * di_next_unlinked field, then move on to the next
1825 * di_next_unlinked field. 1825 * di_next_unlinked field.
1826 */ 1826 */
1827 if (next_unlinked_offset < reg_buf_offset) 1827 if (next_unlinked_offset < reg_buf_offset)
1828 continue; 1828 continue;
1829 1829
1830 ASSERT(item->ri_buf[item_index].i_addr != NULL); 1830 ASSERT(item->ri_buf[item_index].i_addr != NULL);
1831 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); 1831 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
1832 ASSERT((reg_buf_offset + reg_buf_bytes) <= 1832 ASSERT((reg_buf_offset + reg_buf_bytes) <=
1833 BBTOB(bp->b_io_length)); 1833 BBTOB(bp->b_io_length));
1834 1834
1835 /* 1835 /*
1836 * The current logged region contains a copy of the 1836 * The current logged region contains a copy of the
1837 * current di_next_unlinked field. Extract its value 1837 * current di_next_unlinked field. Extract its value
1838 * and copy it to the buffer copy. 1838 * and copy it to the buffer copy.
1839 */ 1839 */
1840 logged_nextp = item->ri_buf[item_index].i_addr + 1840 logged_nextp = item->ri_buf[item_index].i_addr +
1841 next_unlinked_offset - reg_buf_offset; 1841 next_unlinked_offset - reg_buf_offset;
1842 if (unlikely(*logged_nextp == 0)) { 1842 if (unlikely(*logged_nextp == 0)) {
1843 xfs_alert(mp, 1843 xfs_alert(mp,
1844 "Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). " 1844 "Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). "
1845 "Trying to replay bad (0) inode di_next_unlinked field.", 1845 "Trying to replay bad (0) inode di_next_unlinked field.",
1846 item, bp); 1846 item, bp);
1847 XFS_ERROR_REPORT("xlog_recover_do_inode_buf", 1847 XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
1848 XFS_ERRLEVEL_LOW, mp); 1848 XFS_ERRLEVEL_LOW, mp);
1849 return XFS_ERROR(EFSCORRUPTED); 1849 return XFS_ERROR(EFSCORRUPTED);
1850 } 1850 }
1851 1851
1852 buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp, 1852 buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
1853 next_unlinked_offset); 1853 next_unlinked_offset);
1854 *buffer_nextp = *logged_nextp; 1854 *buffer_nextp = *logged_nextp;
1855 } 1855 }
1856 1856
1857 return 0; 1857 return 0;
1858 } 1858 }
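For a concrete sense of the offsets walked above, assume (hypothetically) 256-byte inodes in a 4096-byte buffer: sixteen inodes per buffer, with inode i's di_next_unlinked field at the byte offset computed by the sketch below. The helper is illustrative and not part of the patch; it just restates the open-coded arithmetic in the loop.

static inline int
example_next_unlinked_offset(
	struct xfs_mount	*mp,
	int			i)
{
	/* byte offset of inode i's di_next_unlinked within the buffer */
	return i * mp->m_sb.sb_inodesize +
	       offsetof(xfs_dinode_t, di_next_unlinked);
}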
1859 1859
1860 /* 1860 /*
1861 * Perform a 'normal' buffer recovery. Each logged region of the 1861 * Perform a 'normal' buffer recovery. Each logged region of the
1862 * buffer should be copied over the corresponding region in the 1862 * buffer should be copied over the corresponding region in the
1863 * given buffer. The bitmap in the buf log format structure indicates 1863 * given buffer. The bitmap in the buf log format structure indicates
1864 * where to place the logged data. 1864 * where to place the logged data.
1865 */ 1865 */
1866 STATIC void 1866 STATIC void
1867 xlog_recover_do_reg_buffer( 1867 xlog_recover_do_reg_buffer(
1868 struct xfs_mount *mp, 1868 struct xfs_mount *mp,
1869 xlog_recover_item_t *item, 1869 xlog_recover_item_t *item,
1870 struct xfs_buf *bp, 1870 struct xfs_buf *bp,
1871 xfs_buf_log_format_t *buf_f) 1871 xfs_buf_log_format_t *buf_f)
1872 { 1872 {
1873 int i; 1873 int i;
1874 int bit; 1874 int bit;
1875 int nbits; 1875 int nbits;
1876 int error; 1876 int error;
1877 1877
1878 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); 1878 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
1879 1879
1880 bit = 0; 1880 bit = 0;
1881 i = 1; /* 0 is the buf format structure */ 1881 i = 1; /* 0 is the buf format structure */
1882 while (1) { 1882 while (1) {
1883 bit = xfs_next_bit(buf_f->blf_data_map, 1883 bit = xfs_next_bit(buf_f->blf_data_map,
1884 buf_f->blf_map_size, bit); 1884 buf_f->blf_map_size, bit);
1885 if (bit == -1) 1885 if (bit == -1)
1886 break; 1886 break;
1887 nbits = xfs_contig_bits(buf_f->blf_data_map, 1887 nbits = xfs_contig_bits(buf_f->blf_data_map,
1888 buf_f->blf_map_size, bit); 1888 buf_f->blf_map_size, bit);
1889 ASSERT(nbits > 0); 1889 ASSERT(nbits > 0);
1890 ASSERT(item->ri_buf[i].i_addr != NULL); 1890 ASSERT(item->ri_buf[i].i_addr != NULL);
1891 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); 1891 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
1892 ASSERT(BBTOB(bp->b_io_length) >= 1892 ASSERT(BBTOB(bp->b_io_length) >=
1893 ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); 1893 ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
1894 1894
1895 /* 1895 /*
1896 * Do a sanity check if this is a dquot buffer. Just checking 1896 * Do a sanity check if this is a dquot buffer. Just checking
1897 * the first dquot in the buffer should do. XXX This is 1897 * the first dquot in the buffer should do. XXX This is
1898 * probably a good thing to do for other buf types also. 1898 * probably a good thing to do for other buf types also.
1899 */ 1899 */
1900 error = 0; 1900 error = 0;
1901 if (buf_f->blf_flags & 1901 if (buf_f->blf_flags &
1902 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 1902 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
1903 if (item->ri_buf[i].i_addr == NULL) { 1903 if (item->ri_buf[i].i_addr == NULL) {
1904 xfs_alert(mp, 1904 xfs_alert(mp,
1905 "XFS: NULL dquot in %s.", __func__); 1905 "XFS: NULL dquot in %s.", __func__);
1906 goto next; 1906 goto next;
1907 } 1907 }
1908 if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) { 1908 if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
1909 xfs_alert(mp, 1909 xfs_alert(mp,
1910 "XFS: dquot too small (%d) in %s.", 1910 "XFS: dquot too small (%d) in %s.",
1911 item->ri_buf[i].i_len, __func__); 1911 item->ri_buf[i].i_len, __func__);
1912 goto next; 1912 goto next;
1913 } 1913 }
1914 error = xfs_qm_dqcheck(mp, item->ri_buf[i].i_addr, 1914 error = xfs_qm_dqcheck(mp, item->ri_buf[i].i_addr,
1915 -1, 0, XFS_QMOPT_DOWARN, 1915 -1, 0, XFS_QMOPT_DOWARN,
1916 "dquot_buf_recover"); 1916 "dquot_buf_recover");
1917 if (error) 1917 if (error)
1918 goto next; 1918 goto next;
1919 } 1919 }
1920 1920
1921 memcpy(xfs_buf_offset(bp, 1921 memcpy(xfs_buf_offset(bp,
1922 (uint)bit << XFS_BLF_SHIFT), /* dest */ 1922 (uint)bit << XFS_BLF_SHIFT), /* dest */
1923 item->ri_buf[i].i_addr, /* source */ 1923 item->ri_buf[i].i_addr, /* source */
1924 nbits<<XFS_BLF_SHIFT); /* length */ 1924 nbits<<XFS_BLF_SHIFT); /* length */
1925 next: 1925 next:
1926 i++; 1926 i++;
1927 bit += nbits; 1927 bit += nbits;
1928 } 1928 }
1929 1929
1930 /* Shouldn't be any more regions */ 1930 /* Shouldn't be any more regions */
1931 ASSERT(i == item->ri_total); 1931 ASSERT(i == item->ri_total);
1932 1932
1933 switch (buf_f->blf_flags & XFS_BLF_TYPE_MASK) { 1933 switch (buf_f->blf_flags & XFS_BLF_TYPE_MASK) {
1934 case XFS_BLF_BTREE_BUF: 1934 case XFS_BLF_BTREE_BUF:
1935 switch (be32_to_cpu(*(__be32 *)bp->b_addr)) { 1935 switch (be32_to_cpu(*(__be32 *)bp->b_addr)) {
1936 case XFS_ABTB_CRC_MAGIC: 1936 case XFS_ABTB_CRC_MAGIC:
1937 case XFS_ABTC_CRC_MAGIC: 1937 case XFS_ABTC_CRC_MAGIC:
1938 case XFS_ABTB_MAGIC: 1938 case XFS_ABTB_MAGIC:
1939 case XFS_ABTC_MAGIC: 1939 case XFS_ABTC_MAGIC:
1940 bp->b_ops = &xfs_allocbt_buf_ops; 1940 bp->b_ops = &xfs_allocbt_buf_ops;
1941 break; 1941 break;
1942 case XFS_IBT_CRC_MAGIC: 1942 case XFS_IBT_CRC_MAGIC:
1943 case XFS_IBT_MAGIC: 1943 case XFS_IBT_MAGIC:
1944 bp->b_ops = &xfs_inobt_buf_ops; 1944 bp->b_ops = &xfs_inobt_buf_ops;
1945 break; 1945 break;
1946 case XFS_BMAP_CRC_MAGIC: 1946 case XFS_BMAP_CRC_MAGIC:
1947 case XFS_BMAP_MAGIC: 1947 case XFS_BMAP_MAGIC:
1948 bp->b_ops = &xfs_bmbt_buf_ops; 1948 bp->b_ops = &xfs_bmbt_buf_ops;
1949 break; 1949 break;
1950 default: 1950 default:
1951 xfs_warn(mp, "Bad btree block magic!"); 1951 xfs_warn(mp, "Bad btree block magic!");
1952 ASSERT(0); 1952 ASSERT(0);
1953 break; 1953 break;
1954 } 1954 }
1955 break; 1955 break;
1956 case XFS_BLF_AGF_BUF: 1956 case XFS_BLF_AGF_BUF:
1957 if (*(__be32 *)bp->b_addr != cpu_to_be32(XFS_AGF_MAGIC)) { 1957 if (*(__be32 *)bp->b_addr != cpu_to_be32(XFS_AGF_MAGIC)) {
1958 xfs_warn(mp, "Bad AGF block magic!"); 1958 xfs_warn(mp, "Bad AGF block magic!");
1959 ASSERT(0); 1959 ASSERT(0);
1960 break; 1960 break;
1961 } 1961 }
1962 bp->b_ops = &xfs_agf_buf_ops; 1962 bp->b_ops = &xfs_agf_buf_ops;
1963 break; 1963 break;
1964 case XFS_BLF_AGFL_BUF: 1964 case XFS_BLF_AGFL_BUF:
1965 if (!xfs_sb_version_hascrc(&mp->m_sb)) 1965 if (!xfs_sb_version_hascrc(&mp->m_sb))
1966 break; 1966 break;
1967 if (*(__be32 *)bp->b_addr != cpu_to_be32(XFS_AGFL_MAGIC)) { 1967 if (*(__be32 *)bp->b_addr != cpu_to_be32(XFS_AGFL_MAGIC)) {
1968 xfs_warn(mp, "Bad AGFL block magic!"); 1968 xfs_warn(mp, "Bad AGFL block magic!");
1969 ASSERT(0); 1969 ASSERT(0);
1970 break; 1970 break;
1971 } 1971 }
1972 bp->b_ops = &xfs_agfl_buf_ops; 1972 bp->b_ops = &xfs_agfl_buf_ops;
1973 break; 1973 break;
1974 case XFS_BLF_AGI_BUF: 1974 case XFS_BLF_AGI_BUF:
1975 if (*(__be32 *)bp->b_addr != cpu_to_be32(XFS_AGI_MAGIC)) { 1975 if (*(__be32 *)bp->b_addr != cpu_to_be32(XFS_AGI_MAGIC)) {
1976 xfs_warn(mp, "Bad AGI block magic!"); 1976 xfs_warn(mp, "Bad AGI block magic!");
1977 ASSERT(0); 1977 ASSERT(0);
1978 break; 1978 break;
1979 } 1979 }
1980 bp->b_ops = &xfs_agi_buf_ops; 1980 bp->b_ops = &xfs_agi_buf_ops;
1981 break; 1981 break;
1982 case XFS_BLF_UDQUOT_BUF:
1983 case XFS_BLF_PDQUOT_BUF:
1984 case XFS_BLF_GDQUOT_BUF:
1985 if (*(__be16 *)bp->b_addr != cpu_to_be16(XFS_DQUOT_MAGIC)) {
1986 xfs_warn(mp, "Bad DQUOT block magic!");
1987 ASSERT(0);
1988 break;
1989 }
1990 bp->b_ops = &xfs_dquot_buf_ops;
1991 break;
1982 default: 1992 default:
1983 break; 1993 break;
1984 } 1994 }
1985 } 1995 }
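The bit/nbits pairs pulled out of blf_data_map above map onto byte ranges in the buffer through XFS_BLF_SHIFT; assuming each map bit covers one XFS_BLF_CHUNK-sized (128-byte) chunk, the translation amounts to the sketch below, which is illustrative and simply restates the shifts used in the copy loop.

static inline void
example_region_bytes(
	int	bit,		/* first set bit of the contiguous run */
	int	nbits,		/* number of bits in the run */
	uint	*offset,	/* byte offset of the logged region */
	uint	*length)	/* byte length of the logged region */
{
	*offset = (uint)bit << XFS_BLF_SHIFT;
	*length = (uint)nbits << XFS_BLF_SHIFT;
}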
1986 1996
1987 /* 1997 /*
1988 * Do some primitive error checking on ondisk dquot data structures. 1998 * Do some primitive error checking on ondisk dquot data structures.
1989 */ 1999 */
1990 int 2000 int
1991 xfs_qm_dqcheck( 2001 xfs_qm_dqcheck(
1992 struct xfs_mount *mp, 2002 struct xfs_mount *mp,
1993 xfs_disk_dquot_t *ddq, 2003 xfs_disk_dquot_t *ddq,
1994 xfs_dqid_t id, 2004 xfs_dqid_t id,
1995 uint type, /* used only when IO_dorepair is true */ 2005 uint type, /* used only when IO_dorepair is true */
1996 uint flags, 2006 uint flags,
1997 char *str) 2007 char *str)
1998 { 2008 {
1999 xfs_dqblk_t *d = (xfs_dqblk_t *)ddq; 2009 xfs_dqblk_t *d = (xfs_dqblk_t *)ddq;
2000 int errs = 0; 2010 int errs = 0;
2001 2011
2002 /* 2012 /*
2003 * We can encounter an uninitialized dquot buffer for 2 reasons: 2013 * We can encounter an uninitialized dquot buffer for 2 reasons:
2004 * 1. If we crash while deleting the quotainode(s), and those blks got 2014 * 1. If we crash while deleting the quotainode(s), and those blks got
2005 * used for user data. This is because we take the path of regular 2015 * used for user data. This is because we take the path of regular
2006 * file deletion; however, the size field of quotainodes is never 2016 * file deletion; however, the size field of quotainodes is never
2007 * updated, so all the tricks that we play in itruncate_finish 2017 * updated, so all the tricks that we play in itruncate_finish
2008 * don't quite matter. 2018 * don't quite matter.
2009 * 2019 *
2010 * 2. We don't play the quota buffers when there's a quotaoff logitem. 2020 * 2. We don't play the quota buffers when there's a quotaoff logitem.
2011 * But the allocation will be replayed so we'll end up with an 2021 * But the allocation will be replayed so we'll end up with an
2012 * uninitialized quota block. 2022 * uninitialized quota block.
2013 * 2023 *
2014 * This is all fine; things are still consistent, and we haven't lost 2024 * This is all fine; things are still consistent, and we haven't lost
2015 * any quota information. Just don't complain about bad dquot blks. 2025 * any quota information. Just don't complain about bad dquot blks.
2016 */ 2026 */
2017 if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) { 2027 if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) {
2018 if (flags & XFS_QMOPT_DOWARN) 2028 if (flags & XFS_QMOPT_DOWARN)
2019 xfs_alert(mp, 2029 xfs_alert(mp,
2020 "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x", 2030 "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
2021 str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC); 2031 str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
2022 errs++; 2032 errs++;
2023 } 2033 }
2024 if (ddq->d_version != XFS_DQUOT_VERSION) { 2034 if (ddq->d_version != XFS_DQUOT_VERSION) {
2025 if (flags & XFS_QMOPT_DOWARN) 2035 if (flags & XFS_QMOPT_DOWARN)
2026 xfs_alert(mp, 2036 xfs_alert(mp,
2027 "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x", 2037 "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
2028 str, id, ddq->d_version, XFS_DQUOT_VERSION); 2038 str, id, ddq->d_version, XFS_DQUOT_VERSION);
2029 errs++; 2039 errs++;
2030 } 2040 }
2031 2041
2032 if (ddq->d_flags != XFS_DQ_USER && 2042 if (ddq->d_flags != XFS_DQ_USER &&
2033 ddq->d_flags != XFS_DQ_PROJ && 2043 ddq->d_flags != XFS_DQ_PROJ &&
2034 ddq->d_flags != XFS_DQ_GROUP) { 2044 ddq->d_flags != XFS_DQ_GROUP) {
2035 if (flags & XFS_QMOPT_DOWARN) 2045 if (flags & XFS_QMOPT_DOWARN)
2036 xfs_alert(mp, 2046 xfs_alert(mp,
2037 "%s : XFS dquot ID 0x%x, unknown flags 0x%x", 2047 "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
2038 str, id, ddq->d_flags); 2048 str, id, ddq->d_flags);
2039 errs++; 2049 errs++;
2040 } 2050 }
2041 2051
2042 if (id != -1 && id != be32_to_cpu(ddq->d_id)) { 2052 if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
2043 if (flags & XFS_QMOPT_DOWARN) 2053 if (flags & XFS_QMOPT_DOWARN)
2044 xfs_alert(mp, 2054 xfs_alert(mp,
2045 "%s : ondisk-dquot 0x%p, ID mismatch: " 2055 "%s : ondisk-dquot 0x%p, ID mismatch: "
2046 "0x%x expected, found id 0x%x", 2056 "0x%x expected, found id 0x%x",
2047 str, ddq, id, be32_to_cpu(ddq->d_id)); 2057 str, ddq, id, be32_to_cpu(ddq->d_id));
2048 errs++; 2058 errs++;
2049 } 2059 }
2050 2060
2051 if (!errs && ddq->d_id) { 2061 if (!errs && ddq->d_id) {
2052 if (ddq->d_blk_softlimit && 2062 if (ddq->d_blk_softlimit &&
2053 be64_to_cpu(ddq->d_bcount) > 2063 be64_to_cpu(ddq->d_bcount) >
2054 be64_to_cpu(ddq->d_blk_softlimit)) { 2064 be64_to_cpu(ddq->d_blk_softlimit)) {
2055 if (!ddq->d_btimer) { 2065 if (!ddq->d_btimer) {
2056 if (flags & XFS_QMOPT_DOWARN) 2066 if (flags & XFS_QMOPT_DOWARN)
2057 xfs_alert(mp, 2067 xfs_alert(mp,
2058 "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED", 2068 "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED",
2059 str, (int)be32_to_cpu(ddq->d_id), ddq); 2069 str, (int)be32_to_cpu(ddq->d_id), ddq);
2060 errs++; 2070 errs++;
2061 } 2071 }
2062 } 2072 }
2063 if (ddq->d_ino_softlimit && 2073 if (ddq->d_ino_softlimit &&
2064 be64_to_cpu(ddq->d_icount) > 2074 be64_to_cpu(ddq->d_icount) >
2065 be64_to_cpu(ddq->d_ino_softlimit)) { 2075 be64_to_cpu(ddq->d_ino_softlimit)) {
2066 if (!ddq->d_itimer) { 2076 if (!ddq->d_itimer) {
2067 if (flags & XFS_QMOPT_DOWARN) 2077 if (flags & XFS_QMOPT_DOWARN)
2068 xfs_alert(mp, 2078 xfs_alert(mp,
2069 "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED", 2079 "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED",
2070 str, (int)be32_to_cpu(ddq->d_id), ddq); 2080 str, (int)be32_to_cpu(ddq->d_id), ddq);
2071 errs++; 2081 errs++;
2072 } 2082 }
2073 } 2083 }
2074 if (ddq->d_rtb_softlimit && 2084 if (ddq->d_rtb_softlimit &&
2075 be64_to_cpu(ddq->d_rtbcount) > 2085 be64_to_cpu(ddq->d_rtbcount) >
2076 be64_to_cpu(ddq->d_rtb_softlimit)) { 2086 be64_to_cpu(ddq->d_rtb_softlimit)) {
2077 if (!ddq->d_rtbtimer) { 2087 if (!ddq->d_rtbtimer) {
2078 if (flags & XFS_QMOPT_DOWARN) 2088 if (flags & XFS_QMOPT_DOWARN)
2079 xfs_alert(mp, 2089 xfs_alert(mp,
2080 "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED", 2090 "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED",
2081 str, (int)be32_to_cpu(ddq->d_id), ddq); 2091 str, (int)be32_to_cpu(ddq->d_id), ddq);
2082 errs++; 2092 errs++;
2083 } 2093 }
2084 } 2094 }
2085 } 2095 }
2086 2096
2087 if (!errs || !(flags & XFS_QMOPT_DQREPAIR)) 2097 if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
2088 return errs; 2098 return errs;
2089 2099
2090 if (flags & XFS_QMOPT_DOWARN) 2100 if (flags & XFS_QMOPT_DOWARN)
2091 xfs_notice(mp, "Re-initializing dquot ID 0x%x", id); 2101 xfs_notice(mp, "Re-initializing dquot ID 0x%x", id);
2092 2102
2093 /* 2103 /*
2094 * Typically, a repair is only requested by quotacheck. 2104 * Typically, a repair is only requested by quotacheck.
2095 */ 2105 */
2096 ASSERT(id != -1); 2106 ASSERT(id != -1);
2097 ASSERT(flags & XFS_QMOPT_DQREPAIR); 2107 ASSERT(flags & XFS_QMOPT_DQREPAIR);
2098 memset(d, 0, sizeof(xfs_dqblk_t)); 2108 memset(d, 0, sizeof(xfs_dqblk_t));
2099 2109
2100 d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); 2110 d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
2101 d->dd_diskdq.d_version = XFS_DQUOT_VERSION; 2111 d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
2102 d->dd_diskdq.d_flags = type; 2112 d->dd_diskdq.d_flags = type;
2103 d->dd_diskdq.d_id = cpu_to_be32(id); 2113 d->dd_diskdq.d_id = cpu_to_be32(id);
2104 2114
2105 return errs; 2115 return errs;
2106 } 2116 }
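xfs_qm_dqcheck() only validates the plain dquot fields; the point of this series is that, on CRC-enabled (v5) filesystems, each xfs_dqblk additionally carries a UUID and a CRC in its formerly reserved space. The sketch below is a hedged guess at what the per-dquot check inside the new dquot buffer verifier amounts to; the dd_crc and dd_uuid field names and the exact CRC coverage are assumptions based on the commit description, not copied from the hunks shown here.

static bool
example_dqblk_crc_ok(
	struct xfs_mount	*mp,
	struct xfs_dqblk	*d)
{
	if (!xfs_sb_version_hascrc(&mp->m_sb))
		return true;		/* no CRCs on pre-v5 superblocks */

	/* CRC is assumed to cover the whole dqblk, excluding itself */
	if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
			      offsetof(struct xfs_dqblk, dd_crc)))
		return false;

	/* the embedded UUID is assumed to match the filesystem UUID */
	return uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid);
}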
2107 2117
2108 /* 2118 /*
2109 * Perform a dquot buffer recovery. 2119 * Perform a dquot buffer recovery.
2110 * Simple algorithm: if we have found a QUOTAOFF logitem of the same type 2120 * Simple algorithm: if we have found a QUOTAOFF logitem of the same type
2111 * (ie. USR or GRP), then just toss this buffer away; don't recover it. 2121 * (ie. USR or GRP), then just toss this buffer away; don't recover it.
2112 * Else, treat it as a regular buffer and do recovery. 2122 * Else, treat it as a regular buffer and do recovery.
2113 */ 2123 */
2114 STATIC void 2124 STATIC void
2115 xlog_recover_do_dquot_buffer( 2125 xlog_recover_do_dquot_buffer(
2116 struct xfs_mount *mp, 2126 struct xfs_mount *mp,
2117 struct xlog *log, 2127 struct xlog *log,
2118 struct xlog_recover_item *item, 2128 struct xlog_recover_item *item,
2119 struct xfs_buf *bp, 2129 struct xfs_buf *bp,
2120 struct xfs_buf_log_format *buf_f) 2130 struct xfs_buf_log_format *buf_f)
2121 { 2131 {
2122 uint type; 2132 uint type;
2123 2133
2124 trace_xfs_log_recover_buf_dquot_buf(log, buf_f); 2134 trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
2125 2135
2126 /* 2136 /*
2127 * Filesystems are required to send in quota flags at mount time. 2137 * Filesystems are required to send in quota flags at mount time.
2128 */ 2138 */
2129 if (mp->m_qflags == 0) { 2139 if (mp->m_qflags == 0) {
2130 return; 2140 return;
2131 } 2141 }
2132 2142
2133 type = 0; 2143 type = 0;
2134 if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) 2144 if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
2135 type |= XFS_DQ_USER; 2145 type |= XFS_DQ_USER;
2136 if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF) 2146 if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
2137 type |= XFS_DQ_PROJ; 2147 type |= XFS_DQ_PROJ;
2138 if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF) 2148 if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
2139 type |= XFS_DQ_GROUP; 2149 type |= XFS_DQ_GROUP;
2140 /* 2150 /*
2141 * This type of quota was turned off, so ignore this buffer 2151 * This type of quota was turned off, so ignore this buffer
2142 */ 2152 */
2143 if (log->l_quotaoffs_flag & type) 2153 if (log->l_quotaoffs_flag & type)
2144 return; 2154 return;
2145 2155
2146 xlog_recover_do_reg_buffer(mp, item, bp, buf_f); 2156 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2147 } 2157 }
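The flag-to-type translation above is mechanical; a small helper expressing the same mapping (purely illustrative, the function does not exist in the patch) would be:

static uint
example_dquot_buf_type(
	ushort	blf_flags)
{
	uint	type = 0;

	/* translate buf log format flags into dquot type bits */
	if (blf_flags & XFS_BLF_UDQUOT_BUF)
		type |= XFS_DQ_USER;
	if (blf_flags & XFS_BLF_PDQUOT_BUF)
		type |= XFS_DQ_PROJ;
	if (blf_flags & XFS_BLF_GDQUOT_BUF)
		type |= XFS_DQ_GROUP;
	return type;
}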
2148 2158
2149 /* 2159 /*
2150 * This routine replays a modification made to a buffer at runtime. 2160 * This routine replays a modification made to a buffer at runtime.
2151 * There are actually two types of buffer, regular and inode, which 2161 * There are actually two types of buffer, regular and inode, which
2152 * are handled differently. Inode buffers are handled differently 2162 * are handled differently. Inode buffers are handled differently
2153 * in that we only recover a specific set of data from them, namely 2163 * in that we only recover a specific set of data from them, namely
2154 * the inode di_next_unlinked fields. This is because all other inode 2164 * the inode di_next_unlinked fields. This is because all other inode
2155 * data is actually logged via inode records and any data we replay 2165 * data is actually logged via inode records and any data we replay
2156 * here which overlaps that may be stale. 2166 * here which overlaps that may be stale.
2157 * 2167 *
2158 * When meta-data buffers are freed at run time we log a buffer item 2168 * When meta-data buffers are freed at run time we log a buffer item
2159 * with the XFS_BLF_CANCEL bit set to indicate that previous copies 2169 * with the XFS_BLF_CANCEL bit set to indicate that previous copies
2160 * of the buffer in the log should not be replayed at recovery time. 2170 * of the buffer in the log should not be replayed at recovery time.
2161 * This is so that if the blocks covered by the buffer are reused for 2171 * This is so that if the blocks covered by the buffer are reused for
2162 * file data before we crash we don't end up replaying old, freed 2172 * file data before we crash we don't end up replaying old, freed
2163 * meta-data into a user's file. 2173 * meta-data into a user's file.
2164 * 2174 *
2165 * To handle the cancellation of buffer log items, we make two passes 2175 * To handle the cancellation of buffer log items, we make two passes
2166 * over the log during recovery. During the first we build a table of 2176 * over the log during recovery. During the first we build a table of
2167 * those buffers which have been cancelled, and during the second we 2177 * those buffers which have been cancelled, and during the second we
2168 * only replay those buffers which do not have corresponding cancel 2178 * only replay those buffers which do not have corresponding cancel
2169 * records in the table. See xlog_recover_buffer_pass[1,2] 2179 * records in the table. See xlog_recover_buffer_pass[1,2]
2170 * for more details on the implementation of the table of cancel records. 2180 * for more details on the implementation of the table of cancel records.
2171 */ 2181 */
2172 STATIC int 2182 STATIC int
2173 xlog_recover_buffer_pass2( 2183 xlog_recover_buffer_pass2(
2174 struct xlog *log, 2184 struct xlog *log,
2175 struct list_head *buffer_list, 2185 struct list_head *buffer_list,
2176 struct xlog_recover_item *item) 2186 struct xlog_recover_item *item)
2177 { 2187 {
2178 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; 2188 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
2179 xfs_mount_t *mp = log->l_mp; 2189 xfs_mount_t *mp = log->l_mp;
2180 xfs_buf_t *bp; 2190 xfs_buf_t *bp;
2181 int error; 2191 int error;
2182 uint buf_flags; 2192 uint buf_flags;
2183 2193
2184 /* 2194 /*
2185 * In this pass we only want to recover all the buffers which have 2195 * In this pass we only want to recover all the buffers which have
2186 * not been cancelled and are not cancellation buffers themselves. 2196 * not been cancelled and are not cancellation buffers themselves.
2187 */ 2197 */
2188 if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno, 2198 if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
2189 buf_f->blf_len, buf_f->blf_flags)) { 2199 buf_f->blf_len, buf_f->blf_flags)) {
2190 trace_xfs_log_recover_buf_cancel(log, buf_f); 2200 trace_xfs_log_recover_buf_cancel(log, buf_f);
2191 return 0; 2201 return 0;
2192 } 2202 }
2193 2203
2194 trace_xfs_log_recover_buf_recover(log, buf_f); 2204 trace_xfs_log_recover_buf_recover(log, buf_f);
2195 2205
2196 buf_flags = 0; 2206 buf_flags = 0;
2197 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) 2207 if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
2198 buf_flags |= XBF_UNMAPPED; 2208 buf_flags |= XBF_UNMAPPED;
2199 2209
2200 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, 2210 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2201 buf_flags, NULL); 2211 buf_flags, NULL);
2202 if (!bp) 2212 if (!bp)
2203 return XFS_ERROR(ENOMEM); 2213 return XFS_ERROR(ENOMEM);
2204 error = bp->b_error; 2214 error = bp->b_error;
2205 if (error) { 2215 if (error) {
2206 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)"); 2216 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
2207 xfs_buf_relse(bp); 2217 xfs_buf_relse(bp);
2208 return error; 2218 return error;
2209 } 2219 }
2210 2220
2211 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { 2221 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
2212 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2222 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2213 } else if (buf_f->blf_flags & 2223 } else if (buf_f->blf_flags &
2214 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 2224 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2215 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 2225 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2216 } else { 2226 } else {
2217 xlog_recover_do_reg_buffer(mp, item, bp, buf_f); 2227 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2218 } 2228 }
2219 if (error) 2229 if (error)
2220 return XFS_ERROR(error); 2230 return XFS_ERROR(error);
2221 2231
2222 /* 2232 /*
2223 * Perform delayed write on the buffer. Asynchronous writes will be 2233 * Perform delayed write on the buffer. Asynchronous writes will be
2224 * slower when taking into account all the buffers to be flushed. 2234 * slower when taking into account all the buffers to be flushed.
2225 * 2235 *
2226 * Also make sure that only inode buffers with good sizes stay in 2236 * Also make sure that only inode buffers with good sizes stay in
2227 * the buffer cache. The kernel moves inodes in buffers of 1 block 2237 * the buffer cache. The kernel moves inodes in buffers of 1 block
2228 * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode 2238 * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode
2229 * buffers in the log can be a different size if the log was generated 2239 * buffers in the log can be a different size if the log was generated
2230 * by an older kernel using unclustered inode buffers or a newer kernel 2240 * by an older kernel using unclustered inode buffers or a newer kernel
2231 * running with a different inode cluster size. Regardless, if the 2241 * running with a different inode cluster size. Regardless, if the
2232 * inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE) 2242 * inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE)
2233 * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep 2243 * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep
2234 * the buffer out of the buffer cache so that the buffer won't 2244 * the buffer out of the buffer cache so that the buffer won't
2235 * overlap with future reads of those inodes. 2245 * overlap with future reads of those inodes.
2236 */ 2246 */
2237 if (XFS_DINODE_MAGIC == 2247 if (XFS_DINODE_MAGIC ==
2238 be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && 2248 be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
2239 (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize, 2249 (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize,
2240 (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { 2250 (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
2241 xfs_buf_stale(bp); 2251 xfs_buf_stale(bp);
2242 error = xfs_bwrite(bp); 2252 error = xfs_bwrite(bp);
2243 } else { 2253 } else {
2244 ASSERT(bp->b_target->bt_mount == mp); 2254 ASSERT(bp->b_target->bt_mount == mp);
2245 bp->b_iodone = xlog_recover_iodone; 2255 bp->b_iodone = xlog_recover_iodone;
2246 xfs_buf_delwri_queue(bp, buffer_list); 2256 xfs_buf_delwri_queue(bp, buffer_list);
2247 } 2257 }
2248 2258
2249 xfs_buf_relse(bp); 2259 xfs_buf_relse(bp);
2250 return error; 2260 return error;
2251 } 2261 }
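The inode-buffer size test near the end of the function can be read as a predicate: with hypothetical 4096-byte filesystem blocks and an 8192-byte inode cluster, only 8192-byte inode buffers are allowed to stay cached, and anything else is written out stale. A sketch of that predicate (names are illustrative, the logic mirrors the condition above):

static bool
example_inode_buf_cacheable(
	struct xfs_mount	*mp,
	struct xfs_buf		*bp)
{
	/* keep only buffers sized MAX(blocksize, inode cluster size) */
	return BBTOB(bp->b_io_length) ==
	       MAX(mp->m_sb.sb_blocksize,
		   (__uint32_t)XFS_INODE_CLUSTER_SIZE(mp));
}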
2252 2262
2253 STATIC int 2263 STATIC int
2254 xlog_recover_inode_pass2( 2264 xlog_recover_inode_pass2(
2255 struct xlog *log, 2265 struct xlog *log,
2256 struct list_head *buffer_list, 2266 struct list_head *buffer_list,
2257 struct xlog_recover_item *item) 2267 struct xlog_recover_item *item)
2258 { 2268 {
2259 xfs_inode_log_format_t *in_f; 2269 xfs_inode_log_format_t *in_f;
2260 xfs_mount_t *mp = log->l_mp; 2270 xfs_mount_t *mp = log->l_mp;
2261 xfs_buf_t *bp; 2271 xfs_buf_t *bp;
2262 xfs_dinode_t *dip; 2272 xfs_dinode_t *dip;
2263 int len; 2273 int len;
2264 xfs_caddr_t src; 2274 xfs_caddr_t src;
2265 xfs_caddr_t dest; 2275 xfs_caddr_t dest;
2266 int error; 2276 int error;
2267 int attr_index; 2277 int attr_index;
2268 uint fields; 2278 uint fields;
2269 xfs_icdinode_t *dicp; 2279 xfs_icdinode_t *dicp;
2270 int need_free = 0; 2280 int need_free = 0;
2271 2281
2272 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { 2282 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
2273 in_f = item->ri_buf[0].i_addr; 2283 in_f = item->ri_buf[0].i_addr;
2274 } else { 2284 } else {
2275 in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP); 2285 in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP);
2276 need_free = 1; 2286 need_free = 1;
2277 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); 2287 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
2278 if (error) 2288 if (error)
2279 goto error; 2289 goto error;
2280 } 2290 }
2281 2291
2282 /* 2292 /*
2283 * Inode buffers can be freed; look out for that 2293 * Inode buffers can be freed; look out for that
2284 * and do not replay the inode. 2294 * and do not replay the inode.
2285 */ 2295 */
2286 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno, 2296 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
2287 in_f->ilf_len, 0)) { 2297 in_f->ilf_len, 0)) {
2288 error = 0; 2298 error = 0;
2289 trace_xfs_log_recover_inode_cancel(log, in_f); 2299 trace_xfs_log_recover_inode_cancel(log, in_f);
2290 goto error; 2300 goto error;
2291 } 2301 }
2292 trace_xfs_log_recover_inode_recover(log, in_f); 2302 trace_xfs_log_recover_inode_recover(log, in_f);
2293 2303
2294 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0, 2304 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
2295 NULL); 2305 NULL);
2296 if (!bp) { 2306 if (!bp) {
2297 error = ENOMEM; 2307 error = ENOMEM;
2298 goto error; 2308 goto error;
2299 } 2309 }
2300 error = bp->b_error; 2310 error = bp->b_error;
2301 if (error) { 2311 if (error) {
2302 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)"); 2312 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)");
2303 xfs_buf_relse(bp); 2313 xfs_buf_relse(bp);
2304 goto error; 2314 goto error;
2305 } 2315 }
2306 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); 2316 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
2307 dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); 2317 dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
2308 2318
2309 /* 2319 /*
2310 * Make sure the place we're flushing out to really looks 2320 * Make sure the place we're flushing out to really looks
2311 * like an inode! 2321 * like an inode!
2312 */ 2322 */
2313 if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) { 2323 if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
2314 xfs_buf_relse(bp); 2324 xfs_buf_relse(bp);
2315 xfs_alert(mp, 2325 xfs_alert(mp,
2316 "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld", 2326 "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld",
2317 __func__, dip, bp, in_f->ilf_ino); 2327 __func__, dip, bp, in_f->ilf_ino);
2318 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", 2328 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
2319 XFS_ERRLEVEL_LOW, mp); 2329 XFS_ERRLEVEL_LOW, mp);
2320 error = EFSCORRUPTED; 2330 error = EFSCORRUPTED;
2321 goto error; 2331 goto error;
2322 } 2332 }
2323 dicp = item->ri_buf[1].i_addr; 2333 dicp = item->ri_buf[1].i_addr;
2324 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { 2334 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
2325 xfs_buf_relse(bp); 2335 xfs_buf_relse(bp);
2326 xfs_alert(mp, 2336 xfs_alert(mp,
2327 "%s: Bad inode log record, rec ptr 0x%p, ino %Ld", 2337 "%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
2328 __func__, item, in_f->ilf_ino); 2338 __func__, item, in_f->ilf_ino);
2329 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", 2339 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
2330 XFS_ERRLEVEL_LOW, mp); 2340 XFS_ERRLEVEL_LOW, mp);
2331 error = EFSCORRUPTED; 2341 error = EFSCORRUPTED;
2332 goto error; 2342 goto error;
2333 } 2343 }
2334 2344
2335 /* Skip replay when the on disk inode is newer than the log one */ 2345 /* Skip replay when the on disk inode is newer than the log one */
2336 if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) { 2346 if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
2337 /* 2347 /*
2338 * Deal with the wrap case, DI_MAX_FLUSH is less 2348 * Deal with the wrap case, DI_MAX_FLUSH is less
2339 * than smaller numbers 2349 * than smaller numbers
2340 */ 2350 */
2341 if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && 2351 if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
2342 dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) { 2352 dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
2343 /* do nothing */ 2353 /* do nothing */
2344 } else { 2354 } else {
2345 xfs_buf_relse(bp); 2355 xfs_buf_relse(bp);
2346 trace_xfs_log_recover_inode_skip(log, in_f); 2356 trace_xfs_log_recover_inode_skip(log, in_f);
2347 error = 0; 2357 error = 0;
2348 goto error; 2358 goto error;
2349 } 2359 }
2350 } 2360 }
2351 /* Take the opportunity to reset the flush iteration count */ 2361 /* Take the opportunity to reset the flush iteration count */
2352 dicp->di_flushiter = 0; 2362 dicp->di_flushiter = 0;
2353 2363
2354 if (unlikely(S_ISREG(dicp->di_mode))) { 2364 if (unlikely(S_ISREG(dicp->di_mode))) {
2355 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2365 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2356 (dicp->di_format != XFS_DINODE_FMT_BTREE)) { 2366 (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
2357 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", 2367 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
2358 XFS_ERRLEVEL_LOW, mp, dicp); 2368 XFS_ERRLEVEL_LOW, mp, dicp);
2359 xfs_buf_relse(bp); 2369 xfs_buf_relse(bp);
2360 xfs_alert(mp, 2370 xfs_alert(mp,
2361 "%s: Bad regular inode log record, rec ptr 0x%p, " 2371 "%s: Bad regular inode log record, rec ptr 0x%p, "
2362 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2372 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2363 __func__, item, dip, bp, in_f->ilf_ino); 2373 __func__, item, dip, bp, in_f->ilf_ino);
2364 error = EFSCORRUPTED; 2374 error = EFSCORRUPTED;
2365 goto error; 2375 goto error;
2366 } 2376 }
2367 } else if (unlikely(S_ISDIR(dicp->di_mode))) { 2377 } else if (unlikely(S_ISDIR(dicp->di_mode))) {
2368 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2378 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2369 (dicp->di_format != XFS_DINODE_FMT_BTREE) && 2379 (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
2370 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { 2380 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
2371 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", 2381 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
2372 XFS_ERRLEVEL_LOW, mp, dicp); 2382 XFS_ERRLEVEL_LOW, mp, dicp);
2373 xfs_buf_relse(bp); 2383 xfs_buf_relse(bp);
2374 xfs_alert(mp, 2384 xfs_alert(mp,
2375 "%s: Bad dir inode log record, rec ptr 0x%p, " 2385 "%s: Bad dir inode log record, rec ptr 0x%p, "
2376 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2386 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2377 __func__, item, dip, bp, in_f->ilf_ino); 2387 __func__, item, dip, bp, in_f->ilf_ino);
2378 error = EFSCORRUPTED; 2388 error = EFSCORRUPTED;
2379 goto error; 2389 goto error;
2380 } 2390 }
2381 } 2391 }
2382 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ 2392 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
2383 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", 2393 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
2384 XFS_ERRLEVEL_LOW, mp, dicp); 2394 XFS_ERRLEVEL_LOW, mp, dicp);
2385 xfs_buf_relse(bp); 2395 xfs_buf_relse(bp);
2386 xfs_alert(mp, 2396 xfs_alert(mp,
2387 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " 2397 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2388 "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", 2398 "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2389 __func__, item, dip, bp, in_f->ilf_ino, 2399 __func__, item, dip, bp, in_f->ilf_ino,
2390 dicp->di_nextents + dicp->di_anextents, 2400 dicp->di_nextents + dicp->di_anextents,
2391 dicp->di_nblocks); 2401 dicp->di_nblocks);
2392 error = EFSCORRUPTED; 2402 error = EFSCORRUPTED;
2393 goto error; 2403 goto error;
2394 } 2404 }
2395 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { 2405 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
2396 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", 2406 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
2397 XFS_ERRLEVEL_LOW, mp, dicp); 2407 XFS_ERRLEVEL_LOW, mp, dicp);
2398 xfs_buf_relse(bp); 2408 xfs_buf_relse(bp);
2399 xfs_alert(mp, 2409 xfs_alert(mp,
2400 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " 2410 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2401 "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__, 2411 "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
2402 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff); 2412 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
2403 error = EFSCORRUPTED; 2413 error = EFSCORRUPTED;
2404 goto error; 2414 goto error;
2405 } 2415 }
2406 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { 2416 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
2407 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", 2417 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
2408 XFS_ERRLEVEL_LOW, mp, dicp); 2418 XFS_ERRLEVEL_LOW, mp, dicp);
2409 xfs_buf_relse(bp); 2419 xfs_buf_relse(bp);
2410 xfs_alert(mp, 2420 xfs_alert(mp,
2411 "%s: Bad inode log record length %d, rec ptr 0x%p", 2421 "%s: Bad inode log record length %d, rec ptr 0x%p",
2412 __func__, item->ri_buf[1].i_len, item); 2422 __func__, item->ri_buf[1].i_len, item);
2413 error = EFSCORRUPTED; 2423 error = EFSCORRUPTED;
2414 goto error; 2424 goto error;
2415 } 2425 }
2416 2426
2417 /* The core is in in-core format */ 2427 /* The core is in in-core format */
2418 xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr); 2428 xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr);
2419 2429
2420 /* the rest is in on-disk format */ 2430 /* the rest is in on-disk format */
2421 if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) { 2431 if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) {
2422 memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode), 2432 memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode),
2423 item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode), 2433 item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode),
2424 item->ri_buf[1].i_len - sizeof(struct xfs_icdinode)); 2434 item->ri_buf[1].i_len - sizeof(struct xfs_icdinode));
2425 } 2435 }
2426 2436
2427 fields = in_f->ilf_fields; 2437 fields = in_f->ilf_fields;
2428 switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) { 2438 switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
2429 case XFS_ILOG_DEV: 2439 case XFS_ILOG_DEV:
2430 xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev); 2440 xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
2431 break; 2441 break;
2432 case XFS_ILOG_UUID: 2442 case XFS_ILOG_UUID:
2433 memcpy(XFS_DFORK_DPTR(dip), 2443 memcpy(XFS_DFORK_DPTR(dip),
2434 &in_f->ilf_u.ilfu_uuid, 2444 &in_f->ilf_u.ilfu_uuid,
2435 sizeof(uuid_t)); 2445 sizeof(uuid_t));
2436 break; 2446 break;
2437 } 2447 }
2438 2448
2439 if (in_f->ilf_size == 2) 2449 if (in_f->ilf_size == 2)
2440 goto write_inode_buffer; 2450 goto write_inode_buffer;
2441 len = item->ri_buf[2].i_len; 2451 len = item->ri_buf[2].i_len;
2442 src = item->ri_buf[2].i_addr; 2452 src = item->ri_buf[2].i_addr;
2443 ASSERT(in_f->ilf_size <= 4); 2453 ASSERT(in_f->ilf_size <= 4);
2444 ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK)); 2454 ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
2445 ASSERT(!(fields & XFS_ILOG_DFORK) || 2455 ASSERT(!(fields & XFS_ILOG_DFORK) ||
2446 (len == in_f->ilf_dsize)); 2456 (len == in_f->ilf_dsize));
2447 2457
2448 switch (fields & XFS_ILOG_DFORK) { 2458 switch (fields & XFS_ILOG_DFORK) {
2449 case XFS_ILOG_DDATA: 2459 case XFS_ILOG_DDATA:
2450 case XFS_ILOG_DEXT: 2460 case XFS_ILOG_DEXT:
2451 memcpy(XFS_DFORK_DPTR(dip), src, len); 2461 memcpy(XFS_DFORK_DPTR(dip), src, len);
2452 break; 2462 break;
2453 2463
2454 case XFS_ILOG_DBROOT: 2464 case XFS_ILOG_DBROOT:
2455 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len, 2465 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
2456 (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip), 2466 (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip),
2457 XFS_DFORK_DSIZE(dip, mp)); 2467 XFS_DFORK_DSIZE(dip, mp));
2458 break; 2468 break;
2459 2469
2460 default: 2470 default:
2461 /* 2471 /*
2462 * There are no data fork flags set. 2472 * There are no data fork flags set.
2463 */ 2473 */
2464 ASSERT((fields & XFS_ILOG_DFORK) == 0); 2474 ASSERT((fields & XFS_ILOG_DFORK) == 0);
2465 break; 2475 break;
2466 } 2476 }
2467 2477
2468 /* 2478 /*
2469 * If we logged any attribute data, recover it. There may or 2479 * If we logged any attribute data, recover it. There may or
2470 * may not have been any other non-core data logged in this 2480 * may not have been any other non-core data logged in this
2471 * transaction. 2481 * transaction.
2472 */ 2482 */
2473 if (in_f->ilf_fields & XFS_ILOG_AFORK) { 2483 if (in_f->ilf_fields & XFS_ILOG_AFORK) {
2474 if (in_f->ilf_fields & XFS_ILOG_DFORK) { 2484 if (in_f->ilf_fields & XFS_ILOG_DFORK) {
2475 attr_index = 3; 2485 attr_index = 3;
2476 } else { 2486 } else {
2477 attr_index = 2; 2487 attr_index = 2;
2478 } 2488 }
2479 len = item->ri_buf[attr_index].i_len; 2489 len = item->ri_buf[attr_index].i_len;
2480 src = item->ri_buf[attr_index].i_addr; 2490 src = item->ri_buf[attr_index].i_addr;
2481 ASSERT(len == in_f->ilf_asize); 2491 ASSERT(len == in_f->ilf_asize);
2482 2492
2483 switch (in_f->ilf_fields & XFS_ILOG_AFORK) { 2493 switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
2484 case XFS_ILOG_ADATA: 2494 case XFS_ILOG_ADATA:
2485 case XFS_ILOG_AEXT: 2495 case XFS_ILOG_AEXT:
2486 dest = XFS_DFORK_APTR(dip); 2496 dest = XFS_DFORK_APTR(dip);
2487 ASSERT(len <= XFS_DFORK_ASIZE(dip, mp)); 2497 ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
2488 memcpy(dest, src, len); 2498 memcpy(dest, src, len);
2489 break; 2499 break;
2490 2500
2491 case XFS_ILOG_ABROOT: 2501 case XFS_ILOG_ABROOT:
2492 dest = XFS_DFORK_APTR(dip); 2502 dest = XFS_DFORK_APTR(dip);
2493 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, 2503 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
2494 len, (xfs_bmdr_block_t*)dest, 2504 len, (xfs_bmdr_block_t*)dest,
2495 XFS_DFORK_ASIZE(dip, mp)); 2505 XFS_DFORK_ASIZE(dip, mp));
2496 break; 2506 break;
2497 2507
2498 default: 2508 default:
2499 xfs_warn(log->l_mp, "%s: Invalid flag", __func__); 2509 xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
2500 ASSERT(0); 2510 ASSERT(0);
2501 xfs_buf_relse(bp); 2511 xfs_buf_relse(bp);
2502 error = EIO; 2512 error = EIO;
2503 goto error; 2513 goto error;
2504 } 2514 }
2505 } 2515 }
2506 2516
2507 write_inode_buffer: 2517 write_inode_buffer:
2508 ASSERT(bp->b_target->bt_mount == mp); 2518 ASSERT(bp->b_target->bt_mount == mp);
2509 bp->b_iodone = xlog_recover_iodone; 2519 bp->b_iodone = xlog_recover_iodone;
2510 xfs_buf_delwri_queue(bp, buffer_list); 2520 xfs_buf_delwri_queue(bp, buffer_list);
2511 xfs_buf_relse(bp); 2521 xfs_buf_relse(bp);
2512 error: 2522 error:
2513 if (need_free) 2523 if (need_free)
2514 kmem_free(in_f); 2524 kmem_free(in_f);
2515 return XFS_ERROR(error); 2525 return XFS_ERROR(error);
2516 } 2526 }
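The flush-iteration comparison in the middle of this function is the subtle part: the counter wraps at DI_MAX_FLUSH (assumed here to be the largest 16-bit value), so an on-disk value at the maximum paired with a small logged value means the logged copy is actually the newer one. A hedged restatement of that decision, with an illustrative helper name:

static bool
example_skip_inode_replay(
	__uint16_t	disk_flushiter,		/* be16_to_cpu(dip->di_flushiter) */
	__uint16_t	log_flushiter)		/* dicp->di_flushiter */
{
	if (log_flushiter >= disk_flushiter)
		return false;			/* logged copy is newer: replay */
	if (disk_flushiter == DI_MAX_FLUSH &&
	    log_flushiter < (DI_MAX_FLUSH >> 1))
		return false;			/* counter wrapped: replay */
	return true;				/* on-disk copy is newer: skip */
}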
2517 2527
2518 /* 2528 /*
2519 * Recover QUOTAOFF records. We simply make a note of them in the xlog 2529 * Recover QUOTAOFF records. We simply make a note of them in the xlog
2520 * structure, so that we know not to do any dquot item or dquot buffer recovery 2530 * structure, so that we know not to do any dquot item or dquot buffer recovery
2521 * of that type. 2531 * of that type.
2522 */ 2532 */
2523 STATIC int 2533 STATIC int
2524 xlog_recover_quotaoff_pass1( 2534 xlog_recover_quotaoff_pass1(
2525 struct xlog *log, 2535 struct xlog *log,
2526 struct xlog_recover_item *item) 2536 struct xlog_recover_item *item)
2527 { 2537 {
2528 xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr; 2538 xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr;
2529 ASSERT(qoff_f); 2539 ASSERT(qoff_f);
2530 2540
2531 /* 2541 /*
2532 * The logitem format's flag tells us if this was user quotaoff, 2542 * The logitem format's flag tells us if this was user quotaoff,
2533 * group/project quotaoff or both. 2543 * group/project quotaoff or both.
2534 */ 2544 */
2535 if (qoff_f->qf_flags & XFS_UQUOTA_ACCT) 2545 if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
2536 log->l_quotaoffs_flag |= XFS_DQ_USER; 2546 log->l_quotaoffs_flag |= XFS_DQ_USER;
2537 if (qoff_f->qf_flags & XFS_PQUOTA_ACCT) 2547 if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
2538 log->l_quotaoffs_flag |= XFS_DQ_PROJ; 2548 log->l_quotaoffs_flag |= XFS_DQ_PROJ;
2539 if (qoff_f->qf_flags & XFS_GQUOTA_ACCT) 2549 if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
2540 log->l_quotaoffs_flag |= XFS_DQ_GROUP; 2550 log->l_quotaoffs_flag |= XFS_DQ_GROUP;
2541 2551
2542 return (0); 2552 return (0);
2543 } 2553 }
2544 2554
2545 /* 2555 /*
2546 * Recover a dquot record 2556 * Recover a dquot record
2547 */ 2557 */
2548 STATIC int 2558 STATIC int
2549 xlog_recover_dquot_pass2( 2559 xlog_recover_dquot_pass2(
2550 struct xlog *log, 2560 struct xlog *log,
2551 struct list_head *buffer_list, 2561 struct list_head *buffer_list,
2552 struct xlog_recover_item *item) 2562 struct xlog_recover_item *item)
2553 { 2563 {
2554 xfs_mount_t *mp = log->l_mp; 2564 xfs_mount_t *mp = log->l_mp;
2555 xfs_buf_t *bp; 2565 xfs_buf_t *bp;
2556 struct xfs_disk_dquot *ddq, *recddq; 2566 struct xfs_disk_dquot *ddq, *recddq;
2557 int error; 2567 int error;
2558 xfs_dq_logformat_t *dq_f; 2568 xfs_dq_logformat_t *dq_f;
2559 uint type; 2569 uint type;
2560 2570
2561 2571
2562 /* 2572 /*
2563 * Filesystems are required to send in quota flags at mount time. 2573 * Filesystems are required to send in quota flags at mount time.
2564 */ 2574 */
2565 if (mp->m_qflags == 0) 2575 if (mp->m_qflags == 0)
2566 return (0); 2576 return (0);
2567 2577
2568 recddq = item->ri_buf[1].i_addr; 2578 recddq = item->ri_buf[1].i_addr;
2569 if (recddq == NULL) { 2579 if (recddq == NULL) {
2570 xfs_alert(log->l_mp, "NULL dquot in %s.", __func__); 2580 xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
2571 return XFS_ERROR(EIO); 2581 return XFS_ERROR(EIO);
2572 } 2582 }
2573 if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) { 2583 if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
2574 xfs_alert(log->l_mp, "dquot too small (%d) in %s.", 2584 xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
2575 item->ri_buf[1].i_len, __func__); 2585 item->ri_buf[1].i_len, __func__);
2576 return XFS_ERROR(EIO); 2586 return XFS_ERROR(EIO);
2577 } 2587 }
2578 2588
2579 /* 2589 /*
2580 * This type of quota was turned off, so ignore this record. 2590 * This type of quota was turned off, so ignore this record.
2581 */ 2591 */
2582 type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); 2592 type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
2583 ASSERT(type); 2593 ASSERT(type);
2584 if (log->l_quotaoffs_flag & type) 2594 if (log->l_quotaoffs_flag & type)
2585 return (0); 2595 return (0);
2586 2596
2587 /* 2597 /*
2588 * At this point we know that quota was _not_ turned off. 2598 * At this point we know that quota was _not_ turned off.
2589 * Since the mount flags are not indicating to us otherwise, this 2599 * Since the mount flags are not indicating to us otherwise, this
2590 * must mean that quota is on, and the dquot needs to be replayed. 2600 * must mean that quota is on, and the dquot needs to be replayed.
2591 * Remember that we may not have fully recovered the superblock yet, 2601 * Remember that we may not have fully recovered the superblock yet,
2592 * so we can't do the usual trick of looking at the SB quota bits. 2602 * so we can't do the usual trick of looking at the SB quota bits.
2593 * 2603 *
2594 * The other possibility, of course, is that the quota subsystem was 2604 * The other possibility, of course, is that the quota subsystem was
2595 * removed since the last mount - ENOSYS. 2605 * removed since the last mount - ENOSYS.
2596 */ 2606 */
2597 dq_f = item->ri_buf[0].i_addr; 2607 dq_f = item->ri_buf[0].i_addr;
2598 ASSERT(dq_f); 2608 ASSERT(dq_f);
2599 error = xfs_qm_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, 2609 error = xfs_qm_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2600 "xlog_recover_dquot_pass2 (log copy)"); 2610 "xlog_recover_dquot_pass2 (log copy)");
2601 if (error) 2611 if (error)
2602 return XFS_ERROR(EIO); 2612 return XFS_ERROR(EIO);
2603 ASSERT(dq_f->qlf_len == 1); 2613 ASSERT(dq_f->qlf_len == 1);
2604 2614
2605 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, 2615 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
2606 XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp, 2616 XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
2607 NULL); 2617 NULL);
2608 if (error) 2618 if (error)
2609 return error; 2619 return error;
2610 2620
2611 ASSERT(bp); 2621 ASSERT(bp);
2612 ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset); 2622 ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
2613 2623
2614 /* 2624 /*
2615 * At least the magic num portion should be on disk because this 2625 * At least the magic num portion should be on disk because this
2616 * was among a chunk of dquots created earlier, and we did some 2626 * was among a chunk of dquots created earlier, and we did some
2617 * minimal initialization then. 2627 * minimal initialization then.
2618 */ 2628 */
2619 error = xfs_qm_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, 2629 error = xfs_qm_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2620 "xlog_recover_dquot_pass2"); 2630 "xlog_recover_dquot_pass2");
2621 if (error) { 2631 if (error) {
2622 xfs_buf_relse(bp); 2632 xfs_buf_relse(bp);
2623 return XFS_ERROR(EIO); 2633 return XFS_ERROR(EIO);
2624 } 2634 }
2625 2635
2626 memcpy(ddq, recddq, item->ri_buf[1].i_len); 2636 memcpy(ddq, recddq, item->ri_buf[1].i_len);
2627 2637
2628 ASSERT(dq_f->qlf_size == 2); 2638 ASSERT(dq_f->qlf_size == 2);
2629 ASSERT(bp->b_target->bt_mount == mp); 2639 ASSERT(bp->b_target->bt_mount == mp);
2630 bp->b_iodone = xlog_recover_iodone; 2640 bp->b_iodone = xlog_recover_iodone;
2631 xfs_buf_delwri_queue(bp, buffer_list); 2641 xfs_buf_delwri_queue(bp, buffer_list);
2632 xfs_buf_relse(bp); 2642 xfs_buf_relse(bp);
2633 2643
2634 return (0); 2644 return (0);
2635 } 2645 }
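
With the CRC fields this patch adds to struct xfs_dqblk, a dquot that is rewritten during recovery also has to have its checksum brought back up to date before the buffer is written out, or a later read-side verifier would reject the block. A minimal sketch of such a recalculation, assuming the xfs_update_cksum() helper from xfs_cksum.h and the dd_crc field added to struct xfs_dqblk (the field name and placement are taken on trust here, not from this hunk):

    /* only v5 (CRC-enabled) filesystems carry dquot CRCs */
    if (xfs_sb_version_hascrc(&mp->m_sb)) {
            struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq;

            /* recompute the CRC over the whole dquot block entry,
             * skipping the CRC field itself */
            xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
                             offsetof(struct xfs_dqblk, dd_crc));
    }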
2636 2646
2637 /* 2647 /*
2638 * This routine is called to create an in-core extent free intent 2648 * This routine is called to create an in-core extent free intent
2639 * item from the efi format structure which was logged on disk. 2649 * item from the efi format structure which was logged on disk.
2640 * It allocates an in-core efi, copies the extents from the format 2650 * It allocates an in-core efi, copies the extents from the format
2641 * structure into it, and adds the efi to the AIL with the given 2651 * structure into it, and adds the efi to the AIL with the given
2642 * LSN. 2652 * LSN.
2643 */ 2653 */
2644 STATIC int 2654 STATIC int
2645 xlog_recover_efi_pass2( 2655 xlog_recover_efi_pass2(
2646 struct xlog *log, 2656 struct xlog *log,
2647 struct xlog_recover_item *item, 2657 struct xlog_recover_item *item,
2648 xfs_lsn_t lsn) 2658 xfs_lsn_t lsn)
2649 { 2659 {
2650 int error; 2660 int error;
2651 xfs_mount_t *mp = log->l_mp; 2661 xfs_mount_t *mp = log->l_mp;
2652 xfs_efi_log_item_t *efip; 2662 xfs_efi_log_item_t *efip;
2653 xfs_efi_log_format_t *efi_formatp; 2663 xfs_efi_log_format_t *efi_formatp;
2654 2664
2655 efi_formatp = item->ri_buf[0].i_addr; 2665 efi_formatp = item->ri_buf[0].i_addr;
2656 2666
2657 efip = xfs_efi_init(mp, efi_formatp->efi_nextents); 2667 efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
2658 if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), 2668 if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
2659 &(efip->efi_format)))) { 2669 &(efip->efi_format)))) {
2660 xfs_efi_item_free(efip); 2670 xfs_efi_item_free(efip);
2661 return error; 2671 return error;
2662 } 2672 }
2663 atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); 2673 atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
2664 2674
2665 spin_lock(&log->l_ailp->xa_lock); 2675 spin_lock(&log->l_ailp->xa_lock);
2666 /* 2676 /*
2667 * xfs_trans_ail_update() drops the AIL lock. 2677 * xfs_trans_ail_update() drops the AIL lock.
2668 */ 2678 */
2669 xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn); 2679 xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
2670 return 0; 2680 return 0;
2671 } 2681 }
2672 2682
2673 2683
2674 /* 2684 /*
2675 * This routine is called when an efd format structure is found in 2685 * This routine is called when an efd format structure is found in
2676 * a committed transaction in the log. Its purpose is to cancel 2686 * a committed transaction in the log. Its purpose is to cancel
2677 * the corresponding efi if it was still in the log. To do this 2687 * the corresponding efi if it was still in the log. To do this
2678 * it searches the AIL for the efi with an id equal to that in the 2688 * it searches the AIL for the efi with an id equal to that in the
2679 * efd format structure. If we find it, we remove the efi from the 2689 * efd format structure. If we find it, we remove the efi from the
2680 * AIL and free it. 2690 * AIL and free it.
2681 */ 2691 */
2682 STATIC int 2692 STATIC int
2683 xlog_recover_efd_pass2( 2693 xlog_recover_efd_pass2(
2684 struct xlog *log, 2694 struct xlog *log,
2685 struct xlog_recover_item *item) 2695 struct xlog_recover_item *item)
2686 { 2696 {
2687 xfs_efd_log_format_t *efd_formatp; 2697 xfs_efd_log_format_t *efd_formatp;
2688 xfs_efi_log_item_t *efip = NULL; 2698 xfs_efi_log_item_t *efip = NULL;
2689 xfs_log_item_t *lip; 2699 xfs_log_item_t *lip;
2690 __uint64_t efi_id; 2700 __uint64_t efi_id;
2691 struct xfs_ail_cursor cur; 2701 struct xfs_ail_cursor cur;
2692 struct xfs_ail *ailp = log->l_ailp; 2702 struct xfs_ail *ailp = log->l_ailp;
2693 2703
2694 efd_formatp = item->ri_buf[0].i_addr; 2704 efd_formatp = item->ri_buf[0].i_addr;
2695 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + 2705 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
2696 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || 2706 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
2697 (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) + 2707 (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
2698 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t))))); 2708 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
2699 efi_id = efd_formatp->efd_efi_id; 2709 efi_id = efd_formatp->efd_efi_id;
2700 2710
2701 /* 2711 /*
2702 * Search for the efi with the id in the efd format structure 2712 * Search for the efi with the id in the efd format structure
2703 * in the AIL. 2713 * in the AIL.
2704 */ 2714 */
2705 spin_lock(&ailp->xa_lock); 2715 spin_lock(&ailp->xa_lock);
2706 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); 2716 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
2707 while (lip != NULL) { 2717 while (lip != NULL) {
2708 if (lip->li_type == XFS_LI_EFI) { 2718 if (lip->li_type == XFS_LI_EFI) {
2709 efip = (xfs_efi_log_item_t *)lip; 2719 efip = (xfs_efi_log_item_t *)lip;
2710 if (efip->efi_format.efi_id == efi_id) { 2720 if (efip->efi_format.efi_id == efi_id) {
2711 /* 2721 /*
2712 * xfs_trans_ail_delete() drops the 2722 * xfs_trans_ail_delete() drops the
2713 * AIL lock. 2723 * AIL lock.
2714 */ 2724 */
2715 xfs_trans_ail_delete(ailp, lip, 2725 xfs_trans_ail_delete(ailp, lip,
2716 SHUTDOWN_CORRUPT_INCORE); 2726 SHUTDOWN_CORRUPT_INCORE);
2717 xfs_efi_item_free(efip); 2727 xfs_efi_item_free(efip);
2718 spin_lock(&ailp->xa_lock); 2728 spin_lock(&ailp->xa_lock);
2719 break; 2729 break;
2720 } 2730 }
2721 } 2731 }
2722 lip = xfs_trans_ail_cursor_next(ailp, &cur); 2732 lip = xfs_trans_ail_cursor_next(ailp, &cur);
2723 } 2733 }
2724 xfs_trans_ail_cursor_done(ailp, &cur); 2734 xfs_trans_ail_cursor_done(ailp, &cur);
2725 spin_unlock(&ailp->xa_lock); 2735 spin_unlock(&ailp->xa_lock);
2726 2736
2727 return 0; 2737 return 0;
2728 } 2738 }
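
For context, the EFI/EFD pair that recovery matches up here is produced at runtime by the extent-free path, roughly as follows (a simplified sketch; in practice the intent and the done item usually land in separate, chained transactions):

    /* intent: record which extents are about to be freed */
    efip = xfs_trans_get_efi(tp, nextents);
    xfs_trans_log_efi_extent(tp, efip, start_block, ext_len);

    /* ... the extents are actually freed ... */

    /* done: record that the intent was carried out */
    efdp = xfs_trans_get_efd(tp, efip, nextents);
    xfs_trans_log_efd_extent(tp, efdp, start_block, ext_len);

If the system crashes after the EFI is committed but before the EFD, no EFD record reaches the log, the EFI stays in the AIL after the recovery passes, and xlog_recover_process_efis() later replays the frees.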
2729 2739
2730 /* 2740 /*
2731 * Free up any resources allocated by the transaction 2741 * Free up any resources allocated by the transaction
2732 * 2742 *
2733 * Remember that EFIs, EFDs, and IUNLINKs are handled later. 2743 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
2734 */ 2744 */
2735 STATIC void 2745 STATIC void
2736 xlog_recover_free_trans( 2746 xlog_recover_free_trans(
2737 struct xlog_recover *trans) 2747 struct xlog_recover *trans)
2738 { 2748 {
2739 xlog_recover_item_t *item, *n; 2749 xlog_recover_item_t *item, *n;
2740 int i; 2750 int i;
2741 2751
2742 list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) { 2752 list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
2743 /* Free the regions in the item. */ 2753 /* Free the regions in the item. */
2744 list_del(&item->ri_list); 2754 list_del(&item->ri_list);
2745 for (i = 0; i < item->ri_cnt; i++) 2755 for (i = 0; i < item->ri_cnt; i++)
2746 kmem_free(item->ri_buf[i].i_addr); 2756 kmem_free(item->ri_buf[i].i_addr);
2747 /* Free the item itself */ 2757 /* Free the item itself */
2748 kmem_free(item->ri_buf); 2758 kmem_free(item->ri_buf);
2749 kmem_free(item); 2759 kmem_free(item);
2750 } 2760 }
2751 /* Free the transaction recover structure */ 2761 /* Free the transaction recover structure */
2752 kmem_free(trans); 2762 kmem_free(trans);
2753 } 2763 }
2754 2764
2755 STATIC int 2765 STATIC int
2756 xlog_recover_commit_pass1( 2766 xlog_recover_commit_pass1(
2757 struct xlog *log, 2767 struct xlog *log,
2758 struct xlog_recover *trans, 2768 struct xlog_recover *trans,
2759 struct xlog_recover_item *item) 2769 struct xlog_recover_item *item)
2760 { 2770 {
2761 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1); 2771 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
2762 2772
2763 switch (ITEM_TYPE(item)) { 2773 switch (ITEM_TYPE(item)) {
2764 case XFS_LI_BUF: 2774 case XFS_LI_BUF:
2765 return xlog_recover_buffer_pass1(log, item); 2775 return xlog_recover_buffer_pass1(log, item);
2766 case XFS_LI_QUOTAOFF: 2776 case XFS_LI_QUOTAOFF:
2767 return xlog_recover_quotaoff_pass1(log, item); 2777 return xlog_recover_quotaoff_pass1(log, item);
2768 case XFS_LI_INODE: 2778 case XFS_LI_INODE:
2769 case XFS_LI_EFI: 2779 case XFS_LI_EFI:
2770 case XFS_LI_EFD: 2780 case XFS_LI_EFD:
2771 case XFS_LI_DQUOT: 2781 case XFS_LI_DQUOT:
2772 /* nothing to do in pass 1 */ 2782 /* nothing to do in pass 1 */
2773 return 0; 2783 return 0;
2774 default: 2784 default:
2775 xfs_warn(log->l_mp, "%s: invalid item type (%d)", 2785 xfs_warn(log->l_mp, "%s: invalid item type (%d)",
2776 __func__, ITEM_TYPE(item)); 2786 __func__, ITEM_TYPE(item));
2777 ASSERT(0); 2787 ASSERT(0);
2778 return XFS_ERROR(EIO); 2788 return XFS_ERROR(EIO);
2779 } 2789 }
2780 } 2790 }
2781 2791
2782 STATIC int 2792 STATIC int
2783 xlog_recover_commit_pass2( 2793 xlog_recover_commit_pass2(
2784 struct xlog *log, 2794 struct xlog *log,
2785 struct xlog_recover *trans, 2795 struct xlog_recover *trans,
2786 struct list_head *buffer_list, 2796 struct list_head *buffer_list,
2787 struct xlog_recover_item *item) 2797 struct xlog_recover_item *item)
2788 { 2798 {
2789 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2); 2799 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
2790 2800
2791 switch (ITEM_TYPE(item)) { 2801 switch (ITEM_TYPE(item)) {
2792 case XFS_LI_BUF: 2802 case XFS_LI_BUF:
2793 return xlog_recover_buffer_pass2(log, buffer_list, item); 2803 return xlog_recover_buffer_pass2(log, buffer_list, item);
2794 case XFS_LI_INODE: 2804 case XFS_LI_INODE:
2795 return xlog_recover_inode_pass2(log, buffer_list, item); 2805 return xlog_recover_inode_pass2(log, buffer_list, item);
2796 case XFS_LI_EFI: 2806 case XFS_LI_EFI:
2797 return xlog_recover_efi_pass2(log, item, trans->r_lsn); 2807 return xlog_recover_efi_pass2(log, item, trans->r_lsn);
2798 case XFS_LI_EFD: 2808 case XFS_LI_EFD:
2799 return xlog_recover_efd_pass2(log, item); 2809 return xlog_recover_efd_pass2(log, item);
2800 case XFS_LI_DQUOT: 2810 case XFS_LI_DQUOT:
2801 return xlog_recover_dquot_pass2(log, buffer_list, item); 2811 return xlog_recover_dquot_pass2(log, buffer_list, item);
2802 case XFS_LI_QUOTAOFF: 2812 case XFS_LI_QUOTAOFF:
2803 /* nothing to do in pass2 */ 2813 /* nothing to do in pass2 */
2804 return 0; 2814 return 0;
2805 default: 2815 default:
2806 xfs_warn(log->l_mp, "%s: invalid item type (%d)", 2816 xfs_warn(log->l_mp, "%s: invalid item type (%d)",
2807 __func__, ITEM_TYPE(item)); 2817 __func__, ITEM_TYPE(item));
2808 ASSERT(0); 2818 ASSERT(0);
2809 return XFS_ERROR(EIO); 2819 return XFS_ERROR(EIO);
2810 } 2820 }
2811 } 2821 }
2812 2822
2813 /* 2823 /*
2814 * Perform the transaction. 2824 * Perform the transaction.
2815 * 2825 *
2816 * If the transaction modifies a buffer or inode, do it now. Otherwise, 2826 * If the transaction modifies a buffer or inode, do it now. Otherwise,
2817 * EFIs and EFDs get queued up by adding entries into the AIL for them. 2827 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2818 */ 2828 */
2819 STATIC int 2829 STATIC int
2820 xlog_recover_commit_trans( 2830 xlog_recover_commit_trans(
2821 struct xlog *log, 2831 struct xlog *log,
2822 struct xlog_recover *trans, 2832 struct xlog_recover *trans,
2823 int pass) 2833 int pass)
2824 { 2834 {
2825 int error = 0, error2; 2835 int error = 0, error2;
2826 xlog_recover_item_t *item; 2836 xlog_recover_item_t *item;
2827 LIST_HEAD (buffer_list); 2837 LIST_HEAD (buffer_list);
2828 2838
2829 hlist_del(&trans->r_list); 2839 hlist_del(&trans->r_list);
2830 2840
2831 error = xlog_recover_reorder_trans(log, trans, pass); 2841 error = xlog_recover_reorder_trans(log, trans, pass);
2832 if (error) 2842 if (error)
2833 return error; 2843 return error;
2834 2844
2835 list_for_each_entry(item, &trans->r_itemq, ri_list) { 2845 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2836 switch (pass) { 2846 switch (pass) {
2837 case XLOG_RECOVER_PASS1: 2847 case XLOG_RECOVER_PASS1:
2838 error = xlog_recover_commit_pass1(log, trans, item); 2848 error = xlog_recover_commit_pass1(log, trans, item);
2839 break; 2849 break;
2840 case XLOG_RECOVER_PASS2: 2850 case XLOG_RECOVER_PASS2:
2841 error = xlog_recover_commit_pass2(log, trans, 2851 error = xlog_recover_commit_pass2(log, trans,
2842 &buffer_list, item); 2852 &buffer_list, item);
2843 break; 2853 break;
2844 default: 2854 default:
2845 ASSERT(0); 2855 ASSERT(0);
2846 } 2856 }
2847 2857
2848 if (error) 2858 if (error)
2849 goto out; 2859 goto out;
2850 } 2860 }
2851 2861
2852 xlog_recover_free_trans(trans); 2862 xlog_recover_free_trans(trans);
2853 2863
2854 out: 2864 out:
2855 error2 = xfs_buf_delwri_submit(&buffer_list); 2865 error2 = xfs_buf_delwri_submit(&buffer_list);
2856 return error ? error : error2; 2866 return error ? error : error2;
2857 } 2867 }
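
The buffer_list used here is the standard delayed-write pattern: each pass-2 handler stages its change into the buffer and queues it, and the whole list is submitted once per recovered transaction. Condensed, using only calls that appear above:

    LIST_HEAD(buffer_list);

    /* in a pass-2 handler, after modifying the buffer contents */
    bp->b_iodone = xlog_recover_iodone;
    xfs_buf_delwri_queue(bp, &buffer_list);
    xfs_buf_relse(bp);

    /* back in xlog_recover_commit_trans(): flush everything queued */
    error = xfs_buf_delwri_submit(&buffer_list);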
2858 2868
2859 STATIC int 2869 STATIC int
2860 xlog_recover_unmount_trans( 2870 xlog_recover_unmount_trans(
2861 struct xlog *log, 2871 struct xlog *log,
2862 struct xlog_recover *trans) 2872 struct xlog_recover *trans)
2863 { 2873 {
2864 /* Do nothing now */ 2874 /* Do nothing now */
2865 xfs_warn(log->l_mp, "%s: Unmount LR", __func__); 2875 xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
2866 return 0; 2876 return 0;
2867 } 2877 }
2868 2878
2869 /* 2879 /*
2870 * There are two valid states of the r_state field. 0 indicates that the 2880 * There are two valid states of the r_state field. 0 indicates that the
2871 * transaction structure is in a normal state. We have either seen the 2881 * transaction structure is in a normal state. We have either seen the
2872 * start of the transaction or the last operation we added was not a partial 2882 * start of the transaction or the last operation we added was not a partial
2873 * operation. If the last operation we added to the transaction was a 2883 * operation. If the last operation we added to the transaction was a
2874 * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS. 2884 * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
2875 * 2885 *
2876 * NOTE: skip LRs with 0 data length. 2886 * NOTE: skip LRs with 0 data length.
2877 */ 2887 */
2878 STATIC int 2888 STATIC int
2879 xlog_recover_process_data( 2889 xlog_recover_process_data(
2880 struct xlog *log, 2890 struct xlog *log,
2881 struct hlist_head rhash[], 2891 struct hlist_head rhash[],
2882 struct xlog_rec_header *rhead, 2892 struct xlog_rec_header *rhead,
2883 xfs_caddr_t dp, 2893 xfs_caddr_t dp,
2884 int pass) 2894 int pass)
2885 { 2895 {
2886 xfs_caddr_t lp; 2896 xfs_caddr_t lp;
2887 int num_logops; 2897 int num_logops;
2888 xlog_op_header_t *ohead; 2898 xlog_op_header_t *ohead;
2889 xlog_recover_t *trans; 2899 xlog_recover_t *trans;
2890 xlog_tid_t tid; 2900 xlog_tid_t tid;
2891 int error; 2901 int error;
2892 unsigned long hash; 2902 unsigned long hash;
2893 uint flags; 2903 uint flags;
2894 2904
2895 lp = dp + be32_to_cpu(rhead->h_len); 2905 lp = dp + be32_to_cpu(rhead->h_len);
2896 num_logops = be32_to_cpu(rhead->h_num_logops); 2906 num_logops = be32_to_cpu(rhead->h_num_logops);
2897 2907
2898 /* check the log format matches our own - else we can't recover */ 2908 /* check the log format matches our own - else we can't recover */
2899 if (xlog_header_check_recover(log->l_mp, rhead)) 2909 if (xlog_header_check_recover(log->l_mp, rhead))
2900 return (XFS_ERROR(EIO)); 2910 return (XFS_ERROR(EIO));
2901 2911
2902 while ((dp < lp) && num_logops) { 2912 while ((dp < lp) && num_logops) {
2903 ASSERT(dp + sizeof(xlog_op_header_t) <= lp); 2913 ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
2904 ohead = (xlog_op_header_t *)dp; 2914 ohead = (xlog_op_header_t *)dp;
2905 dp += sizeof(xlog_op_header_t); 2915 dp += sizeof(xlog_op_header_t);
2906 if (ohead->oh_clientid != XFS_TRANSACTION && 2916 if (ohead->oh_clientid != XFS_TRANSACTION &&
2907 ohead->oh_clientid != XFS_LOG) { 2917 ohead->oh_clientid != XFS_LOG) {
2908 xfs_warn(log->l_mp, "%s: bad clientid 0x%x", 2918 xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
2909 __func__, ohead->oh_clientid); 2919 __func__, ohead->oh_clientid);
2910 ASSERT(0); 2920 ASSERT(0);
2911 return (XFS_ERROR(EIO)); 2921 return (XFS_ERROR(EIO));
2912 } 2922 }
2913 tid = be32_to_cpu(ohead->oh_tid); 2923 tid = be32_to_cpu(ohead->oh_tid);
2914 hash = XLOG_RHASH(tid); 2924 hash = XLOG_RHASH(tid);
2915 trans = xlog_recover_find_tid(&rhash[hash], tid); 2925 trans = xlog_recover_find_tid(&rhash[hash], tid);
2916 if (trans == NULL) { /* not found; add new tid */ 2926 if (trans == NULL) { /* not found; add new tid */
2917 if (ohead->oh_flags & XLOG_START_TRANS) 2927 if (ohead->oh_flags & XLOG_START_TRANS)
2918 xlog_recover_new_tid(&rhash[hash], tid, 2928 xlog_recover_new_tid(&rhash[hash], tid,
2919 be64_to_cpu(rhead->h_lsn)); 2929 be64_to_cpu(rhead->h_lsn));
2920 } else { 2930 } else {
2921 if (dp + be32_to_cpu(ohead->oh_len) > lp) { 2931 if (dp + be32_to_cpu(ohead->oh_len) > lp) {
2922 xfs_warn(log->l_mp, "%s: bad length 0x%x", 2932 xfs_warn(log->l_mp, "%s: bad length 0x%x",
2923 __func__, be32_to_cpu(ohead->oh_len)); 2933 __func__, be32_to_cpu(ohead->oh_len));
2924 WARN_ON(1); 2934 WARN_ON(1);
2925 return (XFS_ERROR(EIO)); 2935 return (XFS_ERROR(EIO));
2926 } 2936 }
2927 flags = ohead->oh_flags & ~XLOG_END_TRANS; 2937 flags = ohead->oh_flags & ~XLOG_END_TRANS;
2928 if (flags & XLOG_WAS_CONT_TRANS) 2938 if (flags & XLOG_WAS_CONT_TRANS)
2929 flags &= ~XLOG_CONTINUE_TRANS; 2939 flags &= ~XLOG_CONTINUE_TRANS;
2930 switch (flags) { 2940 switch (flags) {
2931 case XLOG_COMMIT_TRANS: 2941 case XLOG_COMMIT_TRANS:
2932 error = xlog_recover_commit_trans(log, 2942 error = xlog_recover_commit_trans(log,
2933 trans, pass); 2943 trans, pass);
2934 break; 2944 break;
2935 case XLOG_UNMOUNT_TRANS: 2945 case XLOG_UNMOUNT_TRANS:
2936 error = xlog_recover_unmount_trans(log, trans); 2946 error = xlog_recover_unmount_trans(log, trans);
2937 break; 2947 break;
2938 case XLOG_WAS_CONT_TRANS: 2948 case XLOG_WAS_CONT_TRANS:
2939 error = xlog_recover_add_to_cont_trans(log, 2949 error = xlog_recover_add_to_cont_trans(log,
2940 trans, dp, 2950 trans, dp,
2941 be32_to_cpu(ohead->oh_len)); 2951 be32_to_cpu(ohead->oh_len));
2942 break; 2952 break;
2943 case XLOG_START_TRANS: 2953 case XLOG_START_TRANS:
2944 xfs_warn(log->l_mp, "%s: bad transaction", 2954 xfs_warn(log->l_mp, "%s: bad transaction",
2945 __func__); 2955 __func__);
2946 ASSERT(0); 2956 ASSERT(0);
2947 error = XFS_ERROR(EIO); 2957 error = XFS_ERROR(EIO);
2948 break; 2958 break;
2949 case 0: 2959 case 0:
2950 case XLOG_CONTINUE_TRANS: 2960 case XLOG_CONTINUE_TRANS:
2951 error = xlog_recover_add_to_trans(log, trans, 2961 error = xlog_recover_add_to_trans(log, trans,
2952 dp, be32_to_cpu(ohead->oh_len)); 2962 dp, be32_to_cpu(ohead->oh_len));
2953 break; 2963 break;
2954 default: 2964 default:
2955 xfs_warn(log->l_mp, "%s: bad flag 0x%x", 2965 xfs_warn(log->l_mp, "%s: bad flag 0x%x",
2956 __func__, flags); 2966 __func__, flags);
2957 ASSERT(0); 2967 ASSERT(0);
2958 error = XFS_ERROR(EIO); 2968 error = XFS_ERROR(EIO);
2959 break; 2969 break;
2960 } 2970 }
2961 if (error) 2971 if (error)
2962 return error; 2972 return error;
2963 } 2973 }
2964 dp += be32_to_cpu(ohead->oh_len); 2974 dp += be32_to_cpu(ohead->oh_len);
2965 num_logops--; 2975 num_logops--;
2966 } 2976 }
2967 return 0; 2977 return 0;
2968 } 2978 }
2969 2979
2970 /* 2980 /*
2971 * Process an extent free intent item that was recovered from 2981 * Process an extent free intent item that was recovered from
2972 * the log. We need to free the extents that it describes. 2982 * the log. We need to free the extents that it describes.
2973 */ 2983 */
2974 STATIC int 2984 STATIC int
2975 xlog_recover_process_efi( 2985 xlog_recover_process_efi(
2976 xfs_mount_t *mp, 2986 xfs_mount_t *mp,
2977 xfs_efi_log_item_t *efip) 2987 xfs_efi_log_item_t *efip)
2978 { 2988 {
2979 xfs_efd_log_item_t *efdp; 2989 xfs_efd_log_item_t *efdp;
2980 xfs_trans_t *tp; 2990 xfs_trans_t *tp;
2981 int i; 2991 int i;
2982 int error = 0; 2992 int error = 0;
2983 xfs_extent_t *extp; 2993 xfs_extent_t *extp;
2984 xfs_fsblock_t startblock_fsb; 2994 xfs_fsblock_t startblock_fsb;
2985 2995
2986 ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)); 2996 ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
2987 2997
2988 /* 2998 /*
2989 * First check the validity of the extents described by the 2999 * First check the validity of the extents described by the
2990 * EFI. If any are bad, then assume that all are bad and 3000 * EFI. If any are bad, then assume that all are bad and
2991 * just toss the EFI. 3001 * just toss the EFI.
2992 */ 3002 */
2993 for (i = 0; i < efip->efi_format.efi_nextents; i++) { 3003 for (i = 0; i < efip->efi_format.efi_nextents; i++) {
2994 extp = &(efip->efi_format.efi_extents[i]); 3004 extp = &(efip->efi_format.efi_extents[i]);
2995 startblock_fsb = XFS_BB_TO_FSB(mp, 3005 startblock_fsb = XFS_BB_TO_FSB(mp,
2996 XFS_FSB_TO_DADDR(mp, extp->ext_start)); 3006 XFS_FSB_TO_DADDR(mp, extp->ext_start));
2997 if ((startblock_fsb == 0) || 3007 if ((startblock_fsb == 0) ||
2998 (extp->ext_len == 0) || 3008 (extp->ext_len == 0) ||
2999 (startblock_fsb >= mp->m_sb.sb_dblocks) || 3009 (startblock_fsb >= mp->m_sb.sb_dblocks) ||
3000 (extp->ext_len >= mp->m_sb.sb_agblocks)) { 3010 (extp->ext_len >= mp->m_sb.sb_agblocks)) {
3001 /* 3011 /*
3002 * This will pull the EFI from the AIL and 3012 * This will pull the EFI from the AIL and
3003 * free the memory associated with it. 3013 * free the memory associated with it.
3004 */ 3014 */
3005 set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); 3015 set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
3006 xfs_efi_release(efip, efip->efi_format.efi_nextents); 3016 xfs_efi_release(efip, efip->efi_format.efi_nextents);
3007 return XFS_ERROR(EIO); 3017 return XFS_ERROR(EIO);
3008 } 3018 }
3009 } 3019 }
3010 3020
3011 tp = xfs_trans_alloc(mp, 0); 3021 tp = xfs_trans_alloc(mp, 0);
3012 error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0); 3022 error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
3013 if (error) 3023 if (error)
3014 goto abort_error; 3024 goto abort_error;
3015 efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); 3025 efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
3016 3026
3017 for (i = 0; i < efip->efi_format.efi_nextents; i++) { 3027 for (i = 0; i < efip->efi_format.efi_nextents; i++) {
3018 extp = &(efip->efi_format.efi_extents[i]); 3028 extp = &(efip->efi_format.efi_extents[i]);
3019 error = xfs_free_extent(tp, extp->ext_start, extp->ext_len); 3029 error = xfs_free_extent(tp, extp->ext_start, extp->ext_len);
3020 if (error) 3030 if (error)
3021 goto abort_error; 3031 goto abort_error;
3022 xfs_trans_log_efd_extent(tp, efdp, extp->ext_start, 3032 xfs_trans_log_efd_extent(tp, efdp, extp->ext_start,
3023 extp->ext_len); 3033 extp->ext_len);
3024 } 3034 }
3025 3035
3026 set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); 3036 set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
3027 error = xfs_trans_commit(tp, 0); 3037 error = xfs_trans_commit(tp, 0);
3028 return error; 3038 return error;
3029 3039
3030 abort_error: 3040 abort_error:
3031 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 3041 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3032 return error; 3042 return error;
3033 } 3043 }
3034 3044
3035 /* 3045 /*
3036 * When this is called, all of the EFIs which did not have 3046 * When this is called, all of the EFIs which did not have
3037 * corresponding EFDs should be in the AIL. What we do now 3047 * corresponding EFDs should be in the AIL. What we do now
3038 * is free the extents associated with each one. 3048 * is free the extents associated with each one.
3039 * 3049 *
3040 * Since we process the EFIs in normal transactions, they 3050 * Since we process the EFIs in normal transactions, they
3041 * will be removed at some point after the commit. This prevents 3051 * will be removed at some point after the commit. This prevents
3042 * us from just walking down the list processing each one. 3052 * us from just walking down the list processing each one.
3043 * We'll use a flag in the EFI to skip those that we've already 3053 * We'll use a flag in the EFI to skip those that we've already
3044 * processed and use the AIL iteration mechanism's generation 3054 * processed and use the AIL iteration mechanism's generation
3045 * count to try to speed this up at least a bit. 3055 * count to try to speed this up at least a bit.
3046 * 3056 *
3047 * When we start, we know that the EFIs are the only things in 3057 * When we start, we know that the EFIs are the only things in
3048 * the AIL. As we process them, however, other items are added 3058 * the AIL. As we process them, however, other items are added
3049 * to the AIL. Since everything added to the AIL must come after 3059 * to the AIL. Since everything added to the AIL must come after
3050 * everything already in the AIL, we stop processing as soon as 3060 * everything already in the AIL, we stop processing as soon as
3051 * we see something other than an EFI in the AIL. 3061 * we see something other than an EFI in the AIL.
3052 */ 3062 */
3053 STATIC int 3063 STATIC int
3054 xlog_recover_process_efis( 3064 xlog_recover_process_efis(
3055 struct xlog *log) 3065 struct xlog *log)
3056 { 3066 {
3057 xfs_log_item_t *lip; 3067 xfs_log_item_t *lip;
3058 xfs_efi_log_item_t *efip; 3068 xfs_efi_log_item_t *efip;
3059 int error = 0; 3069 int error = 0;
3060 struct xfs_ail_cursor cur; 3070 struct xfs_ail_cursor cur;
3061 struct xfs_ail *ailp; 3071 struct xfs_ail *ailp;
3062 3072
3063 ailp = log->l_ailp; 3073 ailp = log->l_ailp;
3064 spin_lock(&ailp->xa_lock); 3074 spin_lock(&ailp->xa_lock);
3065 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); 3075 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3066 while (lip != NULL) { 3076 while (lip != NULL) {
3067 /* 3077 /*
3068 * We're done when we see something other than an EFI. 3078 * We're done when we see something other than an EFI.
3069 * There should be no EFIs left in the AIL now. 3079 * There should be no EFIs left in the AIL now.
3070 */ 3080 */
3071 if (lip->li_type != XFS_LI_EFI) { 3081 if (lip->li_type != XFS_LI_EFI) {
3072 #ifdef DEBUG 3082 #ifdef DEBUG
3073 for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) 3083 for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
3074 ASSERT(lip->li_type != XFS_LI_EFI); 3084 ASSERT(lip->li_type != XFS_LI_EFI);
3075 #endif 3085 #endif
3076 break; 3086 break;
3077 } 3087 }
3078 3088
3079 /* 3089 /*
3080 * Skip EFIs that we've already processed. 3090 * Skip EFIs that we've already processed.
3081 */ 3091 */
3082 efip = (xfs_efi_log_item_t *)lip; 3092 efip = (xfs_efi_log_item_t *)lip;
3083 if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) { 3093 if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
3084 lip = xfs_trans_ail_cursor_next(ailp, &cur); 3094 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3085 continue; 3095 continue;
3086 } 3096 }
3087 3097
3088 spin_unlock(&ailp->xa_lock); 3098 spin_unlock(&ailp->xa_lock);
3089 error = xlog_recover_process_efi(log->l_mp, efip); 3099 error = xlog_recover_process_efi(log->l_mp, efip);
3090 spin_lock(&ailp->xa_lock); 3100 spin_lock(&ailp->xa_lock);
3091 if (error) 3101 if (error)
3092 goto out; 3102 goto out;
3093 lip = xfs_trans_ail_cursor_next(ailp, &cur); 3103 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3094 } 3104 }
3095 out: 3105 out:
3096 xfs_trans_ail_cursor_done(ailp, &cur); 3106 xfs_trans_ail_cursor_done(ailp, &cur);
3097 spin_unlock(&ailp->xa_lock); 3107 spin_unlock(&ailp->xa_lock);
3098 return error; 3108 return error;
3099 } 3109 }
3100 3110
3101 /* 3111 /*
3102 * This routine performs a transaction to null out a bad inode pointer 3112 * This routine performs a transaction to null out a bad inode pointer
3103 * in an agi unlinked inode hash bucket. 3113 * in an agi unlinked inode hash bucket.
3104 */ 3114 */
3105 STATIC void 3115 STATIC void
3106 xlog_recover_clear_agi_bucket( 3116 xlog_recover_clear_agi_bucket(
3107 xfs_mount_t *mp, 3117 xfs_mount_t *mp,
3108 xfs_agnumber_t agno, 3118 xfs_agnumber_t agno,
3109 int bucket) 3119 int bucket)
3110 { 3120 {
3111 xfs_trans_t *tp; 3121 xfs_trans_t *tp;
3112 xfs_agi_t *agi; 3122 xfs_agi_t *agi;
3113 xfs_buf_t *agibp; 3123 xfs_buf_t *agibp;
3114 int offset; 3124 int offset;
3115 int error; 3125 int error;
3116 3126
3117 tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); 3127 tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
3118 error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 3128 error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp),
3119 0, 0, 0); 3129 0, 0, 0);
3120 if (error) 3130 if (error)
3121 goto out_abort; 3131 goto out_abort;
3122 3132
3123 error = xfs_read_agi(mp, tp, agno, &agibp); 3133 error = xfs_read_agi(mp, tp, agno, &agibp);
3124 if (error) 3134 if (error)
3125 goto out_abort; 3135 goto out_abort;
3126 3136
3127 agi = XFS_BUF_TO_AGI(agibp); 3137 agi = XFS_BUF_TO_AGI(agibp);
3128 agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); 3138 agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
3129 offset = offsetof(xfs_agi_t, agi_unlinked) + 3139 offset = offsetof(xfs_agi_t, agi_unlinked) +
3130 (sizeof(xfs_agino_t) * bucket); 3140 (sizeof(xfs_agino_t) * bucket);
3131 xfs_trans_log_buf(tp, agibp, offset, 3141 xfs_trans_log_buf(tp, agibp, offset,
3132 (offset + sizeof(xfs_agino_t) - 1)); 3142 (offset + sizeof(xfs_agino_t) - 1));
3133 3143
3134 error = xfs_trans_commit(tp, 0); 3144 error = xfs_trans_commit(tp, 0);
3135 if (error) 3145 if (error)
3136 goto out_error; 3146 goto out_error;
3137 return; 3147 return;
3138 3148
3139 out_abort: 3149 out_abort:
3140 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 3150 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3141 out_error: 3151 out_error:
3142 xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno); 3152 xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
3143 return; 3153 return;
3144 } 3154 }
3145 3155
3146 STATIC xfs_agino_t 3156 STATIC xfs_agino_t
3147 xlog_recover_process_one_iunlink( 3157 xlog_recover_process_one_iunlink(
3148 struct xfs_mount *mp, 3158 struct xfs_mount *mp,
3149 xfs_agnumber_t agno, 3159 xfs_agnumber_t agno,
3150 xfs_agino_t agino, 3160 xfs_agino_t agino,
3151 int bucket) 3161 int bucket)
3152 { 3162 {
3153 struct xfs_buf *ibp; 3163 struct xfs_buf *ibp;
3154 struct xfs_dinode *dip; 3164 struct xfs_dinode *dip;
3155 struct xfs_inode *ip; 3165 struct xfs_inode *ip;
3156 xfs_ino_t ino; 3166 xfs_ino_t ino;
3157 int error; 3167 int error;
3158 3168
3159 ino = XFS_AGINO_TO_INO(mp, agno, agino); 3169 ino = XFS_AGINO_TO_INO(mp, agno, agino);
3160 error = xfs_iget(mp, NULL, ino, 0, 0, &ip); 3170 error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
3161 if (error) 3171 if (error)
3162 goto fail; 3172 goto fail;
3163 3173
3164 /* 3174 /*
3165 * Get the on disk inode to find the next inode in the bucket. 3175 * Get the on disk inode to find the next inode in the bucket.
3166 */ 3176 */
3167 error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0); 3177 error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0);
3168 if (error) 3178 if (error)
3169 goto fail_iput; 3179 goto fail_iput;
3170 3180
3171 ASSERT(ip->i_d.di_nlink == 0); 3181 ASSERT(ip->i_d.di_nlink == 0);
3172 ASSERT(ip->i_d.di_mode != 0); 3182 ASSERT(ip->i_d.di_mode != 0);
3173 3183
3174 /* setup for the next pass */ 3184 /* setup for the next pass */
3175 agino = be32_to_cpu(dip->di_next_unlinked); 3185 agino = be32_to_cpu(dip->di_next_unlinked);
3176 xfs_buf_relse(ibp); 3186 xfs_buf_relse(ibp);
3177 3187
3178 /* 3188 /*
3179 * Prevent any DMAPI event from being sent when the reference on 3189 * Prevent any DMAPI event from being sent when the reference on
3180 * the inode is dropped. 3190 * the inode is dropped.
3181 */ 3191 */
3182 ip->i_d.di_dmevmask = 0; 3192 ip->i_d.di_dmevmask = 0;
3183 3193
3184 IRELE(ip); 3194 IRELE(ip);
3185 return agino; 3195 return agino;
3186 3196
3187 fail_iput: 3197 fail_iput:
3188 IRELE(ip); 3198 IRELE(ip);
3189 fail: 3199 fail:
3190 /* 3200 /*
3191 * We can't read in the inode this bucket points to, or this inode 3201 * We can't read in the inode this bucket points to, or this inode
3192 * is messed up. Just ditch this bucket of inodes. We will lose 3202 * is messed up. Just ditch this bucket of inodes. We will lose
3193 * some inodes and space, but at least we won't hang. 3203 * some inodes and space, but at least we won't hang.
3194 * 3204 *
3195 * Call xlog_recover_clear_agi_bucket() to perform a transaction to 3205 * Call xlog_recover_clear_agi_bucket() to perform a transaction to
3196 * clear the inode pointer in the bucket. 3206 * clear the inode pointer in the bucket.
3197 */ 3207 */
3198 xlog_recover_clear_agi_bucket(mp, agno, bucket); 3208 xlog_recover_clear_agi_bucket(mp, agno, bucket);
3199 return NULLAGINO; 3209 return NULLAGINO;
3200 } 3210 }
3201 3211
3202 /* 3212 /*
3203 * xlog_iunlink_recover 3213 * xlog_iunlink_recover
3204 * 3214 *
3205 * This is called during recovery to process any inodes which 3215 * This is called during recovery to process any inodes which
3206 * we unlinked but not freed when the system crashed. These 3216 * we unlinked but not freed when the system crashed. These
3207 * inodes will be on the lists in the AGI blocks. What we do 3217 * inodes will be on the lists in the AGI blocks. What we do
3208 * here is scan all the AGIs and fully truncate and free any 3218 * here is scan all the AGIs and fully truncate and free any
3209 * inodes found on the lists. Each inode is removed from the 3219 * inodes found on the lists. Each inode is removed from the
3210 * lists when it has been fully truncated and is freed. The 3220 * lists when it has been fully truncated and is freed. The
3211 * freeing of the inode and its removal from the list must be 3221 * freeing of the inode and its removal from the list must be
3212 * atomic. 3222 * atomic.
3213 */ 3223 */
3214 STATIC void 3224 STATIC void
3215 xlog_recover_process_iunlinks( 3225 xlog_recover_process_iunlinks(
3216 struct xlog *log) 3226 struct xlog *log)
3217 { 3227 {
3218 xfs_mount_t *mp; 3228 xfs_mount_t *mp;
3219 xfs_agnumber_t agno; 3229 xfs_agnumber_t agno;
3220 xfs_agi_t *agi; 3230 xfs_agi_t *agi;
3221 xfs_buf_t *agibp; 3231 xfs_buf_t *agibp;
3222 xfs_agino_t agino; 3232 xfs_agino_t agino;
3223 int bucket; 3233 int bucket;
3224 int error; 3234 int error;
3225 uint mp_dmevmask; 3235 uint mp_dmevmask;
3226 3236
3227 mp = log->l_mp; 3237 mp = log->l_mp;
3228 3238
3229 /* 3239 /*
3230 * Prevent any DMAPI event from being sent while in this function. 3240 * Prevent any DMAPI event from being sent while in this function.
3231 */ 3241 */
3232 mp_dmevmask = mp->m_dmevmask; 3242 mp_dmevmask = mp->m_dmevmask;
3233 mp->m_dmevmask = 0; 3243 mp->m_dmevmask = 0;
3234 3244
3235 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 3245 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
3236 /* 3246 /*
3237 * Find the agi for this ag. 3247 * Find the agi for this ag.
3238 */ 3248 */
3239 error = xfs_read_agi(mp, NULL, agno, &agibp); 3249 error = xfs_read_agi(mp, NULL, agno, &agibp);
3240 if (error) { 3250 if (error) {
3241 /* 3251 /*
3242 * AGI is b0rked. Don't process it. 3252 * AGI is b0rked. Don't process it.
3243 * 3253 *
3244 * We should probably mark the filesystem as corrupt 3254 * We should probably mark the filesystem as corrupt
3245 * after we've recovered all the ag's we can.... 3255 * after we've recovered all the ag's we can....
3246 */ 3256 */
3247 continue; 3257 continue;
3248 } 3258 }
3249 /* 3259 /*
3250 * Unlock the buffer so that it can be acquired in the normal 3260 * Unlock the buffer so that it can be acquired in the normal
3251 * course of the transaction to truncate and free each inode. 3261 * course of the transaction to truncate and free each inode.
3252 * Because we are not racing with anyone else here for the AGI 3262 * Because we are not racing with anyone else here for the AGI
3253 * buffer, we don't even need to hold it locked to read the 3263 * buffer, we don't even need to hold it locked to read the
3254 * initial unlinked bucket entries out of the buffer. We keep 3264 * initial unlinked bucket entries out of the buffer. We keep
3255 * the buffer reference, though, so that it stays pinned in memory 3265 * the buffer reference, though, so that it stays pinned in memory
3256 * while we need the buffer. 3266 * while we need the buffer.
3257 */ 3267 */
3258 agi = XFS_BUF_TO_AGI(agibp); 3268 agi = XFS_BUF_TO_AGI(agibp);
3259 xfs_buf_unlock(agibp); 3269 xfs_buf_unlock(agibp);
3260 3270
3261 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { 3271 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
3262 agino = be32_to_cpu(agi->agi_unlinked[bucket]); 3272 agino = be32_to_cpu(agi->agi_unlinked[bucket]);
3263 while (agino != NULLAGINO) { 3273 while (agino != NULLAGINO) {
3264 agino = xlog_recover_process_one_iunlink(mp, 3274 agino = xlog_recover_process_one_iunlink(mp,
3265 agno, agino, bucket); 3275 agno, agino, bucket);
3266 } 3276 }
3267 } 3277 }
3268 xfs_buf_rele(agibp); 3278 xfs_buf_rele(agibp);
3269 } 3279 }
3270 3280
3271 mp->m_dmevmask = mp_dmevmask; 3281 mp->m_dmevmask = mp_dmevmask;
3272 } 3282 }
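
The lists drained above are built at unlink time: the inode is hashed into one of XFS_AGI_UNLINKED_BUCKETS buckets and pushed onto the head of that bucket's on-disk, singly linked chain. The idea, stripped of the buffer handling and logging (a sketch, not the verbatim xfs_iunlink()):

    bucket = agino % XFS_AGI_UNLINKED_BUCKETS;

    /* the new entry points at the old list head ... */
    dip->di_next_unlinked = agi->agi_unlinked[bucket];
    /* ... and the bucket head now points at the new entry */
    agi->agi_unlinked[bucket] = cpu_to_be32(agino);

Recovery simply walks di_next_unlinked from each bucket head, as xlog_recover_process_one_iunlink() does above.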
3273 3283
3274 /* 3284 /*
3275 * Unpack the log buffer data and CRC check it. If the check fails, issue a 3285 * Unpack the log buffer data and CRC check it. If the check fails, issue a
3276 * warning if and only if the CRC in the header is non-zero. This makes the 3286 * warning if and only if the CRC in the header is non-zero. This makes the
3277 * check an advisory warning, and the zero CRC check will prevent failure 3287 * check an advisory warning, and the zero CRC check will prevent failure
3278 * warnings from being emitted when upgrading the kernel from one that does not 3288 * warnings from being emitted when upgrading the kernel from one that does not
3279 * add CRCs by default. 3289 * add CRCs by default.
3280 * 3290 *
3281 * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log 3291 * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log
3282 * corruption failure. 3292 * corruption failure.
3283 */ 3293 */
3284 STATIC int 3294 STATIC int
3285 xlog_unpack_data_crc( 3295 xlog_unpack_data_crc(
3286 struct xlog_rec_header *rhead, 3296 struct xlog_rec_header *rhead,
3287 xfs_caddr_t dp, 3297 xfs_caddr_t dp,
3288 struct xlog *log) 3298 struct xlog *log)
3289 { 3299 {
3290 __le32 crc; 3300 __le32 crc;
3291 3301
3292 crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); 3302 crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
3293 if (crc != rhead->h_crc) { 3303 if (crc != rhead->h_crc) {
3294 if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { 3304 if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
3295 xfs_alert(log->l_mp, 3305 xfs_alert(log->l_mp,
3296 "log record CRC mismatch: found 0x%x, expected 0x%x.\n", 3306 "log record CRC mismatch: found 0x%x, expected 0x%x.\n",
3297 le32_to_cpu(rhead->h_crc), 3307 le32_to_cpu(rhead->h_crc),
3298 le32_to_cpu(crc)); 3308 le32_to_cpu(crc));
3299 xfs_hex_dump(dp, 32); 3309 xfs_hex_dump(dp, 32);
3300 } 3310 }
3301 3311
3302 /* 3312 /*
3303 * If we've detected a log record corruption, then we can't 3313 * If we've detected a log record corruption, then we can't
3304 * recover past this point. Abort recovery if we are enforcing 3314 * recover past this point. Abort recovery if we are enforcing
3305 * CRC protection by punting an error back up the stack. 3315 * CRC protection by punting an error back up the stack.
3306 */ 3316 */
3307 if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) 3317 if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
3308 return EFSCORRUPTED; 3318 return EFSCORRUPTED;
3309 } 3319 }
3310 3320
3311 return 0; 3321 return 0;
3312 } 3322 }
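
The resulting policy is easier to read as a decision table (restating the checks above):

    /*
     * computed crc == h_crc                     -> record accepted
     * mismatch, h_crc == 0, sb not CRC-enabled  -> accepted silently
     *                                              (log written by a pre-CRC kernel)
     * mismatch, h_crc != 0, sb not CRC-enabled  -> warning only, recovery continues
     * mismatch, sb CRC-enabled                  -> warning plus EFSCORRUPTED, recovery stops
     */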
3313 3323
3314 STATIC int 3324 STATIC int
3315 xlog_unpack_data( 3325 xlog_unpack_data(
3316 struct xlog_rec_header *rhead, 3326 struct xlog_rec_header *rhead,
3317 xfs_caddr_t dp, 3327 xfs_caddr_t dp,
3318 struct xlog *log) 3328 struct xlog *log)
3319 { 3329 {
3320 int i, j, k; 3330 int i, j, k;
3321 int error; 3331 int error;
3322 3332
3323 error = xlog_unpack_data_crc(rhead, dp, log); 3333 error = xlog_unpack_data_crc(rhead, dp, log);
3324 if (error) 3334 if (error)
3325 return error; 3335 return error;
3326 3336
3327 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && 3337 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
3328 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { 3338 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3329 *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i]; 3339 *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
3330 dp += BBSIZE; 3340 dp += BBSIZE;
3331 } 3341 }
3332 3342
3333 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 3343 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3334 xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead; 3344 xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
3335 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { 3345 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
3336 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3346 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3337 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3347 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3338 *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k]; 3348 *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
3339 dp += BBSIZE; 3349 dp += BBSIZE;
3340 } 3350 }
3341 } 3351 }
3342 3352
3343 return 0; 3353 return 0;
3344 } 3354 }
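
For orientation: when a record is written, the first four bytes of every 512-byte basic block in it are overwritten with the record's cycle number, and the displaced words are stashed in h_cycle_data (and, for v2 logs, in the extended headers). The unpack loop above restores them; the write side is presumably just the inverse, along these lines (a sketch with assumed variable names, not the verbatim xlog_sync() path):

    for (i = 0; i < BTOBB(size); i++) {
            rhead->h_cycle_data[i] = *(__be32 *)dp; /* save the real data word */
            *(__be32 *)dp = cycle_word;             /* stamp the (big-endian) cycle number */
            dp += BBSIZE;
    }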
3345 3355
3346 STATIC int 3356 STATIC int
3347 xlog_valid_rec_header( 3357 xlog_valid_rec_header(
3348 struct xlog *log, 3358 struct xlog *log,
3349 struct xlog_rec_header *rhead, 3359 struct xlog_rec_header *rhead,
3350 xfs_daddr_t blkno) 3360 xfs_daddr_t blkno)
3351 { 3361 {
3352 int hlen; 3362 int hlen;
3353 3363
3354 if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) { 3364 if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) {
3355 XFS_ERROR_REPORT("xlog_valid_rec_header(1)", 3365 XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
3356 XFS_ERRLEVEL_LOW, log->l_mp); 3366 XFS_ERRLEVEL_LOW, log->l_mp);
3357 return XFS_ERROR(EFSCORRUPTED); 3367 return XFS_ERROR(EFSCORRUPTED);
3358 } 3368 }
3359 if (unlikely( 3369 if (unlikely(
3360 (!rhead->h_version || 3370 (!rhead->h_version ||
3361 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { 3371 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
3362 xfs_warn(log->l_mp, "%s: unrecognised log version (%d).", 3372 xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
3363 __func__, be32_to_cpu(rhead->h_version)); 3373 __func__, be32_to_cpu(rhead->h_version));
3364 return XFS_ERROR(EIO); 3374 return XFS_ERROR(EIO);
3365 } 3375 }
3366 3376
3367 /* LR body must have data or it wouldn't have been written */ 3377 /* LR body must have data or it wouldn't have been written */
3368 hlen = be32_to_cpu(rhead->h_len); 3378 hlen = be32_to_cpu(rhead->h_len);
3369 if (unlikely( hlen <= 0 || hlen > INT_MAX )) { 3379 if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
3370 XFS_ERROR_REPORT("xlog_valid_rec_header(2)", 3380 XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
3371 XFS_ERRLEVEL_LOW, log->l_mp); 3381 XFS_ERRLEVEL_LOW, log->l_mp);
3372 return XFS_ERROR(EFSCORRUPTED); 3382 return XFS_ERROR(EFSCORRUPTED);
3373 } 3383 }
3374 if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) { 3384 if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
3375 XFS_ERROR_REPORT("xlog_valid_rec_header(3)", 3385 XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
3376 XFS_ERRLEVEL_LOW, log->l_mp); 3386 XFS_ERRLEVEL_LOW, log->l_mp);
3377 return XFS_ERROR(EFSCORRUPTED); 3387 return XFS_ERROR(EFSCORRUPTED);
3378 } 3388 }
3379 return 0; 3389 return 0;
3380 } 3390 }
3381 3391
3382 /* 3392 /*
3383 * Read the log from tail to head and process the log records found. 3393 * Read the log from tail to head and process the log records found.
3384 * Handle the two cases where the tail and head are in the same cycle 3394 * Handle the two cases where the tail and head are in the same cycle
3385 * and where the active portion of the log wraps around the end of 3395 * and where the active portion of the log wraps around the end of
3386 * the physical log separately. The pass parameter is passed through 3396 * the physical log separately. The pass parameter is passed through
3387 * to the routines called to process the data and is not looked at 3397 * to the routines called to process the data and is not looked at
3388 * here. 3398 * here.
3389 */ 3399 */
3390 STATIC int 3400 STATIC int
3391 xlog_do_recovery_pass( 3401 xlog_do_recovery_pass(
3392 struct xlog *log, 3402 struct xlog *log,
3393 xfs_daddr_t head_blk, 3403 xfs_daddr_t head_blk,
3394 xfs_daddr_t tail_blk, 3404 xfs_daddr_t tail_blk,
3395 int pass) 3405 int pass)
3396 { 3406 {
3397 xlog_rec_header_t *rhead; 3407 xlog_rec_header_t *rhead;
3398 xfs_daddr_t blk_no; 3408 xfs_daddr_t blk_no;
3399 xfs_caddr_t offset; 3409 xfs_caddr_t offset;
3400 xfs_buf_t *hbp, *dbp; 3410 xfs_buf_t *hbp, *dbp;
3401 int error = 0, h_size; 3411 int error = 0, h_size;
3402 int bblks, split_bblks; 3412 int bblks, split_bblks;
3403 int hblks, split_hblks, wrapped_hblks; 3413 int hblks, split_hblks, wrapped_hblks;
3404 struct hlist_head rhash[XLOG_RHASH_SIZE]; 3414 struct hlist_head rhash[XLOG_RHASH_SIZE];
3405 3415
3406 ASSERT(head_blk != tail_blk); 3416 ASSERT(head_blk != tail_blk);
3407 3417
3408 /* 3418 /*
3409 * Read the header of the tail block and get the iclog buffer size from 3419 * Read the header of the tail block and get the iclog buffer size from
3410 * h_size. Use this to tell how many sectors make up the log header. 3420 * h_size. Use this to tell how many sectors make up the log header.
3411 */ 3421 */
3412 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 3422 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3413 /* 3423 /*
3414 * When using variable length iclogs, read first sector of 3424 * When using variable length iclogs, read first sector of
3415 * iclog header and extract the header size from it. Get a 3425 * iclog header and extract the header size from it. Get a
3416 * new hbp that is the correct size. 3426 * new hbp that is the correct size.
3417 */ 3427 */
3418 hbp = xlog_get_bp(log, 1); 3428 hbp = xlog_get_bp(log, 1);
3419 if (!hbp) 3429 if (!hbp)
3420 return ENOMEM; 3430 return ENOMEM;
3421 3431
3422 error = xlog_bread(log, tail_blk, 1, hbp, &offset); 3432 error = xlog_bread(log, tail_blk, 1, hbp, &offset);
3423 if (error) 3433 if (error)
3424 goto bread_err1; 3434 goto bread_err1;
3425 3435
3426 rhead = (xlog_rec_header_t *)offset; 3436 rhead = (xlog_rec_header_t *)offset;
3427 error = xlog_valid_rec_header(log, rhead, tail_blk); 3437 error = xlog_valid_rec_header(log, rhead, tail_blk);
3428 if (error) 3438 if (error)
3429 goto bread_err1; 3439 goto bread_err1;
3430 h_size = be32_to_cpu(rhead->h_size); 3440 h_size = be32_to_cpu(rhead->h_size);
3431 if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) && 3441 if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
3432 (h_size > XLOG_HEADER_CYCLE_SIZE)) { 3442 (h_size > XLOG_HEADER_CYCLE_SIZE)) {
3433 hblks = h_size / XLOG_HEADER_CYCLE_SIZE; 3443 hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
3434 if (h_size % XLOG_HEADER_CYCLE_SIZE) 3444 if (h_size % XLOG_HEADER_CYCLE_SIZE)
3435 hblks++; 3445 hblks++;
3436 xlog_put_bp(hbp); 3446 xlog_put_bp(hbp);
3437 hbp = xlog_get_bp(log, hblks); 3447 hbp = xlog_get_bp(log, hblks);
3438 } else { 3448 } else {
3439 hblks = 1; 3449 hblks = 1;
3440 } 3450 }
3441 } else { 3451 } else {
3442 ASSERT(log->l_sectBBsize == 1); 3452 ASSERT(log->l_sectBBsize == 1);
3443 hblks = 1; 3453 hblks = 1;
3444 hbp = xlog_get_bp(log, 1); 3454 hbp = xlog_get_bp(log, 1);
3445 h_size = XLOG_BIG_RECORD_BSIZE; 3455 h_size = XLOG_BIG_RECORD_BSIZE;
3446 } 3456 }
3447 3457
3448 if (!hbp) 3458 if (!hbp)
3449 return ENOMEM; 3459 return ENOMEM;
3450 dbp = xlog_get_bp(log, BTOBB(h_size)); 3460 dbp = xlog_get_bp(log, BTOBB(h_size));
3451 if (!dbp) { 3461 if (!dbp) {
3452 xlog_put_bp(hbp); 3462 xlog_put_bp(hbp);
3453 return ENOMEM; 3463 return ENOMEM;
3454 } 3464 }
3455 3465
3456 memset(rhash, 0, sizeof(rhash)); 3466 memset(rhash, 0, sizeof(rhash));
3457 if (tail_blk <= head_blk) { 3467 if (tail_blk <= head_blk) {
3458 for (blk_no = tail_blk; blk_no < head_blk; ) { 3468 for (blk_no = tail_blk; blk_no < head_blk; ) {
3459 error = xlog_bread(log, blk_no, hblks, hbp, &offset); 3469 error = xlog_bread(log, blk_no, hblks, hbp, &offset);
3460 if (error) 3470 if (error)
3461 goto bread_err2; 3471 goto bread_err2;
3462 3472
3463 rhead = (xlog_rec_header_t *)offset; 3473 rhead = (xlog_rec_header_t *)offset;
3464 error = xlog_valid_rec_header(log, rhead, blk_no); 3474 error = xlog_valid_rec_header(log, rhead, blk_no);
3465 if (error) 3475 if (error)
3466 goto bread_err2; 3476 goto bread_err2;
3467 3477
3468 /* blocks in data section */ 3478 /* blocks in data section */
3469 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); 3479 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3470 error = xlog_bread(log, blk_no + hblks, bblks, dbp, 3480 error = xlog_bread(log, blk_no + hblks, bblks, dbp,
3471 &offset); 3481 &offset);
3472 if (error) 3482 if (error)
3473 goto bread_err2; 3483 goto bread_err2;
3474 3484
3475 error = xlog_unpack_data(rhead, offset, log); 3485 error = xlog_unpack_data(rhead, offset, log);
3476 if (error) 3486 if (error)
3477 goto bread_err2; 3487 goto bread_err2;
3478 3488
3479 error = xlog_recover_process_data(log, 3489 error = xlog_recover_process_data(log,
3480 rhash, rhead, offset, pass); 3490 rhash, rhead, offset, pass);
3481 if (error) 3491 if (error)
3482 goto bread_err2; 3492 goto bread_err2;
3483 blk_no += bblks + hblks; 3493 blk_no += bblks + hblks;
3484 } 3494 }
3485 } else { 3495 } else {
3486 /* 3496 /*
3487 * Perform recovery around the end of the physical log. 3497 * Perform recovery around the end of the physical log.
3488 * When the head is not on the same cycle number as the tail, 3498 * When the head is not on the same cycle number as the tail,
3489 * we can't do a sequential recovery as above. 3499 * we can't do a sequential recovery as above.
3490 */ 3500 */
3491 blk_no = tail_blk; 3501 blk_no = tail_blk;
3492 while (blk_no < log->l_logBBsize) { 3502 while (blk_no < log->l_logBBsize) {
3493 /* 3503 /*
3494 * Check for header wrapping around physical end-of-log 3504 * Check for header wrapping around physical end-of-log
3495 */ 3505 */
3496 offset = hbp->b_addr; 3506 offset = hbp->b_addr;
3497 split_hblks = 0; 3507 split_hblks = 0;
3498 wrapped_hblks = 0; 3508 wrapped_hblks = 0;
3499 if (blk_no + hblks <= log->l_logBBsize) { 3509 if (blk_no + hblks <= log->l_logBBsize) {
3500 /* Read header in one read */ 3510 /* Read header in one read */
3501 error = xlog_bread(log, blk_no, hblks, hbp, 3511 error = xlog_bread(log, blk_no, hblks, hbp,
3502 &offset); 3512 &offset);
3503 if (error) 3513 if (error)
3504 goto bread_err2; 3514 goto bread_err2;
3505 } else { 3515 } else {
3506 /* This LR is split across physical log end */ 3516 /* This LR is split across physical log end */
3507 if (blk_no != log->l_logBBsize) { 3517 if (blk_no != log->l_logBBsize) {
3508 /* some data before physical log end */ 3518 /* some data before physical log end */
3509 ASSERT(blk_no <= INT_MAX); 3519 ASSERT(blk_no <= INT_MAX);
3510 split_hblks = log->l_logBBsize - (int)blk_no; 3520 split_hblks = log->l_logBBsize - (int)blk_no;
3511 ASSERT(split_hblks > 0); 3521 ASSERT(split_hblks > 0);
3512 error = xlog_bread(log, blk_no, 3522 error = xlog_bread(log, blk_no,
3513 split_hblks, hbp, 3523 split_hblks, hbp,
3514 &offset); 3524 &offset);
3515 if (error) 3525 if (error)
3516 goto bread_err2; 3526 goto bread_err2;
3517 } 3527 }
3518 3528
3519 /* 3529 /*
3520 * Note: this black magic still works with 3530 * Note: this black magic still works with
3521 * large sector sizes (non-512) only because: 3531 * large sector sizes (non-512) only because:
3522 * - we increased the buffer size originally 3532 * - we increased the buffer size originally
3523 * by 1 sector giving us enough extra space 3533 * by 1 sector giving us enough extra space
3524 * for the second read; 3534 * for the second read;
3525 * - the log start is guaranteed to be sector 3535 * - the log start is guaranteed to be sector
3526 * aligned; 3536 * aligned;
3527 * - we read the log end (LR header start) 3537 * - we read the log end (LR header start)
3528 * _first_, then the log start (LR header end) 3538 * _first_, then the log start (LR header end)
3529 * - order is important. 3539 * - order is important.
3530 */ 3540 */
3531 wrapped_hblks = hblks - split_hblks; 3541 wrapped_hblks = hblks - split_hblks;
3532 error = xlog_bread_offset(log, 0, 3542 error = xlog_bread_offset(log, 0,
3533 wrapped_hblks, hbp, 3543 wrapped_hblks, hbp,
3534 offset + BBTOB(split_hblks)); 3544 offset + BBTOB(split_hblks));
3535 if (error) 3545 if (error)
3536 goto bread_err2; 3546 goto bread_err2;
3537 } 3547 }
3538 rhead = (xlog_rec_header_t *)offset; 3548 rhead = (xlog_rec_header_t *)offset;
3539 error = xlog_valid_rec_header(log, rhead, 3549 error = xlog_valid_rec_header(log, rhead,
3540 split_hblks ? blk_no : 0); 3550 split_hblks ? blk_no : 0);
3541 if (error) 3551 if (error)
3542 goto bread_err2; 3552 goto bread_err2;
3543 3553
3544 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); 3554 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3545 blk_no += hblks; 3555 blk_no += hblks;
3546 3556
3547 /* Read in data for log record */ 3557 /* Read in data for log record */
3548 if (blk_no + bblks <= log->l_logBBsize) { 3558 if (blk_no + bblks <= log->l_logBBsize) {
3549 error = xlog_bread(log, blk_no, bblks, dbp, 3559 error = xlog_bread(log, blk_no, bblks, dbp,
3550 &offset); 3560 &offset);
3551 if (error) 3561 if (error)
3552 goto bread_err2; 3562 goto bread_err2;
3553 } else { 3563 } else {
3554 /* This log record is split across the 3564 /* This log record is split across the
3555 * physical end of log */ 3565 * physical end of log */
3556 offset = dbp->b_addr; 3566 offset = dbp->b_addr;
3557 split_bblks = 0; 3567 split_bblks = 0;
3558 if (blk_no != log->l_logBBsize) { 3568 if (blk_no != log->l_logBBsize) {
3559 /* some data is before the physical 3569 /* some data is before the physical
3560 * end of log */ 3570 * end of log */
3561 ASSERT(!wrapped_hblks); 3571 ASSERT(!wrapped_hblks);
3562 ASSERT(blk_no <= INT_MAX); 3572 ASSERT(blk_no <= INT_MAX);
3563 split_bblks = 3573 split_bblks =
3564 log->l_logBBsize - (int)blk_no; 3574 log->l_logBBsize - (int)blk_no;
3565 ASSERT(split_bblks > 0); 3575 ASSERT(split_bblks > 0);
3566 error = xlog_bread(log, blk_no, 3576 error = xlog_bread(log, blk_no,
3567 split_bblks, dbp, 3577 split_bblks, dbp,
3568 &offset); 3578 &offset);
3569 if (error) 3579 if (error)
3570 goto bread_err2; 3580 goto bread_err2;
3571 } 3581 }
3572 3582
3573 /* 3583 /*
3574 * Note: this black magic still works with 3584 * Note: this black magic still works with
3575 * large sector sizes (non-512) only because: 3585 * large sector sizes (non-512) only because:
3576 * - we increased the buffer size originally 3586 * - we increased the buffer size originally
3577 * by 1 sector giving us enough extra space 3587 * by 1 sector giving us enough extra space
3578 * for the second read; 3588 * for the second read;
3579 * - the log start is guaranteed to be sector 3589 * - the log start is guaranteed to be sector
3580 * aligned; 3590 * aligned;
3581 * - we read the log end (LR header start) 3591 * - we read the log end (LR header start)
3582 * _first_, then the log start (LR header end) 3592 * _first_, then the log start (LR header end)
3583 * - order is important. 3593 * - order is important.
3584 */ 3594 */
3585 error = xlog_bread_offset(log, 0, 3595 error = xlog_bread_offset(log, 0,
3586 bblks - split_bblks, dbp, 3596 bblks - split_bblks, dbp,
3587 offset + BBTOB(split_bblks)); 3597 offset + BBTOB(split_bblks));
3588 if (error) 3598 if (error)
3589 goto bread_err2; 3599 goto bread_err2;
3590 } 3600 }
3591 3601
3592 error = xlog_unpack_data(rhead, offset, log); 3602 error = xlog_unpack_data(rhead, offset, log);
3593 if (error) 3603 if (error)
3594 goto bread_err2; 3604 goto bread_err2;
3595 3605
3596 error = xlog_recover_process_data(log, rhash, 3606 error = xlog_recover_process_data(log, rhash,
3597 rhead, offset, pass); 3607 rhead, offset, pass);
3598 if (error) 3608 if (error)
3599 goto bread_err2; 3609 goto bread_err2;
3600 blk_no += bblks; 3610 blk_no += bblks;
3601 } 3611 }
3602 3612
3603 ASSERT(blk_no >= log->l_logBBsize); 3613 ASSERT(blk_no >= log->l_logBBsize);
3604 blk_no -= log->l_logBBsize; 3614 blk_no -= log->l_logBBsize;
3605 3615
3606 /* read first part of physical log */ 3616 /* read first part of physical log */
3607 while (blk_no < head_blk) { 3617 while (blk_no < head_blk) {
3608 error = xlog_bread(log, blk_no, hblks, hbp, &offset); 3618 error = xlog_bread(log, blk_no, hblks, hbp, &offset);
3609 if (error) 3619 if (error)
3610 goto bread_err2; 3620 goto bread_err2;
3611 3621
3612 rhead = (xlog_rec_header_t *)offset; 3622 rhead = (xlog_rec_header_t *)offset;
3613 error = xlog_valid_rec_header(log, rhead, blk_no); 3623 error = xlog_valid_rec_header(log, rhead, blk_no);
3614 if (error) 3624 if (error)
3615 goto bread_err2; 3625 goto bread_err2;
3616 3626
3617 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); 3627 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3618 error = xlog_bread(log, blk_no+hblks, bblks, dbp, 3628 error = xlog_bread(log, blk_no+hblks, bblks, dbp,
3619 &offset); 3629 &offset);
3620 if (error) 3630 if (error)
3621 goto bread_err2; 3631 goto bread_err2;
3622 3632
3623 error = xlog_unpack_data(rhead, offset, log); 3633 error = xlog_unpack_data(rhead, offset, log);
3624 if (error) 3634 if (error)
3625 goto bread_err2; 3635 goto bread_err2;
3626 3636
3627 error = xlog_recover_process_data(log, rhash, 3637 error = xlog_recover_process_data(log, rhash,
3628 rhead, offset, pass); 3638 rhead, offset, pass);
3629 if (error) 3639 if (error)
3630 goto bread_err2; 3640 goto bread_err2;
3631 blk_no += bblks + hblks; 3641 blk_no += bblks + hblks;
3632 } 3642 }
3633 } 3643 }
3634 3644
3635 bread_err2: 3645 bread_err2:
3636 xlog_put_bp(dbp); 3646 xlog_put_bp(dbp);
3637 bread_err1: 3647 bread_err1:
3638 xlog_put_bp(hbp); 3648 xlog_put_bp(hbp);
3639 return error; 3649 return error;
3640 } 3650 }
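The split-read handling above (the "black magic" comments) boils down to: if a record header or record body crosses the physical end of the log, read the tail-end piece first, then wrap to block 0 and append the remainder into the same oversized buffer. A minimal user-space sketch of just that bookkeeping follows; it assumes 512-byte basic blocks and uses invented names, so it is an illustration of the arithmetic, not the kernel code itself.

#include <stdio.h>

#define BBSIZE  512                     /* assumed basic block size */
#define BBTOB(bbs) ((bbs) * BBSIZE)     /* basic blocks to bytes */

struct split_read {
	int first_blk;  /* start of the segment before the log end */
	int first_len;  /* blocks read before the physical log end */
	int wrap_len;   /* blocks read from block 0 after wrapping */
};

/* Split a read of 'len' blocks at 'blk_no' in a log of 'log_bbsize' blocks. */
static struct split_read split_log_read(int log_bbsize, int blk_no, int len)
{
	struct split_read r = { .first_blk = blk_no, .first_len = 0, .wrap_len = 0 };

	if (blk_no + len <= log_bbsize) {
		r.first_len = len;                  /* fits: one contiguous read */
	} else {
		r.first_len = log_bbsize - blk_no;  /* may be 0 at the exact end */
		r.wrap_len = len - r.first_len;     /* rest comes from block 0 */
	}
	return r;
}

int main(void)
{
	/* a 2-block header starting 1 block before the end of a 1000-block log */
	struct split_read r = split_log_read(1000, 999, 2);

	printf("read %d blks at %d, then %d blks at 0 into offset %d bytes\n",
	       r.first_len, r.first_blk, r.wrap_len, BBTOB(r.first_len));
	return 0;
}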
3641 3651
3642 /* 3652 /*
3643 * Do the recovery of the log. We actually do this in two phases. 3653 * Do the recovery of the log. We actually do this in two phases.
3644 * The two passes are necessary in order to implement the function 3654 * The two passes are necessary in order to implement the function
3645 * of cancelling a record written into the log. The first pass 3655 * of cancelling a record written into the log. The first pass
3646 * determines those things which have been cancelled, and the 3656 * determines those things which have been cancelled, and the
3647 * second pass replays log items normally except for those which 3657 * second pass replays log items normally except for those which
3648 * have been cancelled. The handling of the replay and cancellations 3658 * have been cancelled. The handling of the replay and cancellations
3649 * takes place in the log item type specific routines. 3659 * takes place in the log item type specific routines.
3650 * 3660 *
3651 * The table of items which have cancel records in the log is allocated 3661 * The table of items which have cancel records in the log is allocated
3652 * and freed at this level, since only here do we know when all of 3662 * and freed at this level, since only here do we know when all of
3653 * the log recovery has been completed. 3663 * the log recovery has been completed.
3654 */ 3664 */
3655 STATIC int 3665 STATIC int
3656 xlog_do_log_recovery( 3666 xlog_do_log_recovery(
3657 struct xlog *log, 3667 struct xlog *log,
3658 xfs_daddr_t head_blk, 3668 xfs_daddr_t head_blk,
3659 xfs_daddr_t tail_blk) 3669 xfs_daddr_t tail_blk)
3660 { 3670 {
3661 int error, i; 3671 int error, i;
3662 3672
3663 ASSERT(head_blk != tail_blk); 3673 ASSERT(head_blk != tail_blk);
3664 3674
3665 /* 3675 /*
3666 * First do a pass to find all of the cancelled buf log items. 3676 * First do a pass to find all of the cancelled buf log items.
3667 * Store them in the buf_cancel_table for use in the second pass. 3677 * Store them in the buf_cancel_table for use in the second pass.
3668 */ 3678 */
3669 log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * 3679 log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
3670 sizeof(struct list_head), 3680 sizeof(struct list_head),
3671 KM_SLEEP); 3681 KM_SLEEP);
3672 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 3682 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3673 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); 3683 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
3674 3684
3675 error = xlog_do_recovery_pass(log, head_blk, tail_blk, 3685 error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3676 XLOG_RECOVER_PASS1); 3686 XLOG_RECOVER_PASS1);
3677 if (error != 0) { 3687 if (error != 0) {
3678 kmem_free(log->l_buf_cancel_table); 3688 kmem_free(log->l_buf_cancel_table);
3679 log->l_buf_cancel_table = NULL; 3689 log->l_buf_cancel_table = NULL;
3680 return error; 3690 return error;
3681 } 3691 }
3682 /* 3692 /*
3683 * Then do a second pass to actually recover the items in the log. 3693 * Then do a second pass to actually recover the items in the log.
3684 * When it is complete free the table of buf cancel items. 3694 * When it is complete free the table of buf cancel items.
3685 */ 3695 */
3686 error = xlog_do_recovery_pass(log, head_blk, tail_blk, 3696 error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3687 XLOG_RECOVER_PASS2); 3697 XLOG_RECOVER_PASS2);
3688 #ifdef DEBUG 3698 #ifdef DEBUG
3689 if (!error) { 3699 if (!error) {
3690 int i; 3700 int i;
3691 3701
3692 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 3702 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3693 ASSERT(list_empty(&log->l_buf_cancel_table[i])); 3703 ASSERT(list_empty(&log->l_buf_cancel_table[i]));
3694 } 3704 }
3695 #endif /* DEBUG */ 3705 #endif /* DEBUG */
3696 3706
3697 kmem_free(log->l_buf_cancel_table); 3707 kmem_free(log->l_buf_cancel_table);
3698 log->l_buf_cancel_table = NULL; 3708 log->l_buf_cancel_table = NULL;
3699 3709
3700 return error; 3710 return error;
3701 } 3711 }
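The two-pass scheme described above can be pictured with a small stand-alone sketch: pass 1 records the block numbers of cancelled buffers in a hash table, and pass 2 consults that table before replaying a buffer. This is purely illustrative user-space code; the table size, hash function and helper names are assumptions, not the kernel's buf_cancel_table implementation.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define BC_TABLE_SIZE 64

struct bc_entry {
	long long blkno;
	struct bc_entry *next;
};

static struct bc_entry *bc_table[BC_TABLE_SIZE];

static unsigned bc_hash(long long blkno)
{
	return (unsigned)(blkno % BC_TABLE_SIZE);
}

/* Pass 1: remember that this buffer was cancelled. */
static void bc_record(long long blkno)
{
	unsigned h = bc_hash(blkno);
	struct bc_entry *e = malloc(sizeof(*e));

	if (!e)
		abort();
	e->blkno = blkno;
	e->next = bc_table[h];
	bc_table[h] = e;
}

/* Pass 2: should replay of this buffer be skipped? */
static bool bc_cancelled(long long blkno)
{
	struct bc_entry *e;

	for (e = bc_table[bc_hash(blkno)]; e; e = e->next)
		if (e->blkno == blkno)
			return true;
	return false;
}

int main(void)
{
	bc_record(1234);                                     /* pass 1 */
	printf("1234 cancelled? %d\n", bc_cancelled(1234));  /* pass 2: 1 */
	printf("5678 cancelled? %d\n", bc_cancelled(5678));  /* pass 2: 0 */
	return 0;
}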
3702 3712
3703 /* 3713 /*
3704 * Do the actual recovery 3714 * Do the actual recovery
3705 */ 3715 */
3706 STATIC int 3716 STATIC int
3707 xlog_do_recover( 3717 xlog_do_recover(
3708 struct xlog *log, 3718 struct xlog *log,
3709 xfs_daddr_t head_blk, 3719 xfs_daddr_t head_blk,
3710 xfs_daddr_t tail_blk) 3720 xfs_daddr_t tail_blk)
3711 { 3721 {
3712 int error; 3722 int error;
3713 xfs_buf_t *bp; 3723 xfs_buf_t *bp;
3714 xfs_sb_t *sbp; 3724 xfs_sb_t *sbp;
3715 3725
3716 /* 3726 /*
3717 * First replay the images in the log. 3727 * First replay the images in the log.
3718 */ 3728 */
3719 error = xlog_do_log_recovery(log, head_blk, tail_blk); 3729 error = xlog_do_log_recovery(log, head_blk, tail_blk);
3720 if (error) 3730 if (error)
3721 return error; 3731 return error;
3722 3732
3723 /* 3733 /*
3724 * If IO errors happened during recovery, bail out. 3734 * If IO errors happened during recovery, bail out.
3725 */ 3735 */
3726 if (XFS_FORCED_SHUTDOWN(log->l_mp)) { 3736 if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
3727 return (EIO); 3737 return (EIO);
3728 } 3738 }
3729 3739
3730 /* 3740 /*
3731 * We now update the tail_lsn since much of the recovery has completed 3741 * We now update the tail_lsn since much of the recovery has completed
3732 * and there may be space available to use. If there were no extent 3742 * and there may be space available to use. If there were no extent
3733 * or iunlinks, we can free up the entire log and set the tail_lsn to 3743 * or iunlinks, we can free up the entire log and set the tail_lsn to
3734 * be the last_sync_lsn. This was set in xlog_find_tail to be the 3744 * be the last_sync_lsn. This was set in xlog_find_tail to be the
3735 * lsn of the last known good LR on disk. If there are extent frees 3745 * lsn of the last known good LR on disk. If there are extent frees
3736 * or iunlinks they will have some entries in the AIL; so we look at 3746 * or iunlinks they will have some entries in the AIL; so we look at
3737 * the AIL to determine how to set the tail_lsn. 3747 * the AIL to determine how to set the tail_lsn.
3738 */ 3748 */
3739 xlog_assign_tail_lsn(log->l_mp); 3749 xlog_assign_tail_lsn(log->l_mp);
3740 3750
3741 /* 3751 /*
3742 * Now that we've finished replaying all buffer and inode 3752 * Now that we've finished replaying all buffer and inode
3743 * updates, re-read in the superblock and reverify it. 3753 * updates, re-read in the superblock and reverify it.
3744 */ 3754 */
3745 bp = xfs_getsb(log->l_mp, 0); 3755 bp = xfs_getsb(log->l_mp, 0);
3746 XFS_BUF_UNDONE(bp); 3756 XFS_BUF_UNDONE(bp);
3747 ASSERT(!(XFS_BUF_ISWRITE(bp))); 3757 ASSERT(!(XFS_BUF_ISWRITE(bp)));
3748 XFS_BUF_READ(bp); 3758 XFS_BUF_READ(bp);
3749 XFS_BUF_UNASYNC(bp); 3759 XFS_BUF_UNASYNC(bp);
3750 bp->b_ops = &xfs_sb_buf_ops; 3760 bp->b_ops = &xfs_sb_buf_ops;
3751 xfsbdstrat(log->l_mp, bp); 3761 xfsbdstrat(log->l_mp, bp);
3752 error = xfs_buf_iowait(bp); 3762 error = xfs_buf_iowait(bp);
3753 if (error) { 3763 if (error) {
3754 xfs_buf_ioerror_alert(bp, __func__); 3764 xfs_buf_ioerror_alert(bp, __func__);
3755 ASSERT(0); 3765 ASSERT(0);
3756 xfs_buf_relse(bp); 3766 xfs_buf_relse(bp);
3757 return error; 3767 return error;
3758 } 3768 }
3759 3769
3760 /* Convert superblock from on-disk format */ 3770 /* Convert superblock from on-disk format */
3761 sbp = &log->l_mp->m_sb; 3771 sbp = &log->l_mp->m_sb;
3762 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); 3772 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
3763 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); 3773 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
3764 ASSERT(xfs_sb_good_version(sbp)); 3774 ASSERT(xfs_sb_good_version(sbp));
3765 xfs_buf_relse(bp); 3775 xfs_buf_relse(bp);
3766 3776
3767 /* We've re-read the superblock so re-initialize per-cpu counters */ 3777 /* We've re-read the superblock so re-initialize per-cpu counters */
3768 xfs_icsb_reinit_counters(log->l_mp); 3778 xfs_icsb_reinit_counters(log->l_mp);
3769 3779
3770 xlog_recover_check_summary(log); 3780 xlog_recover_check_summary(log);
3771 3781
3772 /* Normal transactions can now occur */ 3782 /* Normal transactions can now occur */
3773 log->l_flags &= ~XLOG_ACTIVE_RECOVERY; 3783 log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
3774 return 0; 3784 return 0;
3775 } 3785 }
3776 3786
3777 /* 3787 /*
3778 * Perform recovery and re-initialize some log variables in xlog_find_tail. 3788 * Perform recovery and re-initialize some log variables in xlog_find_tail.
3779 * 3789 *
3780 * Return error or zero. 3790 * Return error or zero.
3781 */ 3791 */
3782 int 3792 int
3783 xlog_recover( 3793 xlog_recover(
3784 struct xlog *log) 3794 struct xlog *log)
3785 { 3795 {
3786 xfs_daddr_t head_blk, tail_blk; 3796 xfs_daddr_t head_blk, tail_blk;
3787 int error; 3797 int error;
3788 3798
3789 /* find the tail of the log */ 3799 /* find the tail of the log */
3790 if ((error = xlog_find_tail(log, &head_blk, &tail_blk))) 3800 if ((error = xlog_find_tail(log, &head_blk, &tail_blk)))
3791 return error; 3801 return error;
3792 3802
3793 if (tail_blk != head_blk) { 3803 if (tail_blk != head_blk) {
3794 /* There used to be a comment here: 3804 /* There used to be a comment here:
3795 * 3805 *
3796 * disallow recovery on read-only mounts. note -- mount 3806 * disallow recovery on read-only mounts. note -- mount
3797 * checks for ENOSPC and turns it into an intelligent 3807 * checks for ENOSPC and turns it into an intelligent
3798 * error message. 3808 * error message.
3799 * ...but this is no longer true. Now, unless you specify 3809 * ...but this is no longer true. Now, unless you specify
3800 * NORECOVERY (in which case this function would never be 3810 * NORECOVERY (in which case this function would never be
3801 * called), we just go ahead and recover. We do this all 3811 * called), we just go ahead and recover. We do this all
3802 * under the vfs layer, so we can get away with it unless 3812 * under the vfs layer, so we can get away with it unless
3803 * the device itself is read-only, in which case we fail. 3813 * the device itself is read-only, in which case we fail.
3804 */ 3814 */
3805 if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) { 3815 if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
3806 return error; 3816 return error;
3807 } 3817 }
3808 3818
3809 xfs_notice(log->l_mp, "Starting recovery (logdev: %s)", 3819 xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
3810 log->l_mp->m_logname ? log->l_mp->m_logname 3820 log->l_mp->m_logname ? log->l_mp->m_logname
3811 : "internal"); 3821 : "internal");
3812 3822
3813 error = xlog_do_recover(log, head_blk, tail_blk); 3823 error = xlog_do_recover(log, head_blk, tail_blk);
3814 log->l_flags |= XLOG_RECOVERY_NEEDED; 3824 log->l_flags |= XLOG_RECOVERY_NEEDED;
3815 } 3825 }
3816 return error; 3826 return error;
3817 } 3827 }
3818 3828
3819 /* 3829 /*
3820 * In the first part of recovery we replay inodes and buffers and build 3830 * In the first part of recovery we replay inodes and buffers and build
3821 * up the list of extent free items which need to be processed. Here 3831 * up the list of extent free items which need to be processed. Here
3822 * we process the extent free items and clean up the on disk unlinked 3832 * we process the extent free items and clean up the on disk unlinked
3823 * inode lists. This is separated from the first part of recovery so 3833 * inode lists. This is separated from the first part of recovery so
3824 * that the root and real-time bitmap inodes can be read in from disk in 3834 * that the root and real-time bitmap inodes can be read in from disk in
3825 * between the two stages. This is necessary so that we can free space 3835 * between the two stages. This is necessary so that we can free space
3826 * in the real-time portion of the file system. 3836 * in the real-time portion of the file system.
3827 */ 3837 */
3828 int 3838 int
3829 xlog_recover_finish( 3839 xlog_recover_finish(
3830 struct xlog *log) 3840 struct xlog *log)
3831 { 3841 {
3832 /* 3842 /*
3833 * Now we're ready to do the transactions needed for the 3843 * Now we're ready to do the transactions needed for the
3834 * rest of recovery. Start with completing all the extent 3844 * rest of recovery. Start with completing all the extent
3835 * free intent records and then process the unlinked inode 3845 * free intent records and then process the unlinked inode
3836 * lists. At this point, we essentially run in normal mode 3846 * lists. At this point, we essentially run in normal mode
3837 * except that we're still performing recovery actions 3847 * except that we're still performing recovery actions
3838 * rather than accepting new requests. 3848 * rather than accepting new requests.
3839 */ 3849 */
3840 if (log->l_flags & XLOG_RECOVERY_NEEDED) { 3850 if (log->l_flags & XLOG_RECOVERY_NEEDED) {
3841 int error; 3851 int error;
3842 error = xlog_recover_process_efis(log); 3852 error = xlog_recover_process_efis(log);
3843 if (error) { 3853 if (error) {
3844 xfs_alert(log->l_mp, "Failed to recover EFIs"); 3854 xfs_alert(log->l_mp, "Failed to recover EFIs");
3845 return error; 3855 return error;
3846 } 3856 }
3847 /* 3857 /*
3848 * Sync the log to get all the EFIs out of the AIL. 3858 * Sync the log to get all the EFIs out of the AIL.
3849 * This isn't absolutely necessary, but it helps in 3859 * This isn't absolutely necessary, but it helps in
3850 * case the unlink transactions would have problems 3860 * case the unlink transactions would have problems
3851 * pushing the EFIs out of the way. 3861 * pushing the EFIs out of the way.
3852 */ 3862 */
3853 xfs_log_force(log->l_mp, XFS_LOG_SYNC); 3863 xfs_log_force(log->l_mp, XFS_LOG_SYNC);
3854 3864
3855 xlog_recover_process_iunlinks(log); 3865 xlog_recover_process_iunlinks(log);
3856 3866
3857 xlog_recover_check_summary(log); 3867 xlog_recover_check_summary(log);
3858 3868
3859 xfs_notice(log->l_mp, "Ending recovery (logdev: %s)", 3869 xfs_notice(log->l_mp, "Ending recovery (logdev: %s)",
3860 log->l_mp->m_logname ? log->l_mp->m_logname 3870 log->l_mp->m_logname ? log->l_mp->m_logname
3861 : "internal"); 3871 : "internal");
3862 log->l_flags &= ~XLOG_RECOVERY_NEEDED; 3872 log->l_flags &= ~XLOG_RECOVERY_NEEDED;
3863 } else { 3873 } else {
3864 xfs_info(log->l_mp, "Ending clean mount"); 3874 xfs_info(log->l_mp, "Ending clean mount");
3865 } 3875 }
3866 return 0; 3876 return 0;
3867 } 3877 }
3868 3878
3869 3879
3870 #if defined(DEBUG) 3880 #if defined(DEBUG)
3871 /* 3881 /*
3872 * Read all of the agf and agi counters and check that they 3882 * Read all of the agf and agi counters and check that they
3873 * are consistent with the superblock counters. 3883 * are consistent with the superblock counters.
3874 */ 3884 */
3875 void 3885 void
3876 xlog_recover_check_summary( 3886 xlog_recover_check_summary(
3877 struct xlog *log) 3887 struct xlog *log)
3878 { 3888 {
3879 xfs_mount_t *mp; 3889 xfs_mount_t *mp;
3880 xfs_agf_t *agfp; 3890 xfs_agf_t *agfp;
3881 xfs_buf_t *agfbp; 3891 xfs_buf_t *agfbp;
3882 xfs_buf_t *agibp; 3892 xfs_buf_t *agibp;
3883 xfs_agnumber_t agno; 3893 xfs_agnumber_t agno;
3884 __uint64_t freeblks; 3894 __uint64_t freeblks;
3885 __uint64_t itotal; 3895 __uint64_t itotal;
3886 __uint64_t ifree; 3896 __uint64_t ifree;
3887 int error; 3897 int error;
3888 3898
3889 mp = log->l_mp; 3899 mp = log->l_mp;
3890 3900
3891 freeblks = 0LL; 3901 freeblks = 0LL;
3892 itotal = 0LL; 3902 itotal = 0LL;
3893 ifree = 0LL; 3903 ifree = 0LL;
3894 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 3904 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
3895 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp); 3905 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
3896 if (error) { 3906 if (error) {
3897 xfs_alert(mp, "%s agf read failed agno %d error %d", 3907 xfs_alert(mp, "%s agf read failed agno %d error %d",
3898 __func__, agno, error); 3908 __func__, agno, error);
3899 } else { 3909 } else {
3900 agfp = XFS_BUF_TO_AGF(agfbp); 3910 agfp = XFS_BUF_TO_AGF(agfbp);
3901 freeblks += be32_to_cpu(agfp->agf_freeblks) + 3911 freeblks += be32_to_cpu(agfp->agf_freeblks) +
3902 be32_to_cpu(agfp->agf_flcount); 3912 be32_to_cpu(agfp->agf_flcount);
3903 xfs_buf_relse(agfbp); 3913 xfs_buf_relse(agfbp);
3904 } 3914 }
3905 3915
3906 error = xfs_read_agi(mp, NULL, agno, &agibp); 3916 error = xfs_read_agi(mp, NULL, agno, &agibp);
3907 if (error) { 3917 if (error) {
3908 xfs_alert(mp, "%s agi read failed agno %d error %d", 3918 xfs_alert(mp, "%s agi read failed agno %d error %d",
3909 __func__, agno, error); 3919 __func__, agno, error);
3910 } else { 3920 } else {
3911 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); 3921 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
3912 3922
3913 itotal += be32_to_cpu(agi->agi_count); 3923 itotal += be32_to_cpu(agi->agi_count);
3914 ifree += be32_to_cpu(agi->agi_freecount); 3924 ifree += be32_to_cpu(agi->agi_freecount);
3915 xfs_buf_relse(agibp); 3925 xfs_buf_relse(agibp);
3916 } 3926 }
3917 } 3927 }
3918 } 3928 }
3919 #endif /* DEBUG */ 3929 #endif /* DEBUG */
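As written, the debug helper above only accumulates freeblks, itotal and ifree from the per-AG headers. The consistency comparison the comment alludes to would look roughly like the following user-space sketch, where the structure and field names are assumptions made for illustration only.

#include <stdint.h>
#include <stdio.h>

struct sb_counters {
	uint64_t fdblocks;      /* free data blocks per superblock */
	uint64_t icount;        /* allocated inodes per superblock */
	uint64_t ifree;         /* free inodes per superblock */
};

/* Compare per-AG sums against the superblock counters; return 0 if consistent. */
static int check_summary(const struct sb_counters *sb,
			 uint64_t freeblks, uint64_t itotal, uint64_t ifree)
{
	int bad = 0;

	if (freeblks != sb->fdblocks) {
		printf("freeblks mismatch: AGs %llu vs sb %llu\n",
		       (unsigned long long)freeblks,
		       (unsigned long long)sb->fdblocks);
		bad = 1;
	}
	if (itotal != sb->icount || ifree != sb->ifree)
		bad = 1;
	return bad;
}

int main(void)
{
	struct sb_counters sb = { 1000, 64, 32 };

	return check_summary(&sb, 1000, 64, 32);   /* 0: consistent */
}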
3920 3930
1 /* 1 /*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved. 3 * All Rights Reserved.
4 * 4 *
5 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as 6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * This program is distributed in the hope that it would be useful, 9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18 #include "xfs.h" 18 #include "xfs.h"
19 #include "xfs_fs.h" 19 #include "xfs_fs.h"
20 #include "xfs_bit.h" 20 #include "xfs_bit.h"
21 #include "xfs_log.h" 21 #include "xfs_log.h"
22 #include "xfs_trans.h" 22 #include "xfs_trans.h"
23 #include "xfs_sb.h" 23 #include "xfs_sb.h"
24 #include "xfs_ag.h" 24 #include "xfs_ag.h"
25 #include "xfs_alloc.h" 25 #include "xfs_alloc.h"
26 #include "xfs_quota.h" 26 #include "xfs_quota.h"
27 #include "xfs_mount.h" 27 #include "xfs_mount.h"
28 #include "xfs_bmap_btree.h" 28 #include "xfs_bmap_btree.h"
29 #include "xfs_ialloc_btree.h" 29 #include "xfs_ialloc_btree.h"
30 #include "xfs_dinode.h" 30 #include "xfs_dinode.h"
31 #include "xfs_inode.h" 31 #include "xfs_inode.h"
32 #include "xfs_ialloc.h" 32 #include "xfs_ialloc.h"
33 #include "xfs_itable.h" 33 #include "xfs_itable.h"
34 #include "xfs_rtalloc.h" 34 #include "xfs_rtalloc.h"
35 #include "xfs_error.h" 35 #include "xfs_error.h"
36 #include "xfs_bmap.h" 36 #include "xfs_bmap.h"
37 #include "xfs_attr.h" 37 #include "xfs_attr.h"
38 #include "xfs_buf_item.h" 38 #include "xfs_buf_item.h"
39 #include "xfs_trans_space.h" 39 #include "xfs_trans_space.h"
40 #include "xfs_utils.h" 40 #include "xfs_utils.h"
41 #include "xfs_qm.h" 41 #include "xfs_qm.h"
42 #include "xfs_trace.h" 42 #include "xfs_trace.h"
43 #include "xfs_icache.h" 43 #include "xfs_icache.h"
44 44
45 /* 45 /*
46 * The global quota manager. There is only one of these for the entire 46 * The global quota manager. There is only one of these for the entire
47 * system, _not_ one per file system. XQM keeps track of the overall 47 * system, _not_ one per file system. XQM keeps track of the overall
48 * quota functionality, including maintaining the freelist and hash 48 * quota functionality, including maintaining the freelist and hash
49 * tables of dquots. 49 * tables of dquots.
50 */ 50 */
51 STATIC int xfs_qm_init_quotainos(xfs_mount_t *); 51 STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
52 STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); 52 STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
53 STATIC int xfs_qm_shake(struct shrinker *, struct shrink_control *); 53 STATIC int xfs_qm_shake(struct shrinker *, struct shrink_control *);
54 54
55 /* 55 /*
56 * We use the batch lookup interface to iterate over the dquots as it 56 * We use the batch lookup interface to iterate over the dquots as it
57 * currently is the only interface into the radix tree code that allows 57 * currently is the only interface into the radix tree code that allows
58 * fuzzy lookups instead of exact matches. Holding the lock over multiple 58 * fuzzy lookups instead of exact matches. Holding the lock over multiple
59 * operations is fine as all callers are used either during mount/umount 59 * operations is fine as all callers are used either during mount/umount
60 * or quotaoff. 60 * or quotaoff.
61 */ 61 */
62 #define XFS_DQ_LOOKUP_BATCH 32 62 #define XFS_DQ_LOOKUP_BATCH 32
63 63
64 STATIC int 64 STATIC int
65 xfs_qm_dquot_walk( 65 xfs_qm_dquot_walk(
66 struct xfs_mount *mp, 66 struct xfs_mount *mp,
67 int type, 67 int type,
68 int (*execute)(struct xfs_dquot *dqp, void *data), 68 int (*execute)(struct xfs_dquot *dqp, void *data),
69 void *data) 69 void *data)
70 { 70 {
71 struct xfs_quotainfo *qi = mp->m_quotainfo; 71 struct xfs_quotainfo *qi = mp->m_quotainfo;
72 struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type); 72 struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type);
73 uint32_t next_index; 73 uint32_t next_index;
74 int last_error = 0; 74 int last_error = 0;
75 int skipped; 75 int skipped;
76 int nr_found; 76 int nr_found;
77 77
78 restart: 78 restart:
79 skipped = 0; 79 skipped = 0;
80 next_index = 0; 80 next_index = 0;
81 nr_found = 0; 81 nr_found = 0;
82 82
83 while (1) { 83 while (1) {
84 struct xfs_dquot *batch[XFS_DQ_LOOKUP_BATCH]; 84 struct xfs_dquot *batch[XFS_DQ_LOOKUP_BATCH];
85 int error = 0; 85 int error = 0;
86 int i; 86 int i;
87 87
88 mutex_lock(&qi->qi_tree_lock); 88 mutex_lock(&qi->qi_tree_lock);
89 nr_found = radix_tree_gang_lookup(tree, (void **)batch, 89 nr_found = radix_tree_gang_lookup(tree, (void **)batch,
90 next_index, XFS_DQ_LOOKUP_BATCH); 90 next_index, XFS_DQ_LOOKUP_BATCH);
91 if (!nr_found) { 91 if (!nr_found) {
92 mutex_unlock(&qi->qi_tree_lock); 92 mutex_unlock(&qi->qi_tree_lock);
93 break; 93 break;
94 } 94 }
95 95
96 for (i = 0; i < nr_found; i++) { 96 for (i = 0; i < nr_found; i++) {
97 struct xfs_dquot *dqp = batch[i]; 97 struct xfs_dquot *dqp = batch[i];
98 98
99 next_index = be32_to_cpu(dqp->q_core.d_id) + 1; 99 next_index = be32_to_cpu(dqp->q_core.d_id) + 1;
100 100
101 error = execute(batch[i], data); 101 error = execute(batch[i], data);
102 if (error == EAGAIN) { 102 if (error == EAGAIN) {
103 skipped++; 103 skipped++;
104 continue; 104 continue;
105 } 105 }
106 if (error && last_error != EFSCORRUPTED) 106 if (error && last_error != EFSCORRUPTED)
107 last_error = error; 107 last_error = error;
108 } 108 }
109 109
110 mutex_unlock(&qi->qi_tree_lock); 110 mutex_unlock(&qi->qi_tree_lock);
111 111
112 /* bail out if the filesystem is corrupted. */ 112 /* bail out if the filesystem is corrupted. */
113 if (last_error == EFSCORRUPTED) { 113 if (last_error == EFSCORRUPTED) {
114 skipped = 0; 114 skipped = 0;
115 break; 115 break;
116 } 116 }
117 } 117 }
118 118
119 if (skipped) { 119 if (skipped) {
120 delay(1); 120 delay(1);
121 goto restart; 121 goto restart;
122 } 122 }
123 123
124 return last_error; 124 return last_error;
125 } 125 }
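For reference, a callback passed as the execute argument to xfs_qm_dquot_walk() has the shape shown below. This counting callback is hypothetical (nothing like it exists in this patch); it only illustrates the calling convention: return 0 to keep walking, or EAGAIN to have the dquot retried after the walk restarts.

/*
 * Hypothetical example only -- not part of this patch.  Counts the dquots
 * of the walked type into a caller-supplied integer.
 */
static int
xfs_qm_dqcount_one(
	struct xfs_dquot	*dqp,
	void			*data)
{
	int			*count = data;

	(*count)++;		/* dqp stays valid for the duration of the call */
	return 0;		/* 0 = keep walking, EAGAIN = retry later */
}

/*
 * Hypothetical usage:
 *	int ndquots = 0;
 *	xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqcount_one, &ndquots);
 */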
126 126
127 127
128 /* 128 /*
129 * Purge a dquot from all tracking data structures and free it. 129 * Purge a dquot from all tracking data structures and free it.
130 */ 130 */
131 STATIC int 131 STATIC int
132 xfs_qm_dqpurge( 132 xfs_qm_dqpurge(
133 struct xfs_dquot *dqp, 133 struct xfs_dquot *dqp,
134 void *data) 134 void *data)
135 { 135 {
136 struct xfs_mount *mp = dqp->q_mount; 136 struct xfs_mount *mp = dqp->q_mount;
137 struct xfs_quotainfo *qi = mp->m_quotainfo; 137 struct xfs_quotainfo *qi = mp->m_quotainfo;
138 struct xfs_dquot *gdqp = NULL; 138 struct xfs_dquot *gdqp = NULL;
139 139
140 xfs_dqlock(dqp); 140 xfs_dqlock(dqp);
141 if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) { 141 if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
142 xfs_dqunlock(dqp); 142 xfs_dqunlock(dqp);
143 return EAGAIN; 143 return EAGAIN;
144 } 144 }
145 145
146 /* 146 /*
147 * If this quota has a group hint attached, prepare for releasing it 147 * If this quota has a group hint attached, prepare for releasing it
148 * now. 148 * now.
149 */ 149 */
150 gdqp = dqp->q_gdquot; 150 gdqp = dqp->q_gdquot;
151 if (gdqp) { 151 if (gdqp) {
152 xfs_dqlock(gdqp); 152 xfs_dqlock(gdqp);
153 dqp->q_gdquot = NULL; 153 dqp->q_gdquot = NULL;
154 } 154 }
155 155
156 dqp->dq_flags |= XFS_DQ_FREEING; 156 dqp->dq_flags |= XFS_DQ_FREEING;
157 157
158 xfs_dqflock(dqp); 158 xfs_dqflock(dqp);
159 159
160 /* 160 /*
161 * If we are turning this type of quotas off, we don't care 161 * If we are turning this type of quotas off, we don't care
162 * about the dirty metadata sitting in this dquot. OTOH, if 162 * about the dirty metadata sitting in this dquot. OTOH, if
163 * we're unmounting, we do care, so we flush it and wait. 163 * we're unmounting, we do care, so we flush it and wait.
164 */ 164 */
165 if (XFS_DQ_IS_DIRTY(dqp)) { 165 if (XFS_DQ_IS_DIRTY(dqp)) {
166 struct xfs_buf *bp = NULL; 166 struct xfs_buf *bp = NULL;
167 int error; 167 int error;
168 168
169 /* 169 /*
170 * We don't care about getting disk errors here. We need 170 * We don't care about getting disk errors here. We need
171 * to purge this dquot anyway, so we go ahead regardless. 171 * to purge this dquot anyway, so we go ahead regardless.
172 */ 172 */
173 error = xfs_qm_dqflush(dqp, &bp); 173 error = xfs_qm_dqflush(dqp, &bp);
174 if (error) { 174 if (error) {
175 xfs_warn(mp, "%s: dquot %p flush failed", 175 xfs_warn(mp, "%s: dquot %p flush failed",
176 __func__, dqp); 176 __func__, dqp);
177 } else { 177 } else {
178 error = xfs_bwrite(bp); 178 error = xfs_bwrite(bp);
179 xfs_buf_relse(bp); 179 xfs_buf_relse(bp);
180 } 180 }
181 xfs_dqflock(dqp); 181 xfs_dqflock(dqp);
182 } 182 }
183 183
184 ASSERT(atomic_read(&dqp->q_pincount) == 0); 184 ASSERT(atomic_read(&dqp->q_pincount) == 0);
185 ASSERT(XFS_FORCED_SHUTDOWN(mp) || 185 ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
186 !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); 186 !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
187 187
188 xfs_dqfunlock(dqp); 188 xfs_dqfunlock(dqp);
189 xfs_dqunlock(dqp); 189 xfs_dqunlock(dqp);
190 190
191 radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags), 191 radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags),
192 be32_to_cpu(dqp->q_core.d_id)); 192 be32_to_cpu(dqp->q_core.d_id));
193 qi->qi_dquots--; 193 qi->qi_dquots--;
194 194
195 /* 195 /*
196 * We move dquots to the freelist as soon as their reference count 196 * We move dquots to the freelist as soon as their reference count
197 * hits zero, so it really should be on the freelist here. 197 * hits zero, so it really should be on the freelist here.
198 */ 198 */
199 mutex_lock(&qi->qi_lru_lock); 199 mutex_lock(&qi->qi_lru_lock);
200 ASSERT(!list_empty(&dqp->q_lru)); 200 ASSERT(!list_empty(&dqp->q_lru));
201 list_del_init(&dqp->q_lru); 201 list_del_init(&dqp->q_lru);
202 qi->qi_lru_count--; 202 qi->qi_lru_count--;
203 XFS_STATS_DEC(xs_qm_dquot_unused); 203 XFS_STATS_DEC(xs_qm_dquot_unused);
204 mutex_unlock(&qi->qi_lru_lock); 204 mutex_unlock(&qi->qi_lru_lock);
205 205
206 xfs_qm_dqdestroy(dqp); 206 xfs_qm_dqdestroy(dqp);
207 207
208 if (gdqp) 208 if (gdqp)
209 xfs_qm_dqput(gdqp); 209 xfs_qm_dqput(gdqp);
210 return 0; 210 return 0;
211 } 211 }
212 212
213 /* 213 /*
214 * Purge the dquot cache. 214 * Purge the dquot cache.
215 */ 215 */
216 void 216 void
217 xfs_qm_dqpurge_all( 217 xfs_qm_dqpurge_all(
218 struct xfs_mount *mp, 218 struct xfs_mount *mp,
219 uint flags) 219 uint flags)
220 { 220 {
221 if (flags & XFS_QMOPT_UQUOTA) 221 if (flags & XFS_QMOPT_UQUOTA)
222 xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL); 222 xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL);
223 if (flags & XFS_QMOPT_GQUOTA) 223 if (flags & XFS_QMOPT_GQUOTA)
224 xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL); 224 xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL);
225 if (flags & XFS_QMOPT_PQUOTA) 225 if (flags & XFS_QMOPT_PQUOTA)
226 xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge, NULL); 226 xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge, NULL);
227 } 227 }
228 228
229 /* 229 /*
230 * Just destroy the quotainfo structure. 230 * Just destroy the quotainfo structure.
231 */ 231 */
232 void 232 void
233 xfs_qm_unmount( 233 xfs_qm_unmount(
234 struct xfs_mount *mp) 234 struct xfs_mount *mp)
235 { 235 {
236 if (mp->m_quotainfo) { 236 if (mp->m_quotainfo) {
237 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); 237 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
238 xfs_qm_destroy_quotainfo(mp); 238 xfs_qm_destroy_quotainfo(mp);
239 } 239 }
240 } 240 }
241 241
242 242
243 /* 243 /*
244 * This is called from xfs_mountfs to start quotas and initialize all 244 * This is called from xfs_mountfs to start quotas and initialize all
245 * necessary data structures like quotainfo. This is also responsible for 245 * necessary data structures like quotainfo. This is also responsible for
246 * running a quotacheck as necessary. We are guaranteed that the superblock 246 * running a quotacheck as necessary. We are guaranteed that the superblock
247 * is consistently read in at this point. 247 * is consistently read in at this point.
248 * 248 *
249 * If we fail here, the mount will continue with quota turned off. We don't 249 * If we fail here, the mount will continue with quota turned off. We don't
250 * need to indicate success or failure at all. 250 * need to indicate success or failure at all.
251 */ 251 */
252 void 252 void
253 xfs_qm_mount_quotas( 253 xfs_qm_mount_quotas(
254 xfs_mount_t *mp) 254 xfs_mount_t *mp)
255 { 255 {
256 int error = 0; 256 int error = 0;
257 uint sbf; 257 uint sbf;
258 258
259 /* 259 /*
260 * If quotas on realtime volumes is not supported, we disable 260 * If quotas on realtime volumes is not supported, we disable
261 * quotas immediately. 261 * quotas immediately.
262 */ 262 */
263 if (mp->m_sb.sb_rextents) { 263 if (mp->m_sb.sb_rextents) {
264 xfs_notice(mp, "Cannot turn on quotas for realtime filesystem"); 264 xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
265 mp->m_qflags = 0; 265 mp->m_qflags = 0;
266 goto write_changes; 266 goto write_changes;
267 } 267 }
268 268
269 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 269 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
270 270
271 /* 271 /*
272 * Allocate the quotainfo structure inside the mount struct, and 272 * Allocate the quotainfo structure inside the mount struct, and
273 * create quotainode(s), and change/rev superblock if necessary. 273 * create quotainode(s), and change/rev superblock if necessary.
274 */ 274 */
275 error = xfs_qm_init_quotainfo(mp); 275 error = xfs_qm_init_quotainfo(mp);
276 if (error) { 276 if (error) {
277 /* 277 /*
278 * We must turn off quotas. 278 * We must turn off quotas.
279 */ 279 */
280 ASSERT(mp->m_quotainfo == NULL); 280 ASSERT(mp->m_quotainfo == NULL);
281 mp->m_qflags = 0; 281 mp->m_qflags = 0;
282 goto write_changes; 282 goto write_changes;
283 } 283 }
284 /* 284 /*
285 * If any of the quotas are not consistent, do a quotacheck. 285 * If any of the quotas are not consistent, do a quotacheck.
286 */ 286 */
287 if (XFS_QM_NEED_QUOTACHECK(mp)) { 287 if (XFS_QM_NEED_QUOTACHECK(mp)) {
288 error = xfs_qm_quotacheck(mp); 288 error = xfs_qm_quotacheck(mp);
289 if (error) { 289 if (error) {
290 /* Quotacheck failed and disabled quotas. */ 290 /* Quotacheck failed and disabled quotas. */
291 return; 291 return;
292 } 292 }
293 } 293 }
294 /* 294 /*
295 * If one type of quotas is off, then it will lose its 295 * If one type of quotas is off, then it will lose its
296 * quotachecked status, since we won't be doing accounting for 296 * quotachecked status, since we won't be doing accounting for
297 * that type anymore. 297 * that type anymore.
298 */ 298 */
299 if (!XFS_IS_UQUOTA_ON(mp)) 299 if (!XFS_IS_UQUOTA_ON(mp))
300 mp->m_qflags &= ~XFS_UQUOTA_CHKD; 300 mp->m_qflags &= ~XFS_UQUOTA_CHKD;
301 if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp))) 301 if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp)))
302 mp->m_qflags &= ~XFS_OQUOTA_CHKD; 302 mp->m_qflags &= ~XFS_OQUOTA_CHKD;
303 303
304 write_changes: 304 write_changes:
305 /* 305 /*
306 * We actually don't have to acquire the m_sb_lock at all. 306 * We actually don't have to acquire the m_sb_lock at all.
307 * This can only be called from mount, and that's single threaded. XXX 307 * This can only be called from mount, and that's single threaded. XXX
308 */ 308 */
309 spin_lock(&mp->m_sb_lock); 309 spin_lock(&mp->m_sb_lock);
310 sbf = mp->m_sb.sb_qflags; 310 sbf = mp->m_sb.sb_qflags;
311 mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL; 311 mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL;
312 spin_unlock(&mp->m_sb_lock); 312 spin_unlock(&mp->m_sb_lock);
313 313
314 if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) { 314 if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) {
315 if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) { 315 if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) {
316 /* 316 /*
317 * We could only have been turning quotas off. 317 * We could only have been turning quotas off.
318 * We aren't in very good shape actually because 318 * We aren't in very good shape actually because
319 * the incore structures are convinced that quotas are 319 * the incore structures are convinced that quotas are
320 * off, but the on disk superblock doesn't know that ! 320 * off, but the on disk superblock doesn't know that !
321 */ 321 */
322 ASSERT(!(XFS_IS_QUOTA_RUNNING(mp))); 322 ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
323 xfs_alert(mp, "%s: Superblock update failed!", 323 xfs_alert(mp, "%s: Superblock update failed!",
324 __func__); 324 __func__);
325 } 325 }
326 } 326 }
327 327
328 if (error) { 328 if (error) {
329 xfs_warn(mp, "Failed to initialize disk quotas."); 329 xfs_warn(mp, "Failed to initialize disk quotas.");
330 return; 330 return;
331 } 331 }
332 } 332 }
333 333
334 /* 334 /*
335 * Called from the vfsops layer. 335 * Called from the vfsops layer.
336 */ 336 */
337 void 337 void
338 xfs_qm_unmount_quotas( 338 xfs_qm_unmount_quotas(
339 xfs_mount_t *mp) 339 xfs_mount_t *mp)
340 { 340 {
341 /* 341 /*
342 * Release the dquots that root inode, et al might be holding, 342 * Release the dquots that root inode, et al might be holding,
343 * before we flush quotas and blow away the quotainfo structure. 343 * before we flush quotas and blow away the quotainfo structure.
344 */ 344 */
345 ASSERT(mp->m_rootip); 345 ASSERT(mp->m_rootip);
346 xfs_qm_dqdetach(mp->m_rootip); 346 xfs_qm_dqdetach(mp->m_rootip);
347 if (mp->m_rbmip) 347 if (mp->m_rbmip)
348 xfs_qm_dqdetach(mp->m_rbmip); 348 xfs_qm_dqdetach(mp->m_rbmip);
349 if (mp->m_rsumip) 349 if (mp->m_rsumip)
350 xfs_qm_dqdetach(mp->m_rsumip); 350 xfs_qm_dqdetach(mp->m_rsumip);
351 351
352 /* 352 /*
353 * Release the quota inodes. 353 * Release the quota inodes.
354 */ 354 */
355 if (mp->m_quotainfo) { 355 if (mp->m_quotainfo) {
356 if (mp->m_quotainfo->qi_uquotaip) { 356 if (mp->m_quotainfo->qi_uquotaip) {
357 IRELE(mp->m_quotainfo->qi_uquotaip); 357 IRELE(mp->m_quotainfo->qi_uquotaip);
358 mp->m_quotainfo->qi_uquotaip = NULL; 358 mp->m_quotainfo->qi_uquotaip = NULL;
359 } 359 }
360 if (mp->m_quotainfo->qi_gquotaip) { 360 if (mp->m_quotainfo->qi_gquotaip) {
361 IRELE(mp->m_quotainfo->qi_gquotaip); 361 IRELE(mp->m_quotainfo->qi_gquotaip);
362 mp->m_quotainfo->qi_gquotaip = NULL; 362 mp->m_quotainfo->qi_gquotaip = NULL;
363 } 363 }
364 } 364 }
365 } 365 }
366 366
367 STATIC int 367 STATIC int
368 xfs_qm_dqattach_one( 368 xfs_qm_dqattach_one(
369 xfs_inode_t *ip, 369 xfs_inode_t *ip,
370 xfs_dqid_t id, 370 xfs_dqid_t id,
371 uint type, 371 uint type,
372 uint doalloc, 372 uint doalloc,
373 xfs_dquot_t *udqhint, /* hint */ 373 xfs_dquot_t *udqhint, /* hint */
374 xfs_dquot_t **IO_idqpp) 374 xfs_dquot_t **IO_idqpp)
375 { 375 {
376 xfs_dquot_t *dqp; 376 xfs_dquot_t *dqp;
377 int error; 377 int error;
378 378
379 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 379 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
380 error = 0; 380 error = 0;
381 381
382 /* 382 /*
383 * See if we already have it in the inode itself. IO_idqpp is 383 * See if we already have it in the inode itself. IO_idqpp is
384 * &i_udquot or &i_gdquot. This made the code look weird, but 384 * &i_udquot or &i_gdquot. This made the code look weird, but
385 * made the logic a lot simpler. 385 * made the logic a lot simpler.
386 */ 386 */
387 dqp = *IO_idqpp; 387 dqp = *IO_idqpp;
388 if (dqp) { 388 if (dqp) {
389 trace_xfs_dqattach_found(dqp); 389 trace_xfs_dqattach_found(dqp);
390 return 0; 390 return 0;
391 } 391 }
392 392
393 /* 393 /*
394 * udqhint is the i_udquot field in inode, and is non-NULL only 394 * udqhint is the i_udquot field in inode, and is non-NULL only
395 * when the type arg is group/project. Its purpose is to save a 395 * when the type arg is group/project. Its purpose is to save a
396 * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside 396 * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside
397 * the user dquot. 397 * the user dquot.
398 */ 398 */
399 if (udqhint) { 399 if (udqhint) {
400 ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ); 400 ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ);
401 xfs_dqlock(udqhint); 401 xfs_dqlock(udqhint);
402 402
403 /* 403 /*
404 * No need to take dqlock to look at the id. 404 * No need to take dqlock to look at the id.
405 * 405 *
406 * The ID can't change until it gets reclaimed, and it won't 406 * The ID can't change until it gets reclaimed, and it won't
407 * be reclaimed as long as we have a ref from inode and we 407 * be reclaimed as long as we have a ref from inode and we
408 * hold the ilock. 408 * hold the ilock.
409 */ 409 */
410 dqp = udqhint->q_gdquot; 410 dqp = udqhint->q_gdquot;
411 if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) { 411 if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) {
412 ASSERT(*IO_idqpp == NULL); 412 ASSERT(*IO_idqpp == NULL);
413 413
414 *IO_idqpp = xfs_qm_dqhold(dqp); 414 *IO_idqpp = xfs_qm_dqhold(dqp);
415 xfs_dqunlock(udqhint); 415 xfs_dqunlock(udqhint);
416 return 0; 416 return 0;
417 } 417 }
418 418
419 /* 419 /*
420 * We can't hold a dquot lock when we call the dqget code. 420 * We can't hold a dquot lock when we call the dqget code.
421 * We'll deadlock in no time, because of (not conforming to) 421 * We'll deadlock in no time, because of (not conforming to)
422 * lock ordering - the inodelock comes before any dquot lock, 422 * lock ordering - the inodelock comes before any dquot lock,
423 * and we may drop and reacquire the ilock in xfs_qm_dqget(). 423 * and we may drop and reacquire the ilock in xfs_qm_dqget().
424 */ 424 */
425 xfs_dqunlock(udqhint); 425 xfs_dqunlock(udqhint);
426 } 426 }
427 427
428 /* 428 /*
429 * Find the dquot from somewhere. This bumps the 429 * Find the dquot from somewhere. This bumps the
430 * reference count of dquot and returns it locked. 430 * reference count of dquot and returns it locked.
431 * This can return ENOENT if dquot didn't exist on 431 * This can return ENOENT if dquot didn't exist on
432 * disk and we didn't ask it to allocate; 432 * disk and we didn't ask it to allocate;
433 * ESRCH if quotas got turned off suddenly. 433 * ESRCH if quotas got turned off suddenly.
434 */ 434 */
435 error = xfs_qm_dqget(ip->i_mount, ip, id, type, 435 error = xfs_qm_dqget(ip->i_mount, ip, id, type,
436 doalloc | XFS_QMOPT_DOWARN, &dqp); 436 doalloc | XFS_QMOPT_DOWARN, &dqp);
437 if (error) 437 if (error)
438 return error; 438 return error;
439 439
440 trace_xfs_dqattach_get(dqp); 440 trace_xfs_dqattach_get(dqp);
441 441
442 /* 442 /*
443 * dqget may have dropped and re-acquired the ilock, but it guarantees 443 * dqget may have dropped and re-acquired the ilock, but it guarantees
444 * that the dquot returned is the one that should go in the inode. 444 * that the dquot returned is the one that should go in the inode.
445 */ 445 */
446 *IO_idqpp = dqp; 446 *IO_idqpp = dqp;
447 xfs_dqunlock(dqp); 447 xfs_dqunlock(dqp);
448 return 0; 448 return 0;
449 } 449 }
450 450
451 451
452 /* 452 /*
453 * Given a udquot and gdquot, attach a ptr to the group dquot in the 453 * Given a udquot and gdquot, attach a ptr to the group dquot in the
454 * udquot as a hint for future lookups. 454 * udquot as a hint for future lookups.
455 */ 455 */
456 STATIC void 456 STATIC void
457 xfs_qm_dqattach_grouphint( 457 xfs_qm_dqattach_grouphint(
458 xfs_dquot_t *udq, 458 xfs_dquot_t *udq,
459 xfs_dquot_t *gdq) 459 xfs_dquot_t *gdq)
460 { 460 {
461 xfs_dquot_t *tmp; 461 xfs_dquot_t *tmp;
462 462
463 xfs_dqlock(udq); 463 xfs_dqlock(udq);
464 464
465 tmp = udq->q_gdquot; 465 tmp = udq->q_gdquot;
466 if (tmp) { 466 if (tmp) {
467 if (tmp == gdq) 467 if (tmp == gdq)
468 goto done; 468 goto done;
469 469
470 udq->q_gdquot = NULL; 470 udq->q_gdquot = NULL;
471 xfs_qm_dqrele(tmp); 471 xfs_qm_dqrele(tmp);
472 } 472 }
473 473
474 udq->q_gdquot = xfs_qm_dqhold(gdq); 474 udq->q_gdquot = xfs_qm_dqhold(gdq);
475 done: 475 done:
476 xfs_dqunlock(udq); 476 xfs_dqunlock(udq);
477 } 477 }
478 478
479 static bool 479 static bool
480 xfs_qm_need_dqattach( 480 xfs_qm_need_dqattach(
481 struct xfs_inode *ip) 481 struct xfs_inode *ip)
482 { 482 {
483 struct xfs_mount *mp = ip->i_mount; 483 struct xfs_mount *mp = ip->i_mount;
484 484
485 if (!XFS_IS_QUOTA_RUNNING(mp)) 485 if (!XFS_IS_QUOTA_RUNNING(mp))
486 return false; 486 return false;
487 if (!XFS_IS_QUOTA_ON(mp)) 487 if (!XFS_IS_QUOTA_ON(mp))
488 return false; 488 return false;
489 if (!XFS_NOT_DQATTACHED(mp, ip)) 489 if (!XFS_NOT_DQATTACHED(mp, ip))
490 return false; 490 return false;
491 if (ip->i_ino == mp->m_sb.sb_uquotino || 491 if (ip->i_ino == mp->m_sb.sb_uquotino ||
492 ip->i_ino == mp->m_sb.sb_gquotino) 492 ip->i_ino == mp->m_sb.sb_gquotino)
493 return false; 493 return false;
494 return true; 494 return true;
495 } 495 }
496 496
497 /* 497 /*
498 * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON 498 * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON
499 * into account. 499 * into account.
500 * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed. 500 * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed.
501 * Inode may get unlocked and relocked in here, and the caller must deal with 501 * Inode may get unlocked and relocked in here, and the caller must deal with
502 * the consequences. 502 * the consequences.
503 */ 503 */
504 int 504 int
505 xfs_qm_dqattach_locked( 505 xfs_qm_dqattach_locked(
506 xfs_inode_t *ip, 506 xfs_inode_t *ip,
507 uint flags) 507 uint flags)
508 { 508 {
509 xfs_mount_t *mp = ip->i_mount; 509 xfs_mount_t *mp = ip->i_mount;
510 uint nquotas = 0; 510 uint nquotas = 0;
511 int error = 0; 511 int error = 0;
512 512
513 if (!xfs_qm_need_dqattach(ip)) 513 if (!xfs_qm_need_dqattach(ip))
514 return 0; 514 return 0;
515 515
516 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 516 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
517 517
518 if (XFS_IS_UQUOTA_ON(mp)) { 518 if (XFS_IS_UQUOTA_ON(mp)) {
519 error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER, 519 error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
520 flags & XFS_QMOPT_DQALLOC, 520 flags & XFS_QMOPT_DQALLOC,
521 NULL, &ip->i_udquot); 521 NULL, &ip->i_udquot);
522 if (error) 522 if (error)
523 goto done; 523 goto done;
524 nquotas++; 524 nquotas++;
525 } 525 }
526 526
527 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 527 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
528 if (XFS_IS_OQUOTA_ON(mp)) { 528 if (XFS_IS_OQUOTA_ON(mp)) {
529 error = XFS_IS_GQUOTA_ON(mp) ? 529 error = XFS_IS_GQUOTA_ON(mp) ?
530 xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, 530 xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
531 flags & XFS_QMOPT_DQALLOC, 531 flags & XFS_QMOPT_DQALLOC,
532 ip->i_udquot, &ip->i_gdquot) : 532 ip->i_udquot, &ip->i_gdquot) :
533 xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ, 533 xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
534 flags & XFS_QMOPT_DQALLOC, 534 flags & XFS_QMOPT_DQALLOC,
535 ip->i_udquot, &ip->i_gdquot); 535 ip->i_udquot, &ip->i_gdquot);
536 /* 536 /*
537 * Don't worry about the udquot that we may have 537 * Don't worry about the udquot that we may have
538 * attached above. It'll get detached, if not already. 538 * attached above. It'll get detached, if not already.
539 */ 539 */
540 if (error) 540 if (error)
541 goto done; 541 goto done;
542 nquotas++; 542 nquotas++;
543 } 543 }
544 544
545 /* 545 /*
546 * Attach this group quota to the user quota as a hint. 546 * Attach this group quota to the user quota as a hint.
547 * This WON'T, in general, result in a thrash. 547 * This WON'T, in general, result in a thrash.
548 */ 548 */
549 if (nquotas == 2) { 549 if (nquotas == 2) {
550 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 550 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
551 ASSERT(ip->i_udquot); 551 ASSERT(ip->i_udquot);
552 ASSERT(ip->i_gdquot); 552 ASSERT(ip->i_gdquot);
553 553
554 /* 554 /*
555 * We do not have i_udquot locked at this point, but this check 555 * We do not have i_udquot locked at this point, but this check
556 * is OK since we don't depend on the i_gdquot to be accurate 556 * is OK since we don't depend on the i_gdquot to be accurate
557 * 100% all the time. It is just a hint, and this will 557 * 100% all the time. It is just a hint, and this will
558 * succeed in general. 558 * succeed in general.
559 */ 559 */
560 if (ip->i_udquot->q_gdquot != ip->i_gdquot) 560 if (ip->i_udquot->q_gdquot != ip->i_gdquot)
561 xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot); 561 xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot);
562 } 562 }
563 563
564 done: 564 done:
565 #ifdef DEBUG 565 #ifdef DEBUG
566 if (!error) { 566 if (!error) {
567 if (XFS_IS_UQUOTA_ON(mp)) 567 if (XFS_IS_UQUOTA_ON(mp))
568 ASSERT(ip->i_udquot); 568 ASSERT(ip->i_udquot);
569 if (XFS_IS_OQUOTA_ON(mp)) 569 if (XFS_IS_OQUOTA_ON(mp))
570 ASSERT(ip->i_gdquot); 570 ASSERT(ip->i_gdquot);
571 } 571 }
572 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 572 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
573 #endif 573 #endif
574 return error; 574 return error;
575 } 575 }
576 576
577 int 577 int
578 xfs_qm_dqattach( 578 xfs_qm_dqattach(
579 struct xfs_inode *ip, 579 struct xfs_inode *ip,
580 uint flags) 580 uint flags)
581 { 581 {
582 int error; 582 int error;
583 583
584 if (!xfs_qm_need_dqattach(ip)) 584 if (!xfs_qm_need_dqattach(ip))
585 return 0; 585 return 0;
586 586
587 xfs_ilock(ip, XFS_ILOCK_EXCL); 587 xfs_ilock(ip, XFS_ILOCK_EXCL);
588 error = xfs_qm_dqattach_locked(ip, flags); 588 error = xfs_qm_dqattach_locked(ip, flags);
589 xfs_iunlock(ip, XFS_ILOCK_EXCL); 589 xfs_iunlock(ip, XFS_ILOCK_EXCL);
590 590
591 return error; 591 return error;
592 } 592 }
593 593
594 /* 594 /*
595 * Release dquots (and their references) if any. 595 * Release dquots (and their references) if any.
596 * The inode should be locked EXCL except when this is called by 596 * The inode should be locked EXCL except when this is called by
597 * xfs_ireclaim. 597 * xfs_ireclaim.
598 */ 598 */
599 void 599 void
600 xfs_qm_dqdetach( 600 xfs_qm_dqdetach(
601 xfs_inode_t *ip) 601 xfs_inode_t *ip)
602 { 602 {
603 if (!(ip->i_udquot || ip->i_gdquot)) 603 if (!(ip->i_udquot || ip->i_gdquot))
604 return; 604 return;
605 605
606 trace_xfs_dquot_dqdetach(ip); 606 trace_xfs_dquot_dqdetach(ip);
607 607
608 ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino); 608 ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino);
609 ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino); 609 ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino);
610 if (ip->i_udquot) { 610 if (ip->i_udquot) {
611 xfs_qm_dqrele(ip->i_udquot); 611 xfs_qm_dqrele(ip->i_udquot);
612 ip->i_udquot = NULL; 612 ip->i_udquot = NULL;
613 } 613 }
614 if (ip->i_gdquot) { 614 if (ip->i_gdquot) {
615 xfs_qm_dqrele(ip->i_gdquot); 615 xfs_qm_dqrele(ip->i_gdquot);
616 ip->i_gdquot = NULL; 616 ip->i_gdquot = NULL;
617 } 617 }
618 } 618 }
619 619
620 int
621 xfs_qm_calc_dquots_per_chunk(
622 struct xfs_mount *mp,
623 unsigned int nbblks) /* basic block units */
624 {
625 unsigned int ndquots;
626
627 ASSERT(nbblks > 0);
628 ndquots = BBTOB(nbblks);
629 do_div(ndquots, sizeof(xfs_dqblk_t));
630
631 return ndquots;
632 }
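The new helper simply converts the chunk size from basic blocks to bytes and divides by the on-disk dquot record size. A worked example as a user-space sketch, assuming a 4096-byte dquot cluster (8 basic blocks) and a 136-byte xfs_dqblk_t; both sizes are assumptions for illustration.

#include <stdio.h>

#define BBSIZE		512			/* basic block size */
#define BBTOB(bbs)	((bbs) * BBSIZE)	/* basic blocks to bytes */

int main(void)
{
	unsigned int nbblks = 8;	/* assumed 4096-byte dquot cluster */
	unsigned int dqblk_size = 136;	/* assumed sizeof(xfs_dqblk_t) */
	unsigned int ndquots = BBTOB(nbblks) / dqblk_size;

	/* 4096 / 136 = 30 dquots per chunk; the remainder is unused space */
	printf("%u dquots per chunk\n", ndquots);
	return 0;
}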
633
620 /* 634 /*
621 * This initializes all the quota information that's kept in the 635 * This initializes all the quota information that's kept in the
622 * mount structure 636 * mount structure
623 */ 637 */
624 STATIC int 638 STATIC int
625 xfs_qm_init_quotainfo( 639 xfs_qm_init_quotainfo(
626 xfs_mount_t *mp) 640 xfs_mount_t *mp)
627 { 641 {
628 xfs_quotainfo_t *qinf; 642 xfs_quotainfo_t *qinf;
629 int error; 643 int error;
630 xfs_dquot_t *dqp; 644 xfs_dquot_t *dqp;
631 645
632 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 646 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
633 647
634 qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); 648 qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
635 649
636 /* 650 /*
637 * See if quotainodes are setup, and if not, allocate them, 651 * See if quotainodes are setup, and if not, allocate them,
638 * and change the superblock accordingly. 652 * and change the superblock accordingly.
639 */ 653 */
640 if ((error = xfs_qm_init_quotainos(mp))) { 654 if ((error = xfs_qm_init_quotainos(mp))) {
641 kmem_free(qinf); 655 kmem_free(qinf);
642 mp->m_quotainfo = NULL; 656 mp->m_quotainfo = NULL;
643 return error; 657 return error;
644 } 658 }
645 659
646 INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS); 660 INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS);
647 INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS); 661 INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS);
648 mutex_init(&qinf->qi_tree_lock); 662 mutex_init(&qinf->qi_tree_lock);
649 663
650 INIT_LIST_HEAD(&qinf->qi_lru_list); 664 INIT_LIST_HEAD(&qinf->qi_lru_list);
651 qinf->qi_lru_count = 0; 665 qinf->qi_lru_count = 0;
652 mutex_init(&qinf->qi_lru_lock); 666 mutex_init(&qinf->qi_lru_lock);
653 667
654 /* mutex used to serialize quotaoffs */ 668 /* mutex used to serialize quotaoffs */
655 mutex_init(&qinf->qi_quotaofflock); 669 mutex_init(&qinf->qi_quotaofflock);
656 670
657 /* Precalc some constants */ 671 /* Precalc some constants */
658 qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); 672 qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
659 ASSERT(qinf->qi_dqchunklen); 673 qinf->qi_dqperchunk = xfs_qm_calc_dquots_per_chunk(mp,
660 qinf->qi_dqperchunk = BBTOB(qinf->qi_dqchunklen); 674 qinf->qi_dqchunklen);
661 do_div(qinf->qi_dqperchunk, sizeof(xfs_dqblk_t));
662 675
663 mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD); 676 mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD);
664 677
665 /* 678 /*
666 * We try to get the limits from the superuser's limits fields. 679 * We try to get the limits from the superuser's limits fields.
667 * This is quite hacky, but it is standard quota practice. 680 * This is quite hacky, but it is standard quota practice.
668 * 681 *
669 * We look at the USR dquot with id == 0 first, but if user quotas 682 * We look at the USR dquot with id == 0 first, but if user quotas
670 * are not enabled we go to the GRP dquot with id == 0. 683 * are not enabled we go to the GRP dquot with id == 0.
671 * We don't really care to keep separate default limits for user 684 * We don't really care to keep separate default limits for user
672 * and group quotas, at least not at this point. 685 * and group quotas, at least not at this point.
673 * 686 *
674 * Since we may not have done a quotacheck by this point, just read 687 * Since we may not have done a quotacheck by this point, just read
675 * the dquot without attaching it to any hashtables or lists. 688 * the dquot without attaching it to any hashtables or lists.
676 */ 689 */
677 error = xfs_qm_dqread(mp, 0, 690 error = xfs_qm_dqread(mp, 0,
678 XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER : 691 XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER :
679 (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP : 692 (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP :
680 XFS_DQ_PROJ), 693 XFS_DQ_PROJ),
681 XFS_QMOPT_DOWARN, &dqp); 694 XFS_QMOPT_DOWARN, &dqp);
682 if (!error) { 695 if (!error) {
683 xfs_disk_dquot_t *ddqp = &dqp->q_core; 696 xfs_disk_dquot_t *ddqp = &dqp->q_core;
684 697
685 /* 698 /*
686 * The warnings and timers set the grace period given to 699 * The warnings and timers set the grace period given to
687 * a user or group before he or she can not perform any 700 * a user or group before he or she can not perform any
688 * more writing. If it is zero, a default is used. 701 * more writing. If it is zero, a default is used.
689 */ 702 */
690 qinf->qi_btimelimit = ddqp->d_btimer ? 703 qinf->qi_btimelimit = ddqp->d_btimer ?
691 be32_to_cpu(ddqp->d_btimer) : XFS_QM_BTIMELIMIT; 704 be32_to_cpu(ddqp->d_btimer) : XFS_QM_BTIMELIMIT;
692 qinf->qi_itimelimit = ddqp->d_itimer ? 705 qinf->qi_itimelimit = ddqp->d_itimer ?
693 be32_to_cpu(ddqp->d_itimer) : XFS_QM_ITIMELIMIT; 706 be32_to_cpu(ddqp->d_itimer) : XFS_QM_ITIMELIMIT;
694 qinf->qi_rtbtimelimit = ddqp->d_rtbtimer ? 707 qinf->qi_rtbtimelimit = ddqp->d_rtbtimer ?
695 be32_to_cpu(ddqp->d_rtbtimer) : XFS_QM_RTBTIMELIMIT; 708 be32_to_cpu(ddqp->d_rtbtimer) : XFS_QM_RTBTIMELIMIT;
696 qinf->qi_bwarnlimit = ddqp->d_bwarns ? 709 qinf->qi_bwarnlimit = ddqp->d_bwarns ?
697 be16_to_cpu(ddqp->d_bwarns) : XFS_QM_BWARNLIMIT; 710 be16_to_cpu(ddqp->d_bwarns) : XFS_QM_BWARNLIMIT;
698 qinf->qi_iwarnlimit = ddqp->d_iwarns ? 711 qinf->qi_iwarnlimit = ddqp->d_iwarns ?
699 be16_to_cpu(ddqp->d_iwarns) : XFS_QM_IWARNLIMIT; 712 be16_to_cpu(ddqp->d_iwarns) : XFS_QM_IWARNLIMIT;
700 qinf->qi_rtbwarnlimit = ddqp->d_rtbwarns ? 713 qinf->qi_rtbwarnlimit = ddqp->d_rtbwarns ?
701 be16_to_cpu(ddqp->d_rtbwarns) : XFS_QM_RTBWARNLIMIT; 714 be16_to_cpu(ddqp->d_rtbwarns) : XFS_QM_RTBWARNLIMIT;
702 qinf->qi_bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit); 715 qinf->qi_bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
703 qinf->qi_bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit); 716 qinf->qi_bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit);
704 qinf->qi_ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit); 717 qinf->qi_ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
705 qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); 718 qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
706 qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); 719 qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
707 qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); 720 qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
708 721
709 xfs_qm_dqdestroy(dqp); 722 xfs_qm_dqdestroy(dqp);
710 } else { 723 } else {
711 qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; 724 qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
712 qinf->qi_itimelimit = XFS_QM_ITIMELIMIT; 725 qinf->qi_itimelimit = XFS_QM_ITIMELIMIT;
713 qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT; 726 qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT;
714 qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT; 727 qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT;
715 qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT; 728 qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT;
716 qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; 729 qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
717 } 730 }
718 731
719 qinf->qi_shrinker.shrink = xfs_qm_shake; 732 qinf->qi_shrinker.shrink = xfs_qm_shake;
720 qinf->qi_shrinker.seeks = DEFAULT_SEEKS; 733 qinf->qi_shrinker.seeks = DEFAULT_SEEKS;
721 register_shrinker(&qinf->qi_shrinker); 734 register_shrinker(&qinf->qi_shrinker);
722 return 0; 735 return 0;
723 } 736 }
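Context for the hunk above: the open-coded BBTOB()/do_div() computation of qi_dqperchunk is replaced by a call to xfs_qm_calc_dquots_per_chunk(), which is introduced elsewhere in this patch so the same arithmetic can be shared with the dquot buffer verifiers. A minimal sketch of the equivalent calculation, assuming the helper simply converts the chunk length from basic blocks to bytes and divides by the on-disk dquot record size (the name below is illustrative, not the patch's code):

/*
 * Sketch only: number of dquot records in one dquot chunk, given the
 * chunk length in 512-byte basic blocks.  Mirrors the removed
 * open-coded version above.
 */
STATIC unsigned int
xfs_qm_calc_dquots_per_chunk_sketch(
	struct xfs_mount	*mp,
	unsigned int		nbblks)	/* basic blocks per dquot chunk */
{
	ASSERT(nbblks > 0);
	/* bytes per chunk divided by the size of one on-disk dquot record */
	return BBTOB(nbblks) / sizeof(xfs_dqblk_t);
}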
724 737
725 738
726 /* 739 /*
727 * Gets called when unmounting a filesystem or when all quotas get 740 * Gets called when unmounting a filesystem or when all quotas get
728 * turned off. 741 * turned off.
729 * This purges the quota inodes, destroys locks and frees itself. 742 * This purges the quota inodes, destroys locks and frees itself.
730 */ 743 */
731 void 744 void
732 xfs_qm_destroy_quotainfo( 745 xfs_qm_destroy_quotainfo(
733 xfs_mount_t *mp) 746 xfs_mount_t *mp)
734 { 747 {
735 xfs_quotainfo_t *qi; 748 xfs_quotainfo_t *qi;
736 749
737 qi = mp->m_quotainfo; 750 qi = mp->m_quotainfo;
738 ASSERT(qi != NULL); 751 ASSERT(qi != NULL);
739 752
740 unregister_shrinker(&qi->qi_shrinker); 753 unregister_shrinker(&qi->qi_shrinker);
741 754
742 if (qi->qi_uquotaip) { 755 if (qi->qi_uquotaip) {
743 IRELE(qi->qi_uquotaip); 756 IRELE(qi->qi_uquotaip);
744 qi->qi_uquotaip = NULL; /* paranoia */ 757 qi->qi_uquotaip = NULL; /* paranoia */
745 } 758 }
746 if (qi->qi_gquotaip) { 759 if (qi->qi_gquotaip) {
747 IRELE(qi->qi_gquotaip); 760 IRELE(qi->qi_gquotaip);
748 qi->qi_gquotaip = NULL; 761 qi->qi_gquotaip = NULL;
749 } 762 }
750 mutex_destroy(&qi->qi_quotaofflock); 763 mutex_destroy(&qi->qi_quotaofflock);
751 kmem_free(qi); 764 kmem_free(qi);
752 mp->m_quotainfo = NULL; 765 mp->m_quotainfo = NULL;
753 } 766 }
754 767
755 /* 768 /*
756 * Create an inode and return with a reference already taken, but unlocked 769 * Create an inode and return with a reference already taken, but unlocked
757 * This is how we create quota inodes 770 * This is how we create quota inodes
758 */ 771 */
759 STATIC int 772 STATIC int
760 xfs_qm_qino_alloc( 773 xfs_qm_qino_alloc(
761 xfs_mount_t *mp, 774 xfs_mount_t *mp,
762 xfs_inode_t **ip, 775 xfs_inode_t **ip,
763 __int64_t sbfields, 776 __int64_t sbfields,
764 uint flags) 777 uint flags)
765 { 778 {
766 xfs_trans_t *tp; 779 xfs_trans_t *tp;
767 int error; 780 int error;
768 int committed; 781 int committed;
769 782
770 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE); 783 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE);
771 if ((error = xfs_trans_reserve(tp, 784 if ((error = xfs_trans_reserve(tp,
772 XFS_QM_QINOCREATE_SPACE_RES(mp), 785 XFS_QM_QINOCREATE_SPACE_RES(mp),
773 XFS_CREATE_LOG_RES(mp), 0, 786 XFS_CREATE_LOG_RES(mp), 0,
774 XFS_TRANS_PERM_LOG_RES, 787 XFS_TRANS_PERM_LOG_RES,
775 XFS_CREATE_LOG_COUNT))) { 788 XFS_CREATE_LOG_COUNT))) {
776 xfs_trans_cancel(tp, 0); 789 xfs_trans_cancel(tp, 0);
777 return error; 790 return error;
778 } 791 }
779 792
780 error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed); 793 error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed);
781 if (error) { 794 if (error) {
782 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | 795 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
783 XFS_TRANS_ABORT); 796 XFS_TRANS_ABORT);
784 return error; 797 return error;
785 } 798 }
786 799
787 /* 800 /*
788 * Make the changes in the superblock, and log those too. 801 * Make the changes in the superblock, and log those too.
789 * sbfields arg may contain fields other than *QUOTINO; 802 * sbfields arg may contain fields other than *QUOTINO;
790 * VERSIONNUM for example. 803 * VERSIONNUM for example.
791 */ 804 */
792 spin_lock(&mp->m_sb_lock); 805 spin_lock(&mp->m_sb_lock);
793 if (flags & XFS_QMOPT_SBVERSION) { 806 if (flags & XFS_QMOPT_SBVERSION) {
794 ASSERT(!xfs_sb_version_hasquota(&mp->m_sb)); 807 ASSERT(!xfs_sb_version_hasquota(&mp->m_sb));
795 ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | 808 ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
796 XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) == 809 XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) ==
797 (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | 810 (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
798 XFS_SB_GQUOTINO | XFS_SB_QFLAGS)); 811 XFS_SB_GQUOTINO | XFS_SB_QFLAGS));
799 812
800 xfs_sb_version_addquota(&mp->m_sb); 813 xfs_sb_version_addquota(&mp->m_sb);
801 mp->m_sb.sb_uquotino = NULLFSINO; 814 mp->m_sb.sb_uquotino = NULLFSINO;
802 mp->m_sb.sb_gquotino = NULLFSINO; 815 mp->m_sb.sb_gquotino = NULLFSINO;
803 816
804 /* qflags will get updated _after_ quotacheck */ 817 /* qflags will get updated _after_ quotacheck */
805 mp->m_sb.sb_qflags = 0; 818 mp->m_sb.sb_qflags = 0;
806 } 819 }
807 if (flags & XFS_QMOPT_UQUOTA) 820 if (flags & XFS_QMOPT_UQUOTA)
808 mp->m_sb.sb_uquotino = (*ip)->i_ino; 821 mp->m_sb.sb_uquotino = (*ip)->i_ino;
809 else 822 else
810 mp->m_sb.sb_gquotino = (*ip)->i_ino; 823 mp->m_sb.sb_gquotino = (*ip)->i_ino;
811 spin_unlock(&mp->m_sb_lock); 824 spin_unlock(&mp->m_sb_lock);
812 xfs_mod_sb(tp, sbfields); 825 xfs_mod_sb(tp, sbfields);
813 826
814 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) { 827 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) {
815 xfs_alert(mp, "%s failed (error %d)!", __func__, error); 828 xfs_alert(mp, "%s failed (error %d)!", __func__, error);
816 return error; 829 return error;
817 } 830 }
818 return 0; 831 return 0;
819 } 832 }
820 833
821 834
822 STATIC void 835 STATIC void
823 xfs_qm_reset_dqcounts( 836 xfs_qm_reset_dqcounts(
824 xfs_mount_t *mp, 837 xfs_mount_t *mp,
825 xfs_buf_t *bp, 838 xfs_buf_t *bp,
826 xfs_dqid_t id, 839 xfs_dqid_t id,
827 uint type) 840 uint type)
828 { 841 {
829 xfs_disk_dquot_t *ddq; 842 xfs_disk_dquot_t *ddq;
830 int j; 843 int j;
831 844
832 trace_xfs_reset_dqcounts(bp, _RET_IP_); 845 trace_xfs_reset_dqcounts(bp, _RET_IP_);
833 846
834 /* 847 /*
835 * Reset all counters and timers. They'll be 848 * Reset all counters and timers. They'll be
836 * started afresh by xfs_qm_quotacheck. 849 * started afresh by xfs_qm_quotacheck.
837 */ 850 */
838 #ifdef DEBUG 851 #ifdef DEBUG
839 j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); 852 j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
840 do_div(j, sizeof(xfs_dqblk_t)); 853 do_div(j, sizeof(xfs_dqblk_t));
841 ASSERT(mp->m_quotainfo->qi_dqperchunk == j); 854 ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
842 #endif 855 #endif
843 ddq = bp->b_addr; 856 ddq = bp->b_addr;
844 for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) { 857 for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) {
845 /* 858 /*
846 * Do a sanity check, and if needed, repair the dqblk. Don't 859 * Do a sanity check, and if needed, repair the dqblk. Don't
847 * output any warnings because it's perfectly possible to 860 * output any warnings because it's perfectly possible to
848 * find uninitialised dquot blks. See comment in xfs_qm_dqcheck. 861 * find uninitialised dquot blks. See comment in xfs_qm_dqcheck.
849 */ 862 */
850 (void) xfs_qm_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR, 863 (void) xfs_qm_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR,
851 "xfs_quotacheck"); 864 "xfs_quotacheck");
852 ddq->d_bcount = 0; 865 ddq->d_bcount = 0;
853 ddq->d_icount = 0; 866 ddq->d_icount = 0;
854 ddq->d_rtbcount = 0; 867 ddq->d_rtbcount = 0;
855 ddq->d_btimer = 0; 868 ddq->d_btimer = 0;
856 ddq->d_itimer = 0; 869 ddq->d_itimer = 0;
857 ddq->d_rtbtimer = 0; 870 ddq->d_rtbtimer = 0;
858 ddq->d_bwarns = 0; 871 ddq->d_bwarns = 0;
859 ddq->d_iwarns = 0; 872 ddq->d_iwarns = 0;
860 ddq->d_rtbwarns = 0; 873 ddq->d_rtbwarns = 0;
861 ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1); 874 ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1);
862 } 875 }
863 } 876 }
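For reference, the record that the DEBUG check above and the per-chunk arithmetic divide by is struct xfs_dqblk, the on-disk dquot block entry. This commit uses its previously reserved tail space to hold a CRC, an LSN and the filesystem UUID (that change lives in one of the other files of this patch); a rough sketch of the extended layout, with field names assumed from the rest of the series:

/* Rough sketch of the on-disk dquot record as extended by this series. */
typedef struct xfs_dqblk {
	xfs_disk_dquot_t  dd_diskdq;	/* portion that also lives incore */
	char		  dd_fill[4];	/* padding */

	/* Only present/valid on filesystems with the CRC feature bit set: */
	__be32		  dd_crc;	/* CRC over the whole xfs_dqblk */
	__be64		  dd_lsn;	/* LSN of last modification in the log */
	uuid_t		  dd_uuid;	/* filesystem UUID for identification */
} xfs_dqblk_t;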
864 877
865 STATIC int 878 STATIC int
866 xfs_qm_dqiter_bufs( 879 xfs_qm_dqiter_bufs(
867 struct xfs_mount *mp, 880 struct xfs_mount *mp,
868 xfs_dqid_t firstid, 881 xfs_dqid_t firstid,
869 xfs_fsblock_t bno, 882 xfs_fsblock_t bno,
870 xfs_filblks_t blkcnt, 883 xfs_filblks_t blkcnt,
871 uint flags, 884 uint flags,
872 struct list_head *buffer_list) 885 struct list_head *buffer_list)
873 { 886 {
874 struct xfs_buf *bp; 887 struct xfs_buf *bp;
875 int error; 888 int error;
876 int type; 889 int type;
877 890
878 ASSERT(blkcnt > 0); 891 ASSERT(blkcnt > 0);
879 type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER : 892 type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER :
880 (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP); 893 (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP);
881 error = 0; 894 error = 0;
882 895
883 /* 896 /*
884 * Blkcnt arg can be a very big number, and might even be 897 * Blkcnt arg can be a very big number, and might even be
885 * larger than the log itself. So, we have to break it up into 898 * larger than the log itself. So, we have to break it up into
886 * manageable-sized transactions. 899 * manageable-sized transactions.
887 * Note that we don't start a permanent transaction here; we might 900 * Note that we don't start a permanent transaction here; we might
888 * not be able to get a log reservation for the whole thing up front, 901 * not be able to get a log reservation for the whole thing up front,
889 * and we don't really care to either, because we just discard 902 * and we don't really care to either, because we just discard
890 * everything if we were to crash in the middle of this loop. 903 * everything if we were to crash in the middle of this loop.
891 */ 904 */
892 while (blkcnt--) { 905 while (blkcnt--) {
893 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, 906 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
894 XFS_FSB_TO_DADDR(mp, bno), 907 XFS_FSB_TO_DADDR(mp, bno),
895 mp->m_quotainfo->qi_dqchunklen, 0, &bp, 908 mp->m_quotainfo->qi_dqchunklen, 0, &bp,
896 &xfs_dquot_buf_ops); 909 &xfs_dquot_buf_ops);
897 if (error) 910 if (error)
898 break; 911 break;
899 912
913 /*
914 * XXX(hch): need to figure out if it makes sense to validate
915 * the CRC here.
916 */
900 xfs_qm_reset_dqcounts(mp, bp, firstid, type); 917 xfs_qm_reset_dqcounts(mp, bp, firstid, type);
901 xfs_buf_delwri_queue(bp, buffer_list); 918 xfs_buf_delwri_queue(bp, buffer_list);
902 xfs_buf_relse(bp); 919 xfs_buf_relse(bp);
903 /* 920 /*
904 * goto the next block. 921 * goto the next block.
905 */ 922 */
906 bno++; 923 bno++;
907 firstid += mp->m_quotainfo->qi_dqperchunk; 924 firstid += mp->m_quotainfo->qi_dqperchunk;
908 } 925 }
909 926
910 return error; 927 return error;
911 } 928 }
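The new XXX(hch) comment above leaves per-chunk CRC validation during quotacheck as an open question; the read already goes through &xfs_dquot_buf_ops, so the verifier can check CRCs as the buffer comes off disk. If the check were done here instead, a hedged sketch (assuming the dd_crc field shown earlier and the generic xfs_verify_cksum() helper from xfs_cksum.h; not part of this commit) might look like:

/*
 * Sketch only: walk every dquot record in a freshly read chunk buffer
 * and verify its CRC.  CRCs only exist on CRC-enabled (v5) superblocks,
 * so skip the check otherwise.
 */
STATIC bool
xfs_qm_dqblk_crc_ok(
	struct xfs_mount	*mp,
	struct xfs_buf		*bp)
{
	struct xfs_dqblk	*d = bp->b_addr;
	int			ndquots = mp->m_quotainfo->qi_dqperchunk;
	int			i;

	if (!xfs_sb_version_hascrc(&mp->m_sb))
		return true;

	for (i = 0; i < ndquots; i++, d++) {
		if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
				      offsetof(struct xfs_dqblk, dd_crc)))
			return false;	/* corrupt dquot record */
	}
	return true;
}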
912 929
913 /* 930 /*
914 * Iterate over all allocated USR/GRP/PRJ dquots in the system, calling a 931 * Iterate over all allocated USR/GRP/PRJ dquots in the system, calling a
915 * caller supplied function for every chunk of dquots that we find. 932 * caller supplied function for every chunk of dquots that we find.
916 */ 933 */
917 STATIC int 934 STATIC int
918 xfs_qm_dqiterate( 935 xfs_qm_dqiterate(
919 struct xfs_mount *mp, 936 struct xfs_mount *mp,
920 struct xfs_inode *qip, 937 struct xfs_inode *qip,
921 uint flags, 938 uint flags,
922 struct list_head *buffer_list) 939 struct list_head *buffer_list)
923 { 940 {
924 struct xfs_bmbt_irec *map; 941 struct xfs_bmbt_irec *map;
925 int i, nmaps; /* number of map entries */ 942 int i, nmaps; /* number of map entries */
926 int error; /* return value */ 943 int error; /* return value */
927 xfs_fileoff_t lblkno; 944 xfs_fileoff_t lblkno;
928 xfs_filblks_t maxlblkcnt; 945 xfs_filblks_t maxlblkcnt;
929 xfs_dqid_t firstid; 946 xfs_dqid_t firstid;
930 xfs_fsblock_t rablkno; 947 xfs_fsblock_t rablkno;
931 xfs_filblks_t rablkcnt; 948 xfs_filblks_t rablkcnt;
932 949
933 error = 0; 950 error = 0;
934 /* 951 /*
935 * This looks racy, but we can't keep an inode lock across a 952 * This looks racy, but we can't keep an inode lock across a
936 * trans_reserve. But, this gets called during quotacheck, and that 953 * trans_reserve. But, this gets called during quotacheck, and that
937 * happens only at mount time which is single threaded. 954 * happens only at mount time which is single threaded.
938 */ 955 */
939 if (qip->i_d.di_nblocks == 0) 956 if (qip->i_d.di_nblocks == 0)
940 return 0; 957 return 0;
941 958
942 map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP); 959 map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP);
943 960
944 lblkno = 0; 961 lblkno = 0;
945 maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); 962 maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
946 do { 963 do {
947 nmaps = XFS_DQITER_MAP_SIZE; 964 nmaps = XFS_DQITER_MAP_SIZE;
948 /* 965 /*
949 * We aren't changing the inode itself. Just changing 966 * We aren't changing the inode itself. Just changing
950 * some of its data. No new blocks are added here, and 967 * some of its data. No new blocks are added here, and
951 * the inode is never added to the transaction. 968 * the inode is never added to the transaction.
952 */ 969 */
953 xfs_ilock(qip, XFS_ILOCK_SHARED); 970 xfs_ilock(qip, XFS_ILOCK_SHARED);
954 error = xfs_bmapi_read(qip, lblkno, maxlblkcnt - lblkno, 971 error = xfs_bmapi_read(qip, lblkno, maxlblkcnt - lblkno,
955 map, &nmaps, 0); 972 map, &nmaps, 0);
956 xfs_iunlock(qip, XFS_ILOCK_SHARED); 973 xfs_iunlock(qip, XFS_ILOCK_SHARED);
957 if (error) 974 if (error)
958 break; 975 break;
959 976
960 ASSERT(nmaps <= XFS_DQITER_MAP_SIZE); 977 ASSERT(nmaps <= XFS_DQITER_MAP_SIZE);
961 for (i = 0; i < nmaps; i++) { 978 for (i = 0; i < nmaps; i++) {
962 ASSERT(map[i].br_startblock != DELAYSTARTBLOCK); 979 ASSERT(map[i].br_startblock != DELAYSTARTBLOCK);
963 ASSERT(map[i].br_blockcount); 980 ASSERT(map[i].br_blockcount);
964 981
965 982
966 lblkno += map[i].br_blockcount; 983 lblkno += map[i].br_blockcount;
967 984
968 if (map[i].br_startblock == HOLESTARTBLOCK) 985 if (map[i].br_startblock == HOLESTARTBLOCK)
969 continue; 986 continue;
970 987
971 firstid = (xfs_dqid_t) map[i].br_startoff * 988 firstid = (xfs_dqid_t) map[i].br_startoff *
972 mp->m_quotainfo->qi_dqperchunk; 989 mp->m_quotainfo->qi_dqperchunk;
973 /* 990 /*
974 * Do a read-ahead on the next extent. 991 * Do a read-ahead on the next extent.
975 */ 992 */
976 if ((i+1 < nmaps) && 993 if ((i+1 < nmaps) &&
977 (map[i+1].br_startblock != HOLESTARTBLOCK)) { 994 (map[i+1].br_startblock != HOLESTARTBLOCK)) {
978 rablkcnt = map[i+1].br_blockcount; 995 rablkcnt = map[i+1].br_blockcount;
979 rablkno = map[i+1].br_startblock; 996 rablkno = map[i+1].br_startblock;
980 while (rablkcnt--) { 997 while (rablkcnt--) {
981 xfs_buf_readahead(mp->m_ddev_targp, 998 xfs_buf_readahead(mp->m_ddev_targp,
982 XFS_FSB_TO_DADDR(mp, rablkno), 999 XFS_FSB_TO_DADDR(mp, rablkno),
983 mp->m_quotainfo->qi_dqchunklen, 1000 mp->m_quotainfo->qi_dqchunklen,
984 NULL); 1001 NULL);
985 rablkno++; 1002 rablkno++;
986 } 1003 }
987 } 1004 }
988 /* 1005 /*
989 * Iterate thru all the blks in the extent and 1006 * Iterate thru all the blks in the extent and
990 * reset the counters of all the dquots inside them. 1007 * reset the counters of all the dquots inside them.
991 */ 1008 */
992 error = xfs_qm_dqiter_bufs(mp, firstid, 1009 error = xfs_qm_dqiter_bufs(mp, firstid,
993 map[i].br_startblock, 1010 map[i].br_startblock,
994 map[i].br_blockcount, 1011 map[i].br_blockcount,
995 flags, buffer_list); 1012 flags, buffer_list);
996 if (error) 1013 if (error)
997 goto out; 1014 goto out;
998 } 1015 }
999 } while (nmaps > 0); 1016 } while (nmaps > 0);
1000 1017
1001 out: 1018 out:
1002 kmem_free(map); 1019 kmem_free(map);
1003 return error; 1020 return error;
1004 } 1021 }
1005 1022
1006 /* 1023 /*
1007 * Called by dqusage_adjust in doing a quotacheck. 1024 * Called by dqusage_adjust in doing a quotacheck.
1008 * 1025 *
1009 * Given the inode, and a dquot id this updates both the incore dqout as well 1026 * Given the inode, and a dquot id this updates both the incore dqout as well
1010 * as the buffer copy. This is so that once the quotacheck is done, we can 1027 * as the buffer copy. This is so that once the quotacheck is done, we can
1011 * just log all the buffers, as opposed to logging numerous updates to 1028 * just log all the buffers, as opposed to logging numerous updates to
1012 * individual dquots. 1029 * individual dquots.
1013 */ 1030 */
1014 STATIC int 1031 STATIC int
1015 xfs_qm_quotacheck_dqadjust( 1032 xfs_qm_quotacheck_dqadjust(
1016 struct xfs_inode *ip, 1033 struct xfs_inode *ip,
1017 xfs_dqid_t id, 1034 xfs_dqid_t id,
1018 uint type, 1035 uint type,
1019 xfs_qcnt_t nblks, 1036 xfs_qcnt_t nblks,
1020 xfs_qcnt_t rtblks) 1037 xfs_qcnt_t rtblks)
1021 { 1038 {
1022 struct xfs_mount *mp = ip->i_mount; 1039 struct xfs_mount *mp = ip->i_mount;
1023 struct xfs_dquot *dqp; 1040 struct xfs_dquot *dqp;
1024 int error; 1041 int error;
1025 1042
1026 error = xfs_qm_dqget(mp, ip, id, type, 1043 error = xfs_qm_dqget(mp, ip, id, type,
1027 XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, &dqp); 1044 XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, &dqp);
1028 if (error) { 1045 if (error) {
1029 /* 1046 /*
1030 * Shouldn't be able to turn off quotas here. 1047 * Shouldn't be able to turn off quotas here.
1031 */ 1048 */
1032 ASSERT(error != ESRCH); 1049 ASSERT(error != ESRCH);
1033 ASSERT(error != ENOENT); 1050 ASSERT(error != ENOENT);
1034 return error; 1051 return error;
1035 } 1052 }
1036 1053
1037 trace_xfs_dqadjust(dqp); 1054 trace_xfs_dqadjust(dqp);
1038 1055
1039 /* 1056 /*
1040 * Adjust the inode count and the block count to reflect this inode's 1057 * Adjust the inode count and the block count to reflect this inode's
1041 * resource usage. 1058 * resource usage.
1042 */ 1059 */
1043 be64_add_cpu(&dqp->q_core.d_icount, 1); 1060 be64_add_cpu(&dqp->q_core.d_icount, 1);
1044 dqp->q_res_icount++; 1061 dqp->q_res_icount++;
1045 if (nblks) { 1062 if (nblks) {
1046 be64_add_cpu(&dqp->q_core.d_bcount, nblks); 1063 be64_add_cpu(&dqp->q_core.d_bcount, nblks);
1047 dqp->q_res_bcount += nblks; 1064 dqp->q_res_bcount += nblks;
1048 } 1065 }
1049 if (rtblks) { 1066 if (rtblks) {
1050 be64_add_cpu(&dqp->q_core.d_rtbcount, rtblks); 1067 be64_add_cpu(&dqp->q_core.d_rtbcount, rtblks);
1051 dqp->q_res_rtbcount += rtblks; 1068 dqp->q_res_rtbcount += rtblks;
1052 } 1069 }
1053 1070
1054 /* 1071 /*
1055 * Set default limits, adjust timers (since we changed usages) 1072 * Set default limits, adjust timers (since we changed usages)
1056 * 1073 *
1057 * There are no timers for the default values set in the root dquot. 1074 * There are no timers for the default values set in the root dquot.
1058 */ 1075 */
1059 if (dqp->q_core.d_id) { 1076 if (dqp->q_core.d_id) {
1060 xfs_qm_adjust_dqlimits(mp, dqp); 1077 xfs_qm_adjust_dqlimits(mp, dqp);
1061 xfs_qm_adjust_dqtimers(mp, &dqp->q_core); 1078 xfs_qm_adjust_dqtimers(mp, &dqp->q_core);
1062 } 1079 }
1063 1080
1064 dqp->dq_flags |= XFS_DQ_DIRTY; 1081 dqp->dq_flags |= XFS_DQ_DIRTY;
1065 xfs_qm_dqput(dqp); 1082 xfs_qm_dqput(dqp);
1066 return 0; 1083 return 0;
1067 } 1084 }
1068 1085
1069 STATIC int 1086 STATIC int
1070 xfs_qm_get_rtblks( 1087 xfs_qm_get_rtblks(
1071 xfs_inode_t *ip, 1088 xfs_inode_t *ip,
1072 xfs_qcnt_t *O_rtblks) 1089 xfs_qcnt_t *O_rtblks)
1073 { 1090 {
1074 xfs_filblks_t rtblks; /* total rt blks */ 1091 xfs_filblks_t rtblks; /* total rt blks */
1075 xfs_extnum_t idx; /* extent record index */ 1092 xfs_extnum_t idx; /* extent record index */
1076 xfs_ifork_t *ifp; /* inode fork pointer */ 1093 xfs_ifork_t *ifp; /* inode fork pointer */
1077 xfs_extnum_t nextents; /* number of extent entries */ 1094 xfs_extnum_t nextents; /* number of extent entries */
1078 int error; 1095 int error;
1079 1096
1080 ASSERT(XFS_IS_REALTIME_INODE(ip)); 1097 ASSERT(XFS_IS_REALTIME_INODE(ip));
1081 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); 1098 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
1082 if (!(ifp->if_flags & XFS_IFEXTENTS)) { 1099 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
1083 if ((error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK))) 1100 if ((error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK)))
1084 return error; 1101 return error;
1085 } 1102 }
1086 rtblks = 0; 1103 rtblks = 0;
1087 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 1104 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
1088 for (idx = 0; idx < nextents; idx++) 1105 for (idx = 0; idx < nextents; idx++)
1089 rtblks += xfs_bmbt_get_blockcount(xfs_iext_get_ext(ifp, idx)); 1106 rtblks += xfs_bmbt_get_blockcount(xfs_iext_get_ext(ifp, idx));
1090 *O_rtblks = (xfs_qcnt_t)rtblks; 1107 *O_rtblks = (xfs_qcnt_t)rtblks;
1091 return 0; 1108 return 0;
1092 } 1109 }
1093 1110
1094 /* 1111 /*
1095 * callback routine supplied to bulkstat(). Given an inumber, find its 1112 * callback routine supplied to bulkstat(). Given an inumber, find its
1096 * dquots and update them to account for resources taken by that inode. 1113 * dquots and update them to account for resources taken by that inode.
1097 */ 1114 */
1098 /* ARGSUSED */ 1115 /* ARGSUSED */
1099 STATIC int 1116 STATIC int
1100 xfs_qm_dqusage_adjust( 1117 xfs_qm_dqusage_adjust(
1101 xfs_mount_t *mp, /* mount point for filesystem */ 1118 xfs_mount_t *mp, /* mount point for filesystem */
1102 xfs_ino_t ino, /* inode number to get data for */ 1119 xfs_ino_t ino, /* inode number to get data for */
1103 void __user *buffer, /* not used */ 1120 void __user *buffer, /* not used */
1104 int ubsize, /* not used */ 1121 int ubsize, /* not used */
1105 int *ubused, /* not used */ 1122 int *ubused, /* not used */
1106 int *res) /* result code value */ 1123 int *res) /* result code value */
1107 { 1124 {
1108 xfs_inode_t *ip; 1125 xfs_inode_t *ip;
1109 xfs_qcnt_t nblks, rtblks = 0; 1126 xfs_qcnt_t nblks, rtblks = 0;
1110 int error; 1127 int error;
1111 1128
1112 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 1129 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
1113 1130
1114 /* 1131 /*
1115 * rootino must have its resources accounted for, not so with the quota 1132 * rootino must have its resources accounted for, not so with the quota
1116 * inodes. 1133 * inodes.
1117 */ 1134 */
1118 if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) { 1135 if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
1119 *res = BULKSTAT_RV_NOTHING; 1136 *res = BULKSTAT_RV_NOTHING;
1120 return XFS_ERROR(EINVAL); 1137 return XFS_ERROR(EINVAL);
1121 } 1138 }
1122 1139
1123 /* 1140 /*
1124 * We don't _need_ to take the ilock EXCL. However, the xfs_qm_dqget 1141 * We don't _need_ to take the ilock EXCL. However, the xfs_qm_dqget
1125 * interface expects the inode to be exclusively locked because that's 1142 * interface expects the inode to be exclusively locked because that's
1126 * the case in all other instances. It's OK that we do this because 1143 * the case in all other instances. It's OK that we do this because
1127 * quotacheck is done only at mount time. 1144 * quotacheck is done only at mount time.
1128 */ 1145 */
1129 error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip); 1146 error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip);
1130 if (error) { 1147 if (error) {
1131 *res = BULKSTAT_RV_NOTHING; 1148 *res = BULKSTAT_RV_NOTHING;
1132 return error; 1149 return error;
1133 } 1150 }
1134 1151
1135 ASSERT(ip->i_delayed_blks == 0); 1152 ASSERT(ip->i_delayed_blks == 0);
1136 1153
1137 if (XFS_IS_REALTIME_INODE(ip)) { 1154 if (XFS_IS_REALTIME_INODE(ip)) {
1138 /* 1155 /*
1139 * Walk thru the extent list and count the realtime blocks. 1156 * Walk thru the extent list and count the realtime blocks.
1140 */ 1157 */
1141 error = xfs_qm_get_rtblks(ip, &rtblks); 1158 error = xfs_qm_get_rtblks(ip, &rtblks);
1142 if (error) 1159 if (error)
1143 goto error0; 1160 goto error0;
1144 } 1161 }
1145 1162
1146 nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks; 1163 nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
1147 1164
1148 /* 1165 /*
1149 * Add the (disk blocks and inode) resources occupied by this 1166 * Add the (disk blocks and inode) resources occupied by this
1150 * inode to its dquots. We do this adjustment in the incore dquot, 1167 * inode to its dquots. We do this adjustment in the incore dquot,
1151 * and also copy the changes to its buffer. 1168 * and also copy the changes to its buffer.
1152 * We don't care about putting these changes in a transaction 1169 * We don't care about putting these changes in a transaction
1153 * envelope because if we crash in the middle of a 'quotacheck' 1170 * envelope because if we crash in the middle of a 'quotacheck'
1154 * we have to start from the beginning anyway. 1171 * we have to start from the beginning anyway.
1155 * Once we're done, we'll log all the dquot bufs. 1172 * Once we're done, we'll log all the dquot bufs.
1156 * 1173 *
1157 * The *QUOTA_ON checks below may look pretty racy, but quotachecks 1174 * The *QUOTA_ON checks below may look pretty racy, but quotachecks
1158 * and quotaoffs don't race. (Quotachecks happen at mount time only). 1175 * and quotaoffs don't race. (Quotachecks happen at mount time only).
1159 */ 1176 */
1160 if (XFS_IS_UQUOTA_ON(mp)) { 1177 if (XFS_IS_UQUOTA_ON(mp)) {
1161 error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_uid, 1178 error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_uid,
1162 XFS_DQ_USER, nblks, rtblks); 1179 XFS_DQ_USER, nblks, rtblks);
1163 if (error) 1180 if (error)
1164 goto error0; 1181 goto error0;
1165 } 1182 }
1166 1183
1167 if (XFS_IS_GQUOTA_ON(mp)) { 1184 if (XFS_IS_GQUOTA_ON(mp)) {
1168 error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_gid, 1185 error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_gid,
1169 XFS_DQ_GROUP, nblks, rtblks); 1186 XFS_DQ_GROUP, nblks, rtblks);
1170 if (error) 1187 if (error)
1171 goto error0; 1188 goto error0;
1172 } 1189 }
1173 1190
1174 if (XFS_IS_PQUOTA_ON(mp)) { 1191 if (XFS_IS_PQUOTA_ON(mp)) {
1175 error = xfs_qm_quotacheck_dqadjust(ip, xfs_get_projid(ip), 1192 error = xfs_qm_quotacheck_dqadjust(ip, xfs_get_projid(ip),
1176 XFS_DQ_PROJ, nblks, rtblks); 1193 XFS_DQ_PROJ, nblks, rtblks);
1177 if (error) 1194 if (error)
1178 goto error0; 1195 goto error0;
1179 } 1196 }
1180 1197
1181 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1198 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1182 IRELE(ip); 1199 IRELE(ip);
1183 *res = BULKSTAT_RV_DIDONE; 1200 *res = BULKSTAT_RV_DIDONE;
1184 return 0; 1201 return 0;
1185 1202
1186 error0: 1203 error0:
1187 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1204 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1188 IRELE(ip); 1205 IRELE(ip);
1189 *res = BULKSTAT_RV_GIVEUP; 1206 *res = BULKSTAT_RV_GIVEUP;
1190 return error; 1207 return error;
1191 } 1208 }
1192 1209
1193 STATIC int 1210 STATIC int
1194 xfs_qm_flush_one( 1211 xfs_qm_flush_one(
1195 struct xfs_dquot *dqp, 1212 struct xfs_dquot *dqp,
1196 void *data) 1213 void *data)
1197 { 1214 {
1198 struct list_head *buffer_list = data; 1215 struct list_head *buffer_list = data;
1199 struct xfs_buf *bp = NULL; 1216 struct xfs_buf *bp = NULL;
1200 int error = 0; 1217 int error = 0;
1201 1218
1202 xfs_dqlock(dqp); 1219 xfs_dqlock(dqp);
1203 if (dqp->dq_flags & XFS_DQ_FREEING) 1220 if (dqp->dq_flags & XFS_DQ_FREEING)
1204 goto out_unlock; 1221 goto out_unlock;
1205 if (!XFS_DQ_IS_DIRTY(dqp)) 1222 if (!XFS_DQ_IS_DIRTY(dqp))
1206 goto out_unlock; 1223 goto out_unlock;
1207 1224
1208 xfs_dqflock(dqp); 1225 xfs_dqflock(dqp);
1209 error = xfs_qm_dqflush(dqp, &bp); 1226 error = xfs_qm_dqflush(dqp, &bp);
1210 if (error) 1227 if (error)
1211 goto out_unlock; 1228 goto out_unlock;
1212 1229
1213 xfs_buf_delwri_queue(bp, buffer_list); 1230 xfs_buf_delwri_queue(bp, buffer_list);
1214 xfs_buf_relse(bp); 1231 xfs_buf_relse(bp);
1215 out_unlock: 1232 out_unlock:
1216 xfs_dqunlock(dqp); 1233 xfs_dqunlock(dqp);
1217 return error; 1234 return error;
1218 } 1235 }
1219 1236
1220 /* 1237 /*
1221 * Walk thru all the filesystem inodes and construct a consistent view 1238 * Walk thru all the filesystem inodes and construct a consistent view
1222 * of the disk quota world. If the quotacheck fails, disable quotas. 1239 * of the disk quota world. If the quotacheck fails, disable quotas.
1223 */ 1240 */
1224 int 1241 int
1225 xfs_qm_quotacheck( 1242 xfs_qm_quotacheck(
1226 xfs_mount_t *mp) 1243 xfs_mount_t *mp)
1227 { 1244 {
1228 int done, count, error, error2; 1245 int done, count, error, error2;
1229 xfs_ino_t lastino; 1246 xfs_ino_t lastino;
1230 size_t structsz; 1247 size_t structsz;
1231 xfs_inode_t *uip, *gip; 1248 xfs_inode_t *uip, *gip;
1232 uint flags; 1249 uint flags;
1233 LIST_HEAD (buffer_list); 1250 LIST_HEAD (buffer_list);
1234 1251
1235 count = INT_MAX; 1252 count = INT_MAX;
1236 structsz = 1; 1253 structsz = 1;
1237 lastino = 0; 1254 lastino = 0;
1238 flags = 0; 1255 flags = 0;
1239 1256
1240 ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip); 1257 ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip);
1241 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 1258 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
1242 1259
1243 xfs_notice(mp, "Quotacheck needed: Please wait."); 1260 xfs_notice(mp, "Quotacheck needed: Please wait.");
1244 1261
1245 /* 1262 /*
1246 * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset 1263 * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset
1247 * their counters to zero. We need a clean slate. 1264 * their counters to zero. We need a clean slate.
1248 * We don't log our changes till later. 1265 * We don't log our changes till later.
1249 */ 1266 */
1250 uip = mp->m_quotainfo->qi_uquotaip; 1267 uip = mp->m_quotainfo->qi_uquotaip;
1251 if (uip) { 1268 if (uip) {
1252 error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA, 1269 error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA,
1253 &buffer_list); 1270 &buffer_list);
1254 if (error) 1271 if (error)
1255 goto error_return; 1272 goto error_return;
1256 flags |= XFS_UQUOTA_CHKD; 1273 flags |= XFS_UQUOTA_CHKD;
1257 } 1274 }
1258 1275
1259 gip = mp->m_quotainfo->qi_gquotaip; 1276 gip = mp->m_quotainfo->qi_gquotaip;
1260 if (gip) { 1277 if (gip) {
1261 error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? 1278 error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
1262 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA, 1279 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA,
1263 &buffer_list); 1280 &buffer_list);
1264 if (error) 1281 if (error)
1265 goto error_return; 1282 goto error_return;
1266 flags |= XFS_OQUOTA_CHKD; 1283 flags |= XFS_OQUOTA_CHKD;
1267 } 1284 }
1268 1285
1269 do { 1286 do {
1270 /* 1287 /*
1271 * Iterate thru all the inodes in the file system, 1288 * Iterate thru all the inodes in the file system,
1272 * adjusting the corresponding dquot counters in core. 1289 * adjusting the corresponding dquot counters in core.
1273 */ 1290 */
1274 error = xfs_bulkstat(mp, &lastino, &count, 1291 error = xfs_bulkstat(mp, &lastino, &count,
1275 xfs_qm_dqusage_adjust, 1292 xfs_qm_dqusage_adjust,
1276 structsz, NULL, &done); 1293 structsz, NULL, &done);
1277 if (error) 1294 if (error)
1278 break; 1295 break;
1279 1296
1280 } while (!done); 1297 } while (!done);
1281 1298
1282 /* 1299 /*
1283 * We've made all the changes that we need to make incore. Flush them 1300 * We've made all the changes that we need to make incore. Flush them
1284 * down to disk buffers if everything was updated successfully. 1301 * down to disk buffers if everything was updated successfully.
1285 */ 1302 */
1286 if (XFS_IS_UQUOTA_ON(mp)) { 1303 if (XFS_IS_UQUOTA_ON(mp)) {
1287 error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one, 1304 error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one,
1288 &buffer_list); 1305 &buffer_list);
1289 } 1306 }
1290 if (XFS_IS_GQUOTA_ON(mp)) { 1307 if (XFS_IS_GQUOTA_ON(mp)) {
1291 error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one, 1308 error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one,
1292 &buffer_list); 1309 &buffer_list);
1293 if (!error) 1310 if (!error)
1294 error = error2; 1311 error = error2;
1295 } 1312 }
1296 if (XFS_IS_PQUOTA_ON(mp)) { 1313 if (XFS_IS_PQUOTA_ON(mp)) {
1297 error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one, 1314 error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one,
1298 &buffer_list); 1315 &buffer_list);
1299 if (!error) 1316 if (!error)
1300 error = error2; 1317 error = error2;
1301 } 1318 }
1302 1319
1303 error2 = xfs_buf_delwri_submit(&buffer_list); 1320 error2 = xfs_buf_delwri_submit(&buffer_list);
1304 if (!error) 1321 if (!error)
1305 error = error2; 1322 error = error2;
1306 1323
1307 /* 1324 /*
1308 * We can get this error if we couldn't do a dquot allocation inside 1325 * We can get this error if we couldn't do a dquot allocation inside
1309 * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the 1326 * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the
1310 * dirty dquots that might be cached, we just want to get rid of them 1327 * dirty dquots that might be cached, we just want to get rid of them
1311 * and turn quotaoff. The dquots won't be attached to any of the inodes 1328 * and turn quotaoff. The dquots won't be attached to any of the inodes
1312 * at this point (because we intentionally didn't in dqget_noattach). 1329 * at this point (because we intentionally didn't in dqget_noattach).
1313 */ 1330 */
1314 if (error) { 1331 if (error) {
1315 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); 1332 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
1316 goto error_return; 1333 goto error_return;
1317 } 1334 }
1318 1335
1319 /* 1336 /*
1320 * If one type of quotas is off, then it will lose its 1337 * If one type of quotas is off, then it will lose its
1321 * quotachecked status, since we won't be doing accounting for 1338 * quotachecked status, since we won't be doing accounting for
1322 * that type anymore. 1339 * that type anymore.
1323 */ 1340 */
1324 mp->m_qflags &= ~XFS_ALL_QUOTA_CHKD; 1341 mp->m_qflags &= ~XFS_ALL_QUOTA_CHKD;
1325 mp->m_qflags |= flags; 1342 mp->m_qflags |= flags;
1326 1343
1327 error_return: 1344 error_return:
1328 while (!list_empty(&buffer_list)) { 1345 while (!list_empty(&buffer_list)) {
1329 struct xfs_buf *bp = 1346 struct xfs_buf *bp =
1330 list_first_entry(&buffer_list, struct xfs_buf, b_list); 1347 list_first_entry(&buffer_list, struct xfs_buf, b_list);
1331 list_del_init(&bp->b_list); 1348 list_del_init(&bp->b_list);
1332 xfs_buf_relse(bp); 1349 xfs_buf_relse(bp);
1333 } 1350 }
1334 1351
1335 if (error) { 1352 if (error) {
1336 xfs_warn(mp, 1353 xfs_warn(mp,
1337 "Quotacheck: Unsuccessful (Error %d): Disabling quotas.", 1354 "Quotacheck: Unsuccessful (Error %d): Disabling quotas.",
1338 error); 1355 error);
1339 /* 1356 /*
1340 * We must turn off quotas. 1357 * We must turn off quotas.
1341 */ 1358 */
1342 ASSERT(mp->m_quotainfo != NULL); 1359 ASSERT(mp->m_quotainfo != NULL);
1343 xfs_qm_destroy_quotainfo(mp); 1360 xfs_qm_destroy_quotainfo(mp);
1344 if (xfs_mount_reset_sbqflags(mp)) { 1361 if (xfs_mount_reset_sbqflags(mp)) {
1345 xfs_warn(mp, 1362 xfs_warn(mp,
1346 "Quotacheck: Failed to reset quota flags."); 1363 "Quotacheck: Failed to reset quota flags.");
1347 } 1364 }
1348 } else 1365 } else
1349 xfs_notice(mp, "Quotacheck: Done."); 1366 xfs_notice(mp, "Quotacheck: Done.");
1350 return (error); 1367 return (error);
1351 } 1368 }
1352 1369
1353 /* 1370 /*
1354 * This is called after the superblock has been read in and we're ready to 1371 * This is called after the superblock has been read in and we're ready to
1355 * iget the quota inodes. 1372 * iget the quota inodes.
1356 */ 1373 */
1357 STATIC int 1374 STATIC int
1358 xfs_qm_init_quotainos( 1375 xfs_qm_init_quotainos(
1359 xfs_mount_t *mp) 1376 xfs_mount_t *mp)
1360 { 1377 {
1361 xfs_inode_t *uip, *gip; 1378 xfs_inode_t *uip, *gip;
1362 int error; 1379 int error;
1363 __int64_t sbflags; 1380 __int64_t sbflags;
1364 uint flags; 1381 uint flags;
1365 1382
1366 ASSERT(mp->m_quotainfo); 1383 ASSERT(mp->m_quotainfo);
1367 uip = gip = NULL; 1384 uip = gip = NULL;
1368 sbflags = 0; 1385 sbflags = 0;
1369 flags = 0; 1386 flags = 0;
1370 1387
1371 /* 1388 /*
1372 * Get the uquota and gquota inodes 1389 * Get the uquota and gquota inodes
1373 */ 1390 */
1374 if (xfs_sb_version_hasquota(&mp->m_sb)) { 1391 if (xfs_sb_version_hasquota(&mp->m_sb)) {
1375 if (XFS_IS_UQUOTA_ON(mp) && 1392 if (XFS_IS_UQUOTA_ON(mp) &&
1376 mp->m_sb.sb_uquotino != NULLFSINO) { 1393 mp->m_sb.sb_uquotino != NULLFSINO) {
1377 ASSERT(mp->m_sb.sb_uquotino > 0); 1394 ASSERT(mp->m_sb.sb_uquotino > 0);
1378 if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 1395 if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
1379 0, 0, &uip))) 1396 0, 0, &uip)))
1380 return XFS_ERROR(error); 1397 return XFS_ERROR(error);
1381 } 1398 }
1382 if (XFS_IS_OQUOTA_ON(mp) && 1399 if (XFS_IS_OQUOTA_ON(mp) &&
1383 mp->m_sb.sb_gquotino != NULLFSINO) { 1400 mp->m_sb.sb_gquotino != NULLFSINO) {
1384 ASSERT(mp->m_sb.sb_gquotino > 0); 1401 ASSERT(mp->m_sb.sb_gquotino > 0);
1385 if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 1402 if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
1386 0, 0, &gip))) { 1403 0, 0, &gip))) {
1387 if (uip) 1404 if (uip)
1388 IRELE(uip); 1405 IRELE(uip);
1389 return XFS_ERROR(error); 1406 return XFS_ERROR(error);
1390 } 1407 }
1391 } 1408 }
1392 } else { 1409 } else {
1393 flags |= XFS_QMOPT_SBVERSION; 1410 flags |= XFS_QMOPT_SBVERSION;
1394 sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | 1411 sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
1395 XFS_SB_GQUOTINO | XFS_SB_QFLAGS); 1412 XFS_SB_GQUOTINO | XFS_SB_QFLAGS);
1396 } 1413 }
1397 1414
1398 /* 1415 /*
1399 * Create the two inodes, if they don't exist already. The changes 1416 * Create the two inodes, if they don't exist already. The changes
1400 * made above will get added to a transaction and logged in one of 1417 * made above will get added to a transaction and logged in one of
1401 * the qino_alloc calls below. If the device is readonly, 1418 * the qino_alloc calls below. If the device is readonly,
1402 * temporarily switch to read-write to do this. 1419 * temporarily switch to read-write to do this.
1403 */ 1420 */
1404 if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) { 1421 if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) {
1405 if ((error = xfs_qm_qino_alloc(mp, &uip, 1422 if ((error = xfs_qm_qino_alloc(mp, &uip,
1406 sbflags | XFS_SB_UQUOTINO, 1423 sbflags | XFS_SB_UQUOTINO,
1407 flags | XFS_QMOPT_UQUOTA))) 1424 flags | XFS_QMOPT_UQUOTA)))
1408 return XFS_ERROR(error); 1425 return XFS_ERROR(error);
1409 1426
1410 flags &= ~XFS_QMOPT_SBVERSION; 1427 flags &= ~XFS_QMOPT_SBVERSION;
1411 } 1428 }
1412 if (XFS_IS_OQUOTA_ON(mp) && gip == NULL) { 1429 if (XFS_IS_OQUOTA_ON(mp) && gip == NULL) {
1413 flags |= (XFS_IS_GQUOTA_ON(mp) ? 1430 flags |= (XFS_IS_GQUOTA_ON(mp) ?
1414 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA); 1431 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
1415 error = xfs_qm_qino_alloc(mp, &gip, 1432 error = xfs_qm_qino_alloc(mp, &gip,
1416 sbflags | XFS_SB_GQUOTINO, flags); 1433 sbflags | XFS_SB_GQUOTINO, flags);
1417 if (error) { 1434 if (error) {
1418 if (uip) 1435 if (uip)
1419 IRELE(uip); 1436 IRELE(uip);
1420 1437
1421 return XFS_ERROR(error); 1438 return XFS_ERROR(error);
1422 } 1439 }
1423 } 1440 }
1424 1441
1425 mp->m_quotainfo->qi_uquotaip = uip; 1442 mp->m_quotainfo->qi_uquotaip = uip;
1426 mp->m_quotainfo->qi_gquotaip = gip; 1443 mp->m_quotainfo->qi_gquotaip = gip;
1427 1444
1428 return 0; 1445 return 0;
1429 } 1446 }
1430 1447
1431 STATIC void 1448 STATIC void
1432 xfs_qm_dqfree_one( 1449 xfs_qm_dqfree_one(
1433 struct xfs_dquot *dqp) 1450 struct xfs_dquot *dqp)
1434 { 1451 {
1435 struct xfs_mount *mp = dqp->q_mount; 1452 struct xfs_mount *mp = dqp->q_mount;
1436 struct xfs_quotainfo *qi = mp->m_quotainfo; 1453 struct xfs_quotainfo *qi = mp->m_quotainfo;
1437 1454
1438 mutex_lock(&qi->qi_tree_lock); 1455 mutex_lock(&qi->qi_tree_lock);
1439 radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags), 1456 radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags),
1440 be32_to_cpu(dqp->q_core.d_id)); 1457 be32_to_cpu(dqp->q_core.d_id));
1441 1458
1442 qi->qi_dquots--; 1459 qi->qi_dquots--;
1443 mutex_unlock(&qi->qi_tree_lock); 1460 mutex_unlock(&qi->qi_tree_lock);
1444 1461
1445 xfs_qm_dqdestroy(dqp); 1462 xfs_qm_dqdestroy(dqp);
1446 } 1463 }
1447 1464
1448 STATIC void 1465 STATIC void
1449 xfs_qm_dqreclaim_one( 1466 xfs_qm_dqreclaim_one(
1450 struct xfs_dquot *dqp, 1467 struct xfs_dquot *dqp,
1451 struct list_head *buffer_list, 1468 struct list_head *buffer_list,
1452 struct list_head *dispose_list) 1469 struct list_head *dispose_list)
1453 { 1470 {
1454 struct xfs_mount *mp = dqp->q_mount; 1471 struct xfs_mount *mp = dqp->q_mount;
1455 struct xfs_quotainfo *qi = mp->m_quotainfo; 1472 struct xfs_quotainfo *qi = mp->m_quotainfo;
1456 int error; 1473 int error;
1457 1474
1458 if (!xfs_dqlock_nowait(dqp)) 1475 if (!xfs_dqlock_nowait(dqp))
1459 goto out_move_tail; 1476 goto out_move_tail;
1460 1477
1461 /* 1478 /*
1462 * This dquot has acquired a reference in the meantime remove it from 1479 * This dquot has acquired a reference in the meantime remove it from
1463 * the freelist and try again. 1480 * the freelist and try again.
1464 */ 1481 */
1465 if (dqp->q_nrefs) { 1482 if (dqp->q_nrefs) {
1466 xfs_dqunlock(dqp); 1483 xfs_dqunlock(dqp);
1467 1484
1468 trace_xfs_dqreclaim_want(dqp); 1485 trace_xfs_dqreclaim_want(dqp);
1469 XFS_STATS_INC(xs_qm_dqwants); 1486 XFS_STATS_INC(xs_qm_dqwants);
1470 1487
1471 list_del_init(&dqp->q_lru); 1488 list_del_init(&dqp->q_lru);
1472 qi->qi_lru_count--; 1489 qi->qi_lru_count--;
1473 XFS_STATS_DEC(xs_qm_dquot_unused); 1490 XFS_STATS_DEC(xs_qm_dquot_unused);
1474 return; 1491 return;
1475 } 1492 }
1476 1493
1477 /* 1494 /*
1478 * Try to grab the flush lock. If this dquot is in the process of 1495 * Try to grab the flush lock. If this dquot is in the process of
1479 * getting flushed to disk, we don't want to reclaim it. 1496 * getting flushed to disk, we don't want to reclaim it.
1480 */ 1497 */
1481 if (!xfs_dqflock_nowait(dqp)) 1498 if (!xfs_dqflock_nowait(dqp))
1482 goto out_unlock_move_tail; 1499 goto out_unlock_move_tail;
1483 1500
1484 if (XFS_DQ_IS_DIRTY(dqp)) { 1501 if (XFS_DQ_IS_DIRTY(dqp)) {
1485 struct xfs_buf *bp = NULL; 1502 struct xfs_buf *bp = NULL;
1486 1503
1487 trace_xfs_dqreclaim_dirty(dqp); 1504 trace_xfs_dqreclaim_dirty(dqp);
1488 1505
1489 error = xfs_qm_dqflush(dqp, &bp); 1506 error = xfs_qm_dqflush(dqp, &bp);
1490 if (error) { 1507 if (error) {
1491 xfs_warn(mp, "%s: dquot %p flush failed", 1508 xfs_warn(mp, "%s: dquot %p flush failed",
1492 __func__, dqp); 1509 __func__, dqp);
1493 goto out_unlock_move_tail; 1510 goto out_unlock_move_tail;
1494 } 1511 }
1495 1512
1496 xfs_buf_delwri_queue(bp, buffer_list); 1513 xfs_buf_delwri_queue(bp, buffer_list);
1497 xfs_buf_relse(bp); 1514 xfs_buf_relse(bp);
1498 /* 1515 /*
1499 * Give the dquot another try on the freelist, as the 1516 * Give the dquot another try on the freelist, as the
1500 * flushing will take some time. 1517 * flushing will take some time.
1501 */ 1518 */
1502 goto out_unlock_move_tail; 1519 goto out_unlock_move_tail;
1503 } 1520 }
1504 xfs_dqfunlock(dqp); 1521 xfs_dqfunlock(dqp);
1505 1522
1506 /* 1523 /*
1507 * Prevent lookups now that we are past the point of no return. 1524 * Prevent lookups now that we are past the point of no return.
1508 */ 1525 */
1509 dqp->dq_flags |= XFS_DQ_FREEING; 1526 dqp->dq_flags |= XFS_DQ_FREEING;
1510 xfs_dqunlock(dqp); 1527 xfs_dqunlock(dqp);
1511 1528
1512 ASSERT(dqp->q_nrefs == 0); 1529 ASSERT(dqp->q_nrefs == 0);
1513 list_move_tail(&dqp->q_lru, dispose_list); 1530 list_move_tail(&dqp->q_lru, dispose_list);
1514 qi->qi_lru_count--; 1531 qi->qi_lru_count--;
1515 XFS_STATS_DEC(xs_qm_dquot_unused); 1532 XFS_STATS_DEC(xs_qm_dquot_unused);
1516 1533
1517 trace_xfs_dqreclaim_done(dqp); 1534 trace_xfs_dqreclaim_done(dqp);
1518 XFS_STATS_INC(xs_qm_dqreclaims); 1535 XFS_STATS_INC(xs_qm_dqreclaims);
1519 return; 1536 return;
1520 1537
1521 /* 1538 /*
1522 * Move the dquot to the tail of the list so that we don't spin on it. 1539 * Move the dquot to the tail of the list so that we don't spin on it.
1523 */ 1540 */
1524 out_unlock_move_tail: 1541 out_unlock_move_tail:
1525 xfs_dqunlock(dqp); 1542 xfs_dqunlock(dqp);
1526 out_move_tail: 1543 out_move_tail:
1527 list_move_tail(&dqp->q_lru, &qi->qi_lru_list); 1544 list_move_tail(&dqp->q_lru, &qi->qi_lru_list);
1528 trace_xfs_dqreclaim_busy(dqp); 1545 trace_xfs_dqreclaim_busy(dqp);
1529 XFS_STATS_INC(xs_qm_dqreclaim_misses); 1546 XFS_STATS_INC(xs_qm_dqreclaim_misses);
1530 } 1547 }
1531 1548
1532 STATIC int 1549 STATIC int
1533 xfs_qm_shake( 1550 xfs_qm_shake(
1534 struct shrinker *shrink, 1551 struct shrinker *shrink,
1535 struct shrink_control *sc) 1552 struct shrink_control *sc)
1536 { 1553 {
1537 struct xfs_quotainfo *qi = 1554 struct xfs_quotainfo *qi =
1538 container_of(shrink, struct xfs_quotainfo, qi_shrinker); 1555 container_of(shrink, struct xfs_quotainfo, qi_shrinker);
1539 int nr_to_scan = sc->nr_to_scan; 1556 int nr_to_scan = sc->nr_to_scan;
1540 LIST_HEAD (buffer_list); 1557 LIST_HEAD (buffer_list);
1541 LIST_HEAD (dispose_list); 1558 LIST_HEAD (dispose_list);
1542 struct xfs_dquot *dqp; 1559 struct xfs_dquot *dqp;
1543 int error; 1560 int error;
1544 1561
1545 if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) 1562 if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
1546 return 0; 1563 return 0;
1547 if (!nr_to_scan) 1564 if (!nr_to_scan)
1548 goto out; 1565 goto out;
1549 1566
1550 mutex_lock(&qi->qi_lru_lock); 1567 mutex_lock(&qi->qi_lru_lock);
1551 while (!list_empty(&qi->qi_lru_list)) { 1568 while (!list_empty(&qi->qi_lru_list)) {
1552 if (nr_to_scan-- <= 0) 1569 if (nr_to_scan-- <= 0)
1553 break; 1570 break;
1554 dqp = list_first_entry(&qi->qi_lru_list, struct xfs_dquot, 1571 dqp = list_first_entry(&qi->qi_lru_list, struct xfs_dquot,
1555 q_lru); 1572 q_lru);
1556 xfs_qm_dqreclaim_one(dqp, &buffer_list, &dispose_list); 1573 xfs_qm_dqreclaim_one(dqp, &buffer_list, &dispose_list);
1557 } 1574 }
1558 mutex_unlock(&qi->qi_lru_lock); 1575 mutex_unlock(&qi->qi_lru_lock);
1559 1576
1560 error = xfs_buf_delwri_submit(&buffer_list); 1577 error = xfs_buf_delwri_submit(&buffer_list);
1561 if (error) 1578 if (error)
1562 xfs_warn(NULL, "%s: dquot reclaim failed", __func__); 1579 xfs_warn(NULL, "%s: dquot reclaim failed", __func__);
1563 1580
1564 while (!list_empty(&dispose_list)) { 1581 while (!list_empty(&dispose_list)) {
1565 dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru); 1582 dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru);
1566 list_del_init(&dqp->q_lru); 1583 list_del_init(&dqp->q_lru);
1567 xfs_qm_dqfree_one(dqp); 1584 xfs_qm_dqfree_one(dqp);
1568 } 1585 }
1569 1586
1570 out: 1587 out:
1571 return (qi->qi_lru_count / 100) * sysctl_vfs_cache_pressure; 1588 return (qi->qi_lru_count / 100) * sysctl_vfs_cache_pressure;
1572 } 1589 }
1573 1590
1574 /* 1591 /*
1575 * Start a transaction and write the incore superblock changes to 1592 * Start a transaction and write the incore superblock changes to
1576 * disk. flags parameter indicates which fields have changed. 1593 * disk. flags parameter indicates which fields have changed.
1577 */ 1594 */
1578 int 1595 int
1579 xfs_qm_write_sb_changes( 1596 xfs_qm_write_sb_changes(
1580 xfs_mount_t *mp, 1597 xfs_mount_t *mp,
1581 __int64_t flags) 1598 __int64_t flags)
1582 { 1599 {
1583 xfs_trans_t *tp; 1600 xfs_trans_t *tp;
1584 int error; 1601 int error;
1585 1602
1586 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); 1603 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
1587 error = xfs_trans_reserve(tp, 0, XFS_QM_SBCHANGE_LOG_RES(mp), 1604 error = xfs_trans_reserve(tp, 0, XFS_QM_SBCHANGE_LOG_RES(mp),
1588 0, 0, XFS_DEFAULT_LOG_COUNT); 1605 0, 0, XFS_DEFAULT_LOG_COUNT);
1589 if (error) { 1606 if (error) {
1590 xfs_trans_cancel(tp, 0); 1607 xfs_trans_cancel(tp, 0);
1591 return error; 1608 return error;
1592 } 1609 }
1593 1610
1594 xfs_mod_sb(tp, flags); 1611 xfs_mod_sb(tp, flags);
1595 error = xfs_trans_commit(tp, 0); 1612 error = xfs_trans_commit(tp, 0);
1596 1613
1597 return error; 1614 return error;
1598 } 1615 }
1599 1616
1600 1617
1601 /* --------------- utility functions for vnodeops ---------------- */ 1618 /* --------------- utility functions for vnodeops ---------------- */
1602 1619
1603 1620
1604 /* 1621 /*
1605 * Given an inode, a uid, gid and prid make sure that we have 1622 * Given an inode, a uid, gid and prid make sure that we have
1606 * allocated relevant dquot(s) on disk, and that we won't exceed inode 1623 * allocated relevant dquot(s) on disk, and that we won't exceed inode
1607 * quotas by creating this file. 1624 * quotas by creating this file.
1608 * This also attaches dquot(s) to the given inode after locking it, 1625 * This also attaches dquot(s) to the given inode after locking it,
1609 * and returns the dquots corresponding to the uid and/or gid. 1626 * and returns the dquots corresponding to the uid and/or gid.
1610 * 1627 *
1611 * in : inode (unlocked) 1628 * in : inode (unlocked)
1612 * out : udquot, gdquot with references taken and unlocked 1629 * out : udquot, gdquot with references taken and unlocked
1613 */ 1630 */
1614 int 1631 int
1615 xfs_qm_vop_dqalloc( 1632 xfs_qm_vop_dqalloc(
1616 struct xfs_inode *ip, 1633 struct xfs_inode *ip,
1617 uid_t uid, 1634 uid_t uid,
1618 gid_t gid, 1635 gid_t gid,
1619 prid_t prid, 1636 prid_t prid,
1620 uint flags, 1637 uint flags,
1621 struct xfs_dquot **O_udqpp, 1638 struct xfs_dquot **O_udqpp,
1622 struct xfs_dquot **O_gdqpp) 1639 struct xfs_dquot **O_gdqpp)
1623 { 1640 {
1624 struct xfs_mount *mp = ip->i_mount; 1641 struct xfs_mount *mp = ip->i_mount;
1625 struct xfs_dquot *uq, *gq; 1642 struct xfs_dquot *uq, *gq;
1626 int error; 1643 int error;
1627 uint lockflags; 1644 uint lockflags;
1628 1645
1629 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) 1646 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
1630 return 0; 1647 return 0;
1631 1648
1632 lockflags = XFS_ILOCK_EXCL; 1649 lockflags = XFS_ILOCK_EXCL;
1633 xfs_ilock(ip, lockflags); 1650 xfs_ilock(ip, lockflags);
1634 1651
1635 if ((flags & XFS_QMOPT_INHERIT) && XFS_INHERIT_GID(ip)) 1652 if ((flags & XFS_QMOPT_INHERIT) && XFS_INHERIT_GID(ip))
1636 gid = ip->i_d.di_gid; 1653 gid = ip->i_d.di_gid;
1637 1654
1638 /* 1655 /*
1639 * Attach the dquot(s) to this inode, doing a dquot allocation 1656 * Attach the dquot(s) to this inode, doing a dquot allocation
1640 * if necessary. The dquot(s) will not be locked. 1657 * if necessary. The dquot(s) will not be locked.
1641 */ 1658 */
1642 if (XFS_NOT_DQATTACHED(mp, ip)) { 1659 if (XFS_NOT_DQATTACHED(mp, ip)) {
1643 error = xfs_qm_dqattach_locked(ip, XFS_QMOPT_DQALLOC); 1660 error = xfs_qm_dqattach_locked(ip, XFS_QMOPT_DQALLOC);
1644 if (error) { 1661 if (error) {
1645 xfs_iunlock(ip, lockflags); 1662 xfs_iunlock(ip, lockflags);
1646 return error; 1663 return error;
1647 } 1664 }
1648 } 1665 }
1649 1666
1650 uq = gq = NULL; 1667 uq = gq = NULL;
1651 if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) { 1668 if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) {
1652 if (ip->i_d.di_uid != uid) { 1669 if (ip->i_d.di_uid != uid) {
1653 /* 1670 /*
1654 * What we need is the dquot that has this uid, and 1671 * What we need is the dquot that has this uid, and
1655 * if we send the inode to dqget, the uid of the inode 1672 * if we send the inode to dqget, the uid of the inode
1656 * takes priority over what's sent in the uid argument. 1673 * takes priority over what's sent in the uid argument.
1657 * We must unlock inode here before calling dqget if 1674 * We must unlock inode here before calling dqget if
1658 * we're not sending the inode, because otherwise 1675 * we're not sending the inode, because otherwise
1659 * we'll deadlock by doing trans_reserve while 1676 * we'll deadlock by doing trans_reserve while
1660 * holding ilock. 1677 * holding ilock.
1661 */ 1678 */
1662 xfs_iunlock(ip, lockflags); 1679 xfs_iunlock(ip, lockflags);
1663 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid, 1680 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
1664 XFS_DQ_USER, 1681 XFS_DQ_USER,
1665 XFS_QMOPT_DQALLOC | 1682 XFS_QMOPT_DQALLOC |
1666 XFS_QMOPT_DOWARN, 1683 XFS_QMOPT_DOWARN,
1667 &uq))) { 1684 &uq))) {
1668 ASSERT(error != ENOENT); 1685 ASSERT(error != ENOENT);
1669 return error; 1686 return error;
1670 } 1687 }
1671 /* 1688 /*
1672 * Get the ilock in the right order. 1689 * Get the ilock in the right order.
1673 */ 1690 */
1674 xfs_dqunlock(uq); 1691 xfs_dqunlock(uq);
1675 lockflags = XFS_ILOCK_SHARED; 1692 lockflags = XFS_ILOCK_SHARED;
1676 xfs_ilock(ip, lockflags); 1693 xfs_ilock(ip, lockflags);
1677 } else { 1694 } else {
1678 /* 1695 /*
1679 * Take an extra reference, because we'll return 1696 * Take an extra reference, because we'll return
1680 * this to caller 1697 * this to caller
1681 */ 1698 */
1682 ASSERT(ip->i_udquot); 1699 ASSERT(ip->i_udquot);
1683 uq = xfs_qm_dqhold(ip->i_udquot); 1700 uq = xfs_qm_dqhold(ip->i_udquot);
1684 } 1701 }
1685 } 1702 }
1686 if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { 1703 if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
1687 if (ip->i_d.di_gid != gid) { 1704 if (ip->i_d.di_gid != gid) {
1688 xfs_iunlock(ip, lockflags); 1705 xfs_iunlock(ip, lockflags);
1689 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid, 1706 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
1690 XFS_DQ_GROUP, 1707 XFS_DQ_GROUP,
1691 XFS_QMOPT_DQALLOC | 1708 XFS_QMOPT_DQALLOC |
1692 XFS_QMOPT_DOWARN, 1709 XFS_QMOPT_DOWARN,
1693 &gq))) { 1710 &gq))) {
1694 if (uq) 1711 if (uq)
1695 xfs_qm_dqrele(uq); 1712 xfs_qm_dqrele(uq);
1696 ASSERT(error != ENOENT); 1713 ASSERT(error != ENOENT);
1697 return error; 1714 return error;
1698 } 1715 }
1699 xfs_dqunlock(gq); 1716 xfs_dqunlock(gq);
1700 lockflags = XFS_ILOCK_SHARED; 1717 lockflags = XFS_ILOCK_SHARED;
1701 xfs_ilock(ip, lockflags); 1718 xfs_ilock(ip, lockflags);
1702 } else { 1719 } else {
1703 ASSERT(ip->i_gdquot); 1720 ASSERT(ip->i_gdquot);
1704 gq = xfs_qm_dqhold(ip->i_gdquot); 1721 gq = xfs_qm_dqhold(ip->i_gdquot);
1705 } 1722 }
1706 } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { 1723 } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
1707 if (xfs_get_projid(ip) != prid) { 1724 if (xfs_get_projid(ip) != prid) {
1708 xfs_iunlock(ip, lockflags); 1725 xfs_iunlock(ip, lockflags);
1709 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, 1726 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
1710 XFS_DQ_PROJ, 1727 XFS_DQ_PROJ,
1711 XFS_QMOPT_DQALLOC | 1728 XFS_QMOPT_DQALLOC |
1712 XFS_QMOPT_DOWARN, 1729 XFS_QMOPT_DOWARN,
1713 &gq))) { 1730 &gq))) {
1714 if (uq) 1731 if (uq)
1715 xfs_qm_dqrele(uq); 1732 xfs_qm_dqrele(uq);
1716 ASSERT(error != ENOENT); 1733 ASSERT(error != ENOENT);
1717 return (error); 1734 return (error);
1718 } 1735 }
1719 xfs_dqunlock(gq); 1736 xfs_dqunlock(gq);
1720 lockflags = XFS_ILOCK_SHARED; 1737 lockflags = XFS_ILOCK_SHARED;
1721 xfs_ilock(ip, lockflags); 1738 xfs_ilock(ip, lockflags);
1722 } else { 1739 } else {
1723 ASSERT(ip->i_gdquot); 1740 ASSERT(ip->i_gdquot);
1724 gq = xfs_qm_dqhold(ip->i_gdquot); 1741 gq = xfs_qm_dqhold(ip->i_gdquot);
1725 } 1742 }
1726 } 1743 }
1727 if (uq) 1744 if (uq)
1728 trace_xfs_dquot_dqalloc(ip); 1745 trace_xfs_dquot_dqalloc(ip);
1729 1746
1730 xfs_iunlock(ip, lockflags); 1747 xfs_iunlock(ip, lockflags);
1731 if (O_udqpp) 1748 if (O_udqpp)
1732 *O_udqpp = uq; 1749 *O_udqpp = uq;
1733 else if (uq) 1750 else if (uq)
1734 xfs_qm_dqrele(uq); 1751 xfs_qm_dqrele(uq);
1735 if (O_gdqpp) 1752 if (O_gdqpp)
1736 *O_gdqpp = gq; 1753 *O_gdqpp = gq;
1737 else if (gq) 1754 else if (gq)
1738 xfs_qm_dqrele(gq); 1755 xfs_qm_dqrele(gq);
1739 return 0; 1756 return 0;
1740 } 1757 }
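
xfs_qm_vop_dqalloc() hands back referenced dquots for the ids the caller is about to assign; anything the caller does not end up attaching must be dropped with xfs_qm_dqrele(). A minimal caller sketch in the spirit of the inode-create path, not part of this patch (the current_fsuid()/current_fsgid() usage and the error handling are illustrative assumptions):

	struct xfs_dquot	*udqp = NULL, *gdqp = NULL;
	int			error;

	/* reserve dquots for the prospective owner before starting the transaction */
	error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
				   XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
				   &udqp, &gdqp);
	if (error)
		return error;

	/* ... allocate the inode, then xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp) ... */

	xfs_qm_dqrele(udqp);	/* drop the extra references taken above */
	xfs_qm_dqrele(gdqp);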
1741 1758
1742 /* 1759 /*
1743 * Actually transfer ownership, and do dquot modifications. 1760 * Actually transfer ownership, and do dquot modifications.
1744 * These were already reserved. 1761 * These were already reserved.
1745 */ 1762 */
1746 xfs_dquot_t * 1763 xfs_dquot_t *
1747 xfs_qm_vop_chown( 1764 xfs_qm_vop_chown(
1748 xfs_trans_t *tp, 1765 xfs_trans_t *tp,
1749 xfs_inode_t *ip, 1766 xfs_inode_t *ip,
1750 xfs_dquot_t **IO_olddq, 1767 xfs_dquot_t **IO_olddq,
1751 xfs_dquot_t *newdq) 1768 xfs_dquot_t *newdq)
1752 { 1769 {
1753 xfs_dquot_t *prevdq; 1770 xfs_dquot_t *prevdq;
1754 uint bfield = XFS_IS_REALTIME_INODE(ip) ? 1771 uint bfield = XFS_IS_REALTIME_INODE(ip) ?
1755 XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; 1772 XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT;
1756 1773
1757 1774
1758 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 1775 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1759 ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount)); 1776 ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount));
1760 1777
1761 /* old dquot */ 1778 /* old dquot */
1762 prevdq = *IO_olddq; 1779 prevdq = *IO_olddq;
1763 ASSERT(prevdq); 1780 ASSERT(prevdq);
1764 ASSERT(prevdq != newdq); 1781 ASSERT(prevdq != newdq);
1765 1782
1766 xfs_trans_mod_dquot(tp, prevdq, bfield, -(ip->i_d.di_nblocks)); 1783 xfs_trans_mod_dquot(tp, prevdq, bfield, -(ip->i_d.di_nblocks));
1767 xfs_trans_mod_dquot(tp, prevdq, XFS_TRANS_DQ_ICOUNT, -1); 1784 xfs_trans_mod_dquot(tp, prevdq, XFS_TRANS_DQ_ICOUNT, -1);
1768 1785
1769 /* the sparkling new dquot */ 1786 /* the sparkling new dquot */
1770 xfs_trans_mod_dquot(tp, newdq, bfield, ip->i_d.di_nblocks); 1787 xfs_trans_mod_dquot(tp, newdq, bfield, ip->i_d.di_nblocks);
1771 xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1); 1788 xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1);
1772 1789
1773 /* 1790 /*
1774 * Take an extra reference, because the inode is going to keep 1791 * Take an extra reference, because the inode is going to keep
1775 * this dquot pointer even after the trans_commit. 1792 * this dquot pointer even after the trans_commit.
1776 */ 1793 */
1777 *IO_olddq = xfs_qm_dqhold(newdq); 1794 *IO_olddq = xfs_qm_dqhold(newdq);
1778 1795
1779 return prevdq; 1796 return prevdq;
1780 } 1797 }
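
The dquot returned here is the previous owner's; the caller holds on to it until the transaction commits and only then drops the reference. A hedged caller sketch (variable names are illustrative, not taken from this patch):

	struct xfs_dquot	*olddq;

	/* swap the user dquot pointer inside the chown transaction */
	olddq = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp);

	/* ... xfs_trans_commit() ... */

	xfs_qm_dqrele(olddq);	/* release the old owner's dquot after commit */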
1781 1798
1782 /* 1799 /*
1783 * Quota reservations for setattr(AT_UID|AT_GID|AT_PROJID). 1800 * Quota reservations for setattr(AT_UID|AT_GID|AT_PROJID).
1784 */ 1801 */
1785 int 1802 int
1786 xfs_qm_vop_chown_reserve( 1803 xfs_qm_vop_chown_reserve(
1787 xfs_trans_t *tp, 1804 xfs_trans_t *tp,
1788 xfs_inode_t *ip, 1805 xfs_inode_t *ip,
1789 xfs_dquot_t *udqp, 1806 xfs_dquot_t *udqp,
1790 xfs_dquot_t *gdqp, 1807 xfs_dquot_t *gdqp,
1791 uint flags) 1808 uint flags)
1792 { 1809 {
1793 xfs_mount_t *mp = ip->i_mount; 1810 xfs_mount_t *mp = ip->i_mount;
1794 uint delblks, blkflags, prjflags = 0; 1811 uint delblks, blkflags, prjflags = 0;
1795 xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq; 1812 xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq;
1796 int error; 1813 int error;
1797 1814
1798 1815
1799 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 1816 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
1800 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 1817 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
1801 1818
1802 delblks = ip->i_delayed_blks; 1819 delblks = ip->i_delayed_blks;
1803 delblksudq = delblksgdq = unresudq = unresgdq = NULL; 1820 delblksudq = delblksgdq = unresudq = unresgdq = NULL;
1804 blkflags = XFS_IS_REALTIME_INODE(ip) ? 1821 blkflags = XFS_IS_REALTIME_INODE(ip) ?
1805 XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; 1822 XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
1806 1823
1807 if (XFS_IS_UQUOTA_ON(mp) && udqp && 1824 if (XFS_IS_UQUOTA_ON(mp) && udqp &&
1808 ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) { 1825 ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) {
1809 delblksudq = udqp; 1826 delblksudq = udqp;
1810 /* 1827 /*
1811 * If there are delayed allocation blocks, then we have to 1828 * If there are delayed allocation blocks, then we have to
1812 * unreserve those from the old dquot, and add them to the 1829 * unreserve those from the old dquot, and add them to the
1813 * new dquot. 1830 * new dquot.
1814 */ 1831 */
1815 if (delblks) { 1832 if (delblks) {
1816 ASSERT(ip->i_udquot); 1833 ASSERT(ip->i_udquot);
1817 unresudq = ip->i_udquot; 1834 unresudq = ip->i_udquot;
1818 } 1835 }
1819 } 1836 }
1820 if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) { 1837 if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) {
1821 if (XFS_IS_PQUOTA_ON(ip->i_mount) && 1838 if (XFS_IS_PQUOTA_ON(ip->i_mount) &&
1822 xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id)) 1839 xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id))
1823 prjflags = XFS_QMOPT_ENOSPC; 1840 prjflags = XFS_QMOPT_ENOSPC;
1824 1841
1825 if (prjflags || 1842 if (prjflags ||
1826 (XFS_IS_GQUOTA_ON(ip->i_mount) && 1843 (XFS_IS_GQUOTA_ON(ip->i_mount) &&
1827 ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id))) { 1844 ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id))) {
1828 delblksgdq = gdqp; 1845 delblksgdq = gdqp;
1829 if (delblks) { 1846 if (delblks) {
1830 ASSERT(ip->i_gdquot); 1847 ASSERT(ip->i_gdquot);
1831 unresgdq = ip->i_gdquot; 1848 unresgdq = ip->i_gdquot;
1832 } 1849 }
1833 } 1850 }
1834 } 1851 }
1835 1852
1836 if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, 1853 if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
1837 delblksudq, delblksgdq, ip->i_d.di_nblocks, 1, 1854 delblksudq, delblksgdq, ip->i_d.di_nblocks, 1,
1838 flags | blkflags | prjflags))) 1855 flags | blkflags | prjflags)))
1839 return (error); 1856 return (error);
1840 1857
1841 /* 1858 /*
1842 * Do the delayed blks reservations/unreservations now. Since these 1859 * Do the delayed blks reservations/unreservations now. Since these
1843 * are done without the help of a transaction, if a reservation fails, 1860 * are done without the help of a transaction, if a reservation fails,
1844 * its previous reservations won't be automatically undone by trans 1861 * its previous reservations won't be automatically undone by trans
1845 * code. So, we have to do it manually here. 1862 * code. So, we have to do it manually here.
1846 */ 1863 */
1847 if (delblks) { 1864 if (delblks) {
1848 /* 1865 /*
1849 * Do the reservations first. Unreservation can't fail. 1866 * Do the reservations first. Unreservation can't fail.
1850 */ 1867 */
1851 ASSERT(delblksudq || delblksgdq); 1868 ASSERT(delblksudq || delblksgdq);
1852 ASSERT(unresudq || unresgdq); 1869 ASSERT(unresudq || unresgdq);
1853 if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, 1870 if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
1854 delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0, 1871 delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0,
1855 flags | blkflags | prjflags))) 1872 flags | blkflags | prjflags)))
1856 return (error); 1873 return (error);
1857 xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, 1874 xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
1858 unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0, 1875 unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0,
1859 blkflags); 1876 blkflags);
1860 } 1877 }
1861 1878
1862 return (0); 1879 return (0);
1863 } 1880 }
1864 1881
1865 int 1882 int
1866 xfs_qm_vop_rename_dqattach( 1883 xfs_qm_vop_rename_dqattach(
1867 struct xfs_inode **i_tab) 1884 struct xfs_inode **i_tab)
1868 { 1885 {
1869 struct xfs_mount *mp = i_tab[0]->i_mount; 1886 struct xfs_mount *mp = i_tab[0]->i_mount;
1870 int i; 1887 int i;
1871 1888
1872 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) 1889 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
1873 return 0; 1890 return 0;
1874 1891
1875 for (i = 0; (i < 4 && i_tab[i]); i++) { 1892 for (i = 0; (i < 4 && i_tab[i]); i++) {
1876 struct xfs_inode *ip = i_tab[i]; 1893 struct xfs_inode *ip = i_tab[i];
1877 int error; 1894 int error;
1878 1895
1879 /* 1896 /*
1880 * Watch out for duplicate entries in the table. 1897 * Watch out for duplicate entries in the table.
1881 */ 1898 */
1882 if (i == 0 || ip != i_tab[i-1]) { 1899 if (i == 0 || ip != i_tab[i-1]) {
1883 if (XFS_NOT_DQATTACHED(mp, ip)) { 1900 if (XFS_NOT_DQATTACHED(mp, ip)) {
1884 error = xfs_qm_dqattach(ip, 0); 1901 error = xfs_qm_dqattach(ip, 0);
1885 if (error) 1902 if (error)
1886 return error; 1903 return error;
1887 } 1904 }
1888 } 1905 }
1889 } 1906 }
1890 return 0; 1907 return 0;
1891 } 1908 }
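
Callers collect the inodes touched by a rename into a small table (at most four entries, NULL-terminated when fewer are involved) and let this routine attach dquots to each distinct one. A hedged sketch, ignoring the sorting and locking the real rename path performs:

	struct xfs_inode	*inodes[4] = { src_dp, target_dp, src_ip, target_ip };
	int			error;

	error = xfs_qm_vop_rename_dqattach(inodes);
	if (error)
		return error;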
1892 1909
1893 void 1910 void
1894 xfs_qm_vop_create_dqattach( 1911 xfs_qm_vop_create_dqattach(
1895 struct xfs_trans *tp, 1912 struct xfs_trans *tp,
1896 struct xfs_inode *ip, 1913 struct xfs_inode *ip,
1897 struct xfs_dquot *udqp, 1914 struct xfs_dquot *udqp,
1898 struct xfs_dquot *gdqp) 1915 struct xfs_dquot *gdqp)
1899 { 1916 {
1900 struct xfs_mount *mp = tp->t_mountp; 1917 struct xfs_mount *mp = tp->t_mountp;
1901 1918
1902 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) 1919 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
1903 return; 1920 return;
1904 1921
1905 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 1922 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1906 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 1923 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
1907 1924
1908 if (udqp) { 1925 if (udqp) {
1909 ASSERT(ip->i_udquot == NULL); 1926 ASSERT(ip->i_udquot == NULL);
1910 ASSERT(XFS_IS_UQUOTA_ON(mp)); 1927 ASSERT(XFS_IS_UQUOTA_ON(mp));
1911 ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id)); 1928 ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id));
1912 1929
1913 ip->i_udquot = xfs_qm_dqhold(udqp); 1930 ip->i_udquot = xfs_qm_dqhold(udqp);
1914 xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); 1931 xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
1915 } 1932 }
1916 if (gdqp) { 1933 if (gdqp) {
1917 ASSERT(ip->i_gdquot == NULL); 1934 ASSERT(ip->i_gdquot == NULL);
1918 ASSERT(XFS_IS_OQUOTA_ON(mp)); 1935 ASSERT(XFS_IS_OQUOTA_ON(mp));
1919 ASSERT((XFS_IS_GQUOTA_ON(mp) ? 1936 ASSERT((XFS_IS_GQUOTA_ON(mp) ?
1920 ip->i_d.di_gid : xfs_get_projid(ip)) == 1937 ip->i_d.di_gid : xfs_get_projid(ip)) ==
1921 be32_to_cpu(gdqp->q_core.d_id)); 1938 be32_to_cpu(gdqp->q_core.d_id));
1922 1939
1923 ip->i_gdquot = xfs_qm_dqhold(gdqp); 1940 ip->i_gdquot = xfs_qm_dqhold(gdqp);
1924 xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); 1941 xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
1925 } 1942 }
1926 } 1943 }
1927 1944
1 /* 1 /*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved. 3 * All Rights Reserved.
4 * 4 *
5 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as 6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * This program is distributed in the hope that it would be useful, 9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18 #ifndef __XFS_QM_H__ 18 #ifndef __XFS_QM_H__
19 #define __XFS_QM_H__ 19 #define __XFS_QM_H__
20 20
21 #include "xfs_dquot_item.h" 21 #include "xfs_dquot_item.h"
22 #include "xfs_dquot.h" 22 #include "xfs_dquot.h"
23 #include "xfs_quota_priv.h" 23 #include "xfs_quota_priv.h"
24 24
25 struct xfs_inode; 25 struct xfs_inode;
26 26
27 extern struct kmem_zone *xfs_qm_dqtrxzone; 27 extern struct kmem_zone *xfs_qm_dqtrxzone;
28 28
29 /* 29 /*
30 * This defines the unit of allocation of dquots. 30 * This defines the unit of allocation of dquots.
31 * Currently, it is just one file system block, and a 4K blk contains 30 31 * Currently, it is just one file system block, and a 4K blk contains 30
32 * (136 * 30 = 4080) dquots. It's probably not worth trying to make 32 * (136 * 30 = 4080) dquots. It's probably not worth trying to make
33 * this more dynamic. 33 * this more dynamic.
34 * XXXsup However, if this number is changed, we have to make sure that we don't 34 * XXXsup However, if this number is changed, we have to make sure that we don't
35 * implicitly assume that we do allocations in chunks of a single filesystem 35 * implicitly assume that we do allocations in chunks of a single filesystem
36 * block in the dquot/xqm code. 36 * block in the dquot/xqm code.
37 */ 37 */
38 #define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 38 #define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1
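
For reference, the numbers in the comment above follow directly from the on-disk sizes: sizeof(xfs_disk_dquot_t) is 104 bytes, the 32 bytes of fill bring sizeof(xfs_dqblk_t) to 136, and a 4096-byte block therefore holds 4096 / 136 = 30 dquots (30 * 136 = 4080, with 16 bytes to spare). The CRC fields added by this patch are carved out of that same fill area (4 + 4 + 8 + 16 = 32 bytes), so the on-disk dquot stays 136 bytes and this arithmetic is unchanged.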
39 39
40 /* 40 /*
41 * Various quota information for individual filesystems. 41 * Various quota information for individual filesystems.
42 * The mount structure keeps a pointer to this. 42 * The mount structure keeps a pointer to this.
43 */ 43 */
44 typedef struct xfs_quotainfo { 44 typedef struct xfs_quotainfo {
45 struct radix_tree_root qi_uquota_tree; 45 struct radix_tree_root qi_uquota_tree;
46 struct radix_tree_root qi_gquota_tree; 46 struct radix_tree_root qi_gquota_tree;
47 struct mutex qi_tree_lock; 47 struct mutex qi_tree_lock;
48 xfs_inode_t *qi_uquotaip; /* user quota inode */ 48 xfs_inode_t *qi_uquotaip; /* user quota inode */
49 xfs_inode_t *qi_gquotaip; /* group quota inode */ 49 xfs_inode_t *qi_gquotaip; /* group quota inode */
50 struct list_head qi_lru_list; 50 struct list_head qi_lru_list;
51 struct mutex qi_lru_lock; 51 struct mutex qi_lru_lock;
52 int qi_lru_count; 52 int qi_lru_count;
53 int qi_dquots; 53 int qi_dquots;
54 time_t qi_btimelimit; /* limit for blks timer */ 54 time_t qi_btimelimit; /* limit for blks timer */
55 time_t qi_itimelimit; /* limit for inodes timer */ 55 time_t qi_itimelimit; /* limit for inodes timer */
56 time_t qi_rtbtimelimit;/* limit for rt blks timer */ 56 time_t qi_rtbtimelimit;/* limit for rt blks timer */
57 xfs_qwarncnt_t qi_bwarnlimit; /* limit for blks warnings */ 57 xfs_qwarncnt_t qi_bwarnlimit; /* limit for blks warnings */
58 xfs_qwarncnt_t qi_iwarnlimit; /* limit for inodes warnings */ 58 xfs_qwarncnt_t qi_iwarnlimit; /* limit for inodes warnings */
59 xfs_qwarncnt_t qi_rtbwarnlimit;/* limit for rt blks warnings */ 59 xfs_qwarncnt_t qi_rtbwarnlimit;/* limit for rt blks warnings */
60 struct mutex qi_quotaofflock;/* to serialize quotaoff */ 60 struct mutex qi_quotaofflock;/* to serialize quotaoff */
61 xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */ 61 xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */
62 uint qi_dqperchunk; /* # ondisk dqs in above chunk */ 62 uint qi_dqperchunk; /* # ondisk dqs in above chunk */
63 xfs_qcnt_t qi_bhardlimit; /* default data blk hard limit */ 63 xfs_qcnt_t qi_bhardlimit; /* default data blk hard limit */
64 xfs_qcnt_t qi_bsoftlimit; /* default data blk soft limit */ 64 xfs_qcnt_t qi_bsoftlimit; /* default data blk soft limit */
65 xfs_qcnt_t qi_ihardlimit; /* default inode count hard limit */ 65 xfs_qcnt_t qi_ihardlimit; /* default inode count hard limit */
66 xfs_qcnt_t qi_isoftlimit; /* default inode count soft limit */ 66 xfs_qcnt_t qi_isoftlimit; /* default inode count soft limit */
67 xfs_qcnt_t qi_rtbhardlimit;/* default realtime blk hard limit */ 67 xfs_qcnt_t qi_rtbhardlimit;/* default realtime blk hard limit */
68 xfs_qcnt_t qi_rtbsoftlimit;/* default realtime blk soft limit */ 68 xfs_qcnt_t qi_rtbsoftlimit;/* default realtime blk soft limit */
69 struct shrinker qi_shrinker; 69 struct shrinker qi_shrinker;
70 } xfs_quotainfo_t; 70 } xfs_quotainfo_t;
71 71
72 #define XFS_DQUOT_TREE(qi, type) \ 72 #define XFS_DQUOT_TREE(qi, type) \
73 ((type & XFS_DQ_USER) ? \ 73 ((type & XFS_DQ_USER) ? \
74 &((qi)->qi_uquota_tree) : \ 74 &((qi)->qi_uquota_tree) : \
75 &((qi)->qi_gquota_tree)) 75 &((qi)->qi_gquota_tree))
76 76
77 77
78 extern int xfs_qm_calc_dquots_per_chunk(struct xfs_mount *mp,
79 unsigned int nbblks);
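
xfs_qm_calc_dquots_per_chunk() is new with this patch; its likely shape is a straightforward conversion from basic blocks to on-disk dquots. A hedged sketch of what such a helper would compute (not the actual body, which lives in the dquot code):

	/* nbblks is in 512-byte basic blocks; BBTOB() converts to bytes */
	unsigned int	ndquots = BBTOB(nbblks) / sizeof(struct xfs_dqblk);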
78 extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long); 80 extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long);
79 extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *, 81 extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *,
80 xfs_dquot_t *, xfs_dquot_t *, long, long, uint); 82 xfs_dquot_t *, xfs_dquot_t *, long, long, uint);
81 extern void xfs_trans_dqjoin(xfs_trans_t *, xfs_dquot_t *); 83 extern void xfs_trans_dqjoin(xfs_trans_t *, xfs_dquot_t *);
82 extern void xfs_trans_log_dquot(xfs_trans_t *, xfs_dquot_t *); 84 extern void xfs_trans_log_dquot(xfs_trans_t *, xfs_dquot_t *);
83 85
84 /* 86 /*
85 * We keep the usr and grp dquots separately so that locking will be easier 87 * We keep the usr and grp dquots separately so that locking will be easier
86 * to do at commit time. All transactions that we know of at this point 88 * to do at commit time. All transactions that we know of at this point
87 * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value. 89 * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value.
88 */ 90 */
89 #define XFS_QM_TRANS_MAXDQS 2 91 #define XFS_QM_TRANS_MAXDQS 2
90 typedef struct xfs_dquot_acct { 92 typedef struct xfs_dquot_acct {
91 xfs_dqtrx_t dqa_usrdquots[XFS_QM_TRANS_MAXDQS]; 93 xfs_dqtrx_t dqa_usrdquots[XFS_QM_TRANS_MAXDQS];
92 xfs_dqtrx_t dqa_grpdquots[XFS_QM_TRANS_MAXDQS]; 94 xfs_dqtrx_t dqa_grpdquots[XFS_QM_TRANS_MAXDQS];
93 } xfs_dquot_acct_t; 95 } xfs_dquot_acct_t;
94 96
95 /* 97 /*
96 * Users are allowed to have a usage exceeding their softlimit for 98 * Users are allowed to have a usage exceeding their softlimit for
97 * a period this long. 99 * a period this long.
98 */ 100 */
99 #define XFS_QM_BTIMELIMIT (7 * 24*60*60) /* 1 week */ 101 #define XFS_QM_BTIMELIMIT (7 * 24*60*60) /* 1 week */
100 #define XFS_QM_RTBTIMELIMIT (7 * 24*60*60) /* 1 week */ 102 #define XFS_QM_RTBTIMELIMIT (7 * 24*60*60) /* 1 week */
101 #define XFS_QM_ITIMELIMIT (7 * 24*60*60) /* 1 week */ 103 #define XFS_QM_ITIMELIMIT (7 * 24*60*60) /* 1 week */
102 104
103 #define XFS_QM_BWARNLIMIT 5 105 #define XFS_QM_BWARNLIMIT 5
104 #define XFS_QM_IWARNLIMIT 5 106 #define XFS_QM_IWARNLIMIT 5
105 #define XFS_QM_RTBWARNLIMIT 5 107 #define XFS_QM_RTBWARNLIMIT 5
106 108
107 extern void xfs_qm_destroy_quotainfo(xfs_mount_t *); 109 extern void xfs_qm_destroy_quotainfo(xfs_mount_t *);
108 extern int xfs_qm_quotacheck(xfs_mount_t *); 110 extern int xfs_qm_quotacheck(xfs_mount_t *);
109 extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t); 111 extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
110 112
111 /* dquot stuff */ 113 /* dquot stuff */
112 extern void xfs_qm_dqpurge_all(xfs_mount_t *, uint); 114 extern void xfs_qm_dqpurge_all(xfs_mount_t *, uint);
113 extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint); 115 extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
114 116
115 /* quota ops */ 117 /* quota ops */
116 extern int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint); 118 extern int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint);
117 extern int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint, 119 extern int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint,
118 fs_disk_quota_t *); 120 fs_disk_quota_t *);
119 extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint, 121 extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
120 fs_disk_quota_t *); 122 fs_disk_quota_t *);
121 extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *); 123 extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
122 extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint); 124 extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint);
123 extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint); 125 extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
124 126
125 #endif /* __XFS_QM_H__ */ 127 #endif /* __XFS_QM_H__ */
126 128
1 /* 1 /*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved. 3 * All Rights Reserved.
4 * 4 *
5 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as 6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * This program is distributed in the hope that it would be useful, 9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18 #ifndef __XFS_QUOTA_H__ 18 #ifndef __XFS_QUOTA_H__
19 #define __XFS_QUOTA_H__ 19 #define __XFS_QUOTA_H__
20 20
21 struct xfs_trans; 21 struct xfs_trans;
22 22
23 /* 23 /*
24 * The ondisk form of a dquot structure. 24 * The ondisk form of a dquot structure.
25 */ 25 */
26 #define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */ 26 #define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */
27 #define XFS_DQUOT_VERSION (u_int8_t)0x01 /* latest version number */ 27 #define XFS_DQUOT_VERSION (u_int8_t)0x01 /* latest version number */
28 28
29 /* 29 /*
30 * uid_t and gid_t are hard-coded to 32 bits in the inode. 30 * uid_t and gid_t are hard-coded to 32 bits in the inode.
31 * Hence, an 'id' in a dquot is 32 bits. 31 * Hence, an 'id' in a dquot is 32 bits.
32 */ 32 */
33 typedef __uint32_t xfs_dqid_t; 33 typedef __uint32_t xfs_dqid_t;
34 34
35 /* 35 /*
36 * Even though users may not have quota limits occupying all 64-bits, 36 * Even though users may not have quota limits occupying all 64-bits,
37 * they may need 64-bit accounting. Hence, 64-bit quota-counters, 37 * they may need 64-bit accounting. Hence, 64-bit quota-counters,
38 * and quota-limits. This is a waste in the common case, but hey ... 38 * and quota-limits. This is a waste in the common case, but hey ...
39 */ 39 */
40 typedef __uint64_t xfs_qcnt_t; 40 typedef __uint64_t xfs_qcnt_t;
41 typedef __uint16_t xfs_qwarncnt_t; 41 typedef __uint16_t xfs_qwarncnt_t;
42 42
43 /* 43 /*
44 * This is the main portion of the on-disk representation of quota 44 * This is the main portion of the on-disk representation of quota
45 * information for a user. This is the q_core of the xfs_dquot_t that 45 * information for a user. This is the q_core of the xfs_dquot_t that
46 * is kept in kernel memory. We pad this with some more expansion room 46 * is kept in kernel memory. We pad this with some more expansion room
47 * to construct the on disk structure. 47 * to construct the on disk structure.
48 */ 48 */
49 typedef struct xfs_disk_dquot { 49 typedef struct xfs_disk_dquot {
50 __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */ 50 __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */
51 __u8 d_version; /* dquot version */ 51 __u8 d_version; /* dquot version */
52 __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */ 52 __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */
53 __be32 d_id; /* user,project,group id */ 53 __be32 d_id; /* user,project,group id */
54 __be64 d_blk_hardlimit;/* absolute limit on disk blks */ 54 __be64 d_blk_hardlimit;/* absolute limit on disk blks */
55 __be64 d_blk_softlimit;/* preferred limit on disk blks */ 55 __be64 d_blk_softlimit;/* preferred limit on disk blks */
56 __be64 d_ino_hardlimit;/* maximum # allocated inodes */ 56 __be64 d_ino_hardlimit;/* maximum # allocated inodes */
57 __be64 d_ino_softlimit;/* preferred inode limit */ 57 __be64 d_ino_softlimit;/* preferred inode limit */
58 __be64 d_bcount; /* disk blocks owned by the user */ 58 __be64 d_bcount; /* disk blocks owned by the user */
59 __be64 d_icount; /* inodes owned by the user */ 59 __be64 d_icount; /* inodes owned by the user */
60 __be32 d_itimer; /* zero if within inode limits if not, 60 __be32 d_itimer; /* zero if within inode limits if not,
61 this is when we refuse service */ 61 this is when we refuse service */
62 __be32 d_btimer; /* similar to above; for disk blocks */ 62 __be32 d_btimer; /* similar to above; for disk blocks */
63 __be16 d_iwarns; /* warnings issued wrt num inodes */ 63 __be16 d_iwarns; /* warnings issued wrt num inodes */
64 __be16 d_bwarns; /* warnings issued wrt disk blocks */ 64 __be16 d_bwarns; /* warnings issued wrt disk blocks */
65 __be32 d_pad0; /* 64 bit align */ 65 __be32 d_pad0; /* 64 bit align */
66 __be64 d_rtb_hardlimit;/* absolute limit on realtime blks */ 66 __be64 d_rtb_hardlimit;/* absolute limit on realtime blks */
67 __be64 d_rtb_softlimit;/* preferred limit on RT disk blks */ 67 __be64 d_rtb_softlimit;/* preferred limit on RT disk blks */
68 __be64 d_rtbcount; /* realtime blocks owned */ 68 __be64 d_rtbcount; /* realtime blocks owned */
69 __be32 d_rtbtimer; /* similar to above; for RT disk blocks */ 69 __be32 d_rtbtimer; /* similar to above; for RT disk blocks */
70 __be16 d_rtbwarns; /* warnings issued wrt RT disk blocks */ 70 __be16 d_rtbwarns; /* warnings issued wrt RT disk blocks */
71 __be16 d_pad; 71 __be16 d_pad;
72 } xfs_disk_dquot_t; 72 } xfs_disk_dquot_t;
73 73
74 /* 74 /*
75 * This is what goes on disk. This is separated from the xfs_disk_dquot because 75 * This is what goes on disk. This is separated from the xfs_disk_dquot because
76 * carrying the unnecessary padding would be a waste of memory. 76 * carrying the unnecessary padding would be a waste of memory.
77 */ 77 */
78 typedef struct xfs_dqblk { 78 typedef struct xfs_dqblk {
79 xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */ 79 xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */
80 char dd_fill[32]; /* filling for posterity */ 80 char dd_fill[4]; /* filling for posterity */
81
82 /*
83 * These two are only present on filesystems with the CRC bits set.
84 */
85 __be32 dd_crc; /* checksum */
86 __be64 dd_lsn; /* last modification in log */
87 uuid_t dd_uuid; /* location information */
81 } xfs_dqblk_t; 88 } xfs_dqblk_t;
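
On a CRC-enabled filesystem each 136-byte dqblk carries its own checksum, UUID and LSN, so a buffer full of dquots is verified and stamped one dquot at a time. A hedged sketch of the write-side stamping, assuming the generic xfs_update_cksum() helper and the usual buffer/mount fields (b_addr, sb_uuid, qi_dqperchunk); the real verifier added by this patch may differ in detail:

	struct xfs_dqblk	*d = bp->b_addr;
	int			i;

	for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++, d++) {
		uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
		xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
				 offsetof(struct xfs_dqblk, dd_crc));
	}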
82 89
83 /* 90 /*
84 * flags for q_flags field in the dquot. 91 * flags for q_flags field in the dquot.
85 */ 92 */
86 #define XFS_DQ_USER 0x0001 /* a user quota */ 93 #define XFS_DQ_USER 0x0001 /* a user quota */
87 #define XFS_DQ_PROJ 0x0002 /* project quota */ 94 #define XFS_DQ_PROJ 0x0002 /* project quota */
88 #define XFS_DQ_GROUP 0x0004 /* a group quota */ 95 #define XFS_DQ_GROUP 0x0004 /* a group quota */
89 #define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */ 96 #define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */
90 #define XFS_DQ_FREEING 0x0010 /* dquot is being torn down */ 97 #define XFS_DQ_FREEING 0x0010 /* dquot is being torn down */
91 98
92 #define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) 99 #define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
93 100
94 #define XFS_DQ_FLAGS \ 101 #define XFS_DQ_FLAGS \
95 { XFS_DQ_USER, "USER" }, \ 102 { XFS_DQ_USER, "USER" }, \
96 { XFS_DQ_PROJ, "PROJ" }, \ 103 { XFS_DQ_PROJ, "PROJ" }, \
97 { XFS_DQ_GROUP, "GROUP" }, \ 104 { XFS_DQ_GROUP, "GROUP" }, \
98 { XFS_DQ_DIRTY, "DIRTY" }, \ 105 { XFS_DQ_DIRTY, "DIRTY" }, \
99 { XFS_DQ_FREEING, "FREEING" } 106 { XFS_DQ_FREEING, "FREEING" }
100 107
101 /* 108 /*
102 * In the worst case, when both user and group quotas are on, 109 * In the worst case, when both user and group quotas are on,
103 * we can have a max of three dquots changing in a single transaction. 110 * we can have a max of three dquots changing in a single transaction.
104 */ 111 */
105 #define XFS_DQUOT_LOGRES(mp) (sizeof(xfs_disk_dquot_t) * 3) 112 #define XFS_DQUOT_LOGRES(mp) (sizeof(xfs_disk_dquot_t) * 3)
106 113
107 114
108 /* 115 /*
109 * These are the structures used to lay out dquots and quotaoff 116 * These are the structures used to lay out dquots and quotaoff
110 * records on the log. Quite similar to those of inodes. 117 * records on the log. Quite similar to those of inodes.
111 */ 118 */
112 119
113 /* 120 /*
114 * log format struct for dquots. 121 * log format struct for dquots.
115 * The first two fields must be the type and size fitting into 122 * The first two fields must be the type and size fitting into
116 * 32 bits : log_recovery code assumes that. 123 * 32 bits : log_recovery code assumes that.
117 */ 124 */
118 typedef struct xfs_dq_logformat { 125 typedef struct xfs_dq_logformat {
119 __uint16_t qlf_type; /* dquot log item type */ 126 __uint16_t qlf_type; /* dquot log item type */
120 __uint16_t qlf_size; /* size of this item */ 127 __uint16_t qlf_size; /* size of this item */
121 xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */ 128 xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */
122 __int64_t qlf_blkno; /* blkno of dquot buffer */ 129 __int64_t qlf_blkno; /* blkno of dquot buffer */
123 __int32_t qlf_len; /* len of dquot buffer */ 130 __int32_t qlf_len; /* len of dquot buffer */
124 __uint32_t qlf_boffset; /* off of dquot in buffer */ 131 __uint32_t qlf_boffset; /* off of dquot in buffer */
125 } xfs_dq_logformat_t; 132 } xfs_dq_logformat_t;
126 133
127 /* 134 /*
128 * log format struct for QUOTAOFF records. 135 * log format struct for QUOTAOFF records.
129 * The first two fields must be the type and size fitting into 136 * The first two fields must be the type and size fitting into
130 * 32 bits : log_recovery code assumes that. 137 * 32 bits : log_recovery code assumes that.
131 * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer 138 * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer
132 * to the first and ensures that the first logitem is taken out of the AIL 139 * to the first and ensures that the first logitem is taken out of the AIL
133 * only when the last one is securely committed. 140 * only when the last one is securely committed.
134 */ 141 */
135 typedef struct xfs_qoff_logformat { 142 typedef struct xfs_qoff_logformat {
136 unsigned short qf_type; /* quotaoff log item type */ 143 unsigned short qf_type; /* quotaoff log item type */
137 unsigned short qf_size; /* size of this item */ 144 unsigned short qf_size; /* size of this item */
138 unsigned int qf_flags; /* USR and/or GRP */ 145 unsigned int qf_flags; /* USR and/or GRP */
139 char qf_pad[12]; /* padding for future */ 146 char qf_pad[12]; /* padding for future */
140 } xfs_qoff_logformat_t; 147 } xfs_qoff_logformat_t;
141 148
142 149
143 /* 150 /*
144 * Disk quotas status in m_qflags, and also sb_qflags. 16 bits. 151 * Disk quotas status in m_qflags, and also sb_qflags. 16 bits.
145 */ 152 */
146 #define XFS_UQUOTA_ACCT 0x0001 /* user quota accounting ON */ 153 #define XFS_UQUOTA_ACCT 0x0001 /* user quota accounting ON */
147 #define XFS_UQUOTA_ENFD 0x0002 /* user quota limits enforced */ 154 #define XFS_UQUOTA_ENFD 0x0002 /* user quota limits enforced */
148 #define XFS_UQUOTA_CHKD 0x0004 /* quotacheck run on usr quotas */ 155 #define XFS_UQUOTA_CHKD 0x0004 /* quotacheck run on usr quotas */
149 #define XFS_PQUOTA_ACCT 0x0008 /* project quota accounting ON */ 156 #define XFS_PQUOTA_ACCT 0x0008 /* project quota accounting ON */
150 #define XFS_OQUOTA_ENFD 0x0010 /* other (grp/prj) quota limits enforced */ 157 #define XFS_OQUOTA_ENFD 0x0010 /* other (grp/prj) quota limits enforced */
151 #define XFS_OQUOTA_CHKD 0x0020 /* quotacheck run on other (grp/prj) quotas */ 158 #define XFS_OQUOTA_CHKD 0x0020 /* quotacheck run on other (grp/prj) quotas */
152 #define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */ 159 #define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */
153 160
154 /* 161 /*
155 * Quota Accounting/Enforcement flags 162 * Quota Accounting/Enforcement flags
156 */ 163 */
157 #define XFS_ALL_QUOTA_ACCT \ 164 #define XFS_ALL_QUOTA_ACCT \
158 (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT) 165 (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
159 #define XFS_ALL_QUOTA_ENFD (XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD) 166 #define XFS_ALL_QUOTA_ENFD (XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD)
160 #define XFS_ALL_QUOTA_CHKD (XFS_UQUOTA_CHKD | XFS_OQUOTA_CHKD) 167 #define XFS_ALL_QUOTA_CHKD (XFS_UQUOTA_CHKD | XFS_OQUOTA_CHKD)
161 168
162 #define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) 169 #define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
163 #define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT) 170 #define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT)
164 #define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT) 171 #define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT)
165 #define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT) 172 #define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT)
166 #define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD) 173 #define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD)
167 #define XFS_IS_OQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_OQUOTA_ENFD) 174 #define XFS_IS_OQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_OQUOTA_ENFD)
168 175
169 /* 176 /*
170 * Incore only flags for quotaoff - these bits get cleared when quota(s) 177 * Incore only flags for quotaoff - these bits get cleared when quota(s)
171 * are in the process of getting turned off. These flags are in m_qflags but 178 * are in the process of getting turned off. These flags are in m_qflags but
172 * never in sb_qflags. 179 * never in sb_qflags.
173 */ 180 */
174 #define XFS_UQUOTA_ACTIVE 0x0100 /* uquotas are being turned off */ 181 #define XFS_UQUOTA_ACTIVE 0x0100 /* uquotas are being turned off */
175 #define XFS_PQUOTA_ACTIVE 0x0200 /* pquotas are being turned off */ 182 #define XFS_PQUOTA_ACTIVE 0x0200 /* pquotas are being turned off */
176 #define XFS_GQUOTA_ACTIVE 0x0400 /* gquotas are being turned off */ 183 #define XFS_GQUOTA_ACTIVE 0x0400 /* gquotas are being turned off */
177 #define XFS_ALL_QUOTA_ACTIVE \ 184 #define XFS_ALL_QUOTA_ACTIVE \
178 (XFS_UQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE) 185 (XFS_UQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE)
179 186
180 /* 187 /*
181 * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees 188 * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
182 * quota will not be switched off as long as that inode lock is held. 189 * quota will not be switched off as long as that inode lock is held.
183 */ 190 */
184 #define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \ 191 #define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \
185 XFS_GQUOTA_ACTIVE | \ 192 XFS_GQUOTA_ACTIVE | \
186 XFS_PQUOTA_ACTIVE)) 193 XFS_PQUOTA_ACTIVE))
187 #define XFS_IS_OQUOTA_ON(mp) ((mp)->m_qflags & (XFS_GQUOTA_ACTIVE | \ 194 #define XFS_IS_OQUOTA_ON(mp) ((mp)->m_qflags & (XFS_GQUOTA_ACTIVE | \
188 XFS_PQUOTA_ACTIVE)) 195 XFS_PQUOTA_ACTIVE))
189 #define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE) 196 #define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE)
190 #define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE) 197 #define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE)
191 #define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE) 198 #define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE)
192 199
193 /* 200 /*
194 * Flags to tell various functions what to do. Not all of these are meaningful 201 * Flags to tell various functions what to do. Not all of these are meaningful
195 * to a single function. None of these XFS_QMOPT_* flags are meant to have 202 * to a single function. None of these XFS_QMOPT_* flags are meant to have
196 * persistent values (ie. their values can and will change between versions) 203 * persistent values (ie. their values can and will change between versions)
197 */ 204 */
198 #define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */ 205 #define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */
199 #define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ 206 #define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */
200 #define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ 207 #define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */
201 #define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ 208 #define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */
202 #define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ 209 #define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */
203 #define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ 210 #define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */
204 #define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ 211 #define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */
205 #define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ 212 #define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
206 #define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */ 213 #define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */
207 214
208 /* 215 /*
209 * flags to xfs_trans_mod_dquot to indicate which field needs to be 216 * flags to xfs_trans_mod_dquot to indicate which field needs to be
210 * modified. 217 * modified.
211 */ 218 */
212 #define XFS_QMOPT_RES_REGBLKS 0x0010000 219 #define XFS_QMOPT_RES_REGBLKS 0x0010000
213 #define XFS_QMOPT_RES_RTBLKS 0x0020000 220 #define XFS_QMOPT_RES_RTBLKS 0x0020000
214 #define XFS_QMOPT_BCOUNT 0x0040000 221 #define XFS_QMOPT_BCOUNT 0x0040000
215 #define XFS_QMOPT_ICOUNT 0x0080000 222 #define XFS_QMOPT_ICOUNT 0x0080000
216 #define XFS_QMOPT_RTBCOUNT 0x0100000 223 #define XFS_QMOPT_RTBCOUNT 0x0100000
217 #define XFS_QMOPT_DELBCOUNT 0x0200000 224 #define XFS_QMOPT_DELBCOUNT 0x0200000
218 #define XFS_QMOPT_DELRTBCOUNT 0x0400000 225 #define XFS_QMOPT_DELRTBCOUNT 0x0400000
219 #define XFS_QMOPT_RES_INOS 0x0800000 226 #define XFS_QMOPT_RES_INOS 0x0800000
220 227
221 /* 228 /*
222 * flags for dqalloc. 229 * flags for dqalloc.
223 */ 230 */
224 #define XFS_QMOPT_INHERIT 0x1000000 231 #define XFS_QMOPT_INHERIT 0x1000000
225 232
226 /* 233 /*
227 * flags to xfs_trans_mod_dquot. 234 * flags to xfs_trans_mod_dquot.
228 */ 235 */
229 #define XFS_TRANS_DQ_RES_BLKS XFS_QMOPT_RES_REGBLKS 236 #define XFS_TRANS_DQ_RES_BLKS XFS_QMOPT_RES_REGBLKS
230 #define XFS_TRANS_DQ_RES_RTBLKS XFS_QMOPT_RES_RTBLKS 237 #define XFS_TRANS_DQ_RES_RTBLKS XFS_QMOPT_RES_RTBLKS
231 #define XFS_TRANS_DQ_RES_INOS XFS_QMOPT_RES_INOS 238 #define XFS_TRANS_DQ_RES_INOS XFS_QMOPT_RES_INOS
232 #define XFS_TRANS_DQ_BCOUNT XFS_QMOPT_BCOUNT 239 #define XFS_TRANS_DQ_BCOUNT XFS_QMOPT_BCOUNT
233 #define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT 240 #define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT
234 #define XFS_TRANS_DQ_ICOUNT XFS_QMOPT_ICOUNT 241 #define XFS_TRANS_DQ_ICOUNT XFS_QMOPT_ICOUNT
235 #define XFS_TRANS_DQ_RTBCOUNT XFS_QMOPT_RTBCOUNT 242 #define XFS_TRANS_DQ_RTBCOUNT XFS_QMOPT_RTBCOUNT
236 #define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT 243 #define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT
237 244
238 245
239 #define XFS_QMOPT_QUOTALL \ 246 #define XFS_QMOPT_QUOTALL \
240 (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA) 247 (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA)
241 #define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) 248 #define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
242 249
243 #ifdef __KERNEL__ 250 #ifdef __KERNEL__
244 /* 251 /*
245 * This check is done typically without holding the inode lock; 252 * This check is done typically without holding the inode lock;
246 * that may seem racy, but it is harmless in the context that it is used. 253 * that may seem racy, but it is harmless in the context that it is used.
247 * The inode cannot go inactive as long as a reference is kept, and 254 * The inode cannot go inactive as long as a reference is kept, and
248 * therefore if dquot(s) were attached, they'll stay consistent. 255 * therefore if dquot(s) were attached, they'll stay consistent.
249 * If, for example, the ownership of the inode changes while 256 * If, for example, the ownership of the inode changes while
250 * we didn't have the inode locked, the appropriate dquot(s) will be 257 * we didn't have the inode locked, the appropriate dquot(s) will be
251 * attached atomically. 258 * attached atomically.
252 */ 259 */
253 #define XFS_NOT_DQATTACHED(mp, ip) ((XFS_IS_UQUOTA_ON(mp) &&\ 260 #define XFS_NOT_DQATTACHED(mp, ip) ((XFS_IS_UQUOTA_ON(mp) &&\
254 (ip)->i_udquot == NULL) || \ 261 (ip)->i_udquot == NULL) || \
255 (XFS_IS_OQUOTA_ON(mp) && \ 262 (XFS_IS_OQUOTA_ON(mp) && \
256 (ip)->i_gdquot == NULL)) 263 (ip)->i_gdquot == NULL))
257 264
258 #define XFS_QM_NEED_QUOTACHECK(mp) \ 265 #define XFS_QM_NEED_QUOTACHECK(mp) \
259 ((XFS_IS_UQUOTA_ON(mp) && \ 266 ((XFS_IS_UQUOTA_ON(mp) && \
260 (mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \ 267 (mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \
261 (XFS_IS_GQUOTA_ON(mp) && \ 268 (XFS_IS_GQUOTA_ON(mp) && \
262 ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \ 269 ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \
263 (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT))) || \ 270 (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT))) || \
264 (XFS_IS_PQUOTA_ON(mp) && \ 271 (XFS_IS_PQUOTA_ON(mp) && \
265 ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \ 272 ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \
266 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT)))) 273 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT))))
267 274
268 #define XFS_MOUNT_QUOTA_SET1 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ 275 #define XFS_MOUNT_QUOTA_SET1 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
269 XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\ 276 XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\
270 XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD) 277 XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD)
271 278
272 #define XFS_MOUNT_QUOTA_SET2 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ 279 #define XFS_MOUNT_QUOTA_SET2 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
273 XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\ 280 XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
274 XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD) 281 XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD)
275 282
276 #define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ 283 #define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
277 XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\ 284 XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\
278 XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD|\ 285 XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD|\
279 XFS_GQUOTA_ACCT) 286 XFS_GQUOTA_ACCT)
280 287
281 288
282 /* 289 /*
283 * The structure kept inside the xfs_trans_t keeps track of dquot changes 290 * The structure kept inside the xfs_trans_t keeps track of dquot changes
284 * within a transaction and apply them later. 291 * within a transaction and apply them later.
285 */ 292 */
286 typedef struct xfs_dqtrx { 293 typedef struct xfs_dqtrx {
287 struct xfs_dquot *qt_dquot; /* the dquot this refers to */ 294 struct xfs_dquot *qt_dquot; /* the dquot this refers to */
288 ulong qt_blk_res; /* blks reserved on a dquot */ 295 ulong qt_blk_res; /* blks reserved on a dquot */
289 ulong qt_blk_res_used; /* blks used from the reservation */ 296 ulong qt_blk_res_used; /* blks used from the reservation */
290 ulong qt_ino_res; /* inode reserved on a dquot */ 297 ulong qt_ino_res; /* inode reserved on a dquot */
291 ulong qt_ino_res_used; /* inodes used from the reservation */ 298 ulong qt_ino_res_used; /* inodes used from the reservation */
292 long qt_bcount_delta; /* dquot blk count changes */ 299 long qt_bcount_delta; /* dquot blk count changes */
293 long qt_delbcnt_delta; /* delayed dquot blk count changes */ 300 long qt_delbcnt_delta; /* delayed dquot blk count changes */
294 long qt_icount_delta; /* dquot inode count changes */ 301 long qt_icount_delta; /* dquot inode count changes */
295 ulong qt_rtblk_res; /* # blks reserved on a dquot */ 302 ulong qt_rtblk_res; /* # blks reserved on a dquot */
296 ulong qt_rtblk_res_used;/* # blks used from reservation */ 303 ulong qt_rtblk_res_used;/* # blks used from reservation */
297 long qt_rtbcount_delta;/* dquot realtime blk changes */ 304 long qt_rtbcount_delta;/* dquot realtime blk changes */
298 long qt_delrtb_delta; /* delayed RT blk count changes */ 305 long qt_delrtb_delta; /* delayed RT blk count changes */
299 } xfs_dqtrx_t; 306 } xfs_dqtrx_t;
300 307
301 #ifdef CONFIG_XFS_QUOTA 308 #ifdef CONFIG_XFS_QUOTA
302 extern void xfs_trans_dup_dqinfo(struct xfs_trans *, struct xfs_trans *); 309 extern void xfs_trans_dup_dqinfo(struct xfs_trans *, struct xfs_trans *);
303 extern void xfs_trans_free_dqinfo(struct xfs_trans *); 310 extern void xfs_trans_free_dqinfo(struct xfs_trans *);
304 extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *, 311 extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *,
305 uint, long); 312 uint, long);
306 extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *); 313 extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *);
307 extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *); 314 extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *);
308 extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *, 315 extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *,
309 struct xfs_inode *, long, long, uint); 316 struct xfs_inode *, long, long, uint);
310 extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, 317 extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
311 struct xfs_mount *, struct xfs_dquot *, 318 struct xfs_mount *, struct xfs_dquot *,
312 struct xfs_dquot *, long, long, uint); 319 struct xfs_dquot *, long, long, uint);
313 320
314 extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t, prid_t, uint, 321 extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t, prid_t, uint,
315 struct xfs_dquot **, struct xfs_dquot **); 322 struct xfs_dquot **, struct xfs_dquot **);
316 extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *, 323 extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *,
317 struct xfs_dquot *, struct xfs_dquot *); 324 struct xfs_dquot *, struct xfs_dquot *);
318 extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **); 325 extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **);
319 extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *, 326 extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *,
320 struct xfs_inode *, struct xfs_dquot **, struct xfs_dquot *); 327 struct xfs_inode *, struct xfs_dquot **, struct xfs_dquot *);
321 extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *, 328 extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *,
322 struct xfs_dquot *, struct xfs_dquot *, uint); 329 struct xfs_dquot *, struct xfs_dquot *, uint);
323 extern int xfs_qm_dqattach(struct xfs_inode *, uint); 330 extern int xfs_qm_dqattach(struct xfs_inode *, uint);
324 extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint); 331 extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint);
325 extern void xfs_qm_dqdetach(struct xfs_inode *); 332 extern void xfs_qm_dqdetach(struct xfs_inode *);
326 extern void xfs_qm_dqrele(struct xfs_dquot *); 333 extern void xfs_qm_dqrele(struct xfs_dquot *);
327 extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *); 334 extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *);
328 extern int xfs_qm_newmount(struct xfs_mount *, uint *, uint *); 335 extern int xfs_qm_newmount(struct xfs_mount *, uint *, uint *);
329 extern void xfs_qm_mount_quotas(struct xfs_mount *); 336 extern void xfs_qm_mount_quotas(struct xfs_mount *);
330 extern void xfs_qm_unmount(struct xfs_mount *); 337 extern void xfs_qm_unmount(struct xfs_mount *);
331 extern void xfs_qm_unmount_quotas(struct xfs_mount *); 338 extern void xfs_qm_unmount_quotas(struct xfs_mount *);
332 339
333 #else 340 #else
334 static inline int 341 static inline int
335 xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid, 342 xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
336 uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp) 343 uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp)
337 { 344 {
338 *udqp = NULL; 345 *udqp = NULL;
339 *gdqp = NULL; 346 *gdqp = NULL;
340 return 0; 347 return 0;
341 } 348 }
342 #define xfs_trans_dup_dqinfo(tp, tp2) 349 #define xfs_trans_dup_dqinfo(tp, tp2)
343 #define xfs_trans_free_dqinfo(tp) 350 #define xfs_trans_free_dqinfo(tp)
344 #define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) 351 #define xfs_trans_mod_dquot_byino(tp, ip, fields, delta)
345 #define xfs_trans_apply_dquot_deltas(tp) 352 #define xfs_trans_apply_dquot_deltas(tp)
346 #define xfs_trans_unreserve_and_mod_dquots(tp) 353 #define xfs_trans_unreserve_and_mod_dquots(tp)
347 static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, 354 static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
348 struct xfs_inode *ip, long nblks, long ninos, uint flags) 355 struct xfs_inode *ip, long nblks, long ninos, uint flags)
349 { 356 {
350 return 0; 357 return 0;
351 } 358 }
352 static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, 359 static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
353 struct xfs_mount *mp, struct xfs_dquot *udqp, 360 struct xfs_mount *mp, struct xfs_dquot *udqp,
354 struct xfs_dquot *gdqp, long nblks, long ninos, uint flags) 361 struct xfs_dquot *gdqp, long nblks, long ninos, uint flags)
355 { 362 {
356 return 0; 363 return 0;
357 } 364 }
358 #define xfs_qm_vop_create_dqattach(tp, ip, u, g) 365 #define xfs_qm_vop_create_dqattach(tp, ip, u, g)
359 #define xfs_qm_vop_rename_dqattach(it) (0) 366 #define xfs_qm_vop_rename_dqattach(it) (0)
360 #define xfs_qm_vop_chown(tp, ip, old, new) (NULL) 367 #define xfs_qm_vop_chown(tp, ip, old, new) (NULL)
361 #define xfs_qm_vop_chown_reserve(tp, ip, u, g, fl) (0) 368 #define xfs_qm_vop_chown_reserve(tp, ip, u, g, fl) (0)
362 #define xfs_qm_dqattach(ip, fl) (0) 369 #define xfs_qm_dqattach(ip, fl) (0)
363 #define xfs_qm_dqattach_locked(ip, fl) (0) 370 #define xfs_qm_dqattach_locked(ip, fl) (0)
364 #define xfs_qm_dqdetach(ip) 371 #define xfs_qm_dqdetach(ip)
365 #define xfs_qm_dqrele(d) 372 #define xfs_qm_dqrele(d)
366 #define xfs_qm_statvfs(ip, s) 373 #define xfs_qm_statvfs(ip, s)
367 #define xfs_qm_newmount(mp, a, b) (0) 374 #define xfs_qm_newmount(mp, a, b) (0)
368 #define xfs_qm_mount_quotas(mp) 375 #define xfs_qm_mount_quotas(mp)
369 #define xfs_qm_unmount(mp) 376 #define xfs_qm_unmount(mp)
370 #define xfs_qm_unmount_quotas(mp) 377 #define xfs_qm_unmount_quotas(mp)
371 #endif /* CONFIG_XFS_QUOTA */ 378 #endif /* CONFIG_XFS_QUOTA */
372 379
373 #define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ 380 #define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
374 xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), -(ninos), flags) 381 xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), -(ninos), flags)
375 #define xfs_trans_reserve_quota(tp, mp, ud, gd, nb, ni, f) \ 382 #define xfs_trans_reserve_quota(tp, mp, ud, gd, nb, ni, f) \
376 xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \ 383 xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \
377 f | XFS_QMOPT_RES_REGBLKS) 384 f | XFS_QMOPT_RES_REGBLKS)
378 385
379 extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *, 386 extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *,
380 xfs_dqid_t, uint, uint, char *); 387 xfs_dqid_t, uint, uint, char *);
381 extern int xfs_mount_reset_sbqflags(struct xfs_mount *); 388 extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
389
390 extern const struct xfs_buf_ops xfs_dquot_buf_ops;
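
xfs_dquot_buf_ops is the verifier pair (read and write hooks) that runs these CRC/UUID checks at I/O time; callers opt in by passing it when reading dquot buffers. A hedged usage sketch, assuming the xfs_trans_read_buf() signature of this era:

	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno,
				   mp->m_quotainfo->qi_dqchunklen, 0, &bp,
				   &xfs_dquot_buf_ops);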
382 391
383 #endif /* __KERNEL__ */ 392 #endif /* __KERNEL__ */
384 #endif /* __XFS_QUOTA_H__ */ 393 #endif /* __XFS_QUOTA_H__ */
385 394