Blame view

fs/ocfs2/journal.c 43.6 KB
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
  /* -*- mode: c; c-basic-offset: 8; -*-
   * vim: noexpandtab sw=8 ts=8 sts=0:
   *
   * journal.c
   *
   * Defines functions of journalling api
   *
   * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
   *
   * This program is free software; you can redistribute it and/or
   * modify it under the terms of the GNU General Public
   * License as published by the Free Software Foundation; either
   * version 2 of the License, or (at your option) any later version.
   *
   * This program is distributed in the hope that it will be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   * General Public License for more details.
   *
   * You should have received a copy of the GNU General Public
   * License along with this program; if not, write to the
   * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   * Boston, MA 021110-1307, USA.
   */
  
  #include <linux/fs.h>
  #include <linux/types.h>
  #include <linux/slab.h>
  #include <linux/highmem.h>
  #include <linux/kthread.h>
  
  #define MLOG_MASK_PREFIX ML_JOURNAL
  #include <cluster/masklog.h>
  
  #include "ocfs2.h"
  
  #include "alloc.h"
316f4b9f9   Mark Fasheh   ocfs2: Move direc...
38
  #include "dir.h"
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
39
40
41
42
43
44
  #include "dlmglue.h"
  #include "extent_map.h"
  #include "heartbeat.h"
  #include "inode.h"
  #include "journal.h"
  #include "localalloc.h"
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
45
46
  #include "slot_map.h"
  #include "super.h"
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
47
48
49
  #include "sysfile.h"
  
  #include "buffer_head_io.h"
34af946a2   Ingo Molnar   [PATCH] spin/rwlo...
50
  DEFINE_SPINLOCK(trans_inc_lock);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
51
52
53
54
55
56
57
  
  /* Forward declarations for helpers defined later in this file. */
  static int ocfs2_force_read_journal(struct inode *inode);
  static int ocfs2_recover_node(struct ocfs2_super *osb,
  			      int node_num);
  static int __ocfs2_recovery_thread(void *arg);
  static int ocfs2_commit_cache(struct ocfs2_super *osb);
  static int ocfs2_wait_on_mount(struct ocfs2_super *osb);
  static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
  				      int dirty, int replayed);
  static int ocfs2_trylock_journal(struct ocfs2_super *osb,
  				 int slot_num);
  static int ocfs2_recover_orphans(struct ocfs2_super *osb,
  				 int slot);
  static int ocfs2_commit_thread(void *arg);
553abd046   Joel Becker   ocfs2: Change the...
65
66
67
68
69
70
71
  
  /*
   * The recovery map is a flat array of the node numbers that still
   * need recovery.  The helpers in this file only read or modify it
   * while holding osb->osb_lock.
   */
  struct ocfs2_recovery_map {
  	/* Number of valid entries at the front of rm_entries. */
  	unsigned int rm_used;
  	/* Node numbers awaiting recovery; set up by ocfs2_recovery_init(). */
  	unsigned int *rm_entries;
  };
  
  int ocfs2_recovery_init(struct ocfs2_super *osb)
  {
  	struct ocfs2_recovery_map *rm;
  
  	mutex_init(&osb->recovery_lock);
  	osb->disable_recovery = 0;
  	osb->recovery_thread_task = NULL;
  	init_waitqueue_head(&osb->recovery_event);
  
  	rm = kzalloc(sizeof(struct ocfs2_recovery_map) +
  		     osb->max_slots * sizeof(unsigned int),
  		     GFP_KERNEL);
  	if (!rm) {
  		mlog_errno(-ENOMEM);
  		return -ENOMEM;
  	}
  
  	rm->rm_entries = (unsigned int *)((char *)rm +
  					  sizeof(struct ocfs2_recovery_map));
  	osb->recovery_map = rm;
  
  	return 0;
  }
  
  /*
   * Report whether a recovery thread is currently recorded on @osb.
   *
   * This is used as a wait_event() condition, where the recovery lock
   * cannot be taken, so a full memory barrier is issued before the
   * read to make sure a cleared task pointer is observed before the
   * waiter is woken.
   *
   * Returns 1 while osb->recovery_thread_task is set, 0 otherwise.
   */
  static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
  {
  	mb();
  	if (osb->recovery_thread_task != NULL)
  		return 1;
  	return 0;
  }
  
  /*
   * Tear down the recovery state created by ocfs2_recovery_init().
   *
   * Sequence: forbid new recovery threads, wait for a running one to
   * finish, drain queued recovery completion work, then free the map.
   */
  void ocfs2_recovery_exit(struct ocfs2_super *osb)
  {
  	/* Disable any new recovery threads.  Do this before setting the
  	 * vol_state. */
  	mutex_lock(&osb->recovery_lock);
  	osb->disable_recovery = 1;
  	mutex_unlock(&osb->recovery_lock);

  	/* Wait for a currently running recovery thread to exit. */
  	wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));

  	/* No more recovery threads can be launched from here on, so
  	 * wait for any recovery completion work to complete. */
  	flush_workqueue(ocfs2_wq);

  	/*
  	 * Recovery is shut down and the osb is about to be freed, so
  	 * the osb_lock is not taken around the map here.
  	 */
  	/* XXX: Should we bug if there are dirty entries? */
  	kfree(osb->recovery_map);
  }
  
  /*
   * Check whether @node_num is present in the recovery map.
   *
   * The caller must hold osb->osb_lock (asserted below).
   *
   * Returns 1 if the node is in the map, 0 if not.
   */
  static int __ocfs2_recovery_map_test(struct ocfs2_super *osb,
  				     unsigned int node_num)
  {
  	/* Unsigned to match rm_used and avoid a mixed-sign comparison. */
  	unsigned int i;
  	struct ocfs2_recovery_map *rm = osb->recovery_map;

  	assert_spin_locked(&osb->osb_lock);

  	for (i = 0; i < rm->rm_used; i++) {
  		if (rm->rm_entries[i] == node_num)
  			return 1;
  	}

  	return 0;
  }
  
  /*
   * Add @node_num to the recovery map unless it is already there.
   *
   * Works like test-and-set: returns 1 if the node was already in the
   * map (nothing is changed), 0 if it was appended by this call.
   * Takes and releases osb->osb_lock internally.
   */
  static int ocfs2_recovery_map_set(struct ocfs2_super *osb,
  				  unsigned int node_num)
  {
  	struct ocfs2_recovery_map *map = osb->recovery_map;
  	int already_set;

  	spin_lock(&osb->osb_lock);

  	already_set = __ocfs2_recovery_map_test(osb, node_num);
  	if (!already_set) {
  		/* XXX: Can this be exploited? Not from o2dlm... */
  		BUG_ON(map->rm_used >= osb->max_slots);

  		map->rm_entries[map->rm_used] = node_num;
  		map->rm_used++;
  	}

  	spin_unlock(&osb->osb_lock);

  	return already_set;
  }
  
  /*
   * Remove @node_num from the recovery map, if present.
   *
   * The entries behind the removed slot are shifted down by one so the
   * used part of the array stays contiguous.  Takes and releases
   * osb->osb_lock internally.
   */
  static void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
  				     unsigned int node_num)
  {
  	/* Unsigned to match rm_used in the comparison and length math. */
  	unsigned int i;
  	struct ocfs2_recovery_map *rm = osb->recovery_map;

  	spin_lock(&osb->osb_lock);

  	for (i = 0; i < rm->rm_used; i++) {
  		if (rm->rm_entries[i] == node_num)
  			break;
  	}

  	if (i < rm->rm_used) {
  		/* Move the (rm_used - i - 1) trailing entries down one
  		 * slot; the count is zero when the last entry is removed. */
  		memmove(&(rm->rm_entries[i]), &(rm->rm_entries[i + 1]),
  			(rm->rm_used - i - 1) * sizeof(unsigned int));
  		rm->rm_used--;
  	}

  	spin_unlock(&osb->osb_lock);
  }
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
  /*
   * Flush the journal and start a new transaction id.
   *
   * Takes j_trans_barrier for write.  If no transactions have been
   * counted since the last flush there is nothing to do.  Otherwise
   * the journal is flushed with updates locked out, the transaction id
   * is bumped, the transaction counter is reset, and the downconvert
   * thread and any j_checkpointed waiters are woken.
   *
   * Returns 0 on success (including the nothing-to-flush case) or the
   * negative error from journal_flush().
   */
  static int ocfs2_commit_cache(struct ocfs2_super *osb)
  {
  	int status = 0;
  	unsigned int flushed;
  	struct ocfs2_journal *journal = osb->journal;

  	mlog_entry_void();

  	/* Flush all pending commits and checkpoint the journal. */
  	down_write(&journal->j_trans_barrier);

  	if (atomic_read(&journal->j_num_trans) == 0) {
  		up_write(&journal->j_trans_barrier);
  		mlog(0, "No transactions for me to flush!\n");
  		goto finally;
  	}

  	journal_lock_updates(journal->j_journal);
  	status = journal_flush(journal->j_journal);
  	journal_unlock_updates(journal->j_journal);
  	if (status < 0) {
  		up_write(&journal->j_trans_barrier);
  		mlog_errno(status);
  		goto finally;
  	}

  	/* The previous id returned here is not needed. */
  	ocfs2_inc_trans_id(journal);

  	flushed = atomic_read(&journal->j_num_trans);
  	atomic_set(&journal->j_num_trans, 0);
  	up_write(&journal->j_trans_barrier);

  	mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n",
  	     journal->j_trans_id, flushed);

  	ocfs2_wake_downconvert_thread(osb);
  	wake_up(&journal->j_checkpointed);
  finally:
  	mlog_exit(status);
  	return status;
  }
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
240
241
242
  /* pass it NULL and it will allocate a new handle object for you.  If
   * you pass it a handle however, it may still return error, in which
   * case it has free'd the passed handle for you. */
1fabe1481   Mark Fasheh   ocfs2: Remove str...
243
  handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
244
  {
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
245
  	journal_t *journal = osb->journal->j_journal;
1fabe1481   Mark Fasheh   ocfs2: Remove str...
246
  	handle_t *handle;
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
247

ebdec83ba   Eric Sesterhenn / snakebyte   [PATCH] BUG_ON() ...
248
  	BUG_ON(!osb || !osb->journal->j_journal);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
249

65eff9ccf   Mark Fasheh   ocfs2: remove han...
250
251
  	if (ocfs2_is_hard_readonly(osb))
  		return ERR_PTR(-EROFS);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
252
253
254
255
256
257
258
259
260
261
  
  	BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE);
  	BUG_ON(max_buffs <= 0);
  
  	/* JBD might support this, but our journalling code doesn't yet. */
  	if (journal_current_handle()) {
  		mlog(ML_ERROR, "Recursive transaction attempted!
  ");
  		BUG();
  	}
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
262
  	down_read(&osb->journal->j_trans_barrier);
1fabe1481   Mark Fasheh   ocfs2: Remove str...
263
264
  	handle = journal_start(journal, max_buffs);
  	if (IS_ERR(handle)) {
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
265
  		up_read(&osb->journal->j_trans_barrier);
1fabe1481   Mark Fasheh   ocfs2: Remove str...
266
  		mlog_errno(PTR_ERR(handle));
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
267
268
269
  
  		if (is_journal_aborted(journal)) {
  			ocfs2_abort(osb->sb, "Detected aborted journal");
1fabe1481   Mark Fasheh   ocfs2: Remove str...
270
  			handle = ERR_PTR(-EROFS);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
271
  		}
c271c5c22   Sunil Mushran   ocfs2: local mounts
272
273
274
275
  	} else {
  		if (!ocfs2_mount_local(osb))
  			atomic_inc(&(osb->journal->j_num_trans));
  	}
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
276

ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
277
  	return handle;
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
278
  }
1fabe1481   Mark Fasheh   ocfs2: Remove str...
279
280
  /*
   * Finish a transaction started with ocfs2_start_trans().
   *
   * Stops @handle and then drops the read hold on j_trans_barrier.  The
   * barrier is released even when journal_stop() fails.
   *
   * Returns the result of journal_stop().
   */
  int ocfs2_commit_trans(struct ocfs2_super *osb,
  		       handle_t *handle)
  {
  	int ret;
  	struct ocfs2_journal *journal = osb->journal;

  	BUG_ON(!handle);

  	ret = journal_stop(handle);
  	if (ret < 0)
  		mlog_errno(ret);

  	up_read(&journal->j_trans_barrier);

  	return ret;
  }
  
  /*
   * 'nblocks' is what you want to add to the current
   * transaction. extend_trans will either extend the current handle by
   * nblocks, or commit it and start a new one with nblocks credits.
   *
e8aed3450   Mark Fasheh   ocfs2: Re-journal...
299
300
301
302
303
304
   * This might call journal_restart() which will commit dirty buffers
   * and then restart the transaction. Before calling
   * ocfs2_extend_trans(), any changed blocks should have been
   * dirtied. After calling it, all blocks which need to be changed must
   * go through another set of journal_access/journal_dirty calls.
   *
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
305
306
307
308
309
310
311
312
   * WARNING: This will not release any semaphores or disk locks taken
   * during the transaction, so make sure they were taken *before*
   * start_trans or we'll have ordering deadlocks.
   *
   * WARNING2: Note that we do *not* drop j_trans_barrier here. This is
   * good because transaction ids haven't yet been recorded on the
   * cluster locks associated with this handle.
   */
1fc581467   Mark Fasheh   ocfs2: have ocfs2...
313
  int ocfs2_extend_trans(handle_t *handle, int nblocks)
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
314
315
316
317
  {
  	int status;
  
  	BUG_ON(!handle);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
318
319
320
321
322
323
  	BUG_ON(!nblocks);
  
  	mlog_entry_void();
  
  	mlog(0, "Trying to extend transaction by %d blocks
  ", nblocks);
e407e3978   Joel Becker   ocfs2: Fix CONFIG...
324
  #ifdef CONFIG_OCFS2_DEBUG_FS
0879c584f   Mark Fasheh   ocfs2: Allow for ...
325
326
  	status = 1;
  #else
1fc581467   Mark Fasheh   ocfs2: have ocfs2...
327
  	status = journal_extend(handle, nblocks);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
328
329
330
331
  	if (status < 0) {
  		mlog_errno(status);
  		goto bail;
  	}
0879c584f   Mark Fasheh   ocfs2: Allow for ...
332
  #endif
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
333
334
335
336
  
  	if (status > 0) {
  		mlog(0, "journal_extend failed, trying journal_restart
  ");
1fc581467   Mark Fasheh   ocfs2: have ocfs2...
337
  		status = journal_restart(handle, nblocks);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
338
  		if (status < 0) {
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
339
340
341
  			mlog_errno(status);
  			goto bail;
  		}
01ddf1e18   Mark Fasheh   ocfs2: remove unu...
342
  	}
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
343
344
345
346
347
348
349
  
  	status = 0;
  bail:
  
  	mlog_exit(status);
  	return status;
  }
1fabe1481   Mark Fasheh   ocfs2: Remove str...
350
  /*
   * Declare the intent to modify @bh within the transaction @handle.
   *
   * @type selects the kind of access: OCFS2_JOURNAL_ACCESS_CREATE and
   * OCFS2_JOURNAL_ACCESS_WRITE both request write access, while
   * OCFS2_JOURNAL_ACCESS_UNDO requests undo access.  Any other value
   * yields -EINVAL.  The buffer must already be uptodate; passing one
   * that is not is a BUG().
   *
   * Returns 0 on success or a negative error code.
   */
  int ocfs2_journal_access(handle_t *handle,
  			 struct inode *inode,
  			 struct buffer_head *bh,
  			 int type)
  {
  	int status;

  	BUG_ON(!inode);
  	BUG_ON(!handle);
  	BUG_ON(!bh);

  	mlog_entry("bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %zu\n",
  		   (unsigned long long)bh->b_blocknr, type,
  		   (type == OCFS2_JOURNAL_ACCESS_CREATE) ?
  		   "OCFS2_JOURNAL_ACCESS_CREATE" :
  		   "OCFS2_JOURNAL_ACCESS_WRITE",
  		   bh->b_size);

  	/* we can safely remove this assertion after testing. */
  	if (!buffer_uptodate(bh)) {
  		mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n");
  		mlog(ML_ERROR, "b_blocknr=%llu\n",
  		     (unsigned long long)bh->b_blocknr);
  		BUG();
  	}

  	/* Set the current transaction information on the inode so
  	 * that the locking code knows whether it can drop its locks
  	 * on this inode or not. We're protected from the commit
  	 * thread updating the current transaction id until
  	 * ocfs2_commit_trans() because ocfs2_start_trans() took
  	 * j_trans_barrier for us. */
  	ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode);

  	mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
  	switch (type) {
  	case OCFS2_JOURNAL_ACCESS_CREATE:
  	case OCFS2_JOURNAL_ACCESS_WRITE:
  		status = journal_get_write_access(handle, bh);
  		break;

  	case OCFS2_JOURNAL_ACCESS_UNDO:
  		status = journal_get_undo_access(handle, bh);
  		break;

  	default:
  		status = -EINVAL;
  		mlog(ML_ERROR, "Unknown access type!\n");
  	}
  	mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);

  	if (status < 0)
  		mlog(ML_ERROR, "Error %d getting %d access to buffer!\n",
  		     status, type);

  	mlog_exit(status);
  	return status;
  }
1fabe1481   Mark Fasheh   ocfs2: Remove str...
412
  /*
   * Mark the metadata buffer @bh dirty within the transaction @handle.
   *
   * Returns the result of journal_dirty_metadata(); a negative value is
   * logged as an error.
   */
  int ocfs2_journal_dirty(handle_t *handle,
  			struct buffer_head *bh)
  {
  	int status;

  	mlog_entry("(bh->b_blocknr=%llu)\n",
  		   (unsigned long long)bh->b_blocknr);

  	status = journal_dirty_metadata(handle, bh);
  	if (status < 0)
  		mlog(ML_ERROR, "Could not dirty metadata buffer. "
  		     "(bh->b_blocknr=%llu)\n",
  		     (unsigned long long)bh->b_blocknr);

  	mlog_exit(status);
  	return status;
  }
  
  /*
   * Hand the data buffer @bh to journal_dirty_data() for @handle.
   *
   * Any non-zero result is logged and passed back to the caller
   * unchanged.
   */
  int ocfs2_journal_dirty_data(handle_t *handle,
  			     struct buffer_head *bh)
  {
  	int err;

  	err = journal_dirty_data(handle, bh);
  	if (err != 0)
  		mlog_errno(err);

  	/* TODO: When we can handle it, abort the handle and go RO on
  	 * error here. */
  	return err;
  }
d147b3d63   Mark Fasheh   ocfs2: Support co...
441
  #define OCFS2_DEFAULT_COMMIT_INTERVAL 	(HZ * JBD_DEFAULT_MAX_COMMIT_AGE)
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
442
443
444
445
  
  void ocfs2_set_journal_params(struct ocfs2_super *osb)
  {
  	journal_t *journal = osb->journal->j_journal;
d147b3d63   Mark Fasheh   ocfs2: Support co...
446
447
448
449
  	unsigned long commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;
  
  	if (osb->osb_commit_interval)
  		commit_interval = osb->osb_commit_interval;
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
450
451
  
  	spin_lock(&journal->j_state_lock);
d147b3d63   Mark Fasheh   ocfs2: Support co...
452
  	journal->j_commit_interval = commit_interval;
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
  	if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
  		journal->j_flags |= JFS_BARRIER;
  	else
  		journal->j_flags &= ~JFS_BARRIER;
  	spin_unlock(&journal->j_state_lock);
  }
  
  int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
  {
  	int status = -1;
  	struct inode *inode = NULL; /* the journal inode */
  	journal_t *j_journal = NULL;
  	struct ocfs2_dinode *di = NULL;
  	struct buffer_head *bh = NULL;
  	struct ocfs2_super *osb;
e63aecb65   Mark Fasheh   ocfs2: Rename ocf...
468
  	int inode_lock = 0;
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
  
  	mlog_entry_void();
  
  	BUG_ON(!journal);
  
  	osb = journal->j_osb;
  
  	/* already have the inode for our journal */
  	inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
  					    osb->slot_num);
  	if (inode == NULL) {
  		status = -EACCES;
  		mlog_errno(status);
  		goto done;
  	}
  	if (is_bad_inode(inode)) {
  		mlog(ML_ERROR, "access error (bad inode)
  ");
  		iput(inode);
  		inode = NULL;
  		status = -EACCES;
  		goto done;
  	}
  
  	SET_INODE_JOURNAL(inode);
  	OCFS2_I(inode)->ip_open_count++;
6eff5790d   Mark Fasheh   [PATCH] ocfs2: do...
495
496
497
  	/* Skip recovery waits here - journal inode metadata never
  	 * changes in a live cluster so it can be considered an
  	 * exception to the rule. */
e63aecb65   Mark Fasheh   ocfs2: Rename ocf...
498
  	status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
499
500
501
502
503
504
  	if (status < 0) {
  		if (status != -ERESTARTSYS)
  			mlog(ML_ERROR, "Could not get lock on journal!
  ");
  		goto done;
  	}
e63aecb65   Mark Fasheh   ocfs2: Rename ocf...
505
  	inode_lock = 1;
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
506
507
508
509
510
511
512
513
514
515
516
517
  	di = (struct ocfs2_dinode *)bh->b_data;
  
  	if (inode->i_size <  OCFS2_MIN_JOURNAL_SIZE) {
  		mlog(ML_ERROR, "Journal file size (%lld) is too small!
  ",
  		     inode->i_size);
  		status = -EINVAL;
  		goto done;
  	}
  
  	mlog(0, "inode->i_size = %lld
  ", inode->i_size);
5515eff81   Andrew Morton   [PATCH] 2tb-files...
518
519
520
  	mlog(0, "inode->i_blocks = %llu
  ",
  			(unsigned long long)inode->i_blocks);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
  	mlog(0, "inode->ip_clusters = %u
  ", OCFS2_I(inode)->ip_clusters);
  
  	/* call the kernels journal init function now */
  	j_journal = journal_init_inode(inode);
  	if (j_journal == NULL) {
  		mlog(ML_ERROR, "Linux journal layer error
  ");
  		status = -EINVAL;
  		goto done;
  	}
  
  	mlog(0, "Returned from journal_init_inode
  ");
  	mlog(0, "j_journal->j_maxlen = %u
  ", j_journal->j_maxlen);
  
  	*dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
  		  OCFS2_JOURNAL_DIRTY_FL);
  
  	journal->j_journal = j_journal;
  	journal->j_inode = inode;
  	journal->j_bh = bh;
  
  	ocfs2_set_journal_params(osb);
  
  	journal->j_state = OCFS2_JOURNAL_LOADED;
  
  	status = 0;
  done:
  	if (status < 0) {
e63aecb65   Mark Fasheh   ocfs2: Rename ocf...
552
553
  		if (inode_lock)
  			ocfs2_inode_unlock(inode, 1);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
554
555
556
557
558
559
560
561
562
563
564
  		if (bh != NULL)
  			brelse(bh);
  		if (inode) {
  			OCFS2_I(inode)->ip_open_count--;
  			iput(inode);
  		}
  	}
  
  	mlog_exit(status);
  	return status;
  }
539d82640   Sunil Mushran   [PATCH 2/2] ocfs2...
565
566
567
568
569
570
571
572
573
  static void ocfs2_bump_recovery_generation(struct ocfs2_dinode *di)
  {
  	le32_add_cpu(&(di->id1.journal1.ij_recovery_generation), 1);
  }
  
  /* Read the recovery generation from @di, converted to CPU byte order. */
  static u32 ocfs2_get_recovery_generation(struct ocfs2_dinode *di)
  {
  	u32 generation;

  	generation = le32_to_cpu(di->id1.journal1.ij_recovery_generation);
  	return generation;
  }
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
574
  static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
539d82640   Sunil Mushran   [PATCH 2/2] ocfs2...
575
  				      int dirty, int replayed)
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
576
577
578
579
580
581
582
583
584
585
586
587
588
589
  {
  	int status;
  	unsigned int flags;
  	struct ocfs2_journal *journal = osb->journal;
  	struct buffer_head *bh = journal->j_bh;
  	struct ocfs2_dinode *fe;
  
  	mlog_entry_void();
  
  	fe = (struct ocfs2_dinode *)bh->b_data;
  	if (!OCFS2_IS_VALID_DINODE(fe)) {
  		/* This is called from startup/shutdown which will
  		 * handle the errors in a specific manner, so no need
  		 * to call ocfs2_error() here. */
b0697053f   Mark Fasheh   ocfs2: don't use ...
590
  		mlog(ML_ERROR, "Journal dinode %llu  has invalid "
1ca1a111b   Mark Fasheh   ocfs2: fix sparse...
591
592
  		     "signature: %.*s",
  		     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
b0697053f   Mark Fasheh   ocfs2: don't use ...
593
  		     fe->i_signature);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
594
595
596
597
598
599
600
601
602
603
  		status = -EIO;
  		goto out;
  	}
  
  	flags = le32_to_cpu(fe->id1.journal1.ij_flags);
  	if (dirty)
  		flags |= OCFS2_JOURNAL_DIRTY_FL;
  	else
  		flags &= ~OCFS2_JOURNAL_DIRTY_FL;
  	fe->id1.journal1.ij_flags = cpu_to_le32(flags);
539d82640   Sunil Mushran   [PATCH 2/2] ocfs2...
604
605
  	if (replayed)
  		ocfs2_bump_recovery_generation(fe);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
  	status = ocfs2_write_block(osb, bh, journal->j_inode);
  	if (status < 0)
  		mlog_errno(status);
  
  out:
  	mlog_exit(status);
  	return status;
  }
  
  /*
   * If the journal has been kmalloc'd it needs to be freed after this
   * call.
   */
  void ocfs2_journal_shutdown(struct ocfs2_super *osb)
  {
  	struct ocfs2_journal *journal = NULL;
  	int status = 0;
  	struct inode *inode = NULL;
  	int num_running_trans = 0;
  
  	mlog_entry_void();
ebdec83ba   Eric Sesterhenn / snakebyte   [PATCH] BUG_ON() ...
627
  	BUG_ON(!osb);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
  
  	journal = osb->journal;
  	if (!journal)
  		goto done;
  
  	inode = journal->j_inode;
  
  	if (journal->j_state != OCFS2_JOURNAL_LOADED)
  		goto done;
  
  	/* need to inc inode use count as journal_destroy will iput. */
  	if (!igrab(inode))
  		BUG();
  
  	num_running_trans = atomic_read(&(osb->journal->j_num_trans));
  	if (num_running_trans > 0)
  		mlog(0, "Shutting down journal: must wait on %d "
  		     "running transactions!
  ",
  		     num_running_trans);
  
  	/* Do a commit_cache here. It will flush our journal, *and*
  	 * release any locks that are still held.
  	 * set the SHUTDOWN flag and release the trans lock.
  	 * the commit thread will take the trans lock for us below. */
  	journal->j_state = OCFS2_JOURNAL_IN_SHUTDOWN;
  
  	/* The OCFS2_JOURNAL_IN_SHUTDOWN will signal to commit_cache to not
  	 * drop the trans_lock (which we want to hold until we
  	 * completely destroy the journal. */
  	if (osb->commit_task) {
  		/* Wait for the commit thread */
  		mlog(0, "Waiting for ocfs2commit to exit....
  ");
  		kthread_stop(osb->commit_task);
  		osb->commit_task = NULL;
  	}
  
  	BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0);
c271c5c22   Sunil Mushran   ocfs2: local mounts
667
668
669
670
671
672
673
674
675
676
677
678
679
  	if (ocfs2_mount_local(osb)) {
  		journal_lock_updates(journal->j_journal);
  		status = journal_flush(journal->j_journal);
  		journal_unlock_updates(journal->j_journal);
  		if (status < 0)
  			mlog_errno(status);
  	}
  
  	if (status == 0) {
  		/*
  		 * Do not toggle if flush was unsuccessful otherwise
  		 * will leave dirty metadata in a "clean" journal
  		 */
539d82640   Sunil Mushran   [PATCH 2/2] ocfs2...
680
  		status = ocfs2_journal_toggle_dirty(osb, 0, 0);
c271c5c22   Sunil Mushran   ocfs2: local mounts
681
682
683
  		if (status < 0)
  			mlog_errno(status);
  	}
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
684
685
686
687
688
689
690
  
  	/* Shutdown the kernel journal system */
  	journal_destroy(journal->j_journal);
  
  	OCFS2_I(inode)->ip_open_count--;
  
  	/* unlock our journal */
e63aecb65   Mark Fasheh   ocfs2: Rename ocf...
691
  	ocfs2_inode_unlock(inode, 1);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
  
  	brelse(journal->j_bh);
  	journal->j_bh = NULL;
  
  	journal->j_state = OCFS2_JOURNAL_FREE;
  
  //	up_write(&journal->j_trans_barrier);
  done:
  	if (inode)
  		iput(inode);
  	mlog_exit_void();
  }
  
  /*
   * If @journal has a recorded error, log it together with a hint that
   * the device needs checking, then acknowledge and clear the error on
   * the journal.  @slot is only used in the log message.
   */
  static void ocfs2_clear_journal_error(struct super_block *sb,
  				      journal_t *journal,
  				      int slot)
  {
  	int olderr;

  	olderr = journal_errno(journal);
  	if (olderr) {
  		/* slot is a signed int, so print it with %d. */
  		mlog(ML_ERROR, "File system error %d recorded in "
  		     "journal %d.\n", olderr, slot);
  		mlog(ML_ERROR, "File system on device %s needs checking.\n",
  		     sb->s_id);

  		journal_ack_err(journal);
  		journal_clear_err(journal);
  	}
  }
539d82640   Sunil Mushran   [PATCH 2/2] ocfs2...
724
  int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
725
726
727
728
729
  {
  	int status = 0;
  	struct ocfs2_super *osb;
  
  	mlog_entry_void();
b1f3550fa   Julia Lawall   ocfs2: Use BUG_ON
730
  	BUG_ON(!journal);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
731
732
733
734
735
736
737
738
739
740
741
  
  	osb = journal->j_osb;
  
  	status = journal_load(journal->j_journal);
  	if (status < 0) {
  		mlog(ML_ERROR, "Failed to load journal!
  ");
  		goto done;
  	}
  
  	ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num);
539d82640   Sunil Mushran   [PATCH 2/2] ocfs2...
742
  	status = ocfs2_journal_toggle_dirty(osb, 1, replayed);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
743
744
745
746
747
748
  	if (status < 0) {
  		mlog_errno(status);
  		goto done;
  	}
  
  	/* Launch the commit thread */
c271c5c22   Sunil Mushran   ocfs2: local mounts
749
750
751
752
753
754
755
756
757
758
759
  	if (!local) {
  		osb->commit_task = kthread_run(ocfs2_commit_thread, osb,
  					       "ocfs2cmt");
  		if (IS_ERR(osb->commit_task)) {
  			status = PTR_ERR(osb->commit_task);
  			osb->commit_task = NULL;
  			mlog(ML_ERROR, "unable to launch ocfs2commit thread, "
  			     "error=%d", status);
  			goto done;
  		}
  	} else
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
760
  		osb->commit_task = NULL;
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
761
762
763
764
765
766
767
768
769
770
771
772
773
774
  
  done:
  	mlog_exit(status);
  	return status;
  }
  
  
  /* 'full' flag tells us whether we clear out all blocks or if we just
   * mark the journal clean */
  int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
  {
  	int status;
  
  	mlog_entry_void();
ebdec83ba   Eric Sesterhenn / snakebyte   [PATCH] BUG_ON() ...
775
  	BUG_ON(!journal);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
776
777
778
779
780
781
  
  	status = journal_wipe(journal->j_journal, full);
  	if (status < 0) {
  		mlog_errno(status);
  		goto bail;
  	}
539d82640   Sunil Mushran   [PATCH 2/2] ocfs2...
782
  	status = ocfs2_journal_toggle_dirty(journal->j_osb, 0, 0);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
783
784
785
786
787
788
789
  	if (status < 0)
  		mlog_errno(status);
  
  bail:
  	mlog_exit(status);
  	return status;
  }
553abd046   Joel Becker   ocfs2: Change the...
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
  /*
   * Report whether the recovery map is empty.
   *
   * The used count is sampled under osb->osb_lock.  Returns non-zero
   * when no nodes are left in the map, 0 otherwise.
   */
  static int ocfs2_recovery_completed(struct ocfs2_super *osb)
  {
  	struct ocfs2_recovery_map *map = osb->recovery_map;
  	unsigned int pending;

  	spin_lock(&osb->osb_lock);
  	pending = map->rm_used;
  	spin_unlock(&osb->osb_lock);

  	return pending == 0;
  }
  
  /*
   * Sleep on osb->recovery_event until ocfs2_recovery_completed()
   * reports that the recovery map is empty.
   */
  void ocfs2_wait_for_recovery(struct ocfs2_super *osb)
  {
  	wait_event(osb->recovery_event, ocfs2_recovery_completed(osb));
  }
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
806
807
808
809
810
811
812
813
814
815
816
817
818
  /*
   * JBD Might read a cached version of another nodes journal file. We
   * don't want this as this file changes often and we get no
   * notification on those changes. The only way to be sure that we've
   * got the most up to date version of those blocks then is to force
   * read them off disk. Just searching through the buffer cache won't
   * work as there may be pages backing this file which are still marked
   * up to date. We know things can't change on this file underneath us
   * as we have the lock by now :)
   */
  static int ocfs2_force_read_journal(struct inode *inode)
  {
  	int status = 0;
4f902c377   Mark Fasheh   ocfs2: Fix extent...
819
  	int i;
8110b073a   Mark Fasheh   ocfs2: Fix up i_b...
820
  	u64 v_blkno, p_blkno, p_blocks, num_blocks;
4f902c377   Mark Fasheh   ocfs2: Fix extent...
821
  #define CONCURRENT_JOURNAL_FILL 32ULL
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
822
823
824
  	struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
  
  	mlog_entry_void();
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
825
  	memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
8110b073a   Mark Fasheh   ocfs2: Fix up i_b...
826
  	num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
827
  	v_blkno = 0;
8110b073a   Mark Fasheh   ocfs2: Fix up i_b...
828
  	while (v_blkno < num_blocks) {
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
829
  		status = ocfs2_extent_map_get_blocks(inode, v_blkno,
49cb8d2d4   Mark Fasheh   ocfs2: Read from ...
830
  						     &p_blkno, &p_blocks, NULL);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
831
832
833
834
835
836
837
  		if (status < 0) {
  			mlog_errno(status);
  			goto bail;
  		}
  
  		if (p_blocks > CONCURRENT_JOURNAL_FILL)
  			p_blocks = CONCURRENT_JOURNAL_FILL;
dd4a2c2bf   Mark Fasheh   ocfs2: Don't popu...
838
839
  		/* We are reading journal data which should not
  		 * be put in the uptodate cache */
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
840
841
  		status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb),
  					   p_blkno, p_blocks, bhs, 0,
dd4a2c2bf   Mark Fasheh   ocfs2: Don't popu...
842
  					   NULL);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
  		if (status < 0) {
  			mlog_errno(status);
  			goto bail;
  		}
  
  		for(i = 0; i < p_blocks; i++) {
  			brelse(bhs[i]);
  			bhs[i] = NULL;
  		}
  
  		v_blkno += p_blocks;
  	}
  
  bail:
  	for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
  		if (bhs[i])
  			brelse(bhs[i]);
  	mlog_exit(status);
  	return status;
  }
  
  /* One queued unit of post-recovery cleanup. Items are built by
   * ocfs2_queue_recovery_completion(), linked on journal->j_la_cleanups
   * and consumed (and freed) by ocfs2_complete_recovery(). */
  struct ocfs2_la_recovery_item {
  	/* Link in the journal's j_la_cleanups list. */
  	struct list_head	lri_list;
  	/* Slot whose orphan dir is processed via ocfs2_recover_orphans(). */
  	int			lri_slot;
  	/* Local alloc dinode copy to clean up; may be NULL. kfree()d by
  	 * ocfs2_complete_recovery(). */
  	struct ocfs2_dinode	*lri_la_dinode;
  	/* Truncate log dinode copy to clean up; may be NULL. kfree()d by
  	 * ocfs2_complete_recovery(). */
  	struct ocfs2_dinode	*lri_tl_dinode;
  };
  
  /* Does the second half of the recovery process. By this point, the
   * node is marked clean and can actually be considered recovered,
   * hence it's no longer in the recovery map, but there's still some
   * cleanup we can do which shouldn't happen within the recovery thread
   * as locking in that context becomes very difficult if we are to take
   * recovering nodes into account.
   *
   * NOTE: This function can and will sleep on recovery of other nodes
   * during cluster locking, just like any other ocfs2 process.
   */
c4028958b   David Howells   WorkStruct: make ...
881
  void ocfs2_complete_recovery(struct work_struct *work)
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
882
883
  {
  	int ret;
c4028958b   David Howells   WorkStruct: make ...
884
885
886
  	struct ocfs2_journal *journal =
  		container_of(work, struct ocfs2_journal, j_recovery_work);
  	struct ocfs2_super *osb = journal->j_osb;
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
887
  	struct ocfs2_dinode *la_dinode, *tl_dinode;
800deef3f   Christoph Hellwig   [PATCH] ocfs2: us...
888
  	struct ocfs2_la_recovery_item *item, *n;
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
889
890
891
892
893
894
895
896
897
898
  	LIST_HEAD(tmp_la_list);
  
  	mlog_entry_void();
  
  	mlog(0, "completing recovery from keventd
  ");
  
  	spin_lock(&journal->j_lock);
  	list_splice_init(&journal->j_la_cleanups, &tmp_la_list);
  	spin_unlock(&journal->j_lock);
800deef3f   Christoph Hellwig   [PATCH] ocfs2: us...
899
  	list_for_each_entry_safe(item, n, &tmp_la_list, lri_list) {
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
900
901
902
903
904
905
906
  		list_del_init(&item->lri_list);
  
  		mlog(0, "Complete recovery for slot %d
  ", item->lri_slot);
  
  		la_dinode = item->lri_la_dinode;
  		if (la_dinode) {
b0697053f   Mark Fasheh   ocfs2: don't use ...
907
908
  			mlog(0, "Clean up local alloc %llu
  ",
1ca1a111b   Mark Fasheh   ocfs2: fix sparse...
909
  			     (unsigned long long)le64_to_cpu(la_dinode->i_blkno));
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
910
911
912
913
914
915
916
917
918
919
920
  
  			ret = ocfs2_complete_local_alloc_recovery(osb,
  								  la_dinode);
  			if (ret < 0)
  				mlog_errno(ret);
  
  			kfree(la_dinode);
  		}
  
  		tl_dinode = item->lri_tl_dinode;
  		if (tl_dinode) {
b0697053f   Mark Fasheh   ocfs2: don't use ...
921
922
  			mlog(0, "Clean up truncate log %llu
  ",
1ca1a111b   Mark Fasheh   ocfs2: fix sparse...
923
  			     (unsigned long long)le64_to_cpu(tl_dinode->i_blkno));
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
  
  			ret = ocfs2_complete_truncate_log_recovery(osb,
  								   tl_dinode);
  			if (ret < 0)
  				mlog_errno(ret);
  
  			kfree(tl_dinode);
  		}
  
  		ret = ocfs2_recover_orphans(osb, item->lri_slot);
  		if (ret < 0)
  			mlog_errno(ret);
  
  		kfree(item);
  	}
  
  	mlog(0, "Recovery completion
  ");
  	mlog_exit_void();
  }
  
  /* NOTE: This function always eats your references to la_dinode and
   * tl_dinode, either manually on error, or by passing them to
   * ocfs2_complete_recovery */
  static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
  					    int slot_num,
  					    struct ocfs2_dinode *la_dinode,
  					    struct ocfs2_dinode *tl_dinode)
  {
  	struct ocfs2_la_recovery_item *item;
afae00ab4   Sunil Mushran   ocfs2: fix gfp ma...
954
  	item = kmalloc(sizeof(struct ocfs2_la_recovery_item), GFP_NOFS);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
  	if (!item) {
  		/* Though we wish to avoid it, we are in fact safe in
  		 * skipping local alloc cleanup as fsck.ocfs2 is more
  		 * than capable of reclaiming unused space. */
  		if (la_dinode)
  			kfree(la_dinode);
  
  		if (tl_dinode)
  			kfree(tl_dinode);
  
  		mlog_errno(-ENOMEM);
  		return;
  	}
  
  	INIT_LIST_HEAD(&item->lri_list);
  	item->lri_la_dinode = la_dinode;
  	item->lri_slot = slot_num;
  	item->lri_tl_dinode = tl_dinode;
  
  	spin_lock(&journal->j_lock);
  	list_add_tail(&item->lri_list, &journal->j_la_cleanups);
  	queue_work(ocfs2_wq, &journal->j_recovery_work);
  	spin_unlock(&journal->j_lock);
  }
  
  /* Called by the mount code to queue the last part of recovery for
   * its own slot. Does nothing unless osb->dirty is set; on the dirty
   * path ownership of osb->local_alloc_copy moves to the queued item. */
  void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
  {
  	struct ocfs2_journal *journal = osb->journal;
  
  	if (!osb->dirty)
  		return;
  
  	/* No need to queue up our truncate_log as regular
  	 * cleanup will catch that. */
  	ocfs2_queue_recovery_completion(journal, osb->slot_num,
  					osb->local_alloc_copy, NULL);
  	ocfs2_schedule_truncate_log_flush(osb, 0);
  
  	osb->local_alloc_copy = NULL;
  	osb->dirty = 0;
  }
  
  static int __ocfs2_recovery_thread(void *arg)
  {
  	int status, node_num;
  	struct ocfs2_super *osb = arg;
553abd046   Joel Becker   ocfs2: Change the...
1004
  	struct ocfs2_recovery_map *rm = osb->recovery_map;
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
  
  	mlog_entry_void();
  
  	status = ocfs2_wait_on_mount(osb);
  	if (status < 0) {
  		goto bail;
  	}
  
  restart:
  	status = ocfs2_super_lock(osb, 1);
  	if (status < 0) {
  		mlog_errno(status);
  		goto bail;
  	}
553abd046   Joel Becker   ocfs2: Change the...
1019
1020
1021
1022
1023
1024
  	spin_lock(&osb->osb_lock);
  	while (rm->rm_used) {
  		/* It's always safe to remove entry zero, as we won't
  		 * clear it until ocfs2_recover_node() has succeeded. */
  		node_num = rm->rm_entries[0];
  		spin_unlock(&osb->osb_lock);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1025
1026
  
  		status = ocfs2_recover_node(osb, node_num);
553abd046   Joel Becker   ocfs2: Change the...
1027
1028
1029
  		if (!status) {
  			ocfs2_recovery_map_clear(osb, node_num);
  		} else {
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1030
1031
1032
1033
1034
1035
1036
  			mlog(ML_ERROR,
  			     "Error %d recovering node %d on device (%u,%u)!
  ",
  			     status, node_num,
  			     MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
  			mlog(ML_ERROR, "Volume requires unmount.
  ");
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1037
  		}
553abd046   Joel Becker   ocfs2: Change the...
1038
  		spin_lock(&osb->osb_lock);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1039
  	}
553abd046   Joel Becker   ocfs2: Change the...
1040
1041
1042
  	spin_unlock(&osb->osb_lock);
  	mlog(0, "All nodes recovered
  ");
539d82640   Sunil Mushran   [PATCH 2/2] ocfs2...
1043
1044
1045
1046
1047
  	/* Refresh all journal recovery generations from disk */
  	status = ocfs2_check_journals_nolocks(osb);
  	status = (status == -EROFS) ? 0 : status;
  	if (status < 0)
  		mlog_errno(status);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1048
1049
1050
  	ocfs2_super_unlock(osb, 1);
  
  	/* We always run recovery on our own orphan dir - the dead
34d024f84   Mark Fasheh   ocfs2: Remove mou...
1051
1052
  	 * node(s) may have disallowd a previos inode delete. Re-processing
  	 * is therefore required. */
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1053
1054
1055
1056
  	ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
  					NULL);
  
  bail:
c74ec2f77   Arjan van de Ven   [PATCH] ocfs2: Se...
1057
  	mutex_lock(&osb->recovery_lock);
553abd046   Joel Becker   ocfs2: Change the...
1058
  	if (!status && !ocfs2_recovery_completed(osb)) {
c74ec2f77   Arjan van de Ven   [PATCH] ocfs2: Se...
1059
  		mutex_unlock(&osb->recovery_lock);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1060
1061
1062
1063
1064
1065
  		goto restart;
  	}
  
  	osb->recovery_thread_task = NULL;
  	mb(); /* sync with ocfs2_recovery_thread_running */
  	wake_up(&osb->recovery_event);
c74ec2f77   Arjan van de Ven   [PATCH] ocfs2: Se...
1066
  	mutex_unlock(&osb->recovery_lock);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
  
  	mlog_exit(status);
  	/* no one is callint kthread_stop() for us so the kthread() api
  	 * requires that we call do_exit().  And it isn't exported, but
  	 * complete_and_exit() seems to be a minimal wrapper around it. */
  	complete_and_exit(NULL, status);
  	return status;
  }
  
  void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
  {
  	mlog_entry("(node_num=%d, osb->node_num = %d)
  ",
  		   node_num, osb->node_num);
c74ec2f77   Arjan van de Ven   [PATCH] ocfs2: Se...
1081
  	mutex_lock(&osb->recovery_lock);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1082
1083
1084
1085
1086
  	if (osb->disable_recovery)
  		goto out;
  
  	/* People waiting on recovery will wait on
  	 * the recovery map to empty. */
553abd046   Joel Becker   ocfs2: Change the...
1087
1088
1089
  	if (ocfs2_recovery_map_set(osb, node_num))
  		mlog(0, "node %d already in recovery map.
  ", node_num);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1090
1091
1092
1093
1094
1095
1096
1097
  
  	mlog(0, "starting recovery thread...
  ");
  
  	if (osb->recovery_thread_task)
  		goto out;
  
  	osb->recovery_thread_task =  kthread_run(__ocfs2_recovery_thread, osb,
784270435   Mark Fasheh   ocfs2: clean up s...
1098
  						 "ocfs2rec");
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1099
1100
1101
1102
1103
1104
  	if (IS_ERR(osb->recovery_thread_task)) {
  		mlog_errno((int)PTR_ERR(osb->recovery_thread_task));
  		osb->recovery_thread_task = NULL;
  	}
  
  out:
c74ec2f77   Arjan van de Ven   [PATCH] ocfs2: Se...
1105
  	mutex_unlock(&osb->recovery_lock);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1106
1107
1108
1109
  	wake_up(&osb->recovery_event);
  
  	mlog_exit_void();
  }
539d82640   Sunil Mushran   [PATCH 2/2] ocfs2...
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
  /* Look up the journal system inode for @slot_num and read its dinode
   * block into *@bh.
   *
   * On success returns 0; the inode reference is handed to the caller
   * through *@ret_inode when @ret_inode is non-NULL and dropped
   * otherwise. On failure the inode reference is always dropped and a
   * negative status is returned (-EACCES when the inode is missing or
   * bad, otherwise the status from ocfs2_read_block()). */
  static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
  				    int slot_num,
  				    struct buffer_head **bh,
  				    struct inode **ret_inode)
  {
  	struct inode *jinode;
  	int status = -EACCES;
  
  	BUG_ON(slot_num >= osb->max_slots);
  
  	jinode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
  					     slot_num);
  	if (!jinode) {
  		mlog_errno(status);
  		return status;
  	}
  	if (is_bad_inode(jinode)) {
  		mlog_errno(status);
  		goto drop;
  	}
  	SET_INODE_JOURNAL(jinode);
  
  	status = ocfs2_read_block(osb, OCFS2_I(jinode)->ip_blkno, bh, 0,
  				  jinode);
  	if (status < 0) {
  		mlog_errno(status);
  		goto drop;
  	}
  
  	status = 0;
  	if (ret_inode) {
  		/* Caller takes over our reference. */
  		*ret_inode = jinode;
  		return status;
  	}
  
  drop:
  	iput(jinode);
  	return status;
  }
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
  /* Does the actual journal replay and marks the journal inode as
   * clean. Will only replay if the journal inode is marked dirty. */
  static int ocfs2_replay_journal(struct ocfs2_super *osb,
  				int node_num,
  				int slot_num)
  {
  	int status;
  	int got_lock = 0;
  	unsigned int flags;
  	struct inode *inode = NULL;
  	struct ocfs2_dinode *fe;
  	journal_t *journal = NULL;
  	struct buffer_head *bh = NULL;
539d82640   Sunil Mushran   [PATCH 2/2] ocfs2...
1158
  	u32 slot_reco_gen;
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1159

539d82640   Sunil Mushran   [PATCH 2/2] ocfs2...
1160
1161
  	status = ocfs2_read_journal_inode(osb, slot_num, &bh, &inode);
  	if (status) {
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1162
1163
1164
  		mlog_errno(status);
  		goto done;
  	}
539d82640   Sunil Mushran   [PATCH 2/2] ocfs2...
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
  
  	fe = (struct ocfs2_dinode *)bh->b_data;
  	slot_reco_gen = ocfs2_get_recovery_generation(fe);
  	brelse(bh);
  	bh = NULL;
  
  	/*
  	 * As the fs recovery is asynchronous, there is a small chance that
  	 * another node mounted (and recovered) the slot before the recovery
  	 * thread could get the lock. To handle that, we dirty read the journal
  	 * inode for that slot to get the recovery generation. If it is
  	 * different than what we expected, the slot has been recovered.
  	 * If not, it needs recovery.
  	 */
  	if (osb->slot_recovery_generations[slot_num] != slot_reco_gen) {
  		mlog(0, "Slot %u already recovered (old/new=%u/%u)
  ", slot_num,
  		     osb->slot_recovery_generations[slot_num], slot_reco_gen);
  		osb->slot_recovery_generations[slot_num] = slot_reco_gen;
  		status = -EBUSY;
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1185
1186
  		goto done;
  	}
539d82640   Sunil Mushran   [PATCH 2/2] ocfs2...
1187
1188
  
  	/* Continue with recovery as the journal has not yet been recovered */
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1189

e63aecb65   Mark Fasheh   ocfs2: Rename ocf...
1190
  	status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1191
  	if (status < 0) {
e63aecb65   Mark Fasheh   ocfs2: Rename ocf...
1192
1193
  		mlog(0, "status returned from ocfs2_inode_lock=%d
  ", status);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
  		if (status != -ERESTARTSYS)
  			mlog(ML_ERROR, "Could not lock journal!
  ");
  		goto done;
  	}
  	got_lock = 1;
  
  	fe = (struct ocfs2_dinode *) bh->b_data;
  
  	flags = le32_to_cpu(fe->id1.journal1.ij_flags);
539d82640   Sunil Mushran   [PATCH 2/2] ocfs2...
1204
  	slot_reco_gen = ocfs2_get_recovery_generation(fe);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1205
1206
1207
1208
  
  	if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) {
  		mlog(0, "No recovery required for node %d
  ", node_num);
539d82640   Sunil Mushran   [PATCH 2/2] ocfs2...
1209
1210
  		/* Refresh recovery generation for the slot */
  		osb->slot_recovery_generations[slot_num] = slot_reco_gen;
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
  		goto done;
  	}
  
  	mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)
  ",
  	     node_num, slot_num,
  	     MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
  
  	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
  
  	status = ocfs2_force_read_journal(inode);
  	if (status < 0) {
  		mlog_errno(status);
  		goto done;
  	}
  
  	mlog(0, "calling journal_init_inode
  ");
  	journal = journal_init_inode(inode);
  	if (journal == NULL) {
  		mlog(ML_ERROR, "Linux journal layer error
  ");
  		status = -EIO;
  		goto done;
  	}
  
  	status = journal_load(journal);
  	if (status < 0) {
  		mlog_errno(status);
  		if (!igrab(inode))
  			BUG();
  		journal_destroy(journal);
  		goto done;
  	}
  
  	ocfs2_clear_journal_error(osb->sb, journal, slot_num);
  
  	/* wipe the journal */
  	mlog(0, "flushing the journal.
  ");
  	journal_lock_updates(journal);
  	status = journal_flush(journal);
  	journal_unlock_updates(journal);
  	if (status < 0)
  		mlog_errno(status);
  
  	/* This will mark the node clean */
  	flags = le32_to_cpu(fe->id1.journal1.ij_flags);
  	flags &= ~OCFS2_JOURNAL_DIRTY_FL;
  	fe->id1.journal1.ij_flags = cpu_to_le32(flags);
539d82640   Sunil Mushran   [PATCH 2/2] ocfs2...
1261
1262
1263
1264
  	/* Increment recovery generation to indicate successful recovery */
  	ocfs2_bump_recovery_generation(fe);
  	osb->slot_recovery_generations[slot_num] =
  					ocfs2_get_recovery_generation(fe);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
  	status = ocfs2_write_block(osb, bh, inode);
  	if (status < 0)
  		mlog_errno(status);
  
  	if (!igrab(inode))
  		BUG();
  
  	journal_destroy(journal);
  
  done:
  	/* drop the lock on this nodes journal */
  	if (got_lock)
e63aecb65   Mark Fasheh   ocfs2: Rename ocf...
1277
  		ocfs2_inode_unlock(inode, 1);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
  
  	if (inode)
  		iput(inode);
  
  	if (bh)
  		brelse(bh);
  
  	mlog_exit(status);
  	return status;
  }
  
  /*
   * Do the most important parts of node recovery:
   *  - Replay it's journal
   *  - Stamp a clean local allocator file
   *  - Stamp a clean truncate log
   *  - Mark the node clean
   *
   * If this function completes without error, a node in OCFS2 can be
   * said to have been safely recovered. As a result, failure during the
   * second part of a nodes recovery process (local alloc recovery) is
   * far less concerning.
   */
  static int ocfs2_recover_node(struct ocfs2_super *osb,
  			      int node_num)
  {
  	int status = 0;
  	int slot_num;
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
  	struct ocfs2_dinode *la_copy = NULL;
  	struct ocfs2_dinode *tl_copy = NULL;
  
  	mlog_entry("(node_num=%d, osb->node_num = %d)
  ",
  		   node_num, osb->node_num);
  
  	mlog(0, "checking node %d
  ", node_num);
  
  	/* Should not ever be called to recover ourselves -- in that
  	 * case we should've called ocfs2_journal_load instead. */
ebdec83ba   Eric Sesterhenn / snakebyte   [PATCH] BUG_ON() ...
1318
  	BUG_ON(osb->node_num == node_num);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1319

d85b20e4b   Joel Becker   ocfs2: Make ocfs2...
1320
1321
  	slot_num = ocfs2_node_num_to_slot(osb, node_num);
  	if (slot_num == -ENOENT) {
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
  		status = 0;
  		mlog(0, "no slot for this node, so no recovery required.
  ");
  		goto done;
  	}
  
  	mlog(0, "node %d was using slot %d
  ", node_num, slot_num);
  
  	status = ocfs2_replay_journal(osb, node_num, slot_num);
  	if (status < 0) {
539d82640   Sunil Mushran   [PATCH 2/2] ocfs2...
1333
1334
1335
1336
1337
1338
1339
1340
  		if (status == -EBUSY) {
  			mlog(0, "Skipping recovery for slot %u (node %u) "
  			     "as another node has recovered it
  ", slot_num,
  			     node_num);
  			status = 0;
  			goto done;
  		}
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
  		mlog_errno(status);
  		goto done;
  	}
  
  	/* Stamp a clean local alloc file AFTER recovering the journal... */
  	status = ocfs2_begin_local_alloc_recovery(osb, slot_num, &la_copy);
  	if (status < 0) {
  		mlog_errno(status);
  		goto done;
  	}
  
  	/* An error from begin_truncate_log_recovery is not
  	 * serious enough to warrant halting the rest of
  	 * recovery. */
  	status = ocfs2_begin_truncate_log_recovery(osb, slot_num, &tl_copy);
  	if (status < 0)
  		mlog_errno(status);
  
  	/* Likewise, this would be a strange but ultimately not so
  	 * harmful place to get an error... */
8e8a4603b   Mark Fasheh   ocfs2: Move slot ...
1361
  	status = ocfs2_clear_slot(osb, slot_num);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
  	if (status < 0)
  		mlog_errno(status);
  
  	/* This will kfree the memory pointed to by la_copy and tl_copy */
  	ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
  					tl_copy);
  
  	status = 0;
  done:
  
  	mlog_exit(status);
  	return status;
  }
  
  /* Test node liveness by trylocking his journal. If we get the lock,
   * we drop it here. Return 0 if we got the lock, -EAGAIN if node is
   * still alive (we couldn't get the lock) and < 0 on error. */
  static int ocfs2_trylock_journal(struct ocfs2_super *osb,
  				 int slot_num)
  {
  	int status, flags;
  	struct inode *inode = NULL;
  
  	inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
  					    slot_num);
  	if (inode == NULL) {
  		mlog(ML_ERROR, "access error
  ");
  		status = -EACCES;
  		goto bail;
  	}
  	if (is_bad_inode(inode)) {
  		mlog(ML_ERROR, "access error (bad inode)
  ");
  		iput(inode);
  		inode = NULL;
  		status = -EACCES;
  		goto bail;
  	}
  	SET_INODE_JOURNAL(inode);
  
  	flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE;
e63aecb65   Mark Fasheh   ocfs2: Rename ocf...
1404
  	status = ocfs2_inode_lock_full(inode, NULL, 1, flags);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1405
1406
1407
1408
1409
  	if (status < 0) {
  		if (status != -EAGAIN)
  			mlog_errno(status);
  		goto bail;
  	}
e63aecb65   Mark Fasheh   ocfs2: Rename ocf...
1410
  	ocfs2_inode_unlock(inode, 1);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
  bail:
  	if (inode)
  		iput(inode);
  
  	return status;
  }
  
  /* Call this underneath ocfs2_super_lock. It also assumes that the
   * slot info struct has been updated from disk. */
  int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
  {
d85b20e4b   Joel Becker   ocfs2: Make ocfs2...
1422
1423
  	unsigned int node_num;
  	int status, i;
539d82640   Sunil Mushran   [PATCH 2/2] ocfs2...
1424
1425
  	struct buffer_head *bh = NULL;
  	struct ocfs2_dinode *di;
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1426
1427
1428
  
  	/* This is called with the super block cluster lock, so we
  	 * know that the slot map can't change underneath us. */
d85b20e4b   Joel Becker   ocfs2: Make ocfs2...
1429
1430
  	spin_lock(&osb->osb_lock);
  	for (i = 0; i < osb->max_slots; i++) {
539d82640   Sunil Mushran   [PATCH 2/2] ocfs2...
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
  		/* Read journal inode to get the recovery generation */
  		status = ocfs2_read_journal_inode(osb, i, &bh, NULL);
  		if (status) {
  			mlog_errno(status);
  			goto bail;
  		}
  		di = (struct ocfs2_dinode *)bh->b_data;
  		osb->slot_recovery_generations[i] =
  					ocfs2_get_recovery_generation(di);
  		brelse(bh);
  		bh = NULL;
  
  		mlog(0, "Slot %u recovery generation is %u
  ", i,
  		     osb->slot_recovery_generations[i]);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1446
1447
  		if (i == osb->slot_num)
  			continue;
d85b20e4b   Joel Becker   ocfs2: Make ocfs2...
1448
1449
1450
  
  		status = ocfs2_slot_to_node_num_locked(osb, i, &node_num);
  		if (status == -ENOENT)
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1451
  			continue;
553abd046   Joel Becker   ocfs2: Change the...
1452
  		if (__ocfs2_recovery_map_test(osb, node_num))
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1453
  			continue;
d85b20e4b   Joel Becker   ocfs2: Make ocfs2...
1454
  		spin_unlock(&osb->osb_lock);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
  
  		/* Ok, we have a slot occupied by another node which
  		 * is not in the recovery map. We trylock his journal
  		 * file here to test if he's alive. */
  		status = ocfs2_trylock_journal(osb, i);
  		if (!status) {
  			/* Since we're called from mount, we know that
  			 * the recovery thread can't race us on
  			 * setting / checking the recovery bits. */
  			ocfs2_recovery_thread(osb, node_num);
  		} else if ((status < 0) && (status != -EAGAIN)) {
  			mlog_errno(status);
  			goto bail;
  		}
d85b20e4b   Joel Becker   ocfs2: Make ocfs2...
1469
  		spin_lock(&osb->osb_lock);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1470
  	}
d85b20e4b   Joel Becker   ocfs2: Make ocfs2...
1471
  	spin_unlock(&osb->osb_lock);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1472
1473
1474
1475
1476
1477
  
  	status = 0;
  bail:
  	mlog_exit(status);
  	return status;
  }
5eae5b96f   Mark Fasheh   ocfs2: Remove ope...
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
  /* Private state handed to ocfs2_orphan_filldir() through
   * ocfs2_dir_foreach() while an orphan directory is scanned. */
  struct ocfs2_orphan_filldir_priv {
  	/* Most recently queued orphan inode; earlier ones are chained
  	 * behind it through ip_next_orphan. */
  	struct inode		*head;
  	/* Super block info passed to ocfs2_iget() for each entry. */
  	struct ocfs2_super	*osb;
  };
  
  /* Directory iteration callback for the orphan dir scan. @priv is a
   * struct ocfs2_orphan_filldir_priv. Every entry other than "." and
   * ".." is looked up with ocfs2_iget() and pushed onto the front of
   * the priv->head chain. Always returns 0 so that iteration continues;
   * entries whose inode cannot be obtained are skipped. */
  static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len,
  				loff_t pos, u64 ino, unsigned type)
  {
  	struct ocfs2_orphan_filldir_priv *ctx = priv;
  	struct inode *orphan;
  
  	if ((name_len == 1 && !strncmp(".", name, 1)) ||
  	    (name_len == 2 && !strncmp("..", name, 2)))
  		return 0;
  
  	/* Skip bad inodes so that recovery can continue */
  	orphan = ocfs2_iget(ctx->osb, ino,
  			    OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0);
  	if (IS_ERR(orphan))
  		return 0;
  
  	mlog(0, "queue orphan %llu\n",
  	     (unsigned long long)OCFS2_I(orphan)->ip_blkno);
  	/* No locking is required for the next_orphan queue as there
  	 * is only ever a single process doing orphan recovery. */
  	OCFS2_I(orphan)->ip_next_orphan = ctx->head;
  	ctx->head = orphan;
  
  	return 0;
  }
b4df6ed8d   Mark Fasheh   [PATCH] ocfs2: fi...
1510
1511
1512
  static int ocfs2_queue_orphans(struct ocfs2_super *osb,
  			       int slot,
  			       struct inode **head)
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1513
  {
b4df6ed8d   Mark Fasheh   [PATCH] ocfs2: fi...
1514
  	int status;
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1515
  	struct inode *orphan_dir_inode = NULL;
5eae5b96f   Mark Fasheh   ocfs2: Remove ope...
1516
1517
1518
1519
1520
  	struct ocfs2_orphan_filldir_priv priv;
  	loff_t pos = 0;
  
  	priv.osb = osb;
  	priv.head = *head;
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1521
1522
1523
1524
1525
1526
1527
  
  	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
  						       ORPHAN_DIR_SYSTEM_INODE,
  						       slot);
  	if  (!orphan_dir_inode) {
  		status = -ENOENT;
  		mlog_errno(status);
b4df6ed8d   Mark Fasheh   [PATCH] ocfs2: fi...
1528
1529
  		return status;
  	}	
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1530

1b1dcc1b5   Jes Sorensen   [PATCH] mutex sub...
1531
  	mutex_lock(&orphan_dir_inode->i_mutex);
e63aecb65   Mark Fasheh   ocfs2: Rename ocf...
1532
  	status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1533
  	if (status < 0) {
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1534
1535
1536
  		mlog_errno(status);
  		goto out;
  	}
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1537

5eae5b96f   Mark Fasheh   ocfs2: Remove ope...
1538
1539
1540
1541
  	status = ocfs2_dir_foreach(orphan_dir_inode, &pos, &priv,
  				   ocfs2_orphan_filldir);
  	if (status) {
  		mlog_errno(status);
a86370fbb   Mark Fasheh   ocfs2: fix exit-w...
1542
  		goto out_cluster;
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1543
  	}
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1544

5eae5b96f   Mark Fasheh   ocfs2: Remove ope...
1545
  	*head = priv.head;
a86370fbb   Mark Fasheh   ocfs2: fix exit-w...
1546
  out_cluster:
e63aecb65   Mark Fasheh   ocfs2: Rename ocf...
1547
  	ocfs2_inode_unlock(orphan_dir_inode, 0);
b4df6ed8d   Mark Fasheh   [PATCH] ocfs2: fi...
1548
1549
  out:
  	mutex_unlock(&orphan_dir_inode->i_mutex);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1550
  	iput(orphan_dir_inode);
b4df6ed8d   Mark Fasheh   [PATCH] ocfs2: fi...
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
  	return status;
  }
  
  /* Wait condition for ocfs2_mark_recovering_orphan_dir(): returns 1
   * when osb_orphan_wipes[@slot], sampled under osb_lock, is zero, and
   * 0 otherwise. */
  static int ocfs2_orphan_recovery_can_continue(struct ocfs2_super *osb,
  					      int slot)
  {
  	int idle;
  
  	spin_lock(&osb->osb_lock);
  	idle = (osb->osb_orphan_wipes[slot] == 0);
  	spin_unlock(&osb->osb_lock);
  
  	return idle;
  }
  
  /* Flag the orphan dir of @slot as being recovered, then wait until
   * osb_orphan_wipes[@slot] has dropped to zero. */
  static void ocfs2_mark_recovering_orphan_dir(struct ocfs2_super *osb,
  					     int slot)
  {
  	spin_lock(&osb->osb_lock);
  	/* Mark ourselves such that new processes in delete_inode()
  	 * know to quit early. */
  	ocfs2_node_map_set_bit(osb, &osb->osb_recovering_orphan_dirs, slot);
  	for (;;) {
  		if (!osb->osb_orphan_wipes[slot])
  			break;
  
  		/* If any processes are already in the middle of an
  		 * orphan wipe on this dir, then we need to wait for
  		 * them. The lock is dropped while sleeping. */
  		spin_unlock(&osb->osb_lock);
  		wait_event_interruptible(osb->osb_wipe_event,
  					 ocfs2_orphan_recovery_can_continue(osb, slot));
  		spin_lock(&osb->osb_lock);
  	}
  	spin_unlock(&osb->osb_lock);
  }
  
  /* Undo ocfs2_mark_recovering_orphan_dir(): clear the bit for @slot in
   * osb->osb_recovering_orphan_dirs. */
  static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb,
  					      int slot)
  {
  	ocfs2_node_map_clear_bit(osb, &osb->osb_recovering_orphan_dirs, slot);
  }
  
  /*
   * Orphan recovery. Each mounted node has it's own orphan dir which we
   * must run during recovery. Our strategy here is to build a list of
   * the inodes in the orphan dir and iget/iput them. The VFS does
   * (most) of the rest of the work.
   *
   * Orphan recovery can happen at any time, not just mount so we have a
   * couple of extra considerations.
   *
   * - We grab as many inodes as we can under the orphan dir lock -
   *   doing iget() outside the orphan dir risks getting a reference on
   *   an invalid inode.
   * - We must be sure not to deadlock with other processes on the
   *   system wanting to run delete_inode(). This can happen when they go
   *   to lock the orphan dir and the orphan recovery process attempts to
   *   iget() inside the orphan dir lock. This can be avoided by
   *   advertising our state to ocfs2_delete_inode().
   */
  static int ocfs2_recover_orphans(struct ocfs2_super *osb,
  				 int slot)
  {
  	int ret = 0;
  	struct inode *inode = NULL;
  	struct inode *iter;
  	struct ocfs2_inode_info *oi;
  
  	mlog(0, "Recover inodes from orphan dir in slot %d
  ", slot);
  
  	ocfs2_mark_recovering_orphan_dir(osb, slot);
  	ret = ocfs2_queue_orphans(osb, slot, &inode);
  	ocfs2_clear_recovering_orphan_dir(osb, slot);
  
  	/* Error here should be noted, but we want to continue with as
  	 * many queued inodes as we've got. */
  	if (ret)
  		mlog_errno(ret);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1627
1628
1629
  
  	while (inode) {
  		oi = OCFS2_I(inode);
b0697053f   Mark Fasheh   ocfs2: don't use ...
1630
1631
  		mlog(0, "iput orphan %llu
  ", (unsigned long long)oi->ip_blkno);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1632
1633
1634
1635
  
  		iter = oi->ip_next_orphan;
  
  		spin_lock(&oi->ip_lock);
34d024f84   Mark Fasheh   ocfs2: Remove mou...
1636
1637
1638
1639
  		/* The remote delete code may have set these on the
  		 * assumption that the other node would wipe them
  		 * successfully.  If they are still in the node's
  		 * orphan dir, we need to reset that state. */
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1640
1641
1642
1643
1644
  		oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
  
  		/* Set the proper information to get us going into
  		 * ocfs2_delete_inode. */
  		oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1645
1646
1647
1648
1649
1650
  		spin_unlock(&oi->ip_lock);
  
  		iput(inode);
  
  		inode = iter;
  	}
b4df6ed8d   Mark Fasheh   [PATCH] ocfs2: fi...
1651
  	return ret;
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
  }
  
  static int ocfs2_wait_on_mount(struct ocfs2_super *osb)
  {
  	/* This check is good because ocfs2 will wait on our recovery
  	 * thread before changing it to something other than MOUNTED
  	 * or DISABLED. */
  	wait_event(osb->osb_mount_event,
  		   atomic_read(&osb->vol_state) == VOLUME_MOUNTED ||
  		   atomic_read(&osb->vol_state) == VOLUME_DISABLED);
  
  	/* If there's an error on mount, then we may never get to the
  	 * MOUNTED flag, but this is set right before
  	 * dismount_volume() so we can trust it. */
  	if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) {
  		mlog(0, "mount error, exiting!
  ");
  		return -EBUSY;
  	}
  
  	return 0;
  }
  
  /*
   * Thread function that repeatedly commits the journal cache.
   *
   * @arg is the struct ocfs2_super of the volume.  Each pass sleeps on
   * osb->checkpoint_event until journal->j_num_trans is non-zero or
   * kthread_should_stop() is true, then calls ocfs2_commit_cache() and
   * logs any negative status.  The loop ends only when a stop has been
   * requested and j_num_trans has reached zero.
   *
   * Always returns 0.
   */
  static int ocfs2_commit_thread(void *arg)
  {
  	int status;
  	struct ocfs2_super *osb = arg;
  	struct ocfs2_journal *journal = osb->journal;

  	/* we can trust j_num_trans here because _should_stop() is only set in
  	 * shutdown and nobody other than ourselves should be able to start
  	 * transactions.  committing on shutdown might take a few iterations
  	 * as final transactions put deleted inodes on the list */
  	while (!(kthread_should_stop() &&
  		 atomic_read(&journal->j_num_trans) == 0)) {

  		wait_event_interruptible(osb->checkpoint_event,
  					 atomic_read(&journal->j_num_trans)
  					 || kthread_should_stop());

  		/* A failed commit is logged; the loop carries on regardless. */
  		status = ocfs2_commit_cache(osb);
  		if (status < 0)
  			mlog_errno(status);

  		/* Stop requested but transactions remain: report the count
  		 * and go around again. */
  		if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){
  			mlog(ML_KTHREAD,
  			     "commit_thread: %u transactions pending on "
  			     "shutdown\n",
  			     atomic_read(&journal->j_num_trans));
  		}
  	}

  	return 0;
  }
  
  /* Reads all the journal inodes without taking any cluster locks. Used
   * for hard readonly access to determine whether any journal requires
   * recovery. Also used to refresh the recovery generation numbers after
   * a journal has been recovered by another node.
   */
  int ocfs2_check_journals_nolocks(struct ocfs2_super *osb)
  {
  	int ret = 0;
  	unsigned int slot;
539d82640   Sunil Mushran   [PATCH 2/2] ocfs2...
1715
  	struct buffer_head *di_bh = NULL;
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1716
  	struct ocfs2_dinode *di;
539d82640   Sunil Mushran   [PATCH 2/2] ocfs2...
1717
  	int journal_dirty = 0;
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1718
1719
  
  	for(slot = 0; slot < osb->max_slots; slot++) {
539d82640   Sunil Mushran   [PATCH 2/2] ocfs2...
1720
1721
  		ret = ocfs2_read_journal_inode(osb, slot, &di_bh, NULL);
  		if (ret) {
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1722
1723
1724
1725
1726
  			mlog_errno(ret);
  			goto out;
  		}
  
  		di = (struct ocfs2_dinode *) di_bh->b_data;
539d82640   Sunil Mushran   [PATCH 2/2] ocfs2...
1727
1728
  		osb->slot_recovery_generations[slot] =
  					ocfs2_get_recovery_generation(di);
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1729
1730
  		if (le32_to_cpu(di->id1.journal1.ij_flags) &
  		    OCFS2_JOURNAL_DIRTY_FL)
539d82640   Sunil Mushran   [PATCH 2/2] ocfs2...
1731
  			journal_dirty = 1;
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1732
1733
  
  		brelse(di_bh);
539d82640   Sunil Mushran   [PATCH 2/2] ocfs2...
1734
  		di_bh = NULL;
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1735
1736
1737
  	}
  
  out:
539d82640   Sunil Mushran   [PATCH 2/2] ocfs2...
1738
1739
  	if (journal_dirty)
  		ret = -EROFS;
ccd979bdb   Mark Fasheh   [PATCH] OCFS2: Th...
1740
1741
  	return ret;
  }