Blame view

fs/jbd/transaction.c 65 KB
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1
  /*
588626996   Uwe Kleine-König   fix file specific...
2
   * linux/fs/jbd/transaction.c
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
3
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4
5
6
7
8
9
10
11
12
   * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
   *
   * Copyright 1998 Red Hat corp --- All Rights Reserved
   *
   * This file is part of the Linux kernel and is made available under
   * the terms of the GNU General Public License, version 2, or at your
   * option, any later version, incorporated herein by reference.
   *
   * Generic filesystem transaction handling code; part of the ext2fs
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
13
   * journaling system.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
14
15
16
17
18
19
20
21
22
23
24
25
   *
   * This file manages transactions (compound commits managed by the
   * journaling code) and handles (individual atomic operations by the
   * filesystem).
   */
  
  #include <linux/time.h>
  #include <linux/fs.h>
  #include <linux/jbd.h>
  #include <linux/errno.h>
  #include <linux/slab.h>
  #include <linux/timer.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
26
27
  #include <linux/mm.h>
  #include <linux/highmem.h>
f420d4dc4   Josef Bacik   jbd: improve fsyn...
28
  #include <linux/hrtimer.h>
05713082a   Jan Kara   jbd: remove depen...
29
  #include <linux/backing-dev.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
30

d394e122b   Adrian Bunk   [PATCH] make fs/j...
31
  /* Forward declaration of a file-local helper that takes a journal_head. */
  static void __journal_temp_unlink_buffer(struct journal_head *jh);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
  /*
   * get_transaction: obtain a new transaction_t object.
   *
   * Simply allocate and initialise a new transaction.  Create it in
   * RUNNING state and add it to the current journal (which should not
   * have an existing running transaction: we only make a new transaction
   * once we have started to commit the old one).
   *
   * Preconditions:
   *	The journal MUST be locked.  We don't perform atomic mallocs on the
   *	new transaction	and we can't block without protecting against other
   *	processes trying to touch the journal while it is in transition.
   *
   * Called under j_state_lock
   */
  
  static transaction_t *
  get_transaction(journal_t *journal, transaction_t *transaction)
  {
  	/*
  	 * Initialise the caller-supplied transaction: bind it to the journal,
  	 * mark it RUNNING, stamp its start time, give it the next transaction
  	 * ID and compute when it should expire.
  	 */
  	transaction->t_journal = journal;
  	transaction->t_state = T_RUNNING;
  	transaction->t_start_time = ktime_get();
  	transaction->t_tid = journal->j_transaction_sequence++;
  	transaction->t_expires = jiffies + journal->j_commit_interval;
  	spin_lock_init(&transaction->t_handle_lock);
  
  	/* Set up the commit timer for the new transaction. */
  	journal->j_commit_timer.expires =
  				round_jiffies_up(transaction->t_expires);
  	add_timer(&journal->j_commit_timer);
  
  	/* There must be no running transaction; install this one as such. */
  	J_ASSERT(journal->j_running_transaction == NULL);
  	journal->j_running_transaction = transaction;
  
  	return transaction;
  }
  
  /*
   * Handle management.
   *
   * A handle_t is an object which represents a single atomic update to a
   * filesystem, and which tracks all of the modifications which form part
   * of that one update.
   */
  
  /*
   * start_this_handle: Given a handle, deal with any locking or stalling
   * needed to make sure that there is enough journal space for the handle
   * to begin.  Attach the handle to a transaction and set up the
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
81
   * transaction's buffer credits.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
   */
  
  static int start_this_handle(journal_t *journal, handle_t *handle)
  {
  	/*
  	 * Returns 0 once the handle is attached to the running transaction,
  	 * -ENOSPC if the handle asks for more credits than a single
  	 * transaction may hold, or -EROFS if the journal is aborted or has an
  	 * error that has not been acknowledged (JFS_ACK_ERR clear).
  	 */
  	transaction_t *transaction;
  	int needed;
  	int nblocks = handle->h_buffer_credits;
  	transaction_t *new_transaction = NULL;
  	int ret = 0;
  
  	if (nblocks > journal->j_max_transaction_buffers) {
  		printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
  		       current->comm, nblocks,
  		       journal->j_max_transaction_buffers);
  		ret = -ENOSPC;
  		goto out;
  	}
  
  alloc_transaction:
  	/*
  	 * Pre-allocate a transaction before taking j_state_lock.  This check
  	 * of j_running_transaction is made without the lock; it is repeated
  	 * under the lock below.  Allocation failure is retried after a short
  	 * congestion wait.
  	 */
  	if (!journal->j_running_transaction) {
  		new_transaction = kzalloc(sizeof(*new_transaction), GFP_NOFS);
  		if (!new_transaction) {
  			congestion_wait(BLK_RW_ASYNC, HZ/50);
  			goto alloc_transaction;
  		}
  	}
  
  	jbd_debug(3, "New handle %p going live.\n", handle);
  
  repeat:
  
  	/*
  	 * We need to hold j_state_lock until t_updates has been incremented,
  	 * for proper journal barrier handling
  	 */
  	spin_lock(&journal->j_state_lock);
  repeat_locked:
  	if (is_journal_aborted(journal) ||
  	    (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) {
  		spin_unlock(&journal->j_state_lock);
  		ret = -EROFS;
  		goto out;
  	}
  
  	/* Wait on the journal's transaction barrier if necessary */
  	if (journal->j_barrier_count) {
  		spin_unlock(&journal->j_state_lock);
  		wait_event(journal->j_wait_transaction_locked,
  				journal->j_barrier_count == 0);
  		goto repeat;
  	}
  
  	if (!journal->j_running_transaction) {
  		if (!new_transaction) {
  			/* Nothing pre-allocated: drop the lock and allocate. */
  			spin_unlock(&journal->j_state_lock);
  			goto alloc_transaction;
  		}
  		get_transaction(journal, new_transaction);
  		/* Now owned by the journal; must not be freed at "out". */
  		new_transaction = NULL;
  	}
  
  	transaction = journal->j_running_transaction;
  
  	/*
  	 * If the current transaction is locked down for commit, wait for the
  	 * lock to be released.
  	 */
  	if (transaction->t_state == T_LOCKED) {
  		DEFINE_WAIT(wait);
  
  		prepare_to_wait(&journal->j_wait_transaction_locked,
  					&wait, TASK_UNINTERRUPTIBLE);
  		spin_unlock(&journal->j_state_lock);
  		schedule();
  		finish_wait(&journal->j_wait_transaction_locked, &wait);
  		goto repeat;
  	}
  
  	/*
  	 * If there is not enough space left in the log to write all potential
  	 * buffers requested by this operation, we need to stall pending a log
  	 * checkpoint to free some more log space.
  	 */
  	spin_lock(&transaction->t_handle_lock);
  	needed = transaction->t_outstanding_credits + nblocks;
  
  	if (needed > journal->j_max_transaction_buffers) {
  		/*
  		 * If the current transaction is already too large, then start
  		 * to commit it: we can then go back and attach this handle to
  		 * a new transaction.
  		 */
  		DEFINE_WAIT(wait);
  
  		jbd_debug(2, "Handle %p starting new commit...\n", handle);
  		spin_unlock(&transaction->t_handle_lock);
  		prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
  				TASK_UNINTERRUPTIBLE);
  		__log_start_commit(journal, transaction->t_tid);
  		spin_unlock(&journal->j_state_lock);
  		schedule();
  		finish_wait(&journal->j_wait_transaction_locked, &wait);
  		goto repeat;
  	}
  
  	/*
  	 * The commit code assumes that it can get enough log space
  	 * without forcing a checkpoint.  This is *critical* for
  	 * correctness: a checkpoint of a buffer which is also
  	 * associated with a committing transaction creates a deadlock,
  	 * so commit simply cannot force through checkpoints.
  	 *
  	 * We must therefore ensure the necessary space in the journal
  	 * *before* starting to dirty potentially checkpointed buffers
  	 * in the new transaction.
  	 *
  	 * The worst part is, any transaction currently committing can
  	 * reduce the free space arbitrarily.  Be careful to account for
  	 * those buffers when checkpointing.
  	 */
  
  	/*
  	 * @@@ AKPM: This seems rather over-defensive.  We're giving commit
  	 * a _lot_ of headroom: 1/4 of the journal plus the size of
  	 * the committing transaction.  Really, we only need to give it
  	 * committing_transaction->t_outstanding_credits plus "enough" for
  	 * the log control blocks.
  	 * Also, this test is inconsistent with the matching one in
  	 * journal_extend().
  	 */
  	if (__log_space_left(journal) < jbd_space_needed(journal)) {
  		jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
  		spin_unlock(&transaction->t_handle_lock);
  		__log_wait_for_space(journal);
  		goto repeat_locked;
  	}
  
  	/* OK, account for the buffers that this operation expects to
  	 * use and add the handle to the running transaction. */
  
  	handle->h_transaction = transaction;
  	transaction->t_outstanding_credits += nblocks;
  	transaction->t_updates++;
  	transaction->t_handle_count++;
  	jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
  		  handle, nblocks, transaction->t_outstanding_credits,
  		  __log_space_left(journal));
  	spin_unlock(&transaction->t_handle_lock);
  	spin_unlock(&journal->j_state_lock);
  
  	lock_map_acquire(&handle->h_lockdep_map);
  out:
  	/* Free a pre-allocated transaction that ended up unused. */
  	if (unlikely(new_transaction))		/* It's usually NULL */
  		kfree(new_transaction);
  	return ret;
  }
34a3d1e83   Peter Zijlstra   lockdep: annotate...
242
  /* Lockdep class key passed to lockdep_init_map() when a handle is created. */
  static struct lock_class_key jbd_handle_key;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
243
244
245
246
247
248
249
250
251
  /* Allocate a new handle.  This should probably be in a slab... */
  static handle_t *new_handle(int nblocks)
  {
  	handle_t *handle = jbd_alloc_handle(GFP_NOFS);
  	if (!handle)
  		return NULL;
  	memset(handle, 0, sizeof(*handle));
  	handle->h_buffer_credits = nblocks;
  	handle->h_ref = 1;
34a3d1e83   Peter Zijlstra   lockdep: annotate...
252
  	lockdep_init_map(&handle->h_lockdep_map, "jbd_handle", &jbd_handle_key, 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
253
254
255
256
  	return handle;
  }
  
  /**
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
257
   * handle_t *journal_start() - Obtain a new handle.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
258
259
260
261
262
   * @journal: Journal to start transaction on.
   * @nblocks: number of block buffers we might modify
   *
   * We make sure that the transaction can guarantee at least nblocks of
   * modified buffers in the log.  We block until the log can guarantee
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
263
   * that much space.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
264
265
266
267
   *
   * This function is visible to journal users (like ext3fs), so is not
   * called with the journal already locked.
   *
c2b67735e   Eryu Guan   jbd: Fix comment ...
268
269
   * Return a pointer to a newly allocated handle, or an ERR_PTR() value
   * on failure.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
   */
  handle_t *journal_start(journal_t *journal, int nblocks)
  {
  	handle_t *cur = journal_current_handle();
  	handle_t *fresh;
  	int rc;
  
  	/* Without a journal there is nothing to start. */
  	if (journal == NULL)
  		return ERR_PTR(-EROFS);
  
  	/*
  	 * The task already has a handle: it must belong to this journal.
  	 * Take another reference on it instead of creating a new one.
  	 */
  	if (cur != NULL) {
  		J_ASSERT(cur->h_transaction->t_journal == journal);
  		cur->h_ref++;
  		return cur;
  	}
  
  	fresh = new_handle(nblocks);
  	if (fresh == NULL)
  		return ERR_PTR(-ENOMEM);
  
  	/* Publish the handle as the task's current one before starting it. */
  	current->journal_info = fresh;
  
  	rc = start_this_handle(journal, fresh);
  	if (rc >= 0)
  		return fresh;
  
  	/* Start failed: release the handle and clear the task's pointer. */
  	jbd_free_handle(fresh);
  	current->journal_info = NULL;
  	return ERR_PTR(rc);
  }
  
  /**
   * int journal_extend() - extend buffer credits.
   * @handle:  handle to 'extend'
   * @nblocks: nr blocks to try to extend by.
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
304
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
305
306
307
   * Some transactions, such as large extends and truncates, can be done
   * atomically all at once or in several stages.  The operation requests
   * a credit for a number of buffer modifications in advance, but can
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
308
   * extend its credit if it needs more.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
   *
   * journal_extend tries to give the running handle more buffer credits.
   * It does not guarantee that allocation - this is a best-effort only.
   * The calling process MUST be able to deal cleanly with a failure to
   * extend here.
   *
   * Return 0 on success, non-zero on failure.
   *
   * return code < 0 implies an error
   * return code > 0 implies normal transaction-full status.
   */
  int journal_extend(handle_t *handle, int nblocks)
  {
  	transaction_t *transaction = handle->h_transaction;
  	journal_t *journal = transaction->t_journal;
  	int result;
  	int wanted;
  
  	/* An aborted handle cannot be extended: report -EIO. */
  	result = -EIO;
  	if (is_handle_aborted(handle))
  		goto out;
  
  	/* From here on, failure means "transaction full" (positive return). */
  	result = 1;
  
  	spin_lock(&journal->j_state_lock);
  
  	/* Don't extend a locked-down transaction! */
  	if (handle->h_transaction->t_state != T_RUNNING) {
  		jbd_debug(3, "denied handle %p %d blocks: "
  			  "transaction not running\n", handle, nblocks);
  		goto error_out;
  	}
  
  	spin_lock(&transaction->t_handle_lock);
  	wanted = transaction->t_outstanding_credits + nblocks;
  
  	if (wanted > journal->j_max_transaction_buffers) {
  		jbd_debug(3, "denied handle %p %d blocks: "
  			  "transaction too large\n", handle, nblocks);
  		goto unlock;
  	}
  
  	if (wanted > __log_space_left(journal)) {
  		jbd_debug(3, "denied handle %p %d blocks: "
  			  "insufficient log space\n", handle, nblocks);
  		goto unlock;
  	}
  
  	/* Grant the credits to both the handle and its transaction. */
  	handle->h_buffer_credits += nblocks;
  	transaction->t_outstanding_credits += nblocks;
  	result = 0;
  
  	jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
  unlock:
  	spin_unlock(&transaction->t_handle_lock);
  error_out:
  	spin_unlock(&journal->j_state_lock);
  out:
  	return result;
  }
  
  
  /**
78a4a50a8   Randy Dunlap   docbook: fix file...
376
   * int journal_restart() - restart a handle.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
377
378
   * @handle:  handle to restart
   * @nblocks: nr credits requested
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
379
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
   * Restart a handle for a multi-transaction filesystem
   * operation.
   *
   * If the journal_extend() call above fails to grant new buffer credits
   * to a running handle, a call to journal_restart will commit the
   * handle's transaction so far and reattach the handle to a new
   * transaction capable of guaranteeing the requested number of
   * credits.
   */
  
  int journal_restart(handle_t *handle, int nblocks)
  {
  	/*
  	 * Returns 0 without doing anything if the handle is aborted;
  	 * otherwise returns whatever start_this_handle() returns for the
  	 * re-attached handle.
  	 */
  	transaction_t *transaction = handle->h_transaction;
  	journal_t *journal = transaction->t_journal;
  	int ret;
  
  	/* If we've had an abort of any type, don't even think about
  	 * actually doing the restart! */
  	if (is_handle_aborted(handle))
  		return 0;
  
  	/*
  	 * First unlink the handle from its current transaction, and start the
  	 * commit on that.
  	 */
  	J_ASSERT(transaction->t_updates > 0);
  	J_ASSERT(journal_current_handle() == handle);
  
  	spin_lock(&journal->j_state_lock);
  	spin_lock(&transaction->t_handle_lock);
  	/* Give back the handle's credits and drop it from the update count. */
  	transaction->t_outstanding_credits -= handle->h_buffer_credits;
  	transaction->t_updates--;
  
  	/* Last update gone: wake anyone sleeping on j_wait_updates. */
  	if (!transaction->t_updates)
  		wake_up(&journal->j_wait_updates);
  	spin_unlock(&transaction->t_handle_lock);
  
  	jbd_debug(2, "restarting handle %p\n", handle);
  	__log_start_commit(journal, transaction->t_tid);
  	spin_unlock(&journal->j_state_lock);
  
  	lock_map_release(&handle->h_lockdep_map);
  	/* Re-attach the same handle with the newly requested credit count. */
  	handle->h_buffer_credits = nblocks;
  	ret = start_this_handle(journal, handle);
  	return ret;
  }
  
  
  /**
   * void journal_lock_updates () - establish a transaction barrier.
   * @journal:  Journal to establish a barrier on.
   *
004827855   Jan Kara   jbd: Remove j_bar...
432
433
434
   * This locks out any further updates from being started, and blocks until all
   * existing updates have completed, returning only once the journal is in a
   * quiescent state with no updates running.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
435
   *
004827855   Jan Kara   jbd: Remove j_bar...
436
437
438
439
440
   * We do not use simple mutex for synchronization as there are syscalls which
   * want to return with filesystem locked and that trips up lockdep. Also
   * hibernate needs to lock filesystem but locked mutex then blocks hibernation.
   * Since locking filesystem is rare operation, we use simple counter and
   * waitqueue for locking.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
441
442
443
444
   */
  void journal_lock_updates(journal_t *journal)
  {
  	DEFINE_WAIT(wait);
  
  wait:
  	/* Wait for previous locked operation to finish */
  	wait_event(journal->j_wait_transaction_locked,
  		   journal->j_barrier_count == 0);
  
  	spin_lock(&journal->j_state_lock);
  	/*
  	 * Check reliably under the lock whether we are the ones winning the race
  	 * and locking the journal
  	 */
  	if (journal->j_barrier_count > 0) {
  		spin_unlock(&journal->j_state_lock);
  		goto wait;
  	}
  	++journal->j_barrier_count;
  
  	/* Wait until there are no running updates */
  	while (1) {
  		transaction_t *transaction = journal->j_running_transaction;
  
  		if (!transaction)
  			break;
  
  		spin_lock(&transaction->t_handle_lock);
  		if (!transaction->t_updates) {
  			spin_unlock(&transaction->t_handle_lock);
  			break;
  		}
  		/*
  		 * Queue ourselves on j_wait_updates before dropping the locks,
  		 * then sleep and re-check from the top with j_state_lock held.
  		 */
  		prepare_to_wait(&journal->j_wait_updates, &wait,
  				TASK_UNINTERRUPTIBLE);
  		spin_unlock(&transaction->t_handle_lock);
  		spin_unlock(&journal->j_state_lock);
  		schedule();
  		finish_wait(&journal->j_wait_updates, &wait);
  		spin_lock(&journal->j_state_lock);
  	}
  	spin_unlock(&journal->j_state_lock);
  }
  
  /**
   * void journal_unlock_updates (journal_t* journal) - release barrier
   * @journal:  Journal to release the barrier on.
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
486
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
487
   * Release a transaction barrier obtained with journal_lock_updates().
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
488
489
490
491
   */
  void journal_unlock_updates (journal_t *journal)
  {
  	/* A barrier must currently be held. */
  	J_ASSERT(journal->j_barrier_count != 0);
  
  	spin_lock(&journal->j_state_lock);
  	--journal->j_barrier_count;
  	spin_unlock(&journal->j_state_lock);
  	/* Let waiters on j_wait_transaction_locked re-check the barrier count. */
  	wake_up(&journal->j_wait_transaction_locked);
  }
1e9fd53b7   Jan Kara   jbd: Fix a race b...
497
  static void warn_dirty_buffer(struct buffer_head *bh)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
498
  {
1e9fd53b7   Jan Kara   jbd: Fix a race b...
499
  	char b[BDEVNAME_SIZE];
4407c2b6b   Jan Kara   [PATCH] Fix race ...
500

1e9fd53b7   Jan Kara   jbd: Fix a race b...
501
502
503
504
505
506
  	printk(KERN_WARNING
  	       "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
  	       "There's a risk of filesystem corruption in case of system "
  	       "crash.
  ",
  	       bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
  }
  
  /*
   * If the buffer is already part of the current transaction, then there
   * is nothing we need to do.  If it is already part of a prior
   * transaction which we are still committing to disk, then we need to
   * make sure that we do not overwrite the old copy: we do copy-out to
   * preserve the copy going to disk.  We also account the buffer against
   * the handle's metadata buffer credits (unless the buffer is already
   * part of the transaction, that is).
   *
   */
  static int
  do_get_write_access(handle_t *handle, struct journal_head *jh,
  			int force_copy)
  {
  	/*
  	 * Returns 0 on success, -EROFS if the handle is aborted, or -ENOMEM
  	 * if the frozen-data buffer cannot be allocated.  A non-zero
  	 * @force_copy forces the frozen copy even when the buffer is on the
  	 * committing transaction's Forget list.
  	 */
  	struct buffer_head *bh;
  	transaction_t *transaction;
  	journal_t *journal;
  	int error;
  	char *frozen_buffer = NULL;
  	int need_copy = 0;
  
  	if (is_handle_aborted(handle))
  		return -EROFS;
  
  	transaction = handle->h_transaction;
  	journal = transaction->t_journal;
  
  	jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
  
  	JBUFFER_TRACE(jh, "entry");
  repeat:
  	bh = jh2bh(jh);
  
  	/* @@@ Need to check for errors here at some point. */
  
  	lock_buffer(bh);
  	jbd_lock_bh_state(bh);
  
  	/* We now hold the buffer lock so it is safe to query the buffer
  	 * state.  Is the buffer dirty?
  	 *
  	 * If so, there are two possibilities.  The buffer may be
  	 * non-journaled, and undergoing a quite legitimate writeback.
  	 * Otherwise, it is journaled, and we don't expect dirty buffers
  	 * in that state (the buffers should be marked JBD_Dirty
  	 * instead.)  So either the IO is being done under our own
  	 * control and this is a bug, or it's a third party IO such as
  	 * dump(8) (which may leave the buffer scheduled for read ---
  	 * ie. locked but not dirty) or tune2fs (which may actually have
  	 * the buffer dirtied, ugh.)  */
  
  	if (buffer_dirty(bh)) {
  		/*
  		 * First question: is this buffer already part of the current
  		 * transaction or the existing committing transaction?
  		 */
  		if (jh->b_transaction) {
  			J_ASSERT_JH(jh,
  				jh->b_transaction == transaction ||
  				jh->b_transaction ==
  					journal->j_committing_transaction);
  			if (jh->b_next_transaction)
  				J_ASSERT_JH(jh, jh->b_next_transaction ==
  							transaction);
  			warn_dirty_buffer(bh);
  		}
  		/*
  		 * In any case we need to clean the dirty flag and we must
  		 * do it under the buffer lock to be sure we don't race
  		 * with running write-out.
  		 */
  		JBUFFER_TRACE(jh, "Journalling dirty buffer");
  		clear_buffer_dirty(bh);
  		set_buffer_jbddirty(bh);
  	}
  
  	unlock_buffer(bh);
  
  	error = -EROFS;
  	if (is_handle_aborted(handle)) {
  		jbd_unlock_bh_state(bh);
  		goto out;
  	}
  	error = 0;
  
  	/*
  	 * The buffer is already part of this transaction if b_transaction or
  	 * b_next_transaction points to it
  	 */
  	if (jh->b_transaction == transaction ||
  	    jh->b_next_transaction == transaction)
  		goto done;
  
  	/*
  	 * this is the first time this transaction is touching this buffer,
  	 * reset the modified flag
  	 */
  	jh->b_modified = 0;
  
  	/*
  	 * If there is already a copy-out version of this buffer, then we don't
  	 * need to make another one
  	 */
  	if (jh->b_frozen_data) {
  		JBUFFER_TRACE(jh, "has frozen data");
  		J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
  		jh->b_next_transaction = transaction;
  		goto done;
  	}
  
  	/* Is there data here we need to preserve? */
  
  	if (jh->b_transaction && jh->b_transaction != transaction) {
  		JBUFFER_TRACE(jh, "owned by older transaction");
  		J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
  		J_ASSERT_JH(jh, jh->b_transaction ==
  					journal->j_committing_transaction);
  
  		/* There is one case we have to be very careful about.
  		 * If the committing transaction is currently writing
  		 * this buffer out to disk and has NOT made a copy-out,
  		 * then we cannot modify the buffer contents at all
  		 * right now.  The essence of copy-out is that it is the
  		 * extra copy, not the primary copy, which gets
  		 * journaled.  If the primary copy is already going to
  		 * disk then we cannot do copy-out here. */
  
  		if (jh->b_jlist == BJ_Shadow) {
  			DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
  			wait_queue_head_t *wqh;
  
  			wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
  
  			JBUFFER_TRACE(jh, "on shadow: sleep");
  			jbd_unlock_bh_state(bh);
  			/* commit wakes up all shadow buffers after IO */
  			for ( ; ; ) {
  				prepare_to_wait(wqh, &wait.wait,
  						TASK_UNINTERRUPTIBLE);
  				if (jh->b_jlist != BJ_Shadow)
  					break;
  				schedule();
  			}
  			finish_wait(wqh, &wait.wait);
  			goto repeat;
  		}
  
  		/* Only do the copy if the currently-owning transaction
  		 * still needs it.  If it is on the Forget list, the
  		 * committing transaction is past that stage.  The
  		 * buffer had better remain locked during the kmalloc,
  		 * but that should be true --- we hold the journal lock
  		 * still and the buffer is already on the BUF_JOURNAL
  		 * list so won't be flushed.
  		 *
  		 * Subtle point, though: if this is a get_undo_access,
  		 * then we will be relying on the frozen_data to contain
  		 * the new value of the committed_data record after the
  		 * transaction, so we HAVE to force the frozen_data copy
  		 * in that case. */
  
  		if (jh->b_jlist != BJ_Forget || force_copy) {
  			JBUFFER_TRACE(jh, "generate frozen data");
  			if (!frozen_buffer) {
  				JBUFFER_TRACE(jh, "allocate memory for buffer");
  				/*
  				 * Drop the bh state lock around the
  				 * allocation, then restart from "repeat"
  				 * to re-evaluate the buffer's state.
  				 */
  				jbd_unlock_bh_state(bh);
  				frozen_buffer =
  					jbd_alloc(jh2bh(jh)->b_size,
  							 GFP_NOFS);
  				if (!frozen_buffer) {
  					printk(KERN_EMERG
  					       "%s: OOM for frozen_buffer\n",
  					       __func__);
  					JBUFFER_TRACE(jh, "oom!");
  					error = -ENOMEM;
  					/* "done" expects the lock held. */
  					jbd_lock_bh_state(bh);
  					goto done;
  				}
  				goto repeat;
  			}
  			/* Hand the buffer to jh; "out" must not free it. */
  			jh->b_frozen_data = frozen_buffer;
  			frozen_buffer = NULL;
  			need_copy = 1;
  		}
  		jh->b_next_transaction = transaction;
  	}
  
  
  	/*
  	 * Finally, if the buffer is not journaled right now, we need to make
  	 * sure it doesn't get written to disk before the caller actually
  	 * commits the new data
  	 */
  	if (!jh->b_transaction) {
  		JBUFFER_TRACE(jh, "no transaction");
  		J_ASSERT_JH(jh, !jh->b_next_transaction);
  		JBUFFER_TRACE(jh, "file as BJ_Reserved");
  		spin_lock(&journal->j_list_lock);
  		__journal_file_buffer(jh, transaction, BJ_Reserved);
  		spin_unlock(&journal->j_list_lock);
  	}
  
  done:
  	if (need_copy) {
  		struct page *page;
  		int offset;
  		char *source;
  
  		J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
  			    "Possible IO failure.\n");
  		/* Copy the buffer's current contents into b_frozen_data. */
  		page = jh2bh(jh)->b_page;
  		offset = offset_in_page(jh2bh(jh)->b_data);
  		source = kmap_atomic(page, KM_USER0);
  		memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
  		kunmap_atomic(source, KM_USER0);
  	}
  	jbd_unlock_bh_state(bh);
  
  	/*
  	 * If we are about to journal a buffer, then any revoke pending on it is
  	 * no longer valid
  	 */
  	journal_cancel_revoke(handle, jh);
  
  out:
  	/* Free a frozen buffer that was allocated but never attached. */
  	if (unlikely(frozen_buffer))	/* It's usually NULL */
  		jbd_free(frozen_buffer, bh->b_size);
  
  	JBUFFER_TRACE(jh, "exit");
  	return error;
  }
  
  /**
   * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
   * @handle: transaction to add buffer modifications to
   * @bh:     bh to be used for metadata writes
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
   *
   * Returns an error code or 0 on success.
   *
   * In full data journalling mode the buffer may be of type BJ_AsyncData,
   * because we're write()ing a buffer which is also part of a shared mapping.
   */
  
  int journal_get_write_access(handle_t *handle, struct buffer_head *bh)
  {
  	struct journal_head *jh;
  	int ret;
  
  	/* Attach a journal_head to the buffer for the duration of the call. */
  	jh = journal_add_journal_head(bh);
  
  	/*
  	 * The log thread manipulates the same fields, so the real work is
  	 * delegated to do_get_write_access(), called here without forcing
  	 * a frozen copy (force_copy == 0).
  	 */
  	ret = do_get_write_access(handle, jh, 0);
  
  	journal_put_journal_head(jh);
  
  	return ret;
  }
  
  
  /*
   * When the user wants to journal a newly created buffer_head
   * (ie. getblk() returned a new buffer and we are going to populate it
   * manually rather than reading off disk), then we need to keep the
   * buffer_head locked until it has been completely filled with new
   * data.  In this case, we should be able to make the assertion that
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
774
775
   * the bh is not already part of an existing transaction.
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
776
777
778
779
780
781
782
783
784
785
786
   * The buffer should already be locked by the caller by this point.
   * There is no lock ranking violation: it was a newly created,
   * unlocked buffer beforehand. */
  
  /**
   * int journal_get_create_access () - notify intent to use newly created bh
   * @handle: transaction to add the new buffer to
   * @bh: new buffer.
   *
   * Call this if you create a new bh.
   */
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
787
  int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
  {
  	transaction_t *transaction = handle->h_transaction;
  	journal_t *journal = transaction->t_journal;
  	struct journal_head *jh = journal_add_journal_head(bh);
  	int err;
  
  	jbd_debug(5, "journal_head %p\n", jh);
  
  	/* An aborted handle gets -EROFS; the jh reference is still dropped. */
  	err = -EROFS;
  	if (is_handle_aborted(handle))
  		goto out;
  	err = 0;
  
  	JBUFFER_TRACE(jh, "entry");
  	/*
  	 * The buffer may already belong to this transaction due to pre-zeroing
  	 * in the filesystem's new_block code.  It may also be on the previous,
  	 * committing transaction's lists, but it HAS to be in Forget state in
  	 * that case: the transaction must have deleted the buffer for it to be
  	 * reused here.
  	 */
  	jbd_lock_bh_state(bh);
  	spin_lock(&journal->j_list_lock);
  	J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
  		jh->b_transaction == NULL ||
  		(jh->b_transaction == journal->j_committing_transaction &&
  			  jh->b_jlist == BJ_Forget)));
  
  	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
  	J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
  
  	if (jh->b_transaction == NULL) {
  		/*
  		 * Previous journal_forget() could have left the buffer
  		 * with jbddirty bit set because it was being committed. When
  		 * the commit finished, we've filed the buffer for
  		 * checkpointing and marked it dirty. Now we are reallocating
  		 * the buffer so the transaction freeing it must have
  		 * committed and so it's safe to clear the dirty bit.
  		 */
  		clear_buffer_dirty(jh2bh(jh));
  
  		/* first access by this transaction */
  		jh->b_modified = 0;
  
  		JBUFFER_TRACE(jh, "file as BJ_Reserved");
  		__journal_file_buffer(jh, transaction, BJ_Reserved);
  	} else if (jh->b_transaction == journal->j_committing_transaction) {
  		/* first access by this transaction */
  		jh->b_modified = 0;
  
  		/*
  		 * Still owned by the committing transaction: only record
  		 * that this transaction is next in line for the buffer.
  		 */
  		JBUFFER_TRACE(jh, "set next transaction");
  		jh->b_next_transaction = transaction;
  	}
  	spin_unlock(&journal->j_list_lock);
  	jbd_unlock_bh_state(bh);
  
  	/*
  	 * akpm: I added this.  ext3_alloc_branch can pick up new indirect
  	 * blocks which contain freed but then revoked metadata.  We need
  	 * to cancel the revoke in case we end up freeing it yet again
  	 * and then reallocating as data - this would cause a second revoke,
  	 * which hits an assertion error.
  	 */
  	JBUFFER_TRACE(jh, "cancelling revoke");
  	journal_cancel_revoke(handle, jh);
  out:
  	journal_put_journal_head(jh);
  	return err;
  }
  
  /**
78a4a50a8   Randy Dunlap   docbook: fix file...
858
   * int journal_get_undo_access() - Notify intent to modify metadata with non-rewindable consequences
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
859
860
   * @handle: transaction
   * @bh: buffer to undo
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
861
862
863
864
865
866
867
   *
   * Sometimes there is a need to distinguish between metadata which has
   * been committed to disk and that which has not.  The ext3fs code uses
   * this for freeing and allocating space, we have to make sure that we
   * do not reuse freed space until the deallocation has been committed,
   * since if we overwrote that space we would make the delete
   * un-rewindable in case of a crash.
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
868
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
869
870
871
872
873
   * To deal with that, journal_get_undo_access requests write access to a
   * buffer for parts of non-rewindable operations such as delete
   * operations on the bitmaps.  The journaling code must keep a copy of
   * the buffer's contents prior to the undo_access call until such time
   * as we know that the buffer has definitely been committed to disk.
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
874
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
   * We never need to know which transaction the committed data is part
   * of, buffers touched here are guaranteed to be dirtied later and so
   * will be committed to a new transaction in due course, at which point
   * we can discard the old committed data pointer.
   *
   * Returns error number or 0 on success.
   */
  int journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
  {
  	int err;
  	struct journal_head *jh = journal_add_journal_head(bh);
  	char *committed_data = NULL;
  
  	JBUFFER_TRACE(jh, "entry");
  
  	/*
  	 * Do this first --- it can drop the journal lock, so we want to
  	 * make sure that obtaining the committed_data is done
  	 * atomically wrt. completion of any outstanding commits.
  	 */
  	err = do_get_write_access(handle, jh, 1);
  	if (err)
  		goto out;
  
  repeat:
  	/*
  	 * Allocate outside the bh_state lock.  b_committed_data is
  	 * re-checked under the lock below, so this unlocked test is only
  	 * a hint about whether an allocation will be needed.
  	 */
  	if (!jh->b_committed_data) {
  		committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS);
  		if (!committed_data) {
  			printk(KERN_EMERG "%s: No memory for committed data\n",
  				__func__);
  			err = -ENOMEM;
  			goto out;
  		}
  	}
  
  	jbd_lock_bh_state(bh);
  	if (!jh->b_committed_data) {
  		/* Copy out the current buffer contents into the
  		 * preserved, committed copy. */
  		JBUFFER_TRACE(jh, "generate b_committed data");
  		if (!committed_data) {
  			/*
  			 * A copy is needed after all but none was
  			 * allocated above: drop the lock and retry.
  			 */
  			jbd_unlock_bh_state(bh);
  			goto repeat;
  		}
  
  		/* Ownership of the allocation moves to the journal_head. */
  		jh->b_committed_data = committed_data;
  		committed_data = NULL;
  		memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
  	}
  	jbd_unlock_bh_state(bh);
  out:
  	journal_put_journal_head(jh);
  	/* Free an allocation that turned out not to be needed. */
  	if (unlikely(committed_data))
  		jbd_free(committed_data, bh->b_size);
  	return err;
  }
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
932
  /**
78a4a50a8   Randy Dunlap   docbook: fix file...
933
   * int journal_dirty_data() - mark a buffer as containing dirty data to be flushed
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
934
935
   * @handle: transaction
   * @bh: bufferhead to mark
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
936
   *
78a4a50a8   Randy Dunlap   docbook: fix file...
937
938
939
940
   * Description:
   * Mark a buffer as containing dirty data which needs to be flushed before
   * we can commit the current transaction.
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
941
942
943
944
945
946
947
948
949
950
951
952
953
   * The buffer is placed on the transaction's data list and is marked as
   * belonging to the transaction.
   *
   * Returns error number or 0 on success.
   *
   * journal_dirty_data() can be called via page_launder->ext3_writepage
   * by kswapd.
   */
  int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
  {
  	journal_t *journal = handle->h_transaction->t_journal;
  	int need_brelse = 0;
  	struct journal_head *jh;
  	int ret = 0;
  
  	/* An aborted handle files nothing and reports success (ret is 0). */
  	if (is_handle_aborted(handle))
  		return ret;
  
  	jh = journal_add_journal_head(bh);
  	JBUFFER_TRACE(jh, "entry");
  
  	/*
  	 * The buffer could *already* be dirty.  Writeout can start
  	 * at any time.
  	 */
  	jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
  
  	/*
  	 * What if the buffer is already part of a running transaction?
  	 *
  	 * There are two cases:
  	 * 1) It is part of the current running transaction.  Refile it,
  	 *    just in case we have allocated it as metadata, deallocated
  	 *    it, then reallocated it as data.
  	 * 2) It is part of the previous, still-committing transaction.
  	 *    If all we want to do is to guarantee that the buffer will be
  	 *    written to disk before this new transaction commits, then
  	 *    being sure that the *previous* transaction has this same
  	 *    property is sufficient for us!  Just leave it on its old
  	 *    transaction.
  	 *
  	 * In case (2), the buffer must not already exist as metadata
  	 * --- that would violate write ordering (a transaction is free
  	 * to write its data at any point, even before the previous
  	 * committing transaction has committed).  The caller must
  	 * never, ever allow this to happen: there's nothing we can do
  	 * about it in this layer.
  	 */
  	jbd_lock_bh_state(bh);
  	spin_lock(&journal->j_list_lock);
  
  	/* Now that we have bh_state locked, are we really still mapped? */
  	if (!buffer_mapped(bh)) {
  		JBUFFER_TRACE(jh, "unmapped buffer, bailing out");
  		goto no_journal;
  	}
  
  	if (jh->b_transaction) {
  		JBUFFER_TRACE(jh, "has transaction");
  		if (jh->b_transaction != handle->h_transaction) {
  			JBUFFER_TRACE(jh, "belongs to older transaction");
  			J_ASSERT_JH(jh, jh->b_transaction ==
  					journal->j_committing_transaction);
  
  			/* @@@ IS THIS TRUE  ? */
  			/*
  			 * Not any more.  Scenario: someone does a write()
  			 * in data=journal mode.  The buffer's transaction has
  			 * moved into commit.  Then someone does another
  			 * write() to the file.  We do the frozen data copyout
  			 * and set b_next_transaction to point to j_running_t.
  			 * And while we're in that state, someone does a
  			 * writepage() in an attempt to pageout the same area
  			 * of the file via a shared mapping.  At present that
  			 * calls journal_dirty_data(), and we get right here.
  			 * It may be too late to journal the data.  Simply
  			 * falling through to the next test will suffice: the
  			 * data will be dirty and will be checkpointed.  The
  			 * ordering comments in the next comment block still
  			 * apply.
  			 */
  			//J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
  
  			/*
  			 * If we're journalling data, and this buffer was
  			 * subject to a write(), it could be metadata, forget
  			 * or shadow against the committing transaction.  Now,
  			 * someone has dirtied the same darn page via a mapping
  			 * and it is being writepage()'d.
  			 * We *could* just steal the page from commit, with some
  			 * fancy locking there.  Instead, we just skip it -
  			 * don't tie the page's buffers to the new transaction
  			 * at all.
  			 * Implication: if we crash before the writepage() data
  			 * is written into the filesystem, recovery will replay
  			 * the write() data.
  			 */
  			if (jh->b_jlist != BJ_None &&
  					jh->b_jlist != BJ_SyncData &&
  					jh->b_jlist != BJ_Locked) {
  				JBUFFER_TRACE(jh, "Not stealing");
  				goto no_journal;
  			}
  
  			/*
  			 * This buffer may be undergoing writeout in commit.  We
  			 * can't return from here and let the caller dirty it
  			 * again because that can cause the write-out loop in
  			 * commit to never terminate.
  			 */
  			if (buffer_dirty(bh)) {
  				/*
  				 * Take a bh reference and drop both locks
  				 * around the synchronous write; the reference
  				 * is released at no_journal via need_brelse.
  				 */
  				get_bh(bh);
  				spin_unlock(&journal->j_list_lock);
  				jbd_unlock_bh_state(bh);
  				need_brelse = 1;
  				sync_dirty_buffer(bh);
  				jbd_lock_bh_state(bh);
  				spin_lock(&journal->j_list_lock);
  				/* Since we dropped the lock... */
  				if (!buffer_mapped(bh)) {
  					JBUFFER_TRACE(jh, "buffer got unmapped");
  					goto no_journal;
  				}
  				/* The buffer may become locked again at any
  				   time if it is redirtied */
  			}
  
  			/*
  			 * We cannot remove the buffer with io error from the
  			 * committing transaction, because otherwise it would
  			 * miss the error and the commit would not abort.
  			 */
  			if (unlikely(!buffer_uptodate(bh))) {
  				ret = -EIO;
  				goto no_journal;
  			}
  
  			/* We might have slept so buffer could be refiled now */
  			if (jh->b_transaction != NULL &&
  			    jh->b_transaction != handle->h_transaction) {
  				JBUFFER_TRACE(jh, "unfile from commit");
  				__journal_temp_unlink_buffer(jh);
  				/* It still points to the committing
  				 * transaction; move it to this one so
  				 * that the refile assert checks are
  				 * happy. */
  				jh->b_transaction = handle->h_transaction;
  			}
  			/* The buffer will be refiled below */
  
  		}
  		/*
  		 * Special case --- the buffer might actually have been
  		 * allocated and then immediately deallocated in the previous,
  		 * committing transaction, so might still be left on that
  		 * transaction's metadata lists.
  		 */
  		if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
  			JBUFFER_TRACE(jh, "not on correct data list: unfile");
  			J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
  			JBUFFER_TRACE(jh, "file as data");
  			__journal_file_buffer(jh, handle->h_transaction,
  						BJ_SyncData);
  		}
  	} else {
  		JBUFFER_TRACE(jh, "not on a transaction");
  		__journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
  	}
  no_journal:
  	spin_unlock(&journal->j_list_lock);
  	jbd_unlock_bh_state(bh);
  	if (need_brelse) {
  		BUFFER_TRACE(bh, "brelse");
  		__brelse(bh);
  	}
  	JBUFFER_TRACE(jh, "exit");
  	journal_put_journal_head(jh);
  	return ret;
  }
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
1118
  /**
78a4a50a8   Randy Dunlap   docbook: fix file...
1119
   * int journal_dirty_metadata() - mark a buffer as containing dirty metadata
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1120
   * @handle: transaction to add buffer to.
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
1121
1122
   * @bh: buffer to mark
   *
78a4a50a8   Randy Dunlap   docbook: fix file...
1123
   * Mark dirty metadata which needs to be journaled as part of the current
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1124
1125
1126
   * transaction.
   *
   * The buffer is placed on the transaction's metadata list and is marked
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
1127
   * as belonging to the transaction.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1128
   *
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
1129
   * Returns error number or 0 on success.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
   *
   * Special care needs to be taken if the buffer already belongs to the
   * current committing transaction (in which case we should have frozen
   * data present for that commit).  In that case, we don't relink the
   * buffer: that only gets done when the old transaction finally
   * completes its commit.
   */
  int journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
  {
  	transaction_t *transaction = handle->h_transaction;
  	journal_t *journal = transaction->t_journal;
  	struct journal_head *jh = bh2jh(bh);
  
  	jbd_debug(5, "journal_head %p\n", jh);
  	JBUFFER_TRACE(jh, "entry");
  	/*
  	 * NOTE(review): every path through this function, including the
  	 * aborted-handle one, ends in "return 0".
  	 */
  	if (is_handle_aborted(handle))
  		goto out;
  
  	jbd_lock_bh_state(bh);
  
  	if (jh->b_modified == 0) {
  		/*
  		 * This buffer's got modified and becoming part
  		 * of the transaction. This needs to be done
  		 * once a transaction -bzzz
  		 */
  		jh->b_modified = 1;
  		J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
  		handle->h_buffer_credits--;
  	}
  
  	/*
  	 * fastpath, to avoid expensive locking.  If this buffer is already
  	 * on the running transaction's metadata list there is nothing to do.
  	 * Nobody can take it off again because there is a handle open.
  	 * I _think_ we're OK here with SMP barriers - a mistaken decision will
  	 * result in this test being false, so we go in and take the locks.
  	 */
  	if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
  		JBUFFER_TRACE(jh, "fastpath");
  		J_ASSERT_JH(jh, jh->b_transaction ==
  					journal->j_running_transaction);
  		goto out_unlock_bh;
  	}
  
  	set_buffer_jbddirty(bh);
  
  	/*
  	 * Metadata already on the current transaction list doesn't
  	 * need to be filed.  Metadata on another transaction's list must
  	 * be committing, and will be refiled once the commit completes:
  	 * leave it alone for now.
  	 */
  	if (jh->b_transaction != transaction) {
  		JBUFFER_TRACE(jh, "already on other transaction");
  		J_ASSERT_JH(jh, jh->b_transaction ==
  					journal->j_committing_transaction);
  		J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
  		/* And this case is illegal: we can't reuse another
  		 * transaction's data buffer, ever. */
  		goto out_unlock_bh;
  	}
  
  	/* That test should have eliminated the following case: */
  	J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
  
  	JBUFFER_TRACE(jh, "file as BJ_Metadata");
  	spin_lock(&journal->j_list_lock);
  	__journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
  	spin_unlock(&journal->j_list_lock);
  out_unlock_bh:
  	jbd_unlock_bh_state(bh);
  out:
  	JBUFFER_TRACE(jh, "exit");
  	return 0;
  }
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
1206
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1207
1208
1209
1210
1211
1212
1213
1214
1215
   * journal_release_buffer: undo a get_write_access without any buffer
   * updates, if the update decided in the end that it didn't need access.
   *
   */
  void journal_release_buffer(handle_t *handle, struct buffer_head *bh)
  {
  	/* No state is undone here; the body only emits the trace point. */
  	BUFFER_TRACE(bh, "entry");
  }
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
1216
  /**
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1217
1218
1219
1220
1221
1222
   * void journal_forget() - bforget() for potentially-journaled buffers.
   * @handle: transaction handle
   * @bh:     bh to 'forget'
   *
   * We can only do the bforget if there are no commits pending against the
   * buffer.  If the buffer is dirty in the current running transaction we
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
1223
   * can safely unlink it.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1224
1225
1226
1227
1228
   *
   * bh may not be a journalled buffer at all - it may be a non-JBD
   * buffer which came off the hashtable.  Check for this.
   *
   * Decrements bh->b_count by one.
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
1229
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
   * Allow this call even if the handle has aborted --- it may be part of
   * the caller's cleanup after an abort.
   */
  int journal_forget(handle_t *handle, struct buffer_head *bh)
  {
  	transaction_t *transaction = handle->h_transaction;
  	journal_t *journal = transaction->t_journal;
  	struct journal_head *jh;
  	int drop_reserve = 0;
  	int err = 0;
  	int was_modified = 0;
  
  	BUFFER_TRACE(bh, "entry");
  
  	jbd_lock_bh_state(bh);
  	spin_lock(&journal->j_list_lock);
  
  	if (!buffer_jbd(bh))
  		goto not_jbd;
  	jh = bh2jh(bh);
  
  	/* Critical error: attempting to delete a bitmap buffer, maybe?
  	 * Don't do any jbd operations, and return an error. */
  	if (!J_EXPECT_JH(jh, !jh->b_committed_data,
  			 "inconsistent data on disk")) {
  		err = -EIO;
  		goto not_jbd;
  	}
  
  	/* keep track of whether or not this transaction modified us */
  	was_modified = jh->b_modified;
  
  	/*
  	 * The buffer's going from the transaction, we must drop
  	 * all references -bzzz
  	 */
  	jh->b_modified = 0;
  
  	if (jh->b_transaction == handle->h_transaction) {
  		J_ASSERT_JH(jh, !jh->b_frozen_data);
  
  		/* If we are forgetting a buffer which is already part
  		 * of this transaction, then we can just drop it from
  		 * the transaction immediately. */
  		clear_buffer_dirty(bh);
  		clear_buffer_jbddirty(bh);
  
  		JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
  
  		/*
  		 * we only want to drop a reference if this transaction
  		 * modified the buffer
  		 */
  		if (was_modified)
  			drop_reserve = 1;
  
  		/*
  		 * We are no longer going to journal this buffer.
  		 * However, the commit of this transaction is still
  		 * important to the buffer: the delete that we are now
  		 * processing might obsolete an old log entry, so by
  		 * committing, we can satisfy the buffer's checkpoint.
  		 *
  		 * So, if we have a checkpoint on the buffer, we should
  		 * now refile the buffer on our BJ_Forget list so that
  		 * we know to remove the checkpoint after we commit.
  		 */
  
  		if (jh->b_cp_transaction) {
  			__journal_temp_unlink_buffer(jh);
  			__journal_file_buffer(jh, transaction, BJ_Forget);
  		} else {
  			__journal_unfile_buffer(jh);
  			/*
  			 * If the bh no longer carries a journal_head after
  			 * the unfile, drop the locks and __bforget() it; the
  			 * "drop" label skips the __brelse() below.
  			 */
  			if (!buffer_jbd(bh)) {
  				spin_unlock(&journal->j_list_lock);
  				jbd_unlock_bh_state(bh);
  				__bforget(bh);
  				goto drop;
  			}
  		}
  	} else if (jh->b_transaction) {
  		J_ASSERT_JH(jh, (jh->b_transaction ==
  				 journal->j_committing_transaction));
  		/* However, if the buffer is still owned by a prior
  		 * (committing) transaction, we can't drop it yet... */
  		JBUFFER_TRACE(jh, "belongs to older transaction");
  		/* ... but we CAN drop it from the new transaction if we
  		 * have also modified it since the original commit. */
  
  		if (jh->b_next_transaction) {
  			J_ASSERT(jh->b_next_transaction == transaction);
  			jh->b_next_transaction = NULL;
  
  			/*
  			 * only drop a reference if this transaction modified
  			 * the buffer
  			 */
  			if (was_modified)
  				drop_reserve = 1;
  		}
  	}
  
  not_jbd:
  	spin_unlock(&journal->j_list_lock);
  	jbd_unlock_bh_state(bh);
  	__brelse(bh);
  drop:
  	if (drop_reserve) {
  		/* no need to reserve log space for this block -bzzz */
  		handle->h_buffer_credits++;
  	}
  	return err;
  }
  
  /**
   * int journal_stop() - complete a transaction
   * @handle: transaction to complete.
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
1344
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1345
1346
1347
1348
1349
1350
1351
1352
   * All done for a particular handle.
   *
   * There is not much action needed here.  We just return any remaining
   * buffer credits to the transaction and remove the handle.  The only
   * complication is that we need to start a commit operation if the
   * filesystem is marked for synchronous update.
   *
   * journal_stop itself will not usually return an error, but it may
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
1353
   * do so in unusual circumstances.  In particular, expect it to
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1354
1355
1356
1357
1358
1359
1360
   * return -EIO if a journal_abort has been executed since the
   * transaction began.
   */
  int journal_stop(handle_t *handle)
  {
  	transaction_t *transaction = handle->h_transaction;
  	journal_t *journal = transaction->t_journal;
  	int err;
  	pid_t pid;
  
  	J_ASSERT(journal_current_handle() == handle);
  
  	if (is_handle_aborted(handle))
  		err = -EIO;
  	else {
  		J_ASSERT(transaction->t_updates > 0);
  		err = 0;
  	}
  
  	/*
  	 * The handle is reference counted: only the call that drops the
  	 * last reference goes on to tear it down.
  	 */
  	if (--handle->h_ref > 0) {
  		jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
  			  handle->h_ref);
  		return err;
  	}
  
  	jbd_debug(4, "Handle %p going down\n", handle);
  
  	/*
  	 * Implement synchronous transaction batching.  If the handle
  	 * was synchronous, don't force a commit immediately.  Let's
  	 * yield and let another thread piggyback onto this transaction.
  	 * Keep doing that while new threads continue to arrive.
  	 * It doesn't cost much - we're about to run a commit and sleep
  	 * on IO anyway.  Speeds up many-threaded, many-dir operations
  	 * by 30x or more...
  	 *
  	 * We try and optimize the sleep time against what the underlying disk
  	 * can do, instead of having a static sleep time.  This is useful for
  	 * the case where our storage is so fast that it is more optimal to go
  	 * ahead and force a flush and wait for the transaction to be committed
  	 * than it is to wait for an arbitrary amount of time for new writers to
  	 * join the transaction.  We achieve this by measuring how long it takes
  	 * to commit a transaction, and compare it with how long this
  	 * transaction has been running, and if run time < commit time then we
  	 * sleep for the delta and commit.  This greatly helps super fast disks
  	 * that would see slowdowns as more threads started doing fsyncs.
  	 *
  	 * But don't do this if this process was the most recent one to
  	 * perform a synchronous write.  We do this to detect the case where a
  	 * single process is doing a stream of sync writes.  No point in waiting
  	 * for joiners in that case.
  	 */
  	pid = current->pid;
  	if (handle->h_sync && journal->j_last_sync_writer != pid) {
  		u64 commit_time, trans_time;
  
  		journal->j_last_sync_writer = pid;
  
  		spin_lock(&journal->j_state_lock);
  		commit_time = journal->j_average_commit_time;
  		spin_unlock(&journal->j_state_lock);
  
  		trans_time = ktime_to_ns(ktime_sub(ktime_get(),
  						   transaction->t_start_time));
  
  		/* Cap the wait at 1000 * jiffies_to_usecs(1). */
  		commit_time = min_t(u64, commit_time,
  				    1000*jiffies_to_usecs(1));
  
  		/*
  		 * NOTE(review): the deadline is ktime_get() + commit_time,
  		 * i.e. a full commit_time from now rather than the
  		 * (commit_time - trans_time) "delta" described above ---
  		 * confirm this is intended.
  		 */
  		if (trans_time < commit_time) {
  			ktime_t expires = ktime_add_ns(ktime_get(),
  						       commit_time);
  			set_current_state(TASK_UNINTERRUPTIBLE);
  			schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
  		}
  	}
  
  	if (handle->h_sync)
  		transaction->t_synchronous_commit = 1;
  	current->journal_info = NULL;
  	spin_lock(&journal->j_state_lock);
  	spin_lock(&transaction->t_handle_lock);
  	/* Hand the handle's unused buffer credits back to the transaction. */
  	transaction->t_outstanding_credits -= handle->h_buffer_credits;
  	transaction->t_updates--;
  	if (!transaction->t_updates) {
  		wake_up(&journal->j_wait_updates);
  		if (journal->j_barrier_count)
  			wake_up(&journal->j_wait_transaction_locked);
  	}
  
  	/*
  	 * If the handle is marked SYNC, we need to set another commit
  	 * going!  We also want to force a commit if the current
  	 * transaction is occupying too much of the log, or if the
  	 * transaction is too old now.
  	 */
  	if (handle->h_sync ||
  			transaction->t_outstanding_credits >
  				journal->j_max_transaction_buffers ||
  			time_after_eq(jiffies, transaction->t_expires)) {
  		/* Do this even for aborted journals: an abort still
  		 * completes the commit thread, it just doesn't write
  		 * anything to disk. */
  		tid_t tid = transaction->t_tid;
  
  		spin_unlock(&transaction->t_handle_lock);
  		jbd_debug(2, "transaction too old, requesting commit for "
  					"handle %p\n", handle);
  		/* This is non-blocking */
  		__log_start_commit(journal, transaction->t_tid);
  		spin_unlock(&journal->j_state_lock);
  
  		/*
  		 * Special case: JFS_SYNC synchronous updates require us
  		 * to wait for the commit to complete.
  		 */
  		if (handle->h_sync && !(current->flags & PF_MEMALLOC))
  			err = log_wait_commit(journal, tid);
  	} else {
  		spin_unlock(&transaction->t_handle_lock);
  		spin_unlock(&journal->j_state_lock);
  	}
  
  	lock_map_release(&handle->h_lockdep_map);
  
  	jbd_free_handle(handle);
  	return err;
  }
0cf01f668   Randy Dunlap   jbd: fix jbd kern...
1481
1482
  /**
   * int journal_force_commit() - force any uncommitted transactions
   * @journal: journal to force
   *
   * For synchronous operations: force any uncommitted transactions
   * to disk.  May seem kludgy, but it reuses all the handle batching
   * code in a very simple manner.
   */
  int journal_force_commit(journal_t *journal)
  {
  	/* A one-credit handle is enough: it is only a vehicle for the sync */
  	handle_t *handle = journal_start(journal, 1);
  
  	/* Could not start a handle: hand the encoded errno back unchanged */
  	if (IS_ERR(handle))
  		return PTR_ERR(handle);
  
  	/*
  	 * Flag the handle as synchronous; journal_stop() then requests a
  	 * commit of the handle's transaction and its result is returned.
  	 */
  	handle->h_sync = 1;
  	return journal_stop(handle);
  }
  
  /*
   *
   * List management code snippets: various functions for manipulating the
   * transaction buffer lists.
   *
   */
  
  /*
   * Append a buffer to a transaction list, given the transaction's list head
   * pointer.
   *
   * j_list_lock is held.
   *
   * jbd_lock_bh_state(jh2bh(jh)) is held.
   */
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
1519
  static inline void
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
  __blist_add_buffer(struct journal_head **list, struct journal_head *jh)
  {
  	if (!*list) {
  		jh->b_tnext = jh->b_tprev = jh;
  		*list = jh;
  	} else {
  		/* Insert at the tail of the list to preserve order */
  		struct journal_head *first = *list, *last = first->b_tprev;
  		jh->b_tprev = last;
  		jh->b_tnext = first;
  		last->b_tnext = first->b_tprev = jh;
  	}
  }
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
1533
  /*
   * Remove a buffer from a transaction list, given the transaction's list
   * head pointer.
   *
   * Called with j_list_lock held, and the journal may not be locked.
   *
   * jbd_lock_bh_state(jh2bh(jh)) is held.
   */
  
  static inline void
  __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
  {
  	struct journal_head *prev = jh->b_tprev;
  	struct journal_head *next = jh->b_tnext;
  
  	/*
  	 * If jh is the list head, advance the head to its successor; when jh
  	 * is its own successor it was the only element and the list is empty.
  	 */
  	if (*list == jh)
  		*list = (next == jh) ? NULL : next;
  
  	/* Splice jh out of the circular list by joining its neighbours */
  	prev->b_tnext = next;
  	next->b_tprev = prev;
  }
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
1553
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
   * Remove a buffer from the appropriate transaction list.
   *
   * Note that this function can *change* the value of
   * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
   * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list.  If the caller
   * is holding onto a copy of one of thee pointers, it could go bad.
   * Generally the caller needs to re-read the pointer from the transaction_t.
   *
   * Called under j_list_lock.  The journal may not be locked.
   */
d394e122b   Adrian Bunk   [PATCH] make fs/j...
1564
  static void __journal_temp_unlink_buffer(struct journal_head *jh)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
  {
  	struct journal_head **list = NULL;
  	transaction_t *transaction;
  	struct buffer_head *bh = jh2bh(jh);
  
  	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
  	transaction = jh->b_transaction;
  	if (transaction)
  		assert_spin_locked(&transaction->t_journal->j_list_lock);
  
  	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
  	if (jh->b_jlist != BJ_None)
c80544dc0   Stephen Hemminger   sparse pointer us...
1577
  		J_ASSERT_JH(jh, transaction != NULL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
  
  	switch (jh->b_jlist) {
  	case BJ_None:
  		return;
  	case BJ_SyncData:
  		list = &transaction->t_sync_datalist;
  		break;
  	case BJ_Metadata:
  		transaction->t_nr_buffers--;
  		J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
  		list = &transaction->t_buffers;
  		break;
  	case BJ_Forget:
  		list = &transaction->t_forget;
  		break;
  	case BJ_IO:
  		list = &transaction->t_iobuf_list;
  		break;
  	case BJ_Shadow:
  		list = &transaction->t_shadow_list;
  		break;
  	case BJ_LogCtl:
  		list = &transaction->t_log_list;
  		break;
  	case BJ_Reserved:
  		list = &transaction->t_reserved_list;
  		break;
  	case BJ_Locked:
  		list = &transaction->t_locked_list;
  		break;
  	}
  
  	__blist_del_buffer(list, jh);
  	jh->b_jlist = BJ_None;
  	if (test_clear_buffer_jbddirty(bh))
  		mark_buffer_dirty(bh);	/* Expose it to the VM */
  }
bb189247f   Jan Kara   jbd: Fix oops in ...
1615
1616
1617
1618
1619
1620
1621
  /*
   * Remove buffer from all transactions.
   *
   * Called with bh_state lock and j_list_lock
   *
   * jh and bh may be already freed when this function returns.
   */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1622
1623
1624
1625
  void __journal_unfile_buffer(struct journal_head *jh)
  {
  	__journal_temp_unlink_buffer(jh);
  	jh->b_transaction = NULL;
bb189247f   Jan Kara   jbd: Fix oops in ...
1626
  	journal_put_journal_head(jh);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1627
1628
1629
1630
  }
  
  /*
   * __journal_unfile_buffer() with the necessary locking added: takes the
   * bh_state lock and j_list_lock around the call.  An extra buffer_head
   * reference is held across the operation and released at the end.
   */
  void journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
  {
  	struct buffer_head *bh = jh2bh(jh);
  
  	/* Get reference so that buffer cannot be freed before we unlock it */
  	get_bh(bh);
  	jbd_lock_bh_state(bh);
  	spin_lock(&journal->j_list_lock);
  	__journal_unfile_buffer(jh);
  	spin_unlock(&journal->j_list_lock);
  	jbd_unlock_bh_state(bh);
  	__brelse(bh);
  }
  
  /*
   * Called from journal_try_to_free_buffers().
   *
   * Detaches the buffer from the journal when it is clean, unlocked and not
   * claimed by a next transaction: either unfiles a data buffer sitting on
   * the BJ_SyncData/BJ_Locked list of its transaction, or removes a buffer
   * that is only on a checkpoint list.  In every other case the buffer is
   * left alone.
   *
   * Called under jbd_lock_bh_state(bh)
   */
  static void
  __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
  {
  	struct journal_head *jh;
  
  	jh = bh2jh(bh);
  
  	/* Buffers that are locked or still dirty are not candidates */
  	if (buffer_locked(bh) || buffer_dirty(bh))
  		goto out;
  
  	if (jh->b_next_transaction != NULL)
  		goto out;
  
  	spin_lock(&journal->j_list_lock);
  	if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
  		if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
  			/* A written-back ordered data buffer */
  			JBUFFER_TRACE(jh, "release data");
  			__journal_unfile_buffer(jh);
  		}
  	} else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
  		/* written-back checkpointed metadata buffer */
  		if (jh->b_jlist == BJ_None) {
  			JBUFFER_TRACE(jh, "remove from checkpoint list");
  			__journal_remove_checkpoint(jh);
  		}
  	}
  	spin_unlock(&journal->j_list_lock);
  out:
  	return;
  }
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
1678
  /**
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1679
1680
1681
   * int journal_try_to_free_buffers() - try to free page buffers.
   * @journal: journal for operation
   * @page: to try and free
3f31fddfa   Mingming Cao   jbd: fix race bet...
1682
1683
1684
   * @gfp_mask: we use the mask to detect how hard should we try to release
   * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
   * release the buffers.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1685
   *
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
1686
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1687
1688
1689
   * For all the buffers on this page,
   * if they are fully written out ordered data, move them onto BUF_CLEAN
   * so try_to_free_buffers() can reap them.
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
1690
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
   * This function returns non-zero if we wish try_to_free_buffers()
   * to be called. We do this if the page is releasable by try_to_free_buffers().
   * We also do it if the page has locked or dirty buffers and the caller wants
   * us to perform sync or async writeout.
   *
   * This complicates JBD locking somewhat.  We aren't protected by the
   * BKL here.  We wish to remove the buffer from its committing or
   * running transaction's ->t_datalist via __journal_unfile_buffer.
   *
   * This may *change* the value of transaction_t->t_datalist, so anyone
   * who looks at t_datalist needs to lock against this function.
   *
   * Even worse, someone may be doing a journal_dirty_data on this
   * buffer.  So we need to lock against that.  journal_dirty_data()
   * will come out of the lock with the buffer dirty, which makes it
   * ineligible for release here.
   *
   * Who else is affected by this?  hmm...  Really the only contender
   * is do_get_write_access() - it could be looking at the buffer while
   * journal_try_to_free_buffer() is changing its state.  But that
   * cannot happen because we never reallocate freed data as metadata
   * while the data is part of a transaction.  Yes?
3f31fddfa   Mingming Cao   jbd: fix race bet...
1713
1714
   *
   * Return 0 on failure, 1 on success
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1715
   */
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
1716
  int journal_try_to_free_buffers(journal_t *journal,
3f31fddfa   Mingming Cao   jbd: fix race bet...
1717
  				struct page *page, gfp_t gfp_mask)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
  {
  	struct buffer_head *head;
  	struct buffer_head *bh;
  	int ret = 0;
  
  	J_ASSERT(PageLocked(page));
  
  	head = page_buffers(page);
  	bh = head;
  	do {
  		struct journal_head *jh;
  
  		/*
  		 * We take our own ref against the journal_head here to avoid
  		 * having to add tons of locking around each instance of
bb189247f   Jan Kara   jbd: Fix oops in ...
1733
  		 * journal_put_journal_head().
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
  		 */
  		jh = journal_grab_journal_head(bh);
  		if (!jh)
  			continue;
  
  		jbd_lock_bh_state(bh);
  		__journal_try_to_free_buffer(journal, bh);
  		journal_put_journal_head(jh);
  		jbd_unlock_bh_state(bh);
  		if (buffer_jbd(bh))
  			goto busy;
  	} while ((bh = bh->b_this_page) != head);
3f31fddfa   Mingming Cao   jbd: fix race bet...
1746

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1747
  	ret = try_to_free_buffers(page);
3f31fddfa   Mingming Cao   jbd: fix race bet...
1748

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
  busy:
  	return ret;
  }
  
  /*
   * This buffer is no longer needed.  If it is on an older transaction's
   * checkpoint list we need to record it on this transaction's forget list
   * to pin this buffer (and hence its checkpointing transaction) down until
   * this transaction commits.  If the buffer isn't on a checkpoint list, we
   * release it.
   * Returns non-zero if JBD no longer has an interest in the buffer.
   *
   * Called under j_list_lock.
   *
   * Called under jbd_lock_bh_state(bh).
   */
  static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
  {
  	int may_free = 1;
  	struct buffer_head *bh = jh2bh(jh);
  
  	if (jh->b_cp_transaction) {
  		JBUFFER_TRACE(jh, "on running+cp transaction");
  		__journal_temp_unlink_buffer(jh);
  		/*
  		 * We don't want to write the buffer anymore, clear the
  		 * bit so that we don't confuse checks in
  		 * __journal_file_buffer
  		 */
  		clear_buffer_dirty(bh);
  		__journal_file_buffer(jh, transaction, BJ_Forget);
  		may_free = 0;
  	} else {
  		JBUFFER_TRACE(jh, "on running transaction");
  		__journal_unfile_buffer(jh);
  	}
  	return may_free;
  }
  
  /*
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
1788
   * journal_invalidatepage
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1789
1790
1791
1792
1793
1794
1795
   *
   * This code is tricky.  It has a number of cases to deal with.
   *
   * There are two invariants which this code relies on:
   *
   * i_size must be updated on disk before we start calling invalidatepage on the
   * data.
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
1796
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1797
1798
1799
1800
1801
   *  This is done in ext3 by defining an ext3_setattr method which
   *  updates i_size before truncate gets going.  By maintaining this
   *  invariant, we can be sure that it is safe to throw away any buffers
   *  attached to the current transaction: once the transaction commits,
   *  we know that the data will not be needed.
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
1802
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1803
   *  Note however that we can *not* throw away data belonging to the
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
1804
   *  previous, committing transaction!
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
   *
   * Any disk blocks which *are* part of the previous, committing
   * transaction (and which therefore cannot be discarded immediately) are
   * not going to be reused in the new running transaction
   *
   *  The bitmap committed_data images guarantee this: any block which is
   *  allocated in one transaction and removed in the next will be marked
   *  as in-use in the committed_data bitmap, so cannot be reused until
   *  the next transaction to delete the block commits.  This means that
   *  leaving committing buffers dirty is quite safe: the disk blocks
   *  cannot be reallocated to a different file and so buffer aliasing is
   *  not possible.
   *
   *
   * The above applies mainly to ordered data mode.  In writeback mode we
   * don't make guarantees about the order in which data hits disk --- in
   * particular we don't guarantee that new dirty data is flushed before
   * transaction commit --- so it is always safe just to discard data
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
1823
   * immediately in that mode.  --sct
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
   */
  
  /*
   * The journal_unmap_buffer helper function returns zero if the buffer
   * concerned remains pinned as an anonymous buffer belonging to an older
   * transaction.
   *
   * We're outside-transaction here.  Either or both of j_running_transaction
   * and j_committing_transaction may be NULL.
   *
   * Lock order taken below: j_state_lock, then the bh_state lock, then
   * j_list_lock; every exit path releases them in the reverse order.
   */
  static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
  {
  	transaction_t *transaction;
  	struct journal_head *jh;
  	int may_free = 1;
  	int ret;
  
  	BUFFER_TRACE(bh, "entry");
  
  	/*
  	 * It is safe to proceed here without the j_list_lock because the
  	 * buffers cannot be stolen by try_to_free_buffers as long as we are
  	 * holding the page lock. --sct
  	 */
  
  	if (!buffer_jbd(bh))
  		goto zap_buffer_unlocked;
  
  	spin_lock(&journal->j_state_lock);
  	jbd_lock_bh_state(bh);
  	spin_lock(&journal->j_list_lock);
  
  	jh = journal_grab_journal_head(bh);
  	if (!jh)
  		goto zap_buffer_no_jh;
  
  	/*
  	 * We cannot remove the buffer from checkpoint lists until the
  	 * transaction adding inode to orphan list (let's call it T)
  	 * is committed.  Otherwise if the transaction changing the
  	 * buffer would be cleaned from the journal before T is
  	 * committed, a crash will cause that the correct contents of
  	 * the buffer will be lost.  On the other hand we have to
  	 * clear the buffer dirty bit at latest at the moment when the
  	 * transaction marking the buffer as freed in the filesystem
  	 * structures is committed because from that moment on the
  	 * buffer can be reallocated and used by a different page.
  	 * Since the block hasn't been freed yet but the inode has
  	 * already been added to orphan list, it is safe for us to add
  	 * the buffer to BJ_Forget list of the newest transaction.
  	 */
  	transaction = jh->b_transaction;
  	if (transaction == NULL) {
  		/* First case: not on any transaction.  If it
  		 * has no checkpoint link, then we can zap it:
  		 * it's a writeback-mode buffer so we don't care
  		 * if it hits disk safely. */
  		if (!jh->b_cp_transaction) {
  			JBUFFER_TRACE(jh, "not on any transaction: zap");
  			goto zap_buffer;
  		}
  
  		if (!buffer_dirty(bh)) {
  			/* bdflush has written it.  We can drop it now */
  			goto zap_buffer;
  		}
  
  		/* OK, it must be in the journal but still not
  		 * written fully to disk: it's metadata or
  		 * journaled data... */
  
  		if (journal->j_running_transaction) {
  			/* ... and once the current transaction has
  			 * committed, the buffer won't be needed any
  			 * longer. */
  			JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
  			ret = __dispose_buffer(jh,
  					journal->j_running_transaction);
  			journal_put_journal_head(jh);
  			spin_unlock(&journal->j_list_lock);
  			jbd_unlock_bh_state(bh);
  			spin_unlock(&journal->j_state_lock);
  			return ret;
  		} else {
  			/* There is no currently-running transaction. So the
  			 * orphan record which we wrote for this file must have
  			 * passed into commit.  We must attach this buffer to
  			 * the committing transaction, if it exists. */
  			if (journal->j_committing_transaction) {
  				JBUFFER_TRACE(jh, "give to committing trans");
  				ret = __dispose_buffer(jh,
  					journal->j_committing_transaction);
  				journal_put_journal_head(jh);
  				spin_unlock(&journal->j_list_lock);
  				jbd_unlock_bh_state(bh);
  				spin_unlock(&journal->j_state_lock);
  				return ret;
  			} else {
  				/* The orphan record's transaction has
  				 * committed.  We can cleanse this buffer */
  				clear_buffer_jbddirty(bh);
  				goto zap_buffer;
  			}
  		}
  	} else if (transaction == journal->j_committing_transaction) {
  		JBUFFER_TRACE(jh, "on committing transaction");
  		if (jh->b_jlist == BJ_Locked) {
  			/*
  			 * The buffer is on the committing transaction's locked
  			 * list.  We have the buffer locked, so I/O has
  			 * completed.  So we can nail the buffer now.
  			 */
  			may_free = __dispose_buffer(jh, transaction);
  			goto zap_buffer;
  		}
  		/*
  		 * The buffer is committing, we simply cannot touch
  		 * it. So we just set b_next_transaction to the
  		 * running transaction (if there is one) and mark
  		 * buffer as freed so that commit code knows it should
  		 * clear dirty bits when it is done with the buffer.
  		 */
  		set_buffer_freed(bh);
  		if (journal->j_running_transaction && buffer_jbddirty(bh))
  			jh->b_next_transaction = journal->j_running_transaction;
  		journal_put_journal_head(jh);
  		spin_unlock(&journal->j_list_lock);
  		jbd_unlock_bh_state(bh);
  		spin_unlock(&journal->j_state_lock);
  		return 0;
  	} else {
  		/* Good, the buffer belongs to the running transaction.
  		 * We are writing our own transaction's data, not any
  		 * previous one's, so it is safe to throw it away
  		 * (remember that we expect the filesystem to have set
  		 * i_size already for this truncate so recovery will not
  		 * expose the disk blocks we are discarding here.) */
  		J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
  		JBUFFER_TRACE(jh, "on running transaction");
  		may_free = __dispose_buffer(jh, transaction);
  	}
  
  zap_buffer:
  	/* Drop the reference taken by journal_grab_journal_head() above */
  	journal_put_journal_head(jh);
  zap_buffer_no_jh:
  	spin_unlock(&journal->j_list_lock);
  	jbd_unlock_bh_state(bh);
  	spin_unlock(&journal->j_state_lock);
  zap_buffer_unlocked:
  	/* Strip the buffer state so it no longer refers to a disk block */
  	clear_buffer_dirty(bh);
  	J_ASSERT_BH(bh, !buffer_jbddirty(bh));
  	clear_buffer_mapped(bh);
  	clear_buffer_req(bh);
  	clear_buffer_new(bh);
  	bh->b_bdev = NULL;
  	return may_free;
  }
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
1980
  /**
a6b91919e   Randy Dunlap   fs: fix kernel-do...
1981
1982
   * void journal_invalidatepage() - invalidate a journal page
   * @journal: journal to use for flush
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1983
1984
1985
1986
   * @page:    page to flush
   * @offset:  length of page to invalidate.
   *
   * Reap page buffers containing data after offset in page.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1987
   */
2ff28e22b   NeilBrown   [PATCH] Make addr...
1988
  void journal_invalidatepage(journal_t *journal,
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
1989
  		      struct page *page,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1990
1991
1992
1993
1994
1995
1996
1997
1998
  		      unsigned long offset)
  {
  	struct buffer_head *head, *bh, *next;
  	unsigned int curr_off = 0;
  	int may_free = 1;
  
  	if (!PageLocked(page))
  		BUG();
  	if (!page_has_buffers(page))
2ff28e22b   NeilBrown   [PATCH] Make addr...
1999
  		return;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2000
2001
2002
2003
2004
2005
2006
2007
2008
  
  	/* We will potentially be playing with lists other than just the
  	 * data lists (especially for journaled data mode), so be
  	 * cautious in our locking. */
  
  	head = bh = page_buffers(page);
  	do {
  		unsigned int next_off = curr_off + bh->b_size;
  		next = bh->b_this_page;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2009
  		if (offset <= curr_off) {
e9ad5620b   Dave Kleikamp   [PATCH] ext3: Mor...
2010
  			/* This block is wholly outside the truncation point */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
  			lock_buffer(bh);
  			may_free &= journal_unmap_buffer(journal, bh);
  			unlock_buffer(bh);
  		}
  		curr_off = next_off;
  		bh = next;
  
  	} while (bh != head);
  
  	if (!offset) {
2ff28e22b   NeilBrown   [PATCH] Make addr...
2021
2022
  		if (may_free && try_to_free_buffers(page))
  			J_ASSERT(!page_has_buffers(page));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2023
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2024
  }
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
2025
2026
  /*
   * File a buffer on the given transaction list.
   *
   * The buffer must either be unattached (b_transaction == NULL) or already
   * belong to @transaction.  If it is already on list @jlist of that
   * transaction nothing is done.  A journal_head reference is taken when the
   * buffer was not previously attached to a transaction.
   *
   * Called under j_list_lock and jbd_lock_bh_state(jh2bh(jh)); both are
   * asserted below.
   */
  void __journal_file_buffer(struct journal_head *jh,
  			transaction_t *transaction, int jlist)
  {
  	struct journal_head **list = NULL;
  	int was_dirty = 0;
  	struct buffer_head *bh = jh2bh(jh);
  
  	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
  	assert_spin_locked(&transaction->t_journal->j_list_lock);
  
  	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
  	J_ASSERT_JH(jh, jh->b_transaction == transaction ||
  				jh->b_transaction == NULL);
  
  	/* Already filed where the caller wants it */
  	if (jh->b_transaction && jh->b_jlist == jlist)
  		return;
  
  	if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
  	    jlist == BJ_Shadow || jlist == BJ_Forget) {
  		/*
  		 * For metadata buffers, we track dirty bit in buffer_jbddirty
  		 * instead of buffer_dirty. We should not see a dirty bit set
  		 * here because we clear it in do_get_write_access but e.g.
  		 * tune2fs can modify the sb and set the dirty bit at any time
  		 * so we try to gracefully handle that.
  		 */
  		if (buffer_dirty(bh))
  			warn_dirty_buffer(bh);
  		/* Remember dirtiness; it is re-applied as jbddirty at the end */
  		if (test_clear_buffer_dirty(bh) ||
  		    test_clear_buffer_jbddirty(bh))
  			was_dirty = 1;
  	}
  
  	if (jh->b_transaction)
  		__journal_temp_unlink_buffer(jh);
  	else
  		journal_grab_journal_head(bh);
  	jh->b_transaction = transaction;
  
  	switch (jlist) {
  	case BJ_None:
  		J_ASSERT_JH(jh, !jh->b_committed_data);
  		J_ASSERT_JH(jh, !jh->b_frozen_data);
  		return;
  	case BJ_SyncData:
  		list = &transaction->t_sync_datalist;
  		break;
  	case BJ_Metadata:
  		transaction->t_nr_buffers++;
  		list = &transaction->t_buffers;
  		break;
  	case BJ_Forget:
  		list = &transaction->t_forget;
  		break;
  	case BJ_IO:
  		list = &transaction->t_iobuf_list;
  		break;
  	case BJ_Shadow:
  		list = &transaction->t_shadow_list;
  		break;
  	case BJ_LogCtl:
  		list = &transaction->t_log_list;
  		break;
  	case BJ_Reserved:
  		list = &transaction->t_reserved_list;
  		break;
  	case BJ_Locked:
  		list =  &transaction->t_locked_list;
  		break;
  	}
  
  	__blist_add_buffer(list, jh);
  	jh->b_jlist = jlist;
  
  	if (was_dirty)
  		set_buffer_jbddirty(bh);
  }
  
  void journal_file_buffer(struct journal_head *jh,
  				transaction_t *transaction, int jlist)
  {
  	struct buffer_head *bh = jh2bh(jh);
  	journal_t *journal = transaction->t_journal;
  
  	/* Locked wrapper: bh_state lock outermost, j_list_lock inside it */
  	jbd_lock_bh_state(bh);
  	spin_lock(&journal->j_list_lock);
  
  	__journal_file_buffer(jh, transaction, jlist);
  
  	spin_unlock(&journal->j_list_lock);
  	jbd_unlock_bh_state(bh);
  }
ae6ddcc5f   Mingming Cao   [PATCH] ext3 and ...
2114
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2115
2116
2117
2118
2119
   * Remove a buffer from its current buffer list in preparation for
   * dropping it from its current transaction entirely.  If the buffer has
   * already started to be used by a subsequent transaction, refile the
   * buffer on that transaction's metadata list.
   *
bb189247f   Jan Kara   jbd: Fix oops in ...
2120
   * Called under j_list_lock
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2121
   * Called under jbd_lock_bh_state(jh2bh(jh))
bb189247f   Jan Kara   jbd: Fix oops in ...
2122
2123
   *
   * jh and bh may be already free when this function returns
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2124
2125
2126
   */
  void __journal_refile_buffer(struct journal_head *jh)
  {
869639189   Jan Kara   jbd: Delay discar...
2127
  	int was_dirty, jlist;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
  	struct buffer_head *bh = jh2bh(jh);
  
  	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
  	if (jh->b_transaction)
  		assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
  
  	/* If the buffer is now unused, just drop it. */
  	if (jh->b_next_transaction == NULL) {
  		__journal_unfile_buffer(jh);
  		return;
  	}
  
  	/*
  	 * It has been modified by a later transaction: add it to the new
  	 * transaction's metadata list.
  	 */
  
  	was_dirty = test_clear_buffer_jbddirty(bh);
  	__journal_temp_unlink_buffer(jh);
bb189247f   Jan Kara   jbd: Fix oops in ...
2147
2148
2149
2150
2151
  	/*
  	 * We set b_transaction here because b_next_transaction will inherit
  	 * our jh reference and thus __journal_file_buffer() must not take a
  	 * new one.
  	 */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2152
2153
  	jh->b_transaction = jh->b_next_transaction;
  	jh->b_next_transaction = NULL;
869639189   Jan Kara   jbd: Delay discar...
2154
2155
2156
2157
2158
2159
2160
  	if (buffer_freed(bh))
  		jlist = BJ_Forget;
  	else if (jh->b_modified)
  		jlist = BJ_Metadata;
  	else
  		jlist = BJ_Reserved;
  	__journal_file_buffer(jh, jh->b_transaction, jlist);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2161
2162
2163
2164
2165
2166
2167
  	J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
  
  	if (was_dirty)
  		set_buffer_jbddirty(bh);
  }
  
  /*
   * __journal_refile_buffer() with necessary locking added. We take our bh
   * reference so that we can safely unlock bh.
   *
   * The jh and bh may be freed by this call.
   */
  void journal_refile_buffer(journal_t *journal, struct journal_head *jh)
  {
  	struct buffer_head *bh = jh2bh(jh);
  
  	/* Get reference so that buffer cannot be freed before we unlock it */
  	get_bh(bh);
  	jbd_lock_bh_state(bh);
  	spin_lock(&journal->j_list_lock);
  
  	__journal_refile_buffer(jh);
  	jbd_unlock_bh_state(bh);
  
  	spin_unlock(&journal->j_list_lock);
  	__brelse(bh);
  }