Commit 4afe978530702c934dfdb11f54073136818b2119

Authored by Hidehiro Kawai
Committed by Linus Torvalds
1 parent 66f50ee3ce

jbd: fix error handling for checkpoint io

When a checkpointing IO fails, current JBD code doesn't check the error
and continue journaling.  This means latest metadata can be lost from both
the journal and filesystem.

This patch leaves the failed metadata blocks in the journal space and
aborts journaling in the case of log_do_checkpoint().  To achieve this, we
need to do:

1. don't remove the failed buffer from the checkpoint list where in
   the case of __try_to_free_cp_buf() because it may be released or
   overwritten by a later transaction
2. log_do_checkpoint() is the last chance, remove the failed buffer
   from the checkpoint list and abort the journal
3. when checkpointing fails, don't update the journal super block to
   prevent the journaled contents from being cleaned.  For safety,
   don't update j_tail and j_tail_sequence either
4. when checkpointing fails, notify this error to the ext3 layer so
   that ext3 don't clear the needs_recovery flag, otherwise the
   journaled contents are ignored and cleaned in the recovery phase
5. if the recovery fails, keep the needs_recovery flag
6. prevent cleanup_journal_tail() from being called between
   __journal_drop_transaction() and journal_abort() (a race issue
   between journal_flush() and __log_wait_for_space()

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Acked-by: Jan Kara <jack@suse.cz>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 4 changed files with 65 additions and 21 deletions Side-by-side Diff

... ... @@ -93,7 +93,8 @@
93 93 int ret = 0;
94 94 struct buffer_head *bh = jh2bh(jh);
95 95  
96   - if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
  96 + if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
  97 + !buffer_dirty(bh) && buffer_uptodate(bh)) {
97 98 JBUFFER_TRACE(jh, "remove from checkpoint list");
98 99 ret = __journal_remove_checkpoint(jh) + 1;
99 100 jbd_unlock_bh_state(bh);
100 101  
101 102  
102 103  
... ... @@ -160,21 +161,25 @@
160 161 * buffers. Note that we take the buffers in the opposite ordering
161 162 * from the one in which they were submitted for IO.
162 163 *
  164 + * Return 0 on success, and return <0 if some buffers have failed
  165 + * to be written out.
  166 + *
163 167 * Called with j_list_lock held.
164 168 */
165   -static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
  169 +static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
166 170 {
167 171 struct journal_head *jh;
168 172 struct buffer_head *bh;
169 173 tid_t this_tid;
170 174 int released = 0;
  175 + int ret = 0;
171 176  
172 177 this_tid = transaction->t_tid;
173 178 restart:
174 179 /* Did somebody clean up the transaction in the meanwhile? */
175 180 if (journal->j_checkpoint_transactions != transaction ||
176 181 transaction->t_tid != this_tid)
177   - return;
  182 + return ret;
178 183 while (!released && transaction->t_checkpoint_io_list) {
179 184 jh = transaction->t_checkpoint_io_list;
180 185 bh = jh2bh(jh);
... ... @@ -194,6 +199,9 @@
194 199 spin_lock(&journal->j_list_lock);
195 200 goto restart;
196 201 }
  202 + if (unlikely(!buffer_uptodate(bh)))
  203 + ret = -EIO;
  204 +
197 205 /*
198 206 * Now in whatever state the buffer currently is, we know that
199 207 * it has been written out and so we can drop it from the list
... ... @@ -203,6 +211,8 @@
203 211 journal_remove_journal_head(bh);
204 212 __brelse(bh);
205 213 }
  214 +
  215 + return ret;
206 216 }
207 217  
208 218 #define NR_BATCH 64
... ... @@ -226,7 +236,8 @@
226 236 * Try to flush one buffer from the checkpoint list to disk.
227 237 *
228 238 * Return 1 if something happened which requires us to abort the current
229   - * scan of the checkpoint list.
  239 + * scan of the checkpoint list. Return <0 if the buffer has failed to
  240 + * be written out.
230 241 *
231 242 * Called with j_list_lock held and drops it if 1 is returned
232 243 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
... ... @@ -256,6 +267,9 @@
256 267 log_wait_commit(journal, tid);
257 268 ret = 1;
258 269 } else if (!buffer_dirty(bh)) {
  270 + ret = 1;
  271 + if (unlikely(!buffer_uptodate(bh)))
  272 + ret = -EIO;
259 273 J_ASSERT_JH(jh, !buffer_jbddirty(bh));
260 274 BUFFER_TRACE(bh, "remove from checkpoint");
261 275 __journal_remove_checkpoint(jh);
... ... @@ -263,7 +277,6 @@
263 277 jbd_unlock_bh_state(bh);
264 278 journal_remove_journal_head(bh);
265 279 __brelse(bh);
266   - ret = 1;
267 280 } else {
268 281 /*
269 282 * Important: we are about to write the buffer, and
... ... @@ -295,6 +308,7 @@
295 308 * to disk. We submit larger chunks of data at once.
296 309 *
297 310 * The journal should be locked before calling this function.
  311 + * Called with j_checkpoint_mutex held.
298 312 */
299 313 int log_do_checkpoint(journal_t *journal)
300 314 {
... ... @@ -318,6 +332,7 @@
318 332 * OK, we need to start writing disk blocks. Take one transaction
319 333 * and write it.
320 334 */
  335 + result = 0;
321 336 spin_lock(&journal->j_list_lock);
322 337 if (!journal->j_checkpoint_transactions)
323 338 goto out;
... ... @@ -334,7 +349,7 @@
334 349 int batch_count = 0;
335 350 struct buffer_head *bhs[NR_BATCH];
336 351 struct journal_head *jh;
337   - int retry = 0;
  352 + int retry = 0, err;
338 353  
339 354 while (!retry && transaction->t_checkpoint_list) {
340 355 struct buffer_head *bh;
... ... @@ -347,6 +362,8 @@
347 362 break;
348 363 }
349 364 retry = __process_buffer(journal, jh, bhs,&batch_count);
  365 + if (retry < 0 && !result)
  366 + result = retry;
350 367 if (!retry && (need_resched() ||
351 368 spin_needbreak(&journal->j_list_lock))) {
352 369 spin_unlock(&journal->j_list_lock);
353 370  
354 371  
... ... @@ -371,14 +388,18 @@
371 388 * Now we have cleaned up the first transaction's checkpoint
372 389 * list. Let's clean up the second one
373 390 */
374   - __wait_cp_io(journal, transaction);
  391 + err = __wait_cp_io(journal, transaction);
  392 + if (!result)
  393 + result = err;
375 394 }
376 395 out:
377 396 spin_unlock(&journal->j_list_lock);
378   - result = cleanup_journal_tail(journal);
379 397 if (result < 0)
380   - return result;
381   - return 0;
  398 + journal_abort(journal, result);
  399 + else
  400 + result = cleanup_journal_tail(journal);
  401 +
  402 + return (result < 0) ? result : 0;
382 403 }
383 404  
384 405 /*
... ... @@ -394,8 +415,9 @@
394 415 * This is the only part of the journaling code which really needs to be
395 416 * aware of transaction aborts. Checkpointing involves writing to the
396 417 * main filesystem area rather than to the journal, so it can proceed
397   - * even in abort state, but we must not update the journal superblock if
398   - * we have an abort error outstanding.
  418 + * even in abort state, but we must not update the super block if
  419 + * checkpointing may have failed. Otherwise, we would lose some metadata
  420 + * buffers which should be written-back to the filesystem.
399 421 */
400 422  
401 423 int cleanup_journal_tail(journal_t *journal)
... ... @@ -403,6 +425,9 @@
403 425 transaction_t * transaction;
404 426 tid_t first_tid;
405 427 unsigned long blocknr, freed;
  428 +
  429 + if (is_journal_aborted(journal))
  430 + return 1;
406 431  
407 432 /* OK, work out the oldest transaction remaining in the log, and
408 433 * the log block it starts at.
... ... @@ -1121,9 +1121,12 @@
1121 1121 *
1122 1122 * Release a journal_t structure once it is no longer in use by the
1123 1123 * journaled object.
  1124 + * Return <0 if we couldn't clean up the journal.
1124 1125 */
1125   -void journal_destroy(journal_t *journal)
  1126 +int journal_destroy(journal_t *journal)
1126 1127 {
  1128 + int err = 0;
  1129 +
1127 1130 /* Wait for the commit thread to wake up and die. */
1128 1131 journal_kill_thread(journal);
1129 1132  
1130 1133  
... ... @@ -1146,11 +1149,16 @@
1146 1149 J_ASSERT(journal->j_checkpoint_transactions == NULL);
1147 1150 spin_unlock(&journal->j_list_lock);
1148 1151  
1149   - /* We can now mark the journal as empty. */
1150   - journal->j_tail = 0;
1151   - journal->j_tail_sequence = ++journal->j_transaction_sequence;
1152 1152 if (journal->j_sb_buffer) {
1153   - journal_update_superblock(journal, 1);
  1153 + if (!is_journal_aborted(journal)) {
  1154 + /* We can now mark the journal as empty. */
  1155 + journal->j_tail = 0;
  1156 + journal->j_tail_sequence =
  1157 + ++journal->j_transaction_sequence;
  1158 + journal_update_superblock(journal, 1);
  1159 + } else {
  1160 + err = -EIO;
  1161 + }
1154 1162 brelse(journal->j_sb_buffer);
1155 1163 }
1156 1164  
... ... @@ -1160,6 +1168,8 @@
1160 1168 journal_destroy_revoke(journal);
1161 1169 kfree(journal->j_wbuf);
1162 1170 kfree(journal);
  1171 +
  1172 + return err;
1163 1173 }
1164 1174  
1165 1175  
1166 1176  
1167 1177  
... ... @@ -1359,10 +1369,16 @@
1359 1369 spin_lock(&journal->j_list_lock);
1360 1370 while (!err && journal->j_checkpoint_transactions != NULL) {
1361 1371 spin_unlock(&journal->j_list_lock);
  1372 + mutex_lock(&journal->j_checkpoint_mutex);
1362 1373 err = log_do_checkpoint(journal);
  1374 + mutex_unlock(&journal->j_checkpoint_mutex);
1363 1375 spin_lock(&journal->j_list_lock);
1364 1376 }
1365 1377 spin_unlock(&journal->j_list_lock);
  1378 +
  1379 + if (is_journal_aborted(journal))
  1380 + return -EIO;
  1381 +
1366 1382 cleanup_journal_tail(journal);
1367 1383  
1368 1384 /* Finally, mark the journal as really needing no recovery.
... ... @@ -1384,7 +1400,7 @@
1384 1400 J_ASSERT(journal->j_head == journal->j_tail);
1385 1401 J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
1386 1402 spin_unlock(&journal->j_state_lock);
1387   - return err;
  1403 + return 0;
1388 1404 }
1389 1405  
1390 1406 /**
... ... @@ -223,7 +223,7 @@
223 223 */
224 224 int journal_recover(journal_t *journal)
225 225 {
226   - int err;
  226 + int err, err2;
227 227 journal_superblock_t * sb;
228 228  
229 229 struct recovery_info info;
... ... @@ -261,7 +261,10 @@
261 261 journal->j_transaction_sequence = ++info.end_transaction;
262 262  
263 263 journal_clear_revoke(journal);
264   - sync_blockdev(journal->j_fs_dev);
  264 + err2 = sync_blockdev(journal->j_fs_dev);
  265 + if (!err)
  266 + err = err2;
  267 +
265 268 return err;
266 269 }
267 270  
... ... @@ -911,7 +911,7 @@
911 911 (journal_t *, unsigned long, unsigned long, unsigned long);
912 912 extern int journal_create (journal_t *);
913 913 extern int journal_load (journal_t *journal);
914   -extern void journal_destroy (journal_t *);
  914 +extern int journal_destroy (journal_t *);
915 915 extern int journal_recover (journal_t *journal);
916 916 extern int journal_wipe (journal_t *, int);
917 917 extern int journal_skip_recovery (journal_t *);