jbd: fix error handling for checkpoint io

When a checkpointing IO fails, current JBD code doesn't check the error and continue journaling. This means latest metadata can be lost from both the journal and filesystem. This patch leaves the failed metadata blocks in the journal space and aborts journaling in the case of log_do_checkpoint(). To achieve this, we need to do: 1. don't remove the failed buffer from the checkpoint list where in the case of __try_to_free_cp_buf() because it may be released or overwritten by a later transaction 2. log_do_checkpoint() is the last chance, remove the failed buffer from the checkpoint list and abort the journal 3. when checkpointing fails, don't update the journal super block to prevent the journaled contents from being cleaned. For safety, don't update j_tail and j_tail_sequence either 4. when checkpointing fails, notify this error to the ext3 layer so that ext3 don't clear the needs_recovery flag, otherwise the journaled contents are ignored and cleaned in the recovery phase 5. if the recovery fails, keep the needs_recovery flag 6. prevent cleanup_journal_tail() from being called between __journal_drop_transaction() and journal_abort() (a race issue between journal_flush() and __log_wait_for_space() Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com> Acked-by: Jan Kara <jack@suse.cz> Cc: <linux-ext4@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

jbd: fix error handling for checkpoint io
When a checkpointing IO fails, current JBD code doesn't check the error and continue journaling. This means latest metadata can be lost from both the journal and filesystem. This patch leaves the failed metadata blocks in the journal space and aborts journaling in the case of log_do_checkpoint(). To achieve this, we need to do: 1. don't remove the failed buffer from the checkpoint list where in the case of __try_to_free_cp_buf() because it may be released or overwritten by a later transaction 2. log_do_checkpoint() is the last chance, remove the failed buffer from the checkpoint list and abort the journal 3. when checkpointing fails, don't update the journal super block to prevent the journaled contents from being cleaned. For safety, don't update j_tail and j_tail_sequence either 4. when checkpointing fails, notify this error to the ext3 layer so that ext3 don't clear the needs_recovery flag, otherwise the journaled contents are ignored and cleaned in the recovery phase 5. if the recovery fails, keep the needs_recovery flag 6. prevent cleanup_journal_tail() from being called between __journal_drop_transaction() and journal_abort() (a race issue between journal_flush() and __log_wait_for_space() Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com> Acked-by: Jan Kara <jack@suse.cz> Cc: <linux-ext4@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Hidehiro Kawai · Linus Torvalds
1 parent 66f50ee3ce
Showing 4 changed files with 65 additions and 21 deletions Side-by-side Diff
fs/jbd/checkpoint.c
fs/jbd/journal.c
fs/jbd/recovery.c
include/linux/jbd.h
@@ -93,7 +93,8 @@
 	int ret = 0;
 	struct buffer_head *bh = jh2bh(jh);
  
-	if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
+	if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
+	    !buffer_dirty(bh) && buffer_uptodate(bh)) {
 		JBUFFER_TRACE(jh, "remove from checkpoint list");
 		ret = __journal_remove_checkpoint(jh) + 1;
 		jbd_unlock_bh_state(bh);
  
  
  
@@ -160,21 +161,25 @@
  * buffers. Note that we take the buffers in the opposite ordering
  * from the one in which they were submitted for IO.
  *
+ * Return 0 on success, and return <0 if some buffers have failed
+ * to be written out.
+ *
  * Called with j_list_lock held.
  */
-static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
+static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
 {
 	struct journal_head *jh;
 	struct buffer_head *bh;
 	tid_t this_tid;
 	int released = 0;
+	int ret = 0;
  
 	this_tid = transaction->t_tid;
 restart:
 	/* Did somebody clean up the transaction in the meanwhile? */
 	if (journal->j_checkpoint_transactions != transaction ||
 			transaction->t_tid != this_tid)
-		return;
+		return ret;
 	while (!released && transaction->t_checkpoint_io_list) {
 		jh = transaction->t_checkpoint_io_list;
 		bh = jh2bh(jh);
@@ -194,6 +199,9 @@
 			spin_lock(&journal->j_list_lock);
 			goto restart;
 		}
+		if (unlikely(!buffer_uptodate(bh)))
+			ret = -EIO;
+
 		/*
 		 * Now in whatever state the buffer currently is, we know that
 		 * it has been written out and so we can drop it from the list
@@ -203,6 +211,8 @@
 		journal_remove_journal_head(bh);
 		__brelse(bh);
 	}
+
+	return ret;
 }
  
 #define NR_BATCH	64
@@ -226,7 +236,8 @@
  * Try to flush one buffer from the checkpoint list to disk.
  *
  * Return 1 if something happened which requires us to abort the current
- * scan of the checkpoint list.
+ * scan of the checkpoint list.  Return <0 if the buffer has failed to
+ * be written out.
  *
  * Called with j_list_lock held and drops it if 1 is returned
  * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
@@ -256,6 +267,9 @@
 		log_wait_commit(journal, tid);
 		ret = 1;
 	} else if (!buffer_dirty(bh)) {
+		ret = 1;
+		if (unlikely(!buffer_uptodate(bh)))
+			ret = -EIO;
 		J_ASSERT_JH(jh, !buffer_jbddirty(bh));
 		BUFFER_TRACE(bh, "remove from checkpoint");
 		__journal_remove_checkpoint(jh);
@@ -263,7 +277,6 @@
 		jbd_unlock_bh_state(bh);
 		journal_remove_journal_head(bh);
 		__brelse(bh);
-		ret = 1;
 	} else {
 		/*
 		 * Important: we are about to write the buffer, and
@@ -295,6 +308,7 @@
  * to disk. We submit larger chunks of data at once.
  *
  * The journal should be locked before calling this function.
+ * Called with j_checkpoint_mutex held.
  */
 int log_do_checkpoint(journal_t *journal)
 {
@@ -318,6 +332,7 @@
 	 * OK, we need to start writing disk blocks.  Take one transaction
 	 * and write it.
 	 */
+	result = 0;
 	spin_lock(&journal->j_list_lock);
 	if (!journal->j_checkpoint_transactions)
 		goto out;
@@ -334,7 +349,7 @@
 		int batch_count = 0;
 		struct buffer_head *bhs[NR_BATCH];
 		struct journal_head *jh;
-		int retry = 0;
+		int retry = 0, err;
  
 		while (!retry && transaction->t_checkpoint_list) {
 			struct buffer_head *bh;
@@ -347,6 +362,8 @@
 				break;
 			}
 			retry = __process_buffer(journal, jh, bhs,&batch_count);
+			if (retry < 0 && !result)
+				result = retry;
 			if (!retry && (need_resched() ||
 				spin_needbreak(&journal->j_list_lock))) {
 				spin_unlock(&journal->j_list_lock);
  
  
@@ -371,14 +388,18 @@
 		 * Now we have cleaned up the first transaction's checkpoint
 		 * list. Let's clean up the second one
 		 */
-		__wait_cp_io(journal, transaction);
+		err = __wait_cp_io(journal, transaction);
+		if (!result)
+			result = err;
 	}
 out:
 	spin_unlock(&journal->j_list_lock);
-	result = cleanup_journal_tail(journal);
 	if (result < 0)
-		return result;
-	return 0;
+		journal_abort(journal, result);
+	else
+		result = cleanup_journal_tail(journal);
+
+	return (result < 0) ? result : 0;
 }
  
 /*
@@ -394,8 +415,9 @@
  * This is the only part of the journaling code which really needs to be
  * aware of transaction aborts.  Checkpointing involves writing to the
  * main filesystem area rather than to the journal, so it can proceed
- * even in abort state, but we must not update the journal superblock if
- * we have an abort error outstanding.
+ * even in abort state, but we must not update the super block if
+ * checkpointing may have failed.  Otherwise, we would lose some metadata
+ * buffers which should be written-back to the filesystem.
  */
  
 int cleanup_journal_tail(journal_t *journal)
@@ -403,6 +425,9 @@
 	transaction_t * transaction;
 	tid_t		first_tid;
 	unsigned long	blocknr, freed;
+
+	if (is_journal_aborted(journal))
+		return 1;
  
 	/* OK, work out the oldest transaction remaining in the log, and
 	 * the log block it starts at.
@@ -1121,9 +1121,12 @@
  *
  * Release a journal_t structure once it is no longer in use by the
  * journaled object.
+ * Return <0 if we couldn't clean up the journal.
  */
-void journal_destroy(journal_t *journal)
+int journal_destroy(journal_t *journal)
 {
+	int err = 0;
+
 	/* Wait for the commit thread to wake up and die. */
 	journal_kill_thread(journal);
  
  
@@ -1146,11 +1149,16 @@
 	J_ASSERT(journal->j_checkpoint_transactions == NULL);
 	spin_unlock(&journal->j_list_lock);
  
-	/* We can now mark the journal as empty. */
-	journal->j_tail = 0;
-	journal->j_tail_sequence = ++journal->j_transaction_sequence;
 	if (journal->j_sb_buffer) {
-		journal_update_superblock(journal, 1);
+		if (!is_journal_aborted(journal)) {
+			/* We can now mark the journal as empty. */
+			journal->j_tail = 0;
+			journal->j_tail_sequence =
+				++journal->j_transaction_sequence;
+			journal_update_superblock(journal, 1);
+		} else {
+			err = -EIO;
+		}
 		brelse(journal->j_sb_buffer);
 	}
  
@@ -1160,6 +1168,8 @@
 		journal_destroy_revoke(journal);
 	kfree(journal->j_wbuf);
 	kfree(journal);
+
+	return err;
 }
  
  
  
  
@@ -1359,10 +1369,16 @@
 	spin_lock(&journal->j_list_lock);
 	while (!err && journal->j_checkpoint_transactions != NULL) {
 		spin_unlock(&journal->j_list_lock);
+		mutex_lock(&journal->j_checkpoint_mutex);
 		err = log_do_checkpoint(journal);
+		mutex_unlock(&journal->j_checkpoint_mutex);
 		spin_lock(&journal->j_list_lock);
 	}
 	spin_unlock(&journal->j_list_lock);
+
+	if (is_journal_aborted(journal))
+		return -EIO;
+
 	cleanup_journal_tail(journal);
  
 	/* Finally, mark the journal as really needing no recovery.
@@ -1384,7 +1400,7 @@
 	J_ASSERT(journal->j_head == journal->j_tail);
 	J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
 	spin_unlock(&journal->j_state_lock);
-	return err;
+	return 0;
 }
  
 /**
@@ -223,7 +223,7 @@
  */
 int journal_recover(journal_t *journal)
 {
-	int			err;
+	int			err, err2;
 	journal_superblock_t *	sb;
  
 	struct recovery_info	info;
@@ -261,7 +261,10 @@
 	journal->j_transaction_sequence = ++info.end_transaction;
  
 	journal_clear_revoke(journal);
-	sync_blockdev(journal->j_fs_dev);
+	err2 = sync_blockdev(journal->j_fs_dev);
+	if (!err)
+		err = err2;
+
 	return err;
 }
  
@@ -911,7 +911,7 @@
 		   (journal_t *, unsigned long, unsigned long, unsigned long);
 extern int	   journal_create     (journal_t *);
 extern int	   journal_load       (journal_t *journal);
-extern void	   journal_destroy    (journal_t *);
+extern int	   journal_destroy    (journal_t *);
 extern int	   journal_recover    (journal_t *journal);
 extern int	   journal_wipe       (journal_t *, int);
 extern int	   journal_skip_recovery	(journal_t *);
...	...	@@ -93,7 +93,8 @@
93	93	int ret = 0;
94	94	struct buffer_head *bh = jh2bh(jh);
95	95
96		- if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
	96	+ if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
	97	+ !buffer_dirty(bh) && buffer_uptodate(bh)) {
97	98	JBUFFER_TRACE(jh, "remove from checkpoint list");
98	99	ret = __journal_remove_checkpoint(jh) + 1;
99	100	jbd_unlock_bh_state(bh);
100	101
101	102
102	103
...	...	@@ -160,21 +161,25 @@
160	161	* buffers. Note that we take the buffers in the opposite ordering
161	162	* from the one in which they were submitted for IO.
162	163	*
	164	+ * Return 0 on success, and return <0 if some buffers have failed
	165	+ * to be written out.
	166	+ *
163	167	* Called with j_list_lock held.
164	168	*/
165		-static void __wait_cp_io(journal_t journal, transaction_t transaction)
	169	+static int __wait_cp_io(journal_t journal, transaction_t transaction)
166	170	{
167	171	struct journal_head *jh;
168	172	struct buffer_head *bh;
169	173	tid_t this_tid;
170	174	int released = 0;
	175	+ int ret = 0;
171	176
172	177	this_tid = transaction->t_tid;
173	178	restart:
174	179	/* Did somebody clean up the transaction in the meanwhile? */
175	180	if (journal->j_checkpoint_transactions != transaction \|\|
176	181	transaction->t_tid != this_tid)
177		- return;
	182	+ return ret;
178	183	while (!released && transaction->t_checkpoint_io_list) {
179	184	jh = transaction->t_checkpoint_io_list;
180	185	bh = jh2bh(jh);
...	...	@@ -194,6 +199,9 @@
194	199	spin_lock(&journal->j_list_lock);
195	200	goto restart;
196	201	}
	202	+ if (unlikely(!buffer_uptodate(bh)))
	203	+ ret = -EIO;
	204	+
197	205	/*
198	206	* Now in whatever state the buffer currently is, we know that
199	207	* it has been written out and so we can drop it from the list
...	...	@@ -203,6 +211,8 @@
203	211	journal_remove_journal_head(bh);
204	212	__brelse(bh);
205	213	}
	214	+
	215	+ return ret;
206	216	}
207	217
208	218	#define NR_BATCH 64
...	...	@@ -226,7 +236,8 @@
226	236	* Try to flush one buffer from the checkpoint list to disk.
227	237	*
228	238	* Return 1 if something happened which requires us to abort the current
229		- * scan of the checkpoint list.
	239	+ * scan of the checkpoint list. Return <0 if the buffer has failed to
	240	+ * be written out.
230	241	*
231	242	* Called with j_list_lock held and drops it if 1 is returned
232	243	* Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
...	...	@@ -256,6 +267,9 @@
256	267	log_wait_commit(journal, tid);
257	268	ret = 1;
258	269	} else if (!buffer_dirty(bh)) {
	270	+ ret = 1;
	271	+ if (unlikely(!buffer_uptodate(bh)))
	272	+ ret = -EIO;
259	273	J_ASSERT_JH(jh, !buffer_jbddirty(bh));
260	274	BUFFER_TRACE(bh, "remove from checkpoint");
261	275	__journal_remove_checkpoint(jh);
...	...	@@ -263,7 +277,6 @@
263	277	jbd_unlock_bh_state(bh);
264	278	journal_remove_journal_head(bh);
265	279	__brelse(bh);
266		- ret = 1;
267	280	} else {
268	281	/*
269	282	* Important: we are about to write the buffer, and
...	...	@@ -295,6 +308,7 @@
295	308	* to disk. We submit larger chunks of data at once.
296	309	*
297	310	* The journal should be locked before calling this function.
	311	+ * Called with j_checkpoint_mutex held.
298	312	*/
299	313	int log_do_checkpoint(journal_t *journal)
300	314	{
...	...	@@ -318,6 +332,7 @@
318	332	* OK, we need to start writing disk blocks. Take one transaction
319	333	* and write it.
320	334	*/
	335	+ result = 0;
321	336	spin_lock(&journal->j_list_lock);
322	337	if (!journal->j_checkpoint_transactions)
323	338	goto out;
...	...	@@ -334,7 +349,7 @@
334	349	int batch_count = 0;
335	350	struct buffer_head *bhs[NR_BATCH];
336	351	struct journal_head *jh;
337		- int retry = 0;
	352	+ int retry = 0, err;
338	353
339	354	while (!retry && transaction->t_checkpoint_list) {
340	355	struct buffer_head *bh;
...	...	@@ -347,6 +362,8 @@
347	362	break;
348	363	}
349	364	retry = __process_buffer(journal, jh, bhs,&batch_count);
	365	+ if (retry < 0 && !result)
	366	+ result = retry;
350	367	if (!retry && (need_resched() \|\|
351	368	spin_needbreak(&journal->j_list_lock))) {
352	369	spin_unlock(&journal->j_list_lock);
353	370
354	371
...	...	@@ -371,14 +388,18 @@
371	388	* Now we have cleaned up the first transaction's checkpoint
372	389	* list. Let's clean up the second one
373	390	*/
374		- __wait_cp_io(journal, transaction);
	391	+ err = __wait_cp_io(journal, transaction);
	392	+ if (!result)
	393	+ result = err;
375	394	}
376	395	out:
377	396	spin_unlock(&journal->j_list_lock);
378		- result = cleanup_journal_tail(journal);
379	397	if (result < 0)
380		- return result;
381		- return 0;
	398	+ journal_abort(journal, result);
	399	+ else
	400	+ result = cleanup_journal_tail(journal);
	401	+
	402	+ return (result < 0) ? result : 0;
382	403	}
383	404
384	405	/*
...	...	@@ -394,8 +415,9 @@
394	415	* This is the only part of the journaling code which really needs to be
395	416	* aware of transaction aborts. Checkpointing involves writing to the
396	417	* main filesystem area rather than to the journal, so it can proceed
397		- * even in abort state, but we must not update the journal superblock if
398		- * we have an abort error outstanding.
	418	+ * even in abort state, but we must not update the super block if
	419	+ * checkpointing may have failed. Otherwise, we would lose some metadata
	420	+ * buffers which should be written-back to the filesystem.
399	421	*/
400	422
401	423	int cleanup_journal_tail(journal_t *journal)
...	...	@@ -403,6 +425,9 @@
403	425	transaction_t * transaction;
404	426	tid_t first_tid;
405	427	unsigned long blocknr, freed;
	428	+
	429	+ if (is_journal_aborted(journal))
	430	+ return 1;
406	431
407	432	/* OK, work out the oldest transaction remaining in the log, and
408	433	* the log block it starts at.
...	...	@@ -1121,9 +1121,12 @@
1121	1121	*
1122	1122	* Release a journal_t structure once it is no longer in use by the
1123	1123	* journaled object.
	1124	+ * Return <0 if we couldn't clean up the journal.
1124	1125	*/
1125		-void journal_destroy(journal_t *journal)
	1126	+int journal_destroy(journal_t *journal)
1126	1127	{
	1128	+ int err = 0;
	1129	+
1127	1130	/* Wait for the commit thread to wake up and die. */
1128	1131	journal_kill_thread(journal);
1129	1132
1130	1133
...	...	@@ -1146,11 +1149,16 @@
1146	1149	J_ASSERT(journal->j_checkpoint_transactions == NULL);
1147	1150	spin_unlock(&journal->j_list_lock);
1148	1151
1149		- /* We can now mark the journal as empty. */
1150		- journal->j_tail = 0;
1151		- journal->j_tail_sequence = ++journal->j_transaction_sequence;
1152	1152	if (journal->j_sb_buffer) {
1153		- journal_update_superblock(journal, 1);
	1153	+ if (!is_journal_aborted(journal)) {
	1154	+ /* We can now mark the journal as empty. */
	1155	+ journal->j_tail = 0;
	1156	+ journal->j_tail_sequence =
	1157	+ ++journal->j_transaction_sequence;
	1158	+ journal_update_superblock(journal, 1);
	1159	+ } else {
	1160	+ err = -EIO;
	1161	+ }
1154	1162	brelse(journal->j_sb_buffer);
1155	1163	}
1156	1164
...	...	@@ -1160,6 +1168,8 @@
1160	1168	journal_destroy_revoke(journal);
1161	1169	kfree(journal->j_wbuf);
1162	1170	kfree(journal);
	1171	+
	1172	+ return err;
1163	1173	}
1164	1174
1165	1175
1166	1176
1167	1177
...	...	@@ -1359,10 +1369,16 @@
1359	1369	spin_lock(&journal->j_list_lock);
1360	1370	while (!err && journal->j_checkpoint_transactions != NULL) {
1361	1371	spin_unlock(&journal->j_list_lock);
	1372	+ mutex_lock(&journal->j_checkpoint_mutex);
1362	1373	err = log_do_checkpoint(journal);
	1374	+ mutex_unlock(&journal->j_checkpoint_mutex);
1363	1375	spin_lock(&journal->j_list_lock);
1364	1376	}
1365	1377	spin_unlock(&journal->j_list_lock);
	1378	+
	1379	+ if (is_journal_aborted(journal))
	1380	+ return -EIO;
	1381	+
1366	1382	cleanup_journal_tail(journal);
1367	1383
1368	1384	/* Finally, mark the journal as really needing no recovery.
...	...	@@ -1384,7 +1400,7 @@
1384	1400	J_ASSERT(journal->j_head == journal->j_tail);
1385	1401	J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
1386	1402	spin_unlock(&journal->j_state_lock);
1387		- return err;
	1403	+ return 0;
1388	1404	}
1389	1405
1390	1406	/**
...	...	@@ -223,7 +223,7 @@
223	223	*/
224	224	int journal_recover(journal_t *journal)
225	225	{
226		- int err;
	226	+ int err, err2;
227	227	journal_superblock_t * sb;
228	228
229	229	struct recovery_info info;
...	...	@@ -261,7 +261,10 @@
261	261	journal->j_transaction_sequence = ++info.end_transaction;
262	262
263	263	journal_clear_revoke(journal);
264		- sync_blockdev(journal->j_fs_dev);
	264	+ err2 = sync_blockdev(journal->j_fs_dev);
	265	+ if (!err)
	266	+ err = err2;
	267	+
265	268	return err;
266	269	}
267	270
...	...	@@ -911,7 +911,7 @@
911	911	(journal_t *, unsigned long, unsigned long, unsigned long);
912	912	extern int journal_create (journal_t *);
913	913	extern int journal_load (journal_t *journal);
914		-extern void journal_destroy (journal_t *);
	914	+extern int journal_destroy (journal_t *);
915	915	extern int journal_recover (journal_t *journal);
916	916	extern int journal_wipe (journal_t *, int);
917	917	extern int journal_skip_recovery (journal_t *);