vfs: Make sys_sync() use fsync_super() (version 4)

It is unnecessarily fragile to have two places (fsync_super() and do_sync()) doing a data integrity sync of the filesystem. Alter __fsync_super() to accommodate the needs of both callers and use it. So after this patch __fsync_super() is the only place where we gather all the calls needed to properly send all data on a filesystem to disk. A nice bonus is that we get complete livelock avoidance, and write_supers() is now only used for periodic writeback of superblocks. sync_blockdevs(), introduced a couple of patches ago, is gone now. [build fixes folded] Signed-off-by: Jan Kara <jack@suse.cz> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

vfs: Make sys_sync() use fsync_super() (version 4)
It is unnecessarily fragile to have two places (fsync_super() and do_sync()) doing a data integrity sync of the filesystem. Alter __fsync_super() to accommodate the needs of both callers and use it. So after this patch __fsync_super() is the only place where we gather all the calls needed to properly send all data on a filesystem to disk. A nice bonus is that we get complete livelock avoidance, and write_supers() is now only used for periodic writeback of superblocks. sync_blockdevs(), introduced a couple of patches ago, is gone now. [build fixes folded] Signed-off-by: Jan Kara <jack@suse.cz> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Jan Kara · Al Viro
1 parent 429479f031
Showing 7 changed files with 51 additions and 135 deletions Side-by-side Diff
fs/block_dev.c
fs/fs-writeback.c
fs/internal.h
fs/super.c
fs/sync.c
include/linux/fs.h
include/linux/writeback.h
@@ -176,17 +176,22 @@
 				iov, offset, nr_segs, blkdev_get_blocks, NULL);
 }
  
+int __sync_blockdev(struct block_device *bdev, int wait)
+{
+	if (!bdev)
+		return 0;
+	if (!wait)
+		return filemap_flush(bdev->bd_inode->i_mapping);
+	return filemap_write_and_wait(bdev->bd_inode->i_mapping);
+}
+
 /*
  * Write out and wait upon all the dirty data associated with a block
  * device via its mapping.  Does not take the superblock lock.
  */
 int sync_blockdev(struct block_device *bdev)
 {
-	int ret = 0;
-
-	if (bdev)
-		ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
-	return ret;
+	return __sync_blockdev(bdev, 1);
 }
 EXPORT_SYMBOL(sync_blockdev);
  
@@ -679,55 +679,6 @@
 }
  
 /**
- * sync_inodes - writes all inodes to disk
- * @wait: wait for completion
- *
- * sync_inodes() goes through each super block's dirty inode list, writes the
- * inodes out, waits on the writeout and puts the inodes back on the normal
- * list.
- *
- * This is for sys_sync().  fsync_dev() uses the same algorithm.  The subtle
- * part of the sync functions is that the blockdev "superblock" is processed
- * last.  This is because the write_inode() function of a typical fs will
- * perform no I/O, but will mark buffers in the blockdev mapping as dirty.
- * What we want to do is to perform all that dirtying first, and then write
- * back all those inode blocks via the blockdev mapping in one sweep.  So the
- * additional (somewhat redundant) sync_blockdev() calls here are to make
- * sure that really happens.  Because if we call sync_inodes_sb(wait=1) with
- * outstanding dirty inodes, the writeback goes block-at-a-time within the
- * filesystem's write_inode().  This is extremely slow.
- */
-static void __sync_inodes(int wait)
-{
-	struct super_block *sb;
-
-	spin_lock(&sb_lock);
-restart:
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		sb->s_count++;
-		spin_unlock(&sb_lock);
-		down_read(&sb->s_umount);
-		if (sb->s_root) {
-			sync_inodes_sb(sb, wait);
-			sync_blockdev(sb->s_bdev);
-		}
-		up_read(&sb->s_umount);
-		spin_lock(&sb_lock);
-		if (__put_super_and_need_restart(sb))
-			goto restart;
-	}
-	spin_unlock(&sb_lock);
-}
-
-void sync_inodes(int wait)
-{
-	__sync_inodes(0);
-
-	if (wait)
-		__sync_inodes(1);
-}
-
-/**
  * write_inode_now	-	write an inode to disk
  * @inode: inode to write to disk
  * @sync: whether the write should be synchronous or not
@@ -25,6 +25,8 @@
 	return sb == blockdev_superblock;
 }
  
+extern int __sync_blockdev(struct block_device *bdev, int wait);
+
 #else
 static inline void bdev_cache_init(void)
 {
@@ -34,6 +36,11 @@
 {
 	return 0;
 }
+
+static inline int __sync_blockdev(struct block_device *bdev, int wait)
+{
+	return 0;
+}
 #endif
  
 /*
@@ -71,13 +78,4 @@
  * file_table.c
  */
 extern void mark_files_ro(struct super_block *);
-
-/*
- * super.c
- */
-#ifdef CONFIG_BLOCK
-extern void sync_blockdevs(void);
-#else
-static inline void sync_blockdevs(void) { }
-#endif
@@ -284,23 +284,23 @@
 EXPORT_SYMBOL(unlock_super);
  
 /*
- * Write out and wait upon all dirty data associated with this
- * superblock.  Filesystem data as well as the underlying block
- * device.  Takes the superblock lock.  Requires a second blkdev
- * flush by the caller to complete the operation.
+ * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0)
+ * just dirties buffers with inodes so we have to submit IO for these buffers
+ * via __sync_blockdev(). This also speeds up the wait == 1 case since in that
+ * case write_inode() functions do sync_dirty_buffer() and thus effectively
+ * write one block at a time.
  */
-static int __fsync_super(struct super_block *sb)
+static int __fsync_super(struct super_block *sb, int wait)
 {
-	sync_inodes_sb(sb, 0);
 	vfs_dq_sync(sb);
-	sync_inodes_sb(sb, 1);
+	sync_inodes_sb(sb, wait);
 	lock_super(sb);
 	if (sb->s_dirt && sb->s_op->write_super)
 		sb->s_op->write_super(sb);
 	unlock_super(sb);
 	if (sb->s_op->sync_fs)
-		sb->s_op->sync_fs(sb, 1);
-	return sync_blockdev(sb->s_bdev);
+		sb->s_op->sync_fs(sb, wait);
+	return __sync_blockdev(sb->s_bdev, wait);
 }
  
 /*
@@ -310,7 +310,12 @@
  */
 int fsync_super(struct super_block *sb)
 {
-	return __fsync_super(sb);
+	int ret;
+
+	ret = __fsync_super(sb, 0);
+	if (ret < 0)
+		return ret;
+	return __fsync_super(sb, 1);
 }
 EXPORT_SYMBOL_GPL(fsync_super);
  
  
  
  
@@ -469,20 +474,18 @@
 }
  
 /*
- * Call the ->sync_fs super_op against all filesystems which are r/w and
- * which implement it.
+ * Sync all the data for all the filesystems (called by sys_sync() and
+ * emergency sync)
  *
  * This operation is careful to avoid the livelock which could easily happen
- * if two or more filesystems are being continuously dirtied.  s_need_sync_fs
+ * if two or more filesystems are being continuously dirtied.  s_need_sync
  * is used only here.  We set it against all filesystems and then clear it as
  * we sync them.  So redirtied filesystems are skipped.
  *
  * But if process A is currently running sync_filesystems and then process B
- * calls sync_filesystems as well, process B will set all the s_need_sync_fs
+ * calls sync_filesystems as well, process B will set all the s_need_sync
  * flags again, which will cause process A to resync everything.  Fix that with
  * a local mutex.
- *
- * (Fabian) Avoid sync_fs with clean fs & wait mode 0
  */
 void sync_filesystems(int wait)
 {
  
  
  
  
@@ -492,25 +495,23 @@
 	mutex_lock(&mutex);		/* Could be down_interruptible */
 	spin_lock(&sb_lock);
 	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (!sb->s_op->sync_fs)
-			continue;
 		if (sb->s_flags & MS_RDONLY)
 			continue;
-		sb->s_need_sync_fs = 1;
+		sb->s_need_sync = 1;
 	}
  
 restart:
 	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (!sb->s_need_sync_fs)
+		if (!sb->s_need_sync)
 			continue;
-		sb->s_need_sync_fs = 0;
+		sb->s_need_sync = 0;
 		if (sb->s_flags & MS_RDONLY)
 			continue;	/* hm.  Was remounted r/o meanwhile */
 		sb->s_count++;
 		spin_unlock(&sb_lock);
 		down_read(&sb->s_umount);
 		if (sb->s_root)
-			sb->s_op->sync_fs(sb, wait);
+			__fsync_super(sb, wait);
 		up_read(&sb->s_umount);
 		/* restart only when sb is no longer on the list */
 		spin_lock(&sb_lock);
@@ -520,33 +521,6 @@
 	spin_unlock(&sb_lock);
 	mutex_unlock(&mutex);
 }
-
-#ifdef CONFIG_BLOCK
-/*
- *  Sync all block devices underlying some superblock
- */
-void sync_blockdevs(void)
-{
-	struct super_block *sb;
-
-	spin_lock(&sb_lock);
-restart:
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (!sb->s_bdev)
-			continue;
-		sb->s_count++;
-		spin_unlock(&sb_lock);
-		down_read(&sb->s_umount);
-		if (sb->s_root)
-			sync_blockdev(sb->s_bdev);
-		up_read(&sb->s_umount);
-		spin_lock(&sb_lock);
-		if (__put_super_and_need_restart(sb))
-			goto restart;
-	}
-	spin_unlock(&sb_lock);
-}
-#endif
  
 /**
  *	get_super - get the superblock of a device
@@ -18,35 +18,24 @@
 #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
 			SYNC_FILE_RANGE_WAIT_AFTER)
  
-/*
- * sync everything.  Start out by waking pdflush, because that writes back
- * all queues in parallel.
- */
-static void do_sync(unsigned long wait)
+SYSCALL_DEFINE0(sync)
 {
-	wakeup_pdflush(0);
-	sync_inodes(0);		/* All mappings, inodes and their blockdevs */
-	vfs_dq_sync(NULL);
-	sync_inodes(wait);	/* Mappings, inodes and blockdevs, again. */
-	sync_supers();		/* Write the superblocks */
-	sync_filesystems(0);	/* Start syncing the filesystems */
-	sync_filesystems(wait);	/* Waitingly sync the filesystems */
-	sync_blockdevs();
-	if (!wait)
-		printk("Emergency Sync complete\n");
+	sync_filesystems(0);
+	sync_filesystems(1);
 	if (unlikely(laptop_mode))
 		laptop_sync_completion();
-}
-
-SYSCALL_DEFINE0(sync)
-{
-	do_sync(1);
 	return 0;
 }
  
 static void do_sync_work(struct work_struct *work)
 {
-	do_sync(0);
+	/*
+	 * Sync twice to reduce the possibility we skipped some inodes / pages
+	 * because they were temporarily locked
+	 */
+	sync_filesystems(0);
+	sync_filesystems(0);
+	printk("Emergency Sync complete\n");
 	kfree(work);
 }
  
@@ -1321,7 +1321,7 @@
 	struct rw_semaphore	s_umount;
 	struct mutex		s_lock;
 	int			s_count;
-	int			s_need_sync_fs;
+	int			s_need_sync;
 	atomic_t		s_active;
 #ifdef CONFIG_SECURITY
 	void                    *s_security;
@@ -79,7 +79,6 @@
 void writeback_inodes(struct writeback_control *wbc);
 int inode_wait(void *);
 void sync_inodes_sb(struct super_block *, int wait);
-void sync_inodes(int wait);
  
 /* writeback.h requires fs.h; it, too, is not included from here. */
 static inline void wait_on_inode(struct inode *inode)
...	...	@@ -176,17 +176,22 @@
176	176	iov, offset, nr_segs, blkdev_get_blocks, NULL);
177	177	}
178	178
	179	+int __sync_blockdev(struct block_device *bdev, int wait)
	180	+{
	181	+ if (!bdev)
	182	+ return 0;
	183	+ if (!wait)
	184	+ return filemap_flush(bdev->bd_inode->i_mapping);
	185	+ return filemap_write_and_wait(bdev->bd_inode->i_mapping);
	186	+}
	187	+
179	188	/*
180	189	* Write out and wait upon all the dirty data associated with a block
181	190	* device via its mapping. Does not take the superblock lock.
182	191	*/
183	192	int sync_blockdev(struct block_device *bdev)
184	193	{
185		- int ret = 0;
186		-
187		- if (bdev)
188		- ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
189		- return ret;
	194	+ return __sync_blockdev(bdev, 1);
190	195	}
191	196	EXPORT_SYMBOL(sync_blockdev);
192	197
...	...	@@ -679,55 +679,6 @@
679	679	}
680	680
681	681	/**
682		- * sync_inodes - writes all inodes to disk
683		- * @wait: wait for completion
684		- *
685		- * sync_inodes() goes through each super block's dirty inode list, writes the
686		- * inodes out, waits on the writeout and puts the inodes back on the normal
687		- * list.
688		- *
689		- * This is for sys_sync(). fsync_dev() uses the same algorithm. The subtle
690		- * part of the sync functions is that the blockdev "superblock" is processed
691		- * last. This is because the write_inode() function of a typical fs will
692		- * perform no I/O, but will mark buffers in the blockdev mapping as dirty.
693		- * What we want to do is to perform all that dirtying first, and then write
694		- * back all those inode blocks via the blockdev mapping in one sweep. So the
695		- * additional (somewhat redundant) sync_blockdev() calls here are to make
696		- * sure that really happens. Because if we call sync_inodes_sb(wait=1) with
697		- * outstanding dirty inodes, the writeback goes block-at-a-time within the
698		- * filesystem's write_inode(). This is extremely slow.
699		- */
700		-static void __sync_inodes(int wait)
701		-{
702		- struct super_block *sb;
703		-
704		- spin_lock(&sb_lock);
705		-restart:
706		- list_for_each_entry(sb, &super_blocks, s_list) {
707		- sb->s_count++;
708		- spin_unlock(&sb_lock);
709		- down_read(&sb->s_umount);
710		- if (sb->s_root) {
711		- sync_inodes_sb(sb, wait);
712		- sync_blockdev(sb->s_bdev);
713		- }
714		- up_read(&sb->s_umount);
715		- spin_lock(&sb_lock);
716		- if (__put_super_and_need_restart(sb))
717		- goto restart;
718		- }
719		- spin_unlock(&sb_lock);
720		-}
721		-
722		-void sync_inodes(int wait)
723		-{
724		- __sync_inodes(0);
725		-
726		- if (wait)
727		- __sync_inodes(1);
728		-}
729		-
730		-/**
731	682	* write_inode_now - write an inode to disk
732	683	* @inode: inode to write to disk
733	684	* @sync: whether the write should be synchronous or not
...	...	@@ -25,6 +25,8 @@
25	25	return sb == blockdev_superblock;
26	26	}
27	27
	28	+extern int __sync_blockdev(struct block_device *bdev, int wait);
	29	+
28	30	#else
29	31	static inline void bdev_cache_init(void)
30	32	{
...	...	@@ -34,6 +36,11 @@
34	36	{
35	37	return 0;
36	38	}
	39	+
	40	+static inline int __sync_blockdev(struct block_device *bdev, int wait)
	41	+{
	42	+ return 0;
	43	+}
37	44	#endif
38	45
39	46	/*
...	...	@@ -71,13 +78,4 @@
71	78	* file_table.c
72	79	*/
73	80	extern void mark_files_ro(struct super_block *);
74		-
75		-/*
76		- * super.c
77		- */
78		-#ifdef CONFIG_BLOCK
79		-extern void sync_blockdevs(void);
80		-#else
81		-static inline void sync_blockdevs(void) { }
82		-#endif
...	...	@@ -284,23 +284,23 @@
284	284	EXPORT_SYMBOL(unlock_super);
285	285
286	286	/*
287		- * Write out and wait upon all dirty data associated with this
288		- * superblock. Filesystem data as well as the underlying block
289		- * device. Takes the superblock lock. Requires a second blkdev
290		- * flush by the caller to complete the operation.
	287	+ * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0)
	288	+ * just dirties buffers with inodes so we have to submit IO for these buffers
	289	+ * via __sync_blockdev(). This also speeds up the wait == 1 case since in that
	290	+ * case write_inode() functions do sync_dirty_buffer() and thus effectively
	291	+ * write one block at a time.
291	292	*/
292		-static int __fsync_super(struct super_block *sb)
	293	+static int __fsync_super(struct super_block *sb, int wait)
293	294	{
294		- sync_inodes_sb(sb, 0);
295	295	vfs_dq_sync(sb);
296		- sync_inodes_sb(sb, 1);
	296	+ sync_inodes_sb(sb, wait);
297	297	lock_super(sb);
298	298	if (sb->s_dirt && sb->s_op->write_super)
299	299	sb->s_op->write_super(sb);
300	300	unlock_super(sb);
301	301	if (sb->s_op->sync_fs)
302		- sb->s_op->sync_fs(sb, 1);
303		- return sync_blockdev(sb->s_bdev);
	302	+ sb->s_op->sync_fs(sb, wait);
	303	+ return __sync_blockdev(sb->s_bdev, wait);
304	304	}
305	305
306	306	/*
...	...	@@ -310,7 +310,12 @@
310	310	*/
311	311	int fsync_super(struct super_block *sb)
312	312	{
313		- return __fsync_super(sb);
	313	+ int ret;
	314	+
	315	+ ret = __fsync_super(sb, 0);
	316	+ if (ret < 0)
	317	+ return ret;
	318	+ return __fsync_super(sb, 1);
314	319	}
315	320	EXPORT_SYMBOL_GPL(fsync_super);
316	321
317	322
318	323
319	324
...	...	@@ -469,20 +474,18 @@
469	474	}
470	475
471	476	/*
472		- * Call the ->sync_fs super_op against all filesystems which are r/w and
473		- * which implement it.
	477	+ * Sync all the data for all the filesystems (called by sys_sync() and
	478	+ * emergency sync)
474	479	*
475	480	* This operation is careful to avoid the livelock which could easily happen
476		- * if two or more filesystems are being continuously dirtied. s_need_sync_fs
	481	+ * if two or more filesystems are being continuously dirtied. s_need_sync
477	482	* is used only here. We set it against all filesystems and then clear it as
478	483	* we sync them. So redirtied filesystems are skipped.
479	484	*
480	485	* But if process A is currently running sync_filesystems and then process B
481		- * calls sync_filesystems as well, process B will set all the s_need_sync_fs
	486	+ * calls sync_filesystems as well, process B will set all the s_need_sync
482	487	* flags again, which will cause process A to resync everything. Fix that with
483	488	* a local mutex.
484		- *
485		- * (Fabian) Avoid sync_fs with clean fs & wait mode 0
486	489	*/
487	490	void sync_filesystems(int wait)
488	491	{
489	492
490	493
491	494
492	495
...	...	@@ -492,25 +495,23 @@
492	495	mutex_lock(&mutex); /* Could be down_interruptible */
493	496	spin_lock(&sb_lock);
494	497	list_for_each_entry(sb, &super_blocks, s_list) {
495		- if (!sb->s_op->sync_fs)
496		- continue;
497	498	if (sb->s_flags & MS_RDONLY)
498	499	continue;
499		- sb->s_need_sync_fs = 1;
	500	+ sb->s_need_sync = 1;
500	501	}
501	502
502	503	restart:
503	504	list_for_each_entry(sb, &super_blocks, s_list) {
504		- if (!sb->s_need_sync_fs)
	505	+ if (!sb->s_need_sync)
505	506	continue;
506		- sb->s_need_sync_fs = 0;
	507	+ sb->s_need_sync = 0;
507	508	if (sb->s_flags & MS_RDONLY)
508	509	continue; /* hm. Was remounted r/o meanwhile */
509	510	sb->s_count++;
510	511	spin_unlock(&sb_lock);
511	512	down_read(&sb->s_umount);
512	513	if (sb->s_root)
513		- sb->s_op->sync_fs(sb, wait);
	514	+ __fsync_super(sb, wait);
514	515	up_read(&sb->s_umount);
515	516	/* restart only when sb is no longer on the list */
516	517	spin_lock(&sb_lock);
...	...	@@ -520,33 +521,6 @@
520	521	spin_unlock(&sb_lock);
521	522	mutex_unlock(&mutex);
522	523	}
523		-
524		-#ifdef CONFIG_BLOCK
525		-/*
526		- * Sync all block devices underlying some superblock
527		- */
528		-void sync_blockdevs(void)
529		-{
530		- struct super_block *sb;
531		-
532		- spin_lock(&sb_lock);
533		-restart:
534		- list_for_each_entry(sb, &super_blocks, s_list) {
535		- if (!sb->s_bdev)
536		- continue;
537		- sb->s_count++;
538		- spin_unlock(&sb_lock);
539		- down_read(&sb->s_umount);
540		- if (sb->s_root)
541		- sync_blockdev(sb->s_bdev);
542		- up_read(&sb->s_umount);
543		- spin_lock(&sb_lock);
544		- if (__put_super_and_need_restart(sb))
545		- goto restart;
546		- }
547		- spin_unlock(&sb_lock);
548		-}
549		-#endif
550	524
551	525	/**
552	526	* get_super - get the superblock of a device
...	...	@@ -18,35 +18,24 @@
18	18	#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
19	19	SYNC_FILE_RANGE_WAIT_AFTER)
20	20
21		-/*
22		- * sync everything. Start out by waking pdflush, because that writes back
23		- * all queues in parallel.
24		- */
25		-static void do_sync(unsigned long wait)
	21	+SYSCALL_DEFINE0(sync)
26	22	{
27		- wakeup_pdflush(0);
28		- sync_inodes(0); /* All mappings, inodes and their blockdevs */
29		- vfs_dq_sync(NULL);
30		- sync_inodes(wait); /* Mappings, inodes and blockdevs, again. */
31		- sync_supers(); /* Write the superblocks */
32		- sync_filesystems(0); /* Start syncing the filesystems */
33		- sync_filesystems(wait); /* Waitingly sync the filesystems */
34		- sync_blockdevs();
35		- if (!wait)
36		- printk("Emergency Sync complete\n");
	23	+ sync_filesystems(0);
	24	+ sync_filesystems(1);
37	25	if (unlikely(laptop_mode))
38	26	laptop_sync_completion();
39		-}
40		-
41		-SYSCALL_DEFINE0(sync)
42		-{
43		- do_sync(1);
44	27	return 0;
45	28	}
46	29
47	30	static void do_sync_work(struct work_struct *work)
48	31	{
49		- do_sync(0);
	32	+ /*
	33	+ * Sync twice to reduce the possibility we skipped some inodes / pages
	34	+ * because they were temporarily locked
	35	+ */
	36	+ sync_filesystems(0);
	37	+ sync_filesystems(0);
	38	+ printk("Emergency Sync complete\n");
50	39	kfree(work);
51	40	}
52	41
...	...	@@ -1321,7 +1321,7 @@
1321	1321	struct rw_semaphore s_umount;
1322	1322	struct mutex s_lock;
1323	1323	int s_count;
1324		- int s_need_sync_fs;
	1324	+ int s_need_sync;
1325	1325	atomic_t s_active;
1326	1326	#ifdef CONFIG_SECURITY
1327	1327	void *s_security;
...	...	@@ -79,7 +79,6 @@
79	79	void writeback_inodes(struct writeback_control *wbc);
80	80	int inode_wait(void *);
81	81	void sync_inodes_sb(struct super_block *, int wait);
82		-void sync_inodes(int wait);
83	82
84	83	/* writeback.h requires fs.h; it, too, is not included from here. */
85	84	static inline void wait_on_inode(struct inode *inode)