Commit 5cee5815d1564bbbd505fea86f4550f1efdb5cd0

Authored by Jan Kara
Committed by Al Viro
1 parent 429479f031

vfs: Make sys_sync() use fsync_super() (version 4)

It is unnecessarily fragile to have two places (fsync_super() and do_sync())
doing a data integrity sync of the filesystem. Alter __fsync_super() to
accommodate the needs of both callers and use it. After this patch,
__fsync_super() is the only place where we gather all the calls needed to
properly send all data on a filesystem to disk.

A nice bonus is that we get complete livelock avoidance, and sync_supers()
is now only used for periodic writeback of superblocks.

sync_blockdevs() introduced a couple of patches ago is gone now.

[build fixes folded]

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

Showing 7 changed files with 51 additions and 135 deletions Side-by-side Diff

... ... @@ -176,17 +176,22 @@
176 176 iov, offset, nr_segs, blkdev_get_blocks, NULL);
177 177 }
178 178  
  179 +int __sync_blockdev(struct block_device *bdev, int wait)
  180 +{
  181 + if (!bdev)
  182 + return 0;
  183 + if (!wait)
  184 + return filemap_flush(bdev->bd_inode->i_mapping);
  185 + return filemap_write_and_wait(bdev->bd_inode->i_mapping);
  186 +}
  187 +
179 188 /*
180 189 * Write out and wait upon all the dirty data associated with a block
181 190 * device via its mapping. Does not take the superblock lock.
182 191 */
183 192 int sync_blockdev(struct block_device *bdev)
184 193 {
185   - int ret = 0;
186   -
187   - if (bdev)
188   - ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
189   - return ret;
  194 + return __sync_blockdev(bdev, 1);
190 195 }
191 196 EXPORT_SYMBOL(sync_blockdev);
192 197  
... ... @@ -679,55 +679,6 @@
679 679 }
680 680  
681 681 /**
682   - * sync_inodes - writes all inodes to disk
683   - * @wait: wait for completion
684   - *
685   - * sync_inodes() goes through each super block's dirty inode list, writes the
686   - * inodes out, waits on the writeout and puts the inodes back on the normal
687   - * list.
688   - *
689   - * This is for sys_sync(). fsync_dev() uses the same algorithm. The subtle
690   - * part of the sync functions is that the blockdev "superblock" is processed
691   - * last. This is because the write_inode() function of a typical fs will
692   - * perform no I/O, but will mark buffers in the blockdev mapping as dirty.
693   - * What we want to do is to perform all that dirtying first, and then write
694   - * back all those inode blocks via the blockdev mapping in one sweep. So the
695   - * additional (somewhat redundant) sync_blockdev() calls here are to make
696   - * sure that really happens. Because if we call sync_inodes_sb(wait=1) with
697   - * outstanding dirty inodes, the writeback goes block-at-a-time within the
698   - * filesystem's write_inode(). This is extremely slow.
699   - */
700   -static void __sync_inodes(int wait)
701   -{
702   - struct super_block *sb;
703   -
704   - spin_lock(&sb_lock);
705   -restart:
706   - list_for_each_entry(sb, &super_blocks, s_list) {
707   - sb->s_count++;
708   - spin_unlock(&sb_lock);
709   - down_read(&sb->s_umount);
710   - if (sb->s_root) {
711   - sync_inodes_sb(sb, wait);
712   - sync_blockdev(sb->s_bdev);
713   - }
714   - up_read(&sb->s_umount);
715   - spin_lock(&sb_lock);
716   - if (__put_super_and_need_restart(sb))
717   - goto restart;
718   - }
719   - spin_unlock(&sb_lock);
720   -}
721   -
722   -void sync_inodes(int wait)
723   -{
724   - __sync_inodes(0);
725   -
726   - if (wait)
727   - __sync_inodes(1);
728   -}
729   -
730   -/**
731 682 * write_inode_now - write an inode to disk
732 683 * @inode: inode to write to disk
733 684 * @sync: whether the write should be synchronous or not
... ... @@ -25,6 +25,8 @@
25 25 return sb == blockdev_superblock;
26 26 }
27 27  
  28 +extern int __sync_blockdev(struct block_device *bdev, int wait);
  29 +
28 30 #else
29 31 static inline void bdev_cache_init(void)
30 32 {
... ... @@ -34,6 +36,11 @@
34 36 {
35 37 return 0;
36 38 }
  39 +
  40 +static inline int __sync_blockdev(struct block_device *bdev, int wait)
  41 +{
  42 + return 0;
  43 +}
37 44 #endif
38 45  
39 46 /*
... ... @@ -71,13 +78,4 @@
71 78 * file_table.c
72 79 */
73 80 extern void mark_files_ro(struct super_block *);
74   -
75   -/*
76   - * super.c
77   - */
78   -#ifdef CONFIG_BLOCK
79   -extern void sync_blockdevs(void);
80   -#else
81   -static inline void sync_blockdevs(void) { }
82   -#endif
... ... @@ -284,23 +284,23 @@
284 284 EXPORT_SYMBOL(unlock_super);
285 285  
286 286 /*
287   - * Write out and wait upon all dirty data associated with this
288   - * superblock. Filesystem data as well as the underlying block
289   - * device. Takes the superblock lock. Requires a second blkdev
290   - * flush by the caller to complete the operation.
  287 + * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0)
  288 + * just dirties buffers with inodes so we have to submit IO for these buffers
  289 + * via __sync_blockdev(). This also speeds up the wait == 1 case since in that
  290 + * case write_inode() functions do sync_dirty_buffer() and thus effectively
  291 + * write one block at a time.
291 292 */
292   -static int __fsync_super(struct super_block *sb)
  293 +static int __fsync_super(struct super_block *sb, int wait)
293 294 {
294   - sync_inodes_sb(sb, 0);
295 295 vfs_dq_sync(sb);
296   - sync_inodes_sb(sb, 1);
  296 + sync_inodes_sb(sb, wait);
297 297 lock_super(sb);
298 298 if (sb->s_dirt && sb->s_op->write_super)
299 299 sb->s_op->write_super(sb);
300 300 unlock_super(sb);
301 301 if (sb->s_op->sync_fs)
302   - sb->s_op->sync_fs(sb, 1);
303   - return sync_blockdev(sb->s_bdev);
  302 + sb->s_op->sync_fs(sb, wait);
  303 + return __sync_blockdev(sb->s_bdev, wait);
304 304 }
305 305  
306 306 /*
... ... @@ -310,7 +310,12 @@
310 310 */
311 311 int fsync_super(struct super_block *sb)
312 312 {
313   - return __fsync_super(sb);
  313 + int ret;
  314 +
  315 + ret = __fsync_super(sb, 0);
  316 + if (ret < 0)
  317 + return ret;
  318 + return __fsync_super(sb, 1);
314 319 }
315 320 EXPORT_SYMBOL_GPL(fsync_super);
316 321  
317 322  
318 323  
319 324  
... ... @@ -469,20 +474,18 @@
469 474 }
470 475  
471 476 /*
472   - * Call the ->sync_fs super_op against all filesystems which are r/w and
473   - * which implement it.
  477 + * Sync all the data for all the filesystems (called by sys_sync() and
  478 + * emergency sync)
474 479 *
475 480 * This operation is careful to avoid the livelock which could easily happen
476   - * if two or more filesystems are being continuously dirtied. s_need_sync_fs
  481 + * if two or more filesystems are being continuously dirtied. s_need_sync
477 482 * is used only here. We set it against all filesystems and then clear it as
478 483 * we sync them. So redirtied filesystems are skipped.
479 484 *
480 485 * But if process A is currently running sync_filesystems and then process B
481   - * calls sync_filesystems as well, process B will set all the s_need_sync_fs
  486 + * calls sync_filesystems as well, process B will set all the s_need_sync
482 487 * flags again, which will cause process A to resync everything. Fix that with
483 488 * a local mutex.
484   - *
485   - * (Fabian) Avoid sync_fs with clean fs & wait mode 0
486 489 */
487 490 void sync_filesystems(int wait)
488 491 {
489 492  
490 493  
491 494  
492 495  
... ... @@ -492,25 +495,23 @@
492 495 mutex_lock(&mutex); /* Could be down_interruptible */
493 496 spin_lock(&sb_lock);
494 497 list_for_each_entry(sb, &super_blocks, s_list) {
495   - if (!sb->s_op->sync_fs)
496   - continue;
497 498 if (sb->s_flags & MS_RDONLY)
498 499 continue;
499   - sb->s_need_sync_fs = 1;
  500 + sb->s_need_sync = 1;
500 501 }
501 502  
502 503 restart:
503 504 list_for_each_entry(sb, &super_blocks, s_list) {
504   - if (!sb->s_need_sync_fs)
  505 + if (!sb->s_need_sync)
505 506 continue;
506   - sb->s_need_sync_fs = 0;
  507 + sb->s_need_sync = 0;
507 508 if (sb->s_flags & MS_RDONLY)
508 509 continue; /* hm. Was remounted r/o meanwhile */
509 510 sb->s_count++;
510 511 spin_unlock(&sb_lock);
511 512 down_read(&sb->s_umount);
512 513 if (sb->s_root)
513   - sb->s_op->sync_fs(sb, wait);
  514 + __fsync_super(sb, wait);
514 515 up_read(&sb->s_umount);
515 516 /* restart only when sb is no longer on the list */
516 517 spin_lock(&sb_lock);
... ... @@ -520,33 +521,6 @@
520 521 spin_unlock(&sb_lock);
521 522 mutex_unlock(&mutex);
522 523 }
523   -
524   -#ifdef CONFIG_BLOCK
525   -/*
526   - * Sync all block devices underlying some superblock
527   - */
528   -void sync_blockdevs(void)
529   -{
530   - struct super_block *sb;
531   -
532   - spin_lock(&sb_lock);
533   -restart:
534   - list_for_each_entry(sb, &super_blocks, s_list) {
535   - if (!sb->s_bdev)
536   - continue;
537   - sb->s_count++;
538   - spin_unlock(&sb_lock);
539   - down_read(&sb->s_umount);
540   - if (sb->s_root)
541   - sync_blockdev(sb->s_bdev);
542   - up_read(&sb->s_umount);
543   - spin_lock(&sb_lock);
544   - if (__put_super_and_need_restart(sb))
545   - goto restart;
546   - }
547   - spin_unlock(&sb_lock);
548   -}
549   -#endif
550 524  
551 525 /**
552 526 * get_super - get the superblock of a device
... ... @@ -18,35 +18,24 @@
18 18 #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
19 19 SYNC_FILE_RANGE_WAIT_AFTER)
20 20  
21   -/*
22   - * sync everything. Start out by waking pdflush, because that writes back
23   - * all queues in parallel.
24   - */
25   -static void do_sync(unsigned long wait)
  21 +SYSCALL_DEFINE0(sync)
26 22 {
27   - wakeup_pdflush(0);
28   - sync_inodes(0); /* All mappings, inodes and their blockdevs */
29   - vfs_dq_sync(NULL);
30   - sync_inodes(wait); /* Mappings, inodes and blockdevs, again. */
31   - sync_supers(); /* Write the superblocks */
32   - sync_filesystems(0); /* Start syncing the filesystems */
33   - sync_filesystems(wait); /* Waitingly sync the filesystems */
34   - sync_blockdevs();
35   - if (!wait)
36   - printk("Emergency Sync complete\n");
  23 + sync_filesystems(0);
  24 + sync_filesystems(1);
37 25 if (unlikely(laptop_mode))
38 26 laptop_sync_completion();
39   -}
40   -
41   -SYSCALL_DEFINE0(sync)
42   -{
43   - do_sync(1);
44 27 return 0;
45 28 }
46 29  
47 30 static void do_sync_work(struct work_struct *work)
48 31 {
49   - do_sync(0);
  32 + /*
  33 + * Sync twice to reduce the possibility we skipped some inodes / pages
  34 + * because they were temporarily locked
  35 + */
  36 + sync_filesystems(0);
  37 + sync_filesystems(0);
  38 + printk("Emergency Sync complete\n");
50 39 kfree(work);
51 40 }
52 41  
... ... @@ -1321,7 +1321,7 @@
1321 1321 struct rw_semaphore s_umount;
1322 1322 struct mutex s_lock;
1323 1323 int s_count;
1324   - int s_need_sync_fs;
  1324 + int s_need_sync;
1325 1325 atomic_t s_active;
1326 1326 #ifdef CONFIG_SECURITY
1327 1327 void *s_security;
include/linux/writeback.h
... ... @@ -79,7 +79,6 @@
79 79 void writeback_inodes(struct writeback_control *wbc);
80 80 int inode_wait(void *);
81 81 void sync_inodes_sb(struct super_block *, int wait);
82   -void sync_inodes(int wait);
83 82  
84 83 /* writeback.h requires fs.h; it, too, is not included from here. */
85 84 static inline void wait_on_inode(struct inode *inode)