Commit a66979abad090b2765a6c6790c9fdeab996833f2

Authored by Dave Chinner
Committed by Al Viro
Parent: 55fa6091d8

fs: move i_wb_list out from under inode_lock

Protect the inode writeback list with a new global lock,
inode_wb_list_lock, and use it to protect the list manipulations and
traversals. This lock replaces inode_lock for the writeback list:
inodes on the list can be validity-checked while holding inode->i_lock,
so inode_lock is no longer needed to protect the list.
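
For illustration, a minimal sketch of the locking pattern this change
establishes (hypothetical helper, not part of the patch; it mirrors
bdev_inode_switch_bdi() in the diff below): inode_wb_list_lock is the
outer lock and inode->i_lock nests inside it, per the lock-ordering
comment added to fs/inode.c.

	/* Hypothetical example only -- not in this commit. */
	static void example_requeue_if_dirty(struct inode *inode,
					     struct bdi_writeback *wb)
	{
		spin_lock(&inode_wb_list_lock);	/* outer: protects i_wb_list */
		spin_lock(&inode->i_lock);	/* inner: protects i_state */
		if (inode->i_state & I_DIRTY)
			list_move(&inode->i_wb_list, &wb->b_dirty);
		spin_unlock(&inode->i_lock);
		spin_unlock(&inode_wb_list_lock);
	}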

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

Showing 8 changed files with 70 additions and 48 deletions

fs/block_dev.c
@@ -55,13 +55,13 @@
 static void bdev_inode_switch_bdi(struct inode *inode,
 			struct backing_dev_info *dst)
 {
-	spin_lock(&inode_lock);
+	spin_lock(&inode_wb_list_lock);
 	spin_lock(&inode->i_lock);
 	inode->i_data.backing_dev_info = dst;
 	if (inode->i_state & I_DIRTY)
 		list_move(&inode->i_wb_list, &dst->wb.b_dirty);
 	spin_unlock(&inode->i_lock);
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_wb_list_lock);
 }

 static sector_t max_block(struct block_device *bdev)
fs/fs-writeback.c
@@ -176,6 +176,17 @@
 }

 /*
+ * Remove the inode from the writeback list it is on.
+ */
+void inode_wb_list_del(struct inode *inode)
+{
+	spin_lock(&inode_wb_list_lock);
+	list_del_init(&inode->i_wb_list);
+	spin_unlock(&inode_wb_list_lock);
+}
+
+
+/*
  * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
  * furthest end of its superblock's dirty-inode list.
  *
@@ -188,6 +199,7 @@
 {
 	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;

+	assert_spin_locked(&inode_wb_list_lock);
 	if (!list_empty(&wb->b_dirty)) {
 		struct inode *tail;

@@ -205,14 +217,17 @@
 {
 	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;

+	assert_spin_locked(&inode_wb_list_lock);
 	list_move(&inode->i_wb_list, &wb->b_more_io);
 }

 static void inode_sync_complete(struct inode *inode)
 {
 	/*
-	 * Prevent speculative execution through spin_unlock(&inode_lock);
+	 * Prevent speculative execution through
+	 * spin_unlock(&inode_wb_list_lock);
 	 */
+
 	smp_mb();
 	wake_up_bit(&inode->i_state, __I_SYNC);
 }
@@ -286,6 +301,7 @@
  */
 static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
 {
+	assert_spin_locked(&inode_wb_list_lock);
 	list_splice_init(&wb->b_more_io, &wb->b_io);
 	move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
 }
@@ -308,25 +324,23 @@
 	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
 	while (inode->i_state & I_SYNC) {
 		spin_unlock(&inode->i_lock);
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode_wb_list_lock);
 		__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
-		spin_lock(&inode_lock);
+		spin_lock(&inode_wb_list_lock);
 		spin_lock(&inode->i_lock);
 	}
 }

 /*
- * Write out an inode's dirty pages.  Called under inode_lock.  Either the
- * caller has ref on the inode (either via __iget or via syscall against an fd)
- * or the inode has I_WILL_FREE set (via generic_forget_inode)
+ * Write out an inode's dirty pages.  Called under inode_wb_list_lock.  Either
+ * the caller has an active reference on the inode or the inode has I_WILL_FREE
+ * set.
  *
  * If `wait' is set, wait on the writeout.
  *
  * The whole writeout design is quite complex and fragile.  We want to avoid
  * starvation of particular inodes when others are being redirtied, prevent
  * livelocks, etc.
- *
- * Called under inode_lock.
  */
 static int
 writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
@@ -368,7 +382,7 @@
 	inode->i_state |= I_SYNC;
 	inode->i_state &= ~I_DIRTY_PAGES;
 	spin_unlock(&inode->i_lock);
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_wb_list_lock);

 	ret = do_writepages(mapping, wbc);

@@ -388,12 +402,10 @@
 	 * due to delalloc, clear dirty metadata flags right before
 	 * write_inode()
 	 */
-	spin_lock(&inode_lock);
 	spin_lock(&inode->i_lock);
 	dirty = inode->i_state & I_DIRTY;
 	inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
 	spin_unlock(&inode->i_lock);
-	spin_unlock(&inode_lock);
 	/* Don't write the inode if only I_DIRTY_PAGES was set */
 	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
 		int err = write_inode(inode, wbc);
@@ -401,7 +413,7 @@
 			ret = err;
 	}

-	spin_lock(&inode_lock);
+	spin_lock(&inode_wb_list_lock);
 	spin_lock(&inode->i_lock);
 	inode->i_state &= ~I_SYNC;
 	if (!(inode->i_state & I_FREEING)) {
@@ -543,10 +555,10 @@
 			 */
 			redirty_tail(inode);
 		}
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode_wb_list_lock);
 		iput(inode);
 		cond_resched();
-		spin_lock(&inode_lock);
+		spin_lock(&inode_wb_list_lock);
 		if (wbc->nr_to_write <= 0) {
 			wbc->more_io = 1;
 			return 1;
@@ -565,7 +577,7 @@

 	if (!wbc->wb_start)
 		wbc->wb_start = jiffies; /* livelock avoidance */
-	spin_lock(&inode_lock);
+	spin_lock(&inode_wb_list_lock);
 	if (!wbc->for_kupdate || list_empty(&wb->b_io))
 		queue_io(wb, wbc->older_than_this);

@@ -583,7 +595,7 @@
 		if (ret)
 			break;
 	}
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_wb_list_lock);
 	/* Leave any unwritten inodes on b_io */
 }

@@ -592,11 +604,11 @@
 {
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));

-	spin_lock(&inode_lock);
+	spin_lock(&inode_wb_list_lock);
 	if (!wbc->for_kupdate || list_empty(&wb->b_io))
 		queue_io(wb, wbc->older_than_this);
 	writeback_sb_inodes(sb, wb, wbc, true);
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_wb_list_lock);
 }

 /*
@@ -735,7 +747,7 @@
 		 * become available for writeback. Otherwise
 		 * we'll just busyloop.
 		 */
-		spin_lock(&inode_lock);
+		spin_lock(&inode_wb_list_lock);
 		if (!list_empty(&wb->b_more_io)) {
 			inode = wb_inode(wb->b_more_io.prev);
 			trace_wbc_writeback_wait(&wbc, wb->bdi);
@@ -743,7 +755,7 @@
 			inode_wait_for_writeback(inode);
 			spin_unlock(&inode->i_lock);
 		}
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode_wb_list_lock);
 	}

 	return wrote;
@@ -1009,7 +1021,6 @@
 {
 	struct super_block *sb = inode->i_sb;
 	struct backing_dev_info *bdi = NULL;
-	bool wakeup_bdi = false;

 	/*
 	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
@@ -1033,7 +1044,6 @@
 	if (unlikely(block_dump))
 		block_dump___mark_inode_dirty(inode);

-	spin_lock(&inode_lock);
 	spin_lock(&inode->i_lock);
 	if ((inode->i_state & flags) != flags) {
 		const int was_dirty = inode->i_state & I_DIRTY;
@@ -1059,12 +1069,12 @@
 		if (inode->i_state & I_FREEING)
 			goto out_unlock_inode;

-		spin_unlock(&inode->i_lock);
 		/*
 		 * If the inode was already on b_dirty/b_io/b_more_io, don't
 		 * reposition it (that would break b_dirty time-ordering).
 		 */
 		if (!was_dirty) {
+			bool wakeup_bdi = false;
 			bdi = inode_to_bdi(inode);

 			if (bdi_cap_writeback_dirty(bdi)) {
@@ -1081,18 +1091,20 @@
 				wakeup_bdi = true;
 			}

+			spin_unlock(&inode->i_lock);
+			spin_lock(&inode_wb_list_lock);
 			inode->dirtied_when = jiffies;
 			list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
+			spin_unlock(&inode_wb_list_lock);
+
+			if (wakeup_bdi)
+				bdi_wakeup_thread_delayed(bdi);
+			return;
 		}
-		goto out;
 	}
 out_unlock_inode:
 	spin_unlock(&inode->i_lock);
-out:
-	spin_unlock(&inode_lock);

-	if (wakeup_bdi)
-		bdi_wakeup_thread_delayed(bdi);
 }
 EXPORT_SYMBOL(__mark_inode_dirty);

@@ -1296,9 +1308,9 @@
 		wbc.nr_to_write = 0;

 	might_sleep();
-	spin_lock(&inode_lock);
+	spin_lock(&inode_wb_list_lock);
 	ret = writeback_single_inode(inode, &wbc);
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_wb_list_lock);
 	if (sync)
 		inode_sync_wait(inode);
 	return ret;
@@ -1320,9 +1332,9 @@
 {
 	int ret;

-	spin_lock(&inode_lock);
+	spin_lock(&inode_wb_list_lock);
 	ret = writeback_single_inode(inode, wbc);
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_wb_list_lock);
 	return ret;
 }
 EXPORT_SYMBOL(sync_inode);
fs/inode.c
@@ -26,6 +26,7 @@
 #include <linux/posix_acl.h>
 #include <linux/ima.h>
 #include <linux/cred.h>
+#include "internal.h"

 /*
  * inode locking rules.
@@ -36,6 +37,8 @@
  *   inode_lru, inode->i_lru
  * inode_sb_list_lock protects:
  *   sb->s_inodes, inode->i_sb_list
+ * inode_wb_list_lock protects:
+ *   bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
  *
  * Lock ordering:
  * inode_lock
@@ -44,6 +47,9 @@
  * inode_sb_list_lock
  *   inode->i_lock
  *     inode_lru_lock
+ *
+ * inode_wb_list_lock
+ *   inode->i_lock
  */

 /*
@@ -105,6 +111,7 @@
 DEFINE_SPINLOCK(inode_lock);

 __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);

 /*
  * iprune_sem provides exclusion between the icache shrinking and the
@@ -483,10 +490,7 @@
 	BUG_ON(!(inode->i_state & I_FREEING));
 	BUG_ON(!list_empty(&inode->i_lru));

-	spin_lock(&inode_lock);
-	list_del_init(&inode->i_wb_list);
-	spin_unlock(&inode_lock);
-
+	inode_wb_list_del(inode);
 	inode_sb_list_del(inode);

 	if (op->evict_inode) {
fs/internal.h
@@ -127,6 +127,11 @@
  */
 extern spinlock_t inode_sb_list_lock;

+/*
+ * fs-writeback.c
+ */
+extern void inode_wb_list_del(struct inode *inode);
+
 extern int get_nr_dirty_inodes(void);
 extern void evict_inodes(struct super_block *);
 extern int invalidate_inodes(struct super_block *, bool);
include/linux/writeback.h
@@ -10,6 +10,7 @@
 struct backing_dev_info;

 extern spinlock_t inode_lock;
+extern spinlock_t inode_wb_list_lock;

 /*
  * fs/fs-writeback.c
mm/backing-dev.c
@@ -73,14 +73,14 @@
 	struct inode *inode;

 	nr_wb = nr_dirty = nr_io = nr_more_io = 0;
-	spin_lock(&inode_lock);
+	spin_lock(&inode_wb_list_lock);
 	list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
 		nr_dirty++;
 	list_for_each_entry(inode, &wb->b_io, i_wb_list)
 		nr_io++;
 	list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
 		nr_more_io++;
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_wb_list_lock);

 	global_dirty_limits(&background_thresh, &dirty_thresh);
 	bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
@@ -682,11 +682,11 @@
 	if (bdi_has_dirty_io(bdi)) {
 		struct bdi_writeback *dst = &default_backing_dev_info.wb;

-		spin_lock(&inode_lock);
+		spin_lock(&inode_wb_list_lock);
 		list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
 		list_splice(&bdi->wb.b_io, &dst->b_io);
 		list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode_wb_list_lock);
 	}

 	bdi_unregister(bdi);
mm/filemap.c
@@ -80,8 +80,8 @@
  *  ->i_mutex
  *    ->i_alloc_sem           (various)
  *
- *  ->inode_lock
- *    ->sb_lock               (fs/fs-writeback.c)
+ *  inode_wb_list_lock
+ *    sb_lock                 (fs/fs-writeback.c)
  *    ->mapping->tree_lock    (__sync_single_inode)
  *
  *  ->i_mmap_lock
@@ -98,9 +98,9 @@
  *    ->zone.lru_lock         (check_pte_range->isolate_lru_page)
  *    ->private_lock          (page_remove_rmap->set_page_dirty)
  *    ->tree_lock             (page_remove_rmap->set_page_dirty)
- *    ->inode_lock            (page_remove_rmap->set_page_dirty)
+ *    inode_wb_list_lock      (page_remove_rmap->set_page_dirty)
  *    ->inode->i_lock         (page_remove_rmap->set_page_dirty)
- *    ->inode_lock            (zap_pte_range->set_page_dirty)
+ *    inode_wb_list_lock      (zap_pte_range->set_page_dirty)
  *    ->inode->i_lock         (zap_pte_range->set_page_dirty)
  *    ->private_lock          (zap_pte_range->__set_page_dirty_buffers)
  *
mm/rmap.c
@@ -31,12 +31,12 @@
  *   swap_lock (in swap_duplicate, swap_info_get)
  *     mmlist_lock (in mmput, drain_mmlist and others)
  *     mapping->private_lock (in __set_page_dirty_buffers)
- *     inode_lock (in set_page_dirty's __mark_inode_dirty)
  *     inode->i_lock (in set_page_dirty's __mark_inode_dirty)
+ *     inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty)
  *       sb_lock (within inode_lock in fs/fs-writeback.c)
  *       mapping->tree_lock (widely used, in set_page_dirty,
  *                 in arch-dependent flush_dcache_mmap_lock,
- *                 within inode_lock in __sync_single_inode)
+ *                 within inode_wb_list_lock in __sync_single_inode)
  *
  * (code doesn't rely on that order so it could be switched around)
  * ->tasklist_lock