Commit a66979abad090b2765a6c6790c9fdeab996833f2
Committed by
Al Viro
1 parent
55fa6091d8
Exists in
master
and in
4 other branches
fs: move i_wb_list out from under inode_lock
Protect the inode writeback list with a new global lock inode_wb_list_lock and use it to protect the list manipulations and traversals. This lock replaces the inode_lock as the inodes on the list can be validity checked while holding the inode->i_lock and hence the inode_lock is no longer needed to protect the list. Signed-off-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Showing 8 changed files with 70 additions and 48 deletions Side-by-side Diff
fs/block_dev.c
... | ... | @@ -55,13 +55,13 @@ |
55 | 55 | static void bdev_inode_switch_bdi(struct inode *inode, |
56 | 56 | struct backing_dev_info *dst) |
57 | 57 | { |
58 | - spin_lock(&inode_lock); | |
58 | + spin_lock(&inode_wb_list_lock); | |
59 | 59 | spin_lock(&inode->i_lock); |
60 | 60 | inode->i_data.backing_dev_info = dst; |
61 | 61 | if (inode->i_state & I_DIRTY) |
62 | 62 | list_move(&inode->i_wb_list, &dst->wb.b_dirty); |
63 | 63 | spin_unlock(&inode->i_lock); |
64 | - spin_unlock(&inode_lock); | |
64 | + spin_unlock(&inode_wb_list_lock); | |
65 | 65 | } |
66 | 66 | |
67 | 67 | static sector_t max_block(struct block_device *bdev) |
fs/fs-writeback.c
... | ... | @@ -176,6 +176,17 @@ |
176 | 176 | } |
177 | 177 | |
178 | 178 | /* |
179 | + * Remove the inode from the writeback list it is on. | |
180 | + */ | |
181 | +void inode_wb_list_del(struct inode *inode) | |
182 | +{ | |
183 | + spin_lock(&inode_wb_list_lock); | |
184 | + list_del_init(&inode->i_wb_list); | |
185 | + spin_unlock(&inode_wb_list_lock); | |
186 | +} | |
187 | + | |
188 | + | |
189 | +/* | |
179 | 190 | * Redirty an inode: set its when-it-was dirtied timestamp and move it to the |
180 | 191 | * furthest end of its superblock's dirty-inode list. |
181 | 192 | * |
... | ... | @@ -188,6 +199,7 @@ |
188 | 199 | { |
189 | 200 | struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; |
190 | 201 | |
202 | + assert_spin_locked(&inode_wb_list_lock); | |
191 | 203 | if (!list_empty(&wb->b_dirty)) { |
192 | 204 | struct inode *tail; |
193 | 205 | |
194 | 206 | |
195 | 207 | |
... | ... | @@ -205,14 +217,17 @@ |
205 | 217 | { |
206 | 218 | struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; |
207 | 219 | |
220 | + assert_spin_locked(&inode_wb_list_lock); | |
208 | 221 | list_move(&inode->i_wb_list, &wb->b_more_io); |
209 | 222 | } |
210 | 223 | |
211 | 224 | static void inode_sync_complete(struct inode *inode) |
212 | 225 | { |
213 | 226 | /* |
214 | - * Prevent speculative execution through spin_unlock(&inode_lock); | |
227 | + * Prevent speculative execution through | |
228 | + * spin_unlock(&inode_wb_list_lock); | |
215 | 229 | */ |
230 | + | |
216 | 231 | smp_mb(); |
217 | 232 | wake_up_bit(&inode->i_state, __I_SYNC); |
218 | 233 | } |
... | ... | @@ -286,6 +301,7 @@ |
286 | 301 | */ |
287 | 302 | static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) |
288 | 303 | { |
304 | + assert_spin_locked(&inode_wb_list_lock); | |
289 | 305 | list_splice_init(&wb->b_more_io, &wb->b_io); |
290 | 306 | move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); |
291 | 307 | } |
292 | 308 | |
293 | 309 | |
294 | 310 | |
... | ... | @@ -308,25 +324,23 @@ |
308 | 324 | wqh = bit_waitqueue(&inode->i_state, __I_SYNC); |
309 | 325 | while (inode->i_state & I_SYNC) { |
310 | 326 | spin_unlock(&inode->i_lock); |
311 | - spin_unlock(&inode_lock); | |
327 | + spin_unlock(&inode_wb_list_lock); | |
312 | 328 | __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); |
313 | - spin_lock(&inode_lock); | |
329 | + spin_lock(&inode_wb_list_lock); | |
314 | 330 | spin_lock(&inode->i_lock); |
315 | 331 | } |
316 | 332 | } |
317 | 333 | |
318 | 334 | /* |
319 | - * Write out an inode's dirty pages. Called under inode_lock. Either the | |
320 | - * caller has ref on the inode (either via __iget or via syscall against an fd) | |
321 | - * or the inode has I_WILL_FREE set (via generic_forget_inode) | |
335 | + * Write out an inode's dirty pages. Called under inode_wb_list_lock. Either | |
336 | + * the caller has an active reference on the inode or the inode has I_WILL_FREE | |
337 | + * set. | |
322 | 338 | * |
323 | 339 | * If `wait' is set, wait on the writeout. |
324 | 340 | * |
325 | 341 | * The whole writeout design is quite complex and fragile. We want to avoid |
326 | 342 | * starvation of particular inodes when others are being redirtied, prevent |
327 | 343 | * livelocks, etc. |
328 | - * | |
329 | - * Called under inode_lock. | |
330 | 344 | */ |
331 | 345 | static int |
332 | 346 | writeback_single_inode(struct inode *inode, struct writeback_control *wbc) |
... | ... | @@ -368,7 +382,7 @@ |
368 | 382 | inode->i_state |= I_SYNC; |
369 | 383 | inode->i_state &= ~I_DIRTY_PAGES; |
370 | 384 | spin_unlock(&inode->i_lock); |
371 | - spin_unlock(&inode_lock); | |
385 | + spin_unlock(&inode_wb_list_lock); | |
372 | 386 | |
373 | 387 | ret = do_writepages(mapping, wbc); |
374 | 388 | |
375 | 389 | |
... | ... | @@ -388,12 +402,10 @@ |
388 | 402 | * due to delalloc, clear dirty metadata flags right before |
389 | 403 | * write_inode() |
390 | 404 | */ |
391 | - spin_lock(&inode_lock); | |
392 | 405 | spin_lock(&inode->i_lock); |
393 | 406 | dirty = inode->i_state & I_DIRTY; |
394 | 407 | inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); |
395 | 408 | spin_unlock(&inode->i_lock); |
396 | - spin_unlock(&inode_lock); | |
397 | 409 | /* Don't write the inode if only I_DIRTY_PAGES was set */ |
398 | 410 | if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { |
399 | 411 | int err = write_inode(inode, wbc); |
... | ... | @@ -401,7 +413,7 @@ |
401 | 413 | ret = err; |
402 | 414 | } |
403 | 415 | |
404 | - spin_lock(&inode_lock); | |
416 | + spin_lock(&inode_wb_list_lock); | |
405 | 417 | spin_lock(&inode->i_lock); |
406 | 418 | inode->i_state &= ~I_SYNC; |
407 | 419 | if (!(inode->i_state & I_FREEING)) { |
408 | 420 | |
... | ... | @@ -543,10 +555,10 @@ |
543 | 555 | */ |
544 | 556 | redirty_tail(inode); |
545 | 557 | } |
546 | - spin_unlock(&inode_lock); | |
558 | + spin_unlock(&inode_wb_list_lock); | |
547 | 559 | iput(inode); |
548 | 560 | cond_resched(); |
549 | - spin_lock(&inode_lock); | |
561 | + spin_lock(&inode_wb_list_lock); | |
550 | 562 | if (wbc->nr_to_write <= 0) { |
551 | 563 | wbc->more_io = 1; |
552 | 564 | return 1; |
... | ... | @@ -565,7 +577,7 @@ |
565 | 577 | |
566 | 578 | if (!wbc->wb_start) |
567 | 579 | wbc->wb_start = jiffies; /* livelock avoidance */ |
568 | - spin_lock(&inode_lock); | |
580 | + spin_lock(&inode_wb_list_lock); | |
569 | 581 | if (!wbc->for_kupdate || list_empty(&wb->b_io)) |
570 | 582 | queue_io(wb, wbc->older_than_this); |
571 | 583 | |
... | ... | @@ -583,7 +595,7 @@ |
583 | 595 | if (ret) |
584 | 596 | break; |
585 | 597 | } |
586 | - spin_unlock(&inode_lock); | |
598 | + spin_unlock(&inode_wb_list_lock); | |
587 | 599 | /* Leave any unwritten inodes on b_io */ |
588 | 600 | } |
589 | 601 | |
590 | 602 | |
... | ... | @@ -592,11 +604,11 @@ |
592 | 604 | { |
593 | 605 | WARN_ON(!rwsem_is_locked(&sb->s_umount)); |
594 | 606 | |
595 | - spin_lock(&inode_lock); | |
607 | + spin_lock(&inode_wb_list_lock); | |
596 | 608 | if (!wbc->for_kupdate || list_empty(&wb->b_io)) |
597 | 609 | queue_io(wb, wbc->older_than_this); |
598 | 610 | writeback_sb_inodes(sb, wb, wbc, true); |
599 | - spin_unlock(&inode_lock); | |
611 | + spin_unlock(&inode_wb_list_lock); | |
600 | 612 | } |
601 | 613 | |
602 | 614 | /* |
... | ... | @@ -735,7 +747,7 @@ |
735 | 747 | * become available for writeback. Otherwise |
736 | 748 | * we'll just busyloop. |
737 | 749 | */ |
738 | - spin_lock(&inode_lock); | |
750 | + spin_lock(&inode_wb_list_lock); | |
739 | 751 | if (!list_empty(&wb->b_more_io)) { |
740 | 752 | inode = wb_inode(wb->b_more_io.prev); |
741 | 753 | trace_wbc_writeback_wait(&wbc, wb->bdi); |
... | ... | @@ -743,7 +755,7 @@ |
743 | 755 | inode_wait_for_writeback(inode); |
744 | 756 | spin_unlock(&inode->i_lock); |
745 | 757 | } |
746 | - spin_unlock(&inode_lock); | |
758 | + spin_unlock(&inode_wb_list_lock); | |
747 | 759 | } |
748 | 760 | |
749 | 761 | return wrote; |
... | ... | @@ -1009,7 +1021,6 @@ |
1009 | 1021 | { |
1010 | 1022 | struct super_block *sb = inode->i_sb; |
1011 | 1023 | struct backing_dev_info *bdi = NULL; |
1012 | - bool wakeup_bdi = false; | |
1013 | 1024 | |
1014 | 1025 | /* |
1015 | 1026 | * Don't do this for I_DIRTY_PAGES - that doesn't actually |
... | ... | @@ -1033,7 +1044,6 @@ |
1033 | 1044 | if (unlikely(block_dump)) |
1034 | 1045 | block_dump___mark_inode_dirty(inode); |
1035 | 1046 | |
1036 | - spin_lock(&inode_lock); | |
1037 | 1047 | spin_lock(&inode->i_lock); |
1038 | 1048 | if ((inode->i_state & flags) != flags) { |
1039 | 1049 | const int was_dirty = inode->i_state & I_DIRTY; |
1040 | 1050 | |
... | ... | @@ -1059,12 +1069,12 @@ |
1059 | 1069 | if (inode->i_state & I_FREEING) |
1060 | 1070 | goto out_unlock_inode; |
1061 | 1071 | |
1062 | - spin_unlock(&inode->i_lock); | |
1063 | 1072 | /* |
1064 | 1073 | * If the inode was already on b_dirty/b_io/b_more_io, don't |
1065 | 1074 | * reposition it (that would break b_dirty time-ordering). |
1066 | 1075 | */ |
1067 | 1076 | if (!was_dirty) { |
1077 | + bool wakeup_bdi = false; | |
1068 | 1078 | bdi = inode_to_bdi(inode); |
1069 | 1079 | |
1070 | 1080 | if (bdi_cap_writeback_dirty(bdi)) { |
1071 | 1081 | |
1072 | 1082 | |
1073 | 1083 | |
1074 | 1084 | |
... | ... | @@ -1081,18 +1091,20 @@ |
1081 | 1091 | wakeup_bdi = true; |
1082 | 1092 | } |
1083 | 1093 | |
1094 | + spin_unlock(&inode->i_lock); | |
1095 | + spin_lock(&inode_wb_list_lock); | |
1084 | 1096 | inode->dirtied_when = jiffies; |
1085 | 1097 | list_move(&inode->i_wb_list, &bdi->wb.b_dirty); |
1098 | + spin_unlock(&inode_wb_list_lock); | |
1099 | + | |
1100 | + if (wakeup_bdi) | |
1101 | + bdi_wakeup_thread_delayed(bdi); | |
1102 | + return; | |
1086 | 1103 | } |
1087 | - goto out; | |
1088 | 1104 | } |
1089 | 1105 | out_unlock_inode: |
1090 | 1106 | spin_unlock(&inode->i_lock); |
1091 | -out: | |
1092 | - spin_unlock(&inode_lock); | |
1093 | 1107 | |
1094 | - if (wakeup_bdi) | |
1095 | - bdi_wakeup_thread_delayed(bdi); | |
1096 | 1108 | } |
1097 | 1109 | EXPORT_SYMBOL(__mark_inode_dirty); |
1098 | 1110 | |
1099 | 1111 | |
... | ... | @@ -1296,9 +1308,9 @@ |
1296 | 1308 | wbc.nr_to_write = 0; |
1297 | 1309 | |
1298 | 1310 | might_sleep(); |
1299 | - spin_lock(&inode_lock); | |
1311 | + spin_lock(&inode_wb_list_lock); | |
1300 | 1312 | ret = writeback_single_inode(inode, &wbc); |
1301 | - spin_unlock(&inode_lock); | |
1313 | + spin_unlock(&inode_wb_list_lock); | |
1302 | 1314 | if (sync) |
1303 | 1315 | inode_sync_wait(inode); |
1304 | 1316 | return ret; |
1305 | 1317 | |
... | ... | @@ -1320,9 +1332,9 @@ |
1320 | 1332 | { |
1321 | 1333 | int ret; |
1322 | 1334 | |
1323 | - spin_lock(&inode_lock); | |
1335 | + spin_lock(&inode_wb_list_lock); | |
1324 | 1336 | ret = writeback_single_inode(inode, wbc); |
1325 | - spin_unlock(&inode_lock); | |
1337 | + spin_unlock(&inode_wb_list_lock); | |
1326 | 1338 | return ret; |
1327 | 1339 | } |
1328 | 1340 | EXPORT_SYMBOL(sync_inode); |
fs/inode.c
... | ... | @@ -26,6 +26,7 @@ |
26 | 26 | #include <linux/posix_acl.h> |
27 | 27 | #include <linux/ima.h> |
28 | 28 | #include <linux/cred.h> |
29 | +#include "internal.h" | |
29 | 30 | |
30 | 31 | /* |
31 | 32 | * inode locking rules. |
... | ... | @@ -36,6 +37,8 @@ |
36 | 37 | * inode_lru, inode->i_lru |
37 | 38 | * inode_sb_list_lock protects: |
38 | 39 | * sb->s_inodes, inode->i_sb_list |
40 | + * inode_wb_list_lock protects: | |
41 | + * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list | |
39 | 42 | * |
40 | 43 | * Lock ordering: |
41 | 44 | * inode_lock |
... | ... | @@ -44,6 +47,9 @@ |
44 | 47 | * inode_sb_list_lock |
45 | 48 | * inode->i_lock |
46 | 49 | * inode_lru_lock |
50 | + * | |
51 | + * inode_wb_list_lock | |
52 | + * inode->i_lock | |
47 | 53 | */ |
48 | 54 | |
49 | 55 | /* |
... | ... | @@ -105,6 +111,7 @@ |
105 | 111 | DEFINE_SPINLOCK(inode_lock); |
106 | 112 | |
107 | 113 | __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); |
114 | +__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock); | |
108 | 115 | |
109 | 116 | /* |
110 | 117 | * iprune_sem provides exclusion between the icache shrinking and the |
... | ... | @@ -483,10 +490,7 @@ |
483 | 490 | BUG_ON(!(inode->i_state & I_FREEING)); |
484 | 491 | BUG_ON(!list_empty(&inode->i_lru)); |
485 | 492 | |
486 | - spin_lock(&inode_lock); | |
487 | - list_del_init(&inode->i_wb_list); | |
488 | - spin_unlock(&inode_lock); | |
489 | - | |
493 | + inode_wb_list_del(inode); | |
490 | 494 | inode_sb_list_del(inode); |
491 | 495 | |
492 | 496 | if (op->evict_inode) { |
fs/internal.h
... | ... | @@ -127,6 +127,11 @@ |
127 | 127 | */ |
128 | 128 | extern spinlock_t inode_sb_list_lock; |
129 | 129 | |
130 | +/* | |
131 | + * fs-writeback.c | |
132 | + */ | |
133 | +extern void inode_wb_list_del(struct inode *inode); | |
134 | + | |
130 | 135 | extern int get_nr_dirty_inodes(void); |
131 | 136 | extern void evict_inodes(struct super_block *); |
132 | 137 | extern int invalidate_inodes(struct super_block *, bool); |
include/linux/writeback.h
mm/backing-dev.c
... | ... | @@ -73,14 +73,14 @@ |
73 | 73 | struct inode *inode; |
74 | 74 | |
75 | 75 | nr_wb = nr_dirty = nr_io = nr_more_io = 0; |
76 | - spin_lock(&inode_lock); | |
76 | + spin_lock(&inode_wb_list_lock); | |
77 | 77 | list_for_each_entry(inode, &wb->b_dirty, i_wb_list) |
78 | 78 | nr_dirty++; |
79 | 79 | list_for_each_entry(inode, &wb->b_io, i_wb_list) |
80 | 80 | nr_io++; |
81 | 81 | list_for_each_entry(inode, &wb->b_more_io, i_wb_list) |
82 | 82 | nr_more_io++; |
83 | - spin_unlock(&inode_lock); | |
83 | + spin_unlock(&inode_wb_list_lock); | |
84 | 84 | |
85 | 85 | global_dirty_limits(&background_thresh, &dirty_thresh); |
86 | 86 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); |
87 | 87 | |
... | ... | @@ -682,11 +682,11 @@ |
682 | 682 | if (bdi_has_dirty_io(bdi)) { |
683 | 683 | struct bdi_writeback *dst = &default_backing_dev_info.wb; |
684 | 684 | |
685 | - spin_lock(&inode_lock); | |
685 | + spin_lock(&inode_wb_list_lock); | |
686 | 686 | list_splice(&bdi->wb.b_dirty, &dst->b_dirty); |
687 | 687 | list_splice(&bdi->wb.b_io, &dst->b_io); |
688 | 688 | list_splice(&bdi->wb.b_more_io, &dst->b_more_io); |
689 | - spin_unlock(&inode_lock); | |
689 | + spin_unlock(&inode_wb_list_lock); | |
690 | 690 | } |
691 | 691 | |
692 | 692 | bdi_unregister(bdi); |
mm/filemap.c
... | ... | @@ -80,8 +80,8 @@ |
80 | 80 | * ->i_mutex |
81 | 81 | * ->i_alloc_sem (various) |
82 | 82 | * |
83 | - * ->inode_lock | |
84 | - * ->sb_lock (fs/fs-writeback.c) | |
83 | + * inode_wb_list_lock | |
84 | + * sb_lock (fs/fs-writeback.c) | |
85 | 85 | * ->mapping->tree_lock (__sync_single_inode) |
86 | 86 | * |
87 | 87 | * ->i_mmap_lock |
88 | 88 | |
... | ... | @@ -98,9 +98,9 @@ |
98 | 98 | * ->zone.lru_lock (check_pte_range->isolate_lru_page) |
99 | 99 | * ->private_lock (page_remove_rmap->set_page_dirty) |
100 | 100 | * ->tree_lock (page_remove_rmap->set_page_dirty) |
101 | - * ->inode_lock (page_remove_rmap->set_page_dirty) | |
101 | + * inode_wb_list_lock (page_remove_rmap->set_page_dirty) | |
102 | 102 | * ->inode->i_lock (page_remove_rmap->set_page_dirty) |
103 | - * ->inode_lock (zap_pte_range->set_page_dirty) | |
103 | + * inode_wb_list_lock (zap_pte_range->set_page_dirty) | |
104 | 104 | * ->inode->i_lock (zap_pte_range->set_page_dirty) |
105 | 105 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) |
106 | 106 | * |
mm/rmap.c
... | ... | @@ -31,12 +31,12 @@ |
31 | 31 | * swap_lock (in swap_duplicate, swap_info_get) |
32 | 32 | * mmlist_lock (in mmput, drain_mmlist and others) |
33 | 33 | * mapping->private_lock (in __set_page_dirty_buffers) |
34 | - * inode_lock (in set_page_dirty's __mark_inode_dirty) | |
35 | 34 | * inode->i_lock (in set_page_dirty's __mark_inode_dirty) |
35 | + * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty) | |
36 | 36 | * sb_lock (within inode_lock in fs/fs-writeback.c) |
37 | 37 | * mapping->tree_lock (widely used, in set_page_dirty, |
38 | 38 | * in arch-dependent flush_dcache_mmap_lock, |
39 | - * within inode_lock in __sync_single_inode) | |
39 | + * within inode_wb_list_lock in __sync_single_inode) | |
40 | 40 | * |
41 | 41 | * (code doesn't rely on that order so it could be switched around) |
42 | 42 | * ->tasklist_lock |