Commit 250df6ed274d767da844a5d9f05720b804240197

Authored by Dave Chinner
Committed by Al Viro
1 parent 3dc8fe4dca

fs: protect inode->i_state with inode->i_lock

Protect inode state transitions and validity checks with the
inode->i_lock. This enables us to make inode state transitions
independently of the inode_lock and is the first step to peeling
away the inode_lock from the code.
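
For illustration, the nesting this establishes (condensed from the
fs/block_dev.c hunk below; inode_lock is still the outer lock at this
stage of the series) is:

        spin_lock(&inode_lock);         /* outer lock, not yet removed */
        spin_lock(&inode->i_lock);      /* now guards inode->i_state */
        /* test or modify inode->i_state here */
        spin_unlock(&inode->i_lock);
        spin_unlock(&inode_lock);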

This requires that __iget() is done atomically with i_state checks
during list traversals so that we don't race with another thread
marking the inode I_FREEING between the state check and grabbing the
reference.
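
Concretely, the list walks now take the following shape (condensed from
the fs/drop_caches.c and fs/fs-writeback.c list-walk hunks below), so
the I_FREEING check and the reference count bump happen under the same
i_lock hold:

        list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }
                __iget(inode);          /* safe: I_FREEING cannot be set underneath us */
                spin_unlock(&inode->i_lock);
                /* ... drop inode_lock and work on the inode ... */
        }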

Also remove the unlock_new_inode() memory barrier optimisation that
was required to avoid taking the inode_lock when clearing I_NEW.
Simplify the code by taking inode->i_lock around the state change and
wakeup. Because the wakeup is no longer tricky, remove the
wake_up_inode() function and open code the wakeup where necessary.
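
With the barrier trick gone, clearing I_NEW reduces to a plain
lock/modify/wake sequence (see the unlock_new_inode() hunk below):

        spin_lock(&inode->i_lock);
        WARN_ON(!(inode->i_state & I_NEW));
        inode->i_state &= ~I_NEW;
        wake_up_bit(&inode->i_state, __I_NEW);
        spin_unlock(&inode->i_lock);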

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

Showing 11 changed files with 174 additions and 74 deletions

fs/block_dev.c
... ... @@ -56,9 +56,11 @@
56 56 struct backing_dev_info *dst)
57 57 {
58 58 spin_lock(&inode_lock);
  59 + spin_lock(&inode->i_lock);
59 60 inode->i_data.backing_dev_info = dst;
60 61 if (inode->i_state & I_DIRTY)
61 62 list_move(&inode->i_wb_list, &dst->wb.b_dirty);
  63 + spin_unlock(&inode->i_lock);
62 64 spin_unlock(&inode_lock);
63 65 }
64 66  
fs/buffer.c
... ... @@ -1144,7 +1144,7 @@
1144 1144 * inode list.
1145 1145 *
1146 1146 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1147   - * mapping->tree_lock and the global inode_lock.
  1147 + * mapping->tree_lock and mapping->host->i_lock.
1148 1148 */
1149 1149 void mark_buffer_dirty(struct buffer_head *bh)
1150 1150 {
fs/drop_caches.c
... ... @@ -18,11 +18,14 @@
18 18  
19 19 spin_lock(&inode_lock);
20 20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
21   - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
  21 + spin_lock(&inode->i_lock);
  22 + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
  23 + (inode->i_mapping->nrpages == 0)) {
  24 + spin_unlock(&inode->i_lock);
22 25 continue;
23   - if (inode->i_mapping->nrpages == 0)
24   - continue;
  26 + }
25 27 __iget(inode);
  28 + spin_unlock(&inode->i_lock);
26 29 spin_unlock(&inode_lock);
27 30 invalidate_mapping_pages(inode->i_mapping, 0, -1);
28 31 iput(toput_inode);
fs/fs-writeback.c
... ... @@ -306,10 +306,12 @@
306 306 wait_queue_head_t *wqh;
307 307  
308 308 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
309   - while (inode->i_state & I_SYNC) {
  309 + while (inode->i_state & I_SYNC) {
  310 + spin_unlock(&inode->i_lock);
310 311 spin_unlock(&inode_lock);
311 312 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
312 313 spin_lock(&inode_lock);
  314 + spin_lock(&inode->i_lock);
313 315 }
314 316 }
315 317  
... ... @@ -333,6 +335,7 @@
333 335 unsigned dirty;
334 336 int ret;
335 337  
  338 + spin_lock(&inode->i_lock);
336 339 if (!atomic_read(&inode->i_count))
337 340 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
338 341 else
... ... @@ -348,6 +351,7 @@
348 351 * completed a full scan of b_io.
349 352 */
350 353 if (wbc->sync_mode != WB_SYNC_ALL) {
  354 + spin_unlock(&inode->i_lock);
351 355 requeue_io(inode);
352 356 return 0;
353 357 }
... ... @@ -363,6 +367,7 @@
363 367 /* Set I_SYNC, reset I_DIRTY_PAGES */
364 368 inode->i_state |= I_SYNC;
365 369 inode->i_state &= ~I_DIRTY_PAGES;
  370 + spin_unlock(&inode->i_lock);
366 371 spin_unlock(&inode_lock);
367 372  
368 373 ret = do_writepages(mapping, wbc);
369 374  
... ... @@ -384,8 +389,10 @@
384 389 * write_inode()
385 390 */
386 391 spin_lock(&inode_lock);
  392 + spin_lock(&inode->i_lock);
387 393 dirty = inode->i_state & I_DIRTY;
388 394 inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
  395 + spin_unlock(&inode->i_lock);
389 396 spin_unlock(&inode_lock);
390 397 /* Don't write the inode if only I_DIRTY_PAGES was set */
391 398 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
... ... @@ -395,6 +402,7 @@
395 402 }
396 403  
397 404 spin_lock(&inode_lock);
  405 + spin_lock(&inode->i_lock);
398 406 inode->i_state &= ~I_SYNC;
399 407 if (!(inode->i_state & I_FREEING)) {
400 408 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
... ... @@ -436,6 +444,7 @@
436 444 }
437 445 }
438 446 inode_sync_complete(inode);
  447 + spin_unlock(&inode->i_lock);
439 448 return ret;
440 449 }
441 450  
442 451  
... ... @@ -506,7 +515,9 @@
506 515 * kind does not need peridic writeout yet, and for the latter
507 516 * kind writeout is handled by the freer.
508 517 */
  518 + spin_lock(&inode->i_lock);
509 519 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
  520 + spin_unlock(&inode->i_lock);
510 521 requeue_io(inode);
511 522 continue;
512 523 }
513 524  
514 525  
... ... @@ -515,10 +526,14 @@
515 526 * Was this inode dirtied after sync_sb_inodes was called?
516 527 * This keeps sync from extra jobs and livelock.
517 528 */
518   - if (inode_dirtied_after(inode, wbc->wb_start))
  529 + if (inode_dirtied_after(inode, wbc->wb_start)) {
  530 + spin_unlock(&inode->i_lock);
519 531 return 1;
  532 + }
520 533  
521 534 __iget(inode);
  535 + spin_unlock(&inode->i_lock);
  536 +
522 537 pages_skipped = wbc->pages_skipped;
523 538 writeback_single_inode(inode, wbc);
524 539 if (wbc->pages_skipped != pages_skipped) {
525 540  
... ... @@ -724,7 +739,9 @@
724 739 if (!list_empty(&wb->b_more_io)) {
725 740 inode = wb_inode(wb->b_more_io.prev);
726 741 trace_wbc_writeback_wait(&wbc, wb->bdi);
  742 + spin_lock(&inode->i_lock);
727 743 inode_wait_for_writeback(inode);
  744 + spin_unlock(&inode->i_lock);
728 745 }
729 746 spin_unlock(&inode_lock);
730 747 }
... ... @@ -1017,6 +1034,7 @@
1017 1034 block_dump___mark_inode_dirty(inode);
1018 1035  
1019 1036 spin_lock(&inode_lock);
  1037 + spin_lock(&inode->i_lock);
1020 1038 if ((inode->i_state & flags) != flags) {
1021 1039 const int was_dirty = inode->i_state & I_DIRTY;
1022 1040  
... ... @@ -1028,7 +1046,7 @@
1028 1046 * superblock list, based upon its state.
1029 1047 */
1030 1048 if (inode->i_state & I_SYNC)
1031   - goto out;
  1049 + goto out_unlock_inode;
1032 1050  
1033 1051 /*
1034 1052 * Only add valid (hashed) inodes to the superblock's
1035 1053  
1036 1054  
... ... @@ -1036,11 +1054,12 @@
1036 1054 */
1037 1055 if (!S_ISBLK(inode->i_mode)) {
1038 1056 if (inode_unhashed(inode))
1039   - goto out;
  1057 + goto out_unlock_inode;
1040 1058 }
1041 1059 if (inode->i_state & I_FREEING)
1042   - goto out;
  1060 + goto out_unlock_inode;
1043 1061  
  1062 + spin_unlock(&inode->i_lock);
1044 1063 /*
1045 1064 * If the inode was already on b_dirty/b_io/b_more_io, don't
1046 1065 * reposition it (that would break b_dirty time-ordering).
1047 1066  
... ... @@ -1065,7 +1084,10 @@
1065 1084 inode->dirtied_when = jiffies;
1066 1085 list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
1067 1086 }
  1087 + goto out;
1068 1088 }
  1089 +out_unlock_inode:
  1090 + spin_unlock(&inode->i_lock);
1069 1091 out:
1070 1092 spin_unlock(&inode_lock);
... ... @@ -1111,14 +1133,16 @@
1111 1133 * we still have to wait for that writeout.
1112 1134 */
1113 1135 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
1114   - struct address_space *mapping;
  1136 + struct address_space *mapping = inode->i_mapping;
1115 1137  
1116   - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
  1138 + spin_lock(&inode->i_lock);
  1139 + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
  1140 + (mapping->nrpages == 0)) {
  1141 + spin_unlock(&inode->i_lock);
1117 1142 continue;
1118   - mapping = inode->i_mapping;
1119   - if (mapping->nrpages == 0)
1120   - continue;
  1143 + }
1121 1144 __iget(inode);
  1145 + spin_unlock(&inode->i_lock);
1122 1146 spin_unlock(&inode_lock);
1123 1147 /*
1124 1148 * We hold a reference to 'inode' so it couldn't have
fs/inode.c
... ... @@ -28,6 +28,17 @@
28 28 #include <linux/cred.h>
29 29  
30 30 /*
  31 + * inode locking rules.
  32 + *
  33 + * inode->i_lock protects:
  34 + * inode->i_state, inode->i_hash, __iget()
  35 + *
  36 + * Lock ordering:
  37 + * inode_lock
  38 + * inode->i_lock
  39 + */
  40 +
  41 +/*
31 42 * This is needed for the following functions:
32 43 * - inode_has_buffers
33 44 * - invalidate_bdev
... ... @@ -137,15 +148,6 @@
137 148 }
138 149 #endif
139 150  
140   -static void wake_up_inode(struct inode *inode)
141   -{
142   - /*
143   - * Prevent speculative execution through spin_unlock(&inode_lock);
144   - */
145   - smp_mb();
146   - wake_up_bit(&inode->i_state, __I_NEW);
147   -}
148   -
149 151 /**
150 152 * inode_init_always - perform inode structure intialisation
151 153 * @sb: superblock inode belongs to
... ... @@ -336,7 +338,7 @@
336 338 }
337 339  
338 340 /*
339   - * inode_lock must be held
  341 + * inode->i_lock must be held
340 342 */
341 343 void __iget(struct inode *inode)
342 344 {
343 345  
... ... @@ -413,7 +415,9 @@
413 415 struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
414 416  
415 417 spin_lock(&inode_lock);
  418 + spin_lock(&inode->i_lock);
416 419 hlist_add_head(&inode->i_hash, b);
  420 + spin_unlock(&inode->i_lock);
417 421 spin_unlock(&inode_lock);
418 422 }
419 423 EXPORT_SYMBOL(__insert_inode_hash);
420 424  
... ... @@ -438,7 +442,9 @@
438 442 void remove_inode_hash(struct inode *inode)
439 443 {
440 444 spin_lock(&inode_lock);
  445 + spin_lock(&inode->i_lock);
441 446 hlist_del_init(&inode->i_hash);
  447 + spin_unlock(&inode->i_lock);
442 448 spin_unlock(&inode_lock);
443 449 }
444 450 EXPORT_SYMBOL(remove_inode_hash);
... ... @@ -495,7 +501,9 @@
495 501 __inode_sb_list_del(inode);
496 502 spin_unlock(&inode_lock);
497 503  
498   - wake_up_inode(inode);
  504 + spin_lock(&inode->i_lock);
  505 + wake_up_bit(&inode->i_state, __I_NEW);
  506 + spin_unlock(&inode->i_lock);
499 507 destroy_inode(inode);
500 508 }
501 509 }
502 510  
503 511  
... ... @@ -518,10 +526,17 @@
518 526 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
519 527 if (atomic_read(&inode->i_count))
520 528 continue;
521   - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
  529 +
  530 + spin_lock(&inode->i_lock);
  531 + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
  532 + spin_unlock(&inode->i_lock);
522 533 continue;
  534 + }
523 535  
524 536 inode->i_state |= I_FREEING;
  537 + if (!(inode->i_state & (I_DIRTY | I_SYNC)))
  538 + inodes_stat.nr_unused--;
  539 + spin_unlock(&inode->i_lock);
525 540  
526 541 /*
527 542 * Move the inode off the IO lists and LRU once I_FREEING is
... ... @@ -529,8 +544,6 @@
529 544 */
530 545 list_move(&inode->i_lru, &dispose);
531 546 list_del_init(&inode->i_wb_list);
532   - if (!(inode->i_state & (I_DIRTY | I_SYNC)))
533   - inodes_stat.nr_unused--;
534 547 }
535 548 spin_unlock(&inode_lock);
... ... @@ -563,18 +576,26 @@
563 576  
564 577 spin_lock(&inode_lock);
565 578 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
566   - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
  579 + spin_lock(&inode->i_lock);
  580 + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
  581 + spin_unlock(&inode->i_lock);
567 582 continue;
  583 + }
568 584 if (inode->i_state & I_DIRTY && !kill_dirty) {
  585 + spin_unlock(&inode->i_lock);
569 586 busy = 1;
570 587 continue;
571 588 }
572 589 if (atomic_read(&inode->i_count)) {
  590 + spin_unlock(&inode->i_lock);
573 591 busy = 1;
574 592 continue;
575 593 }
576 594  
577 595 inode->i_state |= I_FREEING;
  596 + if (!(inode->i_state & (I_DIRTY | I_SYNC)))
  597 + inodes_stat.nr_unused--;
  598 + spin_unlock(&inode->i_lock);
578 599  
579 600 /*
580 601 * Move the inode off the IO lists and LRU once I_FREEING is
... ... @@ -582,8 +603,6 @@
582 603 */
583 604 list_move(&inode->i_lru, &dispose);
584 605 list_del_init(&inode->i_wb_list);
585   - if (!(inode->i_state & (I_DIRTY | I_SYNC)))
586   - inodes_stat.nr_unused--;
587 606 }
588 607 spin_unlock(&inode_lock);
589 608  
590 609  
... ... @@ -641,8 +660,10 @@
641 660 * Referenced or dirty inodes are still in use. Give them
642 661 * another pass through the LRU as we canot reclaim them now.
643 662 */
  663 + spin_lock(&inode->i_lock);
644 664 if (atomic_read(&inode->i_count) ||
645 665 (inode->i_state & ~I_REFERENCED)) {
  666 + spin_unlock(&inode->i_lock);
646 667 list_del_init(&inode->i_lru);
647 668 inodes_stat.nr_unused--;
648 669 continue;
649 670  
650 671  
... ... @@ -650,12 +671,14 @@
650 671  
651 672 /* recently referenced inodes get one more pass */
652 673 if (inode->i_state & I_REFERENCED) {
653   - list_move(&inode->i_lru, &inode_lru);
654 674 inode->i_state &= ~I_REFERENCED;
  675 + spin_unlock(&inode->i_lock);
  676 + list_move(&inode->i_lru, &inode_lru);
655 677 continue;
656 678 }
657 679 if (inode_has_buffers(inode) || inode->i_data.nrpages) {
658 680 __iget(inode);
  681 + spin_unlock(&inode->i_lock);
659 682 spin_unlock(&inode_lock);
660 683 if (remove_inode_buffers(inode))
661 684 reap += invalidate_mapping_pages(&inode->i_data,
662 685  
663 686  
... ... @@ -666,11 +689,15 @@
666 689 if (inode != list_entry(inode_lru.next,
667 690 struct inode, i_lru))
668 691 continue; /* wrong inode or list_empty */
669   - if (!can_unuse(inode))
  692 + spin_lock(&inode->i_lock);
  693 + if (!can_unuse(inode)) {
  694 + spin_unlock(&inode->i_lock);
670 695 continue;
  696 + }
671 697 }
672 698 WARN_ON(inode->i_state & I_NEW);
673 699 inode->i_state |= I_FREEING;
  700 + spin_unlock(&inode->i_lock);
674 701  
675 702 /*
676 703 * Move the inode off the IO lists and LRU once I_FREEING is
677 704  
... ... @@ -737,11 +764,13 @@
737 764 continue;
738 765 if (!test(inode, data))
739 766 continue;
  767 + spin_lock(&inode->i_lock);
740 768 if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
741 769 __wait_on_freeing_inode(inode);
742 770 goto repeat;
743 771 }
744 772 __iget(inode);
  773 + spin_unlock(&inode->i_lock);
745 774 return inode;
746 775 }
747 776 return NULL;
748 777  
... ... @@ -763,11 +792,13 @@
763 792 continue;
764 793 if (inode->i_sb != sb)
765 794 continue;
  795 + spin_lock(&inode->i_lock);
766 796 if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
767 797 __wait_on_freeing_inode(inode);
768 798 goto repeat;
769 799 }
770 800 __iget(inode);
  801 + spin_unlock(&inode->i_lock);
771 802 return inode;
772 803 }
773 804 return NULL;
774 805  
775 806  
... ... @@ -832,14 +863,23 @@
832 863 inode = alloc_inode(sb);
833 864 if (inode) {
834 865 spin_lock(&inode_lock);
835   - __inode_sb_list_add(inode);
  866 + spin_lock(&inode->i_lock);
836 867 inode->i_state = 0;
  868 + spin_unlock(&inode->i_lock);
  869 + __inode_sb_list_add(inode);
837 870 spin_unlock(&inode_lock);
838 871 }
839 872 return inode;
840 873 }
841 874 EXPORT_SYMBOL(new_inode);
842 875  
  876 +/**
  877 + * unlock_new_inode - clear the I_NEW state and wake up any waiters
  878 + * @inode: new inode to unlock
  879 + *
  880 + * Called when the inode is fully initialised to clear the new state of the
  881 + * inode and wake up anyone waiting for the inode to finish initialisation.
  882 + */
843 883 void unlock_new_inode(struct inode *inode)
844 884 {
845 885 #ifdef CONFIG_DEBUG_LOCK_ALLOC
846 886  
... ... @@ -859,19 +899,11 @@
859 899 }
860 900 }
861 901 #endif
862   - /*
863   - * This is special! We do not need the spinlock when clearing I_NEW,
864   - * because we're guaranteed that nobody else tries to do anything about
865   - * the state of the inode when it is locked, as we just created it (so
866   - * there can be no old holders that haven't tested I_NEW).
867   - * However we must emit the memory barrier so that other CPUs reliably
868   - * see the clearing of I_NEW after the other inode initialisation has
869   - * completed.
870   - */
871   - smp_mb();
  902 + spin_lock(&inode->i_lock);
872 903 WARN_ON(!(inode->i_state & I_NEW));
873 904 inode->i_state &= ~I_NEW;
874   - wake_up_inode(inode);
  905 + wake_up_bit(&inode->i_state, __I_NEW);
  906 + spin_unlock(&inode->i_lock);
875 907 }
876 908 EXPORT_SYMBOL(unlock_new_inode);
... ... @@ -900,9 +932,11 @@
900 932 if (set(inode, data))
901 933 goto set_failed;
902 934  
  935 + spin_lock(&inode->i_lock);
  936 + inode->i_state = I_NEW;
903 937 hlist_add_head(&inode->i_hash, head);
  938 + spin_unlock(&inode->i_lock);
904 939 __inode_sb_list_add(inode);
905   - inode->i_state = I_NEW;
906 940 spin_unlock(&inode_lock);
907 941  
908 942 /* Return the locked inode with I_NEW set, the
909 943  
910 944  
... ... @@ -947,9 +981,11 @@
947 981 old = find_inode_fast(sb, head, ino);
948 982 if (!old) {
949 983 inode->i_ino = ino;
  984 + spin_lock(&inode->i_lock);
  985 + inode->i_state = I_NEW;
950 986 hlist_add_head(&inode->i_hash, head);
  987 + spin_unlock(&inode->i_lock);
951 988 __inode_sb_list_add(inode);
952   - inode->i_state = I_NEW;
953 989 spin_unlock(&inode_lock);
954 990  
955 991 /* Return the locked inode with I_NEW set, the
956 992  
957 993  
... ... @@ -1034,15 +1070,19 @@
1034 1070 struct inode *igrab(struct inode *inode)
1035 1071 {
1036 1072 spin_lock(&inode_lock);
1037   - if (!(inode->i_state & (I_FREEING|I_WILL_FREE)))
  1073 + spin_lock(&inode->i_lock);
  1074 + if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) {
1038 1075 __iget(inode);
1039   - else
  1076 + spin_unlock(&inode->i_lock);
  1077 + } else {
  1078 + spin_unlock(&inode->i_lock);
1040 1079 /*
1041 1080 * Handle the case where s_op->clear_inode is not been
1042 1081 * called yet, and somebody is calling igrab
1043 1082 * while the inode is getting freed.
1044 1083 */
1045 1084 inode = NULL;
  1085 + }
1046 1086 spin_unlock(&inode_lock);
1047 1087 return inode;
1048 1088 }
... ... @@ -1271,7 +1311,6 @@
1271 1311 ino_t ino = inode->i_ino;
1272 1312 struct hlist_head *head = inode_hashtable + hash(sb, ino);
1273 1313  
1274   - inode->i_state |= I_NEW;
1275 1314 while (1) {
1276 1315 struct hlist_node *node;
1277 1316 struct inode *old = NULL;
... ... @@ -1281,16 +1320,23 @@
1281 1320 continue;
1282 1321 if (old->i_sb != sb)
1283 1322 continue;
1284   - if (old->i_state & (I_FREEING|I_WILL_FREE))
  1323 + spin_lock(&old->i_lock);
  1324 + if (old->i_state & (I_FREEING|I_WILL_FREE)) {
  1325 + spin_unlock(&old->i_lock);
1285 1326 continue;
  1327 + }
1286 1328 break;
1287 1329 }
1288 1330 if (likely(!node)) {
  1331 + spin_lock(&inode->i_lock);
  1332 + inode->i_state |= I_NEW;
1289 1333 hlist_add_head(&inode->i_hash, head);
  1334 + spin_unlock(&inode->i_lock);
1290 1335 spin_unlock(&inode_lock);
1291 1336 return 0;
1292 1337 }
1293 1338 __iget(old);
  1339 + spin_unlock(&old->i_lock);
1294 1340 spin_unlock(&inode_lock);
1295 1341 wait_on_inode(old);
1296 1342 if (unlikely(!inode_unhashed(old))) {
... ... @@ -1308,8 +1354,6 @@
1308 1354 struct super_block *sb = inode->i_sb;
1309 1355 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1310 1356  
1311   - inode->i_state |= I_NEW;
1312   -
1313 1357 while (1) {
1314 1358 struct hlist_node *node;
1315 1359 struct inode *old = NULL;
... ... @@ -1320,16 +1364,23 @@
1320 1364 continue;
1321 1365 if (!test(old, data))
1322 1366 continue;
1323   - if (old->i_state & (I_FREEING|I_WILL_FREE))
  1367 + spin_lock(&old->i_lock);
  1368 + if (old->i_state & (I_FREEING|I_WILL_FREE)) {
  1369 + spin_unlock(&old->i_lock);
1324 1370 continue;
  1371 + }
1325 1372 break;
1326 1373 }
1327 1374 if (likely(!node)) {
  1375 + spin_lock(&inode->i_lock);
  1376 + inode->i_state |= I_NEW;
1328 1377 hlist_add_head(&inode->i_hash, head);
  1378 + spin_unlock(&inode->i_lock);
1329 1379 spin_unlock(&inode_lock);
1330 1380 return 0;
1331 1381 }
1332 1382 __iget(old);
  1383 + spin_unlock(&old->i_lock);
1333 1384 spin_unlock(&inode_lock);
1334 1385 wait_on_inode(old);
1335 1386 if (unlikely(!inode_unhashed(old))) {
... ... @@ -1375,6 +1426,9 @@
1375 1426 const struct super_operations *op = inode->i_sb->s_op;
1376 1427 int drop;
1377 1428  
  1429 + spin_lock(&inode->i_lock);
  1430 + WARN_ON(inode->i_state & I_NEW);
  1431 +
1378 1432 if (op && op->drop_inode)
1379 1433 drop = op->drop_inode(inode);
1380 1434 else
... ... @@ -1386,21 +1440,23 @@
1386 1440 if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
1387 1441 inode_lru_list_add(inode);
1388 1442 }
  1443 + spin_unlock(&inode->i_lock);
1389 1444 spin_unlock(&inode_lock);
1390 1445 return;
1391 1446 }
1392   - WARN_ON(inode->i_state & I_NEW);
1393 1447 inode->i_state |= I_WILL_FREE;
  1448 + spin_unlock(&inode->i_lock);
1394 1449 spin_unlock(&inode_lock);
1395 1450 write_inode_now(inode, 1);
1396 1451 spin_lock(&inode_lock);
  1452 + spin_lock(&inode->i_lock);
1397 1453 WARN_ON(inode->i_state & I_NEW);
1398 1454 inode->i_state &= ~I_WILL_FREE;
1399 1455 __remove_inode_hash(inode);
1400 1456 }
1401 1457  
1402   - WARN_ON(inode->i_state & I_NEW);
1403 1458 inode->i_state |= I_FREEING;
  1459 + spin_unlock(&inode->i_lock);
1404 1460  
1405 1461 /*
1406 1462 * Move the inode off the IO lists and LRU once I_FREEING is
1407 1463  
... ... @@ -1413,8 +1469,10 @@
1413 1469 spin_unlock(&inode_lock);
1414 1470 evict(inode);
1415 1471 remove_inode_hash(inode);
1416   - wake_up_inode(inode);
  1472 + spin_lock(&inode->i_lock);
  1473 + wake_up_bit(&inode->i_state, __I_NEW);
1417 1474 BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
  1475 + spin_unlock(&inode->i_lock);
1418 1476 destroy_inode(inode);
1419 1477 }
1420 1478  
... ... @@ -1611,9 +1669,8 @@
1611 1669 * to recheck inode state.
1612 1670 *
1613 1671 * It doesn't matter if I_NEW is not set initially, a call to
1614   - * wake_up_inode() after removing from the hash list will DTRT.
1615   - *
1616   - * This is called with inode_lock held.
  1672 + * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
  1673 + * will DTRT.
1617 1674 */
1618 1675 static void __wait_on_freeing_inode(struct inode *inode)
1619 1676 {
... ... @@ -1621,6 +1678,7 @@
1621 1678 DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
1622 1679 wq = bit_waitqueue(&inode->i_state, __I_NEW);
1623 1680 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
  1681 + spin_unlock(&inode->i_lock);
1624 1682 spin_unlock(&inode_lock);
1625 1683 schedule();
1626 1684 finish_wait(wq, &wait.wait);
fs/notify/inode_mark.c
... ... @@ -254,8 +254,11 @@
254 254 * I_WILL_FREE, or I_NEW which is fine because by that point
255 255 * the inode cannot have any associated watches.
256 256 */
257   - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
  257 + spin_lock(&inode->i_lock);
  258 + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
  259 + spin_unlock(&inode->i_lock);
258 260 continue;
  261 + }
259 262  
260 263 /*
261 264 * If i_count is zero, the inode cannot have any watches and
262 265  
... ... @@ -263,8 +266,10 @@
263 266 * evict all inodes with zero i_count from icache which is
264 267 * unnecessarily violent and may in fact be illegal to do.
265 268 */
266   - if (!atomic_read(&inode->i_count))
  269 + if (!atomic_read(&inode->i_count)) {
  270 + spin_unlock(&inode->i_lock);
267 271 continue;
  272 + }
268 273  
269 274 need_iput_tmp = need_iput;
270 275 need_iput = NULL;
271 276  
... ... @@ -274,13 +279,17 @@
274 279 __iget(inode);
275 280 else
276 281 need_iput_tmp = NULL;
  282 + spin_unlock(&inode->i_lock);
277 283  
278 284 /* In case the dropping of a reference would nuke next_i. */
279 285 if ((&next_i->i_sb_list != list) &&
280   - atomic_read(&next_i->i_count) &&
281   - !(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
282   - __iget(next_i);
283   - need_iput = next_i;
  286 + atomic_read(&next_i->i_count)) {
  287 + spin_lock(&next_i->i_lock);
  288 + if (!(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
  289 + __iget(next_i);
  290 + need_iput = next_i;
  291 + }
  292 + spin_unlock(&next_i->i_lock);
284 293 }
285 294  
286 295 /*
fs/quota/dquot.c
... ... @@ -902,18 +902,19 @@
902 902  
903 903 spin_lock(&inode_lock);
904 904 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
905   - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
  905 + spin_lock(&inode->i_lock);
  906 + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
  907 + !atomic_read(&inode->i_writecount) ||
  908 + !dqinit_needed(inode, type)) {
  909 + spin_unlock(&inode->i_lock);
906 910 continue;
  911 + }
907 912 #ifdef CONFIG_QUOTA_DEBUG
908 913 if (unlikely(inode_get_rsv_space(inode) > 0))
909 914 reserved = 1;
910 915 #endif
911   - if (!atomic_read(&inode->i_writecount))
912   - continue;
913   - if (!dqinit_needed(inode, type))
914   - continue;
915   -
916 916 __iget(inode);
  917 + spin_unlock(&inode->i_lock);
917 918 spin_unlock(&inode_lock);
918 919  
919 920 iput(old_inode);
include/linux/fs.h
... ... @@ -1647,7 +1647,7 @@
1647 1647 };
1648 1648  
1649 1649 /*
1650   - * Inode state bits. Protected by inode_lock.
  1650 + * Inode state bits. Protected by inode->i_lock
1651 1651 *
1652 1652 * Three bits determine the dirty state of the inode, I_DIRTY_SYNC,
1653 1653 * I_DIRTY_DATASYNC and I_DIRTY_PAGES.
include/linux/quotaops.h
... ... @@ -277,7 +277,7 @@
277 277 /*
278 278 * Mark inode fully dirty. Since we are allocating blocks, inode
279 279 * would become fully dirty soon anyway and it reportedly
280   - * reduces inode_lock contention.
  280 + * reduces lock contention.
281 281 */
282 282 mark_inode_dirty(inode);
283 283 }
mm/filemap.c
... ... @@ -99,7 +99,9 @@
99 99 * ->private_lock (page_remove_rmap->set_page_dirty)
100 100 * ->tree_lock (page_remove_rmap->set_page_dirty)
101 101 * ->inode_lock (page_remove_rmap->set_page_dirty)
  102 + * ->inode->i_lock (page_remove_rmap->set_page_dirty)
102 103 * ->inode_lock (zap_pte_range->set_page_dirty)
  104 + * ->inode->i_lock (zap_pte_range->set_page_dirty)
103 105 * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
104 106 *
105 107 * (code doesn't rely on that order, so you could switch it around)
mm/rmap.c
... ... @@ -32,6 +32,7 @@
32 32 * mmlist_lock (in mmput, drain_mmlist and others)
33 33 * mapping->private_lock (in __set_page_dirty_buffers)
34 34 * inode_lock (in set_page_dirty's __mark_inode_dirty)
  35 + * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
35 36 * sb_lock (within inode_lock in fs/fs-writeback.c)
36 37 * mapping->tree_lock (widely used, in set_page_dirty,
37 38 * in arch-dependent flush_dcache_mmap_lock,