Commit bd681513fa6f2ff29aa391f01e413a2d1c59fd77

Authored by Chris Mason
1 parent 81317fdedd

Btrfs: switch the btrfs tree locks to reader/writer

The btrfs metadata btree is the source of significant
lock contention, especially in the root node.   This
commit changes our locking to use a reader/writer
lock.

The lock is built on top of rw spinlocks, and it
extends the lock tracking to remember whether we hold a
read lock or a write lock when we go blocking.  Atomics
count the number of blocking readers or writers at any
given time.
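
For illustration only, here is a minimal user-space sketch of that scheme,
not the btrfs code itself: a pthread rwlock stands in for the rw spinlock,
C11 atomics play the role of the blocking counters, and every name below is
invented for the sketch.

/*
 * Toy model of a reader/writer lock whose holders can "go blocking":
 * they drop the lock word but bump an atomic counter so new lockers
 * know someone still logically owns the lock and may be sleeping.
 */
#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>

struct demo_eb_lock {
	pthread_rwlock_t lock;		/* stands in for the rw spinlock */
	atomic_int blocking_readers;	/* readers that dropped the lock word */
	atomic_int blocking_writers;	/* writers that dropped the lock word */
};

/* take a spinning read lock, waiting out any blocking writer */
static void demo_tree_read_lock(struct demo_eb_lock *l)
{
	for (;;) {
		while (atomic_load(&l->blocking_writers))
			sched_yield();	/* the kernel sleeps on a waitqueue instead */
		pthread_rwlock_rdlock(&l->lock);
		if (atomic_load(&l->blocking_writers) == 0)
			return;
		/* a writer went blocking after our check; drop and retry */
		pthread_rwlock_unlock(&l->lock);
	}
}

/* convert a held spinning read lock into a blocking one */
static void demo_set_read_blocking(struct demo_eb_lock *l)
{
	atomic_fetch_add(&l->blocking_readers, 1);
	pthread_rwlock_unlock(&l->lock);
	/* the caller may now schedule while still logically a reader */
}

/* drop a blocking read lock */
static void demo_read_unlock_blocking(struct demo_eb_lock *l)
{
	atomic_fetch_sub(&l->blocking_readers, 1);
	/* the kernel wakes waiting writers here so they can make progress */
}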

This commit removes all of the adaptive spinning from the old code
and relies only on the spinning/blocking hints inside btrfs
to decide when to keep spinning.
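
As a hedged sketch of that hint-driven pattern (again plain pthreads with
invented names, not the kernel code): callers first try the lock without
sleeping, and only after marking the rest of their path blocking do they
take the lock in a context where scheduling is allowed.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct demo_lock {
	pthread_rwlock_t lock;
	atomic_int blocking_writers;
};

/* rough analogue of btrfs_try_tree_read_lock(): never waits */
static bool demo_try_read_lock(struct demo_lock *l)
{
	if (atomic_load(&l->blocking_writers))
		return false;
	if (pthread_rwlock_tryrdlock(&l->lock) != 0)
		return false;
	if (atomic_load(&l->blocking_writers)) {
		/* a writer went blocking underneath us; back off */
		pthread_rwlock_unlock(&l->lock);
		return false;
	}
	return true;
}

static void demo_lock_next_node(struct demo_lock *l)
{
	if (demo_try_read_lock(l))
		return;		/* fast path: we stayed spinning */
	/*
	 * Slow path.  In btrfs this is btrfs_set_path_blocking() followed
	 * by btrfs_tree_read_lock() and btrfs_clear_path_blocking(): the
	 * held path is made safe to sleep under before we block here.
	 */
	pthread_rwlock_rdlock(&l->lock);
}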

In read-heavy workloads this is dramatically faster.  In write-heavy
workloads we're still faster because there is less contention
on the root node lock.

We suffer slightly in dbench because we schedule more often
during write locks, but all other benchmarks so far are improved.

Signed-off-by: Chris Mason <chris.mason@oracle.com>

Showing 9 changed files with 431 additions and 218 deletions

fs/btrfs/ctree.c
... ... @@ -54,8 +54,13 @@
54 54 {
55 55 int i;
56 56 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
57   - if (p->nodes[i] && p->locks[i])
58   - btrfs_set_lock_blocking(p->nodes[i]);
  57 + if (!p->nodes[i] || !p->locks[i])
  58 + continue;
  59 + btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
  60 + if (p->locks[i] == BTRFS_READ_LOCK)
  61 + p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
  62 + else if (p->locks[i] == BTRFS_WRITE_LOCK)
  63 + p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
59 64 }
60 65 }
61 66  
... ... @@ -68,7 +73,7 @@
68 73 * for held
69 74 */
70 75 noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
71   - struct extent_buffer *held)
  76 + struct extent_buffer *held, int held_rw)
72 77 {
73 78 int i;
... ... @@ -79,19 +84,29 @@
79 84 * really sure by forcing the path to blocking before we clear
80 85 * the path blocking.
81 86 */
82   - if (held)
83   - btrfs_set_lock_blocking(held);
  87 + if (held) {
  88 + btrfs_set_lock_blocking_rw(held, held_rw);
  89 + if (held_rw == BTRFS_WRITE_LOCK)
  90 + held_rw = BTRFS_WRITE_LOCK_BLOCKING;
  91 + else if (held_rw == BTRFS_READ_LOCK)
  92 + held_rw = BTRFS_READ_LOCK_BLOCKING;
  93 + }
84 94 btrfs_set_path_blocking(p);
85 95 #endif
86 96  
87 97 for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
88   - if (p->nodes[i] && p->locks[i])
89   - btrfs_clear_lock_blocking(p->nodes[i]);
  98 + if (p->nodes[i] && p->locks[i]) {
  99 + btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
  100 + if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
  101 + p->locks[i] = BTRFS_WRITE_LOCK;
  102 + else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
  103 + p->locks[i] = BTRFS_READ_LOCK;
  104 + }
90 105 }
91 106  
92 107 #ifdef CONFIG_DEBUG_LOCK_ALLOC
93 108 if (held)
94   - btrfs_clear_lock_blocking(held);
  109 + btrfs_clear_lock_blocking_rw(held, held_rw);
95 110 #endif
96 111 }
97 112  
... ... @@ -119,7 +134,7 @@
119 134 if (!p->nodes[i])
120 135 continue;
121 136 if (p->locks[i]) {
122   - btrfs_tree_unlock(p->nodes[i]);
  137 + btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
123 138 p->locks[i] = 0;
124 139 }
125 140 free_extent_buffer(p->nodes[i]);
... ... @@ -167,6 +182,25 @@
167 182 return eb;
168 183 }
169 184  
  185 +/* loop around taking references on and locking the root node of the
  186 + * tree until you end up with a lock on the root. A locked buffer
  187 + * is returned, with a reference held.
  188 + */
  189 +struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
  190 +{
  191 + struct extent_buffer *eb;
  192 +
  193 + while (1) {
  194 + eb = btrfs_root_node(root);
  195 + btrfs_tree_read_lock(eb);
  196 + if (eb == root->node)
  197 + break;
  198 + btrfs_tree_read_unlock(eb);
  199 + free_extent_buffer(eb);
  200 + }
  201 + return eb;
  202 +}
  203 +
170 204 /* cowonly root (everything not a reference counted cow subvolume), just get
171 205 * put onto a simple dirty list. transaction.c walks this to make sure they
172 206 * get properly updated on disk.
... ... @@ -862,7 +896,8 @@
862 896  
863 897 mid = path->nodes[level];
864 898  
865   - WARN_ON(!path->locks[level]);
  899 + WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
  900 + path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
866 901 WARN_ON(btrfs_header_generation(mid) != trans->transid);
867 902  
868 903 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
... ... @@ -1360,7 +1395,7 @@
1360 1395  
1361 1396 t = path->nodes[i];
1362 1397 if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
1363   - btrfs_tree_unlock(t);
  1398 + btrfs_tree_unlock_rw(t, path->locks[i]);
1364 1399 path->locks[i] = 0;
1365 1400 }
1366 1401 }
... ... @@ -1387,7 +1422,7 @@
1387 1422 continue;
1388 1423 if (!path->locks[i])
1389 1424 continue;
1390   - btrfs_tree_unlock(path->nodes[i]);
  1425 + btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
1391 1426 path->locks[i] = 0;
1392 1427 }
1393 1428 }
... ... @@ -1436,6 +1471,8 @@
1436 1471 * we can trust our generation number
1437 1472 */
1438 1473 free_extent_buffer(tmp);
  1474 + btrfs_set_path_blocking(p);
  1475 +
1439 1476 tmp = read_tree_block(root, blocknr, blocksize, gen);
1440 1477 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1441 1478 *eb_ret = tmp;
... ... @@ -1491,20 +1528,27 @@
1491 1528 static int
1492 1529 setup_nodes_for_search(struct btrfs_trans_handle *trans,
1493 1530 struct btrfs_root *root, struct btrfs_path *p,
1494   - struct extent_buffer *b, int level, int ins_len)
  1531 + struct extent_buffer *b, int level, int ins_len,
  1532 + int *write_lock_level)
1495 1533 {
1496 1534 int ret;
1497 1535 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
1498 1536 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1499 1537 int sret;
1500 1538  
  1539 + if (*write_lock_level < level + 1) {
  1540 + *write_lock_level = level + 1;
  1541 + btrfs_release_path(p);
  1542 + goto again;
  1543 + }
  1544 +
1501 1545 sret = reada_for_balance(root, p, level);
1502 1546 if (sret)
1503 1547 goto again;
1504 1548  
1505 1549 btrfs_set_path_blocking(p);
1506 1550 sret = split_node(trans, root, p, level);
1507   - btrfs_clear_path_blocking(p, NULL);
  1551 + btrfs_clear_path_blocking(p, NULL, 0);
1508 1552  
1509 1553 BUG_ON(sret > 0);
1510 1554 if (sret) {
1511 1555  
... ... @@ -1516,13 +1560,19 @@
1516 1560 BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
1517 1561 int sret;
1518 1562  
  1563 + if (*write_lock_level < level + 1) {
  1564 + *write_lock_level = level + 1;
  1565 + btrfs_release_path(p);
  1566 + goto again;
  1567 + }
  1568 +
1519 1569 sret = reada_for_balance(root, p, level);
1520 1570 if (sret)
1521 1571 goto again;
1522 1572  
1523 1573 btrfs_set_path_blocking(p);
1524 1574 sret = balance_level(trans, root, p, level);
1525   - btrfs_clear_path_blocking(p, NULL);
  1575 + btrfs_clear_path_blocking(p, NULL, 0);
1526 1576  
1527 1577 if (sret) {
1528 1578 ret = sret;
... ... @@ -1566,27 +1616,78 @@
1566 1616 int err;
1567 1617 int level;
1568 1618 int lowest_unlock = 1;
  1619 + int root_lock;
  1620 + /* everything at write_lock_level or lower must be write locked */
  1621 + int write_lock_level = 0;
1569 1622 u8 lowest_level = 0;
1570 1623  
1571 1624 lowest_level = p->lowest_level;
1572 1625 WARN_ON(lowest_level && ins_len > 0);
1573 1626 WARN_ON(p->nodes[0] != NULL);
1574 1627  
1575   - if (ins_len < 0)
  1628 + if (ins_len < 0) {
1576 1629 lowest_unlock = 2;
1577 1630  
  1631 + /* when we are removing items, we might have to go up to level
  1632 + * two as we update tree pointers Make sure we keep write
  1633 + * for those levels as well
  1634 + */
  1635 + write_lock_level = 2;
  1636 + } else if (ins_len > 0) {
  1637 + /*
  1638 + * for inserting items, make sure we have a write lock on
  1639 + * level 1 so we can update keys
  1640 + */
  1641 + write_lock_level = 1;
  1642 + }
  1643 +
  1644 + if (!cow)
  1645 + write_lock_level = -1;
  1646 +
  1647 + if (cow && (p->keep_locks || p->lowest_level))
  1648 + write_lock_level = BTRFS_MAX_LEVEL;
  1649 +
1578 1650 again:
  1651 + /*
  1652 + * we try very hard to do read locks on the root
  1653 + */
  1654 + root_lock = BTRFS_READ_LOCK;
  1655 + level = 0;
1579 1656 if (p->search_commit_root) {
  1657 + /*
  1658 + * the commit roots are read only
  1659 + * so we always do read locks
  1660 + */
1580 1661 b = root->commit_root;
1581 1662 extent_buffer_get(b);
  1663 + level = btrfs_header_level(b);
1582 1664 if (!p->skip_locking)
1583   - btrfs_tree_lock(b);
  1665 + btrfs_tree_read_lock(b);
1584 1666 } else {
1585   - if (p->skip_locking)
  1667 + if (p->skip_locking) {
1586 1668 b = btrfs_root_node(root);
1587   - else
1588   - b = btrfs_lock_root_node(root);
  1669 + level = btrfs_header_level(b);
  1670 + } else {
  1671 + /* we don't know the level of the root node
  1672 + * until we actually have it read locked
  1673 + */
  1674 + b = btrfs_read_lock_root_node(root);
  1675 + level = btrfs_header_level(b);
  1676 + if (level <= write_lock_level) {
  1677 + /* whoops, must trade for write lock */
  1678 + btrfs_tree_read_unlock(b);
  1679 + free_extent_buffer(b);
  1680 + b = btrfs_lock_root_node(root);
  1681 + root_lock = BTRFS_WRITE_LOCK;
  1682 +
  1683 + /* the level might have changed, check again */
  1684 + level = btrfs_header_level(b);
  1685 + }
  1686 + }
1589 1687 }
  1688 + p->nodes[level] = b;
  1689 + if (!p->skip_locking)
  1690 + p->locks[level] = root_lock;
1590 1691  
1591 1692 while (b) {
1592 1693 level = btrfs_header_level(b);
... ... @@ -1595,10 +1696,6 @@
1595 1696 * setup the path here so we can release it under lock
1596 1697 * contention with the cow code
1597 1698 */
1598   - p->nodes[level] = b;
1599   - if (!p->skip_locking)
1600   - p->locks[level] = 1;
1601   -
1602 1699 if (cow) {
1603 1700 /*
1604 1701 * if we don't really need to cow this block
... ... @@ -1610,6 +1707,16 @@
1610 1707  
1611 1708 btrfs_set_path_blocking(p);
1612 1709  
  1710 + /*
  1711 + * must have write locks on this node and the
  1712 + * parent
  1713 + */
  1714 + if (level + 1 > write_lock_level) {
  1715 + write_lock_level = level + 1;
  1716 + btrfs_release_path(p);
  1717 + goto again;
  1718 + }
  1719 +
1613 1720 err = btrfs_cow_block(trans, root, b,
1614 1721 p->nodes[level + 1],
1615 1722 p->slots[level + 1], &b);
1616 1723  
... ... @@ -1622,11 +1729,8 @@
1622 1729 BUG_ON(!cow && ins_len);
1623 1730  
1624 1731 p->nodes[level] = b;
1625   - if (!p->skip_locking)
1626   - p->locks[level] = 1;
  1732 + btrfs_clear_path_blocking(p, NULL, 0);
1627 1733  
1628   - btrfs_clear_path_blocking(p, NULL);
1629   -
1630 1734 /*
1631 1735 * we have a lock on b and as long as we aren't changing
1632 1736 * the tree, there is no way to for the items in b to change.
... ... @@ -1651,7 +1755,7 @@
1651 1755 }
1652 1756 p->slots[level] = slot;
1653 1757 err = setup_nodes_for_search(trans, root, p, b, level,
1654   - ins_len);
  1758 + ins_len, &write_lock_level);
1655 1759 if (err == -EAGAIN)
1656 1760 goto again;
1657 1761 if (err) {
... ... @@ -1661,6 +1765,19 @@
1661 1765 b = p->nodes[level];
1662 1766 slot = p->slots[level];
1663 1767  
  1768 + /*
  1769 + * slot 0 is special, if we change the key
  1770 + * we have to update the parent pointer
  1771 + * which means we must have a write lock
  1772 + * on the parent
  1773 + */
  1774 + if (slot == 0 && cow &&
  1775 + write_lock_level < level + 1) {
  1776 + write_lock_level = level + 1;
  1777 + btrfs_release_path(p);
  1778 + goto again;
  1779 + }
  1780 +
1664 1781 unlock_up(p, level, lowest_unlock);
1665 1782  
1666 1783 if (level == lowest_level) {
... ... @@ -1679,23 +1796,42 @@
1679 1796 }
1680 1797  
1681 1798 if (!p->skip_locking) {
1682   - btrfs_clear_path_blocking(p, NULL);
1683   - err = btrfs_try_spin_lock(b);
1684   -
1685   - if (!err) {
1686   - btrfs_set_path_blocking(p);
1687   - btrfs_tree_lock(b);
1688   - btrfs_clear_path_blocking(p, b);
  1799 + level = btrfs_header_level(b);
  1800 + if (level <= write_lock_level) {
  1801 + err = btrfs_try_tree_write_lock(b);
  1802 + if (!err) {
  1803 + btrfs_set_path_blocking(p);
  1804 + btrfs_tree_lock(b);
  1805 + btrfs_clear_path_blocking(p, b,
  1806 + BTRFS_WRITE_LOCK);
  1807 + }
  1808 + p->locks[level] = BTRFS_WRITE_LOCK;
  1809 + } else {
  1810 + err = btrfs_try_tree_read_lock(b);
  1811 + if (!err) {
  1812 + btrfs_set_path_blocking(p);
  1813 + btrfs_tree_read_lock(b);
  1814 + btrfs_clear_path_blocking(p, b,
  1815 + BTRFS_READ_LOCK);
  1816 + }
  1817 + p->locks[level] = BTRFS_READ_LOCK;
1689 1818 }
  1819 + p->nodes[level] = b;
1690 1820 }
1691 1821 } else {
1692 1822 p->slots[level] = slot;
1693 1823 if (ins_len > 0 &&
1694 1824 btrfs_leaf_free_space(root, b) < ins_len) {
  1825 + if (write_lock_level < 1) {
  1826 + write_lock_level = 1;
  1827 + btrfs_release_path(p);
  1828 + goto again;
  1829 + }
  1830 +
1695 1831 btrfs_set_path_blocking(p);
1696 1832 err = split_leaf(trans, root, key,
1697 1833 p, ins_len, ret == 0);
1698   - btrfs_clear_path_blocking(p, NULL);
  1834 + btrfs_clear_path_blocking(p, NULL, 0);
1699 1835  
1700 1836 BUG_ON(err > 0);
1701 1837 if (err) {
... ... @@ -1976,7 +2112,7 @@
1976 2112 add_root_to_dirty_list(root);
1977 2113 extent_buffer_get(c);
1978 2114 path->nodes[level] = c;
1979   - path->locks[level] = 1;
  2115 + path->locks[level] = BTRFS_WRITE_LOCK;
1980 2116 path->slots[level] = 0;
1981 2117 return 0;
1982 2118 }
1983 2119  
... ... @@ -3819,11 +3955,11 @@
3819 3955  
3820 3956 WARN_ON(!path->keep_locks);
3821 3957 again:
3822   - cur = btrfs_lock_root_node(root);
  3958 + cur = btrfs_read_lock_root_node(root);
3823 3959 level = btrfs_header_level(cur);
3824 3960 WARN_ON(path->nodes[level]);
3825 3961 path->nodes[level] = cur;
3826   - path->locks[level] = 1;
  3962 + path->locks[level] = BTRFS_READ_LOCK;
3827 3963  
3828 3964 if (btrfs_header_generation(cur) < min_trans) {
3829 3965 ret = 1;
3830 3966  
3831 3967  
... ... @@ -3913,12 +4049,12 @@
3913 4049 cur = read_node_slot(root, cur, slot);
3914 4050 BUG_ON(!cur);
3915 4051  
3916   - btrfs_tree_lock(cur);
  4052 + btrfs_tree_read_lock(cur);
3917 4053  
3918   - path->locks[level - 1] = 1;
  4054 + path->locks[level - 1] = BTRFS_READ_LOCK;
3919 4055 path->nodes[level - 1] = cur;
3920 4056 unlock_up(path, level, 1);
3921   - btrfs_clear_path_blocking(path, NULL);
  4057 + btrfs_clear_path_blocking(path, NULL, 0);
3922 4058 }
3923 4059 out:
3924 4060 if (ret == 0)
... ... @@ -4034,6 +4170,7 @@
4034 4170 int ret;
4035 4171 int old_spinning = path->leave_spinning;
4036 4172 int force_blocking = 0;
  4173 + int next_rw_lock = 0;
4037 4174  
4038 4175 nritems = btrfs_header_nritems(path->nodes[0]);
4039 4176 if (nritems == 0)
... ... @@ -4051,6 +4188,7 @@
4051 4188 again:
4052 4189 level = 1;
4053 4190 next = NULL;
  4191 + next_rw_lock = 0;
4054 4192 btrfs_release_path(path);
4055 4193  
4056 4194 path->keep_locks = 1;
4057 4195  
... ... @@ -4096,11 +4234,12 @@
4096 4234 }
4097 4235  
4098 4236 if (next) {
4099   - btrfs_tree_unlock(next);
  4237 + btrfs_tree_unlock_rw(next, next_rw_lock);
4100 4238 free_extent_buffer(next);
4101 4239 }
4102 4240  
4103 4241 next = c;
  4242 + next_rw_lock = path->locks[level];
4104 4243 ret = read_block_for_search(NULL, root, path, &next, level,
4105 4244 slot, &key);
4106 4245 if (ret == -EAGAIN)
... ... @@ -4112,15 +4251,22 @@
4112 4251 }
4113 4252  
4114 4253 if (!path->skip_locking) {
4115   - ret = btrfs_try_spin_lock(next);
  4254 + ret = btrfs_try_tree_read_lock(next);
4116 4255 if (!ret) {
4117 4256 btrfs_set_path_blocking(path);
4118   - btrfs_tree_lock(next);
4119   - if (!force_blocking)
4120   - btrfs_clear_path_blocking(path, next);
  4257 + btrfs_tree_read_lock(next);
  4258 + if (!force_blocking) {
  4259 + btrfs_clear_path_blocking(path, next,
  4260 + BTRFS_READ_LOCK);
  4261 + }
4121 4262 }
4122   - if (force_blocking)
4123   - btrfs_set_lock_blocking(next);
  4263 + if (force_blocking) {
  4264 + btrfs_set_lock_blocking_rw(next,
  4265 + BTRFS_READ_LOCK);
  4266 + next_rw_lock = BTRFS_READ_LOCK_BLOCKING;
  4267 + } else {
  4268 + next_rw_lock = BTRFS_READ_LOCK;
  4269 + }
4124 4270 }
4125 4271 break;
4126 4272 }
4127 4273  
... ... @@ -4129,14 +4275,13 @@
4129 4275 level--;
4130 4276 c = path->nodes[level];
4131 4277 if (path->locks[level])
4132   - btrfs_tree_unlock(c);
  4278 + btrfs_tree_unlock_rw(c, path->locks[level]);
4133 4279  
4134 4280 free_extent_buffer(c);
4135 4281 path->nodes[level] = next;
4136 4282 path->slots[level] = 0;
4137 4283 if (!path->skip_locking)
4138   - path->locks[level] = 1;
4139   -
  4284 + path->locks[level] = next_rw_lock;
4140 4285 if (!level)
4141 4286 break;
4142 4287  
4143 4288  
4144 4289  
4145 4290  
... ... @@ -4151,16 +4296,21 @@
4151 4296 }
4152 4297  
4153 4298 if (!path->skip_locking) {
4154   - btrfs_assert_tree_locked(path->nodes[level]);
4155   - ret = btrfs_try_spin_lock(next);
  4299 + ret = btrfs_try_tree_read_lock(next);
4156 4300 if (!ret) {
4157 4301 btrfs_set_path_blocking(path);
4158   - btrfs_tree_lock(next);
  4302 + btrfs_tree_read_lock(next);
4159 4303 if (!force_blocking)
4160   - btrfs_clear_path_blocking(path, next);
  4304 + btrfs_clear_path_blocking(path, next,
  4305 + BTRFS_READ_LOCK);
4161 4306 }
4162   - if (force_blocking)
4163   - btrfs_set_lock_blocking(next);
  4307 + if (force_blocking) {
  4308 + btrfs_set_lock_blocking_rw(next,
  4309 + BTRFS_READ_LOCK);
  4310 + next_rw_lock = BTRFS_READ_LOCK_BLOCKING;
  4311 + } else {
  4312 + next_rw_lock = BTRFS_READ_LOCK;
  4313 + }
4164 4314 }
4165 4315 }
4166 4316 ret = 0;
fs/btrfs/ctree.h
... ... @@ -2333,7 +2333,7 @@
2333 2333 void btrfs_free_path(struct btrfs_path *p);
2334 2334 void btrfs_set_path_blocking(struct btrfs_path *p);
2335 2335 void btrfs_clear_path_blocking(struct btrfs_path *p,
2336   - struct extent_buffer *held);
  2336 + struct extent_buffer *held, int held_rw);
2337 2337 void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
2338 2338  
2339 2339 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
fs/btrfs/delayed-inode.c
... ... @@ -735,7 +735,7 @@
735 735 }
736 736  
737 737 /* reset all the locked nodes in the patch to spinning locks. */
738   - btrfs_clear_path_blocking(path, NULL);
  738 + btrfs_clear_path_blocking(path, NULL, 0);
739 739  
740 740 /* insert the keys of the items */
741 741 ret = setup_items_for_insert(trans, root, path, keys, data_size,
fs/btrfs/extent-tree.c
... ... @@ -5912,7 +5912,7 @@
5912 5912 return 1;
5913 5913  
5914 5914 if (path->locks[level] && !wc->keep_locks) {
5915   - btrfs_tree_unlock(eb);
  5915 + btrfs_tree_unlock_rw(eb, path->locks[level]);
5916 5916 path->locks[level] = 0;
5917 5917 }
5918 5918 return 0;
... ... @@ -5936,7 +5936,7 @@
5936 5936 * keep the tree lock
5937 5937 */
5938 5938 if (path->locks[level] && level > 0) {
5939   - btrfs_tree_unlock(eb);
  5939 + btrfs_tree_unlock_rw(eb, path->locks[level]);
5940 5940 path->locks[level] = 0;
5941 5941 }
5942 5942 return 0;
... ... @@ -6049,7 +6049,7 @@
6049 6049 BUG_ON(level != btrfs_header_level(next));
6050 6050 path->nodes[level] = next;
6051 6051 path->slots[level] = 0;
6052   - path->locks[level] = 1;
  6052 + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6053 6053 wc->level = level;
6054 6054 if (wc->level == 1)
6055 6055 wc->reada_slot = 0;
... ... @@ -6120,7 +6120,7 @@
6120 6120 BUG_ON(level == 0);
6121 6121 btrfs_tree_lock(eb);
6122 6122 btrfs_set_lock_blocking(eb);
6123   - path->locks[level] = 1;
  6123 + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6124 6124  
6125 6125 ret = btrfs_lookup_extent_info(trans, root,
6126 6126 eb->start, eb->len,
... ... @@ -6129,8 +6129,7 @@
6129 6129 BUG_ON(ret);
6130 6130 BUG_ON(wc->refs[level] == 0);
6131 6131 if (wc->refs[level] == 1) {
6132   - btrfs_tree_unlock(eb);
6133   - path->locks[level] = 0;
  6132 + btrfs_tree_unlock_rw(eb, path->locks[level]);
6134 6133 return 1;
6135 6134 }
6136 6135 }
... ... @@ -6152,7 +6151,7 @@
6152 6151 btrfs_header_generation(eb) == trans->transid) {
6153 6152 btrfs_tree_lock(eb);
6154 6153 btrfs_set_lock_blocking(eb);
6155   - path->locks[level] = 1;
  6154 + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6156 6155 }
6157 6156 clean_tree_block(trans, root, eb);
6158 6157 }
... ... @@ -6231,7 +6230,8 @@
6231 6230 return 0;
6232 6231  
6233 6232 if (path->locks[level]) {
6234   - btrfs_tree_unlock(path->nodes[level]);
  6233 + btrfs_tree_unlock_rw(path->nodes[level],
  6234 + path->locks[level]);
6235 6235 path->locks[level] = 0;
6236 6236 }
6237 6237 free_extent_buffer(path->nodes[level]);
... ... @@ -6283,7 +6283,7 @@
6283 6283 path->nodes[level] = btrfs_lock_root_node(root);
6284 6284 btrfs_set_lock_blocking(path->nodes[level]);
6285 6285 path->slots[level] = 0;
6286   - path->locks[level] = 1;
  6286 + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6287 6287 memset(&wc->update_progress, 0,
6288 6288 sizeof(wc->update_progress));
6289 6289 } else {
... ... @@ -6451,7 +6451,7 @@
6451 6451 level = btrfs_header_level(node);
6452 6452 path->nodes[level] = node;
6453 6453 path->slots[level] = 0;
6454   - path->locks[level] = 1;
  6454 + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6455 6455  
6456 6456 wc->refs[parent_level] = 1;
6457 6457 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
fs/btrfs/extent_io.c
... ... @@ -3017,8 +3017,15 @@
3017 3017 return NULL;
3018 3018 eb->start = start;
3019 3019 eb->len = len;
3020   - spin_lock_init(&eb->lock);
3021   - init_waitqueue_head(&eb->lock_wq);
  3020 + rwlock_init(&eb->lock);
  3021 + atomic_set(&eb->write_locks, 0);
  3022 + atomic_set(&eb->read_locks, 0);
  3023 + atomic_set(&eb->blocking_readers, 0);
  3024 + atomic_set(&eb->blocking_writers, 0);
  3025 + atomic_set(&eb->spinning_readers, 0);
  3026 + atomic_set(&eb->spinning_writers, 0);
  3027 + init_waitqueue_head(&eb->write_lock_wq);
  3028 + init_waitqueue_head(&eb->read_lock_wq);
3022 3029  
3023 3030 #if LEAK_DEBUG
3024 3031 spin_lock_irqsave(&leak_lock, flags);
fs/btrfs/extent_io.h
... ... @@ -128,14 +128,26 @@
128 128 struct rcu_head rcu_head;
129 129 atomic_t refs;
130 130  
131   - /* the spinlock is used to protect most operations */
132   - spinlock_t lock;
  131 + /* count of read lock holders on the extent buffer */
  132 + atomic_t write_locks;
  133 + atomic_t read_locks;
  134 + atomic_t blocking_writers;
  135 + atomic_t blocking_readers;
  136 + atomic_t spinning_readers;
  137 + atomic_t spinning_writers;
133 138  
134   - /*
135   - * when we keep the lock held while blocking, waiters go onto
136   - * the wq
  139 + /* protects write locks */
  140 + rwlock_t lock;
  141 +
  142 + /* readers use lock_wq while they wait for the write
  143 + * lock holders to unlock
137 144 */
138   - wait_queue_head_t lock_wq;
  145 + wait_queue_head_t write_lock_wq;
  146 +
  147 + /* writers use read_lock_wq while they wait for readers
  148 + * to unlock
  149 + */
  150 + wait_queue_head_t read_lock_wq;
139 151 };
140 152  
141 153 static inline void extent_set_compress_type(unsigned long *bio_flags,
fs/btrfs/locking.c
... ... @@ -24,186 +24,198 @@
24 24 #include "extent_io.h"
25 25 #include "locking.h"
26 26  
27   -static inline void spin_nested(struct extent_buffer *eb)
  27 +void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
  28 +
  29 +/*
  30 + * if we currently have a spinning reader or writer lock
  31 + * (indicated by the rw flag) this will bump the count
  32 + * of blocking holders and drop the spinlock.
  33 + */
  34 +void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
28 35 {
29   - spin_lock(&eb->lock);
  36 + if (rw == BTRFS_WRITE_LOCK) {
  37 + if (atomic_read(&eb->blocking_writers) == 0) {
  38 + WARN_ON(atomic_read(&eb->spinning_writers) != 1);
  39 + atomic_dec(&eb->spinning_writers);
  40 + btrfs_assert_tree_locked(eb);
  41 + atomic_inc(&eb->blocking_writers);
  42 + write_unlock(&eb->lock);
  43 + }
  44 + } else if (rw == BTRFS_READ_LOCK) {
  45 + btrfs_assert_tree_read_locked(eb);
  46 + atomic_inc(&eb->blocking_readers);
  47 + WARN_ON(atomic_read(&eb->spinning_readers) == 0);
  48 + atomic_dec(&eb->spinning_readers);
  49 + read_unlock(&eb->lock);
  50 + }
  51 + return;
30 52 }
31 53  
32 54 /*
33   - * Setting a lock to blocking will drop the spinlock and set the
34   - * flag that forces other procs who want the lock to wait. After
35   - * this you can safely schedule with the lock held.
  55 + * if we currently have a blocking lock, take the spinlock
  56 + * and drop our blocking count
36 57 */
37   -void btrfs_set_lock_blocking(struct extent_buffer *eb)
  58 +void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
38 59 {
39   - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
40   - set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
41   - spin_unlock(&eb->lock);
  60 + if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
  61 + BUG_ON(atomic_read(&eb->blocking_writers) != 1);
  62 + write_lock(&eb->lock);
  63 + WARN_ON(atomic_read(&eb->spinning_writers));
  64 + atomic_inc(&eb->spinning_writers);
  65 + if (atomic_dec_and_test(&eb->blocking_writers))
  66 + wake_up(&eb->write_lock_wq);
  67 + } else if (rw == BTRFS_READ_LOCK_BLOCKING) {
  68 + BUG_ON(atomic_read(&eb->blocking_readers) == 0);
  69 + read_lock(&eb->lock);
  70 + atomic_inc(&eb->spinning_readers);
  71 + if (atomic_dec_and_test(&eb->blocking_readers))
  72 + wake_up(&eb->read_lock_wq);
42 73 }
43   - /* exit with the spin lock released and the bit set */
  74 + return;
44 75 }
45 76  
46 77 /*
47   - * clearing the blocking flag will take the spinlock again.
48   - * After this you can't safely schedule
  78 + * take a spinning read lock. This will wait for any blocking
  79 + * writers
49 80 */
50   -void btrfs_clear_lock_blocking(struct extent_buffer *eb)
  81 +void btrfs_tree_read_lock(struct extent_buffer *eb)
51 82 {
52   - if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
53   - spin_nested(eb);
54   - clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
55   - smp_mb__after_clear_bit();
  83 +again:
  84 + wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
  85 + read_lock(&eb->lock);
  86 + if (atomic_read(&eb->blocking_writers)) {
  87 + read_unlock(&eb->lock);
  88 + wait_event(eb->write_lock_wq,
  89 + atomic_read(&eb->blocking_writers) == 0);
  90 + goto again;
56 91 }
57   - /* exit with the spin lock held */
  92 + atomic_inc(&eb->read_locks);
  93 + atomic_inc(&eb->spinning_readers);
58 94 }
59 95  
60 96 /*
61   - * unfortunately, many of the places that currently set a lock to blocking
62   - * don't end up blocking for very long, and often they don't block
63   - * at all. For a dbench 50 run, if we don't spin on the blocking bit
64   - * at all, the context switch rate can jump up to 400,000/sec or more.
65   - *
66   - * So, we're still stuck with this crummy spin on the blocking bit,
67   - * at least until the most common causes of the short blocks
68   - * can be dealt with.
  97 + * returns 1 if we get the read lock and 0 if we don't
  98 + * this won't wait for blocking writers
69 99 */
70   -static int btrfs_spin_on_block(struct extent_buffer *eb)
  100 +int btrfs_try_tree_read_lock(struct extent_buffer *eb)
71 101 {
72   - int i;
  102 + if (atomic_read(&eb->blocking_writers))
  103 + return 0;
73 104  
74   - for (i = 0; i < 512; i++) {
75   - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
76   - return 1;
77   - if (need_resched())
78   - break;
79   - cpu_relax();
  105 + read_lock(&eb->lock);
  106 + if (atomic_read(&eb->blocking_writers)) {
  107 + read_unlock(&eb->lock);
  108 + return 0;
80 109 }
81   - return 0;
  110 + atomic_inc(&eb->read_locks);
  111 + atomic_inc(&eb->spinning_readers);
  112 + return 1;
82 113 }
83 114  
84 115 /*
85   - * This is somewhat different from trylock. It will take the
86   - * spinlock but if it finds the lock is set to blocking, it will
87   - * return without the lock held.
88   - *
89   - * returns 1 if it was able to take the lock and zero otherwise
90   - *
91   - * After this call, scheduling is not safe without first calling
92   - * btrfs_set_lock_blocking()
  116 + * returns 1 if we get the read lock and 0 if we don't
  117 + * this won't wait for blocking writers or readers
93 118 */
94   -int btrfs_try_spin_lock(struct extent_buffer *eb)
  119 +int btrfs_try_tree_write_lock(struct extent_buffer *eb)
95 120 {
96   - int i;
97   -
98   - if (btrfs_spin_on_block(eb)) {
99   - spin_nested(eb);
100   - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
101   - return 1;
102   - spin_unlock(&eb->lock);
  121 + if (atomic_read(&eb->blocking_writers) ||
  122 + atomic_read(&eb->blocking_readers))
  123 + return 0;
  124 + write_lock(&eb->lock);
  125 + if (atomic_read(&eb->blocking_writers) ||
  126 + atomic_read(&eb->blocking_readers)) {
  127 + write_unlock(&eb->lock);
  128 + return 0;
103 129 }
104   - /* spin for a bit on the BLOCKING flag */
105   - for (i = 0; i < 2; i++) {
106   - cpu_relax();
107   - if (!btrfs_spin_on_block(eb))
108   - break;
  130 + atomic_inc(&eb->write_locks);
  131 + atomic_inc(&eb->spinning_writers);
  132 + return 1;
  133 +}
109 134  
110   - spin_nested(eb);
111   - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
112   - return 1;
113   - spin_unlock(&eb->lock);
114   - }
115   - return 0;
  135 +/*
  136 + * drop a spinning read lock
  137 + */
  138 +void btrfs_tree_read_unlock(struct extent_buffer *eb)
  139 +{
  140 + btrfs_assert_tree_read_locked(eb);
  141 + WARN_ON(atomic_read(&eb->spinning_readers) == 0);
  142 + atomic_dec(&eb->spinning_readers);
  143 + atomic_dec(&eb->read_locks);
  144 + read_unlock(&eb->lock);
116 145 }
117 146  
118 147 /*
119   - * the autoremove wake function will return 0 if it tried to wake up
120   - * a process that was already awake, which means that process won't
121   - * count as an exclusive wakeup. The waitq code will continue waking
122   - * procs until it finds one that was actually sleeping.
123   - *
124   - * For btrfs, this isn't quite what we want. We want a single proc
125   - * to be notified that the lock is ready for taking. If that proc
126   - * already happen to be awake, great, it will loop around and try for
127   - * the lock.
128   - *
129   - * So, btrfs_wake_function always returns 1, even when the proc that we
130   - * tried to wake up was already awake.
  148 + * drop a blocking read lock
131 149 */
132   -static int btrfs_wake_function(wait_queue_t *wait, unsigned mode,
133   - int sync, void *key)
  150 +void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
134 151 {
135   - autoremove_wake_function(wait, mode, sync, key);
136   - return 1;
  152 + btrfs_assert_tree_read_locked(eb);
  153 + WARN_ON(atomic_read(&eb->blocking_readers) == 0);
  154 + if (atomic_dec_and_test(&eb->blocking_readers))
  155 + wake_up(&eb->read_lock_wq);
  156 + atomic_dec(&eb->read_locks);
137 157 }
138 158  
139 159 /*
140   - * returns with the extent buffer spinlocked.
141   - *
142   - * This will spin and/or wait as required to take the lock, and then
143   - * return with the spinlock held.
144   - *
145   - * After this call, scheduling is not safe without first calling
146   - * btrfs_set_lock_blocking()
  160 + * take a spinning write lock. This will wait for both
  161 + * blocking readers or writers
147 162 */
148 163 int btrfs_tree_lock(struct extent_buffer *eb)
149 164 {
150   - DEFINE_WAIT(wait);
151   - wait.func = btrfs_wake_function;
152   -
153   - if (!btrfs_spin_on_block(eb))
154   - goto sleep;
155   -
156   - while(1) {
157   - spin_nested(eb);
158   -
159   - /* nobody is blocking, exit with the spinlock held */
160   - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
161   - return 0;
162   -
163   - /*
164   - * we have the spinlock, but the real owner is blocking.
165   - * wait for them
166   - */
167   - spin_unlock(&eb->lock);
168   -
169   - /*
170   - * spin for a bit, and if the blocking flag goes away,
171   - * loop around
172   - */
173   - cpu_relax();
174   - if (btrfs_spin_on_block(eb))
175   - continue;
176   -sleep:
177   - prepare_to_wait_exclusive(&eb->lock_wq, &wait,
178   - TASK_UNINTERRUPTIBLE);
179   -
180   - if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
181   - schedule();
182   -
183   - finish_wait(&eb->lock_wq, &wait);
  165 +again:
  166 + wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
  167 + wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
  168 + write_lock(&eb->lock);
  169 + if (atomic_read(&eb->blocking_readers)) {
  170 + write_unlock(&eb->lock);
  171 + wait_event(eb->read_lock_wq,
  172 + atomic_read(&eb->blocking_readers) == 0);
  173 + goto again;
184 174 }
  175 + if (atomic_read(&eb->blocking_writers)) {
  176 + write_unlock(&eb->lock);
  177 + wait_event(eb->write_lock_wq,
  178 + atomic_read(&eb->blocking_writers) == 0);
  179 + goto again;
  180 + }
  181 + WARN_ON(atomic_read(&eb->spinning_writers));
  182 + atomic_inc(&eb->spinning_writers);
  183 + atomic_inc(&eb->write_locks);
185 184 return 0;
186 185 }
187 186  
  187 +/*
  188 + * drop a spinning or a blocking write lock.
  189 + */
188 190 int btrfs_tree_unlock(struct extent_buffer *eb)
189 191 {
190   - /*
191   - * if we were a blocking owner, we don't have the spinlock held
192   - * just clear the bit and look for waiters
193   - */
194   - if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
195   - smp_mb__after_clear_bit();
196   - else
197   - spin_unlock(&eb->lock);
  192 + int blockers = atomic_read(&eb->blocking_writers);
198 193  
199   - if (waitqueue_active(&eb->lock_wq))
200   - wake_up(&eb->lock_wq);
  194 + BUG_ON(blockers > 1);
  195 +
  196 + btrfs_assert_tree_locked(eb);
  197 + atomic_dec(&eb->write_locks);
  198 +
  199 + if (blockers) {
  200 + WARN_ON(atomic_read(&eb->spinning_writers));
  201 + atomic_dec(&eb->blocking_writers);
  202 + smp_wmb();
  203 + wake_up(&eb->write_lock_wq);
  204 + } else {
  205 + WARN_ON(atomic_read(&eb->spinning_writers) != 1);
  206 + atomic_dec(&eb->spinning_writers);
  207 + write_unlock(&eb->lock);
  208 + }
201 209 return 0;
202 210 }
203 211  
204 212 void btrfs_assert_tree_locked(struct extent_buffer *eb)
205 213 {
206   - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
207   - assert_spin_locked(&eb->lock);
  214 + BUG_ON(!atomic_read(&eb->write_locks));
  215 +}
  216 +
  217 +void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
  218 +{
  219 + BUG_ON(!atomic_read(&eb->read_locks));
208 220 }
fs/btrfs/locking.h
... ... @@ -19,12 +19,44 @@
19 19 #ifndef __BTRFS_LOCKING_
20 20 #define __BTRFS_LOCKING_
21 21  
  22 +#define BTRFS_WRITE_LOCK 1
  23 +#define BTRFS_READ_LOCK 2
  24 +#define BTRFS_WRITE_LOCK_BLOCKING 3
  25 +#define BTRFS_READ_LOCK_BLOCKING 4
  26 +
22 27 int btrfs_tree_lock(struct extent_buffer *eb);
23 28 int btrfs_tree_unlock(struct extent_buffer *eb);
24 29 int btrfs_try_spin_lock(struct extent_buffer *eb);
25 30  
26   -void btrfs_set_lock_blocking(struct extent_buffer *eb);
27   -void btrfs_clear_lock_blocking(struct extent_buffer *eb);
  31 +void btrfs_tree_read_lock(struct extent_buffer *eb);
  32 +void btrfs_tree_read_unlock(struct extent_buffer *eb);
  33 +void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
  34 +void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw);
  35 +void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw);
28 36 void btrfs_assert_tree_locked(struct extent_buffer *eb);
  37 +int btrfs_try_tree_read_lock(struct extent_buffer *eb);
  38 +int btrfs_try_tree_write_lock(struct extent_buffer *eb);
  39 +
  40 +static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
  41 +{
  42 + if (rw == BTRFS_WRITE_LOCK || rw == BTRFS_WRITE_LOCK_BLOCKING)
  43 + btrfs_tree_unlock(eb);
  44 + else if (rw == BTRFS_READ_LOCK_BLOCKING)
  45 + btrfs_tree_read_unlock_blocking(eb);
  46 + else if (rw == BTRFS_READ_LOCK)
  47 + btrfs_tree_read_unlock(eb);
  48 + else
  49 + BUG();
  50 +}
  51 +
  52 +static inline void btrfs_set_lock_blocking(struct extent_buffer *eb)
  53 +{
  54 + btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK);
  55 +}
  56 +
  57 +static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb)
  58 +{
  59 + btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING);
  60 +}
29 61 #endif
fs/btrfs/tree-log.c
... ... @@ -1730,8 +1730,8 @@
1730 1730 btrfs_read_buffer(next, ptr_gen);
1731 1731  
1732 1732 btrfs_tree_lock(next);
1733   - clean_tree_block(trans, root, next);
1734 1733 btrfs_set_lock_blocking(next);
  1734 + clean_tree_block(trans, root, next);
1735 1735 btrfs_wait_tree_block_writeback(next);
1736 1736 btrfs_tree_unlock(next);
1737 1737  
1738 1738  
... ... @@ -1796,8 +1796,8 @@
1796 1796 next = path->nodes[*level];
1797 1797  
1798 1798 btrfs_tree_lock(next);
1799   - clean_tree_block(trans, root, next);
1800 1799 btrfs_set_lock_blocking(next);
  1800 + clean_tree_block(trans, root, next);
1801 1801 btrfs_wait_tree_block_writeback(next);
1802 1802 btrfs_tree_unlock(next);
1803 1803  
1804 1804  
... ... @@ -1864,8 +1864,8 @@
1864 1864 next = path->nodes[orig_level];
1865 1865  
1866 1866 btrfs_tree_lock(next);
1867   - clean_tree_block(trans, log, next);
1868 1867 btrfs_set_lock_blocking(next);
  1868 + clean_tree_block(trans, log, next);
1869 1869 btrfs_wait_tree_block_writeback(next);
1870 1870 btrfs_tree_unlock(next);
1871 1871