Commit 12f8ad4b0533d9212cb1d5e58ed73d2170114785

Authored by Linus Torvalds
1 parent 4f988f152e

vfs: clean up __d_lookup_rcu() and dentry_cmp() interfaces

The calling conventions for __d_lookup_rcu() and dentry_cmp() are
annoying in different ways, and there is actually one single underlying
reason for both of the annoyances.

The fundamental reason is that we do the returned dentry sequence number
check inside __d_lookup_rcu() instead of doing it in the caller.  This
results in two annoyances:

 - __d_lookup_rcu() now not only needs to return the dentry and the
   sequence number that goes along with the lookup, it also needs to
   return the inode pointer that was validated by that sequence number
   check.

 - and because we did the sequence number check early (to validate the
   name pointer and length) we also couldn't just pass the dentry itself
   to dentry_cmp(), we had to pass the counted string that contained the
   name.

So that sequence number decision caused two separate ugly calling
conventions.

Both of these problems would be solved if we just did the sequence
number check in the caller instead.  There's only one caller, and that
caller already has to do the sequence number check for the parent
anyway, so just do that.

That allows us to stop returning the dentry->d_inode in that in-out
argument (pointer-to-pointer-to-inode), so we can make the inode
argument just a regular input inode pointer.  The caller can just load
the inode from dentry->d_inode, and then do the sequence number check
after that to make sure that it's synchronized with the name we looked
up.

And it allows us to just pass in the dentry to dentry_cmp(), which is
what all the callers really wanted.  Sure, dentry_cmp() has to be a bit
careful about the dentry (which is not stable during RCU lookup), but
that's actually very simple.

And now that dentry_cmp() can clearly see that the first string argument
is a dentry, we can use the direct word access for that, instead of the
careful unaligned zero-padding.  The dentry name is always properly
aligned, since it is a single path component that is either embedded
into the dentry itself, or was allocated with kmalloc() (see __d_alloc).

Finally, this also uninlines the nasty slow-case for dentry comparisons:
that one *does* need to do a sequence number check, since it will call
in to the low-level filesystems, and we want to give those a stable
inode pointer and path component length/start arguments.  Doing an extra
sequence check for that slow case is not a problem, though.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 3 changed files with 121 additions and 52 deletions Side-by-side Diff

... ... @@ -153,16 +153,33 @@
153 153 * In contrast, 'ct' and 'tcount' can be from a pathname, and do
154 154 * need the careful unaligned handling.
155 155 */
156   -static inline int dentry_cmp(const unsigned char *cs, size_t scount,
157   - const unsigned char *ct, size_t tcount)
  156 +static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount)
158 157 {
159 158 unsigned long a,b,mask;
  159 + const unsigned char *cs;
160 160  
161   - if (unlikely(scount != tcount))
  161 + if (unlikely(dentry->d_name.len != tcount))
162 162 return 1;
  163 + /*
  164 + * Be careful about RCU walk racing with rename:
  165 + * use ACCESS_ONCE to fetch the name pointer.
  166 + *
  167 + * NOTE! Even if a rename will mean that the length
  168 + * was not loaded atomically, we don't care. The
  169 + * RCU walk will check the sequence count eventually,
  170 + * and catch it. And we won't overrun the buffer,
  171 + * because we're reading the name pointer atomically,
  172 + * and a dentry name is guaranteed to be properly
  173 + * terminated with a NUL byte.
  174 + *
  175 + * End result: even if 'len' is wrong, we'll exit
  176 + * early because the data cannot match (there can
  177 + * be no NUL in the ct/tcount data)
  178 + */
  179 + cs = ACCESS_ONCE(dentry->d_name.name);
163 180  
164 181 for (;;) {
165   - a = load_unaligned_zeropad(cs);
  182 + a = *(unsigned long *)cs;
166 183 b = load_unaligned_zeropad(ct);
167 184 if (tcount < sizeof(unsigned long))
168 185 break;
169 186  
... ... @@ -180,10 +197,11 @@
180 197  
181 198 #else
182 199  
183   -static inline int dentry_cmp(const unsigned char *cs, size_t scount,
184   - const unsigned char *ct, size_t tcount)
  200 +static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount)
185 201 {
186   - if (scount != tcount)
  202 + const unsigned char *cs = dentry->d_name.name;
  203 +
  204 + if (dentry->d_name.len != tcount)
187 205 return 1;
188 206  
189 207 do {
190 208  
191 209  
... ... @@ -1439,18 +1457,16 @@
1439 1457 }
1440 1458  
1441 1459 list_for_each_entry(alias, &inode->i_dentry, d_alias) {
1442   - struct qstr *qstr = &alias->d_name;
1443   -
1444 1460 /*
1445 1461 * Don't need alias->d_lock here, because aliases with
1446 1462 * d_parent == entry->d_parent are not subject to name or
1447 1463 * parent changes, because the parent inode i_mutex is held.
1448 1464 */
1449   - if (qstr->hash != hash)
  1465 + if (alias->d_name.hash != hash)
1450 1466 continue;
1451 1467 if (alias->d_parent != entry->d_parent)
1452 1468 continue;
1453   - if (dentry_cmp(qstr->name, qstr->len, name, len))
  1469 + if (dentry_cmp(alias, name, len))
1454 1470 continue;
1455 1471 __dget(alias);
1456 1472 return alias;
... ... @@ -1727,6 +1743,48 @@
1727 1743 }
1728 1744 EXPORT_SYMBOL(d_add_ci);
1729 1745  
  1746 +/*
  1747 + * Do the slow-case of the dentry name compare.
  1748 + *
  1749 + * Unlike the dentry_cmp() function, we need to atomically
  1750 + * load the name, length and inode information, so that the
  1751 + * filesystem can rely on them, and can use the 'name' and
  1752 + * 'len' information without worrying about walking off the
  1753 + * end of memory etc.
  1754 + *
  1755 + * Thus the read_seqcount_retry() and the "duplicate" info
  1756 + * in arguments (the low-level filesystem should not look
  1757 + * at the dentry inode or name contents directly, since
  1758 + * rename can change them while we're in RCU mode).
  1759 + */
  1760 +enum slow_d_compare {
  1761 + D_COMP_OK,
  1762 + D_COMP_NOMATCH,
  1763 + D_COMP_SEQRETRY,
  1764 +};
  1765 +
  1766 +static noinline enum slow_d_compare slow_dentry_cmp(
  1767 + const struct dentry *parent,
  1768 + struct inode *inode,
  1769 + struct dentry *dentry,
  1770 + unsigned int seq,
  1771 + const struct qstr *name)
  1772 +{
  1773 + int tlen = dentry->d_name.len;
  1774 + const char *tname = dentry->d_name.name;
  1775 + struct inode *i = dentry->d_inode;
  1776 +
  1777 + if (read_seqcount_retry(&dentry->d_seq, seq)) {
  1778 + cpu_relax();
  1779 + return D_COMP_SEQRETRY;
  1780 + }
  1781 + if (parent->d_op->d_compare(parent, inode,
  1782 + dentry, i,
  1783 + tlen, tname, name))
  1784 + return D_COMP_NOMATCH;
  1785 + return D_COMP_OK;
  1786 +}
  1787 +
1730 1788 /**
1731 1789 * __d_lookup_rcu - search for a dentry (racy, store-free)
1732 1790 * @parent: parent dentry
1733 1791  
... ... @@ -1753,10 +1811,13 @@
1753 1811 * the returned dentry, so long as its parent's seqlock is checked after the
1754 1812 * child is looked up. Thus, an interlocking stepping of sequence lock checks
1755 1813 * is formed, giving integrity down the path walk.
  1814 + *
  1815 + * NOTE! The caller *has* to check the resulting dentry against the sequence
  1816 + * number we've returned before using any of the resulting dentry state!
1756 1817 */
1757 1818 struct dentry *__d_lookup_rcu(const struct dentry *parent,
1758 1819 const struct qstr *name,
1759   - unsigned *seqp, struct inode **inode)
  1820 + unsigned *seqp, struct inode *inode)
1760 1821 {
1761 1822 unsigned int len = name->len;
1762 1823 unsigned int hash = name->hash;
1763 1824  
1764 1825  
1765 1826  
1766 1827  
1767 1828  
... ... @@ -1787,49 +1848,46 @@
1787 1848 */
1788 1849 hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
1789 1850 unsigned seq;
1790   - struct inode *i;
1791   - const char *tname;
1792   - int tlen;
1793 1851  
1794 1852 if (dentry->d_name.hash != hash)
1795 1853 continue;
1796 1854  
1797 1855 seqretry:
1798   - seq = read_seqcount_begin(&dentry->d_seq);
  1856 + /*
  1857 + * The dentry sequence count protects us from concurrent
  1858 + * renames, and thus protects inode, parent and name fields.
  1859 + *
  1860 + * The caller must perform a seqcount check in order
  1861 + * to do anything useful with the returned dentry,
  1862 + * including using the 'd_inode' pointer.
  1863 + *
  1864 + * NOTE! We do a "raw" seqcount_begin here. That means that
  1865 + * we don't wait for the sequence count to stabilize if it
  1866 + * is in the middle of a sequence change. If we do the slow
  1867 + * dentry compare, we will do seqretries until it is stable,
  1868 + * and if we end up with a successful lookup, we actually
  1869 + * want to exit RCU lookup anyway.
  1870 + */
  1871 + seq = raw_seqcount_begin(&dentry->d_seq);
1799 1872 if (dentry->d_parent != parent)
1800 1873 continue;
1801 1874 if (d_unhashed(dentry))
1802 1875 continue;
1803   - tlen = dentry->d_name.len;
1804   - tname = dentry->d_name.name;
1805   - i = dentry->d_inode;
1806   - prefetch(tname);
1807   - /*
1808   - * This seqcount check is required to ensure name and
1809   - * len are loaded atomically, so as not to walk off the
1810   - * edge of memory when walking. If we could load this
1811   - * atomically some other way, we could drop this check.
1812   - */
1813   - if (read_seqcount_retry(&dentry->d_seq, seq))
1814   - goto seqretry;
  1876 + *seqp = seq;
  1877 +
1815 1878 if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
1816   - if (parent->d_op->d_compare(parent, *inode,
1817   - dentry, i,
1818   - tlen, tname, name))
  1879 + switch (slow_dentry_cmp(parent, inode, dentry, seq, name)) {
  1880 + case D_COMP_OK:
  1881 + return dentry;
  1882 + case D_COMP_NOMATCH:
1819 1883 continue;
1820   - } else {
1821   - if (dentry_cmp(tname, tlen, str, len))
1822   - continue;
  1884 + default:
  1885 + goto seqretry;
  1886 + }
1823 1887 }
1824   - /*
1825   - * No extra seqcount check is required after the name
1826   - * compare. The caller must perform a seqcount check in
1827   - * order to do anything useful with the returned dentry
1828   - * anyway.
1829   - */
1830   - *seqp = seq;
1831   - *inode = i;
1832   - return dentry;
  1888 +
  1889 + if (!dentry_cmp(dentry, str, len))
  1890 + return dentry;
1833 1891 }
1834 1892 return NULL;
1835 1893 }
... ... @@ -1908,8 +1966,6 @@
1908 1966 rcu_read_lock();
1909 1967  
1910 1968 hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
1911   - const char *tname;
1912   - int tlen;
1913 1969  
1914 1970 if (dentry->d_name.hash != hash)
1915 1971 continue;
1916 1972  
1917 1973  
... ... @@ -1924,15 +1980,15 @@
1924 1980 * It is safe to compare names since d_move() cannot
1925 1981 * change the qstr (protected by d_lock).
1926 1982 */
1927   - tlen = dentry->d_name.len;
1928   - tname = dentry->d_name.name;
1929 1983 if (parent->d_flags & DCACHE_OP_COMPARE) {
  1984 + int tlen = dentry->d_name.len;
  1985 + const char *tname = dentry->d_name.name;
1930 1986 if (parent->d_op->d_compare(parent, parent->d_inode,
1931 1987 dentry, dentry->d_inode,
1932 1988 tlen, tname, name))
1933 1989 goto next;
1934 1990 } else {
1935   - if (dentry_cmp(tname, tlen, str, len))
  1991 + if (dentry_cmp(dentry, str, len))
1936 1992 goto next;
1937 1993 }
1938 1994  
... ... @@ -1154,12 +1154,25 @@
1154 1154 */
1155 1155 if (nd->flags & LOOKUP_RCU) {
1156 1156 unsigned seq;
1157   - *inode = nd->inode;
1158   - dentry = __d_lookup_rcu(parent, name, &seq, inode);
  1157 + dentry = __d_lookup_rcu(parent, name, &seq, nd->inode);
1159 1158 if (!dentry)
1160 1159 goto unlazy;
1161 1160  
1162   - /* Memory barrier in read_seqcount_begin of child is enough */
  1161 + /*
  1162 + * This sequence count validates that the inode matches
  1163 + * the dentry name information from lookup.
  1164 + */
  1165 + *inode = dentry->d_inode;
  1166 + if (read_seqcount_retry(&dentry->d_seq, seq))
  1167 + return -ECHILD;
  1168 +
  1169 + /*
  1170 + * This sequence count validates that the parent had no
  1171 + * changes while we did the lookup of the dentry above.
  1172 + *
  1173 + * The memory barrier in read_seqcount_begin of child is
  1174 + * enough, we can use __read_seqcount_retry here.
  1175 + */
1163 1176 if (__read_seqcount_retry(&parent->d_seq, nd->seq))
1164 1177 return -ECHILD;
1165 1178 nd->seq = seq;
include/linux/dcache.h
... ... @@ -282,7 +282,7 @@
282 282 extern struct dentry *__d_lookup(struct dentry *, struct qstr *);
283 283 extern struct dentry *__d_lookup_rcu(const struct dentry *parent,
284 284 const struct qstr *name,
285   - unsigned *seq, struct inode **inode);
  285 + unsigned *seq, struct inode *inode);
286 286  
287 287 /**
288 288 * __d_rcu_to_refcount - take a refcount on dentry if sequence check is ok