Commit da353b0d64e070ae7c5342a0d56ec20ae9ef5cfb

Authored by David Chinner
Committed by Tim Shimmin
1 parent 39cd9f877e

[XFS] Radix tree based inode caching

One of the perpetual scaling problems XFS has is indexing its incore
inodes. We currently use hashes, and the default hash sizes chosen can
only ever be a tradeoff between memory consumption and the maximum
realistic size of the cache.

As a result, anyone who has millions of inodes cached on a filesystem
needs to tune the size of the cache via the ihashsize mount option to
get decent scalability from inode cache operations.

A further problem is the separate inode cluster hash, whose size is
derived from ihashsize but smaller, so under certain conditions (sparse
cluster cache population) it can become a limitation long before the
inode hash itself causes problems.

The following patchset removes the inode hash and cluster hash and
replaces them with radix trees, avoiding the scalability limitations of
the hashes. It also shrinks each incore inode by three pointers.

SGI-PV: 969561
SGI-Modid: xfs-linux-melb:xfs-kern:29481a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Tim Shimmin <tes@sgi.com>

Showing 20 changed files with 284 additions and 503 deletions

fs/xfs/linux-2.6/xfs_export.c
... ... @@ -17,10 +17,12 @@
17 17 */
18 18 #include "xfs.h"
19 19 #include "xfs_types.h"
20   -#include "xfs_dmapi.h"
  20 +#include "xfs_inum.h"
21 21 #include "xfs_log.h"
22 22 #include "xfs_trans.h"
23 23 #include "xfs_sb.h"
  24 +#include "xfs_ag.h"
  25 +#include "xfs_dmapi.h"
24 26 #include "xfs_mount.h"
25 27 #include "xfs_export.h"
26 28  
fs/xfs/xfs_ag.h
... ... @@ -197,6 +197,10 @@
197 197 #endif
198 198 xfs_perag_busy_t *pagb_list; /* unstable blocks */
199 199 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
  200 +
  201 + int pag_ici_init; /* incore inode cache initialised */
  202 + rwlock_t pag_ici_lock; /* incore inode lock */
  203 + struct radix_tree_root pag_ici_root; /* incore inode cache root */
200 204 } xfs_perag_t;
201 205  
202 206 #define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels)
fs/xfs/xfs_buf_item.c
... ... @@ -23,6 +23,7 @@
23 23 #include "xfs_inum.h"
24 24 #include "xfs_trans.h"
25 25 #include "xfs_sb.h"
  26 +#include "xfs_ag.h"
26 27 #include "xfs_dmapi.h"
27 28 #include "xfs_mount.h"
28 29 #include "xfs_buf_item.h"
fs/xfs/xfs_clnt.h
... ... @@ -89,7 +89,6 @@
89 89 #define XFSMNT_IDELETE 0x08000000 /* inode cluster delete */
90 90 #define XFSMNT_SWALLOC 0x10000000 /* turn on stripe width
91 91 * allocation */
92   -#define XFSMNT_IHASHSIZE 0x20000000 /* inode hash table size */
93 92 #define XFSMNT_DIRSYNC 0x40000000 /* sync creat,link,unlink,rename
94 93 * symlink,mkdir,rmdir,mknod */
95 94 #define XFSMNT_FLAGS2 0x80000000 /* more flags set in flags2 */
fs/xfs/xfs_dir2_block.c
... ... @@ -22,6 +22,7 @@
22 22 #include "xfs_inum.h"
23 23 #include "xfs_trans.h"
24 24 #include "xfs_sb.h"
  25 +#include "xfs_ag.h"
25 26 #include "xfs_dir2.h"
26 27 #include "xfs_dmapi.h"
27 28 #include "xfs_mount.h"
fs/xfs/xfs_dir2_data.c
... ... @@ -22,6 +22,7 @@
22 22 #include "xfs_inum.h"
23 23 #include "xfs_trans.h"
24 24 #include "xfs_sb.h"
  25 +#include "xfs_ag.h"
25 26 #include "xfs_dir2.h"
26 27 #include "xfs_dmapi.h"
27 28 #include "xfs_mount.h"
fs/xfs/xfs_dir2_node.c
... ... @@ -22,6 +22,7 @@
22 22 #include "xfs_inum.h"
23 23 #include "xfs_trans.h"
24 24 #include "xfs_sb.h"
  25 +#include "xfs_ag.h"
25 26 #include "xfs_dir2.h"
26 27 #include "xfs_dmapi.h"
27 28 #include "xfs_mount.h"
fs/xfs/xfs_dir2_sf.c
... ... @@ -22,6 +22,7 @@
22 22 #include "xfs_inum.h"
23 23 #include "xfs_trans.h"
24 24 #include "xfs_sb.h"
  25 +#include "xfs_ag.h"
25 26 #include "xfs_dir2.h"
26 27 #include "xfs_dmapi.h"
27 28 #include "xfs_mount.h"
... ... @@ -22,6 +22,7 @@
22 22 #include "xfs_inum.h"
23 23 #include "xfs_trans.h"
24 24 #include "xfs_sb.h"
  25 +#include "xfs_ag.h"
25 26 #include "xfs_dir2.h"
26 27 #include "xfs_dmapi.h"
27 28 #include "xfs_mount.h"
fs/xfs/xfs_extfree_item.c
... ... @@ -23,6 +23,7 @@
23 23 #include "xfs_trans.h"
24 24 #include "xfs_buf_item.h"
25 25 #include "xfs_sb.h"
  26 +#include "xfs_ag.h"
26 27 #include "xfs_dmapi.h"
27 28 #include "xfs_mount.h"
28 29 #include "xfs_trans_priv.h"
fs/xfs/xfs_iget.c
... ... @@ -40,132 +40,14 @@
40 40 #include "xfs_utils.h"
41 41  
42 42 /*
43   - * Initialize the inode hash table for the newly mounted file system.
44   - * Choose an initial table size based on user specified value, else
45   - * use a simple algorithm using the maximum number of inodes as an
46   - * indicator for table size, and clamp it between one and some large
47   - * number of pages.
48   - */
49   -void
50   -xfs_ihash_init(xfs_mount_t *mp)
51   -{
52   - __uint64_t icount;
53   - uint i;
54   -
55   - if (!mp->m_ihsize) {
56   - icount = mp->m_maxicount ? mp->m_maxicount :
57   - (mp->m_sb.sb_dblocks << mp->m_sb.sb_inopblog);
58   - mp->m_ihsize = 1 << max_t(uint, 8,
59   - (xfs_highbit64(icount) + 1) / 2);
60   - mp->m_ihsize = min_t(uint, mp->m_ihsize,
61   - (64 * NBPP) / sizeof(xfs_ihash_t));
62   - }
63   -
64   - mp->m_ihash = kmem_zalloc_greedy(&mp->m_ihsize,
65   - NBPC * sizeof(xfs_ihash_t),
66   - mp->m_ihsize * sizeof(xfs_ihash_t),
67   - KM_SLEEP | KM_MAYFAIL | KM_LARGE);
68   - mp->m_ihsize /= sizeof(xfs_ihash_t);
69   - for (i = 0; i < mp->m_ihsize; i++)
70   - rwlock_init(&(mp->m_ihash[i].ih_lock));
71   -}
72   -
73   -/*
74   - * Free up structures allocated by xfs_ihash_init, at unmount time.
75   - */
76   -void
77   -xfs_ihash_free(xfs_mount_t *mp)
78   -{
79   - kmem_free(mp->m_ihash, mp->m_ihsize * sizeof(xfs_ihash_t));
80   - mp->m_ihash = NULL;
81   -}
82   -
83   -/*
84   - * Initialize the inode cluster hash table for the newly mounted file system.
85   - * Its size is derived from the ihash table size.
86   - */
87   -void
88   -xfs_chash_init(xfs_mount_t *mp)
89   -{
90   - uint i;
91   -
92   - mp->m_chsize = max_t(uint, 1, mp->m_ihsize /
93   - (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog));
94   - mp->m_chsize = min_t(uint, mp->m_chsize, mp->m_ihsize);
95   - mp->m_chash = (xfs_chash_t *)kmem_zalloc(mp->m_chsize
96   - * sizeof(xfs_chash_t),
97   - KM_SLEEP | KM_LARGE);
98   - for (i = 0; i < mp->m_chsize; i++) {
99   - spinlock_init(&mp->m_chash[i].ch_lock,"xfshash");
100   - }
101   -}
102   -
103   -/*
104   - * Free up structures allocated by xfs_chash_init, at unmount time.
105   - */
106   -void
107   -xfs_chash_free(xfs_mount_t *mp)
108   -{
109   - int i;
110   -
111   - for (i = 0; i < mp->m_chsize; i++) {
112   - spinlock_destroy(&mp->m_chash[i].ch_lock);
113   - }
114   -
115   - kmem_free(mp->m_chash, mp->m_chsize*sizeof(xfs_chash_t));
116   - mp->m_chash = NULL;
117   -}
118   -
119   -/*
120   - * Try to move an inode to the front of its hash list if possible
121   - * (and if its not there already). Called right after obtaining
122   - * the list version number and then dropping the read_lock on the
123   - * hash list in question (which is done right after looking up the
124   - * inode in question...).
125   - */
126   -STATIC void
127   -xfs_ihash_promote(
128   - xfs_ihash_t *ih,
129   - xfs_inode_t *ip,
130   - ulong version)
131   -{
132   - xfs_inode_t *iq;
133   -
134   - if ((ip->i_prevp != &ih->ih_next) && write_trylock(&ih->ih_lock)) {
135   - if (likely(version == ih->ih_version)) {
136   - /* remove from list */
137   - if ((iq = ip->i_next)) {
138   - iq->i_prevp = ip->i_prevp;
139   - }
140   - *ip->i_prevp = iq;
141   -
142   - /* insert at list head */
143   - iq = ih->ih_next;
144   - iq->i_prevp = &ip->i_next;
145   - ip->i_next = iq;
146   - ip->i_prevp = &ih->ih_next;
147   - ih->ih_next = ip;
148   - }
149   - write_unlock(&ih->ih_lock);
150   - }
151   -}
152   -
153   -/*
154 43 * Look up an inode by number in the given file system.
155   - * The inode is looked up in the hash table for the file system
156   - * represented by the mount point parameter mp. Each bucket of
157   - * the hash table is guarded by an individual semaphore.
  44 + * The inode is looked up in the cache held in each AG.
  45 + * If the inode is found in the cache, attach it to the provided
  46 + * vnode.
158 47 *
159   - * If the inode is found in the hash table, its corresponding vnode
160   - * is obtained with a call to vn_get(). This call takes care of
161   - * coordination with the reclamation of the inode and vnode. Note
162   - * that the vmap structure is filled in while holding the hash lock.
163   - * This gives us the state of the inode/vnode when we found it and
164   - * is used for coordination in vn_get().
  48 + * If it is not in core, read it in from the file system's device,
  49 + * add it to the cache and attach the provided vnode.
165 50 *
166   - * If it is not in core, read it in from the file system's device and
167   - * add the inode into the hash table.
168   - *
169 51 * The inode is locked according to the value of the lock_flags parameter.
170 52 * This flag parameter indicates how and if the inode's IO lock and inode lock
171 53 * should be taken.
... ... @@ -192,275 +74,242 @@
192 74 xfs_inode_t **ipp,
193 75 xfs_daddr_t bno)
194 76 {
195   - xfs_ihash_t *ih;
196 77 xfs_inode_t *ip;
197 78 xfs_inode_t *iq;
198 79 bhv_vnode_t *inode_vp;
199   - ulong version;
200 80 int error;
201   - /* REFERENCED */
202   - xfs_chash_t *ch;
203   - xfs_chashlist_t *chl, *chlnew;
204   - SPLDECL(s);
  81 + xfs_icluster_t *icl, *new_icl = NULL;
  82 + unsigned long first_index, mask;
  83 + xfs_perag_t *pag;
  84 + xfs_agino_t agino;
205 85  
  86 + /* the radix tree exists only in inode capable AGs */
  87 + if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi)
  88 + return EINVAL;
206 89  
207   - ih = XFS_IHASH(mp, ino);
  90 + /* get the perag structure and ensure that it's inode capable */
  91 + pag = xfs_get_perag(mp, ino);
  92 + if (!pag->pagi_inodeok)
  93 + return EINVAL;
  94 + ASSERT(pag->pag_ici_init);
  95 + agino = XFS_INO_TO_AGINO(mp, ino);
208 96  
209 97 again:
210   - read_lock(&ih->ih_lock);
  98 + read_lock(&pag->pag_ici_lock);
  99 + ip = radix_tree_lookup(&pag->pag_ici_root, agino);
211 100  
212   - for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) {
213   - if (ip->i_ino == ino) {
  101 + if (ip != NULL) {
  102 + /*
  103 + * If INEW is set this inode is being set up
  104 + * we need to pause and try again.
  105 + */
  106 + if (xfs_iflags_test(ip, XFS_INEW)) {
  107 + read_unlock(&pag->pag_ici_lock);
  108 + delay(1);
  109 + XFS_STATS_INC(xs_ig_frecycle);
  110 +
  111 + goto again;
  112 + }
  113 +
  114 + inode_vp = XFS_ITOV_NULL(ip);
  115 + if (inode_vp == NULL) {
214 116 /*
215   - * If INEW is set this inode is being set up
  117 + * If IRECLAIM is set this inode is
  118 + * on its way out of the system,
216 119 * we need to pause and try again.
217 120 */
218   - if (xfs_iflags_test(ip, XFS_INEW)) {
219   - read_unlock(&ih->ih_lock);
  121 + if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
  122 + read_unlock(&pag->pag_ici_lock);
220 123 delay(1);
221 124 XFS_STATS_INC(xs_ig_frecycle);
222 125  
223 126 goto again;
224 127 }
  128 + ASSERT(xfs_iflags_test(ip, XFS_IRECLAIMABLE));
225 129  
226   - inode_vp = XFS_ITOV_NULL(ip);
227   - if (inode_vp == NULL) {
228   - /*
229   - * If IRECLAIM is set this inode is
230   - * on its way out of the system,
231   - * we need to pause and try again.
232   - */
233   - if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
234   - read_unlock(&ih->ih_lock);
235   - delay(1);
236   - XFS_STATS_INC(xs_ig_frecycle);
  130 + /*
  131 + * If lookup is racing with unlink, then we
  132 + * should return an error immediately so we
  133 + * don't remove it from the reclaim list and
  134 + * potentially leak the inode.
  135 + */
  136 + if ((ip->i_d.di_mode == 0) &&
  137 + !(flags & XFS_IGET_CREATE)) {
  138 + read_unlock(&pag->pag_ici_lock);
  139 + xfs_put_perag(mp, pag);
  140 + return ENOENT;
  141 + }
237 142  
238   - goto again;
239   - }
240   - ASSERT(xfs_iflags_test(ip, XFS_IRECLAIMABLE));
  143 + /*
  144 + * There may be transactions sitting in the
  145 + * incore log buffers or being flushed to disk
  146 + * at this time. We can't clear the
  147 + * XFS_IRECLAIMABLE flag until these
  148 + * transactions have hit the disk, otherwise we
  149 + * will void the guarantee the flag provides
  150 + * xfs_iunpin()
  151 + */
  152 + if (xfs_ipincount(ip)) {
  153 + read_unlock(&pag->pag_ici_lock);
  154 + xfs_log_force(mp, 0,
  155 + XFS_LOG_FORCE|XFS_LOG_SYNC);
  156 + XFS_STATS_INC(xs_ig_frecycle);
  157 + goto again;
  158 + }
241 159  
242   - /*
243   - * If lookup is racing with unlink, then we
244   - * should return an error immediately so we
245   - * don't remove it from the reclaim list and
246   - * potentially leak the inode.
247   - */
248   - if ((ip->i_d.di_mode == 0) &&
249   - !(flags & XFS_IGET_CREATE)) {
250   - read_unlock(&ih->ih_lock);
251   - return ENOENT;
252   - }
  160 + vn_trace_exit(vp, "xfs_iget.alloc",
  161 + (inst_t *)__return_address);
253 162  
254   - /*
255   - * There may be transactions sitting in the
256   - * incore log buffers or being flushed to disk
257   - * at this time. We can't clear the
258   - * XFS_IRECLAIMABLE flag until these
259   - * transactions have hit the disk, otherwise we
260   - * will void the guarantee the flag provides
261   - * xfs_iunpin()
262   - */
263   - if (xfs_ipincount(ip)) {
264   - read_unlock(&ih->ih_lock);
265   - xfs_log_force(mp, 0,
266   - XFS_LOG_FORCE|XFS_LOG_SYNC);
267   - XFS_STATS_INC(xs_ig_frecycle);
268   - goto again;
269   - }
  163 + XFS_STATS_INC(xs_ig_found);
270 164  
271   - vn_trace_exit(vp, "xfs_iget.alloc",
272   - (inst_t *)__return_address);
  165 + xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
  166 + read_unlock(&pag->pag_ici_lock);
273 167  
274   - XFS_STATS_INC(xs_ig_found);
  168 + XFS_MOUNT_ILOCK(mp);
  169 + list_del_init(&ip->i_reclaim);
  170 + XFS_MOUNT_IUNLOCK(mp);
275 171  
276   - xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
277   - version = ih->ih_version;
278   - read_unlock(&ih->ih_lock);
279   - xfs_ihash_promote(ih, ip, version);
  172 + goto finish_inode;
280 173  
281   - XFS_MOUNT_ILOCK(mp);
282   - list_del_init(&ip->i_reclaim);
283   - XFS_MOUNT_IUNLOCK(mp);
  174 + } else if (vp != inode_vp) {
  175 + struct inode *inode = vn_to_inode(inode_vp);
284 176  
285   - goto finish_inode;
  177 + /* The inode is being torn down, pause and
  178 + * try again.
  179 + */
  180 + if (inode->i_state & (I_FREEING | I_CLEAR)) {
  181 + read_unlock(&pag->pag_ici_lock);
  182 + delay(1);
  183 + XFS_STATS_INC(xs_ig_frecycle);
286 184  
287   - } else if (vp != inode_vp) {
288   - struct inode *inode = vn_to_inode(inode_vp);
289   -
290   - /* The inode is being torn down, pause and
291   - * try again.
292   - */
293   - if (inode->i_state & (I_FREEING | I_CLEAR)) {
294   - read_unlock(&ih->ih_lock);
295   - delay(1);
296   - XFS_STATS_INC(xs_ig_frecycle);
297   -
298   - goto again;
299   - }
300   -/* Chances are the other vnode (the one in the inode) is being torn
301   - * down right now, and we landed on top of it. Question is, what do
302   - * we do? Unhook the old inode and hook up the new one?
303   - */
304   - cmn_err(CE_PANIC,
305   - "xfs_iget_core: ambiguous vns: vp/0x%p, invp/0x%p",
306   - inode_vp, vp);
  185 + goto again;
307 186 }
  187 +/* Chances are the other vnode (the one in the inode) is being torn
  188 +* down right now, and we landed on top of it. Question is, what do
  189 +* we do? Unhook the old inode and hook up the new one?
  190 +*/
  191 + cmn_err(CE_PANIC,
  192 + "xfs_iget_core: ambiguous vns: vp/0x%p, invp/0x%p",
  193 + inode_vp, vp);
  194 + }
308 195  
309   - /*
310   - * Inode cache hit: if ip is not at the front of
311   - * its hash chain, move it there now.
312   - * Do this with the lock held for update, but
313   - * do statistics after releasing the lock.
314   - */
315   - version = ih->ih_version;
316   - read_unlock(&ih->ih_lock);
317   - xfs_ihash_promote(ih, ip, version);
318   - XFS_STATS_INC(xs_ig_found);
  196 + /*
  197 + * Inode cache hit
  198 + */
  199 + read_unlock(&pag->pag_ici_lock);
  200 + XFS_STATS_INC(xs_ig_found);
319 201  
320 202 finish_inode:
321   - if (ip->i_d.di_mode == 0) {
322   - if (!(flags & XFS_IGET_CREATE))
323   - return ENOENT;
324   - xfs_iocore_inode_reinit(ip);
  203 + if (ip->i_d.di_mode == 0) {
  204 + if (!(flags & XFS_IGET_CREATE)) {
  205 + xfs_put_perag(mp, pag);
  206 + return ENOENT;
325 207 }
  208 + xfs_iocore_inode_reinit(ip);
  209 + }
326 210  
327   - if (lock_flags != 0)
328   - xfs_ilock(ip, lock_flags);
  211 + if (lock_flags != 0)
  212 + xfs_ilock(ip, lock_flags);
329 213  
330   - xfs_iflags_clear(ip, XFS_ISTALE);
331   - vn_trace_exit(vp, "xfs_iget.found",
332   - (inst_t *)__return_address);
333   - goto return_ip;
334   - }
  214 + xfs_iflags_clear(ip, XFS_ISTALE);
  215 + vn_trace_exit(vp, "xfs_iget.found",
  216 + (inst_t *)__return_address);
  217 + goto return_ip;
335 218 }
336 219  
337 220 /*
338   - * Inode cache miss: save the hash chain version stamp and unlock
339   - * the chain, so we don't deadlock in vn_alloc.
  221 + * Inode cache miss
340 222 */
  223 + read_unlock(&pag->pag_ici_lock);
341 224 XFS_STATS_INC(xs_ig_missed);
342 225  
343   - version = ih->ih_version;
344   -
345   - read_unlock(&ih->ih_lock);
346   -
347 226 /*
348 227 * Read the disk inode attributes into a new inode structure and get
349 228 * a new vnode for it. This should also initialize i_ino and i_mount.
350 229 */
351 230 error = xfs_iread(mp, tp, ino, &ip, bno,
352 231 (flags & XFS_IGET_BULKSTAT) ? XFS_IMAP_BULKSTAT : 0);
353   - if (error)
  232 + if (error) {
  233 + xfs_put_perag(mp, pag);
354 234 return error;
  235 + }
355 236  
356 237 vn_trace_exit(vp, "xfs_iget.alloc", (inst_t *)__return_address);
357 238  
358 239 xfs_inode_lock_init(ip, vp);
359 240 xfs_iocore_inode_init(ip);
360   -
361 241 if (lock_flags)
362 242 xfs_ilock(ip, lock_flags);
363 243  
364 244 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
365 245 xfs_idestroy(ip);
  246 + xfs_put_perag(mp, pag);
366 247 return ENOENT;
367 248 }
368 249  
369 250 /*
370   - * Put ip on its hash chain, unless someone else hashed a duplicate
371   - * after we released the hash lock.
  251 + * This is a bit messy - we preallocate everything we _might_
  252 + * need before we pick up the ici lock. That way we don't have to
  253 + * juggle locks and go all the way back to the start.
372 254 */
373   - write_lock(&ih->ih_lock);
  255 + new_icl = kmem_zone_alloc(xfs_icluster_zone, KM_SLEEP);
  256 + if (radix_tree_preload(GFP_KERNEL)) {
  257 + delay(1);
  258 + goto again;
  259 + }
  260 + mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
  261 + first_index = agino & mask;
  262 + write_lock(&pag->pag_ici_lock);
374 263  
375   - if (ih->ih_version != version) {
376   - for (iq = ih->ih_next; iq != NULL; iq = iq->i_next) {
377   - if (iq->i_ino == ino) {
378   - write_unlock(&ih->ih_lock);
379   - xfs_idestroy(ip);
  264 + /*
  265 + * Find the cluster if it exists
  266 + */
  267 + icl = NULL;
  268 + if (radix_tree_gang_lookup(&pag->pag_ici_root, (void**)&iq,
  269 + first_index, 1)) {
  270 + if ((iq->i_ino & mask) == first_index)
  271 + icl = iq->i_cluster;
  272 + }
380 273  
381   - XFS_STATS_INC(xs_ig_dup);
382   - goto again;
383   - }
384   - }
  274 + /*
  275 + * insert the new inode
  276 + */
  277 + error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
  278 + if (unlikely(error)) {
  279 + BUG_ON(error != -EEXIST);
  280 + write_unlock(&pag->pag_ici_lock);
  281 + radix_tree_preload_end();
  282 + xfs_idestroy(ip);
  283 + XFS_STATS_INC(xs_ig_dup);
  284 + goto again;
385 285 }
386 286  
387 287 /*
388 288 * These values _must_ be set before releasing ihlock!
389 289 */
390   - ip->i_hash = ih;
391   - if ((iq = ih->ih_next)) {
392   - iq->i_prevp = &ip->i_next;
393   - }
394   - ip->i_next = iq;
395   - ip->i_prevp = &ih->ih_next;
396   - ih->ih_next = ip;
397 290 ip->i_udquot = ip->i_gdquot = NULL;
398   - ih->ih_version++;
399 291 xfs_iflags_set(ip, XFS_INEW);
400   - write_unlock(&ih->ih_lock);
401 292  
402   - /*
403   - * put ip on its cluster's hash chain
404   - */
405   - ASSERT(ip->i_chash == NULL && ip->i_cprev == NULL &&
406   - ip->i_cnext == NULL);
  293 + ASSERT(ip->i_cluster == NULL);
407 294  
408   - chlnew = NULL;
409   - ch = XFS_CHASH(mp, ip->i_blkno);
410   - chlredo:
411   - s = mutex_spinlock(&ch->ch_lock);
412   - for (chl = ch->ch_list; chl != NULL; chl = chl->chl_next) {
413   - if (chl->chl_blkno == ip->i_blkno) {
414   -
415   - /* insert this inode into the doubly-linked list
416   - * where chl points */
417   - if ((iq = chl->chl_ip)) {
418   - ip->i_cprev = iq->i_cprev;
419   - iq->i_cprev->i_cnext = ip;
420   - iq->i_cprev = ip;
421   - ip->i_cnext = iq;
422   - } else {
423   - ip->i_cnext = ip;
424   - ip->i_cprev = ip;
425   - }
426   - chl->chl_ip = ip;
427   - ip->i_chash = chl;
428   - break;
429   - }
430   - }
431   -
432   - /* no hash list found for this block; add a new hash list */
433   - if (chl == NULL) {
434   - if (chlnew == NULL) {
435   - mutex_spinunlock(&ch->ch_lock, s);
436   - ASSERT(xfs_chashlist_zone != NULL);
437   - chlnew = (xfs_chashlist_t *)
438   - kmem_zone_alloc(xfs_chashlist_zone,
439   - KM_SLEEP);
440   - ASSERT(chlnew != NULL);
441   - goto chlredo;
442   - } else {
443   - ip->i_cnext = ip;
444   - ip->i_cprev = ip;
445   - ip->i_chash = chlnew;
446   - chlnew->chl_ip = ip;
447   - chlnew->chl_blkno = ip->i_blkno;
448   - if (ch->ch_list)
449   - ch->ch_list->chl_prev = chlnew;
450   - chlnew->chl_next = ch->ch_list;
451   - chlnew->chl_prev = NULL;
452   - ch->ch_list = chlnew;
453   - chlnew = NULL;
454   - }
  295 + if (!icl) {
  296 + spin_lock_init(&new_icl->icl_lock);
  297 + INIT_HLIST_HEAD(&new_icl->icl_inodes);
  298 + icl = new_icl;
  299 + new_icl = NULL;
455 300 } else {
456   - if (chlnew != NULL) {
457   - kmem_zone_free(xfs_chashlist_zone, chlnew);
458   - }
  301 + ASSERT(!hlist_empty(&icl->icl_inodes));
459 302 }
  303 + spin_lock(&icl->icl_lock);
  304 + hlist_add_head(&ip->i_cnode, &icl->icl_inodes);
  305 + ip->i_cluster = icl;
  306 + spin_unlock(&icl->icl_lock);
460 307  
461   - mutex_spinunlock(&ch->ch_lock, s);
  308 + write_unlock(&pag->pag_ici_lock);
  309 + radix_tree_preload_end();
  310 + if (new_icl)
  311 + kmem_zone_free(xfs_icluster_zone, new_icl);
462 312  
463   -
464 313 /*
465 314 * Link ip to its mount and thread it on the mount's inode list.
466 315 */
... ... @@ -478,6 +327,7 @@
478 327 mp->m_inodes = ip;
479 328  
480 329 XFS_MOUNT_IUNLOCK(mp);
  330 + xfs_put_perag(mp, pag);
481 331  
482 332 return_ip:
483 333 ASSERT(ip->i_df.if_ext_max ==
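The cache-miss insert path above is worth unpacking. Everything that
might sleep allocating memory — the new cluster from xfs_icluster_zone
and the radix tree nodes reserved via radix_tree_preload() — is obtained
before pag_ici_lock is taken, so the insert itself cannot fail on memory
and a racing duplicate simply surfaces as -EEXIST. A condensed sketch,
with the cluster mask worked through for a hypothetical geometry (8KB
clusters of 256-byte inodes give 32 inodes per cluster, so mask == ~31
and agino 100 rounds down to first_index 96):

        new_icl = kmem_zone_alloc(xfs_icluster_zone, KM_SLEEP); /* may sleep */
        if (radix_tree_preload(GFP_KERNEL)) {                   /* may sleep */
                delay(1);
                goto again;
        }

        /* round agino down to the first inode of its cluster */
        mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
        first_index = agino & mask;

        write_lock(&pag->pag_ici_lock);         /* nothing below may sleep */
        error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
        if (unlikely(error)) {                  /* -EEXIST: lost the race */
                write_unlock(&pag->pag_ici_lock);
                radix_tree_preload_end();
                xfs_idestroy(ip);
                goto again;
        }

A single radix_tree_gang_lookup() from first_index is then enough to
discover whether any neighbour in the same cluster is already cached,
which is what replaces the old cluster hash.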
... ... @@ -587,32 +437,19 @@
587 437 xfs_ino_t ino,
588 438 xfs_trans_t *tp)
589 439 {
590   - xfs_ihash_t *ih;
591 440 xfs_inode_t *ip;
592   - ulong version;
  441 + xfs_perag_t *pag;
593 442  
594   - ih = XFS_IHASH(mp, ino);
595   - read_lock(&ih->ih_lock);
596   - for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) {
597   - if (ip->i_ino == ino) {
598   - /*
599   - * If we find it and tp matches, return it.
600   - * Also move it to the front of the hash list
601   - * if we find it and it is not already there.
602   - * Otherwise break from the loop and return
603   - * NULL.
604   - */
605   - if (ip->i_transp == tp) {
606   - version = ih->ih_version;
607   - read_unlock(&ih->ih_lock);
608   - xfs_ihash_promote(ih, ip, version);
609   - return (ip);
610   - }
611   - break;
612   - }
613   - }
614   - read_unlock(&ih->ih_lock);
615   - return (NULL);
  443 + pag = xfs_get_perag(mp, ino);
  444 + read_lock(&pag->pag_ici_lock);
  445 + ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ino));
  446 + read_unlock(&pag->pag_ici_lock);
  447 + xfs_put_perag(mp, pag);
  448 +
  449 + /* the returned inode must match the transaction */
  450 + if (ip && (ip->i_transp != tp))
  451 + return NULL;
  452 + return ip;
616 453 }
617 454  
618 455 /*
... ... @@ -718,58 +555,26 @@
718 555 xfs_iextract(
719 556 xfs_inode_t *ip)
720 557 {
721   - xfs_ihash_t *ih;
  558 + xfs_mount_t *mp = ip->i_mount;
  559 + xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
722 560 xfs_inode_t *iq;
723   - xfs_mount_t *mp;
724   - xfs_chash_t *ch;
725   - xfs_chashlist_t *chl, *chm;
726   - SPLDECL(s);
727 561  
728   - ih = ip->i_hash;
729   - write_lock(&ih->ih_lock);
730   - if ((iq = ip->i_next)) {
731   - iq->i_prevp = ip->i_prevp;
732   - }
733   - *ip->i_prevp = iq;
734   - ih->ih_version++;
735   - write_unlock(&ih->ih_lock);
  562 + write_lock(&pag->pag_ici_lock);
  563 + radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino));
  564 + write_unlock(&pag->pag_ici_lock);
  565 + xfs_put_perag(mp, pag);
736 566  
737 567 /*
738   - * Remove from cluster hash list
739   - * 1) delete the chashlist if this is the last inode on the chashlist
740   - * 2) unchain from list of inodes
741   - * 3) point chashlist->chl_ip to 'chl_next' if to this inode.
  568 + * Remove from cluster list
742 569 */
743 570 mp = ip->i_mount;
744   - ch = XFS_CHASH(mp, ip->i_blkno);
745   - s = mutex_spinlock(&ch->ch_lock);
  571 + spin_lock(&ip->i_cluster->icl_lock);
  572 + hlist_del(&ip->i_cnode);
  573 + spin_unlock(&ip->i_cluster->icl_lock);
746 574  
747   - if (ip->i_cnext == ip) {
748   - /* Last inode on chashlist */
749   - ASSERT(ip->i_cnext == ip && ip->i_cprev == ip);
750   - ASSERT(ip->i_chash != NULL);
751   - chm=NULL;
752   - chl = ip->i_chash;
753   - if (chl->chl_prev)
754   - chl->chl_prev->chl_next = chl->chl_next;
755   - else
756   - ch->ch_list = chl->chl_next;
757   - if (chl->chl_next)
758   - chl->chl_next->chl_prev = chl->chl_prev;
759   - kmem_zone_free(xfs_chashlist_zone, chl);
760   - } else {
761   - /* delete one inode from a non-empty list */
762   - iq = ip->i_cnext;
763   - iq->i_cprev = ip->i_cprev;
764   - ip->i_cprev->i_cnext = iq;
765   - if (ip->i_chash->chl_ip == ip) {
766   - ip->i_chash->chl_ip = iq;
767   - }
768   - ip->i_chash = __return_address;
769   - ip->i_cprev = __return_address;
770   - ip->i_cnext = __return_address;
771   - }
772   - mutex_spinunlock(&ch->ch_lock, s);
  575 + /* was last inode in cluster? */
  576 + if (hlist_empty(&ip->i_cluster->icl_inodes))
  577 + kmem_zone_free(xfs_icluster_zone, ip->i_cluster);
773 578  
774 579 /*
775 580 * Remove from mount's inode list.
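Read as one piece, the new xfs_iextract() teardown is short: drop the
inode from the per-AG tree under the write lock, unhook it from its
cluster under the cluster spinlock, and let the last inode out free the
cluster. A condensed restatement of the hunk above:

        write_lock(&pag->pag_ici_lock);
        radix_tree_delete(&pag->pag_ici_root,
                          XFS_INO_TO_AGINO(mp, ip->i_ino));
        write_unlock(&pag->pag_ici_lock);

        spin_lock(&ip->i_cluster->icl_lock);
        hlist_del(&ip->i_cnode);
        spin_unlock(&ip->i_cluster->icl_lock);

        /* was this the last inode in the cluster? */
        if (hlist_empty(&ip->i_cluster->icl_inodes))
                kmem_zone_free(xfs_icluster_zone, ip->i_cluster);

The empty check can run after icl_lock is dropped because clusters are
only found through member inodes already in the radix tree, and this
inode has just been deleted from it.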
fs/xfs/xfs_inode.c
... ... @@ -52,7 +52,7 @@
52 52  
53 53 kmem_zone_t *xfs_ifork_zone;
54 54 kmem_zone_t *xfs_inode_zone;
55   -kmem_zone_t *xfs_chashlist_zone;
  55 +kmem_zone_t *xfs_icluster_zone;
56 56  
57 57 /*
58 58 * Used in xfs_itruncate(). This is the maximum number of extents
... ... @@ -2182,10 +2182,10 @@
2182 2182 int i, j, found, pre_flushed;
2183 2183 xfs_daddr_t blkno;
2184 2184 xfs_buf_t *bp;
2185   - xfs_ihash_t *ih;
2186 2185 xfs_inode_t *ip, **ip_found;
2187 2186 xfs_inode_log_item_t *iip;
2188 2187 xfs_log_item_t *lip;
  2188 + xfs_perag_t *pag = xfs_get_perag(mp, inum);
2189 2189 SPLDECL(s);
2190 2190  
2191 2191 if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
... ... @@ -2220,23 +2220,20 @@
2220 2220 */
2221 2221 found = 0;
2222 2222 for (i = 0; i < ninodes; i++) {
2223   - ih = XFS_IHASH(mp, inum + i);
2224   - read_lock(&ih->ih_lock);
2225   - for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) {
2226   - if (ip->i_ino == inum + i)
2227   - break;
2228   - }
  2223 + read_lock(&pag->pag_ici_lock);
  2224 + ip = radix_tree_lookup(&pag->pag_ici_root,
  2225 + XFS_INO_TO_AGINO(mp, (inum + i)));
2229 2226  
2230 2227 /* Inode not in memory or we found it already,
2231 2228 * nothing to do
2232 2229 */
2233 2230 if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
2234   - read_unlock(&ih->ih_lock);
  2231 + read_unlock(&pag->pag_ici_lock);
2235 2232 continue;
2236 2233 }
2237 2234  
2238 2235 if (xfs_inode_clean(ip)) {
2239   - read_unlock(&ih->ih_lock);
  2236 + read_unlock(&pag->pag_ici_lock);
2240 2237 continue;
2241 2238 }
2242 2239  
... ... @@ -2259,7 +2256,7 @@
2259 2256 ip_found[found++] = ip;
2260 2257 }
2261 2258 }
2262   - read_unlock(&ih->ih_lock);
  2259 + read_unlock(&pag->pag_ici_lock);
2263 2260 continue;
2264 2261 }
2265 2262  
... ... @@ -2277,8 +2274,7 @@
2277 2274 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2278 2275 }
2279 2276 }
2280   -
2281   - read_unlock(&ih->ih_lock);
  2277 + read_unlock(&pag->pag_ici_lock);
2282 2278 }
2283 2279  
2284 2280 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
... ... @@ -2333,6 +2329,7 @@
2333 2329 }
2334 2330  
2335 2331 kmem_free(ip_found, ninodes * sizeof(xfs_inode_t *));
  2332 + xfs_put_perag(mp, pag);
2336 2333 }
2337 2334  
2338 2335 /*
... ... @@ -3050,12 +3047,11 @@
3050 3047 xfs_mount_t *mp;
3051 3048 int error;
3052 3049 /* REFERENCED */
3053   - xfs_chash_t *ch;
3054 3050 xfs_inode_t *iq;
3055 3051 int clcount; /* count of inodes clustered */
3056 3052 int bufwasdelwri;
  3053 + struct hlist_node *entry;
3057 3054 enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
3058   - SPLDECL(s);
3059 3055  
3060 3056 XFS_STATS_INC(xs_iflush_count);
3061 3057  
... ... @@ -3169,14 +3165,14 @@
3169 3165 * inode clustering:
3170 3166 * see if other inodes can be gathered into this write
3171 3167 */
  3168 + spin_lock(&ip->i_cluster->icl_lock);
  3169 + ip->i_cluster->icl_buf = bp;
3172 3170  
3173   - ip->i_chash->chl_buf = bp;
3174   -
3175   - ch = XFS_CHASH(mp, ip->i_blkno);
3176   - s = mutex_spinlock(&ch->ch_lock);
3177   -
3178 3171 clcount = 0;
3179   - for (iq = ip->i_cnext; iq != ip; iq = iq->i_cnext) {
  3172 + hlist_for_each_entry(iq, entry, &ip->i_cluster->icl_inodes, i_cnode) {
  3173 + if (iq == ip)
  3174 + continue;
  3175 +
3180 3176 /*
3181 3177 * Do an un-protected check to see if the inode is dirty and
3182 3178 * is a candidate for flushing. These checks will be repeated
... ... @@ -3227,7 +3223,7 @@
3227 3223 xfs_iunlock(iq, XFS_ILOCK_SHARED);
3228 3224 }
3229 3225 }
3230   - mutex_spinunlock(&ch->ch_lock, s);
  3226 + spin_unlock(&ip->i_cluster->icl_lock);
3231 3227  
3232 3228 if (clcount) {
3233 3229 XFS_STATS_INC(xs_icluster_flushcnt);
... ... @@ -3264,7 +3260,7 @@
3264 3260 /* Corruption detected in the clustering loop. Invalidate the
3265 3261 * inode buffer and shut down the filesystem.
3266 3262 */
3267   - mutex_spinunlock(&ch->ch_lock, s);
  3263 + spin_unlock(&ip->i_cluster->icl_lock);
3268 3264  
3269 3265 /*
3270 3266 * Clean up the buffer. If it was B_DELWRI, just release it --
fs/xfs/xfs_inode.h
... ... @@ -172,42 +172,19 @@
172 172 extern void xfs_iocore_inode_init(struct xfs_inode *);
173 173 extern void xfs_iocore_inode_reinit(struct xfs_inode *);
174 174  
175   -
176 175 /*
177   - * This is the type used in the xfs inode hash table.
178   - * An array of these is allocated for each mounted
179   - * file system to hash the inodes for that file system.
  176 + * This is the xfs inode cluster structure. This structure is used by
  177 + * xfs_iflush to find inodes that share a cluster and can be flushed to disk at
  178 + * the same time.
180 179 */
181   -typedef struct xfs_ihash {
182   - struct xfs_inode *ih_next;
183   - rwlock_t ih_lock;
184   - uint ih_version;
185   -} xfs_ihash_t;
186   -
187   -#define XFS_IHASH(mp,ino) ((mp)->m_ihash + (((uint)(ino)) % (mp)->m_ihsize))
188   -
189   -/*
190   - * This is the xfs inode cluster hash. This hash is used by xfs_iflush to
191   - * find inodes that share a cluster and can be flushed to disk at the same
192   - * time.
193   - */
194   -typedef struct xfs_chashlist {
195   - struct xfs_chashlist *chl_next;
196   - struct xfs_chashlist *chl_prev;
197   - struct xfs_inode *chl_ip;
198   - xfs_daddr_t chl_blkno; /* starting block number of
  180 +typedef struct xfs_icluster {
  181 + struct hlist_head icl_inodes; /* list of inodes on cluster */
  182 + xfs_daddr_t icl_blkno; /* starting block number of
199 183 * the cluster */
200   - struct xfs_buf *chl_buf; /* the inode buffer */
201   -} xfs_chashlist_t;
  184 + struct xfs_buf *icl_buf; /* the inode buffer */
  185 + lock_t icl_lock; /* inode list lock */
  186 +} xfs_icluster_t;
202 187  
203   -typedef struct xfs_chash {
204   - xfs_chashlist_t *ch_list;
205   - lock_t ch_lock;
206   -} xfs_chash_t;
207   -
208   -#define XFS_CHASH(mp,blk) ((mp)->m_chash + (((uint)blk) % (mp)->m_chsize))
209   -
210   -
211 188 /*
212 189 * This is the xfs in-core inode structure.
213 190 * Most of the on-disk inode is embedded in the i_d field.
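The structure above is what xfs_iflush() walks when clustering writes:
under icl_lock it scans every inode sharing the cluster and tries to
gather the dirty ones into the same buffer I/O. Condensed from the
xfs_iflush() hunk earlier (hlist_for_each_entry is the four-argument
form of this kernel era):

        struct hlist_node       *entry;
        xfs_inode_t             *iq;

        spin_lock(&ip->i_cluster->icl_lock);
        ip->i_cluster->icl_buf = bp;            /* buffer being flushed */
        hlist_for_each_entry(iq, entry, &ip->i_cluster->icl_inodes, i_cnode) {
                if (iq == ip)
                        continue;
                /* unprotected dirtiness check here; it is repeated
                 * under the inode lock before iq joins the flush */
        }
        spin_unlock(&ip->i_cluster->icl_lock);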
... ... @@ -269,21 +246,15 @@
269 246 } xfs_icdinode_t;
270 247  
271 248 typedef struct {
272   - struct xfs_ihash *ip_hash; /* pointer to hash header */
273   - struct xfs_inode *ip_next; /* inode hash link forw */
274 249 struct xfs_inode *ip_mnext; /* next inode in mount list */
275 250 struct xfs_inode *ip_mprev; /* ptr to prev inode */
276   - struct xfs_inode **ip_prevp; /* ptr to prev i_next */
277 251 struct xfs_mount *ip_mount; /* fs mount struct ptr */
278 252 } xfs_iptr_t;
279 253  
280 254 typedef struct xfs_inode {
281 255 /* Inode linking and identification information. */
282   - struct xfs_ihash *i_hash; /* pointer to hash header */
283   - struct xfs_inode *i_next; /* inode hash link forw */
284 256 struct xfs_inode *i_mnext; /* next inode in mount list */
285 257 struct xfs_inode *i_mprev; /* ptr to prev inode */
286   - struct xfs_inode **i_prevp; /* ptr to prev i_next */
287 258 struct xfs_mount *i_mount; /* fs mount struct ptr */
288 259 struct list_head i_reclaim; /* reclaim list */
289 260 struct bhv_desc i_bhv_desc; /* inode behavior descriptor*/
... ... @@ -324,9 +295,8 @@
324 295 unsigned int i_delayed_blks; /* count of delay alloc blks */
325 296  
326 297 xfs_icdinode_t i_d; /* most of ondisk inode */
327   - xfs_chashlist_t *i_chash; /* cluster hash list header */
328   - struct xfs_inode *i_cnext; /* cluster hash link forward */
329   - struct xfs_inode *i_cprev; /* cluster hash link backward */
  298 + xfs_icluster_t *i_cluster; /* cluster list header */
  299 + struct hlist_node i_cnode; /* cluster link node */
330 300  
331 301 xfs_fsize_t i_size; /* in-memory size */
332 302 /* Trace buffers per inode. */
... ... @@ -521,8 +491,6 @@
521 491 */
522 492 void xfs_ihash_init(struct xfs_mount *);
523 493 void xfs_ihash_free(struct xfs_mount *);
524   -void xfs_chash_init(struct xfs_mount *);
525   -void xfs_chash_free(struct xfs_mount *);
526 494 xfs_inode_t *xfs_inode_incore(struct xfs_mount *, xfs_ino_t,
527 495 struct xfs_trans *);
528 496 void xfs_inode_lock_init(xfs_inode_t *, struct bhv_vnode *);
... ... @@ -633,7 +601,7 @@
633 601 #define xfs_inobp_check(mp, bp)
634 602 #endif /* DEBUG */
635 603  
636   -extern struct kmem_zone *xfs_chashlist_zone;
  604 +extern struct kmem_zone *xfs_icluster_zone;
637 605 extern struct kmem_zone *xfs_ifork_zone;
638 606 extern struct kmem_zone *xfs_inode_zone;
639 607 extern struct kmem_zone *xfs_ili_zone;
fs/xfs/xfs_mount.c
... ... @@ -160,11 +160,6 @@
160 160 xfs_mount_t *mp,
161 161 int remove_bhv)
162 162 {
163   - if (mp->m_ihash)
164   - xfs_ihash_free(mp);
165   - if (mp->m_chash)
166   - xfs_chash_free(mp);
167   -
168 163 if (mp->m_perag) {
169 164 int agno;
170 165  
... ... @@ -342,6 +337,17 @@
342 337 return 0;
343 338 }
344 339  
  340 +STATIC void
  341 +xfs_initialize_perag_icache(
  342 + xfs_perag_t *pag)
  343 +{
  344 + if (!pag->pag_ici_init) {
  345 + rwlock_init(&pag->pag_ici_lock);
  346 + INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
  347 + pag->pag_ici_init = 1;
  348 + }
  349 +}
  350 +
345 351 xfs_agnumber_t
346 352 xfs_initialize_perag(
347 353 bhv_vfs_t *vfs,
... ... @@ -396,12 +402,14 @@
396 402 pag->pagi_inodeok = 1;
397 403 if (index < max_metadata)
398 404 pag->pagf_metadata = 1;
  405 + xfs_initialize_perag_icache(pag);
399 406 }
400 407 } else {
401 408 /* Setup default behavior for smaller filesystems */
402 409 for (index = 0; index < agcount; index++) {
403 410 pag = &mp->m_perag[index];
404 411 pag->pagi_inodeok = 1;
  412 + xfs_initialize_perag_icache(pag);
405 413 }
406 414 }
407 415 return index;
... ... @@ -1033,13 +1041,6 @@
1033 1041 xfs_trans_init(mp);
1034 1042  
1035 1043 /*
1036   - * Allocate and initialize the inode hash table for this
1037   - * file system.
1038   - */
1039   - xfs_ihash_init(mp);
1040   - xfs_chash_init(mp);
1041   -
1042   - /*
1043 1044 * Allocate and initialize the per-ag data.
1044 1045 */
1045 1046 init_rwsem(&mp->m_peraglock);
... ... @@ -1190,8 +1191,6 @@
1190 1191 error3:
1191 1192 xfs_log_unmount_dealloc(mp);
1192 1193 error2:
1193   - xfs_ihash_free(mp);
1194   - xfs_chash_free(mp);
1195 1194 for (agno = 0; agno < sbp->sb_agcount; agno++)
1196 1195 if (mp->m_perag[agno].pagb_list)
1197 1196 kmem_free(mp->m_perag[agno].pagb_list,
fs/xfs/xfs_mount.h
... ... @@ -57,10 +57,7 @@
57 57 struct bhv_vfs;
58 58 struct bhv_vnode;
59 59 struct xfs_mount_args;
60   -struct xfs_ihash;
61   -struct xfs_chash;
62 60 struct xfs_inode;
63   -struct xfs_perag;
64 61 struct xfs_iocore;
65 62 struct xfs_bmbt_irec;
66 63 struct xfs_bmap_free;
... ... @@ -335,8 +332,6 @@
335 332 xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */
336 333 lock_t m_agirotor_lock;/* .. and lock protecting it */
337 334 xfs_agnumber_t m_maxagi; /* highest inode alloc group */
338   - size_t m_ihsize; /* size of next field */
339   - struct xfs_ihash *m_ihash; /* fs private inode hash table*/
340 335 struct xfs_inode *m_inodes; /* active inode list */
341 336 struct list_head m_del_inodes; /* inodes to reclaim */
342 337 mutex_t m_ilock; /* inode list mutex */
... ... @@ -458,7 +453,7 @@
458 453 #define XFS_MOUNT_IDELETE (1ULL << 18) /* delete empty inode clusters*/
459 454 #define XFS_MOUNT_SWALLOC (1ULL << 19) /* turn on stripe width
460 455 * allocation */
461   -#define XFS_MOUNT_IHASHSIZE (1ULL << 20) /* inode hash table size */
  456 + /* (1ULL << 20) -- currently unused */
462 457 #define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */
463 458 #define XFS_MOUNT_COMPAT_IOSIZE (1ULL << 22) /* don't report large preferred
464 459 * I/O size in stat() */
... ... @@ -569,6 +564,21 @@
569 564 {
570 565 xfs_daddr_t ld = XFS_BB_TO_FSBT(mp, d);
571 566 return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks);
  567 +}
  568 +
  569 +/*
  570 + * perag get/put wrappers for eventual ref counting
  571 + */
  572 +static inline xfs_perag_t *
  573 +xfs_get_perag(struct xfs_mount *mp, xfs_ino_t ino)
  574 +{
  575 + return &mp->m_perag[XFS_INO_TO_AGNO(mp, ino)];
  576 +}
  577 +
  578 +static inline void
  579 +xfs_put_perag(struct xfs_mount *mp, xfs_perag_t *pag)
  580 +{
  581 + /* nothing to see here, move along */
572 582 }
573 583  
574 584 /*
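xfs_get_perag()/xfs_put_perag() above are deliberately trivial; the
comment marks them as hooks for eventual reference counting. Purely as
an illustration — pag_ref is an invented field, not part of this patch —
a refcounted variant could later be slotted in without touching any
call site:

        static inline xfs_perag_t *
        xfs_get_perag(struct xfs_mount *mp, xfs_ino_t ino)
        {
                xfs_perag_t *pag = &mp->m_perag[XFS_INO_TO_AGNO(mp, ino)];

                atomic_inc(&pag->pag_ref);      /* hypothetical: pin this AG */
                return pag;
        }

        static inline void
        xfs_put_perag(struct xfs_mount *mp, xfs_perag_t *pag)
        {
                atomic_dec(&pag->pag_ref);      /* hypothetical: unpin */
        }

This is why every path in the patch that calls xfs_get_perag() pairs it
with xfs_put_perag() on all exits, including the new error returns in
xfs_iget_core().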
... ... @@ -22,6 +22,7 @@
22 22 #include "xfs_inum.h"
23 23 #include "xfs_trans.h"
24 24 #include "xfs_sb.h"
  25 +#include "xfs_ag.h"
25 26 #include "xfs_dir2.h"
26 27 #include "xfs_dmapi.h"
27 28 #include "xfs_mount.h"
fs/xfs/xfs_trans_ail.c
... ... @@ -22,6 +22,7 @@
22 22 #include "xfs_inum.h"
23 23 #include "xfs_trans.h"
24 24 #include "xfs_sb.h"
  25 +#include "xfs_ag.h"
25 26 #include "xfs_dmapi.h"
26 27 #include "xfs_mount.h"
27 28 #include "xfs_trans_priv.h"
fs/xfs/xfs_trans_extfree.c
... ... @@ -22,6 +22,7 @@
22 22 #include "xfs_inum.h"
23 23 #include "xfs_trans.h"
24 24 #include "xfs_sb.h"
  25 +#include "xfs_ag.h"
25 26 #include "xfs_dmapi.h"
26 27 #include "xfs_mount.h"
27 28 #include "xfs_trans_priv.h"
fs/xfs/linux-2.6/xfs_vfsops.c
... ... @@ -117,8 +117,8 @@
117 117 xfs_ili_zone =
118 118 kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili",
119 119 KM_ZONE_SPREAD, NULL);
120   - xfs_chashlist_zone =
121   - kmem_zone_init_flags(sizeof(xfs_chashlist_t), "xfs_chashlist",
  120 + xfs_icluster_zone =
  121 + kmem_zone_init_flags(sizeof(xfs_icluster_t), "xfs_icluster",
122 122 KM_ZONE_SPREAD, NULL);
123 123  
124 124 /*
... ... @@ -163,7 +163,7 @@
163 163 extern kmem_zone_t *xfs_efd_zone;
164 164 extern kmem_zone_t *xfs_efi_zone;
165 165 extern kmem_zone_t *xfs_buf_item_zone;
166   - extern kmem_zone_t *xfs_chashlist_zone;
  166 + extern kmem_zone_t *xfs_icluster_zone;
167 167  
168 168 xfs_cleanup_procfs();
169 169 xfs_sysctl_unregister();
... ... @@ -199,7 +199,7 @@
199 199 kmem_zone_destroy(xfs_efi_zone);
200 200 kmem_zone_destroy(xfs_ifork_zone);
201 201 kmem_zone_destroy(xfs_ili_zone);
202   - kmem_zone_destroy(xfs_chashlist_zone);
  202 + kmem_zone_destroy(xfs_icluster_zone);
203 203 }
204 204  
205 205 /*
... ... @@ -246,7 +246,6 @@
246 246 ap->logbufsize);
247 247 return XFS_ERROR(EINVAL);
248 248 }
249   - mp->m_ihsize = ap->ihashsize;
250 249 mp->m_logbsize = ap->logbufsize;
251 250 mp->m_fsname_len = strlen(ap->fsname) + 1;
252 251 mp->m_fsname = kmem_alloc(mp->m_fsname_len, KM_SLEEP);
... ... @@ -293,8 +292,6 @@
293 292 mp->m_readio_log = mp->m_writeio_log = ap->iosizelog;
294 293 }
295 294  
296   - if (ap->flags & XFSMNT_IHASHSIZE)
297   - mp->m_flags |= XFS_MOUNT_IHASHSIZE;
298 295 if (ap->flags & XFSMNT_IDELETE)
299 296 mp->m_flags |= XFS_MOUNT_IDELETE;
300 297 if (ap->flags & XFSMNT_DIRSYNC)
... ... @@ -1673,7 +1670,6 @@
1673 1670 #define MNTOPT_BSDGROUPS "bsdgroups" /* group-ID from parent directory */
1674 1671 #define MNTOPT_SYSVGROUPS "sysvgroups" /* group-ID from current process */
1675 1672 #define MNTOPT_ALLOCSIZE "allocsize" /* preferred allocation size */
1676   -#define MNTOPT_IHASHSIZE "ihashsize" /* size of inode hash table */
1677 1673 #define MNTOPT_NORECOVERY "norecovery" /* don't run XFS recovery */
1678 1674 #define MNTOPT_BARRIER "barrier" /* use writer barriers for log write and
1679 1675 * unwritten extent conversion */
... ... @@ -1799,15 +1795,6 @@
1799 1795 iosize = suffix_strtoul(value, &eov, 10);
1800 1796 args->flags |= XFSMNT_IOSIZE;
1801 1797 args->iosizelog = ffs(iosize) - 1;
1802   - } else if (!strcmp(this_char, MNTOPT_IHASHSIZE)) {
1803   - if (!value || !*value) {
1804   - cmn_err(CE_WARN,
1805   - "XFS: %s option requires an argument",
1806   - this_char);
1807   - return EINVAL;
1808   - }
1809   - args->flags |= XFSMNT_IHASHSIZE;
1810   - args->ihashsize = simple_strtoul(value, &eov, 10);
1811 1798 } else if (!strcmp(this_char, MNTOPT_GRPID) ||
1812 1799 !strcmp(this_char, MNTOPT_BSDGROUPS)) {
1813 1800 vfsp->vfs_flag |= VFS_GRPID;
... ... @@ -1876,6 +1863,9 @@
1876 1863 args->flags &= ~XFSMNT_ATTR2;
1877 1864 } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) {
1878 1865 args->flags2 |= XFSMNT2_FILESTREAMS;
  1866 + } else if (!strcmp(this_char, "ihashsize")) {
  1867 + cmn_err(CE_WARN,
  1868 + "XFS: ihashsize no longer used, option is deprecated.");
1879 1869 } else if (!strcmp(this_char, "osyncisdsync")) {
1880 1870 /* no-op, this is now the default */
1881 1871 cmn_err(CE_WARN,
... ... @@ -1965,9 +1955,6 @@
1965 1955 if (mp->m_flags & xfs_infop->flag)
1966 1956 seq_puts(m, xfs_infop->str);
1967 1957 }
1968   -
1969   - if (mp->m_flags & XFS_MOUNT_IHASHSIZE)
1970   - seq_printf(m, "," MNTOPT_IHASHSIZE "=%d", (int)mp->m_ihsize);
1971 1958  
1972 1959 if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
1973 1960 seq_printf(m, "," MNTOPT_ALLOCSIZE "=%dk",
fs/xfs/xfs_vnodeops.c
... ... @@ -3876,7 +3876,7 @@
3876 3876 int locked,
3877 3877 int sync_mode)
3878 3878 {
3879   - xfs_ihash_t *ih = ip->i_hash;
  3879 + xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
3880 3880 bhv_vnode_t *vp = XFS_ITOV_NULL(ip);
3881 3881 int error;
3883 3883  
... ... @@ -3888,12 +3888,12 @@
3888 3888 * Once we have the XFS_IRECLAIM flag set it will not touch
3889 3889 * us.
3890 3890 */
3891   - write_lock(&ih->ih_lock);
  3891 + write_lock(&pag->pag_ici_lock);
3892 3892 spin_lock(&ip->i_flags_lock);
3893 3893 if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
3894 3894 (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) && vp == NULL)) {
3895 3895 spin_unlock(&ip->i_flags_lock);
3896   - write_unlock(&ih->ih_lock);
  3896 + write_unlock(&pag->pag_ici_lock);
3897 3897 if (locked) {
3898 3898 xfs_ifunlock(ip);
3899 3899 xfs_iunlock(ip, XFS_ILOCK_EXCL);
... ... @@ -3902,7 +3902,8 @@
3902 3902 }
3903 3903 __xfs_iflags_set(ip, XFS_IRECLAIM);
3904 3904 spin_unlock(&ip->i_flags_lock);
3905   - write_unlock(&ih->ih_lock);
  3905 + write_unlock(&pag->pag_ici_lock);
  3906 + xfs_put_perag(ip->i_mount, pag);
3906 3907  
3907 3908 /*
3908 3909 * If the inode is still dirty, then flush it out. If the inode