Commit 9e38d86ff2d8a8db99570e982230861046df32b5

Authored by Nick Piggin
Committed by Al Viro
1 parent cffbc8aa33

fs: Implement lazy LRU updates for inodes

Convert the inode LRU to use lazy updates to reduce lock and
cacheline traffic.  We avoid moving inodes around in the LRU list
during iget/iput operations so these frequent operations don't need
to access the LRUs. Instead, we defer the refcount checks to
reclaim-time and use a per-inode state flag, I_REFERENCED, to tell
reclaim that iget has touched the inode in the past. This means that
only reclaim should be touching the LRU with any frequency, hence
significantly reducing lock acquisitions and the amount of
contention on LRU updates.

This also removes the inode_in_use list, which means we now only
have one list for tracking the inode LRU status. This makes it much
simpler to split out the LRU list operations under its own lock.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
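
The pattern is worth seeing outside kernel context: the hot get/put paths touch
only an atomic refcount and a referenced flag, and all list work is deferred to
the reclaimer, which gives flagged entries a second trip around the list. Below
is a minimal single-threaded userspace sketch of that idea; names such as
obj_get and reclaim_one are illustrative, not kernel API, and the inode_lock
serialisation is elided.

/* lazy_lru.c - single-threaded sketch of lazy LRU updates */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct object {
	atomic_int refcount;
	bool referenced;                    /* analogue of I_REFERENCED */
	struct object *lru_prev, *lru_next; /* self-linked when off the LRU */
};

/* Circular list; entries are added at the head, scanned from the tail. */
static struct object lru = { .lru_prev = &lru, .lru_next = &lru };

static void lru_add_head(struct object *o)
{
	o->lru_next = lru.lru_next;
	o->lru_prev = &lru;
	lru.lru_next->lru_prev = o;
	lru.lru_next = o;
}

static void lru_del_init(struct object *o)
{
	o->lru_prev->lru_next = o->lru_next;
	o->lru_next->lru_prev = o->lru_prev;
	o->lru_prev = o->lru_next = o;
}

/* Hot paths: no LRU manipulation at all -- this is the lazy part. */
static void obj_get(struct object *o)
{
	atomic_fetch_add(&o->refcount, 1);
}

static void obj_put(struct object *o)
{
	if (atomic_fetch_sub(&o->refcount, 1) == 1) {
		o->referenced = true;           /* tell reclaim we were used */
		if (o->lru_next == o)           /* not on the LRU yet */
			lru_add_head(o);
	}
}

/* Reclaim does the deferred refcount check and honours the flag. */
static struct object *reclaim_one(void)
{
	while (lru.lru_prev != &lru) {
		struct object *o = lru.lru_prev;        /* oldest entry */

		if (atomic_load(&o->refcount)) {        /* in use: drop lazily */
			lru_del_init(o);
			continue;
		}
		if (o->referenced) {                    /* recently used: */
			o->referenced = false;          /* one more pass */
			lru_del_init(o);
			lru_add_head(o);
			continue;
		}
		lru_del_init(o);
		return o;                               /* caller frees it */
	}
	return NULL;
}

The real patch bounds the scan with nr_to_scan and takes inode_lock around all
of this; the sketch keeps only the control flow.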

Showing 4 changed files with 71 additions and 41 deletions

fs/fs-writeback.c
... ... @@ -408,16 +408,13 @@
408 408 * completion.
409 409 */
410 410 redirty_tail(inode);
411   - } else if (atomic_read(&inode->i_count)) {
412   - /*
413   - * The inode is clean, inuse
414   - */
415   - list_move(&inode->i_list, &inode_in_use);
416 411 } else {
417 412 /*
418   - * The inode is clean, unused
  413 + * The inode is clean. At this point we either have
  414 + * a reference to the inode or it's on its way out.
  415 + * No need to add it back to the LRU.
419 416 */
420   - list_move(&inode->i_list, &inode_unused);
  417 + list_del_init(&inode->i_list);
421 418 }
422 419 }
423 420 inode_sync_complete(inode);
fs/inode.c
... ... @@ -72,8 +72,7 @@
72 72 * allowing for low-overhead inode sync() operations.
73 73 */
74 74  
75   -LIST_HEAD(inode_in_use);
76   -LIST_HEAD(inode_unused);
  75 +static LIST_HEAD(inode_unused);
77 76 static struct hlist_head *inode_hashtable __read_mostly;
78 77  
79 78 /*
... ... @@ -291,6 +290,7 @@
291 290 INIT_HLIST_NODE(&inode->i_hash);
292 291 INIT_LIST_HEAD(&inode->i_dentry);
293 292 INIT_LIST_HEAD(&inode->i_devices);
  293 + INIT_LIST_HEAD(&inode->i_list);
294 294 INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
295 295 spin_lock_init(&inode->i_data.tree_lock);
296 296 spin_lock_init(&inode->i_data.i_mmap_lock);
297 297  
298 298  
... ... @@ -317,14 +317,25 @@
317 317 */
318 318 void __iget(struct inode *inode)
319 319 {
320   - if (atomic_inc_return(&inode->i_count) != 1)
321   - return;
  320 + atomic_inc(&inode->i_count);
  321 +}
322 322  
323   - if (!(inode->i_state & (I_DIRTY|I_SYNC)))
324   - list_move(&inode->i_list, &inode_in_use);
325   - percpu_counter_dec(&nr_inodes_unused);
  323 +static void inode_lru_list_add(struct inode *inode)
  324 +{
  325 + if (list_empty(&inode->i_list)) {
  326 + list_add(&inode->i_list, &inode_unused);
  327 + percpu_counter_inc(&nr_inodes_unused);
  328 + }
326 329 }
327 330  
  331 +static void inode_lru_list_del(struct inode *inode)
  332 +{
  333 + if (!list_empty(&inode->i_list)) {
  334 + list_del_init(&inode->i_list);
  335 + percpu_counter_dec(&nr_inodes_unused);
  336 + }
  337 +}
  338 +
328 339 void end_writeback(struct inode *inode)
329 340 {
330 341 might_sleep();
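
These helpers use list_empty(&inode->i_list) as the "is this inode on the
LRU?" test, which is why the patch also initialises i_list at inode setup and
converts list_del() callers to list_del_init(): a deleted-and-reinitialised
node reads as empty again. A standalone illustration of the idiom follows; the
list primitives are re-implemented here so it compiles outside the kernel.

#include <assert.h>
#include <stdbool.h>

struct list_head { struct list_head *prev, *next; };

static void INIT_LIST_HEAD(struct list_head *h) { h->prev = h->next = h; }
static bool list_empty(const struct list_head *h) { return h->next == h; }

static void list_add(struct list_head *n, struct list_head *h)
{
	n->next = h->next;
	n->prev = h;
	h->next->prev = n;
	h->next = n;
}

static void list_del_init(struct list_head *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	INIT_LIST_HEAD(n);      /* key step: node now tests as "off the list" */
}

int main(void)
{
	struct list_head lru, node;

	INIT_LIST_HEAD(&lru);
	INIT_LIST_HEAD(&node);          /* fresh inode: not on the LRU */
	assert(list_empty(&node));

	list_add(&node, &lru);          /* inode_lru_list_add() path */
	assert(!list_empty(&node));

	list_del_init(&node);           /* inode_lru_list_del() path */
	assert(list_empty(&node));      /* membership test works again */
	return 0;
}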
... ... @@ -367,7 +378,7 @@
367 378 struct inode *inode;
368 379  
369 380 inode = list_first_entry(head, struct inode, i_list);
370   - list_del(&inode->i_list);
  381 + list_del_init(&inode->i_list);
371 382  
372 383 evict(inode);
373 384  
... ... @@ -413,7 +424,8 @@
413 424 list_move(&inode->i_list, dispose);
414 425 WARN_ON(inode->i_state & I_NEW);
415 426 inode->i_state |= I_FREEING;
416   - percpu_counter_dec(&nr_inodes_unused);
  427 + if (!(inode->i_state & (I_DIRTY | I_SYNC)))
  428 + percpu_counter_dec(&nr_inodes_unused);
417 429 continue;
418 430 }
419 431 busy = 1;
... ... @@ -448,7 +460,7 @@
448 460  
449 461 static int can_unuse(struct inode *inode)
450 462 {
451   - if (inode->i_state)
  463 + if (inode->i_state & ~I_REFERENCED)
452 464 return 0;
453 465 if (inode_has_buffers(inode))
454 466 return 0;
455 467  
456 468  
... ... @@ -460,17 +472,20 @@
460 472 }
461 473  
462 474 /*
463   - * Scan `goal' inodes on the unused list for freeable ones. They are moved to
464   - * a temporary list and then are freed outside inode_lock by dispose_list().
  475 + * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
  476 + * temporary list and then are freed outside inode_lock by dispose_list().
465 477 *
466 478 * Any inodes which are pinned purely because of attached pagecache have their
467   - * pagecache removed. We expect the final iput() on that inode to add it to
468   - * the front of the inode_unused list. So look for it there and if the
469   - * inode is still freeable, proceed. The right inode is found 99.9% of the
470   - * time in testing on a 4-way.
  479 + * pagecache removed. If the inode has metadata buffers attached to
  480 + * mapping->private_list then try to remove them.
471 481 *
472   - * If the inode has metadata buffers attached to mapping->private_list then
473   - * try to remove them.
  482 + * If the inode has the I_REFERENCED flag set, then it means that it has been
  483 + * used recently - the flag is set in iput_final(). When we encounter such an
  484 + * inode, clear the flag and move it to the back of the LRU so it gets another
  485 + * pass through the LRU before it gets reclaimed. This is necessary because
  486 + * we are doing lazy LRU updates to minimise lock contention, so the LRU does
  487 + * not have strict ordering. Hence we don't want to reclaim inodes with this
  488 + * flag set, because they are the ones that are out of order.
474 489 */
475 490 static void prune_icache(int nr_to_scan)
476 491 {
477 492  
... ... @@ -488,8 +503,21 @@
488 503  
489 504 inode = list_entry(inode_unused.prev, struct inode, i_list);
490 505  
491   - if (inode->i_state || atomic_read(&inode->i_count)) {
  506 + /*
  507 + * Referenced or dirty inodes are still in use. Give them
  508 + * another pass through the LRU as we cannot reclaim them now.
  509 + */
  510 + if (atomic_read(&inode->i_count) ||
  511 + (inode->i_state & ~I_REFERENCED)) {
  512 + list_del_init(&inode->i_list);
  513 + percpu_counter_dec(&nr_inodes_unused);
  514 + continue;
  515 + }
  516 +
  517 + /* recently referenced inodes get one more pass */
  518 + if (inode->i_state & I_REFERENCED) {
492 519 list_move(&inode->i_list, &inode_unused);
  520 + inode->i_state &= ~I_REFERENCED;
493 521 continue;
494 522 }
495 523 if (inode_has_buffers(inode) || inode->i_data.nrpages) {
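
Read together, the top of the per-inode step in prune_icache() now looks
roughly like this -- a reconstruction from the hunks above, not the complete
function; inode_lock is held and the enclosing loop is bounded by nr_to_scan.

inode = list_entry(inode_unused.prev, struct inode, i_list);

/* In-use or dirty: do the deferred removal from the LRU here. */
if (atomic_read(&inode->i_count) ||
    (inode->i_state & ~I_REFERENCED)) {
	list_del_init(&inode->i_list);
	percpu_counter_dec(&nr_inodes_unused);
	continue;
}

/* Recently referenced: clear the flag, give it one more pass. */
if (inode->i_state & I_REFERENCED) {
	list_move(&inode->i_list, &inode_unused);
	inode->i_state &= ~I_REFERENCED;
	continue;
}

/* Otherwise fall through to the buffer/pagecache checks below. */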
... ... @@ -620,7 +648,6 @@
620 648 __inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
621 649 struct inode *inode)
622 650 {
623   - list_add(&inode->i_list, &inode_in_use);
624 651 list_add(&inode->i_sb_list, &sb->s_inodes);
625 652 if (head)
626 653 hlist_add_head(&inode->i_hash, head);
627 654  
... ... @@ -1237,10 +1264,11 @@
1237 1264 drop = generic_drop_inode(inode);
1238 1265  
1239 1266 if (!drop) {
1240   - if (!(inode->i_state & (I_DIRTY|I_SYNC)))
1241   - list_move(&inode->i_list, &inode_unused);
1242   - percpu_counter_inc(&nr_inodes_unused);
1243 1267 if (sb->s_flags & MS_ACTIVE) {
  1268 + inode->i_state |= I_REFERENCED;
  1269 + if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
  1270 + inode_lru_list_add(inode);
  1271 + }
1244 1272 spin_unlock(&inode_lock);
1245 1273 return;
1246 1274 }
1247 1275  
1248 1276  
... ... @@ -1251,13 +1279,19 @@
1251 1279 spin_lock(&inode_lock);
1252 1280 WARN_ON(inode->i_state & I_NEW);
1253 1281 inode->i_state &= ~I_WILL_FREE;
1254   - percpu_counter_dec(&nr_inodes_unused);
1255 1282 hlist_del_init(&inode->i_hash);
1256 1283 }
1257   - list_del_init(&inode->i_list);
1258   - list_del_init(&inode->i_sb_list);
1259 1284 WARN_ON(inode->i_state & I_NEW);
1260 1285 inode->i_state |= I_FREEING;
  1286 +
  1287 + /*
  1288 + * After we delete the inode from the LRU here, dirty inodes cannot be
  1289 + * moved back onto it: I_FREEING is now set, so writeback_single_inode()
  1290 + * won't move the inode around.
  1291 + */
  1292 + inode_lru_list_del(inode);
  1293 +
  1294 + list_del_init(&inode->i_sb_list);
1261 1295 spin_unlock(&inode_lock);
1262 1296 evict(inode);
1263 1297 spin_lock(&inode_lock);
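
For orientation, here is the tail of iput_final() with both hunks applied --
an abridged reconstruction; the I_WILL_FREE/write_inode_now() branch between
the two hunks is elided.

if (!drop) {
	if (sb->s_flags & MS_ACTIVE) {
		inode->i_state |= I_REFERENCED;
		if (!(inode->i_state & (I_DIRTY|I_SYNC)))
			inode_lru_list_add(inode);
		spin_unlock(&inode_lock);
		return;
	}
	/* ... I_WILL_FREE / write_inode_now() path elided ... */
}

WARN_ON(inode->i_state & I_NEW);
inode->i_state |= I_FREEING;

/*
 * With I_FREEING set, writeback_single_inode() will not move the
 * inode back onto the LRU, so it is safe to drop it here.
 */
inode_lru_list_del(inode);
list_del_init(&inode->i_sb_list);
spin_unlock(&inode_lock);
evict(inode);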
include/linux/fs.h
... ... @@ -1641,16 +1641,17 @@
1641 1641 *
1642 1642 * Q: What is the difference between I_WILL_FREE and I_FREEING?
1643 1643 */
1644   -#define I_DIRTY_SYNC 1
1645   -#define I_DIRTY_DATASYNC 2
1646   -#define I_DIRTY_PAGES 4
  1644 +#define I_DIRTY_SYNC (1 << 0)
  1645 +#define I_DIRTY_DATASYNC (1 << 1)
  1646 +#define I_DIRTY_PAGES (1 << 2)
1647 1647 #define __I_NEW 3
1648 1648 #define I_NEW (1 << __I_NEW)
1649   -#define I_WILL_FREE 16
1650   -#define I_FREEING 32
1651   -#define I_CLEAR 64
  1649 +#define I_WILL_FREE (1 << 4)
  1650 +#define I_FREEING (1 << 5)
  1651 +#define I_CLEAR (1 << 6)
1652 1652 #define __I_SYNC 7
1653 1653 #define I_SYNC (1 << __I_SYNC)
  1654 +#define I_REFERENCED (1 << 8)
1654 1655  
1655 1656 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
1656 1657  
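
The flag conversion in include/linux/fs.h is value-preserving: __I_NEW and
__I_SYNC stay as bit numbers because they double as wait-bit arguments, and
I_REFERENCED takes the next free bit after I_SYNC. A scratch-file check (C11
static_assert, restating the defines so it compiles standalone):

#include <assert.h>	/* C11 static_assert */

#define I_DIRTY_SYNC		(1 << 0)
#define I_DIRTY_DATASYNC	(1 << 1)
#define I_DIRTY_PAGES		(1 << 2)
#define __I_NEW			3	/* bit number: used as a wait bit */
#define I_NEW			(1 << __I_NEW)
#define I_WILL_FREE		(1 << 4)
#define I_FREEING		(1 << 5)
#define I_CLEAR			(1 << 6)
#define __I_SYNC		7	/* bit number: used as a wait bit */
#define I_SYNC			(1 << __I_SYNC)
#define I_REFERENCED		(1 << 8)

static_assert(I_DIRTY_SYNC == 1 && I_DIRTY_DATASYNC == 2 &&
	      I_DIRTY_PAGES == 4 && I_NEW == 8, "low bits unchanged");
static_assert(I_WILL_FREE == 16 && I_FREEING == 32 &&
	      I_CLEAR == 64 && I_SYNC == 128, "high bits unchanged");
static_assert(I_REFERENCED == 256, "new flag in the next free bit");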
include/linux/writeback.h
... ... @@ -10,8 +10,6 @@
10 10 struct backing_dev_info;
11 11  
12 12 extern spinlock_t inode_lock;
13   -extern struct list_head inode_in_use;
14   -extern struct list_head inode_unused;
15 13  
16 14 /*
17 15 * fs/fs-writeback.c