Commit 91b0abe36a7b2b3b02d7500925a5f8455334f0e5

Authored by Johannes Weiner
Committed by Linus Torvalds
1 parent 0cd6144aad

mm + fs: store shadow entries in page cache

Reclaim will be leaving shadow entries in the page cache radix tree upon
evicting the real page.  Reclaim finds those pages through the LRU and
cannot take part in regular inode lifetime management, so a concurrent
final iput() can free the inode while reclaim is still working on one of
its pages.  Once inode teardown has begun, reclaim must no longer install
shadow entries, because the inode freeing code needs to ensure the page
tree is really empty.

Add an address_space flag, AS_EXITING, that the inode freeing code sets
before doing the final truncate; it then cycles the tree lock so that any
tree modification that did not see the flag has completed.  Reclaim will
check for this flag before installing shadow entries.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Rik van Riel <riel@redhat.com>
Reviewed-by: Minchan Kim <minchan@kernel.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Bob Liu <bob.liu@oracle.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Luigi Semenzato <semenzato@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Metin Doslu <metin@citusdata.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Ozgun Erdogan <ozgun@citusdata.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Roman Gushchin <klamm@yandex-team.ru>
Cc: Ryan Mallon <rmallon@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 50 changed files with 147 additions and 65 deletions Side-by-side Diff

Documentation/filesystems/porting
... ... @@ -295,9 +295,9 @@
295 295 ->clear_inode() and ->delete_inode() are gone; ->evict_inode() should
296 296 be used instead. It gets called whenever the inode is evicted, whether it has
297 297 remaining links or not. Caller does *not* evict the pagecache or inode-associated
298   -metadata buffers; getting rid of those is responsibility of method, as it had
299   -been for ->delete_inode(). Caller makes sure async writeback cannot be running
300   -for the inode while (or after) ->evict_inode() is called.
  298 +metadata buffers; the method has to use truncate_inode_pages_final() to get rid
  299 +of those. Caller makes sure async writeback cannot be running for the inode while
  300 +(or after) ->evict_inode() is called.
301 301  
302 302 ->drop_inode() returns int now; it's called on final iput() with
303 303 inode->i_lock held and it returns true if filesystems wants the inode to be
drivers/staging/lustre/lustre/llite/llite_lib.c
... ... @@ -1877,7 +1877,7 @@
1877 1877 cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
1878 1878 CL_FSYNC_DISCARD, 1);
1879 1879  
1880   - truncate_inode_pages(&inode->i_data, 0);
  1880 + truncate_inode_pages_final(&inode->i_data);
1881 1881  
1882 1882 /* Workaround for LU-118 */
1883 1883 if (inode->i_data.nrpages) {
... ... @@ -451,7 +451,7 @@
451 451 {
452 452 struct v9fs_inode *v9inode = V9FS_I(inode);
453 453  
454   - truncate_inode_pages(inode->i_mapping, 0);
  454 + truncate_inode_pages_final(inode->i_mapping);
455 455 clear_inode(inode);
456 456 filemap_fdatawrite(inode->i_mapping);
457 457  
... ... @@ -259,7 +259,7 @@
259 259 {
260 260 unsigned long cache_page;
261 261 pr_debug("AFFS: evict_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink);
262   - truncate_inode_pages(&inode->i_data, 0);
  262 + truncate_inode_pages_final(&inode->i_data);
263 263  
264 264 if (!inode->i_nlink) {
265 265 inode->i_size = 0;
... ... @@ -422,7 +422,7 @@
422 422  
423 423 ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode);
424 424  
425   - truncate_inode_pages(&inode->i_data, 0);
  425 + truncate_inode_pages_final(&inode->i_data);
426 426 clear_inode(inode);
427 427  
428 428 afs_give_up_callback(vnode);
... ... @@ -172,7 +172,7 @@
172 172  
173 173 dprintf("ino=%08lx\n", ino);
174 174  
175   - truncate_inode_pages(&inode->i_data, 0);
  175 + truncate_inode_pages_final(&inode->i_data);
176 176 invalidate_inode_buffers(inode);
177 177 clear_inode(inode);
178 178  
... ... @@ -83,7 +83,7 @@
83 83 {
84 84 struct address_space *mapping = bdev->bd_inode->i_mapping;
85 85  
86   - if (mapping->nrpages == 0)
  86 + if (mapping->nrpages == 0 && mapping->nrshadows == 0)
87 87 return;
88 88  
89 89 invalidate_bh_lrus();
... ... @@ -419,7 +419,7 @@
419 419 {
420 420 struct block_device *bdev = &BDEV_I(inode)->bdev;
421 421 struct list_head *p;
422   - truncate_inode_pages(&inode->i_data, 0);
  422 + truncate_inode_pages_final(&inode->i_data);
423 423 invalidate_inode_buffers(inode); /* is it needed here? */
424 424 clear_inode(inode);
425 425 spin_lock(&bdev_lock);
... ... @@ -4593,7 +4593,7 @@
4593 4593 struct rb_node *node;
4594 4594  
4595 4595 ASSERT(inode->i_state & I_FREEING);
4596   - truncate_inode_pages(&inode->i_data, 0);
  4596 + truncate_inode_pages_final(&inode->i_data);
4597 4597  
4598 4598 write_lock(&map_tree->lock);
4599 4599 while (!RB_EMPTY_ROOT(&map_tree->map)) {
... ... @@ -286,7 +286,7 @@
286 286 static void
287 287 cifs_evict_inode(struct inode *inode)
288 288 {
289   - truncate_inode_pages(&inode->i_data, 0);
  289 + truncate_inode_pages_final(&inode->i_data);
290 290 clear_inode(inode);
291 291 cifs_fscache_release_inode_cookie(inode);
292 292 }
... ... @@ -250,7 +250,7 @@
250 250  
251 251 static void coda_evict_inode(struct inode *inode)
252 252 {
253   - truncate_inode_pages(&inode->i_data, 0);
  253 + truncate_inode_pages_final(&inode->i_data);
254 254 clear_inode(inode);
255 255 coda_cache_clear_inode(inode);
256 256 }
... ... @@ -132,7 +132,7 @@
132 132 */
133 133 static void ecryptfs_evict_inode(struct inode *inode)
134 134 {
135   - truncate_inode_pages(&inode->i_data, 0);
  135 + truncate_inode_pages_final(&inode->i_data);
136 136 clear_inode(inode);
137 137 iput(ecryptfs_inode_to_lower(inode));
138 138 }
... ... @@ -1486,7 +1486,7 @@
1486 1486 struct ore_io_state *ios;
1487 1487 int ret;
1488 1488  
1489   - truncate_inode_pages(&inode->i_data, 0);
  1489 + truncate_inode_pages_final(&inode->i_data);
1490 1490  
1491 1491 /* TODO: should do better here */
1492 1492 if (inode->i_nlink || is_bad_inode(inode))
... ... @@ -78,7 +78,7 @@
78 78 dquot_drop(inode);
79 79 }
80 80  
81   - truncate_inode_pages(&inode->i_data, 0);
  81 + truncate_inode_pages_final(&inode->i_data);
82 82  
83 83 if (want_delete) {
84 84 sb_start_intwrite(inode->i_sb);
... ... @@ -228,7 +228,7 @@
228 228 log_wait_commit(journal, commit_tid);
229 229 filemap_write_and_wait(&inode->i_data);
230 230 }
231   - truncate_inode_pages(&inode->i_data, 0);
  231 + truncate_inode_pages_final(&inode->i_data);
232 232  
233 233 ext3_discard_reservation(inode);
234 234 rsv = ei->i_block_alloc_info;
... ... @@ -215,7 +215,7 @@
215 215 jbd2_complete_transaction(journal, commit_tid);
216 216 filemap_write_and_wait(&inode->i_data);
217 217 }
218   - truncate_inode_pages(&inode->i_data, 0);
  218 + truncate_inode_pages_final(&inode->i_data);
219 219  
220 220 WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
221 221 goto no_delete;
... ... @@ -226,7 +226,7 @@
226 226  
227 227 if (ext4_should_order_data(inode))
228 228 ext4_begin_ordered_truncate(inode, 0);
229   - truncate_inode_pages(&inode->i_data, 0);
  229 + truncate_inode_pages_final(&inode->i_data);
230 230  
231 231 WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
232 232 if (is_bad_inode(inode))
... ... @@ -260,7 +260,7 @@
260 260 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
261 261  
262 262 trace_f2fs_evict_inode(inode);
263   - truncate_inode_pages(&inode->i_data, 0);
  263 + truncate_inode_pages_final(&inode->i_data);
264 264  
265 265 if (inode->i_ino == F2FS_NODE_INO(sbi) ||
266 266 inode->i_ino == F2FS_META_INO(sbi))
... ... @@ -490,7 +490,7 @@
490 490  
491 491 static void fat_evict_inode(struct inode *inode)
492 492 {
493   - truncate_inode_pages(&inode->i_data, 0);
  493 + truncate_inode_pages_final(&inode->i_data);
494 494 if (!inode->i_nlink) {
495 495 inode->i_size = 0;
496 496 fat_truncate_blocks(inode, 0);
fs/freevxfs/vxfs_inode.c
... ... @@ -354,7 +354,7 @@
354 354 void
355 355 vxfs_evict_inode(struct inode *ip)
356 356 {
357   - truncate_inode_pages(&ip->i_data, 0);
  357 + truncate_inode_pages_final(&ip->i_data);
358 358 clear_inode(ip);
359 359 call_rcu(&ip->i_rcu, vxfs_i_callback);
360 360 }
... ... @@ -123,7 +123,7 @@
123 123  
124 124 static void fuse_evict_inode(struct inode *inode)
125 125 {
126   - truncate_inode_pages(&inode->i_data, 0);
  126 + truncate_inode_pages_final(&inode->i_data);
127 127 clear_inode(inode);
128 128 if (inode->i_sb->s_flags & MS_ACTIVE) {
129 129 struct fuse_conn *fc = get_fuse_conn(inode);
... ... @@ -1558,7 +1558,7 @@
1558 1558 fs_warn(sdp, "gfs2_evict_inode: %d\n", error);
1559 1559 out:
1560 1560 /* Case 3 starts here */
1561   - truncate_inode_pages(&inode->i_data, 0);
  1561 + truncate_inode_pages_final(&inode->i_data);
1562 1562 gfs2_rs_delete(ip, NULL);
1563 1563 gfs2_ordered_del_inode(ip);
1564 1564 clear_inode(inode);
... ... @@ -547,7 +547,7 @@
547 547  
548 548 void hfs_evict_inode(struct inode *inode)
549 549 {
550   - truncate_inode_pages(&inode->i_data, 0);
  550 + truncate_inode_pages_final(&inode->i_data);
551 551 clear_inode(inode);
552 552 if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) {
553 553 HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
... ... @@ -161,7 +161,7 @@
161 161 static void hfsplus_evict_inode(struct inode *inode)
162 162 {
163 163 hfs_dbg(INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino);
164   - truncate_inode_pages(&inode->i_data, 0);
  164 + truncate_inode_pages_final(&inode->i_data);
165 165 clear_inode(inode);
166 166 if (HFSPLUS_IS_RSRC(inode)) {
167 167 HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
fs/hostfs/hostfs_kern.c
... ... @@ -230,7 +230,7 @@
230 230  
231 231 static void hostfs_evict_inode(struct inode *inode)
232 232 {
233   - truncate_inode_pages(&inode->i_data, 0);
  233 + truncate_inode_pages_final(&inode->i_data);
234 234 clear_inode(inode);
235 235 if (HOSTFS_I(inode)->fd != -1) {
236 236 close_file(&HOSTFS_I(inode)->fd);
... ... @@ -304,7 +304,7 @@
304 304  
305 305 void hpfs_evict_inode(struct inode *inode)
306 306 {
307   - truncate_inode_pages(&inode->i_data, 0);
  307 + truncate_inode_pages_final(&inode->i_data);
308 308 clear_inode(inode);
309 309 if (!inode->i_nlink) {
310 310 hpfs_lock(inode->i_sb);
... ... @@ -503,6 +503,7 @@
503 503 */
504 504 spin_lock_irq(&inode->i_data.tree_lock);
505 505 BUG_ON(inode->i_data.nrpages);
  506 + BUG_ON(inode->i_data.nrshadows);
506 507 spin_unlock_irq(&inode->i_data.tree_lock);
507 508 BUG_ON(!list_empty(&inode->i_data.private_list));
508 509 BUG_ON(!(inode->i_state & I_FREEING));
... ... @@ -548,8 +549,7 @@
548 549 if (op->evict_inode) {
549 550 op->evict_inode(inode);
550 551 } else {
551   - if (inode->i_data.nrpages)
552   - truncate_inode_pages(&inode->i_data, 0);
  552 + truncate_inode_pages_final(&inode->i_data);
553 553 clear_inode(inode);
554 554 }
555 555 if (S_ISBLK(inode->i_mode) && inode->i_bdev)
... ... @@ -242,7 +242,7 @@
242 242  
243 243 jffs2_dbg(1, "%s(): ino #%lu mode %o\n",
244 244 __func__, inode->i_ino, inode->i_mode);
245   - truncate_inode_pages(&inode->i_data, 0);
  245 + truncate_inode_pages_final(&inode->i_data);
246 246 clear_inode(inode);
247 247 jffs2_do_clear_inode(c, f);
248 248 }
... ... @@ -154,7 +154,7 @@
154 154 dquot_initialize(inode);
155 155  
156 156 if (JFS_IP(inode)->fileset == FILESYSTEM_I) {
157   - truncate_inode_pages(&inode->i_data, 0);
  157 + truncate_inode_pages_final(&inode->i_data);
158 158  
159 159 if (test_cflag(COMMIT_Freewmap, inode))
160 160 jfs_free_zero_link(inode);
... ... @@ -168,7 +168,7 @@
168 168 dquot_free_inode(inode);
169 169 }
170 170 } else {
171   - truncate_inode_pages(&inode->i_data, 0);
  171 + truncate_inode_pages_final(&inode->i_data);
172 172 }
173 173 clear_inode(inode);
174 174 dquot_drop(inode);
... ... @@ -355,7 +355,7 @@
355 355 {
356 356 struct kernfs_node *kn = inode->i_private;
357 357  
358   - truncate_inode_pages(&inode->i_data, 0);
  358 + truncate_inode_pages_final(&inode->i_data);
359 359 clear_inode(inode);
360 360 kernfs_put(kn);
361 361 }
fs/logfs/readwrite.c
... ... @@ -2180,7 +2180,7 @@
2180 2180 do_delete_inode(inode);
2181 2181 }
2182 2182 }
2183   - truncate_inode_pages(&inode->i_data, 0);
  2183 + truncate_inode_pages_final(&inode->i_data);
2184 2184 clear_inode(inode);
2185 2185  
2186 2186 /* Cheaper version of write_inode. All changes are concealed in
... ... @@ -26,7 +26,7 @@
26 26  
27 27 static void minix_evict_inode(struct inode *inode)
28 28 {
29   - truncate_inode_pages(&inode->i_data, 0);
  29 + truncate_inode_pages_final(&inode->i_data);
30 30 if (!inode->i_nlink) {
31 31 inode->i_size = 0;
32 32 minix_truncate(inode);
... ... @@ -296,7 +296,7 @@
296 296 static void
297 297 ncp_evict_inode(struct inode *inode)
298 298 {
299   - truncate_inode_pages(&inode->i_data, 0);
  299 + truncate_inode_pages_final(&inode->i_data);
300 300 clear_inode(inode);
301 301  
302 302 if (S_ISDIR(inode->i_mode)) {
... ... @@ -128,7 +128,7 @@
128 128  
129 129 void nfs_evict_inode(struct inode *inode)
130 130 {
131   - truncate_inode_pages(&inode->i_data, 0);
  131 + truncate_inode_pages_final(&inode->i_data);
132 132 clear_inode(inode);
133 133 nfs_clear_inode(inode);
134 134 }
... ... @@ -90,7 +90,7 @@
90 90 */
91 91 static void nfs4_evict_inode(struct inode *inode)
92 92 {
93   - truncate_inode_pages(&inode->i_data, 0);
  93 + truncate_inode_pages_final(&inode->i_data);
94 94 clear_inode(inode);
95 95 pnfs_return_layout(inode);
96 96 pnfs_destroy_layout(NFS_I(inode));
... ... @@ -783,16 +783,14 @@
783 783 int ret;
784 784  
785 785 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) {
786   - if (inode->i_data.nrpages)
787   - truncate_inode_pages(&inode->i_data, 0);
  786 + truncate_inode_pages_final(&inode->i_data);
788 787 clear_inode(inode);
789 788 nilfs_clear_inode(inode);
790 789 return;
791 790 }
792 791 nilfs_transaction_begin(sb, &ti, 0); /* never fails */
793 792  
794   - if (inode->i_data.nrpages)
795   - truncate_inode_pages(&inode->i_data, 0);
  793 + truncate_inode_pages_final(&inode->i_data);
796 794  
797 795 /* TODO: some of the following operations may fail. */
798 796 nilfs_truncate_bmap(ii, 0);
... ... @@ -2259,7 +2259,7 @@
2259 2259 {
2260 2260 ntfs_inode *ni = NTFS_I(vi);
2261 2261  
2262   - truncate_inode_pages(&vi->i_data, 0);
  2262 + truncate_inode_pages_final(&vi->i_data);
2263 2263 clear_inode(vi);
2264 2264  
2265 2265 #ifdef NTFS_RW
... ... @@ -964,7 +964,7 @@
964 964 (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data);
965 965 if (sync_data)
966 966 filemap_write_and_wait(inode->i_mapping);
967   - truncate_inode_pages(&inode->i_data, 0);
  967 + truncate_inode_pages_final(&inode->i_data);
968 968 }
969 969  
970 970 static void ocfs2_delete_inode(struct inode *inode)
... ... @@ -1181,7 +1181,7 @@
1181 1181 (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) {
1182 1182 ocfs2_delete_inode(inode);
1183 1183 } else {
1184   - truncate_inode_pages(&inode->i_data, 0);
  1184 + truncate_inode_pages_final(&inode->i_data);
1185 1185 }
1186 1186 ocfs2_clear_inode(inode);
1187 1187 }
... ... @@ -183,7 +183,7 @@
183 183 */
184 184 static void omfs_evict_inode(struct inode *inode)
185 185 {
186   - truncate_inode_pages(&inode->i_data, 0);
  186 + truncate_inode_pages_final(&inode->i_data);
187 187 clear_inode(inode);
188 188  
189 189 if (inode->i_nlink)
... ... @@ -35,7 +35,7 @@
35 35 const struct proc_ns_operations *ns_ops;
36 36 void *ns;
37 37  
38   - truncate_inode_pages(&inode->i_data, 0);
  38 + truncate_inode_pages_final(&inode->i_data);
39 39 clear_inode(inode);
40 40  
41 41 /* Stop tracking associated processes */
... ... @@ -35,7 +35,7 @@
35 35 if (!inode->i_nlink && !is_bad_inode(inode))
36 36 dquot_initialize(inode);
37 37  
38   - truncate_inode_pages(&inode->i_data, 0);
  38 + truncate_inode_pages_final(&inode->i_data);
39 39 if (inode->i_nlink)
40 40 goto no_delete;
41 41  
... ... @@ -295,7 +295,7 @@
295 295  
296 296 static void sysv_evict_inode(struct inode *inode)
297 297 {
298   - truncate_inode_pages(&inode->i_data, 0);
  298 + truncate_inode_pages_final(&inode->i_data);
299 299 if (!inode->i_nlink) {
300 300 inode->i_size = 0;
301 301 sysv_truncate(inode);
... ... @@ -351,7 +351,7 @@
351 351 dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode);
352 352 ubifs_assert(!atomic_read(&inode->i_count));
353 353  
354   - truncate_inode_pages(&inode->i_data, 0);
  354 + truncate_inode_pages_final(&inode->i_data);
355 355  
356 356 if (inode->i_nlink)
357 357 goto done;
... ... @@ -146,8 +146,8 @@
146 146 want_delete = 1;
147 147 udf_setsize(inode, 0);
148 148 udf_update_inode(inode, IS_SYNC(inode));
149   - } else
150   - truncate_inode_pages(&inode->i_data, 0);
  149 + }
  150 + truncate_inode_pages_final(&inode->i_data);
151 151 invalidate_inode_buffers(inode);
152 152 clear_inode(inode);
153 153 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
... ... @@ -885,7 +885,7 @@
885 885 if (!inode->i_nlink && !is_bad_inode(inode))
886 886 want_delete = 1;
887 887  
888   - truncate_inode_pages(&inode->i_data, 0);
  888 + truncate_inode_pages_final(&inode->i_data);
889 889 if (want_delete) {
890 890 loff_t old_i_size;
891 891 /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/
... ... @@ -996,7 +996,7 @@
996 996  
997 997 trace_xfs_evict_inode(ip);
998 998  
999   - truncate_inode_pages(&inode->i_data, 0);
  999 + truncate_inode_pages_final(&inode->i_data);
1000 1000 clear_inode(inode);
1001 1001 XFS_STATS_INC(vn_rele);
1002 1002 XFS_STATS_INC(vn_remove);
... ... @@ -419,6 +419,7 @@
419 419 struct mutex i_mmap_mutex; /* protect tree, count, list */
420 420 /* Protected by tree_lock together with the radix tree */
421 421 unsigned long nrpages; /* number of total pages */
  422 + unsigned long nrshadows; /* number of shadow entries */
422 423 pgoff_t writeback_index;/* writeback starts here */
423 424 const struct address_space_operations *a_ops; /* methods */
424 425 unsigned long flags; /* error bits/gfp mask */
... ... @@ -1834,6 +1834,7 @@
1834 1834 extern void truncate_inode_pages(struct address_space *, loff_t);
1835 1835 extern void truncate_inode_pages_range(struct address_space *,
1836 1836 loff_t lstart, loff_t lend);
  1837 +extern void truncate_inode_pages_final(struct address_space *);
1837 1838  
1838 1839 /* generic vm_area_ops exported for stackable file systems */
1839 1840 extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);
include/linux/pagemap.h
... ... @@ -25,6 +25,7 @@
25 25 AS_MM_ALL_LOCKS = __GFP_BITS_SHIFT + 2, /* under mm_take_all_locks() */
26 26 AS_UNEVICTABLE = __GFP_BITS_SHIFT + 3, /* e.g., ramdisk, SHM_LOCK */
27 27 AS_BALLOON_MAP = __GFP_BITS_SHIFT + 4, /* balloon page special map */
  28 + AS_EXITING = __GFP_BITS_SHIFT + 5, /* final truncate in progress */
28 29 };
29 30  
30 31 static inline void mapping_set_error(struct address_space *mapping, int error)
... ... @@ -69,6 +70,16 @@
69 70 return mapping && test_bit(AS_BALLOON_MAP, &mapping->flags);
70 71 }
71 72  
  73 +static inline void mapping_set_exiting(struct address_space *mapping)
  74 +{
  75 + set_bit(AS_EXITING, &mapping->flags);
  76 +}
  77 +
  78 +static inline int mapping_exiting(struct address_space *mapping)
  79 +{
  80 + return test_bit(AS_EXITING, &mapping->flags);
  81 +}
  82 +
72 83 static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
73 84 {
74 85 return (__force gfp_t)mapping->flags & __GFP_BITS_MASK;
... ... @@ -547,7 +558,7 @@
547 558 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
548 559 pgoff_t index, gfp_t gfp_mask);
549 560 extern void delete_from_page_cache(struct page *page);
550   -extern void __delete_from_page_cache(struct page *page);
  561 +extern void __delete_from_page_cache(struct page *page, void *shadow);
551 562 int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
552 563  
553 564 /*
... ... @@ -107,12 +107,33 @@
107 107 * ->tasklist_lock (memory_failure, collect_procs_ao)
108 108 */
109 109  
  110 +static void page_cache_tree_delete(struct address_space *mapping,
  111 + struct page *page, void *shadow)
  112 +{
  113 + if (shadow) {
  114 + void **slot;
  115 +
  116 + slot = radix_tree_lookup_slot(&mapping->page_tree, page->index);
  117 + radix_tree_replace_slot(slot, shadow);
  118 + mapping->nrshadows++;
  119 + /*
  120 + * Make sure the nrshadows update is committed before
  121 + * the nrpages update so that final truncate racing
  122 + * with reclaim does not see both counters 0 at the
  123 + * same time and miss a shadow entry.
  124 + */
  125 + smp_wmb();
  126 + } else
  127 + radix_tree_delete(&mapping->page_tree, page->index);
  128 + mapping->nrpages--;
  129 +}
  130 +
110 131 /*
111 132 * Delete a page from the page cache and free it. Caller has to make
112 133 * sure the page is locked and that nobody else uses it - or that usage
113 134 * is safe. The caller must hold the mapping's tree_lock.
114 135 */
115   -void __delete_from_page_cache(struct page *page)
  136 +void __delete_from_page_cache(struct page *page, void *shadow)
116 137 {
117 138 struct address_space *mapping = page->mapping;
118 139  
119 140  
... ... @@ -127,10 +148,11 @@
127 148 else
128 149 cleancache_invalidate_page(mapping, page);
129 150  
130   - radix_tree_delete(&mapping->page_tree, page->index);
  151 + page_cache_tree_delete(mapping, page, shadow);
  152 +
131 153 page->mapping = NULL;
132 154 /* Leave page->index set: truncation lookup relies upon it */
133   - mapping->nrpages--;
  155 +
134 156 __dec_zone_page_state(page, NR_FILE_PAGES);
135 157 if (PageSwapBacked(page))
136 158 __dec_zone_page_state(page, NR_SHMEM);
... ... @@ -166,7 +188,7 @@
166 188  
167 189 freepage = mapping->a_ops->freepage;
168 190 spin_lock_irq(&mapping->tree_lock);
169   - __delete_from_page_cache(page);
  191 + __delete_from_page_cache(page, NULL);
170 192 spin_unlock_irq(&mapping->tree_lock);
171 193 mem_cgroup_uncharge_cache_page(page);
172 194  
... ... @@ -426,7 +448,7 @@
426 448 new->index = offset;
427 449  
428 450 spin_lock_irq(&mapping->tree_lock);
429   - __delete_from_page_cache(old);
  451 + __delete_from_page_cache(old, NULL);
430 452 error = radix_tree_insert(&mapping->page_tree, offset, new);
431 453 BUG_ON(error);
432 454 mapping->nrpages++;
... ... @@ -460,6 +482,7 @@
460 482 if (!radix_tree_exceptional_entry(p))
461 483 return -EEXIST;
462 484 radix_tree_replace_slot(slot, page);
  485 + mapping->nrshadows--;
463 486 mapping->nrpages++;
464 487 return 0;
465 488 }
... ... @@ -35,7 +35,8 @@
35 35 * without the tree itself locked. These unlocked entries
36 36 * need verification under the tree lock.
37 37 */
38   - radix_tree_delete_item(&mapping->page_tree, index, entry);
  38 + if (radix_tree_delete_item(&mapping->page_tree, index, entry) == entry)
  39 + mapping->nrshadows--;
39 40 spin_unlock_irq(&mapping->tree_lock);
40 41 }
41 42  
... ... @@ -229,7 +230,7 @@
229 230 int i;
230 231  
231 232 cleancache_invalidate_inode(mapping);
232   - if (mapping->nrpages == 0)
  233 + if (mapping->nrpages == 0 && mapping->nrshadows == 0)
233 234 return;
234 235  
235 236 /* Offsets within partial pages */
... ... @@ -392,6 +393,53 @@
392 393 EXPORT_SYMBOL(truncate_inode_pages);
393 394  
394 395 /**
  396 + * truncate_inode_pages_final - truncate *all* pages before inode dies
  397 + * @mapping: mapping to truncate
  398 + *
  399 + * Called under (and serialized by) inode->i_mutex.
  400 + *
  401 + * Filesystems have to use this in the .evict_inode path to inform the
  402 + * VM that this is the final truncate and the inode is going away.
  403 + */
  404 +void truncate_inode_pages_final(struct address_space *mapping)
  405 +{
  406 + unsigned long nrshadows;
  407 + unsigned long nrpages;
  408 +
  409 + /*
  410 + * Page reclaim can not participate in regular inode lifetime
  411 + * management (can't call iput()) and thus can race with the
  412 + * inode teardown. Tell it when the address space is exiting,
  413 + * so that it does not install eviction information after the
  414 + * final truncate has begun.
  415 + */
  416 + mapping_set_exiting(mapping);
  417 +
  418 + /*
  419 + * When reclaim installs eviction entries, it increases
  420 + * nrshadows first, then decreases nrpages. Make sure we see
  421 + * this in the right order or we might miss an entry.
  422 + */
  423 + nrpages = mapping->nrpages;
  424 + smp_rmb();
  425 + nrshadows = mapping->nrshadows;
  426 +
  427 + if (nrpages || nrshadows) {
  428 + /*
  429 + * As truncation uses a lockless tree lookup, cycle
  430 + * the tree lock to make sure any ongoing tree
  431 + * modification that does not see AS_EXITING is
  432 + * completed before starting the final truncate.
  433 + */
  434 + spin_lock_irq(&mapping->tree_lock);
  435 + spin_unlock_irq(&mapping->tree_lock);
  436 +
  437 + truncate_inode_pages(mapping, 0);
  438 + }
  439 +}
  440 +EXPORT_SYMBOL(truncate_inode_pages_final);
  441 +
  442 +/**
395 443 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
396 444 * @mapping: the address_space which holds the pages to invalidate
397 445 * @start: the offset 'from' which to invalidate
... ... @@ -484,7 +532,7 @@
484 532 goto failed;
485 533  
486 534 BUG_ON(page_has_private(page));
487   - __delete_from_page_cache(page);
  535 + __delete_from_page_cache(page, NULL);
488 536 spin_unlock_irq(&mapping->tree_lock);
489 537 mem_cgroup_uncharge_cache_page(page);
490 538  
... ... @@ -572,7 +572,7 @@
572 572  
573 573 freepage = mapping->a_ops->freepage;
574 574  
575   - __delete_from_page_cache(page);
  575 + __delete_from_page_cache(page, NULL);
576 576 spin_unlock_irq(&mapping->tree_lock);
577 577 mem_cgroup_uncharge_cache_page(page);
578 578