fs/xfs/xfs_sync.c
  /*
   * Copyright (c) 2000-2005 Silicon Graphics, Inc.
   * All Rights Reserved.
   *
   * This program is free software; you can redistribute it and/or
   * modify it under the terms of the GNU General Public License as
   * published by the Free Software Foundation.
   *
   * This program is distributed in the hope that it would be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   * GNU General Public License for more details.
   *
   * You should have received a copy of the GNU General Public License
   * along with this program; if not, write the Free Software Foundation,
   * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
   */
  #include "xfs.h"
  #include "xfs_fs.h"
  #include "xfs_types.h"
  #include "xfs_bit.h"
  #include "xfs_log.h"
  #include "xfs_inum.h"
  #include "xfs_trans.h"
  #include "xfs_trans_priv.h"
  #include "xfs_sb.h"
  #include "xfs_ag.h"
  #include "xfs_mount.h"
  #include "xfs_bmap_btree.h"
  #include "xfs_inode.h"
  #include "xfs_dinode.h"
  #include "xfs_error.h"
  #include "xfs_filestream.h"
  #include "xfs_vnodeops.h"
  #include "xfs_inode_item.h"
  #include "xfs_quota.h"
  #include "xfs_trace.h"
  #include "xfs_fsops.h"

  #include <linux/kthread.h>
  #include <linux/freezer.h>
  struct workqueue_struct	*xfs_syncd_wq;	/* sync workqueue */
  /*
   * The inode lookup is done in batches to keep the amount of lock traffic and
   * radix tree lookups to a minimum. The batch size is a trade off between
   * lookup reduction and stack usage. This is in the reclaim path, so we can't
   * be too greedy.
   */
  #define XFS_LOOKUP_BATCH	32
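
  /*
   * Try to take a stable reference to an inode found during an AG walk.
   * Returns 0 on success; otherwise the caller should skip this inode
   * because it is new, stale, already queued for reclaim, or the
   * filesystem has been shut down.
   */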
  STATIC int
  xfs_inode_ag_walk_grab(
  	struct xfs_inode	*ip)
  {
  	struct inode		*inode = VFS_I(ip);
  	ASSERT(rcu_read_lock_held());
  
  	/*
  	 * check for stale RCU freed inode
  	 *
  	 * If the inode has been reallocated, it doesn't matter if it's not in
  	 * the AG we are walking - we are walking for writeback, so if it
  	 * passes all the "valid inode" checks and is dirty, then we'll write
  	 * it back anyway.  If it has been reallocated and is still being
  	 * initialised, the XFS_INEW check below will catch it.
  	 */
  	spin_lock(&ip->i_flags_lock);
  	if (!ip->i_ino)
  		goto out_unlock_noent;
  
  	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
  	if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
  		goto out_unlock_noent;
  	spin_unlock(&ip->i_flags_lock);
  	/* nothing to sync during shutdown */
  	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
  		return EFSCORRUPTED;
  	/* If we can't grab the inode, it must be on its way to reclaim. */
  	if (!igrab(inode))
  		return ENOENT;
  
  	if (is_bad_inode(inode)) {
  		IRELE(ip);
  		return ENOENT;
  	}
  
  	/* inode is valid */
  	return 0;
  
  out_unlock_noent:
  	spin_unlock(&ip->i_flags_lock);
  	return ENOENT;
  }
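
  /*
   * Walk all in-memory inodes in a single AG and call @execute on each one
   * we can grab.  Lookups are batched under RCU to keep radix tree traffic
   * down; EAGAIN from @execute marks the inode as skipped and the walk is
   * restarted once the current batch has been processed.
   */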
  STATIC int
  xfs_inode_ag_walk(
  	struct xfs_mount	*mp,
  	struct xfs_perag	*pag,
  	int			(*execute)(struct xfs_inode *ip,
  					   struct xfs_perag *pag, int flags),
  	int			flags)
  {
  	uint32_t		first_index;
  	int			last_error = 0;
  	int			skipped;
  	int			done;
  	int			nr_found;

  restart:
  	done = 0;
  	skipped = 0;
  	first_index = 0;
  	nr_found = 0;
  	do {
  		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
  		int		error = 0;
  		int		i;

  		rcu_read_lock();
  		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
  					(void **)batch, first_index,
  					XFS_LOOKUP_BATCH);
  		if (!nr_found) {
  			rcu_read_unlock();
  			break;
  		}

  		/*
  		 * Grab the inodes before we drop the lock. If we found
  		 * nothing, nr == 0 and the loop will be skipped.
  		 */
  		for (i = 0; i < nr_found; i++) {
  			struct xfs_inode *ip = batch[i];

  			if (done || xfs_inode_ag_walk_grab(ip))
  				batch[i] = NULL;

  			/*
  			 * Update the index for the next lookup. Catch
  			 * overflows into the next AG range which can occur if
  			 * we have inodes in the last block of the AG and we
  			 * are currently pointing to the last inode.
  			 *
  			 * Because we may see inodes that are from the wrong AG
  			 * due to RCU freeing and reallocation, only update the
  			 * index if it lies in this AG. It was a race that led
  			 * us to see this inode, so another lookup from the
  			 * same index will not find it again.
  			 */
  			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
  				continue;
  			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
  			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
  				done = 1;
  		}

  		/* unlock now we've grabbed the inodes. */
  		rcu_read_unlock();

  		for (i = 0; i < nr_found; i++) {
  			if (!batch[i])
  				continue;
  			error = execute(batch[i], pag, flags);
  			IRELE(batch[i]);
  			if (error == EAGAIN) {
  				skipped++;
  				continue;
  			}
  			if (error && last_error != EFSCORRUPTED)
  				last_error = error;
  		}

  		/* bail out if the filesystem is corrupted.  */
  		if (error == EFSCORRUPTED)
  			break;

  		cond_resched();

  	} while (nr_found && !done);

  	if (skipped) {
  		delay(1);
  		goto restart;
  	}
  	return last_error;
  }
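
  /*
   * Iterate over all AGs in the filesystem and run @execute on every inode
   * that can be grabbed.  The first EFSCORRUPTED error aborts the iteration;
   * any other error is remembered and returned once the walk completes.
   */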
  int
  xfs_inode_ag_iterator(
  	struct xfs_mount	*mp,
  	int			(*execute)(struct xfs_inode *ip,
  					   struct xfs_perag *pag, int flags),
  	int			flags)
  {
  	struct xfs_perag	*pag;
  	int			error = 0;
  	int			last_error = 0;
  	xfs_agnumber_t		ag;

  	ag = 0;
  	while ((pag = xfs_perag_get(mp, ag))) {
  		ag = pag->pag_agno + 1;
  		error = xfs_inode_ag_walk(mp, pag, execute, flags);
  		xfs_perag_put(pag);
  		if (error) {
  			last_error = error;
  			if (error == EFSCORRUPTED)
  				break;
  		}
  	}
  	return XFS_ERROR(last_error);
  }
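
  /*
   * Write back an inode's dirty pagecache data.  With SYNC_TRYLOCK the inode
   * is skipped if its iolock cannot be taken immediately; with SYNC_WAIT the
   * page flush itself is performed synchronously.
   */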
  STATIC int
  xfs_sync_inode_data(
  	struct xfs_inode	*ip,
  	struct xfs_perag	*pag,
  	int			flags)
  {
  	struct inode		*inode = VFS_I(ip);
  	struct address_space *mapping = inode->i_mapping;
  	int			error = 0;

  	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
  		return 0;

  	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
  		if (flags & SYNC_TRYLOCK)
  			return 0;
  		xfs_ilock(ip, XFS_IOLOCK_SHARED);
  	}

  	error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
  				0 : XBF_ASYNC, FI_NONE);
  	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
  	return error;
  }
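
  /*
   * Flush the inode's in-core metadata back to its cluster buffer.  Without
   * SYNC_WAIT, inodes whose flush lock cannot be taken immediately are left
   * alone for a later pass.
   */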
  STATIC int
  xfs_sync_inode_attr(
  	struct xfs_inode	*ip,
  	struct xfs_perag	*pag,
  	int			flags)
  {
  	int			error = 0;

  	xfs_ilock(ip, XFS_ILOCK_SHARED);
  	if (xfs_inode_clean(ip))
  		goto out_unlock;
  	if (!xfs_iflock_nowait(ip)) {
  		if (!(flags & SYNC_WAIT))
  			goto out_unlock;
  		xfs_iflock(ip);
  	}

  	if (xfs_inode_clean(ip)) {
  		xfs_ifunlock(ip);
  		goto out_unlock;
  	}

  	error = xfs_iflush(ip, flags);

  	/*
  	 * We don't want to try again on non-blocking flushes that can't run
  	 * again immediately. If an inode really must be written, then that's
  	 * what the SYNC_WAIT flag is for.
  	 */
  	if (error == EAGAIN) {
  		ASSERT(!(flags & SYNC_WAIT));
  		error = 0;
  	}

   out_unlock:
  	xfs_iunlock(ip, XFS_ILOCK_SHARED);
  	return error;
  }

  /*
   * Write out pagecache data for the whole filesystem.
   */
  STATIC int
  xfs_sync_data(
  	struct xfs_mount	*mp,
  	int			flags)
  {
  	int			error;

  	ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);

  	error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
  	if (error)
  		return XFS_ERROR(error);

  	xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
  	return 0;
  }

  /*
   * Write out inode metadata (attributes) for the whole filesystem.
   */
  STATIC int
  xfs_sync_attr(
  	struct xfs_mount	*mp,
  	int			flags)
  {
  	ASSERT((flags & ~SYNC_WAIT) == 0);

  	return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags);
  }
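
  /*
   * Write the in-core superblock back to the on-disk superblock buffer.
   */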
  STATIC int
  xfs_sync_fsdata(
  	struct xfs_mount	*mp)
  {
  	struct xfs_buf		*bp;
  	int			error;

  	/*
  	 * If the buffer is pinned then push on the log so we won't get stuck
  	 * waiting in the write for someone, maybe ourselves, to flush the log.
  	 *
  	 * Even though we just pushed the log above, we did not have the
  	 * superblock buffer locked at that point so it can become pinned in
  	 * between there and here.
  	 */
  	bp = xfs_getsb(mp, 0);
  	if (xfs_buf_ispinned(bp))
  		xfs_log_force(mp, 0);

  	error = xfs_bwrite(bp);
  	xfs_buf_relse(bp);
  	return error;
  }
  
  /*
   * When remounting a filesystem read-only or freezing the filesystem, we have
   * two phases to execute. This first phase is syncing the data before we
   * quiesce the filesystem, and the second is flushing all the inodes out after
   * we've waited for all the transactions created by the first phase to
   * complete. The second phase ensures that the inodes are written to their
   * location on disk rather than just existing in transactions in the log. This
   * means after a quiesce there is no log replay required to write the inodes to
   * disk (this is the main difference between a sync and a quiesce).
   */
  /*
   * First stage of freeze - no writers will make progress now we are here,
   * so we flush delwri and delalloc buffers here, then wait for all I/O to
   * complete.  Data is frozen at that point. Metadata is not frozen,
   * transactions can still occur here so don't bother flushing the buftarg
   * because it'll just get dirty again.
   */
  int
  xfs_quiesce_data(
  	struct xfs_mount	*mp)
  {
  	int			error, error2 = 0;

  	xfs_qm_sync(mp, SYNC_TRYLOCK);
  	xfs_qm_sync(mp, SYNC_WAIT);

  	/* force out the newly dirtied log buffers */
  	xfs_log_force(mp, XFS_LOG_SYNC);

  	/* write superblock and hoover up shutdown errors */
  	error = xfs_sync_fsdata(mp);

  	/* make sure all delwri buffers are written out */
  	xfs_flush_buftarg(mp->m_ddev_targp, 1);

  	/* mark the log as covered if needed */
  	if (xfs_log_need_covered(mp))
  		error2 = xfs_fs_log_dummy(mp);

  	/* flush data-only devices */
  	if (mp->m_rtdev_targp)
  		xfs_flush_buftarg(mp->m_rtdev_targp, 1);

  	return error ? error : error2;
  }
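
  /*
   * First part of the metadata quiesce: reclaim inodes and write back dirty
   * metadata, iterating until a flush pass leaves no pinned buffers behind.
   */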
  STATIC void
  xfs_quiesce_fs(
  	struct xfs_mount	*mp)
  {
  	int	count = 0, pincount;

  	xfs_reclaim_inodes(mp, 0);
  	xfs_flush_buftarg(mp->m_ddev_targp, 0);

  	/*
  	 * This loop must run at least twice.  The first instance of the loop
  	 * will flush most metadata but that will generate more metadata
  	 * (typically directory updates), which then must be flushed and
  	 * logged before we can write the unmount record. We also do sync
  	 * reclaim of inodes to catch any that the above delwri flush skipped.
  	 */
  	do {
  		xfs_reclaim_inodes(mp, SYNC_WAIT);
  		xfs_sync_attr(mp, SYNC_WAIT);
  		pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
  		if (!pincount) {
  			delay(50);
  			count++;
  		}
  	} while (count < 2);
  }
  
  /*
   * Second stage of a quiesce. The data is already synced, now we have to take
   * care of the metadata. New transactions are already blocked, so we need to
   * wait for any remaining transactions to drain out before proceeding.
   */
  void
  xfs_quiesce_attr(
  	struct xfs_mount	*mp)
  {
  	int	error = 0;

  	/* wait for all modifications to complete */
  	while (atomic_read(&mp->m_active_trans) > 0)
  		delay(100);

  	/* flush inodes and push all remaining buffers out to disk */
  	xfs_quiesce_fs(mp);

  	/*
  	 * Just warn here till VFS can correctly support
  	 * read-only remount without racing.
  	 */
  	WARN_ON(atomic_read(&mp->m_active_trans) != 0);

  	/* Push the superblock and write an unmount record */
  	error = xfs_log_sbcount(mp);
  	if (error)
  		xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
  				"Frozen image may not be consistent.");
  	xfs_log_unmount_write(mp);
  	xfs_unmountfs_writesb(mp);
  }
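
  /*
   * (Re)arm the periodic sync work to run again one xfs_syncd_centisecs
   * interval from now.
   */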
  static void
  xfs_syncd_queue_sync(
  	struct xfs_mount        *mp)
  {
  	queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
  				msecs_to_jiffies(xfs_syncd_centisecs * 10));
  }

  /*
   * Every sync period we need to unpin all items, reclaim inodes and sync
   * disk quotas.  We might need to cover the log to indicate that the
   * filesystem is idle and not frozen.
   */
  STATIC void
  xfs_sync_worker(
  	struct work_struct *work)
  {
  	struct xfs_mount *mp = container_of(to_delayed_work(work),
  					struct xfs_mount, m_sync_work);
  	int		error;

  	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
  		/* dgc: errors ignored here */
  		if (mp->m_super->s_frozen == SB_UNFROZEN &&
  		    xfs_log_need_covered(mp))
  			error = xfs_fs_log_dummy(mp);
  		else
  			xfs_log_force(mp, 0);
  		error = xfs_qm_sync(mp, SYNC_TRYLOCK);

  		/* start pushing all the metadata that is currently dirty */
  		xfs_ail_push_all(mp->m_ail);
  	}

  	/* queue us up again */
  	xfs_syncd_queue_sync(mp);
  }

  /*
   * Queue a new inode reclaim pass if there are reclaimable inodes and there
   * isn't a reclaim pass already in progress. By default it runs every 5s based
   * on the xfs syncd work default of 30s. Perhaps this should have its own
   * tunable, but that can be done if this method proves to be ineffective or too
   * aggressive.
   */
  static void
  xfs_syncd_queue_reclaim(
  	struct xfs_mount        *mp)
  {

  	/*
  	 * We can have inodes enter reclaim after we've shut down the syncd
  	 * workqueue during unmount, so don't allow reclaim work to be queued
  	 * during unmount.
  	 */
  	if (!(mp->m_super->s_flags & MS_ACTIVE))
  		return;

  	rcu_read_lock();
  	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
  		queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
  			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
  	}
  	rcu_read_unlock();
  }

  /*
   * This is a fast pass over the inode cache to try to get reclaim moving on as
   * many inodes as possible in a short period of time. It kicks itself every few
   * seconds, as well as being kicked by the inode cache shrinker when memory
   * goes low. It scans as quickly as possible avoiding locked inodes or those
   * already being flushed, and once done schedules a future pass.
   */
  STATIC void
  xfs_reclaim_worker(
  	struct work_struct *work)
  {
  	struct xfs_mount *mp = container_of(to_delayed_work(work),
  					struct xfs_mount, m_reclaim_work);
  
  	xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
  	xfs_syncd_queue_reclaim(mp);
  }
  
  /*
   * Flush delayed allocate data, attempting to free up reserved space
   * from existing allocations.  At this point a new allocation attempt
   * has failed with ENOSPC and we are in the process of scratching our
   * heads, looking about for more room.
   *
   * Queue a new data flush if there isn't one already in progress and
   * wait for completion of the flush. This means that we only ever have one
   * inode flush in progress no matter how many ENOSPC events are occurring and
   * so will prevent the system from bogging down due to every concurrent
   * ENOSPC event scanning all the active inodes in the system for writeback.
   */
  void
  xfs_flush_inodes(
  	struct xfs_inode	*ip)
  {
  	struct xfs_mount	*mp = ip->i_mount;
  
  	queue_work(xfs_syncd_wq, &mp->m_flush_work);
  	flush_work_sync(&mp->m_flush_work);
  }
  
  STATIC void
  xfs_flush_worker(
  	struct work_struct *work)
  {
  	struct xfs_mount *mp = container_of(work,
  					struct xfs_mount, m_flush_work);
  
  	xfs_sync_data(mp, SYNC_TRYLOCK);
  	xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
  }

  int
  xfs_syncd_init(
  	struct xfs_mount	*mp)
  {
  	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
  	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
  	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
  	xfs_syncd_queue_sync(mp);
  	xfs_syncd_queue_reclaim(mp);

  	return 0;
  }

  void
  xfs_syncd_stop(
  	struct xfs_mount	*mp)
  {
  	cancel_delayed_work_sync(&mp->m_sync_work);
  	cancel_delayed_work_sync(&mp->m_reclaim_work);
  	cancel_work_sync(&mp->m_flush_work);
  }
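
  /*
   * Tag an inode as reclaimable in its per-AG radix tree.  The first
   * reclaimable inode in an AG also tags the AG in the per-mount radix tree
   * and kicks off background reclaim.  The caller holds the per-AG inode
   * lock and the inode flags lock (see xfs_inode_set_reclaim_tag below).
   */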
  void
  __xfs_inode_set_reclaim_tag(
  	struct xfs_perag	*pag,
  	struct xfs_inode	*ip)
  {
  	radix_tree_tag_set(&pag->pag_ici_root,
  			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
  			   XFS_ICI_RECLAIM_TAG);
  
  	if (!pag->pag_ici_reclaimable) {
  		/* propagate the reclaim tag up into the perag radix tree */
  		spin_lock(&ip->i_mount->m_perag_lock);
  		radix_tree_tag_set(&ip->i_mount->m_perag_tree,
  				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
  				XFS_ICI_RECLAIM_TAG);
  		spin_unlock(&ip->i_mount->m_perag_lock);
  
  		/* schedule periodic background inode reclaim */
  		xfs_syncd_queue_reclaim(ip->i_mount);
  		trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
  							-1, _RET_IP_);
  	}
  	pag->pag_ici_reclaimable++;
  }

  /*
   * We set the inode flag atomically with the radix tree tag.
   * Once we get tag lookups on the radix tree, this inode flag
   * can go away.
   */
  void
  xfs_inode_set_reclaim_tag(
  	xfs_inode_t	*ip)
  {
  	struct xfs_mount *mp = ip->i_mount;
  	struct xfs_perag *pag;

  	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
  	spin_lock(&pag->pag_ici_lock);
  	spin_lock(&ip->i_flags_lock);
  	__xfs_inode_set_reclaim_tag(pag, ip);
  	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
  	spin_unlock(&ip->i_flags_lock);
  	spin_unlock(&pag->pag_ici_lock);
  	xfs_perag_put(pag);
  }
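
  /*
   * Account for an inode leaving reclaim; when the last reclaimable inode in
   * the AG goes away, clear the AG's reclaim tag in the per-mount radix tree.
   */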
  STATIC void
  __xfs_inode_clear_reclaim(
  	xfs_perag_t	*pag,
  	xfs_inode_t	*ip)
  {
  	pag->pag_ici_reclaimable--;
  	if (!pag->pag_ici_reclaimable) {
  		/* clear the reclaim tag from the perag radix tree */
  		spin_lock(&ip->i_mount->m_perag_lock);
  		radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
  				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
  				XFS_ICI_RECLAIM_TAG);
  		spin_unlock(&ip->i_mount->m_perag_lock);
  		trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
  							-1, _RET_IP_);
  	}
  }
  void
  __xfs_inode_clear_reclaim_tag(
  	xfs_mount_t	*mp,
  	xfs_perag_t	*pag,
  	xfs_inode_t	*ip)
  {
  	radix_tree_tag_clear(&pag->pag_ici_root,
  			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
  	__xfs_inode_clear_reclaim(pag, ip);
  }

  /*
   * Grab the inode for reclaim exclusively.
   * Return 0 if we grabbed it, non-zero otherwise.
   */
  STATIC int
  xfs_reclaim_inode_grab(
  	struct xfs_inode	*ip,
  	int			flags)
  {
  	ASSERT(rcu_read_lock_held());

  	/* quick check for stale RCU freed inode */
  	if (!ip->i_ino)
  		return 1;

  	/*
  	 * Do some unlocked checks first to avoid unnecessary lock traffic.
  	 * The first is a flush lock check, the second is an already-in-reclaim
  	 * check. Only do these checks if we are not going to block on locks.
  	 */
  	if ((flags & SYNC_TRYLOCK) &&
  	    (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) {
  		return 1;
  	}

  	/*
  	 * The radix tree lock here protects a thread in xfs_iget from racing
  	 * with us starting reclaim on the inode.  Once we have the
  	 * XFS_IRECLAIM flag set it will not touch us.
  	 *
  	 * Due to RCU lookup, we may find inodes that have been freed and only
  	 * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
  	 * aren't candidates for reclaim at all, so we must check that
  	 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
  	 */
  	spin_lock(&ip->i_flags_lock);
  	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
  	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
  		/* not a reclaim candidate. */
  		spin_unlock(&ip->i_flags_lock);
  		return 1;
  	}
  	__xfs_iflags_set(ip, XFS_IRECLAIM);
  	spin_unlock(&ip->i_flags_lock);
  	return 0;
  }
  
  /*
   * Inodes in different states need to be treated differently, and the return
   * value of xfs_iflush is not sufficient to get this right. The following table
   * lists the inode states and the reclaim actions necessary for non-blocking
   * reclaim:
   *
   *
   *	inode state	     iflush ret		required action
   *      ---------------      ----------         ---------------
   *	bad			-		reclaim
   *	shutdown		EIO		unpin and reclaim
   *	clean, unpinned		0		reclaim
   *	stale, unpinned		0		reclaim
   *	clean, pinned(*)	0		requeue
   *	stale, pinned		EAGAIN		requeue
   *	dirty, delwri ok	0		requeue
   *	dirty, delwri blocked	EAGAIN		requeue
   *	dirty, sync flush	0		reclaim
   *
   * (*) dgc: I don't think the clean, pinned state is possible but it gets
   * handled anyway given the order of checks implemented.
   *
   * As can be seen from the table, the return value of xfs_iflush() is not
   * sufficient to correctly decide the reclaim action here. The checks in
   * xfs_iflush() might look like duplicates, but they are not.
   *
   * Also, because we get the flush lock first, we know that any inode that has
   * been flushed delwri has had the flush completed by the time we check that
   * the inode is clean. The clean inode check needs to be done before flushing
   * the inode delwri otherwise we would loop forever requeuing clean inodes as
   * we cannot tell apart a successful delwri flush and a clean inode from the
   * return value of xfs_iflush().
   *
   * Note that because the inode is flushed delayed write by background
   * writeback, the flush lock may already be held here and waiting on it can
   * result in very long latencies. Hence for sync reclaims, where we wait on the
   * flush lock, the caller should push out delayed write inodes first before
   * trying to reclaim them to minimise the amount of time spent waiting. For
   * background reclaim, we just requeue the inode for the next pass.
   *
   * Hence the order of actions after gaining the locks should be:
   *	bad		=> reclaim
   *	shutdown	=> unpin and reclaim
   *	pinned, delwri	=> requeue
   *	pinned, sync	=> unpin
   *	stale		=> reclaim
   *	clean		=> reclaim
   *	dirty, delwri	=> flush and requeue
   *	dirty, sync	=> flush, wait and reclaim
   */
  STATIC int
  xfs_reclaim_inode(
  	struct xfs_inode	*ip,
  	struct xfs_perag	*pag,
  	int			sync_mode)
  {
  	int	error;

  restart:
  	error = 0;
  	xfs_ilock(ip, XFS_ILOCK_EXCL);
  	if (!xfs_iflock_nowait(ip)) {
  		if (!(sync_mode & SYNC_WAIT))
  			goto out;
  
  		/*
  		 * If we only have a single dirty inode in a cluster there is
  		 * a fair chance that the AIL push may have pushed it into
  		 * the buffer, but xfsbufd won't touch it until 30 seconds
  		 * from now, and thus we will lock up here.
  		 *
  		 * Promote the inode buffer to the front of the delwri list
  		 * and wake up xfsbufd now.
  		 */
  		xfs_promote_inode(ip);
  		xfs_iflock(ip);
  	}

  	if (is_bad_inode(VFS_I(ip)))
  		goto reclaim;
  	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
  		xfs_iunpin_wait(ip);
  		goto reclaim;
  	}
  	if (xfs_ipincount(ip)) {
  		if (!(sync_mode & SYNC_WAIT)) {
  			xfs_ifunlock(ip);
  			goto out;
  		}
  		xfs_iunpin_wait(ip);
  	}
  	if (xfs_iflags_test(ip, XFS_ISTALE))
  		goto reclaim;
  	if (xfs_inode_clean(ip))
  		goto reclaim;
  	/*
  	 * Now we have an inode that needs flushing.
  	 *
  	 * We do a nonblocking flush here even if we are doing a SYNC_WAIT
  	 * reclaim as we can deadlock with inode cluster removal.
  	 * xfs_ifree_cluster() can lock the inode buffer before it locks the
  	 * ip->i_lock, and we are doing the exact opposite here. As a result,
  	 * doing a blocking xfs_itobp() to get the cluster buffer will result
  	 * in an ABBA deadlock with xfs_ifree_cluster().
  	 *
  	 * As xfs_ifree_cluster() must gather all inodes that are active in the
  	 * cache to mark them stale, if we hit this case we don't actually want
  	 * to do IO here - we want the inode marked stale so we can simply
  	 * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush,
  	 * just unlock the inode, back off and try again. Hopefully the next
  	 * pass through will see the stale flag set on the inode.
  	 */
  	error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode);
  	if (sync_mode & SYNC_WAIT) {
  		if (error == EAGAIN) {
  			xfs_iunlock(ip, XFS_ILOCK_EXCL);
  			/* backoff longer than in xfs_ifree_cluster */
  			delay(2);
  			goto restart;
  		}
  		xfs_iflock(ip);
  		goto reclaim;
  	}
  	/*
  	 * When we have to flush an inode but don't have SYNC_WAIT set, we
  	 * flush the inode out using a delwri buffer and wait for the next
  	 * call into reclaim to find it in a clean state instead of waiting for
  	 * it now. We also don't return errors here - if the error is transient
  	 * then the next reclaim pass will flush the inode, and if the error
  	 * is permanent then the next sync reclaim will reclaim the inode and
  	 * pass on the error.
  	 */
  	if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
  		xfs_warn(ip->i_mount,
  			"inode 0x%llx background reclaim flush failed with %d",
  			(long long)ip->i_ino, error);
  	}
  out:
  	xfs_iflags_clear(ip, XFS_IRECLAIM);
  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
  	/*
  	 * We could return EAGAIN here to make reclaim rescan the inode tree in
  	 * a short while. However, this just burns CPU time scanning the tree
  	 * waiting for IO to complete and xfssyncd never goes back to the idle
  	 * state. Instead, return 0 to let the next scheduled background reclaim
  	 * attempt to reclaim the inode again.
  	 */
  	return 0;
  reclaim:
  	xfs_ifunlock(ip);
  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
  
  	XFS_STATS_INC(xs_ig_reclaims);
  	/*
  	 * Remove the inode from the per-AG radix tree.
  	 *
  	 * Because radix_tree_delete won't complain even if the item was never
  	 * added to the tree, assert that it's been there before to catch
  	 * problems with the inode lifetime early on.
  	 */
  	spin_lock(&pag->pag_ici_lock);
  	if (!radix_tree_delete(&pag->pag_ici_root,
  				XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
  		ASSERT(0);
  	__xfs_inode_clear_reclaim(pag, ip);
  	spin_unlock(&pag->pag_ici_lock);
  
  	/*
  	 * Here we do an (almost) spurious inode lock in order to coordinate
  	 * with inode cache radix tree lookups.  This is because the lookup
  	 * can reference the inodes in the cache without taking references.
  	 *
  	 * We make that OK here by ensuring that we wait until the inode is
  	 * unlocked after the lookup before we go ahead and free it.  We get
  	 * both the ilock and the iolock because the code may need to drop the
  	 * ilock but will still hold the iolock.
  	 */
  	xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
  	xfs_qm_dqdetach(ip);
  	xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
  
  	xfs_inode_free(ip);
  	return error;
  }

  /*
   * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
   * corrupted, we still want to try to reclaim all the inodes. If we don't,
   * then a shutdown during the filesystem unmount reclaim walk will leak all
   * the unreclaimed inodes.
   */
  int
  xfs_reclaim_inodes_ag(
  	struct xfs_mount	*mp,
  	int			flags,
  	int			*nr_to_scan)
  {
  	struct xfs_perag	*pag;
  	int			error = 0;
  	int			last_error = 0;
  	xfs_agnumber_t		ag;
  	int			trylock = flags & SYNC_TRYLOCK;
  	int			skipped;

  restart:
  	ag = 0;
  	skipped = 0;
  	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
  		unsigned long	first_index = 0;
  		int		done = 0;
  		int		nr_found = 0;

  		ag = pag->pag_agno + 1;
  		if (trylock) {
  			if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
  				skipped++;
  				xfs_perag_put(pag);
  				continue;
  			}
  			first_index = pag->pag_ici_reclaim_cursor;
  		} else
  			mutex_lock(&pag->pag_ici_reclaim_lock);

  		do {
  			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
  			int	i;

  			rcu_read_lock();
  			nr_found = radix_tree_gang_lookup_tag(
  					&pag->pag_ici_root,
  					(void **)batch, first_index,
  					XFS_LOOKUP_BATCH,
  					XFS_ICI_RECLAIM_TAG);
  			if (!nr_found) {
  				done = 1;
  				rcu_read_unlock();
  				break;
  			}

  			/*
  			 * Grab the inodes before we drop the lock. If we found
  			 * nothing, nr == 0 and the loop will be skipped.
  			 */
  			for (i = 0; i < nr_found; i++) {
  				struct xfs_inode *ip = batch[i];

  				if (done || xfs_reclaim_inode_grab(ip, flags))
  					batch[i] = NULL;

  				/*
  				 * Update the index for the next lookup. Catch
  				 * overflows into the next AG range which can
  				 * occur if we have inodes in the last block of
  				 * the AG and we are currently pointing to the
  				 * last inode.
  				 *
  				 * Because we may see inodes that are from the
  				 * wrong AG due to RCU freeing and
  				 * reallocation, only update the index if it
  				 * lies in this AG. It was a race that led us
  				 * to see this inode, so another lookup from
  				 * the same index will not find it again.
  				 */
  				if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
  								pag->pag_agno)
  					continue;
  				first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
  				if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
  					done = 1;
  			}

  			/* unlock now we've grabbed the inodes. */
  			rcu_read_unlock();

  			for (i = 0; i < nr_found; i++) {
  				if (!batch[i])
  					continue;
  				error = xfs_reclaim_inode(batch[i], pag, flags);
  				if (error && last_error != EFSCORRUPTED)
  					last_error = error;
  			}

  			*nr_to_scan -= XFS_LOOKUP_BATCH;

  			cond_resched();
  		} while (nr_found && !done && *nr_to_scan > 0);

  		if (trylock && !done)
  			pag->pag_ici_reclaim_cursor = first_index;
  		else
  			pag->pag_ici_reclaim_cursor = 0;
  		mutex_unlock(&pag->pag_ici_reclaim_lock);
  		xfs_perag_put(pag);
  	}

  	/*
  	 * If we skipped any AG, and we still have scan count remaining, do
  	 * another pass this time using blocking reclaim semantics (i.e.
  	 * waiting on the reclaim locks and ignoring the reclaim cursors). This
  	 * ensures that when we get more reclaimers than AGs we block rather
  	 * than spin trying to execute reclaim.
  	 */
  	if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
  		trylock = 0;
  		goto restart;
  	}
  	return XFS_ERROR(last_error);
  }
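
  /*
   * Reclaim every reclaimable inode in the filesystem.  @mode is the set of
   * SYNC_* flags passed through to the per-AG reclaim walk; SYNC_WAIT makes
   * reclaim block on dirty inodes.
   */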
  int
  xfs_reclaim_inodes(
  	xfs_mount_t	*mp,
  	int		mode)
  {
  	int		nr_to_scan = INT_MAX;

  	return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
  }
  
  /*
   * Scan a certain number of inodes for reclaim.
   *
   * When called we make sure that there is a background (fast) inode reclaim in
   * progress, while we will throttle the speed of reclaim via doing synchronous
   * reclaim of inodes. That means if we come across dirty inodes, we wait for
   * them to be cleaned, which we hope will not be very long due to the
   * background walker having already kicked the IO off on those dirty inodes.
   */
  void
  xfs_reclaim_inodes_nr(
  	struct xfs_mount	*mp,
  	int			nr_to_scan)
  {
  	/* kick background reclaimer and push the AIL */
  	xfs_syncd_queue_reclaim(mp);
  	xfs_ail_push_all(mp->m_ail);

  	xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
  }

  /*
   * Return the number of reclaimable inodes in the filesystem for
   * the shrinker to determine how much to reclaim.
   */
  int
  xfs_reclaim_inodes_count(
  	struct xfs_mount	*mp)
  {
  	struct xfs_perag	*pag;
  	xfs_agnumber_t		ag = 0;
  	int			reclaimable = 0;

  	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
  		ag = pag->pag_agno + 1;
  		reclaimable += pag->pag_ici_reclaimable;
  		xfs_perag_put(pag);
  	}
  	return reclaimable;
  }