Blame view

fs/fs-writeback.c 33 KB
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1
2
3
4
5
6
7
8
9
10
  /*
   * fs/fs-writeback.c
   *
   * Copyright (C) 2002, Linus Torvalds.
   *
   * Contains all the functions related to writing back and waiting
   * upon dirty inodes against superblocks, and writing back dirty
   * pages against inodes.  ie: data writeback.  Writeout of the
   * inode itself is not handled here.
   *
e1f8e8744   Francois Cami   Remove Andrew Mor...
11
   * 10Apr2002	Andrew Morton
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
12
13
14
15
16
   *		Split out of fs/inode.c
   *		Additions for address_space-based writeback
   */
  
  #include <linux/kernel.h>
f5ff8422b   Jens Axboe   Fix warnings with...
17
  #include <linux/module.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
18
19
20
21
  #include <linux/spinlock.h>
  #include <linux/sched.h>
  #include <linux/fs.h>
  #include <linux/mm.h>
03ba3782e   Jens Axboe   writeback: switch...
22
23
  #include <linux/kthread.h>
  #include <linux/freezer.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
24
25
26
27
  #include <linux/writeback.h>
  #include <linux/blkdev.h>
  #include <linux/backing-dev.h>
  #include <linux/buffer_head.h>
07f3f05c1   David Howells   [PATCH] BLOCK: Mo...
28
  #include "internal.h"
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
29

66f3b8e2e   Jens Axboe   writeback: move d...
30
  /* Look up the backing_dev_info attached to an inode's address_space (i_mapping). */
  #define inode_to_bdi(inode)	((inode)->i_mapping->backing_dev_info)
f11b00f3b   Adrian Bunk   fs/fs-writeback.c...
31

03ba3782e   Jens Axboe   writeback: switch...
32
  /*
d0bceac74   Jens Axboe   writeback: get ri...
33
34
35
36
37
   * We don't actually have pdflush, but this one is exported through /proc...
   */
  int nr_pdflush_threads;
  
  /*
c4a77a6c7   Jens Axboe   writeback: make w...
38
39
40
41
42
43
   * Passed into wb_writeback(), essentially a subset of writeback_control
   */
  struct wb_writeback_args {
  	long nr_pages;				/* number of pages to write */
  	struct super_block *sb;			/* write inodes from this super_block */
  	enum writeback_sync_modes sync_mode;	/* WB_SYNC_NONE or WB_SYNC_ALL */
  	/*
  	 * One-bit flags are declared unsigned: a signed one-bit bitfield can
  	 * only represent 0 and -1, so storing 1 into it is implementation
  	 * defined and reads back as -1.
  	 */
  	unsigned int for_kupdate:1;
  	unsigned int range_cyclic:1;
  	unsigned int for_background:1;		/* write until background threshold */
  };
  
  /*
03ba3782e   Jens Axboe   writeback: switch...
50
   * Work items for the bdi_writeback threads
f11b00f3b   Adrian Bunk   fs/fs-writeback.c...
51
   */
03ba3782e   Jens Axboe   writeback: switch...
52
  struct bdi_work {
8010c3b63   Jens Axboe   writeback: add co...
53
54
  	struct list_head list;		/* pending work list */
  	struct rcu_head rcu_head;	/* for RCU free/clear of work */
03ba3782e   Jens Axboe   writeback: switch...
55

8010c3b63   Jens Axboe   writeback: add co...
56
57
  	unsigned long seen;		/* threads that have seen this work */
  	atomic_t pending;		/* number of threads still to do work */
03ba3782e   Jens Axboe   writeback: switch...
58

8010c3b63   Jens Axboe   writeback: add co...
59
  	struct wb_writeback_args args;	/* writeback arguments */
03ba3782e   Jens Axboe   writeback: switch...
60

8010c3b63   Jens Axboe   writeback: add co...
61
  	unsigned long state;		/* flag bits, see WS_* */
03ba3782e   Jens Axboe   writeback: switch...
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
  };
  
  /* Bit numbers within bdi_work->state, for use with the bitops helpers */
  enum {
  	WS_USED_B = 0,		/* set by bdi_work_init(), cleared by bdi_work_clear() */
  	WS_ONSTACK_B,		/* tested by bdi_work_on_stack() */
  };
  
  /* Mask forms of the bit numbers above, for plain stores into ->state */
  #define WS_USED (1 << WS_USED_B)
  #define WS_ONSTACK (1 << WS_ONSTACK_B)
  
  /* Report whether the WS_ONSTACK_B flag bit is set in @work's state word. */
  static inline bool bdi_work_on_stack(struct bdi_work *work)
  {
  	const bool onstack = test_bit(WS_ONSTACK_B, &work->state);

  	return onstack;
  }
  
  static inline void bdi_work_init(struct bdi_work *work,
b6e51316d   Jens Axboe   writeback: separa...
78
  				 struct wb_writeback_args *args)
03ba3782e   Jens Axboe   writeback: switch...
79
80
  {
  	INIT_RCU_HEAD(&work->rcu_head);
b6e51316d   Jens Axboe   writeback: separa...
81
  	work->args = *args;
03ba3782e   Jens Axboe   writeback: switch...
82
83
  	work->state = WS_USED;
  }
f11b00f3b   Adrian Bunk   fs/fs-writeback.c...
84
85
86
87
  /**
   * writeback_in_progress - determine whether there is writeback in progress
   * @bdi: the device's backing_dev_info structure.
   *
03ba3782e   Jens Axboe   writeback: switch...
88
89
   * Determine whether there is writeback waiting to be handled against a
   * backing device.
f11b00f3b   Adrian Bunk   fs/fs-writeback.c...
90
91
92
   */
  int writeback_in_progress(struct backing_dev_info *bdi)
  {
03ba3782e   Jens Axboe   writeback: switch...
93
  	return !list_empty(&bdi->work_list);
f11b00f3b   Adrian Bunk   fs/fs-writeback.c...
94
  }
03ba3782e   Jens Axboe   writeback: switch...
95
  static void bdi_work_clear(struct bdi_work *work)
f11b00f3b   Adrian Bunk   fs/fs-writeback.c...
96
  {
03ba3782e   Jens Axboe   writeback: switch...
97
98
  	clear_bit(WS_USED_B, &work->state);
  	smp_mb__after_clear_bit();
1ef7d9aa3   Nick Piggin   writeback: fix po...
99
100
101
102
103
  	/*
  	 * work can have disappeared at this point. bit waitq functions
  	 * should be able to tolerate this, provided bdi_sched_wait does
  	 * not dereference it's pointer argument.
  	*/
03ba3782e   Jens Axboe   writeback: switch...
104
  	wake_up_bit(&work->state, WS_USED_B);
f11b00f3b   Adrian Bunk   fs/fs-writeback.c...
105
  }
03ba3782e   Jens Axboe   writeback: switch...
106
  static void bdi_work_free(struct rcu_head *head)
4195f73d1   Nick Piggin   fs: block_dump mi...
107
  {
03ba3782e   Jens Axboe   writeback: switch...
108
  	struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
4195f73d1   Nick Piggin   fs: block_dump mi...
109

03ba3782e   Jens Axboe   writeback: switch...
110
111
112
113
  	if (!bdi_work_on_stack(work))
  		kfree(work);
  	else
  		bdi_work_clear(work);
4195f73d1   Nick Piggin   fs: block_dump mi...
114
  }
03ba3782e   Jens Axboe   writeback: switch...
115
  static void wb_work_complete(struct bdi_work *work)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
116
  {
c4a77a6c7   Jens Axboe   writeback: make w...
117
  	const enum writeback_sync_modes sync_mode = work->args.sync_mode;
77b9d059c   Nick Piggin   writeback: Fix bd...
118
  	int onstack = bdi_work_on_stack(work);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
119
120
  
  	/*
03ba3782e   Jens Axboe   writeback: switch...
121
122
123
124
  	 * For allocated work, we can clear the done/seen bit right here.
  	 * For on-stack work, we need to postpone both the clear and free
  	 * to after the RCU grace period, since the stack could be invalidated
  	 * as soon as bdi_work_clear() has done the wakeup.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
125
  	 */
77b9d059c   Nick Piggin   writeback: Fix bd...
126
  	if (!onstack)
03ba3782e   Jens Axboe   writeback: switch...
127
  		bdi_work_clear(work);
77b9d059c   Nick Piggin   writeback: Fix bd...
128
  	if (sync_mode == WB_SYNC_NONE || onstack)
03ba3782e   Jens Axboe   writeback: switch...
129
130
  		call_rcu(&work->rcu_head, bdi_work_free);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
131

03ba3782e   Jens Axboe   writeback: switch...
132
133
  static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
134
  	/*
03ba3782e   Jens Axboe   writeback: switch...
135
136
  	 * The caller has retrieved the work arguments from this work,
  	 * drop our reference. If this is the last ref, delete and free it
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
137
  	 */
03ba3782e   Jens Axboe   writeback: switch...
138
139
  	if (atomic_dec_and_test(&work->pending)) {
  		struct backing_dev_info *bdi = wb->bdi;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
140

03ba3782e   Jens Axboe   writeback: switch...
141
142
143
  		spin_lock(&bdi->wb_lock);
  		list_del_rcu(&work->list);
  		spin_unlock(&bdi->wb_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
144

03ba3782e   Jens Axboe   writeback: switch...
145
146
147
  		wb_work_complete(work);
  	}
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
148

03ba3782e   Jens Axboe   writeback: switch...
149
150
  static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
  {
bcddc3f01   Jens Axboe   writeback: inline...
151
152
153
154
  	work->seen = bdi->wb_mask;
  	BUG_ON(!work->seen);
  	atomic_set(&work->pending, bdi->wb_cnt);
  	BUG_ON(!bdi->wb_cnt);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
155

bcddc3f01   Jens Axboe   writeback: inline...
156
  	/*
deed62edf   Nick Piggin   writeback: remove...
157
158
159
  	 * list_add_tail_rcu() contains the necessary barriers to
  	 * make sure the above stores are seen before the item is
  	 * noticed on the list
bcddc3f01   Jens Axboe   writeback: inline...
160
  	 */
bcddc3f01   Jens Axboe   writeback: inline...
161
162
163
  	spin_lock(&bdi->wb_lock);
  	list_add_tail_rcu(&work->list, &bdi->work_list);
  	spin_unlock(&bdi->wb_lock);
03ba3782e   Jens Axboe   writeback: switch...
164
165
166
167
168
169
170
171
172
  
  	/*
  	 * If the default thread isn't there, make sure we add it. When
  	 * it gets created and wakes up, we'll run this work.
  	 */
  	if (unlikely(list_empty_careful(&bdi->wb_list)))
  		wake_up_process(default_backing_dev_info.wb.task);
  	else {
  		struct bdi_writeback *wb = &bdi->wb;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
173

1ef7d9aa3   Nick Piggin   writeback: fix po...
174
  		if (wb->task)
03ba3782e   Jens Axboe   writeback: switch...
175
  			wake_up_process(wb->task);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
176
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
177
  }
03ba3782e   Jens Axboe   writeback: switch...
178
179
180
181
182
183
184
185
186
  /*
   * Used for on-stack allocated work items. The caller needs to wait until
   * the wb threads have acked the work before it's safe to continue.
   *
   * Sleeps in TASK_UNINTERRUPTIBLE on the WS_USED_B bit of work->state; that
   * bit is cleared, and waiters woken, by bdi_work_clear().
   */
  static void bdi_wait_on_work_clear(struct bdi_work *work)
  {
  	wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
  		    TASK_UNINTERRUPTIBLE);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
187

f11fcae84   Jens Axboe   writeback: only u...
188
  static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
b6e51316d   Jens Axboe   writeback: separa...
189
  				 struct wb_writeback_args *args)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
190
  {
03ba3782e   Jens Axboe   writeback: switch...
191
  	struct bdi_work *work;
bcddc3f01   Jens Axboe   writeback: inline...
192
193
194
195
  	/*
  	 * This is WB_SYNC_NONE writeback, so if allocation fails just
  	 * wakeup the thread for old dirty data writeback
  	 */
03ba3782e   Jens Axboe   writeback: switch...
196
  	work = kmalloc(sizeof(*work), GFP_ATOMIC);
bcddc3f01   Jens Axboe   writeback: inline...
197
  	if (work) {
b6e51316d   Jens Axboe   writeback: separa...
198
  		bdi_work_init(work, args);
bcddc3f01   Jens Axboe   writeback: inline...
199
200
201
  		bdi_queue_work(bdi, work);
  	} else {
  		struct bdi_writeback *wb = &bdi->wb;
03ba3782e   Jens Axboe   writeback: switch...
202

bcddc3f01   Jens Axboe   writeback: inline...
203
204
205
  		if (wb->task)
  			wake_up_process(wb->task);
  	}
03ba3782e   Jens Axboe   writeback: switch...
206
  }
b6e51316d   Jens Axboe   writeback: separa...
207
208
209
210
211
212
213
214
215
216
217
218
  /**
   * bdi_sync_writeback - start and wait for writeback
   * @bdi: the backing device to write from
   * @sb: write inodes from this super_block
   *
   * Description:
   *   This does WB_SYNC_ALL data integrity writeback and waits for the
   *   IO to complete. Callers must hold the sb s_umount semaphore for
   *   reading, to avoid having the super disappear before we are done.
   */
  static void bdi_sync_writeback(struct backing_dev_info *bdi,
  			       struct super_block *sb)
03ba3782e   Jens Axboe   writeback: switch...
219
  {
b6e51316d   Jens Axboe   writeback: separa...
220
221
222
223
224
225
226
  	struct wb_writeback_args args = {
  		.sb		= sb,
  		.sync_mode	= WB_SYNC_ALL,
  		.nr_pages	= LONG_MAX,
  		.range_cyclic	= 0,
  	};
  	struct bdi_work work;
03ba3782e   Jens Axboe   writeback: switch...
227

b6e51316d   Jens Axboe   writeback: separa...
228
229
  	bdi_work_init(&work, &args);
  	work.state |= WS_ONSTACK;
03ba3782e   Jens Axboe   writeback: switch...
230

b6e51316d   Jens Axboe   writeback: separa...
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
  	bdi_queue_work(bdi, &work);
  	bdi_wait_on_work_clear(&work);
  }
  
  /**
   * bdi_start_writeback - start writeback
   * @bdi: the backing device to write from
   * @sb: write inodes from this super_block
   * @nr_pages: the number of pages to write; 0 requests background writeback
   *
   * Description:
   *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
   *   started when this function returns, we make no guarantees on
   *   completion. Caller need not hold sb s_umount semaphore.
   *
   */
a72bfd4de   Jens Axboe   writeback: pass i...
246
247
  void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
  			 long nr_pages)
b6e51316d   Jens Axboe   writeback: separa...
248
249
  {
  	struct wb_writeback_args args = {
a72bfd4de   Jens Axboe   writeback: pass i...
250
  		.sb		= sb,
b6e51316d   Jens Axboe   writeback: separa...
251
252
253
254
  		.sync_mode	= WB_SYNC_NONE,
  		.nr_pages	= nr_pages,
  		.range_cyclic	= 1,
  	};
d3ddec763   Wu Fengguang   writeback: stop b...
255
256
257
258
259
260
261
262
  	/*
  	 * We treat @nr_pages=0 as the special case to do background writeback,
  	 * ie. to sync pages until the background dirty threshold is reached.
  	 */
  	if (!nr_pages) {
  		args.nr_pages = LONG_MAX;
  		args.for_background = 1;
  	}
b6e51316d   Jens Axboe   writeback: separa...
263
  	bdi_alloc_queue_work(bdi, &args);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
264
265
266
  }
  
  /*
6610a0bc8   Andrew Morton   writeback: fix ti...
267
268
269
270
   * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
   * furthest end of its backing device's b_dirty list.
   *
   * Before stamping the inode's ->dirtied_when, we check to see whether it is
66f3b8e2e   Jens Axboe   writeback: move d...
271
   * already the most-recently-dirtied inode on the b_dirty list.  If that is
6610a0bc8   Andrew Morton   writeback: fix ti...
272
273
274
275
276
   * the case then the inode must have been redirtied while it was being written
   * out and we don't reset its dirtied_when.
   */
  static void redirty_tail(struct inode *inode)
  {
03ba3782e   Jens Axboe   writeback: switch...
277
  	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
6610a0bc8   Andrew Morton   writeback: fix ti...
278

03ba3782e   Jens Axboe   writeback: switch...
279
  	if (!list_empty(&wb->b_dirty)) {
66f3b8e2e   Jens Axboe   writeback: move d...
280
  		struct inode *tail;
6610a0bc8   Andrew Morton   writeback: fix ti...
281

03ba3782e   Jens Axboe   writeback: switch...
282
  		tail = list_entry(wb->b_dirty.next, struct inode, i_list);
66f3b8e2e   Jens Axboe   writeback: move d...
283
  		if (time_before(inode->dirtied_when, tail->dirtied_when))
6610a0bc8   Andrew Morton   writeback: fix ti...
284
285
  			inode->dirtied_when = jiffies;
  	}
03ba3782e   Jens Axboe   writeback: switch...
286
  	list_move(&inode->i_list, &wb->b_dirty);
6610a0bc8   Andrew Morton   writeback: fix ti...
287
288
289
  }
  
  /*
66f3b8e2e   Jens Axboe   writeback: move d...
290
   * requeue inode for re-scanning after bdi->b_io list is exhausted.
c986d1e2a   Andrew Morton   writeback: fix ti...
291
   */
0e0f4fc22   Ken Chen   writeback: fix pe...
292
  static void requeue_io(struct inode *inode)
c986d1e2a   Andrew Morton   writeback: fix ti...
293
  {
03ba3782e   Jens Axboe   writeback: switch...
294
295
296
  	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
  
  	list_move(&inode->i_list, &wb->b_more_io);
c986d1e2a   Andrew Morton   writeback: fix ti...
297
  }
1c0eeaf56   Joern Engel   introduce I_SYNC
298
299
300
301
302
303
304
305
  /*
   * Wake up any task sleeping on the __I_SYNC bit of inode->i_state, such as
   * a waiter in inode_wait_for_writeback().
   */
  static void inode_sync_complete(struct inode *inode)
  {
  	/*
  	 * Prevent speculative execution through spin_unlock(&inode_lock);
  	 */
  	smp_mb();
  	wake_up_bit(&inode->i_state, __I_SYNC);
  }
d2caa3c54   Jeff Layton   writeback: guard ...
306
307
308
309
310
311
312
313
  static bool inode_dirtied_after(struct inode *inode, unsigned long t)
  {
  	bool ret = time_after(inode->dirtied_when, t);
  #ifndef CONFIG_64BIT
  	/*
  	 * For inodes being constantly redirtied, dirtied_when can get stuck.
  	 * It _appears_ to be in the future, but is actually in distant past.
  	 * This test is necessary to prevent such wrapped-around relative times
5b0830cb9   Jens Axboe   writeback: get ri...
314
  	 * from permanently stopping the whole bdi writeback.
d2caa3c54   Jeff Layton   writeback: guard ...
315
316
317
318
319
  	 */
  	ret = ret && time_before_eq(inode->dirtied_when, jiffies);
  #endif
  	return ret;
  }
c986d1e2a   Andrew Morton   writeback: fix ti...
320
  /*
2c1365791   Fengguang Wu   writeback: fix ti...
321
322
323
324
325
326
   * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
   */
  static void move_expired_inodes(struct list_head *delaying_queue,
  			       struct list_head *dispatch_queue,
  				unsigned long *older_than_this)
  {
5c03449d3   Shaohua Li   writeback: move i...
327
328
  	LIST_HEAD(tmp);
  	struct list_head *pos, *node;
cf137307c   Jens Axboe   writeback: don't ...
329
  	struct super_block *sb = NULL;
5c03449d3   Shaohua Li   writeback: move i...
330
  	struct inode *inode;
cf137307c   Jens Axboe   writeback: don't ...
331
  	int do_sb_sort = 0;
5c03449d3   Shaohua Li   writeback: move i...
332

2c1365791   Fengguang Wu   writeback: fix ti...
333
  	while (!list_empty(delaying_queue)) {
5c03449d3   Shaohua Li   writeback: move i...
334
  		inode = list_entry(delaying_queue->prev, struct inode, i_list);
2c1365791   Fengguang Wu   writeback: fix ti...
335
  		if (older_than_this &&
d2caa3c54   Jeff Layton   writeback: guard ...
336
  		    inode_dirtied_after(inode, *older_than_this))
2c1365791   Fengguang Wu   writeback: fix ti...
337
  			break;
cf137307c   Jens Axboe   writeback: don't ...
338
339
340
  		if (sb && sb != inode->i_sb)
  			do_sb_sort = 1;
  		sb = inode->i_sb;
5c03449d3   Shaohua Li   writeback: move i...
341
342
  		list_move(&inode->i_list, &tmp);
  	}
cf137307c   Jens Axboe   writeback: don't ...
343
344
345
346
347
  	/* just one sb in list, splice to dispatch_queue and we're done */
  	if (!do_sb_sort) {
  		list_splice(&tmp, dispatch_queue);
  		return;
  	}
5c03449d3   Shaohua Li   writeback: move i...
348
349
350
351
352
353
354
355
356
  	/* Move inodes from one superblock together */
  	while (!list_empty(&tmp)) {
  		inode = list_entry(tmp.prev, struct inode, i_list);
  		sb = inode->i_sb;
  		list_for_each_prev_safe(pos, node, &tmp) {
  			inode = list_entry(pos, struct inode, i_list);
  			if (inode->i_sb == sb)
  				list_move(&inode->i_list, dispatch_queue);
  		}
2c1365791   Fengguang Wu   writeback: fix ti...
357
358
359
360
361
362
  	}
  }
  
  /*
   * Queue all expired dirty inodes for io, eldest first.
   */
03ba3782e   Jens Axboe   writeback: switch...
363
  static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
66f3b8e2e   Jens Axboe   writeback: move d...
364
  {
03ba3782e   Jens Axboe   writeback: switch...
365
366
  	list_splice_init(&wb->b_more_io, wb->b_io.prev);
  	move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
66f3b8e2e   Jens Axboe   writeback: move d...
367
  }
03ba3782e   Jens Axboe   writeback: switch...
368
  static int write_inode(struct inode *inode, int sync)
08d8e9749   Fengguang Wu   writeback: fix nt...
369
  {
03ba3782e   Jens Axboe   writeback: switch...
370
371
372
  	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
  		return inode->i_sb->s_op->write_inode(inode, sync);
  	return 0;
08d8e9749   Fengguang Wu   writeback: fix nt...
373
  }
08d8e9749   Fengguang Wu   writeback: fix nt...
374

2c1365791   Fengguang Wu   writeback: fix ti...
375
  /*
01c031945   Christoph Hellwig   cleanup __writeba...
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
   * Wait for writeback on an inode to complete.
   *
   * inode_lock is released around each sleep and re-acquired before I_SYNC
   * is rechecked, so the function is entered and left with inode_lock held.
   */
  static void inode_wait_for_writeback(struct inode *inode)
  {
  	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
  	wait_queue_head_t *wqh;
  
  	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
  	/* Loop: I_SYNC may have been set again by the time the lock is retaken */
  	do {
  		spin_unlock(&inode_lock);
  		__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
  		spin_lock(&inode_lock);
  	} while (inode->i_state & I_SYNC);
  }
  
  /*
   * Write out an inode's dirty pages.  Called under inode_lock.  Either the
   * caller has ref on the inode (either via __iget or via syscall against an fd)
   * or the inode has I_WILL_FREE set (via generic_forget_inode)
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
396
397
398
399
400
401
402
403
404
   * If `wait' is set, wait on the writeout.
   *
   * The whole writeout design is quite complex and fragile.  We want to avoid
   * starvation of particular inodes when others are being redirtied, prevent
   * livelocks, etc.
   *
   * Called under inode_lock.
   */
  static int
01c031945   Christoph Hellwig   cleanup __writeba...
405
  writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
406
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
407
  	struct address_space *mapping = inode->i_mapping;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
408
  	int wait = wbc->sync_mode == WB_SYNC_ALL;
01c031945   Christoph Hellwig   cleanup __writeba...
409
  	unsigned dirty;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
410
  	int ret;
01c031945   Christoph Hellwig   cleanup __writeba...
411
412
413
414
415
416
417
418
  	if (!atomic_read(&inode->i_count))
  		WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
  	else
  		WARN_ON(inode->i_state & I_WILL_FREE);
  
  	if (inode->i_state & I_SYNC) {
  		/*
  		 * If this inode is locked for writeback and we are not doing
66f3b8e2e   Jens Axboe   writeback: move d...
419
  		 * writeback-for-data-integrity, move it to b_more_io so that
01c031945   Christoph Hellwig   cleanup __writeba...
420
421
422
  		 * writeback can proceed with the other inodes on s_io.
  		 *
  		 * We'll have another go at writing back this inode when we
66f3b8e2e   Jens Axboe   writeback: move d...
423
  		 * completed a full scan of b_io.
01c031945   Christoph Hellwig   cleanup __writeba...
424
425
426
427
428
429
430
431
432
433
434
  		 */
  		if (!wait) {
  			requeue_io(inode);
  			return 0;
  		}
  
  		/*
  		 * It's a data-integrity sync.  We must wait.
  		 */
  		inode_wait_for_writeback(inode);
  	}
1c0eeaf56   Joern Engel   introduce I_SYNC
435
  	BUG_ON(inode->i_state & I_SYNC);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
436

1c0eeaf56   Joern Engel   introduce I_SYNC
437
  	/* Set I_SYNC, reset I_DIRTY */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
438
  	dirty = inode->i_state & I_DIRTY;
1c0eeaf56   Joern Engel   introduce I_SYNC
439
  	inode->i_state |= I_SYNC;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
  	inode->i_state &= ~I_DIRTY;
  
  	spin_unlock(&inode_lock);
  
  	ret = do_writepages(mapping, wbc);
  
  	/* Don't write the inode if only I_DIRTY_PAGES was set */
  	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
  		int err = write_inode(inode, wait);
  		if (ret == 0)
  			ret = err;
  	}
  
  	if (wait) {
  		int err = filemap_fdatawait(mapping);
  		if (ret == 0)
  			ret = err;
  	}
  
  	spin_lock(&inode_lock);
1c0eeaf56   Joern Engel   introduce I_SYNC
460
  	inode->i_state &= ~I_SYNC;
84a892456   Wu Fengguang   writeback: skip n...
461
  	if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
b3af9468a   Wu Fengguang   writeback: don't ...
462
  		if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) {
ae1b7f7d4   Wu Fengguang   writeback: cleanu...
463
  			/*
b3af9468a   Wu Fengguang   writeback: don't ...
464
465
466
467
468
469
470
  			 * More pages get dirtied by a fast dirtier.
  			 */
  			goto select_queue;
  		} else if (inode->i_state & I_DIRTY) {
  			/*
  			 * At least XFS will redirty the inode during the
  			 * writeback (delalloc) and on io completion (isize).
ae1b7f7d4   Wu Fengguang   writeback: cleanu...
471
472
473
  			 */
  			redirty_tail(inode);
  		} else if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
474
475
476
  			/*
  			 * We didn't write back all the pages.  nfs_writepages()
  			 * sometimes bales out without doing anything. Redirty
66f3b8e2e   Jens Axboe   writeback: move d...
477
  			 * the inode; Move it from b_io onto b_more_io/b_dirty.
1b43ef91d   Andrew Morton   writeback: fix co...
478
479
480
  			 */
  			/*
  			 * akpm: if the caller was the kupdate function we put
66f3b8e2e   Jens Axboe   writeback: move d...
481
  			 * this inode at the head of b_dirty so it gets first
1b43ef91d   Andrew Morton   writeback: fix co...
482
483
484
485
486
  			 * consideration.  Otherwise, move it to the tail, for
  			 * the reasons described there.  I'm not really sure
  			 * how much sense this makes.  Presumably I had a good
  			 * reasons for doing it this way, and I'd rather not
  			 * muck with it at present.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
487
488
489
  			 */
  			if (wbc->for_kupdate) {
  				/*
2c1365791   Fengguang Wu   writeback: fix ti...
490
  				 * For the kupdate function we move the inode
66f3b8e2e   Jens Axboe   writeback: move d...
491
  				 * to b_more_io so it will get more writeout as
2c1365791   Fengguang Wu   writeback: fix ti...
492
  				 * soon as the queue becomes uncongested.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
493
494
  				 */
  				inode->i_state |= I_DIRTY_PAGES;
b3af9468a   Wu Fengguang   writeback: don't ...
495
  select_queue:
8bc3be275   Fengguang Wu   writeback: speed ...
496
497
498
499
500
501
502
503
504
505
506
  				if (wbc->nr_to_write <= 0) {
  					/*
  					 * slice used up: queue for next turn
  					 */
  					requeue_io(inode);
  				} else {
  					/*
  					 * somehow blocked: retry later
  					 */
  					redirty_tail(inode);
  				}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
507
508
509
510
511
512
513
514
515
  			} else {
  				/*
  				 * Otherwise fully redirty the inode so that
  				 * other inodes on this superblock will get some
  				 * writeout.  Otherwise heavy writing to one
  				 * file would indefinitely suspend writeout of
  				 * all the other files.
  				 */
  				inode->i_state |= I_DIRTY_PAGES;
1b43ef91d   Andrew Morton   writeback: fix co...
516
  				redirty_tail(inode);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
517
  			}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
518
519
520
521
522
523
524
525
526
527
  		} else if (atomic_read(&inode->i_count)) {
  			/*
  			 * The inode is clean, inuse
  			 */
  			list_move(&inode->i_list, &inode_in_use);
  		} else {
  			/*
  			 * The inode is clean, unused
  			 */
  			list_move(&inode->i_list, &inode_unused);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
528
529
  		}
  	}
1c0eeaf56   Joern Engel   introduce I_SYNC
530
  	inode_sync_complete(inode);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
531
532
  	return ret;
  }
9ecc2738a   Jens Axboe   writeback: make t...
533
534
535
536
537
538
539
540
541
542
  /*
   * Undo a successful pin_sb_for_writeback(): release s_umount, drop the
   * superblock reference and reset *@psb. A NULL *@psb is a no-op.
   */
  static void unpin_sb_for_writeback(struct super_block **psb)
  {
  	struct super_block *sb = *psb;

  	if (!sb)
  		return;

  	up_read(&sb->s_umount);
  	put_super(sb);
  	*psb = NULL;
  }
03ba3782e   Jens Axboe   writeback: switch...
543
544
545
546
547
548
549
550
551
  /*
   * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
   * before calling writeback. So make sure that we do pin it, so it doesn't
   * go away while we are writing inodes from it.
   *
   * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
   * 1 if we failed.
   */
  static int pin_sb_for_writeback(struct writeback_control *wbc,
9ecc2738a   Jens Axboe   writeback: make t...
552
  				struct inode *inode, struct super_block **psb)
03ba3782e   Jens Axboe   writeback: switch...
553
554
555
556
  {
  	struct super_block *sb = inode->i_sb;
  
  	/*
9ecc2738a   Jens Axboe   writeback: make t...
557
558
559
560
561
562
563
564
565
  	 * If this sb is already pinned, nothing more to do. If not and
  	 * *psb is non-NULL, unpin the old one first
  	 */
  	if (sb == *psb)
  		return 0;
  	else if (*psb)
  		unpin_sb_for_writeback(psb);
  
  	/*
03ba3782e   Jens Axboe   writeback: switch...
566
567
568
569
570
571
572
573
574
575
576
577
  	 * Caller must already hold the ref for this
  	 */
  	if (wbc->sync_mode == WB_SYNC_ALL) {
  		WARN_ON(!rwsem_is_locked(&sb->s_umount));
  		return 0;
  	}
  
  	spin_lock(&sb_lock);
  	sb->s_count++;
  	if (down_read_trylock(&sb->s_umount)) {
  		if (sb->s_root) {
  			spin_unlock(&sb_lock);
9ecc2738a   Jens Axboe   writeback: make t...
578
  			goto pinned;
03ba3782e   Jens Axboe   writeback: switch...
579
580
581
582
583
584
585
586
587
588
  		}
  		/*
  		 * umounted, drop rwsem again and fall through to failure
  		 */
  		up_read(&sb->s_umount);
  	}
  
  	sb->s_count--;
  	spin_unlock(&sb_lock);
  	return 1;
9ecc2738a   Jens Axboe   writeback: make t...
589
590
591
  pinned:
  	*psb = sb;
  	return 0;
03ba3782e   Jens Axboe   writeback: switch...
592
593
594
595
  }
  
  static void writeback_inodes_wb(struct bdi_writeback *wb,
  				struct writeback_control *wbc)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
596
  {
9ecc2738a   Jens Axboe   writeback: make t...
597
  	struct super_block *sb = wbc->sb, *pin_sb = NULL;
66f3b8e2e   Jens Axboe   writeback: move d...
598
  	const int is_blkdev_sb = sb_is_blkdev_sb(sb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
599
  	const unsigned long start = jiffies;	/* livelock avoidance */
ae8547b0a   Hans Reiser   VFS: move inode_l...
600
  	spin_lock(&inode_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
601

03ba3782e   Jens Axboe   writeback: switch...
602
603
  	if (!wbc->for_kupdate || list_empty(&wb->b_io))
  		queue_io(wb, wbc->older_than_this);
66f3b8e2e   Jens Axboe   writeback: move d...
604

03ba3782e   Jens Axboe   writeback: switch...
605
606
  	while (!list_empty(&wb->b_io)) {
  		struct inode *inode = list_entry(wb->b_io.prev,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
607
  						struct inode, i_list);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
608
  		long pages_skipped;
66f3b8e2e   Jens Axboe   writeback: move d...
609
610
611
612
613
614
615
  		/*
  		 * super block given and doesn't match, skip this inode
  		 */
  		if (sb && sb != inode->i_sb) {
  			redirty_tail(inode);
  			continue;
  		}
03ba3782e   Jens Axboe   writeback: switch...
616
  		if (!bdi_cap_writeback_dirty(wb->bdi)) {
9852a0e76   Andrew Morton   writeback: fix ti...
617
  			redirty_tail(inode);
66f3b8e2e   Jens Axboe   writeback: move d...
618
  			if (is_blkdev_sb) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
619
620
621
622
623
624
625
626
627
628
629
630
631
  				/*
  				 * Dirty memory-backed blockdev: the ramdisk
  				 * driver does this.  Skip just this inode
  				 */
  				continue;
  			}
  			/*
  			 * Dirty memory-backed inode against a filesystem other
  			 * than the kernel-internal bdev filesystem.  Skip the
  			 * entire superblock.
  			 */
  			break;
  		}
84a892456   Wu Fengguang   writeback: skip n...
632
  		if (inode->i_state & (I_NEW | I_WILL_FREE)) {
7ef0d7377   Nick Piggin   fs: new inode i_s...
633
634
635
  			requeue_io(inode);
  			continue;
  		}
03ba3782e   Jens Axboe   writeback: switch...
636
  		if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
637
  			wbc->encountered_congestion = 1;
66f3b8e2e   Jens Axboe   writeback: move d...
638
  			if (!is_blkdev_sb)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
639
  				break;		/* Skip a congested fs */
0e0f4fc22   Ken Chen   writeback: fix pe...
640
  			requeue_io(inode);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
641
642
  			continue;		/* Skip a congested blockdev */
  		}
d2caa3c54   Jeff Layton   writeback: guard ...
643
644
645
646
647
  		/*
  		 * Was this inode dirtied after sync_sb_inodes was called?
  		 * This keeps sync from extra jobs and livelock.
  		 */
  		if (inode_dirtied_after(inode, start))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
648
  			break;
9ecc2738a   Jens Axboe   writeback: make t...
649
  		if (pin_sb_for_writeback(wbc, inode, &pin_sb)) {
03ba3782e   Jens Axboe   writeback: switch...
650
651
652
  			requeue_io(inode);
  			continue;
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
653

84a892456   Wu Fengguang   writeback: skip n...
654
  		BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
655
656
  		__iget(inode);
  		pages_skipped = wbc->pages_skipped;
01c031945   Christoph Hellwig   cleanup __writeba...
657
  		writeback_single_inode(inode, wbc);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
658
659
660
661
662
  		if (wbc->pages_skipped != pages_skipped) {
  			/*
  			 * writeback is not making progress due to locked
  			 * buffers.  Skip this inode for now.
  			 */
f57b9b7b4   Andrew Morton   writeback: fix ti...
663
  			redirty_tail(inode);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
664
665
  		}
  		spin_unlock(&inode_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
666
  		iput(inode);
4ffc84442   OGAWA Hirofumi   [PATCH] Move cond...
667
  		cond_resched();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
668
  		spin_lock(&inode_lock);
8bc3be275   Fengguang Wu   writeback: speed ...
669
670
  		if (wbc->nr_to_write <= 0) {
  			wbc->more_io = 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
671
  			break;
8bc3be275   Fengguang Wu   writeback: speed ...
672
  		}
03ba3782e   Jens Axboe   writeback: switch...
673
  		if (!list_empty(&wb->b_more_io))
8bc3be275   Fengguang Wu   writeback: speed ...
674
  			wbc->more_io = 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
675
  	}
38f219776   Nick Piggin   fs: sync_sb_inode...
676

9ecc2738a   Jens Axboe   writeback: make t...
677
  	unpin_sb_for_writeback(&pin_sb);
66f3b8e2e   Jens Axboe   writeback: move d...
678
679
680
  	spin_unlock(&inode_lock);
  	/* Leave any unwritten inodes on b_io */
  }
03ba3782e   Jens Axboe   writeback: switch...
681
682
683
684
685
686
  /*
   * Run writeback_inodes_wb() for the backing device named by wbc->bdi,
   * using that bdi's own wb member and passing @wbc through unchanged.
   */
  void writeback_inodes_wbc(struct writeback_control *wbc)
  {
  	writeback_inodes_wb(&wbc->bdi->wb, wbc);
  }
66f3b8e2e   Jens Axboe   writeback: move d...
687
  /*
03ba3782e   Jens Axboe   writeback: switch...
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
   * The maximum number of pages to writeout in a single bdi flush/kupdate
   * operation.  We do this so we don't hold I_SYNC against an inode for
   * enormous amounts of time, which would block a userspace task which has
   * been forced to throttle against that inode.  Also, the code reevaluates
   * the dirty state each time it has written this many pages.
   */
  #define MAX_WRITEBACK_PAGES     1024

  /*
   * Return true when the number of dirty pages plus unstable NFS pages has
   * reached the background threshold reported by get_dirty_limits().
   */
  static inline bool over_bground_thresh(void)
  {
  	unsigned long bg_limit, hard_limit;
  	unsigned long nr_reclaimable;

  	/* only the background limit is used; the other one is ignored */
  	get_dirty_limits(&bg_limit, &hard_limit, NULL, NULL);

  	nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
  			 global_page_state(NR_UNSTABLE_NFS);

  	return nr_reclaimable >= bg_limit;
  }
  
  /*
   * Explicit flushing or periodic writeback of "old" data.
66f3b8e2e   Jens Axboe   writeback: move d...
708
   *
03ba3782e   Jens Axboe   writeback: switch...
709
710
711
712
   * Define "old": the first time one of an inode's pages is dirtied, we mark the
   * dirtying-time in the inode's address_space.  So this periodic writeback code
   * just walks the superblock inode list, writing back any inodes which are
   * older than a specific point in time.
66f3b8e2e   Jens Axboe   writeback: move d...
713
   *
03ba3782e   Jens Axboe   writeback: switch...
714
715
716
   * Try to run once per dirty_writeback_interval.  But if a writeback event
   * takes longer than a dirty_writeback_interval interval, then leave a
   * one-second gap.
66f3b8e2e   Jens Axboe   writeback: move d...
717
   *
03ba3782e   Jens Axboe   writeback: switch...
718
719
   * older_than_this takes precedence over nr_to_write.  So we'll only write back
   * all dirty pages if they are all attached to "old" mappings.
66f3b8e2e   Jens Axboe   writeback: move d...
720
   */
c4a77a6c7   Jens Axboe   writeback: make w...
721
722
  static long wb_writeback(struct bdi_writeback *wb,
  			 struct wb_writeback_args *args)
66f3b8e2e   Jens Axboe   writeback: move d...
723
  {
03ba3782e   Jens Axboe   writeback: switch...
724
725
  	struct writeback_control wbc = {
  		.bdi			= wb->bdi,
c4a77a6c7   Jens Axboe   writeback: make w...
726
727
  		.sb			= args->sb,
  		.sync_mode		= args->sync_mode,
03ba3782e   Jens Axboe   writeback: switch...
728
  		.older_than_this	= NULL,
c4a77a6c7   Jens Axboe   writeback: make w...
729
730
  		.for_kupdate		= args->for_kupdate,
  		.range_cyclic		= args->range_cyclic,
03ba3782e   Jens Axboe   writeback: switch...
731
732
733
  	};
  	unsigned long oldest_jif;
  	long wrote = 0;
a5989bdc9   Jan Kara   fs: Fix busyloop ...
734
  	struct inode *inode;
66f3b8e2e   Jens Axboe   writeback: move d...
735

03ba3782e   Jens Axboe   writeback: switch...
736
737
738
739
740
  	if (wbc.for_kupdate) {
  		wbc.older_than_this = &oldest_jif;
  		oldest_jif = jiffies -
  				msecs_to_jiffies(dirty_expire_interval * 10);
  	}
c4a77a6c7   Jens Axboe   writeback: make w...
741
742
743
744
  	if (!wbc.range_cyclic) {
  		wbc.range_start = 0;
  		wbc.range_end = LLONG_MAX;
  	}
38f219776   Nick Piggin   fs: sync_sb_inode...
745

03ba3782e   Jens Axboe   writeback: switch...
746
747
  	for (;;) {
  		/*
d3ddec763   Wu Fengguang   writeback: stop b...
748
  		 * Stop writeback when nr_pages has been consumed
03ba3782e   Jens Axboe   writeback: switch...
749
  		 */
d3ddec763   Wu Fengguang   writeback: stop b...
750
  		if (args->nr_pages <= 0)
03ba3782e   Jens Axboe   writeback: switch...
751
  			break;
66f3b8e2e   Jens Axboe   writeback: move d...
752

38f219776   Nick Piggin   fs: sync_sb_inode...
753
  		/*
d3ddec763   Wu Fengguang   writeback: stop b...
754
755
  		 * For background writeout, stop when we are below the
  		 * background dirty threshold
38f219776   Nick Piggin   fs: sync_sb_inode...
756
  		 */
d3ddec763   Wu Fengguang   writeback: stop b...
757
  		if (args->for_background && !over_bground_thresh())
03ba3782e   Jens Axboe   writeback: switch...
758
  			break;
38f219776   Nick Piggin   fs: sync_sb_inode...
759

03ba3782e   Jens Axboe   writeback: switch...
760
761
762
763
764
  		wbc.more_io = 0;
  		wbc.encountered_congestion = 0;
  		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
  		wbc.pages_skipped = 0;
  		writeback_inodes_wb(wb, &wbc);
c4a77a6c7   Jens Axboe   writeback: make w...
765
  		args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
03ba3782e   Jens Axboe   writeback: switch...
766
767
768
  		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
  
  		/*
71fd05a88   Jens Axboe   writeback: improv...
769
  		 * If we consumed everything, see if we have more
03ba3782e   Jens Axboe   writeback: switch...
770
  		 */
71fd05a88   Jens Axboe   writeback: improv...
771
772
773
774
775
776
  		if (wbc.nr_to_write <= 0)
  			continue;
  		/*
  		 * Didn't write everything and we don't have more IO, bail
  		 */
  		if (!wbc.more_io)
03ba3782e   Jens Axboe   writeback: switch...
777
  			break;
71fd05a88   Jens Axboe   writeback: improv...
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
  		/*
  		 * Did we write something? Try for more
  		 */
  		if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
  			continue;
  		/*
  		 * Nothing written. Wait for some inode to
  		 * become available for writeback. Otherwise
  		 * we'll just busyloop.
  		 */
  		spin_lock(&inode_lock);
  		if (!list_empty(&wb->b_more_io))  {
  			inode = list_entry(wb->b_more_io.prev,
  						struct inode, i_list);
  			inode_wait_for_writeback(inode);
03ba3782e   Jens Axboe   writeback: switch...
793
  		}
71fd05a88   Jens Axboe   writeback: improv...
794
  		spin_unlock(&inode_lock);
03ba3782e   Jens Axboe   writeback: switch...
795
796
797
798
799
800
801
  	}
  
  	return wrote;
  }
  
  /*
   * Return the next bdi_work struct that hasn't been processed by this
8010c3b63   Jens Axboe   writeback: add co...
802
803
804
805
806
   * wb thread yet. ->seen is initially set for each thread that exists
   * for this device, when a thread first notices a piece of work it
   * clears its bit. Depending on writeback type, the thread will notify
   * completion on either receiving the work (WB_SYNC_NONE) or after
   * it is done (WB_SYNC_ALL).
03ba3782e   Jens Axboe   writeback: switch...
807
808
809
810
811
812
813
814
815
   */
  static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
  					   struct bdi_writeback *wb)
  {
  	struct bdi_work *pos;
  	struct bdi_work *found = NULL;

  	rcu_read_lock();

  	/*
  	 * Pick the first entry on the bdi's work list whose ->seen mask still
  	 * has this thread's bit (wb->nr) set, and clear that bit.
  	 */
  	list_for_each_entry_rcu(pos, &bdi->work_list, list) {
  		if (test_bit(wb->nr, &pos->seen)) {
  			clear_bit(wb->nr, &pos->seen);
  			found = pos;
  			break;
  		}
  	}

  	rcu_read_unlock();

  	/* NULL when no entry had our bit set */
  	return found;
  }
  
  static long wb_check_old_data_flush(struct bdi_writeback *wb)
  {
  	unsigned long expired;
  	long nr_pages;
  
  	expired = wb->last_old_flush +
  			msecs_to_jiffies(dirty_writeback_interval * 10);
  	if (time_before(jiffies, expired))
  		return 0;
  
  	wb->last_old_flush = jiffies;
  	nr_pages = global_page_state(NR_FILE_DIRTY) +
  			global_page_state(NR_UNSTABLE_NFS) +
  			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
c4a77a6c7   Jens Axboe   writeback: make w...
842
843
844
845
846
847
848
849
850
851
  	if (nr_pages) {
  		struct wb_writeback_args args = {
  			.nr_pages	= nr_pages,
  			.sync_mode	= WB_SYNC_NONE,
  			.for_kupdate	= 1,
  			.range_cyclic	= 1,
  		};
  
  		return wb_writeback(wb, &args);
  	}
03ba3782e   Jens Axboe   writeback: switch...
852
853
854
855
856
857
858
859
860
861
862
  
  	return 0;
  }
  
  /*
   * Retrieve work items and do the writeback they describe
   */
  long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
  {
  	struct backing_dev_info *bdi = wb->bdi;
  	struct bdi_work *work;
c4a77a6c7   Jens Axboe   writeback: make w...
863
  	long wrote = 0;
03ba3782e   Jens Axboe   writeback: switch...
864
865
  
  	while ((work = get_next_work_item(bdi, wb)) != NULL) {
c4a77a6c7   Jens Axboe   writeback: make w...
866
  		struct wb_writeback_args args = work->args;
03ba3782e   Jens Axboe   writeback: switch...
867
868
869
870
871
  
  		/*
  		 * Override sync mode, in case we must wait for completion
  		 */
  		if (force_wait)
c4a77a6c7   Jens Axboe   writeback: make w...
872
  			work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
03ba3782e   Jens Axboe   writeback: switch...
873
874
875
876
877
  
  		/*
  		 * If this isn't a data integrity operation, just notify
  		 * that we have seen this work and we are now starting it.
  		 */
c4a77a6c7   Jens Axboe   writeback: make w...
878
  		if (args.sync_mode == WB_SYNC_NONE)
03ba3782e   Jens Axboe   writeback: switch...
879
  			wb_clear_pending(wb, work);
c4a77a6c7   Jens Axboe   writeback: make w...
880
  		wrote += wb_writeback(wb, &args);
03ba3782e   Jens Axboe   writeback: switch...
881
882
883
884
885
  
  		/*
  		 * This is a data integrity writeback, so only do the
  		 * notification when we have completed the work.
  		 */
c4a77a6c7   Jens Axboe   writeback: make w...
886
  		if (args.sync_mode == WB_SYNC_ALL)
03ba3782e   Jens Axboe   writeback: switch...
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
  			wb_clear_pending(wb, work);
  	}
  
  	/*
  	 * Check for periodic writeback, kupdated() style
  	 */
  	wrote += wb_check_old_data_flush(wb);
  
  	return wrote;
  }
  
  /*
   * Handle writeback of dirty data for the device backed by this bdi. Also
   * wakes up periodically and does kupdated style flushing.
   */
  int bdi_writeback_task(struct bdi_writeback *wb)
  {
  	unsigned long last_active = jiffies;
  	unsigned long wait_jiffies = -1UL;
  	long pages_written;
  
  	while (!kthread_should_stop()) {
  		pages_written = wb_do_writeback(wb, 0);
  
  		if (pages_written)
  			last_active = jiffies;
  		else if (wait_jiffies != -1UL) {
  			unsigned long max_idle;
38f219776   Nick Piggin   fs: sync_sb_inode...
915
  			/*
03ba3782e   Jens Axboe   writeback: switch...
916
917
918
  			 * Longest period of inactivity that we tolerate. If we
  			 * see dirty data again later, the task will get
  			 * recreated automatically.
38f219776   Nick Piggin   fs: sync_sb_inode...
919
  			 */
03ba3782e   Jens Axboe   writeback: switch...
920
921
922
923
924
925
  			max_idle = max(5UL * 60 * HZ, wait_jiffies);
  			if (time_after(jiffies, max_idle + last_active))
  				break;
  		}
  
  		wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
49db04143   Jens Axboe   writeback: use sc...
926
  		schedule_timeout_interruptible(wait_jiffies);
03ba3782e   Jens Axboe   writeback: switch...
927
928
929
930
931
932
933
  		try_to_freeze();
  	}
  
  	return 0;
  }
  
  /*
b6e51316d   Jens Axboe   writeback: separa...
934
935
   * Schedule writeback for all backing devices. This does WB_SYNC_NONE
   * writeback, for integrity writeback see bdi_sync_writeback().
03ba3782e   Jens Axboe   writeback: switch...
936
   */
b6e51316d   Jens Axboe   writeback: separa...
937
  static void bdi_writeback_all(struct super_block *sb, long nr_pages)
03ba3782e   Jens Axboe   writeback: switch...
938
  {
b6e51316d   Jens Axboe   writeback: separa...
939
940
941
942
943
  	struct wb_writeback_args args = {
  		.sb		= sb,
  		.nr_pages	= nr_pages,
  		.sync_mode	= WB_SYNC_NONE,
  	};
03ba3782e   Jens Axboe   writeback: switch...
944
  	struct backing_dev_info *bdi;
03ba3782e   Jens Axboe   writeback: switch...
945

cfc4ba536   Jens Axboe   writeback: use RC...
946
  	rcu_read_lock();
03ba3782e   Jens Axboe   writeback: switch...
947

cfc4ba536   Jens Axboe   writeback: use RC...
948
  	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
03ba3782e   Jens Axboe   writeback: switch...
949
950
  		if (!bdi_has_dirty_io(bdi))
  			continue;
38f219776   Nick Piggin   fs: sync_sb_inode...
951

b6e51316d   Jens Axboe   writeback: separa...
952
  		bdi_alloc_queue_work(bdi, &args);
03ba3782e   Jens Axboe   writeback: switch...
953
  	}
cfc4ba536   Jens Axboe   writeback: use RC...
954
  	rcu_read_unlock();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
955
956
957
  }
  
  /*
03ba3782e   Jens Axboe   writeback: switch...
958
959
960
961
962
   * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
   * the whole world.
   */
  void wakeup_flusher_threads(long nr_pages)
  {
03ba3782e   Jens Axboe   writeback: switch...
963
964
965
  	if (nr_pages == 0)
  		nr_pages = global_page_state(NR_FILE_DIRTY) +
  				global_page_state(NR_UNSTABLE_NFS);
b6e51316d   Jens Axboe   writeback: separa...
966
  	bdi_writeback_all(NULL, nr_pages);
03ba3782e   Jens Axboe   writeback: switch...
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
  }
  
  static noinline void block_dump___mark_inode_dirty(struct inode *inode)
  {
  	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
  		struct dentry *dentry;
  		const char *name = "?";
  
  		dentry = d_find_alias(inode);
  		if (dentry) {
  			spin_lock(&dentry->d_lock);
  			name = (const char *) dentry->d_name.name;
  		}
  		printk(KERN_DEBUG
  		       "%s(%d): dirtied inode %lu (%s) on %s
  ",
  		       current->comm, task_pid_nr(current), inode->i_ino,
  		       name, inode->i_sb->s_id);
  		if (dentry) {
  			spin_unlock(&dentry->d_lock);
  			dput(dentry);
  		}
  	}
  }
  
  /**
   *	__mark_inode_dirty -	internal function
   *	@inode: inode to mark
   *	@flags: what kind of dirty (i.e. I_DIRTY_SYNC)
   *	Mark an inode as dirty. Callers should use mark_inode_dirty or
   *  	mark_inode_dirty_sync.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
998
   *
03ba3782e   Jens Axboe   writeback: switch...
999
1000
1001
1002
1003
1004
1005
1006
1007
   * Put the inode on the super block's dirty list.
   *
   * CAREFUL! We mark it dirty unconditionally, but move it onto the
   * dirty list only if it is hashed or if it refers to a blockdev.
   * If it was not hashed, it will never be added to the dirty list
   * even if it is later hashed, as it will have been marked dirty already.
   *
   * In short, make sure you hash any inodes _before_ you start marking
   * them dirty.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1008
   *
03ba3782e   Jens Axboe   writeback: switch...
1009
1010
   * This function *must* be atomic for the I_DIRTY_PAGES case -
   * set_page_dirty() is called under spinlock in several places.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1011
   *
03ba3782e   Jens Axboe   writeback: switch...
1012
1013
1014
1015
1016
1017
   * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
   * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
   * the kernel-internal blockdev inode represents the dirtying time of the
   * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
   * page->mapping->host, so the page-dirtying time is recorded in the internal
   * blockdev inode.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1018
   */
03ba3782e   Jens Axboe   writeback: switch...
1019
  void __mark_inode_dirty(struct inode *inode, int flags)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1020
  {
03ba3782e   Jens Axboe   writeback: switch...
1021
  	struct super_block *sb = inode->i_sb;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1022

03ba3782e   Jens Axboe   writeback: switch...
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
  	/*
  	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
  	 * dirty the inode itself
  	 */
  	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
  		if (sb->s_op->dirty_inode)
  			sb->s_op->dirty_inode(inode);
  	}
  
  	/*
  	 * make sure that changes are seen by all cpus before we test i_state
  	 * -- mikulas
  	 */
  	smp_mb();
  
  	/* avoid the locking if we can */
  	if ((inode->i_state & flags) == flags)
  		return;
  
  	if (unlikely(block_dump))
  		block_dump___mark_inode_dirty(inode);
  
  	spin_lock(&inode_lock);
  	if ((inode->i_state & flags) != flags) {
  		const int was_dirty = inode->i_state & I_DIRTY;
  
  		inode->i_state |= flags;
  
  		/*
  		 * If the inode is being synced, just update its dirty state.
  		 * The unlocker will place the inode on the appropriate
  		 * superblock list, based upon its state.
  		 */
  		if (inode->i_state & I_SYNC)
  			goto out;
  
  		/*
  		 * Only add valid (hashed) inodes to the superblock's
  		 * dirty list.  Add blockdev inodes as well.
  		 */
  		if (!S_ISBLK(inode->i_mode)) {
  			if (hlist_unhashed(&inode->i_hash))
  				goto out;
  		}
  		if (inode->i_state & (I_FREEING|I_CLEAR))
  			goto out;
  
  		/*
  		 * If the inode was already on b_dirty/b_io/b_more_io, don't
  		 * reposition it (that would break b_dirty time-ordering).
  		 */
  		if (!was_dirty) {
  			struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
500b067c5   Jens Axboe   writeback: check ...
1076
1077
1078
1079
1080
1081
1082
1083
1084
  			struct backing_dev_info *bdi = wb->bdi;
  
  			if (bdi_cap_writeback_dirty(bdi) &&
  			    !test_bit(BDI_registered, &bdi->state)) {
  				WARN_ON(1);
  				printk(KERN_ERR "bdi-%s not registered
  ",
  								bdi->name);
  			}
03ba3782e   Jens Axboe   writeback: switch...
1085
1086
1087
  
  			inode->dirtied_when = jiffies;
  			list_move(&inode->i_list, &wb->b_dirty);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1088
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1089
  	}
03ba3782e   Jens Axboe   writeback: switch...
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
  out:
  	spin_unlock(&inode_lock);
  }
  EXPORT_SYMBOL(__mark_inode_dirty);
  
  /*
   * Write out a superblock's list of dirty inodes.  A wait will be performed
   * upon no inodes, all inodes or the final one, depending upon sync_mode.
   *
   * If older_than_this is non-NULL, then only write out inodes which
   * had their first dirtying at a time earlier than *older_than_this.
   *
03ba3782e   Jens Axboe   writeback: switch...
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
   * If `bdi' is non-zero then we're being asked to writeback a specific queue.
   * This function assumes that the blockdev superblock's inodes are backed by
   * a variety of queues, so all inodes are searched.  For other superblocks,
   * assume that all inodes are backed by the same queue.
   *
   * The inodes to be written are parked on bdi->b_io.  They are moved back onto
   * bdi->b_dirty as they are selected for writing.  This way, none can be missed
   * on the writer throttling path, and we get decent balancing between many
   * throttled threads: we don't want them all piling up on inode_sync_wait.
   */
b6e51316d   Jens Axboe   writeback: separa...
1112
  static void wait_sb_inodes(struct super_block *sb)
03ba3782e   Jens Axboe   writeback: switch...
1113
1114
1115
1116
1117
1118
1119
  {
  	struct inode *inode, *old_inode = NULL;
  
  	/*
  	 * We need to be protected against the filesystem going from
  	 * r/o to r/w or vice versa.
  	 */
b6e51316d   Jens Axboe   writeback: separa...
1120
  	WARN_ON(!rwsem_is_locked(&sb->s_umount));
03ba3782e   Jens Axboe   writeback: switch...
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
  
  	spin_lock(&inode_lock);
  
  	/*
  	 * Data integrity sync. Must wait for all pages under writeback,
  	 * because there may have been pages dirtied before our sync
  	 * call, but which had writeout started before we write it out.
  	 * In which case, the inode may not be on the dirty list, but
  	 * we still have to wait for that writeout.
  	 */
b6e51316d   Jens Axboe   writeback: separa...
1131
  	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
03ba3782e   Jens Axboe   writeback: switch...
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
  		struct address_space *mapping;
  
  		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
  			continue;
  		mapping = inode->i_mapping;
  		if (mapping->nrpages == 0)
  			continue;
  		__iget(inode);
  		spin_unlock(&inode_lock);
  		/*
  		 * We hold a reference to 'inode' so it couldn't have
  		 * been removed from s_inodes list while we dropped the
  		 * inode_lock.  We cannot iput the inode now as we can
  		 * be holding the last reference and we cannot iput it
  		 * under inode_lock. So we keep the reference and iput
  		 * it later.
  		 */
  		iput(old_inode);
  		old_inode = inode;
  
  		filemap_fdatawait(mapping);
  
  		cond_resched();
  
  		spin_lock(&inode_lock);
  	}
  	spin_unlock(&inode_lock);
  	iput(old_inode);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1160
  }
d8a8559cd   Jens Axboe   writeback: get ri...
1161
1162
1163
  /**
   * writeback_inodes_sb	-	writeback dirty inodes from given super_block
   * @sb: the superblock
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1164
   *
d8a8559cd   Jens Axboe   writeback: get ri...
1165
1166
1167
1168
   * Start writeback on some inodes on this super_block. No guarantees are made
   * on how many (if any) will be written, and this function does not wait
   * for IO completion of submitted IO. The number of pages submitted is
   * returned.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1169
   */
b6e51316d   Jens Axboe   writeback: separa...
1170
  void writeback_inodes_sb(struct super_block *sb)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1171
  {
d8a8559cd   Jens Axboe   writeback: get ri...
1172
1173
1174
  	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
  	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
  	long nr_to_write;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1175

d8a8559cd   Jens Axboe   writeback: get ri...
1176
  	nr_to_write = nr_dirty + nr_unstable +
38f219776   Nick Piggin   fs: sync_sb_inode...
1177
  			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
38f219776   Nick Piggin   fs: sync_sb_inode...
1178

a72bfd4de   Jens Axboe   writeback: pass i...
1179
  	bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
d8a8559cd   Jens Axboe   writeback: get ri...
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
  }
  EXPORT_SYMBOL(writeback_inodes_sb);
  
  /**
   * sync_inodes_sb	-	sync sb inode pages
   * @sb: the superblock
   *
   * This function writes and waits on any dirty inode belonging to this
   * super_block. The number of pages synced is returned.
   */
b6e51316d   Jens Axboe   writeback: separa...
1190
  void sync_inodes_sb(struct super_block *sb)
d8a8559cd   Jens Axboe   writeback: get ri...
1191
  {
b6e51316d   Jens Axboe   writeback: separa...
1192
1193
  	bdi_sync_writeback(sb->s_bdi, sb);
  	wait_sb_inodes(sb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1194
  }
d8a8559cd   Jens Axboe   writeback: get ri...
1195
  EXPORT_SYMBOL(sync_inodes_sb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1196

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1197
  /**
7f04c26d7   Andrea Arcangeli   [PATCH] fix nr_un...
1198
1199
1200
1201
1202
1203
   * write_inode_now	-	write an inode to disk
   * @inode: inode to write to disk
   * @sync: whether the write should be synchronous or not
   *
   * This function commits an inode to disk immediately if it is dirty. This is
   * primarily needed by knfsd.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1204
   *
7f04c26d7   Andrea Arcangeli   [PATCH] fix nr_un...
1205
   * The caller must either have a ref on the inode or must have set I_WILL_FREE.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1206
   */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1207
1208
1209
1210
1211
  int write_inode_now(struct inode *inode, int sync)
  {
  	int ret;
  	struct writeback_control wbc = {
  		.nr_to_write = LONG_MAX,
18914b188   Mike Galbraith   write_inode_now()...
1212
  		.sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
111ebb6e6   OGAWA Hirofumi   [PATCH] writeback...
1213
1214
  		.range_start = 0,
  		.range_end = LLONG_MAX,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1215
1216
1217
  	};
  
  	if (!mapping_cap_writeback_dirty(inode->i_mapping))
49364ce25   Andrew Morton   [PATCH] write_ino...
1218
  		wbc.nr_to_write = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1219
1220
1221
  
  	might_sleep();
  	spin_lock(&inode_lock);
01c031945   Christoph Hellwig   cleanup __writeba...
1222
  	ret = writeback_single_inode(inode, &wbc);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1223
1224
  	spin_unlock(&inode_lock);
  	if (sync)
1c0eeaf56   Joern Engel   introduce I_SYNC
1225
  		inode_sync_wait(inode);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
  	return ret;
  }
  EXPORT_SYMBOL(write_inode_now);
  
  /**
   * sync_inode - write an inode and its pages to disk.
   * @inode: the inode to sync
   * @wbc: controls the writeback mode
   *
   * sync_inode() will write an inode and its pages to disk.  It will also
   * correctly update the inode on its superblock's dirty inode lists and will
   * update inode->i_state.
   *
   * The caller must have a ref on the inode.
   *
   * Returns the result of writeback_single_inode(), which is called with
   * inode_lock held.
   */
  int sync_inode(struct inode *inode, struct writeback_control *wbc)
  {
  	int err;

  	spin_lock(&inode_lock);
  	err = writeback_single_inode(inode, wbc);
  	spin_unlock(&inode_lock);

  	return err;
  }
  EXPORT_SYMBOL(sync_inode);