Blame view

mm/vmscan.c 45.4 KB
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
  /*
   *  linux/mm/vmscan.c
   *
   *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   *
   *  Swap reorganised 29.12.95, Stephen Tweedie.
   *  kswapd added: 7.1.96  sct
   *  Removed kswapd_ctl limits, and swap out as many pages as needed
   *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
   *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
   *  Multiqueue VM started 5.8.00, Rik van Riel.
   */
  
  #include <linux/mm.h>
  #include <linux/module.h>
  #include <linux/slab.h>
  #include <linux/kernel_stat.h>
  #include <linux/swap.h>
  #include <linux/pagemap.h>
  #include <linux/init.h>
  #include <linux/highmem.h>
e129b5c23   Andrew Morton   [PATCH] vm: add p...
22
  #include <linux/vmstat.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
23
24
25
26
27
28
29
30
31
32
33
34
35
36
  #include <linux/file.h>
  #include <linux/writeback.h>
  #include <linux/blkdev.h>
  #include <linux/buffer_head.h>	/* for try_to_release_page(),
  					buffer_heads_over_limit */
  #include <linux/mm_inline.h>
  #include <linux/pagevec.h>
  #include <linux/backing-dev.h>
  #include <linux/rmap.h>
  #include <linux/topology.h>
  #include <linux/cpu.h>
  #include <linux/cpuset.h>
  #include <linux/notifier.h>
  #include <linux/rwsem.h>
248a0301e   Rafael J. Wysocki   [PATCH] mm: make ...
37
  #include <linux/delay.h>
3218ae14b   Yasunori Goto   [PATCH] pgdat all...
38
  #include <linux/kthread.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
39
40
41
42
43
  
  #include <asm/tlbflush.h>
  #include <asm/div64.h>
  
  #include <linux/swapops.h>
0f8053a50   Nick Piggin   [PATCH] mm: make ...
44
  #include "internal.h"
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
45
  /*
   * Per-invocation reclaim state, passed by pointer down the scanning
   * functions in this file.
   */
  struct scan_control {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
46
47
  	/* Incremented by the number of inactive pages that were scanned */
  	unsigned long nr_scanned;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
48
  	/* This context's GFP mask */
6daa0e286   Al Viro   [PATCH] gfp_t: mm...
49
  	gfp_t gfp_mask;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
50
51
  
  	/*
  	 * May dirty pages be written out?  When zero, shrink_page_list()
  	 * keeps dirty pages instead of calling pageout() on them.
  	 */
  	int may_writepage;
f1fd1067e   Christoph Lameter   [PATCH] Zone recl...
52
53
  	/*
  	 * Can pages be swapped as part of reclaim?  When zero,
  	 * shrink_page_list() leaves mapped pages alone.
  	 */
  	int may_swap;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
54
55
56
57
58
  	/* This context's SWAP_CLUSTER_MAX. If freeing memory for
  	 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
  	 * In this context, it doesn't matter that we scan the
  	 * whole list at once. */
  	int swap_cluster_max;
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
59
60
  
  	/*
  	 * NOTE(review): presumably this context's swappiness setting (cf.
  	 * vm_swappiness); its use is outside the visible part of the file.
  	 */
  	int swappiness;
408d85441   Nick Piggin   [PATCH] oom: use ...
61
62
  
  	/*
  	 * NOTE(review): not referenced in the visible part of the file;
  	 * presumably records that every scanned zone was unreclaimable.
  	 */
  	int all_unreclaimable;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
  };
  
  /*
   * The list of shrinker callbacks used by the VM to apply pressure to
   * ageable caches.
   */
  struct shrinker {
  	shrinker_t		shrinker;	/* callback invoked by shrink_slab() */
  	struct list_head	list;	/* link in shrinker_list */
  	int			seeks;	/* seeks to recreate an obj */
  	long			nr;	/* objs pending delete; carried between shrink_slab() calls */
  };
  
  /* Return the page whose ->lru entry is the last one on list @_head. */
  #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
  
  /*
   * Prefetch (for reading) @_field of the page that precedes @_page on the
   * list headed by @_base, unless the preceding entry is the list head
   * itself.  Expands to nothing when ARCH_HAS_PREFETCH is not defined.
   *
   * Every macro argument is parenthesized so that expression arguments
   * (e.g. "pages + 1" or "cond ? a : b") expand correctly.
   */
  #ifdef ARCH_HAS_PREFETCH
  #define prefetch_prev_lru_page(_page, _base, _field)			\
  	do {								\
  		if ((_page)->lru.prev != (_base)) {			\
  			struct page *prev;				\
  									\
  			prev = lru_to_page(&((_page)->lru));		\
  			prefetch(&prev->_field);			\
  		}							\
  	} while (0)
  #else
  #define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
  #endif
  
  /*
   * Same as prefetch_prev_lru_page(), but uses prefetchw().  Expands to
   * nothing when ARCH_HAS_PREFETCHW is not defined.
   */
  #ifdef ARCH_HAS_PREFETCHW
  #define prefetchw_prev_lru_page(_page, _base, _field)			\
  	do {								\
  		if ((_page)->lru.prev != (_base)) {			\
  			struct page *prev;				\
  									\
  			prev = lru_to_page(&((_page)->lru));		\
  			prefetchw(&prev->_field);			\
  		}							\
  	} while (0)
  #else
  #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
  #endif
  
  /*
   * From 0 .. 100.  Higher means more swappy.
   */
  int vm_swappiness = 60;
bd1e22b8e   Andrew Morton   [PATCH] initialis...
110
  long vm_total_pages;	/* The total number of pages which the VM controls */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
  
  /* All registered shrinkers; set_shrinker() appends new ones at the tail */
  static LIST_HEAD(shrinker_list);
  /* Protects shrinker_list: add/remove take it for write, shrink_slab() for read */
  static DECLARE_RWSEM(shrinker_rwsem);
  
  /*
   * Register a shrinker callback with the VM.
   *
   * @seeks:       stored in the new entry; shrink_slab() divides the scan
   *               pressure by it.
   * @theshrinker: the callback to invoke.
   *
   * Returns the newly allocated entry, or NULL if the allocation failed.
   * The entry is released again by remove_shrinker().
   */
  struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker)
  {
  	struct shrinker *entry = kmalloc(sizeof(*entry), GFP_KERNEL);
  
  	if (!entry)
  		return NULL;
  
  	entry->shrinker = theshrinker;
  	entry->seeks = seeks;
  	entry->nr = 0;
  
  	/* Append to the global list under the write lock. */
  	down_write(&shrinker_rwsem);
  	list_add_tail(&entry->list, &shrinker_list);
  	up_write(&shrinker_rwsem);
  
  	return entry;
  }
  EXPORT_SYMBOL(set_shrinker);
  
  /*
   * Unregister and free a shrinker that was returned by set_shrinker().
   *
   * set_shrinker() returns NULL when its allocation fails, so a NULL
   * @shrinker is tolerated here rather than dereferenced in list_del().
   */
  void remove_shrinker(struct shrinker *shrinker)
  {
  	if (!shrinker)
  		return;
  
  	down_write(&shrinker_rwsem);
  	list_del(&shrinker->list);
  	up_write(&shrinker_rwsem);
  	kfree(shrinker);
  }
  EXPORT_SYMBOL(remove_shrinker);
  
  #define SHRINK_BATCH 128
  /*
   * Call the shrink functions to age shrinkable caches
   *
   * Here we assume it costs one seek to replace a lru page and that it also
   * takes a seek to recreate a cache object.  With this in mind we age equal
   * percentages of the lru and ageable caches.  This should balance the seeks
   * generated by these structures.
   *
   * If the vm encountered mapped pages on the LRU it increases the pressure on
   * slab to avoid swapping.
   *
   * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
   *
   * `lru_pages' represents the number of on-LRU pages in all the zones which
   * are eligible for the caller's allocation attempt.  It is used for balancing
   * slab reclaim versus page reclaim.
b15e0905f   Andrew Morton   [PATCH] vmscan: n...
164
165
   *
   * Returns the number of slab objects which we shrunk.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
166
   */
69e05944a   Andrew Morton   [PATCH] vmscan: u...
167
168
  unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
  			unsigned long lru_pages)
  {
  	struct shrinker *shrinker;
  	unsigned long ret = 0;
  
  	if (scanned == 0)
  		scanned = SWAP_CLUSTER_MAX;
  
  	/* Never block on the list lock; report token progress instead. */
  	if (!down_read_trylock(&shrinker_rwsem))
  		return 1;	/* Assume we'll be able to shrink next time */
  
  	list_for_each_entry(shrinker, &shrinker_list, list) {
  		unsigned long long delta;
  		unsigned long total_scan;
  		/* Calling the shrinker with 0 only queries its object count */
  		unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask);
  
  		/*
  		 * Add (4 * scanned / seeks) * max_pass / (lru_pages + 1) to
  		 * the shrinker's pending count, computed in 64 bits.
  		 */
  		delta = (4 * scanned) / shrinker->seeks;
  		delta *= max_pass;
  		do_div(delta, lru_pages + 1);
  		shrinker->nr += delta;
  		if (shrinker->nr < 0) {
  			/* The signed pending count wrapped: report and reset */
  			printk(KERN_ERR "%s: nr=%ld\n",
  					__FUNCTION__, shrinker->nr);
  			shrinker->nr = max_pass;
  		}
  
  		/*
  		 * Avoid risking looping forever due to too large nr value:
  		 * never try to free more than twice the estimated number of
  		 * freeable entries.
  		 */
  		if (shrinker->nr > max_pass * 2)
  			shrinker->nr = max_pass * 2;
  
  		total_scan = shrinker->nr;
  		shrinker->nr = 0;
  
  		/* Work in SHRINK_BATCH sized chunks, rescheduling in between */
  		while (total_scan >= SHRINK_BATCH) {
  			long this_scan = SHRINK_BATCH;
  			int shrink_ret;
  			int nr_before;
  
  			nr_before = (*shrinker->shrinker)(0, gfp_mask);
  			shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
  			if (shrink_ret == -1)
  				break;
  			/* Count only a net decrease as objects shrunk */
  			if (shrink_ret < nr_before)
  				ret += nr_before - shrink_ret;
  			count_vm_events(SLABS_SCANNED, this_scan);
  			total_scan -= this_scan;
  
  			cond_resched();
  		}
  
  		/* Carry the unprocessed remainder over to the next call */
  		shrinker->nr += total_scan;
  	}
  	up_read(&shrinker_rwsem);
  	return ret;
  }
  
  /*
   * Decide whether @page looks like it is in use through a mapping.
   * Called without lock on whether page is mapped, so answer is unstable.
   */
  static inline int page_mapping_inuse(struct page *page)
  {
  	struct address_space *mapping;
  
  	/*
  	 * Page is in somebody's page tables, or it is swapcache: be more
  	 * reluctant to reclaim swapcache than pagecache.
  	 */
  	if (page_mapped(page) || PageSwapCache(page))
  		return 1;
  
  	mapping = page_mapping(page);
  
  	/* With a mapping: is the file mmap'd by somebody? */
  	return mapping ? mapping_mapped(mapping) : 0;
  }
  
  /*
   * True when @page holds exactly the expected references: two, plus one
   * more if PagePrivate is set.
   * NOTE(review): the two base references are presumably the page cache's
   * and the reclaim caller's own - confirm against the callers.
   */
  static inline int is_page_cache_freeable(struct page *page)
  {
  	int expected_refs = 2;
  
  	if (PagePrivate(page))
  		expected_refs++;
  
  	return page_count(page) == expected_refs;
  }
  
  static int may_write_to_queue(struct backing_dev_info *bdi)
  {
930d91525   Christoph Lameter   [PATCH] Swap Migr...
257
  	if (current->flags & PF_SWAPWRITE)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
  		return 1;
  	if (!bdi_write_congested(bdi))
  		return 1;
  	if (bdi == current->backing_dev_info)
  		return 1;
  	return 0;
  }
  
  /*
   * We detected a synchronous write error writing a page out.  Probably
   * -ENOSPC.  We need to propagate that into the address_space for a subsequent
   * fsync(), msync() or close().
   *
   * The tricky part is that after writepage we cannot touch the mapping: nothing
   * prevents it from being freed up.  But we have a ref on the page and once
   * that page is locked, the mapping is pinned.
   *
   * We're allowed to run sleeping lock_page() here because we know the caller has
   * __GFP_FS.
   */
  static void handle_write_error(struct address_space *mapping,
  				struct page *page, int error)
  {
  	lock_page(page);
  	if (page_mapping(page) == mapping) {
  		if (error == -ENOSPC)
  			set_bit(AS_ENOSPC, &mapping->flags);
  		else
  			set_bit(AS_EIO, &mapping->flags);
  	}
  	unlock_page(page);
  }
04e62a29b   Christoph Lameter   [PATCH] More page...
290
291
292
293
294
295
296
297
298
299
300
  /*
   * possible outcome of pageout(), and how shrink_page_list() reacts to it
   */
  typedef enum {
  	/*
  	 * failed to write page out, page is locked;
  	 * the caller unlocks it and keeps it on the list
  	 */
  	PAGE_KEEP,
  	/*
  	 * move page to the active list, page is locked;
  	 * returned when there is no ->writepage or it asked for activation
  	 */
  	PAGE_ACTIVATE,
  	/*
  	 * page has been sent to the disk successfully, page is unlocked;
  	 * the caller frees it only if the write already completed
  	 */
  	PAGE_SUCCESS,
  	/* page is clean and locked; the caller goes on to try to free it */
  	PAGE_CLEAN,
  } pageout_t;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
301
  /*
1742f19fa   Andrew Morton   [PATCH] vmscan: r...
302
303
   * pageout is called by shrink_page_list() for each dirty page.
   * Calls ->writepage().
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
304
   */
04e62a29b   Christoph Lameter   [PATCH] More page...
305
  static pageout_t pageout(struct page *page, struct address_space *mapping)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
  {
  	/*
  	 * If the page is dirty, only perform writeback if that write
  	 * will be non-blocking.  To prevent this allocation from being
  	 * stalled by pagecache activity.  But note that there may be
  	 * stalls if we need to run get_block().  We could test
  	 * PagePrivate for that.
  	 *
  	 * If this process is currently in generic_file_write() against
  	 * this page's queue, we can perform writeback even if that
  	 * will block.
  	 *
  	 * If the page is swapcache, write it back even if that would
  	 * block, for some throttling. This happens by accident, because
  	 * swap_backing_dev_info is bust: it doesn't reflect the
  	 * congestion state of the swapdevs.  Easy to fix, if needed.
  	 * See swapfile.c:page_queue_congested().
  	 */
  	if (!is_page_cache_freeable(page))
  		return PAGE_KEEP;
  	if (!mapping) {
  		/*
  		 * Some data journaling orphaned pages can have
  		 * page->mapping == NULL while being dirty with clean buffers.
  		 */
323aca6c0   Andrew Morton   [PATCH] vmscan: p...
331
  		if (PagePrivate(page)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
  			if (try_to_free_buffers(page)) {
  				ClearPageDirty(page);
  				printk("%s: orphaned page
  ", __FUNCTION__);
  				return PAGE_CLEAN;
  			}
  		}
  		return PAGE_KEEP;
  	}
  	if (mapping->a_ops->writepage == NULL)
  		return PAGE_ACTIVATE;
  	if (!may_write_to_queue(mapping->backing_dev_info))
  		return PAGE_KEEP;
  
  	if (clear_page_dirty_for_io(page)) {
  		int res;
  		struct writeback_control wbc = {
  			.sync_mode = WB_SYNC_NONE,
  			.nr_to_write = SWAP_CLUSTER_MAX,
111ebb6e6   OGAWA Hirofumi   [PATCH] writeback...
351
352
  			.range_start = 0,
  			.range_end = LLONG_MAX,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
353
354
355
356
357
358
359
360
  			.nonblocking = 1,
  			.for_reclaim = 1,
  		};
  
  		SetPageReclaim(page);
  		res = mapping->a_ops->writepage(page, &wbc);
  		if (res < 0)
  			handle_write_error(mapping, page, res);
994fc28c7   Zach Brown   [PATCH] add AOP_T...
361
  		if (res == AOP_WRITEPAGE_ACTIVATE) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
362
363
364
365
366
367
368
  			ClearPageReclaim(page);
  			return PAGE_ACTIVATE;
  		}
  		if (!PageWriteback(page)) {
  			/* synchronous write or broken a_ops? */
  			ClearPageReclaim(page);
  		}
e129b5c23   Andrew Morton   [PATCH] vm: add p...
369
  		inc_zone_page_state(page, NR_VMSCAN_WRITE);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
370
371
372
373
374
  		return PAGE_SUCCESS;
  	}
  
  	return PAGE_CLEAN;
  }
b20a35035   Christoph Lameter   [PATCH] page migr...
375
  int remove_mapping(struct address_space *mapping, struct page *page)
49d2e9cc4   Christoph Lameter   [PATCH] Swap Migr...
376
  {
28e4d965e   Nick Piggin   [PATCH] mm: remov...
377
378
  	BUG_ON(!PageLocked(page));
  	BUG_ON(mapping != page_mapping(page));
49d2e9cc4   Christoph Lameter   [PATCH] Swap Migr...
379
380
  
  	write_lock_irq(&mapping->tree_lock);
49d2e9cc4   Christoph Lameter   [PATCH] Swap Migr...
381
  	/*
0fd0e6b05   Nick Piggin   [PATCH] page inva...
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
  	 * The non racy check for a busy page.
  	 *
  	 * Must be careful with the order of the tests. When someone has
  	 * a ref to the page, it may be possible that they dirty it then
  	 * drop the reference. So if PageDirty is tested before page_count
  	 * here, then the following race may occur:
  	 *
  	 * get_user_pages(&page);
  	 * [user mapping goes away]
  	 * write_to(page);
  	 *				!PageDirty(page)    [good]
  	 * SetPageDirty(page);
  	 * put_page(page);
  	 *				!page_count(page)   [good, discard it]
  	 *
  	 * [oops, our write_to data is lost]
  	 *
  	 * Reversing the order of the tests ensures such a situation cannot
  	 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
  	 * load is not satisfied before that of page->_count.
  	 *
  	 * Note that if SetPageDirty is always performed via set_page_dirty,
  	 * and thus under tree_lock, then this ordering is not required.
49d2e9cc4   Christoph Lameter   [PATCH] Swap Migr...
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
  	 */
  	if (unlikely(page_count(page) != 2))
  		goto cannot_free;
  	smp_rmb();
  	if (unlikely(PageDirty(page)))
  		goto cannot_free;
  
  	if (PageSwapCache(page)) {
  		swp_entry_t swap = { .val = page_private(page) };
  		__delete_from_swap_cache(page);
  		write_unlock_irq(&mapping->tree_lock);
  		swap_free(swap);
  		__put_page(page);	/* The pagecache ref */
  		return 1;
  	}
  
  	__remove_from_page_cache(page);
  	write_unlock_irq(&mapping->tree_lock);
  	__put_page(page);
  	return 1;
  
  cannot_free:
  	write_unlock_irq(&mapping->tree_lock);
  	return 0;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
430
  /*
1742f19fa   Andrew Morton   [PATCH] vmscan: r...
431
   * shrink_page_list() returns the number of reclaimed pages
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
432
   */
1742f19fa   Andrew Morton   [PATCH] vmscan: r...
433
434
  static unsigned long shrink_page_list(struct list_head *page_list,
  					struct scan_control *sc)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
435
436
437
438
  {
  	LIST_HEAD(ret_pages);
  	struct pagevec freed_pvec;
  	int pgactivate = 0;
05ff51376   Andrew Morton   [PATCH] vmscan re...
439
  	unsigned long nr_reclaimed = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
  
  	cond_resched();
  
  	pagevec_init(&freed_pvec, 1);
  	while (!list_empty(page_list)) {
  		struct address_space *mapping;
  		struct page *page;
  		int may_enter_fs;
  		int referenced;
  
  		cond_resched();
  
  		page = lru_to_page(page_list);
  		list_del(&page->lru);
  
  		if (TestSetPageLocked(page))
  			goto keep;
725d704ec   Nick Piggin   [PATCH] mm: VM_BU...
457
  		VM_BUG_ON(PageActive(page));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
458
459
  
  		sc->nr_scanned++;
80e434260   Christoph Lameter   [PATCH] zone recl...
460
461
462
  
  		if (!sc->may_swap && page_mapped(page))
  			goto keep_locked;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
463
464
465
466
467
468
  		/* Double the slab pressure for mapped and swapcache pages */
  		if (page_mapped(page) || PageSwapCache(page))
  			sc->nr_scanned++;
  
  		if (PageWriteback(page))
  			goto keep_locked;
f7b7fd8f3   Rik van Riel   [PATCH] temporari...
469
  		referenced = page_referenced(page, 1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
470
471
472
473
474
475
476
477
478
  		/* In active use or really unfreeable?  Activate it. */
  		if (referenced && page_mapping_inuse(page))
  			goto activate_locked;
  
  #ifdef CONFIG_SWAP
  		/*
  		 * Anonymous process memory has backing store?
  		 * Try to allocate it some swap space here.
  		 */
6e5ef1a96   Christoph Lameter   [PATCH] vmscan: e...
479
  		if (PageAnon(page) && !PageSwapCache(page))
1480a540c   Christoph Lameter   [PATCH] SwapMig: ...
480
  			if (!add_to_swap(page, GFP_ATOMIC))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
481
  				goto activate_locked;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
482
483
484
485
486
487
488
489
490
491
492
  #endif /* CONFIG_SWAP */
  
  		mapping = page_mapping(page);
  		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
  			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
  
  		/*
  		 * The page is mapped into the page tables of one or more
  		 * processes. Try to unmap it here.
  		 */
  		if (page_mapped(page) && mapping) {
a48d07afd   Christoph Lameter   [PATCH] Direct Mi...
493
  			switch (try_to_unmap(page, 0)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
494
495
496
497
498
499
500
501
502
503
504
505
506
507
  			case SWAP_FAIL:
  				goto activate_locked;
  			case SWAP_AGAIN:
  				goto keep_locked;
  			case SWAP_SUCCESS:
  				; /* try to free the page below */
  			}
  		}
  
  		if (PageDirty(page)) {
  			if (referenced)
  				goto keep_locked;
  			if (!may_enter_fs)
  				goto keep_locked;
52a8363ea   Christoph Lameter   [PATCH] mm: impro...
508
  			if (!sc->may_writepage)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
  				goto keep_locked;
  
  			/* Page is dirty, try to write it out here */
  			switch(pageout(page, mapping)) {
  			case PAGE_KEEP:
  				goto keep_locked;
  			case PAGE_ACTIVATE:
  				goto activate_locked;
  			case PAGE_SUCCESS:
  				if (PageWriteback(page) || PageDirty(page))
  					goto keep;
  				/*
  				 * A synchronous write - probably a ramdisk.  Go
  				 * ahead and try to reclaim the page.
  				 */
  				if (TestSetPageLocked(page))
  					goto keep;
  				if (PageDirty(page) || PageWriteback(page))
  					goto keep_locked;
  				mapping = page_mapping(page);
  			case PAGE_CLEAN:
  				; /* try to free the page below */
  			}
  		}
  
  		/*
  		 * If the page has buffers, try to free the buffer mappings
  		 * associated with this page. If we succeed we try to free
  		 * the page as well.
  		 *
  		 * We do this even if the page is PageDirty().
  		 * try_to_release_page() does not perform I/O, but it is
  		 * possible for a page to have PageDirty set, but it is actually
  		 * clean (all its buffers are clean).  This happens if the
  		 * buffers were written out directly, with submit_bh(). ext3
  		 * will do this, as well as the blockdev mapping. 
  		 * try_to_release_page() will discover that cleanness and will
  		 * drop the buffers and mark the page clean - it can be freed.
  		 *
  		 * Rarely, pages can have buffers and no ->mapping.  These are
  		 * the pages which were not successfully invalidated in
  		 * truncate_complete_page().  We try to drop those buffers here
  		 * and if that worked, and the page is no longer mapped into
  		 * process address space (page_count == 1) it can be freed.
  		 * Otherwise, leave the page on the LRU so it is swappable.
  		 */
  		if (PagePrivate(page)) {
  			if (!try_to_release_page(page, sc->gfp_mask))
  				goto activate_locked;
  			if (!mapping && page_count(page) == 1)
  				goto free_it;
  		}
28e4d965e   Nick Piggin   [PATCH] mm: remov...
561
  		if (!mapping || !remove_mapping(mapping, page))
49d2e9cc4   Christoph Lameter   [PATCH] Swap Migr...
562
  			goto keep_locked;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
563
564
565
  
  free_it:
  		unlock_page(page);
05ff51376   Andrew Morton   [PATCH] vmscan re...
566
  		nr_reclaimed++;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
567
568
569
570
571
572
573
574
575
576
577
  		if (!pagevec_add(&freed_pvec, page))
  			__pagevec_release_nonlru(&freed_pvec);
  		continue;
  
  activate_locked:
  		SetPageActive(page);
  		pgactivate++;
  keep_locked:
  		unlock_page(page);
  keep:
  		list_add(&page->lru, &ret_pages);
725d704ec   Nick Piggin   [PATCH] mm: VM_BU...
578
  		VM_BUG_ON(PageLRU(page));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
579
580
581
582
  	}
  	list_splice(&ret_pages, page_list);
  	if (pagevec_count(&freed_pvec))
  		__pagevec_release_nonlru(&freed_pvec);
f8891e5e1   Christoph Lameter   [PATCH] Light wei...
583
  	count_vm_events(PGACTIVATE, pgactivate);
05ff51376   Andrew Morton   [PATCH] vmscan re...
584
  	return nr_reclaimed;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
585
  }
49d2e9cc4   Christoph Lameter   [PATCH] Swap Migr...
586
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
   * zone->lru_lock is heavily contended.  Some of the functions that
   * shrink the lists perform better by taking out a batch of pages
   * and working on them outside the LRU lock.
   *
   * For pagecache intensive workloads, this function is the hottest
   * spot in the kernel (apart from copy_*_user functions).
   *
   * Appropriate locks must be held before calling this function.
   *
   * @nr_to_scan:	The number of pages to look through on the list.
   * @src:	The LRU list to pull pages off.
   * @dst:	The temp list to put pages on to.
   * @scanned:	The number of pages that were scanned.
   *
   * returns how many pages were moved onto *@dst.
   */
69e05944a   Andrew Morton   [PATCH] vmscan: u...
603
604
605
  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
  		struct list_head *src, struct list_head *dst,
  		unsigned long *scanned)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
606
  {
69e05944a   Andrew Morton   [PATCH] vmscan: u...
607
  	unsigned long nr_taken = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
608
  	struct page *page;
c9b02d970   Wu Fengguang   [PATCH] mm: isola...
609
  	unsigned long scan;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
610

c9b02d970   Wu Fengguang   [PATCH] mm: isola...
611
  	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
7c8ee9a86   Nick Piggin   [PATCH] mm: simpl...
612
  		struct list_head *target;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
613
614
  		page = lru_to_page(src);
  		prefetchw_prev_lru_page(page, src, flags);
725d704ec   Nick Piggin   [PATCH] mm: VM_BU...
615
  		VM_BUG_ON(!PageLRU(page));
8d438f96d   Nick Piggin   [PATCH] mm: PageL...
616

053837fce   Nick Piggin   [PATCH] mm: migra...
617
  		list_del(&page->lru);
7c8ee9a86   Nick Piggin   [PATCH] mm: simpl...
618
619
  		target = src;
  		if (likely(get_page_unless_zero(page))) {
053837fce   Nick Piggin   [PATCH] mm: migra...
620
  			/*
7c8ee9a86   Nick Piggin   [PATCH] mm: simpl...
621
622
623
  			 * Be careful not to clear PageLRU until after we're
  			 * sure the page is not being freed elsewhere -- the
  			 * page release code relies on it.
053837fce   Nick Piggin   [PATCH] mm: migra...
624
  			 */
7c8ee9a86   Nick Piggin   [PATCH] mm: simpl...
625
626
627
628
  			ClearPageLRU(page);
  			target = dst;
  			nr_taken++;
  		} /* else it is being freed elsewhere */
46453a6e1   Nick Piggin   [PATCH] mm: never...
629

7c8ee9a86   Nick Piggin   [PATCH] mm: simpl...
630
  		list_add(&page->lru, target);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
631
632
633
634
635
636
637
  	}
  
  	*scanned = scan;
  	return nr_taken;
  }
  
  /*
1742f19fa   Andrew Morton   [PATCH] vmscan: r...
638
639
   * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
   * of reclaimed pages
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
640
   */
1742f19fa   Andrew Morton   [PATCH] vmscan: r...
641
642
  static unsigned long shrink_inactive_list(unsigned long max_scan,
  				struct zone *zone, struct scan_control *sc)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
643
644
645
  {
  	LIST_HEAD(page_list);
  	struct pagevec pvec;
69e05944a   Andrew Morton   [PATCH] vmscan: u...
646
  	unsigned long nr_scanned = 0;
05ff51376   Andrew Morton   [PATCH] vmscan re...
647
  	unsigned long nr_reclaimed = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
648
649
650
651
652
  
  	pagevec_init(&pvec, 1);
  
  	lru_add_drain();
  	spin_lock_irq(&zone->lru_lock);
69e05944a   Andrew Morton   [PATCH] vmscan: u...
653
  	do {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
654
  		struct page *page;
69e05944a   Andrew Morton   [PATCH] vmscan: u...
655
656
657
  		unsigned long nr_taken;
  		unsigned long nr_scan;
  		unsigned long nr_freed;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
658
659
660
661
662
663
664
  
  		nr_taken = isolate_lru_pages(sc->swap_cluster_max,
  					     &zone->inactive_list,
  					     &page_list, &nr_scan);
  		zone->nr_inactive -= nr_taken;
  		zone->pages_scanned += nr_scan;
  		spin_unlock_irq(&zone->lru_lock);
69e05944a   Andrew Morton   [PATCH] vmscan: u...
665
  		nr_scanned += nr_scan;
1742f19fa   Andrew Morton   [PATCH] vmscan: r...
666
  		nr_freed = shrink_page_list(&page_list, sc);
05ff51376   Andrew Morton   [PATCH] vmscan re...
667
  		nr_reclaimed += nr_freed;
a74609faf   Nick Piggin   [PATCH] mm: page_...
668
669
  		local_irq_disable();
  		if (current_is_kswapd()) {
f8891e5e1   Christoph Lameter   [PATCH] Light wei...
670
671
  			__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
  			__count_vm_events(KSWAPD_STEAL, nr_freed);
a74609faf   Nick Piggin   [PATCH] mm: page_...
672
  		} else
f8891e5e1   Christoph Lameter   [PATCH] Light wei...
673
674
  			__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
  		__count_vm_events(PGACTIVATE, nr_freed);
a74609faf   Nick Piggin   [PATCH] mm: page_...
675

fb8d14e17   Wu Fengguang   [PATCH] mm: shrin...
676
677
  		if (nr_taken == 0)
  			goto done;
a74609faf   Nick Piggin   [PATCH] mm: page_...
678
  		spin_lock(&zone->lru_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
679
680
681
682
683
  		/*
  		 * Put back any unfreeable pages.
  		 */
  		while (!list_empty(&page_list)) {
  			page = lru_to_page(&page_list);
725d704ec   Nick Piggin   [PATCH] mm: VM_BU...
684
  			VM_BUG_ON(PageLRU(page));
8d438f96d   Nick Piggin   [PATCH] mm: PageL...
685
  			SetPageLRU(page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
686
687
688
689
690
691
692
693
694
695
696
  			list_del(&page->lru);
  			if (PageActive(page))
  				add_page_to_active_list(zone, page);
  			else
  				add_page_to_inactive_list(zone, page);
  			if (!pagevec_add(&pvec, page)) {
  				spin_unlock_irq(&zone->lru_lock);
  				__pagevec_release(&pvec);
  				spin_lock_irq(&zone->lru_lock);
  			}
  		}
69e05944a   Andrew Morton   [PATCH] vmscan: u...
697
    	} while (nr_scanned < max_scan);
fb8d14e17   Wu Fengguang   [PATCH] mm: shrin...
698
  	spin_unlock(&zone->lru_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
699
  done:
fb8d14e17   Wu Fengguang   [PATCH] mm: shrin...
700
  	local_irq_enable();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
701
  	pagevec_release(&pvec);
05ff51376   Andrew Morton   [PATCH] vmscan re...
702
  	return nr_reclaimed;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
703
  }
4ff1ffb48   Nick Piggin   [PATCH] oom: recl...
704
705
706
707
  /*
   * A zone counts as near-OOM once its pages_scanned counter has reached
   * three times the number of pages on its active and inactive lists.
   */
  static inline int zone_is_near_oom(struct zone *zone)
  {
  	if (zone->pages_scanned >= (zone->nr_active + zone->nr_inactive) * 3)
  		return 1;
  
  	return 0;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
  /*
   * This moves pages from the active list to the inactive list.
   *
   * We move them the other way if the page is referenced by one or more
   * processes, from rmap.
   *
   * If the pages are mostly unmapped, the processing is fast and it is
   * appropriate to hold zone->lru_lock across the whole operation.  But if
   * the pages are mapped, the processing is slow (page_referenced()) so we
   * should drop zone->lru_lock around each page.  It's impossible to balance
   * this, so instead we remove the pages from the LRU while processing them.
   * It is safe to rely on PG_active against the non-LRU pages in here because
   * nobody will play with that bit on a non-LRU page.
   *
   * The downside is that we have to touch page->_count against each page.
   * But we had to alter page->flags anyway.
   */
1742f19fa   Andrew Morton   [PATCH] vmscan: r...
725
726
  static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
  				struct scan_control *sc)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
727
  {
69e05944a   Andrew Morton   [PATCH] vmscan: u...
728
  	unsigned long pgmoved;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
729
  	int pgdeactivate = 0;
69e05944a   Andrew Morton   [PATCH] vmscan: u...
730
  	unsigned long pgscanned;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
731
732
733
734
735
736
  	LIST_HEAD(l_hold);	/* The pages which were snipped off */
  	LIST_HEAD(l_inactive);	/* Pages to go onto the inactive_list */
  	LIST_HEAD(l_active);	/* Pages to go onto the active_list */
  	struct page *page;
  	struct pagevec pvec;
  	int reclaim_mapped = 0;
2903fb169   Christoph Lameter   [PATCH] vmscan: s...
737

6e5ef1a96   Christoph Lameter   [PATCH] vmscan: e...
738
  	if (sc->may_swap) {
2903fb169   Christoph Lameter   [PATCH] vmscan: s...
739
740
741
  		long mapped_ratio;
  		long distress;
  		long swap_tendency;
4ff1ffb48   Nick Piggin   [PATCH] oom: recl...
742
743
  		if (zone_is_near_oom(zone))
  			goto force_reclaim_mapped;
2903fb169   Christoph Lameter   [PATCH] vmscan: s...
744
745
746
747
748
749
750
751
752
753
754
755
  		/*
  		 * `distress' is a measure of how much trouble we're having
  		 * reclaiming pages.  0 -> no problems.  100 -> great trouble.
  		 */
  		distress = 100 >> zone->prev_priority;
  
  		/*
  		 * The point of this algorithm is to decide when to start
  		 * reclaiming mapped memory instead of just pagecache.  Work out
  		 * how much memory is mapped.
  		 */
f3dbd3446   Christoph Lameter   [PATCH] zoned vm ...
756
757
  		mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
  				global_page_state(NR_ANON_PAGES)) * 100) /
bf02cf4b6   Christoph Lameter   [PATCH] zoned vm ...
758
  					vm_total_pages;
2903fb169   Christoph Lameter   [PATCH] vmscan: s...
759
760
761
762
763
764
765
766
767
768
769
770
771
  
  		/*
  		 * Now decide how much we really want to unmap some pages.  The
  		 * mapped ratio is downgraded - just because there's a lot of
  		 * mapped memory doesn't necessarily mean that page reclaim
  		 * isn't succeeding.
  		 *
  		 * The distress ratio is important - we don't want to start
  		 * going oom.
  		 *
  		 * A 100% value of vm_swappiness overrides this algorithm
  		 * altogether.
  		 */
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
772
  		swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
2903fb169   Christoph Lameter   [PATCH] vmscan: s...
773
774
775
776
777
778
  
  		/*
  		 * Now use this metric to decide whether to start moving mapped
  		 * memory onto the inactive list.
  		 */
  		if (swap_tendency >= 100)
4ff1ffb48   Nick Piggin   [PATCH] oom: recl...
779
  force_reclaim_mapped:
2903fb169   Christoph Lameter   [PATCH] vmscan: s...
780
781
  			reclaim_mapped = 1;
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
782
783
784
785
786
787
788
789
  
  	lru_add_drain();
  	spin_lock_irq(&zone->lru_lock);
  	pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
  				    &l_hold, &pgscanned);
  	zone->pages_scanned += pgscanned;
  	zone->nr_active -= pgmoved;
  	spin_unlock_irq(&zone->lru_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
790
791
792
793
794
795
796
  	while (!list_empty(&l_hold)) {
  		cond_resched();
  		page = lru_to_page(&l_hold);
  		list_del(&page->lru);
  		if (page_mapped(page)) {
  			if (!reclaim_mapped ||
  			    (total_swap_pages == 0 && PageAnon(page)) ||
f7b7fd8f3   Rik van Riel   [PATCH] temporari...
797
  			    page_referenced(page, 0)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
798
799
800
801
802
803
804
805
806
807
808
809
810
  				list_add(&page->lru, &l_active);
  				continue;
  			}
  		}
  		list_add(&page->lru, &l_inactive);
  	}
  
  	pagevec_init(&pvec, 1);
  	pgmoved = 0;
  	spin_lock_irq(&zone->lru_lock);
  	while (!list_empty(&l_inactive)) {
  		page = lru_to_page(&l_inactive);
  		prefetchw_prev_lru_page(page, &l_inactive, flags);
725d704ec   Nick Piggin   [PATCH] mm: VM_BU...
811
  		VM_BUG_ON(PageLRU(page));
8d438f96d   Nick Piggin   [PATCH] mm: PageL...
812
  		SetPageLRU(page);
725d704ec   Nick Piggin   [PATCH] mm: VM_BU...
813
  		VM_BUG_ON(!PageActive(page));
4c84cacfa   Nick Piggin   [PATCH] mm: PageA...
814
  		ClearPageActive(page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
  		list_move(&page->lru, &zone->inactive_list);
  		pgmoved++;
  		if (!pagevec_add(&pvec, page)) {
  			zone->nr_inactive += pgmoved;
  			spin_unlock_irq(&zone->lru_lock);
  			pgdeactivate += pgmoved;
  			pgmoved = 0;
  			if (buffer_heads_over_limit)
  				pagevec_strip(&pvec);
  			__pagevec_release(&pvec);
  			spin_lock_irq(&zone->lru_lock);
  		}
  	}
  	zone->nr_inactive += pgmoved;
  	pgdeactivate += pgmoved;
  	if (buffer_heads_over_limit) {
  		spin_unlock_irq(&zone->lru_lock);
  		pagevec_strip(&pvec);
  		spin_lock_irq(&zone->lru_lock);
  	}
  
  	pgmoved = 0;
  	while (!list_empty(&l_active)) {
  		page = lru_to_page(&l_active);
  		prefetchw_prev_lru_page(page, &l_active, flags);
725d704ec   Nick Piggin   [PATCH] mm: VM_BU...
840
  		VM_BUG_ON(PageLRU(page));
8d438f96d   Nick Piggin   [PATCH] mm: PageL...
841
  		SetPageLRU(page);
725d704ec   Nick Piggin   [PATCH] mm: VM_BU...
842
  		VM_BUG_ON(!PageActive(page));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
843
844
845
846
847
848
849
850
851
852
853
  		list_move(&page->lru, &zone->active_list);
  		pgmoved++;
  		if (!pagevec_add(&pvec, page)) {
  			zone->nr_active += pgmoved;
  			pgmoved = 0;
  			spin_unlock_irq(&zone->lru_lock);
  			__pagevec_release(&pvec);
  			spin_lock_irq(&zone->lru_lock);
  		}
  	}
  	zone->nr_active += pgmoved;
a74609faf   Nick Piggin   [PATCH] mm: page_...
854

f8891e5e1   Christoph Lameter   [PATCH] Light wei...
855
856
857
  	__count_zone_vm_events(PGREFILL, zone, pgscanned);
  	__count_vm_events(PGDEACTIVATE, pgdeactivate);
  	spin_unlock_irq(&zone->lru_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
858

a74609faf   Nick Piggin   [PATCH] mm: page_...
859
  	pagevec_release(&pvec);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
860
861
862
863
864
  }
  
  /*
   * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
   */
05ff51376   Andrew Morton   [PATCH] vmscan re...
865
866
  static unsigned long shrink_zone(int priority, struct zone *zone,
  				struct scan_control *sc)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
867
868
869
  {
  	unsigned long nr_active;
  	unsigned long nr_inactive;
8695949a1   Christoph Lameter   [PATCH] Thin out ...
870
  	unsigned long nr_to_scan;
05ff51376   Andrew Morton   [PATCH] vmscan re...
871
  	unsigned long nr_reclaimed = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
872

53e9a6159   Martin Hicks   [PATCH] VM: zone ...
873
  	atomic_inc(&zone->reclaim_in_progress);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
874
875
876
877
  	/*
  	 * Add one to `nr_scan_active' just to make sure that the kernel will
  	 * slowly sift through the active list.
  	 */
8695949a1   Christoph Lameter   [PATCH] Thin out ...
878
  	zone->nr_scan_active += (zone->nr_active >> priority) + 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
879
880
881
882
883
  	nr_active = zone->nr_scan_active;
  	if (nr_active >= sc->swap_cluster_max)
  		zone->nr_scan_active = 0;
  	else
  		nr_active = 0;
8695949a1   Christoph Lameter   [PATCH] Thin out ...
884
  	zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
885
886
887
888
889
  	nr_inactive = zone->nr_scan_inactive;
  	if (nr_inactive >= sc->swap_cluster_max)
  		zone->nr_scan_inactive = 0;
  	else
  		nr_inactive = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
890
891
  	while (nr_active || nr_inactive) {
  		if (nr_active) {
8695949a1   Christoph Lameter   [PATCH] Thin out ...
892
  			nr_to_scan = min(nr_active,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
893
  					(unsigned long)sc->swap_cluster_max);
8695949a1   Christoph Lameter   [PATCH] Thin out ...
894
  			nr_active -= nr_to_scan;
1742f19fa   Andrew Morton   [PATCH] vmscan: r...
895
  			shrink_active_list(nr_to_scan, zone, sc);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
896
897
898
  		}
  
  		if (nr_inactive) {
8695949a1   Christoph Lameter   [PATCH] Thin out ...
899
  			nr_to_scan = min(nr_inactive,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
900
  					(unsigned long)sc->swap_cluster_max);
8695949a1   Christoph Lameter   [PATCH] Thin out ...
901
  			nr_inactive -= nr_to_scan;
1742f19fa   Andrew Morton   [PATCH] vmscan: r...
902
903
  			nr_reclaimed += shrink_inactive_list(nr_to_scan, zone,
  								sc);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
904
905
906
907
  		}
  	}
  
  	throttle_vm_writeout();
53e9a6159   Martin Hicks   [PATCH] VM: zone ...
908
909
  
  	atomic_dec(&zone->reclaim_in_progress);
05ff51376   Andrew Morton   [PATCH] vmscan re...
910
  	return nr_reclaimed;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
  }
  
  /*
   * This is the direct reclaim path, for page-allocating processes.  We only
   * try to reclaim pages from zones which will satisfy the caller's allocation
   * request.
   *
   * We reclaim from a zone even if that zone is over pages_high.  Because:
   * a) The caller may be trying to free *extra* pages to satisfy a higher-order
   *    allocation or
   * b) The zones may be over pages_high but they must go *over* pages_high to
   *    satisfy the `incremental min' zone defense algorithm.
   *
   * Returns the number of reclaimed pages.
   *
   * If a zone is deemed to be full of pinned pages then just give it a light
   * scan then give up on it.
   */
1742f19fa   Andrew Morton   [PATCH] vmscan: r...
929
  static unsigned long shrink_zones(int priority, struct zone **zones,
05ff51376   Andrew Morton   [PATCH] vmscan re...
930
  					struct scan_control *sc)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
931
  {
05ff51376   Andrew Morton   [PATCH] vmscan re...
932
  	unsigned long nr_reclaimed = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
933
  	int i;
408d85441   Nick Piggin   [PATCH] oom: use ...
934
  	sc->all_unreclaimable = 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
935
936
  	for (i = 0; zones[i] != NULL; i++) {
  		struct zone *zone = zones[i];
f3fe65122   Con Kolivas   [PATCH] mm: add p...
937
  		if (!populated_zone(zone))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
938
  			continue;
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
939
  		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
940
  			continue;
8695949a1   Christoph Lameter   [PATCH] Thin out ...
941
942
943
  		zone->temp_priority = priority;
  		if (zone->prev_priority > priority)
  			zone->prev_priority = priority;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
944

8695949a1   Christoph Lameter   [PATCH] Thin out ...
945
  		if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
946
  			continue;	/* Let kswapd poll it */
408d85441   Nick Piggin   [PATCH] oom: use ...
947
  		sc->all_unreclaimable = 0;
05ff51376   Andrew Morton   [PATCH] vmscan re...
948
  		nr_reclaimed += shrink_zone(priority, zone, sc);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
949
  	}
05ff51376   Andrew Morton   [PATCH] vmscan re...
950
  	return nr_reclaimed;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
  }
   
  /*
   * This is the main entry point to direct page reclaim.
   *
   * If a full scan of the inactive list fails to free enough memory then we
   * are "out of memory" and something needs to be killed.
   *
   * If the caller is !__GFP_FS then the probability of a failure is reasonably
   * high - the zone may be full of dirty or under-writeback pages, which this
   * caller can't do much about.  We kick pdflush and take explicit naps in the
   * hope that some of these pages can be written.  But if the allocating task
   * holds filesystem locks which prevent writeout this might not work, and the
   * allocation attempt will fail.
   */
69e05944a   Andrew Morton   [PATCH] vmscan: u...
966
  unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
967
968
969
  {
  	int priority;
  	int ret = 0;
69e05944a   Andrew Morton   [PATCH] vmscan: u...
970
  	unsigned long total_scanned = 0;
05ff51376   Andrew Morton   [PATCH] vmscan re...
971
  	unsigned long nr_reclaimed = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
972
  	struct reclaim_state *reclaim_state = current->reclaim_state;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
973
974
  	unsigned long lru_pages = 0;
  	int i;
179e96395   Andrew Morton   [PATCH] vmscan: s...
975
976
977
978
979
  	struct scan_control sc = {
  		.gfp_mask = gfp_mask,
  		.may_writepage = !laptop_mode,
  		.swap_cluster_max = SWAP_CLUSTER_MAX,
  		.may_swap = 1,
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
980
  		.swappiness = vm_swappiness,
179e96395   Andrew Morton   [PATCH] vmscan: s...
981
  	};
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
982

f8891e5e1   Christoph Lameter   [PATCH] Light wei...
983
  	count_vm_event(ALLOCSTALL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
984
985
986
  
  	for (i = 0; zones[i] != NULL; i++) {
  		struct zone *zone = zones[i];
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
987
  		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
988
989
990
991
992
993
994
  			continue;
  
  		zone->temp_priority = DEF_PRIORITY;
  		lru_pages += zone->nr_active + zone->nr_inactive;
  	}
  
  	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
995
  		sc.nr_scanned = 0;
f7b7fd8f3   Rik van Riel   [PATCH] temporari...
996
997
  		if (!priority)
  			disable_swap_token();
1742f19fa   Andrew Morton   [PATCH] vmscan: r...
998
  		nr_reclaimed += shrink_zones(priority, zones, &sc);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
999
1000
  		shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
  		if (reclaim_state) {
05ff51376   Andrew Morton   [PATCH] vmscan re...
1001
  			nr_reclaimed += reclaim_state->reclaimed_slab;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1002
1003
1004
  			reclaim_state->reclaimed_slab = 0;
  		}
  		total_scanned += sc.nr_scanned;
05ff51376   Andrew Morton   [PATCH] vmscan re...
1005
  		if (nr_reclaimed >= sc.swap_cluster_max) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
  			ret = 1;
  			goto out;
  		}
  
  		/*
  		 * Try to write back as many pages as we just scanned.  This
  		 * tends to cause slow streaming writers to write data to the
  		 * disk smoothly, at the dirtying rate, which is nice.   But
  		 * that's undesirable in laptop mode, where we *want* lumpy
  		 * writeout.  So in laptop mode, write out the whole world.
  		 */
179e96395   Andrew Morton   [PATCH] vmscan: s...
1017
1018
  		if (total_scanned > sc.swap_cluster_max +
  					sc.swap_cluster_max / 2) {
687a21cee   Pekka J Enberg   [PATCH] rename wa...
1019
  			wakeup_pdflush(laptop_mode ? 0 : total_scanned);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1020
1021
1022
1023
1024
1025
1026
  			sc.may_writepage = 1;
  		}
  
  		/* Take a nap, wait for some writeback to complete */
  		if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
  			blk_congestion_wait(WRITE, HZ/10);
  	}
408d85441   Nick Piggin   [PATCH] oom: use ...
1027
1028
1029
  	/* top priority shrink_zones still had more to do? don't OOM, then */
  	if (!sc.all_unreclaimable)
  		ret = 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1030
1031
1032
  out:
  	for (i = 0; zones[i] != 0; i++) {
  		struct zone *zone = zones[i];
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
1033
  		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
  			continue;
  
  		zone->prev_priority = zone->temp_priority;
  	}
  	return ret;
  }
  
  /*
   * For kswapd, balance_pgdat() will work across all this node's zones until
   * they are all at pages_high.
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
   * Returns the number of pages which were actually freed.
   *
   * There is special handling here for zones which are full of pinned pages.
   * This can happen if the pages are all mlocked, or if they are all used by
   * device drivers (say, ZONE_DMA).  Or if they are all in use by hugetlb.
   * What we do is to detect the case where all pages in the zone have been
   * scanned twice and there has been zero successful reclaim.  Mark the zone as
   * dead and from now on, only perform a short scan.  Basically we're polling
   * the zone for when the problem goes away.
   *
   * kswapd scans the zones in the highmem->normal->dma direction.  It skips
   * zones which have free_pages > pages_high, but once a zone is found to have
   * free_pages <= pages_high, we scan that zone and the lower zones regardless
   * of the number of free pages in the lower zones.  This interoperates with
   * the page allocator fallback scheme to ensure that aging of pages is balanced
   * across the zones.
   */
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
1062
  static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1063
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1064
1065
1066
  	int all_zones_ok;
  	int priority;
  	int i;
69e05944a   Andrew Morton   [PATCH] vmscan: u...
1067
  	unsigned long total_scanned;
05ff51376   Andrew Morton   [PATCH] vmscan re...
1068
  	unsigned long nr_reclaimed;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1069
  	struct reclaim_state *reclaim_state = current->reclaim_state;
179e96395   Andrew Morton   [PATCH] vmscan: s...
1070
1071
1072
  	struct scan_control sc = {
  		.gfp_mask = GFP_KERNEL,
  		.may_swap = 1,
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
1073
1074
  		.swap_cluster_max = SWAP_CLUSTER_MAX,
  		.swappiness = vm_swappiness,
179e96395   Andrew Morton   [PATCH] vmscan: s...
1075
  	};
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1076
1077
1078
  
  loop_again:
  	total_scanned = 0;
05ff51376   Andrew Morton   [PATCH] vmscan re...
1079
  	nr_reclaimed = 0;
c0bbbc73d   Christoph Lameter   [PATCH] typo in v...
1080
  	sc.may_writepage = !laptop_mode;
f8891e5e1   Christoph Lameter   [PATCH] Light wei...
1081
  	count_vm_event(PAGEOUTRUN);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
  
  	for (i = 0; i < pgdat->nr_zones; i++) {
  		struct zone *zone = pgdat->node_zones + i;
  
  		zone->temp_priority = DEF_PRIORITY;
  	}
  
  	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
  		int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
  		unsigned long lru_pages = 0;
f7b7fd8f3   Rik van Riel   [PATCH] temporari...
1092
1093
1094
  		/* The swap token gets in the way of swapout... */
  		if (!priority)
  			disable_swap_token();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1095
  		all_zones_ok = 1;
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
1096
1097
1098
1099
1100
1101
  		/*
  		 * Scan in the highmem->dma direction for the highest
  		 * zone which needs scanning
  		 */
  		for (i = pgdat->nr_zones - 1; i >= 0; i--) {
  			struct zone *zone = pgdat->node_zones + i;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1102

d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
1103
1104
  			if (!populated_zone(zone))
  				continue;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1105

d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
1106
1107
  			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
  				continue;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1108

d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
1109
1110
1111
1112
  			if (!zone_watermark_ok(zone, order, zone->pages_high,
  					       0, 0)) {
  				end_zone = i;
  				goto scan;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1113
  			}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1114
  		}
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
1115
  		goto out;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
  scan:
  		for (i = 0; i <= end_zone; i++) {
  			struct zone *zone = pgdat->node_zones + i;
  
  			lru_pages += zone->nr_active + zone->nr_inactive;
  		}
  
  		/*
  		 * Now scan the zones in the dma->highmem direction, stopping
  		 * at the last zone which needs scanning.
  		 *
  		 * We do this because the page allocator works in the opposite
  		 * direction.  This prevents the page allocator from allocating
  		 * pages behind kswapd's direction of progress, which would
  		 * cause too much scanning of the lower zones.
  		 */
  		for (i = 0; i <= end_zone; i++) {
  			struct zone *zone = pgdat->node_zones + i;
b15e0905f   Andrew Morton   [PATCH] vmscan: n...
1134
  			int nr_slab;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1135

f3fe65122   Con Kolivas   [PATCH] mm: add p...
1136
  			if (!populated_zone(zone))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1137
1138
1139
1140
  				continue;
  
  			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
  				continue;
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
1141
1142
1143
  			if (!zone_watermark_ok(zone, order, zone->pages_high,
  					       end_zone, 0))
  				all_zones_ok = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1144
1145
1146
1147
  			zone->temp_priority = priority;
  			if (zone->prev_priority > priority)
  				zone->prev_priority = priority;
  			sc.nr_scanned = 0;
05ff51376   Andrew Morton   [PATCH] vmscan re...
1148
  			nr_reclaimed += shrink_zone(priority, zone, &sc);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1149
  			reclaim_state->reclaimed_slab = 0;
b15e0905f   Andrew Morton   [PATCH] vmscan: n...
1150
1151
  			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
  						lru_pages);
05ff51376   Andrew Morton   [PATCH] vmscan re...
1152
  			nr_reclaimed += reclaim_state->reclaimed_slab;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1153
1154
1155
  			total_scanned += sc.nr_scanned;
  			if (zone->all_unreclaimable)
  				continue;
b15e0905f   Andrew Morton   [PATCH] vmscan: n...
1156
  			if (nr_slab == 0 && zone->pages_scanned >=
4ff1ffb48   Nick Piggin   [PATCH] oom: recl...
1157
  				    (zone->nr_active + zone->nr_inactive) * 6)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1158
1159
1160
1161
1162
1163
1164
  				zone->all_unreclaimable = 1;
  			/*
  			 * If we've done a decent amount of scanning and
  			 * the reclaim ratio is low, start doing writepage
  			 * even in laptop mode
  			 */
  			if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
05ff51376   Andrew Morton   [PATCH] vmscan re...
1165
  			    total_scanned > nr_reclaimed + nr_reclaimed / 2)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1166
1167
  				sc.may_writepage = 1;
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
  		if (all_zones_ok)
  			break;		/* kswapd: all done */
  		/*
  		 * OK, kswapd is getting into trouble.  Take a nap, then take
  		 * another pass across the zones.
  		 */
  		if (total_scanned && priority < DEF_PRIORITY - 2)
  			blk_congestion_wait(WRITE, HZ/10);
  
  		/*
  		 * We do this so kswapd doesn't build up large priorities for
  		 * example when it is freeing in parallel with allocators. It
  		 * matches the direct reclaim path behaviour in terms of impact
  		 * on zone->*_priority.
  		 */
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
1183
  		if (nr_reclaimed >= SWAP_CLUSTER_MAX)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
  			break;
  	}
  out:
  	for (i = 0; i < pgdat->nr_zones; i++) {
  		struct zone *zone = pgdat->node_zones + i;
  
  		zone->prev_priority = zone->temp_priority;
  	}
  	if (!all_zones_ok) {
  		cond_resched();
  		goto loop_again;
  	}
05ff51376   Andrew Morton   [PATCH] vmscan re...
1196
  	return nr_reclaimed;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
  }
  
  /*
   * The background pageout daemon, started as a kernel thread
   * from the init process. 
   *
   * This basically trickles out pages so that we have _some_
   * free memory available even if there is no other activity
   * that frees anything up. This is needed for things like routing
   * etc, where we otherwise might have all activity going on in
   * asynchronous contexts that cannot page things out.
   *
   * If there are applications that are active memory-allocators
   * (most normal use), this basically shouldn't matter.
   */
  static int kswapd(void *p)
  {
  	unsigned long order;
  	pg_data_t *pgdat = (pg_data_t*)p;
  	struct task_struct *tsk = current;
  	DEFINE_WAIT(wait);
  	struct reclaim_state reclaim_state = {
  		.reclaimed_slab = 0,
  	};
  	cpumask_t cpumask;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
  	cpumask = node_to_cpumask(pgdat->node_id);
  	if (!cpus_empty(cpumask))
  		set_cpus_allowed(tsk, cpumask);
  	current->reclaim_state = &reclaim_state;
  
  	/*
  	 * Tell the memory management that we're a "memory allocator",
  	 * and that if we need more memory we should get access to it
  	 * regardless (see "__alloc_pages()"). "kswapd" should
  	 * never get caught in the normal page freeing logic.
  	 *
  	 * (Kswapd normally doesn't need memory anyway, but sometimes
  	 * you need a small amount of memory in order to be able to
  	 * page out something else, and this flag essentially protects
  	 * us from recursively trying to free more memory as we're
  	 * trying to free the first piece of memory in the first place).
  	 */
930d91525   Christoph Lameter   [PATCH] Swap Migr...
1239
  	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1240
1241
1242
1243
  
  	order = 0;
  	for ( ; ; ) {
  		unsigned long new_order;
3e1d1d28d   Christoph Lameter   [PATCH] Cleanup p...
1244
1245
  
  		try_to_freeze();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
  
  		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
  		new_order = pgdat->kswapd_max_order;
  		pgdat->kswapd_max_order = 0;
  		if (order < new_order) {
  			/*
  			 * Don't sleep if someone wants a larger 'order'
  			 * allocation
  			 */
  			order = new_order;
  		} else {
  			schedule();
  			order = pgdat->kswapd_max_order;
  		}
  		finish_wait(&pgdat->kswapd_wait, &wait);
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
1261
  		balance_pgdat(pgdat, order);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
  	}
  	return 0;
  }
  
  /*
   * A zone is low on free memory, so wake its kswapd task to service it.
   */
  void wakeup_kswapd(struct zone *zone, int order)
  {
  	pg_data_t *pgdat;
f3fe65122   Con Kolivas   [PATCH] mm: add p...
1272
  	if (!populated_zone(zone))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1273
1274
1275
  		return;
  
  	pgdat = zone->zone_pgdat;
7fb1d9fca   Rohit Seth   [PATCH] mm: __all...
1276
  	if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1277
1278
1279
  		return;
  	if (pgdat->kswapd_max_order < order)
  		pgdat->kswapd_max_order = order;
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
1280
  	if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1281
  		return;
8d0986e28   Con Kolivas   [PATCH] vm: kswap...
1282
  	if (!waitqueue_active(&pgdat->kswapd_wait))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1283
  		return;
8d0986e28   Con Kolivas   [PATCH] vm: kswap...
1284
  	wake_up_interruptible(&pgdat->kswapd_wait);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1285
1286
1287
1288
  }
  
  #ifdef CONFIG_PM
  /*
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
   * Helper function for shrink_all_memory().  Tries to reclaim 'nr_pages' pages
   * from LRU lists system-wide, for given pass and priority, and returns the
   * number of reclaimed pages.
   *
   * For pass > 3 we also try to shrink the LRU lists that contain only a few
   * pages.
   */
  static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
  				      int pass, struct scan_control *sc)
  {
  	struct zone *zone;
  	unsigned long nr_to_scan, ret = 0;

  	/*
  	 * Arguments, in the order the caller supplies them:
  	 *   nr_pages - number of pages the caller still wants reclaimed
  	 *   prio     - scan priority; list sizes are shifted right by it
  	 *   pass     - reclaim pass number (0 skips the active list, > 3
  	 *              scans a list even if little scan work has built up)
  	 *   sc       - scan control handed on to the list shrinkers
  	 *
  	 * shrink_all_memory() invokes this as (nr_to_scan, prio, pass, &sc),
  	 * so 'prio' has to precede 'pass' in the parameter list.  With the
  	 * two declared the other way round the pass number ended up as the
  	 * shift count and the priority was tested against the pass limits.
  	 *
  	 * Returns the number of pages reclaimed from the inactive lists;
  	 * returns early as soon as that reaches nr_pages.
  	 */
  	for_each_zone(zone) {

  		if (!populated_zone(zone))
  			continue;

  		/* Unreclaimable zones only get a look at the default priority */
  		if (zone->all_unreclaimable && prio != DEF_PRIORITY)
  			continue;

  		/* For pass = 0 we don't shrink the active list */
  		if (pass > 0) {
  			zone->nr_scan_active += (zone->nr_active >> prio) + 1;
  			if (zone->nr_scan_active >= nr_pages || pass > 3) {
  				zone->nr_scan_active = 0;
  				nr_to_scan = min(nr_pages, zone->nr_active);
  				shrink_active_list(nr_to_scan, zone, sc);
  			}
  		}

  		zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1;
  		if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
  			zone->nr_scan_inactive = 0;
  			nr_to_scan = min(nr_pages, zone->nr_inactive);
  			ret += shrink_inactive_list(nr_to_scan, zone, sc);
  			/* Stop walking zones once the request is satisfied */
  			if (ret >= nr_pages)
  				return ret;
  		}
  	}

  	return ret;
  }
  
  /*
   * Try to free `nr_pages' of memory, system-wide, and return the number of
   * freed pages.
   *
   * Rather than trying to age LRUs the aim is to preserve the overall
   * LRU order by reclaiming preferentially
   * inactive > active > active referenced > active mapped
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1339
   */
69e05944a   Andrew Morton   [PATCH] vmscan: u...
1340
  unsigned long shrink_all_memory(unsigned long nr_pages)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1341
  {
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
1342
  	unsigned long lru_pages, nr_slab;
69e05944a   Andrew Morton   [PATCH] vmscan: u...
1343
  	unsigned long ret = 0;
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
1344
1345
1346
1347
1348
1349
1350
1351
1352
  	int pass;
  	struct reclaim_state reclaim_state;
  	struct zone *zone;
  	struct scan_control sc = {
  		.gfp_mask = GFP_KERNEL,
  		.may_swap = 0,
  		.swap_cluster_max = nr_pages,
  		.may_writepage = 1,
  		.swappiness = vm_swappiness,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1353
1354
1355
  	};
  
  	current->reclaim_state = &reclaim_state;
69e05944a   Andrew Morton   [PATCH] vmscan: u...
1356

d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
1357
1358
1359
  	lru_pages = 0;
  	for_each_zone(zone)
  		lru_pages += zone->nr_active + zone->nr_inactive;
972d1a7b1   Christoph Lameter   [PATCH] ZVC: Supp...
1360
  	nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
1361
1362
1363
1364
1365
  	/* If slab caches are huge, it's better to hit them first */
  	while (nr_slab >= lru_pages) {
  		reclaim_state.reclaimed_slab = 0;
  		shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
  		if (!reclaim_state.reclaimed_slab)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1366
  			break;
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
1367
1368
1369
1370
1371
1372
  
  		ret += reclaim_state.reclaimed_slab;
  		if (ret >= nr_pages)
  			goto out;
  
  		nr_slab -= reclaim_state.reclaimed_slab;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1373
  	}
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
  
  	/*
  	 * We try to shrink LRUs in 5 passes:
  	 * 0 = Reclaim from inactive_list only
  	 * 1 = Reclaim from active list but don't reclaim mapped
  	 * 2 = 2nd pass of type 1
  	 * 3 = Reclaim mapped (normal reclaim)
  	 * 4 = 2nd pass of type 3
  	 */
  	for (pass = 0; pass < 5; pass++) {
  		int prio;
  
  		/* Needed for shrinking slab caches later on */
  		if (!lru_pages)
  			for_each_zone(zone) {
  				lru_pages += zone->nr_active;
  				lru_pages += zone->nr_inactive;
  			}
  
  		/* Force reclaiming mapped pages in the passes #3 and #4 */
  		if (pass > 2) {
  			sc.may_swap = 1;
  			sc.swappiness = 100;
  		}
  
  		for (prio = DEF_PRIORITY; prio >= 0; prio--) {
  			unsigned long nr_to_scan = nr_pages - ret;
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
1401
  			sc.nr_scanned = 0;
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
  			ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
  			if (ret >= nr_pages)
  				goto out;
  
  			reclaim_state.reclaimed_slab = 0;
  			shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages);
  			ret += reclaim_state.reclaimed_slab;
  			if (ret >= nr_pages)
  				goto out;
  
  			if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
  				blk_congestion_wait(WRITE, HZ / 10);
  		}
  
  		lru_pages = 0;
248a0301e   Rafael J. Wysocki   [PATCH] mm: make ...
1417
  	}
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
  
  	/*
  	 * If ret = 0, we could not shrink LRUs, but there may be something
  	 * in slab caches
  	 */
  	if (!ret)
  		do {
  			reclaim_state.reclaimed_slab = 0;
  			shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
  			ret += reclaim_state.reclaimed_slab;
  		} while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
  
  out:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1431
  	current->reclaim_state = NULL;
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
1432

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1433
1434
1435
1436
1437
1438
1439
1440
1441
  	return ret;
  }
  #endif
  
  #ifdef CONFIG_HOTPLUG_CPU
  /* It's optimal to keep kswapds on the same CPUs as their memory, but
     not required for correctness.  So if the last cpu in a node goes
     away, we get changed to run anywhere: as the first one comes back,
     restore their cpu bindings. */
9c7b216d2   Chandra Seetharaman   [PATCH] cpu hotpl...
1442
  static int __devinit cpu_callback(struct notifier_block *nfb,
69e05944a   Andrew Morton   [PATCH] vmscan: u...
1443
  				  unsigned long action, void *hcpu)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1444
1445
1446
1447
1448
  {
  	pg_data_t *pgdat;
  	cpumask_t mask;
  
  	if (action == CPU_ONLINE) {
ec936fc56   KAMEZAWA Hiroyuki   [PATCH] for_each_...
1449
  		for_each_online_pgdat(pgdat) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1450
1451
1452
1453
1454
1455
1456
1457
1458
  			mask = node_to_cpumask(pgdat->node_id);
  			if (any_online_cpu(mask) != NR_CPUS)
  				/* One of our CPUs online: restore mask */
  				set_cpus_allowed(pgdat->kswapd, mask);
  		}
  	}
  	return NOTIFY_OK;
  }
  #endif /* CONFIG_HOTPLUG_CPU */
3218ae14b   Yasunori Goto   [PATCH] pgdat all...
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
  /*
   * This kswapd start function will be called by init and node-hot-add.
   * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
   */
  int kswapd_run(int nid)
  {
  	pg_data_t *pgdat = NODE_DATA(nid);
  	int ret = 0;
  
  	if (pgdat->kswapd)
  		return 0;
  
  	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
  	if (IS_ERR(pgdat->kswapd)) {
  		/* failure at boot is fatal */
  		BUG_ON(system_state == SYSTEM_BOOTING);
  		printk("Failed to start kswapd on node %d
  ",nid);
  		ret = -1;
  	}
  	return ret;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1481
1482
  /*
   * Initcall: run swap_setup(), start a kswapd for every online node and
   * register the cpu hotplug callback.  Always returns 0.
   */
  static int __init kswapd_init(void)
  {
  	int node;
  
  	swap_setup();
  
  	/* kswapd_run() reports failures itself; the result is not checked */
  	for_each_online_node(node) {
  		kswapd_run(node);
  	}
  
  	hotcpu_notifier(cpu_callback, 0);
  
  	return 0;
  }
  
  module_init(kswapd_init)
9eeff2395   Christoph Lameter   [PATCH] Zone recl...
1493
1494
1495
1496
1497
1498
1499
  
  #ifdef CONFIG_NUMA
  /*
   * Zone reclaim mode
   *
   * If non-zero call zone_reclaim when the number of free pages falls below
   * the watermarks.
9eeff2395   Christoph Lameter   [PATCH] Zone recl...
1500
1501
   */
  int zone_reclaim_mode __read_mostly;
1b2ffb789   Christoph Lameter   [PATCH] Zone recl...
1502
1503
1504
1505
  /*
   * Values for zone_reclaim_mode.  RECLAIM_OFF is the "no bits set" value;
   * the others are single-bit flags.  __zone_reclaim() tests RECLAIM_WRITE
   * and RECLAIM_SWAP to fill in scan_control.may_writepage and
   * scan_control.may_swap.
   */
  #define RECLAIM_OFF 0
  #define RECLAIM_ZONE (1<<0)	/* Run shrink_cache on the zone */
  #define RECLAIM_WRITE (1<<1)	/* Writeout pages during reclaim */
  #define RECLAIM_SWAP (1<<2)	/* Swap pages out during reclaim */
9eeff2395   Christoph Lameter   [PATCH] Zone recl...
1506
  /*
a92f71263   Christoph Lameter   [PATCH] zone_recl...
1507
1508
1509
1510
1511
   * Priority for ZONE_RECLAIM. This determines the fraction of pages
   * of a node considered for each zone_reclaim. 4 scans 1/16th of
   * a zone.
   */
  #define ZONE_RECLAIM_PRIORITY 4
9eeff2395   Christoph Lameter   [PATCH] Zone recl...
1512
  /*
9614634fe   Christoph Lameter   [PATCH] ZVC/zone_...
1513
1514
1515
1516
1517
1518
   * Percentage of pages in a zone that must be unmapped for zone_reclaim to
   * occur.
   */
  int sysctl_min_unmapped_ratio = 1;
  
  /*
0ff38490c   Christoph Lameter   [PATCH] zone_recl...
1519
1520
1521
1522
1523
1524
   * If the number of slab pages in a zone grows beyond this percentage then
   * slab reclaim needs to occur.
   */
  int sysctl_min_slab_ratio = 5;
  
  /*
9eeff2395   Christoph Lameter   [PATCH] Zone recl...
1525
1526
   * Try to free up some pages from this zone through reclaim.
   */
179e96395   Andrew Morton   [PATCH] vmscan: s...
1527
  static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
9eeff2395   Christoph Lameter   [PATCH] Zone recl...
1528
  {
7fb2d46d3   Christoph Lameter   [PATCH] zone_recl...
1529
  	/* Minimum pages needed in order to stay on node */
69e05944a   Andrew Morton   [PATCH] vmscan: u...
1530
  	const unsigned long nr_pages = 1 << order;
9eeff2395   Christoph Lameter   [PATCH] Zone recl...
1531
1532
  	struct task_struct *p = current;
  	struct reclaim_state reclaim_state;
8695949a1   Christoph Lameter   [PATCH] Thin out ...
1533
  	int priority;
05ff51376   Andrew Morton   [PATCH] vmscan re...
1534
  	unsigned long nr_reclaimed = 0;
179e96395   Andrew Morton   [PATCH] vmscan: s...
1535
1536
1537
  	struct scan_control sc = {
  		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
  		.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
69e05944a   Andrew Morton   [PATCH] vmscan: u...
1538
1539
  		.swap_cluster_max = max_t(unsigned long, nr_pages,
  					SWAP_CLUSTER_MAX),
179e96395   Andrew Morton   [PATCH] vmscan: s...
1540
  		.gfp_mask = gfp_mask,
d6277db4a   Rafael J. Wysocki   [PATCH] swsusp: r...
1541
  		.swappiness = vm_swappiness,
179e96395   Andrew Morton   [PATCH] vmscan: s...
1542
  	};
83e33a471   Christoph Lameter   [PATCH] zone recl...
1543
  	unsigned long slab_reclaimable;
9eeff2395   Christoph Lameter   [PATCH] Zone recl...
1544
1545
  
  	disable_swap_token();
9eeff2395   Christoph Lameter   [PATCH] Zone recl...
1546
  	cond_resched();
d4f7796e9   Christoph Lameter   [PATCH] vmscan: f...
1547
1548
1549
1550
1551
1552
  	/*
  	 * We need to be able to allocate from the reserves for RECLAIM_SWAP
  	 * and we also need to be able to write out pages for RECLAIM_WRITE
  	 * and RECLAIM_SWAP.
  	 */
  	p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
9eeff2395   Christoph Lameter   [PATCH] Zone recl...
1553
1554
  	reclaim_state.reclaimed_slab = 0;
  	p->reclaim_state = &reclaim_state;
c84db23c6   Christoph Lameter   [PATCH] zone_recl...
1555

0ff38490c   Christoph Lameter   [PATCH] zone_recl...
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
  	if (zone_page_state(zone, NR_FILE_PAGES) -
  		zone_page_state(zone, NR_FILE_MAPPED) >
  		zone->min_unmapped_pages) {
  		/*
  		 * Free memory by calling shrink zone with increasing
  		 * priorities until we have enough memory freed.
  		 */
  		priority = ZONE_RECLAIM_PRIORITY;
  		do {
  			nr_reclaimed += shrink_zone(priority, zone, &sc);
  			priority--;
  		} while (priority >= 0 && nr_reclaimed < nr_pages);
  	}
c84db23c6   Christoph Lameter   [PATCH] zone_recl...
1569

83e33a471   Christoph Lameter   [PATCH] zone recl...
1570
1571
  	slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
  	if (slab_reclaimable > zone->min_slab_pages) {
2a16e3f4b   Christoph Lameter   [PATCH] Reclaim s...
1572
  		/*
7fb2d46d3   Christoph Lameter   [PATCH] zone_recl...
1573
  		 * shrink_slab() does not currently allow us to determine how
0ff38490c   Christoph Lameter   [PATCH] zone_recl...
1574
1575
1576
1577
  		 * many pages were freed in this zone. So we take the current
  		 * number of slab pages and shake the slab until it is reduced
  		 * by the same nr_pages that we used for reclaiming unmapped
  		 * pages.
2a16e3f4b   Christoph Lameter   [PATCH] Reclaim s...
1578
  		 *
0ff38490c   Christoph Lameter   [PATCH] zone_recl...
1579
1580
  		 * Note that shrink_slab will free memory on all zones and may
  		 * take a long time.
2a16e3f4b   Christoph Lameter   [PATCH] Reclaim s...
1581
  		 */
0ff38490c   Christoph Lameter   [PATCH] zone_recl...
1582
  		while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
83e33a471   Christoph Lameter   [PATCH] zone recl...
1583
1584
  			zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
  				slab_reclaimable - nr_pages)
0ff38490c   Christoph Lameter   [PATCH] zone_recl...
1585
  			;
83e33a471   Christoph Lameter   [PATCH] zone recl...
1586
1587
1588
1589
1590
1591
1592
  
  		/*
  		 * Update nr_reclaimed by the number of slab pages we
  		 * reclaimed from this zone.
  		 */
  		nr_reclaimed += slab_reclaimable -
  			zone_page_state(zone, NR_SLAB_RECLAIMABLE);
2a16e3f4b   Christoph Lameter   [PATCH] Reclaim s...
1593
  	}
9eeff2395   Christoph Lameter   [PATCH] Zone recl...
1594
  	p->reclaim_state = NULL;
d4f7796e9   Christoph Lameter   [PATCH] vmscan: f...
1595
  	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
05ff51376   Andrew Morton   [PATCH] vmscan re...
1596
  	return nr_reclaimed >= nr_pages;
9eeff2395   Christoph Lameter   [PATCH] Zone recl...
1597
  }
179e96395   Andrew Morton   [PATCH] vmscan: s...
1598
1599
1600
1601
1602
1603
1604
  
  /*
   * Decide whether reclaim should be attempted on @zone for an allocation
   * of the given @gfp_mask and @order, and run __zone_reclaim() if so.
   *
   * Returns 0 when reclaim is skipped, otherwise whatever __zone_reclaim()
   * returns.
   */
  int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
  {
  	cpumask_t node_cpus;
  	int nid;
  
  	/*
  	 * Zone reclaim reclaims unmapped file backed pages and
  	 * slab pages if we are over the defined limits.
  	 *
  	 * A small portion of unmapped file backed pages is needed for
  	 * file I/O otherwise pages read by file I/O will be immediately
  	 * thrown out if the zone is overallocated. So we do not reclaim
  	 * if less than a specified percentage of the zone is used by
  	 * unmapped file backed pages.
  	 */
  	if (zone_page_state(zone, NR_FILE_PAGES) -
  	    zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages) {
  		/* ... and slab is within its limit too: nothing to do */
  		if (zone_page_state(zone, NR_SLAB_RECLAIMABLE) <=
  		    zone->min_slab_pages)
  			return 0;
  	}
  
  	/* If we should not delay the allocation then do not scan */
  	if (!(gfp_mask & __GFP_WAIT))
  		return 0;
  
  	/* Do not reclaim in a zone that does not have reclaimable pages */
  	if (zone->all_unreclaimable)
  		return 0;
  
  	/* Avoid concurrent zone reclaims */
  	if (atomic_read(&zone->reclaim_in_progress) > 0)
  		return 0;
  
  	/* The current task already has PF_MEMALLOC set */
  	if (current->flags & PF_MEMALLOC)
  		return 0;
  
  	/*
  	 * Only run zone reclaim on the local zone or on zones that do not
  	 * have associated processors. This will favor the local processor
  	 * over remote processors and spread off node memory allocations
  	 * as wide as possible.
  	 */
  	nid = zone_to_nid(zone);
  	node_cpus = node_to_cpumask(nid);
  	if (!cpus_empty(node_cpus) && nid != numa_node_id())
  		return 0;
  
  	return __zone_reclaim(zone, gfp_mask, order);
  }
9eeff2395   Christoph Lameter   [PATCH] Zone recl...
1643
  #endif