Commit 04e62a29bf157ce1edd168f2b71b533c80d13628

Authored by Christoph Lameter
Committed by Linus Torvalds
1 parent 442c9137de

[PATCH] More page migration: use migration entries for file pages

This implements the use of migration entries to preserve the ptes of file-backed
pages during migration.  Processes can therefore be migrated back and forth
without losing their connection to pagecache pages.

Note that we implement the migration entries only for linear mappings.
Nonlinear mappings still require the unmapping of the ptes for migration.
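
For readers new to the mechanism: a migration entry is a swap-style pte that
records which page is under migration and whether the original mapping was
writable, using the two swap types reserved below in include/linux/swap.h
(SWP_MIGRATION_READ and SWP_MIGRATION_WRITE).  The userspace sketch that
follows only models that encoding; the real helpers live in
include/linux/swapops.h, and the 5/27 type/offset split is taken from the
swap.h comment as an illustrative assumption, not the layout of any particular
architecture.

#include <assert.h>
#include <stdio.h>

/* Simplified model of the swp_entry_t encoding used for migration entries.
 * The real helpers live in include/linux/swapops.h; the 5/27 split below
 * mirrors the comment in include/linux/swap.h and is illustrative only. */
#define MAX_SWAPFILES_SHIFT	5
#define MAX_SWAPFILES		((1UL << MAX_SWAPFILES_SHIFT) - 2)
#define SWP_MIGRATION_READ	MAX_SWAPFILES
#define SWP_MIGRATION_WRITE	(MAX_SWAPFILES + 1)

typedef struct { unsigned long val; } swp_entry_t;

static swp_entry_t swp_entry(unsigned long type, unsigned long offset)
{
	swp_entry_t e = { .val = (type << 27) | offset };
	return e;
}

static unsigned long swp_type(swp_entry_t e)   { return e.val >> 27; }
static unsigned long swp_offset(swp_entry_t e) { return e.val & ((1UL << 27) - 1); }

/* A migration entry records the pfn of the page under migration plus
 * whether the original pte was writable. */
static swp_entry_t make_migration_entry(unsigned long pfn, int write)
{
	return swp_entry(write ? SWP_MIGRATION_WRITE : SWP_MIGRATION_READ, pfn);
}

static int is_migration_entry(swp_entry_t e)
{
	return swp_type(e) == SWP_MIGRATION_READ ||
	       swp_type(e) == SWP_MIGRATION_WRITE;
}

static int is_write_migration_entry(swp_entry_t e)
{
	return swp_type(e) == SWP_MIGRATION_WRITE;
}

int main(void)
{
	swp_entry_t e = make_migration_entry(0x1234, 1);

	assert(is_migration_entry(e));
	assert(is_write_migration_entry(e));
	assert(swp_offset(e) == 0x1234);
	printf("migration entry val=%#lx (type=%lu, pfn=%#lx)\n",
	       e.val, swp_type(e), swp_offset(e));
	return 0;
}

In the real kernel the offset holds the page's pfn, so migration_entry_to_page()
can recover the struct page, and do_swap_page() calls migration_entry_wait()
(visible in the mm/migrate.c hunk below) when a fault hits such an entry.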

Another writepage() ugliness shows up: writepage() can drop the page lock.
Therefore we have to remove the migration ptes before calling writepage()
in order to avoid having migration entries point to unlocked pages.
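
To make the ordering concrete, here is a toy userspace model (not kernel code)
of the constraint enforced by the new writeout() helper in the mm/migrate.c
hunk below: because ->writepage() may unlock the page, the migration ptes are
torn down first, while the page is still locked, and the migration attempt is
then retried.  Every identifier prefixed with toy_ is a hypothetical stand-in,
not a kernel API.

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Invariant being modelled: a migration entry must never be left pointing at
 * an unlocked page, because migration_entry_wait() sleeps on the page lock. */
struct toy_page {
	bool locked;
	bool dirty;
	int  migration_ptes;	/* migration entries still pointing at this page */
};

static void toy_remove_migration_ptes(struct toy_page *p)
{
	p->migration_ptes = 0;
}

/* Like ->writepage(): writes the page out and is allowed to drop the lock. */
static int toy_writepage(struct toy_page *p)
{
	p->dirty = false;
	p->locked = false;
	return 0;
}

/* Condensed model of writeout(): the ptes are removed *before* writepage,
 * while the page is still locked, so the invariant holds throughout. */
static int toy_writeout(struct toy_page *p)
{
	toy_remove_migration_ptes(p);
	toy_writepage(p);
	if (!p->locked)
		p->locked = true;	/* relock, as writeout() does */
	return -EAGAIN;			/* retry the migration later */
}

int main(void)
{
	struct toy_page p = { .locked = true, .dirty = true, .migration_ptes = 1 };
	int rc = toy_writeout(&p);

	assert(p.migration_ptes == 0 && !p.dirty);
	printf("writeout model returned %d; no migration pte left behind\n", rc);
	return 0;
}

The cost of this ordering is that a dirty page always needs another migration
pass after writeback, which is why writeout() in the patch returns -EAGAIN
unconditionally.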

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 4 changed files with 124 additions and 43 deletions

include/linux/swap.h
1 #ifndef _LINUX_SWAP_H 1 #ifndef _LINUX_SWAP_H
2 #define _LINUX_SWAP_H 2 #define _LINUX_SWAP_H
3 3
4 #include <linux/spinlock.h> 4 #include <linux/spinlock.h>
5 #include <linux/linkage.h> 5 #include <linux/linkage.h>
6 #include <linux/mmzone.h> 6 #include <linux/mmzone.h>
7 #include <linux/list.h> 7 #include <linux/list.h>
8 #include <linux/sched.h> 8 #include <linux/sched.h>
9 9
10 #include <asm/atomic.h> 10 #include <asm/atomic.h>
11 #include <asm/page.h> 11 #include <asm/page.h>
12 12
13 #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ 13 #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */
14 #define SWAP_FLAG_PRIO_MASK 0x7fff 14 #define SWAP_FLAG_PRIO_MASK 0x7fff
15 #define SWAP_FLAG_PRIO_SHIFT 0 15 #define SWAP_FLAG_PRIO_SHIFT 0
16 16
17 static inline int current_is_kswapd(void) 17 static inline int current_is_kswapd(void)
18 { 18 {
19 return current->flags & PF_KSWAPD; 19 return current->flags & PF_KSWAPD;
20 } 20 }
21 21
22 /* 22 /*
23 * MAX_SWAPFILES defines the maximum number of swaptypes: things which can 23 * MAX_SWAPFILES defines the maximum number of swaptypes: things which can
24 * be swapped to. The swap type and the offset into that swap type are 24 * be swapped to. The swap type and the offset into that swap type are
25 * encoded into pte's and into pgoff_t's in the swapcache. Using five bits 25 * encoded into pte's and into pgoff_t's in the swapcache. Using five bits
26 * for the type means that the maximum number of swapcache pages is 27 bits 26 * for the type means that the maximum number of swapcache pages is 27 bits
27 * on 32-bit-pgoff_t architectures. And that assumes that the architecture packs 27 * on 32-bit-pgoff_t architectures. And that assumes that the architecture packs
28 * the type/offset into the pte as 5/27 as well. 28 * the type/offset into the pte as 5/27 as well.
29 */ 29 */
30 #define MAX_SWAPFILES_SHIFT 5 30 #define MAX_SWAPFILES_SHIFT 5
31 #ifndef CONFIG_MIGRATION 31 #ifndef CONFIG_MIGRATION
32 #define MAX_SWAPFILES (1 << MAX_SWAPFILES_SHIFT) 32 #define MAX_SWAPFILES (1 << MAX_SWAPFILES_SHIFT)
33 #else 33 #else
34 /* Use last two entries for page migration swap entries */ 34 /* Use last two entries for page migration swap entries */
35 #define MAX_SWAPFILES ((1 << MAX_SWAPFILES_SHIFT)-2) 35 #define MAX_SWAPFILES ((1 << MAX_SWAPFILES_SHIFT)-2)
36 #define SWP_MIGRATION_READ MAX_SWAPFILES 36 #define SWP_MIGRATION_READ MAX_SWAPFILES
37 #define SWP_MIGRATION_WRITE (MAX_SWAPFILES + 1) 37 #define SWP_MIGRATION_WRITE (MAX_SWAPFILES + 1)
38 #endif 38 #endif
39 39
40 /* 40 /*
41 * Magic header for a swap area. The first part of the union is 41 * Magic header for a swap area. The first part of the union is
42 * what the swap magic looks like for the old (limited to 128MB) 42 * what the swap magic looks like for the old (limited to 128MB)
43 * swap area format, the second part of the union adds - in the 43 * swap area format, the second part of the union adds - in the
44 * old reserved area - some extra information. Note that the first 44 * old reserved area - some extra information. Note that the first
45 * kilobyte is reserved for boot loader or disk label stuff... 45 * kilobyte is reserved for boot loader or disk label stuff...
46 * 46 *
47 * Having the magic at the end of the PAGE_SIZE makes detecting swap 47 * Having the magic at the end of the PAGE_SIZE makes detecting swap
48 * areas somewhat tricky on machines that support multiple page sizes. 48 * areas somewhat tricky on machines that support multiple page sizes.
49 * For 2.5 we'll probably want to move the magic to just beyond the 49 * For 2.5 we'll probably want to move the magic to just beyond the
50 * bootbits... 50 * bootbits...
51 */ 51 */
52 union swap_header { 52 union swap_header {
53 struct { 53 struct {
54 char reserved[PAGE_SIZE - 10]; 54 char reserved[PAGE_SIZE - 10];
55 char magic[10]; /* SWAP-SPACE or SWAPSPACE2 */ 55 char magic[10]; /* SWAP-SPACE or SWAPSPACE2 */
56 } magic; 56 } magic;
57 struct { 57 struct {
58 char bootbits[1024]; /* Space for disklabel etc. */ 58 char bootbits[1024]; /* Space for disklabel etc. */
59 __u32 version; 59 __u32 version;
60 __u32 last_page; 60 __u32 last_page;
61 __u32 nr_badpages; 61 __u32 nr_badpages;
62 unsigned char sws_uuid[16]; 62 unsigned char sws_uuid[16];
63 unsigned char sws_volume[16]; 63 unsigned char sws_volume[16];
64 __u32 padding[117]; 64 __u32 padding[117];
65 __u32 badpages[1]; 65 __u32 badpages[1];
66 } info; 66 } info;
67 }; 67 };
68 68
69 /* A swap entry has to fit into a "unsigned long", as 69 /* A swap entry has to fit into a "unsigned long", as
70 * the entry is hidden in the "index" field of the 70 * the entry is hidden in the "index" field of the
71 * swapper address space. 71 * swapper address space.
72 */ 72 */
73 typedef struct { 73 typedef struct {
74 unsigned long val; 74 unsigned long val;
75 } swp_entry_t; 75 } swp_entry_t;
76 76
77 /* 77 /*
78 * current->reclaim_state points to one of these when a task is running 78 * current->reclaim_state points to one of these when a task is running
79 * memory reclaim 79 * memory reclaim
80 */ 80 */
81 struct reclaim_state { 81 struct reclaim_state {
82 unsigned long reclaimed_slab; 82 unsigned long reclaimed_slab;
83 }; 83 };
84 84
85 #ifdef __KERNEL__ 85 #ifdef __KERNEL__
86 86
87 struct address_space; 87 struct address_space;
88 struct sysinfo; 88 struct sysinfo;
89 struct writeback_control; 89 struct writeback_control;
90 struct zone; 90 struct zone;
91 91
92 /* 92 /*
93 * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of 93 * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of
94 * disk blocks. A list of swap extents maps the entire swapfile. (Where the 94 * disk blocks. A list of swap extents maps the entire swapfile. (Where the
95 * term `swapfile' refers to either a blockdevice or an IS_REG file. Apart 95 * term `swapfile' refers to either a blockdevice or an IS_REG file. Apart
96 * from setup, they're handled identically. 96 * from setup, they're handled identically.
97 * 97 *
98 * We always assume that blocks are of size PAGE_SIZE. 98 * We always assume that blocks are of size PAGE_SIZE.
99 */ 99 */
100 struct swap_extent { 100 struct swap_extent {
101 struct list_head list; 101 struct list_head list;
102 pgoff_t start_page; 102 pgoff_t start_page;
103 pgoff_t nr_pages; 103 pgoff_t nr_pages;
104 sector_t start_block; 104 sector_t start_block;
105 }; 105 };
106 106
107 /* 107 /*
108 * Max bad pages in the new format.. 108 * Max bad pages in the new format..
109 */ 109 */
110 #define __swapoffset(x) ((unsigned long)&((union swap_header *)0)->x) 110 #define __swapoffset(x) ((unsigned long)&((union swap_header *)0)->x)
111 #define MAX_SWAP_BADPAGES \ 111 #define MAX_SWAP_BADPAGES \
112 ((__swapoffset(magic.magic) - __swapoffset(info.badpages)) / sizeof(int)) 112 ((__swapoffset(magic.magic) - __swapoffset(info.badpages)) / sizeof(int))
113 113
114 enum { 114 enum {
115 SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ 115 SWP_USED = (1 << 0), /* is slot in swap_info[] used? */
116 SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ 116 SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */
117 SWP_ACTIVE = (SWP_USED | SWP_WRITEOK), 117 SWP_ACTIVE = (SWP_USED | SWP_WRITEOK),
118 /* add others here before... */ 118 /* add others here before... */
119 SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ 119 SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */
120 }; 120 };
121 121
122 #define SWAP_CLUSTER_MAX 32 122 #define SWAP_CLUSTER_MAX 32
123 123
124 #define SWAP_MAP_MAX 0x7fff 124 #define SWAP_MAP_MAX 0x7fff
125 #define SWAP_MAP_BAD 0x8000 125 #define SWAP_MAP_BAD 0x8000
126 126
127 /* 127 /*
128 * The in-memory structure used to track swap areas. 128 * The in-memory structure used to track swap areas.
129 */ 129 */
130 struct swap_info_struct { 130 struct swap_info_struct {
131 unsigned int flags; 131 unsigned int flags;
132 int prio; /* swap priority */ 132 int prio; /* swap priority */
133 struct file *swap_file; 133 struct file *swap_file;
134 struct block_device *bdev; 134 struct block_device *bdev;
135 struct list_head extent_list; 135 struct list_head extent_list;
136 struct swap_extent *curr_swap_extent; 136 struct swap_extent *curr_swap_extent;
137 unsigned old_block_size; 137 unsigned old_block_size;
138 unsigned short * swap_map; 138 unsigned short * swap_map;
139 unsigned int lowest_bit; 139 unsigned int lowest_bit;
140 unsigned int highest_bit; 140 unsigned int highest_bit;
141 unsigned int cluster_next; 141 unsigned int cluster_next;
142 unsigned int cluster_nr; 142 unsigned int cluster_nr;
143 unsigned int pages; 143 unsigned int pages;
144 unsigned int max; 144 unsigned int max;
145 unsigned int inuse_pages; 145 unsigned int inuse_pages;
146 int next; /* next entry on swap list */ 146 int next; /* next entry on swap list */
147 }; 147 };
148 148
149 struct swap_list_t { 149 struct swap_list_t {
150 int head; /* head of priority-ordered swapfile list */ 150 int head; /* head of priority-ordered swapfile list */
151 int next; /* swapfile to be used next */ 151 int next; /* swapfile to be used next */
152 }; 152 };
153 153
154 /* Swap 50% full? Release swapcache more aggressively.. */ 154 /* Swap 50% full? Release swapcache more aggressively.. */
155 #define vm_swap_full() (nr_swap_pages*2 < total_swap_pages) 155 #define vm_swap_full() (nr_swap_pages*2 < total_swap_pages)
156 156
157 /* linux/mm/oom_kill.c */ 157 /* linux/mm/oom_kill.c */
158 extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order); 158 extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order);
159 159
160 /* linux/mm/memory.c */ 160 /* linux/mm/memory.c */
161 extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *); 161 extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *);
162 162
163 /* linux/mm/page_alloc.c */ 163 /* linux/mm/page_alloc.c */
164 extern unsigned long totalram_pages; 164 extern unsigned long totalram_pages;
165 extern unsigned long totalhigh_pages; 165 extern unsigned long totalhigh_pages;
166 extern unsigned long totalreserve_pages; 166 extern unsigned long totalreserve_pages;
167 extern long nr_swap_pages; 167 extern long nr_swap_pages;
168 extern unsigned int nr_free_pages(void); 168 extern unsigned int nr_free_pages(void);
169 extern unsigned int nr_free_pages_pgdat(pg_data_t *pgdat); 169 extern unsigned int nr_free_pages_pgdat(pg_data_t *pgdat);
170 extern unsigned int nr_free_buffer_pages(void); 170 extern unsigned int nr_free_buffer_pages(void);
171 extern unsigned int nr_free_pagecache_pages(void); 171 extern unsigned int nr_free_pagecache_pages(void);
172 172
173 /* linux/mm/swap.c */ 173 /* linux/mm/swap.c */
174 extern void FASTCALL(lru_cache_add(struct page *)); 174 extern void FASTCALL(lru_cache_add(struct page *));
175 extern void FASTCALL(lru_cache_add_active(struct page *)); 175 extern void FASTCALL(lru_cache_add_active(struct page *));
176 extern void FASTCALL(activate_page(struct page *)); 176 extern void FASTCALL(activate_page(struct page *));
177 extern void FASTCALL(mark_page_accessed(struct page *)); 177 extern void FASTCALL(mark_page_accessed(struct page *));
178 extern void lru_add_drain(void); 178 extern void lru_add_drain(void);
179 extern int lru_add_drain_all(void); 179 extern int lru_add_drain_all(void);
180 extern int rotate_reclaimable_page(struct page *page); 180 extern int rotate_reclaimable_page(struct page *page);
181 extern void swap_setup(void); 181 extern void swap_setup(void);
182 182
183 /* linux/mm/vmscan.c */ 183 /* linux/mm/vmscan.c */
184 extern unsigned long try_to_free_pages(struct zone **, gfp_t); 184 extern unsigned long try_to_free_pages(struct zone **, gfp_t);
185 extern unsigned long shrink_all_memory(unsigned long nr_pages); 185 extern unsigned long shrink_all_memory(unsigned long nr_pages);
186 extern int vm_swappiness; 186 extern int vm_swappiness;
187 extern int remove_mapping(struct address_space *mapping, struct page *page); 187 extern int remove_mapping(struct address_space *mapping, struct page *page);
188 188
189 /* possible outcome of pageout() */
190 typedef enum {
191 /* failed to write page out, page is locked */
192 PAGE_KEEP,
193 /* move page to the active list, page is locked */
194 PAGE_ACTIVATE,
195 /* page has been sent to the disk successfully, page is unlocked */
196 PAGE_SUCCESS,
197 /* page is clean and locked */
198 PAGE_CLEAN,
199 } pageout_t;
200
201 extern pageout_t pageout(struct page *page, struct address_space *mapping);
202
203 #ifdef CONFIG_NUMA 189 #ifdef CONFIG_NUMA
204 extern int zone_reclaim_mode; 190 extern int zone_reclaim_mode;
205 extern int zone_reclaim_interval; 191 extern int zone_reclaim_interval;
206 extern int zone_reclaim(struct zone *, gfp_t, unsigned int); 192 extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
207 #else 193 #else
208 #define zone_reclaim_mode 0 194 #define zone_reclaim_mode 0
209 static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) 195 static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
210 { 196 {
211 return 0; 197 return 0;
212 } 198 }
213 #endif 199 #endif
214 200
215 #ifdef CONFIG_MMU 201 #ifdef CONFIG_MMU
216 /* linux/mm/shmem.c */ 202 /* linux/mm/shmem.c */
217 extern int shmem_unuse(swp_entry_t entry, struct page *page); 203 extern int shmem_unuse(swp_entry_t entry, struct page *page);
218 #endif /* CONFIG_MMU */ 204 #endif /* CONFIG_MMU */
219 205
220 extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *); 206 extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *);
221 207
222 #ifdef CONFIG_SWAP 208 #ifdef CONFIG_SWAP
223 /* linux/mm/page_io.c */ 209 /* linux/mm/page_io.c */
224 extern int swap_readpage(struct file *, struct page *); 210 extern int swap_readpage(struct file *, struct page *);
225 extern int swap_writepage(struct page *page, struct writeback_control *wbc); 211 extern int swap_writepage(struct page *page, struct writeback_control *wbc);
226 extern int rw_swap_page_sync(int, swp_entry_t, struct page *); 212 extern int rw_swap_page_sync(int, swp_entry_t, struct page *);
227 213
228 /* linux/mm/swap_state.c */ 214 /* linux/mm/swap_state.c */
229 extern struct address_space swapper_space; 215 extern struct address_space swapper_space;
230 #define total_swapcache_pages swapper_space.nrpages 216 #define total_swapcache_pages swapper_space.nrpages
231 extern void show_swap_cache_info(void); 217 extern void show_swap_cache_info(void);
232 extern int add_to_swap(struct page *, gfp_t); 218 extern int add_to_swap(struct page *, gfp_t);
233 extern void __delete_from_swap_cache(struct page *); 219 extern void __delete_from_swap_cache(struct page *);
234 extern void delete_from_swap_cache(struct page *); 220 extern void delete_from_swap_cache(struct page *);
235 extern int move_to_swap_cache(struct page *, swp_entry_t); 221 extern int move_to_swap_cache(struct page *, swp_entry_t);
236 extern int move_from_swap_cache(struct page *, unsigned long, 222 extern int move_from_swap_cache(struct page *, unsigned long,
237 struct address_space *); 223 struct address_space *);
238 extern void free_page_and_swap_cache(struct page *); 224 extern void free_page_and_swap_cache(struct page *);
239 extern void free_pages_and_swap_cache(struct page **, int); 225 extern void free_pages_and_swap_cache(struct page **, int);
240 extern struct page * lookup_swap_cache(swp_entry_t); 226 extern struct page * lookup_swap_cache(swp_entry_t);
241 extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma, 227 extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma,
242 unsigned long addr); 228 unsigned long addr);
243 /* linux/mm/swapfile.c */ 229 /* linux/mm/swapfile.c */
244 extern long total_swap_pages; 230 extern long total_swap_pages;
245 extern unsigned int nr_swapfiles; 231 extern unsigned int nr_swapfiles;
246 extern void si_swapinfo(struct sysinfo *); 232 extern void si_swapinfo(struct sysinfo *);
247 extern swp_entry_t get_swap_page(void); 233 extern swp_entry_t get_swap_page(void);
248 extern swp_entry_t get_swap_page_of_type(int); 234 extern swp_entry_t get_swap_page_of_type(int);
249 extern int swap_duplicate(swp_entry_t); 235 extern int swap_duplicate(swp_entry_t);
250 extern int valid_swaphandles(swp_entry_t, unsigned long *); 236 extern int valid_swaphandles(swp_entry_t, unsigned long *);
251 extern void swap_free(swp_entry_t); 237 extern void swap_free(swp_entry_t);
252 extern void free_swap_and_cache(swp_entry_t); 238 extern void free_swap_and_cache(swp_entry_t);
253 extern int swap_type_of(dev_t); 239 extern int swap_type_of(dev_t);
254 extern unsigned int count_swap_pages(int, int); 240 extern unsigned int count_swap_pages(int, int);
255 extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t); 241 extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t);
256 extern struct swap_info_struct *get_swap_info_struct(unsigned); 242 extern struct swap_info_struct *get_swap_info_struct(unsigned);
257 extern int can_share_swap_page(struct page *); 243 extern int can_share_swap_page(struct page *);
258 extern int remove_exclusive_swap_page(struct page *); 244 extern int remove_exclusive_swap_page(struct page *);
259 struct backing_dev_info; 245 struct backing_dev_info;
260 246
261 extern spinlock_t swap_lock; 247 extern spinlock_t swap_lock;
262 extern int remove_vma_swap(struct vm_area_struct *vma, struct page *page);
263 248
264 /* linux/mm/thrash.c */ 249 /* linux/mm/thrash.c */
265 extern struct mm_struct * swap_token_mm; 250 extern struct mm_struct * swap_token_mm;
266 extern unsigned long swap_token_default_timeout; 251 extern unsigned long swap_token_default_timeout;
267 extern void grab_swap_token(void); 252 extern void grab_swap_token(void);
268 extern void __put_swap_token(struct mm_struct *); 253 extern void __put_swap_token(struct mm_struct *);
269 254
270 static inline int has_swap_token(struct mm_struct *mm) 255 static inline int has_swap_token(struct mm_struct *mm)
271 { 256 {
272 return (mm == swap_token_mm); 257 return (mm == swap_token_mm);
273 } 258 }
274 259
275 static inline void put_swap_token(struct mm_struct *mm) 260 static inline void put_swap_token(struct mm_struct *mm)
276 { 261 {
277 if (has_swap_token(mm)) 262 if (has_swap_token(mm))
278 __put_swap_token(mm); 263 __put_swap_token(mm);
279 } 264 }
280 265
281 static inline void disable_swap_token(void) 266 static inline void disable_swap_token(void)
282 { 267 {
283 put_swap_token(swap_token_mm); 268 put_swap_token(swap_token_mm);
284 } 269 }
285 270
286 #else /* CONFIG_SWAP */ 271 #else /* CONFIG_SWAP */
287 272
288 #define total_swap_pages 0 273 #define total_swap_pages 0
289 #define total_swapcache_pages 0UL 274 #define total_swapcache_pages 0UL
290 275
291 #define si_swapinfo(val) \ 276 #define si_swapinfo(val) \
292 do { (val)->freeswap = (val)->totalswap = 0; } while (0) 277 do { (val)->freeswap = (val)->totalswap = 0; } while (0)
293 /* only sparc can not include linux/pagemap.h in this file 278 /* only sparc can not include linux/pagemap.h in this file
294 * so leave page_cache_release and release_pages undeclared... */ 279 * so leave page_cache_release and release_pages undeclared... */
295 #define free_page_and_swap_cache(page) \ 280 #define free_page_and_swap_cache(page) \
296 page_cache_release(page) 281 page_cache_release(page)
297 #define free_pages_and_swap_cache(pages, nr) \ 282 #define free_pages_and_swap_cache(pages, nr) \
298 release_pages((pages), (nr), 0); 283 release_pages((pages), (nr), 0);
299 284
300 #define show_swap_cache_info() /*NOTHING*/ 285 #define show_swap_cache_info() /*NOTHING*/
301 #define free_swap_and_cache(swp) /*NOTHING*/ 286 #define free_swap_and_cache(swp) /*NOTHING*/
302 #define swap_duplicate(swp) /*NOTHING*/ 287 #define swap_duplicate(swp) /*NOTHING*/
303 #define swap_free(swp) /*NOTHING*/ 288 #define swap_free(swp) /*NOTHING*/
304 #define read_swap_cache_async(swp,vma,addr) NULL 289 #define read_swap_cache_async(swp,vma,addr) NULL
305 #define lookup_swap_cache(swp) NULL 290 #define lookup_swap_cache(swp) NULL
306 #define valid_swaphandles(swp, off) 0 291 #define valid_swaphandles(swp, off) 0
307 #define can_share_swap_page(p) (page_mapcount(p) == 1) 292 #define can_share_swap_page(p) (page_mapcount(p) == 1)
308 #define move_to_swap_cache(p, swp) 1 293 #define move_to_swap_cache(p, swp) 1
309 #define move_from_swap_cache(p, i, m) 1 294 #define move_from_swap_cache(p, i, m) 1
310 #define __delete_from_swap_cache(p) /*NOTHING*/ 295 #define __delete_from_swap_cache(p) /*NOTHING*/
311 #define delete_from_swap_cache(p) /*NOTHING*/ 296 #define delete_from_swap_cache(p) /*NOTHING*/
312 #define swap_token_default_timeout 0 297 #define swap_token_default_timeout 0
313 298
314 static inline int remove_exclusive_swap_page(struct page *p) 299 static inline int remove_exclusive_swap_page(struct page *p)
315 { 300 {
316 return 0; 301 return 0;
317 } 302 }
318 303
319 static inline swp_entry_t get_swap_page(void) 304 static inline swp_entry_t get_swap_page(void)
320 { 305 {
321 swp_entry_t entry; 306 swp_entry_t entry;
322 entry.val = 0; 307 entry.val = 0;
323 return entry; 308 return entry;
324 } 309 }
325 310
326 /* linux/mm/thrash.c */ 311 /* linux/mm/thrash.c */
327 #define put_swap_token(x) do { } while(0) 312 #define put_swap_token(x) do { } while(0)
328 #define grab_swap_token() do { } while(0) 313 #define grab_swap_token() do { } while(0)
329 #define has_swap_token(x) 0 314 #define has_swap_token(x) 0
330 #define disable_swap_token() do { } while(0) 315 #define disable_swap_token() do { } while(0)
331 316
332 #endif /* CONFIG_SWAP */ 317 #endif /* CONFIG_SWAP */
333 #endif /* __KERNEL__*/ 318 #endif /* __KERNEL__*/
334 #endif /* _LINUX_SWAP_H */ 319 #endif /* _LINUX_SWAP_H */
335 320

mm/migrate.c
1 /* 1 /*
2 * Memory Migration functionality - linux/mm/migration.c 2 * Memory Migration functionality - linux/mm/migration.c
3 * 3 *
4 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter 4 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
5 * 5 *
6 * Page migration was first developed in the context of the memory hotplug 6 * Page migration was first developed in the context of the memory hotplug
7 * project. The main authors of the migration code are: 7 * project. The main authors of the migration code are:
8 * 8 *
9 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp> 9 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
10 * Hirokazu Takahashi <taka@valinux.co.jp> 10 * Hirokazu Takahashi <taka@valinux.co.jp>
11 * Dave Hansen <haveblue@us.ibm.com> 11 * Dave Hansen <haveblue@us.ibm.com>
12 * Christoph Lameter <clameter@sgi.com> 12 * Christoph Lameter <clameter@sgi.com>
13 */ 13 */
14 14
15 #include <linux/migrate.h> 15 #include <linux/migrate.h>
16 #include <linux/module.h> 16 #include <linux/module.h>
17 #include <linux/swap.h> 17 #include <linux/swap.h>
18 #include <linux/swapops.h> 18 #include <linux/swapops.h>
19 #include <linux/pagemap.h> 19 #include <linux/pagemap.h>
20 #include <linux/buffer_head.h> 20 #include <linux/buffer_head.h>
21 #include <linux/mm_inline.h> 21 #include <linux/mm_inline.h>
22 #include <linux/pagevec.h> 22 #include <linux/pagevec.h>
23 #include <linux/rmap.h> 23 #include <linux/rmap.h>
24 #include <linux/topology.h> 24 #include <linux/topology.h>
25 #include <linux/cpu.h> 25 #include <linux/cpu.h>
26 #include <linux/cpuset.h> 26 #include <linux/cpuset.h>
27 #include <linux/writeback.h>
27 28
28 #include "internal.h" 29 #include "internal.h"
29 30
30 /* The maximum number of pages to take off the LRU for migration */ 31 /* The maximum number of pages to take off the LRU for migration */
31 #define MIGRATE_CHUNK_SIZE 256 32 #define MIGRATE_CHUNK_SIZE 256
32 33
33 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 34 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
34 35
35 /* 36 /*
36 * Isolate one page from the LRU lists. If successful put it onto 37 * Isolate one page from the LRU lists. If successful put it onto
37 * the indicated list with elevated page count. 38 * the indicated list with elevated page count.
38 * 39 *
39 * Result: 40 * Result:
40 * -EBUSY: page not on LRU list 41 * -EBUSY: page not on LRU list
41 * 0: page removed from LRU list and added to the specified list. 42 * 0: page removed from LRU list and added to the specified list.
42 */ 43 */
43 int isolate_lru_page(struct page *page, struct list_head *pagelist) 44 int isolate_lru_page(struct page *page, struct list_head *pagelist)
44 { 45 {
45 int ret = -EBUSY; 46 int ret = -EBUSY;
46 47
47 if (PageLRU(page)) { 48 if (PageLRU(page)) {
48 struct zone *zone = page_zone(page); 49 struct zone *zone = page_zone(page);
49 50
50 spin_lock_irq(&zone->lru_lock); 51 spin_lock_irq(&zone->lru_lock);
51 if (PageLRU(page)) { 52 if (PageLRU(page)) {
52 ret = 0; 53 ret = 0;
53 get_page(page); 54 get_page(page);
54 ClearPageLRU(page); 55 ClearPageLRU(page);
55 if (PageActive(page)) 56 if (PageActive(page))
56 del_page_from_active_list(zone, page); 57 del_page_from_active_list(zone, page);
57 else 58 else
58 del_page_from_inactive_list(zone, page); 59 del_page_from_inactive_list(zone, page);
59 list_add_tail(&page->lru, pagelist); 60 list_add_tail(&page->lru, pagelist);
60 } 61 }
61 spin_unlock_irq(&zone->lru_lock); 62 spin_unlock_irq(&zone->lru_lock);
62 } 63 }
63 return ret; 64 return ret;
64 } 65 }
65 66
66 /* 67 /*
67 * migrate_prep() needs to be called after we have compiled the list of pages 68 * migrate_prep() needs to be called after we have compiled the list of pages
68 * to be migrated using isolate_lru_page() but before we begin a series of calls 69 * to be migrated using isolate_lru_page() but before we begin a series of calls
69 * to migrate_pages(). 70 * to migrate_pages().
70 */ 71 */
71 int migrate_prep(void) 72 int migrate_prep(void)
72 { 73 {
73 /* 74 /*
74 * Clear the LRU lists so pages can be isolated. 75 * Clear the LRU lists so pages can be isolated.
75 * Note that pages may be moved off the LRU after we have 76 * Note that pages may be moved off the LRU after we have
76 * drained them. Those pages will fail to migrate like other 77 * drained them. Those pages will fail to migrate like other
77 * pages that may be busy. 78 * pages that may be busy.
78 */ 79 */
79 lru_add_drain_all(); 80 lru_add_drain_all();
80 81
81 return 0; 82 return 0;
82 } 83 }
83 84
84 static inline void move_to_lru(struct page *page) 85 static inline void move_to_lru(struct page *page)
85 { 86 {
86 list_del(&page->lru); 87 list_del(&page->lru);
87 if (PageActive(page)) { 88 if (PageActive(page)) {
88 /* 89 /*
89 * lru_cache_add_active checks that 90 * lru_cache_add_active checks that
90 * the PG_active bit is off. 91 * the PG_active bit is off.
91 */ 92 */
92 ClearPageActive(page); 93 ClearPageActive(page);
93 lru_cache_add_active(page); 94 lru_cache_add_active(page);
94 } else { 95 } else {
95 lru_cache_add(page); 96 lru_cache_add(page);
96 } 97 }
97 put_page(page); 98 put_page(page);
98 } 99 }
99 100
100 /* 101 /*
101 * Add isolated pages on the list back to the LRU. 102 * Add isolated pages on the list back to the LRU.
102 * 103 *
103 * returns the number of pages put back. 104 * returns the number of pages put back.
104 */ 105 */
105 int putback_lru_pages(struct list_head *l) 106 int putback_lru_pages(struct list_head *l)
106 { 107 {
107 struct page *page; 108 struct page *page;
108 struct page *page2; 109 struct page *page2;
109 int count = 0; 110 int count = 0;
110 111
111 list_for_each_entry_safe(page, page2, l, lru) { 112 list_for_each_entry_safe(page, page2, l, lru) {
112 move_to_lru(page); 113 move_to_lru(page);
113 count++; 114 count++;
114 } 115 }
115 return count; 116 return count;
116 } 117 }
117 118
118 static inline int is_swap_pte(pte_t pte) 119 static inline int is_swap_pte(pte_t pte)
119 { 120 {
120 return !pte_none(pte) && !pte_present(pte) && !pte_file(pte); 121 return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
121 } 122 }
122 123
123 /* 124 /*
124 * Restore a potential migration pte to a working pte entry 125 * Restore a potential migration pte to a working pte entry
125 */ 126 */
126 static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr, 127 static void remove_migration_pte(struct vm_area_struct *vma,
127 struct page *old, struct page *new) 128 struct page *old, struct page *new)
128 { 129 {
129 struct mm_struct *mm = vma->vm_mm; 130 struct mm_struct *mm = vma->vm_mm;
130 swp_entry_t entry; 131 swp_entry_t entry;
131 pgd_t *pgd; 132 pgd_t *pgd;
132 pud_t *pud; 133 pud_t *pud;
133 pmd_t *pmd; 134 pmd_t *pmd;
134 pte_t *ptep, pte; 135 pte_t *ptep, pte;
135 spinlock_t *ptl; 136 spinlock_t *ptl;
137 unsigned long addr = page_address_in_vma(new, vma);
136 138
139 if (addr == -EFAULT)
140 return;
141
137 pgd = pgd_offset(mm, addr); 142 pgd = pgd_offset(mm, addr);
138 if (!pgd_present(*pgd)) 143 if (!pgd_present(*pgd))
139 return; 144 return;
140 145
141 pud = pud_offset(pgd, addr); 146 pud = pud_offset(pgd, addr);
142 if (!pud_present(*pud)) 147 if (!pud_present(*pud))
143 return; 148 return;
144 149
145 pmd = pmd_offset(pud, addr); 150 pmd = pmd_offset(pud, addr);
146 if (!pmd_present(*pmd)) 151 if (!pmd_present(*pmd))
147 return; 152 return;
148 153
149 ptep = pte_offset_map(pmd, addr); 154 ptep = pte_offset_map(pmd, addr);
150 155
151 if (!is_swap_pte(*ptep)) { 156 if (!is_swap_pte(*ptep)) {
152 pte_unmap(ptep); 157 pte_unmap(ptep);
153 return; 158 return;
154 } 159 }
155 160
156 ptl = pte_lockptr(mm, pmd); 161 ptl = pte_lockptr(mm, pmd);
157 spin_lock(ptl); 162 spin_lock(ptl);
158 pte = *ptep; 163 pte = *ptep;
159 if (!is_swap_pte(pte)) 164 if (!is_swap_pte(pte))
160 goto out; 165 goto out;
161 166
162 entry = pte_to_swp_entry(pte); 167 entry = pte_to_swp_entry(pte);
163 168
164 if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) 169 if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
165 goto out; 170 goto out;
166 171
167 get_page(new); 172 get_page(new);
168 pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); 173 pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
169 if (is_write_migration_entry(entry)) 174 if (is_write_migration_entry(entry))
170 pte = pte_mkwrite(pte); 175 pte = pte_mkwrite(pte);
171 set_pte_at(mm, addr, ptep, pte); 176 set_pte_at(mm, addr, ptep, pte);
172 page_add_anon_rmap(new, vma, addr); 177
178 if (PageAnon(new))
179 page_add_anon_rmap(new, vma, addr);
180 else
181 page_add_file_rmap(new);
182
183 /* No need to invalidate - it was non-present before */
184 update_mmu_cache(vma, addr, pte);
185 lazy_mmu_prot_update(pte);
186
173 out: 187 out:
174 pte_unmap_unlock(ptep, ptl); 188 pte_unmap_unlock(ptep, ptl);
175 } 189 }
176 190
177 /* 191 /*
178 * Get rid of all migration entries and replace them by 192 * Note that remove_file_migration_ptes will only work on regular mappings,
179 * references to the indicated page. 193 * Nonlinear mappings do not use migration entries.
180 * 194 */
195 static void remove_file_migration_ptes(struct page *old, struct page *new)
196 {
197 struct vm_area_struct *vma;
198 struct address_space *mapping = page_mapping(new);
199 struct prio_tree_iter iter;
200 pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
201
202 if (!mapping)
203 return;
204
205 spin_lock(&mapping->i_mmap_lock);
206
207 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
208 remove_migration_pte(vma, old, new);
209
210 spin_unlock(&mapping->i_mmap_lock);
211 }
212
213 /*
181 * Must hold mmap_sem lock on at least one of the vmas containing 214 * Must hold mmap_sem lock on at least one of the vmas containing
182 * the page so that the anon_vma cannot vanish. 215 * the page so that the anon_vma cannot vanish.
183 */ 216 */
184 static void remove_migration_ptes(struct page *old, struct page *new) 217 static void remove_anon_migration_ptes(struct page *old, struct page *new)
185 { 218 {
186 struct anon_vma *anon_vma; 219 struct anon_vma *anon_vma;
187 struct vm_area_struct *vma; 220 struct vm_area_struct *vma;
188 unsigned long mapping; 221 unsigned long mapping;
189 222
190 mapping = (unsigned long)new->mapping; 223 mapping = (unsigned long)new->mapping;
191 224
192 if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) 225 if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
193 return; 226 return;
194 227
195 /* 228 /*
196 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. 229 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
197 */ 230 */
198 anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); 231 anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
199 spin_lock(&anon_vma->lock); 232 spin_lock(&anon_vma->lock);
200 233
201 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) 234 list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
202 remove_migration_pte(vma, page_address_in_vma(new, vma), 235 remove_migration_pte(vma, old, new);
203 old, new);
204 236
205 spin_unlock(&anon_vma->lock); 237 spin_unlock(&anon_vma->lock);
206 } 238 }
207 239
208 /* 240 /*
241 * Get rid of all migration entries and replace them by
242 * references to the indicated page.
243 */
244 static void remove_migration_ptes(struct page *old, struct page *new)
245 {
246 if (PageAnon(new))
247 remove_anon_migration_ptes(old, new);
248 else
249 remove_file_migration_ptes(old, new);
250 }
251
252 /*
209 * Something used the pte of a page under migration. We need to 253 * Something used the pte of a page under migration. We need to
210 * get to the page and wait until migration is finished. 254 * get to the page and wait until migration is finished.
211 * When we return from this function the fault will be retried. 255 * When we return from this function the fault will be retried.
212 * 256 *
213 * This function is called from do_swap_page(). 257 * This function is called from do_swap_page().
214 */ 258 */
215 void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, 259 void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
216 unsigned long address) 260 unsigned long address)
217 { 261 {
218 pte_t *ptep, pte; 262 pte_t *ptep, pte;
219 spinlock_t *ptl; 263 spinlock_t *ptl;
220 swp_entry_t entry; 264 swp_entry_t entry;
221 struct page *page; 265 struct page *page;
222 266
223 ptep = pte_offset_map_lock(mm, pmd, address, &ptl); 267 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
224 pte = *ptep; 268 pte = *ptep;
225 if (!is_swap_pte(pte)) 269 if (!is_swap_pte(pte))
226 goto out; 270 goto out;
227 271
228 entry = pte_to_swp_entry(pte); 272 entry = pte_to_swp_entry(pte);
229 if (!is_migration_entry(entry)) 273 if (!is_migration_entry(entry))
230 goto out; 274 goto out;
231 275
232 page = migration_entry_to_page(entry); 276 page = migration_entry_to_page(entry);
233 277
234 get_page(page); 278 get_page(page);
235 pte_unmap_unlock(ptep, ptl); 279 pte_unmap_unlock(ptep, ptl);
236 wait_on_page_locked(page); 280 wait_on_page_locked(page);
237 put_page(page); 281 put_page(page);
238 return; 282 return;
239 out: 283 out:
240 pte_unmap_unlock(ptep, ptl); 284 pte_unmap_unlock(ptep, ptl);
241 } 285 }
242 286
243 /* 287 /*
244 * Replace the page in the mapping. 288 * Replace the page in the mapping.
245 * 289 *
246 * The number of remaining references must be: 290 * The number of remaining references must be:
247 * 1 for anonymous pages without a mapping 291 * 1 for anonymous pages without a mapping
248 * 2 for pages with a mapping 292 * 2 for pages with a mapping
249 * 3 for pages with a mapping and PagePrivate set. 293 * 3 for pages with a mapping and PagePrivate set.
250 */ 294 */
251 static int migrate_page_move_mapping(struct address_space *mapping, 295 static int migrate_page_move_mapping(struct address_space *mapping,
252 struct page *newpage, struct page *page) 296 struct page *newpage, struct page *page)
253 { 297 {
254 struct page **radix_pointer; 298 struct page **radix_pointer;
255 299
256 if (!mapping) { 300 if (!mapping) {
257 /* Anonymous page */ 301 /* Anonymous page */
258 if (page_count(page) != 1) 302 if (page_count(page) != 1)
259 return -EAGAIN; 303 return -EAGAIN;
260 return 0; 304 return 0;
261 } 305 }
262 306
263 write_lock_irq(&mapping->tree_lock); 307 write_lock_irq(&mapping->tree_lock);
264 308
265 radix_pointer = (struct page **)radix_tree_lookup_slot( 309 radix_pointer = (struct page **)radix_tree_lookup_slot(
266 &mapping->page_tree, 310 &mapping->page_tree,
267 page_index(page)); 311 page_index(page));
268 312
269 if (page_count(page) != 2 + !!PagePrivate(page) || 313 if (page_count(page) != 2 + !!PagePrivate(page) ||
270 *radix_pointer != page) { 314 *radix_pointer != page) {
271 write_unlock_irq(&mapping->tree_lock); 315 write_unlock_irq(&mapping->tree_lock);
272 return -EAGAIN; 316 return -EAGAIN;
273 } 317 }
274 318
275 /* 319 /*
276 * Now we know that no one else is looking at the page. 320 * Now we know that no one else is looking at the page.
277 */ 321 */
278 get_page(newpage); 322 get_page(newpage);
279 #ifdef CONFIG_SWAP 323 #ifdef CONFIG_SWAP
280 if (PageSwapCache(page)) { 324 if (PageSwapCache(page)) {
281 SetPageSwapCache(newpage); 325 SetPageSwapCache(newpage);
282 set_page_private(newpage, page_private(page)); 326 set_page_private(newpage, page_private(page));
283 } 327 }
284 #endif 328 #endif
285 329
286 *radix_pointer = newpage; 330 *radix_pointer = newpage;
287 __put_page(page); 331 __put_page(page);
288 write_unlock_irq(&mapping->tree_lock); 332 write_unlock_irq(&mapping->tree_lock);
289 333
290 return 0; 334 return 0;
291 } 335 }
292 336
293 /* 337 /*
294 * Copy the page to its new location 338 * Copy the page to its new location
295 */ 339 */
296 static void migrate_page_copy(struct page *newpage, struct page *page) 340 static void migrate_page_copy(struct page *newpage, struct page *page)
297 { 341 {
298 copy_highpage(newpage, page); 342 copy_highpage(newpage, page);
299 343
300 if (PageError(page)) 344 if (PageError(page))
301 SetPageError(newpage); 345 SetPageError(newpage);
302 if (PageReferenced(page)) 346 if (PageReferenced(page))
303 SetPageReferenced(newpage); 347 SetPageReferenced(newpage);
304 if (PageUptodate(page)) 348 if (PageUptodate(page))
305 SetPageUptodate(newpage); 349 SetPageUptodate(newpage);
306 if (PageActive(page)) 350 if (PageActive(page))
307 SetPageActive(newpage); 351 SetPageActive(newpage);
308 if (PageChecked(page)) 352 if (PageChecked(page))
309 SetPageChecked(newpage); 353 SetPageChecked(newpage);
310 if (PageMappedToDisk(page)) 354 if (PageMappedToDisk(page))
311 SetPageMappedToDisk(newpage); 355 SetPageMappedToDisk(newpage);
312 356
313 if (PageDirty(page)) { 357 if (PageDirty(page)) {
314 clear_page_dirty_for_io(page); 358 clear_page_dirty_for_io(page);
315 set_page_dirty(newpage); 359 set_page_dirty(newpage);
316 } 360 }
317 361
318 #ifdef CONFIG_SWAP 362 #ifdef CONFIG_SWAP
319 ClearPageSwapCache(page); 363 ClearPageSwapCache(page);
320 #endif 364 #endif
321 ClearPageActive(page); 365 ClearPageActive(page);
322 ClearPagePrivate(page); 366 ClearPagePrivate(page);
323 set_page_private(page, 0); 367 set_page_private(page, 0);
324 page->mapping = NULL; 368 page->mapping = NULL;
325 369
326 /* 370 /*
327 * If any waiters have accumulated on the new page then 371 * If any waiters have accumulated on the new page then
328 * wake them up. 372 * wake them up.
329 */ 373 */
330 if (PageWriteback(newpage)) 374 if (PageWriteback(newpage))
331 end_page_writeback(newpage); 375 end_page_writeback(newpage);
332 } 376 }
333 377
334 /************************************************************ 378 /************************************************************
335 * Migration functions 379 * Migration functions
336 ***********************************************************/ 380 ***********************************************************/
337 381
338 /* Always fail migration. Used for mappings that are not movable */ 382 /* Always fail migration. Used for mappings that are not movable */
339 int fail_migrate_page(struct address_space *mapping, 383 int fail_migrate_page(struct address_space *mapping,
340 struct page *newpage, struct page *page) 384 struct page *newpage, struct page *page)
341 { 385 {
342 return -EIO; 386 return -EIO;
343 } 387 }
344 EXPORT_SYMBOL(fail_migrate_page); 388 EXPORT_SYMBOL(fail_migrate_page);
345 389
346 /* 390 /*
347 * Common logic to directly migrate a single page suitable for 391 * Common logic to directly migrate a single page suitable for
348 * pages that do not use PagePrivate. 392 * pages that do not use PagePrivate.
349 * 393 *
350 * Pages are locked upon entry and exit. 394 * Pages are locked upon entry and exit.
351 */ 395 */
352 int migrate_page(struct address_space *mapping, 396 int migrate_page(struct address_space *mapping,
353 struct page *newpage, struct page *page) 397 struct page *newpage, struct page *page)
354 { 398 {
355 int rc; 399 int rc;
356 400
357 BUG_ON(PageWriteback(page)); /* Writeback must be complete */ 401 BUG_ON(PageWriteback(page)); /* Writeback must be complete */
358 402
359 rc = migrate_page_move_mapping(mapping, newpage, page); 403 rc = migrate_page_move_mapping(mapping, newpage, page);
360 404
361 if (rc) 405 if (rc)
362 return rc; 406 return rc;
363 407
364 migrate_page_copy(newpage, page); 408 migrate_page_copy(newpage, page);
365 return 0; 409 return 0;
366 } 410 }
367 EXPORT_SYMBOL(migrate_page); 411 EXPORT_SYMBOL(migrate_page);
368 412
369 /* 413 /*
370 * Migration function for pages with buffers. This function can only be used 414 * Migration function for pages with buffers. This function can only be used
371 * if the underlying filesystem guarantees that no other references to "page" 415 * if the underlying filesystem guarantees that no other references to "page"
372 * exist. 416 * exist.
373 */ 417 */
374 int buffer_migrate_page(struct address_space *mapping, 418 int buffer_migrate_page(struct address_space *mapping,
375 struct page *newpage, struct page *page) 419 struct page *newpage, struct page *page)
376 { 420 {
377 struct buffer_head *bh, *head; 421 struct buffer_head *bh, *head;
378 int rc; 422 int rc;
379 423
380 if (!page_has_buffers(page)) 424 if (!page_has_buffers(page))
381 return migrate_page(mapping, newpage, page); 425 return migrate_page(mapping, newpage, page);
382 426
383 head = page_buffers(page); 427 head = page_buffers(page);
384 428
385 rc = migrate_page_move_mapping(mapping, newpage, page); 429 rc = migrate_page_move_mapping(mapping, newpage, page);
386 430
387 if (rc) 431 if (rc)
388 return rc; 432 return rc;
389 433
390 bh = head; 434 bh = head;
391 do { 435 do {
392 get_bh(bh); 436 get_bh(bh);
393 lock_buffer(bh); 437 lock_buffer(bh);
394 bh = bh->b_this_page; 438 bh = bh->b_this_page;
395 439
396 } while (bh != head); 440 } while (bh != head);
397 441
398 ClearPagePrivate(page); 442 ClearPagePrivate(page);
399 set_page_private(newpage, page_private(page)); 443 set_page_private(newpage, page_private(page));
400 set_page_private(page, 0); 444 set_page_private(page, 0);
401 put_page(page); 445 put_page(page);
402 get_page(newpage); 446 get_page(newpage);
403 447
404 bh = head; 448 bh = head;
405 do { 449 do {
406 set_bh_page(bh, newpage, bh_offset(bh)); 450 set_bh_page(bh, newpage, bh_offset(bh));
407 bh = bh->b_this_page; 451 bh = bh->b_this_page;
408 452
409 } while (bh != head); 453 } while (bh != head);
410 454
411 SetPagePrivate(newpage); 455 SetPagePrivate(newpage);
412 456
413 migrate_page_copy(newpage, page); 457 migrate_page_copy(newpage, page);
414 458
415 bh = head; 459 bh = head;
416 do { 460 do {
417 unlock_buffer(bh); 461 unlock_buffer(bh);
418 put_bh(bh); 462 put_bh(bh);
419 bh = bh->b_this_page; 463 bh = bh->b_this_page;
420 464
421 } while (bh != head); 465 } while (bh != head);
422 466
423 return 0; 467 return 0;
424 } 468 }
425 EXPORT_SYMBOL(buffer_migrate_page); 469 EXPORT_SYMBOL(buffer_migrate_page);
426 470
427 static int fallback_migrate_page(struct address_space *mapping, 471 /*
428 struct page *newpage, struct page *page) 472 * Writeback a page to clean the dirty state
473 */
474 static int writeout(struct address_space *mapping, struct page *page)
429 { 475 {
476 struct writeback_control wbc = {
477 .sync_mode = WB_SYNC_NONE,
478 .nr_to_write = 1,
479 .range_start = 0,
480 .range_end = LLONG_MAX,
481 .nonblocking = 1,
482 .for_reclaim = 1
483 };
484 int rc;
485
486 if (!mapping->a_ops->writepage)
487 /* No write method for the address space */
488 return -EINVAL;
489
490 if (!clear_page_dirty_for_io(page))
491 /* Someone else already triggered a write */
492 return -EAGAIN;
493
430 /* 494 /*
431 * Default handling if a filesystem does not provide 495 * A dirty page may imply that the underlying filesystem has
432 * a migration function. We can only migrate clean 496 * the page on some queue. So the page must be clean for
433 * pages so try to write out any dirty pages first. 497 * migration. Writeout may mean we loose the lock and the
498 * page state is no longer what we checked for earlier.
499 * At this point we know that the migration attempt cannot
500 * be successful.
434 */ 501 */
435 if (PageDirty(page)) { 502 remove_migration_ptes(page, page);
436 switch (pageout(page, mapping)) {
437 case PAGE_KEEP:
438 case PAGE_ACTIVATE:
439 return -EAGAIN;
440 503
441 case PAGE_SUCCESS: 504 rc = mapping->a_ops->writepage(page, &wbc);
442 /* Relock since we lost the lock */ 505 if (rc < 0)
443 lock_page(page); 506 /* I/O Error writing */
444 /* Must retry since page state may have changed */ 507 return -EIO;
445 return -EAGAIN;
446 508
447 case PAGE_CLEAN: 509 if (rc != AOP_WRITEPAGE_ACTIVATE)
448 ; /* try to migrate the page below */ 510 /* unlocked. Relock */
449 } 511 lock_page(page);
450 } 512
513 return -EAGAIN;
514 }
515
516 /*
517 * Default handling if a filesystem does not provide a migration function.
518 */
519 static int fallback_migrate_page(struct address_space *mapping,
520 struct page *newpage, struct page *page)
521 {
522 if (PageDirty(page))
523 return writeout(mapping, page);
451 524
452 /* 525 /*
453 * Buffers may be managed in a filesystem specific way. 526 * Buffers may be managed in a filesystem specific way.
454 * We must have no buffers or drop them. 527 * We must have no buffers or drop them.
455 */ 528 */
456 if (page_has_buffers(page) && 529 if (page_has_buffers(page) &&
457 !try_to_release_page(page, GFP_KERNEL)) 530 !try_to_release_page(page, GFP_KERNEL))
458 return -EAGAIN; 531 return -EAGAIN;
459 532
460 return migrate_page(mapping, newpage, page); 533 return migrate_page(mapping, newpage, page);
461 } 534 }
462 535
463 /* 536 /*
464 * migrate_pages 537 * migrate_pages
465 * 538 *
466 * Two lists are passed to this function. The first list 539 * Two lists are passed to this function. The first list
467 * contains the pages isolated from the LRU to be migrated. 540 * contains the pages isolated from the LRU to be migrated.
468 * The second list contains new pages that the pages isolated 541 * The second list contains new pages that the pages isolated
469 * can be moved to. 542 * can be moved to.
470 * 543 *
471 * The function returns after 10 attempts or if no pages 544 * The function returns after 10 attempts or if no pages
472 * are movable anymore because to has become empty 545 * are movable anymore because to has become empty
473 * or no retryable pages exist anymore. 546 * or no retryable pages exist anymore.
474 * 547 *
475 * Return: Number of pages not migrated when "to" ran empty. 548 * Return: Number of pages not migrated when "to" ran empty.
476 */ 549 */
477 int migrate_pages(struct list_head *from, struct list_head *to, 550 int migrate_pages(struct list_head *from, struct list_head *to,
478 struct list_head *moved, struct list_head *failed) 551 struct list_head *moved, struct list_head *failed)
479 { 552 {
480 int retry; 553 int retry;
481 int nr_failed = 0; 554 int nr_failed = 0;
482 int pass = 0; 555 int pass = 0;
483 struct page *page; 556 struct page *page;
484 struct page *page2; 557 struct page *page2;
485 int swapwrite = current->flags & PF_SWAPWRITE; 558 int swapwrite = current->flags & PF_SWAPWRITE;
486 int rc; 559 int rc;
487 560
488 if (!swapwrite) 561 if (!swapwrite)
489 current->flags |= PF_SWAPWRITE; 562 current->flags |= PF_SWAPWRITE;
490 563
491 redo: 564 redo:
492 retry = 0; 565 retry = 0;
493 566
494 list_for_each_entry_safe(page, page2, from, lru) { 567 list_for_each_entry_safe(page, page2, from, lru) {
495 struct page *newpage = NULL; 568 struct page *newpage = NULL;
496 struct address_space *mapping; 569 struct address_space *mapping;
497 570
498 cond_resched(); 571 cond_resched();
499 572
500 rc = 0; 573 rc = 0;
501 if (page_count(page) == 1) 574 if (page_count(page) == 1)
502 /* page was freed from under us. So we are done. */ 575 /* page was freed from under us. So we are done. */
503 goto next; 576 goto next;
504 577
505 if (to && list_empty(to)) 578 if (to && list_empty(to))
506 break; 579 break;
507 580
508 /* 581 /*
509 * Skip locked pages during the first two passes to give the 582 * Skip locked pages during the first two passes to give the
510 * functions holding the lock time to release the page. Later we 583 * functions holding the lock time to release the page. Later we
511 * use lock_page() to have a higher chance of acquiring the 584 * use lock_page() to have a higher chance of acquiring the
512 * lock. 585 * lock.
513 */ 586 */
514 rc = -EAGAIN; 587 rc = -EAGAIN;
515 if (pass > 2) 588 if (pass > 2)
516 lock_page(page); 589 lock_page(page);
517 else 590 else
518 if (TestSetPageLocked(page)) 591 if (TestSetPageLocked(page))
519 goto next; 592 goto next;
520 593
521 /* 594 /*
522 * Only wait on writeback if we have already done a pass where 595 * Only wait on writeback if we have already done a pass where
523 * we we may have triggered writeouts for lots of pages. 596 * we we may have triggered writeouts for lots of pages.
524 */ 597 */
525 if (pass > 0) 598 if (pass > 0)
526 wait_on_page_writeback(page); 599 wait_on_page_writeback(page);
527 else 600 else
528 if (PageWriteback(page)) 601 if (PageWriteback(page))
529 goto unlock_page; 602 goto unlock_page;
530 603
531 /* 604 /*
532 * Establish migration ptes or remove ptes 605 * Establish migration ptes or remove ptes
533 */ 606 */
534 rc = -EPERM; 607 rc = -EPERM;
535 if (try_to_unmap(page, 1) == SWAP_FAIL) 608 if (try_to_unmap(page, 1) == SWAP_FAIL)
536 /* A vma has VM_LOCKED set -> permanent failure */ 609 /* A vma has VM_LOCKED set -> permanent failure */
537 goto unlock_page; 610 goto unlock_page;
538 611
539 rc = -EAGAIN; 612 rc = -EAGAIN;
540 if (page_mapped(page)) 613 if (page_mapped(page))
541 goto unlock_page; 614 goto unlock_page;
542 615
543 newpage = lru_to_page(to); 616 newpage = lru_to_page(to);
544 lock_page(newpage); 617 lock_page(newpage);
545 /* Prepare mapping for the new page.*/ 618 /* Prepare mapping for the new page.*/
546 newpage->index = page->index; 619 newpage->index = page->index;
547 newpage->mapping = page->mapping; 620 newpage->mapping = page->mapping;
548 621
549 /* 622 /*
550 * Pages are properly locked and writeback is complete. 623 * Pages are properly locked and writeback is complete.
551 * Try to migrate the page. 624 * Try to migrate the page.
552 */ 625 */
553 mapping = page_mapping(page); 626 mapping = page_mapping(page);
554 if (!mapping) 627 if (!mapping)
555 rc = migrate_page(mapping, newpage, page); 628 rc = migrate_page(mapping, newpage, page);
556 629
557 else if (mapping->a_ops->migratepage) 630 else if (mapping->a_ops->migratepage)
558 /* 631 /*
559 * Most pages have a mapping and most filesystems 632 * Most pages have a mapping and most filesystems
560 * should provide a migration function. Anonymous 633 * should provide a migration function. Anonymous
561 * pages are part of swap space which also has its 634 * pages are part of swap space which also has its
562 * own migration function. This is the most common 635 * own migration function. This is the most common
563 * path for page migration. 636 * path for page migration.
564 */ 637 */
565 rc = mapping->a_ops->migratepage(mapping, 638 rc = mapping->a_ops->migratepage(mapping,
566 newpage, page); 639 newpage, page);
567 else 640 else
568 rc = fallback_migrate_page(mapping, newpage, page); 641 rc = fallback_migrate_page(mapping, newpage, page);
569 642
570 if (!rc) 643 if (!rc)
571 remove_migration_ptes(page, newpage); 644 remove_migration_ptes(page, newpage);
572 645
573 unlock_page(newpage); 646 unlock_page(newpage);
574 647
575 unlock_page: 648 unlock_page:
576 if (rc) 649 if (rc)
577 remove_migration_ptes(page, page); 650 remove_migration_ptes(page, page);
578 651
579 unlock_page(page); 652 unlock_page(page);
580 653
581 next: 654 next:
582 if (rc) { 655 if (rc) {
583 if (newpage) 656 if (newpage)
584 newpage->mapping = NULL; 657 newpage->mapping = NULL;
585 658
586 if (rc == -EAGAIN) 659 if (rc == -EAGAIN)
587 retry++; 660 retry++;
588 else { 661 else {
589 /* Permanent failure */ 662 /* Permanent failure */
590 list_move(&page->lru, failed); 663 list_move(&page->lru, failed);
591 nr_failed++; 664 nr_failed++;
592 } 665 }
593 } else { 666 } else {
594 if (newpage) { 667 if (newpage) {
595 /* Successful migration. Return page to LRU */ 668 /* Successful migration. Return page to LRU */
596 move_to_lru(newpage); 669 move_to_lru(newpage);
597 } 670 }
598 list_move(&page->lru, moved); 671 list_move(&page->lru, moved);
599 } 672 }
600 } 673 }
601 if (retry && pass++ < 10) 674 if (retry && pass++ < 10)
602 goto redo; 675 goto redo;
603 676
604 if (!swapwrite) 677 if (!swapwrite)
605 current->flags &= ~PF_SWAPWRITE; 678 current->flags &= ~PF_SWAPWRITE;
606 679
607 return nr_failed + retry; 680 return nr_failed + retry;
608 } 681 }
609 682
610 /* 683 /*
611 * Migrate the list 'pagelist' of pages to a certain destination. 684 * Migrate the list 'pagelist' of pages to a certain destination.
612 * 685 *
613 * Specify destination with either non-NULL vma or dest_node >= 0 686 * Specify destination with either non-NULL vma or dest_node >= 0
614 * Return the number of pages not migrated or error code 687 * Return the number of pages not migrated or error code
615 */ 688 */
616 int migrate_pages_to(struct list_head *pagelist, 689 int migrate_pages_to(struct list_head *pagelist,
617 struct vm_area_struct *vma, int dest) 690 struct vm_area_struct *vma, int dest)
618 { 691 {
619 LIST_HEAD(newlist); 692 LIST_HEAD(newlist);
620 LIST_HEAD(moved); 693 LIST_HEAD(moved);
621 LIST_HEAD(failed); 694 LIST_HEAD(failed);
622 int err = 0; 695 int err = 0;
623 unsigned long offset = 0; 696 unsigned long offset = 0;
624 int nr_pages; 697 int nr_pages;
625 struct page *page; 698 struct page *page;
626 struct list_head *p; 699 struct list_head *p;
627 700
628 redo: 701 redo:
629 nr_pages = 0; 702 nr_pages = 0;
630 list_for_each(p, pagelist) { 703 list_for_each(p, pagelist) {
631 if (vma) { 704 if (vma) {
632 /* 705 /*
633 * The address passed to alloc_page_vma is used to 706 * The address passed to alloc_page_vma is used to
634 * generate the proper interleave behavior. We fake 707 * generate the proper interleave behavior. We fake
635 * the address here by an increasing offset in order 708 * the address here by an increasing offset in order
636 * to get the proper distribution of pages. 709 * to get the proper distribution of pages.
637 * 710 *
638 * No decision has been made as to which page 711 * No decision has been made as to which page
639 * a certain old page is moved to so we cannot 712 * a certain old page is moved to so we cannot
640 * specify the correct address. 713 * specify the correct address.
641 */ 714 */
642 page = alloc_page_vma(GFP_HIGHUSER, vma, 715 page = alloc_page_vma(GFP_HIGHUSER, vma,
643 offset + vma->vm_start); 716 offset + vma->vm_start);
644 offset += PAGE_SIZE; 717 offset += PAGE_SIZE;
645 } 718 }
646 else 719 else
647 page = alloc_pages_node(dest, GFP_HIGHUSER, 0); 720 page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
648 721
649 if (!page) { 722 if (!page) {
650 err = -ENOMEM; 723 err = -ENOMEM;
651 goto out; 724 goto out;
652 } 725 }
653 list_add_tail(&page->lru, &newlist); 726 list_add_tail(&page->lru, &newlist);
654 nr_pages++; 727 nr_pages++;
655 if (nr_pages > MIGRATE_CHUNK_SIZE) 728 if (nr_pages > MIGRATE_CHUNK_SIZE)
656 break; 729 break;
657 } 730 }
658 err = migrate_pages(pagelist, &newlist, &moved, &failed); 731 err = migrate_pages(pagelist, &newlist, &moved, &failed);
659 732
660 putback_lru_pages(&moved); /* Call release pages instead ?? */ 733 putback_lru_pages(&moved); /* Call release pages instead ?? */
661 734
662 if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) 735 if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
663 goto redo; 736 goto redo;
664 out: 737 out:
665 /* Return leftover allocated pages */ 738 /* Return leftover allocated pages */
666 while (!list_empty(&newlist)) { 739 while (!list_empty(&newlist)) {
667 page = list_entry(newlist.next, struct page, lru); 740 page = list_entry(newlist.next, struct page, lru);
668 list_del(&page->lru); 741 list_del(&page->lru);
669 __free_page(page); 742 __free_page(page);
670 } 743 }
671 list_splice(&failed, pagelist); 744 list_splice(&failed, pagelist);
672 if (err < 0) 745 if (err < 0)
673 return err; 746 return err;
674 747
675 /* Calculate number of leftover pages */ 748 /* Calculate number of leftover pages */
mm/rmap.c
1 /* 1 /*
2 * mm/rmap.c - physical to virtual reverse mappings 2 * mm/rmap.c - physical to virtual reverse mappings
3 * 3 *
4 * Copyright 2001, Rik van Riel <riel@conectiva.com.br> 4 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
5 * Released under the General Public License (GPL). 5 * Released under the General Public License (GPL).
6 * 6 *
7 * Simple, low overhead reverse mapping scheme. 7 * Simple, low overhead reverse mapping scheme.
8 * Please try to keep this thing as modular as possible. 8 * Please try to keep this thing as modular as possible.
9 * 9 *
10 * Provides methods for unmapping each kind of mapped page: 10 * Provides methods for unmapping each kind of mapped page:
11 * the anon methods track anonymous pages, and 11 * the anon methods track anonymous pages, and
12 * the file methods track pages belonging to an inode. 12 * the file methods track pages belonging to an inode.
13 * 13 *
14 * Original design by Rik van Riel <riel@conectiva.com.br> 2001 14 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
15 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004 15 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
16 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004 16 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
17 * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004 17 * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
18 */ 18 */
19 19
20 /* 20 /*
21 * Lock ordering in mm: 21 * Lock ordering in mm:
22 * 22 *
23 * inode->i_mutex (while writing or truncating, not reading or faulting) 23 * inode->i_mutex (while writing or truncating, not reading or faulting)
24 * inode->i_alloc_sem 24 * inode->i_alloc_sem
25 * 25 *
26 * When a page fault occurs in writing from user to file, down_read 26 * When a page fault occurs in writing from user to file, down_read
27 * of mmap_sem nests within i_mutex; in sys_msync, i_mutex nests within 27 * of mmap_sem nests within i_mutex; in sys_msync, i_mutex nests within
28 * down_read of mmap_sem; i_mutex and down_write of mmap_sem are never 28 * down_read of mmap_sem; i_mutex and down_write of mmap_sem are never
29 * taken together; in truncation, i_mutex is taken outermost. 29 * taken together; in truncation, i_mutex is taken outermost.
30 * 30 *
31 * mm->mmap_sem 31 * mm->mmap_sem
32 * page->flags PG_locked (lock_page) 32 * page->flags PG_locked (lock_page)
33 * mapping->i_mmap_lock 33 * mapping->i_mmap_lock
34 * anon_vma->lock 34 * anon_vma->lock
35 * mm->page_table_lock or pte_lock 35 * mm->page_table_lock or pte_lock
36 * zone->lru_lock (in mark_page_accessed, isolate_lru_page) 36 * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
37 * swap_lock (in swap_duplicate, swap_info_get) 37 * swap_lock (in swap_duplicate, swap_info_get)
38 * mmlist_lock (in mmput, drain_mmlist and others) 38 * mmlist_lock (in mmput, drain_mmlist and others)
39 * mapping->private_lock (in __set_page_dirty_buffers) 39 * mapping->private_lock (in __set_page_dirty_buffers)
40 * inode_lock (in set_page_dirty's __mark_inode_dirty) 40 * inode_lock (in set_page_dirty's __mark_inode_dirty)
41 * sb_lock (within inode_lock in fs/fs-writeback.c) 41 * sb_lock (within inode_lock in fs/fs-writeback.c)
42 * mapping->tree_lock (widely used, in set_page_dirty, 42 * mapping->tree_lock (widely used, in set_page_dirty,
43 * in arch-dependent flush_dcache_mmap_lock, 43 * in arch-dependent flush_dcache_mmap_lock,
44 * within inode_lock in __sync_single_inode) 44 * within inode_lock in __sync_single_inode)
45 */ 45 */
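For orientation, here is a minimal sketch of honoring this nesting when touching a file page's ptes, assuming the caller already holds the page lock. It mirrors page_referenced_file() and page_check_address() further down and is not part of the patch.

/*
 * Illustration only (not in the patch): take mapping->i_mmap_lock and
 * then the pte lock, in the order documented above, and release them
 * in reverse order.
 */
static void lock_ordering_sketch(struct page *page, struct mm_struct *mm,
                                 pmd_t *pmd, unsigned long address)
{
        struct address_space *mapping = page->mapping;
        spinlock_t *ptl;
        pte_t *pte;

        BUG_ON(!PageLocked(page));              /* page->flags PG_locked */
        spin_lock(&mapping->i_mmap_lock);       /* mapping->i_mmap_lock */
        pte = pte_offset_map_lock(mm, pmd, address, &ptl);      /* pte lock */
        /* ... inspect or modify the pte here ... */
        pte_unmap_unlock(pte, ptl);
        spin_unlock(&mapping->i_mmap_lock);
}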
46 46
47 #include <linux/mm.h> 47 #include <linux/mm.h>
48 #include <linux/pagemap.h> 48 #include <linux/pagemap.h>
49 #include <linux/swap.h> 49 #include <linux/swap.h>
50 #include <linux/swapops.h> 50 #include <linux/swapops.h>
51 #include <linux/slab.h> 51 #include <linux/slab.h>
52 #include <linux/init.h> 52 #include <linux/init.h>
53 #include <linux/rmap.h> 53 #include <linux/rmap.h>
54 #include <linux/rcupdate.h> 54 #include <linux/rcupdate.h>
55 #include <linux/module.h> 55 #include <linux/module.h>
56 56
57 #include <asm/tlbflush.h> 57 #include <asm/tlbflush.h>
58 58
59 struct kmem_cache *anon_vma_cachep; 59 struct kmem_cache *anon_vma_cachep;
60 60
61 static inline void validate_anon_vma(struct vm_area_struct *find_vma) 61 static inline void validate_anon_vma(struct vm_area_struct *find_vma)
62 { 62 {
63 #ifdef CONFIG_DEBUG_VM 63 #ifdef CONFIG_DEBUG_VM
64 struct anon_vma *anon_vma = find_vma->anon_vma; 64 struct anon_vma *anon_vma = find_vma->anon_vma;
65 struct vm_area_struct *vma; 65 struct vm_area_struct *vma;
66 unsigned int mapcount = 0; 66 unsigned int mapcount = 0;
67 int found = 0; 67 int found = 0;
68 68
69 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 69 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
70 mapcount++; 70 mapcount++;
71 BUG_ON(mapcount > 100000); 71 BUG_ON(mapcount > 100000);
72 if (vma == find_vma) 72 if (vma == find_vma)
73 found = 1; 73 found = 1;
74 } 74 }
75 BUG_ON(!found); 75 BUG_ON(!found);
76 #endif 76 #endif
77 } 77 }
78 78
79 /* This must be called under the mmap_sem. */ 79 /* This must be called under the mmap_sem. */
80 int anon_vma_prepare(struct vm_area_struct *vma) 80 int anon_vma_prepare(struct vm_area_struct *vma)
81 { 81 {
82 struct anon_vma *anon_vma = vma->anon_vma; 82 struct anon_vma *anon_vma = vma->anon_vma;
83 83
84 might_sleep(); 84 might_sleep();
85 if (unlikely(!anon_vma)) { 85 if (unlikely(!anon_vma)) {
86 struct mm_struct *mm = vma->vm_mm; 86 struct mm_struct *mm = vma->vm_mm;
87 struct anon_vma *allocated, *locked; 87 struct anon_vma *allocated, *locked;
88 88
89 anon_vma = find_mergeable_anon_vma(vma); 89 anon_vma = find_mergeable_anon_vma(vma);
90 if (anon_vma) { 90 if (anon_vma) {
91 allocated = NULL; 91 allocated = NULL;
92 locked = anon_vma; 92 locked = anon_vma;
93 spin_lock(&locked->lock); 93 spin_lock(&locked->lock);
94 } else { 94 } else {
95 anon_vma = anon_vma_alloc(); 95 anon_vma = anon_vma_alloc();
96 if (unlikely(!anon_vma)) 96 if (unlikely(!anon_vma))
97 return -ENOMEM; 97 return -ENOMEM;
98 allocated = anon_vma; 98 allocated = anon_vma;
99 locked = NULL; 99 locked = NULL;
100 } 100 }
101 101
102 /* page_table_lock to protect against threads */ 102 /* page_table_lock to protect against threads */
103 spin_lock(&mm->page_table_lock); 103 spin_lock(&mm->page_table_lock);
104 if (likely(!vma->anon_vma)) { 104 if (likely(!vma->anon_vma)) {
105 vma->anon_vma = anon_vma; 105 vma->anon_vma = anon_vma;
106 list_add_tail(&vma->anon_vma_node, &anon_vma->head); 106 list_add_tail(&vma->anon_vma_node, &anon_vma->head);
107 allocated = NULL; 107 allocated = NULL;
108 } 108 }
109 spin_unlock(&mm->page_table_lock); 109 spin_unlock(&mm->page_table_lock);
110 110
111 if (locked) 111 if (locked)
112 spin_unlock(&locked->lock); 112 spin_unlock(&locked->lock);
113 if (unlikely(allocated)) 113 if (unlikely(allocated))
114 anon_vma_free(allocated); 114 anon_vma_free(allocated);
115 } 115 }
116 return 0; 116 return 0;
117 } 117 }
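A caller-side sketch of why this must run before any anon rmap is added, loosely modeled on do_anonymous_page(); error handling and pte locking are elided, the function name is made up, and this is not part of the patch.

static int anon_fault_sketch(struct mm_struct *mm, struct vm_area_struct *vma,
                             unsigned long address, pte_t *page_table)
{
        struct page *page;

        if (unlikely(anon_vma_prepare(vma)))    /* may allocate, may sleep */
                return VM_FAULT_OOM;

        page = alloc_zeroed_user_highpage(vma, address);
        if (!page)
                return VM_FAULT_OOM;

        inc_mm_counter(mm, anon_rss);
        page_add_new_anon_rmap(page, vma, address);     /* defined further down */
        set_pte_at(mm, address, page_table,
                   pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))));
        lru_cache_add_active(page);
        return VM_FAULT_MINOR;
}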
118 118
119 void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) 119 void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
120 { 120 {
121 BUG_ON(vma->anon_vma != next->anon_vma); 121 BUG_ON(vma->anon_vma != next->anon_vma);
122 list_del(&next->anon_vma_node); 122 list_del(&next->anon_vma_node);
123 } 123 }
124 124
125 void __anon_vma_link(struct vm_area_struct *vma) 125 void __anon_vma_link(struct vm_area_struct *vma)
126 { 126 {
127 struct anon_vma *anon_vma = vma->anon_vma; 127 struct anon_vma *anon_vma = vma->anon_vma;
128 128
129 if (anon_vma) { 129 if (anon_vma) {
130 list_add_tail(&vma->anon_vma_node, &anon_vma->head); 130 list_add_tail(&vma->anon_vma_node, &anon_vma->head);
131 validate_anon_vma(vma); 131 validate_anon_vma(vma);
132 } 132 }
133 } 133 }
134 134
135 void anon_vma_link(struct vm_area_struct *vma) 135 void anon_vma_link(struct vm_area_struct *vma)
136 { 136 {
137 struct anon_vma *anon_vma = vma->anon_vma; 137 struct anon_vma *anon_vma = vma->anon_vma;
138 138
139 if (anon_vma) { 139 if (anon_vma) {
140 spin_lock(&anon_vma->lock); 140 spin_lock(&anon_vma->lock);
141 list_add_tail(&vma->anon_vma_node, &anon_vma->head); 141 list_add_tail(&vma->anon_vma_node, &anon_vma->head);
142 validate_anon_vma(vma); 142 validate_anon_vma(vma);
143 spin_unlock(&anon_vma->lock); 143 spin_unlock(&anon_vma->lock);
144 } 144 }
145 } 145 }
146 146
147 void anon_vma_unlink(struct vm_area_struct *vma) 147 void anon_vma_unlink(struct vm_area_struct *vma)
148 { 148 {
149 struct anon_vma *anon_vma = vma->anon_vma; 149 struct anon_vma *anon_vma = vma->anon_vma;
150 int empty; 150 int empty;
151 151
152 if (!anon_vma) 152 if (!anon_vma)
153 return; 153 return;
154 154
155 spin_lock(&anon_vma->lock); 155 spin_lock(&anon_vma->lock);
156 validate_anon_vma(vma); 156 validate_anon_vma(vma);
157 list_del(&vma->anon_vma_node); 157 list_del(&vma->anon_vma_node);
158 158
159 /* We must garbage collect the anon_vma if it's empty */ 159 /* We must garbage collect the anon_vma if it's empty */
160 empty = list_empty(&anon_vma->head); 160 empty = list_empty(&anon_vma->head);
161 spin_unlock(&anon_vma->lock); 161 spin_unlock(&anon_vma->lock);
162 162
163 if (empty) 163 if (empty)
164 anon_vma_free(anon_vma); 164 anon_vma_free(anon_vma);
165 } 165 }
166 166
167 static void anon_vma_ctor(void *data, struct kmem_cache *cachep, 167 static void anon_vma_ctor(void *data, struct kmem_cache *cachep,
168 unsigned long flags) 168 unsigned long flags)
169 { 169 {
170 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 170 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
171 SLAB_CTOR_CONSTRUCTOR) { 171 SLAB_CTOR_CONSTRUCTOR) {
172 struct anon_vma *anon_vma = data; 172 struct anon_vma *anon_vma = data;
173 173
174 spin_lock_init(&anon_vma->lock); 174 spin_lock_init(&anon_vma->lock);
175 INIT_LIST_HEAD(&anon_vma->head); 175 INIT_LIST_HEAD(&anon_vma->head);
176 } 176 }
177 } 177 }
178 178
179 void __init anon_vma_init(void) 179 void __init anon_vma_init(void)
180 { 180 {
181 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), 181 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
182 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL); 182 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL);
183 } 183 }
184 184
185 /* 185 /*
186 * Getting a lock on a stable anon_vma from a page off the LRU is 186 * Getting a lock on a stable anon_vma from a page off the LRU is
187 * tricky: page_lock_anon_vma relies on RCU to guard against the races. 187 * tricky: page_lock_anon_vma relies on RCU to guard against the races.
188 */ 188 */
189 static struct anon_vma *page_lock_anon_vma(struct page *page) 189 static struct anon_vma *page_lock_anon_vma(struct page *page)
190 { 190 {
191 struct anon_vma *anon_vma = NULL; 191 struct anon_vma *anon_vma = NULL;
192 unsigned long anon_mapping; 192 unsigned long anon_mapping;
193 193
194 rcu_read_lock(); 194 rcu_read_lock();
195 anon_mapping = (unsigned long) page->mapping; 195 anon_mapping = (unsigned long) page->mapping;
196 if (!(anon_mapping & PAGE_MAPPING_ANON)) 196 if (!(anon_mapping & PAGE_MAPPING_ANON))
197 goto out; 197 goto out;
198 if (!page_mapped(page)) 198 if (!page_mapped(page))
199 goto out; 199 goto out;
200 200
201 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 201 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
202 spin_lock(&anon_vma->lock); 202 spin_lock(&anon_vma->lock);
203 out: 203 out:
204 rcu_read_unlock(); 204 rcu_read_unlock();
205 return anon_vma; 205 return anon_vma;
206 } 206 }
207 207
208 /* 208 /*
209 * At what user virtual address is page expected in vma? 209 * At what user virtual address is page expected in vma?
210 */ 210 */
211 static inline unsigned long 211 static inline unsigned long
212 vma_address(struct page *page, struct vm_area_struct *vma) 212 vma_address(struct page *page, struct vm_area_struct *vma)
213 { 213 {
214 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 214 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
215 unsigned long address; 215 unsigned long address;
216 216
217 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 217 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
218 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { 218 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
219 /* page should be within any vma from prio_tree_next */ 219 /* page should be within any vma from prio_tree_next */
220 BUG_ON(!PageAnon(page)); 220 BUG_ON(!PageAnon(page));
221 return -EFAULT; 221 return -EFAULT;
222 } 222 }
223 return address; 223 return address;
224 } 224 }
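A quick worked example of the arithmetic above, with hypothetical values and 4 KiB pages (PAGE_SHIFT == PAGE_CACHE_SHIFT == 12, so the first shift is a no-op):

/*
 *   vma->vm_start = 0x40000000, vma->vm_pgoff = 0x100, page->index = 0x180
 *   address       = 0x40000000 + ((0x180 - 0x100) << 12) = 0x40080000
 *
 * which is returned as long as it falls inside [vm_start, vm_end);
 * otherwise -EFAULT comes back, and the BUG_ON only tolerates the
 * out-of-range case for anonymous pages.
 */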
225 225
226 /* 226 /*
227 * At what user virtual address is page expected in vma? checking that the 227 * At what user virtual address is page expected in vma? checking that the
228 * page matches the vma: currently only used on anon pages, by unuse_vma; 228 * page matches the vma: currently only used on anon pages, by unuse_vma;
229 */ 229 */
230 unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) 230 unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
231 { 231 {
232 if (PageAnon(page)) { 232 if (PageAnon(page)) {
233 if ((void *)vma->anon_vma != 233 if ((void *)vma->anon_vma !=
234 (void *)page->mapping - PAGE_MAPPING_ANON) 234 (void *)page->mapping - PAGE_MAPPING_ANON)
235 return -EFAULT; 235 return -EFAULT;
236 } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { 236 } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
237 if (!vma->vm_file || 237 if (!vma->vm_file ||
238 vma->vm_file->f_mapping != page->mapping) 238 vma->vm_file->f_mapping != page->mapping)
239 return -EFAULT; 239 return -EFAULT;
240 } else 240 } else
241 return -EFAULT; 241 return -EFAULT;
242 return vma_address(page, vma); 242 return vma_address(page, vma);
243 } 243 }
244 244
245 /* 245 /*
246 * Check that @page is mapped at @address into @mm. 246 * Check that @page is mapped at @address into @mm.
247 * 247 *
248 * On success returns with pte mapped and locked. 248 * On success returns with pte mapped and locked.
249 */ 249 */
250 pte_t *page_check_address(struct page *page, struct mm_struct *mm, 250 pte_t *page_check_address(struct page *page, struct mm_struct *mm,
251 unsigned long address, spinlock_t **ptlp) 251 unsigned long address, spinlock_t **ptlp)
252 { 252 {
253 pgd_t *pgd; 253 pgd_t *pgd;
254 pud_t *pud; 254 pud_t *pud;
255 pmd_t *pmd; 255 pmd_t *pmd;
256 pte_t *pte; 256 pte_t *pte;
257 spinlock_t *ptl; 257 spinlock_t *ptl;
258 258
259 pgd = pgd_offset(mm, address); 259 pgd = pgd_offset(mm, address);
260 if (!pgd_present(*pgd)) 260 if (!pgd_present(*pgd))
261 return NULL; 261 return NULL;
262 262
263 pud = pud_offset(pgd, address); 263 pud = pud_offset(pgd, address);
264 if (!pud_present(*pud)) 264 if (!pud_present(*pud))
265 return NULL; 265 return NULL;
266 266
267 pmd = pmd_offset(pud, address); 267 pmd = pmd_offset(pud, address);
268 if (!pmd_present(*pmd)) 268 if (!pmd_present(*pmd))
269 return NULL; 269 return NULL;
270 270
271 pte = pte_offset_map(pmd, address); 271 pte = pte_offset_map(pmd, address);
272 /* Make a quick check before getting the lock */ 272 /* Make a quick check before getting the lock */
273 if (!pte_present(*pte)) { 273 if (!pte_present(*pte)) {
274 pte_unmap(pte); 274 pte_unmap(pte);
275 return NULL; 275 return NULL;
276 } 276 }
277 277
278 ptl = pte_lockptr(mm, pmd); 278 ptl = pte_lockptr(mm, pmd);
279 spin_lock(ptl); 279 spin_lock(ptl);
280 if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { 280 if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
281 *ptlp = ptl; 281 *ptlp = ptl;
282 return pte; 282 return pte;
283 } 283 }
284 pte_unmap_unlock(pte, ptl); 284 pte_unmap_unlock(pte, ptl);
285 return NULL; 285 return NULL;
286 } 286 }
287 287
288 /* 288 /*
289 * Subfunctions of page_referenced: page_referenced_one called 289 * Subfunctions of page_referenced: page_referenced_one called
290 * repeatedly from either page_referenced_anon or page_referenced_file. 290 * repeatedly from either page_referenced_anon or page_referenced_file.
291 */ 291 */
292 static int page_referenced_one(struct page *page, 292 static int page_referenced_one(struct page *page,
293 struct vm_area_struct *vma, unsigned int *mapcount) 293 struct vm_area_struct *vma, unsigned int *mapcount)
294 { 294 {
295 struct mm_struct *mm = vma->vm_mm; 295 struct mm_struct *mm = vma->vm_mm;
296 unsigned long address; 296 unsigned long address;
297 pte_t *pte; 297 pte_t *pte;
298 spinlock_t *ptl; 298 spinlock_t *ptl;
299 int referenced = 0; 299 int referenced = 0;
300 300
301 address = vma_address(page, vma); 301 address = vma_address(page, vma);
302 if (address == -EFAULT) 302 if (address == -EFAULT)
303 goto out; 303 goto out;
304 304
305 pte = page_check_address(page, mm, address, &ptl); 305 pte = page_check_address(page, mm, address, &ptl);
306 if (!pte) 306 if (!pte)
307 goto out; 307 goto out;
308 308
309 if (ptep_clear_flush_young(vma, address, pte)) 309 if (ptep_clear_flush_young(vma, address, pte))
310 referenced++; 310 referenced++;
311 311
312 /* Pretend the page is referenced if the task has the 312 /* Pretend the page is referenced if the task has the
313 swap token and is in the middle of a page fault. */ 313 swap token and is in the middle of a page fault. */
314 if (mm != current->mm && has_swap_token(mm) && 314 if (mm != current->mm && has_swap_token(mm) &&
315 rwsem_is_locked(&mm->mmap_sem)) 315 rwsem_is_locked(&mm->mmap_sem))
316 referenced++; 316 referenced++;
317 317
318 (*mapcount)--; 318 (*mapcount)--;
319 pte_unmap_unlock(pte, ptl); 319 pte_unmap_unlock(pte, ptl);
320 out: 320 out:
321 return referenced; 321 return referenced;
322 } 322 }
323 323
324 static int page_referenced_anon(struct page *page) 324 static int page_referenced_anon(struct page *page)
325 { 325 {
326 unsigned int mapcount; 326 unsigned int mapcount;
327 struct anon_vma *anon_vma; 327 struct anon_vma *anon_vma;
328 struct vm_area_struct *vma; 328 struct vm_area_struct *vma;
329 int referenced = 0; 329 int referenced = 0;
330 330
331 anon_vma = page_lock_anon_vma(page); 331 anon_vma = page_lock_anon_vma(page);
332 if (!anon_vma) 332 if (!anon_vma)
333 return referenced; 333 return referenced;
334 334
335 mapcount = page_mapcount(page); 335 mapcount = page_mapcount(page);
336 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 336 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
337 referenced += page_referenced_one(page, vma, &mapcount); 337 referenced += page_referenced_one(page, vma, &mapcount);
338 if (!mapcount) 338 if (!mapcount)
339 break; 339 break;
340 } 340 }
341 spin_unlock(&anon_vma->lock); 341 spin_unlock(&anon_vma->lock);
342 return referenced; 342 return referenced;
343 } 343 }
344 344
345 /** 345 /**
346 * page_referenced_file - referenced check for object-based rmap 346 * page_referenced_file - referenced check for object-based rmap
347 * @page: the page we're checking references on. 347 * @page: the page we're checking references on.
348 * 348 *
349 * For an object-based mapped page, find all the places it is mapped and 349 * For an object-based mapped page, find all the places it is mapped and
350 * check/clear the referenced flag. This is done by following the page->mapping 350 * check/clear the referenced flag. This is done by following the page->mapping
351 * pointer, then walking the chain of vmas it holds. It returns the number 351 * pointer, then walking the chain of vmas it holds. It returns the number
352 * of references it found. 352 * of references it found.
353 * 353 *
354 * This function is only called from page_referenced for object-based pages. 354 * This function is only called from page_referenced for object-based pages.
355 */ 355 */
356 static int page_referenced_file(struct page *page) 356 static int page_referenced_file(struct page *page)
357 { 357 {
358 unsigned int mapcount; 358 unsigned int mapcount;
359 struct address_space *mapping = page->mapping; 359 struct address_space *mapping = page->mapping;
360 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 360 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
361 struct vm_area_struct *vma; 361 struct vm_area_struct *vma;
362 struct prio_tree_iter iter; 362 struct prio_tree_iter iter;
363 int referenced = 0; 363 int referenced = 0;
364 364
365 /* 365 /*
366 * The caller's checks on page->mapping and !PageAnon have made 366 * The caller's checks on page->mapping and !PageAnon have made
367 * sure that this is a file page: the check for page->mapping 367 * sure that this is a file page: the check for page->mapping
368 * excludes the case just before it gets set on an anon page. 368 * excludes the case just before it gets set on an anon page.
369 */ 369 */
370 BUG_ON(PageAnon(page)); 370 BUG_ON(PageAnon(page));
371 371
372 /* 372 /*
373 * The page lock not only makes sure that page->mapping cannot 373 * The page lock not only makes sure that page->mapping cannot
374 * suddenly be NULLified by truncation, it makes sure that the 374 * suddenly be NULLified by truncation, it makes sure that the
375 * structure at mapping cannot be freed and reused yet, 375 * structure at mapping cannot be freed and reused yet,
376 * so we can safely take mapping->i_mmap_lock. 376 * so we can safely take mapping->i_mmap_lock.
377 */ 377 */
378 BUG_ON(!PageLocked(page)); 378 BUG_ON(!PageLocked(page));
379 379
380 spin_lock(&mapping->i_mmap_lock); 380 spin_lock(&mapping->i_mmap_lock);
381 381
382 /* 382 /*
383 * i_mmap_lock does not stabilize mapcount at all, but mapcount 383 * i_mmap_lock does not stabilize mapcount at all, but mapcount
384 * is more likely to be accurate if we note it after spinning. 384 * is more likely to be accurate if we note it after spinning.
385 */ 385 */
386 mapcount = page_mapcount(page); 386 mapcount = page_mapcount(page);
387 387
388 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 388 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
389 if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE)) 389 if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
390 == (VM_LOCKED|VM_MAYSHARE)) { 390 == (VM_LOCKED|VM_MAYSHARE)) {
391 referenced++; 391 referenced++;
392 break; 392 break;
393 } 393 }
394 referenced += page_referenced_one(page, vma, &mapcount); 394 referenced += page_referenced_one(page, vma, &mapcount);
395 if (!mapcount) 395 if (!mapcount)
396 break; 396 break;
397 } 397 }
398 398
399 spin_unlock(&mapping->i_mmap_lock); 399 spin_unlock(&mapping->i_mmap_lock);
400 return referenced; 400 return referenced;
401 } 401 }
402 402
403 /** 403 /**
404 * page_referenced - test if the page was referenced 404 * page_referenced - test if the page was referenced
405 * @page: the page to test 405 * @page: the page to test
406 * @is_locked: caller holds lock on the page 406 * @is_locked: caller holds lock on the page
407 * 407 *
408 * Quick test_and_clear_referenced for all mappings to a page, 408 * Quick test_and_clear_referenced for all mappings to a page,
409 * returns the number of ptes which referenced the page. 409 * returns the number of ptes which referenced the page.
410 */ 410 */
411 int page_referenced(struct page *page, int is_locked) 411 int page_referenced(struct page *page, int is_locked)
412 { 412 {
413 int referenced = 0; 413 int referenced = 0;
414 414
415 if (page_test_and_clear_young(page)) 415 if (page_test_and_clear_young(page))
416 referenced++; 416 referenced++;
417 417
418 if (TestClearPageReferenced(page)) 418 if (TestClearPageReferenced(page))
419 referenced++; 419 referenced++;
420 420
421 if (page_mapped(page) && page->mapping) { 421 if (page_mapped(page) && page->mapping) {
422 if (PageAnon(page)) 422 if (PageAnon(page))
423 referenced += page_referenced_anon(page); 423 referenced += page_referenced_anon(page);
424 else if (is_locked) 424 else if (is_locked)
425 referenced += page_referenced_file(page); 425 referenced += page_referenced_file(page);
426 else if (TestSetPageLocked(page)) 426 else if (TestSetPageLocked(page))
427 referenced++; 427 referenced++;
428 else { 428 else {
429 if (page->mapping) 429 if (page->mapping)
430 referenced += page_referenced_file(page); 430 referenced += page_referenced_file(page);
431 unlock_page(page); 431 unlock_page(page);
432 } 432 }
433 } 433 }
434 return referenced; 434 return referenced;
435 } 435 }
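For context, a simplified sketch of how shrink_list()-style reclaim code consumes this count; the helper name is made up and this is not part of the patch.

/* Any recent reference on a still-mapped page argues for keeping it active. */
static int should_activate_sketch(struct page *page, int page_locked)
{
        int referenced = page_referenced(page, page_locked);

        return referenced && page_mapped(page);
}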
436 436
437 /** 437 /**
438 * page_set_anon_rmap - setup new anonymous rmap 438 * page_set_anon_rmap - setup new anonymous rmap
439 * @page: the page to add the mapping to 439 * @page: the page to add the mapping to
440 * @vma: the vm area in which the mapping is added 440 * @vma: the vm area in which the mapping is added
441 * @address: the user virtual address mapped 441 * @address: the user virtual address mapped
442 */ 442 */
443 static void __page_set_anon_rmap(struct page *page, 443 static void __page_set_anon_rmap(struct page *page,
444 struct vm_area_struct *vma, unsigned long address) 444 struct vm_area_struct *vma, unsigned long address)
445 { 445 {
446 struct anon_vma *anon_vma = vma->anon_vma; 446 struct anon_vma *anon_vma = vma->anon_vma;
447 447
448 BUG_ON(!anon_vma); 448 BUG_ON(!anon_vma);
449 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 449 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
450 page->mapping = (struct address_space *) anon_vma; 450 page->mapping = (struct address_space *) anon_vma;
451 451
452 page->index = linear_page_index(vma, address); 452 page->index = linear_page_index(vma, address);
453 453
454 /* 454 /*
455 * nr_mapped state can be updated without turning off 455 * nr_mapped state can be updated without turning off
456 * interrupts because it is not modified via interrupt. 456 * interrupts because it is not modified via interrupt.
457 */ 457 */
458 __inc_page_state(nr_mapped); 458 __inc_page_state(nr_mapped);
459 } 459 }
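The encoding set up here is what page_lock_anon_vma() above undoes; a sketch of the decode step in isolation (illustration only, not part of the patch):

/* The low bit of page->mapping flags an anon_vma rather than an address_space. */
static struct anon_vma *page_anon_vma_sketch(struct page *page)
{
        unsigned long mapping = (unsigned long)page->mapping;

        if (!(mapping & PAGE_MAPPING_ANON))
                return NULL;                    /* file page: a real mapping */
        return (struct anon_vma *)(mapping - PAGE_MAPPING_ANON);
}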
460 460
461 /** 461 /**
462 * page_add_anon_rmap - add pte mapping to an anonymous page 462 * page_add_anon_rmap - add pte mapping to an anonymous page
463 * @page: the page to add the mapping to 463 * @page: the page to add the mapping to
464 * @vma: the vm area in which the mapping is added 464 * @vma: the vm area in which the mapping is added
465 * @address: the user virtual address mapped 465 * @address: the user virtual address mapped
466 * 466 *
467 * The caller needs to hold the pte lock. 467 * The caller needs to hold the pte lock.
468 */ 468 */
469 void page_add_anon_rmap(struct page *page, 469 void page_add_anon_rmap(struct page *page,
470 struct vm_area_struct *vma, unsigned long address) 470 struct vm_area_struct *vma, unsigned long address)
471 { 471 {
472 if (atomic_inc_and_test(&page->_mapcount)) 472 if (atomic_inc_and_test(&page->_mapcount))
473 __page_set_anon_rmap(page, vma, address); 473 __page_set_anon_rmap(page, vma, address);
474 /* else checking page index and mapping is racy */ 474 /* else checking page index and mapping is racy */
475 } 475 }
476 476
477 /* 477 /*
478 * page_add_new_anon_rmap - add pte mapping to a new anonymous page 478 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
479 * @page: the page to add the mapping to 479 * @page: the page to add the mapping to
480 * @vma: the vm area in which the mapping is added 480 * @vma: the vm area in which the mapping is added
481 * @address: the user virtual address mapped 481 * @address: the user virtual address mapped
482 * 482 *
483 * Same as page_add_anon_rmap but must only be called on *new* pages. 483 * Same as page_add_anon_rmap but must only be called on *new* pages.
484 * This means the inc-and-test can be bypassed. 484 * This means the inc-and-test can be bypassed.
485 */ 485 */
486 void page_add_new_anon_rmap(struct page *page, 486 void page_add_new_anon_rmap(struct page *page,
487 struct vm_area_struct *vma, unsigned long address) 487 struct vm_area_struct *vma, unsigned long address)
488 { 488 {
489 atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */ 489 atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
490 __page_set_anon_rmap(page, vma, address); 490 __page_set_anon_rmap(page, vma, address);
491 } 491 }
492 492
493 /** 493 /**
494 * page_add_file_rmap - add pte mapping to a file page 494 * page_add_file_rmap - add pte mapping to a file page
495 * @page: the page to add the mapping to 495 * @page: the page to add the mapping to
496 * 496 *
497 * The caller needs to hold the pte lock. 497 * The caller needs to hold the pte lock.
498 */ 498 */
499 void page_add_file_rmap(struct page *page) 499 void page_add_file_rmap(struct page *page)
500 { 500 {
501 if (atomic_inc_and_test(&page->_mapcount)) 501 if (atomic_inc_and_test(&page->_mapcount))
502 __inc_page_state(nr_mapped); 502 __inc_page_state(nr_mapped);
503 } 503 }
504 504
505 /** 505 /**
506 * page_remove_rmap - take down pte mapping from a page 506 * page_remove_rmap - take down pte mapping from a page
507 * @page: page to remove mapping from 507 * @page: page to remove mapping from
508 * 508 *
509 * The caller needs to hold the pte lock. 509 * The caller needs to hold the pte lock.
510 */ 510 */
511 void page_remove_rmap(struct page *page) 511 void page_remove_rmap(struct page *page)
512 { 512 {
513 if (atomic_add_negative(-1, &page->_mapcount)) { 513 if (atomic_add_negative(-1, &page->_mapcount)) {
514 #ifdef CONFIG_DEBUG_VM 514 #ifdef CONFIG_DEBUG_VM
515 if (unlikely(page_mapcount(page) < 0)) { 515 if (unlikely(page_mapcount(page) < 0)) {
516 printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); 516 printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
517 printk (KERN_EMERG " page->flags = %lx\n", page->flags); 517 printk (KERN_EMERG " page->flags = %lx\n", page->flags);
518 printk (KERN_EMERG " page->count = %x\n", page_count(page)); 518 printk (KERN_EMERG " page->count = %x\n", page_count(page));
519 printk (KERN_EMERG " page->mapping = %p\n", page->mapping); 519 printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
520 } 520 }
521 #endif 521 #endif
522 BUG_ON(page_mapcount(page) < 0); 522 BUG_ON(page_mapcount(page) < 0);
523 /* 523 /*
524 * It would be tidy to reset the PageAnon mapping here, 524 * It would be tidy to reset the PageAnon mapping here,
525 * but that might overwrite a racing page_add_anon_rmap 525 * but that might overwrite a racing page_add_anon_rmap
526 * which increments mapcount after us but sets mapping 526 * which increments mapcount after us but sets mapping
527 * before us: so leave the reset to free_hot_cold_page, 527 * before us: so leave the reset to free_hot_cold_page,
528 * and remember that it's only reliable while mapped. 528 * and remember that it's only reliable while mapped.
529 * Leaving it set also helps swapoff to reinstate ptes 529 * Leaving it set also helps swapoff to reinstate ptes
530 * faster for those pages still in swapcache. 530 * faster for those pages still in swapcache.
531 */ 531 */
532 if (page_test_and_clear_dirty(page)) 532 if (page_test_and_clear_dirty(page))
533 set_page_dirty(page); 533 set_page_dirty(page);
534 __dec_page_state(nr_mapped); 534 __dec_page_state(nr_mapped);
535 } 535 }
536 } 536 }
537 537
538 /* 538 /*
539 * Subfunctions of try_to_unmap: try_to_unmap_one called 539 * Subfunctions of try_to_unmap: try_to_unmap_one called
540 * repeatedly from either try_to_unmap_anon or try_to_unmap_file. 540 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
541 */ 541 */
542 static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 542 static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
543 int migration) 543 int migration)
544 { 544 {
545 struct mm_struct *mm = vma->vm_mm; 545 struct mm_struct *mm = vma->vm_mm;
546 unsigned long address; 546 unsigned long address;
547 pte_t *pte; 547 pte_t *pte;
548 pte_t pteval; 548 pte_t pteval;
549 spinlock_t *ptl; 549 spinlock_t *ptl;
550 int ret = SWAP_AGAIN; 550 int ret = SWAP_AGAIN;
551 551
552 address = vma_address(page, vma); 552 address = vma_address(page, vma);
553 if (address == -EFAULT) 553 if (address == -EFAULT)
554 goto out; 554 goto out;
555 555
556 pte = page_check_address(page, mm, address, &ptl); 556 pte = page_check_address(page, mm, address, &ptl);
557 if (!pte) 557 if (!pte)
558 goto out; 558 goto out;
559 559
560 /* 560 /*
561 * If the page is mlock()d, we cannot swap it out. 561 * If the page is mlock()d, we cannot swap it out.
562 * If it's recently referenced (perhaps page_referenced 562 * If it's recently referenced (perhaps page_referenced
563 * skipped over this mm) then we should reactivate it. 563 * skipped over this mm) then we should reactivate it.
564 */ 564 */
565 if ((vma->vm_flags & VM_LOCKED) || 565 if ((vma->vm_flags & VM_LOCKED) ||
566 (ptep_clear_flush_young(vma, address, pte) 566 (ptep_clear_flush_young(vma, address, pte)
567 && !migration)) { 567 && !migration)) {
568 ret = SWAP_FAIL; 568 ret = SWAP_FAIL;
569 goto out_unmap; 569 goto out_unmap;
570 } 570 }
571 571
572 /* Nuke the page table entry. */ 572 /* Nuke the page table entry. */
573 flush_cache_page(vma, address, page_to_pfn(page)); 573 flush_cache_page(vma, address, page_to_pfn(page));
574 pteval = ptep_clear_flush(vma, address, pte); 574 pteval = ptep_clear_flush(vma, address, pte);
575 575
576 /* Move the dirty bit to the physical page now the pte is gone. */ 576 /* Move the dirty bit to the physical page now the pte is gone. */
577 if (pte_dirty(pteval)) 577 if (pte_dirty(pteval))
578 set_page_dirty(page); 578 set_page_dirty(page);
579 579
580 /* Update high watermark before we lower rss */ 580 /* Update high watermark before we lower rss */
581 update_hiwater_rss(mm); 581 update_hiwater_rss(mm);
582 582
583 if (PageAnon(page)) { 583 if (PageAnon(page)) {
584 swp_entry_t entry = { .val = page_private(page) }; 584 swp_entry_t entry = { .val = page_private(page) };
585 585
586 if (PageSwapCache(page)) { 586 if (PageSwapCache(page)) {
587 /* 587 /*
588 * Store the swap location in the pte. 588 * Store the swap location in the pte.
589 * See handle_pte_fault() ... 589 * See handle_pte_fault() ...
590 */ 590 */
591 swap_duplicate(entry); 591 swap_duplicate(entry);
592 if (list_empty(&mm->mmlist)) { 592 if (list_empty(&mm->mmlist)) {
593 spin_lock(&mmlist_lock); 593 spin_lock(&mmlist_lock);
594 if (list_empty(&mm->mmlist)) 594 if (list_empty(&mm->mmlist))
595 list_add(&mm->mmlist, &init_mm.mmlist); 595 list_add(&mm->mmlist, &init_mm.mmlist);
596 spin_unlock(&mmlist_lock); 596 spin_unlock(&mmlist_lock);
597 } 597 }
598 dec_mm_counter(mm, anon_rss); 598 dec_mm_counter(mm, anon_rss);
599 #ifdef CONFIG_MIGRATION
599 } else { 600 } else {
600 /* 601 /*
601 * Store the pfn of the page in a special migration 602 * Store the pfn of the page in a special migration
602 * pte. do_swap_page() will wait until the migration 603 * pte. do_swap_page() will wait until the migration
603 * pte is removed and then restart fault handling. 604 * pte is removed and then restart fault handling.
604 */ 605 */
605 BUG_ON(!migration); 606 BUG_ON(!migration);
606 entry = make_migration_entry(page, pte_write(pteval)); 607 entry = make_migration_entry(page, pte_write(pteval));
608 #endif
607 } 609 }
608 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 610 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
609 BUG_ON(pte_file(*pte)); 611 BUG_ON(pte_file(*pte));
610 } else 612 } else
613 #ifdef CONFIG_MIGRATION
614 if (migration) {
615 /* Establish migration entry for a file page */
616 swp_entry_t entry;
617 entry = make_migration_entry(page, pte_write(pteval));
618 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
619 } else
620 #endif
611 dec_mm_counter(mm, file_rss); 621 dec_mm_counter(mm, file_rss);
622
612 623
613 page_remove_rmap(page); 624 page_remove_rmap(page);
614 page_cache_release(page); 625 page_cache_release(page);
615 626
616 out_unmap: 627 out_unmap:
617 pte_unmap_unlock(pte, ptl); 628 pte_unmap_unlock(pte, ptl);
618 out: 629 out:
619 return ret; 630 return ret;
620 } 631 }
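The hunks above are the point of this patch: with migration set, a present pte, anonymous or file backed, is replaced by a migration swap entry instead of being dropped. For orientation, a simplified sketch of the reverse step performed by remove_migration_pte() in mm/migrate.c; rmap and rss accounting, TLB flushing details and locking are elided, and this is not a verbatim copy.

static void restore_migration_pte_sketch(struct vm_area_struct *vma,
                unsigned long addr, pte_t *ptep,
                struct page *old, struct page *new)
{
        swp_entry_t entry;
        pte_t pte;

        if (pte_present(*ptep) || pte_none(*ptep) || pte_file(*ptep))
                return;                         /* not a swap-style entry */

        entry = pte_to_swp_entry(*ptep);
        if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
                return;                         /* some other swap entry */

        pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
        if (is_write_migration_entry(entry))    /* was SWP_MIGRATION_WRITE */
                pte = pte_mkwrite(pte);
        set_pte_at(vma->vm_mm, addr, ptep, pte);
        /* the real code also re-adds the rmap entry and flushes the TLB */
}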
621 632
622 /* 633 /*
623 * objrmap doesn't work for nonlinear VMAs because the assumption that 634 * objrmap doesn't work for nonlinear VMAs because the assumption that
624 * offset-into-file correlates with offset-into-virtual-addresses does not hold. 635 * offset-into-file correlates with offset-into-virtual-addresses does not hold.
625 * Consequently, given a particular page and its ->index, we cannot locate the 636 * Consequently, given a particular page and its ->index, we cannot locate the
626 * ptes which are mapping that page without an exhaustive linear search. 637 * ptes which are mapping that page without an exhaustive linear search.
627 * 638 *
628 * So what this code does is a mini "virtual scan" of each nonlinear VMA which 639 * So what this code does is a mini "virtual scan" of each nonlinear VMA which
629 * maps the file to which the target page belongs. The ->vm_private_data field 640 * maps the file to which the target page belongs. The ->vm_private_data field
630 * holds the current cursor into that scan. Successive searches will circulate 641 * holds the current cursor into that scan. Successive searches will circulate
631 * around the vma's virtual address space. 642 * around the vma's virtual address space.
632 * 643 *
633 * So as more replacement pressure is applied to the pages in a nonlinear VMA, 644 * So as more replacement pressure is applied to the pages in a nonlinear VMA,
634 * more scanning pressure is placed against them as well. Eventually pages 645 * more scanning pressure is placed against them as well. Eventually pages
635 * will become fully unmapped and are eligible for eviction. 646 * will become fully unmapped and are eligible for eviction.
636 * 647 *
637 * For very sparsely populated VMAs this is a little inefficient - chances are 648 * For very sparsely populated VMAs this is a little inefficient - chances are
638 * there won't be many ptes located within the scan cluster. In this case 649 * there won't be many ptes located within the scan cluster. In this case
639 * maybe we could scan further - to the end of the pte page, perhaps. 650 * maybe we could scan further - to the end of the pte page, perhaps.
640 */ 651 */
641 #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) 652 #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE)
642 #define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) 653 #define CLUSTER_MASK (~(CLUSTER_SIZE - 1))
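A worked example of the two defines, with hypothetical numbers and 4 KiB pages:

/*
 * With 4 KiB pages, CLUSTER_SIZE = min(32 * 4096, PMD_SIZE) = 128 KiB on
 * typical configurations, so CLUSTER_MASK = ~0x1ffff.  A cursor of
 * 0x23000 in a vma starting at 0x40000000 therefore scans the cluster
 * beginning at (0x40000000 + 0x23000) & ~0x1ffff = 0x40020000.
 */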
643 654
644 static void try_to_unmap_cluster(unsigned long cursor, 655 static void try_to_unmap_cluster(unsigned long cursor,
645 unsigned int *mapcount, struct vm_area_struct *vma) 656 unsigned int *mapcount, struct vm_area_struct *vma)
646 { 657 {
647 struct mm_struct *mm = vma->vm_mm; 658 struct mm_struct *mm = vma->vm_mm;
648 pgd_t *pgd; 659 pgd_t *pgd;
649 pud_t *pud; 660 pud_t *pud;
650 pmd_t *pmd; 661 pmd_t *pmd;
651 pte_t *pte; 662 pte_t *pte;
652 pte_t pteval; 663 pte_t pteval;
653 spinlock_t *ptl; 664 spinlock_t *ptl;
654 struct page *page; 665 struct page *page;
655 unsigned long address; 666 unsigned long address;
656 unsigned long end; 667 unsigned long end;
657 668
658 address = (vma->vm_start + cursor) & CLUSTER_MASK; 669 address = (vma->vm_start + cursor) & CLUSTER_MASK;
659 end = address + CLUSTER_SIZE; 670 end = address + CLUSTER_SIZE;
660 if (address < vma->vm_start) 671 if (address < vma->vm_start)
661 address = vma->vm_start; 672 address = vma->vm_start;
662 if (end > vma->vm_end) 673 if (end > vma->vm_end)
663 end = vma->vm_end; 674 end = vma->vm_end;
664 675
665 pgd = pgd_offset(mm, address); 676 pgd = pgd_offset(mm, address);
666 if (!pgd_present(*pgd)) 677 if (!pgd_present(*pgd))
667 return; 678 return;
668 679
669 pud = pud_offset(pgd, address); 680 pud = pud_offset(pgd, address);
670 if (!pud_present(*pud)) 681 if (!pud_present(*pud))
671 return; 682 return;
672 683
673 pmd = pmd_offset(pud, address); 684 pmd = pmd_offset(pud, address);
674 if (!pmd_present(*pmd)) 685 if (!pmd_present(*pmd))
675 return; 686 return;
676 687
677 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 688 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
678 689
679 /* Update high watermark before we lower rss */ 690 /* Update high watermark before we lower rss */
680 update_hiwater_rss(mm); 691 update_hiwater_rss(mm);
681 692
682 for (; address < end; pte++, address += PAGE_SIZE) { 693 for (; address < end; pte++, address += PAGE_SIZE) {
683 if (!pte_present(*pte)) 694 if (!pte_present(*pte))
684 continue; 695 continue;
685 page = vm_normal_page(vma, address, *pte); 696 page = vm_normal_page(vma, address, *pte);
686 BUG_ON(!page || PageAnon(page)); 697 BUG_ON(!page || PageAnon(page));
687 698
688 if (ptep_clear_flush_young(vma, address, pte)) 699 if (ptep_clear_flush_young(vma, address, pte))
689 continue; 700 continue;
690 701
691 /* Nuke the page table entry. */ 702 /* Nuke the page table entry. */
692 flush_cache_page(vma, address, pte_pfn(*pte)); 703 flush_cache_page(vma, address, pte_pfn(*pte));
693 pteval = ptep_clear_flush(vma, address, pte); 704 pteval = ptep_clear_flush(vma, address, pte);
694 705
695 /* If nonlinear, store the file page offset in the pte. */ 706 /* If nonlinear, store the file page offset in the pte. */
696 if (page->index != linear_page_index(vma, address)) 707 if (page->index != linear_page_index(vma, address))
697 set_pte_at(mm, address, pte, pgoff_to_pte(page->index)); 708 set_pte_at(mm, address, pte, pgoff_to_pte(page->index));
698 709
699 /* Move the dirty bit to the physical page now the pte is gone. */ 710 /* Move the dirty bit to the physical page now the pte is gone. */
700 if (pte_dirty(pteval)) 711 if (pte_dirty(pteval))
701 set_page_dirty(page); 712 set_page_dirty(page);
702 713
703 page_remove_rmap(page); 714 page_remove_rmap(page);
704 page_cache_release(page); 715 page_cache_release(page);
705 dec_mm_counter(mm, file_rss); 716 dec_mm_counter(mm, file_rss);
706 (*mapcount)--; 717 (*mapcount)--;
707 } 718 }
708 pte_unmap_unlock(pte - 1, ptl); 719 pte_unmap_unlock(pte - 1, ptl);
709 } 720 }
710 721
711 static int try_to_unmap_anon(struct page *page, int migration) 722 static int try_to_unmap_anon(struct page *page, int migration)
712 { 723 {
713 struct anon_vma *anon_vma; 724 struct anon_vma *anon_vma;
714 struct vm_area_struct *vma; 725 struct vm_area_struct *vma;
715 int ret = SWAP_AGAIN; 726 int ret = SWAP_AGAIN;
716 727
717 anon_vma = page_lock_anon_vma(page); 728 anon_vma = page_lock_anon_vma(page);
718 if (!anon_vma) 729 if (!anon_vma)
719 return ret; 730 return ret;
720 731
721 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 732 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
722 ret = try_to_unmap_one(page, vma, migration); 733 ret = try_to_unmap_one(page, vma, migration);
723 if (ret == SWAP_FAIL || !page_mapped(page)) 734 if (ret == SWAP_FAIL || !page_mapped(page))
724 break; 735 break;
725 } 736 }
726 spin_unlock(&anon_vma->lock); 737 spin_unlock(&anon_vma->lock);
727 return ret; 738 return ret;
728 } 739 }
729 740
730 /** 741 /**
731 * try_to_unmap_file - unmap file page using the object-based rmap method 742 * try_to_unmap_file - unmap file page using the object-based rmap method
732 * @page: the page to unmap 743 * @page: the page to unmap
733 * 744 *
734 * Find all the mappings of a page using the mapping pointer and the vma chains 745 * Find all the mappings of a page using the mapping pointer and the vma chains
735 * contained in the address_space struct it points to. 746 * contained in the address_space struct it points to.
736 * 747 *
737 * This function is only called from try_to_unmap for object-based pages. 748 * This function is only called from try_to_unmap for object-based pages.
738 */ 749 */
739 static int try_to_unmap_file(struct page *page, int migration) 750 static int try_to_unmap_file(struct page *page, int migration)
740 { 751 {
741 struct address_space *mapping = page->mapping; 752 struct address_space *mapping = page->mapping;
742 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 753 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
743 struct vm_area_struct *vma; 754 struct vm_area_struct *vma;
744 struct prio_tree_iter iter; 755 struct prio_tree_iter iter;
745 int ret = SWAP_AGAIN; 756 int ret = SWAP_AGAIN;
746 unsigned long cursor; 757 unsigned long cursor;
747 unsigned long max_nl_cursor = 0; 758 unsigned long max_nl_cursor = 0;
748 unsigned long max_nl_size = 0; 759 unsigned long max_nl_size = 0;
749 unsigned int mapcount; 760 unsigned int mapcount;
750 761
751 spin_lock(&mapping->i_mmap_lock); 762 spin_lock(&mapping->i_mmap_lock);
752 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 763 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
753 ret = try_to_unmap_one(page, vma, migration); 764 ret = try_to_unmap_one(page, vma, migration);
754 if (ret == SWAP_FAIL || !page_mapped(page)) 765 if (ret == SWAP_FAIL || !page_mapped(page))
755 goto out; 766 goto out;
756 } 767 }
757 768
758 if (list_empty(&mapping->i_mmap_nonlinear)) 769 if (list_empty(&mapping->i_mmap_nonlinear))
759 goto out; 770 goto out;
760 771
761 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 772 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
762 shared.vm_set.list) { 773 shared.vm_set.list) {
763 if (vma->vm_flags & VM_LOCKED) 774 if (vma->vm_flags & VM_LOCKED)
764 continue; 775 continue;
765 cursor = (unsigned long) vma->vm_private_data; 776 cursor = (unsigned long) vma->vm_private_data;
766 if (cursor > max_nl_cursor) 777 if (cursor > max_nl_cursor)
767 max_nl_cursor = cursor; 778 max_nl_cursor = cursor;
768 cursor = vma->vm_end - vma->vm_start; 779 cursor = vma->vm_end - vma->vm_start;
769 if (cursor > max_nl_size) 780 if (cursor > max_nl_size)
770 max_nl_size = cursor; 781 max_nl_size = cursor;
771 } 782 }
772 783
773 if (max_nl_size == 0) { /* any nonlinears locked or reserved */ 784 if (max_nl_size == 0) { /* any nonlinears locked or reserved */
774 ret = SWAP_FAIL; 785 ret = SWAP_FAIL;
775 goto out; 786 goto out;
776 } 787 }
777 788
778 /* 789 /*
779 * We don't try to search for this page in the nonlinear vmas, 790 * We don't try to search for this page in the nonlinear vmas,
780 * and page_referenced wouldn't have found it anyway. Instead 791 * and page_referenced wouldn't have found it anyway. Instead
781 * just walk the nonlinear vmas trying to age and unmap some. 792 * just walk the nonlinear vmas trying to age and unmap some.
782 * The mapcount of the page we came in with is irrelevant, 793 * The mapcount of the page we came in with is irrelevant,
783 * but even so use it as a guide to how hard we should try? 794 * but even so use it as a guide to how hard we should try?
784 */ 795 */
785 mapcount = page_mapcount(page); 796 mapcount = page_mapcount(page);
786 if (!mapcount) 797 if (!mapcount)
787 goto out; 798 goto out;
788 cond_resched_lock(&mapping->i_mmap_lock); 799 cond_resched_lock(&mapping->i_mmap_lock);
789 800
790 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; 801 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
791 if (max_nl_cursor == 0) 802 if (max_nl_cursor == 0)
792 max_nl_cursor = CLUSTER_SIZE; 803 max_nl_cursor = CLUSTER_SIZE;
793 804
794 do { 805 do {
795 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 806 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
796 shared.vm_set.list) { 807 shared.vm_set.list) {
797 if (vma->vm_flags & VM_LOCKED) 808 if (vma->vm_flags & VM_LOCKED)
798 continue; 809 continue;
799 cursor = (unsigned long) vma->vm_private_data; 810 cursor = (unsigned long) vma->vm_private_data;
800 while ( cursor < max_nl_cursor && 811 while ( cursor < max_nl_cursor &&
801 cursor < vma->vm_end - vma->vm_start) { 812 cursor < vma->vm_end - vma->vm_start) {
802 try_to_unmap_cluster(cursor, &mapcount, vma); 813 try_to_unmap_cluster(cursor, &mapcount, vma);
803 cursor += CLUSTER_SIZE; 814 cursor += CLUSTER_SIZE;
804 vma->vm_private_data = (void *) cursor; 815 vma->vm_private_data = (void *) cursor;
805 if ((int)mapcount <= 0) 816 if ((int)mapcount <= 0)
806 goto out; 817 goto out;
807 } 818 }
808 vma->vm_private_data = (void *) max_nl_cursor; 819 vma->vm_private_data = (void *) max_nl_cursor;
809 } 820 }
810 cond_resched_lock(&mapping->i_mmap_lock); 821 cond_resched_lock(&mapping->i_mmap_lock);
811 max_nl_cursor += CLUSTER_SIZE; 822 max_nl_cursor += CLUSTER_SIZE;
812 } while (max_nl_cursor <= max_nl_size); 823 } while (max_nl_cursor <= max_nl_size);
813 824
814 /* 825 /*
815 * Don't loop forever (perhaps all the remaining pages are 826 * Don't loop forever (perhaps all the remaining pages are
816 * in locked vmas). Reset cursor on all unreserved nonlinear 827 * in locked vmas). Reset cursor on all unreserved nonlinear
817 * vmas, now forgetting on which ones it had fallen behind. 828 * vmas, now forgetting on which ones it had fallen behind.
818 */ 829 */
819 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) 830 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
820 vma->vm_private_data = NULL; 831 vma->vm_private_data = NULL;
821 out: 832 out:
822 spin_unlock(&mapping->i_mmap_lock); 833 spin_unlock(&mapping->i_mmap_lock);
823 return ret; 834 return ret;
824 } 835 }
825 836
826 /** 837 /**
827 * try_to_unmap - try to remove all page table mappings to a page 838 * try_to_unmap - try to remove all page table mappings to a page
828 * @page: the page to get unmapped 839 * @page: the page to get unmapped
829 * 840 *
830 * Tries to remove all the page table entries which are mapping this 841 * Tries to remove all the page table entries which are mapping this
831 * page, used in the pageout path. Caller must hold the page lock. 842 * page, used in the pageout path. Caller must hold the page lock.
832 * Return values are: 843 * Return values are:
833 * 844 *
834 * SWAP_SUCCESS - we succeeded in removing all mappings 845 * SWAP_SUCCESS - we succeeded in removing all mappings
835 * SWAP_AGAIN - we missed a mapping, try again later 846 * SWAP_AGAIN - we missed a mapping, try again later
836 * SWAP_FAIL - the page is unswappable 847 * SWAP_FAIL - the page is unswappable
837 */ 848 */
838 int try_to_unmap(struct page *page, int migration) 849 int try_to_unmap(struct page *page, int migration)
839 { 850 {
840 int ret; 851 int ret;
841 852
842 BUG_ON(!PageLocked(page)); 853 BUG_ON(!PageLocked(page));
843 854
844 if (PageAnon(page)) 855 if (PageAnon(page))
845 ret = try_to_unmap_anon(page, migration); 856 ret = try_to_unmap_anon(page, migration);
846 else 857 else
847 ret = try_to_unmap_file(page, migration); 858 ret = try_to_unmap_file(page, migration);
848 859
849 if (!page_mapped(page)) 860 if (!page_mapped(page))
850 ret = SWAP_SUCCESS; 861 ret = SWAP_SUCCESS;
851 return ret; 862 return ret;
852 } 863 }
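A caller-side sketch of the migration case, which is what the migration argument is for; simplified, with a made-up function name, and not part of the patch.

/* The migration path wants ptes converted to migration entries, not discarded. */
static int unmap_for_migration_sketch(struct page *page)
{
        if (try_to_unmap(page, 1) != SWAP_SUCCESS)
                return -EAGAIN;         /* some mapping remained; retry later */
        return 0;                       /* linear ptes now hold migration entries */
}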
853 864
854 865
mm/vmscan.c
1 /* 1 /*
2 * linux/mm/vmscan.c 2 * linux/mm/vmscan.c
3 * 3 *
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 * 5 *
6 * Swap reorganised 29.12.95, Stephen Tweedie. 6 * Swap reorganised 29.12.95, Stephen Tweedie.
7 * kswapd added: 7.1.96 sct 7 * kswapd added: 7.1.96 sct
8 * Removed kswapd_ctl limits, and swap out as many pages as needed 8 * Removed kswapd_ctl limits, and swap out as many pages as needed
9 * to bring the system back to freepages.high: 2.4.97, Rik van Riel. 9 * to bring the system back to freepages.high: 2.4.97, Rik van Riel.
10 * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). 10 * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
11 * Multiqueue VM started 5.8.00, Rik van Riel. 11 * Multiqueue VM started 5.8.00, Rik van Riel.
12 */ 12 */
13 13
14 #include <linux/mm.h> 14 #include <linux/mm.h>
15 #include <linux/module.h> 15 #include <linux/module.h>
16 #include <linux/slab.h> 16 #include <linux/slab.h>
17 #include <linux/kernel_stat.h> 17 #include <linux/kernel_stat.h>
18 #include <linux/swap.h> 18 #include <linux/swap.h>
19 #include <linux/pagemap.h> 19 #include <linux/pagemap.h>
20 #include <linux/init.h> 20 #include <linux/init.h>
21 #include <linux/highmem.h> 21 #include <linux/highmem.h>
22 #include <linux/file.h> 22 #include <linux/file.h>
23 #include <linux/writeback.h> 23 #include <linux/writeback.h>
24 #include <linux/blkdev.h> 24 #include <linux/blkdev.h>
25 #include <linux/buffer_head.h> /* for try_to_release_page(), 25 #include <linux/buffer_head.h> /* for try_to_release_page(),
26 buffer_heads_over_limit */ 26 buffer_heads_over_limit */
27 #include <linux/mm_inline.h> 27 #include <linux/mm_inline.h>
28 #include <linux/pagevec.h> 28 #include <linux/pagevec.h>
29 #include <linux/backing-dev.h> 29 #include <linux/backing-dev.h>
30 #include <linux/rmap.h> 30 #include <linux/rmap.h>
31 #include <linux/topology.h> 31 #include <linux/topology.h>
32 #include <linux/cpu.h> 32 #include <linux/cpu.h>
33 #include <linux/cpuset.h> 33 #include <linux/cpuset.h>
34 #include <linux/notifier.h> 34 #include <linux/notifier.h>
35 #include <linux/rwsem.h> 35 #include <linux/rwsem.h>
36 #include <linux/delay.h> 36 #include <linux/delay.h>
37 37
38 #include <asm/tlbflush.h> 38 #include <asm/tlbflush.h>
39 #include <asm/div64.h> 39 #include <asm/div64.h>
40 40
41 #include <linux/swapops.h> 41 #include <linux/swapops.h>
42 42
43 #include "internal.h" 43 #include "internal.h"
44 44
45 struct scan_control { 45 struct scan_control {
46 /* Incremented by the number of inactive pages that were scanned */ 46 /* Incremented by the number of inactive pages that were scanned */
47 unsigned long nr_scanned; 47 unsigned long nr_scanned;
48 48
49 unsigned long nr_mapped; /* From page_state */ 49 unsigned long nr_mapped; /* From page_state */
50 50
51 /* This context's GFP mask */ 51 /* This context's GFP mask */
52 gfp_t gfp_mask; 52 gfp_t gfp_mask;
53 53
54 int may_writepage; 54 int may_writepage;
55 55
56 /* Can pages be swapped as part of reclaim? */ 56 /* Can pages be swapped as part of reclaim? */
57 int may_swap; 57 int may_swap;
58 58
59 /* This context's SWAP_CLUSTER_MAX. If freeing memory for 59 /* This context's SWAP_CLUSTER_MAX. If freeing memory for
60 * suspend, we effectively ignore SWAP_CLUSTER_MAX. 60 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
61 * In this context, it doesn't matter that we scan the 61 * In this context, it doesn't matter that we scan the
62 * whole list at once. */ 62 * whole list at once. */
63 int swap_cluster_max; 63 int swap_cluster_max;
64 64
65 int swappiness; 65 int swappiness;
66 }; 66 };
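A sketch of how a reclaim entry point fills this in, loosely modeled on try_to_free_pages(); the helper name is made up and this is not part of the patch.

static void init_scan_control_sketch(struct scan_control *sc, gfp_t gfp_mask)
{
        sc->nr_scanned = 0;
        sc->nr_mapped = read_page_state(nr_mapped);
        sc->gfp_mask = gfp_mask;
        sc->may_writepage = !laptop_mode;       /* avoid spinning up disks */
        sc->may_swap = 1;
        sc->swap_cluster_max = SWAP_CLUSTER_MAX;
        sc->swappiness = vm_swappiness;
}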
67 67
68 /* 68 /*
69 * The list of shrinker callbacks used to apply pressure to 69 * The list of shrinker callbacks used to apply pressure to
70 * ageable caches. 70 * ageable caches.
71 */ 71 */
72 struct shrinker { 72 struct shrinker {
73 shrinker_t shrinker; 73 shrinker_t shrinker;
74 struct list_head list; 74 struct list_head list;
75 int seeks; /* seeks to recreate an obj */ 75 int seeks; /* seeks to recreate an obj */
76 long nr; /* objs pending delete */ 76 long nr; /* objs pending delete */
77 }; 77 };
78 78
79 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 79 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
80 80
81 #ifdef ARCH_HAS_PREFETCH 81 #ifdef ARCH_HAS_PREFETCH
82 #define prefetch_prev_lru_page(_page, _base, _field) \ 82 #define prefetch_prev_lru_page(_page, _base, _field) \
83 do { \ 83 do { \
84 if ((_page)->lru.prev != _base) { \ 84 if ((_page)->lru.prev != _base) { \
85 struct page *prev; \ 85 struct page *prev; \
86 \ 86 \
87 prev = lru_to_page(&(_page->lru)); \ 87 prev = lru_to_page(&(_page->lru)); \
88 prefetch(&prev->_field); \ 88 prefetch(&prev->_field); \
89 } \ 89 } \
90 } while (0) 90 } while (0)
91 #else 91 #else
92 #define prefetch_prev_lru_page(_page, _base, _field) do { } while (0) 92 #define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
93 #endif 93 #endif
94 94
95 #ifdef ARCH_HAS_PREFETCHW 95 #ifdef ARCH_HAS_PREFETCHW
96 #define prefetchw_prev_lru_page(_page, _base, _field) \ 96 #define prefetchw_prev_lru_page(_page, _base, _field) \
97 do { \ 97 do { \
98 if ((_page)->lru.prev != _base) { \ 98 if ((_page)->lru.prev != _base) { \
99 struct page *prev; \ 99 struct page *prev; \
100 \ 100 \
101 prev = lru_to_page(&(_page->lru)); \ 101 prev = lru_to_page(&(_page->lru)); \
102 prefetchw(&prev->_field); \ 102 prefetchw(&prev->_field); \
103 } \ 103 } \
104 } while (0) 104 } while (0)
105 #else 105 #else
106 #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) 106 #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
107 #endif 107 #endif
108 108
109 /* 109 /*
110 * From 0 .. 100. Higher means more swappy. 110 * From 0 .. 100. Higher means more swappy.
111 */ 111 */
112 int vm_swappiness = 60; 112 int vm_swappiness = 60;
113 static long total_memory; 113 static long total_memory;
114 114
115 static LIST_HEAD(shrinker_list); 115 static LIST_HEAD(shrinker_list);
116 static DECLARE_RWSEM(shrinker_rwsem); 116 static DECLARE_RWSEM(shrinker_rwsem);
117 117
118 /* 118 /*
119 * Add a shrinker callback to be called from the vm 119 * Add a shrinker callback to be called from the vm
120 */ 120 */
121 struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker) 121 struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker)
122 { 122 {
123 struct shrinker *shrinker; 123 struct shrinker *shrinker;
124 124
125 shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL); 125 shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL);
126 if (shrinker) { 126 if (shrinker) {
127 shrinker->shrinker = theshrinker; 127 shrinker->shrinker = theshrinker;
128 shrinker->seeks = seeks; 128 shrinker->seeks = seeks;
129 shrinker->nr = 0; 129 shrinker->nr = 0;
130 down_write(&shrinker_rwsem); 130 down_write(&shrinker_rwsem);
131 list_add_tail(&shrinker->list, &shrinker_list); 131 list_add_tail(&shrinker->list, &shrinker_list);
132 up_write(&shrinker_rwsem); 132 up_write(&shrinker_rwsem);
133 } 133 }
134 return shrinker; 134 return shrinker;
135 } 135 }
136 EXPORT_SYMBOL(set_shrinker); 136 EXPORT_SYMBOL(set_shrinker);
137 137
138 /* 138 /*
139 * Remove one 139 * Remove one
140 */ 140 */
141 void remove_shrinker(struct shrinker *shrinker) 141 void remove_shrinker(struct shrinker *shrinker)
142 { 142 {
143 down_write(&shrinker_rwsem); 143 down_write(&shrinker_rwsem);
144 list_del(&shrinker->list); 144 list_del(&shrinker->list);
145 up_write(&shrinker_rwsem); 145 up_write(&shrinker_rwsem);
146 kfree(shrinker); 146 kfree(shrinker);
147 } 147 }
148 EXPORT_SYMBOL(remove_shrinker); 148 EXPORT_SYMBOL(remove_shrinker);
149 149
150 #define SHRINK_BATCH 128 150 #define SHRINK_BATCH 128
151 /* 151 /*
152 * Call the shrink functions to age shrinkable caches 152 * Call the shrink functions to age shrinkable caches
153 * 153 *
154 * Here we assume it costs one seek to replace a lru page and that it also 154 * Here we assume it costs one seek to replace a lru page and that it also
155 * takes a seek to recreate a cache object. With this in mind we age equal 155 * takes a seek to recreate a cache object. With this in mind we age equal
156 * percentages of the lru and ageable caches. This should balance the seeks 156 * percentages of the lru and ageable caches. This should balance the seeks
157 * generated by these structures. 157 * generated by these structures.
158 * 158 *
159 * If the vm encountered mapped pages on the LRU it increases the pressure on 159 * If the vm encountered mapped pages on the LRU it increases the pressure on
160 * slab to avoid swapping. 160 * slab to avoid swapping.
161 * 161 *
162 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. 162 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
163 * 163 *
164 * `lru_pages' represents the number of on-LRU pages in all the zones which 164 * `lru_pages' represents the number of on-LRU pages in all the zones which
165 * are eligible for the caller's allocation attempt. It is used for balancing 165 * are eligible for the caller's allocation attempt. It is used for balancing
166 * slab reclaim versus page reclaim. 166 * slab reclaim versus page reclaim.
167 * 167 *
168 * Returns the number of slab objects which we shrunk. 168 * Returns the number of slab objects which we shrunk.
169 */ 169 */
170 unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, 170 unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
171 unsigned long lru_pages) 171 unsigned long lru_pages)
172 { 172 {
173 struct shrinker *shrinker; 173 struct shrinker *shrinker;
174 unsigned long ret = 0; 174 unsigned long ret = 0;
175 175
176 if (scanned == 0) 176 if (scanned == 0)
177 scanned = SWAP_CLUSTER_MAX; 177 scanned = SWAP_CLUSTER_MAX;
178 178
179 if (!down_read_trylock(&shrinker_rwsem)) 179 if (!down_read_trylock(&shrinker_rwsem))
180 return 1; /* Assume we'll be able to shrink next time */ 180 return 1; /* Assume we'll be able to shrink next time */
181 181
182 list_for_each_entry(shrinker, &shrinker_list, list) { 182 list_for_each_entry(shrinker, &shrinker_list, list) {
183 unsigned long long delta; 183 unsigned long long delta;
184 unsigned long total_scan; 184 unsigned long total_scan;
185 unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask); 185 unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask);
186 186
187 delta = (4 * scanned) / shrinker->seeks; 187 delta = (4 * scanned) / shrinker->seeks;
188 delta *= max_pass; 188 delta *= max_pass;
189 do_div(delta, lru_pages + 1); 189 do_div(delta, lru_pages + 1);
190 shrinker->nr += delta; 190 shrinker->nr += delta;
191 if (shrinker->nr < 0) { 191 if (shrinker->nr < 0) {
192 printk(KERN_ERR "%s: nr=%ld\n", 192 printk(KERN_ERR "%s: nr=%ld\n",
193 __FUNCTION__, shrinker->nr); 193 __FUNCTION__, shrinker->nr);
194 shrinker->nr = max_pass; 194 shrinker->nr = max_pass;
195 } 195 }
196 196
197 /* 197 /*
198 * Avoid risking looping forever due to too large nr value: 198 * Avoid risking looping forever due to too large nr value:
199 * never try to free more than twice the estimated number of 199 * never try to free more than twice the estimated number of
200 * freeable entries. 200 * freeable entries.
201 */ 201 */
202 if (shrinker->nr > max_pass * 2) 202 if (shrinker->nr > max_pass * 2)
203 shrinker->nr = max_pass * 2; 203 shrinker->nr = max_pass * 2;
204 204
205 total_scan = shrinker->nr; 205 total_scan = shrinker->nr;
206 shrinker->nr = 0; 206 shrinker->nr = 0;
207 207
208 while (total_scan >= SHRINK_BATCH) { 208 while (total_scan >= SHRINK_BATCH) {
209 long this_scan = SHRINK_BATCH; 209 long this_scan = SHRINK_BATCH;
210 int shrink_ret; 210 int shrink_ret;
211 int nr_before; 211 int nr_before;
212 212
213 nr_before = (*shrinker->shrinker)(0, gfp_mask); 213 nr_before = (*shrinker->shrinker)(0, gfp_mask);
214 shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask); 214 shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
215 if (shrink_ret == -1) 215 if (shrink_ret == -1)
216 break; 216 break;
217 if (shrink_ret < nr_before) 217 if (shrink_ret < nr_before)
218 ret += nr_before - shrink_ret; 218 ret += nr_before - shrink_ret;
219 mod_page_state(slabs_scanned, this_scan); 219 mod_page_state(slabs_scanned, this_scan);
220 total_scan -= this_scan; 220 total_scan -= this_scan;
221 221
222 cond_resched(); 222 cond_resched();
223 } 223 }
224 224
225 shrinker->nr += total_scan; 225 shrinker->nr += total_scan;
226 } 226 }
227 up_read(&shrinker_rwsem); 227 up_read(&shrinker_rwsem);
228 return ret; 228 return ret;
229 } 229 }
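
A minimal userspace sketch of the slab-pressure arithmetic in shrink_slab() above, under stated assumptions: the shrinker parameters and LRU counts are made up, and do_div() is replaced by plain 64-bit division so the snippet builds outside the kernel.

#include <stdio.h>

/* Hypothetical shrinker: seeks = 2 and 50000 freeable objects, under a
 * scan of 1024 LRU pages out of 200000 eligible LRU pages. */
int main(void)
{
	unsigned long scanned = 1024, lru_pages = 200000, max_pass = 50000;
	int seeks = 2;
	unsigned long long delta;

	delta = (4ULL * scanned) / seeks;	/* base pressure from the LRU scan */
	delta *= max_pass;			/* scale by the size of this cache */
	delta /= lru_pages + 1;			/* normalize by eligible LRU pages */

	/* Prints 511, i.e. roughly four SHRINK_BATCH (128) sized passes. */
	printf("objects to scan: %llu\n", delta);
	return 0;
}
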
230 230
231 /* Called without lock on whether page is mapped, so answer is unstable */ 231 /* Called without lock on whether page is mapped, so answer is unstable */
232 static inline int page_mapping_inuse(struct page *page) 232 static inline int page_mapping_inuse(struct page *page)
233 { 233 {
234 struct address_space *mapping; 234 struct address_space *mapping;
235 235
236 /* Page is in somebody's page tables. */ 236 /* Page is in somebody's page tables. */
237 if (page_mapped(page)) 237 if (page_mapped(page))
238 return 1; 238 return 1;
239 239
240 /* Be more reluctant to reclaim swapcache than pagecache */ 240 /* Be more reluctant to reclaim swapcache than pagecache */
241 if (PageSwapCache(page)) 241 if (PageSwapCache(page))
242 return 1; 242 return 1;
243 243
244 mapping = page_mapping(page); 244 mapping = page_mapping(page);
245 if (!mapping) 245 if (!mapping)
246 return 0; 246 return 0;
247 247
248 /* File is mmap'd by somebody? */ 248 /* File is mmap'd by somebody? */
249 return mapping_mapped(mapping); 249 return mapping_mapped(mapping);
250 } 250 }
251 251
252 static inline int is_page_cache_freeable(struct page *page) 252 static inline int is_page_cache_freeable(struct page *page)
253 { 253 {
254 return page_count(page) - !!PagePrivate(page) == 2; 254 return page_count(page) - !!PagePrivate(page) == 2;
255 } 255 }
256 256
257 static int may_write_to_queue(struct backing_dev_info *bdi) 257 static int may_write_to_queue(struct backing_dev_info *bdi)
258 { 258 {
259 if (current->flags & PF_SWAPWRITE) 259 if (current->flags & PF_SWAPWRITE)
260 return 1; 260 return 1;
261 if (!bdi_write_congested(bdi)) 261 if (!bdi_write_congested(bdi))
262 return 1; 262 return 1;
263 if (bdi == current->backing_dev_info) 263 if (bdi == current->backing_dev_info)
264 return 1; 264 return 1;
265 return 0; 265 return 0;
266 } 266 }
267 267
268 /* 268 /*
269 * We detected a synchronous write error writing a page out. Probably 269 * We detected a synchronous write error writing a page out. Probably
270 * -ENOSPC. We need to propagate that into the address_space for a subsequent 270 * -ENOSPC. We need to propagate that into the address_space for a subsequent
271 * fsync(), msync() or close(). 271 * fsync(), msync() or close().
272 * 272 *
273 * The tricky part is that after writepage we cannot touch the mapping: nothing 273 * The tricky part is that after writepage we cannot touch the mapping: nothing
274 * prevents it from being freed up. But we have a ref on the page and once 274 * prevents it from being freed up. But we have a ref on the page and once
275 * that page is locked, the mapping is pinned. 275 * that page is locked, the mapping is pinned.
276 * 276 *
277 * We're allowed to run sleeping lock_page() here because we know the caller has 277 * We're allowed to run sleeping lock_page() here because we know the caller has
278 * __GFP_FS. 278 * __GFP_FS.
279 */ 279 */
280 static void handle_write_error(struct address_space *mapping, 280 static void handle_write_error(struct address_space *mapping,
281 struct page *page, int error) 281 struct page *page, int error)
282 { 282 {
283 lock_page(page); 283 lock_page(page);
284 if (page_mapping(page) == mapping) { 284 if (page_mapping(page) == mapping) {
285 if (error == -ENOSPC) 285 if (error == -ENOSPC)
286 set_bit(AS_ENOSPC, &mapping->flags); 286 set_bit(AS_ENOSPC, &mapping->flags);
287 else 287 else
288 set_bit(AS_EIO, &mapping->flags); 288 set_bit(AS_EIO, &mapping->flags);
289 } 289 }
290 unlock_page(page); 290 unlock_page(page);
291 } 291 }
292 292
293 /* possible outcomes of pageout() */
294 typedef enum {
295 /* failed to write page out, page is locked */
296 PAGE_KEEP,
297 /* move page to the active list, page is locked */
298 PAGE_ACTIVATE,
299 /* page has been sent to the disk successfully, page is unlocked */
300 PAGE_SUCCESS,
301 /* page is clean and locked */
302 PAGE_CLEAN,
303 } pageout_t;
304
293 /* 305 /*
294 * pageout is called by shrink_page_list() for each dirty page. 306 * pageout is called by shrink_page_list() for each dirty page.
295 * Calls ->writepage(). 307 * Calls ->writepage().
296 */ 308 */
297 pageout_t pageout(struct page *page, struct address_space *mapping) 309 static pageout_t pageout(struct page *page, struct address_space *mapping)
298 { 310 {
299 /* 311 /*
300 * If the page is dirty, only perform writeback if that write 312 * If the page is dirty, only perform writeback if that write
301 * will be non-blocking, to prevent this allocation from being 313 * will be non-blocking, to prevent this allocation from being
302 * stalled by pagecache activity. But note that there may be 314 * stalled by pagecache activity. But note that there may be
303 * stalls if we need to run get_block(). We could test 315 * stalls if we need to run get_block(). We could test
304 * PagePrivate for that. 316 * PagePrivate for that.
305 * 317 *
306 * If this process is currently in generic_file_write() against 318 * If this process is currently in generic_file_write() against
307 * this page's queue, we can perform writeback even if that 319 * this page's queue, we can perform writeback even if that
308 * will block. 320 * will block.
309 * 321 *
310 * If the page is swapcache, write it back even if that would 322 * If the page is swapcache, write it back even if that would
311 * block, for some throttling. This happens by accident, because 323 * block, for some throttling. This happens by accident, because
312 * swap_backing_dev_info is bust: it doesn't reflect the 324 * swap_backing_dev_info is bust: it doesn't reflect the
313 * congestion state of the swapdevs. Easy to fix, if needed. 325 * congestion state of the swapdevs. Easy to fix, if needed.
314 * See swapfile.c:page_queue_congested(). 326 * See swapfile.c:page_queue_congested().
315 */ 327 */
316 if (!is_page_cache_freeable(page)) 328 if (!is_page_cache_freeable(page))
317 return PAGE_KEEP; 329 return PAGE_KEEP;
318 if (!mapping) { 330 if (!mapping) {
319 /* 331 /*
320 * Some data journaling orphaned pages can have 332 * Some data journaling orphaned pages can have
321 * page->mapping == NULL while being dirty with clean buffers. 333 * page->mapping == NULL while being dirty with clean buffers.
322 */ 334 */
323 if (PagePrivate(page)) { 335 if (PagePrivate(page)) {
324 if (try_to_free_buffers(page)) { 336 if (try_to_free_buffers(page)) {
325 ClearPageDirty(page); 337 ClearPageDirty(page);
326 printk("%s: orphaned page\n", __FUNCTION__); 338 printk("%s: orphaned page\n", __FUNCTION__);
327 return PAGE_CLEAN; 339 return PAGE_CLEAN;
328 } 340 }
329 } 341 }
330 return PAGE_KEEP; 342 return PAGE_KEEP;
331 } 343 }
332 if (mapping->a_ops->writepage == NULL) 344 if (mapping->a_ops->writepage == NULL)
333 return PAGE_ACTIVATE; 345 return PAGE_ACTIVATE;
334 if (!may_write_to_queue(mapping->backing_dev_info)) 346 if (!may_write_to_queue(mapping->backing_dev_info))
335 return PAGE_KEEP; 347 return PAGE_KEEP;
336 348
337 if (clear_page_dirty_for_io(page)) { 349 if (clear_page_dirty_for_io(page)) {
338 int res; 350 int res;
339 struct writeback_control wbc = { 351 struct writeback_control wbc = {
340 .sync_mode = WB_SYNC_NONE, 352 .sync_mode = WB_SYNC_NONE,
341 .nr_to_write = SWAP_CLUSTER_MAX, 353 .nr_to_write = SWAP_CLUSTER_MAX,
342 .range_start = 0, 354 .range_start = 0,
343 .range_end = LLONG_MAX, 355 .range_end = LLONG_MAX,
344 .nonblocking = 1, 356 .nonblocking = 1,
345 .for_reclaim = 1, 357 .for_reclaim = 1,
346 }; 358 };
347 359
348 SetPageReclaim(page); 360 SetPageReclaim(page);
349 res = mapping->a_ops->writepage(page, &wbc); 361 res = mapping->a_ops->writepage(page, &wbc);
350 if (res < 0) 362 if (res < 0)
351 handle_write_error(mapping, page, res); 363 handle_write_error(mapping, page, res);
352 if (res == AOP_WRITEPAGE_ACTIVATE) { 364 if (res == AOP_WRITEPAGE_ACTIVATE) {
353 ClearPageReclaim(page); 365 ClearPageReclaim(page);
354 return PAGE_ACTIVATE; 366 return PAGE_ACTIVATE;
355 } 367 }
356 if (!PageWriteback(page)) { 368 if (!PageWriteback(page)) {
357 /* synchronous write or broken a_ops? */ 369 /* synchronous write or broken a_ops? */
358 ClearPageReclaim(page); 370 ClearPageReclaim(page);
359 } 371 }
360 372
361 return PAGE_SUCCESS; 373 return PAGE_SUCCESS;
362 } 374 }
363 375
364 return PAGE_CLEAN; 376 return PAGE_CLEAN;
365 } 377 }
366 378
367 int remove_mapping(struct address_space *mapping, struct page *page) 379 int remove_mapping(struct address_space *mapping, struct page *page)
368 { 380 {
369 if (!mapping) 381 if (!mapping)
370 return 0; /* truncate got there first */ 382 return 0; /* truncate got there first */
371 383
372 write_lock_irq(&mapping->tree_lock); 384 write_lock_irq(&mapping->tree_lock);
373 385
374 /* 386 /*
375 * The non-racy check for busy page. It is critical to check 387 * The non-racy check for busy page. It is critical to check
376 * PageDirty _after_ making sure that the page is freeable and 388 * PageDirty _after_ making sure that the page is freeable and
377 * not in use by anybody. (pagecache + us == 2) 389 * not in use by anybody. (pagecache + us == 2)
378 */ 390 */
379 if (unlikely(page_count(page) != 2)) 391 if (unlikely(page_count(page) != 2))
380 goto cannot_free; 392 goto cannot_free;
381 smp_rmb(); 393 smp_rmb();
382 if (unlikely(PageDirty(page))) 394 if (unlikely(PageDirty(page)))
383 goto cannot_free; 395 goto cannot_free;
384 396
385 if (PageSwapCache(page)) { 397 if (PageSwapCache(page)) {
386 swp_entry_t swap = { .val = page_private(page) }; 398 swp_entry_t swap = { .val = page_private(page) };
387 __delete_from_swap_cache(page); 399 __delete_from_swap_cache(page);
388 write_unlock_irq(&mapping->tree_lock); 400 write_unlock_irq(&mapping->tree_lock);
389 swap_free(swap); 401 swap_free(swap);
390 __put_page(page); /* The pagecache ref */ 402 __put_page(page); /* The pagecache ref */
391 return 1; 403 return 1;
392 } 404 }
393 405
394 __remove_from_page_cache(page); 406 __remove_from_page_cache(page);
395 write_unlock_irq(&mapping->tree_lock); 407 write_unlock_irq(&mapping->tree_lock);
396 __put_page(page); 408 __put_page(page);
397 return 1; 409 return 1;
398 410
399 cannot_free: 411 cannot_free:
400 write_unlock_irq(&mapping->tree_lock); 412 write_unlock_irq(&mapping->tree_lock);
401 return 0; 413 return 0;
402 } 414 }
403 415
404 /* 416 /*
405 * shrink_page_list() returns the number of reclaimed pages 417 * shrink_page_list() returns the number of reclaimed pages
406 */ 418 */
407 static unsigned long shrink_page_list(struct list_head *page_list, 419 static unsigned long shrink_page_list(struct list_head *page_list,
408 struct scan_control *sc) 420 struct scan_control *sc)
409 { 421 {
410 LIST_HEAD(ret_pages); 422 LIST_HEAD(ret_pages);
411 struct pagevec freed_pvec; 423 struct pagevec freed_pvec;
412 int pgactivate = 0; 424 int pgactivate = 0;
413 unsigned long nr_reclaimed = 0; 425 unsigned long nr_reclaimed = 0;
414 426
415 cond_resched(); 427 cond_resched();
416 428
417 pagevec_init(&freed_pvec, 1); 429 pagevec_init(&freed_pvec, 1);
418 while (!list_empty(page_list)) { 430 while (!list_empty(page_list)) {
419 struct address_space *mapping; 431 struct address_space *mapping;
420 struct page *page; 432 struct page *page;
421 int may_enter_fs; 433 int may_enter_fs;
422 int referenced; 434 int referenced;
423 435
424 cond_resched(); 436 cond_resched();
425 437
426 page = lru_to_page(page_list); 438 page = lru_to_page(page_list);
427 list_del(&page->lru); 439 list_del(&page->lru);
428 440
429 if (TestSetPageLocked(page)) 441 if (TestSetPageLocked(page))
430 goto keep; 442 goto keep;
431 443
432 BUG_ON(PageActive(page)); 444 BUG_ON(PageActive(page));
433 445
434 sc->nr_scanned++; 446 sc->nr_scanned++;
435 447
436 if (!sc->may_swap && page_mapped(page)) 448 if (!sc->may_swap && page_mapped(page))
437 goto keep_locked; 449 goto keep_locked;
438 450
439 /* Double the slab pressure for mapped and swapcache pages */ 451 /* Double the slab pressure for mapped and swapcache pages */
440 if (page_mapped(page) || PageSwapCache(page)) 452 if (page_mapped(page) || PageSwapCache(page))
441 sc->nr_scanned++; 453 sc->nr_scanned++;
442 454
443 if (PageWriteback(page)) 455 if (PageWriteback(page))
444 goto keep_locked; 456 goto keep_locked;
445 457
446 referenced = page_referenced(page, 1); 458 referenced = page_referenced(page, 1);
447 /* In active use or really unfreeable? Activate it. */ 459 /* In active use or really unfreeable? Activate it. */
448 if (referenced && page_mapping_inuse(page)) 460 if (referenced && page_mapping_inuse(page))
449 goto activate_locked; 461 goto activate_locked;
450 462
451 #ifdef CONFIG_SWAP 463 #ifdef CONFIG_SWAP
452 /* 464 /*
453 * Anonymous process memory has backing store? 465 * Anonymous process memory has backing store?
454 * Try to allocate it some swap space here. 466 * Try to allocate it some swap space here.
455 */ 467 */
456 if (PageAnon(page) && !PageSwapCache(page)) 468 if (PageAnon(page) && !PageSwapCache(page))
457 if (!add_to_swap(page, GFP_ATOMIC)) 469 if (!add_to_swap(page, GFP_ATOMIC))
458 goto activate_locked; 470 goto activate_locked;
459 #endif /* CONFIG_SWAP */ 471 #endif /* CONFIG_SWAP */
460 472
461 mapping = page_mapping(page); 473 mapping = page_mapping(page);
462 may_enter_fs = (sc->gfp_mask & __GFP_FS) || 474 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
463 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 475 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
464 476
465 /* 477 /*
466 * The page is mapped into the page tables of one or more 478 * The page is mapped into the page tables of one or more
467 * processes. Try to unmap it here. 479 * processes. Try to unmap it here.
468 */ 480 */
469 if (page_mapped(page) && mapping) { 481 if (page_mapped(page) && mapping) {
470 switch (try_to_unmap(page, 0)) { 482 switch (try_to_unmap(page, 0)) {
471 case SWAP_FAIL: 483 case SWAP_FAIL:
472 goto activate_locked; 484 goto activate_locked;
473 case SWAP_AGAIN: 485 case SWAP_AGAIN:
474 goto keep_locked; 486 goto keep_locked;
475 case SWAP_SUCCESS: 487 case SWAP_SUCCESS:
476 ; /* try to free the page below */ 488 ; /* try to free the page below */
477 } 489 }
478 } 490 }
479 491
480 if (PageDirty(page)) { 492 if (PageDirty(page)) {
481 if (referenced) 493 if (referenced)
482 goto keep_locked; 494 goto keep_locked;
483 if (!may_enter_fs) 495 if (!may_enter_fs)
484 goto keep_locked; 496 goto keep_locked;
485 if (!sc->may_writepage) 497 if (!sc->may_writepage)
486 goto keep_locked; 498 goto keep_locked;
487 499
488 /* Page is dirty, try to write it out here */ 500 /* Page is dirty, try to write it out here */
489 switch(pageout(page, mapping)) { 501 switch(pageout(page, mapping)) {
490 case PAGE_KEEP: 502 case PAGE_KEEP:
491 goto keep_locked; 503 goto keep_locked;
492 case PAGE_ACTIVATE: 504 case PAGE_ACTIVATE:
493 goto activate_locked; 505 goto activate_locked;
494 case PAGE_SUCCESS: 506 case PAGE_SUCCESS:
495 if (PageWriteback(page) || PageDirty(page)) 507 if (PageWriteback(page) || PageDirty(page))
496 goto keep; 508 goto keep;
497 /* 509 /*
498 * A synchronous write - probably a ramdisk. Go 510 * A synchronous write - probably a ramdisk. Go
499 * ahead and try to reclaim the page. 511 * ahead and try to reclaim the page.
500 */ 512 */
501 if (TestSetPageLocked(page)) 513 if (TestSetPageLocked(page))
502 goto keep; 514 goto keep;
503 if (PageDirty(page) || PageWriteback(page)) 515 if (PageDirty(page) || PageWriteback(page))
504 goto keep_locked; 516 goto keep_locked;
505 mapping = page_mapping(page); 517 mapping = page_mapping(page);
506 case PAGE_CLEAN: 518 case PAGE_CLEAN:
507 ; /* try to free the page below */ 519 ; /* try to free the page below */
508 } 520 }
509 } 521 }
510 522
511 /* 523 /*
512 * If the page has buffers, try to free the buffer mappings 524 * If the page has buffers, try to free the buffer mappings
513 * associated with this page. If we succeed we try to free 525 * associated with this page. If we succeed we try to free
514 * the page as well. 526 * the page as well.
515 * 527 *
516 * We do this even if the page is PageDirty(). 528 * We do this even if the page is PageDirty().
517 * try_to_release_page() does not perform I/O, but it is 529 * try_to_release_page() does not perform I/O, but it is
518 * possible for a page to have PageDirty set, but it is actually 530 * possible for a page to have PageDirty set, but it is actually
519 * clean (all its buffers are clean). This happens if the 531 * clean (all its buffers are clean). This happens if the
520 * buffers were written out directly, with submit_bh(). ext3 532 * buffers were written out directly, with submit_bh(). ext3
521 * will do this, as well as the blockdev mapping. 533 * will do this, as well as the blockdev mapping.
522 * try_to_release_page() will discover that cleanness and will 534 * try_to_release_page() will discover that cleanness and will
523 * drop the buffers and mark the page clean - it can be freed. 535 * drop the buffers and mark the page clean - it can be freed.
524 * 536 *
525 * Rarely, pages can have buffers and no ->mapping. These are 537 * Rarely, pages can have buffers and no ->mapping. These are
526 * the pages which were not successfully invalidated in 538 * the pages which were not successfully invalidated in
527 * truncate_complete_page(). We try to drop those buffers here 539 * truncate_complete_page(). We try to drop those buffers here
528 * and if that worked, and the page is no longer mapped into 540 * and if that worked, and the page is no longer mapped into
529 * process address space (page_count == 1) it can be freed. 541 * process address space (page_count == 1) it can be freed.
530 * Otherwise, leave the page on the LRU so it is swappable. 542 * Otherwise, leave the page on the LRU so it is swappable.
531 */ 543 */
532 if (PagePrivate(page)) { 544 if (PagePrivate(page)) {
533 if (!try_to_release_page(page, sc->gfp_mask)) 545 if (!try_to_release_page(page, sc->gfp_mask))
534 goto activate_locked; 546 goto activate_locked;
535 if (!mapping && page_count(page) == 1) 547 if (!mapping && page_count(page) == 1)
536 goto free_it; 548 goto free_it;
537 } 549 }
538 550
539 if (!remove_mapping(mapping, page)) 551 if (!remove_mapping(mapping, page))
540 goto keep_locked; 552 goto keep_locked;
541 553
542 free_it: 554 free_it:
543 unlock_page(page); 555 unlock_page(page);
544 nr_reclaimed++; 556 nr_reclaimed++;
545 if (!pagevec_add(&freed_pvec, page)) 557 if (!pagevec_add(&freed_pvec, page))
546 __pagevec_release_nonlru(&freed_pvec); 558 __pagevec_release_nonlru(&freed_pvec);
547 continue; 559 continue;
548 560
549 activate_locked: 561 activate_locked:
550 SetPageActive(page); 562 SetPageActive(page);
551 pgactivate++; 563 pgactivate++;
552 keep_locked: 564 keep_locked:
553 unlock_page(page); 565 unlock_page(page);
554 keep: 566 keep:
555 list_add(&page->lru, &ret_pages); 567 list_add(&page->lru, &ret_pages);
556 BUG_ON(PageLRU(page)); 568 BUG_ON(PageLRU(page));
557 } 569 }
558 list_splice(&ret_pages, page_list); 570 list_splice(&ret_pages, page_list);
559 if (pagevec_count(&freed_pvec)) 571 if (pagevec_count(&freed_pvec))
560 __pagevec_release_nonlru(&freed_pvec); 572 __pagevec_release_nonlru(&freed_pvec);
561 mod_page_state(pgactivate, pgactivate); 573 mod_page_state(pgactivate, pgactivate);
562 return nr_reclaimed; 574 return nr_reclaimed;
563 } 575 }
564 576
565 /* 577 /*
566 * zone->lru_lock is heavily contended. Some of the functions that 578 * zone->lru_lock is heavily contended. Some of the functions that
567 * shrink the lists perform better by taking out a batch of pages 579 * shrink the lists perform better by taking out a batch of pages
568 * and working on them outside the LRU lock. 580 * and working on them outside the LRU lock.
569 * 581 *
570 * For pagecache intensive workloads, this function is the hottest 582 * For pagecache intensive workloads, this function is the hottest
571 * spot in the kernel (apart from copy_*_user functions). 583 * spot in the kernel (apart from copy_*_user functions).
572 * 584 *
573 * Appropriate locks must be held before calling this function. 585 * Appropriate locks must be held before calling this function.
574 * 586 *
575 * @nr_to_scan: The number of pages to look through on the list. 587 * @nr_to_scan: The number of pages to look through on the list.
576 * @src: The LRU list to pull pages off. 588 * @src: The LRU list to pull pages off.
577 * @dst: The temp list to put pages on to. 589 * @dst: The temp list to put pages on to.
578 * @scanned: The number of pages that were scanned. 590 * @scanned: The number of pages that were scanned.
579 * 591 *
580 * returns how many pages were moved onto *@dst. 592 * returns how many pages were moved onto *@dst.
581 */ 593 */
582 static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 594 static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
583 struct list_head *src, struct list_head *dst, 595 struct list_head *src, struct list_head *dst,
584 unsigned long *scanned) 596 unsigned long *scanned)
585 { 597 {
586 unsigned long nr_taken = 0; 598 unsigned long nr_taken = 0;
587 struct page *page; 599 struct page *page;
588 unsigned long scan; 600 unsigned long scan;
589 601
590 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { 602 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
591 struct list_head *target; 603 struct list_head *target;
592 page = lru_to_page(src); 604 page = lru_to_page(src);
593 prefetchw_prev_lru_page(page, src, flags); 605 prefetchw_prev_lru_page(page, src, flags);
594 606
595 BUG_ON(!PageLRU(page)); 607 BUG_ON(!PageLRU(page));
596 608
597 list_del(&page->lru); 609 list_del(&page->lru);
598 target = src; 610 target = src;
599 if (likely(get_page_unless_zero(page))) { 611 if (likely(get_page_unless_zero(page))) {
600 /* 612 /*
601 * Be careful not to clear PageLRU until after we're 613 * Be careful not to clear PageLRU until after we're
602 * sure the page is not being freed elsewhere -- the 614 * sure the page is not being freed elsewhere -- the
603 * page release code relies on it. 615 * page release code relies on it.
604 */ 616 */
605 ClearPageLRU(page); 617 ClearPageLRU(page);
606 target = dst; 618 target = dst;
607 nr_taken++; 619 nr_taken++;
608 } /* else it is being freed elsewhere */ 620 } /* else it is being freed elsewhere */
609 621
610 list_add(&page->lru, target); 622 list_add(&page->lru, target);
611 } 623 }
612 624
613 *scanned = scan; 625 *scanned = scan;
614 return nr_taken; 626 return nr_taken;
615 } 627 }
616 628
617 /* 629 /*
618 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 630 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
619 * of reclaimed pages 631 * of reclaimed pages
620 */ 632 */
621 static unsigned long shrink_inactive_list(unsigned long max_scan, 633 static unsigned long shrink_inactive_list(unsigned long max_scan,
622 struct zone *zone, struct scan_control *sc) 634 struct zone *zone, struct scan_control *sc)
623 { 635 {
624 LIST_HEAD(page_list); 636 LIST_HEAD(page_list);
625 struct pagevec pvec; 637 struct pagevec pvec;
626 unsigned long nr_scanned = 0; 638 unsigned long nr_scanned = 0;
627 unsigned long nr_reclaimed = 0; 639 unsigned long nr_reclaimed = 0;
628 640
629 pagevec_init(&pvec, 1); 641 pagevec_init(&pvec, 1);
630 642
631 lru_add_drain(); 643 lru_add_drain();
632 spin_lock_irq(&zone->lru_lock); 644 spin_lock_irq(&zone->lru_lock);
633 do { 645 do {
634 struct page *page; 646 struct page *page;
635 unsigned long nr_taken; 647 unsigned long nr_taken;
636 unsigned long nr_scan; 648 unsigned long nr_scan;
637 unsigned long nr_freed; 649 unsigned long nr_freed;
638 650
639 nr_taken = isolate_lru_pages(sc->swap_cluster_max, 651 nr_taken = isolate_lru_pages(sc->swap_cluster_max,
640 &zone->inactive_list, 652 &zone->inactive_list,
641 &page_list, &nr_scan); 653 &page_list, &nr_scan);
642 zone->nr_inactive -= nr_taken; 654 zone->nr_inactive -= nr_taken;
643 zone->pages_scanned += nr_scan; 655 zone->pages_scanned += nr_scan;
644 spin_unlock_irq(&zone->lru_lock); 656 spin_unlock_irq(&zone->lru_lock);
645 657
646 nr_scanned += nr_scan; 658 nr_scanned += nr_scan;
647 nr_freed = shrink_page_list(&page_list, sc); 659 nr_freed = shrink_page_list(&page_list, sc);
648 nr_reclaimed += nr_freed; 660 nr_reclaimed += nr_freed;
649 local_irq_disable(); 661 local_irq_disable();
650 if (current_is_kswapd()) { 662 if (current_is_kswapd()) {
651 __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); 663 __mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
652 __mod_page_state(kswapd_steal, nr_freed); 664 __mod_page_state(kswapd_steal, nr_freed);
653 } else 665 } else
654 __mod_page_state_zone(zone, pgscan_direct, nr_scan); 666 __mod_page_state_zone(zone, pgscan_direct, nr_scan);
655 __mod_page_state_zone(zone, pgsteal, nr_freed); 667 __mod_page_state_zone(zone, pgsteal, nr_freed);
656 668
657 if (nr_taken == 0) 669 if (nr_taken == 0)
658 goto done; 670 goto done;
659 671
660 spin_lock(&zone->lru_lock); 672 spin_lock(&zone->lru_lock);
661 /* 673 /*
662 * Put back any unfreeable pages. 674 * Put back any unfreeable pages.
663 */ 675 */
664 while (!list_empty(&page_list)) { 676 while (!list_empty(&page_list)) {
665 page = lru_to_page(&page_list); 677 page = lru_to_page(&page_list);
666 BUG_ON(PageLRU(page)); 678 BUG_ON(PageLRU(page));
667 SetPageLRU(page); 679 SetPageLRU(page);
668 list_del(&page->lru); 680 list_del(&page->lru);
669 if (PageActive(page)) 681 if (PageActive(page))
670 add_page_to_active_list(zone, page); 682 add_page_to_active_list(zone, page);
671 else 683 else
672 add_page_to_inactive_list(zone, page); 684 add_page_to_inactive_list(zone, page);
673 if (!pagevec_add(&pvec, page)) { 685 if (!pagevec_add(&pvec, page)) {
674 spin_unlock_irq(&zone->lru_lock); 686 spin_unlock_irq(&zone->lru_lock);
675 __pagevec_release(&pvec); 687 __pagevec_release(&pvec);
676 spin_lock_irq(&zone->lru_lock); 688 spin_lock_irq(&zone->lru_lock);
677 } 689 }
678 } 690 }
679 } while (nr_scanned < max_scan); 691 } while (nr_scanned < max_scan);
680 spin_unlock(&zone->lru_lock); 692 spin_unlock(&zone->lru_lock);
681 done: 693 done:
682 local_irq_enable(); 694 local_irq_enable();
683 pagevec_release(&pvec); 695 pagevec_release(&pvec);
684 return nr_reclaimed; 696 return nr_reclaimed;
685 } 697 }
686 698
687 /* 699 /*
688 * This moves pages from the active list to the inactive list. 700 * This moves pages from the active list to the inactive list.
689 * 701 *
690 * We move them the other way if the page is referenced by one or more 702 * We move them the other way if the page is referenced by one or more
691 * processes, from rmap. 703 * processes, from rmap.
692 * 704 *
693 * If the pages are mostly unmapped, the processing is fast and it is 705 * If the pages are mostly unmapped, the processing is fast and it is
694 * appropriate to hold zone->lru_lock across the whole operation. But if 706 * appropriate to hold zone->lru_lock across the whole operation. But if
695 * the pages are mapped, the processing is slow (page_referenced()) so we 707 * the pages are mapped, the processing is slow (page_referenced()) so we
696 * should drop zone->lru_lock around each page. It's impossible to balance 708 * should drop zone->lru_lock around each page. It's impossible to balance
697 * this, so instead we remove the pages from the LRU while processing them. 709 * this, so instead we remove the pages from the LRU while processing them.
698 * It is safe to rely on PG_active against the non-LRU pages in here because 710 * It is safe to rely on PG_active against the non-LRU pages in here because
699 * nobody will play with that bit on a non-LRU page. 711 * nobody will play with that bit on a non-LRU page.
700 * 712 *
701 * The downside is that we have to touch page->_count against each page. 713 * The downside is that we have to touch page->_count against each page.
702 * But we had to alter page->flags anyway. 714 * But we had to alter page->flags anyway.
703 */ 715 */
704 static void shrink_active_list(unsigned long nr_pages, struct zone *zone, 716 static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
705 struct scan_control *sc) 717 struct scan_control *sc)
706 { 718 {
707 unsigned long pgmoved; 719 unsigned long pgmoved;
708 int pgdeactivate = 0; 720 int pgdeactivate = 0;
709 unsigned long pgscanned; 721 unsigned long pgscanned;
710 LIST_HEAD(l_hold); /* The pages which were snipped off */ 722 LIST_HEAD(l_hold); /* The pages which were snipped off */
711 LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ 723 LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */
712 LIST_HEAD(l_active); /* Pages to go onto the active_list */ 724 LIST_HEAD(l_active); /* Pages to go onto the active_list */
713 struct page *page; 725 struct page *page;
714 struct pagevec pvec; 726 struct pagevec pvec;
715 int reclaim_mapped = 0; 727 int reclaim_mapped = 0;
716 728
717 if (sc->may_swap) { 729 if (sc->may_swap) {
718 long mapped_ratio; 730 long mapped_ratio;
719 long distress; 731 long distress;
720 long swap_tendency; 732 long swap_tendency;
721 733
722 /* 734 /*
723 * `distress' is a measure of how much trouble we're having 735 * `distress' is a measure of how much trouble we're having
724 * reclaiming pages. 0 -> no problems. 100 -> great trouble. 736 * reclaiming pages. 0 -> no problems. 100 -> great trouble.
725 */ 737 */
726 distress = 100 >> zone->prev_priority; 738 distress = 100 >> zone->prev_priority;
727 739
728 /* 740 /*
729 * The point of this algorithm is to decide when to start 741 * The point of this algorithm is to decide when to start
730 * reclaiming mapped memory instead of just pagecache. Work out 742 * reclaiming mapped memory instead of just pagecache. Work out
731 * how much memory 743 * how much memory
732 * is mapped. 744 * is mapped.
733 */ 745 */
734 mapped_ratio = (sc->nr_mapped * 100) / total_memory; 746 mapped_ratio = (sc->nr_mapped * 100) / total_memory;
735 747
736 /* 748 /*
737 * Now decide how much we really want to unmap some pages. The 749 * Now decide how much we really want to unmap some pages. The
738 * mapped ratio is downgraded - just because there's a lot of 750 * mapped ratio is downgraded - just because there's a lot of
739 * mapped memory doesn't necessarily mean that page reclaim 751 * mapped memory doesn't necessarily mean that page reclaim
740 * isn't succeeding. 752 * isn't succeeding.
741 * 753 *
742 * The distress ratio is important - we don't want to start 754 * The distress ratio is important - we don't want to start
743 * going oom. 755 * going oom.
744 * 756 *
745 * A 100% value of vm_swappiness overrides this algorithm 757 * A 100% value of vm_swappiness overrides this algorithm
746 * altogether. 758 * altogether.
747 */ 759 */
748 swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; 760 swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
749 761
750 /* 762 /*
751 * Now use this metric to decide whether to start moving mapped 763 * Now use this metric to decide whether to start moving mapped
752 * memory onto the inactive list. 764 * memory onto the inactive list.
753 */ 765 */
754 if (swap_tendency >= 100) 766 if (swap_tendency >= 100)
755 reclaim_mapped = 1; 767 reclaim_mapped = 1;
756 } 768 }
757 769
758 lru_add_drain(); 770 lru_add_drain();
759 spin_lock_irq(&zone->lru_lock); 771 spin_lock_irq(&zone->lru_lock);
760 pgmoved = isolate_lru_pages(nr_pages, &zone->active_list, 772 pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
761 &l_hold, &pgscanned); 773 &l_hold, &pgscanned);
762 zone->pages_scanned += pgscanned; 774 zone->pages_scanned += pgscanned;
763 zone->nr_active -= pgmoved; 775 zone->nr_active -= pgmoved;
764 spin_unlock_irq(&zone->lru_lock); 776 spin_unlock_irq(&zone->lru_lock);
765 777
766 while (!list_empty(&l_hold)) { 778 while (!list_empty(&l_hold)) {
767 cond_resched(); 779 cond_resched();
768 page = lru_to_page(&l_hold); 780 page = lru_to_page(&l_hold);
769 list_del(&page->lru); 781 list_del(&page->lru);
770 if (page_mapped(page)) { 782 if (page_mapped(page)) {
771 if (!reclaim_mapped || 783 if (!reclaim_mapped ||
772 (total_swap_pages == 0 && PageAnon(page)) || 784 (total_swap_pages == 0 && PageAnon(page)) ||
773 page_referenced(page, 0)) { 785 page_referenced(page, 0)) {
774 list_add(&page->lru, &l_active); 786 list_add(&page->lru, &l_active);
775 continue; 787 continue;
776 } 788 }
777 } 789 }
778 list_add(&page->lru, &l_inactive); 790 list_add(&page->lru, &l_inactive);
779 } 791 }
780 792
781 pagevec_init(&pvec, 1); 793 pagevec_init(&pvec, 1);
782 pgmoved = 0; 794 pgmoved = 0;
783 spin_lock_irq(&zone->lru_lock); 795 spin_lock_irq(&zone->lru_lock);
784 while (!list_empty(&l_inactive)) { 796 while (!list_empty(&l_inactive)) {
785 page = lru_to_page(&l_inactive); 797 page = lru_to_page(&l_inactive);
786 prefetchw_prev_lru_page(page, &l_inactive, flags); 798 prefetchw_prev_lru_page(page, &l_inactive, flags);
787 BUG_ON(PageLRU(page)); 799 BUG_ON(PageLRU(page));
788 SetPageLRU(page); 800 SetPageLRU(page);
789 BUG_ON(!PageActive(page)); 801 BUG_ON(!PageActive(page));
790 ClearPageActive(page); 802 ClearPageActive(page);
791 803
792 list_move(&page->lru, &zone->inactive_list); 804 list_move(&page->lru, &zone->inactive_list);
793 pgmoved++; 805 pgmoved++;
794 if (!pagevec_add(&pvec, page)) { 806 if (!pagevec_add(&pvec, page)) {
795 zone->nr_inactive += pgmoved; 807 zone->nr_inactive += pgmoved;
796 spin_unlock_irq(&zone->lru_lock); 808 spin_unlock_irq(&zone->lru_lock);
797 pgdeactivate += pgmoved; 809 pgdeactivate += pgmoved;
798 pgmoved = 0; 810 pgmoved = 0;
799 if (buffer_heads_over_limit) 811 if (buffer_heads_over_limit)
800 pagevec_strip(&pvec); 812 pagevec_strip(&pvec);
801 __pagevec_release(&pvec); 813 __pagevec_release(&pvec);
802 spin_lock_irq(&zone->lru_lock); 814 spin_lock_irq(&zone->lru_lock);
803 } 815 }
804 } 816 }
805 zone->nr_inactive += pgmoved; 817 zone->nr_inactive += pgmoved;
806 pgdeactivate += pgmoved; 818 pgdeactivate += pgmoved;
807 if (buffer_heads_over_limit) { 819 if (buffer_heads_over_limit) {
808 spin_unlock_irq(&zone->lru_lock); 820 spin_unlock_irq(&zone->lru_lock);
809 pagevec_strip(&pvec); 821 pagevec_strip(&pvec);
810 spin_lock_irq(&zone->lru_lock); 822 spin_lock_irq(&zone->lru_lock);
811 } 823 }
812 824
813 pgmoved = 0; 825 pgmoved = 0;
814 while (!list_empty(&l_active)) { 826 while (!list_empty(&l_active)) {
815 page = lru_to_page(&l_active); 827 page = lru_to_page(&l_active);
816 prefetchw_prev_lru_page(page, &l_active, flags); 828 prefetchw_prev_lru_page(page, &l_active, flags);
817 BUG_ON(PageLRU(page)); 829 BUG_ON(PageLRU(page));
818 SetPageLRU(page); 830 SetPageLRU(page);
819 BUG_ON(!PageActive(page)); 831 BUG_ON(!PageActive(page));
820 list_move(&page->lru, &zone->active_list); 832 list_move(&page->lru, &zone->active_list);
821 pgmoved++; 833 pgmoved++;
822 if (!pagevec_add(&pvec, page)) { 834 if (!pagevec_add(&pvec, page)) {
823 zone->nr_active += pgmoved; 835 zone->nr_active += pgmoved;
824 pgmoved = 0; 836 pgmoved = 0;
825 spin_unlock_irq(&zone->lru_lock); 837 spin_unlock_irq(&zone->lru_lock);
826 __pagevec_release(&pvec); 838 __pagevec_release(&pvec);
827 spin_lock_irq(&zone->lru_lock); 839 spin_lock_irq(&zone->lru_lock);
828 } 840 }
829 } 841 }
830 zone->nr_active += pgmoved; 842 zone->nr_active += pgmoved;
831 spin_unlock(&zone->lru_lock); 843 spin_unlock(&zone->lru_lock);
832 844
833 __mod_page_state_zone(zone, pgrefill, pgscanned); 845 __mod_page_state_zone(zone, pgrefill, pgscanned);
834 __mod_page_state(pgdeactivate, pgdeactivate); 846 __mod_page_state(pgdeactivate, pgdeactivate);
835 local_irq_enable(); 847 local_irq_enable();
836 848
837 pagevec_release(&pvec); 849 pagevec_release(&pvec);
838 } 850 }
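
A back-of-the-envelope sketch of the swap_tendency heuristic in shrink_active_list() above, assuming hypothetical inputs: prev_priority of 6, a quarter of memory mapped, and the default swappiness of 60.

#include <stdio.h>

int main(void)
{
	long prev_priority = 6;				/* hypothetical zone history */
	long nr_mapped = 25000, total_memory = 100000;	/* i.e. 25% of memory mapped */
	long swappiness = 60;				/* the default vm_swappiness */

	long distress = 100 >> prev_priority;			/* 100 >> 6 == 1     */
	long mapped_ratio = (nr_mapped * 100) / total_memory;	/* == 25             */
	long swap_tendency = mapped_ratio / 2 + distress + swappiness;

	/* 12 + 1 + 60 == 73 < 100, so mapped pages stay on the active list. */
	printf("swap_tendency = %ld, reclaim_mapped = %d\n",
	       swap_tendency, swap_tendency >= 100 ? 1 : 0);
	return 0;
}
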
839 851
840 /* 852 /*
841 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 853 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
842 */ 854 */
843 static unsigned long shrink_zone(int priority, struct zone *zone, 855 static unsigned long shrink_zone(int priority, struct zone *zone,
844 struct scan_control *sc) 856 struct scan_control *sc)
845 { 857 {
846 unsigned long nr_active; 858 unsigned long nr_active;
847 unsigned long nr_inactive; 859 unsigned long nr_inactive;
848 unsigned long nr_to_scan; 860 unsigned long nr_to_scan;
849 unsigned long nr_reclaimed = 0; 861 unsigned long nr_reclaimed = 0;
850 862
851 atomic_inc(&zone->reclaim_in_progress); 863 atomic_inc(&zone->reclaim_in_progress);
852 864
853 /* 865 /*
854 * Add one to `nr_to_scan' just to make sure that the kernel will 866 * Add one to `nr_to_scan' just to make sure that the kernel will
855 * slowly sift through the active list. 867 * slowly sift through the active list.
856 */ 868 */
857 zone->nr_scan_active += (zone->nr_active >> priority) + 1; 869 zone->nr_scan_active += (zone->nr_active >> priority) + 1;
858 nr_active = zone->nr_scan_active; 870 nr_active = zone->nr_scan_active;
859 if (nr_active >= sc->swap_cluster_max) 871 if (nr_active >= sc->swap_cluster_max)
860 zone->nr_scan_active = 0; 872 zone->nr_scan_active = 0;
861 else 873 else
862 nr_active = 0; 874 nr_active = 0;
863 875
864 zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1; 876 zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1;
865 nr_inactive = zone->nr_scan_inactive; 877 nr_inactive = zone->nr_scan_inactive;
866 if (nr_inactive >= sc->swap_cluster_max) 878 if (nr_inactive >= sc->swap_cluster_max)
867 zone->nr_scan_inactive = 0; 879 zone->nr_scan_inactive = 0;
868 else 880 else
869 nr_inactive = 0; 881 nr_inactive = 0;
870 882
871 while (nr_active || nr_inactive) { 883 while (nr_active || nr_inactive) {
872 if (nr_active) { 884 if (nr_active) {
873 nr_to_scan = min(nr_active, 885 nr_to_scan = min(nr_active,
874 (unsigned long)sc->swap_cluster_max); 886 (unsigned long)sc->swap_cluster_max);
875 nr_active -= nr_to_scan; 887 nr_active -= nr_to_scan;
876 shrink_active_list(nr_to_scan, zone, sc); 888 shrink_active_list(nr_to_scan, zone, sc);
877 } 889 }
878 890
879 if (nr_inactive) { 891 if (nr_inactive) {
880 nr_to_scan = min(nr_inactive, 892 nr_to_scan = min(nr_inactive,
881 (unsigned long)sc->swap_cluster_max); 893 (unsigned long)sc->swap_cluster_max);
882 nr_inactive -= nr_to_scan; 894 nr_inactive -= nr_to_scan;
883 nr_reclaimed += shrink_inactive_list(nr_to_scan, zone, 895 nr_reclaimed += shrink_inactive_list(nr_to_scan, zone,
884 sc); 896 sc);
885 } 897 }
886 } 898 }
887 899
888 throttle_vm_writeout(); 900 throttle_vm_writeout();
889 901
890 atomic_dec(&zone->reclaim_in_progress); 902 atomic_dec(&zone->reclaim_in_progress);
891 return nr_reclaimed; 903 return nr_reclaimed;
892 } 904 }
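
The ">> priority" batching in shrink_zone() above only submits work once at least SWAP_CLUSTER_MAX pages have accumulated; a small sketch, assuming a hypothetical zone of 50000 inactive pages scanned repeatedly at DEF_PRIORITY (12), shows the deferral.

#include <stdio.h>

/* Hypothetical zone with 50000 inactive pages, scanned at DEF_PRIORITY (12). */
int main(void)
{
	unsigned long nr_inactive = 50000, nr_scan_inactive = 0;
	unsigned long swap_cluster_max = 32;	/* SWAP_CLUSTER_MAX */
	int priority = 12, pass;

	for (pass = 1; pass <= 4; pass++) {
		/* Each pass adds (nr_inactive >> priority) + 1 == 13 to the count. */
		nr_scan_inactive += (nr_inactive >> priority) + 1;
		if (nr_scan_inactive >= swap_cluster_max) {
			printf("pass %d: scan %lu pages\n", pass, nr_scan_inactive);
			nr_scan_inactive = 0;
		} else {
			printf("pass %d: defer (%lu accumulated)\n", pass,
			       nr_scan_inactive);
		}
	}
	return 0;
}
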
893 905
894 /* 906 /*
895 * This is the direct reclaim path, for page-allocating processes. We only 907 * This is the direct reclaim path, for page-allocating processes. We only
896 * try to reclaim pages from zones which will satisfy the caller's allocation 908 * try to reclaim pages from zones which will satisfy the caller's allocation
897 * request. 909 * request.
898 * 910 *
899 * We reclaim from a zone even if that zone is over pages_high. Because: 911 * We reclaim from a zone even if that zone is over pages_high. Because:
900 * a) The caller may be trying to free *extra* pages to satisfy a higher-order 912 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
901 * allocation or 913 * allocation or
902 * b) The zones may be over pages_high but they must go *over* pages_high to 914 * b) The zones may be over pages_high but they must go *over* pages_high to
903 * satisfy the `incremental min' zone defense algorithm. 915 * satisfy the `incremental min' zone defense algorithm.
904 * 916 *
905 * Returns the number of reclaimed pages. 917 * Returns the number of reclaimed pages.
906 * 918 *
907 * If a zone is deemed to be full of pinned pages then just give it a light 919 * If a zone is deemed to be full of pinned pages then just give it a light
908 * scan and then give up on it. 920 * scan and then give up on it.
909 */ 921 */
910 static unsigned long shrink_zones(int priority, struct zone **zones, 922 static unsigned long shrink_zones(int priority, struct zone **zones,
911 struct scan_control *sc) 923 struct scan_control *sc)
912 { 924 {
913 unsigned long nr_reclaimed = 0; 925 unsigned long nr_reclaimed = 0;
914 int i; 926 int i;
915 927
916 for (i = 0; zones[i] != NULL; i++) { 928 for (i = 0; zones[i] != NULL; i++) {
917 struct zone *zone = zones[i]; 929 struct zone *zone = zones[i];
918 930
919 if (!populated_zone(zone)) 931 if (!populated_zone(zone))
920 continue; 932 continue;
921 933
922 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 934 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
923 continue; 935 continue;
924 936
925 zone->temp_priority = priority; 937 zone->temp_priority = priority;
926 if (zone->prev_priority > priority) 938 if (zone->prev_priority > priority)
927 zone->prev_priority = priority; 939 zone->prev_priority = priority;
928 940
929 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 941 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
930 continue; /* Let kswapd poll it */ 942 continue; /* Let kswapd poll it */
931 943
932 nr_reclaimed += shrink_zone(priority, zone, sc); 944 nr_reclaimed += shrink_zone(priority, zone, sc);
933 } 945 }
934 return nr_reclaimed; 946 return nr_reclaimed;
935 } 947 }
936 948
937 /* 949 /*
938 * This is the main entry point to direct page reclaim. 950 * This is the main entry point to direct page reclaim.
939 * 951 *
940 * If a full scan of the inactive list fails to free enough memory then we 952 * If a full scan of the inactive list fails to free enough memory then we
941 * are "out of memory" and something needs to be killed. 953 * are "out of memory" and something needs to be killed.
942 * 954 *
943 * If the caller is !__GFP_FS then the probability of a failure is reasonably 955 * If the caller is !__GFP_FS then the probability of a failure is reasonably
944 * high - the zone may be full of dirty or under-writeback pages, which this 956 * high - the zone may be full of dirty or under-writeback pages, which this
945 * caller can't do much about. We kick pdflush and take explicit naps in the 957 * caller can't do much about. We kick pdflush and take explicit naps in the
946 * hope that some of these pages can be written. But if the allocating task 958 * hope that some of these pages can be written. But if the allocating task
947 * holds filesystem locks which prevent writeout this might not work, and the 959 * holds filesystem locks which prevent writeout this might not work, and the
948 * allocation attempt will fail. 960 * allocation attempt will fail.
949 */ 961 */
950 unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) 962 unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
951 { 963 {
952 int priority; 964 int priority;
953 int ret = 0; 965 int ret = 0;
954 unsigned long total_scanned = 0; 966 unsigned long total_scanned = 0;
955 unsigned long nr_reclaimed = 0; 967 unsigned long nr_reclaimed = 0;
956 struct reclaim_state *reclaim_state = current->reclaim_state; 968 struct reclaim_state *reclaim_state = current->reclaim_state;
957 unsigned long lru_pages = 0; 969 unsigned long lru_pages = 0;
958 int i; 970 int i;
959 struct scan_control sc = { 971 struct scan_control sc = {
960 .gfp_mask = gfp_mask, 972 .gfp_mask = gfp_mask,
961 .may_writepage = !laptop_mode, 973 .may_writepage = !laptop_mode,
962 .swap_cluster_max = SWAP_CLUSTER_MAX, 974 .swap_cluster_max = SWAP_CLUSTER_MAX,
963 .may_swap = 1, 975 .may_swap = 1,
964 .swappiness = vm_swappiness, 976 .swappiness = vm_swappiness,
965 }; 977 };
966 978
967 inc_page_state(allocstall); 979 inc_page_state(allocstall);
968 980
969 for (i = 0; zones[i] != NULL; i++) { 981 for (i = 0; zones[i] != NULL; i++) {
970 struct zone *zone = zones[i]; 982 struct zone *zone = zones[i];
971 983
972 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 984 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
973 continue; 985 continue;
974 986
975 zone->temp_priority = DEF_PRIORITY; 987 zone->temp_priority = DEF_PRIORITY;
976 lru_pages += zone->nr_active + zone->nr_inactive; 988 lru_pages += zone->nr_active + zone->nr_inactive;
977 } 989 }
978 990
979 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 991 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
980 sc.nr_mapped = read_page_state(nr_mapped); 992 sc.nr_mapped = read_page_state(nr_mapped);
981 sc.nr_scanned = 0; 993 sc.nr_scanned = 0;
982 if (!priority) 994 if (!priority)
983 disable_swap_token(); 995 disable_swap_token();
984 nr_reclaimed += shrink_zones(priority, zones, &sc); 996 nr_reclaimed += shrink_zones(priority, zones, &sc);
985 shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); 997 shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
986 if (reclaim_state) { 998 if (reclaim_state) {
987 nr_reclaimed += reclaim_state->reclaimed_slab; 999 nr_reclaimed += reclaim_state->reclaimed_slab;
988 reclaim_state->reclaimed_slab = 0; 1000 reclaim_state->reclaimed_slab = 0;
989 } 1001 }
990 total_scanned += sc.nr_scanned; 1002 total_scanned += sc.nr_scanned;
991 if (nr_reclaimed >= sc.swap_cluster_max) { 1003 if (nr_reclaimed >= sc.swap_cluster_max) {
992 ret = 1; 1004 ret = 1;
993 goto out; 1005 goto out;
994 } 1006 }
995 1007
996 /* 1008 /*
997 * Try to write back as many pages as we just scanned. This 1009 * Try to write back as many pages as we just scanned. This
998 * tends to cause slow streaming writers to write data to the 1010 * tends to cause slow streaming writers to write data to the
999 * disk smoothly, at the dirtying rate, which is nice. But 1011 * disk smoothly, at the dirtying rate, which is nice. But
1000 * that's undesirable in laptop mode, where we *want* lumpy 1012 * that's undesirable in laptop mode, where we *want* lumpy
1001 * writeout. So in laptop mode, write out the whole world. 1013 * writeout. So in laptop mode, write out the whole world.
1002 */ 1014 */
1003 if (total_scanned > sc.swap_cluster_max + 1015 if (total_scanned > sc.swap_cluster_max +
1004 sc.swap_cluster_max / 2) { 1016 sc.swap_cluster_max / 2) {
1005 wakeup_pdflush(laptop_mode ? 0 : total_scanned); 1017 wakeup_pdflush(laptop_mode ? 0 : total_scanned);
1006 sc.may_writepage = 1; 1018 sc.may_writepage = 1;
1007 } 1019 }
1008 1020
1009 /* Take a nap, wait for some writeback to complete */ 1021 /* Take a nap, wait for some writeback to complete */
1010 if (sc.nr_scanned && priority < DEF_PRIORITY - 2) 1022 if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
1011 blk_congestion_wait(WRITE, HZ/10); 1023 blk_congestion_wait(WRITE, HZ/10);
1012 } 1024 }
1013 out: 1025 out:
1014 for (i = 0; zones[i] != 0; i++) { 1026 for (i = 0; zones[i] != 0; i++) {
1015 struct zone *zone = zones[i]; 1027 struct zone *zone = zones[i];
1016 1028
1017 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 1029 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
1018 continue; 1030 continue;
1019 1031
1020 zone->prev_priority = zone->temp_priority; 1032 zone->prev_priority = zone->temp_priority;
1021 } 1033 }
1022 return ret; 1034 return ret;
1023 } 1035 }
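
The writeback kick in the priority loop above fires once roughly one and a half clusters' worth of pages have been scanned. A small sketch, assuming SWAP_CLUSTER_MAX is 32 and using fake_wakeup_pdflush() as a stand-in for the real wakeup_pdflush(), shows the laptop-mode special case where a request of 0 pages means "write everything".

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32

/* Stand-in for wakeup_pdflush(): just report what would be requested. */
static void fake_wakeup_pdflush(unsigned long nr_pages)
{
	if (nr_pages == 0)
		printf("laptop mode: write out the whole world\n");
	else
		printf("ask pdflush to write back ~%lu pages\n", nr_pages);
}

int main(void)
{
	unsigned long total_scanned = 50;	/* pages scanned so far this call */
	int laptop_mode = 0;			/* assume we are not on battery   */

	/* 50 > 32 + 16, so reclaim asks for background writeback help. */
	if (total_scanned > SWAP_CLUSTER_MAX + SWAP_CLUSTER_MAX / 2)
		fake_wakeup_pdflush(laptop_mode ? 0 : total_scanned);
	return 0;
}
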
1024 1036
1025 /* 1037 /*
1026 * For kswapd, balance_pgdat() will work across all this node's zones until 1038 * For kswapd, balance_pgdat() will work across all this node's zones until
1027 * they are all at pages_high. 1039 * they are all at pages_high.
1028 * 1040 *
1029 * Returns the number of pages which were actually freed. 1041 * Returns the number of pages which were actually freed.
1030 * 1042 *
1031 * There is special handling here for zones which are full of pinned pages. 1043 * There is special handling here for zones which are full of pinned pages.
1032 * This can happen if the pages are all mlocked, or if they are all used by 1044 * This can happen if the pages are all mlocked, or if they are all used by
1033 * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb. 1045 * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb.
1034 * What we do is to detect the case where all pages in the zone have been 1046 * What we do is to detect the case where all pages in the zone have been
1035 * scanned twice and there has been zero successful reclaim. Mark the zone as 1047 * scanned twice and there has been zero successful reclaim. Mark the zone as
1036 * dead and from now on, only perform a short scan. Basically we're polling 1048 * dead and from now on, only perform a short scan. Basically we're polling
1037 * the zone for when the problem goes away. 1049 * the zone for when the problem goes away.
1038 * 1050 *
1039 * kswapd scans the zones in the highmem->normal->dma direction. It skips 1051 * kswapd scans the zones in the highmem->normal->dma direction. It skips
1040 * zones which have free_pages > pages_high, but once a zone is found to have 1052 * zones which have free_pages > pages_high, but once a zone is found to have
1041 * free_pages <= pages_high, we scan that zone and the lower zones regardless 1053 * free_pages <= pages_high, we scan that zone and the lower zones regardless
1042 * of the number of free pages in the lower zones. This interoperates with 1054 * of the number of free pages in the lower zones. This interoperates with
1043 * the page allocator fallback scheme to ensure that aging of pages is balanced 1055 * the page allocator fallback scheme to ensure that aging of pages is balanced
1044 * across the zones. 1056 * across the zones.
1045 */ 1057 */
1046 static unsigned long balance_pgdat(pg_data_t *pgdat, int order) 1058 static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1047 { 1059 {
1048 int all_zones_ok; 1060 int all_zones_ok;
1049 int priority; 1061 int priority;
1050 int i; 1062 int i;
1051 unsigned long total_scanned; 1063 unsigned long total_scanned;
1052 unsigned long nr_reclaimed; 1064 unsigned long nr_reclaimed;
1053 struct reclaim_state *reclaim_state = current->reclaim_state; 1065 struct reclaim_state *reclaim_state = current->reclaim_state;
1054 struct scan_control sc = { 1066 struct scan_control sc = {
1055 .gfp_mask = GFP_KERNEL, 1067 .gfp_mask = GFP_KERNEL,
1056 .may_swap = 1, 1068 .may_swap = 1,
1057 .swap_cluster_max = SWAP_CLUSTER_MAX, 1069 .swap_cluster_max = SWAP_CLUSTER_MAX,
1058 .swappiness = vm_swappiness, 1070 .swappiness = vm_swappiness,
1059 }; 1071 };
1060 1072
1061 loop_again: 1073 loop_again:
1062 total_scanned = 0; 1074 total_scanned = 0;
1063 nr_reclaimed = 0; 1075 nr_reclaimed = 0;
1064 sc.may_writepage = !laptop_mode; 1076 sc.may_writepage = !laptop_mode;
1065 sc.nr_mapped = read_page_state(nr_mapped); 1077 sc.nr_mapped = read_page_state(nr_mapped);
1066 1078
1067 inc_page_state(pageoutrun); 1079 inc_page_state(pageoutrun);
1068 1080
1069 for (i = 0; i < pgdat->nr_zones; i++) { 1081 for (i = 0; i < pgdat->nr_zones; i++) {
1070 struct zone *zone = pgdat->node_zones + i; 1082 struct zone *zone = pgdat->node_zones + i;
1071 1083
1072 zone->temp_priority = DEF_PRIORITY; 1084 zone->temp_priority = DEF_PRIORITY;
1073 } 1085 }
1074 1086
1075 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 1087 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1076 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 1088 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
1077 unsigned long lru_pages = 0; 1089 unsigned long lru_pages = 0;
1078 1090
1079 /* The swap token gets in the way of swapout... */ 1091 /* The swap token gets in the way of swapout... */
1080 if (!priority) 1092 if (!priority)
1081 disable_swap_token(); 1093 disable_swap_token();
1082 1094
1083 all_zones_ok = 1; 1095 all_zones_ok = 1;
1084 1096
1085 /* 1097 /*
1086 * Scan in the highmem->dma direction for the highest 1098 * Scan in the highmem->dma direction for the highest
1087 * zone which needs scanning 1099 * zone which needs scanning
1088 */ 1100 */
1089 for (i = pgdat->nr_zones - 1; i >= 0; i--) { 1101 for (i = pgdat->nr_zones - 1; i >= 0; i--) {
1090 struct zone *zone = pgdat->node_zones + i; 1102 struct zone *zone = pgdat->node_zones + i;
1091 1103
1092 if (!populated_zone(zone)) 1104 if (!populated_zone(zone))
1093 continue; 1105 continue;
1094 1106
1095 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1107 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1096 continue; 1108 continue;
1097 1109
1098 if (!zone_watermark_ok(zone, order, zone->pages_high, 1110 if (!zone_watermark_ok(zone, order, zone->pages_high,
1099 0, 0)) { 1111 0, 0)) {
1100 end_zone = i; 1112 end_zone = i;
1101 goto scan; 1113 goto scan;
1102 } 1114 }
1103 } 1115 }
1104 goto out; 1116 goto out;
1105 scan: 1117 scan:
1106 for (i = 0; i <= end_zone; i++) { 1118 for (i = 0; i <= end_zone; i++) {
1107 struct zone *zone = pgdat->node_zones + i; 1119 struct zone *zone = pgdat->node_zones + i;
1108 1120
1109 lru_pages += zone->nr_active + zone->nr_inactive; 1121 lru_pages += zone->nr_active + zone->nr_inactive;
1110 } 1122 }
1111 1123
1112 /* 1124 /*
1113 * Now scan the zone in the dma->highmem direction, stopping 1125 * Now scan the zone in the dma->highmem direction, stopping
1114 * at the last zone which needs scanning. 1126 * at the last zone which needs scanning.
1115 * 1127 *
1116 * We do this because the page allocator works in the opposite 1128 * We do this because the page allocator works in the opposite
1117 * direction. This prevents the page allocator from allocating 1129 * direction. This prevents the page allocator from allocating
1118 * pages behind kswapd's direction of progress, which would 1130 * pages behind kswapd's direction of progress, which would
1119 * cause too much scanning of the lower zones. 1131 * cause too much scanning of the lower zones.
1120 */ 1132 */
1121 for (i = 0; i <= end_zone; i++) { 1133 for (i = 0; i <= end_zone; i++) {
1122 struct zone *zone = pgdat->node_zones + i; 1134 struct zone *zone = pgdat->node_zones + i;
1123 int nr_slab; 1135 int nr_slab;
1124 1136
1125 if (!populated_zone(zone)) 1137 if (!populated_zone(zone))
1126 continue; 1138 continue;
1127 1139
1128 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1140 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1129 continue; 1141 continue;
1130 1142
1131 if (!zone_watermark_ok(zone, order, zone->pages_high, 1143 if (!zone_watermark_ok(zone, order, zone->pages_high,
1132 end_zone, 0)) 1144 end_zone, 0))
1133 all_zones_ok = 0; 1145 all_zones_ok = 0;
1134 zone->temp_priority = priority; 1146 zone->temp_priority = priority;
1135 if (zone->prev_priority > priority) 1147 if (zone->prev_priority > priority)
1136 zone->prev_priority = priority; 1148 zone->prev_priority = priority;
1137 sc.nr_scanned = 0; 1149 sc.nr_scanned = 0;
1138 nr_reclaimed += shrink_zone(priority, zone, &sc); 1150 nr_reclaimed += shrink_zone(priority, zone, &sc);
1139 reclaim_state->reclaimed_slab = 0; 1151 reclaim_state->reclaimed_slab = 0;
1140 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 1152 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
1141 lru_pages); 1153 lru_pages);
1142 nr_reclaimed += reclaim_state->reclaimed_slab; 1154 nr_reclaimed += reclaim_state->reclaimed_slab;
1143 total_scanned += sc.nr_scanned; 1155 total_scanned += sc.nr_scanned;
1144 if (zone->all_unreclaimable) 1156 if (zone->all_unreclaimable)
1145 continue; 1157 continue;
1146 if (nr_slab == 0 && zone->pages_scanned >= 1158 if (nr_slab == 0 && zone->pages_scanned >=
1147 (zone->nr_active + zone->nr_inactive) * 4) 1159 (zone->nr_active + zone->nr_inactive) * 4)
1148 zone->all_unreclaimable = 1; 1160 zone->all_unreclaimable = 1;
1149 /* 1161 /*
1150 * If we've done a decent amount of scanning and 1162 * If we've done a decent amount of scanning and
1151 * the reclaim ratio is low, start doing writepage 1163 * the reclaim ratio is low, start doing writepage
1152 * even in laptop mode 1164 * even in laptop mode
1153 */ 1165 */
1154 if (total_scanned > SWAP_CLUSTER_MAX * 2 && 1166 if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
1155 total_scanned > nr_reclaimed + nr_reclaimed / 2) 1167 total_scanned > nr_reclaimed + nr_reclaimed / 2)
1156 sc.may_writepage = 1; 1168 sc.may_writepage = 1;
1157 } 1169 }
1158 if (all_zones_ok) 1170 if (all_zones_ok)
1159 break; /* kswapd: all done */ 1171 break; /* kswapd: all done */
1160 /* 1172 /*
1161 * OK, kswapd is getting into trouble. Take a nap, then take 1173 * OK, kswapd is getting into trouble. Take a nap, then take
1162 * another pass across the zones. 1174 * another pass across the zones.
1163 */ 1175 */
1164 if (total_scanned && priority < DEF_PRIORITY - 2) 1176 if (total_scanned && priority < DEF_PRIORITY - 2)
1165 blk_congestion_wait(WRITE, HZ/10); 1177 blk_congestion_wait(WRITE, HZ/10);
1166 1178
1167 /* 1179 /*
1168 * We do this so kswapd doesn't build up large priorities for 1180 * We do this so kswapd doesn't build up large priorities for
1169 * example when it is freeing in parallel with allocators. It 1181 * example when it is freeing in parallel with allocators. It
1170 * matches the direct reclaim path behaviour in terms of impact 1182 * matches the direct reclaim path behaviour in terms of impact
1171 * on zone->*_priority. 1183 * on zone->*_priority.
1172 */ 1184 */
1173 if (nr_reclaimed >= SWAP_CLUSTER_MAX) 1185 if (nr_reclaimed >= SWAP_CLUSTER_MAX)
1174 break; 1186 break;
1175 } 1187 }
1176 out: 1188 out:
1177 for (i = 0; i < pgdat->nr_zones; i++) { 1189 for (i = 0; i < pgdat->nr_zones; i++) {
1178 struct zone *zone = pgdat->node_zones + i; 1190 struct zone *zone = pgdat->node_zones + i;
1179 1191
1180 zone->prev_priority = zone->temp_priority; 1192 zone->prev_priority = zone->temp_priority;
1181 } 1193 }
1182 if (!all_zones_ok) { 1194 if (!all_zones_ok) {
1183 cond_resched(); 1195 cond_resched();
1184 goto loop_again; 1196 goto loop_again;
1185 } 1197 }
1186 1198
1187 return nr_reclaimed; 1199 return nr_reclaimed;
1188 } 1200 }
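
balance_pgdat() first walks the zones from highmem down toward DMA to find the
highest zone that is below its high watermark, then reclaims from DMA back up
to that zone, because the page allocator falls back in the opposite direction
and would otherwise keep allocating behind kswapd. A rough user-space model of
those two scan directions; struct toy_zone, the simplified
free_pages <= pages_high test and the sample numbers are invented for
illustration.

#include <stdio.h>

struct toy_zone {
        const char *name;
        unsigned long free_pages;
        unsigned long pages_high;
};

int main(void)
{
        struct toy_zone zones[] = {
                { "DMA",     800, 100 },
                { "Normal",  150, 400 },   /* below its high watermark */
                { "HighMem", 900, 200 },
        };
        int nr_zones = (int)(sizeof(zones) / sizeof(zones[0]));
        int i, end_zone = -1;

        /* Pass 1: highest zone (top down) that is short on free pages. */
        for (i = nr_zones - 1; i >= 0; i--) {
                if (zones[i].free_pages <= zones[i].pages_high) {
                        end_zone = i;
                        break;
                }
        }
        if (end_zone < 0) {
                printf("all zones ok, nothing to do\n");
                return 0;
        }

        /* Pass 2: reclaim bottom up so the allocator cannot race ahead. */
        for (i = 0; i <= end_zone; i++)
                printf("would shrink zone %s\n", zones[i].name);
        return 0;
}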
1189 1201
1190 /* 1202 /*
1191 * The background pageout daemon, started as a kernel thread 1203 * The background pageout daemon, started as a kernel thread
1192 * from the init process. 1204 * from the init process.
1193 * 1205 *
1194 * This basically trickles out pages so that we have _some_ 1206 * This basically trickles out pages so that we have _some_
1195 * free memory available even if there is no other activity 1207 * free memory available even if there is no other activity
1196 * that frees anything up. This is needed for things like routing 1208 * that frees anything up. This is needed for things like routing
1197 * etc, where we otherwise might have all activity going on in 1209 * etc, where we otherwise might have all activity going on in
1198 * asynchronous contexts that cannot page things out. 1210 * asynchronous contexts that cannot page things out.
1199 * 1211 *
1200 * If there are applications that are active memory-allocators 1212 * If there are applications that are active memory-allocators
1201 * (most normal use), this basically shouldn't matter. 1213 * (most normal use), this basically shouldn't matter.
1202 */ 1214 */
1203 static int kswapd(void *p) 1215 static int kswapd(void *p)
1204 { 1216 {
1205 unsigned long order; 1217 unsigned long order;
1206 pg_data_t *pgdat = (pg_data_t*)p; 1218 pg_data_t *pgdat = (pg_data_t*)p;
1207 struct task_struct *tsk = current; 1219 struct task_struct *tsk = current;
1208 DEFINE_WAIT(wait); 1220 DEFINE_WAIT(wait);
1209 struct reclaim_state reclaim_state = { 1221 struct reclaim_state reclaim_state = {
1210 .reclaimed_slab = 0, 1222 .reclaimed_slab = 0,
1211 }; 1223 };
1212 cpumask_t cpumask; 1224 cpumask_t cpumask;
1213 1225
1214 daemonize("kswapd%d", pgdat->node_id); 1226 daemonize("kswapd%d", pgdat->node_id);
1215 cpumask = node_to_cpumask(pgdat->node_id); 1227 cpumask = node_to_cpumask(pgdat->node_id);
1216 if (!cpus_empty(cpumask)) 1228 if (!cpus_empty(cpumask))
1217 set_cpus_allowed(tsk, cpumask); 1229 set_cpus_allowed(tsk, cpumask);
1218 current->reclaim_state = &reclaim_state; 1230 current->reclaim_state = &reclaim_state;
1219 1231
1220 /* 1232 /*
1221 * Tell the memory management that we're a "memory allocator", 1233 * Tell the memory management that we're a "memory allocator",
1222 * and that if we need more memory we should get access to it 1234 * and that if we need more memory we should get access to it
1223 * regardless (see "__alloc_pages()"). "kswapd" should 1235 * regardless (see "__alloc_pages()"). "kswapd" should
1224 * never get caught in the normal page freeing logic. 1236 * never get caught in the normal page freeing logic.
1225 * 1237 *
1226 * (Kswapd normally doesn't need memory anyway, but sometimes 1238 * (Kswapd normally doesn't need memory anyway, but sometimes
1227 * you need a small amount of memory in order to be able to 1239 * you need a small amount of memory in order to be able to
1228 * page out something else, and this flag essentially protects 1240 * page out something else, and this flag essentially protects
1229 * us from recursively trying to free more memory as we're 1241 * us from recursively trying to free more memory as we're
1230 * trying to free the first piece of memory in the first place). 1242 * trying to free the first piece of memory in the first place).
1231 */ 1243 */
1232 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; 1244 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
1233 1245
1234 order = 0; 1246 order = 0;
1235 for ( ; ; ) { 1247 for ( ; ; ) {
1236 unsigned long new_order; 1248 unsigned long new_order;
1237 1249
1238 try_to_freeze(); 1250 try_to_freeze();
1239 1251
1240 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 1252 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
1241 new_order = pgdat->kswapd_max_order; 1253 new_order = pgdat->kswapd_max_order;
1242 pgdat->kswapd_max_order = 0; 1254 pgdat->kswapd_max_order = 0;
1243 if (order < new_order) { 1255 if (order < new_order) {
1244 /* 1256 /*
1245 * Don't sleep if someone wants a larger 'order' 1257 * Don't sleep if someone wants a larger 'order'
1246 * allocation 1258 * allocation
1247 */ 1259 */
1248 order = new_order; 1260 order = new_order;
1249 } else { 1261 } else {
1250 schedule(); 1262 schedule();
1251 order = pgdat->kswapd_max_order; 1263 order = pgdat->kswapd_max_order;
1252 } 1264 }
1253 finish_wait(&pgdat->kswapd_wait, &wait); 1265 finish_wait(&pgdat->kswapd_wait, &wait);
1254 1266
1255 balance_pgdat(pgdat, order); 1267 balance_pgdat(pgdat, order);
1256 } 1268 }
1257 return 0; 1269 return 0;
1258 } 1270 }
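
The wait loop in kswapd() only goes back to sleep when nobody has asked for a
larger allocation order since the last balancing run; otherwise it keeps going
at the bigger order immediately. A toy model of just that decision, driven by
a made-up sequence of requested orders.

#include <stdio.h>

int main(void)
{
        unsigned long order = 0, max_order_requests[] = { 0, 3, 1, 0 };
        int i;

        for (i = 0; i < 4; i++) {
                unsigned long new_order = max_order_requests[i];

                if (order < new_order) {
                        /* Larger request arrived while working: skip the sleep. */
                        order = new_order;
                        printf("request %d: balance at order %lu without sleeping\n",
                               i, order);
                } else {
                        /* Nothing bigger pending: sleep, then pick up the request. */
                        order = new_order;
                        printf("request %d: sleep, then balance at order %lu\n",
                               i, order);
                }
        }
        return 0;
}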
1259 1271
1260 /* 1272 /*
1261 * A zone is low on free memory, so wake its kswapd task to service it. 1273 * A zone is low on free memory, so wake its kswapd task to service it.
1262 */ 1274 */
1263 void wakeup_kswapd(struct zone *zone, int order) 1275 void wakeup_kswapd(struct zone *zone, int order)
1264 { 1276 {
1265 pg_data_t *pgdat; 1277 pg_data_t *pgdat;
1266 1278
1267 if (!populated_zone(zone)) 1279 if (!populated_zone(zone))
1268 return; 1280 return;
1269 1281
1270 pgdat = zone->zone_pgdat; 1282 pgdat = zone->zone_pgdat;
1271 if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) 1283 if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
1272 return; 1284 return;
1273 if (pgdat->kswapd_max_order < order) 1285 if (pgdat->kswapd_max_order < order)
1274 pgdat->kswapd_max_order = order; 1286 pgdat->kswapd_max_order = order;
1275 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 1287 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
1276 return; 1288 return;
1277 if (!waitqueue_active(&pgdat->kswapd_wait)) 1289 if (!waitqueue_active(&pgdat->kswapd_wait))
1278 return; 1290 return;
1279 wake_up_interruptible(&pgdat->kswapd_wait); 1291 wake_up_interruptible(&pgdat->kswapd_wait);
1280 } 1292 }
1281 1293
1282 #ifdef CONFIG_PM 1294 #ifdef CONFIG_PM
1283 /* 1295 /*
1284 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages 1296 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
1285 * from LRU lists system-wide, for given pass and priority, and returns the 1297 * from LRU lists system-wide, for given pass and priority, and returns the
1286 * number of reclaimed pages 1298 * number of reclaimed pages
1287 * 1299 *
1288 * For pass > 3 we also try to shrink the LRU lists that contain a few pages 1300 * For pass > 3 we also try to shrink the LRU lists that contain a few pages
1289 */ 1301 */
1290 static unsigned long shrink_all_zones(unsigned long nr_pages, int pass, 1302 static unsigned long shrink_all_zones(unsigned long nr_pages, int pass,
1291 int prio, struct scan_control *sc) 1303 int prio, struct scan_control *sc)
1292 { 1304 {
1293 struct zone *zone; 1305 struct zone *zone;
1294 unsigned long nr_to_scan, ret = 0; 1306 unsigned long nr_to_scan, ret = 0;
1295 1307
1296 for_each_zone(zone) { 1308 for_each_zone(zone) {
1297 1309
1298 if (!populated_zone(zone)) 1310 if (!populated_zone(zone))
1299 continue; 1311 continue;
1300 1312
1301 if (zone->all_unreclaimable && prio != DEF_PRIORITY) 1313 if (zone->all_unreclaimable && prio != DEF_PRIORITY)
1302 continue; 1314 continue;
1303 1315
1304 /* For pass = 0 we don't shrink the active list */ 1316 /* For pass = 0 we don't shrink the active list */
1305 if (pass > 0) { 1317 if (pass > 0) {
1306 zone->nr_scan_active += (zone->nr_active >> prio) + 1; 1318 zone->nr_scan_active += (zone->nr_active >> prio) + 1;
1307 if (zone->nr_scan_active >= nr_pages || pass > 3) { 1319 if (zone->nr_scan_active >= nr_pages || pass > 3) {
1308 zone->nr_scan_active = 0; 1320 zone->nr_scan_active = 0;
1309 nr_to_scan = min(nr_pages, zone->nr_active); 1321 nr_to_scan = min(nr_pages, zone->nr_active);
1310 shrink_active_list(nr_to_scan, zone, sc); 1322 shrink_active_list(nr_to_scan, zone, sc);
1311 } 1323 }
1312 } 1324 }
1313 1325
1314 zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1; 1326 zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1;
1315 if (zone->nr_scan_inactive >= nr_pages || pass > 3) { 1327 if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
1316 zone->nr_scan_inactive = 0; 1328 zone->nr_scan_inactive = 0;
1317 nr_to_scan = min(nr_pages, zone->nr_inactive); 1329 nr_to_scan = min(nr_pages, zone->nr_inactive);
1318 ret += shrink_inactive_list(nr_to_scan, zone, sc); 1330 ret += shrink_inactive_list(nr_to_scan, zone, sc);
1319 if (ret >= nr_pages) 1331 if (ret >= nr_pages)
1320 return ret; 1332 return ret;
1321 } 1333 }
1322 } 1334 }
1323 1335
1324 return ret; 1336 return ret;
1325 } 1337 }
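
shrink_all_zones() batches its work: each call adds (list size >> prio) + 1 to
a per-zone scan counter and only scans once that counter reaches nr_pages, or
unconditionally for pass > 3. The sketch below, with invented list sizes,
shows how the counter builds up as prio drops from DEF_PRIORITY toward zero;
the + 1 guarantees forward progress even when the shifted term rounds to
nothing.

#include <stdio.h>

int main(void)
{
        unsigned long nr_inactive = 10000, nr_pages = 1000;
        unsigned long nr_scan = 0;
        int prio, calls = 0;

        for (prio = 12; prio >= 0; prio--) {    /* DEF_PRIORITY down to 0 */
                nr_scan += (nr_inactive >> prio) + 1;
                calls++;
                if (nr_scan >= nr_pages) {
                        printf("scan triggered at prio %d after %d calls "
                               "(accumulated %lu)\n", prio, calls, nr_scan);
                        nr_scan = 0;
                }
        }
        return 0;
}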
1326 1338
1327 /* 1339 /*
1328 * Try to free `nr_pages' of memory, system-wide, and return the number of 1340 * Try to free `nr_pages' of memory, system-wide, and return the number of
1329 * freed pages. 1341 * freed pages.
1330 * 1342 *
1331 * Rather than trying to age LRUs the aim is to preserve the overall 1343 * Rather than trying to age LRUs the aim is to preserve the overall
1332 * LRU order by reclaiming preferentially 1344 * LRU order by reclaiming preferentially
1333 * inactive > active > active referenced > active mapped 1345 * inactive > active > active referenced > active mapped
1334 */ 1346 */
1335 unsigned long shrink_all_memory(unsigned long nr_pages) 1347 unsigned long shrink_all_memory(unsigned long nr_pages)
1336 { 1348 {
1337 unsigned long lru_pages, nr_slab; 1349 unsigned long lru_pages, nr_slab;
1338 unsigned long ret = 0; 1350 unsigned long ret = 0;
1339 int pass; 1351 int pass;
1340 struct reclaim_state reclaim_state; 1352 struct reclaim_state reclaim_state;
1341 struct zone *zone; 1353 struct zone *zone;
1342 struct scan_control sc = { 1354 struct scan_control sc = {
1343 .gfp_mask = GFP_KERNEL, 1355 .gfp_mask = GFP_KERNEL,
1344 .may_swap = 0, 1356 .may_swap = 0,
1345 .swap_cluster_max = nr_pages, 1357 .swap_cluster_max = nr_pages,
1346 .may_writepage = 1, 1358 .may_writepage = 1,
1347 .swappiness = vm_swappiness, 1359 .swappiness = vm_swappiness,
1348 }; 1360 };
1349 1361
1350 current->reclaim_state = &reclaim_state; 1362 current->reclaim_state = &reclaim_state;
1351 1363
1352 lru_pages = 0; 1364 lru_pages = 0;
1353 for_each_zone(zone) 1365 for_each_zone(zone)
1354 lru_pages += zone->nr_active + zone->nr_inactive; 1366 lru_pages += zone->nr_active + zone->nr_inactive;
1355 1367
1356 nr_slab = read_page_state(nr_slab); 1368 nr_slab = read_page_state(nr_slab);
1357 /* If slab caches are huge, it's better to hit them first */ 1369 /* If slab caches are huge, it's better to hit them first */
1358 while (nr_slab >= lru_pages) { 1370 while (nr_slab >= lru_pages) {
1359 reclaim_state.reclaimed_slab = 0; 1371 reclaim_state.reclaimed_slab = 0;
1360 shrink_slab(nr_pages, sc.gfp_mask, lru_pages); 1372 shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
1361 if (!reclaim_state.reclaimed_slab) 1373 if (!reclaim_state.reclaimed_slab)
1362 break; 1374 break;
1363 1375
1364 ret += reclaim_state.reclaimed_slab; 1376 ret += reclaim_state.reclaimed_slab;
1365 if (ret >= nr_pages) 1377 if (ret >= nr_pages)
1366 goto out; 1378 goto out;
1367 1379
1368 nr_slab -= reclaim_state.reclaimed_slab; 1380 nr_slab -= reclaim_state.reclaimed_slab;
1369 } 1381 }
1370 1382
1371 /* 1383 /*
1372 * We try to shrink LRUs in 5 passes: 1384 * We try to shrink LRUs in 5 passes:
1373 * 0 = Reclaim from inactive_list only 1385 * 0 = Reclaim from inactive_list only
1374 * 1 = Reclaim from active list but don't reclaim mapped 1386 * 1 = Reclaim from active list but don't reclaim mapped
1375 * 2 = 2nd pass of type 1 1387 * 2 = 2nd pass of type 1
1376 * 3 = Reclaim mapped (normal reclaim) 1388 * 3 = Reclaim mapped (normal reclaim)
1377 * 4 = 2nd pass of type 3 1389 * 4 = 2nd pass of type 3
1378 */ 1390 */
1379 for (pass = 0; pass < 5; pass++) { 1391 for (pass = 0; pass < 5; pass++) {
1380 int prio; 1392 int prio;
1381 1393
1382 /* Needed for shrinking slab caches later on */ 1394 /* Needed for shrinking slab caches later on */
1383 if (!lru_pages) 1395 if (!lru_pages)
1384 for_each_zone(zone) { 1396 for_each_zone(zone) {
1385 lru_pages += zone->nr_active; 1397 lru_pages += zone->nr_active;
1386 lru_pages += zone->nr_inactive; 1398 lru_pages += zone->nr_inactive;
1387 } 1399 }
1388 1400
1389 /* Force reclaiming mapped pages in the passes #3 and #4 */ 1401 /* Force reclaiming mapped pages in the passes #3 and #4 */
1390 if (pass > 2) { 1402 if (pass > 2) {
1391 sc.may_swap = 1; 1403 sc.may_swap = 1;
1392 sc.swappiness = 100; 1404 sc.swappiness = 100;
1393 } 1405 }
1394 1406
1395 for (prio = DEF_PRIORITY; prio >= 0; prio--) { 1407 for (prio = DEF_PRIORITY; prio >= 0; prio--) {
1396 unsigned long nr_to_scan = nr_pages - ret; 1408 unsigned long nr_to_scan = nr_pages - ret;
1397 1409
1398 sc.nr_mapped = read_page_state(nr_mapped); 1410 sc.nr_mapped = read_page_state(nr_mapped);
1399 sc.nr_scanned = 0; 1411 sc.nr_scanned = 0;
1400 1412
1401 ret += shrink_all_zones(nr_to_scan, pass, prio, &sc); 1413 ret += shrink_all_zones(nr_to_scan, pass, prio, &sc);

1402 if (ret >= nr_pages) 1414 if (ret >= nr_pages)
1403 goto out; 1415 goto out;
1404 1416
1405 reclaim_state.reclaimed_slab = 0; 1417 reclaim_state.reclaimed_slab = 0;
1406 shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages); 1418 shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages);
1407 ret += reclaim_state.reclaimed_slab; 1419 ret += reclaim_state.reclaimed_slab;
1408 if (ret >= nr_pages) 1420 if (ret >= nr_pages)
1409 goto out; 1421 goto out;
1410 1422
1411 if (sc.nr_scanned && prio < DEF_PRIORITY - 2) 1423 if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
1412 blk_congestion_wait(WRITE, HZ / 10); 1424 blk_congestion_wait(WRITE, HZ / 10);
1413 } 1425 }
1414 1426
1415 lru_pages = 0; 1427 lru_pages = 0;
1416 } 1428 }
1417 1429
1418 /* 1430 /*
1419 * If ret = 0, we could not shrink LRUs, but there may be something 1431 * If ret = 0, we could not shrink LRUs, but there may be something
1420 * in slab caches 1432 * in slab caches
1421 */ 1433 */
1422 if (!ret) 1434 if (!ret)
1423 do { 1435 do {
1424 reclaim_state.reclaimed_slab = 0; 1436 reclaim_state.reclaimed_slab = 0;
1425 shrink_slab(nr_pages, sc.gfp_mask, lru_pages); 1437 shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
1426 ret += reclaim_state.reclaimed_slab; 1438 ret += reclaim_state.reclaimed_slab;
1427 } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); 1439 } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
1428 1440
1429 out: 1441 out:
1430 current->reclaim_state = NULL; 1442 current->reclaim_state = NULL;
1431 1443
1432 return ret; 1444 return ret;
1433 } 1445 }
1434 #endif 1446 #endif
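
shrink_all_memory() escalates over its five passes: pass 0 touches only the
inactive lists, passes 1 and 2 add the active lists but leave mapped pages
alone, and passes 3 and 4 set may_swap and swappiness = 100 so mapped pages
are reclaimed too, with pass 4 additionally ignoring the scan counters. A
compact model that just prints the per-pass switches; the variable names are
invented.

#include <stdio.h>

int main(void)
{
        int pass;

        for (pass = 0; pass < 5; pass++) {
                int shrink_active = pass > 0;    /* touch the active lists      */
                int reclaim_mapped = pass > 2;   /* may_swap, swappiness = 100  */
                int force_full_scan = pass > 3;  /* ignore the scan counters    */

                printf("pass %d: active=%d mapped=%d force=%d\n",
                       pass, shrink_active, reclaim_mapped, force_full_scan);
        }
        return 0;
}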
1435 1447
1436 #ifdef CONFIG_HOTPLUG_CPU 1448 #ifdef CONFIG_HOTPLUG_CPU
1437 /* It's optimal to keep kswapds on the same CPUs as their memory, but 1449 /* It's optimal to keep kswapds on the same CPUs as their memory, but
1438 not required for correctness. So if the last cpu in a node goes 1450 not required for correctness. So if the last cpu in a node goes
1439 away, we get changed to run anywhere: as the first one comes back, 1451 away, we get changed to run anywhere: as the first one comes back,
1440 restore their cpu bindings. */ 1452 restore their cpu bindings. */
1441 static int cpu_callback(struct notifier_block *nfb, 1453 static int cpu_callback(struct notifier_block *nfb,
1442 unsigned long action, void *hcpu) 1454 unsigned long action, void *hcpu)
1443 { 1455 {
1444 pg_data_t *pgdat; 1456 pg_data_t *pgdat;
1445 cpumask_t mask; 1457 cpumask_t mask;
1446 1458
1447 if (action == CPU_ONLINE) { 1459 if (action == CPU_ONLINE) {
1448 for_each_online_pgdat(pgdat) { 1460 for_each_online_pgdat(pgdat) {
1449 mask = node_to_cpumask(pgdat->node_id); 1461 mask = node_to_cpumask(pgdat->node_id);
1450 if (any_online_cpu(mask) != NR_CPUS) 1462 if (any_online_cpu(mask) != NR_CPUS)
1451 /* One of our CPUs online: restore mask */ 1463 /* One of our CPUs online: restore mask */
1452 set_cpus_allowed(pgdat->kswapd, mask); 1464 set_cpus_allowed(pgdat->kswapd, mask);
1453 } 1465 }
1454 } 1466 }
1455 return NOTIFY_OK; 1467 return NOTIFY_OK;
1456 } 1468 }
1457 #endif /* CONFIG_HOTPLUG_CPU */ 1469 #endif /* CONFIG_HOTPLUG_CPU */
1458 1470
1459 static int __init kswapd_init(void) 1471 static int __init kswapd_init(void)
1460 { 1472 {
1461 pg_data_t *pgdat; 1473 pg_data_t *pgdat;
1462 1474
1463 swap_setup(); 1475 swap_setup();
1464 for_each_online_pgdat(pgdat) { 1476 for_each_online_pgdat(pgdat) {
1465 pid_t pid; 1477 pid_t pid;
1466 1478
1467 pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL); 1479 pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
1468 BUG_ON(pid < 0); 1480 BUG_ON(pid < 0);
1469 read_lock(&tasklist_lock); 1481 read_lock(&tasklist_lock);
1470 pgdat->kswapd = find_task_by_pid(pid); 1482 pgdat->kswapd = find_task_by_pid(pid);
1471 read_unlock(&tasklist_lock); 1483 read_unlock(&tasklist_lock);
1472 } 1484 }
1473 total_memory = nr_free_pagecache_pages(); 1485 total_memory = nr_free_pagecache_pages();
1474 hotcpu_notifier(cpu_callback, 0); 1486 hotcpu_notifier(cpu_callback, 0);
1475 return 0; 1487 return 0;
1476 } 1488 }
1477 1489
1478 module_init(kswapd_init) 1490 module_init(kswapd_init)
1479 1491
1480 #ifdef CONFIG_NUMA 1492 #ifdef CONFIG_NUMA
1481 /* 1493 /*
1482 * Zone reclaim mode 1494 * Zone reclaim mode
1483 * 1495 *
1484 * If non-zero call zone_reclaim when the number of free pages falls below 1496 * If non-zero call zone_reclaim when the number of free pages falls below
1485 * the watermarks. 1497 * the watermarks.
1486 * 1498 *
1487 * In the future we may add flags to the mode. However, the page allocator 1499 * In the future we may add flags to the mode. However, the page allocator
1488 * should only have to check that zone_reclaim_mode != 0 before calling 1500 * should only have to check that zone_reclaim_mode != 0 before calling
1489 * zone_reclaim(). 1501 * zone_reclaim().
1490 */ 1502 */
1491 int zone_reclaim_mode __read_mostly; 1503 int zone_reclaim_mode __read_mostly;
1492 1504
1493 #define RECLAIM_OFF 0 1505 #define RECLAIM_OFF 0
1494 #define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ 1506 #define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */
1495 #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ 1507 #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
1496 #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ 1508 #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
1497 #define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */ 1509 #define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */
1498 1510
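
These mode bits map almost directly onto the scan_control that __zone_reclaim()
builds below: RECLAIM_WRITE enables may_writepage, RECLAIM_SWAP enables
may_swap, and RECLAIM_SLAB adds a shrink_slab() call when the zone scan alone
did not free enough pages. A self-contained sketch of that decoding;
parse_mode() and the sample modes are illustrative only.

#include <stdio.h>

#define RECLAIM_ZONE  (1 << 0)
#define RECLAIM_WRITE (1 << 1)
#define RECLAIM_SWAP  (1 << 2)
#define RECLAIM_SLAB  (1 << 3)

static void parse_mode(unsigned int mode)
{
        printf("mode %#x: may_writepage=%d may_swap=%d shrink_slab=%d\n",
               mode, !!(mode & RECLAIM_WRITE), !!(mode & RECLAIM_SWAP),
               !!(mode & RECLAIM_SLAB));
}

int main(void)
{
        parse_mode(RECLAIM_ZONE);
        parse_mode(RECLAIM_ZONE | RECLAIM_WRITE | RECLAIM_SWAP);
        return 0;
}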
1499 /* 1511 /*
1500 * Minimum time between zone reclaim scans 1512 * Minimum time between zone reclaim scans
1501 */ 1513 */
1502 int zone_reclaim_interval __read_mostly = 30*HZ; 1514 int zone_reclaim_interval __read_mostly = 30*HZ;
1503 1515
1504 /* 1516 /*
1505 * Priority for ZONE_RECLAIM. This determines the fraction of pages 1517 * Priority for ZONE_RECLAIM. This determines the fraction of pages
1506 * of a node considered for each zone_reclaim. 4 scans 1/16th of 1518 * of a node considered for each zone_reclaim. 4 scans 1/16th of
1507 * a zone. 1519 * a zone.
1508 */ 1520 */
1509 #define ZONE_RECLAIM_PRIORITY 4 1521 #define ZONE_RECLAIM_PRIORITY 4
1510 1522
1511 /* 1523 /*
1512 * Try to free up some pages from this zone through reclaim. 1524 * Try to free up some pages from this zone through reclaim.
1513 */ 1525 */
1514 static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 1526 static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1515 { 1527 {
1516 /* Minimum pages needed in order to stay on node */ 1528 /* Minimum pages needed in order to stay on node */
1517 const unsigned long nr_pages = 1 << order; 1529 const unsigned long nr_pages = 1 << order;
1518 struct task_struct *p = current; 1530 struct task_struct *p = current;
1519 struct reclaim_state reclaim_state; 1531 struct reclaim_state reclaim_state;
1520 int priority; 1532 int priority;
1521 unsigned long nr_reclaimed = 0; 1533 unsigned long nr_reclaimed = 0;
1522 struct scan_control sc = { 1534 struct scan_control sc = {
1523 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 1535 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
1524 .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), 1536 .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
1525 .nr_mapped = read_page_state(nr_mapped), 1537 .nr_mapped = read_page_state(nr_mapped),
1526 .swap_cluster_max = max_t(unsigned long, nr_pages, 1538 .swap_cluster_max = max_t(unsigned long, nr_pages,
1527 SWAP_CLUSTER_MAX), 1539 SWAP_CLUSTER_MAX),
1528 .gfp_mask = gfp_mask, 1540 .gfp_mask = gfp_mask,
1529 .swappiness = vm_swappiness, 1541 .swappiness = vm_swappiness,
1530 }; 1542 };
1531 1543
1532 disable_swap_token(); 1544 disable_swap_token();
1533 cond_resched(); 1545 cond_resched();
1534 /* 1546 /*
1535 * We need to be able to allocate from the reserves for RECLAIM_SWAP 1547 * We need to be able to allocate from the reserves for RECLAIM_SWAP
1536 * and we also need to be able to write out pages for RECLAIM_WRITE 1548 * and we also need to be able to write out pages for RECLAIM_WRITE
1537 * and RECLAIM_SWAP. 1549 * and RECLAIM_SWAP.
1538 */ 1550 */
1539 p->flags |= PF_MEMALLOC | PF_SWAPWRITE; 1551 p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
1540 reclaim_state.reclaimed_slab = 0; 1552 reclaim_state.reclaimed_slab = 0;
1541 p->reclaim_state = &reclaim_state; 1553 p->reclaim_state = &reclaim_state;
1542 1554
1543 /* 1555 /*
1544 * Free memory by calling shrink zone with increasing priorities 1556 * Free memory by calling shrink zone with increasing priorities
1545 * until we have enough memory freed. 1557 * until we have enough memory freed.
1546 */ 1558 */
1547 priority = ZONE_RECLAIM_PRIORITY; 1559 priority = ZONE_RECLAIM_PRIORITY;
1548 do { 1560 do {
1549 nr_reclaimed += shrink_zone(priority, zone, &sc); 1561 nr_reclaimed += shrink_zone(priority, zone, &sc);
1550 priority--; 1562 priority--;
1551 } while (priority >= 0 && nr_reclaimed < nr_pages); 1563 } while (priority >= 0 && nr_reclaimed < nr_pages);
1552 1564
1553 if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { 1565 if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
1554 /* 1566 /*
1555 * shrink_slab() does not currently allow us to determine how 1567 * shrink_slab() does not currently allow us to determine how
1556 * many pages were freed in this zone. So we just shake the slab 1568 * many pages were freed in this zone. So we just shake the slab
1557 * a bit and then go off node for this particular allocation 1569 * a bit and then go off node for this particular allocation
1558 * despite possibly having freed enough memory to allocate in 1570 * despite possibly having freed enough memory to allocate in
1559 * this zone. If we freed local memory then the next 1571 * this zone. If we freed local memory then the next
1560 * allocations will be local again. 1572 * allocations will be local again.
1561 * 1573 *
1562 * shrink_slab will free memory on all zones and may take 1574 * shrink_slab will free memory on all zones and may take
1563 * a long time. 1575 * a long time.
1564 */ 1576 */
1565 shrink_slab(sc.nr_scanned, gfp_mask, order); 1577 shrink_slab(sc.nr_scanned, gfp_mask, order);
1566 } 1578 }
1567 1579
1568 p->reclaim_state = NULL; 1580 p->reclaim_state = NULL;
1569 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); 1581 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
1570 1582
1571 if (nr_reclaimed == 0) { 1583 if (nr_reclaimed == 0) {
1572 /* 1584 /*
1573 * We were unable to reclaim enough pages to stay on node. We 1585 * We were unable to reclaim enough pages to stay on node. We
1574 * now allow off node accesses for a certain time period before 1586 * now allow off node accesses for a certain time period before
1575 * trying again to reclaim pages from the local zone. 1587 * trying again to reclaim pages from the local zone.
1576 */ 1588 */
1577 zone->last_unsuccessful_zone_reclaim = jiffies; 1589 zone->last_unsuccessful_zone_reclaim = jiffies;
1578 } 1590 }
1579 1591
1580 return nr_reclaimed >= nr_pages; 1592 return nr_reclaimed >= nr_pages;
1581 } 1593 }
1582 1594
1583 int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 1595 int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1584 { 1596 {
1585 cpumask_t mask; 1597 cpumask_t mask;
1586 int node_id; 1598 int node_id;
1587 1599
1588 /* 1600 /*
1589 * Do not reclaim if there was a recent unsuccessful attempt at zone 1601 * Do not reclaim if there was a recent unsuccessful attempt at zone
1590 * reclaim. In that case we let allocations go off node for the 1602 * reclaim. In that case we let allocations go off node for the
1591 * zone_reclaim_interval. Otherwise we would scan for each off-node 1603 * zone_reclaim_interval. Otherwise we would scan for each off-node
1592 * page allocation. 1604 * page allocation.
1593 */ 1605 */
1594 if (time_before(jiffies, 1606 if (time_before(jiffies,
1595 zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) 1607 zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
1596 return 0; 1608 return 0;
1597 1609
1598 /* 1610 /*
1599 * Avoid concurrent zone reclaims, do not reclaim in a zone that does 1611 * Avoid concurrent zone reclaims, do not reclaim in a zone that does
1600 * not have reclaimable pages and if we should not delay the allocation 1612 * not have reclaimable pages and if we should not delay the allocation
1601 * then do not scan. 1613 * then do not scan.
1602 */ 1614 */
1603 if (!(gfp_mask & __GFP_WAIT) || 1615 if (!(gfp_mask & __GFP_WAIT) ||
1604 zone->all_unreclaimable || 1616 zone->all_unreclaimable ||
1605 atomic_read(&zone->reclaim_in_progress) > 0 || 1617 atomic_read(&zone->reclaim_in_progress) > 0 ||
1606 (current->flags & PF_MEMALLOC)) 1618 (current->flags & PF_MEMALLOC))
1607 return 0; 1619 return 0;
1608 1620
1609 /* 1621 /*
1610 * Only run zone reclaim on the local zone or on zones that do not 1622 * Only run zone reclaim on the local zone or on zones that do not
1611 * have associated processors. This will favor the local processor 1623 * have associated processors. This will favor the local processor
1612 * over remote processors and spread off node memory allocations 1624 * over remote processors and spread off node memory allocations
1613 * as wide as possible. 1625 * as wide as possible.
1614 */ 1626 */
1615 node_id = zone->zone_pgdat->node_id; 1627 node_id = zone->zone_pgdat->node_id;
1616 mask = node_to_cpumask(node_id); 1628 mask = node_to_cpumask(node_id);
1617 if (!cpus_empty(mask) && node_id != numa_node_id()) 1629 if (!cpus_empty(mask) && node_id != numa_node_id())
1618 return 0; 1630 return 0;
1619 return __zone_reclaim(zone, gfp_mask, order); 1631 return __zone_reclaim(zone, gfp_mask, order);
1620 } 1632 }
1621 #endif 1633 #endif
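
A reclaim attempt that frees nothing stamps last_unsuccessful_zone_reclaim, and
zone_reclaim() then refuses to scan again for zone_reclaim_interval (30*HZ),
letting allocations go off node instead of rescanning on every miss. A small
sketch of that back-off, assuming only a jiffies-like tick counter; TOY_HZ and
the helper names are invented, and toy_time_before() mirrors the
signed-difference comparison that time_before() relies on.

#include <stdio.h>

#define TOY_HZ 1000UL
static unsigned long reclaim_interval = 30 * TOY_HZ;    /* 30 "seconds" */
static unsigned long last_failed_reclaim;

static int toy_time_before(unsigned long a, unsigned long b)
{
        return (long)(a - b) < 0;       /* safe across counter wraparound */
}

static int may_try_zone_reclaim(unsigned long now)
{
        return !toy_time_before(now, last_failed_reclaim + reclaim_interval);
}

int main(void)
{
        last_failed_reclaim = 5000;
        printf("at 10000: %d\n", may_try_zone_reclaim(10000));  /* still backing off */
        printf("at 40000: %d\n", may_try_zone_reclaim(40000));  /* interval elapsed  */
        return 0;
}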
1622 1634