Commit 04e62a29bf157ce1edd168f2b71b533c80d13628

Authored by Christoph Lameter
Committed by Linus Torvalds
1 parent 442c9137de

[PATCH] More page migration: use migration entries for file pages

This implements the use of migration entries to preserve the ptes of file-backed
pages during migration.  Processes can therefore be migrated back and forth
without losing their connection to pagecache pages.

Note that we implement the migration entries only for linear mappings.
Nonlinear mappings still require the unmapping of the ptes for migration.
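
For reference, the migration entry helpers this patch relies on (make_migration_entry(), is_migration_entry(), is_write_migration_entry()) are thin wrappers around swap entries.  A simplified sketch of their definitions in include/linux/swapops.h of this era (only built with CONFIG_MIGRATION; the BUG_ON checks of the real header are omitted):

	static inline swp_entry_t make_migration_entry(struct page *page, int write)
	{
		/* Record the pfn and whether the original pte was writable */
		return swp_entry(write ? SWP_MIGRATION_WRITE : SWP_MIGRATION_READ,
				page_to_pfn(page));
	}

	static inline int is_migration_entry(swp_entry_t entry)
	{
		return swp_type(entry) == SWP_MIGRATION_READ ||
		       swp_type(entry) == SWP_MIGRATION_WRITE;
	}

	static inline int is_write_migration_entry(swp_entry_t entry)
	{
		return swp_type(entry) == SWP_MIGRATION_WRITE;
	}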

And another writepage() ugliness shows up.  writepage() can drop the page
lock.  Therefore we have to remove migration ptes before calling writepage()
in order to avoid having migration entries point to unlocked pages.
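
To illustrate the locking contract that the new writeout() helper below depends on: a ->writepage() implementation normally unlocks the page itself, and only an AOP_WRITEPAGE_ACTIVATE return leaves it locked for the caller.  A hypothetical skeleton (example_can_write() and example_submit_io() are made-up helpers, not real kernel functions):

	static int example_writepage(struct page *page, struct writeback_control *wbc)
	{
		if (!example_can_write(page))
			/* Refuse the write: the page stays locked */
			return AOP_WRITEPAGE_ACTIVATE;

		set_page_writeback(page);
		unlock_page(page);		/* the page lock is dropped here */
		example_submit_io(page);	/* hypothetical I/O submission */
		return 0;
	}

This is why writeout() removes the migration ptes first and then relocks the page afterwards unless AOP_WRITEPAGE_ACTIVATE was returned.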

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 4 changed files with 124 additions and 43 deletions

include/linux/swap.h
... ... @@ -186,20 +186,6 @@
186 186 extern int vm_swappiness;
187 187 extern int remove_mapping(struct address_space *mapping, struct page *page);
188 188  
189   -/* possible outcome of pageout() */
190   -typedef enum {
191   - /* failed to write page out, page is locked */
192   - PAGE_KEEP,
193   - /* move page to the active list, page is locked */
194   - PAGE_ACTIVATE,
195   - /* page has been sent to the disk successfully, page is unlocked */
196   - PAGE_SUCCESS,
197   - /* page is clean and locked */
198   - PAGE_CLEAN,
199   -} pageout_t;
200   -
201   -extern pageout_t pageout(struct page *page, struct address_space *mapping);
202   -
203 189 #ifdef CONFIG_NUMA
204 190 extern int zone_reclaim_mode;
205 191 extern int zone_reclaim_interval;
... ... @@ -259,7 +245,6 @@
259 245 struct backing_dev_info;
260 246  
261 247 extern spinlock_t swap_lock;
262   -extern int remove_vma_swap(struct vm_area_struct *vma, struct page *page);
263 248  
264 249 /* linux/mm/thrash.c */
265 250 extern struct mm_struct * swap_token_mm;

mm/migrate.c
... ... @@ -24,6 +24,7 @@
24 24 #include <linux/topology.h>
25 25 #include <linux/cpu.h>
26 26 #include <linux/cpuset.h>
  27 +#include <linux/writeback.h>
27 28  
28 29 #include "internal.h"
29 30  
... ... @@ -123,7 +124,7 @@
123 124 /*
124 125 * Restore a potential migration pte to a working pte entry
125 126 */
126   -static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr,
  127 +static void remove_migration_pte(struct vm_area_struct *vma,
127 128 struct page *old, struct page *new)
128 129 {
129 130 struct mm_struct *mm = vma->vm_mm;
... ... @@ -133,7 +134,11 @@
133 134 pmd_t *pmd;
134 135 pte_t *ptep, pte;
135 136 spinlock_t *ptl;
  137 + unsigned long addr = page_address_in_vma(new, vma);
136 138  
  139 + if (addr == -EFAULT)
  140 + return;
  141 +
137 142 pgd = pgd_offset(mm, addr);
138 143 if (!pgd_present(*pgd))
139 144 return;
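
The address is now derived inside remove_migration_pte() via page_address_in_vma(), which returns -EFAULT when the page is not mapped in the given vma, hence the early return above.  A simplified sketch of that helper (the real version in mm/rmap.c also checks that the vma actually belongs to the page's anon_vma or address_space):

	unsigned long page_address_in_vma(struct page *page,
					struct vm_area_struct *vma)
	{
		pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
		unsigned long address;

		/* Linear mapping: the offset in the file determines the address */
		address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
		if (address < vma->vm_start || address >= vma->vm_end)
			return -EFAULT;
		return address;
	}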
... ... @@ -169,19 +174,47 @@
169 174 if (is_write_migration_entry(entry))
170 175 pte = pte_mkwrite(pte);
171 176 set_pte_at(mm, addr, ptep, pte);
172   - page_add_anon_rmap(new, vma, addr);
  177 +
  178 + if (PageAnon(new))
  179 + page_add_anon_rmap(new, vma, addr);
  180 + else
  181 + page_add_file_rmap(new);
  182 +
  183 + /* No need to invalidate - it was non-present before */
  184 + update_mmu_cache(vma, addr, pte);
  185 + lazy_mmu_prot_update(pte);
  186 +
173 187 out:
174 188 pte_unmap_unlock(ptep, ptl);
175 189 }
176 190  
177 191 /*
178   - * Get rid of all migration entries and replace them by
179   - * references to the indicated page.
180   - *
  192 + * Note that remove_file_migration_ptes will only work on regular mappings,
  193 + * Nonlinear mappings do not use migration entries.
  194 + */
  195 +static void remove_file_migration_ptes(struct page *old, struct page *new)
  196 +{
  197 + struct vm_area_struct *vma;
  198 + struct address_space *mapping = page_mapping(new);
  199 + struct prio_tree_iter iter;
  200 + pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
  201 +
  202 + if (!mapping)
  203 + return;
  204 +
  205 + spin_lock(&mapping->i_mmap_lock);
  206 +
  207 + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
  208 + remove_migration_pte(vma, old, new);
  209 +
  210 + spin_unlock(&mapping->i_mmap_lock);
  211 +}
  212 +
  213 +/*
181 214 * Must hold mmap_sem lock on at least one of the vmas containing
182 215 * the page so that the anon_vma cannot vanish.
183 216 */
184   -static void remove_migration_ptes(struct page *old, struct page *new)
  217 +static void remove_anon_migration_ptes(struct page *old, struct page *new)
185 218 {
186 219 struct anon_vma *anon_vma;
187 220 struct vm_area_struct *vma;
... ... @@ -199,13 +232,24 @@
199 232 spin_lock(&anon_vma->lock);
200 233  
201 234 list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
202   - remove_migration_pte(vma, page_address_in_vma(new, vma),
203   - old, new);
  235 + remove_migration_pte(vma, old, new);
204 236  
205 237 spin_unlock(&anon_vma->lock);
206 238 }
207 239  
208 240 /*
  241 + * Get rid of all migration entries and replace them by
  242 + * references to the indicated page.
  243 + */
  244 +static void remove_migration_ptes(struct page *old, struct page *new)
  245 +{
  246 + if (PageAnon(new))
  247 + remove_anon_migration_ptes(old, new);
  248 + else
  249 + remove_file_migration_ptes(old, new);
  250 +}
  251 +
  252 +/*
209 253 * Something used the pte of a page under migration. We need to
210 254 * get to the page and wait until migration is finished.
211 255 * When we return from this function the fault will be retried.
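
For context, the consumer of these entries on the fault side is do_swap_page() in mm/memory.c, which recognizes a migration entry before attempting swapin and waits in migration_entry_wait(); roughly, as a sketch of that fragment:

	swp_entry_t entry = pte_to_swp_entry(orig_pte);

	if (is_migration_entry(entry)) {
		/* The pte refers to a page under migration: wait, then retry the fault */
		migration_entry_wait(mm, pmd, address);
		goto out;
	}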
... ... @@ -424,30 +468,59 @@
424 468 }
425 469 EXPORT_SYMBOL(buffer_migrate_page);
426 470  
427   -static int fallback_migrate_page(struct address_space *mapping,
428   - struct page *newpage, struct page *page)
  471 +/*
  472 + * Writeback a page to clean the dirty state
  473 + */
  474 +static int writeout(struct address_space *mapping, struct page *page)
429 475 {
  476 + struct writeback_control wbc = {
  477 + .sync_mode = WB_SYNC_NONE,
  478 + .nr_to_write = 1,
  479 + .range_start = 0,
  480 + .range_end = LLONG_MAX,
  481 + .nonblocking = 1,
  482 + .for_reclaim = 1
  483 + };
  484 + int rc;
  485 +
  486 + if (!mapping->a_ops->writepage)
  487 + /* No write method for the address space */
  488 + return -EINVAL;
  489 +
  490 + if (!clear_page_dirty_for_io(page))
  491 + /* Someone else already triggered a write */
  492 + return -EAGAIN;
  493 +
430 494 /*
431   - * Default handling if a filesystem does not provide
432   - * a migration function. We can only migrate clean
433   - * pages so try to write out any dirty pages first.
  495 + * A dirty page may imply that the underlying filesystem has
  496 + * the page on some queue. So the page must be clean for
  497 + * migration. Writeout may mean we loose the lock and the
  498 + * page state is no longer what we checked for earlier.
  499 + * At this point we know that the migration attempt cannot
  500 + * be successful.
434 501 */
435   - if (PageDirty(page)) {
436   - switch (pageout(page, mapping)) {
437   - case PAGE_KEEP:
438   - case PAGE_ACTIVATE:
439   - return -EAGAIN;
  502 + remove_migration_ptes(page, page);
440 503  
441   - case PAGE_SUCCESS:
442   - /* Relock since we lost the lock */
443   - lock_page(page);
444   - /* Must retry since page state may have changed */
445   - return -EAGAIN;
  504 + rc = mapping->a_ops->writepage(page, &wbc);
  505 + if (rc < 0)
  506 + /* I/O Error writing */
  507 + return -EIO;
446 508  
447   - case PAGE_CLEAN:
448   - ; /* try to migrate the page below */
449   - }
450   - }
  509 + if (rc != AOP_WRITEPAGE_ACTIVATE)
  510 + /* unlocked. Relock */
  511 + lock_page(page);
  512 +
  513 + return -EAGAIN;
  514 +}
  515 +
  516 +/*
  517 + * Default handling if a filesystem does not provide a migration function.
  518 + */
  519 +static int fallback_migrate_page(struct address_space *mapping,
  520 + struct page *newpage, struct page *page)
  521 +{
  522 + if (PageDirty(page))
  523 + return writeout(mapping, page);
451 524  
452 525 /*
453 526 * Buffers may be managed in a filesystem specific way.
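
After the filesystem-specific buffer handling that the truncated comment above refers to, fallback_migrate_page() ends up in the generic migrate_page() path.  For reference, that default method looks roughly like this at this point in the series (move the pagecache slot, then copy page contents and flags):

	int migrate_page(struct address_space *mapping,
			struct page *newpage, struct page *page)
	{
		int rc;

		BUG_ON(PageWriteback(page));	/* Writeback must be complete */

		rc = migrate_page_move_mapping(mapping, newpage, page);
		if (rc)
			return rc;

		migrate_page_copy(newpage, page);
		return 0;
	}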

mm/rmap.c
... ... @@ -596,6 +596,7 @@
596 596 spin_unlock(&mmlist_lock);
597 597 }
598 598 dec_mm_counter(mm, anon_rss);
  599 +#ifdef CONFIG_MIGRATION
599 600 } else {
600 601 /*
601 602 * Store the pfn of the page in a special migration
... ... @@ -604,11 +605,21 @@
604 605 */
605 606 BUG_ON(!migration);
606 607 entry = make_migration_entry(page, pte_write(pteval));
  608 +#endif
607 609 }
608 610 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
609 611 BUG_ON(pte_file(*pte));
610 612 } else
  613 +#ifdef CONFIG_MIGRATION
  614 + if (migration) {
  615 + /* Establish migration entry for a file page */
  616 + swp_entry_t entry;
  617 + entry = make_migration_entry(page, pte_write(pteval));
  618 + set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
  619 + } else
  620 +#endif
611 621 dec_mm_counter(mm, file_rss);
  622 +
612 623  
613 624 page_remove_rmap(page);
614 625 page_cache_release(page);
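
The #ifdef CONFIG_MIGRATION guards keep the migration-entry paths out of builds without page migration.  For such builds, include/linux/swapops.h provides no-op fallbacks along these lines (a sketch of their intent, not the verbatim header):

	#ifndef CONFIG_MIGRATION
	static inline int is_migration_entry(swp_entry_t entry)
	{
		return 0;
	}
	static inline int is_write_migration_entry(swp_entry_t entry)
	{
		return 0;
	}
	static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
						unsigned long address)
	{
	}
	#endif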

mm/vmscan.c
... ... @@ -290,11 +290,23 @@
290 290 unlock_page(page);
291 291 }
292 292  
  293 +/* possible outcome of pageout() */
  294 +typedef enum {
  295 + /* failed to write page out, page is locked */
  296 + PAGE_KEEP,
  297 + /* move page to the active list, page is locked */
  298 + PAGE_ACTIVATE,
  299 + /* page has been sent to the disk successfully, page is unlocked */
  300 + PAGE_SUCCESS,
  301 + /* page is clean and locked */
  302 + PAGE_CLEAN,
  303 +} pageout_t;
  304 +
293 305 /*
294 306 * pageout is called by shrink_page_list() for each dirty page.
295 307 * Calls ->writepage().
296 308 */
297   -pageout_t pageout(struct page *page, struct address_space *mapping)
  309 +static pageout_t pageout(struct page *page, struct address_space *mapping)
298 310 {
299 311 /*
300 312 * If the page is dirty, only perform writeback if that write