Commit 04e62a29bf157ce1edd168f2b71b533c80d13628

Authored by Christoph Lameter
Committed by Linus Torvalds
1 parent 442c9137de

[PATCH] More page migration: use migration entries for file pages

This implements the use of migration entries to preserve the ptes of file-backed
pages during migration.  Processes can therefore be migrated back and forth
without losing their connection to pagecache pages.

Note that we implement the migration entries only for linear mappings.
Nonlinear mappings still require the unmapping of the ptes for migration.
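
For reference, the migration entry helpers this patch relies on (make_migration_entry(), is_migration_entry(), is_write_migration_entry()) are thin wrappers around swap entries.  A simplified sketch of their definitions in include/linux/swapops.h of this era (only built with CONFIG_MIGRATION; the BUG_ON checks of the real header are omitted):

	static inline swp_entry_t make_migration_entry(struct page *page, int write)
	{
		/* Record the pfn and whether the original pte was writable */
		return swp_entry(write ? SWP_MIGRATION_WRITE : SWP_MIGRATION_READ,
				page_to_pfn(page));
	}

	static inline int is_migration_entry(swp_entry_t entry)
	{
		return swp_type(entry) == SWP_MIGRATION_READ ||
		       swp_type(entry) == SWP_MIGRATION_WRITE;
	}

	static inline int is_write_migration_entry(swp_entry_t entry)
	{
		return swp_type(entry) == SWP_MIGRATION_WRITE;
	}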

And another writepage() ugliness shows up.  writepage() can drop the page
lock.  Therefore we have to remove migration ptes before calling writepage()
in order to avoid having migration entries point to unlocked pages.
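
To illustrate the locking contract that the new writeout() helper below depends on: a ->writepage() implementation normally unlocks the page itself, and only an AOP_WRITEPAGE_ACTIVATE return leaves it locked for the caller.  A hypothetical skeleton (example_can_write() and example_submit_io() are made-up helpers, not real kernel functions):

	static int example_writepage(struct page *page, struct writeback_control *wbc)
	{
		if (!example_can_write(page))
			/* Refuse the write: the page stays locked */
			return AOP_WRITEPAGE_ACTIVATE;

		set_page_writeback(page);
		unlock_page(page);		/* the page lock is dropped here */
		example_submit_io(page);	/* hypothetical I/O submission */
		return 0;
	}

This is why writeout() removes the migration ptes first and then relocks the page afterwards unless AOP_WRITEPAGE_ACTIVATE was returned.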

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 4 changed files with 124 additions and 43 deletions

include/linux/swap.h
... ... @@ -186,20 +186,6 @@
186 186 extern int vm_swappiness;
187 187 extern int remove_mapping(struct address_space *mapping, struct page *page);
188 188  
189   -/* possible outcome of pageout() */
190   -typedef enum {
191   - /* failed to write page out, page is locked */
192   - PAGE_KEEP,
193   - /* move page to the active list, page is locked */
194   - PAGE_ACTIVATE,
195   - /* page has been sent to the disk successfully, page is unlocked */
196   - PAGE_SUCCESS,
197   - /* page is clean and locked */
198   - PAGE_CLEAN,
199   -} pageout_t;
200   -
201   -extern pageout_t pageout(struct page *page, struct address_space *mapping);
202   -
203 189 #ifdef CONFIG_NUMA
204 190 extern int zone_reclaim_mode;
205 191 extern int zone_reclaim_interval;
... ... @@ -259,7 +245,6 @@
259 245 struct backing_dev_info;
260 246  
261 247 extern spinlock_t swap_lock;
262   -extern int remove_vma_swap(struct vm_area_struct *vma, struct page *page);
263 248  
264 249 /* linux/mm/thrash.c */
265 250 extern struct mm_struct * swap_token_mm;

mm/migrate.c
... ... @@ -24,6 +24,7 @@
24 24 #include <linux/topology.h>
25 25 #include <linux/cpu.h>
26 26 #include <linux/cpuset.h>
  27 +#include <linux/writeback.h>
27 28  
28 29 #include "internal.h"
29 30  
... ... @@ -123,7 +124,7 @@
123 124 /*
124 125 * Restore a potential migration pte to a working pte entry
125 126 */
126   -static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr,
  127 +static void remove_migration_pte(struct vm_area_struct *vma,
127 128 struct page *old, struct page *new)
128 129 {
129 130 struct mm_struct *mm = vma->vm_mm;
... ... @@ -133,7 +134,11 @@
133 134 pmd_t *pmd;
134 135 pte_t *ptep, pte;
135 136 spinlock_t *ptl;
  137 + unsigned long addr = page_address_in_vma(new, vma);
136 138  
  139 + if (addr == -EFAULT)
  140 + return;
  141 +
137 142 pgd = pgd_offset(mm, addr);
138 143 if (!pgd_present(*pgd))
139 144 return;
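
The address is now derived inside remove_migration_pte() via page_address_in_vma(), which returns -EFAULT when the page is not mapped in the given vma, hence the early return above.  A simplified sketch of that helper (the real version in mm/rmap.c also checks that the vma actually belongs to the page's anon_vma or address_space):

	unsigned long page_address_in_vma(struct page *page,
					struct vm_area_struct *vma)
	{
		pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
		unsigned long address;

		/* Linear mapping: the offset in the file determines the address */
		address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
		if (address < vma->vm_start || address >= vma->vm_end)
			return -EFAULT;
		return address;
	}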
... ... @@ -169,19 +174,47 @@
169 174 if (is_write_migration_entry(entry))
170 175 pte = pte_mkwrite(pte);
171 176 set_pte_at(mm, addr, ptep, pte);
172   - page_add_anon_rmap(new, vma, addr);
  177 +
  178 + if (PageAnon(new))
  179 + page_add_anon_rmap(new, vma, addr);
  180 + else
  181 + page_add_file_rmap(new);
  182 +
  183 + /* No need to invalidate - it was non-present before */
  184 + update_mmu_cache(vma, addr, pte);
  185 + lazy_mmu_prot_update(pte);
  186 +
173 187 out:
174 188 pte_unmap_unlock(ptep, ptl);
175 189 }
176 190  
177 191 /*
178   - * Get rid of all migration entries and replace them by
179   - * references to the indicated page.
180   - *
  192 + * Note that remove_file_migration_ptes will only work on regular mappings,
  193 + * Nonlinear mappings do not use migration entries.
  194 + */
  195 +static void remove_file_migration_ptes(struct page *old, struct page *new)
  196 +{
  197 + struct vm_area_struct *vma;
  198 + struct address_space *mapping = page_mapping(new);
  199 + struct prio_tree_iter iter;
  200 + pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
  201 +
  202 + if (!mapping)
  203 + return;
  204 +
  205 + spin_lock(&mapping->i_mmap_lock);
  206 +
  207 + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
  208 + remove_migration_pte(vma, old, new);
  209 +
  210 + spin_unlock(&mapping->i_mmap_lock);
  211 +}
  212 +
  213 +/*
181 214 * Must hold mmap_sem lock on at least one of the vmas containing
182 215 * the page so that the anon_vma cannot vanish.
183 216 */
184   -static void remove_migration_ptes(struct page *old, struct page *new)
  217 +static void remove_anon_migration_ptes(struct page *old, struct page *new)
185 218 {
186 219 struct anon_vma *anon_vma;
187 220 struct vm_area_struct *vma;
... ... @@ -199,13 +232,24 @@
199 232 spin_lock(&anon_vma->lock);
200 233  
201 234 list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
202   - remove_migration_pte(vma, page_address_in_vma(new, vma),
203   - old, new);
  235 + remove_migration_pte(vma, old, new);
204 236  
205 237 spin_unlock(&anon_vma->lock);
206 238 }
207 239  
208 240 /*
  241 + * Get rid of all migration entries and replace them by
  242 + * references to the indicated page.
  243 + */
  244 +static void remove_migration_ptes(struct page *old, struct page *new)
  245 +{
  246 + if (PageAnon(new))
  247 + remove_anon_migration_ptes(old, new);
  248 + else
  249 + remove_file_migration_ptes(old, new);
  250 +}
  251 +
  252 +/*
209 253 * Something used the pte of a page under migration. We need to
210 254 * get to the page and wait until migration is finished.
211 255 * When we return from this function the fault will be retried.
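
For context, the consumer of these entries on the fault side is do_swap_page() in mm/memory.c, which recognizes a migration entry before attempting swapin and waits in migration_entry_wait(); roughly, as a sketch of that fragment:

	swp_entry_t entry = pte_to_swp_entry(orig_pte);

	if (is_migration_entry(entry)) {
		/* The pte refers to a page under migration: wait, then retry the fault */
		migration_entry_wait(mm, pmd, address);
		goto out;
	}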
... ... @@ -424,30 +468,59 @@
424 468 }
425 469 EXPORT_SYMBOL(buffer_migrate_page);
426 470  
427   -static int fallback_migrate_page(struct address_space *mapping,
428   - struct page *newpage, struct page *page)
  471 +/*
  472 + * Writeback a page to clean the dirty state
  473 + */
  474 +static int writeout(struct address_space *mapping, struct page *page)
429 475 {
  476 + struct writeback_control wbc = {
  477 + .sync_mode = WB_SYNC_NONE,
  478 + .nr_to_write = 1,
  479 + .range_start = 0,
  480 + .range_end = LLONG_MAX,
  481 + .nonblocking = 1,
  482 + .for_reclaim = 1
  483 + };
  484 + int rc;
  485 +
  486 + if (!mapping->a_ops->writepage)
  487 + /* No write method for the address space */
  488 + return -EINVAL;
  489 +
  490 + if (!clear_page_dirty_for_io(page))
  491 + /* Someone else already triggered a write */
  492 + return -EAGAIN;
  493 +
430 494 /*
431   - * Default handling if a filesystem does not provide
432   - * a migration function. We can only migrate clean
433   - * pages so try to write out any dirty pages first.
  495 + * A dirty page may imply that the underlying filesystem has
  496 + * the page on some queue. So the page must be clean for
  497 + * migration. Writeout may mean we loose the lock and the
  498 + * page state is no longer what we checked for earlier.
  499 + * At this point we know that the migration attempt cannot
  500 + * be successful.
434 501 */
435   - if (PageDirty(page)) {
436   - switch (pageout(page, mapping)) {
437   - case PAGE_KEEP:
438   - case PAGE_ACTIVATE:
439   - return -EAGAIN;
  502 + remove_migration_ptes(page, page);
440 503  
441   - case PAGE_SUCCESS:
442   - /* Relock since we lost the lock */
443   - lock_page(page);
444   - /* Must retry since page state may have changed */
445   - return -EAGAIN;
  504 + rc = mapping->a_ops->writepage(page, &wbc);
  505 + if (rc < 0)
  506 + /* I/O Error writing */
  507 + return -EIO;
446 508  
447   - case PAGE_CLEAN:
448   - ; /* try to migrate the page below */
449   - }
450   - }
  509 + if (rc != AOP_WRITEPAGE_ACTIVATE)
  510 + /* unlocked. Relock */
  511 + lock_page(page);
  512 +
  513 + return -EAGAIN;
  514 +}
  515 +
  516 +/*
  517 + * Default handling if a filesystem does not provide a migration function.
  518 + */
  519 +static int fallback_migrate_page(struct address_space *mapping,
  520 + struct page *newpage, struct page *page)
  521 +{
  522 + if (PageDirty(page))
  523 + return writeout(mapping, page);
451 524  
452 525 /*
453 526 * Buffers may be managed in a filesystem specific way.
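
After the filesystem-specific buffer handling that the truncated comment above refers to, fallback_migrate_page() ends up in the generic migrate_page() path.  For reference, that default method looks roughly like this at this point in the series (move the pagecache slot, then copy page contents and flags):

	int migrate_page(struct address_space *mapping,
			struct page *newpage, struct page *page)
	{
		int rc;

		BUG_ON(PageWriteback(page));	/* Writeback must be complete */

		rc = migrate_page_move_mapping(mapping, newpage, page);
		if (rc)
			return rc;

		migrate_page_copy(newpage, page);
		return 0;
	}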

mm/rmap.c
... ... @@ -596,6 +596,7 @@
596 596 spin_unlock(&mmlist_lock);
597 597 }
598 598 dec_mm_counter(mm, anon_rss);
  599 +#ifdef CONFIG_MIGRATION
599 600 } else {
600 601 /*
601 602 * Store the pfn of the page in a special migration
... ... @@ -604,11 +605,21 @@
604 605 */
605 606 BUG_ON(!migration);
606 607 entry = make_migration_entry(page, pte_write(pteval));
  608 +#endif
607 609 }
608 610 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
609 611 BUG_ON(pte_file(*pte));
610 612 } else
  613 +#ifdef CONFIG_MIGRATION
  614 + if (migration) {
  615 + /* Establish migration entry for a file page */
  616 + swp_entry_t entry;
  617 + entry = make_migration_entry(page, pte_write(pteval));
  618 + set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
  619 + } else
  620 +#endif
611 621 dec_mm_counter(mm, file_rss);
  622 +
612 623  
613 624 page_remove_rmap(page);
614 625 page_cache_release(page);
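
The #ifdef CONFIG_MIGRATION guards keep the migration-entry paths out of builds without page migration.  For such builds, include/linux/swapops.h provides no-op fallbacks along these lines (a sketch of their intent, not the verbatim header):

	#ifndef CONFIG_MIGRATION
	static inline int is_migration_entry(swp_entry_t entry)
	{
		return 0;
	}
	static inline int is_write_migration_entry(swp_entry_t entry)
	{
		return 0;
	}
	static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
						unsigned long address)
	{
	}
	#endif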

mm/vmscan.c
... ... @@ -290,11 +290,23 @@
290 290 unlock_page(page);
291 291 }
292 292  
  293 +/* possible outcome of pageout() */
  294 +typedef enum {
  295 + /* failed to write page out, page is locked */
  296 + PAGE_KEEP,
  297 + /* move page to the active list, page is locked */
  298 + PAGE_ACTIVATE,
  299 + /* page has been sent to the disk successfully, page is unlocked */
  300 + PAGE_SUCCESS,
  301 + /* page is clean and locked */
  302 + PAGE_CLEAN,
  303 +} pageout_t;
  304 +
293 305 /*
294 306 * pageout is called by shrink_page_list() for each dirty page.
295 307 * Calls ->writepage().
296 308 */
297   -pageout_t pageout(struct page *page, struct address_space *mapping)
  309 +static pageout_t pageout(struct page *page, struct address_space *mapping)
298 310 {
299 311 /*
300 312 * If the page is dirty, only perform writeback if that write