Commit cbf86cfe04a66471f23b9e62e5eba4e525f38855

Authored by Hugh Dickins
Committed by Linus Torvalds
Parent: 8aafa6a485

ksm: remove old stable nodes more thoroughly

Switching merge_across_nodes after running KSM is liable to oops on stale
nodes still left over from the previous stable tree.  It's not something
that people will often want to do, but it would be lame to demand a reboot
when they're trying to determine which merge_across_nodes setting is best.

How can this happen?  We only permit switching merge_across_nodes when
pages_shared is 0, and usually set run 2 to force that beforehand, which
ought to unmerge everything: yet oopses still occur when you then run 1.
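
(Illustration only, not part of this patch: the switching sequence described
above amounts to something like the userspace sketch below, driving the KSM
sysfs knobs /sys/kernel/mm/ksm/run, pages_shared and merge_across_nodes;
error handling is trimmed.)

	#include <stdio.h>

	static void write_knob(const char *path, const char *val)
	{
		FILE *f = fopen(path, "w");

		if (f) {
			fputs(val, f);
			fclose(f);
		}
	}

	int main(void)
	{
		long shared = -1;
		FILE *f;

		write_knob("/sys/kernel/mm/ksm/run", "2");	/* unmerge everything */

		f = fopen("/sys/kernel/mm/ksm/pages_shared", "r");
		if (f) {
			if (fscanf(f, "%ld", &shared) != 1)
				shared = -1;
			fclose(f);
		}

		/* switching is only permitted while nothing is shared */
		if (shared == 0)
			write_knob("/sys/kernel/mm/ksm/merge_across_nodes", "0");

		write_knob("/sys/kernel/mm/ksm/run", "1");	/* start merging again */
		return 0;
	}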

Three causes:

1. The old stable tree (built according to the inverse
   merge_across_nodes) has not been fully torn down.  A stable node
   lingers until get_ksm_page() notices that the page it references no
   longer references it: but the page is not necessarily freed as soon as
   expected, particularly when it is in swapcache.

   Fix this with a pass through the old stable tree, applying
   get_ksm_page() to each of the remaining nodes (most found stale and
   removed immediately), with forced removal of any left over.  Unless the
   page is still mapped: I've not seen that case, it shouldn't occur, but
   better to WARN_ON_ONCE and EBUSY than BUG.

2. __ksm_enter() has a nice little optimization, to insert the new mm
   just behind ksmd's cursor, so there's a full pass for it to stabilize
   (or be removed) before ksmd addresses it.  Nice when ksmd is running,
   but not so nice when we're trying to unmerge all mms: we were missing
   those mms forked and inserted behind the unmerge cursor.  Easily fixed
   by inserting at the end when KSM_RUN_UNMERGE.

3. It is possible for a KSM page to be faulted back from swapcache
   into an mm, just after unmerge_and_remove_all_rmap_items() scanned past
   it.  Fix this by copying on fault when KSM_RUN_UNMERGE: but that is
   private to ksm.c, so dissolve the distinction between
   ksm_might_need_to_copy() and ksm_does_need_to_copy(), doing it all in
   the one call into ksm.c.

A long-outstanding, unrelated bugfix sneaks in with that third fix:
ksm_does_need_to_copy() would copy from a !PageUptodate page (implying I/O
error when read in from swap) to a page which it then marks Uptodate.  Fix
this case by not copying, letting do_swap_page() discover the error.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Petr Holasek <pholasek@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Izik Eidus <izik.eidus@ravellosystems.com>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

3 changed files with 92 additions and 28 deletions:

--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -16,9 +16,6 @@
 struct stable_node;
 struct mem_cgroup;
 
-struct page *ksm_does_need_to_copy(struct page *page,
-		struct vm_area_struct *vma, unsigned long address);
-
 #ifdef CONFIG_KSM
 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		unsigned long end, int advice, unsigned long *vm_flags);
@@ -73,16 +70,9 @@
  * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE,
  * but what if the vma was unmerged while the page was swapped out?
  */
-static inline int ksm_might_need_to_copy(struct page *page,
-		struct vm_area_struct *vma, unsigned long address)
-{
-	struct anon_vma *anon_vma = page_anon_vma(page);
+struct page *ksm_might_need_to_copy(struct page *page,
+		struct vm_area_struct *vma, unsigned long address);
 
-	return anon_vma &&
-		(anon_vma->root != vma->anon_vma->root ||
-		 page->index != linear_page_index(vma, address));
-}
-
 int page_referenced_ksm(struct page *page,
 			struct mem_cgroup *memcg, unsigned long *vm_flags);
 int try_to_unmap_ksm(struct page *page, enum ttu_flags flags);
@@ -113,10 +103,10 @@
 	return 0;
 }
 
-static inline int ksm_might_need_to_copy(struct page *page,
+static inline struct page *ksm_might_need_to_copy(struct page *page,
 		struct vm_area_struct *vma, unsigned long address)
 {
-	return 0;
+	return page;
 }
 
 static inline int page_referenced_ksm(struct page *page,
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -644,6 +644,57 @@
 /*
  * Only called through the sysfs control interface:
  */
+static int remove_stable_node(struct stable_node *stable_node)
+{
+	struct page *page;
+	int err;
+
+	page = get_ksm_page(stable_node, true);
+	if (!page) {
+		/*
+		 * get_ksm_page did remove_node_from_stable_tree itself.
+		 */
+		return 0;
+	}
+
+	if (WARN_ON_ONCE(page_mapped(page)))
+		err = -EBUSY;
+	else {
+		/*
+		 * This page might be in a pagevec waiting to be freed,
+		 * or it might be PageSwapCache (perhaps under writeback),
+		 * or it might have been removed from swapcache a moment ago.
+		 */
+		set_page_stable_node(page, NULL);
+		remove_node_from_stable_tree(stable_node);
+		err = 0;
+	}
+
+	unlock_page(page);
+	put_page(page);
+	return err;
+}
+
+static int remove_all_stable_nodes(void)
+{
+	struct stable_node *stable_node;
+	int nid;
+	int err = 0;
+
+	for (nid = 0; nid < nr_node_ids; nid++) {
+		while (root_stable_tree[nid].rb_node) {
+			stable_node = rb_entry(root_stable_tree[nid].rb_node,
+					       struct stable_node, node);
+			if (remove_stable_node(stable_node)) {
+				err = -EBUSY;
+				break;	/* proceed to next nid */
+			}
+			cond_resched();
+		}
+	}
+	return err;
+}
+
 static int unmerge_and_remove_all_rmap_items(void)
 {
 	struct mm_slot *mm_slot;
@@ -691,6 +742,8 @@
 		}
 	}
 
+	/* Clean up stable nodes, but don't worry if some are still busy */
+	remove_all_stable_nodes();
 	ksm_scan.seqnr = 0;
 	return 0;
 
@@ -1586,11 +1639,19 @@
 	spin_lock(&ksm_mmlist_lock);
 	insert_to_mm_slots_hash(mm, mm_slot);
 	/*
-	 * Insert just behind the scanning cursor, to let the area settle
+	 * When KSM_RUN_MERGE (or KSM_RUN_STOP),
+	 * insert just behind the scanning cursor, to let the area settle
 	 * down a little; when fork is followed by immediate exec, we don't
 	 * want ksmd to waste time setting up and tearing down an rmap_list.
+	 *
+	 * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
+	 * scanning cursor, otherwise KSM pages in newly forked mms will be
+	 * missed: then we might as well insert at the end of the list.
 	 */
-	list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
+	if (ksm_run & KSM_RUN_UNMERGE)
+		list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
+	else
+		list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
 	spin_unlock(&ksm_mmlist_lock);
 
 	set_bit(MMF_VM_MERGEABLE, &mm->flags);
@@ -1640,11 +1701,25 @@
 	}
 }
 
-struct page *ksm_does_need_to_copy(struct page *page,
+struct page *ksm_might_need_to_copy(struct page *page,
 			struct vm_area_struct *vma, unsigned long address)
 {
+	struct anon_vma *anon_vma = page_anon_vma(page);
 	struct page *new_page;
 
+	if (PageKsm(page)) {
+		if (page_stable_node(page) &&
+		    !(ksm_run & KSM_RUN_UNMERGE))
+			return page;	/* no need to copy it */
+	} else if (!anon_vma) {
+		return page;		/* no need to copy it */
+	} else if (anon_vma->root == vma->anon_vma->root &&
+		   page->index == linear_page_index(vma, address)) {
+		return page;		/* still no need to copy it */
+	}
+	if (!PageUptodate(page))
+		return page;		/* let do_swap_page report the error */
+
 	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 	if (new_page) {
 		copy_user_highpage(new_page, page, address, vma);
@@ -2024,7 +2099,7 @@
 
 	mutex_lock(&ksm_thread_mutex);
 	if (ksm_merge_across_nodes != knob) {
-		if (ksm_pages_shared)
+		if (ksm_pages_shared || remove_all_stable_nodes())
 			err = -EBUSY;
 		else
 			ksm_merge_across_nodes = knob;
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2994,17 +2994,16 @@
 
 	if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
 		goto out_page;
-	if (ksm_might_need_to_copy(page, vma, address)) {
-		swapcache = page;
-		page = ksm_does_need_to_copy(page, vma, address);
-
-		if (unlikely(!page)) {
-			ret = VM_FAULT_OOM;
-			page = swapcache;
-			swapcache = NULL;
-			goto out_page;
-		}
+	swapcache = page;
+	page = ksm_might_need_to_copy(page, vma, address);
+	if (unlikely(!page)) {
+		ret = VM_FAULT_OOM;
+		page = swapcache;
+		swapcache = NULL;
+		goto out_page;
 	}
+	if (page == swapcache)
+		swapcache = NULL;
 
 	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
 		ret = VM_FAULT_OOM;