Commit 5ad6468801d28c4d4ac9f48ec19297817c915f6a

Authored by Hugh Dickins
Committed by Linus Torvalds
1 parent 73848b4684

ksm: let shared pages be swappable

Initial implementation for swapping out KSM's shared pages: add
page_referenced_ksm() and try_to_unmap_ksm(), which rmap.c calls when
faced with a PageKsm page.
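
In outline, the hooks slot into rmap.c's existing dispatch; this is condensed
from the mm/rmap.c hunk below (try_to_unmap() shown; page_referenced() gains
the equivalent PageKsm branch):

	if (unlikely(PageKsm(page)))
		ret = try_to_unmap_ksm(page, flags);
	else if (PageAnon(page))
		ret = try_to_unmap_anon(page, flags);
	else
		ret = try_to_unmap_file(page, flags);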

Most of what's needed can be obtained from the rmap_items listed from the
stable_node of the ksm page, without discovering the actual vma: so this
patch just fakes up a struct vma for page_referenced_one() or
try_to_unmap_one(), and the next patch refines that.
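
For example, try_to_unmap_ksm() walks the stable_node's hlist and fills in
only the vma fields that try_to_unmap_one() reads; condensed from the
mm/ksm.c hunk below, with the fallback-vma locking and the TTU_ACTION
check omitted:

	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
		vma->vm_mm = rmap_item->mm;
		vma->vm_start = rmap_item->address;
		vma->vm_end = vma->vm_start + PAGE_SIZE;

		ret = try_to_unmap_one(page, vma, rmap_item->address, flags);
		if (ret != SWAP_AGAIN || !page_mapped(page))
			break;
	}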

Add VM_NONLINEAR to ksm_madvise()'s list of exclusions: it has always been
implicit there (being only set with VM_SHARED, already excluded), but
let's make it explicit, to help justify the lack of nonlinear unmap.

Rely on the page lock to protect against concurrent modifications to that
page's node of the stable tree.
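
Concretely, the stable tree is now only modified with the ksm page locked;
condensed from the cmp_and_merge_page() change in the mm/ksm.c hunks below:

	lock_page(kpage);
	stable_node = stable_tree_insert(kpage);
	if (stable_node) {
		stable_tree_append(tree_rmap_item, stable_node);
		stable_tree_append(rmap_item, stable_node);
	}
	unlock_page(kpage);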

The awkward part is not swapout but swapin: do_swap_page() and
page_add_anon_rmap() now have to allow for new possibilities - perhaps a
ksm page still in swapcache, perhaps a swapcache page associated with one
location in one anon_vma now needed at another location or in another
anon_vma.  (And the vma might no longer even be VM_MERGEABLE when that
happens.)
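
do_swap_page() defers that decision to KSM as soon as it has the page
locked; from the mm/memory.c hunk below:

	page = ksm_might_need_to_copy(page, vma, address);
	if (!page) {
		ret = VM_FAULT_OOM;
		goto out;
	}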

ksm_might_need_to_copy() checks for that case, and supplies a duplicate
page when necessary, simply leaving it to a subsequent pass of ksmd to
rediscover the identity and merge them back into one ksm page.
Disappointingly primitive: but the alternative would be to accumulate
unswappable info about the swapped-out ksm pages, which would itself
limit swappability.
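
The test itself is cheap: the ksm_might_need_to_copy() inline in
include/linux/ksm.h (shown in full below) only drops into the copy path
when the swapcache page cannot simply be reused at this address:

	anon_vma = page_anon_vma(page);
	if (!anon_vma ||		/* not yet assigned, or still PageKsm */
	    (anon_vma == vma->anon_vma &&
	     page->index == linear_page_index(vma, address)))
		return page;		/* reuse the swapcache page as is */

	return ksm_does_need_to_copy(page, vma, address);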

Remove page_add_ksm_rmap(): page_add_anon_rmap() now has to allow for the
particular case it was handling, so just use it instead.
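
page_add_anon_rmap() now bumps the mapcount and NR_ANON_PAGES first, and
bails out before touching page->mapping or page->index when the page is
PageKsm; condensed from the mm/rmap.c hunk below:

	int first = atomic_inc_and_test(&page->_mapcount);
	if (first)
		__inc_zone_page_state(page, NR_ANON_PAGES);
	if (unlikely(PageKsm(page)))
		return;		/* mapping and index already describe the ksm page */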

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Izik Eidus <ieidus@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Chris Wright <chrisw@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 6 changed files with 264 additions and 49 deletions

include/linux/ksm.h
... ... @@ -9,10 +9,12 @@
9 9  
10 10 #include <linux/bitops.h>
11 11 #include <linux/mm.h>
  12 +#include <linux/pagemap.h>
  13 +#include <linux/rmap.h>
12 14 #include <linux/sched.h>
13   -#include <linux/vmstat.h>
14 15  
15 16 struct stable_node;
  17 +struct mem_cgroup;
16 18  
17 19 #ifdef CONFIG_KSM
18 20 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
... ... @@ -57,11 +59,36 @@
57 59 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
58 60 }
59 61  
60   -static inline void page_add_ksm_rmap(struct page *page)
  62 +/*
  63 + * When do_swap_page() first faults in from swap what used to be a KSM page,
  64 + * no problem, it will be assigned to this vma's anon_vma; but thereafter,
  65 + * it might be faulted into a different anon_vma (or perhaps to a different
  66 + * offset in the same anon_vma). do_swap_page() cannot do all the locking
  67 + * needed to reconstitute a cross-anon_vma KSM page: for now it has to make
  68 + * a copy, and leave remerging the pages to a later pass of ksmd.
  69 + *
  70 + * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE,
  71 + * but what if the vma was unmerged while the page was swapped out?
  72 + */
  73 +struct page *ksm_does_need_to_copy(struct page *page,
  74 + struct vm_area_struct *vma, unsigned long address);
  75 +static inline struct page *ksm_might_need_to_copy(struct page *page,
  76 + struct vm_area_struct *vma, unsigned long address)
61 77 {
62   - if (atomic_inc_and_test(&page->_mapcount))
63   - __inc_zone_page_state(page, NR_ANON_PAGES);
  78 + struct anon_vma *anon_vma = page_anon_vma(page);
  79 +
  80 + if (!anon_vma ||
  81 + (anon_vma == vma->anon_vma &&
  82 + page->index == linear_page_index(vma, address)))
  83 + return page;
  84 +
  85 + return ksm_does_need_to_copy(page, vma, address);
64 86 }
  87 +
  88 +int page_referenced_ksm(struct page *page,
  89 + struct mem_cgroup *memcg, unsigned long *vm_flags);
  90 +int try_to_unmap_ksm(struct page *page, enum ttu_flags flags);
  91 +
65 92 #else /* !CONFIG_KSM */
66 93  
67 94 static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
... ... @@ -84,8 +111,23 @@
84 111 return 0;
85 112 }
86 113  
87   -/* No stub required for page_add_ksm_rmap(page) */
  114 +static inline struct page *ksm_might_need_to_copy(struct page *page,
  115 + struct vm_area_struct *vma, unsigned long address)
  116 +{
  117 + return page;
  118 +}
  119 +
  120 +static inline int page_referenced_ksm(struct page *page,
  121 + struct mem_cgroup *memcg, unsigned long *vm_flags)
  122 +{
  123 + return 0;
  124 +}
  125 +
  126 +static inline int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
  127 +{
  128 + return 0;
  129 +}
88 130 #endif /* !CONFIG_KSM */
89 131  
90   -#endif
  132 +#endif /* __LINUX_KSM_H */
include/linux/rmap.h
... ... @@ -89,6 +89,9 @@
89 89 */
90 90 int page_referenced(struct page *, int is_locked,
91 91 struct mem_cgroup *cnt, unsigned long *vm_flags);
  92 +int page_referenced_one(struct page *, struct vm_area_struct *,
  93 + unsigned long address, unsigned int *mapcount, unsigned long *vm_flags);
  94 +
92 95 enum ttu_flags {
93 96 TTU_UNMAP = 0, /* unmap mode */
94 97 TTU_MIGRATION = 1, /* migration mode */
... ... @@ -102,6 +105,8 @@
102 105 #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
103 106  
104 107 int try_to_unmap(struct page *, enum ttu_flags flags);
  108 +int try_to_unmap_one(struct page *, struct vm_area_struct *,
  109 + unsigned long address, enum ttu_flags flags);
105 110  
106 111 /*
107 112 * Called from mm/filemap_xip.c to unmap empty zero page
mm/ksm.c
... ... @@ -196,6 +196,13 @@
196 196 static DEFINE_MUTEX(ksm_thread_mutex);
197 197 static DEFINE_SPINLOCK(ksm_mmlist_lock);
198 198  
  199 +/*
  200 + * Temporary hack for page_referenced_ksm() and try_to_unmap_ksm(),
  201 + * later we rework things a little to get the right vma to them.
  202 + */
  203 +static DEFINE_SPINLOCK(ksm_fallback_vma_lock);
  204 +static struct vm_area_struct ksm_fallback_vma;
  205 +
199 206 #define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
200 207 sizeof(struct __struct), __alignof__(struct __struct),\
201 208 (__flags), NULL)
... ... @@ -445,14 +452,20 @@
445 452 {
446 453 if (rmap_item->address & STABLE_FLAG) {
447 454 struct stable_node *stable_node;
  455 + struct page *page;
448 456  
449 457 stable_node = rmap_item->head;
  458 + page = stable_node->page;
  459 + lock_page(page);
  460 +
450 461 hlist_del(&rmap_item->hlist);
451   - if (stable_node->hlist.first)
  462 + if (stable_node->hlist.first) {
  463 + unlock_page(page);
452 464 ksm_pages_sharing--;
453   - else {
454   - set_page_stable_node(stable_node->page, NULL);
455   - put_page(stable_node->page);
  465 + } else {
  466 + set_page_stable_node(page, NULL);
  467 + unlock_page(page);
  468 + put_page(page);
456 469  
457 470 rb_erase(&stable_node->node, &root_stable_tree);
458 471 free_stable_node(stable_node);
... ... @@ -710,7 +723,7 @@
710 723 }
711 724  
712 725 get_page(kpage);
713   - page_add_ksm_rmap(kpage);
  726 + page_add_anon_rmap(kpage, vma, addr);
714 727  
715 728 flush_cache_page(vma, addr, pte_pfn(*ptep));
716 729 ptep_clear_flush(vma, addr, ptep);
... ... @@ -763,8 +776,16 @@
763 776 pages_identical(page, kpage))
764 777 err = replace_page(vma, page, kpage, orig_pte);
765 778  
766   - if ((vma->vm_flags & VM_LOCKED) && !err)
  779 + if ((vma->vm_flags & VM_LOCKED) && !err) {
767 780 munlock_vma_page(page);
  781 + if (!PageMlocked(kpage)) {
  782 + unlock_page(page);
  783 + lru_add_drain();
  784 + lock_page(kpage);
  785 + mlock_vma_page(kpage);
  786 + page = kpage; /* for final unlock */
  787 + }
  788 + }
768 789  
769 790 unlock_page(page);
770 791 out:
... ... @@ -841,7 +862,11 @@
841 862  
842 863 copy_user_highpage(kpage, page, rmap_item->address, vma);
843 864  
  865 + SetPageDirty(kpage);
  866 + __SetPageUptodate(kpage);
  867 + SetPageSwapBacked(kpage);
844 868 set_page_stable_node(kpage, NULL); /* mark it PageKsm */
  869 + lru_cache_add_lru(kpage, LRU_ACTIVE_ANON);
845 870  
846 871 err = try_to_merge_one_page(vma, page, kpage);
847 872 up:
... ... @@ -1071,7 +1096,9 @@
1071 1096 * The page was successfully merged:
1072 1097 * add its rmap_item to the stable tree.
1073 1098 */
  1099 + lock_page(kpage);
1074 1100 stable_tree_append(rmap_item, stable_node);
  1101 + unlock_page(kpage);
1075 1102 }
1076 1103 put_page(kpage);
1077 1104 return;
... ... @@ -1112,11 +1139,13 @@
1112 1139 if (kpage) {
1113 1140 remove_rmap_item_from_tree(tree_rmap_item);
1114 1141  
  1142 + lock_page(kpage);
1115 1143 stable_node = stable_tree_insert(kpage);
1116 1144 if (stable_node) {
1117 1145 stable_tree_append(tree_rmap_item, stable_node);
1118 1146 stable_tree_append(rmap_item, stable_node);
1119 1147 }
  1148 + unlock_page(kpage);
1120 1149 put_page(kpage);
1121 1150  
1122 1151 /*
... ... @@ -1285,14 +1314,6 @@
1285 1314 return;
1286 1315 if (!PageKsm(page) || !in_stable_tree(rmap_item))
1287 1316 cmp_and_merge_page(page, rmap_item);
1288   - else if (page_mapcount(page) == 1) {
1289   - /*
1290   - * Replace now-unshared ksm page by ordinary page.
1291   - */
1292   - break_cow(rmap_item);
1293   - remove_rmap_item_from_tree(rmap_item);
1294   - rmap_item->oldchecksum = calc_checksum(page);
1295   - }
1296 1317 put_page(page);
1297 1318 }
1298 1319 }
... ... @@ -1337,7 +1358,7 @@
1337 1358 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
1338 1359 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1339 1360 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
1340   - VM_MIXEDMAP | VM_SAO))
  1361 + VM_NONLINEAR | VM_MIXEDMAP | VM_SAO))
1341 1362 return 0; /* just ignore the advice */
1342 1363  
1343 1364 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
... ... @@ -1433,6 +1454,127 @@
1433 1454 down_write(&mm->mmap_sem);
1434 1455 up_write(&mm->mmap_sem);
1435 1456 }
  1457 +}
  1458 +
  1459 +struct page *ksm_does_need_to_copy(struct page *page,
  1460 + struct vm_area_struct *vma, unsigned long address)
  1461 +{
  1462 + struct page *new_page;
  1463 +
  1464 + unlock_page(page); /* any racers will COW it, not modify it */
  1465 +
  1466 + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
  1467 + if (new_page) {
  1468 + copy_user_highpage(new_page, page, address, vma);
  1469 +
  1470 + SetPageDirty(new_page);
  1471 + __SetPageUptodate(new_page);
  1472 + SetPageSwapBacked(new_page);
  1473 + __set_page_locked(new_page);
  1474 +
  1475 + if (page_evictable(new_page, vma))
  1476 + lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
  1477 + else
  1478 + add_page_to_unevictable_list(new_page);
  1479 + }
  1480 +
  1481 + page_cache_release(page);
  1482 + return new_page;
  1483 +}
  1484 +
  1485 +int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
  1486 + unsigned long *vm_flags)
  1487 +{
  1488 + struct stable_node *stable_node;
  1489 + struct rmap_item *rmap_item;
  1490 + struct hlist_node *hlist;
  1491 + unsigned int mapcount = page_mapcount(page);
  1492 + int referenced = 0;
  1493 + struct vm_area_struct *vma;
  1494 +
  1495 + VM_BUG_ON(!PageKsm(page));
  1496 + VM_BUG_ON(!PageLocked(page));
  1497 +
  1498 + stable_node = page_stable_node(page);
  1499 + if (!stable_node)
  1500 + return 0;
  1501 +
  1502 + /*
  1503 + * Temporary hack: really we need anon_vma in rmap_item, to
  1504 + * provide the correct vma, and to find recently forked instances.
  1505 + * Use zalloc to avoid weirdness if any other fields are involved.
  1506 + */
  1507 + vma = kmem_cache_zalloc(vm_area_cachep, GFP_ATOMIC);
  1508 + if (!vma) {
  1509 + spin_lock(&ksm_fallback_vma_lock);
  1510 + vma = &ksm_fallback_vma;
  1511 + }
  1512 +
  1513 + hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
  1514 + if (memcg && !mm_match_cgroup(rmap_item->mm, memcg))
  1515 + continue;
  1516 +
  1517 + vma->vm_mm = rmap_item->mm;
  1518 + vma->vm_start = rmap_item->address;
  1519 + vma->vm_end = vma->vm_start + PAGE_SIZE;
  1520 +
  1521 + referenced += page_referenced_one(page, vma,
  1522 + rmap_item->address, &mapcount, vm_flags);
  1523 + if (!mapcount)
  1524 + goto out;
  1525 + }
  1526 +out:
  1527 + if (vma == &ksm_fallback_vma)
  1528 + spin_unlock(&ksm_fallback_vma_lock);
  1529 + else
  1530 + kmem_cache_free(vm_area_cachep, vma);
  1531 + return referenced;
  1532 +}
  1533 +
  1534 +int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
  1535 +{
  1536 + struct stable_node *stable_node;
  1537 + struct hlist_node *hlist;
  1538 + struct rmap_item *rmap_item;
  1539 + int ret = SWAP_AGAIN;
  1540 + struct vm_area_struct *vma;
  1541 +
  1542 + VM_BUG_ON(!PageKsm(page));
  1543 + VM_BUG_ON(!PageLocked(page));
  1544 +
  1545 + stable_node = page_stable_node(page);
  1546 + if (!stable_node)
  1547 + return SWAP_FAIL;
  1548 +
  1549 + /*
  1550 + * Temporary hack: really we need anon_vma in rmap_item, to
  1551 + * provide the correct vma, and to find recently forked instances.
  1552 + * Use zalloc to avoid weirdness if any other fields are involved.
  1553 + */
  1554 + if (TTU_ACTION(flags) != TTU_UNMAP)
  1555 + return SWAP_FAIL;
  1556 +
  1557 + vma = kmem_cache_zalloc(vm_area_cachep, GFP_ATOMIC);
  1558 + if (!vma) {
  1559 + spin_lock(&ksm_fallback_vma_lock);
  1560 + vma = &ksm_fallback_vma;
  1561 + }
  1562 +
  1563 + hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
  1564 + vma->vm_mm = rmap_item->mm;
  1565 + vma->vm_start = rmap_item->address;
  1566 + vma->vm_end = vma->vm_start + PAGE_SIZE;
  1567 +
  1568 + ret = try_to_unmap_one(page, vma, rmap_item->address, flags);
  1569 + if (ret != SWAP_AGAIN || !page_mapped(page))
  1570 + goto out;
  1571 + }
  1572 +out:
  1573 + if (vma == &ksm_fallback_vma)
  1574 + spin_unlock(&ksm_fallback_vma_lock);
  1575 + else
  1576 + kmem_cache_free(vm_area_cachep, vma);
  1577 + return ret;
1436 1578 }
1437 1579  
1438 1580 #ifdef CONFIG_SYSFS
mm/memory.c
... ... @@ -2561,6 +2561,12 @@
2561 2561 lock_page(page);
2562 2562 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2563 2563  
  2564 + page = ksm_might_need_to_copy(page, vma, address);
  2565 + if (!page) {
  2566 + ret = VM_FAULT_OOM;
  2567 + goto out;
  2568 + }
  2569 +
2564 2570 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
2565 2571 ret = VM_FAULT_OOM;
2566 2572 goto out_page;
mm/rmap.c
... ... @@ -49,6 +49,7 @@
49 49 #include <linux/swapops.h>
50 50 #include <linux/slab.h>
51 51 #include <linux/init.h>
  52 +#include <linux/ksm.h>
52 53 #include <linux/rmap.h>
53 54 #include <linux/rcupdate.h>
54 55 #include <linux/module.h>
... ... @@ -336,9 +337,9 @@
336 337 * Subfunctions of page_referenced: page_referenced_one called
337 338 * repeatedly from either page_referenced_anon or page_referenced_file.
338 339 */
339   -static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
340   - unsigned long address, unsigned int *mapcount,
341   - unsigned long *vm_flags)
  340 +int page_referenced_one(struct page *page, struct vm_area_struct *vma,
  341 + unsigned long address, unsigned int *mapcount,
  342 + unsigned long *vm_flags)
342 343 {
343 344 struct mm_struct *mm = vma->vm_mm;
344 345 pte_t *pte;
... ... @@ -507,28 +508,33 @@
507 508 unsigned long *vm_flags)
508 509 {
509 510 int referenced = 0;
  511 + int we_locked = 0;
510 512  
511 513 if (TestClearPageReferenced(page))
512 514 referenced++;
513 515  
514 516 *vm_flags = 0;
515 517 if (page_mapped(page) && page_rmapping(page)) {
516   - if (PageAnon(page))
  518 + if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
  519 + we_locked = trylock_page(page);
  520 + if (!we_locked) {
  521 + referenced++;
  522 + goto out;
  523 + }
  524 + }
  525 + if (unlikely(PageKsm(page)))
  526 + referenced += page_referenced_ksm(page, mem_cont,
  527 + vm_flags);
  528 + else if (PageAnon(page))
517 529 referenced += page_referenced_anon(page, mem_cont,
518 530 vm_flags);
519   - else if (is_locked)
  531 + else if (page->mapping)
520 532 referenced += page_referenced_file(page, mem_cont,
521 533 vm_flags);
522   - else if (!trylock_page(page))
523   - referenced++;
524   - else {
525   - if (page->mapping)
526   - referenced += page_referenced_file(page,
527   - mem_cont, vm_flags);
  534 + if (we_locked)
528 535 unlock_page(page);
529   - }
530 536 }
531   -
  537 +out:
532 538 if (page_test_and_clear_young(page))
533 539 referenced++;
534 540  
... ... @@ -620,14 +626,7 @@
620 626 BUG_ON(!anon_vma);
621 627 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
622 628 page->mapping = (struct address_space *) anon_vma;
623   -
624 629 page->index = linear_page_index(vma, address);
625   -
626   - /*
627   - * nr_mapped state can be updated without turning off
628   - * interrupts because it is not modified via interrupt.
629   - */
630   - __inc_zone_page_state(page, NR_ANON_PAGES);
631 630 }
632 631  
633 632 /**
... ... @@ -665,14 +664,21 @@
665 664 * @vma: the vm area in which the mapping is added
666 665 * @address: the user virtual address mapped
667 666 *
668   - * The caller needs to hold the pte lock and the page must be locked.
  667 + * The caller needs to hold the pte lock, and the page must be locked in
  668 + * the anon_vma case: to serialize mapping,index checking after setting.
669 669 */
670 670 void page_add_anon_rmap(struct page *page,
671 671 struct vm_area_struct *vma, unsigned long address)
672 672 {
  673 + int first = atomic_inc_and_test(&page->_mapcount);
  674 + if (first)
  675 + __inc_zone_page_state(page, NR_ANON_PAGES);
  676 + if (unlikely(PageKsm(page)))
  677 + return;
  678 +
673 679 VM_BUG_ON(!PageLocked(page));
674 680 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
675   - if (atomic_inc_and_test(&page->_mapcount))
  681 + if (first)
676 682 __page_set_anon_rmap(page, vma, address);
677 683 else
678 684 __page_check_anon_rmap(page, vma, address);
... ... @@ -694,6 +700,7 @@
694 700 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
695 701 SetPageSwapBacked(page);
696 702 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
  703 + __inc_zone_page_state(page, NR_ANON_PAGES);
697 704 __page_set_anon_rmap(page, vma, address);
698 705 if (page_evictable(page, vma))
699 706 lru_cache_add_lru(page, LRU_ACTIVE_ANON);
... ... @@ -760,8 +767,8 @@
760 767 * Subfunctions of try_to_unmap: try_to_unmap_one called
761 768 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
762 769 */
763   -static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
764   - unsigned long address, enum ttu_flags flags)
  770 +int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
  771 + unsigned long address, enum ttu_flags flags)
765 772 {
766 773 struct mm_struct *mm = vma->vm_mm;
767 774 pte_t *pte;
... ... @@ -1156,7 +1163,9 @@
1156 1163  
1157 1164 BUG_ON(!PageLocked(page));
1158 1165  
1159   - if (PageAnon(page))
  1166 + if (unlikely(PageKsm(page)))
  1167 + ret = try_to_unmap_ksm(page, flags);
  1168 + else if (PageAnon(page))
1160 1169 ret = try_to_unmap_anon(page, flags);
1161 1170 else
1162 1171 ret = try_to_unmap_file(page, flags);
... ... @@ -1177,13 +1186,16 @@
1177 1186 *
1178 1187 * SWAP_AGAIN - no vma is holding page mlocked, or,
1179 1188 * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem
  1189 + * SWAP_FAIL - page cannot be located at present
1180 1190 * SWAP_MLOCK - page is now mlocked.
1181 1191 */
1182 1192 int try_to_munlock(struct page *page)
1183 1193 {
1184 1194 VM_BUG_ON(!PageLocked(page) || PageLRU(page));
1185 1195  
1186   - if (PageAnon(page))
  1196 + if (unlikely(PageKsm(page)))
  1197 + return try_to_unmap_ksm(page, TTU_MUNLOCK);
  1198 + else if (PageAnon(page))
1187 1199 return try_to_unmap_anon(page, TTU_MUNLOCK);
1188 1200 else
1189 1201 return try_to_unmap_file(page, TTU_MUNLOCK);
mm/swapfile.c
... ... @@ -22,6 +22,7 @@
22 22 #include <linux/seq_file.h>
23 23 #include <linux/init.h>
24 24 #include <linux/module.h>
  25 +#include <linux/ksm.h>
25 26 #include <linux/rmap.h>
26 27 #include <linux/security.h>
27 28 #include <linux/backing-dev.h>
... ... @@ -650,6 +651,8 @@
650 651 int count;
651 652  
652 653 VM_BUG_ON(!PageLocked(page));
  654 + if (unlikely(PageKsm(page)))
  655 + return 0;
653 656 count = page_mapcount(page);
654 657 if (count <= 1 && PageSwapCache(page)) {
655 658 count += page_swapcount(page);
... ... @@ -658,7 +661,7 @@
658 661 SetPageDirty(page);
659 662 }
660 663 }
661   - return count == 1;
  664 + return count <= 1;
662 665 }
663 666  
664 667 /*
... ... @@ -1185,6 +1188,12 @@
1185 1188 * read from disk into another page. Splitting into two
1186 1189 * pages would be incorrect if swap supported "shared
1187 1190 * private" pages, but they are handled by tmpfs files.
  1191 + *
  1192 + * Given how unuse_vma() targets one particular offset
  1193 + * in an anon_vma, once the anon_vma has been determined,
  1194 + * this splitting happens to be just what is needed to
  1195 + * handle where KSM pages have been swapped out: re-reading
  1196 + * is unnecessarily slow, but we can fix that later on.
1188 1197 */
1189 1198 if (swap_count(*swap_map) &&
1190 1199 PageDirty(page) && PageSwapCache(page)) {