Commit 1c2fb7a4c2ca7a958b02bc1e615d0254990bba8d

Authored by Andrea Arcangeli
Committed by Linus Torvalds
1 parent 9ba6929480

ksm: fix deadlock with munlock in exit_mmap

Rawhide users have reported a hang at startup when cryptsetup is run: the
same problem can be simply reproduced by running a program

	int main() { mlockall(MCL_CURRENT | MCL_FUTURE); return 0; }

The problem is that exit_mmap() applies munlock_vma_pages_all() to
clean up VM_LOCKED areas, and its current implementation (stupidly)
tries to fault in absent pages, for example where PROT_NONE prevented
them being faulted in when mlocking.  But the "ksm: fix oom deadlock"
patch, knowing there's a race by which KSM might try to fault in pages
after exit_mmap() had finally zapped the range, backs out of such
faults, doing nothing, when its ksm_test_exit() notices mm_users is 0.
The two interact badly: the munlocking fault can never make progress,
and the exiting task hangs.

So revert that part of "ksm: fix oom deadlock" which moved the
ksm_exit() call from before exit_mmap() to the middle of exit_mmap();
and remove those ksm_test_exit() checks from the page fault paths, so
allowing the munlocking to proceed without interference.

ksm_exit(), if there are rmap_items still chained on this mm slot,
takes mmap_sem write side: thus preventing KSM from working on an mm
while exit_mmap() runs.  And KSM will bail out as soon as it notices
that mm_users is already zero, thanks to its internal ksm_test_exit()
checks: so when a task is killed by the OOM killer or by the user, KSM
will not indefinitely prevent it from running exit_mmap() to release
its memory.

This does break a part of what "ksm: fix oom deadlock" was trying to
achieve.  When unmerging KSM (echo 2 >/sys/kernel/mm/ksm/run), and even
when ksmd itself has to cancel a KSM page, it is possible that the
first OOM-kill victim would be the KSM process being faulted: then its
memory won't be freed until a second victim has been selected (freeing
memory for the unmerging fault to complete).

But the OOM killer is already liable to kill a second victim once the
intended victim's p->mm goes to NULL: so there's not much point in
rejecting this KSM patch before fixing that OOM behaviour.  It is very
much more important to allow KSM users to boot up, than to haggle over
an unlikely and poorly supported OOM case.

We also intend to fix munlocking to not fault pages: at which point
this patch _could_ be reverted; though that would be controversial, so
we hope to find a better solution.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Justin M. Forbes <jforbes@redhat.com>
Acked-for-now-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Izik Eidus <ieidus@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 5 changed files with 8 additions and 20 deletions

@@ -18,8 +18,7 @@
 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		unsigned long end, int advice, unsigned long *vm_flags);
 int __ksm_enter(struct mm_struct *mm);
-void __ksm_exit(struct mm_struct *mm,
-		struct mmu_gather **tlbp, unsigned long end);
+void __ksm_exit(struct mm_struct *mm);

 static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 {

@@ -41,11 +40,10 @@
 	return atomic_read(&mm->mm_users) == 0;
 }

-static inline void ksm_exit(struct mm_struct *mm,
-		struct mmu_gather **tlbp, unsigned long end)
+static inline void ksm_exit(struct mm_struct *mm)
 {
 	if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
-		__ksm_exit(mm, tlbp, end);
+		__ksm_exit(mm);
 }

 /*

@@ -86,8 +84,7 @@
 	return 0;
 }

-static inline void ksm_exit(struct mm_struct *mm,
-		struct mmu_gather **tlbp, unsigned long end)
+static inline void ksm_exit(struct mm_struct *mm)
 {
 }

@@ -501,6 +501,7 @@

 	if (atomic_dec_and_test(&mm->mm_users)) {
 		exit_aio(mm);
+		ksm_exit(mm);
 		exit_mmap(mm);
 		set_mm_exe_file(mm, NULL);
 		if (!list_empty(&mm->mmlist)) {

@@ -1416,8 +1416,7 @@
 	return 0;
 }

-void __ksm_exit(struct mm_struct *mm,
-		struct mmu_gather **tlbp, unsigned long end)
+void __ksm_exit(struct mm_struct *mm)
 {
 	struct mm_slot *mm_slot;
 	int easy_to_free = 0;

@@ -1450,10 +1449,8 @@
 		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
 		mmdrop(mm);
 	} else if (mm_slot) {
-		tlb_finish_mmu(*tlbp, 0, end);
 		down_write(&mm->mmap_sem);
 		up_write(&mm->mmap_sem);
-		*tlbp = tlb_gather_mmu(mm, 1);
 	}
 }

@@ -2648,7 +2648,7 @@
 	entry = maybe_mkwrite(pte_mkdirty(entry), vma);

 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-	if (!pte_none(*page_table) || ksm_test_exit(mm))
+	if (!pte_none(*page_table))
 		goto release;

 	inc_mm_counter(mm, anon_rss);

@@ -2792,7 +2792,7 @@
 	 * handle that later.
 	 */
 	/* Only go through if we didn't race with anybody else... */
-	if (likely(pte_same(*page_table, orig_pte) && !ksm_test_exit(mm))) {
+	if (likely(pte_same(*page_table, orig_pte))) {
 		flush_icache_page(vma, page);
 		entry = mk_pte(page, vma->vm_page_prot);
 		if (flags & FAULT_FLAG_WRITE)

@@ -2113,13 +2113,6 @@
 	end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);

-	/*
-	 * For KSM to handle OOM without deadlock when it's breaking COW in a
-	 * likely victim of the OOM killer, we must serialize with ksm_exit()
-	 * after freeing mm's pages but before freeing its page tables.
-	 */
-	ksm_exit(mm, &tlb, end);
-
 	free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
 	tlb_finish_mmu(tlb, 0, end);