Commit 751efd8610d3d7d67b7bdf7f62646edea7365dd7

Authored by Robin Holt
Committed by Linus Torvalds
1 parent c1f1949527

mmu_notifier_unregister NULL Pointer deref and multiple ->release() callouts

There is a race condition between mmu_notifier_unregister() and
__mmu_notifier_release().

Assume two tasks, one calling mmu_notifier_unregister() as a result of a
filp_close() ->flush() callout (task A), and the other calling
mmu_notifier_release() from an mmput() (task B).

                A                               B
t1                                              srcu_read_lock()
t2              if (!hlist_unhashed())
t3                                              srcu_read_unlock()
t4              srcu_read_lock()
t5                                              hlist_del_init_rcu()
t6                                              synchronize_srcu()
t7              srcu_read_unlock()
t8              hlist_del_rcu()  <--- NULL pointer deref.

Additionally, the list traversal in __mmu_notifier_release() is not
protected by the by the mmu_notifier_mm->hlist_lock which can result in
callouts to the ->release() notifier from both mmu_notifier_unregister()
and __mmu_notifier_release().

-stable suggestions:

The stable trees prior to 3.7.y need commits 21a92735f660 and
70400303ce0c cherry-picked in that order prior to cherry-picking this
commit.  The 3.7.y tree already has those two commits.

Signed-off-by: Robin Holt <holt@sgi.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Cc: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Sagi Grimberg <sagig@mellanox.co.il>
Cc: Haggai Eran <haggaie@mellanox.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 42 additions and 40 deletions Side-by-side Diff

... ... @@ -37,50 +37,52 @@
37 37 void __mmu_notifier_release(struct mm_struct *mm)
38 38 {
39 39 struct mmu_notifier *mn;
40   - struct hlist_node *n;
41 40 int id;
42 41  
43 42 /*
44   - * SRCU here will block mmu_notifier_unregister until
45   - * ->release returns.
  43 + * srcu_read_lock() here will block synchronize_srcu() in
  44 + * mmu_notifier_unregister() until all registered
  45 + * ->release() callouts this function makes have
  46 + * returned.
46 47 */
47 48 id = srcu_read_lock(&srcu);
48   - hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
49   - /*
50   - * if ->release runs before mmu_notifier_unregister it
51   - * must be handled as it's the only way for the driver
52   - * to flush all existing sptes and stop the driver
53   - * from establishing any more sptes before all the
54   - * pages in the mm are freed.
55   - */
56   - if (mn->ops->release)
57   - mn->ops->release(mn, mm);
58   - srcu_read_unlock(&srcu, id);
59   -
60 49 spin_lock(&mm->mmu_notifier_mm->lock);
61 50 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
62 51 mn = hlist_entry(mm->mmu_notifier_mm->list.first,
63 52 struct mmu_notifier,
64 53 hlist);
  54 +
65 55 /*
66   - * We arrived before mmu_notifier_unregister so
67   - * mmu_notifier_unregister will do nothing other than
68   - * to wait ->release to finish and
69   - * mmu_notifier_unregister to return.
  56 + * Unlink. This will prevent mmu_notifier_unregister()
  57 + * from also making the ->release() callout.
70 58 */
71 59 hlist_del_init_rcu(&mn->hlist);
  60 + spin_unlock(&mm->mmu_notifier_mm->lock);
  61 +
  62 + /*
  63 + * Clear sptes. (see 'release' description in mmu_notifier.h)
  64 + */
  65 + if (mn->ops->release)
  66 + mn->ops->release(mn, mm);
  67 +
  68 + spin_lock(&mm->mmu_notifier_mm->lock);
72 69 }
73 70 spin_unlock(&mm->mmu_notifier_mm->lock);
74 71  
75 72 /*
76   - * synchronize_srcu here prevents mmu_notifier_release to
77   - * return to exit_mmap (which would proceed freeing all pages
78   - * in the mm) until the ->release method returns, if it was
79   - * invoked by mmu_notifier_unregister.
80   - *
81   - * The mmu_notifier_mm can't go away from under us because one
82   - * mm_count is hold by exit_mmap.
  73 + * All callouts to ->release() which we have done are complete.
  74 + * Allow synchronize_srcu() in mmu_notifier_unregister() to complete
83 75 */
  76 + srcu_read_unlock(&srcu, id);
  77 +
  78 + /*
  79 + * mmu_notifier_unregister() may have unlinked a notifier and may
  80 + * still be calling out to it. Additionally, other notifiers
  81 + * may have been active via vmtruncate() et. al. Block here
  82 + * to ensure that all notifier callouts for this mm have been
  83 + * completed and the sptes are really cleaned up before returning
  84 + * to exit_mmap().
  85 + */
84 86 synchronize_srcu(&srcu);
85 87 }
86 88  
87 89  
88 90  
89 91  
90 92  
91 93  
92 94  
93 95  
94 96  
... ... @@ -294,31 +296,31 @@
294 296 {
295 297 BUG_ON(atomic_read(&mm->mm_count) <= 0);
296 298  
  299 + spin_lock(&mm->mmu_notifier_mm->lock);
297 300 if (!hlist_unhashed(&mn->hlist)) {
298   - /*
299   - * SRCU here will force exit_mmap to wait ->release to finish
300   - * before freeing the pages.
301   - */
302 301 int id;
303 302  
304   - id = srcu_read_lock(&srcu);
305 303 /*
306   - * exit_mmap will block in mmu_notifier_release to
307   - * guarantee ->release is called before freeing the
308   - * pages.
  304 + * Ensure we synchronize up with __mmu_notifier_release().
309 305 */
  306 + id = srcu_read_lock(&srcu);
  307 +
  308 + hlist_del_rcu(&mn->hlist);
  309 + spin_unlock(&mm->mmu_notifier_mm->lock);
  310 +
310 311 if (mn->ops->release)
311 312 mn->ops->release(mn, mm);
312   - srcu_read_unlock(&srcu, id);
313 313  
314   - spin_lock(&mm->mmu_notifier_mm->lock);
315   - hlist_del_rcu(&mn->hlist);
  314 + /*
  315 + * Allow __mmu_notifier_release() to complete.
  316 + */
  317 + srcu_read_unlock(&srcu, id);
  318 + } else
316 319 spin_unlock(&mm->mmu_notifier_mm->lock);
317   - }
318 320  
319 321 /*
320   - * Wait any running method to finish, of course including
321   - * ->release if it was run by mmu_notifier_relase instead of us.
  322 + * Wait for any running method to finish, including ->release() if it
  323 + * was run by __mmu_notifier_release() instead of us.
322 324 */
323 325 synchronize_srcu(&srcu);
324 326