Commit 3ad3d901bbcfb15a5e4690e55350db0899095a68

Authored by Xiao Guangrong
Committed by Linus Torvalds
1 parent bdf4f4d216

mm: mmu_notifier: fix freed page still mapped in secondary MMU

mmu_notifier_release() is called when the process is exiting.  It will
delete all the mmu notifiers.  But at this time the page belonging to the
process is still present in page tables and is present on the LRU list, so
this race will happen:

      CPU 0                 CPU 1
mmu_notifier_release:    try_to_unmap:
   hlist_del_init_rcu(&mn->hlist);
                            ptep_clear_flush_notify:
                                  mmu notifier not found
                            free page  !!!!!!
                            /*
                             * At this point, the page has been
                             * freed, but it is still mapped in
                             * the secondary MMU.
                             */

  mn->ops->release(mn, mm);

Then the box is not stable and sometimes we can get this bug:

[  738.075923] BUG: Bad page state in process migrate-perf  pfn:03bec
[  738.075931] page:ffffea00000efb00 count:0 mapcount:0 mapping:          (null) index:0x8076
[  738.075936] page flags: 0x20000000000014(referenced|dirty)

The same issue is present in mmu_notifier_unregister().

We can call ->release before deleting the notifier to ensure the page has
been unmapped from the secondary MMU before it is freed.

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 23 additions and 22 deletions Side-by-side Diff

... ... @@ -33,7 +33,25 @@
33 33 void __mmu_notifier_release(struct mm_struct *mm)
34 34 {
35 35 struct mmu_notifier *mn;
  36 + struct hlist_node *n;
36 37  
  38 + /*
  39 + * RCU here will block mmu_notifier_unregister until
  40 + * ->release returns.
  41 + */
  42 + rcu_read_lock();
  43 + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
  44 + /*
  45 + * if ->release runs before mmu_notifier_unregister it
  46 + * must be handled as it's the only way for the driver
  47 + * to flush all existing sptes and stop the driver
  48 + * from establishing any more sptes before all the
  49 + * pages in the mm are freed.
  50 + */
  51 + if (mn->ops->release)
  52 + mn->ops->release(mn, mm);
  53 + rcu_read_unlock();
  54 +
37 55 spin_lock(&mm->mmu_notifier_mm->lock);
38 56 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
39 57 mn = hlist_entry(mm->mmu_notifier_mm->list.first,
... ... @@ -46,23 +64,6 @@
46 64 * mmu_notifier_unregister to return.
47 65 */
48 66 hlist_del_init_rcu(&mn->hlist);
49   - /*
50   - * RCU here will block mmu_notifier_unregister until
51   - * ->release returns.
52   - */
53   - rcu_read_lock();
54   - spin_unlock(&mm->mmu_notifier_mm->lock);
55   - /*
56   - * if ->release runs before mmu_notifier_unregister it
57   - * must be handled as it's the only way for the driver
58   - * to flush all existing sptes and stop the driver
59   - * from establishing any more sptes before all the
60   - * pages in the mm are freed.
61   - */
62   - if (mn->ops->release)
63   - mn->ops->release(mn, mm);
64   - rcu_read_unlock();
65   - spin_lock(&mm->mmu_notifier_mm->lock);
66 67 }
67 68 spin_unlock(&mm->mmu_notifier_mm->lock);
68 69  
69 70  
70 71  
... ... @@ -284,16 +285,13 @@
284 285 {
285 286 BUG_ON(atomic_read(&mm->mm_count) <= 0);
286 287  
287   - spin_lock(&mm->mmu_notifier_mm->lock);
288 288 if (!hlist_unhashed(&mn->hlist)) {
289   - hlist_del_rcu(&mn->hlist);
290   -
291 289 /*
292 290 * RCU here will force exit_mmap to wait ->release to finish
293 291 * before freeing the pages.
294 292 */
295 293 rcu_read_lock();
296   - spin_unlock(&mm->mmu_notifier_mm->lock);
  294 +
297 295 /*
298 296 * exit_mmap will block in mmu_notifier_release to
299 297 * guarantee ->release is called before freeing the
300 298  
... ... @@ -302,8 +300,11 @@
302 300 if (mn->ops->release)
303 301 mn->ops->release(mn, mm);
304 302 rcu_read_unlock();
305   - } else
  303 +
  304 + spin_lock(&mm->mmu_notifier_mm->lock);
  305 + hlist_del_rcu(&mn->hlist);
306 306 spin_unlock(&mm->mmu_notifier_mm->lock);
  307 + }
307 308  
308 309 /*
309 310 * Wait any running method to finish, of course including