Commit 64141da587241301ce8638cc945f8b67853156ec

Authored by Jeremy Fitzhardinge
Committed by Linus Torvalds
1 parent 853ff88324

vmalloc: eagerly clear ptes on vunmap

On stock 2.6.37-rc4, running:

  # mount lilith:/export /mnt/lilith
  # find  /mnt/lilith/ -type f -print0 | xargs -0 file

crashes the machine fairly quickly under Xen.  Often it results in oops
messages, but the couple of times I tried just now, it just hung quietly
and made Xen print some rude messages:

    (XEN) mm.c:2389:d80 Bad type (saw 7400000000000001 != exp
    3000000000000000) for mfn 1d7058 (pfn 18fa7)
    (XEN) mm.c:964:d80 Attempt to create linear p.t. with write perms
    (XEN) mm.c:2389:d80 Bad type (saw 7400000000000010 != exp
    1000000000000000) for mfn 1d2e04 (pfn 1d1fb)
    (XEN) mm.c:2965:d80 Error while pinning mfn 1d2e04

This means the domain tried to map a pagetable page RW, which would
allow it to map arbitrary memory, so Xen stopped it.  This is because
vm_unmap_ram() left some pages mapped in the vmalloc area after NFS had
finished with them, and those pages got recycled as pagetable pages
while still having these RW aliases.

Removing those mappings immediately removes the Xen-visible aliases, and
so it has no problem with those pages being reused as pagetable pages.
Deferring the TLB flush doesn't upset Xen because it can flush the TLB
itself as needed to maintain its invariants.

When unmapping a region in the vmalloc space, clear the ptes
immediately.  There's no point in deferring this because there's no
amortization benefit.

The TLBs are left dirty, and they are flushed lazily to amortize the
cost of the IPIs.
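
Concretely, the mm/vmalloc.c change splits the old helper in two, so the
pte clearing is no longer tied to the lazy purge.  A condensed sketch of
the resulting helpers (the purge-threshold check is paraphrased from the
surrounding code rather than shown in the hunks below):

  /* Queue a vmap area for lazy freeing; the caller has already cleared
   * its ptes, so only the TLB flush is deferred and batched. */
  static void free_vmap_area_noflush(struct vmap_area *va)
  {
          va->flags |= VM_LAZY_FREE;
          atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT,
                     &vmap_lazy_nr);
          if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
                  try_purge_vmap_area_lazy();
  }

  /* Clear the ptes eagerly, then hand the area to the lazy-free path. */
  static void free_unmap_vmap_area_noflush(struct vmap_area *va)
  {
          unmap_vmap_area(va);
          free_vmap_area_noflush(va);
  }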

The specific motivation for this patch is an oops-causing regression
since 2.6.36 when using NFS under Xen, triggered by the NFS client's use
of vm_map_ram() introduced in 56e4ebf877b60 ("NFS: readdir with vmapped
pages").  XFS also uses vm_map_ram() and could cause similar problems.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Bryan Schumaker <bjschuma@netapp.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Alex Elder <aelder@sgi.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 3 changed files with 17 additions and 15 deletions

arch/x86/xen/mmu.c
... ... @@ -2415,8 +2415,6 @@
2415 2415 x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
2416 2416 pv_mmu_ops = xen_mmu_ops;
2417 2417  
2418   - vmap_lazy_unmap = false;
2419   -
2420 2418 memset(dummy_mapping, 0xff, PAGE_SIZE);
2421 2419 }
2422 2420  
include/linux/vmalloc.h
... ... @@ -7,8 +7,6 @@
7 7  
8 8 struct vm_area_struct; /* vma defining user mapping in mm_types.h */
9 9  
10   -extern bool vmap_lazy_unmap;
11   -
12 10 /* bits in flags of vmalloc's vm_struct below */
13 11 #define VM_IOREMAP 0x00000001 /* ioremap() and friends */
14 12 #define VM_ALLOC 0x00000002 /* vmalloc() */
mm/vmalloc.c
... ... @@ -31,8 +31,6 @@
31 31 #include <asm/tlbflush.h>
32 32 #include <asm/shmparam.h>
33 33  
34   -bool vmap_lazy_unmap __read_mostly = true;
35   -
36 34 /*** Page table manipulation functions ***/
37 35  
38 36 static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
... ... @@ -503,9 +501,6 @@
503 501 {
504 502 unsigned int log;
505 503  
506   - if (!vmap_lazy_unmap)
507   - return 0;
508   -
509 504 log = fls(num_online_cpus());
510 505  
511 506 return log * (32UL * 1024 * 1024 / PAGE_SIZE);
... ... @@ -566,7 +561,6 @@
566 561 if (va->va_end > *end)
567 562 *end = va->va_end;
568 563 nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
569   - unmap_vmap_area(va);
570 564 list_add_tail(&va->purge_list, &valist);
571 565 va->flags |= VM_LAZY_FREEING;
572 566 va->flags &= ~VM_LAZY_FREE;
573 567  
... ... @@ -611,10 +605,11 @@
611 605 }
612 606  
613 607 /*
614   - * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
615   - * called for the correct range previously.
  608 + * Free a vmap area, caller ensuring that the area has been unmapped
  609 + * and flush_cache_vunmap had been called for the correct range
  610 + * previously.
616 611 */
617   -static void free_unmap_vmap_area_noflush(struct vmap_area *va)
  612 +static void free_vmap_area_noflush(struct vmap_area *va)
618 613 {
619 614 va->flags |= VM_LAZY_FREE;
620 615 atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
... ... @@ -623,6 +618,16 @@
623 618 }
624 619  
625 620 /*
  621 + * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
  622 + * called for the correct range previously.
  623 + */
  624 +static void free_unmap_vmap_area_noflush(struct vmap_area *va)
  625 +{
  626 + unmap_vmap_area(va);
  627 + free_vmap_area_noflush(va);
  628 +}
  629 +
  630 +/*
626 631 * Free and unmap a vmap area
627 632 */
628 633 static void free_unmap_vmap_area(struct vmap_area *va)
... ... @@ -798,7 +803,7 @@
798 803 spin_unlock(&vmap_block_tree_lock);
799 804 BUG_ON(tmp != vb);
800 805  
801   - free_unmap_vmap_area_noflush(vb->va);
  806 + free_vmap_area_noflush(vb->va);
802 807 call_rcu(&vb->rcu_head, rcu_free_vb);
803 808 }
804 809  
... ... @@ -936,6 +941,8 @@
936 941 rcu_read_unlock();
937 942 BUG_ON(!vb);
938 943  
  944 + vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
  945 +
939 946 spin_lock(&vb->lock);
940 947 BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order));
941 948  
... ... @@ -988,7 +995,6 @@
988 995  
989 996 s = vb->va->va_start + (i << PAGE_SHIFT);
990 997 e = vb->va->va_start + (j << PAGE_SHIFT);
991   - vunmap_page_range(s, e);
992 998 flush = 1;
993 999  
994 1000 if (s < start)