Commit e0da382c92626ad1d7f4b7527d19b80104d67a83

Authored by Hugh Dickins
Committed by Linus Torvalds
1 parent 9f6c6fc505

[PATCH] freepgt: free_pgtables use vma list

Recent woes with some arches needing their own pgd_addr_end macro; the 4-level
clear_page_range regression since 2.6.10's clear_page_tables; and its
long-standing, well-known inefficiency of searching throughout the higher-level
page tables for those few entries to clear and free: all can be blamed on
ignoring the list of vmas when we free page tables.

Replace exit_mmap's clear_page_range of the total user address space by
free_pgtables operating on the mm's vma list; have unmap_region use it in the
same way, giving it a floor and ceiling beyond which it may not free tables.
This brings lmbench fork/exec/sh numbers back to 2.6.10 (unless preempt is
enabled, in which case latency fixes spoil unmap_vmas throughput).
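
To illustrate the new calling convention, here is a minimal standalone sketch
(userspace C, not kernel code; the struct, the PMD_SIZE value and the addresses
are made up for the example) of how free_pgtables walks the vma list, gathers
neighbouring vmas, and derives each call's floor and ceiling:

#include <stdio.h>

#define PMD_SIZE  (1UL << 21)	/* illustrative pmd span */

struct vma {
	unsigned long vm_start, vm_end;
	struct vma *vm_next;
};

/* Mirrors the free_pgtables() loop added in mm/memory.c below */
static void walk(struct vma *vma, unsigned long floor, unsigned long ceiling)
{
	while (vma) {
		struct vma *next = vma->vm_next;
		unsigned long addr = vma->vm_start;

		/* gather nearby vmas into a single call down */
		while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
			vma = next;
			next = vma->vm_next;
		}
		printf("free range %#lx..%#lx  floor %#lx  ceiling %#lx\n",
			addr, vma->vm_end,
			floor, next ? next->vm_start : ceiling);
		vma = next;
	}
}

int main(void)
{
	struct vma c = { 0x40000000, 0x40200000, NULL };
	struct vma b = { 0x00600000, 0x00800000, &c };
	struct vma a = { 0x00400000, 0x00500000, &b };

	walk(&a, 0, 0);	/* exit_mmap-style call: whole address space */
	return 0;
}

In the patch itself, unmap_region passes prev->vm_end as floor and the
following vma's vm_start as ceiling, while exit_mmap passes 0 for both,
meaning bottom and top of the address space; so a table still shared with a
neighbouring mapping is never freed.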

Beware: the do_mmap_pgoff driver failure case must now use unmap_region
instead of zap_page_range, since a page table might have been allocated, and
it can only be freed while it is touched by some vma.
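
The change itself is in the mm/mmap.c hunk below; restated briefly, the
failure path goes from clearing ptes to undoing the whole region:

	/* old: zap_page_range() cleared the ptes but left behind any
	 * page table just allocated for this vma */
	zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);

	/* new: unmap_region() also calls free_pgtables() while the vma
	 * still covers the range, so the table can be freed too */
	unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
	charged = 0;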

Move free_pgtables from mmap.c to memory.c, where its lower levels are adapted
from the clear_page_range levels.  (Most of free_pgtables' old code was
actually for a non-existent case, prev not properly set up, dating from before
hch gave us split_vma.) Pass mmu_gather** in the public interfaces, since we
might want to add latency lockdrops later; but no attempt is made to do so
yet: going by vma should itself reduce latency.
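
The new declaration (as in the include/linux/mm.h hunk below) takes the
mmu_gather by reference so that a later patch could drop the page_table_lock
mid-walk and hand a fresh gather back through the same pointer; this patch
itself never reassigns *tlb:

void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
		unsigned long floor, unsigned long ceiling);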

But what if is_hugepage_only_range?  Those ia64 and ppc64 cases need careful
examination: put that off until a later patch of the series.

What of the 32-bit vdso page which x86_64's __map_syscall32 maps outside any vma?

And what of the range passed to sparc64's flush_tlb_pgtables?  It's less clear
to me now that we need to do more than is done here: every PMD_SIZE ever
occupied will be flushed, but do we really have to flush every PGDIR_SIZE ever
partially occupied?  A shame to complicate it unnecessarily.

Special thanks to David Miller for time spent repairing my ceilings.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

5 changed files with 141 additions and 155 deletions

arch/i386/mm/pgtable.c
... ... @@ -255,7 +255,7 @@
255 255 if (PTRS_PER_PMD > 1)
256 256 for (i = 0; i < USER_PTRS_PER_PGD; ++i)
257 257 kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
258   - /* in the non-PAE case, clear_page_range() clears user pgd entries */
  258 + /* in the non-PAE case, free_pgtables() clears user pgd entries */
259 259 kmem_cache_free(pgd_cache, pgd);
260 260 }
arch/ia64/mm/hugetlbpage.c
... ... @@ -187,45 +187,12 @@
187 187 }
188 188  
189 189 /*
190   - * Same as generic free_pgtables(), except constant PGDIR_* and pgd_offset
191   - * are hugetlb region specific.
  190 + * Do nothing, until we've worked out what to do! To allow build, we
  191 + * must remove reference to clear_page_range since it no longer exists.
192 192 */
193 193 void hugetlb_free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
194 194 unsigned long start, unsigned long end)
195 195 {
196   - unsigned long first = start & HUGETLB_PGDIR_MASK;
197   - unsigned long last = end + HUGETLB_PGDIR_SIZE - 1;
198   - struct mm_struct *mm = tlb->mm;
199   -
200   - if (!prev) {
201   - prev = mm->mmap;
202   - if (!prev)
203   - goto no_mmaps;
204   - if (prev->vm_end > start) {
205   - if (last > prev->vm_start)
206   - last = prev->vm_start;
207   - goto no_mmaps;
208   - }
209   - }
210   - for (;;) {
211   - struct vm_area_struct *next = prev->vm_next;
212   -
213   - if (next) {
214   - if (next->vm_start < start) {
215   - prev = next;
216   - continue;
217   - }
218   - if (last > next->vm_start)
219   - last = next->vm_start;
220   - }
221   - if (prev->vm_end > first)
222   - first = prev->vm_end;
223   - break;
224   - }
225   -no_mmaps:
226   - if (last < first) /* for arches with discontiguous pgd indices */
227   - return;
228   - clear_page_range(tlb, first, last);
229 196 }
230 197  
231 198 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end)
include/linux/mm.h
... ... @@ -592,7 +592,8 @@
592 592 struct vm_area_struct *start_vma, unsigned long start_addr,
593 593 unsigned long end_addr, unsigned long *nr_accounted,
594 594 struct zap_details *);
595   -void clear_page_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end);
  595 +void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
  596 + unsigned long floor, unsigned long ceiling);
596 597 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
597 598 struct vm_area_struct *vma);
598 599 int zeromap_page_range(struct vm_area_struct *vma, unsigned long from,
mm/memory.c
... ... @@ -110,87 +110,165 @@
110 110 * Note: this doesn't free the actual pages themselves. That
111 111 * has been handled earlier when unmapping all the memory regions.
112 112 */
113   -static inline void clear_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
114   - unsigned long addr, unsigned long end)
  113 +static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
115 114 {
116   - if (!((addr | end) & ~PMD_MASK)) {
117   - /* Only free fully aligned ranges */
118   - struct page *page = pmd_page(*pmd);
119   - pmd_clear(pmd);
120   - dec_page_state(nr_page_table_pages);
121   - tlb->mm->nr_ptes--;
122   - pte_free_tlb(tlb, page);
123   - }
  115 + struct page *page = pmd_page(*pmd);
  116 + pmd_clear(pmd);
  117 + pte_free_tlb(tlb, page);
  118 + dec_page_state(nr_page_table_pages);
  119 + tlb->mm->nr_ptes--;
124 120 }
125 121  
126   -static inline void clear_pmd_range(struct mmu_gather *tlb, pud_t *pud,
127   - unsigned long addr, unsigned long end)
  122 +static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
  123 + unsigned long addr, unsigned long end,
  124 + unsigned long floor, unsigned long ceiling)
128 125 {
129 126 pmd_t *pmd;
130 127 unsigned long next;
131   - pmd_t *empty_pmd = NULL;
  128 + unsigned long start;
132 129  
  130 + start = addr;
133 131 pmd = pmd_offset(pud, addr);
134   -
135   - /* Only free fully aligned ranges */
136   - if (!((addr | end) & ~PUD_MASK))
137   - empty_pmd = pmd;
138 132 do {
139 133 next = pmd_addr_end(addr, end);
140 134 if (pmd_none_or_clear_bad(pmd))
141 135 continue;
142   - clear_pte_range(tlb, pmd, addr, next);
  136 + free_pte_range(tlb, pmd);
143 137 } while (pmd++, addr = next, addr != end);
144 138  
145   - if (empty_pmd) {
146   - pud_clear(pud);
147   - pmd_free_tlb(tlb, empty_pmd);
  139 + start &= PUD_MASK;
  140 + if (start < floor)
  141 + return;
  142 + if (ceiling) {
  143 + ceiling &= PUD_MASK;
  144 + if (!ceiling)
  145 + return;
148 146 }
  147 + if (end - 1 > ceiling - 1)
  148 + return;
  149 +
  150 + pmd = pmd_offset(pud, start);
  151 + pud_clear(pud);
  152 + pmd_free_tlb(tlb, pmd);
149 153 }
150 154  
151   -static inline void clear_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
152   - unsigned long addr, unsigned long end)
  155 +static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
  156 + unsigned long addr, unsigned long end,
  157 + unsigned long floor, unsigned long ceiling)
153 158 {
154 159 pud_t *pud;
155 160 unsigned long next;
156   - pud_t *empty_pud = NULL;
  161 + unsigned long start;
157 162  
  163 + start = addr;
158 164 pud = pud_offset(pgd, addr);
159   -
160   - /* Only free fully aligned ranges */
161   - if (!((addr | end) & ~PGDIR_MASK))
162   - empty_pud = pud;
163 165 do {
164 166 next = pud_addr_end(addr, end);
165 167 if (pud_none_or_clear_bad(pud))
166 168 continue;
167   - clear_pmd_range(tlb, pud, addr, next);
  169 + free_pmd_range(tlb, pud, addr, next, floor, ceiling);
168 170 } while (pud++, addr = next, addr != end);
169 171  
170   - if (empty_pud) {
171   - pgd_clear(pgd);
172   - pud_free_tlb(tlb, empty_pud);
  172 + start &= PGDIR_MASK;
  173 + if (start < floor)
  174 + return;
  175 + if (ceiling) {
  176 + ceiling &= PGDIR_MASK;
  177 + if (!ceiling)
  178 + return;
173 179 }
  180 + if (end - 1 > ceiling - 1)
  181 + return;
  182 +
  183 + pud = pud_offset(pgd, start);
  184 + pgd_clear(pgd);
  185 + pud_free_tlb(tlb, pud);
174 186 }
175 187  
176 188 /*
177   - * This function clears user-level page tables of a process.
178   - * Unlike other pagetable walks, some memory layouts might give end 0.
  189 + * This function frees user-level page tables of a process.
  190 + *
179 191 * Must be called with pagetable lock held.
180 192 */
181   -void clear_page_range(struct mmu_gather *tlb,
182   - unsigned long addr, unsigned long end)
  193 +static inline void free_pgd_range(struct mmu_gather *tlb,
  194 + unsigned long addr, unsigned long end,
  195 + unsigned long floor, unsigned long ceiling)
183 196 {
184 197 pgd_t *pgd;
185 198 unsigned long next;
  199 + unsigned long start;
186 200  
  201 + /*
  202 + * The next few lines have given us lots of grief...
  203 + *
  204 + * Why are we testing PMD* at this top level? Because often
  205 + * there will be no work to do at all, and we'd prefer not to
  206 + * go all the way down to the bottom just to discover that.
  207 + *
  208 + * Why all these "- 1"s? Because 0 represents both the bottom
  209 + * of the address space and the top of it (using -1 for the
  210 + * top wouldn't help much: the masks would do the wrong thing).
  211 + * The rule is that addr 0 and floor 0 refer to the bottom of
  212 + * the address space, but end 0 and ceiling 0 refer to the top
  213 + * Comparisons need to use "end - 1" and "ceiling - 1" (though
  214 + * that end 0 case should be mythical).
  215 + *
  216 + * Wherever addr is brought up or ceiling brought down, we must
  217 + * be careful to reject "the opposite 0" before it confuses the
  218 + * subsequent tests. But what about where end is brought down
  219 + * by PMD_SIZE below? no, end can't go down to 0 there.
  220 + *
  221 + * Whereas we round start (addr) and ceiling down, by different
  222 + * masks at different levels, in order to test whether a table
  223 + * now has no other vmas using it, so can be freed, we don't
  224 + * bother to round floor or end up - the tests don't need that.
  225 + */
  226 +
  227 + addr &= PMD_MASK;
  228 + if (addr < floor) {
  229 + addr += PMD_SIZE;
  230 + if (!addr)
  231 + return;
  232 + }
  233 + if (ceiling) {
  234 + ceiling &= PMD_MASK;
  235 + if (!ceiling)
  236 + return;
  237 + }
  238 + if (end - 1 > ceiling - 1)
  239 + end -= PMD_SIZE;
  240 + if (addr > end - 1)
  241 + return;
  242 +
  243 + start = addr;
187 244 pgd = pgd_offset(tlb->mm, addr);
188 245 do {
189 246 next = pgd_addr_end(addr, end);
190 247 if (pgd_none_or_clear_bad(pgd))
191 248 continue;
192   - clear_pud_range(tlb, pgd, addr, next);
  249 + free_pud_range(tlb, pgd, addr, next, floor, ceiling);
193 250 } while (pgd++, addr = next, addr != end);
  251 +
  252 + if (!tlb_is_full_mm(tlb))
  253 + flush_tlb_pgtables(tlb->mm, start, end);
  254 +}
  255 +
  256 +void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
  257 + unsigned long floor, unsigned long ceiling)
  258 +{
  259 + while (vma) {
  260 + struct vm_area_struct *next = vma->vm_next;
  261 + unsigned long addr = vma->vm_start;
  262 +
  263 + /* Optimization: gather nearby vmas into a single call down */
  264 + while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
  265 + vma = next;
  266 + next = vma->vm_next;
  267 + }
  268 + free_pgd_range(*tlb, addr, vma->vm_end,
  269 + floor, next? next->vm_start: ceiling);
  270 + vma = next;
  271 + }
194 272 }
195 273  
196 274 pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
mm/mmap.c
... ... @@ -29,6 +29,10 @@
29 29 #include <asm/cacheflush.h>
30 30 #include <asm/tlb.h>
31 31  
  32 +static void unmap_region(struct mm_struct *mm,
  33 + struct vm_area_struct *vma, struct vm_area_struct *prev,
  34 + unsigned long start, unsigned long end);
  35 +
32 36 /*
33 37 * WARNING: the debugging will use recursive algorithms so never enable this
34 38 * unless you know what you are doing.
... ... @@ -1129,7 +1133,8 @@
1129 1133 fput(file);
1130 1134  
1131 1135 /* Undo any partial mapping done by a device driver. */
1132   - zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
  1136 + unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
  1137 + charged = 0;
1133 1138 free_vma:
1134 1139 kmem_cache_free(vm_area_cachep, vma);
1135 1140 unacct_error:
... ... @@ -1572,66 +1577,6 @@
1572 1577 }
1573 1578 #endif
1574 1579  
1575   -/*
1576   - * Try to free as many page directory entries as we can,
1577   - * without having to work very hard at actually scanning
1578   - * the page tables themselves.
1579   - *
1580   - * Right now we try to free page tables if we have a nice
1581   - * PGDIR-aligned area that got free'd up. We could be more
1582   - * granular if we want to, but this is fast and simple,
1583   - * and covers the bad cases.
1584   - *
1585   - * "prev", if it exists, points to a vma before the one
1586   - * we just free'd - but there's no telling how much before.
1587   - */
1588   -static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
1589   - unsigned long start, unsigned long end)
1590   -{
1591   - unsigned long first = start & PGDIR_MASK;
1592   - unsigned long last = end + PGDIR_SIZE - 1;
1593   - struct mm_struct *mm = tlb->mm;
1594   -
1595   - if (last > MM_VM_SIZE(mm) || last < end)
1596   - last = MM_VM_SIZE(mm);
1597   -
1598   - if (!prev) {
1599   - prev = mm->mmap;
1600   - if (!prev)
1601   - goto no_mmaps;
1602   - if (prev->vm_end > start) {
1603   - if (last > prev->vm_start)
1604   - last = prev->vm_start;
1605   - goto no_mmaps;
1606   - }
1607   - }
1608   - for (;;) {
1609   - struct vm_area_struct *next = prev->vm_next;
1610   -
1611   - if (next) {
1612   - if (next->vm_start < start) {
1613   - prev = next;
1614   - continue;
1615   - }
1616   - if (last > next->vm_start)
1617   - last = next->vm_start;
1618   - }
1619   - if (prev->vm_end > first)
1620   - first = prev->vm_end;
1621   - break;
1622   - }
1623   -no_mmaps:
1624   - if (last < first) /* for arches with discontiguous pgd indices */
1625   - return;
1626   - if (first < FIRST_USER_PGD_NR * PGDIR_SIZE)
1627   - first = FIRST_USER_PGD_NR * PGDIR_SIZE;
1628   - /* No point trying to free anything if we're in the same pte page */
1629   - if ((first & PMD_MASK) < (last & PMD_MASK)) {
1630   - clear_page_range(tlb, first, last);
1631   - flush_tlb_pgtables(mm, first, last);
1632   - }
1633   -}
1634   -
1635 1580 /* Normal function to fix up a mapping
1636 1581 * This function is the default for when an area has no specific
1637 1582 * function. This may be used as part of a more specific routine.
1638 1583  
1639 1584  
1640 1585  
1641 1586  
... ... @@ -1674,24 +1619,22 @@
1674 1619 * Called with the page table lock held.
1675 1620 */
1676 1621 static void unmap_region(struct mm_struct *mm,
1677   - struct vm_area_struct *vma,
1678   - struct vm_area_struct *prev,
1679   - unsigned long start,
1680   - unsigned long end)
  1622 + struct vm_area_struct *vma, struct vm_area_struct *prev,
  1623 + unsigned long start, unsigned long end)
1681 1624 {
  1625 + struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
1682 1626 struct mmu_gather *tlb;
1683 1627 unsigned long nr_accounted = 0;
1684 1628  
1685 1629 lru_add_drain();
  1630 + spin_lock(&mm->page_table_lock);
1686 1631 tlb = tlb_gather_mmu(mm, 0);
1687 1632 unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
1688 1633 vm_unacct_memory(nr_accounted);
1689   -
1690   - if (is_hugepage_only_range(mm, start, end - start))
1691   - hugetlb_free_pgtables(tlb, prev, start, end);
1692   - else
1693   - free_pgtables(tlb, prev, start, end);
  1634 + free_pgtables(&tlb, vma, prev? prev->vm_end: 0,
  1635 + next? next->vm_start: 0);
1694 1636 tlb_finish_mmu(tlb, start, end);
  1637 + spin_unlock(&mm->page_table_lock);
1695 1638 }
1696 1639  
1697 1640 /*
1698 1641  
... ... @@ -1823,9 +1766,7 @@
1823 1766 * Remove the vma's, and unmap the actual pages
1824 1767 */
1825 1768 detach_vmas_to_be_unmapped(mm, mpnt, prev, end);
1826   - spin_lock(&mm->page_table_lock);
1827 1769 unmap_region(mm, mpnt, prev, start, end);
1828   - spin_unlock(&mm->page_table_lock);
1829 1770  
1830 1771 /* Fix up all other VM information */
1831 1772 unmap_vma_list(mm, mpnt);
1832 1773  
1833 1774  
1834 1775  
1835 1776  
... ... @@ -1957,25 +1898,21 @@
1957 1898 void exit_mmap(struct mm_struct *mm)
1958 1899 {
1959 1900 struct mmu_gather *tlb;
1960   - struct vm_area_struct *vma;
  1901 + struct vm_area_struct *vma = mm->mmap;
1961 1902 unsigned long nr_accounted = 0;
1962 1903  
1963 1904 lru_add_drain();
1964 1905  
1965 1906 spin_lock(&mm->page_table_lock);
1966 1907  
1967   - tlb = tlb_gather_mmu(mm, 1);
1968 1908 flush_cache_mm(mm);
1969   - /* Use ~0UL here to ensure all VMAs in the mm are unmapped */
1970   - mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0,
1971   - ~0UL, &nr_accounted, NULL);
  1909 + tlb = tlb_gather_mmu(mm, 1);
  1910 + /* Use -1 here to ensure all VMAs in the mm are unmapped */
  1911 + mm->map_count -= unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL);
1972 1912 vm_unacct_memory(nr_accounted);
1973   - BUG_ON(mm->map_count); /* This is just debugging */
1974   - clear_page_range(tlb, FIRST_USER_PGD_NR * PGDIR_SIZE, MM_VM_SIZE(mm));
1975   -
  1913 + free_pgtables(&tlb, vma, 0, 0);
1976 1914 tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm));
1977 1915  
1978   - vma = mm->mmap;
1979 1916 mm->mmap = mm->mmap_cache = NULL;
1980 1917 mm->mm_rb = RB_ROOT;
1981 1918 set_mm_counter(mm, rss, 0);
... ... @@ -1993,6 +1930,9 @@
1993 1930 remove_vm_struct(vma);
1994 1931 vma = next;
1995 1932 }
  1933 +
  1934 + BUG_ON(mm->map_count); /* This is just debugging */
  1935 + BUG_ON(mm->nr_ptes); /* This is just debugging */
1996 1936 }
1997 1937  
1998 1938 /* Insert vm structure into process list sorted by address