Commit e0da382c92626ad1d7f4b7527d19b80104d67a83
Committed by Linus Torvalds
1 parent 9f6c6fc505
Exists in master and in 4 other branches
[PATCH] freepgt: free_pgtables use vma list
Recent woes with some arches needing their own pgd_addr_end macro; and 4-level clear_page_range regression since 2.6.10's clear_page_tables; and its long-standing well-known inefficiency in searching throughout the higher-level page tables for those few entries to clear and free: all can be blamed on ignoring the list of vmas when we free page tables.

Replace exit_mmap's clear_page_range of the total user address space by free_pgtables operating on the mm's vma list; unmap_region uses it in the same way, giving floor and ceiling beyond which it may not free tables. This brings lmbench fork/exec/sh numbers back to 2.6.10 (unless preempt is enabled, in which case latency fixes spoil unmap_vmas throughput).

Beware: the do_mmap_pgoff driver failure case must now use unmap_region instead of zap_page_range, since a page table might have been allocated, and can only be freed while it is touched by some vma.

Move free_pgtables from mmap.c to memory.c, where its lower levels are adapted from the clear_page_range levels. (Most of free_pgtables' old code was actually for a non-existent case, prev not properly set up, dating from before hch gave us split_vma.)

Pass mmu_gather** in the public interfaces, since we might want to add latency lockdrops later; but no attempt to do so yet, going by vma should itself reduce latency.

But what if is_hugepage_only_range? Those ia64 and ppc64 cases need careful examination: put that off until a later patch of the series.

What of x86_64's 32-bit vdso page, which __map_syscall32 maps outside any vma?

And the range to sparc64's flush_tlb_pgtables? It's less clear to me now that we need to do more than is done here: every PMD_SIZE ever occupied will be flushed, so do we really have to flush every PGDIR_SIZE ever partially occupied? A shame to complicate it unnecessarily.

Special thanks to David Miller for time spent repairing my ceilings.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
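The floor/ceiling convention is the subtle part of this patch, so here is a minimal user-space sketch of the boundary arithmetic, mirroring the entry tests of free_pgd_range() in the mm/memory.c diff below. It is not kernel code: PMD_SIZE_DEMO is a stand-in value and may_free_range is a made-up helper that compresses the per-level trimming into a single yes/no answer, purely to show why a ceiling of 0 means "top of the address space" and why the comparisons use "end - 1" and "ceiling - 1".

        /* floor_ceiling_demo.c: build with "cc floor_ceiling_demo.c" */
        #include <stdio.h>

        #define PMD_SIZE_DEMO 0x200000UL                /* stand-in: one 2MB pmd span */
        #define PMD_MASK_DEMO (~(PMD_SIZE_DEMO - 1))

        /*
         * Return 1 if the pmd span covering [addr, end) may be freed, given the
         * nearest vma boundaries: floor below and ceiling above (0 = none there).
         */
        static int may_free_range(unsigned long addr, unsigned long end,
                                  unsigned long floor, unsigned long ceiling)
        {
                addr &= PMD_MASK_DEMO;                  /* round start down to a span */
                if (addr < floor) {
                        addr += PMD_SIZE_DEMO;          /* a vma below shares this span */
                        if (!addr)                      /* wrapped past the top: nothing */
                                return 0;
                }
                if (ceiling) {
                        ceiling &= PMD_MASK_DEMO;       /* round ceiling down to a span */
                        if (!ceiling)                   /* rounded to 0: nothing usable */
                                return 0;
                }
                if (end - 1 > ceiling - 1)              /* ceiling 0 wraps to ~0UL: "top" */
                        end -= PMD_SIZE_DEMO;           /* a vma above shares the last span */
                if (addr > end - 1)
                        return 0;
                return 1;
        }

        int main(void)
        {
                /* No vma above: ceiling 0 stands for the top, so the span is freed */
                printf("%d\n", may_free_range(0x400000, 0x600000, 0, 0));        /* 1 */
                /* Next vma starts at 0x500000, inside the same 2MB span: keep it */
                printf("%d\n", may_free_range(0x400000, 0x600000, 0, 0x500000)); /* 0 */
                return 0;
        }

The "- 1" trick is the whole point: 0 must mean the bottom for addr/floor but the top for end/ceiling, and subtracting 1 makes a ceiling of 0 wrap to the largest unsigned value, so it compares as "infinitely high" without a special case.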
Showing 5 changed files with 141 additions and 155 deletions
arch/i386/mm/pgtable.c
... | ... | @@ -255,7 +255,7 @@ |
255 | 255 | if (PTRS_PER_PMD > 1) |
256 | 256 | for (i = 0; i < USER_PTRS_PER_PGD; ++i) |
257 | 257 | kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); |
258 | - /* in the non-PAE case, clear_page_range() clears user pgd entries */ | |
258 | + /* in the non-PAE case, free_pgtables() clears user pgd entries */ | |
259 | 259 | kmem_cache_free(pgd_cache, pgd); |
260 | 260 | } |
arch/ia64/mm/hugetlbpage.c
... | ... | @@ -187,45 +187,12 @@ |
187 | 187 | } |
188 | 188 | |
189 | 189 | /* |
190 | - * Same as generic free_pgtables(), except constant PGDIR_* and pgd_offset | |
191 | - * are hugetlb region specific. | |
190 | + * Do nothing, until we've worked out what to do! To allow build, we | |
191 | + * must remove reference to clear_page_range since it no longer exists. | |
192 | 192 | */ |
193 | 193 | void hugetlb_free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev, |
194 | 194 | unsigned long start, unsigned long end) |
195 | 195 | { |
196 | - unsigned long first = start & HUGETLB_PGDIR_MASK; | |
197 | - unsigned long last = end + HUGETLB_PGDIR_SIZE - 1; | |
198 | - struct mm_struct *mm = tlb->mm; | |
199 | - | |
200 | - if (!prev) { | |
201 | - prev = mm->mmap; | |
202 | - if (!prev) | |
203 | - goto no_mmaps; | |
204 | - if (prev->vm_end > start) { | |
205 | - if (last > prev->vm_start) | |
206 | - last = prev->vm_start; | |
207 | - goto no_mmaps; | |
208 | - } | |
209 | - } | |
210 | - for (;;) { | |
211 | - struct vm_area_struct *next = prev->vm_next; | |
212 | - | |
213 | - if (next) { | |
214 | - if (next->vm_start < start) { | |
215 | - prev = next; | |
216 | - continue; | |
217 | - } | |
218 | - if (last > next->vm_start) | |
219 | - last = next->vm_start; | |
220 | - } | |
221 | - if (prev->vm_end > first) | |
222 | - first = prev->vm_end; | |
223 | - break; | |
224 | - } | |
225 | -no_mmaps: | |
226 | - if (last < first) /* for arches with discontiguous pgd indices */ | |
227 | - return; | |
228 | - clear_page_range(tlb, first, last); | |
229 | 196 | } |
230 | 197 | |
231 | 198 | void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) |
include/linux/mm.h
... | ... | @@ -592,7 +592,8 @@ |
592 | 592 | struct vm_area_struct *start_vma, unsigned long start_addr, |
593 | 593 | unsigned long end_addr, unsigned long *nr_accounted, |
594 | 594 | struct zap_details *); |
595 | -void clear_page_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end); | |
595 | +void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, | |
596 | + unsigned long floor, unsigned long ceiling); | |
596 | 597 | int copy_page_range(struct mm_struct *dst, struct mm_struct *src, |
597 | 598 | struct vm_area_struct *vma); |
598 | 599 | int zeromap_page_range(struct vm_area_struct *vma, unsigned long from, |
mm/memory.c
... | ... | @@ -110,87 +110,165 @@ |
110 | 110 | * Note: this doesn't free the actual pages themselves. That |
111 | 111 | * has been handled earlier when unmapping all the memory regions. |
112 | 112 | */ |
113 | -static inline void clear_pte_range(struct mmu_gather *tlb, pmd_t *pmd, | |
114 | - unsigned long addr, unsigned long end) | |
113 | +static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) | |
115 | 114 | { |
116 | - if (!((addr | end) & ~PMD_MASK)) { | |
117 | - /* Only free fully aligned ranges */ | |
118 | - struct page *page = pmd_page(*pmd); | |
119 | - pmd_clear(pmd); | |
120 | - dec_page_state(nr_page_table_pages); | |
121 | - tlb->mm->nr_ptes--; | |
122 | - pte_free_tlb(tlb, page); | |
123 | - } | |
115 | + struct page *page = pmd_page(*pmd); | |
116 | + pmd_clear(pmd); | |
117 | + pte_free_tlb(tlb, page); | |
118 | + dec_page_state(nr_page_table_pages); | |
119 | + tlb->mm->nr_ptes--; | |
124 | 120 | } |
125 | 121 | |
126 | -static inline void clear_pmd_range(struct mmu_gather *tlb, pud_t *pud, | |
127 | - unsigned long addr, unsigned long end) | |
122 | +static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, | |
123 | + unsigned long addr, unsigned long end, | |
124 | + unsigned long floor, unsigned long ceiling) | |
128 | 125 | { |
129 | 126 | pmd_t *pmd; |
130 | 127 | unsigned long next; |
131 | - pmd_t *empty_pmd = NULL; | |
128 | + unsigned long start; | |
132 | 129 | |
130 | + start = addr; | |
133 | 131 | pmd = pmd_offset(pud, addr); |
134 | - | |
135 | - /* Only free fully aligned ranges */ | |
136 | - if (!((addr | end) & ~PUD_MASK)) | |
137 | - empty_pmd = pmd; | |
138 | 132 | do { |
139 | 133 | next = pmd_addr_end(addr, end); |
140 | 134 | if (pmd_none_or_clear_bad(pmd)) |
141 | 135 | continue; |
142 | - clear_pte_range(tlb, pmd, addr, next); | |
136 | + free_pte_range(tlb, pmd); | |
143 | 137 | } while (pmd++, addr = next, addr != end); |
144 | 138 | |
145 | - if (empty_pmd) { | |
146 | - pud_clear(pud); | |
147 | - pmd_free_tlb(tlb, empty_pmd); | |
139 | + start &= PUD_MASK; | |
140 | + if (start < floor) | |
141 | + return; | |
142 | + if (ceiling) { | |
143 | + ceiling &= PUD_MASK; | |
144 | + if (!ceiling) | |
145 | + return; | |
148 | 146 | } |
147 | + if (end - 1 > ceiling - 1) | |
148 | + return; | |
149 | + | |
150 | + pmd = pmd_offset(pud, start); | |
151 | + pud_clear(pud); | |
152 | + pmd_free_tlb(tlb, pmd); | |
149 | 153 | } |
150 | 154 | |
151 | -static inline void clear_pud_range(struct mmu_gather *tlb, pgd_t *pgd, | |
152 | - unsigned long addr, unsigned long end) | |
155 | +static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, | |
156 | + unsigned long addr, unsigned long end, | |
157 | + unsigned long floor, unsigned long ceiling) | |
153 | 158 | { |
154 | 159 | pud_t *pud; |
155 | 160 | unsigned long next; |
156 | - pud_t *empty_pud = NULL; | |
161 | + unsigned long start; | |
157 | 162 | |
163 | + start = addr; | |
158 | 164 | pud = pud_offset(pgd, addr); |
159 | - | |
160 | - /* Only free fully aligned ranges */ | |
161 | - if (!((addr | end) & ~PGDIR_MASK)) | |
162 | - empty_pud = pud; | |
163 | 165 | do { |
164 | 166 | next = pud_addr_end(addr, end); |
165 | 167 | if (pud_none_or_clear_bad(pud)) |
166 | 168 | continue; |
167 | - clear_pmd_range(tlb, pud, addr, next); | |
169 | + free_pmd_range(tlb, pud, addr, next, floor, ceiling); | |
168 | 170 | } while (pud++, addr = next, addr != end); |
169 | 171 | |
170 | - if (empty_pud) { | |
171 | - pgd_clear(pgd); | |
172 | - pud_free_tlb(tlb, empty_pud); | |
172 | + start &= PGDIR_MASK; | |
173 | + if (start < floor) | |
174 | + return; | |
175 | + if (ceiling) { | |
176 | + ceiling &= PGDIR_MASK; | |
177 | + if (!ceiling) | |
178 | + return; | |
173 | 179 | } |
180 | + if (end - 1 > ceiling - 1) | |
181 | + return; | |
182 | + | |
183 | + pud = pud_offset(pgd, start); | |
184 | + pgd_clear(pgd); | |
185 | + pud_free_tlb(tlb, pud); | |
174 | 186 | } |
175 | 187 | |
176 | 188 | /* |
177 | - * This function clears user-level page tables of a process. | |
178 | - * Unlike other pagetable walks, some memory layouts might give end 0. | |
189 | + * This function frees user-level page tables of a process. | |
190 | + * | |
179 | 191 | * Must be called with pagetable lock held. |
180 | 192 | */ |
181 | -void clear_page_range(struct mmu_gather *tlb, | |
182 | - unsigned long addr, unsigned long end) | |
193 | +static inline void free_pgd_range(struct mmu_gather *tlb, | |
194 | + unsigned long addr, unsigned long end, | |
195 | + unsigned long floor, unsigned long ceiling) | |
183 | 196 | { |
184 | 197 | pgd_t *pgd; |
185 | 198 | unsigned long next; |
199 | + unsigned long start; | |
186 | 200 | |
201 | + /* | |
202 | + * The next few lines have given us lots of grief... | |
203 | + * | |
204 | + * Why are we testing PMD* at this top level? Because often | |
205 | + * there will be no work to do at all, and we'd prefer not to | |
206 | + * go all the way down to the bottom just to discover that. | |
207 | + * | |
208 | + * Why all these "- 1"s? Because 0 represents both the bottom | |
209 | + * of the address space and the top of it (using -1 for the | |
210 | + * top wouldn't help much: the masks would do the wrong thing). | |
211 | + * The rule is that addr 0 and floor 0 refer to the bottom of | |
212 | + * the address space, but end 0 and ceiling 0 refer to the top | |
213 | + * Comparisons need to use "end - 1" and "ceiling - 1" (though | |
214 | + * that end 0 case should be mythical). | |
215 | + * | |
216 | + * Wherever addr is brought up or ceiling brought down, we must | |
217 | + * be careful to reject "the opposite 0" before it confuses the | |
218 | + * subsequent tests. But what about where end is brought down | |
219 | + * by PMD_SIZE below? no, end can't go down to 0 there. | |
220 | + * | |
221 | + * Whereas we round start (addr) and ceiling down, by different | |
222 | + * masks at different levels, in order to test whether a table | |
223 | + * now has no other vmas using it, so can be freed, we don't | |
224 | + * bother to round floor or end up - the tests don't need that. | |
225 | + */ | |
226 | + | |
227 | + addr &= PMD_MASK; | |
228 | + if (addr < floor) { | |
229 | + addr += PMD_SIZE; | |
230 | + if (!addr) | |
231 | + return; | |
232 | + } | |
233 | + if (ceiling) { | |
234 | + ceiling &= PMD_MASK; | |
235 | + if (!ceiling) | |
236 | + return; | |
237 | + } | |
238 | + if (end - 1 > ceiling - 1) | |
239 | + end -= PMD_SIZE; | |
240 | + if (addr > end - 1) | |
241 | + return; | |
242 | + | |
243 | + start = addr; | |
187 | 244 | pgd = pgd_offset(tlb->mm, addr); |
188 | 245 | do { |
189 | 246 | next = pgd_addr_end(addr, end); |
190 | 247 | if (pgd_none_or_clear_bad(pgd)) |
191 | 248 | continue; |
192 | - clear_pud_range(tlb, pgd, addr, next); | |
249 | + free_pud_range(tlb, pgd, addr, next, floor, ceiling); | |
193 | 250 | } while (pgd++, addr = next, addr != end); |
251 | + | |
252 | + if (!tlb_is_full_mm(tlb)) | |
253 | + flush_tlb_pgtables(tlb->mm, start, end); | |
254 | +} | |
255 | + | |
256 | +void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, | |
257 | + unsigned long floor, unsigned long ceiling) | |
258 | +{ | |
259 | + while (vma) { | |
260 | + struct vm_area_struct *next = vma->vm_next; | |
261 | + unsigned long addr = vma->vm_start; | |
262 | + | |
263 | + /* Optimization: gather nearby vmas into a single call down */ | |
264 | + while (next && next->vm_start <= vma->vm_end + PMD_SIZE) { | |
265 | + vma = next; | |
266 | + next = vma->vm_next; | |
267 | + } | |
268 | + free_pgd_range(*tlb, addr, vma->vm_end, | |
269 | + floor, next? next->vm_start: ceiling); | |
270 | + vma = next; | |
271 | + } | |
194 | 272 | } |
195 | 273 | |
196 | 274 | pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address) |
mm/mmap.c
... | ... | @@ -29,6 +29,10 @@ |
29 | 29 | #include <asm/cacheflush.h> |
30 | 30 | #include <asm/tlb.h> |
31 | 31 | |
32 | +static void unmap_region(struct mm_struct *mm, | |
33 | + struct vm_area_struct *vma, struct vm_area_struct *prev, | |
34 | + unsigned long start, unsigned long end); | |
35 | + | |
32 | 36 | /* |
33 | 37 | * WARNING: the debugging will use recursive algorithms so never enable this |
34 | 38 | * unless you know what you are doing. |
... | ... | @@ -1129,7 +1133,8 @@ |
1129 | 1133 | fput(file); |
1130 | 1134 | |
1131 | 1135 | /* Undo any partial mapping done by a device driver. */ |
1132 | - zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL); | |
1136 | + unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); | |
1137 | + charged = 0; | |
1133 | 1138 | free_vma: |
1134 | 1139 | kmem_cache_free(vm_area_cachep, vma); |
1135 | 1140 | unacct_error: |
... | ... | @@ -1572,66 +1577,6 @@ |
1572 | 1577 | } |
1573 | 1578 | #endif |
1574 | 1579 | |
1575 | -/* | |
1576 | - * Try to free as many page directory entries as we can, | |
1577 | - * without having to work very hard at actually scanning | |
1578 | - * the page tables themselves. | |
1579 | - * | |
1580 | - * Right now we try to free page tables if we have a nice | |
1581 | - * PGDIR-aligned area that got free'd up. We could be more | |
1582 | - * granular if we want to, but this is fast and simple, | |
1583 | - * and covers the bad cases. | |
1584 | - * | |
1585 | - * "prev", if it exists, points to a vma before the one | |
1586 | - * we just free'd - but there's no telling how much before. | |
1587 | - */ | |
1588 | -static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev, | |
1589 | - unsigned long start, unsigned long end) | |
1590 | -{ | |
1591 | - unsigned long first = start & PGDIR_MASK; | |
1592 | - unsigned long last = end + PGDIR_SIZE - 1; | |
1593 | - struct mm_struct *mm = tlb->mm; | |
1594 | - | |
1595 | - if (last > MM_VM_SIZE(mm) || last < end) | |
1596 | - last = MM_VM_SIZE(mm); | |
1597 | - | |
1598 | - if (!prev) { | |
1599 | - prev = mm->mmap; | |
1600 | - if (!prev) | |
1601 | - goto no_mmaps; | |
1602 | - if (prev->vm_end > start) { | |
1603 | - if (last > prev->vm_start) | |
1604 | - last = prev->vm_start; | |
1605 | - goto no_mmaps; | |
1606 | - } | |
1607 | - } | |
1608 | - for (;;) { | |
1609 | - struct vm_area_struct *next = prev->vm_next; | |
1610 | - | |
1611 | - if (next) { | |
1612 | - if (next->vm_start < start) { | |
1613 | - prev = next; | |
1614 | - continue; | |
1615 | - } | |
1616 | - if (last > next->vm_start) | |
1617 | - last = next->vm_start; | |
1618 | - } | |
1619 | - if (prev->vm_end > first) | |
1620 | - first = prev->vm_end; | |
1621 | - break; | |
1622 | - } | |
1623 | -no_mmaps: | |
1624 | - if (last < first) /* for arches with discontiguous pgd indices */ | |
1625 | - return; | |
1626 | - if (first < FIRST_USER_PGD_NR * PGDIR_SIZE) | |
1627 | - first = FIRST_USER_PGD_NR * PGDIR_SIZE; | |
1628 | - /* No point trying to free anything if we're in the same pte page */ | |
1629 | - if ((first & PMD_MASK) < (last & PMD_MASK)) { | |
1630 | - clear_page_range(tlb, first, last); | |
1631 | - flush_tlb_pgtables(mm, first, last); | |
1632 | - } | |
1633 | -} | |
1634 | - | |
1635 | 1580 | /* Normal function to fix up a mapping |
1636 | 1581 | * This function is the default for when an area has no specific |
1637 | 1582 | * function. This may be used as part of a more specific routine. |
1638 | 1583 | |
1639 | 1584 | |
1640 | 1585 | |
1641 | 1586 | |
... | ... | @@ -1674,24 +1619,22 @@ |
1674 | 1619 | * Called with the page table lock held. |
1675 | 1620 | */ |
1676 | 1621 | static void unmap_region(struct mm_struct *mm, |
1677 | - struct vm_area_struct *vma, | |
1678 | - struct vm_area_struct *prev, | |
1679 | - unsigned long start, | |
1680 | - unsigned long end) | |
1622 | + struct vm_area_struct *vma, struct vm_area_struct *prev, | |
1623 | + unsigned long start, unsigned long end) | |
1681 | 1624 | { |
1625 | + struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; | |
1682 | 1626 | struct mmu_gather *tlb; |
1683 | 1627 | unsigned long nr_accounted = 0; |
1684 | 1628 | |
1685 | 1629 | lru_add_drain(); |
1630 | + spin_lock(&mm->page_table_lock); | |
1686 | 1631 | tlb = tlb_gather_mmu(mm, 0); |
1687 | 1632 | unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL); |
1688 | 1633 | vm_unacct_memory(nr_accounted); |
1689 | - | |
1690 | - if (is_hugepage_only_range(mm, start, end - start)) | |
1691 | - hugetlb_free_pgtables(tlb, prev, start, end); | |
1692 | - else | |
1693 | - free_pgtables(tlb, prev, start, end); | |
1634 | + free_pgtables(&tlb, vma, prev? prev->vm_end: 0, | |
1635 | + next? next->vm_start: 0); | |
1694 | 1636 | tlb_finish_mmu(tlb, start, end); |
1637 | + spin_unlock(&mm->page_table_lock); | |
1695 | 1638 | } |
1696 | 1639 | |
1697 | 1640 | /* |
1698 | 1641 | |
... | ... | @@ -1823,9 +1766,7 @@ |
1823 | 1766 | * Remove the vma's, and unmap the actual pages |
1824 | 1767 | */ |
1825 | 1768 | detach_vmas_to_be_unmapped(mm, mpnt, prev, end); |
1826 | - spin_lock(&mm->page_table_lock); | |
1827 | 1769 | unmap_region(mm, mpnt, prev, start, end); |
1828 | - spin_unlock(&mm->page_table_lock); | |
1829 | 1770 | |
1830 | 1771 | /* Fix up all other VM information */ |
1831 | 1772 | unmap_vma_list(mm, mpnt); |
1832 | 1773 | |
1833 | 1774 | |
1834 | 1775 | |
1835 | 1776 | |
... | ... | @@ -1957,25 +1898,21 @@ |
1957 | 1898 | void exit_mmap(struct mm_struct *mm) |
1958 | 1899 | { |
1959 | 1900 | struct mmu_gather *tlb; |
1960 | - struct vm_area_struct *vma; | |
1901 | + struct vm_area_struct *vma = mm->mmap; | |
1961 | 1902 | unsigned long nr_accounted = 0; |
1962 | 1903 | |
1963 | 1904 | lru_add_drain(); |
1964 | 1905 | |
1965 | 1906 | spin_lock(&mm->page_table_lock); |
1966 | 1907 | |
1967 | - tlb = tlb_gather_mmu(mm, 1); | |
1968 | 1908 | flush_cache_mm(mm); |
1969 | - /* Use ~0UL here to ensure all VMAs in the mm are unmapped */ | |
1970 | - mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0, | |
1971 | - ~0UL, &nr_accounted, NULL); | |
1909 | + tlb = tlb_gather_mmu(mm, 1); | |
1910 | + /* Use -1 here to ensure all VMAs in the mm are unmapped */ | |
1911 | + mm->map_count -= unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL); | |
1972 | 1912 | vm_unacct_memory(nr_accounted); |
1973 | - BUG_ON(mm->map_count); /* This is just debugging */ | |
1974 | - clear_page_range(tlb, FIRST_USER_PGD_NR * PGDIR_SIZE, MM_VM_SIZE(mm)); | |
1975 | - | |
1913 | + free_pgtables(&tlb, vma, 0, 0); | |
1976 | 1914 | tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm)); |
1977 | 1915 | |
1978 | - vma = mm->mmap; | |
1979 | 1916 | mm->mmap = mm->mmap_cache = NULL; |
1980 | 1917 | mm->mm_rb = RB_ROOT; |
1981 | 1918 | set_mm_counter(mm, rss, 0); |
... | ... | @@ -1993,6 +1930,9 @@ |
1993 | 1930 | remove_vm_struct(vma); |
1994 | 1931 | vma = next; |
1995 | 1932 | } |
1933 | + | |
1934 | + BUG_ON(mm->map_count); /* This is just debugging */ | |
1935 | + BUG_ON(mm->nr_ptes); /* This is just debugging */ | |
1996 | 1936 | } |
1997 | 1937 | |
1998 | 1938 | /* Insert vm structure into process list sorted by address |