Commit 365e9c87a982c03d0af3886e29d877f581b59611
Committed by
Linus Torvalds
1 parent
861f2fb8e7
Exists in
master
and in
4 other branches
[PATCH] mm: update_hiwaters just in time
update_mem_hiwater has attracted various criticisms, in particular from those concerned with mm scalability. Originally it was called whenever rss or total_vm got raised. Then many of those callsites were replaced by a timer tick call from account_system_time. Now Frank van Maarseveen reports that to be found inadequate. How about this? Works for Frank.

Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros update_hiwater_rss and update_hiwater_vm. Don't attempt to keep mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually by 1): those are hot paths. Do the opposite, update only when about to lower rss (usually by many), or just before final accounting in do_exit. Handle mm->hiwater_vm in the same way, though it's much less of an issue. Demand that whoever collects these hiwater statistics do the work of taking the maximum with rss or total_vm.

And there has been no collector of these hiwater statistics in the tree. The new convention needs an example, so match Frank's usage by adding a VmPeak line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS (High-Water-Mark or High-Water-Memory).

There was a particular anomaly during mremap move, that hiwater_vm might be captured too high. A fleeting such anomaly remains, but it's quickly corrected now, whereas before it would stick.

What locking? None: if the app is racy then these statistics will be racy, it's not worth any overhead to make them exact. But whenever it suits, hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under page_table_lock (for now) or with preemption disabled (later on): without going to any trouble, minimize the time between reading current values and updating, to minimize those occasions when a racing thread bumps a count up and back down in between.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Showing 14 changed files with 64 additions and 42 deletions Side-by-side Diff
fs/compat.c
fs/exec.c
fs/proc/task_mmu.c
... | ... | @@ -14,22 +14,41 @@ |
14 | 14 | char *task_mem(struct mm_struct *mm, char *buffer) |
15 | 15 | { |
16 | 16 | unsigned long data, text, lib; |
17 | + unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; | |
17 | 18 | |
19 | + /* | |
20 | + * Note: to minimize their overhead, mm maintains hiwater_vm and | |
21 | + * hiwater_rss only when about to *lower* total_vm or rss. Any | |
22 | + * collector of these hiwater stats must therefore get total_vm | |
23 | + * and rss too, which will usually be the higher. Barriers? not | |
24 | + * worth the effort, such snapshots can always be inconsistent. | |
25 | + */ | |
26 | + hiwater_vm = total_vm = mm->total_vm; | |
27 | + if (hiwater_vm < mm->hiwater_vm) | |
28 | + hiwater_vm = mm->hiwater_vm; | |
29 | + hiwater_rss = total_rss = get_mm_rss(mm); | |
30 | + if (hiwater_rss < mm->hiwater_rss) | |
31 | + hiwater_rss = mm->hiwater_rss; | |
32 | + | |
18 | 33 | data = mm->total_vm - mm->shared_vm - mm->stack_vm; |
19 | 34 | text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; |
20 | 35 | lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; |
21 | 36 | buffer += sprintf(buffer, |
37 | + "VmPeak:\t%8lu kB\n" | |
22 | 38 | "VmSize:\t%8lu kB\n" |
23 | 39 | "VmLck:\t%8lu kB\n" |
40 | + "VmHWM:\t%8lu kB\n" | |
24 | 41 | "VmRSS:\t%8lu kB\n" |
25 | 42 | "VmData:\t%8lu kB\n" |
26 | 43 | "VmStk:\t%8lu kB\n" |
27 | 44 | "VmExe:\t%8lu kB\n" |
28 | 45 | "VmLib:\t%8lu kB\n" |
29 | 46 | "VmPTE:\t%8lu kB\n", |
30 | - (mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), | |
47 | + hiwater_vm << (PAGE_SHIFT-10), | |
48 | + (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), | |
31 | 49 | mm->locked_vm << (PAGE_SHIFT-10), |
32 | - get_mm_rss(mm) << (PAGE_SHIFT-10), | |
50 | + hiwater_rss << (PAGE_SHIFT-10), | |
51 | + total_rss << (PAGE_SHIFT-10), | |
33 | 52 | data << (PAGE_SHIFT-10), |
34 | 53 | mm->stack_vm << (PAGE_SHIFT-10), text, lib, |
35 | 54 | (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10); |
include/linux/mm.h
... | ... | @@ -938,9 +938,6 @@ |
938 | 938 | } |
939 | 939 | #endif /* CONFIG_PROC_FS */ |
940 | 940 | |
941 | -/* update per process rss and vm hiwater data */ | |
942 | -extern void update_mem_hiwater(struct task_struct *tsk); | |
943 | - | |
944 | 941 | #ifndef CONFIG_DEBUG_PAGEALLOC |
945 | 942 | static inline void |
946 | 943 | kernel_map_pages(struct page *page, int numpages, int enable) |
include/linux/sched.h
... | ... | @@ -256,6 +256,16 @@ |
256 | 256 | #define dec_mm_counter(mm, member) (mm)->_##member-- |
257 | 257 | #define get_mm_rss(mm) ((mm)->_file_rss + (mm)->_anon_rss) |
258 | 258 | |
259 | +#define update_hiwater_rss(mm) do { \ | |
260 | + unsigned long _rss = get_mm_rss(mm); \ | |
261 | + if ((mm)->hiwater_rss < _rss) \ | |
262 | + (mm)->hiwater_rss = _rss; \ | |
263 | +} while (0) | |
264 | +#define update_hiwater_vm(mm) do { \ | |
265 | + if ((mm)->hiwater_vm < (mm)->total_vm) \ | |
266 | + (mm)->hiwater_vm = (mm)->total_vm; \ | |
267 | +} while (0) | |
268 | + | |
259 | 269 | typedef unsigned long mm_counter_t; |
260 | 270 | |
261 | 271 | struct mm_struct { |
kernel/exit.c
... | ... | @@ -839,7 +839,10 @@ |
839 | 839 | preempt_count()); |
840 | 840 | |
841 | 841 | acct_update_integrals(tsk); |
842 | - update_mem_hiwater(tsk); | |
842 | + if (tsk->mm) { | |
843 | + update_hiwater_rss(tsk->mm); | |
844 | + update_hiwater_vm(tsk->mm); | |
845 | + } | |
843 | 846 | group_dead = atomic_dec_and_test(&tsk->signal->live); |
844 | 847 | if (group_dead) { |
845 | 848 | del_timer_sync(&tsk->signal->real_timer); |
kernel/sched.c
mm/fremap.c
... | ... | @@ -143,8 +143,10 @@ |
143 | 143 | if (!pte) |
144 | 144 | goto err_unlock; |
145 | 145 | |
146 | - if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) | |
146 | + if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) { | |
147 | + update_hiwater_rss(mm); | |
147 | 148 | dec_mm_counter(mm, file_rss); |
149 | + } | |
148 | 150 | |
149 | 151 | set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); |
150 | 152 | pte_val = *pte; |
mm/hugetlb.c
... | ... | @@ -310,6 +310,9 @@ |
310 | 310 | BUG_ON(start & ~HPAGE_MASK); |
311 | 311 | BUG_ON(end & ~HPAGE_MASK); |
312 | 312 | |
313 | + /* Update high watermark before we lower rss */ | |
314 | + update_hiwater_rss(mm); | |
315 | + | |
313 | 316 | for (address = start; address < end; address += HPAGE_SIZE) { |
314 | 317 | ptep = huge_pte_offset(mm, address); |
315 | 318 | if (! ptep) |
mm/memory.c
... | ... | @@ -820,6 +820,7 @@ |
820 | 820 | lru_add_drain(); |
821 | 821 | spin_lock(&mm->page_table_lock); |
822 | 822 | tlb = tlb_gather_mmu(mm, 0); |
823 | + update_hiwater_rss(mm); | |
823 | 824 | end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details); |
824 | 825 | tlb_finish_mmu(tlb, address, end); |
825 | 826 | spin_unlock(&mm->page_table_lock); |
... | ... | @@ -2224,22 +2225,6 @@ |
2224 | 2225 | } |
2225 | 2226 | |
2226 | 2227 | EXPORT_SYMBOL(vmalloc_to_pfn); |
2227 | - | |
2228 | -/* | |
2229 | - * update_mem_hiwater | |
2230 | - * - update per process rss and vm high water data | |
2231 | - */ | |
2232 | -void update_mem_hiwater(struct task_struct *tsk) | |
2233 | -{ | |
2234 | - if (tsk->mm) { | |
2235 | - unsigned long rss = get_mm_rss(tsk->mm); | |
2236 | - | |
2237 | - if (tsk->mm->hiwater_rss < rss) | |
2238 | - tsk->mm->hiwater_rss = rss; | |
2239 | - if (tsk->mm->hiwater_vm < tsk->mm->total_vm) | |
2240 | - tsk->mm->hiwater_vm = tsk->mm->total_vm; | |
2241 | - } | |
2242 | -} | |
2243 | 2228 | |
2244 | 2229 | #if !defined(__HAVE_ARCH_GATE_AREA) |
2245 | 2230 |
mm/mmap.c
... | ... | @@ -1640,6 +1640,8 @@ |
1640 | 1640 | */ |
1641 | 1641 | static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) |
1642 | 1642 | { |
1643 | + /* Update high watermark before we lower total_vm */ | |
1644 | + update_hiwater_vm(mm); | |
1643 | 1645 | do { |
1644 | 1646 | long nrpages = vma_pages(vma); |
1645 | 1647 | |
... | ... | @@ -1668,6 +1670,7 @@ |
1668 | 1670 | lru_add_drain(); |
1669 | 1671 | spin_lock(&mm->page_table_lock); |
1670 | 1672 | tlb = tlb_gather_mmu(mm, 0); |
1673 | + update_hiwater_rss(mm); | |
1671 | 1674 | unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL); |
1672 | 1675 | vm_unacct_memory(nr_accounted); |
1673 | 1676 | free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, |
... | ... | @@ -1953,6 +1956,7 @@ |
1953 | 1956 | |
1954 | 1957 | flush_cache_mm(mm); |
1955 | 1958 | tlb = tlb_gather_mmu(mm, 1); |
1959 | + /* Don't update_hiwater_rss(mm) here, do_exit already did */ | |
1956 | 1960 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ |
1957 | 1961 | end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL); |
1958 | 1962 | vm_unacct_memory(nr_accounted); |
mm/mremap.c
... | ... | @@ -167,6 +167,7 @@ |
167 | 167 | unsigned long new_pgoff; |
168 | 168 | unsigned long moved_len; |
169 | 169 | unsigned long excess = 0; |
170 | + unsigned long hiwater_vm; | |
170 | 171 | int split = 0; |
171 | 172 | |
172 | 173 | /* |
173 | 174 | |
... | ... | @@ -205,9 +206,15 @@ |
205 | 206 | } |
206 | 207 | |
207 | 208 | /* |
208 | - * if we failed to move page tables we still do total_vm increment | |
209 | - * since do_munmap() will decrement it by old_len == new_len | |
209 | + * If we failed to move page tables we still do total_vm increment | |
210 | + * since do_munmap() will decrement it by old_len == new_len. | |
211 | + * | |
212 | + * Since total_vm is about to be raised artificially high for a | |
213 | + * moment, we need to restore high watermark afterwards: if stats | |
214 | + * are taken meanwhile, total_vm and hiwater_vm appear too high. | |
215 | + * If this were a serious issue, we'd add a flag to do_munmap(). | |
210 | 216 | */ |
217 | + hiwater_vm = mm->hiwater_vm; | |
211 | 218 | mm->total_vm += new_len >> PAGE_SHIFT; |
212 | 219 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); |
213 | 220 | |
... | ... | @@ -216,6 +223,7 @@ |
216 | 223 | vm_unacct_memory(excess >> PAGE_SHIFT); |
217 | 224 | excess = 0; |
218 | 225 | } |
226 | + mm->hiwater_vm = hiwater_vm; | |
219 | 227 | |
220 | 228 | /* Restore VM_ACCOUNT if one or two pieces of vma left */ |
221 | 229 | if (excess) { |
mm/nommu.c
... | ... | @@ -931,6 +931,8 @@ |
931 | 931 | realalloc -= kobjsize(vml); |
932 | 932 | askedalloc -= sizeof(*vml); |
933 | 933 | kfree(vml); |
934 | + | |
935 | + update_hiwater_vm(mm); | |
934 | 936 | mm->total_vm -= len >> PAGE_SHIFT; |
935 | 937 | |
936 | 938 | #ifdef DEBUG |
... | ... | @@ -1076,19 +1078,6 @@ |
1076 | 1078 | |
1077 | 1079 | void arch_unmap_area(struct mm_struct *mm, unsigned long addr) |
1078 | 1080 | { |
1079 | -} | |
1080 | - | |
1081 | -void update_mem_hiwater(struct task_struct *tsk) | |
1082 | -{ | |
1083 | - unsigned long rss; | |
1084 | - | |
1085 | - if (likely(tsk->mm)) { | |
1086 | - rss = get_mm_rss(tsk->mm); | |
1087 | - if (tsk->mm->hiwater_rss < rss) | |
1088 | - tsk->mm->hiwater_rss = rss; | |
1089 | - if (tsk->mm->hiwater_vm < tsk->mm->total_vm) | |
1090 | - tsk->mm->hiwater_vm = tsk->mm->total_vm; | |
1091 | - } | |
1092 | 1081 | } |
1093 | 1082 | |
1094 | 1083 | void unmap_mapping_range(struct address_space *mapping, |
mm/rmap.c
... | ... | @@ -538,6 +538,9 @@ |
538 | 538 | if (pte_dirty(pteval)) |
539 | 539 | set_page_dirty(page); |
540 | 540 | |
541 | + /* Update high watermark before we lower rss */ | |
542 | + update_hiwater_rss(mm); | |
543 | + | |
541 | 544 | if (PageAnon(page)) { |
542 | 545 | swp_entry_t entry = { .val = page->private }; |
543 | 546 | /* |
... | ... | @@ -627,6 +630,9 @@ |
627 | 630 | pmd = pmd_offset(pud, address); |
628 | 631 | if (!pmd_present(*pmd)) |
629 | 632 | goto out_unlock; |
633 | + | |
634 | + /* Update high watermark before we lower rss */ | |
635 | + update_hiwater_rss(mm); | |
630 | 636 | |
631 | 637 | for (original_pte = pte = pte_offset_map(pmd, address); |
632 | 638 | address < end; pte++, address += PAGE_SIZE) { |