Commit 365e9c87a982c03d0af3886e29d877f581b59611

Authored by Hugh Dickins
Committed by Linus Torvalds
1 parent 861f2fb8e7

[PATCH] mm: update_hiwaters just in time

update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability.  Originally it was called whenever rss or
total_vm got raised.  Then many of those callsites were replaced by a timer
tick call from account_system_time.  Now Frank van Maarseveen reports that it,
too, has been found inadequate.  How about this?  It works for Frank.

Replace update_mem_hiwater, a poor combination of two unrelated ops, with the
macros update_hiwater_rss and update_hiwater_vm.  Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths.  Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit.  Handle
mm->hiwater_vm in the same way, though it's much less of an issue.  Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.
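
To make the new convention concrete, here is a minimal user-space sketch
(illustrative only: struct mm_model, the page counts and the collector code
are made up for this example; the update_hiwater_vm macro body is the one
this patch adds).  Growth never touches the peak counter, the peak is
captured only just before the count is lowered, and a collector must take
the maximum of the stored peak and the current value.

/*
 * Illustrative model of the update_hiwater_vm convention; not kernel code.
 * The hypothetical struct mm_model stands in for the relevant mm fields.
 */
#include <stdio.h>

struct mm_model {
	unsigned long total_vm;		/* current size in pages */
	unsigned long hiwater_vm;	/* peak, touched only before lowering */
};

/* Same shape as the kernel macro: call just before total_vm is lowered. */
#define update_hiwater_vm(mm) do { \
	if ((mm)->hiwater_vm < (mm)->total_vm) \
		(mm)->hiwater_vm = (mm)->total_vm; \
} while (0)

int main(void)
{
	struct mm_model mm = { 0, 0 };
	unsigned long peak;

	mm.total_vm += 100;		/* growth: hot path, no hiwater update */
	mm.total_vm += 50;

	update_hiwater_vm(&mm);		/* about to shrink: capture the peak */
	mm.total_vm -= 120;

	/* Collector's duty: take the max; growth alone never updates the peak */
	peak = mm.hiwater_vm > mm.total_vm ? mm.hiwater_vm : mm.total_vm;
	printf("total_vm=%lu peak=%lu\n", mm.total_vm, peak);
	return 0;
}

The same shape applies to hiwater_rss, with get_mm_rss(mm) standing in for
total_vm.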

And there has been no collector of these hiwater statistics in the tree.  The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).
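
For the consumer side, a user-space reader of the new fields might look
roughly like this (an illustrative sketch with minimal error handling, not
part of the patch; VmPeak and VmHWM are only present on kernels carrying
this change).  Note that task_mem() below already takes the maximum against
the current values before printing, so the reader need not do so itself.

/* Possible consumer of the new /proc/<pid>/status fields; illustrative only. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "VmPeak:", 7) || !strncmp(line, "VmSize:", 7) ||
		    !strncmp(line, "VmHWM:", 6) || !strncmp(line, "VmRSS:", 6))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}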

There was a particular anomaly during mremap move: hiwater_vm might be
captured too high.  A fleeting anomaly of that kind remains, but it is now
quickly corrected, whereas before it would stick.

What locking?  None: if the app is racy then these statistics will be racy;
it's not worth any overhead to make them exact.  But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, this minimizes the time between reading the current
values and updating them, and so minimizes the occasions when a racing thread
bumps a count up and back down in between.
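
The imprecision being accepted can be re-enacted sequentially in plain
user-space C (illustrative only; the names rss and hiwater_rss merely mirror
the kernel fields): a count bumped up and back down between the read and the
store leaves no trace in the recorded peak.

/* Sequential re-enactment of the race: "A" is an update_hiwater-style
 * read-then-store, "B" is another thread bumping the count in between. */
#include <stdio.h>

int main(void)
{
	unsigned long rss = 10, hiwater_rss = 0;
	unsigned long snapshot;

	snapshot = rss;			/* A: read the current rss (10)  */
	rss += 1000;			/* B: bump the count up ...      */
	rss -= 1000;			/* B: ... and straight back down */
	if (hiwater_rss < snapshot)	/* A: store the stale snapshot   */
		hiwater_rss = snapshot;

	printf("rss=%lu hiwater_rss=%lu (transient peak left no trace)\n",
	       rss, hiwater_rss);
	return 0;
}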

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 14 changed files with 64 additions and 42 deletions

... ... @@ -1490,7 +1490,6 @@
1490 1490 /* execve success */
1491 1491 security_bprm_free(bprm);
1492 1492 acct_update_integrals(current);
1493   - update_mem_hiwater(current);
1494 1493 kfree(bprm);
1495 1494 return retval;
1496 1495 }
... ... @@ -1207,7 +1207,6 @@
1207 1207 /* execve success */
1208 1208 security_bprm_free(bprm);
1209 1209 acct_update_integrals(current);
1210   - update_mem_hiwater(current);
1211 1210 kfree(bprm);
1212 1211 return retval;
1213 1212 }
... ... @@ -14,22 +14,41 @@
14 14 char *task_mem(struct mm_struct *mm, char *buffer)
15 15 {
16 16 unsigned long data, text, lib;
  17 + unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
17 18  
  19 + /*
  20 + * Note: to minimize their overhead, mm maintains hiwater_vm and
  21 + * hiwater_rss only when about to *lower* total_vm or rss. Any
  22 + * collector of these hiwater stats must therefore get total_vm
  23 + * and rss too, which will usually be the higher. Barriers? not
  24 + * worth the effort, such snapshots can always be inconsistent.
  25 + */
  26 + hiwater_vm = total_vm = mm->total_vm;
  27 + if (hiwater_vm < mm->hiwater_vm)
  28 + hiwater_vm = mm->hiwater_vm;
  29 + hiwater_rss = total_rss = get_mm_rss(mm);
  30 + if (hiwater_rss < mm->hiwater_rss)
  31 + hiwater_rss = mm->hiwater_rss;
  32 +
18 33 data = mm->total_vm - mm->shared_vm - mm->stack_vm;
19 34 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
20 35 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
21 36 buffer += sprintf(buffer,
  37 + "VmPeak:\t%8lu kB\n"
22 38 "VmSize:\t%8lu kB\n"
23 39 "VmLck:\t%8lu kB\n"
  40 + "VmHWM:\t%8lu kB\n"
24 41 "VmRSS:\t%8lu kB\n"
25 42 "VmData:\t%8lu kB\n"
26 43 "VmStk:\t%8lu kB\n"
27 44 "VmExe:\t%8lu kB\n"
28 45 "VmLib:\t%8lu kB\n"
29 46 "VmPTE:\t%8lu kB\n",
30   - (mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
  47 + hiwater_vm << (PAGE_SHIFT-10),
  48 + (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
31 49 mm->locked_vm << (PAGE_SHIFT-10),
32   - get_mm_rss(mm) << (PAGE_SHIFT-10),
  50 + hiwater_rss << (PAGE_SHIFT-10),
  51 + total_rss << (PAGE_SHIFT-10),
33 52 data << (PAGE_SHIFT-10),
34 53 mm->stack_vm << (PAGE_SHIFT-10), text, lib,
35 54 (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
... ... @@ -938,9 +938,6 @@
938 938 }
939 939 #endif /* CONFIG_PROC_FS */
940 940  
941   -/* update per process rss and vm hiwater data */
942   -extern void update_mem_hiwater(struct task_struct *tsk);
943   -
944 941 #ifndef CONFIG_DEBUG_PAGEALLOC
945 942 static inline void
946 943 kernel_map_pages(struct page *page, int numpages, int enable)
include/linux/sched.h
... ... @@ -256,6 +256,16 @@
256 256 #define dec_mm_counter(mm, member) (mm)->_##member--
257 257 #define get_mm_rss(mm) ((mm)->_file_rss + (mm)->_anon_rss)
258 258  
  259 +#define update_hiwater_rss(mm) do { \
  260 + unsigned long _rss = get_mm_rss(mm); \
  261 + if ((mm)->hiwater_rss < _rss) \
  262 + (mm)->hiwater_rss = _rss; \
  263 +} while (0)
  264 +#define update_hiwater_vm(mm) do { \
  265 + if ((mm)->hiwater_vm < (mm)->total_vm) \
  266 + (mm)->hiwater_vm = (mm)->total_vm; \
  267 +} while (0)
  268 +
259 269 typedef unsigned long mm_counter_t;
260 270  
261 271 struct mm_struct {
... ... @@ -839,7 +839,10 @@
839 839 preempt_count());
840 840  
841 841 acct_update_integrals(tsk);
842   - update_mem_hiwater(tsk);
  842 + if (tsk->mm) {
  843 + update_hiwater_rss(tsk->mm);
  844 + update_hiwater_vm(tsk->mm);
  845 + }
843 846 group_dead = atomic_dec_and_test(&tsk->signal->live);
844 847 if (group_dead) {
845 848 del_timer_sync(&tsk->signal->real_timer);
... ... @@ -2511,8 +2511,6 @@
2511 2511 cpustat->idle = cputime64_add(cpustat->idle, tmp);
2512 2512 /* Account for system time used */
2513 2513 acct_update_integrals(p);
2514   - /* Update rss highwater mark */
2515   - update_mem_hiwater(p);
2516 2514 }
2517 2515  
2518 2516 /*
... ... @@ -143,8 +143,10 @@
143 143 if (!pte)
144 144 goto err_unlock;
145 145  
146   - if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte))
  146 + if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) {
  147 + update_hiwater_rss(mm);
147 148 dec_mm_counter(mm, file_rss);
  149 + }
148 150  
149 151 set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
150 152 pte_val = *pte;
... ... @@ -310,6 +310,9 @@
310 310 BUG_ON(start & ~HPAGE_MASK);
311 311 BUG_ON(end & ~HPAGE_MASK);
312 312  
  313 + /* Update high watermark before we lower rss */
  314 + update_hiwater_rss(mm);
  315 +
313 316 for (address = start; address < end; address += HPAGE_SIZE) {
314 317 ptep = huge_pte_offset(mm, address);
315 318 if (! ptep)
... ... @@ -820,6 +820,7 @@
820 820 lru_add_drain();
821 821 spin_lock(&mm->page_table_lock);
822 822 tlb = tlb_gather_mmu(mm, 0);
  823 + update_hiwater_rss(mm);
823 824 end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
824 825 tlb_finish_mmu(tlb, address, end);
825 826 spin_unlock(&mm->page_table_lock);
... ... @@ -2224,22 +2225,6 @@
2224 2225 }
2225 2226  
2226 2227 EXPORT_SYMBOL(vmalloc_to_pfn);
2227   -
2228   -/*
2229   - * update_mem_hiwater
2230   - * - update per process rss and vm high water data
2231   - */
2232   -void update_mem_hiwater(struct task_struct *tsk)
2233   -{
2234   - if (tsk->mm) {
2235   - unsigned long rss = get_mm_rss(tsk->mm);
2236   -
2237   - if (tsk->mm->hiwater_rss < rss)
2238   - tsk->mm->hiwater_rss = rss;
2239   - if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
2240   - tsk->mm->hiwater_vm = tsk->mm->total_vm;
2241   - }
2242   -}
2243 2228  
2244 2229 #if !defined(__HAVE_ARCH_GATE_AREA)
2245 2230  
... ... @@ -1640,6 +1640,8 @@
1640 1640 */
1641 1641 static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
1642 1642 {
  1643 + /* Update high watermark before we lower total_vm */
  1644 + update_hiwater_vm(mm);
1643 1645 do {
1644 1646 long nrpages = vma_pages(vma);
1645 1647  
... ... @@ -1668,6 +1670,7 @@
1668 1670 lru_add_drain();
1669 1671 spin_lock(&mm->page_table_lock);
1670 1672 tlb = tlb_gather_mmu(mm, 0);
  1673 + update_hiwater_rss(mm);
1671 1674 unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
1672 1675 vm_unacct_memory(nr_accounted);
1673 1676 free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
... ... @@ -1953,6 +1956,7 @@
1953 1956  
1954 1957 flush_cache_mm(mm);
1955 1958 tlb = tlb_gather_mmu(mm, 1);
  1959 + /* Don't update_hiwater_rss(mm) here, do_exit already did */
1956 1960 /* Use -1 here to ensure all VMAs in the mm are unmapped */
1957 1961 end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL);
1958 1962 vm_unacct_memory(nr_accounted);
... ... @@ -167,6 +167,7 @@
167 167 unsigned long new_pgoff;
168 168 unsigned long moved_len;
169 169 unsigned long excess = 0;
  170 + unsigned long hiwater_vm;
170 171 int split = 0;
171 172  
172 173 /*
173 174  
... ... @@ -205,9 +206,15 @@
205 206 }
206 207  
207 208 /*
208   - * if we failed to move page tables we still do total_vm increment
209   - * since do_munmap() will decrement it by old_len == new_len
  209 + * If we failed to move page tables we still do total_vm increment
  210 + * since do_munmap() will decrement it by old_len == new_len.
  211 + *
  212 + * Since total_vm is about to be raised artificially high for a
  213 + * moment, we need to restore high watermark afterwards: if stats
  214 + * are taken meanwhile, total_vm and hiwater_vm appear too high.
  215 + * If this were a serious issue, we'd add a flag to do_munmap().
210 216 */
  217 + hiwater_vm = mm->hiwater_vm;
211 218 mm->total_vm += new_len >> PAGE_SHIFT;
212 219 vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
213 220  
... ... @@ -216,6 +223,7 @@
216 223 vm_unacct_memory(excess >> PAGE_SHIFT);
217 224 excess = 0;
218 225 }
  226 + mm->hiwater_vm = hiwater_vm;
219 227  
220 228 /* Restore VM_ACCOUNT if one or two pieces of vma left */
221 229 if (excess) {
... ... @@ -931,6 +931,8 @@
931 931 realalloc -= kobjsize(vml);
932 932 askedalloc -= sizeof(*vml);
933 933 kfree(vml);
  934 +
  935 + update_hiwater_vm(mm);
934 936 mm->total_vm -= len >> PAGE_SHIFT;
935 937  
936 938 #ifdef DEBUG
... ... @@ -1076,19 +1078,6 @@
1076 1078  
1077 1079 void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
1078 1080 {
1079   -}
1080   -
1081   -void update_mem_hiwater(struct task_struct *tsk)
1082   -{
1083   - unsigned long rss;
1084   -
1085   - if (likely(tsk->mm)) {
1086   - rss = get_mm_rss(tsk->mm);
1087   - if (tsk->mm->hiwater_rss < rss)
1088   - tsk->mm->hiwater_rss = rss;
1089   - if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
1090   - tsk->mm->hiwater_vm = tsk->mm->total_vm;
1091   - }
1092 1081 }
1093 1082  
1094 1083 void unmap_mapping_range(struct address_space *mapping,
... ... @@ -538,6 +538,9 @@
538 538 if (pte_dirty(pteval))
539 539 set_page_dirty(page);
540 540  
  541 + /* Update high watermark before we lower rss */
  542 + update_hiwater_rss(mm);
  543 +
541 544 if (PageAnon(page)) {
542 545 swp_entry_t entry = { .val = page->private };
543 546 /*
... ... @@ -627,6 +630,9 @@
627 630 pmd = pmd_offset(pud, address);
628 631 if (!pmd_present(*pmd))
629 632 goto out_unlock;
  633 +
  634 + /* Update high watermark before we lower rss */
  635 + update_hiwater_rss(mm);
630 636  
631 637 for (original_pte = pte = pte_offset_map(pmd, address);
632 638 address < end; pte++, address += PAGE_SIZE) {