Commit 365e9c87a982c03d0af3886e29d877f581b59611

Authored by Hugh Dickins
Committed by Linus Torvalds
1 parent 861f2fb8e7

[PATCH] mm: update_hiwaters just in time

update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability.  Originally it was called whenever rss or
total_vm got raised.  Then many of those callsites were replaced by a timer
tick call from account_system_time.  Now Frank van Maarseveen reports that it,
too, has been found inadequate.  How about this?  It works for Frank.

Replace update_mem_hiwater, a poor combination of two unrelated ops, with the
macros update_hiwater_rss and update_hiwater_vm.  Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths.  Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit.  Handle
mm->hiwater_vm in the same way, though it's much less of an issue.  Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.
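
To make the new convention concrete, here is a minimal user-space sketch
(illustrative only: struct mm_model, the page counts and the collector code
are made up for this example; the update_hiwater_vm macro body is the one
this patch adds).  Growth never touches the peak counter, the peak is
captured only just before the count is lowered, and a collector must take
the maximum of the stored peak and the current value.

/*
 * Illustrative model of the update_hiwater_vm convention; not kernel code.
 * The hypothetical struct mm_model stands in for the relevant mm fields.
 */
#include <stdio.h>

struct mm_model {
	unsigned long total_vm;		/* current size in pages */
	unsigned long hiwater_vm;	/* peak, touched only before lowering */
};

/* Same shape as the kernel macro: call just before total_vm is lowered. */
#define update_hiwater_vm(mm) do { \
	if ((mm)->hiwater_vm < (mm)->total_vm) \
		(mm)->hiwater_vm = (mm)->total_vm; \
} while (0)

int main(void)
{
	struct mm_model mm = { 0, 0 };
	unsigned long peak;

	mm.total_vm += 100;		/* growth: hot path, no hiwater update */
	mm.total_vm += 50;

	update_hiwater_vm(&mm);		/* about to shrink: capture the peak */
	mm.total_vm -= 120;

	/* Collector's duty: take the max; growth alone never updates the peak */
	peak = mm.hiwater_vm > mm.total_vm ? mm.hiwater_vm : mm.total_vm;
	printf("total_vm=%lu peak=%lu\n", mm.total_vm, peak);
	return 0;
}

The same shape applies to hiwater_rss, with get_mm_rss(mm) standing in for
total_vm.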

And there has been no collector of these hiwater statistics in the tree.  The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).
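
For the consumer side, a user-space reader of the new fields might look
roughly like this (an illustrative sketch with minimal error handling, not
part of the patch; VmPeak and VmHWM are only present on kernels carrying
this change).  Note that task_mem() below already takes the maximum against
the current values before printing, so the reader need not do so itself.

/* Possible consumer of the new /proc/<pid>/status fields; illustrative only. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "VmPeak:", 7) || !strncmp(line, "VmSize:", 7) ||
		    !strncmp(line, "VmHWM:", 6) || !strncmp(line, "VmRSS:", 6))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}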

There was a particular anomaly during mremap move: hiwater_vm might be
captured too high.  A fleeting anomaly of that kind remains, but it is now
quickly corrected, whereas before it would stick.

What locking?  None: if the app is racy then these statistics will be racy;
it's not worth any overhead to make them exact.  But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, this minimizes the time between reading the current
values and updating them, and so minimizes the occasions when a racing thread
bumps a count up and back down in between.
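
The imprecision being accepted can be re-enacted sequentially in plain
user-space C (illustrative only; the names rss and hiwater_rss merely mirror
the kernel fields): a count bumped up and back down between the read and the
store leaves no trace in the recorded peak.

/* Sequential re-enactment of the race: "A" is an update_hiwater-style
 * read-then-store, "B" is another thread bumping the count in between. */
#include <stdio.h>

int main(void)
{
	unsigned long rss = 10, hiwater_rss = 0;
	unsigned long snapshot;

	snapshot = rss;			/* A: read the current rss (10)  */
	rss += 1000;			/* B: bump the count up ...      */
	rss -= 1000;			/* B: ... and straight back down */
	if (hiwater_rss < snapshot)	/* A: store the stale snapshot   */
		hiwater_rss = snapshot;

	printf("rss=%lu hiwater_rss=%lu (transient peak left no trace)\n",
	       rss, hiwater_rss);
	return 0;
}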

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 14 changed files with 64 additions and 42 deletions

... ... @@ -1490,7 +1490,6 @@
1490 1490 /* execve success */
1491 1491 security_bprm_free(bprm);
1492 1492 acct_update_integrals(current);
1493   - update_mem_hiwater(current);
1494 1493 kfree(bprm);
1495 1494 return retval;
1496 1495 }
... ... @@ -1207,7 +1207,6 @@
1207 1207 /* execve success */
1208 1208 security_bprm_free(bprm);
1209 1209 acct_update_integrals(current);
1210   - update_mem_hiwater(current);
1211 1210 kfree(bprm);
1212 1211 return retval;
1213 1212 }
... ... @@ -14,22 +14,41 @@
14 14 char *task_mem(struct mm_struct *mm, char *buffer)
15 15 {
16 16 unsigned long data, text, lib;
  17 + unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
17 18  
  19 + /*
  20 + * Note: to minimize their overhead, mm maintains hiwater_vm and
  21 + * hiwater_rss only when about to *lower* total_vm or rss. Any
  22 + * collector of these hiwater stats must therefore get total_vm
  23 + * and rss too, which will usually be the higher. Barriers? not
  24 + * worth the effort, such snapshots can always be inconsistent.
  25 + */
  26 + hiwater_vm = total_vm = mm->total_vm;
  27 + if (hiwater_vm < mm->hiwater_vm)
  28 + hiwater_vm = mm->hiwater_vm;
  29 + hiwater_rss = total_rss = get_mm_rss(mm);
  30 + if (hiwater_rss < mm->hiwater_rss)
  31 + hiwater_rss = mm->hiwater_rss;
  32 +
18 33 data = mm->total_vm - mm->shared_vm - mm->stack_vm;
19 34 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
20 35 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
21 36 buffer += sprintf(buffer,
  37 + "VmPeak:\t%8lu kB\n"
22 38 "VmSize:\t%8lu kB\n"
23 39 "VmLck:\t%8lu kB\n"
  40 + "VmHWM:\t%8lu kB\n"
24 41 "VmRSS:\t%8lu kB\n"
25 42 "VmData:\t%8lu kB\n"
26 43 "VmStk:\t%8lu kB\n"
27 44 "VmExe:\t%8lu kB\n"
28 45 "VmLib:\t%8lu kB\n"
29 46 "VmPTE:\t%8lu kB\n",
30   - (mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
  47 + hiwater_vm << (PAGE_SHIFT-10),
  48 + (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
31 49 mm->locked_vm << (PAGE_SHIFT-10),
32   - get_mm_rss(mm) << (PAGE_SHIFT-10),
  50 + hiwater_rss << (PAGE_SHIFT-10),
  51 + total_rss << (PAGE_SHIFT-10),
33 52 data << (PAGE_SHIFT-10),
34 53 mm->stack_vm << (PAGE_SHIFT-10), text, lib,
35 54 (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
... ... @@ -938,9 +938,6 @@
938 938 }
939 939 #endif /* CONFIG_PROC_FS */
940 940  
941   -/* update per process rss and vm hiwater data */
942   -extern void update_mem_hiwater(struct task_struct *tsk);
943   -
944 941 #ifndef CONFIG_DEBUG_PAGEALLOC
945 942 static inline void
946 943 kernel_map_pages(struct page *page, int numpages, int enable)
include/linux/sched.h
... ... @@ -256,6 +256,16 @@
256 256 #define dec_mm_counter(mm, member) (mm)->_##member--
257 257 #define get_mm_rss(mm) ((mm)->_file_rss + (mm)->_anon_rss)
258 258  
  259 +#define update_hiwater_rss(mm) do { \
  260 + unsigned long _rss = get_mm_rss(mm); \
  261 + if ((mm)->hiwater_rss < _rss) \
  262 + (mm)->hiwater_rss = _rss; \
  263 +} while (0)
  264 +#define update_hiwater_vm(mm) do { \
  265 + if ((mm)->hiwater_vm < (mm)->total_vm) \
  266 + (mm)->hiwater_vm = (mm)->total_vm; \
  267 +} while (0)
  268 +
259 269 typedef unsigned long mm_counter_t;
260 270  
261 271 struct mm_struct {
... ... @@ -839,7 +839,10 @@
839 839 preempt_count());
840 840  
841 841 acct_update_integrals(tsk);
842   - update_mem_hiwater(tsk);
  842 + if (tsk->mm) {
  843 + update_hiwater_rss(tsk->mm);
  844 + update_hiwater_vm(tsk->mm);
  845 + }
843 846 group_dead = atomic_dec_and_test(&tsk->signal->live);
844 847 if (group_dead) {
845 848 del_timer_sync(&tsk->signal->real_timer);
... ... @@ -2511,8 +2511,6 @@
2511 2511 cpustat->idle = cputime64_add(cpustat->idle, tmp);
2512 2512 /* Account for system time used */
2513 2513 acct_update_integrals(p);
2514   - /* Update rss highwater mark */
2515   - update_mem_hiwater(p);
2516 2514 }
2517 2515  
2518 2516 /*
... ... @@ -143,8 +143,10 @@
143 143 if (!pte)
144 144 goto err_unlock;
145 145  
146   - if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte))
  146 + if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) {
  147 + update_hiwater_rss(mm);
147 148 dec_mm_counter(mm, file_rss);
  149 + }
148 150  
149 151 set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
150 152 pte_val = *pte;
... ... @@ -310,6 +310,9 @@
310 310 BUG_ON(start & ~HPAGE_MASK);
311 311 BUG_ON(end & ~HPAGE_MASK);
312 312  
  313 + /* Update high watermark before we lower rss */
  314 + update_hiwater_rss(mm);
  315 +
313 316 for (address = start; address < end; address += HPAGE_SIZE) {
314 317 ptep = huge_pte_offset(mm, address);
315 318 if (! ptep)
... ... @@ -820,6 +820,7 @@
820 820 lru_add_drain();
821 821 spin_lock(&mm->page_table_lock);
822 822 tlb = tlb_gather_mmu(mm, 0);
  823 + update_hiwater_rss(mm);
823 824 end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
824 825 tlb_finish_mmu(tlb, address, end);
825 826 spin_unlock(&mm->page_table_lock);
... ... @@ -2224,22 +2225,6 @@
2224 2225 }
2225 2226  
2226 2227 EXPORT_SYMBOL(vmalloc_to_pfn);
2227   -
2228   -/*
2229   - * update_mem_hiwater
2230   - * - update per process rss and vm high water data
2231   - */
2232   -void update_mem_hiwater(struct task_struct *tsk)
2233   -{
2234   - if (tsk->mm) {
2235   - unsigned long rss = get_mm_rss(tsk->mm);
2236   -
2237   - if (tsk->mm->hiwater_rss < rss)
2238   - tsk->mm->hiwater_rss = rss;
2239   - if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
2240   - tsk->mm->hiwater_vm = tsk->mm->total_vm;
2241   - }
2242   -}
2243 2228  
2244 2229 #if !defined(__HAVE_ARCH_GATE_AREA)
2245 2230  
... ... @@ -1640,6 +1640,8 @@
1640 1640 */
1641 1641 static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
1642 1642 {
  1643 + /* Update high watermark before we lower total_vm */
  1644 + update_hiwater_vm(mm);
1643 1645 do {
1644 1646 long nrpages = vma_pages(vma);
1645 1647  
... ... @@ -1668,6 +1670,7 @@
1668 1670 lru_add_drain();
1669 1671 spin_lock(&mm->page_table_lock);
1670 1672 tlb = tlb_gather_mmu(mm, 0);
  1673 + update_hiwater_rss(mm);
1671 1674 unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
1672 1675 vm_unacct_memory(nr_accounted);
1673 1676 free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
... ... @@ -1953,6 +1956,7 @@
1953 1956  
1954 1957 flush_cache_mm(mm);
1955 1958 tlb = tlb_gather_mmu(mm, 1);
  1959 + /* Don't update_hiwater_rss(mm) here, do_exit already did */
1956 1960 /* Use -1 here to ensure all VMAs in the mm are unmapped */
1957 1961 end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL);
1958 1962 vm_unacct_memory(nr_accounted);
... ... @@ -167,6 +167,7 @@
167 167 unsigned long new_pgoff;
168 168 unsigned long moved_len;
169 169 unsigned long excess = 0;
  170 + unsigned long hiwater_vm;
170 171 int split = 0;
171 172  
172 173 /*
173 174  
... ... @@ -205,9 +206,15 @@
205 206 }
206 207  
207 208 /*
208   - * if we failed to move page tables we still do total_vm increment
209   - * since do_munmap() will decrement it by old_len == new_len
  209 + * If we failed to move page tables we still do total_vm increment
  210 + * since do_munmap() will decrement it by old_len == new_len.
  211 + *
  212 + * Since total_vm is about to be raised artificially high for a
  213 + * moment, we need to restore high watermark afterwards: if stats
  214 + * are taken meanwhile, total_vm and hiwater_vm appear too high.
  215 + * If this were a serious issue, we'd add a flag to do_munmap().
210 216 */
  217 + hiwater_vm = mm->hiwater_vm;
211 218 mm->total_vm += new_len >> PAGE_SHIFT;
212 219 vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
213 220  
... ... @@ -216,6 +223,7 @@
216 223 vm_unacct_memory(excess >> PAGE_SHIFT);
217 224 excess = 0;
218 225 }
  226 + mm->hiwater_vm = hiwater_vm;
219 227  
220 228 /* Restore VM_ACCOUNT if one or two pieces of vma left */
221 229 if (excess) {
... ... @@ -931,6 +931,8 @@
931 931 realalloc -= kobjsize(vml);
932 932 askedalloc -= sizeof(*vml);
933 933 kfree(vml);
  934 +
  935 + update_hiwater_vm(mm);
934 936 mm->total_vm -= len >> PAGE_SHIFT;
935 937  
936 938 #ifdef DEBUG
... ... @@ -1076,19 +1078,6 @@
1076 1078  
1077 1079 void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
1078 1080 {
1079   -}
1080   -
1081   -void update_mem_hiwater(struct task_struct *tsk)
1082   -{
1083   - unsigned long rss;
1084   -
1085   - if (likely(tsk->mm)) {
1086   - rss = get_mm_rss(tsk->mm);
1087   - if (tsk->mm->hiwater_rss < rss)
1088   - tsk->mm->hiwater_rss = rss;
1089   - if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
1090   - tsk->mm->hiwater_vm = tsk->mm->total_vm;
1091   - }
1092 1081 }
1093 1082  
1094 1083 void unmap_mapping_range(struct address_space *mapping,
... ... @@ -538,6 +538,9 @@
538 538 if (pte_dirty(pteval))
539 539 set_page_dirty(page);
540 540  
  541 + /* Update high watermark before we lower rss */
  542 + update_hiwater_rss(mm);
  543 +
541 544 if (PageAnon(page)) {
542 545 swp_entry_t entry = { .val = page->private };
543 546 /*
... ... @@ -627,6 +630,9 @@
627 630 pmd = pmd_offset(pud, address);
628 631 if (!pmd_present(*pmd))
629 632 goto out_unlock;
  633 +
  634 + /* Update high watermark before we lower rss */
  635 + update_hiwater_rss(mm);
630 636  
631 637 for (original_pte = pte = pte_offset_map(pmd, address);
632 638 address < end; pte++, address += PAGE_SIZE) {