mm: page allocator: calculate a better estimate of NR_FREE_PAGES when memory is …

…low and kswapd is awake Ordinarily watermark checks are based on the vmstat NR_FREE_PAGES as it is cheaper than scanning a number of lists. To avoid synchronization overhead, counter deltas are maintained on a per-cpu basis and drained both periodically and when the delta is above a threshold. On large CPU systems, the difference between the estimated and real value of NR_FREE_PAGES can be very high. If NR_FREE_PAGES is much higher than the number of real free pages in the buddy allocator, the VM can allocate pages below the min watermark, at worst reducing the real number of free pages to zero. Even if the OOM killer kills a victim to free memory, the memory may not be freed if the exit path requires a new page, resulting in livelock. This patch introduces a zone_page_state_snapshot() function (courtesy of Christoph) that takes a slightly more accurate view of an arbitrary vmstat counter. It is used to read NR_FREE_PAGES while kswapd is awake to avoid the watermark being accidentally broken. The estimate is not perfect and may result in cache line bounces but is expected to be lighter than the IPI calls necessary to continually drain the per-cpu counters while kswapd is awake. Signed-off-by: Christoph Lameter <cl@linux.com> Signed-off-by: Mel Gorman <mel@csn.ul.ie> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

mm: page allocator: calculate a better estimate of NR_FREE_PAGES when memory is …
…low and kswapd is awake Ordinarily watermark checks are based on the vmstat NR_FREE_PAGES as it is cheaper than scanning a number of lists. To avoid synchronization overhead, counter deltas are maintained on a per-cpu basis and drained both periodically and when the delta is above a threshold. On large CPU systems, the difference between the estimated and real value of NR_FREE_PAGES can be very high. If NR_FREE_PAGES is much higher than the number of real free pages in the buddy allocator, the VM can allocate pages below the min watermark, at worst reducing the real number of free pages to zero. Even if the OOM killer kills a victim to free memory, the memory may not be freed if the exit path requires a new page, resulting in livelock. This patch introduces a zone_page_state_snapshot() function (courtesy of Christoph) that takes a slightly more accurate view of an arbitrary vmstat counter. It is used to read NR_FREE_PAGES while kswapd is awake to avoid the watermark being accidentally broken. The estimate is not perfect and may result in cache line bounces but is expected to be lighter than the IPI calls necessary to continually drain the per-cpu counters while kswapd is awake. Signed-off-by: Christoph Lameter <cl@linux.com> Signed-off-by: Mel Gorman <mel@csn.ul.ie> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Christoph Lameter · Linus Torvalds
1 parent 72853e2991
Showing 5 changed files with 72 additions and 3 deletions Side-by-side Diff
include/linux/mmzone.h
include/linux/vmstat.h
mm/mmzone.c
mm/page_alloc.c
mm/vmstat.c
@@ -284,6 +284,13 @@
 	unsigned long watermark[NR_WMARK];
  
 	/*
+	 * When free pages are below this point, additional steps are taken
+	 * when reading the number of free pages to avoid per-cpu counter
+	 * drift allowing watermarks to be breached
+	 */
+	unsigned long percpu_drift_mark;
+
+	/*
 	 * We don't know if the memory that we're going to allocate will be freeable
 	 * or/and it will be released eventually, so to avoid totally wasting several
 	 * GB of ram we must reserve some of the lower zone memory (otherwise we risk
@@ -440,6 +447,12 @@
 {
 	return test_bit(ZONE_OOM_LOCKED, &zone->flags);
 }
+
+#ifdef CONFIG_SMP
+unsigned long zone_nr_free_pages(struct zone *zone);
+#else
+#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES)
+#endif /* CONFIG_SMP */
  
 /*
  * The "priority" of VM scanning is how much of the queues we will scan in one
@@ -170,6 +170,28 @@
 	return x;
 }
  
+/*
+ * More accurate version that also considers the currently pending
+ * deltas. For that we need to loop over all cpus to find the current
+ * deltas. There is no synchronization so the result cannot be
+ * exactly accurate either.
+ *
+ * @zone: zone whose counter is read
+ * @item: which vmstat counter of that zone to read
+ *
+ * With CONFIG_SMP the zone-wide atomic value is summed with the
+ * vm_stat_diff[item] entry of every online cpu's pageset, and a negative
+ * total is clamped to zero before being returned as an unsigned long.
+ * Without CONFIG_SMP only the zone-wide atomic value is read and no
+ * clamp is applied.
+ */
+static inline unsigned long zone_page_state_snapshot(struct zone *zone,
+					enum zone_stat_item item)
+{
+	long x = atomic_long_read(&zone->vm_stat[item]);
+
+#ifdef CONFIG_SMP
+	int cpu;
+	for_each_online_cpu(cpu)
+		x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item];
+
+	if (x < 0)
+		x = 0;
+#endif
+	return x;
+}
+
 extern unsigned long global_reclaimable_pages(void);
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
  
@@ -87,4 +87,25 @@
 	return 1;
 }
 #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
+
+#ifdef CONFIG_SMP
+/*
+ * Called when a more accurate view of NR_FREE_PAGES is needed.
+ *
+ * The cheap zone_page_state() value is read first. Only when that value
+ * is below zone->percpu_drift_mark and waitqueue_active() reports false
+ * for the node's kswapd_wait queue (treated here as "kswapd is awake";
+ * NOTE(review): confirm against the waitqueue semantics) is the more
+ * expensive zone_page_state_snapshot() value returned instead. In every
+ * other case the cheap value is returned unchanged.
+ */
+unsigned long zone_nr_free_pages(struct zone *zone)
+{
+	unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
+
+	/*
+	 * While kswapd is awake, it is considered the zone is under some
+	 * memory pressure. Under pressure, there is a risk that
+	 * per-cpu-counter-drift will allow the min watermark to be breached
+	 * potentially causing a live-lock. While kswapd is awake and
+	 * free pages are low, get a better estimate for free pages
+	 */
+	if (nr_free_pages < zone->percpu_drift_mark &&
+			!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
+		return zone_page_state_snapshot(zone, NR_FREE_PAGES);
+
+	return nr_free_pages;
+}
+#endif /* CONFIG_SMP */
@@ -1462,7 +1462,7 @@
 {
 	/* free_pages may go negative - that's OK */
 	long min = mark;
-	long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
+	long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
 	int o;
  
 	if (alloc_flags & ALLOC_HIGH)
@@ -2424,7 +2424,7 @@
 			" all_unreclaimable? %s"
 			"\n",
 			zone->name,
-			K(zone_page_state(zone, NR_FREE_PAGES)),
+			K(zone_nr_free_pages(zone)),
 			K(min_wmark_pages(zone)),
 			K(low_wmark_pages(zone)),
 			K(high_wmark_pages(zone)),
@@ -138,11 +138,24 @@
 	int threshold;
  
 	for_each_populated_zone(zone) {
+		unsigned long max_drift, tolerate_drift;
+
 		threshold = calculate_threshold(zone);
  
 		for_each_online_cpu(cpu)
 			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
 							= threshold;
+
+		/*
+		 * Only set percpu_drift_mark if there is a danger that
+		 * NR_FREE_PAGES reports the low watermark is ok when in fact
+		 * the min watermark could be breached by an allocation
+		 */
+		tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
+		max_drift = num_online_cpus() * threshold;
+		if (max_drift > tolerate_drift)
+			zone->percpu_drift_mark = high_wmark_pages(zone) +
+					max_drift;
 	}
 }
  
@@ -813,7 +826,7 @@
 		   "\n        scanned  %lu"
 		   "\n        spanned  %lu"
 		   "\n        present  %lu",
-		   zone_page_state(zone, NR_FREE_PAGES),
+		   zone_nr_free_pages(zone),
 		   min_wmark_pages(zone),
 		   low_wmark_pages(zone),
 		   high_wmark_pages(zone),
...	...	@@ -284,6 +284,13 @@
284	284	unsigned long watermark[NR_WMARK];
285	285
286	286	/*
	287	+ * When free pages are below this point, additional steps are taken
	288	+ * when reading the number of free pages to avoid per-cpu counter
	289	+ * drift allowing watermarks to be breached
	290	+ */
	291	+ unsigned long percpu_drift_mark;
	292	+
	293	+ /*
287	294	* We don't know if the memory that we're going to allocate will be freeable
288	295	* or/and it will be released eventually, so to avoid totally wasting several
289	296	* GB of ram we must reserve some of the lower zone memory (otherwise we risk
...	...	@@ -440,6 +447,12 @@
440	447	{
441	448	return test_bit(ZONE_OOM_LOCKED, &zone->flags);
442	449	}
	450	+
	451	+#ifdef CONFIG_SMP
	452	+unsigned long zone_nr_free_pages(struct zone *zone);
	453	+#else
	454	+#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES)
	455	+#endif /* CONFIG_SMP */
443	456
444	457	/*
445	458	* The "priority" of VM scanning is how much of the queues we will scan in one
...	...	@@ -170,6 +170,28 @@
170	170	return x;
171	171	}
172	172
	173	+/*
	174	+ * More accurate version that also considers the currently pending
	175	+ * deltas. For that we need to loop over all cpus to find the current
	176	+ * deltas. There is no synchronization so the result cannot be
	177	+ * exactly accurate either.
	178	+ */
	179	+static inline unsigned long zone_page_state_snapshot(struct zone *zone,
	180	+ enum zone_stat_item item)
	181	+{
	182	+ long x = atomic_long_read(&zone->vm_stat[item]);
	183	+
	184	+#ifdef CONFIG_SMP
	185	+ int cpu;
	186	+ for_each_online_cpu(cpu)
	187	+ x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item];
	188	+
	189	+ if (x < 0)
	190	+ x = 0;
	191	+#endif
	192	+ return x;
	193	+}
	194	+
173	195	extern unsigned long global_reclaimable_pages(void);
174	196	extern unsigned long zone_reclaimable_pages(struct zone *zone);
175	197
...	...	@@ -87,4 +87,25 @@
87	87	return 1;
88	88	}
89	89	#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
	90	+
	91	+#ifdef CONFIG_SMP
	92	+/* Called when a more accurate view of NR_FREE_PAGES is needed */
	93	+unsigned long zone_nr_free_pages(struct zone *zone)
	94	+{
	95	+ unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
	96	+
	97	+ /*
	98	+ * While kswapd is awake, it is considered the zone is under some
	99	+ * memory pressure. Under pressure, there is a risk that
	100	+ * per-cpu-counter-drift will allow the min watermark to be breached
	101	+ * potentially causing a live-lock. While kswapd is awake and
	102	+ * free pages are low, get a better estimate for free pages
	103	+ */
	104	+ if (nr_free_pages < zone->percpu_drift_mark &&
	105	+ !waitqueue_active(&zone->zone_pgdat->kswapd_wait))
	106	+ return zone_page_state_snapshot(zone, NR_FREE_PAGES);
	107	+
	108	+ return nr_free_pages;
	109	+}
	110	+#endif /* CONFIG_SMP */
...	...	@@ -1462,7 +1462,7 @@
1462	1462	{
1463	1463	/* free_pages may go negative - that's OK */
1464	1464	long min = mark;
1465		- long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
	1465	+ long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
1466	1466	int o;
1467	1467
1468	1468	if (alloc_flags & ALLOC_HIGH)
...	...	@@ -2424,7 +2424,7 @@
2424	2424	" all_unreclaimable? %s"
2425	2425	"\n",
2426	2426	zone->name,
2427		- K(zone_page_state(zone, NR_FREE_PAGES)),
	2427	+ K(zone_nr_free_pages(zone)),
2428	2428	K(min_wmark_pages(zone)),
2429	2429	K(low_wmark_pages(zone)),
2430	2430	K(high_wmark_pages(zone)),
...	...	@@ -138,11 +138,24 @@
138	138	int threshold;
139	139
140	140	for_each_populated_zone(zone) {
	141	+ unsigned long max_drift, tolerate_drift;
	142	+
141	143	threshold = calculate_threshold(zone);
142	144
143	145	for_each_online_cpu(cpu)
144	146	per_cpu_ptr(zone->pageset, cpu)->stat_threshold
145	147	= threshold;
	148	+
	149	+ /*
	150	+ * Only set percpu_drift_mark if there is a danger that
	151	+ * NR_FREE_PAGES reports the low watermark is ok when in fact
	152	+ * the min watermark could be breached by an allocation
	153	+ */
	154	+ tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
	155	+ max_drift = num_online_cpus() * threshold;
	156	+ if (max_drift > tolerate_drift)
	157	+ zone->percpu_drift_mark = high_wmark_pages(zone) +
	158	+ max_drift;
146	159	}
147	160	}
148	161
...	...	@@ -813,7 +826,7 @@
813	826	"\n scanned %lu"
814	827	"\n spanned %lu"
815	828	"\n present %lu",
816		- zone_page_state(zone, NR_FREE_PAGES),
	829	+ zone_nr_free_pages(zone),
817	830	min_wmark_pages(zone),
818	831	low_wmark_pages(zone),
819	832	high_wmark_pages(zone),