Commit 99ed1bd0c77355d65de5f112eb92d79f9bace84f

Authored by Mel Gorman
Committed by Jiri Slaby
1 parent 51aad0a515

mm: page_alloc: reduce cost of the fair zone allocation policy

commit 4ffeaf3560a52b4a69cc7909873d08c0ef5909d4 upstream.

The fair zone allocation policy round-robins allocations between zones
within a node to avoid age inversion problems during reclaim.  If the
first allocation fails, the batch counts are reset and a second attempt
made before entering the slow path.
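
For reference, the pre-patch flow looks roughly like the sketch below.  It
is condensed from the retry logic removed further down in the diff (in
__alloc_pages_nodemask()); unrelated arguments and error handling are
omitted.

retry:
        /* First pass: only local zones with fairness batch remaining */
        page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
                        order, zonelist, high_zoneidx, alloc_flags,
                        preferred_zone, classzone_idx, migratetype);
        if (unlikely(!page)) {
                if (alloc_flags & ALLOC_FAIR) {
                        /* Assume the batches expired: refill them and
                         * rescan the zonelist without the fairness limit */
                        reset_alloc_batches(zonelist, high_zoneidx,
                                            preferred_zone);
                        alloc_flags &= ~ALLOC_FAIR;
                        goto retry;
                }
                /* otherwise fall through to the slowpath */
        }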

One assumption made with this scheme is that batches expire at roughly
the same time and the resets each time are justified.  This assumption
does not hold when zones reach their low watermark as the batches will
be consumed at uneven rates.  Allocation failure due to watermark
depletion results in additional zonelist scans for the reset and another
watermark check before hitting the slowpath.
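
The change below therefore tracks batch depletion with a per-zone flag and
makes the reset-and-rescan decision inside get_page_from_freelist() itself.
A condensed view of the reworked path, paraphrased from the diff below:

        /* allocation fast path: note depletion once the batch runs out */
        __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
        if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 &&
            !zone_is_fair_depleted(zone))
                zone_set_flag(zone, ZONE_FAIR_DEPLETED);

        /* get_page_from_freelist(): skip depleted zones with a flag test */
        if (zone_is_fair_depleted(zone)) {
                nr_fair_skipped++;
                continue;
        }

        /* get_page_from_freelist(): rescan in place instead of failing
         * back to the caller for a reset and a second full attempt */
        if (alloc_flags & ALLOC_FAIR) {
                alloc_flags &= ~ALLOC_FAIR;
                if (nr_fair_skipped) {
                        zonelist_rescan = true;
                        reset_alloc_batches(preferred_zone);
                }
                if (nr_online_nodes > 1)
                        zonelist_rescan = true;
        }
        if (zonelist_rescan)
                goto zonelist_scan;

The batches are only refilled when depleted zones were actually skipped, so
the common case costs a single zonelist pass plus one flag test per zone.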

On UMA, the benefit is negligible -- around 0.25%.  On a 4-socket NUMA
machine the measured benefit is variable because measuring the overhead is
itself noisy with the vmstat changes.  The system CPU overhead comparison
looks like

          3.16.0-rc3  3.16.0-rc3  3.16.0-rc3
             vanilla   vmstat-v5 lowercost-v5
User          746.94      774.56      802.00
System      65336.22    32847.27    40852.33
Elapsed     27553.52    27415.04    27368.46

However, it is worth noting that the overall benchmark still completed
faster, and intuitively it makes sense to take as few passes as possible
through the zonelists.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>

Showing 2 changed files with 59 additions and 48 deletions

include/linux/mmzone.h
... ... @@ -529,6 +529,7 @@
529 529 ZONE_WRITEBACK, /* reclaim scanning has recently found
530 530 * many pages under writeback
531 531 */
  532 + ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */
532 533 } zone_flags_t;
533 534  
534 535 static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
... ... @@ -564,6 +565,11 @@
564 565 static inline int zone_is_reclaim_locked(const struct zone *zone)
565 566 {
566 567 return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
  568 +}
  569 +
  570 +static inline int zone_is_fair_depleted(const struct zone *zone)
  571 +{
  572 + return test_bit(ZONE_FAIR_DEPLETED, &zone->flags);
567 573 }
568 574  
569 575 static inline int zone_is_oom_locked(const struct zone *zone)
mm/page_alloc.c
... ... @@ -1589,6 +1589,9 @@
1589 1589 }
1590 1590  
1591 1591 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
  1592 + if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 &&
  1593 + !zone_is_fair_depleted(zone))
  1594 + zone_set_flag(zone, ZONE_FAIR_DEPLETED);
1592 1595  
1593 1596 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1594 1597 zone_statistics(preferred_zone, zone, gfp_flags);
... ... @@ -1913,6 +1916,18 @@
1913 1916 }
1914 1917 #endif /* CONFIG_NUMA */
1915 1918  
  1919 +static void reset_alloc_batches(struct zone *preferred_zone)
  1920 +{
  1921 + struct zone *zone = preferred_zone->zone_pgdat->node_zones;
  1922 +
  1923 + do {
  1924 + mod_zone_page_state(zone, NR_ALLOC_BATCH,
  1925 + high_wmark_pages(zone) - low_wmark_pages(zone) -
  1926 + atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
  1927 + zone_clear_flag(zone, ZONE_FAIR_DEPLETED);
  1928 + } while (zone++ != preferred_zone);
  1929 +}
  1930 +
1916 1931 /*
1917 1932 * get_page_from_freelist goes through the zonelist trying to allocate
1918 1933 * a page.
1919 1934  
... ... @@ -1930,8 +1945,12 @@
1930 1945 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1931 1946 bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
1932 1947 (gfp_mask & __GFP_WRITE);
  1948 + int nr_fair_skipped = 0;
  1949 + bool zonelist_rescan;
1933 1950  
1934 1951 zonelist_scan:
  1952 + zonelist_rescan = false;
  1953 +
1935 1954 /*
1936 1955 * Scan zonelist, looking for a zone with enough free.
1937 1956 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
1938 1957  
... ... @@ -1956,8 +1975,10 @@
1956 1975 if (alloc_flags & ALLOC_FAIR) {
1957 1976 if (!zone_local(preferred_zone, zone))
1958 1977 break;
1959   - if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
  1978 + if (zone_is_fair_depleted(zone)) {
  1979 + nr_fair_skipped++;
1960 1980 continue;
  1981 + }
1961 1982 }
1962 1983 /*
1963 1984 * When allocating a page cache page for writing, we
... ... @@ -2063,13 +2084,7 @@
2063 2084 zlc_mark_zone_full(zonelist, z);
2064 2085 }
2065 2086  
2066   - if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
2067   - /* Disable zlc cache for second zonelist scan */
2068   - zlc_active = 0;
2069   - goto zonelist_scan;
2070   - }
2071   -
2072   - if (page)
  2087 + if (page) {
2073 2088 /*
2074 2089 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
2075 2090 * necessary to allocate the page. The expectation is
2076 2091  
... ... @@ -2078,8 +2093,37 @@
2078 2093 * for !PFMEMALLOC purposes.
2079 2094 */
2080 2095 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
  2096 + return page;
  2097 + }
2081 2098  
2082   - return page;
  2099 + /*
  2100 + * The first pass makes sure allocations are spread fairly within the
  2101 + * local node. However, the local node might have free pages left
  2102 + * after the fairness batches are exhausted, and remote zones haven't
  2103 + * even been considered yet. Try once more without fairness, and
  2104 + * include remote zones now, before entering the slowpath and waking
  2105 + * kswapd: prefer spilling to a remote zone over swapping locally.
  2106 + */
  2107 + if (alloc_flags & ALLOC_FAIR) {
  2108 + alloc_flags &= ~ALLOC_FAIR;
  2109 + if (nr_fair_skipped) {
  2110 + zonelist_rescan = true;
  2111 + reset_alloc_batches(preferred_zone);
  2112 + }
  2113 + if (nr_online_nodes > 1)
  2114 + zonelist_rescan = true;
  2115 + }
  2116 +
  2117 + if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
  2118 + /* Disable zlc cache for second zonelist scan */
  2119 + zlc_active = 0;
  2120 + zonelist_rescan = true;
  2121 + }
  2122 +
  2123 + if (zonelist_rescan)
  2124 + goto zonelist_scan;
  2125 +
  2126 + return NULL;
2083 2127 }
2084 2128  
2085 2129 /*
... ... @@ -2407,28 +2451,6 @@
2407 2451 return page;
2408 2452 }
2409 2453  
2410   -static void reset_alloc_batches(struct zonelist *zonelist,
2411   - enum zone_type high_zoneidx,
2412   - struct zone *preferred_zone)
2413   -{
2414   - struct zoneref *z;
2415   - struct zone *zone;
2416   -
2417   - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
2418   - /*
2419   - * Only reset the batches of zones that were actually
2420   - * considered in the fairness pass, we don't want to
2421   - * trash fairness information for zones that are not
2422   - * actually part of this zonelist's round-robin cycle.
2423   - */
2424   - if (!zone_local(preferred_zone, zone))
2425   - continue;
2426   - mod_zone_page_state(zone, NR_ALLOC_BATCH,
2427   - high_wmark_pages(zone) - low_wmark_pages(zone) -
2428   - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
2429   - }
2430   -}
2431   -
2432 2454 static void wake_all_kswapds(unsigned int order,
2433 2455 struct zonelist *zonelist,
2434 2456 enum zone_type high_zoneidx,
2435 2457  
... ... @@ -2759,28 +2781,11 @@
2759 2781 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2760 2782 alloc_flags |= ALLOC_CMA;
2761 2783 #endif
2762   -retry:
2763 2784 /* First allocation attempt */
2764 2785 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2765 2786 zonelist, high_zoneidx, alloc_flags,
2766 2787 preferred_zone, classzone_idx, migratetype);
2767 2788 if (unlikely(!page)) {
2768   - /*
2769   - * The first pass makes sure allocations are spread
2770   - * fairly within the local node. However, the local
2771   - * node might have free pages left after the fairness
2772   - * batches are exhausted, and remote zones haven't
2773   - * even been considered yet. Try once more without
2774   - * fairness, and include remote zones now, before
2775   - * entering the slowpath and waking kswapd: prefer
2776   - * spilling to a remote zone over swapping locally.
2777   - */
2778   - if (alloc_flags & ALLOC_FAIR) {
2779   - reset_alloc_batches(zonelist, high_zoneidx,
2780   - preferred_zone);
2781   - alloc_flags &= ~ALLOC_FAIR;
2782   - goto retry;
2783   - }
2784 2789 /*
2785 2790 * Runtime PM, block IO and its error handling path
2786 2791 * can deadlock because I/O on the device might not