Commit 99ed1bd0c77355d65de5f112eb92d79f9bace84f

Authored by Mel Gorman
Committed by Jiri Slaby
1 parent 51aad0a515

mm: page_alloc: reduce cost of the fair zone allocation policy

commit 4ffeaf3560a52b4a69cc7909873d08c0ef5909d4 upstream.

The fair zone allocation policy round-robins allocations between zones
within a node to avoid age inversion problems during reclaim.  If the
first allocation fails, the batch counts are reset and a second attempt
made before entering the slow path.
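
For reference, the pre-patch flow looks roughly like the sketch below.  It
is condensed from the retry logic removed further down in the diff (in
__alloc_pages_nodemask()); unrelated arguments and error handling are
omitted.

retry:
        /* First pass: only local zones with fairness batch remaining */
        page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
                        order, zonelist, high_zoneidx, alloc_flags,
                        preferred_zone, classzone_idx, migratetype);
        if (unlikely(!page)) {
                if (alloc_flags & ALLOC_FAIR) {
                        /* Assume the batches expired: refill them and
                         * rescan the zonelist without the fairness limit */
                        reset_alloc_batches(zonelist, high_zoneidx,
                                            preferred_zone);
                        alloc_flags &= ~ALLOC_FAIR;
                        goto retry;
                }
                /* otherwise fall through to the slowpath */
        }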

One assumption made with this scheme is that batches expire at roughly
the same time and the resets each time are justified.  This assumption
does not hold when zones reach their low watermark as the batches will
be consumed at uneven rates.  Allocation failure due to watermark
depletion results in additional zonelist scans for the reset and another
watermark check before hitting the slowpath.
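
The change below therefore tracks batch depletion with a per-zone flag and
makes the reset-and-rescan decision inside get_page_from_freelist() itself.
A condensed view of the reworked path, paraphrased from the diff below:

        /* allocation fast path: note depletion once the batch runs out */
        __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
        if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 &&
            !zone_is_fair_depleted(zone))
                zone_set_flag(zone, ZONE_FAIR_DEPLETED);

        /* get_page_from_freelist(): skip depleted zones with a flag test */
        if (zone_is_fair_depleted(zone)) {
                nr_fair_skipped++;
                continue;
        }

        /* get_page_from_freelist(): rescan in place instead of failing
         * back to the caller for a reset and a second full attempt */
        if (alloc_flags & ALLOC_FAIR) {
                alloc_flags &= ~ALLOC_FAIR;
                if (nr_fair_skipped) {
                        zonelist_rescan = true;
                        reset_alloc_batches(preferred_zone);
                }
                if (nr_online_nodes > 1)
                        zonelist_rescan = true;
        }
        if (zonelist_rescan)
                goto zonelist_scan;

The batches are only refilled when depleted zones were actually skipped, so
the common case costs a single zonelist pass plus one flag test per zone.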

On UMA, the benefit is negligible -- around 0.25%.  On a 4-socket NUMA
machine the measured benefit is variable because measuring the overhead is
itself noisy with the vmstat changes.  The system CPU overhead comparison
looks like

          3.16.0-rc3  3.16.0-rc3  3.16.0-rc3
             vanilla   vmstat-v5 lowercost-v5
User          746.94      774.56      802.00
System      65336.22    32847.27    40852.33
Elapsed     27553.52    27415.04    27368.46

However, it is worth noting that the overall benchmark still completed
faster, and intuitively it makes sense to take as few passes as possible
through the zonelists.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>

Showing 2 changed files with 59 additions and 48 deletions

include/linux/mmzone.h
... ... @@ -529,6 +529,7 @@
529 529 ZONE_WRITEBACK, /* reclaim scanning has recently found
530 530 * many pages under writeback
531 531 */
  532 + ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */
532 533 } zone_flags_t;
533 534  
534 535 static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
... ... @@ -564,6 +565,11 @@
564 565 static inline int zone_is_reclaim_locked(const struct zone *zone)
565 566 {
566 567 return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
  568 +}
  569 +
  570 +static inline int zone_is_fair_depleted(const struct zone *zone)
  571 +{
  572 + return test_bit(ZONE_FAIR_DEPLETED, &zone->flags);
567 573 }
568 574  
569 575 static inline int zone_is_oom_locked(const struct zone *zone)
mm/page_alloc.c
... ... @@ -1589,6 +1589,9 @@
1589 1589 }
1590 1590  
1591 1591 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
  1592 + if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 &&
  1593 + !zone_is_fair_depleted(zone))
  1594 + zone_set_flag(zone, ZONE_FAIR_DEPLETED);
1592 1595  
1593 1596 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1594 1597 zone_statistics(preferred_zone, zone, gfp_flags);
... ... @@ -1913,6 +1916,18 @@
1913 1916 }
1914 1917 #endif /* CONFIG_NUMA */
1915 1918  
  1919 +static void reset_alloc_batches(struct zone *preferred_zone)
  1920 +{
  1921 + struct zone *zone = preferred_zone->zone_pgdat->node_zones;
  1922 +
  1923 + do {
  1924 + mod_zone_page_state(zone, NR_ALLOC_BATCH,
  1925 + high_wmark_pages(zone) - low_wmark_pages(zone) -
  1926 + atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
  1927 + zone_clear_flag(zone, ZONE_FAIR_DEPLETED);
  1928 + } while (zone++ != preferred_zone);
  1929 +}
  1930 +
1916 1931 /*
1917 1932 * get_page_from_freelist goes through the zonelist trying to allocate
1918 1933 * a page.
1919 1934  
... ... @@ -1930,8 +1945,12 @@
1930 1945 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1931 1946 bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
1932 1947 (gfp_mask & __GFP_WRITE);
  1948 + int nr_fair_skipped = 0;
  1949 + bool zonelist_rescan;
1933 1950  
1934 1951 zonelist_scan:
  1952 + zonelist_rescan = false;
  1953 +
1935 1954 /*
1936 1955 * Scan zonelist, looking for a zone with enough free.
1937 1956 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
1938 1957  
... ... @@ -1956,8 +1975,10 @@
1956 1975 if (alloc_flags & ALLOC_FAIR) {
1957 1976 if (!zone_local(preferred_zone, zone))
1958 1977 break;
1959   - if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
  1978 + if (zone_is_fair_depleted(zone)) {
  1979 + nr_fair_skipped++;
1960 1980 continue;
  1981 + }
1961 1982 }
1962 1983 /*
1963 1984 * When allocating a page cache page for writing, we
... ... @@ -2063,13 +2084,7 @@
2063 2084 zlc_mark_zone_full(zonelist, z);
2064 2085 }
2065 2086  
2066   - if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
2067   - /* Disable zlc cache for second zonelist scan */
2068   - zlc_active = 0;
2069   - goto zonelist_scan;
2070   - }
2071   -
2072   - if (page)
  2087 + if (page) {
2073 2088 /*
2074 2089 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
2075 2090 * necessary to allocate the page. The expectation is
2076 2091  
... ... @@ -2078,8 +2093,37 @@
2078 2093 * for !PFMEMALLOC purposes.
2079 2094 */
2080 2095 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
  2096 + return page;
  2097 + }
2081 2098  
2082   - return page;
  2099 + /*
  2100 + * The first pass makes sure allocations are spread fairly within the
  2101 + * local node. However, the local node might have free pages left
  2102 + * after the fairness batches are exhausted, and remote zones haven't
  2103 + * even been considered yet. Try once more without fairness, and
  2104 + * include remote zones now, before entering the slowpath and waking
  2105 + * kswapd: prefer spilling to a remote zone over swapping locally.
  2106 + */
  2107 + if (alloc_flags & ALLOC_FAIR) {
  2108 + alloc_flags &= ~ALLOC_FAIR;
  2109 + if (nr_fair_skipped) {
  2110 + zonelist_rescan = true;
  2111 + reset_alloc_batches(preferred_zone);
  2112 + }
  2113 + if (nr_online_nodes > 1)
  2114 + zonelist_rescan = true;
  2115 + }
  2116 +
  2117 + if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
  2118 + /* Disable zlc cache for second zonelist scan */
  2119 + zlc_active = 0;
  2120 + zonelist_rescan = true;
  2121 + }
  2122 +
  2123 + if (zonelist_rescan)
  2124 + goto zonelist_scan;
  2125 +
  2126 + return NULL;
2083 2127 }
2084 2128  
2085 2129 /*
... ... @@ -2407,28 +2451,6 @@
2407 2451 return page;
2408 2452 }
2409 2453  
2410   -static void reset_alloc_batches(struct zonelist *zonelist,
2411   - enum zone_type high_zoneidx,
2412   - struct zone *preferred_zone)
2413   -{
2414   - struct zoneref *z;
2415   - struct zone *zone;
2416   -
2417   - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
2418   - /*
2419   - * Only reset the batches of zones that were actually
2420   - * considered in the fairness pass, we don't want to
2421   - * trash fairness information for zones that are not
2422   - * actually part of this zonelist's round-robin cycle.
2423   - */
2424   - if (!zone_local(preferred_zone, zone))
2425   - continue;
2426   - mod_zone_page_state(zone, NR_ALLOC_BATCH,
2427   - high_wmark_pages(zone) - low_wmark_pages(zone) -
2428   - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
2429   - }
2430   -}
2431   -
2432 2454 static void wake_all_kswapds(unsigned int order,
2433 2455 struct zonelist *zonelist,
2434 2456 enum zone_type high_zoneidx,
2435 2457  
... ... @@ -2759,28 +2781,11 @@
2759 2781 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2760 2782 alloc_flags |= ALLOC_CMA;
2761 2783 #endif
2762   -retry:
2763 2784 /* First allocation attempt */
2764 2785 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2765 2786 zonelist, high_zoneidx, alloc_flags,
2766 2787 preferred_zone, classzone_idx, migratetype);
2767 2788 if (unlikely(!page)) {
2768   - /*
2769   - * The first pass makes sure allocations are spread
2770   - * fairly within the local node. However, the local
2771   - * node might have free pages left after the fairness
2772   - * batches are exhausted, and remote zones haven't
2773   - * even been considered yet. Try once more without
2774   - * fairness, and include remote zones now, before
2775   - * entering the slowpath and waking kswapd: prefer
2776   - * spilling to a remote zone over swapping locally.
2777   - */
2778   - if (alloc_flags & ALLOC_FAIR) {
2779   - reset_alloc_batches(zonelist, high_zoneidx,
2780   - preferred_zone);
2781   - alloc_flags &= ~ALLOC_FAIR;
2782   - goto retry;
2783   - }
2784 2789 /*
2785 2790 * Runtime PM, block IO and its error handling path
2786 2791 * can deadlock because I/O on the device might not