Commit 418589663d6011de9006425b6c5721e1544fb47a

Authored by Mel Gorman
Committed by Linus Torvalds
1 parent a3af9c389a

page allocator: use allocation flags as an index to the zone watermark

ALLOC_WMARK_MIN, ALLOC_WMARK_LOW and ALLOC_WMARK_HIGH determine whether
pages_min, pages_low or pages_high is used as the zone watermark when
allocating the pages.  Two branches in the allocator hotpath determine
which watermark to use.

This patch uses the flags as an array index into a watermark array that is
indexed with WMARK_* defines accessed via helpers.  All call sites that
use zone->pages_* are updated to use the helpers for accessing the values
and the array offsets for setting.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: Christoph Lameter <cl@linux-foundation.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 7 changed files with 83 additions and 64 deletions Side-by-side Diff

Documentation/sysctl/vm.txt
... ... @@ -233,8 +233,8 @@
233 233 for page allocation or should be reclaimed.
234 234  
235 235 In this example, if normal pages (index=2) are required to this DMA zone and
236   -pages_high is used for watermark, the kernel judges this zone should not be
237   -used because pages_free(1355) is smaller than watermark + protection[2]
  236 +watermark[WMARK_HIGH] is used for watermark, the kernel judges this zone should
  237 +not be used because pages_free(1355) is smaller than watermark + protection[2]
238 238 (4 + 2004 = 2008). If this protection value is 0, this zone would be used for
239 239 normal page requirement. If requirement is DMA zone(index=0), protection[0]
240 240 (=0) is used.
... ... @@ -280,9 +280,10 @@
280 280 min_free_kbytes:
281 281  
282 282 This is used to force the Linux VM to keep a minimum number
283   -of kilobytes free. The VM uses this number to compute a pages_min
284   -value for each lowmem zone in the system. Each lowmem zone gets
285   -a number of reserved free pages based proportionally on its size.
  283 +of kilobytes free. The VM uses this number to compute a
  284 +watermark[WMARK_MIN] value for each lowmem zone in the system.
  285 +Each lowmem zone gets a number of reserved free pages based
  286 +proportionally on its size.
286 287  
287 288 Some minimal amount of memory is needed to satisfy PF_MEMALLOC
288 289 allocations; if you set this to lower than 1024KB, your system will
Documentation/vm/balance
... ... @@ -75,15 +75,15 @@
75 75 alleviate memory pressure on any zone in the page's node that has fallen below
76 76 its watermark.
77 77  
78   -pages_min/pages_low/pages_high/low_on_memory/zone_wake_kswapd: These are
79   -per-zone fields, used to determine when a zone needs to be balanced. When
80   -the number of pages falls below pages_min, the hysteric field low_on_memory
81   -gets set. This stays set till the number of free pages becomes pages_high.
82   -When low_on_memory is set, page allocation requests will try to free some
83   -pages in the zone (providing GFP_WAIT is set in the request). Orthogonal
84   -to this, is the decision to poke kswapd to free some zone pages. That
85   -decision is not hysteresis based, and is done when the number of free
86   -pages is below pages_low; in which case zone_wake_kswapd is also set.
  78 +watermark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: These
  79 +are per-zone fields, used to determine when a zone needs to be balanced. When
  80 +the number of pages falls below watermark[WMARK_MIN], the hysteric field
  81 +low_on_memory gets set. This stays set till the number of free pages becomes
  82 +watermark[WMARK_HIGH]. When low_on_memory is set, page allocation requests will
  83 +try to free some pages in the zone (providing GFP_WAIT is set in the request).
  84 +Orthogonal to this, is the decision to poke kswapd to free some zone pages.
  85 +That decision is not hysteresis based, and is done when the number of free
  86 +pages is below watermark[WMARK_LOW]; in which case zone_wake_kswapd is also set.
87 87  
88 88  
89 89 (Good) Ideas that I have heard:
arch/m32r/mm/discontig.c
... ... @@ -154,9 +154,9 @@
154 154 * Use all area of internal RAM.
155 155 * see __alloc_pages()
156 156 */
157   - NODE_DATA(1)->node_zones->pages_min = 0;
158   - NODE_DATA(1)->node_zones->pages_low = 0;
159   - NODE_DATA(1)->node_zones->pages_high = 0;
  157 + NODE_DATA(1)->node_zones->watermark[WMARK_MIN] = 0;
  158 + NODE_DATA(1)->node_zones->watermark[WMARK_LOW] = 0;
  159 + NODE_DATA(1)->node_zones->watermark[WMARK_HIGH] = 0;
160 160  
161 161 return holes;
162 162 }
include/linux/mmzone.h
... ... @@ -163,6 +163,17 @@
163 163 #endif
164 164 }
165 165  
  166 +enum zone_watermarks {
  167 + WMARK_MIN,
  168 + WMARK_LOW,
  169 + WMARK_HIGH,
  170 + NR_WMARK
  171 +};
  172 +
  173 +#define min_wmark_pages(z) (z->watermark[WMARK_MIN])
  174 +#define low_wmark_pages(z) (z->watermark[WMARK_LOW])
  175 +#define high_wmark_pages(z) (z->watermark[WMARK_HIGH])
  176 +
166 177 struct per_cpu_pages {
167 178 int count; /* number of pages in the list */
168 179 int high; /* high watermark, emptying needed */
... ... @@ -275,7 +286,10 @@
275 286  
276 287 struct zone {
277 288 /* Fields commonly accessed by the page allocator */
278   - unsigned long pages_min, pages_low, pages_high;
  289 +
  290 + /* zone watermarks, access with *_wmark_pages(zone) macros */
  291 + unsigned long watermark[NR_WMARK];
  292 +
279 293 /*
280 294 * We don't know if the memory that we're going to allocate will be freeable
281 295 * or/and it will be released eventually, so to avoid totally wasting several
... ... @@ -1150,10 +1150,15 @@
1150 1150 return NULL;
1151 1151 }
1152 1152  
1153   -#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */
1154   -#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */
1155   -#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */
1156   -#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */
  1153 +/* The ALLOC_WMARK bits are used as an index to zone->watermark */
  1154 +#define ALLOC_WMARK_MIN WMARK_MIN
  1155 +#define ALLOC_WMARK_LOW WMARK_LOW
  1156 +#define ALLOC_WMARK_HIGH WMARK_HIGH
  1157 +#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
  1158 +
  1159 +/* Mask to get the watermark bits */
  1160 +#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
  1161 +
1157 1162 #define ALLOC_HARDER 0x10 /* try to alloc harder */
1158 1163 #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
1159 1164 #define ALLOC_CPUSET 0x40 /* check for correct cpuset */
1160 1165  
... ... @@ -1440,14 +1445,10 @@
1440 1445 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1441 1446 goto try_next_zone;
1442 1447  
  1448 + BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1443 1449 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
1444 1450 unsigned long mark;
1445   - if (alloc_flags & ALLOC_WMARK_MIN)
1446   - mark = zone->pages_min;
1447   - else if (alloc_flags & ALLOC_WMARK_LOW)
1448   - mark = zone->pages_low;
1449   - else
1450   - mark = zone->pages_high;
  1451 + mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1451 1452 if (!zone_watermark_ok(zone, order, mark,
1452 1453 classzone_idx, alloc_flags)) {
1453 1454 if (!zone_reclaim_mode ||
... ... @@ -1959,7 +1960,7 @@
1959 1960  
1960 1961 for_each_zone_zonelist(zone, z, zonelist, offset) {
1961 1962 unsigned long size = zone->present_pages;
1962   - unsigned long high = zone->pages_high;
  1963 + unsigned long high = high_wmark_pages(zone);
1963 1964 if (size > high)
1964 1965 sum += size - high;
1965 1966 }
... ... @@ -2096,9 +2097,9 @@
2096 2097 "\n",
2097 2098 zone->name,
2098 2099 K(zone_page_state(zone, NR_FREE_PAGES)),
2099   - K(zone->pages_min),
2100   - K(zone->pages_low),
2101   - K(zone->pages_high),
  2100 + K(min_wmark_pages(zone)),
  2101 + K(low_wmark_pages(zone)),
  2102 + K(high_wmark_pages(zone)),
2102 2103 K(zone_page_state(zone, NR_ACTIVE_ANON)),
2103 2104 K(zone_page_state(zone, NR_INACTIVE_ANON)),
2104 2105 K(zone_page_state(zone, NR_ACTIVE_FILE)),
... ... @@ -2702,8 +2703,8 @@
2702 2703  
2703 2704 /*
2704 2705 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
2705   - * of blocks reserved is based on zone->pages_min. The memory within the
2706   - * reserve will tend to store contiguous free pages. Setting min_free_kbytes
  2706 + * of blocks reserved is based on min_wmark_pages(zone). The memory within
  2707 + * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
2707 2708 * higher will lead to a bigger reserve which will get freed as contiguous
2708 2709 * blocks as reclaim kicks in
2709 2710 */
... ... @@ -2716,7 +2717,7 @@
2716 2717 /* Get the start pfn, end pfn and the number of blocks to reserve */
2717 2718 start_pfn = zone->zone_start_pfn;
2718 2719 end_pfn = start_pfn + zone->spanned_pages;
2719   - reserve = roundup(zone->pages_min, pageblock_nr_pages) >>
  2720 + reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
2720 2721 pageblock_order;
2721 2722  
2722 2723 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
... ... @@ -4319,8 +4320,8 @@
4319 4320 max = zone->lowmem_reserve[j];
4320 4321 }
4321 4322  
4322   - /* we treat pages_high as reserved pages. */
4323   - max += zone->pages_high;
  4323 + /* we treat the high watermark as reserved pages. */
  4324 + max += high_wmark_pages(zone);
4324 4325  
4325 4326 if (max > zone->present_pages)
4326 4327 max = zone->present_pages;
... ... @@ -4400,7 +4401,7 @@
4400 4401 * need highmem pages, so cap pages_min to a small
4401 4402 * value here.
4402 4403 *
4403   - * The (pages_high-pages_low) and (pages_low-pages_min)
  4404 + * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
4404 4405 * deltas controls asynch page reclaim, and so should
4405 4406 * not be capped for highmem.
4406 4407 */
4407 4408  
4408 4409  
... ... @@ -4411,17 +4412,17 @@
4411 4412 min_pages = SWAP_CLUSTER_MAX;
4412 4413 if (min_pages > 128)
4413 4414 min_pages = 128;
4414   - zone->pages_min = min_pages;
  4415 + zone->watermark[WMARK_MIN] = min_pages;
4415 4416 } else {
4416 4417 /*
4417 4418 * If it's a lowmem zone, reserve a number of pages
4418 4419 * proportionate to the zone's size.
4419 4420 */
4420   - zone->pages_min = tmp;
  4421 + zone->watermark[WMARK_MIN] = tmp;
4421 4422 }
4422 4423  
4423   - zone->pages_low = zone->pages_min + (tmp >> 2);
4424   - zone->pages_high = zone->pages_min + (tmp >> 1);
  4424 + zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
  4425 + zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
4425 4426 setup_zone_migrate_reserve(zone);
4426 4427 spin_unlock_irqrestore(&zone->lock, flags);
4427 4428 }
... ... @@ -4566,7 +4567,7 @@
4566 4567 * whenever sysctl_lowmem_reserve_ratio changes.
4567 4568 *
4568 4569 * The reserve ratio obviously has absolutely no relation with the
4569   - * pages_min watermarks. The lowmem reserve ratio can only make sense
  4570 + * minimum watermarks. The lowmem reserve ratio can only make sense
4570 4571 * if in function of the boot time zone sizes.
4571 4572 */
4572 4573 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
... ... @@ -1401,7 +1401,7 @@
1401 1401 free = zone_page_state(zone, NR_FREE_PAGES);
1402 1402 /* If we have very few page cache pages,
1403 1403 force-scan anon pages. */
1404   - if (unlikely(file + free <= zone->pages_high)) {
  1404 + if (unlikely(file + free <= high_wmark_pages(zone))) {
1405 1405 percent[0] = 100;
1406 1406 percent[1] = 0;
1407 1407 return;
1408 1408  
... ... @@ -1533,11 +1533,13 @@
1533 1533 * try to reclaim pages from zones which will satisfy the caller's allocation
1534 1534 * request.
1535 1535 *
1536   - * We reclaim from a zone even if that zone is over pages_high. Because:
  1536 + * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
  1537 + * Because:
1537 1538 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
1538 1539 * allocation or
1539   - * b) The zones may be over pages_high but they must go *over* pages_high to
1540   - * satisfy the `incremental min' zone defense algorithm.
  1540 + * b) The target zone may be at high_wmark_pages(zone) but the lower zones
  1541 + * must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
  1542 + * zone defense algorithm.
1541 1543 *
1542 1544 * If a zone is deemed to be full of pinned pages then just give it a light
1543 1545 * scan then give up on it.
... ... @@ -1743,7 +1745,7 @@
1743 1745  
1744 1746 /*
1745 1747 * For kswapd, balance_pgdat() will work across all this node's zones until
1746   - * they are all at pages_high.
  1748 + * they are all at high_wmark_pages(zone).
1747 1749 *
1748 1750 * Returns the number of pages which were actually freed.
1749 1751 *
... ... @@ -1756,11 +1758,11 @@
1756 1758 * the zone for when the problem goes away.
1757 1759 *
1758 1760 * kswapd scans the zones in the highmem->normal->dma direction. It skips
1759   - * zones which have free_pages > pages_high, but once a zone is found to have
1760   - * free_pages <= pages_high, we scan that zone and the lower zones regardless
1761   - * of the number of free pages in the lower zones. This interoperates with
1762   - * the page allocator fallback scheme to ensure that aging of pages is balanced
1763   - * across the zones.
  1761 + * zones which have free_pages > high_wmark_pages(zone), but once a zone is
  1762 + * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
  1763 + * lower zones regardless of the number of free pages in the lower zones. This
  1764 + * interoperates with the page allocator fallback scheme to ensure that aging
  1765 + * of pages is balanced across the zones.
1764 1766 */
1765 1767 static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1766 1768 {
... ... @@ -1781,7 +1783,8 @@
1781 1783 };
1782 1784 /*
1783 1785 * temp_priority is used to remember the scanning priority at which
1784   - * this zone was successfully refilled to free_pages == pages_high.
  1786 + * this zone was successfully refilled to
  1787 + * free_pages == high_wmark_pages(zone).
1785 1788 */
1786 1789 int temp_priority[MAX_NR_ZONES];
1787 1790  
... ... @@ -1826,8 +1829,8 @@
1826 1829 shrink_active_list(SWAP_CLUSTER_MAX, zone,
1827 1830 &sc, priority, 0);
1828 1831  
1829   - if (!zone_watermark_ok(zone, order, zone->pages_high,
1830   - 0, 0)) {
  1832 + if (!zone_watermark_ok(zone, order,
  1833 + high_wmark_pages(zone), 0, 0)) {
1831 1834 end_zone = i;
1832 1835 break;
1833 1836 }
... ... @@ -1861,8 +1864,8 @@
1861 1864 priority != DEF_PRIORITY)
1862 1865 continue;
1863 1866  
1864   - if (!zone_watermark_ok(zone, order, zone->pages_high,
1865   - end_zone, 0))
  1867 + if (!zone_watermark_ok(zone, order,
  1868 + high_wmark_pages(zone), end_zone, 0))
1866 1869 all_zones_ok = 0;
1867 1870 temp_priority[i] = priority;
1868 1871 sc.nr_scanned = 0;
... ... @@ -1871,8 +1874,8 @@
1871 1874 * We put equal pressure on every zone, unless one
1872 1875 * zone has way too many pages free already.
1873 1876 */
1874   - if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
1875   - end_zone, 0))
  1877 + if (!zone_watermark_ok(zone, order,
  1878 + 8*high_wmark_pages(zone), end_zone, 0))
1876 1879 shrink_zone(priority, zone, &sc);
1877 1880 reclaim_state->reclaimed_slab = 0;
1878 1881 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
... ... @@ -2038,7 +2041,7 @@
2038 2041 return;
2039 2042  
2040 2043 pgdat = zone->zone_pgdat;
2041   - if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
  2044 + if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
2042 2045 return;
2043 2046 if (pgdat->kswapd_max_order < order)
2044 2047 pgdat->kswapd_max_order = order;
... ... @@ -714,9 +714,9 @@
714 714 "\n spanned %lu"
715 715 "\n present %lu",
716 716 zone_page_state(zone, NR_FREE_PAGES),
717   - zone->pages_min,
718   - zone->pages_low,
719   - zone->pages_high,
  717 + min_wmark_pages(zone),
  718 + low_wmark_pages(zone),
  719 + high_wmark_pages(zone),
720 720 zone->pages_scanned,
721 721 zone->lru[LRU_ACTIVE_ANON].nr_scan,
722 722 zone->lru[LRU_INACTIVE_ANON].nr_scan,