Commit 2a1e274acf0b1c192face19a4be7c12d4503eaaf

Authored by Mel Gorman
Committed by Linus Torvalds
1 parent 769848c038

Create the ZONE_MOVABLE zone

The following 8 patches against 2.6.20-mm2 create a zone called ZONE_MOVABLE
that is only usable by allocations that specify both __GFP_HIGHMEM and
__GFP_MOVABLE.  This has the effect of keeping all non-movable pages within a
single memory partition while allowing movable allocations to be satisfied
from either partition.  The patches may be applied with the list-based
anti-fragmentation patches that group pages together based on mobility.
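
As an illustration only (not part of this patch), a caller that wants its
pages to be eligible for ZONE_MOVABLE allocates with both flags set, for
example via GFP_HIGHUSER_MOVABLE, which is defined earlier in the series:

	/*
	 * Sketch only: a movable highmem allocation.  gfp_zone() maps
	 * this flag combination to ZONE_MOVABLE; the zonelist falls back
	 * to the lower zones as usual.
	 */
	struct page *page = alloc_pages(GFP_HIGHUSER_MOVABLE, 0);

	if (page)
		__free_pages(page, 0);

Allocations that clear either __GFP_HIGHMEM or __GFP_MOVABLE never have
ZONE_MOVABLE in their zonelist and so stay within the kernel partition.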

The size of the zone is determined by a kernelcore= parameter specified at
boot-time.  This specifies how much memory is usable by non-movable
allocations; the remainder is used for ZONE_MOVABLE.  Any range of pages
within ZONE_MOVABLE can be released by migrating the pages or by reclaiming.
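
For example (sizes purely illustrative), booting with

	kernelcore=512M

keeps roughly 512MB, spread across nodes, usable by non-movable allocations
and assigns the remainder of the highest populated zone to ZONE_MOVABLE.
The value is parsed with memparse(), so the usual K/M/G suffixes work.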

When selecting a zone to take pages from for ZONE_MOVABLE, there are two
things to consider.  First, only memory from the highest populated zone is
used for ZONE_MOVABLE.  On x86 this will typically be ZONE_HIGHMEM, but it
would be ZONE_DMA on ppc64 or possibly ZONE_DMA32 on x86_64.  Second,
the amount of memory usable by the kernel will be spread evenly throughout
NUMA nodes where possible.  If the nodes are not of equal size, the amount of
memory usable by the kernel on some nodes may be greater than others.
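
As a hypothetical example of the spreading (zone boundaries ignored for
simplicity): with kernelcore=2G on a two-node machine where node 0 has 3GB
and node 1 has 512MB, the first pass asks each node for 1GB.  Node 1 can
only supply 512MB, so a second pass takes the shortfall from node 0,
leaving node 0 with 1.5GB of kernel-usable memory and 1.5GB of ZONE_MOVABLE,
and node 1 with no ZONE_MOVABLE at all.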

By default, the zone is not used for hugetlb allocations because huge pages
are pinned and non-migratable (currently at least).  A sysctl is provided that
allows huge pages to be allocated from that zone.  This means that the huge
page pool can be resized to the size of ZONE_MOVABLE during the lifetime of
the system assuming that pages are not mlocked.  Despite huge pages being
non-movable, we do not introduce additional external fragmentation of note as
huge pages are always the largest contiguous block we care about.
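
For instance, once that sysctl (introduced separately in this series) is
enabled, growing the pool through the existing interface, e.g.

	echo 1024 > /proc/sys/vm/nr_hugepages

allows the additional huge pages to be taken from ZONE_MOVABLE rather than
only from the kernel partition.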

Credit goes to Andy Whitcroft for catching a large variety of problems during
review of the patches.

This patch creates an additional zone, ZONE_MOVABLE.  This zone is only usable
by allocations which specify both __GFP_HIGHMEM and __GFP_MOVABLE.  Hot-added
memory continues to be placed in its existing destination as there is no
mechanism to redirect it to a specific zone.

[y-goto@jp.fujitsu.com: Fix section mismatch of memory hotplug related code]
[akpm@linux-foundation.org: various fixes]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Cc: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

7 changed files with 260 additions and 10 deletions

... ... @@ -106,6 +106,9 @@
106 106 if (flags & __GFP_DMA32)
107 107 return ZONE_DMA32;
108 108 #endif
  109 + if ((flags & (__GFP_HIGHMEM | __GFP_MOVABLE)) ==
  110 + (__GFP_HIGHMEM | __GFP_MOVABLE))
  111 + return ZONE_MOVABLE;
109 112 #ifdef CONFIG_HIGHMEM
110 113 if (flags & __GFP_HIGHMEM)
111 114 return ZONE_HIGHMEM;
... ... @@ -1005,6 +1005,7 @@
1005 1005 extern void free_bootmem_with_active_regions(int nid,
1006 1006 unsigned long max_low_pfn);
1007 1007 extern void sparse_memory_present_with_active_regions(int nid);
  1008 +extern int cmdline_parse_kernelcore(char *p);
1008 1009 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
1009 1010 extern int early_pfn_to_nid(unsigned long pfn);
1010 1011 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
include/linux/mmzone.h
... ... @@ -146,6 +146,7 @@
146 146 */
147 147 ZONE_HIGHMEM,
148 148 #endif
  149 + ZONE_MOVABLE,
149 150 MAX_NR_ZONES
150 151 };
151 152  
... ... @@ -167,6 +168,7 @@
167 168 + defined(CONFIG_ZONE_DMA32) \
168 169 + 1 \
169 170 + defined(CONFIG_HIGHMEM) \
  171 + + 1 \
170 172 )
171 173 #if __ZONE_COUNT < 2
172 174 #define ZONES_SHIFT 0
173 175  
... ... @@ -499,10 +501,22 @@
499 501 return (!!zone->present_pages);
500 502 }
501 503  
  504 +extern int movable_zone;
  505 +
  506 +static inline int zone_movable_is_highmem(void)
  507 +{
  508 +#if defined(CONFIG_HIGHMEM) && defined(CONFIG_ARCH_POPULATES_NODE_MAP)
  509 + return movable_zone == ZONE_HIGHMEM;
  510 +#else
  511 + return 0;
  512 +#endif
  513 +}
  514 +
502 515 static inline int is_highmem_idx(enum zone_type idx)
503 516 {
504 517 #ifdef CONFIG_HIGHMEM
505   - return (idx == ZONE_HIGHMEM);
  518 + return (idx == ZONE_HIGHMEM ||
  519 + (idx == ZONE_MOVABLE && zone_movable_is_highmem()));
506 520 #else
507 521 return 0;
508 522 #endif
... ... @@ -522,7 +536,9 @@
522 536 static inline int is_highmem(struct zone *zone)
523 537 {
524 538 #ifdef CONFIG_HIGHMEM
525   - return zone == zone->zone_pgdat->node_zones + ZONE_HIGHMEM;
  539 + int zone_idx = zone - zone->zone_pgdat->node_zones;
  540 + return zone_idx == ZONE_HIGHMEM ||
  541 + (zone_idx == ZONE_MOVABLE && zone_movable_is_highmem());
526 542 #else
527 543 return 0;
528 544 #endif
include/linux/vmstat.h
... ... @@ -25,7 +25,7 @@
25 25 #define HIGHMEM_ZONE(xx)
26 26 #endif
27 27  
28   -#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL HIGHMEM_ZONE(xx)
  28 +#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL HIGHMEM_ZONE(xx) , xx##_MOVABLE
29 29  
30 30 enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
31 31 FOR_ALL_ZONES(PGALLOC),
... ... @@ -170,7 +170,8 @@
170 170 #ifdef CONFIG_HIGHMEM
171 171 zone_page_state(&zones[ZONE_HIGHMEM], item) +
172 172 #endif
173   - zone_page_state(&zones[ZONE_NORMAL], item);
  173 + zone_page_state(&zones[ZONE_NORMAL], item) +
  174 + zone_page_state(&zones[ZONE_MOVABLE], item);
174 175 }
175 176  
176 177 extern void zone_statistics(struct zonelist *, struct zone *);
... ... @@ -46,9 +46,14 @@
46 46 pg_data_t *pgdat;
47 47 unsigned int pages = 0;
48 48  
49   - for_each_online_pgdat(pgdat)
  49 + for_each_online_pgdat(pgdat) {
50 50 pages += zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
51 51 NR_FREE_PAGES);
  52 + if (zone_movable_is_highmem())
  53 + pages += zone_page_state(
  54 + &pgdat->node_zones[ZONE_MOVABLE],
  55 + NR_FREE_PAGES);
  56 + }
52 57  
53 58 return pages;
54 59 }
... ... @@ -80,8 +80,9 @@
80 80 256,
81 81 #endif
82 82 #ifdef CONFIG_HIGHMEM
83   - 32
  83 + 32,
84 84 #endif
  85 + 32,
85 86 };
86 87  
87 88 EXPORT_SYMBOL(totalram_pages);
88 89  
... ... @@ -95,8 +96,9 @@
95 96 #endif
96 97 "Normal",
97 98 #ifdef CONFIG_HIGHMEM
98   - "HighMem"
  99 + "HighMem",
99 100 #endif
  101 + "Movable",
100 102 };
101 103  
102 104 int min_free_kbytes = 1024;
... ... @@ -134,6 +136,12 @@
134 136 static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
135 137 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
136 138 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
  139 + unsigned long __initdata required_kernelcore;
  140 + unsigned long __initdata zone_movable_pfn[MAX_NUMNODES];
  141 +
  142 + /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
  143 + int movable_zone;
  144 + EXPORT_SYMBOL(movable_zone);
137 145 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
138 146  
139 147 #if MAX_NUMNODES > 1
... ... @@ -1480,7 +1488,7 @@
1480 1488 */
1481 1489 unsigned int nr_free_pagecache_pages(void)
1482 1490 {
1483   - return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
  1491 + return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
1484 1492 }
1485 1493  
1486 1494 static inline void show_node(struct zone *zone)
... ... @@ -2667,6 +2675,63 @@
2667 2675 }
2668 2676  
2669 2677 /*
  2678 + * This finds a zone that can be used for ZONE_MOVABLE pages. The
  2679 + * assumption is made that zones within a node are ordered in monotonically
  2680 + * increasing memory addresses so that the "highest" populated zone is used
  2681 + */
  2682 +void __init find_usable_zone_for_movable(void)
  2683 +{
  2684 + int zone_index;
  2685 + for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
  2686 + if (zone_index == ZONE_MOVABLE)
  2687 + continue;
  2688 +
  2689 + if (arch_zone_highest_possible_pfn[zone_index] >
  2690 + arch_zone_lowest_possible_pfn[zone_index])
  2691 + break;
  2692 + }
  2693 +
  2694 + VM_BUG_ON(zone_index == -1);
  2695 + movable_zone = zone_index;
  2696 +}
  2697 +
  2698 +/*
  2699 + * The zone ranges provided by the architecture do not include ZONE_MOVABLE
  2700 + * because it is sized independent of architecture. Unlike the other zones,
  2701 + * the starting point for ZONE_MOVABLE is not fixed. It may be different
  2702 + * in each node depending on the size of each node and how evenly kernelcore
  2703 + * is distributed. This helper function adjusts the zone ranges
  2704 + * provided by the architecture for a given node by using the end of the
  2705 + * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
  2706 + * zones within a node are in order of monotonically increasing memory addresses
  2707 + */
  2708 +void __meminit adjust_zone_range_for_zone_movable(int nid,
  2709 + unsigned long zone_type,
  2710 + unsigned long node_start_pfn,
  2711 + unsigned long node_end_pfn,
  2712 + unsigned long *zone_start_pfn,
  2713 + unsigned long *zone_end_pfn)
  2714 +{
  2715 + /* Only adjust if ZONE_MOVABLE is on this node */
  2716 + if (zone_movable_pfn[nid]) {
  2717 + /* Size ZONE_MOVABLE */
  2718 + if (zone_type == ZONE_MOVABLE) {
  2719 + *zone_start_pfn = zone_movable_pfn[nid];
  2720 + *zone_end_pfn = min(node_end_pfn,
  2721 + arch_zone_highest_possible_pfn[movable_zone]);
  2722 +
  2723 + /* Adjust for ZONE_MOVABLE starting within this range */
  2724 + } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
  2725 + *zone_end_pfn > zone_movable_pfn[nid]) {
  2726 + *zone_end_pfn = zone_movable_pfn[nid];
  2727 +
  2728 + /* Check if this whole range is within ZONE_MOVABLE */
  2729 + } else if (*zone_start_pfn >= zone_movable_pfn[nid])
  2730 + *zone_start_pfn = *zone_end_pfn;
  2731 + }
  2732 +}
  2733 +
  2734 +/*
2670 2735 * Return the number of pages a zone spans in a node, including holes
2671 2736 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
2672 2737 */
... ... @@ -2681,6 +2746,9 @@
2681 2746 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
2682 2747 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
2683 2748 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
  2749 + adjust_zone_range_for_zone_movable(nid, zone_type,
  2750 + node_start_pfn, node_end_pfn,
  2751 + &zone_start_pfn, &zone_end_pfn);
2684 2752  
2685 2753 /* Check that this node has pages within the zone's required range */
2686 2754 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
... ... @@ -2771,6 +2839,9 @@
2771 2839 zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
2772 2840 node_end_pfn);
2773 2841  
  2842 + adjust_zone_range_for_zone_movable(nid, zone_type,
  2843 + node_start_pfn, node_end_pfn,
  2844 + &zone_start_pfn, &zone_end_pfn);
2774 2845 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
2775 2846 }
2776 2847  
... ... @@ -3148,6 +3219,122 @@
3148 3219 return max_pfn;
3149 3220 }
3150 3221  
  3222 +/*
  3223 + * Find the PFN the Movable zone begins in each node. Kernel memory
  3224 + * is spread evenly between nodes as long as the nodes have enough
  3225 + * memory. When they don't, some nodes will have more kernelcore than
  3226 + * others
  3227 + */
  3228 +void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
  3229 +{
  3230 + int i, nid;
  3231 + unsigned long usable_startpfn;
  3232 + unsigned long kernelcore_node, kernelcore_remaining;
  3233 + int usable_nodes = num_online_nodes();
  3234 +
  3235 + /* If kernelcore was not specified, there is no ZONE_MOVABLE */
  3236 + if (!required_kernelcore)
  3237 + return;
  3238 +
  3239 + /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
  3240 + find_usable_zone_for_movable();
  3241 + usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
  3242 +
  3243 +restart:
  3244 + /* Spread kernelcore memory as evenly as possible throughout nodes */
  3245 + kernelcore_node = required_kernelcore / usable_nodes;
  3246 + for_each_online_node(nid) {
  3247 + /*
  3248 + * Recalculate kernelcore_node if the division per node
  3249 + * now exceeds what is necessary to satisfy the requested
  3250 + * amount of memory for the kernel
  3251 + */
  3252 + if (required_kernelcore < kernelcore_node)
  3253 + kernelcore_node = required_kernelcore / usable_nodes;
  3254 +
  3255 + /*
  3256 + * As the map is walked, we track how much memory is usable
  3257 + * by the kernel using kernelcore_remaining. When it is
  3258 + * 0, the rest of the node is usable by ZONE_MOVABLE
  3259 + */
  3260 + kernelcore_remaining = kernelcore_node;
  3261 +
  3262 + /* Go through each range of PFNs within this node */
  3263 + for_each_active_range_index_in_nid(i, nid) {
  3264 + unsigned long start_pfn, end_pfn;
  3265 + unsigned long size_pages;
  3266 +
  3267 + start_pfn = max(early_node_map[i].start_pfn,
  3268 + zone_movable_pfn[nid]);
  3269 + end_pfn = early_node_map[i].end_pfn;
  3270 + if (start_pfn >= end_pfn)
  3271 + continue;
  3272 +
  3273 + /* Account for what is only usable for kernelcore */
  3274 + if (start_pfn < usable_startpfn) {
  3275 + unsigned long kernel_pages;
  3276 + kernel_pages = min(end_pfn, usable_startpfn)
  3277 + - start_pfn;
  3278 +
  3279 + kernelcore_remaining -= min(kernel_pages,
  3280 + kernelcore_remaining);
  3281 + required_kernelcore -= min(kernel_pages,
  3282 + required_kernelcore);
  3283 +
  3284 + /* Continue if range is now fully accounted */
  3285 + if (end_pfn <= usable_startpfn) {
  3286 +
  3287 + /*
  3288 + * Push zone_movable_pfn to the end so
  3289 + * that if we have to rebalance
  3290 + * kernelcore across nodes, we will
  3291 + * not double account here
  3292 + */
  3293 + zone_movable_pfn[nid] = end_pfn;
  3294 + continue;
  3295 + }
  3296 + start_pfn = usable_startpfn;
  3297 + }
  3298 +
  3299 + /*
  3300 + * The usable PFN range for ZONE_MOVABLE is from
  3301 + * start_pfn->end_pfn. Calculate size_pages as the
  3302 + * number of pages used as kernelcore
  3303 + */
  3304 + size_pages = end_pfn - start_pfn;
  3305 + if (size_pages > kernelcore_remaining)
  3306 + size_pages = kernelcore_remaining;
  3307 + zone_movable_pfn[nid] = start_pfn + size_pages;
  3308 +
  3309 + /*
  3310 + * Some kernelcore has been met, update counts and
  3311 + * break if the kernelcore for this node has been
  3312 + * satisfied
  3313 + */
  3314 + required_kernelcore -= min(required_kernelcore,
  3315 + size_pages);
  3316 + kernelcore_remaining -= size_pages;
  3317 + if (!kernelcore_remaining)
  3318 + break;
  3319 + }
  3320 + }
  3321 +
  3322 + /*
  3323 + * If there is still required_kernelcore, we do another pass with one
  3324 + * less node in the count. This will push zone_movable_pfn[nid] further
  3325 + * along on the nodes that still have memory until kernelcore is
  3326 + * satisfied
  3327 + */
  3328 + usable_nodes--;
  3329 + if (usable_nodes && required_kernelcore > usable_nodes)
  3330 + goto restart;
  3331 +
  3332 + /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
  3333 + for (nid = 0; nid < MAX_NUMNODES; nid++)
  3334 + zone_movable_pfn[nid] =
  3335 + roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
  3336 +}
  3337 +
3151 3338 /**
3152 3339 * free_area_init_nodes - Initialise all pg_data_t and zone data
3153 3340 * @max_zone_pfn: an array of max PFNs for each zone
... ... @@ -3177,20 +3364,38 @@
3177 3364 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
3178 3365 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
3179 3366 for (i = 1; i < MAX_NR_ZONES; i++) {
  3367 + if (i == ZONE_MOVABLE)
  3368 + continue;
3180 3369 arch_zone_lowest_possible_pfn[i] =
3181 3370 arch_zone_highest_possible_pfn[i-1];
3182 3371 arch_zone_highest_possible_pfn[i] =
3183 3372 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
3184 3373 }
  3374 + arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
  3375 + arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
3185 3376  
  3377 + /* Find the PFNs that ZONE_MOVABLE begins at in each node */
  3378 + memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
  3379 + find_zone_movable_pfns_for_nodes(zone_movable_pfn);
  3380 +
3186 3381 /* Print out the zone ranges */
3187 3382 printk("Zone PFN ranges:\n");
3188   - for (i = 0; i < MAX_NR_ZONES; i++)
  3383 + for (i = 0; i < MAX_NR_ZONES; i++) {
  3384 + if (i == ZONE_MOVABLE)
  3385 + continue;
3189 3386 printk(" %-8s %8lu -> %8lu\n",
3190 3387 zone_names[i],
3191 3388 arch_zone_lowest_possible_pfn[i],
3192 3389 arch_zone_highest_possible_pfn[i]);
  3390 + }
3193 3391  
  3392 + /* Print out the PFNs ZONE_MOVABLE begins at in each node */
  3393 + printk("Movable zone start PFN for each node\n");
  3394 + for (i = 0; i < MAX_NUMNODES; i++) {
  3395 + if (zone_movable_pfn[i])
  3396 + printk(" Node %d: %lu\n", i, zone_movable_pfn[i]);
  3397 + }
  3398 +
3194 3399 /* Print out the early_node_map[] */
3195 3400 printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
3196 3401 for (i = 0; i < nr_nodemap_entries; i++)
... ... @@ -3205,6 +3410,25 @@
3205 3410 free_area_init_node(nid, pgdat, NULL,
3206 3411 find_min_pfn_for_node(nid), NULL);
3207 3412 }
  3413 +}
  3414 +
  3415 +/*
  3416 + * kernelcore=size sets the amount of memory for use by allocations that
  3417 + * cannot be reclaimed or migrated.
  3418 + */
  3419 +int __init cmdline_parse_kernelcore(char *p)
  3420 +{
  3421 + unsigned long long coremem;
  3422 + if (!p)
  3423 + return -EINVAL;
  3424 +
  3425 + coremem = memparse(p, &p);
  3426 + required_kernelcore = coremem >> PAGE_SHIFT;
  3427 +
  3428 + /* Paranoid check that UL is enough for required_kernelcore */
  3429 + WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
  3430 +
  3431 + return 0;
3208 3432 }
3209 3433 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
3210 3434  
... ... @@ -472,7 +472,7 @@
472 472 #endif
473 473  
474 474 #define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
475   - TEXT_FOR_HIGHMEM(xx)
  475 + TEXT_FOR_HIGHMEM(xx) xx "_movable",
476 476  
477 477 static const char * const vmstat_text[] = {
478 478 /* Zoned VM counters */