Commit 68ad8df42e12037c3894c9706ab428bf5cd6426b

Authored by Mel Gorman
Committed by Linus Torvalds
Parent commit: 2dbb51c49f

mm: print out the zonelists on request for manual verification

This patch prints out the zonelists during boot for manual verification by the
user if mminit_loglevel is set to MMINIT_VERIFY or higher.
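
For example, on a hypothetical single-node x86_64 box with the DMA, DMA32 and
Normal zones populated, booting with the mminit_loglevel=1 early parameter
(MMINIT_VERIFY is 1 in the enum added to mm/internal.h) should emit KERN_DEBUG
lines roughly of the form:

    mminit::zonelist general 0:Normal = 0:Normal 0:DMA32 0:DMA
    mminit::zonelist general 0:DMA32 = 0:DMA32 0:DMA
    mminit::zonelist general 0:DMA = 0:DMA

The exact zones and their order depend on the configuration; the output is only
reported, nothing is checked automatically.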

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 3 changed files with 51 additions and 0 deletions

mm/internal.h (added lines are marked with "+"):

/* internal.h: mm/ internal definitions
 *
 * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#ifndef __MM_INTERNAL_H
#define __MM_INTERNAL_H

#include <linux/mm.h>

static inline void set_page_count(struct page *page, int v)
{
	atomic_set(&page->_count, v);
}

/*
 * Turn a non-refcounted page (->_count == 0) into refcounted with
 * a count of one.
 */
static inline void set_page_refcounted(struct page *page)
{
	VM_BUG_ON(PageTail(page));
	VM_BUG_ON(atomic_read(&page->_count));
	set_page_count(page, 1);
}

static inline void __put_page(struct page *page)
{
	atomic_dec(&page->_count);
}

extern void __free_pages_bootmem(struct page *page, unsigned int order);

/*
 * function for dealing with page's order in buddy system.
 * zone->lock is already acquired when we use these.
 * So, we don't need atomic page->flags operations here.
 */
static inline unsigned long page_order(struct page *page)
{
	VM_BUG_ON(!PageBuddy(page));
	return page_private(page);
}

/*
 * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
 * so all functions starting at paging_init should be marked __init
 * in those cases. SPARSEMEM, however, allows for memory hotplug,
 * and alloc_bootmem_node is not used.
 */
#ifdef CONFIG_SPARSEMEM
#define __paginginit __meminit
#else
#define __paginginit __init
#endif

/* Memory initialisation debug and verification */
enum mminit_level {
	MMINIT_WARNING,
	MMINIT_VERIFY,
	MMINIT_TRACE
};

#ifdef CONFIG_DEBUG_MEMORY_INIT

extern int mminit_loglevel;

#define mminit_dprintk(level, prefix, fmt, arg...) \
do { \
	if (level < mminit_loglevel) { \
		printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \
		printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \
	} \
} while (0)

extern void mminit_verify_pageflags_layout(void);
extern void mminit_verify_page_links(struct page *page,
		enum zone_type zone, unsigned long nid, unsigned long pfn);
+extern void mminit_verify_zonelist(void);

#else

static inline void mminit_dprintk(enum mminit_level level,
				const char *prefix, const char *fmt, ...)
{
}

static inline void mminit_verify_pageflags_layout(void)
{
}

static inline void mminit_verify_page_links(struct page *page,
		enum zone_type zone, unsigned long nid, unsigned long pfn)
+{
+}
+
+static inline void mminit_verify_zonelist(void)
{
}
#endif /* CONFIG_DEBUG_MEMORY_INIT */

/* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */
#if defined(CONFIG_SPARSEMEM)
extern void mminit_validate_memmodel_limits(unsigned long *start_pfn,
				unsigned long *end_pfn);
#else
static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
				unsigned long *end_pfn)
{
}
#endif /* CONFIG_SPARSEMEM */

#endif
mm/mm_init.c (the entire mminit_verify_zonelist() function is new, marked with "+"):

/*
 * mm_init.c - Memory initialisation verification and debugging
 *
 * Copyright 2008 IBM Corporation, 2008
 * Author Mel Gorman <mel@csn.ul.ie>
 *
 */
#include <linux/kernel.h>
#include <linux/init.h>
#include "internal.h"

int __meminitdata mminit_loglevel;

+/* The zonelists are simply reported, validation is manual. */
+void mminit_verify_zonelist(void)
+{
+	int nid;
+
+	if (mminit_loglevel < MMINIT_VERIFY)
+		return;
+
+	for_each_online_node(nid) {
+		pg_data_t *pgdat = NODE_DATA(nid);
+		struct zone *zone;
+		struct zoneref *z;
+		struct zonelist *zonelist;
+		int i, listid, zoneid;
+
+		BUG_ON(MAX_ZONELISTS > 2);
+		for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {
+
+			/* Identify the zone and nodelist */
+			zoneid = i % MAX_NR_ZONES;
+			listid = i / MAX_NR_ZONES;
+			zonelist = &pgdat->node_zonelists[listid];
+			zone = &pgdat->node_zones[zoneid];
+			if (!populated_zone(zone))
+				continue;
+
+			/* Print information about the zonelist */
+			printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ",
+				listid > 0 ? "thisnode" : "general", nid,
+				zone->name);
+
+			/* Iterate the zonelist */
+			for_each_zone_zonelist(zone, z, zonelist, zoneid) {
+#ifdef CONFIG_NUMA
+				printk(KERN_CONT "%d:%s ",
+					zone->node, zone->name);
+#else
+				printk(KERN_CONT "0:%s ", zone->name);
+#endif /* CONFIG_NUMA */
+			}
+			printk(KERN_CONT "\n");
+		}
+	}
+}
+
void __init mminit_verify_pageflags_layout(void)
{
	int shift, width;
	unsigned long or_mask, add_mask;

	shift = 8 * sizeof(unsigned long);
	width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH;
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
		"Section %d Node %d Zone %d Flags %d\n",
		SECTIONS_WIDTH,
		NODES_WIDTH,
		ZONES_WIDTH,
		NR_PAGEFLAGS);
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
		"Section %d Node %d Zone %d\n",
#ifdef SECTIONS_SHIFT
		SECTIONS_SHIFT,
#else
		0,
#endif
		NODES_SHIFT,
		ZONES_SHIFT);
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets",
		"Section %lu Node %lu Zone %lu\n",
		(unsigned long)SECTIONS_PGSHIFT,
		(unsigned long)NODES_PGSHIFT,
		(unsigned long)ZONES_PGSHIFT);
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid",
		"Zone ID: %lu -> %lu\n",
		(unsigned long)ZONEID_PGOFF,
		(unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT));
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",
		"location: %d -> %d unused %d -> %d flags %d -> %d\n",
		shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0);
#ifdef NODE_NOT_IN_PAGE_FLAGS
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
		"Node not in page flags");
#endif

	if (SECTIONS_WIDTH) {
		shift -= SECTIONS_WIDTH;
		BUG_ON(shift != SECTIONS_PGSHIFT);
	}
	if (NODES_WIDTH) {
		shift -= NODES_WIDTH;
		BUG_ON(shift != NODES_PGSHIFT);
	}
	if (ZONES_WIDTH) {
		shift -= ZONES_WIDTH;
		BUG_ON(shift != ZONES_PGSHIFT);
	}

	/* Check for bitmask overlaps */
	or_mask = (ZONES_MASK << ZONES_PGSHIFT) |
			(NODES_MASK << NODES_PGSHIFT) |
			(SECTIONS_MASK << SECTIONS_PGSHIFT);
	add_mask = (ZONES_MASK << ZONES_PGSHIFT) +
			(NODES_MASK << NODES_PGSHIFT) +
			(SECTIONS_MASK << SECTIONS_PGSHIFT);
	BUG_ON(or_mask != add_mask);
}

void __meminit mminit_verify_page_links(struct page *page, enum zone_type zone,
			unsigned long nid, unsigned long pfn)
{
	BUG_ON(page_to_nid(page) != nid);
	BUG_ON(page_zonenum(page) != zone);
	BUG_ON(page_to_pfn(page) != pfn);
}

static __init int set_mminit_loglevel(char *str)
{
	get_option(&str, &mminit_loglevel);
	return 0;
}
early_param("mminit_loglevel", set_mminit_loglevel);
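Because the mminit_dprintk() macro in mm/internal.h above prints only when
level < mminit_loglevel (with MMINIT_WARNING = 0, MMINIT_VERIFY = 1 and
MMINIT_TRACE = 2 in the enum), the thresholds work out roughly as follows,
assuming the value is passed via the mminit_loglevel= boot parameter parsed by
set_mminit_loglevel():

    mminit_loglevel=1   prints the zonelist report above
                        (mminit_loglevel >= MMINIT_VERIFY)
    mminit_loglevel=3   additionally lets MMINIT_TRACE messages, such as the
                        pageflags_layout output, through mminit_dprintk()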
1 /* 1 /*
2 * linux/mm/page_alloc.c 2 * linux/mm/page_alloc.c
3 * 3 *
4 * Manages the free list, the system allocates free pages here. 4 * Manages the free list, the system allocates free pages here.
5 * Note that kmalloc() lives in slab.c 5 * Note that kmalloc() lives in slab.c
6 * 6 *
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
8 * Swap reorganised 29.12.95, Stephen Tweedie 8 * Swap reorganised 29.12.95, Stephen Tweedie
9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */ 15 */
16 16
17 #include <linux/stddef.h> 17 #include <linux/stddef.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/interrupt.h> 20 #include <linux/interrupt.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/jiffies.h> 22 #include <linux/jiffies.h>
23 #include <linux/bootmem.h> 23 #include <linux/bootmem.h>
24 #include <linux/compiler.h> 24 #include <linux/compiler.h>
25 #include <linux/kernel.h> 25 #include <linux/kernel.h>
26 #include <linux/module.h> 26 #include <linux/module.h>
27 #include <linux/suspend.h> 27 #include <linux/suspend.h>
28 #include <linux/pagevec.h> 28 #include <linux/pagevec.h>
29 #include <linux/blkdev.h> 29 #include <linux/blkdev.h>
30 #include <linux/slab.h> 30 #include <linux/slab.h>
31 #include <linux/oom.h> 31 #include <linux/oom.h>
32 #include <linux/notifier.h> 32 #include <linux/notifier.h>
33 #include <linux/topology.h> 33 #include <linux/topology.h>
34 #include <linux/sysctl.h> 34 #include <linux/sysctl.h>
35 #include <linux/cpu.h> 35 #include <linux/cpu.h>
36 #include <linux/cpuset.h> 36 #include <linux/cpuset.h>
37 #include <linux/memory_hotplug.h> 37 #include <linux/memory_hotplug.h>
38 #include <linux/nodemask.h> 38 #include <linux/nodemask.h>
39 #include <linux/vmalloc.h> 39 #include <linux/vmalloc.h>
40 #include <linux/mempolicy.h> 40 #include <linux/mempolicy.h>
41 #include <linux/stop_machine.h> 41 #include <linux/stop_machine.h>
42 #include <linux/sort.h> 42 #include <linux/sort.h>
43 #include <linux/pfn.h> 43 #include <linux/pfn.h>
44 #include <linux/backing-dev.h> 44 #include <linux/backing-dev.h>
45 #include <linux/fault-inject.h> 45 #include <linux/fault-inject.h>
46 #include <linux/page-isolation.h> 46 #include <linux/page-isolation.h>
47 #include <linux/memcontrol.h> 47 #include <linux/memcontrol.h>
48 #include <linux/debugobjects.h> 48 #include <linux/debugobjects.h>
49 49
50 #include <asm/tlbflush.h> 50 #include <asm/tlbflush.h>
51 #include <asm/div64.h> 51 #include <asm/div64.h>
52 #include "internal.h" 52 #include "internal.h"
53 53
54 /* 54 /*
55 * Array of node states. 55 * Array of node states.
56 */ 56 */
57 nodemask_t node_states[NR_NODE_STATES] __read_mostly = { 57 nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
58 [N_POSSIBLE] = NODE_MASK_ALL, 58 [N_POSSIBLE] = NODE_MASK_ALL,
59 [N_ONLINE] = { { [0] = 1UL } }, 59 [N_ONLINE] = { { [0] = 1UL } },
60 #ifndef CONFIG_NUMA 60 #ifndef CONFIG_NUMA
61 [N_NORMAL_MEMORY] = { { [0] = 1UL } }, 61 [N_NORMAL_MEMORY] = { { [0] = 1UL } },
62 #ifdef CONFIG_HIGHMEM 62 #ifdef CONFIG_HIGHMEM
63 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 63 [N_HIGH_MEMORY] = { { [0] = 1UL } },
64 #endif 64 #endif
65 [N_CPU] = { { [0] = 1UL } }, 65 [N_CPU] = { { [0] = 1UL } },
66 #endif /* NUMA */ 66 #endif /* NUMA */
67 }; 67 };
68 EXPORT_SYMBOL(node_states); 68 EXPORT_SYMBOL(node_states);
69 69
70 unsigned long totalram_pages __read_mostly; 70 unsigned long totalram_pages __read_mostly;
71 unsigned long totalreserve_pages __read_mostly; 71 unsigned long totalreserve_pages __read_mostly;
72 long nr_swap_pages; 72 long nr_swap_pages;
73 int percpu_pagelist_fraction; 73 int percpu_pagelist_fraction;
74 74
75 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 75 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
76 int pageblock_order __read_mostly; 76 int pageblock_order __read_mostly;
77 #endif 77 #endif
78 78
79 static void __free_pages_ok(struct page *page, unsigned int order); 79 static void __free_pages_ok(struct page *page, unsigned int order);
80 80
81 /* 81 /*
82 * results with 256, 32 in the lowmem_reserve sysctl: 82 * results with 256, 32 in the lowmem_reserve sysctl:
83 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 83 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
84 * 1G machine -> (16M dma, 784M normal, 224M high) 84 * 1G machine -> (16M dma, 784M normal, 224M high)
85 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 85 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
86 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 86 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
87 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 87 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
88 * 88 *
89 * TBD: should special case ZONE_DMA32 machines here - in those we normally 89 * TBD: should special case ZONE_DMA32 machines here - in those we normally
90 * don't need any ZONE_NORMAL reservation 90 * don't need any ZONE_NORMAL reservation
91 */ 91 */
92 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 92 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
93 #ifdef CONFIG_ZONE_DMA 93 #ifdef CONFIG_ZONE_DMA
94 256, 94 256,
95 #endif 95 #endif
96 #ifdef CONFIG_ZONE_DMA32 96 #ifdef CONFIG_ZONE_DMA32
97 256, 97 256,
98 #endif 98 #endif
99 #ifdef CONFIG_HIGHMEM 99 #ifdef CONFIG_HIGHMEM
100 32, 100 32,
101 #endif 101 #endif
102 32, 102 32,
103 }; 103 };
104 104
105 EXPORT_SYMBOL(totalram_pages); 105 EXPORT_SYMBOL(totalram_pages);
106 106
107 static char * const zone_names[MAX_NR_ZONES] = { 107 static char * const zone_names[MAX_NR_ZONES] = {
108 #ifdef CONFIG_ZONE_DMA 108 #ifdef CONFIG_ZONE_DMA
109 "DMA", 109 "DMA",
110 #endif 110 #endif
111 #ifdef CONFIG_ZONE_DMA32 111 #ifdef CONFIG_ZONE_DMA32
112 "DMA32", 112 "DMA32",
113 #endif 113 #endif
114 "Normal", 114 "Normal",
115 #ifdef CONFIG_HIGHMEM 115 #ifdef CONFIG_HIGHMEM
116 "HighMem", 116 "HighMem",
117 #endif 117 #endif
118 "Movable", 118 "Movable",
119 }; 119 };
120 120
121 int min_free_kbytes = 1024; 121 int min_free_kbytes = 1024;
122 122
123 unsigned long __meminitdata nr_kernel_pages; 123 unsigned long __meminitdata nr_kernel_pages;
124 unsigned long __meminitdata nr_all_pages; 124 unsigned long __meminitdata nr_all_pages;
125 static unsigned long __meminitdata dma_reserve; 125 static unsigned long __meminitdata dma_reserve;
126 126
127 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 127 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
128 /* 128 /*
129 * MAX_ACTIVE_REGIONS determines the maximum number of distinct 129 * MAX_ACTIVE_REGIONS determines the maximum number of distinct
130 * ranges of memory (RAM) that may be registered with add_active_range(). 130 * ranges of memory (RAM) that may be registered with add_active_range().
131 * Ranges passed to add_active_range() will be merged if possible 131 * Ranges passed to add_active_range() will be merged if possible
132 * so the number of times add_active_range() can be called is 132 * so the number of times add_active_range() can be called is
133 * related to the number of nodes and the number of holes 133 * related to the number of nodes and the number of holes
134 */ 134 */
135 #ifdef CONFIG_MAX_ACTIVE_REGIONS 135 #ifdef CONFIG_MAX_ACTIVE_REGIONS
136 /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */ 136 /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
137 #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS 137 #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
138 #else 138 #else
139 #if MAX_NUMNODES >= 32 139 #if MAX_NUMNODES >= 32
140 /* If there can be many nodes, allow up to 50 holes per node */ 140 /* If there can be many nodes, allow up to 50 holes per node */
141 #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50) 141 #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
142 #else 142 #else
143 /* By default, allow up to 256 distinct regions */ 143 /* By default, allow up to 256 distinct regions */
144 #define MAX_ACTIVE_REGIONS 256 144 #define MAX_ACTIVE_REGIONS 256
145 #endif 145 #endif
146 #endif 146 #endif
147 147
148 static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS]; 148 static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
149 static int __meminitdata nr_nodemap_entries; 149 static int __meminitdata nr_nodemap_entries;
150 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 150 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
151 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 151 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
152 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE 152 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
153 static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; 153 static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
154 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; 154 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
155 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ 155 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
156 unsigned long __initdata required_kernelcore; 156 unsigned long __initdata required_kernelcore;
157 static unsigned long __initdata required_movablecore; 157 static unsigned long __initdata required_movablecore;
158 unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 158 unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
159 159
160 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 160 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
161 int movable_zone; 161 int movable_zone;
162 EXPORT_SYMBOL(movable_zone); 162 EXPORT_SYMBOL(movable_zone);
163 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 163 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
164 164
165 #if MAX_NUMNODES > 1 165 #if MAX_NUMNODES > 1
166 int nr_node_ids __read_mostly = MAX_NUMNODES; 166 int nr_node_ids __read_mostly = MAX_NUMNODES;
167 EXPORT_SYMBOL(nr_node_ids); 167 EXPORT_SYMBOL(nr_node_ids);
168 #endif 168 #endif
169 169
170 int page_group_by_mobility_disabled __read_mostly; 170 int page_group_by_mobility_disabled __read_mostly;
171 171
172 static void set_pageblock_migratetype(struct page *page, int migratetype) 172 static void set_pageblock_migratetype(struct page *page, int migratetype)
173 { 173 {
174 set_pageblock_flags_group(page, (unsigned long)migratetype, 174 set_pageblock_flags_group(page, (unsigned long)migratetype,
175 PB_migrate, PB_migrate_end); 175 PB_migrate, PB_migrate_end);
176 } 176 }
177 177
178 #ifdef CONFIG_DEBUG_VM 178 #ifdef CONFIG_DEBUG_VM
179 static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 179 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
180 { 180 {
181 int ret = 0; 181 int ret = 0;
182 unsigned seq; 182 unsigned seq;
183 unsigned long pfn = page_to_pfn(page); 183 unsigned long pfn = page_to_pfn(page);
184 184
185 do { 185 do {
186 seq = zone_span_seqbegin(zone); 186 seq = zone_span_seqbegin(zone);
187 if (pfn >= zone->zone_start_pfn + zone->spanned_pages) 187 if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
188 ret = 1; 188 ret = 1;
189 else if (pfn < zone->zone_start_pfn) 189 else if (pfn < zone->zone_start_pfn)
190 ret = 1; 190 ret = 1;
191 } while (zone_span_seqretry(zone, seq)); 191 } while (zone_span_seqretry(zone, seq));
192 192
193 return ret; 193 return ret;
194 } 194 }
195 195
196 static int page_is_consistent(struct zone *zone, struct page *page) 196 static int page_is_consistent(struct zone *zone, struct page *page)
197 { 197 {
198 if (!pfn_valid_within(page_to_pfn(page))) 198 if (!pfn_valid_within(page_to_pfn(page)))
199 return 0; 199 return 0;
200 if (zone != page_zone(page)) 200 if (zone != page_zone(page))
201 return 0; 201 return 0;
202 202
203 return 1; 203 return 1;
204 } 204 }
205 /* 205 /*
206 * Temporary debugging check for pages not lying within a given zone. 206 * Temporary debugging check for pages not lying within a given zone.
207 */ 207 */
208 static int bad_range(struct zone *zone, struct page *page) 208 static int bad_range(struct zone *zone, struct page *page)
209 { 209 {
210 if (page_outside_zone_boundaries(zone, page)) 210 if (page_outside_zone_boundaries(zone, page))
211 return 1; 211 return 1;
212 if (!page_is_consistent(zone, page)) 212 if (!page_is_consistent(zone, page))
213 return 1; 213 return 1;
214 214
215 return 0; 215 return 0;
216 } 216 }
217 #else 217 #else
218 static inline int bad_range(struct zone *zone, struct page *page) 218 static inline int bad_range(struct zone *zone, struct page *page)
219 { 219 {
220 return 0; 220 return 0;
221 } 221 }
222 #endif 222 #endif
223 223
224 static void bad_page(struct page *page) 224 static void bad_page(struct page *page)
225 { 225 {
226 void *pc = page_get_page_cgroup(page); 226 void *pc = page_get_page_cgroup(page);
227 227
228 printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG 228 printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG
229 "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", 229 "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
230 current->comm, page, (int)(2*sizeof(unsigned long)), 230 current->comm, page, (int)(2*sizeof(unsigned long)),
231 (unsigned long)page->flags, page->mapping, 231 (unsigned long)page->flags, page->mapping,
232 page_mapcount(page), page_count(page)); 232 page_mapcount(page), page_count(page));
233 if (pc) { 233 if (pc) {
234 printk(KERN_EMERG "cgroup:%p\n", pc); 234 printk(KERN_EMERG "cgroup:%p\n", pc);
235 page_reset_bad_cgroup(page); 235 page_reset_bad_cgroup(page);
236 } 236 }
237 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" 237 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
238 KERN_EMERG "Backtrace:\n"); 238 KERN_EMERG "Backtrace:\n");
239 dump_stack(); 239 dump_stack();
240 page->flags &= ~PAGE_FLAGS_CLEAR_WHEN_BAD; 240 page->flags &= ~PAGE_FLAGS_CLEAR_WHEN_BAD;
241 set_page_count(page, 0); 241 set_page_count(page, 0);
242 reset_page_mapcount(page); 242 reset_page_mapcount(page);
243 page->mapping = NULL; 243 page->mapping = NULL;
244 add_taint(TAINT_BAD_PAGE); 244 add_taint(TAINT_BAD_PAGE);
245 } 245 }
246 246
247 /* 247 /*
248 * Higher-order pages are called "compound pages". They are structured thusly: 248 * Higher-order pages are called "compound pages". They are structured thusly:
249 * 249 *
250 * The first PAGE_SIZE page is called the "head page". 250 * The first PAGE_SIZE page is called the "head page".
251 * 251 *
252 * The remaining PAGE_SIZE pages are called "tail pages". 252 * The remaining PAGE_SIZE pages are called "tail pages".
253 * 253 *
254 * All pages have PG_compound set. All pages have their ->private pointing at 254 * All pages have PG_compound set. All pages have their ->private pointing at
255 * the head page (even the head page has this). 255 * the head page (even the head page has this).
256 * 256 *
257 * The first tail page's ->lru.next holds the address of the compound page's 257 * The first tail page's ->lru.next holds the address of the compound page's
258 * put_page() function. Its ->lru.prev holds the order of allocation. 258 * put_page() function. Its ->lru.prev holds the order of allocation.
259 * This usage means that zero-order pages may not be compound. 259 * This usage means that zero-order pages may not be compound.
260 */ 260 */
261 261
262 static void free_compound_page(struct page *page) 262 static void free_compound_page(struct page *page)
263 { 263 {
264 __free_pages_ok(page, compound_order(page)); 264 __free_pages_ok(page, compound_order(page));
265 } 265 }
266 266
267 static void prep_compound_page(struct page *page, unsigned long order) 267 static void prep_compound_page(struct page *page, unsigned long order)
268 { 268 {
269 int i; 269 int i;
270 int nr_pages = 1 << order; 270 int nr_pages = 1 << order;
271 271
272 set_compound_page_dtor(page, free_compound_page); 272 set_compound_page_dtor(page, free_compound_page);
273 set_compound_order(page, order); 273 set_compound_order(page, order);
274 __SetPageHead(page); 274 __SetPageHead(page);
275 for (i = 1; i < nr_pages; i++) { 275 for (i = 1; i < nr_pages; i++) {
276 struct page *p = page + i; 276 struct page *p = page + i;
277 277
278 __SetPageTail(p); 278 __SetPageTail(p);
279 p->first_page = page; 279 p->first_page = page;
280 } 280 }
281 } 281 }
282 282
283 static void destroy_compound_page(struct page *page, unsigned long order) 283 static void destroy_compound_page(struct page *page, unsigned long order)
284 { 284 {
285 int i; 285 int i;
286 int nr_pages = 1 << order; 286 int nr_pages = 1 << order;
287 287
288 if (unlikely(compound_order(page) != order)) 288 if (unlikely(compound_order(page) != order))
289 bad_page(page); 289 bad_page(page);
290 290
291 if (unlikely(!PageHead(page))) 291 if (unlikely(!PageHead(page)))
292 bad_page(page); 292 bad_page(page);
293 __ClearPageHead(page); 293 __ClearPageHead(page);
294 for (i = 1; i < nr_pages; i++) { 294 for (i = 1; i < nr_pages; i++) {
295 struct page *p = page + i; 295 struct page *p = page + i;
296 296
297 if (unlikely(!PageTail(p) | 297 if (unlikely(!PageTail(p) |
298 (p->first_page != page))) 298 (p->first_page != page)))
299 bad_page(page); 299 bad_page(page);
300 __ClearPageTail(p); 300 __ClearPageTail(p);
301 } 301 }
302 } 302 }
303 303
304 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 304 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
305 { 305 {
306 int i; 306 int i;
307 307
308 /* 308 /*
309 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO 309 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
310 * and __GFP_HIGHMEM from hard or soft interrupt context. 310 * and __GFP_HIGHMEM from hard or soft interrupt context.
311 */ 311 */
312 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); 312 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
313 for (i = 0; i < (1 << order); i++) 313 for (i = 0; i < (1 << order); i++)
314 clear_highpage(page + i); 314 clear_highpage(page + i);
315 } 315 }
316 316
317 static inline void set_page_order(struct page *page, int order) 317 static inline void set_page_order(struct page *page, int order)
318 { 318 {
319 set_page_private(page, order); 319 set_page_private(page, order);
320 __SetPageBuddy(page); 320 __SetPageBuddy(page);
321 } 321 }
322 322
323 static inline void rmv_page_order(struct page *page) 323 static inline void rmv_page_order(struct page *page)
324 { 324 {
325 __ClearPageBuddy(page); 325 __ClearPageBuddy(page);
326 set_page_private(page, 0); 326 set_page_private(page, 0);
327 } 327 }
328 328
329 /* 329 /*
330 * Locate the struct page for both the matching buddy in our 330 * Locate the struct page for both the matching buddy in our
331 * pair (buddy1) and the combined O(n+1) page they form (page). 331 * pair (buddy1) and the combined O(n+1) page they form (page).
332 * 332 *
333 * 1) Any buddy B1 will have an order O twin B2 which satisfies 333 * 1) Any buddy B1 will have an order O twin B2 which satisfies
334 * the following equation: 334 * the following equation:
335 * B2 = B1 ^ (1 << O) 335 * B2 = B1 ^ (1 << O)
336 * For example, if the starting buddy (buddy2) is #8 its order 336 * For example, if the starting buddy (buddy2) is #8 its order
337 * 1 buddy is #10: 337 * 1 buddy is #10:
338 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 338 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
339 * 339 *
340 * 2) Any buddy B will have an order O+1 parent P which 340 * 2) Any buddy B will have an order O+1 parent P which
341 * satisfies the following equation: 341 * satisfies the following equation:
342 * P = B & ~(1 << O) 342 * P = B & ~(1 << O)
343 * 343 *
344 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 344 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
345 */ 345 */
346 static inline struct page * 346 static inline struct page *
347 __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) 347 __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
348 { 348 {
349 unsigned long buddy_idx = page_idx ^ (1 << order); 349 unsigned long buddy_idx = page_idx ^ (1 << order);
350 350
351 return page + (buddy_idx - page_idx); 351 return page + (buddy_idx - page_idx);
352 } 352 }
353 353
354 static inline unsigned long 354 static inline unsigned long
355 __find_combined_index(unsigned long page_idx, unsigned int order) 355 __find_combined_index(unsigned long page_idx, unsigned int order)
356 { 356 {
357 return (page_idx & ~(1 << order)); 357 return (page_idx & ~(1 << order));
358 } 358 }
359 359
360 /* 360 /*
361 * This function checks whether a page is free && is the buddy 361 * This function checks whether a page is free && is the buddy
362 * we can do coalesce a page and its buddy if 362 * we can do coalesce a page and its buddy if
363 * (a) the buddy is not in a hole && 363 * (a) the buddy is not in a hole &&
364 * (b) the buddy is in the buddy system && 364 * (b) the buddy is in the buddy system &&
365 * (c) a page and its buddy have the same order && 365 * (c) a page and its buddy have the same order &&
366 * (d) a page and its buddy are in the same zone. 366 * (d) a page and its buddy are in the same zone.
367 * 367 *
368 * For recording whether a page is in the buddy system, we use PG_buddy. 368 * For recording whether a page is in the buddy system, we use PG_buddy.
369 * Setting, clearing, and testing PG_buddy is serialized by zone->lock. 369 * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
370 * 370 *
371 * For recording page's order, we use page_private(page). 371 * For recording page's order, we use page_private(page).
372 */ 372 */
373 static inline int page_is_buddy(struct page *page, struct page *buddy, 373 static inline int page_is_buddy(struct page *page, struct page *buddy,
374 int order) 374 int order)
375 { 375 {
376 if (!pfn_valid_within(page_to_pfn(buddy))) 376 if (!pfn_valid_within(page_to_pfn(buddy)))
377 return 0; 377 return 0;
378 378
379 if (page_zone_id(page) != page_zone_id(buddy)) 379 if (page_zone_id(page) != page_zone_id(buddy))
380 return 0; 380 return 0;
381 381
382 if (PageBuddy(buddy) && page_order(buddy) == order) { 382 if (PageBuddy(buddy) && page_order(buddy) == order) {
383 BUG_ON(page_count(buddy) != 0); 383 BUG_ON(page_count(buddy) != 0);
384 return 1; 384 return 1;
385 } 385 }
386 return 0; 386 return 0;
387 } 387 }
388 388
389 /* 389 /*
390 * Freeing function for a buddy system allocator. 390 * Freeing function for a buddy system allocator.
391 * 391 *
392 * The concept of a buddy system is to maintain direct-mapped table 392 * The concept of a buddy system is to maintain direct-mapped table
393 * (containing bit values) for memory blocks of various "orders". 393 * (containing bit values) for memory blocks of various "orders".
394 * The bottom level table contains the map for the smallest allocatable 394 * The bottom level table contains the map for the smallest allocatable
395 * units of memory (here, pages), and each level above it describes 395 * units of memory (here, pages), and each level above it describes
396 * pairs of units from the levels below, hence, "buddies". 396 * pairs of units from the levels below, hence, "buddies".
397 * At a high level, all that happens here is marking the table entry 397 * At a high level, all that happens here is marking the table entry
398 * at the bottom level available, and propagating the changes upward 398 * at the bottom level available, and propagating the changes upward
399 * as necessary, plus some accounting needed to play nicely with other 399 * as necessary, plus some accounting needed to play nicely with other
400 * parts of the VM system. 400 * parts of the VM system.
401 * At each level, we keep a list of pages, which are heads of continuous 401 * At each level, we keep a list of pages, which are heads of continuous
402 * free pages of length of (1 << order) and marked with PG_buddy. Page's 402 * free pages of length of (1 << order) and marked with PG_buddy. Page's
403 * order is recorded in page_private(page) field. 403 * order is recorded in page_private(page) field.
404 * So when we are allocating or freeing one, we can derive the state of the 404 * So when we are allocating or freeing one, we can derive the state of the
405 * other. That is, if we allocate a small block, and both were 405 * other. That is, if we allocate a small block, and both were
406 * free, the remainder of the region must be split into blocks. 406 * free, the remainder of the region must be split into blocks.
407 * If a block is freed, and its buddy is also free, then this 407 * If a block is freed, and its buddy is also free, then this
408 * triggers coalescing into a block of larger size. 408 * triggers coalescing into a block of larger size.
409 * 409 *
410 * -- wli 410 * -- wli
411 */ 411 */
412 412
413 static inline void __free_one_page(struct page *page, 413 static inline void __free_one_page(struct page *page,
414 struct zone *zone, unsigned int order) 414 struct zone *zone, unsigned int order)
415 { 415 {
416 unsigned long page_idx; 416 unsigned long page_idx;
417 int order_size = 1 << order; 417 int order_size = 1 << order;
418 int migratetype = get_pageblock_migratetype(page); 418 int migratetype = get_pageblock_migratetype(page);
419 419
420 if (unlikely(PageCompound(page))) 420 if (unlikely(PageCompound(page)))
421 destroy_compound_page(page, order); 421 destroy_compound_page(page, order);
422 422
423 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 423 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
424 424
425 VM_BUG_ON(page_idx & (order_size - 1)); 425 VM_BUG_ON(page_idx & (order_size - 1));
426 VM_BUG_ON(bad_range(zone, page)); 426 VM_BUG_ON(bad_range(zone, page));
427 427
428 __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); 428 __mod_zone_page_state(zone, NR_FREE_PAGES, order_size);
429 while (order < MAX_ORDER-1) { 429 while (order < MAX_ORDER-1) {
430 unsigned long combined_idx; 430 unsigned long combined_idx;
431 struct page *buddy; 431 struct page *buddy;
432 432
433 buddy = __page_find_buddy(page, page_idx, order); 433 buddy = __page_find_buddy(page, page_idx, order);
434 if (!page_is_buddy(page, buddy, order)) 434 if (!page_is_buddy(page, buddy, order))
435 break; /* Move the buddy up one level. */ 435 break; /* Move the buddy up one level. */
436 436
437 list_del(&buddy->lru); 437 list_del(&buddy->lru);
438 zone->free_area[order].nr_free--; 438 zone->free_area[order].nr_free--;
439 rmv_page_order(buddy); 439 rmv_page_order(buddy);
440 combined_idx = __find_combined_index(page_idx, order); 440 combined_idx = __find_combined_index(page_idx, order);
441 page = page + (combined_idx - page_idx); 441 page = page + (combined_idx - page_idx);
442 page_idx = combined_idx; 442 page_idx = combined_idx;
443 order++; 443 order++;
444 } 444 }
445 set_page_order(page, order); 445 set_page_order(page, order);
446 list_add(&page->lru, 446 list_add(&page->lru,
447 &zone->free_area[order].free_list[migratetype]); 447 &zone->free_area[order].free_list[migratetype]);
448 zone->free_area[order].nr_free++; 448 zone->free_area[order].nr_free++;
449 } 449 }
450 450
451 static inline int free_pages_check(struct page *page) 451 static inline int free_pages_check(struct page *page)
452 { 452 {
453 if (unlikely(page_mapcount(page) | 453 if (unlikely(page_mapcount(page) |
454 (page->mapping != NULL) | 454 (page->mapping != NULL) |
455 (page_get_page_cgroup(page) != NULL) | 455 (page_get_page_cgroup(page) != NULL) |
456 (page_count(page) != 0) | 456 (page_count(page) != 0) |
457 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) 457 (page->flags & PAGE_FLAGS_CHECK_AT_FREE)))
458 bad_page(page); 458 bad_page(page);
459 if (PageDirty(page)) 459 if (PageDirty(page))
460 __ClearPageDirty(page); 460 __ClearPageDirty(page);
461 /* 461 /*
462 * For now, we report if PG_reserved was found set, but do not 462 * For now, we report if PG_reserved was found set, but do not
463 * clear it, and do not free the page. But we shall soon need 463 * clear it, and do not free the page. But we shall soon need
464 * to do more, for when the ZERO_PAGE count wraps negative. 464 * to do more, for when the ZERO_PAGE count wraps negative.
465 */ 465 */
466 return PageReserved(page); 466 return PageReserved(page);
467 } 467 }
468 468
469 /* 469 /*
470 * Frees a list of pages. 470 * Frees a list of pages.
471 * Assumes all pages on list are in same zone, and of same order. 471 * Assumes all pages on list are in same zone, and of same order.
472 * count is the number of pages to free. 472 * count is the number of pages to free.
473 * 473 *
474 * If the zone was previously in an "all pages pinned" state then look to 474 * If the zone was previously in an "all pages pinned" state then look to
475 * see if this freeing clears that state. 475 * see if this freeing clears that state.
476 * 476 *
477 * And clear the zone's pages_scanned counter, to hold off the "all pages are 477 * And clear the zone's pages_scanned counter, to hold off the "all pages are
478 * pinned" detection logic. 478 * pinned" detection logic.
479 */ 479 */
480 static void free_pages_bulk(struct zone *zone, int count, 480 static void free_pages_bulk(struct zone *zone, int count,
481 struct list_head *list, int order) 481 struct list_head *list, int order)
482 { 482 {
483 spin_lock(&zone->lock); 483 spin_lock(&zone->lock);
484 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 484 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
485 zone->pages_scanned = 0; 485 zone->pages_scanned = 0;
486 while (count--) { 486 while (count--) {
487 struct page *page; 487 struct page *page;
488 488
489 VM_BUG_ON(list_empty(list)); 489 VM_BUG_ON(list_empty(list));
490 page = list_entry(list->prev, struct page, lru); 490 page = list_entry(list->prev, struct page, lru);
491 /* have to delete it as __free_one_page list manipulates */ 491 /* have to delete it as __free_one_page list manipulates */
492 list_del(&page->lru); 492 list_del(&page->lru);
493 __free_one_page(page, zone, order); 493 __free_one_page(page, zone, order);
494 } 494 }
495 spin_unlock(&zone->lock); 495 spin_unlock(&zone->lock);
496 } 496 }
497 497
498 static void free_one_page(struct zone *zone, struct page *page, int order) 498 static void free_one_page(struct zone *zone, struct page *page, int order)
499 { 499 {
500 spin_lock(&zone->lock); 500 spin_lock(&zone->lock);
501 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 501 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
502 zone->pages_scanned = 0; 502 zone->pages_scanned = 0;
503 __free_one_page(page, zone, order); 503 __free_one_page(page, zone, order);
504 spin_unlock(&zone->lock); 504 spin_unlock(&zone->lock);
505 } 505 }
506 506
507 static void __free_pages_ok(struct page *page, unsigned int order) 507 static void __free_pages_ok(struct page *page, unsigned int order)
508 { 508 {
509 unsigned long flags; 509 unsigned long flags;
510 int i; 510 int i;
511 int reserved = 0; 511 int reserved = 0;
512 512
513 for (i = 0 ; i < (1 << order) ; ++i) 513 for (i = 0 ; i < (1 << order) ; ++i)
514 reserved += free_pages_check(page + i); 514 reserved += free_pages_check(page + i);
515 if (reserved) 515 if (reserved)
516 return; 516 return;
517 517
518 if (!PageHighMem(page)) { 518 if (!PageHighMem(page)) {
519 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); 519 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
520 debug_check_no_obj_freed(page_address(page), 520 debug_check_no_obj_freed(page_address(page),
521 PAGE_SIZE << order); 521 PAGE_SIZE << order);
522 } 522 }
523 arch_free_page(page, order); 523 arch_free_page(page, order);
524 kernel_map_pages(page, 1 << order, 0); 524 kernel_map_pages(page, 1 << order, 0);
525 525
526 local_irq_save(flags); 526 local_irq_save(flags);
527 __count_vm_events(PGFREE, 1 << order); 527 __count_vm_events(PGFREE, 1 << order);
528 free_one_page(page_zone(page), page, order); 528 free_one_page(page_zone(page), page, order);
529 local_irq_restore(flags); 529 local_irq_restore(flags);
530 } 530 }
531 531
532 /* 532 /*
533 * permit the bootmem allocator to evade page validation on high-order frees 533 * permit the bootmem allocator to evade page validation on high-order frees
534 */ 534 */
535 void __free_pages_bootmem(struct page *page, unsigned int order) 535 void __free_pages_bootmem(struct page *page, unsigned int order)
536 { 536 {
537 if (order == 0) { 537 if (order == 0) {
538 __ClearPageReserved(page); 538 __ClearPageReserved(page);
539 set_page_count(page, 0); 539 set_page_count(page, 0);
540 set_page_refcounted(page); 540 set_page_refcounted(page);
541 __free_page(page); 541 __free_page(page);
542 } else { 542 } else {
543 int loop; 543 int loop;
544 544
545 prefetchw(page); 545 prefetchw(page);
546 for (loop = 0; loop < BITS_PER_LONG; loop++) { 546 for (loop = 0; loop < BITS_PER_LONG; loop++) {
547 struct page *p = &page[loop]; 547 struct page *p = &page[loop];
548 548
549 if (loop + 1 < BITS_PER_LONG) 549 if (loop + 1 < BITS_PER_LONG)
550 prefetchw(p + 1); 550 prefetchw(p + 1);
551 __ClearPageReserved(p); 551 __ClearPageReserved(p);
552 set_page_count(p, 0); 552 set_page_count(p, 0);
553 } 553 }
554 554
555 set_page_refcounted(page); 555 set_page_refcounted(page);
556 __free_pages(page, order); 556 __free_pages(page, order);
557 } 557 }
558 } 558 }
559 559
560 560
561 /* 561 /*
562 * The order of subdivision here is critical for the IO subsystem. 562 * The order of subdivision here is critical for the IO subsystem.
563 * Please do not alter this order without good reasons and regression 563 * Please do not alter this order without good reasons and regression
564 * testing. Specifically, as large blocks of memory are subdivided, 564 * testing. Specifically, as large blocks of memory are subdivided,
565 * the order in which smaller blocks are delivered depends on the order 565 * the order in which smaller blocks are delivered depends on the order
566 * they're subdivided in this function. This is the primary factor 566 * they're subdivided in this function. This is the primary factor
567 * influencing the order in which pages are delivered to the IO 567 * influencing the order in which pages are delivered to the IO
568 * subsystem according to empirical testing, and this is also justified 568 * subsystem according to empirical testing, and this is also justified
569 * by considering the behavior of a buddy system containing a single 569 * by considering the behavior of a buddy system containing a single
570 * large block of memory acted on by a series of small allocations. 570 * large block of memory acted on by a series of small allocations.
571 * This behavior is a critical factor in sglist merging's success. 571 * This behavior is a critical factor in sglist merging's success.
572 * 572 *
573 * -- wli 573 * -- wli
574 */ 574 */
575 static inline void expand(struct zone *zone, struct page *page, 575 static inline void expand(struct zone *zone, struct page *page,
576 int low, int high, struct free_area *area, 576 int low, int high, struct free_area *area,
577 int migratetype) 577 int migratetype)
578 { 578 {
579 unsigned long size = 1 << high; 579 unsigned long size = 1 << high;
580 580
581 while (high > low) { 581 while (high > low) {
582 area--; 582 area--;
583 high--; 583 high--;
584 size >>= 1; 584 size >>= 1;
585 VM_BUG_ON(bad_range(zone, &page[size])); 585 VM_BUG_ON(bad_range(zone, &page[size]));
586 list_add(&page[size].lru, &area->free_list[migratetype]); 586 list_add(&page[size].lru, &area->free_list[migratetype]);
587 area->nr_free++; 587 area->nr_free++;
588 set_page_order(&page[size], high); 588 set_page_order(&page[size], high);
589 } 589 }
590 } 590 }
591 591
592 /* 592 /*
593 * This page is about to be returned from the page allocator 593 * This page is about to be returned from the page allocator
594 */ 594 */
595 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 595 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
596 { 596 {
597 if (unlikely(page_mapcount(page) | 597 if (unlikely(page_mapcount(page) |
598 (page->mapping != NULL) | 598 (page->mapping != NULL) |
599 (page_get_page_cgroup(page) != NULL) | 599 (page_get_page_cgroup(page) != NULL) |
600 (page_count(page) != 0) | 600 (page_count(page) != 0) |
601 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) 601 (page->flags & PAGE_FLAGS_CHECK_AT_PREP)))
602 bad_page(page); 602 bad_page(page);
603 603
604 /* 604 /*
605 * For now, we report if PG_reserved was found set, but do not 605 * For now, we report if PG_reserved was found set, but do not
606 * clear it, and do not allocate the page: as a safety net. 606 * clear it, and do not allocate the page: as a safety net.
607 */ 607 */
608 if (PageReserved(page)) 608 if (PageReserved(page))
609 return 1; 609 return 1;
610 610
611 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim | 611 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim |
612 1 << PG_referenced | 1 << PG_arch_1 | 612 1 << PG_referenced | 1 << PG_arch_1 |
613 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); 613 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk);
614 set_page_private(page, 0); 614 set_page_private(page, 0);
615 set_page_refcounted(page); 615 set_page_refcounted(page);
616 616
617 arch_alloc_page(page, order); 617 arch_alloc_page(page, order);
618 kernel_map_pages(page, 1 << order, 1); 618 kernel_map_pages(page, 1 << order, 1);
619 619
620 if (gfp_flags & __GFP_ZERO) 620 if (gfp_flags & __GFP_ZERO)
621 prep_zero_page(page, order, gfp_flags); 621 prep_zero_page(page, order, gfp_flags);
622 622
623 if (order && (gfp_flags & __GFP_COMP)) 623 if (order && (gfp_flags & __GFP_COMP))
624 prep_compound_page(page, order); 624 prep_compound_page(page, order);
625 625
626 return 0; 626 return 0;
627 } 627 }
628 628
629 /* 629 /*
630 * Go through the free lists for the given migratetype and remove 630 * Go through the free lists for the given migratetype and remove
631 * the smallest available page from the freelists 631 * the smallest available page from the freelists
632 */ 632 */
633 static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 633 static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
634 int migratetype) 634 int migratetype)
635 { 635 {
636 unsigned int current_order; 636 unsigned int current_order;
637 struct free_area * area; 637 struct free_area * area;
638 struct page *page; 638 struct page *page;
639 639
640 /* Find a page of the appropriate size in the preferred list */ 640 /* Find a page of the appropriate size in the preferred list */
641 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 641 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
642 area = &(zone->free_area[current_order]); 642 area = &(zone->free_area[current_order]);
643 if (list_empty(&area->free_list[migratetype])) 643 if (list_empty(&area->free_list[migratetype]))
644 continue; 644 continue;
645 645
646 page = list_entry(area->free_list[migratetype].next, 646 page = list_entry(area->free_list[migratetype].next,
647 struct page, lru); 647 struct page, lru);
648 list_del(&page->lru); 648 list_del(&page->lru);
649 rmv_page_order(page); 649 rmv_page_order(page);
650 area->nr_free--; 650 area->nr_free--;
651 __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); 651 __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
652 expand(zone, page, order, current_order, area, migratetype); 652 expand(zone, page, order, current_order, area, migratetype);
653 return page; 653 return page;
654 } 654 }
655 655
656 return NULL; 656 return NULL;
657 } 657 }
658 658
659 659
660 /* 660 /*
661 * This array describes the order lists are fallen back to when 661 * This array describes the order lists are fallen back to when
662 * the free lists for the desirable migrate type are depleted 662 * the free lists for the desirable migrate type are depleted
663 */ 663 */
664 static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = { 664 static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
665 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 665 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
666 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 666 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
667 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 667 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
668 [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */ 668 [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */
669 }; 669 };
670 670
671 /* 671 /*
672 * Move the free pages in a range to the free lists of the requested type. 672 * Move the free pages in a range to the free lists of the requested type.
673 * Note that start_page and end_pages are not aligned on a pageblock 673 * Note that start_page and end_pages are not aligned on a pageblock
674 * boundary. If alignment is required, use move_freepages_block() 674 * boundary. If alignment is required, use move_freepages_block()
675 */ 675 */
676 int move_freepages(struct zone *zone, 676 int move_freepages(struct zone *zone,
677 struct page *start_page, struct page *end_page, 677 struct page *start_page, struct page *end_page,
678 int migratetype) 678 int migratetype)
679 { 679 {
680 struct page *page; 680 struct page *page;
681 unsigned long order; 681 unsigned long order;
682 int pages_moved = 0; 682 int pages_moved = 0;
683 683
684 #ifndef CONFIG_HOLES_IN_ZONE 684 #ifndef CONFIG_HOLES_IN_ZONE
685 /* 685 /*
686 * page_zone is not safe to call in this context when 686 * page_zone is not safe to call in this context when
687 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant 687 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
688 * anyway as we check zone boundaries in move_freepages_block(). 688 * anyway as we check zone boundaries in move_freepages_block().
689 * Remove at a later date when no bug reports exist related to 689 * Remove at a later date when no bug reports exist related to
690 * grouping pages by mobility 690 * grouping pages by mobility
691 */ 691 */
692 BUG_ON(page_zone(start_page) != page_zone(end_page)); 692 BUG_ON(page_zone(start_page) != page_zone(end_page));
693 #endif 693 #endif
694 694
695 for (page = start_page; page <= end_page;) { 695 for (page = start_page; page <= end_page;) {
696 if (!pfn_valid_within(page_to_pfn(page))) { 696 if (!pfn_valid_within(page_to_pfn(page))) {
697 page++; 697 page++;
698 continue; 698 continue;
699 } 699 }
700 700
701 if (!PageBuddy(page)) { 701 if (!PageBuddy(page)) {
702 page++; 702 page++;
703 continue; 703 continue;
704 } 704 }
705 705
706 order = page_order(page); 706 order = page_order(page);
707 list_del(&page->lru); 707 list_del(&page->lru);
708 list_add(&page->lru, 708 list_add(&page->lru,
709 &zone->free_area[order].free_list[migratetype]); 709 &zone->free_area[order].free_list[migratetype]);
710 page += 1 << order; 710 page += 1 << order;
711 pages_moved += 1 << order; 711 pages_moved += 1 << order;
712 } 712 }
713 713
714 return pages_moved; 714 return pages_moved;
715 } 715 }
716 716
717 int move_freepages_block(struct zone *zone, struct page *page, int migratetype) 717 int move_freepages_block(struct zone *zone, struct page *page, int migratetype)
718 { 718 {
719 unsigned long start_pfn, end_pfn; 719 unsigned long start_pfn, end_pfn;
720 struct page *start_page, *end_page; 720 struct page *start_page, *end_page;
721 721
722 start_pfn = page_to_pfn(page); 722 start_pfn = page_to_pfn(page);
723 start_pfn = start_pfn & ~(pageblock_nr_pages-1); 723 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
724 start_page = pfn_to_page(start_pfn); 724 start_page = pfn_to_page(start_pfn);
725 end_page = start_page + pageblock_nr_pages - 1; 725 end_page = start_page + pageblock_nr_pages - 1;
726 end_pfn = start_pfn + pageblock_nr_pages - 1; 726 end_pfn = start_pfn + pageblock_nr_pages - 1;
727 727
728 /* Do not cross zone boundaries */ 728 /* Do not cross zone boundaries */
729 if (start_pfn < zone->zone_start_pfn) 729 if (start_pfn < zone->zone_start_pfn)
730 start_page = page; 730 start_page = page;
731 if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) 731 if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
732 return 0; 732 return 0;
733 733
734 return move_freepages(zone, start_page, end_page, migratetype); 734 return move_freepages(zone, start_page, end_page, migratetype);
735 } 735 }
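A worked example of the alignment above, assuming for illustration that pageblock_nr_pages is 512 (pageblock_order 9): for a page at pfn 1000, start_pfn becomes 1000 & ~511 = 512 and the block spans pfns 512..1023. If that rounded end fell beyond the zone's last page, the function would return 0 and move nothing; if the rounded start fell before the zone, it would start from the page that was passed in instead.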
736 736
737 /* Remove an element from the buddy allocator from the fallback list */ 737 /* Remove an element from the buddy allocator from the fallback list */
738 static struct page *__rmqueue_fallback(struct zone *zone, int order, 738 static struct page *__rmqueue_fallback(struct zone *zone, int order,
739 int start_migratetype) 739 int start_migratetype)
740 { 740 {
741 struct free_area * area; 741 struct free_area * area;
742 int current_order; 742 int current_order;
743 struct page *page; 743 struct page *page;
744 int migratetype, i; 744 int migratetype, i;
745 745
746 /* Find the largest possible block of pages in the other list */ 746 /* Find the largest possible block of pages in the other list */
747 for (current_order = MAX_ORDER-1; current_order >= order; 747 for (current_order = MAX_ORDER-1; current_order >= order;
748 --current_order) { 748 --current_order) {
749 for (i = 0; i < MIGRATE_TYPES - 1; i++) { 749 for (i = 0; i < MIGRATE_TYPES - 1; i++) {
750 migratetype = fallbacks[start_migratetype][i]; 750 migratetype = fallbacks[start_migratetype][i];
751 751
752 /* MIGRATE_RESERVE handled later if necessary */ 752 /* MIGRATE_RESERVE handled later if necessary */
753 if (migratetype == MIGRATE_RESERVE) 753 if (migratetype == MIGRATE_RESERVE)
754 continue; 754 continue;
755 755
756 area = &(zone->free_area[current_order]); 756 area = &(zone->free_area[current_order]);
757 if (list_empty(&area->free_list[migratetype])) 757 if (list_empty(&area->free_list[migratetype]))
758 continue; 758 continue;
759 759
760 page = list_entry(area->free_list[migratetype].next, 760 page = list_entry(area->free_list[migratetype].next,
761 struct page, lru); 761 struct page, lru);
762 area->nr_free--; 762 area->nr_free--;
763 763
764 /* 764 /*
765 * If breaking a large block of pages, move all free 765 * If breaking a large block of pages, move all free
766 * pages to the preferred allocation list. If falling 766 * pages to the preferred allocation list. If falling
767 * back for a reclaimable kernel allocation, be more 767 * back for a reclaimable kernel allocation, be more
768 * aggressive about taking ownership of free pages 768 * aggressive about taking ownership of free pages
769 */ 769 */
770 if (unlikely(current_order >= (pageblock_order >> 1)) || 770 if (unlikely(current_order >= (pageblock_order >> 1)) ||
771 start_migratetype == MIGRATE_RECLAIMABLE) { 771 start_migratetype == MIGRATE_RECLAIMABLE) {
772 unsigned long pages; 772 unsigned long pages;
773 pages = move_freepages_block(zone, page, 773 pages = move_freepages_block(zone, page,
774 start_migratetype); 774 start_migratetype);
775 775
776 /* Claim the whole block if over half of it is free */ 776 /* Claim the whole block if over half of it is free */
777 if (pages >= (1 << (pageblock_order-1))) 777 if (pages >= (1 << (pageblock_order-1)))
778 set_pageblock_migratetype(page, 778 set_pageblock_migratetype(page,
779 start_migratetype); 779 start_migratetype);
780 780
781 migratetype = start_migratetype; 781 migratetype = start_migratetype;
782 } 782 }
783 783
784 /* Remove the page from the freelists */ 784 /* Remove the page from the freelists */
785 list_del(&page->lru); 785 list_del(&page->lru);
786 rmv_page_order(page); 786 rmv_page_order(page);
787 __mod_zone_page_state(zone, NR_FREE_PAGES, 787 __mod_zone_page_state(zone, NR_FREE_PAGES,
788 -(1UL << order)); 788 -(1UL << order));
789 789
790 if (current_order == pageblock_order) 790 if (current_order == pageblock_order)
791 set_pageblock_migratetype(page, 791 set_pageblock_migratetype(page,
792 start_migratetype); 792 start_migratetype);
793 793
794 expand(zone, page, order, current_order, area, migratetype); 794 expand(zone, page, order, current_order, area, migratetype);
795 return page; 795 return page;
796 } 796 }
797 } 797 }
798 798
799 /* Use MIGRATE_RESERVE rather than fail an allocation */ 799 /* Use MIGRATE_RESERVE rather than fail an allocation */
800 return __rmqueue_smallest(zone, order, MIGRATE_RESERVE); 800 return __rmqueue_smallest(zone, order, MIGRATE_RESERVE);
801 } 801 }
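To put numbers on the two thresholds above, assuming for illustration a pageblock_order of 9 (512-page pageblocks): a fallback taken at order 4 or higher (current_order >= pageblock_order >> 1), or any fallback for a MIGRATE_RECLAIMABLE request, moves the surrounding block's free pages with move_freepages_block(); if at least 256 of them (1 << (pageblock_order - 1)) were free and moved, the whole pageblock is re-tagged with the requesting migratetype so that future frees land on the right list.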
802 802
803 /* 803 /*
804 * Do the hard work of removing an element from the buddy allocator. 804 * Do the hard work of removing an element from the buddy allocator.
805 * Call me with the zone->lock already held. 805 * Call me with the zone->lock already held.
806 */ 806 */
807 static struct page *__rmqueue(struct zone *zone, unsigned int order, 807 static struct page *__rmqueue(struct zone *zone, unsigned int order,
808 int migratetype) 808 int migratetype)
809 { 809 {
810 struct page *page; 810 struct page *page;
811 811
812 page = __rmqueue_smallest(zone, order, migratetype); 812 page = __rmqueue_smallest(zone, order, migratetype);
813 813
814 if (unlikely(!page)) 814 if (unlikely(!page))
815 page = __rmqueue_fallback(zone, order, migratetype); 815 page = __rmqueue_fallback(zone, order, migratetype);
816 816
817 return page; 817 return page;
818 } 818 }
819 819
820 /* 820 /*
821 * Obtain a specified number of elements from the buddy allocator, all under 821 * Obtain a specified number of elements from the buddy allocator, all under
822 * a single hold of the lock, for efficiency. Add them to the supplied list. 822 * a single hold of the lock, for efficiency. Add them to the supplied list.
823 * Returns the number of new pages which were placed at *list. 823 * Returns the number of new pages which were placed at *list.
824 */ 824 */
825 static int rmqueue_bulk(struct zone *zone, unsigned int order, 825 static int rmqueue_bulk(struct zone *zone, unsigned int order,
826 unsigned long count, struct list_head *list, 826 unsigned long count, struct list_head *list,
827 int migratetype) 827 int migratetype)
828 { 828 {
829 int i; 829 int i;
830 830
831 spin_lock(&zone->lock); 831 spin_lock(&zone->lock);
832 for (i = 0; i < count; ++i) { 832 for (i = 0; i < count; ++i) {
833 struct page *page = __rmqueue(zone, order, migratetype); 833 struct page *page = __rmqueue(zone, order, migratetype);
834 if (unlikely(page == NULL)) 834 if (unlikely(page == NULL))
835 break; 835 break;
836 836
837 /* 837 /*
838 * Split buddy pages returned by expand() are received here 838 * Split buddy pages returned by expand() are received here
839 * in physical page order. The page is added to the caller's 839 * in physical page order. The page is added to the caller's
840 * list and the list head then moves forward. From the caller's 840 * list and the list head then moves forward. From the caller's
841 * perspective, the linked list is ordered by page number under 841 * perspective, the linked list is ordered by page number under
842 * some conditions. This is useful for IO devices that can 842 * some conditions. This is useful for IO devices that can
843 * merge IO requests if the physical pages are ordered 843 * merge IO requests if the physical pages are ordered
844 * properly. 844 * properly.
845 */ 845 */
846 list_add(&page->lru, list); 846 list_add(&page->lru, list);
847 set_page_private(page, migratetype); 847 set_page_private(page, migratetype);
848 list = &page->lru; 848 list = &page->lru;
849 } 849 }
850 spin_unlock(&zone->lock); 850 spin_unlock(&zone->lock);
851 return i; 851 return i;
852 } 852 }
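For example, when order-0 refills are split out of a single large buddy, successive __rmqueue() calls hand back pages at consecutive pfns (say 512, 513, 514); because each page is linked in right after the previous one (list is advanced to &page->lru every iteration), a caller walking the list from its head sees 512, 513, 514 in ascending physical order, which is what lets a block device merge the resulting IO requests.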
853 853
854 #ifdef CONFIG_NUMA 854 #ifdef CONFIG_NUMA
855 /* 855 /*
856 * Called from the vmstat counter updater to drain the pagesets of the 856 * Called from the vmstat counter updater to drain the pagesets of the
857 * currently executing processor that belong to zones on remote nodes, 857 * currently executing processor that belong to zones on remote nodes,
858 * once they have expired. 858 * once they have expired.
859 * 859 *
860 * Note that this function must be called with the thread pinned to 860 * Note that this function must be called with the thread pinned to
861 * a single processor. 861 * a single processor.
862 */ 862 */
863 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 863 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
864 { 864 {
865 unsigned long flags; 865 unsigned long flags;
866 int to_drain; 866 int to_drain;
867 867
868 local_irq_save(flags); 868 local_irq_save(flags);
869 if (pcp->count >= pcp->batch) 869 if (pcp->count >= pcp->batch)
870 to_drain = pcp->batch; 870 to_drain = pcp->batch;
871 else 871 else
872 to_drain = pcp->count; 872 to_drain = pcp->count;
873 free_pages_bulk(zone, to_drain, &pcp->list, 0); 873 free_pages_bulk(zone, to_drain, &pcp->list, 0);
874 pcp->count -= to_drain; 874 pcp->count -= to_drain;
875 local_irq_restore(flags); 875 local_irq_restore(flags);
876 } 876 }
877 #endif 877 #endif
878 878
879 /* 879 /*
880 * Drain pages of the indicated processor. 880 * Drain pages of the indicated processor.
881 * 881 *
882 * The processor must either be the current processor and the 882 * The processor must either be the current processor and the
883 * thread pinned to the current processor or a processor that 883 * thread pinned to the current processor or a processor that
884 * is not online. 884 * is not online.
885 */ 885 */
886 static void drain_pages(unsigned int cpu) 886 static void drain_pages(unsigned int cpu)
887 { 887 {
888 unsigned long flags; 888 unsigned long flags;
889 struct zone *zone; 889 struct zone *zone;
890 890
891 for_each_zone(zone) { 891 for_each_zone(zone) {
892 struct per_cpu_pageset *pset; 892 struct per_cpu_pageset *pset;
893 struct per_cpu_pages *pcp; 893 struct per_cpu_pages *pcp;
894 894
895 if (!populated_zone(zone)) 895 if (!populated_zone(zone))
896 continue; 896 continue;
897 897
898 pset = zone_pcp(zone, cpu); 898 pset = zone_pcp(zone, cpu);
899 899
900 pcp = &pset->pcp; 900 pcp = &pset->pcp;
901 local_irq_save(flags); 901 local_irq_save(flags);
902 free_pages_bulk(zone, pcp->count, &pcp->list, 0); 902 free_pages_bulk(zone, pcp->count, &pcp->list, 0);
903 pcp->count = 0; 903 pcp->count = 0;
904 local_irq_restore(flags); 904 local_irq_restore(flags);
905 } 905 }
906 } 906 }
907 907
908 /* 908 /*
909 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 909 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
910 */ 910 */
911 void drain_local_pages(void *arg) 911 void drain_local_pages(void *arg)
912 { 912 {
913 drain_pages(smp_processor_id()); 913 drain_pages(smp_processor_id());
914 } 914 }
915 915
916 /* 916 /*
917 * Spill all the per-cpu pages from all CPUs back into the buddy allocator 917 * Spill all the per-cpu pages from all CPUs back into the buddy allocator
918 */ 918 */
919 void drain_all_pages(void) 919 void drain_all_pages(void)
920 { 920 {
921 on_each_cpu(drain_local_pages, NULL, 1); 921 on_each_cpu(drain_local_pages, NULL, 1);
922 } 922 }
923 923
924 #ifdef CONFIG_HIBERNATION 924 #ifdef CONFIG_HIBERNATION
925 925
926 void mark_free_pages(struct zone *zone) 926 void mark_free_pages(struct zone *zone)
927 { 927 {
928 unsigned long pfn, max_zone_pfn; 928 unsigned long pfn, max_zone_pfn;
929 unsigned long flags; 929 unsigned long flags;
930 int order, t; 930 int order, t;
931 struct list_head *curr; 931 struct list_head *curr;
932 932
933 if (!zone->spanned_pages) 933 if (!zone->spanned_pages)
934 return; 934 return;
935 935
936 spin_lock_irqsave(&zone->lock, flags); 936 spin_lock_irqsave(&zone->lock, flags);
937 937
938 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 938 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
939 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 939 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
940 if (pfn_valid(pfn)) { 940 if (pfn_valid(pfn)) {
941 struct page *page = pfn_to_page(pfn); 941 struct page *page = pfn_to_page(pfn);
942 942
943 if (!swsusp_page_is_forbidden(page)) 943 if (!swsusp_page_is_forbidden(page))
944 swsusp_unset_page_free(page); 944 swsusp_unset_page_free(page);
945 } 945 }
946 946
947 for_each_migratetype_order(order, t) { 947 for_each_migratetype_order(order, t) {
948 list_for_each(curr, &zone->free_area[order].free_list[t]) { 948 list_for_each(curr, &zone->free_area[order].free_list[t]) {
949 unsigned long i; 949 unsigned long i;
950 950
951 pfn = page_to_pfn(list_entry(curr, struct page, lru)); 951 pfn = page_to_pfn(list_entry(curr, struct page, lru));
952 for (i = 0; i < (1UL << order); i++) 952 for (i = 0; i < (1UL << order); i++)
953 swsusp_set_page_free(pfn_to_page(pfn + i)); 953 swsusp_set_page_free(pfn_to_page(pfn + i));
954 } 954 }
955 } 955 }
956 spin_unlock_irqrestore(&zone->lock, flags); 956 spin_unlock_irqrestore(&zone->lock, flags);
957 } 957 }
958 #endif /* CONFIG_HIBERNATION */ 958 #endif /* CONFIG_HIBERNATION */
959 959
960 /* 960 /*
961 * Free a 0-order page 961 * Free a 0-order page
962 */ 962 */
963 static void free_hot_cold_page(struct page *page, int cold) 963 static void free_hot_cold_page(struct page *page, int cold)
964 { 964 {
965 struct zone *zone = page_zone(page); 965 struct zone *zone = page_zone(page);
966 struct per_cpu_pages *pcp; 966 struct per_cpu_pages *pcp;
967 unsigned long flags; 967 unsigned long flags;
968 968
969 if (PageAnon(page)) 969 if (PageAnon(page))
970 page->mapping = NULL; 970 page->mapping = NULL;
971 if (free_pages_check(page)) 971 if (free_pages_check(page))
972 return; 972 return;
973 973
974 if (!PageHighMem(page)) { 974 if (!PageHighMem(page)) {
975 debug_check_no_locks_freed(page_address(page), PAGE_SIZE); 975 debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
976 debug_check_no_obj_freed(page_address(page), PAGE_SIZE); 976 debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
977 } 977 }
978 arch_free_page(page, 0); 978 arch_free_page(page, 0);
979 kernel_map_pages(page, 1, 0); 979 kernel_map_pages(page, 1, 0);
980 980
981 pcp = &zone_pcp(zone, get_cpu())->pcp; 981 pcp = &zone_pcp(zone, get_cpu())->pcp;
982 local_irq_save(flags); 982 local_irq_save(flags);
983 __count_vm_event(PGFREE); 983 __count_vm_event(PGFREE);
984 if (cold) 984 if (cold)
985 list_add_tail(&page->lru, &pcp->list); 985 list_add_tail(&page->lru, &pcp->list);
986 else 986 else
987 list_add(&page->lru, &pcp->list); 987 list_add(&page->lru, &pcp->list);
988 set_page_private(page, get_pageblock_migratetype(page)); 988 set_page_private(page, get_pageblock_migratetype(page));
989 pcp->count++; 989 pcp->count++;
990 if (pcp->count >= pcp->high) { 990 if (pcp->count >= pcp->high) {
991 free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 991 free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
992 pcp->count -= pcp->batch; 992 pcp->count -= pcp->batch;
993 } 993 }
994 local_irq_restore(flags); 994 local_irq_restore(flags);
995 put_cpu(); 995 put_cpu();
996 } 996 }
997 997
998 void free_hot_page(struct page *page) 998 void free_hot_page(struct page *page)
999 { 999 {
1000 free_hot_cold_page(page, 0); 1000 free_hot_cold_page(page, 0);
1001 } 1001 }
1002 1002
1003 void free_cold_page(struct page *page) 1003 void free_cold_page(struct page *page)
1004 { 1004 {
1005 free_hot_cold_page(page, 1); 1005 free_hot_cold_page(page, 1);
1006 } 1006 }
1007 1007
1008 /* 1008 /*
1009 * split_page takes a non-compound higher-order page, and splits it into 1009 * split_page takes a non-compound higher-order page, and splits it into
1010 * n (1<<order) sub-pages: page[0..n-1] 1010 * n (1<<order) sub-pages: page[0..n-1]
1011 * Each sub-page must be freed individually. 1011 * Each sub-page must be freed individually.
1012 * 1012 *
1013 * Note: this is probably too low level an operation for use in drivers. 1013 * Note: this is probably too low level an operation for use in drivers.
1014 * Please consult with lkml before using this in your driver. 1014 * Please consult with lkml before using this in your driver.
1015 */ 1015 */
1016 void split_page(struct page *page, unsigned int order) 1016 void split_page(struct page *page, unsigned int order)
1017 { 1017 {
1018 int i; 1018 int i;
1019 1019
1020 VM_BUG_ON(PageCompound(page)); 1020 VM_BUG_ON(PageCompound(page));
1021 VM_BUG_ON(!page_count(page)); 1021 VM_BUG_ON(!page_count(page));
1022 for (i = 1; i < (1 << order); i++) 1022 for (i = 1; i < (1 << order); i++)
1023 set_page_refcounted(page + i); 1023 set_page_refcounted(page + i);
1024 } 1024 }
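A minimal usage sketch for split_page() (illustrative only):

	struct page *page = alloc_pages(GFP_KERNEL, 2);	/* one non-compound order-2 block (4 pages) */

	if (page) {
		split_page(page, 2);		/* page[0]..page[3] now each carry their own reference */
		__free_page(page + 3);		/* ...so the sub-pages can be returned one at a time */
	}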
1025 1025
1026 /* 1026 /*
1027 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 1027 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
1028 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1028 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1029 * or two. 1029 * or two.
1030 */ 1030 */
1031 static struct page *buffered_rmqueue(struct zone *preferred_zone, 1031 static struct page *buffered_rmqueue(struct zone *preferred_zone,
1032 struct zone *zone, int order, gfp_t gfp_flags) 1032 struct zone *zone, int order, gfp_t gfp_flags)
1033 { 1033 {
1034 unsigned long flags; 1034 unsigned long flags;
1035 struct page *page; 1035 struct page *page;
1036 int cold = !!(gfp_flags & __GFP_COLD); 1036 int cold = !!(gfp_flags & __GFP_COLD);
1037 int cpu; 1037 int cpu;
1038 int migratetype = allocflags_to_migratetype(gfp_flags); 1038 int migratetype = allocflags_to_migratetype(gfp_flags);
1039 1039
1040 again: 1040 again:
1041 cpu = get_cpu(); 1041 cpu = get_cpu();
1042 if (likely(order == 0)) { 1042 if (likely(order == 0)) {
1043 struct per_cpu_pages *pcp; 1043 struct per_cpu_pages *pcp;
1044 1044
1045 pcp = &zone_pcp(zone, cpu)->pcp; 1045 pcp = &zone_pcp(zone, cpu)->pcp;
1046 local_irq_save(flags); 1046 local_irq_save(flags);
1047 if (!pcp->count) { 1047 if (!pcp->count) {
1048 pcp->count = rmqueue_bulk(zone, 0, 1048 pcp->count = rmqueue_bulk(zone, 0,
1049 pcp->batch, &pcp->list, migratetype); 1049 pcp->batch, &pcp->list, migratetype);
1050 if (unlikely(!pcp->count)) 1050 if (unlikely(!pcp->count))
1051 goto failed; 1051 goto failed;
1052 } 1052 }
1053 1053
1054 /* Find a page of the appropriate migrate type */ 1054 /* Find a page of the appropriate migrate type */
1055 if (cold) { 1055 if (cold) {
1056 list_for_each_entry_reverse(page, &pcp->list, lru) 1056 list_for_each_entry_reverse(page, &pcp->list, lru)
1057 if (page_private(page) == migratetype) 1057 if (page_private(page) == migratetype)
1058 break; 1058 break;
1059 } else { 1059 } else {
1060 list_for_each_entry(page, &pcp->list, lru) 1060 list_for_each_entry(page, &pcp->list, lru)
1061 if (page_private(page) == migratetype) 1061 if (page_private(page) == migratetype)
1062 break; 1062 break;
1063 } 1063 }
1064 1064
1065 /* Allocate more to the pcp list if necessary */ 1065 /* Allocate more to the pcp list if necessary */
1066 if (unlikely(&page->lru == &pcp->list)) { 1066 if (unlikely(&page->lru == &pcp->list)) {
1067 pcp->count += rmqueue_bulk(zone, 0, 1067 pcp->count += rmqueue_bulk(zone, 0,
1068 pcp->batch, &pcp->list, migratetype); 1068 pcp->batch, &pcp->list, migratetype);
1069 page = list_entry(pcp->list.next, struct page, lru); 1069 page = list_entry(pcp->list.next, struct page, lru);
1070 } 1070 }
1071 1071
1072 list_del(&page->lru); 1072 list_del(&page->lru);
1073 pcp->count--; 1073 pcp->count--;
1074 } else { 1074 } else {
1075 spin_lock_irqsave(&zone->lock, flags); 1075 spin_lock_irqsave(&zone->lock, flags);
1076 page = __rmqueue(zone, order, migratetype); 1076 page = __rmqueue(zone, order, migratetype);
1077 spin_unlock(&zone->lock); 1077 spin_unlock(&zone->lock);
1078 if (!page) 1078 if (!page)
1079 goto failed; 1079 goto failed;
1080 } 1080 }
1081 1081
1082 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1082 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1083 zone_statistics(preferred_zone, zone); 1083 zone_statistics(preferred_zone, zone);
1084 local_irq_restore(flags); 1084 local_irq_restore(flags);
1085 put_cpu(); 1085 put_cpu();
1086 1086
1087 VM_BUG_ON(bad_range(zone, page)); 1087 VM_BUG_ON(bad_range(zone, page));
1088 if (prep_new_page(page, order, gfp_flags)) 1088 if (prep_new_page(page, order, gfp_flags))
1089 goto again; 1089 goto again;
1090 return page; 1090 return page;
1091 1091
1092 failed: 1092 failed:
1093 local_irq_restore(flags); 1093 local_irq_restore(flags);
1094 put_cpu(); 1094 put_cpu();
1095 return NULL; 1095 return NULL;
1096 } 1096 }
1097 1097
1098 #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ 1098 #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */
1099 #define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ 1099 #define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */
1100 #define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ 1100 #define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */
1101 #define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ 1101 #define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */
1102 #define ALLOC_HARDER 0x10 /* try to alloc harder */ 1102 #define ALLOC_HARDER 0x10 /* try to alloc harder */
1103 #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ 1103 #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
1104 #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ 1104 #define ALLOC_CPUSET 0x40 /* check for correct cpuset */
1105 1105
1106 #ifdef CONFIG_FAIL_PAGE_ALLOC 1106 #ifdef CONFIG_FAIL_PAGE_ALLOC
1107 1107
1108 static struct fail_page_alloc_attr { 1108 static struct fail_page_alloc_attr {
1109 struct fault_attr attr; 1109 struct fault_attr attr;
1110 1110
1111 u32 ignore_gfp_highmem; 1111 u32 ignore_gfp_highmem;
1112 u32 ignore_gfp_wait; 1112 u32 ignore_gfp_wait;
1113 u32 min_order; 1113 u32 min_order;
1114 1114
1115 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 1115 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1116 1116
1117 struct dentry *ignore_gfp_highmem_file; 1117 struct dentry *ignore_gfp_highmem_file;
1118 struct dentry *ignore_gfp_wait_file; 1118 struct dentry *ignore_gfp_wait_file;
1119 struct dentry *min_order_file; 1119 struct dentry *min_order_file;
1120 1120
1121 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 1121 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1122 1122
1123 } fail_page_alloc = { 1123 } fail_page_alloc = {
1124 .attr = FAULT_ATTR_INITIALIZER, 1124 .attr = FAULT_ATTR_INITIALIZER,
1125 .ignore_gfp_wait = 1, 1125 .ignore_gfp_wait = 1,
1126 .ignore_gfp_highmem = 1, 1126 .ignore_gfp_highmem = 1,
1127 .min_order = 1, 1127 .min_order = 1,
1128 }; 1128 };
1129 1129
1130 static int __init setup_fail_page_alloc(char *str) 1130 static int __init setup_fail_page_alloc(char *str)
1131 { 1131 {
1132 return setup_fault_attr(&fail_page_alloc.attr, str); 1132 return setup_fault_attr(&fail_page_alloc.attr, str);
1133 } 1133 }
1134 __setup("fail_page_alloc=", setup_fail_page_alloc); 1134 __setup("fail_page_alloc=", setup_fail_page_alloc);
1135 1135
1136 static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1136 static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1137 { 1137 {
1138 if (order < fail_page_alloc.min_order) 1138 if (order < fail_page_alloc.min_order)
1139 return 0; 1139 return 0;
1140 if (gfp_mask & __GFP_NOFAIL) 1140 if (gfp_mask & __GFP_NOFAIL)
1141 return 0; 1141 return 0;
1142 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 1142 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1143 return 0; 1143 return 0;
1144 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 1144 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1145 return 0; 1145 return 0;
1146 1146
1147 return should_fail(&fail_page_alloc.attr, 1 << order); 1147 return should_fail(&fail_page_alloc.attr, 1 << order);
1148 } 1148 }
1149 1149
1150 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 1150 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1151 1151
1152 static int __init fail_page_alloc_debugfs(void) 1152 static int __init fail_page_alloc_debugfs(void)
1153 { 1153 {
1154 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 1154 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1155 struct dentry *dir; 1155 struct dentry *dir;
1156 int err; 1156 int err;
1157 1157
1158 err = init_fault_attr_dentries(&fail_page_alloc.attr, 1158 err = init_fault_attr_dentries(&fail_page_alloc.attr,
1159 "fail_page_alloc"); 1159 "fail_page_alloc");
1160 if (err) 1160 if (err)
1161 return err; 1161 return err;
1162 dir = fail_page_alloc.attr.dentries.dir; 1162 dir = fail_page_alloc.attr.dentries.dir;
1163 1163
1164 fail_page_alloc.ignore_gfp_wait_file = 1164 fail_page_alloc.ignore_gfp_wait_file =
1165 debugfs_create_bool("ignore-gfp-wait", mode, dir, 1165 debugfs_create_bool("ignore-gfp-wait", mode, dir,
1166 &fail_page_alloc.ignore_gfp_wait); 1166 &fail_page_alloc.ignore_gfp_wait);
1167 1167
1168 fail_page_alloc.ignore_gfp_highmem_file = 1168 fail_page_alloc.ignore_gfp_highmem_file =
1169 debugfs_create_bool("ignore-gfp-highmem", mode, dir, 1169 debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1170 &fail_page_alloc.ignore_gfp_highmem); 1170 &fail_page_alloc.ignore_gfp_highmem);
1171 fail_page_alloc.min_order_file = 1171 fail_page_alloc.min_order_file =
1172 debugfs_create_u32("min-order", mode, dir, 1172 debugfs_create_u32("min-order", mode, dir,
1173 &fail_page_alloc.min_order); 1173 &fail_page_alloc.min_order);
1174 1174
1175 if (!fail_page_alloc.ignore_gfp_wait_file || 1175 if (!fail_page_alloc.ignore_gfp_wait_file ||
1176 !fail_page_alloc.ignore_gfp_highmem_file || 1176 !fail_page_alloc.ignore_gfp_highmem_file ||
1177 !fail_page_alloc.min_order_file) { 1177 !fail_page_alloc.min_order_file) {
1178 err = -ENOMEM; 1178 err = -ENOMEM;
1179 debugfs_remove(fail_page_alloc.ignore_gfp_wait_file); 1179 debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
1180 debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file); 1180 debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
1181 debugfs_remove(fail_page_alloc.min_order_file); 1181 debugfs_remove(fail_page_alloc.min_order_file);
1182 cleanup_fault_attr_dentries(&fail_page_alloc.attr); 1182 cleanup_fault_attr_dentries(&fail_page_alloc.attr);
1183 } 1183 }
1184 1184
1185 return err; 1185 return err;
1186 } 1186 }
1187 1187
1188 late_initcall(fail_page_alloc_debugfs); 1188 late_initcall(fail_page_alloc_debugfs);
1189 1189
1190 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 1190 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1191 1191
1192 #else /* CONFIG_FAIL_PAGE_ALLOC */ 1192 #else /* CONFIG_FAIL_PAGE_ALLOC */
1193 1193
1194 static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1194 static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1195 { 1195 {
1196 return 0; 1196 return 0;
1197 } 1197 }
1198 1198
1199 #endif /* CONFIG_FAIL_PAGE_ALLOC */ 1199 #endif /* CONFIG_FAIL_PAGE_ALLOC */
1200 1200
1201 /* 1201 /*
1202 * Return 1 if free pages are above 'mark'. This takes into account the order 1202 * Return 1 if free pages are above 'mark'. This takes into account the order
1203 * of the allocation. 1203 * of the allocation.
1204 */ 1204 */
1205 int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1205 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1206 int classzone_idx, int alloc_flags) 1206 int classzone_idx, int alloc_flags)
1207 { 1207 {
1208 /* free_pages may go negative - that's OK */ 1208 /* free_pages may go negative - that's OK */
1209 long min = mark; 1209 long min = mark;
1210 long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1; 1210 long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
1211 int o; 1211 int o;
1212 1212
1213 if (alloc_flags & ALLOC_HIGH) 1213 if (alloc_flags & ALLOC_HIGH)
1214 min -= min / 2; 1214 min -= min / 2;
1215 if (alloc_flags & ALLOC_HARDER) 1215 if (alloc_flags & ALLOC_HARDER)
1216 min -= min / 4; 1216 min -= min / 4;
1217 1217
1218 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 1218 if (free_pages <= min + z->lowmem_reserve[classzone_idx])
1219 return 0; 1219 return 0;
1220 for (o = 0; o < order; o++) { 1220 for (o = 0; o < order; o++) {
1221 /* At the next order, this order's pages become unavailable */ 1221 /* At the next order, this order's pages become unavailable */
1222 free_pages -= z->free_area[o].nr_free << o; 1222 free_pages -= z->free_area[o].nr_free << o;
1223 1223
1224 /* Require fewer higher order pages to be free */ 1224 /* Require fewer higher order pages to be free */
1225 min >>= 1; 1225 min >>= 1;
1226 1226
1227 if (free_pages <= min) 1227 if (free_pages <= min)
1228 return 0; 1228 return 0;
1229 } 1229 }
1230 return 1; 1230 return 1;
1231 } 1231 }
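A worked example with illustrative numbers: for an order-2 request against mark = 64, with neither ALLOC_HIGH nor ALLOC_HARDER and a lowmem_reserve of 0, the zone must hold more than roughly 64 free pages in total (the 4 requested pages are discounted first); then, with all order-0 free pages excluded, more than 32 must remain; and with order-1 pages excluded as well, more than 16 must still be left. The halving of min at each step is what steers higher-order requests toward zones that actually contain higher-order blocks.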
1232 1232
1233 #ifdef CONFIG_NUMA 1233 #ifdef CONFIG_NUMA
1234 /* 1234 /*
1235 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to 1235 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
1236 * skip over zones that are not allowed by the cpuset, or that have 1236 * skip over zones that are not allowed by the cpuset, or that have
1237 * been recently (in last second) found to be nearly full. See further 1237 * been recently (in last second) found to be nearly full. See further
1238 * comments in mmzone.h. Reduces cache footprint of zonelist scans 1238 * comments in mmzone.h. Reduces cache footprint of zonelist scans
1239 * that have to skip over a lot of full or unallowed zones. 1239 * that have to skip over a lot of full or unallowed zones.
1240 * 1240 *
1241 * If the zonelist cache is present in the passed in zonelist, then 1241 * If the zonelist cache is present in the passed in zonelist, then
1242 * returns a pointer to the allowed node mask (either the current 1242 * returns a pointer to the allowed node mask (either the current
1243 * task's mems_allowed, or node_states[N_HIGH_MEMORY].) 1243 * task's mems_allowed, or node_states[N_HIGH_MEMORY].)
1244 * 1244 *
1245 * If the zonelist cache is not available for this zonelist, does 1245 * If the zonelist cache is not available for this zonelist, does
1246 * nothing and returns NULL. 1246 * nothing and returns NULL.
1247 * 1247 *
1248 * If the fullzones BITMAP in the zonelist cache is stale (more than 1248 * If the fullzones BITMAP in the zonelist cache is stale (more than
1249 * a second since last zap'd) then we zap it out (clear its bits.) 1249 * a second since last zap'd) then we zap it out (clear its bits.)
1250 * 1250 *
1251 * We hold off even calling zlc_setup, until after we've checked the 1251 * We hold off even calling zlc_setup, until after we've checked the
1252 * first zone in the zonelist, on the theory that most allocations will 1252 * first zone in the zonelist, on the theory that most allocations will
1253 * be satisfied from that first zone, so best to examine that zone as 1253 * be satisfied from that first zone, so best to examine that zone as
1254 * quickly as we can. 1254 * quickly as we can.
1255 */ 1255 */
1256 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1256 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1257 { 1257 {
1258 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1258 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1259 nodemask_t *allowednodes; /* zonelist_cache approximation */ 1259 nodemask_t *allowednodes; /* zonelist_cache approximation */
1260 1260
1261 zlc = zonelist->zlcache_ptr; 1261 zlc = zonelist->zlcache_ptr;
1262 if (!zlc) 1262 if (!zlc)
1263 return NULL; 1263 return NULL;
1264 1264
1265 if (time_after(jiffies, zlc->last_full_zap + HZ)) { 1265 if (time_after(jiffies, zlc->last_full_zap + HZ)) {
1266 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1266 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1267 zlc->last_full_zap = jiffies; 1267 zlc->last_full_zap = jiffies;
1268 } 1268 }
1269 1269
1270 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1270 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1271 &cpuset_current_mems_allowed : 1271 &cpuset_current_mems_allowed :
1272 &node_states[N_HIGH_MEMORY]; 1272 &node_states[N_HIGH_MEMORY];
1273 return allowednodes; 1273 return allowednodes;
1274 } 1274 }
1275 1275
1276 /* 1276 /*
1277 * Given 'z' scanning a zonelist, run a couple of quick checks to see 1277 * Given 'z' scanning a zonelist, run a couple of quick checks to see
1278 * if it is worth looking at further for free memory: 1278 * if it is worth looking at further for free memory:
1279 * 1) Check that the zone isn't thought to be full (doesn't have its 1279 * 1) Check that the zone isn't thought to be full (doesn't have its
1280 * bit set in the zonelist_cache fullzones BITMAP). 1280 * bit set in the zonelist_cache fullzones BITMAP).
1281 * 2) Check that the zone's node (obtained from the zonelist_cache 1281 * 2) Check that the zone's node (obtained from the zonelist_cache
1282 * z_to_n[] mapping) is allowed in the passed in allowednodes mask. 1282 * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
1283 * Return true (non-zero) if zone is worth looking at further, or 1283 * Return true (non-zero) if zone is worth looking at further, or
1284 * else return false (zero) if it is not. 1284 * else return false (zero) if it is not.
1285 * 1285 *
1286 * This check -ignores- the distinction between various watermarks, 1286 * This check -ignores- the distinction between various watermarks,
1287 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is 1287 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
1288 * found to be full for any variation of these watermarks, it will 1288 * found to be full for any variation of these watermarks, it will
1289 * be considered full for up to one second by all requests, unless 1289 * be considered full for up to one second by all requests, unless
1290 * we are so low on memory on all allowed nodes that we are forced 1290 * we are so low on memory on all allowed nodes that we are forced
1291 * into the second scan of the zonelist. 1291 * into the second scan of the zonelist.
1292 * 1292 *
1293 * In the second scan we ignore this zonelist cache and exactly 1293 * In the second scan we ignore this zonelist cache and exactly
1294 * apply the watermarks to all zones, even if it is slower to do so. 1294 * apply the watermarks to all zones, even if it is slower to do so.
1295 * We are low on memory in the second scan, and should leave no stone 1295 * We are low on memory in the second scan, and should leave no stone
1296 * unturned looking for a free page. 1296 * unturned looking for a free page.
1297 */ 1297 */
1298 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1298 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1299 nodemask_t *allowednodes) 1299 nodemask_t *allowednodes)
1300 { 1300 {
1301 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1301 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1302 int i; /* index of *z in zonelist zones */ 1302 int i; /* index of *z in zonelist zones */
1303 int n; /* node that zone *z is on */ 1303 int n; /* node that zone *z is on */
1304 1304
1305 zlc = zonelist->zlcache_ptr; 1305 zlc = zonelist->zlcache_ptr;
1306 if (!zlc) 1306 if (!zlc)
1307 return 1; 1307 return 1;
1308 1308
1309 i = z - zonelist->_zonerefs; 1309 i = z - zonelist->_zonerefs;
1310 n = zlc->z_to_n[i]; 1310 n = zlc->z_to_n[i];
1311 1311
1312 /* This zone is worth trying if it is allowed but not full */ 1312 /* This zone is worth trying if it is allowed but not full */
1313 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); 1313 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1314 } 1314 }
1315 1315
1316 /* 1316 /*
1317 * Given 'z' scanning a zonelist, set the corresponding bit in 1317 * Given 'z' scanning a zonelist, set the corresponding bit in
1318 * zlc->fullzones, so that subsequent attempts to allocate a page 1318 * zlc->fullzones, so that subsequent attempts to allocate a page
1319 * from that zone don't waste time re-examining it. 1319 * from that zone don't waste time re-examining it.
1320 */ 1320 */
1321 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1321 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1322 { 1322 {
1323 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1323 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1324 int i; /* index of *z in zonelist zones */ 1324 int i; /* index of *z in zonelist zones */
1325 1325
1326 zlc = zonelist->zlcache_ptr; 1326 zlc = zonelist->zlcache_ptr;
1327 if (!zlc) 1327 if (!zlc)
1328 return; 1328 return;
1329 1329
1330 i = z - zonelist->_zonerefs; 1330 i = z - zonelist->_zonerefs;
1331 1331
1332 set_bit(i, zlc->fullzones); 1332 set_bit(i, zlc->fullzones);
1333 } 1333 }
1334 1334
1335 #else /* CONFIG_NUMA */ 1335 #else /* CONFIG_NUMA */
1336 1336
1337 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1337 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1338 { 1338 {
1339 return NULL; 1339 return NULL;
1340 } 1340 }
1341 1341
1342 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1342 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1343 nodemask_t *allowednodes) 1343 nodemask_t *allowednodes)
1344 { 1344 {
1345 return 1; 1345 return 1;
1346 } 1346 }
1347 1347
1348 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1348 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1349 { 1349 {
1350 } 1350 }
1351 #endif /* CONFIG_NUMA */ 1351 #endif /* CONFIG_NUMA */
1352 1352
1353 /* 1353 /*
1354 * get_page_from_freelist goes through the zonelist trying to allocate 1354 * get_page_from_freelist goes through the zonelist trying to allocate
1355 * a page. 1355 * a page.
1356 */ 1356 */
1357 static struct page * 1357 static struct page *
1358 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1358 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1359 struct zonelist *zonelist, int high_zoneidx, int alloc_flags) 1359 struct zonelist *zonelist, int high_zoneidx, int alloc_flags)
1360 { 1360 {
1361 struct zoneref *z; 1361 struct zoneref *z;
1362 struct page *page = NULL; 1362 struct page *page = NULL;
1363 int classzone_idx; 1363 int classzone_idx;
1364 struct zone *zone, *preferred_zone; 1364 struct zone *zone, *preferred_zone;
1365 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1365 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1366 int zlc_active = 0; /* set if using zonelist_cache */ 1366 int zlc_active = 0; /* set if using zonelist_cache */
1367 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1367 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1368 1368
1369 (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask, 1369 (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
1370 &preferred_zone); 1370 &preferred_zone);
1371 if (!preferred_zone) 1371 if (!preferred_zone)
1372 return NULL; 1372 return NULL;
1373 1373
1374 classzone_idx = zone_idx(preferred_zone); 1374 classzone_idx = zone_idx(preferred_zone);
1375 1375
1376 zonelist_scan: 1376 zonelist_scan:
1377 /* 1377 /*
1378 * Scan zonelist, looking for a zone with enough free pages. 1378 * Scan zonelist, looking for a zone with enough free pages.
1379 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1379 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1380 */ 1380 */
1381 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1381 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1382 high_zoneidx, nodemask) { 1382 high_zoneidx, nodemask) {
1383 if (NUMA_BUILD && zlc_active && 1383 if (NUMA_BUILD && zlc_active &&
1384 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1384 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1385 continue; 1385 continue;
1386 if ((alloc_flags & ALLOC_CPUSET) && 1386 if ((alloc_flags & ALLOC_CPUSET) &&
1387 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1387 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1388 goto try_next_zone; 1388 goto try_next_zone;
1389 1389
1390 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1390 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
1391 unsigned long mark; 1391 unsigned long mark;
1392 if (alloc_flags & ALLOC_WMARK_MIN) 1392 if (alloc_flags & ALLOC_WMARK_MIN)
1393 mark = zone->pages_min; 1393 mark = zone->pages_min;
1394 else if (alloc_flags & ALLOC_WMARK_LOW) 1394 else if (alloc_flags & ALLOC_WMARK_LOW)
1395 mark = zone->pages_low; 1395 mark = zone->pages_low;
1396 else 1396 else
1397 mark = zone->pages_high; 1397 mark = zone->pages_high;
1398 if (!zone_watermark_ok(zone, order, mark, 1398 if (!zone_watermark_ok(zone, order, mark,
1399 classzone_idx, alloc_flags)) { 1399 classzone_idx, alloc_flags)) {
1400 if (!zone_reclaim_mode || 1400 if (!zone_reclaim_mode ||
1401 !zone_reclaim(zone, gfp_mask, order)) 1401 !zone_reclaim(zone, gfp_mask, order))
1402 goto this_zone_full; 1402 goto this_zone_full;
1403 } 1403 }
1404 } 1404 }
1405 1405
1406 page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask); 1406 page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask);
1407 if (page) 1407 if (page)
1408 break; 1408 break;
1409 this_zone_full: 1409 this_zone_full:
1410 if (NUMA_BUILD) 1410 if (NUMA_BUILD)
1411 zlc_mark_zone_full(zonelist, z); 1411 zlc_mark_zone_full(zonelist, z);
1412 try_next_zone: 1412 try_next_zone:
1413 if (NUMA_BUILD && !did_zlc_setup) { 1413 if (NUMA_BUILD && !did_zlc_setup) {
1414 /* we do zlc_setup after the first zone is tried */ 1414 /* we do zlc_setup after the first zone is tried */
1415 allowednodes = zlc_setup(zonelist, alloc_flags); 1415 allowednodes = zlc_setup(zonelist, alloc_flags);
1416 zlc_active = 1; 1416 zlc_active = 1;
1417 did_zlc_setup = 1; 1417 did_zlc_setup = 1;
1418 } 1418 }
1419 } 1419 }
1420 1420
1421 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { 1421 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
1422 /* Disable zlc cache for second zonelist scan */ 1422 /* Disable zlc cache for second zonelist scan */
1423 zlc_active = 0; 1423 zlc_active = 0;
1424 goto zonelist_scan; 1424 goto zonelist_scan;
1425 } 1425 }
1426 return page; 1426 return page;
1427 } 1427 }
1428 1428
1429 /* 1429 /*
1430 * This is the 'heart' of the zoned buddy allocator. 1430 * This is the 'heart' of the zoned buddy allocator.
1431 */ 1431 */
1432 static struct page * 1432 static struct page *
1433 __alloc_pages_internal(gfp_t gfp_mask, unsigned int order, 1433 __alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
1434 struct zonelist *zonelist, nodemask_t *nodemask) 1434 struct zonelist *zonelist, nodemask_t *nodemask)
1435 { 1435 {
1436 const gfp_t wait = gfp_mask & __GFP_WAIT; 1436 const gfp_t wait = gfp_mask & __GFP_WAIT;
1437 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 1437 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1438 struct zoneref *z; 1438 struct zoneref *z;
1439 struct zone *zone; 1439 struct zone *zone;
1440 struct page *page; 1440 struct page *page;
1441 struct reclaim_state reclaim_state; 1441 struct reclaim_state reclaim_state;
1442 struct task_struct *p = current; 1442 struct task_struct *p = current;
1443 int do_retry; 1443 int do_retry;
1444 int alloc_flags; 1444 int alloc_flags;
1445 unsigned long did_some_progress; 1445 unsigned long did_some_progress;
1446 unsigned long pages_reclaimed = 0; 1446 unsigned long pages_reclaimed = 0;
1447 1447
1448 might_sleep_if(wait); 1448 might_sleep_if(wait);
1449 1449
1450 if (should_fail_alloc_page(gfp_mask, order)) 1450 if (should_fail_alloc_page(gfp_mask, order))
1451 return NULL; 1451 return NULL;
1452 1452
1453 restart: 1453 restart:
1454 z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */ 1454 z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */
1455 1455
1456 if (unlikely(!z->zone)) { 1456 if (unlikely(!z->zone)) {
1457 /* 1457 /*
1458 * Happens if we have an empty zonelist as a result of 1458 * Happens if we have an empty zonelist as a result of
1459 * GFP_THISNODE being used on a memoryless node 1459 * GFP_THISNODE being used on a memoryless node
1460 */ 1460 */
1461 return NULL; 1461 return NULL;
1462 } 1462 }
1463 1463
1464 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 1464 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
1465 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); 1465 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
1466 if (page) 1466 if (page)
1467 goto got_pg; 1467 goto got_pg;
1468 1468
1469 /* 1469 /*
1470 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 1470 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
1471 * __GFP_NOWARN set) should not cause reclaim since the subsystem 1471 * __GFP_NOWARN set) should not cause reclaim since the subsystem
1472 * (e.g. slab) using GFP_THISNODE may choose to trigger reclaim 1472 * (e.g. slab) using GFP_THISNODE may choose to trigger reclaim
1473 * using a larger set of nodes after it has established that the 1473 * using a larger set of nodes after it has established that the
1474 * allowed per node queues are empty and that nodes are 1474 * allowed per node queues are empty and that nodes are
1475 * over allocated. 1475 * over allocated.
1476 */ 1476 */
1477 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 1477 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
1478 goto nopage; 1478 goto nopage;
1479 1479
1480 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 1480 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
1481 wakeup_kswapd(zone, order); 1481 wakeup_kswapd(zone, order);
1482 1482
1483 /* 1483 /*
1484 * OK, we're below the kswapd watermark and have kicked background 1484 * OK, we're below the kswapd watermark and have kicked background
1485 * reclaim. Now things get more complex, so set up alloc_flags according 1485 * reclaim. Now things get more complex, so set up alloc_flags according
1486 * to how we want to proceed. 1486 * to how we want to proceed.
1487 * 1487 *
1488 * The caller may dip into page reserves a bit more if the caller 1488 * The caller may dip into page reserves a bit more if the caller
1489 * cannot run direct reclaim, or if the caller has realtime scheduling 1489 * cannot run direct reclaim, or if the caller has realtime scheduling
1490 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 1490 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1491 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). 1491 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1492 */ 1492 */
1493 alloc_flags = ALLOC_WMARK_MIN; 1493 alloc_flags = ALLOC_WMARK_MIN;
1494 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) 1494 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
1495 alloc_flags |= ALLOC_HARDER; 1495 alloc_flags |= ALLOC_HARDER;
1496 if (gfp_mask & __GFP_HIGH) 1496 if (gfp_mask & __GFP_HIGH)
1497 alloc_flags |= ALLOC_HIGH; 1497 alloc_flags |= ALLOC_HIGH;
1498 if (wait) 1498 if (wait)
1499 alloc_flags |= ALLOC_CPUSET; 1499 alloc_flags |= ALLOC_CPUSET;
1500 1500
1501 /* 1501 /*
1502 * Go through the zonelist again. Let __GFP_HIGH and allocations 1502 * Go through the zonelist again. Let __GFP_HIGH and allocations
1503 * coming from realtime tasks go deeper into reserves. 1503 * coming from realtime tasks go deeper into reserves.
1504 * 1504 *
1505 * This is the last chance, in general, before the goto nopage. 1505 * This is the last chance, in general, before the goto nopage.
1506 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 1506 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1507 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1507 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1508 */ 1508 */
1509 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 1509 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
1510 high_zoneidx, alloc_flags); 1510 high_zoneidx, alloc_flags);
1511 if (page) 1511 if (page)
1512 goto got_pg; 1512 goto got_pg;
1513 1513
1514 /* This allocation should allow future memory freeing. */ 1514 /* This allocation should allow future memory freeing. */
1515 1515
1516 rebalance: 1516 rebalance:
1517 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) 1517 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
1518 && !in_interrupt()) { 1518 && !in_interrupt()) {
1519 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 1519 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
1520 nofail_alloc: 1520 nofail_alloc:
1521 /* go through the zonelist yet again, ignoring mins */ 1521 /* go through the zonelist yet again, ignoring mins */
1522 page = get_page_from_freelist(gfp_mask, nodemask, order, 1522 page = get_page_from_freelist(gfp_mask, nodemask, order,
1523 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); 1523 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
1524 if (page) 1524 if (page)
1525 goto got_pg; 1525 goto got_pg;
1526 if (gfp_mask & __GFP_NOFAIL) { 1526 if (gfp_mask & __GFP_NOFAIL) {
1527 congestion_wait(WRITE, HZ/50); 1527 congestion_wait(WRITE, HZ/50);
1528 goto nofail_alloc; 1528 goto nofail_alloc;
1529 } 1529 }
1530 } 1530 }
1531 goto nopage; 1531 goto nopage;
1532 } 1532 }
1533 1533
1534 /* Atomic allocations - we can't balance anything */ 1534 /* Atomic allocations - we can't balance anything */
1535 if (!wait) 1535 if (!wait)
1536 goto nopage; 1536 goto nopage;
1537 1537
1538 cond_resched(); 1538 cond_resched();
1539 1539
1540 /* We now go into synchronous reclaim */ 1540 /* We now go into synchronous reclaim */
1541 cpuset_memory_pressure_bump(); 1541 cpuset_memory_pressure_bump();
1542 p->flags |= PF_MEMALLOC; 1542 p->flags |= PF_MEMALLOC;
1543 reclaim_state.reclaimed_slab = 0; 1543 reclaim_state.reclaimed_slab = 0;
1544 p->reclaim_state = &reclaim_state; 1544 p->reclaim_state = &reclaim_state;
1545 1545
1546 did_some_progress = try_to_free_pages(zonelist, order, gfp_mask); 1546 did_some_progress = try_to_free_pages(zonelist, order, gfp_mask);
1547 1547
1548 p->reclaim_state = NULL; 1548 p->reclaim_state = NULL;
1549 p->flags &= ~PF_MEMALLOC; 1549 p->flags &= ~PF_MEMALLOC;
1550 1550
1551 cond_resched(); 1551 cond_resched();
1552 1552
1553 if (order != 0) 1553 if (order != 0)
1554 drain_all_pages(); 1554 drain_all_pages();
1555 1555
1556 if (likely(did_some_progress)) { 1556 if (likely(did_some_progress)) {
1557 page = get_page_from_freelist(gfp_mask, nodemask, order, 1557 page = get_page_from_freelist(gfp_mask, nodemask, order,
1558 zonelist, high_zoneidx, alloc_flags); 1558 zonelist, high_zoneidx, alloc_flags);
1559 if (page) 1559 if (page)
1560 goto got_pg; 1560 goto got_pg;
1561 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 1561 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1562 if (!try_set_zone_oom(zonelist, gfp_mask)) { 1562 if (!try_set_zone_oom(zonelist, gfp_mask)) {
1563 schedule_timeout_uninterruptible(1); 1563 schedule_timeout_uninterruptible(1);
1564 goto restart; 1564 goto restart;
1565 } 1565 }
1566 1566
1567 /* 1567 /*
1568 * Go through the zonelist yet one more time, keep 1568 * Go through the zonelist yet one more time, keep
1569 * very high watermark here, this is only to catch 1569 * very high watermark here, this is only to catch
1570 * a parallel oom killing, we must fail if we're still 1570 * a parallel oom killing, we must fail if we're still
1571 * under heavy pressure. 1571 * under heavy pressure.
1572 */ 1572 */
1573 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, 1573 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
1574 order, zonelist, high_zoneidx, 1574 order, zonelist, high_zoneidx,
1575 ALLOC_WMARK_HIGH|ALLOC_CPUSET); 1575 ALLOC_WMARK_HIGH|ALLOC_CPUSET);
1576 if (page) { 1576 if (page) {
1577 clear_zonelist_oom(zonelist, gfp_mask); 1577 clear_zonelist_oom(zonelist, gfp_mask);
1578 goto got_pg; 1578 goto got_pg;
1579 } 1579 }
1580 1580
1581 /* The OOM killer will not help higher order allocs so fail */ 1581 /* The OOM killer will not help higher order allocs so fail */
1582 if (order > PAGE_ALLOC_COSTLY_ORDER) { 1582 if (order > PAGE_ALLOC_COSTLY_ORDER) {
1583 clear_zonelist_oom(zonelist, gfp_mask); 1583 clear_zonelist_oom(zonelist, gfp_mask);
1584 goto nopage; 1584 goto nopage;
1585 } 1585 }
1586 1586
1587 out_of_memory(zonelist, gfp_mask, order); 1587 out_of_memory(zonelist, gfp_mask, order);
1588 clear_zonelist_oom(zonelist, gfp_mask); 1588 clear_zonelist_oom(zonelist, gfp_mask);
1589 goto restart; 1589 goto restart;
1590 } 1590 }
1591 1591
1592 /* 1592 /*
1593 * Don't let big-order allocations loop unless the caller explicitly 1593 * Don't let big-order allocations loop unless the caller explicitly
1594 * requests that. Wait for some write requests to complete then retry. 1594 * requests that. Wait for some write requests to complete then retry.
1595 * 1595 *
1596 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER 1596 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
1597 * means __GFP_NOFAIL, but that may not be true in other 1597 * means __GFP_NOFAIL, but that may not be true in other
1598 * implementations. 1598 * implementations.
1599 * 1599 *
1600 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is 1600 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
1601 * specified, then we retry until we no longer reclaim any pages 1601 * specified, then we retry until we no longer reclaim any pages
1602 * (above), or we've reclaimed an order of pages at least as 1602 * (above), or we've reclaimed an order of pages at least as
1603 * large as the allocation's order. In both cases, if the 1603 * large as the allocation's order. In both cases, if the
1604 * allocation still fails, we stop retrying. 1604 * allocation still fails, we stop retrying.
1605 */ 1605 */
1606 pages_reclaimed += did_some_progress; 1606 pages_reclaimed += did_some_progress;
1607 do_retry = 0; 1607 do_retry = 0;
1608 if (!(gfp_mask & __GFP_NORETRY)) { 1608 if (!(gfp_mask & __GFP_NORETRY)) {
1609 if (order <= PAGE_ALLOC_COSTLY_ORDER) { 1609 if (order <= PAGE_ALLOC_COSTLY_ORDER) {
1610 do_retry = 1; 1610 do_retry = 1;
1611 } else { 1611 } else {
1612 if (gfp_mask & __GFP_REPEAT && 1612 if (gfp_mask & __GFP_REPEAT &&
1613 pages_reclaimed < (1 << order)) 1613 pages_reclaimed < (1 << order))
1614 do_retry = 1; 1614 do_retry = 1;
1615 } 1615 }
1616 if (gfp_mask & __GFP_NOFAIL) 1616 if (gfp_mask & __GFP_NOFAIL)
1617 do_retry = 1; 1617 do_retry = 1;
1618 } 1618 }
1619 if (do_retry) { 1619 if (do_retry) {
1620 congestion_wait(WRITE, HZ/50); 1620 congestion_wait(WRITE, HZ/50);
1621 goto rebalance; 1621 goto rebalance;
1622 } 1622 }
1623 1623
1624 nopage: 1624 nopage:
1625 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { 1625 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
1626 printk(KERN_WARNING "%s: page allocation failure." 1626 printk(KERN_WARNING "%s: page allocation failure."
1627 " order:%d, mode:0x%x\n", 1627 " order:%d, mode:0x%x\n",
1628 p->comm, order, gfp_mask); 1628 p->comm, order, gfp_mask);
1629 dump_stack(); 1629 dump_stack();
1630 show_mem(); 1630 show_mem();
1631 } 1631 }
1632 got_pg: 1632 got_pg:
1633 return page; 1633 return page;
1634 } 1634 }
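The retry policy in the function above can be made concrete with two cases (orders chosen purely for illustration, assuming PAGE_ALLOC_COSTLY_ORDER is 3, its value in this tree): an order-2 GFP_KERNEL allocation without __GFP_NORETRY keeps looping back to rebalance, pausing in congestion_wait() between passes, because low orders are treated as implicitly __GFP_NOFAIL; an order-4 allocation only keeps retrying if __GFP_REPEAT is set and fewer than 1 << 4 = 16 pages have been reclaimed so far, after which it falls through to nopage and returns NULL.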
1635 1635
1636 struct page * 1636 struct page *
1637 __alloc_pages(gfp_t gfp_mask, unsigned int order, 1637 __alloc_pages(gfp_t gfp_mask, unsigned int order,
1638 struct zonelist *zonelist) 1638 struct zonelist *zonelist)
1639 { 1639 {
1640 return __alloc_pages_internal(gfp_mask, order, zonelist, NULL); 1640 return __alloc_pages_internal(gfp_mask, order, zonelist, NULL);
1641 } 1641 }
1642 1642
1643 struct page * 1643 struct page *
1644 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, 1644 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1645 struct zonelist *zonelist, nodemask_t *nodemask) 1645 struct zonelist *zonelist, nodemask_t *nodemask)
1646 { 1646 {
1647 return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask); 1647 return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask);
1648 } 1648 }
1649 1649
1650 EXPORT_SYMBOL(__alloc_pages); 1650 EXPORT_SYMBOL(__alloc_pages);
1651 1651
1652 /* 1652 /*
1653 * Common helper functions. 1653 * Common helper functions.
1654 */ 1654 */
1655 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 1655 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
1656 { 1656 {
1657 struct page * page; 1657 struct page * page;
1658 page = alloc_pages(gfp_mask, order); 1658 page = alloc_pages(gfp_mask, order);
1659 if (!page) 1659 if (!page)
1660 return 0; 1660 return 0;
1661 return (unsigned long) page_address(page); 1661 return (unsigned long) page_address(page);
1662 } 1662 }
1663 1663
1664 EXPORT_SYMBOL(__get_free_pages); 1664 EXPORT_SYMBOL(__get_free_pages);
1665 1665
1666 unsigned long get_zeroed_page(gfp_t gfp_mask) 1666 unsigned long get_zeroed_page(gfp_t gfp_mask)
1667 { 1667 {
1668 struct page * page; 1668 struct page * page;
1669 1669
1670 /* 1670 /*
1671 * get_zeroed_page() returns a 32-bit address, which cannot represent 1671 * get_zeroed_page() returns a 32-bit address, which cannot represent
1672 * a highmem page 1672 * a highmem page
1673 */ 1673 */
1674 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 1674 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1675 1675
1676 page = alloc_pages(gfp_mask | __GFP_ZERO, 0); 1676 page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
1677 if (page) 1677 if (page)
1678 return (unsigned long) page_address(page); 1678 return (unsigned long) page_address(page);
1679 return 0; 1679 return 0;
1680 } 1680 }
1681 1681
1682 EXPORT_SYMBOL(get_zeroed_page); 1682 EXPORT_SYMBOL(get_zeroed_page);
1683 1683
1684 void __pagevec_free(struct pagevec *pvec) 1684 void __pagevec_free(struct pagevec *pvec)
1685 { 1685 {
1686 int i = pagevec_count(pvec); 1686 int i = pagevec_count(pvec);
1687 1687
1688 while (--i >= 0) 1688 while (--i >= 0)
1689 free_hot_cold_page(pvec->pages[i], pvec->cold); 1689 free_hot_cold_page(pvec->pages[i], pvec->cold);
1690 } 1690 }
1691 1691
1692 void __free_pages(struct page *page, unsigned int order) 1692 void __free_pages(struct page *page, unsigned int order)
1693 { 1693 {
1694 if (put_page_testzero(page)) { 1694 if (put_page_testzero(page)) {
1695 if (order == 0) 1695 if (order == 0)
1696 free_hot_page(page); 1696 free_hot_page(page);
1697 else 1697 else
1698 __free_pages_ok(page, order); 1698 __free_pages_ok(page, order);
1699 } 1699 }
1700 } 1700 }
1701 1701
1702 EXPORT_SYMBOL(__free_pages); 1702 EXPORT_SYMBOL(__free_pages);
1703 1703
1704 void free_pages(unsigned long addr, unsigned int order) 1704 void free_pages(unsigned long addr, unsigned int order)
1705 { 1705 {
1706 if (addr != 0) { 1706 if (addr != 0) {
1707 VM_BUG_ON(!virt_addr_valid((void *)addr)); 1707 VM_BUG_ON(!virt_addr_valid((void *)addr));
1708 __free_pages(virt_to_page((void *)addr), order); 1708 __free_pages(virt_to_page((void *)addr), order);
1709 } 1709 }
1710 } 1710 }
1711 1711
1712 EXPORT_SYMBOL(free_pages); 1712 EXPORT_SYMBOL(free_pages);
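Together, __get_free_pages()/get_zeroed_page() and free_pages()/free_page() give callers page-sized memory by kernel virtual address rather than by struct page. A minimal usage sketch follows; example_page_alloc() is a hypothetical name and not part of this file:

#include <linux/gfp.h>
#include <linux/errno.h>

/* Illustrative only: grab an order-2 block (4 contiguous pages) and one
 * zeroed page, then release both. Assumes a sleepable GFP_KERNEL context. */
static int example_page_alloc(void)
{
	unsigned long buf = __get_free_pages(GFP_KERNEL, 2);
	unsigned long zero = get_zeroed_page(GFP_KERNEL);

	if (!buf || !zero) {
		/* free_pages()/free_page() silently ignore a 0 address */
		free_pages(buf, 2);
		free_page(zero);
		return -ENOMEM;
	}

	/* ... use the memory ... */

	free_pages(buf, 2);
	free_page(zero);
	return 0;
}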
1713 1713
1714 static unsigned int nr_free_zone_pages(int offset) 1714 static unsigned int nr_free_zone_pages(int offset)
1715 { 1715 {
1716 struct zoneref *z; 1716 struct zoneref *z;
1717 struct zone *zone; 1717 struct zone *zone;
1718 1718
1719 /* Just pick one node, since fallback list is circular */ 1719 /* Just pick one node, since fallback list is circular */
1720 unsigned int sum = 0; 1720 unsigned int sum = 0;
1721 1721
1722 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 1722 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
1723 1723
1724 for_each_zone_zonelist(zone, z, zonelist, offset) { 1724 for_each_zone_zonelist(zone, z, zonelist, offset) {
1725 unsigned long size = zone->present_pages; 1725 unsigned long size = zone->present_pages;
1726 unsigned long high = zone->pages_high; 1726 unsigned long high = zone->pages_high;
1727 if (size > high) 1727 if (size > high)
1728 sum += size - high; 1728 sum += size - high;
1729 } 1729 }
1730 1730
1731 return sum; 1731 return sum;
1732 } 1732 }
1733 1733
1734 /* 1734 /*
1735 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL 1735 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
1736 */ 1736 */
1737 unsigned int nr_free_buffer_pages(void) 1737 unsigned int nr_free_buffer_pages(void)
1738 { 1738 {
1739 return nr_free_zone_pages(gfp_zone(GFP_USER)); 1739 return nr_free_zone_pages(gfp_zone(GFP_USER));
1740 } 1740 }
1741 EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 1741 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
1742 1742
1743 /* 1743 /*
1744 * Amount of free RAM allocatable within all zones 1744 * Amount of free RAM allocatable within all zones
1745 */ 1745 */
1746 unsigned int nr_free_pagecache_pages(void) 1746 unsigned int nr_free_pagecache_pages(void)
1747 { 1747 {
1748 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 1748 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
1749 } 1749 }
1750 1750
1751 static inline void show_node(struct zone *zone) 1751 static inline void show_node(struct zone *zone)
1752 { 1752 {
1753 if (NUMA_BUILD) 1753 if (NUMA_BUILD)
1754 printk("Node %d ", zone_to_nid(zone)); 1754 printk("Node %d ", zone_to_nid(zone));
1755 } 1755 }
1756 1756
1757 void si_meminfo(struct sysinfo *val) 1757 void si_meminfo(struct sysinfo *val)
1758 { 1758 {
1759 val->totalram = totalram_pages; 1759 val->totalram = totalram_pages;
1760 val->sharedram = 0; 1760 val->sharedram = 0;
1761 val->freeram = global_page_state(NR_FREE_PAGES); 1761 val->freeram = global_page_state(NR_FREE_PAGES);
1762 val->bufferram = nr_blockdev_pages(); 1762 val->bufferram = nr_blockdev_pages();
1763 val->totalhigh = totalhigh_pages; 1763 val->totalhigh = totalhigh_pages;
1764 val->freehigh = nr_free_highpages(); 1764 val->freehigh = nr_free_highpages();
1765 val->mem_unit = PAGE_SIZE; 1765 val->mem_unit = PAGE_SIZE;
1766 } 1766 }
1767 1767
1768 EXPORT_SYMBOL(si_meminfo); 1768 EXPORT_SYMBOL(si_meminfo);
1769 1769
1770 #ifdef CONFIG_NUMA 1770 #ifdef CONFIG_NUMA
1771 void si_meminfo_node(struct sysinfo *val, int nid) 1771 void si_meminfo_node(struct sysinfo *val, int nid)
1772 { 1772 {
1773 pg_data_t *pgdat = NODE_DATA(nid); 1773 pg_data_t *pgdat = NODE_DATA(nid);
1774 1774
1775 val->totalram = pgdat->node_present_pages; 1775 val->totalram = pgdat->node_present_pages;
1776 val->freeram = node_page_state(nid, NR_FREE_PAGES); 1776 val->freeram = node_page_state(nid, NR_FREE_PAGES);
1777 #ifdef CONFIG_HIGHMEM 1777 #ifdef CONFIG_HIGHMEM
1778 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; 1778 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
1779 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], 1779 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
1780 NR_FREE_PAGES); 1780 NR_FREE_PAGES);
1781 #else 1781 #else
1782 val->totalhigh = 0; 1782 val->totalhigh = 0;
1783 val->freehigh = 0; 1783 val->freehigh = 0;
1784 #endif 1784 #endif
1785 val->mem_unit = PAGE_SIZE; 1785 val->mem_unit = PAGE_SIZE;
1786 } 1786 }
1787 #endif 1787 #endif
1788 1788
1789 #define K(x) ((x) << (PAGE_SHIFT-10)) 1789 #define K(x) ((x) << (PAGE_SHIFT-10))
1790 1790
1791 /* 1791 /*
1792 * Show free area list (used inside shift_scroll-lock stuff) 1792 * Show free area list (used inside shift_scroll-lock stuff)
1793 * We also calculate the percentage fragmentation. We do this by counting the 1793 * We also calculate the percentage fragmentation. We do this by counting the
1794 * memory on each free list with the exception of the first item on the list. 1794 * memory on each free list with the exception of the first item on the list.
1795 */ 1795 */
1796 void show_free_areas(void) 1796 void show_free_areas(void)
1797 { 1797 {
1798 int cpu; 1798 int cpu;
1799 struct zone *zone; 1799 struct zone *zone;
1800 1800
1801 for_each_zone(zone) { 1801 for_each_zone(zone) {
1802 if (!populated_zone(zone)) 1802 if (!populated_zone(zone))
1803 continue; 1803 continue;
1804 1804
1805 show_node(zone); 1805 show_node(zone);
1806 printk("%s per-cpu:\n", zone->name); 1806 printk("%s per-cpu:\n", zone->name);
1807 1807
1808 for_each_online_cpu(cpu) { 1808 for_each_online_cpu(cpu) {
1809 struct per_cpu_pageset *pageset; 1809 struct per_cpu_pageset *pageset;
1810 1810
1811 pageset = zone_pcp(zone, cpu); 1811 pageset = zone_pcp(zone, cpu);
1812 1812
1813 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", 1813 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
1814 cpu, pageset->pcp.high, 1814 cpu, pageset->pcp.high,
1815 pageset->pcp.batch, pageset->pcp.count); 1815 pageset->pcp.batch, pageset->pcp.count);
1816 } 1816 }
1817 } 1817 }
1818 1818
1819 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n" 1819 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n"
1820 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", 1820 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
1821 global_page_state(NR_ACTIVE), 1821 global_page_state(NR_ACTIVE),
1822 global_page_state(NR_INACTIVE), 1822 global_page_state(NR_INACTIVE),
1823 global_page_state(NR_FILE_DIRTY), 1823 global_page_state(NR_FILE_DIRTY),
1824 global_page_state(NR_WRITEBACK), 1824 global_page_state(NR_WRITEBACK),
1825 global_page_state(NR_UNSTABLE_NFS), 1825 global_page_state(NR_UNSTABLE_NFS),
1826 global_page_state(NR_FREE_PAGES), 1826 global_page_state(NR_FREE_PAGES),
1827 global_page_state(NR_SLAB_RECLAIMABLE) + 1827 global_page_state(NR_SLAB_RECLAIMABLE) +
1828 global_page_state(NR_SLAB_UNRECLAIMABLE), 1828 global_page_state(NR_SLAB_UNRECLAIMABLE),
1829 global_page_state(NR_FILE_MAPPED), 1829 global_page_state(NR_FILE_MAPPED),
1830 global_page_state(NR_PAGETABLE), 1830 global_page_state(NR_PAGETABLE),
1831 global_page_state(NR_BOUNCE)); 1831 global_page_state(NR_BOUNCE));
1832 1832
1833 for_each_zone(zone) { 1833 for_each_zone(zone) {
1834 int i; 1834 int i;
1835 1835
1836 if (!populated_zone(zone)) 1836 if (!populated_zone(zone))
1837 continue; 1837 continue;
1838 1838
1839 show_node(zone); 1839 show_node(zone);
1840 printk("%s" 1840 printk("%s"
1841 " free:%lukB" 1841 " free:%lukB"
1842 " min:%lukB" 1842 " min:%lukB"
1843 " low:%lukB" 1843 " low:%lukB"
1844 " high:%lukB" 1844 " high:%lukB"
1845 " active:%lukB" 1845 " active:%lukB"
1846 " inactive:%lukB" 1846 " inactive:%lukB"
1847 " present:%lukB" 1847 " present:%lukB"
1848 " pages_scanned:%lu" 1848 " pages_scanned:%lu"
1849 " all_unreclaimable? %s" 1849 " all_unreclaimable? %s"
1850 "\n", 1850 "\n",
1851 zone->name, 1851 zone->name,
1852 K(zone_page_state(zone, NR_FREE_PAGES)), 1852 K(zone_page_state(zone, NR_FREE_PAGES)),
1853 K(zone->pages_min), 1853 K(zone->pages_min),
1854 K(zone->pages_low), 1854 K(zone->pages_low),
1855 K(zone->pages_high), 1855 K(zone->pages_high),
1856 K(zone_page_state(zone, NR_ACTIVE)), 1856 K(zone_page_state(zone, NR_ACTIVE)),
1857 K(zone_page_state(zone, NR_INACTIVE)), 1857 K(zone_page_state(zone, NR_INACTIVE)),
1858 K(zone->present_pages), 1858 K(zone->present_pages),
1859 zone->pages_scanned, 1859 zone->pages_scanned,
1860 (zone_is_all_unreclaimable(zone) ? "yes" : "no") 1860 (zone_is_all_unreclaimable(zone) ? "yes" : "no")
1861 ); 1861 );
1862 printk("lowmem_reserve[]:"); 1862 printk("lowmem_reserve[]:");
1863 for (i = 0; i < MAX_NR_ZONES; i++) 1863 for (i = 0; i < MAX_NR_ZONES; i++)
1864 printk(" %lu", zone->lowmem_reserve[i]); 1864 printk(" %lu", zone->lowmem_reserve[i]);
1865 printk("\n"); 1865 printk("\n");
1866 } 1866 }
1867 1867
1868 for_each_zone(zone) { 1868 for_each_zone(zone) {
1869 unsigned long nr[MAX_ORDER], flags, order, total = 0; 1869 unsigned long nr[MAX_ORDER], flags, order, total = 0;
1870 1870
1871 if (!populated_zone(zone)) 1871 if (!populated_zone(zone))
1872 continue; 1872 continue;
1873 1873
1874 show_node(zone); 1874 show_node(zone);
1875 printk("%s: ", zone->name); 1875 printk("%s: ", zone->name);
1876 1876
1877 spin_lock_irqsave(&zone->lock, flags); 1877 spin_lock_irqsave(&zone->lock, flags);
1878 for (order = 0; order < MAX_ORDER; order++) { 1878 for (order = 0; order < MAX_ORDER; order++) {
1879 nr[order] = zone->free_area[order].nr_free; 1879 nr[order] = zone->free_area[order].nr_free;
1880 total += nr[order] << order; 1880 total += nr[order] << order;
1881 } 1881 }
1882 spin_unlock_irqrestore(&zone->lock, flags); 1882 spin_unlock_irqrestore(&zone->lock, flags);
1883 for (order = 0; order < MAX_ORDER; order++) 1883 for (order = 0; order < MAX_ORDER; order++)
1884 printk("%lu*%lukB ", nr[order], K(1UL) << order); 1884 printk("%lu*%lukB ", nr[order], K(1UL) << order);
1885 printk("= %lukB\n", K(total)); 1885 printk("= %lukB\n", K(total));
1886 } 1886 }
1887 1887
1888 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); 1888 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
1889 1889
1890 show_swap_cache_info(); 1890 show_swap_cache_info();
1891 } 1891 }
1892 1892
1893 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 1893 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
1894 { 1894 {
1895 zoneref->zone = zone; 1895 zoneref->zone = zone;
1896 zoneref->zone_idx = zone_idx(zone); 1896 zoneref->zone_idx = zone_idx(zone);
1897 } 1897 }
1898 1898
1899 /* 1899 /*
1900 * Builds allocation fallback zone lists. 1900 * Builds allocation fallback zone lists.
1901 * 1901 *
1902 * Add all populated zones of a node to the zonelist. 1902 * Add all populated zones of a node to the zonelist.
1903 */ 1903 */
1904 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, 1904 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
1905 int nr_zones, enum zone_type zone_type) 1905 int nr_zones, enum zone_type zone_type)
1906 { 1906 {
1907 struct zone *zone; 1907 struct zone *zone;
1908 1908
1909 BUG_ON(zone_type >= MAX_NR_ZONES); 1909 BUG_ON(zone_type >= MAX_NR_ZONES);
1910 zone_type++; 1910 zone_type++;
1911 1911
1912 do { 1912 do {
1913 zone_type--; 1913 zone_type--;
1914 zone = pgdat->node_zones + zone_type; 1914 zone = pgdat->node_zones + zone_type;
1915 if (populated_zone(zone)) { 1915 if (populated_zone(zone)) {
1916 zoneref_set_zone(zone, 1916 zoneref_set_zone(zone,
1917 &zonelist->_zonerefs[nr_zones++]); 1917 &zonelist->_zonerefs[nr_zones++]);
1918 check_highest_zone(zone_type); 1918 check_highest_zone(zone_type);
1919 } 1919 }
1920 1920
1921 } while (zone_type); 1921 } while (zone_type);
1922 return nr_zones; 1922 return nr_zones;
1923 } 1923 }
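build_zonelists_node() appends every populated zone of one node to the flat _zonerefs[] array, highest zone type first, and returns the index of the next free slot. Consumers never index zones directly; they walk the array with the zoneref iterators, as nr_free_zone_pages() does above. A minimal sketch of such a walk; count_zonelist_pages() is a hypothetical helper, not part of this file:

#include <linux/mmzone.h>

/* Illustrative only: sum present pages over every zone reachable from a
 * zonelist at or below high_zoneidx, in fallback order. */
static unsigned long count_zonelist_pages(struct zonelist *zonelist,
					  enum zone_type high_zoneidx)
{
	struct zoneref *z;
	struct zone *zone;
	unsigned long pages = 0;

	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
		pages += zone->present_pages;

	return pages;
}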
1924 1924
1925 1925
1926 /* 1926 /*
1927 * zonelist_order: 1927 * zonelist_order:
1928 * 0 = automatic detection of better ordering. 1928 * 0 = automatic detection of better ordering.
1929 * 1 = order by ([node] distance, -zonetype) 1929 * 1 = order by ([node] distance, -zonetype)
1930 * 2 = order by (-zonetype, [node] distance) 1930 * 2 = order by (-zonetype, [node] distance)
1931 * 1931 *
1932 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create 1932 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
1933 * the same zonelist. So only NUMA can configure this param. 1933 * the same zonelist. So only NUMA can configure this param.
1934 */ 1934 */
1935 #define ZONELIST_ORDER_DEFAULT 0 1935 #define ZONELIST_ORDER_DEFAULT 0
1936 #define ZONELIST_ORDER_NODE 1 1936 #define ZONELIST_ORDER_NODE 1
1937 #define ZONELIST_ORDER_ZONE 2 1937 #define ZONELIST_ORDER_ZONE 2
1938 1938
1939 /* zonelist order in the kernel. 1939 /* zonelist order in the kernel.
1940 * set_zonelist_order() will set this to NODE or ZONE. 1940 * set_zonelist_order() will set this to NODE or ZONE.
1941 */ 1941 */
1942 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; 1942 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
1943 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; 1943 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
1944 1944
1945 1945
1946 #ifdef CONFIG_NUMA 1946 #ifdef CONFIG_NUMA
1947 /* The value the user specified; may be changed by config */ 1947 /* The value the user specified; may be changed by config */
1948 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; 1948 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
1949 /* string for sysctl */ 1949 /* string for sysctl */
1950 #define NUMA_ZONELIST_ORDER_LEN 16 1950 #define NUMA_ZONELIST_ORDER_LEN 16
1951 char numa_zonelist_order[16] = "default"; 1951 char numa_zonelist_order[16] = "default";
1952 1952
1953 /* 1953 /*
1954 * interface to configure zonelist ordering. 1954 * interface to configure zonelist ordering.
1955 * command line option "numa_zonelist_order" 1955 * command line option "numa_zonelist_order"
1956 * = "[dD]efault" - default, automatic configuration. 1956 * = "[dD]efault" - default, automatic configuration.
1957 * = "[nN]ode" - order by node locality, then by zone within node 1957 * = "[nN]ode" - order by node locality, then by zone within node
1958 * = "[zZ]one" - order by zone, then by locality within zone 1958 * = "[zZ]one" - order by zone, then by locality within zone
1959 */ 1959 */
1960 1960
1961 static int __parse_numa_zonelist_order(char *s) 1961 static int __parse_numa_zonelist_order(char *s)
1962 { 1962 {
1963 if (*s == 'd' || *s == 'D') { 1963 if (*s == 'd' || *s == 'D') {
1964 user_zonelist_order = ZONELIST_ORDER_DEFAULT; 1964 user_zonelist_order = ZONELIST_ORDER_DEFAULT;
1965 } else if (*s == 'n' || *s == 'N') { 1965 } else if (*s == 'n' || *s == 'N') {
1966 user_zonelist_order = ZONELIST_ORDER_NODE; 1966 user_zonelist_order = ZONELIST_ORDER_NODE;
1967 } else if (*s == 'z' || *s == 'Z') { 1967 } else if (*s == 'z' || *s == 'Z') {
1968 user_zonelist_order = ZONELIST_ORDER_ZONE; 1968 user_zonelist_order = ZONELIST_ORDER_ZONE;
1969 } else { 1969 } else {
1970 printk(KERN_WARNING 1970 printk(KERN_WARNING
1971 "Ignoring invalid numa_zonelist_order value: " 1971 "Ignoring invalid numa_zonelist_order value: "
1972 "%s\n", s); 1972 "%s\n", s);
1973 return -EINVAL; 1973 return -EINVAL;
1974 } 1974 }
1975 return 0; 1975 return 0;
1976 } 1976 }
1977 1977
1978 static __init int setup_numa_zonelist_order(char *s) 1978 static __init int setup_numa_zonelist_order(char *s)
1979 { 1979 {
1980 if (s) 1980 if (s)
1981 return __parse_numa_zonelist_order(s); 1981 return __parse_numa_zonelist_order(s);
1982 return 0; 1982 return 0;
1983 } 1983 }
1984 early_param("numa_zonelist_order", setup_numa_zonelist_order); 1984 early_param("numa_zonelist_order", setup_numa_zonelist_order);
1985 1985
1986 /* 1986 /*
1987 * sysctl handler for numa_zonelist_order 1987 * sysctl handler for numa_zonelist_order
1988 */ 1988 */
1989 int numa_zonelist_order_handler(ctl_table *table, int write, 1989 int numa_zonelist_order_handler(ctl_table *table, int write,
1990 struct file *file, void __user *buffer, size_t *length, 1990 struct file *file, void __user *buffer, size_t *length,
1991 loff_t *ppos) 1991 loff_t *ppos)
1992 { 1992 {
1993 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 1993 char saved_string[NUMA_ZONELIST_ORDER_LEN];
1994 int ret; 1994 int ret;
1995 1995
1996 if (write) 1996 if (write)
1997 strncpy(saved_string, (char*)table->data, 1997 strncpy(saved_string, (char*)table->data,
1998 NUMA_ZONELIST_ORDER_LEN); 1998 NUMA_ZONELIST_ORDER_LEN);
1999 ret = proc_dostring(table, write, file, buffer, length, ppos); 1999 ret = proc_dostring(table, write, file, buffer, length, ppos);
2000 if (ret) 2000 if (ret)
2001 return ret; 2001 return ret;
2002 if (write) { 2002 if (write) {
2003 int oldval = user_zonelist_order; 2003 int oldval = user_zonelist_order;
2004 if (__parse_numa_zonelist_order((char*)table->data)) { 2004 if (__parse_numa_zonelist_order((char*)table->data)) {
2005 /* 2005 /*
2006 * bogus value. restore saved string 2006 * bogus value. restore saved string
2007 */ 2007 */
2008 strncpy((char*)table->data, saved_string, 2008 strncpy((char*)table->data, saved_string,
2009 NUMA_ZONELIST_ORDER_LEN); 2009 NUMA_ZONELIST_ORDER_LEN);
2010 user_zonelist_order = oldval; 2010 user_zonelist_order = oldval;
2011 } else if (oldval != user_zonelist_order) 2011 } else if (oldval != user_zonelist_order)
2012 build_all_zonelists(); 2012 build_all_zonelists();
2013 } 2013 }
2014 return 0; 2014 return 0;
2015 } 2015 }
2016 2016
2017 2017
2018 #define MAX_NODE_LOAD (num_online_nodes()) 2018 #define MAX_NODE_LOAD (num_online_nodes())
2019 static int node_load[MAX_NUMNODES]; 2019 static int node_load[MAX_NUMNODES];
2020 2020
2021 /** 2021 /**
2022 * find_next_best_node - find the next node that should appear in a given node's fallback list 2022 * find_next_best_node - find the next node that should appear in a given node's fallback list
2023 * @node: node whose fallback list we're appending 2023 * @node: node whose fallback list we're appending
2024 * @used_node_mask: nodemask_t of already used nodes 2024 * @used_node_mask: nodemask_t of already used nodes
2025 * 2025 *
2026 * We use a number of factors to determine which is the next node that should 2026 * We use a number of factors to determine which is the next node that should
2027 * appear on a given node's fallback list. The node should not have appeared 2027 * appear on a given node's fallback list. The node should not have appeared
2028 * already in @node's fallback list, and it should be the next closest node 2028 * already in @node's fallback list, and it should be the next closest node
2029 * according to the distance array (which contains arbitrary distance values 2029 * according to the distance array (which contains arbitrary distance values
2030 * from each node to each node in the system), and should also prefer nodes 2030 * from each node to each node in the system), and should also prefer nodes
2031 * with no CPUs, since presumably they'll have very little allocation pressure 2031 * with no CPUs, since presumably they'll have very little allocation pressure
2032 * on them otherwise. 2032 * on them otherwise.
2033 * It returns -1 if no node is found. 2033 * It returns -1 if no node is found.
2034 */ 2034 */
2035 static int find_next_best_node(int node, nodemask_t *used_node_mask) 2035 static int find_next_best_node(int node, nodemask_t *used_node_mask)
2036 { 2036 {
2037 int n, val; 2037 int n, val;
2038 int min_val = INT_MAX; 2038 int min_val = INT_MAX;
2039 int best_node = -1; 2039 int best_node = -1;
2040 node_to_cpumask_ptr(tmp, 0); 2040 node_to_cpumask_ptr(tmp, 0);
2041 2041
2042 /* Use the local node if we haven't already */ 2042 /* Use the local node if we haven't already */
2043 if (!node_isset(node, *used_node_mask)) { 2043 if (!node_isset(node, *used_node_mask)) {
2044 node_set(node, *used_node_mask); 2044 node_set(node, *used_node_mask);
2045 return node; 2045 return node;
2046 } 2046 }
2047 2047
2048 for_each_node_state(n, N_HIGH_MEMORY) { 2048 for_each_node_state(n, N_HIGH_MEMORY) {
2049 2049
2050 /* Don't want a node to appear more than once */ 2050 /* Don't want a node to appear more than once */
2051 if (node_isset(n, *used_node_mask)) 2051 if (node_isset(n, *used_node_mask))
2052 continue; 2052 continue;
2053 2053
2054 /* Use the distance array to find the distance */ 2054 /* Use the distance array to find the distance */
2055 val = node_distance(node, n); 2055 val = node_distance(node, n);
2056 2056
2057 /* Penalize nodes under us ("prefer the next node") */ 2057 /* Penalize nodes under us ("prefer the next node") */
2058 val += (n < node); 2058 val += (n < node);
2059 2059
2060 /* Give preference to headless and unused nodes */ 2060 /* Give preference to headless and unused nodes */
2061 node_to_cpumask_ptr_next(tmp, n); 2061 node_to_cpumask_ptr_next(tmp, n);
2062 if (!cpus_empty(*tmp)) 2062 if (!cpus_empty(*tmp))
2063 val += PENALTY_FOR_NODE_WITH_CPUS; 2063 val += PENALTY_FOR_NODE_WITH_CPUS;
2064 2064
2065 /* Slight preference for less loaded node */ 2065 /* Slight preference for less loaded node */
2066 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 2066 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
2067 val += node_load[n]; 2067 val += node_load[n];
2068 2068
2069 if (val < min_val) { 2069 if (val < min_val) {
2070 min_val = val; 2070 min_val = val;
2071 best_node = n; 2071 best_node = n;
2072 } 2072 }
2073 } 2073 }
2074 2074
2075 if (best_node >= 0) 2075 if (best_node >= 0)
2076 node_set(best_node, *used_node_mask); 2076 node_set(best_node, *used_node_mask);
2077 2077
2078 return best_node; 2078 return best_node;
2079 } 2079 }
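A small worked example of the score above (all numbers invented for illustration): suppose node 0 is choosing between node 1, which has CPUs, and node 2, which is memory-only, both at distance 20, with PENALTY_FOR_NODE_WITH_CPUS = 1 and MAX_NODE_LOAD * MAX_NUMNODES = 4. Node 1 scores (20 + 0 + 1) * 4 + node_load[1] = 84 and node 2 scores (20 + 0 + 0) * 4 + node_load[2] = 80 with both loads zero, so the headless node 2 is appended first. Because the adjusted distance is scaled before node_load[] is added, the load can only break ties between nodes whose distances come out equal.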
2080 2080
2081 2081
2082 /* 2082 /*
2083 * Build zonelists ordered by node and zones within node. 2083 * Build zonelists ordered by node and zones within node.
2084 * This results in maximum locality--normal zone overflows into local 2084 * This results in maximum locality--normal zone overflows into local
2085 * DMA zone, if any--but risks exhausting DMA zone. 2085 * DMA zone, if any--but risks exhausting DMA zone.
2086 */ 2086 */
2087 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) 2087 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
2088 { 2088 {
2089 int j; 2089 int j;
2090 struct zonelist *zonelist; 2090 struct zonelist *zonelist;
2091 2091
2092 zonelist = &pgdat->node_zonelists[0]; 2092 zonelist = &pgdat->node_zonelists[0];
2093 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) 2093 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
2094 ; 2094 ;
2095 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 2095 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
2096 MAX_NR_ZONES - 1); 2096 MAX_NR_ZONES - 1);
2097 zonelist->_zonerefs[j].zone = NULL; 2097 zonelist->_zonerefs[j].zone = NULL;
2098 zonelist->_zonerefs[j].zone_idx = 0; 2098 zonelist->_zonerefs[j].zone_idx = 0;
2099 } 2099 }
2100 2100
2101 /* 2101 /*
2102 * Build gfp_thisnode zonelists 2102 * Build gfp_thisnode zonelists
2103 */ 2103 */
2104 static void build_thisnode_zonelists(pg_data_t *pgdat) 2104 static void build_thisnode_zonelists(pg_data_t *pgdat)
2105 { 2105 {
2106 int j; 2106 int j;
2107 struct zonelist *zonelist; 2107 struct zonelist *zonelist;
2108 2108
2109 zonelist = &pgdat->node_zonelists[1]; 2109 zonelist = &pgdat->node_zonelists[1];
2110 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); 2110 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
2111 zonelist->_zonerefs[j].zone = NULL; 2111 zonelist->_zonerefs[j].zone = NULL;
2112 zonelist->_zonerefs[j].zone_idx = 0; 2112 zonelist->_zonerefs[j].zone_idx = 0;
2113 } 2113 }
2114 2114
2115 /* 2115 /*
2116 * Build zonelists ordered by zone and nodes within zones. 2116 * Build zonelists ordered by zone and nodes within zones.
2117 * This results in conserving DMA zone[s] until all Normal memory is 2117 * This results in conserving DMA zone[s] until all Normal memory is
2118 * exhausted, but results in overflowing to remote node while memory 2118 * exhausted, but results in overflowing to remote node while memory
2119 * may still exist in local DMA zone. 2119 * may still exist in local DMA zone.
2120 */ 2120 */
2121 static int node_order[MAX_NUMNODES]; 2121 static int node_order[MAX_NUMNODES];
2122 2122
2123 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) 2123 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
2124 { 2124 {
2125 int pos, j, node; 2125 int pos, j, node;
2126 int zone_type; /* needs to be signed */ 2126 int zone_type; /* needs to be signed */
2127 struct zone *z; 2127 struct zone *z;
2128 struct zonelist *zonelist; 2128 struct zonelist *zonelist;
2129 2129
2130 zonelist = &pgdat->node_zonelists[0]; 2130 zonelist = &pgdat->node_zonelists[0];
2131 pos = 0; 2131 pos = 0;
2132 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { 2132 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
2133 for (j = 0; j < nr_nodes; j++) { 2133 for (j = 0; j < nr_nodes; j++) {
2134 node = node_order[j]; 2134 node = node_order[j];
2135 z = &NODE_DATA(node)->node_zones[zone_type]; 2135 z = &NODE_DATA(node)->node_zones[zone_type];
2136 if (populated_zone(z)) { 2136 if (populated_zone(z)) {
2137 zoneref_set_zone(z, 2137 zoneref_set_zone(z,
2138 &zonelist->_zonerefs[pos++]); 2138 &zonelist->_zonerefs[pos++]);
2139 check_highest_zone(zone_type); 2139 check_highest_zone(zone_type);
2140 } 2140 }
2141 } 2141 }
2142 } 2142 }
2143 zonelist->_zonerefs[pos].zone = NULL; 2143 zonelist->_zonerefs[pos].zone = NULL;
2144 zonelist->_zonerefs[pos].zone_idx = 0; 2144 zonelist->_zonerefs[pos].zone_idx = 0;
2145 } 2145 }
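To make the two orderings concrete, consider a hypothetical two-node machine where node 0 has DMA and Normal zones and node 1 has only Normal. Node order builds node 0's zonelist as Normal(0), DMA(0), Normal(1): allocations stay local as long as possible but can eat the small DMA zone before going off-node. Zone order builds Normal(0), Normal(1), DMA(0): the DMA zone is touched only after Normal memory everywhere is exhausted, at the cost of going remote earlier, which is exactly the trade-off described in the comments above.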
2146 2146
2147 static int default_zonelist_order(void) 2147 static int default_zonelist_order(void)
2148 { 2148 {
2149 int nid, zone_type; 2149 int nid, zone_type;
2150 unsigned long low_kmem_size,total_size; 2150 unsigned long low_kmem_size,total_size;
2151 struct zone *z; 2151 struct zone *z;
2152 int average_size; 2152 int average_size;
2153 /* 2153 /*
2154 * ZONE_DMA and ZONE_DMA32 can be a very small area in the system. 2154 * ZONE_DMA and ZONE_DMA32 can be a very small area in the system.
2155 * If they are really small and used heavily, the system can fall 2155 * If they are really small and used heavily, the system can fall
2156 * into OOM very easily. 2156 * into OOM very easily.
2157 * This function detects ZONE_DMA/DMA32 size and configures zone order. 2157 * This function detects ZONE_DMA/DMA32 size and configures zone order.
2158 */ 2158 */
2159 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ 2159 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
2160 low_kmem_size = 0; 2160 low_kmem_size = 0;
2161 total_size = 0; 2161 total_size = 0;
2162 for_each_online_node(nid) { 2162 for_each_online_node(nid) {
2163 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 2163 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
2164 z = &NODE_DATA(nid)->node_zones[zone_type]; 2164 z = &NODE_DATA(nid)->node_zones[zone_type];
2165 if (populated_zone(z)) { 2165 if (populated_zone(z)) {
2166 if (zone_type < ZONE_NORMAL) 2166 if (zone_type < ZONE_NORMAL)
2167 low_kmem_size += z->present_pages; 2167 low_kmem_size += z->present_pages;
2168 total_size += z->present_pages; 2168 total_size += z->present_pages;
2169 } 2169 }
2170 } 2170 }
2171 } 2171 }
2172 if (!low_kmem_size || /* there is no DMA area. */ 2172 if (!low_kmem_size || /* there is no DMA area. */
2173 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ 2173 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
2174 return ZONELIST_ORDER_NODE; 2174 return ZONELIST_ORDER_NODE;
2175 /* 2175 /*
2176 * look into each node's config. 2176 * look into each node's config.
2177 * If there is a node where DMA/DMA32 memory accounts for most of the 2177 * If there is a node where DMA/DMA32 memory accounts for most of the
2178 * local memory, NODE_ORDER may be suitable. 2178 * local memory, NODE_ORDER may be suitable.
2179 */ 2179 */
2180 average_size = total_size / 2180 average_size = total_size /
2181 (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); 2181 (nodes_weight(node_states[N_HIGH_MEMORY]) + 1);
2182 for_each_online_node(nid) { 2182 for_each_online_node(nid) {
2183 low_kmem_size = 0; 2183 low_kmem_size = 0;
2184 total_size = 0; 2184 total_size = 0;
2185 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 2185 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
2186 z = &NODE_DATA(nid)->node_zones[zone_type]; 2186 z = &NODE_DATA(nid)->node_zones[zone_type];
2187 if (populated_zone(z)) { 2187 if (populated_zone(z)) {
2188 if (zone_type < ZONE_NORMAL) 2188 if (zone_type < ZONE_NORMAL)
2189 low_kmem_size += z->present_pages; 2189 low_kmem_size += z->present_pages;
2190 total_size += z->present_pages; 2190 total_size += z->present_pages;
2191 } 2191 }
2192 } 2192 }
2193 if (low_kmem_size && 2193 if (low_kmem_size &&
2194 total_size > average_size && /* ignore small node */ 2194 total_size > average_size && /* ignore small node */
2195 low_kmem_size > total_size * 70/100) 2195 low_kmem_size > total_size * 70/100)
2196 return ZONELIST_ORDER_NODE; 2196 return ZONELIST_ORDER_NODE;
2197 } 2197 }
2198 return ZONELIST_ORDER_ZONE; 2198 return ZONELIST_ORDER_ZONE;
2199 } 2199 }
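Worked through with invented numbers: on a two-node machine with 4 GB per node and a single 16 MB ZONE_DMA on node 0, low_kmem_size (16 MB) is neither zero nor more than half of total_size (8 GB), so the first test falls through; node 0 is larger than the average node size but its DMA share is far below 70%, so the per-node loop finds nothing either and the function settles on ZONELIST_ORDER_ZONE, conserving the tiny DMA zone. Only when low memory is absent, dominant overall, or dominant on some sufficiently large node does the default flip to node order.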
2200 2200
2201 static void set_zonelist_order(void) 2201 static void set_zonelist_order(void)
2202 { 2202 {
2203 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) 2203 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
2204 current_zonelist_order = default_zonelist_order(); 2204 current_zonelist_order = default_zonelist_order();
2205 else 2205 else
2206 current_zonelist_order = user_zonelist_order; 2206 current_zonelist_order = user_zonelist_order;
2207 } 2207 }
2208 2208
2209 static void build_zonelists(pg_data_t *pgdat) 2209 static void build_zonelists(pg_data_t *pgdat)
2210 { 2210 {
2211 int j, node, load; 2211 int j, node, load;
2212 enum zone_type i; 2212 enum zone_type i;
2213 nodemask_t used_mask; 2213 nodemask_t used_mask;
2214 int local_node, prev_node; 2214 int local_node, prev_node;
2215 struct zonelist *zonelist; 2215 struct zonelist *zonelist;
2216 int order = current_zonelist_order; 2216 int order = current_zonelist_order;
2217 2217
2218 /* initialize zonelists */ 2218 /* initialize zonelists */
2219 for (i = 0; i < MAX_ZONELISTS; i++) { 2219 for (i = 0; i < MAX_ZONELISTS; i++) {
2220 zonelist = pgdat->node_zonelists + i; 2220 zonelist = pgdat->node_zonelists + i;
2221 zonelist->_zonerefs[0].zone = NULL; 2221 zonelist->_zonerefs[0].zone = NULL;
2222 zonelist->_zonerefs[0].zone_idx = 0; 2222 zonelist->_zonerefs[0].zone_idx = 0;
2223 } 2223 }
2224 2224
2225 /* NUMA-aware ordering of nodes */ 2225 /* NUMA-aware ordering of nodes */
2226 local_node = pgdat->node_id; 2226 local_node = pgdat->node_id;
2227 load = num_online_nodes(); 2227 load = num_online_nodes();
2228 prev_node = local_node; 2228 prev_node = local_node;
2229 nodes_clear(used_mask); 2229 nodes_clear(used_mask);
2230 2230
2231 memset(node_load, 0, sizeof(node_load)); 2231 memset(node_load, 0, sizeof(node_load));
2232 memset(node_order, 0, sizeof(node_order)); 2232 memset(node_order, 0, sizeof(node_order));
2233 j = 0; 2233 j = 0;
2234 2234
2235 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 2235 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
2236 int distance = node_distance(local_node, node); 2236 int distance = node_distance(local_node, node);
2237 2237
2238 /* 2238 /*
2239 * If another node is sufficiently far away then it is better 2239 * If another node is sufficiently far away then it is better
2240 * to reclaim pages in a zone before going off node. 2240 * to reclaim pages in a zone before going off node.
2241 */ 2241 */
2242 if (distance > RECLAIM_DISTANCE) 2242 if (distance > RECLAIM_DISTANCE)
2243 zone_reclaim_mode = 1; 2243 zone_reclaim_mode = 1;
2244 2244
2245 /* 2245 /*
2246 * We don't want to pressure a particular node. 2246 * We don't want to pressure a particular node.
2247 * So adding penalty to the first node in same 2247 * So adding penalty to the first node in same
2248 * distance group to make it round-robin. 2248 * distance group to make it round-robin.
2249 */ 2249 */
2250 if (distance != node_distance(local_node, prev_node)) 2250 if (distance != node_distance(local_node, prev_node))
2251 node_load[node] = load; 2251 node_load[node] = load;
2252 2252
2253 prev_node = node; 2253 prev_node = node;
2254 load--; 2254 load--;
2255 if (order == ZONELIST_ORDER_NODE) 2255 if (order == ZONELIST_ORDER_NODE)
2256 build_zonelists_in_node_order(pgdat, node); 2256 build_zonelists_in_node_order(pgdat, node);
2257 else 2257 else
2258 node_order[j++] = node; /* remember order */ 2258 node_order[j++] = node; /* remember order */
2259 } 2259 }
2260 2260
2261 if (order == ZONELIST_ORDER_ZONE) { 2261 if (order == ZONELIST_ORDER_ZONE) {
2262 /* calculate node order -- i.e., DMA last! */ 2262 /* calculate node order -- i.e., DMA last! */
2263 build_zonelists_in_zone_order(pgdat, j); 2263 build_zonelists_in_zone_order(pgdat, j);
2264 } 2264 }
2265 2265
2266 build_thisnode_zonelists(pgdat); 2266 build_thisnode_zonelists(pgdat);
2267 } 2267 }
2268 2268
2269 /* Construct the zonelist performance cache - see mmzone.h for details */ 2269 /* Construct the zonelist performance cache - see mmzone.h for details */
2270 static void build_zonelist_cache(pg_data_t *pgdat) 2270 static void build_zonelist_cache(pg_data_t *pgdat)
2271 { 2271 {
2272 struct zonelist *zonelist; 2272 struct zonelist *zonelist;
2273 struct zonelist_cache *zlc; 2273 struct zonelist_cache *zlc;
2274 struct zoneref *z; 2274 struct zoneref *z;
2275 2275
2276 zonelist = &pgdat->node_zonelists[0]; 2276 zonelist = &pgdat->node_zonelists[0];
2277 zonelist->zlcache_ptr = zlc = &zonelist->zlcache; 2277 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
2278 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 2278 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
2279 for (z = zonelist->_zonerefs; z->zone; z++) 2279 for (z = zonelist->_zonerefs; z->zone; z++)
2280 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); 2280 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
2281 } 2281 }
2282 2282
2283 2283
2284 #else /* CONFIG_NUMA */ 2284 #else /* CONFIG_NUMA */
2285 2285
2286 static void set_zonelist_order(void) 2286 static void set_zonelist_order(void)
2287 { 2287 {
2288 current_zonelist_order = ZONELIST_ORDER_ZONE; 2288 current_zonelist_order = ZONELIST_ORDER_ZONE;
2289 } 2289 }
2290 2290
2291 static void build_zonelists(pg_data_t *pgdat) 2291 static void build_zonelists(pg_data_t *pgdat)
2292 { 2292 {
2293 int node, local_node; 2293 int node, local_node;
2294 enum zone_type j; 2294 enum zone_type j;
2295 struct zonelist *zonelist; 2295 struct zonelist *zonelist;
2296 2296
2297 local_node = pgdat->node_id; 2297 local_node = pgdat->node_id;
2298 2298
2299 zonelist = &pgdat->node_zonelists[0]; 2299 zonelist = &pgdat->node_zonelists[0];
2300 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); 2300 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
2301 2301
2302 /* 2302 /*
2303 * Now we build the zonelist so that it contains the zones 2303 * Now we build the zonelist so that it contains the zones
2304 * of all the other nodes. 2304 * of all the other nodes.
2305 * We don't want to pressure a particular node, so when 2305 * We don't want to pressure a particular node, so when
2306 * building the zones for node N, we make sure that the 2306 * building the zones for node N, we make sure that the
2307 * zones coming right after the local ones are those from 2307 * zones coming right after the local ones are those from
2308 * node N+1 (modulo N) 2308 * node N+1 (modulo N)
2309 */ 2309 */
2310 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 2310 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
2311 if (!node_online(node)) 2311 if (!node_online(node))
2312 continue; 2312 continue;
2313 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 2313 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
2314 MAX_NR_ZONES - 1); 2314 MAX_NR_ZONES - 1);
2315 } 2315 }
2316 for (node = 0; node < local_node; node++) { 2316 for (node = 0; node < local_node; node++) {
2317 if (!node_online(node)) 2317 if (!node_online(node))
2318 continue; 2318 continue;
2319 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 2319 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
2320 MAX_NR_ZONES - 1); 2320 MAX_NR_ZONES - 1);
2321 } 2321 }
2322 2322
2323 zonelist->_zonerefs[j].zone = NULL; 2323 zonelist->_zonerefs[j].zone = NULL;
2324 zonelist->_zonerefs[j].zone_idx = 0; 2324 zonelist->_zonerefs[j].zone_idx = 0;
2325 } 2325 }
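For example, on a three-node !NUMA (DISCONTIGMEM) system, node 1's zonelist is built from its own zones first, then node 2's, then node 0's, so each node prefers its successor (modulo the node count) rather than every node falling back onto node 0 first.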
2326 2326
2327 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 2327 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
2328 static void build_zonelist_cache(pg_data_t *pgdat) 2328 static void build_zonelist_cache(pg_data_t *pgdat)
2329 { 2329 {
2330 pgdat->node_zonelists[0].zlcache_ptr = NULL; 2330 pgdat->node_zonelists[0].zlcache_ptr = NULL;
2331 } 2331 }
2332 2332
2333 #endif /* CONFIG_NUMA */ 2333 #endif /* CONFIG_NUMA */
2334 2334
2335 /* return value is int only to satisfy stop_machine_run() */ 2335 /* return value is int only to satisfy stop_machine_run() */
2336 static int __build_all_zonelists(void *dummy) 2336 static int __build_all_zonelists(void *dummy)
2337 { 2337 {
2338 int nid; 2338 int nid;
2339 2339
2340 for_each_online_node(nid) { 2340 for_each_online_node(nid) {
2341 pg_data_t *pgdat = NODE_DATA(nid); 2341 pg_data_t *pgdat = NODE_DATA(nid);
2342 2342
2343 build_zonelists(pgdat); 2343 build_zonelists(pgdat);
2344 build_zonelist_cache(pgdat); 2344 build_zonelist_cache(pgdat);
2345 } 2345 }
2346 return 0; 2346 return 0;
2347 } 2347 }
2348 2348
2349 void build_all_zonelists(void) 2349 void build_all_zonelists(void)
2350 { 2350 {
2351 set_zonelist_order(); 2351 set_zonelist_order();
2352 2352
2353 if (system_state == SYSTEM_BOOTING) { 2353 if (system_state == SYSTEM_BOOTING) {
2354 __build_all_zonelists(NULL); 2354 __build_all_zonelists(NULL);
2355 mminit_verify_zonelist();
2355 cpuset_init_current_mems_allowed(); 2356 cpuset_init_current_mems_allowed();
2356 } else { 2357 } else {
2357 /* we have to stop all cpus to guarantee there is no user 2358 /* we have to stop all cpus to guarantee there is no user
2358 of zonelist */ 2359 of zonelist */
2359 stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); 2360 stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
2360 /* cpuset refresh routine should be here */ 2361 /* cpuset refresh routine should be here */
2361 } 2362 }
2362 vm_total_pages = nr_free_pagecache_pages(); 2363 vm_total_pages = nr_free_pagecache_pages();
2363 /* 2364 /*
2364 * Disable grouping by mobility if the number of pages in the 2365 * Disable grouping by mobility if the number of pages in the
2365 * system is too low to allow the mechanism to work. It would be 2366 * system is too low to allow the mechanism to work. It would be
2366 * more accurate, but expensive to check per-zone. This check is 2367 * more accurate, but expensive to check per-zone. This check is
2367 * made on memory-hotadd so a system can start with mobility 2368 * made on memory-hotadd so a system can start with mobility
2368 * disabled and enable it later 2369 * disabled and enable it later
2369 */ 2370 */
2370 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 2371 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
2371 page_group_by_mobility_disabled = 1; 2372 page_group_by_mobility_disabled = 1;
2372 else 2373 else
2373 page_group_by_mobility_disabled = 0; 2374 page_group_by_mobility_disabled = 0;
2374 2375
2375 printk("Built %i zonelists in %s order, mobility grouping %s. " 2376 printk("Built %i zonelists in %s order, mobility grouping %s. "
2376 "Total pages: %ld\n", 2377 "Total pages: %ld\n",
2377 num_online_nodes(), 2378 num_online_nodes(),
2378 zonelist_order_name[current_zonelist_order], 2379 zonelist_order_name[current_zonelist_order],
2379 page_group_by_mobility_disabled ? "off" : "on", 2380 page_group_by_mobility_disabled ? "off" : "on",
2380 vm_total_pages); 2381 vm_total_pages);
2381 #ifdef CONFIG_NUMA 2382 #ifdef CONFIG_NUMA
2382 printk("Policy zone: %s\n", zone_names[policy_zone]); 2383 printk("Policy zone: %s\n", zone_names[policy_zone]);
2383 #endif 2384 #endif
2384 } 2385 }
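The mminit_verify_zonelist() call added here is the point of this patch: once the zonelists have been built at boot, it prints each one so the ordering can be verified by hand when mminit_loglevel is MMINIT_VERIFY or higher. Its body lives in mm/mm_init.c and is not part of this hunk; the rough shape of such a walk, as an illustrative sketch only (print_zonelists() is a made-up name and the output format is not the real one), is:

#include <linux/kernel.h>
#include <linux/mmzone.h>
#include <linux/nodemask.h>

/* Illustrative sketch only -- the real implementation is
 * mminit_verify_zonelist() in mm/mm_init.c. Walk each online node's
 * general zonelist and print the zones it falls back to, in order. */
static void print_zonelists(void)
{
	int nid;

	for_each_online_node(nid) {
		pg_data_t *pgdat = NODE_DATA(nid);
		struct zonelist *zonelist = &pgdat->node_zonelists[0];
		struct zoneref *z;
		struct zone *zone;

		printk(KERN_DEBUG "Node %d zonelist:", nid);
		/* MAX_NR_ZONES - 1 places no upper limit on the walk */
		for_each_zone_zonelist(zone, z, zonelist, MAX_NR_ZONES - 1)
			printk(" %d:%s", zone_to_nid(zone), zone->name);
		printk("\n");
	}
}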
2385 2386
2386 /* 2387 /*
2387 * Helper functions to size the waitqueue hash table. 2388 * Helper functions to size the waitqueue hash table.
2388 * Essentially these want to choose hash table sizes sufficiently 2389 * Essentially these want to choose hash table sizes sufficiently
2389 * large so that collisions trying to wait on pages are rare. 2390 * large so that collisions trying to wait on pages are rare.
2390 * But in fact, the number of active page waitqueues on typical 2391 * But in fact, the number of active page waitqueues on typical
2391 * systems is ridiculously low, less than 200. So this is even 2392 * systems is ridiculously low, less than 200. So this is even
2392 * conservative, even though it seems large. 2393 * conservative, even though it seems large.
2393 * 2394 *
2394 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 2395 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
2395 * waitqueues, i.e. the size of the waitq table given the number of pages. 2396 * waitqueues, i.e. the size of the waitq table given the number of pages.
2396 */ 2397 */
2397 #define PAGES_PER_WAITQUEUE 256 2398 #define PAGES_PER_WAITQUEUE 256
2398 2399
2399 #ifndef CONFIG_MEMORY_HOTPLUG 2400 #ifndef CONFIG_MEMORY_HOTPLUG
2400 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 2401 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
2401 { 2402 {
2402 unsigned long size = 1; 2403 unsigned long size = 1;
2403 2404
2404 pages /= PAGES_PER_WAITQUEUE; 2405 pages /= PAGES_PER_WAITQUEUE;
2405 2406
2406 while (size < pages) 2407 while (size < pages)
2407 size <<= 1; 2408 size <<= 1;
2408 2409
2409 /* 2410 /*
2410 * Once we have dozens or even hundreds of threads sleeping 2411 * Once we have dozens or even hundreds of threads sleeping
2411 * on IO we've got bigger problems than wait queue collision. 2412 * on IO we've got bigger problems than wait queue collision.
2412 * Limit the size of the wait table to a reasonable size. 2413 * Limit the size of the wait table to a reasonable size.
2413 */ 2414 */
2414 size = min(size, 4096UL); 2415 size = min(size, 4096UL);
2415 2416
2416 return max(size, 4UL); 2417 return max(size, 4UL);
2417 } 2418 }
2418 #else 2419 #else
2419 /* 2420 /*
2420 * A zone's size might be changed by hot-add, so it is not possible to determine 2421 * A zone's size might be changed by hot-add, so it is not possible to determine
2421 * a suitable size for its wait_table. So we use the maximum size now. 2422 * a suitable size for its wait_table. So we use the maximum size now.
2422 * 2423 *
2423 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: 2424 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
2424 * 2425 *
2425 * i386 (preemption config) : 4096 x 16 = 64Kbyte. 2426 * i386 (preemption config) : 4096 x 16 = 64Kbyte.
2426 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. 2427 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
2427 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. 2428 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
2428 * 2429 *
2429 * The maximum entries are prepared when a zone's memory is (512K + 256) pages 2430 * The maximum entries are prepared when a zone's memory is (512K + 256) pages
2430 * or more by the traditional way. (See above). It equals: 2431 * or more by the traditional way. (See above). It equals:
2431 * 2432 *
2432 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. 2433 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
2433 * ia64(16K page size) : = ( 8G + 4M)byte. 2434 * ia64(16K page size) : = ( 8G + 4M)byte.
2434 * powerpc (64K page size) : = (32G +16M)byte. 2435 * powerpc (64K page size) : = (32G +16M)byte.
2435 */ 2436 */
2436 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 2437 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
2437 { 2438 {
2438 return 4096UL; 2439 return 4096UL;
2439 } 2440 }
2440 #endif 2441 #endif
2441 2442
2442 /* 2443 /*
2443 * This is an integer logarithm so that shifts can be used later 2444 * This is an integer logarithm so that shifts can be used later
2444 * to extract the more random high bits from the multiplicative 2445 * to extract the more random high bits from the multiplicative
2445 * hash function before the remainder is taken. 2446 * hash function before the remainder is taken.
2446 */ 2447 */
2447 static inline unsigned long wait_table_bits(unsigned long size) 2448 static inline unsigned long wait_table_bits(unsigned long size)
2448 { 2449 {
2449 return ffz(~size); 2450 return ffz(~size);
2450 } 2451 }
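Putting the two helpers together with illustrative numbers: a 1 GB zone of 4 KB pages has 262144 pages, which divided by PAGES_PER_WAITQUEUE (256) gives 1024; that is already a power of two, so wait_table_hash_nr_entries() returns 1024 and wait_table_bits(1024) returns 10, the log2 table size used later by the hash. A 64 GB zone would ask for 65536 entries but is clamped to 4096, and a 2 MB zone is raised to the minimum of 4.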
2451 2452
2452 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 2453 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
2453 2454
2454 /* 2455 /*
2455 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 2456 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
2456 * of blocks reserved is based on zone->pages_min. The memory within the 2457 * of blocks reserved is based on zone->pages_min. The memory within the
2457 * reserve will tend to store contiguous free pages. Setting min_free_kbytes 2458 * reserve will tend to store contiguous free pages. Setting min_free_kbytes
2458 * higher will lead to a bigger reserve which will get freed as contiguous 2459 * higher will lead to a bigger reserve which will get freed as contiguous
2459 * blocks as reclaim kicks in 2460 * blocks as reclaim kicks in
2460 */ 2461 */
2461 static void setup_zone_migrate_reserve(struct zone *zone) 2462 static void setup_zone_migrate_reserve(struct zone *zone)
2462 { 2463 {
2463 unsigned long start_pfn, pfn, end_pfn; 2464 unsigned long start_pfn, pfn, end_pfn;
2464 struct page *page; 2465 struct page *page;
2465 unsigned long reserve, block_migratetype; 2466 unsigned long reserve, block_migratetype;
2466 2467
2467 /* Get the start pfn, end pfn and the number of blocks to reserve */ 2468 /* Get the start pfn, end pfn and the number of blocks to reserve */
2468 start_pfn = zone->zone_start_pfn; 2469 start_pfn = zone->zone_start_pfn;
2469 end_pfn = start_pfn + zone->spanned_pages; 2470 end_pfn = start_pfn + zone->spanned_pages;
2470 reserve = roundup(zone->pages_min, pageblock_nr_pages) >> 2471 reserve = roundup(zone->pages_min, pageblock_nr_pages) >>
2471 pageblock_order; 2472 pageblock_order;
2472 2473
2473 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 2474 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
2474 if (!pfn_valid(pfn)) 2475 if (!pfn_valid(pfn))
2475 continue; 2476 continue;
2476 page = pfn_to_page(pfn); 2477 page = pfn_to_page(pfn);
2477 2478
2478 /* Blocks with reserved pages will never free, skip them. */ 2479 /* Blocks with reserved pages will never free, skip them. */
2479 if (PageReserved(page)) 2480 if (PageReserved(page))
2480 continue; 2481 continue;
2481 2482
2482 block_migratetype = get_pageblock_migratetype(page); 2483 block_migratetype = get_pageblock_migratetype(page);
2483 2484
2484 /* If this block is reserved, account for it */ 2485 /* If this block is reserved, account for it */
2485 if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) { 2486 if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
2486 reserve--; 2487 reserve--;
2487 continue; 2488 continue;
2488 } 2489 }
2489 2490
2490 /* Suitable for reserving if this block is movable */ 2491 /* Suitable for reserving if this block is movable */
2491 if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) { 2492 if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
2492 set_pageblock_migratetype(page, MIGRATE_RESERVE); 2493 set_pageblock_migratetype(page, MIGRATE_RESERVE);
2493 move_freepages_block(zone, page, MIGRATE_RESERVE); 2494 move_freepages_block(zone, page, MIGRATE_RESERVE);
2494 reserve--; 2495 reserve--;
2495 continue; 2496 continue;
2496 } 2497 }
2497 2498
2498 /* 2499 /*
2499 * If the reserve is met and this is a previous reserved block, 2500 * If the reserve is met and this is a previous reserved block,
2500 * take it back 2501 * take it back
2501 */ 2502 */
2502 if (block_migratetype == MIGRATE_RESERVE) { 2503 if (block_migratetype == MIGRATE_RESERVE) {
2503 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 2504 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
2504 move_freepages_block(zone, page, MIGRATE_MOVABLE); 2505 move_freepages_block(zone, page, MIGRATE_MOVABLE);
2505 } 2506 }
2506 } 2507 }
2507 } 2508 }
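A worked example with invented values: with 4 KB pages and pageblock_order 9, pageblock_nr_pages is 512 (2 MB per block), so a zone whose pages_min is 1379 rounds up to 1536 and reserves 1536 >> 9 = 3 pageblocks. The scan then walks the zone in 2 MB strides, converting up to three MIGRATE_MOVABLE blocks to MIGRATE_RESERVE and flipping any already-reserved blocks beyond the target back to MIGRATE_MOVABLE.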
2508 2509
2509 /* 2510 /*
2510 * Initially all pages are reserved - free ones are freed 2511 * Initially all pages are reserved - free ones are freed
2511 * up by free_all_bootmem() once the early boot process is 2512 * up by free_all_bootmem() once the early boot process is
2512 * done. Non-atomic initialization, single-pass. 2513 * done. Non-atomic initialization, single-pass.
2513 */ 2514 */
2514 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 2515 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2515 unsigned long start_pfn, enum memmap_context context) 2516 unsigned long start_pfn, enum memmap_context context)
2516 { 2517 {
2517 struct page *page; 2518 struct page *page;
2518 unsigned long end_pfn = start_pfn + size; 2519 unsigned long end_pfn = start_pfn + size;
2519 unsigned long pfn; 2520 unsigned long pfn;
2520 struct zone *z; 2521 struct zone *z;
2521 2522
2522 z = &NODE_DATA(nid)->node_zones[zone]; 2523 z = &NODE_DATA(nid)->node_zones[zone];
2523 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 2524 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
2524 /* 2525 /*
2525 * There can be holes in boot-time mem_map[]s 2526 * There can be holes in boot-time mem_map[]s
2526 * handed to this function. They do not 2527 * handed to this function. They do not
2527 * exist on hotplugged memory. 2528 * exist on hotplugged memory.
2528 */ 2529 */
2529 if (context == MEMMAP_EARLY) { 2530 if (context == MEMMAP_EARLY) {
2530 if (!early_pfn_valid(pfn)) 2531 if (!early_pfn_valid(pfn))
2531 continue; 2532 continue;
2532 if (!early_pfn_in_nid(pfn, nid)) 2533 if (!early_pfn_in_nid(pfn, nid))
2533 continue; 2534 continue;
2534 } 2535 }
2535 page = pfn_to_page(pfn); 2536 page = pfn_to_page(pfn);
2536 set_page_links(page, zone, nid, pfn); 2537 set_page_links(page, zone, nid, pfn);
2537 mminit_verify_page_links(page, zone, nid, pfn); 2538 mminit_verify_page_links(page, zone, nid, pfn);
2538 init_page_count(page); 2539 init_page_count(page);
2539 reset_page_mapcount(page); 2540 reset_page_mapcount(page);
2540 SetPageReserved(page); 2541 SetPageReserved(page);
2541 /* 2542 /*
2542 * Mark the block movable so that blocks are reserved for 2543 * Mark the block movable so that blocks are reserved for
2543 * movable at startup. This will force kernel allocations 2544 * movable at startup. This will force kernel allocations
2544 * to reserve their blocks rather than leaking throughout 2545 * to reserve their blocks rather than leaking throughout
2545 * the address space during boot when many long-lived 2546 * the address space during boot when many long-lived
2546 * kernel allocations are made. Later some blocks near 2547 * kernel allocations are made. Later some blocks near
2547 * the start are marked MIGRATE_RESERVE by 2548 * the start are marked MIGRATE_RESERVE by
2548 * setup_zone_migrate_reserve() 2549 * setup_zone_migrate_reserve()
2549 * 2550 *
2550 * bitmap is created for zone's valid pfn range. but memmap 2551 * bitmap is created for zone's valid pfn range. but memmap
2551 * can be created for invalid pages (for alignment) 2552 * can be created for invalid pages (for alignment)
2552 * check here not to call set_pageblock_migratetype() against 2553 * check here not to call set_pageblock_migratetype() against
2553 * pfn out of zone. 2554 * pfn out of zone.
2554 */ 2555 */
2555 if ((z->zone_start_pfn <= pfn) 2556 if ((z->zone_start_pfn <= pfn)
2556 && (pfn < z->zone_start_pfn + z->spanned_pages) 2557 && (pfn < z->zone_start_pfn + z->spanned_pages)
2557 && !(pfn & (pageblock_nr_pages - 1))) 2558 && !(pfn & (pageblock_nr_pages - 1)))
2558 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 2559 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
2559 2560
2560 INIT_LIST_HEAD(&page->lru); 2561 INIT_LIST_HEAD(&page->lru);
2561 #ifdef WANT_PAGE_VIRTUAL 2562 #ifdef WANT_PAGE_VIRTUAL
2562 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 2563 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
2563 if (!is_highmem_idx(zone)) 2564 if (!is_highmem_idx(zone))
2564 set_page_address(page, __va(pfn << PAGE_SHIFT)); 2565 set_page_address(page, __va(pfn << PAGE_SHIFT));
2565 #endif 2566 #endif
2566 } 2567 }
2567 } 2568 }
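
The pageblock-boundary check above works because pageblock_nr_pages is a power of two, so masking with (pageblock_nr_pages - 1) selects exactly the first pfn of each pageblock. Below is a minimal userspace sketch of that test; the order-9 pageblock size (512 pages) and the SK_ names are assumptions chosen only for illustration:

#include <stdio.h>

#define SK_PAGEBLOCK_NR_PAGES 512UL	/* assumed order-9 pageblocks */

int main(void)
{
	unsigned long pfn;

	/* only pfns on a pageblock boundary pass the same mask test used above */
	for (pfn = 1024; pfn < 1024 + 2 * SK_PAGEBLOCK_NR_PAGES; pfn++)
		if (!(pfn & (SK_PAGEBLOCK_NR_PAGES - 1)))
			printf("pageblock starts at pfn %lu\n", pfn);	/* prints 1024 and 1536 */
	return 0;
}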
2568 2569
2569 static void __meminit zone_init_free_lists(struct zone *zone) 2570 static void __meminit zone_init_free_lists(struct zone *zone)
2570 { 2571 {
2571 int order, t; 2572 int order, t;
2572 for_each_migratetype_order(order, t) { 2573 for_each_migratetype_order(order, t) {
2573 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 2574 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
2574 zone->free_area[order].nr_free = 0; 2575 zone->free_area[order].nr_free = 0;
2575 } 2576 }
2576 } 2577 }
2577 2578
2578 #ifndef __HAVE_ARCH_MEMMAP_INIT 2579 #ifndef __HAVE_ARCH_MEMMAP_INIT
2579 #define memmap_init(size, nid, zone, start_pfn) \ 2580 #define memmap_init(size, nid, zone, start_pfn) \
2580 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 2581 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
2581 #endif 2582 #endif
2582 2583
2583 static int zone_batchsize(struct zone *zone) 2584 static int zone_batchsize(struct zone *zone)
2584 { 2585 {
2585 int batch; 2586 int batch;
2586 2587
2587 /* 2588 /*
2588 * The per-cpu-pages pools are set to around 1/1000th of the 2589 * The per-cpu-pages pools are set to around 1/1000th of the
2589 * size of the zone. But no more than 1/2 of a meg. 2590 * size of the zone. But no more than 1/2 of a meg.
2590 * 2591 *
2591 * OK, so we don't know how big the cache is. So guess. 2592 * OK, so we don't know how big the cache is. So guess.
2592 */ 2593 */
2593 batch = zone->present_pages / 1024; 2594 batch = zone->present_pages / 1024;
2594 if (batch * PAGE_SIZE > 512 * 1024) 2595 if (batch * PAGE_SIZE > 512 * 1024)
2595 batch = (512 * 1024) / PAGE_SIZE; 2596 batch = (512 * 1024) / PAGE_SIZE;
2596 batch /= 4; /* We effectively *= 4 below */ 2597 batch /= 4; /* We effectively *= 4 below */
2597 if (batch < 1) 2598 if (batch < 1)
2598 batch = 1; 2599 batch = 1;
2599 2600
2600 /* 2601 /*
2601 * Clamp the batch to a 2^n - 1 value. Having a power 2602 * Clamp the batch to a 2^n - 1 value. Having a power
2602 * of 2 value was found to be more likely to have 2603 * of 2 value was found to be more likely to have
2603 * suboptimal cache aliasing properties in some cases. 2604 * suboptimal cache aliasing properties in some cases.
2604 * 2605 *
2605 * For example if 2 tasks are alternately allocating 2606 * For example if 2 tasks are alternately allocating
2606 * batches of pages, one task can end up with a lot 2607 * batches of pages, one task can end up with a lot
2607 * of pages of one half of the possible page colors 2608 * of pages of one half of the possible page colors
2608 * and the other with pages of the other colors. 2609 * and the other with pages of the other colors.
2609 */ 2610 */
2610 batch = (1 << (fls(batch + batch/2)-1)) - 1; 2611 batch = (1 << (fls(batch + batch/2)-1)) - 1;
2611 2612
2612 return batch; 2613 return batch;
2613 } 2614 }
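
To get a feel for the numbers zone_batchsize() produces, the sketch below reproduces the same arithmetic in userspace; the 4 KiB page size, the local fls() replacement and the 1 GiB example zone are assumptions for illustration, not part of this commit:

#include <stdio.h>

#define SK_PAGE_SIZE 4096UL	/* assumed page size */

static int sk_fls(unsigned int x)	/* highest set bit, 1-based, like the kernel's fls() */
{
	int r = 0;

	while (x) {
		r++;
		x >>= 1;
	}
	return r;
}

static int sk_zone_batchsize(unsigned long present_pages)
{
	int batch;

	batch = present_pages / 1024;		/* ~1/1000th of the zone */
	if (batch * SK_PAGE_SIZE > 512 * 1024)
		batch = (512 * 1024) / SK_PAGE_SIZE;
	batch /= 4;
	if (batch < 1)
		batch = 1;

	return (1 << (sk_fls(batch + batch / 2) - 1)) - 1;	/* clamp to 2^n - 1 */
}

int main(void)
{
	/* hypothetical 1 GiB zone: 262144 pages -> batch 31, so setup_pageset() picks pcp->high = 186 */
	printf("batch = %d\n", sk_zone_batchsize(262144));
	return 0;
}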
2614 2615
2615 inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 2616 inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2616 { 2617 {
2617 struct per_cpu_pages *pcp; 2618 struct per_cpu_pages *pcp;
2618 2619
2619 memset(p, 0, sizeof(*p)); 2620 memset(p, 0, sizeof(*p));
2620 2621
2621 pcp = &p->pcp; 2622 pcp = &p->pcp;
2622 pcp->count = 0; 2623 pcp->count = 0;
2623 pcp->high = 6 * batch; 2624 pcp->high = 6 * batch;
2624 pcp->batch = max(1UL, 1 * batch); 2625 pcp->batch = max(1UL, 1 * batch);
2625 INIT_LIST_HEAD(&pcp->list); 2626 INIT_LIST_HEAD(&pcp->list);
2626 } 2627 }
2627 2628
2628 /* 2629 /*
2629 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist 2630 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
2630 * to the value high for the pageset p. 2631 * to the value high for the pageset p.
2631 */ 2632 */
2632 2633
2633 static void setup_pagelist_highmark(struct per_cpu_pageset *p, 2634 static void setup_pagelist_highmark(struct per_cpu_pageset *p,
2634 unsigned long high) 2635 unsigned long high)
2635 { 2636 {
2636 struct per_cpu_pages *pcp; 2637 struct per_cpu_pages *pcp;
2637 2638
2638 pcp = &p->pcp; 2639 pcp = &p->pcp;
2639 pcp->high = high; 2640 pcp->high = high;
2640 pcp->batch = max(1UL, high/4); 2641 pcp->batch = max(1UL, high/4);
2641 if ((high/4) > (PAGE_SHIFT * 8)) 2642 if ((high/4) > (PAGE_SHIFT * 8))
2642 pcp->batch = PAGE_SHIFT * 8; 2643 pcp->batch = PAGE_SHIFT * 8;
2643 } 2644 }
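
When the percpu_pagelist_fraction sysctl is non-zero, process_zones() below passes present_pages/fraction into this helper. A rough userspace sketch of the values that result; the 1 GiB zone, the fraction of 8 and PAGE_SHIFT == 12 are assumptions chosen only to make the arithmetic concrete:

#include <stdio.h>

#define SK_PAGE_SHIFT 12UL	/* assumed 4 KiB pages */

int main(void)
{
	unsigned long present_pages = 262144;	/* hypothetical 1 GiB zone */
	unsigned long fraction = 8;		/* hypothetical percpu_pagelist_fraction */
	unsigned long high = present_pages / fraction;
	unsigned long batch = high / 4 ? high / 4 : 1;	/* max(1UL, high/4) */

	if (high / 4 > SK_PAGE_SHIFT * 8)
		batch = SK_PAGE_SHIFT * 8;	/* same clamp as setup_pagelist_highmark() */

	printf("high = %lu, batch = %lu\n", high, batch);	/* high = 32768, batch = 96 */
	return 0;
}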
2644 2645
2645 2646
2646 #ifdef CONFIG_NUMA 2647 #ifdef CONFIG_NUMA
2647 /* 2648 /*
2648 * Boot pageset table. One per cpu which is going to be used for all 2649 * Boot pageset table. One per cpu which is going to be used for all
2649 * zones and all nodes. The parameters will be set in such a way 2650 * zones and all nodes. The parameters will be set in such a way
2650 * that an item put on a list will immediately be handed over to 2651 * that an item put on a list will immediately be handed over to
2651 * the buddy list. This is safe since pageset manipulation is done 2652 * the buddy list. This is safe since pageset manipulation is done
2652 * with interrupts disabled. 2653 * with interrupts disabled.
2653 * 2654 *
2654 * Some NUMA counter updates may also be caught by the boot pagesets. 2655 * Some NUMA counter updates may also be caught by the boot pagesets.
2655 * 2656 *
2656 * The boot_pagesets must be kept even after bootup is complete for 2657 * The boot_pagesets must be kept even after bootup is complete for
2657 * unused processors and/or zones. They do play a role for bootstrapping 2658 * unused processors and/or zones. They do play a role for bootstrapping
2658 * hotplugged processors. 2659 * hotplugged processors.
2659 * 2660 *
2660 * zoneinfo_show() and maybe other functions do 2661 * zoneinfo_show() and maybe other functions do
2661 * not check if the processor is online before following the pageset pointer. 2662 * not check if the processor is online before following the pageset pointer.
2662 * Other parts of the kernel may not check if the zone is available. 2663 * Other parts of the kernel may not check if the zone is available.
2663 */ 2664 */
2664 static struct per_cpu_pageset boot_pageset[NR_CPUS]; 2665 static struct per_cpu_pageset boot_pageset[NR_CPUS];
2665 2666
2666 /* 2667 /*
2667 * Dynamically allocate memory for the 2668 * Dynamically allocate memory for the
2668 * per cpu pageset array in struct zone. 2669 * per cpu pageset array in struct zone.
2669 */ 2670 */
2670 static int __cpuinit process_zones(int cpu) 2671 static int __cpuinit process_zones(int cpu)
2671 { 2672 {
2672 struct zone *zone, *dzone; 2673 struct zone *zone, *dzone;
2673 int node = cpu_to_node(cpu); 2674 int node = cpu_to_node(cpu);
2674 2675
2675 node_set_state(node, N_CPU); /* this node has a cpu */ 2676 node_set_state(node, N_CPU); /* this node has a cpu */
2676 2677
2677 for_each_zone(zone) { 2678 for_each_zone(zone) {
2678 2679
2679 if (!populated_zone(zone)) 2680 if (!populated_zone(zone))
2680 continue; 2681 continue;
2681 2682
2682 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), 2683 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
2683 GFP_KERNEL, node); 2684 GFP_KERNEL, node);
2684 if (!zone_pcp(zone, cpu)) 2685 if (!zone_pcp(zone, cpu))
2685 goto bad; 2686 goto bad;
2686 2687
2687 setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); 2688 setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
2688 2689
2689 if (percpu_pagelist_fraction) 2690 if (percpu_pagelist_fraction)
2690 setup_pagelist_highmark(zone_pcp(zone, cpu), 2691 setup_pagelist_highmark(zone_pcp(zone, cpu),
2691 (zone->present_pages / percpu_pagelist_fraction)); 2692 (zone->present_pages / percpu_pagelist_fraction));
2692 } 2693 }
2693 2694
2694 return 0; 2695 return 0;
2695 bad: 2696 bad:
2696 for_each_zone(dzone) { 2697 for_each_zone(dzone) {
2697 if (!populated_zone(dzone)) 2698 if (!populated_zone(dzone))
2698 continue; 2699 continue;
2699 if (dzone == zone) 2700 if (dzone == zone)
2700 break; 2701 break;
2701 kfree(zone_pcp(dzone, cpu)); 2702 kfree(zone_pcp(dzone, cpu));
2702 zone_pcp(dzone, cpu) = NULL; 2703 zone_pcp(dzone, cpu) = NULL;
2703 } 2704 }
2704 return -ENOMEM; 2705 return -ENOMEM;
2705 } 2706 }
2706 2707
2707 static inline void free_zone_pagesets(int cpu) 2708 static inline void free_zone_pagesets(int cpu)
2708 { 2709 {
2709 struct zone *zone; 2710 struct zone *zone;
2710 2711
2711 for_each_zone(zone) { 2712 for_each_zone(zone) {
2712 struct per_cpu_pageset *pset = zone_pcp(zone, cpu); 2713 struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
2713 2714
2714 /* Free per_cpu_pageset if it is slab allocated */ 2715 /* Free per_cpu_pageset if it is slab allocated */
2715 if (pset != &boot_pageset[cpu]) 2716 if (pset != &boot_pageset[cpu])
2716 kfree(pset); 2717 kfree(pset);
2717 zone_pcp(zone, cpu) = NULL; 2718 zone_pcp(zone, cpu) = NULL;
2718 } 2719 }
2719 } 2720 }
2720 2721
2721 static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, 2722 static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
2722 unsigned long action, 2723 unsigned long action,
2723 void *hcpu) 2724 void *hcpu)
2724 { 2725 {
2725 int cpu = (long)hcpu; 2726 int cpu = (long)hcpu;
2726 int ret = NOTIFY_OK; 2727 int ret = NOTIFY_OK;
2727 2728
2728 switch (action) { 2729 switch (action) {
2729 case CPU_UP_PREPARE: 2730 case CPU_UP_PREPARE:
2730 case CPU_UP_PREPARE_FROZEN: 2731 case CPU_UP_PREPARE_FROZEN:
2731 if (process_zones(cpu)) 2732 if (process_zones(cpu))
2732 ret = NOTIFY_BAD; 2733 ret = NOTIFY_BAD;
2733 break; 2734 break;
2734 case CPU_UP_CANCELED: 2735 case CPU_UP_CANCELED:
2735 case CPU_UP_CANCELED_FROZEN: 2736 case CPU_UP_CANCELED_FROZEN:
2736 case CPU_DEAD: 2737 case CPU_DEAD:
2737 case CPU_DEAD_FROZEN: 2738 case CPU_DEAD_FROZEN:
2738 free_zone_pagesets(cpu); 2739 free_zone_pagesets(cpu);
2739 break; 2740 break;
2740 default: 2741 default:
2741 break; 2742 break;
2742 } 2743 }
2743 return ret; 2744 return ret;
2744 } 2745 }
2745 2746
2746 static struct notifier_block __cpuinitdata pageset_notifier = 2747 static struct notifier_block __cpuinitdata pageset_notifier =
2747 { &pageset_cpuup_callback, NULL, 0 }; 2748 { &pageset_cpuup_callback, NULL, 0 };
2748 2749
2749 void __init setup_per_cpu_pageset(void) 2750 void __init setup_per_cpu_pageset(void)
2750 { 2751 {
2751 int err; 2752 int err;
2752 2753
2753 /* Initialize per_cpu_pageset for cpu 0. 2754 /* Initialize per_cpu_pageset for cpu 0.
2754 * A cpuup callback will do this for every cpu 2755 * A cpuup callback will do this for every cpu
2755 * as it comes online 2756 * as it comes online
2756 */ 2757 */
2757 err = process_zones(smp_processor_id()); 2758 err = process_zones(smp_processor_id());
2758 BUG_ON(err); 2759 BUG_ON(err);
2759 register_cpu_notifier(&pageset_notifier); 2760 register_cpu_notifier(&pageset_notifier);
2760 } 2761 }
2761 2762
2762 #endif 2763 #endif
2763 2764
2764 static noinline __init_refok 2765 static noinline __init_refok
2765 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 2766 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
2766 { 2767 {
2767 int i; 2768 int i;
2768 struct pglist_data *pgdat = zone->zone_pgdat; 2769 struct pglist_data *pgdat = zone->zone_pgdat;
2769 size_t alloc_size; 2770 size_t alloc_size;
2770 2771
2771 /* 2772 /*
2772 * The per-page waitqueue mechanism uses hashed waitqueues 2773 * The per-page waitqueue mechanism uses hashed waitqueues
2773 * per zone. 2774 * per zone.
2774 */ 2775 */
2775 zone->wait_table_hash_nr_entries = 2776 zone->wait_table_hash_nr_entries =
2776 wait_table_hash_nr_entries(zone_size_pages); 2777 wait_table_hash_nr_entries(zone_size_pages);
2777 zone->wait_table_bits = 2778 zone->wait_table_bits =
2778 wait_table_bits(zone->wait_table_hash_nr_entries); 2779 wait_table_bits(zone->wait_table_hash_nr_entries);
2779 alloc_size = zone->wait_table_hash_nr_entries 2780 alloc_size = zone->wait_table_hash_nr_entries
2780 * sizeof(wait_queue_head_t); 2781 * sizeof(wait_queue_head_t);
2781 2782
2782 if (!slab_is_available()) { 2783 if (!slab_is_available()) {
2783 zone->wait_table = (wait_queue_head_t *) 2784 zone->wait_table = (wait_queue_head_t *)
2784 alloc_bootmem_node(pgdat, alloc_size); 2785 alloc_bootmem_node(pgdat, alloc_size);
2785 } else { 2786 } else {
2786 /* 2787 /*
2787 * This case means that a zone whose size was 0 gets new memory 2788 * This case means that a zone whose size was 0 gets new memory
2788 * via memory hot-add. 2789 * via memory hot-add.
2789 * But it may be the case that a new node was hot-added. In 2790 * But it may be the case that a new node was hot-added. In
2790 * this case vmalloc() will not be able to use this new node's 2791 * this case vmalloc() will not be able to use this new node's
2791 * memory - this wait_table must be initialized to use this new 2792 * memory - this wait_table must be initialized to use this new
2792 * node itself as well. 2793 * node itself as well.
2793 * To use this new node's memory, further consideration will be 2794 * To use this new node's memory, further consideration will be
2794 * necessary. 2795 * necessary.
2795 */ 2796 */
2796 zone->wait_table = vmalloc(alloc_size); 2797 zone->wait_table = vmalloc(alloc_size);
2797 } 2798 }
2798 if (!zone->wait_table) 2799 if (!zone->wait_table)
2799 return -ENOMEM; 2800 return -ENOMEM;
2800 2801
2801 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) 2802 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
2802 init_waitqueue_head(zone->wait_table + i); 2803 init_waitqueue_head(zone->wait_table + i);
2803 2804
2804 return 0; 2805 return 0;
2805 } 2806 }
2806 2807
2807 static __meminit void zone_pcp_init(struct zone *zone) 2808 static __meminit void zone_pcp_init(struct zone *zone)
2808 { 2809 {
2809 int cpu; 2810 int cpu;
2810 unsigned long batch = zone_batchsize(zone); 2811 unsigned long batch = zone_batchsize(zone);
2811 2812
2812 for (cpu = 0; cpu < NR_CPUS; cpu++) { 2813 for (cpu = 0; cpu < NR_CPUS; cpu++) {
2813 #ifdef CONFIG_NUMA 2814 #ifdef CONFIG_NUMA
2814 /* Early boot. Slab allocator not functional yet */ 2815 /* Early boot. Slab allocator not functional yet */
2815 zone_pcp(zone, cpu) = &boot_pageset[cpu]; 2816 zone_pcp(zone, cpu) = &boot_pageset[cpu];
2816 setup_pageset(&boot_pageset[cpu],0); 2817 setup_pageset(&boot_pageset[cpu],0);
2817 #else 2818 #else
2818 setup_pageset(zone_pcp(zone,cpu), batch); 2819 setup_pageset(zone_pcp(zone,cpu), batch);
2819 #endif 2820 #endif
2820 } 2821 }
2821 if (zone->present_pages) 2822 if (zone->present_pages)
2822 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", 2823 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
2823 zone->name, zone->present_pages, batch); 2824 zone->name, zone->present_pages, batch);
2824 } 2825 }
2825 2826
2826 __meminit int init_currently_empty_zone(struct zone *zone, 2827 __meminit int init_currently_empty_zone(struct zone *zone,
2827 unsigned long zone_start_pfn, 2828 unsigned long zone_start_pfn,
2828 unsigned long size, 2829 unsigned long size,
2829 enum memmap_context context) 2830 enum memmap_context context)
2830 { 2831 {
2831 struct pglist_data *pgdat = zone->zone_pgdat; 2832 struct pglist_data *pgdat = zone->zone_pgdat;
2832 int ret; 2833 int ret;
2833 ret = zone_wait_table_init(zone, size); 2834 ret = zone_wait_table_init(zone, size);
2834 if (ret) 2835 if (ret)
2835 return ret; 2836 return ret;
2836 pgdat->nr_zones = zone_idx(zone) + 1; 2837 pgdat->nr_zones = zone_idx(zone) + 1;
2837 2838
2838 zone->zone_start_pfn = zone_start_pfn; 2839 zone->zone_start_pfn = zone_start_pfn;
2839 2840
2840 mminit_dprintk(MMINIT_TRACE, "memmap_init", 2841 mminit_dprintk(MMINIT_TRACE, "memmap_init",
2841 "Initialising map node %d zone %lu pfns %lu -> %lu\n", 2842 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
2842 pgdat->node_id, 2843 pgdat->node_id,
2843 (unsigned long)zone_idx(zone), 2844 (unsigned long)zone_idx(zone),
2844 zone_start_pfn, (zone_start_pfn + size)); 2845 zone_start_pfn, (zone_start_pfn + size));
2845 2846
2846 zone_init_free_lists(zone); 2847 zone_init_free_lists(zone);
2847 2848
2848 return 0; 2849 return 0;
2849 } 2850 }
2850 2851
2851 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 2852 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2852 /* 2853 /*
2853 * Basic iterator support. Return the first range of PFNs for a node 2854 * Basic iterator support. Return the first range of PFNs for a node
2854 * Note: nid == MAX_NUMNODES returns first region regardless of node 2855 * Note: nid == MAX_NUMNODES returns first region regardless of node
2855 */ 2856 */
2856 static int __meminit first_active_region_index_in_nid(int nid) 2857 static int __meminit first_active_region_index_in_nid(int nid)
2857 { 2858 {
2858 int i; 2859 int i;
2859 2860
2860 for (i = 0; i < nr_nodemap_entries; i++) 2861 for (i = 0; i < nr_nodemap_entries; i++)
2861 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) 2862 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
2862 return i; 2863 return i;
2863 2864
2864 return -1; 2865 return -1;
2865 } 2866 }
2866 2867
2867 /* 2868 /*
2868 * Basic iterator support. Return the next active range of PFNs for a node 2869 * Basic iterator support. Return the next active range of PFNs for a node
2869 * Note: nid == MAX_NUMNODES returns next region regardless of node 2870 * Note: nid == MAX_NUMNODES returns next region regardless of node
2870 */ 2871 */
2871 static int __meminit next_active_region_index_in_nid(int index, int nid) 2872 static int __meminit next_active_region_index_in_nid(int index, int nid)
2872 { 2873 {
2873 for (index = index + 1; index < nr_nodemap_entries; index++) 2874 for (index = index + 1; index < nr_nodemap_entries; index++)
2874 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) 2875 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
2875 return index; 2876 return index;
2876 2877
2877 return -1; 2878 return -1;
2878 } 2879 }
2879 2880
2880 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 2881 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
2881 /* 2882 /*
2882 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 2883 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
2883 * Architectures may implement their own version but if add_active_range() 2884 * Architectures may implement their own version but if add_active_range()
2884 * was used and there are no special requirements, this is a convenient 2885 * was used and there are no special requirements, this is a convenient
2885 * alternative 2886 * alternative
2886 */ 2887 */
2887 int __meminit early_pfn_to_nid(unsigned long pfn) 2888 int __meminit early_pfn_to_nid(unsigned long pfn)
2888 { 2889 {
2889 int i; 2890 int i;
2890 2891
2891 for (i = 0; i < nr_nodemap_entries; i++) { 2892 for (i = 0; i < nr_nodemap_entries; i++) {
2892 unsigned long start_pfn = early_node_map[i].start_pfn; 2893 unsigned long start_pfn = early_node_map[i].start_pfn;
2893 unsigned long end_pfn = early_node_map[i].end_pfn; 2894 unsigned long end_pfn = early_node_map[i].end_pfn;
2894 2895
2895 if (start_pfn <= pfn && pfn < end_pfn) 2896 if (start_pfn <= pfn && pfn < end_pfn)
2896 return early_node_map[i].nid; 2897 return early_node_map[i].nid;
2897 } 2898 }
2898 2899
2899 return 0; 2900 return 0;
2900 } 2901 }
2901 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 2902 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
2902 2903
2903 /* Basic iterator support to walk early_node_map[] */ 2904 /* Basic iterator support to walk early_node_map[] */
2904 #define for_each_active_range_index_in_nid(i, nid) \ 2905 #define for_each_active_range_index_in_nid(i, nid) \
2905 for (i = first_active_region_index_in_nid(nid); i != -1; \ 2906 for (i = first_active_region_index_in_nid(nid); i != -1; \
2906 i = next_active_region_index_in_nid(i, nid)) 2907 i = next_active_region_index_in_nid(i, nid))
2907 2908
2908 /** 2909 /**
2909 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 2910 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
2910 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 2911 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
2911 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 2912 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
2912 * 2913 *
2913 * If an architecture guarantees that all ranges registered with 2914 * If an architecture guarantees that all ranges registered with
2914 * add_active_ranges() contain no holes and may be freed, 2915 * add_active_ranges() contain no holes and may be freed,
2915 * this function may be used instead of calling free_bootmem() manually. 2916 * this function may be used instead of calling free_bootmem() manually.
2916 */ 2917 */
2917 void __init free_bootmem_with_active_regions(int nid, 2918 void __init free_bootmem_with_active_regions(int nid,
2918 unsigned long max_low_pfn) 2919 unsigned long max_low_pfn)
2919 { 2920 {
2920 int i; 2921 int i;
2921 2922
2922 for_each_active_range_index_in_nid(i, nid) { 2923 for_each_active_range_index_in_nid(i, nid) {
2923 unsigned long size_pages = 0; 2924 unsigned long size_pages = 0;
2924 unsigned long end_pfn = early_node_map[i].end_pfn; 2925 unsigned long end_pfn = early_node_map[i].end_pfn;
2925 2926
2926 if (early_node_map[i].start_pfn >= max_low_pfn) 2927 if (early_node_map[i].start_pfn >= max_low_pfn)
2927 continue; 2928 continue;
2928 2929
2929 if (end_pfn > max_low_pfn) 2930 if (end_pfn > max_low_pfn)
2930 end_pfn = max_low_pfn; 2931 end_pfn = max_low_pfn;
2931 2932
2932 size_pages = end_pfn - early_node_map[i].start_pfn; 2933 size_pages = end_pfn - early_node_map[i].start_pfn;
2933 free_bootmem_node(NODE_DATA(early_node_map[i].nid), 2934 free_bootmem_node(NODE_DATA(early_node_map[i].nid),
2934 PFN_PHYS(early_node_map[i].start_pfn), 2935 PFN_PHYS(early_node_map[i].start_pfn),
2935 size_pages << PAGE_SHIFT); 2936 size_pages << PAGE_SHIFT);
2936 } 2937 }
2937 } 2938 }
2938 2939
2939 void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) 2940 void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
2940 { 2941 {
2941 int i; 2942 int i;
2942 int ret; 2943 int ret;
2943 2944
2944 for_each_active_range_index_in_nid(i, nid) { 2945 for_each_active_range_index_in_nid(i, nid) {
2945 ret = work_fn(early_node_map[i].start_pfn, 2946 ret = work_fn(early_node_map[i].start_pfn,
2946 early_node_map[i].end_pfn, data); 2947 early_node_map[i].end_pfn, data);
2947 if (ret) 2948 if (ret)
2948 break; 2949 break;
2949 } 2950 }
2950 } 2951 }
2951 /** 2952 /**
2952 * sparse_memory_present_with_active_regions - Call memory_present for each active range 2953 * sparse_memory_present_with_active_regions - Call memory_present for each active range
2953 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 2954 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
2954 * 2955 *
2955 * If an architecture guarantees that all ranges registered with 2956 * If an architecture guarantees that all ranges registered with
2956 * add_active_ranges() contain no holes and may be freed, this 2957 * add_active_ranges() contain no holes and may be freed, this
2957 * function may be used instead of calling memory_present() manually. 2958 * function may be used instead of calling memory_present() manually.
2958 */ 2959 */
2959 void __init sparse_memory_present_with_active_regions(int nid) 2960 void __init sparse_memory_present_with_active_regions(int nid)
2960 { 2961 {
2961 int i; 2962 int i;
2962 2963
2963 for_each_active_range_index_in_nid(i, nid) 2964 for_each_active_range_index_in_nid(i, nid)
2964 memory_present(early_node_map[i].nid, 2965 memory_present(early_node_map[i].nid,
2965 early_node_map[i].start_pfn, 2966 early_node_map[i].start_pfn,
2966 early_node_map[i].end_pfn); 2967 early_node_map[i].end_pfn);
2967 } 2968 }
2968 2969
2969 /** 2970 /**
2970 * push_node_boundaries - Push node boundaries to at least the requested boundary 2971 * push_node_boundaries - Push node boundaries to at least the requested boundary
2971 * @nid: The nid of the node to push the boundary for 2972 * @nid: The nid of the node to push the boundary for
2972 * @start_pfn: The start pfn of the node 2973 * @start_pfn: The start pfn of the node
2973 * @end_pfn: The end pfn of the node 2974 * @end_pfn: The end pfn of the node
2974 * 2975 *
2975 * In reserve-based hot-add, a mem_map is allocated that remains unused until hotadd 2976 * In reserve-based hot-add, a mem_map is allocated that remains unused until hotadd
2976 * time. Specifically, on x86_64, SRAT will report ranges that can potentially 2977 * time. Specifically, on x86_64, SRAT will report ranges that can potentially
2977 * be hotplugged even though no physical memory exists. This function allows 2978 * be hotplugged even though no physical memory exists. This function allows
2978 * an arch to push out the node boundaries so mem_map is allocated that can 2979 * an arch to push out the node boundaries so mem_map is allocated that can
2979 * be used later. 2980 * be used later.
2980 */ 2981 */
2981 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE 2982 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
2982 void __init push_node_boundaries(unsigned int nid, 2983 void __init push_node_boundaries(unsigned int nid,
2983 unsigned long start_pfn, unsigned long end_pfn) 2984 unsigned long start_pfn, unsigned long end_pfn)
2984 { 2985 {
2985 mminit_dprintk(MMINIT_TRACE, "zoneboundary", 2986 mminit_dprintk(MMINIT_TRACE, "zoneboundary",
2986 "Entering push_node_boundaries(%u, %lu, %lu)\n", 2987 "Entering push_node_boundaries(%u, %lu, %lu)\n",
2987 nid, start_pfn, end_pfn); 2988 nid, start_pfn, end_pfn);
2988 2989
2989 /* Initialise the boundary for this node if necessary */ 2990 /* Initialise the boundary for this node if necessary */
2990 if (node_boundary_end_pfn[nid] == 0) 2991 if (node_boundary_end_pfn[nid] == 0)
2991 node_boundary_start_pfn[nid] = -1UL; 2992 node_boundary_start_pfn[nid] = -1UL;
2992 2993
2993 /* Update the boundaries */ 2994 /* Update the boundaries */
2994 if (node_boundary_start_pfn[nid] > start_pfn) 2995 if (node_boundary_start_pfn[nid] > start_pfn)
2995 node_boundary_start_pfn[nid] = start_pfn; 2996 node_boundary_start_pfn[nid] = start_pfn;
2996 if (node_boundary_end_pfn[nid] < end_pfn) 2997 if (node_boundary_end_pfn[nid] < end_pfn)
2997 node_boundary_end_pfn[nid] = end_pfn; 2998 node_boundary_end_pfn[nid] = end_pfn;
2998 } 2999 }
2999 3000
3000 /* If necessary, push the node boundary out for reserve hotadd */ 3001 /* If necessary, push the node boundary out for reserve hotadd */
3001 static void __meminit account_node_boundary(unsigned int nid, 3002 static void __meminit account_node_boundary(unsigned int nid,
3002 unsigned long *start_pfn, unsigned long *end_pfn) 3003 unsigned long *start_pfn, unsigned long *end_pfn)
3003 { 3004 {
3004 mminit_dprintk(MMINIT_TRACE, "zoneboundary", 3005 mminit_dprintk(MMINIT_TRACE, "zoneboundary",
3005 "Entering account_node_boundary(%u, %lu, %lu)\n", 3006 "Entering account_node_boundary(%u, %lu, %lu)\n",
3006 nid, *start_pfn, *end_pfn); 3007 nid, *start_pfn, *end_pfn);
3007 3008
3008 /* Return if boundary information has not been provided */ 3009 /* Return if boundary information has not been provided */
3009 if (node_boundary_end_pfn[nid] == 0) 3010 if (node_boundary_end_pfn[nid] == 0)
3010 return; 3011 return;
3011 3012
3012 /* Check the boundaries and update if necessary */ 3013 /* Check the boundaries and update if necessary */
3013 if (node_boundary_start_pfn[nid] < *start_pfn) 3014 if (node_boundary_start_pfn[nid] < *start_pfn)
3014 *start_pfn = node_boundary_start_pfn[nid]; 3015 *start_pfn = node_boundary_start_pfn[nid];
3015 if (node_boundary_end_pfn[nid] > *end_pfn) 3016 if (node_boundary_end_pfn[nid] > *end_pfn)
3016 *end_pfn = node_boundary_end_pfn[nid]; 3017 *end_pfn = node_boundary_end_pfn[nid];
3017 } 3018 }
3018 #else 3019 #else
3019 void __init push_node_boundaries(unsigned int nid, 3020 void __init push_node_boundaries(unsigned int nid,
3020 unsigned long start_pfn, unsigned long end_pfn) {} 3021 unsigned long start_pfn, unsigned long end_pfn) {}
3021 3022
3022 static void __meminit account_node_boundary(unsigned int nid, 3023 static void __meminit account_node_boundary(unsigned int nid,
3023 unsigned long *start_pfn, unsigned long *end_pfn) {} 3024 unsigned long *start_pfn, unsigned long *end_pfn) {}
3024 #endif 3025 #endif
3025 3026
3026 3027
3027 /** 3028 /**
3028 * get_pfn_range_for_nid - Return the start and end page frames for a node 3029 * get_pfn_range_for_nid - Return the start and end page frames for a node
3029 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 3030 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
3030 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 3031 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
3031 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 3032 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
3032 * 3033 *
3033 * It returns the start and end page frame of a node based on information 3034 * It returns the start and end page frame of a node based on information
3034 * provided by an arch calling add_active_range(). If called for a node 3035 * provided by an arch calling add_active_range(). If called for a node
3035 * with no available memory, a warning is printed and the start and end 3036 * with no available memory, a warning is printed and the start and end
3036 * PFNs will be 0. 3037 * PFNs will be 0.
3037 */ 3038 */
3038 void __meminit get_pfn_range_for_nid(unsigned int nid, 3039 void __meminit get_pfn_range_for_nid(unsigned int nid,
3039 unsigned long *start_pfn, unsigned long *end_pfn) 3040 unsigned long *start_pfn, unsigned long *end_pfn)
3040 { 3041 {
3041 int i; 3042 int i;
3042 *start_pfn = -1UL; 3043 *start_pfn = -1UL;
3043 *end_pfn = 0; 3044 *end_pfn = 0;
3044 3045
3045 for_each_active_range_index_in_nid(i, nid) { 3046 for_each_active_range_index_in_nid(i, nid) {
3046 *start_pfn = min(*start_pfn, early_node_map[i].start_pfn); 3047 *start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
3047 *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); 3048 *end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
3048 } 3049 }
3049 3050
3050 if (*start_pfn == -1UL) 3051 if (*start_pfn == -1UL)
3051 *start_pfn = 0; 3052 *start_pfn = 0;
3052 3053
3053 /* Push the node boundaries out if requested */ 3054 /* Push the node boundaries out if requested */
3054 account_node_boundary(nid, start_pfn, end_pfn); 3055 account_node_boundary(nid, start_pfn, end_pfn);
3055 } 3056 }
3056 3057
3057 /* 3058 /*
3058 * This finds a zone that can be used for ZONE_MOVABLE pages. The 3059 * This finds a zone that can be used for ZONE_MOVABLE pages. The
3059 * assumption is made that zones within a node are ordered in monotonically 3060 * assumption is made that zones within a node are ordered in monotonically
3060 * increasing memory addresses so that the "highest" populated zone is used 3061 * increasing memory addresses so that the "highest" populated zone is used
3061 */ 3062 */
3062 void __init find_usable_zone_for_movable(void) 3063 void __init find_usable_zone_for_movable(void)
3063 { 3064 {
3064 int zone_index; 3065 int zone_index;
3065 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 3066 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
3066 if (zone_index == ZONE_MOVABLE) 3067 if (zone_index == ZONE_MOVABLE)
3067 continue; 3068 continue;
3068 3069
3069 if (arch_zone_highest_possible_pfn[zone_index] > 3070 if (arch_zone_highest_possible_pfn[zone_index] >
3070 arch_zone_lowest_possible_pfn[zone_index]) 3071 arch_zone_lowest_possible_pfn[zone_index])
3071 break; 3072 break;
3072 } 3073 }
3073 3074
3074 VM_BUG_ON(zone_index == -1); 3075 VM_BUG_ON(zone_index == -1);
3075 movable_zone = zone_index; 3076 movable_zone = zone_index;
3076 } 3077 }
3077 3078
3078 /* 3079 /*
3079 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 3080 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
3080 * because it is sized independently of the architecture. Unlike the other zones, 3081 * because it is sized independently of the architecture. Unlike the other zones,
3081 * the starting point for ZONE_MOVABLE is not fixed. It may be different 3082 * the starting point for ZONE_MOVABLE is not fixed. It may be different
3082 * in each node depending on the size of each node and how evenly kernelcore 3083 * in each node depending on the size of each node and how evenly kernelcore
3083 * is distributed. This helper function adjusts the zone ranges 3084 * is distributed. This helper function adjusts the zone ranges
3084 * provided by the architecture for a given node by using the end of the 3085 * provided by the architecture for a given node by using the end of the
3085 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 3086 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
3086 * zones within a node are in order of monotonically increasing memory addresses 3087 * zones within a node are in order of monotonically increasing memory addresses
3087 */ 3088 */
3088 void __meminit adjust_zone_range_for_zone_movable(int nid, 3089 void __meminit adjust_zone_range_for_zone_movable(int nid,
3089 unsigned long zone_type, 3090 unsigned long zone_type,
3090 unsigned long node_start_pfn, 3091 unsigned long node_start_pfn,
3091 unsigned long node_end_pfn, 3092 unsigned long node_end_pfn,
3092 unsigned long *zone_start_pfn, 3093 unsigned long *zone_start_pfn,
3093 unsigned long *zone_end_pfn) 3094 unsigned long *zone_end_pfn)
3094 { 3095 {
3095 /* Only adjust if ZONE_MOVABLE is on this node */ 3096 /* Only adjust if ZONE_MOVABLE is on this node */
3096 if (zone_movable_pfn[nid]) { 3097 if (zone_movable_pfn[nid]) {
3097 /* Size ZONE_MOVABLE */ 3098 /* Size ZONE_MOVABLE */
3098 if (zone_type == ZONE_MOVABLE) { 3099 if (zone_type == ZONE_MOVABLE) {
3099 *zone_start_pfn = zone_movable_pfn[nid]; 3100 *zone_start_pfn = zone_movable_pfn[nid];
3100 *zone_end_pfn = min(node_end_pfn, 3101 *zone_end_pfn = min(node_end_pfn,
3101 arch_zone_highest_possible_pfn[movable_zone]); 3102 arch_zone_highest_possible_pfn[movable_zone]);
3102 3103
3103 /* Adjust for ZONE_MOVABLE starting within this range */ 3104 /* Adjust for ZONE_MOVABLE starting within this range */
3104 } else if (*zone_start_pfn < zone_movable_pfn[nid] && 3105 } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
3105 *zone_end_pfn > zone_movable_pfn[nid]) { 3106 *zone_end_pfn > zone_movable_pfn[nid]) {
3106 *zone_end_pfn = zone_movable_pfn[nid]; 3107 *zone_end_pfn = zone_movable_pfn[nid];
3107 3108
3108 /* Check if this whole range is within ZONE_MOVABLE */ 3109 /* Check if this whole range is within ZONE_MOVABLE */
3109 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 3110 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
3110 *zone_start_pfn = *zone_end_pfn; 3111 *zone_start_pfn = *zone_end_pfn;
3111 } 3112 }
3112 } 3113 }
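
The three branches above are easier to see with concrete numbers. The sketch below mirrors the same logic in userspace for a single hypothetical node; all pfn values and the sk_ names are invented for illustration, with highest_possible standing in for arch_zone_highest_possible_pfn[movable_zone]:

#include <stdio.h>

static void sk_adjust(unsigned long movable_start,	/* zone_movable_pfn[nid] */
		      unsigned long node_end,
		      unsigned long highest_possible,
		      int is_movable_zone,
		      unsigned long *zs, unsigned long *ze)
{
	if (!movable_start)
		return;				/* no ZONE_MOVABLE on this node */
	if (is_movable_zone) {
		*zs = movable_start;		/* size ZONE_MOVABLE itself */
		*ze = node_end < highest_possible ? node_end : highest_possible;
	} else if (*zs < movable_start && *ze > movable_start) {
		*ze = movable_start;		/* ZONE_MOVABLE starts inside this zone */
	} else if (*zs >= movable_start) {
		*zs = *ze;			/* zone lies wholly within ZONE_MOVABLE */
	}
}

int main(void)
{
	unsigned long zs = 100000, ze = 200000;	/* hypothetical highest kernel zone */

	sk_adjust(150000, 200000, 200000, 0, &zs, &ze);
	printf("kernel zone: %lu-%lu\n", zs, ze);	/* 100000-150000 */

	zs = ze = 0;
	sk_adjust(150000, 200000, 200000, 1, &zs, &ze);
	printf("ZONE_MOVABLE: %lu-%lu\n", zs, ze);	/* 150000-200000 */
	return 0;
}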
3113 3114
3114 /* 3115 /*
3115 * Return the number of pages a zone spans in a node, including holes 3116 * Return the number of pages a zone spans in a node, including holes
3116 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 3117 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
3117 */ 3118 */
3118 static unsigned long __meminit zone_spanned_pages_in_node(int nid, 3119 static unsigned long __meminit zone_spanned_pages_in_node(int nid,
3119 unsigned long zone_type, 3120 unsigned long zone_type,
3120 unsigned long *ignored) 3121 unsigned long *ignored)
3121 { 3122 {
3122 unsigned long node_start_pfn, node_end_pfn; 3123 unsigned long node_start_pfn, node_end_pfn;
3123 unsigned long zone_start_pfn, zone_end_pfn; 3124 unsigned long zone_start_pfn, zone_end_pfn;
3124 3125
3125 /* Get the start and end of the node and zone */ 3126 /* Get the start and end of the node and zone */
3126 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 3127 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
3127 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 3128 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
3128 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 3129 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
3129 adjust_zone_range_for_zone_movable(nid, zone_type, 3130 adjust_zone_range_for_zone_movable(nid, zone_type,
3130 node_start_pfn, node_end_pfn, 3131 node_start_pfn, node_end_pfn,
3131 &zone_start_pfn, &zone_end_pfn); 3132 &zone_start_pfn, &zone_end_pfn);
3132 3133
3133 /* Check that this node has pages within the zone's required range */ 3134 /* Check that this node has pages within the zone's required range */
3134 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 3135 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
3135 return 0; 3136 return 0;
3136 3137
3137 /* Move the zone boundaries inside the node if necessary */ 3138 /* Move the zone boundaries inside the node if necessary */
3138 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 3139 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
3139 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 3140 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
3140 3141
3141 /* Return the spanned pages */ 3142 /* Return the spanned pages */
3142 return zone_end_pfn - zone_start_pfn; 3143 return zone_end_pfn - zone_start_pfn;
3143 } 3144 }
3144 3145
3145 /* 3146 /*
3146 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 3147 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
3147 * then all holes in the requested range will be accounted for. 3148 * then all holes in the requested range will be accounted for.
3148 */ 3149 */
3149 unsigned long __meminit __absent_pages_in_range(int nid, 3150 unsigned long __meminit __absent_pages_in_range(int nid,
3150 unsigned long range_start_pfn, 3151 unsigned long range_start_pfn,
3151 unsigned long range_end_pfn) 3152 unsigned long range_end_pfn)
3152 { 3153 {
3153 int i = 0; 3154 int i = 0;
3154 unsigned long prev_end_pfn = 0, hole_pages = 0; 3155 unsigned long prev_end_pfn = 0, hole_pages = 0;
3155 unsigned long start_pfn; 3156 unsigned long start_pfn;
3156 3157
3157 /* Find the end_pfn of the first active range of pfns in the node */ 3158 /* Find the end_pfn of the first active range of pfns in the node */
3158 i = first_active_region_index_in_nid(nid); 3159 i = first_active_region_index_in_nid(nid);
3159 if (i == -1) 3160 if (i == -1)
3160 return 0; 3161 return 0;
3161 3162
3162 prev_end_pfn = min(early_node_map[i].start_pfn, range_end_pfn); 3163 prev_end_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
3163 3164
3164 /* Account for ranges before physical memory on this node */ 3165 /* Account for ranges before physical memory on this node */
3165 if (early_node_map[i].start_pfn > range_start_pfn) 3166 if (early_node_map[i].start_pfn > range_start_pfn)
3166 hole_pages = prev_end_pfn - range_start_pfn; 3167 hole_pages = prev_end_pfn - range_start_pfn;
3167 3168
3168 /* Find all holes for the zone within the node */ 3169 /* Find all holes for the zone within the node */
3169 for (; i != -1; i = next_active_region_index_in_nid(i, nid)) { 3170 for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
3170 3171
3171 /* No need to continue if prev_end_pfn is outside the zone */ 3172 /* No need to continue if prev_end_pfn is outside the zone */
3172 if (prev_end_pfn >= range_end_pfn) 3173 if (prev_end_pfn >= range_end_pfn)
3173 break; 3174 break;
3174 3175
3175 /* Make sure the end of the zone is not within the hole */ 3176 /* Make sure the end of the zone is not within the hole */
3176 start_pfn = min(early_node_map[i].start_pfn, range_end_pfn); 3177 start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
3177 prev_end_pfn = max(prev_end_pfn, range_start_pfn); 3178 prev_end_pfn = max(prev_end_pfn, range_start_pfn);
3178 3179
3179 /* Update the hole size count and move on */ 3180 /* Update the hole size count and move on */
3180 if (start_pfn > range_start_pfn) { 3181 if (start_pfn > range_start_pfn) {
3181 BUG_ON(prev_end_pfn > start_pfn); 3182 BUG_ON(prev_end_pfn > start_pfn);
3182 hole_pages += start_pfn - prev_end_pfn; 3183 hole_pages += start_pfn - prev_end_pfn;
3183 } 3184 }
3184 prev_end_pfn = early_node_map[i].end_pfn; 3185 prev_end_pfn = early_node_map[i].end_pfn;
3185 } 3186 }
3186 3187
3187 /* Account for ranges past physical memory on this node */ 3188 /* Account for ranges past physical memory on this node */
3188 if (range_end_pfn > prev_end_pfn) 3189 if (range_end_pfn > prev_end_pfn)
3189 hole_pages += range_end_pfn - 3190 hole_pages += range_end_pfn -
3190 max(range_start_pfn, prev_end_pfn); 3191 max(range_start_pfn, prev_end_pfn);
3191 3192
3192 return hole_pages; 3193 return hole_pages;
3193 } 3194 }
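
With sorted, non-overlapping active ranges, the hole count above reduces to the span of the requested range minus whatever the active ranges cover. The userspace sketch below uses that simpler formulation under those assumptions; the example map and the sk_ names are hypothetical:

#include <stdio.h>

struct sk_range { unsigned long start_pfn, end_pfn; };

static unsigned long sk_absent_pages(const struct sk_range *map, int n,
				     unsigned long start, unsigned long end)
{
	unsigned long covered = 0;
	int i;

	for (i = 0; i < n; i++) {
		/* clip each active range to [start, end) and count what it covers */
		unsigned long s = map[i].start_pfn > start ? map[i].start_pfn : start;
		unsigned long e = map[i].end_pfn < end ? map[i].end_pfn : end;

		if (s < e)
			covered += e - s;
	}
	return (end - start) - covered;
}

int main(void)
{
	/* two active ranges with a 256-page hole between them */
	struct sk_range map[] = { { 0, 1024 }, { 1280, 4096 } };

	printf("%lu absent pages\n", sk_absent_pages(map, 2, 0, 4096));	/* 256 */
	return 0;
}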
3194 3195
3195 /** 3196 /**
3196 * absent_pages_in_range - Return number of page frames in holes within a range 3197 * absent_pages_in_range - Return number of page frames in holes within a range
3197 * @start_pfn: The start PFN to start searching for holes 3198 * @start_pfn: The start PFN to start searching for holes
3198 * @end_pfn: The end PFN to stop searching for holes 3199 * @end_pfn: The end PFN to stop searching for holes
3199 * 3200 *
3200 * It returns the number of page frames in memory holes within a range. 3201 * It returns the number of page frames in memory holes within a range.
3201 */ 3202 */
3202 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 3203 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
3203 unsigned long end_pfn) 3204 unsigned long end_pfn)
3204 { 3205 {
3205 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 3206 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
3206 } 3207 }
3207 3208
3208 /* Return the number of page frames in holes in a zone on a node */ 3209 /* Return the number of page frames in holes in a zone on a node */
3209 static unsigned long __meminit zone_absent_pages_in_node(int nid, 3210 static unsigned long __meminit zone_absent_pages_in_node(int nid,
3210 unsigned long zone_type, 3211 unsigned long zone_type,
3211 unsigned long *ignored) 3212 unsigned long *ignored)
3212 { 3213 {
3213 unsigned long node_start_pfn, node_end_pfn; 3214 unsigned long node_start_pfn, node_end_pfn;
3214 unsigned long zone_start_pfn, zone_end_pfn; 3215 unsigned long zone_start_pfn, zone_end_pfn;
3215 3216
3216 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 3217 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
3217 zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type], 3218 zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
3218 node_start_pfn); 3219 node_start_pfn);
3219 zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type], 3220 zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
3220 node_end_pfn); 3221 node_end_pfn);
3221 3222
3222 adjust_zone_range_for_zone_movable(nid, zone_type, 3223 adjust_zone_range_for_zone_movable(nid, zone_type,
3223 node_start_pfn, node_end_pfn, 3224 node_start_pfn, node_end_pfn,
3224 &zone_start_pfn, &zone_end_pfn); 3225 &zone_start_pfn, &zone_end_pfn);
3225 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 3226 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
3226 } 3227 }
3227 3228
3228 #else 3229 #else
3229 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 3230 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
3230 unsigned long zone_type, 3231 unsigned long zone_type,
3231 unsigned long *zones_size) 3232 unsigned long *zones_size)
3232 { 3233 {
3233 return zones_size[zone_type]; 3234 return zones_size[zone_type];
3234 } 3235 }
3235 3236
3236 static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 3237 static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
3237 unsigned long zone_type, 3238 unsigned long zone_type,
3238 unsigned long *zholes_size) 3239 unsigned long *zholes_size)
3239 { 3240 {
3240 if (!zholes_size) 3241 if (!zholes_size)
3241 return 0; 3242 return 0;
3242 3243
3243 return zholes_size[zone_type]; 3244 return zholes_size[zone_type];
3244 } 3245 }
3245 3246
3246 #endif 3247 #endif
3247 3248
3248 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 3249 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
3249 unsigned long *zones_size, unsigned long *zholes_size) 3250 unsigned long *zones_size, unsigned long *zholes_size)
3250 { 3251 {
3251 unsigned long realtotalpages, totalpages = 0; 3252 unsigned long realtotalpages, totalpages = 0;
3252 enum zone_type i; 3253 enum zone_type i;
3253 3254
3254 for (i = 0; i < MAX_NR_ZONES; i++) 3255 for (i = 0; i < MAX_NR_ZONES; i++)
3255 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 3256 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
3256 zones_size); 3257 zones_size);
3257 pgdat->node_spanned_pages = totalpages; 3258 pgdat->node_spanned_pages = totalpages;
3258 3259
3259 realtotalpages = totalpages; 3260 realtotalpages = totalpages;
3260 for (i = 0; i < MAX_NR_ZONES; i++) 3261 for (i = 0; i < MAX_NR_ZONES; i++)
3261 realtotalpages -= 3262 realtotalpages -=
3262 zone_absent_pages_in_node(pgdat->node_id, i, 3263 zone_absent_pages_in_node(pgdat->node_id, i,
3263 zholes_size); 3264 zholes_size);
3264 pgdat->node_present_pages = realtotalpages; 3265 pgdat->node_present_pages = realtotalpages;
3265 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 3266 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
3266 realtotalpages); 3267 realtotalpages);
3267 } 3268 }
3268 3269
3269 #ifndef CONFIG_SPARSEMEM 3270 #ifndef CONFIG_SPARSEMEM
3270 /* 3271 /*
3271 * Calculate the size of the zone->blockflags rounded to an unsigned long 3272 * Calculate the size of the zone->blockflags rounded to an unsigned long
3272 * Start by making sure zonesize is a multiple of pageblock_nr_pages by rounding 3273 * Start by making sure zonesize is a multiple of pageblock_nr_pages by rounding
3273 * up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock, finally 3274 * up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
3274 * round what is now in bits up to the nearest long in bits, then return it in 3275 * round what is now in bits up to the nearest long in bits, then return it in
3275 * bytes. 3276 * bytes.
3276 */ 3277 */
3277 static unsigned long __init usemap_size(unsigned long zonesize) 3278 static unsigned long __init usemap_size(unsigned long zonesize)
3278 { 3279 {
3279 unsigned long usemapsize; 3280 unsigned long usemapsize;
3280 3281
3281 usemapsize = roundup(zonesize, pageblock_nr_pages); 3282 usemapsize = roundup(zonesize, pageblock_nr_pages);
3282 usemapsize = usemapsize >> pageblock_order; 3283 usemapsize = usemapsize >> pageblock_order;
3283 usemapsize *= NR_PAGEBLOCK_BITS; 3284 usemapsize *= NR_PAGEBLOCK_BITS;
3284 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); 3285 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
3285 3286
3286 return usemapsize / 8; 3287 return usemapsize / 8;
3287 } 3288 }
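
For a sense of scale, the sketch below reproduces the usemap_size() arithmetic in userspace; the order-9 pageblock, NR_PAGEBLOCK_BITS == 4 and the example zone size are assumptions for illustration only:

#include <stdio.h>

#define SK_PAGEBLOCK_ORDER	9UL
#define SK_PAGEBLOCK_NR_PAGES	(1UL << SK_PAGEBLOCK_ORDER)
#define SK_NR_PAGEBLOCK_BITS	4UL

static unsigned long sk_roundup(unsigned long x, unsigned long to)
{
	return ((x + to - 1) / to) * to;
}

int main(void)
{
	unsigned long zonesize = 1000000;	/* hypothetical zone size, in pages */
	unsigned long usemapsize;

	usemapsize = sk_roundup(zonesize, SK_PAGEBLOCK_NR_PAGES);
	usemapsize >>= SK_PAGEBLOCK_ORDER;
	usemapsize *= SK_NR_PAGEBLOCK_BITS;
	usemapsize = sk_roundup(usemapsize, 8 * sizeof(unsigned long));

	/* a zone of roughly 3.8 GiB needs well under a kilobyte of pageblock flags */
	printf("%lu bytes of pageblock flags\n", usemapsize / 8);	/* 984 on LP64 */
	return 0;
}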
3288 3289
3289 static void __init setup_usemap(struct pglist_data *pgdat, 3290 static void __init setup_usemap(struct pglist_data *pgdat,
3290 struct zone *zone, unsigned long zonesize) 3291 struct zone *zone, unsigned long zonesize)
3291 { 3292 {
3292 unsigned long usemapsize = usemap_size(zonesize); 3293 unsigned long usemapsize = usemap_size(zonesize);
3293 zone->pageblock_flags = NULL; 3294 zone->pageblock_flags = NULL;
3294 if (usemapsize) { 3295 if (usemapsize) {
3295 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); 3296 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
3296 memset(zone->pageblock_flags, 0, usemapsize); 3297 memset(zone->pageblock_flags, 0, usemapsize);
3297 } 3298 }
3298 } 3299 }
3299 #else 3300 #else
3300 static inline void setup_usemap(struct pglist_data *pgdat, 3301 static inline void setup_usemap(struct pglist_data *pgdat,
3301 struct zone *zone, unsigned long zonesize) {} 3302 struct zone *zone, unsigned long zonesize) {}
3302 #endif /* CONFIG_SPARSEMEM */ 3303 #endif /* CONFIG_SPARSEMEM */
3303 3304
3304 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 3305 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
3305 3306
3306 /* Return a sensible default order for the pageblock size. */ 3307 /* Return a sensible default order for the pageblock size. */
3307 static inline int pageblock_default_order(void) 3308 static inline int pageblock_default_order(void)
3308 { 3309 {
3309 if (HPAGE_SHIFT > PAGE_SHIFT) 3310 if (HPAGE_SHIFT > PAGE_SHIFT)
3310 return HUGETLB_PAGE_ORDER; 3311 return HUGETLB_PAGE_ORDER;
3311 3312
3312 return MAX_ORDER-1; 3313 return MAX_ORDER-1;
3313 } 3314 }
3314 3315
3315 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 3316 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
3316 static inline void __init set_pageblock_order(unsigned int order) 3317 static inline void __init set_pageblock_order(unsigned int order)
3317 { 3318 {
3318 /* Check that pageblock_nr_pages has not already been setup */ 3319 /* Check that pageblock_nr_pages has not already been setup */
3319 if (pageblock_order) 3320 if (pageblock_order)
3320 return; 3321 return;
3321 3322
3322 /* 3323 /*
3323 * Assume the largest contiguous order of interest is a huge page. 3324 * Assume the largest contiguous order of interest is a huge page.
3324 * This value may be variable depending on boot parameters on IA64 3325 * This value may be variable depending on boot parameters on IA64
3325 */ 3326 */
3326 pageblock_order = order; 3327 pageblock_order = order;
3327 } 3328 }
3328 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 3329 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
3329 3330
3330 /* 3331 /*
3331 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 3332 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
3332 * and pageblock_default_order() are unused as pageblock_order is set 3333 * and pageblock_default_order() are unused as pageblock_order is set
3333 * at compile-time. See include/linux/pageblock-flags.h for the values of 3334 * at compile-time. See include/linux/pageblock-flags.h for the values of
3334 * pageblock_order based on the kernel config 3335 * pageblock_order based on the kernel config
3335 */ 3336 */
3336 static inline int pageblock_default_order(unsigned int order) 3337 static inline int pageblock_default_order(unsigned int order)
3337 { 3338 {
3338 return MAX_ORDER-1; 3339 return MAX_ORDER-1;
3339 } 3340 }
3340 #define set_pageblock_order(x) do {} while (0) 3341 #define set_pageblock_order(x) do {} while (0)
3341 3342
3342 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 3343 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
3343 3344
3344 /* 3345 /*
3345 * Set up the zone data structures: 3346 * Set up the zone data structures:
3346 * - mark all pages reserved 3347 * - mark all pages reserved
3347 * - mark all memory queues empty 3348 * - mark all memory queues empty
3348 * - clear the memory bitmaps 3349 * - clear the memory bitmaps
3349 */ 3350 */
3350 static void __paginginit free_area_init_core(struct pglist_data *pgdat, 3351 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3351 unsigned long *zones_size, unsigned long *zholes_size) 3352 unsigned long *zones_size, unsigned long *zholes_size)
3352 { 3353 {
3353 enum zone_type j; 3354 enum zone_type j;
3354 int nid = pgdat->node_id; 3355 int nid = pgdat->node_id;
3355 unsigned long zone_start_pfn = pgdat->node_start_pfn; 3356 unsigned long zone_start_pfn = pgdat->node_start_pfn;
3356 int ret; 3357 int ret;
3357 3358
3358 pgdat_resize_init(pgdat); 3359 pgdat_resize_init(pgdat);
3359 pgdat->nr_zones = 0; 3360 pgdat->nr_zones = 0;
3360 init_waitqueue_head(&pgdat->kswapd_wait); 3361 init_waitqueue_head(&pgdat->kswapd_wait);
3361 pgdat->kswapd_max_order = 0; 3362 pgdat->kswapd_max_order = 0;
3362 3363
3363 for (j = 0; j < MAX_NR_ZONES; j++) { 3364 for (j = 0; j < MAX_NR_ZONES; j++) {
3364 struct zone *zone = pgdat->node_zones + j; 3365 struct zone *zone = pgdat->node_zones + j;
3365 unsigned long size, realsize, memmap_pages; 3366 unsigned long size, realsize, memmap_pages;
3366 3367
3367 size = zone_spanned_pages_in_node(nid, j, zones_size); 3368 size = zone_spanned_pages_in_node(nid, j, zones_size);
3368 realsize = size - zone_absent_pages_in_node(nid, j, 3369 realsize = size - zone_absent_pages_in_node(nid, j,
3369 zholes_size); 3370 zholes_size);
3370 3371
3371 /* 3372 /*
3372 * Adjust realsize so that it accounts for how much memory 3373 * Adjust realsize so that it accounts for how much memory
3373 * is used by this zone for memmap. This affects the watermark 3374 * is used by this zone for memmap. This affects the watermark
3374 * and per-cpu initialisations 3375 * and per-cpu initialisations
3375 */ 3376 */
3376 memmap_pages = 3377 memmap_pages =
3377 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; 3378 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
3378 if (realsize >= memmap_pages) { 3379 if (realsize >= memmap_pages) {
3379 realsize -= memmap_pages; 3380 realsize -= memmap_pages;
3380 mminit_dprintk(MMINIT_TRACE, "memmap_init", 3381 mminit_dprintk(MMINIT_TRACE, "memmap_init",
3381 "%s zone: %lu pages used for memmap\n", 3382 "%s zone: %lu pages used for memmap\n",
3382 zone_names[j], memmap_pages); 3383 zone_names[j], memmap_pages);
3383 } else 3384 } else
3384 printk(KERN_WARNING 3385 printk(KERN_WARNING
3385 " %s zone: %lu pages exceeds realsize %lu\n", 3386 " %s zone: %lu pages exceeds realsize %lu\n",
3386 zone_names[j], memmap_pages, realsize); 3387 zone_names[j], memmap_pages, realsize);
3387 3388
3388 /* Account for reserved pages */ 3389 /* Account for reserved pages */
3389 if (j == 0 && realsize > dma_reserve) { 3390 if (j == 0 && realsize > dma_reserve) {
3390 realsize -= dma_reserve; 3391 realsize -= dma_reserve;
3391 mminit_dprintk(MMINIT_TRACE, "memmap_init", 3392 mminit_dprintk(MMINIT_TRACE, "memmap_init",
3392 "%s zone: %lu pages reserved\n", 3393 "%s zone: %lu pages reserved\n",
3393 zone_names[0], dma_reserve); 3394 zone_names[0], dma_reserve);
3394 } 3395 }
3395 3396
3396 if (!is_highmem_idx(j)) 3397 if (!is_highmem_idx(j))
3397 nr_kernel_pages += realsize; 3398 nr_kernel_pages += realsize;
3398 nr_all_pages += realsize; 3399 nr_all_pages += realsize;
3399 3400
3400 zone->spanned_pages = size; 3401 zone->spanned_pages = size;
3401 zone->present_pages = realsize; 3402 zone->present_pages = realsize;
3402 #ifdef CONFIG_NUMA 3403 #ifdef CONFIG_NUMA
3403 zone->node = nid; 3404 zone->node = nid;
3404 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 3405 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
3405 / 100; 3406 / 100;
3406 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; 3407 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
3407 #endif 3408 #endif
3408 zone->name = zone_names[j]; 3409 zone->name = zone_names[j];
3409 spin_lock_init(&zone->lock); 3410 spin_lock_init(&zone->lock);
3410 spin_lock_init(&zone->lru_lock); 3411 spin_lock_init(&zone->lru_lock);
3411 zone_seqlock_init(zone); 3412 zone_seqlock_init(zone);
3412 zone->zone_pgdat = pgdat; 3413 zone->zone_pgdat = pgdat;
3413 3414
3414 zone->prev_priority = DEF_PRIORITY; 3415 zone->prev_priority = DEF_PRIORITY;
3415 3416
3416 zone_pcp_init(zone); 3417 zone_pcp_init(zone);
3417 INIT_LIST_HEAD(&zone->active_list); 3418 INIT_LIST_HEAD(&zone->active_list);
3418 INIT_LIST_HEAD(&zone->inactive_list); 3419 INIT_LIST_HEAD(&zone->inactive_list);
3419 zone->nr_scan_active = 0; 3420 zone->nr_scan_active = 0;
3420 zone->nr_scan_inactive = 0; 3421 zone->nr_scan_inactive = 0;
3421 zap_zone_vm_stats(zone); 3422 zap_zone_vm_stats(zone);
3422 zone->flags = 0; 3423 zone->flags = 0;
3423 if (!size) 3424 if (!size)
3424 continue; 3425 continue;
3425 3426
3426 set_pageblock_order(pageblock_default_order()); 3427 set_pageblock_order(pageblock_default_order());
3427 setup_usemap(pgdat, zone, size); 3428 setup_usemap(pgdat, zone, size);
3428 ret = init_currently_empty_zone(zone, zone_start_pfn, 3429 ret = init_currently_empty_zone(zone, zone_start_pfn,
3429 size, MEMMAP_EARLY); 3430 size, MEMMAP_EARLY);
3430 BUG_ON(ret); 3431 BUG_ON(ret);
3431 memmap_init(size, nid, j, zone_start_pfn); 3432 memmap_init(size, nid, j, zone_start_pfn);
3432 zone_start_pfn += size; 3433 zone_start_pfn += size;
3433 } 3434 }
3434 } 3435 }
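The memmap accounting at the top of this zone loop converts the zone's struct page overhead into whole pages before subtracting it from realsize. A minimal standalone sketch of that arithmetic, assuming 4 KiB pages and a 56-byte struct page (illustrative values only, not taken from this commit):

    #include <stdio.h>

    /* Assumed values; the real ones depend on the architecture and config. */
    #define PAGE_SHIFT      12
    #define PAGE_SIZE       (1UL << PAGE_SHIFT)
    #define PAGE_ALIGN(x)   (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
    #define STRUCT_PAGE_SZ  56UL

    int main(void)
    {
            unsigned long size = 262144;    /* zone spans 1 GiB of 4 KiB pages */
            unsigned long memmap_pages =
                    PAGE_ALIGN(size * STRUCT_PAGE_SZ) >> PAGE_SHIFT;

            /* 262144 * 56 bytes -> 3584 pages deducted from realsize */
            printf("memmap uses %lu of %lu pages\n", memmap_pages, size);
            return 0;
    }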
3435 3436
3436 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) 3437 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
3437 { 3438 {
3438 /* Skip empty nodes */ 3439 /* Skip empty nodes */
3439 if (!pgdat->node_spanned_pages) 3440 if (!pgdat->node_spanned_pages)
3440 return; 3441 return;
3441 3442
3442 #ifdef CONFIG_FLAT_NODE_MEM_MAP 3443 #ifdef CONFIG_FLAT_NODE_MEM_MAP
3443 /* ia64 gets its own node_mem_map, before this, without bootmem */ 3444 /* ia64 gets its own node_mem_map, before this, without bootmem */
3444 if (!pgdat->node_mem_map) { 3445 if (!pgdat->node_mem_map) {
3445 unsigned long size, start, end; 3446 unsigned long size, start, end;
3446 struct page *map; 3447 struct page *map;
3447 3448
3448 /* 3449 /*
3449 * The zone's endpoints aren't required to be MAX_ORDER 3450 * The zone's endpoints aren't required to be MAX_ORDER
3450 * aligned but the node_mem_map endpoints must be in order 3451 * aligned but the node_mem_map endpoints must be in order
3451 * for the buddy allocator to function correctly. 3452 * for the buddy allocator to function correctly.
3452 */ 3453 */
3453 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 3454 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
3454 end = pgdat->node_start_pfn + pgdat->node_spanned_pages; 3455 end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
3455 end = ALIGN(end, MAX_ORDER_NR_PAGES); 3456 end = ALIGN(end, MAX_ORDER_NR_PAGES);
3456 size = (end - start) * sizeof(struct page); 3457 size = (end - start) * sizeof(struct page);
3457 map = alloc_remap(pgdat->node_id, size); 3458 map = alloc_remap(pgdat->node_id, size);
3458 if (!map) 3459 if (!map)
3459 map = alloc_bootmem_node(pgdat, size); 3460 map = alloc_bootmem_node(pgdat, size);
3460 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 3461 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
3461 } 3462 }
3462 #ifndef CONFIG_NEED_MULTIPLE_NODES 3463 #ifndef CONFIG_NEED_MULTIPLE_NODES
3463 /* 3464 /*
3464 * With no DISCONTIG, the global mem_map is just set as node 0's 3465 * With no DISCONTIG, the global mem_map is just set as node 0's
3465 */ 3466 */
3466 if (pgdat == NODE_DATA(0)) { 3467 if (pgdat == NODE_DATA(0)) {
3467 mem_map = NODE_DATA(0)->node_mem_map; 3468 mem_map = NODE_DATA(0)->node_mem_map;
3468 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 3469 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
3469 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 3470 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
3470 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); 3471 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
3471 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 3472 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
3472 } 3473 }
3473 #endif 3474 #endif
3474 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 3475 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
3475 } 3476 }
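The comment in alloc_node_mem_map() notes that the map's endpoints are rounded out to MAX_ORDER boundaries even when the node itself is not aligned. A hedged sketch of that rounding, assuming MAX_ORDER_NR_PAGES is 1024 and an invented node layout:

    #include <stdio.h>

    #define MAX_ORDER_NR_PAGES 1024UL   /* assumed: MAX_ORDER of 11 */
    #define ALIGN(x, a)        (((x) + (a) - 1) & ~((a) - 1))

    int main(void)
    {
            unsigned long node_start_pfn     = 0x1234;  /* hypothetical node */
            unsigned long node_spanned_pages = 0x8000;

            unsigned long start = node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
            unsigned long end   = ALIGN(node_start_pfn + node_spanned_pages,
                                        MAX_ORDER_NR_PAGES);

            /* start rounds down to 0x1000, end rounds up to 0x9400 */
            printf("node_mem_map covers pfns %#lx-%#lx\n", start, end);
            return 0;
    }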
3476 3477
3477 void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat, 3478 void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat,
3478 unsigned long *zones_size, unsigned long node_start_pfn, 3479 unsigned long *zones_size, unsigned long node_start_pfn,
3479 unsigned long *zholes_size) 3480 unsigned long *zholes_size)
3480 { 3481 {
3481 pgdat->node_id = nid; 3482 pgdat->node_id = nid;
3482 pgdat->node_start_pfn = node_start_pfn; 3483 pgdat->node_start_pfn = node_start_pfn;
3483 calculate_node_totalpages(pgdat, zones_size, zholes_size); 3484 calculate_node_totalpages(pgdat, zones_size, zholes_size);
3484 3485
3485 alloc_node_mem_map(pgdat); 3486 alloc_node_mem_map(pgdat);
3486 #ifdef CONFIG_FLAT_NODE_MEM_MAP 3487 #ifdef CONFIG_FLAT_NODE_MEM_MAP
3487 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", 3488 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
3488 nid, (unsigned long)pgdat, 3489 nid, (unsigned long)pgdat,
3489 (unsigned long)pgdat->node_mem_map); 3490 (unsigned long)pgdat->node_mem_map);
3490 #endif 3491 #endif
3491 3492
3492 free_area_init_core(pgdat, zones_size, zholes_size); 3493 free_area_init_core(pgdat, zones_size, zholes_size);
3493 } 3494 }
3494 3495
3495 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 3496 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
3496 3497
3497 #if MAX_NUMNODES > 1 3498 #if MAX_NUMNODES > 1
3498 /* 3499 /*
3499 * Figure out the number of possible node ids. 3500 * Figure out the number of possible node ids.
3500 */ 3501 */
3501 static void __init setup_nr_node_ids(void) 3502 static void __init setup_nr_node_ids(void)
3502 { 3503 {
3503 unsigned int node; 3504 unsigned int node;
3504 unsigned int highest = 0; 3505 unsigned int highest = 0;
3505 3506
3506 for_each_node_mask(node, node_possible_map) 3507 for_each_node_mask(node, node_possible_map)
3507 highest = node; 3508 highest = node;
3508 nr_node_ids = highest + 1; 3509 nr_node_ids = highest + 1;
3509 } 3510 }
3510 #else 3511 #else
3511 static inline void setup_nr_node_ids(void) 3512 static inline void setup_nr_node_ids(void)
3512 { 3513 {
3513 } 3514 }
3514 #endif 3515 #endif
3515 3516
3516 /** 3517 /**
3517 * add_active_range - Register a range of PFNs backed by physical memory 3518 * add_active_range - Register a range of PFNs backed by physical memory
3518 * @nid: The node ID the range resides on 3519 * @nid: The node ID the range resides on
3519 * @start_pfn: The start PFN of the available physical memory 3520 * @start_pfn: The start PFN of the available physical memory
3520 * @end_pfn: The end PFN of the available physical memory 3521 * @end_pfn: The end PFN of the available physical memory
3521 * 3522 *
3522 * These ranges are stored in an early_node_map[] and later used by 3523 * These ranges are stored in an early_node_map[] and later used by
3523 * free_area_init_nodes() to calculate zone sizes and holes. If the 3524 * free_area_init_nodes() to calculate zone sizes and holes. If the
3524 * range spans a memory hole, it is up to the architecture to ensure 3525 * range spans a memory hole, it is up to the architecture to ensure
3525 * the memory is not freed by the bootmem allocator. If possible 3526 * the memory is not freed by the bootmem allocator. If possible
3526 * the range being registered will be merged with existing ranges. 3527 * the range being registered will be merged with existing ranges.
3527 */ 3528 */
3528 void __init add_active_range(unsigned int nid, unsigned long start_pfn, 3529 void __init add_active_range(unsigned int nid, unsigned long start_pfn,
3529 unsigned long end_pfn) 3530 unsigned long end_pfn)
3530 { 3531 {
3531 int i; 3532 int i;
3532 3533
3533 mminit_dprintk(MMINIT_TRACE, "memory_register", 3534 mminit_dprintk(MMINIT_TRACE, "memory_register",
3534 "Entering add_active_range(%d, %#lx, %#lx) " 3535 "Entering add_active_range(%d, %#lx, %#lx) "
3535 "%d entries of %d used\n", 3536 "%d entries of %d used\n",
3536 nid, start_pfn, end_pfn, 3537 nid, start_pfn, end_pfn,
3537 nr_nodemap_entries, MAX_ACTIVE_REGIONS); 3538 nr_nodemap_entries, MAX_ACTIVE_REGIONS);
3538 3539
3539 mminit_validate_memmodel_limits(&start_pfn, &end_pfn); 3540 mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
3540 3541
3541 /* Merge with existing active regions if possible */ 3542 /* Merge with existing active regions if possible */
3542 for (i = 0; i < nr_nodemap_entries; i++) { 3543 for (i = 0; i < nr_nodemap_entries; i++) {
3543 if (early_node_map[i].nid != nid) 3544 if (early_node_map[i].nid != nid)
3544 continue; 3545 continue;
3545 3546
3546 /* Skip if an existing region covers this new one */ 3547 /* Skip if an existing region covers this new one */
3547 if (start_pfn >= early_node_map[i].start_pfn && 3548 if (start_pfn >= early_node_map[i].start_pfn &&
3548 end_pfn <= early_node_map[i].end_pfn) 3549 end_pfn <= early_node_map[i].end_pfn)
3549 return; 3550 return;
3550 3551
3551 /* Merge forward if suitable */ 3552 /* Merge forward if suitable */
3552 if (start_pfn <= early_node_map[i].end_pfn && 3553 if (start_pfn <= early_node_map[i].end_pfn &&
3553 end_pfn > early_node_map[i].end_pfn) { 3554 end_pfn > early_node_map[i].end_pfn) {
3554 early_node_map[i].end_pfn = end_pfn; 3555 early_node_map[i].end_pfn = end_pfn;
3555 return; 3556 return;
3556 } 3557 }
3557 3558
3558 /* Merge backward if suitable */ 3559 /* Merge backward if suitable */
3559 if (start_pfn < early_node_map[i].end_pfn && 3560 if (start_pfn < early_node_map[i].end_pfn &&
3560 end_pfn >= early_node_map[i].start_pfn) { 3561 end_pfn >= early_node_map[i].start_pfn) {
3561 early_node_map[i].start_pfn = start_pfn; 3562 early_node_map[i].start_pfn = start_pfn;
3562 return; 3563 return;
3563 } 3564 }
3564 } 3565 }
3565 3566
3566 /* Check that early_node_map is large enough */ 3567 /* Check that early_node_map is large enough */
3567 if (i >= MAX_ACTIVE_REGIONS) { 3568 if (i >= MAX_ACTIVE_REGIONS) {
3568 printk(KERN_CRIT "More than %d memory regions, truncating\n", 3569 printk(KERN_CRIT "More than %d memory regions, truncating\n",
3569 MAX_ACTIVE_REGIONS); 3570 MAX_ACTIVE_REGIONS);
3570 return; 3571 return;
3571 } 3572 }
3572 3573
3573 early_node_map[i].nid = nid; 3574 early_node_map[i].nid = nid;
3574 early_node_map[i].start_pfn = start_pfn; 3575 early_node_map[i].start_pfn = start_pfn;
3575 early_node_map[i].end_pfn = end_pfn; 3576 early_node_map[i].end_pfn = end_pfn;
3576 nr_nodemap_entries = i + 1; 3577 nr_nodemap_entries = i + 1;
3577 } 3578 }
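A hedged illustration of how an architecture's early setup might feed add_active_range(); the node and PFN values are invented, and the comments describe how the merge loop above treats each call:

    /* Hypothetical ranges on node 0; PFN values are illustrative only. */
    add_active_range(0, 0,      0x1000);  /* first early_node_map[] entry           */
    add_active_range(0, 0x800,  0x2000);  /* overlaps, so the entry grows to 0x2000 */
    add_active_range(0, 0x4000, 0x8000);  /* disjoint, so a second entry is added   */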
3578 3579
3579 /** 3580 /**
3580 * remove_active_range - Shrink an existing registered range of PFNs 3581 * remove_active_range - Shrink an existing registered range of PFNs
3581 * @nid: The node id the range is on that should be shrunk 3582 * @nid: The node id the range is on that should be shrunk
3582 * @start_pfn: The new start PFN of the range 3583 * @start_pfn: The new start PFN of the range
3583 * @end_pfn: The new end PFN of the range 3584 * @end_pfn: The new end PFN of the range
3584 * 3585 *
3585 * i386 with NUMA uses alloc_remap() to store a node_mem_map on a local node. 3586 * i386 with NUMA uses alloc_remap() to store a node_mem_map on a local node.
3586 * The map is kept near the end of the physical page range that has already been 3587 * The map is kept near the end of the physical page range that has already been
3587 * registered. This function allows an arch to shrink an existing registered 3588 * registered. This function allows an arch to shrink an existing registered
3588 * range. 3589 * range.
3589 */ 3590 */
3590 void __init remove_active_range(unsigned int nid, unsigned long start_pfn, 3591 void __init remove_active_range(unsigned int nid, unsigned long start_pfn,
3591 unsigned long end_pfn) 3592 unsigned long end_pfn)
3592 { 3593 {
3593 int i, j; 3594 int i, j;
3594 int removed = 0; 3595 int removed = 0;
3595 3596
3596 printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n", 3597 printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n",
3597 nid, start_pfn, end_pfn); 3598 nid, start_pfn, end_pfn);
3598 3599
3599 /* Find the old active region end and shrink */ 3600 /* Find the old active region end and shrink */
3600 for_each_active_range_index_in_nid(i, nid) { 3601 for_each_active_range_index_in_nid(i, nid) {
3601 if (early_node_map[i].start_pfn >= start_pfn && 3602 if (early_node_map[i].start_pfn >= start_pfn &&
3602 early_node_map[i].end_pfn <= end_pfn) { 3603 early_node_map[i].end_pfn <= end_pfn) {
3603 /* clear it */ 3604 /* clear it */
3604 early_node_map[i].start_pfn = 0; 3605 early_node_map[i].start_pfn = 0;
3605 early_node_map[i].end_pfn = 0; 3606 early_node_map[i].end_pfn = 0;
3606 removed = 1; 3607 removed = 1;
3607 continue; 3608 continue;
3608 } 3609 }
3609 if (early_node_map[i].start_pfn < start_pfn && 3610 if (early_node_map[i].start_pfn < start_pfn &&
3610 early_node_map[i].end_pfn > start_pfn) { 3611 early_node_map[i].end_pfn > start_pfn) {
3611 unsigned long temp_end_pfn = early_node_map[i].end_pfn; 3612 unsigned long temp_end_pfn = early_node_map[i].end_pfn;
3612 early_node_map[i].end_pfn = start_pfn; 3613 early_node_map[i].end_pfn = start_pfn;
3613 if (temp_end_pfn > end_pfn) 3614 if (temp_end_pfn > end_pfn)
3614 add_active_range(nid, end_pfn, temp_end_pfn); 3615 add_active_range(nid, end_pfn, temp_end_pfn);
3615 continue; 3616 continue;
3616 } 3617 }
3617 if (early_node_map[i].start_pfn >= start_pfn && 3618 if (early_node_map[i].start_pfn >= start_pfn &&
3618 early_node_map[i].end_pfn > end_pfn && 3619 early_node_map[i].end_pfn > end_pfn &&
3619 early_node_map[i].start_pfn < end_pfn) { 3620 early_node_map[i].start_pfn < end_pfn) {
3620 early_node_map[i].start_pfn = end_pfn; 3621 early_node_map[i].start_pfn = end_pfn;
3621 continue; 3622 continue;
3622 } 3623 }
3623 } 3624 }
3624 3625
3625 if (!removed) 3626 if (!removed)
3626 return; 3627 return;
3627 3628
3628 /* remove the blank ones */ 3629 /* remove the blank ones */
3629 for (i = nr_nodemap_entries - 1; i > 0; i--) { 3630 for (i = nr_nodemap_entries - 1; i > 0; i--) {
3630 if (early_node_map[i].nid != nid) 3631 if (early_node_map[i].nid != nid)
3631 continue; 3632 continue;
3632 if (early_node_map[i].end_pfn) 3633 if (early_node_map[i].end_pfn)
3633 continue; 3634 continue;
3634 /* we found it, get rid of it */ 3635 /* we found it, get rid of it */
3635 for (j = i; j < nr_nodemap_entries - 1; j++) 3636 for (j = i; j < nr_nodemap_entries - 1; j++)
3636 memcpy(&early_node_map[j], &early_node_map[j+1], 3637 memcpy(&early_node_map[j], &early_node_map[j+1],
3637 sizeof(early_node_map[j])); 3638 sizeof(early_node_map[j]));
3638 j = nr_nodemap_entries - 1; 3639 j = nr_nodemap_entries - 1;
3639 memset(&early_node_map[j], 0, sizeof(early_node_map[j])); 3640 memset(&early_node_map[j], 0, sizeof(early_node_map[j]));
3640 nr_nodemap_entries--; 3641 nr_nodemap_entries--;
3641 } 3642 }
3642 } 3643 }
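When the span being removed falls inside a registered range, the middle branch above clips the range and re-registers the tail, which is what creates a hole. A sketch with invented PFNs:

    /* Assume node 0 already registered pfns 0x1000-0x9000 (illustrative). */
    remove_active_range(0, 0x3000, 0x4000);
    /*
     * The entry is clipped to 0x1000-0x3000 and, because the old end pfn
     * (0x9000) lies beyond end_pfn, add_active_range() re-registers
     * 0x4000-0x9000, leaving a hole over the removed span.
     */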
3643 3644
3644 /** 3645 /**
3645 * remove_all_active_ranges - Remove all currently registered regions 3646 * remove_all_active_ranges - Remove all currently registered regions
3646 * 3647 *
3647 * During discovery, it may be found that a table like SRAT is invalid 3648 * During discovery, it may be found that a table like SRAT is invalid
3648 * and an alternative discovery method must be used. This function removes 3649 * and an alternative discovery method must be used. This function removes
3649 * all currently registered regions. 3650 * all currently registered regions.
3650 */ 3651 */
3651 void __init remove_all_active_ranges(void) 3652 void __init remove_all_active_ranges(void)
3652 { 3653 {
3653 memset(early_node_map, 0, sizeof(early_node_map)); 3654 memset(early_node_map, 0, sizeof(early_node_map));
3654 nr_nodemap_entries = 0; 3655 nr_nodemap_entries = 0;
3655 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE 3656 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
3656 memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); 3657 memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
3657 memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); 3658 memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
3658 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ 3659 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
3659 } 3660 }
3660 3661
3661 /* Compare two active node_active_regions */ 3662 /* Compare two active node_active_regions */
3662 static int __init cmp_node_active_region(const void *a, const void *b) 3663 static int __init cmp_node_active_region(const void *a, const void *b)
3663 { 3664 {
3664 struct node_active_region *arange = (struct node_active_region *)a; 3665 struct node_active_region *arange = (struct node_active_region *)a;
3665 struct node_active_region *brange = (struct node_active_region *)b; 3666 struct node_active_region *brange = (struct node_active_region *)b;
3666 3667
3667 /* Done this way to avoid overflows */ 3668 /* Done this way to avoid overflows */
3668 if (arange->start_pfn > brange->start_pfn) 3669 if (arange->start_pfn > brange->start_pfn)
3669 return 1; 3670 return 1;
3670 if (arange->start_pfn < brange->start_pfn) 3671 if (arange->start_pfn < brange->start_pfn)
3671 return -1; 3672 return -1;
3672 3673
3673 return 0; 3674 return 0;
3674 } 3675 }
3675 3676
3676 /* sort the node_map by start_pfn */ 3677 /* sort the node_map by start_pfn */
3677 static void __init sort_node_map(void) 3678 static void __init sort_node_map(void)
3678 { 3679 {
3679 sort(early_node_map, (size_t)nr_nodemap_entries, 3680 sort(early_node_map, (size_t)nr_nodemap_entries,
3680 sizeof(struct node_active_region), 3681 sizeof(struct node_active_region),
3681 cmp_node_active_region, NULL); 3682 cmp_node_active_region, NULL);
3682 } 3683 }
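cmp_node_active_region() returns -1/0/1 explicitly because the usual "return a - b" shortcut is unsafe here: start_pfn is an unsigned long and the difference gets truncated to the comparator's int return value, so large PFNs can compare the wrong way. A minimal standalone demonstration of that failure mode (requires a 64-bit build):

    #include <stdio.h>

    static int bad_cmp(unsigned long a, unsigned long b)
    {
            return (int)(a - b);    /* truncation drops the high bits */
    }

    int main(void)
    {
            unsigned long a = 0x100000000UL;
            unsigned long b = 1;

            /* a > b, yet the truncated difference is -1, i.e. "a < b" */
            printf("bad_cmp(a, b) = %d\n", bad_cmp(a, b));
            return 0;
    }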
3683 3684
3684 /* Find the lowest pfn for a node */ 3685 /* Find the lowest pfn for a node */
3685 unsigned long __init find_min_pfn_for_node(int nid) 3686 unsigned long __init find_min_pfn_for_node(int nid)
3686 { 3687 {
3687 int i; 3688 int i;
3688 unsigned long min_pfn = ULONG_MAX; 3689 unsigned long min_pfn = ULONG_MAX;
3689 3690
3690 /* Assuming a sorted map, the first range found has the starting pfn */ 3691 /* Assuming a sorted map, the first range found has the starting pfn */
3691 for_each_active_range_index_in_nid(i, nid) 3692 for_each_active_range_index_in_nid(i, nid)
3692 min_pfn = min(min_pfn, early_node_map[i].start_pfn); 3693 min_pfn = min(min_pfn, early_node_map[i].start_pfn);
3693 3694
3694 if (min_pfn == ULONG_MAX) { 3695 if (min_pfn == ULONG_MAX) {
3695 printk(KERN_WARNING 3696 printk(KERN_WARNING
3696 "Could not find start_pfn for node %d\n", nid); 3697 "Could not find start_pfn for node %d\n", nid);
3697 return 0; 3698 return 0;
3698 } 3699 }
3699 3700
3700 return min_pfn; 3701 return min_pfn;
3701 } 3702 }
3702 3703
3703 /** 3704 /**
3704 * find_min_pfn_with_active_regions - Find the minimum PFN registered 3705 * find_min_pfn_with_active_regions - Find the minimum PFN registered
3705 * 3706 *
3706 * It returns the minimum PFN based on information provided via 3707 * It returns the minimum PFN based on information provided via
3707 * add_active_range(). 3708 * add_active_range().
3708 */ 3709 */
3709 unsigned long __init find_min_pfn_with_active_regions(void) 3710 unsigned long __init find_min_pfn_with_active_regions(void)
3710 { 3711 {
3711 return find_min_pfn_for_node(MAX_NUMNODES); 3712 return find_min_pfn_for_node(MAX_NUMNODES);
3712 } 3713 }
3713 3714
3714 /** 3715 /**
3715 * find_max_pfn_with_active_regions - Find the maximum PFN registered 3716 * find_max_pfn_with_active_regions - Find the maximum PFN registered
3716 * 3717 *
3717 * It returns the maximum PFN based on information provided via 3718 * It returns the maximum PFN based on information provided via
3718 * add_active_range(). 3719 * add_active_range().
3719 */ 3720 */
3720 unsigned long __init find_max_pfn_with_active_regions(void) 3721 unsigned long __init find_max_pfn_with_active_regions(void)
3721 { 3722 {
3722 int i; 3723 int i;
3723 unsigned long max_pfn = 0; 3724 unsigned long max_pfn = 0;
3724 3725
3725 for (i = 0; i < nr_nodemap_entries; i++) 3726 for (i = 0; i < nr_nodemap_entries; i++)
3726 max_pfn = max(max_pfn, early_node_map[i].end_pfn); 3727 max_pfn = max(max_pfn, early_node_map[i].end_pfn);
3727 3728
3728 return max_pfn; 3729 return max_pfn;
3729 } 3730 }
3730 3731
3731 /* 3732 /*
3732 * early_calculate_totalpages() 3733 * early_calculate_totalpages()
3733 * Sum pages in active regions for movable zone. 3734 * Sum pages in active regions for movable zone.
3734 * Populate N_HIGH_MEMORY for calculating usable_nodes. 3735 * Populate N_HIGH_MEMORY for calculating usable_nodes.
3735 */ 3736 */
3736 static unsigned long __init early_calculate_totalpages(void) 3737 static unsigned long __init early_calculate_totalpages(void)
3737 { 3738 {
3738 int i; 3739 int i;
3739 unsigned long totalpages = 0; 3740 unsigned long totalpages = 0;
3740 3741
3741 for (i = 0; i < nr_nodemap_entries; i++) { 3742 for (i = 0; i < nr_nodemap_entries; i++) {
3742 unsigned long pages = early_node_map[i].end_pfn - 3743 unsigned long pages = early_node_map[i].end_pfn -
3743 early_node_map[i].start_pfn; 3744 early_node_map[i].start_pfn;
3744 totalpages += pages; 3745 totalpages += pages;
3745 if (pages) 3746 if (pages)
3746 node_set_state(early_node_map[i].nid, N_HIGH_MEMORY); 3747 node_set_state(early_node_map[i].nid, N_HIGH_MEMORY);
3747 } 3748 }
3748 return totalpages; 3749 return totalpages;
3749 } 3750 }
3750 3751
3751 /* 3752 /*
3752 * Find the PFN the Movable zone begins in each node. Kernel memory 3753 * Find the PFN the Movable zone begins in each node. Kernel memory
3753 * is spread evenly between nodes as long as the nodes have enough 3754 * is spread evenly between nodes as long as the nodes have enough
3754 * memory. When they don't, some nodes will have more kernelcore than 3755 * memory. When they don't, some nodes will have more kernelcore than
3755 * others 3756 * others
3756 */ 3757 */
3757 void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) 3758 void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
3758 { 3759 {
3759 int i, nid; 3760 int i, nid;
3760 unsigned long usable_startpfn; 3761 unsigned long usable_startpfn;
3761 unsigned long kernelcore_node, kernelcore_remaining; 3762 unsigned long kernelcore_node, kernelcore_remaining;
3762 unsigned long totalpages = early_calculate_totalpages(); 3763 unsigned long totalpages = early_calculate_totalpages();
3763 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); 3764 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
3764 3765
3765 /* 3766 /*
3766 * If movablecore was specified, calculate what size of 3767 * If movablecore was specified, calculate what size of
3767 * kernelcore that corresponds so that memory usable for 3768 * kernelcore that corresponds so that memory usable for
3768 * any allocation type is evenly spread. If both kernelcore 3769 * any allocation type is evenly spread. If both kernelcore
3769 * and movablecore are specified, then the value of kernelcore 3770 * and movablecore are specified, then the value of kernelcore
3770 * will be used for required_kernelcore if it's greater than 3771 * will be used for required_kernelcore if it's greater than
3771 * what movablecore would have allowed. 3772 * what movablecore would have allowed.
3772 */ 3773 */
3773 if (required_movablecore) { 3774 if (required_movablecore) {
3774 unsigned long corepages; 3775 unsigned long corepages;
3775 3776
3776 /* 3777 /*
3777 * Round-up so that ZONE_MOVABLE is at least as large as what 3778 * Round-up so that ZONE_MOVABLE is at least as large as what
3778 * was requested by the user 3779 * was requested by the user
3779 */ 3780 */
3780 required_movablecore = 3781 required_movablecore =
3781 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 3782 roundup(required_movablecore, MAX_ORDER_NR_PAGES);
3782 corepages = totalpages - required_movablecore; 3783 corepages = totalpages - required_movablecore;
3783 3784
3784 required_kernelcore = max(required_kernelcore, corepages); 3785 required_kernelcore = max(required_kernelcore, corepages);
3785 } 3786 }
3786 3787
3787 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 3788 /* If kernelcore was not specified, there is no ZONE_MOVABLE */
3788 if (!required_kernelcore) 3789 if (!required_kernelcore)
3789 return; 3790 return;
3790 3791
3791 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 3792 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
3792 find_usable_zone_for_movable(); 3793 find_usable_zone_for_movable();
3793 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 3794 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
3794 3795
3795 restart: 3796 restart:
3796 /* Spread kernelcore memory as evenly as possible throughout nodes */ 3797 /* Spread kernelcore memory as evenly as possible throughout nodes */
3797 kernelcore_node = required_kernelcore / usable_nodes; 3798 kernelcore_node = required_kernelcore / usable_nodes;
3798 for_each_node_state(nid, N_HIGH_MEMORY) { 3799 for_each_node_state(nid, N_HIGH_MEMORY) {
3799 /* 3800 /*
3800 * Recalculate kernelcore_node if the division per node 3801 * Recalculate kernelcore_node if the division per node
3801 * now exceeds what is necessary to satisfy the requested 3802 * now exceeds what is necessary to satisfy the requested
3802 * amount of memory for the kernel 3803 * amount of memory for the kernel
3803 */ 3804 */
3804 if (required_kernelcore < kernelcore_node) 3805 if (required_kernelcore < kernelcore_node)
3805 kernelcore_node = required_kernelcore / usable_nodes; 3806 kernelcore_node = required_kernelcore / usable_nodes;
3806 3807
3807 /* 3808 /*
3808 * As the map is walked, we track how much memory is usable 3809 * As the map is walked, we track how much memory is usable
3809 * by the kernel using kernelcore_remaining. When it is 3810 * by the kernel using kernelcore_remaining. When it is
3810 * 0, the rest of the node is usable by ZONE_MOVABLE 3811 * 0, the rest of the node is usable by ZONE_MOVABLE
3811 */ 3812 */
3812 kernelcore_remaining = kernelcore_node; 3813 kernelcore_remaining = kernelcore_node;
3813 3814
3814 /* Go through each range of PFNs within this node */ 3815 /* Go through each range of PFNs within this node */
3815 for_each_active_range_index_in_nid(i, nid) { 3816 for_each_active_range_index_in_nid(i, nid) {
3816 unsigned long start_pfn, end_pfn; 3817 unsigned long start_pfn, end_pfn;
3817 unsigned long size_pages; 3818 unsigned long size_pages;
3818 3819
3819 start_pfn = max(early_node_map[i].start_pfn, 3820 start_pfn = max(early_node_map[i].start_pfn,
3820 zone_movable_pfn[nid]); 3821 zone_movable_pfn[nid]);
3821 end_pfn = early_node_map[i].end_pfn; 3822 end_pfn = early_node_map[i].end_pfn;
3822 if (start_pfn >= end_pfn) 3823 if (start_pfn >= end_pfn)
3823 continue; 3824 continue;
3824 3825
3825 /* Account for what is only usable for kernelcore */ 3826 /* Account for what is only usable for kernelcore */
3826 if (start_pfn < usable_startpfn) { 3827 if (start_pfn < usable_startpfn) {
3827 unsigned long kernel_pages; 3828 unsigned long kernel_pages;
3828 kernel_pages = min(end_pfn, usable_startpfn) 3829 kernel_pages = min(end_pfn, usable_startpfn)
3829 - start_pfn; 3830 - start_pfn;
3830 3831
3831 kernelcore_remaining -= min(kernel_pages, 3832 kernelcore_remaining -= min(kernel_pages,
3832 kernelcore_remaining); 3833 kernelcore_remaining);
3833 required_kernelcore -= min(kernel_pages, 3834 required_kernelcore -= min(kernel_pages,
3834 required_kernelcore); 3835 required_kernelcore);
3835 3836
3836 /* Continue if range is now fully accounted */ 3837 /* Continue if range is now fully accounted */
3837 if (end_pfn <= usable_startpfn) { 3838 if (end_pfn <= usable_startpfn) {
3838 3839
3839 /* 3840 /*
3840 * Push zone_movable_pfn to the end so 3841 * Push zone_movable_pfn to the end so
3841 * that if we have to rebalance 3842 * that if we have to rebalance
3842 * kernelcore across nodes, we will 3843 * kernelcore across nodes, we will
3843 * not double account here 3844 * not double account here
3844 */ 3845 */
3845 zone_movable_pfn[nid] = end_pfn; 3846 zone_movable_pfn[nid] = end_pfn;
3846 continue; 3847 continue;
3847 } 3848 }
3848 start_pfn = usable_startpfn; 3849 start_pfn = usable_startpfn;
3849 } 3850 }
3850 3851
3851 /* 3852 /*
3852 * The usable PFN range for ZONE_MOVABLE is from 3853 * The usable PFN range for ZONE_MOVABLE is from
3853 * start_pfn->end_pfn. Calculate size_pages as the 3854 * start_pfn->end_pfn. Calculate size_pages as the
3854 * number of pages used as kernelcore 3855 * number of pages used as kernelcore
3855 */ 3856 */
3856 size_pages = end_pfn - start_pfn; 3857 size_pages = end_pfn - start_pfn;
3857 if (size_pages > kernelcore_remaining) 3858 if (size_pages > kernelcore_remaining)
3858 size_pages = kernelcore_remaining; 3859 size_pages = kernelcore_remaining;
3859 zone_movable_pfn[nid] = start_pfn + size_pages; 3860 zone_movable_pfn[nid] = start_pfn + size_pages;
3860 3861
3861 /* 3862 /*
3862 * Some kernelcore has been met, update counts and 3863 * Some kernelcore has been met, update counts and
3863 * break if the kernelcore for this node has been 3864 * break if the kernelcore for this node has been
3864 * satisfied 3865 * satisfied
3865 */ 3866 */
3866 required_kernelcore -= min(required_kernelcore, 3867 required_kernelcore -= min(required_kernelcore,
3867 size_pages); 3868 size_pages);
3868 kernelcore_remaining -= size_pages; 3869 kernelcore_remaining -= size_pages;
3869 if (!kernelcore_remaining) 3870 if (!kernelcore_remaining)
3870 break; 3871 break;
3871 } 3872 }
3872 } 3873 }
3873 3874
3874 /* 3875 /*
3875 * If there is still required_kernelcore, we do another pass with one 3876 * If there is still required_kernelcore, we do another pass with one
3876 * less node in the count. This will push zone_movable_pfn[nid] further 3877 * less node in the count. This will push zone_movable_pfn[nid] further
3877 * along on the nodes that still have memory until kernelcore is 3878 * along on the nodes that still have memory until kernelcore is
3878 * satisfied 3879 * satisfied
3879 */ 3880 */
3880 usable_nodes--; 3881 usable_nodes--;
3881 if (usable_nodes && required_kernelcore > usable_nodes) 3882 if (usable_nodes && required_kernelcore > usable_nodes)
3882 goto restart; 3883 goto restart;
3883 3884
3884 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 3885 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
3885 for (nid = 0; nid < MAX_NUMNODES; nid++) 3886 for (nid = 0; nid < MAX_NUMNODES; nid++)
3886 zone_movable_pfn[nid] = 3887 zone_movable_pfn[nid] =
3887 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 3888 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
3888 } 3889 }
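A worked illustration of the spreading above, with invented numbers and holes ignored: booting with kernelcore=2G on two nodes of 2 GiB each gives required_kernelcore = 524288 pages, so each node keeps 262144 pages for the kernel and the remaining 1 GiB per node becomes ZONE_MOVABLE (before the final MAX_ORDER_NR_PAGES round-up):

    #include <stdio.h>

    int main(void)
    {
            /* Illustrative layout: two nodes of 2 GiB in 4 KiB pages. */
            unsigned long node_pages[2]       = { 524288, 524288 };
            unsigned long required_kernelcore = 524288;     /* kernelcore=2G */
            int usable_nodes                  = 2;
            int nid;

            unsigned long kernelcore_node = required_kernelcore / usable_nodes;

            for (nid = 0; nid < 2; nid++)
                    printf("node %d: %lu kernel pages, %lu movable pages\n",
                           nid, kernelcore_node,
                           node_pages[nid] - kernelcore_node);
            return 0;
    }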
3889 3890
3890 /* Any regular memory on that node ? */ 3891 /* Any regular memory on that node ? */
3891 static void check_for_regular_memory(pg_data_t *pgdat) 3892 static void check_for_regular_memory(pg_data_t *pgdat)
3892 { 3893 {
3893 #ifdef CONFIG_HIGHMEM 3894 #ifdef CONFIG_HIGHMEM
3894 enum zone_type zone_type; 3895 enum zone_type zone_type;
3895 3896
3896 for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { 3897 for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
3897 struct zone *zone = &pgdat->node_zones[zone_type]; 3898 struct zone *zone = &pgdat->node_zones[zone_type];
3898 if (zone->present_pages) 3899 if (zone->present_pages)
3899 node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); 3900 node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
3900 } 3901 }
3901 #endif 3902 #endif
3902 } 3903 }
3903 3904
3904 /** 3905 /**
3905 * free_area_init_nodes - Initialise all pg_data_t and zone data 3906 * free_area_init_nodes - Initialise all pg_data_t and zone data
3906 * @max_zone_pfn: an array of max PFNs for each zone 3907 * @max_zone_pfn: an array of max PFNs for each zone
3907 * 3908 *
3908 * This will call free_area_init_node() for each active node in the system. 3909 * This will call free_area_init_node() for each active node in the system.
3909 * Using the page ranges provided by add_active_range(), the size of each 3910 * Using the page ranges provided by add_active_range(), the size of each
3910 * zone in each node and their holes is calculated. If the maximum PFNs 3911 * zone in each node and their holes is calculated. If the maximum PFNs
3911 * of two adjacent zones match, it is assumed that the higher zone is empty. 3912 * of two adjacent zones match, it is assumed that the higher zone is empty.
3912 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 3913 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
3913 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 3914 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
3914 * starts where the previous one ended. For example, ZONE_DMA32 starts 3915 * starts where the previous one ended. For example, ZONE_DMA32 starts
3915 * at arch_max_dma_pfn. 3916 * at arch_max_dma_pfn.
3916 */ 3917 */
3917 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 3918 void __init free_area_init_nodes(unsigned long *max_zone_pfn)
3918 { 3919 {
3919 unsigned long nid; 3920 unsigned long nid;
3920 enum zone_type i; 3921 enum zone_type i;
3921 3922
3922 /* Sort early_node_map as initialisation assumes it is sorted */ 3923 /* Sort early_node_map as initialisation assumes it is sorted */
3923 sort_node_map(); 3924 sort_node_map();
3924 3925
3925 /* Record where the zone boundaries are */ 3926 /* Record where the zone boundaries are */
3926 memset(arch_zone_lowest_possible_pfn, 0, 3927 memset(arch_zone_lowest_possible_pfn, 0,
3927 sizeof(arch_zone_lowest_possible_pfn)); 3928 sizeof(arch_zone_lowest_possible_pfn));
3928 memset(arch_zone_highest_possible_pfn, 0, 3929 memset(arch_zone_highest_possible_pfn, 0,
3929 sizeof(arch_zone_highest_possible_pfn)); 3930 sizeof(arch_zone_highest_possible_pfn));
3930 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); 3931 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
3931 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; 3932 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
3932 for (i = 1; i < MAX_NR_ZONES; i++) { 3933 for (i = 1; i < MAX_NR_ZONES; i++) {
3933 if (i == ZONE_MOVABLE) 3934 if (i == ZONE_MOVABLE)
3934 continue; 3935 continue;
3935 arch_zone_lowest_possible_pfn[i] = 3936 arch_zone_lowest_possible_pfn[i] =
3936 arch_zone_highest_possible_pfn[i-1]; 3937 arch_zone_highest_possible_pfn[i-1];
3937 arch_zone_highest_possible_pfn[i] = 3938 arch_zone_highest_possible_pfn[i] =
3938 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); 3939 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
3939 } 3940 }
3940 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; 3941 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
3941 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; 3942 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
3942 3943
3943 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 3944 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
3944 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 3945 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
3945 find_zone_movable_pfns_for_nodes(zone_movable_pfn); 3946 find_zone_movable_pfns_for_nodes(zone_movable_pfn);
3946 3947
3947 /* Print out the zone ranges */ 3948 /* Print out the zone ranges */
3948 printk("Zone PFN ranges:\n"); 3949 printk("Zone PFN ranges:\n");
3949 for (i = 0; i < MAX_NR_ZONES; i++) { 3950 for (i = 0; i < MAX_NR_ZONES; i++) {
3950 if (i == ZONE_MOVABLE) 3951 if (i == ZONE_MOVABLE)
3951 continue; 3952 continue;
3952 printk(" %-8s %0#10lx -> %0#10lx\n", 3953 printk(" %-8s %0#10lx -> %0#10lx\n",
3953 zone_names[i], 3954 zone_names[i],
3954 arch_zone_lowest_possible_pfn[i], 3955 arch_zone_lowest_possible_pfn[i],
3955 arch_zone_highest_possible_pfn[i]); 3956 arch_zone_highest_possible_pfn[i]);
3956 } 3957 }
3957 3958
3958 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 3959 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
3959 printk("Movable zone start PFN for each node\n"); 3960 printk("Movable zone start PFN for each node\n");
3960 for (i = 0; i < MAX_NUMNODES; i++) { 3961 for (i = 0; i < MAX_NUMNODES; i++) {
3961 if (zone_movable_pfn[i]) 3962 if (zone_movable_pfn[i])
3962 printk(" Node %d: %lu\n", i, zone_movable_pfn[i]); 3963 printk(" Node %d: %lu\n", i, zone_movable_pfn[i]);
3963 } 3964 }
3964 3965
3965 /* Print out the early_node_map[] */ 3966 /* Print out the early_node_map[] */
3966 printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); 3967 printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
3967 for (i = 0; i < nr_nodemap_entries; i++) 3968 for (i = 0; i < nr_nodemap_entries; i++)
3968 printk(" %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid, 3969 printk(" %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid,
3969 early_node_map[i].start_pfn, 3970 early_node_map[i].start_pfn,
3970 early_node_map[i].end_pfn); 3971 early_node_map[i].end_pfn);
3971 3972
3972 /* Initialise every node */ 3973 /* Initialise every node */
3973 mminit_verify_pageflags_layout(); 3974 mminit_verify_pageflags_layout();
3974 setup_nr_node_ids(); 3975 setup_nr_node_ids();
3975 for_each_online_node(nid) { 3976 for_each_online_node(nid) {
3976 pg_data_t *pgdat = NODE_DATA(nid); 3977 pg_data_t *pgdat = NODE_DATA(nid);
3977 free_area_init_node(nid, pgdat, NULL, 3978 free_area_init_node(nid, pgdat, NULL,
3978 find_min_pfn_for_node(nid), NULL); 3979 find_min_pfn_for_node(nid), NULL);
3979 3980
3980 /* Any memory on that node */ 3981 /* Any memory on that node */
3981 if (pgdat->node_present_pages) 3982 if (pgdat->node_present_pages)
3982 node_set_state(nid, N_HIGH_MEMORY); 3983 node_set_state(nid, N_HIGH_MEMORY);
3983 check_for_regular_memory(pgdat); 3984 check_for_regular_memory(pgdat);
3984 } 3985 }
3985 } 3986 }
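A hedged sketch of how an architecture might call free_area_init_nodes() after registering its memory with add_active_range(); the zone indices and limits below are illustrative fragments (ZONE_DMA only exists on configs that enable it, and max_low_pfn is an arch-provided variable), not code from this commit:

    unsigned long max_zone_pfns[MAX_NR_ZONES];

    memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
    max_zone_pfns[ZONE_DMA]    = 0x1000;        /* e.g. first 16 MiB */
    max_zone_pfns[ZONE_NORMAL] = max_low_pfn;   /* end of lowmem     */

    free_area_init_nodes(max_zone_pfns);        /* sizes every node and zone */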
3986 3987
3987 static int __init cmdline_parse_core(char *p, unsigned long *core) 3988 static int __init cmdline_parse_core(char *p, unsigned long *core)
3988 { 3989 {
3989 unsigned long long coremem; 3990 unsigned long long coremem;
3990 if (!p) 3991 if (!p)
3991 return -EINVAL; 3992 return -EINVAL;
3992 3993
3993 coremem = memparse(p, &p); 3994 coremem = memparse(p, &p);
3994 *core = coremem >> PAGE_SHIFT; 3995 *core = coremem >> PAGE_SHIFT;
3995 3996
3996 /* Paranoid check that UL is enough for the coremem value */ 3997 /* Paranoid check that UL is enough for the coremem value */
3997 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 3998 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
3998 3999
3999 return 0; 4000 return 0;
4000 } 4001 }
4001 4002
4002 /* 4003 /*
4003 * kernelcore=size sets the amount of memory for use for allocations that 4004 * kernelcore=size sets the amount of memory for use for allocations that
4004 * cannot be reclaimed or migrated. 4005 * cannot be reclaimed or migrated.
4005 */ 4006 */
4006 static int __init cmdline_parse_kernelcore(char *p) 4007 static int __init cmdline_parse_kernelcore(char *p)
4007 { 4008 {
4008 return cmdline_parse_core(p, &required_kernelcore); 4009 return cmdline_parse_core(p, &required_kernelcore);
4009 } 4010 }
4010 4011
4011 /* 4012 /*
4012 * movablecore=size sets the amount of memory for use for allocations that 4013 * movablecore=size sets the amount of memory for use for allocations that
4013 * can be reclaimed or migrated. 4014 * can be reclaimed or migrated.
4014 */ 4015 */
4015 static int __init cmdline_parse_movablecore(char *p) 4016 static int __init cmdline_parse_movablecore(char *p)
4016 { 4017 {
4017 return cmdline_parse_core(p, &required_movablecore); 4018 return cmdline_parse_core(p, &required_movablecore);
4018 } 4019 }
4019 4020
4020 early_param("kernelcore", cmdline_parse_kernelcore); 4021 early_param("kernelcore", cmdline_parse_kernelcore);
4021 early_param("movablecore", cmdline_parse_movablecore); 4022 early_param("movablecore", cmdline_parse_movablecore);
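Both parameters are parsed with memparse(), so suffixed sizes on the command line become page counts via the shift above. For example, with 4 KiB pages, kernelcore=512M gives required_kernelcore = 536870912 >> 12 = 131072 pages, and movablecore=2G gives required_movablecore = 524288 pages.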
4022 4023
4023 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 4024 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
4024 4025
4025 /** 4026 /**
4026 * set_dma_reserve - set the specified number of pages reserved in the first zone 4027 * set_dma_reserve - set the specified number of pages reserved in the first zone
4027 * @new_dma_reserve: The number of pages to mark reserved 4028 * @new_dma_reserve: The number of pages to mark reserved
4028 * 4029 *
4029 * The per-cpu batchsize and zone watermarks are determined by present_pages. 4030 * The per-cpu batchsize and zone watermarks are determined by present_pages.
4030 * In the DMA zone, a significant percentage may be consumed by kernel image 4031 * In the DMA zone, a significant percentage may be consumed by kernel image
4031 * and other unfreeable allocations which can skew the watermarks badly. This 4032 * and other unfreeable allocations which can skew the watermarks badly. This
4032 * function may optionally be used to account for unfreeable pages in the 4033 * function may optionally be used to account for unfreeable pages in the
4033 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 4034 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
4034 * smaller per-cpu batchsize. 4035 * smaller per-cpu batchsize.
4035 */ 4036 */
4036 void __init set_dma_reserve(unsigned long new_dma_reserve) 4037 void __init set_dma_reserve(unsigned long new_dma_reserve)
4037 { 4038 {
4038 dma_reserve = new_dma_reserve; 4039 dma_reserve = new_dma_reserve;
4039 } 4040 }
4040 4041
4041 #ifndef CONFIG_NEED_MULTIPLE_NODES 4042 #ifndef CONFIG_NEED_MULTIPLE_NODES
4042 static bootmem_data_t contig_bootmem_data; 4043 static bootmem_data_t contig_bootmem_data;
4043 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; 4044 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
4044 4045
4045 EXPORT_SYMBOL(contig_page_data); 4046 EXPORT_SYMBOL(contig_page_data);
4046 #endif 4047 #endif
4047 4048
4048 void __init free_area_init(unsigned long *zones_size) 4049 void __init free_area_init(unsigned long *zones_size)
4049 { 4050 {
4050 free_area_init_node(0, NODE_DATA(0), zones_size, 4051 free_area_init_node(0, NODE_DATA(0), zones_size,
4051 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 4052 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
4052 } 4053 }
4053 4054
4054 static int page_alloc_cpu_notify(struct notifier_block *self, 4055 static int page_alloc_cpu_notify(struct notifier_block *self,
4055 unsigned long action, void *hcpu) 4056 unsigned long action, void *hcpu)
4056 { 4057 {
4057 int cpu = (unsigned long)hcpu; 4058 int cpu = (unsigned long)hcpu;
4058 4059
4059 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 4060 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
4060 drain_pages(cpu); 4061 drain_pages(cpu);
4061 4062
4062 /* 4063 /*
4063 * Spill the event counters of the dead processor 4064 * Spill the event counters of the dead processor
4064 * into the current processors event counters. 4065 * into the current processors event counters.
4065 * This artificially elevates the count of the current 4066 * This artificially elevates the count of the current
4066 * processor. 4067 * processor.
4067 */ 4068 */
4068 vm_events_fold_cpu(cpu); 4069 vm_events_fold_cpu(cpu);
4069 4070
4070 /* 4071 /*
4071 * Zero the differential counters of the dead processor 4072 * Zero the differential counters of the dead processor
4072 * so that the vm statistics are consistent. 4073 * so that the vm statistics are consistent.
4073 * 4074 *
4074 * This is only okay since the processor is dead and cannot 4075 * This is only okay since the processor is dead and cannot
4075 * race with what we are doing. 4076 * race with what we are doing.
4076 */ 4077 */
4077 refresh_cpu_vm_stats(cpu); 4078 refresh_cpu_vm_stats(cpu);
4078 } 4079 }
4079 return NOTIFY_OK; 4080 return NOTIFY_OK;
4080 } 4081 }
4081 4082
4082 void __init page_alloc_init(void) 4083 void __init page_alloc_init(void)
4083 { 4084 {
4084 hotcpu_notifier(page_alloc_cpu_notify, 0); 4085 hotcpu_notifier(page_alloc_cpu_notify, 0);
4085 } 4086 }
4086 4087
4087 /* 4088 /*
4088 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio 4089 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
4089 * or min_free_kbytes changes. 4090 * or min_free_kbytes changes.
4090 */ 4091 */
4091 static void calculate_totalreserve_pages(void) 4092 static void calculate_totalreserve_pages(void)
4092 { 4093 {
4093 struct pglist_data *pgdat; 4094 struct pglist_data *pgdat;
4094 unsigned long reserve_pages = 0; 4095 unsigned long reserve_pages = 0;
4095 enum zone_type i, j; 4096 enum zone_type i, j;
4096 4097
4097 for_each_online_pgdat(pgdat) { 4098 for_each_online_pgdat(pgdat) {
4098 for (i = 0; i < MAX_NR_ZONES; i++) { 4099 for (i = 0; i < MAX_NR_ZONES; i++) {
4099 struct zone *zone = pgdat->node_zones + i; 4100 struct zone *zone = pgdat->node_zones + i;
4100 unsigned long max = 0; 4101 unsigned long max = 0;
4101 4102
4102 /* Find valid and maximum lowmem_reserve in the zone */ 4103 /* Find valid and maximum lowmem_reserve in the zone */
4103 for (j = i; j < MAX_NR_ZONES; j++) { 4104 for (j = i; j < MAX_NR_ZONES; j++) {
4104 if (zone->lowmem_reserve[j] > max) 4105 if (zone->lowmem_reserve[j] > max)
4105 max = zone->lowmem_reserve[j]; 4106 max = zone->lowmem_reserve[j];
4106 } 4107 }
4107 4108
4108 /* we treat pages_high as reserved pages. */ 4109 /* we treat pages_high as reserved pages. */
4109 max += zone->pages_high; 4110 max += zone->pages_high;
4110 4111
4111 if (max > zone->present_pages) 4112 if (max > zone->present_pages)
4112 max = zone->present_pages; 4113 max = zone->present_pages;
4113 reserve_pages += max; 4114 reserve_pages += max;
4114 } 4115 }
4115 } 4116 }
4116 totalreserve_pages = reserve_pages; 4117 totalreserve_pages = reserve_pages;
4117 } 4118 }
4118 4119
4119 /* 4120 /*
4120 * setup_per_zone_lowmem_reserve - called whenever 4121 * setup_per_zone_lowmem_reserve - called whenever
4121 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone 4122 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone
4122 * has a correct pages reserved value, so an adequate number of 4123 * has a correct pages reserved value, so an adequate number of
4123 * pages are left in the zone after a successful __alloc_pages(). 4124 * pages are left in the zone after a successful __alloc_pages().
4124 */ 4125 */
4125 static void setup_per_zone_lowmem_reserve(void) 4126 static void setup_per_zone_lowmem_reserve(void)
4126 { 4127 {
4127 struct pglist_data *pgdat; 4128 struct pglist_data *pgdat;
4128 enum zone_type j, idx; 4129 enum zone_type j, idx;
4129 4130
4130 for_each_online_pgdat(pgdat) { 4131 for_each_online_pgdat(pgdat) {
4131 for (j = 0; j < MAX_NR_ZONES; j++) { 4132 for (j = 0; j < MAX_NR_ZONES; j++) {
4132 struct zone *zone = pgdat->node_zones + j; 4133 struct zone *zone = pgdat->node_zones + j;
4133 unsigned long present_pages = zone->present_pages; 4134 unsigned long present_pages = zone->present_pages;
4134 4135
4135 zone->lowmem_reserve[j] = 0; 4136 zone->lowmem_reserve[j] = 0;
4136 4137
4137 idx = j; 4138 idx = j;
4138 while (idx) { 4139 while (idx) {
4139 struct zone *lower_zone; 4140 struct zone *lower_zone;
4140 4141
4141 idx--; 4142 idx--;
4142 4143
4143 if (sysctl_lowmem_reserve_ratio[idx] < 1) 4144 if (sysctl_lowmem_reserve_ratio[idx] < 1)
4144 sysctl_lowmem_reserve_ratio[idx] = 1; 4145 sysctl_lowmem_reserve_ratio[idx] = 1;
4145 4146
4146 lower_zone = pgdat->node_zones + idx; 4147 lower_zone = pgdat->node_zones + idx;
4147 lower_zone->lowmem_reserve[j] = present_pages / 4148 lower_zone->lowmem_reserve[j] = present_pages /
4148 sysctl_lowmem_reserve_ratio[idx]; 4149 sysctl_lowmem_reserve_ratio[idx];
4149 present_pages += lower_zone->present_pages; 4150 present_pages += lower_zone->present_pages;
4150 } 4151 }
4151 } 4152 }
4152 } 4153 }
4153 4154
4154 /* update totalreserve_pages */ 4155 /* update totalreserve_pages */
4155 calculate_totalreserve_pages(); 4156 calculate_totalreserve_pages();
4156 } 4157 }
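A worked illustration of the inner loop above, assuming the common default lowmem reserve ratios of 256 for the DMA zones and 32 for ZONE_NORMAL (the defaults are tunable, so treat these as assumptions). With 1048576 highmem pages, ZONE_NORMAL reserves 1048576 / 32 = 32768 pages against highmem fallbacks, and the running present_pages total then grows by ZONE_NORMAL's own size before ZONE_DMA's reserve is computed:

    #include <stdio.h>

    int main(void)
    {
            /* Illustrative sizes, highest zone first: HIGHMEM, NORMAL, DMA. */
            unsigned long zone_pages[3] = { 1048576, 225280, 4096 };
            unsigned long ratio[3]      = { 0, 32, 256 };   /* assumed defaults */
            unsigned long present = zone_pages[0];
            int idx;

            for (idx = 1; idx < 3; idx++) {
                    printf("lower zone %d reserves %lu pages for highmem fallbacks\n",
                           idx, present / ratio[idx]);
                    present += zone_pages[idx];
            }
            return 0;
    }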
4157 4158
4158 /** 4159 /**
4159 * setup_per_zone_pages_min - called when min_free_kbytes changes. 4160 * setup_per_zone_pages_min - called when min_free_kbytes changes.
4160 * 4161 *
4161 * Ensures that the pages_{min,low,high} values for each zone are set correctly 4162 * Ensures that the pages_{min,low,high} values for each zone are set correctly
4162 * with respect to min_free_kbytes. 4163 * with respect to min_free_kbytes.
4163 */ 4164 */
4164 void setup_per_zone_pages_min(void) 4165 void setup_per_zone_pages_min(void)
4165 { 4166 {
4166 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 4167 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
4167 unsigned long lowmem_pages = 0; 4168 unsigned long lowmem_pages = 0;
4168 struct zone *zone; 4169 struct zone *zone;
4169 unsigned long flags; 4170 unsigned long flags;
4170 4171
4171 /* Calculate total number of !ZONE_HIGHMEM pages */ 4172 /* Calculate total number of !ZONE_HIGHMEM pages */
4172 for_each_zone(zone) { 4173 for_each_zone(zone) {
4173 if (!is_highmem(zone)) 4174 if (!is_highmem(zone))
4174 lowmem_pages += zone->present_pages; 4175 lowmem_pages += zone->present_pages;
4175 } 4176 }
4176 4177
4177 for_each_zone(zone) { 4178 for_each_zone(zone) {
4178 u64 tmp; 4179 u64 tmp;
4179 4180
4180 spin_lock_irqsave(&zone->lru_lock, flags); 4181 spin_lock_irqsave(&zone->lru_lock, flags);
4181 tmp = (u64)pages_min * zone->present_pages; 4182 tmp = (u64)pages_min * zone->present_pages;
4182 do_div(tmp, lowmem_pages); 4183 do_div(tmp, lowmem_pages);
4183 if (is_highmem(zone)) { 4184 if (is_highmem(zone)) {
4184 /* 4185 /*
4185 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 4186 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
4186 * need highmem pages, so cap pages_min to a small 4187 * need highmem pages, so cap pages_min to a small
4187 * value here. 4188 * value here.
4188 * 4189 *
4189 * The (pages_high-pages_low) and (pages_low-pages_min) 4190 * The (pages_high-pages_low) and (pages_low-pages_min)
4190 * deltas control async page reclaim, and so should 4191 * deltas control async page reclaim, and so should
4191 * not be capped for highmem. 4192 * not be capped for highmem.
4192 */ 4193 */
4193 int min_pages; 4194 int min_pages;
4194 4195
4195 min_pages = zone->present_pages / 1024; 4196 min_pages = zone->present_pages / 1024;
4196 if (min_pages < SWAP_CLUSTER_MAX) 4197 if (min_pages < SWAP_CLUSTER_MAX)
4197 min_pages = SWAP_CLUSTER_MAX; 4198 min_pages = SWAP_CLUSTER_MAX;
4198 if (min_pages > 128) 4199 if (min_pages > 128)
4199 min_pages = 128; 4200 min_pages = 128;
4200 zone->pages_min = min_pages; 4201 zone->pages_min = min_pages;
4201 } else { 4202 } else {
4202 /* 4203 /*
4203 * If it's a lowmem zone, reserve a number of pages 4204 * If it's a lowmem zone, reserve a number of pages
4204 * proportionate to the zone's size. 4205 * proportionate to the zone's size.
4205 */ 4206 */
4206 zone->pages_min = tmp; 4207 zone->pages_min = tmp;
4207 } 4208 }
4208 4209
4209 zone->pages_low = zone->pages_min + (tmp >> 2); 4210 zone->pages_low = zone->pages_min + (tmp >> 2);
4210 zone->pages_high = zone->pages_min + (tmp >> 1); 4211 zone->pages_high = zone->pages_min + (tmp >> 1);
4211 setup_zone_migrate_reserve(zone); 4212 setup_zone_migrate_reserve(zone);
4212 spin_unlock_irqrestore(&zone->lru_lock, flags); 4213 spin_unlock_irqrestore(&zone->lru_lock, flags);
4213 } 4214 }
4214 4215
4215 /* update totalreserve_pages */ 4216 /* update totalreserve_pages */
4216 calculate_totalreserve_pages(); 4217 calculate_totalreserve_pages();
4217 } 4218 }
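A worked sketch of the lowmem branch above. Assuming min_free_kbytes is 4096 and 4 KiB pages, pages_min is 1024 pages; a zone holding half of lowmem gets tmp = 512, so its watermarks come out as pages_min = 512, pages_low = 640 and pages_high = 768:

    #include <stdio.h>

    int main(void)
    {
            unsigned long min_free_kbytes = 4096;                    /* assumed      */
            unsigned long pages_min = min_free_kbytes >> (12 - 10);  /* 1024 pages   */
            unsigned long lowmem_pages  = 262144;                    /* all !highmem */
            unsigned long zone_present  = 131072;                    /* this zone    */

            unsigned long tmp = pages_min * zone_present / lowmem_pages;

            printf("min=%lu low=%lu high=%lu\n",
                   tmp, tmp + (tmp >> 2), tmp + (tmp >> 1));
            return 0;
    }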
4218 4219
4219 /* 4220 /*
4220 * Initialise min_free_kbytes. 4221 * Initialise min_free_kbytes.
4221 * 4222 *
4222 * For small machines we want it small (128k min). For large machines 4223 * For small machines we want it small (128k min). For large machines
4223 * we want it large (64MB max). But it is not linear, because network 4224 * we want it large (64MB max). But it is not linear, because network
4224 * bandwidth does not increase linearly with machine size. We use 4225 * bandwidth does not increase linearly with machine size. We use
4225 * 4226 *
4226 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 4227 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
4227 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 4228 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
4228 * 4229 *
4229 * which yields 4230 * which yields
4230 * 4231 *
4231 * 16MB: 512k 4232 * 16MB: 512k
4232 * 32MB: 724k 4233 * 32MB: 724k
4233 * 64MB: 1024k 4234 * 64MB: 1024k
4234 * 128MB: 1448k 4235 * 128MB: 1448k
4235 * 256MB: 2048k 4236 * 256MB: 2048k
4236 * 512MB: 2896k 4237 * 512MB: 2896k
4237 * 1024MB: 4096k 4238 * 1024MB: 4096k
4238 * 2048MB: 5792k 4239 * 2048MB: 5792k
4239 * 4096MB: 8192k 4240 * 4096MB: 8192k
4240 * 8192MB: 11584k 4241 * 8192MB: 11584k
4241 * 16384MB: 16384k 4242 * 16384MB: 16384k
4242 */ 4243 */
4243 static int __init init_per_zone_pages_min(void) 4244 static int __init init_per_zone_pages_min(void)
4244 { 4245 {
4245 unsigned long lowmem_kbytes; 4246 unsigned long lowmem_kbytes;
4246 4247
4247 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 4248 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
4248 4249
4249 min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 4250 min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
4250 if (min_free_kbytes < 128) 4251 if (min_free_kbytes < 128)
4251 min_free_kbytes = 128; 4252 min_free_kbytes = 128;
4252 if (min_free_kbytes > 65536) 4253 if (min_free_kbytes > 65536)
4253 min_free_kbytes = 65536; 4254 min_free_kbytes = 65536;
4254 setup_per_zone_pages_min(); 4255 setup_per_zone_pages_min();
4255 setup_per_zone_lowmem_reserve(); 4256 setup_per_zone_lowmem_reserve();
4256 return 0; 4257 return 0;
4257 } 4258 }
4258 module_init(init_per_zone_pages_min) 4259 module_init(init_per_zone_pages_min)
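The table in the comment above follows directly from min_free_kbytes = sqrt(lowmem_kbytes * 16). For example, 1024 MiB of lowmem gives sqrt(1048576 * 16) = sqrt(16777216) = 4096k, matching the 1024MB row, and 256 MiB gives sqrt(262144 * 16) = 2048k; the result is then clamped to the 128k..65536k range by the checks above.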
4259 4260
4260 /* 4261 /*
4261 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 4262 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
4262 * that we can call two helper functions whenever min_free_kbytes 4263 * that we can call two helper functions whenever min_free_kbytes
4263 * changes. 4264 * changes.
4264 */ 4265 */
4265 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 4266 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
4266 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4267 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
4267 { 4268 {
4268 proc_dointvec(table, write, file, buffer, length, ppos); 4269 proc_dointvec(table, write, file, buffer, length, ppos);
4269 if (write) 4270 if (write)
4270 setup_per_zone_pages_min(); 4271 setup_per_zone_pages_min();
4271 return 0; 4272 return 0;
4272 } 4273 }
4273 4274
4274 #ifdef CONFIG_NUMA 4275 #ifdef CONFIG_NUMA
4275 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 4276 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
4276 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4277 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
4277 { 4278 {
4278 struct zone *zone; 4279 struct zone *zone;
4279 int rc; 4280 int rc;
4280 4281
4281 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4282 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
4282 if (rc) 4283 if (rc)
4283 return rc; 4284 return rc;
4284 4285
4285 for_each_zone(zone) 4286 for_each_zone(zone)
4286 zone->min_unmapped_pages = (zone->present_pages * 4287 zone->min_unmapped_pages = (zone->present_pages *
4287 sysctl_min_unmapped_ratio) / 100; 4288 sysctl_min_unmapped_ratio) / 100;
4288 return 0; 4289 return 0;
4289 } 4290 }
4290 4291
4291 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 4292 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
4292 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4293 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
4293 { 4294 {
4294 struct zone *zone; 4295 struct zone *zone;
4295 int rc; 4296 int rc;
4296 4297
4297 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4298 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
4298 if (rc) 4299 if (rc)
4299 return rc; 4300 return rc;
4300 4301
4301 for_each_zone(zone) 4302 for_each_zone(zone)
4302 zone->min_slab_pages = (zone->present_pages * 4303 zone->min_slab_pages = (zone->present_pages *
4303 sysctl_min_slab_ratio) / 100; 4304 sysctl_min_slab_ratio) / 100;
4304 return 0; 4305 return 0;
4305 } 4306 }
4306 #endif 4307 #endif
4307 4308
4308 /* 4309 /*
4309 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 4310 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
4310 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 4311 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
4311 * whenever sysctl_lowmem_reserve_ratio changes. 4312 * whenever sysctl_lowmem_reserve_ratio changes.
4312 * 4313 *
4313 * The reserve ratio has no relation to the pages_min watermarks. 4314 * The reserve ratio has no relation to the pages_min watermarks.
4314 * The lowmem reserve ratio only makes sense in relation to the 4315 * The lowmem reserve ratio only makes sense in relation to the
4315 * boot-time zone sizes. 4316 * boot-time zone sizes.
4316 */ 4317 */
4317 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 4318 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
4318 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4319 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
4319 { 4320 {
4320 proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4321 proc_dointvec_minmax(table, write, file, buffer, length, ppos);
4321 setup_per_zone_lowmem_reserve(); 4322 setup_per_zone_lowmem_reserve();
4322 return 0; 4323 return 0;
4323 } 4324 }
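setup_per_zone_lowmem_reserve() divides the page counts of the higher zones by the corresponding sysctl_lowmem_reserve_ratio entry. A rough illustration, with assumed zone sizes and an assumed ratio of 256:

/*
 * Illustrative only: if ZONE_NORMAL has 220000 present pages and
 * sysctl_lowmem_reserve_ratio[ZONE_DMA] == 256, then after the handler
 * above runs, ZONE_DMA holds back roughly 220000 / 256 = 859 pages
 * from allocations that could also have been satisfied from ZONE_NORMAL.
 */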
4324 4325
4325 /* 4326 /*
4326 * percpu_pagelist_fraction - changes pcp->high for each zone on each 4327 * percpu_pagelist_fraction - changes pcp->high for each zone on each
4327 * cpu. It is the fraction of a zone's total pages that a hot per-cpu 4328 * cpu. It is the fraction of a zone's total pages that a hot per-cpu
4328 * pagelist may hold before it is flushed back to the buddy allocator. 4329 * pagelist may hold before it is flushed back to the buddy allocator.
4329 */ 4330 */
4330 4331
4331 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 4332 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
4332 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4333 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
4333 { 4334 {
4334 struct zone *zone; 4335 struct zone *zone;
4335 unsigned int cpu; 4336 unsigned int cpu;
4336 int ret; 4337 int ret;
4337 4338
4338 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4339 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
4339 if (!write || (ret == -EINVAL)) 4340 if (!write || (ret == -EINVAL))
4340 return ret; 4341 return ret;
4341 for_each_zone(zone) { 4342 for_each_zone(zone) {
4342 for_each_online_cpu(cpu) { 4343 for_each_online_cpu(cpu) {
4343 unsigned long high; 4344 unsigned long high;
4344 high = zone->present_pages / percpu_pagelist_fraction; 4345 high = zone->present_pages / percpu_pagelist_fraction;
4345 setup_pagelist_highmark(zone_pcp(zone, cpu), high); 4346 setup_pagelist_highmark(zone_pcp(zone, cpu), high);
4346 } 4347 }
4347 } 4348 }
4348 return 0; 4349 return 0;
4349 } 4350 }
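The fraction is applied per zone and per online cpu; setup_pagelist_highmark() then derives pcp->batch from the new high watermark. With made-up numbers (illustrative only):

/*
 * Illustrative only: a zone with 262144 present pages and
 * percpu_pagelist_fraction == 64 gives each per-cpu pagelist a high
 * watermark of 262144 / 64 = 4096 pages before it is drained back
 * to the buddy lists.
 */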
4350 4351
4351 int hashdist = HASHDIST_DEFAULT; 4352 int hashdist = HASHDIST_DEFAULT;
4352 4353
4353 #ifdef CONFIG_NUMA 4354 #ifdef CONFIG_NUMA
4354 static int __init set_hashdist(char *str) 4355 static int __init set_hashdist(char *str)
4355 { 4356 {
4356 if (!str) 4357 if (!str)
4357 return 0; 4358 return 0;
4358 hashdist = simple_strtoul(str, &str, 0); 4359 hashdist = simple_strtoul(str, &str, 0);
4359 return 1; 4360 return 1;
4360 } 4361 }
4361 __setup("hashdist=", set_hashdist); 4362 __setup("hashdist=", set_hashdist);
4362 #endif 4363 #endif
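set_hashdist() parses a boot-time override for how the large system hashes below are placed. The summary here follows the documented semantics of the hashdist= parameter and should be read as a reminder, not as authoritative text:

/*
 * Boot-time usage (summary of the documented hashdist= semantics):
 *
 *   hashdist=0   allocate the large system hashes with the page
 *                allocator, physically contiguous and node-local
 *   hashdist=1   (the NUMA default via HASHDIST_DEFAULT) allocate them
 *                with __vmalloc() so the pages may be spread across nodes
 */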
4363 4364
4364 /* 4365 /*
4365 * allocate a large system hash table from bootmem 4366 * allocate a large system hash table from bootmem
4366 * - it is assumed that the hash table must contain an exact power-of-2 4367 * - it is assumed that the hash table must contain an exact power-of-2
4367 * quantity of entries 4368 * quantity of entries
4368 * - limit is the number of hash buckets, not the total allocation size 4369 * - limit is the number of hash buckets, not the total allocation size
4369 */ 4370 */
4370 void *__init alloc_large_system_hash(const char *tablename, 4371 void *__init alloc_large_system_hash(const char *tablename,
4371 unsigned long bucketsize, 4372 unsigned long bucketsize,
4372 unsigned long numentries, 4373 unsigned long numentries,
4373 int scale, 4374 int scale,
4374 int flags, 4375 int flags,
4375 unsigned int *_hash_shift, 4376 unsigned int *_hash_shift,
4376 unsigned int *_hash_mask, 4377 unsigned int *_hash_mask,
4377 unsigned long limit) 4378 unsigned long limit)
4378 { 4379 {
4379 unsigned long long max = limit; 4380 unsigned long long max = limit;
4380 unsigned long log2qty, size; 4381 unsigned long log2qty, size;
4381 void *table = NULL; 4382 void *table = NULL;
4382 4383
4383 /* allow the kernel cmdline to have a say */ 4384 /* allow the kernel cmdline to have a say */
4384 if (!numentries) { 4385 if (!numentries) {
4385 /* round applicable memory size up to nearest megabyte */ 4386 /* round applicable memory size up to nearest megabyte */
4386 numentries = nr_kernel_pages; 4387 numentries = nr_kernel_pages;
4387 numentries += (1UL << (20 - PAGE_SHIFT)) - 1; 4388 numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
4388 numentries >>= 20 - PAGE_SHIFT; 4389 numentries >>= 20 - PAGE_SHIFT;
4389 numentries <<= 20 - PAGE_SHIFT; 4390 numentries <<= 20 - PAGE_SHIFT;
4390 4391
4391 /* limit to 1 bucket per 2^scale bytes of low memory */ 4392 /* limit to 1 bucket per 2^scale bytes of low memory */
4392 if (scale > PAGE_SHIFT) 4393 if (scale > PAGE_SHIFT)
4393 numentries >>= (scale - PAGE_SHIFT); 4394 numentries >>= (scale - PAGE_SHIFT);
4394 else 4395 else
4395 numentries <<= (PAGE_SHIFT - scale); 4396 numentries <<= (PAGE_SHIFT - scale);
4396 4397
4397 /* Make sure we've got at least a 0-order allocation.. */ 4398 /* Make sure we've got at least a 0-order allocation.. */
4398 if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 4399 if (unlikely((numentries * bucketsize) < PAGE_SIZE))
4399 numentries = PAGE_SIZE / bucketsize; 4400 numentries = PAGE_SIZE / bucketsize;
4400 } 4401 }
4401 numentries = roundup_pow_of_two(numentries); 4402 numentries = roundup_pow_of_two(numentries);
4402 4403
4403 /* limit allocation size to 1/16 total memory by default */ 4404 /* limit allocation size to 1/16 total memory by default */
4404 if (max == 0) { 4405 if (max == 0) {
4405 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 4406 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
4406 do_div(max, bucketsize); 4407 do_div(max, bucketsize);
4407 } 4408 }
4408 4409
4409 if (numentries > max) 4410 if (numentries > max)
4410 numentries = max; 4411 numentries = max;
4411 4412
4412 log2qty = ilog2(numentries); 4413 log2qty = ilog2(numentries);
4413 4414
4414 do { 4415 do {
4415 size = bucketsize << log2qty; 4416 size = bucketsize << log2qty;
4416 if (flags & HASH_EARLY) 4417 if (flags & HASH_EARLY)
4417 table = alloc_bootmem(size); 4418 table = alloc_bootmem(size);
4418 else if (hashdist) 4419 else if (hashdist)
4419 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 4420 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
4420 else { 4421 else {
4421 unsigned long order = get_order(size); 4422 unsigned long order = get_order(size);
4422 table = (void*) __get_free_pages(GFP_ATOMIC, order); 4423 table = (void*) __get_free_pages(GFP_ATOMIC, order);
4423 /* 4424 /*
4424 * If bucketsize is not a power-of-two, we may free 4425 * If bucketsize is not a power-of-two, we may free
4425 * some pages at the end of the hash table. 4426 * some pages at the end of the hash table.
4426 */ 4427 */
4427 if (table) { 4428 if (table) {
4428 unsigned long alloc_end = (unsigned long)table + 4429 unsigned long alloc_end = (unsigned long)table +
4429 (PAGE_SIZE << order); 4430 (PAGE_SIZE << order);
4430 unsigned long used = (unsigned long)table + 4431 unsigned long used = (unsigned long)table +
4431 PAGE_ALIGN(size); 4432 PAGE_ALIGN(size);
4432 split_page(virt_to_page(table), order); 4433 split_page(virt_to_page(table), order);
4433 while (used < alloc_end) { 4434 while (used < alloc_end) {
4434 free_page(used); 4435 free_page(used);
4435 used += PAGE_SIZE; 4436 used += PAGE_SIZE;
4436 } 4437 }
4437 } 4438 }
4438 } 4439 }
4439 } while (!table && size > PAGE_SIZE && --log2qty); 4440 } while (!table && size > PAGE_SIZE && --log2qty);
4440 4441
4441 if (!table) 4442 if (!table)
4442 panic("Failed to allocate %s hash table\n", tablename); 4443 panic("Failed to allocate %s hash table\n", tablename);
4443 4444
4444 printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n", 4445 printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n",
4445 tablename, 4446 tablename,
4446 (1U << log2qty), 4447 (1U << log2qty),
4447 ilog2(size) - PAGE_SHIFT, 4448 ilog2(size) - PAGE_SHIFT,
4448 size); 4449 size);
4449 4450
4450 if (_hash_shift) 4451 if (_hash_shift)
4451 *_hash_shift = log2qty; 4452 *_hash_shift = log2qty;
4452 if (_hash_mask) 4453 if (_hash_mask)
4453 *_hash_mask = (1 << log2qty) - 1; 4454 *_hash_mask = (1 << log2qty) - 1;
4454 4455
4455 return table; 4456 return table;
4456 } 4457 }
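A typical caller is the dentry-hash setup in fs/dcache.c; the sketch below paraphrases that call, with the surrounding declarations added so it reads on its own. The exact arguments upstream may differ, so treat this as an illustration rather than a verbatim quote.

static struct hlist_head *dentry_hashtable;
static unsigned int d_hash_shift, d_hash_mask;
static unsigned long dhash_entries;	/* 0 unless "dhash_entries=" was given */

static void __init example_dcache_hash_init(void)
{
	dentry_hashtable =
		alloc_large_system_hash("Dentry cache",
					sizeof(struct hlist_head),
					dhash_entries,
					13,	/* one bucket per 8KB of lowmem */
					HASH_EARLY,
					&d_hash_shift,
					&d_hash_mask,
					0);	/* 0: default 1/16-of-memory cap */
}

Because HASH_EARLY is passed, the table comes from bootmem; later callers without that flag take the vmalloc or __get_free_pages() paths shown above.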
4457 4458
4458 #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE 4459 #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
4459 struct page *pfn_to_page(unsigned long pfn) 4460 struct page *pfn_to_page(unsigned long pfn)
4460 { 4461 {
4461 return __pfn_to_page(pfn); 4462 return __pfn_to_page(pfn);
4462 } 4463 }
4463 unsigned long page_to_pfn(struct page *page) 4464 unsigned long page_to_pfn(struct page *page)
4464 { 4465 {
4465 return __page_to_pfn(page); 4466 return __page_to_pfn(page);
4466 } 4467 }
4467 EXPORT_SYMBOL(pfn_to_page); 4468 EXPORT_SYMBOL(pfn_to_page);
4468 EXPORT_SYMBOL(page_to_pfn); 4469 EXPORT_SYMBOL(page_to_pfn);
4469 #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ 4470 #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
4470 4471
4471 /* Return a pointer to the bitmap storing bits affecting a block of pages */ 4472 /* Return a pointer to the bitmap storing bits affecting a block of pages */
4472 static inline unsigned long *get_pageblock_bitmap(struct zone *zone, 4473 static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
4473 unsigned long pfn) 4474 unsigned long pfn)
4474 { 4475 {
4475 #ifdef CONFIG_SPARSEMEM 4476 #ifdef CONFIG_SPARSEMEM
4476 return __pfn_to_section(pfn)->pageblock_flags; 4477 return __pfn_to_section(pfn)->pageblock_flags;
4477 #else 4478 #else
4478 return zone->pageblock_flags; 4479 return zone->pageblock_flags;
4479 #endif /* CONFIG_SPARSEMEM */ 4480 #endif /* CONFIG_SPARSEMEM */
4480 } 4481 }
4481 4482
4482 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) 4483 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
4483 { 4484 {
4484 #ifdef CONFIG_SPARSEMEM 4485 #ifdef CONFIG_SPARSEMEM
4485 pfn &= (PAGES_PER_SECTION-1); 4486 pfn &= (PAGES_PER_SECTION-1);
4486 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 4487 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
4487 #else 4488 #else
4488 pfn = pfn - zone->zone_start_pfn; 4489 pfn = pfn - zone->zone_start_pfn;
4489 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 4490 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
4490 #endif /* CONFIG_SPARSEMEM */ 4491 #endif /* CONFIG_SPARSEMEM */
4491 } 4492 }
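The bit-index arithmetic packs NR_PAGEBLOCK_BITS flag bits per pageblock. A quick illustration, assuming pageblock_order == 9 and NR_PAGEBLOCK_BITS == 4 (both values depend on the configuration):

/*
 * Illustrative only: in the non-SPARSEMEM case, a pfn 5120 pages into
 * the zone falls in pageblock 5120 >> 9 = 10, so its flags occupy bits
 * 40..43 (10 * 4) of zone->pageblock_flags.
 */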
4492 4493
4493 /** 4494 /**
4494 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages 4495 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages
4495 * @page: The page within the block of interest 4496 * @page: The page within the block of interest
4496 * @start_bitidx: The first bit of interest to retrieve 4497 * @start_bitidx: The first bit of interest to retrieve
4497 * @end_bitidx: The last bit of interest 4498 * @end_bitidx: The last bit of interest
4498 * returns pageblock_bits flags 4499 * returns pageblock_bits flags
4499 */ 4500 */
4500 unsigned long get_pageblock_flags_group(struct page *page, 4501 unsigned long get_pageblock_flags_group(struct page *page,
4501 int start_bitidx, int end_bitidx) 4502 int start_bitidx, int end_bitidx)
4502 { 4503 {
4503 struct zone *zone; 4504 struct zone *zone;
4504 unsigned long *bitmap; 4505 unsigned long *bitmap;
4505 unsigned long pfn, bitidx; 4506 unsigned long pfn, bitidx;
4506 unsigned long flags = 0; 4507 unsigned long flags = 0;
4507 unsigned long value = 1; 4508 unsigned long value = 1;
4508 4509
4509 zone = page_zone(page); 4510 zone = page_zone(page);
4510 pfn = page_to_pfn(page); 4511 pfn = page_to_pfn(page);
4511 bitmap = get_pageblock_bitmap(zone, pfn); 4512 bitmap = get_pageblock_bitmap(zone, pfn);
4512 bitidx = pfn_to_bitidx(zone, pfn); 4513 bitidx = pfn_to_bitidx(zone, pfn);
4513 4514
4514 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 4515 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
4515 if (test_bit(bitidx + start_bitidx, bitmap)) 4516 if (test_bit(bitidx + start_bitidx, bitmap))
4516 flags |= value; 4517 flags |= value;
4517 4518
4518 return flags; 4519 return flags;
4519 } 4520 }
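Most callers reach this through the migratetype wrapper rather than passing bit indices themselves. Paraphrased from include/linux/pageblock-flags.h (the macro text there may differ slightly):

#define get_pageblock_migratetype(page) \
	get_pageblock_flags_group(page, PB_migrate, PB_migrate_end)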
4520 4521
4521 /** 4522 /**
4522 * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages 4523 * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages
4523 * @page: The page within the block of interest 4524 * @page: The page within the block of interest
4524 * @start_bitidx: The first bit of interest 4525 * @start_bitidx: The first bit of interest
4525 * @end_bitidx: The last bit of interest 4526 * @end_bitidx: The last bit of interest
4526 * @flags: The flags to set 4527 * @flags: The flags to set
4527 */ 4528 */
4528 void set_pageblock_flags_group(struct page *page, unsigned long flags, 4529 void set_pageblock_flags_group(struct page *page, unsigned long flags,
4529 int start_bitidx, int end_bitidx) 4530 int start_bitidx, int end_bitidx)
4530 { 4531 {
4531 struct zone *zone; 4532 struct zone *zone;
4532 unsigned long *bitmap; 4533 unsigned long *bitmap;
4533 unsigned long pfn, bitidx; 4534 unsigned long pfn, bitidx;
4534 unsigned long value = 1; 4535 unsigned long value = 1;
4535 4536
4536 zone = page_zone(page); 4537 zone = page_zone(page);
4537 pfn = page_to_pfn(page); 4538 pfn = page_to_pfn(page);
4538 bitmap = get_pageblock_bitmap(zone, pfn); 4539 bitmap = get_pageblock_bitmap(zone, pfn);
4539 bitidx = pfn_to_bitidx(zone, pfn); 4540 bitidx = pfn_to_bitidx(zone, pfn);
4540 VM_BUG_ON(pfn < zone->zone_start_pfn); 4541 VM_BUG_ON(pfn < zone->zone_start_pfn);
4541 VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages); 4542 VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages);
4542 4543
4543 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 4544 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
4544 if (flags & value) 4545 if (flags & value)
4545 __set_bit(bitidx + start_bitidx, bitmap); 4546 __set_bit(bitidx + start_bitidx, bitmap);
4546 else 4547 else
4547 __clear_bit(bitidx + start_bitidx, bitmap); 4548 __clear_bit(bitidx + start_bitidx, bitmap);
4548 } 4549 }
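The setter is likewise wrapped earlier in this file by set_pageblock_migratetype(); it is reproduced here, lightly paraphrased and outside this hunk, only so the isolation code below is easier to follow:

static void set_pageblock_migratetype(struct page *page, int migratetype)
{
	set_pageblock_flags_group(page, (unsigned long)migratetype,
					PB_migrate, PB_migrate_end);
}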
4549 4550
4550 /* 4551 /*
4551 * These are helper functions; see page_isolation.c as well. 4552 * These are helper functions; see page_isolation.c as well.
4552 * They set/clear a pageblock's type to/from ISOLATE. 4553 * They set/clear a pageblock's type to/from ISOLATE.
4553 * The page allocator never allocates memory from an ISOLATE block. 4554 * The page allocator never allocates memory from an ISOLATE block.
4554 */ 4555 */
4555 4556
4556 int set_migratetype_isolate(struct page *page) 4557 int set_migratetype_isolate(struct page *page)
4557 { 4558 {
4558 struct zone *zone; 4559 struct zone *zone;
4559 unsigned long flags; 4560 unsigned long flags;
4560 int ret = -EBUSY; 4561 int ret = -EBUSY;
4561 4562
4562 zone = page_zone(page); 4563 zone = page_zone(page);
4563 spin_lock_irqsave(&zone->lock, flags); 4564 spin_lock_irqsave(&zone->lock, flags);
4564 /* 4565 /*
4565 * In the future, more migrate types may become isolation targets. 4566 * In the future, more migrate types may become isolation targets.
4566 */ 4567 */
4567 if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE) 4568 if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE)
4568 goto out; 4569 goto out;
4569 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 4570 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
4570 move_freepages_block(zone, page, MIGRATE_ISOLATE); 4571 move_freepages_block(zone, page, MIGRATE_ISOLATE);
4571 ret = 0; 4572 ret = 0;
4572 out: 4573 out:
4573 spin_unlock_irqrestore(&zone->lock, flags); 4574 spin_unlock_irqrestore(&zone->lock, flags);
4574 if (!ret) 4575 if (!ret)
4575 drain_all_pages(); 4576 drain_all_pages();
4576 return ret; 4577 return ret;
4577 } 4578 }
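Callers such as memory hot-remove walk a pfn range pageblock by pageblock and isolate each block before migrating its contents. A simplified sketch of that pattern (compare start_isolate_page_range() in mm/page_isolation.c; the rollback of already-isolated blocks on failure is omitted here):

static int example_isolate_range(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
		if (!pfn_valid(pfn))
			continue;
		/* fails with -EBUSY if the block is not MIGRATE_MOVABLE */
		if (set_migratetype_isolate(pfn_to_page(pfn)))
			return -EBUSY;
	}
	return 0;
}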
4578 4579
4579 void unset_migratetype_isolate(struct page *page) 4580 void unset_migratetype_isolate(struct page *page)
4580 { 4581 {
4581 struct zone *zone; 4582 struct zone *zone;
4582 unsigned long flags; 4583 unsigned long flags;
4583 zone = page_zone(page); 4584 zone = page_zone(page);
4584 spin_lock_irqsave(&zone->lock, flags); 4585 spin_lock_irqsave(&zone->lock, flags);
4585 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) 4586 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
4586 goto out; 4587 goto out;
4587 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 4588 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
4588 move_freepages_block(zone, page, MIGRATE_MOVABLE); 4589 move_freepages_block(zone, page, MIGRATE_MOVABLE);
4589 out: 4590 out:
4590 spin_unlock_irqrestore(&zone->lock, flags); 4591 spin_unlock_irqrestore(&zone->lock, flags);
4591 } 4592 }
4592 4593
4593 #ifdef CONFIG_MEMORY_HOTREMOVE 4594 #ifdef CONFIG_MEMORY_HOTREMOVE
4594 /* 4595 /*
4595 * All pages in the range must be isolated before calling this. 4596 * All pages in the range must be isolated before calling this.
4596 */ 4597 */
4597 void 4598 void
4598 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 4599 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
4599 { 4600 {
4600 struct page *page; 4601 struct page *page;
4601 struct zone *zone; 4602 struct zone *zone;
4602 int order, i; 4603 int order, i;
4603 unsigned long pfn; 4604 unsigned long pfn;
4604 unsigned long flags; 4605 unsigned long flags;
4605 /* find the first valid pfn */ 4606 /* find the first valid pfn */
4606 for (pfn = start_pfn; pfn < end_pfn; pfn++) 4607 for (pfn = start_pfn; pfn < end_pfn; pfn++)
4607 if (pfn_valid(pfn)) 4608 if (pfn_valid(pfn))
4608 break; 4609 break;
4609 if (pfn == end_pfn) 4610 if (pfn == end_pfn)
4610 return; 4611 return;
4611 zone = page_zone(pfn_to_page(pfn)); 4612 zone = page_zone(pfn_to_page(pfn));
4612 spin_lock_irqsave(&zone->lock, flags); 4613 spin_lock_irqsave(&zone->lock, flags);
4613 pfn = start_pfn; 4614 pfn = start_pfn;
4614 while (pfn < end_pfn) { 4615 while (pfn < end_pfn) {
4615 if (!pfn_valid(pfn)) { 4616 if (!pfn_valid(pfn)) {
4616 pfn++; 4617 pfn++;
4617 continue; 4618 continue;
4618 } 4619 }
4619 page = pfn_to_page(pfn); 4620 page = pfn_to_page(pfn);
4620 BUG_ON(page_count(page)); 4621 BUG_ON(page_count(page));
4621 BUG_ON(!PageBuddy(page)); 4622 BUG_ON(!PageBuddy(page));
4622 order = page_order(page); 4623 order = page_order(page);
4623 #ifdef CONFIG_DEBUG_VM 4624 #ifdef CONFIG_DEBUG_VM
4624 printk(KERN_INFO "remove from free list %lx %d %lx\n", 4625 printk(KERN_INFO "remove from free list %lx %d %lx\n",
4625 pfn, 1 << order, end_pfn); 4626 pfn, 1 << order, end_pfn);
4626 #endif 4627 #endif
4627 list_del(&page->lru); 4628 list_del(&page->lru);
4628 rmv_page_order(page); 4629 rmv_page_order(page);
4629 zone->free_area[order].nr_free--; 4630 zone->free_area[order].nr_free--;
4630 __mod_zone_page_state(zone, NR_FREE_PAGES, 4631 __mod_zone_page_state(zone, NR_FREE_PAGES,
4631 - (1UL << order)); 4632 - (1UL << order));
4632 for (i = 0; i < (1 << order); i++) 4633 for (i = 0; i < (1 << order); i++)
4633 SetPageReserved((page+i)); 4634 SetPageReserved((page+i));
4634 pfn += (1 << order); 4635 pfn += (1 << order);
4635 } 4636 }
4636 spin_unlock_irqrestore(&zone->lock, flags); 4637 spin_unlock_irqrestore(&zone->lock, flags);
4637 } 4638 }
4638 #endif 4639 #endif
4639 4640