Commit 208d54e5513c0c02d85af0990901354c74364d5c

Authored by Dave Hansen
Committed by Linus Torvalds
1 parent c6a57e19e4

[PATCH] memory hotplug locking: node_size_lock

pgdat->node_size_lock is basically only needed in one place in the normal
code: show_mem(), which is the arch-specific sysrq-m printing function.
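
The pgdat_resize_lock()/pgdat_resize_unlock() helpers used throughout the diff
below are defined in the generic memory-hotplug header, which is not shown in
this excerpt.  Presumably they are thin interrupt-safe wrappers around the new
pgdat->node_size_lock, roughly along these lines (a sketch, not the actual
definitions):

	/* Assumed shape of the helpers; the real definitions live in the
	 * generic header, alongside the node_size_lock spinlock that this
	 * patch series adds to struct pglist_data. */
	static inline void pgdat_resize_lock(struct pglist_data *pgdat,
					     unsigned long *flags)
	{
		spin_lock_irqsave(&pgdat->node_size_lock, *flags);
	}

	static inline void pgdat_resize_unlock(struct pglist_data *pgdat,
					       unsigned long *flags)
	{
		spin_unlock_irqrestore(&pgdat->node_size_lock, *flags);
	}

The irqsave variant matters because show_mem() can be invoked from sysrq, i.e.
from interrupt context.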

Strictly speaking, the architectures not doing memory hotplug do not need this
locking in show_mem().  However, the locking is added to all of them for
completeness.  This should also make any future consolidation of the
implementations a little more straightforward.

This lock is also held in the sparsemem code during a memory removal, as
sections are invalidated.  This is the place where pfn_valid() is made false
for a memory area that is being removed.  The lock is only required when doing
pfn_valid() operations on memory for which the caller does not already hold a
reference on the page, such as in show_mem().

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 9 changed files with 76 additions and 2 deletions

arch/alpha/mm/numa.c
1 /* 1 /*
2 * linux/arch/alpha/mm/numa.c 2 * linux/arch/alpha/mm/numa.c
3 * 3 *
4 * DISCONTIGMEM NUMA alpha support. 4 * DISCONTIGMEM NUMA alpha support.
5 * 5 *
6 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE 6 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
7 */ 7 */
8 8
9 #include <linux/config.h> 9 #include <linux/config.h>
10 #include <linux/types.h> 10 #include <linux/types.h>
11 #include <linux/kernel.h> 11 #include <linux/kernel.h>
12 #include <linux/mm.h> 12 #include <linux/mm.h>
13 #include <linux/bootmem.h> 13 #include <linux/bootmem.h>
14 #include <linux/swap.h> 14 #include <linux/swap.h>
15 #include <linux/initrd.h> 15 #include <linux/initrd.h>
16 16
17 #include <asm/hwrpb.h> 17 #include <asm/hwrpb.h>
18 #include <asm/pgalloc.h> 18 #include <asm/pgalloc.h>
19 19
20 pg_data_t node_data[MAX_NUMNODES]; 20 pg_data_t node_data[MAX_NUMNODES];
21 bootmem_data_t node_bdata[MAX_NUMNODES]; 21 bootmem_data_t node_bdata[MAX_NUMNODES];
22 22
23 #undef DEBUG_DISCONTIG 23 #undef DEBUG_DISCONTIG
24 #ifdef DEBUG_DISCONTIG 24 #ifdef DEBUG_DISCONTIG
25 #define DBGDCONT(args...) printk(args) 25 #define DBGDCONT(args...) printk(args)
26 #else 26 #else
27 #define DBGDCONT(args...) 27 #define DBGDCONT(args...)
28 #endif 28 #endif
29 29
30 #define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) 30 #define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
31 #define PFN_DOWN(x) ((x) >> PAGE_SHIFT) 31 #define PFN_DOWN(x) ((x) >> PAGE_SHIFT)
32 #define PFN_PHYS(x) ((x) << PAGE_SHIFT) 32 #define PFN_PHYS(x) ((x) << PAGE_SHIFT)
33 #define for_each_mem_cluster(memdesc, cluster, i) \ 33 #define for_each_mem_cluster(memdesc, cluster, i) \
34 for ((cluster) = (memdesc)->cluster, (i) = 0; \ 34 for ((cluster) = (memdesc)->cluster, (i) = 0; \
35 (i) < (memdesc)->numclusters; (i)++, (cluster)++) 35 (i) < (memdesc)->numclusters; (i)++, (cluster)++)
36 36
37 static void __init show_mem_layout(void) 37 static void __init show_mem_layout(void)
38 { 38 {
39 struct memclust_struct * cluster; 39 struct memclust_struct * cluster;
40 struct memdesc_struct * memdesc; 40 struct memdesc_struct * memdesc;
41 int i; 41 int i;
42 42
43 /* Find free clusters, and init and free the bootmem accordingly. */ 43 /* Find free clusters, and init and free the bootmem accordingly. */
44 memdesc = (struct memdesc_struct *) 44 memdesc = (struct memdesc_struct *)
45 (hwrpb->mddt_offset + (unsigned long) hwrpb); 45 (hwrpb->mddt_offset + (unsigned long) hwrpb);
46 46
47 printk("Raw memory layout:\n"); 47 printk("Raw memory layout:\n");
48 for_each_mem_cluster(memdesc, cluster, i) { 48 for_each_mem_cluster(memdesc, cluster, i) {
49 printk(" memcluster %2d, usage %1lx, start %8lu, end %8lu\n", 49 printk(" memcluster %2d, usage %1lx, start %8lu, end %8lu\n",
50 i, cluster->usage, cluster->start_pfn, 50 i, cluster->usage, cluster->start_pfn,
51 cluster->start_pfn + cluster->numpages); 51 cluster->start_pfn + cluster->numpages);
52 } 52 }
53 } 53 }
54 54
55 static void __init 55 static void __init
56 setup_memory_node(int nid, void *kernel_end) 56 setup_memory_node(int nid, void *kernel_end)
57 { 57 {
58 extern unsigned long mem_size_limit; 58 extern unsigned long mem_size_limit;
59 struct memclust_struct * cluster; 59 struct memclust_struct * cluster;
60 struct memdesc_struct * memdesc; 60 struct memdesc_struct * memdesc;
61 unsigned long start_kernel_pfn, end_kernel_pfn; 61 unsigned long start_kernel_pfn, end_kernel_pfn;
62 unsigned long bootmap_size, bootmap_pages, bootmap_start; 62 unsigned long bootmap_size, bootmap_pages, bootmap_start;
63 unsigned long start, end; 63 unsigned long start, end;
64 unsigned long node_pfn_start, node_pfn_end; 64 unsigned long node_pfn_start, node_pfn_end;
65 unsigned long node_min_pfn, node_max_pfn; 65 unsigned long node_min_pfn, node_max_pfn;
66 int i; 66 int i;
67 unsigned long node_datasz = PFN_UP(sizeof(pg_data_t)); 67 unsigned long node_datasz = PFN_UP(sizeof(pg_data_t));
68 int show_init = 0; 68 int show_init = 0;
69 69
70 /* Find the bounds of current node */ 70 /* Find the bounds of current node */
71 node_pfn_start = (node_mem_start(nid)) >> PAGE_SHIFT; 71 node_pfn_start = (node_mem_start(nid)) >> PAGE_SHIFT;
72 node_pfn_end = node_pfn_start + (node_mem_size(nid) >> PAGE_SHIFT); 72 node_pfn_end = node_pfn_start + (node_mem_size(nid) >> PAGE_SHIFT);
73 73
74 /* Find free clusters, and init and free the bootmem accordingly. */ 74 /* Find free clusters, and init and free the bootmem accordingly. */
75 memdesc = (struct memdesc_struct *) 75 memdesc = (struct memdesc_struct *)
76 (hwrpb->mddt_offset + (unsigned long) hwrpb); 76 (hwrpb->mddt_offset + (unsigned long) hwrpb);
77 77
78 /* find the bounds of this node (node_min_pfn/node_max_pfn) */ 78 /* find the bounds of this node (node_min_pfn/node_max_pfn) */
79 node_min_pfn = ~0UL; 79 node_min_pfn = ~0UL;
80 node_max_pfn = 0UL; 80 node_max_pfn = 0UL;
81 for_each_mem_cluster(memdesc, cluster, i) { 81 for_each_mem_cluster(memdesc, cluster, i) {
82 /* Bit 0 is console/PALcode reserved. Bit 1 is 82 /* Bit 0 is console/PALcode reserved. Bit 1 is
83 non-volatile memory -- we might want to mark 83 non-volatile memory -- we might want to mark
84 this for later. */ 84 this for later. */
85 if (cluster->usage & 3) 85 if (cluster->usage & 3)
86 continue; 86 continue;
87 87
88 start = cluster->start_pfn; 88 start = cluster->start_pfn;
89 end = start + cluster->numpages; 89 end = start + cluster->numpages;
90 90
91 if (start >= node_pfn_end || end <= node_pfn_start) 91 if (start >= node_pfn_end || end <= node_pfn_start)
92 continue; 92 continue;
93 93
94 if (!show_init) { 94 if (!show_init) {
95 show_init = 1; 95 show_init = 1;
96 printk("Initializing bootmem allocator on Node ID %d\n", nid); 96 printk("Initializing bootmem allocator on Node ID %d\n", nid);
97 } 97 }
98 printk(" memcluster %2d, usage %1lx, start %8lu, end %8lu\n", 98 printk(" memcluster %2d, usage %1lx, start %8lu, end %8lu\n",
99 i, cluster->usage, cluster->start_pfn, 99 i, cluster->usage, cluster->start_pfn,
100 cluster->start_pfn + cluster->numpages); 100 cluster->start_pfn + cluster->numpages);
101 101
102 if (start < node_pfn_start) 102 if (start < node_pfn_start)
103 start = node_pfn_start; 103 start = node_pfn_start;
104 if (end > node_pfn_end) 104 if (end > node_pfn_end)
105 end = node_pfn_end; 105 end = node_pfn_end;
106 106
107 if (start < node_min_pfn) 107 if (start < node_min_pfn)
108 node_min_pfn = start; 108 node_min_pfn = start;
109 if (end > node_max_pfn) 109 if (end > node_max_pfn)
110 node_max_pfn = end; 110 node_max_pfn = end;
111 } 111 }
112 112
113 if (mem_size_limit && node_max_pfn > mem_size_limit) { 113 if (mem_size_limit && node_max_pfn > mem_size_limit) {
114 static int msg_shown = 0; 114 static int msg_shown = 0;
115 if (!msg_shown) { 115 if (!msg_shown) {
116 msg_shown = 1; 116 msg_shown = 1;
117 printk("setup: forcing memory size to %ldK (from %ldK).\n", 117 printk("setup: forcing memory size to %ldK (from %ldK).\n",
118 mem_size_limit << (PAGE_SHIFT - 10), 118 mem_size_limit << (PAGE_SHIFT - 10),
119 node_max_pfn << (PAGE_SHIFT - 10)); 119 node_max_pfn << (PAGE_SHIFT - 10));
120 } 120 }
121 node_max_pfn = mem_size_limit; 121 node_max_pfn = mem_size_limit;
122 } 122 }
123 123
124 if (node_min_pfn >= node_max_pfn) 124 if (node_min_pfn >= node_max_pfn)
125 return; 125 return;
126 126
127 /* Update global {min,max}_low_pfn from node information. */ 127 /* Update global {min,max}_low_pfn from node information. */
128 if (node_min_pfn < min_low_pfn) 128 if (node_min_pfn < min_low_pfn)
129 min_low_pfn = node_min_pfn; 129 min_low_pfn = node_min_pfn;
130 if (node_max_pfn > max_low_pfn) 130 if (node_max_pfn > max_low_pfn)
131 max_pfn = max_low_pfn = node_max_pfn; 131 max_pfn = max_low_pfn = node_max_pfn;
132 132
133 num_physpages += node_max_pfn - node_min_pfn; 133 num_physpages += node_max_pfn - node_min_pfn;
134 134
135 #if 0 /* we'll try this one again in a little while */ 135 #if 0 /* we'll try this one again in a little while */
136 /* Cute trick to make sure our local node data is on local memory */ 136 /* Cute trick to make sure our local node data is on local memory */
137 node_data[nid] = (pg_data_t *)(__va(node_min_pfn << PAGE_SHIFT)); 137 node_data[nid] = (pg_data_t *)(__va(node_min_pfn << PAGE_SHIFT));
138 #endif 138 #endif
139 /* Quasi-mark the pg_data_t as in-use */ 139 /* Quasi-mark the pg_data_t as in-use */
140 node_min_pfn += node_datasz; 140 node_min_pfn += node_datasz;
141 if (node_min_pfn >= node_max_pfn) { 141 if (node_min_pfn >= node_max_pfn) {
142 printk(" not enough mem to reserve NODE_DATA"); 142 printk(" not enough mem to reserve NODE_DATA");
143 return; 143 return;
144 } 144 }
145 NODE_DATA(nid)->bdata = &node_bdata[nid]; 145 NODE_DATA(nid)->bdata = &node_bdata[nid];
146 146
147 printk(" Detected node memory: start %8lu, end %8lu\n", 147 printk(" Detected node memory: start %8lu, end %8lu\n",
148 node_min_pfn, node_max_pfn); 148 node_min_pfn, node_max_pfn);
149 149
150 DBGDCONT(" DISCONTIG: node_data[%d] is at 0x%p\n", nid, NODE_DATA(nid)); 150 DBGDCONT(" DISCONTIG: node_data[%d] is at 0x%p\n", nid, NODE_DATA(nid));
151 DBGDCONT(" DISCONTIG: NODE_DATA(%d)->bdata is at 0x%p\n", nid, NODE_DATA(nid)->bdata); 151 DBGDCONT(" DISCONTIG: NODE_DATA(%d)->bdata is at 0x%p\n", nid, NODE_DATA(nid)->bdata);
152 152
153 /* Find the bounds of kernel memory. */ 153 /* Find the bounds of kernel memory. */
154 start_kernel_pfn = PFN_DOWN(KERNEL_START_PHYS); 154 start_kernel_pfn = PFN_DOWN(KERNEL_START_PHYS);
155 end_kernel_pfn = PFN_UP(virt_to_phys(kernel_end)); 155 end_kernel_pfn = PFN_UP(virt_to_phys(kernel_end));
156 bootmap_start = -1; 156 bootmap_start = -1;
157 157
158 if (!nid && (node_max_pfn < end_kernel_pfn || node_min_pfn > start_kernel_pfn)) 158 if (!nid && (node_max_pfn < end_kernel_pfn || node_min_pfn > start_kernel_pfn))
159 panic("kernel loaded out of ram"); 159 panic("kernel loaded out of ram");
160 160
161 /* Zone start phys-addr must be 2^(MAX_ORDER-1) aligned. 161 /* Zone start phys-addr must be 2^(MAX_ORDER-1) aligned.
162 Note that we round this down, not up - node memory 162 Note that we round this down, not up - node memory
163 has much larger alignment than 8Mb, so it's safe. */ 163 has much larger alignment than 8Mb, so it's safe. */
164 node_min_pfn &= ~((1UL << (MAX_ORDER-1))-1); 164 node_min_pfn &= ~((1UL << (MAX_ORDER-1))-1);
165 165
166 /* We need to know how many physically contiguous pages 166 /* We need to know how many physically contiguous pages
167 we'll need for the bootmap. */ 167 we'll need for the bootmap. */
168 bootmap_pages = bootmem_bootmap_pages(node_max_pfn-node_min_pfn); 168 bootmap_pages = bootmem_bootmap_pages(node_max_pfn-node_min_pfn);
169 169
170 /* Now find a good region where to allocate the bootmap. */ 170 /* Now find a good region where to allocate the bootmap. */
171 for_each_mem_cluster(memdesc, cluster, i) { 171 for_each_mem_cluster(memdesc, cluster, i) {
172 if (cluster->usage & 3) 172 if (cluster->usage & 3)
173 continue; 173 continue;
174 174
175 start = cluster->start_pfn; 175 start = cluster->start_pfn;
176 end = start + cluster->numpages; 176 end = start + cluster->numpages;
177 177
178 if (start >= node_max_pfn || end <= node_min_pfn) 178 if (start >= node_max_pfn || end <= node_min_pfn)
179 continue; 179 continue;
180 180
181 if (end > node_max_pfn) 181 if (end > node_max_pfn)
182 end = node_max_pfn; 182 end = node_max_pfn;
183 if (start < node_min_pfn) 183 if (start < node_min_pfn)
184 start = node_min_pfn; 184 start = node_min_pfn;
185 185
186 if (start < start_kernel_pfn) { 186 if (start < start_kernel_pfn) {
187 if (end > end_kernel_pfn 187 if (end > end_kernel_pfn
188 && end - end_kernel_pfn >= bootmap_pages) { 188 && end - end_kernel_pfn >= bootmap_pages) {
189 bootmap_start = end_kernel_pfn; 189 bootmap_start = end_kernel_pfn;
190 break; 190 break;
191 } else if (end > start_kernel_pfn) 191 } else if (end > start_kernel_pfn)
192 end = start_kernel_pfn; 192 end = start_kernel_pfn;
193 } else if (start < end_kernel_pfn) 193 } else if (start < end_kernel_pfn)
194 start = end_kernel_pfn; 194 start = end_kernel_pfn;
195 if (end - start >= bootmap_pages) { 195 if (end - start >= bootmap_pages) {
196 bootmap_start = start; 196 bootmap_start = start;
197 break; 197 break;
198 } 198 }
199 } 199 }
200 200
201 if (bootmap_start == -1) 201 if (bootmap_start == -1)
202 panic("couldn't find a contigous place for the bootmap"); 202 panic("couldn't find a contigous place for the bootmap");
203 203
204 /* Allocate the bootmap and mark the whole MM as reserved. */ 204 /* Allocate the bootmap and mark the whole MM as reserved. */
205 bootmap_size = init_bootmem_node(NODE_DATA(nid), bootmap_start, 205 bootmap_size = init_bootmem_node(NODE_DATA(nid), bootmap_start,
206 node_min_pfn, node_max_pfn); 206 node_min_pfn, node_max_pfn);
207 DBGDCONT(" bootmap_start %lu, bootmap_size %lu, bootmap_pages %lu\n", 207 DBGDCONT(" bootmap_start %lu, bootmap_size %lu, bootmap_pages %lu\n",
208 bootmap_start, bootmap_size, bootmap_pages); 208 bootmap_start, bootmap_size, bootmap_pages);
209 209
210 /* Mark the free regions. */ 210 /* Mark the free regions. */
211 for_each_mem_cluster(memdesc, cluster, i) { 211 for_each_mem_cluster(memdesc, cluster, i) {
212 if (cluster->usage & 3) 212 if (cluster->usage & 3)
213 continue; 213 continue;
214 214
215 start = cluster->start_pfn; 215 start = cluster->start_pfn;
216 end = cluster->start_pfn + cluster->numpages; 216 end = cluster->start_pfn + cluster->numpages;
217 217
218 if (start >= node_max_pfn || end <= node_min_pfn) 218 if (start >= node_max_pfn || end <= node_min_pfn)
219 continue; 219 continue;
220 220
221 if (end > node_max_pfn) 221 if (end > node_max_pfn)
222 end = node_max_pfn; 222 end = node_max_pfn;
223 if (start < node_min_pfn) 223 if (start < node_min_pfn)
224 start = node_min_pfn; 224 start = node_min_pfn;
225 225
226 if (start < start_kernel_pfn) { 226 if (start < start_kernel_pfn) {
227 if (end > end_kernel_pfn) { 227 if (end > end_kernel_pfn) {
228 free_bootmem_node(NODE_DATA(nid), PFN_PHYS(start), 228 free_bootmem_node(NODE_DATA(nid), PFN_PHYS(start),
229 (PFN_PHYS(start_kernel_pfn) 229 (PFN_PHYS(start_kernel_pfn)
230 - PFN_PHYS(start))); 230 - PFN_PHYS(start)));
231 printk(" freeing pages %ld:%ld\n", 231 printk(" freeing pages %ld:%ld\n",
232 start, start_kernel_pfn); 232 start, start_kernel_pfn);
233 start = end_kernel_pfn; 233 start = end_kernel_pfn;
234 } else if (end > start_kernel_pfn) 234 } else if (end > start_kernel_pfn)
235 end = start_kernel_pfn; 235 end = start_kernel_pfn;
236 } else if (start < end_kernel_pfn) 236 } else if (start < end_kernel_pfn)
237 start = end_kernel_pfn; 237 start = end_kernel_pfn;
238 if (start >= end) 238 if (start >= end)
239 continue; 239 continue;
240 240
241 free_bootmem_node(NODE_DATA(nid), PFN_PHYS(start), PFN_PHYS(end) - PFN_PHYS(start)); 241 free_bootmem_node(NODE_DATA(nid), PFN_PHYS(start), PFN_PHYS(end) - PFN_PHYS(start));
242 printk(" freeing pages %ld:%ld\n", start, end); 242 printk(" freeing pages %ld:%ld\n", start, end);
243 } 243 }
244 244
245 /* Reserve the bootmap memory. */ 245 /* Reserve the bootmap memory. */
246 reserve_bootmem_node(NODE_DATA(nid), PFN_PHYS(bootmap_start), bootmap_size); 246 reserve_bootmem_node(NODE_DATA(nid), PFN_PHYS(bootmap_start), bootmap_size);
247 printk(" reserving pages %ld:%ld\n", bootmap_start, bootmap_start+PFN_UP(bootmap_size)); 247 printk(" reserving pages %ld:%ld\n", bootmap_start, bootmap_start+PFN_UP(bootmap_size));
248 248
249 node_set_online(nid); 249 node_set_online(nid);
250 } 250 }
251 251
252 void __init 252 void __init
253 setup_memory(void *kernel_end) 253 setup_memory(void *kernel_end)
254 { 254 {
255 int nid; 255 int nid;
256 256
257 show_mem_layout(); 257 show_mem_layout();
258 258
259 nodes_clear(node_online_map); 259 nodes_clear(node_online_map);
260 260
261 min_low_pfn = ~0UL; 261 min_low_pfn = ~0UL;
262 max_low_pfn = 0UL; 262 max_low_pfn = 0UL;
263 for (nid = 0; nid < MAX_NUMNODES; nid++) 263 for (nid = 0; nid < MAX_NUMNODES; nid++)
264 setup_memory_node(nid, kernel_end); 264 setup_memory_node(nid, kernel_end);
265 265
266 #ifdef CONFIG_BLK_DEV_INITRD 266 #ifdef CONFIG_BLK_DEV_INITRD
267 initrd_start = INITRD_START; 267 initrd_start = INITRD_START;
268 if (initrd_start) { 268 if (initrd_start) {
269 extern void *move_initrd(unsigned long); 269 extern void *move_initrd(unsigned long);
270 270
271 initrd_end = initrd_start+INITRD_SIZE; 271 initrd_end = initrd_start+INITRD_SIZE;
272 printk("Initial ramdisk at: 0x%p (%lu bytes)\n", 272 printk("Initial ramdisk at: 0x%p (%lu bytes)\n",
273 (void *) initrd_start, INITRD_SIZE); 273 (void *) initrd_start, INITRD_SIZE);
274 274
275 if ((void *)initrd_end > phys_to_virt(PFN_PHYS(max_low_pfn))) { 275 if ((void *)initrd_end > phys_to_virt(PFN_PHYS(max_low_pfn))) {
276 if (!move_initrd(PFN_PHYS(max_low_pfn))) 276 if (!move_initrd(PFN_PHYS(max_low_pfn)))
277 printk("initrd extends beyond end of memory " 277 printk("initrd extends beyond end of memory "
278 "(0x%08lx > 0x%p)\ndisabling initrd\n", 278 "(0x%08lx > 0x%p)\ndisabling initrd\n",
279 initrd_end, 279 initrd_end,
280 phys_to_virt(PFN_PHYS(max_low_pfn))); 280 phys_to_virt(PFN_PHYS(max_low_pfn)));
281 } else { 281 } else {
282 nid = kvaddr_to_nid(initrd_start); 282 nid = kvaddr_to_nid(initrd_start);
283 reserve_bootmem_node(NODE_DATA(nid), 283 reserve_bootmem_node(NODE_DATA(nid),
284 virt_to_phys((void *)initrd_start), 284 virt_to_phys((void *)initrd_start),
285 INITRD_SIZE); 285 INITRD_SIZE);
286 } 286 }
287 } 287 }
288 #endif /* CONFIG_BLK_DEV_INITRD */ 288 #endif /* CONFIG_BLK_DEV_INITRD */
289 } 289 }
290 290
291 void __init paging_init(void) 291 void __init paging_init(void)
292 { 292 {
293 unsigned int nid; 293 unsigned int nid;
294 unsigned long zones_size[MAX_NR_ZONES] = {0, }; 294 unsigned long zones_size[MAX_NR_ZONES] = {0, };
295 unsigned long dma_local_pfn; 295 unsigned long dma_local_pfn;
296 296
297 /* 297 /*
298 * The old global MAX_DMA_ADDRESS per-arch API doesn't fit 298 * The old global MAX_DMA_ADDRESS per-arch API doesn't fit
299 * in the NUMA model, for now we convert it to a pfn and 299 * in the NUMA model, for now we convert it to a pfn and
300 * we interpret this pfn as a local per-node information. 300 * we interpret this pfn as a local per-node information.
301 * This issue isn't very important since none of these machines 301 * This issue isn't very important since none of these machines
302 * have legacy ISA slots anyways. 302 * have legacy ISA slots anyways.
303 */ 303 */
304 dma_local_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; 304 dma_local_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
305 305
306 for_each_online_node(nid) { 306 for_each_online_node(nid) {
307 unsigned long start_pfn = node_bdata[nid].node_boot_start >> PAGE_SHIFT; 307 unsigned long start_pfn = node_bdata[nid].node_boot_start >> PAGE_SHIFT;
308 unsigned long end_pfn = node_bdata[nid].node_low_pfn; 308 unsigned long end_pfn = node_bdata[nid].node_low_pfn;
309 309
310 if (dma_local_pfn >= end_pfn - start_pfn) 310 if (dma_local_pfn >= end_pfn - start_pfn)
311 zones_size[ZONE_DMA] = end_pfn - start_pfn; 311 zones_size[ZONE_DMA] = end_pfn - start_pfn;
312 else { 312 else {
313 zones_size[ZONE_DMA] = dma_local_pfn; 313 zones_size[ZONE_DMA] = dma_local_pfn;
314 zones_size[ZONE_NORMAL] = (end_pfn - start_pfn) - dma_local_pfn; 314 zones_size[ZONE_NORMAL] = (end_pfn - start_pfn) - dma_local_pfn;
315 } 315 }
316 free_area_init_node(nid, NODE_DATA(nid), zones_size, start_pfn, NULL); 316 free_area_init_node(nid, NODE_DATA(nid), zones_size, start_pfn, NULL);
317 } 317 }
318 318
319 /* Initialize the kernel's ZERO_PGE. */ 319 /* Initialize the kernel's ZERO_PGE. */
320 memset((void *)ZERO_PGE, 0, PAGE_SIZE); 320 memset((void *)ZERO_PGE, 0, PAGE_SIZE);
321 } 321 }
322 322
323 void __init mem_init(void) 323 void __init mem_init(void)
324 { 324 {
325 unsigned long codesize, reservedpages, datasize, initsize, pfn; 325 unsigned long codesize, reservedpages, datasize, initsize, pfn;
326 extern int page_is_ram(unsigned long) __init; 326 extern int page_is_ram(unsigned long) __init;
327 extern char _text, _etext, _data, _edata; 327 extern char _text, _etext, _data, _edata;
328 extern char __init_begin, __init_end; 328 extern char __init_begin, __init_end;
329 unsigned long nid, i; 329 unsigned long nid, i;
330 high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT); 330 high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT);
331 331
332 reservedpages = 0; 332 reservedpages = 0;
333 for_each_online_node(nid) { 333 for_each_online_node(nid) {
334 /* 334 /*
335 * This will free up the bootmem, ie, slot 0 memory 335 * This will free up the bootmem, ie, slot 0 memory
336 */ 336 */
337 totalram_pages += free_all_bootmem_node(NODE_DATA(nid)); 337 totalram_pages += free_all_bootmem_node(NODE_DATA(nid));
338 338
339 pfn = NODE_DATA(nid)->node_start_pfn; 339 pfn = NODE_DATA(nid)->node_start_pfn;
340 for (i = 0; i < node_spanned_pages(nid); i++, pfn++) 340 for (i = 0; i < node_spanned_pages(nid); i++, pfn++)
341 if (page_is_ram(pfn) && 341 if (page_is_ram(pfn) &&
342 PageReserved(nid_page_nr(nid, i))) 342 PageReserved(nid_page_nr(nid, i)))
343 reservedpages++; 343 reservedpages++;
344 } 344 }
345 345
346 codesize = (unsigned long) &_etext - (unsigned long) &_text; 346 codesize = (unsigned long) &_etext - (unsigned long) &_text;
347 datasize = (unsigned long) &_edata - (unsigned long) &_data; 347 datasize = (unsigned long) &_edata - (unsigned long) &_data;
348 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; 348 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
349 349
350 printk("Memory: %luk/%luk available (%luk kernel code, %luk reserved, " 350 printk("Memory: %luk/%luk available (%luk kernel code, %luk reserved, "
351 "%luk data, %luk init)\n", 351 "%luk data, %luk init)\n",
352 (unsigned long)nr_free_pages() << (PAGE_SHIFT-10), 352 (unsigned long)nr_free_pages() << (PAGE_SHIFT-10),
353 num_physpages << (PAGE_SHIFT-10), 353 num_physpages << (PAGE_SHIFT-10),
354 codesize >> 10, 354 codesize >> 10,
355 reservedpages << (PAGE_SHIFT-10), 355 reservedpages << (PAGE_SHIFT-10),
356 datasize >> 10, 356 datasize >> 10,
357 initsize >> 10); 357 initsize >> 10);
358 #if 0 358 #if 0
359 mem_stress(); 359 mem_stress();
360 #endif 360 #endif
361 } 361 }
362 362
363 void 363 void
364 show_mem(void) 364 show_mem(void)
365 { 365 {
366 long i,free = 0,total = 0,reserved = 0; 366 long i,free = 0,total = 0,reserved = 0;
367 long shared = 0, cached = 0; 367 long shared = 0, cached = 0;
368 int nid; 368 int nid;
369 369
370 printk("\nMem-info:\n"); 370 printk("\nMem-info:\n");
371 show_free_areas(); 371 show_free_areas();
372 printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); 372 printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
373 for_each_online_node(nid) { 373 for_each_online_node(nid) {
374 unsigned long flags;
375 pgdat_resize_lock(NODE_DATA(nid), &flags);
374 i = node_spanned_pages(nid); 376 i = node_spanned_pages(nid);
375 while (i-- > 0) { 377 while (i-- > 0) {
376 struct page *page = nid_page_nr(nid, i); 378 struct page *page = nid_page_nr(nid, i);
377 total++; 379 total++;
378 if (PageReserved(page)) 380 if (PageReserved(page))
379 reserved++; 381 reserved++;
380 else if (PageSwapCache(page)) 382 else if (PageSwapCache(page))
381 cached++; 383 cached++;
382 else if (!page_count(page)) 384 else if (!page_count(page))
383 free++; 385 free++;
384 else 386 else
385 shared += page_count(page) - 1; 387 shared += page_count(page) - 1;
386 } 388 }
389 pgdat_resize_unlock(NODE_DATA(nid), &flags);
387 } 390 }
388 printk("%ld pages of RAM\n",total); 391 printk("%ld pages of RAM\n",total);
389 printk("%ld free pages\n",free); 392 printk("%ld free pages\n",free);
390 printk("%ld reserved pages\n",reserved); 393 printk("%ld reserved pages\n",reserved);
391 printk("%ld pages shared\n",shared); 394 printk("%ld pages shared\n",shared);
392 printk("%ld pages swap cached\n",cached); 395 printk("%ld pages swap cached\n",cached);
393 } 396 }
394 397
arch/i386/mm/pgtable.c
1 /* 1 /*
2 * linux/arch/i386/mm/pgtable.c 2 * linux/arch/i386/mm/pgtable.c
3 */ 3 */
4 4
5 #include <linux/config.h> 5 #include <linux/config.h>
6 #include <linux/sched.h> 6 #include <linux/sched.h>
7 #include <linux/kernel.h> 7 #include <linux/kernel.h>
8 #include <linux/errno.h> 8 #include <linux/errno.h>
9 #include <linux/mm.h> 9 #include <linux/mm.h>
10 #include <linux/swap.h> 10 #include <linux/swap.h>
11 #include <linux/smp.h> 11 #include <linux/smp.h>
12 #include <linux/highmem.h> 12 #include <linux/highmem.h>
13 #include <linux/slab.h> 13 #include <linux/slab.h>
14 #include <linux/pagemap.h> 14 #include <linux/pagemap.h>
15 #include <linux/spinlock.h> 15 #include <linux/spinlock.h>
16 16
17 #include <asm/system.h> 17 #include <asm/system.h>
18 #include <asm/pgtable.h> 18 #include <asm/pgtable.h>
19 #include <asm/pgalloc.h> 19 #include <asm/pgalloc.h>
20 #include <asm/fixmap.h> 20 #include <asm/fixmap.h>
21 #include <asm/e820.h> 21 #include <asm/e820.h>
22 #include <asm/tlb.h> 22 #include <asm/tlb.h>
23 #include <asm/tlbflush.h> 23 #include <asm/tlbflush.h>
24 24
25 void show_mem(void) 25 void show_mem(void)
26 { 26 {
27 int total = 0, reserved = 0; 27 int total = 0, reserved = 0;
28 int shared = 0, cached = 0; 28 int shared = 0, cached = 0;
29 int highmem = 0; 29 int highmem = 0;
30 struct page *page; 30 struct page *page;
31 pg_data_t *pgdat; 31 pg_data_t *pgdat;
32 unsigned long i; 32 unsigned long i;
33 struct page_state ps; 33 struct page_state ps;
34 unsigned long flags;
34 35
35 printk(KERN_INFO "Mem-info:\n"); 36 printk(KERN_INFO "Mem-info:\n");
36 show_free_areas(); 37 show_free_areas();
37 printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); 38 printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
38 for_each_pgdat(pgdat) { 39 for_each_pgdat(pgdat) {
40 pgdat_resize_lock(pgdat, &flags);
39 for (i = 0; i < pgdat->node_spanned_pages; ++i) { 41 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
40 page = pgdat_page_nr(pgdat, i); 42 page = pgdat_page_nr(pgdat, i);
41 total++; 43 total++;
42 if (PageHighMem(page)) 44 if (PageHighMem(page))
43 highmem++; 45 highmem++;
44 if (PageReserved(page)) 46 if (PageReserved(page))
45 reserved++; 47 reserved++;
46 else if (PageSwapCache(page)) 48 else if (PageSwapCache(page))
47 cached++; 49 cached++;
48 else if (page_count(page)) 50 else if (page_count(page))
49 shared += page_count(page) - 1; 51 shared += page_count(page) - 1;
50 } 52 }
53 pgdat_resize_unlock(pgdat, &flags);
51 } 54 }
52 printk(KERN_INFO "%d pages of RAM\n", total); 55 printk(KERN_INFO "%d pages of RAM\n", total);
53 printk(KERN_INFO "%d pages of HIGHMEM\n", highmem); 56 printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
54 printk(KERN_INFO "%d reserved pages\n", reserved); 57 printk(KERN_INFO "%d reserved pages\n", reserved);
55 printk(KERN_INFO "%d pages shared\n", shared); 58 printk(KERN_INFO "%d pages shared\n", shared);
56 printk(KERN_INFO "%d pages swap cached\n", cached); 59 printk(KERN_INFO "%d pages swap cached\n", cached);
57 60
58 get_page_state(&ps); 61 get_page_state(&ps);
59 printk(KERN_INFO "%lu pages dirty\n", ps.nr_dirty); 62 printk(KERN_INFO "%lu pages dirty\n", ps.nr_dirty);
60 printk(KERN_INFO "%lu pages writeback\n", ps.nr_writeback); 63 printk(KERN_INFO "%lu pages writeback\n", ps.nr_writeback);
61 printk(KERN_INFO "%lu pages mapped\n", ps.nr_mapped); 64 printk(KERN_INFO "%lu pages mapped\n", ps.nr_mapped);
62 printk(KERN_INFO "%lu pages slab\n", ps.nr_slab); 65 printk(KERN_INFO "%lu pages slab\n", ps.nr_slab);
63 printk(KERN_INFO "%lu pages pagetables\n", ps.nr_page_table_pages); 66 printk(KERN_INFO "%lu pages pagetables\n", ps.nr_page_table_pages);
64 } 67 }
65 68
66 /* 69 /*
67 * Associate a virtual page frame with a given physical page frame 70 * Associate a virtual page frame with a given physical page frame
68 * and protection flags for that frame. 71 * and protection flags for that frame.
69 */ 72 */
70 static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) 73 static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
71 { 74 {
72 pgd_t *pgd; 75 pgd_t *pgd;
73 pud_t *pud; 76 pud_t *pud;
74 pmd_t *pmd; 77 pmd_t *pmd;
75 pte_t *pte; 78 pte_t *pte;
76 79
77 pgd = swapper_pg_dir + pgd_index(vaddr); 80 pgd = swapper_pg_dir + pgd_index(vaddr);
78 if (pgd_none(*pgd)) { 81 if (pgd_none(*pgd)) {
79 BUG(); 82 BUG();
80 return; 83 return;
81 } 84 }
82 pud = pud_offset(pgd, vaddr); 85 pud = pud_offset(pgd, vaddr);
83 if (pud_none(*pud)) { 86 if (pud_none(*pud)) {
84 BUG(); 87 BUG();
85 return; 88 return;
86 } 89 }
87 pmd = pmd_offset(pud, vaddr); 90 pmd = pmd_offset(pud, vaddr);
88 if (pmd_none(*pmd)) { 91 if (pmd_none(*pmd)) {
89 BUG(); 92 BUG();
90 return; 93 return;
91 } 94 }
92 pte = pte_offset_kernel(pmd, vaddr); 95 pte = pte_offset_kernel(pmd, vaddr);
93 /* <pfn,flags> stored as-is, to permit clearing entries */ 96 /* <pfn,flags> stored as-is, to permit clearing entries */
94 set_pte(pte, pfn_pte(pfn, flags)); 97 set_pte(pte, pfn_pte(pfn, flags));
95 98
96 /* 99 /*
97 * It's enough to flush this one mapping. 100 * It's enough to flush this one mapping.
98 * (PGE mappings get flushed as well) 101 * (PGE mappings get flushed as well)
99 */ 102 */
100 __flush_tlb_one(vaddr); 103 __flush_tlb_one(vaddr);
101 } 104 }
102 105
103 /* 106 /*
104 * Associate a large virtual page frame with a given physical page frame 107 * Associate a large virtual page frame with a given physical page frame
105 * and protection flags for that frame. pfn is for the base of the page, 108 * and protection flags for that frame. pfn is for the base of the page,
106 * vaddr is what the page gets mapped to - both must be properly aligned. 109 * vaddr is what the page gets mapped to - both must be properly aligned.
107 * The pmd must already be instantiated. Assumes PAE mode. 110 * The pmd must already be instantiated. Assumes PAE mode.
108 */ 111 */
109 void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) 112 void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
110 { 113 {
111 pgd_t *pgd; 114 pgd_t *pgd;
112 pud_t *pud; 115 pud_t *pud;
113 pmd_t *pmd; 116 pmd_t *pmd;
114 117
115 if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */ 118 if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */
116 printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n"); 119 printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
117 return; /* BUG(); */ 120 return; /* BUG(); */
118 } 121 }
119 if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */ 122 if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */
120 printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n"); 123 printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
121 return; /* BUG(); */ 124 return; /* BUG(); */
122 } 125 }
123 pgd = swapper_pg_dir + pgd_index(vaddr); 126 pgd = swapper_pg_dir + pgd_index(vaddr);
124 if (pgd_none(*pgd)) { 127 if (pgd_none(*pgd)) {
125 printk(KERN_WARNING "set_pmd_pfn: pgd_none\n"); 128 printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
126 return; /* BUG(); */ 129 return; /* BUG(); */
127 } 130 }
128 pud = pud_offset(pgd, vaddr); 131 pud = pud_offset(pgd, vaddr);
129 pmd = pmd_offset(pud, vaddr); 132 pmd = pmd_offset(pud, vaddr);
130 set_pmd(pmd, pfn_pmd(pfn, flags)); 133 set_pmd(pmd, pfn_pmd(pfn, flags));
131 /* 134 /*
132 * It's enough to flush this one mapping. 135 * It's enough to flush this one mapping.
133 * (PGE mappings get flushed as well) 136 * (PGE mappings get flushed as well)
134 */ 137 */
135 __flush_tlb_one(vaddr); 138 __flush_tlb_one(vaddr);
136 } 139 }
137 140
138 void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) 141 void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
139 { 142 {
140 unsigned long address = __fix_to_virt(idx); 143 unsigned long address = __fix_to_virt(idx);
141 144
142 if (idx >= __end_of_fixed_addresses) { 145 if (idx >= __end_of_fixed_addresses) {
143 BUG(); 146 BUG();
144 return; 147 return;
145 } 148 }
146 set_pte_pfn(address, phys >> PAGE_SHIFT, flags); 149 set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
147 } 150 }
148 151
149 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) 152 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
150 { 153 {
151 return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); 154 return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
152 } 155 }
153 156
154 struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) 157 struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
155 { 158 {
156 struct page *pte; 159 struct page *pte;
157 160
158 #ifdef CONFIG_HIGHPTE 161 #ifdef CONFIG_HIGHPTE
159 pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); 162 pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
160 #else 163 #else
161 pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); 164 pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
162 #endif 165 #endif
163 return pte; 166 return pte;
164 } 167 }
165 168
166 void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags) 169 void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
167 { 170 {
168 memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); 171 memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
169 } 172 }
170 173
171 /* 174 /*
172 * List of all pgd's needed for non-PAE so it can invalidate entries 175 * List of all pgd's needed for non-PAE so it can invalidate entries
173 * in both cached and uncached pgd's; not needed for PAE since the 176 * in both cached and uncached pgd's; not needed for PAE since the
174 * kernel pmd is shared. If PAE were not to share the pmd a similar 177 * kernel pmd is shared. If PAE were not to share the pmd a similar
175 * tactic would be needed. This is essentially codepath-based locking 178 * tactic would be needed. This is essentially codepath-based locking
176 * against pageattr.c; it is the unique case in which a valid change 179 * against pageattr.c; it is the unique case in which a valid change
177 * of kernel pagetables can't be lazily synchronized by vmalloc faults. 180 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
178 * vmalloc faults work because attached pagetables are never freed. 181 * vmalloc faults work because attached pagetables are never freed.
179 * The locking scheme was chosen on the basis of manfred's 182 * The locking scheme was chosen on the basis of manfred's
180 * recommendations and having no core impact whatsoever. 183 * recommendations and having no core impact whatsoever.
181 * -- wli 184 * -- wli
182 */ 185 */
183 DEFINE_SPINLOCK(pgd_lock); 186 DEFINE_SPINLOCK(pgd_lock);
184 struct page *pgd_list; 187 struct page *pgd_list;
185 188
186 static inline void pgd_list_add(pgd_t *pgd) 189 static inline void pgd_list_add(pgd_t *pgd)
187 { 190 {
188 struct page *page = virt_to_page(pgd); 191 struct page *page = virt_to_page(pgd);
189 page->index = (unsigned long)pgd_list; 192 page->index = (unsigned long)pgd_list;
190 if (pgd_list) 193 if (pgd_list)
191 set_page_private(pgd_list, (unsigned long)&page->index); 194 set_page_private(pgd_list, (unsigned long)&page->index);
192 pgd_list = page; 195 pgd_list = page;
193 set_page_private(page, (unsigned long)&pgd_list); 196 set_page_private(page, (unsigned long)&pgd_list);
194 } 197 }
195 198
196 static inline void pgd_list_del(pgd_t *pgd) 199 static inline void pgd_list_del(pgd_t *pgd)
197 { 200 {
198 struct page *next, **pprev, *page = virt_to_page(pgd); 201 struct page *next, **pprev, *page = virt_to_page(pgd);
199 next = (struct page *)page->index; 202 next = (struct page *)page->index;
200 pprev = (struct page **)page_private(page); 203 pprev = (struct page **)page_private(page);
201 *pprev = next; 204 *pprev = next;
202 if (next) 205 if (next)
203 set_page_private(next, (unsigned long)pprev); 206 set_page_private(next, (unsigned long)pprev);
204 } 207 }
205 208
206 void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) 209 void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
207 { 210 {
208 unsigned long flags; 211 unsigned long flags;
209 212
210 if (PTRS_PER_PMD == 1) { 213 if (PTRS_PER_PMD == 1) {
211 memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); 214 memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
212 spin_lock_irqsave(&pgd_lock, flags); 215 spin_lock_irqsave(&pgd_lock, flags);
213 } 216 }
214 217
215 clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, 218 clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
216 swapper_pg_dir + USER_PTRS_PER_PGD, 219 swapper_pg_dir + USER_PTRS_PER_PGD,
217 KERNEL_PGD_PTRS); 220 KERNEL_PGD_PTRS);
218 if (PTRS_PER_PMD > 1) 221 if (PTRS_PER_PMD > 1)
219 return; 222 return;
220 223
221 pgd_list_add(pgd); 224 pgd_list_add(pgd);
222 spin_unlock_irqrestore(&pgd_lock, flags); 225 spin_unlock_irqrestore(&pgd_lock, flags);
223 } 226 }
224 227
225 /* never called when PTRS_PER_PMD > 1 */ 228 /* never called when PTRS_PER_PMD > 1 */
226 void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused) 229 void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
227 { 230 {
228 unsigned long flags; /* can be called from interrupt context */ 231 unsigned long flags; /* can be called from interrupt context */
229 232
230 spin_lock_irqsave(&pgd_lock, flags); 233 spin_lock_irqsave(&pgd_lock, flags);
231 pgd_list_del(pgd); 234 pgd_list_del(pgd);
232 spin_unlock_irqrestore(&pgd_lock, flags); 235 spin_unlock_irqrestore(&pgd_lock, flags);
233 } 236 }
234 237
235 pgd_t *pgd_alloc(struct mm_struct *mm) 238 pgd_t *pgd_alloc(struct mm_struct *mm)
236 { 239 {
237 int i; 240 int i;
238 pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL); 241 pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
239 242
240 if (PTRS_PER_PMD == 1 || !pgd) 243 if (PTRS_PER_PMD == 1 || !pgd)
241 return pgd; 244 return pgd;
242 245
243 for (i = 0; i < USER_PTRS_PER_PGD; ++i) { 246 for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
244 pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); 247 pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
245 if (!pmd) 248 if (!pmd)
246 goto out_oom; 249 goto out_oom;
247 set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); 250 set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
248 } 251 }
249 return pgd; 252 return pgd;
250 253
251 out_oom: 254 out_oom:
252 for (i--; i >= 0; i--) 255 for (i--; i >= 0; i--)
253 kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); 256 kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
254 kmem_cache_free(pgd_cache, pgd); 257 kmem_cache_free(pgd_cache, pgd);
255 return NULL; 258 return NULL;
256 } 259 }
257 260
258 void pgd_free(pgd_t *pgd) 261 void pgd_free(pgd_t *pgd)
259 { 262 {
260 int i; 263 int i;
261 264
262 /* in the PAE case user pgd entries are overwritten before usage */ 265 /* in the PAE case user pgd entries are overwritten before usage */
263 if (PTRS_PER_PMD > 1) 266 if (PTRS_PER_PMD > 1)
264 for (i = 0; i < USER_PTRS_PER_PGD; ++i) 267 for (i = 0; i < USER_PTRS_PER_PGD; ++i)
265 kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); 268 kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
266 /* in the non-PAE case, free_pgtables() clears user pgd entries */ 269 /* in the non-PAE case, free_pgtables() clears user pgd entries */
267 kmem_cache_free(pgd_cache, pgd); 270 kmem_cache_free(pgd_cache, pgd);
268 } 271 }
269 272
arch/ia64/mm/discontig.c
1 /* 1 /*
2 * Copyright (c) 2000, 2003 Silicon Graphics, Inc. All rights reserved. 2 * Copyright (c) 2000, 2003 Silicon Graphics, Inc. All rights reserved.
3 * Copyright (c) 2001 Intel Corp. 3 * Copyright (c) 2001 Intel Corp.
4 * Copyright (c) 2001 Tony Luck <tony.luck@intel.com> 4 * Copyright (c) 2001 Tony Luck <tony.luck@intel.com>
5 * Copyright (c) 2002 NEC Corp. 5 * Copyright (c) 2002 NEC Corp.
6 * Copyright (c) 2002 Kimio Suganuma <k-suganuma@da.jp.nec.com> 6 * Copyright (c) 2002 Kimio Suganuma <k-suganuma@da.jp.nec.com>
7 * Copyright (c) 2004 Silicon Graphics, Inc 7 * Copyright (c) 2004 Silicon Graphics, Inc
8 * Russ Anderson <rja@sgi.com> 8 * Russ Anderson <rja@sgi.com>
9 * Jesse Barnes <jbarnes@sgi.com> 9 * Jesse Barnes <jbarnes@sgi.com>
10 * Jack Steiner <steiner@sgi.com> 10 * Jack Steiner <steiner@sgi.com>
11 */ 11 */
12 12
13 /* 13 /*
14 * Platform initialization for Discontig Memory 14 * Platform initialization for Discontig Memory
15 */ 15 */
16 16
17 #include <linux/kernel.h> 17 #include <linux/kernel.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/bootmem.h> 20 #include <linux/bootmem.h>
21 #include <linux/acpi.h> 21 #include <linux/acpi.h>
22 #include <linux/efi.h> 22 #include <linux/efi.h>
23 #include <linux/nodemask.h> 23 #include <linux/nodemask.h>
24 #include <asm/pgalloc.h> 24 #include <asm/pgalloc.h>
25 #include <asm/tlb.h> 25 #include <asm/tlb.h>
26 #include <asm/meminit.h> 26 #include <asm/meminit.h>
27 #include <asm/numa.h> 27 #include <asm/numa.h>
28 #include <asm/sections.h> 28 #include <asm/sections.h>
29 29
30 /* 30 /*
31 * Track per-node information needed to setup the boot memory allocator, the 31 * Track per-node information needed to setup the boot memory allocator, the
32 * per-node areas, and the real VM. 32 * per-node areas, and the real VM.
33 */ 33 */
34 struct early_node_data { 34 struct early_node_data {
35 struct ia64_node_data *node_data; 35 struct ia64_node_data *node_data;
36 pg_data_t *pgdat; 36 pg_data_t *pgdat;
37 unsigned long pernode_addr; 37 unsigned long pernode_addr;
38 unsigned long pernode_size; 38 unsigned long pernode_size;
39 struct bootmem_data bootmem_data; 39 struct bootmem_data bootmem_data;
40 unsigned long num_physpages; 40 unsigned long num_physpages;
41 unsigned long num_dma_physpages; 41 unsigned long num_dma_physpages;
42 unsigned long min_pfn; 42 unsigned long min_pfn;
43 unsigned long max_pfn; 43 unsigned long max_pfn;
44 }; 44 };
45 45
46 static struct early_node_data mem_data[MAX_NUMNODES] __initdata; 46 static struct early_node_data mem_data[MAX_NUMNODES] __initdata;
47 static nodemask_t memory_less_mask __initdata; 47 static nodemask_t memory_less_mask __initdata;
48 48
49 /* 49 /*
50 * To prevent cache aliasing effects, align per-node structures so that they 50 * To prevent cache aliasing effects, align per-node structures so that they
51 * start at addresses that are strided by node number. 51 * start at addresses that are strided by node number.
52 */ 52 */
53 #define NODEDATA_ALIGN(addr, node) \ 53 #define NODEDATA_ALIGN(addr, node) \
54 ((((addr) + 1024*1024-1) & ~(1024*1024-1)) + (node)*PERCPU_PAGE_SIZE) 54 ((((addr) + 1024*1024-1) & ~(1024*1024-1)) + (node)*PERCPU_PAGE_SIZE)
55 55
56 /** 56 /**
57 * build_node_maps - callback to setup bootmem structs for each node 57 * build_node_maps - callback to setup bootmem structs for each node
58 * @start: physical start of range 58 * @start: physical start of range
59 * @len: length of range 59 * @len: length of range
60 * @node: node where this range resides 60 * @node: node where this range resides
61 * 61 *
62 * We allocate a struct bootmem_data for each piece of memory that we wish to 62 * We allocate a struct bootmem_data for each piece of memory that we wish to
63 * treat as a virtually contiguous block (i.e. each node). Each such block 63 * treat as a virtually contiguous block (i.e. each node). Each such block
64 * must start on an %IA64_GRANULE_SIZE boundary, so we round the address down 64 * must start on an %IA64_GRANULE_SIZE boundary, so we round the address down
65 * if necessary. Any non-existent pages will simply be part of the virtual 65 * if necessary. Any non-existent pages will simply be part of the virtual
66 * memmap. We also update min_low_pfn and max_low_pfn here as we receive 66 * memmap. We also update min_low_pfn and max_low_pfn here as we receive
67 * memory ranges from the caller. 67 * memory ranges from the caller.
68 */ 68 */
69 static int __init build_node_maps(unsigned long start, unsigned long len, 69 static int __init build_node_maps(unsigned long start, unsigned long len,
70 int node) 70 int node)
71 { 71 {
72 unsigned long cstart, epfn, end = start + len; 72 unsigned long cstart, epfn, end = start + len;
73 struct bootmem_data *bdp = &mem_data[node].bootmem_data; 73 struct bootmem_data *bdp = &mem_data[node].bootmem_data;
74 74
75 epfn = GRANULEROUNDUP(end) >> PAGE_SHIFT; 75 epfn = GRANULEROUNDUP(end) >> PAGE_SHIFT;
76 cstart = GRANULEROUNDDOWN(start); 76 cstart = GRANULEROUNDDOWN(start);
77 77
78 if (!bdp->node_low_pfn) { 78 if (!bdp->node_low_pfn) {
79 bdp->node_boot_start = cstart; 79 bdp->node_boot_start = cstart;
80 bdp->node_low_pfn = epfn; 80 bdp->node_low_pfn = epfn;
81 } else { 81 } else {
82 bdp->node_boot_start = min(cstart, bdp->node_boot_start); 82 bdp->node_boot_start = min(cstart, bdp->node_boot_start);
83 bdp->node_low_pfn = max(epfn, bdp->node_low_pfn); 83 bdp->node_low_pfn = max(epfn, bdp->node_low_pfn);
84 } 84 }
85 85
86 min_low_pfn = min(min_low_pfn, bdp->node_boot_start>>PAGE_SHIFT); 86 min_low_pfn = min(min_low_pfn, bdp->node_boot_start>>PAGE_SHIFT);
87 max_low_pfn = max(max_low_pfn, bdp->node_low_pfn); 87 max_low_pfn = max(max_low_pfn, bdp->node_low_pfn);
88 88
89 return 0; 89 return 0;
90 } 90 }
91 91
92 /** 92 /**
93 * early_nr_cpus_node - return number of cpus on a given node 93 * early_nr_cpus_node - return number of cpus on a given node
94 * @node: node to check 94 * @node: node to check
95 * 95 *
96 * Count the number of cpus on @node. We can't use nr_cpus_node() yet because 96 * Count the number of cpus on @node. We can't use nr_cpus_node() yet because
97 * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been 97 * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been
98 * called yet. Note that node 0 will also count all non-existent cpus. 98 * called yet. Note that node 0 will also count all non-existent cpus.
99 */ 99 */
100 static int __init early_nr_cpus_node(int node) 100 static int __init early_nr_cpus_node(int node)
101 { 101 {
102 int cpu, n = 0; 102 int cpu, n = 0;
103 103
104 for (cpu = 0; cpu < NR_CPUS; cpu++) 104 for (cpu = 0; cpu < NR_CPUS; cpu++)
105 if (node == node_cpuid[cpu].nid) 105 if (node == node_cpuid[cpu].nid)
106 n++; 106 n++;
107 107
108 return n; 108 return n;
109 } 109 }
110 110
111 /** 111 /**
112 * compute_pernodesize - compute size of pernode data 112 * compute_pernodesize - compute size of pernode data
113 * @node: the node id. 113 * @node: the node id.
114 */ 114 */
115 static unsigned long __init compute_pernodesize(int node) 115 static unsigned long __init compute_pernodesize(int node)
116 { 116 {
117 unsigned long pernodesize = 0, cpus; 117 unsigned long pernodesize = 0, cpus;
118 118
119 cpus = early_nr_cpus_node(node); 119 cpus = early_nr_cpus_node(node);
120 pernodesize += PERCPU_PAGE_SIZE * cpus; 120 pernodesize += PERCPU_PAGE_SIZE * cpus;
121 pernodesize += node * L1_CACHE_BYTES; 121 pernodesize += node * L1_CACHE_BYTES;
122 pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t)); 122 pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t));
123 pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data)); 123 pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
124 pernodesize = PAGE_ALIGN(pernodesize); 124 pernodesize = PAGE_ALIGN(pernodesize);
125 return pernodesize; 125 return pernodesize;
126 } 126 }
127 127
128 /** 128 /**
129 * per_cpu_node_setup - setup per-cpu areas on each node 129 * per_cpu_node_setup - setup per-cpu areas on each node
130 * @cpu_data: per-cpu area on this node 130 * @cpu_data: per-cpu area on this node
131 * @node: node to setup 131 * @node: node to setup
132 * 132 *
133 * Copy the static per-cpu data into the region we just set aside and then 133 * Copy the static per-cpu data into the region we just set aside and then
134 * setup __per_cpu_offset for each CPU on this node. Return a pointer to 134 * setup __per_cpu_offset for each CPU on this node. Return a pointer to
135 * the end of the area. 135 * the end of the area.
136 */ 136 */
137 static void *per_cpu_node_setup(void *cpu_data, int node) 137 static void *per_cpu_node_setup(void *cpu_data, int node)
138 { 138 {
139 #ifdef CONFIG_SMP 139 #ifdef CONFIG_SMP
140 int cpu; 140 int cpu;
141 141
142 for (cpu = 0; cpu < NR_CPUS; cpu++) { 142 for (cpu = 0; cpu < NR_CPUS; cpu++) {
143 if (node == node_cpuid[cpu].nid) { 143 if (node == node_cpuid[cpu].nid) {
144 memcpy(__va(cpu_data), __phys_per_cpu_start, 144 memcpy(__va(cpu_data), __phys_per_cpu_start,
145 __per_cpu_end - __per_cpu_start); 145 __per_cpu_end - __per_cpu_start);
146 __per_cpu_offset[cpu] = (char*)__va(cpu_data) - 146 __per_cpu_offset[cpu] = (char*)__va(cpu_data) -
147 __per_cpu_start; 147 __per_cpu_start;
148 cpu_data += PERCPU_PAGE_SIZE; 148 cpu_data += PERCPU_PAGE_SIZE;
149 } 149 }
150 } 150 }
151 #endif 151 #endif
152 return cpu_data; 152 return cpu_data;
153 } 153 }
154 154
155 /** 155 /**
156 * fill_pernode - initialize pernode data. 156 * fill_pernode - initialize pernode data.
157 * @node: the node id. 157 * @node: the node id.
158 * @pernode: physical address of pernode data 158 * @pernode: physical address of pernode data
159 * @pernodesize: size of the pernode data 159 * @pernodesize: size of the pernode data
160 */ 160 */
161 static void __init fill_pernode(int node, unsigned long pernode, 161 static void __init fill_pernode(int node, unsigned long pernode,
162 unsigned long pernodesize) 162 unsigned long pernodesize)
163 { 163 {
164 void *cpu_data; 164 void *cpu_data;
165 int cpus = early_nr_cpus_node(node); 165 int cpus = early_nr_cpus_node(node);
166 struct bootmem_data *bdp = &mem_data[node].bootmem_data; 166 struct bootmem_data *bdp = &mem_data[node].bootmem_data;
167 167
168 mem_data[node].pernode_addr = pernode; 168 mem_data[node].pernode_addr = pernode;
169 mem_data[node].pernode_size = pernodesize; 169 mem_data[node].pernode_size = pernodesize;
170 memset(__va(pernode), 0, pernodesize); 170 memset(__va(pernode), 0, pernodesize);
171 171
172 cpu_data = (void *)pernode; 172 cpu_data = (void *)pernode;
173 pernode += PERCPU_PAGE_SIZE * cpus; 173 pernode += PERCPU_PAGE_SIZE * cpus;
174 pernode += node * L1_CACHE_BYTES; 174 pernode += node * L1_CACHE_BYTES;
175 175
176 mem_data[node].pgdat = __va(pernode); 176 mem_data[node].pgdat = __va(pernode);
177 pernode += L1_CACHE_ALIGN(sizeof(pg_data_t)); 177 pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
178 178
179 mem_data[node].node_data = __va(pernode); 179 mem_data[node].node_data = __va(pernode);
180 pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data)); 180 pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
181 181
182 mem_data[node].pgdat->bdata = bdp; 182 mem_data[node].pgdat->bdata = bdp;
183 pernode += L1_CACHE_ALIGN(sizeof(pg_data_t)); 183 pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
184 184
185 cpu_data = per_cpu_node_setup(cpu_data, node); 185 cpu_data = per_cpu_node_setup(cpu_data, node);
186 186
187 return; 187 return;
188 } 188 }
189 189
190 /** 190 /**
191 * find_pernode_space - allocate memory for memory map and per-node structures 191 * find_pernode_space - allocate memory for memory map and per-node structures
192 * @start: physical start of range 192 * @start: physical start of range
193 * @len: length of range 193 * @len: length of range
194 * @node: node where this range resides 194 * @node: node where this range resides
195 * 195 *
196 * This routine reserves space for the per-cpu data struct, the list of 196 * This routine reserves space for the per-cpu data struct, the list of
197 * pg_data_ts and the per-node data struct. Each node will have something like 197 * pg_data_ts and the per-node data struct. Each node will have something like
198 * the following in the first chunk of addr. space large enough to hold it. 198 * the following in the first chunk of addr. space large enough to hold it.
199 * 199 *
200 * ________________________ 200 * ________________________
201 * | | 201 * | |
202 * |~~~~~~~~~~~~~~~~~~~~~~~~| <-- NODEDATA_ALIGN(start, node) for the first 202 * |~~~~~~~~~~~~~~~~~~~~~~~~| <-- NODEDATA_ALIGN(start, node) for the first
203 * | PERCPU_PAGE_SIZE * | start and length big enough 203 * | PERCPU_PAGE_SIZE * | start and length big enough
204 * | cpus_on_this_node | Node 0 will also have entries for all non-existent cpus. 204 * | cpus_on_this_node | Node 0 will also have entries for all non-existent cpus.
205 * |------------------------| 205 * |------------------------|
206 * | local pg_data_t * | 206 * | local pg_data_t * |
207 * |------------------------| 207 * |------------------------|
208 * | local ia64_node_data | 208 * | local ia64_node_data |
209 * |------------------------| 209 * |------------------------|
210 * | ??? | 210 * | ??? |
211 * |________________________| 211 * |________________________|
212 * 212 *
213 * Once this space has been set aside, the bootmem maps are initialized. We 213 * Once this space has been set aside, the bootmem maps are initialized. We
214 * could probably move the allocation of the per-cpu and ia64_node_data space 214 * could probably move the allocation of the per-cpu and ia64_node_data space
215 * outside of this function and use alloc_bootmem_node(), but doing it here 215 * outside of this function and use alloc_bootmem_node(), but doing it here
216 * is straightforward and we get the alignments we want so... 216 * is straightforward and we get the alignments we want so...
217 */ 217 */
218 static int __init find_pernode_space(unsigned long start, unsigned long len, 218 static int __init find_pernode_space(unsigned long start, unsigned long len,
219 int node) 219 int node)
220 { 220 {
221 unsigned long epfn; 221 unsigned long epfn;
222 unsigned long pernodesize = 0, pernode, pages, mapsize; 222 unsigned long pernodesize = 0, pernode, pages, mapsize;
223 struct bootmem_data *bdp = &mem_data[node].bootmem_data; 223 struct bootmem_data *bdp = &mem_data[node].bootmem_data;
224 224
225 epfn = (start + len) >> PAGE_SHIFT; 225 epfn = (start + len) >> PAGE_SHIFT;
226 226
227 pages = bdp->node_low_pfn - (bdp->node_boot_start >> PAGE_SHIFT); 227 pages = bdp->node_low_pfn - (bdp->node_boot_start >> PAGE_SHIFT);
228 mapsize = bootmem_bootmap_pages(pages) << PAGE_SHIFT; 228 mapsize = bootmem_bootmap_pages(pages) << PAGE_SHIFT;
229 229
230 /* 230 /*
231 * Make sure this memory falls within this node's usable memory 231 * Make sure this memory falls within this node's usable memory
232 * since we may have thrown some away in build_maps(). 232 * since we may have thrown some away in build_maps().
233 */ 233 */
234 if (start < bdp->node_boot_start || epfn > bdp->node_low_pfn) 234 if (start < bdp->node_boot_start || epfn > bdp->node_low_pfn)
235 return 0; 235 return 0;
236 236
237 /* Don't setup this node's local space twice... */ 237 /* Don't setup this node's local space twice... */
238 if (mem_data[node].pernode_addr) 238 if (mem_data[node].pernode_addr)
239 return 0; 239 return 0;
240 240
241 /* 241 /*
242 * Calculate total size needed, incl. what's necessary 242 * Calculate total size needed, incl. what's necessary
243 * for good alignment and alias prevention. 243 * for good alignment and alias prevention.
244 */ 244 */
245 pernodesize = compute_pernodesize(node); 245 pernodesize = compute_pernodesize(node);
246 pernode = NODEDATA_ALIGN(start, node); 246 pernode = NODEDATA_ALIGN(start, node);
247 247
248 /* Is this range big enough for what we want to store here? */ 248 /* Is this range big enough for what we want to store here? */
249 if (start + len > (pernode + pernodesize + mapsize)) 249 if (start + len > (pernode + pernodesize + mapsize))
250 fill_pernode(node, pernode, pernodesize); 250 fill_pernode(node, pernode, pernodesize);
251 251
252 return 0; 252 return 0;
253 } 253 }
254 254
255 /** 255 /**
256 * free_node_bootmem - free bootmem allocator memory for use 256 * free_node_bootmem - free bootmem allocator memory for use
257 * @start: physical start of range 257 * @start: physical start of range
258 * @len: length of range 258 * @len: length of range
259 * @node: node where this range resides 259 * @node: node where this range resides
260 * 260 *
261 * Simply calls the bootmem allocator to free the specified range from 261 * Simply calls the bootmem allocator to free the specified range from
262 * the given pg_data_t's bdata struct. After this function has been called 262 * the given pg_data_t's bdata struct. After this function has been called
263 * for all the entries in the EFI memory map, the bootmem allocator will 263 * for all the entries in the EFI memory map, the bootmem allocator will
264 * be ready to service allocation requests. 264 * be ready to service allocation requests.
265 */ 265 */
266 static int __init free_node_bootmem(unsigned long start, unsigned long len, 266 static int __init free_node_bootmem(unsigned long start, unsigned long len,
267 int node) 267 int node)
268 { 268 {
269 free_bootmem_node(mem_data[node].pgdat, start, len); 269 free_bootmem_node(mem_data[node].pgdat, start, len);
270 270
271 return 0; 271 return 0;
272 } 272 }
273 273
274 /** 274 /**
275 * reserve_pernode_space - reserve memory for per-node space 275 * reserve_pernode_space - reserve memory for per-node space
276 * 276 *
277 * Reserve the space used by the bootmem maps & per-node space in the boot 277 * Reserve the space used by the bootmem maps & per-node space in the boot
278 * allocator so that when we actually create the real mem maps we don't 278 * allocator so that when we actually create the real mem maps we don't
279 * use their memory. 279 * use their memory.
280 */ 280 */
281 static void __init reserve_pernode_space(void) 281 static void __init reserve_pernode_space(void)
282 { 282 {
283 unsigned long base, size, pages; 283 unsigned long base, size, pages;
284 struct bootmem_data *bdp; 284 struct bootmem_data *bdp;
285 int node; 285 int node;
286 286
287 for_each_online_node(node) { 287 for_each_online_node(node) {
288 pg_data_t *pdp = mem_data[node].pgdat; 288 pg_data_t *pdp = mem_data[node].pgdat;
289 289
290 if (node_isset(node, memory_less_mask)) 290 if (node_isset(node, memory_less_mask))
291 continue; 291 continue;
292 292
293 bdp = pdp->bdata; 293 bdp = pdp->bdata;
294 294
295 /* First the bootmem_map itself */ 295 /* First the bootmem_map itself */
296 pages = bdp->node_low_pfn - (bdp->node_boot_start>>PAGE_SHIFT); 296 pages = bdp->node_low_pfn - (bdp->node_boot_start>>PAGE_SHIFT);
297 size = bootmem_bootmap_pages(pages) << PAGE_SHIFT; 297 size = bootmem_bootmap_pages(pages) << PAGE_SHIFT;
298 base = __pa(bdp->node_bootmem_map); 298 base = __pa(bdp->node_bootmem_map);
299 reserve_bootmem_node(pdp, base, size); 299 reserve_bootmem_node(pdp, base, size);
300 300
301 /* Now the per-node space */ 301 /* Now the per-node space */
302 size = mem_data[node].pernode_size; 302 size = mem_data[node].pernode_size;
303 base = __pa(mem_data[node].pernode_addr); 303 base = __pa(mem_data[node].pernode_addr);
304 reserve_bootmem_node(pdp, base, size); 304 reserve_bootmem_node(pdp, base, size);
305 } 305 }
306 } 306 }
307 307
308 /** 308 /**
309 * initialize_pernode_data - fixup per-cpu & per-node pointers 309 * initialize_pernode_data - fixup per-cpu & per-node pointers
310 * 310 *
311 * Each node's per-node area has a copy of the global pg_data_t list, so 311 * Each node's per-node area has a copy of the global pg_data_t list, so
312 * we copy that to each node here, as well as setting the per-cpu pointer 312 * we copy that to each node here, as well as setting the per-cpu pointer
313 * to the local node data structure. The active_cpus field of the per-node 313 * to the local node data structure. The active_cpus field of the per-node
314 * structure gets set up by the platform_cpu_init() function later. 314 * structure gets set up by the platform_cpu_init() function later.
315 */ 315 */
316 static void __init initialize_pernode_data(void) 316 static void __init initialize_pernode_data(void)
317 { 317 {
318 pg_data_t *pgdat_list[MAX_NUMNODES]; 318 pg_data_t *pgdat_list[MAX_NUMNODES];
319 int cpu, node; 319 int cpu, node;
320 320
321 for_each_online_node(node) 321 for_each_online_node(node)
322 pgdat_list[node] = mem_data[node].pgdat; 322 pgdat_list[node] = mem_data[node].pgdat;
323 323
324 /* Copy the pg_data_t list to each node and init the node field */ 324 /* Copy the pg_data_t list to each node and init the node field */
325 for_each_online_node(node) { 325 for_each_online_node(node) {
326 memcpy(mem_data[node].node_data->pg_data_ptrs, pgdat_list, 326 memcpy(mem_data[node].node_data->pg_data_ptrs, pgdat_list,
327 sizeof(pgdat_list)); 327 sizeof(pgdat_list));
328 } 328 }
329 #ifdef CONFIG_SMP 329 #ifdef CONFIG_SMP
330 /* Set the node_data pointer for each per-cpu struct */ 330 /* Set the node_data pointer for each per-cpu struct */
331 for (cpu = 0; cpu < NR_CPUS; cpu++) { 331 for (cpu = 0; cpu < NR_CPUS; cpu++) {
332 node = node_cpuid[cpu].nid; 332 node = node_cpuid[cpu].nid;
333 per_cpu(cpu_info, cpu).node_data = mem_data[node].node_data; 333 per_cpu(cpu_info, cpu).node_data = mem_data[node].node_data;
334 } 334 }
335 #else 335 #else
336 { 336 {
337 struct cpuinfo_ia64 *cpu0_cpu_info; 337 struct cpuinfo_ia64 *cpu0_cpu_info;
338 cpu = 0; 338 cpu = 0;
339 node = node_cpuid[cpu].nid; 339 node = node_cpuid[cpu].nid;
340 cpu0_cpu_info = (struct cpuinfo_ia64 *)(__phys_per_cpu_start + 340 cpu0_cpu_info = (struct cpuinfo_ia64 *)(__phys_per_cpu_start +
341 ((char *)&per_cpu__cpu_info - __per_cpu_start)); 341 ((char *)&per_cpu__cpu_info - __per_cpu_start));
342 cpu0_cpu_info->node_data = mem_data[node].node_data; 342 cpu0_cpu_info->node_data = mem_data[node].node_data;
343 } 343 }
344 #endif /* CONFIG_SMP */ 344 #endif /* CONFIG_SMP */
345 } 345 }
346 346
347 /** 347 /**
348 * memory_less_node_alloc - attempt to allocate memory on the best NUMA SLIT 348 * memory_less_node_alloc - attempt to allocate memory on the best NUMA SLIT
349 * node, but fall back to any other node when __alloc_bootmem_node() fails 349 * node, but fall back to any other node when __alloc_bootmem_node() fails
350 * for the best node. 350 * for the best node.
351 * @nid: node id 351 * @nid: node id
352 * @pernodesize: size of this node's pernode data 352 * @pernodesize: size of this node's pernode data
353 * @align: alignment to use for this node's pernode data 353 * @align: alignment to use for this node's pernode data
354 */ 354 */
355 static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize, 355 static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize,
356 unsigned long align) 356 unsigned long align)
357 { 357 {
358 void *ptr = NULL; 358 void *ptr = NULL;
359 u8 best = 0xff; 359 u8 best = 0xff;
360 int bestnode = -1, node; 360 int bestnode = -1, node;
361 361
362 for_each_online_node(node) { 362 for_each_online_node(node) {
363 if (node_isset(node, memory_less_mask)) 363 if (node_isset(node, memory_less_mask))
364 continue; 364 continue;
365 else if (node_distance(nid, node) < best) { 365 else if (node_distance(nid, node) < best) {
366 best = node_distance(nid, node); 366 best = node_distance(nid, node);
367 bestnode = node; 367 bestnode = node;
368 } 368 }
369 } 369 }
370 370
371 ptr = __alloc_bootmem_node(mem_data[bestnode].pgdat, 371 ptr = __alloc_bootmem_node(mem_data[bestnode].pgdat,
372 pernodesize, align, __pa(MAX_DMA_ADDRESS)); 372 pernodesize, align, __pa(MAX_DMA_ADDRESS));
373 373
374 if (!ptr) 374 if (!ptr)
375 panic("NO memory for memory less node\n"); 375 panic("NO memory for memory less node\n");
376 return ptr; 376 return ptr;
377 } 377 }
378 378
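memory_less_node_alloc() above boils down to one decision: among the online nodes that actually have memory, pick the one with the smallest SLIT distance to @nid and place the pernode area there. A minimal user-space sketch of that selection, with a made-up distance matrix and a has_memory[] array standing in for node_online_map/memory_less_mask (nothing below is kernel API):

    #include <stdio.h>

    #define NR_NODES 4

    /* Hypothetical SLIT-style distances; dist[i][i] is always the smallest. */
    static const unsigned char dist[NR_NODES][NR_NODES] = {
    	{ 10, 20, 40, 40 },
    	{ 20, 10, 40, 40 },
    	{ 40, 40, 10, 20 },
    	{ 40, 40, 20, 10 },
    };
    static const int has_memory[NR_NODES] = { 1, 1, 0, 1 };

    /* Mirror of the loop in memory_less_node_alloc(): nearest node with memory. */
    static int best_node_for(int nid)
    {
    	unsigned char best = 0xff;
    	int bestnode = -1, node;

    	for (node = 0; node < NR_NODES; node++) {
    		if (!has_memory[node])
    			continue;
    		if (dist[nid][node] < best) {
    			best = dist[nid][node];
    			bestnode = node;
    		}
    	}
    	return bestnode;
    }

    int main(void)
    {
    	printf("memoryless node 2 allocates from node %d\n", best_node_for(2));
    	return 0;
    }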
379 /** 379 /**
380 * pgdat_insert - insert the pgdat into global pgdat_list 380 * pgdat_insert - insert the pgdat into global pgdat_list
381 * @pgdat: the pgdat for a node. 381 * @pgdat: the pgdat for a node.
382 */ 382 */
383 static void __init pgdat_insert(pg_data_t *pgdat) 383 static void __init pgdat_insert(pg_data_t *pgdat)
384 { 384 {
385 pg_data_t *prev = NULL, *next; 385 pg_data_t *prev = NULL, *next;
386 386
387 for_each_pgdat(next) 387 for_each_pgdat(next)
388 if (pgdat->node_id < next->node_id) 388 if (pgdat->node_id < next->node_id)
389 break; 389 break;
390 else 390 else
391 prev = next; 391 prev = next;
392 392
393 if (prev) { 393 if (prev) {
394 prev->pgdat_next = pgdat; 394 prev->pgdat_next = pgdat;
395 pgdat->pgdat_next = next; 395 pgdat->pgdat_next = next;
396 } else { 396 } else {
397 pgdat->pgdat_next = pgdat_list; 397 pgdat->pgdat_next = pgdat_list;
398 pgdat_list = pgdat; 398 pgdat_list = pgdat;
399 } 399 }
400 400
401 return; 401 return;
402 } 402 }
403 403
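pgdat_insert() is a plain ordered insert into a singly linked list keyed on node_id, so memoryless nodes end up in pgdat_list in node order. A self-contained sketch of the same pattern using an ordinary struct in place of pg_data_t (illustrative only):

    #include <stdio.h>
    #include <stdlib.h>

    struct node {
    	int node_id;
    	struct node *next;
    };

    static struct node *node_list;	/* stand-in for the global pgdat_list */

    /* Insert so the list stays sorted by node_id, as pgdat_insert() does. */
    static void node_insert(struct node *n)
    {
    	struct node *prev = NULL, *next;

    	for (next = node_list; next; next = next->next) {
    		if (n->node_id < next->node_id)
    			break;
    		prev = next;
    	}

    	if (prev) {
    		prev->next = n;
    		n->next = next;
    	} else {
    		n->next = node_list;
    		node_list = n;
    	}
    }

    int main(void)
    {
    	int ids[] = { 2, 0, 3, 1 };
    	for (unsigned i = 0; i < sizeof(ids) / sizeof(ids[0]); i++) {
    		struct node *n = calloc(1, sizeof(*n));
    		n->node_id = ids[i];
    		node_insert(n);
    	}
    	for (struct node *n = node_list; n; n = n->next)
    		printf("%d ", n->node_id);	/* prints: 0 1 2 3 */
    	printf("\n");
    	return 0;
    }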
404 /** 404 /**
405 * memory_less_nodes - allocate and initialize pernode data for CPU-only 405 * memory_less_nodes - allocate and initialize pernode data for CPU-only
406 * (memoryless) nodes. 406 * (memoryless) nodes.
407 */ 407 */
408 static void __init memory_less_nodes(void) 408 static void __init memory_less_nodes(void)
409 { 409 {
410 unsigned long pernodesize; 410 unsigned long pernodesize;
411 void *pernode; 411 void *pernode;
412 int node; 412 int node;
413 413
414 for_each_node_mask(node, memory_less_mask) { 414 for_each_node_mask(node, memory_less_mask) {
415 pernodesize = compute_pernodesize(node); 415 pernodesize = compute_pernodesize(node);
416 pernode = memory_less_node_alloc(node, pernodesize, 416 pernode = memory_less_node_alloc(node, pernodesize,
417 (node) ? (node * PERCPU_PAGE_SIZE) : (1024*1024)); 417 (node) ? (node * PERCPU_PAGE_SIZE) : (1024*1024));
418 fill_pernode(node, __pa(pernode), pernodesize); 418 fill_pernode(node, __pa(pernode), pernodesize);
419 } 419 }
420 420
421 return; 421 return;
422 } 422 }
423 423
424 #ifdef CONFIG_SPARSEMEM 424 #ifdef CONFIG_SPARSEMEM
425 /** 425 /**
426 * register_sparse_mem - notify SPARSEMEM that this memory range exists. 426 * register_sparse_mem - notify SPARSEMEM that this memory range exists.
427 * @start: physical start of range 427 * @start: physical start of range
428 * @end: physical end of range 428 * @end: physical end of range
429 * @arg: unused 429 * @arg: unused
430 * 430 *
431 * Simply calls SPARSEMEM to register memory section(s). 431 * Simply calls SPARSEMEM to register memory section(s).
432 */ 432 */
433 static int __init register_sparse_mem(unsigned long start, unsigned long end, 433 static int __init register_sparse_mem(unsigned long start, unsigned long end,
434 void *arg) 434 void *arg)
435 { 435 {
436 int nid; 436 int nid;
437 437
438 start = __pa(start) >> PAGE_SHIFT; 438 start = __pa(start) >> PAGE_SHIFT;
439 end = __pa(end) >> PAGE_SHIFT; 439 end = __pa(end) >> PAGE_SHIFT;
440 nid = early_pfn_to_nid(start); 440 nid = early_pfn_to_nid(start);
441 memory_present(nid, start, end); 441 memory_present(nid, start, end);
442 442
443 return 0; 443 return 0;
444 } 444 }
445 445
446 static void __init arch_sparse_init(void) 446 static void __init arch_sparse_init(void)
447 { 447 {
448 efi_memmap_walk(register_sparse_mem, NULL); 448 efi_memmap_walk(register_sparse_mem, NULL);
449 sparse_init(); 449 sparse_init();
450 } 450 }
451 #else 451 #else
452 #define arch_sparse_init() do {} while (0) 452 #define arch_sparse_init() do {} while (0)
453 #endif 453 #endif
454 454
455 /** 455 /**
456 * find_memory - walk the EFI memory map and setup the bootmem allocator 456 * find_memory - walk the EFI memory map and setup the bootmem allocator
457 * 457 *
458 * Called early in boot to set up the bootmem allocator, and to 458 * Called early in boot to set up the bootmem allocator, and to
459 * allocate the per-cpu and per-node structures. 459 * allocate the per-cpu and per-node structures.
460 */ 460 */
461 void __init find_memory(void) 461 void __init find_memory(void)
462 { 462 {
463 int node; 463 int node;
464 464
465 reserve_memory(); 465 reserve_memory();
466 466
467 if (num_online_nodes() == 0) { 467 if (num_online_nodes() == 0) {
468 printk(KERN_ERR "node info missing!\n"); 468 printk(KERN_ERR "node info missing!\n");
469 node_set_online(0); 469 node_set_online(0);
470 } 470 }
471 471
472 nodes_or(memory_less_mask, memory_less_mask, node_online_map); 472 nodes_or(memory_less_mask, memory_less_mask, node_online_map);
473 min_low_pfn = -1; 473 min_low_pfn = -1;
474 max_low_pfn = 0; 474 max_low_pfn = 0;
475 475
476 /* These actually end up getting called by call_pernode_memory() */ 476 /* These actually end up getting called by call_pernode_memory() */
477 efi_memmap_walk(filter_rsvd_memory, build_node_maps); 477 efi_memmap_walk(filter_rsvd_memory, build_node_maps);
478 efi_memmap_walk(filter_rsvd_memory, find_pernode_space); 478 efi_memmap_walk(filter_rsvd_memory, find_pernode_space);
479 479
480 for_each_online_node(node) 480 for_each_online_node(node)
481 if (mem_data[node].bootmem_data.node_low_pfn) { 481 if (mem_data[node].bootmem_data.node_low_pfn) {
482 node_clear(node, memory_less_mask); 482 node_clear(node, memory_less_mask);
483 mem_data[node].min_pfn = ~0UL; 483 mem_data[node].min_pfn = ~0UL;
484 } 484 }
485 /* 485 /*
486 * Initialize the boot memory maps in reverse order since that's 486 * Initialize the boot memory maps in reverse order since that's
487 * what the bootmem allocator expects 487 * what the bootmem allocator expects
488 */ 488 */
489 for (node = MAX_NUMNODES - 1; node >= 0; node--) { 489 for (node = MAX_NUMNODES - 1; node >= 0; node--) {
490 unsigned long pernode, pernodesize, map; 490 unsigned long pernode, pernodesize, map;
491 struct bootmem_data *bdp; 491 struct bootmem_data *bdp;
492 492
493 if (!node_online(node)) 493 if (!node_online(node))
494 continue; 494 continue;
495 else if (node_isset(node, memory_less_mask)) 495 else if (node_isset(node, memory_less_mask))
496 continue; 496 continue;
497 497
498 bdp = &mem_data[node].bootmem_data; 498 bdp = &mem_data[node].bootmem_data;
499 pernode = mem_data[node].pernode_addr; 499 pernode = mem_data[node].pernode_addr;
500 pernodesize = mem_data[node].pernode_size; 500 pernodesize = mem_data[node].pernode_size;
501 map = pernode + pernodesize; 501 map = pernode + pernodesize;
502 502
503 init_bootmem_node(mem_data[node].pgdat, 503 init_bootmem_node(mem_data[node].pgdat,
504 map>>PAGE_SHIFT, 504 map>>PAGE_SHIFT,
505 bdp->node_boot_start>>PAGE_SHIFT, 505 bdp->node_boot_start>>PAGE_SHIFT,
506 bdp->node_low_pfn); 506 bdp->node_low_pfn);
507 } 507 }
508 508
509 efi_memmap_walk(filter_rsvd_memory, free_node_bootmem); 509 efi_memmap_walk(filter_rsvd_memory, free_node_bootmem);
510 510
511 reserve_pernode_space(); 511 reserve_pernode_space();
512 memory_less_nodes(); 512 memory_less_nodes();
513 initialize_pernode_data(); 513 initialize_pernode_data();
514 514
515 max_pfn = max_low_pfn; 515 max_pfn = max_low_pfn;
516 516
517 find_initrd(); 517 find_initrd();
518 } 518 }
519 519
520 #ifdef CONFIG_SMP 520 #ifdef CONFIG_SMP
521 /** 521 /**
522 * per_cpu_init - setup per-cpu variables 522 * per_cpu_init - setup per-cpu variables
523 * 523 *
524 * find_pernode_space() does most of this already; we just need to set 524 * find_pernode_space() does most of this already; we just need to set
525 * local_per_cpu_offset 525 * local_per_cpu_offset
526 */ 526 */
527 void *per_cpu_init(void) 527 void *per_cpu_init(void)
528 { 528 {
529 int cpu; 529 int cpu;
530 530
531 if (smp_processor_id() != 0) 531 if (smp_processor_id() != 0)
532 return __per_cpu_start + __per_cpu_offset[smp_processor_id()]; 532 return __per_cpu_start + __per_cpu_offset[smp_processor_id()];
533 533
534 for (cpu = 0; cpu < NR_CPUS; cpu++) 534 for (cpu = 0; cpu < NR_CPUS; cpu++)
535 per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu]; 535 per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu];
536 536
537 return __per_cpu_start + __per_cpu_offset[smp_processor_id()]; 537 return __per_cpu_start + __per_cpu_offset[smp_processor_id()];
538 } 538 }
539 #endif /* CONFIG_SMP */ 539 #endif /* CONFIG_SMP */
540 540
541 /** 541 /**
542 * show_mem - give short summary of memory stats 542 * show_mem - give short summary of memory stats
543 * 543 *
544 * Shows a simple page count of reserved and used pages in the system. 544 * Shows a simple page count of reserved and used pages in the system.
545 * For discontig machines, it does this on a per-pgdat basis. 545 * For discontig machines, it does this on a per-pgdat basis.
546 */ 546 */
547 void show_mem(void) 547 void show_mem(void)
548 { 548 {
549 int i, total_reserved = 0; 549 int i, total_reserved = 0;
550 int total_shared = 0, total_cached = 0; 550 int total_shared = 0, total_cached = 0;
551 unsigned long total_present = 0; 551 unsigned long total_present = 0;
552 pg_data_t *pgdat; 552 pg_data_t *pgdat;
553 553
554 printk("Mem-info:\n"); 554 printk("Mem-info:\n");
555 show_free_areas(); 555 show_free_areas();
556 printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); 556 printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
557 for_each_pgdat(pgdat) { 557 for_each_pgdat(pgdat) {
558 unsigned long present = pgdat->node_present_pages; 558 unsigned long present;
559 unsigned long flags;
559 int shared = 0, cached = 0, reserved = 0; 560 int shared = 0, cached = 0, reserved = 0;
561
560 printk("Node ID: %d\n", pgdat->node_id); 562 printk("Node ID: %d\n", pgdat->node_id);
563 pgdat_resize_lock(pgdat, &flags);
564 present = pgdat->node_present_pages;
561 for(i = 0; i < pgdat->node_spanned_pages; i++) { 565 for(i = 0; i < pgdat->node_spanned_pages; i++) {
562 struct page *page; 566 struct page *page;
563 if (pfn_valid(pgdat->node_start_pfn + i)) 567 if (pfn_valid(pgdat->node_start_pfn + i))
564 page = pfn_to_page(pgdat->node_start_pfn + i); 568 page = pfn_to_page(pgdat->node_start_pfn + i);
565 else 569 else
566 continue; 570 continue;
567 if (PageReserved(page)) 571 if (PageReserved(page))
568 reserved++; 572 reserved++;
569 else if (PageSwapCache(page)) 573 else if (PageSwapCache(page))
570 cached++; 574 cached++;
571 else if (page_count(page)) 575 else if (page_count(page))
572 shared += page_count(page)-1; 576 shared += page_count(page)-1;
573 } 577 }
578 pgdat_resize_unlock(pgdat, &flags);
574 total_present += present; 579 total_present += present;
575 total_reserved += reserved; 580 total_reserved += reserved;
576 total_cached += cached; 581 total_cached += cached;
577 total_shared += shared; 582 total_shared += shared;
578 printk("\t%ld pages of RAM\n", present); 583 printk("\t%ld pages of RAM\n", present);
579 printk("\t%d reserved pages\n", reserved); 584 printk("\t%d reserved pages\n", reserved);
580 printk("\t%d pages shared\n", shared); 585 printk("\t%d pages shared\n", shared);
581 printk("\t%d pages swap cached\n", cached); 586 printk("\t%d pages swap cached\n", cached);
582 } 587 }
583 printk("%ld pages of RAM\n", total_present); 588 printk("%ld pages of RAM\n", total_present);
584 printk("%d reserved pages\n", total_reserved); 589 printk("%d reserved pages\n", total_reserved);
585 printk("%d pages shared\n", total_shared); 590 printk("%d pages shared\n", total_shared);
586 printk("%d pages swap cached\n", total_cached); 591 printk("%d pages swap cached\n", total_cached);
587 printk("Total of %ld pages in page table cache\n", 592 printk("Total of %ld pages in page table cache\n",
588 pgtable_quicklist_total_size()); 593 pgtable_quicklist_total_size());
589 printk("%d free buffer pages\n", nr_free_buffer_pages()); 594 printk("%d free buffer pages\n", nr_free_buffer_pages());
590 } 595 }
591 596
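The show_mem() hunk above is the heart of this patch: once sections can be removed at runtime, pfn_valid() may turn false under a walker that holds no reference on the pages, so the whole pfn scan sits under the node's resize lock. Reduced to its skeleton, the pattern the series repeats in each arch's show_mem() looks like this (kernel-style fragment, not a standalone program; pgdat_resize_lock()/pgdat_resize_unlock() wrap pgdat->node_size_lock):

    unsigned long flags;
    unsigned long i;

    pgdat_resize_lock(pgdat, &flags);	/* stop sections vanishing under us */
    for (i = 0; i < pgdat->node_spanned_pages; i++) {
    	unsigned long pfn = pgdat->node_start_pfn + i;

    	if (!pfn_valid(pfn))		/* may be false for removed sections */
    		continue;
    	/* ... examine pfn_to_page(pfn) without taking a page reference ... */
    }
    pgdat_resize_unlock(pgdat, &flags);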
592 /** 597 /**
593 * call_pernode_memory - use SRAT to call callback functions with node info 598 * call_pernode_memory - use SRAT to call callback functions with node info
594 * @start: physical start of range 599 * @start: physical start of range
595 * @len: length of range 600 * @len: length of range
596 * @arg: function to call for each range 601 * @arg: function to call for each range
597 * 602 *
598 * efi_memmap_walk() knows nothing about layout of memory across nodes. Find 603 * efi_memmap_walk() knows nothing about layout of memory across nodes. Find
599 * out to which node a block of memory belongs. Ignore memory that we cannot 604 * out to which node a block of memory belongs. Ignore memory that we cannot
600 * identify, and split blocks that run across multiple nodes. 605 * identify, and split blocks that run across multiple nodes.
601 * 606 *
602 * Take this opportunity to round the start address up and the end address 607 * Take this opportunity to round the start address up and the end address
603 * down to page boundaries. 608 * down to page boundaries.
604 */ 609 */
605 void call_pernode_memory(unsigned long start, unsigned long len, void *arg) 610 void call_pernode_memory(unsigned long start, unsigned long len, void *arg)
606 { 611 {
607 unsigned long rs, re, end = start + len; 612 unsigned long rs, re, end = start + len;
608 void (*func)(unsigned long, unsigned long, int); 613 void (*func)(unsigned long, unsigned long, int);
609 int i; 614 int i;
610 615
611 start = PAGE_ALIGN(start); 616 start = PAGE_ALIGN(start);
612 end &= PAGE_MASK; 617 end &= PAGE_MASK;
613 if (start >= end) 618 if (start >= end)
614 return; 619 return;
615 620
616 func = arg; 621 func = arg;
617 622
618 if (!num_node_memblks) { 623 if (!num_node_memblks) {
619 /* No SRAT table, so assume one node (node 0) */ 624 /* No SRAT table, so assume one node (node 0) */
620 if (start < end) 625 if (start < end)
621 (*func)(start, end - start, 0); 626 (*func)(start, end - start, 0);
622 return; 627 return;
623 } 628 }
624 629
625 for (i = 0; i < num_node_memblks; i++) { 630 for (i = 0; i < num_node_memblks; i++) {
626 rs = max(start, node_memblk[i].start_paddr); 631 rs = max(start, node_memblk[i].start_paddr);
627 re = min(end, node_memblk[i].start_paddr + 632 re = min(end, node_memblk[i].start_paddr +
628 node_memblk[i].size); 633 node_memblk[i].size);
629 634
630 if (rs < re) 635 if (rs < re)
631 (*func)(rs, re - rs, node_memblk[i].nid); 636 (*func)(rs, re - rs, node_memblk[i].nid);
632 637
633 if (re == end) 638 if (re == end)
634 break; 639 break;
635 } 640 }
636 } 641 }
637 642
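call_pernode_memory() is interval clipping: every EFI range is intersected with each SRAT memory block, and each non-empty overlap is handed to the callback tagged with that block's node. A self-contained analogue with a made-up two-node layout (memblk[], the node ids and the callback are all illustrative):

    #include <stdio.h>

    struct memblk { unsigned long start, size; int nid; };

    /* Hypothetical SRAT-style blocks, sorted by start address. */
    static const struct memblk memblk[] = {
    	{ 0x00000000, 0x40000000, 0 },	/* 1 GB on node 0 */
    	{ 0x40000000, 0x40000000, 1 },	/* 1 GB on node 1 */
    };

    static void callback(unsigned long start, unsigned long len, int nid)
    {
    	printf("node %d: [%#lx, %#lx)\n", nid, start, start + len);
    }

    /* Clip [start, start+len) against each block, like call_pernode_memory(). */
    static void per_node(unsigned long start, unsigned long len)
    {
    	unsigned long end = start + len, rs, re;

    	for (unsigned i = 0; i < sizeof(memblk) / sizeof(memblk[0]); i++) {
    		rs = start > memblk[i].start ? start : memblk[i].start;
    		re = end < memblk[i].start + memblk[i].size ?
    			end : memblk[i].start + memblk[i].size;
    		if (rs < re)
    			callback(rs, re - rs, memblk[i].nid);
    		if (re == end)
    			break;
    	}
    }

    int main(void)
    {
    	/* A 1 GB range straddling the node 0 / node 1 boundary. */
    	per_node(0x20000000, 0x40000000);
    	return 0;
    }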
638 /** 643 /**
639 * count_node_pages - callback to build per-node memory info structures 644 * count_node_pages - callback to build per-node memory info structures
640 * @start: physical start of range 645 * @start: physical start of range
641 * @len: length of range 646 * @len: length of range
642 * @node: node where this range resides 647 * @node: node where this range resides
643 * 648 *
644 * Each node has its own number of physical pages, DMAable pages, start, and 649 * Each node has its own number of physical pages, DMAable pages, start, and
645 * end page frame number. This routine will be called by call_pernode_memory() 650 * end page frame number. This routine will be called by call_pernode_memory()
646 * for each piece of usable memory and will set up these values for each node. 651 * for each piece of usable memory and will set up these values for each node.
647 * Very similar to build_maps(). 652 * Very similar to build_maps().
648 */ 653 */
649 static __init int count_node_pages(unsigned long start, unsigned long len, int node) 654 static __init int count_node_pages(unsigned long start, unsigned long len, int node)
650 { 655 {
651 unsigned long end = start + len; 656 unsigned long end = start + len;
652 657
653 mem_data[node].num_physpages += len >> PAGE_SHIFT; 658 mem_data[node].num_physpages += len >> PAGE_SHIFT;
654 if (start <= __pa(MAX_DMA_ADDRESS)) 659 if (start <= __pa(MAX_DMA_ADDRESS))
655 mem_data[node].num_dma_physpages += 660 mem_data[node].num_dma_physpages +=
656 (min(end, __pa(MAX_DMA_ADDRESS)) - start) >>PAGE_SHIFT; 661 (min(end, __pa(MAX_DMA_ADDRESS)) - start) >>PAGE_SHIFT;
657 start = GRANULEROUNDDOWN(start); 662 start = GRANULEROUNDDOWN(start);
658 start = ORDERROUNDDOWN(start); 663 start = ORDERROUNDDOWN(start);
659 end = GRANULEROUNDUP(end); 664 end = GRANULEROUNDUP(end);
660 mem_data[node].max_pfn = max(mem_data[node].max_pfn, 665 mem_data[node].max_pfn = max(mem_data[node].max_pfn,
661 end >> PAGE_SHIFT); 666 end >> PAGE_SHIFT);
662 mem_data[node].min_pfn = min(mem_data[node].min_pfn, 667 mem_data[node].min_pfn = min(mem_data[node].min_pfn,
663 start >> PAGE_SHIFT); 668 start >> PAGE_SHIFT);
664 669
665 return 0; 670 return 0;
666 } 671 }
667 672
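count_node_pages() credits a range's pages to the node and, for the part below MAX_DMA_ADDRESS, to the DMA counter as well. The same bookkeeping as a standalone function, assuming 16 KB pages and a 4 GB DMA boundary purely for illustration (and ignoring the granule/order rounding of the pfn limits):

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SHIFT	14		/* illustrative: 16 KB pages */
    #define DMA_LIMIT	0x100000000ULL	/* illustrative 4 GB ZONE_DMA boundary */

    /* Per-range accounting in the style of count_node_pages(). */
    static void count_range(uint64_t start, uint64_t len,
    			uint64_t *physpages, uint64_t *dma_physpages)
    {
    	uint64_t end = start + len;

    	*physpages += len >> PAGE_SHIFT;
    	if (start <= DMA_LIMIT)
    		*dma_physpages += ((end < DMA_LIMIT ? end : DMA_LIMIT) - start)
    				  >> PAGE_SHIFT;
    }

    int main(void)
    {
    	uint64_t phys = 0, dma = 0;

    	/* a 4 GB range from 2 GB to 6 GB: half of it is DMA-able */
    	count_range(0x080000000ULL, 0x100000000ULL, &phys, &dma);
    	printf("total pages %llu, DMA-able pages %llu\n",
    	       (unsigned long long)phys, (unsigned long long)dma);
    	return 0;
    }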
668 /** 673 /**
669 * paging_init - setup page tables 674 * paging_init - setup page tables
670 * 675 *
671 * paging_init() sets up the page tables for each node of the system and frees 676 * paging_init() sets up the page tables for each node of the system and frees
672 * the bootmem allocator memory for general use. 677 * the bootmem allocator memory for general use.
673 */ 678 */
674 void __init paging_init(void) 679 void __init paging_init(void)
675 { 680 {
676 unsigned long max_dma; 681 unsigned long max_dma;
677 unsigned long zones_size[MAX_NR_ZONES]; 682 unsigned long zones_size[MAX_NR_ZONES];
678 unsigned long zholes_size[MAX_NR_ZONES]; 683 unsigned long zholes_size[MAX_NR_ZONES];
679 unsigned long pfn_offset = 0; 684 unsigned long pfn_offset = 0;
680 int node; 685 int node;
681 686
682 max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT; 687 max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
683 688
684 arch_sparse_init(); 689 arch_sparse_init();
685 690
686 efi_memmap_walk(filter_rsvd_memory, count_node_pages); 691 efi_memmap_walk(filter_rsvd_memory, count_node_pages);
687 692
688 #ifdef CONFIG_VIRTUAL_MEM_MAP 693 #ifdef CONFIG_VIRTUAL_MEM_MAP
689 vmalloc_end -= PAGE_ALIGN(max_low_pfn * sizeof(struct page)); 694 vmalloc_end -= PAGE_ALIGN(max_low_pfn * sizeof(struct page));
690 vmem_map = (struct page *) vmalloc_end; 695 vmem_map = (struct page *) vmalloc_end;
691 efi_memmap_walk(create_mem_map_page_table, NULL); 696 efi_memmap_walk(create_mem_map_page_table, NULL);
692 printk("Virtual mem_map starts at 0x%p\n", vmem_map); 697 printk("Virtual mem_map starts at 0x%p\n", vmem_map);
693 #endif 698 #endif
694 699
695 for_each_online_node(node) { 700 for_each_online_node(node) {
696 memset(zones_size, 0, sizeof(zones_size)); 701 memset(zones_size, 0, sizeof(zones_size));
697 memset(zholes_size, 0, sizeof(zholes_size)); 702 memset(zholes_size, 0, sizeof(zholes_size));
698 703
699 num_physpages += mem_data[node].num_physpages; 704 num_physpages += mem_data[node].num_physpages;
700 705
701 if (mem_data[node].min_pfn >= max_dma) { 706 if (mem_data[node].min_pfn >= max_dma) {
702 /* All of this node's memory is above ZONE_DMA */ 707 /* All of this node's memory is above ZONE_DMA */
703 zones_size[ZONE_NORMAL] = mem_data[node].max_pfn - 708 zones_size[ZONE_NORMAL] = mem_data[node].max_pfn -
704 mem_data[node].min_pfn; 709 mem_data[node].min_pfn;
705 zholes_size[ZONE_NORMAL] = mem_data[node].max_pfn - 710 zholes_size[ZONE_NORMAL] = mem_data[node].max_pfn -
706 mem_data[node].min_pfn - 711 mem_data[node].min_pfn -
707 mem_data[node].num_physpages; 712 mem_data[node].num_physpages;
708 } else if (mem_data[node].max_pfn < max_dma) { 713 } else if (mem_data[node].max_pfn < max_dma) {
709 /* All of this node's memory is in ZONE_DMA */ 714 /* All of this node's memory is in ZONE_DMA */
710 zones_size[ZONE_DMA] = mem_data[node].max_pfn - 715 zones_size[ZONE_DMA] = mem_data[node].max_pfn -
711 mem_data[node].min_pfn; 716 mem_data[node].min_pfn;
712 zholes_size[ZONE_DMA] = mem_data[node].max_pfn - 717 zholes_size[ZONE_DMA] = mem_data[node].max_pfn -
713 mem_data[node].min_pfn - 718 mem_data[node].min_pfn -
714 mem_data[node].num_dma_physpages; 719 mem_data[node].num_dma_physpages;
715 } else { 720 } else {
716 /* This node has memory in both zones */ 721 /* This node has memory in both zones */
717 zones_size[ZONE_DMA] = max_dma - 722 zones_size[ZONE_DMA] = max_dma -
718 mem_data[node].min_pfn; 723 mem_data[node].min_pfn;
719 zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] - 724 zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] -
720 mem_data[node].num_dma_physpages; 725 mem_data[node].num_dma_physpages;
721 zones_size[ZONE_NORMAL] = mem_data[node].max_pfn - 726 zones_size[ZONE_NORMAL] = mem_data[node].max_pfn -
722 max_dma; 727 max_dma;
723 zholes_size[ZONE_NORMAL] = zones_size[ZONE_NORMAL] - 728 zholes_size[ZONE_NORMAL] = zones_size[ZONE_NORMAL] -
724 (mem_data[node].num_physpages - 729 (mem_data[node].num_physpages -
725 mem_data[node].num_dma_physpages); 730 mem_data[node].num_dma_physpages);
726 } 731 }
727 732
728 pfn_offset = mem_data[node].min_pfn; 733 pfn_offset = mem_data[node].min_pfn;
729 734
730 #ifdef CONFIG_VIRTUAL_MEM_MAP 735 #ifdef CONFIG_VIRTUAL_MEM_MAP
731 NODE_DATA(node)->node_mem_map = vmem_map + pfn_offset; 736 NODE_DATA(node)->node_mem_map = vmem_map + pfn_offset;
732 #endif 737 #endif
733 free_area_init_node(node, NODE_DATA(node), zones_size, 738 free_area_init_node(node, NODE_DATA(node), zones_size,
734 pfn_offset, zholes_size); 739 pfn_offset, zholes_size);
735 } 740 }
736 741
737 /* 742 /*
738 * Make memoryless nodes members of the list of known nodes. 743 * Make memoryless nodes members of the list of known nodes.
739 */ 744 */
740 for_each_node_mask(node, memory_less_mask) 745 for_each_node_mask(node, memory_less_mask)
741 pgdat_insert(mem_data[node].pgdat); 746 pgdat_insert(mem_data[node].pgdat);
742 747
743 zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); 748 zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
744 } 749 }
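The zone sizing in paging_init() above has three cases depending on where the node's pfn range falls relative to the DMA boundary. The same arithmetic as a pure function, with made-up pfn values in main() (the zone indices and inputs are illustrative, not the kernel's structures):

    #include <stdio.h>

    /* The three-way split used in paging_init(), as a pure function.
     * All inputs are page frame numbers / page counts; ZONE_DMA covers
     * pfns below max_dma. */
    static void split_zones(unsigned long min_pfn, unsigned long max_pfn,
    			unsigned long max_dma,
    			unsigned long physpages, unsigned long dma_physpages,
    			unsigned long zones[2], unsigned long holes[2])
    {
    	enum { DMA, NORMAL };

    	zones[DMA] = zones[NORMAL] = holes[DMA] = holes[NORMAL] = 0;

    	if (min_pfn >= max_dma) {		/* all memory above ZONE_DMA */
    		zones[NORMAL] = max_pfn - min_pfn;
    		holes[NORMAL] = zones[NORMAL] - physpages;
    	} else if (max_pfn < max_dma) {		/* all memory inside ZONE_DMA */
    		zones[DMA] = max_pfn - min_pfn;
    		holes[DMA] = zones[DMA] - dma_physpages;
    	} else {				/* memory in both zones */
    		zones[DMA] = max_dma - min_pfn;
    		holes[DMA] = zones[DMA] - dma_physpages;
    		zones[NORMAL] = max_pfn - max_dma;
    		holes[NORMAL] = zones[NORMAL] - (physpages - dma_physpages);
    	}
    }

    int main(void)
    {
    	unsigned long zones[2], holes[2];

    	/* made-up node spanning the DMA boundary: pfns 0x1000-0x50000,
    	 * boundary at 0x40000, 0x3e000 present pages, 0x3c000 DMA-able */
    	split_zones(0x1000, 0x50000, 0x40000, 0x3e000, 0x3c000, zones, holes);
    	printf("DMA %lx (holes %lx), NORMAL %lx (holes %lx)\n",
    	       zones[0], holes[0], zones[1], holes[1]);
    	return 0;
    }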
745 750
arch/m32r/mm/init.c
1 /* 1 /*
2 * linux/arch/m32r/mm/init.c 2 * linux/arch/m32r/mm/init.c
3 * 3 *
4 * Copyright (c) 2001, 2002 Hitoshi Yamamoto 4 * Copyright (c) 2001, 2002 Hitoshi Yamamoto
5 * 5 *
6 * Some code taken from sh version. 6 * Some code taken from sh version.
7 * Copyright (C) 1999 Niibe Yutaka 7 * Copyright (C) 1999 Niibe Yutaka
8 * Based on linux/arch/i386/mm/init.c: 8 * Based on linux/arch/i386/mm/init.c:
9 * Copyright (C) 1995 Linus Torvalds 9 * Copyright (C) 1995 Linus Torvalds
10 */ 10 */
11 11
12 #include <linux/init.h> 12 #include <linux/init.h>
13 #include <linux/kernel.h> 13 #include <linux/kernel.h>
14 #include <linux/mm.h> 14 #include <linux/mm.h>
15 #include <linux/pagemap.h> 15 #include <linux/pagemap.h>
16 #include <linux/bootmem.h> 16 #include <linux/bootmem.h>
17 #include <linux/swap.h> 17 #include <linux/swap.h>
18 #include <linux/highmem.h> 18 #include <linux/highmem.h>
19 #include <linux/bitops.h> 19 #include <linux/bitops.h>
20 #include <linux/nodemask.h> 20 #include <linux/nodemask.h>
21 #include <asm/types.h> 21 #include <asm/types.h>
22 #include <asm/processor.h> 22 #include <asm/processor.h>
23 #include <asm/page.h> 23 #include <asm/page.h>
24 #include <asm/pgtable.h> 24 #include <asm/pgtable.h>
25 #include <asm/pgalloc.h> 25 #include <asm/pgalloc.h>
26 #include <asm/mmu_context.h> 26 #include <asm/mmu_context.h>
27 #include <asm/setup.h> 27 #include <asm/setup.h>
28 #include <asm/tlb.h> 28 #include <asm/tlb.h>
29 29
30 /* References to section boundaries */ 30 /* References to section boundaries */
31 extern char _text, _etext, _edata; 31 extern char _text, _etext, _edata;
32 extern char __init_begin, __init_end; 32 extern char __init_begin, __init_end;
33 33
34 pgd_t swapper_pg_dir[1024]; 34 pgd_t swapper_pg_dir[1024];
35 35
36 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 36 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
37 37
38 void show_mem(void) 38 void show_mem(void)
39 { 39 {
40 int total = 0, reserved = 0; 40 int total = 0, reserved = 0;
41 int shared = 0, cached = 0; 41 int shared = 0, cached = 0;
42 int highmem = 0; 42 int highmem = 0;
43 struct page *page; 43 struct page *page;
44 pg_data_t *pgdat; 44 pg_data_t *pgdat;
45 unsigned long i; 45 unsigned long i;
46 46
47 printk("Mem-info:\n"); 47 printk("Mem-info:\n");
48 show_free_areas(); 48 show_free_areas();
49 printk("Free swap: %6ldkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); 49 printk("Free swap: %6ldkB\n",nr_swap_pages<<(PAGE_SHIFT-10));
50 for_each_pgdat(pgdat) { 50 for_each_pgdat(pgdat) {
51 unsigned long flags;
52 pgdat_resize_lock(pgdat, &flags);
51 for (i = 0; i < pgdat->node_spanned_pages; ++i) { 53 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
52 page = pgdat_page_nr(pgdat, i); 54 page = pgdat_page_nr(pgdat, i);
53 total++; 55 total++;
54 if (PageHighMem(page)) 56 if (PageHighMem(page))
55 highmem++; 57 highmem++;
56 if (PageReserved(page)) 58 if (PageReserved(page))
57 reserved++; 59 reserved++;
58 else if (PageSwapCache(page)) 60 else if (PageSwapCache(page))
59 cached++; 61 cached++;
60 else if (page_count(page)) 62 else if (page_count(page))
61 shared += page_count(page) - 1; 63 shared += page_count(page) - 1;
62 } 64 }
65 pgdat_resize_unlock(pgdat, &flags);
63 } 66 }
64 printk("%d pages of RAM\n", total); 67 printk("%d pages of RAM\n", total);
65 printk("%d pages of HIGHMEM\n",highmem); 68 printk("%d pages of HIGHMEM\n",highmem);
66 printk("%d reserved pages\n",reserved); 69 printk("%d reserved pages\n",reserved);
67 printk("%d pages shared\n",shared); 70 printk("%d pages shared\n",shared);
68 printk("%d pages swap cached\n",cached); 71 printk("%d pages swap cached\n",cached);
69 } 72 }
70 73
71 /* 74 /*
72 * Cache of MMU context last used. 75 * Cache of MMU context last used.
73 */ 76 */
74 #ifndef CONFIG_SMP 77 #ifndef CONFIG_SMP
75 unsigned long mmu_context_cache_dat; 78 unsigned long mmu_context_cache_dat;
76 #else 79 #else
77 unsigned long mmu_context_cache_dat[NR_CPUS]; 80 unsigned long mmu_context_cache_dat[NR_CPUS];
78 #endif 81 #endif
79 static unsigned long hole_pages; 82 static unsigned long hole_pages;
80 83
81 /* 84 /*
82 * function prototype 85 * function prototype
83 */ 86 */
84 void __init paging_init(void); 87 void __init paging_init(void);
85 void __init mem_init(void); 88 void __init mem_init(void);
86 void free_initmem(void); 89 void free_initmem(void);
87 #ifdef CONFIG_BLK_DEV_INITRD 90 #ifdef CONFIG_BLK_DEV_INITRD
88 void free_initrd_mem(unsigned long, unsigned long); 91 void free_initrd_mem(unsigned long, unsigned long);
89 #endif 92 #endif
90 93
91 /* It'd be good if these lines were in the standard header file. */ 94 /* It'd be good if these lines were in the standard header file. */
92 #define START_PFN(nid) \ 95 #define START_PFN(nid) \
93 (NODE_DATA(nid)->bdata->node_boot_start >> PAGE_SHIFT) 96 (NODE_DATA(nid)->bdata->node_boot_start >> PAGE_SHIFT)
94 #define MAX_LOW_PFN(nid) (NODE_DATA(nid)->bdata->node_low_pfn) 97 #define MAX_LOW_PFN(nid) (NODE_DATA(nid)->bdata->node_low_pfn)
95 98
96 #ifndef CONFIG_DISCONTIGMEM 99 #ifndef CONFIG_DISCONTIGMEM
97 unsigned long __init zone_sizes_init(void) 100 unsigned long __init zone_sizes_init(void)
98 { 101 {
99 unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; 102 unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
100 unsigned long max_dma; 103 unsigned long max_dma;
101 unsigned long low; 104 unsigned long low;
102 unsigned long start_pfn; 105 unsigned long start_pfn;
103 106
104 #ifdef CONFIG_MMU 107 #ifdef CONFIG_MMU
105 start_pfn = START_PFN(0); 108 start_pfn = START_PFN(0);
106 max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; 109 max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
107 low = MAX_LOW_PFN(0); 110 low = MAX_LOW_PFN(0);
108 111
109 if (low < max_dma){ 112 if (low < max_dma){
110 zones_size[ZONE_DMA] = low - start_pfn; 113 zones_size[ZONE_DMA] = low - start_pfn;
111 zones_size[ZONE_NORMAL] = 0; 114 zones_size[ZONE_NORMAL] = 0;
112 } else { 115 } else {
113 zones_size[ZONE_DMA] = low - start_pfn; 116 zones_size[ZONE_DMA] = low - start_pfn;
114 zones_size[ZONE_NORMAL] = low - max_dma; 117 zones_size[ZONE_NORMAL] = low - max_dma;
115 } 118 }
116 #else 119 #else
117 zones_size[ZONE_DMA] = 0 >> PAGE_SHIFT; 120 zones_size[ZONE_DMA] = 0 >> PAGE_SHIFT;
118 zones_size[ZONE_NORMAL] = __MEMORY_SIZE >> PAGE_SHIFT; 121 zones_size[ZONE_NORMAL] = __MEMORY_SIZE >> PAGE_SHIFT;
119 start_pfn = __MEMORY_START >> PAGE_SHIFT; 122 start_pfn = __MEMORY_START >> PAGE_SHIFT;
120 #endif /* CONFIG_MMU */ 123 #endif /* CONFIG_MMU */
121 124
122 free_area_init_node(0, NODE_DATA(0), zones_size, start_pfn, 0); 125 free_area_init_node(0, NODE_DATA(0), zones_size, start_pfn, 0);
123 126
124 return 0; 127 return 0;
125 } 128 }
126 #else /* CONFIG_DISCONTIGMEM */ 129 #else /* CONFIG_DISCONTIGMEM */
127 extern unsigned long zone_sizes_init(void); 130 extern unsigned long zone_sizes_init(void);
128 #endif /* CONFIG_DISCONTIGMEM */ 131 #endif /* CONFIG_DISCONTIGMEM */
129 132
130 /*======================================================================* 133 /*======================================================================*
131 * paging_init() : sets up the page tables 134 * paging_init() : sets up the page tables
132 *======================================================================*/ 135 *======================================================================*/
133 void __init paging_init(void) 136 void __init paging_init(void)
134 { 137 {
135 #ifdef CONFIG_MMU 138 #ifdef CONFIG_MMU
136 int i; 139 int i;
137 pgd_t *pg_dir; 140 pgd_t *pg_dir;
138 141
139 /* We don't need kernel mapping as hardware supports that. */ 142 /* We don't need kernel mapping as hardware supports that. */
140 pg_dir = swapper_pg_dir; 143 pg_dir = swapper_pg_dir;
141 144
142 for (i = 0 ; i < USER_PTRS_PER_PGD * 2 ; i++) 145 for (i = 0 ; i < USER_PTRS_PER_PGD * 2 ; i++)
143 pgd_val(pg_dir[i]) = 0; 146 pgd_val(pg_dir[i]) = 0;
144 #endif /* CONFIG_MMU */ 147 #endif /* CONFIG_MMU */
145 hole_pages = zone_sizes_init(); 148 hole_pages = zone_sizes_init();
146 } 149 }
147 150
148 int __init reservedpages_count(void) 151 int __init reservedpages_count(void)
149 { 152 {
150 int reservedpages, nid, i; 153 int reservedpages, nid, i;
151 154
152 reservedpages = 0; 155 reservedpages = 0;
153 for_each_online_node(nid) 156 for_each_online_node(nid) {
157 unsigned long flags;
158 pgdat_resize_lock(NODE_DATA(nid), &flags);
154 for (i = 0 ; i < MAX_LOW_PFN(nid) - START_PFN(nid) ; i++) 159 for (i = 0 ; i < MAX_LOW_PFN(nid) - START_PFN(nid) ; i++)
155 if (PageReserved(nid_page_nr(nid, i))) 160 if (PageReserved(nid_page_nr(nid, i)))
156 reservedpages++; 161 reservedpages++;
162 pgdat_resize_unlock(NODE_DATA(nid), &flags);
163 }
157 164
158 return reservedpages; 165 return reservedpages;
159 } 166 }
160 167
161 /*======================================================================* 168 /*======================================================================*
162 * mem_init() : 169 * mem_init() :
163 * orig : arch/sh/mm/init.c 170 * orig : arch/sh/mm/init.c
164 *======================================================================*/ 171 *======================================================================*/
165 void __init mem_init(void) 172 void __init mem_init(void)
166 { 173 {
167 int codesize, reservedpages, datasize, initsize; 174 int codesize, reservedpages, datasize, initsize;
168 int nid; 175 int nid;
169 #ifndef CONFIG_MMU 176 #ifndef CONFIG_MMU
170 extern unsigned long memory_end; 177 extern unsigned long memory_end;
171 #endif 178 #endif
172 179
173 num_physpages = 0; 180 num_physpages = 0;
174 for_each_online_node(nid) 181 for_each_online_node(nid)
175 num_physpages += MAX_LOW_PFN(nid) - START_PFN(nid) + 1; 182 num_physpages += MAX_LOW_PFN(nid) - START_PFN(nid) + 1;
176 183
177 num_physpages -= hole_pages; 184 num_physpages -= hole_pages;
178 185
179 #ifndef CONFIG_DISCONTIGMEM 186 #ifndef CONFIG_DISCONTIGMEM
180 max_mapnr = num_physpages; 187 max_mapnr = num_physpages;
181 #endif /* CONFIG_DISCONTIGMEM */ 188 #endif /* CONFIG_DISCONTIGMEM */
182 189
183 #ifdef CONFIG_MMU 190 #ifdef CONFIG_MMU
184 high_memory = (void *)__va(PFN_PHYS(MAX_LOW_PFN(0))); 191 high_memory = (void *)__va(PFN_PHYS(MAX_LOW_PFN(0)));
185 #else 192 #else
186 high_memory = (void *)(memory_end & PAGE_MASK); 193 high_memory = (void *)(memory_end & PAGE_MASK);
187 #endif /* CONFIG_MMU */ 194 #endif /* CONFIG_MMU */
188 195
189 /* clear the zero-page */ 196 /* clear the zero-page */
190 memset(empty_zero_page, 0, PAGE_SIZE); 197 memset(empty_zero_page, 0, PAGE_SIZE);
191 198
192 /* this will put all low memory onto the freelists */ 199 /* this will put all low memory onto the freelists */
193 for_each_online_node(nid) 200 for_each_online_node(nid)
194 totalram_pages += free_all_bootmem_node(NODE_DATA(nid)); 201 totalram_pages += free_all_bootmem_node(NODE_DATA(nid));
195 202
196 reservedpages = reservedpages_count() - hole_pages; 203 reservedpages = reservedpages_count() - hole_pages;
197 codesize = (unsigned long) &_etext - (unsigned long)&_text; 204 codesize = (unsigned long) &_etext - (unsigned long)&_text;
198 datasize = (unsigned long) &_edata - (unsigned long)&_etext; 205 datasize = (unsigned long) &_edata - (unsigned long)&_etext;
199 initsize = (unsigned long) &__init_end - (unsigned long)&__init_begin; 206 initsize = (unsigned long) &__init_end - (unsigned long)&__init_begin;
200 207
201 printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, " 208 printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
202 "%dk reserved, %dk data, %dk init)\n", 209 "%dk reserved, %dk data, %dk init)\n",
203 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), 210 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
204 num_physpages << (PAGE_SHIFT-10), 211 num_physpages << (PAGE_SHIFT-10),
205 codesize >> 10, 212 codesize >> 10,
206 reservedpages << (PAGE_SHIFT-10), 213 reservedpages << (PAGE_SHIFT-10),
207 datasize >> 10, 214 datasize >> 10,
208 initsize >> 10); 215 initsize >> 10);
209 } 216 }
210 217
211 /*======================================================================* 218 /*======================================================================*
212 * free_initmem() : 219 * free_initmem() :
213 * orig : arch/sh/mm/init.c 220 * orig : arch/sh/mm/init.c
214 *======================================================================*/ 221 *======================================================================*/
215 void free_initmem(void) 222 void free_initmem(void)
216 { 223 {
217 unsigned long addr; 224 unsigned long addr;
218 225
219 addr = (unsigned long)(&__init_begin); 226 addr = (unsigned long)(&__init_begin);
220 for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { 227 for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
221 ClearPageReserved(virt_to_page(addr)); 228 ClearPageReserved(virt_to_page(addr));
222 set_page_count(virt_to_page(addr), 1); 229 set_page_count(virt_to_page(addr), 1);
223 free_page(addr); 230 free_page(addr);
224 totalram_pages++; 231 totalram_pages++;
225 } 232 }
226 printk (KERN_INFO "Freeing unused kernel memory: %dk freed\n", \ 233 printk (KERN_INFO "Freeing unused kernel memory: %dk freed\n", \
227 (int)(&__init_end - &__init_begin) >> 10); 234 (int)(&__init_end - &__init_begin) >> 10);
228 } 235 }
229 236
230 #ifdef CONFIG_BLK_DEV_INITRD 237 #ifdef CONFIG_BLK_DEV_INITRD
231 /*======================================================================* 238 /*======================================================================*
232 * free_initrd_mem() : 239 * free_initrd_mem() :
233 * orig : arch/sh/mm/init.c 240 * orig : arch/sh/mm/init.c
234 *======================================================================*/ 241 *======================================================================*/
235 void free_initrd_mem(unsigned long start, unsigned long end) 242 void free_initrd_mem(unsigned long start, unsigned long end)
236 { 243 {
237 unsigned long p; 244 unsigned long p;
238 for (p = start; p < end; p += PAGE_SIZE) { 245 for (p = start; p < end; p += PAGE_SIZE) {
239 ClearPageReserved(virt_to_page(p)); 246 ClearPageReserved(virt_to_page(p));
240 set_page_count(virt_to_page(p), 1); 247 set_page_count(virt_to_page(p), 1);
241 free_page(p); 248 free_page(p);
242 totalram_pages++; 249 totalram_pages++;
243 } 250 }
244 printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); 251 printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
245 } 252 }
246 #endif 253 #endif
247 254
248 255
arch/parisc/mm/init.c
1 /* 1 /*
2 * linux/arch/parisc/mm/init.c 2 * linux/arch/parisc/mm/init.c
3 * 3 *
4 * Copyright (C) 1995 Linus Torvalds 4 * Copyright (C) 1995 Linus Torvalds
5 * Copyright 1999 SuSE GmbH 5 * Copyright 1999 SuSE GmbH
6 * changed by Philipp Rumpf 6 * changed by Philipp Rumpf
7 * Copyright 1999 Philipp Rumpf (prumpf@tux.org) 7 * Copyright 1999 Philipp Rumpf (prumpf@tux.org)
8 * Copyright 2004 Randolph Chung (tausq@debian.org) 8 * Copyright 2004 Randolph Chung (tausq@debian.org)
9 * 9 *
10 */ 10 */
11 11
12 #include <linux/config.h> 12 #include <linux/config.h>
13 13
14 #include <linux/module.h> 14 #include <linux/module.h>
15 #include <linux/mm.h> 15 #include <linux/mm.h>
16 #include <linux/bootmem.h> 16 #include <linux/bootmem.h>
17 #include <linux/delay.h> 17 #include <linux/delay.h>
18 #include <linux/init.h> 18 #include <linux/init.h>
19 #include <linux/pci.h> /* for hppa_dma_ops and pcxl_dma_ops */ 19 #include <linux/pci.h> /* for hppa_dma_ops and pcxl_dma_ops */
20 #include <linux/initrd.h> 20 #include <linux/initrd.h>
21 #include <linux/swap.h> 21 #include <linux/swap.h>
22 #include <linux/unistd.h> 22 #include <linux/unistd.h>
23 #include <linux/nodemask.h> /* for node_online_map */ 23 #include <linux/nodemask.h> /* for node_online_map */
24 #include <linux/pagemap.h> /* for release_pages and page_cache_release */ 24 #include <linux/pagemap.h> /* for release_pages and page_cache_release */
25 25
26 #include <asm/pgalloc.h> 26 #include <asm/pgalloc.h>
27 #include <asm/tlb.h> 27 #include <asm/tlb.h>
28 #include <asm/pdc_chassis.h> 28 #include <asm/pdc_chassis.h>
29 #include <asm/mmzone.h> 29 #include <asm/mmzone.h>
30 30
31 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 31 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
32 32
33 extern char _text; /* start of kernel code, defined by linker */ 33 extern char _text; /* start of kernel code, defined by linker */
34 extern int data_start; 34 extern int data_start;
35 extern char _end; /* end of BSS, defined by linker */ 35 extern char _end; /* end of BSS, defined by linker */
36 extern char __init_begin, __init_end; 36 extern char __init_begin, __init_end;
37 37
38 #ifdef CONFIG_DISCONTIGMEM 38 #ifdef CONFIG_DISCONTIGMEM
39 struct node_map_data node_data[MAX_NUMNODES]; 39 struct node_map_data node_data[MAX_NUMNODES];
40 bootmem_data_t bmem_data[MAX_NUMNODES]; 40 bootmem_data_t bmem_data[MAX_NUMNODES];
41 unsigned char pfnnid_map[PFNNID_MAP_MAX]; 41 unsigned char pfnnid_map[PFNNID_MAP_MAX];
42 #endif 42 #endif
43 43
44 static struct resource data_resource = { 44 static struct resource data_resource = {
45 .name = "Kernel data", 45 .name = "Kernel data",
46 .flags = IORESOURCE_BUSY | IORESOURCE_MEM, 46 .flags = IORESOURCE_BUSY | IORESOURCE_MEM,
47 }; 47 };
48 48
49 static struct resource code_resource = { 49 static struct resource code_resource = {
50 .name = "Kernel code", 50 .name = "Kernel code",
51 .flags = IORESOURCE_BUSY | IORESOURCE_MEM, 51 .flags = IORESOURCE_BUSY | IORESOURCE_MEM,
52 }; 52 };
53 53
54 static struct resource pdcdata_resource = { 54 static struct resource pdcdata_resource = {
55 .name = "PDC data (Page Zero)", 55 .name = "PDC data (Page Zero)",
56 .start = 0, 56 .start = 0,
57 .end = 0x9ff, 57 .end = 0x9ff,
58 .flags = IORESOURCE_BUSY | IORESOURCE_MEM, 58 .flags = IORESOURCE_BUSY | IORESOURCE_MEM,
59 }; 59 };
60 60
61 static struct resource sysram_resources[MAX_PHYSMEM_RANGES]; 61 static struct resource sysram_resources[MAX_PHYSMEM_RANGES];
62 62
63 /* The following array is initialized from the firmware specific 63 /* The following array is initialized from the firmware specific
64 * information retrieved in kernel/inventory.c. 64 * information retrieved in kernel/inventory.c.
65 */ 65 */
66 66
67 physmem_range_t pmem_ranges[MAX_PHYSMEM_RANGES]; 67 physmem_range_t pmem_ranges[MAX_PHYSMEM_RANGES];
68 int npmem_ranges; 68 int npmem_ranges;
69 69
70 #ifdef __LP64__ 70 #ifdef __LP64__
71 #define MAX_MEM (~0UL) 71 #define MAX_MEM (~0UL)
72 #else /* !__LP64__ */ 72 #else /* !__LP64__ */
73 #define MAX_MEM (3584U*1024U*1024U) 73 #define MAX_MEM (3584U*1024U*1024U)
74 #endif /* !__LP64__ */ 74 #endif /* !__LP64__ */
75 75
76 static unsigned long mem_limit = MAX_MEM; 76 static unsigned long mem_limit = MAX_MEM;
77 77
78 static void __init mem_limit_func(void) 78 static void __init mem_limit_func(void)
79 { 79 {
80 char *cp, *end; 80 char *cp, *end;
81 unsigned long limit; 81 unsigned long limit;
82 extern char saved_command_line[]; 82 extern char saved_command_line[];
83 83
84 /* We need this before __setup() functions are called */ 84 /* We need this before __setup() functions are called */
85 85
86 limit = MAX_MEM; 86 limit = MAX_MEM;
87 for (cp = saved_command_line; *cp; ) { 87 for (cp = saved_command_line; *cp; ) {
88 if (memcmp(cp, "mem=", 4) == 0) { 88 if (memcmp(cp, "mem=", 4) == 0) {
89 cp += 4; 89 cp += 4;
90 limit = memparse(cp, &end); 90 limit = memparse(cp, &end);
91 if (end != cp) 91 if (end != cp)
92 break; 92 break;
93 cp = end; 93 cp = end;
94 } else { 94 } else {
95 while (*cp != ' ' && *cp) 95 while (*cp != ' ' && *cp)
96 ++cp; 96 ++cp;
97 while (*cp == ' ') 97 while (*cp == ' ')
98 ++cp; 98 ++cp;
99 } 99 }
100 } 100 }
101 101
102 if (limit < mem_limit) 102 if (limit < mem_limit)
103 mem_limit = limit; 103 mem_limit = limit;
104 } 104 }
105 105
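mem_limit_func() walks the raw command line by hand because it runs before the normal __setup() parsing, and memparse() accepts a number with an optional K/M/G suffix. A rough user-space approximation of that scan (the suffix handling only mimics memparse(); it is not the kernel routine):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Approximate memparse(): decimal/hex number with optional K/M/G suffix. */
    static unsigned long long parse_size(const char *s, char **end)
    {
    	unsigned long long v = strtoull(s, end, 0);

    	switch (**end) {
    	case 'G': case 'g': v <<= 10; /* fall through */
    	case 'M': case 'm': v <<= 10; /* fall through */
    	case 'K': case 'k': v <<= 10; (*end)++; break;
    	}
    	return v;
    }

    /* Walk a command line the way mem_limit_func() does, looking for "mem=". */
    static unsigned long long mem_limit_from(const char *cmdline,
    					 unsigned long long max_mem)
    {
    	const char *cp = cmdline;
    	char *end;

    	while (*cp) {
    		if (!memcmp(cp, "mem=", 4)) {
    			unsigned long long limit = parse_size(cp + 4, &end);
    			if (end != cp + 4)
    				return limit < max_mem ? limit : max_mem;
    			cp = end;
    		} else {
    			while (*cp && *cp != ' ')
    				cp++;
    			while (*cp == ' ')
    				cp++;
    		}
    	}
    	return max_mem;
    }

    int main(void)
    {
    	/* prints 536870912 (512 MB) */
    	printf("%llu\n", mem_limit_from("console=ttyS0 mem=512M ro", ~0ULL));
    	return 0;
    }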
106 #define MAX_GAP (0x40000000UL >> PAGE_SHIFT) 106 #define MAX_GAP (0x40000000UL >> PAGE_SHIFT)
107 107
108 static void __init setup_bootmem(void) 108 static void __init setup_bootmem(void)
109 { 109 {
110 unsigned long bootmap_size; 110 unsigned long bootmap_size;
111 unsigned long mem_max; 111 unsigned long mem_max;
112 unsigned long bootmap_pages; 112 unsigned long bootmap_pages;
113 unsigned long bootmap_start_pfn; 113 unsigned long bootmap_start_pfn;
114 unsigned long bootmap_pfn; 114 unsigned long bootmap_pfn;
115 #ifndef CONFIG_DISCONTIGMEM 115 #ifndef CONFIG_DISCONTIGMEM
116 physmem_range_t pmem_holes[MAX_PHYSMEM_RANGES - 1]; 116 physmem_range_t pmem_holes[MAX_PHYSMEM_RANGES - 1];
117 int npmem_holes; 117 int npmem_holes;
118 #endif 118 #endif
119 int i, sysram_resource_count; 119 int i, sysram_resource_count;
120 120
121 disable_sr_hashing(); /* Turn off space register hashing */ 121 disable_sr_hashing(); /* Turn off space register hashing */
122 122
123 /* 123 /*
124 * Sort the ranges. Since the number of ranges is typically 124 * Sort the ranges. Since the number of ranges is typically
125 * small, and performance is not an issue here, just do 125 * small, and performance is not an issue here, just do
126 * a simple insertion sort. 126 * a simple insertion sort.
127 */ 127 */
128 128
129 for (i = 1; i < npmem_ranges; i++) { 129 for (i = 1; i < npmem_ranges; i++) {
130 int j; 130 int j;
131 131
132 for (j = i; j > 0; j--) { 132 for (j = i; j > 0; j--) {
133 unsigned long tmp; 133 unsigned long tmp;
134 134
135 if (pmem_ranges[j-1].start_pfn < 135 if (pmem_ranges[j-1].start_pfn <
136 pmem_ranges[j].start_pfn) { 136 pmem_ranges[j].start_pfn) {
137 137
138 break; 138 break;
139 } 139 }
140 tmp = pmem_ranges[j-1].start_pfn; 140 tmp = pmem_ranges[j-1].start_pfn;
141 pmem_ranges[j-1].start_pfn = pmem_ranges[j].start_pfn; 141 pmem_ranges[j-1].start_pfn = pmem_ranges[j].start_pfn;
142 pmem_ranges[j].start_pfn = tmp; 142 pmem_ranges[j].start_pfn = tmp;
143 tmp = pmem_ranges[j-1].pages; 143 tmp = pmem_ranges[j-1].pages;
144 pmem_ranges[j-1].pages = pmem_ranges[j].pages; 144 pmem_ranges[j-1].pages = pmem_ranges[j].pages;
145 pmem_ranges[j].pages = tmp; 145 pmem_ranges[j].pages = tmp;
146 } 146 }
147 } 147 }
148 148
149 #ifndef CONFIG_DISCONTIGMEM 149 #ifndef CONFIG_DISCONTIGMEM
150 /* 150 /*
151 * Throw out ranges that are too far apart (controlled by 151 * Throw out ranges that are too far apart (controlled by
152 * MAX_GAP). 152 * MAX_GAP).
153 */ 153 */
154 154
155 for (i = 1; i < npmem_ranges; i++) { 155 for (i = 1; i < npmem_ranges; i++) {
156 if (pmem_ranges[i].start_pfn - 156 if (pmem_ranges[i].start_pfn -
157 (pmem_ranges[i-1].start_pfn + 157 (pmem_ranges[i-1].start_pfn +
158 pmem_ranges[i-1].pages) > MAX_GAP) { 158 pmem_ranges[i-1].pages) > MAX_GAP) {
159 npmem_ranges = i; 159 npmem_ranges = i;
160 printk("Large gap in memory detected (%ld pages). " 160 printk("Large gap in memory detected (%ld pages). "
161 "Consider turning on CONFIG_DISCONTIGMEM\n", 161 "Consider turning on CONFIG_DISCONTIGMEM\n",
162 pmem_ranges[i].start_pfn - 162 pmem_ranges[i].start_pfn -
163 (pmem_ranges[i-1].start_pfn + 163 (pmem_ranges[i-1].start_pfn +
164 pmem_ranges[i-1].pages)); 164 pmem_ranges[i-1].pages));
165 break; 165 break;
166 } 166 }
167 } 167 }
168 #endif 168 #endif
169 169
170 if (npmem_ranges > 1) { 170 if (npmem_ranges > 1) {
171 171
172 /* Print the memory ranges */ 172 /* Print the memory ranges */
173 173
174 printk(KERN_INFO "Memory Ranges:\n"); 174 printk(KERN_INFO "Memory Ranges:\n");
175 175
176 for (i = 0; i < npmem_ranges; i++) { 176 for (i = 0; i < npmem_ranges; i++) {
177 unsigned long start; 177 unsigned long start;
178 unsigned long size; 178 unsigned long size;
179 179
180 size = (pmem_ranges[i].pages << PAGE_SHIFT); 180 size = (pmem_ranges[i].pages << PAGE_SHIFT);
181 start = (pmem_ranges[i].start_pfn << PAGE_SHIFT); 181 start = (pmem_ranges[i].start_pfn << PAGE_SHIFT);
182 printk(KERN_INFO "%2d) Start 0x%016lx End 0x%016lx Size %6ld MB\n", 182 printk(KERN_INFO "%2d) Start 0x%016lx End 0x%016lx Size %6ld MB\n",
183 i,start, start + (size - 1), size >> 20); 183 i,start, start + (size - 1), size >> 20);
184 } 184 }
185 } 185 }
186 186
187 sysram_resource_count = npmem_ranges; 187 sysram_resource_count = npmem_ranges;
188 for (i = 0; i < sysram_resource_count; i++) { 188 for (i = 0; i < sysram_resource_count; i++) {
189 struct resource *res = &sysram_resources[i]; 189 struct resource *res = &sysram_resources[i];
190 res->name = "System RAM"; 190 res->name = "System RAM";
191 res->start = pmem_ranges[i].start_pfn << PAGE_SHIFT; 191 res->start = pmem_ranges[i].start_pfn << PAGE_SHIFT;
192 res->end = res->start + (pmem_ranges[i].pages << PAGE_SHIFT)-1; 192 res->end = res->start + (pmem_ranges[i].pages << PAGE_SHIFT)-1;
193 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; 193 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
194 request_resource(&iomem_resource, res); 194 request_resource(&iomem_resource, res);
195 } 195 }
196 196
197 /* 197 /*
198 * For 32 bit kernels we limit the amount of memory we can 198 * For 32 bit kernels we limit the amount of memory we can
199 * support, in order to preserve enough kernel address space 199 * support, in order to preserve enough kernel address space
200 * for other purposes. For 64 bit kernels we don't normally 200 * for other purposes. For 64 bit kernels we don't normally
201 * limit the memory, but this mechanism can be used to 201 * limit the memory, but this mechanism can be used to
202 * artificially limit the amount of memory (and it is written 202 * artificially limit the amount of memory (and it is written
203 * to work with multiple memory ranges). 203 * to work with multiple memory ranges).
204 */ 204 */
205 205
206 mem_limit_func(); /* check for "mem=" argument */ 206 mem_limit_func(); /* check for "mem=" argument */
207 207
208 mem_max = 0; 208 mem_max = 0;
209 num_physpages = 0; 209 num_physpages = 0;
210 for (i = 0; i < npmem_ranges; i++) { 210 for (i = 0; i < npmem_ranges; i++) {
211 unsigned long rsize; 211 unsigned long rsize;
212 212
213 rsize = pmem_ranges[i].pages << PAGE_SHIFT; 213 rsize = pmem_ranges[i].pages << PAGE_SHIFT;
214 if ((mem_max + rsize) > mem_limit) { 214 if ((mem_max + rsize) > mem_limit) {
215 printk(KERN_WARNING "Memory truncated to %ld MB\n", mem_limit >> 20); 215 printk(KERN_WARNING "Memory truncated to %ld MB\n", mem_limit >> 20);
216 if (mem_max == mem_limit) 216 if (mem_max == mem_limit)
217 npmem_ranges = i; 217 npmem_ranges = i;
218 else { 218 else {
219 pmem_ranges[i].pages = (mem_limit >> PAGE_SHIFT) 219 pmem_ranges[i].pages = (mem_limit >> PAGE_SHIFT)
220 - (mem_max >> PAGE_SHIFT); 220 - (mem_max >> PAGE_SHIFT);
221 npmem_ranges = i + 1; 221 npmem_ranges = i + 1;
222 mem_max = mem_limit; 222 mem_max = mem_limit;
223 } 223 }
224 num_physpages += pmem_ranges[i].pages; 224 num_physpages += pmem_ranges[i].pages;
225 break; 225 break;
226 } 226 }
227 num_physpages += pmem_ranges[i].pages; 227 num_physpages += pmem_ranges[i].pages;
228 mem_max += rsize; 228 mem_max += rsize;
229 } 229 }
230 230
231 printk(KERN_INFO "Total Memory: %ld MB\n",mem_max >> 20); 231 printk(KERN_INFO "Total Memory: %ld MB\n",mem_max >> 20);
232 232
233 #ifndef CONFIG_DISCONTIGMEM 233 #ifndef CONFIG_DISCONTIGMEM
234 /* Merge the ranges, keeping track of the holes */ 234 /* Merge the ranges, keeping track of the holes */
235 235
236 { 236 {
237 unsigned long end_pfn; 237 unsigned long end_pfn;
238 unsigned long hole_pages; 238 unsigned long hole_pages;
239 239
240 npmem_holes = 0; 240 npmem_holes = 0;
241 end_pfn = pmem_ranges[0].start_pfn + pmem_ranges[0].pages; 241 end_pfn = pmem_ranges[0].start_pfn + pmem_ranges[0].pages;
242 for (i = 1; i < npmem_ranges; i++) { 242 for (i = 1; i < npmem_ranges; i++) {
243 243
244 hole_pages = pmem_ranges[i].start_pfn - end_pfn; 244 hole_pages = pmem_ranges[i].start_pfn - end_pfn;
245 if (hole_pages) { 245 if (hole_pages) {
246 pmem_holes[npmem_holes].start_pfn = end_pfn; 246 pmem_holes[npmem_holes].start_pfn = end_pfn;
247 pmem_holes[npmem_holes++].pages = hole_pages; 247 pmem_holes[npmem_holes++].pages = hole_pages;
248 end_pfn += hole_pages; 248 end_pfn += hole_pages;
249 } 249 }
250 end_pfn += pmem_ranges[i].pages; 250 end_pfn += pmem_ranges[i].pages;
251 } 251 }
252 252
253 pmem_ranges[0].pages = end_pfn - pmem_ranges[0].start_pfn; 253 pmem_ranges[0].pages = end_pfn - pmem_ranges[0].start_pfn;
254 npmem_ranges = 1; 254 npmem_ranges = 1;
255 } 255 }
256 #endif 256 #endif
257 257
258 bootmap_pages = 0; 258 bootmap_pages = 0;
259 for (i = 0; i < npmem_ranges; i++) 259 for (i = 0; i < npmem_ranges; i++)
260 bootmap_pages += bootmem_bootmap_pages(pmem_ranges[i].pages); 260 bootmap_pages += bootmem_bootmap_pages(pmem_ranges[i].pages);
261 261
262 bootmap_start_pfn = PAGE_ALIGN(__pa((unsigned long) &_end)) >> PAGE_SHIFT; 262 bootmap_start_pfn = PAGE_ALIGN(__pa((unsigned long) &_end)) >> PAGE_SHIFT;
263 263
264 #ifdef CONFIG_DISCONTIGMEM 264 #ifdef CONFIG_DISCONTIGMEM
265 for (i = 0; i < MAX_PHYSMEM_RANGES; i++) { 265 for (i = 0; i < MAX_PHYSMEM_RANGES; i++) {
266 memset(NODE_DATA(i), 0, sizeof(pg_data_t)); 266 memset(NODE_DATA(i), 0, sizeof(pg_data_t));
267 NODE_DATA(i)->bdata = &bmem_data[i]; 267 NODE_DATA(i)->bdata = &bmem_data[i];
268 } 268 }
269 memset(pfnnid_map, 0xff, sizeof(pfnnid_map)); 269 memset(pfnnid_map, 0xff, sizeof(pfnnid_map));
270 270
271 for (i = 0; i < npmem_ranges; i++) 271 for (i = 0; i < npmem_ranges; i++)
272 node_set_online(i); 272 node_set_online(i);
273 #endif 273 #endif
274 274
275 /* 275 /*
276 * Initialize and free the full range of memory in each range. 276 * Initialize and free the full range of memory in each range.
277 * Note that the only writing these routines do are to the bootmap, 277 * Note that the only writing these routines do are to the bootmap,
278 * and we've made sure to locate the bootmap properly so that they 278 * and we've made sure to locate the bootmap properly so that they
279 * won't be writing over anything important. 279 * won't be writing over anything important.
280 */ 280 */
281 281
282 bootmap_pfn = bootmap_start_pfn; 282 bootmap_pfn = bootmap_start_pfn;
283 max_pfn = 0; 283 max_pfn = 0;
284 for (i = 0; i < npmem_ranges; i++) { 284 for (i = 0; i < npmem_ranges; i++) {
285 unsigned long start_pfn; 285 unsigned long start_pfn;
286 unsigned long npages; 286 unsigned long npages;
287 287
288 start_pfn = pmem_ranges[i].start_pfn; 288 start_pfn = pmem_ranges[i].start_pfn;
289 npages = pmem_ranges[i].pages; 289 npages = pmem_ranges[i].pages;
290 290
291 bootmap_size = init_bootmem_node(NODE_DATA(i), 291 bootmap_size = init_bootmem_node(NODE_DATA(i),
292 bootmap_pfn, 292 bootmap_pfn,
293 start_pfn, 293 start_pfn,
294 (start_pfn + npages) ); 294 (start_pfn + npages) );
295 free_bootmem_node(NODE_DATA(i), 295 free_bootmem_node(NODE_DATA(i),
296 (start_pfn << PAGE_SHIFT), 296 (start_pfn << PAGE_SHIFT),
297 (npages << PAGE_SHIFT) ); 297 (npages << PAGE_SHIFT) );
298 bootmap_pfn += (bootmap_size + PAGE_SIZE - 1) >> PAGE_SHIFT; 298 bootmap_pfn += (bootmap_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
299 if ((start_pfn + npages) > max_pfn) 299 if ((start_pfn + npages) > max_pfn)
300 max_pfn = start_pfn + npages; 300 max_pfn = start_pfn + npages;
301 } 301 }
302 302
303 if ((bootmap_pfn - bootmap_start_pfn) != bootmap_pages) { 303 if ((bootmap_pfn - bootmap_start_pfn) != bootmap_pages) {
304 printk(KERN_WARNING "WARNING! bootmap sizing is messed up!\n"); 304 printk(KERN_WARNING "WARNING! bootmap sizing is messed up!\n");
305 BUG(); 305 BUG();
306 } 306 }
307 307
308 /* reserve PAGE0 pdc memory, kernel text/data/bss & bootmap */ 308 /* reserve PAGE0 pdc memory, kernel text/data/bss & bootmap */
309 309
310 #define PDC_CONSOLE_IO_IODC_SIZE 32768 310 #define PDC_CONSOLE_IO_IODC_SIZE 32768
311 311
312 reserve_bootmem_node(NODE_DATA(0), 0UL, 312 reserve_bootmem_node(NODE_DATA(0), 0UL,
313 (unsigned long)(PAGE0->mem_free + PDC_CONSOLE_IO_IODC_SIZE)); 313 (unsigned long)(PAGE0->mem_free + PDC_CONSOLE_IO_IODC_SIZE));
314 reserve_bootmem_node(NODE_DATA(0),__pa((unsigned long)&_text), 314 reserve_bootmem_node(NODE_DATA(0),__pa((unsigned long)&_text),
315 (unsigned long)(&_end - &_text)); 315 (unsigned long)(&_end - &_text));
316 reserve_bootmem_node(NODE_DATA(0), (bootmap_start_pfn << PAGE_SHIFT), 316 reserve_bootmem_node(NODE_DATA(0), (bootmap_start_pfn << PAGE_SHIFT),
317 ((bootmap_pfn - bootmap_start_pfn) << PAGE_SHIFT)); 317 ((bootmap_pfn - bootmap_start_pfn) << PAGE_SHIFT));
318 318
319 #ifndef CONFIG_DISCONTIGMEM 319 #ifndef CONFIG_DISCONTIGMEM
320 320
321 /* reserve the holes */ 321 /* reserve the holes */
322 322
323 for (i = 0; i < npmem_holes; i++) { 323 for (i = 0; i < npmem_holes; i++) {
324 reserve_bootmem_node(NODE_DATA(0), 324 reserve_bootmem_node(NODE_DATA(0),
325 (pmem_holes[i].start_pfn << PAGE_SHIFT), 325 (pmem_holes[i].start_pfn << PAGE_SHIFT),
326 (pmem_holes[i].pages << PAGE_SHIFT)); 326 (pmem_holes[i].pages << PAGE_SHIFT));
327 } 327 }
328 #endif 328 #endif
329 329
330 #ifdef CONFIG_BLK_DEV_INITRD 330 #ifdef CONFIG_BLK_DEV_INITRD
331 if (initrd_start) { 331 if (initrd_start) {
332 printk(KERN_INFO "initrd: %08lx-%08lx\n", initrd_start, initrd_end); 332 printk(KERN_INFO "initrd: %08lx-%08lx\n", initrd_start, initrd_end);
333 if (__pa(initrd_start) < mem_max) { 333 if (__pa(initrd_start) < mem_max) {
334 unsigned long initrd_reserve; 334 unsigned long initrd_reserve;
335 335
336 if (__pa(initrd_end) > mem_max) { 336 if (__pa(initrd_end) > mem_max) {
337 initrd_reserve = mem_max - __pa(initrd_start); 337 initrd_reserve = mem_max - __pa(initrd_start);
338 } else { 338 } else {
339 initrd_reserve = initrd_end - initrd_start; 339 initrd_reserve = initrd_end - initrd_start;
340 } 340 }
341 initrd_below_start_ok = 1; 341 initrd_below_start_ok = 1;
342 printk(KERN_INFO "initrd: reserving %08lx-%08lx (mem_max %08lx)\n", __pa(initrd_start), __pa(initrd_start) + initrd_reserve, mem_max); 342 printk(KERN_INFO "initrd: reserving %08lx-%08lx (mem_max %08lx)\n", __pa(initrd_start), __pa(initrd_start) + initrd_reserve, mem_max);
343 343
344 reserve_bootmem_node(NODE_DATA(0),__pa(initrd_start), initrd_reserve); 344 reserve_bootmem_node(NODE_DATA(0),__pa(initrd_start), initrd_reserve);
345 } 345 }
346 } 346 }
347 #endif 347 #endif
348 348
349 data_resource.start = virt_to_phys(&data_start); 349 data_resource.start = virt_to_phys(&data_start);
350 data_resource.end = virt_to_phys(&_end)-1; 350 data_resource.end = virt_to_phys(&_end)-1;
351 code_resource.start = virt_to_phys(&_text); 351 code_resource.start = virt_to_phys(&_text);
352 code_resource.end = virt_to_phys(&data_start)-1; 352 code_resource.end = virt_to_phys(&data_start)-1;
353 353
354 /* We don't know which region the kernel will be in, so try 354 /* We don't know which region the kernel will be in, so try
355 * all of them. 355 * all of them.
356 */ 356 */
357 for (i = 0; i < sysram_resource_count; i++) { 357 for (i = 0; i < sysram_resource_count; i++) {
358 struct resource *res = &sysram_resources[i]; 358 struct resource *res = &sysram_resources[i];
359 request_resource(res, &code_resource); 359 request_resource(res, &code_resource);
360 request_resource(res, &data_resource); 360 request_resource(res, &data_resource);
361 } 361 }
362 request_resource(&sysram_resources[0], &pdcdata_resource); 362 request_resource(&sysram_resources[0], &pdcdata_resource);
363 } 363 }
364 364
365 void free_initmem(void) 365 void free_initmem(void)
366 { 366 {
367 /* FIXME: */ 367 /* FIXME: */
368 #if 0 368 #if 0
369 printk(KERN_INFO "NOT FREEING INITMEM (%dk)\n", 369 printk(KERN_INFO "NOT FREEING INITMEM (%dk)\n",
370 (&__init_end - &__init_begin) >> 10); 370 (&__init_end - &__init_begin) >> 10);
371 return; 371 return;
372 #else 372 #else
373 unsigned long addr; 373 unsigned long addr;
374 374
375 printk(KERN_INFO "Freeing unused kernel memory: "); 375 printk(KERN_INFO "Freeing unused kernel memory: ");
376 376
377 #if 1 377 #if 1
378 /* Attempt to catch anyone trying to execute code here 378 /* Attempt to catch anyone trying to execute code here
379 * by filling the page with BRK insns. 379 * by filling the page with BRK insns.
380 * 380 *
381 * If we disable interrupts for all CPUs, then IPI stops working. 381 * If we disable interrupts for all CPUs, then IPI stops working.
382 * Kinda breaks the global cache flushing. 382 * Kinda breaks the global cache flushing.
383 */ 383 */
384 local_irq_disable(); 384 local_irq_disable();
385 385
386 memset(&__init_begin, 0x00, 386 memset(&__init_begin, 0x00,
387 (unsigned long)&__init_end - (unsigned long)&__init_begin); 387 (unsigned long)&__init_end - (unsigned long)&__init_begin);
388 388
389 flush_data_cache(); 389 flush_data_cache();
390 asm volatile("sync" : : ); 390 asm volatile("sync" : : );
391 flush_icache_range((unsigned long)&__init_begin, (unsigned long)&__init_end); 391 flush_icache_range((unsigned long)&__init_begin, (unsigned long)&__init_end);
392 asm volatile("sync" : : ); 392 asm volatile("sync" : : );
393 393
394 local_irq_enable(); 394 local_irq_enable();
395 #endif 395 #endif
396 396
397 addr = (unsigned long)(&__init_begin); 397 addr = (unsigned long)(&__init_begin);
398 for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { 398 for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
399 ClearPageReserved(virt_to_page(addr)); 399 ClearPageReserved(virt_to_page(addr));
400 set_page_count(virt_to_page(addr), 1); 400 set_page_count(virt_to_page(addr), 1);
401 free_page(addr); 401 free_page(addr);
402 num_physpages++; 402 num_physpages++;
403 totalram_pages++; 403 totalram_pages++;
404 } 404 }
405 405
406 /* set up a new led state on systems shipped with an LED State panel */ 406 /* set up a new led state on systems shipped with an LED State panel */
407 pdc_chassis_send_status(PDC_CHASSIS_DIRECT_BCOMPLETE); 407 pdc_chassis_send_status(PDC_CHASSIS_DIRECT_BCOMPLETE);
408 408
409 printk("%luk freed\n", (unsigned long)(&__init_end - &__init_begin) >> 10); 409 printk("%luk freed\n", (unsigned long)(&__init_end - &__init_begin) >> 10);
410 #endif 410 #endif
411 } 411 }
412 412
413 /* 413 /*
414 * Just an arbitrary offset to serve as a "hole" between mapping areas 414 * Just an arbitrary offset to serve as a "hole" between mapping areas
415 * (between top of physical memory and a potential pcxl dma mapping 415 * (between top of physical memory and a potential pcxl dma mapping
416 * area, and below the vmalloc mapping area). 416 * area, and below the vmalloc mapping area).
417 * 417 *
418 * The current 32K value just means that there will be a 32K "hole" 418 * The current 32K value just means that there will be a 32K "hole"
419 * between mapping areas. That means that any out-of-bounds memory 419 * between mapping areas. That means that any out-of-bounds memory
420 * accesses will hopefully be caught. The vmalloc() routine leaves 420 * accesses will hopefully be caught. The vmalloc() routine leaves
421 * a hole of 4kB between each vmalloced area for the same reason. 421 * a hole of 4kB between each vmalloced area for the same reason.
422 */ 422 */
423 423
424 /* Leave room for gateway page expansion */ 424 /* Leave room for gateway page expansion */
425 #if KERNEL_MAP_START < GATEWAY_PAGE_SIZE 425 #if KERNEL_MAP_START < GATEWAY_PAGE_SIZE
426 #error KERNEL_MAP_START is in gateway reserved region 426 #error KERNEL_MAP_START is in gateway reserved region
427 #endif 427 #endif
428 #define MAP_START (KERNEL_MAP_START) 428 #define MAP_START (KERNEL_MAP_START)
429 429
430 #define VM_MAP_OFFSET (32*1024) 430 #define VM_MAP_OFFSET (32*1024)
431 #define SET_MAP_OFFSET(x) ((void *)(((unsigned long)(x) + VM_MAP_OFFSET) \ 431 #define SET_MAP_OFFSET(x) ((void *)(((unsigned long)(x) + VM_MAP_OFFSET) \
432 & ~(VM_MAP_OFFSET-1))) 432 & ~(VM_MAP_OFFSET-1)))
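(Worked example with hypothetical addresses: SET_MAP_OFFSET() above adds VM_MAP_OFFSET and then masks with ~(VM_MAP_OFFSET-1), i.e. it rounds up to the next 32 KB boundary, so the full 32 KB hole the comment describes appears only when the input is already 32 KB aligned.)

	SET_MAP_OFFSET(0x40000000)  /* == (0x40000000 + 0x8000) & ~0x7fff == 0x40008000, a 32 KB hole */
	SET_MAP_OFFSET(0x40001000)  /* == (0x40001000 + 0x8000) & ~0x7fff == 0x40008000, a 28 KB hole:
	                               the result is simply the next 32 KB boundary above the input  */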
433 433
434 void *vmalloc_start; 434 void *vmalloc_start;
435 EXPORT_SYMBOL(vmalloc_start); 435 EXPORT_SYMBOL(vmalloc_start);
436 436
437 #ifdef CONFIG_PA11 437 #ifdef CONFIG_PA11
438 unsigned long pcxl_dma_start; 438 unsigned long pcxl_dma_start;
439 #endif 439 #endif
440 440
441 void __init mem_init(void) 441 void __init mem_init(void)
442 { 442 {
443 high_memory = __va((max_pfn << PAGE_SHIFT)); 443 high_memory = __va((max_pfn << PAGE_SHIFT));
444 444
445 #ifndef CONFIG_DISCONTIGMEM 445 #ifndef CONFIG_DISCONTIGMEM
446 max_mapnr = page_to_pfn(virt_to_page(high_memory - 1)) + 1; 446 max_mapnr = page_to_pfn(virt_to_page(high_memory - 1)) + 1;
447 totalram_pages += free_all_bootmem(); 447 totalram_pages += free_all_bootmem();
448 #else 448 #else
449 { 449 {
450 int i; 450 int i;
451 451
452 for (i = 0; i < npmem_ranges; i++) 452 for (i = 0; i < npmem_ranges; i++)
453 totalram_pages += free_all_bootmem_node(NODE_DATA(i)); 453 totalram_pages += free_all_bootmem_node(NODE_DATA(i));
454 } 454 }
455 #endif 455 #endif
456 456
457 printk(KERN_INFO "Memory: %luk available\n", num_physpages << (PAGE_SHIFT-10)); 457 printk(KERN_INFO "Memory: %luk available\n", num_physpages << (PAGE_SHIFT-10));
458 458
459 #ifdef CONFIG_PA11 459 #ifdef CONFIG_PA11
460 if (hppa_dma_ops == &pcxl_dma_ops) { 460 if (hppa_dma_ops == &pcxl_dma_ops) {
461 pcxl_dma_start = (unsigned long)SET_MAP_OFFSET(MAP_START); 461 pcxl_dma_start = (unsigned long)SET_MAP_OFFSET(MAP_START);
462 vmalloc_start = SET_MAP_OFFSET(pcxl_dma_start + PCXL_DMA_MAP_SIZE); 462 vmalloc_start = SET_MAP_OFFSET(pcxl_dma_start + PCXL_DMA_MAP_SIZE);
463 } else { 463 } else {
464 pcxl_dma_start = 0; 464 pcxl_dma_start = 0;
465 vmalloc_start = SET_MAP_OFFSET(MAP_START); 465 vmalloc_start = SET_MAP_OFFSET(MAP_START);
466 } 466 }
467 #else 467 #else
468 vmalloc_start = SET_MAP_OFFSET(MAP_START); 468 vmalloc_start = SET_MAP_OFFSET(MAP_START);
469 #endif 469 #endif
470 470
471 } 471 }
472 472
473 int do_check_pgt_cache(int low, int high) 473 int do_check_pgt_cache(int low, int high)
474 { 474 {
475 return 0; 475 return 0;
476 } 476 }
477 477
478 unsigned long *empty_zero_page; 478 unsigned long *empty_zero_page;
479 479
480 void show_mem(void) 480 void show_mem(void)
481 { 481 {
482 int i,free = 0,total = 0,reserved = 0; 482 int i,free = 0,total = 0,reserved = 0;
483 int shared = 0, cached = 0; 483 int shared = 0, cached = 0;
484 484
485 printk(KERN_INFO "Mem-info:\n"); 485 printk(KERN_INFO "Mem-info:\n");
486 show_free_areas(); 486 show_free_areas();
487 printk(KERN_INFO "Free swap: %6ldkB\n", 487 printk(KERN_INFO "Free swap: %6ldkB\n",
488 nr_swap_pages<<(PAGE_SHIFT-10)); 488 nr_swap_pages<<(PAGE_SHIFT-10));
489 #ifndef CONFIG_DISCONTIGMEM 489 #ifndef CONFIG_DISCONTIGMEM
490 i = max_mapnr; 490 i = max_mapnr;
491 while (i-- > 0) { 491 while (i-- > 0) {
492 total++; 492 total++;
493 if (PageReserved(mem_map+i)) 493 if (PageReserved(mem_map+i))
494 reserved++; 494 reserved++;
495 else if (PageSwapCache(mem_map+i)) 495 else if (PageSwapCache(mem_map+i))
496 cached++; 496 cached++;
497 else if (!page_count(&mem_map[i])) 497 else if (!page_count(&mem_map[i]))
498 free++; 498 free++;
499 else 499 else
500 shared += page_count(&mem_map[i]) - 1; 500 shared += page_count(&mem_map[i]) - 1;
501 } 501 }
502 #else 502 #else
503 for (i = 0; i < npmem_ranges; i++) { 503 for (i = 0; i < npmem_ranges; i++) {
504 int j; 504 int j;
505 505
506 for (j = node_start_pfn(i); j < node_end_pfn(i); j++) { 506 for (j = node_start_pfn(i); j < node_end_pfn(i); j++) {
507 struct page *p; 507 struct page *p;
508 unsigned long flags;
508 509
510 pgdat_resize_lock(NODE_DATA(i), &flags);
509 p = nid_page_nr(i, j) - node_start_pfn(i); 511 p = nid_page_nr(i, j) - node_start_pfn(i);
510 512
511 total++; 513 total++;
512 if (PageReserved(p)) 514 if (PageReserved(p))
513 reserved++; 515 reserved++;
514 else if (PageSwapCache(p)) 516 else if (PageSwapCache(p))
515 cached++; 517 cached++;
516 else if (!page_count(p)) 518 else if (!page_count(p))
517 free++; 519 free++;
518 else 520 else
519 shared += page_count(p) - 1; 521 shared += page_count(p) - 1;
522 pgdat_resize_unlock(NODE_DATA(i), &flags);
520 } 523 }
521 } 524 }
522 #endif 525 #endif
523 printk(KERN_INFO "%d pages of RAM\n", total); 526 printk(KERN_INFO "%d pages of RAM\n", total);
524 printk(KERN_INFO "%d reserved pages\n", reserved); 527 printk(KERN_INFO "%d reserved pages\n", reserved);
525 printk(KERN_INFO "%d pages shared\n", shared); 528 printk(KERN_INFO "%d pages shared\n", shared);
526 printk(KERN_INFO "%d pages swap cached\n", cached); 529 printk(KERN_INFO "%d pages swap cached\n", cached);
527 530
528 531
529 #ifdef CONFIG_DISCONTIGMEM 532 #ifdef CONFIG_DISCONTIGMEM
530 { 533 {
531 struct zonelist *zl; 534 struct zonelist *zl;
532 int i, j, k; 535 int i, j, k;
533 536
534 for (i = 0; i < npmem_ranges; i++) { 537 for (i = 0; i < npmem_ranges; i++) {
535 for (j = 0; j < MAX_NR_ZONES; j++) { 538 for (j = 0; j < MAX_NR_ZONES; j++) {
536 zl = NODE_DATA(i)->node_zonelists + j; 539 zl = NODE_DATA(i)->node_zonelists + j;
537 540
538 printk("Zone list for zone %d on node %d: ", j, i); 541 printk("Zone list for zone %d on node %d: ", j, i);
539 for (k = 0; zl->zones[k] != NULL; k++) 542 for (k = 0; zl->zones[k] != NULL; k++)
540 printk("[%d/%s] ", zl->zones[k]->zone_pgdat->node_id, zl->zones[k]->name); 543 printk("[%d/%s] ", zl->zones[k]->zone_pgdat->node_id, zl->zones[k]->name);
541 printk("\n"); 544 printk("\n");
542 } 545 }
543 } 546 }
544 } 547 }
545 #endif 548 #endif
546 } 549 }
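The DISCONTIGMEM hunk above is the part of this patch that touches parisc: the per-node page walk in show_mem() now runs under the node's resize lock. A minimal sketch of the pattern, assuming pgdat_resize_lock()/pgdat_resize_unlock() (added elsewhere in this patch, not shown in this excerpt) are irq-saving wrappers around the new per-node lock; the helper function name below is illustrative only:

	/* Sketch only: walk one node's pfns under the resize lock. */
	static void walk_node_pages(int nid)
	{
		pg_data_t *pgdat = NODE_DATA(nid);
		unsigned long pfn, end, flags;

		pgdat_resize_lock(pgdat, &flags);	/* irqs off, node span stable */
		end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
		for (pfn = pgdat->node_start_pfn; pfn < end; pfn++) {
			if (!pfn_valid(pfn))
				continue;
			/* pfn_to_page(pfn) may be inspected here */
		}
		pgdat_resize_unlock(pgdat, &flags);
	}

Note the granularity difference between the two architectures in this excerpt: the parisc hunk takes and drops the lock once per pfn inside its inner loop, while the ppc64 hunk below holds it across a whole per-node walk.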
547 550
548 551
549 static void __init map_pages(unsigned long start_vaddr, unsigned long start_paddr, unsigned long size, pgprot_t pgprot) 552 static void __init map_pages(unsigned long start_vaddr, unsigned long start_paddr, unsigned long size, pgprot_t pgprot)
550 { 553 {
551 pgd_t *pg_dir; 554 pgd_t *pg_dir;
552 pmd_t *pmd; 555 pmd_t *pmd;
553 pte_t *pg_table; 556 pte_t *pg_table;
554 unsigned long end_paddr; 557 unsigned long end_paddr;
555 unsigned long start_pmd; 558 unsigned long start_pmd;
556 unsigned long start_pte; 559 unsigned long start_pte;
557 unsigned long tmp1; 560 unsigned long tmp1;
558 unsigned long tmp2; 561 unsigned long tmp2;
559 unsigned long address; 562 unsigned long address;
560 unsigned long ro_start; 563 unsigned long ro_start;
561 unsigned long ro_end; 564 unsigned long ro_end;
562 unsigned long fv_addr; 565 unsigned long fv_addr;
563 unsigned long gw_addr; 566 unsigned long gw_addr;
564 extern const unsigned long fault_vector_20; 567 extern const unsigned long fault_vector_20;
565 extern void * const linux_gateway_page; 568 extern void * const linux_gateway_page;
566 569
567 ro_start = __pa((unsigned long)&_text); 570 ro_start = __pa((unsigned long)&_text);
568 ro_end = __pa((unsigned long)&data_start); 571 ro_end = __pa((unsigned long)&data_start);
569 fv_addr = __pa((unsigned long)&fault_vector_20) & PAGE_MASK; 572 fv_addr = __pa((unsigned long)&fault_vector_20) & PAGE_MASK;
570 gw_addr = __pa((unsigned long)&linux_gateway_page) & PAGE_MASK; 573 gw_addr = __pa((unsigned long)&linux_gateway_page) & PAGE_MASK;
571 574
572 end_paddr = start_paddr + size; 575 end_paddr = start_paddr + size;
573 576
574 pg_dir = pgd_offset_k(start_vaddr); 577 pg_dir = pgd_offset_k(start_vaddr);
575 578
576 #if PTRS_PER_PMD == 1 579 #if PTRS_PER_PMD == 1
577 start_pmd = 0; 580 start_pmd = 0;
578 #else 581 #else
579 start_pmd = ((start_vaddr >> PMD_SHIFT) & (PTRS_PER_PMD - 1)); 582 start_pmd = ((start_vaddr >> PMD_SHIFT) & (PTRS_PER_PMD - 1));
580 #endif 583 #endif
581 start_pte = ((start_vaddr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)); 584 start_pte = ((start_vaddr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1));
582 585
583 address = start_paddr; 586 address = start_paddr;
584 while (address < end_paddr) { 587 while (address < end_paddr) {
585 #if PTRS_PER_PMD == 1 588 #if PTRS_PER_PMD == 1
586 pmd = (pmd_t *)__pa(pg_dir); 589 pmd = (pmd_t *)__pa(pg_dir);
587 #else 590 #else
588 pmd = (pmd_t *)pgd_address(*pg_dir); 591 pmd = (pmd_t *)pgd_address(*pg_dir);
589 592
590 /* 593 /*
591 * pmd is physical at this point 594 * pmd is physical at this point
592 */ 595 */
593 596
594 if (!pmd) { 597 if (!pmd) {
595 pmd = (pmd_t *) alloc_bootmem_low_pages_node(NODE_DATA(0),PAGE_SIZE << PMD_ORDER); 598 pmd = (pmd_t *) alloc_bootmem_low_pages_node(NODE_DATA(0),PAGE_SIZE << PMD_ORDER);
596 pmd = (pmd_t *) __pa(pmd); 599 pmd = (pmd_t *) __pa(pmd);
597 } 600 }
598 601
599 pgd_populate(NULL, pg_dir, __va(pmd)); 602 pgd_populate(NULL, pg_dir, __va(pmd));
600 #endif 603 #endif
601 pg_dir++; 604 pg_dir++;
602 605
603 /* now change pmd to kernel virtual addresses */ 606 /* now change pmd to kernel virtual addresses */
604 607
605 pmd = (pmd_t *)__va(pmd) + start_pmd; 608 pmd = (pmd_t *)__va(pmd) + start_pmd;
606 for (tmp1 = start_pmd; tmp1 < PTRS_PER_PMD; tmp1++,pmd++) { 609 for (tmp1 = start_pmd; tmp1 < PTRS_PER_PMD; tmp1++,pmd++) {
607 610
608 /* 611 /*
609 * pg_table is physical at this point 612 * pg_table is physical at this point
610 */ 613 */
611 614
612 pg_table = (pte_t *)pmd_address(*pmd); 615 pg_table = (pte_t *)pmd_address(*pmd);
613 if (!pg_table) { 616 if (!pg_table) {
614 pg_table = (pte_t *) 617 pg_table = (pte_t *)
615 alloc_bootmem_low_pages_node(NODE_DATA(0),PAGE_SIZE); 618 alloc_bootmem_low_pages_node(NODE_DATA(0),PAGE_SIZE);
616 pg_table = (pte_t *) __pa(pg_table); 619 pg_table = (pte_t *) __pa(pg_table);
617 } 620 }
618 621
619 pmd_populate_kernel(NULL, pmd, __va(pg_table)); 622 pmd_populate_kernel(NULL, pmd, __va(pg_table));
620 623
621 /* now change pg_table to kernel virtual addresses */ 624 /* now change pg_table to kernel virtual addresses */
622 625
623 pg_table = (pte_t *) __va(pg_table) + start_pte; 626 pg_table = (pte_t *) __va(pg_table) + start_pte;
624 for (tmp2 = start_pte; tmp2 < PTRS_PER_PTE; tmp2++,pg_table++) { 627 for (tmp2 = start_pte; tmp2 < PTRS_PER_PTE; tmp2++,pg_table++) {
625 pte_t pte; 628 pte_t pte;
626 629
627 /* 630 /*
628 * Map the fault vector writable so we can 631 * Map the fault vector writable so we can
629 * write the HPMC checksum. 632 * write the HPMC checksum.
630 */ 633 */
631 if (address >= ro_start && address < ro_end 634 if (address >= ro_start && address < ro_end
632 && address != fv_addr 635 && address != fv_addr
633 && address != gw_addr) 636 && address != gw_addr)
634 pte = __mk_pte(address, PAGE_KERNEL_RO); 637 pte = __mk_pte(address, PAGE_KERNEL_RO);
635 else 638 else
636 pte = __mk_pte(address, pgprot); 639 pte = __mk_pte(address, pgprot);
637 640
638 if (address >= end_paddr) 641 if (address >= end_paddr)
639 pte_val(pte) = 0; 642 pte_val(pte) = 0;
640 643
641 set_pte(pg_table, pte); 644 set_pte(pg_table, pte);
642 645
643 address += PAGE_SIZE; 646 address += PAGE_SIZE;
644 } 647 }
645 start_pte = 0; 648 start_pte = 0;
646 649
647 if (address >= end_paddr) 650 if (address >= end_paddr)
648 break; 651 break;
649 } 652 }
650 start_pmd = 0; 653 start_pmd = 0;
651 } 654 }
652 } 655 }
653 656
654 /* 657 /*
655 * pagetable_init() sets up the page tables 658 * pagetable_init() sets up the page tables
656 * 659 *
657 * Note that gateway_init() places the Linux gateway page at page 0. 660 * Note that gateway_init() places the Linux gateway page at page 0.
658 * Since gateway pages cannot be dereferenced this has the desirable 661 * Since gateway pages cannot be dereferenced this has the desirable
659 * side effect of trapping those pesky NULL-reference errors in the 662 * side effect of trapping those pesky NULL-reference errors in the
660 * kernel. 663 * kernel.
661 */ 664 */
662 static void __init pagetable_init(void) 665 static void __init pagetable_init(void)
663 { 666 {
664 int range; 667 int range;
665 668
666 /* Map each physical memory range to its kernel vaddr */ 669 /* Map each physical memory range to its kernel vaddr */
667 670
668 for (range = 0; range < npmem_ranges; range++) { 671 for (range = 0; range < npmem_ranges; range++) {
669 unsigned long start_paddr; 672 unsigned long start_paddr;
670 unsigned long end_paddr; 673 unsigned long end_paddr;
671 unsigned long size; 674 unsigned long size;
672 675
673 start_paddr = pmem_ranges[range].start_pfn << PAGE_SHIFT; 676 start_paddr = pmem_ranges[range].start_pfn << PAGE_SHIFT;
674 end_paddr = start_paddr + (pmem_ranges[range].pages << PAGE_SHIFT); 677 end_paddr = start_paddr + (pmem_ranges[range].pages << PAGE_SHIFT);
675 size = pmem_ranges[range].pages << PAGE_SHIFT; 678 size = pmem_ranges[range].pages << PAGE_SHIFT;
676 679
677 map_pages((unsigned long)__va(start_paddr), start_paddr, 680 map_pages((unsigned long)__va(start_paddr), start_paddr,
678 size, PAGE_KERNEL); 681 size, PAGE_KERNEL);
679 } 682 }
680 683
681 #ifdef CONFIG_BLK_DEV_INITRD 684 #ifdef CONFIG_BLK_DEV_INITRD
682 if (initrd_end && initrd_end > mem_limit) { 685 if (initrd_end && initrd_end > mem_limit) {
683 printk("initrd: mapping %08lx-%08lx\n", initrd_start, initrd_end); 686 printk("initrd: mapping %08lx-%08lx\n", initrd_start, initrd_end);
684 map_pages(initrd_start, __pa(initrd_start), 687 map_pages(initrd_start, __pa(initrd_start),
685 initrd_end - initrd_start, PAGE_KERNEL); 688 initrd_end - initrd_start, PAGE_KERNEL);
686 } 689 }
687 #endif 690 #endif
688 691
689 empty_zero_page = alloc_bootmem_pages(PAGE_SIZE); 692 empty_zero_page = alloc_bootmem_pages(PAGE_SIZE);
690 memset(empty_zero_page, 0, PAGE_SIZE); 693 memset(empty_zero_page, 0, PAGE_SIZE);
691 } 694 }
692 695
693 static void __init gateway_init(void) 696 static void __init gateway_init(void)
694 { 697 {
695 unsigned long linux_gateway_page_addr; 698 unsigned long linux_gateway_page_addr;
696 /* FIXME: This is 'const' in order to trick the compiler 699 /* FIXME: This is 'const' in order to trick the compiler
697 into not treating it as DP-relative data. */ 700 into not treating it as DP-relative data. */
698 extern void * const linux_gateway_page; 701 extern void * const linux_gateway_page;
699 702
700 linux_gateway_page_addr = LINUX_GATEWAY_ADDR & PAGE_MASK; 703 linux_gateway_page_addr = LINUX_GATEWAY_ADDR & PAGE_MASK;
701 704
702 /* 705 /*
703 * Setup Linux Gateway page. 706 * Setup Linux Gateway page.
704 * 707 *
705 * The Linux gateway page will reside in kernel space (on virtual 708 * The Linux gateway page will reside in kernel space (on virtual
706 * page 0), so it doesn't need to be aliased into user space. 709 * page 0), so it doesn't need to be aliased into user space.
707 */ 710 */
708 711
709 map_pages(linux_gateway_page_addr, __pa(&linux_gateway_page), 712 map_pages(linux_gateway_page_addr, __pa(&linux_gateway_page),
710 PAGE_SIZE, PAGE_GATEWAY); 713 PAGE_SIZE, PAGE_GATEWAY);
711 } 714 }
712 715
713 #ifdef CONFIG_HPUX 716 #ifdef CONFIG_HPUX
714 void 717 void
715 map_hpux_gateway_page(struct task_struct *tsk, struct mm_struct *mm) 718 map_hpux_gateway_page(struct task_struct *tsk, struct mm_struct *mm)
716 { 719 {
717 pgd_t *pg_dir; 720 pgd_t *pg_dir;
718 pmd_t *pmd; 721 pmd_t *pmd;
719 pte_t *pg_table; 722 pte_t *pg_table;
720 unsigned long start_pmd; 723 unsigned long start_pmd;
721 unsigned long start_pte; 724 unsigned long start_pte;
722 unsigned long address; 725 unsigned long address;
723 unsigned long hpux_gw_page_addr; 726 unsigned long hpux_gw_page_addr;
724 /* FIXME: This is 'const' in order to trick the compiler 727 /* FIXME: This is 'const' in order to trick the compiler
725 into not treating it as DP-relative data. */ 728 into not treating it as DP-relative data. */
726 extern void * const hpux_gateway_page; 729 extern void * const hpux_gateway_page;
727 730
728 hpux_gw_page_addr = HPUX_GATEWAY_ADDR & PAGE_MASK; 731 hpux_gw_page_addr = HPUX_GATEWAY_ADDR & PAGE_MASK;
729 732
730 /* 733 /*
731 * Setup HP-UX Gateway page. 734 * Setup HP-UX Gateway page.
732 * 735 *
733 * The HP-UX gateway page resides in the user address space, 736 * The HP-UX gateway page resides in the user address space,
734 * so it needs to be aliased into each process. 737 * so it needs to be aliased into each process.
735 */ 738 */
736 739
737 pg_dir = pgd_offset(mm,hpux_gw_page_addr); 740 pg_dir = pgd_offset(mm,hpux_gw_page_addr);
738 741
739 #if PTRS_PER_PMD == 1 742 #if PTRS_PER_PMD == 1
740 start_pmd = 0; 743 start_pmd = 0;
741 #else 744 #else
742 start_pmd = ((hpux_gw_page_addr >> PMD_SHIFT) & (PTRS_PER_PMD - 1)); 745 start_pmd = ((hpux_gw_page_addr >> PMD_SHIFT) & (PTRS_PER_PMD - 1));
743 #endif 746 #endif
744 start_pte = ((hpux_gw_page_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)); 747 start_pte = ((hpux_gw_page_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1));
745 748
746 address = __pa(&hpux_gateway_page); 749 address = __pa(&hpux_gateway_page);
747 #if PTRS_PER_PMD == 1 750 #if PTRS_PER_PMD == 1
748 pmd = (pmd_t *)__pa(pg_dir); 751 pmd = (pmd_t *)__pa(pg_dir);
749 #else 752 #else
750 pmd = (pmd_t *) pgd_address(*pg_dir); 753 pmd = (pmd_t *) pgd_address(*pg_dir);
751 754
752 /* 755 /*
753 * pmd is physical at this point 756 * pmd is physical at this point
754 */ 757 */
755 758
756 if (!pmd) { 759 if (!pmd) {
757 pmd = (pmd_t *) get_zeroed_page(GFP_KERNEL); 760 pmd = (pmd_t *) get_zeroed_page(GFP_KERNEL);
758 pmd = (pmd_t *) __pa(pmd); 761 pmd = (pmd_t *) __pa(pmd);
759 } 762 }
760 763
761 __pgd_val_set(*pg_dir, PxD_FLAG_PRESENT | PxD_FLAG_VALID | (unsigned long) pmd); 764 __pgd_val_set(*pg_dir, PxD_FLAG_PRESENT | PxD_FLAG_VALID | (unsigned long) pmd);
762 #endif 765 #endif
763 /* now change pmd to kernel virtual addresses */ 766 /* now change pmd to kernel virtual addresses */
764 767
765 pmd = (pmd_t *)__va(pmd) + start_pmd; 768 pmd = (pmd_t *)__va(pmd) + start_pmd;
766 769
767 /* 770 /*
768 * pg_table is physical at this point 771 * pg_table is physical at this point
769 */ 772 */
770 773
771 pg_table = (pte_t *) pmd_address(*pmd); 774 pg_table = (pte_t *) pmd_address(*pmd);
772 if (!pg_table) 775 if (!pg_table)
773 pg_table = (pte_t *) __pa(get_zeroed_page(GFP_KERNEL)); 776 pg_table = (pte_t *) __pa(get_zeroed_page(GFP_KERNEL));
774 777
775 __pmd_val_set(*pmd, PxD_FLAG_PRESENT | PxD_FLAG_VALID | (unsigned long) pg_table); 778 __pmd_val_set(*pmd, PxD_FLAG_PRESENT | PxD_FLAG_VALID | (unsigned long) pg_table);
776 779
777 /* now change pg_table to kernel virtual addresses */ 780 /* now change pg_table to kernel virtual addresses */
778 781
779 pg_table = (pte_t *) __va(pg_table) + start_pte; 782 pg_table = (pte_t *) __va(pg_table) + start_pte;
780 set_pte(pg_table, __mk_pte(address, PAGE_GATEWAY)); 783 set_pte(pg_table, __mk_pte(address, PAGE_GATEWAY));
781 } 784 }
782 EXPORT_SYMBOL(map_hpux_gateway_page); 785 EXPORT_SYMBOL(map_hpux_gateway_page);
783 #endif 786 #endif
784 787
785 extern void flush_tlb_all_local(void); 788 extern void flush_tlb_all_local(void);
786 789
787 void __init paging_init(void) 790 void __init paging_init(void)
788 { 791 {
789 int i; 792 int i;
790 793
791 setup_bootmem(); 794 setup_bootmem();
792 pagetable_init(); 795 pagetable_init();
793 gateway_init(); 796 gateway_init();
794 flush_cache_all_local(); /* start with known state */ 797 flush_cache_all_local(); /* start with known state */
795 flush_tlb_all_local(); 798 flush_tlb_all_local();
796 799
797 for (i = 0; i < npmem_ranges; i++) { 800 for (i = 0; i < npmem_ranges; i++) {
798 unsigned long zones_size[MAX_NR_ZONES] = { 0, 0, 0 }; 801 unsigned long zones_size[MAX_NR_ZONES] = { 0, 0, 0 };
799 802
800 /* We have an IOMMU, so all memory can go into a single 803 /* We have an IOMMU, so all memory can go into a single
801 ZONE_DMA zone. */ 804 ZONE_DMA zone. */
802 zones_size[ZONE_DMA] = pmem_ranges[i].pages; 805 zones_size[ZONE_DMA] = pmem_ranges[i].pages;
803 806
804 #ifdef CONFIG_DISCONTIGMEM 807 #ifdef CONFIG_DISCONTIGMEM
805 /* Need to initialize the pfnnid_map before we can initialize 808 /* Need to initialize the pfnnid_map before we can initialize
806 the zone */ 809 the zone */
807 { 810 {
808 int j; 811 int j;
809 for (j = (pmem_ranges[i].start_pfn >> PFNNID_SHIFT); 812 for (j = (pmem_ranges[i].start_pfn >> PFNNID_SHIFT);
810 j <= ((pmem_ranges[i].start_pfn + pmem_ranges[i].pages) >> PFNNID_SHIFT); 813 j <= ((pmem_ranges[i].start_pfn + pmem_ranges[i].pages) >> PFNNID_SHIFT);
811 j++) { 814 j++) {
812 pfnnid_map[j] = i; 815 pfnnid_map[j] = i;
813 } 816 }
814 } 817 }
815 #endif 818 #endif
816 819
817 free_area_init_node(i, NODE_DATA(i), zones_size, 820 free_area_init_node(i, NODE_DATA(i), zones_size,
818 pmem_ranges[i].start_pfn, NULL); 821 pmem_ranges[i].start_pfn, NULL);
819 } 822 }
820 } 823 }
821 824
822 #ifdef CONFIG_PA20 825 #ifdef CONFIG_PA20
823 826
824 /* 827 /*
825 * Currently, all PA20 chips have 18 bit protection id's, which is the 828 * Currently, all PA20 chips have 18 bit protection id's, which is the
826 * limiting factor (space ids are 32 bits). 829 * limiting factor (space ids are 32 bits).
827 */ 830 */
828 831
829 #define NR_SPACE_IDS 262144 832 #define NR_SPACE_IDS 262144
830 833
831 #else 834 #else
832 835
833 /* 836 /*
834 * Currently we have a one-to-one relationship between space id's and 837 * Currently we have a one-to-one relationship between space id's and
835 * protection id's. Older parisc chips (PCXS, PCXT, PCXL, PCXL2) only 838 * protection id's. Older parisc chips (PCXS, PCXT, PCXL, PCXL2) only
836 * support 15 bit protection id's, so that is the limiting factor. 839 * support 15 bit protection id's, so that is the limiting factor.
837 * PCXT' has 18 bit protection id's, but only 16 bit spaceids, so it's 840 * PCXT' has 18 bit protection id's, but only 16 bit spaceids, so it's
838 * probably not worth the effort for a special case here. 841 * probably not worth the effort for a special case here.
839 */ 842 */
840 843
841 #define NR_SPACE_IDS 32768 844 #define NR_SPACE_IDS 32768
842 845
843 #endif /* !CONFIG_PA20 */ 846 #endif /* !CONFIG_PA20 */
844 847
845 #define RECYCLE_THRESHOLD (NR_SPACE_IDS / 2) 848 #define RECYCLE_THRESHOLD (NR_SPACE_IDS / 2)
846 #define SID_ARRAY_SIZE (NR_SPACE_IDS / (8 * sizeof(long))) 849 #define SID_ARRAY_SIZE (NR_SPACE_IDS / (8 * sizeof(long)))
847 850
848 static unsigned long space_id[SID_ARRAY_SIZE] = { 1 }; /* disallow space 0 */ 851 static unsigned long space_id[SID_ARRAY_SIZE] = { 1 }; /* disallow space 0 */
849 static unsigned long dirty_space_id[SID_ARRAY_SIZE]; 852 static unsigned long dirty_space_id[SID_ARRAY_SIZE];
850 static unsigned long space_id_index; 853 static unsigned long space_id_index;
851 static unsigned long free_space_ids = NR_SPACE_IDS - 1; 854 static unsigned long free_space_ids = NR_SPACE_IDS - 1;
852 static unsigned long dirty_space_ids = 0; 855 static unsigned long dirty_space_ids = 0;
853 856
854 static DEFINE_SPINLOCK(sid_lock); 857 static DEFINE_SPINLOCK(sid_lock);
855 858
856 unsigned long alloc_sid(void) 859 unsigned long alloc_sid(void)
857 { 860 {
858 unsigned long index; 861 unsigned long index;
859 862
860 spin_lock(&sid_lock); 863 spin_lock(&sid_lock);
861 864
862 if (free_space_ids == 0) { 865 if (free_space_ids == 0) {
863 if (dirty_space_ids != 0) { 866 if (dirty_space_ids != 0) {
864 spin_unlock(&sid_lock); 867 spin_unlock(&sid_lock);
865 flush_tlb_all(); /* flush_tlb_all() calls recycle_sids() */ 868 flush_tlb_all(); /* flush_tlb_all() calls recycle_sids() */
866 spin_lock(&sid_lock); 869 spin_lock(&sid_lock);
867 } 870 }
868 if (free_space_ids == 0) 871 if (free_space_ids == 0)
869 BUG(); 872 BUG();
870 } 873 }
871 874
872 free_space_ids--; 875 free_space_ids--;
873 876
874 index = find_next_zero_bit(space_id, NR_SPACE_IDS, space_id_index); 877 index = find_next_zero_bit(space_id, NR_SPACE_IDS, space_id_index);
875 space_id[index >> SHIFT_PER_LONG] |= (1L << (index & (BITS_PER_LONG - 1))); 878 space_id[index >> SHIFT_PER_LONG] |= (1L << (index & (BITS_PER_LONG - 1)));
876 space_id_index = index; 879 space_id_index = index;
877 880
878 spin_unlock(&sid_lock); 881 spin_unlock(&sid_lock);
879 882
880 return index << SPACEID_SHIFT; 883 return index << SPACEID_SHIFT;
881 } 884 }
882 885
883 void free_sid(unsigned long spaceid) 886 void free_sid(unsigned long spaceid)
884 { 887 {
885 unsigned long index = spaceid >> SPACEID_SHIFT; 888 unsigned long index = spaceid >> SPACEID_SHIFT;
886 unsigned long *dirty_space_offset; 889 unsigned long *dirty_space_offset;
887 890
888 dirty_space_offset = dirty_space_id + (index >> SHIFT_PER_LONG); 891 dirty_space_offset = dirty_space_id + (index >> SHIFT_PER_LONG);
889 index &= (BITS_PER_LONG - 1); 892 index &= (BITS_PER_LONG - 1);
890 893
891 spin_lock(&sid_lock); 894 spin_lock(&sid_lock);
892 895
893 if (*dirty_space_offset & (1L << index)) 896 if (*dirty_space_offset & (1L << index))
894 BUG(); /* attempt to free space id twice */ 897 BUG(); /* attempt to free space id twice */
895 898
896 *dirty_space_offset |= (1L << index); 899 *dirty_space_offset |= (1L << index);
897 dirty_space_ids++; 900 dirty_space_ids++;
898 901
899 spin_unlock(&sid_lock); 902 spin_unlock(&sid_lock);
900 } 903 }
901 904
902 905
903 #ifdef CONFIG_SMP 906 #ifdef CONFIG_SMP
904 static void get_dirty_sids(unsigned long *ndirtyptr,unsigned long *dirty_array) 907 static void get_dirty_sids(unsigned long *ndirtyptr,unsigned long *dirty_array)
905 { 908 {
906 int i; 909 int i;
907 910
908 /* NOTE: sid_lock must be held upon entry */ 911 /* NOTE: sid_lock must be held upon entry */
909 912
910 *ndirtyptr = dirty_space_ids; 913 *ndirtyptr = dirty_space_ids;
911 if (dirty_space_ids != 0) { 914 if (dirty_space_ids != 0) {
912 for (i = 0; i < SID_ARRAY_SIZE; i++) { 915 for (i = 0; i < SID_ARRAY_SIZE; i++) {
913 dirty_array[i] = dirty_space_id[i]; 916 dirty_array[i] = dirty_space_id[i];
914 dirty_space_id[i] = 0; 917 dirty_space_id[i] = 0;
915 } 918 }
916 dirty_space_ids = 0; 919 dirty_space_ids = 0;
917 } 920 }
918 921
919 return; 922 return;
920 } 923 }
921 924
922 static void recycle_sids(unsigned long ndirty,unsigned long *dirty_array) 925 static void recycle_sids(unsigned long ndirty,unsigned long *dirty_array)
923 { 926 {
924 int i; 927 int i;
925 928
926 /* NOTE: sid_lock must be held upon entry */ 929 /* NOTE: sid_lock must be held upon entry */
927 930
928 if (ndirty != 0) { 931 if (ndirty != 0) {
929 for (i = 0; i < SID_ARRAY_SIZE; i++) { 932 for (i = 0; i < SID_ARRAY_SIZE; i++) {
930 space_id[i] ^= dirty_array[i]; 933 space_id[i] ^= dirty_array[i];
931 } 934 }
932 935
933 free_space_ids += ndirty; 936 free_space_ids += ndirty;
934 space_id_index = 0; 937 space_id_index = 0;
935 } 938 }
936 } 939 }
937 940
938 #else /* CONFIG_SMP */ 941 #else /* CONFIG_SMP */
939 942
940 static void recycle_sids(void) 943 static void recycle_sids(void)
941 { 944 {
942 int i; 945 int i;
943 946
944 /* NOTE: sid_lock must be held upon entry */ 947 /* NOTE: sid_lock must be held upon entry */
945 948
946 if (dirty_space_ids != 0) { 949 if (dirty_space_ids != 0) {
947 for (i = 0; i < SID_ARRAY_SIZE; i++) { 950 for (i = 0; i < SID_ARRAY_SIZE; i++) {
948 space_id[i] ^= dirty_space_id[i]; 951 space_id[i] ^= dirty_space_id[i];
949 dirty_space_id[i] = 0; 952 dirty_space_id[i] = 0;
950 } 953 }
951 954
952 free_space_ids += dirty_space_ids; 955 free_space_ids += dirty_space_ids;
953 dirty_space_ids = 0; 956 dirty_space_ids = 0;
954 space_id_index = 0; 957 space_id_index = 0;
955 } 958 }
956 } 959 }
957 #endif 960 #endif
958 961
959 /* 962 /*
960 * flush_tlb_all() calls recycle_sids(), since whenever the entire tlb is 963 * flush_tlb_all() calls recycle_sids(), since whenever the entire tlb is
961 * purged, we can safely reuse the space ids that were released but 964 * purged, we can safely reuse the space ids that were released but
962 * not flushed from the tlb. 965 * not flushed from the tlb.
963 */ 966 */
964 967
965 #ifdef CONFIG_SMP 968 #ifdef CONFIG_SMP
966 969
967 static unsigned long recycle_ndirty; 970 static unsigned long recycle_ndirty;
968 static unsigned long recycle_dirty_array[SID_ARRAY_SIZE]; 971 static unsigned long recycle_dirty_array[SID_ARRAY_SIZE];
969 static unsigned int recycle_inuse = 0; 972 static unsigned int recycle_inuse = 0;
970 973
971 void flush_tlb_all(void) 974 void flush_tlb_all(void)
972 { 975 {
973 int do_recycle; 976 int do_recycle;
974 977
975 do_recycle = 0; 978 do_recycle = 0;
976 spin_lock(&sid_lock); 979 spin_lock(&sid_lock);
977 if (dirty_space_ids > RECYCLE_THRESHOLD) { 980 if (dirty_space_ids > RECYCLE_THRESHOLD) {
978 if (recycle_inuse) { 981 if (recycle_inuse) {
979 BUG(); /* FIXME: Use a semaphore/wait queue here */ 982 BUG(); /* FIXME: Use a semaphore/wait queue here */
980 } 983 }
981 get_dirty_sids(&recycle_ndirty,recycle_dirty_array); 984 get_dirty_sids(&recycle_ndirty,recycle_dirty_array);
982 recycle_inuse++; 985 recycle_inuse++;
983 do_recycle++; 986 do_recycle++;
984 } 987 }
985 spin_unlock(&sid_lock); 988 spin_unlock(&sid_lock);
986 on_each_cpu((void (*)(void *))flush_tlb_all_local, NULL, 1, 1); 989 on_each_cpu((void (*)(void *))flush_tlb_all_local, NULL, 1, 1);
987 if (do_recycle) { 990 if (do_recycle) {
988 spin_lock(&sid_lock); 991 spin_lock(&sid_lock);
989 recycle_sids(recycle_ndirty,recycle_dirty_array); 992 recycle_sids(recycle_ndirty,recycle_dirty_array);
990 recycle_inuse = 0; 993 recycle_inuse = 0;
991 spin_unlock(&sid_lock); 994 spin_unlock(&sid_lock);
992 } 995 }
993 } 996 }
994 #else 997 #else
995 void flush_tlb_all(void) 998 void flush_tlb_all(void)
996 { 999 {
997 spin_lock(&sid_lock); 1000 spin_lock(&sid_lock);
998 flush_tlb_all_local(); 1001 flush_tlb_all_local();
999 recycle_sids(); 1002 recycle_sids();
1000 spin_unlock(&sid_lock); 1003 spin_unlock(&sid_lock);
1001 } 1004 }
1002 #endif 1005 #endif
1003 1006
1004 #ifdef CONFIG_BLK_DEV_INITRD 1007 #ifdef CONFIG_BLK_DEV_INITRD
1005 void free_initrd_mem(unsigned long start, unsigned long end) 1008 void free_initrd_mem(unsigned long start, unsigned long end)
1006 { 1009 {
1007 #if 0 1010 #if 0
1008 if (start < end) 1011 if (start < end)
1009 printk(KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); 1012 printk(KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
1010 for (; start < end; start += PAGE_SIZE) { 1013 for (; start < end; start += PAGE_SIZE) {
1011 ClearPageReserved(virt_to_page(start)); 1014 ClearPageReserved(virt_to_page(start));
1012 set_page_count(virt_to_page(start), 1); 1015 set_page_count(virt_to_page(start), 1);
1013 free_page(start); 1016 free_page(start);
1014 num_physpages++; 1017 num_physpages++;
1015 totalram_pages++; 1018 totalram_pages++;
1016 } 1019 }
1017 #endif 1020 #endif
1018 } 1021 }
1019 #endif 1022 #endif
1020 1023
arch/ppc64/mm/init.c
1 /* 1 /*
2 * PowerPC version 2 * PowerPC version
3 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) 3 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
4 * 4 *
5 * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) 5 * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
6 * and Cort Dougan (PReP) (cort@cs.nmt.edu) 6 * and Cort Dougan (PReP) (cort@cs.nmt.edu)
7 * Copyright (C) 1996 Paul Mackerras 7 * Copyright (C) 1996 Paul Mackerras
8 * Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk). 8 * Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk).
9 * 9 *
10 * Derived from "arch/i386/mm/init.c" 10 * Derived from "arch/i386/mm/init.c"
11 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 11 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
12 * 12 *
13 * Dave Engebretsen <engebret@us.ibm.com> 13 * Dave Engebretsen <engebret@us.ibm.com>
14 * Rework for PPC64 port. 14 * Rework for PPC64 port.
15 * 15 *
16 * This program is free software; you can redistribute it and/or 16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License 17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version 18 * as published by the Free Software Foundation; either version
19 * 2 of the License, or (at your option) any later version. 19 * 2 of the License, or (at your option) any later version.
20 * 20 *
21 */ 21 */
22 22
23 #include <linux/config.h> 23 #include <linux/config.h>
24 #include <linux/signal.h> 24 #include <linux/signal.h>
25 #include <linux/sched.h> 25 #include <linux/sched.h>
26 #include <linux/kernel.h> 26 #include <linux/kernel.h>
27 #include <linux/errno.h> 27 #include <linux/errno.h>
28 #include <linux/string.h> 28 #include <linux/string.h>
29 #include <linux/types.h> 29 #include <linux/types.h>
30 #include <linux/mman.h> 30 #include <linux/mman.h>
31 #include <linux/mm.h> 31 #include <linux/mm.h>
32 #include <linux/swap.h> 32 #include <linux/swap.h>
33 #include <linux/stddef.h> 33 #include <linux/stddef.h>
34 #include <linux/vmalloc.h> 34 #include <linux/vmalloc.h>
35 #include <linux/init.h> 35 #include <linux/init.h>
36 #include <linux/delay.h> 36 #include <linux/delay.h>
37 #include <linux/bootmem.h> 37 #include <linux/bootmem.h>
38 #include <linux/highmem.h> 38 #include <linux/highmem.h>
39 #include <linux/idr.h> 39 #include <linux/idr.h>
40 #include <linux/nodemask.h> 40 #include <linux/nodemask.h>
41 #include <linux/module.h> 41 #include <linux/module.h>
42 42
43 #include <asm/pgalloc.h> 43 #include <asm/pgalloc.h>
44 #include <asm/page.h> 44 #include <asm/page.h>
45 #include <asm/prom.h> 45 #include <asm/prom.h>
46 #include <asm/lmb.h> 46 #include <asm/lmb.h>
47 #include <asm/rtas.h> 47 #include <asm/rtas.h>
48 #include <asm/io.h> 48 #include <asm/io.h>
49 #include <asm/mmu_context.h> 49 #include <asm/mmu_context.h>
50 #include <asm/pgtable.h> 50 #include <asm/pgtable.h>
51 #include <asm/mmu.h> 51 #include <asm/mmu.h>
52 #include <asm/uaccess.h> 52 #include <asm/uaccess.h>
53 #include <asm/smp.h> 53 #include <asm/smp.h>
54 #include <asm/machdep.h> 54 #include <asm/machdep.h>
55 #include <asm/tlb.h> 55 #include <asm/tlb.h>
56 #include <asm/eeh.h> 56 #include <asm/eeh.h>
57 #include <asm/processor.h> 57 #include <asm/processor.h>
58 #include <asm/mmzone.h> 58 #include <asm/mmzone.h>
59 #include <asm/cputable.h> 59 #include <asm/cputable.h>
60 #include <asm/ppcdebug.h> 60 #include <asm/ppcdebug.h>
61 #include <asm/sections.h> 61 #include <asm/sections.h>
62 #include <asm/system.h> 62 #include <asm/system.h>
63 #include <asm/iommu.h> 63 #include <asm/iommu.h>
64 #include <asm/abs_addr.h> 64 #include <asm/abs_addr.h>
65 #include <asm/vdso.h> 65 #include <asm/vdso.h>
66 #include <asm/imalloc.h> 66 #include <asm/imalloc.h>
67 67
68 #if PGTABLE_RANGE > USER_VSID_RANGE 68 #if PGTABLE_RANGE > USER_VSID_RANGE
69 #warning Limited user VSID range means pagetable space is wasted 69 #warning Limited user VSID range means pagetable space is wasted
70 #endif 70 #endif
71 71
72 #if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE) 72 #if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE)
73 #warning TASK_SIZE is smaller than it needs to be. 73 #warning TASK_SIZE is smaller than it needs to be.
74 #endif 74 #endif
75 75
76 int mem_init_done; 76 int mem_init_done;
77 unsigned long ioremap_bot = IMALLOC_BASE; 77 unsigned long ioremap_bot = IMALLOC_BASE;
78 static unsigned long phbs_io_bot = PHBS_IO_BASE; 78 static unsigned long phbs_io_bot = PHBS_IO_BASE;
79 79
80 extern pgd_t swapper_pg_dir[]; 80 extern pgd_t swapper_pg_dir[];
81 extern struct task_struct *current_set[NR_CPUS]; 81 extern struct task_struct *current_set[NR_CPUS];
82 82
83 unsigned long klimit = (unsigned long)_end; 83 unsigned long klimit = (unsigned long)_end;
84 84
85 unsigned long _SDR1=0; 85 unsigned long _SDR1=0;
86 unsigned long _ASR=0; 86 unsigned long _ASR=0;
87 87
88 /* max amount of RAM to use */ 88 /* max amount of RAM to use */
89 unsigned long __max_memory; 89 unsigned long __max_memory;
90 90
91 /* info on what we think the IO hole is */ 91 /* info on what we think the IO hole is */
92 unsigned long io_hole_start; 92 unsigned long io_hole_start;
93 unsigned long io_hole_size; 93 unsigned long io_hole_size;
94 94
95 void show_mem(void) 95 void show_mem(void)
96 { 96 {
97 unsigned long total = 0, reserved = 0; 97 unsigned long total = 0, reserved = 0;
98 unsigned long shared = 0, cached = 0; 98 unsigned long shared = 0, cached = 0;
99 struct page *page; 99 struct page *page;
100 pg_data_t *pgdat; 100 pg_data_t *pgdat;
101 unsigned long i; 101 unsigned long i;
102 102
103 printk("Mem-info:\n"); 103 printk("Mem-info:\n");
104 show_free_areas(); 104 show_free_areas();
105 printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); 105 printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
106 for_each_pgdat(pgdat) { 106 for_each_pgdat(pgdat) {
107 unsigned long flags;
108 pgdat_resize_lock(pgdat, &flags);
107 for (i = 0; i < pgdat->node_spanned_pages; i++) { 109 for (i = 0; i < pgdat->node_spanned_pages; i++) {
108 page = pgdat_page_nr(pgdat, i); 110 page = pgdat_page_nr(pgdat, i);
109 total++; 111 total++;
110 if (PageReserved(page)) 112 if (PageReserved(page))
111 reserved++; 113 reserved++;
112 else if (PageSwapCache(page)) 114 else if (PageSwapCache(page))
113 cached++; 115 cached++;
114 else if (page_count(page)) 116 else if (page_count(page))
115 shared += page_count(page) - 1; 117 shared += page_count(page) - 1;
116 } 118 }
119 pgdat_resize_unlock(pgdat, &flags);
117 } 120 }
118 printk("%ld pages of RAM\n", total); 121 printk("%ld pages of RAM\n", total);
119 printk("%ld reserved pages\n", reserved); 122 printk("%ld reserved pages\n", reserved);
120 printk("%ld pages shared\n", shared); 123 printk("%ld pages shared\n", shared);
121 printk("%ld pages swap cached\n", cached); 124 printk("%ld pages swap cached\n", cached);
122 } 125 }
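Both show_mem() hunks in this excerpt lean on the same helper pair. Their definitions are not shown here; presumably they are thin irq-saving wrappers around the per-node size lock this patch introduces, roughly along these lines (a sketch, not copied from the patch):

	/* Assumed shape of the helpers used above; names outside this
	 * excerpt are inferred, not quoted from the patch. */
	static inline void pgdat_resize_lock(struct pglist_data *pgdat,
					     unsigned long *flags)
	{
		spin_lock_irqsave(&pgdat->node_size_lock, *flags);
	}

	static inline void pgdat_resize_unlock(struct pglist_data *pgdat,
					       unsigned long *flags)
	{
		spin_unlock_irqrestore(&pgdat->node_size_lock, *flags);
	}

Holding the lock across the whole for_each_pgdat() body, as ppc64 does here, trades a longer irq-off hold time for a simpler loop.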
123 126
124 #ifdef CONFIG_PPC_ISERIES 127 #ifdef CONFIG_PPC_ISERIES
125 128
126 void __iomem *ioremap(unsigned long addr, unsigned long size) 129 void __iomem *ioremap(unsigned long addr, unsigned long size)
127 { 130 {
128 return (void __iomem *)addr; 131 return (void __iomem *)addr;
129 } 132 }
130 133
131 extern void __iomem *__ioremap(unsigned long addr, unsigned long size, 134 extern void __iomem *__ioremap(unsigned long addr, unsigned long size,
132 unsigned long flags) 135 unsigned long flags)
133 { 136 {
134 return (void __iomem *)addr; 137 return (void __iomem *)addr;
135 } 138 }
136 139
137 void iounmap(volatile void __iomem *addr) 140 void iounmap(volatile void __iomem *addr)
138 { 141 {
139 return; 142 return;
140 } 143 }
141 144
142 #else 145 #else
143 146
144 /* 147 /*
145 * map_io_page currently only called by __ioremap 148 * map_io_page currently only called by __ioremap
146 * map_io_page adds an entry to the ioremap page table 149 * map_io_page adds an entry to the ioremap page table
147 * and adds an entry to the HPT, possibly bolting it 150 * and adds an entry to the HPT, possibly bolting it
148 */ 151 */
149 static int map_io_page(unsigned long ea, unsigned long pa, int flags) 152 static int map_io_page(unsigned long ea, unsigned long pa, int flags)
150 { 153 {
151 pgd_t *pgdp; 154 pgd_t *pgdp;
152 pud_t *pudp; 155 pud_t *pudp;
153 pmd_t *pmdp; 156 pmd_t *pmdp;
154 pte_t *ptep; 157 pte_t *ptep;
155 unsigned long vsid; 158 unsigned long vsid;
156 159
157 if (mem_init_done) { 160 if (mem_init_done) {
158 pgdp = pgd_offset_k(ea); 161 pgdp = pgd_offset_k(ea);
159 pudp = pud_alloc(&init_mm, pgdp, ea); 162 pudp = pud_alloc(&init_mm, pgdp, ea);
160 if (!pudp) 163 if (!pudp)
161 return -ENOMEM; 164 return -ENOMEM;
162 pmdp = pmd_alloc(&init_mm, pudp, ea); 165 pmdp = pmd_alloc(&init_mm, pudp, ea);
163 if (!pmdp) 166 if (!pmdp)
164 return -ENOMEM; 167 return -ENOMEM;
165 ptep = pte_alloc_kernel(pmdp, ea); 168 ptep = pte_alloc_kernel(pmdp, ea);
166 if (!ptep) 169 if (!ptep)
167 return -ENOMEM; 170 return -ENOMEM;
168 set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, 171 set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
169 __pgprot(flags))); 172 __pgprot(flags)));
170 } else { 173 } else {
171 unsigned long va, vpn, hash, hpteg; 174 unsigned long va, vpn, hash, hpteg;
172 175
173 /* 176 /*
174 * If the mm subsystem is not fully up, we cannot create a 177 * If the mm subsystem is not fully up, we cannot create a
175 * linux page table entry for this mapping. Simply bolt an 178 * linux page table entry for this mapping. Simply bolt an
176 * entry in the hardware page table. 179 * entry in the hardware page table.
177 */ 180 */
178 vsid = get_kernel_vsid(ea); 181 vsid = get_kernel_vsid(ea);
179 va = (vsid << 28) | (ea & 0xFFFFFFF); 182 va = (vsid << 28) | (ea & 0xFFFFFFF);
180 vpn = va >> PAGE_SHIFT; 183 vpn = va >> PAGE_SHIFT;
181 184
182 hash = hpt_hash(vpn, 0); 185 hash = hpt_hash(vpn, 0);
183 186
184 hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); 187 hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
185 188
186 /* Panic if a pte group is full */ 189 /* Panic if a pte group is full */
187 if (ppc_md.hpte_insert(hpteg, va, pa >> PAGE_SHIFT, 190 if (ppc_md.hpte_insert(hpteg, va, pa >> PAGE_SHIFT,
188 HPTE_V_BOLTED, 191 HPTE_V_BOLTED,
189 _PAGE_NO_CACHE|_PAGE_GUARDED|PP_RWXX) 192 _PAGE_NO_CACHE|_PAGE_GUARDED|PP_RWXX)
190 == -1) { 193 == -1) {
191 panic("map_io_page: could not insert mapping"); 194 panic("map_io_page: could not insert mapping");
192 } 195 }
193 } 196 }
194 return 0; 197 return 0;
195 } 198 }
196 199
197 200
198 static void __iomem * __ioremap_com(unsigned long addr, unsigned long pa, 201 static void __iomem * __ioremap_com(unsigned long addr, unsigned long pa,
199 unsigned long ea, unsigned long size, 202 unsigned long ea, unsigned long size,
200 unsigned long flags) 203 unsigned long flags)
201 { 204 {
202 unsigned long i; 205 unsigned long i;
203 206
204 if ((flags & _PAGE_PRESENT) == 0) 207 if ((flags & _PAGE_PRESENT) == 0)
205 flags |= pgprot_val(PAGE_KERNEL); 208 flags |= pgprot_val(PAGE_KERNEL);
206 209
207 for (i = 0; i < size; i += PAGE_SIZE) 210 for (i = 0; i < size; i += PAGE_SIZE)
208 if (map_io_page(ea+i, pa+i, flags)) 211 if (map_io_page(ea+i, pa+i, flags))
209 return NULL; 212 return NULL;
210 213
211 return (void __iomem *) (ea + (addr & ~PAGE_MASK)); 214 return (void __iomem *) (ea + (addr & ~PAGE_MASK));
212 } 215 }
213 216
214 217
215 void __iomem * 218 void __iomem *
216 ioremap(unsigned long addr, unsigned long size) 219 ioremap(unsigned long addr, unsigned long size)
217 { 220 {
218 return __ioremap(addr, size, _PAGE_NO_CACHE | _PAGE_GUARDED); 221 return __ioremap(addr, size, _PAGE_NO_CACHE | _PAGE_GUARDED);
219 } 222 }
220 223
221 void __iomem * __ioremap(unsigned long addr, unsigned long size, 224 void __iomem * __ioremap(unsigned long addr, unsigned long size,
222 unsigned long flags) 225 unsigned long flags)
223 { 226 {
224 unsigned long pa, ea; 227 unsigned long pa, ea;
225 void __iomem *ret; 228 void __iomem *ret;
226 229
227 /* 230 /*
228 * Choose an address to map it to. 231 * Choose an address to map it to.
229 * Once the imalloc system is running, we use it. 232 * Once the imalloc system is running, we use it.
230 * Before that, we map using addresses going 233 * Before that, we map using addresses going
231 * up from ioremap_bot. imalloc will use 234 * up from ioremap_bot. imalloc will use
232 * the addresses from ioremap_bot through 235 * the addresses from ioremap_bot through
233 * IMALLOC_END 236 * IMALLOC_END
234 * 237 *
235 */ 238 */
236 pa = addr & PAGE_MASK; 239 pa = addr & PAGE_MASK;
237 size = PAGE_ALIGN(addr + size) - pa; 240 size = PAGE_ALIGN(addr + size) - pa;
238 241
239 if (size == 0) 242 if (size == 0)
240 return NULL; 243 return NULL;
241 244
242 if (mem_init_done) { 245 if (mem_init_done) {
243 struct vm_struct *area; 246 struct vm_struct *area;
244 area = im_get_free_area(size); 247 area = im_get_free_area(size);
245 if (area == NULL) 248 if (area == NULL)
246 return NULL; 249 return NULL;
247 ea = (unsigned long)(area->addr); 250 ea = (unsigned long)(area->addr);
248 ret = __ioremap_com(addr, pa, ea, size, flags); 251 ret = __ioremap_com(addr, pa, ea, size, flags);
249 if (!ret) 252 if (!ret)
250 im_free(area->addr); 253 im_free(area->addr);
251 } else { 254 } else {
252 ea = ioremap_bot; 255 ea = ioremap_bot;
253 ret = __ioremap_com(addr, pa, ea, size, flags); 256 ret = __ioremap_com(addr, pa, ea, size, flags);
254 if (ret) 257 if (ret)
255 ioremap_bot += size; 258 ioremap_bot += size;
256 } 259 }
257 return ret; 260 return ret;
258 } 261 }
259 262
260 #define IS_PAGE_ALIGNED(_val) ((_val) == ((_val) & PAGE_MASK)) 263 #define IS_PAGE_ALIGNED(_val) ((_val) == ((_val) & PAGE_MASK))
261 264
262 int __ioremap_explicit(unsigned long pa, unsigned long ea, 265 int __ioremap_explicit(unsigned long pa, unsigned long ea,
263 unsigned long size, unsigned long flags) 266 unsigned long size, unsigned long flags)
264 { 267 {
265 struct vm_struct *area; 268 struct vm_struct *area;
266 void __iomem *ret; 269 void __iomem *ret;
267 270
268 /* For now, require page-aligned values for pa, ea, and size */ 271 /* For now, require page-aligned values for pa, ea, and size */
269 if (!IS_PAGE_ALIGNED(pa) || !IS_PAGE_ALIGNED(ea) || 272 if (!IS_PAGE_ALIGNED(pa) || !IS_PAGE_ALIGNED(ea) ||
270 !IS_PAGE_ALIGNED(size)) { 273 !IS_PAGE_ALIGNED(size)) {
271 printk(KERN_ERR "unaligned value in %s\n", __FUNCTION__); 274 printk(KERN_ERR "unaligned value in %s\n", __FUNCTION__);
272 return 1; 275 return 1;
273 } 276 }
274 277
275 if (!mem_init_done) { 278 if (!mem_init_done) {
276 /* Two things to consider in this case: 279 /* Two things to consider in this case:
277 * 1) No records will be kept (imalloc, etc) that the region 280 * 1) No records will be kept (imalloc, etc) that the region
278 * has been remapped 281 * has been remapped
279 * 2) It won't be easy to iounmap() the region later (because 282 * 2) It won't be easy to iounmap() the region later (because
280 * of 1) 283 * of 1)
281 */ 284 */
282 ; 285 ;
283 } else { 286 } else {
284 area = im_get_area(ea, size, 287 area = im_get_area(ea, size,
285 IM_REGION_UNUSED|IM_REGION_SUBSET|IM_REGION_EXISTS); 288 IM_REGION_UNUSED|IM_REGION_SUBSET|IM_REGION_EXISTS);
286 if (area == NULL) { 289 if (area == NULL) {
287 /* Expected when PHB-dlpar is in play */ 290 /* Expected when PHB-dlpar is in play */
288 return 1; 291 return 1;
289 } 292 }
290 if (ea != (unsigned long) area->addr) { 293 if (ea != (unsigned long) area->addr) {
291 printk(KERN_ERR "unexpected addr return from " 294 printk(KERN_ERR "unexpected addr return from "
292 "im_get_area\n"); 295 "im_get_area\n");
293 return 1; 296 return 1;
294 } 297 }
295 } 298 }
296 299
297 ret = __ioremap_com(pa, pa, ea, size, flags); 300 ret = __ioremap_com(pa, pa, ea, size, flags);
298 if (ret == NULL) { 301 if (ret == NULL) {
299 printk(KERN_ERR "ioremap_explicit() allocation failure !\n"); 302 printk(KERN_ERR "ioremap_explicit() allocation failure !\n");
300 return 1; 303 return 1;
301 } 304 }
302 if (ret != (void *) ea) { 305 if (ret != (void *) ea) {
303 printk(KERN_ERR "__ioremap_com() returned unexpected addr\n"); 306 printk(KERN_ERR "__ioremap_com() returned unexpected addr\n");
304 return 1; 307 return 1;
305 } 308 }
306 309
307 return 0; 310 return 0;
308 } 311 }
309 312
310 /* 313 /*
311 * Unmap an IO region and remove it from imalloc'd list. 314 * Unmap an IO region and remove it from imalloc'd list.
312 * Access to IO memory should be serialized by driver. 315 * Access to IO memory should be serialized by driver.
313 * This code is modeled after vmalloc code - unmap_vm_area() 316 * This code is modeled after vmalloc code - unmap_vm_area()
314 * 317 *
315 * XXX what about calls before mem_init_done (ie python_countermeasures()) 318 * XXX what about calls before mem_init_done (ie python_countermeasures())
316 */ 319 */
317 void iounmap(volatile void __iomem *token) 320 void iounmap(volatile void __iomem *token)
318 { 321 {
319 void *addr; 322 void *addr;
320 323
321 if (!mem_init_done) 324 if (!mem_init_done)
322 return; 325 return;
323 326
324 addr = (void *) ((unsigned long __force) token & PAGE_MASK); 327 addr = (void *) ((unsigned long __force) token & PAGE_MASK);
325 328
326 im_free(addr); 329 im_free(addr);
327 } 330 }
328 331
329 static int iounmap_subset_regions(unsigned long addr, unsigned long size) 332 static int iounmap_subset_regions(unsigned long addr, unsigned long size)
330 { 333 {
331 struct vm_struct *area; 334 struct vm_struct *area;
332 335
333 /* Check whether subsets of this region exist */ 336 /* Check whether subsets of this region exist */
334 area = im_get_area(addr, size, IM_REGION_SUPERSET); 337 area = im_get_area(addr, size, IM_REGION_SUPERSET);
335 if (area == NULL) 338 if (area == NULL)
336 return 1; 339 return 1;
337 340
338 while (area) { 341 while (area) {
339 iounmap((void __iomem *) area->addr); 342 iounmap((void __iomem *) area->addr);
340 area = im_get_area(addr, size, 343 area = im_get_area(addr, size,
341 IM_REGION_SUPERSET); 344 IM_REGION_SUPERSET);
342 } 345 }
343 346
344 return 0; 347 return 0;
345 } 348 }
346 349
347 int iounmap_explicit(volatile void __iomem *start, unsigned long size) 350 int iounmap_explicit(volatile void __iomem *start, unsigned long size)
348 { 351 {
349 struct vm_struct *area; 352 struct vm_struct *area;
350 unsigned long addr; 353 unsigned long addr;
351 int rc; 354 int rc;
352 355
353 addr = (unsigned long __force) start & PAGE_MASK; 356 addr = (unsigned long __force) start & PAGE_MASK;
354 357
355 /* Verify that the region either exists or is a subset of an existing 358 /* Verify that the region either exists or is a subset of an existing
356 * region. In the latter case, split the parent region to create 359 * region. In the latter case, split the parent region to create
357 * the exact region 360 * the exact region
358 */ 361 */
359 area = im_get_area(addr, size, 362 area = im_get_area(addr, size,
360 IM_REGION_EXISTS | IM_REGION_SUBSET); 363 IM_REGION_EXISTS | IM_REGION_SUBSET);
361 if (area == NULL) { 364 if (area == NULL) {
362 /* Determine whether subset regions exist. If so, unmap */ 365 /* Determine whether subset regions exist. If so, unmap */
363 rc = iounmap_subset_regions(addr, size); 366 rc = iounmap_subset_regions(addr, size);
364 if (rc) { 367 if (rc) {
365 printk(KERN_ERR 368 printk(KERN_ERR
366 "%s() cannot unmap nonexistent range 0x%lx\n", 369 "%s() cannot unmap nonexistent range 0x%lx\n",
367 __FUNCTION__, addr); 370 __FUNCTION__, addr);
368 return 1; 371 return 1;
369 } 372 }
370 } else { 373 } else {
371 iounmap((void __iomem *) area->addr); 374 iounmap((void __iomem *) area->addr);
372 } 375 }
373 /* 376 /*
374 * FIXME! This can't be right: 377 * FIXME! This can't be right:
375 iounmap(area->addr); 378 iounmap(area->addr);
376 * Maybe it should be "iounmap(area);" 379 * Maybe it should be "iounmap(area);"
377 */ 380 */
378 return 0; 381 return 0;
379 } 382 }
380 383
381 #endif 384 #endif
382 385
383 EXPORT_SYMBOL(ioremap); 386 EXPORT_SYMBOL(ioremap);
384 EXPORT_SYMBOL(__ioremap); 387 EXPORT_SYMBOL(__ioremap);
385 EXPORT_SYMBOL(iounmap); 388 EXPORT_SYMBOL(iounmap);
386 389
387 void free_initmem(void) 390 void free_initmem(void)
388 { 391 {
389 unsigned long addr; 392 unsigned long addr;
390 393
391 addr = (unsigned long)__init_begin; 394 addr = (unsigned long)__init_begin;
392 for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) { 395 for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) {
393 memset((void *)addr, 0xcc, PAGE_SIZE); 396 memset((void *)addr, 0xcc, PAGE_SIZE);
394 ClearPageReserved(virt_to_page(addr)); 397 ClearPageReserved(virt_to_page(addr));
395 set_page_count(virt_to_page(addr), 1); 398 set_page_count(virt_to_page(addr), 1);
396 free_page(addr); 399 free_page(addr);
397 totalram_pages++; 400 totalram_pages++;
398 } 401 }
399 printk ("Freeing unused kernel memory: %luk freed\n", 402 printk ("Freeing unused kernel memory: %luk freed\n",
400 ((unsigned long)__init_end - (unsigned long)__init_begin) >> 10); 403 ((unsigned long)__init_end - (unsigned long)__init_begin) >> 10);
401 } 404 }
402 405
403 #ifdef CONFIG_BLK_DEV_INITRD 406 #ifdef CONFIG_BLK_DEV_INITRD
404 void free_initrd_mem(unsigned long start, unsigned long end) 407 void free_initrd_mem(unsigned long start, unsigned long end)
405 { 408 {
406 if (start < end) 409 if (start < end)
407 printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); 410 printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
408 for (; start < end; start += PAGE_SIZE) { 411 for (; start < end; start += PAGE_SIZE) {
409 ClearPageReserved(virt_to_page(start)); 412 ClearPageReserved(virt_to_page(start));
410 set_page_count(virt_to_page(start), 1); 413 set_page_count(virt_to_page(start), 1);
411 free_page(start); 414 free_page(start);
412 totalram_pages++; 415 totalram_pages++;
413 } 416 }
414 } 417 }
415 #endif 418 #endif
416 419
417 static DEFINE_SPINLOCK(mmu_context_lock); 420 static DEFINE_SPINLOCK(mmu_context_lock);
418 static DEFINE_IDR(mmu_context_idr); 421 static DEFINE_IDR(mmu_context_idr);
419 422
420 int init_new_context(struct task_struct *tsk, struct mm_struct *mm) 423 int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
421 { 424 {
422 int index; 425 int index;
423 int err; 426 int err;
424 427
425 again: 428 again:
426 if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL)) 429 if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL))
427 return -ENOMEM; 430 return -ENOMEM;
428 431
429 spin_lock(&mmu_context_lock); 432 spin_lock(&mmu_context_lock);
430 err = idr_get_new_above(&mmu_context_idr, NULL, 1, &index); 433 err = idr_get_new_above(&mmu_context_idr, NULL, 1, &index);
431 spin_unlock(&mmu_context_lock); 434 spin_unlock(&mmu_context_lock);
432 435
433 if (err == -EAGAIN) 436 if (err == -EAGAIN)
434 goto again; 437 goto again;
435 else if (err) 438 else if (err)
436 return err; 439 return err;
437 440
438 if (index > MAX_CONTEXT) { 441 if (index > MAX_CONTEXT) {
439 idr_remove(&mmu_context_idr, index); 442 idr_remove(&mmu_context_idr, index);
440 return -ENOMEM; 443 return -ENOMEM;
441 } 444 }
442 445
443 mm->context.id = index; 446 mm->context.id = index;
444 447
445 return 0; 448 return 0;
446 } 449 }
447 450
448 void destroy_context(struct mm_struct *mm) 451 void destroy_context(struct mm_struct *mm)
449 { 452 {
450 spin_lock(&mmu_context_lock); 453 spin_lock(&mmu_context_lock);
451 idr_remove(&mmu_context_idr, mm->context.id); 454 idr_remove(&mmu_context_idr, mm->context.id);
452 spin_unlock(&mmu_context_lock); 455 spin_unlock(&mmu_context_lock);
453 456
454 mm->context.id = NO_CONTEXT; 457 mm->context.id = NO_CONTEXT;
455 } 458 }
456 459
457 /* 460 /*
458 * Do very early mm setup. 461 * Do very early mm setup.
459 */ 462 */
460 void __init mm_init_ppc64(void) 463 void __init mm_init_ppc64(void)
461 { 464 {
462 #ifndef CONFIG_PPC_ISERIES 465 #ifndef CONFIG_PPC_ISERIES
463 unsigned long i; 466 unsigned long i;
464 #endif 467 #endif
465 468
466 ppc64_boot_msg(0x100, "MM Init"); 469 ppc64_boot_msg(0x100, "MM Init");
467 470
468 /* This is the story of the IO hole... please, keep seated, 471 /* This is the story of the IO hole... please, keep seated,
469 * unfortunately, we are out of oxygen masks at the moment. 472 * unfortunately, we are out of oxygen masks at the moment.
470 * So we need some rough way to tell where your big IO hole 473 * So we need some rough way to tell where your big IO hole
471 * is. On pmac, it's between 2G and 4G, on POWER3, it's around 474 * is. On pmac, it's between 2G and 4G, on POWER3, it's around
472 * that area as well, on POWER4 we don't have one, etc... 475 * that area as well, on POWER4 we don't have one, etc...
473 * We need that as a "hint" when sizing the TCE table on POWER3 476 * We need that as a "hint" when sizing the TCE table on POWER3
474 * So far, the simplest way that seem work well enough for us it 477 * So far, the simplest way that seem work well enough for us it
475 * to just assume that the first discontinuity in our physical 478 * to just assume that the first discontinuity in our physical
476 * RAM layout is the IO hole. That may not be correct in the future 479 * RAM layout is the IO hole. That may not be correct in the future
477 * (and isn't on iSeries but then we don't care ;) 480 * (and isn't on iSeries but then we don't care ;)
478 */ 481 */
479 482
480 #ifndef CONFIG_PPC_ISERIES 483 #ifndef CONFIG_PPC_ISERIES
481 for (i = 1; i < lmb.memory.cnt; i++) { 484 for (i = 1; i < lmb.memory.cnt; i++) {
482 unsigned long base, prevbase, prevsize; 485 unsigned long base, prevbase, prevsize;
483 486
484 prevbase = lmb.memory.region[i-1].base; 487 prevbase = lmb.memory.region[i-1].base;
485 prevsize = lmb.memory.region[i-1].size; 488 prevsize = lmb.memory.region[i-1].size;
486 base = lmb.memory.region[i].base; 489 base = lmb.memory.region[i].base;
487 if (base > (prevbase + prevsize)) { 490 if (base > (prevbase + prevsize)) {
488 io_hole_start = prevbase + prevsize; 491 io_hole_start = prevbase + prevsize;
489 io_hole_size = base - (prevbase + prevsize); 492 io_hole_size = base - (prevbase + prevsize);
490 break; 493 break;
491 } 494 }
492 } 495 }
493 #endif /* CONFIG_PPC_ISERIES */ 496 #endif /* CONFIG_PPC_ISERIES */
494 if (io_hole_start) 497 if (io_hole_start)
495 printk("IO Hole assumed to be %lx -> %lx\n", 498 printk("IO Hole assumed to be %lx -> %lx\n",
496 io_hole_start, io_hole_start + io_hole_size - 1); 499 io_hole_start, io_hole_start + io_hole_size - 1);
497 500
498 ppc64_boot_msg(0x100, "MM Init Done"); 501 ppc64_boot_msg(0x100, "MM Init Done");
499 } 502 }
500 503
501 /* 504 /*
502 * This is called by /dev/mem to know if a given address has to 505 * This is called by /dev/mem to know if a given address has to
503 * be mapped non-cacheable or not 506 * be mapped non-cacheable or not
504 */ 507 */
505 int page_is_ram(unsigned long pfn) 508 int page_is_ram(unsigned long pfn)
506 { 509 {
507 int i; 510 int i;
508 unsigned long paddr = (pfn << PAGE_SHIFT); 511 unsigned long paddr = (pfn << PAGE_SHIFT);
509 512
510 for (i=0; i < lmb.memory.cnt; i++) { 513 for (i=0; i < lmb.memory.cnt; i++) {
511 unsigned long base; 514 unsigned long base;
512 515
513 base = lmb.memory.region[i].base; 516 base = lmb.memory.region[i].base;
514 517
515 if ((paddr >= base) && 518 if ((paddr >= base) &&
516 (paddr < (base + lmb.memory.region[i].size))) { 519 (paddr < (base + lmb.memory.region[i].size))) {
517 return 1; 520 return 1;
518 } 521 }
519 } 522 }
520 523
521 return 0; 524 return 0;
522 } 525 }
523 EXPORT_SYMBOL(page_is_ram); 526 EXPORT_SYMBOL(page_is_ram);
524 527
525 /* 528 /*
526 * Initialize the bootmem system and give it all the memory we 529 * Initialize the bootmem system and give it all the memory we
527 * have available. 530 * have available.
528 */ 531 */
529 #ifndef CONFIG_NEED_MULTIPLE_NODES 532 #ifndef CONFIG_NEED_MULTIPLE_NODES
530 void __init do_init_bootmem(void) 533 void __init do_init_bootmem(void)
531 { 534 {
532 unsigned long i; 535 unsigned long i;
533 unsigned long start, bootmap_pages; 536 unsigned long start, bootmap_pages;
534 unsigned long total_pages = lmb_end_of_DRAM() >> PAGE_SHIFT; 537 unsigned long total_pages = lmb_end_of_DRAM() >> PAGE_SHIFT;
535 int boot_mapsize; 538 int boot_mapsize;
536 539
537 /* 540 /*
538 * Find an area to use for the bootmem bitmap. Calculate the size of 541 * Find an area to use for the bootmem bitmap. Calculate the size of
539 * bitmap required as (Total Memory) / PAGE_SIZE / BITS_PER_BYTE. 542 * bitmap required as (Total Memory) / PAGE_SIZE / BITS_PER_BYTE.
540 * Add 1 additional page in case the address isn't page-aligned. 543 * Add 1 additional page in case the address isn't page-aligned.
541 */ 544 */
542 bootmap_pages = bootmem_bootmap_pages(total_pages); 545 bootmap_pages = bootmem_bootmap_pages(total_pages);
543 546
544 start = lmb_alloc(bootmap_pages<<PAGE_SHIFT, PAGE_SIZE); 547 start = lmb_alloc(bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
545 BUG_ON(!start); 548 BUG_ON(!start);
546 549
547 boot_mapsize = init_bootmem(start >> PAGE_SHIFT, total_pages); 550 boot_mapsize = init_bootmem(start >> PAGE_SHIFT, total_pages);
548 551
549 max_pfn = max_low_pfn; 552 max_pfn = max_low_pfn;
550 553
551 /* Add all physical memory to the bootmem map, mark each area 554 /* Add all physical memory to the bootmem map, mark each area
552 * present. 555 * present.
553 */ 556 */
554 for (i=0; i < lmb.memory.cnt; i++) 557 for (i=0; i < lmb.memory.cnt; i++)
555 free_bootmem(lmb.memory.region[i].base, 558 free_bootmem(lmb.memory.region[i].base,
556 lmb_size_bytes(&lmb.memory, i)); 559 lmb_size_bytes(&lmb.memory, i));
557 560
558 /* reserve the sections we're already using */ 561 /* reserve the sections we're already using */
559 for (i=0; i < lmb.reserved.cnt; i++) 562 for (i=0; i < lmb.reserved.cnt; i++)
560 reserve_bootmem(lmb.reserved.region[i].base, 563 reserve_bootmem(lmb.reserved.region[i].base,
561 lmb_size_bytes(&lmb.reserved, i)); 564 lmb_size_bytes(&lmb.reserved, i));
562 565
563 for (i=0; i < lmb.memory.cnt; i++) 566 for (i=0; i < lmb.memory.cnt; i++)
564 memory_present(0, lmb_start_pfn(&lmb.memory, i), 567 memory_present(0, lmb_start_pfn(&lmb.memory, i),
565 lmb_end_pfn(&lmb.memory, i)); 568 lmb_end_pfn(&lmb.memory, i));
566 } 569 }
567 570
568 /* 571 /*
569 * paging_init() sets up the page tables - in fact we've already done this. 572 * paging_init() sets up the page tables - in fact we've already done this.
570 */ 573 */
571 void __init paging_init(void) 574 void __init paging_init(void)
572 { 575 {
573 unsigned long zones_size[MAX_NR_ZONES]; 576 unsigned long zones_size[MAX_NR_ZONES];
574 unsigned long zholes_size[MAX_NR_ZONES]; 577 unsigned long zholes_size[MAX_NR_ZONES];
575 unsigned long total_ram = lmb_phys_mem_size(); 578 unsigned long total_ram = lmb_phys_mem_size();
576 unsigned long top_of_ram = lmb_end_of_DRAM(); 579 unsigned long top_of_ram = lmb_end_of_DRAM();
577 580
578 printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n", 581 printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
579 top_of_ram, total_ram); 582 top_of_ram, total_ram);
580 printk(KERN_INFO "Memory hole size: %ldMB\n", 583 printk(KERN_INFO "Memory hole size: %ldMB\n",
581 (top_of_ram - total_ram) >> 20); 584 (top_of_ram - total_ram) >> 20);
582 /* 585 /*
583 * All pages are DMA-able so we put them all in the DMA zone. 586 * All pages are DMA-able so we put them all in the DMA zone.
584 */ 587 */
585 memset(zones_size, 0, sizeof(zones_size)); 588 memset(zones_size, 0, sizeof(zones_size));
586 memset(zholes_size, 0, sizeof(zholes_size)); 589 memset(zholes_size, 0, sizeof(zholes_size));
587 590
588 zones_size[ZONE_DMA] = top_of_ram >> PAGE_SHIFT; 591 zones_size[ZONE_DMA] = top_of_ram >> PAGE_SHIFT;
589 zholes_size[ZONE_DMA] = (top_of_ram - total_ram) >> PAGE_SHIFT; 592 zholes_size[ZONE_DMA] = (top_of_ram - total_ram) >> PAGE_SHIFT;
590 593
591 free_area_init_node(0, NODE_DATA(0), zones_size, 594 free_area_init_node(0, NODE_DATA(0), zones_size,
592 __pa(PAGE_OFFSET) >> PAGE_SHIFT, zholes_size); 595 __pa(PAGE_OFFSET) >> PAGE_SHIFT, zholes_size);
593 } 596 }
594 #endif /* ! CONFIG_NEED_MULTIPLE_NODES */ 597 #endif /* ! CONFIG_NEED_MULTIPLE_NODES */
595 598
596 static struct kcore_list kcore_vmem; 599 static struct kcore_list kcore_vmem;
597 600
598 static int __init setup_kcore(void) 601 static int __init setup_kcore(void)
599 { 602 {
600 int i; 603 int i;
601 604
602 for (i=0; i < lmb.memory.cnt; i++) { 605 for (i=0; i < lmb.memory.cnt; i++) {
603 unsigned long base, size; 606 unsigned long base, size;
604 struct kcore_list *kcore_mem; 607 struct kcore_list *kcore_mem;
605 608
606 base = lmb.memory.region[i].base; 609 base = lmb.memory.region[i].base;
607 size = lmb.memory.region[i].size; 610 size = lmb.memory.region[i].size;
608 611
609 /* GFP_ATOMIC to avoid might_sleep warnings during boot */ 612 /* GFP_ATOMIC to avoid might_sleep warnings during boot */
610 kcore_mem = kmalloc(sizeof(struct kcore_list), GFP_ATOMIC); 613 kcore_mem = kmalloc(sizeof(struct kcore_list), GFP_ATOMIC);
611 if (!kcore_mem) 614 if (!kcore_mem)
612 panic("mem_init: kmalloc failed\n"); 615 panic("mem_init: kmalloc failed\n");
613 616
614 kclist_add(kcore_mem, __va(base), size); 617 kclist_add(kcore_mem, __va(base), size);
615 } 618 }
616 619
617 kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START); 620 kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START);
618 621
619 return 0; 622 return 0;
620 } 623 }
621 module_init(setup_kcore); 624 module_init(setup_kcore);
622 625
623 void __init mem_init(void) 626 void __init mem_init(void)
624 { 627 {
625 #ifdef CONFIG_NEED_MULTIPLE_NODES 628 #ifdef CONFIG_NEED_MULTIPLE_NODES
626 int nid; 629 int nid;
627 #endif 630 #endif
628 pg_data_t *pgdat; 631 pg_data_t *pgdat;
629 unsigned long i; 632 unsigned long i;
630 struct page *page; 633 struct page *page;
631 unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize; 634 unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize;
632 635
633 num_physpages = max_low_pfn; /* RAM is assumed contiguous */ 636 num_physpages = max_low_pfn; /* RAM is assumed contiguous */
634 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); 637 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
635 638
636 #ifdef CONFIG_NEED_MULTIPLE_NODES 639 #ifdef CONFIG_NEED_MULTIPLE_NODES
637 for_each_online_node(nid) { 640 for_each_online_node(nid) {
638 if (NODE_DATA(nid)->node_spanned_pages != 0) { 641 if (NODE_DATA(nid)->node_spanned_pages != 0) {
639 printk("freeing bootmem node %x\n", nid); 642 printk("freeing bootmem node %x\n", nid);
640 totalram_pages += 643 totalram_pages +=
641 free_all_bootmem_node(NODE_DATA(nid)); 644 free_all_bootmem_node(NODE_DATA(nid));
642 } 645 }
643 } 646 }
644 #else 647 #else
645 max_mapnr = num_physpages; 648 max_mapnr = num_physpages;
646 totalram_pages += free_all_bootmem(); 649 totalram_pages += free_all_bootmem();
647 #endif 650 #endif
648 651
649 for_each_pgdat(pgdat) { 652 for_each_pgdat(pgdat) {
653 unsigned long flags;
654 pgdat_resize_lock(pgdat, &flags);
650 for (i = 0; i < pgdat->node_spanned_pages; i++) { 655 for (i = 0; i < pgdat->node_spanned_pages; i++) {
651 page = pgdat_page_nr(pgdat, i); 656 page = pgdat_page_nr(pgdat, i);
652 if (PageReserved(page)) 657 if (PageReserved(page))
653 reservedpages++; 658 reservedpages++;
654 } 659 }
660 pgdat_resize_unlock(pgdat, &flags);
655 } 661 }
656 662
657 codesize = (unsigned long)&_etext - (unsigned long)&_stext; 663 codesize = (unsigned long)&_etext - (unsigned long)&_stext;
658 initsize = (unsigned long)&__init_end - (unsigned long)&__init_begin; 664 initsize = (unsigned long)&__init_end - (unsigned long)&__init_begin;
659 datasize = (unsigned long)&_edata - (unsigned long)&__init_end; 665 datasize = (unsigned long)&_edata - (unsigned long)&__init_end;
660 bsssize = (unsigned long)&__bss_stop - (unsigned long)&__bss_start; 666 bsssize = (unsigned long)&__bss_stop - (unsigned long)&__bss_start;
661 667
662 printk(KERN_INFO "Memory: %luk/%luk available (%luk kernel code, " 668 printk(KERN_INFO "Memory: %luk/%luk available (%luk kernel code, "
663 "%luk reserved, %luk data, %luk bss, %luk init)\n", 669 "%luk reserved, %luk data, %luk bss, %luk init)\n",
664 (unsigned long)nr_free_pages() << (PAGE_SHIFT-10), 670 (unsigned long)nr_free_pages() << (PAGE_SHIFT-10),
665 num_physpages << (PAGE_SHIFT-10), 671 num_physpages << (PAGE_SHIFT-10),
666 codesize >> 10, 672 codesize >> 10,
667 reservedpages << (PAGE_SHIFT-10), 673 reservedpages << (PAGE_SHIFT-10),
668 datasize >> 10, 674 datasize >> 10,
669 bsssize >> 10, 675 bsssize >> 10,
670 initsize >> 10); 676 initsize >> 10);
671 677
672 mem_init_done = 1; 678 mem_init_done = 1;
673 679
674 /* Initialize the vDSO */ 680 /* Initialize the vDSO */
675 vdso_init(); 681 vdso_init();
676 } 682 }
677 683
678 /* 684 /*
679 * This is called when a page has been modified by the kernel. 685 * This is called when a page has been modified by the kernel.
680 * It just marks the page as not i-cache clean. We do the i-cache 686 * It just marks the page as not i-cache clean. We do the i-cache
681 * flush later when the page is given to a user process, if necessary. 687 * flush later when the page is given to a user process, if necessary.
682 */ 688 */
683 void flush_dcache_page(struct page *page) 689 void flush_dcache_page(struct page *page)
684 { 690 {
685 if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) 691 if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
686 return; 692 return;
687 /* avoid an atomic op if possible */ 693 /* avoid an atomic op if possible */
688 if (test_bit(PG_arch_1, &page->flags)) 694 if (test_bit(PG_arch_1, &page->flags))
689 clear_bit(PG_arch_1, &page->flags); 695 clear_bit(PG_arch_1, &page->flags);
690 } 696 }
691 EXPORT_SYMBOL(flush_dcache_page); 697 EXPORT_SYMBOL(flush_dcache_page);
692 698
693 void clear_user_page(void *page, unsigned long vaddr, struct page *pg) 699 void clear_user_page(void *page, unsigned long vaddr, struct page *pg)
694 { 700 {
695 clear_page(page); 701 clear_page(page);
696 702
697 if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) 703 if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
698 return; 704 return;
699 /* 705 /*
700 * We shouldnt have to do this, but some versions of glibc 706 * We shouldnt have to do this, but some versions of glibc
701 * require it (ld.so assumes zero filled pages are icache clean) 707 * require it (ld.so assumes zero filled pages are icache clean)
702 * - Anton 708 * - Anton
703 */ 709 */
704 710
705 /* avoid an atomic op if possible */ 711 /* avoid an atomic op if possible */
706 if (test_bit(PG_arch_1, &pg->flags)) 712 if (test_bit(PG_arch_1, &pg->flags))
707 clear_bit(PG_arch_1, &pg->flags); 713 clear_bit(PG_arch_1, &pg->flags);
708 } 714 }
709 EXPORT_SYMBOL(clear_user_page); 715 EXPORT_SYMBOL(clear_user_page);
710 716
711 void copy_user_page(void *vto, void *vfrom, unsigned long vaddr, 717 void copy_user_page(void *vto, void *vfrom, unsigned long vaddr,
712 struct page *pg) 718 struct page *pg)
713 { 719 {
714 copy_page(vto, vfrom); 720 copy_page(vto, vfrom);
715 721
716 /* 722 /*
717 * We should be able to use the following optimisation, however 723 * We should be able to use the following optimisation, however
718 * there are two problems. 724 * there are two problems.
719 * Firstly a bug in some versions of binutils meant PLT sections 725 * Firstly a bug in some versions of binutils meant PLT sections
720 * were not marked executable. 726 * were not marked executable.
721 * Secondly the first word in the GOT section is blrl, used 727 * Secondly the first word in the GOT section is blrl, used
722 * to establish the GOT address. Until recently the GOT was 728 * to establish the GOT address. Until recently the GOT was
723 * not marked executable. 729 * not marked executable.
724 * - Anton 730 * - Anton
725 */ 731 */
726 #if 0 732 #if 0
727 if (!vma->vm_file && ((vma->vm_flags & VM_EXEC) == 0)) 733 if (!vma->vm_file && ((vma->vm_flags & VM_EXEC) == 0))
728 return; 734 return;
729 #endif 735 #endif
730 736
731 if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) 737 if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
732 return; 738 return;
733 739
734 /* avoid an atomic op if possible */ 740 /* avoid an atomic op if possible */
735 if (test_bit(PG_arch_1, &pg->flags)) 741 if (test_bit(PG_arch_1, &pg->flags))
736 clear_bit(PG_arch_1, &pg->flags); 742 clear_bit(PG_arch_1, &pg->flags);
737 } 743 }
738 744
739 void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, 745 void flush_icache_user_range(struct vm_area_struct *vma, struct page *page,
740 unsigned long addr, int len) 746 unsigned long addr, int len)
741 { 747 {
742 unsigned long maddr; 748 unsigned long maddr;
743 749
744 maddr = (unsigned long)page_address(page) + (addr & ~PAGE_MASK); 750 maddr = (unsigned long)page_address(page) + (addr & ~PAGE_MASK);
745 flush_icache_range(maddr, maddr + len); 751 flush_icache_range(maddr, maddr + len);
746 } 752 }
747 EXPORT_SYMBOL(flush_icache_user_range); 753 EXPORT_SYMBOL(flush_icache_user_range);
748 754
749 /* 755 /*
750 * This is called at the end of handling a user page fault, when the 756 * This is called at the end of handling a user page fault, when the
751 * fault has been handled by updating a PTE in the linux page tables. 757 * fault has been handled by updating a PTE in the linux page tables.
752 * We use it to preload an HPTE into the hash table corresponding to 758 * We use it to preload an HPTE into the hash table corresponding to
753 * the updated linux PTE. 759 * the updated linux PTE.
754 * 760 *
755 * This must always be called with the mm->page_table_lock held 761 * This must always be called with the mm->page_table_lock held
756 */ 762 */
757 void update_mmu_cache(struct vm_area_struct *vma, unsigned long ea, 763 void update_mmu_cache(struct vm_area_struct *vma, unsigned long ea,
758 pte_t pte) 764 pte_t pte)
759 { 765 {
760 unsigned long vsid; 766 unsigned long vsid;
761 void *pgdir; 767 void *pgdir;
762 pte_t *ptep; 768 pte_t *ptep;
763 int local = 0; 769 int local = 0;
764 cpumask_t tmp; 770 cpumask_t tmp;
765 unsigned long flags; 771 unsigned long flags;
766 772
767 /* handle i-cache coherency */ 773 /* handle i-cache coherency */
768 if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE) && 774 if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE) &&
769 !cpu_has_feature(CPU_FTR_NOEXECUTE)) { 775 !cpu_has_feature(CPU_FTR_NOEXECUTE)) {
770 unsigned long pfn = pte_pfn(pte); 776 unsigned long pfn = pte_pfn(pte);
771 if (pfn_valid(pfn)) { 777 if (pfn_valid(pfn)) {
772 struct page *page = pfn_to_page(pfn); 778 struct page *page = pfn_to_page(pfn);
773 if (!PageReserved(page) 779 if (!PageReserved(page)
774 && !test_bit(PG_arch_1, &page->flags)) { 780 && !test_bit(PG_arch_1, &page->flags)) {
775 __flush_dcache_icache(page_address(page)); 781 __flush_dcache_icache(page_address(page));
776 set_bit(PG_arch_1, &page->flags); 782 set_bit(PG_arch_1, &page->flags);
777 } 783 }
778 } 784 }
779 } 785 }
780 786
781 /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ 787 /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
782 if (!pte_young(pte)) 788 if (!pte_young(pte))
783 return; 789 return;
784 790
785 pgdir = vma->vm_mm->pgd; 791 pgdir = vma->vm_mm->pgd;
786 if (pgdir == NULL) 792 if (pgdir == NULL)
787 return; 793 return;
788 794
789 ptep = find_linux_pte(pgdir, ea); 795 ptep = find_linux_pte(pgdir, ea);
790 if (!ptep) 796 if (!ptep)
791 return; 797 return;
792 798
793 vsid = get_vsid(vma->vm_mm->context.id, ea); 799 vsid = get_vsid(vma->vm_mm->context.id, ea);
794 800
795 local_irq_save(flags); 801 local_irq_save(flags);
796 tmp = cpumask_of_cpu(smp_processor_id()); 802 tmp = cpumask_of_cpu(smp_processor_id());
797 if (cpus_equal(vma->vm_mm->cpu_vm_mask, tmp)) 803 if (cpus_equal(vma->vm_mm->cpu_vm_mask, tmp))
798 local = 1; 804 local = 1;
799 805
800 __hash_page(ea, 0, vsid, ptep, 0x300, local); 806 __hash_page(ea, 0, vsid, ptep, 0x300, local);
801 local_irq_restore(flags); 807 local_irq_restore(flags);
802 } 808 }
803 809
804 void __iomem * reserve_phb_iospace(unsigned long size) 810 void __iomem * reserve_phb_iospace(unsigned long size)
805 { 811 {
806 void __iomem *virt_addr; 812 void __iomem *virt_addr;
807 813
808 if (phbs_io_bot >= IMALLOC_BASE) 814 if (phbs_io_bot >= IMALLOC_BASE)
809 panic("reserve_phb_iospace(): phb io space overflow\n"); 815 panic("reserve_phb_iospace(): phb io space overflow\n");
810 816
811 virt_addr = (void __iomem *) phbs_io_bot; 817 virt_addr = (void __iomem *) phbs_io_bot;
812 phbs_io_bot += size; 818 phbs_io_bot += size;
813 819
814 return virt_addr; 820 return virt_addr;
815 } 821 }
816 822
817 static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags) 823 static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags)
818 { 824 {
819 memset(addr, 0, kmem_cache_size(cache)); 825 memset(addr, 0, kmem_cache_size(cache));
820 } 826 }
821 827
822 static const int pgtable_cache_size[2] = { 828 static const int pgtable_cache_size[2] = {
823 PTE_TABLE_SIZE, PMD_TABLE_SIZE 829 PTE_TABLE_SIZE, PMD_TABLE_SIZE
824 }; 830 };
825 static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = { 831 static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
826 "pgd_pte_cache", "pud_pmd_cache", 832 "pgd_pte_cache", "pud_pmd_cache",
827 }; 833 };
828 834
829 kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)]; 835 kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
830 836
831 void pgtable_cache_init(void) 837 void pgtable_cache_init(void)
832 { 838 {
833 int i; 839 int i;
834 840
835 BUILD_BUG_ON(PTE_TABLE_SIZE != pgtable_cache_size[PTE_CACHE_NUM]); 841 BUILD_BUG_ON(PTE_TABLE_SIZE != pgtable_cache_size[PTE_CACHE_NUM]);
836 BUILD_BUG_ON(PMD_TABLE_SIZE != pgtable_cache_size[PMD_CACHE_NUM]); 842 BUILD_BUG_ON(PMD_TABLE_SIZE != pgtable_cache_size[PMD_CACHE_NUM]);
837 BUILD_BUG_ON(PUD_TABLE_SIZE != pgtable_cache_size[PUD_CACHE_NUM]); 843 BUILD_BUG_ON(PUD_TABLE_SIZE != pgtable_cache_size[PUD_CACHE_NUM]);
838 BUILD_BUG_ON(PGD_TABLE_SIZE != pgtable_cache_size[PGD_CACHE_NUM]); 844 BUILD_BUG_ON(PGD_TABLE_SIZE != pgtable_cache_size[PGD_CACHE_NUM]);
839 845
840 for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) { 846 for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) {
841 int size = pgtable_cache_size[i]; 847 int size = pgtable_cache_size[i];
842 const char *name = pgtable_cache_name[i]; 848 const char *name = pgtable_cache_name[i];
843 849
844 pgtable_cache[i] = kmem_cache_create(name, 850 pgtable_cache[i] = kmem_cache_create(name,
845 size, size, 851 size, size,
846 SLAB_HWCACHE_ALIGN 852 SLAB_HWCACHE_ALIGN
847 | SLAB_MUST_HWCACHE_ALIGN, 853 | SLAB_MUST_HWCACHE_ALIGN,
848 zero_ctor, 854 zero_ctor,
849 NULL); 855 NULL);
850 if (! pgtable_cache[i]) 856 if (! pgtable_cache[i])
851 panic("pgtable_cache_init(): could not create %s!\n", 857 panic("pgtable_cache_init(): could not create %s!\n",
852 name); 858 name);
853 } 859 }
854 } 860 }
855 861
856 pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr, 862 pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,
857 unsigned long size, pgprot_t vma_prot) 863 unsigned long size, pgprot_t vma_prot)
858 { 864 {
859 if (ppc_md.phys_mem_access_prot) 865 if (ppc_md.phys_mem_access_prot)
860 return ppc_md.phys_mem_access_prot(file, addr, size, vma_prot); 866 return ppc_md.phys_mem_access_prot(file, addr, size, vma_prot);
861 867
862 if (!page_is_ram(addr >> PAGE_SHIFT)) 868 if (!page_is_ram(addr >> PAGE_SHIFT))
863 vma_prot = __pgprot(pgprot_val(vma_prot) 869 vma_prot = __pgprot(pgprot_val(vma_prot)
864 | _PAGE_GUARDED | _PAGE_NO_CACHE); 870 | _PAGE_GUARDED | _PAGE_NO_CACHE);
865 return vma_prot; 871 return vma_prot;
866 } 872 }
867 EXPORT_SYMBOL(phys_mem_access_prot); 873 EXPORT_SYMBOL(phys_mem_access_prot);
868 874
include/linux/memory_hotplug.h
File was created 1 #ifndef __LINUX_MEMORY_HOTPLUG_H
2 #define __LINUX_MEMORY_HOTPLUG_H
3
4 #include <linux/mmzone.h>
5 #include <linux/spinlock.h>
6
7 #ifdef CONFIG_MEMORY_HOTPLUG
8 /*
9 * pgdat resizing functions
10 */
11 static inline
12 void pgdat_resize_lock(struct pglist_data *pgdat, unsigned long *flags)
13 {
14 spin_lock_irqsave(&pgdat->node_size_lock, *flags);
15 }
16 static inline
17 void pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags)
18 {
19 spin_unlock_irqrestore(&pgdat->node_size_lock, *flags);
20 }
21 static inline
22 void pgdat_resize_init(struct pglist_data *pgdat)
23 {
24 spin_lock_init(&pgdat->node_size_lock);
25 }
26 #else /* ! CONFIG_MEMORY_HOTPLUG */
27 /*
28 * Stub functions for when hotplug is off
29 */
30 static inline void pgdat_resize_lock(struct pglist_data *p, unsigned long *f) {}
31 static inline void pgdat_resize_unlock(struct pglist_data *p, unsigned long *f) {}
32 static inline void pgdat_resize_init(struct pglist_data *pgdat) {}
33 #endif
34 #endif /* __LINUX_MEMORY_HOTPLUG_H */
35
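For context only: a minimal, hedged sketch (not part of this commit) of how a caller would be expected to use the pgdat resizing helpers above around a pfn_valid()-based walk, mirroring the mem_init() hunk earlier in this diff. The function name and the assumption that the caller already has a pgdat pointer are illustrative.

/* Sketch: count reserved pages in one node while its size cannot change.
 * Assumes <linux/mm.h>, <linux/mmzone.h> and <linux/memory_hotplug.h>. */
static unsigned long count_node_reserved(struct pglist_data *pgdat)
{
	unsigned long i, flags, reserved = 0;

	pgdat_resize_lock(pgdat, &flags);	/* node_*_pages and pfn_valid() stay stable */
	for (i = 0; i < pgdat->node_spanned_pages; i++) {
		unsigned long pfn = pgdat->node_start_pfn + i;

		if (pfn_valid(pfn) && PageReserved(pfn_to_page(pfn)))
			reserved++;
	}
	pgdat_resize_unlock(pgdat, &flags);

	return reserved;
}

When CONFIG_MEMORY_HOTPLUG is off, the stub versions above reduce this to the plain loop, so the pattern costs nothing on non-hotplug configurations.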
include/linux/mmzone.h
1 #ifndef _LINUX_MMZONE_H 1 #ifndef _LINUX_MMZONE_H
2 #define _LINUX_MMZONE_H 2 #define _LINUX_MMZONE_H
3 3
4 #ifdef __KERNEL__ 4 #ifdef __KERNEL__
5 #ifndef __ASSEMBLY__ 5 #ifndef __ASSEMBLY__
6 6
7 #include <linux/config.h> 7 #include <linux/config.h>
8 #include <linux/spinlock.h> 8 #include <linux/spinlock.h>
9 #include <linux/list.h> 9 #include <linux/list.h>
10 #include <linux/wait.h> 10 #include <linux/wait.h>
11 #include <linux/cache.h> 11 #include <linux/cache.h>
12 #include <linux/threads.h> 12 #include <linux/threads.h>
13 #include <linux/numa.h> 13 #include <linux/numa.h>
14 #include <linux/init.h> 14 #include <linux/init.h>
15 #include <asm/atomic.h> 15 #include <asm/atomic.h>
16 16
17 /* Free memory management - zoned buddy allocator. */ 17 /* Free memory management - zoned buddy allocator. */
18 #ifndef CONFIG_FORCE_MAX_ZONEORDER 18 #ifndef CONFIG_FORCE_MAX_ZONEORDER
19 #define MAX_ORDER 11 19 #define MAX_ORDER 11
20 #else 20 #else
21 #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER 21 #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
22 #endif 22 #endif
23 23
24 struct free_area { 24 struct free_area {
25 struct list_head free_list; 25 struct list_head free_list;
26 unsigned long nr_free; 26 unsigned long nr_free;
27 }; 27 };
28 28
29 struct pglist_data; 29 struct pglist_data;
30 30
31 /* 31 /*
32 * zone->lock and zone->lru_lock are two of the hottest locks in the kernel. 32 * zone->lock and zone->lru_lock are two of the hottest locks in the kernel.
33 * So add a wild amount of padding here to ensure that they fall into separate 33 * So add a wild amount of padding here to ensure that they fall into separate
34 * cachelines. There are very few zone structures in the machine, so space 34 * cachelines. There are very few zone structures in the machine, so space
35 * consumption is not a concern here. 35 * consumption is not a concern here.
36 */ 36 */
37 #if defined(CONFIG_SMP) 37 #if defined(CONFIG_SMP)
38 struct zone_padding { 38 struct zone_padding {
39 char x[0]; 39 char x[0];
40 } ____cacheline_maxaligned_in_smp; 40 } ____cacheline_maxaligned_in_smp;
41 #define ZONE_PADDING(name) struct zone_padding name; 41 #define ZONE_PADDING(name) struct zone_padding name;
42 #else 42 #else
43 #define ZONE_PADDING(name) 43 #define ZONE_PADDING(name)
44 #endif 44 #endif
45 45
46 struct per_cpu_pages { 46 struct per_cpu_pages {
47 int count; /* number of pages in the list */ 47 int count; /* number of pages in the list */
48 int low; /* low watermark, refill needed */ 48 int low; /* low watermark, refill needed */
49 int high; /* high watermark, emptying needed */ 49 int high; /* high watermark, emptying needed */
50 int batch; /* chunk size for buddy add/remove */ 50 int batch; /* chunk size for buddy add/remove */
51 struct list_head list; /* the list of pages */ 51 struct list_head list; /* the list of pages */
52 }; 52 };
53 53
54 struct per_cpu_pageset { 54 struct per_cpu_pageset {
55 struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */ 55 struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */
56 #ifdef CONFIG_NUMA 56 #ifdef CONFIG_NUMA
57 unsigned long numa_hit; /* allocated in intended node */ 57 unsigned long numa_hit; /* allocated in intended node */
58 unsigned long numa_miss; /* allocated in non intended node */ 58 unsigned long numa_miss; /* allocated in non intended node */
59 unsigned long numa_foreign; /* was intended here, hit elsewhere */ 59 unsigned long numa_foreign; /* was intended here, hit elsewhere */
60 unsigned long interleave_hit; /* interleaver prefered this zone */ 60 unsigned long interleave_hit; /* interleaver prefered this zone */
61 unsigned long local_node; /* allocation from local node */ 61 unsigned long local_node; /* allocation from local node */
62 unsigned long other_node; /* allocation from other node */ 62 unsigned long other_node; /* allocation from other node */
63 #endif 63 #endif
64 } ____cacheline_aligned_in_smp; 64 } ____cacheline_aligned_in_smp;
65 65
66 #ifdef CONFIG_NUMA 66 #ifdef CONFIG_NUMA
67 #define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)]) 67 #define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)])
68 #else 68 #else
69 #define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)]) 69 #define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)])
70 #endif 70 #endif
71 71
72 #define ZONE_DMA 0 72 #define ZONE_DMA 0
73 #define ZONE_NORMAL 1 73 #define ZONE_NORMAL 1
74 #define ZONE_HIGHMEM 2 74 #define ZONE_HIGHMEM 2
75 75
76 #define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT */ 76 #define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT */
77 #define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */ 77 #define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */
78 78
79 79
80 /* 80 /*
81 * When a memory allocation must conform to specific limitations (such 81 * When a memory allocation must conform to specific limitations (such
82 * as being suitable for DMA) the caller will pass in hints to the 82 * as being suitable for DMA) the caller will pass in hints to the
83 * allocator in the gfp_mask, in the zone modifier bits. These bits 83 * allocator in the gfp_mask, in the zone modifier bits. These bits
84 * are used to select a priority ordered list of memory zones which 84 * are used to select a priority ordered list of memory zones which
85 * match the requested limits. GFP_ZONEMASK defines which bits within 85 * match the requested limits. GFP_ZONEMASK defines which bits within
86 * the gfp_mask should be considered as zone modifiers. Each valid 86 * the gfp_mask should be considered as zone modifiers. Each valid
87 * combination of the zone modifier bits has a corresponding list 87 * combination of the zone modifier bits has a corresponding list
88 * of zones (in node_zonelists). Thus for two zone modifiers there 88 * of zones (in node_zonelists). Thus for two zone modifiers there
89 * will be a maximum of 4 (2 ** 2) zonelists, for 3 modifiers there will 89 * will be a maximum of 4 (2 ** 2) zonelists, for 3 modifiers there will
90 * be 8 (2 ** 3) zonelists. GFP_ZONETYPES defines the number of possible 90 * be 8 (2 ** 3) zonelists. GFP_ZONETYPES defines the number of possible
91 * combinations of zone modifiers in "zone modifier space". 91 * combinations of zone modifiers in "zone modifier space".
92 */ 92 */
93 #define GFP_ZONEMASK 0x03 93 #define GFP_ZONEMASK 0x03
94 /* 94 /*
95 * As an optimisation any zone modifier bits which are only valid when 95 * As an optimisation any zone modifier bits which are only valid when
96 * no other zone modifier bits are set (loners) should be placed in 96 * no other zone modifier bits are set (loners) should be placed in
97 * the highest order bits of this field. This allows us to reduce the 97 * the highest order bits of this field. This allows us to reduce the
98 * extent of the zonelists thus saving space. For example in the case 98 * extent of the zonelists thus saving space. For example in the case
99 * of three zone modifier bits, we could require up to eight zonelists. 99 * of three zone modifier bits, we could require up to eight zonelists.
100 * If the left most zone modifier is a "loner" then the highest valid 100 * If the left most zone modifier is a "loner" then the highest valid
101 * zonelist would be four allowing us to allocate only five zonelists. 101 * zonelist would be four allowing us to allocate only five zonelists.
102 * Use the first form when the left most bit is not a "loner", otherwise 102 * Use the first form when the left most bit is not a "loner", otherwise
103 * use the second. 103 * use the second.
104 */ 104 */
105 /* #define GFP_ZONETYPES (GFP_ZONEMASK + 1) */ /* Non-loner */ 105 /* #define GFP_ZONETYPES (GFP_ZONEMASK + 1) */ /* Non-loner */
106 #define GFP_ZONETYPES ((GFP_ZONEMASK + 1) / 2 + 1) /* Loner */ 106 #define GFP_ZONETYPES ((GFP_ZONEMASK + 1) / 2 + 1) /* Loner */
107 107
108 /* 108 /*
109 * On machines where it is needed (eg PCs) we divide physical memory 109 * On machines where it is needed (eg PCs) we divide physical memory
110 * into multiple physical zones. On a PC we have 3 zones: 110 * into multiple physical zones. On a PC we have 3 zones:
111 * 111 *
112 * ZONE_DMA < 16 MB ISA DMA capable memory 112 * ZONE_DMA < 16 MB ISA DMA capable memory
113 * ZONE_NORMAL 16-896 MB direct mapped by the kernel 113 * ZONE_NORMAL 16-896 MB direct mapped by the kernel
114 * ZONE_HIGHMEM > 896 MB only page cache and user processes 114 * ZONE_HIGHMEM > 896 MB only page cache and user processes
115 */ 115 */
116 116
117 struct zone { 117 struct zone {
118 /* Fields commonly accessed by the page allocator */ 118 /* Fields commonly accessed by the page allocator */
119 unsigned long free_pages; 119 unsigned long free_pages;
120 unsigned long pages_min, pages_low, pages_high; 120 unsigned long pages_min, pages_low, pages_high;
121 /* 121 /*
122 * We don't know if the memory that we're going to allocate will be freeable 122 * We don't know if the memory that we're going to allocate will be freeable
123 * or/and it will be released eventually, so to avoid totally wasting several 123 * or/and it will be released eventually, so to avoid totally wasting several
124 * GB of ram we must reserve some of the lower zone memory (otherwise we risk 124 * GB of ram we must reserve some of the lower zone memory (otherwise we risk
125 * to run OOM on the lower zones despite there's tons of freeable ram 125 * to run OOM on the lower zones despite there's tons of freeable ram
126 * on the higher zones). This array is recalculated at runtime if the 126 * on the higher zones). This array is recalculated at runtime if the
127 * sysctl_lowmem_reserve_ratio sysctl changes. 127 * sysctl_lowmem_reserve_ratio sysctl changes.
128 */ 128 */
129 unsigned long lowmem_reserve[MAX_NR_ZONES]; 129 unsigned long lowmem_reserve[MAX_NR_ZONES];
130 130
131 #ifdef CONFIG_NUMA 131 #ifdef CONFIG_NUMA
132 struct per_cpu_pageset *pageset[NR_CPUS]; 132 struct per_cpu_pageset *pageset[NR_CPUS];
133 #else 133 #else
134 struct per_cpu_pageset pageset[NR_CPUS]; 134 struct per_cpu_pageset pageset[NR_CPUS];
135 #endif 135 #endif
136 /* 136 /*
137 * free areas of different sizes 137 * free areas of different sizes
138 */ 138 */
139 spinlock_t lock; 139 spinlock_t lock;
140 struct free_area free_area[MAX_ORDER]; 140 struct free_area free_area[MAX_ORDER];
141 141
142 142
143 ZONE_PADDING(_pad1_) 143 ZONE_PADDING(_pad1_)
144 144
145 /* Fields commonly accessed by the page reclaim scanner */ 145 /* Fields commonly accessed by the page reclaim scanner */
146 spinlock_t lru_lock; 146 spinlock_t lru_lock;
147 struct list_head active_list; 147 struct list_head active_list;
148 struct list_head inactive_list; 148 struct list_head inactive_list;
149 unsigned long nr_scan_active; 149 unsigned long nr_scan_active;
150 unsigned long nr_scan_inactive; 150 unsigned long nr_scan_inactive;
151 unsigned long nr_active; 151 unsigned long nr_active;
152 unsigned long nr_inactive; 152 unsigned long nr_inactive;
153 unsigned long pages_scanned; /* since last reclaim */ 153 unsigned long pages_scanned; /* since last reclaim */
154 int all_unreclaimable; /* All pages pinned */ 154 int all_unreclaimable; /* All pages pinned */
155 155
156 /* 156 /*
157 * Does the allocator try to reclaim pages from the zone as soon 157 * Does the allocator try to reclaim pages from the zone as soon
158 * as it fails a watermark_ok() in __alloc_pages? 158 * as it fails a watermark_ok() in __alloc_pages?
159 */ 159 */
160 int reclaim_pages; 160 int reclaim_pages;
161 /* A count of how many reclaimers are scanning this zone */ 161 /* A count of how many reclaimers are scanning this zone */
162 atomic_t reclaim_in_progress; 162 atomic_t reclaim_in_progress;
163 163
164 /* 164 /*
165 * prev_priority holds the scanning priority for this zone. It is 165 * prev_priority holds the scanning priority for this zone. It is
166 * defined as the scanning priority at which we achieved our reclaim 166 * defined as the scanning priority at which we achieved our reclaim
167 * target at the previous try_to_free_pages() or balance_pgdat() 167 * target at the previous try_to_free_pages() or balance_pgdat()
168 * invokation. 168 * invokation.
169 * 169 *
170 * We use prev_priority as a measure of how much stress page reclaim is 170 * We use prev_priority as a measure of how much stress page reclaim is
171 * under - it drives the swappiness decision: whether to unmap mapped 171 * under - it drives the swappiness decision: whether to unmap mapped
172 * pages. 172 * pages.
173 * 173 *
174 * temp_priority is used to remember the scanning priority at which 174 * temp_priority is used to remember the scanning priority at which
175 * this zone was successfully refilled to free_pages == pages_high. 175 * this zone was successfully refilled to free_pages == pages_high.
176 * 176 *
177 * Access to both these fields is quite racy even on uniprocessor. But 177 * Access to both these fields is quite racy even on uniprocessor. But
178 * it is expected to average out OK. 178 * it is expected to average out OK.
179 */ 179 */
180 int temp_priority; 180 int temp_priority;
181 int prev_priority; 181 int prev_priority;
182 182
183 183
184 ZONE_PADDING(_pad2_) 184 ZONE_PADDING(_pad2_)
185 /* Rarely used or read-mostly fields */ 185 /* Rarely used or read-mostly fields */
186 186
187 /* 187 /*
188 * wait_table -- the array holding the hash table 188 * wait_table -- the array holding the hash table
189 * wait_table_size -- the size of the hash table array 189 * wait_table_size -- the size of the hash table array
190 * wait_table_bits -- wait_table_size == (1 << wait_table_bits) 190 * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
191 * 191 *
192 * The purpose of all these is to keep track of the people 192 * The purpose of all these is to keep track of the people
193 * waiting for a page to become available and make them 193 * waiting for a page to become available and make them
194 * runnable again when possible. The trouble is that this 194 * runnable again when possible. The trouble is that this
195 * consumes a lot of space, especially when so few things 195 * consumes a lot of space, especially when so few things
196 * wait on pages at a given time. So instead of using 196 * wait on pages at a given time. So instead of using
197 * per-page waitqueues, we use a waitqueue hash table. 197 * per-page waitqueues, we use a waitqueue hash table.
198 * 198 *
199 * The bucket discipline is to sleep on the same queue when 199 * The bucket discipline is to sleep on the same queue when
200 * colliding and wake all in that wait queue when removing. 200 * colliding and wake all in that wait queue when removing.
201 * When something wakes, it must check to be sure its page is 201 * When something wakes, it must check to be sure its page is
202 * truly available, a la thundering herd. The cost of a 202 * truly available, a la thundering herd. The cost of a
203 * collision is great, but given the expected load of the 203 * collision is great, but given the expected load of the
204 * table, they should be so rare as to be outweighed by the 204 * table, they should be so rare as to be outweighed by the
205 * benefits from the saved space. 205 * benefits from the saved space.
206 * 206 *
207 * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the 207 * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
208 * primary users of these fields, and in mm/page_alloc.c 208 * primary users of these fields, and in mm/page_alloc.c
209 * free_area_init_core() performs the initialization of them. 209 * free_area_init_core() performs the initialization of them.
210 */ 210 */
211 wait_queue_head_t * wait_table; 211 wait_queue_head_t * wait_table;
212 unsigned long wait_table_size; 212 unsigned long wait_table_size;
213 unsigned long wait_table_bits; 213 unsigned long wait_table_bits;
214 214
215 /* 215 /*
216 * Discontig memory support fields. 216 * Discontig memory support fields.
217 */ 217 */
218 struct pglist_data *zone_pgdat; 218 struct pglist_data *zone_pgdat;
219 struct page *zone_mem_map; 219 struct page *zone_mem_map;
220 /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ 220 /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
221 unsigned long zone_start_pfn; 221 unsigned long zone_start_pfn;
222 222
223 unsigned long spanned_pages; /* total size, including holes */ 223 unsigned long spanned_pages; /* total size, including holes */
224 unsigned long present_pages; /* amount of memory (excluding holes) */ 224 unsigned long present_pages; /* amount of memory (excluding holes) */
225 225
226 /* 226 /*
227 * rarely used fields: 227 * rarely used fields:
228 */ 228 */
229 char *name; 229 char *name;
230 } ____cacheline_maxaligned_in_smp; 230 } ____cacheline_maxaligned_in_smp;
231 231
232 232
233 /* 233 /*
234 * The "priority" of VM scanning is how much of the queues we will scan in one 234 * The "priority" of VM scanning is how much of the queues we will scan in one
235 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the 235 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
236 * queues ("queue_length >> 12") during an aging round. 236 * queues ("queue_length >> 12") during an aging round.
237 */ 237 */
238 #define DEF_PRIORITY 12 238 #define DEF_PRIORITY 12
239 239
240 /* 240 /*
241 * One allocation request operates on a zonelist. A zonelist 241 * One allocation request operates on a zonelist. A zonelist
242 * is a list of zones, the first one is the 'goal' of the 242 * is a list of zones, the first one is the 'goal' of the
243 * allocation, the other zones are fallback zones, in decreasing 243 * allocation, the other zones are fallback zones, in decreasing
244 * priority. 244 * priority.
245 * 245 *
246 * Right now a zonelist takes up less than a cacheline. We never 246 * Right now a zonelist takes up less than a cacheline. We never
247 * modify it apart from boot-up, and only a few indices are used, 247 * modify it apart from boot-up, and only a few indices are used,
248 * so despite the zonelist table being relatively big, the cache 248 * so despite the zonelist table being relatively big, the cache
249 * footprint of this construct is very small. 249 * footprint of this construct is very small.
250 */ 250 */
251 struct zonelist { 251 struct zonelist {
252 struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited 252 struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited
253 }; 253 };
254 254
255 255
256 /* 256 /*
257 * The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM 257 * The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM
258 * (mostly NUMA machines?) to denote a higher-level memory zone than the 258 * (mostly NUMA machines?) to denote a higher-level memory zone than the
259 * zone denotes. 259 * zone denotes.
260 * 260 *
261 * On NUMA machines, each NUMA node would have a pg_data_t to describe 261 * On NUMA machines, each NUMA node would have a pg_data_t to describe
262 * it's memory layout. 262 * it's memory layout.
263 * 263 *
264 * Memory statistics and page replacement data structures are maintained on a 264 * Memory statistics and page replacement data structures are maintained on a
265 * per-zone basis. 265 * per-zone basis.
266 */ 266 */
267 struct bootmem_data; 267 struct bootmem_data;
268 typedef struct pglist_data { 268 typedef struct pglist_data {
269 struct zone node_zones[MAX_NR_ZONES]; 269 struct zone node_zones[MAX_NR_ZONES];
270 struct zonelist node_zonelists[GFP_ZONETYPES]; 270 struct zonelist node_zonelists[GFP_ZONETYPES];
271 int nr_zones; 271 int nr_zones;
272 #ifdef CONFIG_FLAT_NODE_MEM_MAP 272 #ifdef CONFIG_FLAT_NODE_MEM_MAP
273 struct page *node_mem_map; 273 struct page *node_mem_map;
274 #endif 274 #endif
275 struct bootmem_data *bdata; 275 struct bootmem_data *bdata;
276 #ifdef CONFIG_MEMORY_HOTPLUG
277 /*
278 * Must be held any time you expect node_start_pfn, node_present_pages
279 * or node_spanned_pages stay constant. Holding this will also
280 * guarantee that any pfn_valid() stays that way.
281 *
282 * Nests above zone->lock and zone->size_seqlock.
283 */
284 spinlock_t node_size_lock;
285 #endif
276 unsigned long node_start_pfn; 286 unsigned long node_start_pfn;
277 unsigned long node_present_pages; /* total number of physical pages */ 287 unsigned long node_present_pages; /* total number of physical pages */
278 unsigned long node_spanned_pages; /* total size of physical page 288 unsigned long node_spanned_pages; /* total size of physical page
279 range, including holes */ 289 range, including holes */
280 int node_id; 290 int node_id;
281 struct pglist_data *pgdat_next; 291 struct pglist_data *pgdat_next;
282 wait_queue_head_t kswapd_wait; 292 wait_queue_head_t kswapd_wait;
283 struct task_struct *kswapd; 293 struct task_struct *kswapd;
284 int kswapd_max_order; 294 int kswapd_max_order;
285 } pg_data_t; 295 } pg_data_t;
286 296
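For context only: a hedged sketch (not from this commit) of the intended lifecycle and ordering for node_size_lock, based on the comment added to pg_data_t above. "Nests above zone->lock" is read here as: node_size_lock is the outer lock, taken before zone->lock when node and zone sizes are updated. The function name and the resize body are placeholders; pgdat_resize_init() would normally run once at node bringup and is shown only for completeness.

/* Sketch: names are illustrative; the init call and lock ordering are the point. */
static void example_node_resize(struct pglist_data *pgdat)
{
	struct zone *zone = pgdat->node_zones + ZONE_DMA;
	unsigned long flags;

	pgdat_resize_init(pgdat);		/* once, when the pgdat is set up */

	pgdat_resize_lock(pgdat, &flags);	/* outer: node_size_lock */
	spin_lock(&zone->lock);			/* inner: zone->lock nests below it */
	/* ... adjust node_spanned_pages / zone span here ... */
	spin_unlock(&zone->lock);
	pgdat_resize_unlock(pgdat, &flags);
}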
287 #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) 297 #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
288 #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) 298 #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages)
289 #ifdef CONFIG_FLAT_NODE_MEM_MAP 299 #ifdef CONFIG_FLAT_NODE_MEM_MAP
290 #define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr)) 300 #define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr))
291 #else 301 #else
292 #define pgdat_page_nr(pgdat, pagenr) pfn_to_page((pgdat)->node_start_pfn + (pagenr)) 302 #define pgdat_page_nr(pgdat, pagenr) pfn_to_page((pgdat)->node_start_pfn + (pagenr))
293 #endif 303 #endif
294 #define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) 304 #define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr))
305
306 #include <linux/memory_hotplug.h>
295 307
296 extern struct pglist_data *pgdat_list; 308 extern struct pglist_data *pgdat_list;
297 309
298 void __get_zone_counts(unsigned long *active, unsigned long *inactive, 310 void __get_zone_counts(unsigned long *active, unsigned long *inactive,
299 unsigned long *free, struct pglist_data *pgdat); 311 unsigned long *free, struct pglist_data *pgdat);
300 void get_zone_counts(unsigned long *active, unsigned long *inactive, 312 void get_zone_counts(unsigned long *active, unsigned long *inactive,
301 unsigned long *free); 313 unsigned long *free);
302 void build_all_zonelists(void); 314 void build_all_zonelists(void);
303 void wakeup_kswapd(struct zone *zone, int order); 315 void wakeup_kswapd(struct zone *zone, int order);
304 int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 316 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
305 int alloc_type, int can_try_harder, gfp_t gfp_high); 317 int alloc_type, int can_try_harder, gfp_t gfp_high);
306 318
307 #ifdef CONFIG_HAVE_MEMORY_PRESENT 319 #ifdef CONFIG_HAVE_MEMORY_PRESENT
308 void memory_present(int nid, unsigned long start, unsigned long end); 320 void memory_present(int nid, unsigned long start, unsigned long end);
309 #else 321 #else
310 static inline void memory_present(int nid, unsigned long start, unsigned long end) {} 322 static inline void memory_present(int nid, unsigned long start, unsigned long end) {}
311 #endif 323 #endif
312 324
313 #ifdef CONFIG_NEED_NODE_MEMMAP_SIZE 325 #ifdef CONFIG_NEED_NODE_MEMMAP_SIZE
314 unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); 326 unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long);
315 #endif 327 #endif
316 328
317 /* 329 /*
318 * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. 330 * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
319 */ 331 */
320 #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) 332 #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
321 333
322 /** 334 /**
323 * for_each_pgdat - helper macro to iterate over all nodes 335 * for_each_pgdat - helper macro to iterate over all nodes
324 * @pgdat - pointer to a pg_data_t variable 336 * @pgdat - pointer to a pg_data_t variable
325 * 337 *
326 * Meant to help with common loops of the form 338 * Meant to help with common loops of the form
327 * pgdat = pgdat_list; 339 * pgdat = pgdat_list;
328 * while(pgdat) { 340 * while(pgdat) {
329 * ... 341 * ...
330 * pgdat = pgdat->pgdat_next; 342 * pgdat = pgdat->pgdat_next;
331 * } 343 * }
332 */ 344 */
333 #define for_each_pgdat(pgdat) \ 345 #define for_each_pgdat(pgdat) \
334 for (pgdat = pgdat_list; pgdat; pgdat = pgdat->pgdat_next) 346 for (pgdat = pgdat_list; pgdat; pgdat = pgdat->pgdat_next)
335 347
336 /* 348 /*
337 * next_zone - helper magic for for_each_zone() 349 * next_zone - helper magic for for_each_zone()
338 * Thanks to William Lee Irwin III for this piece of ingenuity. 350 * Thanks to William Lee Irwin III for this piece of ingenuity.
339 */ 351 */
340 static inline struct zone *next_zone(struct zone *zone) 352 static inline struct zone *next_zone(struct zone *zone)
341 { 353 {
342 pg_data_t *pgdat = zone->zone_pgdat; 354 pg_data_t *pgdat = zone->zone_pgdat;
343 355
344 if (zone < pgdat->node_zones + MAX_NR_ZONES - 1) 356 if (zone < pgdat->node_zones + MAX_NR_ZONES - 1)
345 zone++; 357 zone++;
346 else if (pgdat->pgdat_next) { 358 else if (pgdat->pgdat_next) {
347 pgdat = pgdat->pgdat_next; 359 pgdat = pgdat->pgdat_next;
348 zone = pgdat->node_zones; 360 zone = pgdat->node_zones;
349 } else 361 } else
350 zone = NULL; 362 zone = NULL;
351 363
352 return zone; 364 return zone;
353 } 365 }
354 366
355 /** 367 /**
356 * for_each_zone - helper macro to iterate over all memory zones 368 * for_each_zone - helper macro to iterate over all memory zones
357 * @zone - pointer to struct zone variable 369 * @zone - pointer to struct zone variable
358 * 370 *
359 * The user only needs to declare the zone variable, for_each_zone 371 * The user only needs to declare the zone variable, for_each_zone
360 * fills it in. This basically means for_each_zone() is an 372 * fills it in. This basically means for_each_zone() is an
361 * easier to read version of this piece of code: 373 * easier to read version of this piece of code:
362 * 374 *
363 * for (pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next) 375 * for (pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next)
364 * for (i = 0; i < MAX_NR_ZONES; ++i) { 376 * for (i = 0; i < MAX_NR_ZONES; ++i) {
365 * struct zone * z = pgdat->node_zones + i; 377 * struct zone * z = pgdat->node_zones + i;
366 * ... 378 * ...
367 * } 379 * }
368 * } 380 * }
369 */ 381 */
370 #define for_each_zone(zone) \ 382 #define for_each_zone(zone) \
371 for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone)) 383 for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone))
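For example, a quick tally of free pages across the whole system can be written as below; total_free is a local illustration variable, and zone->free_pages is the buddy allocator's per-zone counter:

    struct zone *zone;
    unsigned long total_free = 0;

    /* visits every zone of every node, following pgdat_list order */
    for_each_zone(zone)
            total_free += zone->free_pages;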
372 384
373 static inline int is_highmem_idx(int idx) 385 static inline int is_highmem_idx(int idx)
374 { 386 {
375 return (idx == ZONE_HIGHMEM); 387 return (idx == ZONE_HIGHMEM);
376 } 388 }
377 389
378 static inline int is_normal_idx(int idx) 390 static inline int is_normal_idx(int idx)
379 { 391 {
380 return (idx == ZONE_NORMAL); 392 return (idx == ZONE_NORMAL);
381 } 393 }
382 /** 394 /**
383 * is_highmem - helper function to quickly check if a struct zone is a 395 * is_highmem - helper function to quickly check if a struct zone is a
384 * highmem zone or not. This is an attempt to keep references 396 * highmem zone or not. This is an attempt to keep references
385 * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. 397 * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum.
386 * @zone - pointer to struct zone variable 398 * @zone - pointer to struct zone variable
387 */ 399 */
388 static inline int is_highmem(struct zone *zone) 400 static inline int is_highmem(struct zone *zone)
389 { 401 {
390 return zone == zone->zone_pgdat->node_zones + ZONE_HIGHMEM; 402 return zone == zone->zone_pgdat->node_zones + ZONE_HIGHMEM;
391 } 403 }
392 404
393 static inline int is_normal(struct zone *zone) 405 static inline int is_normal(struct zone *zone)
394 { 406 {
395 return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL; 407 return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL;
396 } 408 }
397 409
398 /* These two functions are used to setup the per zone pages min values */ 410 /* These two functions are used to setup the per zone pages min values */
399 struct ctl_table; 411 struct ctl_table;
400 struct file; 412 struct file;
401 int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *, 413 int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *,
402 void __user *, size_t *, loff_t *); 414 void __user *, size_t *, loff_t *);
403 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1]; 415 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
404 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *, 416 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
405 void __user *, size_t *, loff_t *); 417 void __user *, size_t *, loff_t *);
406 418
407 #include <linux/topology.h> 419 #include <linux/topology.h>
408 /* Returns the number of the current Node. */ 420 /* Returns the number of the current Node. */
409 #define numa_node_id() (cpu_to_node(raw_smp_processor_id())) 421 #define numa_node_id() (cpu_to_node(raw_smp_processor_id()))
410 422
411 #ifndef CONFIG_NEED_MULTIPLE_NODES 423 #ifndef CONFIG_NEED_MULTIPLE_NODES
412 424
413 extern struct pglist_data contig_page_data; 425 extern struct pglist_data contig_page_data;
414 #define NODE_DATA(nid) (&contig_page_data) 426 #define NODE_DATA(nid) (&contig_page_data)
415 #define NODE_MEM_MAP(nid) mem_map 427 #define NODE_MEM_MAP(nid) mem_map
416 #define MAX_NODES_SHIFT 1 428 #define MAX_NODES_SHIFT 1
417 #define pfn_to_nid(pfn) (0) 429 #define pfn_to_nid(pfn) (0)
418 430
419 #else /* CONFIG_NEED_MULTIPLE_NODES */ 431 #else /* CONFIG_NEED_MULTIPLE_NODES */
420 432
421 #include <asm/mmzone.h> 433 #include <asm/mmzone.h>
422 434
423 #endif /* !CONFIG_NEED_MULTIPLE_NODES */ 435 #endif /* !CONFIG_NEED_MULTIPLE_NODES */
424 436
425 #ifdef CONFIG_SPARSEMEM 437 #ifdef CONFIG_SPARSEMEM
426 #include <asm/sparsemem.h> 438 #include <asm/sparsemem.h>
427 #endif 439 #endif
428 440
429 #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED) 441 #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED)
430 /* 442 /*
431 * with 32 bit page->flags field, we reserve 8 bits for node/zone info. 443 * with 32 bit page->flags field, we reserve 8 bits for node/zone info.
432 * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes. 444 * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes.
433 */ 445 */
434 #define FLAGS_RESERVED 8 446 #define FLAGS_RESERVED 8
435 447
436 #elif BITS_PER_LONG == 64 448 #elif BITS_PER_LONG == 64
437 /* 449 /*
438 * with 64 bit flags field, there's plenty of room. 450 * with 64 bit flags field, there's plenty of room.
439 */ 451 */
440 #define FLAGS_RESERVED 32 452 #define FLAGS_RESERVED 32
441 453
442 #else 454 #else
443 455
444 #error BITS_PER_LONG not defined 456 #error BITS_PER_LONG not defined
445 457
446 #endif 458 #endif
447 459
448 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 460 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
449 #define early_pfn_to_nid(nid) (0UL) 461 #define early_pfn_to_nid(nid) (0UL)
450 #endif 462 #endif
451 463
452 #define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT) 464 #define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT)
453 #define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT) 465 #define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT)
454 466
455 #ifdef CONFIG_SPARSEMEM 467 #ifdef CONFIG_SPARSEMEM
456 468
457 /* 469 /*
458 * SECTIONS_SHIFT #bits space required to store a section # 470 * SECTIONS_SHIFT #bits space required to store a section #
459 * 471 *
460 * PA_SECTION_SHIFT physical address to/from section number 472 * PA_SECTION_SHIFT physical address to/from section number
461 * PFN_SECTION_SHIFT pfn to/from section number 473 * PFN_SECTION_SHIFT pfn to/from section number
462 */ 474 */
463 #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) 475 #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
464 476
465 #define PA_SECTION_SHIFT (SECTION_SIZE_BITS) 477 #define PA_SECTION_SHIFT (SECTION_SIZE_BITS)
466 #define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) 478 #define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT)
467 479
468 #define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT) 480 #define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT)
469 481
470 #define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT) 482 #define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT)
471 #define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1)) 483 #define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1))
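To make the shift arithmetic concrete, assume a hypothetical SECTION_SIZE_BITS of 28 (256MB sections) and PAGE_SHIFT of 12 (4KB pages); the real values come from the per-architecture <asm/sparsemem.h>:

    /* SECTION_SIZE_BITS = 28, PAGE_SHIFT = 12 (hypothetical values)      */
    /* PA_SECTION_SHIFT  = 28, so each section spans 1 << 28 = 256MB      */
    /* PFN_SECTION_SHIFT = 28 - 12 = 16                                   */
    /* PAGES_PER_SECTION = 1 << 16 = 65536 pages                          */
    /* pfn_to_section_nr(0x12345) = 0x12345 >> 16 = 1                     */
    /* section_nr_to_pfn(1)       = 1 << 16      = 0x10000                */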
472 484
473 #if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS 485 #if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS
474 #error Allocator MAX_ORDER exceeds SECTION_SIZE 486 #error Allocator MAX_ORDER exceeds SECTION_SIZE
475 #endif 487 #endif
476 488
477 struct page; 489 struct page;
478 struct mem_section { 490 struct mem_section {
479 /* 491 /*
480 * This is, logically, a pointer to an array of struct 492 * This is, logically, a pointer to an array of struct
481 * pages. However, it is stored with some other magic. 493 * pages. However, it is stored with some other magic.
482 * (see sparse.c::sparse_init_one_section()) 494 * (see sparse.c::sparse_init_one_section())
483 * 495 *
484 * Making it a UL at least makes someone do a cast 496 * Making it a UL at least makes someone do a cast
485 * before using it wrong. 497 * before using it wrong.
486 */ 498 */
487 unsigned long section_mem_map; 499 unsigned long section_mem_map;
488 }; 500 };
489 501
490 #ifdef CONFIG_SPARSEMEM_EXTREME 502 #ifdef CONFIG_SPARSEMEM_EXTREME
491 #define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section)) 503 #define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section))
492 #else 504 #else
493 #define SECTIONS_PER_ROOT 1 505 #define SECTIONS_PER_ROOT 1
494 #endif 506 #endif
495 507
496 #define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT) 508 #define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT)
497 #define NR_SECTION_ROOTS (NR_MEM_SECTIONS / SECTIONS_PER_ROOT) 509 #define NR_SECTION_ROOTS (NR_MEM_SECTIONS / SECTIONS_PER_ROOT)
498 #define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1) 510 #define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1)
499 511
500 #ifdef CONFIG_SPARSEMEM_EXTREME 512 #ifdef CONFIG_SPARSEMEM_EXTREME
501 extern struct mem_section *mem_section[NR_SECTION_ROOTS]; 513 extern struct mem_section *mem_section[NR_SECTION_ROOTS];
502 #else 514 #else
503 extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; 515 extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
504 #endif 516 #endif
505 517
506 static inline struct mem_section *__nr_to_section(unsigned long nr) 518 static inline struct mem_section *__nr_to_section(unsigned long nr)
507 { 519 {
508 if (!mem_section[SECTION_NR_TO_ROOT(nr)]) 520 if (!mem_section[SECTION_NR_TO_ROOT(nr)])
509 return NULL; 521 return NULL;
510 return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; 522 return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
511 } 523 }
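A worked example of the two-level lookup, with hypothetical numbers (4KB pages and an 8-byte struct mem_section give SECTIONS_PER_ROOT = 512 in the SPARSEMEM_EXTREME case):

    /* SECTIONS_PER_ROOT        = 4096 / 8   = 512                        */
    /* SECTION_NR_TO_ROOT(1000) = 1000 / 512 = 1                          */
    /* 1000 & SECTION_ROOT_MASK = 1000 & 511 = 488                        */
    /* so __nr_to_section(1000) resolves to &mem_section[1][488]          */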
512 extern int __section_nr(struct mem_section* ms); 524 extern int __section_nr(struct mem_section* ms);
513 525
514 /* 526 /*
515 * We use the lower bits of the mem_map pointer to store 527 * We use the lower bits of the mem_map pointer to store
516 * a little bit of information. There should be at least 528 * a little bit of information. There should be at least
517 * 3 bits here due to 32-bit alignment. 529 * 3 bits here due to 32-bit alignment.
518 */ 530 */
519 #define SECTION_MARKED_PRESENT (1UL<<0) 531 #define SECTION_MARKED_PRESENT (1UL<<0)
520 #define SECTION_HAS_MEM_MAP (1UL<<1) 532 #define SECTION_HAS_MEM_MAP (1UL<<1)
521 #define SECTION_MAP_LAST_BIT (1UL<<2) 533 #define SECTION_MAP_LAST_BIT (1UL<<2)
522 #define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) 534 #define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1))
523 535
524 static inline struct page *__section_mem_map_addr(struct mem_section *section) 536 static inline struct page *__section_mem_map_addr(struct mem_section *section)
525 { 537 {
526 unsigned long map = section->section_mem_map; 538 unsigned long map = section->section_mem_map;
527 map &= SECTION_MAP_MASK; 539 map &= SECTION_MAP_MASK;
528 return (struct page *)map; 540 return (struct page *)map;
529 } 541 }
530 542
531 static inline int valid_section(struct mem_section *section) 543 static inline int valid_section(struct mem_section *section)
532 { 544 {
533 return (section && (section->section_mem_map & SECTION_MARKED_PRESENT)); 545 return (section && (section->section_mem_map & SECTION_MARKED_PRESENT));
534 } 546 }
535 547
536 static inline int section_has_mem_map(struct mem_section *section) 548 static inline int section_has_mem_map(struct mem_section *section)
537 { 549 {
538 return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP)); 550 return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP));
539 } 551 }
540 552
541 static inline int valid_section_nr(unsigned long nr) 553 static inline int valid_section_nr(unsigned long nr)
542 { 554 {
543 return valid_section(__nr_to_section(nr)); 555 return valid_section(__nr_to_section(nr));
544 } 556 }
545 557
546 /* 558 /*
547 * Given a kernel address, find the home node of the underlying memory. 559 * Given a kernel address, find the home node of the underlying memory.
548 */ 560 */
549 #define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT) 561 #define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT)
550 562
551 static inline struct mem_section *__pfn_to_section(unsigned long pfn) 563 static inline struct mem_section *__pfn_to_section(unsigned long pfn)
552 { 564 {
553 return __nr_to_section(pfn_to_section_nr(pfn)); 565 return __nr_to_section(pfn_to_section_nr(pfn));
554 } 566 }
555 567
556 #define pfn_to_page(pfn) \ 568 #define pfn_to_page(pfn) \
557 ({ \ 569 ({ \
558 unsigned long __pfn = (pfn); \ 570 unsigned long __pfn = (pfn); \
559 __section_mem_map_addr(__pfn_to_section(__pfn)) + __pfn; \ 571 __section_mem_map_addr(__pfn_to_section(__pfn)) + __pfn; \
560 }) 572 })
561 #define page_to_pfn(page) \ 573 #define page_to_pfn(page) \
562 ({ \ 574 ({ \
563 page - __section_mem_map_addr(__nr_to_section( \ 575 page - __section_mem_map_addr(__nr_to_section( \
564 page_to_section(page))); \ 576 page_to_section(page))); \
565 }) 577 })
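A hedged reading of the "other magic" referred to in struct mem_section (see sparse.c): the stored value is roughly the section's mem_map pointer minus section_nr_to_pfn(section_nr), with the SECTION_* flag bits kept in the low bits. That built-in offset is why pfn_to_page() above can add the full pfn rather than the pfn's offset within its section:

    /* Approximately (illustration only):                                  */
    /*   section_mem_map = (mem_map_for_section - section_nr_to_pfn(snr))  */
    /*                     | <SECTION_* flag bits>                         */
    /* __section_mem_map_addr() masks the flags off, and adding __pfn      */
    /* then lands on the correct struct page.                              */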
566 578
567 static inline int pfn_valid(unsigned long pfn) 579 static inline int pfn_valid(unsigned long pfn)
568 { 580 {
569 if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) 581 if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
570 return 0; 582 return 0;
571 return valid_section(__nr_to_section(pfn_to_section_nr(pfn))); 583 return valid_section(__nr_to_section(pfn_to_section_nr(pfn)));
572 } 584 }
573 585
574 /* 586 /*
575 * These are _only_ used during initialisation, therefore they 587 * These are _only_ used during initialisation, therefore they
576 * can use __initdata ... They could have names to indicate 588 * can use __initdata ... They could have names to indicate
577 * this restriction. 589 * this restriction.
578 */ 590 */
579 #ifdef CONFIG_NUMA 591 #ifdef CONFIG_NUMA
580 #define pfn_to_nid early_pfn_to_nid 592 #define pfn_to_nid early_pfn_to_nid
581 #endif 593 #endif
582 594
583 #define pfn_to_pgdat(pfn) \ 595 #define pfn_to_pgdat(pfn) \
584 ({ \ 596 ({ \
585 NODE_DATA(pfn_to_nid(pfn)); \ 597 NODE_DATA(pfn_to_nid(pfn)); \
586 }) 598 })
587 599
588 #define early_pfn_valid(pfn) pfn_valid(pfn) 600 #define early_pfn_valid(pfn) pfn_valid(pfn)
589 void sparse_init(void); 601 void sparse_init(void);
590 #else 602 #else
591 #define sparse_init() do {} while (0) 603 #define sparse_init() do {} while (0)
592 #define sparse_index_init(_sec, _nid) do {} while (0) 604 #define sparse_index_init(_sec, _nid) do {} while (0)
593 #endif /* CONFIG_SPARSEMEM */ 605 #endif /* CONFIG_SPARSEMEM */
594 606
595 #ifdef CONFIG_NODES_SPAN_OTHER_NODES 607 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
596 #define early_pfn_in_nid(pfn, nid) (early_pfn_to_nid(pfn) == (nid)) 608 #define early_pfn_in_nid(pfn, nid) (early_pfn_to_nid(pfn) == (nid))
597 #else 609 #else
598 #define early_pfn_in_nid(pfn, nid) (1) 610 #define early_pfn_in_nid(pfn, nid) (1)
599 #endif 611 #endif
600 612
601 #ifndef early_pfn_valid 613 #ifndef early_pfn_valid
602 #define early_pfn_valid(pfn) (1) 614 #define early_pfn_valid(pfn) (1)
603 #endif 615 #endif
604 616
605 void memory_present(int nid, unsigned long start, unsigned long end); 617 void memory_present(int nid, unsigned long start, unsigned long end);
606 unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); 618 unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long);
607 619
608 #endif /* !__ASSEMBLY__ */ 620 #endif /* !__ASSEMBLY__ */
609 #endif /* __KERNEL__ */ 621 #endif /* __KERNEL__ */
610 #endif /* _LINUX_MMZONE_H */ 622 #endif /* _LINUX_MMZONE_H */
611 623
1 /* 1 /*
2 * linux/mm/page_alloc.c 2 * linux/mm/page_alloc.c
3 * 3 *
4 * Manages the free list, the system allocates free pages here. 4 * Manages the free list, the system allocates free pages here.
5 * Note that kmalloc() lives in slab.c 5 * Note that kmalloc() lives in slab.c
6 * 6 *
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
8 * Swap reorganised 29.12.95, Stephen Tweedie 8 * Swap reorganised 29.12.95, Stephen Tweedie
9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */ 15 */
16 16
17 #include <linux/config.h> 17 #include <linux/config.h>
18 #include <linux/stddef.h> 18 #include <linux/stddef.h>
19 #include <linux/mm.h> 19 #include <linux/mm.h>
20 #include <linux/swap.h> 20 #include <linux/swap.h>
21 #include <linux/interrupt.h> 21 #include <linux/interrupt.h>
22 #include <linux/pagemap.h> 22 #include <linux/pagemap.h>
23 #include <linux/bootmem.h> 23 #include <linux/bootmem.h>
24 #include <linux/compiler.h> 24 #include <linux/compiler.h>
25 #include <linux/kernel.h> 25 #include <linux/kernel.h>
26 #include <linux/module.h> 26 #include <linux/module.h>
27 #include <linux/suspend.h> 27 #include <linux/suspend.h>
28 #include <linux/pagevec.h> 28 #include <linux/pagevec.h>
29 #include <linux/blkdev.h> 29 #include <linux/blkdev.h>
30 #include <linux/slab.h> 30 #include <linux/slab.h>
31 #include <linux/notifier.h> 31 #include <linux/notifier.h>
32 #include <linux/topology.h> 32 #include <linux/topology.h>
33 #include <linux/sysctl.h> 33 #include <linux/sysctl.h>
34 #include <linux/cpu.h> 34 #include <linux/cpu.h>
35 #include <linux/cpuset.h> 35 #include <linux/cpuset.h>
36 #include <linux/nodemask.h> 36 #include <linux/nodemask.h>
37 #include <linux/vmalloc.h> 37 #include <linux/vmalloc.h>
38 38
39 #include <asm/tlbflush.h> 39 #include <asm/tlbflush.h>
40 #include "internal.h" 40 #include "internal.h"
41 41
42 /* 42 /*
43 * MCD - HACK: Find somewhere to initialize this EARLY, or make this 43 * MCD - HACK: Find somewhere to initialize this EARLY, or make this
44 * initializer cleaner 44 * initializer cleaner
45 */ 45 */
46 nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; 46 nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
47 EXPORT_SYMBOL(node_online_map); 47 EXPORT_SYMBOL(node_online_map);
48 nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; 48 nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
49 EXPORT_SYMBOL(node_possible_map); 49 EXPORT_SYMBOL(node_possible_map);
50 struct pglist_data *pgdat_list __read_mostly; 50 struct pglist_data *pgdat_list __read_mostly;
51 unsigned long totalram_pages __read_mostly; 51 unsigned long totalram_pages __read_mostly;
52 unsigned long totalhigh_pages __read_mostly; 52 unsigned long totalhigh_pages __read_mostly;
53 long nr_swap_pages; 53 long nr_swap_pages;
54 54
55 /* 55 /*
56 * results with 256, 32 in the lowmem_reserve sysctl: 56 * results with 256, 32 in the lowmem_reserve sysctl:
57 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 57 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
58 * 1G machine -> (16M dma, 784M normal, 224M high) 58 * 1G machine -> (16M dma, 784M normal, 224M high)
59 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 59 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
60 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 60 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
61 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA 61 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
62 */ 62 */
63 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 }; 63 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
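Spelling out the arithmetic behind the 1G example above (approximate figures, for illustration only):

    /* NORMAL allocation against ZONE_DMA:      784M / 256        ~= 3M kept free  */
    /* HIGHMEM allocation against ZONE_NORMAL:  224M / 32          = 7M kept free  */
    /* HIGHMEM allocation against ZONE_DMA:    (224M + 784M) / 256 ~= 4M kept free */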
64 64
65 EXPORT_SYMBOL(totalram_pages); 65 EXPORT_SYMBOL(totalram_pages);
66 EXPORT_SYMBOL(nr_swap_pages); 66 EXPORT_SYMBOL(nr_swap_pages);
67 67
68 /* 68 /*
69 * Used by page_zone() to look up the address of the struct zone whose 69 * Used by page_zone() to look up the address of the struct zone whose
70 * id is encoded in the upper bits of page->flags 70 * id is encoded in the upper bits of page->flags
71 */ 71 */
72 struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; 72 struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
73 EXPORT_SYMBOL(zone_table); 73 EXPORT_SYMBOL(zone_table);
74 74
75 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; 75 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
76 int min_free_kbytes = 1024; 76 int min_free_kbytes = 1024;
77 77
78 unsigned long __initdata nr_kernel_pages; 78 unsigned long __initdata nr_kernel_pages;
79 unsigned long __initdata nr_all_pages; 79 unsigned long __initdata nr_all_pages;
80 80
81 static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 81 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
82 { 82 {
83 if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages) 83 if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages)
84 return 1; 84 return 1;
85 if (page_to_pfn(page) < zone->zone_start_pfn) 85 if (page_to_pfn(page) < zone->zone_start_pfn)
86 return 1; 86 return 1;
87 87
88 return 0; 88 return 0;
89 } 89 }
90 90
91 static int page_is_consistent(struct zone *zone, struct page *page) 91 static int page_is_consistent(struct zone *zone, struct page *page)
92 { 92 {
93 #ifdef CONFIG_HOLES_IN_ZONE 93 #ifdef CONFIG_HOLES_IN_ZONE
94 if (!pfn_valid(page_to_pfn(page))) 94 if (!pfn_valid(page_to_pfn(page)))
95 return 0; 95 return 0;
96 #endif 96 #endif
97 if (zone != page_zone(page)) 97 if (zone != page_zone(page))
98 return 0; 98 return 0;
99 99
100 return 1; 100 return 1;
101 } 101 }
102 /* 102 /*
103 * Temporary debugging check for pages not lying within a given zone. 103 * Temporary debugging check for pages not lying within a given zone.
104 */ 104 */
105 static int bad_range(struct zone *zone, struct page *page) 105 static int bad_range(struct zone *zone, struct page *page)
106 { 106 {
107 if (page_outside_zone_boundaries(zone, page)) 107 if (page_outside_zone_boundaries(zone, page))
108 return 1; 108 return 1;
109 if (!page_is_consistent(zone, page)) 109 if (!page_is_consistent(zone, page))
110 return 1; 110 return 1;
111 111
112 return 0; 112 return 0;
113 } 113 }
114 114
115 static void bad_page(const char *function, struct page *page) 115 static void bad_page(const char *function, struct page *page)
116 { 116 {
117 printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", 117 printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
118 function, current->comm, page); 118 function, current->comm, page);
119 printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", 119 printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
120 (int)(2*sizeof(page_flags_t)), (unsigned long)page->flags, 120 (int)(2*sizeof(page_flags_t)), (unsigned long)page->flags,
121 page->mapping, page_mapcount(page), page_count(page)); 121 page->mapping, page_mapcount(page), page_count(page));
122 printk(KERN_EMERG "Backtrace:\n"); 122 printk(KERN_EMERG "Backtrace:\n");
123 dump_stack(); 123 dump_stack();
124 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); 124 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
125 page->flags &= ~(1 << PG_lru | 125 page->flags &= ~(1 << PG_lru |
126 1 << PG_private | 126 1 << PG_private |
127 1 << PG_locked | 127 1 << PG_locked |
128 1 << PG_active | 128 1 << PG_active |
129 1 << PG_dirty | 129 1 << PG_dirty |
130 1 << PG_reclaim | 130 1 << PG_reclaim |
131 1 << PG_slab | 131 1 << PG_slab |
132 1 << PG_swapcache | 132 1 << PG_swapcache |
133 1 << PG_writeback | 133 1 << PG_writeback |
134 1 << PG_reserved ); 134 1 << PG_reserved );
135 set_page_count(page, 0); 135 set_page_count(page, 0);
136 reset_page_mapcount(page); 136 reset_page_mapcount(page);
137 page->mapping = NULL; 137 page->mapping = NULL;
138 add_taint(TAINT_BAD_PAGE); 138 add_taint(TAINT_BAD_PAGE);
139 } 139 }
140 140
141 #ifndef CONFIG_HUGETLB_PAGE 141 #ifndef CONFIG_HUGETLB_PAGE
142 #define prep_compound_page(page, order) do { } while (0) 142 #define prep_compound_page(page, order) do { } while (0)
143 #define destroy_compound_page(page, order) do { } while (0) 143 #define destroy_compound_page(page, order) do { } while (0)
144 #else 144 #else
145 /* 145 /*
146 * Higher-order pages are called "compound pages". They are structured thusly: 146 * Higher-order pages are called "compound pages". They are structured thusly:
147 * 147 *
148 * The first PAGE_SIZE page is called the "head page". 148 * The first PAGE_SIZE page is called the "head page".
149 * 149 *
150 * The remaining PAGE_SIZE pages are called "tail pages". 150 * The remaining PAGE_SIZE pages are called "tail pages".
151 * 151 *
152 * All pages have PG_compound set. All pages have their ->private pointing at 152 * All pages have PG_compound set. All pages have their ->private pointing at
153 * the head page (even the head page has this). 153 * the head page (even the head page has this).
154 * 154 *
155 * The first tail page's ->mapping, if non-zero, holds the address of the 155 * The first tail page's ->mapping, if non-zero, holds the address of the
156 * compound page's put_page() function. 156 * compound page's put_page() function.
157 * 157 *
158 * The order of the allocation is stored in the first tail page's ->index 158 * The order of the allocation is stored in the first tail page's ->index
159 * This is only for debug at present. This usage means that zero-order pages 159 * This is only for debug at present. This usage means that zero-order pages
160 * may not be compound. 160 * may not be compound.
161 */ 161 */
162 static void prep_compound_page(struct page *page, unsigned long order) 162 static void prep_compound_page(struct page *page, unsigned long order)
163 { 163 {
164 int i; 164 int i;
165 int nr_pages = 1 << order; 165 int nr_pages = 1 << order;
166 166
167 page[1].mapping = NULL; 167 page[1].mapping = NULL;
168 page[1].index = order; 168 page[1].index = order;
169 for (i = 0; i < nr_pages; i++) { 169 for (i = 0; i < nr_pages; i++) {
170 struct page *p = page + i; 170 struct page *p = page + i;
171 171
172 SetPageCompound(p); 172 SetPageCompound(p);
173 set_page_private(p, (unsigned long)page); 173 set_page_private(p, (unsigned long)page);
174 } 174 }
175 } 175 }
176 176
177 static void destroy_compound_page(struct page *page, unsigned long order) 177 static void destroy_compound_page(struct page *page, unsigned long order)
178 { 178 {
179 int i; 179 int i;
180 int nr_pages = 1 << order; 180 int nr_pages = 1 << order;
181 181
182 if (!PageCompound(page)) 182 if (!PageCompound(page))
183 return; 183 return;
184 184
185 if (page[1].index != order) 185 if (page[1].index != order)
186 bad_page(__FUNCTION__, page); 186 bad_page(__FUNCTION__, page);
187 187
188 for (i = 0; i < nr_pages; i++) { 188 for (i = 0; i < nr_pages; i++) {
189 struct page *p = page + i; 189 struct page *p = page + i;
190 190
191 if (!PageCompound(p)) 191 if (!PageCompound(p))
192 bad_page(__FUNCTION__, page); 192 bad_page(__FUNCTION__, page);
193 if (page_private(p) != (unsigned long)page) 193 if (page_private(p) != (unsigned long)page)
194 bad_page(__FUNCTION__, page); 194 bad_page(__FUNCTION__, page);
195 ClearPageCompound(p); 195 ClearPageCompound(p);
196 } 196 }
197 } 197 }
198 #endif /* CONFIG_HUGETLB_PAGE */ 198 #endif /* CONFIG_HUGETLB_PAGE */
199 199
200 /* 200 /*
201 * function for dealing with page's order in buddy system. 201 * function for dealing with page's order in buddy system.
202 * zone->lock is already acquired when we use these. 202 * zone->lock is already acquired when we use these.
203 * So, we don't need atomic page->flags operations here. 203 * So, we don't need atomic page->flags operations here.
204 */ 204 */
205 static inline unsigned long page_order(struct page *page) { 205 static inline unsigned long page_order(struct page *page) {
206 return page_private(page); 206 return page_private(page);
207 } 207 }
208 208
209 static inline void set_page_order(struct page *page, int order) { 209 static inline void set_page_order(struct page *page, int order) {
210 set_page_private(page, order); 210 set_page_private(page, order);
211 __SetPagePrivate(page); 211 __SetPagePrivate(page);
212 } 212 }
213 213
214 static inline void rmv_page_order(struct page *page) 214 static inline void rmv_page_order(struct page *page)
215 { 215 {
216 __ClearPagePrivate(page); 216 __ClearPagePrivate(page);
217 set_page_private(page, 0); 217 set_page_private(page, 0);
218 } 218 }
219 219
220 /* 220 /*
221 * Locate the struct page for both the matching buddy in our 221 * Locate the struct page for both the matching buddy in our
222 * pair (buddy1) and the combined order O+1 page they form (page). 222 * pair (buddy1) and the combined order O+1 page they form (page).
223 * 223 *
224 * 1) Any buddy B1 will have an order O twin B2 which satisfies 224 * 1) Any buddy B1 will have an order O twin B2 which satisfies
225 * the following equation: 225 * the following equation:
226 * B2 = B1 ^ (1 << O) 226 * B2 = B1 ^ (1 << O)
227 * For example, if the starting buddy (buddy2) is #8 its order 227 * For example, if the starting buddy (buddy2) is #8 its order
228 * 1 buddy is #10: 228 * 1 buddy is #10:
229 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 229 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
230 * 230 *
231 * 2) Any buddy B will have an order O+1 parent P which 231 * 2) Any buddy B will have an order O+1 parent P which
232 * satisfies the following equation: 232 * satisfies the following equation:
233 * P = B & ~(1 << O) 233 * P = B & ~(1 << O)
234 * 234 *
235 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 235 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
236 */ 236 */
237 static inline struct page * 237 static inline struct page *
238 __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) 238 __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
239 { 239 {
240 unsigned long buddy_idx = page_idx ^ (1 << order); 240 unsigned long buddy_idx = page_idx ^ (1 << order);
241 241
242 return page + (buddy_idx - page_idx); 242 return page + (buddy_idx - page_idx);
243 } 243 }
244 244
245 static inline unsigned long 245 static inline unsigned long
246 __find_combined_index(unsigned long page_idx, unsigned int order) 246 __find_combined_index(unsigned long page_idx, unsigned int order)
247 { 247 {
248 return (page_idx & ~(1 << order)); 248 return (page_idx & ~(1 << order));
249 } 249 }
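Tracing the two equations for one concrete, purely illustrative case: a block at page_idx 12 with order 2.

    /* buddy_idx    = 12 ^ (1 << 2)  = 12 ^ 4  = 8                         */
    /* combined_idx = 12 & ~(1 << 2) = 12 & ~4 = 8                         */
    /* i.e. the buddy block starts at index 8 and the merged order-3       */
    /* block they form also starts at index 8 (pages 8..15).               */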
250 250
251 /* 251 /*
252 * This function checks whether a page is free && is the buddy 252 * This function checks whether a page is free && is the buddy
253 * we can coalesce a page and its buddy if 253 * we can coalesce a page and its buddy if
254 * (a) the buddy is free && 254 * (a) the buddy is free &&
255 * (b) the buddy is on the buddy system && 255 * (b) the buddy is on the buddy system &&
256 * (c) a page and its buddy have the same order. 256 * (c) a page and its buddy have the same order.
257 * for recording page's order, we use page_private(page) and PG_private. 257 * for recording page's order, we use page_private(page) and PG_private.
258 * 258 *
259 */ 259 */
260 static inline int page_is_buddy(struct page *page, int order) 260 static inline int page_is_buddy(struct page *page, int order)
261 { 261 {
262 if (PagePrivate(page) && 262 if (PagePrivate(page) &&
263 (page_order(page) == order) && 263 (page_order(page) == order) &&
264 page_count(page) == 0) 264 page_count(page) == 0)
265 return 1; 265 return 1;
266 return 0; 266 return 0;
267 } 267 }
268 268
269 /* 269 /*
270 * Freeing function for a buddy system allocator. 270 * Freeing function for a buddy system allocator.
271 * 271 *
272 * The concept of a buddy system is to maintain direct-mapped table 272 * The concept of a buddy system is to maintain direct-mapped table
273 * (containing bit values) for memory blocks of various "orders". 273 * (containing bit values) for memory blocks of various "orders".
274 * The bottom level table contains the map for the smallest allocatable 274 * The bottom level table contains the map for the smallest allocatable
275 * units of memory (here, pages), and each level above it describes 275 * units of memory (here, pages), and each level above it describes
276 * pairs of units from the levels below, hence, "buddies". 276 * pairs of units from the levels below, hence, "buddies".
277 * At a high level, all that happens here is marking the table entry 277 * At a high level, all that happens here is marking the table entry
278 * at the bottom level available, and propagating the changes upward 278 * at the bottom level available, and propagating the changes upward
279 * as necessary, plus some accounting needed to play nicely with other 279 * as necessary, plus some accounting needed to play nicely with other
280 * parts of the VM system. 280 * parts of the VM system.
281 * At each level, we keep a list of pages, which are heads of contiguous 281 * At each level, we keep a list of pages, which are heads of contiguous
282 * free pages of length (1 << order), marked with PG_private. A page's 282 * free pages of length (1 << order), marked with PG_private. A page's
283 * order is recorded in the page_private(page) field. 283 * order is recorded in the page_private(page) field.
284 * So when we are allocating or freeing one, we can derive the state of the 284 * So when we are allocating or freeing one, we can derive the state of the
285 * other. That is, if we allocate a small block, and both were 285 * other. That is, if we allocate a small block, and both were
286 * free, the remainder of the region must be split into blocks. 286 * free, the remainder of the region must be split into blocks.
287 * If a block is freed, and its buddy is also free, then this 287 * If a block is freed, and its buddy is also free, then this
288 * triggers coalescing into a block of larger size. 288 * triggers coalescing into a block of larger size.
289 * 289 *
290 * -- wli 290 * -- wli
291 */ 291 */
292 292
293 static inline void __free_pages_bulk (struct page *page, 293 static inline void __free_pages_bulk (struct page *page,
294 struct zone *zone, unsigned int order) 294 struct zone *zone, unsigned int order)
295 { 295 {
296 unsigned long page_idx; 296 unsigned long page_idx;
297 int order_size = 1 << order; 297 int order_size = 1 << order;
298 298
299 if (unlikely(order)) 299 if (unlikely(order))
300 destroy_compound_page(page, order); 300 destroy_compound_page(page, order);
301 301
302 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 302 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
303 303
304 BUG_ON(page_idx & (order_size - 1)); 304 BUG_ON(page_idx & (order_size - 1));
305 BUG_ON(bad_range(zone, page)); 305 BUG_ON(bad_range(zone, page));
306 306
307 zone->free_pages += order_size; 307 zone->free_pages += order_size;
308 while (order < MAX_ORDER-1) { 308 while (order < MAX_ORDER-1) {
309 unsigned long combined_idx; 309 unsigned long combined_idx;
310 struct free_area *area; 310 struct free_area *area;
311 struct page *buddy; 311 struct page *buddy;
312 312
313 combined_idx = __find_combined_index(page_idx, order); 313 combined_idx = __find_combined_index(page_idx, order);
314 buddy = __page_find_buddy(page, page_idx, order); 314 buddy = __page_find_buddy(page, page_idx, order);
315 315
316 if (bad_range(zone, buddy)) 316 if (bad_range(zone, buddy))
317 break; 317 break;
318 if (!page_is_buddy(buddy, order)) 318 if (!page_is_buddy(buddy, order))
319 break; /* Move the buddy up one level. */ 319 break; /* Move the buddy up one level. */
320 list_del(&buddy->lru); 320 list_del(&buddy->lru);
321 area = zone->free_area + order; 321 area = zone->free_area + order;
322 area->nr_free--; 322 area->nr_free--;
323 rmv_page_order(buddy); 323 rmv_page_order(buddy);
324 page = page + (combined_idx - page_idx); 324 page = page + (combined_idx - page_idx);
325 page_idx = combined_idx; 325 page_idx = combined_idx;
326 order++; 326 order++;
327 } 327 }
328 set_page_order(page, order); 328 set_page_order(page, order);
329 list_add(&page->lru, &zone->free_area[order].free_list); 329 list_add(&page->lru, &zone->free_area[order].free_list);
330 zone->free_area[order].nr_free++; 330 zone->free_area[order].nr_free++;
331 } 331 }
332 332
333 static inline void free_pages_check(const char *function, struct page *page) 333 static inline void free_pages_check(const char *function, struct page *page)
334 { 334 {
335 if ( page_mapcount(page) || 335 if ( page_mapcount(page) ||
336 page->mapping != NULL || 336 page->mapping != NULL ||
337 page_count(page) != 0 || 337 page_count(page) != 0 ||
338 (page->flags & ( 338 (page->flags & (
339 1 << PG_lru | 339 1 << PG_lru |
340 1 << PG_private | 340 1 << PG_private |
341 1 << PG_locked | 341 1 << PG_locked |
342 1 << PG_active | 342 1 << PG_active |
343 1 << PG_reclaim | 343 1 << PG_reclaim |
344 1 << PG_slab | 344 1 << PG_slab |
345 1 << PG_swapcache | 345 1 << PG_swapcache |
346 1 << PG_writeback | 346 1 << PG_writeback |
347 1 << PG_reserved ))) 347 1 << PG_reserved )))
348 bad_page(function, page); 348 bad_page(function, page);
349 if (PageDirty(page)) 349 if (PageDirty(page))
350 __ClearPageDirty(page); 350 __ClearPageDirty(page);
351 } 351 }
352 352
353 /* 353 /*
354 * Frees a list of pages. 354 * Frees a list of pages.
355 * Assumes all pages on list are in same zone, and of same order. 355 * Assumes all pages on list are in same zone, and of same order.
356 * count is the number of pages to free. 356 * count is the number of pages to free.
357 * 357 *
358 * If the zone was previously in an "all pages pinned" state then look to 358 * If the zone was previously in an "all pages pinned" state then look to
359 * see if this freeing clears that state. 359 * see if this freeing clears that state.
360 * 360 *
361 * And clear the zone's pages_scanned counter, to hold off the "all pages are 361 * And clear the zone's pages_scanned counter, to hold off the "all pages are
362 * pinned" detection logic. 362 * pinned" detection logic.
363 */ 363 */
364 static int 364 static int
365 free_pages_bulk(struct zone *zone, int count, 365 free_pages_bulk(struct zone *zone, int count,
366 struct list_head *list, unsigned int order) 366 struct list_head *list, unsigned int order)
367 { 367 {
368 unsigned long flags; 368 unsigned long flags;
369 struct page *page = NULL; 369 struct page *page = NULL;
370 int ret = 0; 370 int ret = 0;
371 371
372 spin_lock_irqsave(&zone->lock, flags); 372 spin_lock_irqsave(&zone->lock, flags);
373 zone->all_unreclaimable = 0; 373 zone->all_unreclaimable = 0;
374 zone->pages_scanned = 0; 374 zone->pages_scanned = 0;
375 while (!list_empty(list) && count--) { 375 while (!list_empty(list) && count--) {
376 page = list_entry(list->prev, struct page, lru); 376 page = list_entry(list->prev, struct page, lru);
377 /* have to delete it as __free_pages_bulk list manipulates */ 377 /* have to delete it as __free_pages_bulk list manipulates */
378 list_del(&page->lru); 378 list_del(&page->lru);
379 __free_pages_bulk(page, zone, order); 379 __free_pages_bulk(page, zone, order);
380 ret++; 380 ret++;
381 } 381 }
382 spin_unlock_irqrestore(&zone->lock, flags); 382 spin_unlock_irqrestore(&zone->lock, flags);
383 return ret; 383 return ret;
384 } 384 }
385 385
386 void __free_pages_ok(struct page *page, unsigned int order) 386 void __free_pages_ok(struct page *page, unsigned int order)
387 { 387 {
388 LIST_HEAD(list); 388 LIST_HEAD(list);
389 int i; 389 int i;
390 390
391 arch_free_page(page, order); 391 arch_free_page(page, order);
392 392
393 mod_page_state(pgfree, 1 << order); 393 mod_page_state(pgfree, 1 << order);
394 394
395 #ifndef CONFIG_MMU 395 #ifndef CONFIG_MMU
396 if (order > 0) 396 if (order > 0)
397 for (i = 1 ; i < (1 << order) ; ++i) 397 for (i = 1 ; i < (1 << order) ; ++i)
398 __put_page(page + i); 398 __put_page(page + i);
399 #endif 399 #endif
400 400
401 for (i = 0 ; i < (1 << order) ; ++i) 401 for (i = 0 ; i < (1 << order) ; ++i)
402 free_pages_check(__FUNCTION__, page + i); 402 free_pages_check(__FUNCTION__, page + i);
403 list_add(&page->lru, &list); 403 list_add(&page->lru, &list);
404 kernel_map_pages(page, 1<<order, 0); 404 kernel_map_pages(page, 1<<order, 0);
405 free_pages_bulk(page_zone(page), 1, &list, order); 405 free_pages_bulk(page_zone(page), 1, &list, order);
406 } 406 }
407 407
408 408
409 /* 409 /*
410 * The order of subdivision here is critical for the IO subsystem. 410 * The order of subdivision here is critical for the IO subsystem.
411 * Please do not alter this order without good reasons and regression 411 * Please do not alter this order without good reasons and regression
412 * testing. Specifically, as large blocks of memory are subdivided, 412 * testing. Specifically, as large blocks of memory are subdivided,
413 * the order in which smaller blocks are delivered depends on the order 413 * the order in which smaller blocks are delivered depends on the order
414 * they're subdivided in this function. This is the primary factor 414 * they're subdivided in this function. This is the primary factor
415 * influencing the order in which pages are delivered to the IO 415 * influencing the order in which pages are delivered to the IO
416 * subsystem according to empirical testing, and this is also justified 416 * subsystem according to empirical testing, and this is also justified
417 * by considering the behavior of a buddy system containing a single 417 * by considering the behavior of a buddy system containing a single
418 * large block of memory acted on by a series of small allocations. 418 * large block of memory acted on by a series of small allocations.
419 * This behavior is a critical factor in sglist merging's success. 419 * This behavior is a critical factor in sglist merging's success.
420 * 420 *
421 * -- wli 421 * -- wli
422 */ 422 */
423 static inline struct page * 423 static inline struct page *
424 expand(struct zone *zone, struct page *page, 424 expand(struct zone *zone, struct page *page,
425 int low, int high, struct free_area *area) 425 int low, int high, struct free_area *area)
426 { 426 {
427 unsigned long size = 1 << high; 427 unsigned long size = 1 << high;
428 428
429 while (high > low) { 429 while (high > low) {
430 area--; 430 area--;
431 high--; 431 high--;
432 size >>= 1; 432 size >>= 1;
433 BUG_ON(bad_range(zone, &page[size])); 433 BUG_ON(bad_range(zone, &page[size]));
434 list_add(&page[size].lru, &area->free_list); 434 list_add(&page[size].lru, &area->free_list);
435 area->nr_free++; 435 area->nr_free++;
436 set_page_order(&page[size], high); 436 set_page_order(&page[size], high);
437 } 437 }
438 return page; 438 return page;
439 } 439 }
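An illustrative trace (not code from the patch): satisfying an order-0 request from an order-3 free block of eight pages, expand() splits off and re-queues the upper halves at successively smaller orders and hands the first page back:

    /* low = 0, high = 3, size = 8, block = page[0..7]                     */
    /* pass 1: high = 2, size = 4 -> page[4..7] queued at order 2          */
    /* pass 2: high = 1, size = 2 -> page[2..3] queued at order 1          */
    /* pass 3: high = 0, size = 1 -> page[1]    queued at order 0          */
    /* page[0] is returned to satisfy the order-0 allocation               */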
440 440
441 void set_page_refs(struct page *page, int order) 441 void set_page_refs(struct page *page, int order)
442 { 442 {
443 #ifdef CONFIG_MMU 443 #ifdef CONFIG_MMU
444 set_page_count(page, 1); 444 set_page_count(page, 1);
445 #else 445 #else
446 int i; 446 int i;
447 447
448 /* 448 /*
449 * We need to reference all the pages for this order, otherwise if 449 * We need to reference all the pages for this order, otherwise if
450 * anyone accesses one of the pages with (get/put) it will be freed. 450 * anyone accesses one of the pages with (get/put) it will be freed.
451 * - eg: access_process_vm() 451 * - eg: access_process_vm()
452 */ 452 */
453 for (i = 0; i < (1 << order); i++) 453 for (i = 0; i < (1 << order); i++)
454 set_page_count(page + i, 1); 454 set_page_count(page + i, 1);
455 #endif /* CONFIG_MMU */ 455 #endif /* CONFIG_MMU */
456 } 456 }
457 457
458 /* 458 /*
459 * This page is about to be returned from the page allocator 459 * This page is about to be returned from the page allocator
460 */ 460 */
461 static void prep_new_page(struct page *page, int order) 461 static void prep_new_page(struct page *page, int order)
462 { 462 {
463 if ( page_mapcount(page) || 463 if ( page_mapcount(page) ||
464 page->mapping != NULL || 464 page->mapping != NULL ||
465 page_count(page) != 0 || 465 page_count(page) != 0 ||
466 (page->flags & ( 466 (page->flags & (
467 1 << PG_lru | 467 1 << PG_lru |
468 1 << PG_private | 468 1 << PG_private |
469 1 << PG_locked | 469 1 << PG_locked |
470 1 << PG_active | 470 1 << PG_active |
471 1 << PG_dirty | 471 1 << PG_dirty |
472 1 << PG_reclaim | 472 1 << PG_reclaim |
473 1 << PG_slab | 473 1 << PG_slab |
474 1 << PG_swapcache | 474 1 << PG_swapcache |
475 1 << PG_writeback | 475 1 << PG_writeback |
476 1 << PG_reserved ))) 476 1 << PG_reserved )))
477 bad_page(__FUNCTION__, page); 477 bad_page(__FUNCTION__, page);
478 478
479 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 479 page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
480 1 << PG_referenced | 1 << PG_arch_1 | 480 1 << PG_referenced | 1 << PG_arch_1 |
481 1 << PG_checked | 1 << PG_mappedtodisk); 481 1 << PG_checked | 1 << PG_mappedtodisk);
482 set_page_private(page, 0); 482 set_page_private(page, 0);
483 set_page_refs(page, order); 483 set_page_refs(page, order);
484 kernel_map_pages(page, 1 << order, 1); 484 kernel_map_pages(page, 1 << order, 1);
485 } 485 }
486 486
487 /* 487 /*
488 * Do the hard work of removing an element from the buddy allocator. 488 * Do the hard work of removing an element from the buddy allocator.
489 * Call me with the zone->lock already held. 489 * Call me with the zone->lock already held.
490 */ 490 */
491 static struct page *__rmqueue(struct zone *zone, unsigned int order) 491 static struct page *__rmqueue(struct zone *zone, unsigned int order)
492 { 492 {
493 struct free_area * area; 493 struct free_area * area;
494 unsigned int current_order; 494 unsigned int current_order;
495 struct page *page; 495 struct page *page;
496 496
497 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 497 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
498 area = zone->free_area + current_order; 498 area = zone->free_area + current_order;
499 if (list_empty(&area->free_list)) 499 if (list_empty(&area->free_list))
500 continue; 500 continue;
501 501
502 page = list_entry(area->free_list.next, struct page, lru); 502 page = list_entry(area->free_list.next, struct page, lru);
503 list_del(&page->lru); 503 list_del(&page->lru);
504 rmv_page_order(page); 504 rmv_page_order(page);
505 area->nr_free--; 505 area->nr_free--;
506 zone->free_pages -= 1UL << order; 506 zone->free_pages -= 1UL << order;
507 return expand(zone, page, order, current_order, area); 507 return expand(zone, page, order, current_order, area);
508 } 508 }
509 509
510 return NULL; 510 return NULL;
511 } 511 }
512 512
513 /* 513 /*
514 * Obtain a specified number of elements from the buddy allocator, all under 514 * Obtain a specified number of elements from the buddy allocator, all under
515 * a single hold of the lock, for efficiency. Add them to the supplied list. 515 * a single hold of the lock, for efficiency. Add them to the supplied list.
516 * Returns the number of new pages which were placed at *list. 516 * Returns the number of new pages which were placed at *list.
517 */ 517 */
518 static int rmqueue_bulk(struct zone *zone, unsigned int order, 518 static int rmqueue_bulk(struct zone *zone, unsigned int order,
519 unsigned long count, struct list_head *list) 519 unsigned long count, struct list_head *list)
520 { 520 {
521 unsigned long flags; 521 unsigned long flags;
522 int i; 522 int i;
523 int allocated = 0; 523 int allocated = 0;
524 struct page *page; 524 struct page *page;
525 525
526 spin_lock_irqsave(&zone->lock, flags); 526 spin_lock_irqsave(&zone->lock, flags);
527 for (i = 0; i < count; ++i) { 527 for (i = 0; i < count; ++i) {
528 page = __rmqueue(zone, order); 528 page = __rmqueue(zone, order);
529 if (page == NULL) 529 if (page == NULL)
530 break; 530 break;
531 allocated++; 531 allocated++;
532 list_add_tail(&page->lru, list); 532 list_add_tail(&page->lru, list);
533 } 533 }
534 spin_unlock_irqrestore(&zone->lock, flags); 534 spin_unlock_irqrestore(&zone->lock, flags);
535 return allocated; 535 return allocated;
536 } 536 }
537 537
538 #ifdef CONFIG_NUMA 538 #ifdef CONFIG_NUMA
539 /* Called from the slab reaper to drain remote pagesets */ 539 /* Called from the slab reaper to drain remote pagesets */
540 void drain_remote_pages(void) 540 void drain_remote_pages(void)
541 { 541 {
542 struct zone *zone; 542 struct zone *zone;
543 int i; 543 int i;
544 unsigned long flags; 544 unsigned long flags;
545 545
546 local_irq_save(flags); 546 local_irq_save(flags);
547 for_each_zone(zone) { 547 for_each_zone(zone) {
548 struct per_cpu_pageset *pset; 548 struct per_cpu_pageset *pset;
549 549
550 /* Do not drain local pagesets */ 550 /* Do not drain local pagesets */
551 if (zone->zone_pgdat->node_id == numa_node_id()) 551 if (zone->zone_pgdat->node_id == numa_node_id())
552 continue; 552 continue;
553 553
554 pset = zone->pageset[smp_processor_id()]; 554 pset = zone->pageset[smp_processor_id()];
555 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 555 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
556 struct per_cpu_pages *pcp; 556 struct per_cpu_pages *pcp;
557 557
558 pcp = &pset->pcp[i]; 558 pcp = &pset->pcp[i];
559 if (pcp->count) 559 if (pcp->count)
560 pcp->count -= free_pages_bulk(zone, pcp->count, 560 pcp->count -= free_pages_bulk(zone, pcp->count,
561 &pcp->list, 0); 561 &pcp->list, 0);
562 } 562 }
563 } 563 }
564 local_irq_restore(flags); 564 local_irq_restore(flags);
565 } 565 }
566 #endif 566 #endif
567 567
568 #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) 568 #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
569 static void __drain_pages(unsigned int cpu) 569 static void __drain_pages(unsigned int cpu)
570 { 570 {
571 struct zone *zone; 571 struct zone *zone;
572 int i; 572 int i;
573 573
574 for_each_zone(zone) { 574 for_each_zone(zone) {
575 struct per_cpu_pageset *pset; 575 struct per_cpu_pageset *pset;
576 576
577 pset = zone_pcp(zone, cpu); 577 pset = zone_pcp(zone, cpu);
578 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 578 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
579 struct per_cpu_pages *pcp; 579 struct per_cpu_pages *pcp;
580 580
581 pcp = &pset->pcp[i]; 581 pcp = &pset->pcp[i];
582 pcp->count -= free_pages_bulk(zone, pcp->count, 582 pcp->count -= free_pages_bulk(zone, pcp->count,
583 &pcp->list, 0); 583 &pcp->list, 0);
584 } 584 }
585 } 585 }
586 } 586 }
587 #endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */ 587 #endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
588 588
589 #ifdef CONFIG_PM 589 #ifdef CONFIG_PM
590 590
591 void mark_free_pages(struct zone *zone) 591 void mark_free_pages(struct zone *zone)
592 { 592 {
593 unsigned long zone_pfn, flags; 593 unsigned long zone_pfn, flags;
594 int order; 594 int order;
595 struct list_head *curr; 595 struct list_head *curr;
596 596
597 if (!zone->spanned_pages) 597 if (!zone->spanned_pages)
598 return; 598 return;
599 599
600 spin_lock_irqsave(&zone->lock, flags); 600 spin_lock_irqsave(&zone->lock, flags);
601 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) 601 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
602 ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn)); 602 ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn));
603 603
604 for (order = MAX_ORDER - 1; order >= 0; --order) 604 for (order = MAX_ORDER - 1; order >= 0; --order)
605 list_for_each(curr, &zone->free_area[order].free_list) { 605 list_for_each(curr, &zone->free_area[order].free_list) {
606 unsigned long start_pfn, i; 606 unsigned long start_pfn, i;
607 607
608 start_pfn = page_to_pfn(list_entry(curr, struct page, lru)); 608 start_pfn = page_to_pfn(list_entry(curr, struct page, lru));
609 609
610 for (i=0; i < (1<<order); i++) 610 for (i=0; i < (1<<order); i++)
611 SetPageNosaveFree(pfn_to_page(start_pfn+i)); 611 SetPageNosaveFree(pfn_to_page(start_pfn+i));
612 } 612 }
613 spin_unlock_irqrestore(&zone->lock, flags); 613 spin_unlock_irqrestore(&zone->lock, flags);
614 } 614 }
615 615
616 /* 616 /*
617 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 617 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
618 */ 618 */
619 void drain_local_pages(void) 619 void drain_local_pages(void)
620 { 620 {
621 unsigned long flags; 621 unsigned long flags;
622 622
623 local_irq_save(flags); 623 local_irq_save(flags);
624 __drain_pages(smp_processor_id()); 624 __drain_pages(smp_processor_id());
625 local_irq_restore(flags); 625 local_irq_restore(flags);
626 } 626 }
627 #endif /* CONFIG_PM */ 627 #endif /* CONFIG_PM */
628 628
629 static void zone_statistics(struct zonelist *zonelist, struct zone *z) 629 static void zone_statistics(struct zonelist *zonelist, struct zone *z)
630 { 630 {
631 #ifdef CONFIG_NUMA 631 #ifdef CONFIG_NUMA
632 unsigned long flags; 632 unsigned long flags;
633 int cpu; 633 int cpu;
634 pg_data_t *pg = z->zone_pgdat; 634 pg_data_t *pg = z->zone_pgdat;
635 pg_data_t *orig = zonelist->zones[0]->zone_pgdat; 635 pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
636 struct per_cpu_pageset *p; 636 struct per_cpu_pageset *p;
637 637
638 local_irq_save(flags); 638 local_irq_save(flags);
639 cpu = smp_processor_id(); 639 cpu = smp_processor_id();
640 p = zone_pcp(z,cpu); 640 p = zone_pcp(z,cpu);
641 if (pg == orig) { 641 if (pg == orig) {
642 p->numa_hit++; 642 p->numa_hit++;
643 } else { 643 } else {
644 p->numa_miss++; 644 p->numa_miss++;
645 zone_pcp(zonelist->zones[0], cpu)->numa_foreign++; 645 zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
646 } 646 }
647 if (pg == NODE_DATA(numa_node_id())) 647 if (pg == NODE_DATA(numa_node_id()))
648 p->local_node++; 648 p->local_node++;
649 else 649 else
650 p->other_node++; 650 p->other_node++;
651 local_irq_restore(flags); 651 local_irq_restore(flags);
652 #endif 652 #endif
653 } 653 }
654 654
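Editor's note: the counters above are easier to follow with a toy model. A "hit" is charged when the zone that satisfied the allocation belongs to the preferred (first-in-zonelist) node; otherwise the allocating node records a miss and the preferred node records a foreign allocation. The sketch below is a standalone user-space model, not kernel API; the struct, the node IDs, and the omission of the local_node/other_node pair are simplifications for illustration.

/* Toy model of the NUMA counters updated in zone_statistics(). */
#include <stdio.h>

struct numa_stats { unsigned long hit, miss, foreign; };

static void account(struct numa_stats *stats, int alloc_node, int preferred_node)
{
	if (alloc_node == preferred_node) {
		stats[alloc_node].hit++;
	} else {
		stats[alloc_node].miss++;		/* page came from here... */
		stats[preferred_node].foreign++;	/* ...instead of from here */
	}
}

int main(void)
{
	struct numa_stats stats[2] = { { 0 } };

	account(stats, 0, 0);	/* local allocation */
	account(stats, 1, 0);	/* spilled over to node 1 */
	printf("node0: hit=%lu foreign=%lu, node1: miss=%lu\n",
	       stats[0].hit, stats[0].foreign, stats[1].miss);
	return 0;
}
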
655 /* 655 /*
656 * Free a 0-order page 656 * Free a 0-order page
657 */ 657 */
658 static void FASTCALL(free_hot_cold_page(struct page *page, int cold)); 658 static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
659 static void fastcall free_hot_cold_page(struct page *page, int cold) 659 static void fastcall free_hot_cold_page(struct page *page, int cold)
660 { 660 {
661 struct zone *zone = page_zone(page); 661 struct zone *zone = page_zone(page);
662 struct per_cpu_pages *pcp; 662 struct per_cpu_pages *pcp;
663 unsigned long flags; 663 unsigned long flags;
664 664
665 arch_free_page(page, 0); 665 arch_free_page(page, 0);
666 666
667 kernel_map_pages(page, 1, 0); 667 kernel_map_pages(page, 1, 0);
668 inc_page_state(pgfree); 668 inc_page_state(pgfree);
669 if (PageAnon(page)) 669 if (PageAnon(page))
670 page->mapping = NULL; 670 page->mapping = NULL;
671 free_pages_check(__FUNCTION__, page); 671 free_pages_check(__FUNCTION__, page);
672 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 672 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
673 local_irq_save(flags); 673 local_irq_save(flags);
674 list_add(&page->lru, &pcp->list); 674 list_add(&page->lru, &pcp->list);
675 pcp->count++; 675 pcp->count++;
676 if (pcp->count >= pcp->high) 676 if (pcp->count >= pcp->high)
677 pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 677 pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
678 local_irq_restore(flags); 678 local_irq_restore(flags);
679 put_cpu(); 679 put_cpu();
680 } 680 }
681 681
682 void fastcall free_hot_page(struct page *page) 682 void fastcall free_hot_page(struct page *page)
683 { 683 {
684 free_hot_cold_page(page, 0); 684 free_hot_cold_page(page, 0);
685 } 685 }
686 686
687 void fastcall free_cold_page(struct page *page) 687 void fastcall free_cold_page(struct page *page)
688 { 688 {
689 free_hot_cold_page(page, 1); 689 free_hot_cold_page(page, 1);
690 } 690 }
691 691
692 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 692 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
693 { 693 {
694 int i; 694 int i;
695 695
696 BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); 696 BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
697 for(i = 0; i < (1 << order); i++) 697 for(i = 0; i < (1 << order); i++)
698 clear_highpage(page + i); 698 clear_highpage(page + i);
699 } 699 }
700 700
701 /* 701 /*
702 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 702 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
703 * we cheat by calling it from here, in the order > 0 path. Saves a branch 703 * we cheat by calling it from here, in the order > 0 path. Saves a branch
704 * or two. 704 * or two.
705 */ 705 */
706 static struct page * 706 static struct page *
707 buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) 707 buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
708 { 708 {
709 unsigned long flags; 709 unsigned long flags;
710 struct page *page = NULL; 710 struct page *page = NULL;
711 int cold = !!(gfp_flags & __GFP_COLD); 711 int cold = !!(gfp_flags & __GFP_COLD);
712 712
713 if (order == 0) { 713 if (order == 0) {
714 struct per_cpu_pages *pcp; 714 struct per_cpu_pages *pcp;
715 715
716 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 716 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
717 local_irq_save(flags); 717 local_irq_save(flags);
718 if (pcp->count <= pcp->low) 718 if (pcp->count <= pcp->low)
719 pcp->count += rmqueue_bulk(zone, 0, 719 pcp->count += rmqueue_bulk(zone, 0,
720 pcp->batch, &pcp->list); 720 pcp->batch, &pcp->list);
721 if (pcp->count) { 721 if (pcp->count) {
722 page = list_entry(pcp->list.next, struct page, lru); 722 page = list_entry(pcp->list.next, struct page, lru);
723 list_del(&page->lru); 723 list_del(&page->lru);
724 pcp->count--; 724 pcp->count--;
725 } 725 }
726 local_irq_restore(flags); 726 local_irq_restore(flags);
727 put_cpu(); 727 put_cpu();
728 } 728 }
729 729
730 if (page == NULL) { 730 if (page == NULL) {
731 spin_lock_irqsave(&zone->lock, flags); 731 spin_lock_irqsave(&zone->lock, flags);
732 page = __rmqueue(zone, order); 732 page = __rmqueue(zone, order);
733 spin_unlock_irqrestore(&zone->lock, flags); 733 spin_unlock_irqrestore(&zone->lock, flags);
734 } 734 }
735 735
736 if (page != NULL) { 736 if (page != NULL) {
737 BUG_ON(bad_range(zone, page)); 737 BUG_ON(bad_range(zone, page));
738 mod_page_state_zone(zone, pgalloc, 1 << order); 738 mod_page_state_zone(zone, pgalloc, 1 << order);
739 prep_new_page(page, order); 739 prep_new_page(page, order);
740 740
741 if (gfp_flags & __GFP_ZERO) 741 if (gfp_flags & __GFP_ZERO)
742 prep_zero_page(page, order, gfp_flags); 742 prep_zero_page(page, order, gfp_flags);
743 743
744 if (order && (gfp_flags & __GFP_COMP)) 744 if (order && (gfp_flags & __GFP_COMP))
745 prep_compound_page(page, order); 745 prep_compound_page(page, order);
746 } 746 }
747 return page; 747 return page;
748 } 748 }
749 749
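Editor's note: to make the low/high/batch thresholds used by free_hot_cold_page() and buffered_rmqueue() concrete, here is a standalone user-space model of just the counters: a free pushes one page onto the per-cpu list and spills a batch back to the buddy lists once the list grows past high; an order-0 allocation refills a batch when the list has dropped to low, then takes one page off the list. The struct name and the threshold values are invented for illustration.

/* Counter-only model of the per-cpu page (pcp) lists. */
#include <stdio.h>

struct pcp_model { int count, low, high, batch; };

static void model_free(struct pcp_model *p)
{
	p->count++;			/* page pushed onto the pcp list */
	if (p->count >= p->high)
		p->count -= p->batch;	/* spill a batch back to the buddy lists */
}

static int model_alloc(struct pcp_model *p)
{
	if (p->count <= p->low)
		p->count += p->batch;	/* pull a batch from the buddy lists */
	if (p->count) {
		p->count--;		/* hand one page to the caller */
		return 1;
	}
	return 0;			/* fall back to the zone->lock path */
}

int main(void)
{
	struct pcp_model p = { .count = 0, .low = 2, .high = 16, .batch = 8 };
	int i;

	for (i = 0; i < 5; i++)
		model_alloc(&p);
	model_free(&p);
	printf("pcp count now %d\n", p.count);
	return 0;
}
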
750 /* 750 /*
751 * Return 1 if free pages are above 'mark'. This takes into account the order 751 * Return 1 if free pages are above 'mark'. This takes into account the order
752 * of the allocation. 752 * of the allocation.
753 */ 753 */
754 int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 754 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
755 int classzone_idx, int can_try_harder, gfp_t gfp_high) 755 int classzone_idx, int can_try_harder, gfp_t gfp_high)
756 { 756 {
 757 /* free_pages may go negative - that's OK */ 757 /* free_pages may go negative - that's OK */
758 long min = mark, free_pages = z->free_pages - (1 << order) + 1; 758 long min = mark, free_pages = z->free_pages - (1 << order) + 1;
759 int o; 759 int o;
760 760
761 if (gfp_high) 761 if (gfp_high)
762 min -= min / 2; 762 min -= min / 2;
763 if (can_try_harder) 763 if (can_try_harder)
764 min -= min / 4; 764 min -= min / 4;
765 765
766 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 766 if (free_pages <= min + z->lowmem_reserve[classzone_idx])
767 return 0; 767 return 0;
768 for (o = 0; o < order; o++) { 768 for (o = 0; o < order; o++) {
769 /* At the next order, this order's pages become unavailable */ 769 /* At the next order, this order's pages become unavailable */
770 free_pages -= z->free_area[o].nr_free << o; 770 free_pages -= z->free_area[o].nr_free << o;
771 771
772 /* Require fewer higher order pages to be free */ 772 /* Require fewer higher order pages to be free */
773 min >>= 1; 773 min >>= 1;
774 774
775 if (free_pages <= min) 775 if (free_pages <= min)
776 return 0; 776 return 0;
777 } 777 }
778 return 1; 778 return 1;
779 } 779 }
780 780
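Editor's note: a standalone model of the check above, useful for working through a concrete case: subtract the pages the current request would consume, compare against the (possibly reduced) mark plus the lowmem reserve, then walk the lower orders, discounting their free pages and halving the requirement at each step. All numbers below are invented and the per-order nr_free[] array is a stand-in for zone->free_area[].nr_free.

/* User-space model of the order-aware watermark test. */
#include <stdio.h>

#define MAX_ORDER 11

static int watermark_ok(long free_pages, long mark, long lowmem_reserve,
			int order, const long nr_free[MAX_ORDER])
{
	long min = mark;
	long free = free_pages - (1 << order) + 1;
	int o;

	if (free <= min + lowmem_reserve)
		return 0;
	for (o = 0; o < order; o++) {
		/* pages of this order cannot satisfy the request */
		free -= nr_free[o] << o;
		/* require fewer free pages at each higher order */
		min >>= 1;
		if (free <= min)
			return 0;
	}
	return 1;
}

int main(void)
{
	long nr_free[MAX_ORDER] = { 512, 128, 32, 8, 2, 0 };

	/* order-3 request against a hypothetical zone */
	printf("%d\n", watermark_ok(1200, 256, 0, 3, nr_free));
	return 0;
}
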
781 static inline int 781 static inline int
782 should_reclaim_zone(struct zone *z, gfp_t gfp_mask) 782 should_reclaim_zone(struct zone *z, gfp_t gfp_mask)
783 { 783 {
784 if (!z->reclaim_pages) 784 if (!z->reclaim_pages)
785 return 0; 785 return 0;
786 if (gfp_mask & __GFP_NORECLAIM) 786 if (gfp_mask & __GFP_NORECLAIM)
787 return 0; 787 return 0;
788 return 1; 788 return 1;
789 } 789 }
790 790
791 /* 791 /*
792 * This is the 'heart' of the zoned buddy allocator. 792 * This is the 'heart' of the zoned buddy allocator.
793 */ 793 */
794 struct page * fastcall 794 struct page * fastcall
795 __alloc_pages(gfp_t gfp_mask, unsigned int order, 795 __alloc_pages(gfp_t gfp_mask, unsigned int order,
796 struct zonelist *zonelist) 796 struct zonelist *zonelist)
797 { 797 {
798 const gfp_t wait = gfp_mask & __GFP_WAIT; 798 const gfp_t wait = gfp_mask & __GFP_WAIT;
799 struct zone **zones, *z; 799 struct zone **zones, *z;
800 struct page *page; 800 struct page *page;
801 struct reclaim_state reclaim_state; 801 struct reclaim_state reclaim_state;
802 struct task_struct *p = current; 802 struct task_struct *p = current;
803 int i; 803 int i;
804 int classzone_idx; 804 int classzone_idx;
805 int do_retry; 805 int do_retry;
806 int can_try_harder; 806 int can_try_harder;
807 int did_some_progress; 807 int did_some_progress;
808 808
809 might_sleep_if(wait); 809 might_sleep_if(wait);
810 810
811 /* 811 /*
812 * The caller may dip into page reserves a bit more if the caller 812 * The caller may dip into page reserves a bit more if the caller
 813 * cannot run direct reclaim, or if the caller has realtime scheduling 813 * cannot run direct reclaim, or if the caller has realtime scheduling
814 * policy 814 * policy
815 */ 815 */
816 can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait; 816 can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
817 817
818 zones = zonelist->zones; /* the list of zones suitable for gfp_mask */ 818 zones = zonelist->zones; /* the list of zones suitable for gfp_mask */
819 819
820 if (unlikely(zones[0] == NULL)) { 820 if (unlikely(zones[0] == NULL)) {
821 /* Should this ever happen?? */ 821 /* Should this ever happen?? */
822 return NULL; 822 return NULL;
823 } 823 }
824 824
825 classzone_idx = zone_idx(zones[0]); 825 classzone_idx = zone_idx(zones[0]);
826 826
827 restart: 827 restart:
828 /* 828 /*
829 * Go through the zonelist once, looking for a zone with enough free. 829 * Go through the zonelist once, looking for a zone with enough free.
830 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 830 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
831 */ 831 */
832 for (i = 0; (z = zones[i]) != NULL; i++) { 832 for (i = 0; (z = zones[i]) != NULL; i++) {
833 int do_reclaim = should_reclaim_zone(z, gfp_mask); 833 int do_reclaim = should_reclaim_zone(z, gfp_mask);
834 834
835 if (!cpuset_zone_allowed(z, __GFP_HARDWALL)) 835 if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
836 continue; 836 continue;
837 837
838 /* 838 /*
839 * If the zone is to attempt early page reclaim then this loop 839 * If the zone is to attempt early page reclaim then this loop
840 * will try to reclaim pages and check the watermark a second 840 * will try to reclaim pages and check the watermark a second
841 * time before giving up and falling back to the next zone. 841 * time before giving up and falling back to the next zone.
842 */ 842 */
843 zone_reclaim_retry: 843 zone_reclaim_retry:
844 if (!zone_watermark_ok(z, order, z->pages_low, 844 if (!zone_watermark_ok(z, order, z->pages_low,
845 classzone_idx, 0, 0)) { 845 classzone_idx, 0, 0)) {
846 if (!do_reclaim) 846 if (!do_reclaim)
847 continue; 847 continue;
848 else { 848 else {
849 zone_reclaim(z, gfp_mask, order); 849 zone_reclaim(z, gfp_mask, order);
850 /* Only try reclaim once */ 850 /* Only try reclaim once */
851 do_reclaim = 0; 851 do_reclaim = 0;
852 goto zone_reclaim_retry; 852 goto zone_reclaim_retry;
853 } 853 }
854 } 854 }
855 855
856 page = buffered_rmqueue(z, order, gfp_mask); 856 page = buffered_rmqueue(z, order, gfp_mask);
857 if (page) 857 if (page)
858 goto got_pg; 858 goto got_pg;
859 } 859 }
860 860
861 for (i = 0; (z = zones[i]) != NULL; i++) 861 for (i = 0; (z = zones[i]) != NULL; i++)
862 wakeup_kswapd(z, order); 862 wakeup_kswapd(z, order);
863 863
864 /* 864 /*
865 * Go through the zonelist again. Let __GFP_HIGH and allocations 865 * Go through the zonelist again. Let __GFP_HIGH and allocations
 866 * coming from realtime tasks go deeper into reserves 866 * coming from realtime tasks go deeper into reserves
867 * 867 *
868 * This is the last chance, in general, before the goto nopage. 868 * This is the last chance, in general, before the goto nopage.
869 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 869 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
870 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 870 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
871 */ 871 */
872 for (i = 0; (z = zones[i]) != NULL; i++) { 872 for (i = 0; (z = zones[i]) != NULL; i++) {
873 if (!zone_watermark_ok(z, order, z->pages_min, 873 if (!zone_watermark_ok(z, order, z->pages_min,
874 classzone_idx, can_try_harder, 874 classzone_idx, can_try_harder,
875 gfp_mask & __GFP_HIGH)) 875 gfp_mask & __GFP_HIGH))
876 continue; 876 continue;
877 877
878 if (wait && !cpuset_zone_allowed(z, gfp_mask)) 878 if (wait && !cpuset_zone_allowed(z, gfp_mask))
879 continue; 879 continue;
880 880
881 page = buffered_rmqueue(z, order, gfp_mask); 881 page = buffered_rmqueue(z, order, gfp_mask);
882 if (page) 882 if (page)
883 goto got_pg; 883 goto got_pg;
884 } 884 }
885 885
886 /* This allocation should allow future memory freeing. */ 886 /* This allocation should allow future memory freeing. */
887 887
888 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) 888 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
889 && !in_interrupt()) { 889 && !in_interrupt()) {
890 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 890 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
891 /* go through the zonelist yet again, ignoring mins */ 891 /* go through the zonelist yet again, ignoring mins */
892 for (i = 0; (z = zones[i]) != NULL; i++) { 892 for (i = 0; (z = zones[i]) != NULL; i++) {
893 if (!cpuset_zone_allowed(z, gfp_mask)) 893 if (!cpuset_zone_allowed(z, gfp_mask))
894 continue; 894 continue;
895 page = buffered_rmqueue(z, order, gfp_mask); 895 page = buffered_rmqueue(z, order, gfp_mask);
896 if (page) 896 if (page)
897 goto got_pg; 897 goto got_pg;
898 } 898 }
899 } 899 }
900 goto nopage; 900 goto nopage;
901 } 901 }
902 902
903 /* Atomic allocations - we can't balance anything */ 903 /* Atomic allocations - we can't balance anything */
904 if (!wait) 904 if (!wait)
905 goto nopage; 905 goto nopage;
906 906
907 rebalance: 907 rebalance:
908 cond_resched(); 908 cond_resched();
909 909
910 /* We now go into synchronous reclaim */ 910 /* We now go into synchronous reclaim */
911 p->flags |= PF_MEMALLOC; 911 p->flags |= PF_MEMALLOC;
912 reclaim_state.reclaimed_slab = 0; 912 reclaim_state.reclaimed_slab = 0;
913 p->reclaim_state = &reclaim_state; 913 p->reclaim_state = &reclaim_state;
914 914
915 did_some_progress = try_to_free_pages(zones, gfp_mask); 915 did_some_progress = try_to_free_pages(zones, gfp_mask);
916 916
917 p->reclaim_state = NULL; 917 p->reclaim_state = NULL;
918 p->flags &= ~PF_MEMALLOC; 918 p->flags &= ~PF_MEMALLOC;
919 919
920 cond_resched(); 920 cond_resched();
921 921
922 if (likely(did_some_progress)) { 922 if (likely(did_some_progress)) {
923 for (i = 0; (z = zones[i]) != NULL; i++) { 923 for (i = 0; (z = zones[i]) != NULL; i++) {
924 if (!zone_watermark_ok(z, order, z->pages_min, 924 if (!zone_watermark_ok(z, order, z->pages_min,
925 classzone_idx, can_try_harder, 925 classzone_idx, can_try_harder,
926 gfp_mask & __GFP_HIGH)) 926 gfp_mask & __GFP_HIGH))
927 continue; 927 continue;
928 928
929 if (!cpuset_zone_allowed(z, gfp_mask)) 929 if (!cpuset_zone_allowed(z, gfp_mask))
930 continue; 930 continue;
931 931
932 page = buffered_rmqueue(z, order, gfp_mask); 932 page = buffered_rmqueue(z, order, gfp_mask);
933 if (page) 933 if (page)
934 goto got_pg; 934 goto got_pg;
935 } 935 }
936 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 936 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
937 /* 937 /*
938 * Go through the zonelist yet one more time, keep 938 * Go through the zonelist yet one more time, keep
939 * very high watermark here, this is only to catch 939 * very high watermark here, this is only to catch
940 * a parallel oom killing, we must fail if we're still 940 * a parallel oom killing, we must fail if we're still
941 * under heavy pressure. 941 * under heavy pressure.
942 */ 942 */
943 for (i = 0; (z = zones[i]) != NULL; i++) { 943 for (i = 0; (z = zones[i]) != NULL; i++) {
944 if (!zone_watermark_ok(z, order, z->pages_high, 944 if (!zone_watermark_ok(z, order, z->pages_high,
945 classzone_idx, 0, 0)) 945 classzone_idx, 0, 0))
946 continue; 946 continue;
947 947
948 if (!cpuset_zone_allowed(z, __GFP_HARDWALL)) 948 if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
949 continue; 949 continue;
950 950
951 page = buffered_rmqueue(z, order, gfp_mask); 951 page = buffered_rmqueue(z, order, gfp_mask);
952 if (page) 952 if (page)
953 goto got_pg; 953 goto got_pg;
954 } 954 }
955 955
956 out_of_memory(gfp_mask, order); 956 out_of_memory(gfp_mask, order);
957 goto restart; 957 goto restart;
958 } 958 }
959 959
960 /* 960 /*
961 * Don't let big-order allocations loop unless the caller explicitly 961 * Don't let big-order allocations loop unless the caller explicitly
962 * requests that. Wait for some write requests to complete then retry. 962 * requests that. Wait for some write requests to complete then retry.
963 * 963 *
964 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order 964 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
965 * <= 3, but that may not be true in other implementations. 965 * <= 3, but that may not be true in other implementations.
966 */ 966 */
967 do_retry = 0; 967 do_retry = 0;
968 if (!(gfp_mask & __GFP_NORETRY)) { 968 if (!(gfp_mask & __GFP_NORETRY)) {
969 if ((order <= 3) || (gfp_mask & __GFP_REPEAT)) 969 if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
970 do_retry = 1; 970 do_retry = 1;
971 if (gfp_mask & __GFP_NOFAIL) 971 if (gfp_mask & __GFP_NOFAIL)
972 do_retry = 1; 972 do_retry = 1;
973 } 973 }
974 if (do_retry) { 974 if (do_retry) {
975 blk_congestion_wait(WRITE, HZ/50); 975 blk_congestion_wait(WRITE, HZ/50);
976 goto rebalance; 976 goto rebalance;
977 } 977 }
978 978
979 nopage: 979 nopage:
980 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { 980 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
981 printk(KERN_WARNING "%s: page allocation failure." 981 printk(KERN_WARNING "%s: page allocation failure."
982 " order:%d, mode:0x%x\n", 982 " order:%d, mode:0x%x\n",
983 p->comm, order, gfp_mask); 983 p->comm, order, gfp_mask);
984 dump_stack(); 984 dump_stack();
985 show_mem(); 985 show_mem();
986 } 986 }
987 return NULL; 987 return NULL;
988 got_pg: 988 got_pg:
989 zone_statistics(zonelist, z); 989 zone_statistics(zonelist, z);
990 return page; 990 return page;
991 } 991 }
992 992
993 EXPORT_SYMBOL(__alloc_pages); 993 EXPORT_SYMBOL(__alloc_pages);
994 994
995 /* 995 /*
996 * Common helper functions. 996 * Common helper functions.
997 */ 997 */
998 fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 998 fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
999 { 999 {
1000 struct page * page; 1000 struct page * page;
1001 page = alloc_pages(gfp_mask, order); 1001 page = alloc_pages(gfp_mask, order);
1002 if (!page) 1002 if (!page)
1003 return 0; 1003 return 0;
1004 return (unsigned long) page_address(page); 1004 return (unsigned long) page_address(page);
1005 } 1005 }
1006 1006
1007 EXPORT_SYMBOL(__get_free_pages); 1007 EXPORT_SYMBOL(__get_free_pages);
1008 1008
1009 fastcall unsigned long get_zeroed_page(gfp_t gfp_mask) 1009 fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
1010 { 1010 {
1011 struct page * page; 1011 struct page * page;
1012 1012
1013 /* 1013 /*
1014 * get_zeroed_page() returns a 32-bit address, which cannot represent 1014 * get_zeroed_page() returns a 32-bit address, which cannot represent
1015 * a highmem page 1015 * a highmem page
1016 */ 1016 */
1017 BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 1017 BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1018 1018
1019 page = alloc_pages(gfp_mask | __GFP_ZERO, 0); 1019 page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
1020 if (page) 1020 if (page)
1021 return (unsigned long) page_address(page); 1021 return (unsigned long) page_address(page);
1022 return 0; 1022 return 0;
1023 } 1023 }
1024 1024
1025 EXPORT_SYMBOL(get_zeroed_page); 1025 EXPORT_SYMBOL(get_zeroed_page);
1026 1026
1027 void __pagevec_free(struct pagevec *pvec) 1027 void __pagevec_free(struct pagevec *pvec)
1028 { 1028 {
1029 int i = pagevec_count(pvec); 1029 int i = pagevec_count(pvec);
1030 1030
1031 while (--i >= 0) 1031 while (--i >= 0)
1032 free_hot_cold_page(pvec->pages[i], pvec->cold); 1032 free_hot_cold_page(pvec->pages[i], pvec->cold);
1033 } 1033 }
1034 1034
1035 fastcall void __free_pages(struct page *page, unsigned int order) 1035 fastcall void __free_pages(struct page *page, unsigned int order)
1036 { 1036 {
1037 if (put_page_testzero(page)) { 1037 if (put_page_testzero(page)) {
1038 if (order == 0) 1038 if (order == 0)
1039 free_hot_page(page); 1039 free_hot_page(page);
1040 else 1040 else
1041 __free_pages_ok(page, order); 1041 __free_pages_ok(page, order);
1042 } 1042 }
1043 } 1043 }
1044 1044
1045 EXPORT_SYMBOL(__free_pages); 1045 EXPORT_SYMBOL(__free_pages);
1046 1046
1047 fastcall void free_pages(unsigned long addr, unsigned int order) 1047 fastcall void free_pages(unsigned long addr, unsigned int order)
1048 { 1048 {
1049 if (addr != 0) { 1049 if (addr != 0) {
1050 BUG_ON(!virt_addr_valid((void *)addr)); 1050 BUG_ON(!virt_addr_valid((void *)addr));
1051 __free_pages(virt_to_page((void *)addr), order); 1051 __free_pages(virt_to_page((void *)addr), order);
1052 } 1052 }
1053 } 1053 }
1054 1054
1055 EXPORT_SYMBOL(free_pages); 1055 EXPORT_SYMBOL(free_pages);
1056 1056
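Editor's note: for reference, a minimal kernel-module-style sketch of how callers typically use these helpers. The module boilerplate is illustrative only and is not part of this patch; error handling is reduced to the bare minimum.

/* Allocate two zeroed, physically contiguous pages and free them on exit. */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/gfp.h>
#include <linux/errno.h>

static unsigned long buf;

static int __init demo_init(void)
{
	/* order 1 == two contiguous pages, zeroed before being returned */
	buf = __get_free_pages(GFP_KERNEL | __GFP_ZERO, 1);
	if (!buf)
		return -ENOMEM;
	return 0;
}

static void __exit demo_exit(void)
{
	free_pages(buf, 1);	/* order must match the allocation */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
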
1057 /* 1057 /*
1058 * Total amount of free (allocatable) RAM: 1058 * Total amount of free (allocatable) RAM:
1059 */ 1059 */
1060 unsigned int nr_free_pages(void) 1060 unsigned int nr_free_pages(void)
1061 { 1061 {
1062 unsigned int sum = 0; 1062 unsigned int sum = 0;
1063 struct zone *zone; 1063 struct zone *zone;
1064 1064
1065 for_each_zone(zone) 1065 for_each_zone(zone)
1066 sum += zone->free_pages; 1066 sum += zone->free_pages;
1067 1067
1068 return sum; 1068 return sum;
1069 } 1069 }
1070 1070
1071 EXPORT_SYMBOL(nr_free_pages); 1071 EXPORT_SYMBOL(nr_free_pages);
1072 1072
1073 #ifdef CONFIG_NUMA 1073 #ifdef CONFIG_NUMA
1074 unsigned int nr_free_pages_pgdat(pg_data_t *pgdat) 1074 unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
1075 { 1075 {
1076 unsigned int i, sum = 0; 1076 unsigned int i, sum = 0;
1077 1077
1078 for (i = 0; i < MAX_NR_ZONES; i++) 1078 for (i = 0; i < MAX_NR_ZONES; i++)
1079 sum += pgdat->node_zones[i].free_pages; 1079 sum += pgdat->node_zones[i].free_pages;
1080 1080
1081 return sum; 1081 return sum;
1082 } 1082 }
1083 #endif 1083 #endif
1084 1084
1085 static unsigned int nr_free_zone_pages(int offset) 1085 static unsigned int nr_free_zone_pages(int offset)
1086 { 1086 {
1087 /* Just pick one node, since fallback list is circular */ 1087 /* Just pick one node, since fallback list is circular */
1088 pg_data_t *pgdat = NODE_DATA(numa_node_id()); 1088 pg_data_t *pgdat = NODE_DATA(numa_node_id());
1089 unsigned int sum = 0; 1089 unsigned int sum = 0;
1090 1090
1091 struct zonelist *zonelist = pgdat->node_zonelists + offset; 1091 struct zonelist *zonelist = pgdat->node_zonelists + offset;
1092 struct zone **zonep = zonelist->zones; 1092 struct zone **zonep = zonelist->zones;
1093 struct zone *zone; 1093 struct zone *zone;
1094 1094
1095 for (zone = *zonep++; zone; zone = *zonep++) { 1095 for (zone = *zonep++; zone; zone = *zonep++) {
1096 unsigned long size = zone->present_pages; 1096 unsigned long size = zone->present_pages;
1097 unsigned long high = zone->pages_high; 1097 unsigned long high = zone->pages_high;
1098 if (size > high) 1098 if (size > high)
1099 sum += size - high; 1099 sum += size - high;
1100 } 1100 }
1101 1101
1102 return sum; 1102 return sum;
1103 } 1103 }
1104 1104
1105 /* 1105 /*
1106 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL 1106 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
1107 */ 1107 */
1108 unsigned int nr_free_buffer_pages(void) 1108 unsigned int nr_free_buffer_pages(void)
1109 { 1109 {
1110 return nr_free_zone_pages(gfp_zone(GFP_USER)); 1110 return nr_free_zone_pages(gfp_zone(GFP_USER));
1111 } 1111 }
1112 1112
1113 /* 1113 /*
1114 * Amount of free RAM allocatable within all zones 1114 * Amount of free RAM allocatable within all zones
1115 */ 1115 */
1116 unsigned int nr_free_pagecache_pages(void) 1116 unsigned int nr_free_pagecache_pages(void)
1117 { 1117 {
1118 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); 1118 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
1119 } 1119 }
1120 1120
1121 #ifdef CONFIG_HIGHMEM 1121 #ifdef CONFIG_HIGHMEM
1122 unsigned int nr_free_highpages (void) 1122 unsigned int nr_free_highpages (void)
1123 { 1123 {
1124 pg_data_t *pgdat; 1124 pg_data_t *pgdat;
1125 unsigned int pages = 0; 1125 unsigned int pages = 0;
1126 1126
1127 for_each_pgdat(pgdat) 1127 for_each_pgdat(pgdat)
1128 pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; 1128 pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
1129 1129
1130 return pages; 1130 return pages;
1131 } 1131 }
1132 #endif 1132 #endif
1133 1133
1134 #ifdef CONFIG_NUMA 1134 #ifdef CONFIG_NUMA
1135 static void show_node(struct zone *zone) 1135 static void show_node(struct zone *zone)
1136 { 1136 {
1137 printk("Node %d ", zone->zone_pgdat->node_id); 1137 printk("Node %d ", zone->zone_pgdat->node_id);
1138 } 1138 }
1139 #else 1139 #else
1140 #define show_node(zone) do { } while (0) 1140 #define show_node(zone) do { } while (0)
1141 #endif 1141 #endif
1142 1142
1143 /* 1143 /*
1144 * Accumulate the page_state information across all CPUs. 1144 * Accumulate the page_state information across all CPUs.
1145 * The result is unavoidably approximate - it can change 1145 * The result is unavoidably approximate - it can change
1146 * during and after execution of this function. 1146 * during and after execution of this function.
1147 */ 1147 */
1148 static DEFINE_PER_CPU(struct page_state, page_states) = {0}; 1148 static DEFINE_PER_CPU(struct page_state, page_states) = {0};
1149 1149
1150 atomic_t nr_pagecache = ATOMIC_INIT(0); 1150 atomic_t nr_pagecache = ATOMIC_INIT(0);
1151 EXPORT_SYMBOL(nr_pagecache); 1151 EXPORT_SYMBOL(nr_pagecache);
1152 #ifdef CONFIG_SMP 1152 #ifdef CONFIG_SMP
1153 DEFINE_PER_CPU(long, nr_pagecache_local) = 0; 1153 DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
1154 #endif 1154 #endif
1155 1155
1156 void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) 1156 void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
1157 { 1157 {
1158 int cpu = 0; 1158 int cpu = 0;
1159 1159
1160 memset(ret, 0, sizeof(*ret)); 1160 memset(ret, 0, sizeof(*ret));
1161 cpus_and(*cpumask, *cpumask, cpu_online_map); 1161 cpus_and(*cpumask, *cpumask, cpu_online_map);
1162 1162
1163 cpu = first_cpu(*cpumask); 1163 cpu = first_cpu(*cpumask);
1164 while (cpu < NR_CPUS) { 1164 while (cpu < NR_CPUS) {
1165 unsigned long *in, *out, off; 1165 unsigned long *in, *out, off;
1166 1166
1167 in = (unsigned long *)&per_cpu(page_states, cpu); 1167 in = (unsigned long *)&per_cpu(page_states, cpu);
1168 1168
1169 cpu = next_cpu(cpu, *cpumask); 1169 cpu = next_cpu(cpu, *cpumask);
1170 1170
1171 if (cpu < NR_CPUS) 1171 if (cpu < NR_CPUS)
1172 prefetch(&per_cpu(page_states, cpu)); 1172 prefetch(&per_cpu(page_states, cpu));
1173 1173
1174 out = (unsigned long *)ret; 1174 out = (unsigned long *)ret;
1175 for (off = 0; off < nr; off++) 1175 for (off = 0; off < nr; off++)
1176 *out++ += *in++; 1176 *out++ += *in++;
1177 } 1177 }
1178 } 1178 }
1179 1179
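Editor's note: the accumulation loop above treats struct page_state as a flat array of unsigned longs and sums it field by field across CPUs. Below is a standalone model of that idea with an invented three-field struct and made-up per-CPU values; it is not the kernel's page_state layout.

/* Sum per-CPU counter structs into one snapshot, field by field. */
#include <stdio.h>
#include <string.h>

#define NCPUS 4

struct page_state_model { unsigned long nr_dirty, nr_writeback, pgfree; };

static struct page_state_model per_cpu_state[NCPUS] = {
	{ 3, 1, 100 }, { 0, 2, 250 }, { 5, 0, 80 }, { 1, 1, 40 },
};

static void get_state(struct page_state_model *ret)
{
	size_t nr = sizeof(*ret) / sizeof(unsigned long);
	unsigned long *in, *out;
	size_t off;
	int cpu;

	memset(ret, 0, sizeof(*ret));
	for (cpu = 0; cpu < NCPUS; cpu++) {
		in = (unsigned long *)&per_cpu_state[cpu];
		out = (unsigned long *)ret;
		for (off = 0; off < nr; off++)
			out[off] += in[off];
	}
}

int main(void)
{
	struct page_state_model s;

	get_state(&s);
	printf("dirty=%lu writeback=%lu pgfree=%lu\n",
	       s.nr_dirty, s.nr_writeback, s.pgfree);
	return 0;
}
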
1180 void get_page_state_node(struct page_state *ret, int node) 1180 void get_page_state_node(struct page_state *ret, int node)
1181 { 1181 {
1182 int nr; 1182 int nr;
1183 cpumask_t mask = node_to_cpumask(node); 1183 cpumask_t mask = node_to_cpumask(node);
1184 1184
1185 nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); 1185 nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
1186 nr /= sizeof(unsigned long); 1186 nr /= sizeof(unsigned long);
1187 1187
1188 __get_page_state(ret, nr+1, &mask); 1188 __get_page_state(ret, nr+1, &mask);
1189 } 1189 }
1190 1190
1191 void get_page_state(struct page_state *ret) 1191 void get_page_state(struct page_state *ret)
1192 { 1192 {
1193 int nr; 1193 int nr;
1194 cpumask_t mask = CPU_MASK_ALL; 1194 cpumask_t mask = CPU_MASK_ALL;
1195 1195
1196 nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); 1196 nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
1197 nr /= sizeof(unsigned long); 1197 nr /= sizeof(unsigned long);
1198 1198
1199 __get_page_state(ret, nr + 1, &mask); 1199 __get_page_state(ret, nr + 1, &mask);
1200 } 1200 }
1201 1201
1202 void get_full_page_state(struct page_state *ret) 1202 void get_full_page_state(struct page_state *ret)
1203 { 1203 {
1204 cpumask_t mask = CPU_MASK_ALL; 1204 cpumask_t mask = CPU_MASK_ALL;
1205 1205
1206 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); 1206 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
1207 } 1207 }
1208 1208
1209 unsigned long __read_page_state(unsigned long offset) 1209 unsigned long __read_page_state(unsigned long offset)
1210 { 1210 {
1211 unsigned long ret = 0; 1211 unsigned long ret = 0;
1212 int cpu; 1212 int cpu;
1213 1213
1214 for_each_online_cpu(cpu) { 1214 for_each_online_cpu(cpu) {
1215 unsigned long in; 1215 unsigned long in;
1216 1216
1217 in = (unsigned long)&per_cpu(page_states, cpu) + offset; 1217 in = (unsigned long)&per_cpu(page_states, cpu) + offset;
1218 ret += *((unsigned long *)in); 1218 ret += *((unsigned long *)in);
1219 } 1219 }
1220 return ret; 1220 return ret;
1221 } 1221 }
1222 1222
1223 void __mod_page_state(unsigned long offset, unsigned long delta) 1223 void __mod_page_state(unsigned long offset, unsigned long delta)
1224 { 1224 {
1225 unsigned long flags; 1225 unsigned long flags;
1226 void* ptr; 1226 void* ptr;
1227 1227
1228 local_irq_save(flags); 1228 local_irq_save(flags);
1229 ptr = &__get_cpu_var(page_states); 1229 ptr = &__get_cpu_var(page_states);
1230 *(unsigned long*)(ptr + offset) += delta; 1230 *(unsigned long*)(ptr + offset) += delta;
1231 local_irq_restore(flags); 1231 local_irq_restore(flags);
1232 } 1232 }
1233 1233
1234 EXPORT_SYMBOL(__mod_page_state); 1234 EXPORT_SYMBOL(__mod_page_state);
1235 1235
1236 void __get_zone_counts(unsigned long *active, unsigned long *inactive, 1236 void __get_zone_counts(unsigned long *active, unsigned long *inactive,
1237 unsigned long *free, struct pglist_data *pgdat) 1237 unsigned long *free, struct pglist_data *pgdat)
1238 { 1238 {
1239 struct zone *zones = pgdat->node_zones; 1239 struct zone *zones = pgdat->node_zones;
1240 int i; 1240 int i;
1241 1241
1242 *active = 0; 1242 *active = 0;
1243 *inactive = 0; 1243 *inactive = 0;
1244 *free = 0; 1244 *free = 0;
1245 for (i = 0; i < MAX_NR_ZONES; i++) { 1245 for (i = 0; i < MAX_NR_ZONES; i++) {
1246 *active += zones[i].nr_active; 1246 *active += zones[i].nr_active;
1247 *inactive += zones[i].nr_inactive; 1247 *inactive += zones[i].nr_inactive;
1248 *free += zones[i].free_pages; 1248 *free += zones[i].free_pages;
1249 } 1249 }
1250 } 1250 }
1251 1251
1252 void get_zone_counts(unsigned long *active, 1252 void get_zone_counts(unsigned long *active,
1253 unsigned long *inactive, unsigned long *free) 1253 unsigned long *inactive, unsigned long *free)
1254 { 1254 {
1255 struct pglist_data *pgdat; 1255 struct pglist_data *pgdat;
1256 1256
1257 *active = 0; 1257 *active = 0;
1258 *inactive = 0; 1258 *inactive = 0;
1259 *free = 0; 1259 *free = 0;
1260 for_each_pgdat(pgdat) { 1260 for_each_pgdat(pgdat) {
1261 unsigned long l, m, n; 1261 unsigned long l, m, n;
1262 __get_zone_counts(&l, &m, &n, pgdat); 1262 __get_zone_counts(&l, &m, &n, pgdat);
1263 *active += l; 1263 *active += l;
1264 *inactive += m; 1264 *inactive += m;
1265 *free += n; 1265 *free += n;
1266 } 1266 }
1267 } 1267 }
1268 1268
1269 void si_meminfo(struct sysinfo *val) 1269 void si_meminfo(struct sysinfo *val)
1270 { 1270 {
1271 val->totalram = totalram_pages; 1271 val->totalram = totalram_pages;
1272 val->sharedram = 0; 1272 val->sharedram = 0;
1273 val->freeram = nr_free_pages(); 1273 val->freeram = nr_free_pages();
1274 val->bufferram = nr_blockdev_pages(); 1274 val->bufferram = nr_blockdev_pages();
1275 #ifdef CONFIG_HIGHMEM 1275 #ifdef CONFIG_HIGHMEM
1276 val->totalhigh = totalhigh_pages; 1276 val->totalhigh = totalhigh_pages;
1277 val->freehigh = nr_free_highpages(); 1277 val->freehigh = nr_free_highpages();
1278 #else 1278 #else
1279 val->totalhigh = 0; 1279 val->totalhigh = 0;
1280 val->freehigh = 0; 1280 val->freehigh = 0;
1281 #endif 1281 #endif
1282 val->mem_unit = PAGE_SIZE; 1282 val->mem_unit = PAGE_SIZE;
1283 } 1283 }
1284 1284
1285 EXPORT_SYMBOL(si_meminfo); 1285 EXPORT_SYMBOL(si_meminfo);
1286 1286
1287 #ifdef CONFIG_NUMA 1287 #ifdef CONFIG_NUMA
1288 void si_meminfo_node(struct sysinfo *val, int nid) 1288 void si_meminfo_node(struct sysinfo *val, int nid)
1289 { 1289 {
1290 pg_data_t *pgdat = NODE_DATA(nid); 1290 pg_data_t *pgdat = NODE_DATA(nid);
1291 1291
1292 val->totalram = pgdat->node_present_pages; 1292 val->totalram = pgdat->node_present_pages;
1293 val->freeram = nr_free_pages_pgdat(pgdat); 1293 val->freeram = nr_free_pages_pgdat(pgdat);
1294 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; 1294 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
1295 val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; 1295 val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
1296 val->mem_unit = PAGE_SIZE; 1296 val->mem_unit = PAGE_SIZE;
1297 } 1297 }
1298 #endif 1298 #endif
1299 1299
1300 #define K(x) ((x) << (PAGE_SHIFT-10)) 1300 #define K(x) ((x) << (PAGE_SHIFT-10))
1301 1301
1302 /* 1302 /*
1303 * Show free area list (used inside shift_scroll-lock stuff) 1303 * Show free area list (used inside shift_scroll-lock stuff)
1304 * We also calculate the percentage fragmentation. We do this by counting the 1304 * We also calculate the percentage fragmentation. We do this by counting the
1305 * memory on each free list with the exception of the first item on the list. 1305 * memory on each free list with the exception of the first item on the list.
1306 */ 1306 */
1307 void show_free_areas(void) 1307 void show_free_areas(void)
1308 { 1308 {
1309 struct page_state ps; 1309 struct page_state ps;
1310 int cpu, temperature; 1310 int cpu, temperature;
1311 unsigned long active; 1311 unsigned long active;
1312 unsigned long inactive; 1312 unsigned long inactive;
1313 unsigned long free; 1313 unsigned long free;
1314 struct zone *zone; 1314 struct zone *zone;
1315 1315
1316 for_each_zone(zone) { 1316 for_each_zone(zone) {
1317 show_node(zone); 1317 show_node(zone);
1318 printk("%s per-cpu:", zone->name); 1318 printk("%s per-cpu:", zone->name);
1319 1319
1320 if (!zone->present_pages) { 1320 if (!zone->present_pages) {
1321 printk(" empty\n"); 1321 printk(" empty\n");
1322 continue; 1322 continue;
1323 } else 1323 } else
1324 printk("\n"); 1324 printk("\n");
1325 1325
1326 for (cpu = 0; cpu < NR_CPUS; ++cpu) { 1326 for (cpu = 0; cpu < NR_CPUS; ++cpu) {
1327 struct per_cpu_pageset *pageset; 1327 struct per_cpu_pageset *pageset;
1328 1328
1329 if (!cpu_possible(cpu)) 1329 if (!cpu_possible(cpu))
1330 continue; 1330 continue;
1331 1331
1332 pageset = zone_pcp(zone, cpu); 1332 pageset = zone_pcp(zone, cpu);
1333 1333
1334 for (temperature = 0; temperature < 2; temperature++) 1334 for (temperature = 0; temperature < 2; temperature++)
1335 printk("cpu %d %s: low %d, high %d, batch %d used:%d\n", 1335 printk("cpu %d %s: low %d, high %d, batch %d used:%d\n",
1336 cpu, 1336 cpu,
1337 temperature ? "cold" : "hot", 1337 temperature ? "cold" : "hot",
1338 pageset->pcp[temperature].low, 1338 pageset->pcp[temperature].low,
1339 pageset->pcp[temperature].high, 1339 pageset->pcp[temperature].high,
1340 pageset->pcp[temperature].batch, 1340 pageset->pcp[temperature].batch,
1341 pageset->pcp[temperature].count); 1341 pageset->pcp[temperature].count);
1342 } 1342 }
1343 } 1343 }
1344 1344
1345 get_page_state(&ps); 1345 get_page_state(&ps);
1346 get_zone_counts(&active, &inactive, &free); 1346 get_zone_counts(&active, &inactive, &free);
1347 1347
1348 printk("Free pages: %11ukB (%ukB HighMem)\n", 1348 printk("Free pages: %11ukB (%ukB HighMem)\n",
1349 K(nr_free_pages()), 1349 K(nr_free_pages()),
1350 K(nr_free_highpages())); 1350 K(nr_free_highpages()));
1351 1351
1352 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " 1352 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
1353 "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", 1353 "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
1354 active, 1354 active,
1355 inactive, 1355 inactive,
1356 ps.nr_dirty, 1356 ps.nr_dirty,
1357 ps.nr_writeback, 1357 ps.nr_writeback,
1358 ps.nr_unstable, 1358 ps.nr_unstable,
1359 nr_free_pages(), 1359 nr_free_pages(),
1360 ps.nr_slab, 1360 ps.nr_slab,
1361 ps.nr_mapped, 1361 ps.nr_mapped,
1362 ps.nr_page_table_pages); 1362 ps.nr_page_table_pages);
1363 1363
1364 for_each_zone(zone) { 1364 for_each_zone(zone) {
1365 int i; 1365 int i;
1366 1366
1367 show_node(zone); 1367 show_node(zone);
1368 printk("%s" 1368 printk("%s"
1369 " free:%lukB" 1369 " free:%lukB"
1370 " min:%lukB" 1370 " min:%lukB"
1371 " low:%lukB" 1371 " low:%lukB"
1372 " high:%lukB" 1372 " high:%lukB"
1373 " active:%lukB" 1373 " active:%lukB"
1374 " inactive:%lukB" 1374 " inactive:%lukB"
1375 " present:%lukB" 1375 " present:%lukB"
1376 " pages_scanned:%lu" 1376 " pages_scanned:%lu"
1377 " all_unreclaimable? %s" 1377 " all_unreclaimable? %s"
1378 "\n", 1378 "\n",
1379 zone->name, 1379 zone->name,
1380 K(zone->free_pages), 1380 K(zone->free_pages),
1381 K(zone->pages_min), 1381 K(zone->pages_min),
1382 K(zone->pages_low), 1382 K(zone->pages_low),
1383 K(zone->pages_high), 1383 K(zone->pages_high),
1384 K(zone->nr_active), 1384 K(zone->nr_active),
1385 K(zone->nr_inactive), 1385 K(zone->nr_inactive),
1386 K(zone->present_pages), 1386 K(zone->present_pages),
1387 zone->pages_scanned, 1387 zone->pages_scanned,
1388 (zone->all_unreclaimable ? "yes" : "no") 1388 (zone->all_unreclaimable ? "yes" : "no")
1389 ); 1389 );
1390 printk("lowmem_reserve[]:"); 1390 printk("lowmem_reserve[]:");
1391 for (i = 0; i < MAX_NR_ZONES; i++) 1391 for (i = 0; i < MAX_NR_ZONES; i++)
1392 printk(" %lu", zone->lowmem_reserve[i]); 1392 printk(" %lu", zone->lowmem_reserve[i]);
1393 printk("\n"); 1393 printk("\n");
1394 } 1394 }
1395 1395
1396 for_each_zone(zone) { 1396 for_each_zone(zone) {
1397 unsigned long nr, flags, order, total = 0; 1397 unsigned long nr, flags, order, total = 0;
1398 1398
1399 show_node(zone); 1399 show_node(zone);
1400 printk("%s: ", zone->name); 1400 printk("%s: ", zone->name);
1401 if (!zone->present_pages) { 1401 if (!zone->present_pages) {
1402 printk("empty\n"); 1402 printk("empty\n");
1403 continue; 1403 continue;
1404 } 1404 }
1405 1405
1406 spin_lock_irqsave(&zone->lock, flags); 1406 spin_lock_irqsave(&zone->lock, flags);
1407 for (order = 0; order < MAX_ORDER; order++) { 1407 for (order = 0; order < MAX_ORDER; order++) {
1408 nr = zone->free_area[order].nr_free; 1408 nr = zone->free_area[order].nr_free;
1409 total += nr << order; 1409 total += nr << order;
1410 printk("%lu*%lukB ", nr, K(1UL) << order); 1410 printk("%lu*%lukB ", nr, K(1UL) << order);
1411 } 1411 }
1412 spin_unlock_irqrestore(&zone->lock, flags); 1412 spin_unlock_irqrestore(&zone->lock, flags);
1413 printk("= %lukB\n", K(total)); 1413 printk("= %lukB\n", K(total));
1414 } 1414 }
1415 1415
1416 show_swap_cache_info(); 1416 show_swap_cache_info();
1417 } 1417 }
1418 1418
1419 /* 1419 /*
1420 * Builds allocation fallback zone lists. 1420 * Builds allocation fallback zone lists.
1421 */ 1421 */
1422 static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) 1422 static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
1423 { 1423 {
1424 switch (k) { 1424 switch (k) {
1425 struct zone *zone; 1425 struct zone *zone;
1426 default: 1426 default:
1427 BUG(); 1427 BUG();
1428 case ZONE_HIGHMEM: 1428 case ZONE_HIGHMEM:
1429 zone = pgdat->node_zones + ZONE_HIGHMEM; 1429 zone = pgdat->node_zones + ZONE_HIGHMEM;
1430 if (zone->present_pages) { 1430 if (zone->present_pages) {
1431 #ifndef CONFIG_HIGHMEM 1431 #ifndef CONFIG_HIGHMEM
1432 BUG(); 1432 BUG();
1433 #endif 1433 #endif
1434 zonelist->zones[j++] = zone; 1434 zonelist->zones[j++] = zone;
1435 } 1435 }
1436 case ZONE_NORMAL: 1436 case ZONE_NORMAL:
1437 zone = pgdat->node_zones + ZONE_NORMAL; 1437 zone = pgdat->node_zones + ZONE_NORMAL;
1438 if (zone->present_pages) 1438 if (zone->present_pages)
1439 zonelist->zones[j++] = zone; 1439 zonelist->zones[j++] = zone;
1440 case ZONE_DMA: 1440 case ZONE_DMA:
1441 zone = pgdat->node_zones + ZONE_DMA; 1441 zone = pgdat->node_zones + ZONE_DMA;
1442 if (zone->present_pages) 1442 if (zone->present_pages)
1443 zonelist->zones[j++] = zone; 1443 zonelist->zones[j++] = zone;
1444 } 1444 }
1445 1445
1446 return j; 1446 return j;
1447 } 1447 }
1448 1448
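Editor's note: the switch above falls through deliberately. Starting from the highest zone the allocation may use, every lower zone that has memory is appended to the zonelist as well. A standalone illustration of that ordering, with invented zone sizes (the enum here only mimics the kernel's zone indices):

/* Build a fallback order by walking down from the highest allowed zone. */
#include <stdio.h>

enum { MODEL_ZONE_DMA, MODEL_ZONE_NORMAL, MODEL_ZONE_HIGHMEM, MODEL_NR_ZONES };

int main(void)
{
	const char *name[MODEL_NR_ZONES] = { "DMA", "Normal", "HighMem" };
	unsigned long present[MODEL_NR_ZONES] = { 4096, 221184, 32768 };
	int highest = MODEL_ZONE_HIGHMEM;	/* e.g. a GFP_HIGHUSER allocation */
	int k;

	printf("fallback order:");
	for (k = highest; k >= MODEL_ZONE_DMA; k--)
		if (present[k])		/* skip zones with no memory */
			printf(" %s", name[k]);
	printf("\n");
	return 0;
}
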
1449 static inline int highest_zone(int zone_bits) 1449 static inline int highest_zone(int zone_bits)
1450 { 1450 {
1451 int res = ZONE_NORMAL; 1451 int res = ZONE_NORMAL;
1452 if (zone_bits & (__force int)__GFP_HIGHMEM) 1452 if (zone_bits & (__force int)__GFP_HIGHMEM)
1453 res = ZONE_HIGHMEM; 1453 res = ZONE_HIGHMEM;
1454 if (zone_bits & (__force int)__GFP_DMA) 1454 if (zone_bits & (__force int)__GFP_DMA)
1455 res = ZONE_DMA; 1455 res = ZONE_DMA;
1456 return res; 1456 return res;
1457 } 1457 }
1458 1458
1459 #ifdef CONFIG_NUMA 1459 #ifdef CONFIG_NUMA
1460 #define MAX_NODE_LOAD (num_online_nodes()) 1460 #define MAX_NODE_LOAD (num_online_nodes())
1461 static int __initdata node_load[MAX_NUMNODES]; 1461 static int __initdata node_load[MAX_NUMNODES];
1462 /** 1462 /**
1463 * find_next_best_node - find the next node that should appear in a given node's fallback list 1463 * find_next_best_node - find the next node that should appear in a given node's fallback list
1464 * @node: node whose fallback list we're appending 1464 * @node: node whose fallback list we're appending
1465 * @used_node_mask: nodemask_t of already used nodes 1465 * @used_node_mask: nodemask_t of already used nodes
1466 * 1466 *
1467 * We use a number of factors to determine which is the next node that should 1467 * We use a number of factors to determine which is the next node that should
1468 * appear on a given node's fallback list. The node should not have appeared 1468 * appear on a given node's fallback list. The node should not have appeared
1469 * already in @node's fallback list, and it should be the next closest node 1469 * already in @node's fallback list, and it should be the next closest node
1470 * according to the distance array (which contains arbitrary distance values 1470 * according to the distance array (which contains arbitrary distance values
1471 * from each node to each node in the system), and should also prefer nodes 1471 * from each node to each node in the system), and should also prefer nodes
1472 * with no CPUs, since presumably they'll have very little allocation pressure 1472 * with no CPUs, since presumably they'll have very little allocation pressure
1473 * on them otherwise. 1473 * on them otherwise.
1474 * It returns -1 if no node is found. 1474 * It returns -1 if no node is found.
1475 */ 1475 */
1476 static int __init find_next_best_node(int node, nodemask_t *used_node_mask) 1476 static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
1477 { 1477 {
1478 int i, n, val; 1478 int i, n, val;
1479 int min_val = INT_MAX; 1479 int min_val = INT_MAX;
1480 int best_node = -1; 1480 int best_node = -1;
1481 1481
1482 for_each_online_node(i) { 1482 for_each_online_node(i) {
1483 cpumask_t tmp; 1483 cpumask_t tmp;
1484 1484
1485 /* Start from local node */ 1485 /* Start from local node */
1486 n = (node+i) % num_online_nodes(); 1486 n = (node+i) % num_online_nodes();
1487 1487
1488 /* Don't want a node to appear more than once */ 1488 /* Don't want a node to appear more than once */
1489 if (node_isset(n, *used_node_mask)) 1489 if (node_isset(n, *used_node_mask))
1490 continue; 1490 continue;
1491 1491
1492 /* Use the local node if we haven't already */ 1492 /* Use the local node if we haven't already */
1493 if (!node_isset(node, *used_node_mask)) { 1493 if (!node_isset(node, *used_node_mask)) {
1494 best_node = node; 1494 best_node = node;
1495 break; 1495 break;
1496 } 1496 }
1497 1497
1498 /* Use the distance array to find the distance */ 1498 /* Use the distance array to find the distance */
1499 val = node_distance(node, n); 1499 val = node_distance(node, n);
1500 1500
1501 /* Give preference to headless and unused nodes */ 1501 /* Give preference to headless and unused nodes */
1502 tmp = node_to_cpumask(n); 1502 tmp = node_to_cpumask(n);
1503 if (!cpus_empty(tmp)) 1503 if (!cpus_empty(tmp))
1504 val += PENALTY_FOR_NODE_WITH_CPUS; 1504 val += PENALTY_FOR_NODE_WITH_CPUS;
1505 1505
1506 /* Slight preference for less loaded node */ 1506 /* Slight preference for less loaded node */
1507 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 1507 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
1508 val += node_load[n]; 1508 val += node_load[n];
1509 1509
1510 if (val < min_val) { 1510 if (val < min_val) {
1511 min_val = val; 1511 min_val = val;
1512 best_node = n; 1512 best_node = n;
1513 } 1513 }
1514 } 1514 }
1515 1515
1516 if (best_node >= 0) 1516 if (best_node >= 0)
1517 node_set(best_node, *used_node_mask); 1517 node_set(best_node, *used_node_mask);
1518 1518
1519 return best_node; 1519 return best_node;
1520 } 1520 }
1521 1521
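Editor's note: a standalone model of the distance-driven part of this heuristic: repeatedly pick the nearest node that has not been used yet. The CPU and node-load penalties are omitted here, and the distance matrix is invented, so this only sketches the core ordering idea.

/* Order NUMA nodes for node 0's fallback list by ascending distance. */
#include <stdio.h>
#include <limits.h>

#define NODES 4

static const int dist[NODES][NODES] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 30, 20 },
	{ 20, 30, 10, 20 },
	{ 30, 20, 20, 10 },
};

int main(void)
{
	int used[NODES] = { 0 };
	int order[NODES], n;

	for (n = 0; n < NODES; n++) {
		int best = -1, best_val = INT_MAX, i;

		for (i = 0; i < NODES; i++) {
			if (used[i])
				continue;
			/* distance from node 0, the node we build the list for */
			if (dist[0][i] < best_val) {
				best_val = dist[0][i];
				best = i;
			}
		}
		used[best] = 1;
		order[n] = best;
	}

	printf("fallback order for node 0:");
	for (n = 0; n < NODES; n++)
		printf(" %d", order[n]);
	printf("\n");
	return 0;
}
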
1522 static void __init build_zonelists(pg_data_t *pgdat) 1522 static void __init build_zonelists(pg_data_t *pgdat)
1523 { 1523 {
1524 int i, j, k, node, local_node; 1524 int i, j, k, node, local_node;
1525 int prev_node, load; 1525 int prev_node, load;
1526 struct zonelist *zonelist; 1526 struct zonelist *zonelist;
1527 nodemask_t used_mask; 1527 nodemask_t used_mask;
1528 1528
1529 /* initialize zonelists */ 1529 /* initialize zonelists */
1530 for (i = 0; i < GFP_ZONETYPES; i++) { 1530 for (i = 0; i < GFP_ZONETYPES; i++) {
1531 zonelist = pgdat->node_zonelists + i; 1531 zonelist = pgdat->node_zonelists + i;
1532 zonelist->zones[0] = NULL; 1532 zonelist->zones[0] = NULL;
1533 } 1533 }
1534 1534
1535 /* NUMA-aware ordering of nodes */ 1535 /* NUMA-aware ordering of nodes */
1536 local_node = pgdat->node_id; 1536 local_node = pgdat->node_id;
1537 load = num_online_nodes(); 1537 load = num_online_nodes();
1538 prev_node = local_node; 1538 prev_node = local_node;
1539 nodes_clear(used_mask); 1539 nodes_clear(used_mask);
1540 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 1540 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
1541 /* 1541 /*
1542 * We don't want to pressure a particular node. 1542 * We don't want to pressure a particular node.
 1543 * So we add a penalty to the first node in the same 1543 * So we add a penalty to the first node in the same
 1544 * distance group to make it round-robin. 1544 * distance group to make it round-robin.
1545 */ 1545 */
1546 if (node_distance(local_node, node) != 1546 if (node_distance(local_node, node) !=
1547 node_distance(local_node, prev_node)) 1547 node_distance(local_node, prev_node))
1548 node_load[node] += load; 1548 node_load[node] += load;
1549 prev_node = node; 1549 prev_node = node;
1550 load--; 1550 load--;
1551 for (i = 0; i < GFP_ZONETYPES; i++) { 1551 for (i = 0; i < GFP_ZONETYPES; i++) {
1552 zonelist = pgdat->node_zonelists + i; 1552 zonelist = pgdat->node_zonelists + i;
1553 for (j = 0; zonelist->zones[j] != NULL; j++); 1553 for (j = 0; zonelist->zones[j] != NULL; j++);
1554 1554
1555 k = highest_zone(i); 1555 k = highest_zone(i);
1556 1556
1557 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); 1557 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1558 zonelist->zones[j] = NULL; 1558 zonelist->zones[j] = NULL;
1559 } 1559 }
1560 } 1560 }
1561 } 1561 }
1562 1562
1563 #else /* CONFIG_NUMA */ 1563 #else /* CONFIG_NUMA */
1564 1564
1565 static void __init build_zonelists(pg_data_t *pgdat) 1565 static void __init build_zonelists(pg_data_t *pgdat)
1566 { 1566 {
1567 int i, j, k, node, local_node; 1567 int i, j, k, node, local_node;
1568 1568
1569 local_node = pgdat->node_id; 1569 local_node = pgdat->node_id;
1570 for (i = 0; i < GFP_ZONETYPES; i++) { 1570 for (i = 0; i < GFP_ZONETYPES; i++) {
1571 struct zonelist *zonelist; 1571 struct zonelist *zonelist;
1572 1572
1573 zonelist = pgdat->node_zonelists + i; 1573 zonelist = pgdat->node_zonelists + i;
1574 1574
1575 j = 0; 1575 j = 0;
1576 k = highest_zone(i); 1576 k = highest_zone(i);
1577 j = build_zonelists_node(pgdat, zonelist, j, k); 1577 j = build_zonelists_node(pgdat, zonelist, j, k);
1578 /* 1578 /*
1579 * Now we build the zonelist so that it contains the zones 1579 * Now we build the zonelist so that it contains the zones
1580 * of all the other nodes. 1580 * of all the other nodes.
1581 * We don't want to pressure a particular node, so when 1581 * We don't want to pressure a particular node, so when
1582 * building the zones for node N, we make sure that the 1582 * building the zones for node N, we make sure that the
1583 * zones coming right after the local ones are those from 1583 * zones coming right after the local ones are those from
1584 * node N+1 (modulo N) 1584 * node N+1 (modulo N)
1585 */ 1585 */
1586 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 1586 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
1587 if (!node_online(node)) 1587 if (!node_online(node))
1588 continue; 1588 continue;
1589 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); 1589 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1590 } 1590 }
1591 for (node = 0; node < local_node; node++) { 1591 for (node = 0; node < local_node; node++) {
1592 if (!node_online(node)) 1592 if (!node_online(node))
1593 continue; 1593 continue;
1594 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); 1594 j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
1595 } 1595 }
1596 1596
1597 zonelist->zones[j] = NULL; 1597 zonelist->zones[j] = NULL;
1598 } 1598 }
1599 } 1599 }
1600 1600
1601 #endif /* CONFIG_NUMA */ 1601 #endif /* CONFIG_NUMA */
1602 1602
1603 void __init build_all_zonelists(void) 1603 void __init build_all_zonelists(void)
1604 { 1604 {
1605 int i; 1605 int i;
1606 1606
1607 for_each_online_node(i) 1607 for_each_online_node(i)
1608 build_zonelists(NODE_DATA(i)); 1608 build_zonelists(NODE_DATA(i));
1609 printk("Built %i zonelists\n", num_online_nodes()); 1609 printk("Built %i zonelists\n", num_online_nodes());
1610 cpuset_init_current_mems_allowed(); 1610 cpuset_init_current_mems_allowed();
1611 } 1611 }
1612 1612
1613 /* 1613 /*
1614 * Helper functions to size the waitqueue hash table. 1614 * Helper functions to size the waitqueue hash table.
1615 * Essentially these want to choose hash table sizes sufficiently 1615 * Essentially these want to choose hash table sizes sufficiently
1616 * large so that collisions trying to wait on pages are rare. 1616 * large so that collisions trying to wait on pages are rare.
1617 * But in fact, the number of active page waitqueues on typical 1617 * But in fact, the number of active page waitqueues on typical
1618 * systems is ridiculously low, less than 200. So this is even 1618 * systems is ridiculously low, less than 200. So this is even
1619 * conservative, even though it seems large. 1619 * conservative, even though it seems large.
1620 * 1620 *
1621 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 1621 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
1622 * waitqueues, i.e. the size of the waitq table given the number of pages. 1622 * waitqueues, i.e. the size of the waitq table given the number of pages.
1623 */ 1623 */
1624 #define PAGES_PER_WAITQUEUE 256 1624 #define PAGES_PER_WAITQUEUE 256
1625 1625
1626 static inline unsigned long wait_table_size(unsigned long pages) 1626 static inline unsigned long wait_table_size(unsigned long pages)
1627 { 1627 {
1628 unsigned long size = 1; 1628 unsigned long size = 1;
1629 1629
1630 pages /= PAGES_PER_WAITQUEUE; 1630 pages /= PAGES_PER_WAITQUEUE;
1631 1631
1632 while (size < pages) 1632 while (size < pages)
1633 size <<= 1; 1633 size <<= 1;
1634 1634
1635 /* 1635 /*
1636 * Once we have dozens or even hundreds of threads sleeping 1636 * Once we have dozens or even hundreds of threads sleeping
1637 * on IO we've got bigger problems than wait queue collision. 1637 * on IO we've got bigger problems than wait queue collision.
1638 * Limit the size of the wait table to a reasonable size. 1638 * Limit the size of the wait table to a reasonable size.
1639 */ 1639 */
1640 size = min(size, 4096UL); 1640 size = min(size, 4096UL);
1641 1641
1642 return max(size, 4UL); 1642 return max(size, 4UL);
1643 } 1643 }
1644 1644
1645 /* 1645 /*
1646 * This is an integer logarithm so that shifts can be used later 1646 * This is an integer logarithm so that shifts can be used later
1647 * to extract the more random high bits from the multiplicative 1647 * to extract the more random high bits from the multiplicative
1648 * hash function before the remainder is taken. 1648 * hash function before the remainder is taken.
1649 */ 1649 */
1650 static inline unsigned long wait_table_bits(unsigned long size) 1650 static inline unsigned long wait_table_bits(unsigned long size)
1651 { 1651 {
1652 return ffz(~size); 1652 return ffz(~size);
1653 } 1653 }
1654 1654
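Editor's note: a standalone model of the sizing rule above: one waitqueue per 256 pages, rounded up to a power of two and clamped to the range [4, 4096]. The page counts in main() are examples only.

/* Round pages/256 up to a power of two, clamped to [4, 4096]. */
#include <stdio.h>

#define PAGES_PER_WAITQUEUE 256UL

static unsigned long model_wait_table_size(unsigned long pages)
{
	unsigned long size = 1;

	pages /= PAGES_PER_WAITQUEUE;
	while (size < pages)
		size <<= 1;
	if (size > 4096UL)
		size = 4096UL;
	return size > 4UL ? size : 4UL;
}

int main(void)
{
	/* e.g. a 512 MB zone with 4 KB pages spans 131072 pages */
	printf("%lu\n", model_wait_table_size(131072));	/* -> 512 */
	printf("%lu\n", model_wait_table_size(1UL << 22));	/* clamped to 4096 */
	return 0;
}
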
1655 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 1655 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
1656 1656
1657 static void __init calculate_zone_totalpages(struct pglist_data *pgdat, 1657 static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
1658 unsigned long *zones_size, unsigned long *zholes_size) 1658 unsigned long *zones_size, unsigned long *zholes_size)
1659 { 1659 {
1660 unsigned long realtotalpages, totalpages = 0; 1660 unsigned long realtotalpages, totalpages = 0;
1661 int i; 1661 int i;
1662 1662
1663 for (i = 0; i < MAX_NR_ZONES; i++) 1663 for (i = 0; i < MAX_NR_ZONES; i++)
1664 totalpages += zones_size[i]; 1664 totalpages += zones_size[i];
1665 pgdat->node_spanned_pages = totalpages; 1665 pgdat->node_spanned_pages = totalpages;
1666 1666
1667 realtotalpages = totalpages; 1667 realtotalpages = totalpages;
1668 if (zholes_size) 1668 if (zholes_size)
1669 for (i = 0; i < MAX_NR_ZONES; i++) 1669 for (i = 0; i < MAX_NR_ZONES; i++)
1670 realtotalpages -= zholes_size[i]; 1670 realtotalpages -= zholes_size[i];
1671 pgdat->node_present_pages = realtotalpages; 1671 pgdat->node_present_pages = realtotalpages;
1672 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); 1672 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
1673 } 1673 }
1674 1674
1675 1675
1676 /* 1676 /*
1677 * Initially all pages are reserved - free ones are freed 1677 * Initially all pages are reserved - free ones are freed
1678 * up by free_all_bootmem() once the early boot process is 1678 * up by free_all_bootmem() once the early boot process is
1679 * done. Non-atomic initialization, single-pass. 1679 * done. Non-atomic initialization, single-pass.
1680 */ 1680 */
1681 void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, 1681 void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1682 unsigned long start_pfn) 1682 unsigned long start_pfn)
1683 { 1683 {
1684 struct page *page; 1684 struct page *page;
1685 unsigned long end_pfn = start_pfn + size; 1685 unsigned long end_pfn = start_pfn + size;
1686 unsigned long pfn; 1686 unsigned long pfn;
1687 1687
1688 for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { 1688 for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
1689 if (!early_pfn_valid(pfn)) 1689 if (!early_pfn_valid(pfn))
1690 continue; 1690 continue;
1691 if (!early_pfn_in_nid(pfn, nid)) 1691 if (!early_pfn_in_nid(pfn, nid))
1692 continue; 1692 continue;
1693 page = pfn_to_page(pfn); 1693 page = pfn_to_page(pfn);
1694 set_page_links(page, zone, nid, pfn); 1694 set_page_links(page, zone, nid, pfn);
1695 set_page_count(page, 1); 1695 set_page_count(page, 1);
1696 reset_page_mapcount(page); 1696 reset_page_mapcount(page);
1697 SetPageReserved(page); 1697 SetPageReserved(page);
1698 INIT_LIST_HEAD(&page->lru); 1698 INIT_LIST_HEAD(&page->lru);
1699 #ifdef WANT_PAGE_VIRTUAL 1699 #ifdef WANT_PAGE_VIRTUAL
1700 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 1700 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
1701 if (!is_highmem_idx(zone)) 1701 if (!is_highmem_idx(zone))
1702 set_page_address(page, __va(pfn << PAGE_SHIFT)); 1702 set_page_address(page, __va(pfn << PAGE_SHIFT));
1703 #endif 1703 #endif
1704 } 1704 }
1705 } 1705 }
1706 1706
1707 void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, 1707 void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
1708 unsigned long size) 1708 unsigned long size)
1709 { 1709 {
1710 int order; 1710 int order;
1711 for (order = 0; order < MAX_ORDER ; order++) { 1711 for (order = 0; order < MAX_ORDER ; order++) {
1712 INIT_LIST_HEAD(&zone->free_area[order].free_list); 1712 INIT_LIST_HEAD(&zone->free_area[order].free_list);
1713 zone->free_area[order].nr_free = 0; 1713 zone->free_area[order].nr_free = 0;
1714 } 1714 }
1715 } 1715 }
1716 1716
1717 #define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr) 1717 #define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr)
1718 void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, 1718 void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
1719 unsigned long size) 1719 unsigned long size)
1720 { 1720 {
1721 unsigned long snum = pfn_to_section_nr(pfn); 1721 unsigned long snum = pfn_to_section_nr(pfn);
1722 unsigned long end = pfn_to_section_nr(pfn + size); 1722 unsigned long end = pfn_to_section_nr(pfn + size);
1723 1723
1724 if (FLAGS_HAS_NODE) 1724 if (FLAGS_HAS_NODE)
1725 zone_table[ZONETABLE_INDEX(nid, zid)] = zone; 1725 zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
1726 else 1726 else
1727 for (; snum <= end; snum++) 1727 for (; snum <= end; snum++)
1728 zone_table[ZONETABLE_INDEX(snum, zid)] = zone; 1728 zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
1729 } 1729 }
1730 1730
1731 #ifndef __HAVE_ARCH_MEMMAP_INIT 1731 #ifndef __HAVE_ARCH_MEMMAP_INIT
1732 #define memmap_init(size, nid, zone, start_pfn) \ 1732 #define memmap_init(size, nid, zone, start_pfn) \
1733 memmap_init_zone((size), (nid), (zone), (start_pfn)) 1733 memmap_init_zone((size), (nid), (zone), (start_pfn))
1734 #endif 1734 #endif
1735 1735
1736 static int __devinit zone_batchsize(struct zone *zone) 1736 static int __devinit zone_batchsize(struct zone *zone)
1737 { 1737 {
1738 int batch; 1738 int batch;
1739 1739
1740 /* 1740 /*
1741 * The per-cpu-pages pools are set to around 1000th of the 1741 * The per-cpu-pages pools are set to around 1000th of the
1742 * size of the zone. But no more than 1/2 of a meg. 1742 * size of the zone. But no more than 1/2 of a meg.
1743 * 1743 *
1744 * OK, so we don't know how big the cache is. So guess. 1744 * OK, so we don't know how big the cache is. So guess.
1745 */ 1745 */
1746 batch = zone->present_pages / 1024; 1746 batch = zone->present_pages / 1024;
1747 if (batch * PAGE_SIZE > 512 * 1024) 1747 if (batch * PAGE_SIZE > 512 * 1024)
1748 batch = (512 * 1024) / PAGE_SIZE; 1748 batch = (512 * 1024) / PAGE_SIZE;
1749 batch /= 4; /* We effectively *= 4 below */ 1749 batch /= 4; /* We effectively *= 4 below */
1750 if (batch < 1) 1750 if (batch < 1)
1751 batch = 1; 1751 batch = 1;
1752 1752
1753 /* 1753 /*
1754 * We will be trying to allocate bigger chunks of contiguous 1754 * We will be trying to allocate bigger chunks of contiguous
1755 * memory of the order of fls(batch). This should result in 1755 * memory of the order of fls(batch). This should result in
1756 * better cache coloring. 1756 * better cache coloring.
1757 * 1757 *
1758 * A sanity check also to ensure that batch is still within limits. 1758 * A sanity check also to ensure that batch is still within limits.
1759 */ 1759 */
1760 batch = (1 << fls(batch + batch/2)); 1760 batch = (1 << fls(batch + batch/2));
1761 1761
1762 if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2)) 1762 if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2))
1763 batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2); 1763 batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2);
1764 1764
1765 return batch; 1765 return batch;
1766 } 1766 }
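
To make the sizing heuristic above concrete, here is a hedged user-space walk-through of the same arithmetic. It assumes PAGE_SHIFT = 12 and MAX_ORDER = 11 (both configuration dependent) and uses two made-up zone sizes; fls() is open-coded.

#include <stdio.h>

#define PAGE_SHIFT	12		/* assumed: 4 KiB pages */
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define MAX_ORDER	11		/* assumed: the common default */

static int fls_ul(unsigned long x)	/* 1-based index of highest set bit */
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

static int sketch_zone_batchsize(unsigned long present_pages)
{
	int batch = present_pages / 1024;

	if (batch * PAGE_SIZE > 512 * 1024)	/* cap at half a megabyte */
		batch = (512 * 1024) / PAGE_SIZE;
	batch /= 4;				/* effectively multiplied back below */
	if (batch < 1)
		batch = 1;

	batch = 1 << fls_ul(batch + batch / 2);	/* round up for cache coloring */
	if (fls_ul(batch) >= PAGE_SHIFT + MAX_ORDER - 2)
		batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT) / 2);
	return batch;
}

int main(void)
{
	printf("2 GiB zone: batch %d\n", sketch_zone_batchsize(524288UL));
	printf("16 MiB zone: batch %d\n", sketch_zone_batchsize(4096UL));
	return 0;
}

The 2 GiB zone comes out at a batch of 64 pages and the 16 MiB zone at 2; setup_pageset() below then turns that into per-list limits, high = 6 * batch for the hot list and 2 * batch for the cold list.
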
1767 1767
1768 inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 1768 inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
1769 { 1769 {
1770 struct per_cpu_pages *pcp; 1770 struct per_cpu_pages *pcp;
1771 1771
1772 memset(p, 0, sizeof(*p)); 1772 memset(p, 0, sizeof(*p));
1773 1773
1774 pcp = &p->pcp[0]; /* hot */ 1774 pcp = &p->pcp[0]; /* hot */
1775 pcp->count = 0; 1775 pcp->count = 0;
1776 pcp->low = 0; 1776 pcp->low = 0;
1777 pcp->high = 6 * batch; 1777 pcp->high = 6 * batch;
1778 pcp->batch = max(1UL, 1 * batch); 1778 pcp->batch = max(1UL, 1 * batch);
1779 INIT_LIST_HEAD(&pcp->list); 1779 INIT_LIST_HEAD(&pcp->list);
1780 1780
1781 pcp = &p->pcp[1]; /* cold*/ 1781 pcp = &p->pcp[1]; /* cold*/
1782 pcp->count = 0; 1782 pcp->count = 0;
1783 pcp->low = 0; 1783 pcp->low = 0;
1784 pcp->high = 2 * batch; 1784 pcp->high = 2 * batch;
1785 pcp->batch = max(1UL, batch/2); 1785 pcp->batch = max(1UL, batch/2);
1786 INIT_LIST_HEAD(&pcp->list); 1786 INIT_LIST_HEAD(&pcp->list);
1787 } 1787 }
1788 1788
1789 #ifdef CONFIG_NUMA 1789 #ifdef CONFIG_NUMA
1790 /* 1790 /*
1791 * Boot pageset table. One per cpu which is going to be used for all 1791 * Boot pageset table. One per cpu which is going to be used for all
1792 * zones and all nodes. The parameters will be set in such a way 1792 * zones and all nodes. The parameters will be set in such a way
1793 * that an item put on a list will immediately be handed over to 1793 * that an item put on a list will immediately be handed over to
1794 * the buddy list. This is safe since pageset manipulation is done 1794 * the buddy list. This is safe since pageset manipulation is done
1795 * with interrupts disabled. 1795 * with interrupts disabled.
1796 * 1796 *
1797 * Some NUMA counter updates may also be caught by the boot pagesets. 1797 * Some NUMA counter updates may also be caught by the boot pagesets.
1798 * 1798 *
1799 * The boot_pagesets must be kept even after bootup is complete for 1799 * The boot_pagesets must be kept even after bootup is complete for
1800 * unused processors and/or zones. They do play a role for bootstrapping 1800 * unused processors and/or zones. They do play a role for bootstrapping
1801 * hotplugged processors. 1801 * hotplugged processors.
1802 * 1802 *
1803 * zoneinfo_show() and maybe other functions do 1803 * zoneinfo_show() and maybe other functions do
1804 * not check if the processor is online before following the pageset pointer. 1804 * not check if the processor is online before following the pageset pointer.
1805 * Other parts of the kernel may not check if the zone is available. 1805 * Other parts of the kernel may not check if the zone is available.
1806 */ 1806 */
1807 static struct per_cpu_pageset 1807 static struct per_cpu_pageset
1808 boot_pageset[NR_CPUS]; 1808 boot_pageset[NR_CPUS];
1809 1809
1810 /* 1810 /*
1811 * Dynamically allocate memory for the 1811 * Dynamically allocate memory for the
1812 * per cpu pageset array in struct zone. 1812 * per cpu pageset array in struct zone.
1813 */ 1813 */
1814 static int __devinit process_zones(int cpu) 1814 static int __devinit process_zones(int cpu)
1815 { 1815 {
1816 struct zone *zone, *dzone; 1816 struct zone *zone, *dzone;
1817 1817
1818 for_each_zone(zone) { 1818 for_each_zone(zone) {
1819 1819
1820 zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset), 1820 zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset),
1821 GFP_KERNEL, cpu_to_node(cpu)); 1821 GFP_KERNEL, cpu_to_node(cpu));
1822 if (!zone->pageset[cpu]) 1822 if (!zone->pageset[cpu])
1823 goto bad; 1823 goto bad;
1824 1824
1825 setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); 1825 setup_pageset(zone->pageset[cpu], zone_batchsize(zone));
1826 } 1826 }
1827 1827
1828 return 0; 1828 return 0;
1829 bad: 1829 bad:
1830 for_each_zone(dzone) { 1830 for_each_zone(dzone) {
1831 if (dzone == zone) 1831 if (dzone == zone)
1832 break; 1832 break;
1833 kfree(dzone->pageset[cpu]); 1833 kfree(dzone->pageset[cpu]);
1834 dzone->pageset[cpu] = NULL; 1834 dzone->pageset[cpu] = NULL;
1835 } 1835 }
1836 return -ENOMEM; 1836 return -ENOMEM;
1837 } 1837 }
1838 1838
1839 static inline void free_zone_pagesets(int cpu) 1839 static inline void free_zone_pagesets(int cpu)
1840 { 1840 {
1841 #ifdef CONFIG_NUMA 1841 #ifdef CONFIG_NUMA
1842 struct zone *zone; 1842 struct zone *zone;
1843 1843
1844 for_each_zone(zone) { 1844 for_each_zone(zone) {
1845 struct per_cpu_pageset *pset = zone_pcp(zone, cpu); 1845 struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
1846 1846
1847 zone_pcp(zone, cpu) = NULL; 1847 zone_pcp(zone, cpu) = NULL;
1848 kfree(pset); 1848 kfree(pset);
1849 } 1849 }
1850 #endif 1850 #endif
1851 } 1851 }
1852 1852
1853 static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, 1853 static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
1854 unsigned long action, 1854 unsigned long action,
1855 void *hcpu) 1855 void *hcpu)
1856 { 1856 {
1857 int cpu = (long)hcpu; 1857 int cpu = (long)hcpu;
1858 int ret = NOTIFY_OK; 1858 int ret = NOTIFY_OK;
1859 1859
1860 switch (action) { 1860 switch (action) {
1861 case CPU_UP_PREPARE: 1861 case CPU_UP_PREPARE:
1862 if (process_zones(cpu)) 1862 if (process_zones(cpu))
1863 ret = NOTIFY_BAD; 1863 ret = NOTIFY_BAD;
1864 break; 1864 break;
1865 #ifdef CONFIG_HOTPLUG_CPU 1865 #ifdef CONFIG_HOTPLUG_CPU
1866 case CPU_DEAD: 1866 case CPU_DEAD:
1867 free_zone_pagesets(cpu); 1867 free_zone_pagesets(cpu);
1868 break; 1868 break;
1869 #endif 1869 #endif
1870 default: 1870 default:
1871 break; 1871 break;
1872 } 1872 }
1873 return ret; 1873 return ret;
1874 } 1874 }
1875 1875
1876 static struct notifier_block pageset_notifier = 1876 static struct notifier_block pageset_notifier =
1877 { &pageset_cpuup_callback, NULL, 0 }; 1877 { &pageset_cpuup_callback, NULL, 0 };
1878 1878
1879 void __init setup_per_cpu_pageset() 1879 void __init setup_per_cpu_pageset()
1880 { 1880 {
1881 int err; 1881 int err;
1882 1882
1883 /* Initialize per_cpu_pageset for cpu 0. 1883 /* Initialize per_cpu_pageset for cpu 0.
1884 * A cpuup callback will do this for every cpu 1884 * A cpuup callback will do this for every cpu
1885 * as it comes online 1885 * as it comes online
1886 */ 1886 */
1887 err = process_zones(smp_processor_id()); 1887 err = process_zones(smp_processor_id());
1888 BUG_ON(err); 1888 BUG_ON(err);
1889 register_cpu_notifier(&pageset_notifier); 1889 register_cpu_notifier(&pageset_notifier);
1890 } 1890 }
1891 1891
1892 #endif 1892 #endif
1893 1893
1894 static __devinit 1894 static __devinit
1895 void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 1895 void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
1896 { 1896 {
1897 int i; 1897 int i;
1898 struct pglist_data *pgdat = zone->zone_pgdat; 1898 struct pglist_data *pgdat = zone->zone_pgdat;
1899 1899
1900 /* 1900 /*
1901 * The per-page waitqueue mechanism uses hashed waitqueues 1901 * The per-page waitqueue mechanism uses hashed waitqueues
1902 * per zone. 1902 * per zone.
1903 */ 1903 */
1904 zone->wait_table_size = wait_table_size(zone_size_pages); 1904 zone->wait_table_size = wait_table_size(zone_size_pages);
1905 zone->wait_table_bits = wait_table_bits(zone->wait_table_size); 1905 zone->wait_table_bits = wait_table_bits(zone->wait_table_size);
1906 zone->wait_table = (wait_queue_head_t *) 1906 zone->wait_table = (wait_queue_head_t *)
1907 alloc_bootmem_node(pgdat, zone->wait_table_size 1907 alloc_bootmem_node(pgdat, zone->wait_table_size
1908 * sizeof(wait_queue_head_t)); 1908 * sizeof(wait_queue_head_t));
1909 1909
1910 for(i = 0; i < zone->wait_table_size; ++i) 1910 for(i = 0; i < zone->wait_table_size; ++i)
1911 init_waitqueue_head(zone->wait_table + i); 1911 init_waitqueue_head(zone->wait_table + i);
1912 } 1912 }
1913 1913
1914 static __devinit void zone_pcp_init(struct zone *zone) 1914 static __devinit void zone_pcp_init(struct zone *zone)
1915 { 1915 {
1916 int cpu; 1916 int cpu;
1917 unsigned long batch = zone_batchsize(zone); 1917 unsigned long batch = zone_batchsize(zone);
1918 1918
1919 for (cpu = 0; cpu < NR_CPUS; cpu++) { 1919 for (cpu = 0; cpu < NR_CPUS; cpu++) {
1920 #ifdef CONFIG_NUMA 1920 #ifdef CONFIG_NUMA
1921 /* Early boot. Slab allocator not functional yet */ 1921 /* Early boot. Slab allocator not functional yet */
1922 zone->pageset[cpu] = &boot_pageset[cpu]; 1922 zone->pageset[cpu] = &boot_pageset[cpu];
1923 setup_pageset(&boot_pageset[cpu],0); 1923 setup_pageset(&boot_pageset[cpu],0);
1924 #else 1924 #else
1925 setup_pageset(zone_pcp(zone,cpu), batch); 1925 setup_pageset(zone_pcp(zone,cpu), batch);
1926 #endif 1926 #endif
1927 } 1927 }
1928 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", 1928 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
1929 zone->name, zone->present_pages, batch); 1929 zone->name, zone->present_pages, batch);
1930 } 1930 }
1931 1931
1932 static __devinit void init_currently_empty_zone(struct zone *zone, 1932 static __devinit void init_currently_empty_zone(struct zone *zone,
1933 unsigned long zone_start_pfn, unsigned long size) 1933 unsigned long zone_start_pfn, unsigned long size)
1934 { 1934 {
1935 struct pglist_data *pgdat = zone->zone_pgdat; 1935 struct pglist_data *pgdat = zone->zone_pgdat;
1936 1936
1937 zone_wait_table_init(zone, size); 1937 zone_wait_table_init(zone, size);
1938 pgdat->nr_zones = zone_idx(zone) + 1; 1938 pgdat->nr_zones = zone_idx(zone) + 1;
1939 1939
1940 zone->zone_mem_map = pfn_to_page(zone_start_pfn); 1940 zone->zone_mem_map = pfn_to_page(zone_start_pfn);
1941 zone->zone_start_pfn = zone_start_pfn; 1941 zone->zone_start_pfn = zone_start_pfn;
1942 1942
1943 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); 1943 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
1944 1944
1945 zone_init_free_lists(pgdat, zone, zone->spanned_pages); 1945 zone_init_free_lists(pgdat, zone, zone->spanned_pages);
1946 } 1946 }
1947 1947
1948 /* 1948 /*
1949 * Set up the zone data structures: 1949 * Set up the zone data structures:
1950 * - mark all pages reserved 1950 * - mark all pages reserved
1951 * - mark all memory queues empty 1951 * - mark all memory queues empty
1952 * - clear the memory bitmaps 1952 * - clear the memory bitmaps
1953 */ 1953 */
1954 static void __init free_area_init_core(struct pglist_data *pgdat, 1954 static void __init free_area_init_core(struct pglist_data *pgdat,
1955 unsigned long *zones_size, unsigned long *zholes_size) 1955 unsigned long *zones_size, unsigned long *zholes_size)
1956 { 1956 {
1957 unsigned long j; 1957 unsigned long j;
1958 int nid = pgdat->node_id; 1958 int nid = pgdat->node_id;
1959 unsigned long zone_start_pfn = pgdat->node_start_pfn; 1959 unsigned long zone_start_pfn = pgdat->node_start_pfn;
1960 1960
1961 pgdat_resize_init(pgdat);
1961 pgdat->nr_zones = 0; 1962 pgdat->nr_zones = 0;
1962 init_waitqueue_head(&pgdat->kswapd_wait); 1963 init_waitqueue_head(&pgdat->kswapd_wait);
1963 pgdat->kswapd_max_order = 0; 1964 pgdat->kswapd_max_order = 0;
1964 1965
1965 for (j = 0; j < MAX_NR_ZONES; j++) { 1966 for (j = 0; j < MAX_NR_ZONES; j++) {
1966 struct zone *zone = pgdat->node_zones + j; 1967 struct zone *zone = pgdat->node_zones + j;
1967 unsigned long size, realsize; 1968 unsigned long size, realsize;
1968 1969
1969 realsize = size = zones_size[j]; 1970 realsize = size = zones_size[j];
1970 if (zholes_size) 1971 if (zholes_size)
1971 realsize -= zholes_size[j]; 1972 realsize -= zholes_size[j];
1972 1973
1973 if (j == ZONE_DMA || j == ZONE_NORMAL) 1974 if (j == ZONE_DMA || j == ZONE_NORMAL)
1974 nr_kernel_pages += realsize; 1975 nr_kernel_pages += realsize;
1975 nr_all_pages += realsize; 1976 nr_all_pages += realsize;
1976 1977
1977 zone->spanned_pages = size; 1978 zone->spanned_pages = size;
1978 zone->present_pages = realsize; 1979 zone->present_pages = realsize;
1979 zone->name = zone_names[j]; 1980 zone->name = zone_names[j];
1980 spin_lock_init(&zone->lock); 1981 spin_lock_init(&zone->lock);
1981 spin_lock_init(&zone->lru_lock); 1982 spin_lock_init(&zone->lru_lock);
1982 zone->zone_pgdat = pgdat; 1983 zone->zone_pgdat = pgdat;
1983 zone->free_pages = 0; 1984 zone->free_pages = 0;
1984 1985
1985 zone->temp_priority = zone->prev_priority = DEF_PRIORITY; 1986 zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
1986 1987
1987 zone_pcp_init(zone); 1988 zone_pcp_init(zone);
1988 INIT_LIST_HEAD(&zone->active_list); 1989 INIT_LIST_HEAD(&zone->active_list);
1989 INIT_LIST_HEAD(&zone->inactive_list); 1990 INIT_LIST_HEAD(&zone->inactive_list);
1990 zone->nr_scan_active = 0; 1991 zone->nr_scan_active = 0;
1991 zone->nr_scan_inactive = 0; 1992 zone->nr_scan_inactive = 0;
1992 zone->nr_active = 0; 1993 zone->nr_active = 0;
1993 zone->nr_inactive = 0; 1994 zone->nr_inactive = 0;
1994 atomic_set(&zone->reclaim_in_progress, 0); 1995 atomic_set(&zone->reclaim_in_progress, 0);
1995 if (!size) 1996 if (!size)
1996 continue; 1997 continue;
1997 1998
1998 zonetable_add(zone, nid, j, zone_start_pfn, size); 1999 zonetable_add(zone, nid, j, zone_start_pfn, size);
1999 init_currently_empty_zone(zone, zone_start_pfn, size); 2000 init_currently_empty_zone(zone, zone_start_pfn, size);
2000 zone_start_pfn += size; 2001 zone_start_pfn += size;
2001 } 2002 }
2002 } 2003 }
2003 2004
2004 static void __init alloc_node_mem_map(struct pglist_data *pgdat) 2005 static void __init alloc_node_mem_map(struct pglist_data *pgdat)
2005 { 2006 {
2006 /* Skip empty nodes */ 2007 /* Skip empty nodes */
2007 if (!pgdat->node_spanned_pages) 2008 if (!pgdat->node_spanned_pages)
2008 return; 2009 return;
2009 2010
2010 #ifdef CONFIG_FLAT_NODE_MEM_MAP 2011 #ifdef CONFIG_FLAT_NODE_MEM_MAP
2011 /* ia64 gets its own node_mem_map, before this, without bootmem */ 2012 /* ia64 gets its own node_mem_map, before this, without bootmem */
2012 if (!pgdat->node_mem_map) { 2013 if (!pgdat->node_mem_map) {
2013 unsigned long size; 2014 unsigned long size;
2014 struct page *map; 2015 struct page *map;
2015 2016
2016 size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); 2017 size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
2017 map = alloc_remap(pgdat->node_id, size); 2018 map = alloc_remap(pgdat->node_id, size);
2018 if (!map) 2019 if (!map)
2019 map = alloc_bootmem_node(pgdat, size); 2020 map = alloc_bootmem_node(pgdat, size);
2020 pgdat->node_mem_map = map; 2021 pgdat->node_mem_map = map;
2021 } 2022 }
2022 #ifdef CONFIG_FLATMEM 2023 #ifdef CONFIG_FLATMEM
2023 /* 2024 /*
2024 * With no DISCONTIG, the global mem_map is just set as node 0's 2025 * With no DISCONTIG, the global mem_map is just set as node 0's
2025 */ 2026 */
2026 if (pgdat == NODE_DATA(0)) 2027 if (pgdat == NODE_DATA(0))
2027 mem_map = NODE_DATA(0)->node_mem_map; 2028 mem_map = NODE_DATA(0)->node_mem_map;
2028 #endif 2029 #endif
2029 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 2030 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
2030 } 2031 }
2031 2032
2032 void __init free_area_init_node(int nid, struct pglist_data *pgdat, 2033 void __init free_area_init_node(int nid, struct pglist_data *pgdat,
2033 unsigned long *zones_size, unsigned long node_start_pfn, 2034 unsigned long *zones_size, unsigned long node_start_pfn,
2034 unsigned long *zholes_size) 2035 unsigned long *zholes_size)
2035 { 2036 {
2036 pgdat->node_id = nid; 2037 pgdat->node_id = nid;
2037 pgdat->node_start_pfn = node_start_pfn; 2038 pgdat->node_start_pfn = node_start_pfn;
2038 calculate_zone_totalpages(pgdat, zones_size, zholes_size); 2039 calculate_zone_totalpages(pgdat, zones_size, zholes_size);
2039 2040
2040 alloc_node_mem_map(pgdat); 2041 alloc_node_mem_map(pgdat);
2041 2042
2042 free_area_init_core(pgdat, zones_size, zholes_size); 2043 free_area_init_core(pgdat, zones_size, zholes_size);
2043 } 2044 }
2044 2045
2045 #ifndef CONFIG_NEED_MULTIPLE_NODES 2046 #ifndef CONFIG_NEED_MULTIPLE_NODES
2046 static bootmem_data_t contig_bootmem_data; 2047 static bootmem_data_t contig_bootmem_data;
2047 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; 2048 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
2048 2049
2049 EXPORT_SYMBOL(contig_page_data); 2050 EXPORT_SYMBOL(contig_page_data);
2050 #endif 2051 #endif
2051 2052
2052 void __init free_area_init(unsigned long *zones_size) 2053 void __init free_area_init(unsigned long *zones_size)
2053 { 2054 {
2054 free_area_init_node(0, NODE_DATA(0), zones_size, 2055 free_area_init_node(0, NODE_DATA(0), zones_size,
2055 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 2056 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
2056 } 2057 }
2057 2058
2058 #ifdef CONFIG_PROC_FS 2059 #ifdef CONFIG_PROC_FS
2059 2060
2060 #include <linux/seq_file.h> 2061 #include <linux/seq_file.h>
2061 2062
2062 static void *frag_start(struct seq_file *m, loff_t *pos) 2063 static void *frag_start(struct seq_file *m, loff_t *pos)
2063 { 2064 {
2064 pg_data_t *pgdat; 2065 pg_data_t *pgdat;
2065 loff_t node = *pos; 2066 loff_t node = *pos;
2066 2067
2067 for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next) 2068 for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next)
2068 --node; 2069 --node;
2069 2070
2070 return pgdat; 2071 return pgdat;
2071 } 2072 }
2072 2073
2073 static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) 2074 static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
2074 { 2075 {
2075 pg_data_t *pgdat = (pg_data_t *)arg; 2076 pg_data_t *pgdat = (pg_data_t *)arg;
2076 2077
2077 (*pos)++; 2078 (*pos)++;
2078 return pgdat->pgdat_next; 2079 return pgdat->pgdat_next;
2079 } 2080 }
2080 2081
2081 static void frag_stop(struct seq_file *m, void *arg) 2082 static void frag_stop(struct seq_file *m, void *arg)
2082 { 2083 {
2083 } 2084 }
2084 2085
2085 /* 2086 /*
2086 * This walks the free areas for each zone. 2087 * This walks the free areas for each zone.
2087 */ 2088 */
2088 static int frag_show(struct seq_file *m, void *arg) 2089 static int frag_show(struct seq_file *m, void *arg)
2089 { 2090 {
2090 pg_data_t *pgdat = (pg_data_t *)arg; 2091 pg_data_t *pgdat = (pg_data_t *)arg;
2091 struct zone *zone; 2092 struct zone *zone;
2092 struct zone *node_zones = pgdat->node_zones; 2093 struct zone *node_zones = pgdat->node_zones;
2093 unsigned long flags; 2094 unsigned long flags;
2094 int order; 2095 int order;
2095 2096
2096 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { 2097 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
2097 if (!zone->present_pages) 2098 if (!zone->present_pages)
2098 continue; 2099 continue;
2099 2100
2100 spin_lock_irqsave(&zone->lock, flags); 2101 spin_lock_irqsave(&zone->lock, flags);
2101 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); 2102 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
2102 for (order = 0; order < MAX_ORDER; ++order) 2103 for (order = 0; order < MAX_ORDER; ++order)
2103 seq_printf(m, "%6lu ", zone->free_area[order].nr_free); 2104 seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
2104 spin_unlock_irqrestore(&zone->lock, flags); 2105 spin_unlock_irqrestore(&zone->lock, flags);
2105 seq_putc(m, '\n'); 2106 seq_putc(m, '\n');
2106 } 2107 }
2107 return 0; 2108 return 0;
2108 } 2109 }
2109 2110
2110 struct seq_operations fragmentation_op = { 2111 struct seq_operations fragmentation_op = {
2111 .start = frag_start, 2112 .start = frag_start,
2112 .next = frag_next, 2113 .next = frag_next,
2113 .stop = frag_stop, 2114 .stop = frag_stop,
2114 .show = frag_show, 2115 .show = frag_show,
2115 }; 2116 };
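
For reference, this seq_file backs /proc/buddyinfo in trees of this era: one line per populated zone, with one free-block count per order. The figures below are purely illustrative of the format frag_show() produces:

Node 0, zone      DMA      3      2      2      1      2      1      1      0      1      1      2
Node 0, zone   Normal   1204    545    236     98     41     17      6      3      1      1      0
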
2116 2117
2117 /* 2118 /*
2118 * Output information about zones in @pgdat. 2119 * Output information about zones in @pgdat.
2119 */ 2120 */
2120 static int zoneinfo_show(struct seq_file *m, void *arg) 2121 static int zoneinfo_show(struct seq_file *m, void *arg)
2121 { 2122 {
2122 pg_data_t *pgdat = arg; 2123 pg_data_t *pgdat = arg;
2123 struct zone *zone; 2124 struct zone *zone;
2124 struct zone *node_zones = pgdat->node_zones; 2125 struct zone *node_zones = pgdat->node_zones;
2125 unsigned long flags; 2126 unsigned long flags;
2126 2127
2127 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { 2128 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
2128 int i; 2129 int i;
2129 2130
2130 if (!zone->present_pages) 2131 if (!zone->present_pages)
2131 continue; 2132 continue;
2132 2133
2133 spin_lock_irqsave(&zone->lock, flags); 2134 spin_lock_irqsave(&zone->lock, flags);
2134 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); 2135 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
2135 seq_printf(m, 2136 seq_printf(m,
2136 "\n pages free %lu" 2137 "\n pages free %lu"
2137 "\n min %lu" 2138 "\n min %lu"
2138 "\n low %lu" 2139 "\n low %lu"
2139 "\n high %lu" 2140 "\n high %lu"
2140 "\n active %lu" 2141 "\n active %lu"
2141 "\n inactive %lu" 2142 "\n inactive %lu"
2142 "\n scanned %lu (a: %lu i: %lu)" 2143 "\n scanned %lu (a: %lu i: %lu)"
2143 "\n spanned %lu" 2144 "\n spanned %lu"
2144 "\n present %lu", 2145 "\n present %lu",
2145 zone->free_pages, 2146 zone->free_pages,
2146 zone->pages_min, 2147 zone->pages_min,
2147 zone->pages_low, 2148 zone->pages_low,
2148 zone->pages_high, 2149 zone->pages_high,
2149 zone->nr_active, 2150 zone->nr_active,
2150 zone->nr_inactive, 2151 zone->nr_inactive,
2151 zone->pages_scanned, 2152 zone->pages_scanned,
2152 zone->nr_scan_active, zone->nr_scan_inactive, 2153 zone->nr_scan_active, zone->nr_scan_inactive,
2153 zone->spanned_pages, 2154 zone->spanned_pages,
2154 zone->present_pages); 2155 zone->present_pages);
2155 seq_printf(m, 2156 seq_printf(m,
2156 "\n protection: (%lu", 2157 "\n protection: (%lu",
2157 zone->lowmem_reserve[0]); 2158 zone->lowmem_reserve[0]);
2158 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) 2159 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
2159 seq_printf(m, ", %lu", zone->lowmem_reserve[i]); 2160 seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
2160 seq_printf(m, 2161 seq_printf(m,
2161 ")" 2162 ")"
2162 "\n pagesets"); 2163 "\n pagesets");
2163 for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) { 2164 for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) {
2164 struct per_cpu_pageset *pageset; 2165 struct per_cpu_pageset *pageset;
2165 int j; 2166 int j;
2166 2167
2167 pageset = zone_pcp(zone, i); 2168 pageset = zone_pcp(zone, i);
2168 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { 2169 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
2169 if (pageset->pcp[j].count) 2170 if (pageset->pcp[j].count)
2170 break; 2171 break;
2171 } 2172 }
2172 if (j == ARRAY_SIZE(pageset->pcp)) 2173 if (j == ARRAY_SIZE(pageset->pcp))
2173 continue; 2174 continue;
2174 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { 2175 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
2175 seq_printf(m, 2176 seq_printf(m,
2176 "\n cpu: %i pcp: %i" 2177 "\n cpu: %i pcp: %i"
2177 "\n count: %i" 2178 "\n count: %i"
2178 "\n low: %i" 2179 "\n low: %i"
2179 "\n high: %i" 2180 "\n high: %i"
2180 "\n batch: %i", 2181 "\n batch: %i",
2181 i, j, 2182 i, j,
2182 pageset->pcp[j].count, 2183 pageset->pcp[j].count,
2183 pageset->pcp[j].low, 2184 pageset->pcp[j].low,
2184 pageset->pcp[j].high, 2185 pageset->pcp[j].high,
2185 pageset->pcp[j].batch); 2186 pageset->pcp[j].batch);
2186 } 2187 }
2187 #ifdef CONFIG_NUMA 2188 #ifdef CONFIG_NUMA
2188 seq_printf(m, 2189 seq_printf(m,
2189 "\n numa_hit: %lu" 2190 "\n numa_hit: %lu"
2190 "\n numa_miss: %lu" 2191 "\n numa_miss: %lu"
2191 "\n numa_foreign: %lu" 2192 "\n numa_foreign: %lu"
2192 "\n interleave_hit: %lu" 2193 "\n interleave_hit: %lu"
2193 "\n local_node: %lu" 2194 "\n local_node: %lu"
2194 "\n other_node: %lu", 2195 "\n other_node: %lu",
2195 pageset->numa_hit, 2196 pageset->numa_hit,
2196 pageset->numa_miss, 2197 pageset->numa_miss,
2197 pageset->numa_foreign, 2198 pageset->numa_foreign,
2198 pageset->interleave_hit, 2199 pageset->interleave_hit,
2199 pageset->local_node, 2200 pageset->local_node,
2200 pageset->other_node); 2201 pageset->other_node);
2201 #endif 2202 #endif
2202 } 2203 }
2203 seq_printf(m, 2204 seq_printf(m,
2204 "\n all_unreclaimable: %u" 2205 "\n all_unreclaimable: %u"
2205 "\n prev_priority: %i" 2206 "\n prev_priority: %i"
2206 "\n temp_priority: %i" 2207 "\n temp_priority: %i"
2207 "\n start_pfn: %lu", 2208 "\n start_pfn: %lu",
2208 zone->all_unreclaimable, 2209 zone->all_unreclaimable,
2209 zone->prev_priority, 2210 zone->prev_priority,
2210 zone->temp_priority, 2211 zone->temp_priority,
2211 zone->zone_start_pfn); 2212 zone->zone_start_pfn);
2212 spin_unlock_irqrestore(&zone->lock, flags); 2213 spin_unlock_irqrestore(&zone->lock, flags);
2213 seq_putc(m, '\n'); 2214 seq_putc(m, '\n');
2214 } 2215 }
2215 return 0; 2216 return 0;
2216 } 2217 }
2217 2218
2218 struct seq_operations zoneinfo_op = { 2219 struct seq_operations zoneinfo_op = {
2219 .start = frag_start, /* iterate over all zones. The same as in 2220 .start = frag_start, /* iterate over all zones. The same as in
2220 * fragmentation. */ 2221 * fragmentation. */
2221 .next = frag_next, 2222 .next = frag_next,
2222 .stop = frag_stop, 2223 .stop = frag_stop,
2223 .show = zoneinfo_show, 2224 .show = zoneinfo_show,
2224 }; 2225 };
2225 2226
2226 static char *vmstat_text[] = { 2227 static char *vmstat_text[] = {
2227 "nr_dirty", 2228 "nr_dirty",
2228 "nr_writeback", 2229 "nr_writeback",
2229 "nr_unstable", 2230 "nr_unstable",
2230 "nr_page_table_pages", 2231 "nr_page_table_pages",
2231 "nr_mapped", 2232 "nr_mapped",
2232 "nr_slab", 2233 "nr_slab",
2233 2234
2234 "pgpgin", 2235 "pgpgin",
2235 "pgpgout", 2236 "pgpgout",
2236 "pswpin", 2237 "pswpin",
2237 "pswpout", 2238 "pswpout",
2238 "pgalloc_high", 2239 "pgalloc_high",
2239 2240
2240 "pgalloc_normal", 2241 "pgalloc_normal",
2241 "pgalloc_dma", 2242 "pgalloc_dma",
2242 "pgfree", 2243 "pgfree",
2243 "pgactivate", 2244 "pgactivate",
2244 "pgdeactivate", 2245 "pgdeactivate",
2245 2246
2246 "pgfault", 2247 "pgfault",
2247 "pgmajfault", 2248 "pgmajfault",
2248 "pgrefill_high", 2249 "pgrefill_high",
2249 "pgrefill_normal", 2250 "pgrefill_normal",
2250 "pgrefill_dma", 2251 "pgrefill_dma",
2251 2252
2252 "pgsteal_high", 2253 "pgsteal_high",
2253 "pgsteal_normal", 2254 "pgsteal_normal",
2254 "pgsteal_dma", 2255 "pgsteal_dma",
2255 "pgscan_kswapd_high", 2256 "pgscan_kswapd_high",
2256 "pgscan_kswapd_normal", 2257 "pgscan_kswapd_normal",
2257 2258
2258 "pgscan_kswapd_dma", 2259 "pgscan_kswapd_dma",
2259 "pgscan_direct_high", 2260 "pgscan_direct_high",
2260 "pgscan_direct_normal", 2261 "pgscan_direct_normal",
2261 "pgscan_direct_dma", 2262 "pgscan_direct_dma",
2262 "pginodesteal", 2263 "pginodesteal",
2263 2264
2264 "slabs_scanned", 2265 "slabs_scanned",
2265 "kswapd_steal", 2266 "kswapd_steal",
2266 "kswapd_inodesteal", 2267 "kswapd_inodesteal",
2267 "pageoutrun", 2268 "pageoutrun",
2268 "allocstall", 2269 "allocstall",
2269 2270
2270 "pgrotated", 2271 "pgrotated",
2271 "nr_bounce", 2272 "nr_bounce",
2272 }; 2273 };
2273 2274
2274 static void *vmstat_start(struct seq_file *m, loff_t *pos) 2275 static void *vmstat_start(struct seq_file *m, loff_t *pos)
2275 { 2276 {
2276 struct page_state *ps; 2277 struct page_state *ps;
2277 2278
2278 if (*pos >= ARRAY_SIZE(vmstat_text)) 2279 if (*pos >= ARRAY_SIZE(vmstat_text))
2279 return NULL; 2280 return NULL;
2280 2281
2281 ps = kmalloc(sizeof(*ps), GFP_KERNEL); 2282 ps = kmalloc(sizeof(*ps), GFP_KERNEL);
2282 m->private = ps; 2283 m->private = ps;
2283 if (!ps) 2284 if (!ps)
2284 return ERR_PTR(-ENOMEM); 2285 return ERR_PTR(-ENOMEM);
2285 get_full_page_state(ps); 2286 get_full_page_state(ps);
2286 ps->pgpgin /= 2; /* sectors -> kbytes */ 2287 ps->pgpgin /= 2; /* sectors -> kbytes */
2287 ps->pgpgout /= 2; 2288 ps->pgpgout /= 2;
2288 return (unsigned long *)ps + *pos; 2289 return (unsigned long *)ps + *pos;
2289 } 2290 }
2290 2291
2291 static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) 2292 static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
2292 { 2293 {
2293 (*pos)++; 2294 (*pos)++;
2294 if (*pos >= ARRAY_SIZE(vmstat_text)) 2295 if (*pos >= ARRAY_SIZE(vmstat_text))
2295 return NULL; 2296 return NULL;
2296 return (unsigned long *)m->private + *pos; 2297 return (unsigned long *)m->private + *pos;
2297 } 2298 }
2298 2299
2299 static int vmstat_show(struct seq_file *m, void *arg) 2300 static int vmstat_show(struct seq_file *m, void *arg)
2300 { 2301 {
2301 unsigned long *l = arg; 2302 unsigned long *l = arg;
2302 unsigned long off = l - (unsigned long *)m->private; 2303 unsigned long off = l - (unsigned long *)m->private;
2303 2304
2304 seq_printf(m, "%s %lu\n", vmstat_text[off], *l); 2305 seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
2305 return 0; 2306 return 0;
2306 } 2307 }
2307 2308
2308 static void vmstat_stop(struct seq_file *m, void *arg) 2309 static void vmstat_stop(struct seq_file *m, void *arg)
2309 { 2310 {
2310 kfree(m->private); 2311 kfree(m->private);
2311 m->private = NULL; 2312 m->private = NULL;
2312 } 2313 }
2313 2314
2314 struct seq_operations vmstat_op = { 2315 struct seq_operations vmstat_op = {
2315 .start = vmstat_start, 2316 .start = vmstat_start,
2316 .next = vmstat_next, 2317 .next = vmstat_next,
2317 .stop = vmstat_stop, 2318 .stop = vmstat_stop,
2318 .show = vmstat_show, 2319 .show = vmstat_show,
2319 }; 2320 };
2320 2321
2321 #endif /* CONFIG_PROC_FS */ 2322 #endif /* CONFIG_PROC_FS */
2322 2323
2323 #ifdef CONFIG_HOTPLUG_CPU 2324 #ifdef CONFIG_HOTPLUG_CPU
2324 static int page_alloc_cpu_notify(struct notifier_block *self, 2325 static int page_alloc_cpu_notify(struct notifier_block *self,
2325 unsigned long action, void *hcpu) 2326 unsigned long action, void *hcpu)
2326 { 2327 {
2327 int cpu = (unsigned long)hcpu; 2328 int cpu = (unsigned long)hcpu;
2328 long *count; 2329 long *count;
2329 unsigned long *src, *dest; 2330 unsigned long *src, *dest;
2330 2331
2331 if (action == CPU_DEAD) { 2332 if (action == CPU_DEAD) {
2332 int i; 2333 int i;
2333 2334
2334 /* Drain local pagecache count. */ 2335 /* Drain local pagecache count. */
2335 count = &per_cpu(nr_pagecache_local, cpu); 2336 count = &per_cpu(nr_pagecache_local, cpu);
2336 atomic_add(*count, &nr_pagecache); 2337 atomic_add(*count, &nr_pagecache);
2337 *count = 0; 2338 *count = 0;
2338 local_irq_disable(); 2339 local_irq_disable();
2339 __drain_pages(cpu); 2340 __drain_pages(cpu);
2340 2341
2341 /* Add dead cpu's page_states to our own. */ 2342 /* Add dead cpu's page_states to our own. */
2342 dest = (unsigned long *)&__get_cpu_var(page_states); 2343 dest = (unsigned long *)&__get_cpu_var(page_states);
2343 src = (unsigned long *)&per_cpu(page_states, cpu); 2344 src = (unsigned long *)&per_cpu(page_states, cpu);
2344 2345
2345 for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long); 2346 for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long);
2346 i++) { 2347 i++) {
2347 dest[i] += src[i]; 2348 dest[i] += src[i];
2348 src[i] = 0; 2349 src[i] = 0;
2349 } 2350 }
2350 2351
2351 local_irq_enable(); 2352 local_irq_enable();
2352 } 2353 }
2353 return NOTIFY_OK; 2354 return NOTIFY_OK;
2354 } 2355 }
2355 #endif /* CONFIG_HOTPLUG_CPU */ 2356 #endif /* CONFIG_HOTPLUG_CPU */
2356 2357
2357 void __init page_alloc_init(void) 2358 void __init page_alloc_init(void)
2358 { 2359 {
2359 hotcpu_notifier(page_alloc_cpu_notify, 0); 2360 hotcpu_notifier(page_alloc_cpu_notify, 0);
2360 } 2361 }
2361 2362
2362 /* 2363 /*
2363 * setup_per_zone_lowmem_reserve - called whenever 2364 * setup_per_zone_lowmem_reserve - called whenever
2364 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone 2365 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone
2365 * has a correct pages reserved value, so an adequate number of 2366 * has a correct pages reserved value, so an adequate number of
2366 * pages are left in the zone after a successful __alloc_pages(). 2367 * pages are left in the zone after a successful __alloc_pages().
2367 */ 2368 */
2368 static void setup_per_zone_lowmem_reserve(void) 2369 static void setup_per_zone_lowmem_reserve(void)
2369 { 2370 {
2370 struct pglist_data *pgdat; 2371 struct pglist_data *pgdat;
2371 int j, idx; 2372 int j, idx;
2372 2373
2373 for_each_pgdat(pgdat) { 2374 for_each_pgdat(pgdat) {
2374 for (j = 0; j < MAX_NR_ZONES; j++) { 2375 for (j = 0; j < MAX_NR_ZONES; j++) {
2375 struct zone *zone = pgdat->node_zones + j; 2376 struct zone *zone = pgdat->node_zones + j;
2376 unsigned long present_pages = zone->present_pages; 2377 unsigned long present_pages = zone->present_pages;
2377 2378
2378 zone->lowmem_reserve[j] = 0; 2379 zone->lowmem_reserve[j] = 0;
2379 2380
2380 for (idx = j-1; idx >= 0; idx--) { 2381 for (idx = j-1; idx >= 0; idx--) {
2381 struct zone *lower_zone; 2382 struct zone *lower_zone;
2382 2383
2383 if (sysctl_lowmem_reserve_ratio[idx] < 1) 2384 if (sysctl_lowmem_reserve_ratio[idx] < 1)
2384 sysctl_lowmem_reserve_ratio[idx] = 1; 2385 sysctl_lowmem_reserve_ratio[idx] = 1;
2385 2386
2386 lower_zone = pgdat->node_zones + idx; 2387 lower_zone = pgdat->node_zones + idx;
2387 lower_zone->lowmem_reserve[j] = present_pages / 2388 lower_zone->lowmem_reserve[j] = present_pages /
2388 sysctl_lowmem_reserve_ratio[idx]; 2389 sysctl_lowmem_reserve_ratio[idx];
2389 present_pages += lower_zone->present_pages; 2390 present_pages += lower_zone->present_pages;
2390 } 2391 }
2391 } 2392 }
2392 } 2393 }
2393 } 2394 }
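
As a worked example of the loop above, here is a small user-space sketch. It is only a sketch: the zone sizes are made up, and the ratios of 256 for DMA and 32 for Normal are assumed from this era's defaults (defined elsewhere in this file).

#include <stdio.h>

#define MAX_NR_ZONES 3	/* DMA, Normal, HighMem for this sketch */

int main(void)
{
	/* Made-up zone sizes (in pages) and assumed default ratios. */
	unsigned long present[MAX_NR_ZONES] = { 4096, 225280, 262144 };
	unsigned long ratio[] = { 256, 32 };	/* DMA, Normal; HighMem is never a lower zone */
	unsigned long reserve[MAX_NR_ZONES][MAX_NR_ZONES] = { { 0 } };
	const char *name[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
	int j, idx;

	/* The same double loop as setup_per_zone_lowmem_reserve() above. */
	for (j = 0; j < MAX_NR_ZONES; j++) {
		unsigned long pages = present[j];

		for (idx = j - 1; idx >= 0; idx--) {
			reserve[idx][j] = pages / ratio[idx];
			pages += present[idx];
		}
	}

	for (j = 0; j < MAX_NR_ZONES; j++)
		for (idx = j + 1; idx < MAX_NR_ZONES; idx++)
			printf("%s holds back %lu pages from %s allocations\n",
			       name[j], reserve[j][idx], name[idx]);
	return 0;
}

With those inputs, DMA holds back 880 pages from Normal allocations and 1904 from HighMem, while Normal holds back 8192 pages from HighMem; these are the values that appear in the "protection:" line printed by zoneinfo_show() above.
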
2394 2395
2395 /* 2396 /*
2396 * setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures 2397 * setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures
2397 * that the pages_{min,low,high} values for each zone are set correctly 2398 * that the pages_{min,low,high} values for each zone are set correctly
2398 * with respect to min_free_kbytes. 2399 * with respect to min_free_kbytes.
2399 */ 2400 */
2400 static void setup_per_zone_pages_min(void) 2401 static void setup_per_zone_pages_min(void)
2401 { 2402 {
2402 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 2403 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
2403 unsigned long lowmem_pages = 0; 2404 unsigned long lowmem_pages = 0;
2404 struct zone *zone; 2405 struct zone *zone;
2405 unsigned long flags; 2406 unsigned long flags;
2406 2407
2407 /* Calculate total number of !ZONE_HIGHMEM pages */ 2408 /* Calculate total number of !ZONE_HIGHMEM pages */
2408 for_each_zone(zone) { 2409 for_each_zone(zone) {
2409 if (!is_highmem(zone)) 2410 if (!is_highmem(zone))
2410 lowmem_pages += zone->present_pages; 2411 lowmem_pages += zone->present_pages;
2411 } 2412 }
2412 2413
2413 for_each_zone(zone) { 2414 for_each_zone(zone) {
2414 spin_lock_irqsave(&zone->lru_lock, flags); 2415 spin_lock_irqsave(&zone->lru_lock, flags);
2415 if (is_highmem(zone)) { 2416 if (is_highmem(zone)) {
2416 /* 2417 /*
2417 * Often, highmem doesn't need to reserve any pages. 2418 * Often, highmem doesn't need to reserve any pages.
2418 * But the pages_min/low/high values are also used for 2419 * But the pages_min/low/high values are also used for
2419 * batching up page reclaim activity so we need a 2420 * batching up page reclaim activity so we need a
2420 * decent value here. 2421 * decent value here.
2421 */ 2422 */
2422 int min_pages; 2423 int min_pages;
2423 2424
2424 min_pages = zone->present_pages / 1024; 2425 min_pages = zone->present_pages / 1024;
2425 if (min_pages < SWAP_CLUSTER_MAX) 2426 if (min_pages < SWAP_CLUSTER_MAX)
2426 min_pages = SWAP_CLUSTER_MAX; 2427 min_pages = SWAP_CLUSTER_MAX;
2427 if (min_pages > 128) 2428 if (min_pages > 128)
2428 min_pages = 128; 2429 min_pages = 128;
2429 zone->pages_min = min_pages; 2430 zone->pages_min = min_pages;
2430 } else { 2431 } else {
2431 /* if it's a lowmem zone, reserve a number of pages 2432 /* if it's a lowmem zone, reserve a number of pages
2432 * proportionate to the zone's size. 2433 * proportionate to the zone's size.
2433 */ 2434 */
2434 zone->pages_min = (pages_min * zone->present_pages) / 2435 zone->pages_min = (pages_min * zone->present_pages) /
2435 lowmem_pages; 2436 lowmem_pages;
2436 } 2437 }
2437 2438
2438 /* 2439 /*
2439 * When interpreting these watermarks, just keep in mind that: 2440 * When interpreting these watermarks, just keep in mind that:
2440 * zone->pages_min == (zone->pages_min * 4) / 4; 2441 * zone->pages_min == (zone->pages_min * 4) / 4;
2441 */ 2442 */
2442 zone->pages_low = (zone->pages_min * 5) / 4; 2443 zone->pages_low = (zone->pages_min * 5) / 4;
2443 zone->pages_high = (zone->pages_min * 6) / 4; 2444 zone->pages_high = (zone->pages_min * 6) / 4;
2444 spin_unlock_irqrestore(&zone->lru_lock, flags); 2445 spin_unlock_irqrestore(&zone->lru_lock, flags);
2445 } 2446 }
2446 } 2447 }
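
A hedged numeric sketch of the watermark arithmetic above: it assumes min_free_kbytes = 4096, SWAP_CLUSTER_MAX = 32, 4 KiB pages and made-up zone sizes, none of which come from this diff.

#include <stdio.h>

#define PAGE_SHIFT	 12	/* assumed: 4 KiB pages */
#define SWAP_CLUSTER_MAX 32	/* assumed value for this era */

static void show(const char *name, unsigned long pages_min)
{
	printf("%-8s min=%lu low=%lu high=%lu\n", name,
	       pages_min, pages_min * 5 / 4, pages_min * 6 / 4);
}

int main(void)
{
	unsigned long min_free_kbytes = 4096;			/* assumed */
	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
	unsigned long dma = 4096, normal = 225280, highmem = 262144;
	unsigned long lowmem_pages = dma + normal;
	unsigned long hi_min;

	/* Lowmem zones split pages_min in proportion to their size. */
	show("DMA", pages_min * dma / lowmem_pages);
	show("Normal", pages_min * normal / lowmem_pages);

	/* Highmem only needs a small, clamped reserve for reclaim batching. */
	hi_min = highmem / 1024;
	if (hi_min < SWAP_CLUSTER_MAX)
		hi_min = SWAP_CLUSTER_MAX;
	if (hi_min > 128)
		hi_min = 128;
	show("HighMem", hi_min);
	return 0;
}
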
2447 2448
2448 /* 2449 /*
2449 * Initialise min_free_kbytes. 2450 * Initialise min_free_kbytes.
2450 * 2451 *
2451 * For small machines we want it small (128k min). For large machines 2452 * For small machines we want it small (128k min). For large machines
2452 * we want it large (64MB max). But it is not linear, because network 2453 * we want it large (64MB max). But it is not linear, because network
2453 * bandwidth does not increase linearly with machine size. We use 2454 * bandwidth does not increase linearly with machine size. We use
2454 * 2455 *
2455 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 2456 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
2456 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 2457 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
2457 * 2458 *
2458 * which yields 2459 * which yields
2459 * 2460 *
2460 * 16MB: 512k 2461 * 16MB: 512k
2461 * 32MB: 724k 2462 * 32MB: 724k
2462 * 64MB: 1024k 2463 * 64MB: 1024k
2463 * 128MB: 1448k 2464 * 128MB: 1448k
2464 * 256MB: 2048k 2465 * 256MB: 2048k
2465 * 512MB: 2896k 2466 * 512MB: 2896k
2466 * 1024MB: 4096k 2467 * 1024MB: 4096k
2467 * 2048MB: 5792k 2468 * 2048MB: 5792k
2468 * 4096MB: 8192k 2469 * 4096MB: 8192k
2469 * 8192MB: 11584k 2470 * 8192MB: 11584k
2470 * 16384MB: 16384k 2471 * 16384MB: 16384k
2471 */ 2472 */
2472 static int __init init_per_zone_pages_min(void) 2473 static int __init init_per_zone_pages_min(void)
2473 { 2474 {
2474 unsigned long lowmem_kbytes; 2475 unsigned long lowmem_kbytes;
2475 2476
2476 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 2477 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
2477 2478
2478 min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 2479 min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
2479 if (min_free_kbytes < 128) 2480 if (min_free_kbytes < 128)
2480 min_free_kbytes = 128; 2481 min_free_kbytes = 128;
2481 if (min_free_kbytes > 65536) 2482 if (min_free_kbytes > 65536)
2482 min_free_kbytes = 65536; 2483 min_free_kbytes = 65536;
2483 setup_per_zone_pages_min(); 2484 setup_per_zone_pages_min();
2484 setup_per_zone_lowmem_reserve(); 2485 setup_per_zone_lowmem_reserve();
2485 return 0; 2486 return 0;
2486 } 2487 }
2487 module_init(init_per_zone_pages_min) 2488 module_init(init_per_zone_pages_min)
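
The table in the comment above can be reproduced with a few lines of user-space C. The only liberty taken is a crude integer square root standing in for the kernel's int_sqrt():

#include <stdio.h>

/* Crude integer square root, a stand-in for int_sqrt(). */
static unsigned long isqrt(unsigned long long x)
{
	unsigned long r = 0;

	while ((unsigned long long)(r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	unsigned long mb[] = { 16, 128, 1024, 16384 };
	int i;

	for (i = 0; i < 4; i++) {
		unsigned long lowmem_kbytes = mb[i] * 1024;
		unsigned long min_free = isqrt((unsigned long long)lowmem_kbytes * 16);

		if (min_free < 128)		/* same clamps as above */
			min_free = 128;
		if (min_free > 65536)
			min_free = 65536;
		printf("%6luMB lowmem -> min_free_kbytes = %lu\n", mb[i], min_free);
	}
	return 0;
}

Running it prints 512, 1448, 4096 and 16384 for the four sizes checked, matching the table.
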
2488 2489
2489 /* 2490 /*
2490 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 2491 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
2491 * that we can call setup_per_zone_pages_min() whenever min_free_kbytes 2492 * that we can call setup_per_zone_pages_min() whenever min_free_kbytes
2492 * changes. 2493 * changes.
2493 */ 2494 */
2494 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 2495 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
2495 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 2496 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
2496 { 2497 {
2497 proc_dointvec(table, write, file, buffer, length, ppos); 2498 proc_dointvec(table, write, file, buffer, length, ppos);
2498 setup_per_zone_pages_min(); 2499 setup_per_zone_pages_min();
2499 return 0; 2500 return 0;
2500 } 2501 }
2501 2502
2502 /* 2503 /*
2503 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 2504 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
2504 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 2505 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
2505 * whenever sysctl_lowmem_reserve_ratio changes. 2506 * whenever sysctl_lowmem_reserve_ratio changes.
2506 * 2507 *
2507 * The reserve ratio obviously has absolutely no relation with the 2508 * The reserve ratio obviously has absolutely no relation with the
2508 * pages_min watermarks. The lowmem reserve ratio can only make sense 2509 * pages_min watermarks. The lowmem reserve ratio can only make sense
2509 * as a function of the boot-time zone sizes. 2510 * as a function of the boot-time zone sizes.
2510 */ 2511 */
2511 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 2512 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
2512 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 2513 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
2513 { 2514 {
2514 proc_dointvec_minmax(table, write, file, buffer, length, ppos); 2515 proc_dointvec_minmax(table, write, file, buffer, length, ppos);
2515 setup_per_zone_lowmem_reserve(); 2516 setup_per_zone_lowmem_reserve();
2516 return 0; 2517 return 0;
2517 } 2518 }
2518 2519
2519 __initdata int hashdist = HASHDIST_DEFAULT; 2520 __initdata int hashdist = HASHDIST_DEFAULT;
2520 2521
2521 #ifdef CONFIG_NUMA 2522 #ifdef CONFIG_NUMA
2522 static int __init set_hashdist(char *str) 2523 static int __init set_hashdist(char *str)
2523 { 2524 {
2524 if (!str) 2525 if (!str)
2525 return 0; 2526 return 0;
2526 hashdist = simple_strtoul(str, &str, 0); 2527 hashdist = simple_strtoul(str, &str, 0);
2527 return 1; 2528 return 1;
2528 } 2529 }
2529 __setup("hashdist=", set_hashdist); 2530 __setup("hashdist=", set_hashdist);
2530 #endif 2531 #endif
2531 2532
2532 /* 2533 /*
2533 * allocate a large system hash table from bootmem 2534 * allocate a large system hash table from bootmem
2534 * - it is assumed that the hash table must contain an exact power-of-2 2535 * - it is assumed that the hash table must contain an exact power-of-2
2535 * quantity of entries 2536 * quantity of entries
2536 * - limit is the number of hash buckets, not the total allocation size 2537 * - limit is the number of hash buckets, not the total allocation size
2537 */ 2538 */
2538 void *__init alloc_large_system_hash(const char *tablename, 2539 void *__init alloc_large_system_hash(const char *tablename,
2539 unsigned long bucketsize, 2540 unsigned long bucketsize,
2540 unsigned long numentries, 2541 unsigned long numentries,
2541 int scale, 2542 int scale,
2542 int flags, 2543 int flags,
2543 unsigned int *_hash_shift, 2544 unsigned int *_hash_shift,
2544 unsigned int *_hash_mask, 2545 unsigned int *_hash_mask,
2545 unsigned long limit) 2546 unsigned long limit)
2546 { 2547 {
2547 unsigned long long max = limit; 2548 unsigned long long max = limit;
2548 unsigned long log2qty, size; 2549 unsigned long log2qty, size;
2549 void *table = NULL; 2550 void *table = NULL;
2550 2551
2551 /* allow the kernel cmdline to have a say */ 2552 /* allow the kernel cmdline to have a say */
2552 if (!numentries) { 2553 if (!numentries) {
2553 /* round applicable memory size up to nearest megabyte */ 2554 /* round applicable memory size up to nearest megabyte */
2554 numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages; 2555 numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages;
2555 numentries += (1UL << (20 - PAGE_SHIFT)) - 1; 2556 numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
2556 numentries >>= 20 - PAGE_SHIFT; 2557 numentries >>= 20 - PAGE_SHIFT;
2557 numentries <<= 20 - PAGE_SHIFT; 2558 numentries <<= 20 - PAGE_SHIFT;
2558 2559
2559 /* limit to 1 bucket per 2^scale bytes of low memory */ 2560 /* limit to 1 bucket per 2^scale bytes of low memory */
2560 if (scale > PAGE_SHIFT) 2561 if (scale > PAGE_SHIFT)
2561 numentries >>= (scale - PAGE_SHIFT); 2562 numentries >>= (scale - PAGE_SHIFT);
2562 else 2563 else
2563 numentries <<= (PAGE_SHIFT - scale); 2564 numentries <<= (PAGE_SHIFT - scale);
2564 } 2565 }
2565 /* rounded up to nearest power of 2 in size */ 2566 /* rounded up to nearest power of 2 in size */
2566 numentries = 1UL << (long_log2(numentries) + 1); 2567 numentries = 1UL << (long_log2(numentries) + 1);
2567 2568
2568 /* limit allocation size to 1/16 total memory by default */ 2569 /* limit allocation size to 1/16 total memory by default */
2569 if (max == 0) { 2570 if (max == 0) {
2570 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 2571 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
2571 do_div(max, bucketsize); 2572 do_div(max, bucketsize);
2572 } 2573 }
2573 2574
2574 if (numentries > max) 2575 if (numentries > max)
2575 numentries = max; 2576 numentries = max;
2576 2577
2577 log2qty = long_log2(numentries); 2578 log2qty = long_log2(numentries);
2578 2579
2579 do { 2580 do {
2580 size = bucketsize << log2qty; 2581 size = bucketsize << log2qty;
2581 if (flags & HASH_EARLY) 2582 if (flags & HASH_EARLY)
2582 table = alloc_bootmem(size); 2583 table = alloc_bootmem(size);
2583 else if (hashdist) 2584 else if (hashdist)
2584 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 2585 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
2585 else { 2586 else {
2586 unsigned long order; 2587 unsigned long order;
2587 for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++) 2588 for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
2588 ; 2589 ;
2589 table = (void*) __get_free_pages(GFP_ATOMIC, order); 2590 table = (void*) __get_free_pages(GFP_ATOMIC, order);
2590 } 2591 }
2591 } while (!table && size > PAGE_SIZE && --log2qty); 2592 } while (!table && size > PAGE_SIZE && --log2qty);
2592 2593
2593 if (!table) 2594 if (!table)
2594 panic("Failed to allocate %s hash table\n", tablename); 2595 panic("Failed to allocate %s hash table\n", tablename);
2595 2596
2596 printk("%s hash table entries: %d (order: %d, %lu bytes)\n", 2597 printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
2597 tablename, 2598 tablename,
2598 (1U << log2qty), 2599 (1U << log2qty),
2599 long_log2(size) - PAGE_SHIFT, 2600 long_log2(size) - PAGE_SHIFT,
2600 size); 2601 size);
2601 2602
2602 if (_hash_shift) 2603 if (_hash_shift)
2603 *_hash_shift = log2qty; 2604 *_hash_shift = log2qty;
2604 if (_hash_mask) 2605 if (_hash_mask)
2605 *_hash_mask = (1 << log2qty) - 1; 2606 *_hash_mask = (1 << log2qty) - 1;
2606 2607
2607 return table; 2608 return table;
2608 } 2609 }
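
To see how the sizing plays out, here is a hedged user-space trace of the same arithmetic for a hypothetical table. The 1 GiB of kernel lowmem, the bucket size of 8 bytes (one pointer-sized list head) and scale = 14 (one bucket per 16 KiB) are all illustrative choices, and long_log2() is approximated by an open-coded floor log2.

#include <stdio.h>

#define PAGE_SHIFT 12				/* assumed: 4 KiB pages */

static unsigned long ilog2_floor(unsigned long x)	/* stand-in for long_log2() */
{
	unsigned long r = 0;

	while (x > 1) {
		x >>= 1;
		r++;
	}
	return r;
}

int main(void)
{
	unsigned long nr_kernel_pages = 262144;	/* ~1 GiB of lowmem, illustrative */
	unsigned long bucketsize = 8;		/* one pointer-sized list head, assumed */
	int scale = 14;				/* one bucket per 16 KiB, illustrative */
	unsigned long numentries, log2qty, size;
	unsigned long long max;

	/* Round the page count up to a whole number of megabytes. */
	numentries = nr_kernel_pages + (1UL << (20 - PAGE_SHIFT)) - 1;
	numentries >>= 20 - PAGE_SHIFT;
	numentries <<= 20 - PAGE_SHIFT;

	/* One bucket per 2^scale bytes, then round up to the next power of two. */
	numentries >>= scale - PAGE_SHIFT;
	numentries = 1UL << (ilog2_floor(numentries) + 1);

	/* Default cap: at most 1/16 of memory (the kernel uses nr_all_pages
	 * here; with no highmem that equals nr_kernel_pages). */
	max = ((unsigned long long)nr_kernel_pages << PAGE_SHIFT) >> 4;
	max /= bucketsize;
	if (numentries > max)
		numentries = max;

	log2qty = ilog2_floor(numentries);
	size = bucketsize << log2qty;
	printf("entries=%lu (order %lu, %lu bytes)\n",
	       1UL << log2qty, ilog2_floor(size) - PAGE_SHIFT, size);
	return 0;
}

This lands on 131072 buckets in a 1 MiB, order-8 allocation, the same figures the printk above would report for these inputs.
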
2609 2610