Commit 476f35348eb8d2a827765992899fea78b7dcc46f

Authored by Christoph Lameter
Committed by Linus Torvalds
1 parent aee16b3cee

Safer nr_cpu_ids and nr_node_ids determination and initial values

The nr_cpu_ids value is currently only calculated in smp_init().  However,
it may be needed before that (SLUB needs it in kmem_cache_init()!), and
other kernel components may also want to allocate dynamically sized per-cpu
arrays before smp_init().  So move the determination of the number of
possible cpus into sched_init(), where we already loop over all possible
cpus early in boot.
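
Roughly, the determination ends up piggybacking on the loop that
sched_init() already performs over all possible cpus, along the lines of
the sketch below (the kernel/sched.c hunk itself is not visible in the
truncated diff further down, so this is an illustration, not the verbatim
change):

    void __init sched_init(void)
    {
            int i;
            int highest_cpu = 0;    /* highest possible cpu number seen */

            for_each_possible_cpu(i) {
                    /* ... existing per-cpu runqueue setup ... */
                    highest_cpu = i;
            }
    #ifdef CONFIG_SMP
            /* possible cpus are numbered 0..highest_cpu */
            nr_cpu_ids = highest_cpu + 1;
    #endif
            /* ... */
    }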

Also initialize both nr_cpu_ids and nr_node_ids with the highest value they
could take.  If there are accidental users before these values are
determined, the current value of 0 may cause too-small per-cpu and per-node
arrays to be allocated.  If they are instead set to the maximum possible
values, we only waste some memory for early boot users.
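
Concretely, the safe initial values amount to something like the following
(again a sketch: nr_cpu_ids is assumed to be defined in kernel/sched.c and
nr_node_ids in mm/page_alloc.c, and neither hunk is shown in the excerpt
below):

    /*
     * Worst-case defaults until the real counts are determined during boot.
     * Early users may waste a little memory, but never get arrays that are
     * too small.
     */
    int nr_cpu_ids __read_mostly = NR_CPUS;
    int nr_node_ids __read_mostly = MAX_NUMNODES;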

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 4 changed files with 9 additions and 9 deletions

1 /* 1 /*
2 * linux/init/main.c 2 * linux/init/main.c
3 * 3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * 5 *
6 * GK 2/5/95 - Changed to support mounting root fs via NFS 6 * GK 2/5/95 - Changed to support mounting root fs via NFS
7 * Added initrd & change_root: Werner Almesberger & Hans Lermen, Feb '96 7 * Added initrd & change_root: Werner Almesberger & Hans Lermen, Feb '96
8 * Moan early if gcc is old, avoiding bogus kernels - Paul Gortmaker, May '96 8 * Moan early if gcc is old, avoiding bogus kernels - Paul Gortmaker, May '96
9 * Simplified starting of init: Michael A. Griffith <grif@acm.org> 9 * Simplified starting of init: Michael A. Griffith <grif@acm.org>
10 */ 10 */
11 11
12 #include <linux/types.h> 12 #include <linux/types.h>
13 #include <linux/module.h> 13 #include <linux/module.h>
14 #include <linux/proc_fs.h> 14 #include <linux/proc_fs.h>
15 #include <linux/kernel.h> 15 #include <linux/kernel.h>
16 #include <linux/syscalls.h> 16 #include <linux/syscalls.h>
17 #include <linux/string.h> 17 #include <linux/string.h>
18 #include <linux/ctype.h> 18 #include <linux/ctype.h>
19 #include <linux/delay.h> 19 #include <linux/delay.h>
20 #include <linux/utsname.h> 20 #include <linux/utsname.h>
21 #include <linux/ioport.h> 21 #include <linux/ioport.h>
22 #include <linux/init.h> 22 #include <linux/init.h>
23 #include <linux/smp_lock.h> 23 #include <linux/smp_lock.h>
24 #include <linux/initrd.h> 24 #include <linux/initrd.h>
25 #include <linux/hdreg.h> 25 #include <linux/hdreg.h>
26 #include <linux/bootmem.h> 26 #include <linux/bootmem.h>
27 #include <linux/tty.h> 27 #include <linux/tty.h>
28 #include <linux/gfp.h> 28 #include <linux/gfp.h>
29 #include <linux/percpu.h> 29 #include <linux/percpu.h>
30 #include <linux/kmod.h> 30 #include <linux/kmod.h>
31 #include <linux/kernel_stat.h> 31 #include <linux/kernel_stat.h>
32 #include <linux/start_kernel.h> 32 #include <linux/start_kernel.h>
33 #include <linux/security.h> 33 #include <linux/security.h>
34 #include <linux/workqueue.h> 34 #include <linux/workqueue.h>
35 #include <linux/profile.h> 35 #include <linux/profile.h>
36 #include <linux/rcupdate.h> 36 #include <linux/rcupdate.h>
37 #include <linux/moduleparam.h> 37 #include <linux/moduleparam.h>
38 #include <linux/kallsyms.h> 38 #include <linux/kallsyms.h>
39 #include <linux/writeback.h> 39 #include <linux/writeback.h>
40 #include <linux/cpu.h> 40 #include <linux/cpu.h>
41 #include <linux/cpuset.h> 41 #include <linux/cpuset.h>
42 #include <linux/efi.h> 42 #include <linux/efi.h>
43 #include <linux/tick.h> 43 #include <linux/tick.h>
44 #include <linux/interrupt.h> 44 #include <linux/interrupt.h>
45 #include <linux/taskstats_kern.h> 45 #include <linux/taskstats_kern.h>
46 #include <linux/delayacct.h> 46 #include <linux/delayacct.h>
47 #include <linux/unistd.h> 47 #include <linux/unistd.h>
48 #include <linux/rmap.h> 48 #include <linux/rmap.h>
49 #include <linux/mempolicy.h> 49 #include <linux/mempolicy.h>
50 #include <linux/key.h> 50 #include <linux/key.h>
51 #include <linux/unwind.h> 51 #include <linux/unwind.h>
52 #include <linux/buffer_head.h> 52 #include <linux/buffer_head.h>
53 #include <linux/debug_locks.h> 53 #include <linux/debug_locks.h>
54 #include <linux/lockdep.h> 54 #include <linux/lockdep.h>
55 #include <linux/pid_namespace.h> 55 #include <linux/pid_namespace.h>
56 #include <linux/device.h> 56 #include <linux/device.h>
57 57
58 #include <asm/io.h> 58 #include <asm/io.h>
59 #include <asm/bugs.h> 59 #include <asm/bugs.h>
60 #include <asm/setup.h> 60 #include <asm/setup.h>
61 #include <asm/sections.h> 61 #include <asm/sections.h>
62 #include <asm/cacheflush.h> 62 #include <asm/cacheflush.h>
63 63
64 #ifdef CONFIG_X86_LOCAL_APIC 64 #ifdef CONFIG_X86_LOCAL_APIC
65 #include <asm/smp.h> 65 #include <asm/smp.h>
66 #endif 66 #endif
67 67
68 /* 68 /*
69 * This is one of the first .c files built. Error out early if we have compiler 69 * This is one of the first .c files built. Error out early if we have compiler
70 * trouble. 70 * trouble.
71 * 71 *
72 * Versions of gcc older than that listed below may actually compile and link 72 * Versions of gcc older than that listed below may actually compile and link
73 * okay, but the end product can have subtle run time bugs. To avoid associated 73 * okay, but the end product can have subtle run time bugs. To avoid associated
74 * bogus bug reports, we flatly refuse to compile with a gcc that is known to be 74 * bogus bug reports, we flatly refuse to compile with a gcc that is known to be
75 * too old from the very beginning. 75 * too old from the very beginning.
76 */ 76 */
77 #if (__GNUC__ < 3) || (__GNUC__ == 3 && __GNUC_MINOR__ < 2) 77 #if (__GNUC__ < 3) || (__GNUC__ == 3 && __GNUC_MINOR__ < 2)
78 #error Sorry, your GCC is too old. It builds incorrect kernels. 78 #error Sorry, your GCC is too old. It builds incorrect kernels.
79 #endif 79 #endif
80 80
81 #if __GNUC__ == 4 && __GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ == 0 81 #if __GNUC__ == 4 && __GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ == 0
82 #warning gcc-4.1.0 is known to miscompile the kernel. A different compiler version is recommended. 82 #warning gcc-4.1.0 is known to miscompile the kernel. A different compiler version is recommended.
83 #endif 83 #endif
84 84
85 static int kernel_init(void *); 85 static int kernel_init(void *);
86 86
87 extern void init_IRQ(void); 87 extern void init_IRQ(void);
88 extern void fork_init(unsigned long); 88 extern void fork_init(unsigned long);
89 extern void mca_init(void); 89 extern void mca_init(void);
90 extern void sbus_init(void); 90 extern void sbus_init(void);
91 extern void signals_init(void); 91 extern void signals_init(void);
92 extern void pidhash_init(void); 92 extern void pidhash_init(void);
93 extern void pidmap_init(void); 93 extern void pidmap_init(void);
94 extern void prio_tree_init(void); 94 extern void prio_tree_init(void);
95 extern void radix_tree_init(void); 95 extern void radix_tree_init(void);
96 extern void free_initmem(void); 96 extern void free_initmem(void);
97 extern void prepare_namespace(void); 97 extern void prepare_namespace(void);
98 #ifdef CONFIG_ACPI 98 #ifdef CONFIG_ACPI
99 extern void acpi_early_init(void); 99 extern void acpi_early_init(void);
100 #else 100 #else
101 static inline void acpi_early_init(void) { } 101 static inline void acpi_early_init(void) { }
102 #endif 102 #endif
103 #ifndef CONFIG_DEBUG_RODATA 103 #ifndef CONFIG_DEBUG_RODATA
104 static inline void mark_rodata_ro(void) { } 104 static inline void mark_rodata_ro(void) { }
105 #endif 105 #endif
106 106
107 #ifdef CONFIG_TC 107 #ifdef CONFIG_TC
108 extern void tc_init(void); 108 extern void tc_init(void);
109 #endif 109 #endif
110 110
111 enum system_states system_state; 111 enum system_states system_state;
112 EXPORT_SYMBOL(system_state); 112 EXPORT_SYMBOL(system_state);
113 113
114 /* 114 /*
115 * Boot command-line arguments 115 * Boot command-line arguments
116 */ 116 */
117 #define MAX_INIT_ARGS CONFIG_INIT_ENV_ARG_LIMIT 117 #define MAX_INIT_ARGS CONFIG_INIT_ENV_ARG_LIMIT
118 #define MAX_INIT_ENVS CONFIG_INIT_ENV_ARG_LIMIT 118 #define MAX_INIT_ENVS CONFIG_INIT_ENV_ARG_LIMIT
119 119
120 extern void time_init(void); 120 extern void time_init(void);
121 /* Default late time init is NULL. archs can override this later. */ 121 /* Default late time init is NULL. archs can override this later. */
122 void (*late_time_init)(void); 122 void (*late_time_init)(void);
123 extern void softirq_init(void); 123 extern void softirq_init(void);
124 124
125 /* Untouched command line saved by arch-specific code. */ 125 /* Untouched command line saved by arch-specific code. */
126 char __initdata boot_command_line[COMMAND_LINE_SIZE]; 126 char __initdata boot_command_line[COMMAND_LINE_SIZE];
127 /* Untouched saved command line (eg. for /proc) */ 127 /* Untouched saved command line (eg. for /proc) */
128 char *saved_command_line; 128 char *saved_command_line;
129 /* Command line for parameter parsing */ 129 /* Command line for parameter parsing */
130 static char *static_command_line; 130 static char *static_command_line;
131 131
132 static char *execute_command; 132 static char *execute_command;
133 static char *ramdisk_execute_command; 133 static char *ramdisk_execute_command;
134 134
135 /* Setup configured maximum number of CPUs to activate */ 135 /* Setup configured maximum number of CPUs to activate */
136 static unsigned int max_cpus = NR_CPUS; 136 static unsigned int max_cpus = NR_CPUS;
137 137
138 /* 138 /*
139 * If set, this is an indication to the drivers that reset the underlying 139 * If set, this is an indication to the drivers that reset the underlying
140 * device before going ahead with the initialization otherwise driver might 140 * device before going ahead with the initialization otherwise driver might
141 * rely on the BIOS and skip the reset operation. 141 * rely on the BIOS and skip the reset operation.
142 * 142 *
143 * This is useful if kernel is booting in an unreliable environment. 143 * This is useful if kernel is booting in an unreliable environment.
144 * For ex. kdump situaiton where previous kernel has crashed, BIOS has been 144 * For ex. kdump situaiton where previous kernel has crashed, BIOS has been
145 * skipped and devices will be in unknown state. 145 * skipped and devices will be in unknown state.
146 */ 146 */
147 unsigned int reset_devices; 147 unsigned int reset_devices;
148 EXPORT_SYMBOL(reset_devices); 148 EXPORT_SYMBOL(reset_devices);
149 149
150 /* 150 /*
151 * Setup routine for controlling SMP activation 151 * Setup routine for controlling SMP activation
152 * 152 *
153 * Command-line option of "nosmp" or "maxcpus=0" will disable SMP 153 * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
154 * activation entirely (the MPS table probe still happens, though). 154 * activation entirely (the MPS table probe still happens, though).
155 * 155 *
156 * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer 156 * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
157 * greater than 0, limits the maximum number of CPUs activated in 157 * greater than 0, limits the maximum number of CPUs activated in
158 * SMP mode to <NUM>. 158 * SMP mode to <NUM>.
159 */ 159 */
160 static int __init nosmp(char *str) 160 static int __init nosmp(char *str)
161 { 161 {
162 max_cpus = 0; 162 max_cpus = 0;
163 return 1; 163 return 1;
164 } 164 }
165 165
166 __setup("nosmp", nosmp); 166 __setup("nosmp", nosmp);
167 167
168 static int __init maxcpus(char *str) 168 static int __init maxcpus(char *str)
169 { 169 {
170 get_option(&str, &max_cpus); 170 get_option(&str, &max_cpus);
171 return 1; 171 return 1;
172 } 172 }
173 173
174 __setup("maxcpus=", maxcpus); 174 __setup("maxcpus=", maxcpus);
175 175
176 static int __init set_reset_devices(char *str) 176 static int __init set_reset_devices(char *str)
177 { 177 {
178 reset_devices = 1; 178 reset_devices = 1;
179 return 1; 179 return 1;
180 } 180 }
181 181
182 __setup("reset_devices", set_reset_devices); 182 __setup("reset_devices", set_reset_devices);
183 183
184 static char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, }; 184 static char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, };
185 char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, }; 185 char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, };
186 static const char *panic_later, *panic_param; 186 static const char *panic_later, *panic_param;
187 187
188 extern struct obs_kernel_param __setup_start[], __setup_end[]; 188 extern struct obs_kernel_param __setup_start[], __setup_end[];
189 189
190 static int __init obsolete_checksetup(char *line) 190 static int __init obsolete_checksetup(char *line)
191 { 191 {
192 struct obs_kernel_param *p; 192 struct obs_kernel_param *p;
193 int had_early_param = 0; 193 int had_early_param = 0;
194 194
195 p = __setup_start; 195 p = __setup_start;
196 do { 196 do {
197 int n = strlen(p->str); 197 int n = strlen(p->str);
198 if (!strncmp(line, p->str, n)) { 198 if (!strncmp(line, p->str, n)) {
199 if (p->early) { 199 if (p->early) {
200 /* Already done in parse_early_param? 200 /* Already done in parse_early_param?
201 * (Needs exact match on param part). 201 * (Needs exact match on param part).
202 * Keep iterating, as we can have early 202 * Keep iterating, as we can have early
203 * params and __setups of same names 8( */ 203 * params and __setups of same names 8( */
204 if (line[n] == '\0' || line[n] == '=') 204 if (line[n] == '\0' || line[n] == '=')
205 had_early_param = 1; 205 had_early_param = 1;
206 } else if (!p->setup_func) { 206 } else if (!p->setup_func) {
207 printk(KERN_WARNING "Parameter %s is obsolete," 207 printk(KERN_WARNING "Parameter %s is obsolete,"
208 " ignored\n", p->str); 208 " ignored\n", p->str);
209 return 1; 209 return 1;
210 } else if (p->setup_func(line + n)) 210 } else if (p->setup_func(line + n))
211 return 1; 211 return 1;
212 } 212 }
213 p++; 213 p++;
214 } while (p < __setup_end); 214 } while (p < __setup_end);
215 215
216 return had_early_param; 216 return had_early_param;
217 } 217 }
218 218
219 /* 219 /*
220 * This should be approx 2 Bo*oMips to start (note initial shift), and will 220 * This should be approx 2 Bo*oMips to start (note initial shift), and will
221 * still work even if initially too large, it will just take slightly longer 221 * still work even if initially too large, it will just take slightly longer
222 */ 222 */
223 unsigned long loops_per_jiffy = (1<<12); 223 unsigned long loops_per_jiffy = (1<<12);
224 224
225 EXPORT_SYMBOL(loops_per_jiffy); 225 EXPORT_SYMBOL(loops_per_jiffy);
226 226
227 static int __init debug_kernel(char *str) 227 static int __init debug_kernel(char *str)
228 { 228 {
229 if (*str) 229 if (*str)
230 return 0; 230 return 0;
231 console_loglevel = 10; 231 console_loglevel = 10;
232 return 1; 232 return 1;
233 } 233 }
234 234
235 static int __init quiet_kernel(char *str) 235 static int __init quiet_kernel(char *str)
236 { 236 {
237 if (*str) 237 if (*str)
238 return 0; 238 return 0;
239 console_loglevel = 4; 239 console_loglevel = 4;
240 return 1; 240 return 1;
241 } 241 }
242 242
243 __setup("debug", debug_kernel); 243 __setup("debug", debug_kernel);
244 __setup("quiet", quiet_kernel); 244 __setup("quiet", quiet_kernel);
245 245
246 static int __init loglevel(char *str) 246 static int __init loglevel(char *str)
247 { 247 {
248 get_option(&str, &console_loglevel); 248 get_option(&str, &console_loglevel);
249 return 1; 249 return 1;
250 } 250 }
251 251
252 __setup("loglevel=", loglevel); 252 __setup("loglevel=", loglevel);
253 253
254 /* 254 /*
255 * Unknown boot options get handed to init, unless they look like 255 * Unknown boot options get handed to init, unless they look like
256 * failed parameters 256 * failed parameters
257 */ 257 */
258 static int __init unknown_bootoption(char *param, char *val) 258 static int __init unknown_bootoption(char *param, char *val)
259 { 259 {
260 /* Change NUL term back to "=", to make "param" the whole string. */ 260 /* Change NUL term back to "=", to make "param" the whole string. */
261 if (val) { 261 if (val) {
262 /* param=val or param="val"? */ 262 /* param=val or param="val"? */
263 if (val == param+strlen(param)+1) 263 if (val == param+strlen(param)+1)
264 val[-1] = '='; 264 val[-1] = '=';
265 else if (val == param+strlen(param)+2) { 265 else if (val == param+strlen(param)+2) {
266 val[-2] = '='; 266 val[-2] = '=';
267 memmove(val-1, val, strlen(val)+1); 267 memmove(val-1, val, strlen(val)+1);
268 val--; 268 val--;
269 } else 269 } else
270 BUG(); 270 BUG();
271 } 271 }
272 272
273 /* Handle obsolete-style parameters */ 273 /* Handle obsolete-style parameters */
274 if (obsolete_checksetup(param)) 274 if (obsolete_checksetup(param))
275 return 0; 275 return 0;
276 276
277 /* 277 /*
278 * Preemptive maintenance for "why didn't my mispelled command 278 * Preemptive maintenance for "why didn't my mispelled command
279 * line work?" 279 * line work?"
280 */ 280 */
281 if (strchr(param, '.') && (!val || strchr(param, '.') < val)) { 281 if (strchr(param, '.') && (!val || strchr(param, '.') < val)) {
282 printk(KERN_ERR "Unknown boot option `%s': ignoring\n", param); 282 printk(KERN_ERR "Unknown boot option `%s': ignoring\n", param);
283 return 0; 283 return 0;
284 } 284 }
285 285
286 if (panic_later) 286 if (panic_later)
287 return 0; 287 return 0;
288 288
289 if (val) { 289 if (val) {
290 /* Environment option */ 290 /* Environment option */
291 unsigned int i; 291 unsigned int i;
292 for (i = 0; envp_init[i]; i++) { 292 for (i = 0; envp_init[i]; i++) {
293 if (i == MAX_INIT_ENVS) { 293 if (i == MAX_INIT_ENVS) {
294 panic_later = "Too many boot env vars at `%s'"; 294 panic_later = "Too many boot env vars at `%s'";
295 panic_param = param; 295 panic_param = param;
296 } 296 }
297 if (!strncmp(param, envp_init[i], val - param)) 297 if (!strncmp(param, envp_init[i], val - param))
298 break; 298 break;
299 } 299 }
300 envp_init[i] = param; 300 envp_init[i] = param;
301 } else { 301 } else {
302 /* Command line option */ 302 /* Command line option */
303 unsigned int i; 303 unsigned int i;
304 for (i = 0; argv_init[i]; i++) { 304 for (i = 0; argv_init[i]; i++) {
305 if (i == MAX_INIT_ARGS) { 305 if (i == MAX_INIT_ARGS) {
306 panic_later = "Too many boot init vars at `%s'"; 306 panic_later = "Too many boot init vars at `%s'";
307 panic_param = param; 307 panic_param = param;
308 } 308 }
309 } 309 }
310 argv_init[i] = param; 310 argv_init[i] = param;
311 } 311 }
312 return 0; 312 return 0;
313 } 313 }
314 314
315 static int __init init_setup(char *str) 315 static int __init init_setup(char *str)
316 { 316 {
317 unsigned int i; 317 unsigned int i;
318 318
319 execute_command = str; 319 execute_command = str;
320 /* 320 /*
321 * In case LILO is going to boot us with default command line, 321 * In case LILO is going to boot us with default command line,
322 * it prepends "auto" before the whole cmdline which makes 322 * it prepends "auto" before the whole cmdline which makes
323 * the shell think it should execute a script with such name. 323 * the shell think it should execute a script with such name.
324 * So we ignore all arguments entered _before_ init=... [MJ] 324 * So we ignore all arguments entered _before_ init=... [MJ]
325 */ 325 */
326 for (i = 1; i < MAX_INIT_ARGS; i++) 326 for (i = 1; i < MAX_INIT_ARGS; i++)
327 argv_init[i] = NULL; 327 argv_init[i] = NULL;
328 return 1; 328 return 1;
329 } 329 }
330 __setup("init=", init_setup); 330 __setup("init=", init_setup);
331 331
332 static int __init rdinit_setup(char *str) 332 static int __init rdinit_setup(char *str)
333 { 333 {
334 unsigned int i; 334 unsigned int i;
335 335
336 ramdisk_execute_command = str; 336 ramdisk_execute_command = str;
337 /* See "auto" comment in init_setup */ 337 /* See "auto" comment in init_setup */
338 for (i = 1; i < MAX_INIT_ARGS; i++) 338 for (i = 1; i < MAX_INIT_ARGS; i++)
339 argv_init[i] = NULL; 339 argv_init[i] = NULL;
340 return 1; 340 return 1;
341 } 341 }
342 __setup("rdinit=", rdinit_setup); 342 __setup("rdinit=", rdinit_setup);
343 343
344 #ifndef CONFIG_SMP 344 #ifndef CONFIG_SMP
345 345
346 #ifdef CONFIG_X86_LOCAL_APIC 346 #ifdef CONFIG_X86_LOCAL_APIC
347 static void __init smp_init(void) 347 static void __init smp_init(void)
348 { 348 {
349 APIC_init_uniprocessor(); 349 APIC_init_uniprocessor();
350 } 350 }
351 #else 351 #else
352 #define smp_init() do { } while (0) 352 #define smp_init() do { } while (0)
353 #endif 353 #endif
354 354
355 static inline void setup_per_cpu_areas(void) { } 355 static inline void setup_per_cpu_areas(void) { }
356 static inline void smp_prepare_cpus(unsigned int maxcpus) { } 356 static inline void smp_prepare_cpus(unsigned int maxcpus) { }
357 357
358 #else 358 #else
359 359
360 #ifdef __GENERIC_PER_CPU 360 #ifdef __GENERIC_PER_CPU
361 unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; 361 unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
362 362
363 EXPORT_SYMBOL(__per_cpu_offset); 363 EXPORT_SYMBOL(__per_cpu_offset);
364 364
365 static void __init setup_per_cpu_areas(void) 365 static void __init setup_per_cpu_areas(void)
366 { 366 {
367 unsigned long size, i; 367 unsigned long size, i;
368 char *ptr; 368 char *ptr;
369 unsigned long nr_possible_cpus = num_possible_cpus(); 369 unsigned long nr_possible_cpus = num_possible_cpus();
370 370
371 /* Copy section for each CPU (we discard the original) */ 371 /* Copy section for each CPU (we discard the original) */
372 size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); 372 size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
373 ptr = alloc_bootmem_pages(size * nr_possible_cpus); 373 ptr = alloc_bootmem_pages(size * nr_possible_cpus);
374 374
375 for_each_possible_cpu(i) { 375 for_each_possible_cpu(i) {
376 __per_cpu_offset[i] = ptr - __per_cpu_start; 376 __per_cpu_offset[i] = ptr - __per_cpu_start;
377 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); 377 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
378 ptr += size; 378 ptr += size;
379 } 379 }
380 } 380 }
381 #endif /* !__GENERIC_PER_CPU */ 381 #endif /* !__GENERIC_PER_CPU */
382 382
383 /* Called by boot processor to activate the rest. */ 383 /* Called by boot processor to activate the rest. */
384 static void __init smp_init(void) 384 static void __init smp_init(void)
385 { 385 {
386 unsigned int cpu; 386 unsigned int cpu;
387 unsigned highest = 0;
388
389 for_each_cpu_mask(cpu, cpu_possible_map)
390 highest = cpu;
391 nr_cpu_ids = highest + 1;
392 387
393 /* FIXME: This should be done in userspace --RR */ 388 /* FIXME: This should be done in userspace --RR */
394 for_each_present_cpu(cpu) { 389 for_each_present_cpu(cpu) {
395 if (num_online_cpus() >= max_cpus) 390 if (num_online_cpus() >= max_cpus)
396 break; 391 break;
397 if (!cpu_online(cpu)) 392 if (!cpu_online(cpu))
398 cpu_up(cpu); 393 cpu_up(cpu);
399 } 394 }
400 395
401 /* Any cleanup work */ 396 /* Any cleanup work */
402 printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); 397 printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus());
403 smp_cpus_done(max_cpus); 398 smp_cpus_done(max_cpus);
404 } 399 }
405 400
406 #endif 401 #endif
407 402
408 /* 403 /*
409 * We need to store the untouched command line for future reference. 404 * We need to store the untouched command line for future reference.
410 * We also need to store the touched command line since the parameter 405 * We also need to store the touched command line since the parameter
411 * parsing is performed in place, and we should allow a component to 406 * parsing is performed in place, and we should allow a component to
412 * store reference of name/value for future reference. 407 * store reference of name/value for future reference.
413 */ 408 */
414 static void __init setup_command_line(char *command_line) 409 static void __init setup_command_line(char *command_line)
415 { 410 {
416 saved_command_line = alloc_bootmem(strlen (boot_command_line)+1); 411 saved_command_line = alloc_bootmem(strlen (boot_command_line)+1);
417 static_command_line = alloc_bootmem(strlen (command_line)+1); 412 static_command_line = alloc_bootmem(strlen (command_line)+1);
418 strcpy (saved_command_line, boot_command_line); 413 strcpy (saved_command_line, boot_command_line);
419 strcpy (static_command_line, command_line); 414 strcpy (static_command_line, command_line);
420 } 415 }
421 416
422 /* 417 /*
423 * We need to finalize in a non-__init function or else race conditions 418 * We need to finalize in a non-__init function or else race conditions
424 * between the root thread and the init thread may cause start_kernel to 419 * between the root thread and the init thread may cause start_kernel to
425 * be reaped by free_initmem before the root thread has proceeded to 420 * be reaped by free_initmem before the root thread has proceeded to
426 * cpu_idle. 421 * cpu_idle.
427 * 422 *
428 * gcc-3.4 accidentally inlines this function, so use noinline. 423 * gcc-3.4 accidentally inlines this function, so use noinline.
429 */ 424 */
430 425
431 static void noinline rest_init(void) 426 static void noinline rest_init(void)
432 __releases(kernel_lock) 427 __releases(kernel_lock)
433 { 428 {
434 kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND); 429 kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND);
435 numa_default_policy(); 430 numa_default_policy();
436 unlock_kernel(); 431 unlock_kernel();
437 432
438 /* 433 /*
439 * The boot idle thread must execute schedule() 434 * The boot idle thread must execute schedule()
440 * at least one to get things moving: 435 * at least one to get things moving:
441 */ 436 */
442 preempt_enable_no_resched(); 437 preempt_enable_no_resched();
443 schedule(); 438 schedule();
444 preempt_disable(); 439 preempt_disable();
445 440
446 /* Call into cpu_idle with preempt disabled */ 441 /* Call into cpu_idle with preempt disabled */
447 cpu_idle(); 442 cpu_idle();
448 } 443 }
449 444
450 /* Check for early params. */ 445 /* Check for early params. */
451 static int __init do_early_param(char *param, char *val) 446 static int __init do_early_param(char *param, char *val)
452 { 447 {
453 struct obs_kernel_param *p; 448 struct obs_kernel_param *p;
454 449
455 for (p = __setup_start; p < __setup_end; p++) { 450 for (p = __setup_start; p < __setup_end; p++) {
456 if (p->early && strcmp(param, p->str) == 0) { 451 if (p->early && strcmp(param, p->str) == 0) {
457 if (p->setup_func(val) != 0) 452 if (p->setup_func(val) != 0)
458 printk(KERN_WARNING 453 printk(KERN_WARNING
459 "Malformed early option '%s'\n", param); 454 "Malformed early option '%s'\n", param);
460 } 455 }
461 } 456 }
462 /* We accept everything at this stage. */ 457 /* We accept everything at this stage. */
463 return 0; 458 return 0;
464 } 459 }
465 460
466 /* Arch code calls this early on, or if not, just before other parsing. */ 461 /* Arch code calls this early on, or if not, just before other parsing. */
467 void __init parse_early_param(void) 462 void __init parse_early_param(void)
468 { 463 {
469 static __initdata int done = 0; 464 static __initdata int done = 0;
470 static __initdata char tmp_cmdline[COMMAND_LINE_SIZE]; 465 static __initdata char tmp_cmdline[COMMAND_LINE_SIZE];
471 466
472 if (done) 467 if (done)
473 return; 468 return;
474 469
475 /* All fall through to do_early_param. */ 470 /* All fall through to do_early_param. */
476 strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); 471 strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE);
477 parse_args("early options", tmp_cmdline, NULL, 0, do_early_param); 472 parse_args("early options", tmp_cmdline, NULL, 0, do_early_param);
478 done = 1; 473 done = 1;
479 } 474 }
480 475
481 /* 476 /*
482 * Activate the first processor. 477 * Activate the first processor.
483 */ 478 */
484 479
485 static void __init boot_cpu_init(void) 480 static void __init boot_cpu_init(void)
486 { 481 {
487 int cpu = smp_processor_id(); 482 int cpu = smp_processor_id();
488 /* Mark the boot cpu "present", "online" etc for SMP and UP case */ 483 /* Mark the boot cpu "present", "online" etc for SMP and UP case */
489 cpu_set(cpu, cpu_online_map); 484 cpu_set(cpu, cpu_online_map);
490 cpu_set(cpu, cpu_present_map); 485 cpu_set(cpu, cpu_present_map);
491 cpu_set(cpu, cpu_possible_map); 486 cpu_set(cpu, cpu_possible_map);
492 } 487 }
493 488
494 void __init __attribute__((weak)) smp_setup_processor_id(void) 489 void __init __attribute__((weak)) smp_setup_processor_id(void)
495 { 490 {
496 } 491 }
497 492
498 asmlinkage void __init start_kernel(void) 493 asmlinkage void __init start_kernel(void)
499 { 494 {
500 char * command_line; 495 char * command_line;
501 extern struct kernel_param __start___param[], __stop___param[]; 496 extern struct kernel_param __start___param[], __stop___param[];
502 497
503 smp_setup_processor_id(); 498 smp_setup_processor_id();
504 499
505 /* 500 /*
506 * Need to run as early as possible, to initialize the 501 * Need to run as early as possible, to initialize the
507 * lockdep hash: 502 * lockdep hash:
508 */ 503 */
509 unwind_init(); 504 unwind_init();
510 lockdep_init(); 505 lockdep_init();
511 506
512 local_irq_disable(); 507 local_irq_disable();
513 early_boot_irqs_off(); 508 early_boot_irqs_off();
514 early_init_irq_lock_class(); 509 early_init_irq_lock_class();
515 510
516 /* 511 /*
517 * Interrupts are still disabled. Do necessary setups, then 512 * Interrupts are still disabled. Do necessary setups, then
518 * enable them 513 * enable them
519 */ 514 */
520 lock_kernel(); 515 lock_kernel();
521 tick_init(); 516 tick_init();
522 boot_cpu_init(); 517 boot_cpu_init();
523 page_address_init(); 518 page_address_init();
524 printk(KERN_NOTICE); 519 printk(KERN_NOTICE);
525 printk(linux_banner); 520 printk(linux_banner);
526 setup_arch(&command_line); 521 setup_arch(&command_line);
527 setup_command_line(command_line); 522 setup_command_line(command_line);
528 unwind_setup(); 523 unwind_setup();
529 setup_per_cpu_areas(); 524 setup_per_cpu_areas();
530 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ 525 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
531 526
532 /* 527 /*
533 * Set up the scheduler prior starting any interrupts (such as the 528 * Set up the scheduler prior starting any interrupts (such as the
534 * timer interrupt). Full topology setup happens at smp_init() 529 * timer interrupt). Full topology setup happens at smp_init()
535 * time - but meanwhile we still have a functioning scheduler. 530 * time - but meanwhile we still have a functioning scheduler.
536 */ 531 */
537 sched_init(); 532 sched_init();
538 /* 533 /*
539 * Disable preemption - early bootup scheduling is extremely 534 * Disable preemption - early bootup scheduling is extremely
540 * fragile until we cpu_idle() for the first time. 535 * fragile until we cpu_idle() for the first time.
541 */ 536 */
542 preempt_disable(); 537 preempt_disable();
543 build_all_zonelists(); 538 build_all_zonelists();
544 page_alloc_init(); 539 page_alloc_init();
545 printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line); 540 printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
546 parse_early_param(); 541 parse_early_param();
547 parse_args("Booting kernel", static_command_line, __start___param, 542 parse_args("Booting kernel", static_command_line, __start___param,
548 __stop___param - __start___param, 543 __stop___param - __start___param,
549 &unknown_bootoption); 544 &unknown_bootoption);
550 if (!irqs_disabled()) { 545 if (!irqs_disabled()) {
551 printk(KERN_WARNING "start_kernel(): bug: interrupts were " 546 printk(KERN_WARNING "start_kernel(): bug: interrupts were "
552 "enabled *very* early, fixing it\n"); 547 "enabled *very* early, fixing it\n");
553 local_irq_disable(); 548 local_irq_disable();
554 } 549 }
555 sort_main_extable(); 550 sort_main_extable();
556 trap_init(); 551 trap_init();
557 rcu_init(); 552 rcu_init();
558 init_IRQ(); 553 init_IRQ();
559 pidhash_init(); 554 pidhash_init();
560 init_timers(); 555 init_timers();
561 hrtimers_init(); 556 hrtimers_init();
562 softirq_init(); 557 softirq_init();
563 timekeeping_init(); 558 timekeeping_init();
564 time_init(); 559 time_init();
565 profile_init(); 560 profile_init();
566 if (!irqs_disabled()) 561 if (!irqs_disabled())
567 printk("start_kernel(): bug: interrupts were enabled early\n"); 562 printk("start_kernel(): bug: interrupts were enabled early\n");
568 early_boot_irqs_on(); 563 early_boot_irqs_on();
569 local_irq_enable(); 564 local_irq_enable();
570 565
571 /* 566 /*
572 * HACK ALERT! This is early. We're enabling the console before 567 * HACK ALERT! This is early. We're enabling the console before
573 * we've done PCI setups etc, and console_init() must be aware of 568 * we've done PCI setups etc, and console_init() must be aware of
574 * this. But we do want output early, in case something goes wrong. 569 * this. But we do want output early, in case something goes wrong.
575 */ 570 */
576 console_init(); 571 console_init();
577 if (panic_later) 572 if (panic_later)
578 panic(panic_later, panic_param); 573 panic(panic_later, panic_param);
579 574
580 lockdep_info(); 575 lockdep_info();
581 576
582 /* 577 /*
583 * Need to run this when irqs are enabled, because it wants 578 * Need to run this when irqs are enabled, because it wants
584 * to self-test [hard/soft]-irqs on/off lock inversion bugs 579 * to self-test [hard/soft]-irqs on/off lock inversion bugs
585 * too: 580 * too:
586 */ 581 */
587 locking_selftest(); 582 locking_selftest();
588 583
589 #ifdef CONFIG_BLK_DEV_INITRD 584 #ifdef CONFIG_BLK_DEV_INITRD
590 if (initrd_start && !initrd_below_start_ok && 585 if (initrd_start && !initrd_below_start_ok &&
591 initrd_start < min_low_pfn << PAGE_SHIFT) { 586 initrd_start < min_low_pfn << PAGE_SHIFT) {
592 printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - " 587 printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - "
593 "disabling it.\n",initrd_start,min_low_pfn << PAGE_SHIFT); 588 "disabling it.\n",initrd_start,min_low_pfn << PAGE_SHIFT);
594 initrd_start = 0; 589 initrd_start = 0;
595 } 590 }
596 #endif 591 #endif
597 vfs_caches_init_early(); 592 vfs_caches_init_early();
598 cpuset_init_early(); 593 cpuset_init_early();
599 mem_init(); 594 mem_init();
600 kmem_cache_init(); 595 kmem_cache_init();
601 setup_per_cpu_pageset(); 596 setup_per_cpu_pageset();
602 numa_policy_init(); 597 numa_policy_init();
603 if (late_time_init) 598 if (late_time_init)
604 late_time_init(); 599 late_time_init();
605 calibrate_delay(); 600 calibrate_delay();
606 pidmap_init(); 601 pidmap_init();
607 pgtable_cache_init(); 602 pgtable_cache_init();
608 prio_tree_init(); 603 prio_tree_init();
609 anon_vma_init(); 604 anon_vma_init();
610 #ifdef CONFIG_X86 605 #ifdef CONFIG_X86
611 if (efi_enabled) 606 if (efi_enabled)
612 efi_enter_virtual_mode(); 607 efi_enter_virtual_mode();
613 #endif 608 #endif
614 fork_init(num_physpages); 609 fork_init(num_physpages);
615 proc_caches_init(); 610 proc_caches_init();
616 buffer_init(); 611 buffer_init();
617 unnamed_dev_init(); 612 unnamed_dev_init();
618 key_init(); 613 key_init();
619 security_init(); 614 security_init();
620 vfs_caches_init(num_physpages); 615 vfs_caches_init(num_physpages);
621 radix_tree_init(); 616 radix_tree_init();
622 signals_init(); 617 signals_init();
623 /* rootfs populating might need page-writeback */ 618 /* rootfs populating might need page-writeback */
624 page_writeback_init(); 619 page_writeback_init();
625 #ifdef CONFIG_PROC_FS 620 #ifdef CONFIG_PROC_FS
626 proc_root_init(); 621 proc_root_init();
627 #endif 622 #endif
628 cpuset_init(); 623 cpuset_init();
629 taskstats_init_early(); 624 taskstats_init_early();
630 delayacct_init(); 625 delayacct_init();
631 626
632 check_bugs(); 627 check_bugs();
633 628
634 acpi_early_init(); /* before LAPIC and SMP init */ 629 acpi_early_init(); /* before LAPIC and SMP init */
635 630
636 /* Do the rest non-__init'ed, we're now alive */ 631 /* Do the rest non-__init'ed, we're now alive */
637 rest_init(); 632 rest_init();
638 } 633 }
639 634
640 static int __initdata initcall_debug; 635 static int __initdata initcall_debug;
641 636
642 static int __init initcall_debug_setup(char *str) 637 static int __init initcall_debug_setup(char *str)
643 { 638 {
644 initcall_debug = 1; 639 initcall_debug = 1;
645 return 1; 640 return 1;
646 } 641 }
647 __setup("initcall_debug", initcall_debug_setup); 642 __setup("initcall_debug", initcall_debug_setup);
648 643
649 extern initcall_t __initcall_start[], __initcall_end[]; 644 extern initcall_t __initcall_start[], __initcall_end[];
650 645
651 static void __init do_initcalls(void) 646 static void __init do_initcalls(void)
652 { 647 {
653 initcall_t *call; 648 initcall_t *call;
654 int count = preempt_count(); 649 int count = preempt_count();
655 650
656 for (call = __initcall_start; call < __initcall_end; call++) { 651 for (call = __initcall_start; call < __initcall_end; call++) {
657 char *msg = NULL; 652 char *msg = NULL;
658 char msgbuf[40]; 653 char msgbuf[40];
659 int result; 654 int result;
660 655
661 if (initcall_debug) { 656 if (initcall_debug) {
662 printk("Calling initcall 0x%p", *call); 657 printk("Calling initcall 0x%p", *call);
663 print_fn_descriptor_symbol(": %s()", 658 print_fn_descriptor_symbol(": %s()",
664 (unsigned long) *call); 659 (unsigned long) *call);
665 printk("\n"); 660 printk("\n");
666 } 661 }
667 662
668 result = (*call)(); 663 result = (*call)();
669 664
670 if (result && result != -ENODEV && initcall_debug) { 665 if (result && result != -ENODEV && initcall_debug) {
671 sprintf(msgbuf, "error code %d", result); 666 sprintf(msgbuf, "error code %d", result);
672 msg = msgbuf; 667 msg = msgbuf;
673 } 668 }
674 if (preempt_count() != count) { 669 if (preempt_count() != count) {
675 msg = "preemption imbalance"; 670 msg = "preemption imbalance";
676 preempt_count() = count; 671 preempt_count() = count;
677 } 672 }
678 if (irqs_disabled()) { 673 if (irqs_disabled()) {
679 msg = "disabled interrupts"; 674 msg = "disabled interrupts";
680 local_irq_enable(); 675 local_irq_enable();
681 } 676 }
682 if (msg) { 677 if (msg) {
683 printk(KERN_WARNING "initcall at 0x%p", *call); 678 printk(KERN_WARNING "initcall at 0x%p", *call);
684 print_fn_descriptor_symbol(": %s()", 679 print_fn_descriptor_symbol(": %s()",
685 (unsigned long) *call); 680 (unsigned long) *call);
686 printk(": returned with %s\n", msg); 681 printk(": returned with %s\n", msg);
687 } 682 }
688 } 683 }
689 684
690 /* Make sure there is no pending stuff from the initcall sequence */ 685 /* Make sure there is no pending stuff from the initcall sequence */
691 flush_scheduled_work(); 686 flush_scheduled_work();
692 } 687 }
693 688
694 /* 689 /*
695 * Ok, the machine is now initialized. None of the devices 690 * Ok, the machine is now initialized. None of the devices
696 * have been touched yet, but the CPU subsystem is up and 691 * have been touched yet, but the CPU subsystem is up and
697 * running, and memory and process management works. 692 * running, and memory and process management works.
698 * 693 *
699 * Now we can finally start doing some real work.. 694 * Now we can finally start doing some real work..
700 */ 695 */
701 static void __init do_basic_setup(void) 696 static void __init do_basic_setup(void)
702 { 697 {
703 /* drivers will send hotplug events */ 698 /* drivers will send hotplug events */
704 init_workqueues(); 699 init_workqueues();
705 usermodehelper_init(); 700 usermodehelper_init();
706 driver_init(); 701 driver_init();
707 init_irq_proc(); 702 init_irq_proc();
708 do_initcalls(); 703 do_initcalls();
709 } 704 }
710 705
711 static void __init do_pre_smp_initcalls(void) 706 static void __init do_pre_smp_initcalls(void)
712 { 707 {
713 extern int spawn_ksoftirqd(void); 708 extern int spawn_ksoftirqd(void);
714 #ifdef CONFIG_SMP 709 #ifdef CONFIG_SMP
715 extern int migration_init(void); 710 extern int migration_init(void);
716 711
717 migration_init(); 712 migration_init();
718 #endif 713 #endif
719 spawn_ksoftirqd(); 714 spawn_ksoftirqd();
720 spawn_softlockup_task(); 715 spawn_softlockup_task();
721 } 716 }
722 717
723 static void run_init_process(char *init_filename) 718 static void run_init_process(char *init_filename)
724 { 719 {
725 argv_init[0] = init_filename; 720 argv_init[0] = init_filename;
726 kernel_execve(init_filename, argv_init, envp_init); 721 kernel_execve(init_filename, argv_init, envp_init);
727 } 722 }
728 723
729 /* This is a non __init function. Force it to be noinline otherwise gcc 724 /* This is a non __init function. Force it to be noinline otherwise gcc
730 * makes it inline to init() and it becomes part of init.text section 725 * makes it inline to init() and it becomes part of init.text section
731 */ 726 */
732 static int noinline init_post(void) 727 static int noinline init_post(void)
733 { 728 {
734 free_initmem(); 729 free_initmem();
735 unlock_kernel(); 730 unlock_kernel();
736 mark_rodata_ro(); 731 mark_rodata_ro();
737 system_state = SYSTEM_RUNNING; 732 system_state = SYSTEM_RUNNING;
738 numa_default_policy(); 733 numa_default_policy();
739 734
740 if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) 735 if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0)
741 printk(KERN_WARNING "Warning: unable to open an initial console.\n"); 736 printk(KERN_WARNING "Warning: unable to open an initial console.\n");
742 737
743 (void) sys_dup(0); 738 (void) sys_dup(0);
744 (void) sys_dup(0); 739 (void) sys_dup(0);
745 740
746 if (ramdisk_execute_command) { 741 if (ramdisk_execute_command) {
747 run_init_process(ramdisk_execute_command); 742 run_init_process(ramdisk_execute_command);
748 printk(KERN_WARNING "Failed to execute %s\n", 743 printk(KERN_WARNING "Failed to execute %s\n",
749 ramdisk_execute_command); 744 ramdisk_execute_command);
750 } 745 }
751 746
752 /* 747 /*
753 * We try each of these until one succeeds. 748 * We try each of these until one succeeds.
754 * 749 *
755 * The Bourne shell can be used instead of init if we are 750 * The Bourne shell can be used instead of init if we are
756 * trying to recover a really broken machine. 751 * trying to recover a really broken machine.
757 */ 752 */
758 if (execute_command) { 753 if (execute_command) {
759 run_init_process(execute_command); 754 run_init_process(execute_command);
760 printk(KERN_WARNING "Failed to execute %s. Attempting " 755 printk(KERN_WARNING "Failed to execute %s. Attempting "
761 "defaults...\n", execute_command); 756 "defaults...\n", execute_command);
762 } 757 }
763 run_init_process("/sbin/init"); 758 run_init_process("/sbin/init");
764 run_init_process("/etc/init"); 759 run_init_process("/etc/init");
765 run_init_process("/bin/init"); 760 run_init_process("/bin/init");
766 run_init_process("/bin/sh"); 761 run_init_process("/bin/sh");
767 762
768 panic("No init found. Try passing init= option to kernel."); 763 panic("No init found. Try passing init= option to kernel.");
769 } 764 }
770 765
771 static int __init kernel_init(void * unused) 766 static int __init kernel_init(void * unused)
772 { 767 {
773 lock_kernel(); 768 lock_kernel();
774 /* 769 /*
775 * init can run on any cpu. 770 * init can run on any cpu.
776 */ 771 */
777 set_cpus_allowed(current, CPU_MASK_ALL); 772 set_cpus_allowed(current, CPU_MASK_ALL);
778 /* 773 /*
779 * Tell the world that we're going to be the grim 774 * Tell the world that we're going to be the grim
780 * reaper of innocent orphaned children. 775 * reaper of innocent orphaned children.
781 * 776 *
782 * We don't want people to have to make incorrect 777 * We don't want people to have to make incorrect
783 * assumptions about where in the task array this 778 * assumptions about where in the task array this
784 * can be found. 779 * can be found.
785 */ 780 */
786 init_pid_ns.child_reaper = current; 781 init_pid_ns.child_reaper = current;
787 782
788 cad_pid = task_pid(current); 783 cad_pid = task_pid(current);
789 784
790 smp_prepare_cpus(max_cpus); 785 smp_prepare_cpus(max_cpus);
791 786
792 do_pre_smp_initcalls(); 787 do_pre_smp_initcalls();
793 788
794 smp_init(); 789 smp_init();
795 sched_init_smp(); 790 sched_init_smp();
796 791
797 cpuset_init_smp(); 792 cpuset_init_smp();
798 793
799 do_basic_setup(); 794 do_basic_setup();
800 795
801 /* 796 /*
802 * check if there is an early userspace init. If yes, let it do all 797 * check if there is an early userspace init. If yes, let it do all
803 * the work 798 * the work
804 */ 799 */
805 800
806 if (!ramdisk_execute_command) 801 if (!ramdisk_execute_command)
807 ramdisk_execute_command = "/init"; 802 ramdisk_execute_command = "/init";
808 803
809 if (sys_access((const char __user *) ramdisk_execute_command, 0) != 0) { 804 if (sys_access((const char __user *) ramdisk_execute_command, 0) != 0) {
810 ramdisk_execute_command = NULL; 805 ramdisk_execute_command = NULL;
811 prepare_namespace(); 806 prepare_namespace();
812 } 807 }
813 808
814 /* 809 /*
815 * Ok, we have completed the initial bootup, and 810 * Ok, we have completed the initial bootup, and
816 * we're essentially up and running. Get rid of the 811 * we're essentially up and running. Get rid of the
817 * initmem segments and start the user-mode stuff.. 812 * initmem segments and start the user-mode stuff..
818 */ 813 */
819 init_post(); 814 init_post();
820 return 0; 815 return 0;
821 } 816 }
822 817
1 /* 1 /*
2 * kernel/sched.c 2 * kernel/sched.c
3 * 3 *
4 * Kernel scheduler and related syscalls 4 * Kernel scheduler and related syscalls
5 * 5 *
6 * Copyright (C) 1991-2002 Linus Torvalds 6 * Copyright (C) 1991-2002 Linus Torvalds
7 * 7 *
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and 8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe 9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff 10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli 11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: 12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with 13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices 14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions 15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas. 17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin 18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 */ 19 */
20 20
21 #include <linux/mm.h> 21 #include <linux/mm.h>
22 #include <linux/module.h> 22 #include <linux/module.h>
23 #include <linux/nmi.h> 23 #include <linux/nmi.h>
24 #include <linux/init.h> 24 #include <linux/init.h>
25 #include <asm/uaccess.h> 25 #include <asm/uaccess.h>
26 #include <linux/highmem.h> 26 #include <linux/highmem.h>
27 #include <linux/smp_lock.h> 27 #include <linux/smp_lock.h>
28 #include <asm/mmu_context.h> 28 #include <asm/mmu_context.h>
29 #include <linux/interrupt.h> 29 #include <linux/interrupt.h>
30 #include <linux/capability.h> 30 #include <linux/capability.h>
31 #include <linux/completion.h> 31 #include <linux/completion.h>
32 #include <linux/kernel_stat.h> 32 #include <linux/kernel_stat.h>
33 #include <linux/debug_locks.h> 33 #include <linux/debug_locks.h>
34 #include <linux/security.h> 34 #include <linux/security.h>
35 #include <linux/notifier.h> 35 #include <linux/notifier.h>
36 #include <linux/profile.h> 36 #include <linux/profile.h>
37 #include <linux/freezer.h> 37 #include <linux/freezer.h>
38 #include <linux/vmalloc.h> 38 #include <linux/vmalloc.h>
39 #include <linux/blkdev.h> 39 #include <linux/blkdev.h>
40 #include <linux/delay.h> 40 #include <linux/delay.h>
41 #include <linux/smp.h> 41 #include <linux/smp.h>
42 #include <linux/threads.h> 42 #include <linux/threads.h>
43 #include <linux/timer.h> 43 #include <linux/timer.h>
44 #include <linux/rcupdate.h> 44 #include <linux/rcupdate.h>
45 #include <linux/cpu.h> 45 #include <linux/cpu.h>
46 #include <linux/cpuset.h> 46 #include <linux/cpuset.h>
47 #include <linux/percpu.h> 47 #include <linux/percpu.h>
48 #include <linux/kthread.h> 48 #include <linux/kthread.h>
49 #include <linux/seq_file.h> 49 #include <linux/seq_file.h>
50 #include <linux/syscalls.h> 50 #include <linux/syscalls.h>
51 #include <linux/times.h> 51 #include <linux/times.h>
52 #include <linux/tsacct_kern.h> 52 #include <linux/tsacct_kern.h>
53 #include <linux/kprobes.h> 53 #include <linux/kprobes.h>
54 #include <linux/delayacct.h> 54 #include <linux/delayacct.h>
55 #include <asm/tlb.h> 55 #include <asm/tlb.h>
56 56
57 #include <asm/unistd.h> 57 #include <asm/unistd.h>
58 58
59 /* 59 /*
60 * Scheduler clock - returns current time in nanosec units. 60 * Scheduler clock - returns current time in nanosec units.
61 * This is default implementation. 61 * This is default implementation.
62 * Architectures and sub-architectures can override this. 62 * Architectures and sub-architectures can override this.
63 */ 63 */
64 unsigned long long __attribute__((weak)) sched_clock(void) 64 unsigned long long __attribute__((weak)) sched_clock(void)
65 { 65 {
66 return (unsigned long long)jiffies * (1000000000 / HZ); 66 return (unsigned long long)jiffies * (1000000000 / HZ);
67 } 67 }
68 68
69 /* 69 /*
70 * Convert user-nice values [ -20 ... 0 ... 19 ] 70 * Convert user-nice values [ -20 ... 0 ... 19 ]
71 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 71 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
72 * and back. 72 * and back.
73 */ 73 */
74 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) 74 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
75 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) 75 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
76 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) 76 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
77 77
78 /* 78 /*
79 * 'User priority' is the nice value converted to something we 79 * 'User priority' is the nice value converted to something we
80 * can work with better when scaling various scheduler parameters, 80 * can work with better when scaling various scheduler parameters,
81 * it's a [ 0 ... 39 ] range. 81 * it's a [ 0 ... 39 ] range.
82 */ 82 */
83 #define USER_PRIO(p) ((p)-MAX_RT_PRIO) 83 #define USER_PRIO(p) ((p)-MAX_RT_PRIO)
84 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) 84 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
85 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) 85 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
86 86
87 /* 87 /*
88 * Some helpers for converting nanosecond timing to jiffy resolution 88 * Some helpers for converting nanosecond timing to jiffy resolution
89 */ 89 */
90 #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) 90 #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
91 #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) 91 #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
92 92
93 /* 93 /*
94 * These are the 'tuning knobs' of the scheduler: 94 * These are the 'tuning knobs' of the scheduler:
95 * 95 *
96 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), 96 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
97 * default timeslice is 100 msecs, maximum timeslice is 800 msecs. 97 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
98 * Timeslices get refilled after they expire. 98 * Timeslices get refilled after they expire.
99 */ 99 */
100 #define MIN_TIMESLICE max(5 * HZ / 1000, 1) 100 #define MIN_TIMESLICE max(5 * HZ / 1000, 1)
101 #define DEF_TIMESLICE (100 * HZ / 1000) 101 #define DEF_TIMESLICE (100 * HZ / 1000)
102 #define ON_RUNQUEUE_WEIGHT 30 102 #define ON_RUNQUEUE_WEIGHT 30
103 #define CHILD_PENALTY 95 103 #define CHILD_PENALTY 95
104 #define PARENT_PENALTY 100 104 #define PARENT_PENALTY 100
105 #define EXIT_WEIGHT 3 105 #define EXIT_WEIGHT 3
106 #define PRIO_BONUS_RATIO 25 106 #define PRIO_BONUS_RATIO 25
107 #define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) 107 #define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
108 #define INTERACTIVE_DELTA 2 108 #define INTERACTIVE_DELTA 2
109 #define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) 109 #define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
110 #define STARVATION_LIMIT (MAX_SLEEP_AVG) 110 #define STARVATION_LIMIT (MAX_SLEEP_AVG)
111 #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) 111 #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
112 112
113 /* 113 /*
114 * If a task is 'interactive' then we reinsert it in the active 114 * If a task is 'interactive' then we reinsert it in the active
115 * array after it has expired its current timeslice. (it will not 115 * array after it has expired its current timeslice. (it will not
116 * continue to run immediately, it will still roundrobin with 116 * continue to run immediately, it will still roundrobin with
117 * other interactive tasks.) 117 * other interactive tasks.)
118 * 118 *
119 * This part scales the interactivity limit depending on niceness. 119 * This part scales the interactivity limit depending on niceness.
120 * 120 *
121 * We scale it linearly, offset by the INTERACTIVE_DELTA delta. 121 * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
122 * Here are a few examples of different nice levels: 122 * Here are a few examples of different nice levels:
123 * 123 *
124 * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] 124 * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
125 * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] 125 * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
126 * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] 126 * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
127 * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] 127 * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
128 * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] 128 * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
129 * 129 *
130 * (the X axis represents the possible -5 ... 0 ... +5 dynamic 130 * (the X axis represents the possible -5 ... 0 ... +5 dynamic
131 * priority range a task can explore, a value of '1' means the 131 * priority range a task can explore, a value of '1' means the
132 * task is rated interactive.) 132 * task is rated interactive.)
133 * 133 *
134 * Ie. nice +19 tasks can never get 'interactive' enough to be 134 * Ie. nice +19 tasks can never get 'interactive' enough to be
135 * reinserted into the active array. And only heavily CPU-hog nice -20 135 * reinserted into the active array. And only heavily CPU-hog nice -20
136 * tasks will be expired. Default nice 0 tasks are somewhere between, 136 * tasks will be expired. Default nice 0 tasks are somewhere between,
137 * it takes some effort for them to get interactive, but it's not 137 * it takes some effort for them to get interactive, but it's not
138 * too hard. 138 * too hard.
139 */ 139 */
140 140
141 #define CURRENT_BONUS(p) \ 141 #define CURRENT_BONUS(p) \
142 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ 142 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
143 MAX_SLEEP_AVG) 143 MAX_SLEEP_AVG)
144 144
145 #define GRANULARITY (10 * HZ / 1000 ? : 1) 145 #define GRANULARITY (10 * HZ / 1000 ? : 1)
146 146
147 #ifdef CONFIG_SMP 147 #ifdef CONFIG_SMP
148 #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ 148 #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
149 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ 149 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
150 num_online_cpus()) 150 num_online_cpus())
151 #else 151 #else
152 #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ 152 #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
153 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) 153 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
154 #endif 154 #endif
155 155
156 #define SCALE(v1,v1_max,v2_max) \ 156 #define SCALE(v1,v1_max,v2_max) \
157 (v1) * (v2_max) / (v1_max) 157 (v1) * (v2_max) / (v1_max)
158 158
159 #define DELTA(p) \ 159 #define DELTA(p) \
160 (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ 160 (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
161 INTERACTIVE_DELTA) 161 INTERACTIVE_DELTA)
162 162
163 #define TASK_INTERACTIVE(p) \ 163 #define TASK_INTERACTIVE(p) \
164 ((p)->prio <= (p)->static_prio - DELTA(p)) 164 ((p)->prio <= (p)->static_prio - DELTA(p))
165 165
166 #define INTERACTIVE_SLEEP(p) \ 166 #define INTERACTIVE_SLEEP(p) \
167 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ 167 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
168 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) 168 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
169 169
170 #define TASK_PREEMPTS_CURR(p, rq) \ 170 #define TASK_PREEMPTS_CURR(p, rq) \
171 ((p)->prio < (rq)->curr->prio) 171 ((p)->prio < (rq)->curr->prio)
172 172
173 #define SCALE_PRIO(x, prio) \ 173 #define SCALE_PRIO(x, prio) \
174 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) 174 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
175 175
176 static unsigned int static_prio_timeslice(int static_prio) 176 static unsigned int static_prio_timeslice(int static_prio)
177 { 177 {
178 if (static_prio < NICE_TO_PRIO(0)) 178 if (static_prio < NICE_TO_PRIO(0))
179 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); 179 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
180 else 180 else
181 return SCALE_PRIO(DEF_TIMESLICE, static_prio); 181 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
182 } 182 }
183 183
184 /* 184 /*
185 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] 185 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
186 * to time slice values: [800ms ... 100ms ... 5ms] 186 * to time slice values: [800ms ... 100ms ... 5ms]
187 * 187 *
188 * The higher a thread's priority, the bigger timeslices 188 * The higher a thread's priority, the bigger timeslices
189 * it gets during one round of execution. But even the lowest 189 * it gets during one round of execution. But even the lowest
190 * priority thread gets MIN_TIMESLICE worth of execution time. 190 * priority thread gets MIN_TIMESLICE worth of execution time.
191 */ 191 */
192 192
193 static inline unsigned int task_timeslice(struct task_struct *p) 193 static inline unsigned int task_timeslice(struct task_struct *p)
194 { 194 {
195 return static_prio_timeslice(p->static_prio); 195 return static_prio_timeslice(p->static_prio);
196 } 196 }
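
A hedged userspace sketch, not part of this patch, of the nice-to-timeslice mapping described in the comment above. MAX_PRIO = 140, MAX_USER_PRIO = 40, MAX_RT_PRIO = 100, DEF_TIMESLICE = 100, MIN_TIMESLICE = 5 and HZ = 1000 (so jiffies read as milliseconds) are assumed values; with them the output matches the [800ms ... 100ms ... 5ms] range quoted above.

#include <stdio.h>

#define MAX_RT_PRIO     100
#define MAX_PRIO        140
#define MAX_USER_PRIO   40
#define DEF_TIMESLICE   100
#define MIN_TIMESLICE   5
#define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
#define MAX(a, b)               ((a) > (b) ? (a) : (b))
#define SCALE_PRIO(x, prio) \
        MAX((x) * (MAX_PRIO - (prio)) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)

static unsigned int timeslice(int static_prio)
{
        /* Same shape as static_prio_timeslice() above. */
        if (static_prio < NICE_TO_PRIO(0))
                return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
        return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}

int main(void)
{
        printf("nice -20 -> %u ms\n", timeslice(NICE_TO_PRIO(-20))); /* 800 */
        printf("nice   0 -> %u ms\n", timeslice(NICE_TO_PRIO(0)));   /* 100 */
        printf("nice +19 -> %u ms\n", timeslice(NICE_TO_PRIO(19)));  /*   5 */
        return 0;
}
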
197 197
198 /* 198 /*
199 * These are the runqueue data structures: 199 * These are the runqueue data structures:
200 */ 200 */
201 201
202 struct prio_array { 202 struct prio_array {
203 unsigned int nr_active; 203 unsigned int nr_active;
204 DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ 204 DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
205 struct list_head queue[MAX_PRIO]; 205 struct list_head queue[MAX_PRIO];
206 }; 206 };
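
The bitmap plus per-priority list layout is what makes picking the next task O(1): a find-first-bit scan lands on the highest-priority non-empty queue, and the delimiter bit guarantees the scan terminates even when nothing is runnable. The toy below, not part of this patch and not the kernel's implementation, illustrates the idea with a plain unsigned int bitmap and __builtin_ctz() standing in for the kernel's bitmap helpers; all names are illustrative.

#include <stdio.h>

#define NPRIO 8                         /* tiny stand-in for MAX_PRIO */

struct toy_prio_array {
        unsigned int nr_active;
        unsigned int bitmap;            /* bit p set => queue[p] non-empty */
        int queue_head[NPRIO];          /* stand-in for the list heads */
};

static int pick_next(const struct toy_prio_array *a)
{
        /* OR in the delimiter bit so the scan always finds something. */
        unsigned int map = a->bitmap | (1u << NPRIO);
        int prio = __builtin_ctz(map);  /* lowest set bit == highest priority */

        return prio < NPRIO ? a->queue_head[prio] : -1;         /* -1 == idle */
}

int main(void)
{
        struct toy_prio_array a = { .nr_active = 1, .bitmap = 1u << 5 };

        a.queue_head[5] = 42;                           /* one task, prio 5 */
        printf("next task: %d\n", pick_next(&a));       /* 42 */
        a.bitmap = 0;                                   /* nothing queued */
        printf("next task: %d\n", pick_next(&a));       /* -1 (idle) */
        return 0;
}
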
207 207
208 /* 208 /*
209 * This is the main, per-CPU runqueue data structure. 209 * This is the main, per-CPU runqueue data structure.
210 * 210 *
211 * Locking rule: in those places that want to lock multiple runqueues 211 * Locking rule: in those places that want to lock multiple runqueues
212 * (such as the load balancing or the thread migration code), lock 212 * (such as the load balancing or the thread migration code), lock
213 * acquire operations must be ordered by ascending runqueue address. 213 * acquire operations must be ordered by ascending runqueue address.
214 */ 214 */
215 struct rq { 215 struct rq {
216 spinlock_t lock; 216 spinlock_t lock;
217 217
218 /* 218 /*
219 * nr_running and cpu_load should be in the same cacheline because 219 * nr_running and cpu_load should be in the same cacheline because
220 * remote CPUs use both these fields when doing load calculation. 220 * remote CPUs use both these fields when doing load calculation.
221 */ 221 */
222 unsigned long nr_running; 222 unsigned long nr_running;
223 unsigned long raw_weighted_load; 223 unsigned long raw_weighted_load;
224 #ifdef CONFIG_SMP 224 #ifdef CONFIG_SMP
225 unsigned long cpu_load[3]; 225 unsigned long cpu_load[3];
226 #endif 226 #endif
227 unsigned long long nr_switches; 227 unsigned long long nr_switches;
228 228
229 /* 229 /*
230 * This is part of a global counter where only the total sum 230 * This is part of a global counter where only the total sum
231 * over all CPUs matters. A task can increase this counter on 231 * over all CPUs matters. A task can increase this counter on
232 * one CPU and if it got migrated afterwards it may decrease 232 * one CPU and if it got migrated afterwards it may decrease
233 * it on another CPU. Always updated under the runqueue lock: 233 * it on another CPU. Always updated under the runqueue lock:
234 */ 234 */
235 unsigned long nr_uninterruptible; 235 unsigned long nr_uninterruptible;
236 236
237 unsigned long expired_timestamp; 237 unsigned long expired_timestamp;
238 /* Cached timestamp set by update_cpu_clock() */ 238 /* Cached timestamp set by update_cpu_clock() */
239 unsigned long long most_recent_timestamp; 239 unsigned long long most_recent_timestamp;
240 struct task_struct *curr, *idle; 240 struct task_struct *curr, *idle;
241 unsigned long next_balance; 241 unsigned long next_balance;
242 struct mm_struct *prev_mm; 242 struct mm_struct *prev_mm;
243 struct prio_array *active, *expired, arrays[2]; 243 struct prio_array *active, *expired, arrays[2];
244 int best_expired_prio; 244 int best_expired_prio;
245 atomic_t nr_iowait; 245 atomic_t nr_iowait;
246 246
247 #ifdef CONFIG_SMP 247 #ifdef CONFIG_SMP
248 struct sched_domain *sd; 248 struct sched_domain *sd;
249 249
250 /* For active balancing */ 250 /* For active balancing */
251 int active_balance; 251 int active_balance;
252 int push_cpu; 252 int push_cpu;
253 int cpu; /* cpu of this runqueue */ 253 int cpu; /* cpu of this runqueue */
254 254
255 struct task_struct *migration_thread; 255 struct task_struct *migration_thread;
256 struct list_head migration_queue; 256 struct list_head migration_queue;
257 #endif 257 #endif
258 258
259 #ifdef CONFIG_SCHEDSTATS 259 #ifdef CONFIG_SCHEDSTATS
260 /* latency stats */ 260 /* latency stats */
261 struct sched_info rq_sched_info; 261 struct sched_info rq_sched_info;
262 262
263 /* sys_sched_yield() stats */ 263 /* sys_sched_yield() stats */
264 unsigned long yld_exp_empty; 264 unsigned long yld_exp_empty;
265 unsigned long yld_act_empty; 265 unsigned long yld_act_empty;
266 unsigned long yld_both_empty; 266 unsigned long yld_both_empty;
267 unsigned long yld_cnt; 267 unsigned long yld_cnt;
268 268
269 /* schedule() stats */ 269 /* schedule() stats */
270 unsigned long sched_switch; 270 unsigned long sched_switch;
271 unsigned long sched_cnt; 271 unsigned long sched_cnt;
272 unsigned long sched_goidle; 272 unsigned long sched_goidle;
273 273
274 /* try_to_wake_up() stats */ 274 /* try_to_wake_up() stats */
275 unsigned long ttwu_cnt; 275 unsigned long ttwu_cnt;
276 unsigned long ttwu_local; 276 unsigned long ttwu_local;
277 #endif 277 #endif
278 struct lock_class_key rq_lock_key; 278 struct lock_class_key rq_lock_key;
279 }; 279 };
280 280
281 static DEFINE_PER_CPU(struct rq, runqueues); 281 static DEFINE_PER_CPU(struct rq, runqueues);
282 282
283 static inline int cpu_of(struct rq *rq) 283 static inline int cpu_of(struct rq *rq)
284 { 284 {
285 #ifdef CONFIG_SMP 285 #ifdef CONFIG_SMP
286 return rq->cpu; 286 return rq->cpu;
287 #else 287 #else
288 return 0; 288 return 0;
289 #endif 289 #endif
290 } 290 }
291 291
292 /* 292 /*
293 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 293 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
294 * See detach_destroy_domains: synchronize_sched for details. 294 * See detach_destroy_domains: synchronize_sched for details.
295 * 295 *
296 * The domain tree of any CPU may only be accessed from within 296 * The domain tree of any CPU may only be accessed from within
297 * preempt-disabled sections. 297 * preempt-disabled sections.
298 */ 298 */
299 #define for_each_domain(cpu, __sd) \ 299 #define for_each_domain(cpu, __sd) \
300 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) 300 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
301 301
302 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 302 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
303 #define this_rq() (&__get_cpu_var(runqueues)) 303 #define this_rq() (&__get_cpu_var(runqueues))
304 #define task_rq(p) cpu_rq(task_cpu(p)) 304 #define task_rq(p) cpu_rq(task_cpu(p))
305 #define cpu_curr(cpu) (cpu_rq(cpu)->curr) 305 #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
306 306
307 #ifndef prepare_arch_switch 307 #ifndef prepare_arch_switch
308 # define prepare_arch_switch(next) do { } while (0) 308 # define prepare_arch_switch(next) do { } while (0)
309 #endif 309 #endif
310 #ifndef finish_arch_switch 310 #ifndef finish_arch_switch
311 # define finish_arch_switch(prev) do { } while (0) 311 # define finish_arch_switch(prev) do { } while (0)
312 #endif 312 #endif
313 313
314 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 314 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
315 static inline int task_running(struct rq *rq, struct task_struct *p) 315 static inline int task_running(struct rq *rq, struct task_struct *p)
316 { 316 {
317 return rq->curr == p; 317 return rq->curr == p;
318 } 318 }
319 319
320 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 320 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
321 { 321 {
322 } 322 }
323 323
324 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 324 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
325 { 325 {
326 #ifdef CONFIG_DEBUG_SPINLOCK 326 #ifdef CONFIG_DEBUG_SPINLOCK
327 /* this is a valid case when another task releases the spinlock */ 327 /* this is a valid case when another task releases the spinlock */
328 rq->lock.owner = current; 328 rq->lock.owner = current;
329 #endif 329 #endif
330 /* 330 /*
331 * If we are tracking spinlock dependencies then we have to 331 * If we are tracking spinlock dependencies then we have to
332 * fix up the runqueue lock - which gets 'carried over' from 332 * fix up the runqueue lock - which gets 'carried over' from
333 * prev into current: 333 * prev into current:
334 */ 334 */
335 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); 335 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
336 336
337 spin_unlock_irq(&rq->lock); 337 spin_unlock_irq(&rq->lock);
338 } 338 }
339 339
340 #else /* __ARCH_WANT_UNLOCKED_CTXSW */ 340 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
341 static inline int task_running(struct rq *rq, struct task_struct *p) 341 static inline int task_running(struct rq *rq, struct task_struct *p)
342 { 342 {
343 #ifdef CONFIG_SMP 343 #ifdef CONFIG_SMP
344 return p->oncpu; 344 return p->oncpu;
345 #else 345 #else
346 return rq->curr == p; 346 return rq->curr == p;
347 #endif 347 #endif
348 } 348 }
349 349
350 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 350 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
351 { 351 {
352 #ifdef CONFIG_SMP 352 #ifdef CONFIG_SMP
353 /* 353 /*
354 * We can optimise this out completely for !SMP, because the 354 * We can optimise this out completely for !SMP, because the
355 * SMP rebalancing from interrupt is the only thing that cares 355 * SMP rebalancing from interrupt is the only thing that cares
356 * here. 356 * here.
357 */ 357 */
358 next->oncpu = 1; 358 next->oncpu = 1;
359 #endif 359 #endif
360 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 360 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
361 spin_unlock_irq(&rq->lock); 361 spin_unlock_irq(&rq->lock);
362 #else 362 #else
363 spin_unlock(&rq->lock); 363 spin_unlock(&rq->lock);
364 #endif 364 #endif
365 } 365 }
366 366
367 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 367 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
368 { 368 {
369 #ifdef CONFIG_SMP 369 #ifdef CONFIG_SMP
370 /* 370 /*
371 * After ->oncpu is cleared, the task can be moved to a different CPU. 371 * After ->oncpu is cleared, the task can be moved to a different CPU.
372 * We must ensure this doesn't happen until the switch is completely 372 * We must ensure this doesn't happen until the switch is completely
373 * finished. 373 * finished.
374 */ 374 */
375 smp_wmb(); 375 smp_wmb();
376 prev->oncpu = 0; 376 prev->oncpu = 0;
377 #endif 377 #endif
378 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 378 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
379 local_irq_enable(); 379 local_irq_enable();
380 #endif 380 #endif
381 } 381 }
382 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 382 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
383 383
384 /* 384 /*
385 * __task_rq_lock - lock the runqueue a given task resides on. 385 * __task_rq_lock - lock the runqueue a given task resides on.
386 * Must be called with interrupts disabled. 386 * Must be called with interrupts disabled.
387 */ 387 */
388 static inline struct rq *__task_rq_lock(struct task_struct *p) 388 static inline struct rq *__task_rq_lock(struct task_struct *p)
389 __acquires(rq->lock) 389 __acquires(rq->lock)
390 { 390 {
391 struct rq *rq; 391 struct rq *rq;
392 392
393 repeat_lock_task: 393 repeat_lock_task:
394 rq = task_rq(p); 394 rq = task_rq(p);
395 spin_lock(&rq->lock); 395 spin_lock(&rq->lock);
396 if (unlikely(rq != task_rq(p))) { 396 if (unlikely(rq != task_rq(p))) {
397 spin_unlock(&rq->lock); 397 spin_unlock(&rq->lock);
398 goto repeat_lock_task; 398 goto repeat_lock_task;
399 } 399 }
400 return rq; 400 return rq;
401 } 401 }
402 402
403 /* 403 /*
404 * task_rq_lock - lock the runqueue a given task resides on and disable 404 * task_rq_lock - lock the runqueue a given task resides on and disable
405 * interrupts. Note the ordering: we can safely look up the task_rq without 405 * interrupts. Note the ordering: we can safely look up the task_rq without
406 * explicitly disabling preemption. 406 * explicitly disabling preemption.
407 */ 407 */
408 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 408 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
409 __acquires(rq->lock) 409 __acquires(rq->lock)
410 { 410 {
411 struct rq *rq; 411 struct rq *rq;
412 412
413 repeat_lock_task: 413 repeat_lock_task:
414 local_irq_save(*flags); 414 local_irq_save(*flags);
415 rq = task_rq(p); 415 rq = task_rq(p);
416 spin_lock(&rq->lock); 416 spin_lock(&rq->lock);
417 if (unlikely(rq != task_rq(p))) { 417 if (unlikely(rq != task_rq(p))) {
418 spin_unlock_irqrestore(&rq->lock, *flags); 418 spin_unlock_irqrestore(&rq->lock, *flags);
419 goto repeat_lock_task; 419 goto repeat_lock_task;
420 } 420 }
421 return rq; 421 return rq;
422 } 422 }
423 423
424 static inline void __task_rq_unlock(struct rq *rq) 424 static inline void __task_rq_unlock(struct rq *rq)
425 __releases(rq->lock) 425 __releases(rq->lock)
426 { 426 {
427 spin_unlock(&rq->lock); 427 spin_unlock(&rq->lock);
428 } 428 }
429 429
430 static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 430 static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
431 __releases(rq->lock) 431 __releases(rq->lock)
432 { 432 {
433 spin_unlock_irqrestore(&rq->lock, *flags); 433 spin_unlock_irqrestore(&rq->lock, *flags);
434 } 434 }
435 435
436 #ifdef CONFIG_SCHEDSTATS 436 #ifdef CONFIG_SCHEDSTATS
437 /* 437 /*
438 * bump this up when changing the output format or the meaning of an existing 438 * bump this up when changing the output format or the meaning of an existing
439 * format, so that tools can adapt (or abort) 439 * format, so that tools can adapt (or abort)
440 */ 440 */
441 #define SCHEDSTAT_VERSION 14 441 #define SCHEDSTAT_VERSION 14
442 442
443 static int show_schedstat(struct seq_file *seq, void *v) 443 static int show_schedstat(struct seq_file *seq, void *v)
444 { 444 {
445 int cpu; 445 int cpu;
446 446
447 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); 447 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
448 seq_printf(seq, "timestamp %lu\n", jiffies); 448 seq_printf(seq, "timestamp %lu\n", jiffies);
449 for_each_online_cpu(cpu) { 449 for_each_online_cpu(cpu) {
450 struct rq *rq = cpu_rq(cpu); 450 struct rq *rq = cpu_rq(cpu);
451 #ifdef CONFIG_SMP 451 #ifdef CONFIG_SMP
452 struct sched_domain *sd; 452 struct sched_domain *sd;
453 int dcnt = 0; 453 int dcnt = 0;
454 #endif 454 #endif
455 455
456 /* runqueue-specific stats */ 456 /* runqueue-specific stats */
457 seq_printf(seq, 457 seq_printf(seq,
458 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", 458 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
459 cpu, rq->yld_both_empty, 459 cpu, rq->yld_both_empty,
460 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, 460 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
461 rq->sched_switch, rq->sched_cnt, rq->sched_goidle, 461 rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
462 rq->ttwu_cnt, rq->ttwu_local, 462 rq->ttwu_cnt, rq->ttwu_local,
463 rq->rq_sched_info.cpu_time, 463 rq->rq_sched_info.cpu_time,
464 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); 464 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
465 465
466 seq_printf(seq, "\n"); 466 seq_printf(seq, "\n");
467 467
468 #ifdef CONFIG_SMP 468 #ifdef CONFIG_SMP
469 /* domain-specific stats */ 469 /* domain-specific stats */
470 preempt_disable(); 470 preempt_disable();
471 for_each_domain(cpu, sd) { 471 for_each_domain(cpu, sd) {
472 enum idle_type itype; 472 enum idle_type itype;
473 char mask_str[NR_CPUS]; 473 char mask_str[NR_CPUS];
474 474
475 cpumask_scnprintf(mask_str, NR_CPUS, sd->span); 475 cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
476 seq_printf(seq, "domain%d %s", dcnt++, mask_str); 476 seq_printf(seq, "domain%d %s", dcnt++, mask_str);
477 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; 477 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
478 itype++) { 478 itype++) {
479 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " 479 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
480 "%lu", 480 "%lu",
481 sd->lb_cnt[itype], 481 sd->lb_cnt[itype],
482 sd->lb_balanced[itype], 482 sd->lb_balanced[itype],
483 sd->lb_failed[itype], 483 sd->lb_failed[itype],
484 sd->lb_imbalance[itype], 484 sd->lb_imbalance[itype],
485 sd->lb_gained[itype], 485 sd->lb_gained[itype],
486 sd->lb_hot_gained[itype], 486 sd->lb_hot_gained[itype],
487 sd->lb_nobusyq[itype], 487 sd->lb_nobusyq[itype],
488 sd->lb_nobusyg[itype]); 488 sd->lb_nobusyg[itype]);
489 } 489 }
490 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" 490 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
491 " %lu %lu %lu\n", 491 " %lu %lu %lu\n",
492 sd->alb_cnt, sd->alb_failed, sd->alb_pushed, 492 sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
493 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, 493 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
494 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, 494 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
495 sd->ttwu_wake_remote, sd->ttwu_move_affine, 495 sd->ttwu_wake_remote, sd->ttwu_move_affine,
496 sd->ttwu_move_balance); 496 sd->ttwu_move_balance);
497 } 497 }
498 preempt_enable(); 498 preempt_enable();
499 #endif 499 #endif
500 } 500 }
501 return 0; 501 return 0;
502 } 502 }
503 503
504 static int schedstat_open(struct inode *inode, struct file *file) 504 static int schedstat_open(struct inode *inode, struct file *file)
505 { 505 {
506 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); 506 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
507 char *buf = kmalloc(size, GFP_KERNEL); 507 char *buf = kmalloc(size, GFP_KERNEL);
508 struct seq_file *m; 508 struct seq_file *m;
509 int res; 509 int res;
510 510
511 if (!buf) 511 if (!buf)
512 return -ENOMEM; 512 return -ENOMEM;
513 res = single_open(file, show_schedstat, NULL); 513 res = single_open(file, show_schedstat, NULL);
514 if (!res) { 514 if (!res) {
515 m = file->private_data; 515 m = file->private_data;
516 m->buf = buf; 516 m->buf = buf;
517 m->size = size; 517 m->size = size;
518 } else 518 } else
519 kfree(buf); 519 kfree(buf);
520 return res; 520 return res;
521 } 521 }
522 522
523 const struct file_operations proc_schedstat_operations = { 523 const struct file_operations proc_schedstat_operations = {
524 .open = schedstat_open, 524 .open = schedstat_open,
525 .read = seq_read, 525 .read = seq_read,
526 .llseek = seq_lseek, 526 .llseek = seq_lseek,
527 .release = single_release, 527 .release = single_release,
528 }; 528 };
529 529
530 /* 530 /*
531 * Expects runqueue lock to be held for atomicity of update 531 * Expects runqueue lock to be held for atomicity of update
532 */ 532 */
533 static inline void 533 static inline void
534 rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) 534 rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
535 { 535 {
536 if (rq) { 536 if (rq) {
537 rq->rq_sched_info.run_delay += delta_jiffies; 537 rq->rq_sched_info.run_delay += delta_jiffies;
538 rq->rq_sched_info.pcnt++; 538 rq->rq_sched_info.pcnt++;
539 } 539 }
540 } 540 }
541 541
542 /* 542 /*
543 * Expects runqueue lock to be held for atomicity of update 543 * Expects runqueue lock to be held for atomicity of update
544 */ 544 */
545 static inline void 545 static inline void
546 rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) 546 rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
547 { 547 {
548 if (rq) 548 if (rq)
549 rq->rq_sched_info.cpu_time += delta_jiffies; 549 rq->rq_sched_info.cpu_time += delta_jiffies;
550 } 550 }
551 # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) 551 # define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
552 # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) 552 # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
553 #else /* !CONFIG_SCHEDSTATS */ 553 #else /* !CONFIG_SCHEDSTATS */
554 static inline void 554 static inline void
555 rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) 555 rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
556 {} 556 {}
557 static inline void 557 static inline void
558 rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) 558 rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
559 {} 559 {}
560 # define schedstat_inc(rq, field) do { } while (0) 560 # define schedstat_inc(rq, field) do { } while (0)
561 # define schedstat_add(rq, field, amt) do { } while (0) 561 # define schedstat_add(rq, field, amt) do { } while (0)
562 #endif 562 #endif
563 563
564 /* 564 /*
565 * this_rq_lock - lock this runqueue and disable interrupts. 565 * this_rq_lock - lock this runqueue and disable interrupts.
566 */ 566 */
567 static inline struct rq *this_rq_lock(void) 567 static inline struct rq *this_rq_lock(void)
568 __acquires(rq->lock) 568 __acquires(rq->lock)
569 { 569 {
570 struct rq *rq; 570 struct rq *rq;
571 571
572 local_irq_disable(); 572 local_irq_disable();
573 rq = this_rq(); 573 rq = this_rq();
574 spin_lock(&rq->lock); 574 spin_lock(&rq->lock);
575 575
576 return rq; 576 return rq;
577 } 577 }
578 578
579 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 579 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
580 /* 580 /*
581 * Called when a process is dequeued from the active array and given 581 * Called when a process is dequeued from the active array and given
582 * the cpu. We should note that with the exception of interactive 582 * the cpu. We should note that with the exception of interactive
583 * tasks, the expired queue will become the active queue after the active 583 * tasks, the expired queue will become the active queue after the active
584 * queue is empty, without explicitly dequeuing and requeuing tasks in the 584 * queue is empty, without explicitly dequeuing and requeuing tasks in the
585 * expired queue. (Interactive tasks may be requeued directly to the 585 * expired queue. (Interactive tasks may be requeued directly to the
586 * active queue, thus delaying tasks in the expired queue from running; 586 * active queue, thus delaying tasks in the expired queue from running;
587 * see scheduler_tick()). 587 * see scheduler_tick()).
588 * 588 *
589 * This function is only called from sched_info_arrive(), rather than 589 * This function is only called from sched_info_arrive(), rather than
590 * dequeue_task(). Even though a task may be queued and dequeued multiple 590 * dequeue_task(). Even though a task may be queued and dequeued multiple
591 * times as it is shuffled about, we're really interested in knowing how 591 * times as it is shuffled about, we're really interested in knowing how
592 * long it was from the *first* time it was queued to the time that it 592 * long it was from the *first* time it was queued to the time that it
593 * finally hit a cpu. 593 * finally hit a cpu.
594 */ 594 */
595 static inline void sched_info_dequeued(struct task_struct *t) 595 static inline void sched_info_dequeued(struct task_struct *t)
596 { 596 {
597 t->sched_info.last_queued = 0; 597 t->sched_info.last_queued = 0;
598 } 598 }
599 599
600 /* 600 /*
601 * Called when a task finally hits the cpu. We can now calculate how 601 * Called when a task finally hits the cpu. We can now calculate how
602 * long it was waiting to run. We also note when it began so that we 602 * long it was waiting to run. We also note when it began so that we
603 * can keep stats on how long its timeslice is. 603 * can keep stats on how long its timeslice is.
604 */ 604 */
605 static void sched_info_arrive(struct task_struct *t) 605 static void sched_info_arrive(struct task_struct *t)
606 { 606 {
607 unsigned long now = jiffies, delta_jiffies = 0; 607 unsigned long now = jiffies, delta_jiffies = 0;
608 608
609 if (t->sched_info.last_queued) 609 if (t->sched_info.last_queued)
610 delta_jiffies = now - t->sched_info.last_queued; 610 delta_jiffies = now - t->sched_info.last_queued;
611 sched_info_dequeued(t); 611 sched_info_dequeued(t);
612 t->sched_info.run_delay += delta_jiffies; 612 t->sched_info.run_delay += delta_jiffies;
613 t->sched_info.last_arrival = now; 613 t->sched_info.last_arrival = now;
614 t->sched_info.pcnt++; 614 t->sched_info.pcnt++;
615 615
616 rq_sched_info_arrive(task_rq(t), delta_jiffies); 616 rq_sched_info_arrive(task_rq(t), delta_jiffies);
617 } 617 }
618 618
619 /* 619 /*
620 * Called when a process is queued into either the active or expired 620 * Called when a process is queued into either the active or expired
621 * array. The time is noted and later used to determine how long the task 621 * array. The time is noted and later used to determine how long the task
622 * had to wait to reach the cpu. Since the expired queue will 622 * had to wait to reach the cpu. Since the expired queue will
623 * become the active queue after active queue is empty, without dequeuing 623 * become the active queue after active queue is empty, without dequeuing
624 * and requeuing any tasks, we are interested in queuing to either. It 624 * and requeuing any tasks, we are interested in queuing to either. It
625 * is unusual but not impossible for tasks to be dequeued and immediately 625 * is unusual but not impossible for tasks to be dequeued and immediately
626 * requeued in the same or another array: this can happen in sched_yield(), 626 * requeued in the same or another array: this can happen in sched_yield(),
627 * set_user_nice(), and even load_balance() as it moves tasks from runqueue 627 * set_user_nice(), and even load_balance() as it moves tasks from runqueue
628 * to runqueue. 628 * to runqueue.
629 * 629 *
630 * This function is only called from enqueue_task(), but also only updates 630 * This function is only called from enqueue_task(), but also only updates
631 * the timestamp if it is not already set. It's assumed that 631 * the timestamp if it is not already set. It's assumed that
632 * sched_info_dequeued() will clear that stamp when appropriate. 632 * sched_info_dequeued() will clear that stamp when appropriate.
633 */ 633 */
634 static inline void sched_info_queued(struct task_struct *t) 634 static inline void sched_info_queued(struct task_struct *t)
635 { 635 {
636 if (unlikely(sched_info_on())) 636 if (unlikely(sched_info_on()))
637 if (!t->sched_info.last_queued) 637 if (!t->sched_info.last_queued)
638 t->sched_info.last_queued = jiffies; 638 t->sched_info.last_queued = jiffies;
639 } 639 }
640 640
641 /* 641 /*
642 * Called when a process ceases being the active-running process, either 642 * Called when a process ceases being the active-running process, either
643 * voluntarily or involuntarily. Now we can calculate how long we ran. 643 * voluntarily or involuntarily. Now we can calculate how long we ran.
644 */ 644 */
645 static inline void sched_info_depart(struct task_struct *t) 645 static inline void sched_info_depart(struct task_struct *t)
646 { 646 {
647 unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival; 647 unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival;
648 648
649 t->sched_info.cpu_time += delta_jiffies; 649 t->sched_info.cpu_time += delta_jiffies;
650 rq_sched_info_depart(task_rq(t), delta_jiffies); 650 rq_sched_info_depart(task_rq(t), delta_jiffies);
651 } 651 }
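
A small userspace model, not part of this patch, of the bookkeeping done by sched_info_queued()/sched_info_arrive()/sched_info_depart() above, driven by a fake jiffies counter. The struct and function names are illustrative and the numbers are made up.

#include <stdio.h>

struct toy_sched_info {
        unsigned long last_queued;      /* when we first entered a runqueue */
        unsigned long last_arrival;     /* when we last got the cpu */
        unsigned long run_delay;        /* total time spent waiting to run */
        unsigned long cpu_time;         /* total time spent on the cpu */
        unsigned long pcnt;             /* number of times we got the cpu */
};

static void info_queued(struct toy_sched_info *t, unsigned long now)
{
        if (!t->last_queued)            /* keep the *first* queueing time */
                t->last_queued = now;
}

static void info_arrive(struct toy_sched_info *t, unsigned long now)
{
        if (t->last_queued)
                t->run_delay += now - t->last_queued;
        t->last_queued = 0;             /* what sched_info_dequeued() does */
        t->last_arrival = now;
        t->pcnt++;
}

static void info_depart(struct toy_sched_info *t, unsigned long now)
{
        t->cpu_time += now - t->last_arrival;
}

int main(void)
{
        struct toy_sched_info t = { 0 };

        info_queued(&t, 100);           /* woken up, placed on a runqueue */
        info_arrive(&t, 103);           /* got the cpu: waited 3 jiffies */
        info_depart(&t, 110);           /* switched out: ran for 7 jiffies */
        printf("run_delay=%lu cpu_time=%lu pcnt=%lu\n",
               t.run_delay, t.cpu_time, t.pcnt);        /* 3 7 1 */
        return 0;
}
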
652 652
653 /* 653 /*
654 * Called when tasks are switched involuntarily, typically due to expiring 654 * Called when tasks are switched involuntarily, typically due to expiring
655 * their time slice. (This may also be called when switching to or from 655 * their time slice. (This may also be called when switching to or from
656 * the idle task.) We are only called when prev != next. 656 * the idle task.) We are only called when prev != next.
657 */ 657 */
658 static inline void 658 static inline void
659 __sched_info_switch(struct task_struct *prev, struct task_struct *next) 659 __sched_info_switch(struct task_struct *prev, struct task_struct *next)
660 { 660 {
661 struct rq *rq = task_rq(prev); 661 struct rq *rq = task_rq(prev);
662 662
663 /* 663 /*
664 * prev now departs the cpu. It's not interesting to record 664 * prev now departs the cpu. It's not interesting to record
665 * stats about how efficient we were at scheduling the idle 665 * stats about how efficient we were at scheduling the idle
666 * process, however. 666 * process, however.
667 */ 667 */
668 if (prev != rq->idle) 668 if (prev != rq->idle)
669 sched_info_depart(prev); 669 sched_info_depart(prev);
670 670
671 if (next != rq->idle) 671 if (next != rq->idle)
672 sched_info_arrive(next); 672 sched_info_arrive(next);
673 } 673 }
674 static inline void 674 static inline void
675 sched_info_switch(struct task_struct *prev, struct task_struct *next) 675 sched_info_switch(struct task_struct *prev, struct task_struct *next)
676 { 676 {
677 if (unlikely(sched_info_on())) 677 if (unlikely(sched_info_on()))
678 __sched_info_switch(prev, next); 678 __sched_info_switch(prev, next);
679 } 679 }
680 #else 680 #else
681 #define sched_info_queued(t) do { } while (0) 681 #define sched_info_queued(t) do { } while (0)
682 #define sched_info_switch(t, next) do { } while (0) 682 #define sched_info_switch(t, next) do { } while (0)
683 #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ 683 #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
684 684
685 /* 685 /*
686 * Adding/removing a task to/from a priority array: 686 * Adding/removing a task to/from a priority array:
687 */ 687 */
688 static void dequeue_task(struct task_struct *p, struct prio_array *array) 688 static void dequeue_task(struct task_struct *p, struct prio_array *array)
689 { 689 {
690 array->nr_active--; 690 array->nr_active--;
691 list_del(&p->run_list); 691 list_del(&p->run_list);
692 if (list_empty(array->queue + p->prio)) 692 if (list_empty(array->queue + p->prio))
693 __clear_bit(p->prio, array->bitmap); 693 __clear_bit(p->prio, array->bitmap);
694 } 694 }
695 695
696 static void enqueue_task(struct task_struct *p, struct prio_array *array) 696 static void enqueue_task(struct task_struct *p, struct prio_array *array)
697 { 697 {
698 sched_info_queued(p); 698 sched_info_queued(p);
699 list_add_tail(&p->run_list, array->queue + p->prio); 699 list_add_tail(&p->run_list, array->queue + p->prio);
700 __set_bit(p->prio, array->bitmap); 700 __set_bit(p->prio, array->bitmap);
701 array->nr_active++; 701 array->nr_active++;
702 p->array = array; 702 p->array = array;
703 } 703 }
704 704
705 /* 705 /*
706 * Put task to the end of the run list without the overhead of dequeue 706 * Put task to the end of the run list without the overhead of dequeue
707 * followed by enqueue. 707 * followed by enqueue.
708 */ 708 */
709 static void requeue_task(struct task_struct *p, struct prio_array *array) 709 static void requeue_task(struct task_struct *p, struct prio_array *array)
710 { 710 {
711 list_move_tail(&p->run_list, array->queue + p->prio); 711 list_move_tail(&p->run_list, array->queue + p->prio);
712 } 712 }
713 713
714 static inline void 714 static inline void
715 enqueue_task_head(struct task_struct *p, struct prio_array *array) 715 enqueue_task_head(struct task_struct *p, struct prio_array *array)
716 { 716 {
717 list_add(&p->run_list, array->queue + p->prio); 717 list_add(&p->run_list, array->queue + p->prio);
718 __set_bit(p->prio, array->bitmap); 718 __set_bit(p->prio, array->bitmap);
719 array->nr_active++; 719 array->nr_active++;
720 p->array = array; 720 p->array = array;
721 } 721 }
722 722
723 /* 723 /*
724 * __normal_prio - return the priority that is based on the static 724 * __normal_prio - return the priority that is based on the static
725 * priority but is modified by bonuses/penalties. 725 * priority but is modified by bonuses/penalties.
726 * 726 *
727 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] 727 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
728 * into the -5 ... 0 ... +5 bonus/penalty range. 728 * into the -5 ... 0 ... +5 bonus/penalty range.
729 * 729 *
730 * We use 25% of the full 0...39 priority range so that: 730 * We use 25% of the full 0...39 priority range so that:
731 * 731 *
732 * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. 732 * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
733 * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. 733 * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
734 * 734 *
735 * Both properties are important to certain workloads. 735 * Both properties are important to certain workloads.
736 */ 736 */
737 737
738 static inline int __normal_prio(struct task_struct *p) 738 static inline int __normal_prio(struct task_struct *p)
739 { 739 {
740 int bonus, prio; 740 int bonus, prio;
741 741
742 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; 742 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
743 743
744 prio = p->static_prio - bonus; 744 prio = p->static_prio - bonus;
745 if (prio < MAX_RT_PRIO) 745 if (prio < MAX_RT_PRIO)
746 prio = MAX_RT_PRIO; 746 prio = MAX_RT_PRIO;
747 if (prio > MAX_PRIO-1) 747 if (prio > MAX_PRIO-1)
748 prio = MAX_PRIO-1; 748 prio = MAX_PRIO-1;
749 return prio; 749 return prio;
750 } 750 }
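
A worked example, not part of this patch, of the -5 ... +5 bonus applied by __normal_prio(). MAX_BONUS = 10, MAX_RT_PRIO = 100, MAX_PRIO = 140 and a nice-0 static priority of 120 are assumed values; bonus_in stands for CURRENT_BONUS(p), i.e. the sleep average already scaled into 0..MAX_BONUS.

#include <stdio.h>

#define MAX_BONUS       10
#define MAX_RT_PRIO     100
#define MAX_PRIO        140

static int toy_normal_prio(int static_prio, int bonus_in)
{
        /* bonus_in plays the role of CURRENT_BONUS(p): 0..MAX_BONUS. */
        int prio = static_prio - (bonus_in - MAX_BONUS / 2);

        if (prio < MAX_RT_PRIO)
                prio = MAX_RT_PRIO;
        if (prio > MAX_PRIO - 1)
                prio = MAX_PRIO - 1;
        return prio;
}

int main(void)
{
        printf("cpu hog : %d\n", toy_normal_prio(120, 0));             /* 125 */
        printf("average : %d\n", toy_normal_prio(120, MAX_BONUS / 2)); /* 120 */
        printf("sleeper : %d\n", toy_normal_prio(120, MAX_BONUS));     /* 115 */
        return 0;
}
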
751 751
752 /* 752 /*
753 * To aid in avoiding the subversion of "niceness" due to uneven distribution 753 * To aid in avoiding the subversion of "niceness" due to uneven distribution
754 * of tasks with abnormal "nice" values across CPUs, the contribution that 754 * of tasks with abnormal "nice" values across CPUs, the contribution that
755 * each task makes to its run queue's load is weighted according to its 755 * each task makes to its run queue's load is weighted according to its
756 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a 756 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
757 * scaled version of the new time slice allocation that they receive on time 757 * scaled version of the new time slice allocation that they receive on time
758 * slice expiry etc. 758 * slice expiry etc.
759 */ 759 */
760 760
761 /* 761 /*
762 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE 762 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
763 * If static_prio_timeslice() is ever changed to break this assumption then 763 * If static_prio_timeslice() is ever changed to break this assumption then
764 * this code will need modification 764 * this code will need modification
765 */ 765 */
766 #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE 766 #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
767 #define LOAD_WEIGHT(lp) \ 767 #define LOAD_WEIGHT(lp) \
768 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) 768 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
769 #define PRIO_TO_LOAD_WEIGHT(prio) \ 769 #define PRIO_TO_LOAD_WEIGHT(prio) \
770 LOAD_WEIGHT(static_prio_timeslice(prio)) 770 LOAD_WEIGHT(static_prio_timeslice(prio))
771 #define RTPRIO_TO_LOAD_WEIGHT(rp) \ 771 #define RTPRIO_TO_LOAD_WEIGHT(rp) \
772 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) 772 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
773 773
774 static void set_load_weight(struct task_struct *p) 774 static void set_load_weight(struct task_struct *p)
775 { 775 {
776 if (has_rt_policy(p)) { 776 if (has_rt_policy(p)) {
777 #ifdef CONFIG_SMP 777 #ifdef CONFIG_SMP
778 if (p == task_rq(p)->migration_thread) 778 if (p == task_rq(p)->migration_thread)
779 /* 779 /*
780 * The migration thread does the actual balancing. 780 * The migration thread does the actual balancing.
781 * Giving its load any weight will skew balancing 781 * Giving its load any weight will skew balancing
782 * adversely. 782 * adversely.
783 */ 783 */
784 p->load_weight = 0; 784 p->load_weight = 0;
785 else 785 else
786 #endif 786 #endif
787 p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); 787 p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
788 } else 788 } else
789 p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); 789 p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
790 } 790 }
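
Rough numbers, not part of this patch, for the load weights defined above, assuming SCHED_LOAD_SCALE = 128, HZ = 1000 and the 800/100/5 ms timeslices for nice -20/0/+19 quoted earlier. A nice-0 task then weighs exactly SCHED_LOAD_SCALE, a nice -20 task counts as eight of them, and a nice +19 task is nearly invisible to the load balancer.

#include <stdio.h>

#define SCHED_LOAD_SCALE        128UL
#define TIME_SLICE_NICE_ZERO    100     /* DEF_TIMESLICE in jiffies at HZ=1000 */
#define LOAD_WEIGHT(lp)         (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)

int main(void)
{
        printf("nice -20: %lu\n", LOAD_WEIGHT(800));    /* 1024 */
        printf("nice   0: %lu\n", LOAD_WEIGHT(100));    /*  128 */
        printf("nice +19: %lu\n", LOAD_WEIGHT(5));      /*    6 */
        return 0;
}
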
791 791
792 static inline void 792 static inline void
793 inc_raw_weighted_load(struct rq *rq, const struct task_struct *p) 793 inc_raw_weighted_load(struct rq *rq, const struct task_struct *p)
794 { 794 {
795 rq->raw_weighted_load += p->load_weight; 795 rq->raw_weighted_load += p->load_weight;
796 } 796 }
797 797
798 static inline void 798 static inline void
799 dec_raw_weighted_load(struct rq *rq, const struct task_struct *p) 799 dec_raw_weighted_load(struct rq *rq, const struct task_struct *p)
800 { 800 {
801 rq->raw_weighted_load -= p->load_weight; 801 rq->raw_weighted_load -= p->load_weight;
802 } 802 }
803 803
804 static inline void inc_nr_running(struct task_struct *p, struct rq *rq) 804 static inline void inc_nr_running(struct task_struct *p, struct rq *rq)
805 { 805 {
806 rq->nr_running++; 806 rq->nr_running++;
807 inc_raw_weighted_load(rq, p); 807 inc_raw_weighted_load(rq, p);
808 } 808 }
809 809
810 static inline void dec_nr_running(struct task_struct *p, struct rq *rq) 810 static inline void dec_nr_running(struct task_struct *p, struct rq *rq)
811 { 811 {
812 rq->nr_running--; 812 rq->nr_running--;
813 dec_raw_weighted_load(rq, p); 813 dec_raw_weighted_load(rq, p);
814 } 814 }
815 815
816 /* 816 /*
817 * Calculate the expected normal priority: i.e. priority 817 * Calculate the expected normal priority: i.e. priority
818 * without taking RT-inheritance into account. Might be 818 * without taking RT-inheritance into account. Might be
819 * boosted by interactivity modifiers. Changes upon fork, 819 * boosted by interactivity modifiers. Changes upon fork,
820 * setprio syscalls, and whenever the interactivity 820 * setprio syscalls, and whenever the interactivity
821 * estimator recalculates. 821 * estimator recalculates.
822 */ 822 */
823 static inline int normal_prio(struct task_struct *p) 823 static inline int normal_prio(struct task_struct *p)
824 { 824 {
825 int prio; 825 int prio;
826 826
827 if (has_rt_policy(p)) 827 if (has_rt_policy(p))
828 prio = MAX_RT_PRIO-1 - p->rt_priority; 828 prio = MAX_RT_PRIO-1 - p->rt_priority;
829 else 829 else
830 prio = __normal_prio(p); 830 prio = __normal_prio(p);
831 return prio; 831 return prio;
832 } 832 }
833 833
834 /* 834 /*
835 * Calculate the current priority, i.e. the priority 835 * Calculate the current priority, i.e. the priority
836 * taken into account by the scheduler. This value might 836 * taken into account by the scheduler. This value might
837 * be boosted by RT tasks, or might be boosted by 837 * be boosted by RT tasks, or might be boosted by
838 * interactivity modifiers. Will be RT if the task got 838 * interactivity modifiers. Will be RT if the task got
839 * RT-boosted. If not then it returns p->normal_prio. 839 * RT-boosted. If not then it returns p->normal_prio.
840 */ 840 */
841 static int effective_prio(struct task_struct *p) 841 static int effective_prio(struct task_struct *p)
842 { 842 {
843 p->normal_prio = normal_prio(p); 843 p->normal_prio = normal_prio(p);
844 /* 844 /*
845 * If we are RT tasks or we were boosted to RT priority, 845 * If we are RT tasks or we were boosted to RT priority,
846 * keep the priority unchanged. Otherwise, update priority 846 * keep the priority unchanged. Otherwise, update priority
847 * to the normal priority: 847 * to the normal priority:
848 */ 848 */
849 if (!rt_prio(p->prio)) 849 if (!rt_prio(p->prio))
850 return p->normal_prio; 850 return p->normal_prio;
851 return p->prio; 851 return p->prio;
852 } 852 }
853 853
854 /* 854 /*
855 * __activate_task - move a task to the runqueue. 855 * __activate_task - move a task to the runqueue.
856 */ 856 */
857 static void __activate_task(struct task_struct *p, struct rq *rq) 857 static void __activate_task(struct task_struct *p, struct rq *rq)
858 { 858 {
859 struct prio_array *target = rq->active; 859 struct prio_array *target = rq->active;
860 860
861 if (batch_task(p)) 861 if (batch_task(p))
862 target = rq->expired; 862 target = rq->expired;
863 enqueue_task(p, target); 863 enqueue_task(p, target);
864 inc_nr_running(p, rq); 864 inc_nr_running(p, rq);
865 } 865 }
866 866
867 /* 867 /*
868 * __activate_idle_task - move idle task to the _front_ of runqueue. 868 * __activate_idle_task - move idle task to the _front_ of runqueue.
869 */ 869 */
870 static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) 870 static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
871 { 871 {
872 enqueue_task_head(p, rq->active); 872 enqueue_task_head(p, rq->active);
873 inc_nr_running(p, rq); 873 inc_nr_running(p, rq);
874 } 874 }
875 875
876 /* 876 /*
877 * Recalculate p->normal_prio and p->prio after having slept, 877 * Recalculate p->normal_prio and p->prio after having slept,
878 * updating the sleep-average too: 878 * updating the sleep-average too:
879 */ 879 */
880 static int recalc_task_prio(struct task_struct *p, unsigned long long now) 880 static int recalc_task_prio(struct task_struct *p, unsigned long long now)
881 { 881 {
882 /* Caller must always ensure 'now >= p->timestamp' */ 882 /* Caller must always ensure 'now >= p->timestamp' */
883 unsigned long sleep_time = now - p->timestamp; 883 unsigned long sleep_time = now - p->timestamp;
884 884
885 if (batch_task(p)) 885 if (batch_task(p))
886 sleep_time = 0; 886 sleep_time = 0;
887 887
888 if (likely(sleep_time > 0)) { 888 if (likely(sleep_time > 0)) {
889 /* 889 /*
890 * This ceiling is set to the lowest priority that would allow 890 * This ceiling is set to the lowest priority that would allow
891 * a task to be reinserted into the active array on timeslice 891 * a task to be reinserted into the active array on timeslice
892 * completion. 892 * completion.
893 */ 893 */
894 unsigned long ceiling = INTERACTIVE_SLEEP(p); 894 unsigned long ceiling = INTERACTIVE_SLEEP(p);
895 895
896 if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { 896 if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
897 /* 897 /*
898 * Prevents user tasks from achieving best priority 898 * Prevents user tasks from achieving best priority
899 * with one single large enough sleep. 899 * with one single large enough sleep.
900 */ 900 */
901 p->sleep_avg = ceiling; 901 p->sleep_avg = ceiling;
902 /* 902 /*
903 * Using INTERACTIVE_SLEEP() as a ceiling places a 903 * Using INTERACTIVE_SLEEP() as a ceiling places a
904 * nice(0) task 1ms sleep away from promotion, and 904 * nice(0) task 1ms sleep away from promotion, and
905 * gives it 700ms to round-robin with no chance of 905 * gives it 700ms to round-robin with no chance of
906 * being demoted. This is more than generous, so 906 * being demoted. This is more than generous, so
907 * mark this sleep as non-interactive to prevent the 907 * mark this sleep as non-interactive to prevent the
908 * on-runqueue bonus logic from intervening should 908 * on-runqueue bonus logic from intervening should
909 * this task not receive cpu immediately. 909 * this task not receive cpu immediately.
910 */ 910 */
911 p->sleep_type = SLEEP_NONINTERACTIVE; 911 p->sleep_type = SLEEP_NONINTERACTIVE;
912 } else { 912 } else {
913 /* 913 /*
914 * Tasks waking from uninterruptible sleep are 914 * Tasks waking from uninterruptible sleep are
915 * limited in their sleep_avg rise as they 915 * limited in their sleep_avg rise as they
916 * are likely to be waiting on I/O 916 * are likely to be waiting on I/O
917 */ 917 */
918 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { 918 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
919 if (p->sleep_avg >= ceiling) 919 if (p->sleep_avg >= ceiling)
920 sleep_time = 0; 920 sleep_time = 0;
921 else if (p->sleep_avg + sleep_time >= 921 else if (p->sleep_avg + sleep_time >=
922 ceiling) { 922 ceiling) {
923 p->sleep_avg = ceiling; 923 p->sleep_avg = ceiling;
924 sleep_time = 0; 924 sleep_time = 0;
925 } 925 }
926 } 926 }
927 927
928 /* 928 /*
929 * This code gives a bonus to interactive tasks. 929 * This code gives a bonus to interactive tasks.
930 * 930 *
931 * The boost works by updating the 'average sleep time' 931 * The boost works by updating the 'average sleep time'
932 * value here, based on ->timestamp. The more time a 932 * value here, based on ->timestamp. The more time a
933 * task spends sleeping, the higher the average gets - 933 * task spends sleeping, the higher the average gets -
934 * and the higher the priority boost gets as well. 934 * and the higher the priority boost gets as well.
935 */ 935 */
936 p->sleep_avg += sleep_time; 936 p->sleep_avg += sleep_time;
937 937
938 } 938 }
939 if (p->sleep_avg > NS_MAX_SLEEP_AVG) 939 if (p->sleep_avg > NS_MAX_SLEEP_AVG)
940 p->sleep_avg = NS_MAX_SLEEP_AVG; 940 p->sleep_avg = NS_MAX_SLEEP_AVG;
941 } 941 }
942 942
943 return effective_prio(p); 943 return effective_prio(p);
944 } 944 }
945 945
946 /* 946 /*
947 * activate_task - move a task to the runqueue and do priority recalculation 947 * activate_task - move a task to the runqueue and do priority recalculation
948 * 948 *
949 * Update all the scheduling statistics stuff. (sleep average 949 * Update all the scheduling statistics stuff. (sleep average
950 * calculation, priority modifiers, etc.) 950 * calculation, priority modifiers, etc.)
951 */ 951 */
952 static void activate_task(struct task_struct *p, struct rq *rq, int local) 952 static void activate_task(struct task_struct *p, struct rq *rq, int local)
953 { 953 {
954 unsigned long long now; 954 unsigned long long now;
955 955
956 if (rt_task(p)) 956 if (rt_task(p))
957 goto out; 957 goto out;
958 958
959 now = sched_clock(); 959 now = sched_clock();
960 #ifdef CONFIG_SMP 960 #ifdef CONFIG_SMP
961 if (!local) { 961 if (!local) {
962 /* Compensate for drifting sched_clock */ 962 /* Compensate for drifting sched_clock */
963 struct rq *this_rq = this_rq(); 963 struct rq *this_rq = this_rq();
964 now = (now - this_rq->most_recent_timestamp) 964 now = (now - this_rq->most_recent_timestamp)
965 + rq->most_recent_timestamp; 965 + rq->most_recent_timestamp;
966 } 966 }
967 #endif 967 #endif
968 968
969 /* 969 /*
970 * Sleep time is in units of nanosecs, so shift by 20 to get a 970 * Sleep time is in units of nanosecs, so shift by 20 to get a
971 * milliseconds-range estimation of the amount of time that the task 971 * milliseconds-range estimation of the amount of time that the task
972 * spent sleeping: 972 * spent sleeping:
973 */ 973 */
974 if (unlikely(prof_on == SLEEP_PROFILING)) { 974 if (unlikely(prof_on == SLEEP_PROFILING)) {
975 if (p->state == TASK_UNINTERRUPTIBLE) 975 if (p->state == TASK_UNINTERRUPTIBLE)
976 profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), 976 profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
977 (now - p->timestamp) >> 20); 977 (now - p->timestamp) >> 20);
978 } 978 }
979 979
980 p->prio = recalc_task_prio(p, now); 980 p->prio = recalc_task_prio(p, now);
981 981
982 /* 982 /*
983 * This checks to make sure it's not an uninterruptible task 983 * This checks to make sure it's not an uninterruptible task
984 * that is now waking up. 984 * that is now waking up.
985 */ 985 */
986 if (p->sleep_type == SLEEP_NORMAL) { 986 if (p->sleep_type == SLEEP_NORMAL) {
987 /* 987 /*
988 * Tasks which were woken up by interrupts (i.e. hw events) 988 * Tasks which were woken up by interrupts (i.e. hw events)
989 * are most likely of interactive nature. So we give them 989 * are most likely of interactive nature. So we give them
990 * the credit of extending their sleep time to the period 990 * the credit of extending their sleep time to the period
991 * of time they spend on the runqueue, waiting for execution 991 * of time they spend on the runqueue, waiting for execution
992 * on a CPU, first time around: 992 * on a CPU, first time around:
993 */ 993 */
994 if (in_interrupt()) 994 if (in_interrupt())
995 p->sleep_type = SLEEP_INTERRUPTED; 995 p->sleep_type = SLEEP_INTERRUPTED;
996 else { 996 else {
997 /* 997 /*
998 * Normal first-time wakeups get a credit too for 998 * Normal first-time wakeups get a credit too for
999 * on-runqueue time, but it will be weighted down: 999 * on-runqueue time, but it will be weighted down:
1000 */ 1000 */
1001 p->sleep_type = SLEEP_INTERACTIVE; 1001 p->sleep_type = SLEEP_INTERACTIVE;
1002 } 1002 }
1003 } 1003 }
1004 p->timestamp = now; 1004 p->timestamp = now;
1005 out: 1005 out:
1006 __activate_task(p, rq); 1006 __activate_task(p, rq);
1007 } 1007 }
1008 1008
1009 /* 1009 /*
1010 * deactivate_task - remove a task from the runqueue. 1010 * deactivate_task - remove a task from the runqueue.
1011 */ 1011 */
1012 static void deactivate_task(struct task_struct *p, struct rq *rq) 1012 static void deactivate_task(struct task_struct *p, struct rq *rq)
1013 { 1013 {
1014 dec_nr_running(p, rq); 1014 dec_nr_running(p, rq);
1015 dequeue_task(p, p->array); 1015 dequeue_task(p, p->array);
1016 p->array = NULL; 1016 p->array = NULL;
1017 } 1017 }
1018 1018
1019 /* 1019 /*
1020 * resched_task - mark a task 'to be rescheduled now'. 1020 * resched_task - mark a task 'to be rescheduled now'.
1021 * 1021 *
1022 * On UP this means the setting of the need_resched flag, on SMP it 1022 * On UP this means the setting of the need_resched flag, on SMP it
1023 * might also involve a cross-CPU call to trigger the scheduler on 1023 * might also involve a cross-CPU call to trigger the scheduler on
1024 * the target CPU. 1024 * the target CPU.
1025 */ 1025 */
1026 #ifdef CONFIG_SMP 1026 #ifdef CONFIG_SMP
1027 1027
1028 #ifndef tsk_is_polling 1028 #ifndef tsk_is_polling
1029 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 1029 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1030 #endif 1030 #endif
1031 1031
1032 static void resched_task(struct task_struct *p) 1032 static void resched_task(struct task_struct *p)
1033 { 1033 {
1034 int cpu; 1034 int cpu;
1035 1035
1036 assert_spin_locked(&task_rq(p)->lock); 1036 assert_spin_locked(&task_rq(p)->lock);
1037 1037
1038 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) 1038 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
1039 return; 1039 return;
1040 1040
1041 set_tsk_thread_flag(p, TIF_NEED_RESCHED); 1041 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
1042 1042
1043 cpu = task_cpu(p); 1043 cpu = task_cpu(p);
1044 if (cpu == smp_processor_id()) 1044 if (cpu == smp_processor_id())
1045 return; 1045 return;
1046 1046
1047 /* NEED_RESCHED must be visible before we test polling */ 1047 /* NEED_RESCHED must be visible before we test polling */
1048 smp_mb(); 1048 smp_mb();
1049 if (!tsk_is_polling(p)) 1049 if (!tsk_is_polling(p))
1050 smp_send_reschedule(cpu); 1050 smp_send_reschedule(cpu);
1051 } 1051 }
1052 #else 1052 #else
1053 static inline void resched_task(struct task_struct *p) 1053 static inline void resched_task(struct task_struct *p)
1054 { 1054 {
1055 assert_spin_locked(&task_rq(p)->lock); 1055 assert_spin_locked(&task_rq(p)->lock);
1056 set_tsk_need_resched(p); 1056 set_tsk_need_resched(p);
1057 } 1057 }
1058 #endif 1058 #endif
1059 1059
1060 /** 1060 /**
1061 * task_curr - is this task currently executing on a CPU? 1061 * task_curr - is this task currently executing on a CPU?
1062 * @p: the task in question. 1062 * @p: the task in question.
1063 */ 1063 */
1064 inline int task_curr(const struct task_struct *p) 1064 inline int task_curr(const struct task_struct *p)
1065 { 1065 {
1066 return cpu_curr(task_cpu(p)) == p; 1066 return cpu_curr(task_cpu(p)) == p;
1067 } 1067 }
1068 1068
1069 /* Used instead of source_load when we know the type == 0 */ 1069 /* Used instead of source_load when we know the type == 0 */
1070 unsigned long weighted_cpuload(const int cpu) 1070 unsigned long weighted_cpuload(const int cpu)
1071 { 1071 {
1072 return cpu_rq(cpu)->raw_weighted_load; 1072 return cpu_rq(cpu)->raw_weighted_load;
1073 } 1073 }
1074 1074
1075 #ifdef CONFIG_SMP 1075 #ifdef CONFIG_SMP
1076 struct migration_req { 1076 struct migration_req {
1077 struct list_head list; 1077 struct list_head list;
1078 1078
1079 struct task_struct *task; 1079 struct task_struct *task;
1080 int dest_cpu; 1080 int dest_cpu;
1081 1081
1082 struct completion done; 1082 struct completion done;
1083 }; 1083 };
1084 1084
1085 /* 1085 /*
1086 * The task's runqueue lock must be held. 1086 * The task's runqueue lock must be held.
1087 * Returns true if you have to wait for migration thread. 1087 * Returns true if you have to wait for migration thread.
1088 */ 1088 */
1089 static int 1089 static int
1090 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) 1090 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1091 { 1091 {
1092 struct rq *rq = task_rq(p); 1092 struct rq *rq = task_rq(p);
1093 1093
1094 /* 1094 /*
1095 * If the task is not on a runqueue (and not running), then 1095 * If the task is not on a runqueue (and not running), then
1096 * it is sufficient to simply update the task's cpu field. 1096 * it is sufficient to simply update the task's cpu field.
1097 */ 1097 */
1098 if (!p->array && !task_running(rq, p)) { 1098 if (!p->array && !task_running(rq, p)) {
1099 set_task_cpu(p, dest_cpu); 1099 set_task_cpu(p, dest_cpu);
1100 return 0; 1100 return 0;
1101 } 1101 }
1102 1102
1103 init_completion(&req->done); 1103 init_completion(&req->done);
1104 req->task = p; 1104 req->task = p;
1105 req->dest_cpu = dest_cpu; 1105 req->dest_cpu = dest_cpu;
1106 list_add(&req->list, &rq->migration_queue); 1106 list_add(&req->list, &rq->migration_queue);
1107 1107
1108 return 1; 1108 return 1;
1109 } 1109 }
1110 1110
1111 /* 1111 /*
1112 * wait_task_inactive - wait for a thread to unschedule. 1112 * wait_task_inactive - wait for a thread to unschedule.
1113 * 1113 *
1114 * The caller must ensure that the task *will* unschedule sometime soon, 1114 * The caller must ensure that the task *will* unschedule sometime soon,
1115 * else this function might spin for a *long* time. This function can't 1115 * else this function might spin for a *long* time. This function can't
1116 * be called with interrupts off, or it may introduce deadlock with 1116 * be called with interrupts off, or it may introduce deadlock with
1117 * smp_call_function() if an IPI is sent by the same process we are 1117 * smp_call_function() if an IPI is sent by the same process we are
1118 * waiting to become inactive. 1118 * waiting to become inactive.
1119 */ 1119 */
1120 void wait_task_inactive(struct task_struct *p) 1120 void wait_task_inactive(struct task_struct *p)
1121 { 1121 {
1122 unsigned long flags; 1122 unsigned long flags;
1123 struct rq *rq; 1123 struct rq *rq;
1124 int preempted; 1124 int preempted;
1125 1125
1126 repeat: 1126 repeat:
1127 rq = task_rq_lock(p, &flags); 1127 rq = task_rq_lock(p, &flags);
1128 /* Must be off runqueue entirely, not preempted. */ 1128 /* Must be off runqueue entirely, not preempted. */
1129 if (unlikely(p->array || task_running(rq, p))) { 1129 if (unlikely(p->array || task_running(rq, p))) {
1130 /* If it's preempted, we yield. It could be a while. */ 1130 /* If it's preempted, we yield. It could be a while. */
1131 preempted = !task_running(rq, p); 1131 preempted = !task_running(rq, p);
1132 task_rq_unlock(rq, &flags); 1132 task_rq_unlock(rq, &flags);
1133 cpu_relax(); 1133 cpu_relax();
1134 if (preempted) 1134 if (preempted)
1135 yield(); 1135 yield();
1136 goto repeat; 1136 goto repeat;
1137 } 1137 }
1138 task_rq_unlock(rq, &flags); 1138 task_rq_unlock(rq, &flags);
1139 } 1139 }
1140 1140
1141 /*** 1141 /***
1142 * kick_process - kick a running thread to enter/exit the kernel 1142 * kick_process - kick a running thread to enter/exit the kernel
1143 * @p: the to-be-kicked thread 1143 * @p: the to-be-kicked thread
1144 * 1144 *
1145 * Cause a process which is running on another CPU to enter 1145 * Cause a process which is running on another CPU to enter
1146 * kernel-mode, without any delay. (to get signals handled.) 1146 * kernel-mode, without any delay. (to get signals handled.)
1147 * 1147 *
1148 * NOTE: this function doesn't have to take the runqueue lock, 1148 * NOTE: this function doesn't have to take the runqueue lock,
1149 * because all it wants to ensure is that the remote task enters 1149 * because all it wants to ensure is that the remote task enters
1150 * the kernel. If the IPI races and the task has been migrated 1150 * the kernel. If the IPI races and the task has been migrated
1151 * to another CPU then no harm is done and the purpose has been 1151 * to another CPU then no harm is done and the purpose has been
1152 * achieved as well. 1152 * achieved as well.
1153 */ 1153 */
1154 void kick_process(struct task_struct *p) 1154 void kick_process(struct task_struct *p)
1155 { 1155 {
1156 int cpu; 1156 int cpu;
1157 1157
1158 preempt_disable(); 1158 preempt_disable();
1159 cpu = task_cpu(p); 1159 cpu = task_cpu(p);
1160 if ((cpu != smp_processor_id()) && task_curr(p)) 1160 if ((cpu != smp_processor_id()) && task_curr(p))
1161 smp_send_reschedule(cpu); 1161 smp_send_reschedule(cpu);
1162 preempt_enable(); 1162 preempt_enable();
1163 } 1163 }
1164 1164
1165 /* 1165 /*
1166 * Return a low guess at the load of a migration-source cpu weighted 1166 * Return a low guess at the load of a migration-source cpu weighted
1167 * according to the scheduling class and "nice" value. 1167 * according to the scheduling class and "nice" value.
1168 * 1168 *
1169 * We want to under-estimate the load of migration sources, to 1169 * We want to under-estimate the load of migration sources, to
1170 * balance conservatively. 1170 * balance conservatively.
1171 */ 1171 */
1172 static inline unsigned long source_load(int cpu, int type) 1172 static inline unsigned long source_load(int cpu, int type)
1173 { 1173 {
1174 struct rq *rq = cpu_rq(cpu); 1174 struct rq *rq = cpu_rq(cpu);
1175 1175
1176 if (type == 0) 1176 if (type == 0)
1177 return rq->raw_weighted_load; 1177 return rq->raw_weighted_load;
1178 1178
1179 return min(rq->cpu_load[type-1], rq->raw_weighted_load); 1179 return min(rq->cpu_load[type-1], rq->raw_weighted_load);
1180 } 1180 }
1181 1181
1182 /* 1182 /*
1183 * Return a high guess at the load of a migration-target cpu weighted 1183 * Return a high guess at the load of a migration-target cpu weighted
1184 * according to the scheduling class and "nice" value. 1184 * according to the scheduling class and "nice" value.
1185 */ 1185 */
1186 static inline unsigned long target_load(int cpu, int type) 1186 static inline unsigned long target_load(int cpu, int type)
1187 { 1187 {
1188 struct rq *rq = cpu_rq(cpu); 1188 struct rq *rq = cpu_rq(cpu);
1189 1189
1190 if (type == 0) 1190 if (type == 0)
1191 return rq->raw_weighted_load; 1191 return rq->raw_weighted_load;
1192 1192
1193 return max(rq->cpu_load[type-1], rq->raw_weighted_load); 1193 return max(rq->cpu_load[type-1], rq->raw_weighted_load);
1194 } 1194 }
1195 1195
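/*
 * Worked example (hypothetical numbers): suppose the history figure
 * cpu_load[idx-1] is 2048 while raw_weighted_load has just dropped to 1024.
 * Then source_load() returns min(2048, 1024) = 1024 and target_load()
 * returns max(2048, 1024) = 2048.  The source looks lighter and the target
 * looks heavier than the history alone suggests, so the balancer errs on
 * the side of not moving tasks.
 */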
1196 /* 1196 /*
1197 * Return the average load per task on the cpu's run queue 1197 * Return the average load per task on the cpu's run queue
1198 */ 1198 */
1199 static inline unsigned long cpu_avg_load_per_task(int cpu) 1199 static inline unsigned long cpu_avg_load_per_task(int cpu)
1200 { 1200 {
1201 struct rq *rq = cpu_rq(cpu); 1201 struct rq *rq = cpu_rq(cpu);
1202 unsigned long n = rq->nr_running; 1202 unsigned long n = rq->nr_running;
1203 1203
1204 return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; 1204 return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE;
1205 } 1205 }
1206 1206
1207 /* 1207 /*
1208 * find_idlest_group finds and returns the least busy CPU group within the 1208 * find_idlest_group finds and returns the least busy CPU group within the
1209 * domain. 1209 * domain.
1210 */ 1210 */
1211 static struct sched_group * 1211 static struct sched_group *
1212 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) 1212 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1213 { 1213 {
1214 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; 1214 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1215 unsigned long min_load = ULONG_MAX, this_load = 0; 1215 unsigned long min_load = ULONG_MAX, this_load = 0;
1216 int load_idx = sd->forkexec_idx; 1216 int load_idx = sd->forkexec_idx;
1217 int imbalance = 100 + (sd->imbalance_pct-100)/2; 1217 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1218 1218
1219 do { 1219 do {
1220 unsigned long load, avg_load; 1220 unsigned long load, avg_load;
1221 int local_group; 1221 int local_group;
1222 int i; 1222 int i;
1223 1223
1224 /* Skip over this group if it has no CPUs allowed */ 1224 /* Skip over this group if it has no CPUs allowed */
1225 if (!cpus_intersects(group->cpumask, p->cpus_allowed)) 1225 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
1226 goto nextgroup; 1226 goto nextgroup;
1227 1227
1228 local_group = cpu_isset(this_cpu, group->cpumask); 1228 local_group = cpu_isset(this_cpu, group->cpumask);
1229 1229
1230 /* Tally up the load of all CPUs in the group */ 1230 /* Tally up the load of all CPUs in the group */
1231 avg_load = 0; 1231 avg_load = 0;
1232 1232
1233 for_each_cpu_mask(i, group->cpumask) { 1233 for_each_cpu_mask(i, group->cpumask) {
1234 /* Bias balancing toward cpus of our domain */ 1234 /* Bias balancing toward cpus of our domain */
1235 if (local_group) 1235 if (local_group)
1236 load = source_load(i, load_idx); 1236 load = source_load(i, load_idx);
1237 else 1237 else
1238 load = target_load(i, load_idx); 1238 load = target_load(i, load_idx);
1239 1239
1240 avg_load += load; 1240 avg_load += load;
1241 } 1241 }
1242 1242
1243 /* Adjust by relative CPU power of the group */ 1243 /* Adjust by relative CPU power of the group */
1244 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; 1244 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1245 1245
1246 if (local_group) { 1246 if (local_group) {
1247 this_load = avg_load; 1247 this_load = avg_load;
1248 this = group; 1248 this = group;
1249 } else if (avg_load < min_load) { 1249 } else if (avg_load < min_load) {
1250 min_load = avg_load; 1250 min_load = avg_load;
1251 idlest = group; 1251 idlest = group;
1252 } 1252 }
1253 nextgroup: 1253 nextgroup:
1254 group = group->next; 1254 group = group->next;
1255 } while (group != sd->groups); 1255 } while (group != sd->groups);
1256 1256
1257 if (!idlest || 100*this_load < imbalance*min_load) 1257 if (!idlest || 100*this_load < imbalance*min_load)
1258 return NULL; 1258 return NULL;
1259 return idlest; 1259 return idlest;
1260 } 1260 }
1261 1261
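/*
 * Worked example of the imbalance check above (hypothetical numbers): with
 * sd->imbalance_pct = 125, imbalance = 100 + (125 - 100)/2 = 112.  The
 * remote group is only returned when 100*this_load >= 112*min_load, i.e.
 * when the local group carries at least ~12% more cpu_power-scaled load
 * than the idlest group; otherwise NULL is returned and the task stays in
 * the local group.
 */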
1262 /* 1262 /*
1263 * find_idlest_cpu - find the idlest cpu among the cpus in group. 1263 * find_idlest_cpu - find the idlest cpu among the cpus in group.
1264 */ 1264 */
1265 static int 1265 static int
1266 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) 1266 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1267 { 1267 {
1268 cpumask_t tmp; 1268 cpumask_t tmp;
1269 unsigned long load, min_load = ULONG_MAX; 1269 unsigned long load, min_load = ULONG_MAX;
1270 int idlest = -1; 1270 int idlest = -1;
1271 int i; 1271 int i;
1272 1272
1273 /* Traverse only the allowed CPUs */ 1273 /* Traverse only the allowed CPUs */
1274 cpus_and(tmp, group->cpumask, p->cpus_allowed); 1274 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1275 1275
1276 for_each_cpu_mask(i, tmp) { 1276 for_each_cpu_mask(i, tmp) {
1277 load = weighted_cpuload(i); 1277 load = weighted_cpuload(i);
1278 1278
1279 if (load < min_load || (load == min_load && i == this_cpu)) { 1279 if (load < min_load || (load == min_load && i == this_cpu)) {
1280 min_load = load; 1280 min_load = load;
1281 idlest = i; 1281 idlest = i;
1282 } 1282 }
1283 } 1283 }
1284 1284
1285 return idlest; 1285 return idlest;
1286 } 1286 }
1287 1287
1288 /* 1288 /*
1289 * sched_balance_self: balance the current task (running on cpu) in domains 1289 * sched_balance_self: balance the current task (running on cpu) in domains
1290 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and 1290 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1291 * SD_BALANCE_EXEC. 1291 * SD_BALANCE_EXEC.
1292 * 1292 *
1293 * Balance, i.e. select the least loaded group. 1293 * Balance, i.e. select the least loaded group.
1294 * 1294 *
1295 * Returns the target CPU number, or the same CPU if no balancing is needed. 1295 * Returns the target CPU number, or the same CPU if no balancing is needed.
1296 * 1296 *
1297 * preempt must be disabled. 1297 * preempt must be disabled.
1298 */ 1298 */
1299 static int sched_balance_self(int cpu, int flag) 1299 static int sched_balance_self(int cpu, int flag)
1300 { 1300 {
1301 struct task_struct *t = current; 1301 struct task_struct *t = current;
1302 struct sched_domain *tmp, *sd = NULL; 1302 struct sched_domain *tmp, *sd = NULL;
1303 1303
1304 for_each_domain(cpu, tmp) { 1304 for_each_domain(cpu, tmp) {
1305 /* 1305 /*
1306 * If power savings logic is enabled for a domain, stop there. 1306 * If power savings logic is enabled for a domain, stop there.
1307 */ 1307 */
1308 if (tmp->flags & SD_POWERSAVINGS_BALANCE) 1308 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1309 break; 1309 break;
1310 if (tmp->flags & flag) 1310 if (tmp->flags & flag)
1311 sd = tmp; 1311 sd = tmp;
1312 } 1312 }
1313 1313
1314 while (sd) { 1314 while (sd) {
1315 cpumask_t span; 1315 cpumask_t span;
1316 struct sched_group *group; 1316 struct sched_group *group;
1317 int new_cpu, weight; 1317 int new_cpu, weight;
1318 1318
1319 if (!(sd->flags & flag)) { 1319 if (!(sd->flags & flag)) {
1320 sd = sd->child; 1320 sd = sd->child;
1321 continue; 1321 continue;
1322 } 1322 }
1323 1323
1324 span = sd->span; 1324 span = sd->span;
1325 group = find_idlest_group(sd, t, cpu); 1325 group = find_idlest_group(sd, t, cpu);
1326 if (!group) { 1326 if (!group) {
1327 sd = sd->child; 1327 sd = sd->child;
1328 continue; 1328 continue;
1329 } 1329 }
1330 1330
1331 new_cpu = find_idlest_cpu(group, t, cpu); 1331 new_cpu = find_idlest_cpu(group, t, cpu);
1332 if (new_cpu == -1 || new_cpu == cpu) { 1332 if (new_cpu == -1 || new_cpu == cpu) {
1333 /* Now try balancing at a lower domain level of cpu */ 1333 /* Now try balancing at a lower domain level of cpu */
1334 sd = sd->child; 1334 sd = sd->child;
1335 continue; 1335 continue;
1336 } 1336 }
1337 1337
1338 /* Now try balancing at a lower domain level of new_cpu */ 1338 /* Now try balancing at a lower domain level of new_cpu */
1339 cpu = new_cpu; 1339 cpu = new_cpu;
1340 sd = NULL; 1340 sd = NULL;
1341 weight = cpus_weight(span); 1341 weight = cpus_weight(span);
1342 for_each_domain(cpu, tmp) { 1342 for_each_domain(cpu, tmp) {
1343 if (weight <= cpus_weight(tmp->span)) 1343 if (weight <= cpus_weight(tmp->span))
1344 break; 1344 break;
1345 if (tmp->flags & flag) 1345 if (tmp->flags & flag)
1346 sd = tmp; 1346 sd = tmp;
1347 } 1347 }
1348 /* while loop will break here if sd == NULL */ 1348 /* while loop will break here if sd == NULL */
1349 } 1349 }
1350 1350
1351 return cpu; 1351 return cpu;
1352 } 1352 }
1353 1353
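/*
 * Illustrative walk through sched_balance_self() (hypothetical two-level
 * topology): the first loop remembers the widest domain of the
 * forking/execing cpu that has the requested flag set, stopping early if a
 * power-savings domain is hit.  Each pass of the while loop then picks the
 * least loaded group and cpu inside that domain, jumps to the new cpu, and
 * continues from the largest domain of the new cpu whose span is strictly
 * smaller than the span already balanced, so the search narrows level by
 * level until sd becomes NULL.
 */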
1354 #endif /* CONFIG_SMP */ 1354 #endif /* CONFIG_SMP */
1355 1355
1356 /* 1356 /*
1357 * wake_idle() will wake a task on an idle cpu if task->cpu is 1357 * wake_idle() will wake a task on an idle cpu if task->cpu is
1358 * not idle and an idle cpu is available. The span of cpus to 1358 * not idle and an idle cpu is available. The span of cpus to
1359 * search starts with cpus closest then further out as needed, 1359 * search starts with cpus closest then further out as needed,
1360 * so we always favor a closer, idle cpu. 1360 * so we always favor a closer, idle cpu.
1361 * 1361 *
1362 * Returns the CPU we should wake onto. 1362 * Returns the CPU we should wake onto.
1363 */ 1363 */
1364 #if defined(ARCH_HAS_SCHED_WAKE_IDLE) 1364 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1365 static int wake_idle(int cpu, struct task_struct *p) 1365 static int wake_idle(int cpu, struct task_struct *p)
1366 { 1366 {
1367 cpumask_t tmp; 1367 cpumask_t tmp;
1368 struct sched_domain *sd; 1368 struct sched_domain *sd;
1369 int i; 1369 int i;
1370 1370
1371 if (idle_cpu(cpu)) 1371 if (idle_cpu(cpu))
1372 return cpu; 1372 return cpu;
1373 1373
1374 for_each_domain(cpu, sd) { 1374 for_each_domain(cpu, sd) {
1375 if (sd->flags & SD_WAKE_IDLE) { 1375 if (sd->flags & SD_WAKE_IDLE) {
1376 cpus_and(tmp, sd->span, p->cpus_allowed); 1376 cpus_and(tmp, sd->span, p->cpus_allowed);
1377 for_each_cpu_mask(i, tmp) { 1377 for_each_cpu_mask(i, tmp) {
1378 if (idle_cpu(i)) 1378 if (idle_cpu(i))
1379 return i; 1379 return i;
1380 } 1380 }
1381 } 1381 }
1382 else 1382 else
1383 break; 1383 break;
1384 } 1384 }
1385 return cpu; 1385 return cpu;
1386 } 1386 }
1387 #else 1387 #else
1388 static inline int wake_idle(int cpu, struct task_struct *p) 1388 static inline int wake_idle(int cpu, struct task_struct *p)
1389 { 1389 {
1390 return cpu; 1390 return cpu;
1391 } 1391 }
1392 #endif 1392 #endif
1393 1393
1394 /*** 1394 /***
1395 * try_to_wake_up - wake up a thread 1395 * try_to_wake_up - wake up a thread
1396 * @p: the to-be-woken-up thread 1396 * @p: the to-be-woken-up thread
1397 * @state: the mask of task states that can be woken 1397 * @state: the mask of task states that can be woken
1398 * @sync: do a synchronous wakeup? 1398 * @sync: do a synchronous wakeup?
1399 * 1399 *
1400 * Put it on the run-queue if it's not already there. The "current" 1400 * Put it on the run-queue if it's not already there. The "current"
1401 * thread is always on the run-queue (except when the actual 1401 * thread is always on the run-queue (except when the actual
1402 * re-schedule is in progress), and as such you're allowed to do 1402 * re-schedule is in progress), and as such you're allowed to do
1403 * the simpler "current->state = TASK_RUNNING" to mark yourself 1403 * the simpler "current->state = TASK_RUNNING" to mark yourself
1404 * runnable without the overhead of this. 1404 * runnable without the overhead of this.
1405 * 1405 *
1406 * returns failure only if the task is already active. 1406 * returns failure only if the task is already active.
1407 */ 1407 */
1408 static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) 1408 static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1409 { 1409 {
1410 int cpu, this_cpu, success = 0; 1410 int cpu, this_cpu, success = 0;
1411 unsigned long flags; 1411 unsigned long flags;
1412 long old_state; 1412 long old_state;
1413 struct rq *rq; 1413 struct rq *rq;
1414 #ifdef CONFIG_SMP 1414 #ifdef CONFIG_SMP
1415 struct sched_domain *sd, *this_sd = NULL; 1415 struct sched_domain *sd, *this_sd = NULL;
1416 unsigned long load, this_load; 1416 unsigned long load, this_load;
1417 int new_cpu; 1417 int new_cpu;
1418 #endif 1418 #endif
1419 1419
1420 rq = task_rq_lock(p, &flags); 1420 rq = task_rq_lock(p, &flags);
1421 old_state = p->state; 1421 old_state = p->state;
1422 if (!(old_state & state)) 1422 if (!(old_state & state))
1423 goto out; 1423 goto out;
1424 1424
1425 if (p->array) 1425 if (p->array)
1426 goto out_running; 1426 goto out_running;
1427 1427
1428 cpu = task_cpu(p); 1428 cpu = task_cpu(p);
1429 this_cpu = smp_processor_id(); 1429 this_cpu = smp_processor_id();
1430 1430
1431 #ifdef CONFIG_SMP 1431 #ifdef CONFIG_SMP
1432 if (unlikely(task_running(rq, p))) 1432 if (unlikely(task_running(rq, p)))
1433 goto out_activate; 1433 goto out_activate;
1434 1434
1435 new_cpu = cpu; 1435 new_cpu = cpu;
1436 1436
1437 schedstat_inc(rq, ttwu_cnt); 1437 schedstat_inc(rq, ttwu_cnt);
1438 if (cpu == this_cpu) { 1438 if (cpu == this_cpu) {
1439 schedstat_inc(rq, ttwu_local); 1439 schedstat_inc(rq, ttwu_local);
1440 goto out_set_cpu; 1440 goto out_set_cpu;
1441 } 1441 }
1442 1442
1443 for_each_domain(this_cpu, sd) { 1443 for_each_domain(this_cpu, sd) {
1444 if (cpu_isset(cpu, sd->span)) { 1444 if (cpu_isset(cpu, sd->span)) {
1445 schedstat_inc(sd, ttwu_wake_remote); 1445 schedstat_inc(sd, ttwu_wake_remote);
1446 this_sd = sd; 1446 this_sd = sd;
1447 break; 1447 break;
1448 } 1448 }
1449 } 1449 }
1450 1450
1451 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) 1451 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1452 goto out_set_cpu; 1452 goto out_set_cpu;
1453 1453
1454 /* 1454 /*
1455 * Check for affine wakeup and passive balancing possibilities. 1455 * Check for affine wakeup and passive balancing possibilities.
1456 */ 1456 */
1457 if (this_sd) { 1457 if (this_sd) {
1458 int idx = this_sd->wake_idx; 1458 int idx = this_sd->wake_idx;
1459 unsigned int imbalance; 1459 unsigned int imbalance;
1460 1460
1461 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; 1461 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1462 1462
1463 load = source_load(cpu, idx); 1463 load = source_load(cpu, idx);
1464 this_load = target_load(this_cpu, idx); 1464 this_load = target_load(this_cpu, idx);
1465 1465
1466 new_cpu = this_cpu; /* Wake to this CPU if we can */ 1466 new_cpu = this_cpu; /* Wake to this CPU if we can */
1467 1467
1468 if (this_sd->flags & SD_WAKE_AFFINE) { 1468 if (this_sd->flags & SD_WAKE_AFFINE) {
1469 unsigned long tl = this_load; 1469 unsigned long tl = this_load;
1470 unsigned long tl_per_task; 1470 unsigned long tl_per_task;
1471 1471
1472 tl_per_task = cpu_avg_load_per_task(this_cpu); 1472 tl_per_task = cpu_avg_load_per_task(this_cpu);
1473 1473
1474 /* 1474 /*
1475 * If sync wakeup then subtract the (maximum possible) 1475 * If sync wakeup then subtract the (maximum possible)
1476 * effect of the currently running task from the load 1476 * effect of the currently running task from the load
1477 * of the current CPU: 1477 * of the current CPU:
1478 */ 1478 */
1479 if (sync) 1479 if (sync)
1480 tl -= current->load_weight; 1480 tl -= current->load_weight;
1481 1481
1482 if ((tl <= load && 1482 if ((tl <= load &&
1483 tl + target_load(cpu, idx) <= tl_per_task) || 1483 tl + target_load(cpu, idx) <= tl_per_task) ||
1484 100*(tl + p->load_weight) <= imbalance*load) { 1484 100*(tl + p->load_weight) <= imbalance*load) {
1485 /* 1485 /*
1486 * This domain has SD_WAKE_AFFINE and 1486 * This domain has SD_WAKE_AFFINE and
1487 * p is cache cold in this domain, and 1487 * p is cache cold in this domain, and
1488 * there is no bad imbalance. 1488 * there is no bad imbalance.
1489 */ 1489 */
1490 schedstat_inc(this_sd, ttwu_move_affine); 1490 schedstat_inc(this_sd, ttwu_move_affine);
1491 goto out_set_cpu; 1491 goto out_set_cpu;
1492 } 1492 }
1493 } 1493 }
1494 1494
1495 /* 1495 /*
1496 * Start passive balancing when half the imbalance_pct 1496 * Start passive balancing when half the imbalance_pct
1497 * limit is reached. 1497 * limit is reached.
1498 */ 1498 */
1499 if (this_sd->flags & SD_WAKE_BALANCE) { 1499 if (this_sd->flags & SD_WAKE_BALANCE) {
1500 if (imbalance*this_load <= 100*load) { 1500 if (imbalance*this_load <= 100*load) {
1501 schedstat_inc(this_sd, ttwu_move_balance); 1501 schedstat_inc(this_sd, ttwu_move_balance);
1502 goto out_set_cpu; 1502 goto out_set_cpu;
1503 } 1503 }
1504 } 1504 }
1505 } 1505 }
1506 1506
1507 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ 1507 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1508 out_set_cpu: 1508 out_set_cpu:
1509 new_cpu = wake_idle(new_cpu, p); 1509 new_cpu = wake_idle(new_cpu, p);
1510 if (new_cpu != cpu) { 1510 if (new_cpu != cpu) {
1511 set_task_cpu(p, new_cpu); 1511 set_task_cpu(p, new_cpu);
1512 task_rq_unlock(rq, &flags); 1512 task_rq_unlock(rq, &flags);
1513 /* might preempt at this point */ 1513 /* might preempt at this point */
1514 rq = task_rq_lock(p, &flags); 1514 rq = task_rq_lock(p, &flags);
1515 old_state = p->state; 1515 old_state = p->state;
1516 if (!(old_state & state)) 1516 if (!(old_state & state))
1517 goto out; 1517 goto out;
1518 if (p->array) 1518 if (p->array)
1519 goto out_running; 1519 goto out_running;
1520 1520
1521 this_cpu = smp_processor_id(); 1521 this_cpu = smp_processor_id();
1522 cpu = task_cpu(p); 1522 cpu = task_cpu(p);
1523 } 1523 }
1524 1524
1525 out_activate: 1525 out_activate:
1526 #endif /* CONFIG_SMP */ 1526 #endif /* CONFIG_SMP */
1527 if (old_state == TASK_UNINTERRUPTIBLE) { 1527 if (old_state == TASK_UNINTERRUPTIBLE) {
1528 rq->nr_uninterruptible--; 1528 rq->nr_uninterruptible--;
1529 /* 1529 /*
1530 * Tasks on involuntary sleep don't earn 1530 * Tasks on involuntary sleep don't earn
1531 * sleep_avg beyond just interactive state. 1531 * sleep_avg beyond just interactive state.
1532 */ 1532 */
1533 p->sleep_type = SLEEP_NONINTERACTIVE; 1533 p->sleep_type = SLEEP_NONINTERACTIVE;
1534 } else 1534 } else
1535 1535
1536 /* 1536 /*
1537 * Tasks that have marked their sleep as noninteractive get 1537 * Tasks that have marked their sleep as noninteractive get
1538 * woken up with their sleep average not weighted in an 1538 * woken up with their sleep average not weighted in an
1539 * interactive way. 1539 * interactive way.
1540 */ 1540 */
1541 if (old_state & TASK_NONINTERACTIVE) 1541 if (old_state & TASK_NONINTERACTIVE)
1542 p->sleep_type = SLEEP_NONINTERACTIVE; 1542 p->sleep_type = SLEEP_NONINTERACTIVE;
1543 1543
1544 1544
1545 activate_task(p, rq, cpu == this_cpu); 1545 activate_task(p, rq, cpu == this_cpu);
1546 /* 1546 /*
1547 * Sync wakeups (i.e. those types of wakeups where the waker 1547 * Sync wakeups (i.e. those types of wakeups where the waker
1548 * has indicated that it will leave the CPU in short order) 1548 * has indicated that it will leave the CPU in short order)
1549 * don't trigger a preemption, if the woken up task will run on 1549 * don't trigger a preemption, if the woken up task will run on
1550 * this cpu. (in this case the 'I will reschedule' promise of 1550 * this cpu. (in this case the 'I will reschedule' promise of
1551 * the waker guarantees that the freshly woken up task is going 1551 * the waker guarantees that the freshly woken up task is going
1552 * to be considered on this CPU.) 1552 * to be considered on this CPU.)
1553 */ 1553 */
1554 if (!sync || cpu != this_cpu) { 1554 if (!sync || cpu != this_cpu) {
1555 if (TASK_PREEMPTS_CURR(p, rq)) 1555 if (TASK_PREEMPTS_CURR(p, rq))
1556 resched_task(rq->curr); 1556 resched_task(rq->curr);
1557 } 1557 }
1558 success = 1; 1558 success = 1;
1559 1559
1560 out_running: 1560 out_running:
1561 p->state = TASK_RUNNING; 1561 p->state = TASK_RUNNING;
1562 out: 1562 out:
1563 task_rq_unlock(rq, &flags); 1563 task_rq_unlock(rq, &flags);
1564 1564
1565 return success; 1565 return success;
1566 } 1566 }
1567 1567
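/*
 * Worked example of the wakeup heuristics in try_to_wake_up() (all numbers
 * hypothetical): with imbalance_pct = 125, imbalance = 112.  Say
 * source_load(cpu, idx) = 300, this_load = tl = 100 (no sync adjustment)
 * and p->load_weight = 100.  The affine wakeup fires when either
 * tl <= load && tl + target_load(cpu, idx) <= tl_per_task, or
 * 100*(tl + p->load_weight) <= imbalance*load; here the second condition
 * reads 100*200 = 20000 <= 112*300 = 33600, so the task is woken on the
 * waking CPU (subject to wake_idle()).  Passive balancing via
 * SD_WAKE_BALANCE instead fires when imbalance*this_load <= 100*load, i.e.
 * when the task's old CPU is roughly 12% busier than this one.
 */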
1568 int fastcall wake_up_process(struct task_struct *p) 1568 int fastcall wake_up_process(struct task_struct *p)
1569 { 1569 {
1570 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | 1570 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1571 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); 1571 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1572 } 1572 }
1573 EXPORT_SYMBOL(wake_up_process); 1573 EXPORT_SYMBOL(wake_up_process);
1574 1574
1575 int fastcall wake_up_state(struct task_struct *p, unsigned int state) 1575 int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1576 { 1576 {
1577 return try_to_wake_up(p, state, 0); 1577 return try_to_wake_up(p, state, 0);
1578 } 1578 }
1579 1579
1580 static void task_running_tick(struct rq *rq, struct task_struct *p); 1580 static void task_running_tick(struct rq *rq, struct task_struct *p);
1581 /* 1581 /*
1582 * Perform scheduler related setup for a newly forked process p. 1582 * Perform scheduler related setup for a newly forked process p.
1583 * p is forked by current. 1583 * p is forked by current.
1584 */ 1584 */
1585 void fastcall sched_fork(struct task_struct *p, int clone_flags) 1585 void fastcall sched_fork(struct task_struct *p, int clone_flags)
1586 { 1586 {
1587 int cpu = get_cpu(); 1587 int cpu = get_cpu();
1588 1588
1589 #ifdef CONFIG_SMP 1589 #ifdef CONFIG_SMP
1590 cpu = sched_balance_self(cpu, SD_BALANCE_FORK); 1590 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1591 #endif 1591 #endif
1592 set_task_cpu(p, cpu); 1592 set_task_cpu(p, cpu);
1593 1593
1594 /* 1594 /*
1595 * We mark the process as running here, but have not actually 1595 * We mark the process as running here, but have not actually
1596 * inserted it onto the runqueue yet. This guarantees that 1596 * inserted it onto the runqueue yet. This guarantees that
1597 * nobody will actually run it, and a signal or other external 1597 * nobody will actually run it, and a signal or other external
1598 * event cannot wake it up and insert it on the runqueue either. 1598 * event cannot wake it up and insert it on the runqueue either.
1599 */ 1599 */
1600 p->state = TASK_RUNNING; 1600 p->state = TASK_RUNNING;
1601 1601
1602 /* 1602 /*
1603 * Make sure we do not leak PI boosting priority to the child: 1603 * Make sure we do not leak PI boosting priority to the child:
1604 */ 1604 */
1605 p->prio = current->normal_prio; 1605 p->prio = current->normal_prio;
1606 1606
1607 INIT_LIST_HEAD(&p->run_list); 1607 INIT_LIST_HEAD(&p->run_list);
1608 p->array = NULL; 1608 p->array = NULL;
1609 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1609 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1610 if (unlikely(sched_info_on())) 1610 if (unlikely(sched_info_on()))
1611 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1611 memset(&p->sched_info, 0, sizeof(p->sched_info));
1612 #endif 1612 #endif
1613 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 1613 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1614 p->oncpu = 0; 1614 p->oncpu = 0;
1615 #endif 1615 #endif
1616 #ifdef CONFIG_PREEMPT 1616 #ifdef CONFIG_PREEMPT
1617 /* Want to start with kernel preemption disabled. */ 1617 /* Want to start with kernel preemption disabled. */
1618 task_thread_info(p)->preempt_count = 1; 1618 task_thread_info(p)->preempt_count = 1;
1619 #endif 1619 #endif
1620 /* 1620 /*
1621 * Share the timeslice between parent and child, thus the 1621 * Share the timeslice between parent and child, thus the
1622 * total amount of pending timeslices in the system doesn't change, 1622 * total amount of pending timeslices in the system doesn't change,
1623 * resulting in more scheduling fairness. 1623 * resulting in more scheduling fairness.
1624 */ 1624 */
1625 local_irq_disable(); 1625 local_irq_disable();
1626 p->time_slice = (current->time_slice + 1) >> 1; 1626 p->time_slice = (current->time_slice + 1) >> 1;
1627 /* 1627 /*
1628 * The remainder of the first timeslice might be recovered by 1628 * The remainder of the first timeslice might be recovered by
1629 * the parent if the child exits early enough. 1629 * the parent if the child exits early enough.
1630 */ 1630 */
1631 p->first_time_slice = 1; 1631 p->first_time_slice = 1;
1632 current->time_slice >>= 1; 1632 current->time_slice >>= 1;
1633 p->timestamp = sched_clock(); 1633 p->timestamp = sched_clock();
1634 if (unlikely(!current->time_slice)) { 1634 if (unlikely(!current->time_slice)) {
1635 /* 1635 /*
1636 * This case is rare, it happens when the parent has only 1636 * This case is rare, it happens when the parent has only
1637 * a single jiffy left from its timeslice. Taking the 1637 * a single jiffy left from its timeslice. Taking the
1638 * runqueue lock is not a problem. 1638 * runqueue lock is not a problem.
1639 */ 1639 */
1640 current->time_slice = 1; 1640 current->time_slice = 1;
1641 task_running_tick(cpu_rq(cpu), current); 1641 task_running_tick(cpu_rq(cpu), current);
1642 } 1642 }
1643 local_irq_enable(); 1643 local_irq_enable();
1644 put_cpu(); 1644 put_cpu();
1645 } 1645 }
1646 1646
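/*
 * Worked example of the timeslice split in sched_fork() (values purely
 * illustrative): if the parent has 7 ticks left, the child gets
 * (7 + 1) >> 1 = 4 and the parent keeps 7 >> 1 = 3, so the total of 7 is
 * conserved.  With only a single tick left the parent drops to 0, which is
 * the rare case handled above by handing it one tick back and immediately
 * running task_running_tick() on it.
 */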
1647 /* 1647 /*
1648 * wake_up_new_task - wake up a newly created task for the first time. 1648 * wake_up_new_task - wake up a newly created task for the first time.
1649 * 1649 *
1650 * This function will do some initial scheduler statistics housekeeping 1650 * This function will do some initial scheduler statistics housekeeping
1651 * that must be done for every newly created context, then puts the task 1651 * that must be done for every newly created context, then puts the task
1652 * on the runqueue and wakes it. 1652 * on the runqueue and wakes it.
1653 */ 1653 */
1654 void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 1654 void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1655 { 1655 {
1656 struct rq *rq, *this_rq; 1656 struct rq *rq, *this_rq;
1657 unsigned long flags; 1657 unsigned long flags;
1658 int this_cpu, cpu; 1658 int this_cpu, cpu;
1659 1659
1660 rq = task_rq_lock(p, &flags); 1660 rq = task_rq_lock(p, &flags);
1661 BUG_ON(p->state != TASK_RUNNING); 1661 BUG_ON(p->state != TASK_RUNNING);
1662 this_cpu = smp_processor_id(); 1662 this_cpu = smp_processor_id();
1663 cpu = task_cpu(p); 1663 cpu = task_cpu(p);
1664 1664
1665 /* 1665 /*
1666 * We decrease the sleep average of forking parents 1666 * We decrease the sleep average of forking parents
1667 * and children as well, to keep max-interactive tasks 1667 * and children as well, to keep max-interactive tasks
1668 * from forking tasks that are max-interactive. The parent 1668 * from forking tasks that are max-interactive. The parent
1669 * (current) is done further down, under its lock. 1669 * (current) is done further down, under its lock.
1670 */ 1670 */
1671 p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * 1671 p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
1672 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); 1672 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1673 1673
1674 p->prio = effective_prio(p); 1674 p->prio = effective_prio(p);
1675 1675
1676 if (likely(cpu == this_cpu)) { 1676 if (likely(cpu == this_cpu)) {
1677 if (!(clone_flags & CLONE_VM)) { 1677 if (!(clone_flags & CLONE_VM)) {
1678 /* 1678 /*
1679 * The VM isn't cloned, so we're in a good position to 1679 * The VM isn't cloned, so we're in a good position to
1680 * do child-runs-first in anticipation of an exec. This 1680 * do child-runs-first in anticipation of an exec. This
1681 * usually avoids a lot of COW overhead. 1681 * usually avoids a lot of COW overhead.
1682 */ 1682 */
1683 if (unlikely(!current->array)) 1683 if (unlikely(!current->array))
1684 __activate_task(p, rq); 1684 __activate_task(p, rq);
1685 else { 1685 else {
1686 p->prio = current->prio; 1686 p->prio = current->prio;
1687 p->normal_prio = current->normal_prio; 1687 p->normal_prio = current->normal_prio;
1688 list_add_tail(&p->run_list, &current->run_list); 1688 list_add_tail(&p->run_list, &current->run_list);
1689 p->array = current->array; 1689 p->array = current->array;
1690 p->array->nr_active++; 1690 p->array->nr_active++;
1691 inc_nr_running(p, rq); 1691 inc_nr_running(p, rq);
1692 } 1692 }
1693 set_need_resched(); 1693 set_need_resched();
1694 } else 1694 } else
1695 /* Run child last */ 1695 /* Run child last */
1696 __activate_task(p, rq); 1696 __activate_task(p, rq);
1697 /* 1697 /*
1698 * We skip the following code due to cpu == this_cpu 1698 * We skip the following code due to cpu == this_cpu
1699 * 1699 *
1700 * task_rq_unlock(rq, &flags); 1700 * task_rq_unlock(rq, &flags);
1701 * this_rq = task_rq_lock(current, &flags); 1701 * this_rq = task_rq_lock(current, &flags);
1702 */ 1702 */
1703 this_rq = rq; 1703 this_rq = rq;
1704 } else { 1704 } else {
1705 this_rq = cpu_rq(this_cpu); 1705 this_rq = cpu_rq(this_cpu);
1706 1706
1707 /* 1707 /*
1708 * Not the local CPU - must adjust timestamp. This should 1708 * Not the local CPU - must adjust timestamp. This should
1709 * get optimised away in the !CONFIG_SMP case. 1709 * get optimised away in the !CONFIG_SMP case.
1710 */ 1710 */
1711 p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) 1711 p->timestamp = (p->timestamp - this_rq->most_recent_timestamp)
1712 + rq->most_recent_timestamp; 1712 + rq->most_recent_timestamp;
1713 __activate_task(p, rq); 1713 __activate_task(p, rq);
1714 if (TASK_PREEMPTS_CURR(p, rq)) 1714 if (TASK_PREEMPTS_CURR(p, rq))
1715 resched_task(rq->curr); 1715 resched_task(rq->curr);
1716 1716
1717 /* 1717 /*
1718 * Parent and child are on different CPUs, now get the 1718 * Parent and child are on different CPUs, now get the
1719 * parent runqueue to update the parent's ->sleep_avg: 1719 * parent runqueue to update the parent's ->sleep_avg:
1720 */ 1720 */
1721 task_rq_unlock(rq, &flags); 1721 task_rq_unlock(rq, &flags);
1722 this_rq = task_rq_lock(current, &flags); 1722 this_rq = task_rq_lock(current, &flags);
1723 } 1723 }
1724 current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * 1724 current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
1725 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); 1725 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1726 task_rq_unlock(this_rq, &flags); 1726 task_rq_unlock(this_rq, &flags);
1727 } 1727 }
1728 1728
1729 /* 1729 /*
1730 * Potentially available exiting-child timeslices are 1730 * Potentially available exiting-child timeslices are
1731 * retrieved here - this way the parent does not get 1731 * retrieved here - this way the parent does not get
1732 * penalized for creating too many threads. 1732 * penalized for creating too many threads.
1733 * 1733 *
1734 * (this cannot be used to 'generate' timeslices 1734 * (this cannot be used to 'generate' timeslices
1735 * artificially, because any timeslice recovered here 1735 * artificially, because any timeslice recovered here
1736 * was given away by the parent in the first place.) 1736 * was given away by the parent in the first place.)
1737 */ 1737 */
1738 void fastcall sched_exit(struct task_struct *p) 1738 void fastcall sched_exit(struct task_struct *p)
1739 { 1739 {
1740 unsigned long flags; 1740 unsigned long flags;
1741 struct rq *rq; 1741 struct rq *rq;
1742 1742
1743 /* 1743 /*
1744 * If the child was a (relative-) CPU hog then decrease 1744 * If the child was a (relative-) CPU hog then decrease
1745 * the sleep_avg of the parent as well. 1745 * the sleep_avg of the parent as well.
1746 */ 1746 */
1747 rq = task_rq_lock(p->parent, &flags); 1747 rq = task_rq_lock(p->parent, &flags);
1748 if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { 1748 if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
1749 p->parent->time_slice += p->time_slice; 1749 p->parent->time_slice += p->time_slice;
1750 if (unlikely(p->parent->time_slice > task_timeslice(p))) 1750 if (unlikely(p->parent->time_slice > task_timeslice(p)))
1751 p->parent->time_slice = task_timeslice(p); 1751 p->parent->time_slice = task_timeslice(p);
1752 } 1752 }
1753 if (p->sleep_avg < p->parent->sleep_avg) 1753 if (p->sleep_avg < p->parent->sleep_avg)
1754 p->parent->sleep_avg = p->parent->sleep_avg / 1754 p->parent->sleep_avg = p->parent->sleep_avg /
1755 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / 1755 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
1756 (EXIT_WEIGHT + 1); 1756 (EXIT_WEIGHT + 1);
1757 task_rq_unlock(rq, &flags); 1757 task_rq_unlock(rq, &flags);
1758 } 1758 }
1759 1759
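/*
 * Worked example of the sleep_avg adjustment in sched_exit() (EXIT_WEIGHT
 * is defined earlier in this file; 3 is used below purely for
 * illustration): with a parent sleep_avg of 800 and a CPU-hog child at 400,
 * the parent ends up with 800/4*3 + 400/4 = 600 + 100 = 700, an
 * (EXIT_WEIGHT : 1) weighted average that is only applied when the child's
 * average is lower than the parent's.
 */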
1760 /** 1760 /**
1761 * prepare_task_switch - prepare to switch tasks 1761 * prepare_task_switch - prepare to switch tasks
1762 * @rq: the runqueue preparing to switch 1762 * @rq: the runqueue preparing to switch
1763 * @next: the task we are going to switch to. 1763 * @next: the task we are going to switch to.
1764 * 1764 *
1765 * This is called with the rq lock held and interrupts off. It must 1765 * This is called with the rq lock held and interrupts off. It must
1766 * be paired with a subsequent finish_task_switch after the context 1766 * be paired with a subsequent finish_task_switch after the context
1767 * switch. 1767 * switch.
1768 * 1768 *
1769 * prepare_task_switch sets up locking and calls architecture specific 1769 * prepare_task_switch sets up locking and calls architecture specific
1770 * hooks. 1770 * hooks.
1771 */ 1771 */
1772 static inline void prepare_task_switch(struct rq *rq, struct task_struct *next) 1772 static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
1773 { 1773 {
1774 prepare_lock_switch(rq, next); 1774 prepare_lock_switch(rq, next);
1775 prepare_arch_switch(next); 1775 prepare_arch_switch(next);
1776 } 1776 }
1777 1777
1778 /** 1778 /**
1779 * finish_task_switch - clean up after a task-switch 1779 * finish_task_switch - clean up after a task-switch
1780 * @rq: runqueue associated with task-switch 1780 * @rq: runqueue associated with task-switch
1781 * @prev: the thread we just switched away from. 1781 * @prev: the thread we just switched away from.
1782 * 1782 *
1783 * finish_task_switch must be called after the context switch, paired 1783 * finish_task_switch must be called after the context switch, paired
1784 * with a prepare_task_switch call before the context switch. 1784 * with a prepare_task_switch call before the context switch.
1785 * finish_task_switch will reconcile locking set up by prepare_task_switch, 1785 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1786 * and do any other architecture-specific cleanup actions. 1786 * and do any other architecture-specific cleanup actions.
1787 * 1787 *
1788 * Note that we may have delayed dropping an mm in context_switch(). If 1788 * Note that we may have delayed dropping an mm in context_switch(). If
1789 * so, we finish that here outside of the runqueue lock. (Doing it 1789 * so, we finish that here outside of the runqueue lock. (Doing it
1790 * with the lock held can cause deadlocks; see schedule() for 1790 * with the lock held can cause deadlocks; see schedule() for
1791 * details.) 1791 * details.)
1792 */ 1792 */
1793 static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) 1793 static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1794 __releases(rq->lock) 1794 __releases(rq->lock)
1795 { 1795 {
1796 struct mm_struct *mm = rq->prev_mm; 1796 struct mm_struct *mm = rq->prev_mm;
1797 long prev_state; 1797 long prev_state;
1798 1798
1799 rq->prev_mm = NULL; 1799 rq->prev_mm = NULL;
1800 1800
1801 /* 1801 /*
1802 * A task struct has one reference for the use as "current". 1802 * A task struct has one reference for the use as "current".
1803 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 1803 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
1804 * schedule one last time. The schedule call will never return, and 1804 * schedule one last time. The schedule call will never return, and
1805 * the scheduled task must drop that reference. 1805 * the scheduled task must drop that reference.
1806 * The test for TASK_DEAD must occur while the runqueue locks are 1806 * The test for TASK_DEAD must occur while the runqueue locks are
1807 * still held, otherwise prev could be scheduled on another cpu, die 1807 * still held, otherwise prev could be scheduled on another cpu, die
1808 * there before we look at prev->state, and then the reference would 1808 * there before we look at prev->state, and then the reference would
1809 * be dropped twice. 1809 * be dropped twice.
1810 * Manfred Spraul <manfred@colorfullife.com> 1810 * Manfred Spraul <manfred@colorfullife.com>
1811 */ 1811 */
1812 prev_state = prev->state; 1812 prev_state = prev->state;
1813 finish_arch_switch(prev); 1813 finish_arch_switch(prev);
1814 finish_lock_switch(rq, prev); 1814 finish_lock_switch(rq, prev);
1815 if (mm) 1815 if (mm)
1816 mmdrop(mm); 1816 mmdrop(mm);
1817 if (unlikely(prev_state == TASK_DEAD)) { 1817 if (unlikely(prev_state == TASK_DEAD)) {
1818 /* 1818 /*
1819 * Remove function-return probe instances associated with this 1819 * Remove function-return probe instances associated with this
1820 * task and put them back on the free list. 1820 * task and put them back on the free list.
1821 */ 1821 */
1822 kprobe_flush_task(prev); 1822 kprobe_flush_task(prev);
1823 put_task_struct(prev); 1823 put_task_struct(prev);
1824 } 1824 }
1825 } 1825 }
1826 1826
1827 /** 1827 /**
1828 * schedule_tail - first thing a freshly forked thread must call. 1828 * schedule_tail - first thing a freshly forked thread must call.
1829 * @prev: the thread we just switched away from. 1829 * @prev: the thread we just switched away from.
1830 */ 1830 */
1831 asmlinkage void schedule_tail(struct task_struct *prev) 1831 asmlinkage void schedule_tail(struct task_struct *prev)
1832 __releases(rq->lock) 1832 __releases(rq->lock)
1833 { 1833 {
1834 struct rq *rq = this_rq(); 1834 struct rq *rq = this_rq();
1835 1835
1836 finish_task_switch(rq, prev); 1836 finish_task_switch(rq, prev);
1837 #ifdef __ARCH_WANT_UNLOCKED_CTXSW 1837 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
1838 /* In this case, finish_task_switch does not reenable preemption */ 1838 /* In this case, finish_task_switch does not reenable preemption */
1839 preempt_enable(); 1839 preempt_enable();
1840 #endif 1840 #endif
1841 if (current->set_child_tid) 1841 if (current->set_child_tid)
1842 put_user(current->pid, current->set_child_tid); 1842 put_user(current->pid, current->set_child_tid);
1843 } 1843 }
1844 1844
1845 /* 1845 /*
1846 * context_switch - switch to the new MM and the new 1846 * context_switch - switch to the new MM and the new
1847 * thread's register state. 1847 * thread's register state.
1848 */ 1848 */
1849 static inline struct task_struct * 1849 static inline struct task_struct *
1850 context_switch(struct rq *rq, struct task_struct *prev, 1850 context_switch(struct rq *rq, struct task_struct *prev,
1851 struct task_struct *next) 1851 struct task_struct *next)
1852 { 1852 {
1853 struct mm_struct *mm = next->mm; 1853 struct mm_struct *mm = next->mm;
1854 struct mm_struct *oldmm = prev->active_mm; 1854 struct mm_struct *oldmm = prev->active_mm;
1855 1855
1856 /* 1856 /*
1857 * For paravirt, this is coupled with an exit in switch_to to 1857 * For paravirt, this is coupled with an exit in switch_to to
1858 * combine the page table reload and the switch backend into 1858 * combine the page table reload and the switch backend into
1859 * one hypercall. 1859 * one hypercall.
1860 */ 1860 */
1861 arch_enter_lazy_cpu_mode(); 1861 arch_enter_lazy_cpu_mode();
1862 1862
1863 if (!mm) { 1863 if (!mm) {
1864 next->active_mm = oldmm; 1864 next->active_mm = oldmm;
1865 atomic_inc(&oldmm->mm_count); 1865 atomic_inc(&oldmm->mm_count);
1866 enter_lazy_tlb(oldmm, next); 1866 enter_lazy_tlb(oldmm, next);
1867 } else 1867 } else
1868 switch_mm(oldmm, mm, next); 1868 switch_mm(oldmm, mm, next);
1869 1869
1870 if (!prev->mm) { 1870 if (!prev->mm) {
1871 prev->active_mm = NULL; 1871 prev->active_mm = NULL;
1872 WARN_ON(rq->prev_mm); 1872 WARN_ON(rq->prev_mm);
1873 rq->prev_mm = oldmm; 1873 rq->prev_mm = oldmm;
1874 } 1874 }
1875 /* 1875 /*
1876 * Since the runqueue lock will be released by the next 1876 * Since the runqueue lock will be released by the next
1877 * task (which is an invalid locking op but in the case 1877 * task (which is an invalid locking op but in the case
1878 * of the scheduler it's an obvious special-case), we 1878 * of the scheduler it's an obvious special-case), we
1879 * do an early lockdep release here: 1879 * do an early lockdep release here:
1880 */ 1880 */
1881 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 1881 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
1882 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 1882 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1883 #endif 1883 #endif
1884 1884
1885 /* Here we just switch the register state and the stack. */ 1885 /* Here we just switch the register state and the stack. */
1886 switch_to(prev, next, prev); 1886 switch_to(prev, next, prev);
1887 1887
1888 return prev; 1888 return prev;
1889 } 1889 }
1890 1890
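/*
 * mm handling in context_switch(), traced: if the incoming task is a kernel
 * thread (next->mm == NULL) it borrows the previous task's active_mm and
 * takes an extra mm_count reference; if the outgoing task was a kernel
 * thread (prev->mm == NULL) its borrowed active_mm is parked in rq->prev_mm,
 * and that reference is dropped later via mmdrop() in finish_task_switch(),
 * outside the runqueue lock.
 */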
1891 /* 1891 /*
1892 * nr_running, nr_uninterruptible and nr_context_switches: 1892 * nr_running, nr_uninterruptible and nr_context_switches:
1893 * 1893 *
1894 * externally visible scheduler statistics: current number of runnable 1894 * externally visible scheduler statistics: current number of runnable
1895 * threads, current number of uninterruptible-sleeping threads, total 1895 * threads, current number of uninterruptible-sleeping threads, total
1896 * number of context switches performed since bootup. 1896 * number of context switches performed since bootup.
1897 */ 1897 */
1898 unsigned long nr_running(void) 1898 unsigned long nr_running(void)
1899 { 1899 {
1900 unsigned long i, sum = 0; 1900 unsigned long i, sum = 0;
1901 1901
1902 for_each_online_cpu(i) 1902 for_each_online_cpu(i)
1903 sum += cpu_rq(i)->nr_running; 1903 sum += cpu_rq(i)->nr_running;
1904 1904
1905 return sum; 1905 return sum;
1906 } 1906 }
1907 1907
1908 unsigned long nr_uninterruptible(void) 1908 unsigned long nr_uninterruptible(void)
1909 { 1909 {
1910 unsigned long i, sum = 0; 1910 unsigned long i, sum = 0;
1911 1911
1912 for_each_possible_cpu(i) 1912 for_each_possible_cpu(i)
1913 sum += cpu_rq(i)->nr_uninterruptible; 1913 sum += cpu_rq(i)->nr_uninterruptible;
1914 1914
1915 /* 1915 /*
1916 * Since we read the counters lockless, it might be slightly 1916 * Since we read the counters lockless, it might be slightly
1917 * inaccurate. Do not allow it to go below zero though: 1917 * inaccurate. Do not allow it to go below zero though:
1918 */ 1918 */
1919 if (unlikely((long)sum < 0)) 1919 if (unlikely((long)sum < 0))
1920 sum = 0; 1920 sum = 0;
1921 1921
1922 return sum; 1922 return sum;
1923 } 1923 }
1924 1924
1925 unsigned long long nr_context_switches(void) 1925 unsigned long long nr_context_switches(void)
1926 { 1926 {
1927 int i; 1927 int i;
1928 unsigned long long sum = 0; 1928 unsigned long long sum = 0;
1929 1929
1930 for_each_possible_cpu(i) 1930 for_each_possible_cpu(i)
1931 sum += cpu_rq(i)->nr_switches; 1931 sum += cpu_rq(i)->nr_switches;
1932 1932
1933 return sum; 1933 return sum;
1934 } 1934 }
1935 1935
1936 unsigned long nr_iowait(void) 1936 unsigned long nr_iowait(void)
1937 { 1937 {
1938 unsigned long i, sum = 0; 1938 unsigned long i, sum = 0;
1939 1939
1940 for_each_possible_cpu(i) 1940 for_each_possible_cpu(i)
1941 sum += atomic_read(&cpu_rq(i)->nr_iowait); 1941 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1942 1942
1943 return sum; 1943 return sum;
1944 } 1944 }
1945 1945
1946 unsigned long nr_active(void) 1946 unsigned long nr_active(void)
1947 { 1947 {
1948 unsigned long i, running = 0, uninterruptible = 0; 1948 unsigned long i, running = 0, uninterruptible = 0;
1949 1949
1950 for_each_online_cpu(i) { 1950 for_each_online_cpu(i) {
1951 running += cpu_rq(i)->nr_running; 1951 running += cpu_rq(i)->nr_running;
1952 uninterruptible += cpu_rq(i)->nr_uninterruptible; 1952 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1953 } 1953 }
1954 1954
1955 if (unlikely((long)uninterruptible < 0)) 1955 if (unlikely((long)uninterruptible < 0))
1956 uninterruptible = 0; 1956 uninterruptible = 0;
1957 1957
1958 return running + uninterruptible; 1958 return running + uninterruptible;
1959 } 1959 }
1960 1960
1961 #ifdef CONFIG_SMP 1961 #ifdef CONFIG_SMP
1962 1962
1963 /* 1963 /*
1964 * Is this task likely cache-hot: 1964 * Is this task likely cache-hot:
1965 */ 1965 */
1966 static inline int 1966 static inline int
1967 task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd) 1967 task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd)
1968 { 1968 {
1969 return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time; 1969 return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time;
1970 } 1970 }
1971 1971
1972 /* 1972 /*
1973 * double_rq_lock - safely lock two runqueues 1973 * double_rq_lock - safely lock two runqueues
1974 * 1974 *
1975 * Note this does not disable interrupts like task_rq_lock, 1975 * Note this does not disable interrupts like task_rq_lock,
1976 * you need to do so manually before calling. 1976 * you need to do so manually before calling.
1977 */ 1977 */
1978 static void double_rq_lock(struct rq *rq1, struct rq *rq2) 1978 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1979 __acquires(rq1->lock) 1979 __acquires(rq1->lock)
1980 __acquires(rq2->lock) 1980 __acquires(rq2->lock)
1981 { 1981 {
1982 BUG_ON(!irqs_disabled()); 1982 BUG_ON(!irqs_disabled());
1983 if (rq1 == rq2) { 1983 if (rq1 == rq2) {
1984 spin_lock(&rq1->lock); 1984 spin_lock(&rq1->lock);
1985 __acquire(rq2->lock); /* Fake it out ;) */ 1985 __acquire(rq2->lock); /* Fake it out ;) */
1986 } else { 1986 } else {
1987 if (rq1 < rq2) { 1987 if (rq1 < rq2) {
1988 spin_lock(&rq1->lock); 1988 spin_lock(&rq1->lock);
1989 spin_lock(&rq2->lock); 1989 spin_lock(&rq2->lock);
1990 } else { 1990 } else {
1991 spin_lock(&rq2->lock); 1991 spin_lock(&rq2->lock);
1992 spin_lock(&rq1->lock); 1992 spin_lock(&rq1->lock);
1993 } 1993 }
1994 } 1994 }
1995 } 1995 }
1996 1996
1997 /* 1997 /*
1998 * double_rq_unlock - safely unlock two runqueues 1998 * double_rq_unlock - safely unlock two runqueues
1999 * 1999 *
2000 * Note this does not restore interrupts like task_rq_unlock, 2000 * Note this does not restore interrupts like task_rq_unlock,
2001 * you need to do so manually after calling. 2001 * you need to do so manually after calling.
2002 */ 2002 */
2003 static void double_rq_unlock(struct rq *rq1, struct rq *rq2) 2003 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2004 __releases(rq1->lock) 2004 __releases(rq1->lock)
2005 __releases(rq2->lock) 2005 __releases(rq2->lock)
2006 { 2006 {
2007 spin_unlock(&rq1->lock); 2007 spin_unlock(&rq1->lock);
2008 if (rq1 != rq2) 2008 if (rq1 != rq2)
2009 spin_unlock(&rq2->lock); 2009 spin_unlock(&rq2->lock);
2010 else 2010 else
2011 __release(rq2->lock); 2011 __release(rq2->lock);
2012 } 2012 }
2013 2013
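/*
 * Illustrative caller pattern for double_rq_lock()/double_rq_unlock()
 * (a sketch, not taken from this file): interrupt disabling is the
 * caller's job, and the rq1 < rq2 address comparison gives every path the
 * same lock order, which is what prevents an ABBA deadlock between two
 * CPUs balancing against each other.
 *
 *	local_irq_save(flags);
 *	double_rq_lock(this_rq, busiest);
 *	... move tasks between the two runqueues ...
 *	double_rq_unlock(this_rq, busiest);
 *	local_irq_restore(flags);
 */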
2014 /* 2014 /*
2015 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 2015 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
2016 */ 2016 */
2017 static void double_lock_balance(struct rq *this_rq, struct rq *busiest) 2017 static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
2018 __releases(this_rq->lock) 2018 __releases(this_rq->lock)
2019 __acquires(busiest->lock) 2019 __acquires(busiest->lock)
2020 __acquires(this_rq->lock) 2020 __acquires(this_rq->lock)
2021 { 2021 {
2022 if (unlikely(!irqs_disabled())) { 2022 if (unlikely(!irqs_disabled())) {
2023 /* printk() doesn't work well under rq->lock */ 2023 /* printk() doesn't work well under rq->lock */
2024 spin_unlock(&this_rq->lock); 2024 spin_unlock(&this_rq->lock);
2025 BUG_ON(1); 2025 BUG_ON(1);
2026 } 2026 }
2027 if (unlikely(!spin_trylock(&busiest->lock))) { 2027 if (unlikely(!spin_trylock(&busiest->lock))) {
2028 if (busiest < this_rq) { 2028 if (busiest < this_rq) {
2029 spin_unlock(&this_rq->lock); 2029 spin_unlock(&this_rq->lock);
2030 spin_lock(&busiest->lock); 2030 spin_lock(&busiest->lock);
2031 spin_lock(&this_rq->lock); 2031 spin_lock(&this_rq->lock);
2032 } else 2032 } else
2033 spin_lock(&busiest->lock); 2033 spin_lock(&busiest->lock);
2034 } 2034 }
2035 } 2035 }
2036 2036
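/*
 * Trace of the trylock-failure path in double_lock_balance(): if busiest
 * has the lower address, this_rq->lock is dropped and both locks are
 * re-taken in address order, matching the ordering used by
 * double_rq_lock().  Because this_rq->lock can be released and re-acquired
 * here, anything the caller read from this_rq before the call may be stale
 * afterwards.
 */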
2037 /* 2037 /*
2038 * If dest_cpu is allowed for this process, migrate the task to it. 2038 * If dest_cpu is allowed for this process, migrate the task to it.
2039 * This is accomplished by forcing the cpu_allowed mask to only 2039 * This is accomplished by forcing the cpu_allowed mask to only
2040 * allow dest_cpu, which will force the cpu onto dest_cpu. Then 2040 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
2041 * the cpu_allowed mask is restored. 2041 * the cpu_allowed mask is restored.
2042 */ 2042 */
2043 static void sched_migrate_task(struct task_struct *p, int dest_cpu) 2043 static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2044 { 2044 {
2045 struct migration_req req; 2045 struct migration_req req;
2046 unsigned long flags; 2046 unsigned long flags;
2047 struct rq *rq; 2047 struct rq *rq;
2048 2048
2049 rq = task_rq_lock(p, &flags); 2049 rq = task_rq_lock(p, &flags);
2050 if (!cpu_isset(dest_cpu, p->cpus_allowed) 2050 if (!cpu_isset(dest_cpu, p->cpus_allowed)
2051 || unlikely(cpu_is_offline(dest_cpu))) 2051 || unlikely(cpu_is_offline(dest_cpu)))
2052 goto out; 2052 goto out;
2053 2053
2054 /* force the process onto the specified CPU */ 2054 /* force the process onto the specified CPU */
2055 if (migrate_task(p, dest_cpu, &req)) { 2055 if (migrate_task(p, dest_cpu, &req)) {
2056 /* Need to wait for migration thread (might exit: take ref). */ 2056 /* Need to wait for migration thread (might exit: take ref). */
2057 struct task_struct *mt = rq->migration_thread; 2057 struct task_struct *mt = rq->migration_thread;
2058 2058
2059 get_task_struct(mt); 2059 get_task_struct(mt);
2060 task_rq_unlock(rq, &flags); 2060 task_rq_unlock(rq, &flags);
2061 wake_up_process(mt); 2061 wake_up_process(mt);
2062 put_task_struct(mt); 2062 put_task_struct(mt);
2063 wait_for_completion(&req.done); 2063 wait_for_completion(&req.done);
2064 2064
2065 return; 2065 return;
2066 } 2066 }
2067 out: 2067 out:
2068 task_rq_unlock(rq, &flags); 2068 task_rq_unlock(rq, &flags);
2069 } 2069 }
2070 2070
2071 /* 2071 /*
2072 * sched_exec - execve() is a valuable balancing opportunity, because at 2072 * sched_exec - execve() is a valuable balancing opportunity, because at
2073 * this point the task has the smallest effective memory and cache footprint. 2073 * this point the task has the smallest effective memory and cache footprint.
2074 */ 2074 */
2075 void sched_exec(void) 2075 void sched_exec(void)
2076 { 2076 {
2077 int new_cpu, this_cpu = get_cpu(); 2077 int new_cpu, this_cpu = get_cpu();
2078 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); 2078 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
2079 put_cpu(); 2079 put_cpu();
2080 if (new_cpu != this_cpu) 2080 if (new_cpu != this_cpu)
2081 sched_migrate_task(current, new_cpu); 2081 sched_migrate_task(current, new_cpu);
2082 } 2082 }
2083 2083
2084 /* 2084 /*
2085 * pull_task - move a task from a remote runqueue to the local runqueue. 2085 * pull_task - move a task from a remote runqueue to the local runqueue.
2086 * Both runqueues must be locked. 2086 * Both runqueues must be locked.
2087 */ 2087 */
2088 static void pull_task(struct rq *src_rq, struct prio_array *src_array, 2088 static void pull_task(struct rq *src_rq, struct prio_array *src_array,
2089 struct task_struct *p, struct rq *this_rq, 2089 struct task_struct *p, struct rq *this_rq,
2090 struct prio_array *this_array, int this_cpu) 2090 struct prio_array *this_array, int this_cpu)
2091 { 2091 {
2092 dequeue_task(p, src_array); 2092 dequeue_task(p, src_array);
2093 dec_nr_running(p, src_rq); 2093 dec_nr_running(p, src_rq);
2094 set_task_cpu(p, this_cpu); 2094 set_task_cpu(p, this_cpu);
2095 inc_nr_running(p, this_rq); 2095 inc_nr_running(p, this_rq);
2096 enqueue_task(p, this_array); 2096 enqueue_task(p, this_array);
2097 p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) 2097 p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
2098 + this_rq->most_recent_timestamp; 2098 + this_rq->most_recent_timestamp;
2099 /* 2099 /*
2100 * Note that idle threads have a prio of MAX_PRIO, for this test 2100 * Note that idle threads have a prio of MAX_PRIO, for this test
2101 * to be always true for them. 2101 * to be always true for them.
2102 */ 2102 */
2103 if (TASK_PREEMPTS_CURR(p, this_rq)) 2103 if (TASK_PREEMPTS_CURR(p, this_rq))
2104 resched_task(this_rq->curr); 2104 resched_task(this_rq->curr);
2105 } 2105 }
2106 2106
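/*
 * Note on the timestamp adjustment in pull_task(): p->timestamp is rebased
 * from the source runqueue's clock onto this runqueue's clock (subtract
 * src_rq->most_recent_timestamp, add this_rq->most_recent_timestamp), the
 * same rebasing wake_up_new_task() does when parent and child sit on
 * different CPUs, so later sleep/run-time arithmetic compares values taken
 * from the same per-cpu clock.
 */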
2107 /* 2107 /*
2108 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 2108 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2109 */ 2109 */
2110 static 2110 static
2111 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, 2111 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2112 struct sched_domain *sd, enum idle_type idle, 2112 struct sched_domain *sd, enum idle_type idle,
2113 int *all_pinned) 2113 int *all_pinned)
2114 { 2114 {
2115 /* 2115 /*
2116 * We do not migrate tasks that are: 2116 * We do not migrate tasks that are:
2117 * 1) running (obviously), or 2117 * 1) running (obviously), or
2118 * 2) not allowed onto this CPU due to cpus_allowed, or 2118 * 2) not allowed onto this CPU due to cpus_allowed, or
2119 * 3) cache-hot on their current CPU. 2119 * 3) cache-hot on their current CPU.
2120 */ 2120 */
2121 if (!cpu_isset(this_cpu, p->cpus_allowed)) 2121 if (!cpu_isset(this_cpu, p->cpus_allowed))
2122 return 0; 2122 return 0;
2123 *all_pinned = 0; 2123 *all_pinned = 0;
2124 2124
2125 if (task_running(rq, p)) 2125 if (task_running(rq, p))
2126 return 0; 2126 return 0;
2127 2127
2128 /* 2128 /*
2129 * Aggressive migration if: 2129 * Aggressive migration if:
2130 * 1) task is cache cold, or 2130 * 1) task is cache cold, or
2131 * 2) too many balance attempts have failed. 2131 * 2) too many balance attempts have failed.
2132 */ 2132 */
2133 2133
2134 if (sd->nr_balance_failed > sd->cache_nice_tries) { 2134 if (sd->nr_balance_failed > sd->cache_nice_tries) {
2135 #ifdef CONFIG_SCHEDSTATS 2135 #ifdef CONFIG_SCHEDSTATS
2136 if (task_hot(p, rq->most_recent_timestamp, sd)) 2136 if (task_hot(p, rq->most_recent_timestamp, sd))
2137 schedstat_inc(sd, lb_hot_gained[idle]); 2137 schedstat_inc(sd, lb_hot_gained[idle]);
2138 #endif 2138 #endif
2139 return 1; 2139 return 1;
2140 } 2140 }
2141 2141
2142 if (task_hot(p, rq->most_recent_timestamp, sd)) 2142 if (task_hot(p, rq->most_recent_timestamp, sd))
2143 return 0; 2143 return 0;
2144 return 1; 2144 return 1;
2145 } 2145 }
2146 2146
2147 #define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) 2147 #define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
2148 2148
2149 /* 2149 /*
2150 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted 2150 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2151 * load from busiest to this_rq, as part of a balancing operation within 2151 * load from busiest to this_rq, as part of a balancing operation within
2152 * "domain". Returns the number of tasks moved. 2152 * "domain". Returns the number of tasks moved.
2153 * 2153 *
2154 * Called with both runqueues locked. 2154 * Called with both runqueues locked.
2155 */ 2155 */
2156 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 2156 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2157 unsigned long max_nr_move, unsigned long max_load_move, 2157 unsigned long max_nr_move, unsigned long max_load_move,
2158 struct sched_domain *sd, enum idle_type idle, 2158 struct sched_domain *sd, enum idle_type idle,
2159 int *all_pinned) 2159 int *all_pinned)
2160 { 2160 {
2161 int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, 2161 int idx, pulled = 0, pinned = 0, this_best_prio, best_prio,
2162 best_prio_seen, skip_for_load; 2162 best_prio_seen, skip_for_load;
2163 struct prio_array *array, *dst_array; 2163 struct prio_array *array, *dst_array;
2164 struct list_head *head, *curr; 2164 struct list_head *head, *curr;
2165 struct task_struct *tmp; 2165 struct task_struct *tmp;
2166 long rem_load_move; 2166 long rem_load_move;
2167 2167
2168 if (max_nr_move == 0 || max_load_move == 0) 2168 if (max_nr_move == 0 || max_load_move == 0)
2169 goto out; 2169 goto out;
2170 2170
2171 rem_load_move = max_load_move; 2171 rem_load_move = max_load_move;
2172 pinned = 1; 2172 pinned = 1;
2173 this_best_prio = rq_best_prio(this_rq); 2173 this_best_prio = rq_best_prio(this_rq);
2174 best_prio = rq_best_prio(busiest); 2174 best_prio = rq_best_prio(busiest);
2175 /* 2175 /*
2176 * Enable handling of the case where there is more than one task 2176 * Enable handling of the case where there is more than one task
2177 * with the best priority. If the currently running task is one 2177 * with the best priority. If the currently running task is one
2178 * of those with prio==best_prio we know it won't be moved 2178 * of those with prio==best_prio we know it won't be moved
2179 * and therefore it's safe to override the skip (based on load) of 2179 * and therefore it's safe to override the skip (based on load) of
2180 * any task we find with that prio. 2180 * any task we find with that prio.
2181 */ 2181 */
2182 best_prio_seen = best_prio == busiest->curr->prio; 2182 best_prio_seen = best_prio == busiest->curr->prio;
2183 2183
2184 /* 2184 /*
2185 * We first consider expired tasks. Those will likely not be 2185 * We first consider expired tasks. Those will likely not be
2186 * executed in the near future, and they are most likely to 2186 * executed in the near future, and they are most likely to
2187 * be cache-cold, thus switching CPUs has the least effect 2187 * be cache-cold, thus switching CPUs has the least effect
2188 * on them. 2188 * on them.
2189 */ 2189 */
2190 if (busiest->expired->nr_active) { 2190 if (busiest->expired->nr_active) {
2191 array = busiest->expired; 2191 array = busiest->expired;
2192 dst_array = this_rq->expired; 2192 dst_array = this_rq->expired;
2193 } else { 2193 } else {
2194 array = busiest->active; 2194 array = busiest->active;
2195 dst_array = this_rq->active; 2195 dst_array = this_rq->active;
2196 } 2196 }
2197 2197
2198 new_array: 2198 new_array:
2199 /* Start searching at priority 0: */ 2199 /* Start searching at priority 0: */
2200 idx = 0; 2200 idx = 0;
2201 skip_bitmap: 2201 skip_bitmap:
2202 if (!idx) 2202 if (!idx)
2203 idx = sched_find_first_bit(array->bitmap); 2203 idx = sched_find_first_bit(array->bitmap);
2204 else 2204 else
2205 idx = find_next_bit(array->bitmap, MAX_PRIO, idx); 2205 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
2206 if (idx >= MAX_PRIO) { 2206 if (idx >= MAX_PRIO) {
2207 if (array == busiest->expired && busiest->active->nr_active) { 2207 if (array == busiest->expired && busiest->active->nr_active) {
2208 array = busiest->active; 2208 array = busiest->active;
2209 dst_array = this_rq->active; 2209 dst_array = this_rq->active;
2210 goto new_array; 2210 goto new_array;
2211 } 2211 }
2212 goto out; 2212 goto out;
2213 } 2213 }
2214 2214
2215 head = array->queue + idx; 2215 head = array->queue + idx;
2216 curr = head->prev; 2216 curr = head->prev;
2217 skip_queue: 2217 skip_queue:
2218 tmp = list_entry(curr, struct task_struct, run_list); 2218 tmp = list_entry(curr, struct task_struct, run_list);
2219 2219
2220 curr = curr->prev; 2220 curr = curr->prev;
2221 2221
2222 /* 2222 /*
2223 * To help distribute high priority tasks across CPUs we don't 2223 * To help distribute high priority tasks across CPUs we don't
2224 * skip a task if it will be the highest priority task (i.e. smallest 2224 * skip a task if it will be the highest priority task (i.e. smallest
2225 * prio value) on its new queue regardless of its load weight 2225 * prio value) on its new queue regardless of its load weight
2226 */ 2226 */
2227 skip_for_load = tmp->load_weight > rem_load_move; 2227 skip_for_load = tmp->load_weight > rem_load_move;
2228 if (skip_for_load && idx < this_best_prio) 2228 if (skip_for_load && idx < this_best_prio)
2229 skip_for_load = !best_prio_seen && idx == best_prio; 2229 skip_for_load = !best_prio_seen && idx == best_prio;
2230 if (skip_for_load || 2230 if (skip_for_load ||
2231 !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { 2231 !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
2232 2232
2233 best_prio_seen |= idx == best_prio; 2233 best_prio_seen |= idx == best_prio;
2234 if (curr != head) 2234 if (curr != head)
2235 goto skip_queue; 2235 goto skip_queue;
2236 idx++; 2236 idx++;
2237 goto skip_bitmap; 2237 goto skip_bitmap;
2238 } 2238 }
2239 2239
2240 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); 2240 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
2241 pulled++; 2241 pulled++;
2242 rem_load_move -= tmp->load_weight; 2242 rem_load_move -= tmp->load_weight;
2243 2243
2244 /* 2244 /*
2245 * We only want to steal up to the prescribed number of tasks 2245 * We only want to steal up to the prescribed number of tasks
2246 * and the prescribed amount of weighted load. 2246 * and the prescribed amount of weighted load.
2247 */ 2247 */
2248 if (pulled < max_nr_move && rem_load_move > 0) { 2248 if (pulled < max_nr_move && rem_load_move > 0) {
2249 if (idx < this_best_prio) 2249 if (idx < this_best_prio)
2250 this_best_prio = idx; 2250 this_best_prio = idx;
2251 if (curr != head) 2251 if (curr != head)
2252 goto skip_queue; 2252 goto skip_queue;
2253 idx++; 2253 idx++;
2254 goto skip_bitmap; 2254 goto skip_bitmap;
2255 } 2255 }
2256 out: 2256 out:
2257 /* 2257 /*
2258 * Right now, this is the only place pull_task() is called, 2258 * Right now, this is the only place pull_task() is called,
2259 * so we can safely collect pull_task() stats here rather than 2259 * so we can safely collect pull_task() stats here rather than
2260 * inside pull_task(). 2260 * inside pull_task().
2261 */ 2261 */
2262 schedstat_add(sd, lb_gained[idle], pulled); 2262 schedstat_add(sd, lb_gained[idle], pulled);
2263 2263
2264 if (all_pinned) 2264 if (all_pinned)
2265 *all_pinned = pinned; 2265 *all_pinned = pinned;
2266 return pulled; 2266 return pulled;
2267 } 2267 }
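move_tasks() locates candidate tasks by scanning a per-array priority bitmap with sched_find_first_bit()/find_next_bit(), one bit per priority level. A rough standalone equivalent of that scan is sketched below; MAX_PRIO_BITS and find_next_set_bit() are names made up for the sketch, not kernel interfaces:

#include <stdio.h>

#define MAX_PRIO_BITS	140
#define BITS_PER_WORD	64
#define NWORDS		((MAX_PRIO_BITS + BITS_PER_WORD - 1) / BITS_PER_WORD)

/* find the lowest set bit at or above 'start', or MAX_PRIO_BITS if none */
static int find_next_set_bit(const unsigned long long *bitmap, int start)
{
	for (int i = start; i < MAX_PRIO_BITS; i++)
		if (bitmap[i / BITS_PER_WORD] & (1ULL << (i % BITS_PER_WORD)))
			return i;
	return MAX_PRIO_BITS;
}

int main(void)
{
	unsigned long long bitmap[NWORDS] = { 0 };

	/* pretend priorities 110 and 120 have runnable tasks queued */
	bitmap[110 / BITS_PER_WORD] |= 1ULL << (110 % BITS_PER_WORD);
	bitmap[120 / BITS_PER_WORD] |= 1ULL << (120 % BITS_PER_WORD);

	printf("%d\n", find_next_set_bit(bitmap, 0));	/* 110 */
	printf("%d\n", find_next_set_bit(bitmap, 111));	/* 120 */
	return 0;
}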
2268 2268
2269 /* 2269 /*
2270 * find_busiest_group finds and returns the busiest CPU group within the 2270 * find_busiest_group finds and returns the busiest CPU group within the
2271 * domain. It calculates and returns the amount of weighted load which 2271 * domain. It calculates and returns the amount of weighted load which
2272 * should be moved to restore balance via the imbalance parameter. 2272 * should be moved to restore balance via the imbalance parameter.
2273 */ 2273 */
2274 static struct sched_group * 2274 static struct sched_group *
2275 find_busiest_group(struct sched_domain *sd, int this_cpu, 2275 find_busiest_group(struct sched_domain *sd, int this_cpu,
2276 unsigned long *imbalance, enum idle_type idle, int *sd_idle, 2276 unsigned long *imbalance, enum idle_type idle, int *sd_idle,
2277 cpumask_t *cpus, int *balance) 2277 cpumask_t *cpus, int *balance)
2278 { 2278 {
2279 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 2279 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2280 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 2280 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
2281 unsigned long max_pull; 2281 unsigned long max_pull;
2282 unsigned long busiest_load_per_task, busiest_nr_running; 2282 unsigned long busiest_load_per_task, busiest_nr_running;
2283 unsigned long this_load_per_task, this_nr_running; 2283 unsigned long this_load_per_task, this_nr_running;
2284 int load_idx; 2284 int load_idx;
2285 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2285 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2286 int power_savings_balance = 1; 2286 int power_savings_balance = 1;
2287 unsigned long leader_nr_running = 0, min_load_per_task = 0; 2287 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2288 unsigned long min_nr_running = ULONG_MAX; 2288 unsigned long min_nr_running = ULONG_MAX;
2289 struct sched_group *group_min = NULL, *group_leader = NULL; 2289 struct sched_group *group_min = NULL, *group_leader = NULL;
2290 #endif 2290 #endif
2291 2291
2292 max_load = this_load = total_load = total_pwr = 0; 2292 max_load = this_load = total_load = total_pwr = 0;
2293 busiest_load_per_task = busiest_nr_running = 0; 2293 busiest_load_per_task = busiest_nr_running = 0;
2294 this_load_per_task = this_nr_running = 0; 2294 this_load_per_task = this_nr_running = 0;
2295 if (idle == NOT_IDLE) 2295 if (idle == NOT_IDLE)
2296 load_idx = sd->busy_idx; 2296 load_idx = sd->busy_idx;
2297 else if (idle == NEWLY_IDLE) 2297 else if (idle == NEWLY_IDLE)
2298 load_idx = sd->newidle_idx; 2298 load_idx = sd->newidle_idx;
2299 else 2299 else
2300 load_idx = sd->idle_idx; 2300 load_idx = sd->idle_idx;
2301 2301
2302 do { 2302 do {
2303 unsigned long load, group_capacity; 2303 unsigned long load, group_capacity;
2304 int local_group; 2304 int local_group;
2305 int i; 2305 int i;
2306 unsigned int balance_cpu = -1, first_idle_cpu = 0; 2306 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2307 unsigned long sum_nr_running, sum_weighted_load; 2307 unsigned long sum_nr_running, sum_weighted_load;
2308 2308
2309 local_group = cpu_isset(this_cpu, group->cpumask); 2309 local_group = cpu_isset(this_cpu, group->cpumask);
2310 2310
2311 if (local_group) 2311 if (local_group)
2312 balance_cpu = first_cpu(group->cpumask); 2312 balance_cpu = first_cpu(group->cpumask);
2313 2313
2314 /* Tally up the load of all CPUs in the group */ 2314 /* Tally up the load of all CPUs in the group */
2315 sum_weighted_load = sum_nr_running = avg_load = 0; 2315 sum_weighted_load = sum_nr_running = avg_load = 0;
2316 2316
2317 for_each_cpu_mask(i, group->cpumask) { 2317 for_each_cpu_mask(i, group->cpumask) {
2318 struct rq *rq; 2318 struct rq *rq;
2319 2319
2320 if (!cpu_isset(i, *cpus)) 2320 if (!cpu_isset(i, *cpus))
2321 continue; 2321 continue;
2322 2322
2323 rq = cpu_rq(i); 2323 rq = cpu_rq(i);
2324 2324
2325 if (*sd_idle && !idle_cpu(i)) 2325 if (*sd_idle && !idle_cpu(i))
2326 *sd_idle = 0; 2326 *sd_idle = 0;
2327 2327
2328 /* Bias balancing toward cpus of our domain */ 2328 /* Bias balancing toward cpus of our domain */
2329 if (local_group) { 2329 if (local_group) {
2330 if (idle_cpu(i) && !first_idle_cpu) { 2330 if (idle_cpu(i) && !first_idle_cpu) {
2331 first_idle_cpu = 1; 2331 first_idle_cpu = 1;
2332 balance_cpu = i; 2332 balance_cpu = i;
2333 } 2333 }
2334 2334
2335 load = target_load(i, load_idx); 2335 load = target_load(i, load_idx);
2336 } else 2336 } else
2337 load = source_load(i, load_idx); 2337 load = source_load(i, load_idx);
2338 2338
2339 avg_load += load; 2339 avg_load += load;
2340 sum_nr_running += rq->nr_running; 2340 sum_nr_running += rq->nr_running;
2341 sum_weighted_load += rq->raw_weighted_load; 2341 sum_weighted_load += rq->raw_weighted_load;
2342 } 2342 }
2343 2343
2344 /* 2344 /*
2345 * The first idle cpu or the first cpu (busiest) in this sched group 2345 * The first idle cpu or the first cpu (busiest) in this sched group
2346 * is eligible for doing load balancing at this and higher-level 2346 * is eligible for doing load balancing at this and higher-level
2347 * domains. 2347 * domains.
2348 */ 2348 */
2349 if (local_group && balance_cpu != this_cpu && balance) { 2349 if (local_group && balance_cpu != this_cpu && balance) {
2350 *balance = 0; 2350 *balance = 0;
2351 goto ret; 2351 goto ret;
2352 } 2352 }
2353 2353
2354 total_load += avg_load; 2354 total_load += avg_load;
2355 total_pwr += group->cpu_power; 2355 total_pwr += group->cpu_power;
2356 2356
2357 /* Adjust by relative CPU power of the group */ 2357 /* Adjust by relative CPU power of the group */
2358 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; 2358 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
2359 2359
2360 group_capacity = group->cpu_power / SCHED_LOAD_SCALE; 2360 group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
2361 2361
2362 if (local_group) { 2362 if (local_group) {
2363 this_load = avg_load; 2363 this_load = avg_load;
2364 this = group; 2364 this = group;
2365 this_nr_running = sum_nr_running; 2365 this_nr_running = sum_nr_running;
2366 this_load_per_task = sum_weighted_load; 2366 this_load_per_task = sum_weighted_load;
2367 } else if (avg_load > max_load && 2367 } else if (avg_load > max_load &&
2368 sum_nr_running > group_capacity) { 2368 sum_nr_running > group_capacity) {
2369 max_load = avg_load; 2369 max_load = avg_load;
2370 busiest = group; 2370 busiest = group;
2371 busiest_nr_running = sum_nr_running; 2371 busiest_nr_running = sum_nr_running;
2372 busiest_load_per_task = sum_weighted_load; 2372 busiest_load_per_task = sum_weighted_load;
2373 } 2373 }
2374 2374
2375 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2375 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2376 /* 2376 /*
2377 * Busy processors will not participate in power savings 2377 * Busy processors will not participate in power savings
2378 * balance. 2378 * balance.
2379 */ 2379 */
2380 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) 2380 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2381 goto group_next; 2381 goto group_next;
2382 2382
2383 /* 2383 /*
2384 * If the local group is idle or completely loaded 2384 * If the local group is idle or completely loaded
2385 * no need to do power savings balance at this domain 2385 * no need to do power savings balance at this domain
2386 */ 2386 */
2387 if (local_group && (this_nr_running >= group_capacity || 2387 if (local_group && (this_nr_running >= group_capacity ||
2388 !this_nr_running)) 2388 !this_nr_running))
2389 power_savings_balance = 0; 2389 power_savings_balance = 0;
2390 2390
2391 /* 2391 /*
2392 * If a group is already running at full capacity or idle, 2392 * If a group is already running at full capacity or idle,
2393 * don't include that group in power savings calculations 2393 * don't include that group in power savings calculations
2394 */ 2394 */
2395 if (!power_savings_balance || sum_nr_running >= group_capacity 2395 if (!power_savings_balance || sum_nr_running >= group_capacity
2396 || !sum_nr_running) 2396 || !sum_nr_running)
2397 goto group_next; 2397 goto group_next;
2398 2398
2399 /* 2399 /*
2400 * Calculate the group which has the least non-idle load. 2400 * Calculate the group which has the least non-idle load.
2401 * This is the group from which we need to pick up the load 2401 * This is the group from which we need to pick up the load
2402 * to save power. 2402 * to save power.
2403 */ 2403 */
2404 if ((sum_nr_running < min_nr_running) || 2404 if ((sum_nr_running < min_nr_running) ||
2405 (sum_nr_running == min_nr_running && 2405 (sum_nr_running == min_nr_running &&
2406 first_cpu(group->cpumask) < 2406 first_cpu(group->cpumask) <
2407 first_cpu(group_min->cpumask))) { 2407 first_cpu(group_min->cpumask))) {
2408 group_min = group; 2408 group_min = group;
2409 min_nr_running = sum_nr_running; 2409 min_nr_running = sum_nr_running;
2410 min_load_per_task = sum_weighted_load / 2410 min_load_per_task = sum_weighted_load /
2411 sum_nr_running; 2411 sum_nr_running;
2412 } 2412 }
2413 2413
2414 /* 2414 /*
2415 * Calculate the group which is nearly at its 2415 * Calculate the group which is nearly at its
2416 * capacity but still has some room to pick up load 2416 * capacity but still has some room to pick up load
2417 * from another group and save more power. 2417 * from another group and save more power.
2418 */ 2418 */
2419 if (sum_nr_running <= group_capacity - 1) { 2419 if (sum_nr_running <= group_capacity - 1) {
2420 if (sum_nr_running > leader_nr_running || 2420 if (sum_nr_running > leader_nr_running ||
2421 (sum_nr_running == leader_nr_running && 2421 (sum_nr_running == leader_nr_running &&
2422 first_cpu(group->cpumask) > 2422 first_cpu(group->cpumask) >
2423 first_cpu(group_leader->cpumask))) { 2423 first_cpu(group_leader->cpumask))) {
2424 group_leader = group; 2424 group_leader = group;
2425 leader_nr_running = sum_nr_running; 2425 leader_nr_running = sum_nr_running;
2426 } 2426 }
2427 } 2427 }
2428 group_next: 2428 group_next:
2429 #endif 2429 #endif
2430 group = group->next; 2430 group = group->next;
2431 } while (group != sd->groups); 2431 } while (group != sd->groups);
2432 2432
2433 if (!busiest || this_load >= max_load || busiest_nr_running == 0) 2433 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
2434 goto out_balanced; 2434 goto out_balanced;
2435 2435
2436 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; 2436 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2437 2437
2438 if (this_load >= avg_load || 2438 if (this_load >= avg_load ||
2439 100*max_load <= sd->imbalance_pct*this_load) 2439 100*max_load <= sd->imbalance_pct*this_load)
2440 goto out_balanced; 2440 goto out_balanced;
2441 2441
2442 busiest_load_per_task /= busiest_nr_running; 2442 busiest_load_per_task /= busiest_nr_running;
2443 /* 2443 /*
2444 * We're trying to get all the cpus to the average_load, so we don't 2444 * We're trying to get all the cpus to the average_load, so we don't
2445 * want to push ourselves above the average load, nor do we wish to 2445 * want to push ourselves above the average load, nor do we wish to
2446 * reduce the max loaded cpu below the average load, as either of these 2446 * reduce the max loaded cpu below the average load, as either of these
2447 * actions would just result in more rebalancing later, and ping-pong 2447 * actions would just result in more rebalancing later, and ping-pong
2448 * tasks around. Thus we look for the minimum possible imbalance. 2448 * tasks around. Thus we look for the minimum possible imbalance.
2449 * Negative imbalances (*we* are more loaded than anyone else) will 2449 * Negative imbalances (*we* are more loaded than anyone else) will
2450 * be counted as no imbalance for these purposes -- we can't fix that 2450 * be counted as no imbalance for these purposes -- we can't fix that
2451 * by pulling tasks to us. Be careful of negative numbers as they'll 2451 * by pulling tasks to us. Be careful of negative numbers as they'll
2452 * appear as very large values with unsigned longs. 2452 * appear as very large values with unsigned longs.
2453 */ 2453 */
2454 if (max_load <= busiest_load_per_task) 2454 if (max_load <= busiest_load_per_task)
2455 goto out_balanced; 2455 goto out_balanced;
2456 2456
2457 /* 2457 /*
2458 * In the presence of smp nice balancing, certain scenarios can have 2458 * In the presence of smp nice balancing, certain scenarios can have
2459 * max load less than avg load (as we skip the groups at or below 2459 * max load less than avg load (as we skip the groups at or below
2460 * their cpu_power while calculating max_load). 2460 * their cpu_power while calculating max_load).
2461 */ 2461 */
2462 if (max_load < avg_load) { 2462 if (max_load < avg_load) {
2463 *imbalance = 0; 2463 *imbalance = 0;
2464 goto small_imbalance; 2464 goto small_imbalance;
2465 } 2465 }
2466 2466
2467 /* Don't want to pull so many tasks that a group would go idle */ 2467 /* Don't want to pull so many tasks that a group would go idle */
2468 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); 2468 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
2469 2469
2470 /* How much load to actually move to equalise the imbalance */ 2470 /* How much load to actually move to equalise the imbalance */
2471 *imbalance = min(max_pull * busiest->cpu_power, 2471 *imbalance = min(max_pull * busiest->cpu_power,
2472 (avg_load - this_load) * this->cpu_power) 2472 (avg_load - this_load) * this->cpu_power)
2473 / SCHED_LOAD_SCALE; 2473 / SCHED_LOAD_SCALE;
2474 2474
2475 /* 2475 /*
2476 * if *imbalance is less than the average load per runnable task 2476 * if *imbalance is less than the average load per runnable task
2477 * there is no guarantee that any tasks will be moved, so we 2477 * there is no guarantee that any tasks will be moved, so we
2478 * consider bumping its value to force at least one task to be 2478 * consider bumping its value to force at least one task to be
2479 * moved. 2479 * moved.
2480 */ 2480 */
2481 if (*imbalance < busiest_load_per_task) { 2481 if (*imbalance < busiest_load_per_task) {
2482 unsigned long tmp, pwr_now, pwr_move; 2482 unsigned long tmp, pwr_now, pwr_move;
2483 unsigned int imbn; 2483 unsigned int imbn;
2484 2484
2485 small_imbalance: 2485 small_imbalance:
2486 pwr_move = pwr_now = 0; 2486 pwr_move = pwr_now = 0;
2487 imbn = 2; 2487 imbn = 2;
2488 if (this_nr_running) { 2488 if (this_nr_running) {
2489 this_load_per_task /= this_nr_running; 2489 this_load_per_task /= this_nr_running;
2490 if (busiest_load_per_task > this_load_per_task) 2490 if (busiest_load_per_task > this_load_per_task)
2491 imbn = 1; 2491 imbn = 1;
2492 } else 2492 } else
2493 this_load_per_task = SCHED_LOAD_SCALE; 2493 this_load_per_task = SCHED_LOAD_SCALE;
2494 2494
2495 if (max_load - this_load >= busiest_load_per_task * imbn) { 2495 if (max_load - this_load >= busiest_load_per_task * imbn) {
2496 *imbalance = busiest_load_per_task; 2496 *imbalance = busiest_load_per_task;
2497 return busiest; 2497 return busiest;
2498 } 2498 }
2499 2499
2500 /* 2500 /*
2501 * OK, we don't have enough imbalance to justify moving tasks; 2501 * OK, we don't have enough imbalance to justify moving tasks;
2502 * however, we may be able to increase total CPU power used by 2502 * however, we may be able to increase total CPU power used by
2503 * moving them. 2503 * moving them.
2504 */ 2504 */
2505 2505
2506 pwr_now += busiest->cpu_power * 2506 pwr_now += busiest->cpu_power *
2507 min(busiest_load_per_task, max_load); 2507 min(busiest_load_per_task, max_load);
2508 pwr_now += this->cpu_power * 2508 pwr_now += this->cpu_power *
2509 min(this_load_per_task, this_load); 2509 min(this_load_per_task, this_load);
2510 pwr_now /= SCHED_LOAD_SCALE; 2510 pwr_now /= SCHED_LOAD_SCALE;
2511 2511
2512 /* Amount of load we'd subtract */ 2512 /* Amount of load we'd subtract */
2513 tmp = busiest_load_per_task * SCHED_LOAD_SCALE / 2513 tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
2514 busiest->cpu_power; 2514 busiest->cpu_power;
2515 if (max_load > tmp) 2515 if (max_load > tmp)
2516 pwr_move += busiest->cpu_power * 2516 pwr_move += busiest->cpu_power *
2517 min(busiest_load_per_task, max_load - tmp); 2517 min(busiest_load_per_task, max_load - tmp);
2518 2518
2519 /* Amount of load we'd add */ 2519 /* Amount of load we'd add */
2520 if (max_load * busiest->cpu_power < 2520 if (max_load * busiest->cpu_power <
2521 busiest_load_per_task * SCHED_LOAD_SCALE) 2521 busiest_load_per_task * SCHED_LOAD_SCALE)
2522 tmp = max_load * busiest->cpu_power / this->cpu_power; 2522 tmp = max_load * busiest->cpu_power / this->cpu_power;
2523 else 2523 else
2524 tmp = busiest_load_per_task * SCHED_LOAD_SCALE / 2524 tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
2525 this->cpu_power; 2525 this->cpu_power;
2526 pwr_move += this->cpu_power * 2526 pwr_move += this->cpu_power *
2527 min(this_load_per_task, this_load + tmp); 2527 min(this_load_per_task, this_load + tmp);
2528 pwr_move /= SCHED_LOAD_SCALE; 2528 pwr_move /= SCHED_LOAD_SCALE;
2529 2529
2530 /* Move if we gain throughput */ 2530 /* Move if we gain throughput */
2531 if (pwr_move <= pwr_now) 2531 if (pwr_move <= pwr_now)
2532 goto out_balanced; 2532 goto out_balanced;
2533 2533
2534 *imbalance = busiest_load_per_task; 2534 *imbalance = busiest_load_per_task;
2535 } 2535 }
2536 2536
2537 return busiest; 2537 return busiest;
2538 2538
2539 out_balanced: 2539 out_balanced:
2540 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2540 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2541 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) 2541 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2542 goto ret; 2542 goto ret;
2543 2543
2544 if (this == group_leader && group_leader != group_min) { 2544 if (this == group_leader && group_leader != group_min) {
2545 *imbalance = min_load_per_task; 2545 *imbalance = min_load_per_task;
2546 return group_min; 2546 return group_min;
2547 } 2547 }
2548 #endif 2548 #endif
2549 ret: 2549 ret:
2550 *imbalance = 0; 2550 *imbalance = 0;
2551 return NULL; 2551 return NULL;
2552 } 2552 }
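For a concrete feel for the imbalance computed above, here is a small userspace sketch of the max_pull/imbalance arithmetic with made-up load figures; SCHED_LOAD_SCALE is taken as 128 purely for illustration:

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long max_load = 900, avg_load = 600, this_load = 300;
	unsigned long busiest_load_per_task = 200;
	unsigned long busiest_power = 128, this_power = 128;

	/* don't pull so much that the busiest group would go idle */
	unsigned long max_pull = min_ul(max_load - avg_load,
					max_load - busiest_load_per_task);

	/* how much weighted load to move to equalise the imbalance */
	unsigned long imbalance = min_ul(max_pull * busiest_power,
					 (avg_load - this_load) * this_power)
					/ SCHED_LOAD_SCALE;

	printf("max_pull=%lu imbalance=%lu\n", max_pull, imbalance);
	/* max_pull = min(300, 700) = 300; imbalance = min(300, 300) = 300 */
	return 0;
}

With these numbers both terms of the min() agree, so 300 units of weighted load would be requested from the busiest group.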
2553 2553
2554 /* 2554 /*
2555 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2555 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2556 */ 2556 */
2557 static struct rq * 2557 static struct rq *
2558 find_busiest_queue(struct sched_group *group, enum idle_type idle, 2558 find_busiest_queue(struct sched_group *group, enum idle_type idle,
2559 unsigned long imbalance, cpumask_t *cpus) 2559 unsigned long imbalance, cpumask_t *cpus)
2560 { 2560 {
2561 struct rq *busiest = NULL, *rq; 2561 struct rq *busiest = NULL, *rq;
2562 unsigned long max_load = 0; 2562 unsigned long max_load = 0;
2563 int i; 2563 int i;
2564 2564
2565 for_each_cpu_mask(i, group->cpumask) { 2565 for_each_cpu_mask(i, group->cpumask) {
2566 2566
2567 if (!cpu_isset(i, *cpus)) 2567 if (!cpu_isset(i, *cpus))
2568 continue; 2568 continue;
2569 2569
2570 rq = cpu_rq(i); 2570 rq = cpu_rq(i);
2571 2571
2572 if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) 2572 if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance)
2573 continue; 2573 continue;
2574 2574
2575 if (rq->raw_weighted_load > max_load) { 2575 if (rq->raw_weighted_load > max_load) {
2576 max_load = rq->raw_weighted_load; 2576 max_load = rq->raw_weighted_load;
2577 busiest = rq; 2577 busiest = rq;
2578 } 2578 }
2579 } 2579 }
2580 2580
2581 return busiest; 2581 return busiest;
2582 } 2582 }
2583 2583
2584 /* 2584 /*
2585 * Max backoff if we encounter pinned tasks. Pretty arbitrary value; it 2585 * Max backoff if we encounter pinned tasks. Pretty arbitrary value; it
2586 * only needs to be large enough. 2586 * only needs to be large enough.
2587 */ 2587 */
2588 #define MAX_PINNED_INTERVAL 512 2588 #define MAX_PINNED_INTERVAL 512
2589 2589
2590 static inline unsigned long minus_1_or_zero(unsigned long n) 2590 static inline unsigned long minus_1_or_zero(unsigned long n)
2591 { 2591 {
2592 return n > 0 ? n - 1 : 0; 2592 return n > 0 ? n - 1 : 0;
2593 } 2593 }
2594 2594
2595 /* 2595 /*
2596 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2596 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2597 * tasks if there is an imbalance. 2597 * tasks if there is an imbalance.
2598 */ 2598 */
2599 static int load_balance(int this_cpu, struct rq *this_rq, 2599 static int load_balance(int this_cpu, struct rq *this_rq,
2600 struct sched_domain *sd, enum idle_type idle, 2600 struct sched_domain *sd, enum idle_type idle,
2601 int *balance) 2601 int *balance)
2602 { 2602 {
2603 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 2603 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2604 struct sched_group *group; 2604 struct sched_group *group;
2605 unsigned long imbalance; 2605 unsigned long imbalance;
2606 struct rq *busiest; 2606 struct rq *busiest;
2607 cpumask_t cpus = CPU_MASK_ALL; 2607 cpumask_t cpus = CPU_MASK_ALL;
2608 unsigned long flags; 2608 unsigned long flags;
2609 2609
2610 /* 2610 /*
2611 * When power savings policy is enabled for the parent domain, idle 2611 * When power savings policy is enabled for the parent domain, idle
2612 * sibling can pick up load irrespective of busy siblings. In this case, 2612 * sibling can pick up load irrespective of busy siblings. In this case,
2613 * let the state of idle sibling percolate up as IDLE, instead of 2613 * let the state of idle sibling percolate up as IDLE, instead of
2614 * portraying it as NOT_IDLE. 2614 * portraying it as NOT_IDLE.
2615 */ 2615 */
2616 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && 2616 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2617 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2617 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2618 sd_idle = 1; 2618 sd_idle = 1;
2619 2619
2620 schedstat_inc(sd, lb_cnt[idle]); 2620 schedstat_inc(sd, lb_cnt[idle]);
2621 2621
2622 redo: 2622 redo:
2623 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 2623 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2624 &cpus, balance); 2624 &cpus, balance);
2625 2625
2626 if (*balance == 0) 2626 if (*balance == 0)
2627 goto out_balanced; 2627 goto out_balanced;
2628 2628
2629 if (!group) { 2629 if (!group) {
2630 schedstat_inc(sd, lb_nobusyg[idle]); 2630 schedstat_inc(sd, lb_nobusyg[idle]);
2631 goto out_balanced; 2631 goto out_balanced;
2632 } 2632 }
2633 2633
2634 busiest = find_busiest_queue(group, idle, imbalance, &cpus); 2634 busiest = find_busiest_queue(group, idle, imbalance, &cpus);
2635 if (!busiest) { 2635 if (!busiest) {
2636 schedstat_inc(sd, lb_nobusyq[idle]); 2636 schedstat_inc(sd, lb_nobusyq[idle]);
2637 goto out_balanced; 2637 goto out_balanced;
2638 } 2638 }
2639 2639
2640 BUG_ON(busiest == this_rq); 2640 BUG_ON(busiest == this_rq);
2641 2641
2642 schedstat_add(sd, lb_imbalance[idle], imbalance); 2642 schedstat_add(sd, lb_imbalance[idle], imbalance);
2643 2643
2644 nr_moved = 0; 2644 nr_moved = 0;
2645 if (busiest->nr_running > 1) { 2645 if (busiest->nr_running > 1) {
2646 /* 2646 /*
2647 * Attempt to move tasks. If find_busiest_group has found 2647 * Attempt to move tasks. If find_busiest_group has found
2648 * an imbalance but busiest->nr_running <= 1, the group is 2648 * an imbalance but busiest->nr_running <= 1, the group is
2649 * still unbalanced. nr_moved simply stays zero, so it is 2649 * still unbalanced. nr_moved simply stays zero, so it is
2650 * correctly treated as an imbalance. 2650 * correctly treated as an imbalance.
2651 */ 2651 */
2652 local_irq_save(flags); 2652 local_irq_save(flags);
2653 double_rq_lock(this_rq, busiest); 2653 double_rq_lock(this_rq, busiest);
2654 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2654 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2655 minus_1_or_zero(busiest->nr_running), 2655 minus_1_or_zero(busiest->nr_running),
2656 imbalance, sd, idle, &all_pinned); 2656 imbalance, sd, idle, &all_pinned);
2657 double_rq_unlock(this_rq, busiest); 2657 double_rq_unlock(this_rq, busiest);
2658 local_irq_restore(flags); 2658 local_irq_restore(flags);
2659 2659
2660 /* All tasks on this runqueue were pinned by CPU affinity */ 2660 /* All tasks on this runqueue were pinned by CPU affinity */
2661 if (unlikely(all_pinned)) { 2661 if (unlikely(all_pinned)) {
2662 cpu_clear(cpu_of(busiest), cpus); 2662 cpu_clear(cpu_of(busiest), cpus);
2663 if (!cpus_empty(cpus)) 2663 if (!cpus_empty(cpus))
2664 goto redo; 2664 goto redo;
2665 goto out_balanced; 2665 goto out_balanced;
2666 } 2666 }
2667 } 2667 }
2668 2668
2669 if (!nr_moved) { 2669 if (!nr_moved) {
2670 schedstat_inc(sd, lb_failed[idle]); 2670 schedstat_inc(sd, lb_failed[idle]);
2671 sd->nr_balance_failed++; 2671 sd->nr_balance_failed++;
2672 2672
2673 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 2673 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2674 2674
2675 spin_lock_irqsave(&busiest->lock, flags); 2675 spin_lock_irqsave(&busiest->lock, flags);
2676 2676
2677 /* don't kick the migration_thread if the curr 2677 /* don't kick the migration_thread if the curr
2678 * task on busiest cpu can't be moved to this_cpu 2678 * task on busiest cpu can't be moved to this_cpu
2679 */ 2679 */
2680 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { 2680 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
2681 spin_unlock_irqrestore(&busiest->lock, flags); 2681 spin_unlock_irqrestore(&busiest->lock, flags);
2682 all_pinned = 1; 2682 all_pinned = 1;
2683 goto out_one_pinned; 2683 goto out_one_pinned;
2684 } 2684 }
2685 2685
2686 if (!busiest->active_balance) { 2686 if (!busiest->active_balance) {
2687 busiest->active_balance = 1; 2687 busiest->active_balance = 1;
2688 busiest->push_cpu = this_cpu; 2688 busiest->push_cpu = this_cpu;
2689 active_balance = 1; 2689 active_balance = 1;
2690 } 2690 }
2691 spin_unlock_irqrestore(&busiest->lock, flags); 2691 spin_unlock_irqrestore(&busiest->lock, flags);
2692 if (active_balance) 2692 if (active_balance)
2693 wake_up_process(busiest->migration_thread); 2693 wake_up_process(busiest->migration_thread);
2694 2694
2695 /* 2695 /*
2696 * We've kicked active balancing, reset the failure 2696 * We've kicked active balancing, reset the failure
2697 * counter. 2697 * counter.
2698 */ 2698 */
2699 sd->nr_balance_failed = sd->cache_nice_tries+1; 2699 sd->nr_balance_failed = sd->cache_nice_tries+1;
2700 } 2700 }
2701 } else 2701 } else
2702 sd->nr_balance_failed = 0; 2702 sd->nr_balance_failed = 0;
2703 2703
2704 if (likely(!active_balance)) { 2704 if (likely(!active_balance)) {
2705 /* We were unbalanced, so reset the balancing interval */ 2705 /* We were unbalanced, so reset the balancing interval */
2706 sd->balance_interval = sd->min_interval; 2706 sd->balance_interval = sd->min_interval;
2707 } else { 2707 } else {
2708 /* 2708 /*
2709 * If we've begun active balancing, start to back off. This 2709 * If we've begun active balancing, start to back off. This
2710 * case may not be covered by the all_pinned logic if there 2710 * case may not be covered by the all_pinned logic if there
2711 * is only 1 task on the busy runqueue (because we don't call 2711 * is only 1 task on the busy runqueue (because we don't call
2712 * move_tasks). 2712 * move_tasks).
2713 */ 2713 */
2714 if (sd->balance_interval < sd->max_interval) 2714 if (sd->balance_interval < sd->max_interval)
2715 sd->balance_interval *= 2; 2715 sd->balance_interval *= 2;
2716 } 2716 }
2717 2717
2718 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2718 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2719 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2719 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2720 return -1; 2720 return -1;
2721 return nr_moved; 2721 return nr_moved;
2722 2722
2723 out_balanced: 2723 out_balanced:
2724 schedstat_inc(sd, lb_balanced[idle]); 2724 schedstat_inc(sd, lb_balanced[idle]);
2725 2725
2726 sd->nr_balance_failed = 0; 2726 sd->nr_balance_failed = 0;
2727 2727
2728 out_one_pinned: 2728 out_one_pinned:
2729 /* tune up the balancing interval */ 2729 /* tune up the balancing interval */
2730 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || 2730 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2731 (sd->balance_interval < sd->max_interval)) 2731 (sd->balance_interval < sd->max_interval))
2732 sd->balance_interval *= 2; 2732 sd->balance_interval *= 2;
2733 2733
2734 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2734 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2735 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2735 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2736 return -1; 2736 return -1;
2737 return 0; 2737 return 0;
2738 } 2738 }
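The tail of load_balance() adjusts sd->balance_interval: it is reset to the minimum once tasks were actually moved, and doubled up to a cap while balancing keeps failing or tasks stay pinned. A toy standalone model of that backoff, not kernel code:

#include <stdio.h>

struct toy_domain {
	unsigned long balance_interval;
	unsigned long min_interval;
	unsigned long max_interval;
};

static void backoff(struct toy_domain *sd)
{
	/* double the interval, but only while still below the cap */
	if (sd->balance_interval < sd->max_interval)
		sd->balance_interval *= 2;
}

static void reset(struct toy_domain *sd)
{
	/* an imbalance was fixed, balance eagerly again */
	sd->balance_interval = sd->min_interval;
}

int main(void)
{
	struct toy_domain sd = { .balance_interval = 8,
				 .min_interval = 8, .max_interval = 64 };

	for (int i = 0; i < 5; i++) {
		backoff(&sd);
		printf("interval=%lu\n", sd.balance_interval);
	}
	/* prints 16, 32, 64, 64, 64: doubling stops at the cap */

	reset(&sd);
	printf("after reset: %lu\n", sd.balance_interval);	/* 8 */
	return 0;
}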
2739 2739
2740 /* 2740 /*
2741 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2741 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2742 * tasks if there is an imbalance. 2742 * tasks if there is an imbalance.
2743 * 2743 *
2744 * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). 2744 * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
2745 * this_rq is locked. 2745 * this_rq is locked.
2746 */ 2746 */
2747 static int 2747 static int
2748 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) 2748 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2749 { 2749 {
2750 struct sched_group *group; 2750 struct sched_group *group;
2751 struct rq *busiest = NULL; 2751 struct rq *busiest = NULL;
2752 unsigned long imbalance; 2752 unsigned long imbalance;
2753 int nr_moved = 0; 2753 int nr_moved = 0;
2754 int sd_idle = 0; 2754 int sd_idle = 0;
2755 cpumask_t cpus = CPU_MASK_ALL; 2755 cpumask_t cpus = CPU_MASK_ALL;
2756 2756
2757 /* 2757 /*
2758 * When power savings policy is enabled for the parent domain, idle 2758 * When power savings policy is enabled for the parent domain, idle
2759 * sibling can pick up load irrespective of busy siblings. In this case, 2759 * sibling can pick up load irrespective of busy siblings. In this case,
2760 * let the state of idle sibling percolate up as IDLE, instead of 2760 * let the state of idle sibling percolate up as IDLE, instead of
2761 * portraying it as NOT_IDLE. 2761 * portraying it as NOT_IDLE.
2762 */ 2762 */
2763 if (sd->flags & SD_SHARE_CPUPOWER && 2763 if (sd->flags & SD_SHARE_CPUPOWER &&
2764 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2764 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2765 sd_idle = 1; 2765 sd_idle = 1;
2766 2766
2767 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2767 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2768 redo: 2768 redo:
2769 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, 2769 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
2770 &sd_idle, &cpus, NULL); 2770 &sd_idle, &cpus, NULL);
2771 if (!group) { 2771 if (!group) {
2772 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2772 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2773 goto out_balanced; 2773 goto out_balanced;
2774 } 2774 }
2775 2775
2776 busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance, 2776 busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance,
2777 &cpus); 2777 &cpus);
2778 if (!busiest) { 2778 if (!busiest) {
2779 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); 2779 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2780 goto out_balanced; 2780 goto out_balanced;
2781 } 2781 }
2782 2782
2783 BUG_ON(busiest == this_rq); 2783 BUG_ON(busiest == this_rq);
2784 2784
2785 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); 2785 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
2786 2786
2787 nr_moved = 0; 2787 nr_moved = 0;
2788 if (busiest->nr_running > 1) { 2788 if (busiest->nr_running > 1) {
2789 /* Attempt to move tasks */ 2789 /* Attempt to move tasks */
2790 double_lock_balance(this_rq, busiest); 2790 double_lock_balance(this_rq, busiest);
2791 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2791 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2792 minus_1_or_zero(busiest->nr_running), 2792 minus_1_or_zero(busiest->nr_running),
2793 imbalance, sd, NEWLY_IDLE, NULL); 2793 imbalance, sd, NEWLY_IDLE, NULL);
2794 spin_unlock(&busiest->lock); 2794 spin_unlock(&busiest->lock);
2795 2795
2796 if (!nr_moved) { 2796 if (!nr_moved) {
2797 cpu_clear(cpu_of(busiest), cpus); 2797 cpu_clear(cpu_of(busiest), cpus);
2798 if (!cpus_empty(cpus)) 2798 if (!cpus_empty(cpus))
2799 goto redo; 2799 goto redo;
2800 } 2800 }
2801 } 2801 }
2802 2802
2803 if (!nr_moved) { 2803 if (!nr_moved) {
2804 schedstat_inc(sd, lb_failed[NEWLY_IDLE]); 2804 schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
2805 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2805 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2806 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2806 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2807 return -1; 2807 return -1;
2808 } else 2808 } else
2809 sd->nr_balance_failed = 0; 2809 sd->nr_balance_failed = 0;
2810 2810
2811 return nr_moved; 2811 return nr_moved;
2812 2812
2813 out_balanced: 2813 out_balanced:
2814 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); 2814 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2815 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2815 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2816 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2816 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2817 return -1; 2817 return -1;
2818 sd->nr_balance_failed = 0; 2818 sd->nr_balance_failed = 0;
2819 2819
2820 return 0; 2820 return 0;
2821 } 2821 }
2822 2822
2823 /* 2823 /*
2824 * idle_balance is called by schedule() if this_cpu is about to become 2824 * idle_balance is called by schedule() if this_cpu is about to become
2825 * idle. Attempts to pull tasks from other CPUs. 2825 * idle. Attempts to pull tasks from other CPUs.
2826 */ 2826 */
2827 static void idle_balance(int this_cpu, struct rq *this_rq) 2827 static void idle_balance(int this_cpu, struct rq *this_rq)
2828 { 2828 {
2829 struct sched_domain *sd; 2829 struct sched_domain *sd;
2830 int pulled_task = 0; 2830 int pulled_task = 0;
2831 unsigned long next_balance = jiffies + 60 * HZ; 2831 unsigned long next_balance = jiffies + 60 * HZ;
2832 2832
2833 for_each_domain(this_cpu, sd) { 2833 for_each_domain(this_cpu, sd) {
2834 if (sd->flags & SD_BALANCE_NEWIDLE) { 2834 if (sd->flags & SD_BALANCE_NEWIDLE) {
2835 /* If we've pulled tasks over, stop searching: */ 2835 /* If we've pulled tasks over, stop searching: */
2836 pulled_task = load_balance_newidle(this_cpu, 2836 pulled_task = load_balance_newidle(this_cpu,
2837 this_rq, sd); 2837 this_rq, sd);
2838 if (time_after(next_balance, 2838 if (time_after(next_balance,
2839 sd->last_balance + sd->balance_interval)) 2839 sd->last_balance + sd->balance_interval))
2840 next_balance = sd->last_balance 2840 next_balance = sd->last_balance
2841 + sd->balance_interval; 2841 + sd->balance_interval;
2842 if (pulled_task) 2842 if (pulled_task)
2843 break; 2843 break;
2844 } 2844 }
2845 } 2845 }
2846 if (!pulled_task) 2846 if (!pulled_task)
2847 /* 2847 /*
2848 * We are going idle. next_balance may be set based on 2848 * We are going idle. next_balance may be set based on
2849 * a busy processor. So reset next_balance. 2849 * a busy processor. So reset next_balance.
2850 */ 2850 */
2851 this_rq->next_balance = next_balance; 2851 this_rq->next_balance = next_balance;
2852 } 2852 }
2853 2853
2854 /* 2854 /*
2855 * active_load_balance is run by migration threads. It pushes running tasks 2855 * active_load_balance is run by migration threads. It pushes running tasks
2856 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be 2856 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2857 * running on each physical CPU where possible, and avoids physical / 2857 * running on each physical CPU where possible, and avoids physical /
2858 * logical imbalances. 2858 * logical imbalances.
2859 * 2859 *
2860 * Called with busiest_rq locked. 2860 * Called with busiest_rq locked.
2861 */ 2861 */
2862 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) 2862 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2863 { 2863 {
2864 int target_cpu = busiest_rq->push_cpu; 2864 int target_cpu = busiest_rq->push_cpu;
2865 struct sched_domain *sd; 2865 struct sched_domain *sd;
2866 struct rq *target_rq; 2866 struct rq *target_rq;
2867 2867
2868 /* Is there any task to move? */ 2868 /* Is there any task to move? */
2869 if (busiest_rq->nr_running <= 1) 2869 if (busiest_rq->nr_running <= 1)
2870 return; 2870 return;
2871 2871
2872 target_rq = cpu_rq(target_cpu); 2872 target_rq = cpu_rq(target_cpu);
2873 2873
2874 /* 2874 /*
2875 * This condition is "impossible"; if it occurs 2875 * This condition is "impossible"; if it occurs
2876 * we need to fix it. Originally reported by 2876 * we need to fix it. Originally reported by
2877 * Bjorn Helgaas on a 128-cpu setup. 2877 * Bjorn Helgaas on a 128-cpu setup.
2878 */ 2878 */
2879 BUG_ON(busiest_rq == target_rq); 2879 BUG_ON(busiest_rq == target_rq);
2880 2880
2881 /* move a task from busiest_rq to target_rq */ 2881 /* move a task from busiest_rq to target_rq */
2882 double_lock_balance(busiest_rq, target_rq); 2882 double_lock_balance(busiest_rq, target_rq);
2883 2883
2884 /* Search for an sd spanning us and the target CPU. */ 2884 /* Search for an sd spanning us and the target CPU. */
2885 for_each_domain(target_cpu, sd) { 2885 for_each_domain(target_cpu, sd) {
2886 if ((sd->flags & SD_LOAD_BALANCE) && 2886 if ((sd->flags & SD_LOAD_BALANCE) &&
2887 cpu_isset(busiest_cpu, sd->span)) 2887 cpu_isset(busiest_cpu, sd->span))
2888 break; 2888 break;
2889 } 2889 }
2890 2890
2891 if (likely(sd)) { 2891 if (likely(sd)) {
2892 schedstat_inc(sd, alb_cnt); 2892 schedstat_inc(sd, alb_cnt);
2893 2893
2894 if (move_tasks(target_rq, target_cpu, busiest_rq, 1, 2894 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
2895 RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, 2895 RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE,
2896 NULL)) 2896 NULL))
2897 schedstat_inc(sd, alb_pushed); 2897 schedstat_inc(sd, alb_pushed);
2898 else 2898 else
2899 schedstat_inc(sd, alb_failed); 2899 schedstat_inc(sd, alb_failed);
2900 } 2900 }
2901 spin_unlock(&target_rq->lock); 2901 spin_unlock(&target_rq->lock);
2902 } 2902 }
2903 2903
2904 static void update_load(struct rq *this_rq) 2904 static void update_load(struct rq *this_rq)
2905 { 2905 {
2906 unsigned long this_load; 2906 unsigned long this_load;
2907 unsigned int i, scale; 2907 unsigned int i, scale;
2908 2908
2909 this_load = this_rq->raw_weighted_load; 2909 this_load = this_rq->raw_weighted_load;
2910 2910
2911 /* Update our load: */ 2911 /* Update our load: */
2912 for (i = 0, scale = 1; i < 3; i++, scale += scale) { 2912 for (i = 0, scale = 1; i < 3; i++, scale += scale) {
2913 unsigned long old_load, new_load; 2913 unsigned long old_load, new_load;
2914 2914
2915 /* scale is effectively 1 << i now, and >> i divides by scale */ 2915 /* scale is effectively 1 << i now, and >> i divides by scale */
2916 2916
2917 old_load = this_rq->cpu_load[i]; 2917 old_load = this_rq->cpu_load[i];
2918 new_load = this_load; 2918 new_load = this_load;
2919 /* 2919 /*
2920 * Round up the averaging division if load is increasing. This 2920 * Round up the averaging division if load is increasing. This
2921 * prevents us from getting stuck on 9 if the load is 10, for 2921 * prevents us from getting stuck on 9 if the load is 10, for
2922 * example. 2922 * example.
2923 */ 2923 */
2924 if (new_load > old_load) 2924 if (new_load > old_load)
2925 new_load += scale-1; 2925 new_load += scale-1;
2926 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 2926 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
2927 } 2927 }
2928 } 2928 }
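update_load() above keeps several exponentially decaying averages of the runqueue load, with index i giving the old value a weight of (2^i - 1)/2^i. A small userspace sketch, under illustrative inputs, showing why the round-up matters when the load is rising:

#include <stdio.h>

static unsigned long decay(unsigned long old_load, unsigned long new_load,
			   unsigned int i)
{
	unsigned long scale = 1UL << i;		/* 1, 2, 4, ... */

	if (new_load > old_load)
		new_load += scale - 1;		/* round the division up */

	return (old_load * (scale - 1) + new_load) >> i;
}

int main(void)
{
	unsigned long load = 0;

	/* feed a constant load of 10 into the i == 1 average */
	for (int tick = 0; tick < 6; tick++) {
		load = decay(load, 10, 1);
		printf("tick %d: %lu\n", tick, load);	/* 5 8 9 10 10 10 */
	}
	/* without the round-up the sequence would stall at 9 */
	return 0;
}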
2929 2929
2930 /* 2930 /*
2931 * run_rebalance_domains is triggered when needed from the scheduler tick. 2931 * run_rebalance_domains is triggered when needed from the scheduler tick.
2932 * 2932 *
2933 * It checks each scheduling domain to see if it is due to be balanced, 2933 * It checks each scheduling domain to see if it is due to be balanced,
2934 * and initiates a balancing operation if so. 2934 * and initiates a balancing operation if so.
2935 * 2935 *
2936 * Balancing parameters are set up in arch_init_sched_domains. 2936 * Balancing parameters are set up in arch_init_sched_domains.
2937 */ 2937 */
2938 static DEFINE_SPINLOCK(balancing); 2938 static DEFINE_SPINLOCK(balancing);
2939 2939
2940 static void run_rebalance_domains(struct softirq_action *h) 2940 static void run_rebalance_domains(struct softirq_action *h)
2941 { 2941 {
2942 int this_cpu = smp_processor_id(), balance = 1; 2942 int this_cpu = smp_processor_id(), balance = 1;
2943 struct rq *this_rq = cpu_rq(this_cpu); 2943 struct rq *this_rq = cpu_rq(this_cpu);
2944 unsigned long interval; 2944 unsigned long interval;
2945 struct sched_domain *sd; 2945 struct sched_domain *sd;
2946 /* 2946 /*
2947 * We are idle if there are no processes running. This 2947 * We are idle if there are no processes running. This
2948 * is valid even if we are the idle process (SMT). 2948 * is valid even if we are the idle process (SMT).
2949 */ 2949 */
2950 enum idle_type idle = !this_rq->nr_running ? 2950 enum idle_type idle = !this_rq->nr_running ?
2951 SCHED_IDLE : NOT_IDLE; 2951 SCHED_IDLE : NOT_IDLE;
2952 /* Earliest time when we have to call run_rebalance_domains again */ 2952 /* Earliest time when we have to call run_rebalance_domains again */
2953 unsigned long next_balance = jiffies + 60*HZ; 2953 unsigned long next_balance = jiffies + 60*HZ;
2954 2954
2955 for_each_domain(this_cpu, sd) { 2955 for_each_domain(this_cpu, sd) {
2956 if (!(sd->flags & SD_LOAD_BALANCE)) 2956 if (!(sd->flags & SD_LOAD_BALANCE))
2957 continue; 2957 continue;
2958 2958
2959 interval = sd->balance_interval; 2959 interval = sd->balance_interval;
2960 if (idle != SCHED_IDLE) 2960 if (idle != SCHED_IDLE)
2961 interval *= sd->busy_factor; 2961 interval *= sd->busy_factor;
2962 2962
2963 /* scale ms to jiffies */ 2963 /* scale ms to jiffies */
2964 interval = msecs_to_jiffies(interval); 2964 interval = msecs_to_jiffies(interval);
2965 if (unlikely(!interval)) 2965 if (unlikely(!interval))
2966 interval = 1; 2966 interval = 1;
2967 2967
2968 if (sd->flags & SD_SERIALIZE) { 2968 if (sd->flags & SD_SERIALIZE) {
2969 if (!spin_trylock(&balancing)) 2969 if (!spin_trylock(&balancing))
2970 goto out; 2970 goto out;
2971 } 2971 }
2972 2972
2973 if (time_after_eq(jiffies, sd->last_balance + interval)) { 2973 if (time_after_eq(jiffies, sd->last_balance + interval)) {
2974 if (load_balance(this_cpu, this_rq, sd, idle, &balance)) { 2974 if (load_balance(this_cpu, this_rq, sd, idle, &balance)) {
2975 /* 2975 /*
2976 * We've pulled tasks over so either we're no 2976 * We've pulled tasks over so either we're no
2977 * longer idle, or one of our SMT siblings is 2977 * longer idle, or one of our SMT siblings is
2978 * not idle. 2978 * not idle.
2979 */ 2979 */
2980 idle = NOT_IDLE; 2980 idle = NOT_IDLE;
2981 } 2981 }
2982 sd->last_balance = jiffies; 2982 sd->last_balance = jiffies;
2983 } 2983 }
2984 if (sd->flags & SD_SERIALIZE) 2984 if (sd->flags & SD_SERIALIZE)
2985 spin_unlock(&balancing); 2985 spin_unlock(&balancing);
2986 out: 2986 out:
2987 if (time_after(next_balance, sd->last_balance + interval)) 2987 if (time_after(next_balance, sd->last_balance + interval))
2988 next_balance = sd->last_balance + interval; 2988 next_balance = sd->last_balance + interval;
2989 2989
2990 /* 2990 /*
2991 * Stop the load balance at this level. There is another 2991 * Stop the load balance at this level. There is another
2992 * CPU in our sched group which is doing load balancing more 2992 * CPU in our sched group which is doing load balancing more
2993 * actively. 2993 * actively.
2994 */ 2994 */
2995 if (!balance) 2995 if (!balance)
2996 break; 2996 break;
2997 } 2997 }
2998 this_rq->next_balance = next_balance; 2998 this_rq->next_balance = next_balance;
2999 } 2999 }
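The per-domain interval used above is the base balance_interval in milliseconds, stretched by busy_factor when the CPU is not idle, converted to jiffies, and floored at one jiffy. A userspace sketch of that computation; TOY_HZ and toy_msecs_to_jiffies() are stand-ins, not the kernel's helpers:

#include <stdio.h>

#define TOY_HZ 250

static unsigned long toy_msecs_to_jiffies(unsigned long ms)
{
	return (ms * TOY_HZ + 999) / 1000;	/* round up */
}

static unsigned long rebalance_interval(unsigned long interval_ms,
					unsigned int busy_factor, int idle)
{
	unsigned long interval = interval_ms;

	if (!idle)
		interval *= busy_factor;	/* balance less often when busy */

	interval = toy_msecs_to_jiffies(interval);
	if (!interval)
		interval = 1;			/* never drop below one jiffy */
	return interval;
}

int main(void)
{
	printf("idle: %lu jiffies\n", rebalance_interval(8, 32, 1));	/* 2 */
	printf("busy: %lu jiffies\n", rebalance_interval(8, 32, 0));	/* 64 */
	return 0;
}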
3000 #else 3000 #else
3001 /* 3001 /*
3002 * on UP we do not need to balance between CPUs: 3002 * on UP we do not need to balance between CPUs:
3003 */ 3003 */
3004 static inline void idle_balance(int cpu, struct rq *rq) 3004 static inline void idle_balance(int cpu, struct rq *rq)
3005 { 3005 {
3006 } 3006 }
3007 #endif 3007 #endif
3008 3008
3009 DEFINE_PER_CPU(struct kernel_stat, kstat); 3009 DEFINE_PER_CPU(struct kernel_stat, kstat);
3010 3010
3011 EXPORT_PER_CPU_SYMBOL(kstat); 3011 EXPORT_PER_CPU_SYMBOL(kstat);
3012 3012
3013 /* 3013 /*
3014 * This is called on clock ticks and on context switches. 3014 * This is called on clock ticks and on context switches.
3015 * Bank in p->sched_time the ns elapsed since the last tick or switch. 3015 * Bank in p->sched_time the ns elapsed since the last tick or switch.
3016 */ 3016 */
3017 static inline void 3017 static inline void
3018 update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) 3018 update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
3019 { 3019 {
3020 p->sched_time += now - p->last_ran; 3020 p->sched_time += now - p->last_ran;
3021 p->last_ran = rq->most_recent_timestamp = now; 3021 p->last_ran = rq->most_recent_timestamp = now;
3022 } 3022 }
3023 3023
3024 /* 3024 /*
3025 * Return current->sched_time plus any more ns on the sched_clock 3025 * Return current->sched_time plus any more ns on the sched_clock
3026 * that have not yet been banked. 3026 * that have not yet been banked.
3027 */ 3027 */
3028 unsigned long long current_sched_time(const struct task_struct *p) 3028 unsigned long long current_sched_time(const struct task_struct *p)
3029 { 3029 {
3030 unsigned long long ns; 3030 unsigned long long ns;
3031 unsigned long flags; 3031 unsigned long flags;
3032 3032
3033 local_irq_save(flags); 3033 local_irq_save(flags);
3034 ns = p->sched_time + sched_clock() - p->last_ran; 3034 ns = p->sched_time + sched_clock() - p->last_ran;
3035 local_irq_restore(flags); 3035 local_irq_restore(flags);
3036 3036
3037 return ns; 3037 return ns;
3038 } 3038 }
3039 3039
3040 /* 3040 /*
3041 * We place interactive tasks back into the active array, if possible. 3041 * We place interactive tasks back into the active array, if possible.
3042 * 3042 *
3043 * To guarantee that this does not starve expired tasks we ignore the 3043 * To guarantee that this does not starve expired tasks we ignore the
3044 * interactivity of a task if the first expired task had to wait more 3044 * interactivity of a task if the first expired task had to wait more
3045 * than a 'reasonable' amount of time. This deadline timeout is 3045 * than a 'reasonable' amount of time. This deadline timeout is
3046 * load-dependent, as the frequency of array switches decreases with 3046 * load-dependent, as the frequency of array switches decreases with
3047 * an increasing number of running tasks. We also ignore the interactivity 3047 * an increasing number of running tasks. We also ignore the interactivity
3048 * if a better static_prio task has expired: 3048 * if a better static_prio task has expired:
3049 */ 3049 */
3050 static inline int expired_starving(struct rq *rq) 3050 static inline int expired_starving(struct rq *rq)
3051 { 3051 {
3052 if (rq->curr->static_prio > rq->best_expired_prio) 3052 if (rq->curr->static_prio > rq->best_expired_prio)
3053 return 1; 3053 return 1;
3054 if (!STARVATION_LIMIT || !rq->expired_timestamp) 3054 if (!STARVATION_LIMIT || !rq->expired_timestamp)
3055 return 0; 3055 return 0;
3056 if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running) 3056 if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
3057 return 1; 3057 return 1;
3058 return 0; 3058 return 0;
3059 } 3059 }
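A quick worked example of the load-dependent deadline described above (the numbers are purely illustrative): if STARVATION_LIMIT were 100 jiffies and the runqueue had 4 running tasks, expired_starving() would start returning 1 once the first expired task had waited more than 100 * 4 = 400 jiffies; from that point task_running_tick() stops reinserting interactive tasks into the active array and queues them to the expired array, so the starved waiters eventually get the CPU.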
3060 3060
3061 /* 3061 /*
3062 * Account user cpu time to a process. 3062 * Account user cpu time to a process.
3063 * @p: the process that the cpu time gets accounted to 3063 * @p: the process that the cpu time gets accounted to
3065 * @cputime: the cpu time spent in user space since the last update 3065 * @cputime: the cpu time spent in user space since the last update
3066 */ 3066 */
3067 void account_user_time(struct task_struct *p, cputime_t cputime) 3067 void account_user_time(struct task_struct *p, cputime_t cputime)
3068 { 3068 {
3069 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3069 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3070 cputime64_t tmp; 3070 cputime64_t tmp;
3071 3071
3072 p->utime = cputime_add(p->utime, cputime); 3072 p->utime = cputime_add(p->utime, cputime);
3073 3073
3074 /* Add user time to cpustat. */ 3074 /* Add user time to cpustat. */
3075 tmp = cputime_to_cputime64(cputime); 3075 tmp = cputime_to_cputime64(cputime);
3076 if (TASK_NICE(p) > 0) 3076 if (TASK_NICE(p) > 0)
3077 cpustat->nice = cputime64_add(cpustat->nice, tmp); 3077 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3078 else 3078 else
3079 cpustat->user = cputime64_add(cpustat->user, tmp); 3079 cpustat->user = cputime64_add(cpustat->user, tmp);
3080 } 3080 }
3081 3081
3082 /* 3082 /*
3083 * Account system cpu time to a process. 3083 * Account system cpu time to a process.
3084 * @p: the process that the cpu time gets accounted to 3084 * @p: the process that the cpu time gets accounted to
3085 * @hardirq_offset: the offset to subtract from hardirq_count() 3085 * @hardirq_offset: the offset to subtract from hardirq_count()
3086 * @cputime: the cpu time spent in kernel space since the last update 3086 * @cputime: the cpu time spent in kernel space since the last update
3087 */ 3087 */
3088 void account_system_time(struct task_struct *p, int hardirq_offset, 3088 void account_system_time(struct task_struct *p, int hardirq_offset,
3089 cputime_t cputime) 3089 cputime_t cputime)
3090 { 3090 {
3091 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3091 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3092 struct rq *rq = this_rq(); 3092 struct rq *rq = this_rq();
3093 cputime64_t tmp; 3093 cputime64_t tmp;
3094 3094
3095 p->stime = cputime_add(p->stime, cputime); 3095 p->stime = cputime_add(p->stime, cputime);
3096 3096
3097 /* Add system time to cpustat. */ 3097 /* Add system time to cpustat. */
3098 tmp = cputime_to_cputime64(cputime); 3098 tmp = cputime_to_cputime64(cputime);
3099 if (hardirq_count() - hardirq_offset) 3099 if (hardirq_count() - hardirq_offset)
3100 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3100 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3101 else if (softirq_count()) 3101 else if (softirq_count())
3102 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3102 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3103 else if (p != rq->idle) 3103 else if (p != rq->idle)
3104 cpustat->system = cputime64_add(cpustat->system, tmp); 3104 cpustat->system = cputime64_add(cpustat->system, tmp);
3105 else if (atomic_read(&rq->nr_iowait) > 0) 3105 else if (atomic_read(&rq->nr_iowait) > 0)
3106 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 3106 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3107 else 3107 else
3108 cpustat->idle = cputime64_add(cpustat->idle, tmp); 3108 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3109 /* Account for system time used */ 3109 /* Account for system time used */
3110 acct_update_integrals(p); 3110 acct_update_integrals(p);
3111 } 3111 }
3112 3112
3113 /* 3113 /*
3114 * Account for involuntary wait time. 3114 * Account for involuntary wait time.
3115 * @p: the process from which the cpu time has been stolen 3115 * @p: the process from which the cpu time has been stolen
3116 * @steal: the cpu time spent in involuntary wait 3116 * @steal: the cpu time spent in involuntary wait
3117 */ 3117 */
3118 void account_steal_time(struct task_struct *p, cputime_t steal) 3118 void account_steal_time(struct task_struct *p, cputime_t steal)
3119 { 3119 {
3120 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3120 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3121 cputime64_t tmp = cputime_to_cputime64(steal); 3121 cputime64_t tmp = cputime_to_cputime64(steal);
3122 struct rq *rq = this_rq(); 3122 struct rq *rq = this_rq();
3123 3123
3124 if (p == rq->idle) { 3124 if (p == rq->idle) {
3125 p->stime = cputime_add(p->stime, steal); 3125 p->stime = cputime_add(p->stime, steal);
3126 if (atomic_read(&rq->nr_iowait) > 0) 3126 if (atomic_read(&rq->nr_iowait) > 0)
3127 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 3127 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3128 else 3128 else
3129 cpustat->idle = cputime64_add(cpustat->idle, tmp); 3129 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3130 } else 3130 } else
3131 cpustat->steal = cputime64_add(cpustat->steal, tmp); 3131 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3132 } 3132 }
3133 3133
3134 static void task_running_tick(struct rq *rq, struct task_struct *p) 3134 static void task_running_tick(struct rq *rq, struct task_struct *p)
3135 { 3135 {
3136 if (p->array != rq->active) { 3136 if (p->array != rq->active) {
3137 /* Task has expired but was not scheduled yet */ 3137 /* Task has expired but was not scheduled yet */
3138 set_tsk_need_resched(p); 3138 set_tsk_need_resched(p);
3139 return; 3139 return;
3140 } 3140 }
3141 spin_lock(&rq->lock); 3141 spin_lock(&rq->lock);
3142 /* 3142 /*
3143 * The task was running during this tick - update the 3143 * The task was running during this tick - update the
3144 * time slice counter. Note: we do not update a thread's 3144 * time slice counter. Note: we do not update a thread's
3145 * priority until it either goes to sleep or uses up its 3145 * priority until it either goes to sleep or uses up its
3146 * timeslice. This makes it possible for interactive tasks 3146 * timeslice. This makes it possible for interactive tasks
3147 * to use up their timeslices at their highest priority levels. 3147 * to use up their timeslices at their highest priority levels.
3148 */ 3148 */
3149 if (rt_task(p)) { 3149 if (rt_task(p)) {
3150 /* 3150 /*
3151 * RR tasks need a special form of timeslice management. 3151 * RR tasks need a special form of timeslice management.
3152 * FIFO tasks have no timeslices. 3152 * FIFO tasks have no timeslices.
3153 */ 3153 */
3154 if ((p->policy == SCHED_RR) && !--p->time_slice) { 3154 if ((p->policy == SCHED_RR) && !--p->time_slice) {
3155 p->time_slice = task_timeslice(p); 3155 p->time_slice = task_timeslice(p);
3156 p->first_time_slice = 0; 3156 p->first_time_slice = 0;
3157 set_tsk_need_resched(p); 3157 set_tsk_need_resched(p);
3158 3158
3159 /* put it at the end of the queue: */ 3159 /* put it at the end of the queue: */
3160 requeue_task(p, rq->active); 3160 requeue_task(p, rq->active);
3161 } 3161 }
3162 goto out_unlock; 3162 goto out_unlock;
3163 } 3163 }
3164 if (!--p->time_slice) { 3164 if (!--p->time_slice) {
3165 dequeue_task(p, rq->active); 3165 dequeue_task(p, rq->active);
3166 set_tsk_need_resched(p); 3166 set_tsk_need_resched(p);
3167 p->prio = effective_prio(p); 3167 p->prio = effective_prio(p);
3168 p->time_slice = task_timeslice(p); 3168 p->time_slice = task_timeslice(p);
3169 p->first_time_slice = 0; 3169 p->first_time_slice = 0;
3170 3170
3171 if (!rq->expired_timestamp) 3171 if (!rq->expired_timestamp)
3172 rq->expired_timestamp = jiffies; 3172 rq->expired_timestamp = jiffies;
3173 if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { 3173 if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
3174 enqueue_task(p, rq->expired); 3174 enqueue_task(p, rq->expired);
3175 if (p->static_prio < rq->best_expired_prio) 3175 if (p->static_prio < rq->best_expired_prio)
3176 rq->best_expired_prio = p->static_prio; 3176 rq->best_expired_prio = p->static_prio;
3177 } else 3177 } else
3178 enqueue_task(p, rq->active); 3178 enqueue_task(p, rq->active);
3179 } else { 3179 } else {
3180 /* 3180 /*
3181 * Prevent a too long timeslice allowing a task to monopolize 3181 * Prevent a too long timeslice allowing a task to monopolize
3182 * the CPU. We do this by splitting up the timeslice into 3182 * the CPU. We do this by splitting up the timeslice into
3183 * smaller pieces. 3183 * smaller pieces.
3184 * 3184 *
3185 * Note: this does not mean the task's timeslices expire or 3185 * Note: this does not mean the task's timeslices expire or
3186 * get lost in any way, they just might be preempted by 3186 * get lost in any way, they just might be preempted by
3187 * another task of equal priority. (one with higher 3187 * another task of equal priority. (one with higher
3188 * priority would have preempted this task already.) We 3188 * priority would have preempted this task already.) We
3189 * requeue this task to the end of the list on this priority 3189 * requeue this task to the end of the list on this priority
3190 * level, which is in essence a round-robin of tasks with 3190 * level, which is in essence a round-robin of tasks with
3191 * equal priority. 3191 * equal priority.
3192 * 3192 *
3193 * This only applies to tasks in the interactive 3193 * This only applies to tasks in the interactive
3194 * delta range with at least TIMESLICE_GRANULARITY to requeue. 3194 * delta range with at least TIMESLICE_GRANULARITY to requeue.
3195 */ 3195 */
3196 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - 3196 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
3197 p->time_slice) % TIMESLICE_GRANULARITY(p)) && 3197 p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
3198 (p->time_slice >= TIMESLICE_GRANULARITY(p)) && 3198 (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
3199 (p->array == rq->active)) { 3199 (p->array == rq->active)) {
3200 3200
3201 requeue_task(p, rq->active); 3201 requeue_task(p, rq->active);
3202 set_tsk_need_resched(p); 3202 set_tsk_need_resched(p);
3203 } 3203 }
3204 } 3204 }
3205 out_unlock: 3205 out_unlock:
3206 spin_unlock(&rq->lock); 3206 spin_unlock(&rq->lock);
3207 } 3207 }
3208 3208
3209 /* 3209 /*
3210 * This function gets called by the timer code, with HZ frequency. 3210 * This function gets called by the timer code, with HZ frequency.
3211 * We call it with interrupts disabled. 3211 * We call it with interrupts disabled.
3212 * 3212 *
3213 * It also gets called by the fork code, when changing the parent's 3213 * It also gets called by the fork code, when changing the parent's
3214 * timeslices. 3214 * timeslices.
3215 */ 3215 */
3216 void scheduler_tick(void) 3216 void scheduler_tick(void)
3217 { 3217 {
3218 unsigned long long now = sched_clock(); 3218 unsigned long long now = sched_clock();
3219 struct task_struct *p = current; 3219 struct task_struct *p = current;
3220 int cpu = smp_processor_id(); 3220 int cpu = smp_processor_id();
3221 struct rq *rq = cpu_rq(cpu); 3221 struct rq *rq = cpu_rq(cpu);
3222 3222
3223 update_cpu_clock(p, rq, now); 3223 update_cpu_clock(p, rq, now);
3224 3224
3225 if (p != rq->idle) 3225 if (p != rq->idle)
3226 task_running_tick(rq, p); 3226 task_running_tick(rq, p);
3227 #ifdef CONFIG_SMP 3227 #ifdef CONFIG_SMP
3228 update_load(rq); 3228 update_load(rq);
3229 if (time_after_eq(jiffies, rq->next_balance)) 3229 if (time_after_eq(jiffies, rq->next_balance))
3230 raise_softirq(SCHED_SOFTIRQ); 3230 raise_softirq(SCHED_SOFTIRQ);
3231 #endif 3231 #endif
3232 } 3232 }
3233 3233
3234 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) 3234 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3235 3235
3236 void fastcall add_preempt_count(int val) 3236 void fastcall add_preempt_count(int val)
3237 { 3237 {
3238 /* 3238 /*
3239 * Underflow? 3239 * Underflow?
3240 */ 3240 */
3241 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 3241 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3242 return; 3242 return;
3243 preempt_count() += val; 3243 preempt_count() += val;
3244 /* 3244 /*
3245 * Spinlock count overflowing soon? 3245 * Spinlock count overflowing soon?
3246 */ 3246 */
3247 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 3247 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3248 PREEMPT_MASK - 10); 3248 PREEMPT_MASK - 10);
3249 } 3249 }
3250 EXPORT_SYMBOL(add_preempt_count); 3250 EXPORT_SYMBOL(add_preempt_count);
3251 3251
3252 void fastcall sub_preempt_count(int val) 3252 void fastcall sub_preempt_count(int val)
3253 { 3253 {
3254 /* 3254 /*
3255 * Underflow? 3255 * Underflow?
3256 */ 3256 */
3257 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 3257 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3258 return; 3258 return;
3259 /* 3259 /*
3260 * Is the spinlock portion underflowing? 3260 * Is the spinlock portion underflowing?
3261 */ 3261 */
3262 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 3262 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3263 !(preempt_count() & PREEMPT_MASK))) 3263 !(preempt_count() & PREEMPT_MASK)))
3264 return; 3264 return;
3265 3265
3266 preempt_count() -= val; 3266 preempt_count() -= val;
3267 } 3267 }
3268 EXPORT_SYMBOL(sub_preempt_count); 3268 EXPORT_SYMBOL(sub_preempt_count);
3269 3269
3270 #endif 3270 #endif
3271 3271
3272 static inline int interactive_sleep(enum sleep_type sleep_type) 3272 static inline int interactive_sleep(enum sleep_type sleep_type)
3273 { 3273 {
3274 return (sleep_type == SLEEP_INTERACTIVE || 3274 return (sleep_type == SLEEP_INTERACTIVE ||
3275 sleep_type == SLEEP_INTERRUPTED); 3275 sleep_type == SLEEP_INTERRUPTED);
3276 } 3276 }
3277 3277
3278 /* 3278 /*
3279 * schedule() is the main scheduler function. 3279 * schedule() is the main scheduler function.
3280 */ 3280 */
3281 asmlinkage void __sched schedule(void) 3281 asmlinkage void __sched schedule(void)
3282 { 3282 {
3283 struct task_struct *prev, *next; 3283 struct task_struct *prev, *next;
3284 struct prio_array *array; 3284 struct prio_array *array;
3285 struct list_head *queue; 3285 struct list_head *queue;
3286 unsigned long long now; 3286 unsigned long long now;
3287 unsigned long run_time; 3287 unsigned long run_time;
3288 int cpu, idx, new_prio; 3288 int cpu, idx, new_prio;
3289 long *switch_count; 3289 long *switch_count;
3290 struct rq *rq; 3290 struct rq *rq;
3291 3291
3292 /* 3292 /*
3293 * Test if we are atomic. Since do_exit() needs to call into 3293 * Test if we are atomic. Since do_exit() needs to call into
3294 * schedule() atomically, we ignore that path for now. 3294 * schedule() atomically, we ignore that path for now.
3295 * Otherwise, whine if we are scheduling when we should not be. 3295 * Otherwise, whine if we are scheduling when we should not be.
3296 */ 3296 */
3297 if (unlikely(in_atomic() && !current->exit_state)) { 3297 if (unlikely(in_atomic() && !current->exit_state)) {
3298 printk(KERN_ERR "BUG: scheduling while atomic: " 3298 printk(KERN_ERR "BUG: scheduling while atomic: "
3299 "%s/0x%08x/%d\n", 3299 "%s/0x%08x/%d\n",
3300 current->comm, preempt_count(), current->pid); 3300 current->comm, preempt_count(), current->pid);
3301 debug_show_held_locks(current); 3301 debug_show_held_locks(current);
3302 if (irqs_disabled()) 3302 if (irqs_disabled())
3303 print_irqtrace_events(current); 3303 print_irqtrace_events(current);
3304 dump_stack(); 3304 dump_stack();
3305 } 3305 }
3306 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3306 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3307 3307
3308 need_resched: 3308 need_resched:
3309 preempt_disable(); 3309 preempt_disable();
3310 prev = current; 3310 prev = current;
3311 release_kernel_lock(prev); 3311 release_kernel_lock(prev);
3312 need_resched_nonpreemptible: 3312 need_resched_nonpreemptible:
3313 rq = this_rq(); 3313 rq = this_rq();
3314 3314
3315 /* 3315 /*
3316 * The idle thread is not allowed to schedule! 3316 * The idle thread is not allowed to schedule!
3317 * Remove this check after it has been exercised a bit. 3317 * Remove this check after it has been exercised a bit.
3318 */ 3318 */
3319 if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) { 3319 if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
3320 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 3320 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
3321 dump_stack(); 3321 dump_stack();
3322 } 3322 }
3323 3323
3324 schedstat_inc(rq, sched_cnt); 3324 schedstat_inc(rq, sched_cnt);
3325 now = sched_clock(); 3325 now = sched_clock();
3326 if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { 3326 if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
3327 run_time = now - prev->timestamp; 3327 run_time = now - prev->timestamp;
3328 if (unlikely((long long)(now - prev->timestamp) < 0)) 3328 if (unlikely((long long)(now - prev->timestamp) < 0))
3329 run_time = 0; 3329 run_time = 0;
3330 } else 3330 } else
3331 run_time = NS_MAX_SLEEP_AVG; 3331 run_time = NS_MAX_SLEEP_AVG;
3332 3332
3333 /* 3333 /*
3334 * Tasks are charged proportionately less run_time at high sleep_avg to 3334 * Tasks are charged proportionately less run_time at high sleep_avg to
3335 * delay them losing their interactive status 3335 * delay them losing their interactive status
3336 */ 3336 */
3337 run_time /= (CURRENT_BONUS(prev) ? : 1); 3337 run_time /= (CURRENT_BONUS(prev) ? : 1);
3338 3338
3339 spin_lock_irq(&rq->lock); 3339 spin_lock_irq(&rq->lock);
3340 3340
3341 switch_count = &prev->nivcsw; 3341 switch_count = &prev->nivcsw;
3342 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3342 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3343 switch_count = &prev->nvcsw; 3343 switch_count = &prev->nvcsw;
3344 if (unlikely((prev->state & TASK_INTERRUPTIBLE) && 3344 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3345 unlikely(signal_pending(prev)))) 3345 unlikely(signal_pending(prev))))
3346 prev->state = TASK_RUNNING; 3346 prev->state = TASK_RUNNING;
3347 else { 3347 else {
3348 if (prev->state == TASK_UNINTERRUPTIBLE) 3348 if (prev->state == TASK_UNINTERRUPTIBLE)
3349 rq->nr_uninterruptible++; 3349 rq->nr_uninterruptible++;
3350 deactivate_task(prev, rq); 3350 deactivate_task(prev, rq);
3351 } 3351 }
3352 } 3352 }
3353 3353
3354 cpu = smp_processor_id(); 3354 cpu = smp_processor_id();
3355 if (unlikely(!rq->nr_running)) { 3355 if (unlikely(!rq->nr_running)) {
3356 idle_balance(cpu, rq); 3356 idle_balance(cpu, rq);
3357 if (!rq->nr_running) { 3357 if (!rq->nr_running) {
3358 next = rq->idle; 3358 next = rq->idle;
3359 rq->expired_timestamp = 0; 3359 rq->expired_timestamp = 0;
3360 goto switch_tasks; 3360 goto switch_tasks;
3361 } 3361 }
3362 } 3362 }
3363 3363
3364 array = rq->active; 3364 array = rq->active;
3365 if (unlikely(!array->nr_active)) { 3365 if (unlikely(!array->nr_active)) {
3366 /* 3366 /*
3367 * Switch the active and expired arrays. 3367 * Switch the active and expired arrays.
3368 */ 3368 */
3369 schedstat_inc(rq, sched_switch); 3369 schedstat_inc(rq, sched_switch);
3370 rq->active = rq->expired; 3370 rq->active = rq->expired;
3371 rq->expired = array; 3371 rq->expired = array;
3372 array = rq->active; 3372 array = rq->active;
3373 rq->expired_timestamp = 0; 3373 rq->expired_timestamp = 0;
3374 rq->best_expired_prio = MAX_PRIO; 3374 rq->best_expired_prio = MAX_PRIO;
3375 } 3375 }
3376 3376
3377 idx = sched_find_first_bit(array->bitmap); 3377 idx = sched_find_first_bit(array->bitmap);
3378 queue = array->queue + idx; 3378 queue = array->queue + idx;
3379 next = list_entry(queue->next, struct task_struct, run_list); 3379 next = list_entry(queue->next, struct task_struct, run_list);
3380 3380
3381 if (!rt_task(next) && interactive_sleep(next->sleep_type)) { 3381 if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
3382 unsigned long long delta = now - next->timestamp; 3382 unsigned long long delta = now - next->timestamp;
3383 if (unlikely((long long)(now - next->timestamp) < 0)) 3383 if (unlikely((long long)(now - next->timestamp) < 0))
3384 delta = 0; 3384 delta = 0;
3385 3385
3386 if (next->sleep_type == SLEEP_INTERACTIVE) 3386 if (next->sleep_type == SLEEP_INTERACTIVE)
3387 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; 3387 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
3388 3388
3389 array = next->array; 3389 array = next->array;
3390 new_prio = recalc_task_prio(next, next->timestamp + delta); 3390 new_prio = recalc_task_prio(next, next->timestamp + delta);
3391 3391
3392 if (unlikely(next->prio != new_prio)) { 3392 if (unlikely(next->prio != new_prio)) {
3393 dequeue_task(next, array); 3393 dequeue_task(next, array);
3394 next->prio = new_prio; 3394 next->prio = new_prio;
3395 enqueue_task(next, array); 3395 enqueue_task(next, array);
3396 } 3396 }
3397 } 3397 }
3398 next->sleep_type = SLEEP_NORMAL; 3398 next->sleep_type = SLEEP_NORMAL;
3399 switch_tasks: 3399 switch_tasks:
3400 if (next == rq->idle) 3400 if (next == rq->idle)
3401 schedstat_inc(rq, sched_goidle); 3401 schedstat_inc(rq, sched_goidle);
3402 prefetch(next); 3402 prefetch(next);
3403 prefetch_stack(next); 3403 prefetch_stack(next);
3404 clear_tsk_need_resched(prev); 3404 clear_tsk_need_resched(prev);
3405 rcu_qsctr_inc(task_cpu(prev)); 3405 rcu_qsctr_inc(task_cpu(prev));
3406 3406
3407 update_cpu_clock(prev, rq, now); 3407 update_cpu_clock(prev, rq, now);
3408 3408
3409 prev->sleep_avg -= run_time; 3409 prev->sleep_avg -= run_time;
3410 if ((long)prev->sleep_avg <= 0) 3410 if ((long)prev->sleep_avg <= 0)
3411 prev->sleep_avg = 0; 3411 prev->sleep_avg = 0;
3412 prev->timestamp = prev->last_ran = now; 3412 prev->timestamp = prev->last_ran = now;
3413 3413
3414 sched_info_switch(prev, next); 3414 sched_info_switch(prev, next);
3415 if (likely(prev != next)) { 3415 if (likely(prev != next)) {
3416 next->timestamp = next->last_ran = now; 3416 next->timestamp = next->last_ran = now;
3417 rq->nr_switches++; 3417 rq->nr_switches++;
3418 rq->curr = next; 3418 rq->curr = next;
3419 ++*switch_count; 3419 ++*switch_count;
3420 3420
3421 prepare_task_switch(rq, next); 3421 prepare_task_switch(rq, next);
3422 prev = context_switch(rq, prev, next); 3422 prev = context_switch(rq, prev, next);
3423 barrier(); 3423 barrier();
3424 /* 3424 /*
3425 * this_rq must be evaluated again because prev may have moved 3425 * this_rq must be evaluated again because prev may have moved
3426 * CPUs since it called schedule(), thus the 'rq' on its stack 3426 * CPUs since it called schedule(), thus the 'rq' on its stack
3427 * frame will be invalid. 3427 * frame will be invalid.
3428 */ 3428 */
3429 finish_task_switch(this_rq(), prev); 3429 finish_task_switch(this_rq(), prev);
3430 } else 3430 } else
3431 spin_unlock_irq(&rq->lock); 3431 spin_unlock_irq(&rq->lock);
3432 3432
3433 prev = current; 3433 prev = current;
3434 if (unlikely(reacquire_kernel_lock(prev) < 0)) 3434 if (unlikely(reacquire_kernel_lock(prev) < 0))
3435 goto need_resched_nonpreemptible; 3435 goto need_resched_nonpreemptible;
3436 preempt_enable_no_resched(); 3436 preempt_enable_no_resched();
3437 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3437 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3438 goto need_resched; 3438 goto need_resched;
3439 } 3439 }
3440 EXPORT_SYMBOL(schedule); 3440 EXPORT_SYMBOL(schedule);
3441 3441
3442 #ifdef CONFIG_PREEMPT 3442 #ifdef CONFIG_PREEMPT
3443 /* 3443 /*
3444 * this is the entry point to schedule() from in-kernel preemption 3444 * this is the entry point to schedule() from in-kernel preemption
3445 * off of preempt_enable. Kernel preemptions off the return-from-interrupt 3445 * off of preempt_enable. Kernel preemptions off the return-from-interrupt
3446 * path occur in preempt_schedule_irq() below, which calls schedule() directly. 3446 * path occur in preempt_schedule_irq() below, which calls schedule() directly.
3447 */ 3447 */
3448 asmlinkage void __sched preempt_schedule(void) 3448 asmlinkage void __sched preempt_schedule(void)
3449 { 3449 {
3450 struct thread_info *ti = current_thread_info(); 3450 struct thread_info *ti = current_thread_info();
3451 #ifdef CONFIG_PREEMPT_BKL 3451 #ifdef CONFIG_PREEMPT_BKL
3452 struct task_struct *task = current; 3452 struct task_struct *task = current;
3453 int saved_lock_depth; 3453 int saved_lock_depth;
3454 #endif 3454 #endif
3455 /* 3455 /*
3456 * If there is a non-zero preempt_count or interrupts are disabled, 3456 * If there is a non-zero preempt_count or interrupts are disabled,
3457 * we do not want to preempt the current task. Just return.. 3457 * we do not want to preempt the current task. Just return..
3458 */ 3458 */
3459 if (likely(ti->preempt_count || irqs_disabled())) 3459 if (likely(ti->preempt_count || irqs_disabled()))
3460 return; 3460 return;
3461 3461
3462 need_resched: 3462 need_resched:
3463 add_preempt_count(PREEMPT_ACTIVE); 3463 add_preempt_count(PREEMPT_ACTIVE);
3464 /* 3464 /*
3465 * We keep the big kernel semaphore locked, but we 3465 * We keep the big kernel semaphore locked, but we
3466 * clear ->lock_depth so that schedule() doesn't 3466 * clear ->lock_depth so that schedule() doesn't
3467 * auto-release the semaphore: 3467 * auto-release the semaphore:
3468 */ 3468 */
3469 #ifdef CONFIG_PREEMPT_BKL 3469 #ifdef CONFIG_PREEMPT_BKL
3470 saved_lock_depth = task->lock_depth; 3470 saved_lock_depth = task->lock_depth;
3471 task->lock_depth = -1; 3471 task->lock_depth = -1;
3472 #endif 3472 #endif
3473 schedule(); 3473 schedule();
3474 #ifdef CONFIG_PREEMPT_BKL 3474 #ifdef CONFIG_PREEMPT_BKL
3475 task->lock_depth = saved_lock_depth; 3475 task->lock_depth = saved_lock_depth;
3476 #endif 3476 #endif
3477 sub_preempt_count(PREEMPT_ACTIVE); 3477 sub_preempt_count(PREEMPT_ACTIVE);
3478 3478
3479 /* we could miss a preemption opportunity between schedule and now */ 3479 /* we could miss a preemption opportunity between schedule and now */
3480 barrier(); 3480 barrier();
3481 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3481 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3482 goto need_resched; 3482 goto need_resched;
3483 } 3483 }
3484 EXPORT_SYMBOL(preempt_schedule); 3484 EXPORT_SYMBOL(preempt_schedule);
3485 3485
3486 /* 3486 /*
3487 * this is the entry point to schedule() from kernel preemption 3487 * this is the entry point to schedule() from kernel preemption
3488 * off of irq context. 3488 * off of irq context.
3489 * Note that this is called and returns with irqs disabled. This will 3489 * Note that this is called and returns with irqs disabled. This will
3490 * protect us against recursive calling from irq. 3490 * protect us against recursive calling from irq.
3491 */ 3491 */
3492 asmlinkage void __sched preempt_schedule_irq(void) 3492 asmlinkage void __sched preempt_schedule_irq(void)
3493 { 3493 {
3494 struct thread_info *ti = current_thread_info(); 3494 struct thread_info *ti = current_thread_info();
3495 #ifdef CONFIG_PREEMPT_BKL 3495 #ifdef CONFIG_PREEMPT_BKL
3496 struct task_struct *task = current; 3496 struct task_struct *task = current;
3497 int saved_lock_depth; 3497 int saved_lock_depth;
3498 #endif 3498 #endif
3499 /* Catch callers which need to be fixed */ 3499 /* Catch callers which need to be fixed */
3500 BUG_ON(ti->preempt_count || !irqs_disabled()); 3500 BUG_ON(ti->preempt_count || !irqs_disabled());
3501 3501
3502 need_resched: 3502 need_resched:
3503 add_preempt_count(PREEMPT_ACTIVE); 3503 add_preempt_count(PREEMPT_ACTIVE);
3504 /* 3504 /*
3505 * We keep the big kernel semaphore locked, but we 3505 * We keep the big kernel semaphore locked, but we
3506 * clear ->lock_depth so that schedule() doesn't 3506 * clear ->lock_depth so that schedule() doesn't
3507 * auto-release the semaphore: 3507 * auto-release the semaphore:
3508 */ 3508 */
3509 #ifdef CONFIG_PREEMPT_BKL 3509 #ifdef CONFIG_PREEMPT_BKL
3510 saved_lock_depth = task->lock_depth; 3510 saved_lock_depth = task->lock_depth;
3511 task->lock_depth = -1; 3511 task->lock_depth = -1;
3512 #endif 3512 #endif
3513 local_irq_enable(); 3513 local_irq_enable();
3514 schedule(); 3514 schedule();
3515 local_irq_disable(); 3515 local_irq_disable();
3516 #ifdef CONFIG_PREEMPT_BKL 3516 #ifdef CONFIG_PREEMPT_BKL
3517 task->lock_depth = saved_lock_depth; 3517 task->lock_depth = saved_lock_depth;
3518 #endif 3518 #endif
3519 sub_preempt_count(PREEMPT_ACTIVE); 3519 sub_preempt_count(PREEMPT_ACTIVE);
3520 3520
3521 /* we could miss a preemption opportunity between schedule and now */ 3521 /* we could miss a preemption opportunity between schedule and now */
3522 barrier(); 3522 barrier();
3523 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3523 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3524 goto need_resched; 3524 goto need_resched;
3525 } 3525 }
3526 3526
3527 #endif /* CONFIG_PREEMPT */ 3527 #endif /* CONFIG_PREEMPT */
3528 3528
3529 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, 3529 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3530 void *key) 3530 void *key)
3531 { 3531 {
3532 return try_to_wake_up(curr->private, mode, sync); 3532 return try_to_wake_up(curr->private, mode, sync);
3533 } 3533 }
3534 EXPORT_SYMBOL(default_wake_function); 3534 EXPORT_SYMBOL(default_wake_function);
3535 3535
3536 /* 3536 /*
3537 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just 3537 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3538 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve 3538 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3539 * number) then we wake all the non-exclusive tasks and at most nr_exclusive exclusive tasks. 3539 * number) then we wake all the non-exclusive tasks and at most nr_exclusive exclusive tasks.
3540 * 3540 *
3541 * There are circumstances in which we can try to wake a task which has already 3541 * There are circumstances in which we can try to wake a task which has already
3542 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 3542 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3543 * zero in this (rare) case, and we handle it by continuing to scan the queue. 3543 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3544 */ 3544 */
3545 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 3545 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3546 int nr_exclusive, int sync, void *key) 3546 int nr_exclusive, int sync, void *key)
3547 { 3547 {
3548 struct list_head *tmp, *next; 3548 struct list_head *tmp, *next;
3549 3549
3550 list_for_each_safe(tmp, next, &q->task_list) { 3550 list_for_each_safe(tmp, next, &q->task_list) {
3551 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); 3551 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3552 unsigned flags = curr->flags; 3552 unsigned flags = curr->flags;
3553 3553
3554 if (curr->func(curr, mode, sync, key) && 3554 if (curr->func(curr, mode, sync, key) &&
3555 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 3555 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3556 break; 3556 break;
3557 } 3557 }
3558 } 3558 }
3559 3559
3560 /** 3560 /**
3561 * __wake_up - wake up threads blocked on a waitqueue. 3561 * __wake_up - wake up threads blocked on a waitqueue.
3562 * @q: the waitqueue 3562 * @q: the waitqueue
3563 * @mode: which threads 3563 * @mode: which threads
3564 * @nr_exclusive: how many wake-one or wake-many threads to wake up 3564 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3565 * @key: is directly passed to the wakeup function 3565 * @key: is directly passed to the wakeup function
3566 */ 3566 */
3567 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, 3567 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
3568 int nr_exclusive, void *key) 3568 int nr_exclusive, void *key)
3569 { 3569 {
3570 unsigned long flags; 3570 unsigned long flags;
3571 3571
3572 spin_lock_irqsave(&q->lock, flags); 3572 spin_lock_irqsave(&q->lock, flags);
3573 __wake_up_common(q, mode, nr_exclusive, 0, key); 3573 __wake_up_common(q, mode, nr_exclusive, 0, key);
3574 spin_unlock_irqrestore(&q->lock, flags); 3574 spin_unlock_irqrestore(&q->lock, flags);
3575 } 3575 }
3576 EXPORT_SYMBOL(__wake_up); 3576 EXPORT_SYMBOL(__wake_up);
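As a usage-level sketch of the wakeup machinery above (hypothetical driver-style code, made-up demo_* names; the wait-queue calls themselves are the standard ones): a waiter blocks on a condition with wait_event_interruptible(), and the producer sets the condition and calls the wake_up() wrapper, which ends up in __wake_up()/__wake_up_common().

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);	/* hypothetical wait queue */
static int demo_ready;				/* hypothetical condition */

static int demo_wait(void)
{
	/* sleeps until demo_ready != 0, rechecking the condition on every wakeup */
	return wait_event_interruptible(demo_wq, demo_ready != 0);
}

static void demo_post(void)
{
	demo_ready = 1;
	wake_up(&demo_wq);	/* wakes all non-exclusive waiters on demo_wq */
}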
3577 3577
3578 /* 3578 /*
3579 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 3579 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3580 */ 3580 */
3581 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) 3581 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3582 { 3582 {
3583 __wake_up_common(q, mode, 1, 0, NULL); 3583 __wake_up_common(q, mode, 1, 0, NULL);
3584 } 3584 }
3585 3585
3586 /** 3586 /**
3587 * __wake_up_sync - wake up threads blocked on a waitqueue. 3587 * __wake_up_sync - wake up threads blocked on a waitqueue.
3588 * @q: the waitqueue 3588 * @q: the waitqueue
3589 * @mode: which threads 3589 * @mode: which threads
3590 * @nr_exclusive: how many wake-one or wake-many threads to wake up 3590 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3591 * 3591 *
3592 * The sync wakeup differs in that the waker knows that it will schedule 3592 * The sync wakeup differs in that the waker knows that it will schedule
3593 * away soon, so while the target thread will be woken up, it will not 3593 * away soon, so while the target thread will be woken up, it will not
3594 * be migrated to another CPU - ie. the two threads are 'synchronized' 3594 * be migrated to another CPU - ie. the two threads are 'synchronized'
3595 * with each other. This can prevent needless bouncing between CPUs. 3595 * with each other. This can prevent needless bouncing between CPUs.
3596 * 3596 *
3597 * On UP it can prevent extra preemption. 3597 * On UP it can prevent extra preemption.
3598 */ 3598 */
3599 void fastcall 3599 void fastcall
3600 __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 3600 __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3601 { 3601 {
3602 unsigned long flags; 3602 unsigned long flags;
3603 int sync = 1; 3603 int sync = 1;
3604 3604
3605 if (unlikely(!q)) 3605 if (unlikely(!q))
3606 return; 3606 return;
3607 3607
3608 if (unlikely(!nr_exclusive)) 3608 if (unlikely(!nr_exclusive))
3609 sync = 0; 3609 sync = 0;
3610 3610
3611 spin_lock_irqsave(&q->lock, flags); 3611 spin_lock_irqsave(&q->lock, flags);
3612 __wake_up_common(q, mode, nr_exclusive, sync, NULL); 3612 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3613 spin_unlock_irqrestore(&q->lock, flags); 3613 spin_unlock_irqrestore(&q->lock, flags);
3614 } 3614 }
3615 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 3615 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
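A minimal sketch of when the sync variant is worth using, assuming the wake_up_interruptible_sync() wrapper around __wake_up_sync() and the hypothetical demo_wq/demo_ready names from the sketch above: a producer that is about to sleep anyway wakes its consumer "synchronously", hinting that there is no point migrating the woken task to another CPU.

static void demo_handoff_and_sleep(void)
{
	demo_ready = 1;
	/* we are going to schedule away immediately, so ask for a sync wakeup */
	wake_up_interruptible_sync(&demo_wq);
	schedule_timeout_interruptible(HZ);
}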
3616 3616
3617 void fastcall complete(struct completion *x) 3617 void fastcall complete(struct completion *x)
3618 { 3618 {
3619 unsigned long flags; 3619 unsigned long flags;
3620 3620
3621 spin_lock_irqsave(&x->wait.lock, flags); 3621 spin_lock_irqsave(&x->wait.lock, flags);
3622 x->done++; 3622 x->done++;
3623 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 3623 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3624 1, 0, NULL); 3624 1, 0, NULL);
3625 spin_unlock_irqrestore(&x->wait.lock, flags); 3625 spin_unlock_irqrestore(&x->wait.lock, flags);
3626 } 3626 }
3627 EXPORT_SYMBOL(complete); 3627 EXPORT_SYMBOL(complete);
3628 3628
3629 void fastcall complete_all(struct completion *x) 3629 void fastcall complete_all(struct completion *x)
3630 { 3630 {
3631 unsigned long flags; 3631 unsigned long flags;
3632 3632
3633 spin_lock_irqsave(&x->wait.lock, flags); 3633 spin_lock_irqsave(&x->wait.lock, flags);
3634 x->done += UINT_MAX/2; 3634 x->done += UINT_MAX/2;
3635 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 3635 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3636 0, 0, NULL); 3636 0, 0, NULL);
3637 spin_unlock_irqrestore(&x->wait.lock, flags); 3637 spin_unlock_irqrestore(&x->wait.lock, flags);
3638 } 3638 }
3639 EXPORT_SYMBOL(complete_all); 3639 EXPORT_SYMBOL(complete_all);
3640 3640
3641 void fastcall __sched wait_for_completion(struct completion *x) 3641 void fastcall __sched wait_for_completion(struct completion *x)
3642 { 3642 {
3643 might_sleep(); 3643 might_sleep();
3644 3644
3645 spin_lock_irq(&x->wait.lock); 3645 spin_lock_irq(&x->wait.lock);
3646 if (!x->done) { 3646 if (!x->done) {
3647 DECLARE_WAITQUEUE(wait, current); 3647 DECLARE_WAITQUEUE(wait, current);
3648 3648
3649 wait.flags |= WQ_FLAG_EXCLUSIVE; 3649 wait.flags |= WQ_FLAG_EXCLUSIVE;
3650 __add_wait_queue_tail(&x->wait, &wait); 3650 __add_wait_queue_tail(&x->wait, &wait);
3651 do { 3651 do {
3652 __set_current_state(TASK_UNINTERRUPTIBLE); 3652 __set_current_state(TASK_UNINTERRUPTIBLE);
3653 spin_unlock_irq(&x->wait.lock); 3653 spin_unlock_irq(&x->wait.lock);
3654 schedule(); 3654 schedule();
3655 spin_lock_irq(&x->wait.lock); 3655 spin_lock_irq(&x->wait.lock);
3656 } while (!x->done); 3656 } while (!x->done);
3657 __remove_wait_queue(&x->wait, &wait); 3657 __remove_wait_queue(&x->wait, &wait);
3658 } 3658 }
3659 x->done--; 3659 x->done--;
3660 spin_unlock_irq(&x->wait.lock); 3660 spin_unlock_irq(&x->wait.lock);
3661 } 3661 }
3662 EXPORT_SYMBOL(wait_for_completion); 3662 EXPORT_SYMBOL(wait_for_completion);
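The completion helpers above are the usual tool for one-shot "wait until that other context is finished" synchronization. A minimal sketch with made-up demo_* names (the completion and kthread calls themselves are real):

#include <linux/completion.h>
#include <linux/kthread.h>

static DECLARE_COMPLETION(demo_done);		/* hypothetical */

static int demo_worker(void *unused)
{
	/* ... do the actual work ... */
	complete(&demo_done);			/* wakes one exclusive waiter, see complete() above */
	return 0;
}

static void demo_start_and_wait(void)
{
	kthread_run(demo_worker, NULL, "demo_worker");
	wait_for_completion(&demo_done);	/* uninterruptible wait until complete() runs */
}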
3663 3663
3664 unsigned long fastcall __sched 3664 unsigned long fastcall __sched
3665 wait_for_completion_timeout(struct completion *x, unsigned long timeout) 3665 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3666 { 3666 {
3667 might_sleep(); 3667 might_sleep();
3668 3668
3669 spin_lock_irq(&x->wait.lock); 3669 spin_lock_irq(&x->wait.lock);
3670 if (!x->done) { 3670 if (!x->done) {
3671 DECLARE_WAITQUEUE(wait, current); 3671 DECLARE_WAITQUEUE(wait, current);
3672 3672
3673 wait.flags |= WQ_FLAG_EXCLUSIVE; 3673 wait.flags |= WQ_FLAG_EXCLUSIVE;
3674 __add_wait_queue_tail(&x->wait, &wait); 3674 __add_wait_queue_tail(&x->wait, &wait);
3675 do { 3675 do {
3676 __set_current_state(TASK_UNINTERRUPTIBLE); 3676 __set_current_state(TASK_UNINTERRUPTIBLE);
3677 spin_unlock_irq(&x->wait.lock); 3677 spin_unlock_irq(&x->wait.lock);
3678 timeout = schedule_timeout(timeout); 3678 timeout = schedule_timeout(timeout);
3679 spin_lock_irq(&x->wait.lock); 3679 spin_lock_irq(&x->wait.lock);
3680 if (!timeout) { 3680 if (!timeout) {
3681 __remove_wait_queue(&x->wait, &wait); 3681 __remove_wait_queue(&x->wait, &wait);
3682 goto out; 3682 goto out;
3683 } 3683 }
3684 } while (!x->done); 3684 } while (!x->done);
3685 __remove_wait_queue(&x->wait, &wait); 3685 __remove_wait_queue(&x->wait, &wait);
3686 } 3686 }
3687 x->done--; 3687 x->done--;
3688 out: 3688 out:
3689 spin_unlock_irq(&x->wait.lock); 3689 spin_unlock_irq(&x->wait.lock);
3690 return timeout; 3690 return timeout;
3691 } 3691 }
3692 EXPORT_SYMBOL(wait_for_completion_timeout); 3692 EXPORT_SYMBOL(wait_for_completion_timeout);
3693 3693
3694 int fastcall __sched wait_for_completion_interruptible(struct completion *x) 3694 int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3695 { 3695 {
3696 int ret = 0; 3696 int ret = 0;
3697 3697
3698 might_sleep(); 3698 might_sleep();
3699 3699
3700 spin_lock_irq(&x->wait.lock); 3700 spin_lock_irq(&x->wait.lock);
3701 if (!x->done) { 3701 if (!x->done) {
3702 DECLARE_WAITQUEUE(wait, current); 3702 DECLARE_WAITQUEUE(wait, current);
3703 3703
3704 wait.flags |= WQ_FLAG_EXCLUSIVE; 3704 wait.flags |= WQ_FLAG_EXCLUSIVE;
3705 __add_wait_queue_tail(&x->wait, &wait); 3705 __add_wait_queue_tail(&x->wait, &wait);
3706 do { 3706 do {
3707 if (signal_pending(current)) { 3707 if (signal_pending(current)) {
3708 ret = -ERESTARTSYS; 3708 ret = -ERESTARTSYS;
3709 __remove_wait_queue(&x->wait, &wait); 3709 __remove_wait_queue(&x->wait, &wait);
3710 goto out; 3710 goto out;
3711 } 3711 }
3712 __set_current_state(TASK_INTERRUPTIBLE); 3712 __set_current_state(TASK_INTERRUPTIBLE);
3713 spin_unlock_irq(&x->wait.lock); 3713 spin_unlock_irq(&x->wait.lock);
3714 schedule(); 3714 schedule();
3715 spin_lock_irq(&x->wait.lock); 3715 spin_lock_irq(&x->wait.lock);
3716 } while (!x->done); 3716 } while (!x->done);
3717 __remove_wait_queue(&x->wait, &wait); 3717 __remove_wait_queue(&x->wait, &wait);
3718 } 3718 }
3719 x->done--; 3719 x->done--;
3720 out: 3720 out:
3721 spin_unlock_irq(&x->wait.lock); 3721 spin_unlock_irq(&x->wait.lock);
3722 3722
3723 return ret; 3723 return ret;
3724 } 3724 }
3725 EXPORT_SYMBOL(wait_for_completion_interruptible); 3725 EXPORT_SYMBOL(wait_for_completion_interruptible);
3726 3726
3727 unsigned long fastcall __sched 3727 unsigned long fastcall __sched
3728 wait_for_completion_interruptible_timeout(struct completion *x, 3728 wait_for_completion_interruptible_timeout(struct completion *x,
3729 unsigned long timeout) 3729 unsigned long timeout)
3730 { 3730 {
3731 might_sleep(); 3731 might_sleep();
3732 3732
3733 spin_lock_irq(&x->wait.lock); 3733 spin_lock_irq(&x->wait.lock);
3734 if (!x->done) { 3734 if (!x->done) {
3735 DECLARE_WAITQUEUE(wait, current); 3735 DECLARE_WAITQUEUE(wait, current);
3736 3736
3737 wait.flags |= WQ_FLAG_EXCLUSIVE; 3737 wait.flags |= WQ_FLAG_EXCLUSIVE;
3738 __add_wait_queue_tail(&x->wait, &wait); 3738 __add_wait_queue_tail(&x->wait, &wait);
3739 do { 3739 do {
3740 if (signal_pending(current)) { 3740 if (signal_pending(current)) {
3741 timeout = -ERESTARTSYS; 3741 timeout = -ERESTARTSYS;
3742 __remove_wait_queue(&x->wait, &wait); 3742 __remove_wait_queue(&x->wait, &wait);
3743 goto out; 3743 goto out;
3744 } 3744 }
3745 __set_current_state(TASK_INTERRUPTIBLE); 3745 __set_current_state(TASK_INTERRUPTIBLE);
3746 spin_unlock_irq(&x->wait.lock); 3746 spin_unlock_irq(&x->wait.lock);
3747 timeout = schedule_timeout(timeout); 3747 timeout = schedule_timeout(timeout);
3748 spin_lock_irq(&x->wait.lock); 3748 spin_lock_irq(&x->wait.lock);
3749 if (!timeout) { 3749 if (!timeout) {
3750 __remove_wait_queue(&x->wait, &wait); 3750 __remove_wait_queue(&x->wait, &wait);
3751 goto out; 3751 goto out;
3752 } 3752 }
3753 } while (!x->done); 3753 } while (!x->done);
3754 __remove_wait_queue(&x->wait, &wait); 3754 __remove_wait_queue(&x->wait, &wait);
3755 } 3755 }
3756 x->done--; 3756 x->done--;
3757 out: 3757 out:
3758 spin_unlock_irq(&x->wait.lock); 3758 spin_unlock_irq(&x->wait.lock);
3759 return timeout; 3759 return timeout;
3760 } 3760 }
3761 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 3761 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
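Interpreting the return value of the interruptible/timeout variants is a common source of bugs, so here is a hedged sketch of the convention visible in the code above (hypothetical caller, reusing the demo_done completion from the previous sketch): 0 means the timeout expired, a negative value is -ERESTARTSYS from a pending signal, and anything else is the number of jiffies that were left.

static void demo_wait_with_timeout(void)
{
	long ret = wait_for_completion_interruptible_timeout(&demo_done, HZ);

	if (ret == 0) {
		/* timed out: complete() was not called within roughly one second */
	} else if (ret < 0) {
		/* -ERESTARTSYS: a signal interrupted the wait */
	} else {
		/* completed with 'ret' jiffies of the timeout still remaining */
	}
}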
3762 3762
3763 3763
3764 #define SLEEP_ON_VAR \ 3764 #define SLEEP_ON_VAR \
3765 unsigned long flags; \ 3765 unsigned long flags; \
3766 wait_queue_t wait; \ 3766 wait_queue_t wait; \
3767 init_waitqueue_entry(&wait, current); 3767 init_waitqueue_entry(&wait, current);
3768 3768
3769 #define SLEEP_ON_HEAD \ 3769 #define SLEEP_ON_HEAD \
3770 spin_lock_irqsave(&q->lock,flags); \ 3770 spin_lock_irqsave(&q->lock,flags); \
3771 __add_wait_queue(q, &wait); \ 3771 __add_wait_queue(q, &wait); \
3772 spin_unlock(&q->lock); 3772 spin_unlock(&q->lock);
3773 3773
3774 #define SLEEP_ON_TAIL \ 3774 #define SLEEP_ON_TAIL \
3775 spin_lock_irq(&q->lock); \ 3775 spin_lock_irq(&q->lock); \
3776 __remove_wait_queue(q, &wait); \ 3776 __remove_wait_queue(q, &wait); \
3777 spin_unlock_irqrestore(&q->lock, flags); 3777 spin_unlock_irqrestore(&q->lock, flags);
3778 3778
3779 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) 3779 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3780 { 3780 {
3781 SLEEP_ON_VAR 3781 SLEEP_ON_VAR
3782 3782
3783 current->state = TASK_INTERRUPTIBLE; 3783 current->state = TASK_INTERRUPTIBLE;
3784 3784
3785 SLEEP_ON_HEAD 3785 SLEEP_ON_HEAD
3786 schedule(); 3786 schedule();
3787 SLEEP_ON_TAIL 3787 SLEEP_ON_TAIL
3788 } 3788 }
3789 EXPORT_SYMBOL(interruptible_sleep_on); 3789 EXPORT_SYMBOL(interruptible_sleep_on);
3790 3790
3791 long fastcall __sched 3791 long fastcall __sched
3792 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 3792 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3793 { 3793 {
3794 SLEEP_ON_VAR 3794 SLEEP_ON_VAR
3795 3795
3796 current->state = TASK_INTERRUPTIBLE; 3796 current->state = TASK_INTERRUPTIBLE;
3797 3797
3798 SLEEP_ON_HEAD 3798 SLEEP_ON_HEAD
3799 timeout = schedule_timeout(timeout); 3799 timeout = schedule_timeout(timeout);
3800 SLEEP_ON_TAIL 3800 SLEEP_ON_TAIL
3801 3801
3802 return timeout; 3802 return timeout;
3803 } 3803 }
3804 EXPORT_SYMBOL(interruptible_sleep_on_timeout); 3804 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3805 3805
3806 void fastcall __sched sleep_on(wait_queue_head_t *q) 3806 void fastcall __sched sleep_on(wait_queue_head_t *q)
3807 { 3807 {
3808 SLEEP_ON_VAR 3808 SLEEP_ON_VAR
3809 3809
3810 current->state = TASK_UNINTERRUPTIBLE; 3810 current->state = TASK_UNINTERRUPTIBLE;
3811 3811
3812 SLEEP_ON_HEAD 3812 SLEEP_ON_HEAD
3813 schedule(); 3813 schedule();
3814 SLEEP_ON_TAIL 3814 SLEEP_ON_TAIL
3815 } 3815 }
3816 EXPORT_SYMBOL(sleep_on); 3816 EXPORT_SYMBOL(sleep_on);
3817 3817
3818 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 3818 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3819 { 3819 {
3820 SLEEP_ON_VAR 3820 SLEEP_ON_VAR
3821 3821
3822 current->state = TASK_UNINTERRUPTIBLE; 3822 current->state = TASK_UNINTERRUPTIBLE;
3823 3823
3824 SLEEP_ON_HEAD 3824 SLEEP_ON_HEAD
3825 timeout = schedule_timeout(timeout); 3825 timeout = schedule_timeout(timeout);
3826 SLEEP_ON_TAIL 3826 SLEEP_ON_TAIL
3827 3827
3828 return timeout; 3828 return timeout;
3829 } 3829 }
3830 3830
3831 EXPORT_SYMBOL(sleep_on_timeout); 3831 EXPORT_SYMBOL(sleep_on_timeout);
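The sleep_on() family above takes no condition, so a wakeup that arrives between checking a flag and calling sleep_on() is simply lost; that race is why these helpers are generally avoided in favor of the wait_event*() macros. A hedged sketch of the two patterns, reusing the hypothetical demo_wq/demo_ready names:

static void demo_wait_racy(void)
{
	if (!demo_ready)		/* racy: a wakeup can slip in right here */
		sleep_on(&demo_wq);
}

static void demo_wait_safe(void)
{
	/* preferred: the condition is re-tested under the waitqueue machinery */
	wait_event(demo_wq, demo_ready != 0);
}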
3832 3832
3833 #ifdef CONFIG_RT_MUTEXES 3833 #ifdef CONFIG_RT_MUTEXES
3834 3834
3835 /* 3835 /*
3836 * rt_mutex_setprio - set the current priority of a task 3836 * rt_mutex_setprio - set the current priority of a task
3837 * @p: task 3837 * @p: task
3838 * @prio: prio value (kernel-internal form) 3838 * @prio: prio value (kernel-internal form)
3839 * 3839 *
3840 * This function changes the 'effective' priority of a task. It does 3840 * This function changes the 'effective' priority of a task. It does
3841 * not touch ->normal_prio like __setscheduler(). 3841 * not touch ->normal_prio like __setscheduler().
3842 * 3842 *
3843 * Used by the rt_mutex code to implement priority inheritance logic. 3843 * Used by the rt_mutex code to implement priority inheritance logic.
3844 */ 3844 */
3845 void rt_mutex_setprio(struct task_struct *p, int prio) 3845 void rt_mutex_setprio(struct task_struct *p, int prio)
3846 { 3846 {
3847 struct prio_array *array; 3847 struct prio_array *array;
3848 unsigned long flags; 3848 unsigned long flags;
3849 struct rq *rq; 3849 struct rq *rq;
3850 int oldprio; 3850 int oldprio;
3851 3851
3852 BUG_ON(prio < 0 || prio > MAX_PRIO); 3852 BUG_ON(prio < 0 || prio > MAX_PRIO);
3853 3853
3854 rq = task_rq_lock(p, &flags); 3854 rq = task_rq_lock(p, &flags);
3855 3855
3856 oldprio = p->prio; 3856 oldprio = p->prio;
3857 array = p->array; 3857 array = p->array;
3858 if (array) 3858 if (array)
3859 dequeue_task(p, array); 3859 dequeue_task(p, array);
3860 p->prio = prio; 3860 p->prio = prio;
3861 3861
3862 if (array) { 3862 if (array) {
3863 /* 3863 /*
3864 * If changing to an RT priority then queue it 3864 * If changing to an RT priority then queue it
3865 * in the active array! 3865 * in the active array!
3866 */ 3866 */
3867 if (rt_task(p)) 3867 if (rt_task(p))
3868 array = rq->active; 3868 array = rq->active;
3869 enqueue_task(p, array); 3869 enqueue_task(p, array);
3870 /* 3870 /*
3871 * Reschedule if we are currently running on this runqueue and 3871 * Reschedule if we are currently running on this runqueue and
3872 * our priority decreased, or if we are not currently running on 3872 * our priority decreased, or if we are not currently running on
3873 * this runqueue and our priority is higher than the current's 3873 * this runqueue and our priority is higher than the current's
3874 */ 3874 */
3875 if (task_running(rq, p)) { 3875 if (task_running(rq, p)) {
3876 if (p->prio > oldprio) 3876 if (p->prio > oldprio)
3877 resched_task(rq->curr); 3877 resched_task(rq->curr);
3878 } else if (TASK_PREEMPTS_CURR(p, rq)) 3878 } else if (TASK_PREEMPTS_CURR(p, rq))
3879 resched_task(rq->curr); 3879 resched_task(rq->curr);
3880 } 3880 }
3881 task_rq_unlock(rq, &flags); 3881 task_rq_unlock(rq, &flags);
3882 } 3882 }
3883 3883
3884 #endif 3884 #endif
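A short sketch of where the hook above is exercised, assuming CONFIG_RT_MUTEXES and a hypothetical DEFINE_RT_MUTEX(demo_lock): if a higher-priority task blocks on the mutex, the rt_mutex code uses rt_mutex_setprio() to boost the owner for the duration of the critical section and drops the boost again on unlock.

#include <linux/rtmutex.h>

static DEFINE_RT_MUTEX(demo_lock);	/* hypothetical */

static void demo_pi_section(void)
{
	rt_mutex_lock(&demo_lock);
	/*
	 * If a higher-priority task blocks on demo_lock here, the rt_mutex
	 * code boosts us via rt_mutex_setprio(); the boost is undone when
	 * rt_mutex_unlock() hands the lock over.
	 */
	rt_mutex_unlock(&demo_lock);
}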
3885 3885
3886 void set_user_nice(struct task_struct *p, long nice) 3886 void set_user_nice(struct task_struct *p, long nice)
3887 { 3887 {
3888 struct prio_array *array; 3888 struct prio_array *array;
3889 int old_prio, delta; 3889 int old_prio, delta;
3890 unsigned long flags; 3890 unsigned long flags;
3891 struct rq *rq; 3891 struct rq *rq;
3892 3892
3893 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 3893 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3894 return; 3894 return;
3895 /* 3895 /*
3896 * We have to be careful, if called from sys_setpriority(), 3896 * We have to be careful, if called from sys_setpriority(),
3897 * the task might be in the middle of scheduling on another CPU. 3897 * the task might be in the middle of scheduling on another CPU.
3898 */ 3898 */
3899 rq = task_rq_lock(p, &flags); 3899 rq = task_rq_lock(p, &flags);
3900 /* 3900 /*
3901 * The RT priorities are set via sched_setscheduler(), but we still 3901 * The RT priorities are set via sched_setscheduler(), but we still
3902 * allow the 'normal' nice value to be set - but as expected 3902 * allow the 'normal' nice value to be set - but as expected
3903 * it won't have any effect on scheduling until the task goes 3903 * it won't have any effect on scheduling until the task goes
3904 * back to SCHED_NORMAL/SCHED_BATCH: 3904 * back to SCHED_NORMAL/SCHED_BATCH:
3905 */ 3905 */
3906 if (has_rt_policy(p)) { 3906 if (has_rt_policy(p)) {
3907 p->static_prio = NICE_TO_PRIO(nice); 3907 p->static_prio = NICE_TO_PRIO(nice);
3908 goto out_unlock; 3908 goto out_unlock;
3909 } 3909 }
3910 array = p->array; 3910 array = p->array;
3911 if (array) { 3911 if (array) {
3912 dequeue_task(p, array); 3912 dequeue_task(p, array);
3913 dec_raw_weighted_load(rq, p); 3913 dec_raw_weighted_load(rq, p);
3914 } 3914 }
3915 3915
3916 p->static_prio = NICE_TO_PRIO(nice); 3916 p->static_prio = NICE_TO_PRIO(nice);
3917 set_load_weight(p); 3917 set_load_weight(p);
3918 old_prio = p->prio; 3918 old_prio = p->prio;
3919 p->prio = effective_prio(p); 3919 p->prio = effective_prio(p);
3920 delta = p->prio - old_prio; 3920 delta = p->prio - old_prio;
3921 3921
3922 if (array) { 3922 if (array) {
3923 enqueue_task(p, array); 3923 enqueue_task(p, array);
3924 inc_raw_weighted_load(rq, p); 3924 inc_raw_weighted_load(rq, p);
3925 /* 3925 /*
3926 * If the task increased its priority or is running and 3926 * If the task increased its priority or is running and
3927 * lowered its priority, then reschedule its CPU: 3927 * lowered its priority, then reschedule its CPU:
3928 */ 3928 */
3929 if (delta < 0 || (delta > 0 && task_running(rq, p))) 3929 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3930 resched_task(rq->curr); 3930 resched_task(rq->curr);
3931 } 3931 }
3932 out_unlock: 3932 out_unlock:
3933 task_rq_unlock(rq, &flags); 3933 task_rq_unlock(rq, &flags);
3934 } 3934 }
3935 EXPORT_SYMBOL(set_user_nice); 3935 EXPORT_SYMBOL(set_user_nice);
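set_user_nice() is the in-kernel way to change a task's nice value without going through the syscall path; background kernel threads commonly use it to drop themselves to nice 19. A minimal sketch (hypothetical kthread with made-up names):

#include <linux/kthread.h>
#include <linux/sched.h>

static int demo_background_thread(void *unused)
{
	set_user_nice(current, 19);	/* lowest SCHED_NORMAL priority for housekeeping work */
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}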
3936 3936
3937 /* 3937 /*
3938 * can_nice - check if a task can reduce its nice value 3938 * can_nice - check if a task can reduce its nice value
3939 * @p: task 3939 * @p: task
3940 * @nice: nice value 3940 * @nice: nice value
3941 */ 3941 */
3942 int can_nice(const struct task_struct *p, const int nice) 3942 int can_nice(const struct task_struct *p, const int nice)
3943 { 3943 {
3944 /* convert nice value [19,-20] to rlimit style value [1,40] */ 3944 /* convert nice value [19,-20] to rlimit style value [1,40] */
3945 int nice_rlim = 20 - nice; 3945 int nice_rlim = 20 - nice;
3946 3946
3947 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || 3947 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3948 capable(CAP_SYS_NICE)); 3948 capable(CAP_SYS_NICE));
3949 } 3949 }
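To make the conversion above concrete: nice 19 maps to the rlimit-style value 20 - 19 = 1 and nice -20 maps to 20 - (-20) = 40, so an RLIMIT_NICE soft limit of 30 lets an unprivileged task lower its nice value to 20 - 30 = -10 at best; going any lower additionally requires CAP_SYS_NICE.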
3950 3950
3951 #ifdef __ARCH_WANT_SYS_NICE 3951 #ifdef __ARCH_WANT_SYS_NICE
3952 3952
3953 /* 3953 /*
3954 * sys_nice - change the priority of the current process. 3954 * sys_nice - change the priority of the current process.
3955 * @increment: priority increment 3955 * @increment: priority increment
3956 * 3956 *
3957 * sys_setpriority is a more generic, but much slower function that 3957 * sys_setpriority is a more generic, but much slower function that
3958 * does similar things. 3958 * does similar things.
3959 */ 3959 */
3960 asmlinkage long sys_nice(int increment) 3960 asmlinkage long sys_nice(int increment)
3961 { 3961 {
3962 long nice, retval; 3962 long nice, retval;
3963 3963
3964 /* 3964 /*
3965 * Setpriority might change our priority at the same moment. 3965 * Setpriority might change our priority at the same moment.
3966 * We don't have to worry. Conceptually one call occurs first 3966 * We don't have to worry. Conceptually one call occurs first
3967 * and we have a single winner. 3967 * and we have a single winner.
3968 */ 3968 */
3969 if (increment < -40) 3969 if (increment < -40)
3970 increment = -40; 3970 increment = -40;
3971 if (increment > 40) 3971 if (increment > 40)
3972 increment = 40; 3972 increment = 40;
3973 3973
3974 nice = PRIO_TO_NICE(current->static_prio) + increment; 3974 nice = PRIO_TO_NICE(current->static_prio) + increment;
3975 if (nice < -20) 3975 if (nice < -20)
3976 nice = -20; 3976 nice = -20;
3977 if (nice > 19) 3977 if (nice > 19)
3978 nice = 19; 3978 nice = 19;
3979 3979
3980 if (increment < 0 && !can_nice(current, nice)) 3980 if (increment < 0 && !can_nice(current, nice))
3981 return -EPERM; 3981 return -EPERM;
3982 3982
3983 retval = security_task_setnice(current, nice); 3983 retval = security_task_setnice(current, nice);
3984 if (retval) 3984 if (retval)
3985 return retval; 3985 return retval;
3986 3986
3987 set_user_nice(current, nice); 3987 set_user_nice(current, nice);
3988 return 0; 3988 return 0;
3989 } 3989 }
3990 3990
3991 #endif 3991 #endif
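The double clamping in sys_nice() keeps extreme arguments harmless: for a task currently at nice 0, sys_nice(100) first clamps the increment to 40, the resulting nice value 0 + 40 = 40 is then clamped to 19, and because the increment is positive the can_nice()/CAP_SYS_NICE check is skipped; only negative increments (raising priority) need that permission check.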
3992 3992
3993 /** 3993 /**
3994 * task_prio - return the priority value of a given task. 3994 * task_prio - return the priority value of a given task.
3995 * @p: the task in question. 3995 * @p: the task in question.
3996 * 3996 *
3997 * This is the priority value as seen by users in /proc. 3997 * This is the priority value as seen by users in /proc.
3998 * RT tasks map to negative values (p->prio - MAX_RT_PRIO), normal 3998 * RT tasks map to negative values (p->prio - MAX_RT_PRIO), normal
3999 * tasks to non-negative values starting at 0. 3999 * tasks to non-negative values starting at 0.
4000 */ 4000 */
4001 int task_prio(const struct task_struct *p) 4001 int task_prio(const struct task_struct *p)
4002 { 4002 {
4003 return p->prio - MAX_RT_PRIO; 4003 return p->prio - MAX_RT_PRIO;
4004 } 4004 }
4005 4005
4006 /** 4006 /**
4007 * task_nice - return the nice value of a given task. 4007 * task_nice - return the nice value of a given task.
4008 * @p: the task in question. 4008 * @p: the task in question.
4009 */ 4009 */
4010 int task_nice(const struct task_struct *p) 4010 int task_nice(const struct task_struct *p)
4011 { 4011 {
4012 return TASK_NICE(p); 4012 return TASK_NICE(p);
4013 } 4013 }
4014 EXPORT_SYMBOL_GPL(task_nice); 4014 EXPORT_SYMBOL_GPL(task_nice);
4015 4015
4016 /** 4016 /**
4017 * idle_cpu - is a given cpu idle currently? 4017 * idle_cpu - is a given cpu idle currently?
4018 * @cpu: the processor in question. 4018 * @cpu: the processor in question.
4019 */ 4019 */
4020 int idle_cpu(int cpu) 4020 int idle_cpu(int cpu)
4021 { 4021 {
4022 return cpu_curr(cpu) == cpu_rq(cpu)->idle; 4022 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4023 } 4023 }
4024 4024
4025 /** 4025 /**
4026 * idle_task - return the idle task for a given cpu. 4026 * idle_task - return the idle task for a given cpu.
4027 * @cpu: the processor in question. 4027 * @cpu: the processor in question.
4028 */ 4028 */
4029 struct task_struct *idle_task(int cpu) 4029 struct task_struct *idle_task(int cpu)
4030 { 4030 {
4031 return cpu_rq(cpu)->idle; 4031 return cpu_rq(cpu)->idle;
4032 } 4032 }
4033 4033
4034 /** 4034 /**
4035 * find_process_by_pid - find a process with a matching PID value. 4035 * find_process_by_pid - find a process with a matching PID value.
4036 * @pid: the pid in question. 4036 * @pid: the pid in question.
4037 */ 4037 */
4038 static inline struct task_struct *find_process_by_pid(pid_t pid) 4038 static inline struct task_struct *find_process_by_pid(pid_t pid)
4039 { 4039 {
4040 return pid ? find_task_by_pid(pid) : current; 4040 return pid ? find_task_by_pid(pid) : current;
4041 } 4041 }
4042 4042
4043 /* Actually do priority change: must hold rq lock. */ 4043 /* Actually do priority change: must hold rq lock. */
4044 static void __setscheduler(struct task_struct *p, int policy, int prio) 4044 static void __setscheduler(struct task_struct *p, int policy, int prio)
4045 { 4045 {
4046 BUG_ON(p->array); 4046 BUG_ON(p->array);
4047 4047
4048 p->policy = policy; 4048 p->policy = policy;
4049 p->rt_priority = prio; 4049 p->rt_priority = prio;
4050 p->normal_prio = normal_prio(p); 4050 p->normal_prio = normal_prio(p);
4051 /* we are holding p->pi_lock already */ 4051 /* we are holding p->pi_lock already */
4052 p->prio = rt_mutex_getprio(p); 4052 p->prio = rt_mutex_getprio(p);
4053 /* 4053 /*
4054 * SCHED_BATCH tasks are treated as perpetual CPU hogs: 4054 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
4055 */ 4055 */
4056 if (policy == SCHED_BATCH) 4056 if (policy == SCHED_BATCH)
4057 p->sleep_avg = 0; 4057 p->sleep_avg = 0;
4058 set_load_weight(p); 4058 set_load_weight(p);
4059 } 4059 }
4060 4060
4061 /** 4061 /**
4062 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 4062 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4063 * @p: the task in question. 4063 * @p: the task in question.
4064 * @policy: new policy. 4064 * @policy: new policy.
4065 * @param: structure containing the new RT priority. 4065 * @param: structure containing the new RT priority.
4066 * 4066 *
 4067 * NOTE that the task may already be dead. 4067 * NOTE that the task may already be dead.
4068 */ 4068 */
4069 int sched_setscheduler(struct task_struct *p, int policy, 4069 int sched_setscheduler(struct task_struct *p, int policy,
4070 struct sched_param *param) 4070 struct sched_param *param)
4071 { 4071 {
4072 int retval, oldprio, oldpolicy = -1; 4072 int retval, oldprio, oldpolicy = -1;
4073 struct prio_array *array; 4073 struct prio_array *array;
4074 unsigned long flags; 4074 unsigned long flags;
4075 struct rq *rq; 4075 struct rq *rq;
4076 4076
4077 /* may grab non-irq protected spin_locks */ 4077 /* may grab non-irq protected spin_locks */
4078 BUG_ON(in_interrupt()); 4078 BUG_ON(in_interrupt());
4079 recheck: 4079 recheck:
4080 /* double check policy once rq lock held */ 4080 /* double check policy once rq lock held */
4081 if (policy < 0) 4081 if (policy < 0)
4082 policy = oldpolicy = p->policy; 4082 policy = oldpolicy = p->policy;
4083 else if (policy != SCHED_FIFO && policy != SCHED_RR && 4083 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
4084 policy != SCHED_NORMAL && policy != SCHED_BATCH) 4084 policy != SCHED_NORMAL && policy != SCHED_BATCH)
4085 return -EINVAL; 4085 return -EINVAL;
4086 /* 4086 /*
4087 * Valid priorities for SCHED_FIFO and SCHED_RR are 4087 * Valid priorities for SCHED_FIFO and SCHED_RR are
4088 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and 4088 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and
4089 * SCHED_BATCH is 0. 4089 * SCHED_BATCH is 0.
4090 */ 4090 */
4091 if (param->sched_priority < 0 || 4091 if (param->sched_priority < 0 ||
4092 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 4092 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4093 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 4093 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4094 return -EINVAL; 4094 return -EINVAL;
4095 if (is_rt_policy(policy) != (param->sched_priority != 0)) 4095 if (is_rt_policy(policy) != (param->sched_priority != 0))
4096 return -EINVAL; 4096 return -EINVAL;
4097 4097
4098 /* 4098 /*
4099 * Allow unprivileged RT tasks to decrease priority: 4099 * Allow unprivileged RT tasks to decrease priority:
4100 */ 4100 */
4101 if (!capable(CAP_SYS_NICE)) { 4101 if (!capable(CAP_SYS_NICE)) {
4102 if (is_rt_policy(policy)) { 4102 if (is_rt_policy(policy)) {
4103 unsigned long rlim_rtprio; 4103 unsigned long rlim_rtprio;
4104 unsigned long flags; 4104 unsigned long flags;
4105 4105
4106 if (!lock_task_sighand(p, &flags)) 4106 if (!lock_task_sighand(p, &flags))
4107 return -ESRCH; 4107 return -ESRCH;
4108 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; 4108 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4109 unlock_task_sighand(p, &flags); 4109 unlock_task_sighand(p, &flags);
4110 4110
4111 /* can't set/change the rt policy */ 4111 /* can't set/change the rt policy */
4112 if (policy != p->policy && !rlim_rtprio) 4112 if (policy != p->policy && !rlim_rtprio)
4113 return -EPERM; 4113 return -EPERM;
4114 4114
4115 /* can't increase priority */ 4115 /* can't increase priority */
4116 if (param->sched_priority > p->rt_priority && 4116 if (param->sched_priority > p->rt_priority &&
4117 param->sched_priority > rlim_rtprio) 4117 param->sched_priority > rlim_rtprio)
4118 return -EPERM; 4118 return -EPERM;
4119 } 4119 }
4120 4120
4121 /* can't change other user's priorities */ 4121 /* can't change other user's priorities */
4122 if ((current->euid != p->euid) && 4122 if ((current->euid != p->euid) &&
4123 (current->euid != p->uid)) 4123 (current->euid != p->uid))
4124 return -EPERM; 4124 return -EPERM;
4125 } 4125 }
4126 4126
4127 retval = security_task_setscheduler(p, policy, param); 4127 retval = security_task_setscheduler(p, policy, param);
4128 if (retval) 4128 if (retval)
4129 return retval; 4129 return retval;
4130 /* 4130 /*
4131 * make sure no PI-waiters arrive (or leave) while we are 4131 * make sure no PI-waiters arrive (or leave) while we are
4132 * changing the priority of the task: 4132 * changing the priority of the task:
4133 */ 4133 */
4134 spin_lock_irqsave(&p->pi_lock, flags); 4134 spin_lock_irqsave(&p->pi_lock, flags);
4135 /* 4135 /*
 4136 * To be able to change p->policy safely, the appropriate 4136 * To be able to change p->policy safely, the appropriate
4137 * runqueue lock must be held. 4137 * runqueue lock must be held.
4138 */ 4138 */
4139 rq = __task_rq_lock(p); 4139 rq = __task_rq_lock(p);
4140 /* recheck policy now with rq lock held */ 4140 /* recheck policy now with rq lock held */
4141 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4141 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4142 policy = oldpolicy = -1; 4142 policy = oldpolicy = -1;
4143 __task_rq_unlock(rq); 4143 __task_rq_unlock(rq);
4144 spin_unlock_irqrestore(&p->pi_lock, flags); 4144 spin_unlock_irqrestore(&p->pi_lock, flags);
4145 goto recheck; 4145 goto recheck;
4146 } 4146 }
4147 array = p->array; 4147 array = p->array;
4148 if (array) 4148 if (array)
4149 deactivate_task(p, rq); 4149 deactivate_task(p, rq);
4150 oldprio = p->prio; 4150 oldprio = p->prio;
4151 __setscheduler(p, policy, param->sched_priority); 4151 __setscheduler(p, policy, param->sched_priority);
4152 if (array) { 4152 if (array) {
4153 __activate_task(p, rq); 4153 __activate_task(p, rq);
4154 /* 4154 /*
4155 * Reschedule if we are currently running on this runqueue and 4155 * Reschedule if we are currently running on this runqueue and
4156 * our priority decreased, or if we are not currently running on 4156 * our priority decreased, or if we are not currently running on
4157 * this runqueue and our priority is higher than the current's 4157 * this runqueue and our priority is higher than the current's
4158 */ 4158 */
4159 if (task_running(rq, p)) { 4159 if (task_running(rq, p)) {
4160 if (p->prio > oldprio) 4160 if (p->prio > oldprio)
4161 resched_task(rq->curr); 4161 resched_task(rq->curr);
4162 } else if (TASK_PREEMPTS_CURR(p, rq)) 4162 } else if (TASK_PREEMPTS_CURR(p, rq))
4163 resched_task(rq->curr); 4163 resched_task(rq->curr);
4164 } 4164 }
4165 __task_rq_unlock(rq); 4165 __task_rq_unlock(rq);
4166 spin_unlock_irqrestore(&p->pi_lock, flags); 4166 spin_unlock_irqrestore(&p->pi_lock, flags);
4167 4167
4168 rt_mutex_adjust_pi(p); 4168 rt_mutex_adjust_pi(p);
4169 4169
4170 return 0; 4170 return 0;
4171 } 4171 }
4172 EXPORT_SYMBOL_GPL(sched_setscheduler); 4172 EXPORT_SYMBOL_GPL(sched_setscheduler);
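/*
 * Illustrative in-kernel sketch, not part of this diff: a typical caller of
 * the exported sched_setscheduler() above makes a kernel thread real-time by
 * filling in a struct sched_param. The helper name and priority choice are
 * made up for the example; the declarations used are the ones already
 * available in this file.
 */
static void example_make_kthread_rt(struct task_struct *tsk)
{
	struct sched_param sp = { .sched_priority = MAX_RT_PRIO - 1 };

	sched_setscheduler(tsk, SCHED_FIFO, &sp);
}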
4173 4173
4174 static int 4174 static int
4175 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 4175 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4176 { 4176 {
4177 struct sched_param lparam; 4177 struct sched_param lparam;
4178 struct task_struct *p; 4178 struct task_struct *p;
4179 int retval; 4179 int retval;
4180 4180
4181 if (!param || pid < 0) 4181 if (!param || pid < 0)
4182 return -EINVAL; 4182 return -EINVAL;
4183 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 4183 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4184 return -EFAULT; 4184 return -EFAULT;
4185 4185
4186 rcu_read_lock(); 4186 rcu_read_lock();
4187 retval = -ESRCH; 4187 retval = -ESRCH;
4188 p = find_process_by_pid(pid); 4188 p = find_process_by_pid(pid);
4189 if (p != NULL) 4189 if (p != NULL)
4190 retval = sched_setscheduler(p, policy, &lparam); 4190 retval = sched_setscheduler(p, policy, &lparam);
4191 rcu_read_unlock(); 4191 rcu_read_unlock();
4192 4192
4193 return retval; 4193 return retval;
4194 } 4194 }
4195 4195
4196 /** 4196 /**
4197 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 4197 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4198 * @pid: the pid in question. 4198 * @pid: the pid in question.
4199 * @policy: new policy. 4199 * @policy: new policy.
4200 * @param: structure containing the new RT priority. 4200 * @param: structure containing the new RT priority.
4201 */ 4201 */
4202 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, 4202 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
4203 struct sched_param __user *param) 4203 struct sched_param __user *param)
4204 { 4204 {
4205 /* negative values for policy are not valid */ 4205 /* negative values for policy are not valid */
4206 if (policy < 0) 4206 if (policy < 0)
4207 return -EINVAL; 4207 return -EINVAL;
4208 4208
4209 return do_sched_setscheduler(pid, policy, param); 4209 return do_sched_setscheduler(pid, policy, param);
4210 } 4210 }
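/*
 * Illustrative user-space sketch, not part of this diff: the syscall above is
 * what the glibc sched_setscheduler() wrapper in <sched.h> invokes. The
 * function name and priority value are made up for the example.
 */
#include <sched.h>
#include <stdio.h>
#include <sys/types.h>

static int example_set_fifo(pid_t pid)
{
	struct sched_param sp = { .sched_priority = 10 };

	if (sched_setscheduler(pid, SCHED_FIFO, &sp) == -1) {
		perror("sched_setscheduler");
		return -1;
	}
	return 0;
}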
4211 4211
4212 /** 4212 /**
4213 * sys_sched_setparam - set/change the RT priority of a thread 4213 * sys_sched_setparam - set/change the RT priority of a thread
4214 * @pid: the pid in question. 4214 * @pid: the pid in question.
4215 * @param: structure containing the new RT priority. 4215 * @param: structure containing the new RT priority.
4216 */ 4216 */
4217 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) 4217 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4218 { 4218 {
4219 return do_sched_setscheduler(pid, -1, param); 4219 return do_sched_setscheduler(pid, -1, param);
4220 } 4220 }
4221 4221
4222 /** 4222 /**
4223 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 4223 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4224 * @pid: the pid in question. 4224 * @pid: the pid in question.
4225 */ 4225 */
4226 asmlinkage long sys_sched_getscheduler(pid_t pid) 4226 asmlinkage long sys_sched_getscheduler(pid_t pid)
4227 { 4227 {
4228 struct task_struct *p; 4228 struct task_struct *p;
4229 int retval = -EINVAL; 4229 int retval = -EINVAL;
4230 4230
4231 if (pid < 0) 4231 if (pid < 0)
4232 goto out_nounlock; 4232 goto out_nounlock;
4233 4233
4234 retval = -ESRCH; 4234 retval = -ESRCH;
4235 read_lock(&tasklist_lock); 4235 read_lock(&tasklist_lock);
4236 p = find_process_by_pid(pid); 4236 p = find_process_by_pid(pid);
4237 if (p) { 4237 if (p) {
4238 retval = security_task_getscheduler(p); 4238 retval = security_task_getscheduler(p);
4239 if (!retval) 4239 if (!retval)
4240 retval = p->policy; 4240 retval = p->policy;
4241 } 4241 }
4242 read_unlock(&tasklist_lock); 4242 read_unlock(&tasklist_lock);
4243 4243
4244 out_nounlock: 4244 out_nounlock:
4245 return retval; 4245 return retval;
4246 } 4246 }
4247 4247
4248 /** 4248 /**
 4249 * sys_sched_getparam - get the RT priority of a thread 4249 * sys_sched_getparam - get the RT priority of a thread
4250 * @pid: the pid in question. 4250 * @pid: the pid in question.
4251 * @param: structure containing the RT priority. 4251 * @param: structure containing the RT priority.
4252 */ 4252 */
4253 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) 4253 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4254 { 4254 {
4255 struct sched_param lp; 4255 struct sched_param lp;
4256 struct task_struct *p; 4256 struct task_struct *p;
4257 int retval = -EINVAL; 4257 int retval = -EINVAL;
4258 4258
4259 if (!param || pid < 0) 4259 if (!param || pid < 0)
4260 goto out_nounlock; 4260 goto out_nounlock;
4261 4261
4262 read_lock(&tasklist_lock); 4262 read_lock(&tasklist_lock);
4263 p = find_process_by_pid(pid); 4263 p = find_process_by_pid(pid);
4264 retval = -ESRCH; 4264 retval = -ESRCH;
4265 if (!p) 4265 if (!p)
4266 goto out_unlock; 4266 goto out_unlock;
4267 4267
4268 retval = security_task_getscheduler(p); 4268 retval = security_task_getscheduler(p);
4269 if (retval) 4269 if (retval)
4270 goto out_unlock; 4270 goto out_unlock;
4271 4271
4272 lp.sched_priority = p->rt_priority; 4272 lp.sched_priority = p->rt_priority;
4273 read_unlock(&tasklist_lock); 4273 read_unlock(&tasklist_lock);
4274 4274
4275 /* 4275 /*
 4276 * This one might sleep; we cannot do it with a spinlock held ... 4276 * This one might sleep; we cannot do it with a spinlock held ...
4277 */ 4277 */
4278 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 4278 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4279 4279
4280 out_nounlock: 4280 out_nounlock:
4281 return retval; 4281 return retval;
4282 4282
4283 out_unlock: 4283 out_unlock:
4284 read_unlock(&tasklist_lock); 4284 read_unlock(&tasklist_lock);
4285 return retval; 4285 return retval;
4286 } 4286 }
4287 4287
4288 long sched_setaffinity(pid_t pid, cpumask_t new_mask) 4288 long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4289 { 4289 {
4290 cpumask_t cpus_allowed; 4290 cpumask_t cpus_allowed;
4291 struct task_struct *p; 4291 struct task_struct *p;
4292 int retval; 4292 int retval;
4293 4293
4294 lock_cpu_hotplug(); 4294 lock_cpu_hotplug();
4295 read_lock(&tasklist_lock); 4295 read_lock(&tasklist_lock);
4296 4296
4297 p = find_process_by_pid(pid); 4297 p = find_process_by_pid(pid);
4298 if (!p) { 4298 if (!p) {
4299 read_unlock(&tasklist_lock); 4299 read_unlock(&tasklist_lock);
4300 unlock_cpu_hotplug(); 4300 unlock_cpu_hotplug();
4301 return -ESRCH; 4301 return -ESRCH;
4302 } 4302 }
4303 4303
4304 /* 4304 /*
4305 * It is not safe to call set_cpus_allowed with the 4305 * It is not safe to call set_cpus_allowed with the
4306 * tasklist_lock held. We will bump the task_struct's 4306 * tasklist_lock held. We will bump the task_struct's
4307 * usage count and then drop tasklist_lock. 4307 * usage count and then drop tasklist_lock.
4308 */ 4308 */
4309 get_task_struct(p); 4309 get_task_struct(p);
4310 read_unlock(&tasklist_lock); 4310 read_unlock(&tasklist_lock);
4311 4311
4312 retval = -EPERM; 4312 retval = -EPERM;
4313 if ((current->euid != p->euid) && (current->euid != p->uid) && 4313 if ((current->euid != p->euid) && (current->euid != p->uid) &&
4314 !capable(CAP_SYS_NICE)) 4314 !capable(CAP_SYS_NICE))
4315 goto out_unlock; 4315 goto out_unlock;
4316 4316
4317 retval = security_task_setscheduler(p, 0, NULL); 4317 retval = security_task_setscheduler(p, 0, NULL);
4318 if (retval) 4318 if (retval)
4319 goto out_unlock; 4319 goto out_unlock;
4320 4320
4321 cpus_allowed = cpuset_cpus_allowed(p); 4321 cpus_allowed = cpuset_cpus_allowed(p);
4322 cpus_and(new_mask, new_mask, cpus_allowed); 4322 cpus_and(new_mask, new_mask, cpus_allowed);
4323 retval = set_cpus_allowed(p, new_mask); 4323 retval = set_cpus_allowed(p, new_mask);
4324 4324
4325 out_unlock: 4325 out_unlock:
4326 put_task_struct(p); 4326 put_task_struct(p);
4327 unlock_cpu_hotplug(); 4327 unlock_cpu_hotplug();
4328 return retval; 4328 return retval;
4329 } 4329 }
4330 4330
4331 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 4331 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4332 cpumask_t *new_mask) 4332 cpumask_t *new_mask)
4333 { 4333 {
4334 if (len < sizeof(cpumask_t)) { 4334 if (len < sizeof(cpumask_t)) {
4335 memset(new_mask, 0, sizeof(cpumask_t)); 4335 memset(new_mask, 0, sizeof(cpumask_t));
4336 } else if (len > sizeof(cpumask_t)) { 4336 } else if (len > sizeof(cpumask_t)) {
4337 len = sizeof(cpumask_t); 4337 len = sizeof(cpumask_t);
4338 } 4338 }
4339 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 4339 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4340 } 4340 }
4341 4341
4342 /** 4342 /**
4343 * sys_sched_setaffinity - set the cpu affinity of a process 4343 * sys_sched_setaffinity - set the cpu affinity of a process
4344 * @pid: pid of the process 4344 * @pid: pid of the process
4345 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4345 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4346 * @user_mask_ptr: user-space pointer to the new cpu mask 4346 * @user_mask_ptr: user-space pointer to the new cpu mask
4347 */ 4347 */
4348 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, 4348 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4349 unsigned long __user *user_mask_ptr) 4349 unsigned long __user *user_mask_ptr)
4350 { 4350 {
4351 cpumask_t new_mask; 4351 cpumask_t new_mask;
4352 int retval; 4352 int retval;
4353 4353
4354 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); 4354 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
4355 if (retval) 4355 if (retval)
4356 return retval; 4356 return retval;
4357 4357
4358 return sched_setaffinity(pid, new_mask); 4358 return sched_setaffinity(pid, new_mask);
4359 } 4359 }
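/*
 * Illustrative user-space sketch, not part of this diff: pinning the calling
 * process to CPU 0 through the glibc wrapper for the syscall above. The
 * function name is made up for the example.
 */
#define _GNU_SOURCE
#include <sched.h>

static int example_pin_to_cpu0(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);
	return sched_setaffinity(0, sizeof(set), &set);	/* pid 0 == caller */
}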
4360 4360
4361 /* 4361 /*
 4362 * Represents all CPUs present in the system 4362 * Represents all CPUs present in the system
4363 * In systems capable of hotplug, this map could dynamically grow 4363 * In systems capable of hotplug, this map could dynamically grow
 4364 * as new CPUs are detected in the system via any platform-specific 4364 * as new CPUs are detected in the system via any platform-specific
 4365 * method, such as ACPI. 4365 * method, such as ACPI.
4366 */ 4366 */
4367 4367
4368 cpumask_t cpu_present_map __read_mostly; 4368 cpumask_t cpu_present_map __read_mostly;
4369 EXPORT_SYMBOL(cpu_present_map); 4369 EXPORT_SYMBOL(cpu_present_map);
4370 4370
4371 #ifndef CONFIG_SMP 4371 #ifndef CONFIG_SMP
4372 cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; 4372 cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
4373 EXPORT_SYMBOL(cpu_online_map); 4373 EXPORT_SYMBOL(cpu_online_map);
4374 4374
4375 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; 4375 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
4376 EXPORT_SYMBOL(cpu_possible_map); 4376 EXPORT_SYMBOL(cpu_possible_map);
4377 #endif 4377 #endif
4378 4378
4379 long sched_getaffinity(pid_t pid, cpumask_t *mask) 4379 long sched_getaffinity(pid_t pid, cpumask_t *mask)
4380 { 4380 {
4381 struct task_struct *p; 4381 struct task_struct *p;
4382 int retval; 4382 int retval;
4383 4383
4384 lock_cpu_hotplug(); 4384 lock_cpu_hotplug();
4385 read_lock(&tasklist_lock); 4385 read_lock(&tasklist_lock);
4386 4386
4387 retval = -ESRCH; 4387 retval = -ESRCH;
4388 p = find_process_by_pid(pid); 4388 p = find_process_by_pid(pid);
4389 if (!p) 4389 if (!p)
4390 goto out_unlock; 4390 goto out_unlock;
4391 4391
4392 retval = security_task_getscheduler(p); 4392 retval = security_task_getscheduler(p);
4393 if (retval) 4393 if (retval)
4394 goto out_unlock; 4394 goto out_unlock;
4395 4395
4396 cpus_and(*mask, p->cpus_allowed, cpu_online_map); 4396 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
4397 4397
4398 out_unlock: 4398 out_unlock:
4399 read_unlock(&tasklist_lock); 4399 read_unlock(&tasklist_lock);
4400 unlock_cpu_hotplug(); 4400 unlock_cpu_hotplug();
4401 if (retval) 4401 if (retval)
4402 return retval; 4402 return retval;
4403 4403
4404 return 0; 4404 return 0;
4405 } 4405 }
4406 4406
4407 /** 4407 /**
4408 * sys_sched_getaffinity - get the cpu affinity of a process 4408 * sys_sched_getaffinity - get the cpu affinity of a process
4409 * @pid: pid of the process 4409 * @pid: pid of the process
4410 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4410 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4411 * @user_mask_ptr: user-space pointer to hold the current cpu mask 4411 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4412 */ 4412 */
4413 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, 4413 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4414 unsigned long __user *user_mask_ptr) 4414 unsigned long __user *user_mask_ptr)
4415 { 4415 {
4416 int ret; 4416 int ret;
4417 cpumask_t mask; 4417 cpumask_t mask;
4418 4418
4419 if (len < sizeof(cpumask_t)) 4419 if (len < sizeof(cpumask_t))
4420 return -EINVAL; 4420 return -EINVAL;
4421 4421
4422 ret = sched_getaffinity(pid, &mask); 4422 ret = sched_getaffinity(pid, &mask);
4423 if (ret < 0) 4423 if (ret < 0)
4424 return ret; 4424 return ret;
4425 4425
4426 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) 4426 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
4427 return -EFAULT; 4427 return -EFAULT;
4428 4428
4429 return sizeof(cpumask_t); 4429 return sizeof(cpumask_t);
4430 } 4430 }
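/*
 * Illustrative user-space sketch, not part of this diff: reading an affinity
 * mask back through the glibc wrapper for the syscall above and counting the
 * CPUs it allows. The function name is made up for the example.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/types.h>

static void example_show_affinity(pid_t pid)
{
	cpu_set_t set;
	int cpu, allowed = 0;

	if (sched_getaffinity(pid, sizeof(set), &set) == 0) {
		for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
			if (CPU_ISSET(cpu, &set))
				allowed++;
		printf("allowed CPUs: %d\n", allowed);
	}
}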
4431 4431
4432 /** 4432 /**
4433 * sys_sched_yield - yield the current processor to other threads. 4433 * sys_sched_yield - yield the current processor to other threads.
4434 * 4434 *
4435 * This function yields the current CPU by moving the calling thread 4435 * This function yields the current CPU by moving the calling thread
4436 * to the expired array. If there are no other threads running on this 4436 * to the expired array. If there are no other threads running on this
4437 * CPU then this function will return. 4437 * CPU then this function will return.
4438 */ 4438 */
4439 asmlinkage long sys_sched_yield(void) 4439 asmlinkage long sys_sched_yield(void)
4440 { 4440 {
4441 struct rq *rq = this_rq_lock(); 4441 struct rq *rq = this_rq_lock();
4442 struct prio_array *array = current->array, *target = rq->expired; 4442 struct prio_array *array = current->array, *target = rq->expired;
4443 4443
4444 schedstat_inc(rq, yld_cnt); 4444 schedstat_inc(rq, yld_cnt);
4445 /* 4445 /*
4446 * We implement yielding by moving the task into the expired 4446 * We implement yielding by moving the task into the expired
4447 * queue. 4447 * queue.
4448 * 4448 *
4449 * (special rule: RT tasks will just roundrobin in the active 4449 * (special rule: RT tasks will just roundrobin in the active
4450 * array.) 4450 * array.)
4451 */ 4451 */
4452 if (rt_task(current)) 4452 if (rt_task(current))
4453 target = rq->active; 4453 target = rq->active;
4454 4454
4455 if (array->nr_active == 1) { 4455 if (array->nr_active == 1) {
4456 schedstat_inc(rq, yld_act_empty); 4456 schedstat_inc(rq, yld_act_empty);
4457 if (!rq->expired->nr_active) 4457 if (!rq->expired->nr_active)
4458 schedstat_inc(rq, yld_both_empty); 4458 schedstat_inc(rq, yld_both_empty);
4459 } else if (!rq->expired->nr_active) 4459 } else if (!rq->expired->nr_active)
4460 schedstat_inc(rq, yld_exp_empty); 4460 schedstat_inc(rq, yld_exp_empty);
4461 4461
4462 if (array != target) { 4462 if (array != target) {
4463 dequeue_task(current, array); 4463 dequeue_task(current, array);
4464 enqueue_task(current, target); 4464 enqueue_task(current, target);
4465 } else 4465 } else
4466 /* 4466 /*
4467 * requeue_task is cheaper so perform that if possible. 4467 * requeue_task is cheaper so perform that if possible.
4468 */ 4468 */
4469 requeue_task(current, array); 4469 requeue_task(current, array);
4470 4470
4471 /* 4471 /*
4472 * Since we are going to call schedule() anyway, there's 4472 * Since we are going to call schedule() anyway, there's
4473 * no need to preempt or enable interrupts: 4473 * no need to preempt or enable interrupts:
4474 */ 4474 */
4475 __release(rq->lock); 4475 __release(rq->lock);
4476 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 4476 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4477 _raw_spin_unlock(&rq->lock); 4477 _raw_spin_unlock(&rq->lock);
4478 preempt_enable_no_resched(); 4478 preempt_enable_no_resched();
4479 4479
4480 schedule(); 4480 schedule();
4481 4481
4482 return 0; 4482 return 0;
4483 } 4483 }
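/*
 * Illustrative user-space sketch, not part of this diff: a polite busy-wait
 * that uses the sched_yield() wrapper for the syscall above to give other
 * runnable threads a turn between polls. The function name is made up.
 */
#include <sched.h>

static void example_wait_for_flag(volatile int *flag)
{
	while (!*flag)
		sched_yield();
}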
4484 4484
4485 static void __cond_resched(void) 4485 static void __cond_resched(void)
4486 { 4486 {
4487 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 4487 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4488 __might_sleep(__FILE__, __LINE__); 4488 __might_sleep(__FILE__, __LINE__);
4489 #endif 4489 #endif
4490 /* 4490 /*
4491 * The BKS might be reacquired before we have dropped 4491 * The BKS might be reacquired before we have dropped
4492 * PREEMPT_ACTIVE, which could trigger a second 4492 * PREEMPT_ACTIVE, which could trigger a second
4493 * cond_resched() call. 4493 * cond_resched() call.
4494 */ 4494 */
4495 do { 4495 do {
4496 add_preempt_count(PREEMPT_ACTIVE); 4496 add_preempt_count(PREEMPT_ACTIVE);
4497 schedule(); 4497 schedule();
4498 sub_preempt_count(PREEMPT_ACTIVE); 4498 sub_preempt_count(PREEMPT_ACTIVE);
4499 } while (need_resched()); 4499 } while (need_resched());
4500 } 4500 }
4501 4501
4502 int __sched cond_resched(void) 4502 int __sched cond_resched(void)
4503 { 4503 {
4504 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && 4504 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4505 system_state == SYSTEM_RUNNING) { 4505 system_state == SYSTEM_RUNNING) {
4506 __cond_resched(); 4506 __cond_resched();
4507 return 1; 4507 return 1;
4508 } 4508 }
4509 return 0; 4509 return 0;
4510 } 4510 }
4511 EXPORT_SYMBOL(cond_resched); 4511 EXPORT_SYMBOL(cond_resched);
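/*
 * Illustrative in-kernel sketch, not part of this diff: a long-running loop
 * typically calls cond_resched() once per iteration so that non-preemptible
 * kernels stay responsive. The helper name is made up for the example.
 */
static void example_long_scan(unsigned long *data, unsigned long n)
{
	unsigned long i;

	for (i = 0; i < n; i++) {
		data[i] = 0;		/* stand-in for real per-item work */
		cond_resched();		/* schedule here if a resched is pending */
	}
}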
4512 4512
4513 /* 4513 /*
4514 * cond_resched_lock() - if a reschedule is pending, drop the given lock, 4514 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
4515 * call schedule, and on return reacquire the lock. 4515 * call schedule, and on return reacquire the lock.
4516 * 4516 *
4517 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 4517 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4518 * operations here to prevent schedule() from being called twice (once via 4518 * operations here to prevent schedule() from being called twice (once via
4519 * spin_unlock(), once by hand). 4519 * spin_unlock(), once by hand).
4520 */ 4520 */
4521 int cond_resched_lock(spinlock_t *lock) 4521 int cond_resched_lock(spinlock_t *lock)
4522 { 4522 {
4523 int ret = 0; 4523 int ret = 0;
4524 4524
4525 if (need_lockbreak(lock)) { 4525 if (need_lockbreak(lock)) {
4526 spin_unlock(lock); 4526 spin_unlock(lock);
4527 cpu_relax(); 4527 cpu_relax();
4528 ret = 1; 4528 ret = 1;
4529 spin_lock(lock); 4529 spin_lock(lock);
4530 } 4530 }
4531 if (need_resched() && system_state == SYSTEM_RUNNING) { 4531 if (need_resched() && system_state == SYSTEM_RUNNING) {
4532 spin_release(&lock->dep_map, 1, _THIS_IP_); 4532 spin_release(&lock->dep_map, 1, _THIS_IP_);
4533 _raw_spin_unlock(lock); 4533 _raw_spin_unlock(lock);
4534 preempt_enable_no_resched(); 4534 preempt_enable_no_resched();
4535 __cond_resched(); 4535 __cond_resched();
4536 ret = 1; 4536 ret = 1;
4537 spin_lock(lock); 4537 spin_lock(lock);
4538 } 4538 }
4539 return ret; 4539 return ret;
4540 } 4540 }
4541 EXPORT_SYMBOL(cond_resched_lock); 4541 EXPORT_SYMBOL(cond_resched_lock);
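/*
 * Illustrative in-kernel sketch, not part of this diff: draining a long list
 * under a spinlock while letting cond_resched_lock() bound the lock hold
 * time. Because the lock may be dropped and retaken inside the loop, the
 * emptiness test is redone on every iteration. The helper name is made up.
 */
static void example_drain_list(spinlock_t *lock, struct list_head *head)
{
	spin_lock(lock);
	while (!list_empty(head)) {
		list_del_init(head->next);	/* the entry would be processed here */
		cond_resched_lock(lock);	/* may drop and reacquire *lock */
	}
	spin_unlock(lock);
}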
4542 4542
4543 int __sched cond_resched_softirq(void) 4543 int __sched cond_resched_softirq(void)
4544 { 4544 {
4545 BUG_ON(!in_softirq()); 4545 BUG_ON(!in_softirq());
4546 4546
4547 if (need_resched() && system_state == SYSTEM_RUNNING) { 4547 if (need_resched() && system_state == SYSTEM_RUNNING) {
4548 raw_local_irq_disable(); 4548 raw_local_irq_disable();
4549 _local_bh_enable(); 4549 _local_bh_enable();
4550 raw_local_irq_enable(); 4550 raw_local_irq_enable();
4551 __cond_resched(); 4551 __cond_resched();
4552 local_bh_disable(); 4552 local_bh_disable();
4553 return 1; 4553 return 1;
4554 } 4554 }
4555 return 0; 4555 return 0;
4556 } 4556 }
4557 EXPORT_SYMBOL(cond_resched_softirq); 4557 EXPORT_SYMBOL(cond_resched_softirq);
4558 4558
4559 /** 4559 /**
4560 * yield - yield the current processor to other threads. 4560 * yield - yield the current processor to other threads.
4561 * 4561 *
4562 * This is a shortcut for kernel-space yielding - it marks the 4562 * This is a shortcut for kernel-space yielding - it marks the
4563 * thread runnable and calls sys_sched_yield(). 4563 * thread runnable and calls sys_sched_yield().
4564 */ 4564 */
4565 void __sched yield(void) 4565 void __sched yield(void)
4566 { 4566 {
4567 set_current_state(TASK_RUNNING); 4567 set_current_state(TASK_RUNNING);
4568 sys_sched_yield(); 4568 sys_sched_yield();
4569 } 4569 }
4570 EXPORT_SYMBOL(yield); 4570 EXPORT_SYMBOL(yield);
4571 4571
4572 /* 4572 /*
4573 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 4573 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4574 * that process accounting knows that this is a task in IO wait state. 4574 * that process accounting knows that this is a task in IO wait state.
4575 * 4575 *
4576 * But don't do that if it is a deliberate, throttling IO wait (this task 4576 * But don't do that if it is a deliberate, throttling IO wait (this task
4577 * has set its backing_dev_info: the queue against which it should throttle) 4577 * has set its backing_dev_info: the queue against which it should throttle)
4578 */ 4578 */
4579 void __sched io_schedule(void) 4579 void __sched io_schedule(void)
4580 { 4580 {
4581 struct rq *rq = &__raw_get_cpu_var(runqueues); 4581 struct rq *rq = &__raw_get_cpu_var(runqueues);
4582 4582
4583 delayacct_blkio_start(); 4583 delayacct_blkio_start();
4584 atomic_inc(&rq->nr_iowait); 4584 atomic_inc(&rq->nr_iowait);
4585 schedule(); 4585 schedule();
4586 atomic_dec(&rq->nr_iowait); 4586 atomic_dec(&rq->nr_iowait);
4587 delayacct_blkio_end(); 4587 delayacct_blkio_end();
4588 } 4588 }
4589 EXPORT_SYMBOL(io_schedule); 4589 EXPORT_SYMBOL(io_schedule);
4590 4590
4591 long __sched io_schedule_timeout(long timeout) 4591 long __sched io_schedule_timeout(long timeout)
4592 { 4592 {
4593 struct rq *rq = &__raw_get_cpu_var(runqueues); 4593 struct rq *rq = &__raw_get_cpu_var(runqueues);
4594 long ret; 4594 long ret;
4595 4595
4596 delayacct_blkio_start(); 4596 delayacct_blkio_start();
4597 atomic_inc(&rq->nr_iowait); 4597 atomic_inc(&rq->nr_iowait);
4598 ret = schedule_timeout(timeout); 4598 ret = schedule_timeout(timeout);
4599 atomic_dec(&rq->nr_iowait); 4599 atomic_dec(&rq->nr_iowait);
4600 delayacct_blkio_end(); 4600 delayacct_blkio_end();
4601 return ret; 4601 return ret;
4602 } 4602 }
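/*
 * Illustrative in-kernel sketch, not part of this diff: sleeping for an I/O
 * completion via io_schedule() above, so the sleeper is accounted as iowait
 * rather than as an ordinary sleep. The helper name is made up.
 */
static void example_wait_for_io(wait_queue_head_t *wq, int *done)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
		if (*done)
			break;
		io_schedule();
	}
	finish_wait(wq, &wait);
}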
4603 4603
4604 /** 4604 /**
4605 * sys_sched_get_priority_max - return maximum RT priority. 4605 * sys_sched_get_priority_max - return maximum RT priority.
4606 * @policy: scheduling class. 4606 * @policy: scheduling class.
4607 * 4607 *
4608 * this syscall returns the maximum rt_priority that can be used 4608 * this syscall returns the maximum rt_priority that can be used
4609 * by a given scheduling class. 4609 * by a given scheduling class.
4610 */ 4610 */
4611 asmlinkage long sys_sched_get_priority_max(int policy) 4611 asmlinkage long sys_sched_get_priority_max(int policy)
4612 { 4612 {
4613 int ret = -EINVAL; 4613 int ret = -EINVAL;
4614 4614
4615 switch (policy) { 4615 switch (policy) {
4616 case SCHED_FIFO: 4616 case SCHED_FIFO:
4617 case SCHED_RR: 4617 case SCHED_RR:
4618 ret = MAX_USER_RT_PRIO-1; 4618 ret = MAX_USER_RT_PRIO-1;
4619 break; 4619 break;
4620 case SCHED_NORMAL: 4620 case SCHED_NORMAL:
4621 case SCHED_BATCH: 4621 case SCHED_BATCH:
4622 ret = 0; 4622 ret = 0;
4623 break; 4623 break;
4624 } 4624 }
4625 return ret; 4625 return ret;
4626 } 4626 }
4627 4627
4628 /** 4628 /**
4629 * sys_sched_get_priority_min - return minimum RT priority. 4629 * sys_sched_get_priority_min - return minimum RT priority.
4630 * @policy: scheduling class. 4630 * @policy: scheduling class.
4631 * 4631 *
4632 * this syscall returns the minimum rt_priority that can be used 4632 * this syscall returns the minimum rt_priority that can be used
4633 * by a given scheduling class. 4633 * by a given scheduling class.
4634 */ 4634 */
4635 asmlinkage long sys_sched_get_priority_min(int policy) 4635 asmlinkage long sys_sched_get_priority_min(int policy)
4636 { 4636 {
4637 int ret = -EINVAL; 4637 int ret = -EINVAL;
4638 4638
4639 switch (policy) { 4639 switch (policy) {
4640 case SCHED_FIFO: 4640 case SCHED_FIFO:
4641 case SCHED_RR: 4641 case SCHED_RR:
4642 ret = 1; 4642 ret = 1;
4643 break; 4643 break;
4644 case SCHED_NORMAL: 4644 case SCHED_NORMAL:
4645 case SCHED_BATCH: 4645 case SCHED_BATCH:
4646 ret = 0; 4646 ret = 0;
4647 } 4647 }
4648 return ret; 4648 return ret;
4649 } 4649 }
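/*
 * Illustrative user-space sketch, not part of this diff: querying the valid
 * static priority range for SCHED_FIFO through the glibc wrappers for the
 * two syscalls above. The function name is made up for the example.
 */
#include <sched.h>
#include <stdio.h>

static void example_show_fifo_range(void)
{
	printf("SCHED_FIFO priorities: %d..%d\n",
	       sched_get_priority_min(SCHED_FIFO),
	       sched_get_priority_max(SCHED_FIFO));
}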
4650 4650
4651 /** 4651 /**
4652 * sys_sched_rr_get_interval - return the default timeslice of a process. 4652 * sys_sched_rr_get_interval - return the default timeslice of a process.
4653 * @pid: pid of the process. 4653 * @pid: pid of the process.
4654 * @interval: userspace pointer to the timeslice value. 4654 * @interval: userspace pointer to the timeslice value.
4655 * 4655 *
4656 * this syscall writes the default timeslice value of a given process 4656 * this syscall writes the default timeslice value of a given process
4657 * into the user-space timespec buffer. A value of '0' means infinity. 4657 * into the user-space timespec buffer. A value of '0' means infinity.
4658 */ 4658 */
4659 asmlinkage 4659 asmlinkage
4660 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) 4660 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4661 { 4661 {
4662 struct task_struct *p; 4662 struct task_struct *p;
4663 int retval = -EINVAL; 4663 int retval = -EINVAL;
4664 struct timespec t; 4664 struct timespec t;
4665 4665
4666 if (pid < 0) 4666 if (pid < 0)
4667 goto out_nounlock; 4667 goto out_nounlock;
4668 4668
4669 retval = -ESRCH; 4669 retval = -ESRCH;
4670 read_lock(&tasklist_lock); 4670 read_lock(&tasklist_lock);
4671 p = find_process_by_pid(pid); 4671 p = find_process_by_pid(pid);
4672 if (!p) 4672 if (!p)
4673 goto out_unlock; 4673 goto out_unlock;
4674 4674
4675 retval = security_task_getscheduler(p); 4675 retval = security_task_getscheduler(p);
4676 if (retval) 4676 if (retval)
4677 goto out_unlock; 4677 goto out_unlock;
4678 4678
4679 jiffies_to_timespec(p->policy == SCHED_FIFO ? 4679 jiffies_to_timespec(p->policy == SCHED_FIFO ?
4680 0 : task_timeslice(p), &t); 4680 0 : task_timeslice(p), &t);
4681 read_unlock(&tasklist_lock); 4681 read_unlock(&tasklist_lock);
4682 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4682 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4683 out_nounlock: 4683 out_nounlock:
4684 return retval; 4684 return retval;
4685 out_unlock: 4685 out_unlock:
4686 read_unlock(&tasklist_lock); 4686 read_unlock(&tasklist_lock);
4687 return retval; 4687 return retval;
4688 } 4688 }
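/*
 * Illustrative user-space sketch, not part of this diff: reading the default
 * timeslice of the calling process through the glibc wrapper for the syscall
 * above. The function name is made up for the example.
 */
#include <sched.h>
#include <stdio.h>
#include <time.h>

static void example_show_timeslice(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts) == 0)
		printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
}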
4689 4689
4690 static const char stat_nam[] = "RSDTtZX"; 4690 static const char stat_nam[] = "RSDTtZX";
4691 4691
4692 static void show_task(struct task_struct *p) 4692 static void show_task(struct task_struct *p)
4693 { 4693 {
4694 unsigned long free = 0; 4694 unsigned long free = 0;
4695 unsigned state; 4695 unsigned state;
4696 4696
4697 state = p->state ? __ffs(p->state) + 1 : 0; 4697 state = p->state ? __ffs(p->state) + 1 : 0;
4698 printk("%-13.13s %c", p->comm, 4698 printk("%-13.13s %c", p->comm,
4699 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 4699 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4700 #if (BITS_PER_LONG == 32) 4700 #if (BITS_PER_LONG == 32)
4701 if (state == TASK_RUNNING) 4701 if (state == TASK_RUNNING)
4702 printk(" running "); 4702 printk(" running ");
4703 else 4703 else
4704 printk(" %08lX ", thread_saved_pc(p)); 4704 printk(" %08lX ", thread_saved_pc(p));
4705 #else 4705 #else
4706 if (state == TASK_RUNNING) 4706 if (state == TASK_RUNNING)
4707 printk(" running task "); 4707 printk(" running task ");
4708 else 4708 else
4709 printk(" %016lx ", thread_saved_pc(p)); 4709 printk(" %016lx ", thread_saved_pc(p));
4710 #endif 4710 #endif
4711 #ifdef CONFIG_DEBUG_STACK_USAGE 4711 #ifdef CONFIG_DEBUG_STACK_USAGE
4712 { 4712 {
4713 unsigned long *n = end_of_stack(p); 4713 unsigned long *n = end_of_stack(p);
4714 while (!*n) 4714 while (!*n)
4715 n++; 4715 n++;
4716 free = (unsigned long)n - (unsigned long)end_of_stack(p); 4716 free = (unsigned long)n - (unsigned long)end_of_stack(p);
4717 } 4717 }
4718 #endif 4718 #endif
4719 printk("%5lu %5d %6d", free, p->pid, p->parent->pid); 4719 printk("%5lu %5d %6d", free, p->pid, p->parent->pid);
4720 if (!p->mm) 4720 if (!p->mm)
4721 printk(" (L-TLB)\n"); 4721 printk(" (L-TLB)\n");
4722 else 4722 else
4723 printk(" (NOTLB)\n"); 4723 printk(" (NOTLB)\n");
4724 4724
4725 if (state != TASK_RUNNING) 4725 if (state != TASK_RUNNING)
4726 show_stack(p, NULL); 4726 show_stack(p, NULL);
4727 } 4727 }
4728 4728
4729 void show_state_filter(unsigned long state_filter) 4729 void show_state_filter(unsigned long state_filter)
4730 { 4730 {
4731 struct task_struct *g, *p; 4731 struct task_struct *g, *p;
4732 4732
4733 #if (BITS_PER_LONG == 32) 4733 #if (BITS_PER_LONG == 32)
4734 printk("\n" 4734 printk("\n"
4735 " free sibling\n"); 4735 " free sibling\n");
4736 printk(" task PC stack pid father child younger older\n"); 4736 printk(" task PC stack pid father child younger older\n");
4737 #else 4737 #else
4738 printk("\n" 4738 printk("\n"
4739 " free sibling\n"); 4739 " free sibling\n");
4740 printk(" task PC stack pid father child younger older\n"); 4740 printk(" task PC stack pid father child younger older\n");
4741 #endif 4741 #endif
4742 read_lock(&tasklist_lock); 4742 read_lock(&tasklist_lock);
4743 do_each_thread(g, p) { 4743 do_each_thread(g, p) {
4744 /* 4744 /*
4745 * reset the NMI-timeout, listing all files on a slow 4745 * reset the NMI-timeout, listing all files on a slow
 4746 * console might take a lot of time: 4746 * console might take a lot of time:
4747 */ 4747 */
4748 touch_nmi_watchdog(); 4748 touch_nmi_watchdog();
4749 if (!state_filter || (p->state & state_filter)) 4749 if (!state_filter || (p->state & state_filter))
4750 show_task(p); 4750 show_task(p);
4751 } while_each_thread(g, p); 4751 } while_each_thread(g, p);
4752 4752
4753 read_unlock(&tasklist_lock); 4753 read_unlock(&tasklist_lock);
4754 /* 4754 /*
4755 * Only show locks if all tasks are dumped: 4755 * Only show locks if all tasks are dumped:
4756 */ 4756 */
4757 if (state_filter == -1) 4757 if (state_filter == -1)
4758 debug_show_all_locks(); 4758 debug_show_all_locks();
4759 } 4759 }
4760 4760
4761 /** 4761 /**
4762 * init_idle - set up an idle thread for a given CPU 4762 * init_idle - set up an idle thread for a given CPU
4763 * @idle: task in question 4763 * @idle: task in question
4764 * @cpu: cpu the idle task belongs to 4764 * @cpu: cpu the idle task belongs to
4765 * 4765 *
4766 * NOTE: this function does not set the idle thread's NEED_RESCHED 4766 * NOTE: this function does not set the idle thread's NEED_RESCHED
4767 * flag, to make booting more robust. 4767 * flag, to make booting more robust.
4768 */ 4768 */
4769 void __cpuinit init_idle(struct task_struct *idle, int cpu) 4769 void __cpuinit init_idle(struct task_struct *idle, int cpu)
4770 { 4770 {
4771 struct rq *rq = cpu_rq(cpu); 4771 struct rq *rq = cpu_rq(cpu);
4772 unsigned long flags; 4772 unsigned long flags;
4773 4773
4774 idle->timestamp = sched_clock(); 4774 idle->timestamp = sched_clock();
4775 idle->sleep_avg = 0; 4775 idle->sleep_avg = 0;
4776 idle->array = NULL; 4776 idle->array = NULL;
4777 idle->prio = idle->normal_prio = MAX_PRIO; 4777 idle->prio = idle->normal_prio = MAX_PRIO;
4778 idle->state = TASK_RUNNING; 4778 idle->state = TASK_RUNNING;
4779 idle->cpus_allowed = cpumask_of_cpu(cpu); 4779 idle->cpus_allowed = cpumask_of_cpu(cpu);
4780 set_task_cpu(idle, cpu); 4780 set_task_cpu(idle, cpu);
4781 4781
4782 spin_lock_irqsave(&rq->lock, flags); 4782 spin_lock_irqsave(&rq->lock, flags);
4783 rq->curr = rq->idle = idle; 4783 rq->curr = rq->idle = idle;
4784 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 4784 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4785 idle->oncpu = 1; 4785 idle->oncpu = 1;
4786 #endif 4786 #endif
4787 spin_unlock_irqrestore(&rq->lock, flags); 4787 spin_unlock_irqrestore(&rq->lock, flags);
4788 4788
4789 /* Set the preempt count _outside_ the spinlocks! */ 4789 /* Set the preempt count _outside_ the spinlocks! */
4790 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) 4790 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4791 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); 4791 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
4792 #else 4792 #else
4793 task_thread_info(idle)->preempt_count = 0; 4793 task_thread_info(idle)->preempt_count = 0;
4794 #endif 4794 #endif
4795 } 4795 }
4796 4796
4797 /* 4797 /*
4798 * In a system that switches off the HZ timer nohz_cpu_mask 4798 * In a system that switches off the HZ timer nohz_cpu_mask
4799 * indicates which cpus entered this state. This is used 4799 * indicates which cpus entered this state. This is used
 4800 * in the rcu update to wait only for active cpus. For systems 4800 * in the rcu update to wait only for active cpus. For systems
4801 * which do not switch off the HZ timer nohz_cpu_mask should 4801 * which do not switch off the HZ timer nohz_cpu_mask should
4802 * always be CPU_MASK_NONE. 4802 * always be CPU_MASK_NONE.
4803 */ 4803 */
4804 cpumask_t nohz_cpu_mask = CPU_MASK_NONE; 4804 cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4805 4805
4806 #ifdef CONFIG_SMP 4806 #ifdef CONFIG_SMP
4807 /* 4807 /*
4808 * This is how migration works: 4808 * This is how migration works:
4809 * 4809 *
4810 * 1) we queue a struct migration_req structure in the source CPU's 4810 * 1) we queue a struct migration_req structure in the source CPU's
4811 * runqueue and wake up that CPU's migration thread. 4811 * runqueue and wake up that CPU's migration thread.
4812 * 2) we down() the locked semaphore => thread blocks. 4812 * 2) we down() the locked semaphore => thread blocks.
4813 * 3) migration thread wakes up (implicitly it forces the migrated 4813 * 3) migration thread wakes up (implicitly it forces the migrated
4814 * thread off the CPU) 4814 * thread off the CPU)
4815 * 4) it gets the migration request and checks whether the migrated 4815 * 4) it gets the migration request and checks whether the migrated
4816 * task is still in the wrong runqueue. 4816 * task is still in the wrong runqueue.
4817 * 5) if it's in the wrong runqueue then the migration thread removes 4817 * 5) if it's in the wrong runqueue then the migration thread removes
4818 * it and puts it into the right queue. 4818 * it and puts it into the right queue.
4819 * 6) migration thread up()s the semaphore. 4819 * 6) migration thread up()s the semaphore.
4820 * 7) we wake up and the migration is done. 4820 * 7) we wake up and the migration is done.
4821 */ 4821 */
4822 4822
4823 /* 4823 /*
4824 * Change a given task's CPU affinity. Migrate the thread to a 4824 * Change a given task's CPU affinity. Migrate the thread to a
4825 * proper CPU and schedule it away if the CPU it's executing on 4825 * proper CPU and schedule it away if the CPU it's executing on
4826 * is removed from the allowed bitmask. 4826 * is removed from the allowed bitmask.
4827 * 4827 *
4828 * NOTE: the caller must have a valid reference to the task, the 4828 * NOTE: the caller must have a valid reference to the task, the
4829 * task must not exit() & deallocate itself prematurely. The 4829 * task must not exit() & deallocate itself prematurely. The
4830 * call is not atomic; no spinlocks may be held. 4830 * call is not atomic; no spinlocks may be held.
4831 */ 4831 */
4832 int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) 4832 int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
4833 { 4833 {
4834 struct migration_req req; 4834 struct migration_req req;
4835 unsigned long flags; 4835 unsigned long flags;
4836 struct rq *rq; 4836 struct rq *rq;
4837 int ret = 0; 4837 int ret = 0;
4838 4838
4839 rq = task_rq_lock(p, &flags); 4839 rq = task_rq_lock(p, &flags);
4840 if (!cpus_intersects(new_mask, cpu_online_map)) { 4840 if (!cpus_intersects(new_mask, cpu_online_map)) {
4841 ret = -EINVAL; 4841 ret = -EINVAL;
4842 goto out; 4842 goto out;
4843 } 4843 }
4844 4844
4845 p->cpus_allowed = new_mask; 4845 p->cpus_allowed = new_mask;
4846 /* Can the task run on the task's current CPU? If so, we're done */ 4846 /* Can the task run on the task's current CPU? If so, we're done */
4847 if (cpu_isset(task_cpu(p), new_mask)) 4847 if (cpu_isset(task_cpu(p), new_mask))
4848 goto out; 4848 goto out;
4849 4849
4850 if (migrate_task(p, any_online_cpu(new_mask), &req)) { 4850 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4851 /* Need help from migration thread: drop lock and wait. */ 4851 /* Need help from migration thread: drop lock and wait. */
4852 task_rq_unlock(rq, &flags); 4852 task_rq_unlock(rq, &flags);
4853 wake_up_process(rq->migration_thread); 4853 wake_up_process(rq->migration_thread);
4854 wait_for_completion(&req.done); 4854 wait_for_completion(&req.done);
4855 tlb_migrate_finish(p->mm); 4855 tlb_migrate_finish(p->mm);
4856 return 0; 4856 return 0;
4857 } 4857 }
4858 out: 4858 out:
4859 task_rq_unlock(rq, &flags); 4859 task_rq_unlock(rq, &flags);
4860 4860
4861 return ret; 4861 return ret;
4862 } 4862 }
4863 EXPORT_SYMBOL_GPL(set_cpus_allowed); 4863 EXPORT_SYMBOL_GPL(set_cpus_allowed);
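/*
 * Illustrative in-kernel sketch, not part of this diff: a caller of the
 * exported set_cpus_allowed() above binding a helper thread to a single CPU.
 * The function name is made up for the example.
 */
static int example_bind_to_cpu(struct task_struct *tsk, int cpu)
{
	return set_cpus_allowed(tsk, cpumask_of_cpu(cpu));
}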
4864 4864
4865 /* 4865 /*
4866 * Move (not current) task off this cpu, onto dest cpu. We're doing 4866 * Move (not current) task off this cpu, onto dest cpu. We're doing
4867 * this because either it can't run here any more (set_cpus_allowed() 4867 * this because either it can't run here any more (set_cpus_allowed()
4868 * away from this CPU, or CPU going down), or because we're 4868 * away from this CPU, or CPU going down), or because we're
4869 * attempting to rebalance this task on exec (sched_exec). 4869 * attempting to rebalance this task on exec (sched_exec).
4870 * 4870 *
4871 * So we race with normal scheduler movements, but that's OK, as long 4871 * So we race with normal scheduler movements, but that's OK, as long
4872 * as the task is no longer on this CPU. 4872 * as the task is no longer on this CPU.
4873 * 4873 *
4874 * Returns non-zero if task was successfully migrated. 4874 * Returns non-zero if task was successfully migrated.
4875 */ 4875 */
4876 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4876 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4877 { 4877 {
4878 struct rq *rq_dest, *rq_src; 4878 struct rq *rq_dest, *rq_src;
4879 int ret = 0; 4879 int ret = 0;
4880 4880
4881 if (unlikely(cpu_is_offline(dest_cpu))) 4881 if (unlikely(cpu_is_offline(dest_cpu)))
4882 return ret; 4882 return ret;
4883 4883
4884 rq_src = cpu_rq(src_cpu); 4884 rq_src = cpu_rq(src_cpu);
4885 rq_dest = cpu_rq(dest_cpu); 4885 rq_dest = cpu_rq(dest_cpu);
4886 4886
4887 double_rq_lock(rq_src, rq_dest); 4887 double_rq_lock(rq_src, rq_dest);
4888 /* Already moved. */ 4888 /* Already moved. */
4889 if (task_cpu(p) != src_cpu) 4889 if (task_cpu(p) != src_cpu)
4890 goto out; 4890 goto out;
4891 /* Affinity changed (again). */ 4891 /* Affinity changed (again). */
4892 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 4892 if (!cpu_isset(dest_cpu, p->cpus_allowed))
4893 goto out; 4893 goto out;
4894 4894
4895 set_task_cpu(p, dest_cpu); 4895 set_task_cpu(p, dest_cpu);
4896 if (p->array) { 4896 if (p->array) {
4897 /* 4897 /*
4898 * Sync timestamp with rq_dest's before activating. 4898 * Sync timestamp with rq_dest's before activating.
4899 * The same thing could be achieved by doing this step 4899 * The same thing could be achieved by doing this step
4900 * afterwards, and pretending it was a local activate. 4900 * afterwards, and pretending it was a local activate.
4901 * This way is cleaner and logically correct. 4901 * This way is cleaner and logically correct.
4902 */ 4902 */
4903 p->timestamp = p->timestamp - rq_src->most_recent_timestamp 4903 p->timestamp = p->timestamp - rq_src->most_recent_timestamp
4904 + rq_dest->most_recent_timestamp; 4904 + rq_dest->most_recent_timestamp;
4905 deactivate_task(p, rq_src); 4905 deactivate_task(p, rq_src);
4906 __activate_task(p, rq_dest); 4906 __activate_task(p, rq_dest);
4907 if (TASK_PREEMPTS_CURR(p, rq_dest)) 4907 if (TASK_PREEMPTS_CURR(p, rq_dest))
4908 resched_task(rq_dest->curr); 4908 resched_task(rq_dest->curr);
4909 } 4909 }
4910 ret = 1; 4910 ret = 1;
4911 out: 4911 out:
4912 double_rq_unlock(rq_src, rq_dest); 4912 double_rq_unlock(rq_src, rq_dest);
4913 return ret; 4913 return ret;
4914 } 4914 }
4915 4915
4916 /* 4916 /*
4917 * migration_thread - this is a highprio system thread that performs 4917 * migration_thread - this is a highprio system thread that performs
 4918 * thread migration by bumping a thread off its CPU and then 'pushing' it onto 4918 * thread migration by bumping a thread off its CPU and then 'pushing' it onto
4919 * another runqueue. 4919 * another runqueue.
4920 */ 4920 */
4921 static int migration_thread(void *data) 4921 static int migration_thread(void *data)
4922 { 4922 {
4923 int cpu = (long)data; 4923 int cpu = (long)data;
4924 struct rq *rq; 4924 struct rq *rq;
4925 4925
4926 rq = cpu_rq(cpu); 4926 rq = cpu_rq(cpu);
4927 BUG_ON(rq->migration_thread != current); 4927 BUG_ON(rq->migration_thread != current);
4928 4928
4929 set_current_state(TASK_INTERRUPTIBLE); 4929 set_current_state(TASK_INTERRUPTIBLE);
4930 while (!kthread_should_stop()) { 4930 while (!kthread_should_stop()) {
4931 struct migration_req *req; 4931 struct migration_req *req;
4932 struct list_head *head; 4932 struct list_head *head;
4933 4933
4934 try_to_freeze(); 4934 try_to_freeze();
4935 4935
4936 spin_lock_irq(&rq->lock); 4936 spin_lock_irq(&rq->lock);
4937 4937
4938 if (cpu_is_offline(cpu)) { 4938 if (cpu_is_offline(cpu)) {
4939 spin_unlock_irq(&rq->lock); 4939 spin_unlock_irq(&rq->lock);
4940 goto wait_to_die; 4940 goto wait_to_die;
4941 } 4941 }
4942 4942
4943 if (rq->active_balance) { 4943 if (rq->active_balance) {
4944 active_load_balance(rq, cpu); 4944 active_load_balance(rq, cpu);
4945 rq->active_balance = 0; 4945 rq->active_balance = 0;
4946 } 4946 }
4947 4947
4948 head = &rq->migration_queue; 4948 head = &rq->migration_queue;
4949 4949
4950 if (list_empty(head)) { 4950 if (list_empty(head)) {
4951 spin_unlock_irq(&rq->lock); 4951 spin_unlock_irq(&rq->lock);
4952 schedule(); 4952 schedule();
4953 set_current_state(TASK_INTERRUPTIBLE); 4953 set_current_state(TASK_INTERRUPTIBLE);
4954 continue; 4954 continue;
4955 } 4955 }
4956 req = list_entry(head->next, struct migration_req, list); 4956 req = list_entry(head->next, struct migration_req, list);
4957 list_del_init(head->next); 4957 list_del_init(head->next);
4958 4958
4959 spin_unlock(&rq->lock); 4959 spin_unlock(&rq->lock);
4960 __migrate_task(req->task, cpu, req->dest_cpu); 4960 __migrate_task(req->task, cpu, req->dest_cpu);
4961 local_irq_enable(); 4961 local_irq_enable();
4962 4962
4963 complete(&req->done); 4963 complete(&req->done);
4964 } 4964 }
4965 __set_current_state(TASK_RUNNING); 4965 __set_current_state(TASK_RUNNING);
4966 return 0; 4966 return 0;
4967 4967
4968 wait_to_die: 4968 wait_to_die:
4969 /* Wait for kthread_stop */ 4969 /* Wait for kthread_stop */
4970 set_current_state(TASK_INTERRUPTIBLE); 4970 set_current_state(TASK_INTERRUPTIBLE);
4971 while (!kthread_should_stop()) { 4971 while (!kthread_should_stop()) {
4972 schedule(); 4972 schedule();
4973 set_current_state(TASK_INTERRUPTIBLE); 4973 set_current_state(TASK_INTERRUPTIBLE);
4974 } 4974 }
4975 __set_current_state(TASK_RUNNING); 4975 __set_current_state(TASK_RUNNING);
4976 return 0; 4976 return 0;
4977 } 4977 }
4978 4978
4979 #ifdef CONFIG_HOTPLUG_CPU 4979 #ifdef CONFIG_HOTPLUG_CPU
4980 /* 4980 /*
 4981 * Figure out where a task on a dead CPU should go, use force if necessary. 4981 * Figure out where a task on a dead CPU should go, use force if necessary.
4982 * NOTE: interrupts should be disabled by the caller 4982 * NOTE: interrupts should be disabled by the caller
4983 */ 4983 */
4984 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 4984 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
4985 { 4985 {
4986 unsigned long flags; 4986 unsigned long flags;
4987 cpumask_t mask; 4987 cpumask_t mask;
4988 struct rq *rq; 4988 struct rq *rq;
4989 int dest_cpu; 4989 int dest_cpu;
4990 4990
4991 restart: 4991 restart:
4992 /* On same node? */ 4992 /* On same node? */
4993 mask = node_to_cpumask(cpu_to_node(dead_cpu)); 4993 mask = node_to_cpumask(cpu_to_node(dead_cpu));
4994 cpus_and(mask, mask, p->cpus_allowed); 4994 cpus_and(mask, mask, p->cpus_allowed);
4995 dest_cpu = any_online_cpu(mask); 4995 dest_cpu = any_online_cpu(mask);
4996 4996
4997 /* On any allowed CPU? */ 4997 /* On any allowed CPU? */
4998 if (dest_cpu == NR_CPUS) 4998 if (dest_cpu == NR_CPUS)
4999 dest_cpu = any_online_cpu(p->cpus_allowed); 4999 dest_cpu = any_online_cpu(p->cpus_allowed);
5000 5000
5001 /* No more Mr. Nice Guy. */ 5001 /* No more Mr. Nice Guy. */
5002 if (dest_cpu == NR_CPUS) { 5002 if (dest_cpu == NR_CPUS) {
5003 rq = task_rq_lock(p, &flags); 5003 rq = task_rq_lock(p, &flags);
5004 cpus_setall(p->cpus_allowed); 5004 cpus_setall(p->cpus_allowed);
5005 dest_cpu = any_online_cpu(p->cpus_allowed); 5005 dest_cpu = any_online_cpu(p->cpus_allowed);
5006 task_rq_unlock(rq, &flags); 5006 task_rq_unlock(rq, &flags);
5007 5007
5008 /* 5008 /*
5009 * Don't tell them about moving exiting tasks or 5009 * Don't tell them about moving exiting tasks or
5010 * kernel threads (both mm NULL), since they never 5010 * kernel threads (both mm NULL), since they never
 5011 * leave the kernel. 5011 * leave the kernel.
5012 */ 5012 */
5013 if (p->mm && printk_ratelimit()) 5013 if (p->mm && printk_ratelimit())
5014 printk(KERN_INFO "process %d (%s) no " 5014 printk(KERN_INFO "process %d (%s) no "
5015 "longer affine to cpu%d\n", 5015 "longer affine to cpu%d\n",
5016 p->pid, p->comm, dead_cpu); 5016 p->pid, p->comm, dead_cpu);
5017 } 5017 }
5018 if (!__migrate_task(p, dead_cpu, dest_cpu)) 5018 if (!__migrate_task(p, dead_cpu, dest_cpu))
5019 goto restart; 5019 goto restart;
5020 } 5020 }
5021 5021
5022 /* 5022 /*
5023 * While a dead CPU has no uninterruptible tasks queued at this point, 5023 * While a dead CPU has no uninterruptible tasks queued at this point,
5024 * it might still have a nonzero ->nr_uninterruptible counter, because 5024 * it might still have a nonzero ->nr_uninterruptible counter, because
 5025 * for performance reasons the counter is not strictly tracking tasks to 5025 * for performance reasons the counter is not strictly tracking tasks to
5026 * their home CPUs. So we just add the counter to another CPU's counter, 5026 * their home CPUs. So we just add the counter to another CPU's counter,
5027 * to keep the global sum constant after CPU-down: 5027 * to keep the global sum constant after CPU-down:
5028 */ 5028 */
5029 static void migrate_nr_uninterruptible(struct rq *rq_src) 5029 static void migrate_nr_uninterruptible(struct rq *rq_src)
5030 { 5030 {
5031 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); 5031 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
5032 unsigned long flags; 5032 unsigned long flags;
5033 5033
5034 local_irq_save(flags); 5034 local_irq_save(flags);
5035 double_rq_lock(rq_src, rq_dest); 5035 double_rq_lock(rq_src, rq_dest);
5036 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 5036 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5037 rq_src->nr_uninterruptible = 0; 5037 rq_src->nr_uninterruptible = 0;
5038 double_rq_unlock(rq_src, rq_dest); 5038 double_rq_unlock(rq_src, rq_dest);
5039 local_irq_restore(flags); 5039 local_irq_restore(flags);
5040 } 5040 }
5041 5041
5042 /* Run through task list and migrate tasks from the dead cpu. */ 5042 /* Run through task list and migrate tasks from the dead cpu. */
5043 static void migrate_live_tasks(int src_cpu) 5043 static void migrate_live_tasks(int src_cpu)
5044 { 5044 {
5045 struct task_struct *p, *t; 5045 struct task_struct *p, *t;
5046 5046
5047 write_lock_irq(&tasklist_lock); 5047 write_lock_irq(&tasklist_lock);
5048 5048
5049 do_each_thread(t, p) { 5049 do_each_thread(t, p) {
5050 if (p == current) 5050 if (p == current)
5051 continue; 5051 continue;
5052 5052
5053 if (task_cpu(p) == src_cpu) 5053 if (task_cpu(p) == src_cpu)
5054 move_task_off_dead_cpu(src_cpu, p); 5054 move_task_off_dead_cpu(src_cpu, p);
5055 } while_each_thread(t, p); 5055 } while_each_thread(t, p);
5056 5056
5057 write_unlock_irq(&tasklist_lock); 5057 write_unlock_irq(&tasklist_lock);
5058 } 5058 }
5059 5059
5060 /* Schedules idle task to be the next runnable task on current CPU. 5060 /* Schedules idle task to be the next runnable task on current CPU.
5061 * It does so by boosting its priority to highest possible and adding it to 5061 * It does so by boosting its priority to highest possible and adding it to
5062 * the _front_ of the runqueue. Used by CPU offline code. 5062 * the _front_ of the runqueue. Used by CPU offline code.
5063 */ 5063 */
5064 void sched_idle_next(void) 5064 void sched_idle_next(void)
5065 { 5065 {
5066 int this_cpu = smp_processor_id(); 5066 int this_cpu = smp_processor_id();
5067 struct rq *rq = cpu_rq(this_cpu); 5067 struct rq *rq = cpu_rq(this_cpu);
5068 struct task_struct *p = rq->idle; 5068 struct task_struct *p = rq->idle;
5069 unsigned long flags; 5069 unsigned long flags;
5070 5070
5071 /* cpu has to be offline */ 5071 /* cpu has to be offline */
5072 BUG_ON(cpu_online(this_cpu)); 5072 BUG_ON(cpu_online(this_cpu));
5073 5073
5074 /* 5074 /*
5075 * Strictly not necessary since rest of the CPUs are stopped by now 5075 * Strictly not necessary since rest of the CPUs are stopped by now
5076 * and interrupts disabled on the current cpu. 5076 * and interrupts disabled on the current cpu.
5077 */ 5077 */
5078 spin_lock_irqsave(&rq->lock, flags); 5078 spin_lock_irqsave(&rq->lock, flags);
5079 5079
5080 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 5080 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
5081 5081
5082 /* Add idle task to the _front_ of its priority queue: */ 5082 /* Add idle task to the _front_ of its priority queue: */
5083 __activate_idle_task(p, rq); 5083 __activate_idle_task(p, rq);
5084 5084
5085 spin_unlock_irqrestore(&rq->lock, flags); 5085 spin_unlock_irqrestore(&rq->lock, flags);
5086 } 5086 }
5087 5087
5088 /* 5088 /*
5089 * Ensures that the idle task is using init_mm right before its cpu goes 5089 * Ensures that the idle task is using init_mm right before its cpu goes
5090 * offline. 5090 * offline.
5091 */ 5091 */
5092 void idle_task_exit(void) 5092 void idle_task_exit(void)
5093 { 5093 {
5094 struct mm_struct *mm = current->active_mm; 5094 struct mm_struct *mm = current->active_mm;
5095 5095
5096 BUG_ON(cpu_online(smp_processor_id())); 5096 BUG_ON(cpu_online(smp_processor_id()));
5097 5097
5098 if (mm != &init_mm) 5098 if (mm != &init_mm)
5099 switch_mm(mm, &init_mm, current); 5099 switch_mm(mm, &init_mm, current);
5100 mmdrop(mm); 5100 mmdrop(mm);
5101 } 5101 }
5102 5102
5103 /* called under rq->lock with disabled interrupts */ 5103 /* called under rq->lock with disabled interrupts */
5104 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) 5104 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5105 { 5105 {
5106 struct rq *rq = cpu_rq(dead_cpu); 5106 struct rq *rq = cpu_rq(dead_cpu);
5107 5107
5108 /* Must be exiting, otherwise would be on tasklist. */ 5108 /* Must be exiting, otherwise would be on tasklist. */
5109 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD); 5109 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
5110 5110
5111 /* Cannot have done final schedule yet: would have vanished. */ 5111 /* Cannot have done final schedule yet: would have vanished. */
5112 BUG_ON(p->state == TASK_DEAD); 5112 BUG_ON(p->state == TASK_DEAD);
5113 5113
5114 get_task_struct(p); 5114 get_task_struct(p);
5115 5115
5116 /* 5116 /*
5117 * Drop lock around migration; if someone else moves it, 5117 * Drop lock around migration; if someone else moves it,
5118 * that's OK. No task can be added to this CPU, so iteration is 5118 * that's OK. No task can be added to this CPU, so iteration is
5119 * fine. 5119 * fine.
5120 * NOTE: interrupts should be left disabled --dev@ 5120 * NOTE: interrupts should be left disabled --dev@
5121 */ 5121 */
5122 spin_unlock(&rq->lock); 5122 spin_unlock(&rq->lock);
5123 move_task_off_dead_cpu(dead_cpu, p); 5123 move_task_off_dead_cpu(dead_cpu, p);
5124 spin_lock(&rq->lock); 5124 spin_lock(&rq->lock);
5125 5125
5126 put_task_struct(p); 5126 put_task_struct(p);
5127 } 5127 }
5128 5128
5129 /* release_task() removes task from tasklist, so we won't find dead tasks. */ 5129 /* release_task() removes task from tasklist, so we won't find dead tasks. */
5130 static void migrate_dead_tasks(unsigned int dead_cpu) 5130 static void migrate_dead_tasks(unsigned int dead_cpu)
5131 { 5131 {
5132 struct rq *rq = cpu_rq(dead_cpu); 5132 struct rq *rq = cpu_rq(dead_cpu);
5133 unsigned int arr, i; 5133 unsigned int arr, i;
5134 5134
5135 for (arr = 0; arr < 2; arr++) { 5135 for (arr = 0; arr < 2; arr++) {
5136 for (i = 0; i < MAX_PRIO; i++) { 5136 for (i = 0; i < MAX_PRIO; i++) {
5137 struct list_head *list = &rq->arrays[arr].queue[i]; 5137 struct list_head *list = &rq->arrays[arr].queue[i];
5138 5138
5139 while (!list_empty(list)) 5139 while (!list_empty(list))
5140 migrate_dead(dead_cpu, list_entry(list->next, 5140 migrate_dead(dead_cpu, list_entry(list->next,
5141 struct task_struct, run_list)); 5141 struct task_struct, run_list));
5142 } 5142 }
5143 } 5143 }
5144 } 5144 }
5145 #endif /* CONFIG_HOTPLUG_CPU */ 5145 #endif /* CONFIG_HOTPLUG_CPU */
5146 5146
5147 /* 5147 /*
5148 * migration_call - callback that gets triggered when a CPU is added. 5148 * migration_call - callback that gets triggered when a CPU is added.
5149 * Here we can start up the necessary migration thread for the new CPU. 5149 * Here we can start up the necessary migration thread for the new CPU.
5150 */ 5150 */
5151 static int __cpuinit 5151 static int __cpuinit
5152 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 5152 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5153 { 5153 {
5154 struct task_struct *p; 5154 struct task_struct *p;
5155 int cpu = (long)hcpu; 5155 int cpu = (long)hcpu;
5156 unsigned long flags; 5156 unsigned long flags;
5157 struct rq *rq; 5157 struct rq *rq;
5158 5158
5159 switch (action) { 5159 switch (action) {
5160 case CPU_UP_PREPARE: 5160 case CPU_UP_PREPARE:
5161 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); 5161 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
5162 if (IS_ERR(p)) 5162 if (IS_ERR(p))
5163 return NOTIFY_BAD; 5163 return NOTIFY_BAD;
5164 p->flags |= PF_NOFREEZE; 5164 p->flags |= PF_NOFREEZE;
5165 kthread_bind(p, cpu); 5165 kthread_bind(p, cpu);
5166 /* Must be high prio: stop_machine expects to yield to it. */ 5166 /* Must be high prio: stop_machine expects to yield to it. */
5167 rq = task_rq_lock(p, &flags); 5167 rq = task_rq_lock(p, &flags);
5168 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 5168 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
5169 task_rq_unlock(rq, &flags); 5169 task_rq_unlock(rq, &flags);
5170 cpu_rq(cpu)->migration_thread = p; 5170 cpu_rq(cpu)->migration_thread = p;
5171 break; 5171 break;
5172 5172
5173 case CPU_ONLINE: 5173 case CPU_ONLINE:
5174 /* Strictly unnecessary, as first user will wake it. */ 5174 /* Strictly unnecessary, as first user will wake it. */
5175 wake_up_process(cpu_rq(cpu)->migration_thread); 5175 wake_up_process(cpu_rq(cpu)->migration_thread);
5176 break; 5176 break;
5177 5177
5178 #ifdef CONFIG_HOTPLUG_CPU 5178 #ifdef CONFIG_HOTPLUG_CPU
5179 case CPU_UP_CANCELED: 5179 case CPU_UP_CANCELED:
5180 if (!cpu_rq(cpu)->migration_thread) 5180 if (!cpu_rq(cpu)->migration_thread)
5181 break; 5181 break;
5182 /* Unbind it from offline cpu so it can run. Fall thru. */ 5182 /* Unbind it from offline cpu so it can run. Fall thru. */
5183 kthread_bind(cpu_rq(cpu)->migration_thread, 5183 kthread_bind(cpu_rq(cpu)->migration_thread,
5184 any_online_cpu(cpu_online_map)); 5184 any_online_cpu(cpu_online_map));
5185 kthread_stop(cpu_rq(cpu)->migration_thread); 5185 kthread_stop(cpu_rq(cpu)->migration_thread);
5186 cpu_rq(cpu)->migration_thread = NULL; 5186 cpu_rq(cpu)->migration_thread = NULL;
5187 break; 5187 break;
5188 5188
5189 case CPU_DEAD: 5189 case CPU_DEAD:
5190 migrate_live_tasks(cpu); 5190 migrate_live_tasks(cpu);
5191 rq = cpu_rq(cpu); 5191 rq = cpu_rq(cpu);
5192 kthread_stop(rq->migration_thread); 5192 kthread_stop(rq->migration_thread);
5193 rq->migration_thread = NULL; 5193 rq->migration_thread = NULL;
5194 /* Idle task back to normal (off runqueue, low prio) */ 5194 /* Idle task back to normal (off runqueue, low prio) */
5195 rq = task_rq_lock(rq->idle, &flags); 5195 rq = task_rq_lock(rq->idle, &flags);
5196 deactivate_task(rq->idle, rq); 5196 deactivate_task(rq->idle, rq);
5197 rq->idle->static_prio = MAX_PRIO; 5197 rq->idle->static_prio = MAX_PRIO;
5198 __setscheduler(rq->idle, SCHED_NORMAL, 0); 5198 __setscheduler(rq->idle, SCHED_NORMAL, 0);
5199 migrate_dead_tasks(cpu); 5199 migrate_dead_tasks(cpu);
5200 task_rq_unlock(rq, &flags); 5200 task_rq_unlock(rq, &flags);
5201 migrate_nr_uninterruptible(rq); 5201 migrate_nr_uninterruptible(rq);
5202 BUG_ON(rq->nr_running != 0); 5202 BUG_ON(rq->nr_running != 0);
5203 5203
5204 /* No need to migrate the tasks: it was best-effort if 5204 /* No need to migrate the tasks: it was best-effort if
5205 * they didn't do lock_cpu_hotplug(). Just wake up 5205 * they didn't do lock_cpu_hotplug(). Just wake up
5206 * the requestors. */ 5206 * the requestors. */
5207 spin_lock_irq(&rq->lock); 5207 spin_lock_irq(&rq->lock);
5208 while (!list_empty(&rq->migration_queue)) { 5208 while (!list_empty(&rq->migration_queue)) {
5209 struct migration_req *req; 5209 struct migration_req *req;
5210 5210
5211 req = list_entry(rq->migration_queue.next, 5211 req = list_entry(rq->migration_queue.next,
5212 struct migration_req, list); 5212 struct migration_req, list);
5213 list_del_init(&req->list); 5213 list_del_init(&req->list);
5214 complete(&req->done); 5214 complete(&req->done);
5215 } 5215 }
5216 spin_unlock_irq(&rq->lock); 5216 spin_unlock_irq(&rq->lock);
5217 break; 5217 break;
5218 #endif 5218 #endif
5219 } 5219 }
5220 return NOTIFY_OK; 5220 return NOTIFY_OK;
5221 } 5221 }
5222 5222
5223 /* Register at highest priority so that task migration (migrate_all_tasks) 5223 /* Register at highest priority so that task migration (migrate_all_tasks)
5224 * happens before everything else. 5224 * happens before everything else.
5225 */ 5225 */
5226 static struct notifier_block __cpuinitdata migration_notifier = { 5226 static struct notifier_block __cpuinitdata migration_notifier = {
5227 .notifier_call = migration_call, 5227 .notifier_call = migration_call,
5228 .priority = 10 5228 .priority = 10
5229 }; 5229 };
5230 5230
5231 int __init migration_init(void) 5231 int __init migration_init(void)
5232 { 5232 {
5233 void *cpu = (void *)(long)smp_processor_id(); 5233 void *cpu = (void *)(long)smp_processor_id();
5234 int err; 5234 int err;
5235 5235
5236 /* Start one for the boot CPU: */ 5236 /* Start one for the boot CPU: */
5237 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 5237 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5238 BUG_ON(err == NOTIFY_BAD); 5238 BUG_ON(err == NOTIFY_BAD);
5239 migration_call(&migration_notifier, CPU_ONLINE, cpu); 5239 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5240 register_cpu_notifier(&migration_notifier); 5240 register_cpu_notifier(&migration_notifier);
5241 5241
5242 return 0; 5242 return 0;
5243 } 5243 }
5244 #endif 5244 #endif
5245 5245
5246 #ifdef CONFIG_SMP 5246 #ifdef CONFIG_SMP
5247
5248 /* Number of possible processor ids */
5249 int nr_cpu_ids __read_mostly = NR_CPUS;
5250 EXPORT_SYMBOL(nr_cpu_ids);
5251
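
The hunk above is the scheduler side of this commit: nr_cpu_ids now lives in sched.c and starts out at NR_CPUS, so any accidental user that runs before sched_init() trims it can only over-allocate, never under-allocate. A minimal sketch of such an early-boot consumer follows; the function and array names are illustrative only and not part of this commit:

	#include <linux/init.h>
	#include <linux/bootmem.h>

	extern int nr_cpu_ids;	/* provided by the kernel, as added above */

	static void **early_slots;

	/* Hypothetical early-boot user: size a per-cpu slot array by id count. */
	static void __init early_slots_init(void)
	{
		/*
		 * Before sched_init() trims nr_cpu_ids down to the real
		 * highest possible CPU id + 1, it still reads NR_CPUS, so
		 * this allocation is at worst too large; with the old
		 * initial value of 0 the array would have been empty.
		 */
		early_slots = alloc_bootmem(nr_cpu_ids * sizeof(void *));
	}
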
5247 #undef SCHED_DOMAIN_DEBUG 5252 #undef SCHED_DOMAIN_DEBUG
5248 #ifdef SCHED_DOMAIN_DEBUG 5253 #ifdef SCHED_DOMAIN_DEBUG
5249 static void sched_domain_debug(struct sched_domain *sd, int cpu) 5254 static void sched_domain_debug(struct sched_domain *sd, int cpu)
5250 { 5255 {
5251 int level = 0; 5256 int level = 0;
5252 5257
5253 if (!sd) { 5258 if (!sd) {
5254 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 5259 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5255 return; 5260 return;
5256 } 5261 }
5257 5262
5258 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 5263 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5259 5264
5260 do { 5265 do {
5261 int i; 5266 int i;
5262 char str[NR_CPUS]; 5267 char str[NR_CPUS];
5263 struct sched_group *group = sd->groups; 5268 struct sched_group *group = sd->groups;
5264 cpumask_t groupmask; 5269 cpumask_t groupmask;
5265 5270
5266 cpumask_scnprintf(str, NR_CPUS, sd->span); 5271 cpumask_scnprintf(str, NR_CPUS, sd->span);
5267 cpus_clear(groupmask); 5272 cpus_clear(groupmask);
5268 5273
5269 printk(KERN_DEBUG); 5274 printk(KERN_DEBUG);
5270 for (i = 0; i < level + 1; i++) 5275 for (i = 0; i < level + 1; i++)
5271 printk(" "); 5276 printk(" ");
5272 printk("domain %d: ", level); 5277 printk("domain %d: ", level);
5273 5278
5274 if (!(sd->flags & SD_LOAD_BALANCE)) { 5279 if (!(sd->flags & SD_LOAD_BALANCE)) {
5275 printk("does not load-balance\n"); 5280 printk("does not load-balance\n");
5276 if (sd->parent) 5281 if (sd->parent)
5277 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 5282 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5278 " has parent"); 5283 " has parent");
5279 break; 5284 break;
5280 } 5285 }
5281 5286
5282 printk("span %s\n", str); 5287 printk("span %s\n", str);
5283 5288
5284 if (!cpu_isset(cpu, sd->span)) 5289 if (!cpu_isset(cpu, sd->span))
5285 printk(KERN_ERR "ERROR: domain->span does not contain " 5290 printk(KERN_ERR "ERROR: domain->span does not contain "
5286 "CPU%d\n", cpu); 5291 "CPU%d\n", cpu);
5287 if (!cpu_isset(cpu, group->cpumask)) 5292 if (!cpu_isset(cpu, group->cpumask))
5288 printk(KERN_ERR "ERROR: domain->groups does not contain" 5293 printk(KERN_ERR "ERROR: domain->groups does not contain"
5289 " CPU%d\n", cpu); 5294 " CPU%d\n", cpu);
5290 5295
5291 printk(KERN_DEBUG); 5296 printk(KERN_DEBUG);
5292 for (i = 0; i < level + 2; i++) 5297 for (i = 0; i < level + 2; i++)
5293 printk(" "); 5298 printk(" ");
5294 printk("groups:"); 5299 printk("groups:");
5295 do { 5300 do {
5296 if (!group) { 5301 if (!group) {
5297 printk("\n"); 5302 printk("\n");
5298 printk(KERN_ERR "ERROR: group is NULL\n"); 5303 printk(KERN_ERR "ERROR: group is NULL\n");
5299 break; 5304 break;
5300 } 5305 }
5301 5306
5302 if (!group->cpu_power) { 5307 if (!group->cpu_power) {
5303 printk("\n"); 5308 printk("\n");
5304 printk(KERN_ERR "ERROR: domain->cpu_power not " 5309 printk(KERN_ERR "ERROR: domain->cpu_power not "
5305 "set\n"); 5310 "set\n");
5306 } 5311 }
5307 5312
5308 if (!cpus_weight(group->cpumask)) { 5313 if (!cpus_weight(group->cpumask)) {
5309 printk("\n"); 5314 printk("\n");
5310 printk(KERN_ERR "ERROR: empty group\n"); 5315 printk(KERN_ERR "ERROR: empty group\n");
5311 } 5316 }
5312 5317
5313 if (cpus_intersects(groupmask, group->cpumask)) { 5318 if (cpus_intersects(groupmask, group->cpumask)) {
5314 printk("\n"); 5319 printk("\n");
5315 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5320 printk(KERN_ERR "ERROR: repeated CPUs\n");
5316 } 5321 }
5317 5322
5318 cpus_or(groupmask, groupmask, group->cpumask); 5323 cpus_or(groupmask, groupmask, group->cpumask);
5319 5324
5320 cpumask_scnprintf(str, NR_CPUS, group->cpumask); 5325 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5321 printk(" %s", str); 5326 printk(" %s", str);
5322 5327
5323 group = group->next; 5328 group = group->next;
5324 } while (group != sd->groups); 5329 } while (group != sd->groups);
5325 printk("\n"); 5330 printk("\n");
5326 5331
5327 if (!cpus_equal(sd->span, groupmask)) 5332 if (!cpus_equal(sd->span, groupmask))
5328 printk(KERN_ERR "ERROR: groups don't span " 5333 printk(KERN_ERR "ERROR: groups don't span "
5329 "domain->span\n"); 5334 "domain->span\n");
5330 5335
5331 level++; 5336 level++;
5332 sd = sd->parent; 5337 sd = sd->parent;
5333 if (!sd) 5338 if (!sd)
5334 continue; 5339 continue;
5335 5340
5336 if (!cpus_subset(groupmask, sd->span)) 5341 if (!cpus_subset(groupmask, sd->span))
5337 printk(KERN_ERR "ERROR: parent span is not a superset " 5342 printk(KERN_ERR "ERROR: parent span is not a superset "
5338 "of domain->span\n"); 5343 "of domain->span\n");
5339 5344
5340 } while (sd); 5345 } while (sd);
5341 } 5346 }
5342 #else 5347 #else
5343 # define sched_domain_debug(sd, cpu) do { } while (0) 5348 # define sched_domain_debug(sd, cpu) do { } while (0)
5344 #endif 5349 #endif
5345 5350
5346 static int sd_degenerate(struct sched_domain *sd) 5351 static int sd_degenerate(struct sched_domain *sd)
5347 { 5352 {
5348 if (cpus_weight(sd->span) == 1) 5353 if (cpus_weight(sd->span) == 1)
5349 return 1; 5354 return 1;
5350 5355
5351 /* Following flags need at least 2 groups */ 5356 /* Following flags need at least 2 groups */
5352 if (sd->flags & (SD_LOAD_BALANCE | 5357 if (sd->flags & (SD_LOAD_BALANCE |
5353 SD_BALANCE_NEWIDLE | 5358 SD_BALANCE_NEWIDLE |
5354 SD_BALANCE_FORK | 5359 SD_BALANCE_FORK |
5355 SD_BALANCE_EXEC | 5360 SD_BALANCE_EXEC |
5356 SD_SHARE_CPUPOWER | 5361 SD_SHARE_CPUPOWER |
5357 SD_SHARE_PKG_RESOURCES)) { 5362 SD_SHARE_PKG_RESOURCES)) {
5358 if (sd->groups != sd->groups->next) 5363 if (sd->groups != sd->groups->next)
5359 return 0; 5364 return 0;
5360 } 5365 }
5361 5366
5362 /* Following flags don't use groups */ 5367 /* Following flags don't use groups */
5363 if (sd->flags & (SD_WAKE_IDLE | 5368 if (sd->flags & (SD_WAKE_IDLE |
5364 SD_WAKE_AFFINE | 5369 SD_WAKE_AFFINE |
5365 SD_WAKE_BALANCE)) 5370 SD_WAKE_BALANCE))
5366 return 0; 5371 return 0;
5367 5372
5368 return 1; 5373 return 1;
5369 } 5374 }
5370 5375
5371 static int 5376 static int
5372 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 5377 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5373 { 5378 {
5374 unsigned long cflags = sd->flags, pflags = parent->flags; 5379 unsigned long cflags = sd->flags, pflags = parent->flags;
5375 5380
5376 if (sd_degenerate(parent)) 5381 if (sd_degenerate(parent))
5377 return 1; 5382 return 1;
5378 5383
5379 if (!cpus_equal(sd->span, parent->span)) 5384 if (!cpus_equal(sd->span, parent->span))
5380 return 0; 5385 return 0;
5381 5386
5382 /* Does parent contain flags not in child? */ 5387 /* Does parent contain flags not in child? */
5383 /* WAKE_BALANCE is a subset of WAKE_AFFINE */ 5388 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
5384 if (cflags & SD_WAKE_AFFINE) 5389 if (cflags & SD_WAKE_AFFINE)
5385 pflags &= ~SD_WAKE_BALANCE; 5390 pflags &= ~SD_WAKE_BALANCE;
5386 /* Flags needing groups don't count if only 1 group in parent */ 5391 /* Flags needing groups don't count if only 1 group in parent */
5387 if (parent->groups == parent->groups->next) { 5392 if (parent->groups == parent->groups->next) {
5388 pflags &= ~(SD_LOAD_BALANCE | 5393 pflags &= ~(SD_LOAD_BALANCE |
5389 SD_BALANCE_NEWIDLE | 5394 SD_BALANCE_NEWIDLE |
5390 SD_BALANCE_FORK | 5395 SD_BALANCE_FORK |
5391 SD_BALANCE_EXEC | 5396 SD_BALANCE_EXEC |
5392 SD_SHARE_CPUPOWER | 5397 SD_SHARE_CPUPOWER |
5393 SD_SHARE_PKG_RESOURCES); 5398 SD_SHARE_PKG_RESOURCES);
5394 } 5399 }
5395 if (~cflags & pflags) 5400 if (~cflags & pflags)
5396 return 0; 5401 return 0;
5397 5402
5398 return 1; 5403 return 1;
5399 } 5404 }
5400 5405
5401 /* 5406 /*
5402 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 5407 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5403 * hold the hotplug lock. 5408 * hold the hotplug lock.
5404 */ 5409 */
5405 static void cpu_attach_domain(struct sched_domain *sd, int cpu) 5410 static void cpu_attach_domain(struct sched_domain *sd, int cpu)
5406 { 5411 {
5407 struct rq *rq = cpu_rq(cpu); 5412 struct rq *rq = cpu_rq(cpu);
5408 struct sched_domain *tmp; 5413 struct sched_domain *tmp;
5409 5414
5410 /* Remove the sched domains which do not contribute to scheduling. */ 5415 /* Remove the sched domains which do not contribute to scheduling. */
5411 for (tmp = sd; tmp; tmp = tmp->parent) { 5416 for (tmp = sd; tmp; tmp = tmp->parent) {
5412 struct sched_domain *parent = tmp->parent; 5417 struct sched_domain *parent = tmp->parent;
5413 if (!parent) 5418 if (!parent)
5414 break; 5419 break;
5415 if (sd_parent_degenerate(tmp, parent)) { 5420 if (sd_parent_degenerate(tmp, parent)) {
5416 tmp->parent = parent->parent; 5421 tmp->parent = parent->parent;
5417 if (parent->parent) 5422 if (parent->parent)
5418 parent->parent->child = tmp; 5423 parent->parent->child = tmp;
5419 } 5424 }
5420 } 5425 }
5421 5426
5422 if (sd && sd_degenerate(sd)) { 5427 if (sd && sd_degenerate(sd)) {
5423 sd = sd->parent; 5428 sd = sd->parent;
5424 if (sd) 5429 if (sd)
5425 sd->child = NULL; 5430 sd->child = NULL;
5426 } 5431 }
5427 5432
5428 sched_domain_debug(sd, cpu); 5433 sched_domain_debug(sd, cpu);
5429 5434
5430 rcu_assign_pointer(rq->sd, sd); 5435 rcu_assign_pointer(rq->sd, sd);
5431 } 5436 }
5432 5437
5433 /* cpus with isolated domains */ 5438 /* cpus with isolated domains */
5434 static cpumask_t cpu_isolated_map = CPU_MASK_NONE; 5439 static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
5435 5440
5436 /* Setup the mask of cpus configured for isolated domains */ 5441 /* Setup the mask of cpus configured for isolated domains */
5437 static int __init isolated_cpu_setup(char *str) 5442 static int __init isolated_cpu_setup(char *str)
5438 { 5443 {
5439 int ints[NR_CPUS], i; 5444 int ints[NR_CPUS], i;
5440 5445
5441 str = get_options(str, ARRAY_SIZE(ints), ints); 5446 str = get_options(str, ARRAY_SIZE(ints), ints);
5442 cpus_clear(cpu_isolated_map); 5447 cpus_clear(cpu_isolated_map);
5443 for (i = 1; i <= ints[0]; i++) 5448 for (i = 1; i <= ints[0]; i++)
5444 if (ints[i] < NR_CPUS) 5449 if (ints[i] < NR_CPUS)
5445 cpu_set(ints[i], cpu_isolated_map); 5450 cpu_set(ints[i], cpu_isolated_map);
5446 return 1; 5451 return 1;
5447 } 5452 }
5448 5453
5449 __setup ("isolcpus=", isolated_cpu_setup); 5454 __setup ("isolcpus=", isolated_cpu_setup);
5450 5455
5451 /* 5456 /*
5452 * init_sched_build_groups takes the cpumask we wish to span, and a pointer 5457 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
5453 * to a function which identifies what group (along with sched group) a CPU 5458 * to a function which identifies what group (along with sched group) a CPU
5454 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS 5459 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
5455 * (due to the fact that we keep track of groups covered with a cpumask_t). 5460 * (due to the fact that we keep track of groups covered with a cpumask_t).
5456 * 5461 *
5457 * init_sched_build_groups will build a circular linked list of the groups 5462 * init_sched_build_groups will build a circular linked list of the groups
5458 * covered by the given span, and will set each group's ->cpumask correctly, 5463 * covered by the given span, and will set each group's ->cpumask correctly,
5459 * and ->cpu_power to 0. 5464 * and ->cpu_power to 0.
5460 */ 5465 */
5461 static void 5466 static void
5462 init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, 5467 init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5463 int (*group_fn)(int cpu, const cpumask_t *cpu_map, 5468 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5464 struct sched_group **sg)) 5469 struct sched_group **sg))
5465 { 5470 {
5466 struct sched_group *first = NULL, *last = NULL; 5471 struct sched_group *first = NULL, *last = NULL;
5467 cpumask_t covered = CPU_MASK_NONE; 5472 cpumask_t covered = CPU_MASK_NONE;
5468 int i; 5473 int i;
5469 5474
5470 for_each_cpu_mask(i, span) { 5475 for_each_cpu_mask(i, span) {
5471 struct sched_group *sg; 5476 struct sched_group *sg;
5472 int group = group_fn(i, cpu_map, &sg); 5477 int group = group_fn(i, cpu_map, &sg);
5473 int j; 5478 int j;
5474 5479
5475 if (cpu_isset(i, covered)) 5480 if (cpu_isset(i, covered))
5476 continue; 5481 continue;
5477 5482
5478 sg->cpumask = CPU_MASK_NONE; 5483 sg->cpumask = CPU_MASK_NONE;
5479 sg->cpu_power = 0; 5484 sg->cpu_power = 0;
5480 5485
5481 for_each_cpu_mask(j, span) { 5486 for_each_cpu_mask(j, span) {
5482 if (group_fn(j, cpu_map, NULL) != group) 5487 if (group_fn(j, cpu_map, NULL) != group)
5483 continue; 5488 continue;
5484 5489
5485 cpu_set(j, covered); 5490 cpu_set(j, covered);
5486 cpu_set(j, sg->cpumask); 5491 cpu_set(j, sg->cpumask);
5487 } 5492 }
5488 if (!first) 5493 if (!first)
5489 first = sg; 5494 first = sg;
5490 if (last) 5495 if (last)
5491 last->next = sg; 5496 last->next = sg;
5492 last = sg; 5497 last = sg;
5493 } 5498 }
5494 last->next = first; 5499 last->next = first;
5495 } 5500 }
5496 5501
5497 #define SD_NODES_PER_DOMAIN 16 5502 #define SD_NODES_PER_DOMAIN 16
5498 5503
5499 /* 5504 /*
5500 * Self-tuning task migration cost measurement between source and target CPUs. 5505 * Self-tuning task migration cost measurement between source and target CPUs.
5501 * 5506 *
5502 * This is done by measuring the cost of manipulating buffers of varying 5507 * This is done by measuring the cost of manipulating buffers of varying
5503 * sizes. For a given buffer-size here are the steps that are taken: 5508 * sizes. For a given buffer-size here are the steps that are taken:
5504 * 5509 *
5505 * 1) the source CPU reads+dirties a shared buffer 5510 * 1) the source CPU reads+dirties a shared buffer
5506 * 2) the target CPU reads+dirties the same shared buffer 5511 * 2) the target CPU reads+dirties the same shared buffer
5507 * 5512 *
5508 * We measure how long they take, in the following 4 scenarios: 5513 * We measure how long they take, in the following 4 scenarios:
5509 * 5514 *
5510 * - source: CPU1, target: CPU2 | cost1 5515 * - source: CPU1, target: CPU2 | cost1
5511 * - source: CPU2, target: CPU1 | cost2 5516 * - source: CPU2, target: CPU1 | cost2
5512 * - source: CPU1, target: CPU1 | cost3 5517 * - source: CPU1, target: CPU1 | cost3
5513 * - source: CPU2, target: CPU2 | cost4 5518 * - source: CPU2, target: CPU2 | cost4
5514 * 5519 *
5515 * We then calculate the cost1+cost2-cost3-cost4 difference - this is 5520 * We then calculate the cost1+cost2-cost3-cost4 difference - this is
5516 * the cost of migration. 5521 * the cost of migration.
5517 * 5522 *
5518 * We then start off from a small buffer-size and iterate up to larger 5523 * We then start off from a small buffer-size and iterate up to larger
5519 * buffer sizes, in 5% steps - measuring each buffer-size separately, and 5524 * buffer sizes, in 5% steps - measuring each buffer-size separately, and
5520 * doing a maximum search for the cost. (The maximum cost for a migration 5525 * doing a maximum search for the cost. (The maximum cost for a migration
5521 * normally occurs when the working set size is around the effective cache 5526 * normally occurs when the working set size is around the effective cache
5522 * size.) 5527 * size.)
5523 */ 5528 */
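
Reading the scenario table together with measure_cost() below: the same-CPU runs (cost3, cost4) approximate the raw cost of touching the buffer, so subtracting them from the cross-CPU runs (cost1, cost2) leaves the cache penalty a migration pays. A hedged restatement in code, with names of my own choosing:

	/*
	 * Illustrative only - not part of the diff. measure_cost() below folds
	 * cost1+cost2 into one accumulator and cost3+cost4 into another before
	 * subtracting, which is the same arithmetic.
	 */
	static unsigned long long migration_penalty(unsigned long long cost1,
						    unsigned long long cost2,
						    unsigned long long cost3,
						    unsigned long long cost4)
	{
		/* cross-CPU total minus same-CPU total */
		return (cost1 + cost2) - (cost3 + cost4);
	}
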
5524 #define SEARCH_SCOPE 2 5529 #define SEARCH_SCOPE 2
5525 #define MIN_CACHE_SIZE (64*1024U) 5530 #define MIN_CACHE_SIZE (64*1024U)
5526 #define DEFAULT_CACHE_SIZE (5*1024*1024U) 5531 #define DEFAULT_CACHE_SIZE (5*1024*1024U)
5527 #define ITERATIONS 1 5532 #define ITERATIONS 1
5528 #define SIZE_THRESH 130 5533 #define SIZE_THRESH 130
5529 #define COST_THRESH 130 5534 #define COST_THRESH 130
5530 5535
5531 /* 5536 /*
5532 * The migration cost is a function of 'domain distance'. Domain 5537 * The migration cost is a function of 'domain distance'. Domain
5533 * distance is the number of steps a CPU has to iterate down its 5538 * distance is the number of steps a CPU has to iterate down its
5534 * domain tree to share a domain with the other CPU. The farther 5539 * domain tree to share a domain with the other CPU. The farther
5535 * two CPUs are from each other, the larger the distance gets. 5540 * two CPUs are from each other, the larger the distance gets.
5536 * 5541 *
5537 * Note that we use the distance only to cache measurement results, 5542 * Note that we use the distance only to cache measurement results,
5538 * the distance value is not used numerically otherwise. When two 5543 * the distance value is not used numerically otherwise. When two
5539 * CPUs have the same distance it is assumed that the migration 5544 * CPUs have the same distance it is assumed that the migration
5540 * cost is the same. (this is a simplification but quite practical) 5545 * cost is the same. (this is a simplification but quite practical)
5541 */ 5546 */
5542 #define MAX_DOMAIN_DISTANCE 32 5547 #define MAX_DOMAIN_DISTANCE 32
5543 5548
5544 static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] = 5549 static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
5545 { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = 5550 { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] =
5546 /* 5551 /*
5547 * Architectures may override the migration cost and thus avoid 5552 * Architectures may override the migration cost and thus avoid
5548 * boot-time calibration. Unit is nanoseconds. Mostly useful for 5553 * boot-time calibration. Unit is nanoseconds. Mostly useful for
5549 * virtualized hardware: 5554 * virtualized hardware:
5550 */ 5555 */
5551 #ifdef CONFIG_DEFAULT_MIGRATION_COST 5556 #ifdef CONFIG_DEFAULT_MIGRATION_COST
5552 CONFIG_DEFAULT_MIGRATION_COST 5557 CONFIG_DEFAULT_MIGRATION_COST
5553 #else 5558 #else
5554 -1LL 5559 -1LL
5555 #endif 5560 #endif
5556 }; 5561 };
5557 5562
5558 /* 5563 /*
5559 * Allow override of migration cost - in units of microseconds. 5564 * Allow override of migration cost - in units of microseconds.
5560 * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost 5565 * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
5561 * of 1 msec, level-2 cost of 2 msecs and level-3 cost of 3 msecs: 5566 * of 1 msec, level-2 cost of 2 msecs and level-3 cost of 3 msecs:
5562 */ 5567 */
5563 static int __init migration_cost_setup(char *str) 5568 static int __init migration_cost_setup(char *str)
5564 { 5569 {
5565 int ints[MAX_DOMAIN_DISTANCE+1], i; 5570 int ints[MAX_DOMAIN_DISTANCE+1], i;
5566 5571
5567 str = get_options(str, ARRAY_SIZE(ints), ints); 5572 str = get_options(str, ARRAY_SIZE(ints), ints);
5568 5573
5569 printk("#ints: %d\n", ints[0]); 5574 printk("#ints: %d\n", ints[0]);
5570 for (i = 1; i <= ints[0]; i++) { 5575 for (i = 1; i <= ints[0]; i++) {
5571 migration_cost[i-1] = (unsigned long long)ints[i]*1000; 5576 migration_cost[i-1] = (unsigned long long)ints[i]*1000;
5572 printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]); 5577 printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
5573 } 5578 }
5574 return 1; 5579 return 1;
5575 } 5580 }
5576 5581
5577 __setup ("migration_cost=", migration_cost_setup); 5582 __setup ("migration_cost=", migration_cost_setup);
5578 5583
5579 /* 5584 /*
5580 * Global multiplier (divisor) for migration-cutoff values, 5585 * Global multiplier (divisor) for migration-cutoff values,
5581 * in percent. E.g. use a value of 150 to get 1.5 times 5586 * in percent. E.g. use a value of 150 to get 1.5 times
5582 * longer cache-hot cutoff times. 5587 * longer cache-hot cutoff times.
5583 * 5588 *
5584 * (We scale it from 100 to 128 to make long long handling easier.) 5589 * (We scale it from 100 to 128 to make long long handling easier.)
5585 */ 5590 */
5586 5591
5587 #define MIGRATION_FACTOR_SCALE 128 5592 #define MIGRATION_FACTOR_SCALE 128
5588 5593
5589 static unsigned int migration_factor = MIGRATION_FACTOR_SCALE; 5594 static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
5590 5595
5591 static int __init setup_migration_factor(char *str) 5596 static int __init setup_migration_factor(char *str)
5592 { 5597 {
5593 get_option(&str, &migration_factor); 5598 get_option(&str, &migration_factor);
5594 migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100; 5599 migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
5595 return 1; 5600 return 1;
5596 } 5601 }
5597 5602
5598 __setup("migration_factor=", setup_migration_factor); 5603 __setup("migration_factor=", setup_migration_factor);
5599 5604
5600 /* 5605 /*
5601 * Estimated distance of two CPUs, measured via the number of domains 5606 * Estimated distance of two CPUs, measured via the number of domains
5602 * we have to pass for the two CPUs to be in the same span: 5607 * we have to pass for the two CPUs to be in the same span:
5603 */ 5608 */
5604 static unsigned long domain_distance(int cpu1, int cpu2) 5609 static unsigned long domain_distance(int cpu1, int cpu2)
5605 { 5610 {
5606 unsigned long distance = 0; 5611 unsigned long distance = 0;
5607 struct sched_domain *sd; 5612 struct sched_domain *sd;
5608 5613
5609 for_each_domain(cpu1, sd) { 5614 for_each_domain(cpu1, sd) {
5610 WARN_ON(!cpu_isset(cpu1, sd->span)); 5615 WARN_ON(!cpu_isset(cpu1, sd->span));
5611 if (cpu_isset(cpu2, sd->span)) 5616 if (cpu_isset(cpu2, sd->span))
5612 return distance; 5617 return distance;
5613 distance++; 5618 distance++;
5614 } 5619 }
5615 if (distance >= MAX_DOMAIN_DISTANCE) { 5620 if (distance >= MAX_DOMAIN_DISTANCE) {
5616 WARN_ON(1); 5621 WARN_ON(1);
5617 distance = MAX_DOMAIN_DISTANCE-1; 5622 distance = MAX_DOMAIN_DISTANCE-1;
5618 } 5623 }
5619 5624
5620 return distance; 5625 return distance;
5621 } 5626 }
5622 5627
5623 static unsigned int migration_debug; 5628 static unsigned int migration_debug;
5624 5629
5625 static int __init setup_migration_debug(char *str) 5630 static int __init setup_migration_debug(char *str)
5626 { 5631 {
5627 get_option(&str, &migration_debug); 5632 get_option(&str, &migration_debug);
5628 return 1; 5633 return 1;
5629 } 5634 }
5630 5635
5631 __setup("migration_debug=", setup_migration_debug); 5636 __setup("migration_debug=", setup_migration_debug);
5632 5637
5633 /* 5638 /*
5634 * Maximum cache-size that the scheduler should try to measure. 5639 * Maximum cache-size that the scheduler should try to measure.
5635 * Architectures with larger caches should tune this up during 5640 * Architectures with larger caches should tune this up during
5636 * bootup. Gets used in the domain-setup code (i.e. during SMP 5641 * bootup. Gets used in the domain-setup code (i.e. during SMP
5637 * bootup). 5642 * bootup).
5638 */ 5643 */
5639 unsigned int max_cache_size; 5644 unsigned int max_cache_size;
5640 5645
5641 static int __init setup_max_cache_size(char *str) 5646 static int __init setup_max_cache_size(char *str)
5642 { 5647 {
5643 get_option(&str, &max_cache_size); 5648 get_option(&str, &max_cache_size);
5644 return 1; 5649 return 1;
5645 } 5650 }
5646 5651
5647 __setup("max_cache_size=", setup_max_cache_size); 5652 __setup("max_cache_size=", setup_max_cache_size);
5648 5653
5649 /* 5654 /*
5650 * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This 5655 * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
5651 * is the operation that is timed, so we try to generate unpredictable 5656 * is the operation that is timed, so we try to generate unpredictable
5652 * cachemisses that still end up filling the L2 cache: 5657 * cachemisses that still end up filling the L2 cache:
5653 */ 5658 */
5654 static void touch_cache(void *__cache, unsigned long __size) 5659 static void touch_cache(void *__cache, unsigned long __size)
5655 { 5660 {
5656 unsigned long size = __size / sizeof(long); 5661 unsigned long size = __size / sizeof(long);
5657 unsigned long chunk1 = size / 3; 5662 unsigned long chunk1 = size / 3;
5658 unsigned long chunk2 = 2 * size / 3; 5663 unsigned long chunk2 = 2 * size / 3;
5659 unsigned long *cache = __cache; 5664 unsigned long *cache = __cache;
5660 int i; 5665 int i;
5661 5666
5662 for (i = 0; i < size/6; i += 8) { 5667 for (i = 0; i < size/6; i += 8) {
5663 switch (i % 6) { 5668 switch (i % 6) {
5664 case 0: cache[i]++; 5669 case 0: cache[i]++;
5665 case 1: cache[size-1-i]++; 5670 case 1: cache[size-1-i]++;
5666 case 2: cache[chunk1-i]++; 5671 case 2: cache[chunk1-i]++;
5667 case 3: cache[chunk1+i]++; 5672 case 3: cache[chunk1+i]++;
5668 case 4: cache[chunk2-i]++; 5673 case 4: cache[chunk2-i]++;
5669 case 5: cache[chunk2+i]++; 5674 case 5: cache[chunk2+i]++;
5670 } 5675 }
5671 } 5676 }
5672 } 5677 }
5673 5678
5674 /* 5679 /*
5675 * Measure the cache-cost of one task migration. Returns in units of nsec. 5680 * Measure the cache-cost of one task migration. Returns in units of nsec.
5676 */ 5681 */
5677 static unsigned long long 5682 static unsigned long long
5678 measure_one(void *cache, unsigned long size, int source, int target) 5683 measure_one(void *cache, unsigned long size, int source, int target)
5679 { 5684 {
5680 cpumask_t mask, saved_mask; 5685 cpumask_t mask, saved_mask;
5681 unsigned long long t0, t1, t2, t3, cost; 5686 unsigned long long t0, t1, t2, t3, cost;
5682 5687
5683 saved_mask = current->cpus_allowed; 5688 saved_mask = current->cpus_allowed;
5684 5689
5685 /* 5690 /*
5686 * Flush source caches to RAM and invalidate them: 5691 * Flush source caches to RAM and invalidate them:
5687 */ 5692 */
5688 sched_cacheflush(); 5693 sched_cacheflush();
5689 5694
5690 /* 5695 /*
5691 * Migrate to the source CPU: 5696 * Migrate to the source CPU:
5692 */ 5697 */
5693 mask = cpumask_of_cpu(source); 5698 mask = cpumask_of_cpu(source);
5694 set_cpus_allowed(current, mask); 5699 set_cpus_allowed(current, mask);
5695 WARN_ON(smp_processor_id() != source); 5700 WARN_ON(smp_processor_id() != source);
5696 5701
5697 /* 5702 /*
5698 * Dirty the working set: 5703 * Dirty the working set:
5699 */ 5704 */
5700 t0 = sched_clock(); 5705 t0 = sched_clock();
5701 touch_cache(cache, size); 5706 touch_cache(cache, size);
5702 t1 = sched_clock(); 5707 t1 = sched_clock();
5703 5708
5704 /* 5709 /*
5705 * Migrate to the target CPU, dirty the L2 cache and access 5710 * Migrate to the target CPU, dirty the L2 cache and access
5706 * the shared buffer. (which represents the working set 5711 * the shared buffer. (which represents the working set
5707 * of a migrated task.) 5712 * of a migrated task.)
5708 */ 5713 */
5709 mask = cpumask_of_cpu(target); 5714 mask = cpumask_of_cpu(target);
5710 set_cpus_allowed(current, mask); 5715 set_cpus_allowed(current, mask);
5711 WARN_ON(smp_processor_id() != target); 5716 WARN_ON(smp_processor_id() != target);
5712 5717
5713 t2 = sched_clock(); 5718 t2 = sched_clock();
5714 touch_cache(cache, size); 5719 touch_cache(cache, size);
5715 t3 = sched_clock(); 5720 t3 = sched_clock();
5716 5721
5717 cost = t1-t0 + t3-t2; 5722 cost = t1-t0 + t3-t2;
5718 5723
5719 if (migration_debug >= 2) 5724 if (migration_debug >= 2)
5720 printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n", 5725 printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
5721 source, target, t1-t0, t1-t0, t3-t2, cost); 5726 source, target, t1-t0, t1-t0, t3-t2, cost);
5722 /* 5727 /*
5723 * Flush target caches to RAM and invalidate them: 5728 * Flush target caches to RAM and invalidate them:
5724 */ 5729 */
5725 sched_cacheflush(); 5730 sched_cacheflush();
5726 5731
5727 set_cpus_allowed(current, saved_mask); 5732 set_cpus_allowed(current, saved_mask);
5728 5733
5729 return cost; 5734 return cost;
5730 } 5735 }
5731 5736
5732 /* 5737 /*
5733 * Measure a series of task migrations and return the average 5738 * Measure a series of task migrations and return the average
5734 * result. Since this code runs early during bootup the system 5739 * result. Since this code runs early during bootup the system
5735 * is 'undisturbed' and the average latency makes sense. 5740 * is 'undisturbed' and the average latency makes sense.
5736 * 5741 *
5737 * The algorithm in essence auto-detects the relevant cache-size, 5742 * The algorithm in essence auto-detects the relevant cache-size,
5738 * so it will properly detect different cachesizes for different 5743 * so it will properly detect different cachesizes for different
5739 * cache-hierarchies, depending on how the CPUs are connected. 5744 * cache-hierarchies, depending on how the CPUs are connected.
5740 * 5745 *
5741 * Architectures can prime the upper limit of the search range via 5746 * Architectures can prime the upper limit of the search range via
5742 * max_cache_size, otherwise the search range defaults to 20MB...64K. 5747 * max_cache_size, otherwise the search range defaults to 20MB...64K.
5743 */ 5748 */
5744 static unsigned long long 5749 static unsigned long long
5745 measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) 5750 measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
5746 { 5751 {
5747 unsigned long long cost1, cost2; 5752 unsigned long long cost1, cost2;
5748 int i; 5753 int i;
5749 5754
5750 /* 5755 /*
5751 * Measure the migration cost of 'size' bytes, over an 5756 * Measure the migration cost of 'size' bytes, over an
5752 * average of 10 runs: 5757 * average of 10 runs:
5753 * 5758 *
5754 * (We perturb the cache size by a small (0..4k) 5759 * (We perturb the cache size by a small (0..4k)
5755 * value to compensate size/alignment related artifacts. 5760 * value to compensate size/alignment related artifacts.
5756 * We also subtract the cost of the operation done on 5761 * We also subtract the cost of the operation done on
5757 * the same CPU.) 5762 * the same CPU.)
5758 */ 5763 */
5759 cost1 = 0; 5764 cost1 = 0;
5760 5765
5761 /* 5766 /*
5762 * dry run, to make sure we start off cache-cold on cpu1, 5767 * dry run, to make sure we start off cache-cold on cpu1,
5763 * and to get any vmalloc pagefaults in advance: 5768 * and to get any vmalloc pagefaults in advance:
5764 */ 5769 */
5765 measure_one(cache, size, cpu1, cpu2); 5770 measure_one(cache, size, cpu1, cpu2);
5766 for (i = 0; i < ITERATIONS; i++) 5771 for (i = 0; i < ITERATIONS; i++)
5767 cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2); 5772 cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2);
5768 5773
5769 measure_one(cache, size, cpu2, cpu1); 5774 measure_one(cache, size, cpu2, cpu1);
5770 for (i = 0; i < ITERATIONS; i++) 5775 for (i = 0; i < ITERATIONS; i++)
5771 cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1); 5776 cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1);
5772 5777
5773 /* 5778 /*
5774 * (We measure the non-migrating [cached] cost on both 5779 * (We measure the non-migrating [cached] cost on both
5775 * cpu1 and cpu2, to handle CPUs with different speeds) 5780 * cpu1 and cpu2, to handle CPUs with different speeds)
5776 */ 5781 */
5777 cost2 = 0; 5782 cost2 = 0;
5778 5783
5779 measure_one(cache, size, cpu1, cpu1); 5784 measure_one(cache, size, cpu1, cpu1);
5780 for (i = 0; i < ITERATIONS; i++) 5785 for (i = 0; i < ITERATIONS; i++)
5781 cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1); 5786 cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1);
5782 5787
5783 measure_one(cache, size, cpu2, cpu2); 5788 measure_one(cache, size, cpu2, cpu2);
5784 for (i = 0; i < ITERATIONS; i++) 5789 for (i = 0; i < ITERATIONS; i++)
5785 cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2); 5790 cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2);
5786 5791
5787 /* 5792 /*
5788 * Get the per-iteration migration cost: 5793 * Get the per-iteration migration cost:
5789 */ 5794 */
5790 do_div(cost1, 2 * ITERATIONS); 5795 do_div(cost1, 2 * ITERATIONS);
5791 do_div(cost2, 2 * ITERATIONS); 5796 do_div(cost2, 2 * ITERATIONS);
5792 5797
5793 return cost1 - cost2; 5798 return cost1 - cost2;
5794 } 5799 }
5795 5800
5796 static unsigned long long measure_migration_cost(int cpu1, int cpu2) 5801 static unsigned long long measure_migration_cost(int cpu1, int cpu2)
5797 { 5802 {
5798 unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0; 5803 unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
5799 unsigned int max_size, size, size_found = 0; 5804 unsigned int max_size, size, size_found = 0;
5800 long long cost = 0, prev_cost; 5805 long long cost = 0, prev_cost;
5801 void *cache; 5806 void *cache;
5802 5807
5803 /* 5808 /*
5804 * Search from max_cache_size*5 down to 64K - the real relevant 5809 * Search from max_cache_size*5 down to 64K - the real relevant
5805 * cachesize has to lie somewhere in between. 5810 * cachesize has to lie somewhere in between.
5806 */ 5811 */
5807 if (max_cache_size) { 5812 if (max_cache_size) {
5808 max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE); 5813 max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
5809 size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE); 5814 size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
5810 } else { 5815 } else {
5811 /* 5816 /*
5812 * Since we have no estimation about the relevant 5817 * Since we have no estimation about the relevant
5813 * search range 5818 * search range
5814 */ 5819 */
5815 max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE; 5820 max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
5816 size = MIN_CACHE_SIZE; 5821 size = MIN_CACHE_SIZE;
5817 } 5822 }
5818 5823
5819 if (!cpu_online(cpu1) || !cpu_online(cpu2)) { 5824 if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
5820 printk("cpu %d and %d not both online!\n", cpu1, cpu2); 5825 printk("cpu %d and %d not both online!\n", cpu1, cpu2);
5821 return 0; 5826 return 0;
5822 } 5827 }
5823 5828
5824 /* 5829 /*
5825 * Allocate the working set: 5830 * Allocate the working set:
5826 */ 5831 */
5827 cache = vmalloc(max_size); 5832 cache = vmalloc(max_size);
5828 if (!cache) { 5833 if (!cache) {
5829 printk("could not vmalloc %d bytes for cache!\n", 2 * max_size); 5834 printk("could not vmalloc %d bytes for cache!\n", 2 * max_size);
5830 return 1000000; /* return 1 msec on very small boxen */ 5835 return 1000000; /* return 1 msec on very small boxen */
5831 } 5836 }
5832 5837
5833 while (size <= max_size) { 5838 while (size <= max_size) {
5834 prev_cost = cost; 5839 prev_cost = cost;
5835 cost = measure_cost(cpu1, cpu2, cache, size); 5840 cost = measure_cost(cpu1, cpu2, cache, size);
5836 5841
5837 /* 5842 /*
5838 * Update the max: 5843 * Update the max:
5839 */ 5844 */
5840 if (cost > 0) { 5845 if (cost > 0) {
5841 if (max_cost < cost) { 5846 if (max_cost < cost) {
5842 max_cost = cost; 5847 max_cost = cost;
5843 size_found = size; 5848 size_found = size;
5844 } 5849 }
5845 } 5850 }
5846 /* 5851 /*
5847 * Calculate average fluctuation, we use this to prevent 5852 * Calculate average fluctuation, we use this to prevent
5848 * noise from triggering an early break out of the loop: 5853 * noise from triggering an early break out of the loop:
5849 */ 5854 */
5850 fluct = abs(cost - prev_cost); 5855 fluct = abs(cost - prev_cost);
5851 avg_fluct = (avg_fluct + fluct)/2; 5856 avg_fluct = (avg_fluct + fluct)/2;
5852 5857
5853 if (migration_debug) 5858 if (migration_debug)
5854 printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): " 5859 printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): "
5855 "(%8Ld %8Ld)\n", 5860 "(%8Ld %8Ld)\n",
5856 cpu1, cpu2, size, 5861 cpu1, cpu2, size,
5857 (long)cost / 1000000, 5862 (long)cost / 1000000,
5858 ((long)cost / 100000) % 10, 5863 ((long)cost / 100000) % 10,
5859 (long)max_cost / 1000000, 5864 (long)max_cost / 1000000,
5860 ((long)max_cost / 100000) % 10, 5865 ((long)max_cost / 100000) % 10,
5861 domain_distance(cpu1, cpu2), 5866 domain_distance(cpu1, cpu2),
5862 cost, avg_fluct); 5867 cost, avg_fluct);
5863 5868
5864 /* 5869 /*
5865 * If we iterated at least 20% past the previous maximum, 5870 * If we iterated at least 20% past the previous maximum,
5866 * and the cost has dropped by more than 20% already, 5871 * and the cost has dropped by more than 20% already,
5867 * (taking fluctuations into account) then we assume to 5872 * (taking fluctuations into account) then we assume to
5868 * have found the maximum and break out of the loop early: 5873 * have found the maximum and break out of the loop early:
5869 */ 5874 */
5870 if (size_found && (size*100 > size_found*SIZE_THRESH)) 5875 if (size_found && (size*100 > size_found*SIZE_THRESH))
5871 if (cost+avg_fluct <= 0 || 5876 if (cost+avg_fluct <= 0 ||
5872 max_cost*100 > (cost+avg_fluct)*COST_THRESH) { 5877 max_cost*100 > (cost+avg_fluct)*COST_THRESH) {
5873 5878
5874 if (migration_debug) 5879 if (migration_debug)
5875 printk("-> found max.\n"); 5880 printk("-> found max.\n");
5876 break; 5881 break;
5877 } 5882 }
5878 /* 5883 /*
5879 * Increase the cachesize in 10% steps: 5884 * Increase the cachesize in 10% steps:
5880 */ 5885 */
5881 size = size * 10 / 9; 5886 size = size * 10 / 9;
5882 } 5887 }
5883 5888
5884 if (migration_debug) 5889 if (migration_debug)
5885 printk("[%d][%d] working set size found: %d, cost: %Ld\n", 5890 printk("[%d][%d] working set size found: %d, cost: %Ld\n",
5886 cpu1, cpu2, size_found, max_cost); 5891 cpu1, cpu2, size_found, max_cost);
5887 5892
5888 vfree(cache); 5893 vfree(cache);
5889 5894
5890 /* 5895 /*
5891 * A task is considered 'cache cold' if at least 2 times 5896 * A task is considered 'cache cold' if at least 2 times
5892 * the worst-case cost of migration has passed. 5897 * the worst-case cost of migration has passed.
5893 * 5898 *
5894 * (this limit is only listened to if the load-balancing 5899 * (this limit is only listened to if the load-balancing
5895 * situation is 'nice' - if there is a large imbalance we 5900 * situation is 'nice' - if there is a large imbalance we
5896 * ignore it for the sake of CPU utilization and 5901 * ignore it for the sake of CPU utilization and
5897 * processing fairness.) 5902 * processing fairness.)
5898 */ 5903 */
5899 return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE; 5904 return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
5900 } 5905 }
5901 5906
5902 static void calibrate_migration_costs(const cpumask_t *cpu_map) 5907 static void calibrate_migration_costs(const cpumask_t *cpu_map)
5903 { 5908 {
5904 int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id(); 5909 int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
5905 unsigned long j0, j1, distance, max_distance = 0; 5910 unsigned long j0, j1, distance, max_distance = 0;
5906 struct sched_domain *sd; 5911 struct sched_domain *sd;
5907 5912
5908 j0 = jiffies; 5913 j0 = jiffies;
5909 5914
5910 /* 5915 /*
5911 * First pass - calculate the cacheflush times: 5916 * First pass - calculate the cacheflush times:
5912 */ 5917 */
5913 for_each_cpu_mask(cpu1, *cpu_map) { 5918 for_each_cpu_mask(cpu1, *cpu_map) {
5914 for_each_cpu_mask(cpu2, *cpu_map) { 5919 for_each_cpu_mask(cpu2, *cpu_map) {
5915 if (cpu1 == cpu2) 5920 if (cpu1 == cpu2)
5916 continue; 5921 continue;
5917 distance = domain_distance(cpu1, cpu2); 5922 distance = domain_distance(cpu1, cpu2);
5918 max_distance = max(max_distance, distance); 5923 max_distance = max(max_distance, distance);
5919 /* 5924 /*
5920 * No result cached yet? 5925 * No result cached yet?
5921 */ 5926 */
5922 if (migration_cost[distance] == -1LL) 5927 if (migration_cost[distance] == -1LL)
5923 migration_cost[distance] = 5928 migration_cost[distance] =
5924 measure_migration_cost(cpu1, cpu2); 5929 measure_migration_cost(cpu1, cpu2);
5925 } 5930 }
5926 } 5931 }
5927 /* 5932 /*
5928 * Second pass - update the sched domain hierarchy with 5933 * Second pass - update the sched domain hierarchy with
5929 * the new cache-hot-time estimations: 5934 * the new cache-hot-time estimations:
5930 */ 5935 */
5931 for_each_cpu_mask(cpu, *cpu_map) { 5936 for_each_cpu_mask(cpu, *cpu_map) {
5932 distance = 0; 5937 distance = 0;
5933 for_each_domain(cpu, sd) { 5938 for_each_domain(cpu, sd) {
5934 sd->cache_hot_time = migration_cost[distance]; 5939 sd->cache_hot_time = migration_cost[distance];
5935 distance++; 5940 distance++;
5936 } 5941 }
5937 } 5942 }
5938 /* 5943 /*
5939 * Print the matrix: 5944 * Print the matrix:
5940 */ 5945 */
5941 if (migration_debug) 5946 if (migration_debug)
5942 printk("migration: max_cache_size: %d, cpu: %d MHz:\n", 5947 printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
5943 max_cache_size, 5948 max_cache_size,
5944 #ifdef CONFIG_X86 5949 #ifdef CONFIG_X86
5945 cpu_khz/1000 5950 cpu_khz/1000
5946 #else 5951 #else
5947 -1 5952 -1
5948 #endif 5953 #endif
5949 ); 5954 );
5950 if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) { 5955 if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) {
5951 printk("migration_cost="); 5956 printk("migration_cost=");
5952 for (distance = 0; distance <= max_distance; distance++) { 5957 for (distance = 0; distance <= max_distance; distance++) {
5953 if (distance) 5958 if (distance)
5954 printk(","); 5959 printk(",");
5955 printk("%ld", (long)migration_cost[distance] / 1000); 5960 printk("%ld", (long)migration_cost[distance] / 1000);
5956 } 5961 }
5957 printk("\n"); 5962 printk("\n");
5958 } 5963 }
5959 j1 = jiffies; 5964 j1 = jiffies;
5960 if (migration_debug) 5965 if (migration_debug)
5961 printk("migration: %ld seconds\n", (j1-j0) / HZ); 5966 printk("migration: %ld seconds\n", (j1-j0) / HZ);
5962 5967
5963 /* 5968 /*
5964 * Move back to the original CPU. NUMA-Q gets confused 5969 * Move back to the original CPU. NUMA-Q gets confused
5965 * if we migrate to another quad during bootup. 5970 * if we migrate to another quad during bootup.
5966 */ 5971 */
5967 if (raw_smp_processor_id() != orig_cpu) { 5972 if (raw_smp_processor_id() != orig_cpu) {
5968 cpumask_t mask = cpumask_of_cpu(orig_cpu), 5973 cpumask_t mask = cpumask_of_cpu(orig_cpu),
5969 saved_mask = current->cpus_allowed; 5974 saved_mask = current->cpus_allowed;
5970 5975
5971 set_cpus_allowed(current, mask); 5976 set_cpus_allowed(current, mask);
5972 set_cpus_allowed(current, saved_mask); 5977 set_cpus_allowed(current, saved_mask);
5973 } 5978 }
5974 } 5979 }
5975 5980
5976 #ifdef CONFIG_NUMA 5981 #ifdef CONFIG_NUMA
5977 5982
5978 /** 5983 /**
5979 * find_next_best_node - find the next node to include in a sched_domain 5984 * find_next_best_node - find the next node to include in a sched_domain
5980 * @node: node whose sched_domain we're building 5985 * @node: node whose sched_domain we're building
5981 * @used_nodes: nodes already in the sched_domain 5986 * @used_nodes: nodes already in the sched_domain
5982 * 5987 *
5983 * Find the next node to include in a given scheduling domain. Simply 5988 * Find the next node to include in a given scheduling domain. Simply
5984 * finds the closest node not already in the @used_nodes map. 5989 * finds the closest node not already in the @used_nodes map.
5985 * 5990 *
5986 * Should use nodemask_t. 5991 * Should use nodemask_t.
5987 */ 5992 */
5988 static int find_next_best_node(int node, unsigned long *used_nodes) 5993 static int find_next_best_node(int node, unsigned long *used_nodes)
5989 { 5994 {
5990 int i, n, val, min_val, best_node = 0; 5995 int i, n, val, min_val, best_node = 0;
5991 5996
5992 min_val = INT_MAX; 5997 min_val = INT_MAX;
5993 5998
5994 for (i = 0; i < MAX_NUMNODES; i++) { 5999 for (i = 0; i < MAX_NUMNODES; i++) {
5995 /* Start at @node */ 6000 /* Start at @node */
5996 n = (node + i) % MAX_NUMNODES; 6001 n = (node + i) % MAX_NUMNODES;
5997 6002
5998 if (!nr_cpus_node(n)) 6003 if (!nr_cpus_node(n))
5999 continue; 6004 continue;
6000 6005
6001 /* Skip already used nodes */ 6006 /* Skip already used nodes */
6002 if (test_bit(n, used_nodes)) 6007 if (test_bit(n, used_nodes))
6003 continue; 6008 continue;
6004 6009
6005 /* Simple min distance search */ 6010 /* Simple min distance search */
6006 val = node_distance(node, n); 6011 val = node_distance(node, n);
6007 6012
6008 if (val < min_val) { 6013 if (val < min_val) {
6009 min_val = val; 6014 min_val = val;
6010 best_node = n; 6015 best_node = n;
6011 } 6016 }
6012 } 6017 }
6013 6018
6014 set_bit(best_node, used_nodes); 6019 set_bit(best_node, used_nodes);
6015 return best_node; 6020 return best_node;
6016 } 6021 }
6017 6022
6018 /** 6023 /**
6019 * sched_domain_node_span - get a cpumask for a node's sched_domain 6024 * sched_domain_node_span - get a cpumask for a node's sched_domain
6020 * @node: node whose cpumask we're constructing 6025 * @node: node whose cpumask we're constructing
6021 * @size: number of nodes to include in this span 6026 * @size: number of nodes to include in this span
6022 * 6027 *
6023 * Given a node, construct a good cpumask for its sched_domain to span. It 6028 * Given a node, construct a good cpumask for its sched_domain to span. It
6024 * should be one that prevents unnecessary balancing, but also spreads tasks 6029 * should be one that prevents unnecessary balancing, but also spreads tasks
6025 * out optimally. 6030 * out optimally.
6026 */ 6031 */
6027 static cpumask_t sched_domain_node_span(int node) 6032 static cpumask_t sched_domain_node_span(int node)
6028 { 6033 {
6029 DECLARE_BITMAP(used_nodes, MAX_NUMNODES); 6034 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
6030 cpumask_t span, nodemask; 6035 cpumask_t span, nodemask;
6031 int i; 6036 int i;
6032 6037
6033 cpus_clear(span); 6038 cpus_clear(span);
6034 bitmap_zero(used_nodes, MAX_NUMNODES); 6039 bitmap_zero(used_nodes, MAX_NUMNODES);
6035 6040
6036 nodemask = node_to_cpumask(node); 6041 nodemask = node_to_cpumask(node);
6037 cpus_or(span, span, nodemask); 6042 cpus_or(span, span, nodemask);
6038 set_bit(node, used_nodes); 6043 set_bit(node, used_nodes);
6039 6044
6040 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 6045 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6041 int next_node = find_next_best_node(node, used_nodes); 6046 int next_node = find_next_best_node(node, used_nodes);
6042 6047
6043 nodemask = node_to_cpumask(next_node); 6048 nodemask = node_to_cpumask(next_node);
6044 cpus_or(span, span, nodemask); 6049 cpus_or(span, span, nodemask);
6045 } 6050 }
6046 6051
6047 return span; 6052 return span;
6048 } 6053 }
6049 #endif 6054 #endif
6050 6055
6051 int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6056 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6052 6057
6053 /* 6058 /*
6054 * SMT sched-domains: 6059 * SMT sched-domains:
6055 */ 6060 */
6056 #ifdef CONFIG_SCHED_SMT 6061 #ifdef CONFIG_SCHED_SMT
6057 static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 6062 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
6058 static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); 6063 static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
6059 6064
6060 static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, 6065 static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
6061 struct sched_group **sg) 6066 struct sched_group **sg)
6062 { 6067 {
6063 if (sg) 6068 if (sg)
6064 *sg = &per_cpu(sched_group_cpus, cpu); 6069 *sg = &per_cpu(sched_group_cpus, cpu);
6065 return cpu; 6070 return cpu;
6066 } 6071 }
6067 #endif 6072 #endif
6068 6073
6069 /* 6074 /*
6070 * multi-core sched-domains: 6075 * multi-core sched-domains:
6071 */ 6076 */
6072 #ifdef CONFIG_SCHED_MC 6077 #ifdef CONFIG_SCHED_MC
6073 static DEFINE_PER_CPU(struct sched_domain, core_domains); 6078 static DEFINE_PER_CPU(struct sched_domain, core_domains);
6074 static DEFINE_PER_CPU(struct sched_group, sched_group_core); 6079 static DEFINE_PER_CPU(struct sched_group, sched_group_core);
6075 #endif 6080 #endif
6076 6081
6077 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 6082 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6078 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, 6083 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
6079 struct sched_group **sg) 6084 struct sched_group **sg)
6080 { 6085 {
6081 int group; 6086 int group;
6082 cpumask_t mask = cpu_sibling_map[cpu]; 6087 cpumask_t mask = cpu_sibling_map[cpu];
6083 cpus_and(mask, mask, *cpu_map); 6088 cpus_and(mask, mask, *cpu_map);
6084 group = first_cpu(mask); 6089 group = first_cpu(mask);
6085 if (sg) 6090 if (sg)
6086 *sg = &per_cpu(sched_group_core, group); 6091 *sg = &per_cpu(sched_group_core, group);
6087 return group; 6092 return group;
6088 } 6093 }
6089 #elif defined(CONFIG_SCHED_MC) 6094 #elif defined(CONFIG_SCHED_MC)
6090 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, 6095 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
6091 struct sched_group **sg) 6096 struct sched_group **sg)
6092 { 6097 {
6093 if (sg) 6098 if (sg)
6094 *sg = &per_cpu(sched_group_core, cpu); 6099 *sg = &per_cpu(sched_group_core, cpu);
6095 return cpu; 6100 return cpu;
6096 } 6101 }
6097 #endif 6102 #endif
6098 6103
6099 static DEFINE_PER_CPU(struct sched_domain, phys_domains); 6104 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
6100 static DEFINE_PER_CPU(struct sched_group, sched_group_phys); 6105 static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
6101 6106
6102 static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, 6107 static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
6103 struct sched_group **sg) 6108 struct sched_group **sg)
6104 { 6109 {
6105 int group; 6110 int group;
6106 #ifdef CONFIG_SCHED_MC 6111 #ifdef CONFIG_SCHED_MC
6107 cpumask_t mask = cpu_coregroup_map(cpu); 6112 cpumask_t mask = cpu_coregroup_map(cpu);
6108 cpus_and(mask, mask, *cpu_map); 6113 cpus_and(mask, mask, *cpu_map);
6109 group = first_cpu(mask); 6114 group = first_cpu(mask);
6110 #elif defined(CONFIG_SCHED_SMT) 6115 #elif defined(CONFIG_SCHED_SMT)
6111 cpumask_t mask = cpu_sibling_map[cpu]; 6116 cpumask_t mask = cpu_sibling_map[cpu];
6112 cpus_and(mask, mask, *cpu_map); 6117 cpus_and(mask, mask, *cpu_map);
6113 group = first_cpu(mask); 6118 group = first_cpu(mask);
6114 #else 6119 #else
6115 group = cpu; 6120 group = cpu;
6116 #endif 6121 #endif
6117 if (sg) 6122 if (sg)
6118 *sg = &per_cpu(sched_group_phys, group); 6123 *sg = &per_cpu(sched_group_phys, group);
6119 return group; 6124 return group;
6120 } 6125 }
6121 6126
6122 #ifdef CONFIG_NUMA 6127 #ifdef CONFIG_NUMA
6123 /* 6128 /*
6124 * The init_sched_build_groups can't handle what we want to do with node 6129 * The init_sched_build_groups can't handle what we want to do with node
6125 * groups, so roll our own. Now each node has its own list of groups which 6130 * groups, so roll our own. Now each node has its own list of groups which
6126 * gets dynamically allocated. 6131 * gets dynamically allocated.
6127 */ 6132 */
6128 static DEFINE_PER_CPU(struct sched_domain, node_domains); 6133 static DEFINE_PER_CPU(struct sched_domain, node_domains);
6129 static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; 6134 static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
6130 6135
6131 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); 6136 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
6132 static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); 6137 static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
6133 6138
6134 static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, 6139 static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
6135 struct sched_group **sg) 6140 struct sched_group **sg)
6136 { 6141 {
6137 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu)); 6142 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
6138 int group; 6143 int group;
6139 6144
6140 cpus_and(nodemask, nodemask, *cpu_map); 6145 cpus_and(nodemask, nodemask, *cpu_map);
6141 group = first_cpu(nodemask); 6146 group = first_cpu(nodemask);
6142 6147
6143 if (sg) 6148 if (sg)
6144 *sg = &per_cpu(sched_group_allnodes, group); 6149 *sg = &per_cpu(sched_group_allnodes, group);
6145 return group; 6150 return group;
6146 } 6151 }
6147 6152
6148 static void init_numa_sched_groups_power(struct sched_group *group_head) 6153 static void init_numa_sched_groups_power(struct sched_group *group_head)
6149 { 6154 {
6150 struct sched_group *sg = group_head; 6155 struct sched_group *sg = group_head;
6151 int j; 6156 int j;
6152 6157
6153 if (!sg) 6158 if (!sg)
6154 return; 6159 return;
6155 next_sg: 6160 next_sg:
6156 for_each_cpu_mask(j, sg->cpumask) { 6161 for_each_cpu_mask(j, sg->cpumask) {
6157 struct sched_domain *sd; 6162 struct sched_domain *sd;
6158 6163
6159 sd = &per_cpu(phys_domains, j); 6164 sd = &per_cpu(phys_domains, j);
6160 if (j != first_cpu(sd->groups->cpumask)) { 6165 if (j != first_cpu(sd->groups->cpumask)) {
6161 /* 6166 /*
6162 * Only add "power" once for each 6167 * Only add "power" once for each
6163 * physical package. 6168 * physical package.
6164 */ 6169 */
6165 continue; 6170 continue;
6166 } 6171 }
6167 6172
6168 sg->cpu_power += sd->groups->cpu_power; 6173 sg->cpu_power += sd->groups->cpu_power;
6169 } 6174 }
6170 sg = sg->next; 6175 sg = sg->next;
6171 if (sg != group_head) 6176 if (sg != group_head)
6172 goto next_sg; 6177 goto next_sg;
6173 } 6178 }
6174 #endif 6179 #endif
6175 6180
6176 #ifdef CONFIG_NUMA 6181 #ifdef CONFIG_NUMA
6177 /* Free memory allocated for various sched_group structures */ 6182 /* Free memory allocated for various sched_group structures */
6178 static void free_sched_groups(const cpumask_t *cpu_map) 6183 static void free_sched_groups(const cpumask_t *cpu_map)
6179 { 6184 {
6180 int cpu, i; 6185 int cpu, i;
6181 6186
6182 for_each_cpu_mask(cpu, *cpu_map) { 6187 for_each_cpu_mask(cpu, *cpu_map) {
6183 struct sched_group **sched_group_nodes 6188 struct sched_group **sched_group_nodes
6184 = sched_group_nodes_bycpu[cpu]; 6189 = sched_group_nodes_bycpu[cpu];
6185 6190
6186 if (!sched_group_nodes) 6191 if (!sched_group_nodes)
6187 continue; 6192 continue;
6188 6193
6189 for (i = 0; i < MAX_NUMNODES; i++) { 6194 for (i = 0; i < MAX_NUMNODES; i++) {
6190 cpumask_t nodemask = node_to_cpumask(i); 6195 cpumask_t nodemask = node_to_cpumask(i);
6191 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 6196 struct sched_group *oldsg, *sg = sched_group_nodes[i];
6192 6197
6193 cpus_and(nodemask, nodemask, *cpu_map); 6198 cpus_and(nodemask, nodemask, *cpu_map);
6194 if (cpus_empty(nodemask)) 6199 if (cpus_empty(nodemask))
6195 continue; 6200 continue;
6196 6201
6197 if (sg == NULL) 6202 if (sg == NULL)
6198 continue; 6203 continue;
6199 sg = sg->next; 6204 sg = sg->next;
6200 next_sg: 6205 next_sg:
6201 oldsg = sg; 6206 oldsg = sg;
6202 sg = sg->next; 6207 sg = sg->next;
6203 kfree(oldsg); 6208 kfree(oldsg);
6204 if (oldsg != sched_group_nodes[i]) 6209 if (oldsg != sched_group_nodes[i])
6205 goto next_sg; 6210 goto next_sg;
6206 } 6211 }
6207 kfree(sched_group_nodes); 6212 kfree(sched_group_nodes);
6208 sched_group_nodes_bycpu[cpu] = NULL; 6213 sched_group_nodes_bycpu[cpu] = NULL;
6209 } 6214 }
6210 } 6215 }
6211 #else 6216 #else
6212 static void free_sched_groups(const cpumask_t *cpu_map) 6217 static void free_sched_groups(const cpumask_t *cpu_map)
6213 { 6218 {
6214 } 6219 }
6215 #endif 6220 #endif
6216 6221
6217 /* 6222 /*
6218 * Initialize sched groups cpu_power. 6223 * Initialize sched groups cpu_power.
6219 * 6224 *
6220 * cpu_power indicates the capacity of sched group, which is used while 6225 * cpu_power indicates the capacity of sched group, which is used while
6221 * distributing the load between different sched groups in a sched domain. 6226 * distributing the load between different sched groups in a sched domain.
6222 * Typically, cpu_power for all the groups in a sched domain will be the same 6227 * Typically, cpu_power for all the groups in a sched domain will be the same
6223 * unless there are asymmetries in the topology. If there are asymmetries, a 6228 * unless there are asymmetries in the topology. If there are asymmetries, a
6224 * group with more cpu_power will pick up more load than a group with 6229 * group with more cpu_power will pick up more load than a group with
6225 * less cpu_power. 6230 * less cpu_power.
6226 * 6231 *
6227 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents 6232 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
6228 * the maximum number of tasks a group can handle in the presence of other idle 6233 * the maximum number of tasks a group can handle in the presence of other idle
6229 * or lightly loaded groups in the same sched domain. 6234 * or lightly loaded groups in the same sched domain.
6230 */ 6235 */
6231 static void init_sched_groups_power(int cpu, struct sched_domain *sd) 6236 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6232 { 6237 {
6233 struct sched_domain *child; 6238 struct sched_domain *child;
6234 struct sched_group *group; 6239 struct sched_group *group;
6235 6240
6236 WARN_ON(!sd || !sd->groups); 6241 WARN_ON(!sd || !sd->groups);
6237 6242
6238 if (cpu != first_cpu(sd->groups->cpumask)) 6243 if (cpu != first_cpu(sd->groups->cpumask))
6239 return; 6244 return;
6240 6245
6241 child = sd->child; 6246 child = sd->child;
6242 6247
6243 /* 6248 /*
6244 * For perf policy, if the groups in child domain share resources 6249 * For perf policy, if the groups in child domain share resources
6245 * (for example cores sharing some portions of the cache hierarchy 6250 * (for example cores sharing some portions of the cache hierarchy
6246 * or SMT), then set this domain's groups' cpu_power such that each group 6251 * or SMT), then set this domain's groups' cpu_power such that each group
6247 * can handle only one task, when there are other idle groups in the 6252 * can handle only one task, when there are other idle groups in the
6248 * same sched domain. 6253 * same sched domain.
6249 */ 6254 */
6250 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && 6255 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
6251 (child->flags & 6256 (child->flags &
6252 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { 6257 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
6253 sd->groups->cpu_power = SCHED_LOAD_SCALE; 6258 sd->groups->cpu_power = SCHED_LOAD_SCALE;
6254 return; 6259 return;
6255 } 6260 }
6256 6261
6257 sd->groups->cpu_power = 0; 6262 sd->groups->cpu_power = 0;
6258 6263
6259 /* 6264 /*
6260 * add the cpu_power of each child group to this group's cpu_power 6265 * add the cpu_power of each child group to this group's cpu_power
6261 */ 6266 */
6262 group = child->groups; 6267 group = child->groups;
6263 do { 6268 do {
6264 sd->groups->cpu_power += group->cpu_power; 6269 sd->groups->cpu_power += group->cpu_power;
6265 group = group->next; 6270 group = group->next;
6266 } while (group != child->groups); 6271 } while (group != child->groups);
6267 } 6272 }
6268 6273
6269 /* 6274 /*
6270 * Build sched domains for a given set of cpus and attach the sched domains 6275 * Build sched domains for a given set of cpus and attach the sched domains
6271 * to the individual cpus 6276 * to the individual cpus
6272 */ 6277 */
6273 static int build_sched_domains(const cpumask_t *cpu_map) 6278 static int build_sched_domains(const cpumask_t *cpu_map)
6274 { 6279 {
6275 int i; 6280 int i;
6276 struct sched_domain *sd; 6281 struct sched_domain *sd;
6277 #ifdef CONFIG_NUMA 6282 #ifdef CONFIG_NUMA
6278 struct sched_group **sched_group_nodes = NULL; 6283 struct sched_group **sched_group_nodes = NULL;
6279 int sd_allnodes = 0; 6284 int sd_allnodes = 0;
6280 6285
6281 /* 6286 /*
6282 * Allocate the per-node list of sched groups 6287 * Allocate the per-node list of sched groups
6283 */ 6288 */
6284 sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES, 6289 sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
6285 GFP_KERNEL); 6290 GFP_KERNEL);
6286 if (!sched_group_nodes) { 6291 if (!sched_group_nodes) {
6287 printk(KERN_WARNING "Can not alloc sched group node list\n"); 6292 printk(KERN_WARNING "Can not alloc sched group node list\n");
6288 return -ENOMEM; 6293 return -ENOMEM;
6289 } 6294 }
6290 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; 6295 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
6291 #endif 6296 #endif
6292 6297
6293 /* 6298 /*
6294 * Set up domains for cpus specified by the cpu_map. 6299 * Set up domains for cpus specified by the cpu_map.
6295 */ 6300 */
6296 for_each_cpu_mask(i, *cpu_map) { 6301 for_each_cpu_mask(i, *cpu_map) {
6297 struct sched_domain *sd = NULL, *p; 6302 struct sched_domain *sd = NULL, *p;
6298 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); 6303 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
6299 6304
6300 cpus_and(nodemask, nodemask, *cpu_map); 6305 cpus_and(nodemask, nodemask, *cpu_map);
6301 6306
6302 #ifdef CONFIG_NUMA 6307 #ifdef CONFIG_NUMA
6303 if (cpus_weight(*cpu_map) 6308 if (cpus_weight(*cpu_map)
6304 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { 6309 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
6305 sd = &per_cpu(allnodes_domains, i); 6310 sd = &per_cpu(allnodes_domains, i);
6306 *sd = SD_ALLNODES_INIT; 6311 *sd = SD_ALLNODES_INIT;
6307 sd->span = *cpu_map; 6312 sd->span = *cpu_map;
6308 cpu_to_allnodes_group(i, cpu_map, &sd->groups); 6313 cpu_to_allnodes_group(i, cpu_map, &sd->groups);
6309 p = sd; 6314 p = sd;
6310 sd_allnodes = 1; 6315 sd_allnodes = 1;
6311 } else 6316 } else
6312 p = NULL; 6317 p = NULL;
6313 6318
6314 sd = &per_cpu(node_domains, i); 6319 sd = &per_cpu(node_domains, i);
6315 *sd = SD_NODE_INIT; 6320 *sd = SD_NODE_INIT;
6316 sd->span = sched_domain_node_span(cpu_to_node(i)); 6321 sd->span = sched_domain_node_span(cpu_to_node(i));
6317 sd->parent = p; 6322 sd->parent = p;
6318 if (p) 6323 if (p)
6319 p->child = sd; 6324 p->child = sd;
6320 cpus_and(sd->span, sd->span, *cpu_map); 6325 cpus_and(sd->span, sd->span, *cpu_map);
6321 #endif 6326 #endif
6322 6327
6323 p = sd; 6328 p = sd;
6324 sd = &per_cpu(phys_domains, i); 6329 sd = &per_cpu(phys_domains, i);
6325 *sd = SD_CPU_INIT; 6330 *sd = SD_CPU_INIT;
6326 sd->span = nodemask; 6331 sd->span = nodemask;
6327 sd->parent = p; 6332 sd->parent = p;
6328 if (p) 6333 if (p)
6329 p->child = sd; 6334 p->child = sd;
6330 cpu_to_phys_group(i, cpu_map, &sd->groups); 6335 cpu_to_phys_group(i, cpu_map, &sd->groups);
6331 6336
6332 #ifdef CONFIG_SCHED_MC 6337 #ifdef CONFIG_SCHED_MC
6333 p = sd; 6338 p = sd;
6334 sd = &per_cpu(core_domains, i); 6339 sd = &per_cpu(core_domains, i);
6335 *sd = SD_MC_INIT; 6340 *sd = SD_MC_INIT;
6336 sd->span = cpu_coregroup_map(i); 6341 sd->span = cpu_coregroup_map(i);
6337 cpus_and(sd->span, sd->span, *cpu_map); 6342 cpus_and(sd->span, sd->span, *cpu_map);
6338 sd->parent = p; 6343 sd->parent = p;
6339 p->child = sd; 6344 p->child = sd;
6340 cpu_to_core_group(i, cpu_map, &sd->groups); 6345 cpu_to_core_group(i, cpu_map, &sd->groups);
6341 #endif 6346 #endif
6342 6347
6343 #ifdef CONFIG_SCHED_SMT 6348 #ifdef CONFIG_SCHED_SMT
6344 p = sd; 6349 p = sd;
6345 sd = &per_cpu(cpu_domains, i); 6350 sd = &per_cpu(cpu_domains, i);
6346 *sd = SD_SIBLING_INIT; 6351 *sd = SD_SIBLING_INIT;
6347 sd->span = cpu_sibling_map[i]; 6352 sd->span = cpu_sibling_map[i];
6348 cpus_and(sd->span, sd->span, *cpu_map); 6353 cpus_and(sd->span, sd->span, *cpu_map);
6349 sd->parent = p; 6354 sd->parent = p;
6350 p->child = sd; 6355 p->child = sd;
6351 cpu_to_cpu_group(i, cpu_map, &sd->groups); 6356 cpu_to_cpu_group(i, cpu_map, &sd->groups);
6352 #endif 6357 #endif
6353 } 6358 }
6354 6359
6355 #ifdef CONFIG_SCHED_SMT 6360 #ifdef CONFIG_SCHED_SMT
6356 /* Set up CPU (sibling) groups */ 6361 /* Set up CPU (sibling) groups */
6357 for_each_cpu_mask(i, *cpu_map) { 6362 for_each_cpu_mask(i, *cpu_map) {
6358 cpumask_t this_sibling_map = cpu_sibling_map[i]; 6363 cpumask_t this_sibling_map = cpu_sibling_map[i];
6359 cpus_and(this_sibling_map, this_sibling_map, *cpu_map); 6364 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
6360 if (i != first_cpu(this_sibling_map)) 6365 if (i != first_cpu(this_sibling_map))
6361 continue; 6366 continue;
6362 6367
6363 init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group); 6368 init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group);
6364 } 6369 }
6365 #endif 6370 #endif
6366 6371
6367 #ifdef CONFIG_SCHED_MC 6372 #ifdef CONFIG_SCHED_MC
6368 /* Set up multi-core groups */ 6373 /* Set up multi-core groups */
6369 for_each_cpu_mask(i, *cpu_map) { 6374 for_each_cpu_mask(i, *cpu_map) {
6370 cpumask_t this_core_map = cpu_coregroup_map(i); 6375 cpumask_t this_core_map = cpu_coregroup_map(i);
6371 cpus_and(this_core_map, this_core_map, *cpu_map); 6376 cpus_and(this_core_map, this_core_map, *cpu_map);
6372 if (i != first_cpu(this_core_map)) 6377 if (i != first_cpu(this_core_map))
6373 continue; 6378 continue;
6374 init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group); 6379 init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group);
6375 } 6380 }
6376 #endif 6381 #endif
6377 6382
6378 6383
6379 /* Set up physical groups */ 6384 /* Set up physical groups */
6380 for (i = 0; i < MAX_NUMNODES; i++) { 6385 for (i = 0; i < MAX_NUMNODES; i++) {
6381 cpumask_t nodemask = node_to_cpumask(i); 6386 cpumask_t nodemask = node_to_cpumask(i);
6382 6387
6383 cpus_and(nodemask, nodemask, *cpu_map); 6388 cpus_and(nodemask, nodemask, *cpu_map);
6384 if (cpus_empty(nodemask)) 6389 if (cpus_empty(nodemask))
6385 continue; 6390 continue;
6386 6391
6387 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); 6392 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
6388 } 6393 }
6389 6394
6390 #ifdef CONFIG_NUMA 6395 #ifdef CONFIG_NUMA
6391 /* Set up node groups */ 6396 /* Set up node groups */
6392 if (sd_allnodes) 6397 if (sd_allnodes)
6393 init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group); 6398 init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group);
6394 6399
6395 for (i = 0; i < MAX_NUMNODES; i++) { 6400 for (i = 0; i < MAX_NUMNODES; i++) {
6396 /* Set up node groups */ 6401 /* Set up node groups */
6397 struct sched_group *sg, *prev; 6402 struct sched_group *sg, *prev;
6398 cpumask_t nodemask = node_to_cpumask(i); 6403 cpumask_t nodemask = node_to_cpumask(i);
6399 cpumask_t domainspan; 6404 cpumask_t domainspan;
6400 cpumask_t covered = CPU_MASK_NONE; 6405 cpumask_t covered = CPU_MASK_NONE;
6401 int j; 6406 int j;
6402 6407
6403 cpus_and(nodemask, nodemask, *cpu_map); 6408 cpus_and(nodemask, nodemask, *cpu_map);
6404 if (cpus_empty(nodemask)) { 6409 if (cpus_empty(nodemask)) {
6405 sched_group_nodes[i] = NULL; 6410 sched_group_nodes[i] = NULL;
6406 continue; 6411 continue;
6407 } 6412 }
6408 6413
6409 domainspan = sched_domain_node_span(i); 6414 domainspan = sched_domain_node_span(i);
6410 cpus_and(domainspan, domainspan, *cpu_map); 6415 cpus_and(domainspan, domainspan, *cpu_map);
6411 6416
6412 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); 6417 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
6413 if (!sg) { 6418 if (!sg) {
6414 printk(KERN_WARNING "Can not alloc domain group for " 6419 printk(KERN_WARNING "Can not alloc domain group for "
6415 "node %d\n", i); 6420 "node %d\n", i);
6416 goto error; 6421 goto error;
6417 } 6422 }
6418 sched_group_nodes[i] = sg; 6423 sched_group_nodes[i] = sg;
6419 for_each_cpu_mask(j, nodemask) { 6424 for_each_cpu_mask(j, nodemask) {
6420 struct sched_domain *sd; 6425 struct sched_domain *sd;
6421 sd = &per_cpu(node_domains, j); 6426 sd = &per_cpu(node_domains, j);
6422 sd->groups = sg; 6427 sd->groups = sg;
6423 } 6428 }
6424 sg->cpu_power = 0; 6429 sg->cpu_power = 0;
6425 sg->cpumask = nodemask; 6430 sg->cpumask = nodemask;
6426 sg->next = sg; 6431 sg->next = sg;
6427 cpus_or(covered, covered, nodemask); 6432 cpus_or(covered, covered, nodemask);
6428 prev = sg; 6433 prev = sg;
6429 6434
6430 for (j = 0; j < MAX_NUMNODES; j++) { 6435 for (j = 0; j < MAX_NUMNODES; j++) {
6431 cpumask_t tmp, notcovered; 6436 cpumask_t tmp, notcovered;
6432 int n = (i + j) % MAX_NUMNODES; 6437 int n = (i + j) % MAX_NUMNODES;
6433 6438
6434 cpus_complement(notcovered, covered); 6439 cpus_complement(notcovered, covered);
6435 cpus_and(tmp, notcovered, *cpu_map); 6440 cpus_and(tmp, notcovered, *cpu_map);
6436 cpus_and(tmp, tmp, domainspan); 6441 cpus_and(tmp, tmp, domainspan);
6437 if (cpus_empty(tmp)) 6442 if (cpus_empty(tmp))
6438 break; 6443 break;
6439 6444
6440 nodemask = node_to_cpumask(n); 6445 nodemask = node_to_cpumask(n);
6441 cpus_and(tmp, tmp, nodemask); 6446 cpus_and(tmp, tmp, nodemask);
6442 if (cpus_empty(tmp)) 6447 if (cpus_empty(tmp))
6443 continue; 6448 continue;
6444 6449
6445 sg = kmalloc_node(sizeof(struct sched_group), 6450 sg = kmalloc_node(sizeof(struct sched_group),
6446 GFP_KERNEL, i); 6451 GFP_KERNEL, i);
6447 if (!sg) { 6452 if (!sg) {
6448 printk(KERN_WARNING 6453 printk(KERN_WARNING
6449 "Can not alloc domain group for node %d\n", j); 6454 "Can not alloc domain group for node %d\n", j);
6450 goto error; 6455 goto error;
6451 } 6456 }
6452 sg->cpu_power = 0; 6457 sg->cpu_power = 0;
6453 sg->cpumask = tmp; 6458 sg->cpumask = tmp;
6454 sg->next = prev->next; 6459 sg->next = prev->next;
6455 cpus_or(covered, covered, tmp); 6460 cpus_or(covered, covered, tmp);
6456 prev->next = sg; 6461 prev->next = sg;
6457 prev = sg; 6462 prev = sg;
6458 } 6463 }
6459 } 6464 }
6460 #endif 6465 #endif
6461 6466
6462 /* Calculate CPU power for physical packages and nodes */ 6467 /* Calculate CPU power for physical packages and nodes */
6463 #ifdef CONFIG_SCHED_SMT 6468 #ifdef CONFIG_SCHED_SMT
6464 for_each_cpu_mask(i, *cpu_map) { 6469 for_each_cpu_mask(i, *cpu_map) {
6465 sd = &per_cpu(cpu_domains, i); 6470 sd = &per_cpu(cpu_domains, i);
6466 init_sched_groups_power(i, sd); 6471 init_sched_groups_power(i, sd);
6467 } 6472 }
6468 #endif 6473 #endif
6469 #ifdef CONFIG_SCHED_MC 6474 #ifdef CONFIG_SCHED_MC
6470 for_each_cpu_mask(i, *cpu_map) { 6475 for_each_cpu_mask(i, *cpu_map) {
6471 sd = &per_cpu(core_domains, i); 6476 sd = &per_cpu(core_domains, i);
6472 init_sched_groups_power(i, sd); 6477 init_sched_groups_power(i, sd);
6473 } 6478 }
6474 #endif 6479 #endif
6475 6480
6476 for_each_cpu_mask(i, *cpu_map) { 6481 for_each_cpu_mask(i, *cpu_map) {
6477 sd = &per_cpu(phys_domains, i); 6482 sd = &per_cpu(phys_domains, i);
6478 init_sched_groups_power(i, sd); 6483 init_sched_groups_power(i, sd);
6479 } 6484 }
6480 6485
6481 #ifdef CONFIG_NUMA 6486 #ifdef CONFIG_NUMA
6482 for (i = 0; i < MAX_NUMNODES; i++) 6487 for (i = 0; i < MAX_NUMNODES; i++)
6483 init_numa_sched_groups_power(sched_group_nodes[i]); 6488 init_numa_sched_groups_power(sched_group_nodes[i]);
6484 6489
6485 if (sd_allnodes) { 6490 if (sd_allnodes) {
6486 struct sched_group *sg; 6491 struct sched_group *sg;
6487 6492
6488 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); 6493 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
6489 init_numa_sched_groups_power(sg); 6494 init_numa_sched_groups_power(sg);
6490 } 6495 }
6491 #endif 6496 #endif
6492 6497
6493 /* Attach the domains */ 6498 /* Attach the domains */
6494 for_each_cpu_mask(i, *cpu_map) { 6499 for_each_cpu_mask(i, *cpu_map) {
6495 struct sched_domain *sd; 6500 struct sched_domain *sd;
6496 #ifdef CONFIG_SCHED_SMT 6501 #ifdef CONFIG_SCHED_SMT
6497 sd = &per_cpu(cpu_domains, i); 6502 sd = &per_cpu(cpu_domains, i);
6498 #elif defined(CONFIG_SCHED_MC) 6503 #elif defined(CONFIG_SCHED_MC)
6499 sd = &per_cpu(core_domains, i); 6504 sd = &per_cpu(core_domains, i);
6500 #else 6505 #else
6501 sd = &per_cpu(phys_domains, i); 6506 sd = &per_cpu(phys_domains, i);
6502 #endif 6507 #endif
6503 cpu_attach_domain(sd, i); 6508 cpu_attach_domain(sd, i);
6504 } 6509 }
6505 /* 6510 /*
6506 * Tune cache-hot values: 6511 * Tune cache-hot values:
6507 */ 6512 */
6508 calibrate_migration_costs(cpu_map); 6513 calibrate_migration_costs(cpu_map);
6509 6514
6510 return 0; 6515 return 0;
6511 6516
6512 #ifdef CONFIG_NUMA 6517 #ifdef CONFIG_NUMA
6513 error: 6518 error:
6514 free_sched_groups(cpu_map); 6519 free_sched_groups(cpu_map);
6515 return -ENOMEM; 6520 return -ENOMEM;
6516 #endif 6521 #endif
6517 } 6522 }
6518 /* 6523 /*
6519 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 6524 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6520 */ 6525 */
6521 static int arch_init_sched_domains(const cpumask_t *cpu_map) 6526 static int arch_init_sched_domains(const cpumask_t *cpu_map)
6522 { 6527 {
6523 cpumask_t cpu_default_map; 6528 cpumask_t cpu_default_map;
6524 int err; 6529 int err;
6525 6530
6526 /* 6531 /*
6527 * Setup mask for cpus without special case scheduling requirements. 6532 * Setup mask for cpus without special case scheduling requirements.
6528 * For now this just excludes isolated cpus, but could be used to 6533 * For now this just excludes isolated cpus, but could be used to
6529 * exclude other special cases in the future. 6534 * exclude other special cases in the future.
6530 */ 6535 */
6531 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); 6536 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
6532 6537
6533 err = build_sched_domains(&cpu_default_map); 6538 err = build_sched_domains(&cpu_default_map);
6534 6539
6535 return err; 6540 return err;
6536 } 6541 }
6537 6542
6538 static void arch_destroy_sched_domains(const cpumask_t *cpu_map) 6543 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
6539 { 6544 {
6540 free_sched_groups(cpu_map); 6545 free_sched_groups(cpu_map);
6541 } 6546 }
6542 6547
6543 /* 6548 /*
6544 * Detach sched domains from a group of cpus specified in cpu_map 6549 * Detach sched domains from a group of cpus specified in cpu_map
6545 * These cpus will now be attached to the NULL domain 6550 * These cpus will now be attached to the NULL domain
6546 */ 6551 */
6547 static void detach_destroy_domains(const cpumask_t *cpu_map) 6552 static void detach_destroy_domains(const cpumask_t *cpu_map)
6548 { 6553 {
6549 int i; 6554 int i;
6550 6555
6551 for_each_cpu_mask(i, *cpu_map) 6556 for_each_cpu_mask(i, *cpu_map)
6552 cpu_attach_domain(NULL, i); 6557 cpu_attach_domain(NULL, i);
6553 synchronize_sched(); 6558 synchronize_sched();
6554 arch_destroy_sched_domains(cpu_map); 6559 arch_destroy_sched_domains(cpu_map);
6555 } 6560 }
6556 6561
6557 /* 6562 /*
6558 * Partition sched domains as specified by the cpumasks below. 6563 * Partition sched domains as specified by the cpumasks below.
6559 * This attaches all cpus from the cpumasks to the NULL domain, 6564 * This attaches all cpus from the cpumasks to the NULL domain,
6560 * waits for an RCU quiescent period, recalculates sched 6565 * waits for an RCU quiescent period, recalculates sched
6561 * domain information and then attaches them back to the 6566 * domain information and then attaches them back to the
6562 * correct sched domains. 6567 * correct sched domains.
6563 * Call with the hotplug lock held. 6568 * Call with the hotplug lock held.
6564 */ 6569 */
6565 int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) 6570 int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6566 { 6571 {
6567 cpumask_t change_map; 6572 cpumask_t change_map;
6568 int err = 0; 6573 int err = 0;
6569 6574
6570 cpus_and(*partition1, *partition1, cpu_online_map); 6575 cpus_and(*partition1, *partition1, cpu_online_map);
6571 cpus_and(*partition2, *partition2, cpu_online_map); 6576 cpus_and(*partition2, *partition2, cpu_online_map);
6572 cpus_or(change_map, *partition1, *partition2); 6577 cpus_or(change_map, *partition1, *partition2);
6573 6578
6574 /* Detach sched domains from all of the affected cpus */ 6579 /* Detach sched domains from all of the affected cpus */
6575 detach_destroy_domains(&change_map); 6580 detach_destroy_domains(&change_map);
6576 if (!cpus_empty(*partition1)) 6581 if (!cpus_empty(*partition1))
6577 err = build_sched_domains(partition1); 6582 err = build_sched_domains(partition1);
6578 if (!err && !cpus_empty(*partition2)) 6583 if (!err && !cpus_empty(*partition2))
6579 err = build_sched_domains(partition2); 6584 err = build_sched_domains(partition2);
6580 6585
6581 return err; 6586 return err;
6582 } 6587 }
6583 6588
6584 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 6589 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6585 int arch_reinit_sched_domains(void) 6590 int arch_reinit_sched_domains(void)
6586 { 6591 {
6587 int err; 6592 int err;
6588 6593
6589 lock_cpu_hotplug(); 6594 lock_cpu_hotplug();
6590 detach_destroy_domains(&cpu_online_map); 6595 detach_destroy_domains(&cpu_online_map);
6591 err = arch_init_sched_domains(&cpu_online_map); 6596 err = arch_init_sched_domains(&cpu_online_map);
6592 unlock_cpu_hotplug(); 6597 unlock_cpu_hotplug();
6593 6598
6594 return err; 6599 return err;
6595 } 6600 }
6596 6601
6597 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) 6602 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6598 { 6603 {
6599 int ret; 6604 int ret;
6600 6605
6601 if (buf[0] != '0' && buf[0] != '1') 6606 if (buf[0] != '0' && buf[0] != '1')
6602 return -EINVAL; 6607 return -EINVAL;
6603 6608
6604 if (smt) 6609 if (smt)
6605 sched_smt_power_savings = (buf[0] == '1'); 6610 sched_smt_power_savings = (buf[0] == '1');
6606 else 6611 else
6607 sched_mc_power_savings = (buf[0] == '1'); 6612 sched_mc_power_savings = (buf[0] == '1');
6608 6613
6609 ret = arch_reinit_sched_domains(); 6614 ret = arch_reinit_sched_domains();
6610 6615
6611 return ret ? ret : count; 6616 return ret ? ret : count;
6612 } 6617 }
6613 6618
6614 int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) 6619 int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6615 { 6620 {
6616 int err = 0; 6621 int err = 0;
6617 6622
6618 #ifdef CONFIG_SCHED_SMT 6623 #ifdef CONFIG_SCHED_SMT
6619 if (smt_capable()) 6624 if (smt_capable())
6620 err = sysfs_create_file(&cls->kset.kobj, 6625 err = sysfs_create_file(&cls->kset.kobj,
6621 &attr_sched_smt_power_savings.attr); 6626 &attr_sched_smt_power_savings.attr);
6622 #endif 6627 #endif
6623 #ifdef CONFIG_SCHED_MC 6628 #ifdef CONFIG_SCHED_MC
6624 if (!err && mc_capable()) 6629 if (!err && mc_capable())
6625 err = sysfs_create_file(&cls->kset.kobj, 6630 err = sysfs_create_file(&cls->kset.kobj,
6626 &attr_sched_mc_power_savings.attr); 6631 &attr_sched_mc_power_savings.attr);
6627 #endif 6632 #endif
6628 return err; 6633 return err;
6629 } 6634 }
6630 #endif 6635 #endif
6631 6636
6632 #ifdef CONFIG_SCHED_MC 6637 #ifdef CONFIG_SCHED_MC
6633 static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) 6638 static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6634 { 6639 {
6635 return sprintf(page, "%u\n", sched_mc_power_savings); 6640 return sprintf(page, "%u\n", sched_mc_power_savings);
6636 } 6641 }
6637 static ssize_t sched_mc_power_savings_store(struct sys_device *dev, 6642 static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
6638 const char *buf, size_t count) 6643 const char *buf, size_t count)
6639 { 6644 {
6640 return sched_power_savings_store(buf, count, 0); 6645 return sched_power_savings_store(buf, count, 0);
6641 } 6646 }
6642 SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, 6647 SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6643 sched_mc_power_savings_store); 6648 sched_mc_power_savings_store);
6644 #endif 6649 #endif
6645 6650
6646 #ifdef CONFIG_SCHED_SMT 6651 #ifdef CONFIG_SCHED_SMT
6647 static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) 6652 static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6648 { 6653 {
6649 return sprintf(page, "%u\n", sched_smt_power_savings); 6654 return sprintf(page, "%u\n", sched_smt_power_savings);
6650 } 6655 }
6651 static ssize_t sched_smt_power_savings_store(struct sys_device *dev, 6656 static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6652 const char *buf, size_t count) 6657 const char *buf, size_t count)
6653 { 6658 {
6654 return sched_power_savings_store(buf, count, 1); 6659 return sched_power_savings_store(buf, count, 1);
6655 } 6660 }
6656 SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, 6661 SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6657 sched_smt_power_savings_store); 6662 sched_smt_power_savings_store);
6658 #endif 6663 #endif
6659 6664
6660 /* 6665 /*
6661 * Force a reinitialization of the sched domains hierarchy. The domains 6666 * Force a reinitialization of the sched domains hierarchy. The domains
6662 * and groups cannot be updated in place without racing with the balancing 6667 * and groups cannot be updated in place without racing with the balancing
6663 * code, so we temporarily attach all running cpus to the NULL domain 6668 * code, so we temporarily attach all running cpus to the NULL domain
6664 * which will prevent rebalancing while the sched domains are recalculated. 6669 * which will prevent rebalancing while the sched domains are recalculated.
6665 */ 6670 */
6666 static int update_sched_domains(struct notifier_block *nfb, 6671 static int update_sched_domains(struct notifier_block *nfb,
6667 unsigned long action, void *hcpu) 6672 unsigned long action, void *hcpu)
6668 { 6673 {
6669 switch (action) { 6674 switch (action) {
6670 case CPU_UP_PREPARE: 6675 case CPU_UP_PREPARE:
6671 case CPU_DOWN_PREPARE: 6676 case CPU_DOWN_PREPARE:
6672 detach_destroy_domains(&cpu_online_map); 6677 detach_destroy_domains(&cpu_online_map);
6673 return NOTIFY_OK; 6678 return NOTIFY_OK;
6674 6679
6675 case CPU_UP_CANCELED: 6680 case CPU_UP_CANCELED:
6676 case CPU_DOWN_FAILED: 6681 case CPU_DOWN_FAILED:
6677 case CPU_ONLINE: 6682 case CPU_ONLINE:
6678 case CPU_DEAD: 6683 case CPU_DEAD:
6679 /* 6684 /*
6680 * Fall through and re-initialise the domains. 6685 * Fall through and re-initialise the domains.
6681 */ 6686 */
6682 break; 6687 break;
6683 default: 6688 default:
6684 return NOTIFY_DONE; 6689 return NOTIFY_DONE;
6685 } 6690 }
6686 6691
6687 /* The hotplug lock is already held by cpu_up/cpu_down */ 6692 /* The hotplug lock is already held by cpu_up/cpu_down */
6688 arch_init_sched_domains(&cpu_online_map); 6693 arch_init_sched_domains(&cpu_online_map);
6689 6694
6690 return NOTIFY_OK; 6695 return NOTIFY_OK;
6691 } 6696 }
6692 6697
6693 void __init sched_init_smp(void) 6698 void __init sched_init_smp(void)
6694 { 6699 {
6695 cpumask_t non_isolated_cpus; 6700 cpumask_t non_isolated_cpus;
6696 6701
6697 lock_cpu_hotplug(); 6702 lock_cpu_hotplug();
6698 arch_init_sched_domains(&cpu_online_map); 6703 arch_init_sched_domains(&cpu_online_map);
6699 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 6704 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
6700 if (cpus_empty(non_isolated_cpus)) 6705 if (cpus_empty(non_isolated_cpus))
6701 cpu_set(smp_processor_id(), non_isolated_cpus); 6706 cpu_set(smp_processor_id(), non_isolated_cpus);
6702 unlock_cpu_hotplug(); 6707 unlock_cpu_hotplug();
6703 /* XXX: Theoretical race here - CPU may be hotplugged now */ 6708 /* XXX: Theoretical race here - CPU may be hotplugged now */
6704 hotcpu_notifier(update_sched_domains, 0); 6709 hotcpu_notifier(update_sched_domains, 0);
6705 6710
6706 /* Move init over to a non-isolated CPU */ 6711 /* Move init over to a non-isolated CPU */
6707 if (set_cpus_allowed(current, non_isolated_cpus) < 0) 6712 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6708 BUG(); 6713 BUG();
6709 } 6714 }
6710 #else 6715 #else
6711 void __init sched_init_smp(void) 6716 void __init sched_init_smp(void)
6712 { 6717 {
6713 } 6718 }
6714 #endif /* CONFIG_SMP */ 6719 #endif /* CONFIG_SMP */
6715 6720
6716 int in_sched_functions(unsigned long addr) 6721 int in_sched_functions(unsigned long addr)
6717 { 6722 {
6718 /* Linker adds these: start and end of __sched functions */ 6723 /* Linker adds these: start and end of __sched functions */
6719 extern char __sched_text_start[], __sched_text_end[]; 6724 extern char __sched_text_start[], __sched_text_end[];
6720 6725
6721 return in_lock_functions(addr) || 6726 return in_lock_functions(addr) ||
6722 (addr >= (unsigned long)__sched_text_start 6727 (addr >= (unsigned long)__sched_text_start
6723 && addr < (unsigned long)__sched_text_end); 6728 && addr < (unsigned long)__sched_text_end);
6724 } 6729 }
6725 6730
6726 void __init sched_init(void) 6731 void __init sched_init(void)
6727 { 6732 {
6728 int i, j, k; 6733 int i, j, k;
6734 int highest_cpu = 0;
6729 6735
6730 for_each_possible_cpu(i) { 6736 for_each_possible_cpu(i) {
6731 struct prio_array *array; 6737 struct prio_array *array;
6732 struct rq *rq; 6738 struct rq *rq;
6733 6739
6734 rq = cpu_rq(i); 6740 rq = cpu_rq(i);
6735 spin_lock_init(&rq->lock); 6741 spin_lock_init(&rq->lock);
6736 lockdep_set_class(&rq->lock, &rq->rq_lock_key); 6742 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
6737 rq->nr_running = 0; 6743 rq->nr_running = 0;
6738 rq->active = rq->arrays; 6744 rq->active = rq->arrays;
6739 rq->expired = rq->arrays + 1; 6745 rq->expired = rq->arrays + 1;
6740 rq->best_expired_prio = MAX_PRIO; 6746 rq->best_expired_prio = MAX_PRIO;
6741 6747
6742 #ifdef CONFIG_SMP 6748 #ifdef CONFIG_SMP
6743 rq->sd = NULL; 6749 rq->sd = NULL;
6744 for (j = 1; j < 3; j++) 6750 for (j = 1; j < 3; j++)
6745 rq->cpu_load[j] = 0; 6751 rq->cpu_load[j] = 0;
6746 rq->active_balance = 0; 6752 rq->active_balance = 0;
6747 rq->push_cpu = 0; 6753 rq->push_cpu = 0;
6748 rq->cpu = i; 6754 rq->cpu = i;
6749 rq->migration_thread = NULL; 6755 rq->migration_thread = NULL;
6750 INIT_LIST_HEAD(&rq->migration_queue); 6756 INIT_LIST_HEAD(&rq->migration_queue);
6751 #endif 6757 #endif
6752 atomic_set(&rq->nr_iowait, 0); 6758 atomic_set(&rq->nr_iowait, 0);
6753 6759
6754 for (j = 0; j < 2; j++) { 6760 for (j = 0; j < 2; j++) {
6755 array = rq->arrays + j; 6761 array = rq->arrays + j;
6756 for (k = 0; k < MAX_PRIO; k++) { 6762 for (k = 0; k < MAX_PRIO; k++) {
6757 INIT_LIST_HEAD(array->queue + k); 6763 INIT_LIST_HEAD(array->queue + k);
6758 __clear_bit(k, array->bitmap); 6764 __clear_bit(k, array->bitmap);
6759 } 6765 }
6760 // delimiter for bitsearch 6766 // delimiter for bitsearch
6761 __set_bit(MAX_PRIO, array->bitmap); 6767 __set_bit(MAX_PRIO, array->bitmap);
6762 } 6768 }
6769 highest_cpu = i;
6763 } 6770 }
6764 6771
6765 set_load_weight(&init_task); 6772 set_load_weight(&init_task);
6766 6773
6767 #ifdef CONFIG_SMP 6774 #ifdef CONFIG_SMP
6775 nr_cpu_ids = highest_cpu + 1;
6768 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); 6776 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6769 #endif 6777 #endif
6770 6778
6771 #ifdef CONFIG_RT_MUTEXES 6779 #ifdef CONFIG_RT_MUTEXES
6772 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); 6780 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6773 #endif 6781 #endif
6774 6782
6775 /* 6783 /*
6776 * The boot idle thread does lazy MMU switching as well: 6784 * The boot idle thread does lazy MMU switching as well:
6777 */ 6785 */
6778 atomic_inc(&init_mm.mm_count); 6786 atomic_inc(&init_mm.mm_count);
6779 enter_lazy_tlb(&init_mm, current); 6787 enter_lazy_tlb(&init_mm, current);
6780 6788
6781 /* 6789 /*
6782 * Make us the idle thread. Technically, schedule() should not be 6790 * Make us the idle thread. Technically, schedule() should not be
6783 * called from this thread, however somewhere below it might be, 6791 * called from this thread, however somewhere below it might be,
6784 * but because we are the idle thread, we just pick up running again 6792 * but because we are the idle thread, we just pick up running again
6785 * when this runqueue becomes "idle". 6793 * when this runqueue becomes "idle".
6786 */ 6794 */
6787 init_idle(current, smp_processor_id()); 6795 init_idle(current, smp_processor_id());
6788 } 6796 }
6789 6797
6790 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6798 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
6791 void __might_sleep(char *file, int line) 6799 void __might_sleep(char *file, int line)
6792 { 6800 {
6793 #ifdef in_atomic 6801 #ifdef in_atomic
6794 static unsigned long prev_jiffy; /* ratelimiting */ 6802 static unsigned long prev_jiffy; /* ratelimiting */
6795 6803
6796 if ((in_atomic() || irqs_disabled()) && 6804 if ((in_atomic() || irqs_disabled()) &&
6797 system_state == SYSTEM_RUNNING && !oops_in_progress) { 6805 system_state == SYSTEM_RUNNING && !oops_in_progress) {
6798 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 6806 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6799 return; 6807 return;
6800 prev_jiffy = jiffies; 6808 prev_jiffy = jiffies;
6801 printk(KERN_ERR "BUG: sleeping function called from invalid" 6809 printk(KERN_ERR "BUG: sleeping function called from invalid"
6802 " context at %s:%d\n", file, line); 6810 " context at %s:%d\n", file, line);
6803 printk("in_atomic():%d, irqs_disabled():%d\n", 6811 printk("in_atomic():%d, irqs_disabled():%d\n",
6804 in_atomic(), irqs_disabled()); 6812 in_atomic(), irqs_disabled());
6805 debug_show_held_locks(current); 6813 debug_show_held_locks(current);
6806 if (irqs_disabled()) 6814 if (irqs_disabled())
6807 print_irqtrace_events(current); 6815 print_irqtrace_events(current);
6808 dump_stack(); 6816 dump_stack();
6809 } 6817 }
6810 #endif 6818 #endif
6811 } 6819 }
6812 EXPORT_SYMBOL(__might_sleep); 6820 EXPORT_SYMBOL(__might_sleep);
6813 #endif 6821 #endif
6814 6822
6815 #ifdef CONFIG_MAGIC_SYSRQ 6823 #ifdef CONFIG_MAGIC_SYSRQ
6816 void normalize_rt_tasks(void) 6824 void normalize_rt_tasks(void)
6817 { 6825 {
6818 struct prio_array *array; 6826 struct prio_array *array;
6819 struct task_struct *p; 6827 struct task_struct *p;
6820 unsigned long flags; 6828 unsigned long flags;
6821 struct rq *rq; 6829 struct rq *rq;
6822 6830
6823 read_lock_irq(&tasklist_lock); 6831 read_lock_irq(&tasklist_lock);
6824 for_each_process(p) { 6832 for_each_process(p) {
6825 if (!rt_task(p)) 6833 if (!rt_task(p))
6826 continue; 6834 continue;
6827 6835
6828 spin_lock_irqsave(&p->pi_lock, flags); 6836 spin_lock_irqsave(&p->pi_lock, flags);
6829 rq = __task_rq_lock(p); 6837 rq = __task_rq_lock(p);
6830 6838
6831 array = p->array; 6839 array = p->array;
6832 if (array) 6840 if (array)
6833 deactivate_task(p, task_rq(p)); 6841 deactivate_task(p, task_rq(p));
6834 __setscheduler(p, SCHED_NORMAL, 0); 6842 __setscheduler(p, SCHED_NORMAL, 0);
6835 if (array) { 6843 if (array) {
6836 __activate_task(p, task_rq(p)); 6844 __activate_task(p, task_rq(p));
6837 resched_task(rq->curr); 6845 resched_task(rq->curr);
6838 } 6846 }
6839 6847
6840 __task_rq_unlock(rq); 6848 __task_rq_unlock(rq);
6841 spin_unlock_irqrestore(&p->pi_lock, flags); 6849 spin_unlock_irqrestore(&p->pi_lock, flags);
6842 } 6850 }
6843 read_unlock_irq(&tasklist_lock); 6851 read_unlock_irq(&tasklist_lock);
6844 } 6852 }
6845 6853
6846 #endif /* CONFIG_MAGIC_SYSRQ */ 6854 #endif /* CONFIG_MAGIC_SYSRQ */
6847 6855
6848 #ifdef CONFIG_IA64 6856 #ifdef CONFIG_IA64
6849 /* 6857 /*
6850 * These functions are only useful for the IA64 MCA handling. 6858 * These functions are only useful for the IA64 MCA handling.
6851 * 6859 *
6852 * They can only be called when the whole system has been 6860 * They can only be called when the whole system has been
6853 * stopped - every CPU needs to be quiescent, and no scheduling 6861 * stopped - every CPU needs to be quiescent, and no scheduling
6854 * activity can take place. Using them for anything else would 6862 * activity can take place. Using them for anything else would
6855 * be a serious bug, and as a result, they aren't even visible 6863 * be a serious bug, and as a result, they aren't even visible
6856 * under any other configuration. 6864 * under any other configuration.
6857 */ 6865 */
6858 6866
6859 /** 6867 /**
6860 * curr_task - return the current task for a given cpu. 6868 * curr_task - return the current task for a given cpu.
6861 * @cpu: the processor in question. 6869 * @cpu: the processor in question.
6862 * 6870 *
6863 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6871 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6864 */ 6872 */
6865 struct task_struct *curr_task(int cpu) 6873 struct task_struct *curr_task(int cpu)
6866 { 6874 {
6867 return cpu_curr(cpu); 6875 return cpu_curr(cpu);
6868 } 6876 }
6869 6877
6870 /** 6878 /**
6871 * set_curr_task - set the current task for a given cpu. 6879 * set_curr_task - set the current task for a given cpu.
6872 * @cpu: the processor in question. 6880 * @cpu: the processor in question.
6873 * @p: the task pointer to set. 6881 * @p: the task pointer to set.
6874 * 6882 *
6875 * Description: This function must only be used when non-maskable interrupts 6883 * Description: This function must only be used when non-maskable interrupts
6876 * are serviced on a separate stack. It allows the architecture to switch the 6884 * are serviced on a separate stack. It allows the architecture to switch the
6877 * notion of the current task on a cpu in a non-blocking manner. This function 6885 * notion of the current task on a cpu in a non-blocking manner. This function
6878 * must be called with all CPUs synchronized and interrupts disabled; the 6886 * must be called with all CPUs synchronized and interrupts disabled; the
6879 * caller must save the original value of the current task (see 6887 * caller must save the original value of the current task (see
6880 * curr_task() above) and restore that value before reenabling interrupts and 6888 * curr_task() above) and restore that value before reenabling interrupts and
6881 * re-starting the system. 6889 * re-starting the system.
6882 * 6890 *
6883 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6891 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6884 */ 6892 */
6885 void set_curr_task(int cpu, struct task_struct *p) 6893 void set_curr_task(int cpu, struct task_struct *p)
6886 { 6894 {
6887 cpu_curr(cpu) = p; 6895 cpu_curr(cpu) = p;
6888 } 6896 }
6889 6897
6890 #endif 6898 #endif
6891 6899
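The sched_init() hunk above records the highest CPU index seen while walking the possible-CPU map and, under CONFIG_SMP, sets nr_cpu_ids to highest_cpu + 1. The user-space sketch below illustrates why the value is derived from the highest index rather than from a count of possible CPUs; the array-based bitmap and the NR_CPUS value here are illustrative stand-ins, not the kernel's cpumask API.

#include <stdio.h>

#define NR_CPUS 8                       /* illustrative compile-time limit */

/* toy possible-CPU map: CPUs 0, 1 and 5 exist, so the numbering is sparse */
static const int cpu_possible[NR_CPUS] = { 1, 1, 0, 0, 0, 1, 0, 0 };

int main(void)
{
        int i, highest_cpu = 0, nr_cpu_ids;

        /* same shape as the sched_init() loop: remember the highest index */
        for (i = 0; i < NR_CPUS; i++)
                if (cpu_possible[i])
                        highest_cpu = i;

        nr_cpu_ids = highest_cpu + 1;

        /* prints 6, not 3: ids must cover 0..highest_cpu, holes included */
        printf("nr_cpu_ids = %d\n", nr_cpu_ids);
        return 0;
}

Because CPU numbering may be sparse, an array indexed by CPU id needs highest_cpu + 1 slots; the number of set bits alone could be too small.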
1 #include <linux/kernel.h> 1 #include <linux/kernel.h>
2 #include <linux/bitops.h> 2 #include <linux/bitops.h>
3 #include <linux/cpumask.h> 3 #include <linux/cpumask.h>
4 #include <linux/module.h> 4 #include <linux/module.h>
5 5
6 int __first_cpu(const cpumask_t *srcp) 6 int __first_cpu(const cpumask_t *srcp)
7 { 7 {
8 return min_t(int, NR_CPUS, find_first_bit(srcp->bits, NR_CPUS)); 8 return min_t(int, NR_CPUS, find_first_bit(srcp->bits, NR_CPUS));
9 } 9 }
10 EXPORT_SYMBOL(__first_cpu); 10 EXPORT_SYMBOL(__first_cpu);
11 11
12 int __next_cpu(int n, const cpumask_t *srcp) 12 int __next_cpu(int n, const cpumask_t *srcp)
13 { 13 {
14 return min_t(int, NR_CPUS, find_next_bit(srcp->bits, NR_CPUS, n+1)); 14 return min_t(int, NR_CPUS, find_next_bit(srcp->bits, NR_CPUS, n+1));
15 } 15 }
16 EXPORT_SYMBOL(__next_cpu); 16 EXPORT_SYMBOL(__next_cpu);
17 17
18 int nr_cpu_ids;
19 EXPORT_SYMBOL(nr_cpu_ids);
20
21 int __any_online_cpu(const cpumask_t *mask) 18 int __any_online_cpu(const cpumask_t *mask)
22 { 19 {
23 int cpu; 20 int cpu;
24 21
25 for_each_cpu_mask(cpu, *mask) { 22 for_each_cpu_mask(cpu, *mask) {
26 if (cpu_online(cpu)) 23 if (cpu_online(cpu))
27 break; 24 break;
28 } 25 }
29 return cpu; 26 return cpu;
30 } 27 }
31 EXPORT_SYMBOL(__any_online_cpu); 28 EXPORT_SYMBOL(__any_online_cpu);
32 29
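The hunk above drops the nr_cpu_ids definition and its EXPORT_SYMBOL from lib/cpumask.c. The replacement definition is not visible in this excerpt; presumably it is re-homed next to the scheduler code that now assigns it, along the lines of the sketch below (the placement and the __read_mostly annotation are assumptions, not shown in this diff).

/* plausible relocated definition (not part of the excerpt above);
 * initialized to the compile-time maximum and tightened to
 * highest_cpu + 1 by the sched_init() loop shown earlier.
 */
int nr_cpu_ids __read_mostly = NR_CPUS;
EXPORT_SYMBOL(nr_cpu_ids);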
1 /* 1 /*
2 * linux/mm/page_alloc.c 2 * linux/mm/page_alloc.c
3 * 3 *
4 * Manages the free list, the system allocates free pages here. 4 * Manages the free list, the system allocates free pages here.
5 * Note that kmalloc() lives in slab.c 5 * Note that kmalloc() lives in slab.c
6 * 6 *
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
8 * Swap reorganised 29.12.95, Stephen Tweedie 8 * Swap reorganised 29.12.95, Stephen Tweedie
9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */ 15 */
16 16
17 #include <linux/stddef.h> 17 #include <linux/stddef.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/interrupt.h> 20 #include <linux/interrupt.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/bootmem.h> 22 #include <linux/bootmem.h>
23 #include <linux/compiler.h> 23 #include <linux/compiler.h>
24 #include <linux/kernel.h> 24 #include <linux/kernel.h>
25 #include <linux/module.h> 25 #include <linux/module.h>
26 #include <linux/suspend.h> 26 #include <linux/suspend.h>
27 #include <linux/pagevec.h> 27 #include <linux/pagevec.h>
28 #include <linux/blkdev.h> 28 #include <linux/blkdev.h>
29 #include <linux/slab.h> 29 #include <linux/slab.h>
30 #include <linux/notifier.h> 30 #include <linux/notifier.h>
31 #include <linux/topology.h> 31 #include <linux/topology.h>
32 #include <linux/sysctl.h> 32 #include <linux/sysctl.h>
33 #include <linux/cpu.h> 33 #include <linux/cpu.h>
34 #include <linux/cpuset.h> 34 #include <linux/cpuset.h>
35 #include <linux/memory_hotplug.h> 35 #include <linux/memory_hotplug.h>
36 #include <linux/nodemask.h> 36 #include <linux/nodemask.h>
37 #include <linux/vmalloc.h> 37 #include <linux/vmalloc.h>
38 #include <linux/mempolicy.h> 38 #include <linux/mempolicy.h>
39 #include <linux/stop_machine.h> 39 #include <linux/stop_machine.h>
40 #include <linux/sort.h> 40 #include <linux/sort.h>
41 #include <linux/pfn.h> 41 #include <linux/pfn.h>
42 #include <linux/backing-dev.h> 42 #include <linux/backing-dev.h>
43 #include <linux/fault-inject.h> 43 #include <linux/fault-inject.h>
44 44
45 #include <asm/tlbflush.h> 45 #include <asm/tlbflush.h>
46 #include <asm/div64.h> 46 #include <asm/div64.h>
47 #include "internal.h" 47 #include "internal.h"
48 48
49 /* 49 /*
50 * MCD - HACK: Find somewhere to initialize this EARLY, or make this 50 * MCD - HACK: Find somewhere to initialize this EARLY, or make this
51 * initializer cleaner 51 * initializer cleaner
52 */ 52 */
53 nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; 53 nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
54 EXPORT_SYMBOL(node_online_map); 54 EXPORT_SYMBOL(node_online_map);
55 nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; 55 nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
56 EXPORT_SYMBOL(node_possible_map); 56 EXPORT_SYMBOL(node_possible_map);
57 unsigned long totalram_pages __read_mostly; 57 unsigned long totalram_pages __read_mostly;
58 unsigned long totalreserve_pages __read_mostly; 58 unsigned long totalreserve_pages __read_mostly;
59 long nr_swap_pages; 59 long nr_swap_pages;
60 int percpu_pagelist_fraction; 60 int percpu_pagelist_fraction;
61 61
62 static void __free_pages_ok(struct page *page, unsigned int order); 62 static void __free_pages_ok(struct page *page, unsigned int order);
63 63
64 /* 64 /*
65 * results with 256, 32 in the lowmem_reserve sysctl: 65 * results with 256, 32 in the lowmem_reserve sysctl:
66 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 66 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
67 * 1G machine -> (16M dma, 784M normal, 224M high) 67 * 1G machine -> (16M dma, 784M normal, 224M high)
68 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 68 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
69 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 69 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
70 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 70 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
71 * 71 *
72 * TBD: should special case ZONE_DMA32 machines here - in those we normally 72 * TBD: should special case ZONE_DMA32 machines here - in those we normally
73 * don't need any ZONE_NORMAL reservation 73 * don't need any ZONE_NORMAL reservation
74 */ 74 */
75 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 75 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
76 #ifdef CONFIG_ZONE_DMA 76 #ifdef CONFIG_ZONE_DMA
77 256, 77 256,
78 #endif 78 #endif
79 #ifdef CONFIG_ZONE_DMA32 79 #ifdef CONFIG_ZONE_DMA32
80 256, 80 256,
81 #endif 81 #endif
82 #ifdef CONFIG_HIGHMEM 82 #ifdef CONFIG_HIGHMEM
83 32 83 32
84 #endif 84 #endif
85 }; 85 };
86 86
87 EXPORT_SYMBOL(totalram_pages); 87 EXPORT_SYMBOL(totalram_pages);
88 88
89 static char * const zone_names[MAX_NR_ZONES] = { 89 static char * const zone_names[MAX_NR_ZONES] = {
90 #ifdef CONFIG_ZONE_DMA 90 #ifdef CONFIG_ZONE_DMA
91 "DMA", 91 "DMA",
92 #endif 92 #endif
93 #ifdef CONFIG_ZONE_DMA32 93 #ifdef CONFIG_ZONE_DMA32
94 "DMA32", 94 "DMA32",
95 #endif 95 #endif
96 "Normal", 96 "Normal",
97 #ifdef CONFIG_HIGHMEM 97 #ifdef CONFIG_HIGHMEM
98 "HighMem" 98 "HighMem"
99 #endif 99 #endif
100 }; 100 };
101 101
102 int min_free_kbytes = 1024; 102 int min_free_kbytes = 1024;
103 103
104 unsigned long __meminitdata nr_kernel_pages; 104 unsigned long __meminitdata nr_kernel_pages;
105 unsigned long __meminitdata nr_all_pages; 105 unsigned long __meminitdata nr_all_pages;
106 static unsigned long __initdata dma_reserve; 106 static unsigned long __initdata dma_reserve;
107 107
108 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 108 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
109 /* 109 /*
110 * MAX_ACTIVE_REGIONS determines the maximum number of distinct 110 * MAX_ACTIVE_REGIONS determines the maximum number of distinct
111 * ranges of memory (RAM) that may be registered with add_active_range(). 111 * ranges of memory (RAM) that may be registered with add_active_range().
112 * Ranges passed to add_active_range() will be merged if possible 112 * Ranges passed to add_active_range() will be merged if possible
113 * so the number of times add_active_range() can be called is 113 * so the number of times add_active_range() can be called is
114 * related to the number of nodes and the number of holes 114 * related to the number of nodes and the number of holes
115 */ 115 */
116 #ifdef CONFIG_MAX_ACTIVE_REGIONS 116 #ifdef CONFIG_MAX_ACTIVE_REGIONS
117 /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */ 117 /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
118 #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS 118 #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
119 #else 119 #else
120 #if MAX_NUMNODES >= 32 120 #if MAX_NUMNODES >= 32
121 /* If there can be many nodes, allow up to 50 holes per node */ 121 /* If there can be many nodes, allow up to 50 holes per node */
122 #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50) 122 #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
123 #else 123 #else
124 /* By default, allow up to 256 distinct regions */ 124 /* By default, allow up to 256 distinct regions */
125 #define MAX_ACTIVE_REGIONS 256 125 #define MAX_ACTIVE_REGIONS 256
126 #endif 126 #endif
127 #endif 127 #endif
128 128
129 struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS]; 129 struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS];
130 int __initdata nr_nodemap_entries; 130 int __initdata nr_nodemap_entries;
131 unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 131 unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
132 unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 132 unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
133 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE 133 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
134 unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES]; 134 unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES];
135 unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES]; 135 unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES];
136 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ 136 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
137 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 137 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
138 138
139 #ifdef CONFIG_DEBUG_VM 139 #ifdef CONFIG_DEBUG_VM
140 static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 140 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
141 { 141 {
142 int ret = 0; 142 int ret = 0;
143 unsigned seq; 143 unsigned seq;
144 unsigned long pfn = page_to_pfn(page); 144 unsigned long pfn = page_to_pfn(page);
145 145
146 do { 146 do {
147 seq = zone_span_seqbegin(zone); 147 seq = zone_span_seqbegin(zone);
148 if (pfn >= zone->zone_start_pfn + zone->spanned_pages) 148 if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
149 ret = 1; 149 ret = 1;
150 else if (pfn < zone->zone_start_pfn) 150 else if (pfn < zone->zone_start_pfn)
151 ret = 1; 151 ret = 1;
152 } while (zone_span_seqretry(zone, seq)); 152 } while (zone_span_seqretry(zone, seq));
153 153
154 return ret; 154 return ret;
155 } 155 }
156 156
157 static int page_is_consistent(struct zone *zone, struct page *page) 157 static int page_is_consistent(struct zone *zone, struct page *page)
158 { 158 {
159 #ifdef CONFIG_HOLES_IN_ZONE 159 #ifdef CONFIG_HOLES_IN_ZONE
160 if (!pfn_valid(page_to_pfn(page))) 160 if (!pfn_valid(page_to_pfn(page)))
161 return 0; 161 return 0;
162 #endif 162 #endif
163 if (zone != page_zone(page)) 163 if (zone != page_zone(page))
164 return 0; 164 return 0;
165 165
166 return 1; 166 return 1;
167 } 167 }
168 /* 168 /*
169 * Temporary debugging check for pages not lying within a given zone. 169 * Temporary debugging check for pages not lying within a given zone.
170 */ 170 */
171 static int bad_range(struct zone *zone, struct page *page) 171 static int bad_range(struct zone *zone, struct page *page)
172 { 172 {
173 if (page_outside_zone_boundaries(zone, page)) 173 if (page_outside_zone_boundaries(zone, page))
174 return 1; 174 return 1;
175 if (!page_is_consistent(zone, page)) 175 if (!page_is_consistent(zone, page))
176 return 1; 176 return 1;
177 177
178 return 0; 178 return 0;
179 } 179 }
180 #else 180 #else
181 static inline int bad_range(struct zone *zone, struct page *page) 181 static inline int bad_range(struct zone *zone, struct page *page)
182 { 182 {
183 return 0; 183 return 0;
184 } 184 }
185 #endif 185 #endif
186 186
187 static void bad_page(struct page *page) 187 static void bad_page(struct page *page)
188 { 188 {
189 printk(KERN_EMERG "Bad page state in process '%s'\n" 189 printk(KERN_EMERG "Bad page state in process '%s'\n"
190 KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n" 190 KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
191 KERN_EMERG "Trying to fix it up, but a reboot is needed\n" 191 KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
192 KERN_EMERG "Backtrace:\n", 192 KERN_EMERG "Backtrace:\n",
193 current->comm, page, (int)(2*sizeof(unsigned long)), 193 current->comm, page, (int)(2*sizeof(unsigned long)),
194 (unsigned long)page->flags, page->mapping, 194 (unsigned long)page->flags, page->mapping,
195 page_mapcount(page), page_count(page)); 195 page_mapcount(page), page_count(page));
196 dump_stack(); 196 dump_stack();
197 page->flags &= ~(1 << PG_lru | 197 page->flags &= ~(1 << PG_lru |
198 1 << PG_private | 198 1 << PG_private |
199 1 << PG_locked | 199 1 << PG_locked |
200 1 << PG_active | 200 1 << PG_active |
201 1 << PG_dirty | 201 1 << PG_dirty |
202 1 << PG_reclaim | 202 1 << PG_reclaim |
203 1 << PG_slab | 203 1 << PG_slab |
204 1 << PG_swapcache | 204 1 << PG_swapcache |
205 1 << PG_writeback | 205 1 << PG_writeback |
206 1 << PG_buddy ); 206 1 << PG_buddy );
207 set_page_count(page, 0); 207 set_page_count(page, 0);
208 reset_page_mapcount(page); 208 reset_page_mapcount(page);
209 page->mapping = NULL; 209 page->mapping = NULL;
210 add_taint(TAINT_BAD_PAGE); 210 add_taint(TAINT_BAD_PAGE);
211 } 211 }
212 212
213 /* 213 /*
214 * Higher-order pages are called "compound pages". They are structured thusly: 214 * Higher-order pages are called "compound pages". They are structured thusly:
215 * 215 *
216 * The first PAGE_SIZE page is called the "head page". 216 * The first PAGE_SIZE page is called the "head page".
217 * 217 *
218 * The remaining PAGE_SIZE pages are called "tail pages". 218 * The remaining PAGE_SIZE pages are called "tail pages".
219 * 219 *
220 * All pages have PG_compound set. All pages have their ->private pointing at 220 * All pages have PG_compound set. All pages have their ->private pointing at
221 * the head page (even the head page has this). 221 * the head page (even the head page has this).
222 * 222 *
223 * The first tail page's ->lru.next holds the address of the compound page's 223 * The first tail page's ->lru.next holds the address of the compound page's
224 * put_page() function. Its ->lru.prev holds the order of allocation. 224 * put_page() function. Its ->lru.prev holds the order of allocation.
225 * This usage means that zero-order pages may not be compound. 225 * This usage means that zero-order pages may not be compound.
226 */ 226 */
227 227
228 static void free_compound_page(struct page *page) 228 static void free_compound_page(struct page *page)
229 { 229 {
230 __free_pages_ok(page, (unsigned long)page[1].lru.prev); 230 __free_pages_ok(page, (unsigned long)page[1].lru.prev);
231 } 231 }
232 232
233 static void prep_compound_page(struct page *page, unsigned long order) 233 static void prep_compound_page(struct page *page, unsigned long order)
234 { 234 {
235 int i; 235 int i;
236 int nr_pages = 1 << order; 236 int nr_pages = 1 << order;
237 237
238 set_compound_page_dtor(page, free_compound_page); 238 set_compound_page_dtor(page, free_compound_page);
239 page[1].lru.prev = (void *)order; 239 page[1].lru.prev = (void *)order;
240 for (i = 0; i < nr_pages; i++) { 240 for (i = 0; i < nr_pages; i++) {
241 struct page *p = page + i; 241 struct page *p = page + i;
242 242
243 __SetPageCompound(p); 243 __SetPageCompound(p);
244 set_page_private(p, (unsigned long)page); 244 set_page_private(p, (unsigned long)page);
245 } 245 }
246 } 246 }
247 247
248 static void destroy_compound_page(struct page *page, unsigned long order) 248 static void destroy_compound_page(struct page *page, unsigned long order)
249 { 249 {
250 int i; 250 int i;
251 int nr_pages = 1 << order; 251 int nr_pages = 1 << order;
252 252
253 if (unlikely((unsigned long)page[1].lru.prev != order)) 253 if (unlikely((unsigned long)page[1].lru.prev != order))
254 bad_page(page); 254 bad_page(page);
255 255
256 for (i = 0; i < nr_pages; i++) { 256 for (i = 0; i < nr_pages; i++) {
257 struct page *p = page + i; 257 struct page *p = page + i;
258 258
259 if (unlikely(!PageCompound(p) | 259 if (unlikely(!PageCompound(p) |
260 (page_private(p) != (unsigned long)page))) 260 (page_private(p) != (unsigned long)page)))
261 bad_page(page); 261 bad_page(page);
262 __ClearPageCompound(p); 262 __ClearPageCompound(p);
263 } 263 }
264 } 264 }
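The bookkeeping described in the comment block above is easy to model outside the kernel: the allocation order lives in the first tail page's lru.prev, and every page's ->private points back at the head. A minimal userspace sketch of just that scheme (struct page_model and the function names are invented for the illustration and mirror only the fields this code touches):

#include <stdio.h>

/* Toy stand-in for struct page: only the fields the compound code uses. */
struct page_model {
	struct { void *next, *prev; } lru;
	unsigned long private;
};

static void prep_compound_model(struct page_model *page, unsigned long order)
{
	unsigned long i, nr = 1UL << order;

	page[1].lru.prev = (void *)order;		/* order in first tail page */
	for (i = 0; i < nr; i++)
		page[i].private = (unsigned long)page;	/* every page points at the head */
}

int main(void)
{
	struct page_model pages[4] = { { { NULL, NULL }, 0 } };

	prep_compound_model(pages, 2);		/* order-2 compound: 4 pages */
	printf("order read back at free time: %lu\n",
	       (unsigned long)pages[1].lru.prev);
	printf("tail 3 points at head: %s\n",
	       pages[3].private == (unsigned long)pages ? "yes" : "no");
	return 0;
}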
265 265
266 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 266 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
267 { 267 {
268 int i; 268 int i;
269 269
270 VM_BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); 270 VM_BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
271 /* 271 /*
272 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO 272 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
273 * and __GFP_HIGHMEM from hard or soft interrupt context. 273 * and __GFP_HIGHMEM from hard or soft interrupt context.
274 */ 274 */
275 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); 275 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
276 for (i = 0; i < (1 << order); i++) 276 for (i = 0; i < (1 << order); i++)
277 clear_highpage(page + i); 277 clear_highpage(page + i);
278 } 278 }
279 279
280 /* 280 /*
281 * Functions for dealing with a page's order in the buddy system. 281 * Functions for dealing with a page's order in the buddy system.
282 * zone->lock is already acquired when we use these. 282 * zone->lock is already acquired when we use these.
283 * So, we don't need atomic page->flags operations here. 283 * So, we don't need atomic page->flags operations here.
284 */ 284 */
285 static inline unsigned long page_order(struct page *page) 285 static inline unsigned long page_order(struct page *page)
286 { 286 {
287 return page_private(page); 287 return page_private(page);
288 } 288 }
289 289
290 static inline void set_page_order(struct page *page, int order) 290 static inline void set_page_order(struct page *page, int order)
291 { 291 {
292 set_page_private(page, order); 292 set_page_private(page, order);
293 __SetPageBuddy(page); 293 __SetPageBuddy(page);
294 } 294 }
295 295
296 static inline void rmv_page_order(struct page *page) 296 static inline void rmv_page_order(struct page *page)
297 { 297 {
298 __ClearPageBuddy(page); 298 __ClearPageBuddy(page);
299 set_page_private(page, 0); 299 set_page_private(page, 0);
300 } 300 }
301 301
302 /* 302 /*
303 * Locate the struct page for both the matching buddy in our 303 * Locate the struct page for both the matching buddy in our
304 * pair (buddy1) and the combined order-(O+1) page they form (page). 304 * pair (buddy1) and the combined order-(O+1) page they form (page).
305 * 305 *
306 * 1) Any buddy B1 will have an order O twin B2 which satisfies 306 * 1) Any buddy B1 will have an order O twin B2 which satisfies
307 * the following equation: 307 * the following equation:
308 * B2 = B1 ^ (1 << O) 308 * B2 = B1 ^ (1 << O)
309 * For example, if the starting buddy (buddy2) is #8 its order 309 * For example, if the starting buddy (buddy2) is #8 its order
310 * 1 buddy is #10: 310 * 1 buddy is #10:
311 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 311 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
312 * 312 *
313 * 2) Any buddy B will have an order O+1 parent P which 313 * 2) Any buddy B will have an order O+1 parent P which
314 * satisfies the following equation: 314 * satisfies the following equation:
315 * P = B & ~(1 << O) 315 * P = B & ~(1 << O)
316 * 316 *
317 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 317 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
318 */ 318 */
319 static inline struct page * 319 static inline struct page *
320 __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) 320 __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
321 { 321 {
322 unsigned long buddy_idx = page_idx ^ (1 << order); 322 unsigned long buddy_idx = page_idx ^ (1 << order);
323 323
324 return page + (buddy_idx - page_idx); 324 return page + (buddy_idx - page_idx);
325 } 325 }
326 326
327 static inline unsigned long 327 static inline unsigned long
328 __find_combined_index(unsigned long page_idx, unsigned int order) 328 __find_combined_index(unsigned long page_idx, unsigned int order)
329 { 329 {
330 return (page_idx & ~(1 << order)); 330 return (page_idx & ~(1 << order));
331 } 331 }
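The XOR/AND identities in the comment above are worth sanity-checking once with concrete numbers. A tiny standalone sketch, plain C with nothing kernel-specific, that walks the comment's example index 8 up through a few orders:

#include <stdio.h>

int main(void)
{
	unsigned long idx = 8;	/* the example index from the comment */
	unsigned int order;

	for (order = 0; order < 3; order++) {
		unsigned long buddy = idx ^ (1UL << order);	/* B2 = B1 ^ (1 << O) */
		unsigned long parent = idx & ~(1UL << order);	/* P  = B  & ~(1 << O) */

		printf("order %u: buddy of %lu is %lu, combined index %lu\n",
		       order, idx, buddy, parent);
		idx = parent;	/* pretend the merge succeeded and go up a level */
	}
	return 0;
}

At order 1 this prints buddy 10 for index 8, matching the worked example in the comment.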
332 332
333 /* 333 /*
334 * This function checks whether a page is free && is the buddy 334 * This function checks whether a page is free && is the buddy
335 * we can coalesce a page and its buddy if 335 * we can coalesce a page and its buddy if
336 * (a) the buddy is not in a hole && 336 * (a) the buddy is not in a hole &&
337 * (b) the buddy is in the buddy system && 337 * (b) the buddy is in the buddy system &&
338 * (c) a page and its buddy have the same order && 338 * (c) a page and its buddy have the same order &&
339 * (d) a page and its buddy are in the same zone. 339 * (d) a page and its buddy are in the same zone.
340 * 340 *
341 * For recording whether a page is in the buddy system, we use PG_buddy. 341 * For recording whether a page is in the buddy system, we use PG_buddy.
342 * Setting, clearing, and testing PG_buddy is serialized by zone->lock. 342 * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
343 * 343 *
344 * For recording page's order, we use page_private(page). 344 * For recording page's order, we use page_private(page).
345 */ 345 */
346 static inline int page_is_buddy(struct page *page, struct page *buddy, 346 static inline int page_is_buddy(struct page *page, struct page *buddy,
347 int order) 347 int order)
348 { 348 {
349 #ifdef CONFIG_HOLES_IN_ZONE 349 #ifdef CONFIG_HOLES_IN_ZONE
350 if (!pfn_valid(page_to_pfn(buddy))) 350 if (!pfn_valid(page_to_pfn(buddy)))
351 return 0; 351 return 0;
352 #endif 352 #endif
353 353
354 if (page_zone_id(page) != page_zone_id(buddy)) 354 if (page_zone_id(page) != page_zone_id(buddy))
355 return 0; 355 return 0;
356 356
357 if (PageBuddy(buddy) && page_order(buddy) == order) { 357 if (PageBuddy(buddy) && page_order(buddy) == order) {
358 BUG_ON(page_count(buddy) != 0); 358 BUG_ON(page_count(buddy) != 0);
359 return 1; 359 return 1;
360 } 360 }
361 return 0; 361 return 0;
362 } 362 }
363 363
364 /* 364 /*
365 * Freeing function for a buddy system allocator. 365 * Freeing function for a buddy system allocator.
366 * 366 *
367 * The concept of a buddy system is to maintain a direct-mapped table 367 * The concept of a buddy system is to maintain a direct-mapped table
368 * (containing bit values) for memory blocks of various "orders". 368 * (containing bit values) for memory blocks of various "orders".
369 * The bottom level table contains the map for the smallest allocatable 369 * The bottom level table contains the map for the smallest allocatable
370 * units of memory (here, pages), and each level above it describes 370 * units of memory (here, pages), and each level above it describes
371 * pairs of units from the levels below, hence, "buddies". 371 * pairs of units from the levels below, hence, "buddies".
372 * At a high level, all that happens here is marking the table entry 372 * At a high level, all that happens here is marking the table entry
373 * at the bottom level available, and propagating the changes upward 373 * at the bottom level available, and propagating the changes upward
374 * as necessary, plus some accounting needed to play nicely with other 374 * as necessary, plus some accounting needed to play nicely with other
375 * parts of the VM system. 375 * parts of the VM system.
376 * At each level, we keep a list of pages, which are heads of contiguous 376 * At each level, we keep a list of pages, which are heads of contiguous
377 * free pages of length (1 << order), marked with PG_buddy. The page's 377 * free pages of length (1 << order), marked with PG_buddy. The page's
378 * order is recorded in the page_private(page) field. 378 * order is recorded in the page_private(page) field.
379 * So when we are allocating or freeing one, we can derive the state of the 379 * So when we are allocating or freeing one, we can derive the state of the
380 * other. That is, if we allocate a small block, and both were 380 * other. That is, if we allocate a small block, and both were
381 * free, the remainder of the region must be split into blocks. 381 * free, the remainder of the region must be split into blocks.
382 * If a block is freed, and its buddy is also free, then this 382 * If a block is freed, and its buddy is also free, then this
383 * triggers coalescing into a block of larger size. 383 * triggers coalescing into a block of larger size.
384 * 384 *
385 * -- wli 385 * -- wli
386 */ 386 */
387 387
388 static inline void __free_one_page(struct page *page, 388 static inline void __free_one_page(struct page *page,
389 struct zone *zone, unsigned int order) 389 struct zone *zone, unsigned int order)
390 { 390 {
391 unsigned long page_idx; 391 unsigned long page_idx;
392 int order_size = 1 << order; 392 int order_size = 1 << order;
393 393
394 if (unlikely(PageCompound(page))) 394 if (unlikely(PageCompound(page)))
395 destroy_compound_page(page, order); 395 destroy_compound_page(page, order);
396 396
397 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 397 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
398 398
399 VM_BUG_ON(page_idx & (order_size - 1)); 399 VM_BUG_ON(page_idx & (order_size - 1));
400 VM_BUG_ON(bad_range(zone, page)); 400 VM_BUG_ON(bad_range(zone, page));
401 401
402 __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); 402 __mod_zone_page_state(zone, NR_FREE_PAGES, order_size);
403 while (order < MAX_ORDER-1) { 403 while (order < MAX_ORDER-1) {
404 unsigned long combined_idx; 404 unsigned long combined_idx;
405 struct free_area *area; 405 struct free_area *area;
406 struct page *buddy; 406 struct page *buddy;
407 407
408 buddy = __page_find_buddy(page, page_idx, order); 408 buddy = __page_find_buddy(page, page_idx, order);
409 if (!page_is_buddy(page, buddy, order)) 409 if (!page_is_buddy(page, buddy, order))
410 break; /* Move the buddy up one level. */ 410 break; /* Move the buddy up one level. */
411 411
412 list_del(&buddy->lru); 412 list_del(&buddy->lru);
413 area = zone->free_area + order; 413 area = zone->free_area + order;
414 area->nr_free--; 414 area->nr_free--;
415 rmv_page_order(buddy); 415 rmv_page_order(buddy);
416 combined_idx = __find_combined_index(page_idx, order); 416 combined_idx = __find_combined_index(page_idx, order);
417 page = page + (combined_idx - page_idx); 417 page = page + (combined_idx - page_idx);
418 page_idx = combined_idx; 418 page_idx = combined_idx;
419 order++; 419 order++;
420 } 420 }
421 set_page_order(page, order); 421 set_page_order(page, order);
422 list_add(&page->lru, &zone->free_area[order].free_list); 422 list_add(&page->lru, &zone->free_area[order].free_list);
423 zone->free_area[order].nr_free++; 423 zone->free_area[order].nr_free++;
424 } 424 }
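To see the coalescing loop of __free_one_page() in isolation, here is a small userspace model under obviously simplified assumptions: a 16-page "zone", free blocks tracked in a per-order boolean table instead of list_heads, and local names (MODEL_MAX_ORDER, model_free_one_page) that exist only in this sketch.

#include <stdio.h>
#include <stdbool.h>

#define MODEL_MAX_ORDER 5
#define NPAGES 16

/* free_block[o][i] is true when a free block of order o starts at index i */
static bool free_block[MODEL_MAX_ORDER][NPAGES];

static void model_free_one_page(unsigned long idx)
{
	unsigned int order = 0;

	while (order < MODEL_MAX_ORDER - 1) {
		unsigned long buddy = idx ^ (1UL << order);

		if (!free_block[order][buddy])
			break;				/* buddy busy: stop merging */
		free_block[order][buddy] = false;	/* pull the buddy off its list */
		idx &= ~(1UL << order);			/* index of the combined block */
		order++;
	}
	free_block[order][idx] = true;	/* park the (possibly merged) block */
}

int main(void)
{
	unsigned long i;
	unsigned int o;

	for (i = 0; i < 4; i++)		/* free pages 0..3 one at a time */
		model_free_one_page(i);

	for (o = 0; o < MODEL_MAX_ORDER; o++)
		for (i = 0; i < NPAGES; i++)
			if (free_block[o][i])
				printf("free block: order %u at index %lu\n", o, i);
	return 0;
}

Freeing pages 0 through 3 one by one ends with a single order-2 block at index 0, which is exactly the cascade the while loop above performs.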
425 425
426 static inline int free_pages_check(struct page *page) 426 static inline int free_pages_check(struct page *page)
427 { 427 {
428 if (unlikely(page_mapcount(page) | 428 if (unlikely(page_mapcount(page) |
429 (page->mapping != NULL) | 429 (page->mapping != NULL) |
430 (page_count(page) != 0) | 430 (page_count(page) != 0) |
431 (page->flags & ( 431 (page->flags & (
432 1 << PG_lru | 432 1 << PG_lru |
433 1 << PG_private | 433 1 << PG_private |
434 1 << PG_locked | 434 1 << PG_locked |
435 1 << PG_active | 435 1 << PG_active |
436 1 << PG_reclaim | 436 1 << PG_reclaim |
437 1 << PG_slab | 437 1 << PG_slab |
438 1 << PG_swapcache | 438 1 << PG_swapcache |
439 1 << PG_writeback | 439 1 << PG_writeback |
440 1 << PG_reserved | 440 1 << PG_reserved |
441 1 << PG_buddy )))) 441 1 << PG_buddy ))))
442 bad_page(page); 442 bad_page(page);
443 if (PageDirty(page)) 443 if (PageDirty(page))
444 __ClearPageDirty(page); 444 __ClearPageDirty(page);
445 /* 445 /*
446 * For now, we report if PG_reserved was found set, but do not 446 * For now, we report if PG_reserved was found set, but do not
447 * clear it, and do not free the page. But we shall soon need 447 * clear it, and do not free the page. But we shall soon need
448 * to do more, for when the ZERO_PAGE count wraps negative. 448 * to do more, for when the ZERO_PAGE count wraps negative.
449 */ 449 */
450 return PageReserved(page); 450 return PageReserved(page);
451 } 451 }
452 452
453 /* 453 /*
454 * Frees a list of pages. 454 * Frees a list of pages.
455 * Assumes all pages on list are in same zone, and of same order. 455 * Assumes all pages on list are in same zone, and of same order.
456 * count is the number of pages to free. 456 * count is the number of pages to free.
457 * 457 *
458 * If the zone was previously in an "all pages pinned" state then look to 458 * If the zone was previously in an "all pages pinned" state then look to
459 * see if this freeing clears that state. 459 * see if this freeing clears that state.
460 * 460 *
461 * And clear the zone's pages_scanned counter, to hold off the "all pages are 461 * And clear the zone's pages_scanned counter, to hold off the "all pages are
462 * pinned" detection logic. 462 * pinned" detection logic.
463 */ 463 */
464 static void free_pages_bulk(struct zone *zone, int count, 464 static void free_pages_bulk(struct zone *zone, int count,
465 struct list_head *list, int order) 465 struct list_head *list, int order)
466 { 466 {
467 spin_lock(&zone->lock); 467 spin_lock(&zone->lock);
468 zone->all_unreclaimable = 0; 468 zone->all_unreclaimable = 0;
469 zone->pages_scanned = 0; 469 zone->pages_scanned = 0;
470 while (count--) { 470 while (count--) {
471 struct page *page; 471 struct page *page;
472 472
473 VM_BUG_ON(list_empty(list)); 473 VM_BUG_ON(list_empty(list));
474 page = list_entry(list->prev, struct page, lru); 474 page = list_entry(list->prev, struct page, lru);
475 /* have to delete it as __free_one_page manipulates the list */ 475 /* have to delete it as __free_one_page manipulates the list */
476 list_del(&page->lru); 476 list_del(&page->lru);
477 __free_one_page(page, zone, order); 477 __free_one_page(page, zone, order);
478 } 478 }
479 spin_unlock(&zone->lock); 479 spin_unlock(&zone->lock);
480 } 480 }
481 481
482 static void free_one_page(struct zone *zone, struct page *page, int order) 482 static void free_one_page(struct zone *zone, struct page *page, int order)
483 { 483 {
484 spin_lock(&zone->lock); 484 spin_lock(&zone->lock);
485 zone->all_unreclaimable = 0; 485 zone->all_unreclaimable = 0;
486 zone->pages_scanned = 0; 486 zone->pages_scanned = 0;
487 __free_one_page(page, zone, order); 487 __free_one_page(page, zone, order);
488 spin_unlock(&zone->lock); 488 spin_unlock(&zone->lock);
489 } 489 }
490 490
491 static void __free_pages_ok(struct page *page, unsigned int order) 491 static void __free_pages_ok(struct page *page, unsigned int order)
492 { 492 {
493 unsigned long flags; 493 unsigned long flags;
494 int i; 494 int i;
495 int reserved = 0; 495 int reserved = 0;
496 496
497 for (i = 0 ; i < (1 << order) ; ++i) 497 for (i = 0 ; i < (1 << order) ; ++i)
498 reserved += free_pages_check(page + i); 498 reserved += free_pages_check(page + i);
499 if (reserved) 499 if (reserved)
500 return; 500 return;
501 501
502 if (!PageHighMem(page)) 502 if (!PageHighMem(page))
503 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); 503 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
504 arch_free_page(page, order); 504 arch_free_page(page, order);
505 kernel_map_pages(page, 1 << order, 0); 505 kernel_map_pages(page, 1 << order, 0);
506 506
507 local_irq_save(flags); 507 local_irq_save(flags);
508 __count_vm_events(PGFREE, 1 << order); 508 __count_vm_events(PGFREE, 1 << order);
509 free_one_page(page_zone(page), page, order); 509 free_one_page(page_zone(page), page, order);
510 local_irq_restore(flags); 510 local_irq_restore(flags);
511 } 511 }
512 512
513 /* 513 /*
514 * permit the bootmem allocator to evade page validation on high-order frees 514 * permit the bootmem allocator to evade page validation on high-order frees
515 */ 515 */
516 void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) 516 void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
517 { 517 {
518 if (order == 0) { 518 if (order == 0) {
519 __ClearPageReserved(page); 519 __ClearPageReserved(page);
520 set_page_count(page, 0); 520 set_page_count(page, 0);
521 set_page_refcounted(page); 521 set_page_refcounted(page);
522 __free_page(page); 522 __free_page(page);
523 } else { 523 } else {
524 int loop; 524 int loop;
525 525
526 prefetchw(page); 526 prefetchw(page);
527 for (loop = 0; loop < BITS_PER_LONG; loop++) { 527 for (loop = 0; loop < BITS_PER_LONG; loop++) {
528 struct page *p = &page[loop]; 528 struct page *p = &page[loop];
529 529
530 if (loop + 1 < BITS_PER_LONG) 530 if (loop + 1 < BITS_PER_LONG)
531 prefetchw(p + 1); 531 prefetchw(p + 1);
532 __ClearPageReserved(p); 532 __ClearPageReserved(p);
533 set_page_count(p, 0); 533 set_page_count(p, 0);
534 } 534 }
535 535
536 set_page_refcounted(page); 536 set_page_refcounted(page);
537 __free_pages(page, order); 537 __free_pages(page, order);
538 } 538 }
539 } 539 }
540 540
541 541
542 /* 542 /*
543 * The order of subdivision here is critical for the IO subsystem. 543 * The order of subdivision here is critical for the IO subsystem.
544 * Please do not alter this order without good reasons and regression 544 * Please do not alter this order without good reasons and regression
545 * testing. Specifically, as large blocks of memory are subdivided, 545 * testing. Specifically, as large blocks of memory are subdivided,
546 * the order in which smaller blocks are delivered depends on the order 546 * the order in which smaller blocks are delivered depends on the order
547 * they're subdivided in this function. This is the primary factor 547 * they're subdivided in this function. This is the primary factor
548 * influencing the order in which pages are delivered to the IO 548 * influencing the order in which pages are delivered to the IO
549 * subsystem according to empirical testing, and this is also justified 549 * subsystem according to empirical testing, and this is also justified
550 * by considering the behavior of a buddy system containing a single 550 * by considering the behavior of a buddy system containing a single
551 * large block of memory acted on by a series of small allocations. 551 * large block of memory acted on by a series of small allocations.
552 * This behavior is a critical factor in sglist merging's success. 552 * This behavior is a critical factor in sglist merging's success.
553 * 553 *
554 * -- wli 554 * -- wli
555 */ 555 */
556 static inline void expand(struct zone *zone, struct page *page, 556 static inline void expand(struct zone *zone, struct page *page,
557 int low, int high, struct free_area *area) 557 int low, int high, struct free_area *area)
558 { 558 {
559 unsigned long size = 1 << high; 559 unsigned long size = 1 << high;
560 560
561 while (high > low) { 561 while (high > low) {
562 area--; 562 area--;
563 high--; 563 high--;
564 size >>= 1; 564 size >>= 1;
565 VM_BUG_ON(bad_range(zone, &page[size])); 565 VM_BUG_ON(bad_range(zone, &page[size]));
566 list_add(&page[size].lru, &area->free_list); 566 list_add(&page[size].lru, &area->free_list);
567 area->nr_free++; 567 area->nr_free++;
568 set_page_order(&page[size], high); 568 set_page_order(&page[size], high);
569 } 569 }
570 } 570 }
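expand() peels the upper half off the block at each order on the way down, which is the delivery order the comment before it cares about. A standalone sketch of just that splitting arithmetic (indices and sizes only, no free lists), assuming an order-3 block is being used to satisfy an order-0 request:

#include <stdio.h>

int main(void)
{
	unsigned int low = 0, high = 3;		/* want order 0 out of an order-3 block */
	unsigned long size = 1UL << high;
	unsigned long offset = 0;		/* start of the block being split */

	while (high > low) {
		high--;
		size >>= 1;
		/* the upper half goes back on the order-'high' free list */
		printf("put back: order %u block at offset %lu (%lu pages)\n",
		       high, offset + size, size);
	}
	printf("hand out: order %u page at offset %lu\n", low, offset);
	return 0;
}

The remainders come back at offsets 4, 2 and 1 with sizes 4, 2 and 1, mirroring the list_add(&page[size].lru, ...) calls above.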
571 571
572 /* 572 /*
573 * This page is about to be returned from the page allocator 573 * This page is about to be returned from the page allocator
574 */ 574 */
575 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 575 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
576 { 576 {
577 if (unlikely(page_mapcount(page) | 577 if (unlikely(page_mapcount(page) |
578 (page->mapping != NULL) | 578 (page->mapping != NULL) |
579 (page_count(page) != 0) | 579 (page_count(page) != 0) |
580 (page->flags & ( 580 (page->flags & (
581 1 << PG_lru | 581 1 << PG_lru |
582 1 << PG_private | 582 1 << PG_private |
583 1 << PG_locked | 583 1 << PG_locked |
584 1 << PG_active | 584 1 << PG_active |
585 1 << PG_dirty | 585 1 << PG_dirty |
586 1 << PG_reclaim | 586 1 << PG_reclaim |
587 1 << PG_slab | 587 1 << PG_slab |
588 1 << PG_swapcache | 588 1 << PG_swapcache |
589 1 << PG_writeback | 589 1 << PG_writeback |
590 1 << PG_reserved | 590 1 << PG_reserved |
591 1 << PG_buddy )))) 591 1 << PG_buddy ))))
592 bad_page(page); 592 bad_page(page);
593 593
594 /* 594 /*
595 * For now, we report if PG_reserved was found set, but do not 595 * For now, we report if PG_reserved was found set, but do not
596 * clear it, and do not allocate the page: as a safety net. 596 * clear it, and do not allocate the page: as a safety net.
597 */ 597 */
598 if (PageReserved(page)) 598 if (PageReserved(page))
599 return 1; 599 return 1;
600 600
601 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 601 page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
602 1 << PG_referenced | 1 << PG_arch_1 | 602 1 << PG_referenced | 1 << PG_arch_1 |
603 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); 603 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk);
604 set_page_private(page, 0); 604 set_page_private(page, 0);
605 set_page_refcounted(page); 605 set_page_refcounted(page);
606 606
607 arch_alloc_page(page, order); 607 arch_alloc_page(page, order);
608 kernel_map_pages(page, 1 << order, 1); 608 kernel_map_pages(page, 1 << order, 1);
609 609
610 if (gfp_flags & __GFP_ZERO) 610 if (gfp_flags & __GFP_ZERO)
611 prep_zero_page(page, order, gfp_flags); 611 prep_zero_page(page, order, gfp_flags);
612 612
613 if (order && (gfp_flags & __GFP_COMP)) 613 if (order && (gfp_flags & __GFP_COMP))
614 prep_compound_page(page, order); 614 prep_compound_page(page, order);
615 615
616 return 0; 616 return 0;
617 } 617 }
618 618
619 /* 619 /*
620 * Do the hard work of removing an element from the buddy allocator. 620 * Do the hard work of removing an element from the buddy allocator.
621 * Call me with the zone->lock already held. 621 * Call me with the zone->lock already held.
622 */ 622 */
623 static struct page *__rmqueue(struct zone *zone, unsigned int order) 623 static struct page *__rmqueue(struct zone *zone, unsigned int order)
624 { 624 {
625 struct free_area * area; 625 struct free_area * area;
626 unsigned int current_order; 626 unsigned int current_order;
627 struct page *page; 627 struct page *page;
628 628
629 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 629 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
630 area = zone->free_area + current_order; 630 area = zone->free_area + current_order;
631 if (list_empty(&area->free_list)) 631 if (list_empty(&area->free_list))
632 continue; 632 continue;
633 633
634 page = list_entry(area->free_list.next, struct page, lru); 634 page = list_entry(area->free_list.next, struct page, lru);
635 list_del(&page->lru); 635 list_del(&page->lru);
636 rmv_page_order(page); 636 rmv_page_order(page);
637 area->nr_free--; 637 area->nr_free--;
638 __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); 638 __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
639 expand(zone, page, order, current_order, area); 639 expand(zone, page, order, current_order, area);
640 return page; 640 return page;
641 } 641 }
642 642
643 return NULL; 643 return NULL;
644 } 644 }
645 645
646 /* 646 /*
647 * Obtain a specified number of elements from the buddy allocator, all under 647 * Obtain a specified number of elements from the buddy allocator, all under
648 * a single hold of the lock, for efficiency. Add them to the supplied list. 648 * a single hold of the lock, for efficiency. Add them to the supplied list.
649 * Returns the number of new pages which were placed at *list. 649 * Returns the number of new pages which were placed at *list.
650 */ 650 */
651 static int rmqueue_bulk(struct zone *zone, unsigned int order, 651 static int rmqueue_bulk(struct zone *zone, unsigned int order,
652 unsigned long count, struct list_head *list) 652 unsigned long count, struct list_head *list)
653 { 653 {
654 int i; 654 int i;
655 655
656 spin_lock(&zone->lock); 656 spin_lock(&zone->lock);
657 for (i = 0; i < count; ++i) { 657 for (i = 0; i < count; ++i) {
658 struct page *page = __rmqueue(zone, order); 658 struct page *page = __rmqueue(zone, order);
659 if (unlikely(page == NULL)) 659 if (unlikely(page == NULL))
660 break; 660 break;
661 list_add_tail(&page->lru, list); 661 list_add_tail(&page->lru, list);
662 } 662 }
663 spin_unlock(&zone->lock); 663 spin_unlock(&zone->lock);
664 return i; 664 return i;
665 } 665 }
666 666
667 #if MAX_NUMNODES > 1 667 #if MAX_NUMNODES > 1
668 int nr_node_ids __read_mostly; 668 int nr_node_ids __read_mostly = MAX_NUMNODES;
669 EXPORT_SYMBOL(nr_node_ids); 669 EXPORT_SYMBOL(nr_node_ids);
670 670
671 /* 671 /*
672 * Figure out the number of possible node ids. 672 * Figure out the number of possible node ids.
673 */ 673 */
674 static void __init setup_nr_node_ids(void) 674 static void __init setup_nr_node_ids(void)
675 { 675 {
676 unsigned int node; 676 unsigned int node;
677 unsigned int highest = 0; 677 unsigned int highest = 0;
678 678
679 for_each_node_mask(node, node_possible_map) 679 for_each_node_mask(node, node_possible_map)
680 highest = node; 680 highest = node;
681 nr_node_ids = highest + 1; 681 nr_node_ids = highest + 1;
682 } 682 }
683 #else 683 #else
684 static void __init setup_nr_node_ids(void) {} 684 static void __init setup_nr_node_ids(void) {}
685 #endif 685 #endif
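This hunk is half the point of the patch: nr_node_ids now starts at MAX_NUMNODES instead of 0, so a per-node array sized from it before setup_nr_node_ids() runs is at worst oversized, never empty. A hedged sketch of what such an accidental early user could look like; the function and array below are hypothetical and not part of this patch:

/*
 * Hypothetical early-boot consumer, for illustration only: sizing a
 * per-node array from nr_node_ids before setup_nr_node_ids() has
 * trimmed it.  With the new MAX_NUMNODES default the allocation can
 * only be too large; with the old default of 0 it would be empty.
 */
static unsigned long *node_counters __initdata;

static void __init early_node_counters_init(void)
{
	node_counters = alloc_bootmem(nr_node_ids * sizeof(*node_counters));
}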
686 686
687 #ifdef CONFIG_NUMA 687 #ifdef CONFIG_NUMA
688 /* 688 /*
689 * Called from the slab reaper to drain pagesets on a particular node that 689 * Called from the slab reaper to drain pagesets on a particular node that
690 * belongs to the currently executing processor. 690 * belongs to the currently executing processor.
691 * Note that this function must be called with the thread pinned to 691 * Note that this function must be called with the thread pinned to
692 * a single processor. 692 * a single processor.
693 */ 693 */
694 void drain_node_pages(int nodeid) 694 void drain_node_pages(int nodeid)
695 { 695 {
696 int i; 696 int i;
697 enum zone_type z; 697 enum zone_type z;
698 unsigned long flags; 698 unsigned long flags;
699 699
700 for (z = 0; z < MAX_NR_ZONES; z++) { 700 for (z = 0; z < MAX_NR_ZONES; z++) {
701 struct zone *zone = NODE_DATA(nodeid)->node_zones + z; 701 struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
702 struct per_cpu_pageset *pset; 702 struct per_cpu_pageset *pset;
703 703
704 if (!populated_zone(zone)) 704 if (!populated_zone(zone))
705 continue; 705 continue;
706 706
707 pset = zone_pcp(zone, smp_processor_id()); 707 pset = zone_pcp(zone, smp_processor_id());
708 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 708 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
709 struct per_cpu_pages *pcp; 709 struct per_cpu_pages *pcp;
710 710
711 pcp = &pset->pcp[i]; 711 pcp = &pset->pcp[i];
712 if (pcp->count) { 712 if (pcp->count) {
713 int to_drain; 713 int to_drain;
714 714
715 local_irq_save(flags); 715 local_irq_save(flags);
716 if (pcp->count >= pcp->batch) 716 if (pcp->count >= pcp->batch)
717 to_drain = pcp->batch; 717 to_drain = pcp->batch;
718 else 718 else
719 to_drain = pcp->count; 719 to_drain = pcp->count;
720 free_pages_bulk(zone, to_drain, &pcp->list, 0); 720 free_pages_bulk(zone, to_drain, &pcp->list, 0);
721 pcp->count -= to_drain; 721 pcp->count -= to_drain;
722 local_irq_restore(flags); 722 local_irq_restore(flags);
723 } 723 }
724 } 724 }
725 } 725 }
726 } 726 }
727 #endif 727 #endif
728 728
729 static void __drain_pages(unsigned int cpu) 729 static void __drain_pages(unsigned int cpu)
730 { 730 {
731 unsigned long flags; 731 unsigned long flags;
732 struct zone *zone; 732 struct zone *zone;
733 int i; 733 int i;
734 734
735 for_each_zone(zone) { 735 for_each_zone(zone) {
736 struct per_cpu_pageset *pset; 736 struct per_cpu_pageset *pset;
737 737
738 if (!populated_zone(zone)) 738 if (!populated_zone(zone))
739 continue; 739 continue;
740 740
741 pset = zone_pcp(zone, cpu); 741 pset = zone_pcp(zone, cpu);
742 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 742 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
743 struct per_cpu_pages *pcp; 743 struct per_cpu_pages *pcp;
744 744
745 pcp = &pset->pcp[i]; 745 pcp = &pset->pcp[i];
746 local_irq_save(flags); 746 local_irq_save(flags);
747 free_pages_bulk(zone, pcp->count, &pcp->list, 0); 747 free_pages_bulk(zone, pcp->count, &pcp->list, 0);
748 pcp->count = 0; 748 pcp->count = 0;
749 local_irq_restore(flags); 749 local_irq_restore(flags);
750 } 750 }
751 } 751 }
752 } 752 }
753 753
754 #ifdef CONFIG_PM 754 #ifdef CONFIG_PM
755 755
756 void mark_free_pages(struct zone *zone) 756 void mark_free_pages(struct zone *zone)
757 { 757 {
758 unsigned long pfn, max_zone_pfn; 758 unsigned long pfn, max_zone_pfn;
759 unsigned long flags; 759 unsigned long flags;
760 int order; 760 int order;
761 struct list_head *curr; 761 struct list_head *curr;
762 762
763 if (!zone->spanned_pages) 763 if (!zone->spanned_pages)
764 return; 764 return;
765 765
766 spin_lock_irqsave(&zone->lock, flags); 766 spin_lock_irqsave(&zone->lock, flags);
767 767
768 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 768 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
769 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 769 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
770 if (pfn_valid(pfn)) { 770 if (pfn_valid(pfn)) {
771 struct page *page = pfn_to_page(pfn); 771 struct page *page = pfn_to_page(pfn);
772 772
773 if (!PageNosave(page)) 773 if (!PageNosave(page))
774 ClearPageNosaveFree(page); 774 ClearPageNosaveFree(page);
775 } 775 }
776 776
777 for (order = MAX_ORDER - 1; order >= 0; --order) 777 for (order = MAX_ORDER - 1; order >= 0; --order)
778 list_for_each(curr, &zone->free_area[order].free_list) { 778 list_for_each(curr, &zone->free_area[order].free_list) {
779 unsigned long i; 779 unsigned long i;
780 780
781 pfn = page_to_pfn(list_entry(curr, struct page, lru)); 781 pfn = page_to_pfn(list_entry(curr, struct page, lru));
782 for (i = 0; i < (1UL << order); i++) 782 for (i = 0; i < (1UL << order); i++)
783 SetPageNosaveFree(pfn_to_page(pfn + i)); 783 SetPageNosaveFree(pfn_to_page(pfn + i));
784 } 784 }
785 785
786 spin_unlock_irqrestore(&zone->lock, flags); 786 spin_unlock_irqrestore(&zone->lock, flags);
787 } 787 }
788 788
789 /* 789 /*
790 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 790 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
791 */ 791 */
792 void drain_local_pages(void) 792 void drain_local_pages(void)
793 { 793 {
794 unsigned long flags; 794 unsigned long flags;
795 795
796 local_irq_save(flags); 796 local_irq_save(flags);
797 __drain_pages(smp_processor_id()); 797 __drain_pages(smp_processor_id());
798 local_irq_restore(flags); 798 local_irq_restore(flags);
799 } 799 }
800 #endif /* CONFIG_PM */ 800 #endif /* CONFIG_PM */
801 801
802 /* 802 /*
803 * Free a 0-order page 803 * Free a 0-order page
804 */ 804 */
805 static void fastcall free_hot_cold_page(struct page *page, int cold) 805 static void fastcall free_hot_cold_page(struct page *page, int cold)
806 { 806 {
807 struct zone *zone = page_zone(page); 807 struct zone *zone = page_zone(page);
808 struct per_cpu_pages *pcp; 808 struct per_cpu_pages *pcp;
809 unsigned long flags; 809 unsigned long flags;
810 810
811 if (PageAnon(page)) 811 if (PageAnon(page))
812 page->mapping = NULL; 812 page->mapping = NULL;
813 if (free_pages_check(page)) 813 if (free_pages_check(page))
814 return; 814 return;
815 815
816 if (!PageHighMem(page)) 816 if (!PageHighMem(page))
817 debug_check_no_locks_freed(page_address(page), PAGE_SIZE); 817 debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
818 arch_free_page(page, 0); 818 arch_free_page(page, 0);
819 kernel_map_pages(page, 1, 0); 819 kernel_map_pages(page, 1, 0);
820 820
821 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 821 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
822 local_irq_save(flags); 822 local_irq_save(flags);
823 __count_vm_event(PGFREE); 823 __count_vm_event(PGFREE);
824 list_add(&page->lru, &pcp->list); 824 list_add(&page->lru, &pcp->list);
825 pcp->count++; 825 pcp->count++;
826 if (pcp->count >= pcp->high) { 826 if (pcp->count >= pcp->high) {
827 free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 827 free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
828 pcp->count -= pcp->batch; 828 pcp->count -= pcp->batch;
829 } 829 }
830 local_irq_restore(flags); 830 local_irq_restore(flags);
831 put_cpu(); 831 put_cpu();
832 } 832 }
833 833
834 void fastcall free_hot_page(struct page *page) 834 void fastcall free_hot_page(struct page *page)
835 { 835 {
836 free_hot_cold_page(page, 0); 836 free_hot_cold_page(page, 0);
837 } 837 }
838 838
839 void fastcall free_cold_page(struct page *page) 839 void fastcall free_cold_page(struct page *page)
840 { 840 {
841 free_hot_cold_page(page, 1); 841 free_hot_cold_page(page, 1);
842 } 842 }
843 843
844 /* 844 /*
845 * split_page takes a non-compound higher-order page, and splits it into 845 * split_page takes a non-compound higher-order page, and splits it into
846 * n (1<<order) sub-pages: page[0..n-1] 846 * n (1<<order) sub-pages: page[0..n-1]
847 * Each sub-page must be freed individually. 847 * Each sub-page must be freed individually.
848 * 848 *
849 * Note: this is probably too low level an operation for use in drivers. 849 * Note: this is probably too low level an operation for use in drivers.
850 * Please consult with lkml before using this in your driver. 850 * Please consult with lkml before using this in your driver.
851 */ 851 */
852 void split_page(struct page *page, unsigned int order) 852 void split_page(struct page *page, unsigned int order)
853 { 853 {
854 int i; 854 int i;
855 855
856 VM_BUG_ON(PageCompound(page)); 856 VM_BUG_ON(PageCompound(page));
857 VM_BUG_ON(!page_count(page)); 857 VM_BUG_ON(!page_count(page));
858 for (i = 1; i < (1 << order); i++) 858 for (i = 1; i < (1 << order); i++)
859 set_page_refcounted(page + i); 859 set_page_refcounted(page + i);
860 } 860 }
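For context, a sketch of how a caller might use split_page(); this fragment is not from the patch, and the GFP flags and error handling are illustrative only. The allocation must be non-compound (no __GFP_COMP), and after the split each page carries its own reference and can be freed independently:

	struct page *page = alloc_pages(GFP_KERNEL, 2);	/* order-2, non-compound */
	int i;

	if (page) {
		split_page(page, 2);	/* give each of the 4 pages its own refcount */
		for (i = 0; i < 4; i++)
			__free_page(page + i);	/* may be freed individually, in any order */
	}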
861 861
862 /* 862 /*
863 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 863 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
864 * we cheat by calling it from here, in the order > 0 path. Saves a branch 864 * we cheat by calling it from here, in the order > 0 path. Saves a branch
865 * or two. 865 * or two.
866 */ 866 */
867 static struct page *buffered_rmqueue(struct zonelist *zonelist, 867 static struct page *buffered_rmqueue(struct zonelist *zonelist,
868 struct zone *zone, int order, gfp_t gfp_flags) 868 struct zone *zone, int order, gfp_t gfp_flags)
869 { 869 {
870 unsigned long flags; 870 unsigned long flags;
871 struct page *page; 871 struct page *page;
872 int cold = !!(gfp_flags & __GFP_COLD); 872 int cold = !!(gfp_flags & __GFP_COLD);
873 int cpu; 873 int cpu;
874 874
875 again: 875 again:
876 cpu = get_cpu(); 876 cpu = get_cpu();
877 if (likely(order == 0)) { 877 if (likely(order == 0)) {
878 struct per_cpu_pages *pcp; 878 struct per_cpu_pages *pcp;
879 879
880 pcp = &zone_pcp(zone, cpu)->pcp[cold]; 880 pcp = &zone_pcp(zone, cpu)->pcp[cold];
881 local_irq_save(flags); 881 local_irq_save(flags);
882 if (!pcp->count) { 882 if (!pcp->count) {
883 pcp->count = rmqueue_bulk(zone, 0, 883 pcp->count = rmqueue_bulk(zone, 0,
884 pcp->batch, &pcp->list); 884 pcp->batch, &pcp->list);
885 if (unlikely(!pcp->count)) 885 if (unlikely(!pcp->count))
886 goto failed; 886 goto failed;
887 } 887 }
888 page = list_entry(pcp->list.next, struct page, lru); 888 page = list_entry(pcp->list.next, struct page, lru);
889 list_del(&page->lru); 889 list_del(&page->lru);
890 pcp->count--; 890 pcp->count--;
891 } else { 891 } else {
892 spin_lock_irqsave(&zone->lock, flags); 892 spin_lock_irqsave(&zone->lock, flags);
893 page = __rmqueue(zone, order); 893 page = __rmqueue(zone, order);
894 spin_unlock(&zone->lock); 894 spin_unlock(&zone->lock);
895 if (!page) 895 if (!page)
896 goto failed; 896 goto failed;
897 } 897 }
898 898
899 __count_zone_vm_events(PGALLOC, zone, 1 << order); 899 __count_zone_vm_events(PGALLOC, zone, 1 << order);
900 zone_statistics(zonelist, zone); 900 zone_statistics(zonelist, zone);
901 local_irq_restore(flags); 901 local_irq_restore(flags);
902 put_cpu(); 902 put_cpu();
903 903
904 VM_BUG_ON(bad_range(zone, page)); 904 VM_BUG_ON(bad_range(zone, page));
905 if (prep_new_page(page, order, gfp_flags)) 905 if (prep_new_page(page, order, gfp_flags))
906 goto again; 906 goto again;
907 return page; 907 return page;
908 908
909 failed: 909 failed:
910 local_irq_restore(flags); 910 local_irq_restore(flags);
911 put_cpu(); 911 put_cpu();
912 return NULL; 912 return NULL;
913 } 913 }
914 914
915 #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ 915 #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */
916 #define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ 916 #define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */
917 #define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ 917 #define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */
918 #define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ 918 #define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */
919 #define ALLOC_HARDER 0x10 /* try to alloc harder */ 919 #define ALLOC_HARDER 0x10 /* try to alloc harder */
920 #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ 920 #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
921 #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ 921 #define ALLOC_CPUSET 0x40 /* check for correct cpuset */
922 922
923 #ifdef CONFIG_FAIL_PAGE_ALLOC 923 #ifdef CONFIG_FAIL_PAGE_ALLOC
924 924
925 static struct fail_page_alloc_attr { 925 static struct fail_page_alloc_attr {
926 struct fault_attr attr; 926 struct fault_attr attr;
927 927
928 u32 ignore_gfp_highmem; 928 u32 ignore_gfp_highmem;
929 u32 ignore_gfp_wait; 929 u32 ignore_gfp_wait;
930 930
931 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 931 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
932 932
933 struct dentry *ignore_gfp_highmem_file; 933 struct dentry *ignore_gfp_highmem_file;
934 struct dentry *ignore_gfp_wait_file; 934 struct dentry *ignore_gfp_wait_file;
935 935
936 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 936 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
937 937
938 } fail_page_alloc = { 938 } fail_page_alloc = {
939 .attr = FAULT_ATTR_INITIALIZER, 939 .attr = FAULT_ATTR_INITIALIZER,
940 .ignore_gfp_wait = 1, 940 .ignore_gfp_wait = 1,
941 .ignore_gfp_highmem = 1, 941 .ignore_gfp_highmem = 1,
942 }; 942 };
943 943
944 static int __init setup_fail_page_alloc(char *str) 944 static int __init setup_fail_page_alloc(char *str)
945 { 945 {
946 return setup_fault_attr(&fail_page_alloc.attr, str); 946 return setup_fault_attr(&fail_page_alloc.attr, str);
947 } 947 }
948 __setup("fail_page_alloc=", setup_fail_page_alloc); 948 __setup("fail_page_alloc=", setup_fail_page_alloc);
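The __setup hook above wires the fault-injection attributes to a boot parameter. Assuming it follows the generic fault_attr convention of interval,probability,space,times (an assumption, not something this diff shows), an invocation might look like fail_page_alloc=1,10,0,-1 to fail roughly 10% of eligible page allocations with no count limit, while the debugfs booleans created below (ignore-gfp-wait, ignore-gfp-highmem) narrow at runtime which allocations are eligible to fail.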
949 949
950 static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 950 static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
951 { 951 {
952 if (gfp_mask & __GFP_NOFAIL) 952 if (gfp_mask & __GFP_NOFAIL)
953 return 0; 953 return 0;
954 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 954 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
955 return 0; 955 return 0;
956 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 956 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
957 return 0; 957 return 0;
958 958
959 return should_fail(&fail_page_alloc.attr, 1 << order); 959 return should_fail(&fail_page_alloc.attr, 1 << order);
960 } 960 }
961 961
962 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 962 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
963 963
964 static int __init fail_page_alloc_debugfs(void) 964 static int __init fail_page_alloc_debugfs(void)
965 { 965 {
966 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 966 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
967 struct dentry *dir; 967 struct dentry *dir;
968 int err; 968 int err;
969 969
970 err = init_fault_attr_dentries(&fail_page_alloc.attr, 970 err = init_fault_attr_dentries(&fail_page_alloc.attr,
971 "fail_page_alloc"); 971 "fail_page_alloc");
972 if (err) 972 if (err)
973 return err; 973 return err;
974 dir = fail_page_alloc.attr.dentries.dir; 974 dir = fail_page_alloc.attr.dentries.dir;
975 975
976 fail_page_alloc.ignore_gfp_wait_file = 976 fail_page_alloc.ignore_gfp_wait_file =
977 debugfs_create_bool("ignore-gfp-wait", mode, dir, 977 debugfs_create_bool("ignore-gfp-wait", mode, dir,
978 &fail_page_alloc.ignore_gfp_wait); 978 &fail_page_alloc.ignore_gfp_wait);
979 979
980 fail_page_alloc.ignore_gfp_highmem_file = 980 fail_page_alloc.ignore_gfp_highmem_file =
981 debugfs_create_bool("ignore-gfp-highmem", mode, dir, 981 debugfs_create_bool("ignore-gfp-highmem", mode, dir,
982 &fail_page_alloc.ignore_gfp_highmem); 982 &fail_page_alloc.ignore_gfp_highmem);
983 983
984 if (!fail_page_alloc.ignore_gfp_wait_file || 984 if (!fail_page_alloc.ignore_gfp_wait_file ||
985 !fail_page_alloc.ignore_gfp_highmem_file) { 985 !fail_page_alloc.ignore_gfp_highmem_file) {
986 err = -ENOMEM; 986 err = -ENOMEM;
987 debugfs_remove(fail_page_alloc.ignore_gfp_wait_file); 987 debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
988 debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file); 988 debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
989 cleanup_fault_attr_dentries(&fail_page_alloc.attr); 989 cleanup_fault_attr_dentries(&fail_page_alloc.attr);
990 } 990 }
991 991
992 return err; 992 return err;
993 } 993 }
994 994
995 late_initcall(fail_page_alloc_debugfs); 995 late_initcall(fail_page_alloc_debugfs);
996 996
997 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 997 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
998 998
999 #else /* CONFIG_FAIL_PAGE_ALLOC */ 999 #else /* CONFIG_FAIL_PAGE_ALLOC */
1000 1000
1001 static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1001 static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1002 { 1002 {
1003 return 0; 1003 return 0;
1004 } 1004 }
1005 1005
1006 #endif /* CONFIG_FAIL_PAGE_ALLOC */ 1006 #endif /* CONFIG_FAIL_PAGE_ALLOC */
1007 1007
1008 /* 1008 /*
1009 * Return 1 if free pages are above 'mark'. This takes into account the order 1009 * Return 1 if free pages are above 'mark'. This takes into account the order
1010 * of the allocation. 1010 * of the allocation.
1011 */ 1011 */
1012 int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1012 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1013 int classzone_idx, int alloc_flags) 1013 int classzone_idx, int alloc_flags)
1014 { 1014 {
1015 /* free_pages may go negative - that's OK */ 1015 /* free_pages may go negative - that's OK */
1016 long min = mark; 1016 long min = mark;
1017 long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1; 1017 long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
1018 int o; 1018 int o;
1019 1019
1020 if (alloc_flags & ALLOC_HIGH) 1020 if (alloc_flags & ALLOC_HIGH)
1021 min -= min / 2; 1021 min -= min / 2;
1022 if (alloc_flags & ALLOC_HARDER) 1022 if (alloc_flags & ALLOC_HARDER)
1023 min -= min / 4; 1023 min -= min / 4;
1024 1024
1025 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 1025 if (free_pages <= min + z->lowmem_reserve[classzone_idx])
1026 return 0; 1026 return 0;
1027 for (o = 0; o < order; o++) { 1027 for (o = 0; o < order; o++) {
1028 /* At the next order, this order's pages become unavailable */ 1028 /* At the next order, this order's pages become unavailable */
1029 free_pages -= z->free_area[o].nr_free << o; 1029 free_pages -= z->free_area[o].nr_free << o;
1030 1030
1031 /* Require fewer higher order pages to be free */ 1031 /* Require fewer higher order pages to be free */
1032 min >>= 1; 1032 min >>= 1;
1033 1033
1034 if (free_pages <= min) 1034 if (free_pages <= min)
1035 return 0; 1035 return 0;
1036 } 1036 }
1037 return 1; 1037 return 1;
1038 } 1038 }
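The watermark check above is plain integer arithmetic, so a worked example makes the ALLOC_HIGH boost easier to see. The sketch below mirrors the same logic in userspace; the free-page count, mark, lowmem_reserve and per-order nr_free values are all invented for illustration:

#include <stdio.h>

#define MODEL_MAX_ORDER 4

/* Mirror of the zone_watermark_ok() arithmetic with invented inputs. */
static int model_watermark_ok(long free_pages, int order, long mark,
			      long lowmem_reserve, const long *nr_free,
			      int alloc_high, int alloc_harder)
{
	long min = mark;
	int o;

	free_pages -= (1L << order) - 1;	/* pages this request would take */
	if (alloc_high)
		min -= min / 2;
	if (alloc_harder)
		min -= min / 4;

	if (free_pages <= min + lowmem_reserve)
		return 0;
	for (o = 0; o < order; o++) {
		free_pages -= nr_free[o] << o;	/* lower orders can't serve this request */
		min >>= 1;			/* but demand proportionally less */
		if (free_pages <= min)
			return 0;
	}
	return 1;
}

int main(void)
{
	const long nr_free[MODEL_MAX_ORDER] = { 300, 20, 10, 2 };

	printf("order-2, ALLOC_HIGH request: %s\n",
	       model_watermark_ok(420, 2, 256, 32, nr_free, 1, 0) ? "ok" : "fail");
	printf("order-2, normal request:     %s\n",
	       model_watermark_ok(420, 2, 256, 32, nr_free, 0, 0) ? "ok" : "fail");
	return 0;
}

With these made-up numbers the halved watermark lets the ALLOC_HIGH request through while the normal one fails once the order-0 pages are discounted.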
1039 1039
1040 #ifdef CONFIG_NUMA 1040 #ifdef CONFIG_NUMA
1041 /* 1041 /*
1042 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to 1042 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
1043 * skip over zones that are not allowed by the cpuset, or that have 1043 * skip over zones that are not allowed by the cpuset, or that have
1044 * been recently (in the last second) found to be nearly full. See further 1044 * been recently (in the last second) found to be nearly full. See further
1045 * comments in mmzone.h. Reduces cache footprint of zonelist scans 1045 * comments in mmzone.h. Reduces cache footprint of zonelist scans
1046 * that have to skip over a lot of full or unallowed zones. 1046 * that have to skip over a lot of full or unallowed zones.
1047 * 1047 *
1048 * If the zonelist cache is present in the passed in zonelist, then 1048 * If the zonelist cache is present in the passed in zonelist, then
1049 * returns a pointer to the allowed node mask (either the current 1049 * returns a pointer to the allowed node mask (either the current
1050 * task's mems_allowed, or node_online_map.) 1050 * task's mems_allowed, or node_online_map.)
1051 * 1051 *
1052 * If the zonelist cache is not available for this zonelist, does 1052 * If the zonelist cache is not available for this zonelist, does
1053 * nothing and returns NULL. 1053 * nothing and returns NULL.
1054 * 1054 *
1055 * If the fullzones BITMAP in the zonelist cache is stale (more than 1055 * If the fullzones BITMAP in the zonelist cache is stale (more than
1056 * a second since last zap'd) then we zap it out (clear its bits.) 1056 * a second since last zap'd) then we zap it out (clear its bits.)
1057 * 1057 *
1058 * We hold off even calling zlc_setup, until after we've checked the 1058 * We hold off even calling zlc_setup, until after we've checked the
1059 * first zone in the zonelist, on the theory that most allocations will 1059 * first zone in the zonelist, on the theory that most allocations will
1060 * be satisfied from that first zone, so best to examine that zone as 1060 * be satisfied from that first zone, so best to examine that zone as
1061 * quickly as we can. 1061 * quickly as we can.
1062 */ 1062 */
1063 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1063 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1064 { 1064 {
1065 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1065 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1066 nodemask_t *allowednodes; /* zonelist_cache approximation */ 1066 nodemask_t *allowednodes; /* zonelist_cache approximation */
1067 1067
1068 zlc = zonelist->zlcache_ptr; 1068 zlc = zonelist->zlcache_ptr;
1069 if (!zlc) 1069 if (!zlc)
1070 return NULL; 1070 return NULL;
1071 1071
1072 if (jiffies - zlc->last_full_zap > 1 * HZ) { 1072 if (jiffies - zlc->last_full_zap > 1 * HZ) {
1073 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1073 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1074 zlc->last_full_zap = jiffies; 1074 zlc->last_full_zap = jiffies;
1075 } 1075 }
1076 1076
1077 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1077 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1078 &cpuset_current_mems_allowed : 1078 &cpuset_current_mems_allowed :
1079 &node_online_map; 1079 &node_online_map;
1080 return allowednodes; 1080 return allowednodes;
1081 } 1081 }
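
The setup above relies on zapping the fullzones bitmap once it is more than a second stale. A minimal userspace sketch of that expiry pattern, using wall-clock seconds in place of jiffies (the names here are illustrative, not kernel API):

    #include <stdio.h>
    #include <string.h>
    #include <time.h>

    #define NZONES 64

    struct zl_cache {
            unsigned char full[NZONES]; /* stand-in for the fullzones bitmap */
            time_t last_full_zap;       /* stand-in for zlc->last_full_zap */
    };

    /* Clear the "full" marks if they are more than one second old. */
    static void maybe_zap(struct zl_cache *zlc)
    {
            time_t now = time(NULL);

            if (now - zlc->last_full_zap > 1) {
                    memset(zlc->full, 0, sizeof(zlc->full));
                    zlc->last_full_zap = now;
            }
    }

    int main(void)
    {
            struct zl_cache zlc = { .last_full_zap = 0 };

            zlc.full[3] = 1;        /* pretend zone 3 was found full long ago */
            maybe_zap(&zlc);        /* stale: the mark is cleared */
            printf("zone 3 full after zap: %d\n", zlc.full[3]);
            return 0;
    }
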
1082 1082
1083 /* 1083 /*
1084 * Given 'z' scanning a zonelist, run a couple of quick checks to see 1084 * Given 'z' scanning a zonelist, run a couple of quick checks to see
1085 * if it is worth looking at further for free memory: 1085 * if it is worth looking at further for free memory:
1086 * 1) Check that the zone isn't thought to be full (doesn't have its 1086 * 1) Check that the zone isn't thought to be full (doesn't have its
1087 * bit set in the zonelist_cache fullzones BITMAP). 1087 * bit set in the zonelist_cache fullzones BITMAP).
1088 * 2) Check that the zone's node (obtained from the zonelist_cache 1088 * 2) Check that the zone's node (obtained from the zonelist_cache
1089 * z_to_n[] mapping) is allowed in the passed in allowednodes mask. 1089 * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
1090 * Return true (non-zero) if zone is worth looking at further, or 1090 * Return true (non-zero) if zone is worth looking at further, or
1091 * else return false (zero) if it is not. 1091 * else return false (zero) if it is not.
1092 * 1092 *
1093 * This check -ignores- the distinction between various watermarks, 1093 * This check -ignores- the distinction between various watermarks,
1094 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is 1094 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
1095 * found to be full for any variation of these watermarks, it will 1095 * found to be full for any variation of these watermarks, it will
1096 * be considered full for up to one second by all requests, unless 1096 * be considered full for up to one second by all requests, unless
1097 * we are so low on memory on all allowed nodes that we are forced 1097 * we are so low on memory on all allowed nodes that we are forced
1098 * into the second scan of the zonelist. 1098 * into the second scan of the zonelist.
1099 * 1099 *
1100 * In the second scan we ignore this zonelist cache and exactly 1100 * In the second scan we ignore this zonelist cache and exactly
1101 * apply the watermarks to all zones, even if it is slower to do so. 1101 * apply the watermarks to all zones, even if it is slower to do so.
1102 * We are low on memory in the second scan, and should leave no stone 1102 * We are low on memory in the second scan, and should leave no stone
1103 * unturned looking for a free page. 1103 * unturned looking for a free page.
1104 */ 1104 */
1105 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, 1105 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
1106 nodemask_t *allowednodes) 1106 nodemask_t *allowednodes)
1107 { 1107 {
1108 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1108 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1109 int i; /* index of *z in zonelist zones */ 1109 int i; /* index of *z in zonelist zones */
1110 int n; /* node that zone *z is on */ 1110 int n; /* node that zone *z is on */
1111 1111
1112 zlc = zonelist->zlcache_ptr; 1112 zlc = zonelist->zlcache_ptr;
1113 if (!zlc) 1113 if (!zlc)
1114 return 1; 1114 return 1;
1115 1115
1116 i = z - zonelist->zones; 1116 i = z - zonelist->zones;
1117 n = zlc->z_to_n[i]; 1117 n = zlc->z_to_n[i];
1118 1118
1119 /* This zone is worth trying if it is allowed but not full */ 1119 /* This zone is worth trying if it is allowed but not full */
1120 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); 1120 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1121 } 1121 }
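
The check above turns the scan cursor into an index by pointer subtraction and then consults the allowed-nodes mask and the fullzones bitmap. A userspace sketch of the same index-and-bitmap arithmetic, with plain arrays standing in for struct zone and the bitmaps:

    #include <stdio.h>

    #define NZONES 8

    int main(void)
    {
            int zones[NZONES];             /* stand-ins for struct zone objects */
            int *zonelist[NZONES];         /* array of zone pointers, as in struct zonelist */
            unsigned long fullzones = 0;   /* bit i set => zone i considered full */

            for (int k = 0; k < NZONES; k++)
                    zonelist[k] = &zones[k];

            int **z = &zonelist[5];        /* scan cursor somewhere in the list */
            long i = z - zonelist;         /* pointer difference gives the index */

            fullzones |= 1UL << 5;         /* mark that zone as full */
            int worth_trying = !(fullzones & (1UL << i));
            printf("index=%ld worth_trying=%d\n", i, worth_trying);
            return 0;
    }
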
1122 1122
1123 /* 1123 /*
1124 * Given 'z' scanning a zonelist, set the corresponding bit in 1124 * Given 'z' scanning a zonelist, set the corresponding bit in
1125 * zlc->fullzones, so that subsequent attempts to allocate a page 1125 * zlc->fullzones, so that subsequent attempts to allocate a page
1126 * from that zone don't waste time re-examining it. 1126 * from that zone don't waste time re-examining it.
1127 */ 1127 */
1128 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) 1128 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
1129 { 1129 {
1130 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1130 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1131 int i; /* index of *z in zonelist zones */ 1131 int i; /* index of *z in zonelist zones */
1132 1132
1133 zlc = zonelist->zlcache_ptr; 1133 zlc = zonelist->zlcache_ptr;
1134 if (!zlc) 1134 if (!zlc)
1135 return; 1135 return;
1136 1136
1137 i = z - zonelist->zones; 1137 i = z - zonelist->zones;
1138 1138
1139 set_bit(i, zlc->fullzones); 1139 set_bit(i, zlc->fullzones);
1140 } 1140 }
1141 1141
1142 #else /* CONFIG_NUMA */ 1142 #else /* CONFIG_NUMA */
1143 1143
1144 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1144 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1145 { 1145 {
1146 return NULL; 1146 return NULL;
1147 } 1147 }
1148 1148
1149 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, 1149 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
1150 nodemask_t *allowednodes) 1150 nodemask_t *allowednodes)
1151 { 1151 {
1152 return 1; 1152 return 1;
1153 } 1153 }
1154 1154
1155 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) 1155 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
1156 { 1156 {
1157 } 1157 }
1158 #endif /* CONFIG_NUMA */ 1158 #endif /* CONFIG_NUMA */
1159 1159
1160 /* 1160 /*
1161 * get_page_from_freelist goes through the zonelist trying to allocate 1161 * get_page_from_freelist goes through the zonelist trying to allocate
1162 * a page. 1162 * a page.
1163 */ 1163 */
1164 static struct page * 1164 static struct page *
1165 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, 1165 get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
1166 struct zonelist *zonelist, int alloc_flags) 1166 struct zonelist *zonelist, int alloc_flags)
1167 { 1167 {
1168 struct zone **z; 1168 struct zone **z;
1169 struct page *page = NULL; 1169 struct page *page = NULL;
1170 int classzone_idx = zone_idx(zonelist->zones[0]); 1170 int classzone_idx = zone_idx(zonelist->zones[0]);
1171 struct zone *zone; 1171 struct zone *zone;
1172 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1172 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1173 int zlc_active = 0; /* set if using zonelist_cache */ 1173 int zlc_active = 0; /* set if using zonelist_cache */
1174 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1174 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1175 1175
1176 zonelist_scan: 1176 zonelist_scan:
1177 /* 1177 /*
1178 * Scan zonelist, looking for a zone with enough free. 1178 * Scan zonelist, looking for a zone with enough free.
1179 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1179 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1180 */ 1180 */
1181 z = zonelist->zones; 1181 z = zonelist->zones;
1182 1182
1183 do { 1183 do {
1184 if (NUMA_BUILD && zlc_active && 1184 if (NUMA_BUILD && zlc_active &&
1185 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1185 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1186 continue; 1186 continue;
1187 zone = *z; 1187 zone = *z;
1188 if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && 1188 if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
1189 zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) 1189 zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
1190 break; 1190 break;
1191 if ((alloc_flags & ALLOC_CPUSET) && 1191 if ((alloc_flags & ALLOC_CPUSET) &&
1192 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1192 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1193 goto try_next_zone; 1193 goto try_next_zone;
1194 1194
1195 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1195 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
1196 unsigned long mark; 1196 unsigned long mark;
1197 if (alloc_flags & ALLOC_WMARK_MIN) 1197 if (alloc_flags & ALLOC_WMARK_MIN)
1198 mark = zone->pages_min; 1198 mark = zone->pages_min;
1199 else if (alloc_flags & ALLOC_WMARK_LOW) 1199 else if (alloc_flags & ALLOC_WMARK_LOW)
1200 mark = zone->pages_low; 1200 mark = zone->pages_low;
1201 else 1201 else
1202 mark = zone->pages_high; 1202 mark = zone->pages_high;
1203 if (!zone_watermark_ok(zone, order, mark, 1203 if (!zone_watermark_ok(zone, order, mark,
1204 classzone_idx, alloc_flags)) { 1204 classzone_idx, alloc_flags)) {
1205 if (!zone_reclaim_mode || 1205 if (!zone_reclaim_mode ||
1206 !zone_reclaim(zone, gfp_mask, order)) 1206 !zone_reclaim(zone, gfp_mask, order))
1207 goto this_zone_full; 1207 goto this_zone_full;
1208 } 1208 }
1209 } 1209 }
1210 1210
1211 page = buffered_rmqueue(zonelist, zone, order, gfp_mask); 1211 page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
1212 if (page) 1212 if (page)
1213 break; 1213 break;
1214 this_zone_full: 1214 this_zone_full:
1215 if (NUMA_BUILD) 1215 if (NUMA_BUILD)
1216 zlc_mark_zone_full(zonelist, z); 1216 zlc_mark_zone_full(zonelist, z);
1217 try_next_zone: 1217 try_next_zone:
1218 if (NUMA_BUILD && !did_zlc_setup) { 1218 if (NUMA_BUILD && !did_zlc_setup) {
1219 /* we do zlc_setup after the first zone is tried */ 1219 /* we do zlc_setup after the first zone is tried */
1220 allowednodes = zlc_setup(zonelist, alloc_flags); 1220 allowednodes = zlc_setup(zonelist, alloc_flags);
1221 zlc_active = 1; 1221 zlc_active = 1;
1222 did_zlc_setup = 1; 1222 did_zlc_setup = 1;
1223 } 1223 }
1224 } while (*(++z) != NULL); 1224 } while (*(++z) != NULL);
1225 1225
1226 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { 1226 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
1227 /* Disable zlc cache for second zonelist scan */ 1227 /* Disable zlc cache for second zonelist scan */
1228 zlc_active = 0; 1228 zlc_active = 0;
1229 goto zonelist_scan; 1229 goto zonelist_scan;
1230 } 1230 }
1231 return page; 1231 return page;
1232 } 1232 }
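
In short, get_page_from_freelist() makes two passes on NUMA builds: the first trusts the zonelist cache and skips zones believed full, and only if that yields nothing is zlc_active cleared and the scan repeated with exact checks. A compressed userspace sketch of that two-pass structure, with the actual allocation faked:

    #include <stdio.h>

    #define NZONES 4

    /* Pretend every zone is out of memory except the last one,
     * but the cache wrongly believes the last one is full too. */
    static int try_alloc(int zone) { return zone == 3; }

    int main(void)
    {
            int cached_full[NZONES] = { 0, 1, 1, 1 }; /* stale cache */
            int use_cache = 1;
            int got = -1;

    scan:
            for (int z = 0; z < NZONES; z++) {
                    if (use_cache && cached_full[z])
                            continue;          /* first pass: trust the cache */
                    if (try_alloc(z)) {
                            got = z;
                            break;
                    }
                    cached_full[z] = 1;        /* remember this zone as full */
            }
            if (got < 0 && use_cache) {
                    use_cache = 0;             /* second pass: ignore the cache */
                    goto scan;
            }

            printf("allocated from zone %d\n", got);
            return 0;
    }
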
1233 1233
1234 /* 1234 /*
1235 * This is the 'heart' of the zoned buddy allocator. 1235 * This is the 'heart' of the zoned buddy allocator.
1236 */ 1236 */
1237 struct page * fastcall 1237 struct page * fastcall
1238 __alloc_pages(gfp_t gfp_mask, unsigned int order, 1238 __alloc_pages(gfp_t gfp_mask, unsigned int order,
1239 struct zonelist *zonelist) 1239 struct zonelist *zonelist)
1240 { 1240 {
1241 const gfp_t wait = gfp_mask & __GFP_WAIT; 1241 const gfp_t wait = gfp_mask & __GFP_WAIT;
1242 struct zone **z; 1242 struct zone **z;
1243 struct page *page; 1243 struct page *page;
1244 struct reclaim_state reclaim_state; 1244 struct reclaim_state reclaim_state;
1245 struct task_struct *p = current; 1245 struct task_struct *p = current;
1246 int do_retry; 1246 int do_retry;
1247 int alloc_flags; 1247 int alloc_flags;
1248 int did_some_progress; 1248 int did_some_progress;
1249 1249
1250 might_sleep_if(wait); 1250 might_sleep_if(wait);
1251 1251
1252 if (should_fail_alloc_page(gfp_mask, order)) 1252 if (should_fail_alloc_page(gfp_mask, order))
1253 return NULL; 1253 return NULL;
1254 1254
1255 restart: 1255 restart:
1256 z = zonelist->zones; /* the list of zones suitable for gfp_mask */ 1256 z = zonelist->zones; /* the list of zones suitable for gfp_mask */
1257 1257
1258 if (unlikely(*z == NULL)) { 1258 if (unlikely(*z == NULL)) {
1259 /* Should this ever happen?? */ 1259 /* Should this ever happen?? */
1260 return NULL; 1260 return NULL;
1261 } 1261 }
1262 1262
1263 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, 1263 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
1264 zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); 1264 zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
1265 if (page) 1265 if (page)
1266 goto got_pg; 1266 goto got_pg;
1267 1267
1268 /* 1268 /*
1269 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 1269 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
1270 * __GFP_NOWARN set) should not cause reclaim since the subsystem 1270 * __GFP_NOWARN set) should not cause reclaim since the subsystem
1271 * (e.g. slab) using GFP_THISNODE may choose to trigger reclaim 1271 * (e.g. slab) using GFP_THISNODE may choose to trigger reclaim
1272 * using a larger set of nodes after it has established that the 1272 * using a larger set of nodes after it has established that the
1273 * allowed per node queues are empty and that nodes are 1273 * allowed per node queues are empty and that nodes are
1274 * over allocated. 1274 * over allocated.
1275 */ 1275 */
1276 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 1276 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
1277 goto nopage; 1277 goto nopage;
1278 1278
1279 for (z = zonelist->zones; *z; z++) 1279 for (z = zonelist->zones; *z; z++)
1280 wakeup_kswapd(*z, order); 1280 wakeup_kswapd(*z, order);
1281 1281
1282 /* 1282 /*
1283 * OK, we're below the kswapd watermark and have kicked background 1283 * OK, we're below the kswapd watermark and have kicked background
1284 * reclaim. Now things get more complex, so set up alloc_flags according 1284 * reclaim. Now things get more complex, so set up alloc_flags according
1285 * to how we want to proceed. 1285 * to how we want to proceed.
1286 * 1286 *
1287 * The caller may dip into page reserves a bit more if the caller 1287 * The caller may dip into page reserves a bit more if the caller
1288 * cannot run direct reclaim, or if the caller has realtime scheduling 1288 * cannot run direct reclaim, or if the caller has realtime scheduling
1289 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 1289 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1290 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). 1290 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1291 */ 1291 */
1292 alloc_flags = ALLOC_WMARK_MIN; 1292 alloc_flags = ALLOC_WMARK_MIN;
1293 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) 1293 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
1294 alloc_flags |= ALLOC_HARDER; 1294 alloc_flags |= ALLOC_HARDER;
1295 if (gfp_mask & __GFP_HIGH) 1295 if (gfp_mask & __GFP_HIGH)
1296 alloc_flags |= ALLOC_HIGH; 1296 alloc_flags |= ALLOC_HIGH;
1297 if (wait) 1297 if (wait)
1298 alloc_flags |= ALLOC_CPUSET; 1298 alloc_flags |= ALLOC_CPUSET;
1299 1299
1300 /* 1300 /*
1301 * Go through the zonelist again. Let __GFP_HIGH and allocations 1301 * Go through the zonelist again. Let __GFP_HIGH and allocations
1302 * coming from realtime tasks go deeper into reserves. 1302 * coming from realtime tasks go deeper into reserves.
1303 * 1303 *
1304 * This is the last chance, in general, before the goto nopage. 1304 * This is the last chance, in general, before the goto nopage.
1305 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 1305 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1306 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1306 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1307 */ 1307 */
1308 page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags); 1308 page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
1309 if (page) 1309 if (page)
1310 goto got_pg; 1310 goto got_pg;
1311 1311
1312 /* This allocation should allow future memory freeing. */ 1312 /* This allocation should allow future memory freeing. */
1313 1313
1314 rebalance: 1314 rebalance:
1315 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) 1315 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
1316 && !in_interrupt()) { 1316 && !in_interrupt()) {
1317 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 1317 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
1318 nofail_alloc: 1318 nofail_alloc:
1319 /* go through the zonelist yet again, ignoring mins */ 1319 /* go through the zonelist yet again, ignoring mins */
1320 page = get_page_from_freelist(gfp_mask, order, 1320 page = get_page_from_freelist(gfp_mask, order,
1321 zonelist, ALLOC_NO_WATERMARKS); 1321 zonelist, ALLOC_NO_WATERMARKS);
1322 if (page) 1322 if (page)
1323 goto got_pg; 1323 goto got_pg;
1324 if (gfp_mask & __GFP_NOFAIL) { 1324 if (gfp_mask & __GFP_NOFAIL) {
1325 congestion_wait(WRITE, HZ/50); 1325 congestion_wait(WRITE, HZ/50);
1326 goto nofail_alloc; 1326 goto nofail_alloc;
1327 } 1327 }
1328 } 1328 }
1329 goto nopage; 1329 goto nopage;
1330 } 1330 }
1331 1331
1332 /* Atomic allocations - we can't balance anything */ 1332 /* Atomic allocations - we can't balance anything */
1333 if (!wait) 1333 if (!wait)
1334 goto nopage; 1334 goto nopage;
1335 1335
1336 cond_resched(); 1336 cond_resched();
1337 1337
1338 /* We now go into synchronous reclaim */ 1338 /* We now go into synchronous reclaim */
1339 cpuset_memory_pressure_bump(); 1339 cpuset_memory_pressure_bump();
1340 p->flags |= PF_MEMALLOC; 1340 p->flags |= PF_MEMALLOC;
1341 reclaim_state.reclaimed_slab = 0; 1341 reclaim_state.reclaimed_slab = 0;
1342 p->reclaim_state = &reclaim_state; 1342 p->reclaim_state = &reclaim_state;
1343 1343
1344 did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask); 1344 did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
1345 1345
1346 p->reclaim_state = NULL; 1346 p->reclaim_state = NULL;
1347 p->flags &= ~PF_MEMALLOC; 1347 p->flags &= ~PF_MEMALLOC;
1348 1348
1349 cond_resched(); 1349 cond_resched();
1350 1350
1351 if (likely(did_some_progress)) { 1351 if (likely(did_some_progress)) {
1352 page = get_page_from_freelist(gfp_mask, order, 1352 page = get_page_from_freelist(gfp_mask, order,
1353 zonelist, alloc_flags); 1353 zonelist, alloc_flags);
1354 if (page) 1354 if (page)
1355 goto got_pg; 1355 goto got_pg;
1356 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 1356 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1357 /* 1357 /*
1358 * Go through the zonelist yet one more time, keep 1358 * Go through the zonelist yet one more time, keep
1359 * very high watermark here, this is only to catch 1359 * very high watermark here, this is only to catch
1360 * a parallel oom killing, we must fail if we're still 1360 * a parallel oom killing, we must fail if we're still
1361 * under heavy pressure. 1361 * under heavy pressure.
1362 */ 1362 */
1363 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, 1363 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
1364 zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); 1364 zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
1365 if (page) 1365 if (page)
1366 goto got_pg; 1366 goto got_pg;
1367 1367
1368 out_of_memory(zonelist, gfp_mask, order); 1368 out_of_memory(zonelist, gfp_mask, order);
1369 goto restart; 1369 goto restart;
1370 } 1370 }
1371 1371
1372 /* 1372 /*
1373 * Don't let big-order allocations loop unless the caller explicitly 1373 * Don't let big-order allocations loop unless the caller explicitly
1374 * requests that. Wait for some write requests to complete then retry. 1374 * requests that. Wait for some write requests to complete then retry.
1375 * 1375 *
1376 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order 1376 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
1377 * <= 3, but that may not be true in other implementations. 1377 * <= 3, but that may not be true in other implementations.
1378 */ 1378 */
1379 do_retry = 0; 1379 do_retry = 0;
1380 if (!(gfp_mask & __GFP_NORETRY)) { 1380 if (!(gfp_mask & __GFP_NORETRY)) {
1381 if ((order <= 3) || (gfp_mask & __GFP_REPEAT)) 1381 if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
1382 do_retry = 1; 1382 do_retry = 1;
1383 if (gfp_mask & __GFP_NOFAIL) 1383 if (gfp_mask & __GFP_NOFAIL)
1384 do_retry = 1; 1384 do_retry = 1;
1385 } 1385 }
1386 if (do_retry) { 1386 if (do_retry) {
1387 congestion_wait(WRITE, HZ/50); 1387 congestion_wait(WRITE, HZ/50);
1388 goto rebalance; 1388 goto rebalance;
1389 } 1389 }
1390 1390
1391 nopage: 1391 nopage:
1392 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { 1392 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
1393 printk(KERN_WARNING "%s: page allocation failure." 1393 printk(KERN_WARNING "%s: page allocation failure."
1394 " order:%d, mode:0x%x\n", 1394 " order:%d, mode:0x%x\n",
1395 p->comm, order, gfp_mask); 1395 p->comm, order, gfp_mask);
1396 dump_stack(); 1396 dump_stack();
1397 show_mem(); 1397 show_mem();
1398 } 1398 }
1399 got_pg: 1399 got_pg:
1400 return page; 1400 return page;
1401 } 1401 }
1402 1402
1403 EXPORT_SYMBOL(__alloc_pages); 1403 EXPORT_SYMBOL(__alloc_pages);
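
The alloc_flags selection in the middle of __alloc_pages() can be read as a small flag-mapping table: callers that cannot sleep or are realtime get ALLOC_HARDER, __GFP_HIGH adds ALLOC_HIGH, and only sleeping callers honour cpusets. The sketch below mirrors that mapping in userspace; the flag values are illustrative and are not the kernel's real gfp or ALLOC_* bit layout:

    #include <stdio.h>

    #define GFP_WAIT     0x1
    #define GFP_HIGH     0x2

    #define AF_WMARK_MIN 0x01
    #define AF_HARDER    0x02
    #define AF_HIGH      0x04
    #define AF_CPUSET    0x08

    static int pick_alloc_flags(int gfp, int rt_task)
    {
            int flags = AF_WMARK_MIN;

            if (rt_task || !(gfp & GFP_WAIT))
                    flags |= AF_HARDER;     /* atomic / realtime: dig deeper */
            if (gfp & GFP_HIGH)
                    flags |= AF_HIGH;
            if (gfp & GFP_WAIT)
                    flags |= AF_CPUSET;     /* only sleeping allocs obey cpusets here */
            return flags;
    }

    int main(void)
    {
            printf("GFP_ATOMIC-like : %#x\n", pick_alloc_flags(GFP_HIGH, 0));
            printf("GFP_KERNEL-like : %#x\n", pick_alloc_flags(GFP_WAIT, 0));
            return 0;
    }
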
1404 1404
1405 /* 1405 /*
1406 * Common helper functions. 1406 * Common helper functions.
1407 */ 1407 */
1408 fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 1408 fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
1409 { 1409 {
1410 struct page * page; 1410 struct page * page;
1411 page = alloc_pages(gfp_mask, order); 1411 page = alloc_pages(gfp_mask, order);
1412 if (!page) 1412 if (!page)
1413 return 0; 1413 return 0;
1414 return (unsigned long) page_address(page); 1414 return (unsigned long) page_address(page);
1415 } 1415 }
1416 1416
1417 EXPORT_SYMBOL(__get_free_pages); 1417 EXPORT_SYMBOL(__get_free_pages);
1418 1418
1419 fastcall unsigned long get_zeroed_page(gfp_t gfp_mask) 1419 fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
1420 { 1420 {
1421 struct page * page; 1421 struct page * page;
1422 1422
1423 /* 1423 /*
1424 * get_zeroed_page() returns a 32-bit address, which cannot represent 1424 * get_zeroed_page() returns a 32-bit address, which cannot represent
1425 * a highmem page 1425 * a highmem page
1426 */ 1426 */
1427 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 1427 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1428 1428
1429 page = alloc_pages(gfp_mask | __GFP_ZERO, 0); 1429 page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
1430 if (page) 1430 if (page)
1431 return (unsigned long) page_address(page); 1431 return (unsigned long) page_address(page);
1432 return 0; 1432 return 0;
1433 } 1433 }
1434 1434
1435 EXPORT_SYMBOL(get_zeroed_page); 1435 EXPORT_SYMBOL(get_zeroed_page);
1436 1436
1437 void __pagevec_free(struct pagevec *pvec) 1437 void __pagevec_free(struct pagevec *pvec)
1438 { 1438 {
1439 int i = pagevec_count(pvec); 1439 int i = pagevec_count(pvec);
1440 1440
1441 while (--i >= 0) 1441 while (--i >= 0)
1442 free_hot_cold_page(pvec->pages[i], pvec->cold); 1442 free_hot_cold_page(pvec->pages[i], pvec->cold);
1443 } 1443 }
1444 1444
1445 fastcall void __free_pages(struct page *page, unsigned int order) 1445 fastcall void __free_pages(struct page *page, unsigned int order)
1446 { 1446 {
1447 if (put_page_testzero(page)) { 1447 if (put_page_testzero(page)) {
1448 if (order == 0) 1448 if (order == 0)
1449 free_hot_page(page); 1449 free_hot_page(page);
1450 else 1450 else
1451 __free_pages_ok(page, order); 1451 __free_pages_ok(page, order);
1452 } 1452 }
1453 } 1453 }
1454 1454
1455 EXPORT_SYMBOL(__free_pages); 1455 EXPORT_SYMBOL(__free_pages);
1456 1456
1457 fastcall void free_pages(unsigned long addr, unsigned int order) 1457 fastcall void free_pages(unsigned long addr, unsigned int order)
1458 { 1458 {
1459 if (addr != 0) { 1459 if (addr != 0) {
1460 VM_BUG_ON(!virt_addr_valid((void *)addr)); 1460 VM_BUG_ON(!virt_addr_valid((void *)addr));
1461 __free_pages(virt_to_page((void *)addr), order); 1461 __free_pages(virt_to_page((void *)addr), order);
1462 } 1462 }
1463 } 1463 }
1464 1464
1465 EXPORT_SYMBOL(free_pages); 1465 EXPORT_SYMBOL(free_pages);
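
A minimal in-kernel usage sketch of the helpers above, pairing __get_free_pages() with free_pages() at the same order (a fragment for illustration, not a standalone program):

    unsigned long buf;

    buf = __get_free_pages(GFP_KERNEL, 2);  /* four contiguous pages, may sleep */
    if (!buf)
            return -ENOMEM;
    /* ... use the 16 KB (with 4 KB pages) buffer at (void *)buf ... */
    free_pages(buf, 2);                     /* the order must match the allocation */
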
1466 1466
1467 static unsigned int nr_free_zone_pages(int offset) 1467 static unsigned int nr_free_zone_pages(int offset)
1468 { 1468 {
1469 /* Just pick one node, since fallback list is circular */ 1469 /* Just pick one node, since fallback list is circular */
1470 pg_data_t *pgdat = NODE_DATA(numa_node_id()); 1470 pg_data_t *pgdat = NODE_DATA(numa_node_id());
1471 unsigned int sum = 0; 1471 unsigned int sum = 0;
1472 1472
1473 struct zonelist *zonelist = pgdat->node_zonelists + offset; 1473 struct zonelist *zonelist = pgdat->node_zonelists + offset;
1474 struct zone **zonep = zonelist->zones; 1474 struct zone **zonep = zonelist->zones;
1475 struct zone *zone; 1475 struct zone *zone;
1476 1476
1477 for (zone = *zonep++; zone; zone = *zonep++) { 1477 for (zone = *zonep++; zone; zone = *zonep++) {
1478 unsigned long size = zone->present_pages; 1478 unsigned long size = zone->present_pages;
1479 unsigned long high = zone->pages_high; 1479 unsigned long high = zone->pages_high;
1480 if (size > high) 1480 if (size > high)
1481 sum += size - high; 1481 sum += size - high;
1482 } 1482 }
1483 1483
1484 return sum; 1484 return sum;
1485 } 1485 }
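
nr_free_zone_pages() only counts, per zone, the pages above the high watermark. A standalone arithmetic sketch with made-up zone sizes:

    #include <stdio.h>

    int main(void)
    {
            /* Hypothetical zones: { present_pages, pages_high }, in pages. */
            unsigned long zones[][2] = {
                    { 4096,   32 },   /* small DMA-like zone   */
                    { 225280, 1536 }, /* larger Normal-like zone */
            };
            unsigned long sum = 0;

            for (unsigned i = 0; i < sizeof(zones) / sizeof(zones[0]); i++) {
                    unsigned long size = zones[i][0], high = zones[i][1];
                    if (size > high)
                            sum += size - high; /* same accumulation as above */
            }
            printf("allocatable estimate: %lu pages\n", sum); /* 4064 + 223744 = 227808 */
            return 0;
    }
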
1486 1486
1487 /* 1487 /*
1488 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL 1488 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
1489 */ 1489 */
1490 unsigned int nr_free_buffer_pages(void) 1490 unsigned int nr_free_buffer_pages(void)
1491 { 1491 {
1492 return nr_free_zone_pages(gfp_zone(GFP_USER)); 1492 return nr_free_zone_pages(gfp_zone(GFP_USER));
1493 } 1493 }
1494 1494
1495 /* 1495 /*
1496 * Amount of free RAM allocatable within all zones 1496 * Amount of free RAM allocatable within all zones
1497 */ 1497 */
1498 unsigned int nr_free_pagecache_pages(void) 1498 unsigned int nr_free_pagecache_pages(void)
1499 { 1499 {
1500 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); 1500 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
1501 } 1501 }
1502 1502
1503 static inline void show_node(struct zone *zone) 1503 static inline void show_node(struct zone *zone)
1504 { 1504 {
1505 if (NUMA_BUILD) 1505 if (NUMA_BUILD)
1506 printk("Node %d ", zone_to_nid(zone)); 1506 printk("Node %d ", zone_to_nid(zone));
1507 } 1507 }
1508 1508
1509 void si_meminfo(struct sysinfo *val) 1509 void si_meminfo(struct sysinfo *val)
1510 { 1510 {
1511 val->totalram = totalram_pages; 1511 val->totalram = totalram_pages;
1512 val->sharedram = 0; 1512 val->sharedram = 0;
1513 val->freeram = global_page_state(NR_FREE_PAGES); 1513 val->freeram = global_page_state(NR_FREE_PAGES);
1514 val->bufferram = nr_blockdev_pages(); 1514 val->bufferram = nr_blockdev_pages();
1515 val->totalhigh = totalhigh_pages; 1515 val->totalhigh = totalhigh_pages;
1516 val->freehigh = nr_free_highpages(); 1516 val->freehigh = nr_free_highpages();
1517 val->mem_unit = PAGE_SIZE; 1517 val->mem_unit = PAGE_SIZE;
1518 } 1518 }
1519 1519
1520 EXPORT_SYMBOL(si_meminfo); 1520 EXPORT_SYMBOL(si_meminfo);
1521 1521
1522 #ifdef CONFIG_NUMA 1522 #ifdef CONFIG_NUMA
1523 void si_meminfo_node(struct sysinfo *val, int nid) 1523 void si_meminfo_node(struct sysinfo *val, int nid)
1524 { 1524 {
1525 pg_data_t *pgdat = NODE_DATA(nid); 1525 pg_data_t *pgdat = NODE_DATA(nid);
1526 1526
1527 val->totalram = pgdat->node_present_pages; 1527 val->totalram = pgdat->node_present_pages;
1528 val->freeram = node_page_state(nid, NR_FREE_PAGES); 1528 val->freeram = node_page_state(nid, NR_FREE_PAGES);
1529 #ifdef CONFIG_HIGHMEM 1529 #ifdef CONFIG_HIGHMEM
1530 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; 1530 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
1531 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], 1531 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
1532 NR_FREE_PAGES); 1532 NR_FREE_PAGES);
1533 #else 1533 #else
1534 val->totalhigh = 0; 1534 val->totalhigh = 0;
1535 val->freehigh = 0; 1535 val->freehigh = 0;
1536 #endif 1536 #endif
1537 val->mem_unit = PAGE_SIZE; 1537 val->mem_unit = PAGE_SIZE;
1538 } 1538 }
1539 #endif 1539 #endif
1540 1540
1541 #define K(x) ((x) << (PAGE_SHIFT-10)) 1541 #define K(x) ((x) << (PAGE_SHIFT-10))
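
K() converts a page count to kilobytes; with 4 KB pages PAGE_SHIFT is 12, so the shift by PAGE_SHIFT-10 is a multiply by four. A quick standalone check (PAGE_SHIFT hard-coded here as an assumption):

    #include <stdio.h>

    #define PAGE_SHIFT 12                      /* assuming 4 KB pages */
    #define K(x) ((x) << (PAGE_SHIFT - 10))    /* pages -> kilobytes */

    int main(void)
    {
            printf("%lu pages = %lu kB\n", 1024UL, K(1024UL)); /* 1024 pages = 4096 kB */
            return 0;
    }
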
1542 1542
1543 /* 1543 /*
1544 * Show free area list (used inside shift_scroll-lock stuff) 1544 * Show free area list (used inside shift_scroll-lock stuff)
1545 * We also calculate the percentage fragmentation. We do this by counting the 1545 * We also calculate the percentage fragmentation. We do this by counting the
1546 * memory on each free list with the exception of the first item on the list. 1546 * memory on each free list with the exception of the first item on the list.
1547 */ 1547 */
1548 void show_free_areas(void) 1548 void show_free_areas(void)
1549 { 1549 {
1550 int cpu; 1550 int cpu;
1551 struct zone *zone; 1551 struct zone *zone;
1552 1552
1553 for_each_zone(zone) { 1553 for_each_zone(zone) {
1554 if (!populated_zone(zone)) 1554 if (!populated_zone(zone))
1555 continue; 1555 continue;
1556 1556
1557 show_node(zone); 1557 show_node(zone);
1558 printk("%s per-cpu:\n", zone->name); 1558 printk("%s per-cpu:\n", zone->name);
1559 1559
1560 for_each_online_cpu(cpu) { 1560 for_each_online_cpu(cpu) {
1561 struct per_cpu_pageset *pageset; 1561 struct per_cpu_pageset *pageset;
1562 1562
1563 pageset = zone_pcp(zone, cpu); 1563 pageset = zone_pcp(zone, cpu);
1564 1564
1565 printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d " 1565 printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d "
1566 "Cold: hi:%5d, btch:%4d usd:%4d\n", 1566 "Cold: hi:%5d, btch:%4d usd:%4d\n",
1567 cpu, pageset->pcp[0].high, 1567 cpu, pageset->pcp[0].high,
1568 pageset->pcp[0].batch, pageset->pcp[0].count, 1568 pageset->pcp[0].batch, pageset->pcp[0].count,
1569 pageset->pcp[1].high, pageset->pcp[1].batch, 1569 pageset->pcp[1].high, pageset->pcp[1].batch,
1570 pageset->pcp[1].count); 1570 pageset->pcp[1].count);
1571 } 1571 }
1572 } 1572 }
1573 1573
1574 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n" 1574 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n"
1575 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", 1575 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
1576 global_page_state(NR_ACTIVE), 1576 global_page_state(NR_ACTIVE),
1577 global_page_state(NR_INACTIVE), 1577 global_page_state(NR_INACTIVE),
1578 global_page_state(NR_FILE_DIRTY), 1578 global_page_state(NR_FILE_DIRTY),
1579 global_page_state(NR_WRITEBACK), 1579 global_page_state(NR_WRITEBACK),
1580 global_page_state(NR_UNSTABLE_NFS), 1580 global_page_state(NR_UNSTABLE_NFS),
1581 global_page_state(NR_FREE_PAGES), 1581 global_page_state(NR_FREE_PAGES),
1582 global_page_state(NR_SLAB_RECLAIMABLE) + 1582 global_page_state(NR_SLAB_RECLAIMABLE) +
1583 global_page_state(NR_SLAB_UNRECLAIMABLE), 1583 global_page_state(NR_SLAB_UNRECLAIMABLE),
1584 global_page_state(NR_FILE_MAPPED), 1584 global_page_state(NR_FILE_MAPPED),
1585 global_page_state(NR_PAGETABLE), 1585 global_page_state(NR_PAGETABLE),
1586 global_page_state(NR_BOUNCE)); 1586 global_page_state(NR_BOUNCE));
1587 1587
1588 for_each_zone(zone) { 1588 for_each_zone(zone) {
1589 int i; 1589 int i;
1590 1590
1591 if (!populated_zone(zone)) 1591 if (!populated_zone(zone))
1592 continue; 1592 continue;
1593 1593
1594 show_node(zone); 1594 show_node(zone);
1595 printk("%s" 1595 printk("%s"
1596 " free:%lukB" 1596 " free:%lukB"
1597 " min:%lukB" 1597 " min:%lukB"
1598 " low:%lukB" 1598 " low:%lukB"
1599 " high:%lukB" 1599 " high:%lukB"
1600 " active:%lukB" 1600 " active:%lukB"
1601 " inactive:%lukB" 1601 " inactive:%lukB"
1602 " present:%lukB" 1602 " present:%lukB"
1603 " pages_scanned:%lu" 1603 " pages_scanned:%lu"
1604 " all_unreclaimable? %s" 1604 " all_unreclaimable? %s"
1605 "\n", 1605 "\n",
1606 zone->name, 1606 zone->name,
1607 K(zone_page_state(zone, NR_FREE_PAGES)), 1607 K(zone_page_state(zone, NR_FREE_PAGES)),
1608 K(zone->pages_min), 1608 K(zone->pages_min),
1609 K(zone->pages_low), 1609 K(zone->pages_low),
1610 K(zone->pages_high), 1610 K(zone->pages_high),
1611 K(zone_page_state(zone, NR_ACTIVE)), 1611 K(zone_page_state(zone, NR_ACTIVE)),
1612 K(zone_page_state(zone, NR_INACTIVE)), 1612 K(zone_page_state(zone, NR_INACTIVE)),
1613 K(zone->present_pages), 1613 K(zone->present_pages),
1614 zone->pages_scanned, 1614 zone->pages_scanned,
1615 (zone->all_unreclaimable ? "yes" : "no") 1615 (zone->all_unreclaimable ? "yes" : "no")
1616 ); 1616 );
1617 printk("lowmem_reserve[]:"); 1617 printk("lowmem_reserve[]:");
1618 for (i = 0; i < MAX_NR_ZONES; i++) 1618 for (i = 0; i < MAX_NR_ZONES; i++)
1619 printk(" %lu", zone->lowmem_reserve[i]); 1619 printk(" %lu", zone->lowmem_reserve[i]);
1620 printk("\n"); 1620 printk("\n");
1621 } 1621 }
1622 1622
1623 for_each_zone(zone) { 1623 for_each_zone(zone) {
1624 unsigned long nr[MAX_ORDER], flags, order, total = 0; 1624 unsigned long nr[MAX_ORDER], flags, order, total = 0;
1625 1625
1626 if (!populated_zone(zone)) 1626 if (!populated_zone(zone))
1627 continue; 1627 continue;
1628 1628
1629 show_node(zone); 1629 show_node(zone);
1630 printk("%s: ", zone->name); 1630 printk("%s: ", zone->name);
1631 1631
1632 spin_lock_irqsave(&zone->lock, flags); 1632 spin_lock_irqsave(&zone->lock, flags);
1633 for (order = 0; order < MAX_ORDER; order++) { 1633 for (order = 0; order < MAX_ORDER; order++) {
1634 nr[order] = zone->free_area[order].nr_free; 1634 nr[order] = zone->free_area[order].nr_free;
1635 total += nr[order] << order; 1635 total += nr[order] << order;
1636 } 1636 }
1637 spin_unlock_irqrestore(&zone->lock, flags); 1637 spin_unlock_irqrestore(&zone->lock, flags);
1638 for (order = 0; order < MAX_ORDER; order++) 1638 for (order = 0; order < MAX_ORDER; order++)
1639 printk("%lu*%lukB ", nr[order], K(1UL) << order); 1639 printk("%lu*%lukB ", nr[order], K(1UL) << order);
1640 printk("= %lukB\n", K(total)); 1640 printk("= %lukB\n", K(total));
1641 } 1641 }
1642 1642
1643 show_swap_cache_info(); 1643 show_swap_cache_info();
1644 } 1644 }
1645 1645
1646 /* 1646 /*
1647 * Builds allocation fallback zone lists. 1647 * Builds allocation fallback zone lists.
1648 * 1648 *
1649 * Add all populated zones of a node to the zonelist. 1649 * Add all populated zones of a node to the zonelist.
1650 */ 1650 */
1651 static int __meminit build_zonelists_node(pg_data_t *pgdat, 1651 static int __meminit build_zonelists_node(pg_data_t *pgdat,
1652 struct zonelist *zonelist, int nr_zones, enum zone_type zone_type) 1652 struct zonelist *zonelist, int nr_zones, enum zone_type zone_type)
1653 { 1653 {
1654 struct zone *zone; 1654 struct zone *zone;
1655 1655
1656 BUG_ON(zone_type >= MAX_NR_ZONES); 1656 BUG_ON(zone_type >= MAX_NR_ZONES);
1657 zone_type++; 1657 zone_type++;
1658 1658
1659 do { 1659 do {
1660 zone_type--; 1660 zone_type--;
1661 zone = pgdat->node_zones + zone_type; 1661 zone = pgdat->node_zones + zone_type;
1662 if (populated_zone(zone)) { 1662 if (populated_zone(zone)) {
1663 zonelist->zones[nr_zones++] = zone; 1663 zonelist->zones[nr_zones++] = zone;
1664 check_highest_zone(zone_type); 1664 check_highest_zone(zone_type);
1665 } 1665 }
1666 1666
1667 } while (zone_type); 1667 } while (zone_type);
1668 return nr_zones; 1668 return nr_zones;
1669 } 1669 }
1670 1670
1671 #ifdef CONFIG_NUMA 1671 #ifdef CONFIG_NUMA
1672 #define MAX_NODE_LOAD (num_online_nodes()) 1672 #define MAX_NODE_LOAD (num_online_nodes())
1673 static int __meminitdata node_load[MAX_NUMNODES]; 1673 static int __meminitdata node_load[MAX_NUMNODES];
1674 /** 1674 /**
1675 * find_next_best_node - find the next node that should appear in a given node's fallback list 1675 * find_next_best_node - find the next node that should appear in a given node's fallback list
1676 * @node: node whose fallback list we're appending 1676 * @node: node whose fallback list we're appending
1677 * @used_node_mask: nodemask_t of already used nodes 1677 * @used_node_mask: nodemask_t of already used nodes
1678 * 1678 *
1679 * We use a number of factors to determine which is the next node that should 1679 * We use a number of factors to determine which is the next node that should
1680 * appear on a given node's fallback list. The node should not have appeared 1680 * appear on a given node's fallback list. The node should not have appeared
1681 * already in @node's fallback list, and it should be the next closest node 1681 * already in @node's fallback list, and it should be the next closest node
1682 * according to the distance array (which contains arbitrary distance values 1682 * according to the distance array (which contains arbitrary distance values
1683 * from each node to each node in the system), and should also prefer nodes 1683 * from each node to each node in the system), and should also prefer nodes
1684 * with no CPUs, since presumably they'll have very little allocation pressure 1684 * with no CPUs, since presumably they'll have very little allocation pressure
1685 * on them otherwise. 1685 * on them otherwise.
1686 * It returns -1 if no node is found. 1686 * It returns -1 if no node is found.
1687 */ 1687 */
1688 static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask) 1688 static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
1689 { 1689 {
1690 int n, val; 1690 int n, val;
1691 int min_val = INT_MAX; 1691 int min_val = INT_MAX;
1692 int best_node = -1; 1692 int best_node = -1;
1693 1693
1694 /* Use the local node if we haven't already */ 1694 /* Use the local node if we haven't already */
1695 if (!node_isset(node, *used_node_mask)) { 1695 if (!node_isset(node, *used_node_mask)) {
1696 node_set(node, *used_node_mask); 1696 node_set(node, *used_node_mask);
1697 return node; 1697 return node;
1698 } 1698 }
1699 1699
1700 for_each_online_node(n) { 1700 for_each_online_node(n) {
1701 cpumask_t tmp; 1701 cpumask_t tmp;
1702 1702
1703 /* Don't want a node to appear more than once */ 1703 /* Don't want a node to appear more than once */
1704 if (node_isset(n, *used_node_mask)) 1704 if (node_isset(n, *used_node_mask))
1705 continue; 1705 continue;
1706 1706
1707 /* Use the distance array to find the distance */ 1707 /* Use the distance array to find the distance */
1708 val = node_distance(node, n); 1708 val = node_distance(node, n);
1709 1709
1710 /* Penalize nodes under us ("prefer the next node") */ 1710 /* Penalize nodes under us ("prefer the next node") */
1711 val += (n < node); 1711 val += (n < node);
1712 1712
1713 /* Give preference to headless and unused nodes */ 1713 /* Give preference to headless and unused nodes */
1714 tmp = node_to_cpumask(n); 1714 tmp = node_to_cpumask(n);
1715 if (!cpus_empty(tmp)) 1715 if (!cpus_empty(tmp))
1716 val += PENALTY_FOR_NODE_WITH_CPUS; 1716 val += PENALTY_FOR_NODE_WITH_CPUS;
1717 1717
1718 /* Slight preference for less loaded node */ 1718 /* Slight preference for less loaded node */
1719 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 1719 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
1720 val += node_load[n]; 1720 val += node_load[n];
1721 1721
1722 if (val < min_val) { 1722 if (val < min_val) {
1723 min_val = val; 1723 min_val = val;
1724 best_node = n; 1724 best_node = n;
1725 } 1725 }
1726 } 1726 }
1727 1727
1728 if (best_node >= 0) 1728 if (best_node >= 0)
1729 node_set(best_node, *used_node_mask); 1729 node_set(best_node, *used_node_mask);
1730 1730
1731 return best_node; 1731 return best_node;
1732 } 1732 }
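
The scoring above boils down to: distance dominates, headless nodes and "next" nodes get a small bonus, and node_load only breaks ties because the distance term is scaled up first. A standalone sketch with made-up distances (the penalty constant is illustrative, not the kernel's value):

    #include <stdio.h>
    #include <limits.h>

    #define MAX_NUMNODES 4
    #define PENALTY_FOR_NODE_WITH_CPUS 1    /* illustrative value */

    int main(void)
    {
            /* Hypothetical distances from node 0 to each node. */
            int distance[MAX_NUMNODES]  = { 10, 20, 20, 40 };
            int has_cpus[MAX_NUMNODES]  = { 1, 1, 0, 1 };
            int node_load[MAX_NUMNODES] = { 0, 0, 0, 0 };
            int nr_online = MAX_NUMNODES;
            int node = 0, best = -1, min_val = INT_MAX;

            for (int n = 1; n < MAX_NUMNODES; n++) {    /* node 0 already used */
                    int val = distance[n];
                    val += (n < node);                  /* prefer the next node */
                    if (has_cpus[n])
                            val += PENALTY_FOR_NODE_WITH_CPUS; /* prefer headless nodes */
                    val *= nr_online * MAX_NUMNODES;    /* distance dominates ...   */
                    val += node_load[n];                /* ... load only breaks ties */
                    if (val < min_val) {
                            min_val = val;
                            best = n;
                    }
            }
            printf("best fallback node: %d\n", best);   /* node 2: same distance as 1, no CPUs */
            return 0;
    }
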
1733 1733
1734 static void __meminit build_zonelists(pg_data_t *pgdat) 1734 static void __meminit build_zonelists(pg_data_t *pgdat)
1735 { 1735 {
1736 int j, node, local_node; 1736 int j, node, local_node;
1737 enum zone_type i; 1737 enum zone_type i;
1738 int prev_node, load; 1738 int prev_node, load;
1739 struct zonelist *zonelist; 1739 struct zonelist *zonelist;
1740 nodemask_t used_mask; 1740 nodemask_t used_mask;
1741 1741
1742 /* initialize zonelists */ 1742 /* initialize zonelists */
1743 for (i = 0; i < MAX_NR_ZONES; i++) { 1743 for (i = 0; i < MAX_NR_ZONES; i++) {
1744 zonelist = pgdat->node_zonelists + i; 1744 zonelist = pgdat->node_zonelists + i;
1745 zonelist->zones[0] = NULL; 1745 zonelist->zones[0] = NULL;
1746 } 1746 }
1747 1747
1748 /* NUMA-aware ordering of nodes */ 1748 /* NUMA-aware ordering of nodes */
1749 local_node = pgdat->node_id; 1749 local_node = pgdat->node_id;
1750 load = num_online_nodes(); 1750 load = num_online_nodes();
1751 prev_node = local_node; 1751 prev_node = local_node;
1752 nodes_clear(used_mask); 1752 nodes_clear(used_mask);
1753 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 1753 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
1754 int distance = node_distance(local_node, node); 1754 int distance = node_distance(local_node, node);
1755 1755
1756 /* 1756 /*
1757 * If another node is sufficiently far away then it is better 1757 * If another node is sufficiently far away then it is better
1758 * to reclaim pages in a zone before going off node. 1758 * to reclaim pages in a zone before going off node.
1759 */ 1759 */
1760 if (distance > RECLAIM_DISTANCE) 1760 if (distance > RECLAIM_DISTANCE)
1761 zone_reclaim_mode = 1; 1761 zone_reclaim_mode = 1;
1762 1762
1763 /* 1763 /*
1764 * We don't want to pressure a particular node. 1764 * We don't want to pressure a particular node.
1765 * So we add a penalty to the first node in the same 1765 * So we add a penalty to the first node in the same
1766 * distance group to make the selection round-robin. 1766 * distance group to make the selection round-robin.
1767 */ 1767 */
1768 1768
1769 if (distance != node_distance(local_node, prev_node)) 1769 if (distance != node_distance(local_node, prev_node))
1770 node_load[node] += load; 1770 node_load[node] += load;
1771 prev_node = node; 1771 prev_node = node;
1772 load--; 1772 load--;
1773 for (i = 0; i < MAX_NR_ZONES; i++) { 1773 for (i = 0; i < MAX_NR_ZONES; i++) {
1774 zonelist = pgdat->node_zonelists + i; 1774 zonelist = pgdat->node_zonelists + i;
1775 for (j = 0; zonelist->zones[j] != NULL; j++); 1775 for (j = 0; zonelist->zones[j] != NULL; j++);
1776 1776
1777 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 1777 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1778 zonelist->zones[j] = NULL; 1778 zonelist->zones[j] = NULL;
1779 } 1779 }
1780 } 1780 }
1781 } 1781 }
1782 1782
1783 /* Construct the zonelist performance cache - see further mmzone.h */ 1783 /* Construct the zonelist performance cache - see further mmzone.h */
1784 static void __meminit build_zonelist_cache(pg_data_t *pgdat) 1784 static void __meminit build_zonelist_cache(pg_data_t *pgdat)
1785 { 1785 {
1786 int i; 1786 int i;
1787 1787
1788 for (i = 0; i < MAX_NR_ZONES; i++) { 1788 for (i = 0; i < MAX_NR_ZONES; i++) {
1789 struct zonelist *zonelist; 1789 struct zonelist *zonelist;
1790 struct zonelist_cache *zlc; 1790 struct zonelist_cache *zlc;
1791 struct zone **z; 1791 struct zone **z;
1792 1792
1793 zonelist = pgdat->node_zonelists + i; 1793 zonelist = pgdat->node_zonelists + i;
1794 zonelist->zlcache_ptr = zlc = &zonelist->zlcache; 1794 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
1795 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1795 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1796 for (z = zonelist->zones; *z; z++) 1796 for (z = zonelist->zones; *z; z++)
1797 zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z); 1797 zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z);
1798 } 1798 }
1799 } 1799 }
1800 1800
1801 #else /* CONFIG_NUMA */ 1801 #else /* CONFIG_NUMA */
1802 1802
1803 static void __meminit build_zonelists(pg_data_t *pgdat) 1803 static void __meminit build_zonelists(pg_data_t *pgdat)
1804 { 1804 {
1805 int node, local_node; 1805 int node, local_node;
1806 enum zone_type i,j; 1806 enum zone_type i,j;
1807 1807
1808 local_node = pgdat->node_id; 1808 local_node = pgdat->node_id;
1809 for (i = 0; i < MAX_NR_ZONES; i++) { 1809 for (i = 0; i < MAX_NR_ZONES; i++) {
1810 struct zonelist *zonelist; 1810 struct zonelist *zonelist;
1811 1811
1812 zonelist = pgdat->node_zonelists + i; 1812 zonelist = pgdat->node_zonelists + i;
1813 1813
1814 j = build_zonelists_node(pgdat, zonelist, 0, i); 1814 j = build_zonelists_node(pgdat, zonelist, 0, i);
1815 /* 1815 /*
1816 * Now we build the zonelist so that it contains the zones 1816 * Now we build the zonelist so that it contains the zones
1817 * of all the other nodes. 1817 * of all the other nodes.
1818 * We don't want to pressure a particular node, so when 1818 * We don't want to pressure a particular node, so when
1819 * building the zones for node N, we make sure that the 1819 * building the zones for node N, we make sure that the
1820 * zones coming right after the local ones are those from 1820 * zones coming right after the local ones are those from
1821 * node N+1 (wrapping around past the last node). 1821 * node N+1 (wrapping around past the last node).
1822 */ 1822 */
1823 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 1823 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
1824 if (!node_online(node)) 1824 if (!node_online(node))
1825 continue; 1825 continue;
1826 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 1826 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1827 } 1827 }
1828 for (node = 0; node < local_node; node++) { 1828 for (node = 0; node < local_node; node++) {
1829 if (!node_online(node)) 1829 if (!node_online(node))
1830 continue; 1830 continue;
1831 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 1831 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1832 } 1832 }
1833 1833
1834 zonelist->zones[j] = NULL; 1834 zonelist->zones[j] = NULL;
1835 } 1835 }
1836 } 1836 }
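
The two loops above simply rotate the node order so that node N's fallback list continues at N+1 and wraps around. A standalone sketch of the visiting order:

    #include <stdio.h>

    #define MAX_NUMNODES 4

    int main(void)
    {
            int local_node = 2;

            /* Same visiting order as the loops above: local node first (implicit),
             * then local_node+1 .. MAX_NUMNODES-1, then 0 .. local_node-1. */
            printf("fallback order from node %d:", local_node);
            for (int node = local_node + 1; node < MAX_NUMNODES; node++)
                    printf(" %d", node);
            for (int node = 0; node < local_node; node++)
                    printf(" %d", node);
            printf("\n");                   /* prints: 3 0 1 */
            return 0;
    }
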
1837 1837
1838 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 1838 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
1839 static void __meminit build_zonelist_cache(pg_data_t *pgdat) 1839 static void __meminit build_zonelist_cache(pg_data_t *pgdat)
1840 { 1840 {
1841 int i; 1841 int i;
1842 1842
1843 for (i = 0; i < MAX_NR_ZONES; i++) 1843 for (i = 0; i < MAX_NR_ZONES; i++)
1844 pgdat->node_zonelists[i].zlcache_ptr = NULL; 1844 pgdat->node_zonelists[i].zlcache_ptr = NULL;
1845 } 1845 }
1846 1846
1847 #endif /* CONFIG_NUMA */ 1847 #endif /* CONFIG_NUMA */
1848 1848
1849 /* The return value is an int just to satisfy stop_machine_run() */ 1849 /* The return value is an int just to satisfy stop_machine_run() */
1850 static int __meminit __build_all_zonelists(void *dummy) 1850 static int __meminit __build_all_zonelists(void *dummy)
1851 { 1851 {
1852 int nid; 1852 int nid;
1853 1853
1854 for_each_online_node(nid) { 1854 for_each_online_node(nid) {
1855 build_zonelists(NODE_DATA(nid)); 1855 build_zonelists(NODE_DATA(nid));
1856 build_zonelist_cache(NODE_DATA(nid)); 1856 build_zonelist_cache(NODE_DATA(nid));
1857 } 1857 }
1858 return 0; 1858 return 0;
1859 } 1859 }
1860 1860
1861 void __meminit build_all_zonelists(void) 1861 void __meminit build_all_zonelists(void)
1862 { 1862 {
1863 if (system_state == SYSTEM_BOOTING) { 1863 if (system_state == SYSTEM_BOOTING) {
1864 __build_all_zonelists(NULL); 1864 __build_all_zonelists(NULL);
1865 cpuset_init_current_mems_allowed(); 1865 cpuset_init_current_mems_allowed();
1866 } else { 1866 } else {
1867 /* we have to stop all cpus to guarantee there is no user 1867 /* we have to stop all cpus to guarantee there is no user
1868 of zonelist */ 1868 of zonelist */
1869 stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); 1869 stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
1870 /* cpuset refresh routine should be here */ 1870 /* cpuset refresh routine should be here */
1871 } 1871 }
1872 vm_total_pages = nr_free_pagecache_pages(); 1872 vm_total_pages = nr_free_pagecache_pages();
1873 printk("Built %i zonelists. Total pages: %ld\n", 1873 printk("Built %i zonelists. Total pages: %ld\n",
1874 num_online_nodes(), vm_total_pages); 1874 num_online_nodes(), vm_total_pages);
1875 } 1875 }
1876 1876
1877 /* 1877 /*
1878 * Helper functions to size the waitqueue hash table. 1878 * Helper functions to size the waitqueue hash table.
1879 * Essentially these want to choose hash table sizes sufficiently 1879 * Essentially these want to choose hash table sizes sufficiently
1880 * large so that collisions trying to wait on pages are rare. 1880 * large so that collisions trying to wait on pages are rare.
1881 * But in fact, the number of active page waitqueues on typical 1881 * But in fact, the number of active page waitqueues on typical
1882 * systems is ridiculously low, less than 200. So this is even 1882 * systems is ridiculously low, less than 200. So this is even
1883 * conservative, even though it seems large. 1883 * conservative, even though it seems large.
1884 * 1884 *
1885 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 1885 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
1886 * waitqueues, i.e. the size of the waitq table given the number of pages. 1886 * waitqueues, i.e. the size of the waitq table given the number of pages.
1887 */ 1887 */
1888 #define PAGES_PER_WAITQUEUE 256 1888 #define PAGES_PER_WAITQUEUE 256
1889 1889
1890 #ifndef CONFIG_MEMORY_HOTPLUG 1890 #ifndef CONFIG_MEMORY_HOTPLUG
1891 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 1891 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
1892 { 1892 {
1893 unsigned long size = 1; 1893 unsigned long size = 1;
1894 1894
1895 pages /= PAGES_PER_WAITQUEUE; 1895 pages /= PAGES_PER_WAITQUEUE;
1896 1896
1897 while (size < pages) 1897 while (size < pages)
1898 size <<= 1; 1898 size <<= 1;
1899 1899
1900 /* 1900 /*
1901 * Once we have dozens or even hundreds of threads sleeping 1901 * Once we have dozens or even hundreds of threads sleeping
1902 * on IO we've got bigger problems than wait queue collision. 1902 * on IO we've got bigger problems than wait queue collision.
1903 * Limit the size of the wait table to a reasonable size. 1903 * Limit the size of the wait table to a reasonable size.
1904 */ 1904 */
1905 size = min(size, 4096UL); 1905 size = min(size, 4096UL);
1906 1906
1907 return max(size, 4UL); 1907 return max(size, 4UL);
1908 } 1908 }
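
A standalone restatement of the sizing above, with min()/max() spelled out as plain comparisons, showing the resulting table size for a couple of zone sizes (page counts in the comments assume 4 KB pages):

    #include <stdio.h>

    #define PAGES_PER_WAITQUEUE 256

    static unsigned long wait_table_hash_nr_entries(unsigned long pages)
    {
            unsigned long size = 1;

            pages /= PAGES_PER_WAITQUEUE;
            while (size < pages)
                    size <<= 1;                 /* round up to a power of two */
            if (size > 4096UL)
                    size = 4096UL;              /* cap the table size */
            return size < 4UL ? 4UL : size;     /* and keep a sane minimum */
    }

    int main(void)
    {
            /* A 1 GB zone has 262144 pages. */
            printf("entries for 262144 pages:  %lu\n",
                   wait_table_hash_nr_entries(262144UL));   /* 1024 */
            /* A 16 GB zone hits the 4096-entry cap. */
            printf("entries for 4194304 pages: %lu\n",
                   wait_table_hash_nr_entries(4194304UL));  /* 4096 */
            return 0;
    }
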
1909 #else 1909 #else
1910 /* 1910 /*
1911 * A zone's size might be changed by hot-add, so it is not possible to determine 1911 * A zone's size might be changed by hot-add, so it is not possible to determine
1912 * a suitable size for its wait_table. So we use the maximum size now. 1912 * a suitable size for its wait_table. So we use the maximum size now.
1913 * 1913 *
1914 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: 1914 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
1915 * 1915 *
1916 * i386 (preemption config) : 4096 x 16 = 64Kbyte. 1916 * i386 (preemption config) : 4096 x 16 = 64Kbyte.
1917 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. 1917 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
1918 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. 1918 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
1919 * 1919 *
1920 * The maximum entries are prepared when a zone's memory is (512K + 256) pages 1920 * The maximum entries are prepared when a zone's memory is (512K + 256) pages
1921 * or more by the traditional way. (See above). It equals: 1921 * or more by the traditional way. (See above). It equals:
1922 * 1922 *
1923 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. 1923 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
1924 * ia64(16K page size) : = ( 8G + 4M)byte. 1924 * ia64(16K page size) : = ( 8G + 4M)byte.
1925 * powerpc (64K page size) : = (32G +16M)byte. 1925 * powerpc (64K page size) : = (32G +16M)byte.
1926 */ 1926 */
1927 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 1927 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
1928 { 1928 {
1929 return 4096UL; 1929 return 4096UL;
1930 } 1930 }
1931 #endif 1931 #endif
1932 1932
1933 /* 1933 /*
1934 * This is an integer logarithm so that shifts can be used later 1934 * This is an integer logarithm so that shifts can be used later
1935 * to extract the more random high bits from the multiplicative 1935 * to extract the more random high bits from the multiplicative
1936 * hash function before the remainder is taken. 1936 * hash function before the remainder is taken.
1937 */ 1937 */
1938 static inline unsigned long wait_table_bits(unsigned long size) 1938 static inline unsigned long wait_table_bits(unsigned long size)
1939 { 1939 {
1940 return ffz(~size); 1940 return ffz(~size);
1941 } 1941 }
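
For the power-of-two sizes returned above, ffz(~size) is just the index of the single set bit, i.e. log2(size). A portable stand-in that makes the result concrete:

    #include <stdio.h>

    /* Loop-based equivalent of ffz(~size) for power-of-two sizes. */
    static unsigned long wait_table_bits(unsigned long size)
    {
            unsigned long bits = 0;

            while (size > 1) {
                    size >>= 1;
                    bits++;
            }
            return bits;
    }

    int main(void)
    {
            printf("bits for a 4096-entry table: %lu\n", wait_table_bits(4096UL)); /* 12 */
            return 0;
    }
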
1942 1942
1943 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 1943 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
1944 1944
1945 /* 1945 /*
1946 * Initially all pages are reserved - free ones are freed 1946 * Initially all pages are reserved - free ones are freed
1947 * up by free_all_bootmem() once the early boot process is 1947 * up by free_all_bootmem() once the early boot process is
1948 * done. Non-atomic initialization, single-pass. 1948 * done. Non-atomic initialization, single-pass.
1949 */ 1949 */
1950 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 1950 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1951 unsigned long start_pfn, enum memmap_context context) 1951 unsigned long start_pfn, enum memmap_context context)
1952 { 1952 {
1953 struct page *page; 1953 struct page *page;
1954 unsigned long end_pfn = start_pfn + size; 1954 unsigned long end_pfn = start_pfn + size;
1955 unsigned long pfn; 1955 unsigned long pfn;
1956 1956
1957 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 1957 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1958 /* 1958 /*
1959 * There can be holes in boot-time mem_map[]s 1959 * There can be holes in boot-time mem_map[]s
1960 * handed to this function. They do not 1960 * handed to this function. They do not
1961 * exist on hotplugged memory. 1961 * exist on hotplugged memory.
1962 */ 1962 */
1963 if (context == MEMMAP_EARLY) { 1963 if (context == MEMMAP_EARLY) {
1964 if (!early_pfn_valid(pfn)) 1964 if (!early_pfn_valid(pfn))
1965 continue; 1965 continue;
1966 if (!early_pfn_in_nid(pfn, nid)) 1966 if (!early_pfn_in_nid(pfn, nid))
1967 continue; 1967 continue;
1968 } 1968 }
1969 page = pfn_to_page(pfn); 1969 page = pfn_to_page(pfn);
1970 set_page_links(page, zone, nid, pfn); 1970 set_page_links(page, zone, nid, pfn);
1971 init_page_count(page); 1971 init_page_count(page);
1972 reset_page_mapcount(page); 1972 reset_page_mapcount(page);
1973 SetPageReserved(page); 1973 SetPageReserved(page);
1974 INIT_LIST_HEAD(&page->lru); 1974 INIT_LIST_HEAD(&page->lru);
1975 #ifdef WANT_PAGE_VIRTUAL 1975 #ifdef WANT_PAGE_VIRTUAL
1976 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 1976 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
1977 if (!is_highmem_idx(zone)) 1977 if (!is_highmem_idx(zone))
1978 set_page_address(page, __va(pfn << PAGE_SHIFT)); 1978 set_page_address(page, __va(pfn << PAGE_SHIFT));
1979 #endif 1979 #endif
1980 } 1980 }
1981 } 1981 }
1982 1982
1983 void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, 1983 void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
1984 unsigned long size) 1984 unsigned long size)
1985 { 1985 {
1986 int order; 1986 int order;
1987 for (order = 0; order < MAX_ORDER ; order++) { 1987 for (order = 0; order < MAX_ORDER ; order++) {
1988 INIT_LIST_HEAD(&zone->free_area[order].free_list); 1988 INIT_LIST_HEAD(&zone->free_area[order].free_list);
1989 zone->free_area[order].nr_free = 0; 1989 zone->free_area[order].nr_free = 0;
1990 } 1990 }
1991 } 1991 }
1992 1992
1993 #ifndef __HAVE_ARCH_MEMMAP_INIT 1993 #ifndef __HAVE_ARCH_MEMMAP_INIT
1994 #define memmap_init(size, nid, zone, start_pfn) \ 1994 #define memmap_init(size, nid, zone, start_pfn) \
1995 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 1995 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
1996 #endif 1996 #endif
1997 1997
1998 static int __cpuinit zone_batchsize(struct zone *zone) 1998 static int __cpuinit zone_batchsize(struct zone *zone)
1999 { 1999 {
2000 int batch; 2000 int batch;
2001 2001
2002 /* 2002 /*
2003 * The per-cpu-pages pools are set to around 1000th of the 2003 * The per-cpu-pages pools are set to around 1000th of the
2004 * size of the zone. But no more than 1/2 of a meg. 2004 * size of the zone. But no more than 1/2 of a meg.
2005 * 2005 *
2006 * OK, so we don't know how big the cache is. So guess. 2006 * OK, so we don't know how big the cache is. So guess.
2007 */ 2007 */
2008 batch = zone->present_pages / 1024; 2008 batch = zone->present_pages / 1024;
2009 if (batch * PAGE_SIZE > 512 * 1024) 2009 if (batch * PAGE_SIZE > 512 * 1024)
2010 batch = (512 * 1024) / PAGE_SIZE; 2010 batch = (512 * 1024) / PAGE_SIZE;
2011 batch /= 4; /* We effectively *= 4 below */ 2011 batch /= 4; /* We effectively *= 4 below */
2012 if (batch < 1) 2012 if (batch < 1)
2013 batch = 1; 2013 batch = 1;
2014 2014
2015 /* 2015 /*
2016 * Clamp the batch to a 2^n - 1 value. Having a power 2016 * Clamp the batch to a 2^n - 1 value. Having a power
2017 * of 2 value was found to be more likely to have 2017 * of 2 value was found to be more likely to have
2018 * suboptimal cache aliasing properties in some cases. 2018 * suboptimal cache aliasing properties in some cases.
2019 * 2019 *
2020 * For example if 2 tasks are alternately allocating 2020 * For example if 2 tasks are alternately allocating
2021 * batches of pages, one task can end up with a lot 2021 * batches of pages, one task can end up with a lot
2022 * of pages of one half of the possible page colors 2022 * of pages of one half of the possible page colors
2023 * and the other with pages of the other colors. 2023 * and the other with pages of the other colors.
2024 */ 2024 */
2025 batch = (1 << (fls(batch + batch/2)-1)) - 1; 2025 batch = (1 << (fls(batch + batch/2)-1)) - 1;
2026 2026
2027 return batch; 2027 return batch;
2028 } 2028 }
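
A quick way to see what this heuristic yields: the standalone sketch below (not part of the patch) replays the same arithmetic in userspace. PAGE_SIZE is assumed to be 4096 here and fls_u() is a stand-in for the kernel's fls().

#include <stdio.h>

static int fls_u(unsigned long x)       /* userspace stand-in for kernel fls() */
{
        int bit = 0;
        while (x) {
                bit++;
                x >>= 1;
        }
        return bit;
}

static int batch_for(unsigned long present_pages, unsigned long page_size)
{
        int batch = present_pages / 1024;               /* ~1/1000th of the zone */
        if ((unsigned long)batch * page_size > 512 * 1024)
                batch = (512 * 1024) / page_size;       /* cap at 512KB worth of pages */
        batch /= 4;                                     /* effectively *= 4 in the pcp limits */
        if (batch < 1)
                batch = 1;
        return (1 << (fls_u(batch + batch / 2) - 1)) - 1; /* clamp to 2^n - 1 */
}

int main(void)
{
        printf("128MiB zone: batch %d\n", batch_for(32768, 4096));   /* 7 */
        printf("1GiB zone:   batch %d\n", batch_for(262144, 4096));  /* 31 */
        printf("4GiB zone:   batch %d\n", batch_for(1048576, 4096)); /* 31 */
        return 0;
}

Because of the 512KB cap, every zone of 512MiB or more ends up with the same value, batch = 31 (for 4KiB pages), which is what the per-cpu list sizing below builds on.
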
2029 2029
2030 inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 2030 inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2031 { 2031 {
2032 struct per_cpu_pages *pcp; 2032 struct per_cpu_pages *pcp;
2033 2033
2034 memset(p, 0, sizeof(*p)); 2034 memset(p, 0, sizeof(*p));
2035 2035
2036 pcp = &p->pcp[0]; /* hot */ 2036 pcp = &p->pcp[0]; /* hot */
2037 pcp->count = 0; 2037 pcp->count = 0;
2038 pcp->high = 6 * batch; 2038 pcp->high = 6 * batch;
2039 pcp->batch = max(1UL, 1 * batch); 2039 pcp->batch = max(1UL, 1 * batch);
2040 INIT_LIST_HEAD(&pcp->list); 2040 INIT_LIST_HEAD(&pcp->list);
2041 2041
2042 pcp = &p->pcp[1]; /* cold */ 2042 pcp = &p->pcp[1]; /* cold */
2043 pcp->count = 0; 2043 pcp->count = 0;
2044 pcp->high = 2 * batch; 2044 pcp->high = 2 * batch;
2045 pcp->batch = max(1UL, batch/2); 2045 pcp->batch = max(1UL, batch/2);
2046 INIT_LIST_HEAD(&pcp->list); 2046 INIT_LIST_HEAD(&pcp->list);
2047 } 2047 }
2048 2048
2049 /* 2049 /*
2050 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist 2050 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
2051 * to the value high for the pageset p. 2051 * to the value high for the pageset p.
2052 */ 2052 */
2053 2053
2054 static void setup_pagelist_highmark(struct per_cpu_pageset *p, 2054 static void setup_pagelist_highmark(struct per_cpu_pageset *p,
2055 unsigned long high) 2055 unsigned long high)
2056 { 2056 {
2057 struct per_cpu_pages *pcp; 2057 struct per_cpu_pages *pcp;
2058 2058
2059 pcp = &p->pcp[0]; /* hot list */ 2059 pcp = &p->pcp[0]; /* hot list */
2060 pcp->high = high; 2060 pcp->high = high;
2061 pcp->batch = max(1UL, high/4); 2061 pcp->batch = max(1UL, high/4);
2062 if ((high/4) > (PAGE_SHIFT * 8)) 2062 if ((high/4) > (PAGE_SHIFT * 8))
2063 pcp->batch = PAGE_SHIFT * 8; 2063 pcp->batch = PAGE_SHIFT * 8;
2064 } 2064 }
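
To make the effect of the two routines above concrete, the sketch below plugs in a hypothetical batch of 31 (the value derived earlier for a large zone) and a hypothetical percpu_pagelist_fraction of 8 for a 1GiB zone; PAGE_SHIFT is assumed to be 12.

#include <stdio.h>

#define PAGE_SHIFT 12   /* assumed: 4KiB pages */

int main(void)
{
        unsigned long batch = 31;               /* from zone_batchsize() above */
        unsigned long present_pages = 262144;   /* hypothetical 1GiB zone */
        unsigned long fraction = 8;             /* hypothetical sysctl value */
        unsigned long high;

        /* setup_pageset(): hot list drains at 6*batch, cold list at 2*batch */
        printf("hot:  high=%lu batch=%lu\n", 6 * batch, batch);      /* 186, 31 */
        printf("cold: high=%lu batch=%lu\n", 2 * batch, batch / 2);  /* 62, 15 */

        /* setup_pagelist_highmark(): override driven by percpu_pagelist_fraction */
        high = present_pages / fraction;
        printf("hot override: high=%lu batch=%lu\n", high,
               (high / 4) > (unsigned long)(PAGE_SHIFT * 8) ?
                        (unsigned long)(PAGE_SHIFT * 8) : high / 4); /* 32768, 96 */
        return 0;
}

The max(1UL, ...) guards in setup_pageset() only matter for the tiny-zone case, where zone_batchsize() can come out as 0.
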
2065 2065
2066 2066
2067 #ifdef CONFIG_NUMA 2067 #ifdef CONFIG_NUMA
2068 /* 2068 /*
2069 * Boot pageset table. One per cpu which is going to be used for all 2069 * Boot pageset table. One per cpu which is going to be used for all
2070 * zones and all nodes. The parameters will be set in such a way 2070 * zones and all nodes. The parameters will be set in such a way
2071 * that an item put on a list will immediately be handed over to 2071 * that an item put on a list will immediately be handed over to
2072 * the buddy list. This is safe since pageset manipulation is done 2072 * the buddy list. This is safe since pageset manipulation is done
2073 * with interrupts disabled. 2073 * with interrupts disabled.
2074 * 2074 *
2075 * Some NUMA counter updates may also be caught by the boot pagesets. 2075 * Some NUMA counter updates may also be caught by the boot pagesets.
2076 * 2076 *
2077 * The boot_pagesets must be kept even after bootup is complete for 2077 * The boot_pagesets must be kept even after bootup is complete for
2078 * unused processors and/or zones. They do play a role for bootstrapping 2078 * unused processors and/or zones. They do play a role for bootstrapping
2079 * hotplugged processors. 2079 * hotplugged processors.
2080 * 2080 *
2081 * zoneinfo_show() and maybe other functions do 2081 * zoneinfo_show() and maybe other functions do
2082 * not check if the processor is online before following the pageset pointer. 2082 * not check if the processor is online before following the pageset pointer.
2083 * Other parts of the kernel may not check if the zone is available. 2083 * Other parts of the kernel may not check if the zone is available.
2084 */ 2084 */
2085 static struct per_cpu_pageset boot_pageset[NR_CPUS]; 2085 static struct per_cpu_pageset boot_pageset[NR_CPUS];
2086 2086
2087 /* 2087 /*
2088 * Dynamically allocate memory for the 2088 * Dynamically allocate memory for the
2089 * per cpu pageset array in struct zone. 2089 * per cpu pageset array in struct zone.
2090 */ 2090 */
2091 static int __cpuinit process_zones(int cpu) 2091 static int __cpuinit process_zones(int cpu)
2092 { 2092 {
2093 struct zone *zone, *dzone; 2093 struct zone *zone, *dzone;
2094 2094
2095 for_each_zone(zone) { 2095 for_each_zone(zone) {
2096 2096
2097 if (!populated_zone(zone)) 2097 if (!populated_zone(zone))
2098 continue; 2098 continue;
2099 2099
2100 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), 2100 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
2101 GFP_KERNEL, cpu_to_node(cpu)); 2101 GFP_KERNEL, cpu_to_node(cpu));
2102 if (!zone_pcp(zone, cpu)) 2102 if (!zone_pcp(zone, cpu))
2103 goto bad; 2103 goto bad;
2104 2104
2105 setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); 2105 setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
2106 2106
2107 if (percpu_pagelist_fraction) 2107 if (percpu_pagelist_fraction)
2108 setup_pagelist_highmark(zone_pcp(zone, cpu), 2108 setup_pagelist_highmark(zone_pcp(zone, cpu),
2109 (zone->present_pages / percpu_pagelist_fraction)); 2109 (zone->present_pages / percpu_pagelist_fraction));
2110 } 2110 }
2111 2111
2112 return 0; 2112 return 0;
2113 bad: 2113 bad:
2114 for_each_zone(dzone) { 2114 for_each_zone(dzone) {
2115 if (dzone == zone) 2115 if (dzone == zone)
2116 break; 2116 break;
2117 kfree(zone_pcp(dzone, cpu)); 2117 kfree(zone_pcp(dzone, cpu));
2118 zone_pcp(dzone, cpu) = NULL; 2118 zone_pcp(dzone, cpu) = NULL;
2119 } 2119 }
2120 return -ENOMEM; 2120 return -ENOMEM;
2121 } 2121 }
2122 2122
2123 static inline void free_zone_pagesets(int cpu) 2123 static inline void free_zone_pagesets(int cpu)
2124 { 2124 {
2125 struct zone *zone; 2125 struct zone *zone;
2126 2126
2127 for_each_zone(zone) { 2127 for_each_zone(zone) {
2128 struct per_cpu_pageset *pset = zone_pcp(zone, cpu); 2128 struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
2129 2129
2130 /* Free per_cpu_pageset if it is slab allocated */ 2130 /* Free per_cpu_pageset if it is slab allocated */
2131 if (pset != &boot_pageset[cpu]) 2131 if (pset != &boot_pageset[cpu])
2132 kfree(pset); 2132 kfree(pset);
2133 zone_pcp(zone, cpu) = NULL; 2133 zone_pcp(zone, cpu) = NULL;
2134 } 2134 }
2135 } 2135 }
2136 2136
2137 static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, 2137 static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
2138 unsigned long action, 2138 unsigned long action,
2139 void *hcpu) 2139 void *hcpu)
2140 { 2140 {
2141 int cpu = (long)hcpu; 2141 int cpu = (long)hcpu;
2142 int ret = NOTIFY_OK; 2142 int ret = NOTIFY_OK;
2143 2143
2144 switch (action) { 2144 switch (action) {
2145 case CPU_UP_PREPARE: 2145 case CPU_UP_PREPARE:
2146 if (process_zones(cpu)) 2146 if (process_zones(cpu))
2147 ret = NOTIFY_BAD; 2147 ret = NOTIFY_BAD;
2148 break; 2148 break;
2149 case CPU_UP_CANCELED: 2149 case CPU_UP_CANCELED:
2150 case CPU_DEAD: 2150 case CPU_DEAD:
2151 free_zone_pagesets(cpu); 2151 free_zone_pagesets(cpu);
2152 break; 2152 break;
2153 default: 2153 default:
2154 break; 2154 break;
2155 } 2155 }
2156 return ret; 2156 return ret;
2157 } 2157 }
2158 2158
2159 static struct notifier_block __cpuinitdata pageset_notifier = 2159 static struct notifier_block __cpuinitdata pageset_notifier =
2160 { &pageset_cpuup_callback, NULL, 0 }; 2160 { &pageset_cpuup_callback, NULL, 0 };
2161 2161
2162 void __init setup_per_cpu_pageset(void) 2162 void __init setup_per_cpu_pageset(void)
2163 { 2163 {
2164 int err; 2164 int err;
2165 2165
2166 /* Initialize per_cpu_pageset for cpu 0. 2166 /* Initialize per_cpu_pageset for cpu 0.
2167 * A cpuup callback will do this for every cpu 2167 * A cpuup callback will do this for every cpu
2168 * as it comes online 2168 * as it comes online
2169 */ 2169 */
2170 err = process_zones(smp_processor_id()); 2170 err = process_zones(smp_processor_id());
2171 BUG_ON(err); 2171 BUG_ON(err);
2172 register_cpu_notifier(&pageset_notifier); 2172 register_cpu_notifier(&pageset_notifier);
2173 } 2173 }
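
The hotplug protocol above (allocate on CPU_UP_PREPARE, free on CPU_UP_CANCELED or CPU_DEAD) can be exercised on its own. Everything below is a userspace stand-in: the notifier constants, mock_process_zones() and the fail_alloc knob are invented for illustration and are not the kernel's definitions.

#include <stdio.h>

/* Stand-in notifier constants (values are illustrative, not the kernel's) */
enum { NOTIFY_OK, NOTIFY_BAD };
enum { CPU_UP_PREPARE, CPU_UP_CANCELED, CPU_ONLINE, CPU_DEAD };

static int allocated[4];        /* one "pageset" flag per mock CPU */
static int fail_alloc;          /* force the allocation to fail */

static int mock_process_zones(int cpu)
{
        if (fail_alloc)
                return -1;      /* the kmalloc_node() in process_zones() failed */
        allocated[cpu] = 1;
        return 0;
}

static void mock_free_zone_pagesets(int cpu)
{
        allocated[cpu] = 0;
}

/* Mirrors the switch in pageset_cpuup_callback() above */
static int callback(unsigned long action, int cpu)
{
        switch (action) {
        case CPU_UP_PREPARE:
                return mock_process_zones(cpu) ? NOTIFY_BAD : NOTIFY_OK;
        case CPU_UP_CANCELED:
        case CPU_DEAD:
                mock_free_zone_pagesets(cpu);
                return NOTIFY_OK;
        default:
                return NOTIFY_OK;
        }
}

int main(void)
{
        callback(CPU_UP_PREPARE, 1);            /* cpu 1 comes up: allocate */
        printf("cpu1 allocated=%d\n", allocated[1]);
        callback(CPU_DEAD, 1);                  /* cpu 1 goes away: free */
        printf("cpu1 allocated=%d\n", allocated[1]);
        fail_alloc = 1;                         /* allocation failure cancels the bring-up */
        printf("up-prepare -> %s\n",
               callback(CPU_UP_PREPARE, 2) == NOTIFY_BAD ? "NOTIFY_BAD" : "NOTIFY_OK");
        return 0;
}
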
2174 2174
2175 #endif 2175 #endif
2176 2176
2177 static __meminit 2177 static __meminit
2178 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 2178 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
2179 { 2179 {
2180 int i; 2180 int i;
2181 struct pglist_data *pgdat = zone->zone_pgdat; 2181 struct pglist_data *pgdat = zone->zone_pgdat;
2182 size_t alloc_size; 2182 size_t alloc_size;
2183 2183
2184 /* 2184 /*
2185 * The per-page waitqueue mechanism uses hashed waitqueues 2185 * The per-page waitqueue mechanism uses hashed waitqueues
2186 * per zone. 2186 * per zone.
2187 */ 2187 */
2188 zone->wait_table_hash_nr_entries = 2188 zone->wait_table_hash_nr_entries =
2189 wait_table_hash_nr_entries(zone_size_pages); 2189 wait_table_hash_nr_entries(zone_size_pages);
2190 zone->wait_table_bits = 2190 zone->wait_table_bits =
2191 wait_table_bits(zone->wait_table_hash_nr_entries); 2191 wait_table_bits(zone->wait_table_hash_nr_entries);
2192 alloc_size = zone->wait_table_hash_nr_entries 2192 alloc_size = zone->wait_table_hash_nr_entries
2193 * sizeof(wait_queue_head_t); 2193 * sizeof(wait_queue_head_t);
2194 2194
2195 if (system_state == SYSTEM_BOOTING) { 2195 if (system_state == SYSTEM_BOOTING) {
2196 zone->wait_table = (wait_queue_head_t *) 2196 zone->wait_table = (wait_queue_head_t *)
2197 alloc_bootmem_node(pgdat, alloc_size); 2197 alloc_bootmem_node(pgdat, alloc_size);
2198 } else { 2198 } else {
2199 /* 2199 /*
2200 * This case means that a zone whose size was 0 gets new memory 2200 * This case means that a zone whose size was 0 gets new memory
2201 * via memory hot-add. 2201 * via memory hot-add.
2202 * But it may be the case that a new node was hot-added. In 2202 * But it may be the case that a new node was hot-added. In
2203 * this case vmalloc() will not be able to use this new node's 2203 * this case vmalloc() will not be able to use this new node's
2204 * memory - this wait_table must be initialized to use this new 2204 * memory - this wait_table must be initialized to use this new
2205 * node itself as well. 2205 * node itself as well.
2206 * To use this new node's memory, further consideration will be 2206 * To use this new node's memory, further consideration will be
2207 * necessary. 2207 * necessary.
2208 */ 2208 */
2209 zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size); 2209 zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size);
2210 } 2210 }
2211 if (!zone->wait_table) 2211 if (!zone->wait_table)
2212 return -ENOMEM; 2212 return -ENOMEM;
2213 2213
2214 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) 2214 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
2215 init_waitqueue_head(zone->wait_table + i); 2215 init_waitqueue_head(zone->wait_table + i);
2216 2216
2217 return 0; 2217 return 0;
2218 } 2218 }
2219 2219
2220 static __meminit void zone_pcp_init(struct zone *zone) 2220 static __meminit void zone_pcp_init(struct zone *zone)
2221 { 2221 {
2222 int cpu; 2222 int cpu;
2223 unsigned long batch = zone_batchsize(zone); 2223 unsigned long batch = zone_batchsize(zone);
2224 2224
2225 for (cpu = 0; cpu < NR_CPUS; cpu++) { 2225 for (cpu = 0; cpu < NR_CPUS; cpu++) {
2226 #ifdef CONFIG_NUMA 2226 #ifdef CONFIG_NUMA
2227 /* Early boot. Slab allocator not functional yet */ 2227 /* Early boot. Slab allocator not functional yet */
2228 zone_pcp(zone, cpu) = &boot_pageset[cpu]; 2228 zone_pcp(zone, cpu) = &boot_pageset[cpu];
2229 setup_pageset(&boot_pageset[cpu],0); 2229 setup_pageset(&boot_pageset[cpu],0);
2230 #else 2230 #else
2231 setup_pageset(zone_pcp(zone,cpu), batch); 2231 setup_pageset(zone_pcp(zone,cpu), batch);
2232 #endif 2232 #endif
2233 } 2233 }
2234 if (zone->present_pages) 2234 if (zone->present_pages)
2235 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", 2235 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
2236 zone->name, zone->present_pages, batch); 2236 zone->name, zone->present_pages, batch);
2237 } 2237 }
2238 2238
2239 __meminit int init_currently_empty_zone(struct zone *zone, 2239 __meminit int init_currently_empty_zone(struct zone *zone,
2240 unsigned long zone_start_pfn, 2240 unsigned long zone_start_pfn,
2241 unsigned long size, 2241 unsigned long size,
2242 enum memmap_context context) 2242 enum memmap_context context)
2243 { 2243 {
2244 struct pglist_data *pgdat = zone->zone_pgdat; 2244 struct pglist_data *pgdat = zone->zone_pgdat;
2245 int ret; 2245 int ret;
2246 ret = zone_wait_table_init(zone, size); 2246 ret = zone_wait_table_init(zone, size);
2247 if (ret) 2247 if (ret)
2248 return ret; 2248 return ret;
2249 pgdat->nr_zones = zone_idx(zone) + 1; 2249 pgdat->nr_zones = zone_idx(zone) + 1;
2250 2250
2251 zone->zone_start_pfn = zone_start_pfn; 2251 zone->zone_start_pfn = zone_start_pfn;
2252 2252
2253 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); 2253 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
2254 2254
2255 zone_init_free_lists(pgdat, zone, zone->spanned_pages); 2255 zone_init_free_lists(pgdat, zone, zone->spanned_pages);
2256 2256
2257 return 0; 2257 return 0;
2258 } 2258 }
2259 2259
2260 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 2260 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2261 /* 2261 /*
2262 * Basic iterator support. Return the first range of PFNs for a node 2262 * Basic iterator support. Return the first range of PFNs for a node
2263 * Note: nid == MAX_NUMNODES returns first region regardless of node 2263 * Note: nid == MAX_NUMNODES returns first region regardless of node
2264 */ 2264 */
2265 static int __init first_active_region_index_in_nid(int nid) 2265 static int __init first_active_region_index_in_nid(int nid)
2266 { 2266 {
2267 int i; 2267 int i;
2268 2268
2269 for (i = 0; i < nr_nodemap_entries; i++) 2269 for (i = 0; i < nr_nodemap_entries; i++)
2270 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) 2270 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
2271 return i; 2271 return i;
2272 2272
2273 return -1; 2273 return -1;
2274 } 2274 }
2275 2275
2276 /* 2276 /*
2277 * Basic iterator support. Return the next active range of PFNs for a node 2277 * Basic iterator support. Return the next active range of PFNs for a node
2278 * Note: nid == MAX_NUMNODES returns next region regardless of node 2278 * Note: nid == MAX_NUMNODES returns next region regardless of node
2279 */ 2279 */
2280 static int __init next_active_region_index_in_nid(int index, int nid) 2280 static int __init next_active_region_index_in_nid(int index, int nid)
2281 { 2281 {
2282 for (index = index + 1; index < nr_nodemap_entries; index++) 2282 for (index = index + 1; index < nr_nodemap_entries; index++)
2283 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) 2283 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
2284 return index; 2284 return index;
2285 2285
2286 return -1; 2286 return -1;
2287 } 2287 }
2288 2288
2289 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 2289 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
2290 /* 2290 /*
2291 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 2291 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
2292 * Architectures may implement their own version but if add_active_range() 2292 * Architectures may implement their own version but if add_active_range()
2293 * was used and there are no special requirements, this is a convenient 2293 * was used and there are no special requirements, this is a convenient
2294 * alternative 2294 * alternative
2295 */ 2295 */
2296 int __init early_pfn_to_nid(unsigned long pfn) 2296 int __init early_pfn_to_nid(unsigned long pfn)
2297 { 2297 {
2298 int i; 2298 int i;
2299 2299
2300 for (i = 0; i < nr_nodemap_entries; i++) { 2300 for (i = 0; i < nr_nodemap_entries; i++) {
2301 unsigned long start_pfn = early_node_map[i].start_pfn; 2301 unsigned long start_pfn = early_node_map[i].start_pfn;
2302 unsigned long end_pfn = early_node_map[i].end_pfn; 2302 unsigned long end_pfn = early_node_map[i].end_pfn;
2303 2303
2304 if (start_pfn <= pfn && pfn < end_pfn) 2304 if (start_pfn <= pfn && pfn < end_pfn)
2305 return early_node_map[i].nid; 2305 return early_node_map[i].nid;
2306 } 2306 }
2307 2307
2308 return 0; 2308 return 0;
2309 } 2309 }
2310 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 2310 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
2311 2311
2312 /* Basic iterator support to walk early_node_map[] */ 2312 /* Basic iterator support to walk early_node_map[] */
2313 #define for_each_active_range_index_in_nid(i, nid) \ 2313 #define for_each_active_range_index_in_nid(i, nid) \
2314 for (i = first_active_region_index_in_nid(nid); i != -1; \ 2314 for (i = first_active_region_index_in_nid(nid); i != -1; \
2315 i = next_active_region_index_in_nid(i, nid)) 2315 i = next_active_region_index_in_nid(i, nid))
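
To see how the two lookups and the iterator macro cooperate, here is a self-contained userspace mock. The early_node_map contents, the MAX_NUMNODES value and the simplified struct are invented for the example; the iteration logic itself mirrors the code above.

#include <stdio.h>

#define MAX_NUMNODES 4          /* stand-in for the kernel constant */

struct node_active_region {
        unsigned long start_pfn;
        unsigned long end_pfn;
        int nid;
};

/* Hypothetical, already sorted map: node 0 has two ranges, node 1 has one */
static struct node_active_region early_node_map[] = {
        { .start_pfn = 0x000, .end_pfn = 0x100, .nid = 0 },
        { .start_pfn = 0x200, .end_pfn = 0x280, .nid = 0 },
        { .start_pfn = 0x300, .end_pfn = 0x400, .nid = 1 },
};
static int nr_nodemap_entries = 3;

static int first_active_region_index_in_nid(int nid)
{
        int i;
        for (i = 0; i < nr_nodemap_entries; i++)
                if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
                        return i;
        return -1;
}

static int next_active_region_index_in_nid(int index, int nid)
{
        for (index = index + 1; index < nr_nodemap_entries; index++)
                if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
                        return index;
        return -1;
}

#define for_each_active_range_index_in_nid(i, nid) \
        for (i = first_active_region_index_in_nid(nid); i != -1; \
                i = next_active_region_index_in_nid(i, nid))

int main(void)
{
        int i;

        /* Walk only node 0: prints its two ranges */
        for_each_active_range_index_in_nid(i, 0)
                printf("nid 0: [%#lx, %#lx)\n",
                       early_node_map[i].start_pfn, early_node_map[i].end_pfn);

        /* nid == MAX_NUMNODES walks every registered range */
        for_each_active_range_index_in_nid(i, MAX_NUMNODES)
                printf("all:   [%#lx, %#lx)\n",
                       early_node_map[i].start_pfn, early_node_map[i].end_pfn);
        return 0;
}
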
2316 2316
2317 /** 2317 /**
2318 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 2318 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
2319 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 2319 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
2320 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 2320 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
2321 * 2321 *
2322 * If an architecture guarantees that all ranges registered with 2322 * If an architecture guarantees that all ranges registered with
2323 * add_active_ranges() contain no holes and may be freed, this 2323 * add_active_ranges() contain no holes and may be freed, this
2324 * function may be used instead of calling free_bootmem() manually. 2324 * function may be used instead of calling free_bootmem() manually.
2325 */ 2325 */
2326 void __init free_bootmem_with_active_regions(int nid, 2326 void __init free_bootmem_with_active_regions(int nid,
2327 unsigned long max_low_pfn) 2327 unsigned long max_low_pfn)
2328 { 2328 {
2329 int i; 2329 int i;
2330 2330
2331 for_each_active_range_index_in_nid(i, nid) { 2331 for_each_active_range_index_in_nid(i, nid) {
2332 unsigned long size_pages = 0; 2332 unsigned long size_pages = 0;
2333 unsigned long end_pfn = early_node_map[i].end_pfn; 2333 unsigned long end_pfn = early_node_map[i].end_pfn;
2334 2334
2335 if (early_node_map[i].start_pfn >= max_low_pfn) 2335 if (early_node_map[i].start_pfn >= max_low_pfn)
2336 continue; 2336 continue;
2337 2337
2338 if (end_pfn > max_low_pfn) 2338 if (end_pfn > max_low_pfn)
2339 end_pfn = max_low_pfn; 2339 end_pfn = max_low_pfn;
2340 2340
2341 size_pages = end_pfn - early_node_map[i].start_pfn; 2341 size_pages = end_pfn - early_node_map[i].start_pfn;
2342 free_bootmem_node(NODE_DATA(early_node_map[i].nid), 2342 free_bootmem_node(NODE_DATA(early_node_map[i].nid),
2343 PFN_PHYS(early_node_map[i].start_pfn), 2343 PFN_PHYS(early_node_map[i].start_pfn),
2344 size_pages << PAGE_SHIFT); 2344 size_pages << PAGE_SHIFT);
2345 } 2345 }
2346 } 2346 }
2347 2347
2348 /** 2348 /**
2349 * sparse_memory_present_with_active_regions - Call memory_present for each active range 2349 * sparse_memory_present_with_active_regions - Call memory_present for each active range
2350 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 2350 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
2351 * 2351 *
2352 * If an architecture guarantees that all ranges registered with 2352 * If an architecture guarantees that all ranges registered with
2353 * add_active_ranges() contain no holes and may be freed, this 2353 * add_active_ranges() contain no holes and may be freed, this
2354 * function may be used instead of calling memory_present() manually. 2354 * function may be used instead of calling memory_present() manually.
2355 */ 2355 */
2356 void __init sparse_memory_present_with_active_regions(int nid) 2356 void __init sparse_memory_present_with_active_regions(int nid)
2357 { 2357 {
2358 int i; 2358 int i;
2359 2359
2360 for_each_active_range_index_in_nid(i, nid) 2360 for_each_active_range_index_in_nid(i, nid)
2361 memory_present(early_node_map[i].nid, 2361 memory_present(early_node_map[i].nid,
2362 early_node_map[i].start_pfn, 2362 early_node_map[i].start_pfn,
2363 early_node_map[i].end_pfn); 2363 early_node_map[i].end_pfn);
2364 } 2364 }
2365 2365
2366 /** 2366 /**
2367 * push_node_boundaries - Push node boundaries to at least the requested boundary 2367 * push_node_boundaries - Push node boundaries to at least the requested boundary
2368 * @nid: The nid of the node to push the boundary for 2368 * @nid: The nid of the node to push the boundary for
2369 * @start_pfn: The start pfn of the node 2369 * @start_pfn: The start pfn of the node
2370 * @end_pfn: The end pfn of the node 2370 * @end_pfn: The end pfn of the node
2371 * 2371 *
2372 * In reserve-based hot-add, a mem_map is allocated at boot that stays unused until 2372 * In reserve-based hot-add, a mem_map is allocated at boot that stays unused until
2373 * hotadd time. Specifically, on x86_64, SRAT will report ranges that can potentially 2373 * hotadd time. Specifically, on x86_64, SRAT will report ranges that can potentially
2374 * be hotplugged even though no physical memory exists there yet. This function allows 2374 * be hotplugged even though no physical memory exists there yet. This function allows
2375 * an arch to push out the node boundaries so that a mem_map covering those ranges is 2375 * an arch to push out the node boundaries so that a mem_map covering those ranges is
2376 * allocated and can be used later. 2376 * allocated and can be used later.
2377 */ 2377 */
2378 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE 2378 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
2379 void __init push_node_boundaries(unsigned int nid, 2379 void __init push_node_boundaries(unsigned int nid,
2380 unsigned long start_pfn, unsigned long end_pfn) 2380 unsigned long start_pfn, unsigned long end_pfn)
2381 { 2381 {
2382 printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n", 2382 printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n",
2383 nid, start_pfn, end_pfn); 2383 nid, start_pfn, end_pfn);
2384 2384
2385 /* Initialise the boundary for this node if necessary */ 2385 /* Initialise the boundary for this node if necessary */
2386 if (node_boundary_end_pfn[nid] == 0) 2386 if (node_boundary_end_pfn[nid] == 0)
2387 node_boundary_start_pfn[nid] = -1UL; 2387 node_boundary_start_pfn[nid] = -1UL;
2388 2388
2389 /* Update the boundaries */ 2389 /* Update the boundaries */
2390 if (node_boundary_start_pfn[nid] > start_pfn) 2390 if (node_boundary_start_pfn[nid] > start_pfn)
2391 node_boundary_start_pfn[nid] = start_pfn; 2391 node_boundary_start_pfn[nid] = start_pfn;
2392 if (node_boundary_end_pfn[nid] < end_pfn) 2392 if (node_boundary_end_pfn[nid] < end_pfn)
2393 node_boundary_end_pfn[nid] = end_pfn; 2393 node_boundary_end_pfn[nid] = end_pfn;
2394 } 2394 }
2395 2395
2396 /* If necessary, push the node boundary out for reserve hotadd */ 2396 /* If necessary, push the node boundary out for reserve hotadd */
2397 static void __init account_node_boundary(unsigned int nid, 2397 static void __init account_node_boundary(unsigned int nid,
2398 unsigned long *start_pfn, unsigned long *end_pfn) 2398 unsigned long *start_pfn, unsigned long *end_pfn)
2399 { 2399 {
2400 printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", 2400 printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n",
2401 nid, *start_pfn, *end_pfn); 2401 nid, *start_pfn, *end_pfn);
2402 2402
2403 /* Return if boundary information has not been provided */ 2403 /* Return if boundary information has not been provided */
2404 if (node_boundary_end_pfn[nid] == 0) 2404 if (node_boundary_end_pfn[nid] == 0)
2405 return; 2405 return;
2406 2406
2407 /* Check the boundaries and update if necessary */ 2407 /* Check the boundaries and update if necessary */
2408 if (node_boundary_start_pfn[nid] < *start_pfn) 2408 if (node_boundary_start_pfn[nid] < *start_pfn)
2409 *start_pfn = node_boundary_start_pfn[nid]; 2409 *start_pfn = node_boundary_start_pfn[nid];
2410 if (node_boundary_end_pfn[nid] > *end_pfn) 2410 if (node_boundary_end_pfn[nid] > *end_pfn)
2411 *end_pfn = node_boundary_end_pfn[nid]; 2411 *end_pfn = node_boundary_end_pfn[nid];
2412 } 2412 }
2413 #else 2413 #else
2414 void __init push_node_boundaries(unsigned int nid, 2414 void __init push_node_boundaries(unsigned int nid,
2415 unsigned long start_pfn, unsigned long end_pfn) {} 2415 unsigned long start_pfn, unsigned long end_pfn) {}
2416 2416
2417 static void __init account_node_boundary(unsigned int nid, 2417 static void __init account_node_boundary(unsigned int nid,
2418 unsigned long *start_pfn, unsigned long *end_pfn) {} 2418 unsigned long *start_pfn, unsigned long *end_pfn) {}
2419 #endif 2419 #endif
2420 2420
2421 2421
2422 /** 2422 /**
2423 * get_pfn_range_for_nid - Return the start and end page frames for a node 2423 * get_pfn_range_for_nid - Return the start and end page frames for a node
2424 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 2424 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
2425 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 2425 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
2426 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 2426 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
2427 * 2427 *
2428 * It returns the start and end page frame of a node based on information 2428 * It returns the start and end page frame of a node based on information
2429 * provided by an arch calling add_active_range(). If called for a node 2429 * provided by an arch calling add_active_range(). If called for a node
2430 * with no available memory, a warning is printed and the start and end 2430 * with no available memory, a warning is printed and the start and end
2431 * PFNs will be 0. 2431 * PFNs will be 0.
2432 */ 2432 */
2433 void __init get_pfn_range_for_nid(unsigned int nid, 2433 void __init get_pfn_range_for_nid(unsigned int nid,
2434 unsigned long *start_pfn, unsigned long *end_pfn) 2434 unsigned long *start_pfn, unsigned long *end_pfn)
2435 { 2435 {
2436 int i; 2436 int i;
2437 *start_pfn = -1UL; 2437 *start_pfn = -1UL;
2438 *end_pfn = 0; 2438 *end_pfn = 0;
2439 2439
2440 for_each_active_range_index_in_nid(i, nid) { 2440 for_each_active_range_index_in_nid(i, nid) {
2441 *start_pfn = min(*start_pfn, early_node_map[i].start_pfn); 2441 *start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
2442 *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); 2442 *end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
2443 } 2443 }
2444 2444
2445 if (*start_pfn == -1UL) { 2445 if (*start_pfn == -1UL) {
2446 printk(KERN_WARNING "Node %u active with no memory\n", nid); 2446 printk(KERN_WARNING "Node %u active with no memory\n", nid);
2447 *start_pfn = 0; 2447 *start_pfn = 0;
2448 } 2448 }
2449 2449
2450 /* Push the node boundaries out if requested */ 2450 /* Push the node boundaries out if requested */
2451 account_node_boundary(nid, start_pfn, end_pfn); 2451 account_node_boundary(nid, start_pfn, end_pfn);
2452 } 2452 }
2453 2453
2454 /* 2454 /*
2455 * Return the number of pages a zone spans in a node, including holes 2455 * Return the number of pages a zone spans in a node, including holes
2456 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 2456 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
2457 */ 2457 */
2458 unsigned long __init zone_spanned_pages_in_node(int nid, 2458 unsigned long __init zone_spanned_pages_in_node(int nid,
2459 unsigned long zone_type, 2459 unsigned long zone_type,
2460 unsigned long *ignored) 2460 unsigned long *ignored)
2461 { 2461 {
2462 unsigned long node_start_pfn, node_end_pfn; 2462 unsigned long node_start_pfn, node_end_pfn;
2463 unsigned long zone_start_pfn, zone_end_pfn; 2463 unsigned long zone_start_pfn, zone_end_pfn;
2464 2464
2465 /* Get the start and end of the node and zone */ 2465 /* Get the start and end of the node and zone */
2466 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 2466 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
2467 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 2467 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
2468 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 2468 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
2469 2469
2470 /* Check that this node has pages within the zone's required range */ 2470 /* Check that this node has pages within the zone's required range */
2471 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 2471 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
2472 return 0; 2472 return 0;
2473 2473
2474 /* Move the zone boundaries inside the node if necessary */ 2474 /* Move the zone boundaries inside the node if necessary */
2475 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 2475 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
2476 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 2476 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
2477 2477
2478 /* Return the spanned pages */ 2478 /* Return the spanned pages */
2479 return zone_end_pfn - zone_start_pfn; 2479 return zone_end_pfn - zone_start_pfn;
2480 } 2480 }
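
The clamping above is ordinary interval intersection. A small sketch with invented PFN ranges (and open-coded min/max helpers) shows both the overlapping and the non-overlapping case.

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }
static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }

/* Same logic as zone_spanned_pages_in_node(), with the node and zone
 * limits passed in directly instead of being looked up. */
static unsigned long spanned(unsigned long node_start, unsigned long node_end,
                             unsigned long zone_start, unsigned long zone_end)
{
        if (zone_end < node_start || zone_start > node_end)
                return 0;                       /* zone and node do not overlap */
        zone_end = min_ul(zone_end, node_end);  /* pull the zone inside the node */
        zone_start = max_ul(zone_start, node_start);
        return zone_end - zone_start;
}

int main(void)
{
        /* Hypothetical node spanning PFNs [1024, 262144) */
        printf("%lu\n", spanned(1024, 262144, 0, 4096));          /* low zone: 3072 */
        printf("%lu\n", spanned(1024, 262144, 524288, 1048576));  /* disjoint zone: 0 */
        return 0;
}
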
2481 2481
2482 /* 2482 /*
2483 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 2483 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
2484 * then all holes in the requested range will be accounted for. 2484 * then all holes in the requested range will be accounted for.
2485 */ 2485 */
2486 unsigned long __init __absent_pages_in_range(int nid, 2486 unsigned long __init __absent_pages_in_range(int nid,
2487 unsigned long range_start_pfn, 2487 unsigned long range_start_pfn,
2488 unsigned long range_end_pfn) 2488 unsigned long range_end_pfn)
2489 { 2489 {
2490 int i = 0; 2490 int i = 0;
2491 unsigned long prev_end_pfn = 0, hole_pages = 0; 2491 unsigned long prev_end_pfn = 0, hole_pages = 0;
2492 unsigned long start_pfn; 2492 unsigned long start_pfn;
2493 2493
2494 /* Find the end_pfn of the first active range of pfns in the node */ 2494 /* Find the end_pfn of the first active range of pfns in the node */
2495 i = first_active_region_index_in_nid(nid); 2495 i = first_active_region_index_in_nid(nid);
2496 if (i == -1) 2496 if (i == -1)
2497 return 0; 2497 return 0;
2498 2498
2499 /* Account for ranges before physical memory on this node */ 2499 /* Account for ranges before physical memory on this node */
2500 if (early_node_map[i].start_pfn > range_start_pfn) 2500 if (early_node_map[i].start_pfn > range_start_pfn)
2501 hole_pages = early_node_map[i].start_pfn - range_start_pfn; 2501 hole_pages = early_node_map[i].start_pfn - range_start_pfn;
2502 2502
2503 prev_end_pfn = early_node_map[i].start_pfn; 2503 prev_end_pfn = early_node_map[i].start_pfn;
2504 2504
2505 /* Find all holes for the zone within the node */ 2505 /* Find all holes for the zone within the node */
2506 for (; i != -1; i = next_active_region_index_in_nid(i, nid)) { 2506 for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
2507 2507
2508 /* No need to continue if prev_end_pfn is outside the zone */ 2508 /* No need to continue if prev_end_pfn is outside the zone */
2509 if (prev_end_pfn >= range_end_pfn) 2509 if (prev_end_pfn >= range_end_pfn)
2510 break; 2510 break;
2511 2511
2512 /* Make sure the end of the zone is not within the hole */ 2512 /* Make sure the end of the zone is not within the hole */
2513 start_pfn = min(early_node_map[i].start_pfn, range_end_pfn); 2513 start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
2514 prev_end_pfn = max(prev_end_pfn, range_start_pfn); 2514 prev_end_pfn = max(prev_end_pfn, range_start_pfn);
2515 2515
2516 /* Update the hole size count and move on */ 2516 /* Update the hole size count and move on */
2517 if (start_pfn > range_start_pfn) { 2517 if (start_pfn > range_start_pfn) {
2518 BUG_ON(prev_end_pfn > start_pfn); 2518 BUG_ON(prev_end_pfn > start_pfn);
2519 hole_pages += start_pfn - prev_end_pfn; 2519 hole_pages += start_pfn - prev_end_pfn;
2520 } 2520 }
2521 prev_end_pfn = early_node_map[i].end_pfn; 2521 prev_end_pfn = early_node_map[i].end_pfn;
2522 } 2522 }
2523 2523
2524 /* Account for ranges past physical memory on this node */ 2524 /* Account for ranges past physical memory on this node */
2525 if (range_end_pfn > prev_end_pfn) 2525 if (range_end_pfn > prev_end_pfn)
2526 hole_pages += range_end_pfn - 2526 hole_pages += range_end_pfn -
2527 max(range_start_pfn, prev_end_pfn); 2527 max(range_start_pfn, prev_end_pfn);
2528 2528
2529 return hole_pages; 2529 return hole_pages;
2530 } 2530 }
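
Rather than re-walking the hole-by-hole loop above, the sketch below computes the same quantity the simple way: the length of the query range minus the pages covered by each active range. The ranges are invented and assumed sorted and non-overlapping, as early_node_map[] is.

#include <stdio.h>

struct range { unsigned long start, end; };

/* Simplified equivalent of __absent_pages_in_range(), not the kernel's loop */
static unsigned long absent(struct range *active, int n,
                            unsigned long start, unsigned long end)
{
        unsigned long present = 0;
        int i;

        for (i = 0; i < n; i++) {
                /* clip this active range to the query window */
                unsigned long s = active[i].start > start ? active[i].start : start;
                unsigned long e = active[i].end < end ? active[i].end : end;
                if (s < e)
                        present += e - s;
        }
        return (end - start) - present;
}

int main(void)
{
        /* Node memory at [1000, 2000) and [3000, 3500) */
        struct range active[] = { { 1000, 2000 }, { 3000, 3500 } };

        /* Holes in [0, 4000): 1000 before, 1000 between, 500 after = 2500 */
        printf("%lu\n", absent(active, 2, 0, 4000));
        return 0;
}
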
2531 2531
2532 /** 2532 /**
2533 * absent_pages_in_range - Return number of page frames in holes within a range 2533 * absent_pages_in_range - Return number of page frames in holes within a range
2534 * @start_pfn: The start PFN to start searching for holes 2534 * @start_pfn: The start PFN to start searching for holes
2535 * @end_pfn: The end PFN to stop searching for holes 2535 * @end_pfn: The end PFN to stop searching for holes
2536 * 2536 *
2537 * It returns the number of page frames in memory holes within a range. 2537 * It returns the number of page frames in memory holes within a range.
2538 */ 2538 */
2539 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 2539 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
2540 unsigned long end_pfn) 2540 unsigned long end_pfn)
2541 { 2541 {
2542 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 2542 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
2543 } 2543 }
2544 2544
2545 /* Return the number of page frames in holes in a zone on a node */ 2545 /* Return the number of page frames in holes in a zone on a node */
2546 unsigned long __init zone_absent_pages_in_node(int nid, 2546 unsigned long __init zone_absent_pages_in_node(int nid,
2547 unsigned long zone_type, 2547 unsigned long zone_type,
2548 unsigned long *ignored) 2548 unsigned long *ignored)
2549 { 2549 {
2550 unsigned long node_start_pfn, node_end_pfn; 2550 unsigned long node_start_pfn, node_end_pfn;
2551 unsigned long zone_start_pfn, zone_end_pfn; 2551 unsigned long zone_start_pfn, zone_end_pfn;
2552 2552
2553 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 2553 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
2554 zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type], 2554 zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
2555 node_start_pfn); 2555 node_start_pfn);
2556 zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type], 2556 zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
2557 node_end_pfn); 2557 node_end_pfn);
2558 2558
2559 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 2559 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
2560 } 2560 }
2561 2561
2562 #else 2562 #else
2563 static inline unsigned long zone_spanned_pages_in_node(int nid, 2563 static inline unsigned long zone_spanned_pages_in_node(int nid,
2564 unsigned long zone_type, 2564 unsigned long zone_type,
2565 unsigned long *zones_size) 2565 unsigned long *zones_size)
2566 { 2566 {
2567 return zones_size[zone_type]; 2567 return zones_size[zone_type];
2568 } 2568 }
2569 2569
2570 static inline unsigned long zone_absent_pages_in_node(int nid, 2570 static inline unsigned long zone_absent_pages_in_node(int nid,
2571 unsigned long zone_type, 2571 unsigned long zone_type,
2572 unsigned long *zholes_size) 2572 unsigned long *zholes_size)
2573 { 2573 {
2574 if (!zholes_size) 2574 if (!zholes_size)
2575 return 0; 2575 return 0;
2576 2576
2577 return zholes_size[zone_type]; 2577 return zholes_size[zone_type];
2578 } 2578 }
2579 2579
2580 #endif 2580 #endif
2581 2581
2582 static void __init calculate_node_totalpages(struct pglist_data *pgdat, 2582 static void __init calculate_node_totalpages(struct pglist_data *pgdat,
2583 unsigned long *zones_size, unsigned long *zholes_size) 2583 unsigned long *zones_size, unsigned long *zholes_size)
2584 { 2584 {
2585 unsigned long realtotalpages, totalpages = 0; 2585 unsigned long realtotalpages, totalpages = 0;
2586 enum zone_type i; 2586 enum zone_type i;
2587 2587
2588 for (i = 0; i < MAX_NR_ZONES; i++) 2588 for (i = 0; i < MAX_NR_ZONES; i++)
2589 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 2589 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
2590 zones_size); 2590 zones_size);
2591 pgdat->node_spanned_pages = totalpages; 2591 pgdat->node_spanned_pages = totalpages;
2592 2592
2593 realtotalpages = totalpages; 2593 realtotalpages = totalpages;
2594 for (i = 0; i < MAX_NR_ZONES; i++) 2594 for (i = 0; i < MAX_NR_ZONES; i++)
2595 realtotalpages -= 2595 realtotalpages -=
2596 zone_absent_pages_in_node(pgdat->node_id, i, 2596 zone_absent_pages_in_node(pgdat->node_id, i,
2597 zholes_size); 2597 zholes_size);
2598 pgdat->node_present_pages = realtotalpages; 2598 pgdat->node_present_pages = realtotalpages;
2599 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 2599 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
2600 realtotalpages); 2600 realtotalpages);
2601 } 2601 }
2602 2602
2603 /* 2603 /*
2604 * Set up the zone data structures: 2604 * Set up the zone data structures:
2605 * - mark all pages reserved 2605 * - mark all pages reserved
2606 * - mark all memory queues empty 2606 * - mark all memory queues empty
2607 * - clear the memory bitmaps 2607 * - clear the memory bitmaps
2608 */ 2608 */
2609 static void __meminit free_area_init_core(struct pglist_data *pgdat, 2609 static void __meminit free_area_init_core(struct pglist_data *pgdat,
2610 unsigned long *zones_size, unsigned long *zholes_size) 2610 unsigned long *zones_size, unsigned long *zholes_size)
2611 { 2611 {
2612 enum zone_type j; 2612 enum zone_type j;
2613 int nid = pgdat->node_id; 2613 int nid = pgdat->node_id;
2614 unsigned long zone_start_pfn = pgdat->node_start_pfn; 2614 unsigned long zone_start_pfn = pgdat->node_start_pfn;
2615 int ret; 2615 int ret;
2616 2616
2617 pgdat_resize_init(pgdat); 2617 pgdat_resize_init(pgdat);
2618 pgdat->nr_zones = 0; 2618 pgdat->nr_zones = 0;
2619 init_waitqueue_head(&pgdat->kswapd_wait); 2619 init_waitqueue_head(&pgdat->kswapd_wait);
2620 pgdat->kswapd_max_order = 0; 2620 pgdat->kswapd_max_order = 0;
2621 2621
2622 for (j = 0; j < MAX_NR_ZONES; j++) { 2622 for (j = 0; j < MAX_NR_ZONES; j++) {
2623 struct zone *zone = pgdat->node_zones + j; 2623 struct zone *zone = pgdat->node_zones + j;
2624 unsigned long size, realsize, memmap_pages; 2624 unsigned long size, realsize, memmap_pages;
2625 2625
2626 size = zone_spanned_pages_in_node(nid, j, zones_size); 2626 size = zone_spanned_pages_in_node(nid, j, zones_size);
2627 realsize = size - zone_absent_pages_in_node(nid, j, 2627 realsize = size - zone_absent_pages_in_node(nid, j,
2628 zholes_size); 2628 zholes_size);
2629 2629
2630 /* 2630 /*
2631 * Adjust realsize so that it accounts for how much memory 2631 * Adjust realsize so that it accounts for how much memory
2632 * is used by this zone for memmap. This affects the watermark 2632 * is used by this zone for memmap. This affects the watermark
2633 * and per-cpu initialisations 2633 * and per-cpu initialisations
2634 */ 2634 */
2635 memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT; 2635 memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT;
2636 if (realsize >= memmap_pages) { 2636 if (realsize >= memmap_pages) {
2637 realsize -= memmap_pages; 2637 realsize -= memmap_pages;
2638 printk(KERN_DEBUG 2638 printk(KERN_DEBUG
2639 " %s zone: %lu pages used for memmap\n", 2639 " %s zone: %lu pages used for memmap\n",
2640 zone_names[j], memmap_pages); 2640 zone_names[j], memmap_pages);
2641 } else 2641 } else
2642 printk(KERN_WARNING 2642 printk(KERN_WARNING
2643 " %s zone: %lu pages exceeds realsize %lu\n", 2643 " %s zone: %lu pages exceeds realsize %lu\n",
2644 zone_names[j], memmap_pages, realsize); 2644 zone_names[j], memmap_pages, realsize);
2645 2645
2646 /* Account for reserved pages */ 2646 /* Account for reserved pages */
2647 if (j == 0 && realsize > dma_reserve) { 2647 if (j == 0 && realsize > dma_reserve) {
2648 realsize -= dma_reserve; 2648 realsize -= dma_reserve;
2649 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 2649 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
2650 zone_names[0], dma_reserve); 2650 zone_names[0], dma_reserve);
2651 } 2651 }
2652 2652
2653 if (!is_highmem_idx(j)) 2653 if (!is_highmem_idx(j))
2654 nr_kernel_pages += realsize; 2654 nr_kernel_pages += realsize;
2655 nr_all_pages += realsize; 2655 nr_all_pages += realsize;
2656 2656
2657 zone->spanned_pages = size; 2657 zone->spanned_pages = size;
2658 zone->present_pages = realsize; 2658 zone->present_pages = realsize;
2659 #ifdef CONFIG_NUMA 2659 #ifdef CONFIG_NUMA
2660 zone->node = nid; 2660 zone->node = nid;
2661 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 2661 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
2662 / 100; 2662 / 100;
2663 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; 2663 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
2664 #endif 2664 #endif
2665 zone->name = zone_names[j]; 2665 zone->name = zone_names[j];
2666 spin_lock_init(&zone->lock); 2666 spin_lock_init(&zone->lock);
2667 spin_lock_init(&zone->lru_lock); 2667 spin_lock_init(&zone->lru_lock);
2668 zone_seqlock_init(zone); 2668 zone_seqlock_init(zone);
2669 zone->zone_pgdat = pgdat; 2669 zone->zone_pgdat = pgdat;
2670 2670
2671 zone->prev_priority = DEF_PRIORITY; 2671 zone->prev_priority = DEF_PRIORITY;
2672 2672
2673 zone_pcp_init(zone); 2673 zone_pcp_init(zone);
2674 INIT_LIST_HEAD(&zone->active_list); 2674 INIT_LIST_HEAD(&zone->active_list);
2675 INIT_LIST_HEAD(&zone->inactive_list); 2675 INIT_LIST_HEAD(&zone->inactive_list);
2676 zone->nr_scan_active = 0; 2676 zone->nr_scan_active = 0;
2677 zone->nr_scan_inactive = 0; 2677 zone->nr_scan_inactive = 0;
2678 zap_zone_vm_stats(zone); 2678 zap_zone_vm_stats(zone);
2679 atomic_set(&zone->reclaim_in_progress, 0); 2679 atomic_set(&zone->reclaim_in_progress, 0);
2680 if (!size) 2680 if (!size)
2681 continue; 2681 continue;
2682 2682
2683 ret = init_currently_empty_zone(zone, zone_start_pfn, 2683 ret = init_currently_empty_zone(zone, zone_start_pfn,
2684 size, MEMMAP_EARLY); 2684 size, MEMMAP_EARLY);
2685 BUG_ON(ret); 2685 BUG_ON(ret);
2686 zone_start_pfn += size; 2686 zone_start_pfn += size;
2687 } 2687 }
2688 } 2688 }
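
The memmap accounting near the top of the zone loop above is easy to sanity-check by hand. The sketch assumes 4KiB pages and a 64-byte struct page, which are illustrative values only (sizeof(struct page) depends on configuration and architecture).

#include <stdio.h>

#define PAGE_SHIFT 12                   /* assumed: 4KiB pages */

int main(void)
{
        unsigned long size = 262144;    /* hypothetical 1GiB zone, in pages */
        unsigned long struct_page = 64; /* assumed sizeof(struct page) */
        unsigned long memmap_pages = (size * struct_page) >> PAGE_SHIFT;

        /* 4096 pages (16MiB) of the zone are charged to its own memmap, so
         * realsize drops from 262144 to 258048 before the watermark and
         * per-cpu calculations run. */
        printf("memmap_pages=%lu realsize=%lu\n", memmap_pages, size - memmap_pages);
        return 0;
}
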
2689 2689
2690 static void __init alloc_node_mem_map(struct pglist_data *pgdat) 2690 static void __init alloc_node_mem_map(struct pglist_data *pgdat)
2691 { 2691 {
2692 /* Skip empty nodes */ 2692 /* Skip empty nodes */
2693 if (!pgdat->node_spanned_pages) 2693 if (!pgdat->node_spanned_pages)
2694 return; 2694 return;
2695 2695
2696 #ifdef CONFIG_FLAT_NODE_MEM_MAP 2696 #ifdef CONFIG_FLAT_NODE_MEM_MAP
2697 /* ia64 gets its own node_mem_map, before this, without bootmem */ 2697 /* ia64 gets its own node_mem_map, before this, without bootmem */
2698 if (!pgdat->node_mem_map) { 2698 if (!pgdat->node_mem_map) {
2699 unsigned long size, start, end; 2699 unsigned long size, start, end;
2700 struct page *map; 2700 struct page *map;
2701 2701
2702 /* 2702 /*
2703 * The zone's endpoints aren't required to be MAX_ORDER 2703 * The zone's endpoints aren't required to be MAX_ORDER
2704 * aligned but the node_mem_map endpoints must be in order 2704 * aligned but the node_mem_map endpoints must be in order
2705 * for the buddy allocator to function correctly. 2705 * for the buddy allocator to function correctly.
2706 */ 2706 */
2707 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 2707 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
2708 end = pgdat->node_start_pfn + pgdat->node_spanned_pages; 2708 end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
2709 end = ALIGN(end, MAX_ORDER_NR_PAGES); 2709 end = ALIGN(end, MAX_ORDER_NR_PAGES);
2710 size = (end - start) * sizeof(struct page); 2710 size = (end - start) * sizeof(struct page);
2711 map = alloc_remap(pgdat->node_id, size); 2711 map = alloc_remap(pgdat->node_id, size);
2712 if (!map) 2712 if (!map)
2713 map = alloc_bootmem_node(pgdat, size); 2713 map = alloc_bootmem_node(pgdat, size);
2714 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 2714 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
2715 } 2715 }
2716 #ifdef CONFIG_FLATMEM 2716 #ifdef CONFIG_FLATMEM
2717 /* 2717 /*
2718 * With no DISCONTIG, the global mem_map is just set as node 0's 2718 * With no DISCONTIG, the global mem_map is just set as node 0's
2719 */ 2719 */
2720 if (pgdat == NODE_DATA(0)) { 2720 if (pgdat == NODE_DATA(0)) {
2721 mem_map = NODE_DATA(0)->node_mem_map; 2721 mem_map = NODE_DATA(0)->node_mem_map;
2722 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 2722 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2723 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 2723 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
2724 mem_map -= pgdat->node_start_pfn; 2724 mem_map -= pgdat->node_start_pfn;
2725 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 2725 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
2726 } 2726 }
2727 #endif 2727 #endif
2728 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 2728 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
2729 } 2729 }
2730 2730
2731 void __meminit free_area_init_node(int nid, struct pglist_data *pgdat, 2731 void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
2732 unsigned long *zones_size, unsigned long node_start_pfn, 2732 unsigned long *zones_size, unsigned long node_start_pfn,
2733 unsigned long *zholes_size) 2733 unsigned long *zholes_size)
2734 { 2734 {
2735 pgdat->node_id = nid; 2735 pgdat->node_id = nid;
2736 pgdat->node_start_pfn = node_start_pfn; 2736 pgdat->node_start_pfn = node_start_pfn;
2737 calculate_node_totalpages(pgdat, zones_size, zholes_size); 2737 calculate_node_totalpages(pgdat, zones_size, zholes_size);
2738 2738
2739 alloc_node_mem_map(pgdat); 2739 alloc_node_mem_map(pgdat);
2740 2740
2741 free_area_init_core(pgdat, zones_size, zholes_size); 2741 free_area_init_core(pgdat, zones_size, zholes_size);
2742 } 2742 }
2743 2743
2744 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 2744 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2745 /** 2745 /**
2746 * add_active_range - Register a range of PFNs backed by physical memory 2746 * add_active_range - Register a range of PFNs backed by physical memory
2747 * @nid: The node ID the range resides on 2747 * @nid: The node ID the range resides on
2748 * @start_pfn: The start PFN of the available physical memory 2748 * @start_pfn: The start PFN of the available physical memory
2749 * @end_pfn: The end PFN of the available physical memory 2749 * @end_pfn: The end PFN of the available physical memory
2750 * 2750 *
2751 * These ranges are stored in an early_node_map[] and later used by 2751 * These ranges are stored in an early_node_map[] and later used by
2752 * free_area_init_nodes() to calculate zone sizes and holes. If the 2752 * free_area_init_nodes() to calculate zone sizes and holes. If the
2753 * range spans a memory hole, it is up to the architecture to ensure 2753 * range spans a memory hole, it is up to the architecture to ensure
2754 * the memory is not freed by the bootmem allocator. If possible 2754 * the memory is not freed by the bootmem allocator. If possible
2755 * the range being registered will be merged with existing ranges. 2755 * the range being registered will be merged with existing ranges.
2756 */ 2756 */
2757 void __init add_active_range(unsigned int nid, unsigned long start_pfn, 2757 void __init add_active_range(unsigned int nid, unsigned long start_pfn,
2758 unsigned long end_pfn) 2758 unsigned long end_pfn)
2759 { 2759 {
2760 int i; 2760 int i;
2761 2761
2762 printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) " 2762 printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) "
2763 "%d entries of %d used\n", 2763 "%d entries of %d used\n",
2764 nid, start_pfn, end_pfn, 2764 nid, start_pfn, end_pfn,
2765 nr_nodemap_entries, MAX_ACTIVE_REGIONS); 2765 nr_nodemap_entries, MAX_ACTIVE_REGIONS);
2766 2766
2767 /* Merge with existing active regions if possible */ 2767 /* Merge with existing active regions if possible */
2768 for (i = 0; i < nr_nodemap_entries; i++) { 2768 for (i = 0; i < nr_nodemap_entries; i++) {
2769 if (early_node_map[i].nid != nid) 2769 if (early_node_map[i].nid != nid)
2770 continue; 2770 continue;
2771 2771
2772 /* Skip if an existing region covers this new one */ 2772 /* Skip if an existing region covers this new one */
2773 if (start_pfn >= early_node_map[i].start_pfn && 2773 if (start_pfn >= early_node_map[i].start_pfn &&
2774 end_pfn <= early_node_map[i].end_pfn) 2774 end_pfn <= early_node_map[i].end_pfn)
2775 return; 2775 return;
2776 2776
2777 /* Merge forward if suitable */ 2777 /* Merge forward if suitable */
2778 if (start_pfn <= early_node_map[i].end_pfn && 2778 if (start_pfn <= early_node_map[i].end_pfn &&
2779 end_pfn > early_node_map[i].end_pfn) { 2779 end_pfn > early_node_map[i].end_pfn) {
2780 early_node_map[i].end_pfn = end_pfn; 2780 early_node_map[i].end_pfn = end_pfn;
2781 return; 2781 return;
2782 } 2782 }
2783 2783
2784 /* Merge backward if suitable */ 2784 /* Merge backward if suitable */
2785 if (start_pfn < early_node_map[i].end_pfn && 2785 if (start_pfn < early_node_map[i].end_pfn &&
2786 end_pfn >= early_node_map[i].start_pfn) { 2786 end_pfn >= early_node_map[i].start_pfn) {
2787 early_node_map[i].start_pfn = start_pfn; 2787 early_node_map[i].start_pfn = start_pfn;
2788 return; 2788 return;
2789 } 2789 }
2790 } 2790 }
2791 2791
2792 /* Check that early_node_map is large enough */ 2792 /* Check that early_node_map is large enough */
2793 if (i >= MAX_ACTIVE_REGIONS) { 2793 if (i >= MAX_ACTIVE_REGIONS) {
2794 printk(KERN_CRIT "More than %d memory regions, truncating\n", 2794 printk(KERN_CRIT "More than %d memory regions, truncating\n",
2795 MAX_ACTIVE_REGIONS); 2795 MAX_ACTIVE_REGIONS);
2796 return; 2796 return;
2797 } 2797 }
2798 2798
2799 early_node_map[i].nid = nid; 2799 early_node_map[i].nid = nid;
2800 early_node_map[i].start_pfn = start_pfn; 2800 early_node_map[i].start_pfn = start_pfn;
2801 early_node_map[i].end_pfn = end_pfn; 2801 early_node_map[i].end_pfn = end_pfn;
2802 nr_nodemap_entries = i + 1; 2802 nr_nodemap_entries = i + 1;
2803 } 2803 }
2804 2804
2805 /** 2805 /**
2806 * shrink_active_range - Shrink an existing registered range of PFNs 2806 * shrink_active_range - Shrink an existing registered range of PFNs
2807 * @nid: The node id the range is on that should be shrunk 2807 * @nid: The node id the range is on that should be shrunk
2808 * @old_end_pfn: The old end PFN of the range 2808 * @old_end_pfn: The old end PFN of the range
2809 * @new_end_pfn: The new end PFN of the range 2809 * @new_end_pfn: The new end PFN of the range
2810 * 2810 *
2811 * i386 with NUMA uses alloc_remap() to store a node_mem_map on a local node. 2811 * i386 with NUMA uses alloc_remap() to store a node_mem_map on a local node.
2812 * The map is kept at the end of the physical page range that has already been 2812 * The map is kept at the end of the physical page range that has already been
2813 * registered with add_active_range(). This function allows an arch to shrink 2813 * registered with add_active_range(). This function allows an arch to shrink
2814 * an existing registered range. 2814 * an existing registered range.
2815 */ 2815 */
2816 void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn, 2816 void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
2817 unsigned long new_end_pfn) 2817 unsigned long new_end_pfn)
2818 { 2818 {
2819 int i; 2819 int i;
2820 2820
2821 /* Find the old active region end and shrink */ 2821 /* Find the old active region end and shrink */
2822 for_each_active_range_index_in_nid(i, nid) 2822 for_each_active_range_index_in_nid(i, nid)
2823 if (early_node_map[i].end_pfn == old_end_pfn) { 2823 if (early_node_map[i].end_pfn == old_end_pfn) {
2824 early_node_map[i].end_pfn = new_end_pfn; 2824 early_node_map[i].end_pfn = new_end_pfn;
2825 break; 2825 break;
2826 } 2826 }
2827 } 2827 }
2828 2828
2829 /** 2829 /**
2830 * remove_all_active_ranges - Remove all currently registered regions 2830 * remove_all_active_ranges - Remove all currently registered regions
2831 * 2831 *
2832 * During discovery, it may be found that a table like SRAT is invalid 2832 * During discovery, it may be found that a table like SRAT is invalid
2833 * and an alternative discovery method must be used. This function removes 2833 * and an alternative discovery method must be used. This function removes
2834 * all currently registered regions. 2834 * all currently registered regions.
2835 */ 2835 */
2836 void __init remove_all_active_ranges(void) 2836 void __init remove_all_active_ranges(void)
2837 { 2837 {
2838 memset(early_node_map, 0, sizeof(early_node_map)); 2838 memset(early_node_map, 0, sizeof(early_node_map));
2839 nr_nodemap_entries = 0; 2839 nr_nodemap_entries = 0;
2840 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE 2840 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
2841 memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); 2841 memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
2842 memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); 2842 memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
2843 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ 2843 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
2844 } 2844 }
2845 2845
2846 /* Compare two active node_active_regions */ 2846 /* Compare two active node_active_regions */
2847 static int __init cmp_node_active_region(const void *a, const void *b) 2847 static int __init cmp_node_active_region(const void *a, const void *b)
2848 { 2848 {
2849 struct node_active_region *arange = (struct node_active_region *)a; 2849 struct node_active_region *arange = (struct node_active_region *)a;
2850 struct node_active_region *brange = (struct node_active_region *)b; 2850 struct node_active_region *brange = (struct node_active_region *)b;
2851 2851
2852 /* Done this way to avoid overflows */ 2852 /* Done this way to avoid overflows */
2853 if (arange->start_pfn > brange->start_pfn) 2853 if (arange->start_pfn > brange->start_pfn)
2854 return 1; 2854 return 1;
2855 if (arange->start_pfn < brange->start_pfn) 2855 if (arange->start_pfn < brange->start_pfn)
2856 return -1; 2856 return -1;
2857 2857
2858 return 0; 2858 return 0;
2859 } 2859 }
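
The "Done this way to avoid overflows" comment is worth spelling out: the obvious one-liner, returning the difference of the two start_pfn values, truncates an unsigned long into an int and can report the wrong order once the PFNs are far enough apart. A standalone demonstration with made-up PFNs follows; the wraparound shown assumes a typical two's-complement 64-bit build.

#include <stdio.h>

struct node_active_region { unsigned long start_pfn, end_pfn; int nid; };

/* Buggy comparator: the unsigned long difference is truncated to int */
static int cmp_subtract(const void *a, const void *b)
{
        const struct node_active_region *ar = a, *br = b;
        return ar->start_pfn - br->start_pfn;
}

/* Comparator as written in the code above */
static int cmp_compare(const void *a, const void *b)
{
        const struct node_active_region *ar = a, *br = b;
        if (ar->start_pfn > br->start_pfn)
                return 1;
        if (ar->start_pfn < br->start_pfn)
                return -1;
        return 0;
}

int main(void)
{
        /* Made-up PFNs more than INT_MAX apart (possible on 64-bit) */
        struct node_active_region lo = { .start_pfn = 0 };
        struct node_active_region hi = { .start_pfn = 3000000000UL };

        /* The subtraction wraps to a positive int, claiming lo sorts after hi,
         * while the comparison-based version correctly returns -1. */
        printf("subtract: %d  compare: %d\n",
               cmp_subtract(&lo, &hi), cmp_compare(&lo, &hi));
        return 0;
}
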
2860 2860
2861 /* sort the node_map by start_pfn */ 2861 /* sort the node_map by start_pfn */
2862 static void __init sort_node_map(void) 2862 static void __init sort_node_map(void)
2863 { 2863 {
2864 sort(early_node_map, (size_t)nr_nodemap_entries, 2864 sort(early_node_map, (size_t)nr_nodemap_entries,
2865 sizeof(struct node_active_region), 2865 sizeof(struct node_active_region),
2866 cmp_node_active_region, NULL); 2866 cmp_node_active_region, NULL);
2867 } 2867 }
2868 2868
2869 /* Find the lowest pfn for a node */ 2869 /* Find the lowest pfn for a node */
2870 unsigned long __init find_min_pfn_for_node(unsigned long nid) 2870 unsigned long __init find_min_pfn_for_node(unsigned long nid)
2871 { 2871 {
2872 int i; 2872 int i;
2873 unsigned long min_pfn = ULONG_MAX; 2873 unsigned long min_pfn = ULONG_MAX;
2874 2874
2875 /* Assuming a sorted map, the first range found has the starting pfn */ 2875 /* Assuming a sorted map, the first range found has the starting pfn */
2876 for_each_active_range_index_in_nid(i, nid) 2876 for_each_active_range_index_in_nid(i, nid)
2877 min_pfn = min(min_pfn, early_node_map[i].start_pfn); 2877 min_pfn = min(min_pfn, early_node_map[i].start_pfn);
2878 2878
2879 if (min_pfn == ULONG_MAX) { 2879 if (min_pfn == ULONG_MAX) {
2880 printk(KERN_WARNING 2880 printk(KERN_WARNING
2881 "Could not find start_pfn for node %lu\n", nid); 2881 "Could not find start_pfn for node %lu\n", nid);
2882 return 0; 2882 return 0;
2883 } 2883 }
2884 2884
2885 return min_pfn; 2885 return min_pfn;
2886 } 2886 }
2887 2887
2888 /** 2888 /**
2889 * find_min_pfn_with_active_regions - Find the minimum PFN registered 2889 * find_min_pfn_with_active_regions - Find the minimum PFN registered
2890 * 2890 *
2891 * It returns the minimum PFN based on information provided via 2891 * It returns the minimum PFN based on information provided via
2892 * add_active_range(). 2892 * add_active_range().
2893 */ 2893 */
2894 unsigned long __init find_min_pfn_with_active_regions(void) 2894 unsigned long __init find_min_pfn_with_active_regions(void)
2895 { 2895 {
2896 return find_min_pfn_for_node(MAX_NUMNODES); 2896 return find_min_pfn_for_node(MAX_NUMNODES);
2897 } 2897 }
2898 2898
2899 /** 2899 /**
2900 * find_max_pfn_with_active_regions - Find the maximum PFN registered 2900 * find_max_pfn_with_active_regions - Find the maximum PFN registered
2901 * 2901 *
2902 * It returns the maximum PFN based on information provided via 2902 * It returns the maximum PFN based on information provided via
2903 * add_active_range(). 2903 * add_active_range().
2904 */ 2904 */
2905 unsigned long __init find_max_pfn_with_active_regions(void) 2905 unsigned long __init find_max_pfn_with_active_regions(void)
2906 { 2906 {
2907 int i; 2907 int i;
2908 unsigned long max_pfn = 0; 2908 unsigned long max_pfn = 0;
2909 2909
2910 for (i = 0; i < nr_nodemap_entries; i++) 2910 for (i = 0; i < nr_nodemap_entries; i++)
2911 max_pfn = max(max_pfn, early_node_map[i].end_pfn); 2911 max_pfn = max(max_pfn, early_node_map[i].end_pfn);
2912 2912
2913 return max_pfn; 2913 return max_pfn;
2914 } 2914 }
2915 2915
2916 /** 2916 /**
2917 * free_area_init_nodes - Initialise all pg_data_t and zone data 2917 * free_area_init_nodes - Initialise all pg_data_t and zone data
2918 * @max_zone_pfn: an array of max PFNs for each zone 2918 * @max_zone_pfn: an array of max PFNs for each zone
2919 * 2919 *
2920 * This will call free_area_init_node() for each active node in the system. 2920 * This will call free_area_init_node() for each active node in the system.
2921 * Using the page ranges provided by add_active_range(), the size of each 2921 * Using the page ranges provided by add_active_range(), the size of each
2922 * zone in each node, and of its holes, is calculated. If the maximum PFNs 2922 * zone in each node, and of its holes, is calculated. If the maximum PFNs
2923 * of two adjacent zones match, the higher zone is assumed to be empty. 2923 * of two adjacent zones match, the higher zone is assumed to be empty.
2924 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 2924 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
2925 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 2925 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
2926 * starts where the previous one ended. For example, ZONE_DMA32 starts 2926 * starts where the previous one ended. For example, ZONE_DMA32 starts
2927 * at arch_max_dma_pfn. 2927 * at arch_max_dma_pfn.
2928 */ 2928 */
2929 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 2929 void __init free_area_init_nodes(unsigned long *max_zone_pfn)
2930 { 2930 {
2931 unsigned long nid; 2931 unsigned long nid;
2932 enum zone_type i; 2932 enum zone_type i;
2933 2933
2934 /* Sort early_node_map as initialisation assumes it is sorted */ 2934 /* Sort early_node_map as initialisation assumes it is sorted */
2935 sort_node_map(); 2935 sort_node_map();
2936 2936
2937 /* Record where the zone boundaries are */ 2937 /* Record where the zone boundaries are */
2938 memset(arch_zone_lowest_possible_pfn, 0, 2938 memset(arch_zone_lowest_possible_pfn, 0,
2939 sizeof(arch_zone_lowest_possible_pfn)); 2939 sizeof(arch_zone_lowest_possible_pfn));
2940 memset(arch_zone_highest_possible_pfn, 0, 2940 memset(arch_zone_highest_possible_pfn, 0,
2941 sizeof(arch_zone_highest_possible_pfn)); 2941 sizeof(arch_zone_highest_possible_pfn));
2942 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); 2942 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
2943 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; 2943 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
2944 for (i = 1; i < MAX_NR_ZONES; i++) { 2944 for (i = 1; i < MAX_NR_ZONES; i++) {
2945 arch_zone_lowest_possible_pfn[i] = 2945 arch_zone_lowest_possible_pfn[i] =
2946 arch_zone_highest_possible_pfn[i-1]; 2946 arch_zone_highest_possible_pfn[i-1];
2947 arch_zone_highest_possible_pfn[i] = 2947 arch_zone_highest_possible_pfn[i] =
2948 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); 2948 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
2949 } 2949 }
2950 2950
2951 /* Print out the zone ranges */ 2951 /* Print out the zone ranges */
2952 printk("Zone PFN ranges:\n"); 2952 printk("Zone PFN ranges:\n");
2953 for (i = 0; i < MAX_NR_ZONES; i++) 2953 for (i = 0; i < MAX_NR_ZONES; i++)
2954 printk(" %-8s %8lu -> %8lu\n", 2954 printk(" %-8s %8lu -> %8lu\n",
2955 zone_names[i], 2955 zone_names[i],
2956 arch_zone_lowest_possible_pfn[i], 2956 arch_zone_lowest_possible_pfn[i],
2957 arch_zone_highest_possible_pfn[i]); 2957 arch_zone_highest_possible_pfn[i]);
2958 2958
2959 /* Print out the early_node_map[] */ 2959 /* Print out the early_node_map[] */
2960 printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); 2960 printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
2961 for (i = 0; i < nr_nodemap_entries; i++) 2961 for (i = 0; i < nr_nodemap_entries; i++)
2962 printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid, 2962 printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid,
2963 early_node_map[i].start_pfn, 2963 early_node_map[i].start_pfn,
2964 early_node_map[i].end_pfn); 2964 early_node_map[i].end_pfn);
2965 2965
2966 /* Initialise every node */ 2966 /* Initialise every node */
2967 setup_nr_node_ids(); 2967 setup_nr_node_ids();
2968 for_each_online_node(nid) { 2968 for_each_online_node(nid) {
2969 pg_data_t *pgdat = NODE_DATA(nid); 2969 pg_data_t *pgdat = NODE_DATA(nid);
2970 free_area_init_node(nid, pgdat, NULL, 2970 free_area_init_node(nid, pgdat, NULL,
2971 find_min_pfn_for_node(nid), NULL); 2971 find_min_pfn_for_node(nid), NULL);
2972 } 2972 }
2973 } 2973 }
2974 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 2974 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
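
The kernel-doc for free_area_init_nodes() above describes the expected call sequence under CONFIG_ARCH_POPULATES_NODE_MAP: register memory with add_active_range(), then pass only the maximum PFN of each zone. Below is a hypothetical kernel-context sketch (not part of this commit) for a two-node, 4GB machine with 4K pages and a config that has ZONE_DMA and ZONE_NORMAL; all PFN values are invented for illustration.

/* Hypothetical arch setup: register active ranges, then size the zones. */
static void __init hypothetical_paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	add_active_range(0, 0x00000, 0x80000);		/* node 0: first 2GB  */
	add_active_range(1, 0x80000, 0x100000);		/* node 1: second 2GB */

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = 0x1000;		/* first 16MB         */
	max_zone_pfns[ZONE_NORMAL] = 0x100000;		/* everything else    */

	/* Zones are contiguous: ZONE_NORMAL starts where ZONE_DMA ends. */
	free_area_init_nodes(max_zone_pfns);
}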
2975 2975
2976 /** 2976 /**
2977 * set_dma_reserve - set the specified number of pages reserved in the first zone 2977 * set_dma_reserve - set the specified number of pages reserved in the first zone
2978 * @new_dma_reserve: The number of pages to mark reserved 2978 * @new_dma_reserve: The number of pages to mark reserved
2979 * 2979 *
2980 * The per-cpu batchsize and zone watermarks are determined by present_pages. 2980 * The per-cpu batchsize and zone watermarks are determined by present_pages.
2981 * In the DMA zone, a significant percentage may be consumed by the kernel image 2981 * In the DMA zone, a significant percentage may be consumed by the kernel image
2982 * and other unfreeable allocations which can skew the watermarks badly. This 2982 * and other unfreeable allocations which can skew the watermarks badly. This
2983 * function may optionally be used to account for unfreeable pages in the 2983 * function may optionally be used to account for unfreeable pages in the
2984 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 2984 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
2985 * smaller per-cpu batchsize. 2985 * smaller per-cpu batchsize.
2986 */ 2986 */
2987 void __init set_dma_reserve(unsigned long new_dma_reserve) 2987 void __init set_dma_reserve(unsigned long new_dma_reserve)
2988 { 2988 {
2989 dma_reserve = new_dma_reserve; 2989 dma_reserve = new_dma_reserve;
2990 } 2990 }
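
As a hedged sketch of how set_dma_reserve() is meant to be used (the helper name and the numbers below are assumptions; only set_dma_reserve() comes from this file): arch setup code estimates how many pages of the first zone are permanently pinned by the kernel image and early allocations, and reports that count so the watermark and per-cpu batch calculations discount them.

/* Hypothetical arch code: discount unfreeable pages in the first zone. */
static void __init hypothetical_account_low_reserve(void)
{
	unsigned long dma_zone_pages = 0x1000;	/* pages in the first zone (example)     */
	unsigned long dma_free_pages = 0x0c00;	/* still free after early boot (example) */

	if (dma_free_pages < dma_zone_pages)
		set_dma_reserve(dma_zone_pages - dma_free_pages);
}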
2991 2991
2992 #ifndef CONFIG_NEED_MULTIPLE_NODES 2992 #ifndef CONFIG_NEED_MULTIPLE_NODES
2993 static bootmem_data_t contig_bootmem_data; 2993 static bootmem_data_t contig_bootmem_data;
2994 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; 2994 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
2995 2995
2996 EXPORT_SYMBOL(contig_page_data); 2996 EXPORT_SYMBOL(contig_page_data);
2997 #endif 2997 #endif
2998 2998
2999 void __init free_area_init(unsigned long *zones_size) 2999 void __init free_area_init(unsigned long *zones_size)
3000 { 3000 {
3001 free_area_init_node(0, NODE_DATA(0), zones_size, 3001 free_area_init_node(0, NODE_DATA(0), zones_size,
3002 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 3002 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
3003 } 3003 }
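
For the flat (!CONFIG_NEED_MULTIPLE_NODES) case, an architecture only has to fill a zones_size[] array of per-zone page counts and call free_area_init(), which initialises node 0 starting at the PFN of PAGE_OFFSET. A minimal hypothetical sketch with invented sizes:

/* Hypothetical flat-memory setup: 16MB of ZONE_DMA, 112MB of ZONE_NORMAL. */
static void __init hypothetical_flat_paging_init(void)
{
	unsigned long zones_size[MAX_NR_ZONES] = { 0 };

	zones_size[ZONE_DMA] = 0x1000;		/* 4096 pages  = 16MB  */
	zones_size[ZONE_NORMAL] = 0x7000;	/* 28672 pages = 112MB */

	free_area_init(zones_size);
}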
3004 3004
3005 static int page_alloc_cpu_notify(struct notifier_block *self, 3005 static int page_alloc_cpu_notify(struct notifier_block *self,
3006 unsigned long action, void *hcpu) 3006 unsigned long action, void *hcpu)
3007 { 3007 {
3008 int cpu = (unsigned long)hcpu; 3008 int cpu = (unsigned long)hcpu;
3009 3009
3010 if (action == CPU_DEAD) { 3010 if (action == CPU_DEAD) {
3011 local_irq_disable(); 3011 local_irq_disable();
3012 __drain_pages(cpu); 3012 __drain_pages(cpu);
3013 vm_events_fold_cpu(cpu); 3013 vm_events_fold_cpu(cpu);
3014 local_irq_enable(); 3014 local_irq_enable();
3015 refresh_cpu_vm_stats(cpu); 3015 refresh_cpu_vm_stats(cpu);
3016 } 3016 }
3017 return NOTIFY_OK; 3017 return NOTIFY_OK;
3018 } 3018 }
3019 3019
3020 void __init page_alloc_init(void) 3020 void __init page_alloc_init(void)
3021 { 3021 {
3022 hotcpu_notifier(page_alloc_cpu_notify, 0); 3022 hotcpu_notifier(page_alloc_cpu_notify, 0);
3023 } 3023 }
3024 3024
3025 /* 3025 /*
3026 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio 3026 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
3027 * or min_free_kbytes changes. 3027 * or min_free_kbytes changes.
3028 */ 3028 */
3029 static void calculate_totalreserve_pages(void) 3029 static void calculate_totalreserve_pages(void)
3030 { 3030 {
3031 struct pglist_data *pgdat; 3031 struct pglist_data *pgdat;
3032 unsigned long reserve_pages = 0; 3032 unsigned long reserve_pages = 0;
3033 enum zone_type i, j; 3033 enum zone_type i, j;
3034 3034
3035 for_each_online_pgdat(pgdat) { 3035 for_each_online_pgdat(pgdat) {
3036 for (i = 0; i < MAX_NR_ZONES; i++) { 3036 for (i = 0; i < MAX_NR_ZONES; i++) {
3037 struct zone *zone = pgdat->node_zones + i; 3037 struct zone *zone = pgdat->node_zones + i;
3038 unsigned long max = 0; 3038 unsigned long max = 0;
3039 3039
3040 /* Find valid and maximum lowmem_reserve in the zone */ 3040 /* Find valid and maximum lowmem_reserve in the zone */
3041 for (j = i; j < MAX_NR_ZONES; j++) { 3041 for (j = i; j < MAX_NR_ZONES; j++) {
3042 if (zone->lowmem_reserve[j] > max) 3042 if (zone->lowmem_reserve[j] > max)
3043 max = zone->lowmem_reserve[j]; 3043 max = zone->lowmem_reserve[j];
3044 } 3044 }
3045 3045
3046 /* we treat pages_high as reserved pages. */ 3046 /* we treat pages_high as reserved pages. */
3047 max += zone->pages_high; 3047 max += zone->pages_high;
3048 3048
3049 if (max > zone->present_pages) 3049 if (max > zone->present_pages)
3050 max = zone->present_pages; 3050 max = zone->present_pages;
3051 reserve_pages += max; 3051 reserve_pages += max;
3052 } 3052 }
3053 } 3053 }
3054 totalreserve_pages = reserve_pages; 3054 totalreserve_pages = reserve_pages;
3055 } 3055 }
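
A standalone walk-through of what the loop above adds up, per zone: the largest lowmem_reserve[] entry plus pages_high, capped at present_pages. The zone sizes below are invented purely for the arithmetic.

#include <stdio.h>

#define NR_ZONES 2

struct demo_zone {
	unsigned long present_pages;
	unsigned long pages_high;
	unsigned long lowmem_reserve[NR_ZONES];
};

int main(void)
{
	struct demo_zone zones[NR_ZONES] = {
		{ .present_pages = 4096,   .pages_high = 12,  .lowmem_reserve = { 0, 880 } },
		{ .present_pages = 225280, .pages_high = 660, .lowmem_reserve = { 0, 0 } },
	};
	unsigned long reserve_pages = 0;
	int i, j;

	for (i = 0; i < NR_ZONES; i++) {
		unsigned long max = 0;

		for (j = i; j < NR_ZONES; j++)
			if (zones[i].lowmem_reserve[j] > max)
				max = zones[i].lowmem_reserve[j];
		max += zones[i].pages_high;		/* pages_high counts as reserved */
		if (max > zones[i].present_pages)
			max = zones[i].present_pages;
		reserve_pages += max;
	}
	printf("totalreserve_pages = %lu\n", reserve_pages);	/* 892 + 660 = 1552 */
	return 0;
}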
3056 3056
3057 /* 3057 /*
3058 * setup_per_zone_lowmem_reserve - called whenever 3058 * setup_per_zone_lowmem_reserve - called whenever
3059 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone 3059 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone
3060 * has a correct pages reserved value, so an adequate number of 3060 * has a correct pages reserved value, so an adequate number of
3061 * pages are left in the zone after a successful __alloc_pages(). 3061 * pages are left in the zone after a successful __alloc_pages().
3062 */ 3062 */
3063 static void setup_per_zone_lowmem_reserve(void) 3063 static void setup_per_zone_lowmem_reserve(void)
3064 { 3064 {
3065 struct pglist_data *pgdat; 3065 struct pglist_data *pgdat;
3066 enum zone_type j, idx; 3066 enum zone_type j, idx;
3067 3067
3068 for_each_online_pgdat(pgdat) { 3068 for_each_online_pgdat(pgdat) {
3069 for (j = 0; j < MAX_NR_ZONES; j++) { 3069 for (j = 0; j < MAX_NR_ZONES; j++) {
3070 struct zone *zone = pgdat->node_zones + j; 3070 struct zone *zone = pgdat->node_zones + j;
3071 unsigned long present_pages = zone->present_pages; 3071 unsigned long present_pages = zone->present_pages;
3072 3072
3073 zone->lowmem_reserve[j] = 0; 3073 zone->lowmem_reserve[j] = 0;
3074 3074
3075 idx = j; 3075 idx = j;
3076 while (idx) { 3076 while (idx) {
3077 struct zone *lower_zone; 3077 struct zone *lower_zone;
3078 3078
3079 idx--; 3079 idx--;
3080 3080
3081 if (sysctl_lowmem_reserve_ratio[idx] < 1) 3081 if (sysctl_lowmem_reserve_ratio[idx] < 1)
3082 sysctl_lowmem_reserve_ratio[idx] = 1; 3082 sysctl_lowmem_reserve_ratio[idx] = 1;
3083 3083
3084 lower_zone = pgdat->node_zones + idx; 3084 lower_zone = pgdat->node_zones + idx;
3085 lower_zone->lowmem_reserve[j] = present_pages / 3085 lower_zone->lowmem_reserve[j] = present_pages /
3086 sysctl_lowmem_reserve_ratio[idx]; 3086 sysctl_lowmem_reserve_ratio[idx];
3087 present_pages += lower_zone->present_pages; 3087 present_pages += lower_zone->present_pages;
3088 } 3088 }
3089 } 3089 }
3090 } 3090 }
3091 3091
3092 /* update totalreserve_pages */ 3092 /* update totalreserve_pages */
3093 calculate_totalreserve_pages(); 3093 calculate_totalreserve_pages();
3094 } 3094 }
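
The nested loop above is easiest to see with numbers. Below is a standalone re-run of the same arithmetic for an invented i386-style layout (ZONE_DMA 4096 pages, ZONE_NORMAL 225280, ZONE_HIGHMEM 32768) with ratios { 256, 32 }, the defaults for that layout at the time; it reproduces DMA->lowmem_reserve[NORMAL] = 880, NORMAL->lowmem_reserve[HIGHMEM] = 1024 and DMA->lowmem_reserve[HIGHMEM] = 1008.

#include <stdio.h>

#define NR_ZONES 3	/* DMA, NORMAL, HIGHMEM in this example */

int main(void)
{
	unsigned long present[NR_ZONES] = { 4096, 225280, 32768 };
	unsigned long ratio[NR_ZONES - 1] = { 256, 32 };	/* sysctl_lowmem_reserve_ratio */
	unsigned long reserve[NR_ZONES][NR_ZONES] = { { 0 } };
	int j, idx;

	for (j = 0; j < NR_ZONES; j++) {
		unsigned long pages = present[j];

		/* Walk down through every lower zone, as setup_per_zone_lowmem_reserve() does. */
		for (idx = j - 1; idx >= 0; idx--) {
			reserve[idx][j] = pages / ratio[idx];
			pages += present[idx];
		}
	}

	for (idx = 0; idx < NR_ZONES; idx++)
		printf("zone %d lowmem_reserve = { %lu, %lu, %lu }\n", idx,
		       reserve[idx][0], reserve[idx][1], reserve[idx][2]);
	return 0;
}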
3095 3095
3096 /** 3096 /**
3097 * setup_per_zone_pages_min - called when min_free_kbytes changes. 3097 * setup_per_zone_pages_min - called when min_free_kbytes changes.
3098 * 3098 *
3099 * Ensures that the pages_{min,low,high} values for each zone are set correctly 3099 * Ensures that the pages_{min,low,high} values for each zone are set correctly
3100 * with respect to min_free_kbytes. 3100 * with respect to min_free_kbytes.
3101 */ 3101 */
3102 void setup_per_zone_pages_min(void) 3102 void setup_per_zone_pages_min(void)
3103 { 3103 {
3104 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 3104 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
3105 unsigned long lowmem_pages = 0; 3105 unsigned long lowmem_pages = 0;
3106 struct zone *zone; 3106 struct zone *zone;
3107 unsigned long flags; 3107 unsigned long flags;
3108 3108
3109 /* Calculate total number of !ZONE_HIGHMEM pages */ 3109 /* Calculate total number of !ZONE_HIGHMEM pages */
3110 for_each_zone(zone) { 3110 for_each_zone(zone) {
3111 if (!is_highmem(zone)) 3111 if (!is_highmem(zone))
3112 lowmem_pages += zone->present_pages; 3112 lowmem_pages += zone->present_pages;
3113 } 3113 }
3114 3114
3115 for_each_zone(zone) { 3115 for_each_zone(zone) {
3116 u64 tmp; 3116 u64 tmp;
3117 3117
3118 spin_lock_irqsave(&zone->lru_lock, flags); 3118 spin_lock_irqsave(&zone->lru_lock, flags);
3119 tmp = (u64)pages_min * zone->present_pages; 3119 tmp = (u64)pages_min * zone->present_pages;
3120 do_div(tmp, lowmem_pages); 3120 do_div(tmp, lowmem_pages);
3121 if (is_highmem(zone)) { 3121 if (is_highmem(zone)) {
3122 /* 3122 /*
3123 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 3123 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
3124 * need highmem pages, so cap pages_min to a small 3124 * need highmem pages, so cap pages_min to a small
3125 * value here. 3125 * value here.
3126 * 3126 *
3127 * The (pages_high-pages_low) and (pages_low-pages_min) 3127 * The (pages_high-pages_low) and (pages_low-pages_min)
3128 * deltas control asynchronous page reclaim, and so should 3128 * deltas control asynchronous page reclaim, and so should
3129 * not be capped for highmem. 3129 * not be capped for highmem.
3130 */ 3130 */
3131 int min_pages; 3131 int min_pages;
3132 3132
3133 min_pages = zone->present_pages / 1024; 3133 min_pages = zone->present_pages / 1024;
3134 if (min_pages < SWAP_CLUSTER_MAX) 3134 if (min_pages < SWAP_CLUSTER_MAX)
3135 min_pages = SWAP_CLUSTER_MAX; 3135 min_pages = SWAP_CLUSTER_MAX;
3136 if (min_pages > 128) 3136 if (min_pages > 128)
3137 min_pages = 128; 3137 min_pages = 128;
3138 zone->pages_min = min_pages; 3138 zone->pages_min = min_pages;
3139 } else { 3139 } else {
3140 /* 3140 /*
3141 * If it's a lowmem zone, reserve a number of pages 3141 * If it's a lowmem zone, reserve a number of pages
3142 * proportionate to the zone's size. 3142 * proportionate to the zone's size.
3143 */ 3143 */
3144 zone->pages_min = tmp; 3144 zone->pages_min = tmp;
3145 } 3145 }
3146 3146
3147 zone->pages_low = zone->pages_min + (tmp >> 2); 3147 zone->pages_low = zone->pages_min + (tmp >> 2);
3148 zone->pages_high = zone->pages_min + (tmp >> 1); 3148 zone->pages_high = zone->pages_min + (tmp >> 1);
3149 spin_unlock_irqrestore(&zone->lru_lock, flags); 3149 spin_unlock_irqrestore(&zone->lru_lock, flags);
3150 } 3150 }
3151 3151
3152 /* update totalreserve_pages */ 3152 /* update totalreserve_pages */
3153 calculate_totalreserve_pages(); 3153 calculate_totalreserve_pages();
3154 } 3154 }
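
A worked example of the lowmem branch above, with invented but convenient numbers: min_free_kbytes = 4096 with 4K pages gives pages_min = 1024, and with a single 262144-page lowmem zone the proportional split leaves pages_min/pages_low/pages_high at 1024/1280/1536.

#include <stdio.h>

int main(void)
{
	unsigned long min_free_kbytes = 4096;		/* example sysctl value     */
	int page_shift = 12;				/* 4K pages                 */
	unsigned long pages_min = min_free_kbytes >> (page_shift - 10);
	unsigned long lowmem_pages = 262144;		/* all !HIGHMEM pages (1GB) */
	unsigned long zone_present = 262144;		/* the single lowmem zone   */
	unsigned long long tmp;

	/* Same proportional split as the !is_highmem() branch above. */
	tmp = (unsigned long long)pages_min * zone_present / lowmem_pages;
	printf("pages_min=%llu pages_low=%llu pages_high=%llu\n",
	       tmp, tmp + (tmp >> 2), tmp + (tmp >> 1));
	return 0;
}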
3155 3155
3156 /* 3156 /*
3157 * Initialise min_free_kbytes. 3157 * Initialise min_free_kbytes.
3158 * 3158 *
3159 * For small machines we want it small (128k min). For large machines 3159 * For small machines we want it small (128k min). For large machines
3160 * we want it large (64MB max). But it is not linear, because network 3160 * we want it large (64MB max). But it is not linear, because network
3161 * bandwidth does not increase linearly with machine size. We use 3161 * bandwidth does not increase linearly with machine size. We use
3162 * 3162 *
3163 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 3163 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
3164 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 3164 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
3165 * 3165 *
3166 * which yields 3166 * which yields
3167 * 3167 *
3168 * 16MB: 512k 3168 * 16MB: 512k
3169 * 32MB: 724k 3169 * 32MB: 724k
3170 * 64MB: 1024k 3170 * 64MB: 1024k
3171 * 128MB: 1448k 3171 * 128MB: 1448k
3172 * 256MB: 2048k 3172 * 256MB: 2048k
3173 * 512MB: 2896k 3173 * 512MB: 2896k
3174 * 1024MB: 4096k 3174 * 1024MB: 4096k
3175 * 2048MB: 5792k 3175 * 2048MB: 5792k
3176 * 4096MB: 8192k 3176 * 4096MB: 8192k
3177 * 8192MB: 11584k 3177 * 8192MB: 11584k
3178 * 16384MB: 16384k 3178 * 16384MB: 16384k
3179 */ 3179 */
3180 static int __init init_per_zone_pages_min(void) 3180 static int __init init_per_zone_pages_min(void)
3181 { 3181 {
3182 unsigned long lowmem_kbytes; 3182 unsigned long lowmem_kbytes;
3183 3183
3184 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 3184 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
3185 3185
3186 min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 3186 min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
3187 if (min_free_kbytes < 128) 3187 if (min_free_kbytes < 128)
3188 min_free_kbytes = 128; 3188 min_free_kbytes = 128;
3189 if (min_free_kbytes > 65536) 3189 if (min_free_kbytes > 65536)
3190 min_free_kbytes = 65536; 3190 min_free_kbytes = 65536;
3191 setup_per_zone_pages_min(); 3191 setup_per_zone_pages_min();
3192 setup_per_zone_lowmem_reserve(); 3192 setup_per_zone_lowmem_reserve();
3193 return 0; 3193 return 0;
3194 } 3194 }
3195 module_init(init_per_zone_pages_min) 3195 module_init(init_per_zone_pages_min)
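
The table in the comment block above follows directly from min_free_kbytes = sqrt(lowmem_kbytes * 16) with the 128/65536 clamps applied in init_per_zone_pages_min(). A standalone check for a few of the listed sizes (floating-point sqrt() stands in for the kernel's int_sqrt(); link with -lm):

#include <math.h>
#include <stdio.h>

int main(void)
{
	unsigned long lowmem_mb[] = { 16, 128, 1024, 16384 };
	unsigned int i;

	for (i = 0; i < sizeof(lowmem_mb) / sizeof(lowmem_mb[0]); i++) {
		unsigned long lowmem_kbytes = lowmem_mb[i] * 1024;
		unsigned long min_free = (unsigned long)sqrt((double)lowmem_kbytes * 16);

		if (min_free < 128)
			min_free = 128;
		if (min_free > 65536)
			min_free = 65536;
		printf("%6luMB lowmem -> min_free_kbytes = %lu\n", lowmem_mb[i], min_free);
	}
	return 0;	/* prints 512, 1448, 4096, 16384 */
}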
3196 3196
3197 /* 3197 /*
3198 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 3198 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
3199 * that we can call two helper functions whenever min_free_kbytes 3199 * that we can call two helper functions whenever min_free_kbytes
3200 * changes. 3200 * changes.
3201 */ 3201 */
3202 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 3202 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
3203 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 3203 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
3204 { 3204 {
3205 proc_dointvec(table, write, file, buffer, length, ppos); 3205 proc_dointvec(table, write, file, buffer, length, ppos);
3206 setup_per_zone_pages_min(); 3206 setup_per_zone_pages_min();
3207 return 0; 3207 return 0;
3208 } 3208 }
3209 3209
3210 #ifdef CONFIG_NUMA 3210 #ifdef CONFIG_NUMA
3211 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 3211 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
3212 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 3212 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
3213 { 3213 {
3214 struct zone *zone; 3214 struct zone *zone;
3215 int rc; 3215 int rc;
3216 3216
3217 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 3217 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
3218 if (rc) 3218 if (rc)
3219 return rc; 3219 return rc;
3220 3220
3221 for_each_zone(zone) 3221 for_each_zone(zone)
3222 zone->min_unmapped_pages = (zone->present_pages * 3222 zone->min_unmapped_pages = (zone->present_pages *
3223 sysctl_min_unmapped_ratio) / 100; 3223 sysctl_min_unmapped_ratio) / 100;
3224 return 0; 3224 return 0;
3225 } 3225 }
3226 3226
3227 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 3227 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
3228 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 3228 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
3229 { 3229 {
3230 struct zone *zone; 3230 struct zone *zone;
3231 int rc; 3231 int rc;
3232 3232
3233 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 3233 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
3234 if (rc) 3234 if (rc)
3235 return rc; 3235 return rc;
3236 3236
3237 for_each_zone(zone) 3237 for_each_zone(zone)
3238 zone->min_slab_pages = (zone->present_pages * 3238 zone->min_slab_pages = (zone->present_pages *
3239 sysctl_min_slab_ratio) / 100; 3239 sysctl_min_slab_ratio) / 100;
3240 return 0; 3240 return 0;
3241 } 3241 }
3242 #endif 3242 #endif
3243 3243
3244 /* 3244 /*
3245 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 3245 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
3246 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 3246 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
3247 * whenever sysctl_lowmem_reserve_ratio changes. 3247 * whenever sysctl_lowmem_reserve_ratio changes.
3248 * 3248 *
3249 * The reserve ratio has no relation to the pages_min watermarks. The 3249 * The reserve ratio has no relation to the pages_min watermarks. The
3250 * lowmem reserve ratio only makes sense as a function of the boot-time 3250 * lowmem reserve ratio only makes sense as a function of the boot-time
3251 * zone sizes. 3251 * zone sizes.
3252 */ 3252 */
3253 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 3253 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
3254 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 3254 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
3255 { 3255 {
3256 proc_dointvec_minmax(table, write, file, buffer, length, ppos); 3256 proc_dointvec_minmax(table, write, file, buffer, length, ppos);
3257 setup_per_zone_lowmem_reserve(); 3257 setup_per_zone_lowmem_reserve();
3258 return 0; 3258 return 0;
3259 } 3259 }
3260 3260
3261 /* 3261 /*
3262 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 3262 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
3263 * cpu. It is the fraction of total pages in each zone that a hot per-cpu 3263 * cpu. It is the fraction of total pages in each zone that a hot per-cpu
3264 * pagelist can hold before it is flushed back to the buddy allocator. 3264 * pagelist can hold before it is flushed back to the buddy allocator.
3265 */ 3265 */
3266 3266
3267 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 3267 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
3268 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 3268 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
3269 { 3269 {
3270 struct zone *zone; 3270 struct zone *zone;
3271 unsigned int cpu; 3271 unsigned int cpu;
3272 int ret; 3272 int ret;
3273 3273
3274 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 3274 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
3275 if (!write || (ret == -EINVAL)) 3275 if (!write || (ret == -EINVAL))
3276 return ret; 3276 return ret;
3277 for_each_zone(zone) { 3277 for_each_zone(zone) {
3278 for_each_online_cpu(cpu) { 3278 for_each_online_cpu(cpu) {
3279 unsigned long high; 3279 unsigned long high;
3280 high = zone->present_pages / percpu_pagelist_fraction; 3280 high = zone->present_pages / percpu_pagelist_fraction;
3281 setup_pagelist_highmark(zone_pcp(zone, cpu), high); 3281 setup_pagelist_highmark(zone_pcp(zone, cpu), high);
3282 } 3282 }
3283 } 3283 }
3284 return 0; 3284 return 0;
3285 } 3285 }
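
A quick arithmetic example of the handler above (numbers invented): with percpu_pagelist_fraction set to 8, a 225280-page zone gives each online CPU a hot-list high mark of 28160 pages.

#include <stdio.h>

int main(void)
{
	unsigned long present_pages = 225280;	/* example zone size                   */
	unsigned int fraction = 8;		/* percpu_pagelist_fraction (example)  */

	/* Same per-zone, per-cpu division as the handler above. */
	printf("pcp->high = %lu pages\n", present_pages / fraction);
	return 0;
}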
3286 3286
3287 int hashdist = HASHDIST_DEFAULT; 3287 int hashdist = HASHDIST_DEFAULT;
3288 3288
3289 #ifdef CONFIG_NUMA 3289 #ifdef CONFIG_NUMA
3290 static int __init set_hashdist(char *str) 3290 static int __init set_hashdist(char *str)
3291 { 3291 {
3292 if (!str) 3292 if (!str)
3293 return 0; 3293 return 0;
3294 hashdist = simple_strtoul(str, &str, 0); 3294 hashdist = simple_strtoul(str, &str, 0);
3295 return 1; 3295 return 1;
3296 } 3296 }
3297 __setup("hashdist=", set_hashdist); 3297 __setup("hashdist=", set_hashdist);
3298 #endif 3298 #endif
3299 3299
3300 /* 3300 /*
3301 * allocate a large system hash table from bootmem 3301 * allocate a large system hash table from bootmem
3302 * - it is assumed that the hash table must contain an exact power-of-2 3302 * - it is assumed that the hash table must contain an exact power-of-2
3303 * quantity of entries 3303 * quantity of entries
3304 * - limit is the number of hash buckets, not the total allocation size 3304 * - limit is the number of hash buckets, not the total allocation size
3305 */ 3305 */
3306 void *__init alloc_large_system_hash(const char *tablename, 3306 void *__init alloc_large_system_hash(const char *tablename,
3307 unsigned long bucketsize, 3307 unsigned long bucketsize,
3308 unsigned long numentries, 3308 unsigned long numentries,
3309 int scale, 3309 int scale,
3310 int flags, 3310 int flags,
3311 unsigned int *_hash_shift, 3311 unsigned int *_hash_shift,
3312 unsigned int *_hash_mask, 3312 unsigned int *_hash_mask,
3313 unsigned long limit) 3313 unsigned long limit)
3314 { 3314 {
3315 unsigned long long max = limit; 3315 unsigned long long max = limit;
3316 unsigned long log2qty, size; 3316 unsigned long log2qty, size;
3317 void *table = NULL; 3317 void *table = NULL;
3318 3318
3319 /* allow the kernel cmdline to have a say */ 3319 /* allow the kernel cmdline to have a say */
3320 if (!numentries) { 3320 if (!numentries) {
3321 /* round applicable memory size up to nearest megabyte */ 3321 /* round applicable memory size up to nearest megabyte */
3322 numentries = nr_kernel_pages; 3322 numentries = nr_kernel_pages;
3323 numentries += (1UL << (20 - PAGE_SHIFT)) - 1; 3323 numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
3324 numentries >>= 20 - PAGE_SHIFT; 3324 numentries >>= 20 - PAGE_SHIFT;
3325 numentries <<= 20 - PAGE_SHIFT; 3325 numentries <<= 20 - PAGE_SHIFT;
3326 3326
3327 /* limit to 1 bucket per 2^scale bytes of low memory */ 3327 /* limit to 1 bucket per 2^scale bytes of low memory */
3328 if (scale > PAGE_SHIFT) 3328 if (scale > PAGE_SHIFT)
3329 numentries >>= (scale - PAGE_SHIFT); 3329 numentries >>= (scale - PAGE_SHIFT);
3330 else 3330 else
3331 numentries <<= (PAGE_SHIFT - scale); 3331 numentries <<= (PAGE_SHIFT - scale);
3332 3332
3333 /* Make sure we've got at least a 0-order allocation.. */ 3333 /* Make sure we've got at least a 0-order allocation.. */
3334 if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 3334 if (unlikely((numentries * bucketsize) < PAGE_SIZE))
3335 numentries = PAGE_SIZE / bucketsize; 3335 numentries = PAGE_SIZE / bucketsize;
3336 } 3336 }
3337 numentries = roundup_pow_of_two(numentries); 3337 numentries = roundup_pow_of_two(numentries);
3338 3338
3339 /* limit allocation size to 1/16 total memory by default */ 3339 /* limit allocation size to 1/16 total memory by default */
3340 if (max == 0) { 3340 if (max == 0) {
3341 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 3341 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
3342 do_div(max, bucketsize); 3342 do_div(max, bucketsize);
3343 } 3343 }
3344 3344
3345 if (numentries > max) 3345 if (numentries > max)
3346 numentries = max; 3346 numentries = max;
3347 3347
3348 log2qty = ilog2(numentries); 3348 log2qty = ilog2(numentries);
3349 3349
3350 do { 3350 do {
3351 size = bucketsize << log2qty; 3351 size = bucketsize << log2qty;
3352 if (flags & HASH_EARLY) 3352 if (flags & HASH_EARLY)
3353 table = alloc_bootmem(size); 3353 table = alloc_bootmem(size);
3354 else if (hashdist) 3354 else if (hashdist)
3355 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 3355 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
3356 else { 3356 else {
3357 unsigned long order; 3357 unsigned long order;
3358 for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++) 3358 for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
3359 ; 3359 ;
3360 table = (void*) __get_free_pages(GFP_ATOMIC, order); 3360 table = (void*) __get_free_pages(GFP_ATOMIC, order);
3361 } 3361 }
3362 } while (!table && size > PAGE_SIZE && --log2qty); 3362 } while (!table && size > PAGE_SIZE && --log2qty);
3363 3363
3364 if (!table) 3364 if (!table)
3365 panic("Failed to allocate %s hash table\n", tablename); 3365 panic("Failed to allocate %s hash table\n", tablename);
3366 3366
3367 printk("%s hash table entries: %d (order: %d, %lu bytes)\n", 3367 printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
3368 tablename, 3368 tablename,
3369 (1U << log2qty), 3369 (1U << log2qty),
3370 ilog2(size) - PAGE_SHIFT, 3370 ilog2(size) - PAGE_SHIFT,
3371 size); 3371 size);
3372 3372
3373 if (_hash_shift) 3373 if (_hash_shift)
3374 *_hash_shift = log2qty; 3374 *_hash_shift = log2qty;
3375 if (_hash_mask) 3375 if (_hash_mask)
3376 *_hash_mask = (1 << log2qty) - 1; 3376 *_hash_mask = (1 << log2qty) - 1;
3377 3377
3378 return table; 3378 return table;
3379 } 3379 }
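
The sizing logic above compresses several steps. A standalone re-run of the same arithmetic with invented inputs (896MB of lowmem, 1GB total, a 16-byte bucket, scale = 14, no explicit limit) makes the result concrete: 65536 entries, a 1MB order-8 allocation and a hash mask of 0xffff.

#include <stdio.h>

/* Round v up to the next power of two (v > 0). */
static unsigned long roundup_pow2(unsigned long v)
{
	unsigned long r = 1;

	while (r < v)
		r <<= 1;
	return r;
}

int main(void)
{
	const int page_shift = 12;			/* 4K pages                      */
	const unsigned long page_size = 1UL << page_shift;
	unsigned long nr_kernel_pages = 229376;		/* ~896MB of lowmem (invented)   */
	unsigned long nr_all_pages = 262144;		/* 1GB total (invented)          */
	unsigned long bucketsize = 16;			/* bytes per bucket (invented)   */
	int scale = 14;					/* one bucket per 16KB of lowmem */
	unsigned long numentries, log2qty = 0, size, order = 0;
	unsigned long long max;

	/* Round the applicable memory size up to the nearest megabyte. */
	numentries = nr_kernel_pages;
	numentries += (1UL << (20 - page_shift)) - 1;
	numentries >>= 20 - page_shift;
	numentries <<= 20 - page_shift;

	/* Limit to one bucket per 2^scale bytes of low memory. */
	if (scale > page_shift)
		numentries >>= (scale - page_shift);
	else
		numentries <<= (page_shift - scale);
	if (numentries * bucketsize < page_size)
		numentries = page_size / bucketsize;
	numentries = roundup_pow2(numentries);

	/* Cap the allocation at 1/16 of total memory. */
	max = ((unsigned long long)nr_all_pages << page_shift) >> 4;
	max /= bucketsize;
	if (numentries > max)
		numentries = (unsigned long)max;

	while ((1UL << (log2qty + 1)) <= numentries)
		log2qty++;
	size = bucketsize << log2qty;
	while ((page_size << order) < size)
		order++;

	printf("%lu entries (order %lu, %lu bytes), hash_mask %#lx\n",
	       1UL << log2qty, order, size, (1UL << log2qty) - 1);
	return 0;
}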
3380 3380
3381 #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE 3381 #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
3382 struct page *pfn_to_page(unsigned long pfn) 3382 struct page *pfn_to_page(unsigned long pfn)
3383 { 3383 {
3384 return __pfn_to_page(pfn); 3384 return __pfn_to_page(pfn);
3385 } 3385 }
3386 unsigned long page_to_pfn(struct page *page) 3386 unsigned long page_to_pfn(struct page *page)
3387 { 3387 {
3388 return __page_to_pfn(page); 3388 return __page_to_pfn(page);
3389 } 3389 }
3390 EXPORT_SYMBOL(pfn_to_page); 3390 EXPORT_SYMBOL(pfn_to_page);
3391 EXPORT_SYMBOL(page_to_pfn); 3391 EXPORT_SYMBOL(page_to_pfn);
3392 #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ 3392 #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
3393 3393
3394 3394
3395 3395