Commit 32439700fe1c0fc3c2d3f2aedd3ad6707c88b8ba

Authored by Linus Torvalds

Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Ingo Molnar:
 "Various fixlets, mostly related to the (root-only) SCHED_DEADLINE
  policy, but also a hotplug bug fix and a fix for a NR_CPUS related
  overallocation bug causing a suspend/resume regression"

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched: Fix hotplug vs. set_cpus_allowed_ptr()
  sched/cpupri: Replace NR_CPUS arrays
  sched/deadline: Replace NR_CPUS arrays
  sched/deadline: Restrict user params max value to 2^63 ns
  sched/deadline: Change sched_getparam() behaviour vs SCHED_DEADLINE
  sched: Disallow sched_attr::sched_policy < 0
  sched: Make sched_setattr() correctly return -EFBIG
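
The SCHED_DEADLINE entries above concern validation of the user-supplied
sched_attr parameters. As a point of reference, here is a minimal userspace
sketch (not part of this commit) that sets SCHED_DEADLINE parameters via
sched_setattr(2); the syscall-number fallback and the sched_attr layout are
assumptions based on the uapi headers of this kernel series, so treat it as
illustrative only:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE	6
#endif

#ifndef SYS_sched_setattr
#define SYS_sched_setattr 314	/* x86_64 value; an assumption, check your arch */
#endif

/* Layout mirrors the kernel's struct sched_attr of this era. */
struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;		/* SCHED_NORMAL, SCHED_BATCH */
	uint32_t sched_priority;	/* SCHED_FIFO, SCHED_RR */
	uint64_t sched_runtime;		/* SCHED_DEADLINE, all in ns */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy   = SCHED_DEADLINE;
	attr.sched_runtime  =  10 * 1000 * 1000;	/* 10 ms  */
	attr.sched_deadline =  30 * 1000 * 1000;	/* 30 ms  */
	attr.sched_period   = 100 * 1000 * 1000;	/* 100 ms */

	/* Per the shortlog, this merge tightens validation of these fields:
	 * a negative sched_policy is rejected and the runtime/deadline/period
	 * values are limited to 2^63 ns. */
	if (syscall(SYS_sched_setattr, 0, &attr, 0))
		perror("sched_setattr");
	return 0;
}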

Showing 6 changed files (inline diff view). The excerpt below covers kernel/cpu.c and the beginning of kernel/sched/core.c.

/* CPU control.
 * (C) 2001, 2002, 2003, 2004 Rusty Russell
 *
 * This code is licenced under the GPL.
 */
#include <linux/proc_fs.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/sched.h>
#include <linux/unistd.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
#include <linux/bug.h>
#include <linux/kthread.h>
#include <linux/stop_machine.h>
#include <linux/mutex.h>
#include <linux/gfp.h>
#include <linux/suspend.h>
#include <linux/lockdep.h>

#include "smpboot.h"

#ifdef CONFIG_SMP
/* Serializes the updates to cpu_online_mask, cpu_present_mask */
static DEFINE_MUTEX(cpu_add_remove_lock);

/*
 * The following two APIs (cpu_maps_update_begin/done) must be used when
 * attempting to serialize the updates to cpu_online_mask & cpu_present_mask.
 * The APIs cpu_notifier_register_begin/done() must be used to protect CPU
 * hotplug callback (un)registration performed using __register_cpu_notifier()
 * or __unregister_cpu_notifier().
 */
void cpu_maps_update_begin(void)
{
	mutex_lock(&cpu_add_remove_lock);
}
EXPORT_SYMBOL(cpu_notifier_register_begin);

void cpu_maps_update_done(void)
{
	mutex_unlock(&cpu_add_remove_lock);
}
EXPORT_SYMBOL(cpu_notifier_register_done);

static RAW_NOTIFIER_HEAD(cpu_chain);

/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
 * Should always be manipulated under cpu_add_remove_lock
 */
static int cpu_hotplug_disabled;

#ifdef CONFIG_HOTPLUG_CPU

static struct {
	struct task_struct *active_writer;
	struct mutex lock; /* Synchronizes accesses to refcount, */
	/*
	 * Also blocks the new readers during
	 * an ongoing cpu hotplug operation.
	 */
	int refcount;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	struct lockdep_map dep_map;
#endif
} cpu_hotplug = {
	.active_writer = NULL,
	.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
	.refcount = 0,
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	.dep_map = {.name = "cpu_hotplug.lock" },
#endif
};

/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */
#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map)
#define cpuhp_lock_acquire()      lock_map_acquire(&cpu_hotplug.dep_map)
#define cpuhp_lock_release()      lock_map_release(&cpu_hotplug.dep_map)

void get_online_cpus(void)
{
	might_sleep();
	if (cpu_hotplug.active_writer == current)
		return;
	cpuhp_lock_acquire_read();
	mutex_lock(&cpu_hotplug.lock);
	cpu_hotplug.refcount++;
	mutex_unlock(&cpu_hotplug.lock);

}
EXPORT_SYMBOL_GPL(get_online_cpus);

void put_online_cpus(void)
{
	if (cpu_hotplug.active_writer == current)
		return;
	mutex_lock(&cpu_hotplug.lock);

	if (WARN_ON(!cpu_hotplug.refcount))
		cpu_hotplug.refcount++; /* try to fix things up */

	if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
		wake_up_process(cpu_hotplug.active_writer);
	mutex_unlock(&cpu_hotplug.lock);
	cpuhp_lock_release();

}
EXPORT_SYMBOL_GPL(put_online_cpus);

/*
 * This ensures that the hotplug operation can begin only when the
 * refcount goes to zero.
 *
 * Note that during a cpu-hotplug operation, the new readers, if any,
 * will be blocked by the cpu_hotplug.lock
 *
 * Since cpu_hotplug_begin() is always called after invoking
 * cpu_maps_update_begin(), we can be sure that only one writer is active.
 *
 * Note that theoretically, there is a possibility of a livelock:
 * - Refcount goes to zero, last reader wakes up the sleeping
 *   writer.
 * - Last reader unlocks the cpu_hotplug.lock.
 * - A new reader arrives at this moment, bumps up the refcount.
 * - The writer acquires the cpu_hotplug.lock finds the refcount
 *   non zero and goes to sleep again.
 *
 * However, this is very difficult to achieve in practice since
 * get_online_cpus() not an api which is called all that often.
 *
 */
void cpu_hotplug_begin(void)
{
	cpu_hotplug.active_writer = current;

	cpuhp_lock_acquire();
	for (;;) {
		mutex_lock(&cpu_hotplug.lock);
		if (likely(!cpu_hotplug.refcount))
			break;
		__set_current_state(TASK_UNINTERRUPTIBLE);
		mutex_unlock(&cpu_hotplug.lock);
		schedule();
	}
}

void cpu_hotplug_done(void)
{
	cpu_hotplug.active_writer = NULL;
	mutex_unlock(&cpu_hotplug.lock);
	cpuhp_lock_release();
}

/*
 * Wait for currently running CPU hotplug operations to complete (if any) and
 * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
 * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the
 * hotplug path before performing hotplug operations. So acquiring that lock
 * guarantees mutual exclusion from any currently running hotplug operations.
 */
void cpu_hotplug_disable(void)
{
	cpu_maps_update_begin();
	cpu_hotplug_disabled = 1;
	cpu_maps_update_done();
}

void cpu_hotplug_enable(void)
{
	cpu_maps_update_begin();
	cpu_hotplug_disabled = 0;
	cpu_maps_update_done();
}

#endif	/* CONFIG_HOTPLUG_CPU */

/* Need to know about CPUs going up/down? */
int __ref register_cpu_notifier(struct notifier_block *nb)
{
	int ret;
	cpu_maps_update_begin();
	ret = raw_notifier_chain_register(&cpu_chain, nb);
	cpu_maps_update_done();
	return ret;
}

int __ref __register_cpu_notifier(struct notifier_block *nb)
{
	return raw_notifier_chain_register(&cpu_chain, nb);
}

static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
			int *nr_calls)
{
	int ret;

	ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call,
					nr_calls);

	return notifier_to_errno(ret);
}

static int cpu_notify(unsigned long val, void *v)
{
	return __cpu_notify(val, v, -1, NULL);
}

#ifdef CONFIG_HOTPLUG_CPU

static void cpu_notify_nofail(unsigned long val, void *v)
{
	BUG_ON(cpu_notify(val, v));
}
EXPORT_SYMBOL(register_cpu_notifier);
EXPORT_SYMBOL(__register_cpu_notifier);

void __ref unregister_cpu_notifier(struct notifier_block *nb)
{
	cpu_maps_update_begin();
	raw_notifier_chain_unregister(&cpu_chain, nb);
	cpu_maps_update_done();
}
EXPORT_SYMBOL(unregister_cpu_notifier);

void __ref __unregister_cpu_notifier(struct notifier_block *nb)
{
	raw_notifier_chain_unregister(&cpu_chain, nb);
}
EXPORT_SYMBOL(__unregister_cpu_notifier);

/**
 * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
 * @cpu: a CPU id
 *
 * This function walks all processes, finds a valid mm struct for each one and
 * then clears a corresponding bit in mm's cpumask. While this all sounds
 * trivial, there are various non-obvious corner cases, which this function
 * tries to solve in a safe manner.
 *
 * Also note that the function uses a somewhat relaxed locking scheme, so it may
 * be called only for an already offlined CPU.
 */
void clear_tasks_mm_cpumask(int cpu)
{
	struct task_struct *p;

	/*
	 * This function is called after the cpu is taken down and marked
	 * offline, so its not like new tasks will ever get this cpu set in
	 * their mm mask. -- Peter Zijlstra
	 * Thus, we may use rcu_read_lock() here, instead of grabbing
	 * full-fledged tasklist_lock.
	 */
	WARN_ON(cpu_online(cpu));
	rcu_read_lock();
	for_each_process(p) {
		struct task_struct *t;

		/*
		 * Main thread might exit, but other threads may still have
		 * a valid mm. Find one.
		 */
		t = find_lock_task_mm(p);
		if (!t)
			continue;
		cpumask_clear_cpu(cpu, mm_cpumask(t->mm));
		task_unlock(t);
	}
	rcu_read_unlock();
}

static inline void check_for_tasks(int cpu)
{
	struct task_struct *p;
	cputime_t utime, stime;

	write_lock_irq(&tasklist_lock);
	for_each_process(p) {
		task_cputime(p, &utime, &stime);
		if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
		    (utime || stime))
			printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
				"(state = %ld, flags = %x)\n",
				p->comm, task_pid_nr(p), cpu,
				p->state, p->flags);
	}
	write_unlock_irq(&tasklist_lock);
}

struct take_cpu_down_param {
	unsigned long mod;
	void *hcpu;
};

/* Take this CPU down. */
static int __ref take_cpu_down(void *_param)
{
	struct take_cpu_down_param *param = _param;
	int err;

	/* Ensure this CPU doesn't handle any more interrupts. */
	err = __cpu_disable();
	if (err < 0)
		return err;

	cpu_notify(CPU_DYING | param->mod, param->hcpu);
	/* Park the stopper thread */
	kthread_park(current);
	return 0;
}

/* Requires cpu_add_remove_lock to be held */
static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
{
	int err, nr_calls = 0;
	void *hcpu = (void *)(long)cpu;
	unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
	struct take_cpu_down_param tcd_param = {
		.mod = mod,
		.hcpu = hcpu,
	};

	if (num_online_cpus() == 1)
		return -EBUSY;

	if (!cpu_online(cpu))
		return -EINVAL;

	cpu_hotplug_begin();

	err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
	if (err) {
		nr_calls--;
		__cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
		printk("%s: attempt to take down CPU %u failed\n",
				__func__, cpu);
		goto out_release;
	}

	/*
	 * By now we've cleared cpu_active_mask, wait for all preempt-disabled
	 * and RCU users of this state to go away such that all new such users
	 * will observe it.
	 *
	 * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
	 * not imply sync_sched(), so explicitly call both.
	 *
	 * Do sync before park smpboot threads to take care the rcu boost case.
	 */
#ifdef CONFIG_PREEMPT
	synchronize_sched();
#endif
	synchronize_rcu();

	smpboot_park_threads(cpu);

	/*
	 * So now all preempt/rcu users must observe !cpu_active().
	 */

	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
	if (err) {
		/* CPU didn't die: tell everyone. Can't complain. */
		smpboot_unpark_threads(cpu);
		cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
		goto out_release;
	}
	BUG_ON(cpu_online(cpu));

	/*
	 * The migration_call() CPU_DYING callback will have removed all
	 * runnable tasks from the cpu, there's only the idle task left now
	 * that the migration thread is done doing the stop_machine thing.
	 *
	 * Wait for the stop thread to go away.
	 */
	while (!idle_cpu(cpu))
		cpu_relax();

	/* This actually kills the CPU. */
	__cpu_die(cpu);

	/* CPU is completely dead: tell everyone. Too late to complain. */
	cpu_notify_nofail(CPU_DEAD | mod, hcpu);

	check_for_tasks(cpu);

out_release:
	cpu_hotplug_done();
	if (!err)
		cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu);
	return err;
}

int __ref cpu_down(unsigned int cpu)
{
	int err;

	cpu_maps_update_begin();

	if (cpu_hotplug_disabled) {
		err = -EBUSY;
		goto out;
	}

	err = _cpu_down(cpu, 0);

out:
	cpu_maps_update_done();
	return err;
}
EXPORT_SYMBOL(cpu_down);
#endif /*CONFIG_HOTPLUG_CPU*/

/* Requires cpu_add_remove_lock to be held */
static int _cpu_up(unsigned int cpu, int tasks_frozen)
{
	int ret, nr_calls = 0;
	void *hcpu = (void *)(long)cpu;
	unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
	struct task_struct *idle;

	cpu_hotplug_begin();

	if (cpu_online(cpu) || !cpu_present(cpu)) {
		ret = -EINVAL;
		goto out;
	}

	idle = idle_thread_get(cpu);
	if (IS_ERR(idle)) {
		ret = PTR_ERR(idle);
		goto out;
	}

	ret = smpboot_create_threads(cpu);
	if (ret)
		goto out;

	ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
	if (ret) {
		nr_calls--;
		printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n",
				__func__, cpu);
		goto out_notify;
	}

	/* Arch-specific enabling code. */
	ret = __cpu_up(cpu, idle);
	if (ret != 0)
		goto out_notify;
	BUG_ON(!cpu_online(cpu));

	/* Wake the per cpu threads */
	smpboot_unpark_threads(cpu);

	/* Now call notifier in preparation. */
	cpu_notify(CPU_ONLINE | mod, hcpu);

out_notify:
	if (ret != 0)
		__cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
out:
	cpu_hotplug_done();

	return ret;
}

int cpu_up(unsigned int cpu)
{
	int err = 0;

	if (!cpu_possible(cpu)) {
		printk(KERN_ERR "can't online cpu %d because it is not "
			"configured as may-hotadd at boot time\n", cpu);
#if defined(CONFIG_IA64)
		printk(KERN_ERR "please check additional_cpus= boot "
				"parameter\n");
#endif
		return -EINVAL;
	}

	err = try_online_node(cpu_to_node(cpu));
	if (err)
		return err;

	cpu_maps_update_begin();

	if (cpu_hotplug_disabled) {
		err = -EBUSY;
		goto out;
	}

	err = _cpu_up(cpu, 0);

out:
	cpu_maps_update_done();
	return err;
}
EXPORT_SYMBOL_GPL(cpu_up);

#ifdef CONFIG_PM_SLEEP_SMP
static cpumask_var_t frozen_cpus;

int disable_nonboot_cpus(void)
{
	int cpu, first_cpu, error = 0;

	cpu_maps_update_begin();
	first_cpu = cpumask_first(cpu_online_mask);
	/*
	 * We take down all of the non-boot CPUs in one shot to avoid races
	 * with the userspace trying to use the CPU hotplug at the same time
	 */
	cpumask_clear(frozen_cpus);

	printk("Disabling non-boot CPUs ...\n");
	for_each_online_cpu(cpu) {
		if (cpu == first_cpu)
			continue;
		error = _cpu_down(cpu, 1);
		if (!error)
			cpumask_set_cpu(cpu, frozen_cpus);
		else {
			printk(KERN_ERR "Error taking CPU%d down: %d\n",
				cpu, error);
			break;
		}
	}

	if (!error) {
		BUG_ON(num_online_cpus() > 1);
		/* Make sure the CPUs won't be enabled by someone else */
		cpu_hotplug_disabled = 1;
	} else {
		printk(KERN_ERR "Non-boot CPUs are not disabled\n");
	}
	cpu_maps_update_done();
	return error;
}

void __weak arch_enable_nonboot_cpus_begin(void)
{
}

void __weak arch_enable_nonboot_cpus_end(void)
{
}

void __ref enable_nonboot_cpus(void)
{
	int cpu, error;

	/* Allow everyone to use the CPU hotplug again */
	cpu_maps_update_begin();
	cpu_hotplug_disabled = 0;
	if (cpumask_empty(frozen_cpus))
		goto out;

	printk(KERN_INFO "Enabling non-boot CPUs ...\n");

	arch_enable_nonboot_cpus_begin();

	for_each_cpu(cpu, frozen_cpus) {
		error = _cpu_up(cpu, 1);
		if (!error) {
			printk(KERN_INFO "CPU%d is up\n", cpu);
			continue;
		}
		printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
	}

	arch_enable_nonboot_cpus_end();

	cpumask_clear(frozen_cpus);
out:
	cpu_maps_update_done();
}

static int __init alloc_frozen_cpus(void)
{
	if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))
		return -ENOMEM;
	return 0;
}
core_initcall(alloc_frozen_cpus);

/*
 * When callbacks for CPU hotplug notifications are being executed, we must
 * ensure that the state of the system with respect to the tasks being frozen
 * or not, as reported by the notification, remains unchanged *throughout the
 * duration* of the execution of the callbacks.
 * Hence we need to prevent the freezer from racing with regular CPU hotplug.
 *
 * This synchronization is implemented by mutually excluding regular CPU
 * hotplug and Suspend/Hibernate call paths by hooking onto the Suspend/
 * Hibernate notifications.
 */
static int
cpu_hotplug_pm_callback(struct notifier_block *nb,
			unsigned long action, void *ptr)
{
	switch (action) {

	case PM_SUSPEND_PREPARE:
	case PM_HIBERNATION_PREPARE:
		cpu_hotplug_disable();
		break;

	case PM_POST_SUSPEND:
	case PM_POST_HIBERNATION:
		cpu_hotplug_enable();
		break;

	default:
		return NOTIFY_DONE;
	}

	return NOTIFY_OK;
}


static int __init cpu_hotplug_pm_sync_init(void)
{
	/*
	 * cpu_hotplug_pm_callback has higher priority than x86
	 * bsp_pm_callback which depends on cpu_hotplug_pm_callback
	 * to disable cpu hotplug to avoid cpu hotplug race.
	 */
	pm_notifier(cpu_hotplug_pm_callback, 0);
	return 0;
}
core_initcall(cpu_hotplug_pm_sync_init);

#endif /* CONFIG_PM_SLEEP_SMP */

/**
 * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
 * @cpu: cpu that just started
 *
 * This function calls the cpu_chain notifiers with CPU_STARTING.
 * It must be called by the arch code on the new cpu, before the new cpu
 * enables interrupts and before the "boot" cpu returns from __cpu_up().
 */
void notify_cpu_starting(unsigned int cpu)
{
	unsigned long val = CPU_STARTING;

#ifdef CONFIG_PM_SLEEP_SMP
	if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus))
		val = CPU_STARTING_FROZEN;
#endif /* CONFIG_PM_SLEEP_SMP */
	cpu_notify(val, (void *)(long)cpu);
}

#endif /* CONFIG_SMP */

/*
 * cpu_bit_bitmap[] is a special, "compressed" data structure that
 * represents all NR_CPUS bits binary values of 1<<nr.
 *
 * It is used by cpumask_of() to get a constant address to a CPU
 * mask value that has a single bit set only.
 */

/* cpu_bit_bitmap[0] is empty - so we can back into it */
#define MASK_DECLARE_1(x)	[x+1][0] = (1UL << (x))
#define MASK_DECLARE_2(x)	MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
#define MASK_DECLARE_4(x)	MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
#define MASK_DECLARE_8(x)	MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)

const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {

	MASK_DECLARE_8(0),	MASK_DECLARE_8(8),
	MASK_DECLARE_8(16),	MASK_DECLARE_8(24),
#if BITS_PER_LONG > 32
	MASK_DECLARE_8(32),	MASK_DECLARE_8(40),
	MASK_DECLARE_8(48),	MASK_DECLARE_8(56),
#endif
};
EXPORT_SYMBOL_GPL(cpu_bit_bitmap);

const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
EXPORT_SYMBOL(cpu_all_bits);

#ifdef CONFIG_INIT_ALL_POSSIBLE
static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly
	= CPU_BITS_ALL;
#else
static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly;
#endif
const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits);
EXPORT_SYMBOL(cpu_possible_mask);

static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly;
const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits);
EXPORT_SYMBOL(cpu_online_mask);

static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly;
const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits);
EXPORT_SYMBOL(cpu_present_mask);

static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly;
const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits);
EXPORT_SYMBOL(cpu_active_mask);

void set_cpu_possible(unsigned int cpu, bool possible)
{
	if (possible)
		cpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits));
	else
		cpumask_clear_cpu(cpu, to_cpumask(cpu_possible_bits));
}

void set_cpu_present(unsigned int cpu, bool present)
{
	if (present)
		cpumask_set_cpu(cpu, to_cpumask(cpu_present_bits));
	else
		cpumask_clear_cpu(cpu, to_cpumask(cpu_present_bits));
}

void set_cpu_online(unsigned int cpu, bool online)
{
-	if (online)
+	if (online) {
		cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
+		cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
-	else
+	} else {
		cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
+	}
}

void set_cpu_active(unsigned int cpu, bool active)
{
	if (active)
		cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
	else
		cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits));
}

void init_cpu_present(const struct cpumask *src)
{
	cpumask_copy(to_cpumask(cpu_present_bits), src);
}

void init_cpu_possible(const struct cpumask *src)
{
	cpumask_copy(to_cpumask(cpu_possible_bits), src);
}

void init_cpu_online(const struct cpumask *src)
{
	cpumask_copy(to_cpumask(cpu_online_bits), src);
}
/*
 *  kernel/sched/core.c
 *
 *  Kernel scheduler and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 *
 *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
 *		make semaphores SMP safe
 *  1998-11-19	Implemented schedule_timeout() and related stuff
 *		by Andrea Arcangeli
 *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
 *		hybrid priority-list and round-robin design with
 *		an array-switch method of distributing timeslices
 *		and per-CPU runqueues.  Cleanups and useful suggestions
 *		by Davide Libenzi, preemptible kernel bits by Robert Love.
 *  2003-09-03	Interactivity tuning by Con Kolivas.
 *  2004-04-02	Scheduler domains code by Nick Piggin
 *  2007-04-15  Work begun on replacing all interactivity tuning with a
 *              fair scheduling design by Con Kolivas.
 *  2007-05-05  Load balancing (smp-nice) and other improvements
 *              by Peter Williams
 *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
 *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
 *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
 *              Thomas Gleixner, Mike Kravetz
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/perf_event.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/pid_namespace.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/unistd.h>
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
#include <linux/init_task.h>
#include <linux/binfmts.h>
#include <linux/context_tracking.h>
#include <linux/compiler.h>

#include <asm/switch_to.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
#include <asm/mutex.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif

#include "sched.h"
#include "../workqueue_internal.h"
#include "../smpboot.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
	unsigned long delta;
	ktime_t soft, hard, now;

	for (;;) {
		if (hrtimer_active(period_timer))
			break;

		now = hrtimer_cb_get_time(period_timer);
		hrtimer_forward(period_timer, now, period);

		soft = hrtimer_get_softexpires(period_timer);
		hard = hrtimer_get_expires(period_timer);
		delta = ktime_to_ns(ktime_sub(hard, soft));
		__hrtimer_start_range_ns(period_timer, soft, delta,
					 HRTIMER_MODE_ABS_PINNED, 0);
	}
}

DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

static void update_rq_clock_task(struct rq *rq, s64 delta);

void update_rq_clock(struct rq *rq)
{
	s64 delta;

	if (rq->skip_clock_update > 0)
		return;

	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
	rq->clock += delta;
	update_rq_clock_task(rq, delta);
}

/*
 * Debugging: various feature bits
 */

#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |

const_debug unsigned int sysctl_sched_features =
#include "features.h"
	0;

#undef SCHED_FEAT

#ifdef CONFIG_SCHED_DEBUG
#define SCHED_FEAT(name, enabled)	\
	#name ,

static const char * const sched_feat_names[] = {
#include "features.h"
};

#undef SCHED_FEAT

static int sched_feat_show(struct seq_file *m, void *v)
{
	int i;

	for (i = 0; i < __SCHED_FEAT_NR; i++) {
		if (!(sysctl_sched_features & (1UL << i)))
			seq_puts(m, "NO_");
		seq_printf(m, "%s ", sched_feat_names[i]);
	}
	seq_puts(m, "\n");

	return 0;
}

#ifdef HAVE_JUMP_LABEL

#define jump_label_key__true  STATIC_KEY_INIT_TRUE
#define jump_label_key__false STATIC_KEY_INIT_FALSE

#define SCHED_FEAT(name, enabled)	\
	jump_label_key__##enabled ,

struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
#include "features.h"
};

#undef SCHED_FEAT

static void sched_feat_disable(int i)
{
	if (static_key_enabled(&sched_feat_keys[i]))
		static_key_slow_dec(&sched_feat_keys[i]);
}

static void sched_feat_enable(int i)
{
	if (!static_key_enabled(&sched_feat_keys[i]))
		static_key_slow_inc(&sched_feat_keys[i]);
}
#else
static void sched_feat_disable(int i) { };
static void sched_feat_enable(int i) { };
#endif /* HAVE_JUMP_LABEL */

static int sched_feat_set(char *cmp)
{
	int i;
	int neg = 0;

	if (strncmp(cmp, "NO_", 3) == 0) {
		neg = 1;
		cmp += 3;
	}

	for (i = 0; i < __SCHED_FEAT_NR; i++) {
		if (strcmp(cmp, sched_feat_names[i]) == 0) {
			if (neg) {
				sysctl_sched_features &= ~(1UL << i);
				sched_feat_disable(i);
			} else {
				sysctl_sched_features |= (1UL << i);
				sched_feat_enable(i);
			}
			break;
		}
	}

	return i;
}

static ssize_t
sched_feat_write(struct file *filp, const char __user *ubuf,
		size_t cnt, loff_t *ppos)
{
	char buf[64];
	char *cmp;
	int i;

	if (cnt > 63)
		cnt = 63;

	if (copy_from_user(&buf, ubuf, cnt))
		return -EFAULT;

	buf[cnt] = 0;
	cmp = strstrip(buf);

	i = sched_feat_set(cmp);
	if (i == __SCHED_FEAT_NR)
		return -EINVAL;

	*ppos += cnt;

	return cnt;
}

static int sched_feat_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, sched_feat_show, NULL);
}

static const struct file_operations sched_feat_fops = {
	.open		= sched_feat_open,
	.write		= sched_feat_write,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static __init int sched_init_debug(void)
{
	debugfs_create_file("sched_features", 0644, NULL, NULL,
			&sched_feat_fops);

	return 0;
}
late_initcall(sched_init_debug);
#endif /* CONFIG_SCHED_DEBUG */

272 /* 272 /*
273 * Number of tasks to iterate in a single balance run. 273 * Number of tasks to iterate in a single balance run.
274 * Limited because this is done with IRQs disabled. 274 * Limited because this is done with IRQs disabled.
275 */ 275 */
276 const_debug unsigned int sysctl_sched_nr_migrate = 32; 276 const_debug unsigned int sysctl_sched_nr_migrate = 32;
277 277
278 /* 278 /*
279 * period over which we average the RT time consumption, measured 279 * period over which we average the RT time consumption, measured
280 * in ms. 280 * in ms.
281 * 281 *
282 * default: 1s 282 * default: 1s
283 */ 283 */
284 const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; 284 const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
285 285
286 /* 286 /*
287 * period over which we measure -rt task cpu usage in us. 287 * period over which we measure -rt task cpu usage in us.
288 * default: 1s 288 * default: 1s
289 */ 289 */
290 unsigned int sysctl_sched_rt_period = 1000000; 290 unsigned int sysctl_sched_rt_period = 1000000;
291 291
292 __read_mostly int scheduler_running; 292 __read_mostly int scheduler_running;
293 293
294 /* 294 /*
295 * part of the period that we allow rt tasks to run in us. 295 * part of the period that we allow rt tasks to run in us.
296 * default: 0.95s 296 * default: 0.95s
297 */ 297 */
298 int sysctl_sched_rt_runtime = 950000; 298 int sysctl_sched_rt_runtime = 950000;
299 299
300 /* 300 /*
301 * __task_rq_lock - lock the rq @p resides on. 301 * __task_rq_lock - lock the rq @p resides on.
302 */ 302 */
303 static inline struct rq *__task_rq_lock(struct task_struct *p) 303 static inline struct rq *__task_rq_lock(struct task_struct *p)
304 __acquires(rq->lock) 304 __acquires(rq->lock)
305 { 305 {
306 struct rq *rq; 306 struct rq *rq;
307 307
308 lockdep_assert_held(&p->pi_lock); 308 lockdep_assert_held(&p->pi_lock);
309 309
310 for (;;) { 310 for (;;) {
311 rq = task_rq(p); 311 rq = task_rq(p);
312 raw_spin_lock(&rq->lock); 312 raw_spin_lock(&rq->lock);
313 if (likely(rq == task_rq(p))) 313 if (likely(rq == task_rq(p)))
314 return rq; 314 return rq;
315 raw_spin_unlock(&rq->lock); 315 raw_spin_unlock(&rq->lock);
316 } 316 }
317 } 317 }
318 318
319 /* 319 /*
320 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. 320 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
321 */ 321 */
322 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 322 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
323 __acquires(p->pi_lock) 323 __acquires(p->pi_lock)
324 __acquires(rq->lock) 324 __acquires(rq->lock)
325 { 325 {
326 struct rq *rq; 326 struct rq *rq;
327 327
328 for (;;) { 328 for (;;) {
329 raw_spin_lock_irqsave(&p->pi_lock, *flags); 329 raw_spin_lock_irqsave(&p->pi_lock, *flags);
330 rq = task_rq(p); 330 rq = task_rq(p);
331 raw_spin_lock(&rq->lock); 331 raw_spin_lock(&rq->lock);
332 if (likely(rq == task_rq(p))) 332 if (likely(rq == task_rq(p)))
333 return rq; 333 return rq;
334 raw_spin_unlock(&rq->lock); 334 raw_spin_unlock(&rq->lock);
335 raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 335 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
336 } 336 }
337 } 337 }
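The two lock helpers above share a lock-and-revalidate idiom: the runqueue pointer read without the lock may change before the lock is acquired, so it is re-read under the lock and the acquisition retried if the task moved. A minimal sketch of the same idiom with hypothetical names (not kernel API):

/* Hypothetical sketch of the lock-and-revalidate retry idiom. */
struct home { spinlock_t lock; };
struct item { struct home *home; };

static struct home *lock_item_home(struct item *it)
{
	struct home *h;

	for (;;) {
		h = it->home;			/* unlocked snapshot */
		spin_lock(&h->lock);
		if (likely(h == it->home))
			return h;		/* still the right lock */
		spin_unlock(&h->lock);		/* it moved: retry */
	}
}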
338 338
339 static void __task_rq_unlock(struct rq *rq) 339 static void __task_rq_unlock(struct rq *rq)
340 __releases(rq->lock) 340 __releases(rq->lock)
341 { 341 {
342 raw_spin_unlock(&rq->lock); 342 raw_spin_unlock(&rq->lock);
343 } 343 }
344 344
345 static inline void 345 static inline void
346 task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) 346 task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
347 __releases(rq->lock) 347 __releases(rq->lock)
348 __releases(p->pi_lock) 348 __releases(p->pi_lock)
349 { 349 {
350 raw_spin_unlock(&rq->lock); 350 raw_spin_unlock(&rq->lock);
351 raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 351 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
352 } 352 }
353 353
354 /* 354 /*
355 * this_rq_lock - lock this runqueue and disable interrupts. 355 * this_rq_lock - lock this runqueue and disable interrupts.
356 */ 356 */
357 static struct rq *this_rq_lock(void) 357 static struct rq *this_rq_lock(void)
358 __acquires(rq->lock) 358 __acquires(rq->lock)
359 { 359 {
360 struct rq *rq; 360 struct rq *rq;
361 361
362 local_irq_disable(); 362 local_irq_disable();
363 rq = this_rq(); 363 rq = this_rq();
364 raw_spin_lock(&rq->lock); 364 raw_spin_lock(&rq->lock);
365 365
366 return rq; 366 return rq;
367 } 367 }
368 368
369 #ifdef CONFIG_SCHED_HRTICK 369 #ifdef CONFIG_SCHED_HRTICK
370 /* 370 /*
371 * Use HR-timers to deliver accurate preemption points. 371 * Use HR-timers to deliver accurate preemption points.
372 */ 372 */
373 373
374 static void hrtick_clear(struct rq *rq) 374 static void hrtick_clear(struct rq *rq)
375 { 375 {
376 if (hrtimer_active(&rq->hrtick_timer)) 376 if (hrtimer_active(&rq->hrtick_timer))
377 hrtimer_cancel(&rq->hrtick_timer); 377 hrtimer_cancel(&rq->hrtick_timer);
378 } 378 }
379 379
380 /* 380 /*
381 * High-resolution timer tick. 381 * High-resolution timer tick.
382 * Runs from hardirq context with interrupts disabled. 382 * Runs from hardirq context with interrupts disabled.
383 */ 383 */
384 static enum hrtimer_restart hrtick(struct hrtimer *timer) 384 static enum hrtimer_restart hrtick(struct hrtimer *timer)
385 { 385 {
386 struct rq *rq = container_of(timer, struct rq, hrtick_timer); 386 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
387 387
388 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 388 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
389 389
390 raw_spin_lock(&rq->lock); 390 raw_spin_lock(&rq->lock);
391 update_rq_clock(rq); 391 update_rq_clock(rq);
392 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 392 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
393 raw_spin_unlock(&rq->lock); 393 raw_spin_unlock(&rq->lock);
394 394
395 return HRTIMER_NORESTART; 395 return HRTIMER_NORESTART;
396 } 396 }
397 397
398 #ifdef CONFIG_SMP 398 #ifdef CONFIG_SMP
399 399
400 static int __hrtick_restart(struct rq *rq) 400 static int __hrtick_restart(struct rq *rq)
401 { 401 {
402 struct hrtimer *timer = &rq->hrtick_timer; 402 struct hrtimer *timer = &rq->hrtick_timer;
403 ktime_t time = hrtimer_get_softexpires(timer); 403 ktime_t time = hrtimer_get_softexpires(timer);
404 404
405 return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0); 405 return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
406 } 406 }
407 407
408 /* 408 /*
409 * called from hardirq (IPI) context 409 * called from hardirq (IPI) context
410 */ 410 */
411 static void __hrtick_start(void *arg) 411 static void __hrtick_start(void *arg)
412 { 412 {
413 struct rq *rq = arg; 413 struct rq *rq = arg;
414 414
415 raw_spin_lock(&rq->lock); 415 raw_spin_lock(&rq->lock);
416 __hrtick_restart(rq); 416 __hrtick_restart(rq);
417 rq->hrtick_csd_pending = 0; 417 rq->hrtick_csd_pending = 0;
418 raw_spin_unlock(&rq->lock); 418 raw_spin_unlock(&rq->lock);
419 } 419 }
420 420
421 /* 421 /*
422 * Called to set the hrtick timer state. 422 * Called to set the hrtick timer state.
423 * 423 *
424 * called with rq->lock held and irqs disabled 424 * called with rq->lock held and irqs disabled
425 */ 425 */
426 void hrtick_start(struct rq *rq, u64 delay) 426 void hrtick_start(struct rq *rq, u64 delay)
427 { 427 {
428 struct hrtimer *timer = &rq->hrtick_timer; 428 struct hrtimer *timer = &rq->hrtick_timer;
429 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 429 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
430 430
431 hrtimer_set_expires(timer, time); 431 hrtimer_set_expires(timer, time);
432 432
433 if (rq == this_rq()) { 433 if (rq == this_rq()) {
434 __hrtick_restart(rq); 434 __hrtick_restart(rq);
435 } else if (!rq->hrtick_csd_pending) { 435 } else if (!rq->hrtick_csd_pending) {
436 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); 436 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
437 rq->hrtick_csd_pending = 1; 437 rq->hrtick_csd_pending = 1;
438 } 438 }
439 } 439 }
440 440
441 static int 441 static int
442 hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) 442 hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
443 { 443 {
444 int cpu = (int)(long)hcpu; 444 int cpu = (int)(long)hcpu;
445 445
446 switch (action) { 446 switch (action) {
447 case CPU_UP_CANCELED: 447 case CPU_UP_CANCELED:
448 case CPU_UP_CANCELED_FROZEN: 448 case CPU_UP_CANCELED_FROZEN:
449 case CPU_DOWN_PREPARE: 449 case CPU_DOWN_PREPARE:
450 case CPU_DOWN_PREPARE_FROZEN: 450 case CPU_DOWN_PREPARE_FROZEN:
451 case CPU_DEAD: 451 case CPU_DEAD:
452 case CPU_DEAD_FROZEN: 452 case CPU_DEAD_FROZEN:
453 hrtick_clear(cpu_rq(cpu)); 453 hrtick_clear(cpu_rq(cpu));
454 return NOTIFY_OK; 454 return NOTIFY_OK;
455 } 455 }
456 456
457 return NOTIFY_DONE; 457 return NOTIFY_DONE;
458 } 458 }
459 459
460 static __init void init_hrtick(void) 460 static __init void init_hrtick(void)
461 { 461 {
462 hotcpu_notifier(hotplug_hrtick, 0); 462 hotcpu_notifier(hotplug_hrtick, 0);
463 } 463 }
464 #else 464 #else
465 /* 465 /*
466 * Called to set the hrtick timer state. 466 * Called to set the hrtick timer state.
467 * 467 *
468 * called with rq->lock held and irqs disabled 468 * called with rq->lock held and irqs disabled
469 */ 469 */
470 void hrtick_start(struct rq *rq, u64 delay) 470 void hrtick_start(struct rq *rq, u64 delay)
471 { 471 {
472 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 472 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
473 HRTIMER_MODE_REL_PINNED, 0); 473 HRTIMER_MODE_REL_PINNED, 0);
474 } 474 }
475 475
476 static inline void init_hrtick(void) 476 static inline void init_hrtick(void)
477 { 477 {
478 } 478 }
479 #endif /* CONFIG_SMP */ 479 #endif /* CONFIG_SMP */
480 480
481 static void init_rq_hrtick(struct rq *rq) 481 static void init_rq_hrtick(struct rq *rq)
482 { 482 {
483 #ifdef CONFIG_SMP 483 #ifdef CONFIG_SMP
484 rq->hrtick_csd_pending = 0; 484 rq->hrtick_csd_pending = 0;
485 485
486 rq->hrtick_csd.flags = 0; 486 rq->hrtick_csd.flags = 0;
487 rq->hrtick_csd.func = __hrtick_start; 487 rq->hrtick_csd.func = __hrtick_start;
488 rq->hrtick_csd.info = rq; 488 rq->hrtick_csd.info = rq;
489 #endif 489 #endif
490 490
491 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 491 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
492 rq->hrtick_timer.function = hrtick; 492 rq->hrtick_timer.function = hrtick;
493 } 493 }
494 #else /* CONFIG_SCHED_HRTICK */ 494 #else /* CONFIG_SCHED_HRTICK */
495 static inline void hrtick_clear(struct rq *rq) 495 static inline void hrtick_clear(struct rq *rq)
496 { 496 {
497 } 497 }
498 498
499 static inline void init_rq_hrtick(struct rq *rq) 499 static inline void init_rq_hrtick(struct rq *rq)
500 { 500 {
501 } 501 }
502 502
503 static inline void init_hrtick(void) 503 static inline void init_hrtick(void)
504 { 504 {
505 } 505 }
506 #endif /* CONFIG_SCHED_HRTICK */ 506 #endif /* CONFIG_SCHED_HRTICK */
507 507
508 /* 508 /*
509 * resched_task - mark a task 'to be rescheduled now'. 509 * resched_task - mark a task 'to be rescheduled now'.
510 * 510 *
511 * On UP this means the setting of the need_resched flag, on SMP it 511 * On UP this means the setting of the need_resched flag, on SMP it
512 * might also involve a cross-CPU call to trigger the scheduler on 512 * might also involve a cross-CPU call to trigger the scheduler on
513 * the target CPU. 513 * the target CPU.
514 */ 514 */
515 void resched_task(struct task_struct *p) 515 void resched_task(struct task_struct *p)
516 { 516 {
517 int cpu; 517 int cpu;
518 518
519 lockdep_assert_held(&task_rq(p)->lock); 519 lockdep_assert_held(&task_rq(p)->lock);
520 520
521 if (test_tsk_need_resched(p)) 521 if (test_tsk_need_resched(p))
522 return; 522 return;
523 523
524 set_tsk_need_resched(p); 524 set_tsk_need_resched(p);
525 525
526 cpu = task_cpu(p); 526 cpu = task_cpu(p);
527 if (cpu == smp_processor_id()) { 527 if (cpu == smp_processor_id()) {
528 set_preempt_need_resched(); 528 set_preempt_need_resched();
529 return; 529 return;
530 } 530 }
531 531
532 /* NEED_RESCHED must be visible before we test polling */ 532 /* NEED_RESCHED must be visible before we test polling */
533 smp_mb(); 533 smp_mb();
534 if (!tsk_is_polling(p)) 534 if (!tsk_is_polling(p))
535 smp_send_reschedule(cpu); 535 smp_send_reschedule(cpu);
536 } 536 }
537 537
538 void resched_cpu(int cpu) 538 void resched_cpu(int cpu)
539 { 539 {
540 struct rq *rq = cpu_rq(cpu); 540 struct rq *rq = cpu_rq(cpu);
541 unsigned long flags; 541 unsigned long flags;
542 542
543 if (!raw_spin_trylock_irqsave(&rq->lock, flags)) 543 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
544 return; 544 return;
545 resched_task(cpu_curr(cpu)); 545 resched_task(cpu_curr(cpu));
546 raw_spin_unlock_irqrestore(&rq->lock, flags); 546 raw_spin_unlock_irqrestore(&rq->lock, flags);
547 } 547 }
548 548
549 #ifdef CONFIG_SMP 549 #ifdef CONFIG_SMP
550 #ifdef CONFIG_NO_HZ_COMMON 550 #ifdef CONFIG_NO_HZ_COMMON
551 /* 551 /*
552 * In the semi idle case, use the nearest busy cpu for migrating timers 552 * In the semi idle case, use the nearest busy cpu for migrating timers
553 * from an idle cpu. This is good for power-savings. 553 * from an idle cpu. This is good for power-savings.
554 * 554 *
555 * We don't do a similar optimization for a completely idle system, as 555 * We don't do a similar optimization for a completely idle system, as
556 * selecting an idle cpu will add more delays to the timers than intended 556 * selecting an idle cpu will add more delays to the timers than intended
557 * (as that cpu's timer base may not be up to date wrt jiffies etc). 557 * (as that cpu's timer base may not be up to date wrt jiffies etc).
558 */ 558 */
559 int get_nohz_timer_target(int pinned) 559 int get_nohz_timer_target(int pinned)
560 { 560 {
561 int cpu = smp_processor_id(); 561 int cpu = smp_processor_id();
562 int i; 562 int i;
563 struct sched_domain *sd; 563 struct sched_domain *sd;
564 564
565 if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu)) 565 if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
566 return cpu; 566 return cpu;
567 567
568 rcu_read_lock(); 568 rcu_read_lock();
569 for_each_domain(cpu, sd) { 569 for_each_domain(cpu, sd) {
570 for_each_cpu(i, sched_domain_span(sd)) { 570 for_each_cpu(i, sched_domain_span(sd)) {
571 if (!idle_cpu(i)) { 571 if (!idle_cpu(i)) {
572 cpu = i; 572 cpu = i;
573 goto unlock; 573 goto unlock;
574 } 574 }
575 } 575 }
576 } 576 }
577 unlock: 577 unlock:
578 rcu_read_unlock(); 578 rcu_read_unlock();
579 return cpu; 579 return cpu;
580 } 580 }
581 /* 581 /*
582 * When add_timer_on() enqueues a timer into the timer wheel of an 582 * When add_timer_on() enqueues a timer into the timer wheel of an
583 * idle CPU then this timer might expire before the next timer event 583 * idle CPU then this timer might expire before the next timer event
584 * which is scheduled to wake up that CPU. In case of a completely 584 * which is scheduled to wake up that CPU. In case of a completely
585 * idle system the next event might even be infinite time into the 585 * idle system the next event might even be infinite time into the
586 * future. wake_up_idle_cpu() ensures that the CPU is woken up and 586 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
587 * leaves the inner idle loop so the newly added timer is taken into 587 * leaves the inner idle loop so the newly added timer is taken into
588 * account when the CPU goes back to idle and evaluates the timer 588 * account when the CPU goes back to idle and evaluates the timer
589 * wheel for the next timer event. 589 * wheel for the next timer event.
590 */ 590 */
591 static void wake_up_idle_cpu(int cpu) 591 static void wake_up_idle_cpu(int cpu)
592 { 592 {
593 struct rq *rq = cpu_rq(cpu); 593 struct rq *rq = cpu_rq(cpu);
594 594
595 if (cpu == smp_processor_id()) 595 if (cpu == smp_processor_id())
596 return; 596 return;
597 597
598 /* 598 /*
599 * This is safe, as this function is called with the timer 599 * This is safe, as this function is called with the timer
600 * wheel base lock of (cpu) held. When the CPU is on the way 600 * wheel base lock of (cpu) held. When the CPU is on the way
601 * to idle and has not yet set rq->curr to idle then it will 601 * to idle and has not yet set rq->curr to idle then it will
602 * be serialized on the timer wheel base lock and take the new 602 * be serialized on the timer wheel base lock and take the new
603 * timer into account automatically. 603 * timer into account automatically.
604 */ 604 */
605 if (rq->curr != rq->idle) 605 if (rq->curr != rq->idle)
606 return; 606 return;
607 607
608 /* 608 /*
609 * We can set TIF_RESCHED on the idle task of the other CPU 609 * We can set TIF_RESCHED on the idle task of the other CPU
610 * lockless. The worst case is that the other CPU runs the 610 * lockless. The worst case is that the other CPU runs the
611 * idle task through an additional NOOP schedule() 611 * idle task through an additional NOOP schedule()
612 */ 612 */
613 set_tsk_need_resched(rq->idle); 613 set_tsk_need_resched(rq->idle);
614 614
615 /* NEED_RESCHED must be visible before we test polling */ 615 /* NEED_RESCHED must be visible before we test polling */
616 smp_mb(); 616 smp_mb();
617 if (!tsk_is_polling(rq->idle)) 617 if (!tsk_is_polling(rq->idle))
618 smp_send_reschedule(cpu); 618 smp_send_reschedule(cpu);
619 } 619 }
620 620
621 static bool wake_up_full_nohz_cpu(int cpu) 621 static bool wake_up_full_nohz_cpu(int cpu)
622 { 622 {
623 if (tick_nohz_full_cpu(cpu)) { 623 if (tick_nohz_full_cpu(cpu)) {
624 if (cpu != smp_processor_id() || 624 if (cpu != smp_processor_id() ||
625 tick_nohz_tick_stopped()) 625 tick_nohz_tick_stopped())
626 smp_send_reschedule(cpu); 626 smp_send_reschedule(cpu);
627 return true; 627 return true;
628 } 628 }
629 629
630 return false; 630 return false;
631 } 631 }
632 632
633 void wake_up_nohz_cpu(int cpu) 633 void wake_up_nohz_cpu(int cpu)
634 { 634 {
635 if (!wake_up_full_nohz_cpu(cpu)) 635 if (!wake_up_full_nohz_cpu(cpu))
636 wake_up_idle_cpu(cpu); 636 wake_up_idle_cpu(cpu);
637 } 637 }
638 638
639 static inline bool got_nohz_idle_kick(void) 639 static inline bool got_nohz_idle_kick(void)
640 { 640 {
641 int cpu = smp_processor_id(); 641 int cpu = smp_processor_id();
642 642
643 if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) 643 if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
644 return false; 644 return false;
645 645
646 if (idle_cpu(cpu) && !need_resched()) 646 if (idle_cpu(cpu) && !need_resched())
647 return true; 647 return true;
648 648
649 /* 649 /*
650 * We can't run Idle Load Balance on this CPU at this time, so we 650 * We can't run Idle Load Balance on this CPU at this time, so we
651 * cancel it and clear NOHZ_BALANCE_KICK 651 * cancel it and clear NOHZ_BALANCE_KICK
652 */ 652 */
653 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); 653 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
654 return false; 654 return false;
655 } 655 }
656 656
657 #else /* CONFIG_NO_HZ_COMMON */ 657 #else /* CONFIG_NO_HZ_COMMON */
658 658
659 static inline bool got_nohz_idle_kick(void) 659 static inline bool got_nohz_idle_kick(void)
660 { 660 {
661 return false; 661 return false;
662 } 662 }
663 663
664 #endif /* CONFIG_NO_HZ_COMMON */ 664 #endif /* CONFIG_NO_HZ_COMMON */
665 665
666 #ifdef CONFIG_NO_HZ_FULL 666 #ifdef CONFIG_NO_HZ_FULL
667 bool sched_can_stop_tick(void) 667 bool sched_can_stop_tick(void)
668 { 668 {
669 struct rq *rq; 669 struct rq *rq;
670 670
671 rq = this_rq(); 671 rq = this_rq();
672 672
673 /* Make sure rq->nr_running update is visible after the IPI */ 673 /* Make sure rq->nr_running update is visible after the IPI */
674 smp_rmb(); 674 smp_rmb();
675 675
676 /* More than one running task needs preemption */ 676 /* More than one running task needs preemption */
677 if (rq->nr_running > 1) 677 if (rq->nr_running > 1)
678 return false; 678 return false;
679 679
680 return true; 680 return true;
681 } 681 }
682 #endif /* CONFIG_NO_HZ_FULL */ 682 #endif /* CONFIG_NO_HZ_FULL */
683 683
684 void sched_avg_update(struct rq *rq) 684 void sched_avg_update(struct rq *rq)
685 { 685 {
686 s64 period = sched_avg_period(); 686 s64 period = sched_avg_period();
687 687
688 while ((s64)(rq_clock(rq) - rq->age_stamp) > period) { 688 while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
689 /* 689 /*
690 * Inline assembly required to prevent the compiler 690 * Inline assembly required to prevent the compiler
691 * optimising this loop into a divmod call. 691 * optimising this loop into a divmod call.
692 * See __iter_div_u64_rem() for another example of this. 692 * See __iter_div_u64_rem() for another example of this.
693 */ 693 */
694 asm("" : "+rm" (rq->age_stamp)); 694 asm("" : "+rm" (rq->age_stamp));
695 rq->age_stamp += period; 695 rq->age_stamp += period;
696 rq->rt_avg /= 2; 696 rq->rt_avg /= 2;
697 } 697 }
698 } 698 }
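The loop above halves rq->rt_avg once per elapsed sched_avg_period(), so RT time observed N periods ago contributes with weight 2^-N (a geometric decay). A small numeric sketch of that halving, for illustration only:

/* Illustration: repeated halving, as applied to rq->rt_avg above. */
static u64 halve_per_period(u64 value, unsigned int periods_elapsed)
{
	while (periods_elapsed--)
		value /= 2;
	return value;
}
/* halve_per_period(800, 3) == 100: three periods shrink 800 to 100. */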
699 699
700 #endif /* CONFIG_SMP */ 700 #endif /* CONFIG_SMP */
701 701
702 #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 702 #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
703 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) 703 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
704 /* 704 /*
705 * Iterate task_group tree rooted at *from, calling @down when first entering a 705 * Iterate task_group tree rooted at *from, calling @down when first entering a
706 * node and @up when leaving it for the final time. 706 * node and @up when leaving it for the final time.
707 * 707 *
708 * Caller must hold rcu_lock or sufficient equivalent. 708 * Caller must hold rcu_lock or sufficient equivalent.
709 */ 709 */
710 int walk_tg_tree_from(struct task_group *from, 710 int walk_tg_tree_from(struct task_group *from,
711 tg_visitor down, tg_visitor up, void *data) 711 tg_visitor down, tg_visitor up, void *data)
712 { 712 {
713 struct task_group *parent, *child; 713 struct task_group *parent, *child;
714 int ret; 714 int ret;
715 715
716 parent = from; 716 parent = from;
717 717
718 down: 718 down:
719 ret = (*down)(parent, data); 719 ret = (*down)(parent, data);
720 if (ret) 720 if (ret)
721 goto out; 721 goto out;
722 list_for_each_entry_rcu(child, &parent->children, siblings) { 722 list_for_each_entry_rcu(child, &parent->children, siblings) {
723 parent = child; 723 parent = child;
724 goto down; 724 goto down;
725 725
726 up: 726 up:
727 continue; 727 continue;
728 } 728 }
729 ret = (*up)(parent, data); 729 ret = (*up)(parent, data);
730 if (ret || parent == from) 730 if (ret || parent == from)
731 goto out; 731 goto out;
732 732
733 child = parent; 733 child = parent;
734 parent = parent->parent; 734 parent = parent->parent;
735 if (parent) 735 if (parent)
736 goto up; 736 goto up;
737 out: 737 out:
738 return ret; 738 return ret;
739 } 739 }
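walk_tg_tree_from() is an iterative depth-first walk written with gotos so arbitrarily deep task_group trees do not consume kernel stack. A recursive sketch of the same contract (call @down on entry, @up on final exit, stop early on a non-zero return), shown only for clarity:

/* Recursive sketch of the walk_tg_tree_from() contract (illustration only). */
static int walk_tg_tree_sketch(struct task_group *tg,
			       tg_visitor down, tg_visitor up, void *data)
{
	struct task_group *child;
	int ret;

	ret = (*down)(tg, data);		/* first time we enter @tg */
	if (ret)
		return ret;

	list_for_each_entry_rcu(child, &tg->children, siblings) {
		ret = walk_tg_tree_sketch(child, down, up, data);
		if (ret)
			return ret;
	}

	return (*up)(tg, data);			/* leaving @tg for the last time */
}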
740 740
741 int tg_nop(struct task_group *tg, void *data) 741 int tg_nop(struct task_group *tg, void *data)
742 { 742 {
743 return 0; 743 return 0;
744 } 744 }
745 #endif 745 #endif
746 746
747 static void set_load_weight(struct task_struct *p) 747 static void set_load_weight(struct task_struct *p)
748 { 748 {
749 int prio = p->static_prio - MAX_RT_PRIO; 749 int prio = p->static_prio - MAX_RT_PRIO;
750 struct load_weight *load = &p->se.load; 750 struct load_weight *load = &p->se.load;
751 751
752 /* 752 /*
753 * SCHED_IDLE tasks get minimal weight: 753 * SCHED_IDLE tasks get minimal weight:
754 */ 754 */
755 if (p->policy == SCHED_IDLE) { 755 if (p->policy == SCHED_IDLE) {
756 load->weight = scale_load(WEIGHT_IDLEPRIO); 756 load->weight = scale_load(WEIGHT_IDLEPRIO);
757 load->inv_weight = WMULT_IDLEPRIO; 757 load->inv_weight = WMULT_IDLEPRIO;
758 return; 758 return;
759 } 759 }
760 760
761 load->weight = scale_load(prio_to_weight[prio]); 761 load->weight = scale_load(prio_to_weight[prio]);
762 load->inv_weight = prio_to_wmult[prio]; 762 load->inv_weight = prio_to_wmult[prio];
763 } 763 }
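For scale (values quoted from the prio_to_weight[] table used above; treat them as indicative): a nice-0 task has weight 1024 and each nice level changes the weight by roughly 1.25x, so a nice-0 task sharing a CPU with a nice-5 task (weight 335) gets about 75% of it. A tiny sketch of that share calculation:

/* Sketch: CPU share (in permille) of weight w_a competing against w_b. */
static unsigned int cpu_share_permille(unsigned long w_a, unsigned long w_b)
{
	return (unsigned int)(w_a * 1000 / (w_a + w_b));
}
/* cpu_share_permille(1024, 335) == 753, i.e. ~75% for the nice-0 task. */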
764 764
765 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 765 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
766 { 766 {
767 update_rq_clock(rq); 767 update_rq_clock(rq);
768 sched_info_queued(rq, p); 768 sched_info_queued(rq, p);
769 p->sched_class->enqueue_task(rq, p, flags); 769 p->sched_class->enqueue_task(rq, p, flags);
770 } 770 }
771 771
772 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 772 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
773 { 773 {
774 update_rq_clock(rq); 774 update_rq_clock(rq);
775 sched_info_dequeued(rq, p); 775 sched_info_dequeued(rq, p);
776 p->sched_class->dequeue_task(rq, p, flags); 776 p->sched_class->dequeue_task(rq, p, flags);
777 } 777 }
778 778
779 void activate_task(struct rq *rq, struct task_struct *p, int flags) 779 void activate_task(struct rq *rq, struct task_struct *p, int flags)
780 { 780 {
781 if (task_contributes_to_load(p)) 781 if (task_contributes_to_load(p))
782 rq->nr_uninterruptible--; 782 rq->nr_uninterruptible--;
783 783
784 enqueue_task(rq, p, flags); 784 enqueue_task(rq, p, flags);
785 } 785 }
786 786
787 void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 787 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
788 { 788 {
789 if (task_contributes_to_load(p)) 789 if (task_contributes_to_load(p))
790 rq->nr_uninterruptible++; 790 rq->nr_uninterruptible++;
791 791
792 dequeue_task(rq, p, flags); 792 dequeue_task(rq, p, flags);
793 } 793 }
794 794
795 static void update_rq_clock_task(struct rq *rq, s64 delta) 795 static void update_rq_clock_task(struct rq *rq, s64 delta)
796 { 796 {
797 /* 797 /*
798 * In theory, the compiler should just see 0 here, and optimize out the call 798 * In theory, the compiler should just see 0 here, and optimize out the call
799 * to sched_rt_avg_update. But I don't trust it... 799 * to sched_rt_avg_update. But I don't trust it...
800 */ 800 */
801 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) 801 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
802 s64 steal = 0, irq_delta = 0; 802 s64 steal = 0, irq_delta = 0;
803 #endif 803 #endif
804 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 804 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
805 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; 805 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
806 806
807 /* 807 /*
808 * Since irq_time is only updated on {soft,}irq_exit, we might run into 808 * Since irq_time is only updated on {soft,}irq_exit, we might run into
809 * this case when a previous update_rq_clock() happened inside a 809 * this case when a previous update_rq_clock() happened inside a
810 * {soft,}irq region. 810 * {soft,}irq region.
811 * 811 *
812 * When this happens, we stop ->clock_task and only update the 812 * When this happens, we stop ->clock_task and only update the
813 * prev_irq_time stamp to account for the part that fit, so that a next 813 * prev_irq_time stamp to account for the part that fit, so that a next
814 * update will consume the rest. This ensures ->clock_task is 814 * update will consume the rest. This ensures ->clock_task is
815 * monotonic. 815 * monotonic.
816 * 816 *
817 * It does however cause some slight misattribution of {soft,}irq 817 * It does however cause some slight misattribution of {soft,}irq
818 * time, a more accurate solution would be to update the irq_time using 818 * time, a more accurate solution would be to update the irq_time using
819 * the current rq->clock timestamp, except that would require using 819 * the current rq->clock timestamp, except that would require using
820 * atomic ops. 820 * atomic ops.
821 */ 821 */
822 if (irq_delta > delta) 822 if (irq_delta > delta)
823 irq_delta = delta; 823 irq_delta = delta;
824 824
825 rq->prev_irq_time += irq_delta; 825 rq->prev_irq_time += irq_delta;
826 delta -= irq_delta; 826 delta -= irq_delta;
827 #endif 827 #endif
828 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING 828 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
829 if (static_key_false((&paravirt_steal_rq_enabled))) { 829 if (static_key_false((&paravirt_steal_rq_enabled))) {
830 steal = paravirt_steal_clock(cpu_of(rq)); 830 steal = paravirt_steal_clock(cpu_of(rq));
831 steal -= rq->prev_steal_time_rq; 831 steal -= rq->prev_steal_time_rq;
832 832
833 if (unlikely(steal > delta)) 833 if (unlikely(steal > delta))
834 steal = delta; 834 steal = delta;
835 835
836 rq->prev_steal_time_rq += steal; 836 rq->prev_steal_time_rq += steal;
837 delta -= steal; 837 delta -= steal;
838 } 838 }
839 #endif 839 #endif
840 840
841 rq->clock_task += delta; 841 rq->clock_task += delta;
842 842
843 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) 843 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
844 if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) 844 if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
845 sched_rt_avg_update(rq, irq_delta + steal); 845 sched_rt_avg_update(rq, irq_delta + steal);
846 #endif 846 #endif
847 } 847 }
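A numeric reading of the clamping above, for illustration: if delta is 100us of wall progress but 150us of irq time was accounted since the last update, irq_delta is clamped to 100us, clock_task does not advance this round, and the 50us remainder is consumed by the next update because prev_irq_time only advanced by the clamped amount. A minimal sketch of that step:

/* Sketch of the clamp: accounted irq/steal time never exceeds the wall
 * delta, keeping clock_task monotonic; the excess carries over naturally. */
static s64 clamp_nontask_time(s64 delta, s64 nontask)
{
	if (nontask > delta)
		nontask = delta;	/* e.g. delta=100us, nontask=150us -> 100us */
	return delta - nontask;		/* amount clock_task advances by */
}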
848 848
849 void sched_set_stop_task(int cpu, struct task_struct *stop) 849 void sched_set_stop_task(int cpu, struct task_struct *stop)
850 { 850 {
851 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 851 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
852 struct task_struct *old_stop = cpu_rq(cpu)->stop; 852 struct task_struct *old_stop = cpu_rq(cpu)->stop;
853 853
854 if (stop) { 854 if (stop) {
855 /* 855 /*
856 * Make it appear like a SCHED_FIFO task, it's something 856 * Make it appear like a SCHED_FIFO task, it's something
857 * userspace knows about and won't get confused by. 857 * userspace knows about and won't get confused by.
858 * 858 *
859 * Also, it will make PI more or less work without too 859 * Also, it will make PI more or less work without too
860 * much confusion -- but then, stop work should not 860 * much confusion -- but then, stop work should not
861 * rely on PI working anyway. 861 * rely on PI working anyway.
862 */ 862 */
863 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param); 863 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
864 864
865 stop->sched_class = &stop_sched_class; 865 stop->sched_class = &stop_sched_class;
866 } 866 }
867 867
868 cpu_rq(cpu)->stop = stop; 868 cpu_rq(cpu)->stop = stop;
869 869
870 if (old_stop) { 870 if (old_stop) {
871 /* 871 /*
872 * Reset it back to a normal scheduling class so that 872 * Reset it back to a normal scheduling class so that
873 * it can die in pieces. 873 * it can die in pieces.
874 */ 874 */
875 old_stop->sched_class = &rt_sched_class; 875 old_stop->sched_class = &rt_sched_class;
876 } 876 }
877 } 877 }
878 878
879 /* 879 /*
880 * __normal_prio - return the priority that is based on the static prio 880 * __normal_prio - return the priority that is based on the static prio
881 */ 881 */
882 static inline int __normal_prio(struct task_struct *p) 882 static inline int __normal_prio(struct task_struct *p)
883 { 883 {
884 return p->static_prio; 884 return p->static_prio;
885 } 885 }
886 886
887 /* 887 /*
888 * Calculate the expected normal priority: i.e. priority 888 * Calculate the expected normal priority: i.e. priority
889 * without taking RT-inheritance into account. Might be 889 * without taking RT-inheritance into account. Might be
890 * boosted by interactivity modifiers. Changes upon fork, 890 * boosted by interactivity modifiers. Changes upon fork,
891 * setprio syscalls, and whenever the interactivity 891 * setprio syscalls, and whenever the interactivity
892 * estimator recalculates. 892 * estimator recalculates.
893 */ 893 */
894 static inline int normal_prio(struct task_struct *p) 894 static inline int normal_prio(struct task_struct *p)
895 { 895 {
896 int prio; 896 int prio;
897 897
898 if (task_has_dl_policy(p)) 898 if (task_has_dl_policy(p))
899 prio = MAX_DL_PRIO-1; 899 prio = MAX_DL_PRIO-1;
900 else if (task_has_rt_policy(p)) 900 else if (task_has_rt_policy(p))
901 prio = MAX_RT_PRIO-1 - p->rt_priority; 901 prio = MAX_RT_PRIO-1 - p->rt_priority;
902 else 902 else
903 prio = __normal_prio(p); 903 prio = __normal_prio(p);
904 return prio; 904 return prio;
905 } 905 }
906 906
907 /* 907 /*
908 * Calculate the current priority, i.e. the priority 908 * Calculate the current priority, i.e. the priority
909 * taken into account by the scheduler. This value might 909 * taken into account by the scheduler. This value might
910 * be boosted by RT tasks, or might be boosted by 910 * be boosted by RT tasks, or might be boosted by
911 * interactivity modifiers. Will be RT if the task got 911 * interactivity modifiers. Will be RT if the task got
912 * RT-boosted. If not then it returns p->normal_prio. 912 * RT-boosted. If not then it returns p->normal_prio.
913 */ 913 */
914 static int effective_prio(struct task_struct *p) 914 static int effective_prio(struct task_struct *p)
915 { 915 {
916 p->normal_prio = normal_prio(p); 916 p->normal_prio = normal_prio(p);
917 /* 917 /*
918 * If we are RT tasks or we were boosted to RT priority, 918 * If we are RT tasks or we were boosted to RT priority,
919 * keep the priority unchanged. Otherwise, update priority 919 * keep the priority unchanged. Otherwise, update priority
920 * to the normal priority: 920 * to the normal priority:
921 */ 921 */
922 if (!rt_prio(p->prio)) 922 if (!rt_prio(p->prio))
923 return p->normal_prio; 923 return p->normal_prio;
924 return p->prio; 924 return p->prio;
925 } 925 }
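A worked example of the mapping above, under the conventional constants (assumed here; verify against the headers): MAX_DL_PRIO == 0, MAX_RT_PRIO == 100, and a nice-0 static_prio of 120. Lower numbers always win:

/* Illustration only: the three normal_prio() cases with explicit numbers. */
static int normal_prio_examples(void)
{
	int dl_prio   = 0 - 1;		/* SCHED_DEADLINE: MAX_DL_PRIO - 1     = -1  */
	int fifo_prio = 100 - 1 - 10;	/* SCHED_FIFO, rt_priority 10          = 89  */
	int cfs_prio  = 120;		/* SCHED_NORMAL, nice 0: static_prio   = 120 */

	return dl_prio < fifo_prio && fifo_prio < cfs_prio;	/* always 1 */
}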
926 926
927 /** 927 /**
928 * task_curr - is this task currently executing on a CPU? 928 * task_curr - is this task currently executing on a CPU?
929 * @p: the task in question. 929 * @p: the task in question.
930 * 930 *
931 * Return: 1 if the task is currently executing. 0 otherwise. 931 * Return: 1 if the task is currently executing. 0 otherwise.
932 */ 932 */
933 inline int task_curr(const struct task_struct *p) 933 inline int task_curr(const struct task_struct *p)
934 { 934 {
935 return cpu_curr(task_cpu(p)) == p; 935 return cpu_curr(task_cpu(p)) == p;
936 } 936 }
937 937
938 static inline void check_class_changed(struct rq *rq, struct task_struct *p, 938 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
939 const struct sched_class *prev_class, 939 const struct sched_class *prev_class,
940 int oldprio) 940 int oldprio)
941 { 941 {
942 if (prev_class != p->sched_class) { 942 if (prev_class != p->sched_class) {
943 if (prev_class->switched_from) 943 if (prev_class->switched_from)
944 prev_class->switched_from(rq, p); 944 prev_class->switched_from(rq, p);
945 p->sched_class->switched_to(rq, p); 945 p->sched_class->switched_to(rq, p);
946 } else if (oldprio != p->prio || dl_task(p)) 946 } else if (oldprio != p->prio || dl_task(p))
947 p->sched_class->prio_changed(rq, p, oldprio); 947 p->sched_class->prio_changed(rq, p, oldprio);
948 } 948 }
949 949
950 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 950 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
951 { 951 {
952 const struct sched_class *class; 952 const struct sched_class *class;
953 953
954 if (p->sched_class == rq->curr->sched_class) { 954 if (p->sched_class == rq->curr->sched_class) {
955 rq->curr->sched_class->check_preempt_curr(rq, p, flags); 955 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
956 } else { 956 } else {
957 for_each_class(class) { 957 for_each_class(class) {
958 if (class == rq->curr->sched_class) 958 if (class == rq->curr->sched_class)
959 break; 959 break;
960 if (class == p->sched_class) { 960 if (class == p->sched_class) {
961 resched_task(rq->curr); 961 resched_task(rq->curr);
962 break; 962 break;
963 } 963 }
964 } 964 }
965 } 965 }
966 966
967 /* 967 /*
968 * A queue event has occurred, and we're going to schedule. In 968 * A queue event has occurred, and we're going to schedule. In
969 * this case, we can save a useless back-to-back clock update. 969 * this case, we can save a useless back-to-back clock update.
970 */ 970 */
971 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) 971 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
972 rq->skip_clock_update = 1; 972 rq->skip_clock_update = 1;
973 } 973 }
974 974
975 #ifdef CONFIG_SMP 975 #ifdef CONFIG_SMP
976 void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 976 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
977 { 977 {
978 #ifdef CONFIG_SCHED_DEBUG 978 #ifdef CONFIG_SCHED_DEBUG
979 /* 979 /*
980 * We should never call set_task_cpu() on a blocked task, 980 * We should never call set_task_cpu() on a blocked task,
981 * ttwu() will sort out the placement. 981 * ttwu() will sort out the placement.
982 */ 982 */
983 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 983 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
984 !(task_preempt_count(p) & PREEMPT_ACTIVE)); 984 !(task_preempt_count(p) & PREEMPT_ACTIVE));
985 985
986 #ifdef CONFIG_LOCKDEP 986 #ifdef CONFIG_LOCKDEP
987 /* 987 /*
988 * The caller should hold either p->pi_lock or rq->lock, when changing 988 * The caller should hold either p->pi_lock or rq->lock, when changing
989 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. 989 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
990 * 990 *
991 * sched_move_task() holds both and thus holding either pins the cgroup, 991 * sched_move_task() holds both and thus holding either pins the cgroup,
992 * see task_group(). 992 * see task_group().
993 * 993 *
994 * Furthermore, all task_rq users should acquire both locks, see 994 * Furthermore, all task_rq users should acquire both locks, see
995 * task_rq_lock(). 995 * task_rq_lock().
996 */ 996 */
997 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || 997 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
998 lockdep_is_held(&task_rq(p)->lock))); 998 lockdep_is_held(&task_rq(p)->lock)));
999 #endif 999 #endif
1000 #endif 1000 #endif
1001 1001
1002 trace_sched_migrate_task(p, new_cpu); 1002 trace_sched_migrate_task(p, new_cpu);
1003 1003
1004 if (task_cpu(p) != new_cpu) { 1004 if (task_cpu(p) != new_cpu) {
1005 if (p->sched_class->migrate_task_rq) 1005 if (p->sched_class->migrate_task_rq)
1006 p->sched_class->migrate_task_rq(p, new_cpu); 1006 p->sched_class->migrate_task_rq(p, new_cpu);
1007 p->se.nr_migrations++; 1007 p->se.nr_migrations++;
1008 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); 1008 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
1009 } 1009 }
1010 1010
1011 __set_task_cpu(p, new_cpu); 1011 __set_task_cpu(p, new_cpu);
1012 } 1012 }
1013 1013
1014 static void __migrate_swap_task(struct task_struct *p, int cpu) 1014 static void __migrate_swap_task(struct task_struct *p, int cpu)
1015 { 1015 {
1016 if (p->on_rq) { 1016 if (p->on_rq) {
1017 struct rq *src_rq, *dst_rq; 1017 struct rq *src_rq, *dst_rq;
1018 1018
1019 src_rq = task_rq(p); 1019 src_rq = task_rq(p);
1020 dst_rq = cpu_rq(cpu); 1020 dst_rq = cpu_rq(cpu);
1021 1021
1022 deactivate_task(src_rq, p, 0); 1022 deactivate_task(src_rq, p, 0);
1023 set_task_cpu(p, cpu); 1023 set_task_cpu(p, cpu);
1024 activate_task(dst_rq, p, 0); 1024 activate_task(dst_rq, p, 0);
1025 check_preempt_curr(dst_rq, p, 0); 1025 check_preempt_curr(dst_rq, p, 0);
1026 } else { 1026 } else {
1027 /* 1027 /*
1028 * Task isn't running anymore; make it appear like we migrated 1028 * Task isn't running anymore; make it appear like we migrated
1029 * it before it went to sleep. This means on wakeup we make the 1029 * it before it went to sleep. This means on wakeup we make the
1030 * previous cpu our target instead of where it really is. 1030 * previous cpu our target instead of where it really is.
1031 */ 1031 */
1032 p->wake_cpu = cpu; 1032 p->wake_cpu = cpu;
1033 } 1033 }
1034 } 1034 }
1035 1035
1036 struct migration_swap_arg { 1036 struct migration_swap_arg {
1037 struct task_struct *src_task, *dst_task; 1037 struct task_struct *src_task, *dst_task;
1038 int src_cpu, dst_cpu; 1038 int src_cpu, dst_cpu;
1039 }; 1039 };
1040 1040
1041 static int migrate_swap_stop(void *data) 1041 static int migrate_swap_stop(void *data)
1042 { 1042 {
1043 struct migration_swap_arg *arg = data; 1043 struct migration_swap_arg *arg = data;
1044 struct rq *src_rq, *dst_rq; 1044 struct rq *src_rq, *dst_rq;
1045 int ret = -EAGAIN; 1045 int ret = -EAGAIN;
1046 1046
1047 src_rq = cpu_rq(arg->src_cpu); 1047 src_rq = cpu_rq(arg->src_cpu);
1048 dst_rq = cpu_rq(arg->dst_cpu); 1048 dst_rq = cpu_rq(arg->dst_cpu);
1049 1049
1050 double_raw_lock(&arg->src_task->pi_lock, 1050 double_raw_lock(&arg->src_task->pi_lock,
1051 &arg->dst_task->pi_lock); 1051 &arg->dst_task->pi_lock);
1052 double_rq_lock(src_rq, dst_rq); 1052 double_rq_lock(src_rq, dst_rq);
1053 if (task_cpu(arg->dst_task) != arg->dst_cpu) 1053 if (task_cpu(arg->dst_task) != arg->dst_cpu)
1054 goto unlock; 1054 goto unlock;
1055 1055
1056 if (task_cpu(arg->src_task) != arg->src_cpu) 1056 if (task_cpu(arg->src_task) != arg->src_cpu)
1057 goto unlock; 1057 goto unlock;
1058 1058
1059 if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task))) 1059 if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
1060 goto unlock; 1060 goto unlock;
1061 1061
1062 if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task))) 1062 if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
1063 goto unlock; 1063 goto unlock;
1064 1064
1065 __migrate_swap_task(arg->src_task, arg->dst_cpu); 1065 __migrate_swap_task(arg->src_task, arg->dst_cpu);
1066 __migrate_swap_task(arg->dst_task, arg->src_cpu); 1066 __migrate_swap_task(arg->dst_task, arg->src_cpu);
1067 1067
1068 ret = 0; 1068 ret = 0;
1069 1069
1070 unlock: 1070 unlock:
1071 double_rq_unlock(src_rq, dst_rq); 1071 double_rq_unlock(src_rq, dst_rq);
1072 raw_spin_unlock(&arg->dst_task->pi_lock); 1072 raw_spin_unlock(&arg->dst_task->pi_lock);
1073 raw_spin_unlock(&arg->src_task->pi_lock); 1073 raw_spin_unlock(&arg->src_task->pi_lock);
1074 1074
1075 return ret; 1075 return ret;
1076 } 1076 }
1077 1077
1078 /* 1078 /*
1079 * Cross migrate two tasks 1079 * Cross migrate two tasks
1080 */ 1080 */
1081 int migrate_swap(struct task_struct *cur, struct task_struct *p) 1081 int migrate_swap(struct task_struct *cur, struct task_struct *p)
1082 { 1082 {
1083 struct migration_swap_arg arg; 1083 struct migration_swap_arg arg;
1084 int ret = -EINVAL; 1084 int ret = -EINVAL;
1085 1085
1086 arg = (struct migration_swap_arg){ 1086 arg = (struct migration_swap_arg){
1087 .src_task = cur, 1087 .src_task = cur,
1088 .src_cpu = task_cpu(cur), 1088 .src_cpu = task_cpu(cur),
1089 .dst_task = p, 1089 .dst_task = p,
1090 .dst_cpu = task_cpu(p), 1090 .dst_cpu = task_cpu(p),
1091 }; 1091 };
1092 1092
1093 if (arg.src_cpu == arg.dst_cpu) 1093 if (arg.src_cpu == arg.dst_cpu)
1094 goto out; 1094 goto out;
1095 1095
1096 /* 1096 /*
1097 * These three tests are all lockless; this is OK since all of them 1097 * These three tests are all lockless; this is OK since all of them
1098 * will be re-checked with proper locks held further down the line. 1098 * will be re-checked with proper locks held further down the line.
1099 */ 1099 */
1100 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) 1100 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
1101 goto out; 1101 goto out;
1102 1102
1103 if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task))) 1103 if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
1104 goto out; 1104 goto out;
1105 1105
1106 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) 1106 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
1107 goto out; 1107 goto out;
1108 1108
1109 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); 1109 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
1110 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); 1110 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
1111 1111
1112 out: 1112 out:
1113 return ret; 1113 return ret;
1114 } 1114 }
1115 1115
1116 struct migration_arg { 1116 struct migration_arg {
1117 struct task_struct *task; 1117 struct task_struct *task;
1118 int dest_cpu; 1118 int dest_cpu;
1119 }; 1119 };
1120 1120
1121 static int migration_cpu_stop(void *data); 1121 static int migration_cpu_stop(void *data);
1122 1122
1123 /* 1123 /*
1124 * wait_task_inactive - wait for a thread to unschedule. 1124 * wait_task_inactive - wait for a thread to unschedule.
1125 * 1125 *
1126 * If @match_state is nonzero, it's the @p->state value just checked and 1126 * If @match_state is nonzero, it's the @p->state value just checked and
1127 * not expected to change. If it changes, i.e. @p might have woken up, 1127 * not expected to change. If it changes, i.e. @p might have woken up,
1128 * then return zero. When we succeed in waiting for @p to be off its CPU, 1128 * then return zero. When we succeed in waiting for @p to be off its CPU,
1129 * we return a positive number (its total switch count). If a second call 1129 * we return a positive number (its total switch count). If a second call
1130 * a short while later returns the same number, the caller can be sure that 1130 * a short while later returns the same number, the caller can be sure that
1131 * @p has remained unscheduled the whole time. 1131 * @p has remained unscheduled the whole time.
1132 * 1132 *
1133 * The caller must ensure that the task *will* unschedule sometime soon, 1133 * The caller must ensure that the task *will* unschedule sometime soon,
1134 * else this function might spin for a *long* time. This function can't 1134 * else this function might spin for a *long* time. This function can't
1135 * be called with interrupts off, or it may introduce deadlock with 1135 * be called with interrupts off, or it may introduce deadlock with
1136 * smp_call_function() if an IPI is sent by the same process we are 1136 * smp_call_function() if an IPI is sent by the same process we are
1137 * waiting to become inactive. 1137 * waiting to become inactive.
1138 */ 1138 */
1139 unsigned long wait_task_inactive(struct task_struct *p, long match_state) 1139 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1140 { 1140 {
1141 unsigned long flags; 1141 unsigned long flags;
1142 int running, on_rq; 1142 int running, on_rq;
1143 unsigned long ncsw; 1143 unsigned long ncsw;
1144 struct rq *rq; 1144 struct rq *rq;
1145 1145
1146 for (;;) { 1146 for (;;) {
1147 /* 1147 /*
1148 * We do the initial early heuristics without holding 1148 * We do the initial early heuristics without holding
1149 * any task-queue locks at all. We'll only try to get 1149 * any task-queue locks at all. We'll only try to get
1150 * the runqueue lock when things look like they will 1150 * the runqueue lock when things look like they will
1151 * work out! 1151 * work out!
1152 */ 1152 */
1153 rq = task_rq(p); 1153 rq = task_rq(p);
1154 1154
1155 /* 1155 /*
1156 * If the task is actively running on another CPU 1156 * If the task is actively running on another CPU
1157 * still, just relax and busy-wait without holding 1157 * still, just relax and busy-wait without holding
1158 * any locks. 1158 * any locks.
1159 * 1159 *
1160 * NOTE! Since we don't hold any locks, it's not 1160 * NOTE! Since we don't hold any locks, it's not
1161 * even sure that "rq" stays as the right runqueue! 1161 * even sure that "rq" stays as the right runqueue!
1162 * But we don't care, since "task_running()" will 1162 * But we don't care, since "task_running()" will
1163 * return false if the runqueue has changed and p 1163 * return false if the runqueue has changed and p
1164 * is actually now running somewhere else! 1164 * is actually now running somewhere else!
1165 */ 1165 */
1166 while (task_running(rq, p)) { 1166 while (task_running(rq, p)) {
1167 if (match_state && unlikely(p->state != match_state)) 1167 if (match_state && unlikely(p->state != match_state))
1168 return 0; 1168 return 0;
1169 cpu_relax(); 1169 cpu_relax();
1170 } 1170 }
1171 1171
1172 /* 1172 /*
1173 * Ok, time to look more closely! We need the rq 1173 * Ok, time to look more closely! We need the rq
1174 * lock now, to be *sure*. If we're wrong, we'll 1174 * lock now, to be *sure*. If we're wrong, we'll
1175 * just go back and repeat. 1175 * just go back and repeat.
1176 */ 1176 */
1177 rq = task_rq_lock(p, &flags); 1177 rq = task_rq_lock(p, &flags);
1178 trace_sched_wait_task(p); 1178 trace_sched_wait_task(p);
1179 running = task_running(rq, p); 1179 running = task_running(rq, p);
1180 on_rq = p->on_rq; 1180 on_rq = p->on_rq;
1181 ncsw = 0; 1181 ncsw = 0;
1182 if (!match_state || p->state == match_state) 1182 if (!match_state || p->state == match_state)
1183 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 1183 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
1184 task_rq_unlock(rq, p, &flags); 1184 task_rq_unlock(rq, p, &flags);
1185 1185
1186 /* 1186 /*
1187 * If it changed from the expected state, bail out now. 1187 * If it changed from the expected state, bail out now.
1188 */ 1188 */
1189 if (unlikely(!ncsw)) 1189 if (unlikely(!ncsw))
1190 break; 1190 break;
1191 1191
1192 /* 1192 /*
1193 * Was it really running after all now that we 1193 * Was it really running after all now that we
1194 * checked with the proper locks actually held? 1194 * checked with the proper locks actually held?
1195 * 1195 *
1196 * Oops. Go back and try again.. 1196 * Oops. Go back and try again..
1197 */ 1197 */
1198 if (unlikely(running)) { 1198 if (unlikely(running)) {
1199 cpu_relax(); 1199 cpu_relax();
1200 continue; 1200 continue;
1201 } 1201 }
1202 1202
1203 /* 1203 /*
1204 * It's not enough that it's not actively running, 1204 * It's not enough that it's not actively running,
1205 * it must be off the runqueue _entirely_, and not 1205 * it must be off the runqueue _entirely_, and not
1206 * preempted! 1206 * preempted!
1207 * 1207 *
1208 * So if it was still runnable (but just not actively 1208 * So if it was still runnable (but just not actively
1209 * running right now), it's preempted, and we should 1209 * running right now), it's preempted, and we should
1210 * yield - it could be a while. 1210 * yield - it could be a while.
1211 */ 1211 */
1212 if (unlikely(on_rq)) { 1212 if (unlikely(on_rq)) {
1213 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); 1213 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
1214 1214
1215 set_current_state(TASK_UNINTERRUPTIBLE); 1215 set_current_state(TASK_UNINTERRUPTIBLE);
1216 schedule_hrtimeout(&to, HRTIMER_MODE_REL); 1216 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
1217 continue; 1217 continue;
1218 } 1218 }
1219 1219
1220 /* 1220 /*
1221 * Ahh, all good. It wasn't running, and it wasn't 1221 * Ahh, all good. It wasn't running, and it wasn't
1222 * runnable, which means that it will never become 1222 * runnable, which means that it will never become
1223 * running in the future either. We're all done! 1223 * running in the future either. We're all done!
1224 */ 1224 */
1225 break; 1225 break;
1226 } 1226 }
1227 1227
1228 return ncsw; 1228 return ncsw;
1229 } 1229 }
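The p->nvcsw | LONG_MIN above (the "sets MSB" comment) makes the success return non-zero even when the context-switch count is still 0, while two tagged values from successive calls still compare equal iff the underlying counts do. A minimal sketch of that property:

/* Sketch: tag a counter with the sign bit so 0 is reserved for failure. */
static unsigned long tag_switch_count(unsigned long nvcsw)
{
	return nvcsw | LONG_MIN;	/* MSB set -> never reads as 0 */
}
/* tag_switch_count(0) != 0, and for realistic counts (MSB clear)
 * tag_switch_count(a) == tag_switch_count(b) iff a == b. */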
1230 1230
1231 /*** 1231 /***
1232 * kick_process - kick a running thread to enter/exit the kernel 1232 * kick_process - kick a running thread to enter/exit the kernel
1233 * @p: the to-be-kicked thread 1233 * @p: the to-be-kicked thread
1234 * 1234 *
1235 * Cause a process which is running on another CPU to enter 1235 * Cause a process which is running on another CPU to enter
1236 * kernel-mode, without any delay. (to get signals handled.) 1236 * kernel-mode, without any delay. (to get signals handled.)
1237 * 1237 *
1238 * NOTE: this function doesn't have to take the runqueue lock, 1238 * NOTE: this function doesn't have to take the runqueue lock,
1239 * because all it wants to ensure is that the remote task enters 1239 * because all it wants to ensure is that the remote task enters
1240 * the kernel. If the IPI races and the task has been migrated 1240 * the kernel. If the IPI races and the task has been migrated
1241 * to another CPU then no harm is done and the purpose has been 1241 * to another CPU then no harm is done and the purpose has been
1242 * achieved as well. 1242 * achieved as well.
1243 */ 1243 */
1244 void kick_process(struct task_struct *p) 1244 void kick_process(struct task_struct *p)
1245 { 1245 {
1246 int cpu; 1246 int cpu;
1247 1247
1248 preempt_disable(); 1248 preempt_disable();
1249 cpu = task_cpu(p); 1249 cpu = task_cpu(p);
1250 if ((cpu != smp_processor_id()) && task_curr(p)) 1250 if ((cpu != smp_processor_id()) && task_curr(p))
1251 smp_send_reschedule(cpu); 1251 smp_send_reschedule(cpu);
1252 preempt_enable(); 1252 preempt_enable();
1253 } 1253 }
1254 EXPORT_SYMBOL_GPL(kick_process); 1254 EXPORT_SYMBOL_GPL(kick_process);
1255 #endif /* CONFIG_SMP */ 1255 #endif /* CONFIG_SMP */
1256 1256
1257 #ifdef CONFIG_SMP 1257 #ifdef CONFIG_SMP
1258 /* 1258 /*
1259 * ->cpus_allowed is protected by both rq->lock and p->pi_lock 1259 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
1260 */ 1260 */
1261 static int select_fallback_rq(int cpu, struct task_struct *p) 1261 static int select_fallback_rq(int cpu, struct task_struct *p)
1262 { 1262 {
1263 int nid = cpu_to_node(cpu); 1263 int nid = cpu_to_node(cpu);
1264 const struct cpumask *nodemask = NULL; 1264 const struct cpumask *nodemask = NULL;
1265 enum { cpuset, possible, fail } state = cpuset; 1265 enum { cpuset, possible, fail } state = cpuset;
1266 int dest_cpu; 1266 int dest_cpu;
1267 1267
1268 /* 1268 /*
1269 * If the node that the cpu is on has been offlined, cpu_to_node() 1269 * If the node that the cpu is on has been offlined, cpu_to_node()
1270 * will return -1. There is no cpu on the node, and we should 1270 * will return -1. There is no cpu on the node, and we should
1271 * select a cpu on another node. 1271 * select a cpu on another node.
1272 */ 1272 */
1273 if (nid != -1) { 1273 if (nid != -1) {
1274 nodemask = cpumask_of_node(nid); 1274 nodemask = cpumask_of_node(nid);
1275 1275
1276 /* Look for allowed, online CPU in same node. */ 1276 /* Look for allowed, online CPU in same node. */
1277 for_each_cpu(dest_cpu, nodemask) { 1277 for_each_cpu(dest_cpu, nodemask) {
1278 if (!cpu_online(dest_cpu)) 1278 if (!cpu_online(dest_cpu))
1279 continue; 1279 continue;
1280 if (!cpu_active(dest_cpu)) 1280 if (!cpu_active(dest_cpu))
1281 continue; 1281 continue;
1282 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 1282 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1283 return dest_cpu; 1283 return dest_cpu;
1284 } 1284 }
1285 } 1285 }
1286 1286
1287 for (;;) { 1287 for (;;) {
1288 /* Any allowed, online CPU? */ 1288 /* Any allowed, online CPU? */
1289 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { 1289 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
1290 if (!cpu_online(dest_cpu)) 1290 if (!cpu_online(dest_cpu))
1291 continue; 1291 continue;
1292 if (!cpu_active(dest_cpu)) 1292 if (!cpu_active(dest_cpu))
1293 continue; 1293 continue;
1294 goto out; 1294 goto out;
1295 } 1295 }
1296 1296
1297 switch (state) { 1297 switch (state) {
1298 case cpuset: 1298 case cpuset:
1299 /* No more Mr. Nice Guy. */ 1299 /* No more Mr. Nice Guy. */
1300 cpuset_cpus_allowed_fallback(p); 1300 cpuset_cpus_allowed_fallback(p);
1301 state = possible; 1301 state = possible;
1302 break; 1302 break;
1303 1303
1304 case possible: 1304 case possible:
1305 do_set_cpus_allowed(p, cpu_possible_mask); 1305 do_set_cpus_allowed(p, cpu_possible_mask);
1306 state = fail; 1306 state = fail;
1307 break; 1307 break;
1308 1308
1309 case fail: 1309 case fail:
1310 BUG(); 1310 BUG();
1311 break; 1311 break;
1312 } 1312 }
1313 } 1313 }
1314 1314
1315 out: 1315 out:
1316 if (state != cpuset) { 1316 if (state != cpuset) {
1317 /* 1317 /*
1318 * Don't tell them about moving exiting tasks or 1318 * Don't tell them about moving exiting tasks or
1319 * kernel threads (both mm NULL), since they never 1319 * kernel threads (both mm NULL), since they never
1320 * leave the kernel. 1320 * leave the kernel.
1321 */ 1321 */
1322 if (p->mm && printk_ratelimit()) { 1322 if (p->mm && printk_ratelimit()) {
1323 printk_sched("process %d (%s) no longer affine to cpu%d\n", 1323 printk_sched("process %d (%s) no longer affine to cpu%d\n",
1324 task_pid_nr(p), p->comm, cpu); 1324 task_pid_nr(p), p->comm, cpu);
1325 } 1325 }
1326 } 1326 }
1327 1327
1328 return dest_cpu; 1328 return dest_cpu;
1329 } 1329 }
1330 1330
1331 /* 1331 /*
1332 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 1332 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
1333 */ 1333 */
1334 static inline 1334 static inline
1335 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 1335 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1336 { 1336 {
1337 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 1337 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1338 1338
1339 /* 1339 /*
1340 * In order not to call set_task_cpu() on a blocking task we need 1340 * In order not to call set_task_cpu() on a blocking task we need
1341 * to rely on ttwu() to place the task on a valid ->cpus_allowed 1341 * to rely on ttwu() to place the task on a valid ->cpus_allowed
1342 * cpu. 1342 * cpu.
1343 * 1343 *
1344 * Since this is common to all placement strategies, this lives here. 1344 * Since this is common to all placement strategies, this lives here.
1345 * 1345 *
1346 * [ this allows ->select_task_rq() to simply return task_cpu(p) and 1346 * [ this allows ->select_task_rq() to simply return task_cpu(p) and
1347 * not worry about this generic constraint ] 1347 * not worry about this generic constraint ]
1348 */ 1348 */
1349 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || 1349 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
1350 !cpu_online(cpu))) 1350 !cpu_online(cpu)))
1351 cpu = select_fallback_rq(task_cpu(p), p); 1351 cpu = select_fallback_rq(task_cpu(p), p);
1352 1352
1353 return cpu; 1353 return cpu;
1354 } 1354 }
1355 1355
1356 static void update_avg(u64 *avg, u64 sample) 1356 static void update_avg(u64 *avg, u64 sample)
1357 { 1357 {
1358 s64 diff = sample - *avg; 1358 s64 diff = sample - *avg;
1359 *avg += diff >> 3; 1359 *avg += diff >> 3;
1360 } 1360 }
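update_avg() above is a fixed-point exponential moving average with weight 1/8 (the >> 3). A quick standalone check of that arithmetic, reusing the same two lines:

#include <stdint.h>
#include <stdio.h>

/*
 * Same arithmetic as update_avg() above: avg += (sample - avg) >> 3,
 * an exponential moving average that moves 1/8 of the way toward each
 * new sample (an arithmetic right shift on negative diffs is assumed,
 * exactly as the kernel assumes).
 */
static void update_avg(uint64_t *avg, uint64_t sample)
{
        int64_t diff = sample - *avg;

        *avg += diff >> 3;
}

int main(void)
{
        uint64_t samples[] = { 800, 800, 800, 80, 80 };
        uint64_t avg = 0;

        for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                update_avg(&avg, samples[i]);
                printf("sample=%3llu avg=%llu\n",
                       (unsigned long long)samples[i],
                       (unsigned long long)avg);
        }
        return 0;
}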
1361 #endif 1361 #endif
1362 1362
1363 static void 1363 static void
1364 ttwu_stat(struct task_struct *p, int cpu, int wake_flags) 1364 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1365 { 1365 {
1366 #ifdef CONFIG_SCHEDSTATS 1366 #ifdef CONFIG_SCHEDSTATS
1367 struct rq *rq = this_rq(); 1367 struct rq *rq = this_rq();
1368 1368
1369 #ifdef CONFIG_SMP 1369 #ifdef CONFIG_SMP
1370 int this_cpu = smp_processor_id(); 1370 int this_cpu = smp_processor_id();
1371 1371
1372 if (cpu == this_cpu) { 1372 if (cpu == this_cpu) {
1373 schedstat_inc(rq, ttwu_local); 1373 schedstat_inc(rq, ttwu_local);
1374 schedstat_inc(p, se.statistics.nr_wakeups_local); 1374 schedstat_inc(p, se.statistics.nr_wakeups_local);
1375 } else { 1375 } else {
1376 struct sched_domain *sd; 1376 struct sched_domain *sd;
1377 1377
1378 schedstat_inc(p, se.statistics.nr_wakeups_remote); 1378 schedstat_inc(p, se.statistics.nr_wakeups_remote);
1379 rcu_read_lock(); 1379 rcu_read_lock();
1380 for_each_domain(this_cpu, sd) { 1380 for_each_domain(this_cpu, sd) {
1381 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { 1381 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
1382 schedstat_inc(sd, ttwu_wake_remote); 1382 schedstat_inc(sd, ttwu_wake_remote);
1383 break; 1383 break;
1384 } 1384 }
1385 } 1385 }
1386 rcu_read_unlock(); 1386 rcu_read_unlock();
1387 } 1387 }
1388 1388
1389 if (wake_flags & WF_MIGRATED) 1389 if (wake_flags & WF_MIGRATED)
1390 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 1390 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
1391 1391
1392 #endif /* CONFIG_SMP */ 1392 #endif /* CONFIG_SMP */
1393 1393
1394 schedstat_inc(rq, ttwu_count); 1394 schedstat_inc(rq, ttwu_count);
1395 schedstat_inc(p, se.statistics.nr_wakeups); 1395 schedstat_inc(p, se.statistics.nr_wakeups);
1396 1396
1397 if (wake_flags & WF_SYNC) 1397 if (wake_flags & WF_SYNC)
1398 schedstat_inc(p, se.statistics.nr_wakeups_sync); 1398 schedstat_inc(p, se.statistics.nr_wakeups_sync);
1399 1399
1400 #endif /* CONFIG_SCHEDSTATS */ 1400 #endif /* CONFIG_SCHEDSTATS */
1401 } 1401 }
1402 1402
1403 static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) 1403 static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1404 { 1404 {
1405 activate_task(rq, p, en_flags); 1405 activate_task(rq, p, en_flags);
1406 p->on_rq = 1; 1406 p->on_rq = 1;
1407 1407
1408 /* if a worker is waking up, notify workqueue */ 1408 /* if a worker is waking up, notify workqueue */
1409 if (p->flags & PF_WQ_WORKER) 1409 if (p->flags & PF_WQ_WORKER)
1410 wq_worker_waking_up(p, cpu_of(rq)); 1410 wq_worker_waking_up(p, cpu_of(rq));
1411 } 1411 }
1412 1412
1413 /* 1413 /*
1414 * Mark the task runnable and perform wakeup-preemption. 1414 * Mark the task runnable and perform wakeup-preemption.
1415 */ 1415 */
1416 static void 1416 static void
1417 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) 1417 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1418 { 1418 {
1419 check_preempt_curr(rq, p, wake_flags); 1419 check_preempt_curr(rq, p, wake_flags);
1420 trace_sched_wakeup(p, true); 1420 trace_sched_wakeup(p, true);
1421 1421
1422 p->state = TASK_RUNNING; 1422 p->state = TASK_RUNNING;
1423 #ifdef CONFIG_SMP 1423 #ifdef CONFIG_SMP
1424 if (p->sched_class->task_woken) 1424 if (p->sched_class->task_woken)
1425 p->sched_class->task_woken(rq, p); 1425 p->sched_class->task_woken(rq, p);
1426 1426
1427 if (rq->idle_stamp) { 1427 if (rq->idle_stamp) {
1428 u64 delta = rq_clock(rq) - rq->idle_stamp; 1428 u64 delta = rq_clock(rq) - rq->idle_stamp;
1429 u64 max = 2*rq->max_idle_balance_cost; 1429 u64 max = 2*rq->max_idle_balance_cost;
1430 1430
1431 update_avg(&rq->avg_idle, delta); 1431 update_avg(&rq->avg_idle, delta);
1432 1432
1433 if (rq->avg_idle > max) 1433 if (rq->avg_idle > max)
1434 rq->avg_idle = max; 1434 rq->avg_idle = max;
1435 1435
1436 rq->idle_stamp = 0; 1436 rq->idle_stamp = 0;
1437 } 1437 }
1438 #endif 1438 #endif
1439 } 1439 }
1440 1440
1441 static void 1441 static void
1442 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) 1442 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
1443 { 1443 {
1444 #ifdef CONFIG_SMP 1444 #ifdef CONFIG_SMP
1445 if (p->sched_contributes_to_load) 1445 if (p->sched_contributes_to_load)
1446 rq->nr_uninterruptible--; 1446 rq->nr_uninterruptible--;
1447 #endif 1447 #endif
1448 1448
1449 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); 1449 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
1450 ttwu_do_wakeup(rq, p, wake_flags); 1450 ttwu_do_wakeup(rq, p, wake_flags);
1451 } 1451 }
1452 1452
1453 /* 1453 /*
1454 * Called in case the task @p isn't fully descheduled from its runqueue; 1454 * Called in case the task @p isn't fully descheduled from its runqueue;
1455 * in this case we must do a remote wakeup. It's a 'light' wakeup though, 1455 * in this case we must do a remote wakeup. It's a 'light' wakeup though,
1456 * since all we need to do is flip p->state to TASK_RUNNING; the task 1456 * since all we need to do is flip p->state to TASK_RUNNING; the task
1457 * is still ->on_rq. 1457 * is still ->on_rq.
1458 */ 1458 */
1459 static int ttwu_remote(struct task_struct *p, int wake_flags) 1459 static int ttwu_remote(struct task_struct *p, int wake_flags)
1460 { 1460 {
1461 struct rq *rq; 1461 struct rq *rq;
1462 int ret = 0; 1462 int ret = 0;
1463 1463
1464 rq = __task_rq_lock(p); 1464 rq = __task_rq_lock(p);
1465 if (p->on_rq) { 1465 if (p->on_rq) {
1466 /* check_preempt_curr() may use rq clock */ 1466 /* check_preempt_curr() may use rq clock */
1467 update_rq_clock(rq); 1467 update_rq_clock(rq);
1468 ttwu_do_wakeup(rq, p, wake_flags); 1468 ttwu_do_wakeup(rq, p, wake_flags);
1469 ret = 1; 1469 ret = 1;
1470 } 1470 }
1471 __task_rq_unlock(rq); 1471 __task_rq_unlock(rq);
1472 1472
1473 return ret; 1473 return ret;
1474 } 1474 }
1475 1475
1476 #ifdef CONFIG_SMP 1476 #ifdef CONFIG_SMP
1477 static void sched_ttwu_pending(void) 1477 static void sched_ttwu_pending(void)
1478 { 1478 {
1479 struct rq *rq = this_rq(); 1479 struct rq *rq = this_rq();
1480 struct llist_node *llist = llist_del_all(&rq->wake_list); 1480 struct llist_node *llist = llist_del_all(&rq->wake_list);
1481 struct task_struct *p; 1481 struct task_struct *p;
1482 1482
1483 raw_spin_lock(&rq->lock); 1483 raw_spin_lock(&rq->lock);
1484 1484
1485 while (llist) { 1485 while (llist) {
1486 p = llist_entry(llist, struct task_struct, wake_entry); 1486 p = llist_entry(llist, struct task_struct, wake_entry);
1487 llist = llist_next(llist); 1487 llist = llist_next(llist);
1488 ttwu_do_activate(rq, p, 0); 1488 ttwu_do_activate(rq, p, 0);
1489 } 1489 }
1490 1490
1491 raw_spin_unlock(&rq->lock); 1491 raw_spin_unlock(&rq->lock);
1492 } 1492 }
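rq->wake_list is a lock-free llist: remote CPUs push entries with llist_add() (see ttwu_queue_remote() below) and the IPI handler drains the whole batch with a single llist_del_all(), as above. The following is a rough user-space analogue of that push/drain pattern using C11 atomics; it is a sketch of the idea, not the kernel's llist implementation, and it omits the "was the list empty?" return value that llist_add() uses to decide whether an IPI is needed.

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

/*
 * Rough analogue of the rq->wake_list pattern: producers push with a
 * compare-and-swap (llist_add), the consumer grabs the whole list with one
 * atomic exchange (llist_del_all).  node/push/del_all are made-up names.
 */
struct node {
        struct node *next;
        int val;
};

static _Atomic(struct node *) wake_list;

static void push(struct node *n)                /* ~ llist_add() */
{
        struct node *old = atomic_load(&wake_list);

        do {
                n->next = old;
        } while (!atomic_compare_exchange_weak(&wake_list, &old, n));
}

static struct node *del_all(void)               /* ~ llist_del_all() */
{
        return atomic_exchange(&wake_list, NULL);
}

int main(void)
{
        struct node a = { .val = 1 }, b = { .val = 2 };

        push(&a);
        push(&b);

        /* drain once and walk the batch, most recently pushed entry first */
        for (struct node *n = del_all(); n; n = n->next)
                printf("waking %d\n", n->val);
        return 0;
}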
1493 1493
1494 void scheduler_ipi(void) 1494 void scheduler_ipi(void)
1495 { 1495 {
1496 /* 1496 /*
1497 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting 1497 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
1498 * TIF_NEED_RESCHED remotely (for the first time) will also send 1498 * TIF_NEED_RESCHED remotely (for the first time) will also send
1499 * this IPI. 1499 * this IPI.
1500 */ 1500 */
1501 preempt_fold_need_resched(); 1501 preempt_fold_need_resched();
1502 1502
1503 if (llist_empty(&this_rq()->wake_list) 1503 if (llist_empty(&this_rq()->wake_list)
1504 && !tick_nohz_full_cpu(smp_processor_id()) 1504 && !tick_nohz_full_cpu(smp_processor_id())
1505 && !got_nohz_idle_kick()) 1505 && !got_nohz_idle_kick())
1506 return; 1506 return;
1507 1507
1508 /* 1508 /*
1509 * Not all reschedule IPI handlers call irq_enter/irq_exit, since 1509 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
1510 * traditionally all their work was done from the interrupt return 1510 * traditionally all their work was done from the interrupt return
1511 * path. Now that we actually do some work, we need to make sure 1511 * path. Now that we actually do some work, we need to make sure
1512 * we do call them. 1512 * we do call them.
1513 * 1513 *
1514 * Some archs already do call them, luckily irq_enter/exit nest 1514 * Some archs already do call them, luckily irq_enter/exit nest
1515 * properly. 1515 * properly.
1516 * 1516 *
1517 * Arguably we should visit all archs and update all handlers, 1517 * Arguably we should visit all archs and update all handlers,
1518 * however a fair share of IPIs are still resched-only, so this would 1518 * however a fair share of IPIs are still resched-only, so this would
1519 * somewhat pessimize the simple resched case. 1519 * somewhat pessimize the simple resched case.
1520 */ 1520 */
1521 irq_enter(); 1521 irq_enter();
1522 tick_nohz_full_check(); 1522 tick_nohz_full_check();
1523 sched_ttwu_pending(); 1523 sched_ttwu_pending();
1524 1524
1525 /* 1525 /*
1526 * Check if someone kicked us for doing the nohz idle load balance. 1526 * Check if someone kicked us for doing the nohz idle load balance.
1527 */ 1527 */
1528 if (unlikely(got_nohz_idle_kick())) { 1528 if (unlikely(got_nohz_idle_kick())) {
1529 this_rq()->idle_balance = 1; 1529 this_rq()->idle_balance = 1;
1530 raise_softirq_irqoff(SCHED_SOFTIRQ); 1530 raise_softirq_irqoff(SCHED_SOFTIRQ);
1531 } 1531 }
1532 irq_exit(); 1532 irq_exit();
1533 } 1533 }
1534 1534
1535 static void ttwu_queue_remote(struct task_struct *p, int cpu) 1535 static void ttwu_queue_remote(struct task_struct *p, int cpu)
1536 { 1536 {
1537 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) 1537 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
1538 smp_send_reschedule(cpu); 1538 smp_send_reschedule(cpu);
1539 } 1539 }
1540 1540
1541 bool cpus_share_cache(int this_cpu, int that_cpu) 1541 bool cpus_share_cache(int this_cpu, int that_cpu)
1542 { 1542 {
1543 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 1543 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1544 } 1544 }
1545 #endif /* CONFIG_SMP */ 1545 #endif /* CONFIG_SMP */
1546 1546
1547 static void ttwu_queue(struct task_struct *p, int cpu) 1547 static void ttwu_queue(struct task_struct *p, int cpu)
1548 { 1548 {
1549 struct rq *rq = cpu_rq(cpu); 1549 struct rq *rq = cpu_rq(cpu);
1550 1550
1551 #if defined(CONFIG_SMP) 1551 #if defined(CONFIG_SMP)
1552 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 1552 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
1553 sched_clock_cpu(cpu); /* sync clocks x-cpu */ 1553 sched_clock_cpu(cpu); /* sync clocks x-cpu */
1554 ttwu_queue_remote(p, cpu); 1554 ttwu_queue_remote(p, cpu);
1555 return; 1555 return;
1556 } 1556 }
1557 #endif 1557 #endif
1558 1558
1559 raw_spin_lock(&rq->lock); 1559 raw_spin_lock(&rq->lock);
1560 ttwu_do_activate(rq, p, 0); 1560 ttwu_do_activate(rq, p, 0);
1561 raw_spin_unlock(&rq->lock); 1561 raw_spin_unlock(&rq->lock);
1562 } 1562 }
1563 1563
1564 /** 1564 /**
1565 * try_to_wake_up - wake up a thread 1565 * try_to_wake_up - wake up a thread
1566 * @p: the thread to be awakened 1566 * @p: the thread to be awakened
1567 * @state: the mask of task states that can be woken 1567 * @state: the mask of task states that can be woken
1568 * @wake_flags: wake modifier flags (WF_*) 1568 * @wake_flags: wake modifier flags (WF_*)
1569 * 1569 *
1570 * Put it on the run-queue if it's not already there. The "current" 1570 * Put it on the run-queue if it's not already there. The "current"
1571 * thread is always on the run-queue (except when the actual 1571 * thread is always on the run-queue (except when the actual
1572 * re-schedule is in progress), and as such you're allowed to do 1572 * re-schedule is in progress), and as such you're allowed to do
1573 * the simpler "current->state = TASK_RUNNING" to mark yourself 1573 * the simpler "current->state = TASK_RUNNING" to mark yourself
1574 * runnable without the overhead of this. 1574 * runnable without the overhead of this.
1575 * 1575 *
1576 * Return: %true if @p was woken up, %false if it was already running, 1576 * Return: %true if @p was woken up, %false if it was already running,
1577 * or @state didn't match @p's state. 1577 * or @state didn't match @p's state.
1578 */ 1578 */
1579 static int 1579 static int
1580 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) 1580 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1581 { 1581 {
1582 unsigned long flags; 1582 unsigned long flags;
1583 int cpu, success = 0; 1583 int cpu, success = 0;
1584 1584
1585 /* 1585 /*
1586 * If we are going to wake up a thread waiting for CONDITION we 1586 * If we are going to wake up a thread waiting for CONDITION we
1587 * need to ensure that CONDITION=1 done by the caller can not be 1587 * need to ensure that CONDITION=1 done by the caller can not be
1588 * reordered with p->state check below. This pairs with mb() in 1588 * reordered with p->state check below. This pairs with mb() in
1589 * set_current_state() the waiting thread does. 1589 * set_current_state() the waiting thread does.
1590 */ 1590 */
1591 smp_mb__before_spinlock(); 1591 smp_mb__before_spinlock();
1592 raw_spin_lock_irqsave(&p->pi_lock, flags); 1592 raw_spin_lock_irqsave(&p->pi_lock, flags);
1593 if (!(p->state & state)) 1593 if (!(p->state & state))
1594 goto out; 1594 goto out;
1595 1595
1596 success = 1; /* we're going to change ->state */ 1596 success = 1; /* we're going to change ->state */
1597 cpu = task_cpu(p); 1597 cpu = task_cpu(p);
1598 1598
1599 if (p->on_rq && ttwu_remote(p, wake_flags)) 1599 if (p->on_rq && ttwu_remote(p, wake_flags))
1600 goto stat; 1600 goto stat;
1601 1601
1602 #ifdef CONFIG_SMP 1602 #ifdef CONFIG_SMP
1603 /* 1603 /*
1604 * If the owning (remote) cpu is still in the middle of schedule() with 1604 * If the owning (remote) cpu is still in the middle of schedule() with
1605 * this task as prev, wait until it's done referencing the task. 1605 * this task as prev, wait until it's done referencing the task.
1606 */ 1606 */
1607 while (p->on_cpu) 1607 while (p->on_cpu)
1608 cpu_relax(); 1608 cpu_relax();
1609 /* 1609 /*
1610 * Pairs with the smp_wmb() in finish_lock_switch(). 1610 * Pairs with the smp_wmb() in finish_lock_switch().
1611 */ 1611 */
1612 smp_rmb(); 1612 smp_rmb();
1613 1613
1614 p->sched_contributes_to_load = !!task_contributes_to_load(p); 1614 p->sched_contributes_to_load = !!task_contributes_to_load(p);
1615 p->state = TASK_WAKING; 1615 p->state = TASK_WAKING;
1616 1616
1617 if (p->sched_class->task_waking) 1617 if (p->sched_class->task_waking)
1618 p->sched_class->task_waking(p); 1618 p->sched_class->task_waking(p);
1619 1619
1620 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); 1620 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
1621 if (task_cpu(p) != cpu) { 1621 if (task_cpu(p) != cpu) {
1622 wake_flags |= WF_MIGRATED; 1622 wake_flags |= WF_MIGRATED;
1623 set_task_cpu(p, cpu); 1623 set_task_cpu(p, cpu);
1624 } 1624 }
1625 #endif /* CONFIG_SMP */ 1625 #endif /* CONFIG_SMP */
1626 1626
1627 ttwu_queue(p, cpu); 1627 ttwu_queue(p, cpu);
1628 stat: 1628 stat:
1629 ttwu_stat(p, cpu, wake_flags); 1629 ttwu_stat(p, cpu, wake_flags);
1630 out: 1630 out:
1631 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 1631 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1632 1632
1633 return success; 1633 return success;
1634 } 1634 }
1635 1635
1636 /** 1636 /**
1637 * try_to_wake_up_local - try to wake up a local task with rq lock held 1637 * try_to_wake_up_local - try to wake up a local task with rq lock held
1638 * @p: the thread to be awakened 1638 * @p: the thread to be awakened
1639 * 1639 *
1640 * Put @p on the run-queue if it's not already there. The caller must 1640 * Put @p on the run-queue if it's not already there. The caller must
1641 * ensure that this_rq() is locked, @p is bound to this_rq() and not 1641 * ensure that this_rq() is locked, @p is bound to this_rq() and not
1642 * the current task. 1642 * the current task.
1643 */ 1643 */
1644 static void try_to_wake_up_local(struct task_struct *p) 1644 static void try_to_wake_up_local(struct task_struct *p)
1645 { 1645 {
1646 struct rq *rq = task_rq(p); 1646 struct rq *rq = task_rq(p);
1647 1647
1648 if (WARN_ON_ONCE(rq != this_rq()) || 1648 if (WARN_ON_ONCE(rq != this_rq()) ||
1649 WARN_ON_ONCE(p == current)) 1649 WARN_ON_ONCE(p == current))
1650 return; 1650 return;
1651 1651
1652 lockdep_assert_held(&rq->lock); 1652 lockdep_assert_held(&rq->lock);
1653 1653
1654 if (!raw_spin_trylock(&p->pi_lock)) { 1654 if (!raw_spin_trylock(&p->pi_lock)) {
1655 raw_spin_unlock(&rq->lock); 1655 raw_spin_unlock(&rq->lock);
1656 raw_spin_lock(&p->pi_lock); 1656 raw_spin_lock(&p->pi_lock);
1657 raw_spin_lock(&rq->lock); 1657 raw_spin_lock(&rq->lock);
1658 } 1658 }
1659 1659
1660 if (!(p->state & TASK_NORMAL)) 1660 if (!(p->state & TASK_NORMAL))
1661 goto out; 1661 goto out;
1662 1662
1663 if (!p->on_rq) 1663 if (!p->on_rq)
1664 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 1664 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1665 1665
1666 ttwu_do_wakeup(rq, p, 0); 1666 ttwu_do_wakeup(rq, p, 0);
1667 ttwu_stat(p, smp_processor_id(), 0); 1667 ttwu_stat(p, smp_processor_id(), 0);
1668 out: 1668 out:
1669 raw_spin_unlock(&p->pi_lock); 1669 raw_spin_unlock(&p->pi_lock);
1670 } 1670 }
1671 1671
1672 /** 1672 /**
1673 * wake_up_process - Wake up a specific process 1673 * wake_up_process - Wake up a specific process
1674 * @p: The process to be woken up. 1674 * @p: The process to be woken up.
1675 * 1675 *
1676 * Attempt to wake up the nominated process and move it to the set of runnable 1676 * Attempt to wake up the nominated process and move it to the set of runnable
1677 * processes. 1677 * processes.
1678 * 1678 *
1679 * Return: 1 if the process was woken up, 0 if it was already running. 1679 * Return: 1 if the process was woken up, 0 if it was already running.
1680 * 1680 *
1681 * It may be assumed that this function implies a write memory barrier before 1681 * It may be assumed that this function implies a write memory barrier before
1682 * changing the task state if and only if any tasks are woken up. 1682 * changing the task state if and only if any tasks are woken up.
1683 */ 1683 */
1684 int wake_up_process(struct task_struct *p) 1684 int wake_up_process(struct task_struct *p)
1685 { 1685 {
1686 WARN_ON(task_is_stopped_or_traced(p)); 1686 WARN_ON(task_is_stopped_or_traced(p));
1687 return try_to_wake_up(p, TASK_NORMAL, 0); 1687 return try_to_wake_up(p, TASK_NORMAL, 0);
1688 } 1688 }
1689 EXPORT_SYMBOL(wake_up_process); 1689 EXPORT_SYMBOL(wake_up_process);
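The barrier pairing described inside try_to_wake_up() exists to serve the canonical sleep/wake idiom sketched below. This is kernel-context pseudocode rather than a standalone program; CONDITION and sleeper_task are placeholders.

/*
 * Canonical sleeper/waker pairing served by the barriers discussed in
 * try_to_wake_up() above (kernel context, not standalone; CONDITION and
 * sleeper_task are placeholders).
 */

/* sleeper */
for (;;) {
        set_current_state(TASK_INTERRUPTIBLE);
        if (CONDITION)
                break;
        schedule();
}
__set_current_state(TASK_RUNNING);

/* waker */
CONDITION = 1;                          /* store must be visible before the wakeup */
wake_up_process(sleeper_task);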
1690 1690
1691 int wake_up_state(struct task_struct *p, unsigned int state) 1691 int wake_up_state(struct task_struct *p, unsigned int state)
1692 { 1692 {
1693 return try_to_wake_up(p, state, 0); 1693 return try_to_wake_up(p, state, 0);
1694 } 1694 }
1695 1695
1696 /* 1696 /*
1697 * Perform scheduler related setup for a newly forked process p. 1697 * Perform scheduler related setup for a newly forked process p.
1698 * p is forked by current. 1698 * p is forked by current.
1699 * 1699 *
1700 * __sched_fork() is basic setup used by init_idle() too: 1700 * __sched_fork() is basic setup used by init_idle() too:
1701 */ 1701 */
1702 static void __sched_fork(unsigned long clone_flags, struct task_struct *p) 1702 static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1703 { 1703 {
1704 p->on_rq = 0; 1704 p->on_rq = 0;
1705 1705
1706 p->se.on_rq = 0; 1706 p->se.on_rq = 0;
1707 p->se.exec_start = 0; 1707 p->se.exec_start = 0;
1708 p->se.sum_exec_runtime = 0; 1708 p->se.sum_exec_runtime = 0;
1709 p->se.prev_sum_exec_runtime = 0; 1709 p->se.prev_sum_exec_runtime = 0;
1710 p->se.nr_migrations = 0; 1710 p->se.nr_migrations = 0;
1711 p->se.vruntime = 0; 1711 p->se.vruntime = 0;
1712 INIT_LIST_HEAD(&p->se.group_node); 1712 INIT_LIST_HEAD(&p->se.group_node);
1713 1713
1714 #ifdef CONFIG_SCHEDSTATS 1714 #ifdef CONFIG_SCHEDSTATS
1715 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 1715 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1716 #endif 1716 #endif
1717 1717
1718 RB_CLEAR_NODE(&p->dl.rb_node); 1718 RB_CLEAR_NODE(&p->dl.rb_node);
1719 hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1719 hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1720 p->dl.dl_runtime = p->dl.runtime = 0; 1720 p->dl.dl_runtime = p->dl.runtime = 0;
1721 p->dl.dl_deadline = p->dl.deadline = 0; 1721 p->dl.dl_deadline = p->dl.deadline = 0;
1722 p->dl.dl_period = 0; 1722 p->dl.dl_period = 0;
1723 p->dl.flags = 0; 1723 p->dl.flags = 0;
1724 1724
1725 INIT_LIST_HEAD(&p->rt.run_list); 1725 INIT_LIST_HEAD(&p->rt.run_list);
1726 1726
1727 #ifdef CONFIG_PREEMPT_NOTIFIERS 1727 #ifdef CONFIG_PREEMPT_NOTIFIERS
1728 INIT_HLIST_HEAD(&p->preempt_notifiers); 1728 INIT_HLIST_HEAD(&p->preempt_notifiers);
1729 #endif 1729 #endif
1730 1730
1731 #ifdef CONFIG_NUMA_BALANCING 1731 #ifdef CONFIG_NUMA_BALANCING
1732 if (p->mm && atomic_read(&p->mm->mm_users) == 1) { 1732 if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
1733 p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); 1733 p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
1734 p->mm->numa_scan_seq = 0; 1734 p->mm->numa_scan_seq = 0;
1735 } 1735 }
1736 1736
1737 if (clone_flags & CLONE_VM) 1737 if (clone_flags & CLONE_VM)
1738 p->numa_preferred_nid = current->numa_preferred_nid; 1738 p->numa_preferred_nid = current->numa_preferred_nid;
1739 else 1739 else
1740 p->numa_preferred_nid = -1; 1740 p->numa_preferred_nid = -1;
1741 1741
1742 p->node_stamp = 0ULL; 1742 p->node_stamp = 0ULL;
1743 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 1743 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1744 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 1744 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1745 p->numa_work.next = &p->numa_work; 1745 p->numa_work.next = &p->numa_work;
1746 p->numa_faults_memory = NULL; 1746 p->numa_faults_memory = NULL;
1747 p->numa_faults_buffer_memory = NULL; 1747 p->numa_faults_buffer_memory = NULL;
1748 p->last_task_numa_placement = 0; 1748 p->last_task_numa_placement = 0;
1749 p->last_sum_exec_runtime = 0; 1749 p->last_sum_exec_runtime = 0;
1750 1750
1751 INIT_LIST_HEAD(&p->numa_entry); 1751 INIT_LIST_HEAD(&p->numa_entry);
1752 p->numa_group = NULL; 1752 p->numa_group = NULL;
1753 #endif /* CONFIG_NUMA_BALANCING */ 1753 #endif /* CONFIG_NUMA_BALANCING */
1754 } 1754 }
1755 1755
1756 #ifdef CONFIG_NUMA_BALANCING 1756 #ifdef CONFIG_NUMA_BALANCING
1757 #ifdef CONFIG_SCHED_DEBUG 1757 #ifdef CONFIG_SCHED_DEBUG
1758 void set_numabalancing_state(bool enabled) 1758 void set_numabalancing_state(bool enabled)
1759 { 1759 {
1760 if (enabled) 1760 if (enabled)
1761 sched_feat_set("NUMA"); 1761 sched_feat_set("NUMA");
1762 else 1762 else
1763 sched_feat_set("NO_NUMA"); 1763 sched_feat_set("NO_NUMA");
1764 } 1764 }
1765 #else 1765 #else
1766 __read_mostly bool numabalancing_enabled; 1766 __read_mostly bool numabalancing_enabled;
1767 1767
1768 void set_numabalancing_state(bool enabled) 1768 void set_numabalancing_state(bool enabled)
1769 { 1769 {
1770 numabalancing_enabled = enabled; 1770 numabalancing_enabled = enabled;
1771 } 1771 }
1772 #endif /* CONFIG_SCHED_DEBUG */ 1772 #endif /* CONFIG_SCHED_DEBUG */
1773 1773
1774 #ifdef CONFIG_PROC_SYSCTL 1774 #ifdef CONFIG_PROC_SYSCTL
1775 int sysctl_numa_balancing(struct ctl_table *table, int write, 1775 int sysctl_numa_balancing(struct ctl_table *table, int write,
1776 void __user *buffer, size_t *lenp, loff_t *ppos) 1776 void __user *buffer, size_t *lenp, loff_t *ppos)
1777 { 1777 {
1778 struct ctl_table t; 1778 struct ctl_table t;
1779 int err; 1779 int err;
1780 int state = numabalancing_enabled; 1780 int state = numabalancing_enabled;
1781 1781
1782 if (write && !capable(CAP_SYS_ADMIN)) 1782 if (write && !capable(CAP_SYS_ADMIN))
1783 return -EPERM; 1783 return -EPERM;
1784 1784
1785 t = *table; 1785 t = *table;
1786 t.data = &state; 1786 t.data = &state;
1787 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); 1787 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
1788 if (err < 0) 1788 if (err < 0)
1789 return err; 1789 return err;
1790 if (write) 1790 if (write)
1791 set_numabalancing_state(state); 1791 set_numabalancing_state(state);
1792 return err; 1792 return err;
1793 } 1793 }
1794 #endif 1794 #endif
1795 #endif 1795 #endif
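The handler above is what sits behind /proc/sys/kernel/numa_balancing. A small sketch that flips it off, assuming the conventional procfs mount and CAP_SYS_ADMIN:

#include <stdio.h>

/*
 * Flip the knob served by sysctl_numa_balancing() above.  Assumes the
 * conventional procfs path and CAP_SYS_ADMIN.
 */
int main(void)
{
        FILE *f = fopen("/proc/sys/kernel/numa_balancing", "w");

        if (!f) {
                perror("numa_balancing");
                return 1;
        }
        fputs("0\n", f);        /* 0 = disable, 1 = enable */
        fclose(f);
        return 0;
}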
1796 1796
1797 /* 1797 /*
1798 * fork()/clone()-time setup: 1798 * fork()/clone()-time setup:
1799 */ 1799 */
1800 int sched_fork(unsigned long clone_flags, struct task_struct *p) 1800 int sched_fork(unsigned long clone_flags, struct task_struct *p)
1801 { 1801 {
1802 unsigned long flags; 1802 unsigned long flags;
1803 int cpu = get_cpu(); 1803 int cpu = get_cpu();
1804 1804
1805 __sched_fork(clone_flags, p); 1805 __sched_fork(clone_flags, p);
1806 /* 1806 /*
1807 * We mark the process as running here. This guarantees that 1807 * We mark the process as running here. This guarantees that
1808 * nobody will actually run it, and a signal or other external 1808 * nobody will actually run it, and a signal or other external
1809 * event cannot wake it up and insert it on the runqueue either. 1809 * event cannot wake it up and insert it on the runqueue either.
1810 */ 1810 */
1811 p->state = TASK_RUNNING; 1811 p->state = TASK_RUNNING;
1812 1812
1813 /* 1813 /*
1814 * Make sure we do not leak PI boosting priority to the child. 1814 * Make sure we do not leak PI boosting priority to the child.
1815 */ 1815 */
1816 p->prio = current->normal_prio; 1816 p->prio = current->normal_prio;
1817 1817
1818 /* 1818 /*
1819 * Revert to default priority/policy on fork if requested. 1819 * Revert to default priority/policy on fork if requested.
1820 */ 1820 */
1821 if (unlikely(p->sched_reset_on_fork)) { 1821 if (unlikely(p->sched_reset_on_fork)) {
1822 if (task_has_dl_policy(p) || task_has_rt_policy(p)) { 1822 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
1823 p->policy = SCHED_NORMAL; 1823 p->policy = SCHED_NORMAL;
1824 p->static_prio = NICE_TO_PRIO(0); 1824 p->static_prio = NICE_TO_PRIO(0);
1825 p->rt_priority = 0; 1825 p->rt_priority = 0;
1826 } else if (PRIO_TO_NICE(p->static_prio) < 0) 1826 } else if (PRIO_TO_NICE(p->static_prio) < 0)
1827 p->static_prio = NICE_TO_PRIO(0); 1827 p->static_prio = NICE_TO_PRIO(0);
1828 1828
1829 p->prio = p->normal_prio = __normal_prio(p); 1829 p->prio = p->normal_prio = __normal_prio(p);
1830 set_load_weight(p); 1830 set_load_weight(p);
1831 1831
1832 /* 1832 /*
1833 * We don't need the reset flag anymore after the fork. It has 1833 * We don't need the reset flag anymore after the fork. It has
1834 * fulfilled its duty: 1834 * fulfilled its duty:
1835 */ 1835 */
1836 p->sched_reset_on_fork = 0; 1836 p->sched_reset_on_fork = 0;
1837 } 1837 }
1838 1838
1839 if (dl_prio(p->prio)) { 1839 if (dl_prio(p->prio)) {
1840 put_cpu(); 1840 put_cpu();
1841 return -EAGAIN; 1841 return -EAGAIN;
1842 } else if (rt_prio(p->prio)) { 1842 } else if (rt_prio(p->prio)) {
1843 p->sched_class = &rt_sched_class; 1843 p->sched_class = &rt_sched_class;
1844 } else { 1844 } else {
1845 p->sched_class = &fair_sched_class; 1845 p->sched_class = &fair_sched_class;
1846 } 1846 }
1847 1847
1848 if (p->sched_class->task_fork) 1848 if (p->sched_class->task_fork)
1849 p->sched_class->task_fork(p); 1849 p->sched_class->task_fork(p);
1850 1850
1851 /* 1851 /*
1852 * The child is not yet in the pid-hash so no cgroup attach races, 1852 * The child is not yet in the pid-hash so no cgroup attach races,
1853 * and the cgroup is pinned to this child because cgroup_fork() 1853 * and the cgroup is pinned to this child because cgroup_fork()
1854 * is run before sched_fork(). 1854 * is run before sched_fork().
1855 * 1855 *
1856 * Silence PROVE_RCU. 1856 * Silence PROVE_RCU.
1857 */ 1857 */
1858 raw_spin_lock_irqsave(&p->pi_lock, flags); 1858 raw_spin_lock_irqsave(&p->pi_lock, flags);
1859 set_task_cpu(p, cpu); 1859 set_task_cpu(p, cpu);
1860 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 1860 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1861 1861
1862 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1862 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1863 if (likely(sched_info_on())) 1863 if (likely(sched_info_on()))
1864 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1864 memset(&p->sched_info, 0, sizeof(p->sched_info));
1865 #endif 1865 #endif
1866 #if defined(CONFIG_SMP) 1866 #if defined(CONFIG_SMP)
1867 p->on_cpu = 0; 1867 p->on_cpu = 0;
1868 #endif 1868 #endif
1869 init_task_preempt_count(p); 1869 init_task_preempt_count(p);
1870 #ifdef CONFIG_SMP 1870 #ifdef CONFIG_SMP
1871 plist_node_init(&p->pushable_tasks, MAX_PRIO); 1871 plist_node_init(&p->pushable_tasks, MAX_PRIO);
1872 RB_CLEAR_NODE(&p->pushable_dl_tasks); 1872 RB_CLEAR_NODE(&p->pushable_dl_tasks);
1873 #endif 1873 #endif
1874 1874
1875 put_cpu(); 1875 put_cpu();
1876 return 0; 1876 return 0;
1877 } 1877 }
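The sched_reset_on_fork flag consulted above is set from user space by OR-ing SCHED_RESET_ON_FORK into the policy argument of sched_setscheduler(), as in the sketch below. The fallback #define mirrors the UAPI value in case the libc headers don't expose it, and the call needs the appropriate privileges.

#include <sched.h>
#include <stdio.h>

#ifndef SCHED_RESET_ON_FORK
#define SCHED_RESET_ON_FORK     0x40000000      /* UAPI value, fallback define */
#endif

/*
 * Run the caller as SCHED_FIFO but ask the kernel to reset its children to
 * the default policy in sched_fork() (the p->sched_reset_on_fork handling
 * above).  Needs the appropriate privileges.
 */
int main(void)
{
        struct sched_param sp = { .sched_priority = 10 };

        if (sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &sp)) {
                perror("sched_setscheduler");
                return 1;
        }
        puts("SCHED_FIFO with reset-on-fork set");
        return 0;
}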
1878 1878
1879 unsigned long to_ratio(u64 period, u64 runtime) 1879 unsigned long to_ratio(u64 period, u64 runtime)
1880 { 1880 {
1881 if (runtime == RUNTIME_INF) 1881 if (runtime == RUNTIME_INF)
1882 return 1ULL << 20; 1882 return 1ULL << 20;
1883 1883
1884 /* 1884 /*
1885 * Doing this here saves a lot of checks in all 1885 * Doing this here saves a lot of checks in all
1886 * the calling paths, and returning zero seems 1886 * the calling paths, and returning zero seems
1887 * safe for them anyway. 1887 * safe for them anyway.
1888 */ 1888 */
1889 if (period == 0) 1889 if (period == 0)
1890 return 0; 1890 return 0;
1891 1891
1892 return div64_u64(runtime << 20, period); 1892 return div64_u64(runtime << 20, period);
1893 } 1893 }
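to_ratio() expresses runtime/period as a fraction scaled by 2^20. A quick numeric check of that fixed-point math (the RUNTIME_INF branch is dropped for brevity):

#include <stdint.h>
#include <stdio.h>

/*
 * Same fixed-point conversion as to_ratio() above, minus the RUNTIME_INF
 * branch: the result is runtime/period scaled by 2^20.
 */
static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
        if (period == 0)
                return 0;
        return (runtime << 20) / period;
}

int main(void)
{
        /* 10 ms of runtime every 100 ms -> about 10% of one CPU */
        uint64_t r = to_ratio(100000000ULL, 10000000ULL);

        printf("ratio = %llu out of %u\n", (unsigned long long)r, 1U << 20);
        return 0;
}

For these parameters the result is 104857, i.e. just under 10% of 2^20 (1048576).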
1894 1894
1895 #ifdef CONFIG_SMP 1895 #ifdef CONFIG_SMP
1896 inline struct dl_bw *dl_bw_of(int i) 1896 inline struct dl_bw *dl_bw_of(int i)
1897 { 1897 {
1898 return &cpu_rq(i)->rd->dl_bw; 1898 return &cpu_rq(i)->rd->dl_bw;
1899 } 1899 }
1900 1900
1901 static inline int dl_bw_cpus(int i) 1901 static inline int dl_bw_cpus(int i)
1902 { 1902 {
1903 struct root_domain *rd = cpu_rq(i)->rd; 1903 struct root_domain *rd = cpu_rq(i)->rd;
1904 int cpus = 0; 1904 int cpus = 0;
1905 1905
1906 for_each_cpu_and(i, rd->span, cpu_active_mask) 1906 for_each_cpu_and(i, rd->span, cpu_active_mask)
1907 cpus++; 1907 cpus++;
1908 1908
1909 return cpus; 1909 return cpus;
1910 } 1910 }
1911 #else 1911 #else
1912 inline struct dl_bw *dl_bw_of(int i) 1912 inline struct dl_bw *dl_bw_of(int i)
1913 { 1913 {
1914 return &cpu_rq(i)->dl.dl_bw; 1914 return &cpu_rq(i)->dl.dl_bw;
1915 } 1915 }
1916 1916
1917 static inline int dl_bw_cpus(int i) 1917 static inline int dl_bw_cpus(int i)
1918 { 1918 {
1919 return 1; 1919 return 1;
1920 } 1920 }
1921 #endif 1921 #endif
1922 1922
1923 static inline 1923 static inline
1924 void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) 1924 void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
1925 { 1925 {
1926 dl_b->total_bw -= tsk_bw; 1926 dl_b->total_bw -= tsk_bw;
1927 } 1927 }
1928 1928
1929 static inline 1929 static inline
1930 void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) 1930 void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
1931 { 1931 {
1932 dl_b->total_bw += tsk_bw; 1932 dl_b->total_bw += tsk_bw;
1933 } 1933 }
1934 1934
1935 static inline 1935 static inline
1936 bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) 1936 bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
1937 { 1937 {
1938 return dl_b->bw != -1 && 1938 return dl_b->bw != -1 &&
1939 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; 1939 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
1940 } 1940 }
1941 1941
1942 /* 1942 /*
1943 * We must be sure that accepting a new task (or allowing changing the 1943 * We must be sure that accepting a new task (or allowing changing the
1944 * parameters of an existing one) is consistent with the bandwidth 1944 * parameters of an existing one) is consistent with the bandwidth
1945 * constraints. If so, this function also updates the currently allocated 1945 * constraints. If so, this function also updates the currently allocated
1946 * bandwidth to reflect the new situation. 1946 * bandwidth to reflect the new situation.
1947 * 1947 *
1948 * This function is called while holding p's rq->lock. 1948 * This function is called while holding p's rq->lock.
1949 */ 1949 */
1950 static int dl_overflow(struct task_struct *p, int policy, 1950 static int dl_overflow(struct task_struct *p, int policy,
1951 const struct sched_attr *attr) 1951 const struct sched_attr *attr)
1952 { 1952 {
1953 1953
1954 struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); 1954 struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
1955 u64 period = attr->sched_period ?: attr->sched_deadline; 1955 u64 period = attr->sched_period ?: attr->sched_deadline;
1956 u64 runtime = attr->sched_runtime; 1956 u64 runtime = attr->sched_runtime;
1957 u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; 1957 u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
1958 int cpus, err = -1; 1958 int cpus, err = -1;
1959 1959
1960 if (new_bw == p->dl.dl_bw) 1960 if (new_bw == p->dl.dl_bw)
1961 return 0; 1961 return 0;
1962 1962
1963 /* 1963 /*
1964 * Whether a task enters, leaves, or stays -deadline but changes 1964 * Whether a task enters, leaves, or stays -deadline but changes
1965 * its parameters, we may need to update the total allocated 1965 * its parameters, we may need to update the total allocated
1966 * bandwidth of the container accordingly. 1966 * bandwidth of the container accordingly.
1967 */ 1967 */
1968 raw_spin_lock(&dl_b->lock); 1968 raw_spin_lock(&dl_b->lock);
1969 cpus = dl_bw_cpus(task_cpu(p)); 1969 cpus = dl_bw_cpus(task_cpu(p));
1970 if (dl_policy(policy) && !task_has_dl_policy(p) && 1970 if (dl_policy(policy) && !task_has_dl_policy(p) &&
1971 !__dl_overflow(dl_b, cpus, 0, new_bw)) { 1971 !__dl_overflow(dl_b, cpus, 0, new_bw)) {
1972 __dl_add(dl_b, new_bw); 1972 __dl_add(dl_b, new_bw);
1973 err = 0; 1973 err = 0;
1974 } else if (dl_policy(policy) && task_has_dl_policy(p) && 1974 } else if (dl_policy(policy) && task_has_dl_policy(p) &&
1975 !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { 1975 !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
1976 __dl_clear(dl_b, p->dl.dl_bw); 1976 __dl_clear(dl_b, p->dl.dl_bw);
1977 __dl_add(dl_b, new_bw); 1977 __dl_add(dl_b, new_bw);
1978 err = 0; 1978 err = 0;
1979 } else if (!dl_policy(policy) && task_has_dl_policy(p)) { 1979 } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
1980 __dl_clear(dl_b, p->dl.dl_bw); 1980 __dl_clear(dl_b, p->dl.dl_bw);
1981 err = 0; 1981 err = 0;
1982 } 1982 }
1983 raw_spin_unlock(&dl_b->lock); 1983 raw_spin_unlock(&dl_b->lock);
1984 1984
1985 return err; 1985 return err;
1986 } 1986 }
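The bandwidth admitted or rejected here originates from a sched_setattr() call in user space. A hedged sketch of requesting a 10 ms / 100 ms SCHED_DEADLINE reservation follows; sched_setattr() has no glibc wrapper on most systems, so the raw syscall is used, and the local struct mirrors the first 48 bytes of the sched_attr UAPI layout. An over-committed request is refused (the usual error is EBUSY).

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE  6               /* UAPI value, fallback define */
#endif

/*
 * Request 10 ms of runtime every 100 ms.  The local struct mirrors the
 * first 48 bytes of the sched_attr UAPI layout; the reservation is admitted
 * or refused by the dl_overflow() path above.
 */
struct dl_attr {
        uint32_t size;
        uint32_t sched_policy;
        uint64_t sched_flags;
        int32_t  sched_nice;
        uint32_t sched_priority;
        uint64_t sched_runtime;
        uint64_t sched_deadline;
        uint64_t sched_period;
};

int main(void)
{
        struct dl_attr attr = {
                .size           = sizeof(attr),
                .sched_policy   = SCHED_DEADLINE,
                .sched_runtime  =  10 * 1000 * 1000,    /* ns */
                .sched_deadline = 100 * 1000 * 1000,
                .sched_period   = 100 * 1000 * 1000,
        };

        if (syscall(SYS_sched_setattr, 0, &attr, 0)) {
                perror("sched_setattr");
                return 1;
        }
        puts("SCHED_DEADLINE reservation admitted");
        return 0;
}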
1987 1987
1988 extern void init_dl_bw(struct dl_bw *dl_b); 1988 extern void init_dl_bw(struct dl_bw *dl_b);
1989 1989
1990 /* 1990 /*
1991 * wake_up_new_task - wake up a newly created task for the first time. 1991 * wake_up_new_task - wake up a newly created task for the first time.
1992 * 1992 *
1993 * This function will do some initial scheduler statistics housekeeping 1993 * This function will do some initial scheduler statistics housekeeping
1994 * that must be done for every newly created context, then puts the task 1994 * that must be done for every newly created context, then puts the task
1995 * on the runqueue and wakes it. 1995 * on the runqueue and wakes it.
1996 */ 1996 */
1997 void wake_up_new_task(struct task_struct *p) 1997 void wake_up_new_task(struct task_struct *p)
1998 { 1998 {
1999 unsigned long flags; 1999 unsigned long flags;
2000 struct rq *rq; 2000 struct rq *rq;
2001 2001
2002 raw_spin_lock_irqsave(&p->pi_lock, flags); 2002 raw_spin_lock_irqsave(&p->pi_lock, flags);
2003 #ifdef CONFIG_SMP 2003 #ifdef CONFIG_SMP
2004 /* 2004 /*
2005 * Fork balancing, do it here and not earlier because: 2005 * Fork balancing, do it here and not earlier because:
2006 * - cpus_allowed can change in the fork path 2006 * - cpus_allowed can change in the fork path
2007 * - any previously selected cpu might disappear through hotplug 2007 * - any previously selected cpu might disappear through hotplug
2008 */ 2008 */
2009 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); 2009 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
2010 #endif 2010 #endif
2011 2011
2012 /* Initialize new task's runnable average */ 2012 /* Initialize new task's runnable average */
2013 init_task_runnable_average(p); 2013 init_task_runnable_average(p);
2014 rq = __task_rq_lock(p); 2014 rq = __task_rq_lock(p);
2015 activate_task(rq, p, 0); 2015 activate_task(rq, p, 0);
2016 p->on_rq = 1; 2016 p->on_rq = 1;
2017 trace_sched_wakeup_new(p, true); 2017 trace_sched_wakeup_new(p, true);
2018 check_preempt_curr(rq, p, WF_FORK); 2018 check_preempt_curr(rq, p, WF_FORK);
2019 #ifdef CONFIG_SMP 2019 #ifdef CONFIG_SMP
2020 if (p->sched_class->task_woken) 2020 if (p->sched_class->task_woken)
2021 p->sched_class->task_woken(rq, p); 2021 p->sched_class->task_woken(rq, p);
2022 #endif 2022 #endif
2023 task_rq_unlock(rq, p, &flags); 2023 task_rq_unlock(rq, p, &flags);
2024 } 2024 }
2025 2025
2026 #ifdef CONFIG_PREEMPT_NOTIFIERS 2026 #ifdef CONFIG_PREEMPT_NOTIFIERS
2027 2027
2028 /** 2028 /**
2029 * preempt_notifier_register - tell me when current is being preempted & rescheduled 2029 * preempt_notifier_register - tell me when current is being preempted & rescheduled
2030 * @notifier: notifier struct to register 2030 * @notifier: notifier struct to register
2031 */ 2031 */
2032 void preempt_notifier_register(struct preempt_notifier *notifier) 2032 void preempt_notifier_register(struct preempt_notifier *notifier)
2033 { 2033 {
2034 hlist_add_head(&notifier->link, &current->preempt_notifiers); 2034 hlist_add_head(&notifier->link, &current->preempt_notifiers);
2035 } 2035 }
2036 EXPORT_SYMBOL_GPL(preempt_notifier_register); 2036 EXPORT_SYMBOL_GPL(preempt_notifier_register);
2037 2037
2038 /** 2038 /**
2039 * preempt_notifier_unregister - no longer interested in preemption notifications 2039 * preempt_notifier_unregister - no longer interested in preemption notifications
2040 * @notifier: notifier struct to unregister 2040 * @notifier: notifier struct to unregister
2041 * 2041 *
2042 * This is safe to call from within a preemption notifier. 2042 * This is safe to call from within a preemption notifier.
2043 */ 2043 */
2044 void preempt_notifier_unregister(struct preempt_notifier *notifier) 2044 void preempt_notifier_unregister(struct preempt_notifier *notifier)
2045 { 2045 {
2046 hlist_del(&notifier->link); 2046 hlist_del(&notifier->link);
2047 } 2047 }
2048 EXPORT_SYMBOL_GPL(preempt_notifier_unregister); 2048 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2049 2049
2050 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2050 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2051 { 2051 {
2052 struct preempt_notifier *notifier; 2052 struct preempt_notifier *notifier;
2053 2053
2054 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) 2054 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
2055 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 2055 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2056 } 2056 }
2057 2057
2058 static void 2058 static void
2059 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2059 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2060 struct task_struct *next) 2060 struct task_struct *next)
2061 { 2061 {
2062 struct preempt_notifier *notifier; 2062 struct preempt_notifier *notifier;
2063 2063
2064 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) 2064 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
2065 notifier->ops->sched_out(notifier, next); 2065 notifier->ops->sched_out(notifier, next);
2066 } 2066 }
2067 2067
2068 #else /* !CONFIG_PREEMPT_NOTIFIERS */ 2068 #else /* !CONFIG_PREEMPT_NOTIFIERS */
2069 2069
2070 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2070 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2071 { 2071 {
2072 } 2072 }
2073 2073
2074 static void 2074 static void
2075 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2075 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2076 struct task_struct *next) 2076 struct task_struct *next)
2077 { 2077 {
2078 } 2078 }
2079 2079
2080 #endif /* CONFIG_PREEMPT_NOTIFIERS */ 2080 #endif /* CONFIG_PREEMPT_NOTIFIERS */
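For reference, an in-kernel user (KVM is the classic one) hooks these notifiers roughly as in the sketch below; it is kernel-context code, not standalone, and the my_* names are placeholders.

/*
 * Sketch of an in-kernel user of the preempt notifier API above (kernel
 * context, not standalone; the my_* names are placeholders).
 */
static void my_sched_in(struct preempt_notifier *pn, int cpu)
{
        /* the registering task is running again, on @cpu */
}

static void my_sched_out(struct preempt_notifier *pn,
                         struct task_struct *next)
{
        /* the registering task is about to be preempted in favour of @next */
}

static struct preempt_ops my_preempt_ops = {
        .sched_in       = my_sched_in,
        .sched_out      = my_sched_out,
};

static struct preempt_notifier my_notifier;

static void my_attach_to_current(void)
{
        preempt_notifier_init(&my_notifier, &my_preempt_ops);
        preempt_notifier_register(&my_notifier);        /* hooks current */
}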
2081 2081
2082 /** 2082 /**
2083 * prepare_task_switch - prepare to switch tasks 2083 * prepare_task_switch - prepare to switch tasks
2084 * @rq: the runqueue preparing to switch 2084 * @rq: the runqueue preparing to switch
2085 * @prev: the current task that is being switched out 2085 * @prev: the current task that is being switched out
2086 * @next: the task we are going to switch to. 2086 * @next: the task we are going to switch to.
2087 * 2087 *
2088 * This is called with the rq lock held and interrupts off. It must 2088 * This is called with the rq lock held and interrupts off. It must
2089 * be paired with a subsequent finish_task_switch after the context 2089 * be paired with a subsequent finish_task_switch after the context
2090 * switch. 2090 * switch.
2091 * 2091 *
2092 * prepare_task_switch sets up locking and calls architecture specific 2092 * prepare_task_switch sets up locking and calls architecture specific
2093 * hooks. 2093 * hooks.
2094 */ 2094 */
2095 static inline void 2095 static inline void
2096 prepare_task_switch(struct rq *rq, struct task_struct *prev, 2096 prepare_task_switch(struct rq *rq, struct task_struct *prev,
2097 struct task_struct *next) 2097 struct task_struct *next)
2098 { 2098 {
2099 trace_sched_switch(prev, next); 2099 trace_sched_switch(prev, next);
2100 sched_info_switch(rq, prev, next); 2100 sched_info_switch(rq, prev, next);
2101 perf_event_task_sched_out(prev, next); 2101 perf_event_task_sched_out(prev, next);
2102 fire_sched_out_preempt_notifiers(prev, next); 2102 fire_sched_out_preempt_notifiers(prev, next);
2103 prepare_lock_switch(rq, next); 2103 prepare_lock_switch(rq, next);
2104 prepare_arch_switch(next); 2104 prepare_arch_switch(next);
2105 } 2105 }
2106 2106
2107 /** 2107 /**
2108 * finish_task_switch - clean up after a task-switch 2108 * finish_task_switch - clean up after a task-switch
2109 * @rq: runqueue associated with task-switch 2109 * @rq: runqueue associated with task-switch
2110 * @prev: the thread we just switched away from. 2110 * @prev: the thread we just switched away from.
2111 * 2111 *
2112 * finish_task_switch must be called after the context switch, paired 2112 * finish_task_switch must be called after the context switch, paired
2113 * with a prepare_task_switch call before the context switch. 2113 * with a prepare_task_switch call before the context switch.
2114 * finish_task_switch will reconcile locking set up by prepare_task_switch, 2114 * finish_task_switch will reconcile locking set up by prepare_task_switch,
2115 * and do any other architecture-specific cleanup actions. 2115 * and do any other architecture-specific cleanup actions.
2116 * 2116 *
2117 * Note that we may have delayed dropping an mm in context_switch(). If 2117 * Note that we may have delayed dropping an mm in context_switch(). If
2118 * so, we finish that here outside of the runqueue lock. (Doing it 2118 * so, we finish that here outside of the runqueue lock. (Doing it
2119 * with the lock held can cause deadlocks; see schedule() for 2119 * with the lock held can cause deadlocks; see schedule() for
2120 * details.) 2120 * details.)
2121 */ 2121 */
2122 static void finish_task_switch(struct rq *rq, struct task_struct *prev) 2122 static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2123 __releases(rq->lock) 2123 __releases(rq->lock)
2124 { 2124 {
2125 struct mm_struct *mm = rq->prev_mm; 2125 struct mm_struct *mm = rq->prev_mm;
2126 long prev_state; 2126 long prev_state;
2127 2127
2128 rq->prev_mm = NULL; 2128 rq->prev_mm = NULL;
2129 2129
2130 /* 2130 /*
2131 * A task struct has one reference for the use as "current". 2131 * A task struct has one reference for the use as "current".
2132 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 2132 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
2133 * schedule one last time. The schedule call will never return, and 2133 * schedule one last time. The schedule call will never return, and
2134 * the scheduled task must drop that reference. 2134 * the scheduled task must drop that reference.
2135 * The test for TASK_DEAD must occur while the runqueue locks are 2135 * The test for TASK_DEAD must occur while the runqueue locks are
2136 * still held, otherwise prev could be scheduled on another cpu, die 2136 * still held, otherwise prev could be scheduled on another cpu, die
2137 * there before we look at prev->state, and then the reference would 2137 * there before we look at prev->state, and then the reference would
2138 * be dropped twice. 2138 * be dropped twice.
2139 * Manfred Spraul <manfred@colorfullife.com> 2139 * Manfred Spraul <manfred@colorfullife.com>
2140 */ 2140 */
2141 prev_state = prev->state; 2141 prev_state = prev->state;
2142 vtime_task_switch(prev); 2142 vtime_task_switch(prev);
2143 finish_arch_switch(prev); 2143 finish_arch_switch(prev);
2144 perf_event_task_sched_in(prev, current); 2144 perf_event_task_sched_in(prev, current);
2145 finish_lock_switch(rq, prev); 2145 finish_lock_switch(rq, prev);
2146 finish_arch_post_lock_switch(); 2146 finish_arch_post_lock_switch();
2147 2147
2148 fire_sched_in_preempt_notifiers(current); 2148 fire_sched_in_preempt_notifiers(current);
2149 if (mm) 2149 if (mm)
2150 mmdrop(mm); 2150 mmdrop(mm);
2151 if (unlikely(prev_state == TASK_DEAD)) { 2151 if (unlikely(prev_state == TASK_DEAD)) {
2152 if (prev->sched_class->task_dead) 2152 if (prev->sched_class->task_dead)
2153 prev->sched_class->task_dead(prev); 2153 prev->sched_class->task_dead(prev);
2154 2154
2155 /* 2155 /*
2156 * Remove function-return probe instances associated with this 2156 * Remove function-return probe instances associated with this
2157 * task and put them back on the free list. 2157 * task and put them back on the free list.
2158 */ 2158 */
2159 kprobe_flush_task(prev); 2159 kprobe_flush_task(prev);
2160 put_task_struct(prev); 2160 put_task_struct(prev);
2161 } 2161 }
2162 2162
2163 tick_nohz_task_switch(current); 2163 tick_nohz_task_switch(current);
2164 } 2164 }
2165 2165
2166 #ifdef CONFIG_SMP 2166 #ifdef CONFIG_SMP
2167 2167
2168 /* rq->lock is NOT held, but preemption is disabled */ 2168 /* rq->lock is NOT held, but preemption is disabled */
2169 static inline void post_schedule(struct rq *rq) 2169 static inline void post_schedule(struct rq *rq)
2170 { 2170 {
2171 if (rq->post_schedule) { 2171 if (rq->post_schedule) {
2172 unsigned long flags; 2172 unsigned long flags;
2173 2173
2174 raw_spin_lock_irqsave(&rq->lock, flags); 2174 raw_spin_lock_irqsave(&rq->lock, flags);
2175 if (rq->curr->sched_class->post_schedule) 2175 if (rq->curr->sched_class->post_schedule)
2176 rq->curr->sched_class->post_schedule(rq); 2176 rq->curr->sched_class->post_schedule(rq);
2177 raw_spin_unlock_irqrestore(&rq->lock, flags); 2177 raw_spin_unlock_irqrestore(&rq->lock, flags);
2178 2178
2179 rq->post_schedule = 0; 2179 rq->post_schedule = 0;
2180 } 2180 }
2181 } 2181 }
2182 2182
2183 #else 2183 #else
2184 2184
2185 static inline void post_schedule(struct rq *rq) 2185 static inline void post_schedule(struct rq *rq)
2186 { 2186 {
2187 } 2187 }
2188 2188
2189 #endif 2189 #endif
2190 2190
2191 /** 2191 /**
2192 * schedule_tail - first thing a freshly forked thread must call. 2192 * schedule_tail - first thing a freshly forked thread must call.
2193 * @prev: the thread we just switched away from. 2193 * @prev: the thread we just switched away from.
2194 */ 2194 */
2195 asmlinkage __visible void schedule_tail(struct task_struct *prev) 2195 asmlinkage __visible void schedule_tail(struct task_struct *prev)
2196 __releases(rq->lock) 2196 __releases(rq->lock)
2197 { 2197 {
2198 struct rq *rq = this_rq(); 2198 struct rq *rq = this_rq();
2199 2199
2200 finish_task_switch(rq, prev); 2200 finish_task_switch(rq, prev);
2201 2201
2202 /* 2202 /*
2203 * FIXME: do we need to worry about rq being invalidated by the 2203 * FIXME: do we need to worry about rq being invalidated by the
2204 * task_switch? 2204 * task_switch?
2205 */ 2205 */
2206 post_schedule(rq); 2206 post_schedule(rq);
2207 2207
2208 #ifdef __ARCH_WANT_UNLOCKED_CTXSW 2208 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
2209 /* In this case, finish_task_switch does not reenable preemption */ 2209 /* In this case, finish_task_switch does not reenable preemption */
2210 preempt_enable(); 2210 preempt_enable();
2211 #endif 2211 #endif
2212 if (current->set_child_tid) 2212 if (current->set_child_tid)
2213 put_user(task_pid_vnr(current), current->set_child_tid); 2213 put_user(task_pid_vnr(current), current->set_child_tid);
2214 } 2214 }
2215 2215
2216 /* 2216 /*
2217 * context_switch - switch to the new MM and the new 2217 * context_switch - switch to the new MM and the new
2218 * thread's register state. 2218 * thread's register state.
2219 */ 2219 */
2220 static inline void 2220 static inline void
2221 context_switch(struct rq *rq, struct task_struct *prev, 2221 context_switch(struct rq *rq, struct task_struct *prev,
2222 struct task_struct *next) 2222 struct task_struct *next)
2223 { 2223 {
2224 struct mm_struct *mm, *oldmm; 2224 struct mm_struct *mm, *oldmm;
2225 2225
2226 prepare_task_switch(rq, prev, next); 2226 prepare_task_switch(rq, prev, next);
2227 2227
2228 mm = next->mm; 2228 mm = next->mm;
2229 oldmm = prev->active_mm; 2229 oldmm = prev->active_mm;
2230 /* 2230 /*
2231 * For paravirt, this is coupled with an exit in switch_to to 2231 * For paravirt, this is coupled with an exit in switch_to to
2232 * combine the page table reload and the switch backend into 2232 * combine the page table reload and the switch backend into
2233 * one hypercall. 2233 * one hypercall.
2234 */ 2234 */
2235 arch_start_context_switch(prev); 2235 arch_start_context_switch(prev);
2236 2236
2237 if (!mm) { 2237 if (!mm) {
2238 next->active_mm = oldmm; 2238 next->active_mm = oldmm;
2239 atomic_inc(&oldmm->mm_count); 2239 atomic_inc(&oldmm->mm_count);
2240 enter_lazy_tlb(oldmm, next); 2240 enter_lazy_tlb(oldmm, next);
2241 } else 2241 } else
2242 switch_mm(oldmm, mm, next); 2242 switch_mm(oldmm, mm, next);
2243 2243
2244 if (!prev->mm) { 2244 if (!prev->mm) {
2245 prev->active_mm = NULL; 2245 prev->active_mm = NULL;
2246 rq->prev_mm = oldmm; 2246 rq->prev_mm = oldmm;
2247 } 2247 }
2248 /* 2248 /*
2249 * The runqueue lock will be released by the next 2249 * The runqueue lock will be released by the next
2250 * task (which is an invalid locking op but in the case 2250 * task (which is an invalid locking op but in the case
2251 * of the scheduler it's an obvious special-case), so we 2251 * of the scheduler it's an obvious special-case), so we
2252 * do an early lockdep release here: 2252 * do an early lockdep release here:
2253 */ 2253 */
2254 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 2254 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
2255 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 2255 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2256 #endif 2256 #endif
2257 2257
2258 context_tracking_task_switch(prev, next); 2258 context_tracking_task_switch(prev, next);
2259 /* Here we just switch the register state and the stack. */ 2259 /* Here we just switch the register state and the stack. */
2260 switch_to(prev, next, prev); 2260 switch_to(prev, next, prev);
2261 2261
2262 barrier(); 2262 barrier();
2263 /* 2263 /*
2264 * this_rq must be evaluated again because prev may have moved 2264 * this_rq must be evaluated again because prev may have moved
2265 * CPUs since it called schedule(), thus the 'rq' on its stack 2265 * CPUs since it called schedule(), thus the 'rq' on its stack
2266 * frame will be invalid. 2266 * frame will be invalid.
2267 */ 2267 */
2268 finish_task_switch(this_rq(), prev); 2268 finish_task_switch(this_rq(), prev);
2269 } 2269 }
2270 2270
2271 /* 2271 /*
2272 * nr_running and nr_context_switches: 2272 * nr_running and nr_context_switches:
2273 * 2273 *
2274 * externally visible scheduler statistics: current number of runnable 2274 * externally visible scheduler statistics: current number of runnable
2275 * threads, total number of context switches performed since bootup. 2275 * threads, total number of context switches performed since bootup.
2276 */ 2276 */
2277 unsigned long nr_running(void) 2277 unsigned long nr_running(void)
2278 { 2278 {
2279 unsigned long i, sum = 0; 2279 unsigned long i, sum = 0;
2280 2280
2281 for_each_online_cpu(i) 2281 for_each_online_cpu(i)
2282 sum += cpu_rq(i)->nr_running; 2282 sum += cpu_rq(i)->nr_running;
2283 2283
2284 return sum; 2284 return sum;
2285 } 2285 }
2286 2286
2287 unsigned long long nr_context_switches(void) 2287 unsigned long long nr_context_switches(void)
2288 { 2288 {
2289 int i; 2289 int i;
2290 unsigned long long sum = 0; 2290 unsigned long long sum = 0;
2291 2291
2292 for_each_possible_cpu(i) 2292 for_each_possible_cpu(i)
2293 sum += cpu_rq(i)->nr_switches; 2293 sum += cpu_rq(i)->nr_switches;
2294 2294
2295 return sum; 2295 return sum;
2296 } 2296 }
2297 2297
2298 unsigned long nr_iowait(void) 2298 unsigned long nr_iowait(void)
2299 { 2299 {
2300 unsigned long i, sum = 0; 2300 unsigned long i, sum = 0;
2301 2301
2302 for_each_possible_cpu(i) 2302 for_each_possible_cpu(i)
2303 sum += atomic_read(&cpu_rq(i)->nr_iowait); 2303 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2304 2304
2305 return sum; 2305 return sum;
2306 } 2306 }
2307 2307
2308 unsigned long nr_iowait_cpu(int cpu) 2308 unsigned long nr_iowait_cpu(int cpu)
2309 { 2309 {
2310 struct rq *this = cpu_rq(cpu); 2310 struct rq *this = cpu_rq(cpu);
2311 return atomic_read(&this->nr_iowait); 2311 return atomic_read(&this->nr_iowait);
2312 } 2312 }
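These counters surface in procfs: the fourth field of /proc/loadavg, for example, reports the runnable count summed by nr_running() over the total number of scheduling entities (see proc(5)). A small reader:

#include <stdio.h>

/*
 * The fourth field of /proc/loadavg is "runnable/total": the runnable
 * count is the sum that nr_running() above computes (see proc(5)).
 */
int main(void)
{
        double l1, l5, l15;
        unsigned long runnable, total;
        FILE *f = fopen("/proc/loadavg", "r");

        if (!f) {
                perror("/proc/loadavg");
                return 1;
        }
        if (fscanf(f, "%lf %lf %lf %lu/%lu",
                   &l1, &l5, &l15, &runnable, &total) != 5) {
                fclose(f);
                return 1;
        }
        fclose(f);
        printf("runnable=%lu total=%lu load1=%.2f\n", runnable, total, l1);
        return 0;
}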
2313 2313
2314 #ifdef CONFIG_SMP 2314 #ifdef CONFIG_SMP
2315 2315
2316 /* 2316 /*
2317 * sched_exec - execve() is a valuable balancing opportunity, because at 2317 * sched_exec - execve() is a valuable balancing opportunity, because at
2318 * this point the task has the smallest effective memory and cache footprint. 2318 * this point the task has the smallest effective memory and cache footprint.
2319 */ 2319 */
2320 void sched_exec(void) 2320 void sched_exec(void)
2321 { 2321 {
2322 struct task_struct *p = current; 2322 struct task_struct *p = current;
2323 unsigned long flags; 2323 unsigned long flags;
2324 int dest_cpu; 2324 int dest_cpu;
2325 2325
2326 raw_spin_lock_irqsave(&p->pi_lock, flags); 2326 raw_spin_lock_irqsave(&p->pi_lock, flags);
2327 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 2327 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
2328 if (dest_cpu == smp_processor_id()) 2328 if (dest_cpu == smp_processor_id())
2329 goto unlock; 2329 goto unlock;
2330 2330
2331 if (likely(cpu_active(dest_cpu))) { 2331 if (likely(cpu_active(dest_cpu))) {
2332 struct migration_arg arg = { p, dest_cpu }; 2332 struct migration_arg arg = { p, dest_cpu };
2333 2333
2334 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2334 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2335 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); 2335 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
2336 return; 2336 return;
2337 } 2337 }
2338 unlock: 2338 unlock:
2339 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2339 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2340 } 2340 }
2341 2341
2342 #endif 2342 #endif
2343 2343
2344 DEFINE_PER_CPU(struct kernel_stat, kstat); 2344 DEFINE_PER_CPU(struct kernel_stat, kstat);
2345 DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); 2345 DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
2346 2346
2347 EXPORT_PER_CPU_SYMBOL(kstat); 2347 EXPORT_PER_CPU_SYMBOL(kstat);
2348 EXPORT_PER_CPU_SYMBOL(kernel_cpustat); 2348 EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
2349 2349
2350 /* 2350 /*
2351 * Return any ns on the sched_clock that have not yet been accounted in 2351 * Return any ns on the sched_clock that have not yet been accounted in
2352 * @p in case that task is currently running. 2352 * @p in case that task is currently running.
2353 * 2353 *
2354 * Called with task_rq_lock() held on @rq. 2354 * Called with task_rq_lock() held on @rq.
2355 */ 2355 */
2356 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) 2356 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2357 { 2357 {
2358 u64 ns = 0; 2358 u64 ns = 0;
2359 2359
2360 if (task_current(rq, p)) { 2360 if (task_current(rq, p)) {
2361 update_rq_clock(rq); 2361 update_rq_clock(rq);
2362 ns = rq_clock_task(rq) - p->se.exec_start; 2362 ns = rq_clock_task(rq) - p->se.exec_start;
2363 if ((s64)ns < 0) 2363 if ((s64)ns < 0)
2364 ns = 0; 2364 ns = 0;
2365 } 2365 }
2366 2366
2367 return ns; 2367 return ns;
2368 } 2368 }
2369 2369
2370 unsigned long long task_delta_exec(struct task_struct *p) 2370 unsigned long long task_delta_exec(struct task_struct *p)
2371 { 2371 {
2372 unsigned long flags; 2372 unsigned long flags;
2373 struct rq *rq; 2373 struct rq *rq;
2374 u64 ns = 0; 2374 u64 ns = 0;
2375 2375
2376 rq = task_rq_lock(p, &flags); 2376 rq = task_rq_lock(p, &flags);
2377 ns = do_task_delta_exec(p, rq); 2377 ns = do_task_delta_exec(p, rq);
2378 task_rq_unlock(rq, p, &flags); 2378 task_rq_unlock(rq, p, &flags);
2379 2379
2380 return ns; 2380 return ns;
2381 } 2381 }
2382 2382
2383 /* 2383 /*
2384 * Return accounted runtime for the task. 2384 * Return accounted runtime for the task.
2385 * In case the task is currently running, return the runtime plus current's 2385 * In case the task is currently running, return the runtime plus current's
2386 * pending runtime that has not been accounted yet. 2386 * pending runtime that has not been accounted yet.
2387 */ 2387 */
2388 unsigned long long task_sched_runtime(struct task_struct *p) 2388 unsigned long long task_sched_runtime(struct task_struct *p)
2389 { 2389 {
2390 unsigned long flags; 2390 unsigned long flags;
2391 struct rq *rq; 2391 struct rq *rq;
2392 u64 ns = 0; 2392 u64 ns = 0;
2393 2393
2394 #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) 2394 #if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
2395 /* 2395 /*
2396 * 64-bit doesn't need locks to atomically read a 64-bit value. 2396 * 64-bit doesn't need locks to atomically read a 64-bit value.
2397 * So we have an optimization chance when the task's delta_exec is 0. 2397 * So we have an optimization chance when the task's delta_exec is 0.
2398 * Reading ->on_cpu is racy, but this is ok. 2398 * Reading ->on_cpu is racy, but this is ok.
2399 * 2399 *
2400 * If we race with it leaving cpu, we'll take a lock. So we're correct. 2400 * If we race with it leaving cpu, we'll take a lock. So we're correct.
2401 * If we race with it entering cpu, unaccounted time is 0. This is 2401 * If we race with it entering cpu, unaccounted time is 0. This is
2402 * indistinguishable from the read occurring a few cycles earlier. 2402 * indistinguishable from the read occurring a few cycles earlier.
2403 */ 2403 */
2404 if (!p->on_cpu) 2404 if (!p->on_cpu)
2405 return p->se.sum_exec_runtime; 2405 return p->se.sum_exec_runtime;
2406 #endif 2406 #endif
2407 2407
2408 rq = task_rq_lock(p, &flags); 2408 rq = task_rq_lock(p, &flags);
2409 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 2409 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
2410 task_rq_unlock(rq, p, &flags); 2410 task_rq_unlock(rq, p, &flags);
2411 2411
2412 return ns; 2412 return ns;
2413 } 2413 }
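
One way to observe this accounting from userspace is CLOCK_THREAD_CPUTIME_ID, which ends up being served by task_sched_runtime() for the calling thread via the posix-cpu-timers code. A hedged sketch using only the standard clock_gettime() API:

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;
	volatile unsigned long sink = 0;
	unsigned long i;

	for (i = 0; i < 50000000UL; i++)	/* burn a little CPU */
		sink += i;

	/* per-thread on-CPU time, nanosecond resolution */
	if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts))
		return 1;

	printf("thread runtime: %ld.%09ld s (sink=%lu)\n",
	       (long)ts.tv_sec, ts.tv_nsec, (unsigned long)sink);
	return 0;
}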
2414 2414
2415 /* 2415 /*
2416 * This function gets called by the timer code, with HZ frequency. 2416 * This function gets called by the timer code, with HZ frequency.
2417 * We call it with interrupts disabled. 2417 * We call it with interrupts disabled.
2418 */ 2418 */
2419 void scheduler_tick(void) 2419 void scheduler_tick(void)
2420 { 2420 {
2421 int cpu = smp_processor_id(); 2421 int cpu = smp_processor_id();
2422 struct rq *rq = cpu_rq(cpu); 2422 struct rq *rq = cpu_rq(cpu);
2423 struct task_struct *curr = rq->curr; 2423 struct task_struct *curr = rq->curr;
2424 2424
2425 sched_clock_tick(); 2425 sched_clock_tick();
2426 2426
2427 raw_spin_lock(&rq->lock); 2427 raw_spin_lock(&rq->lock);
2428 update_rq_clock(rq); 2428 update_rq_clock(rq);
2429 curr->sched_class->task_tick(rq, curr, 0); 2429 curr->sched_class->task_tick(rq, curr, 0);
2430 update_cpu_load_active(rq); 2430 update_cpu_load_active(rq);
2431 raw_spin_unlock(&rq->lock); 2431 raw_spin_unlock(&rq->lock);
2432 2432
2433 perf_event_task_tick(); 2433 perf_event_task_tick();
2434 2434
2435 #ifdef CONFIG_SMP 2435 #ifdef CONFIG_SMP
2436 rq->idle_balance = idle_cpu(cpu); 2436 rq->idle_balance = idle_cpu(cpu);
2437 trigger_load_balance(rq); 2437 trigger_load_balance(rq);
2438 #endif 2438 #endif
2439 rq_last_tick_reset(rq); 2439 rq_last_tick_reset(rq);
2440 } 2440 }
2441 2441
2442 #ifdef CONFIG_NO_HZ_FULL 2442 #ifdef CONFIG_NO_HZ_FULL
2443 /** 2443 /**
2444 * scheduler_tick_max_deferment 2444 * scheduler_tick_max_deferment
2445 * 2445 *
2446 * Keep at least one tick per second when a single 2446 * Keep at least one tick per second when a single
2447 * active task is running because the scheduler doesn't 2447 * active task is running because the scheduler doesn't
2448 * yet completely support a full dynticks environment. 2448 * yet completely support a full dynticks environment.
2449 * 2449 *
2450 * This makes sure that uptime, CFS vruntime, load 2450 * This makes sure that uptime, CFS vruntime, load
2451 * balancing, etc... continue to move forward, even 2451 * balancing, etc... continue to move forward, even
2452 * with a very low granularity. 2452 * with a very low granularity.
2453 * 2453 *
2454 * Return: Maximum deferment in nanoseconds. 2454 * Return: Maximum deferment in nanoseconds.
2455 */ 2455 */
2456 u64 scheduler_tick_max_deferment(void) 2456 u64 scheduler_tick_max_deferment(void)
2457 { 2457 {
2458 struct rq *rq = this_rq(); 2458 struct rq *rq = this_rq();
2459 unsigned long next, now = ACCESS_ONCE(jiffies); 2459 unsigned long next, now = ACCESS_ONCE(jiffies);
2460 2460
2461 next = rq->last_sched_tick + HZ; 2461 next = rq->last_sched_tick + HZ;
2462 2462
2463 if (time_before_eq(next, now)) 2463 if (time_before_eq(next, now))
2464 return 0; 2464 return 0;
2465 2465
2466 return jiffies_to_nsecs(next - now); 2466 return jiffies_to_nsecs(next - now);
2467 } 2467 }
2468 #endif 2468 #endif
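
To make the arithmetic concrete, here is a standalone sketch of the same calculation; HZ, the jiffies values and the wrap-safe comparison are simplified assumptions here, not the kernel helpers:

#include <stdio.h>

#define HZ		1000			/* assumed tick rate */
#define NSEC_PER_JIFFY	(1000000000ULL / HZ)

/* mirrors scheduler_tick_max_deferment(): defer at most until last_tick + HZ */
static unsigned long long max_deferment(unsigned long last_tick, unsigned long now)
{
	unsigned long next = last_tick + HZ;

	if ((long)(next - now) <= 0)		/* next tick already due */
		return 0;

	return (unsigned long long)(next - now) * NSEC_PER_JIFFY;
}

int main(void)
{
	unsigned long now = 500000;

	/* last tick 400 jiffies ago -> the tick may sleep another 600 ms */
	printf("%llu ns\n", max_deferment(now - 400, now));
	/* last tick 1500 jiffies ago -> a tick is overdue, no deferment */
	printf("%llu ns\n", max_deferment(now - 1500, now));
	return 0;
}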
2469 2469
2470 notrace unsigned long get_parent_ip(unsigned long addr) 2470 notrace unsigned long get_parent_ip(unsigned long addr)
2471 { 2471 {
2472 if (in_lock_functions(addr)) { 2472 if (in_lock_functions(addr)) {
2473 addr = CALLER_ADDR2; 2473 addr = CALLER_ADDR2;
2474 if (in_lock_functions(addr)) 2474 if (in_lock_functions(addr))
2475 addr = CALLER_ADDR3; 2475 addr = CALLER_ADDR3;
2476 } 2476 }
2477 return addr; 2477 return addr;
2478 } 2478 }
2479 2479
2480 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 2480 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
2481 defined(CONFIG_PREEMPT_TRACER)) 2481 defined(CONFIG_PREEMPT_TRACER))
2482 2482
2483 void __kprobes preempt_count_add(int val) 2483 void __kprobes preempt_count_add(int val)
2484 { 2484 {
2485 #ifdef CONFIG_DEBUG_PREEMPT 2485 #ifdef CONFIG_DEBUG_PREEMPT
2486 /* 2486 /*
2487 * Underflow? 2487 * Underflow?
2488 */ 2488 */
2489 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 2489 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
2490 return; 2490 return;
2491 #endif 2491 #endif
2492 __preempt_count_add(val); 2492 __preempt_count_add(val);
2493 #ifdef CONFIG_DEBUG_PREEMPT 2493 #ifdef CONFIG_DEBUG_PREEMPT
2494 /* 2494 /*
2495 * Spinlock count overflowing soon? 2495 * Spinlock count overflowing soon?
2496 */ 2496 */
2497 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 2497 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
2498 PREEMPT_MASK - 10); 2498 PREEMPT_MASK - 10);
2499 #endif 2499 #endif
2500 if (preempt_count() == val) { 2500 if (preempt_count() == val) {
2501 unsigned long ip = get_parent_ip(CALLER_ADDR1); 2501 unsigned long ip = get_parent_ip(CALLER_ADDR1);
2502 #ifdef CONFIG_DEBUG_PREEMPT 2502 #ifdef CONFIG_DEBUG_PREEMPT
2503 current->preempt_disable_ip = ip; 2503 current->preempt_disable_ip = ip;
2504 #endif 2504 #endif
2505 trace_preempt_off(CALLER_ADDR0, ip); 2505 trace_preempt_off(CALLER_ADDR0, ip);
2506 } 2506 }
2507 } 2507 }
2508 EXPORT_SYMBOL(preempt_count_add); 2508 EXPORT_SYMBOL(preempt_count_add);
2509 2509
2510 void __kprobes preempt_count_sub(int val) 2510 void __kprobes preempt_count_sub(int val)
2511 { 2511 {
2512 #ifdef CONFIG_DEBUG_PREEMPT 2512 #ifdef CONFIG_DEBUG_PREEMPT
2513 /* 2513 /*
2514 * Underflow? 2514 * Underflow?
2515 */ 2515 */
2516 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 2516 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
2517 return; 2517 return;
2518 /* 2518 /*
2519 * Is the spinlock portion underflowing? 2519 * Is the spinlock portion underflowing?
2520 */ 2520 */
2521 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 2521 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
2522 !(preempt_count() & PREEMPT_MASK))) 2522 !(preempt_count() & PREEMPT_MASK)))
2523 return; 2523 return;
2524 #endif 2524 #endif
2525 2525
2526 if (preempt_count() == val) 2526 if (preempt_count() == val)
2527 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2527 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2528 __preempt_count_sub(val); 2528 __preempt_count_sub(val);
2529 } 2529 }
2530 EXPORT_SYMBOL(preempt_count_sub); 2530 EXPORT_SYMBOL(preempt_count_sub);
2531 2531
2532 #endif 2532 #endif
2533 2533
2534 /* 2534 /*
2535 * Print scheduling while atomic bug: 2535 * Print scheduling while atomic bug:
2536 */ 2536 */
2537 static noinline void __schedule_bug(struct task_struct *prev) 2537 static noinline void __schedule_bug(struct task_struct *prev)
2538 { 2538 {
2539 if (oops_in_progress) 2539 if (oops_in_progress)
2540 return; 2540 return;
2541 2541
2542 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 2542 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
2543 prev->comm, prev->pid, preempt_count()); 2543 prev->comm, prev->pid, preempt_count());
2544 2544
2545 debug_show_held_locks(prev); 2545 debug_show_held_locks(prev);
2546 print_modules(); 2546 print_modules();
2547 if (irqs_disabled()) 2547 if (irqs_disabled())
2548 print_irqtrace_events(prev); 2548 print_irqtrace_events(prev);
2549 #ifdef CONFIG_DEBUG_PREEMPT 2549 #ifdef CONFIG_DEBUG_PREEMPT
2550 if (in_atomic_preempt_off()) { 2550 if (in_atomic_preempt_off()) {
2551 pr_err("Preemption disabled at:"); 2551 pr_err("Preemption disabled at:");
2552 print_ip_sym(current->preempt_disable_ip); 2552 print_ip_sym(current->preempt_disable_ip);
2553 pr_cont("\n"); 2553 pr_cont("\n");
2554 } 2554 }
2555 #endif 2555 #endif
2556 dump_stack(); 2556 dump_stack();
2557 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 2557 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
2558 } 2558 }
2559 2559
2560 /* 2560 /*
2561 * Various schedule()-time debugging checks and statistics: 2561 * Various schedule()-time debugging checks and statistics:
2562 */ 2562 */
2563 static inline void schedule_debug(struct task_struct *prev) 2563 static inline void schedule_debug(struct task_struct *prev)
2564 { 2564 {
2565 /* 2565 /*
2566 * Test if we are atomic. Since do_exit() needs to call into 2566 * Test if we are atomic. Since do_exit() needs to call into
2567 * schedule() atomically, we ignore that path. Otherwise whine 2567 * schedule() atomically, we ignore that path. Otherwise whine
2568 * if we are scheduling when we should not. 2568 * if we are scheduling when we should not.
2569 */ 2569 */
2570 if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD)) 2570 if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
2571 __schedule_bug(prev); 2571 __schedule_bug(prev);
2572 rcu_sleep_check(); 2572 rcu_sleep_check();
2573 2573
2574 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 2574 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
2575 2575
2576 schedstat_inc(this_rq(), sched_count); 2576 schedstat_inc(this_rq(), sched_count);
2577 } 2577 }
2578 2578
2579 /* 2579 /*
2580 * Pick up the highest-prio task: 2580 * Pick up the highest-prio task:
2581 */ 2581 */
2582 static inline struct task_struct * 2582 static inline struct task_struct *
2583 pick_next_task(struct rq *rq, struct task_struct *prev) 2583 pick_next_task(struct rq *rq, struct task_struct *prev)
2584 { 2584 {
2585 const struct sched_class *class = &fair_sched_class; 2585 const struct sched_class *class = &fair_sched_class;
2586 struct task_struct *p; 2586 struct task_struct *p;
2587 2587
2588 /* 2588 /*
2589 * Optimization: we know that if all tasks are in 2589 * Optimization: we know that if all tasks are in
2590 * the fair class we can call that function directly: 2590 * the fair class we can call that function directly:
2591 */ 2591 */
2592 if (likely(prev->sched_class == class && 2592 if (likely(prev->sched_class == class &&
2593 rq->nr_running == rq->cfs.h_nr_running)) { 2593 rq->nr_running == rq->cfs.h_nr_running)) {
2594 p = fair_sched_class.pick_next_task(rq, prev); 2594 p = fair_sched_class.pick_next_task(rq, prev);
2595 if (unlikely(p == RETRY_TASK)) 2595 if (unlikely(p == RETRY_TASK))
2596 goto again; 2596 goto again;
2597 2597
2598 /* assumes fair_sched_class->next == idle_sched_class */ 2598 /* assumes fair_sched_class->next == idle_sched_class */
2599 if (unlikely(!p)) 2599 if (unlikely(!p))
2600 p = idle_sched_class.pick_next_task(rq, prev); 2600 p = idle_sched_class.pick_next_task(rq, prev);
2601 2601
2602 return p; 2602 return p;
2603 } 2603 }
2604 2604
2605 again: 2605 again:
2606 for_each_class(class) { 2606 for_each_class(class) {
2607 p = class->pick_next_task(rq, prev); 2607 p = class->pick_next_task(rq, prev);
2608 if (p) { 2608 if (p) {
2609 if (unlikely(p == RETRY_TASK)) 2609 if (unlikely(p == RETRY_TASK))
2610 goto again; 2610 goto again;
2611 return p; 2611 return p;
2612 } 2612 }
2613 } 2613 }
2614 2614
2615 BUG(); /* the idle class will always have a runnable task */ 2615 BUG(); /* the idle class will always have a runnable task */
2616 } 2616 }
2617 2617
2618 /* 2618 /*
2619 * __schedule() is the main scheduler function. 2619 * __schedule() is the main scheduler function.
2620 * 2620 *
2621 * The main means of driving the scheduler and thus entering this function are: 2621 * The main means of driving the scheduler and thus entering this function are:
2622 * 2622 *
2623 * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. 2623 * 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
2624 * 2624 *
2625 * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return 2625 * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
2626 * paths. For example, see arch/x86/entry_64.S. 2626 * paths. For example, see arch/x86/entry_64.S.
2627 * 2627 *
2628 * To drive preemption between tasks, the scheduler sets the flag in timer 2628 * To drive preemption between tasks, the scheduler sets the flag in timer
2629 * interrupt handler scheduler_tick(). 2629 * interrupt handler scheduler_tick().
2630 * 2630 *
2631 * 3. Wakeups don't really cause entry into schedule(). They add a 2631 * 3. Wakeups don't really cause entry into schedule(). They add a
2632 * task to the run-queue and that's it. 2632 * task to the run-queue and that's it.
2633 * 2633 *
2634 * Now, if the new task added to the run-queue preempts the current 2634 * Now, if the new task added to the run-queue preempts the current
2635 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets 2635 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
2636 * called on the nearest possible occasion: 2636 * called on the nearest possible occasion:
2637 * 2637 *
2638 * - If the kernel is preemptible (CONFIG_PREEMPT=y): 2638 * - If the kernel is preemptible (CONFIG_PREEMPT=y):
2639 * 2639 *
2640 * - in syscall or exception context, at the next outmost 2640 * - in syscall or exception context, at the next outmost
2641 * preempt_enable(). (this might be as soon as the wake_up()'s 2641 * preempt_enable(). (this might be as soon as the wake_up()'s
2642 * spin_unlock()!) 2642 * spin_unlock()!)
2643 * 2643 *
2644 * - in IRQ context, return from interrupt-handler to 2644 * - in IRQ context, return from interrupt-handler to
2645 * preemptible context 2645 * preemptible context
2646 * 2646 *
2647 * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) 2647 * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
2648 * then at the next: 2648 * then at the next:
2649 * 2649 *
2650 * - cond_resched() call 2650 * - cond_resched() call
2651 * - explicit schedule() call 2651 * - explicit schedule() call
2652 * - return from syscall or exception to user-space 2652 * - return from syscall or exception to user-space
2653 * - return from interrupt-handler to user-space 2653 * - return from interrupt-handler to user-space
2654 */ 2654 */
2655 static void __sched __schedule(void) 2655 static void __sched __schedule(void)
2656 { 2656 {
2657 struct task_struct *prev, *next; 2657 struct task_struct *prev, *next;
2658 unsigned long *switch_count; 2658 unsigned long *switch_count;
2659 struct rq *rq; 2659 struct rq *rq;
2660 int cpu; 2660 int cpu;
2661 2661
2662 need_resched: 2662 need_resched:
2663 preempt_disable(); 2663 preempt_disable();
2664 cpu = smp_processor_id(); 2664 cpu = smp_processor_id();
2665 rq = cpu_rq(cpu); 2665 rq = cpu_rq(cpu);
2666 rcu_note_context_switch(cpu); 2666 rcu_note_context_switch(cpu);
2667 prev = rq->curr; 2667 prev = rq->curr;
2668 2668
2669 schedule_debug(prev); 2669 schedule_debug(prev);
2670 2670
2671 if (sched_feat(HRTICK)) 2671 if (sched_feat(HRTICK))
2672 hrtick_clear(rq); 2672 hrtick_clear(rq);
2673 2673
2674 /* 2674 /*
2675 * Make sure that signal_pending_state()->signal_pending() below 2675 * Make sure that signal_pending_state()->signal_pending() below
2676 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) 2676 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
2677 * done by the caller to avoid the race with signal_wake_up(). 2677 * done by the caller to avoid the race with signal_wake_up().
2678 */ 2678 */
2679 smp_mb__before_spinlock(); 2679 smp_mb__before_spinlock();
2680 raw_spin_lock_irq(&rq->lock); 2680 raw_spin_lock_irq(&rq->lock);
2681 2681
2682 switch_count = &prev->nivcsw; 2682 switch_count = &prev->nivcsw;
2683 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 2683 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2684 if (unlikely(signal_pending_state(prev->state, prev))) { 2684 if (unlikely(signal_pending_state(prev->state, prev))) {
2685 prev->state = TASK_RUNNING; 2685 prev->state = TASK_RUNNING;
2686 } else { 2686 } else {
2687 deactivate_task(rq, prev, DEQUEUE_SLEEP); 2687 deactivate_task(rq, prev, DEQUEUE_SLEEP);
2688 prev->on_rq = 0; 2688 prev->on_rq = 0;
2689 2689
2690 /* 2690 /*
2691 * If a worker went to sleep, notify and ask workqueue 2691 * If a worker went to sleep, notify and ask workqueue
2692 * whether it wants to wake up a task to maintain 2692 * whether it wants to wake up a task to maintain
2693 * concurrency. 2693 * concurrency.
2694 */ 2694 */
2695 if (prev->flags & PF_WQ_WORKER) { 2695 if (prev->flags & PF_WQ_WORKER) {
2696 struct task_struct *to_wakeup; 2696 struct task_struct *to_wakeup;
2697 2697
2698 to_wakeup = wq_worker_sleeping(prev, cpu); 2698 to_wakeup = wq_worker_sleeping(prev, cpu);
2699 if (to_wakeup) 2699 if (to_wakeup)
2700 try_to_wake_up_local(to_wakeup); 2700 try_to_wake_up_local(to_wakeup);
2701 } 2701 }
2702 } 2702 }
2703 switch_count = &prev->nvcsw; 2703 switch_count = &prev->nvcsw;
2704 } 2704 }
2705 2705
2706 if (prev->on_rq || rq->skip_clock_update < 0) 2706 if (prev->on_rq || rq->skip_clock_update < 0)
2707 update_rq_clock(rq); 2707 update_rq_clock(rq);
2708 2708
2709 next = pick_next_task(rq, prev); 2709 next = pick_next_task(rq, prev);
2710 clear_tsk_need_resched(prev); 2710 clear_tsk_need_resched(prev);
2711 clear_preempt_need_resched(); 2711 clear_preempt_need_resched();
2712 rq->skip_clock_update = 0; 2712 rq->skip_clock_update = 0;
2713 2713
2714 if (likely(prev != next)) { 2714 if (likely(prev != next)) {
2715 rq->nr_switches++; 2715 rq->nr_switches++;
2716 rq->curr = next; 2716 rq->curr = next;
2717 ++*switch_count; 2717 ++*switch_count;
2718 2718
2719 context_switch(rq, prev, next); /* unlocks the rq */ 2719 context_switch(rq, prev, next); /* unlocks the rq */
2720 /* 2720 /*
2721 * The context switch has flipped the stack from under us 2721 * The context switch has flipped the stack from under us
2722 * and restored the local variables which were saved when 2722 * and restored the local variables which were saved when
2723 * this task called schedule() in the past. prev == current 2723 * this task called schedule() in the past. prev == current
2724 * is still correct, but it can be moved to another cpu/rq. 2724 * is still correct, but it can be moved to another cpu/rq.
2725 */ 2725 */
2726 cpu = smp_processor_id(); 2726 cpu = smp_processor_id();
2727 rq = cpu_rq(cpu); 2727 rq = cpu_rq(cpu);
2728 } else 2728 } else
2729 raw_spin_unlock_irq(&rq->lock); 2729 raw_spin_unlock_irq(&rq->lock);
2730 2730
2731 post_schedule(rq); 2731 post_schedule(rq);
2732 2732
2733 sched_preempt_enable_no_resched(); 2733 sched_preempt_enable_no_resched();
2734 if (need_resched()) 2734 if (need_resched())
2735 goto need_resched; 2735 goto need_resched;
2736 } 2736 }
2737 2737
2738 static inline void sched_submit_work(struct task_struct *tsk) 2738 static inline void sched_submit_work(struct task_struct *tsk)
2739 { 2739 {
2740 if (!tsk->state || tsk_is_pi_blocked(tsk)) 2740 if (!tsk->state || tsk_is_pi_blocked(tsk))
2741 return; 2741 return;
2742 /* 2742 /*
2743 * If we are going to sleep and we have plugged IO queued, 2743 * If we are going to sleep and we have plugged IO queued,
2744 * make sure to submit it to avoid deadlocks. 2744 * make sure to submit it to avoid deadlocks.
2745 */ 2745 */
2746 if (blk_needs_flush_plug(tsk)) 2746 if (blk_needs_flush_plug(tsk))
2747 blk_schedule_flush_plug(tsk); 2747 blk_schedule_flush_plug(tsk);
2748 } 2748 }
2749 2749
2750 asmlinkage __visible void __sched schedule(void) 2750 asmlinkage __visible void __sched schedule(void)
2751 { 2751 {
2752 struct task_struct *tsk = current; 2752 struct task_struct *tsk = current;
2753 2753
2754 sched_submit_work(tsk); 2754 sched_submit_work(tsk);
2755 __schedule(); 2755 __schedule();
2756 } 2756 }
2757 EXPORT_SYMBOL(schedule); 2757 EXPORT_SYMBOL(schedule);
2758 2758
2759 #ifdef CONFIG_CONTEXT_TRACKING 2759 #ifdef CONFIG_CONTEXT_TRACKING
2760 asmlinkage __visible void __sched schedule_user(void) 2760 asmlinkage __visible void __sched schedule_user(void)
2761 { 2761 {
2762 /* 2762 /*
2763 * If we come here after a random call to set_need_resched(), 2763 * If we come here after a random call to set_need_resched(),
2764 * or we have been woken up remotely but the IPI has not yet arrived, 2764 * or we have been woken up remotely but the IPI has not yet arrived,
2765 * we haven't yet exited the RCU idle mode. Do it here manually until 2765 * we haven't yet exited the RCU idle mode. Do it here manually until
2766 * we find a better solution. 2766 * we find a better solution.
2767 */ 2767 */
2768 user_exit(); 2768 user_exit();
2769 schedule(); 2769 schedule();
2770 user_enter(); 2770 user_enter();
2771 } 2771 }
2772 #endif 2772 #endif
2773 2773
2774 /** 2774 /**
2775 * schedule_preempt_disabled - called with preemption disabled 2775 * schedule_preempt_disabled - called with preemption disabled
2776 * 2776 *
2777 * Returns with preemption disabled. Note: preempt_count must be 1 2777 * Returns with preemption disabled. Note: preempt_count must be 1
2778 */ 2778 */
2779 void __sched schedule_preempt_disabled(void) 2779 void __sched schedule_preempt_disabled(void)
2780 { 2780 {
2781 sched_preempt_enable_no_resched(); 2781 sched_preempt_enable_no_resched();
2782 schedule(); 2782 schedule();
2783 preempt_disable(); 2783 preempt_disable();
2784 } 2784 }
2785 2785
2786 #ifdef CONFIG_PREEMPT 2786 #ifdef CONFIG_PREEMPT
2787 /* 2787 /*
2788 * this is the entry point to schedule() from in-kernel preemption 2788 * this is the entry point to schedule() from in-kernel preemption
2789 * off of preempt_enable. Kernel preemption off the return-from-interrupt 2789 * off of preempt_enable. Kernel preemption off the return-from-interrupt
2790 * path is handled separately by preempt_schedule_irq() below. 2790 * path is handled separately by preempt_schedule_irq() below.
2791 */ 2791 */
2792 asmlinkage __visible void __sched notrace preempt_schedule(void) 2792 asmlinkage __visible void __sched notrace preempt_schedule(void)
2793 { 2793 {
2794 /* 2794 /*
2795 * If there is a non-zero preempt_count or interrupts are disabled, 2795 * If there is a non-zero preempt_count or interrupts are disabled,
2796 * we do not want to preempt the current task. Just return.. 2796 * we do not want to preempt the current task. Just return..
2797 */ 2797 */
2798 if (likely(!preemptible())) 2798 if (likely(!preemptible()))
2799 return; 2799 return;
2800 2800
2801 do { 2801 do {
2802 __preempt_count_add(PREEMPT_ACTIVE); 2802 __preempt_count_add(PREEMPT_ACTIVE);
2803 __schedule(); 2803 __schedule();
2804 __preempt_count_sub(PREEMPT_ACTIVE); 2804 __preempt_count_sub(PREEMPT_ACTIVE);
2805 2805
2806 /* 2806 /*
2807 * Check again in case we missed a preemption opportunity 2807 * Check again in case we missed a preemption opportunity
2808 * between schedule and now. 2808 * between schedule and now.
2809 */ 2809 */
2810 barrier(); 2810 barrier();
2811 } while (need_resched()); 2811 } while (need_resched());
2812 } 2812 }
2813 EXPORT_SYMBOL(preempt_schedule); 2813 EXPORT_SYMBOL(preempt_schedule);
2814 #endif /* CONFIG_PREEMPT */ 2814 #endif /* CONFIG_PREEMPT */
2815 2815
2816 /* 2816 /*
2817 * this is the entry point to schedule() from kernel preemption 2817 * this is the entry point to schedule() from kernel preemption
2818 * off of irq context. 2818 * off of irq context.
2819 * Note that this is called and returns with irqs disabled. This will 2819 * Note that this is called and returns with irqs disabled. This will
2820 * protect us against recursive calling from irq. 2820 * protect us against recursive calling from irq.
2821 */ 2821 */
2822 asmlinkage __visible void __sched preempt_schedule_irq(void) 2822 asmlinkage __visible void __sched preempt_schedule_irq(void)
2823 { 2823 {
2824 enum ctx_state prev_state; 2824 enum ctx_state prev_state;
2825 2825
2826 /* Catch callers which need to be fixed */ 2826 /* Catch callers which need to be fixed */
2827 BUG_ON(preempt_count() || !irqs_disabled()); 2827 BUG_ON(preempt_count() || !irqs_disabled());
2828 2828
2829 prev_state = exception_enter(); 2829 prev_state = exception_enter();
2830 2830
2831 do { 2831 do {
2832 __preempt_count_add(PREEMPT_ACTIVE); 2832 __preempt_count_add(PREEMPT_ACTIVE);
2833 local_irq_enable(); 2833 local_irq_enable();
2834 __schedule(); 2834 __schedule();
2835 local_irq_disable(); 2835 local_irq_disable();
2836 __preempt_count_sub(PREEMPT_ACTIVE); 2836 __preempt_count_sub(PREEMPT_ACTIVE);
2837 2837
2838 /* 2838 /*
2839 * Check again in case we missed a preemption opportunity 2839 * Check again in case we missed a preemption opportunity
2840 * between schedule and now. 2840 * between schedule and now.
2841 */ 2841 */
2842 barrier(); 2842 barrier();
2843 } while (need_resched()); 2843 } while (need_resched());
2844 2844
2845 exception_exit(prev_state); 2845 exception_exit(prev_state);
2846 } 2846 }
2847 2847
2848 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 2848 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
2849 void *key) 2849 void *key)
2850 { 2850 {
2851 return try_to_wake_up(curr->private, mode, wake_flags); 2851 return try_to_wake_up(curr->private, mode, wake_flags);
2852 } 2852 }
2853 EXPORT_SYMBOL(default_wake_function); 2853 EXPORT_SYMBOL(default_wake_function);
2854 2854
2855 #ifdef CONFIG_RT_MUTEXES 2855 #ifdef CONFIG_RT_MUTEXES
2856 2856
2857 /* 2857 /*
2858 * rt_mutex_setprio - set the current priority of a task 2858 * rt_mutex_setprio - set the current priority of a task
2859 * @p: task 2859 * @p: task
2860 * @prio: prio value (kernel-internal form) 2860 * @prio: prio value (kernel-internal form)
2861 * 2861 *
2862 * This function changes the 'effective' priority of a task. It does 2862 * This function changes the 'effective' priority of a task. It does
2863 * not touch ->normal_prio like __setscheduler(). 2863 * not touch ->normal_prio like __setscheduler().
2864 * 2864 *
2865 * Used by the rt_mutex code to implement priority inheritance 2865 * Used by the rt_mutex code to implement priority inheritance
2866 * logic. Call site only calls if the priority of the task changed. 2866 * logic. Call site only calls if the priority of the task changed.
2867 */ 2867 */
2868 void rt_mutex_setprio(struct task_struct *p, int prio) 2868 void rt_mutex_setprio(struct task_struct *p, int prio)
2869 { 2869 {
2870 int oldprio, on_rq, running, enqueue_flag = 0; 2870 int oldprio, on_rq, running, enqueue_flag = 0;
2871 struct rq *rq; 2871 struct rq *rq;
2872 const struct sched_class *prev_class; 2872 const struct sched_class *prev_class;
2873 2873
2874 BUG_ON(prio > MAX_PRIO); 2874 BUG_ON(prio > MAX_PRIO);
2875 2875
2876 rq = __task_rq_lock(p); 2876 rq = __task_rq_lock(p);
2877 2877
2878 /* 2878 /*
2879 * Idle task boosting is a no-no in general. There is one 2879 * Idle task boosting is a no-no in general. There is one
2880 * exception, when PREEMPT_RT and NOHZ are active: 2880 * exception, when PREEMPT_RT and NOHZ are active:
2881 * 2881 *
2882 * The idle task calls get_next_timer_interrupt() and holds 2882 * The idle task calls get_next_timer_interrupt() and holds
2883 * the timer wheel base->lock on the CPU and another CPU wants 2883 * the timer wheel base->lock on the CPU and another CPU wants
2884 * to access the timer (probably to cancel it). We can safely 2884 * to access the timer (probably to cancel it). We can safely
2885 * ignore the boosting request, as the idle CPU runs this code 2885 * ignore the boosting request, as the idle CPU runs this code
2886 * with interrupts disabled and will complete the lock 2886 * with interrupts disabled and will complete the lock
2887 * protected section without being interrupted. So there is no 2887 * protected section without being interrupted. So there is no
2888 * real need to boost. 2888 * real need to boost.
2889 */ 2889 */
2890 if (unlikely(p == rq->idle)) { 2890 if (unlikely(p == rq->idle)) {
2891 WARN_ON(p != rq->curr); 2891 WARN_ON(p != rq->curr);
2892 WARN_ON(p->pi_blocked_on); 2892 WARN_ON(p->pi_blocked_on);
2893 goto out_unlock; 2893 goto out_unlock;
2894 } 2894 }
2895 2895
2896 trace_sched_pi_setprio(p, prio); 2896 trace_sched_pi_setprio(p, prio);
2897 p->pi_top_task = rt_mutex_get_top_task(p); 2897 p->pi_top_task = rt_mutex_get_top_task(p);
2898 oldprio = p->prio; 2898 oldprio = p->prio;
2899 prev_class = p->sched_class; 2899 prev_class = p->sched_class;
2900 on_rq = p->on_rq; 2900 on_rq = p->on_rq;
2901 running = task_current(rq, p); 2901 running = task_current(rq, p);
2902 if (on_rq) 2902 if (on_rq)
2903 dequeue_task(rq, p, 0); 2903 dequeue_task(rq, p, 0);
2904 if (running) 2904 if (running)
2905 p->sched_class->put_prev_task(rq, p); 2905 p->sched_class->put_prev_task(rq, p);
2906 2906
2907 /* 2907 /*
2908 * Boosting conditions are: 2908 * Boosting conditions are:
2909 * 1. -rt task is running and holds mutex A 2909 * 1. -rt task is running and holds mutex A
2910 * --> -dl task blocks on mutex A 2910 * --> -dl task blocks on mutex A
2911 * 2911 *
2912 * 2. -dl task is running and holds mutex A 2912 * 2. -dl task is running and holds mutex A
2913 * --> -dl task blocks on mutex A and could preempt the 2913 * --> -dl task blocks on mutex A and could preempt the
2914 * running task 2914 * running task
2915 */ 2915 */
2916 if (dl_prio(prio)) { 2916 if (dl_prio(prio)) {
2917 if (!dl_prio(p->normal_prio) || (p->pi_top_task && 2917 if (!dl_prio(p->normal_prio) || (p->pi_top_task &&
2918 dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) { 2918 dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) {
2919 p->dl.dl_boosted = 1; 2919 p->dl.dl_boosted = 1;
2920 p->dl.dl_throttled = 0; 2920 p->dl.dl_throttled = 0;
2921 enqueue_flag = ENQUEUE_REPLENISH; 2921 enqueue_flag = ENQUEUE_REPLENISH;
2922 } else 2922 } else
2923 p->dl.dl_boosted = 0; 2923 p->dl.dl_boosted = 0;
2924 p->sched_class = &dl_sched_class; 2924 p->sched_class = &dl_sched_class;
2925 } else if (rt_prio(prio)) { 2925 } else if (rt_prio(prio)) {
2926 if (dl_prio(oldprio)) 2926 if (dl_prio(oldprio))
2927 p->dl.dl_boosted = 0; 2927 p->dl.dl_boosted = 0;
2928 if (oldprio < prio) 2928 if (oldprio < prio)
2929 enqueue_flag = ENQUEUE_HEAD; 2929 enqueue_flag = ENQUEUE_HEAD;
2930 p->sched_class = &rt_sched_class; 2930 p->sched_class = &rt_sched_class;
2931 } else { 2931 } else {
2932 if (dl_prio(oldprio)) 2932 if (dl_prio(oldprio))
2933 p->dl.dl_boosted = 0; 2933 p->dl.dl_boosted = 0;
2934 p->sched_class = &fair_sched_class; 2934 p->sched_class = &fair_sched_class;
2935 } 2935 }
2936 2936
2937 p->prio = prio; 2937 p->prio = prio;
2938 2938
2939 if (running) 2939 if (running)
2940 p->sched_class->set_curr_task(rq); 2940 p->sched_class->set_curr_task(rq);
2941 if (on_rq) 2941 if (on_rq)
2942 enqueue_task(rq, p, enqueue_flag); 2942 enqueue_task(rq, p, enqueue_flag);
2943 2943
2944 check_class_changed(rq, p, prev_class, oldprio); 2944 check_class_changed(rq, p, prev_class, oldprio);
2945 out_unlock: 2945 out_unlock:
2946 __task_rq_unlock(rq); 2946 __task_rq_unlock(rq);
2947 } 2947 }
2948 #endif 2948 #endif
2949 2949
2950 void set_user_nice(struct task_struct *p, long nice) 2950 void set_user_nice(struct task_struct *p, long nice)
2951 { 2951 {
2952 int old_prio, delta, on_rq; 2952 int old_prio, delta, on_rq;
2953 unsigned long flags; 2953 unsigned long flags;
2954 struct rq *rq; 2954 struct rq *rq;
2955 2955
2956 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) 2956 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
2957 return; 2957 return;
2958 /* 2958 /*
2959 * We have to be careful, if called from sys_setpriority(), 2959 * We have to be careful, if called from sys_setpriority(),
2960 * the task might be in the middle of scheduling on another CPU. 2960 * the task might be in the middle of scheduling on another CPU.
2961 */ 2961 */
2962 rq = task_rq_lock(p, &flags); 2962 rq = task_rq_lock(p, &flags);
2963 /* 2963 /*
2964 * The RT priorities are set via sched_setscheduler(), but we still 2964 * The RT priorities are set via sched_setscheduler(), but we still
2965 * allow the 'normal' nice value to be set - but as expected 2965 * allow the 'normal' nice value to be set - but as expected
2966 * it won't have any effect on scheduling as long as the task is 2966 * it won't have any effect on scheduling as long as the task is
2967 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: 2967 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
2968 */ 2968 */
2969 if (task_has_dl_policy(p) || task_has_rt_policy(p)) { 2969 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
2970 p->static_prio = NICE_TO_PRIO(nice); 2970 p->static_prio = NICE_TO_PRIO(nice);
2971 goto out_unlock; 2971 goto out_unlock;
2972 } 2972 }
2973 on_rq = p->on_rq; 2973 on_rq = p->on_rq;
2974 if (on_rq) 2974 if (on_rq)
2975 dequeue_task(rq, p, 0); 2975 dequeue_task(rq, p, 0);
2976 2976
2977 p->static_prio = NICE_TO_PRIO(nice); 2977 p->static_prio = NICE_TO_PRIO(nice);
2978 set_load_weight(p); 2978 set_load_weight(p);
2979 old_prio = p->prio; 2979 old_prio = p->prio;
2980 p->prio = effective_prio(p); 2980 p->prio = effective_prio(p);
2981 delta = p->prio - old_prio; 2981 delta = p->prio - old_prio;
2982 2982
2983 if (on_rq) { 2983 if (on_rq) {
2984 enqueue_task(rq, p, 0); 2984 enqueue_task(rq, p, 0);
2985 /* 2985 /*
2986 * If the task increased its priority or is running and 2986 * If the task increased its priority or is running and
2987 * lowered its priority, then reschedule its CPU: 2987 * lowered its priority, then reschedule its CPU:
2988 */ 2988 */
2989 if (delta < 0 || (delta > 0 && task_running(rq, p))) 2989 if (delta < 0 || (delta > 0 && task_running(rq, p)))
2990 resched_task(rq->curr); 2990 resched_task(rq->curr);
2991 } 2991 }
2992 out_unlock: 2992 out_unlock:
2993 task_rq_unlock(rq, p, &flags); 2993 task_rq_unlock(rq, p, &flags);
2994 } 2994 }
2995 EXPORT_SYMBOL(set_user_nice); 2995 EXPORT_SYMBOL(set_user_nice);
2996 2996
2997 /* 2997 /*
2998 * can_nice - check if a task can reduce its nice value 2998 * can_nice - check if a task can reduce its nice value
2999 * @p: task 2999 * @p: task
3000 * @nice: nice value 3000 * @nice: nice value
3001 */ 3001 */
3002 int can_nice(const struct task_struct *p, const int nice) 3002 int can_nice(const struct task_struct *p, const int nice)
3003 { 3003 {
3004 /* convert nice value [19,-20] to rlimit style value [1,40] */ 3004 /* convert nice value [19,-20] to rlimit style value [1,40] */
3005 int nice_rlim = 20 - nice; 3005 int nice_rlim = 20 - nice;
3006 3006
3007 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 3007 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3008 capable(CAP_SYS_NICE)); 3008 capable(CAP_SYS_NICE));
3009 } 3009 }
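
The rlimit compared against here is RLIMIT_NICE, so the same check can be anticipated from userspace with getrlimit(); a small sketch, where the "20 - limit" mapping simply mirrors the conversion in the comment above:

#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_NICE, &rl))
		return 1;

	/* e.g. a soft limit of 25 allows lowering nice down to -5 */
	printf("RLIMIT_NICE soft limit: %lu\n", (unsigned long)rl.rlim_cur);
	printf("lowest reachable nice without CAP_SYS_NICE: %ld\n",
	       20 - (long)rl.rlim_cur);
	return 0;
}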
3010 3010
3011 #ifdef __ARCH_WANT_SYS_NICE 3011 #ifdef __ARCH_WANT_SYS_NICE
3012 3012
3013 /* 3013 /*
3014 * sys_nice - change the priority of the current process. 3014 * sys_nice - change the priority of the current process.
3015 * @increment: priority increment 3015 * @increment: priority increment
3016 * 3016 *
3017 * sys_setpriority is a more generic, but much slower function that 3017 * sys_setpriority is a more generic, but much slower function that
3018 * does similar things. 3018 * does similar things.
3019 */ 3019 */
3020 SYSCALL_DEFINE1(nice, int, increment) 3020 SYSCALL_DEFINE1(nice, int, increment)
3021 { 3021 {
3022 long nice, retval; 3022 long nice, retval;
3023 3023
3024 /* 3024 /*
3025 * Setpriority might change our priority at the same moment. 3025 * Setpriority might change our priority at the same moment.
3026 * We don't have to worry. Conceptually one call occurs first 3026 * We don't have to worry. Conceptually one call occurs first
3027 * and we have a single winner. 3027 * and we have a single winner.
3028 */ 3028 */
3029 if (increment < -40) 3029 if (increment < -40)
3030 increment = -40; 3030 increment = -40;
3031 if (increment > 40) 3031 if (increment > 40)
3032 increment = 40; 3032 increment = 40;
3033 3033
3034 nice = task_nice(current) + increment; 3034 nice = task_nice(current) + increment;
3035 if (nice < MIN_NICE) 3035 if (nice < MIN_NICE)
3036 nice = MIN_NICE; 3036 nice = MIN_NICE;
3037 if (nice > MAX_NICE) 3037 if (nice > MAX_NICE)
3038 nice = MAX_NICE; 3038 nice = MAX_NICE;
3039 3039
3040 if (increment < 0 && !can_nice(current, nice)) 3040 if (increment < 0 && !can_nice(current, nice))
3041 return -EPERM; 3041 return -EPERM;
3042 3042
3043 retval = security_task_setnice(current, nice); 3043 retval = security_task_setnice(current, nice);
3044 if (retval) 3044 if (retval)
3045 return retval; 3045 return retval;
3046 3046
3047 set_user_nice(current, nice); 3047 set_user_nice(current, nice);
3048 return 0; 3048 return 0;
3049 } 3049 }
3050 3050
3051 #endif 3051 #endif
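
A minimal caller of the increment semantics above; note that, depending on the C library, nice(3) may be implemented through this syscall or through setpriority(), so this is only an illustrative sketch:

#include <stdio.h>
#include <errno.h>
#include <unistd.h>

int main(void)
{
	int val;

	errno = 0;
	val = nice(5);			/* relative increment, clamped as above */
	if (val == -1 && errno)
		perror("nice");
	else
		printf("new nice value: %d\n", val);

	errno = 0;
	val = nice(-10);		/* lowering needs CAP_SYS_NICE or RLIMIT_NICE */
	if (val == -1 && errno)
		perror("nice");
	else
		printf("new nice value: %d\n", val);
	return 0;
}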
3052 3052
3053 /** 3053 /**
3054 * task_prio - return the priority value of a given task. 3054 * task_prio - return the priority value of a given task.
3055 * @p: the task in question. 3055 * @p: the task in question.
3056 * 3056 *
3057 * Return: The priority value as seen by users in /proc. 3057 * Return: The priority value as seen by users in /proc.
3058 * RT tasks map to negative values [-100 ... -2] (SCHED_DEADLINE 3058 * RT tasks map to negative values [-100 ... -2] (SCHED_DEADLINE
3059 * to -101), normal tasks to [0 ... 39], i.e. 20 + nice. 3059 * to -101), normal tasks to [0 ... 39], i.e. 20 + nice.
3060 */ 3060 */
3061 int task_prio(const struct task_struct *p) 3061 int task_prio(const struct task_struct *p)
3062 { 3062 {
3063 return p->prio - MAX_RT_PRIO; 3063 return p->prio - MAX_RT_PRIO;
3064 } 3064 }
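
This is the value reported as field 18 ("priority") of /proc/<pid>/stat, next to the nice value in field 19 (field positions per proc(5)). A sketch reading it back; the parsing is deliberately simplified and assumes a comm field without spaces:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/self/stat", "r");
	char comm[64], state;
	long prio, nice;
	int pid, i;

	if (!f)
		return 1;

	/* fields 1-3: pid, comm, state; then skip up to field 17 */
	if (fscanf(f, "%d %63s %c", &pid, comm, &state) != 3)
		return 1;
	for (i = 4; i <= 17; i++)
		fscanf(f, "%*s");
	if (fscanf(f, "%ld %ld", &prio, &nice) != 2)	/* fields 18, 19 */
		return 1;
	fclose(f);

	printf("priority=%ld nice=%ld\n", prio, nice);
	return 0;
}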
3065 3065
3066 /** 3066 /**
3067 * idle_cpu - is a given cpu idle currently? 3067 * idle_cpu - is a given cpu idle currently?
3068 * @cpu: the processor in question. 3068 * @cpu: the processor in question.
3069 * 3069 *
3070 * Return: 1 if the CPU is currently idle. 0 otherwise. 3070 * Return: 1 if the CPU is currently idle. 0 otherwise.
3071 */ 3071 */
3072 int idle_cpu(int cpu) 3072 int idle_cpu(int cpu)
3073 { 3073 {
3074 struct rq *rq = cpu_rq(cpu); 3074 struct rq *rq = cpu_rq(cpu);
3075 3075
3076 if (rq->curr != rq->idle) 3076 if (rq->curr != rq->idle)
3077 return 0; 3077 return 0;
3078 3078
3079 if (rq->nr_running) 3079 if (rq->nr_running)
3080 return 0; 3080 return 0;
3081 3081
3082 #ifdef CONFIG_SMP 3082 #ifdef CONFIG_SMP
3083 if (!llist_empty(&rq->wake_list)) 3083 if (!llist_empty(&rq->wake_list))
3084 return 0; 3084 return 0;
3085 #endif 3085 #endif
3086 3086
3087 return 1; 3087 return 1;
3088 } 3088 }
3089 3089
3090 /** 3090 /**
3091 * idle_task - return the idle task for a given cpu. 3091 * idle_task - return the idle task for a given cpu.
3092 * @cpu: the processor in question. 3092 * @cpu: the processor in question.
3093 * 3093 *
3094 * Return: The idle task for the cpu @cpu. 3094 * Return: The idle task for the cpu @cpu.
3095 */ 3095 */
3096 struct task_struct *idle_task(int cpu) 3096 struct task_struct *idle_task(int cpu)
3097 { 3097 {
3098 return cpu_rq(cpu)->idle; 3098 return cpu_rq(cpu)->idle;
3099 } 3099 }
3100 3100
3101 /** 3101 /**
3102 * find_process_by_pid - find a process with a matching PID value. 3102 * find_process_by_pid - find a process with a matching PID value.
3103 * @pid: the pid in question. 3103 * @pid: the pid in question.
3104 * 3104 *
3105 * Return: the task of @pid, if found. %NULL otherwise. 3105 * Return: the task of @pid, if found. %NULL otherwise.
3106 */ 3106 */
3107 static struct task_struct *find_process_by_pid(pid_t pid) 3107 static struct task_struct *find_process_by_pid(pid_t pid)
3108 { 3108 {
3109 return pid ? find_task_by_vpid(pid) : current; 3109 return pid ? find_task_by_vpid(pid) : current;
3110 } 3110 }
3111 3111
3112 /* 3112 /*
3113 * This function initializes the sched_dl_entity of a task that is 3113 * This function initializes the sched_dl_entity of a task that is
3114 * becoming SCHED_DEADLINE. 3114 * becoming SCHED_DEADLINE.
3115 * 3115 *
3116 * Only the static values are considered here, the actual runtime and the 3116 * Only the static values are considered here, the actual runtime and the
3117 * absolute deadline will be properly calculated when the task is enqueued 3117 * absolute deadline will be properly calculated when the task is enqueued
3118 * for the first time with its new policy. 3118 * for the first time with its new policy.
3119 */ 3119 */
3120 static void 3120 static void
3121 __setparam_dl(struct task_struct *p, const struct sched_attr *attr) 3121 __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
3122 { 3122 {
3123 struct sched_dl_entity *dl_se = &p->dl; 3123 struct sched_dl_entity *dl_se = &p->dl;
3124 3124
3125 init_dl_task_timer(dl_se); 3125 init_dl_task_timer(dl_se);
3126 dl_se->dl_runtime = attr->sched_runtime; 3126 dl_se->dl_runtime = attr->sched_runtime;
3127 dl_se->dl_deadline = attr->sched_deadline; 3127 dl_se->dl_deadline = attr->sched_deadline;
3128 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; 3128 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
3129 dl_se->flags = attr->sched_flags; 3129 dl_se->flags = attr->sched_flags;
3130 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); 3130 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
3131 dl_se->dl_throttled = 0; 3131 dl_se->dl_throttled = 0;
3132 dl_se->dl_new = 1; 3132 dl_se->dl_new = 1;
3133 dl_se->dl_yielded = 0; 3133 dl_se->dl_yielded = 0;
3134 } 3134 }
3135 3135
3136 static void __setscheduler_params(struct task_struct *p, 3136 static void __setscheduler_params(struct task_struct *p,
3137 const struct sched_attr *attr) 3137 const struct sched_attr *attr)
3138 { 3138 {
3139 int policy = attr->sched_policy; 3139 int policy = attr->sched_policy;
3140 3140
3141 if (policy == -1) /* setparam */ 3141 if (policy == -1) /* setparam */
3142 policy = p->policy; 3142 policy = p->policy;
3143 3143
3144 p->policy = policy; 3144 p->policy = policy;
3145 3145
3146 if (dl_policy(policy)) 3146 if (dl_policy(policy))
3147 __setparam_dl(p, attr); 3147 __setparam_dl(p, attr);
3148 else if (fair_policy(policy)) 3148 else if (fair_policy(policy))
3149 p->static_prio = NICE_TO_PRIO(attr->sched_nice); 3149 p->static_prio = NICE_TO_PRIO(attr->sched_nice);
3150 3150
3151 /* 3151 /*
3152 * __sched_setscheduler() ensures attr->sched_priority == 0 when 3152 * __sched_setscheduler() ensures attr->sched_priority == 0 when
3153 * !rt_policy. Always setting this ensures that things like 3153 * !rt_policy. Always setting this ensures that things like
3154 * getparam()/getattr() don't report silly values for !rt tasks. 3154 * getparam()/getattr() don't report silly values for !rt tasks.
3155 */ 3155 */
3156 p->rt_priority = attr->sched_priority; 3156 p->rt_priority = attr->sched_priority;
3157 p->normal_prio = normal_prio(p); 3157 p->normal_prio = normal_prio(p);
3158 set_load_weight(p); 3158 set_load_weight(p);
3159 } 3159 }
3160 3160
3161 /* Actually do priority change: must hold pi & rq lock. */ 3161 /* Actually do priority change: must hold pi & rq lock. */
3162 static void __setscheduler(struct rq *rq, struct task_struct *p, 3162 static void __setscheduler(struct rq *rq, struct task_struct *p,
3163 const struct sched_attr *attr) 3163 const struct sched_attr *attr)
3164 { 3164 {
3165 __setscheduler_params(p, attr); 3165 __setscheduler_params(p, attr);
3166 3166
3167 /* 3167 /*
3168 * If we get here, there were no pi waiters boosting the 3168 * If we get here, there were no pi waiters boosting the
3169 * task. It is safe to use the normal prio. 3169 * task. It is safe to use the normal prio.
3170 */ 3170 */
3171 p->prio = normal_prio(p); 3171 p->prio = normal_prio(p);
3172 3172
3173 if (dl_prio(p->prio)) 3173 if (dl_prio(p->prio))
3174 p->sched_class = &dl_sched_class; 3174 p->sched_class = &dl_sched_class;
3175 else if (rt_prio(p->prio)) 3175 else if (rt_prio(p->prio))
3176 p->sched_class = &rt_sched_class; 3176 p->sched_class = &rt_sched_class;
3177 else 3177 else
3178 p->sched_class = &fair_sched_class; 3178 p->sched_class = &fair_sched_class;
3179 } 3179 }
3180 3180
3181 static void 3181 static void
3182 __getparam_dl(struct task_struct *p, struct sched_attr *attr) 3182 __getparam_dl(struct task_struct *p, struct sched_attr *attr)
3183 { 3183 {
3184 struct sched_dl_entity *dl_se = &p->dl; 3184 struct sched_dl_entity *dl_se = &p->dl;
3185 3185
3186 attr->sched_priority = p->rt_priority; 3186 attr->sched_priority = p->rt_priority;
3187 attr->sched_runtime = dl_se->dl_runtime; 3187 attr->sched_runtime = dl_se->dl_runtime;
3188 attr->sched_deadline = dl_se->dl_deadline; 3188 attr->sched_deadline = dl_se->dl_deadline;
3189 attr->sched_period = dl_se->dl_period; 3189 attr->sched_period = dl_se->dl_period;
3190 attr->sched_flags = dl_se->flags; 3190 attr->sched_flags = dl_se->flags;
3191 } 3191 }
3192 3192
3193 /* 3193 /*
3194 * This function validates the new parameters of a -deadline task. 3194 * This function validates the new parameters of a -deadline task.
3195 * We ask for the deadline not being zero, and greater than or 3195 * We ask for the deadline not being zero, and greater than or
3196 * equal to the runtime, as well as the period being either zero 3196 * equal to the runtime, as well as the period being either zero
3197 * or no smaller than the deadline. Furthermore, we have to be sure that 3197 * or no smaller than the deadline. Furthermore, we have to be sure that
3198 * user parameters are above the internal resolution (1us); we 3198 * user parameters are above the internal resolution of 1us (we
3199 * check sched_runtime only since it is always the smaller one. 3199 * check sched_runtime only since it is always the smaller one) and
3200 * below 2^63 ns (we have to check both sched_deadline and
3201 * sched_period, as the latter can be zero).
3200 */ 3202 */
3201 static bool 3203 static bool
3202 __checkparam_dl(const struct sched_attr *attr) 3204 __checkparam_dl(const struct sched_attr *attr)
3203 { 3205 {
3204 return attr && attr->sched_deadline != 0 && 3206 /* deadline != 0 */
3205 (attr->sched_period == 0 || 3207 if (attr->sched_deadline == 0)
3206 (s64)(attr->sched_period - attr->sched_deadline) >= 0) && 3208 return false;
3207 (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 && 3209
3208 attr->sched_runtime >= (2 << (DL_SCALE - 1)); 3210 /*
3211 * Since we truncate DL_SCALE bits, make sure we're at least
3212 * that big.
3213 */
3214 if (attr->sched_runtime < (1ULL << DL_SCALE))
3215 return false;
3216
3217 /*
3218 * Since we use the MSB for wrap-around and sign issues, make
3219 * sure it's not set (mind that period can be equal to zero).
3220 */
3221 if (attr->sched_deadline & (1ULL << 63) ||
3222 attr->sched_period & (1ULL << 63))
3223 return false;
3224
3225 /* runtime <= deadline <= period (if period != 0) */
3226 if ((attr->sched_period != 0 &&
3227 attr->sched_period < attr->sched_deadline) ||
3228 attr->sched_deadline < attr->sched_runtime)
3229 return false;
3230
3231 return true;
3209 } 3232 }
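
Parameters that satisfy all of the above (runtime <= deadline <= period, runtime at least roughly 1us, everything below 2^63 ns) can be handed to the kernel through the sched_setattr() syscall. A userspace sketch: the struct layout follows the sched_setattr ABI, the syscall number shown is the x86-64 one (other architectures differ, which is why it is only a fallback define), and the call needs root since unprivileged SCHED_DEADLINE is rejected below:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE	6
#endif
#ifndef SYS_sched_setattr
#define SYS_sched_setattr 314		/* x86-64; arch-specific assumption */
#endif

struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;		/* all three in nanoseconds */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy   = SCHED_DEADLINE;
	attr.sched_runtime  =  10 * 1000 * 1000;	/*  10 ms */
	attr.sched_deadline =  30 * 1000 * 1000;	/*  30 ms */
	attr.sched_period   = 100 * 1000 * 1000;	/* 100 ms */

	/* EINVAL here means the parameters failed checks like __checkparam_dl() */
	if (syscall(SYS_sched_setattr, 0, &attr, 0))
		perror("sched_setattr");
	else
		puts("now running SCHED_DEADLINE");
	return 0;
}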
3210 3233
3211 /* 3234 /*
3212 * check the target process has a UID that matches the current process's 3235 * check the target process has a UID that matches the current process's
3213 */ 3236 */
3214 static bool check_same_owner(struct task_struct *p) 3237 static bool check_same_owner(struct task_struct *p)
3215 { 3238 {
3216 const struct cred *cred = current_cred(), *pcred; 3239 const struct cred *cred = current_cred(), *pcred;
3217 bool match; 3240 bool match;
3218 3241
3219 rcu_read_lock(); 3242 rcu_read_lock();
3220 pcred = __task_cred(p); 3243 pcred = __task_cred(p);
3221 match = (uid_eq(cred->euid, pcred->euid) || 3244 match = (uid_eq(cred->euid, pcred->euid) ||
3222 uid_eq(cred->euid, pcred->uid)); 3245 uid_eq(cred->euid, pcred->uid));
3223 rcu_read_unlock(); 3246 rcu_read_unlock();
3224 return match; 3247 return match;
3225 } 3248 }
3226 3249
3227 static int __sched_setscheduler(struct task_struct *p, 3250 static int __sched_setscheduler(struct task_struct *p,
3228 const struct sched_attr *attr, 3251 const struct sched_attr *attr,
3229 bool user) 3252 bool user)
3230 { 3253 {
3231 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : 3254 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
3232 MAX_RT_PRIO - 1 - attr->sched_priority; 3255 MAX_RT_PRIO - 1 - attr->sched_priority;
3233 int retval, oldprio, oldpolicy = -1, on_rq, running; 3256 int retval, oldprio, oldpolicy = -1, on_rq, running;
3234 int policy = attr->sched_policy; 3257 int policy = attr->sched_policy;
3235 unsigned long flags; 3258 unsigned long flags;
3236 const struct sched_class *prev_class; 3259 const struct sched_class *prev_class;
3237 struct rq *rq; 3260 struct rq *rq;
3238 int reset_on_fork; 3261 int reset_on_fork;
3239 3262
3240 /* may grab non-irq protected spin_locks */ 3263 /* may grab non-irq protected spin_locks */
3241 BUG_ON(in_interrupt()); 3264 BUG_ON(in_interrupt());
3242 recheck: 3265 recheck:
3243 /* double check policy once rq lock held */ 3266 /* double check policy once rq lock held */
3244 if (policy < 0) { 3267 if (policy < 0) {
3245 reset_on_fork = p->sched_reset_on_fork; 3268 reset_on_fork = p->sched_reset_on_fork;
3246 policy = oldpolicy = p->policy; 3269 policy = oldpolicy = p->policy;
3247 } else { 3270 } else {
3248 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); 3271 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
3249 3272
3250 if (policy != SCHED_DEADLINE && 3273 if (policy != SCHED_DEADLINE &&
3251 policy != SCHED_FIFO && policy != SCHED_RR && 3274 policy != SCHED_FIFO && policy != SCHED_RR &&
3252 policy != SCHED_NORMAL && policy != SCHED_BATCH && 3275 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
3253 policy != SCHED_IDLE) 3276 policy != SCHED_IDLE)
3254 return -EINVAL; 3277 return -EINVAL;
3255 } 3278 }
3256 3279
3257 if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) 3280 if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
3258 return -EINVAL; 3281 return -EINVAL;
3259 3282
3260 /* 3283 /*
3261 * Valid priorities for SCHED_FIFO and SCHED_RR are 3284 * Valid priorities for SCHED_FIFO and SCHED_RR are
3262 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 3285 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
3263 * SCHED_BATCH and SCHED_IDLE is 0. 3286 * SCHED_BATCH and SCHED_IDLE is 0.
3264 */ 3287 */
3265 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || 3288 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
3266 (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) 3289 (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
3267 return -EINVAL; 3290 return -EINVAL;
3268 if ((dl_policy(policy) && !__checkparam_dl(attr)) || 3291 if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
3269 (rt_policy(policy) != (attr->sched_priority != 0))) 3292 (rt_policy(policy) != (attr->sched_priority != 0)))
3270 return -EINVAL; 3293 return -EINVAL;
3271 3294
3272 /* 3295 /*
3273 * Allow unprivileged RT tasks to decrease priority: 3296 * Allow unprivileged RT tasks to decrease priority:
3274 */ 3297 */
3275 if (user && !capable(CAP_SYS_NICE)) { 3298 if (user && !capable(CAP_SYS_NICE)) {
3276 if (fair_policy(policy)) { 3299 if (fair_policy(policy)) {
3277 if (attr->sched_nice < task_nice(p) && 3300 if (attr->sched_nice < task_nice(p) &&
3278 !can_nice(p, attr->sched_nice)) 3301 !can_nice(p, attr->sched_nice))
3279 return -EPERM; 3302 return -EPERM;
3280 } 3303 }
3281 3304
3282 if (rt_policy(policy)) { 3305 if (rt_policy(policy)) {
3283 unsigned long rlim_rtprio = 3306 unsigned long rlim_rtprio =
3284 task_rlimit(p, RLIMIT_RTPRIO); 3307 task_rlimit(p, RLIMIT_RTPRIO);
3285 3308
3286 /* can't set/change the rt policy */ 3309 /* can't set/change the rt policy */
3287 if (policy != p->policy && !rlim_rtprio) 3310 if (policy != p->policy && !rlim_rtprio)
3288 return -EPERM; 3311 return -EPERM;
3289 3312
3290 /* can't increase priority */ 3313 /* can't increase priority */
3291 if (attr->sched_priority > p->rt_priority && 3314 if (attr->sched_priority > p->rt_priority &&
3292 attr->sched_priority > rlim_rtprio) 3315 attr->sched_priority > rlim_rtprio)
3293 return -EPERM; 3316 return -EPERM;
3294 } 3317 }
3295 3318
3296 /* 3319 /*
3297 * Can't set/change SCHED_DEADLINE policy at all for now 3320 * Can't set/change SCHED_DEADLINE policy at all for now
3298 * (safest behavior); in the future we would like to allow 3321 * (safest behavior); in the future we would like to allow
3299 * unprivileged DL tasks to increase their relative deadline 3322 * unprivileged DL tasks to increase their relative deadline
3300 * or reduce their runtime (both ways reducing utilization) 3323 * or reduce their runtime (both ways reducing utilization)
3301 */ 3324 */
3302 if (dl_policy(policy)) 3325 if (dl_policy(policy))
3303 return -EPERM; 3326 return -EPERM;
3304 3327
3305 /* 3328 /*
3306 * Treat SCHED_IDLE as nice 20. Only allow a switch to 3329 * Treat SCHED_IDLE as nice 20. Only allow a switch to
3307 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 3330 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
3308 */ 3331 */
3309 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { 3332 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
3310 if (!can_nice(p, task_nice(p))) 3333 if (!can_nice(p, task_nice(p)))
3311 return -EPERM; 3334 return -EPERM;
3312 } 3335 }
3313 3336
3314 /* can't change other user's priorities */ 3337 /* can't change other user's priorities */
3315 if (!check_same_owner(p)) 3338 if (!check_same_owner(p))
3316 return -EPERM; 3339 return -EPERM;
3317 3340
3318 /* Normal users shall not reset the sched_reset_on_fork flag */ 3341 /* Normal users shall not reset the sched_reset_on_fork flag */
3319 if (p->sched_reset_on_fork && !reset_on_fork) 3342 if (p->sched_reset_on_fork && !reset_on_fork)
3320 return -EPERM; 3343 return -EPERM;
3321 } 3344 }
3322 3345
3323 if (user) { 3346 if (user) {
3324 retval = security_task_setscheduler(p); 3347 retval = security_task_setscheduler(p);
3325 if (retval) 3348 if (retval)
3326 return retval; 3349 return retval;
3327 } 3350 }
3328 3351
3329 /* 3352 /*
3330 * make sure no PI-waiters arrive (or leave) while we are 3353 * make sure no PI-waiters arrive (or leave) while we are
3331 * changing the priority of the task: 3354 * changing the priority of the task:
3332 * 3355 *
3333 * To be able to change p->policy safely, the appropriate 3356 * To be able to change p->policy safely, the appropriate
3334 * runqueue lock must be held. 3357 * runqueue lock must be held.
3335 */ 3358 */
3336 rq = task_rq_lock(p, &flags); 3359 rq = task_rq_lock(p, &flags);
3337 3360
3338 /* 3361 /*
3339 * Changing the policy of the stop threads is a very bad idea 3362 * Changing the policy of the stop threads is a very bad idea
3340 */ 3363 */
3341 if (p == rq->stop) { 3364 if (p == rq->stop) {
3342 task_rq_unlock(rq, p, &flags); 3365 task_rq_unlock(rq, p, &flags);
3343 return -EINVAL; 3366 return -EINVAL;
3344 } 3367 }
3345 3368
3346 /* 3369 /*
3347 * If not changing anything there's no need to proceed further, 3370 * If not changing anything there's no need to proceed further,
3348 * but store a possible modification of reset_on_fork. 3371 * but store a possible modification of reset_on_fork.
3349 */ 3372 */
3350 if (unlikely(policy == p->policy)) { 3373 if (unlikely(policy == p->policy)) {
3351 if (fair_policy(policy) && attr->sched_nice != task_nice(p)) 3374 if (fair_policy(policy) && attr->sched_nice != task_nice(p))
3352 goto change; 3375 goto change;
3353 if (rt_policy(policy) && attr->sched_priority != p->rt_priority) 3376 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
3354 goto change; 3377 goto change;
3355 if (dl_policy(policy)) 3378 if (dl_policy(policy))
3356 goto change; 3379 goto change;
3357 3380
3358 p->sched_reset_on_fork = reset_on_fork; 3381 p->sched_reset_on_fork = reset_on_fork;
3359 task_rq_unlock(rq, p, &flags); 3382 task_rq_unlock(rq, p, &flags);
3360 return 0; 3383 return 0;
3361 } 3384 }
3362 change: 3385 change:
3363 3386
3364 if (user) { 3387 if (user) {
3365 #ifdef CONFIG_RT_GROUP_SCHED 3388 #ifdef CONFIG_RT_GROUP_SCHED
3366 /* 3389 /*
3367 * Do not allow realtime tasks into groups that have no runtime 3390 * Do not allow realtime tasks into groups that have no runtime
3368 * assigned. 3391 * assigned.
3369 */ 3392 */
3370 if (rt_bandwidth_enabled() && rt_policy(policy) && 3393 if (rt_bandwidth_enabled() && rt_policy(policy) &&
3371 task_group(p)->rt_bandwidth.rt_runtime == 0 && 3394 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
3372 !task_group_is_autogroup(task_group(p))) { 3395 !task_group_is_autogroup(task_group(p))) {
3373 task_rq_unlock(rq, p, &flags); 3396 task_rq_unlock(rq, p, &flags);
3374 return -EPERM; 3397 return -EPERM;
3375 } 3398 }
3376 #endif 3399 #endif
3377 #ifdef CONFIG_SMP 3400 #ifdef CONFIG_SMP
3378 if (dl_bandwidth_enabled() && dl_policy(policy)) { 3401 if (dl_bandwidth_enabled() && dl_policy(policy)) {
3379 cpumask_t *span = rq->rd->span; 3402 cpumask_t *span = rq->rd->span;
3380 3403
3381 /* 3404 /*
3382 * Don't allow tasks with an affinity mask smaller than 3405 * Don't allow tasks with an affinity mask smaller than
3383 * the entire root_domain to become SCHED_DEADLINE. We 3406 * the entire root_domain to become SCHED_DEADLINE. We
3384 * will also fail if there's no bandwidth available. 3407 * will also fail if there's no bandwidth available.
3385 */ 3408 */
3386 if (!cpumask_subset(span, &p->cpus_allowed) || 3409 if (!cpumask_subset(span, &p->cpus_allowed) ||
3387 rq->rd->dl_bw.bw == 0) { 3410 rq->rd->dl_bw.bw == 0) {
3388 task_rq_unlock(rq, p, &flags); 3411 task_rq_unlock(rq, p, &flags);
3389 return -EPERM; 3412 return -EPERM;
3390 } 3413 }
3391 } 3414 }
3392 #endif 3415 #endif
3393 } 3416 }
3394 3417
3395 /* recheck policy now with rq lock held */ 3418 /* recheck policy now with rq lock held */
3396 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 3419 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
3397 policy = oldpolicy = -1; 3420 policy = oldpolicy = -1;
3398 task_rq_unlock(rq, p, &flags); 3421 task_rq_unlock(rq, p, &flags);
3399 goto recheck; 3422 goto recheck;
3400 } 3423 }
3401 3424
3402 /* 3425 /*
3403 * If setscheduling to SCHED_DEADLINE (or changing the parameters 3426 * If setscheduling to SCHED_DEADLINE (or changing the parameters
3404 * of a SCHED_DEADLINE task) we need to check if enough bandwidth 3427 * of a SCHED_DEADLINE task) we need to check if enough bandwidth
3405 * is available. 3428 * is available.
3406 */ 3429 */
3407 if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { 3430 if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
3408 task_rq_unlock(rq, p, &flags); 3431 task_rq_unlock(rq, p, &flags);
3409 return -EBUSY; 3432 return -EBUSY;
3410 } 3433 }
3411 3434
3412 p->sched_reset_on_fork = reset_on_fork; 3435 p->sched_reset_on_fork = reset_on_fork;
3413 oldprio = p->prio; 3436 oldprio = p->prio;
3414 3437
3415 /* 3438 /*
3416 * Special case for priority boosted tasks. 3439 * Special case for priority boosted tasks.
3417 * 3440 *
3418 * If the new priority is lower or equal (user space view) 3441 * If the new priority is lower or equal (user space view)
3419 * than the current (boosted) priority, we just store the new 3442 * than the current (boosted) priority, we just store the new
3420 * normal parameters and do not touch the scheduler class and 3443 * normal parameters and do not touch the scheduler class and
3421 * the runqueue. This will be done when the task deboosts 3444 * the runqueue. This will be done when the task deboosts
3422 * itself. 3445 * itself.
3423 */ 3446 */
3424 if (rt_mutex_check_prio(p, newprio)) { 3447 if (rt_mutex_check_prio(p, newprio)) {
3425 __setscheduler_params(p, attr); 3448 __setscheduler_params(p, attr);
3426 task_rq_unlock(rq, p, &flags); 3449 task_rq_unlock(rq, p, &flags);
3427 return 0; 3450 return 0;
3428 } 3451 }
3429 3452
3430 on_rq = p->on_rq; 3453 on_rq = p->on_rq;
3431 running = task_current(rq, p); 3454 running = task_current(rq, p);
3432 if (on_rq) 3455 if (on_rq)
3433 dequeue_task(rq, p, 0); 3456 dequeue_task(rq, p, 0);
3434 if (running) 3457 if (running)
3435 p->sched_class->put_prev_task(rq, p); 3458 p->sched_class->put_prev_task(rq, p);
3436 3459
3437 prev_class = p->sched_class; 3460 prev_class = p->sched_class;
3438 __setscheduler(rq, p, attr); 3461 __setscheduler(rq, p, attr);
3439 3462
3440 if (running) 3463 if (running)
3441 p->sched_class->set_curr_task(rq); 3464 p->sched_class->set_curr_task(rq);
3442 if (on_rq) { 3465 if (on_rq) {
3443 /* 3466 /*
3444 * We enqueue to tail when the priority of a task is 3467 * We enqueue to tail when the priority of a task is
3445 * increased (user space view). 3468 * increased (user space view).
3446 */ 3469 */
3447 enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0); 3470 enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
3448 } 3471 }
3449 3472
3450 check_class_changed(rq, p, prev_class, oldprio); 3473 check_class_changed(rq, p, prev_class, oldprio);
3451 task_rq_unlock(rq, p, &flags); 3474 task_rq_unlock(rq, p, &flags);
3452 3475
3453 rt_mutex_adjust_pi(p); 3476 rt_mutex_adjust_pi(p);
3454 3477
3455 return 0; 3478 return 0;
3456 } 3479 }
3457 3480
3458 static int _sched_setscheduler(struct task_struct *p, int policy, 3481 static int _sched_setscheduler(struct task_struct *p, int policy,
3459 const struct sched_param *param, bool check) 3482 const struct sched_param *param, bool check)
3460 { 3483 {
3461 struct sched_attr attr = { 3484 struct sched_attr attr = {
3462 .sched_policy = policy, 3485 .sched_policy = policy,
3463 .sched_priority = param->sched_priority, 3486 .sched_priority = param->sched_priority,
3464 .sched_nice = PRIO_TO_NICE(p->static_prio), 3487 .sched_nice = PRIO_TO_NICE(p->static_prio),
3465 }; 3488 };
3466 3489
3467 /* 3490 /*
3468 * Fixup the legacy SCHED_RESET_ON_FORK hack 3491 * Fixup the legacy SCHED_RESET_ON_FORK hack
3469 */ 3492 */
3470 if (policy & SCHED_RESET_ON_FORK) { 3493 if (policy & SCHED_RESET_ON_FORK) {
3471 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; 3494 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
3472 policy &= ~SCHED_RESET_ON_FORK; 3495 policy &= ~SCHED_RESET_ON_FORK;
3473 attr.sched_policy = policy; 3496 attr.sched_policy = policy;
3474 } 3497 }
3475 3498
3476 return __sched_setscheduler(p, &attr, check); 3499 return __sched_setscheduler(p, &attr, check);
3477 } 3500 }
3478 /** 3501 /**
3479 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 3502 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
3480 * @p: the task in question. 3503 * @p: the task in question.
3481 * @policy: new policy. 3504 * @policy: new policy.
3482 * @param: structure containing the new RT priority. 3505 * @param: structure containing the new RT priority.
3483 * 3506 *
3484 * Return: 0 on success. An error code otherwise. 3507 * Return: 0 on success. An error code otherwise.
3485 * 3508 *
3486 * NOTE that the task may already be dead. 3509 * NOTE that the task may already be dead.
3487 */ 3510 */
3488 int sched_setscheduler(struct task_struct *p, int policy, 3511 int sched_setscheduler(struct task_struct *p, int policy,
3489 const struct sched_param *param) 3512 const struct sched_param *param)
3490 { 3513 {
3491 return _sched_setscheduler(p, policy, param, true); 3514 return _sched_setscheduler(p, policy, param, true);
3492 } 3515 }
3493 EXPORT_SYMBOL_GPL(sched_setscheduler); 3516 EXPORT_SYMBOL_GPL(sched_setscheduler);
3494 3517
3495 int sched_setattr(struct task_struct *p, const struct sched_attr *attr) 3518 int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
3496 { 3519 {
3497 return __sched_setscheduler(p, attr, true); 3520 return __sched_setscheduler(p, attr, true);
3498 } 3521 }
3499 EXPORT_SYMBOL_GPL(sched_setattr); 3522 EXPORT_SYMBOL_GPL(sched_setattr);
3500 3523
3501 /** 3524 /**
3502 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 3525 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
3503 * @p: the task in question. 3526 * @p: the task in question.
3504 * @policy: new policy. 3527 * @policy: new policy.
3505 * @param: structure containing the new RT priority. 3528 * @param: structure containing the new RT priority.
3506 * 3529 *
3507 * Just like sched_setscheduler, only don't bother checking if the 3530 * Just like sched_setscheduler, only don't bother checking if the
3508 * current context has permission. For example, this is needed in 3531 * current context has permission. For example, this is needed in
3509 * stop_machine(): we create temporary high priority worker threads, 3532 * stop_machine(): we create temporary high priority worker threads,
3510 * but our caller might not have that capability. 3533 * but our caller might not have that capability.
3511 * 3534 *
3512 * Return: 0 on success. An error code otherwise. 3535 * Return: 0 on success. An error code otherwise.
3513 */ 3536 */
3514 int sched_setscheduler_nocheck(struct task_struct *p, int policy, 3537 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
3515 const struct sched_param *param) 3538 const struct sched_param *param)
3516 { 3539 {
3517 return _sched_setscheduler(p, policy, param, false); 3540 return _sched_setscheduler(p, policy, param, false);
3518 } 3541 }
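/*
 * A minimal in-kernel sketch (not part of this diff) of the use case the
 * kernel-doc above describes: trusted kernel code promoting a worker thread
 * it already owns to SCHED_FIFO without a capability check. The function and
 * its "worker" argument are hypothetical; only sched_setscheduler_nocheck(),
 * struct sched_param and MAX_USER_RT_PRIO come from this file, and the usual
 * <linux/sched.h> declarations are assumed.
 */
static void example_make_worker_rt(struct task_struct *worker)
{
	struct sched_param sp = { .sched_priority = MAX_USER_RT_PRIO - 1 };

	/* No CAP_SYS_NICE or same-owner checks: callers are kernel code. */
	sched_setscheduler_nocheck(worker, SCHED_FIFO, &sp);
}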
3519 3542
3520 static int 3543 static int
3521 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 3544 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3522 { 3545 {
3523 struct sched_param lparam; 3546 struct sched_param lparam;
3524 struct task_struct *p; 3547 struct task_struct *p;
3525 int retval; 3548 int retval;
3526 3549
3527 if (!param || pid < 0) 3550 if (!param || pid < 0)
3528 return -EINVAL; 3551 return -EINVAL;
3529 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 3552 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
3530 return -EFAULT; 3553 return -EFAULT;
3531 3554
3532 rcu_read_lock(); 3555 rcu_read_lock();
3533 retval = -ESRCH; 3556 retval = -ESRCH;
3534 p = find_process_by_pid(pid); 3557 p = find_process_by_pid(pid);
3535 if (p != NULL) 3558 if (p != NULL)
3536 retval = sched_setscheduler(p, policy, &lparam); 3559 retval = sched_setscheduler(p, policy, &lparam);
3537 rcu_read_unlock(); 3560 rcu_read_unlock();
3538 3561
3539 return retval; 3562 return retval;
3540 } 3563 }
3541 3564
3542 /* 3565 /*
3543 * Mimics kernel/events/core.c perf_copy_attr(). 3566 * Mimics kernel/events/core.c perf_copy_attr().
3544 */ 3567 */
3545 static int sched_copy_attr(struct sched_attr __user *uattr, 3568 static int sched_copy_attr(struct sched_attr __user *uattr,
3546 struct sched_attr *attr) 3569 struct sched_attr *attr)
3547 { 3570 {
3548 u32 size; 3571 u32 size;
3549 int ret; 3572 int ret;
3550 3573
3551 if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) 3574 if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
3552 return -EFAULT; 3575 return -EFAULT;
3553 3576
3554 /* 3577 /*
3555 * zero the full structure, so that a short copy leaves the rest zeroed. 3578 * zero the full structure, so that a short copy leaves the rest zeroed.
3556 */ 3579 */
3557 memset(attr, 0, sizeof(*attr)); 3580 memset(attr, 0, sizeof(*attr));
3558 3581
3559 ret = get_user(size, &uattr->size); 3582 ret = get_user(size, &uattr->size);
3560 if (ret) 3583 if (ret)
3561 return ret; 3584 return ret;
3562 3585
3563 if (size > PAGE_SIZE) /* silly large */ 3586 if (size > PAGE_SIZE) /* silly large */
3564 goto err_size; 3587 goto err_size;
3565 3588
3566 if (!size) /* abi compat */ 3589 if (!size) /* abi compat */
3567 size = SCHED_ATTR_SIZE_VER0; 3590 size = SCHED_ATTR_SIZE_VER0;
3568 3591
3569 if (size < SCHED_ATTR_SIZE_VER0) 3592 if (size < SCHED_ATTR_SIZE_VER0)
3570 goto err_size; 3593 goto err_size;
3571 3594
3572 /* 3595 /*
3573 * If we're handed a bigger struct than we know of, 3596 * If we're handed a bigger struct than we know of,
3574 * ensure all the unknown bits are 0 - i.e. new 3597 * ensure all the unknown bits are 0 - i.e. new
3575 * user-space does not rely on any kernel feature 3598 * user-space does not rely on any kernel feature
3576 * extensions we don't know about yet. 3599 * extensions we don't know about yet.
3577 */ 3600 */
3578 if (size > sizeof(*attr)) { 3601 if (size > sizeof(*attr)) {
3579 unsigned char __user *addr; 3602 unsigned char __user *addr;
3580 unsigned char __user *end; 3603 unsigned char __user *end;
3581 unsigned char val; 3604 unsigned char val;
3582 3605
3583 addr = (void __user *)uattr + sizeof(*attr); 3606 addr = (void __user *)uattr + sizeof(*attr);
3584 end = (void __user *)uattr + size; 3607 end = (void __user *)uattr + size;
3585 3608
3586 for (; addr < end; addr++) { 3609 for (; addr < end; addr++) {
3587 ret = get_user(val, addr); 3610 ret = get_user(val, addr);
3588 if (ret) 3611 if (ret)
3589 return ret; 3612 return ret;
3590 if (val) 3613 if (val)
3591 goto err_size; 3614 goto err_size;
3592 } 3615 }
3593 size = sizeof(*attr); 3616 size = sizeof(*attr);
3594 } 3617 }
3595 3618
3596 ret = copy_from_user(attr, uattr, size); 3619 ret = copy_from_user(attr, uattr, size);
3597 if (ret) 3620 if (ret)
3598 return -EFAULT; 3621 return -EFAULT;
3599 3622
3600 /* 3623 /*
3601 * XXX: do we want to be lenient like existing syscalls; or do we want 3624 * XXX: do we want to be lenient like existing syscalls; or do we want
3602 * to be strict and return an error on out-of-bounds values? 3625 * to be strict and return an error on out-of-bounds values?
3603 */ 3626 */
3604 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); 3627 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
3605 3628
3606 out: 3629 out:
3607 return ret; 3630 return ret;
3608 3631
3609 err_size: 3632 err_size:
3610 put_user(sizeof(*attr), &uattr->size); 3633 put_user(sizeof(*attr), &uattr->size);
3611 ret = -E2BIG; 3634 ret = -E2BIG;
3612 goto out; 3635 goto out;
3613 } 3636 }
3614 3637
3615 /** 3638 /**
3616 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 3639 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
3617 * @pid: the pid in question. 3640 * @pid: the pid in question.
3618 * @policy: new policy. 3641 * @policy: new policy.
3619 * @param: structure containing the new RT priority. 3642 * @param: structure containing the new RT priority.
3620 * 3643 *
3621 * Return: 0 on success. An error code otherwise. 3644 * Return: 0 on success. An error code otherwise.
3622 */ 3645 */
3623 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, 3646 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
3624 struct sched_param __user *, param) 3647 struct sched_param __user *, param)
3625 { 3648 {
3626 /* negative values for policy are not valid */ 3649 /* negative values for policy are not valid */
3627 if (policy < 0) 3650 if (policy < 0)
3628 return -EINVAL; 3651 return -EINVAL;
3629 3652
3630 return do_sched_setscheduler(pid, policy, param); 3653 return do_sched_setscheduler(pid, policy, param);
3631 } 3654 }
3632 3655
3633 /** 3656 /**
3634 * sys_sched_setparam - set/change the RT priority of a thread 3657 * sys_sched_setparam - set/change the RT priority of a thread
3635 * @pid: the pid in question. 3658 * @pid: the pid in question.
3636 * @param: structure containing the new RT priority. 3659 * @param: structure containing the new RT priority.
3637 * 3660 *
3638 * Return: 0 on success. An error code otherwise. 3661 * Return: 0 on success. An error code otherwise.
3639 */ 3662 */
3640 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 3663 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3641 { 3664 {
3642 return do_sched_setscheduler(pid, -1, param); 3665 return do_sched_setscheduler(pid, -1, param);
3643 } 3666 }
3644 3667
3645 /** 3668 /**
3646 * sys_sched_setattr - same as above, but with extended sched_attr 3669 * sys_sched_setattr - same as above, but with extended sched_attr
3647 * @pid: the pid in question. 3670 * @pid: the pid in question.
3648 * @uattr: structure containing the extended parameters. 3671 * @uattr: structure containing the extended parameters.
3649 * @flags: for future extension. 3672 * @flags: for future extension.
3650 */ 3673 */
3651 SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, 3674 SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
3652 unsigned int, flags) 3675 unsigned int, flags)
3653 { 3676 {
3654 struct sched_attr attr; 3677 struct sched_attr attr;
3655 struct task_struct *p; 3678 struct task_struct *p;
3656 int retval; 3679 int retval;
3657 3680
3658 if (!uattr || pid < 0 || flags) 3681 if (!uattr || pid < 0 || flags)
3659 return -EINVAL; 3682 return -EINVAL;
3660 3683
3661 if (sched_copy_attr(uattr, &attr)) 3684 retval = sched_copy_attr(uattr, &attr);
3662 return -EFAULT; 3685 if (retval)
3686 return retval;
3663 3687
3688 if (attr.sched_policy < 0)
3689 return -EINVAL;
3690
3664 rcu_read_lock(); 3691 rcu_read_lock();
3665 retval = -ESRCH; 3692 retval = -ESRCH;
3666 p = find_process_by_pid(pid); 3693 p = find_process_by_pid(pid);
3667 if (p != NULL) 3694 if (p != NULL)
3668 retval = sched_setattr(p, &attr); 3695 retval = sched_setattr(p, &attr);
3669 rcu_read_unlock(); 3696 rcu_read_unlock();
3670 3697
3671 return retval; 3698 return retval;
3672 } 3699 }
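/*
 * A hedged userspace sketch (not part of this diff): there is no glibc
 * wrapper for sched_setattr(2) here, so it is invoked through syscall(2).
 * The struct layout below is an assumption meant to mirror the kernel's
 * struct sched_attr (size, sched_policy, sched_flags, sched_nice,
 * sched_priority plus the deadline fields); verify it against the headers of
 * the kernel actually running. __NR_sched_setattr must be provided by
 * sufficiently new kernel headers.
 */
#define _GNU_SOURCE
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE 6		/* assumed to match the kernel uapi value */
#endif

struct sched_attr_example {
	uint32_t size;			/* size of this structure */
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;		/* SCHED_NORMAL, SCHED_BATCH */
	uint32_t sched_priority;	/* SCHED_FIFO, SCHED_RR */
	uint64_t sched_runtime;		/* SCHED_DEADLINE, nanoseconds */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

/*
 * Ask for SCHED_DEADLINE with 10ms of runtime every 30ms. Per the checks in
 * __sched_setscheduler() above, this needs CAP_SYS_NICE, an affinity mask
 * spanning the whole root_domain, and enough free deadline bandwidth.
 */
static int example_become_deadline(void)
{
	struct sched_attr_example attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy = SCHED_DEADLINE;
	attr.sched_runtime  = 10 * 1000 * 1000;
	attr.sched_deadline = 30 * 1000 * 1000;
	attr.sched_period   = 30 * 1000 * 1000;

	/* pid 0 means the calling thread; flags must currently be 0. */
	return syscall(__NR_sched_setattr, 0, &attr, 0);
}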
3673 3700
3674 /** 3701 /**
3675 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 3702 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
3676 * @pid: the pid in question. 3703 * @pid: the pid in question.
3677 * 3704 *
3678 * Return: On success, the policy of the thread. Otherwise, a negative error 3705 * Return: On success, the policy of the thread. Otherwise, a negative error
3679 * code. 3706 * code.
3680 */ 3707 */
3681 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 3708 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
3682 { 3709 {
3683 struct task_struct *p; 3710 struct task_struct *p;
3684 int retval; 3711 int retval;
3685 3712
3686 if (pid < 0) 3713 if (pid < 0)
3687 return -EINVAL; 3714 return -EINVAL;
3688 3715
3689 retval = -ESRCH; 3716 retval = -ESRCH;
3690 rcu_read_lock(); 3717 rcu_read_lock();
3691 p = find_process_by_pid(pid); 3718 p = find_process_by_pid(pid);
3692 if (p) { 3719 if (p) {
3693 retval = security_task_getscheduler(p); 3720 retval = security_task_getscheduler(p);
3694 if (!retval) 3721 if (!retval)
3695 retval = p->policy 3722 retval = p->policy
3696 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 3723 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
3697 } 3724 }
3698 rcu_read_unlock(); 3725 rcu_read_unlock();
3699 return retval; 3726 return retval;
3700 } 3727 }
3701 3728
3702 /** 3729 /**
3703 * sys_sched_getparam - get the RT priority of a thread 3730 * sys_sched_getparam - get the RT priority of a thread
3704 * @pid: the pid in question. 3731 * @pid: the pid in question.
3705 * @param: structure containing the RT priority. 3732 * @param: structure containing the RT priority.
3706 * 3733 *
3707 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error 3734 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
3708 * code. 3735 * code.
3709 */ 3736 */
3710 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 3737 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
3711 { 3738 {
3712 struct sched_param lp; 3739 struct sched_param lp = { .sched_priority = 0 };
3713 struct task_struct *p; 3740 struct task_struct *p;
3714 int retval; 3741 int retval;
3715 3742
3716 if (!param || pid < 0) 3743 if (!param || pid < 0)
3717 return -EINVAL; 3744 return -EINVAL;
3718 3745
3719 rcu_read_lock(); 3746 rcu_read_lock();
3720 p = find_process_by_pid(pid); 3747 p = find_process_by_pid(pid);
3721 retval = -ESRCH; 3748 retval = -ESRCH;
3722 if (!p) 3749 if (!p)
3723 goto out_unlock; 3750 goto out_unlock;
3724 3751
3725 retval = security_task_getscheduler(p); 3752 retval = security_task_getscheduler(p);
3726 if (retval) 3753 if (retval)
3727 goto out_unlock; 3754 goto out_unlock;
3728 3755
3729 if (task_has_dl_policy(p)) { 3756 if (task_has_rt_policy(p))
3730 retval = -EINVAL; 3757 lp.sched_priority = p->rt_priority;
3731 goto out_unlock;
3732 }
3733 lp.sched_priority = p->rt_priority;
3734 rcu_read_unlock(); 3758 rcu_read_unlock();
3735 3759
3736 /* 3760 /*
3737 * This one might sleep, we cannot do it with a spinlock held ... 3761 * This one might sleep, we cannot do it with a spinlock held ...
3738 */ 3762 */
3739 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 3763 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
3740 3764
3741 return retval; 3765 return retval;
3742 3766
3743 out_unlock: 3767 out_unlock:
3744 rcu_read_unlock(); 3768 rcu_read_unlock();
3745 return retval; 3769 return retval;
3746 } 3770 }
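/*
 * A userspace sketch (not from this diff) using the glibc wrapper for
 * sched_getparam(2). For SCHED_FIFO/SCHED_RR threads it returns the RT
 * priority; for everything else (including, after the change above,
 * SCHED_DEADLINE tasks) sched_priority simply comes back as 0.
 */
#include <sched.h>

static int example_query_rt_priority(pid_t pid)
{
	struct sched_param sp;

	if (sched_getparam(pid, &sp))	/* pid 0 means the calling thread */
		return -1;

	return sp.sched_priority;
}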
3747 3771
3748 static int sched_read_attr(struct sched_attr __user *uattr, 3772 static int sched_read_attr(struct sched_attr __user *uattr,
3749 struct sched_attr *attr, 3773 struct sched_attr *attr,
3750 unsigned int usize) 3774 unsigned int usize)
3751 { 3775 {
3752 int ret; 3776 int ret;
3753 3777
3754 if (!access_ok(VERIFY_WRITE, uattr, usize)) 3778 if (!access_ok(VERIFY_WRITE, uattr, usize))
3755 return -EFAULT; 3779 return -EFAULT;
3756 3780
3757 /* 3781 /*
3758 * If we're handed a smaller struct than we know of, 3782 * If we're handed a smaller struct than we know of,
3759 * ensure all the unknown bits are 0 - i.e. old 3783 * ensure all the unknown bits are 0 - i.e. old
3760 * user-space does not get incomplete information. 3784 * user-space does not get incomplete information.
3761 */ 3785 */
3762 if (usize < sizeof(*attr)) { 3786 if (usize < sizeof(*attr)) {
3763 unsigned char *addr; 3787 unsigned char *addr;
3764 unsigned char *end; 3788 unsigned char *end;
3765 3789
3766 addr = (void *)attr + usize; 3790 addr = (void *)attr + usize;
3767 end = (void *)attr + sizeof(*attr); 3791 end = (void *)attr + sizeof(*attr);
3768 3792
3769 for (; addr < end; addr++) { 3793 for (; addr < end; addr++) {
3770 if (*addr) 3794 if (*addr)
3771 goto err_size; 3795 goto err_size;
3772 } 3796 }
3773 3797
3774 attr->size = usize; 3798 attr->size = usize;
3775 } 3799 }
3776 3800
3777 ret = copy_to_user(uattr, attr, attr->size); 3801 ret = copy_to_user(uattr, attr, attr->size);
3778 if (ret) 3802 if (ret)
3779 return -EFAULT; 3803 return -EFAULT;
3780 3804
3781 out: 3805 out:
3782 return ret; 3806 return ret;
3783 3807
3784 err_size: 3808 err_size:
3785 ret = -E2BIG; 3809 ret = -E2BIG;
3786 goto out; 3810 goto out;
3787 } 3811 }
3788 3812
3789 /** 3813 /**
3790 * sys_sched_getattr - similar to sched_getparam, but with sched_attr 3814 * sys_sched_getattr - similar to sched_getparam, but with sched_attr
3791 * @pid: the pid in question. 3815 * @pid: the pid in question.
3792 * @uattr: structure containing the extended parameters. 3816 * @uattr: structure containing the extended parameters.
3793 * @size: sizeof(attr) for fwd/bwd comp. 3817 * @size: sizeof(attr) for fwd/bwd comp.
3794 * @flags: for future extension. 3818 * @flags: for future extension.
3795 */ 3819 */
3796 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, 3820 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
3797 unsigned int, size, unsigned int, flags) 3821 unsigned int, size, unsigned int, flags)
3798 { 3822 {
3799 struct sched_attr attr = { 3823 struct sched_attr attr = {
3800 .size = sizeof(struct sched_attr), 3824 .size = sizeof(struct sched_attr),
3801 }; 3825 };
3802 struct task_struct *p; 3826 struct task_struct *p;
3803 int retval; 3827 int retval;
3804 3828
3805 if (!uattr || pid < 0 || size > PAGE_SIZE || 3829 if (!uattr || pid < 0 || size > PAGE_SIZE ||
3806 size < SCHED_ATTR_SIZE_VER0 || flags) 3830 size < SCHED_ATTR_SIZE_VER0 || flags)
3807 return -EINVAL; 3831 return -EINVAL;
3808 3832
3809 rcu_read_lock(); 3833 rcu_read_lock();
3810 p = find_process_by_pid(pid); 3834 p = find_process_by_pid(pid);
3811 retval = -ESRCH; 3835 retval = -ESRCH;
3812 if (!p) 3836 if (!p)
3813 goto out_unlock; 3837 goto out_unlock;
3814 3838
3815 retval = security_task_getscheduler(p); 3839 retval = security_task_getscheduler(p);
3816 if (retval) 3840 if (retval)
3817 goto out_unlock; 3841 goto out_unlock;
3818 3842
3819 attr.sched_policy = p->policy; 3843 attr.sched_policy = p->policy;
3820 if (p->sched_reset_on_fork) 3844 if (p->sched_reset_on_fork)
3821 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; 3845 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
3822 if (task_has_dl_policy(p)) 3846 if (task_has_dl_policy(p))
3823 __getparam_dl(p, &attr); 3847 __getparam_dl(p, &attr);
3824 else if (task_has_rt_policy(p)) 3848 else if (task_has_rt_policy(p))
3825 attr.sched_priority = p->rt_priority; 3849 attr.sched_priority = p->rt_priority;
3826 else 3850 else
3827 attr.sched_nice = task_nice(p); 3851 attr.sched_nice = task_nice(p);
3828 3852
3829 rcu_read_unlock(); 3853 rcu_read_unlock();
3830 3854
3831 retval = sched_read_attr(uattr, &attr, size); 3855 retval = sched_read_attr(uattr, &attr, size);
3832 return retval; 3856 return retval;
3833 3857
3834 out_unlock: 3858 out_unlock:
3835 rcu_read_unlock(); 3859 rcu_read_unlock();
3836 return retval; 3860 return retval;
3837 } 3861 }
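/*
 * Companion userspace sketch for sched_getattr(2): read the attributes back.
 * It reuses the hypothetical sched_attr_example layout from the
 * sched_setattr sketch above and again goes through syscall(2), since no
 * glibc wrapper exists; __NR_sched_getattr is assumed to be defined by the
 * kernel headers.
 */
static int example_read_attr(pid_t pid, struct sched_attr_example *attr)
{
	/* (pid, uattr, size, flags): size enables fwd/bwd compatibility. */
	return syscall(__NR_sched_getattr, pid, attr, sizeof(*attr), 0);
}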
3838 3862
3839 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 3863 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3840 { 3864 {
3841 cpumask_var_t cpus_allowed, new_mask; 3865 cpumask_var_t cpus_allowed, new_mask;
3842 struct task_struct *p; 3866 struct task_struct *p;
3843 int retval; 3867 int retval;
3844 3868
3845 rcu_read_lock(); 3869 rcu_read_lock();
3846 3870
3847 p = find_process_by_pid(pid); 3871 p = find_process_by_pid(pid);
3848 if (!p) { 3872 if (!p) {
3849 rcu_read_unlock(); 3873 rcu_read_unlock();
3850 return -ESRCH; 3874 return -ESRCH;
3851 } 3875 }
3852 3876
3853 /* Prevent p going away */ 3877 /* Prevent p going away */
3854 get_task_struct(p); 3878 get_task_struct(p);
3855 rcu_read_unlock(); 3879 rcu_read_unlock();
3856 3880
3857 if (p->flags & PF_NO_SETAFFINITY) { 3881 if (p->flags & PF_NO_SETAFFINITY) {
3858 retval = -EINVAL; 3882 retval = -EINVAL;
3859 goto out_put_task; 3883 goto out_put_task;
3860 } 3884 }
3861 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 3885 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
3862 retval = -ENOMEM; 3886 retval = -ENOMEM;
3863 goto out_put_task; 3887 goto out_put_task;
3864 } 3888 }
3865 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { 3889 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
3866 retval = -ENOMEM; 3890 retval = -ENOMEM;
3867 goto out_free_cpus_allowed; 3891 goto out_free_cpus_allowed;
3868 } 3892 }
3869 retval = -EPERM; 3893 retval = -EPERM;
3870 if (!check_same_owner(p)) { 3894 if (!check_same_owner(p)) {
3871 rcu_read_lock(); 3895 rcu_read_lock();
3872 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { 3896 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
3873 rcu_read_unlock(); 3897 rcu_read_unlock();
3874 goto out_unlock; 3898 goto out_unlock;
3875 } 3899 }
3876 rcu_read_unlock(); 3900 rcu_read_unlock();
3877 } 3901 }
3878 3902
3879 retval = security_task_setscheduler(p); 3903 retval = security_task_setscheduler(p);
3880 if (retval) 3904 if (retval)
3881 goto out_unlock; 3905 goto out_unlock;
3882 3906
3883 3907
3884 cpuset_cpus_allowed(p, cpus_allowed); 3908 cpuset_cpus_allowed(p, cpus_allowed);
3885 cpumask_and(new_mask, in_mask, cpus_allowed); 3909 cpumask_and(new_mask, in_mask, cpus_allowed);
3886 3910
3887 /* 3911 /*
3888 * Since bandwidth control happens on a per-root_domain basis, 3912 * Since bandwidth control happens on a per-root_domain basis,
3889 * if the admission test is enabled we only admit -deadline 3913 * if the admission test is enabled we only admit -deadline
3890 * tasks that are allowed to run on all the CPUs in the task's 3914 * tasks that are allowed to run on all the CPUs in the task's
3891 * root_domain. 3915 * root_domain.
3892 */ 3916 */
3893 #ifdef CONFIG_SMP 3917 #ifdef CONFIG_SMP
3894 if (task_has_dl_policy(p)) { 3918 if (task_has_dl_policy(p)) {
3895 const struct cpumask *span = task_rq(p)->rd->span; 3919 const struct cpumask *span = task_rq(p)->rd->span;
3896 3920
3897 if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) { 3921 if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) {
3898 retval = -EBUSY; 3922 retval = -EBUSY;
3899 goto out_unlock; 3923 goto out_unlock;
3900 } 3924 }
3901 } 3925 }
3902 #endif 3926 #endif
3903 again: 3927 again:
3904 retval = set_cpus_allowed_ptr(p, new_mask); 3928 retval = set_cpus_allowed_ptr(p, new_mask);
3905 3929
3906 if (!retval) { 3930 if (!retval) {
3907 cpuset_cpus_allowed(p, cpus_allowed); 3931 cpuset_cpus_allowed(p, cpus_allowed);
3908 if (!cpumask_subset(new_mask, cpus_allowed)) { 3932 if (!cpumask_subset(new_mask, cpus_allowed)) {
3909 /* 3933 /*
3910 * We must have raced with a concurrent cpuset 3934 * We must have raced with a concurrent cpuset
3911 * update. Just reset the cpus_allowed to the 3935 * update. Just reset the cpus_allowed to the
3912 * cpuset's cpus_allowed 3936 * cpuset's cpus_allowed
3913 */ 3937 */
3914 cpumask_copy(new_mask, cpus_allowed); 3938 cpumask_copy(new_mask, cpus_allowed);
3915 goto again; 3939 goto again;
3916 } 3940 }
3917 } 3941 }
3918 out_unlock: 3942 out_unlock:
3919 free_cpumask_var(new_mask); 3943 free_cpumask_var(new_mask);
3920 out_free_cpus_allowed: 3944 out_free_cpus_allowed:
3921 free_cpumask_var(cpus_allowed); 3945 free_cpumask_var(cpus_allowed);
3922 out_put_task: 3946 out_put_task:
3923 put_task_struct(p); 3947 put_task_struct(p);
3924 return retval; 3948 return retval;
3925 } 3949 }
3926 3950
3927 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 3951 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
3928 struct cpumask *new_mask) 3952 struct cpumask *new_mask)
3929 { 3953 {
3930 if (len < cpumask_size()) 3954 if (len < cpumask_size())
3931 cpumask_clear(new_mask); 3955 cpumask_clear(new_mask);
3932 else if (len > cpumask_size()) 3956 else if (len > cpumask_size())
3933 len = cpumask_size(); 3957 len = cpumask_size();
3934 3958
3935 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 3959 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
3936 } 3960 }
3937 3961
3938 /** 3962 /**
3939 * sys_sched_setaffinity - set the cpu affinity of a process 3963 * sys_sched_setaffinity - set the cpu affinity of a process
3940 * @pid: pid of the process 3964 * @pid: pid of the process
3941 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 3965 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3942 * @user_mask_ptr: user-space pointer to the new cpu mask 3966 * @user_mask_ptr: user-space pointer to the new cpu mask
3943 * 3967 *
3944 * Return: 0 on success. An error code otherwise. 3968 * Return: 0 on success. An error code otherwise.
3945 */ 3969 */
3946 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 3970 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
3947 unsigned long __user *, user_mask_ptr) 3971 unsigned long __user *, user_mask_ptr)
3948 { 3972 {
3949 cpumask_var_t new_mask; 3973 cpumask_var_t new_mask;
3950 int retval; 3974 int retval;
3951 3975
3952 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 3976 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
3953 return -ENOMEM; 3977 return -ENOMEM;
3954 3978
3955 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); 3979 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
3956 if (retval == 0) 3980 if (retval == 0)
3957 retval = sched_setaffinity(pid, new_mask); 3981 retval = sched_setaffinity(pid, new_mask);
3958 free_cpumask_var(new_mask); 3982 free_cpumask_var(new_mask);
3959 return retval; 3983 return retval;
3960 } 3984 }
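/*
 * A userspace sketch (not part of this diff) pinning the calling thread to
 * CPU 0 with the glibc wrapper for sched_setaffinity(2). Note that, per the
 * check above, a SCHED_DEADLINE task shrinking its mask below the
 * root_domain span would get -EBUSY instead.
 */
#define _GNU_SOURCE
#include <sched.h>

static int example_pin_to_cpu0(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);

	/* pid 0 means the calling thread; len is the mask size in bytes. */
	return sched_setaffinity(0, sizeof(set), &set);
}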
3961 3985
3962 long sched_getaffinity(pid_t pid, struct cpumask *mask) 3986 long sched_getaffinity(pid_t pid, struct cpumask *mask)
3963 { 3987 {
3964 struct task_struct *p; 3988 struct task_struct *p;
3965 unsigned long flags; 3989 unsigned long flags;
3966 int retval; 3990 int retval;
3967 3991
3968 rcu_read_lock(); 3992 rcu_read_lock();
3969 3993
3970 retval = -ESRCH; 3994 retval = -ESRCH;
3971 p = find_process_by_pid(pid); 3995 p = find_process_by_pid(pid);
3972 if (!p) 3996 if (!p)
3973 goto out_unlock; 3997 goto out_unlock;
3974 3998
3975 retval = security_task_getscheduler(p); 3999 retval = security_task_getscheduler(p);
3976 if (retval) 4000 if (retval)
3977 goto out_unlock; 4001 goto out_unlock;
3978 4002
3979 raw_spin_lock_irqsave(&p->pi_lock, flags); 4003 raw_spin_lock_irqsave(&p->pi_lock, flags);
3980 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); 4004 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
3981 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4005 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3982 4006
3983 out_unlock: 4007 out_unlock:
3984 rcu_read_unlock(); 4008 rcu_read_unlock();
3985 4009
3986 return retval; 4010 return retval;
3987 } 4011 }
3988 4012
3989 /** 4013 /**
3990 * sys_sched_getaffinity - get the cpu affinity of a process 4014 * sys_sched_getaffinity - get the cpu affinity of a process
3991 * @pid: pid of the process 4015 * @pid: pid of the process
3992 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4016 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3993 * @user_mask_ptr: user-space pointer to hold the current cpu mask 4017 * @user_mask_ptr: user-space pointer to hold the current cpu mask
3994 * 4018 *
3995 * Return: 0 on success. An error code otherwise. 4019 * Return: 0 on success. An error code otherwise.
3996 */ 4020 */
3997 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 4021 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
3998 unsigned long __user *, user_mask_ptr) 4022 unsigned long __user *, user_mask_ptr)
3999 { 4023 {
4000 int ret; 4024 int ret;
4001 cpumask_var_t mask; 4025 cpumask_var_t mask;
4002 4026
4003 if ((len * BITS_PER_BYTE) < nr_cpu_ids) 4027 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4004 return -EINVAL; 4028 return -EINVAL;
4005 if (len & (sizeof(unsigned long)-1)) 4029 if (len & (sizeof(unsigned long)-1))
4006 return -EINVAL; 4030 return -EINVAL;
4007 4031
4008 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 4032 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
4009 return -ENOMEM; 4033 return -ENOMEM;
4010 4034
4011 ret = sched_getaffinity(pid, mask); 4035 ret = sched_getaffinity(pid, mask);
4012 if (ret == 0) { 4036 if (ret == 0) {
4013 size_t retlen = min_t(size_t, len, cpumask_size()); 4037 size_t retlen = min_t(size_t, len, cpumask_size());
4014 4038
4015 if (copy_to_user(user_mask_ptr, mask, retlen)) 4039 if (copy_to_user(user_mask_ptr, mask, retlen))
4016 ret = -EFAULT; 4040 ret = -EFAULT;
4017 else 4041 else
4018 ret = retlen; 4042 ret = retlen;
4019 } 4043 }
4020 free_cpumask_var(mask); 4044 free_cpumask_var(mask);
4021 4045
4022 return ret; 4046 return ret;
4023 } 4047 }
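/*
 * Companion sketch reading the mask back via the glibc wrapper for
 * sched_getaffinity(2). The raw syscall above returns the copied length on
 * success; the glibc wrapper converts that to 0.
 */
#define _GNU_SOURCE
#include <sched.h>

static int example_count_allowed_cpus(void)
{
	cpu_set_t set;

	if (sched_getaffinity(0, sizeof(set), &set))
		return -1;

	return CPU_COUNT(&set);		/* number of CPUs set in the mask */
}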
4024 4048
4025 /** 4049 /**
4026 * sys_sched_yield - yield the current processor to other threads. 4050 * sys_sched_yield - yield the current processor to other threads.
4027 * 4051 *
4028 * This function yields the current CPU to other tasks. If there are no 4052 * This function yields the current CPU to other tasks. If there are no
4029 * other threads running on this CPU, then this function will return. 4053 * other threads running on this CPU, then this function will return.
4030 * 4054 *
4031 * Return: 0. 4055 * Return: 0.
4032 */ 4056 */
4033 SYSCALL_DEFINE0(sched_yield) 4057 SYSCALL_DEFINE0(sched_yield)
4034 { 4058 {
4035 struct rq *rq = this_rq_lock(); 4059 struct rq *rq = this_rq_lock();
4036 4060
4037 schedstat_inc(rq, yld_count); 4061 schedstat_inc(rq, yld_count);
4038 current->sched_class->yield_task(rq); 4062 current->sched_class->yield_task(rq);
4039 4063
4040 /* 4064 /*
4041 * Since we are going to call schedule() anyway, there's 4065 * Since we are going to call schedule() anyway, there's
4042 * no need to preempt or enable interrupts: 4066 * no need to preempt or enable interrupts:
4043 */ 4067 */
4044 __release(rq->lock); 4068 __release(rq->lock);
4045 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 4069 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4046 do_raw_spin_unlock(&rq->lock); 4070 do_raw_spin_unlock(&rq->lock);
4047 sched_preempt_enable_no_resched(); 4071 sched_preempt_enable_no_resched();
4048 4072
4049 schedule(); 4073 schedule();
4050 4074
4051 return 0; 4075 return 0;
4052 } 4076 }
4053 4077
4054 static void __cond_resched(void) 4078 static void __cond_resched(void)
4055 { 4079 {
4056 __preempt_count_add(PREEMPT_ACTIVE); 4080 __preempt_count_add(PREEMPT_ACTIVE);
4057 __schedule(); 4081 __schedule();
4058 __preempt_count_sub(PREEMPT_ACTIVE); 4082 __preempt_count_sub(PREEMPT_ACTIVE);
4059 } 4083 }
4060 4084
4061 int __sched _cond_resched(void) 4085 int __sched _cond_resched(void)
4062 { 4086 {
4063 if (should_resched()) { 4087 if (should_resched()) {
4064 __cond_resched(); 4088 __cond_resched();
4065 return 1; 4089 return 1;
4066 } 4090 }
4067 return 0; 4091 return 0;
4068 } 4092 }
4069 EXPORT_SYMBOL(_cond_resched); 4093 EXPORT_SYMBOL(_cond_resched);
4070 4094
4071 /* 4095 /*
4072 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 4096 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
4073 * call schedule, and on return reacquire the lock. 4097 * call schedule, and on return reacquire the lock.
4074 * 4098 *
4075 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 4099 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4076 * operations here to prevent schedule() from being called twice (once via 4100 * operations here to prevent schedule() from being called twice (once via
4077 * spin_unlock(), once by hand). 4101 * spin_unlock(), once by hand).
4078 */ 4102 */
4079 int __cond_resched_lock(spinlock_t *lock) 4103 int __cond_resched_lock(spinlock_t *lock)
4080 { 4104 {
4081 int resched = should_resched(); 4105 int resched = should_resched();
4082 int ret = 0; 4106 int ret = 0;
4083 4107
4084 lockdep_assert_held(lock); 4108 lockdep_assert_held(lock);
4085 4109
4086 if (spin_needbreak(lock) || resched) { 4110 if (spin_needbreak(lock) || resched) {
4087 spin_unlock(lock); 4111 spin_unlock(lock);
4088 if (resched) 4112 if (resched)
4089 __cond_resched(); 4113 __cond_resched();
4090 else 4114 else
4091 cpu_relax(); 4115 cpu_relax();
4092 ret = 1; 4116 ret = 1;
4093 spin_lock(lock); 4117 spin_lock(lock);
4094 } 4118 }
4095 return ret; 4119 return ret;
4096 } 4120 }
4097 EXPORT_SYMBOL(__cond_resched_lock); 4121 EXPORT_SYMBOL(__cond_resched_lock);
4098 4122
4099 int __sched __cond_resched_softirq(void) 4123 int __sched __cond_resched_softirq(void)
4100 { 4124 {
4101 BUG_ON(!in_softirq()); 4125 BUG_ON(!in_softirq());
4102 4126
4103 if (should_resched()) { 4127 if (should_resched()) {
4104 local_bh_enable(); 4128 local_bh_enable();
4105 __cond_resched(); 4129 __cond_resched();
4106 local_bh_disable(); 4130 local_bh_disable();
4107 return 1; 4131 return 1;
4108 } 4132 }
4109 return 0; 4133 return 0;
4110 } 4134 }
4111 EXPORT_SYMBOL(__cond_resched_softirq); 4135 EXPORT_SYMBOL(__cond_resched_softirq);
4112 4136
4113 /** 4137 /**
4114 * yield - yield the current processor to other threads. 4138 * yield - yield the current processor to other threads.
4115 * 4139 *
4116 * Do not ever use this function, there's a 99% chance you're doing it wrong. 4140 * Do not ever use this function, there's a 99% chance you're doing it wrong.
4117 * 4141 *
4118 * The scheduler is at all times free to pick the calling task as the most 4142 * The scheduler is at all times free to pick the calling task as the most
4119 * eligible task to run, if removing the yield() call from your code breaks 4143 * eligible task to run, if removing the yield() call from your code breaks
4120 * it, it's already broken. 4144 * it, it's already broken.
4121 * 4145 *
4122 * Typical broken usage is: 4146 * Typical broken usage is:
4123 * 4147 *
4124 * while (!event) 4148 * while (!event)
4125 * yield(); 4149 * yield();
4126 * 4150 *
4127 * where one assumes that yield() will let 'the other' process run that will 4151 * where one assumes that yield() will let 'the other' process run that will
4128 * make event true. If the current task is a SCHED_FIFO task, that will never 4152 * make event true. If the current task is a SCHED_FIFO task, that will never
4129 * happen. Never use yield() as a progress guarantee!! 4153 * happen. Never use yield() as a progress guarantee!!
4130 * 4154 *
4131 * If you want to use yield() to wait for something, use wait_event(). 4155 * If you want to use yield() to wait for something, use wait_event().
4132 * If you want to use yield() to be 'nice' for others, use cond_resched(). 4156 * If you want to use yield() to be 'nice' for others, use cond_resched().
4133 * If you still want to use yield(), do not! 4157 * If you still want to use yield(), do not!
4134 */ 4158 */
4135 void __sched yield(void) 4159 void __sched yield(void)
4136 { 4160 {
4137 set_current_state(TASK_RUNNING); 4161 set_current_state(TASK_RUNNING);
4138 sys_sched_yield(); 4162 sys_sched_yield();
4139 } 4163 }
4140 EXPORT_SYMBOL(yield); 4164 EXPORT_SYMBOL(yield);
4141 4165
4142 /** 4166 /**
4143 * yield_to - yield the current processor to another thread in 4167 * yield_to - yield the current processor to another thread in
4144 * your thread group, or accelerate that thread toward the 4168 * your thread group, or accelerate that thread toward the
4145 * processor it's on. 4169 * processor it's on.
4146 * @p: target task 4170 * @p: target task
4147 * @preempt: whether task preemption is allowed or not 4171 * @preempt: whether task preemption is allowed or not
4148 * 4172 *
4149 * It's the caller's job to ensure that the target task struct 4173 * It's the caller's job to ensure that the target task struct
4150 * can't go away on us before we can do any checks. 4174 * can't go away on us before we can do any checks.
4151 * 4175 *
4152 * Return: 4176 * Return:
4153 * true (>0) if we indeed boosted the target task. 4177 * true (>0) if we indeed boosted the target task.
4154 * false (0) if we failed to boost the target. 4178 * false (0) if we failed to boost the target.
4155 * -ESRCH if there's no task to yield to. 4179 * -ESRCH if there's no task to yield to.
4156 */ 4180 */
4157 bool __sched yield_to(struct task_struct *p, bool preempt) 4181 bool __sched yield_to(struct task_struct *p, bool preempt)
4158 { 4182 {
4159 struct task_struct *curr = current; 4183 struct task_struct *curr = current;
4160 struct rq *rq, *p_rq; 4184 struct rq *rq, *p_rq;
4161 unsigned long flags; 4185 unsigned long flags;
4162 int yielded = 0; 4186 int yielded = 0;
4163 4187
4164 local_irq_save(flags); 4188 local_irq_save(flags);
4165 rq = this_rq(); 4189 rq = this_rq();
4166 4190
4167 again: 4191 again:
4168 p_rq = task_rq(p); 4192 p_rq = task_rq(p);
4169 /* 4193 /*
4170 * If we're the only runnable task on the rq and target rq also 4194 * If we're the only runnable task on the rq and target rq also
4171 * has only one task, there's absolutely no point in yielding. 4195 * has only one task, there's absolutely no point in yielding.
4172 */ 4196 */
4173 if (rq->nr_running == 1 && p_rq->nr_running == 1) { 4197 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
4174 yielded = -ESRCH; 4198 yielded = -ESRCH;
4175 goto out_irq; 4199 goto out_irq;
4176 } 4200 }
4177 4201
4178 double_rq_lock(rq, p_rq); 4202 double_rq_lock(rq, p_rq);
4179 if (task_rq(p) != p_rq) { 4203 if (task_rq(p) != p_rq) {
4180 double_rq_unlock(rq, p_rq); 4204 double_rq_unlock(rq, p_rq);
4181 goto again; 4205 goto again;
4182 } 4206 }
4183 4207
4184 if (!curr->sched_class->yield_to_task) 4208 if (!curr->sched_class->yield_to_task)
4185 goto out_unlock; 4209 goto out_unlock;
4186 4210
4187 if (curr->sched_class != p->sched_class) 4211 if (curr->sched_class != p->sched_class)
4188 goto out_unlock; 4212 goto out_unlock;
4189 4213
4190 if (task_running(p_rq, p) || p->state) 4214 if (task_running(p_rq, p) || p->state)
4191 goto out_unlock; 4215 goto out_unlock;
4192 4216
4193 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 4217 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
4194 if (yielded) { 4218 if (yielded) {
4195 schedstat_inc(rq, yld_count); 4219 schedstat_inc(rq, yld_count);
4196 /* 4220 /*
4197 * Make p's CPU reschedule; pick_next_entity takes care of 4221 * Make p's CPU reschedule; pick_next_entity takes care of
4198 * fairness. 4222 * fairness.
4199 */ 4223 */
4200 if (preempt && rq != p_rq) 4224 if (preempt && rq != p_rq)
4201 resched_task(p_rq->curr); 4225 resched_task(p_rq->curr);
4202 } 4226 }
4203 4227
4204 out_unlock: 4228 out_unlock:
4205 double_rq_unlock(rq, p_rq); 4229 double_rq_unlock(rq, p_rq);
4206 out_irq: 4230 out_irq:
4207 local_irq_restore(flags); 4231 local_irq_restore(flags);
4208 4232
4209 if (yielded > 0) 4233 if (yielded > 0)
4210 schedule(); 4234 schedule();
4211 4235
4212 return yielded; 4236 return yielded;
4213 } 4237 }
4214 EXPORT_SYMBOL_GPL(yield_to); 4238 EXPORT_SYMBOL_GPL(yield_to);
4215 4239
4216 /* 4240 /*
4217 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 4241 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4218 * that process accounting knows that this is a task in IO wait state. 4242 * that process accounting knows that this is a task in IO wait state.
4219 */ 4243 */
4220 void __sched io_schedule(void) 4244 void __sched io_schedule(void)
4221 { 4245 {
4222 struct rq *rq = raw_rq(); 4246 struct rq *rq = raw_rq();
4223 4247
4224 delayacct_blkio_start(); 4248 delayacct_blkio_start();
4225 atomic_inc(&rq->nr_iowait); 4249 atomic_inc(&rq->nr_iowait);
4226 blk_flush_plug(current); 4250 blk_flush_plug(current);
4227 current->in_iowait = 1; 4251 current->in_iowait = 1;
4228 schedule(); 4252 schedule();
4229 current->in_iowait = 0; 4253 current->in_iowait = 0;
4230 atomic_dec(&rq->nr_iowait); 4254 atomic_dec(&rq->nr_iowait);
4231 delayacct_blkio_end(); 4255 delayacct_blkio_end();
4232 } 4256 }
4233 EXPORT_SYMBOL(io_schedule); 4257 EXPORT_SYMBOL(io_schedule);
4234 4258
4235 long __sched io_schedule_timeout(long timeout) 4259 long __sched io_schedule_timeout(long timeout)
4236 { 4260 {
4237 struct rq *rq = raw_rq(); 4261 struct rq *rq = raw_rq();
4238 long ret; 4262 long ret;
4239 4263
4240 delayacct_blkio_start(); 4264 delayacct_blkio_start();
4241 atomic_inc(&rq->nr_iowait); 4265 atomic_inc(&rq->nr_iowait);
4242 blk_flush_plug(current); 4266 blk_flush_plug(current);
4243 current->in_iowait = 1; 4267 current->in_iowait = 1;
4244 ret = schedule_timeout(timeout); 4268 ret = schedule_timeout(timeout);
4245 current->in_iowait = 0; 4269 current->in_iowait = 0;
4246 atomic_dec(&rq->nr_iowait); 4270 atomic_dec(&rq->nr_iowait);
4247 delayacct_blkio_end(); 4271 delayacct_blkio_end();
4248 return ret; 4272 return ret;
4249 } 4273 }
4250 4274
4251 /** 4275 /**
4252 * sys_sched_get_priority_max - return maximum RT priority. 4276 * sys_sched_get_priority_max - return maximum RT priority.
4253 * @policy: scheduling class. 4277 * @policy: scheduling class.
4254 * 4278 *
4255 * Return: On success, this syscall returns the maximum 4279 * Return: On success, this syscall returns the maximum
4256 * rt_priority that can be used by a given scheduling class. 4280 * rt_priority that can be used by a given scheduling class.
4257 * On failure, a negative error code is returned. 4281 * On failure, a negative error code is returned.
4258 */ 4282 */
4259 SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 4283 SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
4260 { 4284 {
4261 int ret = -EINVAL; 4285 int ret = -EINVAL;
4262 4286
4263 switch (policy) { 4287 switch (policy) {
4264 case SCHED_FIFO: 4288 case SCHED_FIFO:
4265 case SCHED_RR: 4289 case SCHED_RR:
4266 ret = MAX_USER_RT_PRIO-1; 4290 ret = MAX_USER_RT_PRIO-1;
4267 break; 4291 break;
4268 case SCHED_DEADLINE: 4292 case SCHED_DEADLINE:
4269 case SCHED_NORMAL: 4293 case SCHED_NORMAL:
4270 case SCHED_BATCH: 4294 case SCHED_BATCH:
4271 case SCHED_IDLE: 4295 case SCHED_IDLE:
4272 ret = 0; 4296 ret = 0;
4273 break; 4297 break;
4274 } 4298 }
4275 return ret; 4299 return ret;
4276 } 4300 }
4277 4301
4278 /** 4302 /**
4279 * sys_sched_get_priority_min - return minimum RT priority. 4303 * sys_sched_get_priority_min - return minimum RT priority.
4280 * @policy: scheduling class. 4304 * @policy: scheduling class.
4281 * 4305 *
4282 * Return: On success, this syscall returns the minimum 4306 * Return: On success, this syscall returns the minimum
4283 * rt_priority that can be used by a given scheduling class. 4307 * rt_priority that can be used by a given scheduling class.
4284 * On failure, a negative error code is returned. 4308 * On failure, a negative error code is returned.
4285 */ 4309 */
4286 SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 4310 SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4287 { 4311 {
4288 int ret = -EINVAL; 4312 int ret = -EINVAL;
4289 4313
4290 switch (policy) { 4314 switch (policy) {
4291 case SCHED_FIFO: 4315 case SCHED_FIFO:
4292 case SCHED_RR: 4316 case SCHED_RR:
4293 ret = 1; 4317 ret = 1;
4294 break; 4318 break;
4295 case SCHED_DEADLINE: 4319 case SCHED_DEADLINE:
4296 case SCHED_NORMAL: 4320 case SCHED_NORMAL:
4297 case SCHED_BATCH: 4321 case SCHED_BATCH:
4298 case SCHED_IDLE: 4322 case SCHED_IDLE:
4299 ret = 0; 4323 ret = 0;
4300 } 4324 }
4301 return ret; 4325 return ret;
4302 } 4326 }
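/*
 * A userspace sketch querying the valid static priority range for SCHED_FIFO
 * through the glibc wrappers. Per the two switch statements above this
 * yields 1..MAX_USER_RT_PRIO-1, i.e. 1..99 on a stock build.
 */
#include <sched.h>
#include <stdio.h>

static void example_print_fifo_range(void)
{
	int lo = sched_get_priority_min(SCHED_FIFO);
	int hi = sched_get_priority_max(SCHED_FIFO);

	printf("SCHED_FIFO priority range: %d..%d\n", lo, hi);
}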
4303 4327
4304 /** 4328 /**
4305 * sys_sched_rr_get_interval - return the default timeslice of a process. 4329 * sys_sched_rr_get_interval - return the default timeslice of a process.
4306 * @pid: pid of the process. 4330 * @pid: pid of the process.
4307 * @interval: userspace pointer to the timeslice value. 4331 * @interval: userspace pointer to the timeslice value.
4308 * 4332 *
4309 * This syscall writes the default timeslice value of a given process 4333 * This syscall writes the default timeslice value of a given process
4310 * into the user-space timespec buffer. A value of '0' means infinity. 4334 * into the user-space timespec buffer. A value of '0' means infinity.
4311 * 4335 *
4312 * Return: On success, 0 and the timeslice is in @interval. Otherwise, 4336 * Return: On success, 0 and the timeslice is in @interval. Otherwise,
4313 * an error code. 4337 * an error code.
4314 */ 4338 */
4315 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 4339 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
4316 struct timespec __user *, interval) 4340 struct timespec __user *, interval)
4317 { 4341 {
4318 struct task_struct *p; 4342 struct task_struct *p;
4319 unsigned int time_slice; 4343 unsigned int time_slice;
4320 unsigned long flags; 4344 unsigned long flags;
4321 struct rq *rq; 4345 struct rq *rq;
4322 int retval; 4346 int retval;
4323 struct timespec t; 4347 struct timespec t;
4324 4348
4325 if (pid < 0) 4349 if (pid < 0)
4326 return -EINVAL; 4350 return -EINVAL;
4327 4351
4328 retval = -ESRCH; 4352 retval = -ESRCH;
4329 rcu_read_lock(); 4353 rcu_read_lock();
4330 p = find_process_by_pid(pid); 4354 p = find_process_by_pid(pid);
4331 if (!p) 4355 if (!p)
4332 goto out_unlock; 4356 goto out_unlock;
4333 4357
4334 retval = security_task_getscheduler(p); 4358 retval = security_task_getscheduler(p);
4335 if (retval) 4359 if (retval)
4336 goto out_unlock; 4360 goto out_unlock;
4337 4361
4338 rq = task_rq_lock(p, &flags); 4362 rq = task_rq_lock(p, &flags);
4339 time_slice = 0; 4363 time_slice = 0;
4340 if (p->sched_class->get_rr_interval) 4364 if (p->sched_class->get_rr_interval)
4341 time_slice = p->sched_class->get_rr_interval(rq, p); 4365 time_slice = p->sched_class->get_rr_interval(rq, p);
4342 task_rq_unlock(rq, p, &flags); 4366 task_rq_unlock(rq, p, &flags);
4343 4367
4344 rcu_read_unlock(); 4368 rcu_read_unlock();
4345 jiffies_to_timespec(time_slice, &t); 4369 jiffies_to_timespec(time_slice, &t);
4346 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4370 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4347 return retval; 4371 return retval;
4348 4372
4349 out_unlock: 4373 out_unlock:
4350 rcu_read_unlock(); 4374 rcu_read_unlock();
4351 return retval; 4375 return retval;
4352 } 4376 }
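/*
 * A userspace sketch for sched_rr_get_interval(2), using the glibc wrapper
 * to fetch the round-robin timeslice of the calling thread. As the
 * kernel-doc above notes, a result of 0 means "infinity" (no timeslice).
 */
#include <sched.h>
#include <time.h>

static long long example_rr_timeslice_ns(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts))	/* pid 0: calling thread */
		return -1;

	return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}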
4353 4377
4354 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; 4378 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
4355 4379
4356 void sched_show_task(struct task_struct *p) 4380 void sched_show_task(struct task_struct *p)
4357 { 4381 {
4358 unsigned long free = 0; 4382 unsigned long free = 0;
4359 int ppid; 4383 int ppid;
4360 unsigned state; 4384 unsigned state;
4361 4385
4362 state = p->state ? __ffs(p->state) + 1 : 0; 4386 state = p->state ? __ffs(p->state) + 1 : 0;
4363 printk(KERN_INFO "%-15.15s %c", p->comm, 4387 printk(KERN_INFO "%-15.15s %c", p->comm,
4364 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 4388 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4365 #if BITS_PER_LONG == 32 4389 #if BITS_PER_LONG == 32
4366 if (state == TASK_RUNNING) 4390 if (state == TASK_RUNNING)
4367 printk(KERN_CONT " running "); 4391 printk(KERN_CONT " running ");
4368 else 4392 else
4369 printk(KERN_CONT " %08lx ", thread_saved_pc(p)); 4393 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
4370 #else 4394 #else
4371 if (state == TASK_RUNNING) 4395 if (state == TASK_RUNNING)
4372 printk(KERN_CONT " running task "); 4396 printk(KERN_CONT " running task ");
4373 else 4397 else
4374 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 4398 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
4375 #endif 4399 #endif
4376 #ifdef CONFIG_DEBUG_STACK_USAGE 4400 #ifdef CONFIG_DEBUG_STACK_USAGE
4377 free = stack_not_used(p); 4401 free = stack_not_used(p);
4378 #endif 4402 #endif
4379 rcu_read_lock(); 4403 rcu_read_lock();
4380 ppid = task_pid_nr(rcu_dereference(p->real_parent)); 4404 ppid = task_pid_nr(rcu_dereference(p->real_parent));
4381 rcu_read_unlock(); 4405 rcu_read_unlock();
4382 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4406 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
4383 task_pid_nr(p), ppid, 4407 task_pid_nr(p), ppid,
4384 (unsigned long)task_thread_info(p)->flags); 4408 (unsigned long)task_thread_info(p)->flags);
4385 4409
4386 print_worker_info(KERN_INFO, p); 4410 print_worker_info(KERN_INFO, p);
4387 show_stack(p, NULL); 4411 show_stack(p, NULL);
4388 } 4412 }
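/*
 * Editor's note: given the format strings above, one dumped line on a 64-bit
 * kernel looks roughly like the made-up example below: comm, state character,
 * saved PC (or "running task"), unused stack bytes, pid, ppid and the
 * thread-info flags:
 *
 *   kworker/u8:2    D ffffffff8160b1b0     0   112      2 0x00000000
 */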
4389 4413
4390 void show_state_filter(unsigned long state_filter) 4414 void show_state_filter(unsigned long state_filter)
4391 { 4415 {
4392 struct task_struct *g, *p; 4416 struct task_struct *g, *p;
4393 4417
4394 #if BITS_PER_LONG == 32 4418 #if BITS_PER_LONG == 32
4395 printk(KERN_INFO 4419 printk(KERN_INFO
4396 " task PC stack pid father\n"); 4420 " task PC stack pid father\n");
4397 #else 4421 #else
4398 printk(KERN_INFO 4422 printk(KERN_INFO
4399 " task PC stack pid father\n"); 4423 " task PC stack pid father\n");
4400 #endif 4424 #endif
4401 rcu_read_lock(); 4425 rcu_read_lock();
4402 do_each_thread(g, p) { 4426 do_each_thread(g, p) {
4403 /* 4427 /*
4404 * reset the NMI-timeout, listing all tasks on a slow 4428 * reset the NMI-timeout, listing all tasks on a slow

4405 * console might take a lot of time: 4429 * console might take a lot of time:
4406 */ 4430 */
4407 touch_nmi_watchdog(); 4431 touch_nmi_watchdog();
4408 if (!state_filter || (p->state & state_filter)) 4432 if (!state_filter || (p->state & state_filter))
4409 sched_show_task(p); 4433 sched_show_task(p);
4410 } while_each_thread(g, p); 4434 } while_each_thread(g, p);
4411 4435
4412 touch_all_softlockup_watchdogs(); 4436 touch_all_softlockup_watchdogs();
4413 4437
4414 #ifdef CONFIG_SCHED_DEBUG 4438 #ifdef CONFIG_SCHED_DEBUG
4415 sysrq_sched_debug_show(); 4439 sysrq_sched_debug_show();
4416 #endif 4440 #endif
4417 rcu_read_unlock(); 4441 rcu_read_unlock();
4418 /* 4442 /*
4419 * Only show locks if all tasks are dumped: 4443 * Only show locks if all tasks are dumped:
4420 */ 4444 */
4421 if (!state_filter) 4445 if (!state_filter)
4422 debug_show_all_locks(); 4446 debug_show_all_locks();
4423 } 4447 }
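/*
 * Editor's note: a minimal, hedged sketch of how a sysrq-style caller might
 * use the helper above; "example_dump_tasks" is a made-up name. Passing 0
 * dumps every task (and, per the code above, all held locks), while a state
 * mask such as TASK_UNINTERRUPTIBLE restricts the dump to blocked tasks.
 */
static void __maybe_unused example_dump_tasks(bool only_blocked)
{
	if (only_blocked)
		show_state_filter(TASK_UNINTERRUPTIBLE);
	else
		show_state_filter(0);
}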
4424 4448
4425 void init_idle_bootup_task(struct task_struct *idle) 4449 void init_idle_bootup_task(struct task_struct *idle)
4426 { 4450 {
4427 idle->sched_class = &idle_sched_class; 4451 idle->sched_class = &idle_sched_class;
4428 } 4452 }
4429 4453
4430 /** 4454 /**
4431 * init_idle - set up an idle thread for a given CPU 4455 * init_idle - set up an idle thread for a given CPU
4432 * @idle: task in question 4456 * @idle: task in question
4433 * @cpu: cpu the idle task belongs to 4457 * @cpu: cpu the idle task belongs to
4434 * 4458 *
4435 * NOTE: this function does not set the idle thread's NEED_RESCHED 4459 * NOTE: this function does not set the idle thread's NEED_RESCHED
4436 * flag, to make booting more robust. 4460 * flag, to make booting more robust.
4437 */ 4461 */
4438 void init_idle(struct task_struct *idle, int cpu) 4462 void init_idle(struct task_struct *idle, int cpu)
4439 { 4463 {
4440 struct rq *rq = cpu_rq(cpu); 4464 struct rq *rq = cpu_rq(cpu);
4441 unsigned long flags; 4465 unsigned long flags;
4442 4466
4443 raw_spin_lock_irqsave(&rq->lock, flags); 4467 raw_spin_lock_irqsave(&rq->lock, flags);
4444 4468
4445 __sched_fork(0, idle); 4469 __sched_fork(0, idle);
4446 idle->state = TASK_RUNNING; 4470 idle->state = TASK_RUNNING;
4447 idle->se.exec_start = sched_clock(); 4471 idle->se.exec_start = sched_clock();
4448 4472
4449 do_set_cpus_allowed(idle, cpumask_of(cpu)); 4473 do_set_cpus_allowed(idle, cpumask_of(cpu));
4450 /* 4474 /*
4451 * We're having a chicken-and-egg problem: even though we are 4475 * We're having a chicken-and-egg problem: even though we are
4452 * holding rq->lock, the cpu isn't yet set to this cpu, so the 4476 * holding rq->lock, the cpu isn't yet set to this cpu, so the
4453 * lockdep check in task_group() will fail. 4477 * lockdep check in task_group() will fail.
4454 * 4478 *
4455 * Similar case to sched_fork(). / Alternatively we could 4479 * Similar case to sched_fork(). / Alternatively we could
4456 * use task_rq_lock() here and obtain the other rq->lock. 4480 * use task_rq_lock() here and obtain the other rq->lock.
4457 * 4481 *
4458 * Silence PROVE_RCU 4482 * Silence PROVE_RCU
4459 */ 4483 */
4460 rcu_read_lock(); 4484 rcu_read_lock();
4461 __set_task_cpu(idle, cpu); 4485 __set_task_cpu(idle, cpu);
4462 rcu_read_unlock(); 4486 rcu_read_unlock();
4463 4487
4464 rq->curr = rq->idle = idle; 4488 rq->curr = rq->idle = idle;
4465 idle->on_rq = 1; 4489 idle->on_rq = 1;
4466 #if defined(CONFIG_SMP) 4490 #if defined(CONFIG_SMP)
4467 idle->on_cpu = 1; 4491 idle->on_cpu = 1;
4468 #endif 4492 #endif
4469 raw_spin_unlock_irqrestore(&rq->lock, flags); 4493 raw_spin_unlock_irqrestore(&rq->lock, flags);
4470 4494
4471 /* Set the preempt count _outside_ the spinlocks! */ 4495 /* Set the preempt count _outside_ the spinlocks! */
4472 init_idle_preempt_count(idle, cpu); 4496 init_idle_preempt_count(idle, cpu);
4473 4497
4474 /* 4498 /*
4475 * The idle tasks have their own, simple scheduling class: 4499 * The idle tasks have their own, simple scheduling class:
4476 */ 4500 */
4477 idle->sched_class = &idle_sched_class; 4501 idle->sched_class = &idle_sched_class;
4478 ftrace_graph_init_idle_task(idle, cpu); 4502 ftrace_graph_init_idle_task(idle, cpu);
4479 vtime_init_idle(idle, cpu); 4503 vtime_init_idle(idle, cpu);
4480 #if defined(CONFIG_SMP) 4504 #if defined(CONFIG_SMP)
4481 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); 4505 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
4482 #endif 4506 #endif
4483 } 4507 }
4484 4508
4485 #ifdef CONFIG_SMP 4509 #ifdef CONFIG_SMP
4486 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 4510 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4487 { 4511 {
4488 if (p->sched_class && p->sched_class->set_cpus_allowed) 4512 if (p->sched_class && p->sched_class->set_cpus_allowed)
4489 p->sched_class->set_cpus_allowed(p, new_mask); 4513 p->sched_class->set_cpus_allowed(p, new_mask);
4490 4514
4491 cpumask_copy(&p->cpus_allowed, new_mask); 4515 cpumask_copy(&p->cpus_allowed, new_mask);
4492 p->nr_cpus_allowed = cpumask_weight(new_mask); 4516 p->nr_cpus_allowed = cpumask_weight(new_mask);
4493 } 4517 }
4494 4518
4495 /* 4519 /*
4496 * This is how migration works: 4520 * This is how migration works:
4497 * 4521 *
4498 * 1) we invoke migration_cpu_stop() on the target CPU using 4522 * 1) we invoke migration_cpu_stop() on the target CPU using
4499 * stop_one_cpu(). 4523 * stop_one_cpu().
4500 * 2) stopper starts to run (implicitly forcing the migrated thread 4524 * 2) stopper starts to run (implicitly forcing the migrated thread
4501 * off the CPU) 4525 * off the CPU)
4502 * 3) it checks whether the migrated task is still in the wrong runqueue. 4526 * 3) it checks whether the migrated task is still in the wrong runqueue.
4503 * 4) if it's in the wrong runqueue then the migration thread removes 4527 * 4) if it's in the wrong runqueue then the migration thread removes
4504 * it and puts it into the right queue. 4528 * it and puts it into the right queue.
4505 * 5) stopper completes and stop_one_cpu() returns and the migration 4529 * 5) stopper completes and stop_one_cpu() returns and the migration
4506 * is done. 4530 * is done.
4507 */ 4531 */
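/*
 * Editor's note: a condensed, illustrative rendering of the five steps above
 * using the helpers defined later in this file; it mirrors what
 * set_cpus_allowed_ptr() below does for a task that is on a runqueue.
 * "example_force_migrate" is a made-up name.
 */
static int __maybe_unused example_force_migrate(struct task_struct *p,
						int dest_cpu)
{
	struct migration_arg arg = { p, dest_cpu };

	/* Steps 1-2: run migration_cpu_stop() on the task's current CPU. */
	return stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
	/* Steps 3-5 happen inside migration_cpu_stop()/__migrate_task(). */
}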
4508 4532
4509 /* 4533 /*
4510 * Change a given task's CPU affinity. Migrate the thread to a 4534 * Change a given task's CPU affinity. Migrate the thread to a
4511 * proper CPU and schedule it away if the CPU it's executing on 4535 * proper CPU and schedule it away if the CPU it's executing on
4512 * is removed from the allowed bitmask. 4536 * is removed from the allowed bitmask.
4513 * 4537 *
4514 * NOTE: the caller must have a valid reference to the task, the 4538 * NOTE: the caller must have a valid reference to the task, the
4515 * task must not exit() & deallocate itself prematurely. The 4539 * task must not exit() & deallocate itself prematurely. The
4516 * call is not atomic; no spinlocks may be held. 4540 * call is not atomic; no spinlocks may be held.
4517 */ 4541 */
4518 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 4542 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
4519 { 4543 {
4520 unsigned long flags; 4544 unsigned long flags;
4521 struct rq *rq; 4545 struct rq *rq;
4522 unsigned int dest_cpu; 4546 unsigned int dest_cpu;
4523 int ret = 0; 4547 int ret = 0;
4524 4548
4525 rq = task_rq_lock(p, &flags); 4549 rq = task_rq_lock(p, &flags);
4526 4550
4527 if (cpumask_equal(&p->cpus_allowed, new_mask)) 4551 if (cpumask_equal(&p->cpus_allowed, new_mask))
4528 goto out; 4552 goto out;
4529 4553
4530 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 4554 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
4531 ret = -EINVAL; 4555 ret = -EINVAL;
4532 goto out; 4556 goto out;
4533 } 4557 }
4534 4558
4535 do_set_cpus_allowed(p, new_mask); 4559 do_set_cpus_allowed(p, new_mask);
4536 4560
4537 /* Can the task run on the task's current CPU? If so, we're done */ 4561 /* Can the task run on the task's current CPU? If so, we're done */
4538 if (cpumask_test_cpu(task_cpu(p), new_mask)) 4562 if (cpumask_test_cpu(task_cpu(p), new_mask))
4539 goto out; 4563 goto out;
4540 4564
4541 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 4565 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
4542 if (p->on_rq) { 4566 if (p->on_rq) {
4543 struct migration_arg arg = { p, dest_cpu }; 4567 struct migration_arg arg = { p, dest_cpu };
4544 /* Need help from migration thread: drop lock and wait. */ 4568 /* Need help from migration thread: drop lock and wait. */
4545 task_rq_unlock(rq, p, &flags); 4569 task_rq_unlock(rq, p, &flags);
4546 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 4570 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
4547 tlb_migrate_finish(p->mm); 4571 tlb_migrate_finish(p->mm);
4548 return 0; 4572 return 0;
4549 } 4573 }
4550 out: 4574 out:
4551 task_rq_unlock(rq, p, &flags); 4575 task_rq_unlock(rq, p, &flags);
4552 4576
4553 return ret; 4577 return ret;
4554 } 4578 }
4555 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); 4579 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
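/*
 * Editor's note: a hedged usage sketch (not from this file) of the exported
 * API above. A kernel thread restricting itself to a single CPU would do
 * something like the following; "my_pinned_worker" is a made-up name.
 */
static int my_pinned_worker(void *data)
{
	int cpu = (long)data;

	/* Fails with -EINVAL if the requested CPU is not active. */
	if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
		pr_warn("could not pin worker to CPU%d\n", cpu);

	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}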
4556 4580
4557 /* 4581 /*
4558 * Move (not current) task off this cpu, onto dest cpu. We're doing 4582 * Move (not current) task off this cpu, onto dest cpu. We're doing
4559 * this because either it can't run here any more (set_cpus_allowed() 4583 * this because either it can't run here any more (set_cpus_allowed()
4560 * away from this CPU, or CPU going down), or because we're 4584 * away from this CPU, or CPU going down), or because we're
4561 * attempting to rebalance this task on exec (sched_exec). 4585 * attempting to rebalance this task on exec (sched_exec).
4562 * 4586 *
4563 * So we race with normal scheduler movements, but that's OK, as long 4587 * So we race with normal scheduler movements, but that's OK, as long
4564 * as the task is no longer on this CPU. 4588 * as the task is no longer on this CPU.
4565 * 4589 *
4566 * Returns non-zero if task was successfully migrated. 4590 * Returns non-zero if task was successfully migrated.
4567 */ 4591 */
4568 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4592 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4569 { 4593 {
4570 struct rq *rq_dest, *rq_src; 4594 struct rq *rq_dest, *rq_src;
4571 int ret = 0; 4595 int ret = 0;
4572 4596
4573 if (unlikely(!cpu_active(dest_cpu))) 4597 if (unlikely(!cpu_active(dest_cpu)))
4574 return ret; 4598 return ret;
4575 4599
4576 rq_src = cpu_rq(src_cpu); 4600 rq_src = cpu_rq(src_cpu);
4577 rq_dest = cpu_rq(dest_cpu); 4601 rq_dest = cpu_rq(dest_cpu);
4578 4602
4579 raw_spin_lock(&p->pi_lock); 4603 raw_spin_lock(&p->pi_lock);
4580 double_rq_lock(rq_src, rq_dest); 4604 double_rq_lock(rq_src, rq_dest);
4581 /* Already moved. */ 4605 /* Already moved. */
4582 if (task_cpu(p) != src_cpu) 4606 if (task_cpu(p) != src_cpu)
4583 goto done; 4607 goto done;
4584 /* Affinity changed (again). */ 4608 /* Affinity changed (again). */
4585 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 4609 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
4586 goto fail; 4610 goto fail;
4587 4611
4588 /* 4612 /*
4589 * If we're not on a rq, the next wake-up will ensure we're 4613 * If we're not on a rq, the next wake-up will ensure we're
4590 * placed properly. 4614 * placed properly.
4591 */ 4615 */
4592 if (p->on_rq) { 4616 if (p->on_rq) {
4593 dequeue_task(rq_src, p, 0); 4617 dequeue_task(rq_src, p, 0);
4594 set_task_cpu(p, dest_cpu); 4618 set_task_cpu(p, dest_cpu);
4595 enqueue_task(rq_dest, p, 0); 4619 enqueue_task(rq_dest, p, 0);
4596 check_preempt_curr(rq_dest, p, 0); 4620 check_preempt_curr(rq_dest, p, 0);
4597 } 4621 }
4598 done: 4622 done:
4599 ret = 1; 4623 ret = 1;
4600 fail: 4624 fail:
4601 double_rq_unlock(rq_src, rq_dest); 4625 double_rq_unlock(rq_src, rq_dest);
4602 raw_spin_unlock(&p->pi_lock); 4626 raw_spin_unlock(&p->pi_lock);
4603 return ret; 4627 return ret;
4604 } 4628 }
4605 4629
4606 #ifdef CONFIG_NUMA_BALANCING 4630 #ifdef CONFIG_NUMA_BALANCING
4607 /* Migrate current task p to target_cpu */ 4631 /* Migrate current task p to target_cpu */
4608 int migrate_task_to(struct task_struct *p, int target_cpu) 4632 int migrate_task_to(struct task_struct *p, int target_cpu)
4609 { 4633 {
4610 struct migration_arg arg = { p, target_cpu }; 4634 struct migration_arg arg = { p, target_cpu };
4611 int curr_cpu = task_cpu(p); 4635 int curr_cpu = task_cpu(p);
4612 4636
4613 if (curr_cpu == target_cpu) 4637 if (curr_cpu == target_cpu)
4614 return 0; 4638 return 0;
4615 4639
4616 if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p))) 4640 if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
4617 return -EINVAL; 4641 return -EINVAL;
4618 4642
4619 /* TODO: This is not properly updating schedstats */ 4643 /* TODO: This is not properly updating schedstats */
4620 4644
4621 trace_sched_move_numa(p, curr_cpu, target_cpu); 4645 trace_sched_move_numa(p, curr_cpu, target_cpu);
4622 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); 4646 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
4623 } 4647 }
4624 4648
4625 /* 4649 /*
4626 * Requeue a task on a given node and accurately track the number of NUMA 4650 * Requeue a task on a given node and accurately track the number of NUMA
4627 * tasks on the runqueues 4651 * tasks on the runqueues
4628 */ 4652 */
4629 void sched_setnuma(struct task_struct *p, int nid) 4653 void sched_setnuma(struct task_struct *p, int nid)
4630 { 4654 {
4631 struct rq *rq; 4655 struct rq *rq;
4632 unsigned long flags; 4656 unsigned long flags;
4633 bool on_rq, running; 4657 bool on_rq, running;
4634 4658
4635 rq = task_rq_lock(p, &flags); 4659 rq = task_rq_lock(p, &flags);
4636 on_rq = p->on_rq; 4660 on_rq = p->on_rq;
4637 running = task_current(rq, p); 4661 running = task_current(rq, p);
4638 4662
4639 if (on_rq) 4663 if (on_rq)
4640 dequeue_task(rq, p, 0); 4664 dequeue_task(rq, p, 0);
4641 if (running) 4665 if (running)
4642 p->sched_class->put_prev_task(rq, p); 4666 p->sched_class->put_prev_task(rq, p);
4643 4667
4644 p->numa_preferred_nid = nid; 4668 p->numa_preferred_nid = nid;
4645 4669
4646 if (running) 4670 if (running)
4647 p->sched_class->set_curr_task(rq); 4671 p->sched_class->set_curr_task(rq);
4648 if (on_rq) 4672 if (on_rq)
4649 enqueue_task(rq, p, 0); 4673 enqueue_task(rq, p, 0);
4650 task_rq_unlock(rq, p, &flags); 4674 task_rq_unlock(rq, p, &flags);
4651 } 4675 }
4652 #endif 4676 #endif
4653 4677
4654 /* 4678 /*
4655 * migration_cpu_stop - this will be executed by a highprio stopper thread 4679 * migration_cpu_stop - this will be executed by a highprio stopper thread
4656 * and performs thread migration by bumping thread off CPU then 4680 * and performs thread migration by bumping thread off CPU then
4657 * 'pushing' onto another runqueue. 4681 * 'pushing' onto another runqueue.
4658 */ 4682 */
4659 static int migration_cpu_stop(void *data) 4683 static int migration_cpu_stop(void *data)
4660 { 4684 {
4661 struct migration_arg *arg = data; 4685 struct migration_arg *arg = data;
4662 4686
4663 /* 4687 /*
4664 * The original target cpu might have gone down and we might 4688 * The original target cpu might have gone down and we might
4665 * be on another cpu but it doesn't matter. 4689 * be on another cpu but it doesn't matter.
4666 */ 4690 */
4667 local_irq_disable(); 4691 local_irq_disable();
4668 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); 4692 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
4669 local_irq_enable(); 4693 local_irq_enable();
4670 return 0; 4694 return 0;
4671 } 4695 }
4672 4696
4673 #ifdef CONFIG_HOTPLUG_CPU 4697 #ifdef CONFIG_HOTPLUG_CPU
4674 4698
4675 /* 4699 /*
4676 * Ensures that the idle task is using init_mm right before its cpu goes 4700 * Ensures that the idle task is using init_mm right before its cpu goes
4677 * offline. 4701 * offline.
4678 */ 4702 */
4679 void idle_task_exit(void) 4703 void idle_task_exit(void)
4680 { 4704 {
4681 struct mm_struct *mm = current->active_mm; 4705 struct mm_struct *mm = current->active_mm;
4682 4706
4683 BUG_ON(cpu_online(smp_processor_id())); 4707 BUG_ON(cpu_online(smp_processor_id()));
4684 4708
4685 if (mm != &init_mm) { 4709 if (mm != &init_mm) {
4686 switch_mm(mm, &init_mm, current); 4710 switch_mm(mm, &init_mm, current);
4687 finish_arch_post_lock_switch(); 4711 finish_arch_post_lock_switch();
4688 } 4712 }
4689 mmdrop(mm); 4713 mmdrop(mm);
4690 } 4714 }
4691 4715
4692 /* 4716 /*
4693 * Since this CPU is going 'away' for a while, fold any nr_active delta 4717 * Since this CPU is going 'away' for a while, fold any nr_active delta
4694 * we might have. Assumes we're called after migrate_tasks() so that the 4718 * we might have. Assumes we're called after migrate_tasks() so that the
4695 * nr_active count is stable. 4719 * nr_active count is stable.
4696 * 4720 *
4697 * Also see the comment "Global load-average calculations". 4721 * Also see the comment "Global load-average calculations".
4698 */ 4722 */
4699 static void calc_load_migrate(struct rq *rq) 4723 static void calc_load_migrate(struct rq *rq)
4700 { 4724 {
4701 long delta = calc_load_fold_active(rq); 4725 long delta = calc_load_fold_active(rq);
4702 if (delta) 4726 if (delta)
4703 atomic_long_add(delta, &calc_load_tasks); 4727 atomic_long_add(delta, &calc_load_tasks);
4704 } 4728 }
4705 4729
4706 static void put_prev_task_fake(struct rq *rq, struct task_struct *prev) 4730 static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
4707 { 4731 {
4708 } 4732 }
4709 4733
4710 static const struct sched_class fake_sched_class = { 4734 static const struct sched_class fake_sched_class = {
4711 .put_prev_task = put_prev_task_fake, 4735 .put_prev_task = put_prev_task_fake,
4712 }; 4736 };
4713 4737
4714 static struct task_struct fake_task = { 4738 static struct task_struct fake_task = {
4715 /* 4739 /*
4716 * Avoid pull_{rt,dl}_task() 4740 * Avoid pull_{rt,dl}_task()
4717 */ 4741 */
4718 .prio = MAX_PRIO + 1, 4742 .prio = MAX_PRIO + 1,
4719 .sched_class = &fake_sched_class, 4743 .sched_class = &fake_sched_class,
4720 }; 4744 };
4721 4745
4722 /* 4746 /*
4723 * Migrate all tasks from the rq, sleeping tasks will be migrated by 4747 * Migrate all tasks from the rq, sleeping tasks will be migrated by
4724 * try_to_wake_up()->select_task_rq(). 4748 * try_to_wake_up()->select_task_rq().
4725 * 4749 *
4726 * Called with rq->lock held even though we're in stop_machine() and 4750 * Called with rq->lock held even though we're in stop_machine() and
4727 * there's no concurrency possible, we hold the required locks anyway 4751 * there's no concurrency possible, we hold the required locks anyway
4728 * because of lock validation efforts. 4752 * because of lock validation efforts.
4729 */ 4753 */
4730 static void migrate_tasks(unsigned int dead_cpu) 4754 static void migrate_tasks(unsigned int dead_cpu)
4731 { 4755 {
4732 struct rq *rq = cpu_rq(dead_cpu); 4756 struct rq *rq = cpu_rq(dead_cpu);
4733 struct task_struct *next, *stop = rq->stop; 4757 struct task_struct *next, *stop = rq->stop;
4734 int dest_cpu; 4758 int dest_cpu;
4735 4759
4736 /* 4760 /*
4737 * Fudge the rq selection such that the below task selection loop 4761 * Fudge the rq selection such that the below task selection loop
4738 * doesn't get stuck on the currently eligible stop task. 4762 * doesn't get stuck on the currently eligible stop task.
4739 * 4763 *
4740 * We're currently inside stop_machine() and the rq is either stuck 4764 * We're currently inside stop_machine() and the rq is either stuck
4741 * in the stop_machine_cpu_stop() loop, or we're executing this code; 4765 * in the stop_machine_cpu_stop() loop, or we're executing this code;
4742 * either way we should never end up calling schedule() until we're 4766 * either way we should never end up calling schedule() until we're
4743 * done here. 4767 * done here.
4744 */ 4768 */
4745 rq->stop = NULL; 4769 rq->stop = NULL;
4746 4770
4747 /* 4771 /*
4748 * put_prev_task() and pick_next_task() sched 4772 * put_prev_task() and pick_next_task() sched
4749 * class method both need to have an up-to-date 4773 * class method both need to have an up-to-date
4750 * value of rq->clock[_task] 4774 * value of rq->clock[_task]
4751 */ 4775 */
4752 update_rq_clock(rq); 4776 update_rq_clock(rq);
4753 4777
4754 for ( ; ; ) { 4778 for ( ; ; ) {
4755 /* 4779 /*
4756 * There's this thread running, bail when that's the only 4780 * There's this thread running, bail when that's the only
4757 * remaining thread. 4781 * remaining thread.
4758 */ 4782 */
4759 if (rq->nr_running == 1) 4783 if (rq->nr_running == 1)
4760 break; 4784 break;
4761 4785
4762 next = pick_next_task(rq, &fake_task); 4786 next = pick_next_task(rq, &fake_task);
4763 BUG_ON(!next); 4787 BUG_ON(!next);
4764 next->sched_class->put_prev_task(rq, next); 4788 next->sched_class->put_prev_task(rq, next);
4765 4789
4766 /* Find suitable destination for @next, with force if needed. */ 4790 /* Find suitable destination for @next, with force if needed. */
4767 dest_cpu = select_fallback_rq(dead_cpu, next); 4791 dest_cpu = select_fallback_rq(dead_cpu, next);
4768 raw_spin_unlock(&rq->lock); 4792 raw_spin_unlock(&rq->lock);
4769 4793
4770 __migrate_task(next, dead_cpu, dest_cpu); 4794 __migrate_task(next, dead_cpu, dest_cpu);
4771 4795
4772 raw_spin_lock(&rq->lock); 4796 raw_spin_lock(&rq->lock);
4773 } 4797 }
4774 4798
4775 rq->stop = stop; 4799 rq->stop = stop;
4776 } 4800 }
4777 4801
4778 #endif /* CONFIG_HOTPLUG_CPU */ 4802 #endif /* CONFIG_HOTPLUG_CPU */
4779 4803
4780 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 4804 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
4781 4805
4782 static struct ctl_table sd_ctl_dir[] = { 4806 static struct ctl_table sd_ctl_dir[] = {
4783 { 4807 {
4784 .procname = "sched_domain", 4808 .procname = "sched_domain",
4785 .mode = 0555, 4809 .mode = 0555,
4786 }, 4810 },
4787 {} 4811 {}
4788 }; 4812 };
4789 4813
4790 static struct ctl_table sd_ctl_root[] = { 4814 static struct ctl_table sd_ctl_root[] = {
4791 { 4815 {
4792 .procname = "kernel", 4816 .procname = "kernel",
4793 .mode = 0555, 4817 .mode = 0555,
4794 .child = sd_ctl_dir, 4818 .child = sd_ctl_dir,
4795 }, 4819 },
4796 {} 4820 {}
4797 }; 4821 };
4798 4822
4799 static struct ctl_table *sd_alloc_ctl_entry(int n) 4823 static struct ctl_table *sd_alloc_ctl_entry(int n)
4800 { 4824 {
4801 struct ctl_table *entry = 4825 struct ctl_table *entry =
4802 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); 4826 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
4803 4827
4804 return entry; 4828 return entry;
4805 } 4829 }
4806 4830
4807 static void sd_free_ctl_entry(struct ctl_table **tablep) 4831 static void sd_free_ctl_entry(struct ctl_table **tablep)
4808 { 4832 {
4809 struct ctl_table *entry; 4833 struct ctl_table *entry;
4810 4834
4811 /* 4835 /*
4812 * In the intermediate directories, both the child directory and 4836 * In the intermediate directories, both the child directory and
4813 * procname are dynamically allocated and could fail but the mode 4837 * procname are dynamically allocated and could fail but the mode
4814 * will always be set. In the lowest directory the names are 4838 * will always be set. In the lowest directory the names are
4815 * static strings and all have proc handlers. 4839 * static strings and all have proc handlers.
4816 */ 4840 */
4817 for (entry = *tablep; entry->mode; entry++) { 4841 for (entry = *tablep; entry->mode; entry++) {
4818 if (entry->child) 4842 if (entry->child)
4819 sd_free_ctl_entry(&entry->child); 4843 sd_free_ctl_entry(&entry->child);
4820 if (entry->proc_handler == NULL) 4844 if (entry->proc_handler == NULL)
4821 kfree(entry->procname); 4845 kfree(entry->procname);
4822 } 4846 }
4823 4847
4824 kfree(*tablep); 4848 kfree(*tablep);
4825 *tablep = NULL; 4849 *tablep = NULL;
4826 } 4850 }
4827 4851
4828 static int min_load_idx = 0; 4852 static int min_load_idx = 0;
4829 static int max_load_idx = CPU_LOAD_IDX_MAX-1; 4853 static int max_load_idx = CPU_LOAD_IDX_MAX-1;
4830 4854
4831 static void 4855 static void
4832 set_table_entry(struct ctl_table *entry, 4856 set_table_entry(struct ctl_table *entry,
4833 const char *procname, void *data, int maxlen, 4857 const char *procname, void *data, int maxlen,
4834 umode_t mode, proc_handler *proc_handler, 4858 umode_t mode, proc_handler *proc_handler,
4835 bool load_idx) 4859 bool load_idx)
4836 { 4860 {
4837 entry->procname = procname; 4861 entry->procname = procname;
4838 entry->data = data; 4862 entry->data = data;
4839 entry->maxlen = maxlen; 4863 entry->maxlen = maxlen;
4840 entry->mode = mode; 4864 entry->mode = mode;
4841 entry->proc_handler = proc_handler; 4865 entry->proc_handler = proc_handler;
4842 4866
4843 if (load_idx) { 4867 if (load_idx) {
4844 entry->extra1 = &min_load_idx; 4868 entry->extra1 = &min_load_idx;
4845 entry->extra2 = &max_load_idx; 4869 entry->extra2 = &max_load_idx;
4846 } 4870 }
4847 } 4871 }
4848 4872
4849 static struct ctl_table * 4873 static struct ctl_table *
4850 sd_alloc_ctl_domain_table(struct sched_domain *sd) 4874 sd_alloc_ctl_domain_table(struct sched_domain *sd)
4851 { 4875 {
4852 struct ctl_table *table = sd_alloc_ctl_entry(14); 4876 struct ctl_table *table = sd_alloc_ctl_entry(14);
4853 4877
4854 if (table == NULL) 4878 if (table == NULL)
4855 return NULL; 4879 return NULL;
4856 4880
4857 set_table_entry(&table[0], "min_interval", &sd->min_interval, 4881 set_table_entry(&table[0], "min_interval", &sd->min_interval,
4858 sizeof(long), 0644, proc_doulongvec_minmax, false); 4882 sizeof(long), 0644, proc_doulongvec_minmax, false);
4859 set_table_entry(&table[1], "max_interval", &sd->max_interval, 4883 set_table_entry(&table[1], "max_interval", &sd->max_interval,
4860 sizeof(long), 0644, proc_doulongvec_minmax, false); 4884 sizeof(long), 0644, proc_doulongvec_minmax, false);
4861 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 4885 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
4862 sizeof(int), 0644, proc_dointvec_minmax, true); 4886 sizeof(int), 0644, proc_dointvec_minmax, true);
4863 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 4887 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
4864 sizeof(int), 0644, proc_dointvec_minmax, true); 4888 sizeof(int), 0644, proc_dointvec_minmax, true);
4865 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 4889 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
4866 sizeof(int), 0644, proc_dointvec_minmax, true); 4890 sizeof(int), 0644, proc_dointvec_minmax, true);
4867 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 4891 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
4868 sizeof(int), 0644, proc_dointvec_minmax, true); 4892 sizeof(int), 0644, proc_dointvec_minmax, true);
4869 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 4893 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
4870 sizeof(int), 0644, proc_dointvec_minmax, true); 4894 sizeof(int), 0644, proc_dointvec_minmax, true);
4871 set_table_entry(&table[7], "busy_factor", &sd->busy_factor, 4895 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
4872 sizeof(int), 0644, proc_dointvec_minmax, false); 4896 sizeof(int), 0644, proc_dointvec_minmax, false);
4873 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 4897 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
4874 sizeof(int), 0644, proc_dointvec_minmax, false); 4898 sizeof(int), 0644, proc_dointvec_minmax, false);
4875 set_table_entry(&table[9], "cache_nice_tries", 4899 set_table_entry(&table[9], "cache_nice_tries",
4876 &sd->cache_nice_tries, 4900 &sd->cache_nice_tries,
4877 sizeof(int), 0644, proc_dointvec_minmax, false); 4901 sizeof(int), 0644, proc_dointvec_minmax, false);
4878 set_table_entry(&table[10], "flags", &sd->flags, 4902 set_table_entry(&table[10], "flags", &sd->flags,
4879 sizeof(int), 0644, proc_dointvec_minmax, false); 4903 sizeof(int), 0644, proc_dointvec_minmax, false);
4880 set_table_entry(&table[11], "max_newidle_lb_cost", 4904 set_table_entry(&table[11], "max_newidle_lb_cost",
4881 &sd->max_newidle_lb_cost, 4905 &sd->max_newidle_lb_cost,
4882 sizeof(long), 0644, proc_doulongvec_minmax, false); 4906 sizeof(long), 0644, proc_doulongvec_minmax, false);
4883 set_table_entry(&table[12], "name", sd->name, 4907 set_table_entry(&table[12], "name", sd->name,
4884 CORENAME_MAX_SIZE, 0444, proc_dostring, false); 4908 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
4885 /* &table[13] is terminator */ 4909 /* &table[13] is terminator */
4886 4910
4887 return table; 4911 return table;
4888 } 4912 }
4889 4913
4890 static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) 4914 static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
4891 { 4915 {
4892 struct ctl_table *entry, *table; 4916 struct ctl_table *entry, *table;
4893 struct sched_domain *sd; 4917 struct sched_domain *sd;
4894 int domain_num = 0, i; 4918 int domain_num = 0, i;
4895 char buf[32]; 4919 char buf[32];
4896 4920
4897 for_each_domain(cpu, sd) 4921 for_each_domain(cpu, sd)
4898 domain_num++; 4922 domain_num++;
4899 entry = table = sd_alloc_ctl_entry(domain_num + 1); 4923 entry = table = sd_alloc_ctl_entry(domain_num + 1);
4900 if (table == NULL) 4924 if (table == NULL)
4901 return NULL; 4925 return NULL;
4902 4926
4903 i = 0; 4927 i = 0;
4904 for_each_domain(cpu, sd) { 4928 for_each_domain(cpu, sd) {
4905 snprintf(buf, 32, "domain%d", i); 4929 snprintf(buf, 32, "domain%d", i);
4906 entry->procname = kstrdup(buf, GFP_KERNEL); 4930 entry->procname = kstrdup(buf, GFP_KERNEL);
4907 entry->mode = 0555; 4931 entry->mode = 0555;
4908 entry->child = sd_alloc_ctl_domain_table(sd); 4932 entry->child = sd_alloc_ctl_domain_table(sd);
4909 entry++; 4933 entry++;
4910 i++; 4934 i++;
4911 } 4935 }
4912 return table; 4936 return table;
4913 } 4937 }
4914 4938
4915 static struct ctl_table_header *sd_sysctl_header; 4939 static struct ctl_table_header *sd_sysctl_header;
4916 static void register_sched_domain_sysctl(void) 4940 static void register_sched_domain_sysctl(void)
4917 { 4941 {
4918 int i, cpu_num = num_possible_cpus(); 4942 int i, cpu_num = num_possible_cpus();
4919 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 4943 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
4920 char buf[32]; 4944 char buf[32];
4921 4945
4922 WARN_ON(sd_ctl_dir[0].child); 4946 WARN_ON(sd_ctl_dir[0].child);
4923 sd_ctl_dir[0].child = entry; 4947 sd_ctl_dir[0].child = entry;
4924 4948
4925 if (entry == NULL) 4949 if (entry == NULL)
4926 return; 4950 return;
4927 4951
4928 for_each_possible_cpu(i) { 4952 for_each_possible_cpu(i) {
4929 snprintf(buf, 32, "cpu%d", i); 4953 snprintf(buf, 32, "cpu%d", i);
4930 entry->procname = kstrdup(buf, GFP_KERNEL); 4954 entry->procname = kstrdup(buf, GFP_KERNEL);
4931 entry->mode = 0555; 4955 entry->mode = 0555;
4932 entry->child = sd_alloc_ctl_cpu_table(i); 4956 entry->child = sd_alloc_ctl_cpu_table(i);
4933 entry++; 4957 entry++;
4934 } 4958 }
4935 4959
4936 WARN_ON(sd_sysctl_header); 4960 WARN_ON(sd_sysctl_header);
4937 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 4961 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
4938 } 4962 }
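/*
 * Editor's note: an illustrative (abridged) view of the sysctl tree that the
 * code above constructs, with one directory per possible CPU, one per
 * sched_domain level, and the entries set up in sd_alloc_ctl_domain_table():
 *
 *   /proc/sys/kernel/sched_domain/cpu0/domain0/min_interval
 *   /proc/sys/kernel/sched_domain/cpu0/domain0/max_interval
 *   /proc/sys/kernel/sched_domain/cpu0/domain0/busy_idx
 *   ...
 *   /proc/sys/kernel/sched_domain/cpu0/domain1/name
 */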
4939 4963
4940 /* may be called multiple times per register */ 4964 /* may be called multiple times per register */
4941 static void unregister_sched_domain_sysctl(void) 4965 static void unregister_sched_domain_sysctl(void)
4942 { 4966 {
4943 if (sd_sysctl_header) 4967 if (sd_sysctl_header)
4944 unregister_sysctl_table(sd_sysctl_header); 4968 unregister_sysctl_table(sd_sysctl_header);
4945 sd_sysctl_header = NULL; 4969 sd_sysctl_header = NULL;
4946 if (sd_ctl_dir[0].child) 4970 if (sd_ctl_dir[0].child)
4947 sd_free_ctl_entry(&sd_ctl_dir[0].child); 4971 sd_free_ctl_entry(&sd_ctl_dir[0].child);
4948 } 4972 }
4949 #else 4973 #else
4950 static void register_sched_domain_sysctl(void) 4974 static void register_sched_domain_sysctl(void)
4951 { 4975 {
4952 } 4976 }
4953 static void unregister_sched_domain_sysctl(void) 4977 static void unregister_sched_domain_sysctl(void)
4954 { 4978 {
4955 } 4979 }
4956 #endif 4980 #endif
4957 4981
4958 static void set_rq_online(struct rq *rq) 4982 static void set_rq_online(struct rq *rq)
4959 { 4983 {
4960 if (!rq->online) { 4984 if (!rq->online) {
4961 const struct sched_class *class; 4985 const struct sched_class *class;
4962 4986
4963 cpumask_set_cpu(rq->cpu, rq->rd->online); 4987 cpumask_set_cpu(rq->cpu, rq->rd->online);
4964 rq->online = 1; 4988 rq->online = 1;
4965 4989
4966 for_each_class(class) { 4990 for_each_class(class) {
4967 if (class->rq_online) 4991 if (class->rq_online)
4968 class->rq_online(rq); 4992 class->rq_online(rq);
4969 } 4993 }
4970 } 4994 }
4971 } 4995 }
4972 4996
4973 static void set_rq_offline(struct rq *rq) 4997 static void set_rq_offline(struct rq *rq)
4974 { 4998 {
4975 if (rq->online) { 4999 if (rq->online) {
4976 const struct sched_class *class; 5000 const struct sched_class *class;
4977 5001
4978 for_each_class(class) { 5002 for_each_class(class) {
4979 if (class->rq_offline) 5003 if (class->rq_offline)
4980 class->rq_offline(rq); 5004 class->rq_offline(rq);
4981 } 5005 }
4982 5006
4983 cpumask_clear_cpu(rq->cpu, rq->rd->online); 5007 cpumask_clear_cpu(rq->cpu, rq->rd->online);
4984 rq->online = 0; 5008 rq->online = 0;
4985 } 5009 }
4986 } 5010 }
4987 5011
4988 /* 5012 /*
4989 * migration_call - callback that gets triggered when a CPU is added. 5013 * migration_call - callback that gets triggered when a CPU is added.
4990 * Here we can start up the necessary migration thread for the new CPU. 5014 * Here we can start up the necessary migration thread for the new CPU.
4991 */ 5015 */
4992 static int 5016 static int
4993 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 5017 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
4994 { 5018 {
4995 int cpu = (long)hcpu; 5019 int cpu = (long)hcpu;
4996 unsigned long flags; 5020 unsigned long flags;
4997 struct rq *rq = cpu_rq(cpu); 5021 struct rq *rq = cpu_rq(cpu);
4998 5022
4999 switch (action & ~CPU_TASKS_FROZEN) { 5023 switch (action & ~CPU_TASKS_FROZEN) {
5000 5024
5001 case CPU_UP_PREPARE: 5025 case CPU_UP_PREPARE:
5002 rq->calc_load_update = calc_load_update; 5026 rq->calc_load_update = calc_load_update;
5003 break; 5027 break;
5004 5028
5005 case CPU_ONLINE: 5029 case CPU_ONLINE:
5006 /* Update our root-domain */ 5030 /* Update our root-domain */
5007 raw_spin_lock_irqsave(&rq->lock, flags); 5031 raw_spin_lock_irqsave(&rq->lock, flags);
5008 if (rq->rd) { 5032 if (rq->rd) {
5009 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5033 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5010 5034
5011 set_rq_online(rq); 5035 set_rq_online(rq);
5012 } 5036 }
5013 raw_spin_unlock_irqrestore(&rq->lock, flags); 5037 raw_spin_unlock_irqrestore(&rq->lock, flags);
5014 break; 5038 break;
5015 5039
5016 #ifdef CONFIG_HOTPLUG_CPU 5040 #ifdef CONFIG_HOTPLUG_CPU
5017 case CPU_DYING: 5041 case CPU_DYING:
5018 sched_ttwu_pending(); 5042 sched_ttwu_pending();
5019 /* Update our root-domain */ 5043 /* Update our root-domain */
5020 raw_spin_lock_irqsave(&rq->lock, flags); 5044 raw_spin_lock_irqsave(&rq->lock, flags);
5021 if (rq->rd) { 5045 if (rq->rd) {
5022 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5046 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5023 set_rq_offline(rq); 5047 set_rq_offline(rq);
5024 } 5048 }
5025 migrate_tasks(cpu); 5049 migrate_tasks(cpu);
5026 BUG_ON(rq->nr_running != 1); /* the migration thread */ 5050 BUG_ON(rq->nr_running != 1); /* the migration thread */
5027 raw_spin_unlock_irqrestore(&rq->lock, flags); 5051 raw_spin_unlock_irqrestore(&rq->lock, flags);
5028 break; 5052 break;
5029 5053
5030 case CPU_DEAD: 5054 case CPU_DEAD:
5031 calc_load_migrate(rq); 5055 calc_load_migrate(rq);
5032 break; 5056 break;
5033 #endif 5057 #endif
5034 } 5058 }
5035 5059
5036 update_max_interval(); 5060 update_max_interval();
5037 5061
5038 return NOTIFY_OK; 5062 return NOTIFY_OK;
5039 } 5063 }
5040 5064
5041 /* 5065 /*
5042 * Register at high priority so that task migration (migrate_all_tasks) 5066 * Register at high priority so that task migration (migrate_all_tasks)
5043 * happens before everything else. This has to be lower priority than 5067 * happens before everything else. This has to be lower priority than
5044 * the notifier in the perf_event subsystem, though. 5068 * the notifier in the perf_event subsystem, though.
5045 */ 5069 */
5046 static struct notifier_block migration_notifier = { 5070 static struct notifier_block migration_notifier = {
5047 .notifier_call = migration_call, 5071 .notifier_call = migration_call,
5048 .priority = CPU_PRI_MIGRATION, 5072 .priority = CPU_PRI_MIGRATION,
5049 }; 5073 };
5050 5074
5051 static int sched_cpu_active(struct notifier_block *nfb, 5075 static int sched_cpu_active(struct notifier_block *nfb,
5052 unsigned long action, void *hcpu) 5076 unsigned long action, void *hcpu)
5053 { 5077 {
5054 switch (action & ~CPU_TASKS_FROZEN) { 5078 switch (action & ~CPU_TASKS_FROZEN) {
5055 case CPU_STARTING:
5056 case CPU_DOWN_FAILED: 5079 case CPU_DOWN_FAILED:
5057 set_cpu_active((long)hcpu, true); 5080 set_cpu_active((long)hcpu, true);
5058 return NOTIFY_OK; 5081 return NOTIFY_OK;
5059 default: 5082 default:
5060 return NOTIFY_DONE; 5083 return NOTIFY_DONE;
5061 } 5084 }
5062 } 5085 }
5063 5086
5064 static int sched_cpu_inactive(struct notifier_block *nfb, 5087 static int sched_cpu_inactive(struct notifier_block *nfb,
5065 unsigned long action, void *hcpu) 5088 unsigned long action, void *hcpu)
5066 { 5089 {
5067 unsigned long flags; 5090 unsigned long flags;
5068 long cpu = (long)hcpu; 5091 long cpu = (long)hcpu;
5069 5092
5070 switch (action & ~CPU_TASKS_FROZEN) { 5093 switch (action & ~CPU_TASKS_FROZEN) {
5071 case CPU_DOWN_PREPARE: 5094 case CPU_DOWN_PREPARE:
5072 set_cpu_active(cpu, false); 5095 set_cpu_active(cpu, false);
5073 5096
5074 /* explicitly allow suspend */ 5097 /* explicitly allow suspend */
5075 if (!(action & CPU_TASKS_FROZEN)) { 5098 if (!(action & CPU_TASKS_FROZEN)) {
5076 struct dl_bw *dl_b = dl_bw_of(cpu); 5099 struct dl_bw *dl_b = dl_bw_of(cpu);
5077 bool overflow; 5100 bool overflow;
5078 int cpus; 5101 int cpus;
5079 5102
5080 raw_spin_lock_irqsave(&dl_b->lock, flags); 5103 raw_spin_lock_irqsave(&dl_b->lock, flags);
5081 cpus = dl_bw_cpus(cpu); 5104 cpus = dl_bw_cpus(cpu);
5082 overflow = __dl_overflow(dl_b, cpus, 0, 0); 5105 overflow = __dl_overflow(dl_b, cpus, 0, 0);
5083 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 5106 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
5084 5107
5085 if (overflow) 5108 if (overflow)
5086 return notifier_from_errno(-EBUSY); 5109 return notifier_from_errno(-EBUSY);
5087 } 5110 }
5088 return NOTIFY_OK; 5111 return NOTIFY_OK;
5089 } 5112 }
5090 5113
5091 return NOTIFY_DONE; 5114 return NOTIFY_DONE;
5092 } 5115 }
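/*
 * Editor's note (assumption, paraphrasing __dl_overflow() from
 * kernel/sched/sched.h): with old_bw == new_bw == 0 the check above asks,
 * roughly, whether the SCHED_DEADLINE bandwidth already admitted still fits
 * on the CPUs that would remain after this one goes down; if not, the
 * hot-unplug is refused with -EBUSY rather than breaking admitted
 * reservations.
 */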
5093 5116
5094 static int __init migration_init(void) 5117 static int __init migration_init(void)
5095 { 5118 {
5096 void *cpu = (void *)(long)smp_processor_id(); 5119 void *cpu = (void *)(long)smp_processor_id();
5097 int err; 5120 int err;
5098 5121
5099 /* Initialize migration for the boot CPU */ 5122 /* Initialize migration for the boot CPU */
5100 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 5123 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5101 BUG_ON(err == NOTIFY_BAD); 5124 BUG_ON(err == NOTIFY_BAD);
5102 migration_call(&migration_notifier, CPU_ONLINE, cpu); 5125 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5103 register_cpu_notifier(&migration_notifier); 5126 register_cpu_notifier(&migration_notifier);
5104 5127
5105 /* Register cpu active notifiers */ 5128 /* Register cpu active notifiers */
5106 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); 5129 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
5107 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); 5130 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
5108 5131
5109 return 0; 5132 return 0;
5110 } 5133 }
5111 early_initcall(migration_init); 5134 early_initcall(migration_init);
5112 #endif 5135 #endif
5113 5136
5114 #ifdef CONFIG_SMP 5137 #ifdef CONFIG_SMP
5115 5138
5116 static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ 5139 static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
5117 5140
5118 #ifdef CONFIG_SCHED_DEBUG 5141 #ifdef CONFIG_SCHED_DEBUG
5119 5142
5120 static __read_mostly int sched_debug_enabled; 5143 static __read_mostly int sched_debug_enabled;
5121 5144
5122 static int __init sched_debug_setup(char *str) 5145 static int __init sched_debug_setup(char *str)
5123 { 5146 {
5124 sched_debug_enabled = 1; 5147 sched_debug_enabled = 1;
5125 5148
5126 return 0; 5149 return 0;
5127 } 5150 }
5128 early_param("sched_debug", sched_debug_setup); 5151 early_param("sched_debug", sched_debug_setup);
5129 5152
5130 static inline bool sched_debug(void) 5153 static inline bool sched_debug(void)
5131 { 5154 {
5132 return sched_debug_enabled; 5155 return sched_debug_enabled;
5133 } 5156 }
5134 5157
5135 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 5158 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5136 struct cpumask *groupmask) 5159 struct cpumask *groupmask)
5137 { 5160 {
5138 struct sched_group *group = sd->groups; 5161 struct sched_group *group = sd->groups;
5139 char str[256]; 5162 char str[256];
5140 5163
5141 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); 5164 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
5142 cpumask_clear(groupmask); 5165 cpumask_clear(groupmask);
5143 5166
5144 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 5167 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
5145 5168
5146 if (!(sd->flags & SD_LOAD_BALANCE)) { 5169 if (!(sd->flags & SD_LOAD_BALANCE)) {
5147 printk("does not load-balance\n"); 5170 printk("does not load-balance\n");
5148 if (sd->parent) 5171 if (sd->parent)
5149 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 5172 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5150 " has parent"); 5173 " has parent");
5151 return -1; 5174 return -1;
5152 } 5175 }
5153 5176
5154 printk(KERN_CONT "span %s level %s\n", str, sd->name); 5177 printk(KERN_CONT "span %s level %s\n", str, sd->name);
5155 5178
5156 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 5179 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
5157 printk(KERN_ERR "ERROR: domain->span does not contain " 5180 printk(KERN_ERR "ERROR: domain->span does not contain "
5158 "CPU%d\n", cpu); 5181 "CPU%d\n", cpu);
5159 } 5182 }
5160 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { 5183 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
5161 printk(KERN_ERR "ERROR: domain->groups does not contain" 5184 printk(KERN_ERR "ERROR: domain->groups does not contain"
5162 " CPU%d\n", cpu); 5185 " CPU%d\n", cpu);
5163 } 5186 }
5164 5187
5165 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 5188 printk(KERN_DEBUG "%*s groups:", level + 1, "");
5166 do { 5189 do {
5167 if (!group) { 5190 if (!group) {
5168 printk("\n"); 5191 printk("\n");
5169 printk(KERN_ERR "ERROR: group is NULL\n"); 5192 printk(KERN_ERR "ERROR: group is NULL\n");
5170 break; 5193 break;
5171 } 5194 }
5172 5195
5173 /* 5196 /*
5174 * Even though we initialize ->power to something semi-sane, 5197 * Even though we initialize ->power to something semi-sane,
5175 * we leave power_orig unset. This allows us to detect if 5198 * we leave power_orig unset. This allows us to detect if
5176 * domain iteration is still funny without causing /0 traps. 5199 * domain iteration is still funny without causing /0 traps.
5177 */ 5200 */
5178 if (!group->sgp->power_orig) { 5201 if (!group->sgp->power_orig) {
5179 printk(KERN_CONT "\n"); 5202 printk(KERN_CONT "\n");
5180 printk(KERN_ERR "ERROR: domain->cpu_power not " 5203 printk(KERN_ERR "ERROR: domain->cpu_power not "
5181 "set\n"); 5204 "set\n");
5182 break; 5205 break;
5183 } 5206 }
5184 5207
5185 if (!cpumask_weight(sched_group_cpus(group))) { 5208 if (!cpumask_weight(sched_group_cpus(group))) {
5186 printk(KERN_CONT "\n"); 5209 printk(KERN_CONT "\n");
5187 printk(KERN_ERR "ERROR: empty group\n"); 5210 printk(KERN_ERR "ERROR: empty group\n");
5188 break; 5211 break;
5189 } 5212 }
5190 5213
5191 if (!(sd->flags & SD_OVERLAP) && 5214 if (!(sd->flags & SD_OVERLAP) &&
5192 cpumask_intersects(groupmask, sched_group_cpus(group))) { 5215 cpumask_intersects(groupmask, sched_group_cpus(group))) {
5193 printk(KERN_CONT "\n"); 5216 printk(KERN_CONT "\n");
5194 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5217 printk(KERN_ERR "ERROR: repeated CPUs\n");
5195 break; 5218 break;
5196 } 5219 }
5197 5220
5198 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 5221 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
5199 5222
5200 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 5223 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
5201 5224
5202 printk(KERN_CONT " %s", str); 5225 printk(KERN_CONT " %s", str);
5203 if (group->sgp->power != SCHED_POWER_SCALE) { 5226 if (group->sgp->power != SCHED_POWER_SCALE) {
5204 printk(KERN_CONT " (cpu_power = %d)", 5227 printk(KERN_CONT " (cpu_power = %d)",
5205 group->sgp->power); 5228 group->sgp->power);
5206 } 5229 }
5207 5230
5208 group = group->next; 5231 group = group->next;
5209 } while (group != sd->groups); 5232 } while (group != sd->groups);
5210 printk(KERN_CONT "\n"); 5233 printk(KERN_CONT "\n");
5211 5234
5212 if (!cpumask_equal(sched_domain_span(sd), groupmask)) 5235 if (!cpumask_equal(sched_domain_span(sd), groupmask))
5213 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 5236 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
5214 5237
5215 if (sd->parent && 5238 if (sd->parent &&
5216 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 5239 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
5217 printk(KERN_ERR "ERROR: parent span is not a superset " 5240 printk(KERN_ERR "ERROR: parent span is not a superset "
5218 "of domain->span\n"); 5241 "of domain->span\n");
5219 return 0; 5242 return 0;
5220 } 5243 }
5221 5244
5222 static void sched_domain_debug(struct sched_domain *sd, int cpu) 5245 static void sched_domain_debug(struct sched_domain *sd, int cpu)
5223 { 5246 {
5224 int level = 0; 5247 int level = 0;
5225 5248
5226 if (!sched_debug_enabled) 5249 if (!sched_debug_enabled)
5227 return; 5250 return;
5228 5251
5229 if (!sd) { 5252 if (!sd) {
5230 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 5253 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5231 return; 5254 return;
5232 } 5255 }
5233 5256
5234 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 5257 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5235 5258
5236 for (;;) { 5259 for (;;) {
5237 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) 5260 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
5238 break; 5261 break;
5239 level++; 5262 level++;
5240 sd = sd->parent; 5263 sd = sd->parent;
5241 if (!sd) 5264 if (!sd)
5242 break; 5265 break;
5243 } 5266 }
5244 } 5267 }
5245 #else /* !CONFIG_SCHED_DEBUG */ 5268 #else /* !CONFIG_SCHED_DEBUG */
5246 # define sched_domain_debug(sd, cpu) do { } while (0) 5269 # define sched_domain_debug(sd, cpu) do { } while (0)
5247 static inline bool sched_debug(void) 5270 static inline bool sched_debug(void)
5248 { 5271 {
5249 return false; 5272 return false;
5250 } 5273 }
5251 #endif /* CONFIG_SCHED_DEBUG */ 5274 #endif /* CONFIG_SCHED_DEBUG */
5252 5275
5253 static int sd_degenerate(struct sched_domain *sd) 5276 static int sd_degenerate(struct sched_domain *sd)
5254 { 5277 {
5255 if (cpumask_weight(sched_domain_span(sd)) == 1) 5278 if (cpumask_weight(sched_domain_span(sd)) == 1)
5256 return 1; 5279 return 1;
5257 5280
5258 /* Following flags need at least 2 groups */ 5281 /* Following flags need at least 2 groups */
5259 if (sd->flags & (SD_LOAD_BALANCE | 5282 if (sd->flags & (SD_LOAD_BALANCE |
5260 SD_BALANCE_NEWIDLE | 5283 SD_BALANCE_NEWIDLE |
5261 SD_BALANCE_FORK | 5284 SD_BALANCE_FORK |
5262 SD_BALANCE_EXEC | 5285 SD_BALANCE_EXEC |
5263 SD_SHARE_CPUPOWER | 5286 SD_SHARE_CPUPOWER |
5264 SD_SHARE_PKG_RESOURCES)) { 5287 SD_SHARE_PKG_RESOURCES)) {
5265 if (sd->groups != sd->groups->next) 5288 if (sd->groups != sd->groups->next)
5266 return 0; 5289 return 0;
5267 } 5290 }
5268 5291
5269 /* Following flags don't use groups */ 5292 /* Following flags don't use groups */
5270 if (sd->flags & (SD_WAKE_AFFINE)) 5293 if (sd->flags & (SD_WAKE_AFFINE))
5271 return 0; 5294 return 0;
5272 5295
5273 return 1; 5296 return 1;
5274 } 5297 }
5275 5298
5276 static int 5299 static int
5277 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 5300 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5278 { 5301 {
5279 unsigned long cflags = sd->flags, pflags = parent->flags; 5302 unsigned long cflags = sd->flags, pflags = parent->flags;
5280 5303
5281 if (sd_degenerate(parent)) 5304 if (sd_degenerate(parent))
5282 return 1; 5305 return 1;
5283 5306
5284 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 5307 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
5285 return 0; 5308 return 0;
5286 5309
5287 /* Flags needing groups don't count if only 1 group in parent */ 5310 /* Flags needing groups don't count if only 1 group in parent */
5288 if (parent->groups == parent->groups->next) { 5311 if (parent->groups == parent->groups->next) {
5289 pflags &= ~(SD_LOAD_BALANCE | 5312 pflags &= ~(SD_LOAD_BALANCE |
5290 SD_BALANCE_NEWIDLE | 5313 SD_BALANCE_NEWIDLE |
5291 SD_BALANCE_FORK | 5314 SD_BALANCE_FORK |
5292 SD_BALANCE_EXEC | 5315 SD_BALANCE_EXEC |
5293 SD_SHARE_CPUPOWER | 5316 SD_SHARE_CPUPOWER |
5294 SD_SHARE_PKG_RESOURCES | 5317 SD_SHARE_PKG_RESOURCES |
5295 SD_PREFER_SIBLING); 5318 SD_PREFER_SIBLING);
5296 if (nr_node_ids == 1) 5319 if (nr_node_ids == 1)
5297 pflags &= ~SD_SERIALIZE; 5320 pflags &= ~SD_SERIALIZE;
5298 } 5321 }
5299 if (~cflags & pflags) 5322 if (~cflags & pflags)
5300 return 0; 5323 return 0;
5301 5324
5302 return 1; 5325 return 1;
5303 } 5326 }
5304 5327
5305 static void free_rootdomain(struct rcu_head *rcu) 5328 static void free_rootdomain(struct rcu_head *rcu)
5306 { 5329 {
5307 struct root_domain *rd = container_of(rcu, struct root_domain, rcu); 5330 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
5308 5331
5309 cpupri_cleanup(&rd->cpupri); 5332 cpupri_cleanup(&rd->cpupri);
5310 cpudl_cleanup(&rd->cpudl); 5333 cpudl_cleanup(&rd->cpudl);
5311 free_cpumask_var(rd->dlo_mask); 5334 free_cpumask_var(rd->dlo_mask);
5312 free_cpumask_var(rd->rto_mask); 5335 free_cpumask_var(rd->rto_mask);
5313 free_cpumask_var(rd->online); 5336 free_cpumask_var(rd->online);
5314 free_cpumask_var(rd->span); 5337 free_cpumask_var(rd->span);
5315 kfree(rd); 5338 kfree(rd);
5316 } 5339 }
5317 5340
5318 static void rq_attach_root(struct rq *rq, struct root_domain *rd) 5341 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
5319 { 5342 {
5320 struct root_domain *old_rd = NULL; 5343 struct root_domain *old_rd = NULL;
5321 unsigned long flags; 5344 unsigned long flags;
5322 5345
5323 raw_spin_lock_irqsave(&rq->lock, flags); 5346 raw_spin_lock_irqsave(&rq->lock, flags);
5324 5347
5325 if (rq->rd) { 5348 if (rq->rd) {
5326 old_rd = rq->rd; 5349 old_rd = rq->rd;
5327 5350
5328 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 5351 if (cpumask_test_cpu(rq->cpu, old_rd->online))
5329 set_rq_offline(rq); 5352 set_rq_offline(rq);
5330 5353
5331 cpumask_clear_cpu(rq->cpu, old_rd->span); 5354 cpumask_clear_cpu(rq->cpu, old_rd->span);
5332 5355
5333 /* 5356 /*
5334 * If we don't want to free the old_rd yet then 5357 * If we don't want to free the old_rd yet then
5335 * set old_rd to NULL to skip the freeing later 5358 * set old_rd to NULL to skip the freeing later
5336 * in this function: 5359 * in this function:
5337 */ 5360 */
5338 if (!atomic_dec_and_test(&old_rd->refcount)) 5361 if (!atomic_dec_and_test(&old_rd->refcount))
5339 old_rd = NULL; 5362 old_rd = NULL;
5340 } 5363 }
5341 5364
5342 atomic_inc(&rd->refcount); 5365 atomic_inc(&rd->refcount);
5343 rq->rd = rd; 5366 rq->rd = rd;
5344 5367
5345 cpumask_set_cpu(rq->cpu, rd->span); 5368 cpumask_set_cpu(rq->cpu, rd->span);
5346 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 5369 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
5347 set_rq_online(rq); 5370 set_rq_online(rq);
5348 5371
5349 raw_spin_unlock_irqrestore(&rq->lock, flags); 5372 raw_spin_unlock_irqrestore(&rq->lock, flags);
5350 5373
5351 if (old_rd) 5374 if (old_rd)
5352 call_rcu_sched(&old_rd->rcu, free_rootdomain); 5375 call_rcu_sched(&old_rd->rcu, free_rootdomain);
5353 } 5376 }
5354 5377
5355 static int init_rootdomain(struct root_domain *rd) 5378 static int init_rootdomain(struct root_domain *rd)
5356 { 5379 {
5357 memset(rd, 0, sizeof(*rd)); 5380 memset(rd, 0, sizeof(*rd));
5358 5381
5359 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) 5382 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
5360 goto out; 5383 goto out;
5361 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 5384 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
5362 goto free_span; 5385 goto free_span;
5363 if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) 5386 if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
5364 goto free_online; 5387 goto free_online;
5365 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 5388 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
5366 goto free_dlo_mask; 5389 goto free_dlo_mask;
5367 5390
5368 init_dl_bw(&rd->dl_bw); 5391 init_dl_bw(&rd->dl_bw);
5369 if (cpudl_init(&rd->cpudl) != 0) 5392 if (cpudl_init(&rd->cpudl) != 0)
5370 goto free_dlo_mask; 5393 goto free_dlo_mask;
5371 5394
5372 if (cpupri_init(&rd->cpupri) != 0) 5395 if (cpupri_init(&rd->cpupri) != 0)
5373 goto free_rto_mask; 5396 goto free_rto_mask;
5374 return 0; 5397 return 0;
5375 5398
5376 free_rto_mask: 5399 free_rto_mask:
5377 free_cpumask_var(rd->rto_mask); 5400 free_cpumask_var(rd->rto_mask);
5378 free_dlo_mask: 5401 free_dlo_mask:
5379 free_cpumask_var(rd->dlo_mask); 5402 free_cpumask_var(rd->dlo_mask);
5380 free_online: 5403 free_online:
5381 free_cpumask_var(rd->online); 5404 free_cpumask_var(rd->online);
5382 free_span: 5405 free_span:
5383 free_cpumask_var(rd->span); 5406 free_cpumask_var(rd->span);
5384 out: 5407 out:
5385 return -ENOMEM; 5408 return -ENOMEM;
5386 } 5409 }
5387 5410
5388 /* 5411 /*
5389 * By default the system creates a single root-domain with all cpus as 5412 * By default the system creates a single root-domain with all cpus as
5390 * members (mimicking the global state we have today). 5413 * members (mimicking the global state we have today).
5391 */ 5414 */
5392 struct root_domain def_root_domain; 5415 struct root_domain def_root_domain;
5393 5416
5394 static void init_defrootdomain(void) 5417 static void init_defrootdomain(void)
5395 { 5418 {
5396 init_rootdomain(&def_root_domain); 5419 init_rootdomain(&def_root_domain);
5397 5420
5398 atomic_set(&def_root_domain.refcount, 1); 5421 atomic_set(&def_root_domain.refcount, 1);
5399 } 5422 }
5400 5423
5401 static struct root_domain *alloc_rootdomain(void) 5424 static struct root_domain *alloc_rootdomain(void)
5402 { 5425 {
5403 struct root_domain *rd; 5426 struct root_domain *rd;
5404 5427
5405 rd = kmalloc(sizeof(*rd), GFP_KERNEL); 5428 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
5406 if (!rd) 5429 if (!rd)
5407 return NULL; 5430 return NULL;
5408 5431
5409 if (init_rootdomain(rd) != 0) { 5432 if (init_rootdomain(rd) != 0) {
5410 kfree(rd); 5433 kfree(rd);
5411 return NULL; 5434 return NULL;
5412 } 5435 }
5413 5436
5414 return rd; 5437 return rd;
5415 } 5438 }
5416 5439
5417 static void free_sched_groups(struct sched_group *sg, int free_sgp) 5440 static void free_sched_groups(struct sched_group *sg, int free_sgp)
5418 { 5441 {
5419 struct sched_group *tmp, *first; 5442 struct sched_group *tmp, *first;
5420 5443
5421 if (!sg) 5444 if (!sg)
5422 return; 5445 return;
5423 5446
5424 first = sg; 5447 first = sg;
5425 do { 5448 do {
5426 tmp = sg->next; 5449 tmp = sg->next;
5427 5450
5428 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) 5451 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
5429 kfree(sg->sgp); 5452 kfree(sg->sgp);
5430 5453
5431 kfree(sg); 5454 kfree(sg);
5432 sg = tmp; 5455 sg = tmp;
5433 } while (sg != first); 5456 } while (sg != first);
5434 } 5457 }
5435 5458
5436 static void free_sched_domain(struct rcu_head *rcu) 5459 static void free_sched_domain(struct rcu_head *rcu)
5437 { 5460 {
5438 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); 5461 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
5439 5462
5440 /* 5463 /*
5441 * If it's an overlapping domain it has private groups, iterate and 5464 * If it's an overlapping domain it has private groups, iterate and
5442 * nuke them all. 5465 * nuke them all.
5443 */ 5466 */
5444 if (sd->flags & SD_OVERLAP) { 5467 if (sd->flags & SD_OVERLAP) {
5445 free_sched_groups(sd->groups, 1); 5468 free_sched_groups(sd->groups, 1);
5446 } else if (atomic_dec_and_test(&sd->groups->ref)) { 5469 } else if (atomic_dec_and_test(&sd->groups->ref)) {
5447 kfree(sd->groups->sgp); 5470 kfree(sd->groups->sgp);
5448 kfree(sd->groups); 5471 kfree(sd->groups);
5449 } 5472 }
5450 kfree(sd); 5473 kfree(sd);
5451 } 5474 }
5452 5475
5453 static void destroy_sched_domain(struct sched_domain *sd, int cpu) 5476 static void destroy_sched_domain(struct sched_domain *sd, int cpu)
5454 { 5477 {
5455 call_rcu(&sd->rcu, free_sched_domain); 5478 call_rcu(&sd->rcu, free_sched_domain);
5456 } 5479 }
5457 5480
5458 static void destroy_sched_domains(struct sched_domain *sd, int cpu) 5481 static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5459 { 5482 {
5460 for (; sd; sd = sd->parent) 5483 for (; sd; sd = sd->parent)
5461 destroy_sched_domain(sd, cpu); 5484 destroy_sched_domain(sd, cpu);
5462 } 5485 }
5463 5486
5464 /* 5487 /*
5465 * Keep a special pointer to the highest sched_domain that has 5488 * Keep a special pointer to the highest sched_domain that has
5466 * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this 5489 * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this
5467 * allows us to avoid some pointer chasing in select_idle_sibling(). 5490 * allows us to avoid some pointer chasing in select_idle_sibling().
5468 * 5491 *
5469 * Also keep a unique ID per domain (we use the first cpu number in 5492 * Also keep a unique ID per domain (we use the first cpu number in
5470 * the cpumask of the domain); this allows us to quickly tell if 5493 * the cpumask of the domain); this allows us to quickly tell if
5471 * two cpus are in the same cache domain; see cpus_share_cache(). 5494 * two cpus are in the same cache domain; see cpus_share_cache().
5472 */ 5495 */
5473 DEFINE_PER_CPU(struct sched_domain *, sd_llc); 5496 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5474 DEFINE_PER_CPU(int, sd_llc_size); 5497 DEFINE_PER_CPU(int, sd_llc_size);
5475 DEFINE_PER_CPU(int, sd_llc_id); 5498 DEFINE_PER_CPU(int, sd_llc_id);
5476 DEFINE_PER_CPU(struct sched_domain *, sd_numa); 5499 DEFINE_PER_CPU(struct sched_domain *, sd_numa);
5477 DEFINE_PER_CPU(struct sched_domain *, sd_busy); 5500 DEFINE_PER_CPU(struct sched_domain *, sd_busy);
5478 DEFINE_PER_CPU(struct sched_domain *, sd_asym); 5501 DEFINE_PER_CPU(struct sched_domain *, sd_asym);
5479 5502
5480 static void update_top_cache_domain(int cpu) 5503 static void update_top_cache_domain(int cpu)
5481 { 5504 {
5482 struct sched_domain *sd; 5505 struct sched_domain *sd;
5483 struct sched_domain *busy_sd = NULL; 5506 struct sched_domain *busy_sd = NULL;
5484 int id = cpu; 5507 int id = cpu;
5485 int size = 1; 5508 int size = 1;
5486 5509
5487 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); 5510 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
5488 if (sd) { 5511 if (sd) {
5489 id = cpumask_first(sched_domain_span(sd)); 5512 id = cpumask_first(sched_domain_span(sd));
5490 size = cpumask_weight(sched_domain_span(sd)); 5513 size = cpumask_weight(sched_domain_span(sd));
5491 busy_sd = sd->parent; /* sd_busy */ 5514 busy_sd = sd->parent; /* sd_busy */
5492 } 5515 }
5493 rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd); 5516 rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
5494 5517
5495 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 5518 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5496 per_cpu(sd_llc_size, cpu) = size; 5519 per_cpu(sd_llc_size, cpu) = size;
5497 per_cpu(sd_llc_id, cpu) = id; 5520 per_cpu(sd_llc_id, cpu) = id;
5498 5521
5499 sd = lowest_flag_domain(cpu, SD_NUMA); 5522 sd = lowest_flag_domain(cpu, SD_NUMA);
5500 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); 5523 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
5501 5524
5502 sd = highest_flag_domain(cpu, SD_ASYM_PACKING); 5525 sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
5503 rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); 5526 rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
5504 } 5527 }
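/*
 * [Editor's note -- illustrative sketch, not part of the kernel source
 * shown in this diff]
 * The cached per-cpu sd_llc_id written above is what makes the "are these
 * two cpus in the same cache domain?" test cheap: instead of walking the
 * domain tree, a helper along the lines of cpus_share_cache() (defined
 * elsewhere in this file) only has to compare two integers:
 *
 *	bool cpus_share_cache(int this_cpu, int that_cpu)
 *	{
 *		return per_cpu(sd_llc_id, this_cpu) ==
 *		       per_cpu(sd_llc_id, that_cpu);
 *	}
 */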
5505 5528
5506 /* 5529 /*
5507 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 5530 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5508 * hold the hotplug lock. 5531 * hold the hotplug lock.
5509 */ 5532 */
5510 static void 5533 static void
5511 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) 5534 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
5512 { 5535 {
5513 struct rq *rq = cpu_rq(cpu); 5536 struct rq *rq = cpu_rq(cpu);
5514 struct sched_domain *tmp; 5537 struct sched_domain *tmp;
5515 5538
5516 /* Remove the sched domains which do not contribute to scheduling. */ 5539 /* Remove the sched domains which do not contribute to scheduling. */
5517 for (tmp = sd; tmp; ) { 5540 for (tmp = sd; tmp; ) {
5518 struct sched_domain *parent = tmp->parent; 5541 struct sched_domain *parent = tmp->parent;
5519 if (!parent) 5542 if (!parent)
5520 break; 5543 break;
5521 5544
5522 if (sd_parent_degenerate(tmp, parent)) { 5545 if (sd_parent_degenerate(tmp, parent)) {
5523 tmp->parent = parent->parent; 5546 tmp->parent = parent->parent;
5524 if (parent->parent) 5547 if (parent->parent)
5525 parent->parent->child = tmp; 5548 parent->parent->child = tmp;
5526 /* 5549 /*
5527 * Transfer SD_PREFER_SIBLING down in case of a 5550 * Transfer SD_PREFER_SIBLING down in case of a
5528 * degenerate parent; the spans match for this 5551 * degenerate parent; the spans match for this
5529 * so the property transfers. 5552 * so the property transfers.
5530 */ 5553 */
5531 if (parent->flags & SD_PREFER_SIBLING) 5554 if (parent->flags & SD_PREFER_SIBLING)
5532 tmp->flags |= SD_PREFER_SIBLING; 5555 tmp->flags |= SD_PREFER_SIBLING;
5533 destroy_sched_domain(parent, cpu); 5556 destroy_sched_domain(parent, cpu);
5534 } else 5557 } else
5535 tmp = tmp->parent; 5558 tmp = tmp->parent;
5536 } 5559 }
5537 5560
5538 if (sd && sd_degenerate(sd)) { 5561 if (sd && sd_degenerate(sd)) {
5539 tmp = sd; 5562 tmp = sd;
5540 sd = sd->parent; 5563 sd = sd->parent;
5541 destroy_sched_domain(tmp, cpu); 5564 destroy_sched_domain(tmp, cpu);
5542 if (sd) 5565 if (sd)
5543 sd->child = NULL; 5566 sd->child = NULL;
5544 } 5567 }
5545 5568
5546 sched_domain_debug(sd, cpu); 5569 sched_domain_debug(sd, cpu);
5547 5570
5548 rq_attach_root(rq, rd); 5571 rq_attach_root(rq, rd);
5549 tmp = rq->sd; 5572 tmp = rq->sd;
5550 rcu_assign_pointer(rq->sd, sd); 5573 rcu_assign_pointer(rq->sd, sd);
5551 destroy_sched_domains(tmp, cpu); 5574 destroy_sched_domains(tmp, cpu);
5552 5575
5553 update_top_cache_domain(cpu); 5576 update_top_cache_domain(cpu);
5554 } 5577 }
5555 5578
5556 /* cpus with isolated domains */ 5579 /* cpus with isolated domains */
5557 static cpumask_var_t cpu_isolated_map; 5580 static cpumask_var_t cpu_isolated_map;
5558 5581
5559 /* Setup the mask of cpus configured for isolated domains */ 5582 /* Setup the mask of cpus configured for isolated domains */
5560 static int __init isolated_cpu_setup(char *str) 5583 static int __init isolated_cpu_setup(char *str)
5561 { 5584 {
5562 alloc_bootmem_cpumask_var(&cpu_isolated_map); 5585 alloc_bootmem_cpumask_var(&cpu_isolated_map);
5563 cpulist_parse(str, cpu_isolated_map); 5586 cpulist_parse(str, cpu_isolated_map);
5564 return 1; 5587 return 1;
5565 } 5588 }
5566 5589
5567 __setup("isolcpus=", isolated_cpu_setup); 5590 __setup("isolcpus=", isolated_cpu_setup);
5568 5591
5569 static const struct cpumask *cpu_cpu_mask(int cpu) 5592 static const struct cpumask *cpu_cpu_mask(int cpu)
5570 { 5593 {
5571 return cpumask_of_node(cpu_to_node(cpu)); 5594 return cpumask_of_node(cpu_to_node(cpu));
5572 } 5595 }
5573 5596
5574 struct sd_data { 5597 struct sd_data {
5575 struct sched_domain **__percpu sd; 5598 struct sched_domain **__percpu sd;
5576 struct sched_group **__percpu sg; 5599 struct sched_group **__percpu sg;
5577 struct sched_group_power **__percpu sgp; 5600 struct sched_group_power **__percpu sgp;
5578 }; 5601 };
5579 5602
5580 struct s_data { 5603 struct s_data {
5581 struct sched_domain ** __percpu sd; 5604 struct sched_domain ** __percpu sd;
5582 struct root_domain *rd; 5605 struct root_domain *rd;
5583 }; 5606 };
5584 5607
5585 enum s_alloc { 5608 enum s_alloc {
5586 sa_rootdomain, 5609 sa_rootdomain,
5587 sa_sd, 5610 sa_sd,
5588 sa_sd_storage, 5611 sa_sd_storage,
5589 sa_none, 5612 sa_none,
5590 }; 5613 };
5591 5614
5592 struct sched_domain_topology_level; 5615 struct sched_domain_topology_level;
5593 5616
5594 typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); 5617 typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
5595 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); 5618 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
5596 5619
5597 #define SDTL_OVERLAP 0x01 5620 #define SDTL_OVERLAP 0x01
5598 5621
5599 struct sched_domain_topology_level { 5622 struct sched_domain_topology_level {
5600 sched_domain_init_f init; 5623 sched_domain_init_f init;
5601 sched_domain_mask_f mask; 5624 sched_domain_mask_f mask;
5602 int flags; 5625 int flags;
5603 int numa_level; 5626 int numa_level;
5604 struct sd_data data; 5627 struct sd_data data;
5605 }; 5628 };
5606 5629
5607 /* 5630 /*
5608 * Build an iteration mask that can exclude certain CPUs from the upwards 5631 * Build an iteration mask that can exclude certain CPUs from the upwards
5609 * domain traversal. 5632 * domain traversal.
5610 * 5633 *
5611 * Asymmetric node setups can result in situations where the domain tree is of 5634 * Asymmetric node setups can result in situations where the domain tree is of
5612 * unequal depth; make sure to skip domains that already cover the entire 5635 * unequal depth; make sure to skip domains that already cover the entire
5613 * range. 5636 * range.
5614 * 5637 *
5615 * In that case build_sched_domains() will have terminated the iteration early 5638 * In that case build_sched_domains() will have terminated the iteration early
5616 * and our sibling sd spans will be empty. Domains should always include the 5639 * and our sibling sd spans will be empty. Domains should always include the
5617 * cpu they're built on, so check that. 5640 * cpu they're built on, so check that.
5618 * 5641 *
5619 */ 5642 */
5620 static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) 5643 static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
5621 { 5644 {
5622 const struct cpumask *span = sched_domain_span(sd); 5645 const struct cpumask *span = sched_domain_span(sd);
5623 struct sd_data *sdd = sd->private; 5646 struct sd_data *sdd = sd->private;
5624 struct sched_domain *sibling; 5647 struct sched_domain *sibling;
5625 int i; 5648 int i;
5626 5649
5627 for_each_cpu(i, span) { 5650 for_each_cpu(i, span) {
5628 sibling = *per_cpu_ptr(sdd->sd, i); 5651 sibling = *per_cpu_ptr(sdd->sd, i);
5629 if (!cpumask_test_cpu(i, sched_domain_span(sibling))) 5652 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
5630 continue; 5653 continue;
5631 5654
5632 cpumask_set_cpu(i, sched_group_mask(sg)); 5655 cpumask_set_cpu(i, sched_group_mask(sg));
5633 } 5656 }
5634 } 5657 }
5635 5658
5636 /* 5659 /*
5637 * Return the canonical balance cpu for this group; this is the first cpu 5660 * Return the canonical balance cpu for this group; this is the first cpu
5638 * of this group that's also in the iteration mask. 5661 * of this group that's also in the iteration mask.
5639 */ 5662 */
5640 int group_balance_cpu(struct sched_group *sg) 5663 int group_balance_cpu(struct sched_group *sg)
5641 { 5664 {
5642 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); 5665 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
5643 } 5666 }
5644 5667
5645 static int 5668 static int
5646 build_overlap_sched_groups(struct sched_domain *sd, int cpu) 5669 build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5647 { 5670 {
5648 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; 5671 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
5649 const struct cpumask *span = sched_domain_span(sd); 5672 const struct cpumask *span = sched_domain_span(sd);
5650 struct cpumask *covered = sched_domains_tmpmask; 5673 struct cpumask *covered = sched_domains_tmpmask;
5651 struct sd_data *sdd = sd->private; 5674 struct sd_data *sdd = sd->private;
5652 struct sched_domain *child; 5675 struct sched_domain *child;
5653 int i; 5676 int i;
5654 5677
5655 cpumask_clear(covered); 5678 cpumask_clear(covered);
5656 5679
5657 for_each_cpu(i, span) { 5680 for_each_cpu(i, span) {
5658 struct cpumask *sg_span; 5681 struct cpumask *sg_span;
5659 5682
5660 if (cpumask_test_cpu(i, covered)) 5683 if (cpumask_test_cpu(i, covered))
5661 continue; 5684 continue;
5662 5685
5663 child = *per_cpu_ptr(sdd->sd, i); 5686 child = *per_cpu_ptr(sdd->sd, i);
5664 5687
5665 /* See the comment near build_group_mask(). */ 5688 /* See the comment near build_group_mask(). */
5666 if (!cpumask_test_cpu(i, sched_domain_span(child))) 5689 if (!cpumask_test_cpu(i, sched_domain_span(child)))
5667 continue; 5690 continue;
5668 5691
5669 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 5692 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
5670 GFP_KERNEL, cpu_to_node(cpu)); 5693 GFP_KERNEL, cpu_to_node(cpu));
5671 5694
5672 if (!sg) 5695 if (!sg)
5673 goto fail; 5696 goto fail;
5674 5697
5675 sg_span = sched_group_cpus(sg); 5698 sg_span = sched_group_cpus(sg);
5676 if (child->child) { 5699 if (child->child) {
5677 child = child->child; 5700 child = child->child;
5678 cpumask_copy(sg_span, sched_domain_span(child)); 5701 cpumask_copy(sg_span, sched_domain_span(child));
5679 } else 5702 } else
5680 cpumask_set_cpu(i, sg_span); 5703 cpumask_set_cpu(i, sg_span);
5681 5704
5682 cpumask_or(covered, covered, sg_span); 5705 cpumask_or(covered, covered, sg_span);
5683 5706
5684 sg->sgp = *per_cpu_ptr(sdd->sgp, i); 5707 sg->sgp = *per_cpu_ptr(sdd->sgp, i);
5685 if (atomic_inc_return(&sg->sgp->ref) == 1) 5708 if (atomic_inc_return(&sg->sgp->ref) == 1)
5686 build_group_mask(sd, sg); 5709 build_group_mask(sd, sg);
5687 5710
5688 /* 5711 /*
5689 * Initialize sgp->power such that even if we mess up the 5712 * Initialize sgp->power such that even if we mess up the
5690 * domains and no possible iteration will get us here, we won't 5713 * domains and no possible iteration will get us here, we won't
5691 * die on a /0 trap. 5714 * die on a /0 trap.
5692 */ 5715 */
5693 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); 5716 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
5694 sg->sgp->power_orig = sg->sgp->power; 5717 sg->sgp->power_orig = sg->sgp->power;
5695 5718
5696 /* 5719 /*
5697 * Make sure the first group of this domain contains the 5720 * Make sure the first group of this domain contains the
5698 * canonical balance cpu. Otherwise the sched_domain iteration 5721 * canonical balance cpu. Otherwise the sched_domain iteration
5699 * breaks. See update_sg_lb_stats(). 5722 * breaks. See update_sg_lb_stats().
5700 */ 5723 */
5701 if ((!groups && cpumask_test_cpu(cpu, sg_span)) || 5724 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
5702 group_balance_cpu(sg) == cpu) 5725 group_balance_cpu(sg) == cpu)
5703 groups = sg; 5726 groups = sg;
5704 5727
5705 if (!first) 5728 if (!first)
5706 first = sg; 5729 first = sg;
5707 if (last) 5730 if (last)
5708 last->next = sg; 5731 last->next = sg;
5709 last = sg; 5732 last = sg;
5710 last->next = first; 5733 last->next = first;
5711 } 5734 }
5712 sd->groups = groups; 5735 sd->groups = groups;
5713 5736
5714 return 0; 5737 return 0;
5715 5738
5716 fail: 5739 fail:
5717 free_sched_groups(first, 0); 5740 free_sched_groups(first, 0);
5718 5741
5719 return -ENOMEM; 5742 return -ENOMEM;
5720 } 5743 }
5721 5744
5722 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) 5745 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
5723 { 5746 {
5724 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); 5747 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
5725 struct sched_domain *child = sd->child; 5748 struct sched_domain *child = sd->child;
5726 5749
5727 if (child) 5750 if (child)
5728 cpu = cpumask_first(sched_domain_span(child)); 5751 cpu = cpumask_first(sched_domain_span(child));
5729 5752
5730 if (sg) { 5753 if (sg) {
5731 *sg = *per_cpu_ptr(sdd->sg, cpu); 5754 *sg = *per_cpu_ptr(sdd->sg, cpu);
5732 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); 5755 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
5733 atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ 5756 atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
5734 } 5757 }
5735 5758
5736 return cpu; 5759 return cpu;
5737 } 5760 }
5738 5761
5739 /* 5762 /*
5740 * build_sched_groups will build a circular linked list of the groups 5763 * build_sched_groups will build a circular linked list of the groups
5741 * covered by the given span, and will set each group's ->cpumask correctly, 5764 * covered by the given span, and will set each group's ->cpumask correctly,
5742 * and ->cpu_power to 0. 5765 * and ->cpu_power to 0.
5743 * 5766 *
5744 * Assumes the sched_domain tree is fully constructed 5767 * Assumes the sched_domain tree is fully constructed
5745 */ 5768 */
5746 static int 5769 static int
5747 build_sched_groups(struct sched_domain *sd, int cpu) 5770 build_sched_groups(struct sched_domain *sd, int cpu)
5748 { 5771 {
5749 struct sched_group *first = NULL, *last = NULL; 5772 struct sched_group *first = NULL, *last = NULL;
5750 struct sd_data *sdd = sd->private; 5773 struct sd_data *sdd = sd->private;
5751 const struct cpumask *span = sched_domain_span(sd); 5774 const struct cpumask *span = sched_domain_span(sd);
5752 struct cpumask *covered; 5775 struct cpumask *covered;
5753 int i; 5776 int i;
5754 5777
5755 get_group(cpu, sdd, &sd->groups); 5778 get_group(cpu, sdd, &sd->groups);
5756 atomic_inc(&sd->groups->ref); 5779 atomic_inc(&sd->groups->ref);
5757 5780
5758 if (cpu != cpumask_first(span)) 5781 if (cpu != cpumask_first(span))
5759 return 0; 5782 return 0;
5760 5783
5761 lockdep_assert_held(&sched_domains_mutex); 5784 lockdep_assert_held(&sched_domains_mutex);
5762 covered = sched_domains_tmpmask; 5785 covered = sched_domains_tmpmask;
5763 5786
5764 cpumask_clear(covered); 5787 cpumask_clear(covered);
5765 5788
5766 for_each_cpu(i, span) { 5789 for_each_cpu(i, span) {
5767 struct sched_group *sg; 5790 struct sched_group *sg;
5768 int group, j; 5791 int group, j;
5769 5792
5770 if (cpumask_test_cpu(i, covered)) 5793 if (cpumask_test_cpu(i, covered))
5771 continue; 5794 continue;
5772 5795
5773 group = get_group(i, sdd, &sg); 5796 group = get_group(i, sdd, &sg);
5774 cpumask_clear(sched_group_cpus(sg)); 5797 cpumask_clear(sched_group_cpus(sg));
5775 sg->sgp->power = 0; 5798 sg->sgp->power = 0;
5776 cpumask_setall(sched_group_mask(sg)); 5799 cpumask_setall(sched_group_mask(sg));
5777 5800
5778 for_each_cpu(j, span) { 5801 for_each_cpu(j, span) {
5779 if (get_group(j, sdd, NULL) != group) 5802 if (get_group(j, sdd, NULL) != group)
5780 continue; 5803 continue;
5781 5804
5782 cpumask_set_cpu(j, covered); 5805 cpumask_set_cpu(j, covered);
5783 cpumask_set_cpu(j, sched_group_cpus(sg)); 5806 cpumask_set_cpu(j, sched_group_cpus(sg));
5784 } 5807 }
5785 5808
5786 if (!first) 5809 if (!first)
5787 first = sg; 5810 first = sg;
5788 if (last) 5811 if (last)
5789 last->next = sg; 5812 last->next = sg;
5790 last = sg; 5813 last = sg;
5791 } 5814 }
5792 last->next = first; 5815 last->next = first;
5793 5816
5794 return 0; 5817 return 0;
5795 } 5818 }
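/*
 * [Editor's note -- worked example, not part of the kernel source shown
 * in this diff]
 * For an MC-level domain spanning a quad-core (non-SMT) package this
 * produces four single-cpu groups linked into a ring,
 *
 *	{0} -> {1} -> {2} -> {3} -> {0}
 *
 * with each group's power left at 0 until init_sched_groups_power()
 * below fills it in; with SMT, each group would instead cover one
 * core's pair of hardware threads.
 */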
5796 5819
5797 /* 5820 /*
5798 * Initialize sched groups cpu_power. 5821 * Initialize sched groups cpu_power.
5799 * 5822 *
5800 * cpu_power indicates the capacity of a sched group, which is used while 5823 * cpu_power indicates the capacity of a sched group, which is used while
5801 * distributing the load between different sched groups in a sched domain. 5824 * distributing the load between different sched groups in a sched domain.
5802 * Typically cpu_power for all the groups in a sched domain will be the same unless 5825 * Typically cpu_power for all the groups in a sched domain will be the same unless
5803 * there are asymmetries in the topology. If there are asymmetries, the group 5826 * there are asymmetries in the topology. If there are asymmetries, the group
5804 * having more cpu_power will pick up more load compared to the group having 5827 * having more cpu_power will pick up more load compared to the group having
5805 * less cpu_power. 5828 * less cpu_power.
5806 */ 5829 */
5807 static void init_sched_groups_power(int cpu, struct sched_domain *sd) 5830 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5808 { 5831 {
5809 struct sched_group *sg = sd->groups; 5832 struct sched_group *sg = sd->groups;
5810 5833
5811 WARN_ON(!sg); 5834 WARN_ON(!sg);
5812 5835
5813 do { 5836 do {
5814 sg->group_weight = cpumask_weight(sched_group_cpus(sg)); 5837 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
5815 sg = sg->next; 5838 sg = sg->next;
5816 } while (sg != sd->groups); 5839 } while (sg != sd->groups);
5817 5840
5818 if (cpu != group_balance_cpu(sg)) 5841 if (cpu != group_balance_cpu(sg))
5819 return; 5842 return;
5820 5843
5821 update_group_power(sd, cpu); 5844 update_group_power(sd, cpu);
5822 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); 5845 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
5823 } 5846 }
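/*
 * [Editor's note -- worked example, not part of the kernel source shown
 * in this diff]
 * With the default SCHED_POWER_SCALE of 1024 ("one full cpu"), a
 * symmetric four-cpu group starts out at 4 * 1024 = 4096, which is the
 * same bootstrap value build_overlap_sched_groups() uses above.
 * update_group_power() then refines the number, e.g. scaling the two
 * hardware threads of an SMT core down so that together they count as
 * only slightly more than one full cpu.
 */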
5824 5847
5825 int __weak arch_sd_sibling_asym_packing(void) 5848 int __weak arch_sd_sibling_asym_packing(void)
5826 { 5849 {
5827 return 0*SD_ASYM_PACKING; 5850 return 0*SD_ASYM_PACKING;
5828 } 5851 }
5829 5852
5830 /* 5853 /*
5831 * Initializers for schedule domains 5854 * Initializers for schedule domains
5832 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 5855 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
5833 */ 5856 */
5834 5857
5835 #ifdef CONFIG_SCHED_DEBUG 5858 #ifdef CONFIG_SCHED_DEBUG
5836 # define SD_INIT_NAME(sd, type) sd->name = #type 5859 # define SD_INIT_NAME(sd, type) sd->name = #type
5837 #else 5860 #else
5838 # define SD_INIT_NAME(sd, type) do { } while (0) 5861 # define SD_INIT_NAME(sd, type) do { } while (0)
5839 #endif 5862 #endif
5840 5863
5841 #define SD_INIT_FUNC(type) \ 5864 #define SD_INIT_FUNC(type) \
5842 static noinline struct sched_domain * \ 5865 static noinline struct sched_domain * \
5843 sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ 5866 sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
5844 { \ 5867 { \
5845 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ 5868 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
5846 *sd = SD_##type##_INIT; \ 5869 *sd = SD_##type##_INIT; \
5847 SD_INIT_NAME(sd, type); \ 5870 SD_INIT_NAME(sd, type); \
5848 sd->private = &tl->data; \ 5871 sd->private = &tl->data; \
5849 return sd; \ 5872 return sd; \
5850 } 5873 }
5851 5874
5852 SD_INIT_FUNC(CPU) 5875 SD_INIT_FUNC(CPU)
5853 #ifdef CONFIG_SCHED_SMT 5876 #ifdef CONFIG_SCHED_SMT
5854 SD_INIT_FUNC(SIBLING) 5877 SD_INIT_FUNC(SIBLING)
5855 #endif 5878 #endif
5856 #ifdef CONFIG_SCHED_MC 5879 #ifdef CONFIG_SCHED_MC
5857 SD_INIT_FUNC(MC) 5880 SD_INIT_FUNC(MC)
5858 #endif 5881 #endif
5859 #ifdef CONFIG_SCHED_BOOK 5882 #ifdef CONFIG_SCHED_BOOK
5860 SD_INIT_FUNC(BOOK) 5883 SD_INIT_FUNC(BOOK)
5861 #endif 5884 #endif
5862 5885
5863 static int default_relax_domain_level = -1; 5886 static int default_relax_domain_level = -1;
5864 int sched_domain_level_max; 5887 int sched_domain_level_max;
5865 5888
5866 static int __init setup_relax_domain_level(char *str) 5889 static int __init setup_relax_domain_level(char *str)
5867 { 5890 {
5868 if (kstrtoint(str, 0, &default_relax_domain_level)) 5891 if (kstrtoint(str, 0, &default_relax_domain_level))
5869 pr_warn("Unable to set relax_domain_level\n"); 5892 pr_warn("Unable to set relax_domain_level\n");
5870 5893
5871 return 1; 5894 return 1;
5872 } 5895 }
5873 __setup("relax_domain_level=", setup_relax_domain_level); 5896 __setup("relax_domain_level=", setup_relax_domain_level);
5874 5897
5875 static void set_domain_attribute(struct sched_domain *sd, 5898 static void set_domain_attribute(struct sched_domain *sd,
5876 struct sched_domain_attr *attr) 5899 struct sched_domain_attr *attr)
5877 { 5900 {
5878 int request; 5901 int request;
5879 5902
5880 if (!attr || attr->relax_domain_level < 0) { 5903 if (!attr || attr->relax_domain_level < 0) {
5881 if (default_relax_domain_level < 0) 5904 if (default_relax_domain_level < 0)
5882 return; 5905 return;
5883 else 5906 else
5884 request = default_relax_domain_level; 5907 request = default_relax_domain_level;
5885 } else 5908 } else
5886 request = attr->relax_domain_level; 5909 request = attr->relax_domain_level;
5887 if (request < sd->level) { 5910 if (request < sd->level) {
5888 /* turn off idle balance on this domain */ 5911 /* turn off idle balance on this domain */
5889 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 5912 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
5890 } else { 5913 } else {
5891 /* turn on idle balance on this domain */ 5914 /* turn on idle balance on this domain */
5892 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 5915 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
5893 } 5916 }
5894 } 5917 }
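/*
 * [Editor's note -- usage example, not part of the kernel source shown
 * in this diff]
 * Booting with, say:
 *
 *	relax_domain_level=1
 *
 * makes the comparison above clear SD_BALANCE_WAKE and SD_BALANCE_NEWIDLE
 * on every domain whose level is greater than 1, so wake-up and new-idle
 * balancing is only attempted in domain levels 0 and 1 (e.g. SMT and MC);
 * cpusets can request the same per partition through the attr argument.
 */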
5895 5918
5896 static void __sdt_free(const struct cpumask *cpu_map); 5919 static void __sdt_free(const struct cpumask *cpu_map);
5897 static int __sdt_alloc(const struct cpumask *cpu_map); 5920 static int __sdt_alloc(const struct cpumask *cpu_map);
5898 5921
5899 static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 5922 static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
5900 const struct cpumask *cpu_map) 5923 const struct cpumask *cpu_map)
5901 { 5924 {
5902 switch (what) { 5925 switch (what) {
5903 case sa_rootdomain: 5926 case sa_rootdomain:
5904 if (!atomic_read(&d->rd->refcount)) 5927 if (!atomic_read(&d->rd->refcount))
5905 free_rootdomain(&d->rd->rcu); /* fall through */ 5928 free_rootdomain(&d->rd->rcu); /* fall through */
5906 case sa_sd: 5929 case sa_sd:
5907 free_percpu(d->sd); /* fall through */ 5930 free_percpu(d->sd); /* fall through */
5908 case sa_sd_storage: 5931 case sa_sd_storage:
5909 __sdt_free(cpu_map); /* fall through */ 5932 __sdt_free(cpu_map); /* fall through */
5910 case sa_none: 5933 case sa_none:
5911 break; 5934 break;
5912 } 5935 }
5913 } 5936 }
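/*
 * [Editor's note -- worked example, not part of the kernel source shown
 * in this diff]
 * enum s_alloc encodes how far __visit_domain_allocation_hell() got, and
 * the intentional fall-through above unwinds exactly that much.  If, for
 * instance, alloc_rootdomain() fails, the caller gets sa_sd back and the
 * cleanup frees the per-cpu d->sd array and the sd_data storage, but
 * never touches the root domain that was never allocated.
 */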
5914 5937
5915 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 5938 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
5916 const struct cpumask *cpu_map) 5939 const struct cpumask *cpu_map)
5917 { 5940 {
5918 memset(d, 0, sizeof(*d)); 5941 memset(d, 0, sizeof(*d));
5919 5942
5920 if (__sdt_alloc(cpu_map)) 5943 if (__sdt_alloc(cpu_map))
5921 return sa_sd_storage; 5944 return sa_sd_storage;
5922 d->sd = alloc_percpu(struct sched_domain *); 5945 d->sd = alloc_percpu(struct sched_domain *);
5923 if (!d->sd) 5946 if (!d->sd)
5924 return sa_sd_storage; 5947 return sa_sd_storage;
5925 d->rd = alloc_rootdomain(); 5948 d->rd = alloc_rootdomain();
5926 if (!d->rd) 5949 if (!d->rd)
5927 return sa_sd; 5950 return sa_sd;
5928 return sa_rootdomain; 5951 return sa_rootdomain;
5929 } 5952 }
5930 5953
5931 /* 5954 /*
5932 * NULL the sd_data elements we've used to build the sched_domain and 5955 * NULL the sd_data elements we've used to build the sched_domain and
5933 * sched_group structure so that the subsequent __free_domain_allocs() 5956 * sched_group structure so that the subsequent __free_domain_allocs()
5934 * will not free the data we're using. 5957 * will not free the data we're using.
5935 */ 5958 */
5936 static void claim_allocations(int cpu, struct sched_domain *sd) 5959 static void claim_allocations(int cpu, struct sched_domain *sd)
5937 { 5960 {
5938 struct sd_data *sdd = sd->private; 5961 struct sd_data *sdd = sd->private;
5939 5962
5940 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); 5963 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
5941 *per_cpu_ptr(sdd->sd, cpu) = NULL; 5964 *per_cpu_ptr(sdd->sd, cpu) = NULL;
5942 5965
5943 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) 5966 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
5944 *per_cpu_ptr(sdd->sg, cpu) = NULL; 5967 *per_cpu_ptr(sdd->sg, cpu) = NULL;
5945 5968
5946 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) 5969 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
5947 *per_cpu_ptr(sdd->sgp, cpu) = NULL; 5970 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
5948 } 5971 }
5949 5972
5950 #ifdef CONFIG_SCHED_SMT 5973 #ifdef CONFIG_SCHED_SMT
5951 static const struct cpumask *cpu_smt_mask(int cpu) 5974 static const struct cpumask *cpu_smt_mask(int cpu)
5952 { 5975 {
5953 return topology_thread_cpumask(cpu); 5976 return topology_thread_cpumask(cpu);
5954 } 5977 }
5955 #endif 5978 #endif
5956 5979
5957 /* 5980 /*
5958 * Topology list, bottom-up. 5981 * Topology list, bottom-up.
5959 */ 5982 */
5960 static struct sched_domain_topology_level default_topology[] = { 5983 static struct sched_domain_topology_level default_topology[] = {
5961 #ifdef CONFIG_SCHED_SMT 5984 #ifdef CONFIG_SCHED_SMT
5962 { sd_init_SIBLING, cpu_smt_mask, }, 5985 { sd_init_SIBLING, cpu_smt_mask, },
5963 #endif 5986 #endif
5964 #ifdef CONFIG_SCHED_MC 5987 #ifdef CONFIG_SCHED_MC
5965 { sd_init_MC, cpu_coregroup_mask, }, 5988 { sd_init_MC, cpu_coregroup_mask, },
5966 #endif 5989 #endif
5967 #ifdef CONFIG_SCHED_BOOK 5990 #ifdef CONFIG_SCHED_BOOK
5968 { sd_init_BOOK, cpu_book_mask, }, 5991 { sd_init_BOOK, cpu_book_mask, },
5969 #endif 5992 #endif
5970 { sd_init_CPU, cpu_cpu_mask, }, 5993 { sd_init_CPU, cpu_cpu_mask, },
5971 { NULL, }, 5994 { NULL, },
5972 }; 5995 };
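/*
 * [Editor's note -- illustrative sketch, not part of the kernel source
 * shown in this diff]
 * On NUMA hardware, sched_init_numa() further down swaps this table for
 * an extended copy, so conceptually the list becomes something like:
 *
 *	{ sd_init_SIBLING, cpu_smt_mask },
 *	{ sd_init_MC,      cpu_coregroup_mask },
 *	{ sd_init_CPU,     cpu_cpu_mask },
 *	{ sd_numa_init,    sd_numa_mask, SDTL_OVERLAP, .numa_level = 0 },
 *	{ sd_numa_init,    sd_numa_mask, SDTL_OVERLAP, .numa_level = 1 },
 *	{ NULL, },
 *
 * with one sd_numa_init entry per unique inter-node distance.
 */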
5973 5996
5974 static struct sched_domain_topology_level *sched_domain_topology = default_topology; 5997 static struct sched_domain_topology_level *sched_domain_topology = default_topology;
5975 5998
5976 #define for_each_sd_topology(tl) \ 5999 #define for_each_sd_topology(tl) \
5977 for (tl = sched_domain_topology; tl->init; tl++) 6000 for (tl = sched_domain_topology; tl->init; tl++)
5978 6001
5979 #ifdef CONFIG_NUMA 6002 #ifdef CONFIG_NUMA
5980 6003
5981 static int sched_domains_numa_levels; 6004 static int sched_domains_numa_levels;
5982 static int *sched_domains_numa_distance; 6005 static int *sched_domains_numa_distance;
5983 static struct cpumask ***sched_domains_numa_masks; 6006 static struct cpumask ***sched_domains_numa_masks;
5984 static int sched_domains_curr_level; 6007 static int sched_domains_curr_level;
5985 6008
5986 static inline int sd_local_flags(int level) 6009 static inline int sd_local_flags(int level)
5987 { 6010 {
5988 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) 6011 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
5989 return 0; 6012 return 0;
5990 6013
5991 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; 6014 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
5992 } 6015 }
5993 6016
5994 static struct sched_domain * 6017 static struct sched_domain *
5995 sd_numa_init(struct sched_domain_topology_level *tl, int cpu) 6018 sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
5996 { 6019 {
5997 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); 6020 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
5998 int level = tl->numa_level; 6021 int level = tl->numa_level;
5999 int sd_weight = cpumask_weight( 6022 int sd_weight = cpumask_weight(
6000 sched_domains_numa_masks[level][cpu_to_node(cpu)]); 6023 sched_domains_numa_masks[level][cpu_to_node(cpu)]);
6001 6024
6002 *sd = (struct sched_domain){ 6025 *sd = (struct sched_domain){
6003 .min_interval = sd_weight, 6026 .min_interval = sd_weight,
6004 .max_interval = 2*sd_weight, 6027 .max_interval = 2*sd_weight,
6005 .busy_factor = 32, 6028 .busy_factor = 32,
6006 .imbalance_pct = 125, 6029 .imbalance_pct = 125,
6007 .cache_nice_tries = 2, 6030 .cache_nice_tries = 2,
6008 .busy_idx = 3, 6031 .busy_idx = 3,
6009 .idle_idx = 2, 6032 .idle_idx = 2,
6010 .newidle_idx = 0, 6033 .newidle_idx = 0,
6011 .wake_idx = 0, 6034 .wake_idx = 0,
6012 .forkexec_idx = 0, 6035 .forkexec_idx = 0,
6013 6036
6014 .flags = 1*SD_LOAD_BALANCE 6037 .flags = 1*SD_LOAD_BALANCE
6015 | 1*SD_BALANCE_NEWIDLE 6038 | 1*SD_BALANCE_NEWIDLE
6016 | 0*SD_BALANCE_EXEC 6039 | 0*SD_BALANCE_EXEC
6017 | 0*SD_BALANCE_FORK 6040 | 0*SD_BALANCE_FORK
6018 | 0*SD_BALANCE_WAKE 6041 | 0*SD_BALANCE_WAKE
6019 | 0*SD_WAKE_AFFINE 6042 | 0*SD_WAKE_AFFINE
6020 | 0*SD_SHARE_CPUPOWER 6043 | 0*SD_SHARE_CPUPOWER
6021 | 0*SD_SHARE_PKG_RESOURCES 6044 | 0*SD_SHARE_PKG_RESOURCES
6022 | 1*SD_SERIALIZE 6045 | 1*SD_SERIALIZE
6023 | 0*SD_PREFER_SIBLING 6046 | 0*SD_PREFER_SIBLING
6024 | 1*SD_NUMA 6047 | 1*SD_NUMA
6025 | sd_local_flags(level) 6048 | sd_local_flags(level)
6026 , 6049 ,
6027 .last_balance = jiffies, 6050 .last_balance = jiffies,
6028 .balance_interval = sd_weight, 6051 .balance_interval = sd_weight,
6029 .max_newidle_lb_cost = 0, 6052 .max_newidle_lb_cost = 0,
6030 .next_decay_max_lb_cost = jiffies, 6053 .next_decay_max_lb_cost = jiffies,
6031 }; 6054 };
6032 SD_INIT_NAME(sd, NUMA); 6055 SD_INIT_NAME(sd, NUMA);
6033 sd->private = &tl->data; 6056 sd->private = &tl->data;
6034 6057
6035 /* 6058 /*
6036 * Ugly hack to pass state to sd_numa_mask()... 6059 * Ugly hack to pass state to sd_numa_mask()...
6037 */ 6060 */
6038 sched_domains_curr_level = tl->numa_level; 6061 sched_domains_curr_level = tl->numa_level;
6039 6062
6040 return sd; 6063 return sd;
6041 } 6064 }
6042 6065
6043 static const struct cpumask *sd_numa_mask(int cpu) 6066 static const struct cpumask *sd_numa_mask(int cpu)
6044 { 6067 {
6045 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; 6068 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6046 } 6069 }
6047 6070
6048 static void sched_numa_warn(const char *str) 6071 static void sched_numa_warn(const char *str)
6049 { 6072 {
6050 static int done = false; 6073 static int done = false;
6051 int i,j; 6074 int i,j;
6052 6075
6053 if (done) 6076 if (done)
6054 return; 6077 return;
6055 6078
6056 done = true; 6079 done = true;
6057 6080
6058 printk(KERN_WARNING "ERROR: %s\n\n", str); 6081 printk(KERN_WARNING "ERROR: %s\n\n", str);
6059 6082
6060 for (i = 0; i < nr_node_ids; i++) { 6083 for (i = 0; i < nr_node_ids; i++) {
6061 printk(KERN_WARNING " "); 6084 printk(KERN_WARNING " ");
6062 for (j = 0; j < nr_node_ids; j++) 6085 for (j = 0; j < nr_node_ids; j++)
6063 printk(KERN_CONT "%02d ", node_distance(i,j)); 6086 printk(KERN_CONT "%02d ", node_distance(i,j));
6064 printk(KERN_CONT "\n"); 6087 printk(KERN_CONT "\n");
6065 } 6088 }
6066 printk(KERN_WARNING "\n"); 6089 printk(KERN_WARNING "\n");
6067 } 6090 }
6068 6091
6069 static bool find_numa_distance(int distance) 6092 static bool find_numa_distance(int distance)
6070 { 6093 {
6071 int i; 6094 int i;
6072 6095
6073 if (distance == node_distance(0, 0)) 6096 if (distance == node_distance(0, 0))
6074 return true; 6097 return true;
6075 6098
6076 for (i = 0; i < sched_domains_numa_levels; i++) { 6099 for (i = 0; i < sched_domains_numa_levels; i++) {
6077 if (sched_domains_numa_distance[i] == distance) 6100 if (sched_domains_numa_distance[i] == distance)
6078 return true; 6101 return true;
6079 } 6102 }
6080 6103
6081 return false; 6104 return false;
6082 } 6105 }
6083 6106
6084 static void sched_init_numa(void) 6107 static void sched_init_numa(void)
6085 { 6108 {
6086 int next_distance, curr_distance = node_distance(0, 0); 6109 int next_distance, curr_distance = node_distance(0, 0);
6087 struct sched_domain_topology_level *tl; 6110 struct sched_domain_topology_level *tl;
6088 int level = 0; 6111 int level = 0;
6089 int i, j, k; 6112 int i, j, k;
6090 6113
6091 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); 6114 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6092 if (!sched_domains_numa_distance) 6115 if (!sched_domains_numa_distance)
6093 return; 6116 return;
6094 6117
6095 /* 6118 /*
6096 * O(nr_nodes^2) deduplicating selection sort -- in order to find the 6119 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
6097 * unique distances in the node_distance() table. 6120 * unique distances in the node_distance() table.
6098 * 6121 *
6099 * Assumes node_distance(0,j) includes all distances in 6122 * Assumes node_distance(0,j) includes all distances in
6100 * node_distance(i,j) in order to avoid cubic time. 6123 * node_distance(i,j) in order to avoid cubic time.
6101 */ 6124 */
6102 next_distance = curr_distance; 6125 next_distance = curr_distance;
6103 for (i = 0; i < nr_node_ids; i++) { 6126 for (i = 0; i < nr_node_ids; i++) {
6104 for (j = 0; j < nr_node_ids; j++) { 6127 for (j = 0; j < nr_node_ids; j++) {
6105 for (k = 0; k < nr_node_ids; k++) { 6128 for (k = 0; k < nr_node_ids; k++) {
6106 int distance = node_distance(i, k); 6129 int distance = node_distance(i, k);
6107 6130
6108 if (distance > curr_distance && 6131 if (distance > curr_distance &&
6109 (distance < next_distance || 6132 (distance < next_distance ||
6110 next_distance == curr_distance)) 6133 next_distance == curr_distance))
6111 next_distance = distance; 6134 next_distance = distance;
6112 6135
6113 /* 6136 /*
6114 * While not a strong assumption, it would be nice to know 6137 * While not a strong assumption, it would be nice to know
6115 * about cases where node A is connected to B but B is not 6138 * about cases where node A is connected to B but B is not
6116 * equally connected to A. 6139 * equally connected to A.
6117 */ 6140 */
6118 if (sched_debug() && node_distance(k, i) != distance) 6141 if (sched_debug() && node_distance(k, i) != distance)
6119 sched_numa_warn("Node-distance not symmetric"); 6142 sched_numa_warn("Node-distance not symmetric");
6120 6143
6121 if (sched_debug() && i && !find_numa_distance(distance)) 6144 if (sched_debug() && i && !find_numa_distance(distance))
6122 sched_numa_warn("Node-0 not representative"); 6145 sched_numa_warn("Node-0 not representative");
6123 } 6146 }
6124 if (next_distance != curr_distance) { 6147 if (next_distance != curr_distance) {
6125 sched_domains_numa_distance[level++] = next_distance; 6148 sched_domains_numa_distance[level++] = next_distance;
6126 sched_domains_numa_levels = level; 6149 sched_domains_numa_levels = level;
6127 curr_distance = next_distance; 6150 curr_distance = next_distance;
6128 } else break; 6151 } else break;
6129 } 6152 }
6130 6153
6131 /* 6154 /*
6132 * In case of sched_debug() we verify the above assumption. 6155 * In case of sched_debug() we verify the above assumption.
6133 */ 6156 */
6134 if (!sched_debug()) 6157 if (!sched_debug())
6135 break; 6158 break;
6136 } 6159 }
6137 /* 6160 /*
6138 * 'level' contains the number of unique distances, excluding the 6161 * 'level' contains the number of unique distances, excluding the
6139 * identity distance node_distance(i,i). 6162 * identity distance node_distance(i,i).
6140 * 6163 *
6141 * The sched_domains_numa_distance[] array includes the actual distance 6164 * The sched_domains_numa_distance[] array includes the actual distance
6142 * numbers. 6165 * numbers.
6143 */ 6166 */
6144 6167
6145 /* 6168 /*
6146 * Here, we should temporarily reset sched_domains_numa_levels to 0. 6169 * Here, we should temporarily reset sched_domains_numa_levels to 0.
6147 * If it fails to allocate memory for array sched_domains_numa_masks[][], 6170 * If it fails to allocate memory for array sched_domains_numa_masks[][],
6148 * the array will contain less than 'level' members. This could be 6171 * the array will contain less than 'level' members. This could be
6149 * dangerous when we use it to iterate array sched_domains_numa_masks[][] 6172 * dangerous when we use it to iterate array sched_domains_numa_masks[][]
6150 * in other functions. 6173 * in other functions.
6151 * 6174 *
6152 * We reset it to 'level' at the end of this function. 6175 * We reset it to 'level' at the end of this function.
6153 */ 6176 */
6154 sched_domains_numa_levels = 0; 6177 sched_domains_numa_levels = 0;
6155 6178
6156 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); 6179 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
6157 if (!sched_domains_numa_masks) 6180 if (!sched_domains_numa_masks)
6158 return; 6181 return;
6159 6182
6160 /* 6183 /*
6161 * Now for each level, construct a mask per node which contains all 6184 * Now for each level, construct a mask per node which contains all
6162 * cpus of nodes that are that many hops away from us. 6185 * cpus of nodes that are that many hops away from us.
6163 */ 6186 */
6164 for (i = 0; i < level; i++) { 6187 for (i = 0; i < level; i++) {
6165 sched_domains_numa_masks[i] = 6188 sched_domains_numa_masks[i] =
6166 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); 6189 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
6167 if (!sched_domains_numa_masks[i]) 6190 if (!sched_domains_numa_masks[i])
6168 return; 6191 return;
6169 6192
6170 for (j = 0; j < nr_node_ids; j++) { 6193 for (j = 0; j < nr_node_ids; j++) {
6171 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); 6194 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
6172 if (!mask) 6195 if (!mask)
6173 return; 6196 return;
6174 6197
6175 sched_domains_numa_masks[i][j] = mask; 6198 sched_domains_numa_masks[i][j] = mask;
6176 6199
6177 for (k = 0; k < nr_node_ids; k++) { 6200 for (k = 0; k < nr_node_ids; k++) {
6178 if (node_distance(j, k) > sched_domains_numa_distance[i]) 6201 if (node_distance(j, k) > sched_domains_numa_distance[i])
6179 continue; 6202 continue;
6180 6203
6181 cpumask_or(mask, mask, cpumask_of_node(k)); 6204 cpumask_or(mask, mask, cpumask_of_node(k));
6182 } 6205 }
6183 } 6206 }
6184 } 6207 }
6185 6208
6186 tl = kzalloc((ARRAY_SIZE(default_topology) + level) * 6209 tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
6187 sizeof(struct sched_domain_topology_level), GFP_KERNEL); 6210 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6188 if (!tl) 6211 if (!tl)
6189 return; 6212 return;
6190 6213
6191 /* 6214 /*
6192 * Copy the default topology bits.. 6215 * Copy the default topology bits..
6193 */ 6216 */
6194 for (i = 0; default_topology[i].init; i++) 6217 for (i = 0; default_topology[i].init; i++)
6195 tl[i] = default_topology[i]; 6218 tl[i] = default_topology[i];
6196 6219
6197 /* 6220 /*
6198 * .. and append 'j' levels of NUMA goodness. 6221 * .. and append 'j' levels of NUMA goodness.
6199 */ 6222 */
6200 for (j = 0; j < level; i++, j++) { 6223 for (j = 0; j < level; i++, j++) {
6201 tl[i] = (struct sched_domain_topology_level){ 6224 tl[i] = (struct sched_domain_topology_level){
6202 .init = sd_numa_init, 6225 .init = sd_numa_init,
6203 .mask = sd_numa_mask, 6226 .mask = sd_numa_mask,
6204 .flags = SDTL_OVERLAP, 6227 .flags = SDTL_OVERLAP,
6205 .numa_level = j, 6228 .numa_level = j,
6206 }; 6229 };
6207 } 6230 }
6208 6231
6209 sched_domain_topology = tl; 6232 sched_domain_topology = tl;
6210 6233
6211 sched_domains_numa_levels = level; 6234 sched_domains_numa_levels = level;
6212 } 6235 }
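/*
 * [Editor's note -- worked example, not part of the kernel source shown
 * in this diff]
 * For a hypothetical four-node box whose node_distance() table only
 * contains the values 10 (local), 20 and 30:
 *
 *	10 20 20 30
 *	20 10 20 30
 *	20 20 10 30
 *	30 30 30 10
 *
 * the deduplicating pass ends with level = 2 and
 * sched_domains_numa_distance[] = { 20, 30 }, so two NUMA topology
 * levels are appended: at level 0 each node's mask covers the cpus of
 * all nodes within distance 20 of it, and at level 1 the mask covers
 * the whole machine.
 */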
6213 6236
6214 static void sched_domains_numa_masks_set(int cpu) 6237 static void sched_domains_numa_masks_set(int cpu)
6215 { 6238 {
6216 int i, j; 6239 int i, j;
6217 int node = cpu_to_node(cpu); 6240 int node = cpu_to_node(cpu);
6218 6241
6219 for (i = 0; i < sched_domains_numa_levels; i++) { 6242 for (i = 0; i < sched_domains_numa_levels; i++) {
6220 for (j = 0; j < nr_node_ids; j++) { 6243 for (j = 0; j < nr_node_ids; j++) {
6221 if (node_distance(j, node) <= sched_domains_numa_distance[i]) 6244 if (node_distance(j, node) <= sched_domains_numa_distance[i])
6222 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); 6245 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
6223 } 6246 }
6224 } 6247 }
6225 } 6248 }
6226 6249
6227 static void sched_domains_numa_masks_clear(int cpu) 6250 static void sched_domains_numa_masks_clear(int cpu)
6228 { 6251 {
6229 int i, j; 6252 int i, j;
6230 for (i = 0; i < sched_domains_numa_levels; i++) { 6253 for (i = 0; i < sched_domains_numa_levels; i++) {
6231 for (j = 0; j < nr_node_ids; j++) 6254 for (j = 0; j < nr_node_ids; j++)
6232 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); 6255 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
6233 } 6256 }
6234 } 6257 }
6235 6258
6236 /* 6259 /*
6237 * Update sched_domains_numa_masks[level][node] array when new cpus 6260 * Update sched_domains_numa_masks[level][node] array when new cpus
6238 * are onlined. 6261 * are onlined.
6239 */ 6262 */
6240 static int sched_domains_numa_masks_update(struct notifier_block *nfb, 6263 static int sched_domains_numa_masks_update(struct notifier_block *nfb,
6241 unsigned long action, 6264 unsigned long action,
6242 void *hcpu) 6265 void *hcpu)
6243 { 6266 {
6244 int cpu = (long)hcpu; 6267 int cpu = (long)hcpu;
6245 6268
6246 switch (action & ~CPU_TASKS_FROZEN) { 6269 switch (action & ~CPU_TASKS_FROZEN) {
6247 case CPU_ONLINE: 6270 case CPU_ONLINE:
6248 sched_domains_numa_masks_set(cpu); 6271 sched_domains_numa_masks_set(cpu);
6249 break; 6272 break;
6250 6273
6251 case CPU_DEAD: 6274 case CPU_DEAD:
6252 sched_domains_numa_masks_clear(cpu); 6275 sched_domains_numa_masks_clear(cpu);
6253 break; 6276 break;
6254 6277
6255 default: 6278 default:
6256 return NOTIFY_DONE; 6279 return NOTIFY_DONE;
6257 } 6280 }
6258 6281
6259 return NOTIFY_OK; 6282 return NOTIFY_OK;
6260 } 6283 }
6261 #else 6284 #else
6262 static inline void sched_init_numa(void) 6285 static inline void sched_init_numa(void)
6263 { 6286 {
6264 } 6287 }
6265 6288
6266 static int sched_domains_numa_masks_update(struct notifier_block *nfb, 6289 static int sched_domains_numa_masks_update(struct notifier_block *nfb,
6267 unsigned long action, 6290 unsigned long action,
6268 void *hcpu) 6291 void *hcpu)
6269 { 6292 {
6270 return 0; 6293 return 0;
6271 } 6294 }
6272 #endif /* CONFIG_NUMA */ 6295 #endif /* CONFIG_NUMA */
6273 6296
6274 static int __sdt_alloc(const struct cpumask *cpu_map) 6297 static int __sdt_alloc(const struct cpumask *cpu_map)
6275 { 6298 {
6276 struct sched_domain_topology_level *tl; 6299 struct sched_domain_topology_level *tl;
6277 int j; 6300 int j;
6278 6301
6279 for_each_sd_topology(tl) { 6302 for_each_sd_topology(tl) {
6280 struct sd_data *sdd = &tl->data; 6303 struct sd_data *sdd = &tl->data;
6281 6304
6282 sdd->sd = alloc_percpu(struct sched_domain *); 6305 sdd->sd = alloc_percpu(struct sched_domain *);
6283 if (!sdd->sd) 6306 if (!sdd->sd)
6284 return -ENOMEM; 6307 return -ENOMEM;
6285 6308
6286 sdd->sg = alloc_percpu(struct sched_group *); 6309 sdd->sg = alloc_percpu(struct sched_group *);
6287 if (!sdd->sg) 6310 if (!sdd->sg)
6288 return -ENOMEM; 6311 return -ENOMEM;
6289 6312
6290 sdd->sgp = alloc_percpu(struct sched_group_power *); 6313 sdd->sgp = alloc_percpu(struct sched_group_power *);
6291 if (!sdd->sgp) 6314 if (!sdd->sgp)
6292 return -ENOMEM; 6315 return -ENOMEM;
6293 6316
6294 for_each_cpu(j, cpu_map) { 6317 for_each_cpu(j, cpu_map) {
6295 struct sched_domain *sd; 6318 struct sched_domain *sd;
6296 struct sched_group *sg; 6319 struct sched_group *sg;
6297 struct sched_group_power *sgp; 6320 struct sched_group_power *sgp;
6298 6321
6299 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), 6322 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
6300 GFP_KERNEL, cpu_to_node(j)); 6323 GFP_KERNEL, cpu_to_node(j));
6301 if (!sd) 6324 if (!sd)
6302 return -ENOMEM; 6325 return -ENOMEM;
6303 6326
6304 *per_cpu_ptr(sdd->sd, j) = sd; 6327 *per_cpu_ptr(sdd->sd, j) = sd;
6305 6328
6306 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 6329 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6307 GFP_KERNEL, cpu_to_node(j)); 6330 GFP_KERNEL, cpu_to_node(j));
6308 if (!sg) 6331 if (!sg)
6309 return -ENOMEM; 6332 return -ENOMEM;
6310 6333
6311 sg->next = sg; 6334 sg->next = sg;
6312 6335
6313 *per_cpu_ptr(sdd->sg, j) = sg; 6336 *per_cpu_ptr(sdd->sg, j) = sg;
6314 6337
6315 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), 6338 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
6316 GFP_KERNEL, cpu_to_node(j)); 6339 GFP_KERNEL, cpu_to_node(j));
6317 if (!sgp) 6340 if (!sgp)
6318 return -ENOMEM; 6341 return -ENOMEM;
6319 6342
6320 *per_cpu_ptr(sdd->sgp, j) = sgp; 6343 *per_cpu_ptr(sdd->sgp, j) = sgp;
6321 } 6344 }
6322 } 6345 }
6323 6346
6324 return 0; 6347 return 0;
6325 } 6348 }
6326 6349
6327 static void __sdt_free(const struct cpumask *cpu_map) 6350 static void __sdt_free(const struct cpumask *cpu_map)
6328 { 6351 {
6329 struct sched_domain_topology_level *tl; 6352 struct sched_domain_topology_level *tl;
6330 int j; 6353 int j;
6331 6354
6332 for_each_sd_topology(tl) { 6355 for_each_sd_topology(tl) {
6333 struct sd_data *sdd = &tl->data; 6356 struct sd_data *sdd = &tl->data;
6334 6357
6335 for_each_cpu(j, cpu_map) { 6358 for_each_cpu(j, cpu_map) {
6336 struct sched_domain *sd; 6359 struct sched_domain *sd;
6337 6360
6338 if (sdd->sd) { 6361 if (sdd->sd) {
6339 sd = *per_cpu_ptr(sdd->sd, j); 6362 sd = *per_cpu_ptr(sdd->sd, j);
6340 if (sd && (sd->flags & SD_OVERLAP)) 6363 if (sd && (sd->flags & SD_OVERLAP))
6341 free_sched_groups(sd->groups, 0); 6364 free_sched_groups(sd->groups, 0);
6342 kfree(*per_cpu_ptr(sdd->sd, j)); 6365 kfree(*per_cpu_ptr(sdd->sd, j));
6343 } 6366 }
6344 6367
6345 if (sdd->sg) 6368 if (sdd->sg)
6346 kfree(*per_cpu_ptr(sdd->sg, j)); 6369 kfree(*per_cpu_ptr(sdd->sg, j));
6347 if (sdd->sgp) 6370 if (sdd->sgp)
6348 kfree(*per_cpu_ptr(sdd->sgp, j)); 6371 kfree(*per_cpu_ptr(sdd->sgp, j));
6349 } 6372 }
6350 free_percpu(sdd->sd); 6373 free_percpu(sdd->sd);
6351 sdd->sd = NULL; 6374 sdd->sd = NULL;
6352 free_percpu(sdd->sg); 6375 free_percpu(sdd->sg);
6353 sdd->sg = NULL; 6376 sdd->sg = NULL;
6354 free_percpu(sdd->sgp); 6377 free_percpu(sdd->sgp);
6355 sdd->sgp = NULL; 6378 sdd->sgp = NULL;
6356 } 6379 }
6357 } 6380 }
6358 6381
6359 struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, 6382 struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6360 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 6383 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6361 struct sched_domain *child, int cpu) 6384 struct sched_domain *child, int cpu)
6362 { 6385 {
6363 struct sched_domain *sd = tl->init(tl, cpu); 6386 struct sched_domain *sd = tl->init(tl, cpu);
6364 if (!sd) 6387 if (!sd)
6365 return child; 6388 return child;
6366 6389
6367 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); 6390 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6368 if (child) { 6391 if (child) {
6369 sd->level = child->level + 1; 6392 sd->level = child->level + 1;
6370 sched_domain_level_max = max(sched_domain_level_max, sd->level); 6393 sched_domain_level_max = max(sched_domain_level_max, sd->level);
6371 child->parent = sd; 6394 child->parent = sd;
6372 sd->child = child; 6395 sd->child = child;
6373 } 6396 }
6374 set_domain_attribute(sd, attr); 6397 set_domain_attribute(sd, attr);
6375 6398
6376 return sd; 6399 return sd;
6377 } 6400 }
6378 6401
6379 /* 6402 /*
6380 * Build sched domains for a given set of cpus and attach the sched domains 6403 * Build sched domains for a given set of cpus and attach the sched domains
6381 * to the individual cpus 6404 * to the individual cpus
6382 */ 6405 */
6383 static int build_sched_domains(const struct cpumask *cpu_map, 6406 static int build_sched_domains(const struct cpumask *cpu_map,
6384 struct sched_domain_attr *attr) 6407 struct sched_domain_attr *attr)
6385 { 6408 {
6386 enum s_alloc alloc_state; 6409 enum s_alloc alloc_state;
6387 struct sched_domain *sd; 6410 struct sched_domain *sd;
6388 struct s_data d; 6411 struct s_data d;
6389 int i, ret = -ENOMEM; 6412 int i, ret = -ENOMEM;
6390 6413
6391 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 6414 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
6392 if (alloc_state != sa_rootdomain) 6415 if (alloc_state != sa_rootdomain)
6393 goto error; 6416 goto error;
6394 6417
6395 /* Set up domains for cpus specified by the cpu_map. */ 6418 /* Set up domains for cpus specified by the cpu_map. */
6396 for_each_cpu(i, cpu_map) { 6419 for_each_cpu(i, cpu_map) {
6397 struct sched_domain_topology_level *tl; 6420 struct sched_domain_topology_level *tl;
6398 6421
6399 sd = NULL; 6422 sd = NULL;
6400 for_each_sd_topology(tl) { 6423 for_each_sd_topology(tl) {
6401 sd = build_sched_domain(tl, cpu_map, attr, sd, i); 6424 sd = build_sched_domain(tl, cpu_map, attr, sd, i);
6402 if (tl == sched_domain_topology) 6425 if (tl == sched_domain_topology)
6403 *per_cpu_ptr(d.sd, i) = sd; 6426 *per_cpu_ptr(d.sd, i) = sd;
6404 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) 6427 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
6405 sd->flags |= SD_OVERLAP; 6428 sd->flags |= SD_OVERLAP;
6406 if (cpumask_equal(cpu_map, sched_domain_span(sd))) 6429 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
6407 break; 6430 break;
6408 } 6431 }
6409 } 6432 }
6410 6433
6411 /* Build the groups for the domains */ 6434 /* Build the groups for the domains */
6412 for_each_cpu(i, cpu_map) { 6435 for_each_cpu(i, cpu_map) {
6413 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 6436 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6414 sd->span_weight = cpumask_weight(sched_domain_span(sd)); 6437 sd->span_weight = cpumask_weight(sched_domain_span(sd));
6415 if (sd->flags & SD_OVERLAP) { 6438 if (sd->flags & SD_OVERLAP) {
6416 if (build_overlap_sched_groups(sd, i)) 6439 if (build_overlap_sched_groups(sd, i))
6417 goto error; 6440 goto error;
6418 } else { 6441 } else {
6419 if (build_sched_groups(sd, i)) 6442 if (build_sched_groups(sd, i))
6420 goto error; 6443 goto error;
6421 } 6444 }
6422 } 6445 }
6423 } 6446 }
6424 6447
6425 /* Calculate CPU power for physical packages and nodes */ 6448 /* Calculate CPU power for physical packages and nodes */
6426 for (i = nr_cpumask_bits-1; i >= 0; i--) { 6449 for (i = nr_cpumask_bits-1; i >= 0; i--) {
6427 if (!cpumask_test_cpu(i, cpu_map)) 6450 if (!cpumask_test_cpu(i, cpu_map))
6428 continue; 6451 continue;
6429 6452
6430 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 6453 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6431 claim_allocations(i, sd); 6454 claim_allocations(i, sd);
6432 init_sched_groups_power(i, sd); 6455 init_sched_groups_power(i, sd);
6433 } 6456 }
6434 } 6457 }
6435 6458
6436 /* Attach the domains */ 6459 /* Attach the domains */
6437 rcu_read_lock(); 6460 rcu_read_lock();
6438 for_each_cpu(i, cpu_map) { 6461 for_each_cpu(i, cpu_map) {
6439 sd = *per_cpu_ptr(d.sd, i); 6462 sd = *per_cpu_ptr(d.sd, i);
6440 cpu_attach_domain(sd, d.rd, i); 6463 cpu_attach_domain(sd, d.rd, i);
6441 } 6464 }
6442 rcu_read_unlock(); 6465 rcu_read_unlock();
6443 6466
6444 ret = 0; 6467 ret = 0;
6445 error: 6468 error:
6446 __free_domain_allocs(&d, alloc_state, cpu_map); 6469 __free_domain_allocs(&d, alloc_state, cpu_map);
6447 return ret; 6470 return ret;
6448 } 6471 }
6449 6472
6450 static cpumask_var_t *doms_cur; /* current sched domains */ 6473 static cpumask_var_t *doms_cur; /* current sched domains */
6451 static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 6474 static int ndoms_cur; /* number of sched domains in 'doms_cur' */
6452 static struct sched_domain_attr *dattr_cur; 6475 static struct sched_domain_attr *dattr_cur;
6453 /* attributes of custom domains in 'doms_cur' */ 6476 /* attributes of custom domains in 'doms_cur' */
6454 6477
6455 /* 6478 /*
6456 * Special case: If a kmalloc of a doms_cur partition (array of 6479 * Special case: If a kmalloc of a doms_cur partition (array of
6457 * cpumask) fails, then fallback to a single sched domain, 6480 * cpumask) fails, then fallback to a single sched domain,
6458 * as determined by the single cpumask fallback_doms. 6481 * as determined by the single cpumask fallback_doms.
6459 */ 6482 */
6460 static cpumask_var_t fallback_doms; 6483 static cpumask_var_t fallback_doms;
6461 6484
6462 /* 6485 /*
6463 * arch_update_cpu_topology lets virtualized architectures update the 6486 * arch_update_cpu_topology lets virtualized architectures update the
6464 * cpu core maps. It is supposed to return 1 if the topology changed 6487 * cpu core maps. It is supposed to return 1 if the topology changed
6465 * or 0 if it stayed the same. 6488 * or 0 if it stayed the same.
6466 */ 6489 */
6467 int __weak arch_update_cpu_topology(void) 6490 int __weak arch_update_cpu_topology(void)
6468 { 6491 {
6469 return 0; 6492 return 0;
6470 } 6493 }
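/*
 * Editor's sketch (not part of the kernel source): how an architecture might
 * override the __weak default above. The my_topology_changed flag is purely
 * illustrative; a real implementation derives this from its own
 * topology-change notifications.
 */
static bool my_topology_changed;	/* assumed to be set by arch topology code */

int arch_update_cpu_topology(void)
{
	bool changed = my_topology_changed;

	my_topology_changed = false;
	return changed ? 1 : 0;		/* 1: core maps changed, 0: unchanged */
}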
6471 6494
6472 cpumask_var_t *alloc_sched_domains(unsigned int ndoms) 6495 cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
6473 { 6496 {
6474 int i; 6497 int i;
6475 cpumask_var_t *doms; 6498 cpumask_var_t *doms;
6476 6499
6477 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); 6500 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
6478 if (!doms) 6501 if (!doms)
6479 return NULL; 6502 return NULL;
6480 for (i = 0; i < ndoms; i++) { 6503 for (i = 0; i < ndoms; i++) {
6481 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { 6504 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
6482 free_sched_domains(doms, i); 6505 free_sched_domains(doms, i);
6483 return NULL; 6506 return NULL;
6484 } 6507 }
6485 } 6508 }
6486 return doms; 6509 return doms;
6487 } 6510 }
6488 6511
6489 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) 6512 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
6490 { 6513 {
6491 unsigned int i; 6514 unsigned int i;
6492 for (i = 0; i < ndoms; i++) 6515 for (i = 0; i < ndoms; i++)
6493 free_cpumask_var(doms[i]); 6516 free_cpumask_var(doms[i]);
6494 kfree(doms); 6517 kfree(doms);
6495 } 6518 }
6496 6519
6497 /* 6520 /*
6498 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 6521 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6499 * For now this just excludes isolated cpus, but could be used to 6522 * For now this just excludes isolated cpus, but could be used to
6500 * exclude other special cases in the future. 6523 * exclude other special cases in the future.
6501 */ 6524 */
6502 static int init_sched_domains(const struct cpumask *cpu_map) 6525 static int init_sched_domains(const struct cpumask *cpu_map)
6503 { 6526 {
6504 int err; 6527 int err;
6505 6528
6506 arch_update_cpu_topology(); 6529 arch_update_cpu_topology();
6507 ndoms_cur = 1; 6530 ndoms_cur = 1;
6508 doms_cur = alloc_sched_domains(ndoms_cur); 6531 doms_cur = alloc_sched_domains(ndoms_cur);
6509 if (!doms_cur) 6532 if (!doms_cur)
6510 doms_cur = &fallback_doms; 6533 doms_cur = &fallback_doms;
6511 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 6534 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
6512 err = build_sched_domains(doms_cur[0], NULL); 6535 err = build_sched_domains(doms_cur[0], NULL);
6513 register_sched_domain_sysctl(); 6536 register_sched_domain_sysctl();
6514 6537
6515 return err; 6538 return err;
6516 } 6539 }
6517 6540
6518 /* 6541 /*
6519 * Detach sched domains from a group of cpus specified in cpu_map 6542 * Detach sched domains from a group of cpus specified in cpu_map
6520 * These cpus will now be attached to the NULL domain 6543 * These cpus will now be attached to the NULL domain
6521 */ 6544 */
6522 static void detach_destroy_domains(const struct cpumask *cpu_map) 6545 static void detach_destroy_domains(const struct cpumask *cpu_map)
6523 { 6546 {
6524 int i; 6547 int i;
6525 6548
6526 rcu_read_lock(); 6549 rcu_read_lock();
6527 for_each_cpu(i, cpu_map) 6550 for_each_cpu(i, cpu_map)
6528 cpu_attach_domain(NULL, &def_root_domain, i); 6551 cpu_attach_domain(NULL, &def_root_domain, i);
6529 rcu_read_unlock(); 6552 rcu_read_unlock();
6530 } 6553 }
6531 6554
6532 /* handle null as "default" */ 6555 /* handle null as "default" */
6533 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, 6556 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
6534 struct sched_domain_attr *new, int idx_new) 6557 struct sched_domain_attr *new, int idx_new)
6535 { 6558 {
6536 struct sched_domain_attr tmp; 6559 struct sched_domain_attr tmp;
6537 6560
6538 /* fast path */ 6561 /* fast path */
6539 if (!new && !cur) 6562 if (!new && !cur)
6540 return 1; 6563 return 1;
6541 6564
6542 tmp = SD_ATTR_INIT; 6565 tmp = SD_ATTR_INIT;
6543 return !memcmp(cur ? (cur + idx_cur) : &tmp, 6566 return !memcmp(cur ? (cur + idx_cur) : &tmp,
6544 new ? (new + idx_new) : &tmp, 6567 new ? (new + idx_new) : &tmp,
6545 sizeof(struct sched_domain_attr)); 6568 sizeof(struct sched_domain_attr));
6546 } 6569 }
6547 6570
6548 /* 6571 /*
6549 * Partition sched domains as specified by the 'ndoms_new' 6572 * Partition sched domains as specified by the 'ndoms_new'
6550 * cpumasks in the array doms_new[]. This compares 6573 * cpumasks in the array doms_new[]. This compares
6551 * doms_new[] to the current sched domain partitioning, doms_cur[]. 6574 * doms_new[] to the current sched domain partitioning, doms_cur[].
6552 * It destroys each deleted domain and builds each new domain. 6575 * It destroys each deleted domain and builds each new domain.
6553 * 6576 *
6554 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. 6577 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
6555 * The masks don't intersect (don't overlap). We should set up one 6578 * The masks don't intersect (don't overlap). We should set up one
6556 * sched domain for each mask. CPUs not in any of the cpumasks will 6579 * sched domain for each mask. CPUs not in any of the cpumasks will
6557 * not be load balanced. If the same cpumask appears both in the 6580 * not be load balanced. If the same cpumask appears both in the
6558 * current 'doms_cur' domains and in the new 'doms_new', we can leave 6581 * current 'doms_cur' domains and in the new 'doms_new', we can leave
6559 * it as it is. 6582 * it as it is.
6560 * 6583 *
6561 * The passed in 'doms_new' should be allocated using 6584 * The passed in 'doms_new' should be allocated using
6562 * alloc_sched_domains. This routine takes ownership of it and will 6585 * alloc_sched_domains. This routine takes ownership of it and will
6563 * free_sched_domains it when done with it. If the caller failed the 6586 * free_sched_domains it when done with it. If the caller failed the
6564 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, 6587 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
6565 * and partition_sched_domains() will fall back to the single partition 6588 * and partition_sched_domains() will fall back to the single partition
6566 * 'fallback_doms'; it also forces the domains to be rebuilt. 6589 * 'fallback_doms'; it also forces the domains to be rebuilt.
6567 * 6590 *
6568 * If doms_new == NULL it will be replaced with cpu_online_mask. 6591 * If doms_new == NULL it will be replaced with cpu_online_mask.
6569 * ndoms_new == 0 is a special case for destroying existing domains, 6592 * ndoms_new == 0 is a special case for destroying existing domains,
6570 * and it will not create the default domain. 6593 * and it will not create the default domain.
6571 * 6594 *
6572 * Call with hotplug lock held 6595 * Call with hotplug lock held
6573 */ 6596 */
6574 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 6597 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
6575 struct sched_domain_attr *dattr_new) 6598 struct sched_domain_attr *dattr_new)
6576 { 6599 {
6577 int i, j, n; 6600 int i, j, n;
6578 int new_topology; 6601 int new_topology;
6579 6602
6580 mutex_lock(&sched_domains_mutex); 6603 mutex_lock(&sched_domains_mutex);
6581 6604
6582 /* always unregister in case we don't destroy any domains */ 6605 /* always unregister in case we don't destroy any domains */
6583 unregister_sched_domain_sysctl(); 6606 unregister_sched_domain_sysctl();
6584 6607
6585 /* Let architecture update cpu core mappings. */ 6608 /* Let architecture update cpu core mappings. */
6586 new_topology = arch_update_cpu_topology(); 6609 new_topology = arch_update_cpu_topology();
6587 6610
6588 n = doms_new ? ndoms_new : 0; 6611 n = doms_new ? ndoms_new : 0;
6589 6612
6590 /* Destroy deleted domains */ 6613 /* Destroy deleted domains */
6591 for (i = 0; i < ndoms_cur; i++) { 6614 for (i = 0; i < ndoms_cur; i++) {
6592 for (j = 0; j < n && !new_topology; j++) { 6615 for (j = 0; j < n && !new_topology; j++) {
6593 if (cpumask_equal(doms_cur[i], doms_new[j]) 6616 if (cpumask_equal(doms_cur[i], doms_new[j])
6594 && dattrs_equal(dattr_cur, i, dattr_new, j)) 6617 && dattrs_equal(dattr_cur, i, dattr_new, j))
6595 goto match1; 6618 goto match1;
6596 } 6619 }
6597 /* no match - a current sched domain not in new doms_new[] */ 6620 /* no match - a current sched domain not in new doms_new[] */
6598 detach_destroy_domains(doms_cur[i]); 6621 detach_destroy_domains(doms_cur[i]);
6599 match1: 6622 match1:
6600 ; 6623 ;
6601 } 6624 }
6602 6625
6603 n = ndoms_cur; 6626 n = ndoms_cur;
6604 if (doms_new == NULL) { 6627 if (doms_new == NULL) {
6605 n = 0; 6628 n = 0;
6606 doms_new = &fallback_doms; 6629 doms_new = &fallback_doms;
6607 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); 6630 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
6608 WARN_ON_ONCE(dattr_new); 6631 WARN_ON_ONCE(dattr_new);
6609 } 6632 }
6610 6633
6611 /* Build new domains */ 6634 /* Build new domains */
6612 for (i = 0; i < ndoms_new; i++) { 6635 for (i = 0; i < ndoms_new; i++) {
6613 for (j = 0; j < n && !new_topology; j++) { 6636 for (j = 0; j < n && !new_topology; j++) {
6614 if (cpumask_equal(doms_new[i], doms_cur[j]) 6637 if (cpumask_equal(doms_new[i], doms_cur[j])
6615 && dattrs_equal(dattr_new, i, dattr_cur, j)) 6638 && dattrs_equal(dattr_new, i, dattr_cur, j))
6616 goto match2; 6639 goto match2;
6617 } 6640 }
6618 /* no match - add a new doms_new */ 6641 /* no match - add a new doms_new */
6619 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); 6642 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
6620 match2: 6643 match2:
6621 ; 6644 ;
6622 } 6645 }
6623 6646
6624 /* Remember the new sched domains */ 6647 /* Remember the new sched domains */
6625 if (doms_cur != &fallback_doms) 6648 if (doms_cur != &fallback_doms)
6626 free_sched_domains(doms_cur, ndoms_cur); 6649 free_sched_domains(doms_cur, ndoms_cur);
6627 kfree(dattr_cur); /* kfree(NULL) is safe */ 6650 kfree(dattr_cur); /* kfree(NULL) is safe */
6628 doms_cur = doms_new; 6651 doms_cur = doms_new;
6629 dattr_cur = dattr_new; 6652 dattr_cur = dattr_new;
6630 ndoms_cur = ndoms_new; 6653 ndoms_cur = ndoms_new;
6631 6654
6632 register_sched_domain_sysctl(); 6655 register_sched_domain_sysctl();
6633 6656
6634 mutex_unlock(&sched_domains_mutex); 6657 mutex_unlock(&sched_domains_mutex);
6635 } 6658 }
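/*
 * Editor's sketch (not part of the kernel source): a caller following the
 * ownership and fallback rules documented above. my_rebuild_domains(),
 * mask_a and mask_b are hypothetical; the real caller (the cpuset code)
 * derives its masks from the cpuset configuration. The hotplug lock must be
 * held around the call, as required above.
 */
static void my_rebuild_domains(const struct cpumask *mask_a,
			       const struct cpumask *mask_b)
{
	cpumask_var_t *doms = alloc_sched_domains(2);

	if (!doms) {
		/* Documented fallback: one partition, forced rebuild. */
		partition_sched_domains(1, NULL, NULL);
		return;
	}

	cpumask_copy(doms[0], mask_a);
	cpumask_copy(doms[1], mask_b);

	/* partition_sched_domains() takes ownership of 'doms'; don't free it. */
	partition_sched_domains(2, doms, NULL);
}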
6636 6659
6637 static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ 6660 static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
6638 6661
6639 /* 6662 /*
6640 * Update cpusets according to cpu_active mask. If cpusets are 6663 * Update cpusets according to cpu_active mask. If cpusets are
6641 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 6664 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
6642 * around partition_sched_domains(). 6665 * around partition_sched_domains().
6643 * 6666 *
6644 * If we come here as part of a suspend/resume, don't touch cpusets because we 6667 * If we come here as part of a suspend/resume, don't touch cpusets because we
6645 * want to restore them to their original state upon resume anyway. 6668 * want to restore them to their original state upon resume anyway.
6646 */ 6669 */
6647 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, 6670 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
6648 void *hcpu) 6671 void *hcpu)
6649 { 6672 {
6650 switch (action) { 6673 switch (action) {
6651 case CPU_ONLINE_FROZEN: 6674 case CPU_ONLINE_FROZEN:
6652 case CPU_DOWN_FAILED_FROZEN: 6675 case CPU_DOWN_FAILED_FROZEN:
6653 6676
6654 /* 6677 /*
6655 * num_cpus_frozen tracks how many CPUs are involved in the 6678 * num_cpus_frozen tracks how many CPUs are involved in the
6656 * suspend/resume sequence. As long as this is not the last online 6679 * suspend/resume sequence. As long as this is not the last online
6657 * operation in the resume sequence, just build a single sched 6680 * operation in the resume sequence, just build a single sched
6658 * domain, ignoring cpusets. 6681 * domain, ignoring cpusets.
6659 */ 6682 */
6660 num_cpus_frozen--; 6683 num_cpus_frozen--;
6661 if (likely(num_cpus_frozen)) { 6684 if (likely(num_cpus_frozen)) {
6662 partition_sched_domains(1, NULL, NULL); 6685 partition_sched_domains(1, NULL, NULL);
6663 break; 6686 break;
6664 } 6687 }
6665 6688
6666 /* 6689 /*
6667 * This is the last CPU online operation. So fall through and 6690 * This is the last CPU online operation. So fall through and
6668 * restore the original sched domains by considering the 6691 * restore the original sched domains by considering the
6669 * cpuset configurations. 6692 * cpuset configurations.
6670 */ 6693 */
6671 6694
6672 case CPU_ONLINE: 6695 case CPU_ONLINE:
6673 case CPU_DOWN_FAILED: 6696 case CPU_DOWN_FAILED:
6674 cpuset_update_active_cpus(true); 6697 cpuset_update_active_cpus(true);
6675 break; 6698 break;
6676 default: 6699 default:
6677 return NOTIFY_DONE; 6700 return NOTIFY_DONE;
6678 } 6701 }
6679 return NOTIFY_OK; 6702 return NOTIFY_OK;
6680 } 6703 }
6681 6704
6682 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, 6705 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
6683 void *hcpu) 6706 void *hcpu)
6684 { 6707 {
6685 switch (action) { 6708 switch (action) {
6686 case CPU_DOWN_PREPARE: 6709 case CPU_DOWN_PREPARE:
6687 cpuset_update_active_cpus(false); 6710 cpuset_update_active_cpus(false);
6688 break; 6711 break;
6689 case CPU_DOWN_PREPARE_FROZEN: 6712 case CPU_DOWN_PREPARE_FROZEN:
6690 num_cpus_frozen++; 6713 num_cpus_frozen++;
6691 partition_sched_domains(1, NULL, NULL); 6714 partition_sched_domains(1, NULL, NULL);
6692 break; 6715 break;
6693 default: 6716 default:
6694 return NOTIFY_DONE; 6717 return NOTIFY_DONE;
6695 } 6718 }
6696 return NOTIFY_OK; 6719 return NOTIFY_OK;
6697 } 6720 }
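/*
 * Editor's note: a worked trace of the num_cpus_frozen accounting in the two
 * notifiers above, assuming a 4-CPU machine whose boot CPU stays online
 * across suspend (the usual case):
 *
 *   Suspend:  CPUs 1..3 go down, each raising CPU_DOWN_PREPARE_FROZEN:
 *             num_cpus_frozen: 0 -> 1 -> 2 -> 3, each step collapsing to a
 *             single sched domain via partition_sched_domains(1, NULL, NULL).
 *   Resume:   CPUs 1..3 come back, each raising CPU_ONLINE_FROZEN:
 *             num_cpus_frozen: 3 -> 2 -> 1  (still non-zero: keep the single
 *                                            domain, cpusets untouched)
 *             num_cpus_frozen: 1 -> 0       (last online operation: fall
 *                                            through to
 *                                            cpuset_update_active_cpus(true)
 *                                            and restore the cpuset-defined
 *                                            domains)
 */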
6698 6721
6699 void __init sched_init_smp(void) 6722 void __init sched_init_smp(void)
6700 { 6723 {
6701 cpumask_var_t non_isolated_cpus; 6724 cpumask_var_t non_isolated_cpus;
6702 6725
6703 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 6726 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
6704 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 6727 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
6705 6728
6706 sched_init_numa(); 6729 sched_init_numa();
6707 6730
6708 /* 6731 /*
6709 * There's no userspace yet to cause hotplug operations; hence all the 6732 * There's no userspace yet to cause hotplug operations; hence all the
6710 * cpu masks are stable and all blatant races in the code below cannot 6733 * cpu masks are stable and all blatant races in the code below cannot
6711 * happen. 6734 * happen.
6712 */ 6735 */
6713 mutex_lock(&sched_domains_mutex); 6736 mutex_lock(&sched_domains_mutex);
6714 init_sched_domains(cpu_active_mask); 6737 init_sched_domains(cpu_active_mask);
6715 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 6738 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
6716 if (cpumask_empty(non_isolated_cpus)) 6739 if (cpumask_empty(non_isolated_cpus))
6717 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 6740 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
6718 mutex_unlock(&sched_domains_mutex); 6741 mutex_unlock(&sched_domains_mutex);
6719 6742
6720 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); 6743 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
6721 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 6744 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
6722 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); 6745 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
6723 6746
6724 init_hrtick(); 6747 init_hrtick();
6725 6748
6726 /* Move init over to a non-isolated CPU */ 6749 /* Move init over to a non-isolated CPU */
6727 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) 6750 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
6728 BUG(); 6751 BUG();
6729 sched_init_granularity(); 6752 sched_init_granularity();
6730 free_cpumask_var(non_isolated_cpus); 6753 free_cpumask_var(non_isolated_cpus);
6731 6754
6732 init_sched_rt_class(); 6755 init_sched_rt_class();
6733 init_sched_dl_class(); 6756 init_sched_dl_class();
6734 } 6757 }
6735 #else 6758 #else
6736 void __init sched_init_smp(void) 6759 void __init sched_init_smp(void)
6737 { 6760 {
6738 sched_init_granularity(); 6761 sched_init_granularity();
6739 } 6762 }
6740 #endif /* CONFIG_SMP */ 6763 #endif /* CONFIG_SMP */
6741 6764
6742 const_debug unsigned int sysctl_timer_migration = 1; 6765 const_debug unsigned int sysctl_timer_migration = 1;
6743 6766
6744 int in_sched_functions(unsigned long addr) 6767 int in_sched_functions(unsigned long addr)
6745 { 6768 {
6746 return in_lock_functions(addr) || 6769 return in_lock_functions(addr) ||
6747 (addr >= (unsigned long)__sched_text_start 6770 (addr >= (unsigned long)__sched_text_start
6748 && addr < (unsigned long)__sched_text_end); 6771 && addr < (unsigned long)__sched_text_end);
6749 } 6772 }
6750 6773
6751 #ifdef CONFIG_CGROUP_SCHED 6774 #ifdef CONFIG_CGROUP_SCHED
6752 /* 6775 /*
6753 * Default task group. 6776 * Default task group.
6754 * Every task in system belongs to this group at bootup. 6777 * Every task in system belongs to this group at bootup.
6755 */ 6778 */
6756 struct task_group root_task_group; 6779 struct task_group root_task_group;
6757 LIST_HEAD(task_groups); 6780 LIST_HEAD(task_groups);
6758 #endif 6781 #endif
6759 6782
6760 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); 6783 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
6761 6784
6762 void __init sched_init(void) 6785 void __init sched_init(void)
6763 { 6786 {
6764 int i, j; 6787 int i, j;
6765 unsigned long alloc_size = 0, ptr; 6788 unsigned long alloc_size = 0, ptr;
6766 6789
6767 #ifdef CONFIG_FAIR_GROUP_SCHED 6790 #ifdef CONFIG_FAIR_GROUP_SCHED
6768 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 6791 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6769 #endif 6792 #endif
6770 #ifdef CONFIG_RT_GROUP_SCHED 6793 #ifdef CONFIG_RT_GROUP_SCHED
6771 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 6794 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6772 #endif 6795 #endif
6773 #ifdef CONFIG_CPUMASK_OFFSTACK 6796 #ifdef CONFIG_CPUMASK_OFFSTACK
6774 alloc_size += num_possible_cpus() * cpumask_size(); 6797 alloc_size += num_possible_cpus() * cpumask_size();
6775 #endif 6798 #endif
6776 if (alloc_size) { 6799 if (alloc_size) {
6777 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 6800 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
6778 6801
6779 #ifdef CONFIG_FAIR_GROUP_SCHED 6802 #ifdef CONFIG_FAIR_GROUP_SCHED
6780 root_task_group.se = (struct sched_entity **)ptr; 6803 root_task_group.se = (struct sched_entity **)ptr;
6781 ptr += nr_cpu_ids * sizeof(void **); 6804 ptr += nr_cpu_ids * sizeof(void **);
6782 6805
6783 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 6806 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
6784 ptr += nr_cpu_ids * sizeof(void **); 6807 ptr += nr_cpu_ids * sizeof(void **);
6785 6808
6786 #endif /* CONFIG_FAIR_GROUP_SCHED */ 6809 #endif /* CONFIG_FAIR_GROUP_SCHED */
6787 #ifdef CONFIG_RT_GROUP_SCHED 6810 #ifdef CONFIG_RT_GROUP_SCHED
6788 root_task_group.rt_se = (struct sched_rt_entity **)ptr; 6811 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
6789 ptr += nr_cpu_ids * sizeof(void **); 6812 ptr += nr_cpu_ids * sizeof(void **);
6790 6813
6791 root_task_group.rt_rq = (struct rt_rq **)ptr; 6814 root_task_group.rt_rq = (struct rt_rq **)ptr;
6792 ptr += nr_cpu_ids * sizeof(void **); 6815 ptr += nr_cpu_ids * sizeof(void **);
6793 6816
6794 #endif /* CONFIG_RT_GROUP_SCHED */ 6817 #endif /* CONFIG_RT_GROUP_SCHED */
6795 #ifdef CONFIG_CPUMASK_OFFSTACK 6818 #ifdef CONFIG_CPUMASK_OFFSTACK
6796 for_each_possible_cpu(i) { 6819 for_each_possible_cpu(i) {
6797 per_cpu(load_balance_mask, i) = (void *)ptr; 6820 per_cpu(load_balance_mask, i) = (void *)ptr;
6798 ptr += cpumask_size(); 6821 ptr += cpumask_size();
6799 } 6822 }
6800 #endif /* CONFIG_CPUMASK_OFFSTACK */ 6823 #endif /* CONFIG_CPUMASK_OFFSTACK */
6801 } 6824 }
6802 6825
6803 init_rt_bandwidth(&def_rt_bandwidth, 6826 init_rt_bandwidth(&def_rt_bandwidth,
6804 global_rt_period(), global_rt_runtime()); 6827 global_rt_period(), global_rt_runtime());
6805 init_dl_bandwidth(&def_dl_bandwidth, 6828 init_dl_bandwidth(&def_dl_bandwidth,
6806 global_rt_period(), global_rt_runtime()); 6829 global_rt_period(), global_rt_runtime());
6807 6830
6808 #ifdef CONFIG_SMP 6831 #ifdef CONFIG_SMP
6809 init_defrootdomain(); 6832 init_defrootdomain();
6810 #endif 6833 #endif
6811 6834
6812 #ifdef CONFIG_RT_GROUP_SCHED 6835 #ifdef CONFIG_RT_GROUP_SCHED
6813 init_rt_bandwidth(&root_task_group.rt_bandwidth, 6836 init_rt_bandwidth(&root_task_group.rt_bandwidth,
6814 global_rt_period(), global_rt_runtime()); 6837 global_rt_period(), global_rt_runtime());
6815 #endif /* CONFIG_RT_GROUP_SCHED */ 6838 #endif /* CONFIG_RT_GROUP_SCHED */
6816 6839
6817 #ifdef CONFIG_CGROUP_SCHED 6840 #ifdef CONFIG_CGROUP_SCHED
6818 list_add(&root_task_group.list, &task_groups); 6841 list_add(&root_task_group.list, &task_groups);
6819 INIT_LIST_HEAD(&root_task_group.children); 6842 INIT_LIST_HEAD(&root_task_group.children);
6820 INIT_LIST_HEAD(&root_task_group.siblings); 6843 INIT_LIST_HEAD(&root_task_group.siblings);
6821 autogroup_init(&init_task); 6844 autogroup_init(&init_task);
6822 6845
6823 #endif /* CONFIG_CGROUP_SCHED */ 6846 #endif /* CONFIG_CGROUP_SCHED */
6824 6847
6825 for_each_possible_cpu(i) { 6848 for_each_possible_cpu(i) {
6826 struct rq *rq; 6849 struct rq *rq;
6827 6850
6828 rq = cpu_rq(i); 6851 rq = cpu_rq(i);
6829 raw_spin_lock_init(&rq->lock); 6852 raw_spin_lock_init(&rq->lock);
6830 rq->nr_running = 0; 6853 rq->nr_running = 0;
6831 rq->calc_load_active = 0; 6854 rq->calc_load_active = 0;
6832 rq->calc_load_update = jiffies + LOAD_FREQ; 6855 rq->calc_load_update = jiffies + LOAD_FREQ;
6833 init_cfs_rq(&rq->cfs); 6856 init_cfs_rq(&rq->cfs);
6834 init_rt_rq(&rq->rt, rq); 6857 init_rt_rq(&rq->rt, rq);
6835 init_dl_rq(&rq->dl, rq); 6858 init_dl_rq(&rq->dl, rq);
6836 #ifdef CONFIG_FAIR_GROUP_SCHED 6859 #ifdef CONFIG_FAIR_GROUP_SCHED
6837 root_task_group.shares = ROOT_TASK_GROUP_LOAD; 6860 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
6838 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6861 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6839 /* 6862 /*
6840 * How much cpu bandwidth does root_task_group get? 6863 * How much cpu bandwidth does root_task_group get?
6841 * 6864 *
6842 * In case of task-groups formed through the cgroup filesystem, it 6865 * In case of task-groups formed through the cgroup filesystem, it
6843 * gets 100% of the cpu resources in the system. This overall 6866 * gets 100% of the cpu resources in the system. This overall
6844 * system cpu resource is divided among the tasks of 6867 * system cpu resource is divided among the tasks of
6845 * root_task_group and its child task-groups in a fair manner, 6868 * root_task_group and its child task-groups in a fair manner,
6846 * based on each entity's (task or task-group's) weight 6869 * based on each entity's (task or task-group's) weight
6847 * (se->load.weight). 6870 * (se->load.weight).
6848 * 6871 *
6849 * In other words, if root_task_group has 10 tasks (of weight 6872 * In other words, if root_task_group has 10 tasks (of weight
6850 * 1024) and two child groups A0 and A1 (of weight 1024 each), 6873 * 1024) and two child groups A0 and A1 (of weight 1024 each),
6851 * then A0's share of the cpu resource is: 6874 * then A0's share of the cpu resource is:
6852 * 6875 *
6853 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 6876 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
6854 * 6877 *
6855 * We achieve this by letting root_task_group's tasks sit 6878 * We achieve this by letting root_task_group's tasks sit
6856 * directly in rq->cfs (i.e. root_task_group->se[] = NULL). 6879 * directly in rq->cfs (i.e. root_task_group->se[] = NULL).
6857 */ 6880 */
6858 init_cfs_bandwidth(&root_task_group.cfs_bandwidth); 6881 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
6859 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 6882 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
6860 #endif /* CONFIG_FAIR_GROUP_SCHED */ 6883 #endif /* CONFIG_FAIR_GROUP_SCHED */
6861 6884
6862 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 6885 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
6863 #ifdef CONFIG_RT_GROUP_SCHED 6886 #ifdef CONFIG_RT_GROUP_SCHED
6864 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 6887 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
6865 #endif 6888 #endif
6866 6889
6867 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 6890 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6868 rq->cpu_load[j] = 0; 6891 rq->cpu_load[j] = 0;
6869 6892
6870 rq->last_load_update_tick = jiffies; 6893 rq->last_load_update_tick = jiffies;
6871 6894
6872 #ifdef CONFIG_SMP 6895 #ifdef CONFIG_SMP
6873 rq->sd = NULL; 6896 rq->sd = NULL;
6874 rq->rd = NULL; 6897 rq->rd = NULL;
6875 rq->cpu_power = SCHED_POWER_SCALE; 6898 rq->cpu_power = SCHED_POWER_SCALE;
6876 rq->post_schedule = 0; 6899 rq->post_schedule = 0;
6877 rq->active_balance = 0; 6900 rq->active_balance = 0;
6878 rq->next_balance = jiffies; 6901 rq->next_balance = jiffies;
6879 rq->push_cpu = 0; 6902 rq->push_cpu = 0;
6880 rq->cpu = i; 6903 rq->cpu = i;
6881 rq->online = 0; 6904 rq->online = 0;
6882 rq->idle_stamp = 0; 6905 rq->idle_stamp = 0;
6883 rq->avg_idle = 2*sysctl_sched_migration_cost; 6906 rq->avg_idle = 2*sysctl_sched_migration_cost;
6884 rq->max_idle_balance_cost = sysctl_sched_migration_cost; 6907 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
6885 6908
6886 INIT_LIST_HEAD(&rq->cfs_tasks); 6909 INIT_LIST_HEAD(&rq->cfs_tasks);
6887 6910
6888 rq_attach_root(rq, &def_root_domain); 6911 rq_attach_root(rq, &def_root_domain);
6889 #ifdef CONFIG_NO_HZ_COMMON 6912 #ifdef CONFIG_NO_HZ_COMMON
6890 rq->nohz_flags = 0; 6913 rq->nohz_flags = 0;
6891 #endif 6914 #endif
6892 #ifdef CONFIG_NO_HZ_FULL 6915 #ifdef CONFIG_NO_HZ_FULL
6893 rq->last_sched_tick = 0; 6916 rq->last_sched_tick = 0;
6894 #endif 6917 #endif
6895 #endif 6918 #endif
6896 init_rq_hrtick(rq); 6919 init_rq_hrtick(rq);
6897 atomic_set(&rq->nr_iowait, 0); 6920 atomic_set(&rq->nr_iowait, 0);
6898 } 6921 }
6899 6922
6900 set_load_weight(&init_task); 6923 set_load_weight(&init_task);
6901 6924
6902 #ifdef CONFIG_PREEMPT_NOTIFIERS 6925 #ifdef CONFIG_PREEMPT_NOTIFIERS
6903 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6926 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
6904 #endif 6927 #endif
6905 6928
6906 /* 6929 /*
6907 * The boot idle thread does lazy MMU switching as well: 6930 * The boot idle thread does lazy MMU switching as well:
6908 */ 6931 */
6909 atomic_inc(&init_mm.mm_count); 6932 atomic_inc(&init_mm.mm_count);
6910 enter_lazy_tlb(&init_mm, current); 6933 enter_lazy_tlb(&init_mm, current);
6911 6934
6912 /* 6935 /*
6913 * Make us the idle thread. Technically, schedule() should not be 6936 * Make us the idle thread. Technically, schedule() should not be
6914 * called from this thread; however, somewhere below it might be, 6937 * called from this thread; however, somewhere below it might be,
6915 * but because we are the idle thread, we just pick up running again 6938 * but because we are the idle thread, we just pick up running again
6916 * when this runqueue becomes "idle". 6939 * when this runqueue becomes "idle".
6917 */ 6940 */
6918 init_idle(current, smp_processor_id()); 6941 init_idle(current, smp_processor_id());
6919 6942
6920 calc_load_update = jiffies + LOAD_FREQ; 6943 calc_load_update = jiffies + LOAD_FREQ;
6921 6944
6922 /* 6945 /*
6923 * During early bootup we pretend to be a normal task: 6946 * During early bootup we pretend to be a normal task:
6924 */ 6947 */
6925 current->sched_class = &fair_sched_class; 6948 current->sched_class = &fair_sched_class;
6926 6949
6927 #ifdef CONFIG_SMP 6950 #ifdef CONFIG_SMP
6928 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 6951 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
6929 /* May be allocated at isolcpus cmdline parse time */ 6952 /* May be allocated at isolcpus cmdline parse time */
6930 if (cpu_isolated_map == NULL) 6953 if (cpu_isolated_map == NULL)
6931 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 6954 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
6932 idle_thread_set_boot_cpu(); 6955 idle_thread_set_boot_cpu();
6933 #endif 6956 #endif
6934 init_sched_fair_class(); 6957 init_sched_fair_class();
6935 6958
6936 scheduler_running = 1; 6959 scheduler_running = 1;
6937 } 6960 }
6938 6961
6939 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 6962 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
6940 static inline int preempt_count_equals(int preempt_offset) 6963 static inline int preempt_count_equals(int preempt_offset)
6941 { 6964 {
6942 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 6965 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
6943 6966
6944 return (nested == preempt_offset); 6967 return (nested == preempt_offset);
6945 } 6968 }
6946 6969
6947 void __might_sleep(const char *file, int line, int preempt_offset) 6970 void __might_sleep(const char *file, int line, int preempt_offset)
6948 { 6971 {
6949 static unsigned long prev_jiffy; /* ratelimiting */ 6972 static unsigned long prev_jiffy; /* ratelimiting */
6950 6973
6951 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ 6974 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
6952 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && 6975 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
6953 !is_idle_task(current)) || 6976 !is_idle_task(current)) ||
6954 system_state != SYSTEM_RUNNING || oops_in_progress) 6977 system_state != SYSTEM_RUNNING || oops_in_progress)
6955 return; 6978 return;
6956 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 6979 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6957 return; 6980 return;
6958 prev_jiffy = jiffies; 6981 prev_jiffy = jiffies;
6959 6982
6960 printk(KERN_ERR 6983 printk(KERN_ERR
6961 "BUG: sleeping function called from invalid context at %s:%d\n", 6984 "BUG: sleeping function called from invalid context at %s:%d\n",
6962 file, line); 6985 file, line);
6963 printk(KERN_ERR 6986 printk(KERN_ERR
6964 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", 6987 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
6965 in_atomic(), irqs_disabled(), 6988 in_atomic(), irqs_disabled(),
6966 current->pid, current->comm); 6989 current->pid, current->comm);
6967 6990
6968 debug_show_held_locks(current); 6991 debug_show_held_locks(current);
6969 if (irqs_disabled()) 6992 if (irqs_disabled())
6970 print_irqtrace_events(current); 6993 print_irqtrace_events(current);
6971 #ifdef CONFIG_DEBUG_PREEMPT 6994 #ifdef CONFIG_DEBUG_PREEMPT
6972 if (!preempt_count_equals(preempt_offset)) { 6995 if (!preempt_count_equals(preempt_offset)) {
6973 pr_err("Preemption disabled at:"); 6996 pr_err("Preemption disabled at:");
6974 print_ip_sym(current->preempt_disable_ip); 6997 print_ip_sym(current->preempt_disable_ip);
6975 pr_cont("\n"); 6998 pr_cont("\n");
6976 } 6999 }
6977 #endif 7000 #endif
6978 dump_stack(); 7001 dump_stack();
6979 } 7002 }
6980 EXPORT_SYMBOL(__might_sleep); 7003 EXPORT_SYMBOL(__might_sleep);
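/*
 * Editor's sketch (not part of the kernel source): how the check above is
 * typically reached. might_sleep() from <linux/kernel.h> roughly expands to
 * __might_sleep(__FILE__, __LINE__, 0); calling it with preemption disabled
 * makes preempt_count_equals() fail and prints the "BUG: sleeping function
 * called from invalid context" report. my_debug_example() is illustrative.
 */
static void my_debug_example(void)
{
	preempt_disable();
	might_sleep();		/* non-zero preempt count -> splat above */
	preempt_enable();
}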
6981 #endif 7004 #endif
6982 7005
6983 #ifdef CONFIG_MAGIC_SYSRQ 7006 #ifdef CONFIG_MAGIC_SYSRQ
6984 static void normalize_task(struct rq *rq, struct task_struct *p) 7007 static void normalize_task(struct rq *rq, struct task_struct *p)
6985 { 7008 {
6986 const struct sched_class *prev_class = p->sched_class; 7009 const struct sched_class *prev_class = p->sched_class;
6987 struct sched_attr attr = { 7010 struct sched_attr attr = {
6988 .sched_policy = SCHED_NORMAL, 7011 .sched_policy = SCHED_NORMAL,
6989 }; 7012 };
6990 int old_prio = p->prio; 7013 int old_prio = p->prio;
6991 int on_rq; 7014 int on_rq;
6992 7015
6993 on_rq = p->on_rq; 7016 on_rq = p->on_rq;
6994 if (on_rq) 7017 if (on_rq)
6995 dequeue_task(rq, p, 0); 7018 dequeue_task(rq, p, 0);
6996 __setscheduler(rq, p, &attr); 7019 __setscheduler(rq, p, &attr);
6997 if (on_rq) { 7020 if (on_rq) {
6998 enqueue_task(rq, p, 0); 7021 enqueue_task(rq, p, 0);
6999 resched_task(rq->curr); 7022 resched_task(rq->curr);
7000 } 7023 }
7001 7024
7002 check_class_changed(rq, p, prev_class, old_prio); 7025 check_class_changed(rq, p, prev_class, old_prio);
7003 } 7026 }
7004 7027
7005 void normalize_rt_tasks(void) 7028 void normalize_rt_tasks(void)
7006 { 7029 {
7007 struct task_struct *g, *p; 7030 struct task_struct *g, *p;
7008 unsigned long flags; 7031 unsigned long flags;
7009 struct rq *rq; 7032 struct rq *rq;
7010 7033
7011 read_lock_irqsave(&tasklist_lock, flags); 7034 read_lock_irqsave(&tasklist_lock, flags);
7012 do_each_thread(g, p) { 7035 do_each_thread(g, p) {
7013 /* 7036 /*
7014 * Only normalize user tasks: 7037 * Only normalize user tasks:
7015 */ 7038 */
7016 if (!p->mm) 7039 if (!p->mm)
7017 continue; 7040 continue;
7018 7041
7019 p->se.exec_start = 0; 7042 p->se.exec_start = 0;
7020 #ifdef CONFIG_SCHEDSTATS 7043 #ifdef CONFIG_SCHEDSTATS
7021 p->se.statistics.wait_start = 0; 7044 p->se.statistics.wait_start = 0;
7022 p->se.statistics.sleep_start = 0; 7045 p->se.statistics.sleep_start = 0;
7023 p->se.statistics.block_start = 0; 7046 p->se.statistics.block_start = 0;
7024 #endif 7047 #endif
7025 7048
7026 if (!dl_task(p) && !rt_task(p)) { 7049 if (!dl_task(p) && !rt_task(p)) {
7027 /* 7050 /*
7028 * Renice negative nice level userspace 7051 * Renice negative nice level userspace
7029 * tasks back to 0: 7052 * tasks back to 0:
7030 */ 7053 */
7031 if (task_nice(p) < 0 && p->mm) 7054 if (task_nice(p) < 0 && p->mm)
7032 set_user_nice(p, 0); 7055 set_user_nice(p, 0);
7033 continue; 7056 continue;
7034 } 7057 }
7035 7058
7036 raw_spin_lock(&p->pi_lock); 7059 raw_spin_lock(&p->pi_lock);
7037 rq = __task_rq_lock(p); 7060 rq = __task_rq_lock(p);
7038 7061
7039 normalize_task(rq, p); 7062 normalize_task(rq, p);
7040 7063
7041 __task_rq_unlock(rq); 7064 __task_rq_unlock(rq);
7042 raw_spin_unlock(&p->pi_lock); 7065 raw_spin_unlock(&p->pi_lock);
7043 } while_each_thread(g, p); 7066 } while_each_thread(g, p);
7044 7067
7045 read_unlock_irqrestore(&tasklist_lock, flags); 7068 read_unlock_irqrestore(&tasklist_lock, flags);
7046 } 7069 }
7047 7070
7048 #endif /* CONFIG_MAGIC_SYSRQ */ 7071 #endif /* CONFIG_MAGIC_SYSRQ */
7049 7072
7050 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) 7073 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
7051 /* 7074 /*
7052 * These functions are only useful for the IA64 MCA handling, or kdb. 7075 * These functions are only useful for the IA64 MCA handling, or kdb.
7053 * 7076 *
7054 * They can only be called when the whole system has been 7077 * They can only be called when the whole system has been
7055 * stopped - every CPU needs to be quiescent, and no scheduling 7078 * stopped - every CPU needs to be quiescent, and no scheduling
7056 * activity can take place. Using them for anything else would 7079 * activity can take place. Using them for anything else would
7057 * be a serious bug, and as a result, they aren't even visible 7080 * be a serious bug, and as a result, they aren't even visible
7058 * under any other configuration. 7081 * under any other configuration.
7059 */ 7082 */
7060 7083
7061 /** 7084 /**
7062 * curr_task - return the current task for a given cpu. 7085 * curr_task - return the current task for a given cpu.
7063 * @cpu: the processor in question. 7086 * @cpu: the processor in question.
7064 * 7087 *
7065 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 7088 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7066 * 7089 *
7067 * Return: The current task for @cpu. 7090 * Return: The current task for @cpu.
7068 */ 7091 */
7069 struct task_struct *curr_task(int cpu) 7092 struct task_struct *curr_task(int cpu)
7070 { 7093 {
7071 return cpu_curr(cpu); 7094 return cpu_curr(cpu);
7072 } 7095 }
7073 7096
7074 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ 7097 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
7075 7098
7076 #ifdef CONFIG_IA64 7099 #ifdef CONFIG_IA64
7077 /** 7100 /**
7078 * set_curr_task - set the current task for a given cpu. 7101 * set_curr_task - set the current task for a given cpu.
7079 * @cpu: the processor in question. 7102 * @cpu: the processor in question.
7080 * @p: the task pointer to set. 7103 * @p: the task pointer to set.
7081 * 7104 *
7082 * Description: This function must only be used when non-maskable interrupts 7105 * Description: This function must only be used when non-maskable interrupts
7083 * are serviced on a separate stack. It allows the architecture to switch the 7106 * are serviced on a separate stack. It allows the architecture to switch the
7084 * notion of the current task on a cpu in a non-blocking manner. This function 7107 * notion of the current task on a cpu in a non-blocking manner. This function
7085 * must be called with all CPUs synchronized and interrupts disabled, and the 7108 * must be called with all CPUs synchronized and interrupts disabled, and the
7086 * caller must save the original value of the current task (see 7109 * caller must save the original value of the current task (see
7087 * curr_task() above) and restore that value before reenabling interrupts and 7110 * curr_task() above) and restore that value before reenabling interrupts and
7088 * re-starting the system. 7111 * re-starting the system.
7089 * 7112 *
7090 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 7113 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7091 */ 7114 */
7092 void set_curr_task(int cpu, struct task_struct *p) 7115 void set_curr_task(int cpu, struct task_struct *p)
7093 { 7116 {
7094 cpu_curr(cpu) = p; 7117 cpu_curr(cpu) = p;
7095 } 7118 }
7096 7119
7097 #endif 7120 #endif
7098 7121
7099 #ifdef CONFIG_CGROUP_SCHED 7122 #ifdef CONFIG_CGROUP_SCHED
7100 /* task_group_lock serializes the addition/removal of task groups */ 7123 /* task_group_lock serializes the addition/removal of task groups */
7101 static DEFINE_SPINLOCK(task_group_lock); 7124 static DEFINE_SPINLOCK(task_group_lock);
7102 7125
7103 static void free_sched_group(struct task_group *tg) 7126 static void free_sched_group(struct task_group *tg)
7104 { 7127 {
7105 free_fair_sched_group(tg); 7128 free_fair_sched_group(tg);
7106 free_rt_sched_group(tg); 7129 free_rt_sched_group(tg);
7107 autogroup_free(tg); 7130 autogroup_free(tg);
7108 kfree(tg); 7131 kfree(tg);
7109 } 7132 }
7110 7133
7111 /* allocate runqueue etc for a new task group */ 7134 /* allocate runqueue etc for a new task group */
7112 struct task_group *sched_create_group(struct task_group *parent) 7135 struct task_group *sched_create_group(struct task_group *parent)
7113 { 7136 {
7114 struct task_group *tg; 7137 struct task_group *tg;
7115 7138
7116 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 7139 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7117 if (!tg) 7140 if (!tg)
7118 return ERR_PTR(-ENOMEM); 7141 return ERR_PTR(-ENOMEM);
7119 7142
7120 if (!alloc_fair_sched_group(tg, parent)) 7143 if (!alloc_fair_sched_group(tg, parent))
7121 goto err; 7144 goto err;
7122 7145
7123 if (!alloc_rt_sched_group(tg, parent)) 7146 if (!alloc_rt_sched_group(tg, parent))
7124 goto err; 7147 goto err;
7125 7148
7126 return tg; 7149 return tg;
7127 7150
7128 err: 7151 err:
7129 free_sched_group(tg); 7152 free_sched_group(tg);
7130 return ERR_PTR(-ENOMEM); 7153 return ERR_PTR(-ENOMEM);
7131 } 7154 }
7132 7155
7133 void sched_online_group(struct task_group *tg, struct task_group *parent) 7156 void sched_online_group(struct task_group *tg, struct task_group *parent)
7134 { 7157 {
7135 unsigned long flags; 7158 unsigned long flags;
7136 7159
7137 spin_lock_irqsave(&task_group_lock, flags); 7160 spin_lock_irqsave(&task_group_lock, flags);
7138 list_add_rcu(&tg->list, &task_groups); 7161 list_add_rcu(&tg->list, &task_groups);
7139 7162
7140 WARN_ON(!parent); /* root should already exist */ 7163 WARN_ON(!parent); /* root should already exist */
7141 7164
7142 tg->parent = parent; 7165 tg->parent = parent;
7143 INIT_LIST_HEAD(&tg->children); 7166 INIT_LIST_HEAD(&tg->children);
7144 list_add_rcu(&tg->siblings, &parent->children); 7167 list_add_rcu(&tg->siblings, &parent->children);
7145 spin_unlock_irqrestore(&task_group_lock, flags); 7168 spin_unlock_irqrestore(&task_group_lock, flags);
7146 } 7169 }
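/*
 * Editor's sketch (not part of the kernel source): the expected pairing of the
 * helpers above, as driven by the cpu cgroup controller (css_alloc creates the
 * group, css_online publishes it). Teardown mirrors this with
 * sched_offline_group() followed by sched_destroy_group(), defined below.
 * my_make_group() and my_parent are illustrative names.
 */
static struct task_group *my_make_group(struct task_group *my_parent)
{
	struct task_group *tg = sched_create_group(my_parent);

	if (IS_ERR(tg))
		return tg;			/* ERR_PTR(-ENOMEM) */

	sched_online_group(tg, my_parent);	/* add to the task_groups list */
	return tg;
}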
7147 7170
7148 /* rcu callback to free various structures associated with a task group */ 7171 /* rcu callback to free various structures associated with a task group */
7149 static void free_sched_group_rcu(struct rcu_head *rhp) 7172 static void free_sched_group_rcu(struct rcu_head *rhp)
7150 { 7173 {
7151 /* now it should be safe to free those cfs_rqs */ 7174 /* now it should be safe to free those cfs_rqs */
7152 free_sched_group(container_of(rhp, struct task_group, rcu)); 7175 free_sched_group(container_of(rhp, struct task_group, rcu));
7153 } 7176 }
7154 7177
7155 /* Destroy runqueue etc associated with a task group */ 7178 /* Destroy runqueue etc associated with a task group */
7156 void sched_destroy_group(struct task_group *tg) 7179 void sched_destroy_group(struct task_group *tg)
7157 { 7180 {
7158 /* wait for possible concurrent references to cfs_rqs to complete */ 7181 /* wait for possible concurrent references to cfs_rqs to complete */
7159 call_rcu(&tg->rcu, free_sched_group_rcu); 7182 call_rcu(&tg->rcu, free_sched_group_rcu);
7160 } 7183 }
7161 7184
7162 void sched_offline_group(struct task_group *tg) 7185 void sched_offline_group(struct task_group *tg)
7163 { 7186 {
7164 unsigned long flags; 7187 unsigned long flags;
7165 int i; 7188 int i;
7166 7189
7167 /* end participation in shares distribution */ 7190 /* end participation in shares distribution */
7168 for_each_possible_cpu(i) 7191 for_each_possible_cpu(i)
7169 unregister_fair_sched_group(tg, i); 7192 unregister_fair_sched_group(tg, i);
7170 7193
7171 spin_lock_irqsave(&task_group_lock, flags); 7194 spin_lock_irqsave(&task_group_lock, flags);
7172 list_del_rcu(&tg->list); 7195 list_del_rcu(&tg->list);
7173 list_del_rcu(&tg->siblings); 7196 list_del_rcu(&tg->siblings);
7174 spin_unlock_irqrestore(&task_group_lock, flags); 7197 spin_unlock_irqrestore(&task_group_lock, flags);
7175 } 7198 }
7176 7199
7177 /* change task's runqueue when it moves between groups. 7200 /* change task's runqueue when it moves between groups.
7178 * The caller of this function should have put the task in its new group 7201 * The caller of this function should have put the task in its new group
7179 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to 7202 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
7180 * reflect its new group. 7203 * reflect its new group.
7181 */ 7204 */
7182 void sched_move_task(struct task_struct *tsk) 7205 void sched_move_task(struct task_struct *tsk)
7183 { 7206 {
7184 struct task_group *tg; 7207 struct task_group *tg;
7185 int on_rq, running; 7208 int on_rq, running;
7186 unsigned long flags; 7209 unsigned long flags;
7187 struct rq *rq; 7210 struct rq *rq;
7188 7211
7189 rq = task_rq_lock(tsk, &flags); 7212 rq = task_rq_lock(tsk, &flags);
7190 7213
7191 running = task_current(rq, tsk); 7214 running = task_current(rq, tsk);
7192 on_rq = tsk->on_rq; 7215 on_rq = tsk->on_rq;
7193 7216
7194 if (on_rq) 7217 if (on_rq)
7195 dequeue_task(rq, tsk, 0); 7218 dequeue_task(rq, tsk, 0);
7196 if (unlikely(running)) 7219 if (unlikely(running))
7197 tsk->sched_class->put_prev_task(rq, tsk); 7220 tsk->sched_class->put_prev_task(rq, tsk);
7198 7221
7199 tg = container_of(task_css_check(tsk, cpu_cgrp_id, 7222 tg = container_of(task_css_check(tsk, cpu_cgrp_id,
7200 lockdep_is_held(&tsk->sighand->siglock)), 7223 lockdep_is_held(&tsk->sighand->siglock)),
7201 struct task_group, css); 7224 struct task_group, css);
7202 tg = autogroup_task_group(tsk, tg); 7225 tg = autogroup_task_group(tsk, tg);
7203 tsk->sched_task_group = tg; 7226 tsk->sched_task_group = tg;
7204 7227
7205 #ifdef CONFIG_FAIR_GROUP_SCHED 7228 #ifdef CONFIG_FAIR_GROUP_SCHED
7206 if (tsk->sched_class->task_move_group) 7229 if (tsk->sched_class->task_move_group)
7207 tsk->sched_class->task_move_group(tsk, on_rq); 7230 tsk->sched_class->task_move_group(tsk, on_rq);
7208 else 7231 else
7209 #endif 7232 #endif
7210 set_task_rq(tsk, task_cpu(tsk)); 7233 set_task_rq(tsk, task_cpu(tsk));
7211 7234
7212 if (unlikely(running)) 7235 if (unlikely(running))
7213 tsk->sched_class->set_curr_task(rq); 7236 tsk->sched_class->set_curr_task(rq);
7214 if (on_rq) 7237 if (on_rq)
7215 enqueue_task(rq, tsk, 0); 7238 enqueue_task(rq, tsk, 0);
7216 7239
7217 task_rq_unlock(rq, tsk, &flags); 7240 task_rq_unlock(rq, tsk, &flags);
7218 } 7241 }
7219 #endif /* CONFIG_CGROUP_SCHED */ 7242 #endif /* CONFIG_CGROUP_SCHED */
7220 7243
7221 #ifdef CONFIG_RT_GROUP_SCHED 7244 #ifdef CONFIG_RT_GROUP_SCHED
7222 /* 7245 /*
7223 * Ensure that the real time constraints are schedulable. 7246 * Ensure that the real time constraints are schedulable.
7224 */ 7247 */
7225 static DEFINE_MUTEX(rt_constraints_mutex); 7248 static DEFINE_MUTEX(rt_constraints_mutex);
7226 7249
7227 /* Must be called with tasklist_lock held */ 7250 /* Must be called with tasklist_lock held */
7228 static inline int tg_has_rt_tasks(struct task_group *tg) 7251 static inline int tg_has_rt_tasks(struct task_group *tg)
7229 { 7252 {
7230 struct task_struct *g, *p; 7253 struct task_struct *g, *p;
7231 7254
7232 do_each_thread(g, p) { 7255 do_each_thread(g, p) {
7233 if (rt_task(p) && task_rq(p)->rt.tg == tg) 7256 if (rt_task(p) && task_rq(p)->rt.tg == tg)
7234 return 1; 7257 return 1;
7235 } while_each_thread(g, p); 7258 } while_each_thread(g, p);
7236 7259
7237 return 0; 7260 return 0;
7238 } 7261 }
7239 7262
7240 struct rt_schedulable_data { 7263 struct rt_schedulable_data {
7241 struct task_group *tg; 7264 struct task_group *tg;
7242 u64 rt_period; 7265 u64 rt_period;
7243 u64 rt_runtime; 7266 u64 rt_runtime;
7244 }; 7267 };
7245 7268
7246 static int tg_rt_schedulable(struct task_group *tg, void *data) 7269 static int tg_rt_schedulable(struct task_group *tg, void *data)
7247 { 7270 {
7248 struct rt_schedulable_data *d = data; 7271 struct rt_schedulable_data *d = data;
7249 struct task_group *child; 7272 struct task_group *child;
7250 unsigned long total, sum = 0; 7273 unsigned long total, sum = 0;
7251 u64 period, runtime; 7274 u64 period, runtime;
7252 7275
7253 period = ktime_to_ns(tg->rt_bandwidth.rt_period); 7276 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7254 runtime = tg->rt_bandwidth.rt_runtime; 7277 runtime = tg->rt_bandwidth.rt_runtime;
7255 7278
7256 if (tg == d->tg) { 7279 if (tg == d->tg) {
7257 period = d->rt_period; 7280 period = d->rt_period;
7258 runtime = d->rt_runtime; 7281 runtime = d->rt_runtime;
7259 } 7282 }
7260 7283
7261 /* 7284 /*
7262 * Cannot have more runtime than the period. 7285 * Cannot have more runtime than the period.
7263 */ 7286 */
7264 if (runtime > period && runtime != RUNTIME_INF) 7287 if (runtime > period && runtime != RUNTIME_INF)
7265 return -EINVAL; 7288 return -EINVAL;
7266 7289
7267 /* 7290 /*
7268 * Ensure we don't starve existing RT tasks. 7291 * Ensure we don't starve existing RT tasks.
7269 */ 7292 */
7270 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) 7293 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
7271 return -EBUSY; 7294 return -EBUSY;
7272 7295
7273 total = to_ratio(period, runtime); 7296 total = to_ratio(period, runtime);
7274 7297
7275 /* 7298 /*
7276 * Nobody can have more than the global setting allows. 7299 * Nobody can have more than the global setting allows.
7277 */ 7300 */
7278 if (total > to_ratio(global_rt_period(), global_rt_runtime())) 7301 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
7279 return -EINVAL; 7302 return -EINVAL;
7280 7303
7281 /* 7304 /*
7282 * The sum of our children's runtime should not exceed our own. 7305 * The sum of our children's runtime should not exceed our own.
7283 */ 7306 */
7284 list_for_each_entry_rcu(child, &tg->children, siblings) { 7307 list_for_each_entry_rcu(child, &tg->children, siblings) {
7285 period = ktime_to_ns(child->rt_bandwidth.rt_period); 7308 period = ktime_to_ns(child->rt_bandwidth.rt_period);
7286 runtime = child->rt_bandwidth.rt_runtime; 7309 runtime = child->rt_bandwidth.rt_runtime;
7287 7310
7288 if (child == d->tg) { 7311 if (child == d->tg) {
7289 period = d->rt_period; 7312 period = d->rt_period;
7290 runtime = d->rt_runtime; 7313 runtime = d->rt_runtime;
7291 } 7314 }
7292 7315
7293 sum += to_ratio(period, runtime); 7316 sum += to_ratio(period, runtime);
7294 } 7317 }
7295 7318
7296 if (sum > total) 7319 if (sum > total)
7297 return -EINVAL; 7320 return -EINVAL;
7298 7321
7299 return 0; 7322 return 0;
7300 } 7323 }
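/*
 * Editor's note: a worked example of the checks above, assuming the default
 * global values (rt_period = 1s, rt_runtime = 0.95s) and to_ratio() scaling
 * runtime/period by 2^20 as defined earlier in this file:
 *
 *   global cap:  to_ratio(1s, 0.95s) = 996147
 *
 *   A group requesting runtime = 0.5s with period = 1s:
 *     total = to_ratio(1s, 0.5s) = 524288  <= 996147            -> allowed
 *
 *   Two children of that group each requesting 0.3s per 1s:
 *     sum = 2 * to_ratio(1s, 0.3s) = 2 * 314572 = 629144 > 524288 -> -EINVAL
 */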
7301 7324
7302 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 7325 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
7303 { 7326 {
7304 int ret; 7327 int ret;
7305 7328
7306 struct rt_schedulable_data data = { 7329 struct rt_schedulable_data data = {
7307 .tg = tg, 7330 .tg = tg,
7308 .rt_period = period, 7331 .rt_period = period,
7309 .rt_runtime = runtime, 7332 .rt_runtime = runtime,
7310 }; 7333 };
7311 7334
7312 rcu_read_lock(); 7335 rcu_read_lock();
7313 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); 7336 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
7314 rcu_read_unlock(); 7337 rcu_read_unlock();
7315 7338
7316 return ret; 7339 return ret;
7317 } 7340 }
7318 7341
7319 static int tg_set_rt_bandwidth(struct task_group *tg, 7342 static int tg_set_rt_bandwidth(struct task_group *tg,
7320 u64 rt_period, u64 rt_runtime) 7343 u64 rt_period, u64 rt_runtime)
7321 { 7344 {
7322 int i, err = 0; 7345 int i, err = 0;
7323 7346
7324 mutex_lock(&rt_constraints_mutex); 7347 mutex_lock(&rt_constraints_mutex);
7325 read_lock(&tasklist_lock); 7348 read_lock(&tasklist_lock);
7326 err = __rt_schedulable(tg, rt_period, rt_runtime); 7349 err = __rt_schedulable(tg, rt_period, rt_runtime);
7327 if (err) 7350 if (err)
7328 goto unlock; 7351 goto unlock;
7329 7352
7330 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 7353 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7331 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 7354 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
7332 tg->rt_bandwidth.rt_runtime = rt_runtime; 7355 tg->rt_bandwidth.rt_runtime = rt_runtime;
7333 7356
7334 for_each_possible_cpu(i) { 7357 for_each_possible_cpu(i) {
7335 struct rt_rq *rt_rq = tg->rt_rq[i]; 7358 struct rt_rq *rt_rq = tg->rt_rq[i];
7336 7359
7337 raw_spin_lock(&rt_rq->rt_runtime_lock); 7360 raw_spin_lock(&rt_rq->rt_runtime_lock);
7338 rt_rq->rt_runtime = rt_runtime; 7361 rt_rq->rt_runtime = rt_runtime;
7339 raw_spin_unlock(&rt_rq->rt_runtime_lock); 7362 raw_spin_unlock(&rt_rq->rt_runtime_lock);
7340 } 7363 }
7341 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 7364 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7342 unlock: 7365 unlock:
7343 read_unlock(&tasklist_lock); 7366 read_unlock(&tasklist_lock);
7344 mutex_unlock(&rt_constraints_mutex); 7367 mutex_unlock(&rt_constraints_mutex);
7345 7368
7346 return err; 7369 return err;
7347 } 7370 }
7348 7371
7349 static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 7372 static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7350 { 7373 {
7351 u64 rt_runtime, rt_period; 7374 u64 rt_runtime, rt_period;
7352 7375
7353 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 7376 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7354 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 7377 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
7355 if (rt_runtime_us < 0) 7378 if (rt_runtime_us < 0)
7356 rt_runtime = RUNTIME_INF; 7379 rt_runtime = RUNTIME_INF;
7357 7380
7358 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7381 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7359 } 7382 }
7360 7383
7361 static long sched_group_rt_runtime(struct task_group *tg) 7384 static long sched_group_rt_runtime(struct task_group *tg)
7362 { 7385 {
7363 u64 rt_runtime_us; 7386 u64 rt_runtime_us;
7364 7387
7365 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) 7388 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
7366 return -1; 7389 return -1;
7367 7390
7368 rt_runtime_us = tg->rt_bandwidth.rt_runtime; 7391 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
7369 do_div(rt_runtime_us, NSEC_PER_USEC); 7392 do_div(rt_runtime_us, NSEC_PER_USEC);
7370 return rt_runtime_us; 7393 return rt_runtime_us;
7371 } 7394 }
7372 7395
7373 static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 7396 static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
7374 { 7397 {
7375 u64 rt_runtime, rt_period; 7398 u64 rt_runtime, rt_period;
7376 7399
7377 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 7400 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
7378 rt_runtime = tg->rt_bandwidth.rt_runtime; 7401 rt_runtime = tg->rt_bandwidth.rt_runtime;
7379 7402
7380 if (rt_period == 0) 7403 if (rt_period == 0)
7381 return -EINVAL; 7404 return -EINVAL;
7382 7405
7383 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7406 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7384 } 7407 }
7385 7408
7386 static long sched_group_rt_period(struct task_group *tg) 7409 static long sched_group_rt_period(struct task_group *tg)
7387 { 7410 {
7388 u64 rt_period_us; 7411 u64 rt_period_us;
7389 7412
7390 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); 7413 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
7391 do_div(rt_period_us, NSEC_PER_USEC); 7414 do_div(rt_period_us, NSEC_PER_USEC);
7392 return rt_period_us; 7415 return rt_period_us;
7393 } 7416 }
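/*
 * Editor's note: these four helpers back the cpu cgroup's rt_runtime_us /
 * rt_period_us knobs (microseconds at the interface, nanoseconds and ktime
 * internally). Illustrative values:
 *
 *   write  950000 to cpu.rt_runtime_us -> rt_runtime = 950000 * NSEC_PER_USEC
 *                                                    = 950,000,000 ns
 *   write      -1 to cpu.rt_runtime_us -> rt_runtime = RUNTIME_INF (no limit)
 *   read        cpu.rt_period_us       -> ktime_to_ns(rt_period) / NSEC_PER_USEC,
 *                                         1,000,000 us by default
 */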
7394 #endif /* CONFIG_RT_GROUP_SCHED */ 7417 #endif /* CONFIG_RT_GROUP_SCHED */
7395 7418
7396 #ifdef CONFIG_RT_GROUP_SCHED 7419 #ifdef CONFIG_RT_GROUP_SCHED
7397 static int sched_rt_global_constraints(void) 7420 static int sched_rt_global_constraints(void)
7398 { 7421 {
7399 int ret = 0; 7422 int ret = 0;
7400 7423
7401 mutex_lock(&rt_constraints_mutex); 7424 mutex_lock(&rt_constraints_mutex);
7402 read_lock(&tasklist_lock); 7425 read_lock(&tasklist_lock);
7403 ret = __rt_schedulable(NULL, 0, 0); 7426 ret = __rt_schedulable(NULL, 0, 0);
7404 read_unlock(&tasklist_lock); 7427 read_unlock(&tasklist_lock);
7405 mutex_unlock(&rt_constraints_mutex); 7428 mutex_unlock(&rt_constraints_mutex);
7406 7429
7407 return ret; 7430 return ret;
7408 } 7431 }
7409 7432
7410 static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 7433 static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
7411 { 7434 {
7412 /* Don't accept realtime tasks when there is no way for them to run */ 7435 /* Don't accept realtime tasks when there is no way for them to run */
7413 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 7436 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
7414 return 0; 7437 return 0;
7415 7438
7416 return 1; 7439 return 1;
7417 } 7440 }
7418 7441
7419 #else /* !CONFIG_RT_GROUP_SCHED */ 7442 #else /* !CONFIG_RT_GROUP_SCHED */
7420 static int sched_rt_global_constraints(void) 7443 static int sched_rt_global_constraints(void)
7421 { 7444 {
7422 unsigned long flags; 7445 unsigned long flags;
7423 int i, ret = 0; 7446 int i, ret = 0;
7424 7447
7425 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 7448 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
7426 for_each_possible_cpu(i) { 7449 for_each_possible_cpu(i) {
7427 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 7450 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
7428 7451
7429 raw_spin_lock(&rt_rq->rt_runtime_lock); 7452 raw_spin_lock(&rt_rq->rt_runtime_lock);
7430 rt_rq->rt_runtime = global_rt_runtime(); 7453 rt_rq->rt_runtime = global_rt_runtime();
7431 raw_spin_unlock(&rt_rq->rt_runtime_lock); 7454 raw_spin_unlock(&rt_rq->rt_runtime_lock);
7432 } 7455 }
7433 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 7456 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
7434 7457
7435 return ret; 7458 return ret;
7436 } 7459 }
7437 #endif /* CONFIG_RT_GROUP_SCHED */ 7460 #endif /* CONFIG_RT_GROUP_SCHED */
7438 7461
7439 static int sched_dl_global_constraints(void) 7462 static int sched_dl_global_constraints(void)
7440 { 7463 {
7441 u64 runtime = global_rt_runtime(); 7464 u64 runtime = global_rt_runtime();
7442 u64 period = global_rt_period(); 7465 u64 period = global_rt_period();
7443 u64 new_bw = to_ratio(period, runtime); 7466 u64 new_bw = to_ratio(period, runtime);
7444 int cpu, ret = 0; 7467 int cpu, ret = 0;
7445 unsigned long flags; 7468 unsigned long flags;
7446 7469
7447 /* 7470 /*
7448 * Here we want to check the bandwidth not being set to some 7471 * Here we want to check the bandwidth not being set to some
7449 * value smaller than the currently allocated bandwidth in 7472 * value smaller than the currently allocated bandwidth in
7450 * any of the root_domains. 7473 * any of the root_domains.
7451 * 7474 *
7452 * FIXME: Cycling on all the CPUs is overdoing it, but simpler than 7475 * FIXME: Cycling on all the CPUs is overdoing it, but simpler than
7453 * cycling on root_domains... Discussion on different/better 7476 * cycling on root_domains... Discussion on different/better
7454 * solutions is welcome! 7477 * solutions is welcome!
7455 */ 7478 */
7456 for_each_possible_cpu(cpu) { 7479 for_each_possible_cpu(cpu) {
7457 struct dl_bw *dl_b = dl_bw_of(cpu); 7480 struct dl_bw *dl_b = dl_bw_of(cpu);
7458 7481
7459 raw_spin_lock_irqsave(&dl_b->lock, flags); 7482 raw_spin_lock_irqsave(&dl_b->lock, flags);
7460 if (new_bw < dl_b->total_bw) 7483 if (new_bw < dl_b->total_bw)
7461 ret = -EBUSY; 7484 ret = -EBUSY;
7462 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7485 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7463 7486
7464 if (ret) 7487 if (ret)
7465 break; 7488 break;
7466 } 7489 }
7467 7490
7468 return ret; 7491 return ret;
7469 } 7492 }
7470 7493
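sched_dl_global_constraints() rejects a new global runtime/period pair whenever the bandwidth it encodes would drop below what -deadline tasks have already reserved in any root domain; as the FIXME above notes, it does so by visiting every possible CPU rather than each root domain once. The comparison works on a fixed-point ratio produced by to_ratio(), which is not shown in this hunk. The stand-alone sketch below assumes the usual Q20 scaling (runtime << 20 / period) and uses made-up reservation numbers purely for illustration:

    #include <stdint.h>
    #include <stdio.h>

    #define BW_SHIFT 20 /* assumed fixed-point shift, as in to_ratio() */

    /* runtime/period as a Q20 fixed-point fraction of one CPU */
    static uint64_t to_ratio(uint64_t period_ns, uint64_t runtime_ns)
    {
            if (period_ns == 0)
                    return 0;
            return (runtime_ns << BW_SHIFT) / period_ns;
    }

    int main(void)
    {
            /* hypothetical per-root-domain reservations already granted */
            uint64_t total_bw[2] = { to_ratio(100000000, 30000000),   /* 30% */
                                     to_ratio(100000000, 60000000) }; /* 60% */

            /* proposed new global limit: 50% of each CPU */
            uint64_t new_bw = to_ratio(1000000000, 500000000);

            for (int rd = 0; rd < 2; rd++) {
                    if (new_bw < total_bw[rd]) {
                            printf("root domain %d: -EBUSY (%.2f%% already reserved)\n",
                                   rd, 100.0 * total_bw[rd] / (1 << BW_SHIFT));
                            return 1;
                    }
            }
            printf("new limit admitted\n");
            return 0;
    }

Here the second "root domain" already holds 60% of a CPU, so shrinking the global limit to 50% is refused, which is exactly the -EBUSY case in the loop above.
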
7471 static void sched_dl_do_global(void) 7494 static void sched_dl_do_global(void)
7472 { 7495 {
7473 u64 new_bw = -1; 7496 u64 new_bw = -1;
7474 int cpu; 7497 int cpu;
7475 unsigned long flags; 7498 unsigned long flags;
7476 7499
7477 def_dl_bandwidth.dl_period = global_rt_period(); 7500 def_dl_bandwidth.dl_period = global_rt_period();
7478 def_dl_bandwidth.dl_runtime = global_rt_runtime(); 7501 def_dl_bandwidth.dl_runtime = global_rt_runtime();
7479 7502
7480 if (global_rt_runtime() != RUNTIME_INF) 7503 if (global_rt_runtime() != RUNTIME_INF)
7481 new_bw = to_ratio(global_rt_period(), global_rt_runtime()); 7504 new_bw = to_ratio(global_rt_period(), global_rt_runtime());
7482 7505
7483 /* 7506 /*
7484 * FIXME: As above... 7507 * FIXME: As above...
7485 */ 7508 */
7486 for_each_possible_cpu(cpu) { 7509 for_each_possible_cpu(cpu) {
7487 struct dl_bw *dl_b = dl_bw_of(cpu); 7510 struct dl_bw *dl_b = dl_bw_of(cpu);
7488 7511
7489 raw_spin_lock_irqsave(&dl_b->lock, flags); 7512 raw_spin_lock_irqsave(&dl_b->lock, flags);
7490 dl_b->bw = new_bw; 7513 dl_b->bw = new_bw;
7491 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7514 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7492 } 7515 }
7493 } 7516 }
7494 7517
7495 static int sched_rt_global_validate(void) 7518 static int sched_rt_global_validate(void)
7496 { 7519 {
7497 if (sysctl_sched_rt_period <= 0) 7520 if (sysctl_sched_rt_period <= 0)
7498 return -EINVAL; 7521 return -EINVAL;
7499 7522
7500 if ((sysctl_sched_rt_runtime != RUNTIME_INF) && 7523 if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
7501 (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) 7524 (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
7502 return -EINVAL; 7525 return -EINVAL;
7503 7526
7504 return 0; 7527 return 0;
7505 } 7528 }
7506 7529
7507 static void sched_rt_do_global(void) 7530 static void sched_rt_do_global(void)
7508 { 7531 {
7509 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 7532 def_rt_bandwidth.rt_runtime = global_rt_runtime();
7510 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); 7533 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
7511 } 7534 }
7512 7535
7513 int sched_rt_handler(struct ctl_table *table, int write, 7536 int sched_rt_handler(struct ctl_table *table, int write,
7514 void __user *buffer, size_t *lenp, 7537 void __user *buffer, size_t *lenp,
7515 loff_t *ppos) 7538 loff_t *ppos)
7516 { 7539 {
7517 int old_period, old_runtime; 7540 int old_period, old_runtime;
7518 static DEFINE_MUTEX(mutex); 7541 static DEFINE_MUTEX(mutex);
7519 int ret; 7542 int ret;
7520 7543
7521 mutex_lock(&mutex); 7544 mutex_lock(&mutex);
7522 old_period = sysctl_sched_rt_period; 7545 old_period = sysctl_sched_rt_period;
7523 old_runtime = sysctl_sched_rt_runtime; 7546 old_runtime = sysctl_sched_rt_runtime;
7524 7547
7525 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7548 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7526 7549
7527 if (!ret && write) { 7550 if (!ret && write) {
7528 ret = sched_rt_global_validate(); 7551 ret = sched_rt_global_validate();
7529 if (ret) 7552 if (ret)
7530 goto undo; 7553 goto undo;
7531 7554
7532 ret = sched_rt_global_constraints(); 7555 ret = sched_rt_global_constraints();
7533 if (ret) 7556 if (ret)
7534 goto undo; 7557 goto undo;
7535 7558
7536 ret = sched_dl_global_constraints(); 7559 ret = sched_dl_global_constraints();
7537 if (ret) 7560 if (ret)
7538 goto undo; 7561 goto undo;
7539 7562
7540 sched_rt_do_global(); 7563 sched_rt_do_global();
7541 sched_dl_do_global(); 7564 sched_dl_do_global();
7542 } 7565 }
7543 if (0) { 7566 if (0) {
7544 undo: 7567 undo:
7545 sysctl_sched_rt_period = old_period; 7568 sysctl_sched_rt_period = old_period;
7546 sysctl_sched_rt_runtime = old_runtime; 7569 sysctl_sched_rt_runtime = old_runtime;
7547 } 7570 }
7548 mutex_unlock(&mutex); 7571 mutex_unlock(&mutex);
7549 7572
7550 return ret; 7573 return ret;
7551 } 7574 }
7552 7575
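sched_rt_handler() snapshots the old period and runtime before proc_dointvec() overwrites them, and the `if (0) { undo: ... }` construct restores that snapshot on any validation failure while keeping a single unlock-and-return path. A small user-space sketch of the same save/validate/rollback shape, with invented setting names and a stub validator, just to show the control flow:

    #include <stdio.h>

    /* stand-ins for the two sysctl-backed globals */
    static int cfg_period = 1000000;
    static int cfg_runtime = 950000;

    static int validate(void)
    {
            if (cfg_period <= 0)
                    return -1;
            if (cfg_runtime > cfg_period)
                    return -1;
            return 0;
    }

    static int update(int new_period, int new_runtime)
    {
            int old_period = cfg_period;     /* snapshot before writing */
            int old_runtime = cfg_runtime;
            int ret;

            cfg_period = new_period;         /* tentative write */
            cfg_runtime = new_runtime;

            ret = validate();
            if (ret) {                       /* roll back on failure */
                    cfg_period = old_period;
                    cfg_runtime = old_runtime;
            }
            return ret;
    }

    int main(void)
    {
            int ret;

            ret = update(1000000, 500000);
            printf("good update: %d\n", ret);

            ret = update(100, 500000);       /* runtime > period, rejected */
            printf("bad update:  %d (period=%d runtime=%d kept)\n",
                   ret, cfg_period, cfg_runtime);
            return 0;
    }
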
7553 int sched_rr_handler(struct ctl_table *table, int write, 7576 int sched_rr_handler(struct ctl_table *table, int write,
7554 void __user *buffer, size_t *lenp, 7577 void __user *buffer, size_t *lenp,
7555 loff_t *ppos) 7578 loff_t *ppos)
7556 { 7579 {
7557 int ret; 7580 int ret;
7558 static DEFINE_MUTEX(mutex); 7581 static DEFINE_MUTEX(mutex);
7559 7582
7560 mutex_lock(&mutex); 7583 mutex_lock(&mutex);
7561 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7584 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7562 /* make sure that internally we keep jiffies */ 7585 /* make sure that internally we keep jiffies */
7563 /* also, writing zero resets timeslice to default */ 7586 /* also, writing zero resets timeslice to default */
7564 if (!ret && write) { 7587 if (!ret && write) {
7565 sched_rr_timeslice = sched_rr_timeslice <= 0 ? 7588 sched_rr_timeslice = sched_rr_timeslice <= 0 ?
7566 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); 7589 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
7567 } 7590 }
7568 mutex_unlock(&mutex); 7591 mutex_unlock(&mutex);
7569 return ret; 7592 return ret;
7570 } 7593 }
7571 7594
7572 #ifdef CONFIG_CGROUP_SCHED 7595 #ifdef CONFIG_CGROUP_SCHED
7573 7596
7574 static inline struct task_group *css_tg(struct cgroup_subsys_state *css) 7597 static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
7575 { 7598 {
7576 return css ? container_of(css, struct task_group, css) : NULL; 7599 return css ? container_of(css, struct task_group, css) : NULL;
7577 } 7600 }
7578 7601
7579 static struct cgroup_subsys_state * 7602 static struct cgroup_subsys_state *
7580 cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 7603 cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
7581 { 7604 {
7582 struct task_group *parent = css_tg(parent_css); 7605 struct task_group *parent = css_tg(parent_css);
7583 struct task_group *tg; 7606 struct task_group *tg;
7584 7607
7585 if (!parent) { 7608 if (!parent) {
7586 /* This is early initialization for the top cgroup */ 7609 /* This is early initialization for the top cgroup */
7587 return &root_task_group.css; 7610 return &root_task_group.css;
7588 } 7611 }
7589 7612
7590 tg = sched_create_group(parent); 7613 tg = sched_create_group(parent);
7591 if (IS_ERR(tg)) 7614 if (IS_ERR(tg))
7592 return ERR_PTR(-ENOMEM); 7615 return ERR_PTR(-ENOMEM);
7593 7616
7594 return &tg->css; 7617 return &tg->css;
7595 } 7618 }
7596 7619
7597 static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) 7620 static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
7598 { 7621 {
7599 struct task_group *tg = css_tg(css); 7622 struct task_group *tg = css_tg(css);
7600 struct task_group *parent = css_tg(css_parent(css)); 7623 struct task_group *parent = css_tg(css_parent(css));
7601 7624
7602 if (parent) 7625 if (parent)
7603 sched_online_group(tg, parent); 7626 sched_online_group(tg, parent);
7604 return 0; 7627 return 0;
7605 } 7628 }
7606 7629
7607 static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) 7630 static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
7608 { 7631 {
7609 struct task_group *tg = css_tg(css); 7632 struct task_group *tg = css_tg(css);
7610 7633
7611 sched_destroy_group(tg); 7634 sched_destroy_group(tg);
7612 } 7635 }
7613 7636
7614 static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) 7637 static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
7615 { 7638 {
7616 struct task_group *tg = css_tg(css); 7639 struct task_group *tg = css_tg(css);
7617 7640
7618 sched_offline_group(tg); 7641 sched_offline_group(tg);
7619 } 7642 }
7620 7643
7621 static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, 7644 static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
7622 struct cgroup_taskset *tset) 7645 struct cgroup_taskset *tset)
7623 { 7646 {
7624 struct task_struct *task; 7647 struct task_struct *task;
7625 7648
7626 cgroup_taskset_for_each(task, tset) { 7649 cgroup_taskset_for_each(task, tset) {
7627 #ifdef CONFIG_RT_GROUP_SCHED 7650 #ifdef CONFIG_RT_GROUP_SCHED
7628 if (!sched_rt_can_attach(css_tg(css), task)) 7651 if (!sched_rt_can_attach(css_tg(css), task))
7629 return -EINVAL; 7652 return -EINVAL;
7630 #else 7653 #else
7631 /* We don't support RT-tasks being in separate groups */ 7654 /* We don't support RT-tasks being in separate groups */
7632 if (task->sched_class != &fair_sched_class) 7655 if (task->sched_class != &fair_sched_class)
7633 return -EINVAL; 7656 return -EINVAL;
7634 #endif 7657 #endif
7635 } 7658 }
7636 return 0; 7659 return 0;
7637 } 7660 }
7638 7661
7639 static void cpu_cgroup_attach(struct cgroup_subsys_state *css, 7662 static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
7640 struct cgroup_taskset *tset) 7663 struct cgroup_taskset *tset)
7641 { 7664 {
7642 struct task_struct *task; 7665 struct task_struct *task;
7643 7666
7644 cgroup_taskset_for_each(task, tset) 7667 cgroup_taskset_for_each(task, tset)
7645 sched_move_task(task); 7668 sched_move_task(task);
7646 } 7669 }
7647 7670
7648 static void cpu_cgroup_exit(struct cgroup_subsys_state *css, 7671 static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
7649 struct cgroup_subsys_state *old_css, 7672 struct cgroup_subsys_state *old_css,
7650 struct task_struct *task) 7673 struct task_struct *task)
7651 { 7674 {
7652 /* 7675 /*
7653 * cgroup_exit() is called in the copy_process() failure path. 7676 * cgroup_exit() is called in the copy_process() failure path.
7654 * Ignore this case since the task hasn't run yet; this avoids 7677 * Ignore this case since the task hasn't run yet; this avoids
7655 * trying to poke a half freed task state from generic code. 7678 * trying to poke a half freed task state from generic code.
7656 */ 7679 */
7657 if (!(task->flags & PF_EXITING)) 7680 if (!(task->flags & PF_EXITING))
7658 return; 7681 return;
7659 7682
7660 sched_move_task(task); 7683 sched_move_task(task);
7661 } 7684 }
7662 7685
7663 #ifdef CONFIG_FAIR_GROUP_SCHED 7686 #ifdef CONFIG_FAIR_GROUP_SCHED
7664 static int cpu_shares_write_u64(struct cgroup_subsys_state *css, 7687 static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
7665 struct cftype *cftype, u64 shareval) 7688 struct cftype *cftype, u64 shareval)
7666 { 7689 {
7667 return sched_group_set_shares(css_tg(css), scale_load(shareval)); 7690 return sched_group_set_shares(css_tg(css), scale_load(shareval));
7668 } 7691 }
7669 7692
7670 static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, 7693 static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
7671 struct cftype *cft) 7694 struct cftype *cft)
7672 { 7695 {
7673 struct task_group *tg = css_tg(css); 7696 struct task_group *tg = css_tg(css);
7674 7697
7675 return (u64) scale_load_down(tg->shares); 7698 return (u64) scale_load_down(tg->shares);
7676 } 7699 }
7677 7700
7678 #ifdef CONFIG_CFS_BANDWIDTH 7701 #ifdef CONFIG_CFS_BANDWIDTH
7679 static DEFINE_MUTEX(cfs_constraints_mutex); 7702 static DEFINE_MUTEX(cfs_constraints_mutex);
7680 7703
7681 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ 7704 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
7682 const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ 7705 const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
7683 7706
7684 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); 7707 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
7685 7708
7686 static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) 7709 static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7687 { 7710 {
7688 int i, ret = 0, runtime_enabled, runtime_was_enabled; 7711 int i, ret = 0, runtime_enabled, runtime_was_enabled;
7689 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7712 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7690 7713
7691 if (tg == &root_task_group) 7714 if (tg == &root_task_group)
7692 return -EINVAL; 7715 return -EINVAL;
7693 7716
7694 /* 7717 /*
7695 * Ensure we have at least some amount of bandwidth every period. This is 7718 * Ensure we have at least some amount of bandwidth every period. This is
7696 * to prevent reaching a state of large arrears when throttled via 7719 * to prevent reaching a state of large arrears when throttled via
7697 * entity_tick() resulting in prolonged exit starvation. 7720 * entity_tick() resulting in prolonged exit starvation.
7698 */ 7721 */
7699 if (quota < min_cfs_quota_period || period < min_cfs_quota_period) 7722 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
7700 return -EINVAL; 7723 return -EINVAL;
7701 7724
7702 /* 7725 /*
7703 * Likewise, bound things on the other side by preventing insane quota 7726 * Likewise, bound things on the other side by preventing insane quota
7704 * periods. This also allows us to normalize in computing quota 7727 * periods. This also allows us to normalize in computing quota
7705 * feasibility. 7728 * feasibility.
7706 */ 7729 */
7707 if (period > max_cfs_quota_period) 7730 if (period > max_cfs_quota_period)
7708 return -EINVAL; 7731 return -EINVAL;
7709 7732
7710 mutex_lock(&cfs_constraints_mutex); 7733 mutex_lock(&cfs_constraints_mutex);
7711 ret = __cfs_schedulable(tg, period, quota); 7734 ret = __cfs_schedulable(tg, period, quota);
7712 if (ret) 7735 if (ret)
7713 goto out_unlock; 7736 goto out_unlock;
7714 7737
7715 runtime_enabled = quota != RUNTIME_INF; 7738 runtime_enabled = quota != RUNTIME_INF;
7716 runtime_was_enabled = cfs_b->quota != RUNTIME_INF; 7739 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7717 /* 7740 /*
7718 * If we need to toggle cfs_bandwidth_used, off->on must occur 7741 * If we need to toggle cfs_bandwidth_used, off->on must occur
7719 * before making related changes, and on->off must occur afterwards 7742 * before making related changes, and on->off must occur afterwards
7720 */ 7743 */
7721 if (runtime_enabled && !runtime_was_enabled) 7744 if (runtime_enabled && !runtime_was_enabled)
7722 cfs_bandwidth_usage_inc(); 7745 cfs_bandwidth_usage_inc();
7723 raw_spin_lock_irq(&cfs_b->lock); 7746 raw_spin_lock_irq(&cfs_b->lock);
7724 cfs_b->period = ns_to_ktime(period); 7747 cfs_b->period = ns_to_ktime(period);
7725 cfs_b->quota = quota; 7748 cfs_b->quota = quota;
7726 7749
7727 __refill_cfs_bandwidth_runtime(cfs_b); 7750 __refill_cfs_bandwidth_runtime(cfs_b);
7728 /* restart the period timer (if active) to handle new period expiry */ 7751 /* restart the period timer (if active) to handle new period expiry */
7729 if (runtime_enabled && cfs_b->timer_active) { 7752 if (runtime_enabled && cfs_b->timer_active) {
7730 /* force a reprogram */ 7753 /* force a reprogram */
7731 cfs_b->timer_active = 0; 7754 cfs_b->timer_active = 0;
7732 __start_cfs_bandwidth(cfs_b); 7755 __start_cfs_bandwidth(cfs_b);
7733 } 7756 }
7734 raw_spin_unlock_irq(&cfs_b->lock); 7757 raw_spin_unlock_irq(&cfs_b->lock);
7735 7758
7736 for_each_possible_cpu(i) { 7759 for_each_possible_cpu(i) {
7737 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 7760 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
7738 struct rq *rq = cfs_rq->rq; 7761 struct rq *rq = cfs_rq->rq;
7739 7762
7740 raw_spin_lock_irq(&rq->lock); 7763 raw_spin_lock_irq(&rq->lock);
7741 cfs_rq->runtime_enabled = runtime_enabled; 7764 cfs_rq->runtime_enabled = runtime_enabled;
7742 cfs_rq->runtime_remaining = 0; 7765 cfs_rq->runtime_remaining = 0;
7743 7766
7744 if (cfs_rq->throttled) 7767 if (cfs_rq->throttled)
7745 unthrottle_cfs_rq(cfs_rq); 7768 unthrottle_cfs_rq(cfs_rq);
7746 raw_spin_unlock_irq(&rq->lock); 7769 raw_spin_unlock_irq(&rq->lock);
7747 } 7770 }
7748 if (runtime_was_enabled && !runtime_enabled) 7771 if (runtime_was_enabled && !runtime_enabled)
7749 cfs_bandwidth_usage_dec(); 7772 cfs_bandwidth_usage_dec();
7750 out_unlock: 7773 out_unlock:
7751 mutex_unlock(&cfs_constraints_mutex); 7774 mutex_unlock(&cfs_constraints_mutex);
7752 7775
7753 return ret; 7776 return ret;
7754 } 7777 }
7755 7778
7756 int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) 7779 int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
7757 { 7780 {
7758 u64 quota, period; 7781 u64 quota, period;
7759 7782
7760 period = ktime_to_ns(tg->cfs_bandwidth.period); 7783 period = ktime_to_ns(tg->cfs_bandwidth.period);
7761 if (cfs_quota_us < 0) 7784 if (cfs_quota_us < 0)
7762 quota = RUNTIME_INF; 7785 quota = RUNTIME_INF;
7763 else 7786 else
7764 quota = (u64)cfs_quota_us * NSEC_PER_USEC; 7787 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
7765 7788
7766 return tg_set_cfs_bandwidth(tg, period, quota); 7789 return tg_set_cfs_bandwidth(tg, period, quota);
7767 } 7790 }
7768 7791
7769 long tg_get_cfs_quota(struct task_group *tg) 7792 long tg_get_cfs_quota(struct task_group *tg)
7770 { 7793 {
7771 u64 quota_us; 7794 u64 quota_us;
7772 7795
7773 if (tg->cfs_bandwidth.quota == RUNTIME_INF) 7796 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
7774 return -1; 7797 return -1;
7775 7798
7776 quota_us = tg->cfs_bandwidth.quota; 7799 quota_us = tg->cfs_bandwidth.quota;
7777 do_div(quota_us, NSEC_PER_USEC); 7800 do_div(quota_us, NSEC_PER_USEC);
7778 7801
7779 return quota_us; 7802 return quota_us;
7780 } 7803 }
7781 7804
7782 int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) 7805 int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
7783 { 7806 {
7784 u64 quota, period; 7807 u64 quota, period;
7785 7808
7786 period = (u64)cfs_period_us * NSEC_PER_USEC; 7809 period = (u64)cfs_period_us * NSEC_PER_USEC;
7787 quota = tg->cfs_bandwidth.quota; 7810 quota = tg->cfs_bandwidth.quota;
7788 7811
7789 return tg_set_cfs_bandwidth(tg, period, quota); 7812 return tg_set_cfs_bandwidth(tg, period, quota);
7790 } 7813 }
7791 7814
7792 long tg_get_cfs_period(struct task_group *tg) 7815 long tg_get_cfs_period(struct task_group *tg)
7793 { 7816 {
7794 u64 cfs_period_us; 7817 u64 cfs_period_us;
7795 7818
7796 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); 7819 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
7797 do_div(cfs_period_us, NSEC_PER_USEC); 7820 do_div(cfs_period_us, NSEC_PER_USEC);
7798 7821
7799 return cfs_period_us; 7822 return cfs_period_us;
7800 } 7823 }
7801 7824
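The cgroup interface above keeps cfs_quota_us and cfs_period_us in microseconds and converts to nanoseconds before calling tg_set_cfs_bandwidth(); a negative quota maps to RUNTIME_INF and reads back as -1. A quick stand-alone illustration of that arithmetic, using an assumed 100ms period (example values, not defaults taken from this hunk):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define NSEC_PER_USEC 1000ULL

    int main(void)
    {
            int64_t cfs_period_us = 100000;   /* 100 ms period */
            int64_t cfs_quota_us  = 50000;    /* 50 ms quota -> half a CPU */

            uint64_t period_ns = (uint64_t)cfs_period_us * NSEC_PER_USEC;
            uint64_t quota_ns  = (uint64_t)cfs_quota_us  * NSEC_PER_USEC;

            printf("period = %" PRIu64 " ns, quota = %" PRIu64 " ns\n",
                   period_ns, quota_ns);
            printf("group may use %.1f CPUs worth of time per period\n",
                   (double)quota_ns / period_ns);

            /* a negative quota means "no limit", read back as -1 */
            cfs_quota_us = -1;
            printf("quota_us=%" PRId64 " -> unlimited\n", cfs_quota_us);
            return 0;
    }
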
7802 static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, 7825 static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
7803 struct cftype *cft) 7826 struct cftype *cft)
7804 { 7827 {
7805 return tg_get_cfs_quota(css_tg(css)); 7828 return tg_get_cfs_quota(css_tg(css));
7806 } 7829 }
7807 7830
7808 static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css, 7831 static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
7809 struct cftype *cftype, s64 cfs_quota_us) 7832 struct cftype *cftype, s64 cfs_quota_us)
7810 { 7833 {
7811 return tg_set_cfs_quota(css_tg(css), cfs_quota_us); 7834 return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
7812 } 7835 }
7813 7836
7814 static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css, 7837 static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
7815 struct cftype *cft) 7838 struct cftype *cft)
7816 { 7839 {
7817 return tg_get_cfs_period(css_tg(css)); 7840 return tg_get_cfs_period(css_tg(css));
7818 } 7841 }
7819 7842
7820 static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, 7843 static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
7821 struct cftype *cftype, u64 cfs_period_us) 7844 struct cftype *cftype, u64 cfs_period_us)
7822 { 7845 {
7823 return tg_set_cfs_period(css_tg(css), cfs_period_us); 7846 return tg_set_cfs_period(css_tg(css), cfs_period_us);
7824 } 7847 }
7825 7848
7826 struct cfs_schedulable_data { 7849 struct cfs_schedulable_data {
7827 struct task_group *tg; 7850 struct task_group *tg;
7828 u64 period, quota; 7851 u64 period, quota;
7829 }; 7852 };
7830 7853
7831 /* 7854 /*
7832 * normalize group quota/period to be quota/max_period 7855 * normalize group quota/period to be quota/max_period
7833 * note: units are usecs 7856 * note: units are usecs
7834 */ 7857 */
7835 static u64 normalize_cfs_quota(struct task_group *tg, 7858 static u64 normalize_cfs_quota(struct task_group *tg,
7836 struct cfs_schedulable_data *d) 7859 struct cfs_schedulable_data *d)
7837 { 7860 {
7838 u64 quota, period; 7861 u64 quota, period;
7839 7862
7840 if (tg == d->tg) { 7863 if (tg == d->tg) {
7841 period = d->period; 7864 period = d->period;
7842 quota = d->quota; 7865 quota = d->quota;
7843 } else { 7866 } else {
7844 period = tg_get_cfs_period(tg); 7867 period = tg_get_cfs_period(tg);
7845 quota = tg_get_cfs_quota(tg); 7868 quota = tg_get_cfs_quota(tg);
7846 } 7869 }
7847 7870
7848 /* note: these should typically be equivalent */ 7871 /* note: these should typically be equivalent */
7849 if (quota == RUNTIME_INF || quota == -1) 7872 if (quota == RUNTIME_INF || quota == -1)
7850 return RUNTIME_INF; 7873 return RUNTIME_INF;
7851 7874
7852 return to_ratio(period, quota); 7875 return to_ratio(period, quota);
7853 } 7876 }
7854 7877
7855 static int tg_cfs_schedulable_down(struct task_group *tg, void *data) 7878 static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
7856 { 7879 {
7857 struct cfs_schedulable_data *d = data; 7880 struct cfs_schedulable_data *d = data;
7858 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7881 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7859 s64 quota = 0, parent_quota = -1; 7882 s64 quota = 0, parent_quota = -1;
7860 7883
7861 if (!tg->parent) { 7884 if (!tg->parent) {
7862 quota = RUNTIME_INF; 7885 quota = RUNTIME_INF;
7863 } else { 7886 } else {
7864 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; 7887 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
7865 7888
7866 quota = normalize_cfs_quota(tg, d); 7889 quota = normalize_cfs_quota(tg, d);
7867 parent_quota = parent_b->hierarchal_quota; 7890 parent_quota = parent_b->hierarchal_quota;
7868 7891
7869 /* 7892 /*
7870 * ensure max(child_quota) <= parent_quota, inherit when no 7893 * ensure max(child_quota) <= parent_quota, inherit when no
7871 * limit is set 7894 * limit is set
7872 */ 7895 */
7873 if (quota == RUNTIME_INF) 7896 if (quota == RUNTIME_INF)
7874 quota = parent_quota; 7897 quota = parent_quota;
7875 else if (parent_quota != RUNTIME_INF && quota > parent_quota) 7898 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
7876 return -EINVAL; 7899 return -EINVAL;
7877 } 7900 }
7878 cfs_b->hierarchal_quota = quota; 7901 cfs_b->hierarchal_quota = quota;
7879 7902
7880 return 0; 7903 return 0;
7881 } 7904 }
7882 7905
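tg_cfs_schedulable_down() walks the group tree and enforces that each child's normalized quota stays at or below its parent's, inheriting the parent value when the child sets no limit of its own. A flattened sketch of that rule over a single parent/child pair (no tree walk, sentinel and values invented for the example):

    #include <stdint.h>
    #include <stdio.h>

    #define QUOTA_INF (-1LL)   /* stands in for RUNTIME_INF / "no limit" */

    /* returns 0 if the child setting is acceptable, -1 otherwise */
    static int check_child(int64_t parent_quota, int64_t *child_quota)
    {
            if (*child_quota == QUOTA_INF) {
                    *child_quota = parent_quota;   /* inherit when unlimited */
                    return 0;
            }
            if (parent_quota != QUOTA_INF && *child_quota > parent_quota)
                    return -1;                     /* child exceeds parent */
            return 0;
    }

    int main(void)
    {
            int64_t parent = 500000;   /* normalized quota units */
            int64_t child_a = QUOTA_INF;
            int64_t child_b = 600000;

            printf("child_a: %s (now %lld)\n",
                   check_child(parent, &child_a) ? "-EINVAL" : "ok",
                   (long long)child_a);
            printf("child_b: %s\n",
                   check_child(parent, &child_b) ? "-EINVAL" : "ok");
            return 0;
    }
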
7883 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) 7906 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7884 { 7907 {
7885 int ret; 7908 int ret;
7886 struct cfs_schedulable_data data = { 7909 struct cfs_schedulable_data data = {
7887 .tg = tg, 7910 .tg = tg,
7888 .period = period, 7911 .period = period,
7889 .quota = quota, 7912 .quota = quota,
7890 }; 7913 };
7891 7914
7892 if (quota != RUNTIME_INF) { 7915 if (quota != RUNTIME_INF) {
7893 do_div(data.period, NSEC_PER_USEC); 7916 do_div(data.period, NSEC_PER_USEC);
7894 do_div(data.quota, NSEC_PER_USEC); 7917 do_div(data.quota, NSEC_PER_USEC);
7895 } 7918 }
7896 7919
7897 rcu_read_lock(); 7920 rcu_read_lock();
7898 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); 7921 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
7899 rcu_read_unlock(); 7922 rcu_read_unlock();
7900 7923
7901 return ret; 7924 return ret;
7902 } 7925 }
7903 7926
7904 static int cpu_stats_show(struct seq_file *sf, void *v) 7927 static int cpu_stats_show(struct seq_file *sf, void *v)
7905 { 7928 {
7906 struct task_group *tg = css_tg(seq_css(sf)); 7929 struct task_group *tg = css_tg(seq_css(sf));
7907 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7930 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7908 7931
7909 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods); 7932 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
7910 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled); 7933 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
7911 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time); 7934 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
7912 7935
7913 return 0; 7936 return 0;
7914 } 7937 }
7915 #endif /* CONFIG_CFS_BANDWIDTH */ 7938 #endif /* CONFIG_CFS_BANDWIDTH */
7916 #endif /* CONFIG_FAIR_GROUP_SCHED */ 7939 #endif /* CONFIG_FAIR_GROUP_SCHED */
7917 7940
7918 #ifdef CONFIG_RT_GROUP_SCHED 7941 #ifdef CONFIG_RT_GROUP_SCHED
7919 static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, 7942 static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
7920 struct cftype *cft, s64 val) 7943 struct cftype *cft, s64 val)
7921 { 7944 {
7922 return sched_group_set_rt_runtime(css_tg(css), val); 7945 return sched_group_set_rt_runtime(css_tg(css), val);
7923 } 7946 }
7924 7947
7925 static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css, 7948 static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
7926 struct cftype *cft) 7949 struct cftype *cft)
7927 { 7950 {
7928 return sched_group_rt_runtime(css_tg(css)); 7951 return sched_group_rt_runtime(css_tg(css));
7929 } 7952 }
7930 7953
7931 static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css, 7954 static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
7932 struct cftype *cftype, u64 rt_period_us) 7955 struct cftype *cftype, u64 rt_period_us)
7933 { 7956 {
7934 return sched_group_set_rt_period(css_tg(css), rt_period_us); 7957 return sched_group_set_rt_period(css_tg(css), rt_period_us);
7935 } 7958 }
7936 7959
7937 static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, 7960 static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
7938 struct cftype *cft) 7961 struct cftype *cft)
7939 { 7962 {
7940 return sched_group_rt_period(css_tg(css)); 7963 return sched_group_rt_period(css_tg(css));
7941 } 7964 }
7942 #endif /* CONFIG_RT_GROUP_SCHED */ 7965 #endif /* CONFIG_RT_GROUP_SCHED */
7943 7966
7944 static struct cftype cpu_files[] = { 7967 static struct cftype cpu_files[] = {
7945 #ifdef CONFIG_FAIR_GROUP_SCHED 7968 #ifdef CONFIG_FAIR_GROUP_SCHED
7946 { 7969 {
7947 .name = "shares", 7970 .name = "shares",
7948 .read_u64 = cpu_shares_read_u64, 7971 .read_u64 = cpu_shares_read_u64,
7949 .write_u64 = cpu_shares_write_u64, 7972 .write_u64 = cpu_shares_write_u64,
7950 }, 7973 },
7951 #endif 7974 #endif
7952 #ifdef CONFIG_CFS_BANDWIDTH 7975 #ifdef CONFIG_CFS_BANDWIDTH
7953 { 7976 {
7954 .name = "cfs_quota_us", 7977 .name = "cfs_quota_us",
7955 .read_s64 = cpu_cfs_quota_read_s64, 7978 .read_s64 = cpu_cfs_quota_read_s64,
7956 .write_s64 = cpu_cfs_quota_write_s64, 7979 .write_s64 = cpu_cfs_quota_write_s64,
7957 }, 7980 },
7958 { 7981 {
7959 .name = "cfs_period_us", 7982 .name = "cfs_period_us",
7960 .read_u64 = cpu_cfs_period_read_u64, 7983 .read_u64 = cpu_cfs_period_read_u64,
7961 .write_u64 = cpu_cfs_period_write_u64, 7984 .write_u64 = cpu_cfs_period_write_u64,
7962 }, 7985 },
7963 { 7986 {
7964 .name = "stat", 7987 .name = "stat",
7965 .seq_show = cpu_stats_show, 7988 .seq_show = cpu_stats_show,
7966 }, 7989 },
7967 #endif 7990 #endif
7968 #ifdef CONFIG_RT_GROUP_SCHED 7991 #ifdef CONFIG_RT_GROUP_SCHED
7969 { 7992 {
7970 .name = "rt_runtime_us", 7993 .name = "rt_runtime_us",
7971 .read_s64 = cpu_rt_runtime_read, 7994 .read_s64 = cpu_rt_runtime_read,
7972 .write_s64 = cpu_rt_runtime_write, 7995 .write_s64 = cpu_rt_runtime_write,
7973 }, 7996 },
7974 { 7997 {
7975 .name = "rt_period_us", 7998 .name = "rt_period_us",
7976 .read_u64 = cpu_rt_period_read_uint, 7999 .read_u64 = cpu_rt_period_read_uint,
7977 .write_u64 = cpu_rt_period_write_uint, 8000 .write_u64 = cpu_rt_period_write_uint,
7978 }, 8001 },
7979 #endif 8002 #endif
7980 { } /* terminate */ 8003 { } /* terminate */
7981 }; 8004 };
7982 8005
7983 struct cgroup_subsys cpu_cgrp_subsys = { 8006 struct cgroup_subsys cpu_cgrp_subsys = {
7984 .css_alloc = cpu_cgroup_css_alloc, 8007 .css_alloc = cpu_cgroup_css_alloc,
7985 .css_free = cpu_cgroup_css_free, 8008 .css_free = cpu_cgroup_css_free,
7986 .css_online = cpu_cgroup_css_online, 8009 .css_online = cpu_cgroup_css_online,
7987 .css_offline = cpu_cgroup_css_offline, 8010 .css_offline = cpu_cgroup_css_offline,
7988 .can_attach = cpu_cgroup_can_attach, 8011 .can_attach = cpu_cgroup_can_attach,
7989 .attach = cpu_cgroup_attach, 8012 .attach = cpu_cgroup_attach,
7990 .exit = cpu_cgroup_exit, 8013 .exit = cpu_cgroup_exit,
7991 .base_cftypes = cpu_files, 8014 .base_cftypes = cpu_files,
7992 .early_init = 1, 8015 .early_init = 1,
7993 }; 8016 };
7994 8017
7995 #endif /* CONFIG_CGROUP_SCHED */ 8018 #endif /* CONFIG_CGROUP_SCHED */
7996 8019
7997 void dump_cpu_task(int cpu) 8020 void dump_cpu_task(int cpu)
7998 { 8021 {
kernel/sched/cpudeadline.c
1 /* 1 /*
2 * kernel/sched/cpudl.c 2 * kernel/sched/cpudl.c
3 * 3 *
4 * Global CPU deadline management 4 * Global CPU deadline management
5 * 5 *
6 * Author: Juri Lelli <j.lelli@sssup.it> 6 * Author: Juri Lelli <j.lelli@sssup.it>
7 * 7 *
8 * This program is free software; you can redistribute it and/or 8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License 9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; version 2 10 * as published by the Free Software Foundation; version 2
11 * of the License. 11 * of the License.
12 */ 12 */
13 13
14 #include <linux/gfp.h> 14 #include <linux/gfp.h>
15 #include <linux/kernel.h> 15 #include <linux/kernel.h>
16 #include <linux/slab.h>
16 #include "cpudeadline.h" 17 #include "cpudeadline.h"
17 18
18 static inline int parent(int i) 19 static inline int parent(int i)
19 { 20 {
20 return (i - 1) >> 1; 21 return (i - 1) >> 1;
21 } 22 }
22 23
23 static inline int left_child(int i) 24 static inline int left_child(int i)
24 { 25 {
25 return (i << 1) + 1; 26 return (i << 1) + 1;
26 } 27 }
27 28
28 static inline int right_child(int i) 29 static inline int right_child(int i)
29 { 30 {
30 return (i << 1) + 2; 31 return (i << 1) + 2;
31 } 32 }
32 33
33 static inline int dl_time_before(u64 a, u64 b) 34 static inline int dl_time_before(u64 a, u64 b)
34 { 35 {
35 return (s64)(a - b) < 0; 36 return (s64)(a - b) < 0;
36 } 37 }
37 38
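dl_time_before() compares two u64 deadlines by casting their difference to s64, so the ordering stays correct even across a wraparound of the absolute clock, as long as the two deadlines are within 2^63 ns of each other. A quick demonstration of why the signed-difference form works where a plain `a < b` would not:

    #include <stdint.h>
    #include <stdio.h>

    static int dl_time_before(uint64_t a, uint64_t b)
    {
            return (int64_t)(a - b) < 0;
    }

    int main(void)
    {
            uint64_t near_wrap = UINT64_MAX - 100;  /* deadline just before wrap */
            uint64_t wrapped   = 50;                /* deadline just after wrap */

            /* plain comparison gets the order wrong across the wrap point */
            printf("naive: %d\n", near_wrap < wrapped);                  /* 0 */
            /* signed-difference comparison still sees near_wrap as earlier */
            printf("dl:    %d\n", dl_time_before(near_wrap, wrapped));   /* 1 */
            return 0;
    }
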
38 static void cpudl_exchange(struct cpudl *cp, int a, int b) 39 static void cpudl_exchange(struct cpudl *cp, int a, int b)
39 { 40 {
40 int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; 41 int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
41 42
42 swap(cp->elements[a], cp->elements[b]); 43 swap(cp->elements[a].cpu, cp->elements[b].cpu);
43 swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]); 44 swap(cp->elements[a].dl , cp->elements[b].dl );
45
46 swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx);
44 } 47 }
45 48
46 static void cpudl_heapify(struct cpudl *cp, int idx) 49 static void cpudl_heapify(struct cpudl *cp, int idx)
47 { 50 {
48 int l, r, largest; 51 int l, r, largest;
49 52
50 /* adapted from lib/prio_heap.c */ 53 /* adapted from lib/prio_heap.c */
51 while(1) { 54 while(1) {
52 l = left_child(idx); 55 l = left_child(idx);
53 r = right_child(idx); 56 r = right_child(idx);
54 largest = idx; 57 largest = idx;
55 58
56 if ((l < cp->size) && dl_time_before(cp->elements[idx].dl, 59 if ((l < cp->size) && dl_time_before(cp->elements[idx].dl,
57 cp->elements[l].dl)) 60 cp->elements[l].dl))
58 largest = l; 61 largest = l;
59 if ((r < cp->size) && dl_time_before(cp->elements[largest].dl, 62 if ((r < cp->size) && dl_time_before(cp->elements[largest].dl,
60 cp->elements[r].dl)) 63 cp->elements[r].dl))
61 largest = r; 64 largest = r;
62 if (largest == idx) 65 if (largest == idx)
63 break; 66 break;
64 67
65 /* Push idx down the heap one level and bump one up */ 68 /* Push idx down the heap one level and bump one up */
66 cpudl_exchange(cp, largest, idx); 69 cpudl_exchange(cp, largest, idx);
67 idx = largest; 70 idx = largest;
68 } 71 }
69 } 72 }
70 73
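cpudl keeps the per-CPU earliest deadlines in an array-backed max heap: parent/left_child/right_child are the usual index formulas, cpudl_heapify() sifts an entry down toward the leaves, and cpudl_change_key()/cpudl_set() sift up when a key grows or when an element is removed. Below is a minimal stand-alone heap over {deadline, cpu} pairs with the same index math, kept deliberately apart from the kernel types (no locking, no free_cpus mask, no per-CPU index table):

    #include <stdint.h>
    #include <stdio.h>

    struct item { uint64_t dl; int cpu; };

    static int parent(int i)      { return (i - 1) >> 1; }
    static int left_child(int i)  { return (i << 1) + 1; }
    static int right_child(int i) { return (i << 1) + 2; }

    static void swap_items(struct item *a, struct item *b)
    {
            struct item t = *a; *a = *b; *b = t;
    }

    /* sift the entry at idx down until the max-heap property holds again */
    static void heapify(struct item *h, int size, int idx)
    {
            for (;;) {
                    int l = left_child(idx), r = right_child(idx), largest = idx;

                    if (l < size && h[l].dl > h[largest].dl)
                            largest = l;
                    if (r < size && h[r].dl > h[largest].dl)
                            largest = r;
                    if (largest == idx)
                            break;
                    swap_items(&h[largest], &h[idx]);
                    idx = largest;
            }
    }

    /* append an entry and sift it up toward the root */
    static void push(struct item *h, int *size, uint64_t dl, int cpu)
    {
            int i = (*size)++;

            h[i].dl = dl;
            h[i].cpu = cpu;
            while (i > 0 && h[parent(i)].dl < h[i].dl) {
                    swap_items(&h[parent(i)], &h[i]);
                    i = parent(i);
            }
    }

    int main(void)
    {
            struct item heap[8];
            int size = 0;

            push(heap, &size, 300, 0);
            push(heap, &size, 900, 1);
            push(heap, &size, 500, 2);

            /* the root always holds the CPU with the latest deadline */
            printf("latest deadline on cpu %d (dl=%llu)\n",
                   heap[0].cpu, (unsigned long long)heap[0].dl);

            /* removing the root mimics cpudl_set(..., is_valid=0): move the
             * last element into the hole, then restore the heap property */
            heap[0] = heap[--size];
            heapify(heap, size, 0);
            printf("next latest on cpu %d (dl=%llu)\n",
                   heap[0].cpu, (unsigned long long)heap[0].dl);
            return 0;
    }
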
71 static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) 74 static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl)
72 { 75 {
73 WARN_ON(idx == IDX_INVALID || !cpu_present(idx)); 76 WARN_ON(idx == IDX_INVALID || !cpu_present(idx));
74 77
75 if (dl_time_before(new_dl, cp->elements[idx].dl)) { 78 if (dl_time_before(new_dl, cp->elements[idx].dl)) {
76 cp->elements[idx].dl = new_dl; 79 cp->elements[idx].dl = new_dl;
77 cpudl_heapify(cp, idx); 80 cpudl_heapify(cp, idx);
78 } else { 81 } else {
79 cp->elements[idx].dl = new_dl; 82 cp->elements[idx].dl = new_dl;
80 while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, 83 while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
81 cp->elements[idx].dl)) { 84 cp->elements[idx].dl)) {
82 cpudl_exchange(cp, idx, parent(idx)); 85 cpudl_exchange(cp, idx, parent(idx));
83 idx = parent(idx); 86 idx = parent(idx);
84 } 87 }
85 } 88 }
86 } 89 }
87 90
88 static inline int cpudl_maximum(struct cpudl *cp) 91 static inline int cpudl_maximum(struct cpudl *cp)
89 { 92 {
90 return cp->elements[0].cpu; 93 return cp->elements[0].cpu;
91 } 94 }
92 95
93 /* 96 /*
94 * cpudl_find - find the best (later-dl) CPU in the system 97 * cpudl_find - find the best (later-dl) CPU in the system
95 * @cp: the cpudl max-heap context 98 * @cp: the cpudl max-heap context
96 * @p: the task 99 * @p: the task
97 * @later_mask: a mask to fill in with the selected CPUs (or NULL) 100 * @later_mask: a mask to fill in with the selected CPUs (or NULL)
98 * 101 *
99 * Returns: int - best CPU (heap maximum if suitable) 102 * Returns: int - best CPU (heap maximum if suitable)
100 */ 103 */
101 int cpudl_find(struct cpudl *cp, struct task_struct *p, 104 int cpudl_find(struct cpudl *cp, struct task_struct *p,
102 struct cpumask *later_mask) 105 struct cpumask *later_mask)
103 { 106 {
104 int best_cpu = -1; 107 int best_cpu = -1;
105 const struct sched_dl_entity *dl_se = &p->dl; 108 const struct sched_dl_entity *dl_se = &p->dl;
106 109
107 if (later_mask && cpumask_and(later_mask, cp->free_cpus, 110 if (later_mask && cpumask_and(later_mask, cp->free_cpus,
108 &p->cpus_allowed) && cpumask_and(later_mask, 111 &p->cpus_allowed) && cpumask_and(later_mask,
109 later_mask, cpu_active_mask)) { 112 later_mask, cpu_active_mask)) {
110 best_cpu = cpumask_any(later_mask); 113 best_cpu = cpumask_any(later_mask);
111 goto out; 114 goto out;
112 } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && 115 } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
113 dl_time_before(dl_se->deadline, cp->elements[0].dl)) { 116 dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
114 best_cpu = cpudl_maximum(cp); 117 best_cpu = cpudl_maximum(cp);
115 if (later_mask) 118 if (later_mask)
116 cpumask_set_cpu(best_cpu, later_mask); 119 cpumask_set_cpu(best_cpu, later_mask);
117 } 120 }
118 121
119 out: 122 out:
120 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); 123 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
121 124
122 return best_cpu; 125 return best_cpu;
123 } 126 }
124 127
125 /* 128 /*
126 * cpudl_set - update the cpudl max-heap 129 * cpudl_set - update the cpudl max-heap
127 * @cp: the cpudl max-heap context 130 * @cp: the cpudl max-heap context
128 * @cpu: the target cpu 131 * @cpu: the target cpu
129 * @dl: the new earliest deadline for this cpu 132 * @dl: the new earliest deadline for this cpu
130 * 133 *
131 * Notes: assumes cpu_rq(cpu)->lock is locked 134 * Notes: assumes cpu_rq(cpu)->lock is locked
132 * 135 *
133 * Returns: (void) 136 * Returns: (void)
134 */ 137 */
135 void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) 138 void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
136 { 139 {
137 int old_idx, new_cpu; 140 int old_idx, new_cpu;
138 unsigned long flags; 141 unsigned long flags;
139 142
140 WARN_ON(!cpu_present(cpu)); 143 WARN_ON(!cpu_present(cpu));
141 144
142 raw_spin_lock_irqsave(&cp->lock, flags); 145 raw_spin_lock_irqsave(&cp->lock, flags);
143 old_idx = cp->cpu_to_idx[cpu]; 146 old_idx = cp->elements[cpu].idx;
144 if (!is_valid) { 147 if (!is_valid) {
145 /* remove item */ 148 /* remove item */
146 if (old_idx == IDX_INVALID) { 149 if (old_idx == IDX_INVALID) {
147 /* 150 /*
148 * Nothing to remove if old_idx was invalid. 151 * Nothing to remove if old_idx was invalid.
149 * This could happen if a rq_offline_dl is 152 * This could happen if a rq_offline_dl is
150 * called for a CPU without -dl tasks running. 153 * called for a CPU without -dl tasks running.
151 */ 154 */
152 goto out; 155 goto out;
153 } 156 }
154 new_cpu = cp->elements[cp->size - 1].cpu; 157 new_cpu = cp->elements[cp->size - 1].cpu;
155 cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; 158 cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
156 cp->elements[old_idx].cpu = new_cpu; 159 cp->elements[old_idx].cpu = new_cpu;
157 cp->size--; 160 cp->size--;
158 cp->cpu_to_idx[new_cpu] = old_idx; 161 cp->elements[new_cpu].idx = old_idx;
159 cp->cpu_to_idx[cpu] = IDX_INVALID; 162 cp->elements[cpu].idx = IDX_INVALID;
160 while (old_idx > 0 && dl_time_before( 163 while (old_idx > 0 && dl_time_before(
161 cp->elements[parent(old_idx)].dl, 164 cp->elements[parent(old_idx)].dl,
162 cp->elements[old_idx].dl)) { 165 cp->elements[old_idx].dl)) {
163 cpudl_exchange(cp, old_idx, parent(old_idx)); 166 cpudl_exchange(cp, old_idx, parent(old_idx));
164 old_idx = parent(old_idx); 167 old_idx = parent(old_idx);
165 } 168 }
166 cpumask_set_cpu(cpu, cp->free_cpus); 169 cpumask_set_cpu(cpu, cp->free_cpus);
167 cpudl_heapify(cp, old_idx); 170 cpudl_heapify(cp, old_idx);
168 171
169 goto out; 172 goto out;
170 } 173 }
171 174
172 if (old_idx == IDX_INVALID) { 175 if (old_idx == IDX_INVALID) {
173 cp->size++; 176 cp->size++;
174 cp->elements[cp->size - 1].dl = 0; 177 cp->elements[cp->size - 1].dl = 0;
175 cp->elements[cp->size - 1].cpu = cpu; 178 cp->elements[cp->size - 1].cpu = cpu;
176 cp->cpu_to_idx[cpu] = cp->size - 1; 179 cp->elements[cpu].idx = cp->size - 1;
177 cpudl_change_key(cp, cp->size - 1, dl); 180 cpudl_change_key(cp, cp->size - 1, dl);
178 cpumask_clear_cpu(cpu, cp->free_cpus); 181 cpumask_clear_cpu(cpu, cp->free_cpus);
179 } else { 182 } else {
180 cpudl_change_key(cp, old_idx, dl); 183 cpudl_change_key(cp, old_idx, dl);
181 } 184 }
182 185
183 out: 186 out:
184 raw_spin_unlock_irqrestore(&cp->lock, flags); 187 raw_spin_unlock_irqrestore(&cp->lock, flags);
185 } 188 }
186 189
187 /* 190 /*
188 * cpudl_init - initialize the cpudl structure 191 * cpudl_init - initialize the cpudl structure
189 * @cp: the cpudl max-heap context 192 * @cp: the cpudl max-heap context
190 */ 193 */
191 int cpudl_init(struct cpudl *cp) 194 int cpudl_init(struct cpudl *cp)
192 { 195 {
193 int i; 196 int i;
194 197
195 memset(cp, 0, sizeof(*cp)); 198 memset(cp, 0, sizeof(*cp));
196 raw_spin_lock_init(&cp->lock); 199 raw_spin_lock_init(&cp->lock);
197 cp->size = 0; 200 cp->size = 0;
198 for (i = 0; i < NR_CPUS; i++) 201
199 cp->cpu_to_idx[i] = IDX_INVALID; 202 cp->elements = kcalloc(nr_cpu_ids,
200 if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) 203 sizeof(struct cpudl_item),
204 GFP_KERNEL);
205 if (!cp->elements)
201 return -ENOMEM; 206 return -ENOMEM;
207
208 if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
209 kfree(cp->elements);
210 return -ENOMEM;
211 }
212
213 for_each_possible_cpu(i)
214 cp->elements[i].idx = IDX_INVALID;
215
202 cpumask_setall(cp->free_cpus); 216 cpumask_setall(cp->free_cpus);
203 217
204 return 0; 218 return 0;
205 } 219 }
206 220
207 /* 221 /*
208 * cpudl_cleanup - clean up the cpudl structure 222 * cpudl_cleanup - clean up the cpudl structure
209 * @cp: the cpudl max-heap context 223 * @cp: the cpudl max-heap context
210 */ 224 */
211 void cpudl_cleanup(struct cpudl *cp) 225 void cpudl_cleanup(struct cpudl *cp)
212 { 226 {
213 free_cpumask_var(cp->free_cpus); 227 free_cpumask_var(cp->free_cpus);
228 kfree(cp->elements);
214 } 229 }
215 230
kernel/sched/cpudeadline.h
1 #ifndef _LINUX_CPUDL_H 1 #ifndef _LINUX_CPUDL_H
2 #define _LINUX_CPUDL_H 2 #define _LINUX_CPUDL_H
3 3
4 #include <linux/sched.h> 4 #include <linux/sched.h>
5 5
6 #define IDX_INVALID -1 6 #define IDX_INVALID -1
7 7
8 struct array_item { 8 struct cpudl_item {
9 u64 dl; 9 u64 dl;
10 int cpu; 10 int cpu;
11 int idx;
11 }; 12 };
12 13
13 struct cpudl { 14 struct cpudl {
14 raw_spinlock_t lock; 15 raw_spinlock_t lock;
15 int size; 16 int size;
16 int cpu_to_idx[NR_CPUS];
17 struct array_item elements[NR_CPUS];
18 cpumask_var_t free_cpus; 17 cpumask_var_t free_cpus;
18 struct cpudl_item *elements;
19 }; 19 };
20 20
21 21
22 #ifdef CONFIG_SMP 22 #ifdef CONFIG_SMP
23 int cpudl_find(struct cpudl *cp, struct task_struct *p, 23 int cpudl_find(struct cpudl *cp, struct task_struct *p,
24 struct cpumask *later_mask); 24 struct cpumask *later_mask);
25 void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); 25 void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
26 int cpudl_init(struct cpudl *cp); 26 int cpudl_init(struct cpudl *cp);
27 void cpudl_cleanup(struct cpudl *cp); 27 void cpudl_cleanup(struct cpudl *cp);
28 #else 28 #else
29 #define cpudl_set(cp, cpu, dl) do { } while (0) 29 #define cpudl_set(cp, cpu, dl) do { } while (0)
30 #define cpudl_init() do { } while (0) 30 #define cpudl_init() do { } while (0)
31 #endif /* CONFIG_SMP */ 31 #endif /* CONFIG_SMP */
32 32
kernel/sched/cpupri.c
1 /* 1 /*
2 * kernel/sched/cpupri.c 2 * kernel/sched/cpupri.c
3 * 3 *
4 * CPU priority management 4 * CPU priority management
5 * 5 *
6 * Copyright (C) 2007-2008 Novell 6 * Copyright (C) 2007-2008 Novell
7 * 7 *
8 * Author: Gregory Haskins <ghaskins@novell.com> 8 * Author: Gregory Haskins <ghaskins@novell.com>
9 * 9 *
10 * This code tracks the priority of each CPU so that global migration 10 * This code tracks the priority of each CPU so that global migration
11 * decisions are easy to calculate. Each CPU can be in a state as follows: 11 * decisions are easy to calculate. Each CPU can be in a state as follows:
12 * 12 *
13 * (INVALID), IDLE, NORMAL, RT1, ... RT99 13 * (INVALID), IDLE, NORMAL, RT1, ... RT99
14 * 14 *
15 * going from the lowest priority to the highest. CPUs in the INVALID state 15 * going from the lowest priority to the highest. CPUs in the INVALID state
16 * are not eligible for routing. The system maintains this state with 16 * are not eligible for routing. The system maintains this state with
17 * a 2 dimensional bitmap (the first for priority class, the second for cpus 17 * a 2 dimensional bitmap (the first for priority class, the second for cpus
18 * in that class). Therefore a typical application without affinity 18 * in that class). Therefore a typical application without affinity
19 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit 19 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
20 * searches). For tasks with affinity restrictions, the algorithm has a 20 * searches). For tasks with affinity restrictions, the algorithm has a
21 * worst case complexity of O(min(102, nr_domcpus)), though the scenario that 21 * worst case complexity of O(min(102, nr_domcpus)), though the scenario that
22 * yields the worst case search is fairly contrived. 22 * yields the worst case search is fairly contrived.
23 * 23 *
24 * This program is free software; you can redistribute it and/or 24 * This program is free software; you can redistribute it and/or
25 * modify it under the terms of the GNU General Public License 25 * modify it under the terms of the GNU General Public License
26 * as published by the Free Software Foundation; version 2 26 * as published by the Free Software Foundation; version 2
27 * of the License. 27 * of the License.
28 */ 28 */
29 29
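The header comment above describes cpupri's core structure: one vector per priority level, each holding an occupancy count plus a mask of the CPUs currently running at that level, searched from the lowest level upward. A compact stand-alone model of that lookup, using a plain uint64_t in place of cpumask_var_t and ignoring the memory-ordering details handled later in this file:

    #include <stdint.h>
    #include <stdio.h>

    #define NR_LEVELS 102   /* IDLE, NORMAL, plus one level per RT priority */

    struct vec {
            int count;        /* how many CPUs sit at this priority */
            uint64_t mask;    /* which CPUs (one bit per CPU, up to 64 here) */
    };

    static struct vec pri_to_cpu[NR_LEVELS];

    static void set_cpu_pri(int cpu, int pri)
    {
            pri_to_cpu[pri].mask |= 1ULL << cpu;
            pri_to_cpu[pri].count++;
    }

    /*
     * Find a CPU running at a priority strictly below task_pri that also
     * intersects the task's affinity mask; mirrors the cpupri_find() scan.
     */
    static int find_lower(int task_pri, uint64_t affinity, uint64_t *lowest)
    {
            for (int idx = 0; idx < task_pri; idx++) {
                    if (!pri_to_cpu[idx].count)
                            continue;
                    uint64_t hit = pri_to_cpu[idx].mask & affinity;
                    if (!hit)
                            continue;
                    *lowest = hit;
                    return 1;
            }
            return 0;
    }

    int main(void)
    {
            uint64_t lowest = 0;

            set_cpu_pri(0, 1);   /* cpu0 running a normal task */
            set_cpu_pri(1, 50);  /* cpu1 running an RT task    */

            /* an RT task at level 60 may preempt anything below 60 */
            if (find_lower(60, 0x3, &lowest))
                    printf("candidate CPUs: 0x%llx\n", (unsigned long long)lowest);
            return 0;
    }
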
30 #include <linux/gfp.h> 30 #include <linux/gfp.h>
31 #include <linux/sched.h> 31 #include <linux/sched.h>
32 #include <linux/sched/rt.h> 32 #include <linux/sched/rt.h>
33 #include <linux/slab.h>
33 #include "cpupri.h" 34 #include "cpupri.h"
34 35
35 /* Convert between a 140-based task->prio and our 102-based cpupri */ 36 /* Convert between a 140-based task->prio and our 102-based cpupri */
36 static int convert_prio(int prio) 37 static int convert_prio(int prio)
37 { 38 {
38 int cpupri; 39 int cpupri;
39 40
40 if (prio == CPUPRI_INVALID) 41 if (prio == CPUPRI_INVALID)
41 cpupri = CPUPRI_INVALID; 42 cpupri = CPUPRI_INVALID;
42 else if (prio == MAX_PRIO) 43 else if (prio == MAX_PRIO)
43 cpupri = CPUPRI_IDLE; 44 cpupri = CPUPRI_IDLE;
44 else if (prio >= MAX_RT_PRIO) 45 else if (prio >= MAX_RT_PRIO)
45 cpupri = CPUPRI_NORMAL; 46 cpupri = CPUPRI_NORMAL;
46 else 47 else
47 cpupri = MAX_RT_PRIO - prio + 1; 48 cpupri = MAX_RT_PRIO - prio + 1;
48 49
49 return cpupri; 50 return cpupri;
50 } 51 }
51 52
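convert_prio() folds the kernel's 140-slot prio space into the 102 cpupri levels: MAX_PRIO becomes IDLE, any non-RT prio becomes NORMAL, and RT prios 0..99 map (inverted) onto levels 101..2, so a numerically smaller task->prio lands on a higher cpupri level. A few spot checks of that mapping, with MAX_PRIO/MAX_RT_PRIO hard-coded to their usual values (140 and 100) so the example stands alone:

    #include <assert.h>
    #include <stdio.h>

    #define MAX_RT_PRIO     100     /* usual kernel values, hard-coded here */
    #define MAX_PRIO        140
    #define CPUPRI_INVALID  -1
    #define CPUPRI_IDLE     0
    #define CPUPRI_NORMAL   1

    static int convert_prio(int prio)
    {
            if (prio == CPUPRI_INVALID)
                    return CPUPRI_INVALID;
            if (prio == MAX_PRIO)
                    return CPUPRI_IDLE;
            if (prio >= MAX_RT_PRIO)
                    return CPUPRI_NORMAL;
            return MAX_RT_PRIO - prio + 1;
    }

    int main(void)
    {
            assert(convert_prio(0)   == 101);           /* highest RT prio  */
            assert(convert_prio(99)  == 2);             /* lowest RT prio   */
            assert(convert_prio(120) == CPUPRI_NORMAL); /* default CFS prio */
            assert(convert_prio(MAX_PRIO) == CPUPRI_IDLE);
            printf("all mappings as expected\n");
            return 0;
    }
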
52 /** 53 /**
53 * cpupri_find - find the best (lowest-pri) CPU in the system 54 * cpupri_find - find the best (lowest-pri) CPU in the system
54 * @cp: The cpupri context 55 * @cp: The cpupri context
55 * @p: The task 56 * @p: The task
56 * @lowest_mask: A mask to fill in with selected CPUs (or NULL) 57 * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
57 * 58 *
58 * Note: This function returns the recommended CPUs as calculated during the 59 * Note: This function returns the recommended CPUs as calculated during the
59 * current invocation. By the time the call returns, the CPUs may have in 60 * current invocation. By the time the call returns, the CPUs may have in
60 * fact changed priorities any number of times. While not ideal, it is not 61 * fact changed priorities any number of times. While not ideal, it is not
61 * an issue of correctness since the normal rebalancer logic will correct 62 * an issue of correctness since the normal rebalancer logic will correct
62 * any discrepancies created by racing against the uncertainty of the current 63 * any discrepancies created by racing against the uncertainty of the current
63 * priority configuration. 64 * priority configuration.
64 * 65 *
65 * Return: (int)bool - CPUs were found 66 * Return: (int)bool - CPUs were found
66 */ 67 */
67 int cpupri_find(struct cpupri *cp, struct task_struct *p, 68 int cpupri_find(struct cpupri *cp, struct task_struct *p,
68 struct cpumask *lowest_mask) 69 struct cpumask *lowest_mask)
69 { 70 {
70 int idx = 0; 71 int idx = 0;
71 int task_pri = convert_prio(p->prio); 72 int task_pri = convert_prio(p->prio);
72 73
73 BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES); 74 BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
74 75
75 for (idx = 0; idx < task_pri; idx++) { 76 for (idx = 0; idx < task_pri; idx++) {
76 struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; 77 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
77 int skip = 0; 78 int skip = 0;
78 79
79 if (!atomic_read(&(vec)->count)) 80 if (!atomic_read(&(vec)->count))
80 skip = 1; 81 skip = 1;
81 /* 82 /*
82 * When looking at the vector, we need to read the counter, 83 * When looking at the vector, we need to read the counter,
83 * do a memory barrier, then read the mask. 84 * do a memory barrier, then read the mask.
84 * 85 *
85 * Note: This is still all racy, but we can deal with it. 86 * Note: This is still all racy, but we can deal with it.
86 * Ideally, we only want to look at masks that are set. 87 * Ideally, we only want to look at masks that are set.
87 * 88 *
88 * If a mask is not set, then the only thing wrong is that we 89 * If a mask is not set, then the only thing wrong is that we
89 * did a little more work than necessary. 90 * did a little more work than necessary.
90 * 91 *
91 * If we read a zero count but the mask is set, because of the 92 * If we read a zero count but the mask is set, because of the
92 * memory barriers, that can only happen when the highest prio 93 * memory barriers, that can only happen when the highest prio
93 * task for a run queue has left the run queue, in which case, 94 * task for a run queue has left the run queue, in which case,
94 * it will be followed by a pull. If the task we are processing 95 * it will be followed by a pull. If the task we are processing
95 * fails to find a proper place to go, that pull request will 96 * fails to find a proper place to go, that pull request will
96 * pull this task if the run queue is running at a lower 97 * pull this task if the run queue is running at a lower
97 * priority. 98 * priority.
98 */ 99 */
99 smp_rmb(); 100 smp_rmb();
100 101
101 /* Need to do the rmb for every iteration */ 102 /* Need to do the rmb for every iteration */
102 if (skip) 103 if (skip)
103 continue; 104 continue;
104 105
105 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) 106 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
106 continue; 107 continue;
107 108
108 if (lowest_mask) { 109 if (lowest_mask) {
109 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); 110 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
110 111
111 /* 112 /*
112 * We have to ensure that we have at least one bit 113 * We have to ensure that we have at least one bit
113 * still set in the array, since the map could have 114 * still set in the array, since the map could have
114 * been concurrently emptied between the first and 115 * been concurrently emptied between the first and
115 * second reads of vec->mask. If we hit this 116 * second reads of vec->mask. If we hit this
116 * condition, simply act as though we never hit this 117 * condition, simply act as though we never hit this
117 * priority level and continue on. 118 * priority level and continue on.
118 */ 119 */
119 if (cpumask_any(lowest_mask) >= nr_cpu_ids) 120 if (cpumask_any(lowest_mask) >= nr_cpu_ids)
120 continue; 121 continue;
121 } 122 }
122 123
123 return 1; 124 return 1;
124 } 125 }
125 126
126 return 0; 127 return 0;
127 } 128 }
128 129
129 /** 130 /**
130 * cpupri_set - update the cpu priority setting 131 * cpupri_set - update the cpu priority setting
131 * @cp: The cpupri context 132 * @cp: The cpupri context
132 * @cpu: The target cpu 133 * @cpu: The target cpu
133 * @newpri: The priority (INVALID-RT99) to assign to this CPU 134 * @newpri: The priority (INVALID-RT99) to assign to this CPU
134 * 135 *
135 * Note: Assumes cpu_rq(cpu)->lock is locked 136 * Note: Assumes cpu_rq(cpu)->lock is locked
136 * 137 *
137 * Returns: (void) 138 * Returns: (void)
138 */ 139 */
139 void cpupri_set(struct cpupri *cp, int cpu, int newpri) 140 void cpupri_set(struct cpupri *cp, int cpu, int newpri)
140 { 141 {
141 int *currpri = &cp->cpu_to_pri[cpu]; 142 int *currpri = &cp->cpu_to_pri[cpu];
142 int oldpri = *currpri; 143 int oldpri = *currpri;
143 int do_mb = 0; 144 int do_mb = 0;
144 145
145 newpri = convert_prio(newpri); 146 newpri = convert_prio(newpri);
146 147
147 BUG_ON(newpri >= CPUPRI_NR_PRIORITIES); 148 BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
148 149
149 if (newpri == oldpri) 150 if (newpri == oldpri)
150 return; 151 return;
151 152
152 /* 153 /*
153 * If the cpu was currently mapped to a different value, we 154 * If the cpu was currently mapped to a different value, we
154 * need to map it to the new value then remove the old value. 155 * need to map it to the new value then remove the old value.
155 * Note, we must add the new value first, otherwise we risk the 156 * Note, we must add the new value first, otherwise we risk the
156 * cpu being missed by the priority loop in cpupri_find. 157 * cpu being missed by the priority loop in cpupri_find.
157 */ 158 */
158 if (likely(newpri != CPUPRI_INVALID)) { 159 if (likely(newpri != CPUPRI_INVALID)) {
159 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; 160 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
160 161
161 cpumask_set_cpu(cpu, vec->mask); 162 cpumask_set_cpu(cpu, vec->mask);
162 /* 163 /*
163 * When adding a new vector, we update the mask first, 164 * When adding a new vector, we update the mask first,
164 * do a write memory barrier, and then update the count, to 165 * do a write memory barrier, and then update the count, to
165 * make sure the vector is visible when count is set. 166 * make sure the vector is visible when count is set.
166 */ 167 */
167 smp_mb__before_atomic_inc(); 168 smp_mb__before_atomic_inc();
168 atomic_inc(&(vec)->count); 169 atomic_inc(&(vec)->count);
169 do_mb = 1; 170 do_mb = 1;
170 } 171 }
171 if (likely(oldpri != CPUPRI_INVALID)) { 172 if (likely(oldpri != CPUPRI_INVALID)) {
172 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; 173 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
173 174
174 /* 175 /*
175 * Because the order of modification of the vec->count 176 * Because the order of modification of the vec->count
176 * is important, we must make sure that the update 177 * is important, we must make sure that the update
177 * of the new prio is seen before we decrement the 178 * of the new prio is seen before we decrement the
178 * old prio. This makes sure that the loop sees 179 * old prio. This makes sure that the loop sees
179 * one or the other when we raise the priority of 180 * one or the other when we raise the priority of
180 * the run queue. We don't care about when we lower the 181 * the run queue. We don't care about when we lower the
181 * priority, as that will trigger an rt pull anyway. 182 * priority, as that will trigger an rt pull anyway.
182 * 183 *
183 * We only need to do a memory barrier if we updated 184 * We only need to do a memory barrier if we updated
184 * the new priority vec. 185 * the new priority vec.
185 */ 186 */
186 if (do_mb) 187 if (do_mb)
187 smp_mb__after_atomic_inc(); 188 smp_mb__after_atomic_inc();
188 189
189 /* 190 /*
190 * When removing from the vector, we decrement the counter first, 191 * When removing from the vector, we decrement the counter first,
191 * do a memory barrier and then clear the mask. 192 * do a memory barrier and then clear the mask.
192 */ 193 */
193 atomic_dec(&(vec)->count); 194 atomic_dec(&(vec)->count);
194 smp_mb__after_atomic_inc(); 195 smp_mb__after_atomic_inc();
195 cpumask_clear_cpu(cpu, vec->mask); 196 cpumask_clear_cpu(cpu, vec->mask);
196 } 197 }
197 198
198 *currpri = newpri; 199 *currpri = newpri;
199 } 200 }
200 201
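The barrier choreography in cpupri_set() is easiest to follow next to the reader it pairs with. The sketch below is not the kernel's cpupri_find() from earlier in this file, only a simplified illustration of the pairing; the name cpupri_find_sketch() and the reduced body are assumptions.

/*
 * Simplified, illustrative reader paired with the barriers in
 * cpupri_set() above.  Not the real cpupri_find(); treat the body as
 * a sketch of why the mask must be published before the count.
 */
static int cpupri_find_sketch(struct cpupri *cp, struct task_struct *p,
			      struct cpumask *lowest_mask)
{
	int idx, task_pri = convert_prio(p->prio);

	for (idx = 0; idx < task_pri; idx++) {
		struct cpupri_vec *vec = &cp->pri_to_cpu[idx];

		/* Skip empty vectors without touching the mask. */
		if (!atomic_read(&vec->count))
			continue;
		/*
		 * Pairs with smp_mb__before_atomic_inc() in cpupri_set():
		 * once a non-zero count is observed, the cpumask_set_cpu()
		 * that preceded the increment must also be visible.
		 */
		smp_rmb();

		if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
			continue;

		if (lowest_mask)
			cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
		return 1;
	}
	return 0;
}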
201 /** 202 /**
202 * cpupri_init - initialize the cpupri structure 203 * cpupri_init - initialize the cpupri structure
203 * @cp: The cpupri context 204 * @cp: The cpupri context
204 * 205 *
205 * Return: -ENOMEM on memory allocation failure. 206 * Return: -ENOMEM on memory allocation failure.
206 */ 207 */
207 int cpupri_init(struct cpupri *cp) 208 int cpupri_init(struct cpupri *cp)
208 { 209 {
209 int i; 210 int i;
210 211
211 memset(cp, 0, sizeof(*cp)); 212 memset(cp, 0, sizeof(*cp));
212 213
213 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 214 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
214 struct cpupri_vec *vec = &cp->pri_to_cpu[i]; 215 struct cpupri_vec *vec = &cp->pri_to_cpu[i];
215 216
216 atomic_set(&vec->count, 0); 217 atomic_set(&vec->count, 0);
217 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) 218 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
218 goto cleanup; 219 goto cleanup;
219 } 220 }
220 221
222 cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL);
223 if (!cp->cpu_to_pri)
224 goto cleanup;
225
221 for_each_possible_cpu(i) 226 for_each_possible_cpu(i)
222 cp->cpu_to_pri[i] = CPUPRI_INVALID; 227 cp->cpu_to_pri[i] = CPUPRI_INVALID;
228
223 return 0; 229 return 0;
224 230
225 cleanup: 231 cleanup:
226 for (i--; i >= 0; i--) 232 for (i--; i >= 0; i--)
227 free_cpumask_var(cp->pri_to_cpu[i].mask); 233 free_cpumask_var(cp->pri_to_cpu[i].mask);
228 return -ENOMEM; 234 return -ENOMEM;
229 } 235 }
230 236
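The kcalloc() of nr_cpu_ids integers added above is the substance of the "sched/cpupri: Replace NR_CPUS arrays" change: the per-CPU mapping is now sized by the number of possible CPU ids detected at boot rather than by the compile-time NR_CPUS maximum. A self-contained sketch of the same pattern follows; "struct foo" and its members are invented for illustration and do not appear in this file.

/*
 * Hypothetical illustration of the nr_cpu_ids-sized-allocation pattern
 * used by cpupri_init() above.
 */
#include <linux/cpumask.h>	/* nr_cpu_ids, for_each_possible_cpu() */
#include <linux/errno.h>	/* -ENOMEM */
#include <linux/slab.h>		/* kcalloc(), kfree() */

struct foo {
	int *per_cpu_val;	/* was: int per_cpu_val[NR_CPUS]; */
};

static int foo_init(struct foo *f)
{
	int cpu;

	/* One int per possible CPU id, zero-initialized. */
	f->per_cpu_val = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL);
	if (!f->per_cpu_val)
		return -ENOMEM;

	/* Mark every possible CPU as "no value yet". */
	for_each_possible_cpu(cpu)
		f->per_cpu_val[cpu] = -1;

	return 0;
}

static void foo_cleanup(struct foo *f)
{
	kfree(f->per_cpu_val);
	f->per_cpu_val = NULL;
}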
231 /** 237 /**
232 * cpupri_cleanup - clean up the cpupri structure 238 * cpupri_cleanup - clean up the cpupri structure
233 * @cp: The cpupri context 239 * @cp: The cpupri context
234 */ 240 */
235 void cpupri_cleanup(struct cpupri *cp) 241 void cpupri_cleanup(struct cpupri *cp)
236 { 242 {
237 int i; 243 int i;
238 244
245 kfree(cp->cpu_to_pri);
239 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) 246 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
240 free_cpumask_var(cp->pri_to_cpu[i].mask); 247 free_cpumask_var(cp->pri_to_cpu[i].mask);
241 } 248 }
242 249
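For completeness, here is how a caller would be expected to pair the two entry points. The enclosing structure and function names are assumptions for illustration only; the scheduler's real caller is the root-domain setup code, which is not part of this hunk.

/*
 * Hypothetical caller pairing cpupri_init() and cpupri_cleanup();
 * "struct my_domain" and the function names are invented.
 */
struct my_domain {
	struct cpupri cpupri;
};

static int my_domain_init(struct my_domain *d)
{
	/* cpupri_init() returns -ENOMEM if any allocation fails. */
	if (cpupri_init(&d->cpupri))
		return -ENOMEM;
	return 0;
}

static void my_domain_free(struct my_domain *d)
{
	/* Frees cpu_to_pri and every per-priority cpumask. */
	cpupri_cleanup(&d->cpupri);
}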
kernel/sched/cpupri.h
1 #ifndef _LINUX_CPUPRI_H 1 #ifndef _LINUX_CPUPRI_H
2 #define _LINUX_CPUPRI_H 2 #define _LINUX_CPUPRI_H
3 3
4 #include <linux/sched.h> 4 #include <linux/sched.h>
5 5
6 #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) 6 #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
7 7
8 #define CPUPRI_INVALID -1 8 #define CPUPRI_INVALID -1
9 #define CPUPRI_IDLE 0 9 #define CPUPRI_IDLE 0
10 #define CPUPRI_NORMAL 1 10 #define CPUPRI_NORMAL 1
11 /* values 2-101 are RT priorities 0-99 */ 11 /* values 2-101 are RT priorities 0-99 */
12 12
13 struct cpupri_vec { 13 struct cpupri_vec {
14 atomic_t count; 14 atomic_t count;
15 cpumask_var_t mask; 15 cpumask_var_t mask;
16 }; 16 };
17 17
18 struct cpupri { 18 struct cpupri {
19 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; 19 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
20 int cpu_to_pri[NR_CPUS]; 20 int *cpu_to_pri;
21 }; 21 };
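The encoding spelled out by the constants above (CPUPRI_INVALID, CPUPRI_IDLE, CPUPRI_NORMAL, then the RT range noted in the comment) is produced by convert_prio() earlier in kernel/sched/cpupri.c, which is not part of this diff. The following is a sketch consistent with those definitions, assuming the scheduler's usual MAX_PRIO/MAX_RT_PRIO constants; it is an illustration of the mapping, not a verbatim copy.

/*
 * Sketch of the task-prio -> cpupri mapping implied by the constants
 * above; the real convert_prio() is not shown in this hunk.
 */
static int convert_prio_sketch(int prio)
{
	if (prio == CPUPRI_INVALID)
		return CPUPRI_INVALID;	/* not runnable / unknown */
	if (prio == MAX_PRIO)
		return CPUPRI_IDLE;	/* idle task */
	if (prio >= MAX_RT_PRIO)
		return CPUPRI_NORMAL;	/* any non-RT task */
	return MAX_RT_PRIO - prio + 1;	/* RT prio 99..0 -> cpupri 2..101 */
}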
22 22
23 #ifdef CONFIG_SMP 23 #ifdef CONFIG_SMP
24 int cpupri_find(struct cpupri *cp, 24 int cpupri_find(struct cpupri *cp,
25 struct task_struct *p, struct cpumask *lowest_mask); 25 struct task_struct *p, struct cpumask *lowest_mask);
26 void cpupri_set(struct cpupri *cp, int cpu, int pri); 26 void cpupri_set(struct cpupri *cp, int cpu, int pri);
27 int cpupri_init(struct cpupri *cp); 27 int cpupri_init(struct cpupri *cp);
28 void cpupri_cleanup(struct cpupri *cp); 28 void cpupri_cleanup(struct cpupri *cp);
29 #else 29 #else
30 #define cpupri_set(cp, cpu, pri) do { } while (0) 30 #define cpupri_set(cp, cpu, pri) do { } while (0)
31 #define cpupri_init() do { } while (0) 31 #define cpupri_init() do { } while (0)
32 #endif 32 #endif
33 33
34 #endif /* _LINUX_CPUPRI_H */ 34 #endif /* _LINUX_CPUPRI_H */
35 35