Eric Lee / smarc-ti-linux-kernel | Embedian Git Server

Commit 09dc4ab03936df5c5aa711d27c81283c6d09f495

Authored by Roman Gushchin 2014-05-19 19:10:09 +0800

Committed by Ingo Molnar 2014-06-05 17:51:34 +0800

Exists in ti-lsk-linux-4.1.y and in 12 other branches

sched/fair: Fix tg_set_cfs_bandwidth() deadlock on rq->lock

tg_set_cfs_bandwidth() sets cfs_b->timer_active to 0 to
force the period timer to restart. This is not safe, because it
can lead to the deadlock described in commit 927b54fccbf0:
"__start_cfs_bandwidth calls hrtimer_cancel while holding rq->lock,
waiting for the hrtimer to finish. However, if sched_cfs_period_timer
runs for another loop iteration, the hrtimer can attempt to take
rq->lock, resulting in deadlock."

Three CPUs must be involved:

  CPU0               CPU1                         CPU2
  take rq->lock      period timer fired
  ...                take cfs_b lock
  ...                ...                          tg_set_cfs_bandwidth()
  throttle_cfs_rq()  release cfs_b lock           take cfs_b lock
  ...                distribute_cfs_runtime()     timer_active = 0
  take cfs_b->lock   wait for rq->lock            ...
  __start_cfs_bandwidth()
  {wait for timer callback
   break if timer_active == 1}

So, CPU0 and CPU1 are deadlocked.

Instead of resetting cfs_b->timer_active, tg_set_cfs_bandwidth can
wait for period timer callbacks (ignoring cfs_b->timer_active) and
restart the timer explicitly.

Signed-off-by: Roman Gushchin <klamm@yandex-team.ru>
Reviewed-by: Ben Segall <bsegall@google.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/87wqdi9g8e.wl%klamm@yandex-team.ru
Cc: pjt@google.com
Cc: chris.j.arges@canonical.com
Cc: gregkh@linuxfoundation.org
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>

Showing 3 changed files with 6 additions and 7 deletions Inline Diff

kernel/sched/core.c
kernel/sched/fair.c
kernel/sched/sched.h

kernel/sched/core.c

Diff comments View file @ 09dc4ab

 /*
  *  kernel/sched/core.c
  *
  *  Kernel scheduler and related syscalls
  *
  *  Copyright (C) 1991-2002  Linus Torvalds
  *
  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
  *		make semaphores SMP safe
  *  1998-11-19	Implemented schedule_timeout() and related stuff
  *		by Andrea Arcangeli
  *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
  *		hybrid priority-list and round-robin design with
  *		an array-switch method of distributing timeslices
  *		and per-CPU runqueues.  Cleanups and useful suggestions
  *		by Davide Libenzi, preemptible kernel bits by Robert Love.
  *  2003-09-03	Interactivity tuning by Con Kolivas.
  *  2004-04-02	Scheduler domains code by Nick Piggin
  *  2007-04-15  Work begun on replacing all interactivity tuning with a
  *              fair scheduling design by Con Kolivas.
  *  2007-05-05  Load balancing (smp-nice) and other improvements
  *              by Peter Williams
  *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
  *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
  *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
  *              Thomas Gleixner, Mike Kravetz
  */
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/nmi.h>
 #include <linux/init.h>
 #include <linux/uaccess.h>
 #include <linux/highmem.h>
 #include <asm/mmu_context.h>
 #include <linux/interrupt.h>
 #include <linux/capability.h>
 #include <linux/completion.h>
 #include <linux/kernel_stat.h>
 #include <linux/debug_locks.h>
 #include <linux/perf_event.h>
 #include <linux/security.h>
 #include <linux/notifier.h>
 #include <linux/profile.h>
 #include <linux/freezer.h>
 #include <linux/vmalloc.h>
 #include <linux/blkdev.h>
 #include <linux/delay.h>
 #include <linux/pid_namespace.h>
 #include <linux/smp.h>
 #include <linux/threads.h>
 #include <linux/timer.h>
 #include <linux/rcupdate.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
 #include <linux/percpu.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/sysctl.h>
 #include <linux/syscalls.h>
 #include <linux/times.h>
 #include <linux/tsacct_kern.h>
 #include <linux/kprobes.h>
 #include <linux/delayacct.h>
 #include <linux/unistd.h>
 #include <linux/pagemap.h>
 #include <linux/hrtimer.h>
 #include <linux/tick.h>
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
 #include <linux/slab.h>
 #include <linux/init_task.h>
 #include <linux/binfmts.h>
 #include <linux/context_tracking.h>
 #include <linux/compiler.h>
 #include <asm/switch_to.h>
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
 #include <asm/mutex.h>
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #endif
 #include "sched.h"
 #include "../workqueue_internal.h"
 #include "../smpboot.h"
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
 /*
  * start_bandwidth_timer - arm a periodic bandwidth timer.
  * @period_timer: the hrtimer to start
  * @period: interval the expiry is forwarded by
  *
  * Repeats until hrtimer_active() reports the timer as active. Each pass
  * forwards the expiry by @period relative to the timer's current time and
  * starts the timer in absolute, pinned mode, using the distance between
  * the soft and hard expiry as the range.
  */
 void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
 {
 	while (!hrtimer_active(period_timer)) {
 		ktime_t cur_time = hrtimer_cb_get_time(period_timer);
 		ktime_t soft_exp, hard_exp;
 		unsigned long range_ns;

 		hrtimer_forward(period_timer, cur_time, period);
 		soft_exp = hrtimer_get_softexpires(period_timer);
 		hard_exp = hrtimer_get_expires(period_timer);
 		range_ns = ktime_to_ns(ktime_sub(hard_exp, soft_exp));
 		__hrtimer_start_range_ns(period_timer, soft_exp, range_ns,
 					 HRTIMER_MODE_ABS_PINNED, 0);
 	}
 }
 /* Mutex for the sched domains; its users are outside this part of the file. */
 DEFINE_MUTEX(sched_domains_mutex);
 /* One struct rq (runqueue) instance per CPU. */
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 /* Forward declaration: defined further down, called from update_rq_clock(). */
 static void update_rq_clock_task(struct rq *rq, s64 delta);
 /*
  * update_rq_clock - bring rq->clock up to date.
  *
  * Adds the difference between sched_clock_cpu() for the runqueue's CPU and
  * rq->clock to rq->clock, and passes that same difference on to
  * update_rq_clock_task(). Nothing is done while rq->skip_clock_update is
  * positive.
  */
 void update_rq_clock(struct rq *rq)
 {
 	s64 elapsed;

 	if (rq->skip_clock_update <= 0) {
 		elapsed = sched_clock_cpu(cpu_of(rq)) - rq->clock;
 		rq->clock += elapsed;
 		update_rq_clock_task(rq, elapsed);
 	}
 }
 /*
  * Debugging: various feature bits
  *
  * "features.h" is included with SCHED_FEAT() expanding every entry to
  * "(1UL << __SCHED_FEAT_<name>) * enabled |", so the initializer below is
  * the OR of one bit per entry (zero when 'enabled' is 0), closed by the
  * trailing 0.
  */
 #define SCHED_FEAT(name, enabled)	\
 	(1UL << __SCHED_FEAT_##name) * enabled |
 const_debug unsigned int sysctl_sched_features =
 #include "features.h"
 	0;
 #undef SCHED_FEAT
 #ifdef CONFIG_SCHED_DEBUG
 /*
  * Table of feature names: "features.h" is included with SCHED_FEAT()
  * stringifying each entry's name. sched_feat_show() and sched_feat_set()
  * pair entry i of this table with bit (1UL << i) of sysctl_sched_features.
  */
 #define SCHED_FEAT(name, enabled)	\
 	#name ,
 static const char * const sched_feat_names[] = {
 #include "features.h"
 };
 #undef SCHED_FEAT
 /*
  * sched_feat_show - seq_file show callback listing all scheduler features.
  *
  * Prints every name from sched_feat_names[] followed by a space; names
  * whose bit is clear in sysctl_sched_features get a "NO_" prefix. The list
  * is terminated with a newline. Always returns 0; @v is unused.
  */
 static int sched_feat_show(struct seq_file *m, void *v)
 {
 	int feat = 0;

 	while (feat < __SCHED_FEAT_NR) {
 		unsigned long mask = 1UL << feat;

 		if ((sysctl_sched_features & mask) == 0)
 			seq_puts(m, "NO_");
 		seq_printf(m, "%s ", sched_feat_names[feat]);
 		feat++;
 	}
 	seq_puts(m, "\n");
 	return 0;
 }
 #ifdef HAVE_JUMP_LABEL
 /*
  * One static key per feature. "features.h" is included with SCHED_FEAT()
  * pasting each entry's 'enabled' token onto jump_label_key__, which selects
  * STATIC_KEY_INIT_TRUE or STATIC_KEY_INIT_FALSE as that key's initializer.
  */
 #define jump_label_key__true  STATIC_KEY_INIT_TRUE
 #define jump_label_key__false STATIC_KEY_INIT_FALSE
 #define SCHED_FEAT(name, enabled)	\
 	jump_label_key__##enabled ,
 struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
 #include "features.h"
 };
 #undef SCHED_FEAT
 /*
  * sched_feat_disable - drop the static key of feature @i.
  *
  * The key is only decremented when static_key_enabled() reports it as
  * enabled, so disabling an already disabled feature is a no-op.
  */
 static void sched_feat_disable(int i)
 {
 	struct static_key *key = &sched_feat_keys[i];

 	if (!static_key_enabled(key))
 		return;
 	static_key_slow_dec(key);
 }
 /*
  * sched_feat_enable - raise the static key of feature @i.
  *
  * The key is only incremented when static_key_enabled() reports it as
  * disabled, so enabling an already enabled feature is a no-op.
  */
 static void sched_feat_enable(int i)
 {
 	struct static_key *key = &sched_feat_keys[i];

 	if (static_key_enabled(key))
 		return;
 	static_key_slow_inc(key);
 }
 #else
 /* Without HAVE_JUMP_LABEL there is no static key to update: no-op. */
 static void sched_feat_disable(int i)
 {
 }
 /* Without HAVE_JUMP_LABEL there is no static key to update: no-op. */
 static void sched_feat_enable(int i)
 {
 }
 #endif /* HAVE_JUMP_LABEL */
 /*
  * sched_feat_set - enable or disable a scheduler feature by name.
  * @cmp: feature name, optionally prefixed with "NO_" to disable it
  *
  * Looks the name up in sched_feat_names[]; on a match the feature's bit in
  * sysctl_sched_features is set or cleared and sched_feat_enable() or
  * sched_feat_disable() is called for it.
  *
  * Return: index of the matched feature, or __SCHED_FEAT_NR if no name
  * matched.
  */
 static int sched_feat_set(char *cmp)
 {
 	int negate = !strncmp(cmp, "NO_", 3);
 	int idx;

 	if (negate)
 		cmp += 3;

 	for (idx = 0; idx < __SCHED_FEAT_NR; idx++) {
 		if (strcmp(cmp, sched_feat_names[idx]) != 0)
 			continue;

 		if (negate) {
 			sysctl_sched_features &= ~(1UL << idx);
 			sched_feat_disable(idx);
 		} else {
 			sysctl_sched_features |= (1UL << idx);
 			sched_feat_enable(idx);
 		}
 		return idx;
 	}
 	return __SCHED_FEAT_NR;
 }
 /*
  * sched_feat_write - write handler of the sched_features file.
  *
  * Copies at most 63 bytes from user space, NUL-terminates and strips the
  * text, then passes it to sched_feat_set().
  *
  * Return: the (possibly truncated) byte count on success, -EFAULT if the
  * user copy fails, -EINVAL if the name matches no feature.
  */
 static ssize_t
 sched_feat_write(struct file *filp, const char __user *ubuf,
 		size_t cnt, loff_t *ppos)
 {
 	char buf[64];
 	char *name;

 	/* Clamp the length so the terminating NUL always fits into buf. */
 	if (cnt >= sizeof(buf))
 		cnt = sizeof(buf) - 1;

 	if (copy_from_user(buf, ubuf, cnt))
 		return -EFAULT;
 	buf[cnt] = '\0';

 	name = strstrip(buf);
 	if (sched_feat_set(name) == __SCHED_FEAT_NR)
 		return -EINVAL;

 	*ppos += cnt;
 	return cnt;
 }
 /*
  * sched_feat_open - open handler of the sched_features file.
  *
  * Sets the file up via single_open() with sched_feat_show() as the show
  * callback and no private data; returns single_open()'s result.
  */
 static int sched_feat_open(struct inode *inode, struct file *filp)
 {
 	return single_open(filp, sched_feat_show, NULL);
 }
 /*
  * File operations of the sched_features file: reads go through the
  * seq_file helpers (output produced by sched_feat_show() via
  * sched_feat_open()), writes through sched_feat_write().
  */
 static const struct file_operations sched_feat_fops = {
 	.open		= sched_feat_open,
 	.write		= sched_feat_write,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
 	.release	= single_release,
 };
 /*
  * sched_init_debug - create the "sched_features" debugfs file.
  *
  * The file is created with mode 0644, a NULL parent and NULL data, using
  * sched_feat_fops. The result of debugfs_create_file() is not checked;
  * always returns 0. Registered as a late initcall.
  */
 static __init int sched_init_debug(void)
 {
 	debugfs_create_file("sched_features", 0644, NULL, NULL,
 			&sched_feat_fops);
 	return 0;
 }
 late_initcall(sched_init_debug);
 #endif /* CONFIG_SCHED_DEBUG */
 /*
  * Number of tasks to iterate in a single balance run.
  * Limited because this is done with IRQs disabled.
  */
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 /*
  * period over which we average the RT time consumption, measured
  * in ms.
  *
  * default: 1s
  */
 const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
 /*
  * period over which we measure -rt task cpu usage in us.
  * default: 1s
  */
 unsigned int sysctl_sched_rt_period = 1000000;
 /*
  * NOTE(review): presumably flags that the scheduler is up and running;
  * it is neither read nor written in this part of the file - confirm.
  */
 __read_mostly int scheduler_running;
 /*
  * part of the period that we allow rt tasks to run in us.
  * default: 0.95s
  */
 int sysctl_sched_rt_runtime = 950000;
 /*
  * __task_rq_lock - lock the rq @p resides on.
  *
  * The caller must already hold p->pi_lock (asserted through lockdep).
  * Returns the runqueue with rq->lock held.
  */
 static inline struct rq *__task_rq_lock(struct task_struct *p)
 	__acquires(rq->lock)
 {
 	struct rq *rq;
 	lockdep_assert_held(&p->pi_lock);
 	for (;;) {
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
 		/* Re-check under the lock: retry if task_rq(p) changed meanwhile. */
 		if (likely(rq == task_rq(p)))
 			return rq;
 		raw_spin_unlock(&rq->lock);
 	}
 }
 /*
  * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
  *
  * p->pi_lock is taken with interrupts disabled (previous state saved in
  * *@flags), then rq->lock. Returns the runqueue with both locks held;
  * release them with task_rq_unlock().
  */
 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
 	__acquires(p->pi_lock)
 	__acquires(rq->lock)
 {
 	struct rq *rq;
 	for (;;) {
 		raw_spin_lock_irqsave(&p->pi_lock, *flags);
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
 		/* Re-check under the lock: retry if task_rq(p) changed meanwhile. */
 		if (likely(rq == task_rq(p)))
 			return rq;
 		/* Wrong runqueue: drop both locks (restoring IRQs) and retry. */
 		raw_spin_unlock(&rq->lock);
 		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 	}
 }
 /* __task_rq_unlock - release rq->lock only; counterpart of __task_rq_lock(). */
 static void __task_rq_unlock(struct rq *rq)
 	__releases(rq->lock)
 {
 	raw_spin_unlock(&rq->lock);
 }
 /*
  * task_rq_unlock - counterpart of task_rq_lock().
  *
  * Releases rq->lock, then p->pi_lock, restoring the interrupt state saved
  * in *@flags.
  */
 static inline void
 task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
 	__releases(rq->lock)
 	__releases(p->pi_lock)
 {
 	raw_spin_unlock(&rq->lock);
 	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 }
 /*
  * this_rq_lock - lock this runqueue and disable interrupts.
  *
  * Interrupts are disabled first, then this_rq() is looked up and its lock
  * taken. Returns the locked runqueue; the previous interrupt state is not
  * saved.
  */
 static struct rq *this_rq_lock(void)
 	__acquires(rq->lock)
 {
 	struct rq *rq;
 	local_irq_disable();
 	rq = this_rq();
 	raw_spin_lock(&rq->lock);
 	return rq;
 }
 #ifdef CONFIG_SCHED_HRTICK
 /*
  * Use HR-timers to deliver accurate preemption points.
  */
 /* hrtick_clear - cancel @rq's hrtick timer, but only if it is active. */
 static void hrtick_clear(struct rq *rq)
 {
 	struct hrtimer *timer = &rq->hrtick_timer;

 	if (!hrtimer_active(timer))
 		return;
 	hrtimer_cancel(timer);
 }
 /*
  * High-resolution timer tick.
  * Runs from hardirq context with interrupts disabled.
  *
  * Callback of rq->hrtick_timer: under rq->lock it updates the runqueue
  * clock and invokes the current task's sched_class->task_tick() with the
  * last argument set to 1. Returns HRTIMER_NORESTART.
  */
 static enum hrtimer_restart hrtick(struct hrtimer *timer)
 {
 	struct rq *rq = container_of(timer, struct rq, hrtick_timer);
 	/* Warn once if the callback runs on a CPU other than the rq's own. */
 	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
 	raw_spin_lock(&rq->lock);
 	update_rq_clock(rq);
 	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
 	raw_spin_unlock(&rq->lock);
 	return HRTIMER_NORESTART;
 }
 #ifdef CONFIG_SMP
 /*
  * __hrtick_restart - start @rq's hrtick timer at its current soft expiry.
  *
  * The timer is started in absolute, pinned mode with a zero range.
  * Returns whatever __hrtimer_start_range_ns() returns.
  */
 static int __hrtick_restart(struct rq *rq)
 {
 	struct hrtimer *hrtick_timer = &rq->hrtick_timer;
 	ktime_t expires = hrtimer_get_softexpires(hrtick_timer);

 	return __hrtimer_start_range_ns(hrtick_timer, expires, 0,
 					HRTIMER_MODE_ABS_PINNED, 0);
 }
 /*
  * called from hardirq (IPI) context
  *
  * @arg is the struct rq whose hrtick timer is to be started; installed as
  * rq->hrtick_csd.func by init_rq_hrtick(). Restarts the timer under
  * rq->lock and clears rq->hrtick_csd_pending.
  */
 static void __hrtick_start(void *arg)
 {
 	struct rq *rq = arg;
 	raw_spin_lock(&rq->lock);
 	__hrtick_restart(rq);
 	rq->hrtick_csd_pending = 0;
 	raw_spin_unlock(&rq->lock);
 }
 /*
  * Called to set the hrtick timer state.
  *
  * called with rq->lock held and irqs disabled
  *
  * SMP variant: sets the timer's expiry to the timer base's current time
  * plus @delay (nanoseconds). If @rq is this CPU's runqueue the timer is
  * started directly; otherwise rq->hrtick_csd is sent to the runqueue's
  * CPU, unless such a request is already pending.
  */
 void hrtick_start(struct rq *rq, u64 delay)
 {
 	struct hrtimer *timer = &rq->hrtick_timer;
 	ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
 	hrtimer_set_expires(timer, time);
 	if (rq == this_rq()) {
 		__hrtick_restart(rq);
 	} else if (!rq->hrtick_csd_pending) {
 		/* __hrtick_start() clears the pending flag on the remote CPU. */
 		smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
 		rq->hrtick_csd_pending = 1;
 	}
 }
 /*
  * hotplug_hrtick - CPU hotplug notifier for the hrtick timer.
  * @nfb: notifier block (unused)
  * @action: hotplug event
  * @hcpu: CPU number, passed as a pointer-sized integer
  *
  * For the up-canceled, down-prepare and dead events (plain and _FROZEN
  * variants) the hrtick timer of that CPU's runqueue is cleared and
  * NOTIFY_OK is returned; every other event yields NOTIFY_DONE.
  */
 static int
 hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
 	int target_cpu = (int)(long)hcpu;

 	switch (action) {
 	case CPU_UP_CANCELED:
 	case CPU_UP_CANCELED_FROZEN:
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
 		break;
 	default:
 		return NOTIFY_DONE;
 	}

 	hrtick_clear(cpu_rq(target_cpu));
 	return NOTIFY_OK;
 }
 /* init_hrtick - register hotplug_hrtick() as a CPU hotplug notifier (priority 0). */
 static __init void init_hrtick(void)
 {
 	hotcpu_notifier(hotplug_hrtick, 0);
 }
 #else
 /*
  * hrtick_start - set the hrtick timer state (non-SMP variant).
  *
  * Must be called with rq->lock held and irqs disabled.
  *
  * Starts @rq's hrtick timer in relative, pinned mode with an expiry of
  * @delay nanoseconds and a zero range.
  */
 void hrtick_start(struct rq *rq, u64 delay)
 {
 	ktime_t rel_expiry = ns_to_ktime(delay);

 	__hrtimer_start_range_ns(&rq->hrtick_timer, rel_expiry, 0,
 				 HRTIMER_MODE_REL_PINNED, 0);
 }
 /* Non-SMP variant: no hotplug notifier is registered, so this is empty. */
 static inline void init_hrtick(void)
 {
 }
 #endif /* CONFIG_SMP */
 /*
  * init_rq_hrtick - initialise the hrtick state of @rq.
  *
  * On SMP the cross-CPU call data is prepared so that __hrtick_start() runs
  * with @rq as its argument. The hrtick timer itself is initialised on
  * CLOCK_MONOTONIC with hrtick() as its callback.
  */
 static void init_rq_hrtick(struct rq *rq)
 {
 #ifdef CONFIG_SMP
 	rq->hrtick_csd_pending = 0;
 	rq->hrtick_csd.flags = 0;
 	rq->hrtick_csd.func = __hrtick_start;
 	rq->hrtick_csd.info = rq;
 #endif
 	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	rq->hrtick_timer.function = hrtick;
 }
 #else	/* CONFIG_SCHED_HRTICK */
 /* Without CONFIG_SCHED_HRTICK the hrtick helpers below are empty stubs. */
 static inline void hrtick_clear(struct rq *rq)
 {
 }
 static inline void init_rq_hrtick(struct rq *rq)
 {
 }
 static inline void init_hrtick(void)
 {
 }
 #endif	/* CONFIG_SCHED_HRTICK */
 /*
  * resched_task - mark a task 'to be rescheduled now'.
  *
  * On UP this means the setting of the need_resched flag, on SMP it
  * might also involve a cross-CPU call to trigger the scheduler on
  * the target CPU.
  *
  * The lock of @p's runqueue must be held (asserted through lockdep).
  */
 void resched_task(struct task_struct *p)
 {
 	int cpu;
 	lockdep_assert_held(&task_rq(p)->lock);
 	/* Already marked: nothing more to do. */
 	if (test_tsk_need_resched(p))
 		return;
 	set_tsk_need_resched(p);
 	cpu = task_cpu(p);
 	/* Task is on this CPU: flag the local preemption state, no IPI. */
 	if (cpu == smp_processor_id()) {
 		set_preempt_need_resched();
 		return;
 	}
 	/* NEED_RESCHED must be visible before we test polling */
 	smp_mb();
 	/* The reschedule IPI is only sent when the task is not polling. */
 	if (!tsk_is_polling(p))
 		smp_send_reschedule(cpu);
 }
 /*
  * resched_cpu - mark the task currently running on @cpu for rescheduling.
  *
  * Best effort: the runqueue lock is only try-locked, and if that fails the
  * function returns without doing anything.
  */
 void resched_cpu(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
 	if (!raw_spin_trylock_irqsave(&rq->lock, flags))
 		return;
 	resched_task(cpu_curr(cpu));
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ_COMMON
 /*
  * In the semi idle case, use the nearest busy cpu for migrating timers
  * from an idle cpu.  This is good for power-savings.
  *
  * We don't do similar optimization for completely idle system, as
  * selecting an idle cpu will add more delays to the timers than intended
  * (as that cpu's timer base may not be uptodate wrt jiffies etc).
  *
  * Return: the current CPU if @pinned is set, timer migration is disabled
  * or the current CPU is not idle; otherwise the first non-idle CPU found
  * while walking the current CPU's sched domains, or the current CPU if
  * none is found.
  */
 int get_nohz_timer_target(int pinned)
 {
 	int cpu = smp_processor_id();
 	int i;
 	struct sched_domain *sd;
 	if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
 		return cpu;
 	/* The sched domain walk is done under rcu_read_lock(). */
 	rcu_read_lock();
 	for_each_domain(cpu, sd) {
 		for_each_cpu(i, sched_domain_span(sd)) {
 			if (!idle_cpu(i)) {
 				cpu = i;
 				goto unlock;
 			}
 		}
 	}
 unlock:
 	rcu_read_unlock();
 	return cpu;
 }
 /*
  * When add_timer_on() enqueues a timer into the timer wheel of an
  * idle CPU then this timer might expire before the next timer event
  * which is scheduled to wake up that CPU. In case of a completely
  * idle system the next event might even be infinite time into the
  * future. wake_up_idle_cpu() ensures that the CPU is woken up and
  * leaves the inner idle loop so the newly added timer is taken into
  * account when the CPU goes back to idle and evaluates the timer
  * wheel for the next timer event.
  */
 static void wake_up_idle_cpu(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	/* Nothing to do when the target is the CPU we are running on. */
 	if (cpu == smp_processor_id())
 		return;
 	/*
 	 * This is safe, as this function is called with the timer
 	 * wheel base lock of (cpu) held. When the CPU is on the way
 	 * to idle and has not yet set rq->curr to idle then it will
 	 * be serialized on the timer wheel base lock and take the new
 	 * timer into account automatically.
 	 */
 	if (rq->curr != rq->idle)
 		return;
 	/*
 	 * We can set TIF_RESCHED on the idle task of the other CPU
 	 * lockless. The worst case is that the other CPU runs the
 	 * idle task through an additional NOOP schedule()
 	 */
 	set_tsk_need_resched(rq->idle);
 	/* NEED_RESCHED must be visible before we test polling */
 	smp_mb();
 	/* The reschedule IPI is only sent when the idle task is not polling. */
 	if (!tsk_is_polling(rq->idle))
 		smp_send_reschedule(cpu);
 }
 /*
  * wake_up_full_nohz_cpu - kick @cpu if it is a full-nohz CPU.
  *
  * Return: false if tick_nohz_full_cpu() does not hold for @cpu. Otherwise
  * true, after sending a reschedule IPI when @cpu is a remote CPU or the
  * tick is stopped.
  */
 static bool wake_up_full_nohz_cpu(int cpu)
 {
 	if (!tick_nohz_full_cpu(cpu))
 		return false;

 	if (cpu != smp_processor_id() || tick_nohz_tick_stopped())
 		smp_send_reschedule(cpu);

 	return true;
 }
 /*
  * wake_up_nohz_cpu - wake @cpu so that it re-evaluates its timers.
  *
  * Tries the full-nohz path first and falls back to wake_up_idle_cpu()
  * when @cpu is not a full-nohz CPU.
  */
 void wake_up_nohz_cpu(int cpu)
 {
 	if (wake_up_full_nohz_cpu(cpu))
 		return;

 	wake_up_idle_cpu(cpu);
 }
 /*
  * got_nohz_idle_kick - was this CPU kicked to run nohz idle balancing?
  *
  * Return: true only if NOHZ_BALANCE_KICK is set for this CPU and the CPU
  * is idle with no reschedule pending. If the flag is set but that
  * condition does not hold, the flag is cleared and false is returned.
  */
 static inline bool got_nohz_idle_kick(void)
 {
 	int this_cpu = smp_processor_id();

 	if (test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) {
 		if (idle_cpu(this_cpu) && !need_resched())
 			return true;
 		/*
 		 * Idle load balancing cannot run on this CPU right now, so
 		 * cancel the request by clearing NOHZ_BALANCE_KICK.
 		 */
 		clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
 	}
 	return false;
 }
 #else /* CONFIG_NO_HZ_COMMON */
 /* Without CONFIG_NO_HZ_COMMON there is never a nohz idle kick to report. */
 static inline bool got_nohz_idle_kick(void)
 {
 	return false;
 }
 #endif /* CONFIG_NO_HZ_COMMON */
 #ifdef CONFIG_NO_HZ_FULL
 /*
  * sched_can_stop_tick - may the tick be stopped on this CPU?
  *
  * Return: false when this CPU's runqueue has more than one running task,
  * true otherwise.
  */
 bool sched_can_stop_tick(void)
 {
 	struct rq *rq = this_rq();

 	/* Make sure rq->nr_running update is visible after the IPI */
 	smp_rmb();

 	/* More than one running task need preemption, hence the tick */
 	return rq->nr_running <= 1;
 }
 #endif /* CONFIG_NO_HZ_FULL */
 /*
  * sched_avg_update - age the rt_avg of @rq.
  *
  * For every full sched_avg_period() that has passed since rq->age_stamp,
  * the stamp is advanced by one period and rq->rt_avg is halved.
  */
 void sched_avg_update(struct rq *rq)
 {
 	s64 period = sched_avg_period();
 	while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
 		/*
 		 * Inline assembly required to prevent the compiler
 		 * optimising this loop into a divmod call.
 		 * See __iter_div_u64_rem() for another example of this.
 		 */
 		asm("" : "+rm" (rq->age_stamp));
 		rq->age_stamp += period;
 		rq->rt_avg /= 2;
 	}
 }
 #endif /* CONFIG_SMP */
 #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
 			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
 /*
  * Iterate task_group tree rooted at *from, calling @down when first entering a
  * node and @up when leaving it for the final time.
  *
  * Caller must hold rcu_lock or sufficient equivalent.
  *
  * Return: 0 if every visitor call returned 0; otherwise the first non-zero
  * value returned by @down or @up, which aborts the walk immediately.
  */
 int walk_tg_tree_from(struct task_group *from,
 			     tg_visitor down, tg_visitor up, void *data)
 {
 	struct task_group *parent, *child;
 	int ret;
 	parent = from;
 	/* Entering a node: 'parent' is the node being visited. */
 down:
 	ret = (*down)(parent, data);
 	if (ret)
 		goto out;
 	list_for_each_entry_rcu(child, &parent->children, siblings) {
 		/* Descend: the child becomes the node being visited. */
 		parent = child;
 		goto down;
 		/*
 		 * Re-entry point after a subtree is finished: 'child' is the
 		 * node just left and 'parent' its parent, so the iteration
 		 * continues with the next sibling.
 		 */
 up:
 		continue;
 	}
 	/* All children handled: leave this node. */
 	ret = (*up)(parent, data);
 	if (ret || parent == from)
 		goto out;
 	child = parent;
 	parent = parent->parent;
 	if (parent)
 		goto up;
 out:
 	return ret;
 }
 /*
  * tg_nop - do-nothing task_group visitor.
  *
  * Ignores both arguments and always returns 0, so it can be passed to
  * walk_tg_tree_from() for the direction that needs no work.
  */
 int tg_nop(struct task_group *tg, void *data)
 {
 	(void)tg;
 	(void)data;

 	return 0;
 }
 #endif
 /*
  * set_load_weight - set p->se.load from the task's policy and static_prio.
  *
  * SCHED_IDLE tasks get the fixed idle weight; every other task gets the
  * prio_to_weight[] / prio_to_wmult[] entries selected by
  * static_prio - MAX_RT_PRIO.
  */
 static void set_load_weight(struct task_struct *p)
 {
 	struct load_weight *lw = &p->se.load;
 	int idx;

 	if (p->policy != SCHED_IDLE) {
 		idx = p->static_prio - MAX_RT_PRIO;
 		lw->weight = scale_load(prio_to_weight[idx]);
 		lw->inv_weight = prio_to_wmult[idx];
 	} else {
 		/* SCHED_IDLE tasks get minimal weight. */
 		lw->weight = scale_load(WEIGHT_IDLEPRIO);
 		lw->inv_weight = WMULT_IDLEPRIO;
 	}
 }
 /*
  * enqueue_task - put @p on @rq through its scheduling class.
  *
  * Updates the runqueue clock and the sched_info queueing statistics first,
  * then calls the class's enqueue_task() with @flags.
  */
 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	update_rq_clock(rq);
 	sched_info_queued(rq, p);
 	p->sched_class->enqueue_task(rq, p, flags);
 }
 /*
  * dequeue_task - take @p off @rq through its scheduling class.
  *
  * Updates the runqueue clock and the sched_info dequeue statistics first,
  * then calls the class's dequeue_task() with @flags.
  */
 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	update_rq_clock(rq);
 	sched_info_dequeued(rq, p);
 	p->sched_class->dequeue_task(rq, p, flags);
 }
 /*
  * activate_task - enqueue @p on @rq.
  *
  * If task_contributes_to_load(p) holds, rq->nr_uninterruptible is
  * decremented before the task is enqueued.
  */
 void activate_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	if (task_contributes_to_load(p))
 		rq->nr_uninterruptible--;
 	enqueue_task(rq, p, flags);
 }
 /*
  * deactivate_task - dequeue @p from @rq.
  *
  * If task_contributes_to_load(p) holds, rq->nr_uninterruptible is
  * incremented before the task is dequeued.
  */
 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	if (task_contributes_to_load(p))
 		rq->nr_uninterruptible++;
 	dequeue_task(rq, p, flags);
 }
 /*
  * update_rq_clock_task - advance rq->clock_task by @delta, minus the parts
  * of @delta attributed to irq time (CONFIG_IRQ_TIME_ACCOUNTING) and to
  * paravirt steal time (CONFIG_PARAVIRT_TIME_ACCOUNTING).
  */
 static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
 /*
  * In theory, the compiler should just see 0 here, and optimize out the call
  * to sched_rt_avg_update. But I don't trust it...
  */
 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
 	s64 steal = 0, irq_delta = 0;
 #endif
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
 	/*
 	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
 	 * this case when a previous update_rq_clock() happened inside a
 	 * {soft,}irq region.
 	 *
 	 * When this happens, we stop ->clock_task and only update the
 	 * prev_irq_time stamp to account for the part that fit, so that a next
 	 * update will consume the rest. This ensures ->clock_task is
 	 * monotonic.
 	 *
 	 * It does however cause some slight miss-attribution of {soft,}irq
 	 * time, a more accurate solution would be to update the irq_time using
 	 * the current rq->clock timestamp, except that would require using
 	 * atomic ops.
 	 */
 	if (irq_delta > delta)
 		irq_delta = delta;
 	rq->prev_irq_time += irq_delta;
 	delta -= irq_delta;
 #endif
 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
 	if (static_key_false((&paravirt_steal_rq_enabled))) {
 		steal = paravirt_steal_clock(cpu_of(rq));
 		steal -= rq->prev_steal_time_rq;
 		/* Never account more steal time than what is left of delta. */
 		if (unlikely(steal > delta))
 			steal = delta;
 		rq->prev_steal_time_rq += steal;
 		delta -= steal;
 	}
 #endif
 	rq->clock_task += delta;
 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
 	/* Feed the non-task time into rt_avg when NONTASK_POWER is enabled. */
 	if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
 		sched_rt_avg_update(rq, irq_delta + steal);
 #endif
 }
 /*
  * sched_set_stop_task - install @stop as the stop task of @cpu's runqueue.
  *
  * @stop may be NULL, in which case the runqueue's stop pointer is cleared.
  * A previously installed stop task is switched to the rt scheduling
  * class.
  */
 void sched_set_stop_task(int cpu, struct task_struct *stop)
 {
 	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
 	struct task_struct *old_stop = cpu_rq(cpu)->stop;
 	if (stop) {
 		/*
 		 * Make it appear like a SCHED_FIFO task, its something
 		 * userspace knows about and won't get confused about.
 		 *
 		 * Also, it will make PI more or less work without too
 		 * much confusion -- but then, stop work should not
 		 * rely on PI working anyway.
 		 */
 		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
 		stop->sched_class = &stop_sched_class;
 	}
 	cpu_rq(cpu)->stop = stop;
 	if (old_stop) {
 		/*
 		 * Reset it back to a normal scheduling class so that
 		 * it can die in pieces.
 		 */
 		old_stop->sched_class = &rt_sched_class;
 	}
 }
 /*
  * __normal_prio - return the priority that is based on the static prio
  *
  * Currently this is simply p->static_prio.
  */
 static inline int __normal_prio(struct task_struct *p)
 {
 	return p->static_prio;
 }
 /*
  * normal_prio - priority of @p without RT-inheritance taken into account.
  *
  * Deadline-policy tasks map to MAX_DL_PRIO-1, RT-policy tasks to
  * MAX_RT_PRIO-1 minus their rt_priority, and all other tasks to
  * __normal_prio(). Changes upon fork, setprio syscalls, and whenever the
  * interactivity estimator recalculates.
  */
 static inline int normal_prio(struct task_struct *p)
 {
 	if (task_has_dl_policy(p))
 		return MAX_DL_PRIO-1;

 	if (task_has_rt_policy(p))
 		return MAX_RT_PRIO-1 - p->rt_priority;

 	return __normal_prio(p);
 }
 /*
  * effective_prio - the priority the scheduler should currently use for @p.
  *
  * Recomputes and stores p->normal_prio as a side effect. A task whose
  * current p->prio is an RT priority (an RT task, or one boosted to RT
  * priority) keeps that priority; every other task gets its normal
  * priority.
  */
 static int effective_prio(struct task_struct *p)
 {
 	p->normal_prio = normal_prio(p);

 	/* RT or RT-boosted: leave the priority unchanged. */
 	return rt_prio(p->prio) ? p->prio : p->normal_prio;
 }
 /**
  * task_curr - is this task currently executing on a CPU?
  * @p: the task in question.
  *
  * Compares @p with the current task of the CPU that @p is assigned to.
  *
  * Return: 1 if the task is currently executing. 0 otherwise.
  */
 inline int task_curr(const struct task_struct *p)
 {
 	int cpu = task_cpu(p);

 	return p == cpu_curr(cpu);
 }
 /*
  * check_class_changed - notify scheduling classes after a change to @p.
  * @prev_class: the class @p belonged to before the change
  * @oldprio: the priority @p had before the change
  *
  * If the class changed, the old class's optional switched_from() hook and
  * the new class's switched_to() hook are called. If the class is the
  * same, prio_changed() is called when the priority differs from @oldprio
  * or when @p is a deadline task.
  */
 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 				       const struct sched_class *prev_class,
 				       int oldprio)
 {
 	if (prev_class == p->sched_class) {
 		if (oldprio != p->prio || dl_task(p))
 			p->sched_class->prio_changed(rq, p, oldprio);
 		return;
 	}

 	if (prev_class->switched_from)
 		prev_class->switched_from(rq, p);
 	p->sched_class->switched_to(rq, p);
 }
 /*
  * check_preempt_curr - check whether @p should preempt the task currently
  * running on @rq.
  *
  * Within the same scheduling class the decision is delegated to the
  * class's check_preempt_curr() hook. Across classes, the classes are
  * walked in for_each_class() order and the current task is rescheduled if
  * @p's class is encountered before the current task's class.
  */
 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 {
 	const struct sched_class *class;
 	if (p->sched_class == rq->curr->sched_class) {
 		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
 	} else {
 		for_each_class(class) {
 			/* Current task's class comes first: no preemption. */
 			if (class == rq->curr->sched_class)
 				break;
 			/* @p's class comes first: preempt the current task. */
 			if (class == p->sched_class) {
 				resched_task(rq->curr);
 				break;
 			}
 		}
 	}
 	/*
 	 * A queue event has occurred, and we're going to schedule.  In
 	 * this case, we can save a useless back to back clock update.
 	 */
 	if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
 		rq->skip_clock_update = 1;
 }
 #ifdef CONFIG_SMP
 /*
  * set_task_cpu - move @p to @new_cpu.
  *
  * Emits the sched_migrate_task tracepoint. When the CPU actually changes,
  * the class's optional migrate_task_rq() hook is called, the migration
  * count is incremented and a CPU-migration perf software event is
  * generated. Finally the new CPU is recorded via __set_task_cpu().
  */
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
 #ifdef CONFIG_SCHED_DEBUG
 	/*
 	 * We should never call set_task_cpu() on a blocked task,
 	 * ttwu() will sort out the placement.
 	 */
 	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
 			!(task_preempt_count(p) & PREEMPT_ACTIVE));
 #ifdef CONFIG_LOCKDEP
 	/*
 	 * The caller should hold either p->pi_lock or rq->lock, when changing
 	 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
 	 *
 	 * sched_move_task() holds both and thus holding either pins the cgroup,
 	 * see task_group().
 	 *
 	 * Furthermore, all task_rq users should acquire both locks, see
 	 * task_rq_lock().
 	 */
 	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
 				      lockdep_is_held(&task_rq(p)->lock)));
 #endif
 #endif
 	trace_sched_migrate_task(p, new_cpu);
 	/* Migration bookkeeping only when the CPU really changes. */
 	if (task_cpu(p) != new_cpu) {
 		if (p->sched_class->migrate_task_rq)
 			p->sched_class->migrate_task_rq(p, new_cpu);
 		p->se.nr_migrations++;
 		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
 	}
 	__set_task_cpu(p, new_cpu);
 }
 /*
  * __migrate_swap_task - move @p to @cpu as one half of a task swap.
  *
  * A task that is on a runqueue is deactivated on its current runqueue,
  * re-assigned to @cpu, activated on @cpu's runqueue and checked for
  * preemption there. For a task that is not on a runqueue only
  * p->wake_cpu is updated.
  */
 static void __migrate_swap_task(struct task_struct *p, int cpu)
 {
 	struct rq *from_rq, *to_rq;

 	if (!p->on_rq) {
 		/*
 		 * Task isn't running anymore; make it appear like we migrated
 		 * it before it went to sleep. This means on wakeup we make the
 		 * previous cpu our target instead of where it really is.
 		 */
 		p->wake_cpu = cpu;
 		return;
 	}

 	from_rq = task_rq(p);
 	to_rq = cpu_rq(cpu);

 	deactivate_task(from_rq, p, 0);
 	set_task_cpu(p, cpu);
 	activate_task(to_rq, p, 0);
 	check_preempt_curr(to_rq, p, 0);
 }
 /*
  * Argument passed from migrate_swap() to migrate_swap_stop(): the two
  * tasks to exchange and the CPU each was on when the swap was requested.
  */
 struct migration_swap_arg {
 	struct task_struct *src_task, *dst_task;
 	int src_cpu, dst_cpu;
 };
 /*
  * migrate_swap_stop - perform the task swap described by @data.
  *
  * Run through stop_two_cpus() by migrate_swap(). Takes both tasks'
  * pi_lock and both runqueue locks, then re-checks that each task is still
  * on its recorded CPU and is allowed on the other CPU before swapping.
  *
  * Return: 0 on success, -EAGAIN if any of the re-checks fails.
  */
 static int migrate_swap_stop(void *data)
 {
 	struct migration_swap_arg *arg = data;
 	struct rq *src_rq, *dst_rq;
 	int ret = -EAGAIN;
 	src_rq = cpu_rq(arg->src_cpu);
 	dst_rq = cpu_rq(arg->dst_cpu);
 	double_raw_lock(&arg->src_task->pi_lock,
 			&arg->dst_task->pi_lock);
 	double_rq_lock(src_rq, dst_rq);
 	/* Either task moved since the request was set up: give up. */
 	if (task_cpu(arg->dst_task) != arg->dst_cpu)
 		goto unlock;
 	if (task_cpu(arg->src_task) != arg->src_cpu)
 		goto unlock;
 	/* Each task must be allowed to run on the other task's CPU. */
 	if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
 		goto unlock;
 	if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
 		goto unlock;
 	__migrate_swap_task(arg->src_task, arg->dst_cpu);
 	__migrate_swap_task(arg->dst_task, arg->src_cpu);
 	ret = 0;
 unlock:
 	double_rq_unlock(src_rq, dst_rq);
 	raw_spin_unlock(&arg->dst_task->pi_lock);
 	raw_spin_unlock(&arg->src_task->pi_lock);
 	return ret;
 }
 /*
  * Cross migrate two tasks
  */
 int migrate_swap(struct task_struct *cur, struct task_struct *p)
 {
 	struct migration_swap_arg arg;
 	int ret = -EINVAL;
 	arg = (struct migration_swap_arg){
 		.src_task = cur,
 		.src_cpu = task_cpu(cur),
 		.dst_task = p,
 		.dst_cpu = task_cpu(p),
 	};
 	if (arg.src_cpu == arg.dst_cpu)
 		goto out;
 	/*
 	 * These three tests are all lockless; this is OK since all of them
 	 * will be re-checked with proper locks held further down the line.
 	 */
 	if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
 		goto out;
 	if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
 		goto out;
 	if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
 		goto out;
 	trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
 	ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
 out:
 	return ret;
 }
 /* A task together with the CPU it is to be moved to. */
 struct migration_arg {
 	struct task_struct *task;
 	int dest_cpu;
 };
 /* Forward declaration; the definition is outside this part of the file. */
 static int migration_cpu_stop(void *data);
 /*
  * wait_task_inactive - wait for a thread to unschedule.
  *
  * If @match_state is nonzero, it's the @p->state value just checked and
  * not expected to change.  If it changes, i.e. @p might have woken up,
  * then return zero.  When we succeed in waiting for @p to be off its CPU,
  * we return a positive number (its total switch count).  If a second call
  * a short while later returns the same number, the caller can be sure that
  * @p has remained unscheduled the whole time.
  *
  * The caller must ensure that the task *will* unschedule sometime soon,
  * else this function might spin for a *long* time. This function can't
  * be called with interrupts off, or it may introduce deadlock with
  * smp_call_function() if an IPI is sent by the same process we are
  * waiting to become inactive.
  *
  * The returned count is p->nvcsw with the most significant bit set, which
  * keeps a successful result distinct from the 0 "state changed" result.
  */
 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 {
 	unsigned long flags;
 	int running, on_rq;
 	unsigned long ncsw;
 	struct rq *rq;
 	for (;;) {
 		/*
 		 * We do the initial early heuristics without holding
 		 * any task-queue locks at all. We'll only try to get
 		 * the runqueue lock when things look like they will
 		 * work out!
 		 */
 		rq = task_rq(p);
 		/*
 		 * If the task is actively running on another CPU
 		 * still, just relax and busy-wait without holding
 		 * any locks.
 		 *
 		 * NOTE! Since we don't hold any locks, it's not
 		 * even sure that "rq" stays as the right runqueue!
 		 * But we don't care, since "task_running()" will
 		 * return false if the runqueue has changed and p
 		 * is actually now running somewhere else!
 		 */
 		while (task_running(rq, p)) {
 			if (match_state && unlikely(p->state != match_state))
 				return 0;
 			cpu_relax();
 		}
 		/*
 		 * Ok, time to look more closely! We need the rq
 		 * lock now, to be *sure*. If we're wrong, we'll
 		 * just go back and repeat.
 		 */
 		rq = task_rq_lock(p, &flags);
 		trace_sched_wait_task(p);
 		/* Snapshot the task's state while both locks are held. */
 		running = task_running(rq, p);
 		on_rq = p->on_rq;
 		ncsw = 0;
 		if (!match_state || p->state == match_state)
 			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
 		task_rq_unlock(rq, p, &flags);
 		/*
 		 * If it changed from the expected state, bail out now.
 		 */
 		if (unlikely(!ncsw))
 			break;
 		/*
 		 * Was it really running after all now that we
 		 * checked with the proper locks actually held?
 		 *
 		 * Oops. Go back and try again..
 		 */
 		if (unlikely(running)) {
 			cpu_relax();
 			continue;
 		}
 		/*
 		 * It's not enough that it's not actively running,
 		 * it must be off the runqueue _entirely_, and not
 		 * preempted!
 		 *
 		 * So if it was still runnable (but just not actively
 		 * running right now), it's preempted, and we should
 		 * yield - it could be a while.
 		 */
 		if (unlikely(on_rq)) {
 			/* Sleep for one tick (NSEC_PER_SEC/HZ ns), then retry. */
 			ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
 			set_current_state(TASK_UNINTERRUPTIBLE);
 			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
 			continue;
 		}
 		/*
 		 * Ahh, all good. It wasn't running, and it wasn't
 		 * runnable, which means that it will never become
 		 * running in the future either. We're all done!
 		 */
 		break;
 	}
 	return ncsw;
 }
 /***
  * kick_process - kick a running thread to enter/exit the kernel
  * @p: the to-be-kicked thread
  *
  * Cause a process which is running on another CPU to enter
  * kernel-mode, without any delay. (to get signals handled.)
  *
  * NOTE: this function doesn't have to take the runqueue lock,
  * because all it wants to ensure is that the remote task enters
  * the kernel. If the IPI races and the task has been migrated
  * to another CPU then no harm is done and the purpose has been
  * achieved as well.
  */
 void kick_process(struct task_struct *p)
 {
 	int target;

 	/* Keep smp_processor_id() stable while we compare against it. */
 	preempt_disable();
 	target = task_cpu(p);
 	if (target != smp_processor_id()) {
 		/* Only send the IPI when task_curr() reports @p as current. */
 		if (task_curr(p))
 			smp_send_reschedule(target);
 	}
 	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(kick_process);
 #endif /* CONFIG_SMP */
 #ifdef CONFIG_SMP
 /*
  * ->cpus_allowed is protected by both rq->lock and p->pi_lock
  */
 static int select_fallback_rq(int cpu, struct task_struct *p)
 {
 	enum { cpuset, possible, fail } stage = cpuset;
 	const struct cpumask *node_cpus = NULL;
 	int node = cpu_to_node(cpu);
 	int pick;

 	/*
 	 * First choice: an allowed, online and active CPU on the same node
 	 * as @cpu.  If that node has been offlined cpu_to_node() returns -1
 	 * and there is nothing to scan here, so go straight to the wider
 	 * search below.
 	 */
 	if (node != -1) {
 		node_cpus = cpumask_of_node(node);
 		for_each_cpu(pick, node_cpus) {
 			if (cpu_online(pick) && cpu_active(pick) &&
 			    cpumask_test_cpu(pick, tsk_cpus_allowed(p)))
 				return pick;
 		}
 	}

 	for (;;) {
 		/* Second choice: any allowed CPU that is online and active. */
 		for_each_cpu(pick, tsk_cpus_allowed(p)) {
 			if (cpu_online(pick) && cpu_active(pick))
 				goto found;
 		}

 		/*
 		 * Nothing usable in the current mask: widen it one step and
 		 * retry.  Running out of steps is fatal.
 		 */
 		if (stage == cpuset) {
 			/* No more Mr. Nice Guy. */
 			cpuset_cpus_allowed_fallback(p);
 			stage = possible;
 		} else if (stage == possible) {
 			do_set_cpus_allowed(p, cpu_possible_mask);
 			stage = fail;
 		} else {
 			BUG();
 		}
 	}

 found:
 	/*
 	 * Report the affinity change only when the mask had to be widened.
 	 * Exiting tasks and kernel threads (both mm NULL) are not reported,
 	 * since they never leave the kernel.
 	 */
 	if (stage != cpuset && p->mm && printk_ratelimit()) {
 		printk_sched("process %d (%s) no longer affine to cpu%d\n",
 				task_pid_nr(p), p->comm, cpu);
 	}
 	return pick;
 }
 /*
  * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
  */
 static inline
 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 {
 	int chosen = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);

 	/*
 	 * The class hook may simply hand back task_cpu(p) without checking
 	 * it, so the generic constraint is enforced here for every placement
 	 * strategy: the result must be in ->cpus_allowed and online.  We
 	 * rely on ttwu() doing this so that set_task_cpu() is never called
 	 * on a blocking task.
 	 */
 	if (likely(cpumask_test_cpu(chosen, tsk_cpus_allowed(p)) &&
 		   cpu_online(chosen)))
 		return chosen;

 	return select_fallback_rq(task_cpu(p), p);
 }
 /*
  * Move the running average *@avg towards @sample by one eighth of the
  * (signed) distance between them.
  */
 static void update_avg(u64 *avg, u64 sample)
 {
 	s64 delta = sample - *avg;

 	*avg = *avg + (delta >> 3);
 }
 #endif
 /*
  * Account a wakeup of @p in the scheduler statistics.
  *
  * @p:          the task being woken
  * @cpu:        the CPU the wakeup is accounted against; compared with the
  *              CPU we are executing on to classify it as local or remote
  * @wake_flags: WF_* flags; WF_MIGRATED and WF_SYNC select extra counters
  *
  * The whole body is compiled out without CONFIG_SCHEDSTATS.
  */
 static void
 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 {
 #ifdef CONFIG_SCHEDSTATS
 	struct rq *rq = this_rq();
 #ifdef CONFIG_SMP
 	int this_cpu = smp_processor_id();
 	if (cpu == this_cpu) {
 		schedstat_inc(rq, ttwu_local);
 		schedstat_inc(p, se.statistics.nr_wakeups_local);
 	} else {
 		struct sched_domain *sd;
 		schedstat_inc(p, se.statistics.nr_wakeups_remote);
 		/*
 		 * Charge the first domain, in for_each_domain() order from
 		 * this CPU, whose span contains @cpu.  The walk is done
 		 * under rcu_read_lock().
 		 */
 		rcu_read_lock();
 		for_each_domain(this_cpu, sd) {
 			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
 				schedstat_inc(sd, ttwu_wake_remote);
 				break;
 			}
 		}
 		rcu_read_unlock();
 	}
 	if (wake_flags & WF_MIGRATED)
 		schedstat_inc(p, se.statistics.nr_wakeups_migrate);
 #endif /* CONFIG_SMP */
 	/* Counters kept regardless of CONFIG_SMP. */
 	schedstat_inc(rq, ttwu_count);
 	schedstat_inc(p, se.statistics.nr_wakeups);
 	if (wake_flags & WF_SYNC)
 		schedstat_inc(p, se.statistics.nr_wakeups_sync);
 #endif /* CONFIG_SCHEDSTATS */
 }
 /*
  * Put @p on @rq via activate_task() with @en_flags, mark it ->on_rq and,
  * for workqueue workers, tell the workqueue code about the wakeup.
  */
 static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
 {
 	activate_task(rq, p, en_flags);
 	p->on_rq = 1;

 	/* Nothing more to do unless a workqueue worker is waking up. */
 	if (!(p->flags & PF_WQ_WORKER))
 		return;

 	wq_worker_waking_up(p, cpu_of(rq));
 }
 /*
  * Mark the task runnable and perform wakeup-preemption.
  *
  * On SMP this additionally calls the scheduling class's ->task_woken()
  * hook when one is provided and, if @rq has an idle stamp set, folds the
  * time since that stamp into rq->avg_idle and clears the stamp.
  */
 static void
 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 {
 	check_preempt_curr(rq, p, wake_flags);
 	trace_sched_wakeup(p, true);
 	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_woken)
 		p->sched_class->task_woken(rq, p);
 	if (rq->idle_stamp) {
 		u64 delta = rq_clock(rq) - rq->idle_stamp;
 		u64 max = 2*rq->max_idle_balance_cost;
 		update_avg(&rq->avg_idle, delta);
 		/* Clamp the average at twice the max idle-balance cost. */
 		if (rq->avg_idle > max)
 			rq->avg_idle = max;
 		rq->idle_stamp = 0;
 	}
 #endif
 }
 /*
  * Full wakeup of a task that is not on a runqueue: enqueue @p on @rq and
  * then mark it runnable.
  */
 static void
 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
 {
 	const int en_flags = ENQUEUE_WAKEUP | ENQUEUE_WAKING;

 #ifdef CONFIG_SMP
 	/* Undo the uninterruptible accounting flagged on the task. */
 	if (p->sched_contributes_to_load)
 		rq->nr_uninterruptible -= 1;
 #endif
 	ttwu_activate(rq, p, en_flags);
 	ttwu_do_wakeup(rq, p, wake_flags);
 }
 /*
  * Called in case the task @p isn't fully descheduled from its runqueue,
  * in this case we must do a remote wakeup. It's a 'light' wakeup though,
  * since all we need to do is flip p->state to TASK_RUNNING, since
  * the task is still ->on_rq.
  */
 static int ttwu_remote(struct task_struct *p, int wake_flags)
 {
 	struct rq *rq = __task_rq_lock(p);
 	int woken = 0;

 	/* Re-check ->on_rq now that the runqueue lock is held. */
 	if (p->on_rq) {
 		/* check_preempt_curr() may use rq clock */
 		update_rq_clock(rq);
 		ttwu_do_wakeup(rq, p, wake_flags);
 		woken = 1;
 	}

 	__task_rq_unlock(rq);
 	return woken;
 }
 #ifdef CONFIG_SMP
 /*
  * Activate every task queued on this CPU's wake_list.  The list is
  * detached in one go before rq->lock is taken; the detached entries are
  * then activated under the lock.
  */
 static void sched_ttwu_pending(void)
 {
 	struct rq *rq = this_rq();
 	struct llist_node *pending = llist_del_all(&rq->wake_list);

 	raw_spin_lock(&rq->lock);
 	while (pending) {
 		struct task_struct *task;

 		task = llist_entry(pending, struct task_struct, wake_entry);
 		/* Step to the next entry before activating this one. */
 		pending = llist_next(pending);
 		ttwu_do_activate(rq, task, 0);
 	}
 	raw_spin_unlock(&rq->lock);
 }
 /*
  * scheduler_ipi - scheduler part of the reschedule IPI handler.
  *
  * Folds a pending need-resched into the preempt count and, only when there
  * is real work (queued remote wakeups, a nohz-full CPU, or a nohz idle
  * balance kick), enters IRQ context to process it.
  */
 void scheduler_ipi(void)
 {
 	/*
 	 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
 	 * TIF_NEED_RESCHED remotely (for the first time) will also send
 	 * this IPI.
 	 */
 	preempt_fold_need_resched();
 	/* Fast path: a plain resched IPI with nothing else to do. */
 	if (llist_empty(&this_rq()->wake_list)
 			&& !tick_nohz_full_cpu(smp_processor_id())
 			&& !got_nohz_idle_kick())
 		return;
 	/*
 	 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
 	 * traditionally all their work was done from the interrupt return
 	 * path. Now that we actually do some work, we need to make sure
 	 * we do call them.
 	 *
 	 * Some archs already do call them, luckily irq_enter/exit nest
 	 * properly.
 	 *
 	 * Arguably we should visit all archs and update all handlers,
 	 * however a fair share of IPIs are still resched only so this would
 	 * somewhat pessimize the simple resched case.
 	 */
 	irq_enter();
 	tick_nohz_full_check();
 	/* Activate the tasks other CPUs queued on our wake_list. */
 	sched_ttwu_pending();
 	/*
 	 * Check if someone kicked us for doing the nohz idle load balance.
 	 */
 	if (unlikely(got_nohz_idle_kick())) {
 		this_rq()->idle_balance = 1;
 		raise_softirq_irqoff(SCHED_SOFTIRQ);
 	}
 	irq_exit();
 }
 static void ttwu_queue_remote(struct task_struct *p, int cpu)
 {
 	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
 		smp_send_reschedule(cpu);
 }
 bool cpus_share_cache(int this_cpu, int that_cpu)
 {
 	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
 }
 #endif /* CONFIG_SMP */
 /*
  * Hand the waking task @p over to @cpu.
  *
  * On SMP with the TTWU_QUEUE feature enabled, a target that does not share
  * a cache with the current CPU gets the task queued on its wake_list
  * instead of having its rq->lock taken from here.  Otherwise the task is
  * activated directly under the target's rq->lock.
  */
 static void ttwu_queue(struct task_struct *p, int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 #if defined(CONFIG_SMP)
 	if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
 		sched_clock_cpu(cpu); /* sync clocks x-cpu */
 		ttwu_queue_remote(p, cpu);
 		return;
 	}
 #endif
 	/* Direct activation on the target runqueue. */
 	raw_spin_lock(&rq->lock);
 	ttwu_do_activate(rq, p, 0);
 	raw_spin_unlock(&rq->lock);
 }
 /**
  * try_to_wake_up - wake up a thread
  * @p: the thread to be awakened
  * @state: the mask of task states that can be woken
  * @wake_flags: wake modifier flags (WF_*)
  *
  * Put it on the run-queue if it's not already there. The "current"
  * thread is always on the run-queue (except when the actual
  * re-schedule is in progress), and as such you're allowed to do
  * the simpler "current->state = TASK_RUNNING" to mark yourself
  * runnable without the overhead of this.
  *
  * Return: %true if @p was woken up, %false if it was already running
  * or @state didn't match @p's state.
  */
 static int
 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 {
 	unsigned long flags;
 	int cpu, success = 0;
 	/*
 	 * If we are going to wake up a thread waiting for CONDITION we
 	 * need to ensure that CONDITION=1 done by the caller can not be
 	 * reordered with p->state check below. This pairs with mb() in
 	 * set_current_state() the waiting thread does.
 	 */
 	smp_mb__before_spinlock();
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	/* Not in one of the states the caller asked for: nothing to do. */
 	if (!(p->state & state))
 		goto out;
 	success = 1; /* we're going to change ->state */
 	cpu = task_cpu(p);
 	/*
 	 * Still on a runqueue: the light wakeup under rq->lock is enough,
 	 * skip placement and queueing and go straight to the statistics.
 	 */
 	if (p->on_rq && ttwu_remote(p, wake_flags))
 		goto stat;
 #ifdef CONFIG_SMP
 	/*
 	 * If the owning (remote) cpu is still in the middle of schedule() with
 	 * this task as prev, wait until it's done referencing the task.
 	 */
 	while (p->on_cpu)
 		cpu_relax();
 	/*
 	 * Pairs with the smp_wmb() in finish_lock_switch().
 	 */
 	smp_rmb();
 	p->sched_contributes_to_load = !!task_contributes_to_load(p);
 	p->state = TASK_WAKING;
 	if (p->sched_class->task_waking)
 		p->sched_class->task_waking(p);
 	/* Pick the CPU to wake on; note the migration if it differs. */
 	cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
 	if (task_cpu(p) != cpu) {
 		wake_flags |= WF_MIGRATED;
 		set_task_cpu(p, cpu);
 	}
 #endif /* CONFIG_SMP */
 	ttwu_queue(p, cpu);
 stat:
 	ttwu_stat(p, cpu, wake_flags);
 out:
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 	return success;
 }
 /**
  * try_to_wake_up_local - try to wake up a local task with rq lock held
  * @p: the thread to be awakened
  *
  * Put @p on the run-queue if it's not already there. The caller must
  * ensure that this_rq() is locked, @p is bound to this_rq() and not
  * the current task.
  */
 static void try_to_wake_up_local(struct task_struct *p)
 {
 	struct rq *rq = task_rq(p);
 	/* Enforce the documented preconditions; warn once and bail if broken. */
 	if (WARN_ON_ONCE(rq != this_rq()) ||
 	    WARN_ON_ONCE(p == current))
 		return;
 	lockdep_assert_held(&rq->lock);
 	/*
 	 * We hold rq->lock and need p->pi_lock as well.  If the trylock
 	 * fails, drop rq->lock and take both again with pi_lock first and
 	 * rq->lock second.
 	 */
 	if (!raw_spin_trylock(&p->pi_lock)) {
 		raw_spin_unlock(&rq->lock);
 		raw_spin_lock(&p->pi_lock);
 		raw_spin_lock(&rq->lock);
 	}
 	if (!(p->state & TASK_NORMAL))
 		goto out;
 	if (!p->on_rq)
 		ttwu_activate(rq, p, ENQUEUE_WAKEUP);
 	ttwu_do_wakeup(rq, p, 0);
 	ttwu_stat(p, smp_processor_id(), 0);
 out:
 	/* Only pi_lock is released here; rq->lock stays held for the caller. */
 	raw_spin_unlock(&p->pi_lock);
 }
 /**
  * wake_up_process - Wake up a specific process
  * @p: The process to be woken up.
  *
  * Attempt to wake up the nominated process and move it to the set of runnable
  * processes.
  *
  * Return: 1 if the process was woken up, 0 if it was already running.
  *
  * It may be assumed that this function implies a write memory barrier before
  * changing the task state if and only if any tasks are woken up.
  */
 int wake_up_process(struct task_struct *p)
 {
 	int woken;

 	/* Complain when asked to wake a stopped or traced task. */
 	WARN_ON(task_is_stopped_or_traced(p));

 	woken = try_to_wake_up(p, TASK_NORMAL, 0);
 	return woken;
 }
 EXPORT_SYMBOL(wake_up_process);
 /*
  * Wake @p if its state matches the @state mask.  Returns whatever
  * try_to_wake_up() returns; no wake flags are passed.
  */
 int wake_up_state(struct task_struct *p, unsigned int state)
 {
 	int woken = try_to_wake_up(p, state, 0);

 	return woken;
 }
 /*
  * Perform scheduler related setup for a newly forked process p.
  * p is forked by current.
  *
  * __sched_fork() is basic setup used by init_idle() too:
  */
 static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 {
 	/* The new task is on no runqueue; clear its fair-class accounting. */
 	p->on_rq			= 0;
 	p->se.on_rq			= 0;
 	p->se.exec_start		= 0;
 	p->se.sum_exec_runtime		= 0;
 	p->se.prev_sum_exec_runtime	= 0;
 	p->se.nr_migrations		= 0;
 	p->se.vruntime			= 0;
 	INIT_LIST_HEAD(&p->se.group_node);
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
 	/* Deadline entity: detached rb node, fresh timer, zeroed parameters. */
 	RB_CLEAR_NODE(&p->dl.rb_node);
 	hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	p->dl.dl_runtime = p->dl.runtime = 0;
 	p->dl.dl_deadline = p->dl.deadline = 0;
 	p->dl.dl_period = 0;
 	p->dl.flags = 0;
 	/* Realtime entity: empty run list. */
 	INIT_LIST_HEAD(&p->rt.run_list);
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	INIT_HLIST_HEAD(&p->preempt_notifiers);
 #endif
 #ifdef CONFIG_NUMA_BALANCING
 	/*
 	 * Reset the mm-wide scan state only when mm_users reads 1 for
 	 * this mm.
 	 */
 	if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
 		p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
 		p->mm->numa_scan_seq = 0;
 	}
 	/* With CLONE_VM the preferred node is inherited from the parent. */
 	if (clone_flags & CLONE_VM)
 		p->numa_preferred_nid = current->numa_preferred_nid;
 	else
 		p->numa_preferred_nid = -1;
 	p->node_stamp = 0ULL;
 	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
 	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
 	/* Point numa_work.next at numa_work itself. */
 	p->numa_work.next = &p->numa_work;
 	p->numa_faults_memory = NULL;
 	p->numa_faults_buffer_memory = NULL;
 	p->last_task_numa_placement = 0;
 	p->last_sum_exec_runtime = 0;
 	INIT_LIST_HEAD(&p->numa_entry);
 	p->numa_group = NULL;
 #endif /* CONFIG_NUMA_BALANCING */
 }
 #ifdef CONFIG_NUMA_BALANCING
 #ifdef CONFIG_SCHED_DEBUG
 /*
  * CONFIG_SCHED_DEBUG variant: switch NUMA balancing on or off by setting
  * the "NUMA" / "NO_NUMA" scheduler feature.
  */
 void set_numabalancing_state(bool enabled)
 {
 	sched_feat_set(enabled ? "NUMA" : "NO_NUMA");
 }
 #else
 /* !CONFIG_SCHED_DEBUG: NUMA balancing on/off state is a plain flag. */
 __read_mostly bool numabalancing_enabled;
 /* Record the requested NUMA balancing state in numabalancing_enabled. */
 void set_numabalancing_state(bool enabled)
 {
 	numabalancing_enabled = enabled;
 }
 #endif /* CONFIG_SCHED_DEBUG */
 #ifdef CONFIG_PROC_SYSCTL
 /*
  * sysctl handler for the NUMA balancing switch.
  *
  * Works on a private copy of @table whose ->data points at a local int
  * holding the current state, so proc_dointvec_minmax() never touches the
  * real state directly.  A successful write is applied through
  * set_numabalancing_state().
  *
  * Returns -EPERM for writers without CAP_SYS_ADMIN, otherwise the result
  * of proc_dointvec_minmax().
  */
 int sysctl_numa_balancing(struct ctl_table *table, int write,
 			 void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int state = numabalancing_enabled;
 	struct ctl_table shadow;
 	int ret;

 	if (write && !capable(CAP_SYS_ADMIN))
 		return -EPERM;

 	shadow = *table;
 	shadow.data = &state;
 	ret = proc_dointvec_minmax(&shadow, write, buffer, lenp, ppos);

 	/* Apply the new value only after a write that parsed cleanly. */
 	if (ret >= 0 && write)
 		set_numabalancing_state(state);

 	return ret;
 }
 #endif
 #endif
 /*
  * fork()/clone()-time setup:
  */
 /*
  * Returns 0 on success, or -EAGAIN when the child would end up with a
  * deadline priority.  The CPU obtained with get_cpu() is released with
  * put_cpu() on both paths.
  */
 int sched_fork(unsigned long clone_flags, struct task_struct *p)
 {
 	unsigned long flags;
 	int cpu = get_cpu();
 	__sched_fork(clone_flags, p);
 	/*
 	 * We mark the process as running here. This guarantees that
 	 * nobody will actually run it, and a signal or other external
 	 * event cannot wake it up and insert it on the runqueue either.
 	 */
 	p->state = TASK_RUNNING;
 	/*
 	 * Make sure we do not leak PI boosting priority to the child.
 	 */
 	p->prio = current->normal_prio;
 	/*
 	 * Revert to default priority/policy on fork if requested.
 	 */
 	if (unlikely(p->sched_reset_on_fork)) {
 		/*
 		 * Deadline and realtime tasks drop to SCHED_NORMAL at nice 0;
 		 * other tasks only have a negative nice value reset to 0.
 		 */
 		if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
 			p->policy = SCHED_NORMAL;
 			p->static_prio = NICE_TO_PRIO(0);
 			p->rt_priority = 0;
 		} else if (PRIO_TO_NICE(p->static_prio) < 0)
 			p->static_prio = NICE_TO_PRIO(0);
 		p->prio = p->normal_prio = __normal_prio(p);
 		set_load_weight(p);
 		/*
 		 * We don't need the reset flag anymore after the fork. It has
 		 * fulfilled its duty:
 		 */
 		p->sched_reset_on_fork = 0;
 	}
 	/*
 	 * Pick the scheduling class from the resulting priority.  A child
 	 * with a deadline priority is refused here.
 	 */
 	if (dl_prio(p->prio)) {
 		put_cpu();
 		return -EAGAIN;
 	} else if (rt_prio(p->prio)) {
 		p->sched_class = &rt_sched_class;
 	} else {
 		p->sched_class = &fair_sched_class;
 	}
 	if (p->sched_class->task_fork)
 		p->sched_class->task_fork(p);
 	/*
 	 * The child is not yet in the pid-hash so no cgroup attach races,
 	 * and the cgroup is pinned to this child because cgroup_fork()
 	 * is run before sched_fork().
 	 *
 	 * Silence PROVE_RCU.
 	 */
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	set_task_cpu(p, cpu);
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	if (likely(sched_info_on()))
 		memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
 #if defined(CONFIG_SMP)
 	p->on_cpu = 0;
 #endif
 	init_task_preempt_count(p);
 #ifdef CONFIG_SMP
 	plist_node_init(&p->pushable_tasks, MAX_PRIO);
 	RB_CLEAR_NODE(&p->pushable_dl_tasks);
 #endif
 	put_cpu();
 	return 0;
 }
 /*
  * Express @runtime / @period as a fixed-point ratio scaled by 2^20.
  *
  * An infinite runtime (RUNTIME_INF) maps to 1 << 20, and a zero period
  * yields 0.
  */
 unsigned long to_ratio(u64 period, u64 runtime)
 {
 	unsigned long ratio;

 	if (runtime == RUNTIME_INF) {
 		ratio = 1ULL << 20;
 	} else if (!period) {
 		/*
 		 * Doing this here saves a lot of checks in all
 		 * the calling paths, and returning zero seems
 		 * safe for them anyway.
 		 */
 		ratio = 0;
 	} else {
 		ratio = div64_u64(runtime << 20, period);
 	}

 	return ratio;
 }
 #ifdef CONFIG_SMP
 inline struct dl_bw *dl_bw_of(int i)
 {
 	return &cpu_rq(i)->rd->dl_bw;
 }
 static inline int dl_bw_cpus(int i)
 {
 	struct root_domain *rd = cpu_rq(i)->rd;
 	int cpus = 0;
 	for_each_cpu_and(i, rd->span, cpu_active_mask)
 		cpus++;
 	return cpus;
 }
 #else
 inline struct dl_bw *dl_bw_of(int i)
 {
 	return &cpu_rq(i)->dl.dl_bw;
 }
 /* !SMP: always reports a single CPU; @i is ignored. */
 static inline int dl_bw_cpus(int i)
 {
 	return 1;
 }
 #endif
 /* Remove @tsk_bw from the total bandwidth accounted in @dl_b. */
 static inline
 void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
 {
 	dl_b->total_bw = dl_b->total_bw - tsk_bw;
 }
 /* Add @tsk_bw to the total bandwidth accounted in @dl_b. */
 static inline
 void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
 {
 	dl_b->total_bw = dl_b->total_bw + tsk_bw;
 }
 /*
  * Would replacing @old_bw with @new_bw push the total allocated bandwidth
  * above the limit for @cpus CPUs?  A limit of -1 never overflows.
  */
 static inline
 bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
 {
 	if (dl_b->bw == -1)
 		return false;

 	return dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
 }
 /*
  * We must be sure that accepting a new task (or allowing changing the
  * parameters of an existing one) is consistent with the bandwidth
  * constraints. If yes, this function also accordingly updates the currently
  * allocated bandwidth to reflect the new situation.
  *
  * This function is called while holding p's rq->lock.
  */
 /*
  * Returns 0 when the change is accepted (or the bandwidth is unchanged),
  * -1 when it is refused.
  */
 static int dl_overflow(struct task_struct *p, int policy,
 		       const struct sched_attr *attr)
 {
 	struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
 	/* A zero period falls back to the deadline. */
 	u64 period = attr->sched_period ?: attr->sched_deadline;
 	u64 runtime = attr->sched_runtime;
 	/* A non-deadline target policy asks for no bandwidth. */
 	u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
 	int cpus, err = -1;
 	if (new_bw == p->dl.dl_bw)
 		return 0;
 	/*
 	 * Whether a task enters, leaves, or stays -deadline but changes
 	 * its parameters, we may need to update the total allocated
 	 * bandwidth of the container accordingly.
 	 */
 	raw_spin_lock(&dl_b->lock);
 	cpus = dl_bw_cpus(task_cpu(p));
 	if (dl_policy(policy) && !task_has_dl_policy(p) &&
 	    !__dl_overflow(dl_b, cpus, 0, new_bw)) {
 		/* Entering -deadline: account the new bandwidth. */
 		__dl_add(dl_b, new_bw);
 		err = 0;
 	} else if (dl_policy(policy) && task_has_dl_policy(p) &&
 		   !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
 		/* Staying -deadline: swap the old bandwidth for the new. */
 		__dl_clear(dl_b, p->dl.dl_bw);
 		__dl_add(dl_b, new_bw);
 		err = 0;
 	} else if (!dl_policy(policy) && task_has_dl_policy(p)) {
 		/* Leaving -deadline: release the old bandwidth. */
 		__dl_clear(dl_b, p->dl.dl_bw);
 		err = 0;
 	}
 	raw_spin_unlock(&dl_b->lock);
 	return err;
 }
 extern void init_dl_bw(struct dl_bw *dl_b);
 /*
  * wake_up_new_task - wake up a newly created task for the first time.
  *
  * This function will do some initial scheduler statistics housekeeping
  * that must be done for every newly created context, then puts the task
  * on the runqueue and wakes it.
  */
 void wake_up_new_task(struct task_struct *p)
 {
 	unsigned long flags;
 	struct rq *rq;
 	/* pi_lock is taken first, with interrupts saved in @flags. */
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 #ifdef CONFIG_SMP
 	/*
 	 * Fork balancing, do it here and not earlier because:
 	 *  - cpus_allowed can change in the fork path
 	 *  - any previously selected cpu might disappear through hotplug
 	 */
 	set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
 	/* Initialize new task's runnable average */
 	init_task_runnable_average(p);
 	rq = __task_rq_lock(p);
 	activate_task(rq, p, 0);
 	p->on_rq = 1;
 	trace_sched_wakeup_new(p, true);
 	check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_woken)
 		p->sched_class->task_woken(rq, p);
 #endif
 	/*
 	 * NOTE(review): presumably drops both rq->lock and p->pi_lock and
 	 * restores @flags - confirm against task_rq_unlock().
 	 */
 	task_rq_unlock(rq, p, &flags);
 }
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 /**
  * preempt_notifier_register - tell me when current is being preempted & rescheduled
  * @notifier: notifier struct to register
  */
 void preempt_notifier_register(struct preempt_notifier *notifier)
 {
 	struct hlist_head *head = &current->preempt_notifiers;

 	/* The notifier goes on the current task's own list. */
 	hlist_add_head(&notifier->link, head);
 }
 EXPORT_SYMBOL_GPL(preempt_notifier_register);
 /**
  * preempt_notifier_unregister - no longer interested in preemption notifications
  * @notifier: notifier struct to unregister
  *
  * This is safe to call from within a preemption notifier.
  */
 void preempt_notifier_unregister(struct preempt_notifier *notifier)
 {
 	struct hlist_node *node = &notifier->link;

 	/* Unlink the notifier from whichever list it is on. */
 	hlist_del(node);
 }
 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
 {
 	struct preempt_notifier *notifier;
 	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
 		notifier->ops->sched_in(notifier, raw_smp_processor_id());
 }
 static void
 fire_sched_out_preempt_notifiers(struct task_struct *curr,
 				 struct task_struct *next)
 {
 	struct preempt_notifier *notifier;
 	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
 		notifier->ops->sched_out(notifier, next);
 }
 #else /* !CONFIG_PREEMPT_NOTIFIERS */
 /* No-op stub used when CONFIG_PREEMPT_NOTIFIERS is disabled. */
 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
 {
 }
 /* No-op stub used when CONFIG_PREEMPT_NOTIFIERS is disabled. */
 static void
 fire_sched_out_preempt_notifiers(struct task_struct *curr,
 				 struct task_struct *next)
 {
 }
 #endif /* CONFIG_PREEMPT_NOTIFIERS */
 /**
  * prepare_task_switch - prepare to switch tasks
  * @rq: the runqueue preparing to switch
  * @prev: the current task that is being switched out
  * @next: the task we are going to switch to.
  *
  * This is called with the rq lock held and interrupts off. It must
  * be paired with a subsequent finish_task_switch after the context
  * switch.
  *
  * prepare_task_switch sets up locking and calls architecture specific
  * hooks.
  */
 static inline void
 prepare_task_switch(struct rq *rq, struct task_struct *prev,
 		    struct task_struct *next)
 {
 	trace_sched_switch(prev, next);
 	sched_info_switch(rq, prev, next);
 	perf_event_task_sched_out(prev, next);
 	fire_sched_out_preempt_notifiers(prev, next);
 	prepare_lock_switch(rq, next);
 	prepare_arch_switch(next);
 }
 /**
  * finish_task_switch - clean up after a task-switch
  * @rq: runqueue associated with task-switch
  * @prev: the thread we just switched away from.
  *
  * finish_task_switch must be called after the context switch, paired
  * with a prepare_task_switch call before the context switch.
  * finish_task_switch will reconcile locking set up by prepare_task_switch,
  * and do any other architecture-specific cleanup actions.
  *
  * Note that we may have delayed dropping an mm in context_switch(). If
  * so, we finish that here outside of the runqueue lock. (Doing it
  * with the lock held can cause deadlocks; see schedule() for
  * details.)
  */
 static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	__releases(rq->lock)
 {
 	/* Pick up the mm whose drop context_switch() deferred, if any. */
 	struct mm_struct *mm = rq->prev_mm;
 	long prev_state;
 	rq->prev_mm = NULL;
 	/*
 	 * A task struct has one reference for the use as "current".
 	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
 	 * schedule one last time. The schedule call will never return, and
 	 * the scheduled task must drop that reference.
 	 * The test for TASK_DEAD must occur while the runqueue locks are
 	 * still held, otherwise prev could be scheduled on another cpu, die
 	 * there before we look at prev->state, and then the reference would
 	 * be dropped twice.
 	 *		Manfred Spraul <manfred@colorfullife.com>
 	 */
 	prev_state = prev->state;
 	vtime_task_switch(prev);
 	finish_arch_switch(prev);
 	perf_event_task_sched_in(prev, current);
 	finish_lock_switch(rq, prev);
 	finish_arch_post_lock_switch();
 	fire_sched_in_preempt_notifiers(current);
 	/* The deferred mm drop happens here, after finish_lock_switch(). */
 	if (mm)
 		mmdrop(mm);
 	/* Uses the state sampled above, not a fresh read of prev->state. */
 	if (unlikely(prev_state == TASK_DEAD)) {
 		if (prev->sched_class->task_dead)
 			prev->sched_class->task_dead(prev);
 		/*
 		 * Remove function-return probe instances associated with this
 		 * task and put them back on the free list.
 		 */
 		kprobe_flush_task(prev);
 		put_task_struct(prev);
 	}
 	tick_nohz_task_switch(current);
 }
 #ifdef CONFIG_SMP
 /* rq->lock is NOT held, but preemption is disabled */
 static inline void post_schedule(struct rq *rq)
 {
 	unsigned long flags;

 	/* Nothing was requested for this runqueue. */
 	if (!rq->post_schedule)
 		return;

 	/* The class hook, when present, runs under rq->lock with IRQs off. */
 	raw_spin_lock_irqsave(&rq->lock, flags);
 	if (rq->curr->sched_class->post_schedule)
 		rq->curr->sched_class->post_schedule(rq);
 	raw_spin_unlock_irqrestore(&rq->lock, flags);

 	rq->post_schedule = 0;
 }
 #else
 /* No-op stub used when CONFIG_SMP is disabled. */
 static inline void post_schedule(struct rq *rq)
 {
 }
 #endif
 /**
  * schedule_tail - first thing a freshly forked thread must call.
  * @prev: the thread we just switched away from.
  */
 asmlinkage void schedule_tail(struct task_struct *prev)
 	__releases(rq->lock)
 {
 	struct rq *rq = this_rq();
 	/* Complete the context switch that brought us onto the CPU. */
 	finish_task_switch(rq, prev);
 	/*
 	 * FIXME: do we need to worry about rq being invalidated by the
 	 * task_switch?
 	 */
 	post_schedule(rq);
 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
 	/* In this case, finish_task_switch does not reenable preemption */
 	preempt_enable();
 #endif
 	/*
 	 * If a set_child_tid address is recorded for this task, store its
 	 * pid there.  The put_user() result is not checked.
 	 */
 	if (current->set_child_tid)
 		put_user(task_pid_vnr(current), current->set_child_tid);
 }
 /*
  * context_switch - switch to the new MM and the new
  * thread's register state.
  */
 static inline void
 context_switch(struct rq *rq, struct task_struct *prev,
 	       struct task_struct *next)
 {
 	struct mm_struct *mm, *oldmm;
 	prepare_task_switch(rq, prev, next);
 	mm = next->mm;
 	oldmm = prev->active_mm;
 	/*
 	 * For paravirt, this is coupled with an exit in switch_to to
 	 * combine the page table reload and the switch backend into
 	 * one hypercall.
 	 */
 	arch_start_context_switch(prev);
 	/*
 	 * A next task without an mm of its own borrows prev's active mm
 	 * (taking an mm_count reference) and enters lazy TLB mode;
 	 * otherwise switch to next's mm.
 	 */
 	if (!mm) {
 		next->active_mm = oldmm;
 		atomic_inc(&oldmm->mm_count);
 		enter_lazy_tlb(oldmm, next);
 	} else
 		switch_mm(oldmm, mm, next);
 	/*
 	 * A prev task without an mm of its own gives up the one it was
 	 * using; it is parked in rq->prev_mm so finish_task_switch() can
 	 * drop it.
 	 */
 	if (!prev->mm) {
 		prev->active_mm = NULL;
 		rq->prev_mm = oldmm;
 	}
 	/*
 	 * Since the runqueue lock will be released by the next
 	 * task (which is an invalid locking op but in the case
 	 * of the scheduler it's an obvious special-case), so we
 	 * do an early lockdep release here:
 	 */
 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
 	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
 #endif
 	context_tracking_task_switch(prev, next);
 	/* Here we just switch the register state and the stack. */
 	switch_to(prev, next, prev);
 	barrier();
 	/*
 	 * this_rq must be evaluated again because prev may have moved
 	 * CPUs since it called schedule(), thus the 'rq' on its stack
 	 * frame will be invalid.
 	 */
 	finish_task_switch(this_rq(), prev);
 }
 /*
  * nr_running and nr_context_switches:
  *
  * externally visible scheduler statistics: current number of runnable
  * threads, total number of context switches performed since bootup.
  */
 unsigned long nr_running(void)
 {
 	unsigned long i, sum = 0;
 	for_each_online_cpu(i)
 		sum += cpu_rq(i)->nr_running;
 	return sum;
 }
 unsigned long long nr_context_switches(void)
 {
 	int i;
 	unsigned long long sum = 0;
 	for_each_possible_cpu(i)
 		sum += cpu_rq(i)->nr_switches;
 	return sum;
 }
 unsigned long nr_iowait(void)
 {
 	unsigned long i, sum = 0;
 	for_each_possible_cpu(i)
 		sum += atomic_read(&cpu_rq(i)->nr_iowait);
 	return sum;
 }
 unsigned long nr_iowait_cpu(int cpu)
 {
 	struct rq *this = cpu_rq(cpu);
 	return atomic_read(&this->nr_iowait);
 }
 #ifdef CONFIG_SMP
 /*
  * sched_exec - execve() is a valuable balancing opportunity, because at
  * this point the task has the smallest effective memory and cache footprint.
  */
 void sched_exec(void)
 {
 	struct task_struct *p = current;
 	unsigned long flags;
 	int dest_cpu;

 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);

 	/*
 	 * Migrate only when the class picked a different CPU and that CPU
 	 * is active.  pi_lock is dropped before the stopper is invoked.
 	 */
 	if (dest_cpu != smp_processor_id() && likely(cpu_active(dest_cpu))) {
 		struct migration_arg arg = { p, dest_cpu };

 		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 		stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
 		return;
 	}

 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 }
 #endif
 DEFINE_PER_CPU(struct kernel_stat, kstat);
 DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
 EXPORT_PER_CPU_SYMBOL(kstat);
 EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
 /*
  * Return any ns on the sched_clock that have not yet been accounted in
  * @p in case that task is currently running.
  *
  * Called with task_rq_lock() held on @rq.
  */
 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
 {
 	u64 ns = 0;
 	if (task_current(rq, p)) {
 		update_rq_clock(rq);
 		ns = rq_clock_task(rq) - p->se.exec_start;
 		if ((s64)ns < 0)
 			ns = 0;
 	}
 	return ns;
 }
 /*
  * Return the not-yet-accounted runtime of @p in ns, sampled under
  * task_rq_lock().
  */
 unsigned long long task_delta_exec(struct task_struct *p)
 {
 	unsigned long irq_flags;
 	struct rq *rq = task_rq_lock(p, &irq_flags);
 	u64 pending = do_task_delta_exec(p, rq);

 	task_rq_unlock(rq, p, &irq_flags);
 	return pending;
 }
 /*
  * Return accounted runtime for the task.
  * In case the task is currently running, return the runtime plus current's
  * pending runtime that have not been accounted yet.
  */
 unsigned long long task_sched_runtime(struct task_struct *p)
 {
 	unsigned long flags;
 	struct rq *rq;
 	u64 ns = 0;
 #if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
 	/*
 	 * 64-bit doesn't need locks to atomically read a 64bit value.
 	 * So we have an optimization chance when the task's delta_exec is 0.
 	 * Reading ->on_cpu is racy, but this is ok.
 	 *
 	 * If we race with it leaving cpu, we'll take a lock. So we're correct.
 	 * If we race with it entering cpu, unaccounted time is 0. This is
 	 * indistinguishable from the read occurring a few cycles earlier.
 	 */
 	if (!p->on_cpu)
 		return p->se.sum_exec_runtime;
 #endif
 	/* Slow path: accounted runtime plus pending delta, under the rq lock. */
 	rq = task_rq_lock(p, &flags);
 	ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
 	task_rq_unlock(rq, p, &flags);
 	return ns;
 }
 /*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
  */
 void scheduler_tick(void)
 {
 	int cpu = smp_processor_id();
 	struct rq *rq = cpu_rq(cpu);
 	struct task_struct *curr = rq->curr;
 	sched_clock_tick();
 	/*
 	 * Under rq->lock: refresh the rq clock, let the current task's
 	 * scheduling class handle the tick, and update the CPU load.
 	 */
 	raw_spin_lock(&rq->lock);
 	update_rq_clock(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
 	update_cpu_load_active(rq);
 	raw_spin_unlock(&rq->lock);
 	perf_event_task_tick();
 #ifdef CONFIG_SMP
 	/* Record whether this CPU is idle, then maybe kick load balancing. */
 	rq->idle_balance = idle_cpu(cpu);
 	trigger_load_balance(rq);
 #endif
 	/*
 	 * NOTE(review): presumably refreshes rq->last_sched_tick, which
 	 * scheduler_tick_max_deferment() reads - confirm.
 	 */
 	rq_last_tick_reset(rq);
 }
 #ifdef CONFIG_NO_HZ_FULL
 /**
  * scheduler_tick_max_deferment
  *
  * Keep at least one tick per second when a single
  * active task is running because the scheduler doesn't
  * yet completely support full dynticks environment.
  *
  * This makes sure that uptime, CFS vruntime, load
  * balancing, etc... continue to move forward, even
  * with a very low granularity.
  *
  * Return: Maximum deferment in nanoseconds.
  */
 u64 scheduler_tick_max_deferment(void)
 {
 	struct rq *rq = this_rq();
 	unsigned long now = ACCESS_ONCE(jiffies);
 	/* The next tick is due HZ jiffies after the last one on this rq. */
 	unsigned long deadline = rq->last_sched_tick + HZ;

 	if (!time_before_eq(deadline, now))
 		return jiffies_to_nsecs(deadline - now);

 	/* Already due: no deferment allowed. */
 	return 0;
 }
 #endif
 /*
  * Return @addr unless it lies inside the lock functions; in that case
  * return CALLER_ADDR2, or CALLER_ADDR3 if that is inside them as well.
  */
 notrace unsigned long get_parent_ip(unsigned long addr)
 {
 	if (!in_lock_functions(addr))
 		return addr;

 	addr = CALLER_ADDR2;
 	if (in_lock_functions(addr))
 		addr = CALLER_ADDR3;

 	return addr;
 }
 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
 				defined(CONFIG_PREEMPT_TRACER))
 /*
  * Out-of-line preempt_count increment, built when CONFIG_PREEMPT is
  * combined with CONFIG_DEBUG_PREEMPT or CONFIG_PREEMPT_TRACER.  Adds @val
  * to the preempt count with optional sanity checks and emits the
  * preempt-off trace event when the count rises from zero.
  */
 void __kprobes preempt_count_add(int val)
 {
 #ifdef CONFIG_DEBUG_PREEMPT
 	/*
 	 * Underflow?
 	 */
 	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
 		return;
 #endif
 	__preempt_count_add(val);
 #ifdef CONFIG_DEBUG_PREEMPT
 	/*
 	 * Spinlock count overflowing soon?
 	 */
 	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
 				PREEMPT_MASK - 10);
 #endif
 	/* Count equals @val after the add, i.e. it was zero before. */
 	if (preempt_count() == val) {
 		unsigned long ip = get_parent_ip(CALLER_ADDR1);
 #ifdef CONFIG_DEBUG_PREEMPT
 		/* Recorded so __schedule_bug() can print it later. */
 		current->preempt_disable_ip = ip;
 #endif
 		trace_preempt_off(CALLER_ADDR0, ip);
 	}
 }
 EXPORT_SYMBOL(preempt_count_add);
 /*
  * Counterpart of preempt_count_add(): subtracts @val from the preempt
  * count with optional sanity checks and emits the preempt-on trace event
  * when the count is about to drop to zero.
  */
 void __kprobes preempt_count_sub(int val)
 {
 #ifdef CONFIG_DEBUG_PREEMPT
 	/*
 	 * Underflow?
 	 */
 	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
 		return;
 	/*
 	 * Is the spinlock portion underflowing?
 	 */
 	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
 			!(preempt_count() & PREEMPT_MASK)))
 		return;
 #endif
 	/* Checked before the subtraction: the count will reach zero. */
 	if (preempt_count() == val)
 		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 	__preempt_count_sub(val);
 }
 EXPORT_SYMBOL(preempt_count_sub);
 #endif
 /*
  * Print scheduling while atomic bug:
  */
 static noinline void __schedule_bug(struct task_struct *prev)
 {
 	/* Stay quiet while an oops is already being reported. */
 	if (oops_in_progress)
 		return;
 	/* Task name, pid and the raw preempt count in hex. */
 	printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
 		prev->comm, prev->pid, preempt_count());
 	debug_show_held_locks(prev);
 	print_modules();
 	if (irqs_disabled())
 		print_irqtrace_events(prev);
 #ifdef CONFIG_DEBUG_PREEMPT
 	/*
 	 * Print the address stored in current->preempt_disable_ip
 	 * (written by preempt_count_add()).
 	 */
 	if (in_atomic_preempt_off()) {
 		pr_err("Preemption disabled at:");
 		print_ip_sym(current->preempt_disable_ip);
 		pr_cont("\n");
 	}
 #endif
 	dump_stack();
 	/* Taint the kernel with TAINT_WARN. */
 	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 }
 /*
  * Various schedule()-time debugging checks and statistics:
  */
 static inline void schedule_debug(struct task_struct *prev)
 {
 	/*
 	 * Test if we are atomic. Since do_exit() needs to call into
 	 * schedule() atomically, we ignore that path (prev->state ==
 	 * TASK_DEAD). Otherwise whine if we are scheduling when we
 	 * should not.
 	 */
 	if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
 		__schedule_bug(prev);
 	rcu_sleep_check();
 	/* Account this call for SCHED_PROFILING against our return address. */
 	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
 	/* Bump the sched_count statistic of this CPU's runqueue. */
 	schedstat_inc(this_rq(), sched_count);
 }
 /*
  * Pick up the highest-prio task:
  */
 static inline struct task_struct *
 pick_next_task(struct rq *rq, struct task_struct *prev)
 {
 	const struct sched_class *class = &fair_sched_class;
 	struct task_struct *p;
 	/*
 	 * Optimization: we know that if all tasks are in
 	 * the fair class we can call that function directly.
 	 * "All tasks are fair" is detected by prev being a fair task and
 	 * the runqueue's total count matching the CFS hierarchical count.
 	 */
 	if (likely(prev->sched_class == class &&
 		   rq->nr_running == rq->cfs.h_nr_running)) {
 		p = fair_sched_class.pick_next_task(rq, prev);
 		/* RETRY_TASK: fall back to the full class walk below. */
 		if (unlikely(p == RETRY_TASK))
 			goto again;
 		/* assumes fair_sched_class->next == idle_sched_class */
 		if (unlikely(!p))
 			p = idle_sched_class.pick_next_task(rq, prev);
 		return p;
 	}
 again:
 	/* Ask each class in for_each_class() order; first task wins. */
 	for_each_class(class) {
 		p = class->pick_next_task(rq, prev);
 		if (p) {
 			/* A class asked for a restart of the whole walk. */
 			if (unlikely(p == RETRY_TASK))
 				goto again;
 			return p;
 		}
 	}
 	BUG(); /* the idle class will always have a runnable task */
 }
 /*
  * __schedule() is the main scheduler function.
  *
  * The main means of driving the scheduler and thus entering this function are:
  *
  *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
  *
  *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
  *      paths. For example, see arch/x86/entry_64.S.
  *
  *      To drive preemption between tasks, the scheduler sets the flag in timer
  *      interrupt handler scheduler_tick().
  *
  *   3. Wakeups don't really cause entry into schedule(). They add a
  *      task to the run-queue and that's it.
  *
  *      Now, if the new task added to the run-queue preempts the current
  *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
  *      called on the nearest possible occasion:
  *
  *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
  *
  *         - in syscall or exception context, at the next outmost
  *           preempt_enable(). (this might be as soon as the wake_up()'s
  *           spin_unlock()!)
  *
  *         - in IRQ context, return from interrupt-handler to
  *           preemptible context
  *
  *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
  *         then at the next:
  *
  *          - cond_resched() call
  *          - explicit schedule() call
  *          - return from syscall or exception to user-space
  *          - return from interrupt-handler to user-space
  */
 static void __sched __schedule(void)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
 	struct rq *rq;
 	int cpu;
 need_resched:
 	preempt_disable();
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
 	rcu_note_context_switch(cpu);
 	prev = rq->curr;
 	schedule_debug(prev);
 	if (sched_feat(HRTICK))
 		hrtick_clear(rq);
 	/*
 	 * Make sure that signal_pending_state()->signal_pending() below
 	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
 	 * done by the caller to avoid the race with signal_wake_up().
 	 */
 	smp_mb__before_spinlock();
 	raw_spin_lock_irq(&rq->lock);
 	/*
 	 * The switch is counted in ->nivcsw unless prev is blocking
 	 * (non-zero state and not a PREEMPT_ACTIVE entry), in which case
 	 * ->nvcsw is used instead.
 	 */
 	switch_count = &prev->nivcsw;
 	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
 		if (unlikely(signal_pending_state(prev->state, prev))) {
 			/* A signal is pending: keep the task runnable. */
 			prev->state = TASK_RUNNING;
 		} else {
 			deactivate_task(rq, prev, DEQUEUE_SLEEP);
 			prev->on_rq = 0;
 			/*
 			 * If a worker went to sleep, notify and ask workqueue
 			 * whether it wants to wake up a task to maintain
 			 * concurrency.
 			 */
 			if (prev->flags & PF_WQ_WORKER) {
 				struct task_struct *to_wakeup;
 				to_wakeup = wq_worker_sleeping(prev, cpu);
 				if (to_wakeup)
 					try_to_wake_up_local(to_wakeup);
 			}
 		}
 		switch_count = &prev->nvcsw;
 	}
 	if (prev->on_rq || rq->skip_clock_update < 0)
 		update_rq_clock(rq);
 	next = pick_next_task(rq, prev);
 	/* The reschedule request is being served now; clear both flags. */
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
 	rq->skip_clock_update = 0;
 	if (likely(prev != next)) {
 		rq->nr_switches++;
 		rq->curr = next;
 		++*switch_count;
 		context_switch(rq, prev, next); /* unlocks the rq */
 		/*
 		 * The context switch has flipped the stack from under us
 		 * and restored the local variables which were saved when
 		 * this task called schedule() in the past. prev == current
 		 * is still correct, but it can be moved to another cpu/rq.
 		 */
 		cpu = smp_processor_id();
 		rq = cpu_rq(cpu);
 	} else
 		raw_spin_unlock_irq(&rq->lock);
 	post_schedule(rq);
 	sched_preempt_enable_no_resched();
 	/* A new request may have arrived meanwhile: go around again. */
 	if (need_resched())
 		goto need_resched;
 }
 /*
  * sched_submit_work - flush plugged block IO of @tsk before it sleeps.
  * @tsk: task that is about to call into the scheduler.
  */
 static inline void sched_submit_work(struct task_struct *tsk)
 {
 	/*
 	 * Only act for a task that is going to sleep (non-zero state) and
 	 * is not blocked on a PI lock. If it has plugged IO queued, submit
 	 * it now to avoid deadlocks.
 	 */
 	if (tsk->state && !tsk_is_pi_blocked(tsk) &&
 	    blk_needs_flush_plug(tsk))
 		blk_schedule_flush_plug(tsk);
 }
 /*
  * schedule - public entry point into the scheduler for the current task.
  */
 asmlinkage void __sched schedule(void)
 {
 	/* Give sched_submit_work() its chance on the calling task first ... */
 	sched_submit_work(current);
 	/* ... then enter the scheduler proper. */
 	__schedule();
 }
 EXPORT_SYMBOL(schedule);
 #ifdef CONFIG_CONTEXT_TRACKING
 /* schedule() wrapped in user_exit()/user_enter() (CONFIG_CONTEXT_TRACKING). */
 asmlinkage void __sched schedule_user(void)
 {
 	/*
 	 * If we come here after a random call to set_need_resched(),
 	 * or we have been woken up remotely but the IPI has not yet arrived,
 	 * we haven't yet exited the RCU idle mode. Do it here manually until
 	 * we find a better solution.
 	 */
 	user_exit();
 	schedule();
 	/* Undo the user_exit() above once schedule() has returned. */
 	user_enter();
 }
 #endif
 /**
  * schedule_preempt_disabled - called with preemption disabled
  *
  * Returns with preemption disabled. Note: preempt_count must be 1
  */
 void __sched schedule_preempt_disabled(void)
 {
 	/*
 	 * Drop the caller's preempt-disable without triggering a
 	 * reschedule check; schedule() is called right below anyway.
 	 */
 	sched_preempt_enable_no_resched();
 	schedule();
 	/* Re-establish the preempt-disabled state the caller expects. */
 	preempt_disable();
 }
 #ifdef CONFIG_PREEMPT
 /*
  * this is the entry point to schedule() from in-kernel preemption
  * off of preempt_enable. Kernel preemptions off return from interrupt
  * occur there and call schedule directly.
  */
 asmlinkage void __sched notrace preempt_schedule(void)
 {
 	/*
 	 * With a non-zero preempt_count or interrupts disabled the current
 	 * task must not be preempted, so there is nothing to do.
 	 */
 	if (likely(!preemptible()))
 		return;
 	for (;;) {
 		/*
 		 * __schedule() tests PREEMPT_ACTIVE to tell this entry
 		 * apart from a task that is blocking on its own.
 		 */
 		__preempt_count_add(PREEMPT_ACTIVE);
 		__schedule();
 		__preempt_count_sub(PREEMPT_ACTIVE);
 		/*
 		 * A preemption opportunity may have been missed between
 		 * __schedule() and this point: look again.
 		 */
 		barrier();
 		if (!need_resched())
 			break;
 	}
 }
 EXPORT_SYMBOL(preempt_schedule);
 #endif /* CONFIG_PREEMPT */
 /*
  * this is the entry point to schedule() from kernel preemption
  * off of irq context.
  * Note, that this is called and return with irqs disabled. This will
  * protect us against recursive calling from irq.
  */
 asmlinkage void __sched preempt_schedule_irq(void)
 {
 	enum ctx_state prev_state;
 	/*
 	 * Callers must arrive with a zero preempt count and interrupts
 	 * disabled; anything else is a caller bug that needs fixing.
 	 */
 	BUG_ON(preempt_count() || !irqs_disabled());
 	prev_state = exception_enter();
 	for (;;) {
 		__preempt_count_add(PREEMPT_ACTIVE);
 		/* Interrupts are enabled only around the __schedule() call. */
 		local_irq_enable();
 		__schedule();
 		local_irq_disable();
 		__preempt_count_sub(PREEMPT_ACTIVE);
 		/*
 		 * A preemption opportunity may have been missed between
 		 * __schedule() and this point: look again.
 		 */
 		barrier();
 		if (!need_resched())
 			break;
 	}
 	exception_exit(prev_state);
 }
 /*
  * default_wake_function - wake the task stored in a wait queue entry.
  * @curr: wait queue entry; its ->private field is passed as the task.
  * @mode: passed through to try_to_wake_up().
  * @wake_flags: passed through to try_to_wake_up().
  * @key: not used here.
  *
  * Return: whatever try_to_wake_up() returns.
  */
 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
 			  void *key)
 {
 	struct task_struct *task = curr->private;
 	return try_to_wake_up(task, mode, wake_flags);
 }
 EXPORT_SYMBOL(default_wake_function);
 #ifdef CONFIG_RT_MUTEXES
 /*
  * rt_mutex_setprio - set the current priority of a task
  * @p: task
  * @prio: prio value (kernel-internal form)
  *
  * This function changes the 'effective' priority of a task. It does
  * not touch ->normal_prio like __setscheduler().
  *
  * Used by the rt_mutex code to implement priority inheritance
  * logic. Call site only calls if the priority of the task changed.
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
 	int oldprio, on_rq, running, enqueue_flag = 0;
 	struct rq *rq;
 	const struct sched_class *prev_class;
 	BUG_ON(prio > MAX_PRIO);
 	rq = __task_rq_lock(p);
 	/*
 	 * Idle task boosting is a nono in general. There is one
 	 * exception, when PREEMPT_RT and NOHZ is active:
 	 *
 	 * The idle task calls get_next_timer_interrupt() and holds
 	 * the timer wheel base->lock on the CPU and another CPU wants
 	 * to access the timer (probably to cancel it). We can safely
 	 * ignore the boosting request, as the idle CPU runs this code
 	 * with interrupts disabled and will complete the lock
 	 * protected section without being interrupted. So there is no
 	 * real need to boost.
 	 */
 	if (unlikely(p == rq->idle)) {
 		WARN_ON(p != rq->curr);
 		WARN_ON(p->pi_blocked_on);
 		goto out_unlock;
 	}
 	trace_sched_pi_setprio(p, prio);
 	p->pi_top_task = rt_mutex_get_top_task(p);
 	/* Snapshot the old state; check_class_changed() needs it below. */
 	oldprio = p->prio;
 	prev_class = p->sched_class;
 	on_rq = p->on_rq;
 	running = task_current(rq, p);
 	/*
 	 * Take the task off the runqueue (and put it as prev if it is the
 	 * running task) while prio and class are changed; it is put back
 	 * further down.
 	 */
 	if (on_rq)
 		dequeue_task(rq, p, 0);
 	if (running)
 		p->sched_class->put_prev_task(rq, p);
 	/*
 	 * Boosting conditions are:
 	 * 1. -rt task is running and holds mutex A
 	 *      --> -dl task blocks on mutex A
 	 *
 	 * 2. -dl task is running and holds mutex A
 	 *      --> -dl task blocks on mutex A and could preempt the
 	 *          running task
 	 */
 	if (dl_prio(prio)) {
 		if (!dl_prio(p->normal_prio) || (p->pi_top_task &&
 			dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) {
 			p->dl.dl_boosted = 1;
 			p->dl.dl_throttled = 0;
 			enqueue_flag = ENQUEUE_REPLENISH;
 		} else
 			p->dl.dl_boosted = 0;
 		p->sched_class = &dl_sched_class;
 	} else if (rt_prio(prio)) {
 		/* Leaving the deadline range: drop the boosted marker. */
 		if (dl_prio(oldprio))
 			p->dl.dl_boosted = 0;
 		/* New prio value is numerically larger: requeue at the head. */
 		if (oldprio < prio)
 			enqueue_flag = ENQUEUE_HEAD;
 		p->sched_class = &rt_sched_class;
 	} else {
 		if (dl_prio(oldprio))
 			p->dl.dl_boosted = 0;
 		p->sched_class = &fair_sched_class;
 	}
 	p->prio = prio;
 	/* Undo the put_prev/dequeue above, now under the new class. */
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (on_rq)
 		enqueue_task(rq, p, enqueue_flag);
 	check_class_changed(rq, p, prev_class, oldprio);
 out_unlock:
 	__task_rq_unlock(rq);
 }
 #endif
 /*
  * set_user_nice - change the nice value of a task.
  * @p: task to change.
  * @nice: new nice value; values outside [MIN_NICE, MAX_NICE] or equal to
  *        the current nice value make the call a no-op.
  */
 void set_user_nice(struct task_struct *p, long nice)
 {
 	int old_prio, delta, on_rq;
 	unsigned long flags;
 	struct rq *rq;
 	if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
 		return;
 	/*
 	 * We have to be careful, if called from sys_setpriority(),
 	 * the task might be in the middle of scheduling on another CPU.
 	 */
 	rq = task_rq_lock(p, &flags);
 	/*
 	 * The RT priorities are set via sched_setscheduler(), but we still
 	 * allow the 'normal' nice value to be set - but as expected
 	 * it won't have any effect on scheduling while the task is
 	 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
 	 */
 	if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
 		p->static_prio = NICE_TO_PRIO(nice);
 		goto out_unlock;
 	}
 	/* Take a queued task off its runqueue while its weight changes. */
 	on_rq = p->on_rq;
 	if (on_rq)
 		dequeue_task(rq, p, 0);
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
 	old_prio = p->prio;
 	p->prio = effective_prio(p);
 	/* delta < 0: prio value went down; delta > 0: it went up. */
 	delta = p->prio - old_prio;
 	if (on_rq) {
 		enqueue_task(rq, p, 0);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
 		 */
 		if (delta < 0 || (delta > 0 && task_running(rq, p)))
 			resched_task(rq->curr);
 	}
 out_unlock:
 	task_rq_unlock(rq, p, &flags);
 }
 EXPORT_SYMBOL(set_user_nice);
 /*
  * can_nice - check if a task can reduce its nice value
  * @p: task
  * @nice: nice value
  */
 int can_nice(const struct task_struct *p, const int nice)
 {
 	/* map the nice range [19,-20] onto the rlimit scale [1,40] */
 	int nice_rlim = 20 - nice;
 	/* Within the task's RLIMIT_NICE: always allowed. */
 	if (nice_rlim <= task_rlimit(p, RLIMIT_NICE))
 		return 1;
 	/* Beyond the limit only CAP_SYS_NICE helps. */
 	return capable(CAP_SYS_NICE);
 }
 #ifdef __ARCH_WANT_SYS_NICE
 /*
  * sys_nice - change the priority of the current process.
  * @increment: priority increment
  *
  * sys_setpriority is a more generic, but much slower function that
  * does similar things.
  */
 SYSCALL_DEFINE1(nice, int, increment)
 {
 	long nice, retval;
 	/*
 	 * setpriority() may change our priority concurrently. That needs
 	 * no special care: conceptually one of the two calls happens
 	 * first and there is a single winner.
 	 */
 	/* Bound the requested step to [-40, 40]. */
 	if (increment < -40)
 		increment = -40;
 	else if (increment > 40)
 		increment = 40;
 	/* Apply it and pin the result into [MIN_NICE, MAX_NICE]. */
 	nice = task_nice(current) + increment;
 	if (nice < MIN_NICE)
 		nice = MIN_NICE;
 	else if (nice > MAX_NICE)
 		nice = MAX_NICE;
 	/* A negative increment is subject to the can_nice() check. */
 	if (increment < 0 && !can_nice(current, nice))
 		return -EPERM;
 	retval = security_task_setnice(current, nice);
 	if (!retval)
 		set_user_nice(current, nice);
 	return retval;
 }
 #endif
 /**
  * task_prio - return the priority value of a given task.
  * @p: the task in question.
  *
  * Return: The priority value as seen by users in /proc.
  * RT tasks are offset by -200. Normal tasks are centered
  * around 0, value goes from -16 to +15.
  */
 int task_prio(const struct task_struct *p)
 {
 	/* Kernel-internal priority, shifted down by MAX_RT_PRIO. */
 	int prio = p->prio;
 	return prio - MAX_RT_PRIO;
 }
 /**
  * idle_cpu - is a given cpu idle currently?
  * @cpu: the processor in question.
  *
  * Return: 1 if the CPU is currently idle. 0 otherwise.
  */
 int idle_cpu(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	/* Not idle if another task is current or anything is queued. */
 	if (rq->curr != rq->idle || rq->nr_running)
 		return 0;
 #ifdef CONFIG_SMP
 	/* Entries still sitting on the wake list also count as work. */
 	if (!llist_empty(&rq->wake_list))
 		return 0;
 #endif
 	return 1;
 }
 /**
  * idle_task - return the idle task for a given cpu.
  * @cpu: the processor in question.
  *
  * Return: The idle task for the cpu @cpu.
  */
 struct task_struct *idle_task(int cpu)
 {
 	return cpu_rq(cpu)->idle;
 }
 /**
  * find_process_by_pid - find a process with a matching PID value.
  * @pid: the pid in question.
  *
  * Return: The task of @pid, if found (the current task when @pid is 0).
  * %NULL otherwise.
  */
 static struct task_struct *find_process_by_pid(pid_t pid)
 {
 	/* A pid of 0 is shorthand for the calling task. */
 	if (!pid)
 		return current;
 	return find_task_by_vpid(pid);
 }
 /*
  * This function initializes the sched_dl_entity of a newly becoming
  * SCHED_DEADLINE task.
  *
  * Only the static values are considered here, the actual runtime and the
  * absolute deadline will be properly calculated when the task is enqueued
  * for the first time with its new policy.
  */
 static void
 __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
 {
 	struct sched_dl_entity *dl_se = &p->dl;
 	init_dl_task_timer(dl_se);
 	/* Static parameters, taken over from the caller's attributes. */
 	dl_se->dl_runtime = attr->sched_runtime;
 	dl_se->dl_deadline = attr->sched_deadline;
 	/* A period of zero falls back to the relative deadline. */
 	if (attr->sched_period)
 		dl_se->dl_period = attr->sched_period;
 	else
 		dl_se->dl_period = dl_se->dl_deadline;
 	dl_se->flags = attr->sched_flags;
 	/* Bandwidth as computed by to_ratio(period, runtime). */
 	dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
 	/* Start from a clean dynamic state, marked as new. */
 	dl_se->dl_throttled = 0;
 	dl_se->dl_new = 1;
 	dl_se->dl_yielded = 0;
 }
 /*
  * __setscheduler_params - store the new policy and parameters in @p.
  * @p: task to update.
  * @attr: requested attributes; sched_policy == -1 keeps the current policy.
  *
  * Only the parameters (policy, DL parameters or static_prio, rt_priority,
  * normal_prio, load weight) are written; p->prio and p->sched_class are
  * not touched here.
  */
 static void __setscheduler_params(struct task_struct *p,
 		const struct sched_attr *attr)
 {
 	int policy = attr->sched_policy;
 	if (policy == -1) /* setparam */
 		policy = p->policy;
 	p->policy = policy;
 	if (dl_policy(policy))
 		__setparam_dl(p, attr);
 	else if (fair_policy(policy))
 		p->static_prio = NICE_TO_PRIO(attr->sched_nice);
 	/*
 	 * __sched_setscheduler() ensures attr->sched_priority == 0 when
 	 * !rt_policy. Always setting this ensures that things like
 	 * getparam()/getattr() don't report silly values for !rt tasks.
 	 */
 	p->rt_priority = attr->sched_priority;
 	/* Derived values are recomputed from the fields written above. */
 	p->normal_prio = normal_prio(p);
 	set_load_weight(p);
 }
 /* Actually do priority change: must hold pi & rq lock. */
 static void __setscheduler(struct rq *rq, struct task_struct *p,
 			   const struct sched_attr *attr)
 {
 	__setscheduler_params(p, attr);
 	/*
 	 * If we get here, there was no pi waiters boosting the
 	 * task. It is safe to use the normal prio.
 	 */
 	p->prio = normal_prio(p);
 	if (dl_prio(p->prio))
 		p->sched_class = &dl_sched_class;
 	else if (rt_prio(p->prio))
 		p->sched_class = &rt_sched_class;
 	else
 		p->sched_class = &fair_sched_class;
 }
 /*
  * __getparam_dl - report the deadline parameters of @p through @attr.
  * @p: task to read from.
  * @attr: attribute structure to fill in.
  */
 static void
 __getparam_dl(struct task_struct *p, struct sched_attr *attr)
 {
 	struct sched_dl_entity *dl_se = &p->dl;
 	/* Deadline parameters as stored in the task's dl entity. */
 	attr->sched_flags = dl_se->flags;
 	attr->sched_period = dl_se->dl_period;
 	attr->sched_deadline = dl_se->dl_deadline;
 	attr->sched_runtime = dl_se->dl_runtime;
 	/* The priority field mirrors the task's rt_priority. */
 	attr->sched_priority = p->rt_priority;
 }
 /*
  * This function validates the new parameters of a -deadline task.
  * We ask for the deadline not being zero, and greater or equal
  * than the runtime, as well as the period of being zero or
  * greater than deadline. Furthermore, we have to be sure that
  * user parameters are above the internal resolution of 1us (we
  * check sched_runtime only since it is always the smaller one) and
  * below 2^63 ns (we have to check both sched_deadline and
  * sched_period, as the latter can be zero).
  */
 static bool
 __checkparam_dl(const struct sched_attr *attr)
 {
 	/* A zero deadline is never valid. */
 	if (!attr->sched_deadline)
 		return false;
 	/*
 	 * DL_SCALE bits get truncated, so the runtime has to be at
 	 * least that large.
 	 */
 	if (attr->sched_runtime < (1ULL << DL_SCALE))
 		return false;
 	/*
 	 * The MSB is reserved for wrap-around and sign handling; it must
 	 * be clear in both deadline and period (period may be zero).
 	 */
 	if ((attr->sched_deadline | attr->sched_period) & (1ULL << 63))
 		return false;
 	/* deadline <= period, unless period is zero */
 	if (attr->sched_period &&
 	    attr->sched_period < attr->sched_deadline)
 		return false;
 	/* runtime <= deadline */
 	return attr->sched_deadline >= attr->sched_runtime;
 }
 /*
  * check the target process has a UID that matches the current process's
  */
 static bool check_same_owner(struct task_struct *p)
 {
 	const struct cred *cred = current_cred(), *pcred;
 	bool match;
 	rcu_read_lock();
 	pcred = __task_cred(p);
 	match = (uid_eq(cred->euid, pcred->euid) ||
 		 uid_eq(cred->euid, pcred->uid));
 	rcu_read_unlock();
 	return match;
 }
 /*
  * __sched_setscheduler - validate and apply new scheduling attributes to @p.
  * @p: task to change.
  * @attr: requested policy and parameters; a negative sched_policy means
  *        "keep the current policy".
  * @user: when true, the permission checks and the security hook for
  *        user-initiated requests are applied.
  *
  * Return: 0 on success; -EINVAL, -EPERM, -EBUSY or the error returned by
  * security_task_setscheduler() otherwise.
  */
 static int __sched_setscheduler(struct task_struct *p,
 				const struct sched_attr *attr,
 				bool user)
 {
 	/*
 	 * Kernel-internal priority the request maps to: a fixed value for
 	 * deadline policies, otherwise derived from sched_priority (a
 	 * higher sched_priority gives a numerically lower value).
 	 */
 	int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
 		      MAX_RT_PRIO - 1 - attr->sched_priority;
 	int retval, oldprio, oldpolicy = -1, on_rq, running;
 	int policy = attr->sched_policy;
 	unsigned long flags;
 	const struct sched_class *prev_class;
 	struct rq *rq;
 	int reset_on_fork;
 	/* may grab non-irq protected spin_locks */
 	BUG_ON(in_interrupt());
 recheck:
 	/* double check policy once rq lock held */
 	if (policy < 0) {
 		/* Keep the current policy and reset-on-fork setting. */
 		reset_on_fork = p->sched_reset_on_fork;
 		policy = oldpolicy = p->policy;
 	} else {
 		reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
 		if (policy != SCHED_DEADLINE &&
 				policy != SCHED_FIFO && policy != SCHED_RR &&
 				policy != SCHED_NORMAL && policy != SCHED_BATCH &&
 				policy != SCHED_IDLE)
 			return -EINVAL;
 	}
 	/* Reject any flag other than SCHED_FLAG_RESET_ON_FORK. */
 	if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
 		return -EINVAL;
 	/*
 	 * Valid priorities for SCHED_FIFO and SCHED_RR are
 	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
 	 * SCHED_BATCH and SCHED_IDLE is 0.
 	 */
 	if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
 	    (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
 		return -EINVAL;
 	/*
 	 * Deadline parameters must pass __checkparam_dl(), and a non-zero
 	 * sched_priority is required for, and only allowed with, an RT
 	 * policy.
 	 */
 	if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
 	    (rt_policy(policy) != (attr->sched_priority != 0)))
 		return -EINVAL;
 	/*
 	 * Allow unprivileged RT tasks to decrease priority:
 	 */
 	if (user && !capable(CAP_SYS_NICE)) {
 		if (fair_policy(policy)) {
 			if (attr->sched_nice < task_nice(p) &&
 			    !can_nice(p, attr->sched_nice))
 				return -EPERM;
 		}
 		if (rt_policy(policy)) {
 			unsigned long rlim_rtprio =
 					task_rlimit(p, RLIMIT_RTPRIO);
 			/* can't set/change the rt policy */
 			if (policy != p->policy && !rlim_rtprio)
 				return -EPERM;
 			/* can't increase priority */
 			if (attr->sched_priority > p->rt_priority &&
 			    attr->sched_priority > rlim_rtprio)
 				return -EPERM;
 		}
 		 /*
 		  * Can't set/change SCHED_DEADLINE policy at all for now
 		  * (safest behavior); in the future we would like to allow
 		  * unprivileged DL tasks to increase their relative deadline
 		  * or reduce their runtime (both ways reducing utilization)
 		  */
 		if (dl_policy(policy))
 			return -EPERM;
 		/*
 		 * Treat SCHED_IDLE as nice 20. Only allow a switch to
 		 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
 		 */
 		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
 			if (!can_nice(p, task_nice(p)))
 				return -EPERM;
 		}
 		/* can't change other user's priorities */
 		if (!check_same_owner(p))
 			return -EPERM;
 		/* Normal users shall not reset the sched_reset_on_fork flag */
 		if (p->sched_reset_on_fork && !reset_on_fork)
 			return -EPERM;
 	}
 	if (user) {
 		retval = security_task_setscheduler(p);
 		if (retval)
 			return retval;
 	}
 	/*
 	 * make sure no PI-waiters arrive (or leave) while we are
 	 * changing the priority of the task:
 	 *
 	 * To be able to change p->policy safely, the appropriate
 	 * runqueue lock must be held.
 	 */
 	rq = task_rq_lock(p, &flags);
 	/*
 	 * Changing the policy of the stop threads is a very bad idea
 	 */
 	if (p == rq->stop) {
 		task_rq_unlock(rq, p, &flags);
 		return -EINVAL;
 	}
 	/*
 	 * If not changing anything there's no need to proceed further,
 	 * but store a possible modification of reset_on_fork.
 	 */
 	if (unlikely(policy == p->policy)) {
 		if (fair_policy(policy) && attr->sched_nice != task_nice(p))
 			goto change;
 		if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
 			goto change;
 		/* Deadline requests always take the full path. */
 		if (dl_policy(policy))
 			goto change;
 		p->sched_reset_on_fork = reset_on_fork;
 		task_rq_unlock(rq, p, &flags);
 		return 0;
 	}
 change:
 	if (user) {
 #ifdef CONFIG_RT_GROUP_SCHED
 		/*
 		 * Do not allow realtime tasks into groups that have no runtime
 		 * assigned.
 		 */
 		if (rt_bandwidth_enabled() && rt_policy(policy) &&
 				task_group(p)->rt_bandwidth.rt_runtime == 0 &&
 				!task_group_is_autogroup(task_group(p))) {
 			task_rq_unlock(rq, p, &flags);
 			return -EPERM;
 		}
 #endif
 #ifdef CONFIG_SMP
 		if (dl_bandwidth_enabled() && dl_policy(policy)) {
 			cpumask_t *span = rq->rd->span;
 			/*
 			 * Don't allow tasks with an affinity mask smaller than
 			 * the entire root_domain to become SCHED_DEADLINE. We
 			 * will also fail if there's no bandwidth available.
 			 */
 			if (!cpumask_subset(span, &p->cpus_allowed) ||
 			    rq->rd->dl_bw.bw == 0) {
 				task_rq_unlock(rq, p, &flags);
 				return -EPERM;
 			}
 		}
 #endif
 	}
 	/* recheck policy now with rq lock held */
 	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
 		/* The policy changed under us: redo all checks from scratch. */
 		policy = oldpolicy = -1;
 		task_rq_unlock(rq, p, &flags);
 		goto recheck;
 	}
 	/*
 	 * If setscheduling to SCHED_DEADLINE (or changing the parameters
 	 * of a SCHED_DEADLINE task) we need to check if enough bandwidth
 	 * is available.
 	 */
 	if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
 		task_rq_unlock(rq, p, &flags);
 		return -EBUSY;
 	}
 	p->sched_reset_on_fork = reset_on_fork;
 	oldprio = p->prio;
 	/*
 	 * Special case for priority boosted tasks.
 	 *
 	 * If the new priority is lower or equal (user space view)
 	 * than the current (boosted) priority, we just store the new
 	 * normal parameters and do not touch the scheduler class and
 	 * the runqueue. This will be done when the task deboost
 	 * itself.
 	 */
 	if (rt_mutex_check_prio(p, newprio)) {
 		__setscheduler_params(p, attr);
 		task_rq_unlock(rq, p, &flags);
 		return 0;
 	}
 	/*
 	 * Take the task off the runqueue (and put it as prev if it is the
 	 * running task) while policy, prio and class change.
 	 */
 	on_rq = p->on_rq;
 	running = task_current(rq, p);
 	if (on_rq)
 		dequeue_task(rq, p, 0);
 	if (running)
 		p->sched_class->put_prev_task(rq, p);
 	prev_class = p->sched_class;
 	__setscheduler(rq, p, attr);
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (on_rq) {
 		/*
 		 * We enqueue to tail when the priority of a task is
 		 * increased (user space view); otherwise (prio value
 		 * unchanged or larger) it goes to the head.
 		 */
 		enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
 	}
 	check_class_changed(rq, p, prev_class, oldprio);
 	task_rq_unlock(rq, p, &flags);
 	/* Called only after the runqueue lock has been released. */
 	rt_mutex_adjust_pi(p);
 	return 0;
 }
 /*
  * _sched_setscheduler - legacy (policy, sched_param) front end.
  * @p: task to change.
  * @policy: new policy, optionally OR-ed with SCHED_RESET_ON_FORK, or -1
  *          to keep the task's current policy (the sched_setparam() case).
  * @param: structure containing the new RT priority.
  * @check: passed on as the 'user' argument of __sched_setscheduler().
  *
  * Builds a struct sched_attr from the legacy arguments (nice is taken from
  * the task's current static_prio) and hands it to __sched_setscheduler().
  *
  * Return: the result of __sched_setscheduler().
  */
 static int _sched_setscheduler(struct task_struct *p, int policy,
 			       const struct sched_param *param, bool check)
 {
 	struct sched_attr attr = {
 		.sched_policy   = policy,
 		.sched_priority = param->sched_priority,
 		.sched_nice	= PRIO_TO_NICE(p->static_prio),
 	};
 	/*
 	 * Fixup the legacy SCHED_RESET_ON_FORK hack.
 	 *
 	 * The "keep current policy" value -1 has every bit set, including
 	 * SCHED_RESET_ON_FORK. It must be excluded here, otherwise the
 	 * masking below turns it into a different negative number that
 	 * __setscheduler_params() no longer recognises as -1 and would
 	 * store in p->policy.
 	 */
 	if (policy != -1 && (policy & SCHED_RESET_ON_FORK)) {
 		attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
 		policy &= ~SCHED_RESET_ON_FORK;
 		attr.sched_policy = policy;
 	}
 	return __sched_setscheduler(p, &attr, check);
 }
 /**
  * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
  * @p: the task in question.
  * @policy: new policy.
  * @param: structure containing the new RT priority.
  *
  * Return: 0 on success. An error code otherwise.
  *
  * NOTE that the task may be already dead.
  */
 int sched_setscheduler(struct task_struct *p, int policy,
 		       const struct sched_param *param)
 {
 	/* Last argument true: request the checks of the common helper. */
 	int ret = _sched_setscheduler(p, policy, param, true);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(sched_setscheduler);
 /*
  * sched_setattr - change scheduling attributes of a task via sched_attr.
  * @p: the task in question.
  * @attr: the new attributes.
  *
  * Return: the result of __sched_setscheduler() called with user == true.
  */
 int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
 {
 	int ret = __sched_setscheduler(p, attr, true);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(sched_setattr);
 /**
  * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
  * @p: the task in question.
  * @policy: new policy.
  * @param: structure containing the new RT priority.
  *
  * Just like sched_setscheduler, only don't bother checking if the
  * current context has permission.  For example, this is needed in
  * stop_machine(): we create temporary high priority worker threads,
  * but our caller might not have that capability.
  *
  * Return: 0 on success. An error code otherwise.
  */
 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
 			       const struct sched_param *param)
 {
 	/* Last argument false: skip the checks of the common helper. */
 	int ret = _sched_setscheduler(p, policy, param, false);
 	return ret;
 }
 /*
  * do_sched_setscheduler - common body of the sched_setscheduler() and
  * sched_setparam() system calls.
  * @pid: the pid in question (0 selects the calling task).
  * @policy: new policy, or -1 to keep the current one.
  * @param: user pointer to the new RT priority.
  *
  * Return: 0 on success, -EINVAL/-EFAULT/-ESRCH or the error from
  * sched_setscheduler() otherwise.
  */
 static int
 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
 {
 	struct sched_param lparam;
 	struct task_struct *p;
 	int retval = -ESRCH;
 	if (!param || pid < 0)
 		return -EINVAL;
 	/* Fetch the parameters before entering the RCU read section. */
 	if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
 		return -EFAULT;
 	/* The task lookup and its use both happen under rcu_read_lock(). */
 	rcu_read_lock();
 	p = find_process_by_pid(pid);
 	if (p)
 		retval = sched_setscheduler(p, policy, &lparam);
 	rcu_read_unlock();
 	return retval;
 }
 /*
  * Mimics kernel/events/core.c perf_copy_attr().
  */
 /*
  * sched_copy_attr - copy a struct sched_attr in from user space.
  * @uattr: user pointer to the attribute structure.
  * @attr: kernel copy to fill in.
  *
  * Return: 0 on success, -EFAULT (or the get_user() error) on a failed
  * user access, -E2BIG on an unacceptable size; in the -E2BIG case the
  * size known to the kernel is written back to @uattr->size.
  */
 static int sched_copy_attr(struct sched_attr __user *uattr,
 			   struct sched_attr *attr)
 {
 	u32 size;
 	int ret;
 	if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
 		return -EFAULT;
 	/*
 	 * zero the full structure, so that a short copy will be nice.
 	 */
 	memset(attr, 0, sizeof(*attr));
 	ret = get_user(size, &uattr->size);
 	if (ret)
 		return ret;
 	if (size > PAGE_SIZE)	/* silly large */
 		goto err_size;
 	if (!size)		/* abi compat */
 		size = SCHED_ATTR_SIZE_VER0;
 	if (size < SCHED_ATTR_SIZE_VER0)
 		goto err_size;
 	/*
 	 * If we're handed a bigger struct than we know of,
 	 * ensure all the unknown bits are 0 - i.e. new
 	 * user-space does not rely on any kernel feature
 	 * extensions we don't know about yet.
 	 */
 	if (size > sizeof(*attr)) {
 		unsigned char __user *addr;
 		unsigned char __user *end;
 		unsigned char val;
 		addr = (void __user *)uattr + sizeof(*attr);
 		end  = (void __user *)uattr + size;
 		/* Walk the tail beyond sizeof(*attr) one byte at a time. */
 		for (; addr < end; addr++) {
 			ret = get_user(val, addr);
 			if (ret)
 				return ret;
 			if (val)
 				goto err_size;
 		}
 		/* Only the part the kernel knows about gets copied. */
 		size = sizeof(*attr);
 	}
 	ret = copy_from_user(attr, uattr, size);
 	if (ret)
 		return -EFAULT;
 	/*
 	 * XXX: do we want to be lenient like existing syscalls; or do we want
 	 * to be strict and return an error on out-of-bounds values?
 	 */
 	attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
 out:
 	return ret;
 err_size:
 	/* Tell user space which size this kernel expects. */
 	put_user(sizeof(*attr), &uattr->size);
 	ret = -E2BIG;
 	goto out;
 }
 /**
  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
  * @pid: the pid in question.
  * @policy: new policy.
  * @param: structure containing the new RT priority.
  *
  * Return: 0 on success. An error code otherwise.
  */
 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
 		struct sched_param __user *, param)
 {
 	/* a negative policy is never valid for this call */
 	return policy < 0 ? -EINVAL :
 			    do_sched_setscheduler(pid, policy, param);
 }
 /**
  * sys_sched_setparam - set/change the RT priority of a thread
  * @pid: the pid in question.
  * @param: structure containing the new RT priority.
  *
  * Return: 0 on success. An error code otherwise.
  */
 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
 {
 	/* Policy -1 asks the common helper to keep the current policy. */
 	int ret = do_sched_setscheduler(pid, -1, param);
 	return ret;
 }
 /**
  * sys_sched_setattr - same as above, but with extended sched_attr
  * @pid: the pid in question.
  * @uattr: structure containing the extended parameters.
  * @flags: for future extension.
  */
 SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
 			       unsigned int, flags)
 {
 	struct sched_attr attr;
 	struct task_struct *p;
 	int retval;
 	/* @flags has no defined bits yet, so it has to be zero. */
 	if (flags || pid < 0 || !uattr)
 		return -EINVAL;
 	retval = sched_copy_attr(uattr, &attr);
 	if (retval)
 		return retval;
 	/* A policy that is negative when read as int is rejected here. */
 	if ((int)attr.sched_policy < 0)
 		return -EINVAL;
 	/* The task lookup and its use both happen under rcu_read_lock(). */
 	rcu_read_lock();
 	p = find_process_by_pid(pid);
 	if (p)
 		retval = sched_setattr(p, &attr);
 	else
 		retval = -ESRCH;
 	rcu_read_unlock();
 	return retval;
 }
 /**
  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
  * @pid: the pid in question.
  *
  * Return: On success, the policy of the thread. Otherwise, a negative error
  * code.
  */
 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
 {
 	struct task_struct *p;
 	int retval = -ESRCH;
 	if (pid < 0)
 		return -EINVAL;
 	rcu_read_lock();
 	p = find_process_by_pid(pid);
 	if (p) {
 		retval = security_task_getscheduler(p);
 		if (!retval) {
 			/* Policy, with the reset-on-fork bit OR-ed in if set. */
 			retval = p->policy;
 			if (p->sched_reset_on_fork)
 				retval |= SCHED_RESET_ON_FORK;
 		}
 	}
 	rcu_read_unlock();
 	return retval;
 }
 /**
  * sys_sched_getparam - get the RT priority of a thread
  * @pid: the pid in question.
  * @param: structure containing the RT priority.
  *
  * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
  * code.
  */
 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
 {
 	struct sched_param lp = { .sched_priority = 0 };
 	struct task_struct *p;
 	int retval;
 	if (!param || pid < 0)
 		return -EINVAL;
 	rcu_read_lock();
 	p = find_process_by_pid(pid);
 	retval = p ? security_task_getscheduler(p) : -ESRCH;
 	/* Only tasks with an RT policy report a non-zero priority. */
 	if (!retval && task_has_rt_policy(p))
 		lp.sched_priority = p->rt_priority;
 	rcu_read_unlock();
 	if (retval)
 		return retval;
 	/*
 	 * The copy to user space might sleep, so it is done only after
 	 * the RCU read section has been left.
 	 */
 	if (copy_to_user(param, &lp, sizeof(*param)))
 		return -EFAULT;
 	return 0;
 }
 /*
  * Copy a kernel sched_attr out to user space.
  *
  * @uattr: destination buffer in user space
  * @attr:  kernel copy to export; attr->size is lowered to @usize when the
  *         user buffer is smaller than the kernel structure
  * @usize: size of the user buffer in bytes
  *
  * Returns 0 on success, -EFAULT if the user buffer cannot be written, or
  * -E2BIG if the part of @attr that does not fit into @usize bytes holds
  * any non-zero byte.
  */
 static int sched_read_attr(struct sched_attr __user *uattr,
 			   struct sched_attr *attr,
 			   unsigned int usize)
 {
 	if (!access_ok(VERIFY_WRITE, uattr, usize))
 		return -EFAULT;
 	/*
 	 * If we're handed a smaller struct than we know of, make sure
 	 * everything beyond it is 0 - i.e. old user-space does not get
 	 * incomplete information.
 	 */
 	if (usize < sizeof(*attr)) {
 		const unsigned char *tail = (void *)attr + usize;
 		const unsigned char *limit = (void *)attr + sizeof(*attr);
 		while (tail < limit) {
 			if (*tail++)
 				return -E2BIG;
 		}
 		attr->size = usize;
 	}
 	if (copy_to_user(uattr, attr, attr->size))
 		return -EFAULT;
 	return 0;
 }
 /**
  * sys_sched_getattr - similar to sched_getparam, but with sched_attr
  * @pid: the pid in question.
  * @uattr: user-space structure that receives the extended parameters.
  * @size: sizeof(attr) for fwd/bwd comp.
  * @flags: for future extension; any non-zero value is rejected.
  *
  * Return: the result of sched_read_attr() on success. -EINVAL for bad
  * arguments, -ESRCH if no task matches @pid, or the error reported by
  * security_task_getscheduler().
  */
 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 		unsigned int, size, unsigned int, flags)
 {
 	struct sched_attr kattr = {
 		.size = sizeof(struct sched_attr),
 	};
 	struct task_struct *task;
 	int err;
 	if (flags || !uattr || pid < 0 ||
 	    size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
 		return -EINVAL;
 	rcu_read_lock();
 	task = find_process_by_pid(pid);
 	err = task ? security_task_getscheduler(task) : -ESRCH;
 	if (err) {
 		rcu_read_unlock();
 		return err;
 	}
 	/* Snapshot the task's parameters while still inside the RCU section. */
 	kattr.sched_policy = task->policy;
 	if (task->sched_reset_on_fork)
 		kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
 	if (task_has_dl_policy(task))
 		__getparam_dl(task, &kattr);
 	else if (task_has_rt_policy(task))
 		kattr.sched_priority = task->rt_priority;
 	else
 		kattr.sched_nice = task_nice(task);
 	rcu_read_unlock();
 	/* The user copy happens outside of the RCU read-side section. */
 	return sched_read_attr(uattr, &kattr, size);
 }
 /*
  * sched_setaffinity - restrict the set of CPUs a task may run on
  * @pid: pid of the target task, resolved through find_process_by_pid()
  * @in_mask: requested CPU mask; it is intersected with the task's cpuset
  *
  * Returns 0 on success, -ESRCH if no task matches @pid, -EINVAL for a
  * PF_NO_SETAFFINITY task, -ENOMEM if a temporary mask cannot be allocated,
  * -EPERM if the caller is neither the same owner nor CAP_SYS_NICE capable
  * in the task's user namespace, -EBUSY if a -deadline task would no longer
  * cover its root domain, or the error returned by
  * security_task_setscheduler() / set_cpus_allowed_ptr().
  */
 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 {
 	cpumask_var_t cpus_allowed, new_mask;
 	struct task_struct *p;
 	int retval;
 	rcu_read_lock();
 	p = find_process_by_pid(pid);
 	if (!p) {
 		rcu_read_unlock();
 		return -ESRCH;
 	}
 	/* Prevent p going away */
 	get_task_struct(p);
 	rcu_read_unlock();
 	if (p->flags & PF_NO_SETAFFINITY) {
 		retval = -EINVAL;
 		goto out_put_task;
 	}
 	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
 		retval = -ENOMEM;
 		goto out_put_task;
 	}
 	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
 		retval = -ENOMEM;
 		goto out_free_cpus_allowed;
 	}
 	retval = -EPERM;
 	if (!check_same_owner(p)) {
 		/* __task_cred() must be called under the RCU read lock. */
 		rcu_read_lock();
 		if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
 			rcu_read_unlock();
 			goto out_free_new_mask;
 		}
 		rcu_read_unlock();
 	}
 	retval = security_task_setscheduler(p);
 	if (retval)
 		goto out_free_new_mask;
 	cpuset_cpus_allowed(p, cpus_allowed);
 	cpumask_and(new_mask, in_mask, cpus_allowed);
 	/*
 	 * Since bandwidth control happens on root_domain basis,
 	 * if admission test is enabled, we only admit -deadline
 	 * tasks allowed to run on all the CPUs in the task's
 	 * root_domain.
 	 */
 #ifdef CONFIG_SMP
 	if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
 		/*
 		 * The root domain reached through the runqueue is RCU
 		 * managed and may be replaced concurrently, so its span is
 		 * only dereferenced inside an RCU read-side section.
 		 */
 		rcu_read_lock();
 		if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
 			retval = -EBUSY;
 			rcu_read_unlock();
 			goto out_free_new_mask;
 		}
 		rcu_read_unlock();
 	}
 #endif
 again:
 	retval = set_cpus_allowed_ptr(p, new_mask);
 	if (!retval) {
 		cpuset_cpus_allowed(p, cpus_allowed);
 		if (!cpumask_subset(new_mask, cpus_allowed)) {
 			/*
 			 * We must have raced with a concurrent cpuset
 			 * update. Just reset the cpus_allowed to the
 			 * cpuset's cpus_allowed
 			 */
 			cpumask_copy(new_mask, cpus_allowed);
 			goto again;
 		}
 	}
 out_free_new_mask:
 	free_cpumask_var(new_mask);
 out_free_cpus_allowed:
 	free_cpumask_var(cpus_allowed);
 out_put_task:
 	put_task_struct(p);
 	return retval;
 }
 /*
  * Fetch a CPU mask of @len bytes from user space into @new_mask.
  *
  * A user buffer shorter than the kernel mask leaves the remaining bits
  * cleared; a longer one is truncated to the kernel mask size.
  *
  * Returns 0 on success or -EFAULT if the user memory cannot be read.
  */
 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
 			     struct cpumask *new_mask)
 {
 	const size_t mask_bytes = cpumask_size();
 	if (len > mask_bytes)
 		len = mask_bytes;
 	else if (len < mask_bytes)
 		cpumask_clear(new_mask);
 	if (copy_from_user(new_mask, user_mask_ptr, len))
 		return -EFAULT;
 	return 0;
 }
 /**
  * sys_sched_setaffinity - set the cpu affinity of a process
  * @pid: pid of the process
  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
  * @user_mask_ptr: user-space pointer to the new cpu mask
  *
  * Return: 0 on success. -ENOMEM if the temporary mask cannot be allocated,
  * otherwise the error from get_user_cpu_mask() or sched_setaffinity().
  */
 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
 		unsigned long __user *, user_mask_ptr)
 {
 	cpumask_var_t kmask;
 	int err;
 	if (!alloc_cpumask_var(&kmask, GFP_KERNEL))
 		return -ENOMEM;
 	/* Only apply the mask if it could be read from user space. */
 	err = get_user_cpu_mask(user_mask_ptr, len, kmask);
 	if (!err)
 		err = sched_setaffinity(pid, kmask);
 	free_cpumask_var(kmask);
 	return err;
 }
 /*
  * sched_getaffinity - read the effective CPU affinity of a task
  * @pid: pid of the target task
  * @mask: receives the task's cpus_allowed ANDed with cpu_active_mask
  *
  * Returns 0 on success, -ESRCH if no task matches @pid, or the error
  * from security_task_getscheduler(). @mask is only written on success.
  */
 long sched_getaffinity(pid_t pid, struct cpumask *mask)
 {
 	struct task_struct *task;
 	unsigned long irqflags;
 	int err;
 	rcu_read_lock();
 	task = find_process_by_pid(pid);
 	err = task ? security_task_getscheduler(task) : -ESRCH;
 	if (!err) {
 		/* cpus_allowed is sampled with the task's pi_lock held. */
 		raw_spin_lock_irqsave(&task->pi_lock, irqflags);
 		cpumask_and(mask, &task->cpus_allowed, cpu_active_mask);
 		raw_spin_unlock_irqrestore(&task->pi_lock, irqflags);
 	}
 	rcu_read_unlock();
 	return err;
 }
 /**
  * sys_sched_getaffinity - get the cpu affinity of a process
  * @pid: pid of the process
  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
  * @user_mask_ptr: user-space pointer to hold the current cpu mask
  *
  * @len must be a multiple of sizeof(unsigned long) and large enough to
  * hold nr_cpu_ids bits.
  *
  * Return: On success, the number of bytes copied to @user_mask_ptr.
  * An error code otherwise.
  */
 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
 		unsigned long __user *, user_mask_ptr)
 {
 	cpumask_var_t kmask;
 	int err;
 	if ((len * BITS_PER_BYTE) < nr_cpu_ids ||
 	    (len & (sizeof(unsigned long)-1)))
 		return -EINVAL;
 	if (!alloc_cpumask_var(&kmask, GFP_KERNEL))
 		return -ENOMEM;
 	err = sched_getaffinity(pid, kmask);
 	if (err == 0) {
 		/* Never copy more than the kernel mask actually holds. */
 		size_t nbytes = min_t(size_t, len, cpumask_size());
 		if (!copy_to_user(user_mask_ptr, kmask, nbytes))
 			err = nbytes;
 		else
 			err = -EFAULT;
 	}
 	free_cpumask_var(kmask);
 	return err;
 }
 /**
  * sys_sched_yield - yield the current processor to other threads.
  *
  * This function yields the current CPU to other tasks. If there are no
  * other threads running on this CPU then this function will return.
  *
  * Return: 0.
  */
 SYSCALL_DEFINE0(sched_yield)
 {
 	/* Obtain this CPU's runqueue; its lock is released by hand below. */
 	struct rq *rq = this_rq_lock();
 	schedstat_inc(rq, yld_count);
 	/* Let the current task's scheduling class perform the actual yield. */
 	current->sched_class->yield_task(rq);
 	/*
 	 * Since we are going to call schedule() anyway, there's
 	 * no need to preempt or enable interrupts:
 	 */
 	__release(rq->lock);
 	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
 	do_raw_spin_unlock(&rq->lock);
 	/* Re-enable preemption without an extra reschedule check. */
 	sched_preempt_enable_no_resched();
 	schedule();
 	return 0;
 }
 /*
  * Call __schedule() with PREEMPT_ACTIVE added to the preempt count for
  * the duration of the call. Shared by the cond_resched variants in this file.
  */
 static void __cond_resched(void)
 {
 	__preempt_count_add(PREEMPT_ACTIVE);
 	__schedule();
 	__preempt_count_sub(PREEMPT_ACTIVE);
 }
 /*
  * Reschedule if should_resched() says one is due.
  *
  * Returns 1 if the scheduler was invoked, 0 otherwise.
  */
 int __sched _cond_resched(void)
 {
 	if (!should_resched())
 		return 0;
 	__cond_resched();
 	return 1;
 }
 EXPORT_SYMBOL(_cond_resched);
 /*
  * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
  * call schedule, and on return reacquire the lock.
  *
  * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
  * operations here to prevent schedule() from being called twice (once via
  * spin_unlock(), once by hand).
  *
  * The lock is also dropped briefly when spin_needbreak() reports that it
  * should be released, even if no reschedule is pending.
  *
  * Returns 1 if the lock was dropped and re-taken, 0 otherwise.
  */
 int __cond_resched_lock(spinlock_t *lock)
 {
 	/* Sample the resched condition once, before the lock is dropped. */
 	int resched = should_resched();
 	int ret = 0;
 	lockdep_assert_held(lock);
 	if (spin_needbreak(lock) || resched) {
 		spin_unlock(lock);
 		if (resched)
 			__cond_resched();
 		else
 			cpu_relax();
 		ret = 1;
 		spin_lock(lock);
 	}
 	return ret;
 }
 EXPORT_SYMBOL(__cond_resched_lock);
 /*
  * Softirq-context variant of _cond_resched(): bottom halves are enabled
  * around the reschedule and disabled again afterwards.
  *
  * Must be called from softirq context (enforced with BUG_ON).
  * Returns 1 if the scheduler was invoked, 0 otherwise.
  */
 int __sched __cond_resched_softirq(void)
 {
 	BUG_ON(!in_softirq());
 	if (!should_resched())
 		return 0;
 	local_bh_enable();
 	__cond_resched();
 	local_bh_disable();
 	return 1;
 }
 EXPORT_SYMBOL(__cond_resched_softirq);
 /**
  * yield - yield the current processor to other threads.
  *
  * Do not ever use this function, there's a 99% chance you're doing it wrong.
  *
  * The scheduler is at all times free to pick the calling task as the most
  * eligible task to run; if removing the yield() call from your code breaks
  * it, it's already broken.
  *
  * Typical broken usage is:
  *
  * while (!event)
  * 	yield();
  *
  * where one assumes that yield() will let 'the other' process run that will
  * make event true. If the current task is a SCHED_FIFO task that will never
  * happen. Never use yield() as a progress guarantee!!
  *
  * If you want to use yield() to wait for something, use wait_event().
  * If you want to use yield() to be 'nice' for others, use cond_resched().
  * If you still want to use yield(), do not!
  */
 void __sched yield(void)
 {
 	/* Mark the caller runnable, then go through the sched_yield syscall. */
 	set_current_state(TASK_RUNNING);
 	sys_sched_yield();
 }
 EXPORT_SYMBOL(yield);
 /**
  * yield_to - yield the current processor to another thread in
  * your thread group, or accelerate that thread toward the
  * processor it's on.
  * @p: target task
  * @preempt: whether task preemption is allowed or not
  *
  * It's the caller's job to ensure that the target task struct
  * can't go away on us before we can do any checks.
  *
  * Return:
  *	true if we indeed boosted the target task.
  *	false if we failed to boost the target, which includes the case
  *	where there is no task worth yielding to.
  */
 bool __sched yield_to(struct task_struct *p, bool preempt)
 {
 	struct task_struct *curr = current;
 	struct rq *rq, *p_rq;
 	unsigned long flags;
 	int yielded = 0;
 	local_irq_save(flags);
 	rq = this_rq();
 again:
 	p_rq = task_rq(p);
 	/*
 	 * If we're the only runnable task on the rq and target rq also
 	 * has only one task, there's absolutely no point in yielding.
 	 */
 	if (rq->nr_running == 1 && p_rq->nr_running == 1) {
 		yielded = -ESRCH;
 		goto out_irq;
 	}
 	double_rq_lock(rq, p_rq);
 	/* The target may have moved before both locks were taken; retry. */
 	if (task_rq(p) != p_rq) {
 		double_rq_unlock(rq, p_rq);
 		goto again;
 	}
 	if (!curr->sched_class->yield_to_task)
 		goto out_unlock;
 	if (curr->sched_class != p->sched_class)
 		goto out_unlock;
 	/* Only a task that is runnable but not currently running is boosted. */
 	if (task_running(p_rq, p) || p->state)
 		goto out_unlock;
 	yielded = curr->sched_class->yield_to_task(rq, p, preempt);
 	if (yielded) {
 		schedstat_inc(rq, yld_count);
 		/*
 		 * Make p's CPU reschedule; pick_next_entity takes care of
 		 * fairness.
 		 */
 		if (preempt && rq != p_rq)
 			resched_task(p_rq->curr);
 	}
 out_unlock:
 	double_rq_unlock(rq, p_rq);
 out_irq:
 	local_irq_restore(flags);
 	if (yielded > 0)
 		schedule();
 	/*
 	 * 'yielded' may hold -ESRCH here. The return type is bool, so
 	 * returning it directly would convert that error to 'true' and
 	 * report a boost that never happened. Only a positive value means
 	 * the target was boosted. Passing -ESRCH through to callers would
 	 * need the return type widened together with the prototype.
 	 */
 	return yielded > 0;
 }
 EXPORT_SYMBOL_GPL(yield_to);
 /*
  * This task is about to go to sleep on IO. Increment rq->nr_iowait so
  * that process accounting knows that this is a task in IO wait state.
  *
  * The runqueue pointer is sampled before sleeping, so the same rq's
  * counter is decremented again once schedule() returns.
  */
 void __sched io_schedule(void)
 {
 	struct rq *rq = raw_rq();
 	delayacct_blkio_start();
 	atomic_inc(&rq->nr_iowait);
 	/* Flush the task's block plug before it goes to sleep. */
 	blk_flush_plug(current);
 	current->in_iowait = 1;
 	schedule();
 	current->in_iowait = 0;
 	atomic_dec(&rq->nr_iowait);
 	delayacct_blkio_end();
 }
 EXPORT_SYMBOL(io_schedule);
 /*
  * Same IO-wait accounting as io_schedule(), but the sleep is done through
  * schedule_timeout(@timeout), whose return value is passed back to the
  * caller unchanged.
  */
 long __sched io_schedule_timeout(long timeout)
 {
 	struct rq *rq = raw_rq();
 	long ret;
 	delayacct_blkio_start();
 	atomic_inc(&rq->nr_iowait);
 	/* Flush the task's block plug before it goes to sleep. */
 	blk_flush_plug(current);
 	current->in_iowait = 1;
 	ret = schedule_timeout(timeout);
 	current->in_iowait = 0;
 	atomic_dec(&rq->nr_iowait);
 	delayacct_blkio_end();
 	return ret;
 }
 /**
  * sys_sched_get_priority_max - return maximum RT priority.
  * @policy: scheduling class.
  *
  * Return: On success, this syscall returns the maximum
  * rt_priority that can be used by a given scheduling class:
  * MAX_USER_RT_PRIO-1 for SCHED_FIFO and SCHED_RR, 0 for the other
  * known policies. On failure (unknown @policy), -EINVAL is returned.
  */
 SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
 {
 	switch (policy) {
 	case SCHED_FIFO:
 	case SCHED_RR:
 		return MAX_USER_RT_PRIO-1;
 	case SCHED_DEADLINE:
 	case SCHED_NORMAL:
 	case SCHED_BATCH:
 	case SCHED_IDLE:
 		return 0;
 	default:
 		return -EINVAL;
 	}
 }
 /**
  * sys_sched_get_priority_min - return minimum RT priority.
  * @policy: scheduling class.
  *
  * Return: On success, this syscall returns the minimum
  * rt_priority that can be used by a given scheduling class:
  * 1 for SCHED_FIFO and SCHED_RR, 0 for the other known policies.
  * On failure (unknown @policy), -EINVAL is returned.
  */
 SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
 {
 	switch (policy) {
 	case SCHED_FIFO:
 	case SCHED_RR:
 		return 1;
 	case SCHED_DEADLINE:
 	case SCHED_NORMAL:
 	case SCHED_BATCH:
 	case SCHED_IDLE:
 		return 0;
 	default:
 		return -EINVAL;
 	}
 }
 /**
  * sys_sched_rr_get_interval - return the default timeslice of a process.
  * @pid: pid of the process.
  * @interval: userspace pointer to the timeslice value.
  *
  * This syscall writes the default timeslice value of a given process
  * into the user-space timespec buffer. A value of '0' means infinity;
  * it is also what is reported when the task's scheduling class has no
  * get_rr_interval method.
  *
  * Return: On success, 0 and the timeslice is in @interval. Otherwise,
  * an error code.
  */
 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
 		struct timespec __user *, interval)
 {
 	struct task_struct *task;
 	unsigned int slice_jiffies;
 	unsigned long irqflags;
 	struct timespec ts;
 	struct rq *rq;
 	int err;
 	if (pid < 0)
 		return -EINVAL;
 	rcu_read_lock();
 	task = find_process_by_pid(pid);
 	err = task ? security_task_getscheduler(task) : -ESRCH;
 	if (err) {
 		rcu_read_unlock();
 		return err;
 	}
 	/* The class callback is invoked with the task's runqueue locked. */
 	rq = task_rq_lock(task, &irqflags);
 	slice_jiffies = task->sched_class->get_rr_interval ?
 		task->sched_class->get_rr_interval(rq, task) : 0;
 	task_rq_unlock(rq, task, &irqflags);
 	rcu_read_unlock();
 	jiffies_to_timespec(slice_jiffies, &ts);
 	if (copy_to_user(interval, &ts, sizeof(ts)))
 		return -EFAULT;
 	return 0;
 }
 /* One character per task state, indexed by the value computed below. */
 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
 /*
  * Print a one-line summary of @p (comm, state character, saved PC or a
  * "running" marker, unused stack, pid, parent pid and thread_info flags),
  * followed by worker information and a stack dump.
  */
 void sched_show_task(struct task_struct *p)
 {
 	unsigned long free = 0;
 	int ppid;
 	unsigned state;
 	/*
 	 * Map the state bitmask to an index into stat_nam: 0 when no bit
 	 * is set, otherwise the position of the lowest set bit plus one.
 	 */
 	state = p->state ? __ffs(p->state) + 1 : 0;
 	printk(KERN_INFO "%-15.15s %c", p->comm,
 		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
 	/* The PC column width depends on the size of a long. */
 #if BITS_PER_LONG == 32
 	if (state == TASK_RUNNING)
 		printk(KERN_CONT " running  ");
 	else
 		printk(KERN_CONT " %08lx ", thread_saved_pc(p));
 #else
 	if (state == TASK_RUNNING)
 		printk(KERN_CONT "  running task    ");
 	else
 		printk(KERN_CONT " %016lx ", thread_saved_pc(p));
 #endif
 	/* 'free' stays 0 unless stack usage debugging is configured. */
 #ifdef CONFIG_DEBUG_STACK_USAGE
 	free = stack_not_used(p);
 #endif
 	/* real_parent is dereferenced under the RCU read lock. */
 	rcu_read_lock();
 	ppid = task_pid_nr(rcu_dereference(p->real_parent));
 	rcu_read_unlock();
 	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
 		task_pid_nr(p), ppid,
 		(unsigned long)task_thread_info(p)->flags);
 	print_worker_info(KERN_INFO, p);
 	show_stack(p, NULL);
 }
 /*
  * Dump every thread whose state matches @state_filter through
  * sched_show_task(). A @state_filter of 0 selects all threads and also
  * triggers a dump of all held locks at the end.
  */
 void show_state_filter(unsigned long state_filter)
 {
 	struct task_struct *g, *p;
 	/* Column header; its width matches the PC column of sched_show_task(). */
 #if BITS_PER_LONG == 32
 	printk(KERN_INFO
 		"  task                PC stack   pid father\n");
 #else
 	printk(KERN_INFO
 		"  task                        PC stack   pid father\n");
 #endif
 	rcu_read_lock();
 	do_each_thread(g, p) {
 		/*
 		 * reset the NMI-timeout, listing all files on a slow
 		 * console might take a lot of time:
 		 */
 		touch_nmi_watchdog();
 		if (!state_filter || (p->state & state_filter))
 			sched_show_task(p);
 	} while_each_thread(g, p);
 	touch_all_softlockup_watchdogs();
 #ifdef CONFIG_SCHED_DEBUG
 	sysrq_sched_debug_show();
 #endif
 	rcu_read_unlock();
 	/*
 	 * Only show locks if all tasks are dumped:
 	 */
 	if (!state_filter)
 		debug_show_all_locks();
 }
 /* Put the given boot-time idle task into the idle scheduling class. */
 void init_idle_bootup_task(struct task_struct *idle)
 {
 	idle->sched_class = &idle_sched_class;
 }
 /**
  * init_idle - set up an idle thread for a given CPU
  * @idle: task in question
  * @cpu: cpu the idle task belongs to
  *
  * NOTE: this function does not set the idle thread's NEED_RESCHED
  * flag, to make booting more robust.
  */
 void init_idle(struct task_struct *idle, int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
 	/* All runqueue-visible setup happens under the target rq's lock. */
 	raw_spin_lock_irqsave(&rq->lock, flags);
 	__sched_fork(0, idle);
 	idle->state = TASK_RUNNING;
 	idle->se.exec_start = sched_clock();
 	/* Restrict the idle task to the single CPU it belongs to. */
 	do_set_cpus_allowed(idle, cpumask_of(cpu));
 	/*
 	 * We're having a chicken and egg problem, even though we are
 	 * holding rq->lock, the cpu isn't yet set to this cpu so the
 	 * lockdep check in task_group() will fail.
 	 *
 	 * Similar case to sched_fork(). / Alternatively we could
 	 * use task_rq_lock() here and obtain the other rq->lock.
 	 *
 	 * Silence PROVE_RCU
 	 */
 	rcu_read_lock();
 	__set_task_cpu(idle, cpu);
 	rcu_read_unlock();
 	/* The idle task is both the rq's idle task and its current task. */
 	rq->curr = rq->idle = idle;
 	idle->on_rq = 1;
 #if defined(CONFIG_SMP)
 	idle->on_cpu = 1;
 #endif
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 	/* Set the preempt count _outside_ the spinlocks! */
 	init_idle_preempt_count(idle, cpu);
 	/*
 	 * The idle tasks have their own, simple scheduling class:
 	 */
 	idle->sched_class = &idle_sched_class;
 	ftrace_graph_init_idle_task(idle, cpu);
 	vtime_init_idle(idle, cpu);
 	/* On SMP the idle task is named "<INIT_TASK_COMM>/<cpu>". */
 #if defined(CONFIG_SMP)
 	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
 #endif
 }
 #ifdef CONFIG_SMP
 /*
  * Install @new_mask as @p's allowed CPU mask and refresh nr_cpus_allowed.
  * If the task's scheduling class provides a set_cpus_allowed hook, it is
  * called first, before p->cpus_allowed is overwritten.
  * NOTE(review): callers in this file hold rq->lock around this call -
  * confirm whether that is a requirement.
  */
 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 {
 	if (p->sched_class && p->sched_class->set_cpus_allowed)
 		p->sched_class->set_cpus_allowed(p, new_mask);
 	cpumask_copy(&p->cpus_allowed, new_mask);
 	p->nr_cpus_allowed = cpumask_weight(new_mask);
 }
 /*
  * This is how migration works:
  *
  * 1) we invoke migration_cpu_stop() on the target CPU using
  *    stop_one_cpu().
  * 2) stopper starts to run (implicitly forcing the migrated thread
  *    off the CPU)
  * 3) it checks whether the migrated task is still in the wrong runqueue.
  * 4) if it's in the wrong runqueue then the migration thread removes
  *    it and puts it into the right queue.
  * 5) stopper completes and stop_one_cpu() returns and the migration
  *    is done.
  */
 /*
  * Change a given task's CPU affinity. Migrate the thread to a
  * proper CPU and schedule it away if the CPU it's executing on
  * is removed from the allowed bitmask.
  *
  * NOTE: the caller must have a valid reference to the task, the
  * task must not exit() & deallocate itself prematurely. The
  * call is not atomic; no spinlocks may be held.
  *
  * Returns 0 on success (including when the mask is unchanged), or
  * -EINVAL if @new_mask contains no active CPU.
  */
 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 {
 	unsigned long flags;
 	struct rq *rq;
 	unsigned int dest_cpu;
 	int ret = 0;
 	rq = task_rq_lock(p, &flags);
 	/* Nothing to do if the mask does not change. */
 	if (cpumask_equal(&p->cpus_allowed, new_mask))
 		goto out;
 	/* Refuse a mask that would leave the task with no active CPU. */
 	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
 		ret = -EINVAL;
 		goto out;
 	}
 	do_set_cpus_allowed(p, new_mask);
 	/* Can the task run on the task's current CPU? If so, we're done */
 	if (cpumask_test_cpu(task_cpu(p), new_mask))
 		goto out;
 	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
 	if (p->on_rq) {
 		struct migration_arg arg = { p, dest_cpu };
 		/* Need help from migration thread: drop lock and wait. */
 		task_rq_unlock(rq, p, &flags);
 		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
 		tlb_migrate_finish(p->mm);
 		return 0;
 	}
 	/* A task that is not on a runqueue needs no active migration here. */
 out:
 	task_rq_unlock(rq, p, &flags);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
 /*
  * Move (not current) task off this cpu, onto dest cpu. We're doing
  * this because either it can't run here any more (set_cpus_allowed()
  * away from this CPU, or CPU going down), or because we're
  * attempting to rebalance this task on exec (sched_exec).
  *
  * So we race with normal scheduler movements, but that's OK, as long
  * as the task is no longer on this CPU.
  *
  * Returns non-zero if task was successfully migrated.
  */
 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 {
 	struct rq *rq_dest, *rq_src;
 	int ret = 0;
 	/* Never migrate towards a CPU that is not active. */
 	if (unlikely(!cpu_active(dest_cpu)))
 		return ret;
 	rq_src = cpu_rq(src_cpu);
 	rq_dest = cpu_rq(dest_cpu);
 	/* Lock order: the task's pi_lock first, then both runqueues. */
 	raw_spin_lock(&p->pi_lock);
 	double_rq_lock(rq_src, rq_dest);
 	/* Already moved. */
 	if (task_cpu(p) != src_cpu)
 		goto done;
 	/* Affinity changed (again). */
 	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
 		goto fail;
 	/*
 	 * If we're not on a rq, the next wake-up will ensure we're
 	 * placed properly.
 	 */
 	if (p->on_rq) {
 		dequeue_task(rq_src, p, 0);
 		set_task_cpu(p, dest_cpu);
 		enqueue_task(rq_dest, p, 0);
 		check_preempt_curr(rq_dest, p, 0);
 	}
 	/* 'done' reports success, 'fail' leaves ret at 0. */
 done:
 	ret = 1;
 fail:
 	double_rq_unlock(rq_src, rq_dest);
 	raw_spin_unlock(&p->pi_lock);
 	return ret;
 }
 #ifdef CONFIG_NUMA_BALANCING
 /* Migrate current task p to target_cpu */
 int migrate_task_to(struct task_struct *p, int target_cpu)
 {
 	struct migration_arg arg = { p, target_cpu };
 	int curr_cpu = task_cpu(p);
 	if (curr_cpu == target_cpu)
 		return 0;
 	if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
 		return -EINVAL;
 	/* TODO: This is not properly updating schedstats */
 	trace_sched_move_numa(p, curr_cpu, target_cpu);
 	return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
 }
 /*
  * Requeue a task on a given node and accurately track the number of NUMA
  * tasks on the runqueues
  *
  * The task is dequeued (and put, if it is the rq's current task) before
  * numa_preferred_nid changes, and restored afterwards, all under the
  * task's rq lock.
  */
 void sched_setnuma(struct task_struct *p, int nid)
 {
 	struct rq *rq;
 	unsigned long flags;
 	bool on_rq, running;
 	rq = task_rq_lock(p, &flags);
 	/* Remember how the task was queued so it can be restored below. */
 	on_rq = p->on_rq;
 	running = task_current(rq, p);
 	if (on_rq)
 		dequeue_task(rq, p, 0);
 	if (running)
 		p->sched_class->put_prev_task(rq, p);
 	p->numa_preferred_nid = nid;
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (on_rq)
 		enqueue_task(rq, p, 0);
 	task_rq_unlock(rq, p, &flags);
 }
 #endif
 /*
  * migration_cpu_stop - this will be executed by a highprio stopper thread
  * and performs thread migration by bumping thread off CPU then
  * 'pushing' onto another runqueue.
  *
  * @data points to a struct migration_arg naming the task and the
  * destination CPU. Always returns 0; the result of __migrate_task() is
  * not reported.
  */
 static int migration_cpu_stop(void *data)
 {
 	struct migration_arg *arg = data;
 	/*
 	 * The original target cpu might have gone down and we might
 	 * be on another cpu but it doesn't matter.
 	 */
 	local_irq_disable();
 	/* The source CPU is whichever CPU this stopper is running on. */
 	__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
 	local_irq_enable();
 	return 0;
 }
 #ifdef CONFIG_HOTPLUG_CPU
 /*
  * Ensures that the idle task is using init_mm right before its cpu goes
  * offline.
  *
  * Must be called on a CPU that is no longer marked online (BUG otherwise).
  */
 void idle_task_exit(void)
 {
 	struct mm_struct *mm = current->active_mm;
 	BUG_ON(cpu_online(smp_processor_id()));
 	/* Switch away from a borrowed mm, if the idle task still has one. */
 	if (mm != &init_mm) {
 		switch_mm(mm, &init_mm, current);
 		finish_arch_post_lock_switch();
 	}
 	/* NOTE(review): presumably drops the reference held via active_mm. */
 	mmdrop(mm);
 }
 /*
  * This CPU is going 'away' for a while, so any nr_active delta it still
  * carries is folded into the global calc_load_tasks counter. Assumes we're
  * called after migrate_tasks() so that the nr_active count is stable.
  *
  * Also see the comment "Global load-average calculations".
  */
 static void calc_load_migrate(struct rq *rq)
 {
 	long folded = calc_load_fold_active(rq);
 	if (!folded)
 		return;
 	atomic_long_add(folded, &calc_load_tasks);
 }
 /* Deliberately empty put_prev_task method for the placeholder task below. */
 static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
 {
 }
 /* Scheduling class whose only method is the no-op put_prev_task above. */
 static const struct sched_class fake_sched_class = {
 	.put_prev_task = put_prev_task_fake,
 };
 /*
  * Placeholder 'previous task' handed to pick_next_task() by
  * migrate_tasks().
  */
 static struct task_struct fake_task = {
 	/*
 	 * Avoid pull_{rt,dl}_task()
 	 */
 	.prio = MAX_PRIO + 1,
 	.sched_class = &fake_sched_class,
 };
 /*
  * Migrate all tasks from the rq, sleeping tasks will be migrated by
  * try_to_wake_up()->select_task_rq().
  *
  * Called with rq->lock held even though we're in stop_machine() and
  * there's no concurrency possible, we hold the required locks anyway
  * because of lock validation efforts.
  */
 static void migrate_tasks(unsigned int dead_cpu)
 {
 	struct rq *rq = cpu_rq(dead_cpu);
 	struct task_struct *next, *stop = rq->stop;
 	int dest_cpu;
 	/*
 	 * Fudge the rq selection such that the below task selection loop
 	 * doesn't get stuck on the currently eligible stop task.
 	 *
 	 * We're currently inside stop_machine() and the rq is either stuck
 	 * in the stop_machine_cpu_stop() loop, or we're executing this code,
 	 * either way we should never end up calling schedule() until we're
 	 * done here.
 	 */
 	rq->stop = NULL;
 	/*
 	 * put_prev_task() and pick_next_task() sched
 	 * class method both need to have an up-to-date
 	 * value of rq->clock[_task]
 	 */
 	update_rq_clock(rq);
 	for ( ; ; ) {
 		/*
 		 * There's this thread running, bail when that's the only
 		 * remaining thread.
 		 */
 		if (rq->nr_running == 1)
 			break;
 		/* fake_task stands in for the previous task during the pick. */
 		next = pick_next_task(rq, &fake_task);
 		BUG_ON(!next);
 		next->sched_class->put_prev_task(rq, next);
 		/* Find suitable destination for @next, with force if needed. */
 		dest_cpu = select_fallback_rq(dead_cpu, next);
 		/* __migrate_task() takes the rq locks itself, so drop ours. */
 		raw_spin_unlock(&rq->lock);
 		__migrate_task(next, dead_cpu, dest_cpu);
 		raw_spin_lock(&rq->lock);
 	}
 	/* Restore the stop task that was hidden from the selection loop. */
 	rq->stop = stop;
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
 /*
  * "sched_domain" sysctl directory. Its .child is filled in by
  * register_sched_domain_sysctl() and released again by
  * unregister_sched_domain_sysctl().
  */
 static struct ctl_table sd_ctl_dir[] = {
 	{
 		.procname	= "sched_domain",
 		.mode		= 0555,
 	},
 	{}
 };
 /*
  * "kernel" root directory that contains the "sched_domain" directory;
  * this is the table passed to register_sysctl_table().
  */
 static struct ctl_table sd_ctl_root[] = {
 	{
 		.procname	= "kernel",
 		.mode		= 0555,
 		.child		= sd_ctl_dir,
 	},
 	{}
 };
 /*
  * Allocate a zeroed array of @n sysctl table entries.
  * Returns NULL if the allocation fails.
  */
 static struct ctl_table *sd_alloc_ctl_entry(int n)
 {
 	return kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
 }
 /*
  * Recursively free the sysctl table at *@tablep, including child tables
  * and dynamically allocated names, then reset *@tablep to NULL.
  */
 static void sd_free_ctl_entry(struct ctl_table **tablep)
 {
 	struct ctl_table *cur = *tablep;
 	/*
 	 * Intermediate directories have a dynamically allocated child table
 	 * and procname; either allocation may have failed, but the mode is
 	 * always set, so it marks the end of the array. Leaf entries use
 	 * static names and always carry a proc handler, so their names must
 	 * not be freed.
 	 */
 	while (cur->mode) {
 		if (cur->child)
 			sd_free_ctl_entry(&cur->child);
 		if (!cur->proc_handler)
 			kfree(cur->procname);
 		cur++;
 	}
 	kfree(*tablep);
 	*tablep = NULL;
 }
 /* Bounds installed as extra1/extra2 for the *_idx entries below. */
 static int min_load_idx = 0;
 static int max_load_idx = CPU_LOAD_IDX_MAX-1;
 /*
  * Fill in one sysctl table entry.
  *
  * @entry:        entry to initialise
  * @procname:     file name of the entry
  * @data:         variable backing the entry
  * @maxlen:       size of @data in bytes
  * @mode:         file permissions
  * @proc_handler: handler used to read/write the entry
  * @load_idx:     when true, attach min_load_idx/max_load_idx as
  *                extra1/extra2; otherwise those fields are left untouched
  */
 static void
 set_table_entry(struct ctl_table *entry,
 		const char *procname, void *data, int maxlen,
 		umode_t mode, proc_handler *proc_handler,
 		bool load_idx)
 {
 	entry->procname = procname;
 	entry->data = data;
 	entry->maxlen = maxlen;
 	entry->mode = mode;
 	entry->proc_handler = proc_handler;
 	if (load_idx) {
 		entry->extra1 = &min_load_idx;
 		entry->extra2 = &max_load_idx;
 	}
 }
 /*
  * Build the sysctl table exposing the tunables of one sched_domain.
  *
  * The table has 13 entries plus a zeroed terminator. Only the *_idx
  * entries get the load index bounds attached. Returns NULL if the table
  * cannot be allocated.
  */
 static struct ctl_table *
 sd_alloc_ctl_domain_table(struct sched_domain *sd)
 {
 	struct ctl_table *table = sd_alloc_ctl_entry(14);
 	if (table == NULL)
 		return NULL;
 	set_table_entry(&table[0], "min_interval", &sd->min_interval,
 		sizeof(long), 0644, proc_doulongvec_minmax, false);
 	set_table_entry(&table[1], "max_interval", &sd->max_interval,
 		sizeof(long), 0644, proc_doulongvec_minmax, false);
 	set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
 		sizeof(int), 0644, proc_dointvec_minmax, true);
 	set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
 		sizeof(int), 0644, proc_dointvec_minmax, true);
 	set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
 		sizeof(int), 0644, proc_dointvec_minmax, true);
 	set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
 		sizeof(int), 0644, proc_dointvec_minmax, true);
 	set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
 		sizeof(int), 0644, proc_dointvec_minmax, true);
 	set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
 		sizeof(int), 0644, proc_dointvec_minmax, false);
 	set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
 		sizeof(int), 0644, proc_dointvec_minmax, false);
 	set_table_entry(&table[9], "cache_nice_tries",
 		&sd->cache_nice_tries,
 		sizeof(int), 0644, proc_dointvec_minmax, false);
 	set_table_entry(&table[10], "flags", &sd->flags,
 		sizeof(int), 0644, proc_dointvec_minmax, false);
 	set_table_entry(&table[11], "max_newidle_lb_cost",
 		&sd->max_newidle_lb_cost,
 		sizeof(long), 0644, proc_doulongvec_minmax, false);
 	/* The domain name is exported read-only. */
 	set_table_entry(&table[12], "name", sd->name,
 		CORENAME_MAX_SIZE, 0444, proc_dostring, false);
 	/* &table[13] is terminator */
 	return table;
 }
 static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
 {
 	struct ctl_table *entry, *table;
 	struct sched_domain *sd;
 	int domain_num = 0, i;
 	char buf[32];
 	for_each_domain(cpu, sd)
 		domain_num++;
 	entry = table = sd_alloc_ctl_entry(domain_num + 1);
 	if (table == NULL)
 		return NULL;
 	i = 0;
 	for_each_domain(cpu, sd) {
 		snprintf(buf, 32, "domain%d", i);
 		entry->procname = kstrdup(buf, GFP_KERNEL);
 		entry->mode = 0555;
 		entry->child = sd_alloc_ctl_domain_table(sd);
 		entry++;
 		i++;
 	}
 	return table;
 }
 /* Handle of the registered sched_domain sysctl tree, NULL when absent. */
 static struct ctl_table_header *sd_sysctl_header;
 /*
  * Build and register the kernel/sched_domain/cpuN/domainM sysctl tree,
  * with one "cpuN" directory per possible CPU. Nothing is registered if
  * the top-level table cannot be allocated.
  */
 static void register_sched_domain_sysctl(void)
 {
 	int i, cpu_num = num_possible_cpus();
 	struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
 	char buf[32];
 	/* A previous tree must have been unregistered before this call. */
 	WARN_ON(sd_ctl_dir[0].child);
 	sd_ctl_dir[0].child = entry;
 	if (entry == NULL)
 		return;
 	for_each_possible_cpu(i) {
 		snprintf(buf, 32, "cpu%d", i);
 		entry->procname = kstrdup(buf, GFP_KERNEL);
 		entry->mode = 0555;
 		entry->child = sd_alloc_ctl_cpu_table(i);
 		entry++;
 	}
 	WARN_ON(sd_sysctl_header);
 	sd_sysctl_header = register_sysctl_table(sd_ctl_root);
 }
 /*
  * Tear down the sched_domain sysctl tree and free its tables.
  * May be called multiple times per register; repeated calls find nothing
  * left to do.
  */
 static void unregister_sched_domain_sysctl(void)
 {
 	if (sd_sysctl_header) {
 		unregister_sysctl_table(sd_sysctl_header);
 		sd_sysctl_header = NULL;
 	}
 	if (sd_ctl_dir[0].child)
 		sd_free_ctl_entry(&sd_ctl_dir[0].child);
 }
 #else
 /* No-op stub used when CONFIG_SCHED_DEBUG or CONFIG_SYSCTL is disabled. */
 static void register_sched_domain_sysctl(void)
 {
 }
 /* No-op stub used when CONFIG_SCHED_DEBUG or CONFIG_SYSCTL is disabled. */
 static void unregister_sched_domain_sysctl(void)
 {
 }
 #endif
 /*
  * Mark @rq online in its root domain and notify every scheduling class
  * that provides an rq_online hook. Does nothing if @rq is already online.
  */
 static void set_rq_online(struct rq *rq)
 {
 	const struct sched_class *cls;
 	if (rq->online)
 		return;
 	/* The rq is flagged online before the classes are told about it. */
 	cpumask_set_cpu(rq->cpu, rq->rd->online);
 	rq->online = 1;
 	for_each_class(cls) {
 		if (cls->rq_online)
 			cls->rq_online(rq);
 	}
 }
 /*
  * Notify every scheduling class that provides an rq_offline hook, then
  * mark @rq offline in its root domain. Does nothing if @rq is already
  * offline.
  */
 static void set_rq_offline(struct rq *rq)
 {
 	const struct sched_class *cls;
 	if (!rq->online)
 		return;
 	/* The classes are told first, while the rq is still flagged online. */
 	for_each_class(cls) {
 		if (cls->rq_offline)
 			cls->rq_offline(rq);
 	}
 	cpumask_clear_cpu(rq->cpu, rq->rd->online);
 	rq->online = 0;
 }
 /*
  * migration_call - callback that gets triggered when a CPU is added.
  * Here we can start up the necessary migration thread for the new CPU.
  *
  * Handles CPU_UP_PREPARE, CPU_ONLINE and, with CONFIG_HOTPLUG_CPU,
  * CPU_DYING and CPU_DEAD; the CPU_TASKS_FROZEN bit is ignored. Always
  * returns NOTIFY_OK.
  */
 static int
 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
 	int cpu = (long)hcpu;
 	unsigned long flags;
 	struct rq *rq = cpu_rq(cpu);
 	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_UP_PREPARE:
 		/* Seed the rq's load-update timestamp from the global one. */
 		rq->calc_load_update = calc_load_update;
 		break;
 	case CPU_ONLINE:
 		/* Update our root-domain */
 		raw_spin_lock_irqsave(&rq->lock, flags);
 		if (rq->rd) {
 			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 			set_rq_online(rq);
 		}
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_DYING:
 		sched_ttwu_pending();
 		/* Update our root-domain */
 		raw_spin_lock_irqsave(&rq->lock, flags);
 		if (rq->rd) {
 			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 			set_rq_offline(rq);
 		}
 		/* Push every remaining task off the dying CPU. */
 		migrate_tasks(cpu);
 		BUG_ON(rq->nr_running != 1); /* the migration thread */
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 		break;
 	case CPU_DEAD:
 		calc_load_migrate(rq);
 		break;
 #endif
 	}
 	/* Runs for every action, including ones not handled above. */
 	update_max_interval();
 	return NOTIFY_OK;
 }
 /*
  * Register at high priority so that task migration (migrate_all_tasks)
  * happens before everything else.  This has to be lower priority than
  * the notifier in the perf_event subsystem, though.
  *
  * Registered from migration_init(), which also invokes the callback
  * directly for the boot CPU.
  */
 static struct notifier_block migration_notifier = {
 	.notifier_call = migration_call,
 	.priority = CPU_PRI_MIGRATION,
 };
 /*
  * CPU notifier: mark a CPU active again when taking it down has failed.
  *
  * Only CPU_DOWN_FAILED is acted upon (the CPU_TASKS_FROZEN bit is
  * ignored); every other action returns NOTIFY_DONE.
  */
 static int sched_cpu_active(struct notifier_block *nfb,
 				      unsigned long action, void *hcpu)
 {
 	if ((action & ~CPU_TASKS_FROZEN) != CPU_DOWN_FAILED)
 		return NOTIFY_DONE;
 	set_cpu_active((long)hcpu, true);
 	return NOTIFY_OK;
 }
 /*
  * Hotplug notifier: on CPU_DOWN_PREPARE mark the CPU inactive.  Unless the
  * CPU_TASKS_FROZEN bit is set in @action, additionally veto the operation
  * with -EBUSY when __dl_overflow() reports that the deadline bandwidth
  * would overflow.  All other actions yield NOTIFY_DONE.
  */
 static int sched_cpu_inactive(struct notifier_block *nfb,
 					unsigned long action, void *hcpu)
 {
 	long cpu = (long)hcpu;
 	unsigned long flags;
 	struct dl_bw *dl_b;
 	bool overflow;
 	int cpus;

 	if ((action & ~CPU_TASKS_FROZEN) != CPU_DOWN_PREPARE)
 		return NOTIFY_DONE;

 	set_cpu_active(cpu, false);

 	/* explicitly allow suspend */
 	if (action & CPU_TASKS_FROZEN)
 		return NOTIFY_OK;

 	dl_b = dl_bw_of(cpu);
 	raw_spin_lock_irqsave(&dl_b->lock, flags);
 	cpus = dl_bw_cpus(cpu);
 	overflow = __dl_overflow(dl_b, cpus, 0, 0);
 	raw_spin_unlock_irqrestore(&dl_b->lock, flags);

 	return overflow ? notifier_from_errno(-EBUSY) : NOTIFY_OK;
 }
 /*
  * Early initcall: run the CPU_UP_PREPARE and CPU_ONLINE steps of
  * migration_call() by hand for the CPU executing this code, then register
  * the migration notifier and the active/inactive notifiers.
  * Always returns 0.
  */
 static int __init migration_init(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
 	int err;
 	/* Initialize migration for the boot CPU */
 	err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
 	BUG_ON(err == NOTIFY_BAD);
 	migration_call(&migration_notifier, CPU_ONLINE, cpu);
 	register_cpu_notifier(&migration_notifier);
 	/* Register cpu active notifiers */
 	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
 	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
 	return 0;
 }
 early_initcall(migration_init);
 #endif
 #ifdef CONFIG_SMP
 /*
  * Scratch cpumask shared by the sched-domain build and debug code.
  * The trailing note names the lock expected to serialize its users.
  */
 static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
 #ifdef CONFIG_SCHED_DEBUG
 /* Non-zero once "sched_debug" was seen on the kernel command line. */
 static __read_mostly int sched_debug_enabled;
 /* early_param handler: the mere presence of the option enables debugging. */
 static int __init sched_debug_setup(char *str)
 {
 	sched_debug_enabled = 1;
 	return 0;
 }
 early_param("sched_debug", sched_debug_setup);
 /* Returns true when sched-domain debugging output is enabled. */
 static inline bool sched_debug(void)
 {
 	return sched_debug_enabled;
 }
 /*
  * Print and sanity-check a single sched_domain level for @cpu.
  *
  * @sd:        domain to describe
  * @cpu:       CPU the domain hierarchy belongs to
  * @level:     depth in the hierarchy, also used as print indentation
  * @groupmask: scratch mask; cleared here and used to accumulate the CPUs
  *             of all groups visited
  *
  * Returns -1 when @sd does not have SD_LOAD_BALANCE set (the caller stops
  * walking upwards in that case), 0 otherwise.
  */
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 				  struct cpumask *groupmask)
 {
 	struct sched_group *group = sd->groups;
 	char str[256];
 	cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
 	cpumask_clear(groupmask);
 	printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
 	if (!(sd->flags & SD_LOAD_BALANCE)) {
 		printk("does not load-balance\n");
 		if (sd->parent)
 			printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
 					" has parent");
 		return -1;
 	}
 	printk(KERN_CONT "span %s level %s\n", str, sd->name);
 	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
 		printk(KERN_ERR "ERROR: domain->span does not contain "
 				"CPU%d\n", cpu);
 	}
 	if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
 		printk(KERN_ERR "ERROR: domain->groups does not contain"
 				" CPU%d\n", cpu);
 	}
 	printk(KERN_DEBUG "%*s groups:", level + 1, "");
 	/* Walk the circular group list once, stopping at the first error. */
 	do {
 		if (!group) {
 			printk("\n");
 			printk(KERN_ERR "ERROR: group is NULL\n");
 			break;
 		}
 		/*
 		 * Even though we initialize ->power to something semi-sane,
 		 * we leave power_orig unset. This allows us to detect if
 		 * domain iteration is still funny without causing /0 traps.
 		 */
 		if (!group->sgp->power_orig) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: domain->cpu_power not "
 					"set\n");
 			break;
 		}
 		if (!cpumask_weight(sched_group_cpus(group))) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: empty group\n");
 			break;
 		}
 		/* Only non-overlapping domains require disjoint groups. */
 		if (!(sd->flags & SD_OVERLAP) &&
 		    cpumask_intersects(groupmask, sched_group_cpus(group))) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: repeated CPUs\n");
 			break;
 		}
 		cpumask_or(groupmask, groupmask, sched_group_cpus(group));
 		cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
 		printk(KERN_CONT " %s", str);
 		if (group->sgp->power != SCHED_POWER_SCALE) {
 			printk(KERN_CONT " (cpu_power = %d)",
 				group->sgp->power);
 		}
 		group = group->next;
 	} while (group != sd->groups);
 	printk(KERN_CONT "\n");
 	/* The union of all groups must equal the domain span ... */
 	if (!cpumask_equal(sched_domain_span(sd), groupmask))
 		printk(KERN_ERR "ERROR: groups don't span domain->span\n");
 	/* ... and must be contained in the parent's span. */
 	if (sd->parent &&
 	    !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
 		printk(KERN_ERR "ERROR: parent span is not a superset "
 			"of domain->span\n");
 	return 0;
 }
 /*
  * Dump the sched-domain hierarchy being attached to @cpu, starting at @sd
  * and following ->parent links.  Stops early if a level reports a problem
  * (non-zero return from sched_domain_debug_one()).  Silent unless
  * sched_debug_enabled is set.
  */
 static void sched_domain_debug(struct sched_domain *sd, int cpu)
 {
 	int level;

 	if (!sched_debug_enabled)
 		return;

 	if (!sd) {
 		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
 		return;
 	}

 	printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);

 	for (level = 0; sd; sd = sd->parent, level++) {
 		if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
 			break;
 	}
 }
 #else /* !CONFIG_SCHED_DEBUG */
 /* Without CONFIG_SCHED_DEBUG the domain dump compiles away ... */
 # define sched_domain_debug(sd, cpu) do { } while (0)
 /* ... and sched_debug() is constant false. */
 static inline bool sched_debug(void)
 {
 	return false;
 }
 #endif /* CONFIG_SCHED_DEBUG */
 /*
  * Decide whether @sd is degenerate.
  *
  * Returns 1 when the domain spans a single CPU, or when none of its flags
  * is meaningful: flags that need at least two groups only count if the
  * group list has more than one entry, and SD_WAKE_AFFINE always counts.
  * Returns 0 otherwise.
  */
 static int sd_degenerate(struct sched_domain *sd)
 {
 	if (cpumask_weight(sched_domain_span(sd)) == 1)
 		return 1;

 	/* Following flags need at least 2 groups */
 	if ((sd->flags & (SD_LOAD_BALANCE |
 			  SD_BALANCE_NEWIDLE |
 			  SD_BALANCE_FORK |
 			  SD_BALANCE_EXEC |
 			  SD_SHARE_CPUPOWER |
 			  SD_SHARE_PKG_RESOURCES)) &&
 	    sd->groups != sd->groups->next)
 		return 0;

 	/* Following flags don't use groups */
 	return (sd->flags & (SD_WAKE_AFFINE)) ? 0 : 1;
 }
 /*
  * Decide whether @parent adds nothing on top of @sd.
  *
  * Returns 1 when @parent is itself degenerate, or when it has the same
  * span as @sd and every flag that still matters in the parent is also set
  * in the child.  Returns 0 otherwise.
  */
 static int
 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 {
 	unsigned long child_flags = sd->flags;
 	unsigned long parent_flags = parent->flags;

 	if (sd_degenerate(parent))
 		return 1;

 	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
 		return 0;

 	/* Flags needing groups don't count if only 1 group in parent */
 	if (parent->groups == parent->groups->next) {
 		parent_flags &= ~(SD_LOAD_BALANCE |
 				  SD_BALANCE_NEWIDLE |
 				  SD_BALANCE_FORK |
 				  SD_BALANCE_EXEC |
 				  SD_SHARE_CPUPOWER |
 				  SD_SHARE_PKG_RESOURCES |
 				  SD_PREFER_SIBLING);
 		if (nr_node_ids == 1)
 			parent_flags &= ~SD_SERIALIZE;
 	}

 	/* Any parent flag the child lacks keeps the parent relevant. */
 	return (~child_flags & parent_flags) ? 0 : 1;
 }
 /*
  * RCU callback that tears down a root_domain: cleans up the cpupri and
  * cpudl state, frees the four cpumasks and finally the structure itself.
  * @rcu is the rcu_head embedded in the root_domain.
  */
 static void free_rootdomain(struct rcu_head *rcu)
 {
 	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
 	cpupri_cleanup(&rd->cpupri);
 	cpudl_cleanup(&rd->cpudl);
 	free_cpumask_var(rd->dlo_mask);
 	free_cpumask_var(rd->rto_mask);
 	free_cpumask_var(rd->online);
 	free_cpumask_var(rd->span);
 	kfree(rd);
 }
 /*
  * Move @rq from its current root domain (if any) to @rd.
  *
  * Under rq->lock: the runqueue is taken offline in the old domain and
  * removed from its span, a reference on @rd is taken, the CPU is added to
  * @rd's span and, if the CPU is in cpu_active_mask, the runqueue is put
  * online again.  If the last reference on the old domain was dropped, it
  * is freed via call_rcu_sched() after the lock has been released.
  */
 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 {
 	struct root_domain *old_rd = NULL;
 	unsigned long flags;
 	raw_spin_lock_irqsave(&rq->lock, flags);
 	if (rq->rd) {
 		old_rd = rq->rd;
 		if (cpumask_test_cpu(rq->cpu, old_rd->online))
 			set_rq_offline(rq);
 		cpumask_clear_cpu(rq->cpu, old_rd->span);
 		/*
 		 * If we don't want to free the old_rd yet then
 		 * set old_rd to NULL to skip the freeing later
 		 * in this function:
 		 */
 		if (!atomic_dec_and_test(&old_rd->refcount))
 			old_rd = NULL;
 	}
 	atomic_inc(&rd->refcount);
 	rq->rd = rd;
 	cpumask_set_cpu(rq->cpu, rd->span);
 	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
 		set_rq_online(rq);
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 	/* Deferred free happens outside rq->lock. */
 	if (old_rd)
 		call_rcu_sched(&old_rd->rcu, free_rootdomain);
 }
 /*
  * Initialize a root_domain in place.
  *
  * Zeroes @rd, allocates its four cpumasks (span, online, dlo_mask,
  * rto_mask) and initializes the deadline bandwidth, cpudl and cpupri
  * state.
  *
  * Returns 0 on success or -ENOMEM on failure.  On failure everything that
  * was set up before the failing step is undone in reverse order, so the
  * caller only has to dispose of @rd itself.
  */
 static int init_rootdomain(struct root_domain *rd)
 {
 	memset(rd, 0, sizeof(*rd));
 	if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
 		goto out;
 	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
 		goto free_span;
 	if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
 		goto free_online;
 	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
 		goto free_dlo_mask;
 	init_dl_bw(&rd->dl_bw);
 	/* rto_mask is already allocated here, so unwind from free_rto_mask. */
 	if (cpudl_init(&rd->cpudl) != 0)
 		goto free_rto_mask;
 	/* cpudl is initialized by now and must be cleaned up on failure. */
 	if (cpupri_init(&rd->cpupri) != 0)
 		goto free_cpudl;
 	return 0;
 free_cpudl:
 	cpudl_cleanup(&rd->cpudl);
 free_rto_mask:
 	free_cpumask_var(rd->rto_mask);
 free_dlo_mask:
 	free_cpumask_var(rd->dlo_mask);
 free_online:
 	free_cpumask_var(rd->online);
 free_span:
 	free_cpumask_var(rd->span);
 out:
 	return -ENOMEM;
 }
 /*
  * By default the system creates a single root-domain with all cpus as
  * members (mimicking the global state we have today).
  */
 struct root_domain def_root_domain;
 /*
  * Set up def_root_domain and give it an initial reference count of 1.
  * NOTE(review): the return value of init_rootdomain() is not checked here.
  */
 static void init_defrootdomain(void)
 {
 	init_rootdomain(&def_root_domain);
 	atomic_set(&def_root_domain.refcount, 1);
 }
 /*
  * Allocate and initialize a new root_domain.
  * Returns the new domain, or NULL if either the allocation or
  * init_rootdomain() failed (the memory is released in the latter case).
  */
 static struct root_domain *alloc_rootdomain(void)
 {
 	struct root_domain *rd = kmalloc(sizeof(*rd), GFP_KERNEL);

 	if (rd && init_rootdomain(rd) != 0) {
 		kfree(rd);
 		rd = NULL;
 	}

 	return rd;
 }
 /*
  * Free every sched_group on the circular list that @sg belongs to.
  * With @free_sgp set, each group's sgp reference is dropped as well and
  * the sgp is freed when that was the last reference.  NULL @sg is a no-op.
  */
 static void free_sched_groups(struct sched_group *sg, int free_sgp)
 {
 	struct sched_group *head = sg;
 	struct sched_group *next;

 	if (!head)
 		return;

 	do {
 		/* Grab the successor before the current node goes away. */
 		next = sg->next;

 		if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
 			kfree(sg->sgp);
 		kfree(sg);

 		sg = next;
 	} while (sg != head);
 }
 /*
  * RCU callback that frees a sched_domain together with the groups it
  * owns.  @rcu is the rcu_head embedded in the sched_domain.
  */
 static void free_sched_domain(struct rcu_head *rcu)
 {
 	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
 	/*
 	 * If its an overlapping domain it has private groups, iterate and
 	 * nuke them all.
 	 */
 	if (sd->flags & SD_OVERLAP) {
 		free_sched_groups(sd->groups, 1);
 	} else if (atomic_dec_and_test(&sd->groups->ref)) {
 		/* Last reference on the group: free it and its power data. */
 		kfree(sd->groups->sgp);
 		kfree(sd->groups);
 	}
 	kfree(sd);
 }
 /*
  * Queue @sd for freeing through call_rcu(); the actual release happens in
  * free_sched_domain().  @cpu is not used by this function.
  */
 static void destroy_sched_domain(struct sched_domain *sd, int cpu)
 {
 	call_rcu(&sd->rcu, free_sched_domain);
 }
 /*
  * Queue @sd and all of its ancestors (following ->parent) for destruction.
  * A NULL @sd is accepted and does nothing.
  */
 static void destroy_sched_domains(struct sched_domain *sd, int cpu)
 {
 	while (sd) {
 		struct sched_domain *parent = sd->parent;

 		destroy_sched_domain(sd, cpu);
 		sd = parent;
 	}
 }
 /*
  * Keep a special pointer to the highest sched_domain that has
  * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this
  * allows us to avoid some pointer chasing in select_idle_sibling().
  *
  * Also keep a unique ID per domain (we use the first cpu number in
  * the cpumask of the domain), this allows us to quickly tell if
  * two cpus are in the same cache domain, see cpus_share_cache().
  *
  * All of these are filled in by update_top_cache_domain().
  */
 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
 DEFINE_PER_CPU(struct sched_domain *, sd_numa);
 DEFINE_PER_CPU(struct sched_domain *, sd_busy);
 DEFINE_PER_CPU(struct sched_domain *, sd_asym);
 /*
  * Refresh the per-cpu shortcut pointers for @cpu:
  *   sd_llc      - highest domain with SD_SHARE_PKG_RESOURCES (may be NULL)
  *   sd_llc_id   - first CPU of that domain's span, or @cpu if none
  *   sd_llc_size - weight of that domain's span, or 1 if none
  *   sd_busy     - parent of the sd_llc domain (NULL if none)
  *   sd_numa     - lowest domain with SD_NUMA
  *   sd_asym     - highest domain with SD_ASYM_PACKING
  */
 static void update_top_cache_domain(int cpu)
 {
 	struct sched_domain *sd;
 	struct sched_domain *busy_sd = NULL;
 	int id = cpu;
 	int size = 1;
 	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
 	if (sd) {
 		id = cpumask_first(sched_domain_span(sd));
 		size = cpumask_weight(sched_domain_span(sd));
 		busy_sd = sd->parent; /* sd_busy */
 	}
 	rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
 	per_cpu(sd_llc_size, cpu) = size;
 	per_cpu(sd_llc_id, cpu) = id;
 	sd = lowest_flag_domain(cpu, SD_NUMA);
 	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
 	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
 	rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
 }
 /*
  * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
  * hold the hotplug lock.
  *
  * Before attaching, degenerate levels are unlinked from the hierarchy and
  * queued for destruction.  The runqueue is then attached to root domain
  * @rd, rq->sd is switched to the new hierarchy, the previous hierarchy is
  * destroyed and the per-cpu cache-domain shortcuts are refreshed.
  */
 static void
 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	struct sched_domain *tmp;
 	/* Remove the sched domains which do not contribute to scheduling. */
 	for (tmp = sd; tmp; ) {
 		struct sched_domain *parent = tmp->parent;
 		if (!parent)
 			break;
 		if (sd_parent_degenerate(tmp, parent)) {
 			/* Splice the parent out; tmp stays to test its new parent. */
 			tmp->parent = parent->parent;
 			if (parent->parent)
 				parent->parent->child = tmp;
 			/*
 			 * Transfer SD_PREFER_SIBLING down in case of a
 			 * degenerate parent; the spans match for this
 			 * so the property transfers.
 			 */
 			if (parent->flags & SD_PREFER_SIBLING)
 				tmp->flags |= SD_PREFER_SIBLING;
 			destroy_sched_domain(parent, cpu);
 		} else
 			tmp = tmp->parent;
 	}
 	/* The base domain itself may be degenerate: drop it as well. */
 	if (sd && sd_degenerate(sd)) {
 		tmp = sd;
 		sd = sd->parent;
 		destroy_sched_domain(tmp, cpu);
 		if (sd)
 			sd->child = NULL;
 	}
 	sched_domain_debug(sd, cpu);
 	rq_attach_root(rq, rd);
 	/* Publish the new hierarchy, then retire the old one. */
 	tmp = rq->sd;
 	rcu_assign_pointer(rq->sd, sd);
 	destroy_sched_domains(tmp, cpu);
 	update_top_cache_domain(cpu);
 }
 /* cpus with isolated domains */
 static cpumask_var_t cpu_isolated_map;
 /*
  * Setup the mask of cpus configured for isolated domains.
  * Handler for the "isolcpus=" boot option: allocates cpu_isolated_map and
  * parses the CPU list in @str into it.
  * NOTE(review): the result of cpulist_parse() is not checked.
  */
 static int __init isolated_cpu_setup(char *str)
 {
 	alloc_bootmem_cpumask_var(&cpu_isolated_map);
 	cpulist_parse(str, cpu_isolated_map);
 	return 1;
 }
 __setup("isolcpus=", isolated_cpu_setup);
 /* Topology mask for the CPU level: all CPUs on the same node as @cpu. */
 static const struct cpumask *cpu_cpu_mask(int cpu)
 {
 	int node = cpu_to_node(cpu);

 	return cpumask_of_node(node);
 }
 /*
  * Per topology level storage: for every CPU one pointer to the
  * sched_domain, sched_group and sched_group_power allocated for that
  * level (see __sdt_alloc() / __sdt_free()).
  */
 struct sd_data {
 	struct sched_domain **__percpu sd;
 	struct sched_group **__percpu sg;
 	struct sched_group_power **__percpu sgp;
 };
 /* Temporary state used while building one set of sched domains. */
 struct s_data {
 	struct sched_domain ** __percpu sd;
 	struct root_domain	*rd;
 };
 /*
  * How far allocation got in __visit_domain_allocation_hell(); tells
  * __free_domain_allocs() where to start unwinding.
  */
 enum s_alloc {
 	sa_rootdomain,
 	sa_sd,
 	sa_sd_storage,
 	sa_none,
 };
 struct sched_domain_topology_level;
 /* Initializes and returns the sched_domain of @cpu for level @tl. */
 typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
 /* Returns the CPUs that share this topology level with @cpu. */
 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
 /* Level flag: domains built for this level get SD_OVERLAP. */
 #define SDTL_OVERLAP	0x01
 /* One entry of the topology table walked by for_each_sd_topology(). */
 struct sched_domain_topology_level {
 	sched_domain_init_f init;
 	sched_domain_mask_f mask;
 	int		    flags;
 	int		    numa_level;
 	struct sd_data      data;
 };
 /*
  * Build an iteration mask that can exclude certain CPUs from the upwards
  * domain traversal.
  *
  * Asymmetric node setups can produce a domain tree of unequal depth, so
  * domains that already cover the entire range have to be skipped.
  *
  * In that situation build_sched_domains() stopped iterating early and the
  * sibling's span is empty.  A domain always contains the CPU it was built
  * for, which is what the test below relies on: only CPUs that appear in
  * their own sibling domain's span are added to the group's mask.
  */
 static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
 {
 	const struct cpumask *span = sched_domain_span(sd);
 	struct sd_data *sdd = sd->private;
 	int cpu;

 	for_each_cpu(cpu, span) {
 		struct sched_domain *sibling = *per_cpu_ptr(sdd->sd, cpu);

 		if (cpumask_test_cpu(cpu, sched_domain_span(sibling)))
 			cpumask_set_cpu(cpu, sched_group_mask(sg));
 	}
 }
 /*
  * Canonical balance cpu of a group: the first cpu that is set both in
  * the group's cpu mask and in its iteration mask.
  */
 int group_balance_cpu(struct sched_group *sg)
 {
 	int cpu = cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));

 	return cpu;
 }
 /*
  * Build the private, circular group list of the overlapping domain @sd
  * that belongs to @cpu.
  *
  * One group is allocated for each not-yet-covered CPU of the span whose
  * sibling domain contains that CPU.  sd->groups is pointed at the group
  * chosen as canonical first group (see below).
  *
  * Returns 0 on success, -ENOMEM if a group allocation fails; groups
  * allocated so far are freed in that case.
  */
 static int
 build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 {
 	struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
 	const struct cpumask *span = sched_domain_span(sd);
 	struct cpumask *covered = sched_domains_tmpmask;
 	struct sd_data *sdd = sd->private;
 	struct sched_domain *child;
 	int i;
 	cpumask_clear(covered);
 	for_each_cpu(i, span) {
 		struct cpumask *sg_span;
 		if (cpumask_test_cpu(i, covered))
 			continue;
 		child = *per_cpu_ptr(sdd->sd, i);
 		/* See the comment near build_group_mask(). */
 		if (!cpumask_test_cpu(i, sched_domain_span(child)))
 			continue;
 		sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
 				GFP_KERNEL, cpu_to_node(cpu));
 		if (!sg)
 			goto fail;
 		sg_span = sched_group_cpus(sg);
 		/* The group spans the child domain, or just CPU i if none. */
 		if (child->child) {
 			child = child->child;
 			cpumask_copy(sg_span, sched_domain_span(child));
 		} else
 			cpumask_set_cpu(i, sg_span);
 		cpumask_or(covered, covered, sg_span);
 		sg->sgp = *per_cpu_ptr(sdd->sgp, i);
 		/* Only the first user of this sgp builds the iteration mask. */
 		if (atomic_inc_return(&sg->sgp->ref) == 1)
 			build_group_mask(sd, sg);
 		/*
 		 * Initialize sgp->power such that even if we mess up the
 		 * domains and no possible iteration will get us here, we won't
 		 * die on a /0 trap.
 		 */
 		sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
 		sg->sgp->power_orig = sg->sgp->power;
 		/*
 		 * Make sure the first group of this domain contains the
 		 * canonical balance cpu. Otherwise the sched_domain iteration
 		 * breaks. See update_sg_lb_stats().
 		 */
 		if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
 		    group_balance_cpu(sg) == cpu)
 			groups = sg;
 		/* Append to the list and keep it circular at every step. */
 		if (!first)
 			first = sg;
 		if (last)
 			last->next = sg;
 		last = sg;
 		last->next = first;
 	}
 	sd->groups = groups;
 	return 0;
 fail:
 	free_sched_groups(first, 0);
 	return -ENOMEM;
 }
 /*
  * Find the CPU that represents @cpu's group at the level described by
  * @sdd: the first CPU of the child domain's span, or @cpu itself when the
  * domain has no child.
  *
  * If @sg is non-NULL, the group stored for that representative CPU is
  * returned through it, its sgp pointer is wired up and the sgp reference
  * count is set to 1.
  *
  * Returns the representative CPU number.
  */
 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
 {
 	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
 	struct sched_domain *child = sd->child;
 	if (child)
 		cpu = cpumask_first(sched_domain_span(child));
 	if (sg) {
 		*sg = *per_cpu_ptr(sdd->sg, cpu);
 		(*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
 		atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
 	}
 	return cpu;
 }
 /*
  * build_sched_groups will build a circular linked list of the groups
  * covered by the given span, and will set each group's cpumask correctly,
  * and its sgp->power to 0.
  *
  * Every CPU takes a reference on its own group, but only the first CPU of
  * the span actually builds the list; for all other CPUs the function
  * returns right after taking the reference.
  *
  * Assumes the sched_domain tree is fully constructed.  Always returns 0.
  */
 static int
 build_sched_groups(struct sched_domain *sd, int cpu)
 {
 	struct sched_group *first = NULL, *last = NULL;
 	struct sd_data *sdd = sd->private;
 	const struct cpumask *span = sched_domain_span(sd);
 	struct cpumask *covered;
 	int i;
 	get_group(cpu, sdd, &sd->groups);
 	atomic_inc(&sd->groups->ref);
 	if (cpu != cpumask_first(span))
 		return 0;
 	lockdep_assert_held(&sched_domains_mutex);
 	covered = sched_domains_tmpmask;
 	cpumask_clear(covered);
 	for_each_cpu(i, span) {
 		struct sched_group *sg;
 		int group, j;
 		if (cpumask_test_cpu(i, covered))
 			continue;
 		group = get_group(i, sdd, &sg);
 		cpumask_clear(sched_group_cpus(sg));
 		sg->sgp->power = 0;
 		cpumask_setall(sched_group_mask(sg));
 		/* Collect every CPU of the span that maps to the same group. */
 		for_each_cpu(j, span) {
 			if (get_group(j, sdd, NULL) != group)
 				continue;
 			cpumask_set_cpu(j, covered);
 			cpumask_set_cpu(j, sched_group_cpus(sg));
 		}
 		if (!first)
 			first = sg;
 		if (last)
 			last->next = sg;
 		last = sg;
 	}
 	/* Close the ring. */
 	last->next = first;
 	return 0;
 }
 /*
  * Initialize sched groups cpu_power.
  *
  * cpu_power indicates the capacity of sched group, which is used while
  * distributing the load between different sched groups in a sched domain.
  * Typically cpu_power for all the groups in a sched domain will be same unless
  * there are asymmetries in the topology. If there are asymmetries, group
  * having more cpu_power will pickup more load compared to the group having
  * less cpu_power.
  *
  * Every call recomputes group_weight for all groups of @sd; the power
  * update and the nr_busy_cpus reset are only done when @cpu is the
  * balance cpu of the domain's first group.
  */
 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 {
 	struct sched_group *sg = sd->groups;
 	WARN_ON(!sg);
 	do {
 		sg->group_weight = cpumask_weight(sched_group_cpus(sg));
 		sg = sg->next;
 	} while (sg != sd->groups);
 	/* The loop ends with sg back at sd->groups. */
 	if (cpu != group_balance_cpu(sg))
 		return;
 	update_group_power(sd, cpu);
 	atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
 }
 /*
  * Weak default: contributes no SD_ASYM_PACKING flag (evaluates to 0).
  * Being __weak, another definition can replace it at link time.
  */
 int __weak arch_sd_sibling_asym_packing(void)
 {
        return 0*SD_ASYM_PACKING;
 }
 /*
  * Initializers for schedule domains
  * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
  *
  * SD_INIT_FUNC(type) generates sd_init_<type>(), which fetches the
  * per-cpu sched_domain of the given topology level, overwrites it with
  * SD_<type>_INIT, names it (CONFIG_SCHED_DEBUG only) and points
  * sd->private at the level's sd_data.
  */
 #ifdef CONFIG_SCHED_DEBUG
 # define SD_INIT_NAME(sd, type)		sd->name = #type
 #else
 # define SD_INIT_NAME(sd, type)		do { } while (0)
 #endif
 #define SD_INIT_FUNC(type)						\
 static noinline struct sched_domain *					\
 sd_init_##type(struct sched_domain_topology_level *tl, int cpu) 	\
 {									\
 	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);	\
 	*sd = SD_##type##_INIT;						\
 	SD_INIT_NAME(sd, type);						\
 	sd->private = &tl->data;					\
 	return sd;							\
 }
 SD_INIT_FUNC(CPU)
 #ifdef CONFIG_SCHED_SMT
  SD_INIT_FUNC(SIBLING)
 #endif
 #ifdef CONFIG_SCHED_MC
  SD_INIT_FUNC(MC)
 #endif
 #ifdef CONFIG_SCHED_BOOK
  SD_INIT_FUNC(BOOK)
 #endif
 /* Boot-time default for relax_domain_level; negative means "not set". */
 static int default_relax_domain_level = -1;
 /* Highest sd->level seen so far, maintained by build_sched_domain(). */
 int sched_domain_level_max;
 /*
  * Handler for the "relax_domain_level=" boot option.  A value that cannot
  * be parsed as an integer only triggers a warning.
  */
 static int __init setup_relax_domain_level(char *str)
 {
 	if (kstrtoint(str, 0, &default_relax_domain_level))
 		pr_warn("Unable to set relax_domain_level\n");
 	return 1;
 }
 __setup("relax_domain_level=", setup_relax_domain_level);
 /*
  * Apply the relax_domain_level policy to @sd.
  *
  * The requested level comes from @attr when it carries a non-negative
  * value, otherwise from the boot-time default; if neither is set the
  * domain is left alone.  Domains above the requested level get wake and
  * newidle balancing switched off, all others get it switched on.
  */
 static void set_domain_attribute(struct sched_domain *sd,
 				 struct sched_domain_attr *attr)
 {
 	int request;

 	if (attr && attr->relax_domain_level >= 0)
 		request = attr->relax_domain_level;
 	else if (default_relax_domain_level >= 0)
 		request = default_relax_domain_level;
 	else
 		return;

 	if (request < sd->level) {
 		/* turn off idle balance on this domain */
 		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
 		return;
 	}

 	/* turn on idle balance on this domain */
 	sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
 }
 static void __sdt_free(const struct cpumask *cpu_map);
 static int __sdt_alloc(const struct cpumask *cpu_map);
 /*
  * Undo the allocations made by __visit_domain_allocation_hell().
  *
  * @what names the last stage that succeeded; the switch deliberately
  * falls through so that this stage and every earlier one are released.
  * The root domain is only freed when nobody holds a reference on it.
  */
 static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
 				 const struct cpumask *cpu_map)
 {
 	switch (what) {
 	case sa_rootdomain:
 		if (!atomic_read(&d->rd->refcount))
 			free_rootdomain(&d->rd->rcu); /* fall through */
 	case sa_sd:
 		free_percpu(d->sd); /* fall through */
 	case sa_sd_storage:
 		__sdt_free(cpu_map); /* fall through */
 	case sa_none:
 		break;
 	}
 }
 /*
  * Allocate everything needed to build the domains for @cpu_map: the
  * per-level storage, the per-cpu base-domain pointers and a root domain.
  *
  * Returns sa_rootdomain when all three steps succeeded.  Any other value
  * marks the point of failure and is what the caller must hand to
  * __free_domain_allocs() to release what was allocated so far.
  */
 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
 						   const struct cpumask *cpu_map)
 {
 	memset(d, 0, sizeof(*d));
 	/* A partially completed __sdt_alloc() still needs __sdt_free(). */
 	if (__sdt_alloc(cpu_map))
 		return sa_sd_storage;
 	d->sd = alloc_percpu(struct sched_domain *);
 	if (!d->sd)
 		return sa_sd_storage;
 	d->rd = alloc_rootdomain();
 	if (!d->rd)
 		return sa_sd;
 	return sa_rootdomain;
 }
 /*
  * NULL the sd_data elements we've used to build the sched_domain and
  * sched_group structure so that the subsequent __free_domain_allocs()
  * will not free the data we're using.
  *
  * The domain slot is always claimed; the group and group-power slots are
  * only claimed when their reference count is non-zero.
  */
 static void claim_allocations(int cpu, struct sched_domain *sd)
 {
 	struct sd_data *sdd = sd->private;
 	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
 	*per_cpu_ptr(sdd->sd, cpu) = NULL;
 	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
 		*per_cpu_ptr(sdd->sg, cpu) = NULL;
 	if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
 		*per_cpu_ptr(sdd->sgp, cpu) = NULL;
 }
 #ifdef CONFIG_SCHED_SMT
 /* Topology mask for the SMT level: topology_thread_cpumask() of @cpu. */
 static const struct cpumask *cpu_smt_mask(int cpu)
 {
 	return topology_thread_cpumask(cpu);
 }
 #endif
 /*
  * Topology list, bottom-up.
  *
  * Each entry pairs a domain initializer with the mask function of its
  * level; the list is terminated by an entry whose init pointer is NULL.
  */
 static struct sched_domain_topology_level default_topology[] = {
 #ifdef CONFIG_SCHED_SMT
 	{ sd_init_SIBLING, cpu_smt_mask, },
 #endif
 #ifdef CONFIG_SCHED_MC
 	{ sd_init_MC, cpu_coregroup_mask, },
 #endif
 #ifdef CONFIG_SCHED_BOOK
 	{ sd_init_BOOK, cpu_book_mask, },
 #endif
 	{ sd_init_CPU, cpu_cpu_mask, },
 	{ NULL, },
 };
 /* Topology table currently in use. */
 static struct sched_domain_topology_level *sched_domain_topology = default_topology;
 /* Iterate over all levels of the active table, up to the NULL-init entry. */
 #define for_each_sd_topology(tl)			\
 	for (tl = sched_domain_topology; tl->init; tl++)
 #ifdef CONFIG_NUMA
 /* Number of valid entries in the two NUMA tables below. */
 static int sched_domains_numa_levels;
 /* Unique node distances, indexed by NUMA level. */
 static int *sched_domains_numa_distance;
 /* CPU masks indexed as [level][node]. */
 static struct cpumask ***sched_domains_numa_masks;
 /* Level handed from sd_numa_init() to sd_numa_mask(). */
 static int sched_domains_curr_level;
 /*
  * Extra flags for a NUMA level: exec/fork balancing and affine wakeups
  * are granted only while the level's distance does not exceed
  * RECLAIM_DISTANCE; beyond that no extra flags are returned.
  */
 static inline int sd_local_flags(int level)
 {
 	if (sched_domains_numa_distance[level] <= RECLAIM_DISTANCE)
 		return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;

 	return 0;
 }
 /*
  * Topology-level initializer for NUMA domains.
  *
  * Fills in the per-cpu sched_domain of level @tl for @cpu; the balancing
  * intervals are derived from the number of CPUs in the NUMA mask of the
  * CPU's node at this level.  Also records the level in
  * sched_domains_curr_level for sd_numa_mask().
  *
  * Returns the initialized domain.
  */
 static struct sched_domain *
 sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
 {
 	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
 	int level = tl->numa_level;
 	int sd_weight = cpumask_weight(
 			sched_domains_numa_masks[level][cpu_to_node(cpu)]);
 	*sd = (struct sched_domain){
 		.min_interval		= sd_weight,
 		.max_interval		= 2*sd_weight,
 		.busy_factor		= 32,
 		.imbalance_pct		= 125,
 		.cache_nice_tries	= 2,
 		.busy_idx		= 3,
 		.idle_idx		= 2,
 		.newidle_idx		= 0,
 		.wake_idx		= 0,
 		.forkexec_idx		= 0,
 		.flags			= 1*SD_LOAD_BALANCE
 					| 1*SD_BALANCE_NEWIDLE
 					| 0*SD_BALANCE_EXEC
 					| 0*SD_BALANCE_FORK
 					| 0*SD_BALANCE_WAKE
 					| 0*SD_WAKE_AFFINE
 					| 0*SD_SHARE_CPUPOWER
 					| 0*SD_SHARE_PKG_RESOURCES
 					| 1*SD_SERIALIZE
 					| 0*SD_PREFER_SIBLING
 					| 1*SD_NUMA
 					| sd_local_flags(level)
 					,
 		.last_balance		= jiffies,
 		.balance_interval	= sd_weight,
 		.max_newidle_lb_cost	= 0,
 		.next_decay_max_lb_cost	= jiffies,
 	};
 	SD_INIT_NAME(sd, NUMA);
 	sd->private = &tl->data;
 	/*
 	 * Ugly hack to pass state to sd_numa_mask()...
 	 */
 	sched_domains_curr_level = tl->numa_level;
 	return sd;
 }
 /*
  * Mask function for NUMA levels.  The level is not a parameter; it is
  * taken from sched_domains_curr_level.
  */
 static const struct cpumask *sd_numa_mask(int cpu)
 {
 	int node = cpu_to_node(cpu);

 	return sched_domains_numa_masks[sched_domains_curr_level][node];
 }
 /*
  * Print @str as a warning followed by the full node distance table.
  * Only the first call prints anything; later calls return immediately.
  */
 static void sched_numa_warn(const char *str)
 {
 	static int done = false;
 	int i,j;
 	if (done)
 		return;
 	done = true;
 	printk(KERN_WARNING "ERROR: %s\n\n", str);
 	/* One output row per node i, one column per node j. */
 	for (i = 0; i < nr_node_ids; i++) {
 		printk(KERN_WARNING "  ");
 		for (j = 0; j < nr_node_ids; j++)
 			printk(KERN_CONT "%02d ", node_distance(i,j));
 		printk(KERN_CONT "\n");
 	}
 	printk(KERN_WARNING "\n");
 }
 /*
  * Report whether @distance is already known: either it equals the local
  * distance node_distance(0, 0) or it is one of the recorded level
  * distances.
  */
 static bool find_numa_distance(int distance)
 {
 	int i = 0;

 	if (distance == node_distance(0, 0))
 		return true;

 	while (i < sched_domains_numa_levels) {
 		if (sched_domains_numa_distance[i] == distance)
 			return true;
 		i++;
 	}

 	return false;
 }
 /*
  * Derive the NUMA topology levels from the node distance table.
  *
  * Collects the unique node distances, builds a cpumask per (level, node)
  * and installs a topology table consisting of the default levels followed
  * by one NUMA level per unique distance.
  *
  * NOTE(review): every allocation failure below simply returns; earlier
  * allocations are not released and sched_domains_numa_levels is left at 0
  * (or unset if the very first allocation fails).
  */
 static void sched_init_numa(void)
 {
 	int next_distance, curr_distance = node_distance(0, 0);
 	struct sched_domain_topology_level *tl;
 	int level = 0;
 	int i, j, k;
 	sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
 	if (!sched_domains_numa_distance)
 		return;
 	/*
 	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
 	 * unique distances in the node_distance() table.
 	 *
 	 * Assumes node_distance(0,j) includes all distances in
 	 * node_distance(i,j) in order to avoid cubic time.
 	 */
 	next_distance = curr_distance;
 	for (i = 0; i < nr_node_ids; i++) {
 		for (j = 0; j < nr_node_ids; j++) {
 			for (k = 0; k < nr_node_ids; k++) {
 				int distance = node_distance(i, k);
 				/* Track the smallest distance above curr_distance. */
 				if (distance > curr_distance &&
 				    (distance < next_distance ||
 				     next_distance == curr_distance))
 					next_distance = distance;
 				/*
 				 * While not a strong assumption it would be nice to know
 				 * about cases where if node A is connected to B, B is not
 				 * equally connected to A.
 				 */
 				if (sched_debug() && node_distance(k, i) != distance)
 					sched_numa_warn("Node-distance not symmetric");
 				if (sched_debug() && i && !find_numa_distance(distance))
 					sched_numa_warn("Node-0 not representative");
 			}
 			/* Record a newly found distance, or stop when there is none. */
 			if (next_distance != curr_distance) {
 				sched_domains_numa_distance[level++] = next_distance;
 				sched_domains_numa_levels = level;
 				curr_distance = next_distance;
 			} else break;
 		}
 		/*
 		 * In case of sched_debug() we verify the above assumption.
 		 */
 		if (!sched_debug())
 			break;
 	}
 	/*
 	 * 'level' contains the number of unique distances, excluding the
 	 * identity distance node_distance(i,i).
 	 *
 	 * The sched_domains_numa_distance[] array includes the actual distance
 	 * numbers.
 	 */
 	/*
 	 * Here, we should temporarily reset sched_domains_numa_levels to 0.
 	 * If it fails to allocate memory for array sched_domains_numa_masks[][],
 	 * the array will contain fewer than 'level' members. This could be
 	 * dangerous when we use it to iterate array sched_domains_numa_masks[][]
 	 * in other functions.
 	 *
 	 * We reset it to 'level' at the end of this function.
 	 */
 	sched_domains_numa_levels = 0;
 	sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
 	if (!sched_domains_numa_masks)
 		return;
 	/*
 	 * Now for each level, construct a mask per node which contains all
 	 * cpus of nodes that are that many hops away from us.
 	 */
 	for (i = 0; i < level; i++) {
 		sched_domains_numa_masks[i] =
 			kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
 		if (!sched_domains_numa_masks[i])
 			return;
 		for (j = 0; j < nr_node_ids; j++) {
 			struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
 			if (!mask)
 				return;
 			sched_domains_numa_masks[i][j] = mask;
 			for (k = 0; k < nr_node_ids; k++) {
 				if (node_distance(j, k) > sched_domains_numa_distance[i])
 					continue;
 				cpumask_or(mask, mask, cpumask_of_node(k));
 			}
 		}
 	}
 	/* Room for the default table (terminator included) plus NUMA levels. */
 	tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
 			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
 	if (!tl)
 		return;
 	/*
 	 * Copy the default topology bits..
 	 */
 	for (i = 0; default_topology[i].init; i++)
 		tl[i] = default_topology[i];
 	/*
 	 * .. and append 'j' levels of NUMA goodness.
 	 */
 	for (j = 0; j < level; i++, j++) {
 		tl[i] = (struct sched_domain_topology_level){
 			.init = sd_numa_init,
 			.mask = sd_numa_mask,
 			.flags = SDTL_OVERLAP,
 			.numa_level = j,
 		};
 	}
 	sched_domain_topology = tl;
 	sched_domains_numa_levels = level;
 }
 /*
  * Add @cpu to the NUMA mask of every (level, node) pair whose level
  * distance is large enough to reach the node @cpu lives on.
  */
 static void sched_domains_numa_masks_set(int cpu)
 {
 	int node = cpu_to_node(cpu);
 	int i, j;

 	for (i = 0; i < sched_domains_numa_levels; i++) {
 		for (j = 0; j < nr_node_ids; j++) {
 			if (node_distance(j, node) > sched_domains_numa_distance[i])
 				continue;
 			cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
 		}
 	}
 }
 /* Remove @cpu from the NUMA mask of every level and every node. */
 static void sched_domains_numa_masks_clear(int cpu)
 {
 	int i, j;
 	for (i = 0; i < sched_domains_numa_levels; i++) {
 		for (j = 0; j < nr_node_ids; j++)
 			cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
 	}
 }
 /*
  * Hotplug notifier keeping sched_domains_numa_masks[level][node] in sync:
  * a CPU coming online is added to the masks, a dead CPU is removed from
  * them.  Any other action is ignored (NOTIFY_DONE).
  */
 static int sched_domains_numa_masks_update(struct notifier_block *nfb,
 					   unsigned long action,
 					   void *hcpu)
 {
 	int cpu = (long)hcpu;
 	unsigned long event = action & ~CPU_TASKS_FROZEN;

 	if (event == CPU_ONLINE)
 		sched_domains_numa_masks_set(cpu);
 	else if (event == CPU_DEAD)
 		sched_domains_numa_masks_clear(cpu);
 	else
 		return NOTIFY_DONE;

 	return NOTIFY_OK;
 }
 #else
 /* Without CONFIG_NUMA there is no NUMA topology to set up. */
 static inline void sched_init_numa(void)
 {
 }
 /* Without CONFIG_NUMA the notifier has nothing to maintain; returns 0. */
 static int sched_domains_numa_masks_update(struct notifier_block *nfb,
 					   unsigned long action,
 					   void *hcpu)
 {
 	return 0;
 }
 #endif /* CONFIG_NUMA */
 /*
  * Allocate the per-level, per-cpu storage used to build sched domains:
  * for every topology level the three per-cpu pointer arrays, and for
  * every CPU in @cpu_map a zeroed sched_domain, sched_group and
  * sched_group_power (each followed by room for a cpumask), placed on the
  * CPU's node.
  *
  * Returns 0 on success or -ENOMEM.  Nothing is released here on failure;
  * the caller is expected to run __sdt_free() over the partial result.
  */
 static int __sdt_alloc(const struct cpumask *cpu_map)
 {
 	struct sched_domain_topology_level *tl;
 	int j;
 	for_each_sd_topology(tl) {
 		struct sd_data *sdd = &tl->data;
 		sdd->sd = alloc_percpu(struct sched_domain *);
 		if (!sdd->sd)
 			return -ENOMEM;
 		sdd->sg = alloc_percpu(struct sched_group *);
 		if (!sdd->sg)
 			return -ENOMEM;
 		sdd->sgp = alloc_percpu(struct sched_group_power *);
 		if (!sdd->sgp)
 			return -ENOMEM;
 		for_each_cpu(j, cpu_map) {
 			struct sched_domain *sd;
 			struct sched_group *sg;
 			struct sched_group_power *sgp;
 		       	sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
 					GFP_KERNEL, cpu_to_node(j));
 			if (!sd)
 				return -ENOMEM;
 			*per_cpu_ptr(sdd->sd, j) = sd;
 			sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
 					GFP_KERNEL, cpu_to_node(j));
 			if (!sg)
 				return -ENOMEM;
 			/* A fresh group starts out as a ring of one. */
 			sg->next = sg;
 			*per_cpu_ptr(sdd->sg, j) = sg;
 			sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
 					GFP_KERNEL, cpu_to_node(j));
 			if (!sgp)
 				return -ENOMEM;
 			*per_cpu_ptr(sdd->sgp, j) = sgp;
 		}
 	}
 	return 0;
 }
 /*
  * Release the storage set up by __sdt_alloc() for the CPUs in @cpu_map.
  *
  * Tolerates a partially completed allocation: each per-cpu array is only
  * walked when it exists, and the array pointers are reset to NULL after
  * being freed.  Slots that were claimed (set to NULL) by
  * claim_allocations() are skipped naturally because kfree(NULL) is a
  * no-op.
  */
 static void __sdt_free(const struct cpumask *cpu_map)
 {
 	struct sched_domain_topology_level *tl;
 	int j;
 	for_each_sd_topology(tl) {
 		struct sd_data *sdd = &tl->data;
 		for_each_cpu(j, cpu_map) {
 			struct sched_domain *sd;
 			if (sdd->sd) {
 				sd = *per_cpu_ptr(sdd->sd, j);
 				/* Overlapping domains own a private group list. */
 				if (sd && (sd->flags & SD_OVERLAP))
 					free_sched_groups(sd->groups, 0);
 				kfree(*per_cpu_ptr(sdd->sd, j));
 			}
 			if (sdd->sg)
 				kfree(*per_cpu_ptr(sdd->sg, j));
 			if (sdd->sgp)
 				kfree(*per_cpu_ptr(sdd->sgp, j));
 		}
 		free_percpu(sdd->sd);
 		sdd->sd = NULL;
 		free_percpu(sdd->sg);
 		sdd->sg = NULL;
 		free_percpu(sdd->sgp);
 		sdd->sgp = NULL;
 	}
 }
 /*
  * Build the sched_domain of @cpu for topology level @tl and stack it on
  * top of @child.
  *
  * The span is the level's mask restricted to @cpu_map.  When @child is
  * given, the new domain becomes its parent, gets level child->level + 1
  * and sched_domain_level_max is raised if needed.  Finally the
  * relax_domain_level attribute is applied.
  *
  * Returns the new domain, or @child if the level's init callback returned
  * NULL.
  */
 struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
 		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
 		struct sched_domain *child, int cpu)
 {
 	struct sched_domain *sd = tl->init(tl, cpu);
 	if (!sd)
 		return child;
 	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
 	if (child) {
 		sd->level = child->level + 1;
 		sched_domain_level_max = max(sched_domain_level_max, sd->level);
 		child->parent = sd;
 		sd->child = child;
 	}
 	set_domain_attribute(sd, attr);
 	return sd;
 }
 /*
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
  *
  * Returns 0 on success and -ENOMEM if the initial allocation or a group
  * build step fails. __free_domain_allocs() runs on both the success and
  * the error path.
  */
 static int build_sched_domains(const struct cpumask *cpu_map,
 			       struct sched_domain_attr *attr)
 {
 	enum s_alloc alloc_state;
 	struct sched_domain *sd;
 	struct s_data d;
 	int i, ret = -ENOMEM;
 	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
 	if (alloc_state != sa_rootdomain)
 		goto error;
 	/* Set up domains for cpus specified by the cpu_map. */
 	for_each_cpu(i, cpu_map) {
 		struct sched_domain_topology_level *tl;
 		sd = NULL;
 		for_each_sd_topology(tl) {
 			sd = build_sched_domain(tl, cpu_map, attr, sd, i);
 			/* the lowest topology level is the per-cpu base domain */
 			if (tl == sched_domain_topology)
 				*per_cpu_ptr(d.sd, i) = sd;
 			if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
 				sd->flags |= SD_OVERLAP;
 			/* stop once a level already spans the whole cpu_map */
 			if (cpumask_equal(cpu_map, sched_domain_span(sd)))
 				break;
 		}
 	}
 	/* Build the groups for the domains */
 	for_each_cpu(i, cpu_map) {
 		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
 			sd->span_weight = cpumask_weight(sched_domain_span(sd));
 			if (sd->flags & SD_OVERLAP) {
 				if (build_overlap_sched_groups(sd, i))
 					goto error;
 			} else {
 				if (build_sched_groups(sd, i))
 					goto error;
 			}
 		}
 	}
 	/* Calculate CPU power for physical packages and nodes */
 	/* note: this pass walks the CPUs from highest to lowest number */
 	for (i = nr_cpumask_bits-1; i >= 0; i--) {
 		if (!cpumask_test_cpu(i, cpu_map))
 			continue;
 		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
 			claim_allocations(i, sd);
 			init_sched_groups_power(i, sd);
 		}
 	}
 	/* Attach the domains */
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map) {
 		sd = *per_cpu_ptr(d.sd, i);
 		cpu_attach_domain(sd, d.rd, i);
 	}
 	rcu_read_unlock();
 	ret = 0;
 error:
 	/* reached on success too; alloc_state tells the helper what exists */
 	__free_domain_allocs(&d, alloc_state, cpu_map);
 	return ret;
 }
 static cpumask_var_t *doms_cur;	/* current sched domains */
 static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
 static struct sched_domain_attr *dattr_cur;
 				/* attributes of custom domains in 'doms_cur' */
 /*
  * Special case: If a kmalloc of a doms_cur partition (array of
  * cpumask) fails, then fallback to a single sched domain,
  * as determined by the single cpumask fallback_doms.
  */
 static cpumask_var_t fallback_doms;
 /*
  * arch_update_cpu_topology lets virtualized architectures update the
  * cpu core maps. It is supposed to return 1 if the topology changed
  * or 0 if it stayed the same.
  *
  * This is the __weak default: it does nothing and reports "unchanged".
  */
 int __weak arch_update_cpu_topology(void)
 {
 	return 0;
 }
 /*
  * Allocate an array of @ndoms cpumasks for use as a sched domain partition.
  *
  * Returns the array, or NULL if the array itself or any of its cpumasks
  * cannot be allocated; masks allocated before the failure are released.
  * Free the result with free_sched_domains().
  */
 cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
 {
 	unsigned int i;
 	cpumask_var_t *doms;
 	/* kmalloc_array() returns NULL if ndoms * sizeof(*doms) overflows */
 	doms = kmalloc_array(ndoms, sizeof(*doms), GFP_KERNEL);
 	if (!doms)
 		return NULL;
 	for (i = 0; i < ndoms; i++) {
 		if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
 			/* only the first i masks exist at this point */
 			free_sched_domains(doms, i);
 			return NULL;
 		}
 	}
 	return doms;
 }
 /*
  * Free the first @ndoms cpumasks of @doms and then the array itself.
  */
 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
 {
 	unsigned int idx = 0;
 	while (idx < ndoms) {
 		free_cpumask_var(doms[idx]);
 		idx++;
 	}
 	kfree(doms);
 }
 /*
  * Create the initial, single sched domain partition: every CPU of
  * @cpu_map that is not in cpu_isolated_map. Callers must hold the
  * hotplug lock.
  *
  * Returns the result of build_sched_domains().
  */
 static int init_sched_domains(const struct cpumask *cpu_map)
 {
 	int ret;
 	arch_update_cpu_topology();
 	ndoms_cur = 1;
 	doms_cur = alloc_sched_domains(ndoms_cur);
 	/* on allocation failure fall back to the static single mask */
 	if (doms_cur == NULL)
 		doms_cur = &fallback_doms;
 	cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
 	ret = build_sched_domains(doms_cur[0], NULL);
 	register_sched_domain_sysctl();
 	return ret;
 }
 /*
  * Detach sched domains from a group of cpus specified in cpu_map
  * These cpus will now be attached to the NULL domain
  */
 static void detach_destroy_domains(const struct cpumask *cpu_map)
 {
 	int i;
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map)
 		cpu_attach_domain(NULL, &def_root_domain, i);
 	rcu_read_unlock();
 }
 /* handle null as "default" */
 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
 			struct sched_domain_attr *new, int idx_new)
 {
 	struct sched_domain_attr tmp;
 	/* fast path */
 	if (!new && !cur)
 		return 1;
 	tmp = SD_ATTR_INIT;
 	return !memcmp(cur ? (cur + idx_cur) : &tmp,
 			new ? (new + idx_new) : &tmp,
 			sizeof(struct sched_domain_attr));
 }
 /*
  * Partition sched domains as specified by the 'ndoms_new'
  * cpumasks in the array doms_new[] of cpumasks. This compares
  * doms_new[] to the current sched domain partitioning, doms_cur[].
  * It destroys each deleted domain and builds each new domain.
  *
  * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
  * The masks don't intersect (don't overlap.) We should setup one
  * sched domain for each mask. CPUs not in any of the cpumasks will
  * not be load balanced. If the same cpumask appears both in the
  * current 'doms_cur' domains and in the new 'doms_new', we can leave
  * it as it is.
  *
  * The passed in 'doms_new' should be allocated using
  * alloc_sched_domains.  This routine takes ownership of it and will
  * free_sched_domains it when done with it. If the caller failed the
  * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
  * and partition_sched_domains() will fallback to the single partition
  * 'fallback_doms', it also forces the domains to be rebuilt.
  *
  * If doms_new == NULL it will be replaced with 'fallback_doms', filled
  * with cpu_active_mask minus cpu_isolated_map.
  * ndoms_new == 0 is a special case for destroying existing domains,
  * and it will not create the default domain.
  *
  * Call with hotplug lock held
  */
 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 			     struct sched_domain_attr *dattr_new)
 {
 	int i, j, n;
 	int new_topology;
 	mutex_lock(&sched_domains_mutex);
 	/* always unregister in case we don't destroy any domains */
 	unregister_sched_domain_sysctl();
 	/* Let architecture update cpu core mappings. */
 	new_topology = arch_update_cpu_topology();
 	/* with no new array nothing can match, so every domain is destroyed */
 	n = doms_new ? ndoms_new : 0;
 	/* Destroy deleted domains */
 	for (i = 0; i < ndoms_cur; i++) {
 		/* a topology change disables matching and forces a rebuild */
 		for (j = 0; j < n && !new_topology; j++) {
 			if (cpumask_equal(doms_cur[i], doms_new[j])
 			    && dattrs_equal(dattr_cur, i, dattr_new, j))
 				goto match1;
 		}
 		/* no match - a current sched domain not in new doms_new[] */
 		detach_destroy_domains(doms_cur[i]);
 match1:
 		;
 	}
 	n = ndoms_cur;
 	if (doms_new == NULL) {
 		/* n = 0: nothing in doms_cur may match, so the fallback is built */
 		n = 0;
 		doms_new = &fallback_doms;
 		cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
 		WARN_ON_ONCE(dattr_new);
 	}
 	/* Build new domains */
 	for (i = 0; i < ndoms_new; i++) {
 		for (j = 0; j < n && !new_topology; j++) {
 			if (cpumask_equal(doms_new[i], doms_cur[j])
 			    && dattrs_equal(dattr_new, i, dattr_cur, j))
 				goto match2;
 		}
 		/* no match - add a new doms_new */
 		build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
 match2:
 		;
 	}
 	/* Remember the new sched domains */
 	/* fallback_doms is static storage and must not be freed */
 	if (doms_cur != &fallback_doms)
 		free_sched_domains(doms_cur, ndoms_cur);
 	kfree(dattr_cur);	/* kfree(NULL) is safe */
 	doms_cur = doms_new;
 	dattr_cur = dattr_new;
 	ndoms_cur = ndoms_new;
 	register_sched_domain_sysctl();
 	mutex_unlock(&sched_domains_mutex);
 }
 static int num_cpus_frozen;	/* used to mark begin/end of suspend/resume */
 /*
  * Update cpusets according to cpu_active mask.  If cpusets are
  * disabled, cpuset_update_active_cpus() becomes a simple wrapper
  * around partition_sched_domains().
  *
  * If we come here as part of a suspend/resume, don't touch cpusets because we
  * want to restore it back to its original state upon resume anyway.
  *
  * Returns NOTIFY_OK for the actions handled below and NOTIFY_DONE for
  * every other action.
  */
 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
 			     void *hcpu)
 {
 	switch (action) {
 	case CPU_ONLINE_FROZEN:
 	case CPU_DOWN_FAILED_FROZEN:
 		/*
 		 * num_cpus_frozen tracks how many CPUs are involved in suspend
 		 * resume sequence. As long as this is not the last online
 		 * operation in the resume sequence, just build a single sched
 		 * domain, ignoring cpusets.
 		 */
 		num_cpus_frozen--;
 		if (likely(num_cpus_frozen)) {
 			partition_sched_domains(1, NULL, NULL);
 			break;
 		}
 		/*
 		 * This is the last CPU online operation. So fall through and
 		 * restore the original sched domains by considering the
 		 * cpuset configurations.
 		 */
 	case CPU_ONLINE:
 	case CPU_DOWN_FAILED:
 		cpuset_update_active_cpus(true);
 		break;
 	default:
 		return NOTIFY_DONE;
 	}
 	return NOTIFY_OK;
 }
 /*
  * Hotplug notifier for a CPU that is about to go down. A regular
  * CPU_DOWN_PREPARE updates the cpusets; the _FROZEN (suspend) variant
  * bumps num_cpus_frozen and falls back to a single sched domain.
  * Any other action is ignored with NOTIFY_DONE.
  */
 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
 			       void *hcpu)
 {
 	if (action == CPU_DOWN_PREPARE) {
 		cpuset_update_active_cpus(false);
 	} else if (action == CPU_DOWN_PREPARE_FROZEN) {
 		num_cpus_frozen++;
 		partition_sched_domains(1, NULL, NULL);
 	} else {
 		return NOTIFY_DONE;
 	}
 	return NOTIFY_OK;
 }
 /*
  * SMP part of scheduler bring-up: build the initial sched domains,
  * register the hotplug notifiers, move the calling (init) task onto a
  * non-isolated CPU and initialise the rt and deadline classes.
  */
 void __init sched_init_smp(void)
 {
 	cpumask_var_t non_isolated_cpus;
 	/* NOTE(review): the results of these two allocations are not checked */
 	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
 	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 	sched_init_numa();
 	/*
 	 * There's no userspace yet to cause hotplug operations; hence all the
 	 * cpu masks are stable and all blatant races in the below code cannot
 	 * happen.
 	 */
 	mutex_lock(&sched_domains_mutex);
 	init_sched_domains(cpu_active_mask);
 	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
 	/* if every possible CPU is isolated, keep at least the current one */
 	if (cpumask_empty(non_isolated_cpus))
 		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
 	mutex_unlock(&sched_domains_mutex);
 	hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
 	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
 	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
 	init_hrtick();
 	/* Move init over to a non-isolated CPU */
 	if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
 		BUG();
 	sched_init_granularity();
 	free_cpumask_var(non_isolated_cpus);
 	init_sched_rt_class();
 	init_sched_dl_class();
 }
 #else
 /* !CONFIG_SMP variant: only the granularity setup is performed. */
 void __init sched_init_smp(void)
 {
 	sched_init_granularity();
 }
 #endif /* CONFIG_SMP */
 const_debug unsigned int sysctl_timer_migration = 1;
 /*
  * Return 1 if @addr lies in a lock function or inside the
  * [__sched_text_start, __sched_text_end) range, 0 otherwise.
  */
 int in_sched_functions(unsigned long addr)
 {
 	unsigned long start = (unsigned long)__sched_text_start;
 	unsigned long end = (unsigned long)__sched_text_end;
 	if (in_lock_functions(addr))
 		return 1;
 	return addr >= start && addr < end;
 }
 #ifdef CONFIG_CGROUP_SCHED
 /*
  * Default task group.
  * Every task in system belongs to this group at bootup.
  */
 struct task_group root_task_group;
 LIST_HEAD(task_groups);
 #endif
 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
 /*
  * Early scheduler initialisation: carve the root task group's per-cpu
  * pointer arrays (and, with CONFIG_CPUMASK_OFFSTACK, the load_balance_mask
  * cpumasks) out of one allocation, set up the default rt/dl bandwidth,
  * initialise every possible CPU's runqueue, turn the calling task into
  * the idle thread of this CPU and finally set scheduler_running.
  */
 void __init sched_init(void)
 {
 	int i, j;
 	unsigned long alloc_size = 0, ptr;
 	/* size one block holding all the arrays selected by the config */
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
 	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
 #endif
 #ifdef CONFIG_CPUMASK_OFFSTACK
 	alloc_size += num_possible_cpus() * cpumask_size();
 #endif
 	if (alloc_size) {
 		/* NOTE(review): the kzalloc() result is not checked before use */
 		ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
 		/* hand out consecutive slices of the block, advancing ptr */
 #ifdef CONFIG_FAIR_GROUP_SCHED
 		root_task_group.se = (struct sched_entity **)ptr;
 		ptr += nr_cpu_ids * sizeof(void **);
 		root_task_group.cfs_rq = (struct cfs_rq **)ptr;
 		ptr += nr_cpu_ids * sizeof(void **);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 #ifdef CONFIG_RT_GROUP_SCHED
 		root_task_group.rt_se = (struct sched_rt_entity **)ptr;
 		ptr += nr_cpu_ids * sizeof(void **);
 		root_task_group.rt_rq = (struct rt_rq **)ptr;
 		ptr += nr_cpu_ids * sizeof(void **);
 #endif /* CONFIG_RT_GROUP_SCHED */
 #ifdef CONFIG_CPUMASK_OFFSTACK
 		for_each_possible_cpu(i) {
 			per_cpu(load_balance_mask, i) = (void *)ptr;
 			ptr += cpumask_size();
 		}
 #endif /* CONFIG_CPUMASK_OFFSTACK */
 	}
 	/* the deadline bandwidth is seeded from the global rt period/runtime */
 	init_rt_bandwidth(&def_rt_bandwidth,
 			global_rt_period(), global_rt_runtime());
 	init_dl_bandwidth(&def_dl_bandwidth,
 			global_rt_period(), global_rt_runtime());
 #ifdef CONFIG_SMP
 	init_defrootdomain();
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
 	init_rt_bandwidth(&root_task_group.rt_bandwidth,
 			global_rt_period(), global_rt_runtime());
 #endif /* CONFIG_RT_GROUP_SCHED */
 #ifdef CONFIG_CGROUP_SCHED
 	list_add(&root_task_group.list, &task_groups);
 	INIT_LIST_HEAD(&root_task_group.children);
 	INIT_LIST_HEAD(&root_task_group.siblings);
 	autogroup_init(&init_task);
 #endif /* CONFIG_CGROUP_SCHED */
 	/* per-runqueue initialisation for every possible CPU */
 	for_each_possible_cpu(i) {
 		struct rq *rq;
 		rq = cpu_rq(i);
 		raw_spin_lock_init(&rq->lock);
 		rq->nr_running = 0;
 		rq->calc_load_active = 0;
 		rq->calc_load_update = jiffies + LOAD_FREQ;
 		init_cfs_rq(&rq->cfs);
 		init_rt_rq(&rq->rt, rq);
 		init_dl_rq(&rq->dl, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 		root_task_group.shares = ROOT_TASK_GROUP_LOAD;
 		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
 		/*
 		 * How much cpu bandwidth does root_task_group get?
 		 *
 		 * In case of task-groups formed thr' the cgroup filesystem, it
 		 * gets 100% of the cpu resources in the system. This overall
 		 * system cpu resource is divided among the tasks of
 		 * root_task_group and its child task-groups in a fair manner,
 		 * based on each entity's (task or task-group's) weight
 		 * (se->load.weight).
 		 *
 		 * In other words, if root_task_group has 10 tasks (of weight
 		 * 1024 each) and two child groups A0 and A1 (of weight 1024 each),
 		 * then A0's share of the cpu resource is:
 		 *
 		 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
 		 *
 		 * We achieve this by letting root_task_group's tasks sit
 		 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
 		 */
 		init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
 		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
 #ifdef CONFIG_RT_GROUP_SCHED
 		/* the per-cpu top-level rt_rq is registered with the root group */
 		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
 #endif
 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
 			rq->cpu_load[j] = 0;
 		rq->last_load_update_tick = jiffies;
 #ifdef CONFIG_SMP
 		rq->sd = NULL;
 		rq->rd = NULL;
 		rq->cpu_power = SCHED_POWER_SCALE;
 		rq->post_schedule = 0;
 		rq->active_balance = 0;
 		rq->next_balance = jiffies;
 		rq->push_cpu = 0;
 		rq->cpu = i;
 		rq->online = 0;
 		rq->idle_stamp = 0;
 		rq->avg_idle = 2*sysctl_sched_migration_cost;
 		rq->max_idle_balance_cost = sysctl_sched_migration_cost;
 		INIT_LIST_HEAD(&rq->cfs_tasks);
 		rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ_COMMON
 		rq->nohz_flags = 0;
 #endif
 #ifdef CONFIG_NO_HZ_FULL
 		rq->last_sched_tick = 0;
 #endif
 #endif
 		init_rq_hrtick(rq);
 		atomic_set(&rq->nr_iowait, 0);
 	}
 	set_load_weight(&init_task);
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	INIT_HLIST_HEAD(&init_task.preempt_notifiers);
 #endif
 	/*
 	 * The boot idle thread does lazy MMU switching as well:
 	 */
 	atomic_inc(&init_mm.mm_count);
 	enter_lazy_tlb(&init_mm, current);
 	/*
 	 * Make us the idle thread. Technically, schedule() should not be
 	 * called from this thread, however somewhere below it might be,
 	 * but because we are the idle thread, we just pick up running again
 	 * when this runqueue becomes "idle".
 	 */
 	init_idle(current, smp_processor_id());
 	calc_load_update = jiffies + LOAD_FREQ;
 	/*
 	 * During early bootup we pretend to be a normal task:
 	 */
 	current->sched_class = &fair_sched_class;
 #ifdef CONFIG_SMP
 	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
 	/* May be allocated at isolcpus cmdline parse time */
 	if (cpu_isolated_map == NULL)
 		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 	idle_thread_set_boot_cpu();
 #endif
 	init_sched_fair_class();
 	scheduler_running = 1;
 }
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 /*
  * Return non-zero when the current preempt count (ignoring PREEMPT_ACTIVE)
  * plus the RCU preempt depth equals @preempt_offset.
  */
 static inline int preempt_count_equals(int preempt_offset)
 {
 	int depth = preempt_count() & ~PREEMPT_ACTIVE;
 	depth += rcu_preempt_depth();
 	return depth == preempt_offset;
 }
 /*
  * Debug check for sleeping in atomic context.
  * @file, @line: call site, printed in the report.
  * @preempt_offset: preempt depth the caller considers legitimate.
  *
  * Prints a "BUG: sleeping function called from invalid context" report
  * with a stack dump, at most once per HZ jiffies.
  */
 void __might_sleep(const char *file, int line, int preempt_offset)
 {
 	static unsigned long prev_jiffy;	/* ratelimiting */
 	rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
 	/*
 	 * Stay quiet when the context is fine (expected preempt depth, irqs
 	 * enabled, not the idle task), when the system is not in the
 	 * SYSTEM_RUNNING state, or while an oops is in progress.
 	 */
 	if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
 	     !is_idle_task(current)) ||
 	    system_state != SYSTEM_RUNNING || oops_in_progress)
 		return;
 	/* prev_jiffy == 0 means no report has been printed yet */
 	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
 		return;
 	prev_jiffy = jiffies;
 	printk(KERN_ERR
 		"BUG: sleeping function called from invalid context at %s:%d\n",
 			file, line);
 	printk(KERN_ERR
 		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
 			in_atomic(), irqs_disabled(),
 			current->pid, current->comm);
 	debug_show_held_locks(current);
 	if (irqs_disabled())
 		print_irqtrace_events(current);
 #ifdef CONFIG_DEBUG_PREEMPT
 	if (!preempt_count_equals(preempt_offset)) {
 		pr_err("Preemption disabled at:");
 		print_ip_sym(current->preempt_disable_ip);
 		pr_cont("\n");
 	}
 #endif
 	dump_stack();
 }
 EXPORT_SYMBOL(__might_sleep);
 #endif
 #ifdef CONFIG_MAGIC_SYSRQ
 /*
  * Switch @p to SCHED_NORMAL on runqueue @rq. A queued task is dequeued
  * around the policy change and re-enqueued afterwards, followed by a
  * reschedule request for the runqueue's current task.
  */
 static void normalize_task(struct rq *rq, struct task_struct *p)
 {
 	/* remember the old class and priority for check_class_changed() */
 	const struct sched_class *prev_class = p->sched_class;
 	struct sched_attr attr = {
 		.sched_policy = SCHED_NORMAL,
 	};
 	int old_prio = p->prio;
 	int on_rq;
 	on_rq = p->on_rq;
 	if (on_rq)
 		dequeue_task(rq, p, 0);
 	__setscheduler(rq, p, &attr);
 	if (on_rq) {
 		enqueue_task(rq, p, 0);
 		resched_task(rq->curr);
 	}
 	check_class_changed(rq, p, prev_class, old_prio);
 }
 /*
  * Walk every thread in the system under tasklist_lock. User tasks that
  * are realtime or deadline are switched to SCHED_NORMAL; other user tasks
  * with a negative nice value are reniced to 0. Kernel threads (no ->mm)
  * are left alone.
  */
 void normalize_rt_tasks(void)
 {
 	struct task_struct *g, *p;
 	unsigned long flags;
 	struct rq *rq;
 	read_lock_irqsave(&tasklist_lock, flags);
 	do_each_thread(g, p) {
 		/*
 		 * Only normalize user tasks:
 		 */
 		if (!p->mm)
 			continue;
 		/* reset the scheduling timestamps of the task */
 		p->se.exec_start		= 0;
 #ifdef CONFIG_SCHEDSTATS
 		p->se.statistics.wait_start	= 0;
 		p->se.statistics.sleep_start	= 0;
 		p->se.statistics.block_start	= 0;
 #endif
 		if (!dl_task(p) && !rt_task(p)) {
 			/*
 			 * Renice negative nice level userspace
 			 * tasks back to 0:
 			 */
 			if (task_nice(p) < 0 && p->mm)
 				set_user_nice(p, 0);
 			continue;
 		}
 		/* pi_lock is taken first, then the task's runqueue lock */
 		raw_spin_lock(&p->pi_lock);
 		rq = __task_rq_lock(p);
 		normalize_task(rq, p);
 		__task_rq_unlock(rq);
 		raw_spin_unlock(&p->pi_lock);
 	} while_each_thread(g, p);
 	read_unlock_irqrestore(&tasklist_lock, flags);
 }
 #endif /* CONFIG_MAGIC_SYSRQ */
 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
 /*
  * These functions are only useful for the IA64 MCA handling, or kdb.
  *
  * They can only be called when the whole system has been
  * stopped - every CPU needs to be quiescent, and no scheduling
  * activity can take place. Using them for anything else would
  * be a serious bug, and as a result, they aren't even visible
  * under any other configuration.
  */
 /**
  * curr_task - return the current task for a given cpu.
  * @cpu: the processor in question.
  *
  * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
  *
  * This is a plain read of cpu_curr(@cpu); no locking is performed here.
  *
  * Return: The current task for @cpu.
  */
 struct task_struct *curr_task(int cpu)
 {
 	return cpu_curr(cpu);
 }
 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
 #ifdef CONFIG_IA64
 /**
  * set_curr_task - set the current task for a given cpu.
  * @cpu: the processor in question.
  * @p: the task pointer to set.
  *
  * Description: This function must only be used when non-maskable interrupts
  * are serviced on a separate stack. It allows the architecture to switch the
  * notion of the current task on a cpu in a non-blocking manner. This function
  * must be called with all CPU's synchronized, and interrupts disabled, and
  * the caller must save the original value of the current task (see
  * curr_task() above) and restore that value before reenabling interrupts and
  * re-starting the system.
  *
  * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
  */
 void set_curr_task(int cpu, struct task_struct *p)
 {
 	cpu_curr(cpu) = p;
 }
 #endif
 #ifdef CONFIG_CGROUP_SCHED
 /* task_group_lock serializes the addition/removal of task groups */
 static DEFINE_SPINLOCK(task_group_lock);
 /*
  * Release the fair, rt and autogroup parts of @tg, then @tg itself.
  * @tg must not be used after this returns.
  */
 static void free_sched_group(struct task_group *tg)
 {
 	free_fair_sched_group(tg);
 	free_rt_sched_group(tg);
 	autogroup_free(tg);
 	kfree(tg);
 }
 /*
  * Allocate a task group below @parent together with its fair and rt
  * scheduling structures.
  * Returns the new group or ERR_PTR(-ENOMEM); a partially constructed
  * group is released before the error is returned.
  */
 struct task_group *sched_create_group(struct task_group *parent)
 {
 	struct task_group *tg = kzalloc(sizeof(*tg), GFP_KERNEL);
 	if (!tg)
 		return ERR_PTR(-ENOMEM);
 	/* the rt part is only attempted once the fair part succeeded */
 	if (alloc_fair_sched_group(tg, parent) &&
 	    alloc_rt_sched_group(tg, parent))
 		return tg;
 	free_sched_group(tg);
 	return ERR_PTR(-ENOMEM);
 }
 /*
  * Publish @tg: add it to the global task_groups list and to @parent's
  * children list, both under task_group_lock.
  */
 void sched_online_group(struct task_group *tg, struct task_group *parent)
 {
 	unsigned long flags;
 	spin_lock_irqsave(&task_group_lock, flags);
 	list_add_rcu(&tg->list, &task_groups);
 	/* NOTE(review): parent is dereferenced below even if this warns */
 	WARN_ON(!parent); /* root should already exist */
 	tg->parent = parent;
 	INIT_LIST_HEAD(&tg->children);
 	list_add_rcu(&tg->siblings, &parent->children);
 	spin_unlock_irqrestore(&task_group_lock, flags);
 }
 /*
  * RCU callback: recover the task group embedding @rhp and free it along
  * with its associated structures (including the cfs_rqs).
  */
 static void free_sched_group_rcu(struct rcu_head *rhp)
 {
 	struct task_group *tg = container_of(rhp, struct task_group, rcu);
 	free_sched_group(tg);
 }
 /*
  * Destroy runqueue etc associated with a task group.
  * The actual freeing is deferred to free_sched_group_rcu() via call_rcu().
  */
 void sched_destroy_group(struct task_group *tg)
 {
 	/* wait for possible concurrent references to cfs_rqs complete */
 	call_rcu(&tg->rcu, free_sched_group_rcu);
 }
 void sched_offline_group(struct task_group *tg)
 {
 	unsigned long flags;
 	int i;
 	/* end participation in shares distribution */
 	for_each_possible_cpu(i)
 		unregister_fair_sched_group(tg, i);
 	spin_lock_irqsave(&task_group_lock, flags);
 	list_del_rcu(&tg->list);
 	list_del_rcu(&tg->siblings);
 	spin_unlock_irqrestore(&task_group_lock, flags);
 }
 /* change task's runqueue when it moves between groups.
  *	The caller of this function should have put the task in its new group
  *	by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
  *	reflect its new group.
  *
  *	The task is dequeued (and put as prev if it is running) before the
  *	switch and restored afterwards, all under the task's runqueue lock.
  */
 void sched_move_task(struct task_struct *tsk)
 {
 	struct task_group *tg;
 	int on_rq, running;
 	unsigned long flags;
 	struct rq *rq;
 	rq = task_rq_lock(tsk, &flags);
 	running = task_current(rq, tsk);
 	on_rq = tsk->on_rq;
 	if (on_rq)
 		dequeue_task(rq, tsk, 0);
 	if (unlikely(running))
 		tsk->sched_class->put_prev_task(rq, tsk);
 	/* look up the group from the task's cpu cgroup css */
 	tg = container_of(task_css_check(tsk, cpu_cgrp_id,
 				lockdep_is_held(&tsk->sighand->siglock)),
 			  struct task_group, css);
 	tg = autogroup_task_group(tsk, tg);
 	tsk->sched_task_group = tg;
 	/*
 	 * With CONFIG_FAIR_GROUP_SCHED the set_task_rq() call below is the
 	 * else-branch of the task_move_group test; without it, it always runs.
 	 */
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	if (tsk->sched_class->task_move_group)
 		tsk->sched_class->task_move_group(tsk, on_rq);
 	else
 #endif
 		set_task_rq(tsk, task_cpu(tsk));
 	if (unlikely(running))
 		tsk->sched_class->set_curr_task(rq);
 	if (on_rq)
 		enqueue_task(rq, tsk, 0);
 	task_rq_unlock(rq, tsk, &flags);
 }
 #endif /* CONFIG_CGROUP_SCHED */
 #ifdef CONFIG_RT_GROUP_SCHED
 /*
  * Ensure that the real time constraints are schedulable.
  */
 static DEFINE_MUTEX(rt_constraints_mutex);
 /*
  * Return 1 if at least one realtime task currently belongs to @tg,
  * 0 otherwise.
  *
  * Must be called with tasklist_lock held
  */
 static inline int tg_has_rt_tasks(struct task_group *tg)
 {
 	struct task_struct *g, *p;
 	do_each_thread(g, p) {
 		/*
 		 * Compare the task's own group. task_rq(p)->rt is the per-cpu
 		 * top-level rt_rq, which sched_init() registers with
 		 * root_task_group, so testing its ->tg could only ever match
 		 * the root group.
 		 */
 		if (rt_task(p) && task_group(p) == tg)
 			return 1;
 	} while_each_thread(g, p);
 	return 0;
 }
 /*
  * Argument block for the tg_rt_schedulable() tree walk: a proposed
  * period/runtime pair that is substituted for @tg's current settings
  * while the walk checks schedulability.
  */
 struct rt_schedulable_data {
 	struct task_group *tg;	/* group being changed; NULL substitutes nothing */
 	u64 rt_period;		/* proposed period */
 	u64 rt_runtime;		/* proposed runtime, may be RUNTIME_INF */
 };
 /*
  * Tree-walk callback: check that @tg's rt bandwidth is valid.
  * @data points to a struct rt_schedulable_data; when its ->tg is @tg or
  * one of @tg's children, the proposed period/runtime replace that group's
  * current values.
  *
  * Returns 0 when the constraints hold, -EINVAL when runtime exceeds the
  * period, the global ratio or what the children need, and -EBUSY when a
  * zero runtime would starve existing rt tasks.
  */
 static int tg_rt_schedulable(struct task_group *tg, void *data)
 {
 	struct rt_schedulable_data *d = data;
 	struct task_group *child;
 	unsigned long total, sum = 0;
 	u64 period, runtime;
 	period = ktime_to_ns(tg->rt_bandwidth.rt_period);
 	runtime = tg->rt_bandwidth.rt_runtime;
 	if (tg == d->tg) {
 		period = d->rt_period;
 		runtime = d->rt_runtime;
 	}
 	/*
 	 * Cannot have more runtime than the period.
 	 */
 	if (runtime > period && runtime != RUNTIME_INF)
 		return -EINVAL;
 	/*
 	 * Ensure we don't starve existing RT tasks.
 	 */
 	if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
 		return -EBUSY;
 	total = to_ratio(period, runtime);
 	/*
 	 * Nobody can have more than the global setting allows.
 	 */
 	if (total > to_ratio(global_rt_period(), global_rt_runtime()))
 		return -EINVAL;
 	/*
 	 * The sum of our children's runtime should not exceed our own.
 	 */
 	list_for_each_entry_rcu(child, &tg->children, siblings) {
 		period = ktime_to_ns(child->rt_bandwidth.rt_period);
 		runtime = child->rt_bandwidth.rt_runtime;
 		if (child == d->tg) {
 			period = d->rt_period;
 			runtime = d->rt_runtime;
 		}
 		sum += to_ratio(period, runtime);
 	}
 	if (sum > total)
 		return -EINVAL;
 	return 0;
 }
 /*
  * Walk the task group tree under rcu_read_lock() and verify that the rt
  * constraints hold if @tg used @period / @runtime.
  * Returns the result of the tree walk.
  */
 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 {
 	struct rt_schedulable_data proposal = {
 		.tg		= tg,
 		.rt_period	= period,
 		.rt_runtime	= runtime,
 	};
 	int err;
 	rcu_read_lock();
 	err = walk_tg_tree(tg_rt_schedulable, tg_nop, &proposal);
 	rcu_read_unlock();
 	return err;
 }
 /*
  * Validate and then apply a new rt period/runtime (in ns) to @tg and to
  * each of its per-cpu rt_rqs.
  * Returns 0 on success or the error from __rt_schedulable(), in which
  * case nothing is changed.
  */
 static int tg_set_rt_bandwidth(struct task_group *tg,
 		u64 rt_period, u64 rt_runtime)
 {
 	int i, err = 0;
 	mutex_lock(&rt_constraints_mutex);
 	read_lock(&tasklist_lock);
 	err = __rt_schedulable(tg, rt_period, rt_runtime);
 	if (err)
 		goto unlock;
 	/* the group lock is the outer lock, each rt_rq lock nests inside */
 	raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
 	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
 	tg->rt_bandwidth.rt_runtime = rt_runtime;
 	for_each_possible_cpu(i) {
 		struct rt_rq *rt_rq = tg->rt_rq[i];
 		raw_spin_lock(&rt_rq->rt_runtime_lock);
 		rt_rq->rt_runtime = rt_runtime;
 		raw_spin_unlock(&rt_rq->rt_runtime_lock);
 	}
 	raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
 unlock:
 	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
 	return err;
 }
 /*
  * Set @tg's rt runtime from a value in microseconds, keeping the current
  * period. A negative @rt_runtime_us selects RUNTIME_INF.
  */
 static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
 {
 	u64 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
 	u64 rt_runtime;
 	if (rt_runtime_us < 0)
 		rt_runtime = RUNTIME_INF;
 	else
 		rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
 	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
 }
 /*
  * Return @tg's rt runtime in microseconds, or -1 when it is RUNTIME_INF.
  */
 static long sched_group_rt_runtime(struct task_group *tg)
 {
 	u64 usecs;
 	if (tg->rt_bandwidth.rt_runtime != RUNTIME_INF) {
 		usecs = tg->rt_bandwidth.rt_runtime;
 		/* do_div() divides in place */
 		do_div(usecs, NSEC_PER_USEC);
 		return usecs;
 	}
 	return -1;
 }
 static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
 {
 	u64 rt_runtime, rt_period;
 	rt_period = (u64)rt_period_us * NSEC_PER_USEC;
 	rt_runtime = tg->rt_bandwidth.rt_runtime;
 	if (rt_period == 0)
 		return -EINVAL;
 	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
 }
 /*
  * Return @tg's rt period converted from nanoseconds to microseconds.
  */
 static long sched_group_rt_period(struct task_group *tg)
 {
 	u64 usecs = ktime_to_ns(tg->rt_bandwidth.rt_period);
 	/* do_div() divides in place */
 	do_div(usecs, NSEC_PER_USEC);
 	return usecs;
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 #ifdef CONFIG_RT_GROUP_SCHED
 /*
  * Re-check the whole task group tree against the global rt settings.
  * No group gets a substituted value (NULL group is passed).
  * Returns the result of __rt_schedulable().
  */
 static int sched_rt_global_constraints(void)
 {
 	int err;
 	mutex_lock(&rt_constraints_mutex);
 	read_lock(&tasklist_lock);
 	err = __rt_schedulable(NULL, 0, 0);
 	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
 	return err;
 }
 /*
  * Return 0 if @tsk is a realtime task and @tg has zero rt runtime (the
  * task would have no way to run there), 1 otherwise.
  */
 static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
 {
 	return !(rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0);
 }
 #else /* !CONFIG_RT_GROUP_SCHED */
 /*
  * !CONFIG_RT_GROUP_SCHED variant: copy the global rt runtime into every
  * possible CPU's rt_rq. Always returns 0.
  */
 static int sched_rt_global_constraints(void)
 {
 	unsigned long flags;
 	int cpu;
 	raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
 	for_each_possible_cpu(cpu) {
 		struct rt_rq *rt_rq = &cpu_rq(cpu)->rt;
 		/* each rt_rq lock nests inside the default bandwidth lock */
 		raw_spin_lock(&rt_rq->rt_runtime_lock);
 		rt_rq->rt_runtime = global_rt_runtime();
 		raw_spin_unlock(&rt_rq->rt_runtime_lock);
 	}
 	raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
 	return 0;
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 /*
  * Check that the global rt period/runtime still cover the deadline
  * bandwidth already allocated on every CPU.
  * Returns 0 if so, -EBUSY if any CPU's total_bw exceeds the new ratio.
  */
 static int sched_dl_global_constraints(void)
 {
 	u64 runtime = global_rt_runtime();
 	u64 period = global_rt_period();
 	u64 new_bw = to_ratio(period, runtime);
 	struct dl_bw *dl_b;
 	int cpu, ret = 0;
 	unsigned long flags;
 	/*
 	 * Here we want to check the bandwidth not being set to some
 	 * value smaller than the currently allocated bandwidth in
 	 * any of the root_domains.
 	 *
 	 * FIXME: Cycling on all the CPUs is overdoing, but simpler than
 	 * cycling on root_domains... Discussion on different/better
 	 * solutions is welcome!
 	 */
 	for_each_possible_cpu(cpu) {
 		/*
 		 * NOTE(review): dl_bw_of() is expected to reach the dl_bw
 		 * through the CPU's root domain, which is protected by
 		 * RCU-sched; hold a read-side section for as long as the
 		 * returned pointer is used.
 		 */
 		rcu_read_lock_sched();
 		dl_b = dl_bw_of(cpu);
 		raw_spin_lock_irqsave(&dl_b->lock, flags);
 		if (new_bw < dl_b->total_bw)
 			ret = -EBUSY;
 		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
 		rcu_read_unlock_sched();
 		if (ret)
 			break;
 	}
 	return ret;
 }
 /*
  * Apply the global rt period/runtime to the default deadline bandwidth
  * and to the dl_bw of every possible CPU. An infinite runtime is stored
  * as a bandwidth of -1.
  */
 static void sched_dl_do_global(void)
 {
 	u64 new_bw = -1;
 	struct dl_bw *dl_b;
 	int cpu;
 	unsigned long flags;
 	def_dl_bandwidth.dl_period = global_rt_period();
 	def_dl_bandwidth.dl_runtime = global_rt_runtime();
 	if (global_rt_runtime() != RUNTIME_INF)
 		new_bw = to_ratio(global_rt_period(), global_rt_runtime());
 	/*
 	 * FIXME: Cycling on all the CPUs is overdoing, but simpler than
 	 * cycling on root_domains.
 	 */
 	for_each_possible_cpu(cpu) {
 		/*
 		 * NOTE(review): dl_bw_of() is expected to reach the dl_bw
 		 * through the CPU's root domain, which is protected by
 		 * RCU-sched; hold a read-side section for as long as the
 		 * returned pointer is used.
 		 */
 		rcu_read_lock_sched();
 		dl_b = dl_bw_of(cpu);
 		raw_spin_lock_irqsave(&dl_b->lock, flags);
 		dl_b->bw = new_bw;
 		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
 		rcu_read_unlock_sched();
 	}
 }
 /*
  * Sanity-check the rt sysctl values: the period must be positive and a
  * finite runtime must not exceed the period.
  * Returns 0 when valid, -EINVAL otherwise.
  */
 static int sched_rt_global_validate(void)
 {
 	if (sysctl_sched_rt_period <= 0)
 		return -EINVAL;
 	/* an infinite runtime is always acceptable */
 	if (sysctl_sched_rt_runtime == RUNTIME_INF)
 		return 0;
 	if (sysctl_sched_rt_runtime > sysctl_sched_rt_period)
 		return -EINVAL;
 	return 0;
 }
 /*
  * Copy the global rt runtime and period into def_rt_bandwidth; the
  * period is stored as a ktime.
  */
 static void sched_rt_do_global(void)
 {
 	def_rt_bandwidth.rt_runtime = global_rt_runtime();
 	def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
 }
 /*
  * sysctl handler for the rt period/runtime knobs. On a successful write
  * the new values are validated against the rt and deadline constraints;
  * if any check fails, both sysctl values are restored and the error is
  * returned. Otherwise the new values are applied globally.
  */
 int sched_rt_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
 	static DEFINE_MUTEX(mutex);
 	int saved_period, saved_runtime;
 	int ret;
 	mutex_lock(&mutex);
 	/* snapshot the current values so a rejected write can be undone */
 	saved_period = sysctl_sched_rt_period;
 	saved_runtime = sysctl_sched_rt_runtime;
 	ret = proc_dointvec(table, write, buffer, lenp, ppos);
 	if (!ret && write) {
 		ret = sched_rt_global_validate();
 		if (!ret)
 			ret = sched_rt_global_constraints();
 		if (!ret)
 			ret = sched_dl_global_constraints();
 		if (ret) {
 			sysctl_sched_rt_period = saved_period;
 			sysctl_sched_rt_runtime = saved_runtime;
 		} else {
 			sched_rt_do_global();
 			sched_dl_do_global();
 		}
 	}
 	mutex_unlock(&mutex);
 	return ret;
 }
 /*
  * sysctl handler for the round-robin timeslice. The value is written in
  * milliseconds and kept internally in jiffies; writing zero or a
  * negative value resets it to RR_TIMESLICE.
  */
 int sched_rr_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
 	static DEFINE_MUTEX(mutex);
 	int ret;
 	mutex_lock(&mutex);
 	ret = proc_dointvec(table, write, buffer, lenp, ppos);
 	if (!ret && write) {
 		if (sched_rr_timeslice <= 0)
 			sched_rr_timeslice = RR_TIMESLICE;
 		else
 			sched_rr_timeslice = msecs_to_jiffies(sched_rr_timeslice);
 	}
 	mutex_unlock(&mutex);
 	return ret;
 }
 #ifdef CONFIG_CGROUP_SCHED
 /* Map a cgroup subsystem state to its enclosing task_group; NULL stays NULL. */
 static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
 {
 	if (!css)
 		return NULL;
 	return container_of(css, struct task_group, css);
 }
 static struct cgroup_subsys_state *
 cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
 	struct task_group *parent = css_tg(parent_css);
 	struct task_group *tg;
 	if (!parent) {
 		/* This is early initialization for the top cgroup */
 		return &root_task_group.css;
 	}
 	tg = sched_create_group(parent);
 	if (IS_ERR(tg))
 		return ERR_PTR(-ENOMEM);
 	return &tg->css;
 }
 /*
  * cgroup css_online callback: bring the task group online under its
  * parent. A css without a parent needs no work. Always returns 0.
  */
 static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
 {
 	struct task_group *parent = css_tg(css_parent(css));
 	if (!parent)
 		return 0;
 	sched_online_group(css_tg(css), parent);
 	return 0;
 }
 /* cgroup css_free callback: destroy the task group behind @css. */
 static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
 {
 	sched_destroy_group(css_tg(css));
 }
 /* cgroup css_offline callback: take the task group behind @css offline. */
 static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
 {
 	sched_offline_group(css_tg(css));
 }
 /*
  * cgroup can_attach callback: decide whether every task in @tset may be
  * moved into the group behind @css.
  * Returns 0 if all tasks are acceptable, -EINVAL at the first one that
  * is not.
  */
 static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
 				 struct cgroup_taskset *tset)
 {
 	struct task_struct *task;
 	cgroup_taskset_for_each(task, tset) {
 #ifdef CONFIG_RT_GROUP_SCHED
 		/* rt tasks are refused by groups that have no rt runtime */
 		if (!sched_rt_can_attach(css_tg(css), task))
 			return -EINVAL;
 #else
 		/* We don't support RT-tasks being in separate groups */
 		if (task->sched_class != &fair_sched_class)
 			return -EINVAL;
 #endif
 	}
 	return 0;
 }
 static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
 			      struct cgroup_taskset *tset)
 {
 	struct task_struct *task;
 	cgroup_taskset_for_each(task, tset)
 		sched_move_task(task);
 }
 /*
  * exit callback: move the task only when PF_EXITING is set.
  *
  * cgroup_exit() is also reached from the copy_process() failure path,
  * where the task never ran; that case is ignored so generic code does
  * not poke at half-freed task state.
  */
 static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
 			    struct cgroup_subsys_state *old_css,
 			    struct task_struct *task)
 {
 	if (task->flags & PF_EXITING)
 		sched_move_task(task);
 }
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /* "shares" write handler: apply scale_load(@shareval) to the group behind @css. */
 static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
 				struct cftype *cftype, u64 shareval)
 {
 	struct task_group *tg = css_tg(css);
 	return sched_group_set_shares(tg, scale_load(shareval));
 }
 /* "shares" read handler: report the group's shares after scale_load_down(). */
 static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
 			       struct cftype *cft)
 {
 	return (u64) scale_load_down(css_tg(css)->shares);
 }
 #ifdef CONFIG_CFS_BANDWIDTH
 /*
  * Taken by tg_set_cfs_bandwidth() around the __cfs_schedulable() check and
  * the bandwidth update that follows it.
  */
 static DEFINE_MUTEX(cfs_constraints_mutex);
 /* Bounds (in nanoseconds) that tg_set_cfs_bandwidth() enforces. */
 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
 const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
 /* Forward declaration: used by tg_set_cfs_bandwidth() before its definition. */
 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota);
 /*
  * Install a new CFS bandwidth @period and @quota (both in nanoseconds) on @tg.
  *
  * Returns -EINVAL for the root task group, for a quota or period below
  * min_cfs_quota_period, and for a period above max_cfs_quota_period.
  * Otherwise returns the result of __cfs_schedulable() if that is non-zero,
  * or 0 once the new values have been written and pushed to every cfs_rq.
  */
 static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 {
 	int i, ret = 0, runtime_enabled, runtime_was_enabled;
 	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
 	if (tg == &root_task_group)
 		return -EINVAL;
 	/*
 	 * Ensure we have at least some amount of bandwidth every period.  This
 	 * is to prevent reaching a state of large arrears when throttled via
 	 * entity_tick() resulting in prolonged exit starvation.
 	 */
 	if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
 		return -EINVAL;
 	/*
 	 * Likewise, bound things on the other side by preventing insane quota
 	 * periods.  This also allows us to normalize in computing quota
 	 * feasibility.
 	 */
 	if (period > max_cfs_quota_period)
 		return -EINVAL;
 	mutex_lock(&cfs_constraints_mutex);
 	ret = __cfs_schedulable(tg, period, quota);
 	if (ret)
 		goto out_unlock;
 	/* RUNTIME_INF as quota means bandwidth control is off for the group */
 	runtime_enabled = quota != RUNTIME_INF;
 	runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
 	/*
 	 * If we need to toggle cfs_bandwidth_used, off->on must occur
 	 * before making related changes, and on->off must occur afterwards
 	 */
 	if (runtime_enabled && !runtime_was_enabled)
 		cfs_bandwidth_usage_inc();
 	/* period, quota and the timer restart are updated under cfs_b->lock */
 	raw_spin_lock_irq(&cfs_b->lock);
 	cfs_b->period = ns_to_ktime(period);
 	cfs_b->quota = quota;
 	__refill_cfs_bandwidth_runtime(cfs_b);
 	/* restart the period timer (if active) to handle new period expiry */
 	if (runtime_enabled && cfs_b->timer_active) {
 		/* force a reprogram */
-		cfs_b->timer_active = 0;
+		__start_cfs_bandwidth(cfs_b, true);
-		__start_cfs_bandwidth(cfs_b);
 	}
 	raw_spin_unlock_irq(&cfs_b->lock);
 	/*
 	 * Propagate to the group's cfs_rq on every possible CPU, each under
 	 * its own rq->lock: record the enable flag, zero the locally held
 	 * runtime, and unthrottle any queue that is currently throttled.
 	 */
 	for_each_possible_cpu(i) {
 		struct cfs_rq *cfs_rq = tg->cfs_rq[i];
 		struct rq *rq = cfs_rq->rq;
 		raw_spin_lock_irq(&rq->lock);
 		cfs_rq->runtime_enabled = runtime_enabled;
 		cfs_rq->runtime_remaining = 0;
 		if (cfs_rq->throttled)
 			unthrottle_cfs_rq(cfs_rq);
 		raw_spin_unlock_irq(&rq->lock);
 	}
 	/* on->off: drop the usage count only after all changes are in place */
 	if (runtime_was_enabled && !runtime_enabled)
 		cfs_bandwidth_usage_dec();
 out_unlock:
 	mutex_unlock(&cfs_constraints_mutex);
 	return ret;
 }
 int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
 {
 	u64 quota, period;
 	period = ktime_to_ns(tg->cfs_bandwidth.period);
 	if (cfs_quota_us < 0)
 		quota = RUNTIME_INF;
 	else
 		quota = (u64)cfs_quota_us * NSEC_PER_USEC;
 	return tg_set_cfs_bandwidth(tg, period, quota);
 }
 long tg_get_cfs_quota(struct task_group *tg)
 {
 	u64 quota_us;
 	if (tg->cfs_bandwidth.quota == RUNTIME_INF)
 		return -1;
 	quota_us = tg->cfs_bandwidth.quota;
 	do_div(quota_us, NSEC_PER_USEC);
 	return quota_us;
 }
 int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
 {
 	u64 quota, period;
 	period = (u64)cfs_period_us * NSEC_PER_USEC;
 	quota = tg->cfs_bandwidth.quota;
 	return tg_set_cfs_bandwidth(tg, period, quota);
 }
 /* Report the CFS period of @tg in microseconds. */
 long tg_get_cfs_period(struct task_group *tg)
 {
 	u64 period = ktime_to_ns(tg->cfs_bandwidth.period);
 	/* do_div() leaves the quotient in its first argument */
 	do_div(period, NSEC_PER_USEC);
 	return period;
 }
 /* "cfs_quota_us" read handler: forwards to tg_get_cfs_quota(). */
 static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
 				  struct cftype *cft)
 {
 	struct task_group *tg = css_tg(css);
 	return tg_get_cfs_quota(tg);
 }
 /* "cfs_quota_us" write handler: forwards to tg_set_cfs_quota(). */
 static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
 				   struct cftype *cftype, s64 cfs_quota_us)
 {
 	struct task_group *tg = css_tg(css);
 	return tg_set_cfs_quota(tg, cfs_quota_us);
 }
 /* "cfs_period_us" read handler: forwards to tg_get_cfs_period(). */
 static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
 				   struct cftype *cft)
 {
 	struct task_group *tg = css_tg(css);
 	return tg_get_cfs_period(tg);
 }
 /* "cfs_period_us" write handler: forwards to tg_set_cfs_period(). */
 static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
 				    struct cftype *cftype, u64 cfs_period_us)
 {
 	struct task_group *tg = css_tg(css);
 	return tg_set_cfs_period(tg, cfs_period_us);
 }
 /*
  * Argument bundle for the __cfs_schedulable() tree walk: the group being
  * changed and the period/quota proposed for it.
  */
 struct cfs_schedulable_data {
 	/* group whose stored values are overridden during the walk */
 	struct task_group *tg;
 	/* proposed period for @tg */
 	u64 period;
 	/* proposed quota for @tg */
 	u64 quota;
 };
 /*
  * Express a group's quota as a ratio of its period via to_ratio().
  * For the group named in @d the proposed values from @d are used; every
  * other group contributes its currently configured period and quota.
  * An unlimited quota yields RUNTIME_INF.
  * note: units are usecs
  */
 static u64 normalize_cfs_quota(struct task_group *tg,
 			       struct cfs_schedulable_data *d)
 {
 	u64 period_us, quota_us;
 	if (tg != d->tg) {
 		period_us = tg_get_cfs_period(tg);
 		quota_us = tg_get_cfs_quota(tg);
 	} else {
 		period_us = d->period;
 		quota_us = d->quota;
 	}
 	/* note: these should typically be equivalent */
 	if (quota_us == RUNTIME_INF || quota_us == -1)
 		return RUNTIME_INF;
 	return to_ratio(period_us, quota_us);
 }
 /*
  * Tree-walk callback: compute and store @tg's hierarchical quota.
  *
  * A group without a parent gets RUNTIME_INF.  A child without its own
  * limit inherits the parent's hierarchical quota; a child whose
  * normalized quota exceeds a limited parent's fails with -EINVAL.
  * Returns 0 after storing the result in cfs_bandwidth.hierarchal_quota.
  */
 static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
 {
 	struct cfs_schedulable_data *d = data;
 	s64 quota;
 	if (tg->parent) {
 		s64 parent_quota;
 		quota = normalize_cfs_quota(tg, d);
 		parent_quota = tg->parent->cfs_bandwidth.hierarchal_quota;
 		/*
 		 * ensure max(child_quota) <= parent_quota, inherit when no
 		 * limit is set
 		 */
 		if (quota == RUNTIME_INF)
 			quota = parent_quota;
 		else if (parent_quota != RUNTIME_INF && quota > parent_quota)
 			return -EINVAL;
 	} else {
 		quota = RUNTIME_INF;
 	}
 	tg->cfs_bandwidth.hierarchal_quota = quota;
 	return 0;
 }
 /*
  * Check whether giving @tg the proposed @period and @quota keeps the
  * hierarchy consistent, by walking the task-group tree with
  * tg_cfs_schedulable_down() under rcu_read_lock().  Both values are
  * divided by NSEC_PER_USEC first unless the quota is RUNTIME_INF.
  * Returns the result of walk_tg_tree().
  */
 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
 {
 	struct cfs_schedulable_data walk_data;
 	int err;
 	walk_data.tg = tg;
 	walk_data.period = period;
 	walk_data.quota = quota;
 	if (quota != RUNTIME_INF) {
 		do_div(walk_data.period, NSEC_PER_USEC);
 		do_div(walk_data.quota, NSEC_PER_USEC);
 	}
 	rcu_read_lock();
 	err = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &walk_data);
 	rcu_read_unlock();
 	return err;
 }
 static int cpu_stats_show(struct seq_file *sf, void *v)
 {
 	struct task_group *tg = css_tg(seq_css(sf));
 	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
 	seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
 	seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
 	seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
 	return 0;
 }
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 #ifdef CONFIG_RT_GROUP_SCHED
 /* "rt_runtime_us" write handler: forwards to sched_group_set_rt_runtime(). */
 static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
 				struct cftype *cft, s64 val)
 {
 	struct task_group *tg = css_tg(css);
 	return sched_group_set_rt_runtime(tg, val);
 }
 /* "rt_runtime_us" read handler: forwards to sched_group_rt_runtime(). */
 static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
 			       struct cftype *cft)
 {
 	struct task_group *tg = css_tg(css);
 	return sched_group_rt_runtime(tg);
 }
 /* "rt_period_us" write handler: forwards to sched_group_set_rt_period(). */
 static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
 				    struct cftype *cftype, u64 rt_period_us)
 {
 	struct task_group *tg = css_tg(css);
 	return sched_group_set_rt_period(tg, rt_period_us);
 }
 /* "rt_period_us" read handler: forwards to sched_group_rt_period(). */
 static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
 				   struct cftype *cft)
 {
 	struct task_group *tg = css_tg(css);
 	return sched_group_rt_period(tg);
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 /*
  * Control files of the cpu cgroup subsystem (referenced from
  * cpu_cgrp_subsys.base_cftypes).  Each entry names a file and the
  * handlers that serve it; which entries exist depends on the scheduler
  * config options below.  The empty entry terminates the table.
  */
 static struct cftype cpu_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* group weight, stored through sched_group_set_shares() */
 	{
 		.name = "shares",
 		.read_u64 = cpu_shares_read_u64,
 		.write_u64 = cpu_shares_write_u64,
 	},
 #endif
 #ifdef CONFIG_CFS_BANDWIDTH
 	/* CFS quota in microseconds; reads back -1 when unlimited */
 	{
 		.name = "cfs_quota_us",
 		.read_s64 = cpu_cfs_quota_read_s64,
 		.write_s64 = cpu_cfs_quota_write_s64,
 	},
 	/* CFS period in microseconds */
 	{
 		.name = "cfs_period_us",
 		.read_u64 = cpu_cfs_period_read_u64,
 		.write_u64 = cpu_cfs_period_write_u64,
 	},
 	/* read-only throttling counters, see cpu_stats_show() */
 	{
 		.name = "stat",
 		.seq_show = cpu_stats_show,
 	},
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
 	/* RT group runtime, handled by sched_group_{set_,}rt_runtime() */
 	{
 		.name = "rt_runtime_us",
 		.read_s64 = cpu_rt_runtime_read,
 		.write_s64 = cpu_rt_runtime_write,
 	},
 	/* RT group period, handled by sched_group_{set_,}rt_period() */
 	{
 		.name = "rt_period_us",
 		.read_u64 = cpu_rt_period_read_uint,
 		.write_u64 = cpu_rt_period_write_uint,
 	},
 #endif
 	{ }	/* terminate */
 };
 /*
  * cgroup subsystem descriptor for the cpu controller: wires the css
  * lifecycle and task attach/exit callbacks defined above, and exposes
  * cpu_files as the subsystem's base control files.
  */
 struct cgroup_subsys cpu_cgrp_subsys = {
 	.css_alloc	= cpu_cgroup_css_alloc,
 	.css_free	= cpu_cgroup_css_free,
 	.css_online	= cpu_cgroup_css_online,
 	.css_offline	= cpu_cgroup_css_offline,
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
 	.exit		= cpu_cgroup_exit,
 	.base_cftypes	= cpu_files,
 	/* NOTE(review): presumably requests initialization early in boot - confirm against the cgroup core */
 	.early_init	= 1,
 };
 #endif	/* CONFIG_CGROUP_SCHED */
 /* Log a header line for @cpu, then show the task returned by cpu_curr(@cpu). */
 void dump_cpu_task(int cpu)
 {
 	struct task_struct *curr;
 	pr_info("Task dump for CPU %d:\n", cpu);
 	curr = cpu_curr(cpu);
 	sched_show_task(curr);
 }

kernel/sched/fair.c

Diff comments View file @ 09dc4ab

1	/*	1	/*
2	* Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)	2	* Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3	*	3	*
4	* Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>	4	* Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5	*	5	*
6	* Interactivity improvements by Mike Galbraith	6	* Interactivity improvements by Mike Galbraith
7	* (C) 2007 Mike Galbraith <efault@gmx.de>	7	* (C) 2007 Mike Galbraith <efault@gmx.de>
8	*	8	*
9	* Various enhancements by Dmitry Adamushko.	9	* Various enhancements by Dmitry Adamushko.
10	* (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>	10	* (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11	*	11	*
12	* Group scheduling enhancements by Srivatsa Vaddagiri	12	* Group scheduling enhancements by Srivatsa Vaddagiri
13	* Copyright IBM Corporation, 2007	13	* Copyright IBM Corporation, 2007
14	* Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>	14	* Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15	*	15	*
16	* Scaled math optimizations by Thomas Gleixner	16	* Scaled math optimizations by Thomas Gleixner
17	* Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>	17	* Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
18	*	18	*
19	* Adaptive scheduling granularity, math enhancements by Peter Zijlstra	19	* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
20	* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>	20	* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
21	*/	21	*/
22		22
23	#include <linux/latencytop.h>	23	#include <linux/latencytop.h>
24	#include <linux/sched.h>	24	#include <linux/sched.h>
25	#include <linux/cpumask.h>	25	#include <linux/cpumask.h>
26	#include <linux/slab.h>	26	#include <linux/slab.h>
27	#include <linux/profile.h>	27	#include <linux/profile.h>
28	#include <linux/interrupt.h>	28	#include <linux/interrupt.h>
29	#include <linux/mempolicy.h>	29	#include <linux/mempolicy.h>
30	#include <linux/migrate.h>	30	#include <linux/migrate.h>
31	#include <linux/task_work.h>	31	#include <linux/task_work.h>
32		32
33	#include <trace/events/sched.h>	33	#include <trace/events/sched.h>
34		34
35	#include "sched.h"	35	#include "sched.h"
36		36
37	/*	37	/*
38	* Targeted preemption latency for CPU-bound tasks:	38	* Targeted preemption latency for CPU-bound tasks:
39	* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)	39	* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
40	*	40	*
41	* NOTE: this latency value is not the same as the concept of	41	* NOTE: this latency value is not the same as the concept of
42	* 'timeslice length' - timeslices in CFS are of variable length	42	* 'timeslice length' - timeslices in CFS are of variable length
43	* and have no persistent notion like in traditional, time-slice	43	* and have no persistent notion like in traditional, time-slice
44	* based scheduling concepts.	44	* based scheduling concepts.
45	*	45	*
46	* (to see the precise effective timeslice length of your workload,	46	* (to see the precise effective timeslice length of your workload,
47	* run vmstat and monitor the context-switches (cs) field)	47	* run vmstat and monitor the context-switches (cs) field)
48	*/	48	*/
49	unsigned int sysctl_sched_latency = 6000000ULL;	49	unsigned int sysctl_sched_latency = 6000000ULL;
50	unsigned int normalized_sysctl_sched_latency = 6000000ULL;	50	unsigned int normalized_sysctl_sched_latency = 6000000ULL;
51		51
52	/*	52	/*
53	* The initial- and re-scaling of tunables is configurable	53	* The initial- and re-scaling of tunables is configurable
54	* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))	54	* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
55	*	55	*
56	* Options are:	56	* Options are:
57	* SCHED_TUNABLESCALING_NONE - unscaled, always *1	57	* SCHED_TUNABLESCALING_NONE - unscaled, always *1
58	* SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)	58	* SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
59	* SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus	59	* SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
60	*/	60	*/
61	enum sched_tunable_scaling sysctl_sched_tunable_scaling	61	enum sched_tunable_scaling sysctl_sched_tunable_scaling
62	= SCHED_TUNABLESCALING_LOG;	62	= SCHED_TUNABLESCALING_LOG;
63		63
64	/*	64	/*
65	* Minimal preemption granularity for CPU-bound tasks:	65	* Minimal preemption granularity for CPU-bound tasks:
66	* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)	66	* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
67	*/	67	*/
68	unsigned int sysctl_sched_min_granularity = 750000ULL;	68	unsigned int sysctl_sched_min_granularity = 750000ULL;
69	unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;	69	unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
70		70
71	/*	71	/*
72	* is kept at sysctl_sched_latency / sysctl_sched_min_granularity	72	* is kept at sysctl_sched_latency / sysctl_sched_min_granularity
73	*/	73	*/
74	static unsigned int sched_nr_latency = 8;	74	static unsigned int sched_nr_latency = 8;
75		75
76	/*	76	/*
77	* After fork, child runs first. If set to 0 (default) then	77	* After fork, child runs first. If set to 0 (default) then
78	* parent will (try to) run first.	78	* parent will (try to) run first.
79	*/	79	*/
80	unsigned int sysctl_sched_child_runs_first __read_mostly;	80	unsigned int sysctl_sched_child_runs_first __read_mostly;
81		81
82	/*	82	/*
83	* SCHED_OTHER wake-up granularity.	83	* SCHED_OTHER wake-up granularity.
84	* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)	84	* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
85	*	85	*
86	* This option delays the preemption effects of decoupled workloads	86	* This option delays the preemption effects of decoupled workloads
87	* and reduces their over-scheduling. Synchronous workloads will still	87	* and reduces their over-scheduling. Synchronous workloads will still
88	* have immediate wakeup/sleep latencies.	88	* have immediate wakeup/sleep latencies.
89	*/	89	*/
90	unsigned int sysctl_sched_wakeup_granularity = 1000000UL;	90	unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
91	unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;	91	unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
92		92
93	const_debug unsigned int sysctl_sched_migration_cost = 500000UL;	93	const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
94		94
95	/*	95	/*
96	* The exponential sliding window over which load is averaged for shares	96	* The exponential sliding window over which load is averaged for shares
97	* distribution.	97	* distribution.
98	* (default: 10msec)	98	* (default: 10msec)
99	*/	99	*/
100	unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;	100	unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
101		101
102	#ifdef CONFIG_CFS_BANDWIDTH	102	#ifdef CONFIG_CFS_BANDWIDTH
103	/*	103	/*
104	* Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool	104	* Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
105	* each time a cfs_rq requests quota.	105	* each time a cfs_rq requests quota.
106	*	106	*
107	* Note: in the case that the slice exceeds the runtime remaining (either due	107	* Note: in the case that the slice exceeds the runtime remaining (either due
108	* to consumption or the quota being specified to be smaller than the slice)	108	* to consumption or the quota being specified to be smaller than the slice)
109	* we will always only issue the remaining available time.	109	* we will always only issue the remaining available time.
110	*	110	*
111	* default: 5 msec, units: microseconds	111	* default: 5 msec, units: microseconds
112	*/	112	*/
113	unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;	113	unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
114	#endif	114	#endif
115		115
116	static inline void update_load_add(struct load_weight *lw, unsigned long inc)	116	static inline void update_load_add(struct load_weight *lw, unsigned long inc)
117	{	117	{
118	lw->weight += inc;	118	lw->weight += inc;
119	lw->inv_weight = 0;	119	lw->inv_weight = 0;
120	}	120	}
121		121
122	static inline void update_load_sub(struct load_weight *lw, unsigned long dec)	122	static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
123	{	123	{
124	lw->weight -= dec;	124	lw->weight -= dec;
125	lw->inv_weight = 0;	125	lw->inv_weight = 0;
126	}	126	}
127		127
128	static inline void update_load_set(struct load_weight *lw, unsigned long w)	128	static inline void update_load_set(struct load_weight *lw, unsigned long w)
129	{	129	{
130	lw->weight = w;	130	lw->weight = w;
131	lw->inv_weight = 0;	131	lw->inv_weight = 0;
132	}	132	}
133		133
134	/*	134	/*
135	* Increase the granularity value when there are more CPUs,	135	* Increase the granularity value when there are more CPUs,
136	* because with more CPUs the 'effective latency' as visible	136	* because with more CPUs the 'effective latency' as visible
137	* to users decreases. But the relationship is not linear,	137	* to users decreases. But the relationship is not linear,
138	* so pick a second-best guess by going with the log2 of the	138	* so pick a second-best guess by going with the log2 of the
139	* number of CPUs.	139	* number of CPUs.
140	*	140	*
141	* This idea comes from the SD scheduler of Con Kolivas:	141	* This idea comes from the SD scheduler of Con Kolivas:
142	*/	142	*/
143	static int get_update_sysctl_factor(void)	143	static int get_update_sysctl_factor(void)
144	{	144	{
145	unsigned int cpus = min_t(int, num_online_cpus(), 8);	145	unsigned int cpus = min_t(int, num_online_cpus(), 8);
146	unsigned int factor;	146	unsigned int factor;
147		147
148	switch (sysctl_sched_tunable_scaling) {	148	switch (sysctl_sched_tunable_scaling) {
149	case SCHED_TUNABLESCALING_NONE:	149	case SCHED_TUNABLESCALING_NONE:
150	factor = 1;	150	factor = 1;
151	break;	151	break;
152	case SCHED_TUNABLESCALING_LINEAR:	152	case SCHED_TUNABLESCALING_LINEAR:
153	factor = cpus;	153	factor = cpus;
154	break;	154	break;
155	case SCHED_TUNABLESCALING_LOG:	155	case SCHED_TUNABLESCALING_LOG:
156	default:	156	default:
157	factor = 1 + ilog2(cpus);	157	factor = 1 + ilog2(cpus);
158	break;	158	break;
159	}	159	}
160		160
161	return factor;	161	return factor;
162	}	162	}
163		163
164	static void update_sysctl(void)	164	static void update_sysctl(void)
165	{	165	{
166	unsigned int factor = get_update_sysctl_factor();	166	unsigned int factor = get_update_sysctl_factor();
167		167
168	#define SET_SYSCTL(name) \	168	#define SET_SYSCTL(name) \
169	(sysctl_##name = (factor) * normalized_sysctl_##name)	169	(sysctl_##name = (factor) * normalized_sysctl_##name)
170	SET_SYSCTL(sched_min_granularity);	170	SET_SYSCTL(sched_min_granularity);
171	SET_SYSCTL(sched_latency);	171	SET_SYSCTL(sched_latency);
172	SET_SYSCTL(sched_wakeup_granularity);	172	SET_SYSCTL(sched_wakeup_granularity);
173	#undef SET_SYSCTL	173	#undef SET_SYSCTL
174	}	174	}
175		175
176	void sched_init_granularity(void)	176	void sched_init_granularity(void)
177	{	177	{
178	update_sysctl();	178	update_sysctl();
179	}	179	}
180		180
181	#define WMULT_CONST (~0U)	181	#define WMULT_CONST (~0U)
182	#define WMULT_SHIFT 32	182	#define WMULT_SHIFT 32
183		183
184	static void __update_inv_weight(struct load_weight *lw)	184	static void __update_inv_weight(struct load_weight *lw)
185	{	185	{
186	unsigned long w;	186	unsigned long w;
187		187
188	if (likely(lw->inv_weight))	188	if (likely(lw->inv_weight))
189	return;	189	return;
190		190
191	w = scale_load_down(lw->weight);	191	w = scale_load_down(lw->weight);
192		192
193	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))	193	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
194	lw->inv_weight = 1;	194	lw->inv_weight = 1;
195	else if (unlikely(!w))	195	else if (unlikely(!w))
196	lw->inv_weight = WMULT_CONST;	196	lw->inv_weight = WMULT_CONST;
197	else	197	else
198	lw->inv_weight = WMULT_CONST / w;	198	lw->inv_weight = WMULT_CONST / w;
199	}	199	}
200		200
201	/*	201	/*
202	* delta_exec * weight / lw.weight	202	* delta_exec * weight / lw.weight
203	* OR	203	* OR
204	* (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT	204	* (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
205	*	205	*
206	* Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case	206	* Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
207	* we're guaranteed shift stays positive because inv_weight is guaranteed to	207	* we're guaranteed shift stays positive because inv_weight is guaranteed to
208	* fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.	208	* fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
209	*	209	*
210	* Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus	210	* Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
211	* weight/lw.weight <= 1, and therefore our shift will also be positive.	211	* weight/lw.weight <= 1, and therefore our shift will also be positive.
212	*/	212	*/
213	static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)	213	static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
214	{	214	{
215	u64 fact = scale_load_down(weight);	215	u64 fact = scale_load_down(weight);
216	int shift = WMULT_SHIFT;	216	int shift = WMULT_SHIFT;
217		217
218	__update_inv_weight(lw);	218	__update_inv_weight(lw);
219		219
220	if (unlikely(fact >> 32)) {	220	if (unlikely(fact >> 32)) {
221	while (fact >> 32) {	221	while (fact >> 32) {
222	fact >>= 1;	222	fact >>= 1;
223	shift--;	223	shift--;
224	}	224	}
225	}	225	}
226		226
227	/* hint to use a 32x32->64 mul */	227	/* hint to use a 32x32->64 mul */
228	fact = (u64)(u32)fact * lw->inv_weight;	228	fact = (u64)(u32)fact * lw->inv_weight;
229		229
230	while (fact >> 32) {	230	while (fact >> 32) {
231	fact >>= 1;	231	fact >>= 1;
232	shift--;	232	shift--;
233	}	233	}
234		234
235	return mul_u64_u32_shr(delta_exec, fact, shift);	235	return mul_u64_u32_shr(delta_exec, fact, shift);
236	}	236	}
237		237
238		238
239	const struct sched_class fair_sched_class;	239	const struct sched_class fair_sched_class;
240		240
241	/**************************************************************	241	/**************************************************************
242	* CFS operations on generic schedulable entities:	242	* CFS operations on generic schedulable entities:
243	*/	243	*/
244		244
245	#ifdef CONFIG_FAIR_GROUP_SCHED	245	#ifdef CONFIG_FAIR_GROUP_SCHED
246		246
247	/* cpu runqueue to which this cfs_rq is attached */	247	/* cpu runqueue to which this cfs_rq is attached */
248	static inline struct rq *rq_of(struct cfs_rq *cfs_rq)	248	static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
249	{	249	{
250	return cfs_rq->rq;	250	return cfs_rq->rq;
251	}	251	}
252		252
253	/* An entity is a task if it doesn't "own" a runqueue */	253	/* An entity is a task if it doesn't "own" a runqueue */
254	#define entity_is_task(se) (!se->my_q)	254	#define entity_is_task(se) (!se->my_q)
255		255
256	static inline struct task_struct *task_of(struct sched_entity *se)	256	static inline struct task_struct *task_of(struct sched_entity *se)
257	{	257	{
258	#ifdef CONFIG_SCHED_DEBUG	258	#ifdef CONFIG_SCHED_DEBUG
259	WARN_ON_ONCE(!entity_is_task(se));	259	WARN_ON_ONCE(!entity_is_task(se));
260	#endif	260	#endif
261	return container_of(se, struct task_struct, se);	261	return container_of(se, struct task_struct, se);
262	}	262	}
263		263
264	/* Walk up scheduling entities hierarchy */	264	/* Walk up scheduling entities hierarchy */
265	#define for_each_sched_entity(se) \	265	#define for_each_sched_entity(se) \
266	for (; se; se = se->parent)	266	for (; se; se = se->parent)
267		267
268	static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)	268	static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
269	{	269	{
270	return p->se.cfs_rq;	270	return p->se.cfs_rq;
271	}	271	}
272		272
273	/* runqueue on which this entity is (to be) queued */	273	/* runqueue on which this entity is (to be) queued */
274	static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)	274	static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
275	{	275	{
276	return se->cfs_rq;	276	return se->cfs_rq;
277	}	277	}
278		278
279	/* runqueue "owned" by this group */	279	/* runqueue "owned" by this group */
280	static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)	280	static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
281	{	281	{
282	return grp->my_q;	282	return grp->my_q;
283	}	283	}
284		284
285	static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,	285	static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
286	int force_update);	286	int force_update);
287		287
288	static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)	288	static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
289	{	289	{
290	if (!cfs_rq->on_list) {	290	if (!cfs_rq->on_list) {
291	/*	291	/*
292	* Ensure we either appear before our parent (if already	292	* Ensure we either appear before our parent (if already
293	* enqueued) or force our parent to appear after us when it is	293	* enqueued) or force our parent to appear after us when it is
294	* enqueued. The fact that we always enqueue bottom-up	294	* enqueued. The fact that we always enqueue bottom-up
295	* reduces this to two cases.	295	* reduces this to two cases.
296	*/	296	*/
297	if (cfs_rq->tg->parent &&	297	if (cfs_rq->tg->parent &&
298	cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {	298	cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
299	list_add_rcu(&cfs_rq->leaf_cfs_rq_list,	299	list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
300	&rq_of(cfs_rq)->leaf_cfs_rq_list);	300	&rq_of(cfs_rq)->leaf_cfs_rq_list);
301	} else {	301	} else {
302	list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,	302	list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
303	&rq_of(cfs_rq)->leaf_cfs_rq_list);	303	&rq_of(cfs_rq)->leaf_cfs_rq_list);
304	}	304	}
305		305
306	cfs_rq->on_list = 1;	306	cfs_rq->on_list = 1;
307	/* We should have no load, but we need to update last_decay. */	307	/* We should have no load, but we need to update last_decay. */
308	update_cfs_rq_blocked_load(cfs_rq, 0);	308	update_cfs_rq_blocked_load(cfs_rq, 0);
309	}	309	}
310	}	310	}
311		311
312	static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)	312	static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
313	{	313	{
314	if (cfs_rq->on_list) {	314	if (cfs_rq->on_list) {
315	list_del_rcu(&cfs_rq->leaf_cfs_rq_list);	315	list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
316	cfs_rq->on_list = 0;	316	cfs_rq->on_list = 0;
317	}	317	}
318	}	318	}
319		319
320	/* Iterate thr' all leaf cfs_rq's on a runqueue */	320	/* Iterate thr' all leaf cfs_rq's on a runqueue */
321	#define for_each_leaf_cfs_rq(rq, cfs_rq) \	321	#define for_each_leaf_cfs_rq(rq, cfs_rq) \
322	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)	322	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
323		323
324	/* Do the two (enqueued) entities belong to the same group ? */	324	/* Do the two (enqueued) entities belong to the same group ? */
325	static inline struct cfs_rq *	325	static inline struct cfs_rq *
326	is_same_group(struct sched_entity *se, struct sched_entity *pse)	326	is_same_group(struct sched_entity *se, struct sched_entity *pse)
327	{	327	{
328	if (se->cfs_rq == pse->cfs_rq)	328	if (se->cfs_rq == pse->cfs_rq)
329	return se->cfs_rq;	329	return se->cfs_rq;
330		330
331	return NULL;	331	return NULL;
332	}	332	}
333		333
334	static inline struct sched_entity *parent_entity(struct sched_entity *se)	334	static inline struct sched_entity *parent_entity(struct sched_entity *se)
335	{	335	{
336	return se->parent;	336	return se->parent;
337	}	337	}
338		338
339	static void	339	static void
340	find_matching_se(struct sched_entity **se, struct sched_entity **pse)	340	find_matching_se(struct sched_entity **se, struct sched_entity **pse)
341	{	341	{
342	int se_depth, pse_depth;	342	int se_depth, pse_depth;
343		343
344	/*	344	/*
345	* preemption test can be made between sibling entities who are in the	345	* preemption test can be made between sibling entities who are in the
346	* same cfs_rq i.e who have a common parent. Walk up the hierarchy of	346	* same cfs_rq i.e who have a common parent. Walk up the hierarchy of
347	* both tasks until we find their ancestors who are siblings of common	347	* both tasks until we find their ancestors who are siblings of common
348	* parent.	348	* parent.
349	*/	349	*/
350		350
351	/* First walk up until both entities are at same depth */	351	/* First walk up until both entities are at same depth */
352	se_depth = (*se)->depth;	352	se_depth = (*se)->depth;
353	pse_depth = (*pse)->depth;	353	pse_depth = (*pse)->depth;
354		354
355	while (se_depth > pse_depth) {	355	while (se_depth > pse_depth) {
356	se_depth--;	356	se_depth--;
357	*se = parent_entity(*se);	357	*se = parent_entity(*se);
358	}	358	}
359		359
360	while (pse_depth > se_depth) {	360	while (pse_depth > se_depth) {
361	pse_depth--;	361	pse_depth--;
362	*pse = parent_entity(*pse);	362	*pse = parent_entity(*pse);
363	}	363	}
364		364
365	while (!is_same_group(*se, *pse)) {	365	while (!is_same_group(*se, *pse)) {
366	*se = parent_entity(*se);	366	*se = parent_entity(*se);
367	*pse = parent_entity(*pse);	367	*pse = parent_entity(*pse);
368	}	368	}
369	}	369	}
370		370
371	#else /* !CONFIG_FAIR_GROUP_SCHED */	371	#else /* !CONFIG_FAIR_GROUP_SCHED */
372		372
373	static inline struct task_struct *task_of(struct sched_entity *se)	373	static inline struct task_struct *task_of(struct sched_entity *se)
374	{	374	{
375	return container_of(se, struct task_struct, se);	375	return container_of(se, struct task_struct, se);
376	}	376	}
377		377
378	static inline struct rq *rq_of(struct cfs_rq *cfs_rq)	378	static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
379	{	379	{
380	return container_of(cfs_rq, struct rq, cfs);	380	return container_of(cfs_rq, struct rq, cfs);
381	}	381	}
382		382
383	#define entity_is_task(se) 1	383	#define entity_is_task(se) 1
384		384
385	#define for_each_sched_entity(se) \	385	#define for_each_sched_entity(se) \
386	for (; se; se = NULL)	386	for (; se; se = NULL)
387		387
388	static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)	388	static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
389	{	389	{
390	return &task_rq(p)->cfs;	390	return &task_rq(p)->cfs;
391	}	391	}
392		392
393	static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)	393	static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
394	{	394	{
395	struct task_struct *p = task_of(se);	395	struct task_struct *p = task_of(se);
396	struct rq *rq = task_rq(p);	396	struct rq *rq = task_rq(p);
397		397
398	return &rq->cfs;	398	return &rq->cfs;
399	}	399	}
400		400
401	/* runqueue "owned" by this group */	401	/* runqueue "owned" by this group */
402	static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)	402	static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
403	{	403	{
404	return NULL;	404	return NULL;
405	}	405	}
406		406
407	static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)	407	static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
408	{	408	{
409	}	409	}
410		410
411	static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)	411	static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
412	{	412	{
413	}	413	}
414		414
415	#define for_each_leaf_cfs_rq(rq, cfs_rq) \	415	#define for_each_leaf_cfs_rq(rq, cfs_rq) \
416	for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)	416	for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
417		417
418	static inline struct sched_entity *parent_entity(struct sched_entity *se)	418	static inline struct sched_entity *parent_entity(struct sched_entity *se)
419	{	419	{
420	return NULL;	420	return NULL;
421	}	421	}
422		422
423	static inline void	423	static inline void
424	find_matching_se(struct sched_entity **se, struct sched_entity **pse)	424	find_matching_se(struct sched_entity **se, struct sched_entity **pse)
425	{	425	{
426	}	426	}
427		427
428	#endif /* CONFIG_FAIR_GROUP_SCHED */	428	#endif /* CONFIG_FAIR_GROUP_SCHED */
429		429
430	static __always_inline	430	static __always_inline
431	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);	431	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
432		432
433	/**************************************************************	433	/**************************************************************
434	* Scheduling class tree data structure manipulation methods:	434	* Scheduling class tree data structure manipulation methods:
435	*/	435	*/
436		436
437	static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)	437	static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
438	{	438	{
439	s64 delta = (s64)(vruntime - max_vruntime);	439	s64 delta = (s64)(vruntime - max_vruntime);
440	if (delta > 0)	440	if (delta > 0)
441	max_vruntime = vruntime;	441	max_vruntime = vruntime;
442		442
443	return max_vruntime;	443	return max_vruntime;
444	}	444	}
445		445
446	static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)	446	static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
447	{	447	{
448	s64 delta = (s64)(vruntime - min_vruntime);	448	s64 delta = (s64)(vruntime - min_vruntime);
449	if (delta < 0)	449	if (delta < 0)
450	min_vruntime = vruntime;	450	min_vruntime = vruntime;
451		451
452	return min_vruntime;	452	return min_vruntime;
453	}	453	}
454		454
455	static inline int entity_before(struct sched_entity *a,	455	static inline int entity_before(struct sched_entity *a,
456	struct sched_entity *b)	456	struct sched_entity *b)
457	{	457	{
458	return (s64)(a->vruntime - b->vruntime) < 0;	458	return (s64)(a->vruntime - b->vruntime) < 0;
459	}	459	}
460		460
461	static void update_min_vruntime(struct cfs_rq *cfs_rq)	461	static void update_min_vruntime(struct cfs_rq *cfs_rq)
462	{	462	{
463	u64 vruntime = cfs_rq->min_vruntime;	463	u64 vruntime = cfs_rq->min_vruntime;
464		464
465	if (cfs_rq->curr)	465	if (cfs_rq->curr)
466	vruntime = cfs_rq->curr->vruntime;	466	vruntime = cfs_rq->curr->vruntime;
467		467
468	if (cfs_rq->rb_leftmost) {	468	if (cfs_rq->rb_leftmost) {
469	struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,	469	struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
470	struct sched_entity,	470	struct sched_entity,
471	run_node);	471	run_node);
472		472
473	if (!cfs_rq->curr)	473	if (!cfs_rq->curr)
474	vruntime = se->vruntime;	474	vruntime = se->vruntime;
475	else	475	else
476	vruntime = min_vruntime(vruntime, se->vruntime);	476	vruntime = min_vruntime(vruntime, se->vruntime);
477	}	477	}
478		478
479	/* ensure we never gain time by being placed backwards. */	479	/* ensure we never gain time by being placed backwards. */
480	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);	480	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
481	#ifndef CONFIG_64BIT	481	#ifndef CONFIG_64BIT
482	smp_wmb();	482	smp_wmb();
483	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;	483	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
484	#endif	484	#endif
485	}	485	}
486		486
487	/*	487	/*
488	* Enqueue an entity into the rb-tree:	488	* Enqueue an entity into the rb-tree:
489	*/	489	*/
490	static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)	490	static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
491	{	491	{
492	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;	492	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
493	struct rb_node *parent = NULL;	493	struct rb_node *parent = NULL;
494	struct sched_entity *entry;	494	struct sched_entity *entry;
495	int leftmost = 1;	495	int leftmost = 1;
496		496
497	/*	497	/*
498	* Find the right place in the rbtree:	498	* Find the right place in the rbtree:
499	*/	499	*/
500	while (*link) {	500	while (*link) {
501	parent = *link;	501	parent = *link;
502	entry = rb_entry(parent, struct sched_entity, run_node);	502	entry = rb_entry(parent, struct sched_entity, run_node);
503	/*	503	/*
504	* We dont care about collisions. Nodes with	504	* We dont care about collisions. Nodes with
505	* the same key stay together.	505	* the same key stay together.
506	*/	506	*/
507	if (entity_before(se, entry)) {	507	if (entity_before(se, entry)) {
508	link = &parent->rb_left;	508	link = &parent->rb_left;
509	} else {	509	} else {
510	link = &parent->rb_right;	510	link = &parent->rb_right;
511	leftmost = 0;	511	leftmost = 0;
512	}	512	}
513	}	513	}
514		514
515	/*	515	/*
516	* Maintain a cache of leftmost tree entries (it is frequently	516	* Maintain a cache of leftmost tree entries (it is frequently
517	* used):	517	* used):
518	*/	518	*/
519	if (leftmost)	519	if (leftmost)
520	cfs_rq->rb_leftmost = &se->run_node;	520	cfs_rq->rb_leftmost = &se->run_node;
521		521
522	rb_link_node(&se->run_node, parent, link);	522	rb_link_node(&se->run_node, parent, link);
523	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);	523	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
524	}	524	}
525		525
526	static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)	526	static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
527	{	527	{
528	if (cfs_rq->rb_leftmost == &se->run_node) {	528	if (cfs_rq->rb_leftmost == &se->run_node) {
529	struct rb_node *next_node;	529	struct rb_node *next_node;
530		530
531	next_node = rb_next(&se->run_node);	531	next_node = rb_next(&se->run_node);
532	cfs_rq->rb_leftmost = next_node;	532	cfs_rq->rb_leftmost = next_node;
533	}	533	}
534		534
535	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);	535	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
536	}	536	}
537		537
538	struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)	538	struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
539	{	539	{
540	struct rb_node *left = cfs_rq->rb_leftmost;	540	struct rb_node *left = cfs_rq->rb_leftmost;
541		541
542	if (!left)	542	if (!left)
543	return NULL;	543	return NULL;
544		544
545	return rb_entry(left, struct sched_entity, run_node);	545	return rb_entry(left, struct sched_entity, run_node);
546	}	546	}
547		547
548	static struct sched_entity *__pick_next_entity(struct sched_entity *se)	548	static struct sched_entity *__pick_next_entity(struct sched_entity *se)
549	{	549	{
550	struct rb_node *next = rb_next(&se->run_node);	550	struct rb_node *next = rb_next(&se->run_node);
551		551
552	if (!next)	552	if (!next)
553	return NULL;	553	return NULL;
554		554
555	return rb_entry(next, struct sched_entity, run_node);	555	return rb_entry(next, struct sched_entity, run_node);
556	}	556	}
557		557
558	#ifdef CONFIG_SCHED_DEBUG	558	#ifdef CONFIG_SCHED_DEBUG
559	struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)	559	struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
560	{	560	{
561	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);	561	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
562		562
563	if (!last)	563	if (!last)
564	return NULL;	564	return NULL;
565		565
566	return rb_entry(last, struct sched_entity, run_node);	566	return rb_entry(last, struct sched_entity, run_node);
567	}	567	}
568		568
569	/**************************************************************	569	/**************************************************************
570	* Scheduling class statistics methods:	570	* Scheduling class statistics methods:
571	*/	571	*/
572		572
573	int sched_proc_update_handler(struct ctl_table *table, int write,	573	int sched_proc_update_handler(struct ctl_table *table, int write,
574	void __user *buffer, size_t *lenp,	574	void __user *buffer, size_t *lenp,
575	loff_t *ppos)	575	loff_t *ppos)
576	{	576	{
577	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);	577	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
578	int factor = get_update_sysctl_factor();	578	int factor = get_update_sysctl_factor();
579		579
580	if (ret || !write)	580	if (ret || !write)
581	return ret;	581	return ret;
582		582
583	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,	583	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
584	sysctl_sched_min_granularity);	584	sysctl_sched_min_granularity);
585		585
586	#define WRT_SYSCTL(name) \	586	#define WRT_SYSCTL(name) \
587	(normalized_sysctl_##name = sysctl_##name / (factor))	587	(normalized_sysctl_##name = sysctl_##name / (factor))
588	WRT_SYSCTL(sched_min_granularity);	588	WRT_SYSCTL(sched_min_granularity);
589	WRT_SYSCTL(sched_latency);	589	WRT_SYSCTL(sched_latency);
590	WRT_SYSCTL(sched_wakeup_granularity);	590	WRT_SYSCTL(sched_wakeup_granularity);
591	#undef WRT_SYSCTL	591	#undef WRT_SYSCTL
592		592
593	return 0;	593	return 0;
594	}	594	}
595	#endif	595	#endif
596		596
597	/*	597	/*
598	* delta /= w	598	* delta /= w
599	*/	599	*/
600	static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)	600	static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
601	{	601	{
602	if (unlikely(se->load.weight != NICE_0_LOAD))	602	if (unlikely(se->load.weight != NICE_0_LOAD))
603	delta = __calc_delta(delta, NICE_0_LOAD, &se->load);	603	delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
604		604
605	return delta;	605	return delta;
606	}	606	}
607		607
608	/*	608	/*
609	* The idea is to set a period in which each task runs once.	609	* The idea is to set a period in which each task runs once.
610	*	610	*
611	* When there are too many tasks (sched_nr_latency) we have to stretch	611	* When there are too many tasks (sched_nr_latency) we have to stretch
612	* this period because otherwise the slices get too small.	612	* this period because otherwise the slices get too small.
613	*	613	*
614	* p = (nr <= nl) ? l : l*nr/nl	614	* p = (nr <= nl) ? l : l*nr/nl
615	*/	615	*/
616	static u64 __sched_period(unsigned long nr_running)	616	static u64 __sched_period(unsigned long nr_running)
617	{	617	{
618	u64 period = sysctl_sched_latency;	618	u64 period = sysctl_sched_latency;
619	unsigned long nr_latency = sched_nr_latency;	619	unsigned long nr_latency = sched_nr_latency;
620		620
621	if (unlikely(nr_running > nr_latency)) {	621	if (unlikely(nr_running > nr_latency)) {
622	period = sysctl_sched_min_granularity;	622	period = sysctl_sched_min_granularity;
623	period *= nr_running;	623	period *= nr_running;
624	}	624	}
625		625
626	return period;	626	return period;
627	}	627	}
628		628
629	/*	629	/*
630	* We calculate the wall-time slice from the period by taking a part	630	* We calculate the wall-time slice from the period by taking a part
631	* proportional to the weight.	631	* proportional to the weight.
632	*	632	*
633	* s = p*P[w/rw]	633	* s = p*P[w/rw]
634	*/	634	*/
635	static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)	635	static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
636	{	636	{
637	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);	637	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
638		638
639	for_each_sched_entity(se) {	639	for_each_sched_entity(se) {
640	struct load_weight *load;	640	struct load_weight *load;
641	struct load_weight lw;	641	struct load_weight lw;
642		642
643	cfs_rq = cfs_rq_of(se);	643	cfs_rq = cfs_rq_of(se);
644	load = &cfs_rq->load;	644	load = &cfs_rq->load;
645		645
646	if (unlikely(!se->on_rq)) {	646	if (unlikely(!se->on_rq)) {
647	lw = cfs_rq->load;	647	lw = cfs_rq->load;
648		648
649	update_load_add(&lw, se->load.weight);	649	update_load_add(&lw, se->load.weight);
650	load = &lw;	650	load = &lw;
651	}	651	}
652	slice = __calc_delta(slice, se->load.weight, load);	652	slice = __calc_delta(slice, se->load.weight, load);
653	}	653	}
654	return slice;	654	return slice;
655	}	655	}
656		656
657	/*	657	/*
658	* We calculate the vruntime slice of a to-be-inserted task.	658	* We calculate the vruntime slice of a to-be-inserted task.
659	*	659	*
660	* vs = s/w	660	* vs = s/w
661	*/	661	*/
662	static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)	662	static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
663	{	663	{
664	return calc_delta_fair(sched_slice(cfs_rq, se), se);	664	return calc_delta_fair(sched_slice(cfs_rq, se), se);
665	}	665	}
666		666
667	#ifdef CONFIG_SMP	667	#ifdef CONFIG_SMP
668	static unsigned long task_h_load(struct task_struct *p);	668	static unsigned long task_h_load(struct task_struct *p);
669		669
670	static inline void __update_task_entity_contrib(struct sched_entity *se);	670	static inline void __update_task_entity_contrib(struct sched_entity *se);
671		671
672	/* Give new task start runnable values to heavy its load in infant time */	672	/* Give new task start runnable values to heavy its load in infant time */
673	void init_task_runnable_average(struct task_struct *p)	673	void init_task_runnable_average(struct task_struct *p)
674	{	674	{
675	u32 slice;	675	u32 slice;
676		676
677	p->se.avg.decay_count = 0;	677	p->se.avg.decay_count = 0;
678	slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;	678	slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
679	p->se.avg.runnable_avg_sum = slice;	679	p->se.avg.runnable_avg_sum = slice;
680	p->se.avg.runnable_avg_period = slice;	680	p->se.avg.runnable_avg_period = slice;
681	__update_task_entity_contrib(&p->se);	681	__update_task_entity_contrib(&p->se);
682	}	682	}
683	#else	683	#else
684	void init_task_runnable_average(struct task_struct *p)	684	void init_task_runnable_average(struct task_struct *p)
685	{	685	{
686	}	686	}
687	#endif	687	#endif
688		688
689	/*	689	/*
690	* Update the current task's runtime statistics.	690	* Update the current task's runtime statistics.
691	*/	691	*/
692	static void update_curr(struct cfs_rq *cfs_rq)	692	static void update_curr(struct cfs_rq *cfs_rq)
693	{	693	{
694	struct sched_entity *curr = cfs_rq->curr;	694	struct sched_entity *curr = cfs_rq->curr;
695	u64 now = rq_clock_task(rq_of(cfs_rq));	695	u64 now = rq_clock_task(rq_of(cfs_rq));
696	u64 delta_exec;	696	u64 delta_exec;
697		697
698	if (unlikely(!curr))	698	if (unlikely(!curr))
699	return;	699	return;
700		700
701	delta_exec = now - curr->exec_start;	701	delta_exec = now - curr->exec_start;
702	if (unlikely((s64)delta_exec <= 0))	702	if (unlikely((s64)delta_exec <= 0))
703	return;	703	return;
704		704
705	curr->exec_start = now;	705	curr->exec_start = now;
706		706
707	schedstat_set(curr->statistics.exec_max,	707	schedstat_set(curr->statistics.exec_max,
708	max(delta_exec, curr->statistics.exec_max));	708	max(delta_exec, curr->statistics.exec_max));
709		709
710	curr->sum_exec_runtime += delta_exec;	710	curr->sum_exec_runtime += delta_exec;
711	schedstat_add(cfs_rq, exec_clock, delta_exec);	711	schedstat_add(cfs_rq, exec_clock, delta_exec);
712		712
713	curr->vruntime += calc_delta_fair(delta_exec, curr);	713	curr->vruntime += calc_delta_fair(delta_exec, curr);
714	update_min_vruntime(cfs_rq);	714	update_min_vruntime(cfs_rq);
715		715
716	if (entity_is_task(curr)) {	716	if (entity_is_task(curr)) {
717	struct task_struct *curtask = task_of(curr);	717	struct task_struct *curtask = task_of(curr);
718		718
719	trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);	719	trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
720	cpuacct_charge(curtask, delta_exec);	720	cpuacct_charge(curtask, delta_exec);
721	account_group_exec_runtime(curtask, delta_exec);	721	account_group_exec_runtime(curtask, delta_exec);
722	}	722	}
723		723
724	account_cfs_rq_runtime(cfs_rq, delta_exec);	724	account_cfs_rq_runtime(cfs_rq, delta_exec);
725	}	725	}
726		726
727	static inline void	727	static inline void
728	update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)	728	update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
729	{	729	{
730	schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));	730	schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
731	}	731	}
732		732
733	/*	733	/*
734	* Task is being enqueued - update stats:	734	* Task is being enqueued - update stats:
735	*/	735	*/
736	static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)	736	static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
737	{	737	{
738	/*	738	/*
739	* Are we enqueueing a waiting task? (for current tasks	739	* Are we enqueueing a waiting task? (for current tasks
740	* a dequeue/enqueue event is a NOP)	740	* a dequeue/enqueue event is a NOP)
741	*/	741	*/
742	if (se != cfs_rq->curr)	742	if (se != cfs_rq->curr)
743	update_stats_wait_start(cfs_rq, se);	743	update_stats_wait_start(cfs_rq, se);
744	}	744	}
745		745
746	static void	746	static void
747	update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)	747	update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
748	{	748	{
749	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,	749	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
750	rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));	750	rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
751	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);	751	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
752	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +	752	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
753	rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);	753	rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
754	#ifdef CONFIG_SCHEDSTATS	754	#ifdef CONFIG_SCHEDSTATS
755	if (entity_is_task(se)) {	755	if (entity_is_task(se)) {
756	trace_sched_stat_wait(task_of(se),	756	trace_sched_stat_wait(task_of(se),
757	rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);	757	rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
758	}	758	}
759	#endif	759	#endif
760	schedstat_set(se->statistics.wait_start, 0);	760	schedstat_set(se->statistics.wait_start, 0);
761	}	761	}
762		762
763	static inline void	763	static inline void
764	update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)	764	update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
765	{	765	{
766	/*	766	/*
767	* Mark the end of the wait period if dequeueing a	767	* Mark the end of the wait period if dequeueing a
768	* waiting task:	768	* waiting task:
769	*/	769	*/
770	if (se != cfs_rq->curr)	770	if (se != cfs_rq->curr)
771	update_stats_wait_end(cfs_rq, se);	771	update_stats_wait_end(cfs_rq, se);
772	}	772	}
773		773
774	/*	774	/*
775	* We are picking a new current task - update its stats:	775	* We are picking a new current task - update its stats:
776	*/	776	*/
777	static inline void	777	static inline void
778	update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)	778	update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
779	{	779	{
780	/*	780	/*
781	* We are starting a new run period:	781	* We are starting a new run period:
782	*/	782	*/
783	se->exec_start = rq_clock_task(rq_of(cfs_rq));	783	se->exec_start = rq_clock_task(rq_of(cfs_rq));
784	}	784	}
785		785
786	/**************************************************	786	/**************************************************
787	* Scheduling class queueing methods:	787	* Scheduling class queueing methods:
788	*/	788	*/
789		789
790	#ifdef CONFIG_NUMA_BALANCING	790	#ifdef CONFIG_NUMA_BALANCING
791	/*	791	/*
792	* Approximate time to scan a full NUMA task in ms. The task scan period is	792	* Approximate time to scan a full NUMA task in ms. The task scan period is
793	* calculated based on the tasks virtual memory size and	793	* calculated based on the tasks virtual memory size and
794	* numa_balancing_scan_size.	794	* numa_balancing_scan_size.
795	*/	795	*/
796	unsigned int sysctl_numa_balancing_scan_period_min = 1000;	796	unsigned int sysctl_numa_balancing_scan_period_min = 1000;
797	unsigned int sysctl_numa_balancing_scan_period_max = 60000;	797	unsigned int sysctl_numa_balancing_scan_period_max = 60000;
798		798
799	/* Portion of address space to scan in MB */	799	/* Portion of address space to scan in MB */
800	unsigned int sysctl_numa_balancing_scan_size = 256;	800	unsigned int sysctl_numa_balancing_scan_size = 256;
801		801
802	/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */	802	/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
803	unsigned int sysctl_numa_balancing_scan_delay = 1000;	803	unsigned int sysctl_numa_balancing_scan_delay = 1000;
804		804
805	static unsigned int task_nr_scan_windows(struct task_struct *p)	805	static unsigned int task_nr_scan_windows(struct task_struct *p)
806	{	806	{
807	unsigned long rss = 0;	807	unsigned long rss = 0;
808	unsigned long nr_scan_pages;	808	unsigned long nr_scan_pages;
809		809
810	/*	810	/*
811	* Calculations based on RSS as non-present and empty pages are skipped	811	* Calculations based on RSS as non-present and empty pages are skipped
812	* by the PTE scanner and NUMA hinting faults should be trapped based	812	* by the PTE scanner and NUMA hinting faults should be trapped based
813	* on resident pages	813	* on resident pages
814	*/	814	*/
815	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);	815	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
816	rss = get_mm_rss(p->mm);	816	rss = get_mm_rss(p->mm);
817	if (!rss)	817	if (!rss)
818	rss = nr_scan_pages;	818	rss = nr_scan_pages;
819		819
820	rss = round_up(rss, nr_scan_pages);	820	rss = round_up(rss, nr_scan_pages);
821	return rss / nr_scan_pages;	821	return rss / nr_scan_pages;
822	}	822	}
823		823
824	/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */	824	/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
825	#define MAX_SCAN_WINDOW 2560	825	#define MAX_SCAN_WINDOW 2560
826		826
827	static unsigned int task_scan_min(struct task_struct *p)	827	static unsigned int task_scan_min(struct task_struct *p)
828	{	828	{
829	unsigned int scan, floor;	829	unsigned int scan, floor;
830	unsigned int windows = 1;	830	unsigned int windows = 1;
831		831
832	if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)	832	if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
833	windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;	833	windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
834	floor = 1000 / windows;	834	floor = 1000 / windows;
835		835
836	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);	836	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
837	return max_t(unsigned int, floor, scan);	837	return max_t(unsigned int, floor, scan);
838	}	838	}
839		839
840	static unsigned int task_scan_max(struct task_struct *p)	840	static unsigned int task_scan_max(struct task_struct *p)
841	{	841	{
842	unsigned int smin = task_scan_min(p);	842	unsigned int smin = task_scan_min(p);
843	unsigned int smax;	843	unsigned int smax;
844		844
845	/* Watch for min being lower than max due to floor calculations */	845	/* Watch for min being lower than max due to floor calculations */
846	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);	846	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
847	return max(smin, smax);	847	return max(smin, smax);
848	}	848	}
849		849
850	static void account_numa_enqueue(struct rq *rq, struct task_struct *p)	850	static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
851	{	851	{
852	rq->nr_numa_running += (p->numa_preferred_nid != -1);	852	rq->nr_numa_running += (p->numa_preferred_nid != -1);
853	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));	853	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
854	}	854	}
855		855
856	static void account_numa_dequeue(struct rq *rq, struct task_struct *p)	856	static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
857	{	857	{
858	rq->nr_numa_running -= (p->numa_preferred_nid != -1);	858	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
859	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));	859	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
860	}	860	}
861		861
862	struct numa_group {	862	struct numa_group {
863	atomic_t refcount;	863	atomic_t refcount;
864		864
865	spinlock_t lock; /* nr_tasks, tasks */	865	spinlock_t lock; /* nr_tasks, tasks */
866	int nr_tasks;	866	int nr_tasks;
867	pid_t gid;	867	pid_t gid;
868	struct list_head task_list;	868	struct list_head task_list;
869		869
870	struct rcu_head rcu;	870	struct rcu_head rcu;
871	nodemask_t active_nodes;	871	nodemask_t active_nodes;
872	unsigned long total_faults;	872	unsigned long total_faults;
873	/*	873	/*
874	* Faults_cpu is used to decide whether memory should move	874	* Faults_cpu is used to decide whether memory should move
875	* towards the CPU. As a consequence, these stats are weighted	875	* towards the CPU. As a consequence, these stats are weighted
876	* more by CPU use than by memory faults.	876	* more by CPU use than by memory faults.
877	*/	877	*/
878	unsigned long *faults_cpu;	878	unsigned long *faults_cpu;
879	unsigned long faults[0];	879	unsigned long faults[0];
880	};	880	};
881		881
882	/* Shared or private faults. */	882	/* Shared or private faults. */
883	#define NR_NUMA_HINT_FAULT_TYPES 2	883	#define NR_NUMA_HINT_FAULT_TYPES 2
884		884
885	/* Memory and CPU locality */	885	/* Memory and CPU locality */
886	#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)	886	#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
887		887
888	/* Averaged statistics, and temporary buffers. */	888	/* Averaged statistics, and temporary buffers. */
889	#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)	889	#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
890		890
891	pid_t task_numa_group_id(struct task_struct *p)	891	pid_t task_numa_group_id(struct task_struct *p)
892	{	892	{
893	return p->numa_group ? p->numa_group->gid : 0;	893	return p->numa_group ? p->numa_group->gid : 0;
894	}	894	}
895		895
896	static inline int task_faults_idx(int nid, int priv)	896	static inline int task_faults_idx(int nid, int priv)
897	{	897	{
898	return NR_NUMA_HINT_FAULT_TYPES * nid + priv;	898	return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
899	}	899	}
900		900
901	static inline unsigned long task_faults(struct task_struct *p, int nid)	901	static inline unsigned long task_faults(struct task_struct *p, int nid)
902	{	902	{
903	if (!p->numa_faults_memory)	903	if (!p->numa_faults_memory)
904	return 0;	904	return 0;
905		905
906	return p->numa_faults_memory[task_faults_idx(nid, 0)] +	906	return p->numa_faults_memory[task_faults_idx(nid, 0)] +
907	p->numa_faults_memory[task_faults_idx(nid, 1)];	907	p->numa_faults_memory[task_faults_idx(nid, 1)];
908	}	908	}
909		909
910	static inline unsigned long group_faults(struct task_struct *p, int nid)	910	static inline unsigned long group_faults(struct task_struct *p, int nid)
911	{	911	{
912	if (!p->numa_group)	912	if (!p->numa_group)
913	return 0;	913	return 0;
914		914
915	return p->numa_group->faults[task_faults_idx(nid, 0)] +	915	return p->numa_group->faults[task_faults_idx(nid, 0)] +
916	p->numa_group->faults[task_faults_idx(nid, 1)];	916	p->numa_group->faults[task_faults_idx(nid, 1)];
917	}	917	}
918		918
919	static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)	919	static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
920	{	920	{
921	return group->faults_cpu[task_faults_idx(nid, 0)] +	921	return group->faults_cpu[task_faults_idx(nid, 0)] +
922	group->faults_cpu[task_faults_idx(nid, 1)];	922	group->faults_cpu[task_faults_idx(nid, 1)];
923	}	923	}
924		924
925	/*	925	/*
926	* These return the fraction of accesses done by a particular task, or	926	* These return the fraction of accesses done by a particular task, or
927	* task group, on a particular numa node. The group weight is given a	927	* task group, on a particular numa node. The group weight is given a
928	* larger multiplier, in order to group tasks together that are almost	928	* larger multiplier, in order to group tasks together that are almost
929	* evenly spread out between numa nodes.	929	* evenly spread out between numa nodes.
930	*/	930	*/
931	static inline unsigned long task_weight(struct task_struct *p, int nid)	931	static inline unsigned long task_weight(struct task_struct *p, int nid)
932	{	932	{
933	unsigned long total_faults;	933	unsigned long total_faults;
934		934
935	if (!p->numa_faults_memory)	935	if (!p->numa_faults_memory)
936	return 0;	936	return 0;
937		937
938	total_faults = p->total_numa_faults;	938	total_faults = p->total_numa_faults;
939		939
940	if (!total_faults)	940	if (!total_faults)
941	return 0;	941	return 0;
942		942
943	return 1000 * task_faults(p, nid) / total_faults;	943	return 1000 * task_faults(p, nid) / total_faults;
944	}	944	}
945		945
946	static inline unsigned long group_weight(struct task_struct *p, int nid)	946	static inline unsigned long group_weight(struct task_struct *p, int nid)
947	{	947	{
948	if (!p->numa_group || !p->numa_group->total_faults)	948	if (!p->numa_group || !p->numa_group->total_faults)
949	return 0;	949	return 0;
950		950
951	return 1000 * group_faults(p, nid) / p->numa_group->total_faults;	951	return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
952	}	952	}
953		953
954	bool should_numa_migrate_memory(struct task_struct *p, struct page *page,	954	bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
955	int src_nid, int dst_cpu)	955	int src_nid, int dst_cpu)
956	{	956	{
957	struct numa_group *ng = p->numa_group;	957	struct numa_group *ng = p->numa_group;
958	int dst_nid = cpu_to_node(dst_cpu);	958	int dst_nid = cpu_to_node(dst_cpu);
959	int last_cpupid, this_cpupid;	959	int last_cpupid, this_cpupid;
960		960
961	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);	961	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
962		962
963	/*	963	/*
964	* Multi-stage node selection is used in conjunction with a periodic	964	* Multi-stage node selection is used in conjunction with a periodic
965	* migration fault to build a temporal task<->page relation. By using	965	* migration fault to build a temporal task<->page relation. By using
966	* a two-stage filter we remove short/unlikely relations.	966	* a two-stage filter we remove short/unlikely relations.
967	*	967	*
968	* Using P(p) ~ n_p / n_t as per frequentist probability, we can equate	968	* Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
969	* a task's usage of a particular page (n_p) per total usage of this	969	* a task's usage of a particular page (n_p) per total usage of this
970	* page (n_t) (in a given time-span) to a probability.	970	* page (n_t) (in a given time-span) to a probability.
971	*	971	*
972	* Our periodic faults will sample this probability and getting the	972	* Our periodic faults will sample this probability and getting the
973	* same result twice in a row, given these samples are fully	973	* same result twice in a row, given these samples are fully
974	* independent, is then given by P(n)^2, provided our sample period	974	* independent, is then given by P(n)^2, provided our sample period
975	* is sufficiently short compared to the usage pattern.	975	* is sufficiently short compared to the usage pattern.
976	*	976	*
977	* This quadric squishes small probabilities, making it less likely we	977	* This quadric squishes small probabilities, making it less likely we
978	* act on an unlikely task<->page relation.	978	* act on an unlikely task<->page relation.
979	*/	979	*/
980	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);	980	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
981	if (!cpupid_pid_unset(last_cpupid) &&	981	if (!cpupid_pid_unset(last_cpupid) &&
982	cpupid_to_nid(last_cpupid) != dst_nid)	982	cpupid_to_nid(last_cpupid) != dst_nid)
983	return false;	983	return false;
984		984
985	/* Always allow migrate on private faults */	985	/* Always allow migrate on private faults */
986	if (cpupid_match_pid(p, last_cpupid))	986	if (cpupid_match_pid(p, last_cpupid))
987	return true;	987	return true;
988		988
989	/* A shared fault, but p->numa_group has not been set up yet. */	989	/* A shared fault, but p->numa_group has not been set up yet. */
990	if (!ng)	990	if (!ng)
991	return true;	991	return true;
992		992
993	/*	993	/*
994	* Do not migrate if the destination is not a node that	994	* Do not migrate if the destination is not a node that
995	* is actively used by this numa group.	995	* is actively used by this numa group.
996	*/	996	*/
997	if (!node_isset(dst_nid, ng->active_nodes))	997	if (!node_isset(dst_nid, ng->active_nodes))
998	return false;	998	return false;
999		999
1000	/*	1000	/*
1001	* Source is a node that is not actively used by this	1001	* Source is a node that is not actively used by this
1002	* numa group, while the destination is. Migrate.	1002	* numa group, while the destination is. Migrate.
1003	*/	1003	*/
1004	if (!node_isset(src_nid, ng->active_nodes))	1004	if (!node_isset(src_nid, ng->active_nodes))
1005	return true;	1005	return true;
1006		1006
1007	/*	1007	/*
1008	* Both source and destination are nodes in active	1008	* Both source and destination are nodes in active
1009	* use by this numa group. Maximize memory bandwidth	1009	* use by this numa group. Maximize memory bandwidth
1010	* by migrating from more heavily used groups, to less	1010	* by migrating from more heavily used groups, to less
1011	* heavily used ones, spreading the load around.	1011	* heavily used ones, spreading the load around.
1012	* Use a 1/4 hysteresis to avoid spurious page movement.	1012	* Use a 1/4 hysteresis to avoid spurious page movement.
1013	*/	1013	*/
1014	return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);	1014	return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
1015	}	1015	}
1016		1016
/* Forward declarations for helpers used by the NUMA placement code below. */
static unsigned long weighted_cpuload(const int cpu);
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
static unsigned long power_of(int cpu);
static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
1022		1022
/*
 * Per-node snapshot of scheduler statistics, summed over all CPUs of the
 * node by update_numa_stats().
 */
struct numa_stats {
	unsigned long nr_running;	/* sum of rq->nr_running */
	unsigned long load;		/* summed load, scaled by compute power */

	/* Total compute capacity of CPUs on a node */
	unsigned long power;

	/* Approximate capacity in terms of runnable tasks on a node */
	unsigned long capacity;
	int has_capacity;		/* nr_running < capacity */
};
1035		1035
1036	/*	1036	/*
1037	* XXX borrowed from update_sg_lb_stats	1037	* XXX borrowed from update_sg_lb_stats
1038	*/	1038	*/
1039	static void update_numa_stats(struct numa_stats *ns, int nid)	1039	static void update_numa_stats(struct numa_stats *ns, int nid)
1040	{	1040	{
1041	int cpu, cpus = 0;	1041	int cpu, cpus = 0;
1042		1042
1043	memset(ns, 0, sizeof(*ns));	1043	memset(ns, 0, sizeof(*ns));
1044	for_each_cpu(cpu, cpumask_of_node(nid)) {	1044	for_each_cpu(cpu, cpumask_of_node(nid)) {
1045	struct rq *rq = cpu_rq(cpu);	1045	struct rq *rq = cpu_rq(cpu);
1046		1046
1047	ns->nr_running += rq->nr_running;	1047	ns->nr_running += rq->nr_running;
1048	ns->load += weighted_cpuload(cpu);	1048	ns->load += weighted_cpuload(cpu);
1049	ns->power += power_of(cpu);	1049	ns->power += power_of(cpu);
1050		1050
1051	cpus++;	1051	cpus++;
1052	}	1052	}
1053		1053
1054	/*	1054	/*
1055	* If we raced with hotplug and there are no CPUs left in our mask	1055	* If we raced with hotplug and there are no CPUs left in our mask
1056	* the @ns structure is NULL'ed and task_numa_compare() will	1056	* the @ns structure is NULL'ed and task_numa_compare() will
1057	* not find this node attractive.	1057	* not find this node attractive.
1058	*	1058	*
1059	* We'll either bail at !has_capacity, or we'll detect a huge imbalance	1059	* We'll either bail at !has_capacity, or we'll detect a huge imbalance
1060	* and bail there.	1060	* and bail there.
1061	*/	1061	*/
1062	if (!cpus)	1062	if (!cpus)
1063	return;	1063	return;
1064		1064
1065	ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;	1065	ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
1066	ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);	1066	ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
1067	ns->has_capacity = (ns->nr_running < ns->capacity);	1067	ns->has_capacity = (ns->nr_running < ns->capacity);
1068	}	1068	}
1069		1069
1070	struct task_numa_env {	1070	struct task_numa_env {
1071	struct task_struct *p;	1071	struct task_struct *p;
1072		1072
1073	int src_cpu, src_nid;	1073	int src_cpu, src_nid;
1074	int dst_cpu, dst_nid;	1074	int dst_cpu, dst_nid;
1075		1075
1076	struct numa_stats src_stats, dst_stats;	1076	struct numa_stats src_stats, dst_stats;
1077		1077
1078	int imbalance_pct;	1078	int imbalance_pct;
1079		1079
1080	struct task_struct *best_task;	1080	struct task_struct *best_task;
1081	long best_imp;	1081	long best_imp;
1082	int best_cpu;	1082	int best_cpu;
1083	};	1083	};
1084		1084
1085	static void task_numa_assign(struct task_numa_env *env,	1085	static void task_numa_assign(struct task_numa_env *env,
1086	struct task_struct *p, long imp)	1086	struct task_struct *p, long imp)
1087	{	1087	{
1088	if (env->best_task)	1088	if (env->best_task)
1089	put_task_struct(env->best_task);	1089	put_task_struct(env->best_task);
1090	if (p)	1090	if (p)
1091	get_task_struct(p);	1091	get_task_struct(p);
1092		1092
1093	env->best_task = p;	1093	env->best_task = p;
1094	env->best_imp = imp;	1094	env->best_imp = imp;
1095	env->best_cpu = env->dst_cpu;	1095	env->best_cpu = env->dst_cpu;
1096	}	1096	}
1097		1097
1098	/*	1098	/*
1099	* This checks if the overall compute and NUMA accesses of the system would	1099	* This checks if the overall compute and NUMA accesses of the system would
1100	* be improved if the source tasks was migrated to the target dst_cpu taking	1100	* be improved if the source tasks was migrated to the target dst_cpu taking
1101	* into account that it might be best if task running on the dst_cpu should	1101	* into account that it might be best if task running on the dst_cpu should
1102	* be exchanged with the source task	1102	* be exchanged with the source task
1103	*/	1103	*/
1104	static void task_numa_compare(struct task_numa_env *env,	1104	static void task_numa_compare(struct task_numa_env *env,
1105	long taskimp, long groupimp)	1105	long taskimp, long groupimp)
1106	{	1106	{
1107	struct rq *src_rq = cpu_rq(env->src_cpu);	1107	struct rq *src_rq = cpu_rq(env->src_cpu);
1108	struct rq *dst_rq = cpu_rq(env->dst_cpu);	1108	struct rq *dst_rq = cpu_rq(env->dst_cpu);
1109	struct task_struct *cur;	1109	struct task_struct *cur;
1110	long dst_load, src_load;	1110	long dst_load, src_load;
1111	long load;	1111	long load;
1112	long imp = (groupimp > 0) ? groupimp : taskimp;	1112	long imp = (groupimp > 0) ? groupimp : taskimp;
1113		1113
1114	rcu_read_lock();	1114	rcu_read_lock();
1115	cur = ACCESS_ONCE(dst_rq->curr);	1115	cur = ACCESS_ONCE(dst_rq->curr);
1116	if (cur->pid == 0) /* idle */	1116	if (cur->pid == 0) /* idle */
1117	cur = NULL;	1117	cur = NULL;
1118		1118
1119	/*	1119	/*
1120	* "imp" is the fault differential for the source task between the	1120	* "imp" is the fault differential for the source task between the
1121	* source and destination node. Calculate the total differential for	1121	* source and destination node. Calculate the total differential for
1122	* the source task and potential destination task. The more negative	1122	* the source task and potential destination task. The more negative
1123	* the value is, the more rmeote accesses that would be expected to	1123	* the value is, the more rmeote accesses that would be expected to
1124	* be incurred if the tasks were swapped.	1124	* be incurred if the tasks were swapped.
1125	*/	1125	*/
1126	if (cur) {	1126	if (cur) {
1127	/* Skip this swap candidate if cannot move to the source cpu */	1127	/* Skip this swap candidate if cannot move to the source cpu */
1128	if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))	1128	if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1129	goto unlock;	1129	goto unlock;
1130		1130
1131	/*	1131	/*
1132	* If dst and source tasks are in the same NUMA group, or not	1132	* If dst and source tasks are in the same NUMA group, or not
1133	* in any group then look only at task weights.	1133	* in any group then look only at task weights.
1134	*/	1134	*/
1135	if (cur->numa_group == env->p->numa_group) {	1135	if (cur->numa_group == env->p->numa_group) {
1136	imp = taskimp + task_weight(cur, env->src_nid) -	1136	imp = taskimp + task_weight(cur, env->src_nid) -
1137	task_weight(cur, env->dst_nid);	1137	task_weight(cur, env->dst_nid);
1138	/*	1138	/*
1139	* Add some hysteresis to prevent swapping the	1139	* Add some hysteresis to prevent swapping the
1140	* tasks within a group over tiny differences.	1140	* tasks within a group over tiny differences.
1141	*/	1141	*/
1142	if (cur->numa_group)	1142	if (cur->numa_group)
1143	imp -= imp/16;	1143	imp -= imp/16;
1144	} else {	1144	} else {
1145	/*	1145	/*
1146	* Compare the group weights. If a task is all by	1146	* Compare the group weights. If a task is all by
1147	* itself (not part of a group), use the task weight	1147	* itself (not part of a group), use the task weight
1148	* instead.	1148	* instead.
1149	*/	1149	*/
1150	if (env->p->numa_group)	1150	if (env->p->numa_group)
1151	imp = groupimp;	1151	imp = groupimp;
1152	else	1152	else
1153	imp = taskimp;	1153	imp = taskimp;
1154		1154
1155	if (cur->numa_group)	1155	if (cur->numa_group)
1156	imp += group_weight(cur, env->src_nid) -	1156	imp += group_weight(cur, env->src_nid) -
1157	group_weight(cur, env->dst_nid);	1157	group_weight(cur, env->dst_nid);
1158	else	1158	else
1159	imp += task_weight(cur, env->src_nid) -	1159	imp += task_weight(cur, env->src_nid) -
1160	task_weight(cur, env->dst_nid);	1160	task_weight(cur, env->dst_nid);
1161	}	1161	}
1162	}	1162	}
1163		1163
1164	if (imp < env->best_imp)	1164	if (imp < env->best_imp)
1165	goto unlock;	1165	goto unlock;
1166		1166
1167	if (!cur) {	1167	if (!cur) {
1168	/* Is there capacity at our destination? */	1168	/* Is there capacity at our destination? */
1169	if (env->src_stats.has_capacity &&	1169	if (env->src_stats.has_capacity &&
1170	!env->dst_stats.has_capacity)	1170	!env->dst_stats.has_capacity)
1171	goto unlock;	1171	goto unlock;
1172		1172
1173	goto balance;	1173	goto balance;
1174	}	1174	}
1175		1175
1176	/* Balance doesn't matter much if we're running a task per cpu */	1176	/* Balance doesn't matter much if we're running a task per cpu */
1177	if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)	1177	if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
1178	goto assign;	1178	goto assign;
1179		1179
1180	/*	1180	/*
1181	* In the overloaded case, try and keep the load balanced.	1181	* In the overloaded case, try and keep the load balanced.
1182	*/	1182	*/
1183	balance:	1183	balance:
1184	dst_load = env->dst_stats.load;	1184	dst_load = env->dst_stats.load;
1185	src_load = env->src_stats.load;	1185	src_load = env->src_stats.load;
1186		1186
1187	/* XXX missing power terms */	1187	/* XXX missing power terms */
1188	load = task_h_load(env->p);	1188	load = task_h_load(env->p);
1189	dst_load += load;	1189	dst_load += load;
1190	src_load -= load;	1190	src_load -= load;
1191		1191
1192	if (cur) {	1192	if (cur) {
1193	load = task_h_load(cur);	1193	load = task_h_load(cur);
1194	dst_load -= load;	1194	dst_load -= load;
1195	src_load += load;	1195	src_load += load;
1196	}	1196	}
1197		1197
1198	/* make src_load the smaller */	1198	/* make src_load the smaller */
1199	if (dst_load < src_load)	1199	if (dst_load < src_load)
1200	swap(dst_load, src_load);	1200	swap(dst_load, src_load);
1201		1201
1202	if (src_load * env->imbalance_pct < dst_load * 100)	1202	if (src_load * env->imbalance_pct < dst_load * 100)
1203	goto unlock;	1203	goto unlock;
1204		1204
1205	assign:	1205	assign:
1206	task_numa_assign(env, cur, imp);	1206	task_numa_assign(env, cur, imp);
1207	unlock:	1207	unlock:
1208	rcu_read_unlock();	1208	rcu_read_unlock();
1209	}	1209	}
1210		1210
1211	static void task_numa_find_cpu(struct task_numa_env *env,	1211	static void task_numa_find_cpu(struct task_numa_env *env,
1212	long taskimp, long groupimp)	1212	long taskimp, long groupimp)
1213	{	1213	{
1214	int cpu;	1214	int cpu;
1215		1215
1216	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {	1216	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1217	/* Skip this CPU if the source task cannot migrate */	1217	/* Skip this CPU if the source task cannot migrate */
1218	if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))	1218	if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1219	continue;	1219	continue;
1220		1220
1221	env->dst_cpu = cpu;	1221	env->dst_cpu = cpu;
1222	task_numa_compare(env, taskimp, groupimp);	1222	task_numa_compare(env, taskimp, groupimp);
1223	}	1223	}
1224	}	1224	}
1225		1225
1226	static int task_numa_migrate(struct task_struct *p)	1226	static int task_numa_migrate(struct task_struct *p)
1227	{	1227	{
1228	struct task_numa_env env = {	1228	struct task_numa_env env = {
1229	.p = p,	1229	.p = p,
1230		1230
1231	.src_cpu = task_cpu(p),	1231	.src_cpu = task_cpu(p),
1232	.src_nid = task_node(p),	1232	.src_nid = task_node(p),
1233		1233
1234	.imbalance_pct = 112,	1234	.imbalance_pct = 112,
1235		1235
1236	.best_task = NULL,	1236	.best_task = NULL,
1237	.best_imp = 0,	1237	.best_imp = 0,
1238	.best_cpu = -1	1238	.best_cpu = -1
1239	};	1239	};
1240	struct sched_domain *sd;	1240	struct sched_domain *sd;
1241	unsigned long taskweight, groupweight;	1241	unsigned long taskweight, groupweight;
1242	int nid, ret;	1242	int nid, ret;
1243	long taskimp, groupimp;	1243	long taskimp, groupimp;
1244		1244
1245	/*	1245	/*
1246	* Pick the lowest SD_NUMA domain, as that would have the smallest	1246	* Pick the lowest SD_NUMA domain, as that would have the smallest
1247	* imbalance and would be the first to start moving tasks about.	1247	* imbalance and would be the first to start moving tasks about.
1248	*	1248	*
1249	* And we want to avoid any moving of tasks about, as that would create	1249	* And we want to avoid any moving of tasks about, as that would create
1250	* random movement of tasks -- counter the numa conditions we're trying	1250	* random movement of tasks -- counter the numa conditions we're trying
1251	* to satisfy here.	1251	* to satisfy here.
1252	*/	1252	*/
1253	rcu_read_lock();	1253	rcu_read_lock();
1254	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));	1254	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1255	if (sd)	1255	if (sd)
1256	env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;	1256	env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1257	rcu_read_unlock();	1257	rcu_read_unlock();
1258		1258
1259	/*	1259	/*
1260	* Cpusets can break the scheduler domain tree into smaller	1260	* Cpusets can break the scheduler domain tree into smaller
1261	* balance domains, some of which do not cross NUMA boundaries.	1261	* balance domains, some of which do not cross NUMA boundaries.
1262	* Tasks that are "trapped" in such domains cannot be migrated	1262	* Tasks that are "trapped" in such domains cannot be migrated
1263	* elsewhere, so there is no point in (re)trying.	1263	* elsewhere, so there is no point in (re)trying.
1264	*/	1264	*/
1265	if (unlikely(!sd)) {	1265	if (unlikely(!sd)) {
1266	p->numa_preferred_nid = task_node(p);	1266	p->numa_preferred_nid = task_node(p);
1267	return -EINVAL;	1267	return -EINVAL;
1268	}	1268	}
1269		1269
1270	taskweight = task_weight(p, env.src_nid);	1270	taskweight = task_weight(p, env.src_nid);
1271	groupweight = group_weight(p, env.src_nid);	1271	groupweight = group_weight(p, env.src_nid);
1272	update_numa_stats(&env.src_stats, env.src_nid);	1272	update_numa_stats(&env.src_stats, env.src_nid);
1273	env.dst_nid = p->numa_preferred_nid;	1273	env.dst_nid = p->numa_preferred_nid;
1274	taskimp = task_weight(p, env.dst_nid) - taskweight;	1274	taskimp = task_weight(p, env.dst_nid) - taskweight;
1275	groupimp = group_weight(p, env.dst_nid) - groupweight;	1275	groupimp = group_weight(p, env.dst_nid) - groupweight;
1276	update_numa_stats(&env.dst_stats, env.dst_nid);	1276	update_numa_stats(&env.dst_stats, env.dst_nid);
1277		1277
1278	/* If the preferred nid has capacity, try to use it. */	1278	/* If the preferred nid has capacity, try to use it. */
1279	if (env.dst_stats.has_capacity)	1279	if (env.dst_stats.has_capacity)
1280	task_numa_find_cpu(&env, taskimp, groupimp);	1280	task_numa_find_cpu(&env, taskimp, groupimp);
1281		1281
1282	/* No space available on the preferred nid. Look elsewhere. */	1282	/* No space available on the preferred nid. Look elsewhere. */
1283	if (env.best_cpu == -1) {	1283	if (env.best_cpu == -1) {
1284	for_each_online_node(nid) {	1284	for_each_online_node(nid) {
1285	if (nid == env.src_nid \|\| nid == p->numa_preferred_nid)	1285	if (nid == env.src_nid \|\| nid == p->numa_preferred_nid)
1286	continue;	1286	continue;
1287		1287
1288	/* Only consider nodes where both task and groups benefit */	1288	/* Only consider nodes where both task and groups benefit */
1289	taskimp = task_weight(p, nid) - taskweight;	1289	taskimp = task_weight(p, nid) - taskweight;
1290	groupimp = group_weight(p, nid) - groupweight;	1290	groupimp = group_weight(p, nid) - groupweight;
1291	if (taskimp < 0 && groupimp < 0)	1291	if (taskimp < 0 && groupimp < 0)
1292	continue;	1292	continue;
1293		1293
1294	env.dst_nid = nid;	1294	env.dst_nid = nid;
1295	update_numa_stats(&env.dst_stats, env.dst_nid);	1295	update_numa_stats(&env.dst_stats, env.dst_nid);
1296	task_numa_find_cpu(&env, taskimp, groupimp);	1296	task_numa_find_cpu(&env, taskimp, groupimp);
1297	}	1297	}
1298	}	1298	}
1299		1299
1300	/* No better CPU than the current one was found. */	1300	/* No better CPU than the current one was found. */
1301	if (env.best_cpu == -1)	1301	if (env.best_cpu == -1)
1302	return -EAGAIN;	1302	return -EAGAIN;
1303		1303
1304	sched_setnuma(p, env.dst_nid);	1304	sched_setnuma(p, env.dst_nid);
1305		1305
1306	/*	1306	/*
1307	* Reset the scan period if the task is being rescheduled on an	1307	* Reset the scan period if the task is being rescheduled on an
1308	* alternative node to recheck if the tasks is now properly placed.	1308	* alternative node to recheck if the tasks is now properly placed.
1309	*/	1309	*/
1310	p->numa_scan_period = task_scan_min(p);	1310	p->numa_scan_period = task_scan_min(p);
1311		1311
1312	if (env.best_task == NULL) {	1312	if (env.best_task == NULL) {
1313	ret = migrate_task_to(p, env.best_cpu);	1313	ret = migrate_task_to(p, env.best_cpu);
1314	if (ret != 0)	1314	if (ret != 0)
1315	trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);	1315	trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1316	return ret;	1316	return ret;
1317	}	1317	}
1318		1318
1319	ret = migrate_swap(p, env.best_task);	1319	ret = migrate_swap(p, env.best_task);
1320	if (ret != 0)	1320	if (ret != 0)
1321	trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));	1321	trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1322	put_task_struct(env.best_task);	1322	put_task_struct(env.best_task);
1323	return ret;	1323	return ret;
1324	}	1324	}
1325		1325
1326	/* Attempt to migrate a task to a CPU on the preferred node. */	1326	/* Attempt to migrate a task to a CPU on the preferred node. */
1327	static void numa_migrate_preferred(struct task_struct *p)	1327	static void numa_migrate_preferred(struct task_struct *p)
1328	{	1328	{
1329	/* This task has no NUMA fault statistics yet */	1329	/* This task has no NUMA fault statistics yet */
1330	if (unlikely(p->numa_preferred_nid == -1 \|\| !p->numa_faults_memory))	1330	if (unlikely(p->numa_preferred_nid == -1 \|\| !p->numa_faults_memory))
1331	return;	1331	return;
1332		1332
1333	/* Periodically retry migrating the task to the preferred node */	1333	/* Periodically retry migrating the task to the preferred node */
1334	p->numa_migrate_retry = jiffies + HZ;	1334	p->numa_migrate_retry = jiffies + HZ;
1335		1335
1336	/* Success if task is already running on preferred CPU */	1336	/* Success if task is already running on preferred CPU */
1337	if (task_node(p) == p->numa_preferred_nid)	1337	if (task_node(p) == p->numa_preferred_nid)
1338	return;	1338	return;
1339		1339
1340	/* Otherwise, try migrate to a CPU on the preferred node */	1340	/* Otherwise, try migrate to a CPU on the preferred node */
1341	task_numa_migrate(p);	1341	task_numa_migrate(p);
1342	}	1342	}
1343		1343
1344	/*	1344	/*
1345	* Find the nodes on which the workload is actively running. We do this by	1345	* Find the nodes on which the workload is actively running. We do this by
1346	* tracking the nodes from which NUMA hinting faults are triggered. This can	1346	* tracking the nodes from which NUMA hinting faults are triggered. This can
1347	* be different from the set of nodes where the workload's memory is currently	1347	* be different from the set of nodes where the workload's memory is currently
1348	* located.	1348	* located.
1349	*	1349	*
1350	* The bitmask is used to make smarter decisions on when to do NUMA page	1350	* The bitmask is used to make smarter decisions on when to do NUMA page
1351	* migrations, To prevent flip-flopping, and excessive page migrations, nodes	1351	* migrations, To prevent flip-flopping, and excessive page migrations, nodes
1352	* are added when they cause over 6/16 of the maximum number of faults, but	1352	* are added when they cause over 6/16 of the maximum number of faults, but
1353	* only removed when they drop below 3/16.	1353	* only removed when they drop below 3/16.
1354	*/	1354	*/
1355	static void update_numa_active_node_mask(struct numa_group *numa_group)	1355	static void update_numa_active_node_mask(struct numa_group *numa_group)
1356	{	1356	{
1357	unsigned long faults, max_faults = 0;	1357	unsigned long faults, max_faults = 0;
1358	int nid;	1358	int nid;
1359		1359
1360	for_each_online_node(nid) {	1360	for_each_online_node(nid) {
1361	faults = group_faults_cpu(numa_group, nid);	1361	faults = group_faults_cpu(numa_group, nid);
1362	if (faults > max_faults)	1362	if (faults > max_faults)
1363	max_faults = faults;	1363	max_faults = faults;
1364	}	1364	}
1365		1365
1366	for_each_online_node(nid) {	1366	for_each_online_node(nid) {
1367	faults = group_faults_cpu(numa_group, nid);	1367	faults = group_faults_cpu(numa_group, nid);
1368	if (!node_isset(nid, numa_group->active_nodes)) {	1368	if (!node_isset(nid, numa_group->active_nodes)) {
1369	if (faults > max_faults * 6 / 16)	1369	if (faults > max_faults * 6 / 16)
1370	node_set(nid, numa_group->active_nodes);	1370	node_set(nid, numa_group->active_nodes);
1371	} else if (faults < max_faults * 3 / 16)	1371	} else if (faults < max_faults * 3 / 16)
1372	node_clear(nid, numa_group->active_nodes);	1372	node_clear(nid, numa_group->active_nodes);
1373	}	1373	}
1374	}	1374	}
1375		1375
/*
 * Scan-rate adaptation works in steps of 1/NUMA_PERIOD_SLOTS of the current
 * scan period. The more local the fault statistics are, the longer the next
 * scan period becomes. When the local/remote ratio (scaled to
 * NUMA_PERIOD_SLOTS) falls below NUMA_PERIOD_THRESHOLD, the scan period is
 * shortened instead.
 */
#define NUMA_PERIOD_SLOTS 10
#define NUMA_PERIOD_THRESHOLD 3
1385		1385
1386	/*	1386	/*
1387	* Increase the scan period (slow down scanning) if the majority of	1387	* Increase the scan period (slow down scanning) if the majority of
1388	* our memory is already on our local node, or if the majority of	1388	* our memory is already on our local node, or if the majority of
1389	* the page accesses are shared with other processes.	1389	* the page accesses are shared with other processes.
1390	* Otherwise, decrease the scan period.	1390	* Otherwise, decrease the scan period.
1391	*/	1391	*/
1392	static void update_task_scan_period(struct task_struct *p,	1392	static void update_task_scan_period(struct task_struct *p,
1393	unsigned long shared, unsigned long private)	1393	unsigned long shared, unsigned long private)
1394	{	1394	{
1395	unsigned int period_slot;	1395	unsigned int period_slot;
1396	int ratio;	1396	int ratio;
1397	int diff;	1397	int diff;
1398		1398
1399	unsigned long remote = p->numa_faults_locality[0];	1399	unsigned long remote = p->numa_faults_locality[0];
1400	unsigned long local = p->numa_faults_locality[1];	1400	unsigned long local = p->numa_faults_locality[1];
1401		1401
1402	/*	1402	/*
1403	* If there were no record hinting faults then either the task is	1403	* If there were no record hinting faults then either the task is
1404	* completely idle or all activity is areas that are not of interest	1404	* completely idle or all activity is areas that are not of interest
1405	* to automatic numa balancing. Scan slower	1405	* to automatic numa balancing. Scan slower
1406	*/	1406	*/
1407	if (local + shared == 0) {	1407	if (local + shared == 0) {
1408	p->numa_scan_period = min(p->numa_scan_period_max,	1408	p->numa_scan_period = min(p->numa_scan_period_max,
1409	p->numa_scan_period << 1);	1409	p->numa_scan_period << 1);
1410		1410
1411	p->mm->numa_next_scan = jiffies +	1411	p->mm->numa_next_scan = jiffies +
1412	msecs_to_jiffies(p->numa_scan_period);	1412	msecs_to_jiffies(p->numa_scan_period);
1413		1413
1414	return;	1414	return;
1415	}	1415	}
1416		1416
1417	/*	1417	/*
1418	* Prepare to scale scan period relative to the current period.	1418	* Prepare to scale scan period relative to the current period.
1419	* == NUMA_PERIOD_THRESHOLD scan period stays the same	1419	* == NUMA_PERIOD_THRESHOLD scan period stays the same
1420	* < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)	1420	* < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1421	* >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)	1421	* >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1422	*/	1422	*/
1423	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);	1423	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1424	ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);	1424	ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1425	if (ratio >= NUMA_PERIOD_THRESHOLD) {	1425	if (ratio >= NUMA_PERIOD_THRESHOLD) {
1426	int slot = ratio - NUMA_PERIOD_THRESHOLD;	1426	int slot = ratio - NUMA_PERIOD_THRESHOLD;
1427	if (!slot)	1427	if (!slot)
1428	slot = 1;	1428	slot = 1;
1429	diff = slot * period_slot;	1429	diff = slot * period_slot;
1430	} else {	1430	} else {
1431	diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;	1431	diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1432		1432
1433	/*	1433	/*
1434	* Scale scan rate increases based on sharing. There is an	1434	* Scale scan rate increases based on sharing. There is an
1435	* inverse relationship between the degree of sharing and	1435	* inverse relationship between the degree of sharing and
1436	* the adjustment made to the scanning period. Broadly	1436	* the adjustment made to the scanning period. Broadly
1437	* speaking the intent is that there is little point	1437	* speaking the intent is that there is little point
1438	* scanning faster if shared accesses dominate as it may	1438	* scanning faster if shared accesses dominate as it may
1439	* simply bounce migrations uselessly	1439	* simply bounce migrations uselessly
1440	*/	1440	*/
1441	ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));	1441	ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
1442	diff = (diff * ratio) / NUMA_PERIOD_SLOTS;	1442	diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1443	}	1443	}
1444		1444
1445	p->numa_scan_period = clamp(p->numa_scan_period + diff,	1445	p->numa_scan_period = clamp(p->numa_scan_period + diff,
1446	task_scan_min(p), task_scan_max(p));	1446	task_scan_min(p), task_scan_max(p));
1447	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));	1447	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1448	}	1448	}
1449		1449
1450	/*	1450	/*
1451	* Get the fraction of time the task has been running since the last	1451	* Get the fraction of time the task has been running since the last
1452	* NUMA placement cycle. The scheduler keeps similar statistics, but	1452	* NUMA placement cycle. The scheduler keeps similar statistics, but
1453	* decays those on a 32ms period, which is orders of magnitude off	1453	* decays those on a 32ms period, which is orders of magnitude off
1454	* from the dozens-of-seconds NUMA balancing period. Use the scheduler	1454	* from the dozens-of-seconds NUMA balancing period. Use the scheduler
1455	* stats only if the task is so new there are no NUMA statistics yet.	1455	* stats only if the task is so new there are no NUMA statistics yet.
1456	*/	1456	*/
1457	static u64 numa_get_avg_runtime(struct task_struct p, u64 period)	1457	static u64 numa_get_avg_runtime(struct task_struct p, u64 period)
1458	{	1458	{
1459	u64 runtime, delta, now;	1459	u64 runtime, delta, now;
1460	/* Use the start of this time slice to avoid calculations. */	1460	/* Use the start of this time slice to avoid calculations. */
1461	now = p->se.exec_start;	1461	now = p->se.exec_start;
1462	runtime = p->se.sum_exec_runtime;	1462	runtime = p->se.sum_exec_runtime;
1463		1463
1464	if (p->last_task_numa_placement) {	1464	if (p->last_task_numa_placement) {
1465	delta = runtime - p->last_sum_exec_runtime;	1465	delta = runtime - p->last_sum_exec_runtime;
1466	*period = now - p->last_task_numa_placement;	1466	*period = now - p->last_task_numa_placement;
1467	} else {	1467	} else {
1468	delta = p->se.avg.runnable_avg_sum;	1468	delta = p->se.avg.runnable_avg_sum;
1469	*period = p->se.avg.runnable_avg_period;	1469	*period = p->se.avg.runnable_avg_period;
1470	}	1470	}
1471		1471
1472	p->last_sum_exec_runtime = runtime;	1472	p->last_sum_exec_runtime = runtime;
1473	p->last_task_numa_placement = now;	1473	p->last_task_numa_placement = now;
1474		1474
1475	return delta;	1475	return delta;
1476	}	1476	}
1477		1477
1478	static void task_numa_placement(struct task_struct *p)	1478	static void task_numa_placement(struct task_struct *p)
1479	{	1479	{
1480	int seq, nid, max_nid = -1, max_group_nid = -1;	1480	int seq, nid, max_nid = -1, max_group_nid = -1;
1481	unsigned long max_faults = 0, max_group_faults = 0;	1481	unsigned long max_faults = 0, max_group_faults = 0;
1482	unsigned long fault_types[2] = { 0, 0 };	1482	unsigned long fault_types[2] = { 0, 0 };
1483	unsigned long total_faults;	1483	unsigned long total_faults;
1484	u64 runtime, period;	1484	u64 runtime, period;
1485	spinlock_t *group_lock = NULL;	1485	spinlock_t *group_lock = NULL;
1486		1486
1487	seq = ACCESS_ONCE(p->mm->numa_scan_seq);	1487	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
1488	if (p->numa_scan_seq == seq)	1488	if (p->numa_scan_seq == seq)
1489	return;	1489	return;
1490	p->numa_scan_seq = seq;	1490	p->numa_scan_seq = seq;
1491	p->numa_scan_period_max = task_scan_max(p);	1491	p->numa_scan_period_max = task_scan_max(p);
1492		1492
1493	total_faults = p->numa_faults_locality[0] +	1493	total_faults = p->numa_faults_locality[0] +
1494	p->numa_faults_locality[1];	1494	p->numa_faults_locality[1];
1495	runtime = numa_get_avg_runtime(p, &period);	1495	runtime = numa_get_avg_runtime(p, &period);
1496		1496
1497	/* If the task is part of a group prevent parallel updates to group stats */	1497	/* If the task is part of a group prevent parallel updates to group stats */
1498	if (p->numa_group) {	1498	if (p->numa_group) {
1499	group_lock = &p->numa_group->lock;	1499	group_lock = &p->numa_group->lock;
1500	spin_lock_irq(group_lock);	1500	spin_lock_irq(group_lock);
1501	}	1501	}
1502		1502
1503	/* Find the node with the highest number of faults */	1503	/* Find the node with the highest number of faults */
1504	for_each_online_node(nid) {	1504	for_each_online_node(nid) {
1505	unsigned long faults = 0, group_faults = 0;	1505	unsigned long faults = 0, group_faults = 0;
1506	int priv, i;	1506	int priv, i;
1507		1507
1508	for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {	1508	for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
1509	long diff, f_diff, f_weight;	1509	long diff, f_diff, f_weight;
1510		1510
1511	i = task_faults_idx(nid, priv);	1511	i = task_faults_idx(nid, priv);
1512		1512
1513	/* Decay existing window, copy faults since last scan */	1513	/* Decay existing window, copy faults since last scan */
1514	diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;	1514	diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;
1515	fault_types[priv] += p->numa_faults_buffer_memory[i];	1515	fault_types[priv] += p->numa_faults_buffer_memory[i];
1516	p->numa_faults_buffer_memory[i] = 0;	1516	p->numa_faults_buffer_memory[i] = 0;
1517		1517
1518	/*	1518	/*
1519	* Normalize the faults_from, so all tasks in a group	1519	* Normalize the faults_from, so all tasks in a group
1520	* count according to CPU use, instead of by the raw	1520	* count according to CPU use, instead of by the raw
1521	* number of faults. Tasks with little runtime have	1521	* number of faults. Tasks with little runtime have
1522	* little over-all impact on throughput, and thus their	1522	* little over-all impact on throughput, and thus their
1523	* faults are less important.	1523	* faults are less important.
1524	*/	1524	*/
1525	f_weight = div64_u64(runtime << 16, period + 1);	1525	f_weight = div64_u64(runtime << 16, period + 1);
1526	f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /	1526	f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
1527	(total_faults + 1);	1527	(total_faults + 1);
1528	f_diff = f_weight - p->numa_faults_cpu[i] / 2;	1528	f_diff = f_weight - p->numa_faults_cpu[i] / 2;
1529	p->numa_faults_buffer_cpu[i] = 0;	1529	p->numa_faults_buffer_cpu[i] = 0;
1530		1530
1531	p->numa_faults_memory[i] += diff;	1531	p->numa_faults_memory[i] += diff;
1532	p->numa_faults_cpu[i] += f_diff;	1532	p->numa_faults_cpu[i] += f_diff;
1533	faults += p->numa_faults_memory[i];	1533	faults += p->numa_faults_memory[i];
1534	p->total_numa_faults += diff;	1534	p->total_numa_faults += diff;
1535	if (p->numa_group) {	1535	if (p->numa_group) {
1536	/* safe because we can only change our own group */	1536	/* safe because we can only change our own group */
1537	p->numa_group->faults[i] += diff;	1537	p->numa_group->faults[i] += diff;
1538	p->numa_group->faults_cpu[i] += f_diff;	1538	p->numa_group->faults_cpu[i] += f_diff;
1539	p->numa_group->total_faults += diff;	1539	p->numa_group->total_faults += diff;
1540	group_faults += p->numa_group->faults[i];	1540	group_faults += p->numa_group->faults[i];
1541	}	1541	}
1542	}	1542	}
1543		1543
1544	if (faults > max_faults) {	1544	if (faults > max_faults) {
1545	max_faults = faults;	1545	max_faults = faults;
1546	max_nid = nid;	1546	max_nid = nid;
1547	}	1547	}
1548		1548
1549	if (group_faults > max_group_faults) {	1549	if (group_faults > max_group_faults) {
1550	max_group_faults = group_faults;	1550	max_group_faults = group_faults;
1551	max_group_nid = nid;	1551	max_group_nid = nid;
1552	}	1552	}
1553	}	1553	}
1554		1554
1555	update_task_scan_period(p, fault_types[0], fault_types[1]);	1555	update_task_scan_period(p, fault_types[0], fault_types[1]);
1556		1556
1557	if (p->numa_group) {	1557	if (p->numa_group) {
1558	update_numa_active_node_mask(p->numa_group);	1558	update_numa_active_node_mask(p->numa_group);
1559	/*	1559	/*
1560	* If the preferred task and group nids are different,	1560	* If the preferred task and group nids are different,
1561	* iterate over the nodes again to find the best place.	1561	* iterate over the nodes again to find the best place.
1562	*/	1562	*/
1563	if (max_nid != max_group_nid) {	1563	if (max_nid != max_group_nid) {
1564	unsigned long weight, max_weight = 0;	1564	unsigned long weight, max_weight = 0;
1565		1565
1566	for_each_online_node(nid) {	1566	for_each_online_node(nid) {
1567	weight = task_weight(p, nid) + group_weight(p, nid);	1567	weight = task_weight(p, nid) + group_weight(p, nid);
1568	if (weight > max_weight) {	1568	if (weight > max_weight) {
1569	max_weight = weight;	1569	max_weight = weight;
1570	max_nid = nid;	1570	max_nid = nid;
1571	}	1571	}
1572	}	1572	}
1573	}	1573	}
1574		1574
1575	spin_unlock_irq(group_lock);	1575	spin_unlock_irq(group_lock);
1576	}	1576	}
1577		1577
1578	/* Preferred node as the node with the most faults */	1578	/* Preferred node as the node with the most faults */
1579	if (max_faults && max_nid != p->numa_preferred_nid) {	1579	if (max_faults && max_nid != p->numa_preferred_nid) {
1580	/* Update the preferred nid and migrate task if possible */	1580	/* Update the preferred nid and migrate task if possible */
1581	sched_setnuma(p, max_nid);	1581	sched_setnuma(p, max_nid);
1582	numa_migrate_preferred(p);	1582	numa_migrate_preferred(p);
1583	}	1583	}
1584	}	1584	}
1585		1585
1586	static inline int get_numa_group(struct numa_group *grp)	1586	static inline int get_numa_group(struct numa_group *grp)
1587	{	1587	{
1588	return atomic_inc_not_zero(&grp->refcount);	1588	return atomic_inc_not_zero(&grp->refcount);
1589	}	1589	}
1590		1590
1591	static inline void put_numa_group(struct numa_group *grp)	1591	static inline void put_numa_group(struct numa_group *grp)
1592	{	1592	{
1593	if (atomic_dec_and_test(&grp->refcount))	1593	if (atomic_dec_and_test(&grp->refcount))
1594	kfree_rcu(grp, rcu);	1594	kfree_rcu(grp, rcu);
1595	}	1595	}
1596		1596
1597	static void task_numa_group(struct task_struct *p, int cpupid, int flags,	1597	static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1598	int *priv)	1598	int *priv)
1599	{	1599	{
1600	struct numa_group *grp, *my_grp;	1600	struct numa_group *grp, *my_grp;
1601	struct task_struct *tsk;	1601	struct task_struct *tsk;
1602	bool join = false;	1602	bool join = false;
1603	int cpu = cpupid_to_cpu(cpupid);	1603	int cpu = cpupid_to_cpu(cpupid);
1604	int i;	1604	int i;
1605		1605
1606	if (unlikely(!p->numa_group)) {	1606	if (unlikely(!p->numa_group)) {
1607	unsigned int size = sizeof(struct numa_group) +	1607	unsigned int size = sizeof(struct numa_group) +
1608	4*nr_node_ids*sizeof(unsigned long);	1608	4*nr_node_ids*sizeof(unsigned long);
1609		1609
1610	grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);	1610	grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1611	if (!grp)	1611	if (!grp)
1612	return;	1612	return;
1613		1613
1614	atomic_set(&grp->refcount, 1);	1614	atomic_set(&grp->refcount, 1);
1615	spin_lock_init(&grp->lock);	1615	spin_lock_init(&grp->lock);
1616	INIT_LIST_HEAD(&grp->task_list);	1616	INIT_LIST_HEAD(&grp->task_list);
1617	grp->gid = p->pid;	1617	grp->gid = p->pid;
1618	/* Second half of the array tracks nids where faults happen */	1618	/* Second half of the array tracks nids where faults happen */
1619	grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *	1619	grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
1620	nr_node_ids;	1620	nr_node_ids;
1621		1621
1622	node_set(task_node(current), grp->active_nodes);	1622	node_set(task_node(current), grp->active_nodes);
1623		1623
1624	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)	1624	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1625	grp->faults[i] = p->numa_faults_memory[i];	1625	grp->faults[i] = p->numa_faults_memory[i];
1626		1626
1627	grp->total_faults = p->total_numa_faults;	1627	grp->total_faults = p->total_numa_faults;
1628		1628
1629	list_add(&p->numa_entry, &grp->task_list);	1629	list_add(&p->numa_entry, &grp->task_list);
1630	grp->nr_tasks++;	1630	grp->nr_tasks++;
1631	rcu_assign_pointer(p->numa_group, grp);	1631	rcu_assign_pointer(p->numa_group, grp);
1632	}	1632	}
1633		1633
1634	rcu_read_lock();	1634	rcu_read_lock();
1635	tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);	1635	tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
1636		1636
1637	if (!cpupid_match_pid(tsk, cpupid))	1637	if (!cpupid_match_pid(tsk, cpupid))
1638	goto no_join;	1638	goto no_join;
1639		1639
1640	grp = rcu_dereference(tsk->numa_group);	1640	grp = rcu_dereference(tsk->numa_group);
1641	if (!grp)	1641	if (!grp)
1642	goto no_join;	1642	goto no_join;
1643		1643
1644	my_grp = p->numa_group;	1644	my_grp = p->numa_group;
1645	if (grp == my_grp)	1645	if (grp == my_grp)
1646	goto no_join;	1646	goto no_join;
1647		1647
1648	/*	1648	/*
1649	* Only join the other group if it's bigger; if we're the bigger group,	1649	* Only join the other group if it's bigger; if we're the bigger group,
1650	* the other task will join us.	1650	* the other task will join us.
1651	*/	1651	*/
1652	if (my_grp->nr_tasks > grp->nr_tasks)	1652	if (my_grp->nr_tasks > grp->nr_tasks)
1653	goto no_join;	1653	goto no_join;
1654		1654
1655	/*	1655	/*
1656	* Tie-break on the grp address.	1656	* Tie-break on the grp address.
1657	*/	1657	*/
1658	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)	1658	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
1659	goto no_join;	1659	goto no_join;
1660		1660
1661	/* Always join threads in the same process. */	1661	/* Always join threads in the same process. */
1662	if (tsk->mm == current->mm)	1662	if (tsk->mm == current->mm)
1663	join = true;	1663	join = true;
1664		1664
1665	/* Simple filter to avoid false positives due to PID collisions */	1665	/* Simple filter to avoid false positives due to PID collisions */
1666	if (flags & TNF_SHARED)	1666	if (flags & TNF_SHARED)
1667	join = true;	1667	join = true;
1668		1668
1669	/* Update priv based on whether false sharing was detected */	1669	/* Update priv based on whether false sharing was detected */
1670	*priv = !join;	1670	*priv = !join;
1671		1671
1672	if (join && !get_numa_group(grp))	1672	if (join && !get_numa_group(grp))
1673	goto no_join;	1673	goto no_join;
1674		1674
1675	rcu_read_unlock();	1675	rcu_read_unlock();
1676		1676
1677	if (!join)	1677	if (!join)
1678	return;	1678	return;
1679		1679
1680	BUG_ON(irqs_disabled());	1680	BUG_ON(irqs_disabled());
1681	double_lock_irq(&my_grp->lock, &grp->lock);	1681	double_lock_irq(&my_grp->lock, &grp->lock);
1682		1682
1683	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {	1683	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
1684	my_grp->faults[i] -= p->numa_faults_memory[i];	1684	my_grp->faults[i] -= p->numa_faults_memory[i];
1685	grp->faults[i] += p->numa_faults_memory[i];	1685	grp->faults[i] += p->numa_faults_memory[i];
1686	}	1686	}
1687	my_grp->total_faults -= p->total_numa_faults;	1687	my_grp->total_faults -= p->total_numa_faults;
1688	grp->total_faults += p->total_numa_faults;	1688	grp->total_faults += p->total_numa_faults;
1689		1689
1690	list_move(&p->numa_entry, &grp->task_list);	1690	list_move(&p->numa_entry, &grp->task_list);
1691	my_grp->nr_tasks--;	1691	my_grp->nr_tasks--;
1692	grp->nr_tasks++;	1692	grp->nr_tasks++;
1693		1693
1694	spin_unlock(&my_grp->lock);	1694	spin_unlock(&my_grp->lock);
1695	spin_unlock_irq(&grp->lock);	1695	spin_unlock_irq(&grp->lock);
1696		1696
1697	rcu_assign_pointer(p->numa_group, grp);	1697	rcu_assign_pointer(p->numa_group, grp);
1698		1698
1699	put_numa_group(my_grp);	1699	put_numa_group(my_grp);
1700	return;	1700	return;
1701		1701
1702	no_join:	1702	no_join:
1703	rcu_read_unlock();	1703	rcu_read_unlock();
1704	return;	1704	return;
1705	}	1705	}
1706		1706
1707	void task_numa_free(struct task_struct *p)	1707	void task_numa_free(struct task_struct *p)
1708	{	1708	{
1709	struct numa_group *grp = p->numa_group;	1709	struct numa_group *grp = p->numa_group;
1710	void *numa_faults = p->numa_faults_memory;	1710	void *numa_faults = p->numa_faults_memory;
1711	unsigned long flags;	1711	unsigned long flags;
1712	int i;	1712	int i;
1713		1713
1714	if (grp) {	1714	if (grp) {
1715	spin_lock_irqsave(&grp->lock, flags);	1715	spin_lock_irqsave(&grp->lock, flags);
1716	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)	1716	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1717	grp->faults[i] -= p->numa_faults_memory[i];	1717	grp->faults[i] -= p->numa_faults_memory[i];
1718	grp->total_faults -= p->total_numa_faults;	1718	grp->total_faults -= p->total_numa_faults;
1719		1719
1720	list_del(&p->numa_entry);	1720	list_del(&p->numa_entry);
1721	grp->nr_tasks--;	1721	grp->nr_tasks--;
1722	spin_unlock_irqrestore(&grp->lock, flags);	1722	spin_unlock_irqrestore(&grp->lock, flags);
1723	rcu_assign_pointer(p->numa_group, NULL);	1723	rcu_assign_pointer(p->numa_group, NULL);
1724	put_numa_group(grp);	1724	put_numa_group(grp);
1725	}	1725	}
1726		1726
1727	p->numa_faults_memory = NULL;	1727	p->numa_faults_memory = NULL;
1728	p->numa_faults_buffer_memory = NULL;	1728	p->numa_faults_buffer_memory = NULL;
1729	p->numa_faults_cpu= NULL;	1729	p->numa_faults_cpu= NULL;
1730	p->numa_faults_buffer_cpu = NULL;	1730	p->numa_faults_buffer_cpu = NULL;
1731	kfree(numa_faults);	1731	kfree(numa_faults);
1732	}	1732	}
1733		1733
1734	/*	1734	/*
1735	* Got a PROT_NONE fault for a page on @node.	1735	* Got a PROT_NONE fault for a page on @node.
1736	*/	1736	*/
1737	void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)	1737	void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1738	{	1738	{
1739	struct task_struct *p = current;	1739	struct task_struct *p = current;
1740	bool migrated = flags & TNF_MIGRATED;	1740	bool migrated = flags & TNF_MIGRATED;
1741	int cpu_node = task_node(current);	1741	int cpu_node = task_node(current);
1742	int priv;	1742	int priv;
1743		1743
1744	if (!numabalancing_enabled)	1744	if (!numabalancing_enabled)
1745	return;	1745	return;
1746		1746
1747	/* for example, ksmd faulting in a user's mm */	1747	/* for example, ksmd faulting in a user's mm */
1748	if (!p->mm)	1748	if (!p->mm)
1749	return;	1749	return;
1750		1750
1751	/* Do not worry about placement if exiting */	1751	/* Do not worry about placement if exiting */
1752	if (p->state == TASK_DEAD)	1752	if (p->state == TASK_DEAD)
1753	return;	1753	return;
1754		1754
1755	/* Allocate buffer to track faults on a per-node basis */	1755	/* Allocate buffer to track faults on a per-node basis */
1756	if (unlikely(!p->numa_faults_memory)) {	1756	if (unlikely(!p->numa_faults_memory)) {
1757	int size = sizeof(*p->numa_faults_memory) *	1757	int size = sizeof(*p->numa_faults_memory) *
1758	NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;	1758	NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
1759		1759
1760	p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);	1760	p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
1761	if (!p->numa_faults_memory)	1761	if (!p->numa_faults_memory)
1762	return;	1762	return;
1763		1763
1764	BUG_ON(p->numa_faults_buffer_memory);	1764	BUG_ON(p->numa_faults_buffer_memory);
1765	/*	1765	/*
1766	* The averaged statistics, shared & private, memory & cpu,	1766	* The averaged statistics, shared & private, memory & cpu,
1767	* occupy the first half of the array. The second half of the	1767	* occupy the first half of the array. The second half of the
1768	* array is for current counters, which are averaged into the	1768	* array is for current counters, which are averaged into the
1769	* first set by task_numa_placement.	1769	* first set by task_numa_placement.
1770	*/	1770	*/
1771	p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);	1771	p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
1772	p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);	1772	p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
1773	p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);	1773	p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
1774	p->total_numa_faults = 0;	1774	p->total_numa_faults = 0;
1775	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));	1775	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1776	}	1776	}
1777		1777
1778	/*	1778	/*
1779	* First accesses are treated as private, otherwise consider accesses	1779	* First accesses are treated as private, otherwise consider accesses
1780	* to be private if the accessing pid has not changed	1780	* to be private if the accessing pid has not changed
1781	*/	1781	*/
1782	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {	1782	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
1783	priv = 1;	1783	priv = 1;
1784	} else {	1784	} else {
1785	priv = cpupid_match_pid(p, last_cpupid);	1785	priv = cpupid_match_pid(p, last_cpupid);
1786	if (!priv && !(flags & TNF_NO_GROUP))	1786	if (!priv && !(flags & TNF_NO_GROUP))
1787	task_numa_group(p, last_cpupid, flags, &priv);	1787	task_numa_group(p, last_cpupid, flags, &priv);
1788	}	1788	}
1789		1789
1790	task_numa_placement(p);	1790	task_numa_placement(p);
1791		1791
1792	/*	1792	/*
1793	* Retry task to preferred node migration periodically, in case it	1793	* Retry task to preferred node migration periodically, in case it
1794	* previously failed, or the scheduler moved us.	1794	* previously failed, or the scheduler moved us.
1795	*/	1795	*/
1796	if (time_after(jiffies, p->numa_migrate_retry))	1796	if (time_after(jiffies, p->numa_migrate_retry))
1797	numa_migrate_preferred(p);	1797	numa_migrate_preferred(p);
1798		1798
1799	if (migrated)	1799	if (migrated)
1800	p->numa_pages_migrated += pages;	1800	p->numa_pages_migrated += pages;
1801		1801
1802	p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;	1802	p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
1803	p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;	1803	p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
1804	p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;	1804	p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
1805	}	1805	}
1806		1806
1807	static void reset_ptenuma_scan(struct task_struct *p)	1807	static void reset_ptenuma_scan(struct task_struct *p)
1808	{	1808	{
1809	ACCESS_ONCE(p->mm->numa_scan_seq)++;	1809	ACCESS_ONCE(p->mm->numa_scan_seq)++;
1810	p->mm->numa_scan_offset = 0;	1810	p->mm->numa_scan_offset = 0;
1811	}	1811	}
1812		1812
1813	/*	1813	/*
1814	* The expensive part of numa migration is done from task_work context.	1814	* The expensive part of numa migration is done from task_work context.
1815	* Triggered from task_tick_numa().	1815	* Triggered from task_tick_numa().
1816	*/	1816	*/
1817	void task_numa_work(struct callback_head *work)	1817	void task_numa_work(struct callback_head *work)
1818	{	1818	{
1819	unsigned long migrate, next_scan, now = jiffies;	1819	unsigned long migrate, next_scan, now = jiffies;
1820	struct task_struct *p = current;	1820	struct task_struct *p = current;
1821	struct mm_struct *mm = p->mm;	1821	struct mm_struct *mm = p->mm;
1822	struct vm_area_struct *vma;	1822	struct vm_area_struct *vma;
1823	unsigned long start, end;	1823	unsigned long start, end;
1824	unsigned long nr_pte_updates = 0;	1824	unsigned long nr_pte_updates = 0;
1825	long pages;	1825	long pages;
1826		1826
1827	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));	1827	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
1828		1828
1829	work->next = work; /* protect against double add */	1829	work->next = work; /* protect against double add */
1830	/*	1830	/*
1831	* Who cares about NUMA placement when they're dying.	1831	* Who cares about NUMA placement when they're dying.
1832	*	1832	*
1833	* NOTE: make sure not to dereference p->mm before this check,	1833	* NOTE: make sure not to dereference p->mm before this check,
1834	* exit_task_work() happens _after_ exit_mm() so we could be called	1834	* exit_task_work() happens _after_ exit_mm() so we could be called
1835	* without p->mm even though we still had it when we enqueued this	1835	* without p->mm even though we still had it when we enqueued this
1836	* work.	1836	* work.
1837	*/	1837	*/
1838	if (p->flags & PF_EXITING)	1838	if (p->flags & PF_EXITING)
1839	return;	1839	return;
1840		1840
1841	if (!mm->numa_next_scan) {	1841	if (!mm->numa_next_scan) {
1842	mm->numa_next_scan = now +	1842	mm->numa_next_scan = now +
1843	msecs_to_jiffies(sysctl_numa_balancing_scan_delay);	1843	msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
1844	}	1844	}
1845		1845
1846	/*	1846	/*
1847	* Enforce maximal scan/migration frequency..	1847	* Enforce maximal scan/migration frequency..
1848	*/	1848	*/
1849	migrate = mm->numa_next_scan;	1849	migrate = mm->numa_next_scan;
1850	if (time_before(now, migrate))	1850	if (time_before(now, migrate))
1851	return;	1851	return;
1852		1852
1853	if (p->numa_scan_period == 0) {	1853	if (p->numa_scan_period == 0) {
1854	p->numa_scan_period_max = task_scan_max(p);	1854	p->numa_scan_period_max = task_scan_max(p);
1855	p->numa_scan_period = task_scan_min(p);	1855	p->numa_scan_period = task_scan_min(p);
1856	}	1856	}
1857		1857
1858	next_scan = now + msecs_to_jiffies(p->numa_scan_period);	1858	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
1859	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)	1859	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
1860	return;	1860	return;
1861		1861
1862	/*	1862	/*
1863	* Delay this task enough that another task of this mm will likely win	1863	* Delay this task enough that another task of this mm will likely win
1864	* the next time around.	1864	* the next time around.
1865	*/	1865	*/
1866	p->node_stamp += 2 * TICK_NSEC;	1866	p->node_stamp += 2 * TICK_NSEC;
1867		1867
1868	start = mm->numa_scan_offset;	1868	start = mm->numa_scan_offset;
1869	pages = sysctl_numa_balancing_scan_size;	1869	pages = sysctl_numa_balancing_scan_size;
1870	pages <<= 20 - PAGE_SHIFT; /* MB in pages */	1870	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
1871	if (!pages)	1871	if (!pages)
1872	return;	1872	return;
1873		1873
1874	down_read(&mm->mmap_sem);	1874	down_read(&mm->mmap_sem);
1875	vma = find_vma(mm, start);	1875	vma = find_vma(mm, start);
1876	if (!vma) {	1876	if (!vma) {
1877	reset_ptenuma_scan(p);	1877	reset_ptenuma_scan(p);
1878	start = 0;	1878	start = 0;
1879	vma = mm->mmap;	1879	vma = mm->mmap;
1880	}	1880	}
1881	for (; vma; vma = vma->vm_next) {	1881	for (; vma; vma = vma->vm_next) {
1882	if (!vma_migratable(vma) || !vma_policy_mof(p, vma))	1882	if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
1883	continue;	1883	continue;
1884		1884
1885	/*	1885	/*
1886	* Shared library pages mapped by multiple processes are not	1886	* Shared library pages mapped by multiple processes are not
1887	* migrated as it is expected they are cache replicated. Avoid	1887	* migrated as it is expected they are cache replicated. Avoid
1888	* hinting faults in read-only file-backed mappings or the vdso	1888	* hinting faults in read-only file-backed mappings or the vdso
1889	* as migrating the pages will be of marginal benefit.	1889	* as migrating the pages will be of marginal benefit.
1890	*/	1890	*/
1891	if (!vma->vm_mm ||	1891	if (!vma->vm_mm ||
1892	(vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))	1892	(vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
1893	continue;	1893	continue;
1894		1894
1895	/*	1895	/*
1896	* Skip inaccessible VMAs to avoid any confusion between	1896	* Skip inaccessible VMAs to avoid any confusion between
1897	* PROT_NONE and NUMA hinting ptes	1897	* PROT_NONE and NUMA hinting ptes
1898	*/	1898	*/
1899	if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))	1899	if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
1900	continue;	1900	continue;
1901		1901
1902	do {	1902	do {
1903	start = max(start, vma->vm_start);	1903	start = max(start, vma->vm_start);
1904	end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);	1904	end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
1905	end = min(end, vma->vm_end);	1905	end = min(end, vma->vm_end);
1906	nr_pte_updates += change_prot_numa(vma, start, end);	1906	nr_pte_updates += change_prot_numa(vma, start, end);
1907		1907
1908	/*	1908	/*
1909	* Scan sysctl_numa_balancing_scan_size but ensure that	1909	* Scan sysctl_numa_balancing_scan_size but ensure that
1910	* at least one PTE is updated so that unused virtual	1910	* at least one PTE is updated so that unused virtual
1911	* address space is quickly skipped.	1911	* address space is quickly skipped.
1912	*/	1912	*/
1913	if (nr_pte_updates)	1913	if (nr_pte_updates)
1914	pages -= (end - start) >> PAGE_SHIFT;	1914	pages -= (end - start) >> PAGE_SHIFT;
1915		1915
1916	start = end;	1916	start = end;
1917	if (pages <= 0)	1917	if (pages <= 0)
1918	goto out;	1918	goto out;
1919		1919
1920	cond_resched();	1920	cond_resched();
1921	} while (end != vma->vm_end);	1921	} while (end != vma->vm_end);
1922	}	1922	}
1923		1923
1924	out:	1924	out:
1925	/*	1925	/*
1926	* It is possible to reach the end of the VMA list but the last few	1926	* It is possible to reach the end of the VMA list but the last few
1927	* VMAs are not guaranteed to be vma_migratable. If they are not, we	1927	* VMAs are not guaranteed to be vma_migratable. If they are not, we
1928	* would find the !migratable VMA on the next scan but not reset the	1928	* would find the !migratable VMA on the next scan but not reset the
1929	* scanner to the start so check it now.	1929	* scanner to the start so check it now.
1930	*/	1930	*/
1931	if (vma)	1931	if (vma)
1932	mm->numa_scan_offset = start;	1932	mm->numa_scan_offset = start;
1933	else	1933	else
1934	reset_ptenuma_scan(p);	1934	reset_ptenuma_scan(p);
1935	up_read(&mm->mmap_sem);	1935	up_read(&mm->mmap_sem);
1936	}	1936	}
1937		1937
1938	/*	1938	/*
1939	* Drive the periodic memory faults..	1939	* Drive the periodic memory faults..
1940	*/	1940	*/
1941	void task_tick_numa(struct rq *rq, struct task_struct *curr)	1941	void task_tick_numa(struct rq *rq, struct task_struct *curr)
1942	{	1942	{
1943	struct callback_head *work = &curr->numa_work;	1943	struct callback_head *work = &curr->numa_work;
1944	u64 period, now;	1944	u64 period, now;
1945		1945
1946	/*	1946	/*
1947	* We don't care about NUMA placement if we don't have memory.	1947	* We don't care about NUMA placement if we don't have memory.
1948	*/	1948	*/
1949	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)	1949	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
1950	return;	1950	return;
1951		1951
1952	/*	1952	/*
1953	* Using runtime rather than walltime has the dual advantage that	1953	* Using runtime rather than walltime has the dual advantage that
1954	* we (mostly) drive the selection from busy threads and that the	1954	* we (mostly) drive the selection from busy threads and that the
1955	* task needs to have done some actual work before we bother with	1955	* task needs to have done some actual work before we bother with
1956	* NUMA placement.	1956	* NUMA placement.
1957	*/	1957	*/
1958	now = curr->se.sum_exec_runtime;	1958	now = curr->se.sum_exec_runtime;
1959	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;	1959	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
1960		1960
1961	if (now - curr->node_stamp > period) {	1961	if (now - curr->node_stamp > period) {
1962	if (!curr->node_stamp)	1962	if (!curr->node_stamp)
1963	curr->numa_scan_period = task_scan_min(curr);	1963	curr->numa_scan_period = task_scan_min(curr);
1964	curr->node_stamp += period;	1964	curr->node_stamp += period;
1965		1965
1966	if (!time_before(jiffies, curr->mm->numa_next_scan)) {	1966	if (!time_before(jiffies, curr->mm->numa_next_scan)) {
1967	init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */	1967	init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
1968	task_work_add(curr, work, true);	1968	task_work_add(curr, work, true);
1969	}	1969	}
1970	}	1970	}
1971	}	1971	}
1972	#else	1972	#else
1973	static void task_tick_numa(struct rq *rq, struct task_struct *curr)	1973	static void task_tick_numa(struct rq *rq, struct task_struct *curr)
1974	{	1974	{
1975	}	1975	}
1976		1976
1977	static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)	1977	static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1978	{	1978	{
1979	}	1979	}
1980		1980
1981	static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)	1981	static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1982	{	1982	{
1983	}	1983	}
1984	#endif /* CONFIG_NUMA_BALANCING */	1984	#endif /* CONFIG_NUMA_BALANCING */
1985		1985
1986	static void	1986	static void
1987	account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)	1987	account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1988	{	1988	{
1989	update_load_add(&cfs_rq->load, se->load.weight);	1989	update_load_add(&cfs_rq->load, se->load.weight);
1990	if (!parent_entity(se))	1990	if (!parent_entity(se))
1991	update_load_add(&rq_of(cfs_rq)->load, se->load.weight);	1991	update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
1992	#ifdef CONFIG_SMP	1992	#ifdef CONFIG_SMP
1993	if (entity_is_task(se)) {	1993	if (entity_is_task(se)) {
1994	struct rq *rq = rq_of(cfs_rq);	1994	struct rq *rq = rq_of(cfs_rq);
1995		1995
1996	account_numa_enqueue(rq, task_of(se));	1996	account_numa_enqueue(rq, task_of(se));
1997	list_add(&se->group_node, &rq->cfs_tasks);	1997	list_add(&se->group_node, &rq->cfs_tasks);
1998	}	1998	}
1999	#endif	1999	#endif
2000	cfs_rq->nr_running++;	2000	cfs_rq->nr_running++;
2001	}	2001	}
2002		2002
2003	static void	2003	static void
2004	account_entity_dequeue(struct cfs_rq cfs_rq, struct sched_entity se)	2004	account_entity_dequeue(struct cfs_rq cfs_rq, struct sched_entity se)
2005	{	2005	{
2006	update_load_sub(&cfs_rq->load, se->load.weight);	2006	update_load_sub(&cfs_rq->load, se->load.weight);
2007	if (!parent_entity(se))	2007	if (!parent_entity(se))
2008	update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);	2008	update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
2009	if (entity_is_task(se)) {	2009	if (entity_is_task(se)) {
2010	account_numa_dequeue(rq_of(cfs_rq), task_of(se));	2010	account_numa_dequeue(rq_of(cfs_rq), task_of(se));
2011	list_del_init(&se->group_node);	2011	list_del_init(&se->group_node);
2012	}	2012	}
2013	cfs_rq->nr_running--;	2013	cfs_rq->nr_running--;
2014	}	2014	}
2015		2015
2016	#ifdef CONFIG_FAIR_GROUP_SCHED	2016	#ifdef CONFIG_FAIR_GROUP_SCHED
2017	# ifdef CONFIG_SMP	2017	# ifdef CONFIG_SMP
2018	static inline long calc_tg_weight(struct task_group tg, struct cfs_rq cfs_rq)	2018	static inline long calc_tg_weight(struct task_group tg, struct cfs_rq cfs_rq)
2019	{	2019	{
2020	long tg_weight;	2020	long tg_weight;
2021		2021
2022	/*	2022	/*
2023	* Use this CPU's actual weight instead of the last load_contribution	2023	* Use this CPU's actual weight instead of the last load_contribution
2024	* to gain a more accurate current total weight. See	2024	* to gain a more accurate current total weight. See
2025	* update_cfs_rq_load_contribution().	2025	* update_cfs_rq_load_contribution().
2026	*/	2026	*/
2027	tg_weight = atomic_long_read(&tg->load_avg);	2027	tg_weight = atomic_long_read(&tg->load_avg);
2028	tg_weight -= cfs_rq->tg_load_contrib;	2028	tg_weight -= cfs_rq->tg_load_contrib;
2029	tg_weight += cfs_rq->load.weight;	2029	tg_weight += cfs_rq->load.weight;
2030		2030
2031	return tg_weight;	2031	return tg_weight;
2032	}	2032	}
2033		2033
2034	static long calc_cfs_shares(struct cfs_rq cfs_rq, struct task_group tg)	2034	static long calc_cfs_shares(struct cfs_rq cfs_rq, struct task_group tg)
2035	{	2035	{
2036	long tg_weight, load, shares;	2036	long tg_weight, load, shares;
2037		2037
2038	tg_weight = calc_tg_weight(tg, cfs_rq);	2038	tg_weight = calc_tg_weight(tg, cfs_rq);
2039	load = cfs_rq->load.weight;	2039	load = cfs_rq->load.weight;
2040		2040
2041	shares = (tg->shares * load);	2041	shares = (tg->shares * load);
2042	if (tg_weight)	2042	if (tg_weight)
2043	shares /= tg_weight;	2043	shares /= tg_weight;
2044		2044
2045	if (shares < MIN_SHARES)	2045	if (shares < MIN_SHARES)
2046	shares = MIN_SHARES;	2046	shares = MIN_SHARES;
2047	if (shares > tg->shares)	2047	if (shares > tg->shares)
2048	shares = tg->shares;	2048	shares = tg->shares;
2049		2049
2050	return shares;	2050	return shares;
2051	}	2051	}
2052	# else /* CONFIG_SMP */	2052	# else /* CONFIG_SMP */
2053	static inline long calc_cfs_shares(struct cfs_rq cfs_rq, struct task_group tg)	2053	static inline long calc_cfs_shares(struct cfs_rq cfs_rq, struct task_group tg)
2054	{	2054	{
2055	return tg->shares;	2055	return tg->shares;
2056	}	2056	}
2057	# endif /* CONFIG_SMP */	2057	# endif /* CONFIG_SMP */
2058	static void reweight_entity(struct cfs_rq cfs_rq, struct sched_entity se,	2058	static void reweight_entity(struct cfs_rq cfs_rq, struct sched_entity se,
2059	unsigned long weight)	2059	unsigned long weight)
2060	{	2060	{
2061	if (se->on_rq) {	2061	if (se->on_rq) {
2062	/* commit outstanding execution time */	2062	/* commit outstanding execution time */
2063	if (cfs_rq->curr == se)	2063	if (cfs_rq->curr == se)
2064	update_curr(cfs_rq);	2064	update_curr(cfs_rq);
2065	account_entity_dequeue(cfs_rq, se);	2065	account_entity_dequeue(cfs_rq, se);
2066	}	2066	}
2067		2067
2068	update_load_set(&se->load, weight);	2068	update_load_set(&se->load, weight);
2069		2069
2070	if (se->on_rq)	2070	if (se->on_rq)
2071	account_entity_enqueue(cfs_rq, se);	2071	account_entity_enqueue(cfs_rq, se);
2072	}	2072	}
2073		2073
2074	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);	2074	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2075		2075
2076	static void update_cfs_shares(struct cfs_rq *cfs_rq)	2076	static void update_cfs_shares(struct cfs_rq *cfs_rq)
2077	{	2077	{
2078	struct task_group *tg;	2078	struct task_group *tg;
2079	struct sched_entity *se;	2079	struct sched_entity *se;
2080	long shares;	2080	long shares;
2081		2081
2082	tg = cfs_rq->tg;	2082	tg = cfs_rq->tg;
2083	se = tg->se[cpu_of(rq_of(cfs_rq))];	2083	se = tg->se[cpu_of(rq_of(cfs_rq))];
2084	if (!se \|\| throttled_hierarchy(cfs_rq))	2084	if (!se \|\| throttled_hierarchy(cfs_rq))
2085	return;	2085	return;
2086	#ifndef CONFIG_SMP	2086	#ifndef CONFIG_SMP
2087	if (likely(se->load.weight == tg->shares))	2087	if (likely(se->load.weight == tg->shares))
2088	return;	2088	return;
2089	#endif	2089	#endif
2090	shares = calc_cfs_shares(cfs_rq, tg);	2090	shares = calc_cfs_shares(cfs_rq, tg);
2091		2091
2092	reweight_entity(cfs_rq_of(se), se, shares);	2092	reweight_entity(cfs_rq_of(se), se, shares);
2093	}	2093	}
2094	#else /* CONFIG_FAIR_GROUP_SCHED */	2094	#else /* CONFIG_FAIR_GROUP_SCHED */
/* Stub for builds without CONFIG_FAIR_GROUP_SCHED: no group shares to update. */
static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
{
}
2098	#endif /* CONFIG_FAIR_GROUP_SCHED */	2098	#endif /* CONFIG_FAIR_GROUP_SCHED */
2099		2099
2100	#ifdef CONFIG_SMP	2100	#ifdef CONFIG_SMP
2101	/*	2101	/*
2102	* We choose a half-life close to 1 scheduling period.	2102	* We choose a half-life close to 1 scheduling period.
2103	* Note: The tables below are dependent on this value.	2103	* Note: The tables below are dependent on this value.
2104	*/	2104	*/
2105	#define LOAD_AVG_PERIOD 32	2105	#define LOAD_AVG_PERIOD 32
2106	#define LOAD_AVG_MAX 47742 /* maximum possible load avg */	2106	#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
2107	#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */	2107	#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
2108		2108
2109	/* Precomputed fixed inverse multiplies for multiplication by y^n */	2109	/* Precomputed fixed inverse multiplies for multiplication by y^n */
2110	static const u32 runnable_avg_yN_inv[] = {	2110	static const u32 runnable_avg_yN_inv[] = {
2111	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,	2111	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
2112	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,	2112	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
2113	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,	2113	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
2114	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,	2114	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
2115	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,	2115	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
2116	0x85aac367, 0x82cd8698,	2116	0x85aac367, 0x82cd8698,
2117	};	2117	};
2118		2118
2119	/*	2119	/*
2120	* Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent	2120	* Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
2121	* over-estimates when re-combining.	2121	* over-estimates when re-combining.
2122	*/	2122	*/
2123	static const u32 runnable_avg_yN_sum[] = {	2123	static const u32 runnable_avg_yN_sum[] = {
2124	0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,	2124	0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
2125	9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,	2125	9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
2126	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,	2126	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
2127	};	2127	};
2128		2128
2129	/*	2129	/*
2130	* Approximate:	2130	* Approximate:
2131	* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)	2131	* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
2132	*/	2132	*/
2133	static __always_inline u64 decay_load(u64 val, u64 n)	2133	static __always_inline u64 decay_load(u64 val, u64 n)
2134	{	2134	{
2135	unsigned int local_n;	2135	unsigned int local_n;
2136		2136
2137	if (!n)	2137	if (!n)
2138	return val;	2138	return val;
2139	else if (unlikely(n > LOAD_AVG_PERIOD * 63))	2139	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
2140	return 0;	2140	return 0;
2141		2141
2142	/* after bounds checking we can collapse to 32-bit */	2142	/* after bounds checking we can collapse to 32-bit */
2143	local_n = n;	2143	local_n = n;
2144		2144
2145	/*	2145	/*
2146	* As y^PERIOD = 1/2, we can combine	2146	* As y^PERIOD = 1/2, we can combine
2147	* y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)	2147	* y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
2148	* With a look-up table which covers k^n (n<PERIOD)	2148	* With a look-up table which covers k^n (n<PERIOD)
2149	*	2149	*
2150	* To achieve constant time decay_load.	2150	* To achieve constant time decay_load.
2151	*/	2151	*/
2152	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {	2152	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
2153	val >>= local_n / LOAD_AVG_PERIOD;	2153	val >>= local_n / LOAD_AVG_PERIOD;
2154	local_n %= LOAD_AVG_PERIOD;	2154	local_n %= LOAD_AVG_PERIOD;
2155	}	2155	}
2156		2156
2157	val *= runnable_avg_yN_inv[local_n];	2157	val *= runnable_avg_yN_inv[local_n];
2158	/* We don't use SRR here since we always want to round down. */	2158	/* We don't use SRR here since we always want to round down. */
2159	return val >> 32;	2159	return val >> 32;
2160	}	2160	}
2161		2161
2162	/*	2162	/*
2163	* For updates fully spanning n periods, the contribution to runnable	2163	* For updates fully spanning n periods, the contribution to runnable
2164	* average will be: \Sum 1024*y^n	2164	* average will be: \Sum 1024*y^n
2165	*	2165	*
2166	* We can compute this reasonably efficiently by combining:	2166	* We can compute this reasonably efficiently by combining:
2167	* y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}	2167	* y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
2168	*/	2168	*/
2169	static u32 __compute_runnable_contrib(u64 n)	2169	static u32 __compute_runnable_contrib(u64 n)
2170	{	2170	{
2171	u32 contrib = 0;	2171	u32 contrib = 0;
2172		2172
2173	if (likely(n <= LOAD_AVG_PERIOD))	2173	if (likely(n <= LOAD_AVG_PERIOD))
2174	return runnable_avg_yN_sum[n];	2174	return runnable_avg_yN_sum[n];
2175	else if (unlikely(n >= LOAD_AVG_MAX_N))	2175	else if (unlikely(n >= LOAD_AVG_MAX_N))
2176	return LOAD_AVG_MAX;	2176	return LOAD_AVG_MAX;
2177		2177
2178	/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */	2178	/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
2179	do {	2179	do {
2180	contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */	2180	contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
2181	contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];	2181	contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
2182		2182
2183	n -= LOAD_AVG_PERIOD;	2183	n -= LOAD_AVG_PERIOD;
2184	} while (n > LOAD_AVG_PERIOD);	2184	} while (n > LOAD_AVG_PERIOD);
2185		2185
2186	contrib = decay_load(contrib, n);	2186	contrib = decay_load(contrib, n);
2187	return contrib + runnable_avg_yN_sum[n];	2187	return contrib + runnable_avg_yN_sum[n];
2188	}	2188	}
2189		2189
2190	/*	2190	/*
2191	* We can represent the historical contribution to runnable average as the	2191	* We can represent the historical contribution to runnable average as the
2192	* coefficients of a geometric series. To do this we sub-divide our runnable	2192	* coefficients of a geometric series. To do this we sub-divide our runnable
2193	* history into segments of approximately 1ms (1024us); label the segment that	2193	* history into segments of approximately 1ms (1024us); label the segment that
2194	* occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.	2194	* occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
2195	*	2195	*
2196	* [<- 1024us ->\|<- 1024us ->\|<- 1024us ->\| ...	2196	* [<- 1024us ->\|<- 1024us ->\|<- 1024us ->\| ...
2197	* p0 p1 p2	2197	* p0 p1 p2
2198	* (now) (~1ms ago) (~2ms ago)	2198	* (now) (~1ms ago) (~2ms ago)
2199	*	2199	*
2200	* Let u_i denote the fraction of p_i that the entity was runnable.	2200	* Let u_i denote the fraction of p_i that the entity was runnable.
2201	*	2201	*
2202	* We then designate the fractions u_i as our co-efficients, yielding the	2202	* We then designate the fractions u_i as our co-efficients, yielding the
2203	* following representation of historical load:	2203	* following representation of historical load:
2204	* u_0 + u_1y + u_2y^2 + u_3*y^3 + ...	2204	* u_0 + u_1y + u_2y^2 + u_3*y^3 + ...
2205	*	2205	*
2206	* We choose y based on the with of a reasonably scheduling period, fixing:	2206	* We choose y based on the with of a reasonably scheduling period, fixing:
2207	* y^32 = 0.5	2207	* y^32 = 0.5
2208	*	2208	*
2209	* This means that the contribution to load ~32ms ago (u_32) will be weighted	2209	* This means that the contribution to load ~32ms ago (u_32) will be weighted
2210	* approximately half as much as the contribution to load within the last ms	2210	* approximately half as much as the contribution to load within the last ms
2211	* (u_0).	2211	* (u_0).
2212	*	2212	*
2213	* When a period "rolls over" and we have new u_0`, multiplying the previous	2213	* When a period "rolls over" and we have new u_0`, multiplying the previous
2214	* sum again by y is sufficient to update:	2214	* sum again by y is sufficient to update:
2215	* load_avg = u_0` + y(u_0 + u_1y + u_2*y^2 + ... )	2215	* load_avg = u_0` + y(u_0 + u_1y + u_2*y^2 + ... )
2216	* = u_0 + u_1y + u_2y^2 + ... [re-labeling u_i --> u_{i+1}]	2216	* = u_0 + u_1y + u_2y^2 + ... [re-labeling u_i --> u_{i+1}]
2217	*/	2217	*/
2218	static __always_inline int __update_entity_runnable_avg(u64 now,	2218	static __always_inline int __update_entity_runnable_avg(u64 now,
2219	struct sched_avg *sa,	2219	struct sched_avg *sa,
2220	int runnable)	2220	int runnable)
2221	{	2221	{
2222	u64 delta, periods;	2222	u64 delta, periods;
2223	u32 runnable_contrib;	2223	u32 runnable_contrib;
2224	int delta_w, decayed = 0;	2224	int delta_w, decayed = 0;
2225		2225
2226	delta = now - sa->last_runnable_update;	2226	delta = now - sa->last_runnable_update;
2227	/*	2227	/*
2228	* This should only happen when time goes backwards, which it	2228	* This should only happen when time goes backwards, which it
2229	* unfortunately does during sched clock init when we swap over to TSC.	2229	* unfortunately does during sched clock init when we swap over to TSC.
2230	*/	2230	*/
2231	if ((s64)delta < 0) {	2231	if ((s64)delta < 0) {
2232	sa->last_runnable_update = now;	2232	sa->last_runnable_update = now;
2233	return 0;	2233	return 0;
2234	}	2234	}
2235		2235
2236	/*	2236	/*
2237	* Use 1024ns as the unit of measurement since it's a reasonable	2237	* Use 1024ns as the unit of measurement since it's a reasonable
2238	* approximation of 1us and fast to compute.	2238	* approximation of 1us and fast to compute.
2239	*/	2239	*/
2240	delta >>= 10;	2240	delta >>= 10;
2241	if (!delta)	2241	if (!delta)
2242	return 0;	2242	return 0;
2243	sa->last_runnable_update = now;	2243	sa->last_runnable_update = now;
2244		2244
2245	/* delta_w is the amount already accumulated against our next period */	2245	/* delta_w is the amount already accumulated against our next period */
2246	delta_w = sa->runnable_avg_period % 1024;	2246	delta_w = sa->runnable_avg_period % 1024;
2247	if (delta + delta_w >= 1024) {	2247	if (delta + delta_w >= 1024) {
2248	/* period roll-over */	2248	/* period roll-over */
2249	decayed = 1;	2249	decayed = 1;
2250		2250
2251	/*	2251	/*
2252	* Now that we know we're crossing a period boundary, figure	2252	* Now that we know we're crossing a period boundary, figure
2253	* out how much from delta we need to complete the current	2253	* out how much from delta we need to complete the current
2254	* period and accrue it.	2254	* period and accrue it.
2255	*/	2255	*/
2256	delta_w = 1024 - delta_w;	2256	delta_w = 1024 - delta_w;
2257	if (runnable)	2257	if (runnable)
2258	sa->runnable_avg_sum += delta_w;	2258	sa->runnable_avg_sum += delta_w;
2259	sa->runnable_avg_period += delta_w;	2259	sa->runnable_avg_period += delta_w;
2260		2260
2261	delta -= delta_w;	2261	delta -= delta_w;
2262		2262
2263	/* Figure out how many additional periods this update spans */	2263	/* Figure out how many additional periods this update spans */
2264	periods = delta / 1024;	2264	periods = delta / 1024;
2265	delta %= 1024;	2265	delta %= 1024;
2266		2266
2267	sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,	2267	sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
2268	periods + 1);	2268	periods + 1);
2269	sa->runnable_avg_period = decay_load(sa->runnable_avg_period,	2269	sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
2270	periods + 1);	2270	periods + 1);
2271		2271
2272	/* Efficiently calculate \sum (1..n_period) 1024y^i /	2272	/* Efficiently calculate \sum (1..n_period) 1024y^i /
2273	runnable_contrib = __compute_runnable_contrib(periods);	2273	runnable_contrib = __compute_runnable_contrib(periods);
2274	if (runnable)	2274	if (runnable)
2275	sa->runnable_avg_sum += runnable_contrib;	2275	sa->runnable_avg_sum += runnable_contrib;
2276	sa->runnable_avg_period += runnable_contrib;	2276	sa->runnable_avg_period += runnable_contrib;
2277	}	2277	}
2278		2278
2279	/* Remainder of delta accrued against u_0` */	2279	/* Remainder of delta accrued against u_0` */
2280	if (runnable)	2280	if (runnable)
2281	sa->runnable_avg_sum += delta;	2281	sa->runnable_avg_sum += delta;
2282	sa->runnable_avg_period += delta;	2282	sa->runnable_avg_period += delta;
2283		2283
2284	return decayed;	2284	return decayed;
2285	}	2285	}
2286		2286
2287	/* Synchronize an entity's decay with its parenting cfs_rq.*/	2287	/* Synchronize an entity's decay with its parenting cfs_rq.*/
2288	static inline u64 __synchronize_entity_decay(struct sched_entity *se)	2288	static inline u64 __synchronize_entity_decay(struct sched_entity *se)
2289	{	2289	{
2290	struct cfs_rq *cfs_rq = cfs_rq_of(se);	2290	struct cfs_rq *cfs_rq = cfs_rq_of(se);
2291	u64 decays = atomic64_read(&cfs_rq->decay_counter);	2291	u64 decays = atomic64_read(&cfs_rq->decay_counter);
2292		2292
2293	decays -= se->avg.decay_count;	2293	decays -= se->avg.decay_count;
2294	if (!decays)	2294	if (!decays)
2295	return 0;	2295	return 0;
2296		2296
2297	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);	2297	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
2298	se->avg.decay_count = 0;	2298	se->avg.decay_count = 0;
2299		2299
2300	return decays;	2300	return decays;
2301	}	2301	}
2302		2302
2303	#ifdef CONFIG_FAIR_GROUP_SCHED	2303	#ifdef CONFIG_FAIR_GROUP_SCHED
2304	static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,	2304	static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
2305	int force_update)	2305	int force_update)
2306	{	2306	{
2307	struct task_group *tg = cfs_rq->tg;	2307	struct task_group *tg = cfs_rq->tg;
2308	long tg_contrib;	2308	long tg_contrib;
2309		2309
2310	tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;	2310	tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
2311	tg_contrib -= cfs_rq->tg_load_contrib;	2311	tg_contrib -= cfs_rq->tg_load_contrib;
2312		2312
2313	if (force_update \|\| abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {	2313	if (force_update \|\| abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
2314	atomic_long_add(tg_contrib, &tg->load_avg);	2314	atomic_long_add(tg_contrib, &tg->load_avg);
2315	cfs_rq->tg_load_contrib += tg_contrib;	2315	cfs_rq->tg_load_contrib += tg_contrib;
2316	}	2316	}
2317	}	2317	}
2318		2318
2319	/*	2319	/*
2320	* Aggregate cfs_rq runnable averages into an equivalent task_group	2320	* Aggregate cfs_rq runnable averages into an equivalent task_group
2321	* representation for computing load contributions.	2321	* representation for computing load contributions.
2322	*/	2322	*/
2323	static inline void __update_tg_runnable_avg(struct sched_avg *sa,	2323	static inline void __update_tg_runnable_avg(struct sched_avg *sa,
2324	struct cfs_rq *cfs_rq)	2324	struct cfs_rq *cfs_rq)
2325	{	2325	{
2326	struct task_group *tg = cfs_rq->tg;	2326	struct task_group *tg = cfs_rq->tg;
2327	long contrib;	2327	long contrib;
2328		2328
2329	/* The fraction of a cpu used by this cfs_rq */	2329	/* The fraction of a cpu used by this cfs_rq */
2330	contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,	2330	contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
2331	sa->runnable_avg_period + 1);	2331	sa->runnable_avg_period + 1);
2332	contrib -= cfs_rq->tg_runnable_contrib;	2332	contrib -= cfs_rq->tg_runnable_contrib;
2333		2333
2334	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {	2334	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
2335	atomic_add(contrib, &tg->runnable_avg);	2335	atomic_add(contrib, &tg->runnable_avg);
2336	cfs_rq->tg_runnable_contrib += contrib;	2336	cfs_rq->tg_runnable_contrib += contrib;
2337	}	2337	}
2338	}	2338	}
2339		2339
2340	static inline void __update_group_entity_contrib(struct sched_entity *se)	2340	static inline void __update_group_entity_contrib(struct sched_entity *se)
2341	{	2341	{
2342	struct cfs_rq *cfs_rq = group_cfs_rq(se);	2342	struct cfs_rq *cfs_rq = group_cfs_rq(se);
2343	struct task_group *tg = cfs_rq->tg;	2343	struct task_group *tg = cfs_rq->tg;
2344	int runnable_avg;	2344	int runnable_avg;
2345		2345
2346	u64 contrib;	2346	u64 contrib;
2347		2347
2348	contrib = cfs_rq->tg_load_contrib * tg->shares;	2348	contrib = cfs_rq->tg_load_contrib * tg->shares;
2349	se->avg.load_avg_contrib = div_u64(contrib,	2349	se->avg.load_avg_contrib = div_u64(contrib,
2350	atomic_long_read(&tg->load_avg) + 1);	2350	atomic_long_read(&tg->load_avg) + 1);
2351		2351
2352	/*	2352	/*
2353	* For group entities we need to compute a correction term in the case	2353	* For group entities we need to compute a correction term in the case
2354	* that they are consuming <1 cpu so that we would contribute the same	2354	* that they are consuming <1 cpu so that we would contribute the same
2355	* load as a task of equal weight.	2355	* load as a task of equal weight.
2356	*	2356	*
2357	* Explicitly co-ordinating this measurement would be expensive, but	2357	* Explicitly co-ordinating this measurement would be expensive, but
2358	* fortunately the sum of each cpus contribution forms a usable	2358	* fortunately the sum of each cpus contribution forms a usable
2359	* lower-bound on the true value.	2359	* lower-bound on the true value.
2360	*	2360	*
2361	* Consider the aggregate of 2 contributions. Either they are disjoint	2361	* Consider the aggregate of 2 contributions. Either they are disjoint
2362	* (and the sum represents true value) or they are disjoint and we are	2362	* (and the sum represents true value) or they are disjoint and we are
2363	* understating by the aggregate of their overlap.	2363	* understating by the aggregate of their overlap.
2364	*	2364	*
2365	* Extending this to N cpus, for a given overlap, the maximum amount we	2365	* Extending this to N cpus, for a given overlap, the maximum amount we
2366	* understand is then n_i(n_i+1)/2 * w_i where n_i is the number of	2366	* understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
2367	* cpus that overlap for this interval and w_i is the interval width.	2367	* cpus that overlap for this interval and w_i is the interval width.
2368	*	2368	*
2369	* On a small machine; the first term is well-bounded which bounds the	2369	* On a small machine; the first term is well-bounded which bounds the
2370	* total error since w_i is a subset of the period. Whereas on a	2370	* total error since w_i is a subset of the period. Whereas on a
2371	* larger machine, while this first term can be larger, if w_i is the	2371	* larger machine, while this first term can be larger, if w_i is the
2372	* of consequential size guaranteed to see n_i*w_i quickly converge to	2372	* of consequential size guaranteed to see n_i*w_i quickly converge to
2373	* our upper bound of 1-cpu.	2373	* our upper bound of 1-cpu.
2374	*/	2374	*/
2375	runnable_avg = atomic_read(&tg->runnable_avg);	2375	runnable_avg = atomic_read(&tg->runnable_avg);
2376	if (runnable_avg < NICE_0_LOAD) {	2376	if (runnable_avg < NICE_0_LOAD) {
2377	se->avg.load_avg_contrib *= runnable_avg;	2377	se->avg.load_avg_contrib *= runnable_avg;
2378	se->avg.load_avg_contrib >>= NICE_0_SHIFT;	2378	se->avg.load_avg_contrib >>= NICE_0_SHIFT;
2379	}	2379	}
2380	}	2380	}
2381		2381
2382	static inline void update_rq_runnable_avg(struct rq *rq, int runnable)	2382	static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
2383	{	2383	{
2384	__update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);	2384	__update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
2385	__update_tg_runnable_avg(&rq->avg, &rq->cfs);	2385	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
2386	}	2386	}
2387	#else /* CONFIG_FAIR_GROUP_SCHED */	2387	#else /* CONFIG_FAIR_GROUP_SCHED */
/* Stubs for builds without CONFIG_FAIR_GROUP_SCHED: no task-group state to maintain. */
static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
						   int force_update) {}
static inline void __update_tg_runnable_avg(struct sched_avg *sa,
					    struct cfs_rq *cfs_rq) {}
static inline void __update_group_entity_contrib(struct sched_entity *se) {}
static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
2394	#endif /* CONFIG_FAIR_GROUP_SCHED */	2394	#endif /* CONFIG_FAIR_GROUP_SCHED */
2395		2395
2396	static inline void __update_task_entity_contrib(struct sched_entity *se)	2396	static inline void __update_task_entity_contrib(struct sched_entity *se)
2397	{	2397	{
2398	u32 contrib;	2398	u32 contrib;
2399		2399
2400	/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */	2400	/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
2401	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);	2401	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
2402	contrib /= (se->avg.runnable_avg_period + 1);	2402	contrib /= (se->avg.runnable_avg_period + 1);
2403	se->avg.load_avg_contrib = scale_load(contrib);	2403	se->avg.load_avg_contrib = scale_load(contrib);
2404	}	2404	}
2405		2405
2406	/* Compute the current contribution to load_avg by se, return any delta */	2406	/* Compute the current contribution to load_avg by se, return any delta */
2407	static long __update_entity_load_avg_contrib(struct sched_entity *se)	2407	static long __update_entity_load_avg_contrib(struct sched_entity *se)
2408	{	2408	{
2409	long old_contrib = se->avg.load_avg_contrib;	2409	long old_contrib = se->avg.load_avg_contrib;
2410		2410
2411	if (entity_is_task(se)) {	2411	if (entity_is_task(se)) {
2412	__update_task_entity_contrib(se);	2412	__update_task_entity_contrib(se);
2413	} else {	2413	} else {
2414	__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));	2414	__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
2415	__update_group_entity_contrib(se);	2415	__update_group_entity_contrib(se);
2416	}	2416	}
2417		2417
2418	return se->avg.load_avg_contrib - old_contrib;	2418	return se->avg.load_avg_contrib - old_contrib;
2419	}	2419	}
2420		2420
2421	static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,	2421	static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
2422	long load_contrib)	2422	long load_contrib)
2423	{	2423	{
2424	if (likely(load_contrib < cfs_rq->blocked_load_avg))	2424	if (likely(load_contrib < cfs_rq->blocked_load_avg))
2425	cfs_rq->blocked_load_avg -= load_contrib;	2425	cfs_rq->blocked_load_avg -= load_contrib;
2426	else	2426	else
2427	cfs_rq->blocked_load_avg = 0;	2427	cfs_rq->blocked_load_avg = 0;
2428	}	2428	}
2429		2429
2430	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);	2430	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
2431		2431
2432	/* Update a sched_entity's runnable average */	2432	/* Update a sched_entity's runnable average */
2433	static inline void update_entity_load_avg(struct sched_entity *se,	2433	static inline void update_entity_load_avg(struct sched_entity *se,
2434	int update_cfs_rq)	2434	int update_cfs_rq)
2435	{	2435	{
2436	struct cfs_rq *cfs_rq = cfs_rq_of(se);	2436	struct cfs_rq *cfs_rq = cfs_rq_of(se);
2437	long contrib_delta;	2437	long contrib_delta;
2438	u64 now;	2438	u64 now;
2439		2439
2440	/*	2440	/*
2441	* For a group entity we need to use their owned cfs_rq_clock_task() in	2441	* For a group entity we need to use their owned cfs_rq_clock_task() in
2442	* case they are the parent of a throttled hierarchy.	2442	* case they are the parent of a throttled hierarchy.
2443	*/	2443	*/
2444	if (entity_is_task(se))	2444	if (entity_is_task(se))
2445	now = cfs_rq_clock_task(cfs_rq);	2445	now = cfs_rq_clock_task(cfs_rq);
2446	else	2446	else
2447	now = cfs_rq_clock_task(group_cfs_rq(se));	2447	now = cfs_rq_clock_task(group_cfs_rq(se));
2448		2448
2449	if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))	2449	if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
2450	return;	2450	return;
2451		2451
2452	contrib_delta = __update_entity_load_avg_contrib(se);	2452	contrib_delta = __update_entity_load_avg_contrib(se);
2453		2453
2454	if (!update_cfs_rq)	2454	if (!update_cfs_rq)
2455	return;	2455	return;
2456		2456
2457	if (se->on_rq)	2457	if (se->on_rq)
2458	cfs_rq->runnable_load_avg += contrib_delta;	2458	cfs_rq->runnable_load_avg += contrib_delta;
2459	else	2459	else
2460	subtract_blocked_load_contrib(cfs_rq, -contrib_delta);	2460	subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
2461	}	2461	}
2462		2462
2463	/*	2463	/*
2464	* Decay the load contributed by all blocked children and account this so that	2464	* Decay the load contributed by all blocked children and account this so that
2465	* their contribution may appropriately discounted when they wake up.	2465	* their contribution may appropriately discounted when they wake up.
2466	*/	2466	*/
2467	static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)	2467	static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
2468	{	2468	{
2469	u64 now = cfs_rq_clock_task(cfs_rq) >> 20;	2469	u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
2470	u64 decays;	2470	u64 decays;
2471		2471
2472	decays = now - cfs_rq->last_decay;	2472	decays = now - cfs_rq->last_decay;
2473	if (!decays && !force_update)	2473	if (!decays && !force_update)
2474	return;	2474	return;
2475		2475
2476	if (atomic_long_read(&cfs_rq->removed_load)) {	2476	if (atomic_long_read(&cfs_rq->removed_load)) {
2477	unsigned long removed_load;	2477	unsigned long removed_load;
2478	removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);	2478	removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
2479	subtract_blocked_load_contrib(cfs_rq, removed_load);	2479	subtract_blocked_load_contrib(cfs_rq, removed_load);
2480	}	2480	}
2481		2481
2482	if (decays) {	2482	if (decays) {
2483	cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,	2483	cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
2484	decays);	2484	decays);
2485	atomic64_add(decays, &cfs_rq->decay_counter);	2485	atomic64_add(decays, &cfs_rq->decay_counter);
2486	cfs_rq->last_decay = now;	2486	cfs_rq->last_decay = now;
2487	}	2487	}
2488		2488
2489	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);	2489	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
2490	}	2490	}
2491		2491
2492	/* Add the load generated by se into cfs_rq's child load-average */	2492	/* Add the load generated by se into cfs_rq's child load-average */
2493	static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,	2493	static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
2494	struct sched_entity *se,	2494	struct sched_entity *se,
2495	int wakeup)	2495	int wakeup)
2496	{	2496	{
2497	/*	2497	/*
2498	* We track migrations using entity decay_count <= 0, on a wake-up	2498	* We track migrations using entity decay_count <= 0, on a wake-up
2499	* migration we use a negative decay count to track the remote decays	2499	* migration we use a negative decay count to track the remote decays
2500	* accumulated while sleeping.	2500	* accumulated while sleeping.
2501	*	2501	*
2502	* Newly forked tasks are enqueued with se->avg.decay_count == 0, they	2502	* Newly forked tasks are enqueued with se->avg.decay_count == 0, they
2503	* are seen by enqueue_entity_load_avg() as a migration with an already	2503	* are seen by enqueue_entity_load_avg() as a migration with an already
2504	* constructed load_avg_contrib.	2504	* constructed load_avg_contrib.
2505	*/	2505	*/
2506	if (unlikely(se->avg.decay_count <= 0)) {	2506	if (unlikely(se->avg.decay_count <= 0)) {
2507	se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));	2507	se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
2508	if (se->avg.decay_count) {	2508	if (se->avg.decay_count) {
2509	/*	2509	/*
2510	* In a wake-up migration we have to approximate the	2510	* In a wake-up migration we have to approximate the
2511	* time sleeping. This is because we can't synchronize	2511	* time sleeping. This is because we can't synchronize
2512	* clock_task between the two cpus, and it is not	2512	* clock_task between the two cpus, and it is not
2513	* guaranteed to be read-safe. Instead, we can	2513	* guaranteed to be read-safe. Instead, we can
2514	* approximate this using our carried decays, which are	2514	* approximate this using our carried decays, which are
2515	* explicitly atomically readable.	2515	* explicitly atomically readable.
2516	*/	2516	*/
2517	se->avg.last_runnable_update -= (-se->avg.decay_count)	2517	se->avg.last_runnable_update -= (-se->avg.decay_count)
2518	<< 20;	2518	<< 20;
2519	update_entity_load_avg(se, 0);	2519	update_entity_load_avg(se, 0);
2520	/* Indicate that we're now synchronized and on-rq */	2520	/* Indicate that we're now synchronized and on-rq */
2521	se->avg.decay_count = 0;	2521	se->avg.decay_count = 0;
2522	}	2522	}
2523	wakeup = 0;	2523	wakeup = 0;
2524	} else {	2524	} else {
2525	__synchronize_entity_decay(se);	2525	__synchronize_entity_decay(se);
2526	}	2526	}
2527		2527
2528	/* migrated tasks did not contribute to our blocked load */	2528	/* migrated tasks did not contribute to our blocked load */
2529	if (wakeup) {	2529	if (wakeup) {
2530	subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);	2530	subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
2531	update_entity_load_avg(se, 0);	2531	update_entity_load_avg(se, 0);
2532	}	2532	}
2533		2533
2534	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;	2534	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
2535	/* we force update consideration on load-balancer moves */	2535	/* we force update consideration on load-balancer moves */
2536	update_cfs_rq_blocked_load(cfs_rq, !wakeup);	2536	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
2537	}	2537	}
2538		2538
2539	/*	2539	/*
2540	* Remove se's load from this cfs_rq child load-average, if the entity is	2540	* Remove se's load from this cfs_rq child load-average, if the entity is
2541	* transitioning to a blocked state we track its projected decay using	2541	* transitioning to a blocked state we track its projected decay using
2542	* blocked_load_avg.	2542	* blocked_load_avg.
2543	*/	2543	*/
2544	static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,	2544	static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2545	struct sched_entity *se,	2545	struct sched_entity *se,
2546	int sleep)	2546	int sleep)
2547	{	2547	{
2548	update_entity_load_avg(se, 1);	2548	update_entity_load_avg(se, 1);
2549	/* we force update consideration on load-balancer moves */	2549	/* we force update consideration on load-balancer moves */
2550	update_cfs_rq_blocked_load(cfs_rq, !sleep);	2550	update_cfs_rq_blocked_load(cfs_rq, !sleep);
2551		2551
2552	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;	2552	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
2553	if (sleep) {	2553	if (sleep) {
2554	cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;	2554	cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
2555	se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);	2555	se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
2556	} /* migrations, e.g. sleep=0 leave decay_count == 0 */	2556	} /* migrations, e.g. sleep=0 leave decay_count == 0 */
2557	}	2557	}
2558		2558
2559	/*	2559	/*
2560	* Update the rq's load with the elapsed running time before entering	2560	* Update the rq's load with the elapsed running time before entering
2561	* idle. if the last scheduled task is not a CFS task, idle_enter will	2561	* idle. if the last scheduled task is not a CFS task, idle_enter will
2562	* be the only way to update the runnable statistic.	2562	* be the only way to update the runnable statistic.
2563	*/	2563	*/
2564	void idle_enter_fair(struct rq *this_rq)	2564	void idle_enter_fair(struct rq *this_rq)
2565	{	2565	{
2566	update_rq_runnable_avg(this_rq, 1);	2566	update_rq_runnable_avg(this_rq, 1);
2567	}	2567	}
2568		2568
2569	/*	2569	/*
2570	* Update the rq's load with the elapsed idle time before a task is	2570	* Update the rq's load with the elapsed idle time before a task is
2571	* scheduled. if the newly scheduled task is not a CFS task, idle_exit will	2571	* scheduled. if the newly scheduled task is not a CFS task, idle_exit will
2572	* be the only way to update the runnable statistic.	2572	* be the only way to update the runnable statistic.
2573	*/	2573	*/
2574	void idle_exit_fair(struct rq *this_rq)	2574	void idle_exit_fair(struct rq *this_rq)
2575	{	2575	{
2576	update_rq_runnable_avg(this_rq, 0);	2576	update_rq_runnable_avg(this_rq, 0);
2577	}	2577	}
2578		2578
2579	static int idle_balance(struct rq *this_rq);	2579	static int idle_balance(struct rq *this_rq);
2580		2580
2581	#else /* CONFIG_SMP */	2581	#else /* CONFIG_SMP */
2582		2582
2583	static inline void update_entity_load_avg(struct sched_entity *se,	2583	static inline void update_entity_load_avg(struct sched_entity *se,
2584	int update_cfs_rq) {}	2584	int update_cfs_rq) {}
2585	static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}	2585	static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
2586	static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,	2586	static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
2587	struct sched_entity *se,	2587	struct sched_entity *se,
2588	int wakeup) {}	2588	int wakeup) {}
2589	static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,	2589	static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2590	struct sched_entity *se,	2590	struct sched_entity *se,
2591	int sleep) {}	2591	int sleep) {}
2592	static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,	2592	static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
2593	int force_update) {}	2593	int force_update) {}
2594		2594
2595	static inline int idle_balance(struct rq *rq)	2595	static inline int idle_balance(struct rq *rq)
2596	{	2596	{
2597	return 0;	2597	return 0;
2598	}	2598	}
2599		2599
2600	#endif /* CONFIG_SMP */	2600	#endif /* CONFIG_SMP */
2601		2601
2602	static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)	2602	static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
2603	{	2603	{
2604	#ifdef CONFIG_SCHEDSTATS	2604	#ifdef CONFIG_SCHEDSTATS
2605	struct task_struct *tsk = NULL;	2605	struct task_struct *tsk = NULL;
2606		2606
2607	if (entity_is_task(se))	2607	if (entity_is_task(se))
2608	tsk = task_of(se);	2608	tsk = task_of(se);
2609		2609
2610	if (se->statistics.sleep_start) {	2610	if (se->statistics.sleep_start) {
2611	u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;	2611	u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
2612		2612
2613	if ((s64)delta < 0)	2613	if ((s64)delta < 0)
2614	delta = 0;	2614	delta = 0;
2615		2615
2616	if (unlikely(delta > se->statistics.sleep_max))	2616	if (unlikely(delta > se->statistics.sleep_max))
2617	se->statistics.sleep_max = delta;	2617	se->statistics.sleep_max = delta;
2618		2618
2619	se->statistics.sleep_start = 0;	2619	se->statistics.sleep_start = 0;
2620	se->statistics.sum_sleep_runtime += delta;	2620	se->statistics.sum_sleep_runtime += delta;
2621		2621
2622	if (tsk) {	2622	if (tsk) {
2623	account_scheduler_latency(tsk, delta >> 10, 1);	2623	account_scheduler_latency(tsk, delta >> 10, 1);
2624	trace_sched_stat_sleep(tsk, delta);	2624	trace_sched_stat_sleep(tsk, delta);
2625	}	2625	}
2626	}	2626	}
2627	if (se->statistics.block_start) {	2627	if (se->statistics.block_start) {
2628	u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;	2628	u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
2629		2629
2630	if ((s64)delta < 0)	2630	if ((s64)delta < 0)
2631	delta = 0;	2631	delta = 0;
2632		2632
2633	if (unlikely(delta > se->statistics.block_max))	2633	if (unlikely(delta > se->statistics.block_max))
2634	se->statistics.block_max = delta;	2634	se->statistics.block_max = delta;
2635		2635
2636	se->statistics.block_start = 0;	2636	se->statistics.block_start = 0;
2637	se->statistics.sum_sleep_runtime += delta;	2637	se->statistics.sum_sleep_runtime += delta;
2638		2638
2639	if (tsk) {	2639	if (tsk) {
2640	if (tsk->in_iowait) {	2640	if (tsk->in_iowait) {
2641	se->statistics.iowait_sum += delta;	2641	se->statistics.iowait_sum += delta;
2642	se->statistics.iowait_count++;	2642	se->statistics.iowait_count++;
2643	trace_sched_stat_iowait(tsk, delta);	2643	trace_sched_stat_iowait(tsk, delta);
2644	}	2644	}
2645		2645
2646	trace_sched_stat_blocked(tsk, delta);	2646	trace_sched_stat_blocked(tsk, delta);
2647		2647
2648	/*	2648	/*
2649	* Blocking time is in units of nanosecs, so shift by	2649	* Blocking time is in units of nanosecs, so shift by
2650	* 20 to get a milliseconds-range estimation of the	2650	* 20 to get a milliseconds-range estimation of the
2651	* amount of time that the task spent sleeping:	2651	* amount of time that the task spent sleeping:
2652	*/	2652	*/
2653	if (unlikely(prof_on == SLEEP_PROFILING)) {	2653	if (unlikely(prof_on == SLEEP_PROFILING)) {
2654	profile_hits(SLEEP_PROFILING,	2654	profile_hits(SLEEP_PROFILING,
2655	(void *)get_wchan(tsk),	2655	(void *)get_wchan(tsk),
2656	delta >> 20);	2656	delta >> 20);
2657	}	2657	}
2658	account_scheduler_latency(tsk, delta >> 10, 0);	2658	account_scheduler_latency(tsk, delta >> 10, 0);
2659	}	2659	}
2660	}	2660	}
2661	#endif	2661	#endif
2662	}	2662	}
2663		2663
2664	static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)	2664	static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
2665	{	2665	{
2666	#ifdef CONFIG_SCHED_DEBUG	2666	#ifdef CONFIG_SCHED_DEBUG
2667	s64 d = se->vruntime - cfs_rq->min_vruntime;	2667	s64 d = se->vruntime - cfs_rq->min_vruntime;
2668		2668
2669	if (d < 0)	2669	if (d < 0)
2670	d = -d;	2670	d = -d;
2671		2671
2672	if (d > 3*sysctl_sched_latency)	2672	if (d > 3*sysctl_sched_latency)
2673	schedstat_inc(cfs_rq, nr_spread_over);	2673	schedstat_inc(cfs_rq, nr_spread_over);
2674	#endif	2674	#endif
2675	}	2675	}
2676		2676
2677	static void	2677	static void
2678	place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)	2678	place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
2679	{	2679	{
2680	u64 vruntime = cfs_rq->min_vruntime;	2680	u64 vruntime = cfs_rq->min_vruntime;
2681		2681
2682	/*	2682	/*
2683	* The 'current' period is already promised to the current tasks,	2683	* The 'current' period is already promised to the current tasks,
2684	* however the extra weight of the new task will slow them down a	2684	* however the extra weight of the new task will slow them down a
2685	* little, place the new task so that it fits in the slot that	2685	* little, place the new task so that it fits in the slot that
2686	* stays open at the end.	2686	* stays open at the end.
2687	*/	2687	*/
2688	if (initial && sched_feat(START_DEBIT))	2688	if (initial && sched_feat(START_DEBIT))
2689	vruntime += sched_vslice(cfs_rq, se);	2689	vruntime += sched_vslice(cfs_rq, se);
2690		2690
2691	/* sleeps up to a single latency don't count. */	2691	/* sleeps up to a single latency don't count. */
2692	if (!initial) {	2692	if (!initial) {
2693	unsigned long thresh = sysctl_sched_latency;	2693	unsigned long thresh = sysctl_sched_latency;
2694		2694
2695	/*	2695	/*
2696	* Halve their sleep time's effect, to allow	2696	* Halve their sleep time's effect, to allow
2697	* for a gentler effect of sleepers:	2697	* for a gentler effect of sleepers:
2698	*/	2698	*/
2699	if (sched_feat(GENTLE_FAIR_SLEEPERS))	2699	if (sched_feat(GENTLE_FAIR_SLEEPERS))
2700	thresh >>= 1;	2700	thresh >>= 1;
2701		2701
2702	vruntime -= thresh;	2702	vruntime -= thresh;
2703	}	2703	}
2704		2704
2705	/* ensure we never gain time by being placed backwards. */	2705	/* ensure we never gain time by being placed backwards. */
2706	se->vruntime = max_vruntime(se->vruntime, vruntime);	2706	se->vruntime = max_vruntime(se->vruntime, vruntime);
2707	}	2707	}
2708		2708
2709	static void check_enqueue_throttle(struct cfs_rq *cfs_rq);	2709	static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
2710		2710
2711	static void	2711	static void
2712	enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)	2712	enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
2713	{	2713	{
2714	/*	2714	/*
2715	* Update the normalized vruntime before updating min_vruntime	2715	* Update the normalized vruntime before updating min_vruntime
2716	* through calling update_curr().	2716	* through calling update_curr().
2717	*/	2717	*/
2718	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))	2718	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
2719	se->vruntime += cfs_rq->min_vruntime;	2719	se->vruntime += cfs_rq->min_vruntime;
2720		2720
2721	/*	2721	/*
2722	* Update run-time statistics of the 'current'.	2722	* Update run-time statistics of the 'current'.
2723	*/	2723	*/
2724	update_curr(cfs_rq);	2724	update_curr(cfs_rq);
2725	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);	2725	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
2726	account_entity_enqueue(cfs_rq, se);	2726	account_entity_enqueue(cfs_rq, se);
2727	update_cfs_shares(cfs_rq);	2727	update_cfs_shares(cfs_rq);
2728		2728
2729	if (flags & ENQUEUE_WAKEUP) {	2729	if (flags & ENQUEUE_WAKEUP) {
2730	place_entity(cfs_rq, se, 0);	2730	place_entity(cfs_rq, se, 0);
2731	enqueue_sleeper(cfs_rq, se);	2731	enqueue_sleeper(cfs_rq, se);
2732	}	2732	}
2733		2733
2734	update_stats_enqueue(cfs_rq, se);	2734	update_stats_enqueue(cfs_rq, se);
2735	check_spread(cfs_rq, se);	2735	check_spread(cfs_rq, se);
2736	if (se != cfs_rq->curr)	2736	if (se != cfs_rq->curr)
2737	__enqueue_entity(cfs_rq, se);	2737	__enqueue_entity(cfs_rq, se);
2738	se->on_rq = 1;	2738	se->on_rq = 1;
2739		2739
2740	if (cfs_rq->nr_running == 1) {	2740	if (cfs_rq->nr_running == 1) {
2741	list_add_leaf_cfs_rq(cfs_rq);	2741	list_add_leaf_cfs_rq(cfs_rq);
2742	check_enqueue_throttle(cfs_rq);	2742	check_enqueue_throttle(cfs_rq);
2743	}	2743	}
2744	}	2744	}
2745		2745
2746	static void __clear_buddies_last(struct sched_entity *se)	2746	static void __clear_buddies_last(struct sched_entity *se)
2747	{	2747	{
2748	for_each_sched_entity(se) {	2748	for_each_sched_entity(se) {
2749	struct cfs_rq *cfs_rq = cfs_rq_of(se);	2749	struct cfs_rq *cfs_rq = cfs_rq_of(se);
2750	if (cfs_rq->last != se)	2750	if (cfs_rq->last != se)
2751	break;	2751	break;
2752		2752
2753	cfs_rq->last = NULL;	2753	cfs_rq->last = NULL;
2754	}	2754	}
2755	}	2755	}
2756		2756
2757	static void __clear_buddies_next(struct sched_entity *se)	2757	static void __clear_buddies_next(struct sched_entity *se)
2758	{	2758	{
2759	for_each_sched_entity(se) {	2759	for_each_sched_entity(se) {
2760	struct cfs_rq *cfs_rq = cfs_rq_of(se);	2760	struct cfs_rq *cfs_rq = cfs_rq_of(se);
2761	if (cfs_rq->next != se)	2761	if (cfs_rq->next != se)
2762	break;	2762	break;
2763		2763
2764	cfs_rq->next = NULL;	2764	cfs_rq->next = NULL;
2765	}	2765	}
2766	}	2766	}
2767		2767
2768	static void __clear_buddies_skip(struct sched_entity *se)	2768	static void __clear_buddies_skip(struct sched_entity *se)
2769	{	2769	{
2770	for_each_sched_entity(se) {	2770	for_each_sched_entity(se) {
2771	struct cfs_rq *cfs_rq = cfs_rq_of(se);	2771	struct cfs_rq *cfs_rq = cfs_rq_of(se);
2772	if (cfs_rq->skip != se)	2772	if (cfs_rq->skip != se)
2773	break;	2773	break;
2774		2774
2775	cfs_rq->skip = NULL;	2775	cfs_rq->skip = NULL;
2776	}	2776	}
2777	}	2777	}
2778		2778
2779	static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)	2779	static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
2780	{	2780	{
2781	if (cfs_rq->last == se)	2781	if (cfs_rq->last == se)
2782	__clear_buddies_last(se);	2782	__clear_buddies_last(se);
2783		2783
2784	if (cfs_rq->next == se)	2784	if (cfs_rq->next == se)
2785	__clear_buddies_next(se);	2785	__clear_buddies_next(se);
2786		2786
2787	if (cfs_rq->skip == se)	2787	if (cfs_rq->skip == se)
2788	__clear_buddies_skip(se);	2788	__clear_buddies_skip(se);
2789	}	2789	}
2790		2790
2791	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);	2791	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
2792		2792
2793	static void	2793	static void
2794	dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)	2794	dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
2795	{	2795	{
2796	/*	2796	/*
2797	* Update run-time statistics of the 'current'.	2797	* Update run-time statistics of the 'current'.
2798	*/	2798	*/
2799	update_curr(cfs_rq);	2799	update_curr(cfs_rq);
2800	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);	2800	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
2801		2801
2802	update_stats_dequeue(cfs_rq, se);	2802	update_stats_dequeue(cfs_rq, se);
2803	if (flags & DEQUEUE_SLEEP) {	2803	if (flags & DEQUEUE_SLEEP) {
2804	#ifdef CONFIG_SCHEDSTATS	2804	#ifdef CONFIG_SCHEDSTATS
2805	if (entity_is_task(se)) {	2805	if (entity_is_task(se)) {
2806	struct task_struct *tsk = task_of(se);	2806	struct task_struct *tsk = task_of(se);
2807		2807
2808	if (tsk->state & TASK_INTERRUPTIBLE)	2808	if (tsk->state & TASK_INTERRUPTIBLE)
2809	se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));	2809	se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
2810	if (tsk->state & TASK_UNINTERRUPTIBLE)	2810	if (tsk->state & TASK_UNINTERRUPTIBLE)
2811	se->statistics.block_start = rq_clock(rq_of(cfs_rq));	2811	se->statistics.block_start = rq_clock(rq_of(cfs_rq));
2812	}	2812	}
2813	#endif	2813	#endif
2814	}	2814	}
2815		2815
2816	clear_buddies(cfs_rq, se);	2816	clear_buddies(cfs_rq, se);
2817		2817
2818	if (se != cfs_rq->curr)	2818	if (se != cfs_rq->curr)
2819	__dequeue_entity(cfs_rq, se);	2819	__dequeue_entity(cfs_rq, se);
2820	se->on_rq = 0;	2820	se->on_rq = 0;
2821	account_entity_dequeue(cfs_rq, se);	2821	account_entity_dequeue(cfs_rq, se);
2822		2822
2823	/*	2823	/*
2824	* Normalize the entity after updating the min_vruntime because the	2824	* Normalize the entity after updating the min_vruntime because the
2825	* update can refer to the ->curr item and we need to reflect this	2825	* update can refer to the ->curr item and we need to reflect this
2826	* movement in our normalized position.	2826	* movement in our normalized position.
2827	*/	2827	*/
2828	if (!(flags & DEQUEUE_SLEEP))	2828	if (!(flags & DEQUEUE_SLEEP))
2829	se->vruntime -= cfs_rq->min_vruntime;	2829	se->vruntime -= cfs_rq->min_vruntime;
2830		2830
2831	/* return excess runtime on last dequeue */	2831	/* return excess runtime on last dequeue */
2832	return_cfs_rq_runtime(cfs_rq);	2832	return_cfs_rq_runtime(cfs_rq);
2833		2833
2834	update_min_vruntime(cfs_rq);	2834	update_min_vruntime(cfs_rq);
2835	update_cfs_shares(cfs_rq);	2835	update_cfs_shares(cfs_rq);
2836	}	2836	}
2837		2837
2838	/*	2838	/*
2839	* Preempt the current task with a newly woken task if needed:	2839	* Preempt the current task with a newly woken task if needed:
2840	*/	2840	*/
2841	static void	2841	static void
2842	check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)	2842	check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2843	{	2843	{
2844	unsigned long ideal_runtime, delta_exec;	2844	unsigned long ideal_runtime, delta_exec;
2845	struct sched_entity *se;	2845	struct sched_entity *se;
2846	s64 delta;	2846	s64 delta;
2847		2847
2848	ideal_runtime = sched_slice(cfs_rq, curr);	2848	ideal_runtime = sched_slice(cfs_rq, curr);
2849	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;	2849	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
2850	if (delta_exec > ideal_runtime) {	2850	if (delta_exec > ideal_runtime) {
2851	resched_task(rq_of(cfs_rq)->curr);	2851	resched_task(rq_of(cfs_rq)->curr);
2852	/*	2852	/*
2853	* The current task ran long enough, ensure it doesn't get	2853	* The current task ran long enough, ensure it doesn't get
2854	* re-elected due to buddy favours.	2854	* re-elected due to buddy favours.
2855	*/	2855	*/
2856	clear_buddies(cfs_rq, curr);	2856	clear_buddies(cfs_rq, curr);
2857	return;	2857	return;
2858	}	2858	}
2859		2859
2860	/*	2860	/*
2861	* Ensure that a task that missed wakeup preemption by a	2861	* Ensure that a task that missed wakeup preemption by a
2862	* narrow margin doesn't have to wait for a full slice.	2862	* narrow margin doesn't have to wait for a full slice.
2863	* This also mitigates buddy induced latencies under load.	2863	* This also mitigates buddy induced latencies under load.
2864	*/	2864	*/
2865	if (delta_exec < sysctl_sched_min_granularity)	2865	if (delta_exec < sysctl_sched_min_granularity)
2866	return;	2866	return;
2867		2867
2868	se = __pick_first_entity(cfs_rq);	2868	se = __pick_first_entity(cfs_rq);
2869	delta = curr->vruntime - se->vruntime;	2869	delta = curr->vruntime - se->vruntime;
2870		2870
2871	if (delta < 0)	2871	if (delta < 0)
2872	return;	2872	return;
2873		2873
2874	if (delta > ideal_runtime)	2874	if (delta > ideal_runtime)
2875	resched_task(rq_of(cfs_rq)->curr);	2875	resched_task(rq_of(cfs_rq)->curr);
2876	}	2876	}
2877		2877
2878	static void	2878	static void
2879	set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)	2879	set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
2880	{	2880	{
2881	/* 'current' is not kept within the tree. */	2881	/* 'current' is not kept within the tree. */
2882	if (se->on_rq) {	2882	if (se->on_rq) {
2883	/*	2883	/*
2884	* Any task has to be enqueued before it get to execute on	2884	* Any task has to be enqueued before it get to execute on
2885	* a CPU. So account for the time it spent waiting on the	2885	* a CPU. So account for the time it spent waiting on the
2886	* runqueue.	2886	* runqueue.
2887	*/	2887	*/
2888	update_stats_wait_end(cfs_rq, se);	2888	update_stats_wait_end(cfs_rq, se);
2889	__dequeue_entity(cfs_rq, se);	2889	__dequeue_entity(cfs_rq, se);
2890	}	2890	}
2891		2891
2892	update_stats_curr_start(cfs_rq, se);	2892	update_stats_curr_start(cfs_rq, se);
2893	cfs_rq->curr = se;	2893	cfs_rq->curr = se;
2894	#ifdef CONFIG_SCHEDSTATS	2894	#ifdef CONFIG_SCHEDSTATS
2895	/*	2895	/*
2896	* Track our maximum slice length, if the CPU's load is at	2896	* Track our maximum slice length, if the CPU's load is at
2897	* least twice that of our own weight (i.e. dont track it	2897	* least twice that of our own weight (i.e. dont track it
2898	* when there are only lesser-weight tasks around):	2898	* when there are only lesser-weight tasks around):
2899	*/	2899	*/
2900	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {	2900	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
2901	se->statistics.slice_max = max(se->statistics.slice_max,	2901	se->statistics.slice_max = max(se->statistics.slice_max,
2902	se->sum_exec_runtime - se->prev_sum_exec_runtime);	2902	se->sum_exec_runtime - se->prev_sum_exec_runtime);
2903	}	2903	}
2904	#endif	2904	#endif
2905	se->prev_sum_exec_runtime = se->sum_exec_runtime;	2905	se->prev_sum_exec_runtime = se->sum_exec_runtime;
2906	}	2906	}
2907		2907
2908	static int	2908	static int
2909	wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);	2909	wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
2910		2910
2911	/*	2911	/*
2912	* Pick the next process, keeping these things in mind, in this order:	2912	* Pick the next process, keeping these things in mind, in this order:
2913	* 1) keep things fair between processes/task groups	2913	* 1) keep things fair between processes/task groups
2914	* 2) pick the "next" process, since someone really wants that to run	2914	* 2) pick the "next" process, since someone really wants that to run
2915	* 3) pick the "last" process, for cache locality	2915	* 3) pick the "last" process, for cache locality
2916	* 4) do not run the "skip" process, if something else is available	2916	* 4) do not run the "skip" process, if something else is available
2917	*/	2917	*/
2918	static struct sched_entity *	2918	static struct sched_entity *
2919	pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)	2919	pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2920	{	2920	{
2921	struct sched_entity *left = __pick_first_entity(cfs_rq);	2921	struct sched_entity *left = __pick_first_entity(cfs_rq);
2922	struct sched_entity *se;	2922	struct sched_entity *se;
2923		2923
2924	/*	2924	/*
2925	* If curr is set we have to see if its left of the leftmost entity	2925	* If curr is set we have to see if its left of the leftmost entity
2926	* still in the tree, provided there was anything in the tree at all.	2926	* still in the tree, provided there was anything in the tree at all.
2927	*/	2927	*/
2928	if (!left || (curr && entity_before(curr, left)))	2928	if (!left || (curr && entity_before(curr, left)))
2929	left = curr;	2929	left = curr;
2930		2930
2931	se = left; /* ideally we run the leftmost entity */	2931	se = left; /* ideally we run the leftmost entity */
2932		2932
2933	/*	2933	/*
2934	* Avoid running the skip buddy, if running something else can	2934	* Avoid running the skip buddy, if running something else can
2935	* be done without getting too unfair.	2935	* be done without getting too unfair.
2936	*/	2936	*/
2937	if (cfs_rq->skip == se) {	2937	if (cfs_rq->skip == se) {
2938	struct sched_entity *second;	2938	struct sched_entity *second;
2939		2939
2940	if (se == curr) {	2940	if (se == curr) {
2941	second = __pick_first_entity(cfs_rq);	2941	second = __pick_first_entity(cfs_rq);
2942	} else {	2942	} else {
2943	second = __pick_next_entity(se);	2943	second = __pick_next_entity(se);
2944	if (!second || (curr && entity_before(curr, second)))	2944	if (!second || (curr && entity_before(curr, second)))
2945	second = curr;	2945	second = curr;
2946	}	2946	}
2947		2947
2948	if (second && wakeup_preempt_entity(second, left) < 1)	2948	if (second && wakeup_preempt_entity(second, left) < 1)
2949	se = second;	2949	se = second;
2950	}	2950	}
2951		2951
2952	/*	2952	/*
2953	* Prefer last buddy, try to return the CPU to a preempted task.	2953	* Prefer last buddy, try to return the CPU to a preempted task.
2954	*/	2954	*/
2955	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)	2955	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
2956	se = cfs_rq->last;	2956	se = cfs_rq->last;
2957		2957
2958	/*	2958	/*
2959	* Someone really wants this to run. If it's not unfair, run it.	2959	* Someone really wants this to run. If it's not unfair, run it.
2960	*/	2960	*/
2961	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)	2961	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
2962	se = cfs_rq->next;	2962	se = cfs_rq->next;
2963		2963
2964	clear_buddies(cfs_rq, se);	2964	clear_buddies(cfs_rq, se);
2965		2965
2966	return se;	2966	return se;
2967	}	2967	}
2968		2968
2969	static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);	2969	static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
2970		2970
2971	static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)	2971	static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
2972	{	2972	{
2973	/*	2973	/*
2974	* If still on the runqueue then deactivate_task()	2974	* If still on the runqueue then deactivate_task()
2975	* was not called and update_curr() has to be done:	2975	* was not called and update_curr() has to be done:
2976	*/	2976	*/
2977	if (prev->on_rq)	2977	if (prev->on_rq)
2978	update_curr(cfs_rq);	2978	update_curr(cfs_rq);
2979		2979
2980	/* throttle cfs_rqs exceeding runtime */	2980	/* throttle cfs_rqs exceeding runtime */
2981	check_cfs_rq_runtime(cfs_rq);	2981	check_cfs_rq_runtime(cfs_rq);
2982		2982
2983	check_spread(cfs_rq, prev);	2983	check_spread(cfs_rq, prev);
2984	if (prev->on_rq) {	2984	if (prev->on_rq) {
2985	update_stats_wait_start(cfs_rq, prev);	2985	update_stats_wait_start(cfs_rq, prev);
2986	/* Put 'current' back into the tree. */	2986	/* Put 'current' back into the tree. */
2987	__enqueue_entity(cfs_rq, prev);	2987	__enqueue_entity(cfs_rq, prev);
2988	/* in !on_rq case, update occurred at dequeue */	2988	/* in !on_rq case, update occurred at dequeue */
2989	update_entity_load_avg(prev, 1);	2989	update_entity_load_avg(prev, 1);
2990	}	2990	}
2991	cfs_rq->curr = NULL;	2991	cfs_rq->curr = NULL;
2992	}	2992	}
2993		2993
2994	static void	2994	static void
2995	entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)	2995	entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
2996	{	2996	{
2997	/*	2997	/*
2998	* Update run-time statistics of the 'current'.	2998	* Update run-time statistics of the 'current'.
2999	*/	2999	*/
3000	update_curr(cfs_rq);	3000	update_curr(cfs_rq);
3001		3001
3002	/*	3002	/*
3003	* Ensure that runnable average is periodically updated.	3003	* Ensure that runnable average is periodically updated.
3004	*/	3004	*/
3005	update_entity_load_avg(curr, 1);	3005	update_entity_load_avg(curr, 1);
3006	update_cfs_rq_blocked_load(cfs_rq, 1);	3006	update_cfs_rq_blocked_load(cfs_rq, 1);
3007	update_cfs_shares(cfs_rq);	3007	update_cfs_shares(cfs_rq);
3008		3008
3009	#ifdef CONFIG_SCHED_HRTICK	3009	#ifdef CONFIG_SCHED_HRTICK
3010	/*	3010	/*
3011	* queued ticks are scheduled to match the slice, so don't bother	3011	* queued ticks are scheduled to match the slice, so don't bother
3012	* validating it and just reschedule.	3012	* validating it and just reschedule.
3013	*/	3013	*/
3014	if (queued) {	3014	if (queued) {
3015	resched_task(rq_of(cfs_rq)->curr);	3015	resched_task(rq_of(cfs_rq)->curr);
3016	return;	3016	return;
3017	}	3017	}
3018	/*	3018	/*
3019	* don't let the period tick interfere with the hrtick preemption	3019	* don't let the period tick interfere with the hrtick preemption
3020	*/	3020	*/
3021	if (!sched_feat(DOUBLE_TICK) &&	3021	if (!sched_feat(DOUBLE_TICK) &&
3022	hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))	3022	hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
3023	return;	3023	return;
3024	#endif	3024	#endif
3025		3025
3026	if (cfs_rq->nr_running > 1)	3026	if (cfs_rq->nr_running > 1)
3027	check_preempt_tick(cfs_rq, curr);	3027	check_preempt_tick(cfs_rq, curr);
3028	}	3028	}
3029		3029
3030		3030
3031	/**************************************************	3031	/**************************************************
3032	* CFS bandwidth control machinery	3032	* CFS bandwidth control machinery
3033	*/	3033	*/
3034		3034
3035	#ifdef CONFIG_CFS_BANDWIDTH	3035	#ifdef CONFIG_CFS_BANDWIDTH
3036		3036
3037	#ifdef HAVE_JUMP_LABEL	3037	#ifdef HAVE_JUMP_LABEL
3038	static struct static_key __cfs_bandwidth_used;	3038	static struct static_key __cfs_bandwidth_used;
3039		3039
3040	static inline bool cfs_bandwidth_used(void)	3040	static inline bool cfs_bandwidth_used(void)
3041	{	3041	{
3042	return static_key_false(&__cfs_bandwidth_used);	3042	return static_key_false(&__cfs_bandwidth_used);
3043	}	3043	}
3044		3044
3045	void cfs_bandwidth_usage_inc(void)	3045	void cfs_bandwidth_usage_inc(void)
3046	{	3046	{
3047	static_key_slow_inc(&__cfs_bandwidth_used);	3047	static_key_slow_inc(&__cfs_bandwidth_used);
3048	}	3048	}
3049		3049
3050	void cfs_bandwidth_usage_dec(void)	3050	void cfs_bandwidth_usage_dec(void)
3051	{	3051	{
3052	static_key_slow_dec(&__cfs_bandwidth_used);	3052	static_key_slow_dec(&__cfs_bandwidth_used);
3053	}	3053	}
3054	#else /* HAVE_JUMP_LABEL */	3054	#else /* HAVE_JUMP_LABEL */
3055	static bool cfs_bandwidth_used(void)	3055	static bool cfs_bandwidth_used(void)
3056	{	3056	{
3057	return true;	3057	return true;
3058	}	3058	}
3059		3059
3060	void cfs_bandwidth_usage_inc(void) {}	3060	void cfs_bandwidth_usage_inc(void) {}
3061	void cfs_bandwidth_usage_dec(void) {}	3061	void cfs_bandwidth_usage_dec(void) {}
3062	#endif /* HAVE_JUMP_LABEL */	3062	#endif /* HAVE_JUMP_LABEL */
3063		3063
3064	/*	3064	/*
3065	* default period for cfs group bandwidth.	3065	* default period for cfs group bandwidth.
3066	* default: 0.1s, units: nanoseconds	3066	* default: 0.1s, units: nanoseconds
3067	*/	3067	*/
3068	static inline u64 default_cfs_period(void)	3068	static inline u64 default_cfs_period(void)
3069	{	3069	{
3070	return 100000000ULL;	3070	return 100000000ULL;
3071	}	3071	}
3072		3072
3073	static inline u64 sched_cfs_bandwidth_slice(void)	3073	static inline u64 sched_cfs_bandwidth_slice(void)
3074	{	3074	{
3075	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;	3075	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
3076	}	3076	}
3077		3077
3078	/*	3078	/*
3079	* Replenish runtime according to assigned quota and update expiration time.	3079	* Replenish runtime according to assigned quota and update expiration time.
3080	* We use sched_clock_cpu directly instead of rq->clock to avoid adding	3080	* We use sched_clock_cpu directly instead of rq->clock to avoid adding
3081	* additional synchronization around rq->lock.	3081	* additional synchronization around rq->lock.
3082	*	3082	*
3083	* requires cfs_b->lock	3083	* requires cfs_b->lock
3084	*/	3084	*/
3085	void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)	3085	void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
3086	{	3086	{
3087	u64 now;	3087	u64 now;
3088		3088
3089	if (cfs_b->quota == RUNTIME_INF)	3089	if (cfs_b->quota == RUNTIME_INF)
3090	return;	3090	return;
3091		3091
3092	now = sched_clock_cpu(smp_processor_id());	3092	now = sched_clock_cpu(smp_processor_id());
3093	cfs_b->runtime = cfs_b->quota;	3093	cfs_b->runtime = cfs_b->quota;
3094	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);	3094	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
3095	}	3095	}
3096		3096
3097	static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)	3097	static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3098	{	3098	{
3099	return &tg->cfs_bandwidth;	3099	return &tg->cfs_bandwidth;
3100	}	3100	}
3101		3101
3102	/* rq->task_clock normalized against any time this cfs_rq has spent throttled */	3102	/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
3103	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)	3103	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3104	{	3104	{
3105	if (unlikely(cfs_rq->throttle_count))	3105	if (unlikely(cfs_rq->throttle_count))
3106	return cfs_rq->throttled_clock_task;	3106	return cfs_rq->throttled_clock_task;
3107		3107
3108	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;	3108	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
3109	}	3109	}
3110		3110
3111	/* returns 0 on failure to allocate runtime */	3111	/* returns 0 on failure to allocate runtime */
3112	static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)	3112	static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3113	{	3113	{
3114	struct task_group *tg = cfs_rq->tg;	3114	struct task_group *tg = cfs_rq->tg;
3115	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);	3115	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
3116	u64 amount = 0, min_amount, expires;	3116	u64 amount = 0, min_amount, expires;
3117		3117
3118	/* note: this is a positive sum as runtime_remaining <= 0 */	3118	/* note: this is a positive sum as runtime_remaining <= 0 */
3119	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;	3119	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
3120		3120
3121	raw_spin_lock(&cfs_b->lock);	3121	raw_spin_lock(&cfs_b->lock);
3122	if (cfs_b->quota == RUNTIME_INF)	3122	if (cfs_b->quota == RUNTIME_INF)
3123	amount = min_amount;	3123	amount = min_amount;
3124	else {	3124	else {
3125	/*	3125	/*
3126	* If the bandwidth pool has become inactive, then at least one	3126	* If the bandwidth pool has become inactive, then at least one
3127	* period must have elapsed since the last consumption.	3127	* period must have elapsed since the last consumption.
3128	* Refresh the global state and ensure bandwidth timer becomes	3128	* Refresh the global state and ensure bandwidth timer becomes
3129	* active.	3129	* active.
3130	*/	3130	*/
3131	if (!cfs_b->timer_active) {	3131	if (!cfs_b->timer_active) {
3132	__refill_cfs_bandwidth_runtime(cfs_b);	3132	__refill_cfs_bandwidth_runtime(cfs_b);
3133	__start_cfs_bandwidth(cfs_b);	3133	__start_cfs_bandwidth(cfs_b, false);
3134	}	3134	}
3135		3135
3136	if (cfs_b->runtime > 0) {	3136	if (cfs_b->runtime > 0) {
3137	amount = min(cfs_b->runtime, min_amount);	3137	amount = min(cfs_b->runtime, min_amount);
3138	cfs_b->runtime -= amount;	3138	cfs_b->runtime -= amount;
3139	cfs_b->idle = 0;	3139	cfs_b->idle = 0;
3140	}	3140	}
3141	}	3141	}
3142	expires = cfs_b->runtime_expires;	3142	expires = cfs_b->runtime_expires;
3143	raw_spin_unlock(&cfs_b->lock);	3143	raw_spin_unlock(&cfs_b->lock);
3144		3144
3145	cfs_rq->runtime_remaining += amount;	3145	cfs_rq->runtime_remaining += amount;
3146	/*	3146	/*
3147	* we may have advanced our local expiration to account for allowed	3147	* we may have advanced our local expiration to account for allowed
3148	* spread between our sched_clock and the one on which runtime was	3148	* spread between our sched_clock and the one on which runtime was
3149	* issued.	3149	* issued.
3150	*/	3150	*/
3151	if ((s64)(expires - cfs_rq->runtime_expires) > 0)	3151	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
3152	cfs_rq->runtime_expires = expires;	3152	cfs_rq->runtime_expires = expires;
3153		3153
3154	return cfs_rq->runtime_remaining > 0;	3154	return cfs_rq->runtime_remaining > 0;
3155	}	3155	}
3156		3156
3157	/*	3157	/*
3158	* Note: This depends on the synchronization provided by sched_clock and the	3158	* Note: This depends on the synchronization provided by sched_clock and the
3159	* fact that rq->clock snapshots this value.	3159	* fact that rq->clock snapshots this value.
3160	*/	3160	*/
3161	static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)	3161	static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3162	{	3162	{
3163	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);	3163	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3164		3164
3165	/* if the deadline is ahead of our clock, nothing to do */	3165	/* if the deadline is ahead of our clock, nothing to do */
3166	if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))	3166	if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
3167	return;	3167	return;
3168		3168
3169	if (cfs_rq->runtime_remaining < 0)	3169	if (cfs_rq->runtime_remaining < 0)
3170	return;	3170	return;
3171		3171
3172	/*	3172	/*
3173	* If the local deadline has passed we have to consider the	3173	* If the local deadline has passed we have to consider the
3174	* possibility that our sched_clock is 'fast' and the global deadline	3174	* possibility that our sched_clock is 'fast' and the global deadline
3175	* has not truly expired.	3175	* has not truly expired.
3176	*	3176	*
3177	* Fortunately we can check determine whether this the case by checking	3177	* Fortunately we can check determine whether this the case by checking
3178	* whether the global deadline has advanced.	3178	* whether the global deadline has advanced.
3179	*/	3179	*/
3180		3180
3181	if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {	3181	if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
3182	/* extend local deadline, drift is bounded above by 2 ticks */	3182	/* extend local deadline, drift is bounded above by 2 ticks */
3183	cfs_rq->runtime_expires += TICK_NSEC;	3183	cfs_rq->runtime_expires += TICK_NSEC;
3184	} else {	3184	} else {
3185	/* global deadline is ahead, expiration has passed */	3185	/* global deadline is ahead, expiration has passed */
3186	cfs_rq->runtime_remaining = 0;	3186	cfs_rq->runtime_remaining = 0;
3187	}	3187	}
3188	}	3188	}
3189		3189
3190	static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)	3190	static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3191	{	3191	{
3192	/* dock delta_exec before expiring quota (as it could span periods) */	3192	/* dock delta_exec before expiring quota (as it could span periods) */
3193	cfs_rq->runtime_remaining -= delta_exec;	3193	cfs_rq->runtime_remaining -= delta_exec;
3194	expire_cfs_rq_runtime(cfs_rq);	3194	expire_cfs_rq_runtime(cfs_rq);
3195		3195
3196	if (likely(cfs_rq->runtime_remaining > 0))	3196	if (likely(cfs_rq->runtime_remaining > 0))
3197	return;	3197	return;
3198		3198
3199	/*	3199	/*
3200	* if we're unable to extend our runtime we resched so that the active	3200	* if we're unable to extend our runtime we resched so that the active
3201	* hierarchy can be throttled	3201	* hierarchy can be throttled
3202	*/	3202	*/
3203	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))	3203	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
3204	resched_task(rq_of(cfs_rq)->curr);	3204	resched_task(rq_of(cfs_rq)->curr);
3205	}	3205	}
3206		3206
3207	static __always_inline	3207	static __always_inline
3208	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)	3208	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3209	{	3209	{
3210	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)	3210	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
3211	return;	3211	return;
3212		3212
3213	__account_cfs_rq_runtime(cfs_rq, delta_exec);	3213	__account_cfs_rq_runtime(cfs_rq, delta_exec);
3214	}	3214	}
3215		3215
3216	static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)	3216	static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
3217	{	3217	{
3218	return cfs_bandwidth_used() && cfs_rq->throttled;	3218	return cfs_bandwidth_used() && cfs_rq->throttled;
3219	}	3219	}
3220		3220
3221	/* check whether cfs_rq, or any parent, is throttled */	3221	/* check whether cfs_rq, or any parent, is throttled */
3222	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)	3222	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
3223	{	3223	{
3224	return cfs_bandwidth_used() && cfs_rq->throttle_count;	3224	return cfs_bandwidth_used() && cfs_rq->throttle_count;
3225	}	3225	}
3226		3226
3227	/*	3227	/*
3228	* Ensure that neither of the group entities corresponding to src_cpu or	3228	* Ensure that neither of the group entities corresponding to src_cpu or
3229	* dest_cpu are members of a throttled hierarchy when performing group	3229	* dest_cpu are members of a throttled hierarchy when performing group
3230	* load-balance operations.	3230	* load-balance operations.
3231	*/	3231	*/
3232	static inline int throttled_lb_pair(struct task_group *tg,	3232	static inline int throttled_lb_pair(struct task_group *tg,
3233	int src_cpu, int dest_cpu)	3233	int src_cpu, int dest_cpu)
3234	{	3234	{
3235	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;	3235	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
3236		3236
3237	src_cfs_rq = tg->cfs_rq[src_cpu];	3237	src_cfs_rq = tg->cfs_rq[src_cpu];
3238	dest_cfs_rq = tg->cfs_rq[dest_cpu];	3238	dest_cfs_rq = tg->cfs_rq[dest_cpu];
3239		3239
3240	return throttled_hierarchy(src_cfs_rq) ||	3240	return throttled_hierarchy(src_cfs_rq) ||
3241	throttled_hierarchy(dest_cfs_rq);	3241	throttled_hierarchy(dest_cfs_rq);
3242	}	3242	}
3243		3243
3244	/* updated child weight may affect parent so we have to do this bottom up */	3244	/* updated child weight may affect parent so we have to do this bottom up */
3245	static int tg_unthrottle_up(struct task_group *tg, void *data)	3245	static int tg_unthrottle_up(struct task_group *tg, void *data)
3246	{	3246	{
3247	struct rq *rq = data;	3247	struct rq *rq = data;
3248	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];	3248	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3249		3249
3250	cfs_rq->throttle_count--;	3250	cfs_rq->throttle_count--;
3251	#ifdef CONFIG_SMP	3251	#ifdef CONFIG_SMP
3252	if (!cfs_rq->throttle_count) {	3252	if (!cfs_rq->throttle_count) {
3253	/* adjust cfs_rq_clock_task() */	3253	/* adjust cfs_rq_clock_task() */
3254	cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -	3254	cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
3255	cfs_rq->throttled_clock_task;	3255	cfs_rq->throttled_clock_task;
3256	}	3256	}
3257	#endif	3257	#endif
3258		3258
3259	return 0;	3259	return 0;
3260	}	3260	}
3261		3261
3262	static int tg_throttle_down(struct task_group *tg, void *data)	3262	static int tg_throttle_down(struct task_group *tg, void *data)
3263	{	3263	{
3264	struct rq *rq = data;	3264	struct rq *rq = data;
3265	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];	3265	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3266		3266
3267	/* group is entering throttled state, stop time */	3267	/* group is entering throttled state, stop time */
3268	if (!cfs_rq->throttle_count)	3268	if (!cfs_rq->throttle_count)
3269	cfs_rq->throttled_clock_task = rq_clock_task(rq);	3269	cfs_rq->throttled_clock_task = rq_clock_task(rq);
3270	cfs_rq->throttle_count++;	3270	cfs_rq->throttle_count++;
3271		3271
3272	return 0;	3272	return 0;
3273	}	3273	}
3274		3274
3275	static void throttle_cfs_rq(struct cfs_rq *cfs_rq)	3275	static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3276	{	3276	{
3277	struct rq *rq = rq_of(cfs_rq);	3277	struct rq *rq = rq_of(cfs_rq);
3278	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);	3278	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3279	struct sched_entity *se;	3279	struct sched_entity *se;
3280	long task_delta, dequeue = 1;	3280	long task_delta, dequeue = 1;
3281		3281
3282	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];	3282	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
3283		3283
3284	/* freeze hierarchy runnable averages while throttled */	3284	/* freeze hierarchy runnable averages while throttled */
3285	rcu_read_lock();	3285	rcu_read_lock();
3286	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);	3286	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
3287	rcu_read_unlock();	3287	rcu_read_unlock();
3288		3288
3289	task_delta = cfs_rq->h_nr_running;	3289	task_delta = cfs_rq->h_nr_running;
3290	for_each_sched_entity(se) {	3290	for_each_sched_entity(se) {
3291	struct cfs_rq *qcfs_rq = cfs_rq_of(se);	3291	struct cfs_rq *qcfs_rq = cfs_rq_of(se);
3292	/* throttled entity or throttle-on-deactivate */	3292	/* throttled entity or throttle-on-deactivate */
3293	if (!se->on_rq)	3293	if (!se->on_rq)
3294	break;	3294	break;
3295		3295
3296	if (dequeue)	3296	if (dequeue)
3297	dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);	3297	dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
3298	qcfs_rq->h_nr_running -= task_delta;	3298	qcfs_rq->h_nr_running -= task_delta;
3299		3299
3300	if (qcfs_rq->load.weight)	3300	if (qcfs_rq->load.weight)
3301	dequeue = 0;	3301	dequeue = 0;
3302	}	3302	}
3303		3303
3304	if (!se)	3304	if (!se)
3305	rq->nr_running -= task_delta;	3305	rq->nr_running -= task_delta;
3306		3306
3307	cfs_rq->throttled = 1;	3307	cfs_rq->throttled = 1;
3308	cfs_rq->throttled_clock = rq_clock(rq);	3308	cfs_rq->throttled_clock = rq_clock(rq);
3309	raw_spin_lock(&cfs_b->lock);	3309	raw_spin_lock(&cfs_b->lock);
3310	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);	3310	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3311	if (!cfs_b->timer_active)	3311	if (!cfs_b->timer_active)
3312	__start_cfs_bandwidth(cfs_b);	3312	__start_cfs_bandwidth(cfs_b, false);
3313	raw_spin_unlock(&cfs_b->lock);	3313	raw_spin_unlock(&cfs_b->lock);
3314	}	3314	}
3315		3315
3316	void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)	3316	void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
3317	{	3317	{
3318	struct rq *rq = rq_of(cfs_rq);	3318	struct rq *rq = rq_of(cfs_rq);
3319	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);	3319	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3320	struct sched_entity *se;	3320	struct sched_entity *se;
3321	int enqueue = 1;	3321	int enqueue = 1;
3322	long task_delta;	3322	long task_delta;
3323		3323
3324	se = cfs_rq->tg->se[cpu_of(rq)];	3324	se = cfs_rq->tg->se[cpu_of(rq)];
3325		3325
3326	cfs_rq->throttled = 0;	3326	cfs_rq->throttled = 0;
3327		3327
3328	update_rq_clock(rq);	3328	update_rq_clock(rq);
3329		3329
3330	raw_spin_lock(&cfs_b->lock);	3330	raw_spin_lock(&cfs_b->lock);
3331	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;	3331	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
3332	list_del_rcu(&cfs_rq->throttled_list);	3332	list_del_rcu(&cfs_rq->throttled_list);
3333	raw_spin_unlock(&cfs_b->lock);	3333	raw_spin_unlock(&cfs_b->lock);
3334		3334
3335	/* update hierarchical throttle state */	3335	/* update hierarchical throttle state */
3336	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);	3336	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
3337		3337
3338	if (!cfs_rq->load.weight)	3338	if (!cfs_rq->load.weight)
3339	return;	3339	return;
3340		3340
3341	task_delta = cfs_rq->h_nr_running;	3341	task_delta = cfs_rq->h_nr_running;
3342	for_each_sched_entity(se) {	3342	for_each_sched_entity(se) {
3343	if (se->on_rq)	3343	if (se->on_rq)
3344	enqueue = 0;	3344	enqueue = 0;
3345		3345
3346	cfs_rq = cfs_rq_of(se);	3346	cfs_rq = cfs_rq_of(se);
3347	if (enqueue)	3347	if (enqueue)
3348	enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);	3348	enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
3349	cfs_rq->h_nr_running += task_delta;	3349	cfs_rq->h_nr_running += task_delta;
3350		3350
3351	if (cfs_rq_throttled(cfs_rq))	3351	if (cfs_rq_throttled(cfs_rq))
3352	break;	3352	break;
3353	}	3353	}
3354		3354
3355	if (!se)	3355	if (!se)
3356	rq->nr_running += task_delta;	3356	rq->nr_running += task_delta;
3357		3357
3358	/* determine whether we need to wake up potentially idle cpu */	3358	/* determine whether we need to wake up potentially idle cpu */
3359	if (rq->curr == rq->idle && rq->cfs.nr_running)	3359	if (rq->curr == rq->idle && rq->cfs.nr_running)
3360	resched_task(rq->curr);	3360	resched_task(rq->curr);
3361	}	3361	}
3362		3362
3363	static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,	3363	static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
3364	u64 remaining, u64 expires)	3364	u64 remaining, u64 expires)
3365	{	3365	{
3366	struct cfs_rq *cfs_rq;	3366	struct cfs_rq *cfs_rq;
3367	u64 runtime = remaining;	3367	u64 runtime = remaining;
3368		3368
3369	rcu_read_lock();	3369	rcu_read_lock();
3370	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,	3370	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
3371	throttled_list) {	3371	throttled_list) {
3372	struct rq *rq = rq_of(cfs_rq);	3372	struct rq *rq = rq_of(cfs_rq);
3373		3373
3374	raw_spin_lock(&rq->lock);	3374	raw_spin_lock(&rq->lock);
3375	if (!cfs_rq_throttled(cfs_rq))	3375	if (!cfs_rq_throttled(cfs_rq))
3376	goto next;	3376	goto next;
3377		3377
3378	runtime = -cfs_rq->runtime_remaining + 1;	3378	runtime = -cfs_rq->runtime_remaining + 1;
3379	if (runtime > remaining)	3379	if (runtime > remaining)
3380	runtime = remaining;	3380	runtime = remaining;
3381	remaining -= runtime;	3381	remaining -= runtime;
3382		3382
3383	cfs_rq->runtime_remaining += runtime;	3383	cfs_rq->runtime_remaining += runtime;
3384	cfs_rq->runtime_expires = expires;	3384	cfs_rq->runtime_expires = expires;
3385		3385
3386	/* we check whether we're throttled above */	3386	/* we check whether we're throttled above */
3387	if (cfs_rq->runtime_remaining > 0)	3387	if (cfs_rq->runtime_remaining > 0)
3388	unthrottle_cfs_rq(cfs_rq);	3388	unthrottle_cfs_rq(cfs_rq);
3389		3389
3390	next:	3390	next:
3391	raw_spin_unlock(&rq->lock);	3391	raw_spin_unlock(&rq->lock);
3392		3392
3393	if (!remaining)	3393	if (!remaining)
3394	break;	3394	break;
3395	}	3395	}
3396	rcu_read_unlock();	3396	rcu_read_unlock();
3397		3397
3398	return remaining;	3398	return remaining;
3399	}	3399	}
3400		3400
3401	/*	3401	/*
3402	* Responsible for refilling a task_group's bandwidth and unthrottling its	3402	* Responsible for refilling a task_group's bandwidth and unthrottling its
3403	* cfs_rqs as appropriate. If there has been no activity within the last	3403	* cfs_rqs as appropriate. If there has been no activity within the last
3404	* period the timer is deactivated until scheduling resumes; cfs_b->idle is	3404	* period the timer is deactivated until scheduling resumes; cfs_b->idle is
3405	* used to track this state.	3405	* used to track this state.
3406	*/	3406	*/
3407	static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)	3407	static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3408	{	3408	{
3409	u64 runtime, runtime_expires;	3409	u64 runtime, runtime_expires;
3410	int idle = 1, throttled;	3410	int idle = 1, throttled;
3411		3411
3412	raw_spin_lock(&cfs_b->lock);	3412	raw_spin_lock(&cfs_b->lock);
3413	/* no need to continue the timer with no bandwidth constraint */	3413	/* no need to continue the timer with no bandwidth constraint */
3414	if (cfs_b->quota == RUNTIME_INF)	3414	if (cfs_b->quota == RUNTIME_INF)
3415	goto out_unlock;	3415	goto out_unlock;
3416		3416
3417	throttled = !list_empty(&cfs_b->throttled_cfs_rq);	3417	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3418	/* idle depends on !throttled (for the case of a large deficit) */	3418	/* idle depends on !throttled (for the case of a large deficit) */
3419	idle = cfs_b->idle && !throttled;	3419	idle = cfs_b->idle && !throttled;
3420	cfs_b->nr_periods += overrun;	3420	cfs_b->nr_periods += overrun;
3421		3421
3422	/* if we're going inactive then everything else can be deferred */	3422	/* if we're going inactive then everything else can be deferred */
3423	if (idle)	3423	if (idle)
3424	goto out_unlock;	3424	goto out_unlock;
3425		3425
3426	/*	3426	/*
3427	* if we have relooped after returning idle once, we need to update our	3427	* if we have relooped after returning idle once, we need to update our
3428	* status as actually running, so that other cpus doing	3428	* status as actually running, so that other cpus doing
3429	* __start_cfs_bandwidth will stop trying to cancel us.	3429	* __start_cfs_bandwidth will stop trying to cancel us.
3430	*/	3430	*/
3431	cfs_b->timer_active = 1;	3431	cfs_b->timer_active = 1;
3432		3432
3433	__refill_cfs_bandwidth_runtime(cfs_b);	3433	__refill_cfs_bandwidth_runtime(cfs_b);
3434		3434
3435	if (!throttled) {	3435	if (!throttled) {
3436	/* mark as potentially idle for the upcoming period */	3436	/* mark as potentially idle for the upcoming period */
3437	cfs_b->idle = 1;	3437	cfs_b->idle = 1;
3438	goto out_unlock;	3438	goto out_unlock;
3439	}	3439	}
3440		3440
3441	/* account preceding periods in which throttling occurred */	3441	/* account preceding periods in which throttling occurred */
3442	cfs_b->nr_throttled += overrun;	3442	cfs_b->nr_throttled += overrun;
3443		3443
3444	/*	3444	/*
3445	* There are throttled entities so we must first use the new bandwidth	3445	* There are throttled entities so we must first use the new bandwidth
3446	* to unthrottle them before making it generally available. This	3446	* to unthrottle them before making it generally available. This
3447	* ensures that all existing debts will be paid before a new cfs_rq is	3447	* ensures that all existing debts will be paid before a new cfs_rq is
3448	* allowed to run.	3448	* allowed to run.
3449	*/	3449	*/
3450	runtime = cfs_b->runtime;	3450	runtime = cfs_b->runtime;
3451	runtime_expires = cfs_b->runtime_expires;	3451	runtime_expires = cfs_b->runtime_expires;
3452	cfs_b->runtime = 0;	3452	cfs_b->runtime = 0;
3453		3453
3454	/*	3454	/*
3455	* This check is repeated as we are holding onto the new bandwidth	3455	* This check is repeated as we are holding onto the new bandwidth
3456	* while we unthrottle. This can potentially race with an unthrottled	3456	* while we unthrottle. This can potentially race with an unthrottled
3457	* group trying to acquire new bandwidth from the global pool.	3457	* group trying to acquire new bandwidth from the global pool.
3458	*/	3458	*/
3459	while (throttled && runtime > 0) {	3459	while (throttled && runtime > 0) {
3460	raw_spin_unlock(&cfs_b->lock);	3460	raw_spin_unlock(&cfs_b->lock);
3461	/* we can't nest cfs_b->lock while distributing bandwidth */	3461	/* we can't nest cfs_b->lock while distributing bandwidth */
3462	runtime = distribute_cfs_runtime(cfs_b, runtime,	3462	runtime = distribute_cfs_runtime(cfs_b, runtime,
3463	runtime_expires);	3463	runtime_expires);
3464	raw_spin_lock(&cfs_b->lock);	3464	raw_spin_lock(&cfs_b->lock);
3465		3465
3466	throttled = !list_empty(&cfs_b->throttled_cfs_rq);	3466	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3467	}	3467	}
3468		3468
3469	/* return (any) remaining runtime */	3469	/* return (any) remaining runtime */
3470	cfs_b->runtime = runtime;	3470	cfs_b->runtime = runtime;
3471	/*	3471	/*
3472	* While we are ensured activity in the period following an	3472	* While we are ensured activity in the period following an
3473	* unthrottle, this also covers the case in which the new bandwidth is	3473	* unthrottle, this also covers the case in which the new bandwidth is
3474	* insufficient to cover the existing bandwidth deficit. (Forcing the	3474	* insufficient to cover the existing bandwidth deficit. (Forcing the
3475	* timer to remain active while there are any throttled entities.)	3475	* timer to remain active while there are any throttled entities.)
3476	*/	3476	*/
3477	cfs_b->idle = 0;	3477	cfs_b->idle = 0;
3478	out_unlock:	3478	out_unlock:
3479	if (idle)	3479	if (idle)
3480	cfs_b->timer_active = 0;	3480	cfs_b->timer_active = 0;
3481	raw_spin_unlock(&cfs_b->lock);	3481	raw_spin_unlock(&cfs_b->lock);
3482		3482
3483	return idle;	3483	return idle;
3484	}	3484	}
3485		3485
3486	/* a cfs_rq won't donate quota below this amount */	3486	/* a cfs_rq won't donate quota below this amount */
3487	static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;	3487	static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
3488	/* minimum remaining period time to redistribute slack quota */	3488	/* minimum remaining period time to redistribute slack quota */
3489	static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;	3489	static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
3490	/* how long we wait to gather additional slack before distributing */	3490	/* how long we wait to gather additional slack before distributing */
3491	static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;	3491	static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
3492		3492
3493	/*	3493	/*
3494	* Are we near the end of the current quota period?	3494	* Are we near the end of the current quota period?
3495	*	3495	*
3496	* Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the	3496	* Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
3497	* hrtimer base being cleared by __hrtimer_start_range_ns. In the case of	3497	* hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
3498	* migrate_hrtimers, base is never cleared, so we are fine.	3498	* migrate_hrtimers, base is never cleared, so we are fine.
3499	*/	3499	*/
3500	static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)	3500	static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
3501	{	3501	{
3502	struct hrtimer *refresh_timer = &cfs_b->period_timer;	3502	struct hrtimer *refresh_timer = &cfs_b->period_timer;
3503	u64 remaining;	3503	u64 remaining;
3504		3504
3505	/* if the call-back is running a quota refresh is already occurring */	3505	/* if the call-back is running a quota refresh is already occurring */
3506	if (hrtimer_callback_running(refresh_timer))	3506	if (hrtimer_callback_running(refresh_timer))
3507	return 1;	3507	return 1;
3508		3508
3509	/* is a quota refresh about to occur? */	3509	/* is a quota refresh about to occur? */
3510	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));	3510	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
3511	if (remaining < min_expire)	3511	if (remaining < min_expire)
3512	return 1;	3512	return 1;
3513		3513
3514	return 0;	3514	return 0;
3515	}	3515	}
3516		3516
3517	static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)	3517	static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
3518	{	3518	{
3519	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;	3519	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
3520		3520
3521	/* if there's a quota refresh soon don't bother with slack */	3521	/* if there's a quota refresh soon don't bother with slack */
3522	if (runtime_refresh_within(cfs_b, min_left))	3522	if (runtime_refresh_within(cfs_b, min_left))
3523	return;	3523	return;
3524		3524
3525	start_bandwidth_timer(&cfs_b->slack_timer,	3525	start_bandwidth_timer(&cfs_b->slack_timer,
3526	ns_to_ktime(cfs_bandwidth_slack_period));	3526	ns_to_ktime(cfs_bandwidth_slack_period));
3527	}	3527	}
3528		3528
3529	/* we know any runtime found here is valid as update_curr() precedes return */	3529	/* we know any runtime found here is valid as update_curr() precedes return */
3530	static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)	3530	static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3531	{	3531	{
3532	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);	3532	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3533	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;	3533	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
3534		3534
3535	if (slack_runtime <= 0)	3535	if (slack_runtime <= 0)
3536	return;	3536	return;
3537		3537
3538	raw_spin_lock(&cfs_b->lock);	3538	raw_spin_lock(&cfs_b->lock);
3539	if (cfs_b->quota != RUNTIME_INF &&	3539	if (cfs_b->quota != RUNTIME_INF &&
3540	cfs_rq->runtime_expires == cfs_b->runtime_expires) {	3540	cfs_rq->runtime_expires == cfs_b->runtime_expires) {
3541	cfs_b->runtime += slack_runtime;	3541	cfs_b->runtime += slack_runtime;
3542		3542
3543	/* we are under rq->lock, defer unthrottling using a timer */	3543	/* we are under rq->lock, defer unthrottling using a timer */
3544	if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&	3544	if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
3545	!list_empty(&cfs_b->throttled_cfs_rq))	3545	!list_empty(&cfs_b->throttled_cfs_rq))
3546	start_cfs_slack_bandwidth(cfs_b);	3546	start_cfs_slack_bandwidth(cfs_b);
3547	}	3547	}
3548	raw_spin_unlock(&cfs_b->lock);	3548	raw_spin_unlock(&cfs_b->lock);
3549		3549
3550	/* even if it's not valid for return we don't want to try again */	3550	/* even if it's not valid for return we don't want to try again */
3551	cfs_rq->runtime_remaining -= slack_runtime;	3551	cfs_rq->runtime_remaining -= slack_runtime;
3552	}	3552	}
3553		3553
3554	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)	3554	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3555	{	3555	{
3556	if (!cfs_bandwidth_used())	3556	if (!cfs_bandwidth_used())
3557	return;	3557	return;
3558		3558
3559	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)	3559	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
3560	return;	3560	return;
3561		3561
3562	__return_cfs_rq_runtime(cfs_rq);	3562	__return_cfs_rq_runtime(cfs_rq);
3563	}	3563	}
3564		3564
3565	/*	3565	/*
3566	* This is done with a timer (instead of inline with bandwidth return) since	3566	* This is done with a timer (instead of inline with bandwidth return) since
3567	* it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.	3567	* it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
3568	*/	3568	*/
3569	static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)	3569	static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
3570	{	3570	{
3571	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();	3571	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
3572	u64 expires;	3572	u64 expires;
3573		3573
3574	/* confirm we're still not at a refresh boundary */	3574	/* confirm we're still not at a refresh boundary */
3575	raw_spin_lock(&cfs_b->lock);	3575	raw_spin_lock(&cfs_b->lock);
3576	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {	3576	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
3577	raw_spin_unlock(&cfs_b->lock);	3577	raw_spin_unlock(&cfs_b->lock);
3578	return;	3578	return;
3579	}	3579	}
3580		3580
3581	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {	3581	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
3582	runtime = cfs_b->runtime;	3582	runtime = cfs_b->runtime;
3583	cfs_b->runtime = 0;	3583	cfs_b->runtime = 0;
3584	}	3584	}
3585	expires = cfs_b->runtime_expires;	3585	expires = cfs_b->runtime_expires;
3586	raw_spin_unlock(&cfs_b->lock);	3586	raw_spin_unlock(&cfs_b->lock);
3587		3587
3588	if (!runtime)	3588	if (!runtime)
3589	return;	3589	return;
3590		3590
3591	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);	3591	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
3592		3592
3593	raw_spin_lock(&cfs_b->lock);	3593	raw_spin_lock(&cfs_b->lock);
3594	if (expires == cfs_b->runtime_expires)	3594	if (expires == cfs_b->runtime_expires)
3595	cfs_b->runtime = runtime;	3595	cfs_b->runtime = runtime;
3596	raw_spin_unlock(&cfs_b->lock);	3596	raw_spin_unlock(&cfs_b->lock);
3597	}	3597	}
3598		3598
3599	/*	3599	/*
3600	* When a group wakes up we want to make sure that its quota is not already	3600	* When a group wakes up we want to make sure that its quota is not already
3601	* expired/exceeded, otherwise it may be allowed to steal additional ticks of	3601	* expired/exceeded, otherwise it may be allowed to steal additional ticks of
3602	* runtime as update_curr() throttling can not trigger until it's on-rq.	3602	* runtime as update_curr() throttling can not trigger until it's on-rq.
3603	*/	3603	*/
3604	static void check_enqueue_throttle(struct cfs_rq *cfs_rq)	3604	static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
3605	{	3605	{
3606	if (!cfs_bandwidth_used())	3606	if (!cfs_bandwidth_used())
3607	return;	3607	return;
3608		3608
3609	/* an active group must be handled by the update_curr()->put() path */	3609	/* an active group must be handled by the update_curr()->put() path */
3610	if (!cfs_rq->runtime_enabled || cfs_rq->curr)	3610	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
3611	return;	3611	return;
3612		3612
3613	/* ensure the group is not already throttled */	3613	/* ensure the group is not already throttled */
3614	if (cfs_rq_throttled(cfs_rq))	3614	if (cfs_rq_throttled(cfs_rq))
3615	return;	3615	return;
3616		3616
3617	/* update runtime allocation */	3617	/* update runtime allocation */
3618	account_cfs_rq_runtime(cfs_rq, 0);	3618	account_cfs_rq_runtime(cfs_rq, 0);
3619	if (cfs_rq->runtime_remaining <= 0)	3619	if (cfs_rq->runtime_remaining <= 0)
3620	throttle_cfs_rq(cfs_rq);	3620	throttle_cfs_rq(cfs_rq);
3621	}	3621	}
3622		3622
3623	/* conditionally throttle active cfs_rq's from put_prev_entity() */	3623	/* conditionally throttle active cfs_rq's from put_prev_entity() */
3624	static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)	3624	static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3625	{	3625	{
3626	if (!cfs_bandwidth_used())	3626	if (!cfs_bandwidth_used())
3627	return false;	3627	return false;
3628		3628
3629	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))	3629	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
3630	return false;	3630	return false;
3631		3631
3632	/*	3632	/*
3633	* it's possible for a throttled entity to be forced into a running	3633	* it's possible for a throttled entity to be forced into a running
3634	* state (e.g. set_curr_task), in this case we're finished.	3634	* state (e.g. set_curr_task), in this case we're finished.
3635	*/	3635	*/
3636	if (cfs_rq_throttled(cfs_rq))	3636	if (cfs_rq_throttled(cfs_rq))
3637	return true;	3637	return true;
3638		3638
3639	throttle_cfs_rq(cfs_rq);	3639	throttle_cfs_rq(cfs_rq);
3640	return true;	3640	return true;
3641	}	3641	}
3642		3642
3643	static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)	3643	static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
3644	{	3644	{
3645	struct cfs_bandwidth *cfs_b =	3645	struct cfs_bandwidth *cfs_b =
3646	container_of(timer, struct cfs_bandwidth, slack_timer);	3646	container_of(timer, struct cfs_bandwidth, slack_timer);
3647	do_sched_cfs_slack_timer(cfs_b);	3647	do_sched_cfs_slack_timer(cfs_b);
3648		3648
3649	return HRTIMER_NORESTART;	3649	return HRTIMER_NORESTART;
3650	}	3650	}
3651		3651
3652	static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)	3652	static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
3653	{	3653	{
3654	struct cfs_bandwidth *cfs_b =	3654	struct cfs_bandwidth *cfs_b =
3655	container_of(timer, struct cfs_bandwidth, period_timer);	3655	container_of(timer, struct cfs_bandwidth, period_timer);
3656	ktime_t now;	3656	ktime_t now;
3657	int overrun;	3657	int overrun;
3658	int idle = 0;	3658	int idle = 0;
3659		3659
3660	for (;;) {	3660	for (;;) {
3661	now = hrtimer_cb_get_time(timer);	3661	now = hrtimer_cb_get_time(timer);
3662	overrun = hrtimer_forward(timer, now, cfs_b->period);	3662	overrun = hrtimer_forward(timer, now, cfs_b->period);
3663		3663
3664	if (!overrun)	3664	if (!overrun)
3665	break;	3665	break;
3666		3666
3667	idle = do_sched_cfs_period_timer(cfs_b, overrun);	3667	idle = do_sched_cfs_period_timer(cfs_b, overrun);
3668	}	3668	}
3669		3669
3670	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;	3670	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
3671	}	3671	}
3672		3672
3673	void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)	3673	void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3674	{	3674	{
3675	raw_spin_lock_init(&cfs_b->lock);	3675	raw_spin_lock_init(&cfs_b->lock);
3676	cfs_b->runtime = 0;	3676	cfs_b->runtime = 0;
3677	cfs_b->quota = RUNTIME_INF;	3677	cfs_b->quota = RUNTIME_INF;
3678	cfs_b->period = ns_to_ktime(default_cfs_period());	3678	cfs_b->period = ns_to_ktime(default_cfs_period());
3679		3679
3680	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);	3680	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
3681	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);	3681	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3682	cfs_b->period_timer.function = sched_cfs_period_timer;	3682	cfs_b->period_timer.function = sched_cfs_period_timer;
3683	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);	3683	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3684	cfs_b->slack_timer.function = sched_cfs_slack_timer;	3684	cfs_b->slack_timer.function = sched_cfs_slack_timer;
3685	}	3685	}
3686		3686
3687	static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)	3687	static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3688	{	3688	{
3689	cfs_rq->runtime_enabled = 0;	3689	cfs_rq->runtime_enabled = 0;
3690	INIT_LIST_HEAD(&cfs_rq->throttled_list);	3690	INIT_LIST_HEAD(&cfs_rq->throttled_list);
3691	}	3691	}
3692		3692
3693	/* requires cfs_b->lock, may release to reprogram timer */	3693	/* requires cfs_b->lock, may release to reprogram timer */
3694	void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)	3694	void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
3695	{	3695	{
3696	/*	3696	/*
3697	* The timer may be active because we're trying to set a new bandwidth	3697	* The timer may be active because we're trying to set a new bandwidth
3698	* period or because we're racing with the tear-down path	3698	* period or because we're racing with the tear-down path
3699	* (timer_active==0 becomes visible before the hrtimer call-back	3699	* (timer_active==0 becomes visible before the hrtimer call-back
3700	* terminates). In either case we ensure that it's re-programmed	3700	* terminates). In either case we ensure that it's re-programmed
3701	*/	3701	*/
3702	while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&	3702	while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
3703	hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {	3703	hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
3704	/* bounce the lock to allow do_sched_cfs_period_timer to run */	3704	/* bounce the lock to allow do_sched_cfs_period_timer to run */
3705	raw_spin_unlock(&cfs_b->lock);	3705	raw_spin_unlock(&cfs_b->lock);
3706	cpu_relax();	3706	cpu_relax();
3707	raw_spin_lock(&cfs_b->lock);	3707	raw_spin_lock(&cfs_b->lock);
3708	/* if someone else restarted the timer then we're done */	3708	/* if someone else restarted the timer then we're done */
3709	if (cfs_b->timer_active)	3709	if (!force && cfs_b->timer_active)
3710	return;	3710	return;
3711	}	3711	}
3712		3712
3713	cfs_b->timer_active = 1;	3713	cfs_b->timer_active = 1;
3714	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);	3714	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
3715	}	3715	}
3716		3716
3717	static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)	3717	static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3718	{	3718	{
3719	hrtimer_cancel(&cfs_b->period_timer);	3719	hrtimer_cancel(&cfs_b->period_timer);
3720	hrtimer_cancel(&cfs_b->slack_timer);	3720	hrtimer_cancel(&cfs_b->slack_timer);
3721	}	3721	}
3722		3722
3723	static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)	3723	static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
3724	{	3724	{
3725	struct cfs_rq *cfs_rq;	3725	struct cfs_rq *cfs_rq;
3726		3726
3727	for_each_leaf_cfs_rq(rq, cfs_rq) {	3727	for_each_leaf_cfs_rq(rq, cfs_rq) {
3728	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);	3728	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3729		3729
3730	if (!cfs_rq->runtime_enabled)	3730	if (!cfs_rq->runtime_enabled)
3731	continue;	3731	continue;
3732		3732
3733	/*	3733	/*
3734	* clock_task is not advancing so we just need to make sure	3734	* clock_task is not advancing so we just need to make sure
3735	* there's some valid quota amount	3735	* there's some valid quota amount
3736	*/	3736	*/
3737	cfs_rq->runtime_remaining = cfs_b->quota;	3737	cfs_rq->runtime_remaining = cfs_b->quota;
3738	if (cfs_rq_throttled(cfs_rq))	3738	if (cfs_rq_throttled(cfs_rq))
3739	unthrottle_cfs_rq(cfs_rq);	3739	unthrottle_cfs_rq(cfs_rq);
3740	}	3740	}
3741	}	3741	}
3742		3742
3743	#else /* CONFIG_CFS_BANDWIDTH */	3743	#else /* CONFIG_CFS_BANDWIDTH */
3744	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)	3744	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3745	{	3745	{
3746	return rq_clock_task(rq_of(cfs_rq));	3746	return rq_clock_task(rq_of(cfs_rq));
3747	}	3747	}
3748		3748
3749	static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}	3749	static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
3750	static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }	3750	static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
3751	static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}	3751	static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
3752	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}	3752	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
3753		3753
3754	static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)	3754	static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
3755	{	3755	{
3756	return 0;	3756	return 0;
3757	}	3757	}
3758		3758
3759	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)	3759	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
3760	{	3760	{
3761	return 0;	3761	return 0;
3762	}	3762	}
3763		3763
3764	static inline int throttled_lb_pair(struct task_group *tg,	3764	static inline int throttled_lb_pair(struct task_group *tg,
3765	int src_cpu, int dest_cpu)	3765	int src_cpu, int dest_cpu)
3766	{	3766	{
3767	return 0;	3767	return 0;
3768	}	3768	}
3769		3769
3770	void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}	3770	void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
3771		3771
3772	#ifdef CONFIG_FAIR_GROUP_SCHED	3772	#ifdef CONFIG_FAIR_GROUP_SCHED
3773	static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}	3773	static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
3774	#endif	3774	#endif
3775		3775
3776	static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)	3776	static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3777	{	3777	{
3778	return NULL;	3778	return NULL;
3779	}	3779	}
3780	static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}	3780	static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
3781	static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}	3781	static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
3782		3782
3783	#endif /* CONFIG_CFS_BANDWIDTH */	3783	#endif /* CONFIG_CFS_BANDWIDTH */
3784		3784
3785	/**************************************************	3785	/**************************************************
3786	* CFS operations on tasks:	3786	* CFS operations on tasks:
3787	*/	3787	*/
3788		3788
3789	#ifdef CONFIG_SCHED_HRTICK	3789	#ifdef CONFIG_SCHED_HRTICK
3790	static void hrtick_start_fair(struct rq *rq, struct task_struct *p)	3790	static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
3791	{	3791	{
3792	struct sched_entity *se = &p->se;	3792	struct sched_entity *se = &p->se;
3793	struct cfs_rq *cfs_rq = cfs_rq_of(se);	3793	struct cfs_rq *cfs_rq = cfs_rq_of(se);
3794		3794
3795	WARN_ON(task_rq(p) != rq);	3795	WARN_ON(task_rq(p) != rq);
3796		3796
3797	if (cfs_rq->nr_running > 1) {	3797	if (cfs_rq->nr_running > 1) {
3798	u64 slice = sched_slice(cfs_rq, se);	3798	u64 slice = sched_slice(cfs_rq, se);
3799	u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;	3799	u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
3800	s64 delta = slice - ran;	3800	s64 delta = slice - ran;
3801		3801
3802	if (delta < 0) {	3802	if (delta < 0) {
3803	if (rq->curr == p)	3803	if (rq->curr == p)
3804	resched_task(p);	3804	resched_task(p);
3805	return;	3805	return;
3806	}	3806	}
3807		3807
3808	/*	3808	/*
3809	* Don't schedule slices shorter than 10000ns, that just	3809	* Don't schedule slices shorter than 10000ns, that just
3810	* doesn't make sense. Rely on vruntime for fairness.	3810	* doesn't make sense. Rely on vruntime for fairness.
3811	*/	3811	*/
3812	if (rq->curr != p)	3812	if (rq->curr != p)
3813	delta = max_t(s64, 10000LL, delta);	3813	delta = max_t(s64, 10000LL, delta);
3814		3814
3815	hrtick_start(rq, delta);	3815	hrtick_start(rq, delta);
3816	}	3816	}
3817	}	3817	}
3818		3818
3819	/*	3819	/*
3820	* called from enqueue/dequeue and updates the hrtick when the	3820	* called from enqueue/dequeue and updates the hrtick when the
3821	* current task is from our class and nr_running is low enough	3821	* current task is from our class and nr_running is low enough
3822	* to matter.	3822	* to matter.
3823	*/	3823	*/
3824	static void hrtick_update(struct rq *rq)	3824	static void hrtick_update(struct rq *rq)
3825	{	3825	{
3826	struct task_struct *curr = rq->curr;	3826	struct task_struct *curr = rq->curr;
3827		3827
3828	if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)	3828	if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
3829	return;	3829	return;
3830		3830
3831	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)	3831	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
3832	hrtick_start_fair(rq, curr);	3832	hrtick_start_fair(rq, curr);
3833	}	3833	}
3834	#else /* !CONFIG_SCHED_HRTICK */	3834	#else /* !CONFIG_SCHED_HRTICK */
3835	static inline void	3835	static inline void
3836	hrtick_start_fair(struct rq *rq, struct task_struct *p)	3836	hrtick_start_fair(struct rq *rq, struct task_struct *p)
3837	{	3837	{
3838	}	3838	}
3839		3839
3840	static inline void hrtick_update(struct rq *rq)	3840	static inline void hrtick_update(struct rq *rq)
3841	{	3841	{
3842	}	3842	}
3843	#endif	3843	#endif
3844		3844
3845	/*	3845	/*
3846	* The enqueue_task method is called before nr_running is	3846	* The enqueue_task method is called before nr_running is
3847	* increased. Here we update the fair scheduling stats and	3847	* increased. Here we update the fair scheduling stats and
3848	* then put the task into the rbtree:	3848	* then put the task into the rbtree:
3849	*/	3849	*/
3850	static void	3850	static void
3851	enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)	3851	enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
3852	{	3852	{
3853	struct cfs_rq *cfs_rq;	3853	struct cfs_rq *cfs_rq;
3854	struct sched_entity *se = &p->se;	3854	struct sched_entity *se = &p->se;
3855		3855
3856	for_each_sched_entity(se) {	3856	for_each_sched_entity(se) {
3857	if (se->on_rq)	3857	if (se->on_rq)
3858	break;	3858	break;
3859	cfs_rq = cfs_rq_of(se);	3859	cfs_rq = cfs_rq_of(se);
3860	enqueue_entity(cfs_rq, se, flags);	3860	enqueue_entity(cfs_rq, se, flags);
3861		3861
3862	/*	3862	/*
3863	* end evaluation on encountering a throttled cfs_rq	3863	* end evaluation on encountering a throttled cfs_rq
3864	*	3864	*
3865	* note: in the case of encountering a throttled cfs_rq we will	3865	* note: in the case of encountering a throttled cfs_rq we will
3866	* post the final h_nr_running increment below.	3866	* post the final h_nr_running increment below.
3867	*/	3867	*/
3868	if (cfs_rq_throttled(cfs_rq))	3868	if (cfs_rq_throttled(cfs_rq))
3869	break;	3869	break;
3870	cfs_rq->h_nr_running++;	3870	cfs_rq->h_nr_running++;
3871		3871
3872	flags = ENQUEUE_WAKEUP;	3872	flags = ENQUEUE_WAKEUP;
3873	}	3873	}
3874		3874
3875	for_each_sched_entity(se) {	3875	for_each_sched_entity(se) {
3876	cfs_rq = cfs_rq_of(se);	3876	cfs_rq = cfs_rq_of(se);
3877	cfs_rq->h_nr_running++;	3877	cfs_rq->h_nr_running++;
3878		3878
3879	if (cfs_rq_throttled(cfs_rq))	3879	if (cfs_rq_throttled(cfs_rq))
3880	break;	3880	break;
3881		3881
3882	update_cfs_shares(cfs_rq);	3882	update_cfs_shares(cfs_rq);
3883	update_entity_load_avg(se, 1);	3883	update_entity_load_avg(se, 1);
3884	}	3884	}
3885		3885
3886	if (!se) {	3886	if (!se) {
3887	update_rq_runnable_avg(rq, rq->nr_running);	3887	update_rq_runnable_avg(rq, rq->nr_running);
3888	inc_nr_running(rq);	3888	inc_nr_running(rq);
3889	}	3889	}
3890	hrtick_update(rq);	3890	hrtick_update(rq);
3891	}	3891	}
3892		3892
3893	static void set_next_buddy(struct sched_entity *se);	3893	static void set_next_buddy(struct sched_entity *se);
3894		3894
3895	/*	3895	/*
3896	* The dequeue_task method is called before nr_running is	3896	* The dequeue_task method is called before nr_running is
3897	* decreased. We remove the task from the rbtree and	3897	* decreased. We remove the task from the rbtree and
3898	* update the fair scheduling stats:	3898	* update the fair scheduling stats:
3899	*/	3899	*/
3900	static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)	3900	static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
3901	{	3901	{
3902	struct cfs_rq *cfs_rq;	3902	struct cfs_rq *cfs_rq;
3903	struct sched_entity *se = &p->se;	3903	struct sched_entity *se = &p->se;
3904	int task_sleep = flags & DEQUEUE_SLEEP;	3904	int task_sleep = flags & DEQUEUE_SLEEP;
3905		3905
3906	for_each_sched_entity(se) {	3906	for_each_sched_entity(se) {
3907	cfs_rq = cfs_rq_of(se);	3907	cfs_rq = cfs_rq_of(se);
3908	dequeue_entity(cfs_rq, se, flags);	3908	dequeue_entity(cfs_rq, se, flags);
3909		3909
3910	/*	3910	/*
3911	* end evaluation on encountering a throttled cfs_rq	3911	* end evaluation on encountering a throttled cfs_rq
3912	*	3912	*
3913	* note: in the case of encountering a throttled cfs_rq we will	3913	* note: in the case of encountering a throttled cfs_rq we will
3914	* post the final h_nr_running decrement below.	3914	* post the final h_nr_running decrement below.
3915	*/	3915	*/
3916	if (cfs_rq_throttled(cfs_rq))	3916	if (cfs_rq_throttled(cfs_rq))
3917	break;	3917	break;
3918	cfs_rq->h_nr_running--;	3918	cfs_rq->h_nr_running--;
3919		3919
3920	/* Don't dequeue parent if it has other entities besides us */	3920	/* Don't dequeue parent if it has other entities besides us */
3921	if (cfs_rq->load.weight) {	3921	if (cfs_rq->load.weight) {
3922	/*	3922	/*
3923	* Bias pick_next to pick a task from this cfs_rq, as	3923	* Bias pick_next to pick a task from this cfs_rq, as
3924	* p is sleeping when it is within its sched_slice.	3924	* p is sleeping when it is within its sched_slice.
3925	*/	3925	*/
3926	if (task_sleep && parent_entity(se))	3926	if (task_sleep && parent_entity(se))
3927	set_next_buddy(parent_entity(se));	3927	set_next_buddy(parent_entity(se));
3928		3928
3929	/* avoid re-evaluating load for this entity */	3929	/* avoid re-evaluating load for this entity */
3930	se = parent_entity(se);	3930	se = parent_entity(se);
3931	break;	3931	break;
3932	}	3932	}
3933	flags |= DEQUEUE_SLEEP;	3933	flags |= DEQUEUE_SLEEP;
3934	}	3934	}
3935		3935
3936	for_each_sched_entity(se) {	3936	for_each_sched_entity(se) {
3937	cfs_rq = cfs_rq_of(se);	3937	cfs_rq = cfs_rq_of(se);
3938	cfs_rq->h_nr_running--;	3938	cfs_rq->h_nr_running--;
3939		3939
3940	if (cfs_rq_throttled(cfs_rq))	3940	if (cfs_rq_throttled(cfs_rq))
3941	break;	3941	break;
3942		3942
3943	update_cfs_shares(cfs_rq);	3943	update_cfs_shares(cfs_rq);
3944	update_entity_load_avg(se, 1);	3944	update_entity_load_avg(se, 1);
3945	}	3945	}
3946		3946
3947	if (!se) {	3947	if (!se) {
3948	dec_nr_running(rq);	3948	dec_nr_running(rq);
3949	update_rq_runnable_avg(rq, 1);	3949	update_rq_runnable_avg(rq, 1);
3950	}	3950	}
3951	hrtick_update(rq);	3951	hrtick_update(rq);
3952	}	3952	}
3953		3953
3954	#ifdef CONFIG_SMP	3954	#ifdef CONFIG_SMP
3955	/* Used instead of source_load when we know the type == 0 */	3955	/* Used instead of source_load when we know the type == 0 */
3956	static unsigned long weighted_cpuload(const int cpu)	3956	static unsigned long weighted_cpuload(const int cpu)
3957	{	3957	{
3958	return cpu_rq(cpu)->cfs.runnable_load_avg;	3958	return cpu_rq(cpu)->cfs.runnable_load_avg;
3959	}	3959	}
3960		3960
3961	/*	3961	/*
3962	* Return a low guess at the load of a migration-source cpu weighted	3962	* Return a low guess at the load of a migration-source cpu weighted
3963	* according to the scheduling class and "nice" value.	3963	* according to the scheduling class and "nice" value.
3964	*	3964	*
3965	* We want to under-estimate the load of migration sources, to	3965	* We want to under-estimate the load of migration sources, to
3966	* balance conservatively.	3966	* balance conservatively.
3967	*/	3967	*/
3968	static unsigned long source_load(int cpu, int type)	3968	static unsigned long source_load(int cpu, int type)
3969	{	3969	{
3970	struct rq *rq = cpu_rq(cpu);	3970	struct rq *rq = cpu_rq(cpu);
3971	unsigned long total = weighted_cpuload(cpu);	3971	unsigned long total = weighted_cpuload(cpu);
3972		3972
3973	if (type == 0 || !sched_feat(LB_BIAS))	3973	if (type == 0 || !sched_feat(LB_BIAS))
3974	return total;	3974	return total;
3975		3975
3976	return min(rq->cpu_load[type-1], total);	3976	return min(rq->cpu_load[type-1], total);
3977	}	3977	}
3978		3978
3979	/*	3979	/*
3980	* Return a high guess at the load of a migration-target cpu weighted	3980	* Return a high guess at the load of a migration-target cpu weighted
3981	* according to the scheduling class and "nice" value.	3981	* according to the scheduling class and "nice" value.
3982	*/	3982	*/
3983	static unsigned long target_load(int cpu, int type)	3983	static unsigned long target_load(int cpu, int type)
3984	{	3984	{
3985	struct rq *rq = cpu_rq(cpu);	3985	struct rq *rq = cpu_rq(cpu);
3986	unsigned long total = weighted_cpuload(cpu);	3986	unsigned long total = weighted_cpuload(cpu);
3987		3987
3988	if (type == 0 || !sched_feat(LB_BIAS))	3988	if (type == 0 || !sched_feat(LB_BIAS))
3989	return total;	3989	return total;
3990		3990
3991	return max(rq->cpu_load[type-1], total);	3991	return max(rq->cpu_load[type-1], total);
3992	}	3992	}
3993		3993
3994	static unsigned long power_of(int cpu)	3994	static unsigned long power_of(int cpu)
3995	{	3995	{
3996	return cpu_rq(cpu)->cpu_power;	3996	return cpu_rq(cpu)->cpu_power;
3997	}	3997	}
3998		3998
3999	static unsigned long cpu_avg_load_per_task(int cpu)	3999	static unsigned long cpu_avg_load_per_task(int cpu)
4000	{	4000	{
4001	struct rq *rq = cpu_rq(cpu);	4001	struct rq *rq = cpu_rq(cpu);
4002	unsigned long nr_running = ACCESS_ONCE(rq->nr_running);	4002	unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
4003	unsigned long load_avg = rq->cfs.runnable_load_avg;	4003	unsigned long load_avg = rq->cfs.runnable_load_avg;
4004		4004
4005	if (nr_running)	4005	if (nr_running)
4006	return load_avg / nr_running;	4006	return load_avg / nr_running;
4007		4007
4008	return 0;	4008	return 0;
4009	}	4009	}
4010		4010
4011	static void record_wakee(struct task_struct *p)	4011	static void record_wakee(struct task_struct *p)
4012	{	4012	{
4013	/*	4013	/*
4014	* Rough decay (wiping) for cost saving, don't worry	4014	* Rough decay (wiping) for cost saving, don't worry
4015	* about the boundary, really active task won't care	4015	* about the boundary, really active task won't care
4016	* about the loss.	4016	* about the loss.
4017	*/	4017	*/
4018	if (jiffies > current->wakee_flip_decay_ts + HZ) {	4018	if (jiffies > current->wakee_flip_decay_ts + HZ) {
4019	current->wakee_flips = 0;	4019	current->wakee_flips = 0;
4020	current->wakee_flip_decay_ts = jiffies;	4020	current->wakee_flip_decay_ts = jiffies;
4021	}	4021	}
4022		4022
4023	if (current->last_wakee != p) {	4023	if (current->last_wakee != p) {
4024	current->last_wakee = p;	4024	current->last_wakee = p;
4025	current->wakee_flips++;	4025	current->wakee_flips++;
4026	}	4026	}
4027	}	4027	}
4028		4028
4029	static void task_waking_fair(struct task_struct *p)	4029	static void task_waking_fair(struct task_struct *p)
4030	{	4030	{
4031	struct sched_entity *se = &p->se;	4031	struct sched_entity *se = &p->se;
4032	struct cfs_rq *cfs_rq = cfs_rq_of(se);	4032	struct cfs_rq *cfs_rq = cfs_rq_of(se);
4033	u64 min_vruntime;	4033	u64 min_vruntime;
4034		4034
4035	#ifndef CONFIG_64BIT	4035	#ifndef CONFIG_64BIT
4036	u64 min_vruntime_copy;	4036	u64 min_vruntime_copy;
4037		4037
4038	do {	4038	do {
4039	min_vruntime_copy = cfs_rq->min_vruntime_copy;	4039	min_vruntime_copy = cfs_rq->min_vruntime_copy;
4040	smp_rmb();	4040	smp_rmb();
4041	min_vruntime = cfs_rq->min_vruntime;	4041	min_vruntime = cfs_rq->min_vruntime;
4042	} while (min_vruntime != min_vruntime_copy);	4042	} while (min_vruntime != min_vruntime_copy);
4043	#else	4043	#else
4044	min_vruntime = cfs_rq->min_vruntime;	4044	min_vruntime = cfs_rq->min_vruntime;
4045	#endif	4045	#endif
4046		4046
4047	se->vruntime -= min_vruntime;	4047	se->vruntime -= min_vruntime;
4048	record_wakee(p);	4048	record_wakee(p);
4049	}	4049	}
4050		4050
4051	#ifdef CONFIG_FAIR_GROUP_SCHED	4051	#ifdef CONFIG_FAIR_GROUP_SCHED
4052	/*	4052	/*
4053	* effective_load() calculates the load change as seen from the root_task_group	4053	* effective_load() calculates the load change as seen from the root_task_group
4054	*	4054	*
4055	* Adding load to a group doesn't make a group heavier, but can cause movement	4055	* Adding load to a group doesn't make a group heavier, but can cause movement
4056	* of group shares between cpus. Assuming the shares were perfectly aligned one	4056	* of group shares between cpus. Assuming the shares were perfectly aligned one
4057	* can calculate the shift in shares.	4057	* can calculate the shift in shares.
4058	*	4058	*
4059	* Calculate the effective load difference if @wl is added (subtracted) to @tg	4059	* Calculate the effective load difference if @wl is added (subtracted) to @tg
4060	* on this @cpu and results in a total addition (subtraction) of @wg to the	4060	* on this @cpu and results in a total addition (subtraction) of @wg to the
4061	* total group weight.	4061	* total group weight.
4062	*	4062	*
4063	* Given a runqueue weight distribution (rw_i) we can compute a shares	4063	* Given a runqueue weight distribution (rw_i) we can compute a shares
4064	* distribution (s_i) using:	4064	* distribution (s_i) using:
4065	*	4065	*
4066	* s_i = rw_i / \Sum rw_j (1)	4066	* s_i = rw_i / \Sum rw_j (1)
4067	*	4067	*
4068	* Suppose we have 4 CPUs and our @tg is a direct child of the root group and	4068	* Suppose we have 4 CPUs and our @tg is a direct child of the root group and
4069	* has 7 equal weight tasks, distributed as below (rw_i), with the resulting	4069	* has 7 equal weight tasks, distributed as below (rw_i), with the resulting
4070	* shares distribution (s_i):	4070	* shares distribution (s_i):
4071	*	4071	*
4072	* rw_i = { 2, 4, 1, 0 }	4072	* rw_i = { 2, 4, 1, 0 }
4073	* s_i = { 2/7, 4/7, 1/7, 0 }	4073	* s_i = { 2/7, 4/7, 1/7, 0 }
4074	*	4074	*
4075	* As per wake_affine() we're interested in the load of two CPUs (the CPU the	4075	* As per wake_affine() we're interested in the load of two CPUs (the CPU the
4076	* task used to run on and the CPU the waker is running on), we need to	4076	* task used to run on and the CPU the waker is running on), we need to
4077	* compute the effect of waking a task on either CPU and, in case of a sync	4077	* compute the effect of waking a task on either CPU and, in case of a sync
4078	* wakeup, compute the effect of the current task going to sleep.	4078	* wakeup, compute the effect of the current task going to sleep.
4079	*	4079	*
4080	* So for a change of @wl to the local @cpu with an overall group weight change	4080	* So for a change of @wl to the local @cpu with an overall group weight change
4081	* of @wl we can compute the new shares distribution (s'_i) using:	4081	* of @wl we can compute the new shares distribution (s'_i) using:
4082	*	4082	*
4083	* s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)	4083	* s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
4084	*	4084	*
4085	* Suppose we're interested in CPUs 0 and 1, and want to compute the load	4085	* Suppose we're interested in CPUs 0 and 1, and want to compute the load
4086	* differences in waking a task to CPU 0. The additional task changes the	4086	* differences in waking a task to CPU 0. The additional task changes the
4087	* weight and shares distributions like:	4087	* weight and shares distributions like:
4088	*	4088	*
4089	* rw'_i = { 3, 4, 1, 0 }	4089	* rw'_i = { 3, 4, 1, 0 }
4090	* s'_i = { 3/8, 4/8, 1/8, 0 }	4090	* s'_i = { 3/8, 4/8, 1/8, 0 }
4091	*	4091	*
4092	* We can then compute the difference in effective weight by using:	4092	* We can then compute the difference in effective weight by using:
4093	*	4093	*
4094	* dw_i = S * (s'_i - s_i) (3)	4094	* dw_i = S * (s'_i - s_i) (3)
4095	*	4095	*
4096	* Where 'S' is the group weight as seen by its parent.	4096	* Where 'S' is the group weight as seen by its parent.
4097	*	4097	*
4098	* Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)	4098	* Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
4099	* times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -	4099	* times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
4100	* 4/7) times the weight of the group.	4100	* 4/7) times the weight of the group.
4101	*/	4101	*/
4102	static long effective_load(struct task_group *tg, int cpu, long wl, long wg)	4102	static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4103	{	4103	{
4104	struct sched_entity *se = tg->se[cpu];	4104	struct sched_entity *se = tg->se[cpu];
4105		4105
4106	if (!tg->parent) /* the trivial, non-cgroup case */	4106	if (!tg->parent) /* the trivial, non-cgroup case */
4107	return wl;	4107	return wl;
4108		4108
4109	for_each_sched_entity(se) {	4109	for_each_sched_entity(se) {
4110	long w, W;	4110	long w, W;
4111		4111
4112	tg = se->my_q->tg;	4112	tg = se->my_q->tg;
4113		4113
4114	/*	4114	/*
4115	* W = @wg + \Sum rw_j	4115	* W = @wg + \Sum rw_j
4116	*/	4116	*/
4117	W = wg + calc_tg_weight(tg, se->my_q);	4117	W = wg + calc_tg_weight(tg, se->my_q);
4118		4118
4119	/*	4119	/*
4120	* w = rw_i + @wl	4120	* w = rw_i + @wl
4121	*/	4121	*/
4122	w = se->my_q->load.weight + wl;	4122	w = se->my_q->load.weight + wl;
4123		4123
4124	/*	4124	/*
4125	* wl = S * s'_i; see (2)	4125	* wl = S * s'_i; see (2)
4126	*/	4126	*/
4127	if (W > 0 && w < W)	4127	if (W > 0 && w < W)
4128	wl = (w * tg->shares) / W;	4128	wl = (w * tg->shares) / W;
4129	else	4129	else
4130	wl = tg->shares;	4130	wl = tg->shares;
4131		4131
4132	/*	4132	/*
4133	* Per the above, wl is the new se->load.weight value; since	4133	* Per the above, wl is the new se->load.weight value; since
4134	* those are clipped to [MIN_SHARES, ...) do so now. See	4134	* those are clipped to [MIN_SHARES, ...) do so now. See
4135	* calc_cfs_shares().	4135	* calc_cfs_shares().
4136	*/	4136	*/
4137	if (wl < MIN_SHARES)	4137	if (wl < MIN_SHARES)
4138	wl = MIN_SHARES;	4138	wl = MIN_SHARES;
4139		4139
4140	/*	4140	/*
4141	* wl = dw_i = S * (s'_i - s_i); see (3)	4141	* wl = dw_i = S * (s'_i - s_i); see (3)
4142	*/	4142	*/
4143	wl -= se->load.weight;	4143	wl -= se->load.weight;
4144		4144
4145	/*	4145	/*
4146	* Recursively apply this logic to all parent groups to compute	4146	* Recursively apply this logic to all parent groups to compute
4147	* the final effective load change on the root group. Since	4147	* the final effective load change on the root group. Since
4148	* only the @tg group gets extra weight, all parent groups can	4148	* only the @tg group gets extra weight, all parent groups can
4149	* only redistribute existing shares. @wl is the shift in shares	4149	* only redistribute existing shares. @wl is the shift in shares
4150	* resulting from this level per the above.	4150	* resulting from this level per the above.
4151	*/	4151	*/
4152	wg = 0;	4152	wg = 0;
4153	}	4153	}
4154		4154
4155	return wl;	4155	return wl;
4156	}	4156	}
4157	#else	4157	#else
4158		4158
4159	static long effective_load(struct task_group *tg, int cpu, long wl, long wg)	4159	static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4160	{	4160	{
4161	return wl;	4161	return wl;
4162	}	4162	}
4163		4163
4164	#endif	4164	#endif
4165		4165
4166	static int wake_wide(struct task_struct *p)	4166	static int wake_wide(struct task_struct *p)
4167	{	4167	{
4168	int factor = this_cpu_read(sd_llc_size);	4168	int factor = this_cpu_read(sd_llc_size);
4169		4169
4170	/*	4170	/*
4171	* Yeah, it's the switching-frequency, could means many wakee or	4171	* Yeah, it's the switching-frequency, could means many wakee or
4172	* rapidly switch, use factor here will just help to automatically	4172	* rapidly switch, use factor here will just help to automatically
4173	* adjust the loose-degree, so bigger node will lead to more pull.	4173	* adjust the loose-degree, so bigger node will lead to more pull.
4174	*/	4174	*/
4175	if (p->wakee_flips > factor) {	4175	if (p->wakee_flips > factor) {
4176	/*	4176	/*
4177	* wakee is somewhat hot, it needs certain amount of cpu	4177	* wakee is somewhat hot, it needs certain amount of cpu
4178	* resource, so if waker is far more hot, prefer to leave	4178	* resource, so if waker is far more hot, prefer to leave
4179	* it alone.	4179	* it alone.
4180	*/	4180	*/
4181	if (current->wakee_flips > (factor * p->wakee_flips))	4181	if (current->wakee_flips > (factor * p->wakee_flips))
4182	return 1;	4182	return 1;
4183	}	4183	}
4184		4184
4185	return 0;	4185	return 0;
4186	}	4186	}
4187		4187
4188	static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)	4188	static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4189	{	4189	{
4190	s64 this_load, load;	4190	s64 this_load, load;
4191	int idx, this_cpu, prev_cpu;	4191	int idx, this_cpu, prev_cpu;
4192	unsigned long tl_per_task;	4192	unsigned long tl_per_task;
4193	struct task_group *tg;	4193	struct task_group *tg;
4194	unsigned long weight;	4194	unsigned long weight;
4195	int balanced;	4195	int balanced;
4196		4196
4197	/*	4197	/*
4198	* If we wake multiple tasks be careful to not bounce	4198	* If we wake multiple tasks be careful to not bounce
4199	* ourselves around too much.	4199	* ourselves around too much.
4200	*/	4200	*/
4201	if (wake_wide(p))	4201	if (wake_wide(p))
4202	return 0;	4202	return 0;
4203		4203
4204	idx = sd->wake_idx;	4204	idx = sd->wake_idx;
4205	this_cpu = smp_processor_id();	4205	this_cpu = smp_processor_id();
4206	prev_cpu = task_cpu(p);	4206	prev_cpu = task_cpu(p);
4207	load = source_load(prev_cpu, idx);	4207	load = source_load(prev_cpu, idx);
4208	this_load = target_load(this_cpu, idx);	4208	this_load = target_load(this_cpu, idx);
4209		4209
4210	/*	4210	/*
4211	* If sync wakeup then subtract the (maximum possible)	4211	* If sync wakeup then subtract the (maximum possible)
4212	* effect of the currently running task from the load	4212	* effect of the currently running task from the load
4213	* of the current CPU:	4213	* of the current CPU:
4214	*/	4214	*/
4215	if (sync) {	4215	if (sync) {
4216	tg = task_group(current);	4216	tg = task_group(current);
4217	weight = current->se.load.weight;	4217	weight = current->se.load.weight;
4218		4218
4219	this_load += effective_load(tg, this_cpu, -weight, -weight);	4219	this_load += effective_load(tg, this_cpu, -weight, -weight);
4220	load += effective_load(tg, prev_cpu, 0, -weight);	4220	load += effective_load(tg, prev_cpu, 0, -weight);
4221	}	4221	}
4222		4222
4223	tg = task_group(p);	4223	tg = task_group(p);
4224	weight = p->se.load.weight;	4224	weight = p->se.load.weight;
4225		4225
4226	/*	4226	/*
4227	* In low-load situations, where prev_cpu is idle and this_cpu is idle	4227	* In low-load situations, where prev_cpu is idle and this_cpu is idle
4228	* due to the sync cause above having dropped this_load to 0, we'll	4228	* due to the sync cause above having dropped this_load to 0, we'll
4229	* always have an imbalance, but there's really nothing you can do	4229	* always have an imbalance, but there's really nothing you can do
4230	* about that, so that's good too.	4230	* about that, so that's good too.
4231	*	4231	*
4232	* Otherwise check if either cpus are near enough in load to allow this	4232	* Otherwise check if either cpus are near enough in load to allow this
4233	* task to be woken on this_cpu.	4233	* task to be woken on this_cpu.
4234	*/	4234	*/
4235	if (this_load > 0) {	4235	if (this_load > 0) {
4236	s64 this_eff_load, prev_eff_load;	4236	s64 this_eff_load, prev_eff_load;
4237		4237
4238	this_eff_load = 100;	4238	this_eff_load = 100;
4239	this_eff_load *= power_of(prev_cpu);	4239	this_eff_load *= power_of(prev_cpu);
4240	this_eff_load *= this_load +	4240	this_eff_load *= this_load +
4241	effective_load(tg, this_cpu, weight, weight);	4241	effective_load(tg, this_cpu, weight, weight);
4242		4242
4243	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;	4243	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
4244	prev_eff_load *= power_of(this_cpu);	4244	prev_eff_load *= power_of(this_cpu);
4245	prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);	4245	prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
4246		4246
4247	balanced = this_eff_load <= prev_eff_load;	4247	balanced = this_eff_load <= prev_eff_load;
4248	} else	4248	} else
4249	balanced = true;	4249	balanced = true;
4250		4250
4251	/*	4251	/*
4252	* If the currently running task will sleep within	4252	* If the currently running task will sleep within
4253	* a reasonable amount of time then attract this newly	4253	* a reasonable amount of time then attract this newly
4254	* woken task:	4254	* woken task:
4255	*/	4255	*/
4256	if (sync && balanced)	4256	if (sync && balanced)
4257	return 1;	4257	return 1;
4258		4258
4259	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);	4259	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
4260	tl_per_task = cpu_avg_load_per_task(this_cpu);	4260	tl_per_task = cpu_avg_load_per_task(this_cpu);
4261		4261
4262	if (balanced ||	4262	if (balanced ||
4263	(this_load <= load &&	4263	(this_load <= load &&
4264	this_load + target_load(prev_cpu, idx) <= tl_per_task)) {	4264	this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
4265	/*	4265	/*
4266	* This domain has SD_WAKE_AFFINE and	4266	* This domain has SD_WAKE_AFFINE and
4267	* p is cache cold in this domain, and	4267	* p is cache cold in this domain, and
4268	* there is no bad imbalance.	4268	* there is no bad imbalance.
4269	*/	4269	*/
4270	schedstat_inc(sd, ttwu_move_affine);	4270	schedstat_inc(sd, ttwu_move_affine);
4271	schedstat_inc(p, se.statistics.nr_wakeups_affine);	4271	schedstat_inc(p, se.statistics.nr_wakeups_affine);
4272		4272
4273	return 1;	4273	return 1;
4274	}	4274	}
4275	return 0;	4275	return 0;
4276	}	4276	}
4277		4277
4278	/*	4278	/*
4279	* find_idlest_group finds and returns the least busy CPU group within the	4279	* find_idlest_group finds and returns the least busy CPU group within the
4280	* domain.	4280	* domain.
4281	*/	4281	*/
4282	static struct sched_group *	4282	static struct sched_group *
4283	find_idlest_group(struct sched_domain *sd, struct task_struct *p,	4283	find_idlest_group(struct sched_domain *sd, struct task_struct *p,
4284	int this_cpu, int sd_flag)	4284	int this_cpu, int sd_flag)
4285	{	4285	{
4286	struct sched_group *idlest = NULL, *group = sd->groups;	4286	struct sched_group *idlest = NULL, *group = sd->groups;
4287	unsigned long min_load = ULONG_MAX, this_load = 0;	4287	unsigned long min_load = ULONG_MAX, this_load = 0;
4288	int load_idx = sd->forkexec_idx;	4288	int load_idx = sd->forkexec_idx;
4289	int imbalance = 100 + (sd->imbalance_pct-100)/2;	4289	int imbalance = 100 + (sd->imbalance_pct-100)/2;
4290		4290
4291	if (sd_flag & SD_BALANCE_WAKE)	4291	if (sd_flag & SD_BALANCE_WAKE)
4292	load_idx = sd->wake_idx;	4292	load_idx = sd->wake_idx;
4293		4293
4294	do {	4294	do {
4295	unsigned long load, avg_load;	4295	unsigned long load, avg_load;
4296	int local_group;	4296	int local_group;
4297	int i;	4297	int i;
4298		4298
4299	/* Skip over this group if it has no CPUs allowed */	4299	/* Skip over this group if it has no CPUs allowed */
4300	if (!cpumask_intersects(sched_group_cpus(group),	4300	if (!cpumask_intersects(sched_group_cpus(group),
4301	tsk_cpus_allowed(p)))	4301	tsk_cpus_allowed(p)))
4302	continue;	4302	continue;
4303		4303
4304	local_group = cpumask_test_cpu(this_cpu,	4304	local_group = cpumask_test_cpu(this_cpu,
4305	sched_group_cpus(group));	4305	sched_group_cpus(group));
4306		4306
4307	/* Tally up the load of all CPUs in the group */	4307	/* Tally up the load of all CPUs in the group */
4308	avg_load = 0;	4308	avg_load = 0;
4309		4309
4310	for_each_cpu(i, sched_group_cpus(group)) {	4310	for_each_cpu(i, sched_group_cpus(group)) {
4311	/* Bias balancing toward cpus of our domain */	4311	/* Bias balancing toward cpus of our domain */
4312	if (local_group)	4312	if (local_group)
4313	load = source_load(i, load_idx);	4313	load = source_load(i, load_idx);
4314	else	4314	else
4315	load = target_load(i, load_idx);	4315	load = target_load(i, load_idx);
4316		4316
4317	avg_load += load;	4317	avg_load += load;
4318	}	4318	}
4319		4319
4320	/* Adjust by relative CPU power of the group */	4320	/* Adjust by relative CPU power of the group */
4321	avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;	4321	avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
4322		4322
4323	if (local_group) {	4323	if (local_group) {
4324	this_load = avg_load;	4324	this_load = avg_load;
4325	} else if (avg_load < min_load) {	4325	} else if (avg_load < min_load) {
4326	min_load = avg_load;	4326	min_load = avg_load;
4327	idlest = group;	4327	idlest = group;
4328	}	4328	}
4329	} while (group = group->next, group != sd->groups);	4329	} while (group = group->next, group != sd->groups);
4330		4330
4331	if (!idlest || 100*this_load < imbalance*min_load)	4331	if (!idlest || 100*this_load < imbalance*min_load)
4332	return NULL;	4332	return NULL;
4333	return idlest;	4333	return idlest;
4334	}	4334	}
4335		4335
4336	/*	4336	/*
4337	* find_idlest_cpu - find the idlest cpu among the cpus in group.	4337	* find_idlest_cpu - find the idlest cpu among the cpus in group.
4338	*/	4338	*/
4339	static int	4339	static int
4340	find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)	4340	find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
4341	{	4341	{
4342	unsigned long load, min_load = ULONG_MAX;	4342	unsigned long load, min_load = ULONG_MAX;
4343	int idlest = -1;	4343	int idlest = -1;
4344	int i;	4344	int i;
4345		4345
4346	/* Traverse only the allowed CPUs */	4346	/* Traverse only the allowed CPUs */
4347	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {	4347	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
4348	load = weighted_cpuload(i);	4348	load = weighted_cpuload(i);
4349		4349
4350	if (load < min_load || (load == min_load && i == this_cpu)) {	4350	if (load < min_load || (load == min_load && i == this_cpu)) {
4351	min_load = load;	4351	min_load = load;
4352	idlest = i;	4352	idlest = i;
4353	}	4353	}
4354	}	4354	}
4355		4355
4356	return idlest;	4356	return idlest;
4357	}	4357	}
4358		4358
4359	/*	4359	/*
4360	* Try and locate an idle CPU in the sched_domain.	4360	* Try and locate an idle CPU in the sched_domain.
4361	*/	4361	*/
4362	static int select_idle_sibling(struct task_struct *p, int target)	4362	static int select_idle_sibling(struct task_struct *p, int target)
4363	{	4363	{
4364	struct sched_domain *sd;	4364	struct sched_domain *sd;
4365	struct sched_group *sg;	4365	struct sched_group *sg;
4366	int i = task_cpu(p);	4366	int i = task_cpu(p);
4367		4367
4368	if (idle_cpu(target))	4368	if (idle_cpu(target))
4369	return target;	4369	return target;
4370		4370
4371	/*	4371	/*
4372	* If the previous cpu is cache affine and idle, don't be stupid.	4372	* If the previous cpu is cache affine and idle, don't be stupid.
4373	*/	4373	*/
4374	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))	4374	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
4375	return i;	4375	return i;
4376		4376
4377	/*	4377	/*
4378	* Otherwise, iterate the domains and find an eligible idle cpu.	4378	* Otherwise, iterate the domains and find an eligible idle cpu.
4379	*/	4379	*/
4380	sd = rcu_dereference(per_cpu(sd_llc, target));	4380	sd = rcu_dereference(per_cpu(sd_llc, target));
4381	for_each_lower_domain(sd) {	4381	for_each_lower_domain(sd) {
4382	sg = sd->groups;	4382	sg = sd->groups;
4383	do {	4383	do {
4384	if (!cpumask_intersects(sched_group_cpus(sg),	4384	if (!cpumask_intersects(sched_group_cpus(sg),
4385	tsk_cpus_allowed(p)))	4385	tsk_cpus_allowed(p)))
4386	goto next;	4386	goto next;
4387		4387
4388	for_each_cpu(i, sched_group_cpus(sg)) {	4388	for_each_cpu(i, sched_group_cpus(sg)) {
4389	if (i == target || !idle_cpu(i))	4389	if (i == target || !idle_cpu(i))
4390	goto next;	4390	goto next;
4391	}	4391	}
4392		4392
4393	target = cpumask_first_and(sched_group_cpus(sg),	4393	target = cpumask_first_and(sched_group_cpus(sg),
4394	tsk_cpus_allowed(p));	4394	tsk_cpus_allowed(p));
4395	goto done;	4395	goto done;
4396	next:	4396	next:
4397	sg = sg->next;	4397	sg = sg->next;
4398	} while (sg != sd->groups);	4398	} while (sg != sd->groups);
4399	}	4399	}
4400	done:	4400	done:
4401	return target;	4401	return target;
4402	}	4402	}
4403		4403
4404	/*	4404	/*
4405	* select_task_rq_fair: Select target runqueue for the waking task in domains	4405	* select_task_rq_fair: Select target runqueue for the waking task in domains
4406	* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,	4406	* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
4407	* SD_BALANCE_FORK, or SD_BALANCE_EXEC.	4407	* SD_BALANCE_FORK, or SD_BALANCE_EXEC.
4408	*	4408	*
4409	* Balances load by selecting the idlest cpu in the idlest group, or under	4409	* Balances load by selecting the idlest cpu in the idlest group, or under
4410	* certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.	4410	* certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
4411	*	4411	*
4412	* Returns the target cpu number.	4412	* Returns the target cpu number.
4413	*	4413	*
4414	* preempt must be disabled.	4414	* preempt must be disabled.
4415	*/	4415	*/
4416	static int	4416	static int
4417	select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)	4417	select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
4418	{	4418	{
4419	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;	4419	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
4420	int cpu = smp_processor_id();	4420	int cpu = smp_processor_id();
4421	int new_cpu = cpu;	4421	int new_cpu = cpu;
4422	int want_affine = 0;	4422	int want_affine = 0;
4423	int sync = wake_flags & WF_SYNC;	4423	int sync = wake_flags & WF_SYNC;
4424		4424
4425	if (p->nr_cpus_allowed == 1)	4425	if (p->nr_cpus_allowed == 1)
4426	return prev_cpu;	4426	return prev_cpu;
4427		4427
4428	if (sd_flag & SD_BALANCE_WAKE) {	4428	if (sd_flag & SD_BALANCE_WAKE) {
4429	if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))	4429	if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
4430	want_affine = 1;	4430	want_affine = 1;
4431	new_cpu = prev_cpu;	4431	new_cpu = prev_cpu;
4432	}	4432	}
4433		4433
4434	rcu_read_lock();	4434	rcu_read_lock();
4435	for_each_domain(cpu, tmp) {	4435	for_each_domain(cpu, tmp) {
4436	if (!(tmp->flags & SD_LOAD_BALANCE))	4436	if (!(tmp->flags & SD_LOAD_BALANCE))
4437	continue;	4437	continue;
4438		4438
4439	/*	4439	/*
4440	* If both cpu and prev_cpu are part of this domain,	4440	* If both cpu and prev_cpu are part of this domain,
4441	* cpu is a valid SD_WAKE_AFFINE target.	4441	* cpu is a valid SD_WAKE_AFFINE target.
4442	*/	4442	*/
4443	if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&	4443	if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
4444	cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {	4444	cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
4445	affine_sd = tmp;	4445	affine_sd = tmp;
4446	break;	4446	break;
4447	}	4447	}
4448		4448
4449	if (tmp->flags & sd_flag)	4449	if (tmp->flags & sd_flag)
4450	sd = tmp;	4450	sd = tmp;
4451	}	4451	}
4452		4452
4453	if (affine_sd) {	4453	if (affine_sd) {
4454	if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))	4454	if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
4455	prev_cpu = cpu;	4455	prev_cpu = cpu;
4456		4456
4457	new_cpu = select_idle_sibling(p, prev_cpu);	4457	new_cpu = select_idle_sibling(p, prev_cpu);
4458	goto unlock;	4458	goto unlock;
4459	}	4459	}
4460		4460
4461	while (sd) {	4461	while (sd) {
4462	struct sched_group *group;	4462	struct sched_group *group;
4463	int weight;	4463	int weight;
4464		4464
4465	if (!(sd->flags & sd_flag)) {	4465	if (!(sd->flags & sd_flag)) {
4466	sd = sd->child;	4466	sd = sd->child;
4467	continue;	4467	continue;
4468	}	4468	}
4469		4469
4470	group = find_idlest_group(sd, p, cpu, sd_flag);	4470	group = find_idlest_group(sd, p, cpu, sd_flag);
4471	if (!group) {	4471	if (!group) {
4472	sd = sd->child;	4472	sd = sd->child;
4473	continue;	4473	continue;
4474	}	4474	}
4475		4475
4476	new_cpu = find_idlest_cpu(group, p, cpu);	4476	new_cpu = find_idlest_cpu(group, p, cpu);
4477	if (new_cpu == -1 || new_cpu == cpu) {	4477	if (new_cpu == -1 || new_cpu == cpu) {
4478	/* Now try balancing at a lower domain level of cpu */	4478	/* Now try balancing at a lower domain level of cpu */
4479	sd = sd->child;	4479	sd = sd->child;
4480	continue;	4480	continue;
4481	}	4481	}
4482		4482
4483	/* Now try balancing at a lower domain level of new_cpu */	4483	/* Now try balancing at a lower domain level of new_cpu */
4484	cpu = new_cpu;	4484	cpu = new_cpu;
4485	weight = sd->span_weight;	4485	weight = sd->span_weight;
4486	sd = NULL;	4486	sd = NULL;
4487	for_each_domain(cpu, tmp) {	4487	for_each_domain(cpu, tmp) {
4488	if (weight <= tmp->span_weight)	4488	if (weight <= tmp->span_weight)
4489	break;	4489	break;
4490	if (tmp->flags & sd_flag)	4490	if (tmp->flags & sd_flag)
4491	sd = tmp;	4491	sd = tmp;
4492	}	4492	}
4493	/* while loop will break here if sd == NULL */	4493	/* while loop will break here if sd == NULL */
4494	}	4494	}
4495	unlock:	4495	unlock:
4496	rcu_read_unlock();	4496	rcu_read_unlock();
4497		4497
4498	return new_cpu;	4498	return new_cpu;
4499	}	4499	}
4500		4500
4501	/*	4501	/*
4502	* Called immediately before a task is migrated to a new cpu; task_cpu(p) and	4502	* Called immediately before a task is migrated to a new cpu; task_cpu(p) and
4503	* cfs_rq_of(p) references at time of call are still valid and identify the	4503	* cfs_rq_of(p) references at time of call are still valid and identify the
4504	* previous cpu. However, the caller only guarantees p->pi_lock is held; no	4504	* previous cpu. However, the caller only guarantees p->pi_lock is held; no
4505	* other assumptions, including the state of rq->lock, should be made.	4505	* other assumptions, including the state of rq->lock, should be made.
4506	*/	4506	*/
4507	static void	4507	static void
4508	migrate_task_rq_fair(struct task_struct *p, int next_cpu)	4508	migrate_task_rq_fair(struct task_struct *p, int next_cpu)
4509	{	4509	{
4510	struct sched_entity *se = &p->se;	4510	struct sched_entity *se = &p->se;
4511	struct cfs_rq *cfs_rq = cfs_rq_of(se);	4511	struct cfs_rq *cfs_rq = cfs_rq_of(se);
4512		4512
4513	/*	4513	/*
4514	* Load tracking: accumulate removed load so that it can be processed	4514	* Load tracking: accumulate removed load so that it can be processed
4515	* when we next update owning cfs_rq under rq->lock. Tasks contribute	4515	* when we next update owning cfs_rq under rq->lock. Tasks contribute
4516	* to blocked load iff they have a positive decay-count. It can never	4516	* to blocked load iff they have a positive decay-count. It can never
4517	* be negative here since on-rq tasks have decay-count == 0.	4517	* be negative here since on-rq tasks have decay-count == 0.
4518	*/	4518	*/
4519	if (se->avg.decay_count) {	4519	if (se->avg.decay_count) {
4520	se->avg.decay_count = -__synchronize_entity_decay(se);	4520	se->avg.decay_count = -__synchronize_entity_decay(se);
4521	atomic_long_add(se->avg.load_avg_contrib,	4521	atomic_long_add(se->avg.load_avg_contrib,
4522	&cfs_rq->removed_load);	4522	&cfs_rq->removed_load);
4523	}	4523	}
4524	}	4524	}
4525	#endif /* CONFIG_SMP */	4525	#endif /* CONFIG_SMP */
4526		4526
4527	static unsigned long	4527	static unsigned long
4528	wakeup_gran(struct sched_entity *curr, struct sched_entity *se)	4528	wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
4529	{	4529	{
4530	unsigned long gran = sysctl_sched_wakeup_granularity;	4530	unsigned long gran = sysctl_sched_wakeup_granularity;
4531		4531
4532	/*	4532	/*
4533	* Since its curr running now, convert the gran from real-time	4533	* Since its curr running now, convert the gran from real-time
4534	* to virtual-time in his units.	4534	* to virtual-time in his units.
4535	*	4535	*
4536	* By using 'se' instead of 'curr' we penalize light tasks, so	4536	* By using 'se' instead of 'curr' we penalize light tasks, so
4537	* they get preempted easier. That is, if 'se' < 'curr' then	4537	* they get preempted easier. That is, if 'se' < 'curr' then
4538	* the resulting gran will be larger, therefore penalizing the	4538	* the resulting gran will be larger, therefore penalizing the
4539	* lighter, if otoh 'se' > 'curr' then the resulting gran will	4539	* lighter, if otoh 'se' > 'curr' then the resulting gran will
4540	* be smaller, again penalizing the lighter task.	4540	* be smaller, again penalizing the lighter task.
4541	*	4541	*
4542	* This is especially important for buddies when the leftmost	4542	* This is especially important for buddies when the leftmost
4543	* task is higher priority than the buddy.	4543	* task is higher priority than the buddy.
4544	*/	4544	*/
4545	return calc_delta_fair(gran, se);	4545	return calc_delta_fair(gran, se);
4546	}	4546	}
4547		4547
4548	/*	4548	/*
4549	* Should 'se' preempt 'curr'.	4549	* Should 'se' preempt 'curr'.
4550	*	4550	*
4551	*             |s1	4551	*             |s1
4552	*        |s2	4552	*        |s2
4553	*   |s3	4553	*   |s3
4554	*         g	4554	*         g
4555	*      |<--->|c	4555	*      |<--->|c
4556	*	4556	*
4557	* w(c, s1) = -1	4557	* w(c, s1) = -1
4558	* w(c, s2) = 0	4558	* w(c, s2) = 0
4559	* w(c, s3) = 1	4559	* w(c, s3) = 1
4560	*	4560	*
4561	*/	4561	*/
4562	static int	4562	static int
4563	wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)	4563	wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
4564	{	4564	{
4565	s64 gran, vdiff = curr->vruntime - se->vruntime;	4565	s64 gran, vdiff = curr->vruntime - se->vruntime;
4566		4566
4567	if (vdiff <= 0)	4567	if (vdiff <= 0)
4568	return -1;	4568	return -1;
4569		4569
4570	gran = wakeup_gran(curr, se);	4570	gran = wakeup_gran(curr, se);
4571	if (vdiff > gran)	4571	if (vdiff > gran)
4572	return 1;	4572	return 1;
4573		4573
4574	return 0;	4574	return 0;
4575	}	4575	}
4576		4576
4577	static void set_last_buddy(struct sched_entity *se)	4577	static void set_last_buddy(struct sched_entity *se)
4578	{	4578	{
4579	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))	4579	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
4580	return;	4580	return;
4581		4581
4582	for_each_sched_entity(se)	4582	for_each_sched_entity(se)
4583	cfs_rq_of(se)->last = se;	4583	cfs_rq_of(se)->last = se;
4584	}	4584	}
4585		4585
4586	static void set_next_buddy(struct sched_entity *se)	4586	static void set_next_buddy(struct sched_entity *se)
4587	{	4587	{
4588	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))	4588	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
4589	return;	4589	return;
4590		4590
4591	for_each_sched_entity(se)	4591	for_each_sched_entity(se)
4592	cfs_rq_of(se)->next = se;	4592	cfs_rq_of(se)->next = se;
4593	}	4593	}
4594		4594
4595	static void set_skip_buddy(struct sched_entity *se)	4595	static void set_skip_buddy(struct sched_entity *se)
4596	{	4596	{
4597	for_each_sched_entity(se)	4597	for_each_sched_entity(se)
4598	cfs_rq_of(se)->skip = se;	4598	cfs_rq_of(se)->skip = se;
4599	}	4599	}
4600		4600
4601	/*	4601	/*
4602	* Preempt the current task with a newly woken task if needed:	4602	* Preempt the current task with a newly woken task if needed:
4603	*/	4603	*/
4604	static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)	4604	static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
4605	{	4605	{
4606	struct task_struct *curr = rq->curr;	4606	struct task_struct *curr = rq->curr;
4607	struct sched_entity *se = &curr->se, *pse = &p->se;	4607	struct sched_entity *se = &curr->se, *pse = &p->se;
4608	struct cfs_rq *cfs_rq = task_cfs_rq(curr);	4608	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
4609	int scale = cfs_rq->nr_running >= sched_nr_latency;	4609	int scale = cfs_rq->nr_running >= sched_nr_latency;
4610	int next_buddy_marked = 0;	4610	int next_buddy_marked = 0;
4611		4611
4612	if (unlikely(se == pse))	4612	if (unlikely(se == pse))
4613	return;	4613	return;
4614		4614
4615	/*	4615	/*
4616	* This is possible from callers such as move_task(), in which we	4616	* This is possible from callers such as move_task(), in which we
4617	* unconditionally check_preempt_curr() after an enqueue (which may have	4617	* unconditionally check_preempt_curr() after an enqueue (which may have
4618	* led to a throttle). This both saves work and prevents false	4618	* led to a throttle). This both saves work and prevents false
4619	* next-buddy nomination below.	4619	* next-buddy nomination below.
4620	*/	4620	*/
4621	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))	4621	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
4622	return;	4622	return;
4623		4623
4624	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {	4624	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
4625	set_next_buddy(pse);	4625	set_next_buddy(pse);
4626	next_buddy_marked = 1;	4626	next_buddy_marked = 1;
4627	}	4627	}
4628		4628
4629	/*	4629	/*
4630	* We can come here with TIF_NEED_RESCHED already set from new task	4630	* We can come here with TIF_NEED_RESCHED already set from new task
4631	* wake up path.	4631	* wake up path.
4632	*	4632	*
4633	* Note: this also catches the edge-case of curr being in a throttled	4633	* Note: this also catches the edge-case of curr being in a throttled
4634	* group (e.g. via set_curr_task), since update_curr() (in the	4634	* group (e.g. via set_curr_task), since update_curr() (in the
4635	* enqueue of curr) will have resulted in resched being set. This	4635	* enqueue of curr) will have resulted in resched being set. This
4636	* prevents us from potentially nominating it as a false LAST_BUDDY	4636	* prevents us from potentially nominating it as a false LAST_BUDDY
4637	* below.	4637	* below.
4638	*/	4638	*/
4639	if (test_tsk_need_resched(curr))	4639	if (test_tsk_need_resched(curr))
4640	return;	4640	return;
4641		4641
4642	/* Idle tasks are by definition preempted by non-idle tasks. */	4642	/* Idle tasks are by definition preempted by non-idle tasks. */
4643	if (unlikely(curr->policy == SCHED_IDLE) &&	4643	if (unlikely(curr->policy == SCHED_IDLE) &&
4644	likely(p->policy != SCHED_IDLE))	4644	likely(p->policy != SCHED_IDLE))
4645	goto preempt;	4645	goto preempt;
4646		4646
4647	/*	4647	/*
4648	* Batch and idle tasks do not preempt non-idle tasks (their preemption	4648	* Batch and idle tasks do not preempt non-idle tasks (their preemption
4649	* is driven by the tick):	4649	* is driven by the tick):
4650	*/	4650	*/
4651	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))	4651	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
4652	return;	4652	return;
4653		4653
4654	find_matching_se(&se, &pse);	4654	find_matching_se(&se, &pse);
4655	update_curr(cfs_rq_of(se));	4655	update_curr(cfs_rq_of(se));
4656	BUG_ON(!pse);	4656	BUG_ON(!pse);
4657	if (wakeup_preempt_entity(se, pse) == 1) {	4657	if (wakeup_preempt_entity(se, pse) == 1) {
4658	/*	4658	/*
4659	* Bias pick_next to pick the sched entity that is	4659	* Bias pick_next to pick the sched entity that is
4660	* triggering this preemption.	4660	* triggering this preemption.
4661	*/	4661	*/
4662	if (!next_buddy_marked)	4662	if (!next_buddy_marked)
4663	set_next_buddy(pse);	4663	set_next_buddy(pse);
4664	goto preempt;	4664	goto preempt;
4665	}	4665	}
4666		4666
4667	return;	4667	return;
4668		4668
4669	preempt:	4669	preempt:
4670	resched_task(curr);	4670	resched_task(curr);
4671	/*	4671	/*
4672	* Only set the backward buddy when the current task is still	4672	* Only set the backward buddy when the current task is still
4673	* on the rq. This can happen when a wakeup gets interleaved	4673	* on the rq. This can happen when a wakeup gets interleaved
4674	* with schedule on the ->pre_schedule() or idle_balance()	4674	* with schedule on the ->pre_schedule() or idle_balance()
4675	* point, either of which can * drop the rq lock.	4675	* point, either of which can * drop the rq lock.
4676	*	4676	*
4677	* Also, during early boot the idle thread is in the fair class,	4677	* Also, during early boot the idle thread is in the fair class,
4678	* for obvious reasons its a bad idea to schedule back to it.	4678	* for obvious reasons its a bad idea to schedule back to it.
4679	*/	4679	*/
4680	if (unlikely(!se->on_rq || curr == rq->idle))	4680	if (unlikely(!se->on_rq || curr == rq->idle))
4681	return;	4681	return;
4682		4682
4683	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))	4683	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
4684	set_last_buddy(se);	4684	set_last_buddy(se);
4685	}	4685	}
4686		4686
4687	static struct task_struct *	4687	static struct task_struct *
4688	pick_next_task_fair(struct rq *rq, struct task_struct *prev)	4688	pick_next_task_fair(struct rq *rq, struct task_struct *prev)
4689	{	4689	{
4690	struct cfs_rq *cfs_rq = &rq->cfs;	4690	struct cfs_rq *cfs_rq = &rq->cfs;
4691	struct sched_entity *se;	4691	struct sched_entity *se;
4692	struct task_struct *p;	4692	struct task_struct *p;
4693	int new_tasks;	4693	int new_tasks;
4694		4694
4695	again:	4695	again:
4696	#ifdef CONFIG_FAIR_GROUP_SCHED	4696	#ifdef CONFIG_FAIR_GROUP_SCHED
4697	if (!cfs_rq->nr_running)	4697	if (!cfs_rq->nr_running)
4698	goto idle;	4698	goto idle;
4699		4699
4700	if (prev->sched_class != &fair_sched_class)	4700	if (prev->sched_class != &fair_sched_class)
4701	goto simple;	4701	goto simple;
4702		4702
4703	/*	4703	/*
4704	* Because of the set_next_buddy() in dequeue_task_fair() it is rather	4704	* Because of the set_next_buddy() in dequeue_task_fair() it is rather
4705	* likely that a next task is from the same cgroup as the current.	4705	* likely that a next task is from the same cgroup as the current.
4706	*	4706	*
4707	* Therefore attempt to avoid putting and setting the entire cgroup	4707	* Therefore attempt to avoid putting and setting the entire cgroup
4708	* hierarchy, only change the part that actually changes.	4708	* hierarchy, only change the part that actually changes.
4709	*/	4709	*/
4710		4710
4711	do {	4711	do {
4712	struct sched_entity *curr = cfs_rq->curr;	4712	struct sched_entity *curr = cfs_rq->curr;
4713		4713
4714	/*	4714	/*
4715	* Since we got here without doing put_prev_entity() we also	4715	* Since we got here without doing put_prev_entity() we also
4716	* have to consider cfs_rq->curr. If it is still a runnable	4716	* have to consider cfs_rq->curr. If it is still a runnable
4717	* entity, update_curr() will update its vruntime, otherwise	4717	* entity, update_curr() will update its vruntime, otherwise
4718	* forget we've ever seen it.	4718	* forget we've ever seen it.
4719	*/	4719	*/
4720	if (curr && curr->on_rq)	4720	if (curr && curr->on_rq)
4721	update_curr(cfs_rq);	4721	update_curr(cfs_rq);
4722	else	4722	else
4723	curr = NULL;	4723	curr = NULL;
4724		4724
4725	/*	4725	/*
4726	* This call to check_cfs_rq_runtime() will do the throttle and	4726	* This call to check_cfs_rq_runtime() will do the throttle and
4727	* dequeue its entity in the parent(s). Therefore the 'simple'	4727	* dequeue its entity in the parent(s). Therefore the 'simple'
4728	* nr_running test will indeed be correct.	4728	* nr_running test will indeed be correct.
4729	*/	4729	*/
4730	if (unlikely(check_cfs_rq_runtime(cfs_rq)))	4730	if (unlikely(check_cfs_rq_runtime(cfs_rq)))
4731	goto simple;	4731	goto simple;
4732		4732
4733	se = pick_next_entity(cfs_rq, curr);	4733	se = pick_next_entity(cfs_rq, curr);
4734	cfs_rq = group_cfs_rq(se);	4734	cfs_rq = group_cfs_rq(se);
4735	} while (cfs_rq);	4735	} while (cfs_rq);
4736		4736
4737	p = task_of(se);	4737	p = task_of(se);
4738		4738
4739	/*	4739	/*
4740	* Since we haven't yet done put_prev_entity and if the selected task	4740	* Since we haven't yet done put_prev_entity and if the selected task
4741	* is a different task than we started out with, try and touch the	4741	* is a different task than we started out with, try and touch the
4742	* least amount of cfs_rqs.	4742	* least amount of cfs_rqs.
4743	*/	4743	*/
4744	if (prev != p) {	4744	if (prev != p) {
4745	struct sched_entity *pse = &prev->se;	4745	struct sched_entity *pse = &prev->se;
4746		4746
4747	while (!(cfs_rq = is_same_group(se, pse))) {	4747	while (!(cfs_rq = is_same_group(se, pse))) {
4748	int se_depth = se->depth;	4748	int se_depth = se->depth;
4749	int pse_depth = pse->depth;	4749	int pse_depth = pse->depth;
4750		4750
4751	if (se_depth <= pse_depth) {	4751	if (se_depth <= pse_depth) {
4752	put_prev_entity(cfs_rq_of(pse), pse);	4752	put_prev_entity(cfs_rq_of(pse), pse);
4753	pse = parent_entity(pse);	4753	pse = parent_entity(pse);
4754	}	4754	}
4755	if (se_depth >= pse_depth) {	4755	if (se_depth >= pse_depth) {
4756	set_next_entity(cfs_rq_of(se), se);	4756	set_next_entity(cfs_rq_of(se), se);
4757	se = parent_entity(se);	4757	se = parent_entity(se);
4758	}	4758	}
4759	}	4759	}
4760		4760
4761	put_prev_entity(cfs_rq, pse);	4761	put_prev_entity(cfs_rq, pse);
4762	set_next_entity(cfs_rq, se);	4762	set_next_entity(cfs_rq, se);
4763	}	4763	}
4764		4764
4765	if (hrtick_enabled(rq))	4765	if (hrtick_enabled(rq))
4766	hrtick_start_fair(rq, p);	4766	hrtick_start_fair(rq, p);
4767		4767
4768	return p;	4768	return p;
4769	simple:	4769	simple:
4770	cfs_rq = &rq->cfs;	4770	cfs_rq = &rq->cfs;
4771	#endif	4771	#endif
4772		4772
4773	if (!cfs_rq->nr_running)	4773	if (!cfs_rq->nr_running)
4774	goto idle;	4774	goto idle;
4775		4775
4776	put_prev_task(rq, prev);	4776	put_prev_task(rq, prev);
4777		4777
4778	do {	4778	do {
4779	se = pick_next_entity(cfs_rq, NULL);	4779	se = pick_next_entity(cfs_rq, NULL);
4780	set_next_entity(cfs_rq, se);	4780	set_next_entity(cfs_rq, se);
4781	cfs_rq = group_cfs_rq(se);	4781	cfs_rq = group_cfs_rq(se);
4782	} while (cfs_rq);	4782	} while (cfs_rq);
4783		4783
4784	p = task_of(se);	4784	p = task_of(se);
4785		4785
4786	if (hrtick_enabled(rq))	4786	if (hrtick_enabled(rq))
4787	hrtick_start_fair(rq, p);	4787	hrtick_start_fair(rq, p);
4788		4788
4789	return p;	4789	return p;
4790		4790
4791	idle:	4791	idle:
4792	new_tasks = idle_balance(rq);	4792	new_tasks = idle_balance(rq);
4793	/*	4793	/*
4794	* Because idle_balance() releases (and re-acquires) rq->lock, it is	4794	* Because idle_balance() releases (and re-acquires) rq->lock, it is
4795	* possible for any higher priority task to appear. In that case we	4795	* possible for any higher priority task to appear. In that case we
4796	* must re-start the pick_next_entity() loop.	4796	* must re-start the pick_next_entity() loop.
4797	*/	4797	*/
4798	if (new_tasks < 0)	4798	if (new_tasks < 0)
4799	return RETRY_TASK;	4799	return RETRY_TASK;
4800		4800
4801	if (new_tasks > 0)	4801	if (new_tasks > 0)
4802	goto again;	4802	goto again;
4803		4803
4804	return NULL;	4804	return NULL;
4805	}	4805	}
4806		4806
4807	/*	4807	/*
4808	* Account for a descheduled task:	4808	* Account for a descheduled task:
4809	*/	4809	*/
4810	static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)	4810	static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
4811	{	4811	{
4812	struct sched_entity *se = &prev->se;	4812	struct sched_entity *se = &prev->se;
4813	struct cfs_rq *cfs_rq;	4813	struct cfs_rq *cfs_rq;
4814		4814
4815	for_each_sched_entity(se) {	4815	for_each_sched_entity(se) {
4816	cfs_rq = cfs_rq_of(se);	4816	cfs_rq = cfs_rq_of(se);
4817	put_prev_entity(cfs_rq, se);	4817	put_prev_entity(cfs_rq, se);
4818	}	4818	}
4819	}	4819	}
4820		4820
4821	/*	4821	/*
4822	* sched_yield() is very simple	4822	* sched_yield() is very simple
4823	*	4823	*
4824	* The magic of dealing with the ->skip buddy is in pick_next_entity.	4824	* The magic of dealing with the ->skip buddy is in pick_next_entity.
4825	*/	4825	*/
4826	static void yield_task_fair(struct rq *rq)	4826	static void yield_task_fair(struct rq *rq)
4827	{	4827	{
4828	struct task_struct *curr = rq->curr;	4828	struct task_struct *curr = rq->curr;
4829	struct cfs_rq *cfs_rq = task_cfs_rq(curr);	4829	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
4830	struct sched_entity *se = &curr->se;	4830	struct sched_entity *se = &curr->se;
4831		4831
4832	/*	4832	/*
4833	* Are we the only task in the tree?	4833	* Are we the only task in the tree?
4834	*/	4834	*/
4835	if (unlikely(rq->nr_running == 1))	4835	if (unlikely(rq->nr_running == 1))
4836	return;	4836	return;
4837		4837
4838	clear_buddies(cfs_rq, se);	4838	clear_buddies(cfs_rq, se);
4839		4839
4840	if (curr->policy != SCHED_BATCH) {	4840	if (curr->policy != SCHED_BATCH) {
4841	update_rq_clock(rq);	4841	update_rq_clock(rq);
4842	/*	4842	/*
4843	* Update run-time statistics of the 'current'.	4843	* Update run-time statistics of the 'current'.
4844	*/	4844	*/
4845	update_curr(cfs_rq);	4845	update_curr(cfs_rq);
4846	/*	4846	/*
4847	* Tell update_rq_clock() that we've just updated,	4847	* Tell update_rq_clock() that we've just updated,
4848	* so we don't do microscopic update in schedule()	4848	* so we don't do microscopic update in schedule()
4849	* and double the fastpath cost.	4849	* and double the fastpath cost.
4850	*/	4850	*/
4851	rq->skip_clock_update = 1;	4851	rq->skip_clock_update = 1;
4852	}	4852	}
4853		4853
4854	set_skip_buddy(se);	4854	set_skip_buddy(se);
4855	}	4855	}
4856		4856
4857	static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)	4857	static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
4858	{	4858	{
4859	struct sched_entity *se = &p->se;	4859	struct sched_entity *se = &p->se;
4860		4860
4861	/* throttled hierarchies are not runnable */	4861	/* throttled hierarchies are not runnable */
4862	if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))	4862	if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
4863	return false;	4863	return false;
4864		4864
4865	/* Tell the scheduler that we'd really like pse to run next. */	4865	/* Tell the scheduler that we'd really like pse to run next. */
4866	set_next_buddy(se);	4866	set_next_buddy(se);
4867		4867
4868	yield_task_fair(rq);	4868	yield_task_fair(rq);
4869		4869
4870	return true;	4870	return true;
4871	}	4871	}
4872		4872
4873	#ifdef CONFIG_SMP	4873	#ifdef CONFIG_SMP
4874	/**************************************************	4874	/**************************************************
4875	* Fair scheduling class load-balancing methods.	4875	* Fair scheduling class load-balancing methods.
4876	*	4876	*
4877	* BASICS	4877	* BASICS
4878	*	4878	*
4879	* The purpose of load-balancing is to achieve the same basic fairness the	4879	* The purpose of load-balancing is to achieve the same basic fairness the
4880	* per-cpu scheduler provides, namely provide a proportional amount of compute	4880	* per-cpu scheduler provides, namely provide a proportional amount of compute
4881	* time to each task. This is expressed in the following equation:	4881	* time to each task. This is expressed in the following equation:
4882	*	4882	*
4883	* W_i,n/P_i == W_j,n/P_j for all i,j (1)	4883	* W_i,n/P_i == W_j,n/P_j for all i,j (1)
4884	*	4884	*
4885	* Where W_i,n is the n-th weight average for cpu i. The instantaneous weight	4885	* Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
4886	* W_i,0 is defined as:	4886	* W_i,0 is defined as:
4887	*	4887	*
4888	* W_i,0 = \Sum_j w_i,j (2)	4888	* W_i,0 = \Sum_j w_i,j (2)
4889	*	4889	*
4890	* Where w_i,j is the weight of the j-th runnable task on cpu i. This weight	4890	* Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
4891	* is derived from the nice value as per prio_to_weight[].	4891	* is derived from the nice value as per prio_to_weight[].
4892	*	4892	*
4893	* The weight average is an exponential decay average of the instantaneous	4893	* The weight average is an exponential decay average of the instantaneous
4894	* weight:	4894	* weight:
4895	*	4895	*
4896	* W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)	4896	* W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
4897	*	4897	*
4898	* P_i is the cpu power (or compute capacity) of cpu i, typically it is the	4898	* P_i is the cpu power (or compute capacity) of cpu i, typically it is the
4899	* fraction of 'recent' time available for SCHED_OTHER task execution. But it	4899	* fraction of 'recent' time available for SCHED_OTHER task execution. But it
4900	* can also include other factors [XXX].	4900	* can also include other factors [XXX].
4901	*	4901	*
4902	* To achieve this balance we define a measure of imbalance which follows	4902	* To achieve this balance we define a measure of imbalance which follows
4903	* directly from (1):	4903	* directly from (1):
4904	*	4904	*
4905	* imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)	4905	* imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)
4906	*	4906	*
4907	* We them move tasks around to minimize the imbalance. In the continuous	4907	* We them move tasks around to minimize the imbalance. In the continuous
4908	* function space it is obvious this converges, in the discrete case we get	4908	* function space it is obvious this converges, in the discrete case we get
4909	* a few fun cases generally called infeasible weight scenarios.	4909	* a few fun cases generally called infeasible weight scenarios.
4910	*	4910	*
4911	* [XXX expand on:	4911	* [XXX expand on:
4912	* - infeasible weights;	4912	* - infeasible weights;
4913	* - local vs global optima in the discrete case. ]	4913	* - local vs global optima in the discrete case. ]
4914	*	4914	*
4915	*	4915	*
4916	* SCHED DOMAINS	4916	* SCHED DOMAINS
4917	*	4917	*
4918	* In order to solve the imbalance equation (4), and avoid the obvious O(n^2)	4918	* In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
4919	* for all i,j solution, we create a tree of cpus that follows the hardware	4919	* for all i,j solution, we create a tree of cpus that follows the hardware
4920	* topology where each level pairs two lower groups (or better). This results	4920	* topology where each level pairs two lower groups (or better). This results
4921	* in O(log n) layers. Furthermore we reduce the number of cpus going up the	4921	* in O(log n) layers. Furthermore we reduce the number of cpus going up the
4922	* tree to only the first of the previous level and we decrease the frequency	4922	* tree to only the first of the previous level and we decrease the frequency
4923	* of load-balance at each level inv. proportional to the number of cpus in	4923	* of load-balance at each level inv. proportional to the number of cpus in
4924	* the groups.	4924	* the groups.
4925	*	4925	*
4926	* This yields:	4926	* This yields:
4927	*	4927	*
4928	*     log_2 n     1     n	4928	*     log_2 n     1     n
4929	*   \Sum       { --- * --- * 2^i } = O(n)                            (5)	4929	*   \Sum       { --- * --- * 2^i } = O(n)                            (5)
4930	*     i = 0      2^i   2^i	4930	*     i = 0      2^i   2^i
4931	*                               `- size of each group	4931	*                               `- size of each group
4932	*         |         |     `- number of cpus doing load-balance	4932	*         |         |     `- number of cpus doing load-balance
4933	*         |         `- freq	4933	*         |         `- freq
4934	*         `- sum over all levels	4934	*         `- sum over all levels
4935	*	4935	*
4936	* Coupled with a limit on how many tasks we can migrate every balance pass,	4936	* Coupled with a limit on how many tasks we can migrate every balance pass,
4937	* this makes (5) the runtime complexity of the balancer.	4937	* this makes (5) the runtime complexity of the balancer.
4938	*	4938	*
4939	* An important property here is that each CPU is still (indirectly) connected	4939	* An important property here is that each CPU is still (indirectly) connected
4940	* to every other cpu in at most O(log n) steps:	4940	* to every other cpu in at most O(log n) steps:
4941	*	4941	*
4942	* The adjacency matrix of the resulting graph is given by:	4942	* The adjacency matrix of the resulting graph is given by:
4943	*	4943	*
4944	*            log_2 n	4944	*            log_2 n
4945	*   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)	4945	*   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
4946	*            k = 0	4946	*            k = 0
4947	*	4947	*
4948	* And you'll find that:	4948	* And you'll find that:
4949	*	4949	*
4950	* A^(log_2 n)_i,j != 0 for all i,j (7)	4950	* A^(log_2 n)_i,j != 0 for all i,j (7)
4951	*	4951	*
4952	* Showing there's indeed a path between every cpu in at most O(log n) steps.	4952	* Showing there's indeed a path between every cpu in at most O(log n) steps.
4953	* The task movement gives a factor of O(m), giving a convergence complexity	4953	* The task movement gives a factor of O(m), giving a convergence complexity
4954	* of:	4954	* of:
4955	*	4955	*
4956	* O(nm log n), n := nr_cpus, m := nr_tasks (8)	4956	* O(nm log n), n := nr_cpus, m := nr_tasks (8)
4957	*	4957	*
4958	*	4958	*
4959	* WORK CONSERVING	4959	* WORK CONSERVING
4960	*	4960	*
4961	* In order to avoid CPUs going idle while there's still work to do, new idle	4961	* In order to avoid CPUs going idle while there's still work to do, new idle
4962	* balancing is more aggressive and has the newly idle cpu iterate up the domain	4962	* balancing is more aggressive and has the newly idle cpu iterate up the domain
4963	* tree itself instead of relying on other CPUs to bring it work.	4963	* tree itself instead of relying on other CPUs to bring it work.
4964	*	4964	*
4965	* This adds some complexity to both (5) and (8) but it reduces the total idle	4965	* This adds some complexity to both (5) and (8) but it reduces the total idle
4966	* time.	4966	* time.
4967	*	4967	*
4968	* [XXX more?]	4968	* [XXX more?]
4969	*	4969	*
4970	*	4970	*
4971	* CGROUPS	4971	* CGROUPS
4972	*	4972	*
4973	* Cgroups make a horror show out of (2), instead of a simple sum we get:	4973	* Cgroups make a horror show out of (2), instead of a simple sum we get:
4974	*	4974	*
4975	*                                s_k,i	4975	*                                s_k,i
4976	*   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)	4976	*   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
4977	*                                 S_k	4977	*                                 S_k
4978	*	4978	*
4979	* Where	4979	* Where
4980	*	4980	*
4981	* s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)	4981	* s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
4982	*	4982	*
4983	* w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.	4983	* w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
4984	*	4984	*
4985	* The big problem is S_k, its a global sum needed to compute a local (W_i)	4985	* The big problem is S_k, its a global sum needed to compute a local (W_i)
4986	* property.	4986	* property.
4987	*	4987	*
4988	* [XXX write more on how we solve this.. _after_ merging pjt's patches that	4988	* [XXX write more on how we solve this.. _after_ merging pjt's patches that
4989	* rewrite all of this once again.]	4989	* rewrite all of this once again.]
4990	*/	4990	*/
4991		4991
4992	static unsigned long __read_mostly max_load_balance_interval = HZ/10;	4992	static unsigned long __read_mostly max_load_balance_interval = HZ/10;
4993		4993
4994	enum fbq_type { regular, remote, all };	4994	enum fbq_type { regular, remote, all };
4995		4995
4996	#define LBF_ALL_PINNED 0x01	4996	#define LBF_ALL_PINNED 0x01
4997	#define LBF_NEED_BREAK 0x02	4997	#define LBF_NEED_BREAK 0x02
4998	#define LBF_DST_PINNED 0x04	4998	#define LBF_DST_PINNED 0x04
4999	#define LBF_SOME_PINNED 0x08	4999	#define LBF_SOME_PINNED 0x08
5000		5000
5001	struct lb_env {	5001	struct lb_env {
5002	struct sched_domain *sd;	5002	struct sched_domain *sd;
5003		5003
5004	struct rq *src_rq;	5004	struct rq *src_rq;
5005	int src_cpu;	5005	int src_cpu;
5006		5006
5007	int dst_cpu;	5007	int dst_cpu;
5008	struct rq *dst_rq;	5008	struct rq *dst_rq;
5009		5009
5010	struct cpumask *dst_grpmask;	5010	struct cpumask *dst_grpmask;
5011	int new_dst_cpu;	5011	int new_dst_cpu;
5012	enum cpu_idle_type idle;	5012	enum cpu_idle_type idle;
5013	long imbalance;	5013	long imbalance;
5014	/* The set of CPUs under consideration for load-balancing */	5014	/* The set of CPUs under consideration for load-balancing */
5015	struct cpumask *cpus;	5015	struct cpumask *cpus;
5016		5016
5017	unsigned int flags;	5017	unsigned int flags;
5018		5018
5019	unsigned int loop;	5019	unsigned int loop;
5020	unsigned int loop_break;	5020	unsigned int loop_break;
5021	unsigned int loop_max;	5021	unsigned int loop_max;
5022		5022
5023	enum fbq_type fbq_type;	5023	enum fbq_type fbq_type;
5024	};	5024	};
5025		5025
5026	/*	5026	/*
5027	* move_task - move a task from one runqueue to another runqueue.	5027	* move_task - move a task from one runqueue to another runqueue.
5028	* Both runqueues must be locked.	5028	* Both runqueues must be locked.
5029	*/	5029	*/
5030	static void move_task(struct task_struct *p, struct lb_env *env)	5030	static void move_task(struct task_struct *p, struct lb_env *env)
5031	{	5031	{
5032	deactivate_task(env->src_rq, p, 0);	5032	deactivate_task(env->src_rq, p, 0);
5033	set_task_cpu(p, env->dst_cpu);	5033	set_task_cpu(p, env->dst_cpu);
5034	activate_task(env->dst_rq, p, 0);	5034	activate_task(env->dst_rq, p, 0);
5035	check_preempt_curr(env->dst_rq, p, 0);	5035	check_preempt_curr(env->dst_rq, p, 0);
5036	}	5036	}
5037		5037
5038	/*	5038	/*
5039	* Is this task likely cache-hot:	5039	* Is this task likely cache-hot:
5040	*/	5040	*/
5041	static int	5041	static int
5042	task_hot(struct task_struct *p, u64 now)	5042	task_hot(struct task_struct *p, u64 now)
5043	{	5043	{
5044	s64 delta;	5044	s64 delta;
5045		5045
5046	if (p->sched_class != &fair_sched_class)	5046	if (p->sched_class != &fair_sched_class)
5047	return 0;	5047	return 0;
5048		5048
5049	if (unlikely(p->policy == SCHED_IDLE))	5049	if (unlikely(p->policy == SCHED_IDLE))
5050	return 0;	5050	return 0;
5051		5051
5052	/*	5052	/*
5053	* Buddy candidates are cache hot:	5053	* Buddy candidates are cache hot:
5054	*/	5054	*/
5055	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&	5055	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
5056	(&p->se == cfs_rq_of(&p->se)->next ||	5056	(&p->se == cfs_rq_of(&p->se)->next ||
5057	&p->se == cfs_rq_of(&p->se)->last))	5057	&p->se == cfs_rq_of(&p->se)->last))
5058	return 1;	5058	return 1;
5059		5059
5060	if (sysctl_sched_migration_cost == -1)	5060	if (sysctl_sched_migration_cost == -1)
5061	return 1;	5061	return 1;
5062	if (sysctl_sched_migration_cost == 0)	5062	if (sysctl_sched_migration_cost == 0)
5063	return 0;	5063	return 0;
5064		5064
5065	delta = now - p->se.exec_start;	5065	delta = now - p->se.exec_start;
5066		5066
5067	return delta < (s64)sysctl_sched_migration_cost;	5067	return delta < (s64)sysctl_sched_migration_cost;
5068	}	5068	}
5069		5069
5070	#ifdef CONFIG_NUMA_BALANCING	5070	#ifdef CONFIG_NUMA_BALANCING
5071	/* Returns true if the destination node has incurred more faults */	5071	/* Returns true if the destination node has incurred more faults */
5072	static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)	5072	static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5073	{	5073	{
5074	int src_nid, dst_nid;	5074	int src_nid, dst_nid;
5075		5075
5076	if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||	5076	if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
5077	!(env->sd->flags & SD_NUMA)) {	5077	!(env->sd->flags & SD_NUMA)) {
5078	return false;	5078	return false;
5079	}	5079	}
5080		5080
5081	src_nid = cpu_to_node(env->src_cpu);	5081	src_nid = cpu_to_node(env->src_cpu);
5082	dst_nid = cpu_to_node(env->dst_cpu);	5082	dst_nid = cpu_to_node(env->dst_cpu);
5083		5083
5084	if (src_nid == dst_nid)	5084	if (src_nid == dst_nid)
5085	return false;	5085	return false;
5086		5086
5087	/* Always encourage migration to the preferred node. */	5087	/* Always encourage migration to the preferred node. */
5088	if (dst_nid == p->numa_preferred_nid)	5088	if (dst_nid == p->numa_preferred_nid)
5089	return true;	5089	return true;
5090		5090
5091	/* If both task and group weight improve, this move is a winner. */	5091	/* If both task and group weight improve, this move is a winner. */
5092	if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&	5092	if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
5093	group_weight(p, dst_nid) > group_weight(p, src_nid))	5093	group_weight(p, dst_nid) > group_weight(p, src_nid))
5094	return true;	5094	return true;
5095		5095
5096	return false;	5096	return false;
5097	}	5097	}
5098		5098
5099		5099
5100	static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)	5100	static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5101	{	5101	{
5102	int src_nid, dst_nid;	5102	int src_nid, dst_nid;
5103		5103
5104	if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))	5104	if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
5105	return false;	5105	return false;
5106		5106
5107	if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))	5107	if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
5108	return false;	5108	return false;
5109		5109
5110	src_nid = cpu_to_node(env->src_cpu);	5110	src_nid = cpu_to_node(env->src_cpu);
5111	dst_nid = cpu_to_node(env->dst_cpu);	5111	dst_nid = cpu_to_node(env->dst_cpu);
5112		5112
5113	if (src_nid == dst_nid)	5113	if (src_nid == dst_nid)
5114	return false;	5114	return false;
5115		5115
5116	/* Migrating away from the preferred node is always bad. */	5116	/* Migrating away from the preferred node is always bad. */
5117	if (src_nid == p->numa_preferred_nid)	5117	if (src_nid == p->numa_preferred_nid)
5118	return true;	5118	return true;
5119		5119
5120	/* If either task or group weight get worse, don't do it. */	5120	/* If either task or group weight get worse, don't do it. */
5121	if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||	5121	if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
5122	group_weight(p, dst_nid) < group_weight(p, src_nid))	5122	group_weight(p, dst_nid) < group_weight(p, src_nid))
5123	return true;	5123	return true;
5124		5124
5125	return false;	5125	return false;
5126	}	5126	}
5127		5127
5128	#else	5128	#else
5129	static inline bool migrate_improves_locality(struct task_struct *p,	5129	static inline bool migrate_improves_locality(struct task_struct *p,
5130	struct lb_env *env)	5130	struct lb_env *env)
5131	{	5131	{
5132	return false;	5132	return false;
5133	}	5133	}
5134		5134
5135	static inline bool migrate_degrades_locality(struct task_struct *p,	5135	static inline bool migrate_degrades_locality(struct task_struct *p,
5136	struct lb_env *env)	5136	struct lb_env *env)
5137	{	5137	{
5138	return false;	5138	return false;
5139	}	5139	}
5140	#endif	5140	#endif
5141		5141
5142	/*	5142	/*
5143	* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?	5143	* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
5144	*/	5144	*/
5145	static	5145	static
5146	int can_migrate_task(struct task_struct *p, struct lb_env *env)	5146	int can_migrate_task(struct task_struct *p, struct lb_env *env)
5147	{	5147	{
5148	int tsk_cache_hot = 0;	5148	int tsk_cache_hot = 0;
5149	/*	5149	/*
5150	* We do not migrate tasks that are:	5150	* We do not migrate tasks that are:
5151	* 1) throttled_lb_pair, or	5151	* 1) throttled_lb_pair, or
5152	* 2) cannot be migrated to this CPU due to cpus_allowed, or	5152	* 2) cannot be migrated to this CPU due to cpus_allowed, or
5153	* 3) running (obviously), or	5153	* 3) running (obviously), or
5154	* 4) are cache-hot on their current CPU.	5154	* 4) are cache-hot on their current CPU.
5155	*/	5155	*/
5156	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))	5156	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
5157	return 0;	5157	return 0;
5158		5158
5159	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {	5159	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
5160	int cpu;	5160	int cpu;
5161		5161
5162	schedstat_inc(p, se.statistics.nr_failed_migrations_affine);	5162	schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
5163		5163
5164	env->flags |= LBF_SOME_PINNED;	5164	env->flags |= LBF_SOME_PINNED;
5165		5165
5166	/*	5166	/*
5167	* Remember if this task can be migrated to any other cpu in	5167	* Remember if this task can be migrated to any other cpu in
5168	* our sched_group. We may want to revisit it if we couldn't	5168	* our sched_group. We may want to revisit it if we couldn't
5169	* meet load balance goals by pulling other tasks on src_cpu.	5169	* meet load balance goals by pulling other tasks on src_cpu.
5170	*	5170	*
5171	* Also avoid computing new_dst_cpu if we have already computed	5171	* Also avoid computing new_dst_cpu if we have already computed
5172	* one in current iteration.	5172	* one in current iteration.
5173	*/	5173	*/
5174	if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))	5174	if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
5175	return 0;	5175	return 0;
5176		5176
5177	/* Prevent to re-select dst_cpu via env's cpus */	5177	/* Prevent to re-select dst_cpu via env's cpus */
5178	for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {	5178	for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
5179	if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {	5179	if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
5180	env->flags |= LBF_DST_PINNED;	5180	env->flags |= LBF_DST_PINNED;
5181	env->new_dst_cpu = cpu;	5181	env->new_dst_cpu = cpu;
5182	break;	5182	break;
5183	}	5183	}
5184	}	5184	}
5185		5185
5186	return 0;	5186	return 0;
5187	}	5187	}
5188		5188
5189	/* Record that we found atleast one task that could run on dst_cpu */	5189	/* Record that we found atleast one task that could run on dst_cpu */
5190	env->flags &= ~LBF_ALL_PINNED;	5190	env->flags &= ~LBF_ALL_PINNED;
5191		5191
5192	if (task_running(env->src_rq, p)) {	5192	if (task_running(env->src_rq, p)) {
5193	schedstat_inc(p, se.statistics.nr_failed_migrations_running);	5193	schedstat_inc(p, se.statistics.nr_failed_migrations_running);
5194	return 0;	5194	return 0;
5195	}	5195	}
5196		5196
5197	/*	5197	/*
5198	* Aggressive migration if:	5198	* Aggressive migration if:
5199	* 1) destination numa is preferred	5199	* 1) destination numa is preferred
5200	* 2) task is cache cold, or	5200	* 2) task is cache cold, or
5201	* 3) too many balance attempts have failed.	5201	* 3) too many balance attempts have failed.
5202	*/	5202	*/
5203	tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq));	5203	tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq));
5204	if (!tsk_cache_hot)	5204	if (!tsk_cache_hot)
5205	tsk_cache_hot = migrate_degrades_locality(p, env);	5205	tsk_cache_hot = migrate_degrades_locality(p, env);
5206		5206
5207	if (migrate_improves_locality(p, env)) {	5207	if (migrate_improves_locality(p, env)) {
5208	#ifdef CONFIG_SCHEDSTATS	5208	#ifdef CONFIG_SCHEDSTATS
5209	if (tsk_cache_hot) {	5209	if (tsk_cache_hot) {
5210	schedstat_inc(env->sd, lb_hot_gained[env->idle]);	5210	schedstat_inc(env->sd, lb_hot_gained[env->idle]);
5211	schedstat_inc(p, se.statistics.nr_forced_migrations);	5211	schedstat_inc(p, se.statistics.nr_forced_migrations);
5212	}	5212	}
5213	#endif	5213	#endif
5214	return 1;	5214	return 1;
5215	}	5215	}
5216		5216
5217	if (!tsk_cache_hot ||	5217	if (!tsk_cache_hot ||
5218	env->sd->nr_balance_failed > env->sd->cache_nice_tries) {	5218	env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
5219		5219
5220	if (tsk_cache_hot) {	5220	if (tsk_cache_hot) {
5221	schedstat_inc(env->sd, lb_hot_gained[env->idle]);	5221	schedstat_inc(env->sd, lb_hot_gained[env->idle]);
5222	schedstat_inc(p, se.statistics.nr_forced_migrations);	5222	schedstat_inc(p, se.statistics.nr_forced_migrations);
5223	}	5223	}
5224		5224
5225	return 1;	5225	return 1;
5226	}	5226	}
5227		5227
5228	schedstat_inc(p, se.statistics.nr_failed_migrations_hot);	5228	schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
5229	return 0;	5229	return 0;
5230	}	5230	}
5231		5231
5232	/*	5232	/*
5233	* move_one_task tries to move exactly one task from busiest to this_rq, as	5233	* move_one_task tries to move exactly one task from busiest to this_rq, as
5234	* part of active balancing operations within "domain".	5234	* part of active balancing operations within "domain".
5235	* Returns 1 if successful and 0 otherwise.	5235	* Returns 1 if successful and 0 otherwise.
5236	*	5236	*
5237	* Called with both runqueues locked.	5237	* Called with both runqueues locked.
5238	*/	5238	*/
5239	static int move_one_task(struct lb_env *env)	5239	static int move_one_task(struct lb_env *env)
5240	{	5240	{
5241	struct task_struct *p, *n;	5241	struct task_struct *p, *n;
5242		5242
5243	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {	5243	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
5244	if (!can_migrate_task(p, env))	5244	if (!can_migrate_task(p, env))
5245	continue;	5245	continue;
5246		5246
5247	move_task(p, env);	5247	move_task(p, env);
5248	/*	5248	/*
5249	* Right now, this is only the second place move_task()	5249	* Right now, this is only the second place move_task()
5250	* is called, so we can safely collect move_task()	5250	* is called, so we can safely collect move_task()
5251	* stats here rather than inside move_task().	5251	* stats here rather than inside move_task().
5252	*/	5252	*/
5253	schedstat_inc(env->sd, lb_gained[env->idle]);	5253	schedstat_inc(env->sd, lb_gained[env->idle]);
5254	return 1;	5254	return 1;
5255	}	5255	}
5256	return 0;	5256	return 0;
5257	}	5257	}
5258		5258
5259	static const unsigned int sched_nr_migrate_break = 32;	5259	static const unsigned int sched_nr_migrate_break = 32;
5260		5260
5261	/*	5261	/*
5262	* move_tasks tries to move up to imbalance weighted load from busiest to	5262	* move_tasks tries to move up to imbalance weighted load from busiest to
5263	* this_rq, as part of a balancing operation within domain "sd".	5263	* this_rq, as part of a balancing operation within domain "sd".
5264	* Returns 1 if successful and 0 otherwise.	5264	* Returns 1 if successful and 0 otherwise.
5265	*	5265	*
5266	* Called with both runqueues locked.	5266	* Called with both runqueues locked.
5267	*/	5267	*/
5268	static int move_tasks(struct lb_env *env)	5268	static int move_tasks(struct lb_env *env)
5269	{	5269	{
5270	struct list_head *tasks = &env->src_rq->cfs_tasks;	5270	struct list_head *tasks = &env->src_rq->cfs_tasks;
5271	struct task_struct *p;	5271	struct task_struct *p;
5272	unsigned long load;	5272	unsigned long load;
5273	int pulled = 0;	5273	int pulled = 0;
5274		5274
5275	if (env->imbalance <= 0)	5275	if (env->imbalance <= 0)
5276	return 0;	5276	return 0;
5277		5277
5278	while (!list_empty(tasks)) {	5278	while (!list_empty(tasks)) {
5279	p = list_first_entry(tasks, struct task_struct, se.group_node);	5279	p = list_first_entry(tasks, struct task_struct, se.group_node);
5280		5280
5281	env->loop++;	5281	env->loop++;
5282	/* We've more or less seen every task there is, call it quits */	5282	/* We've more or less seen every task there is, call it quits */
5283	if (env->loop > env->loop_max)	5283	if (env->loop > env->loop_max)
5284	break;	5284	break;
5285		5285
5286	/* take a breather every nr_migrate tasks */	5286	/* take a breather every nr_migrate tasks */
5287	if (env->loop > env->loop_break) {	5287	if (env->loop > env->loop_break) {
5288	env->loop_break += sched_nr_migrate_break;	5288	env->loop_break += sched_nr_migrate_break;
5289	env->flags |= LBF_NEED_BREAK;	5289	env->flags |= LBF_NEED_BREAK;
5290	break;	5290	break;
5291	}	5291	}
5292		5292
5293	if (!can_migrate_task(p, env))	5293	if (!can_migrate_task(p, env))
5294	goto next;	5294	goto next;
5295		5295
5296	load = task_h_load(p);	5296	load = task_h_load(p);
5297		5297
5298	if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)	5298	if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
5299	goto next;	5299	goto next;
5300		5300
5301	if ((load / 2) > env->imbalance)	5301	if ((load / 2) > env->imbalance)
5302	goto next;	5302	goto next;
5303		5303
5304	move_task(p, env);	5304	move_task(p, env);
5305	pulled++;	5305	pulled++;
5306	env->imbalance -= load;	5306	env->imbalance -= load;
5307		5307
5308	#ifdef CONFIG_PREEMPT	5308	#ifdef CONFIG_PREEMPT
5309	/*	5309	/*
5310	* NEWIDLE balancing is a source of latency, so preemptible	5310	* NEWIDLE balancing is a source of latency, so preemptible
5311	* kernels will stop after the first task is pulled to minimize	5311	* kernels will stop after the first task is pulled to minimize
5312	* the critical section.	5312	* the critical section.
5313	*/	5313	*/
5314	if (env->idle == CPU_NEWLY_IDLE)	5314	if (env->idle == CPU_NEWLY_IDLE)
5315	break;	5315	break;
5316	#endif	5316	#endif
5317		5317
5318	/*	5318	/*
5319	* We only want to steal up to the prescribed amount of	5319	* We only want to steal up to the prescribed amount of
5320	* weighted load.	5320	* weighted load.
5321	*/	5321	*/
5322	if (env->imbalance <= 0)	5322	if (env->imbalance <= 0)
5323	break;	5323	break;
5324		5324
5325	continue;	5325	continue;
5326	next:	5326	next:
5327	list_move_tail(&p->se.group_node, tasks);	5327	list_move_tail(&p->se.group_node, tasks);
5328	}	5328	}
5329		5329
5330	/*	5330	/*
5331	* Right now, this is one of only two places move_task() is called,	5331	* Right now, this is one of only two places move_task() is called,
5332	* so we can safely collect move_task() stats here rather than	5332	* so we can safely collect move_task() stats here rather than
5333	* inside move_task().	5333	* inside move_task().
5334	*/	5334	*/
5335	schedstat_add(env->sd, lb_gained[env->idle], pulled);	5335	schedstat_add(env->sd, lb_gained[env->idle], pulled);
5336		5336
5337	return pulled;	5337	return pulled;
5338	}	5338	}
5339		5339
5340	#ifdef CONFIG_FAIR_GROUP_SCHED	5340	#ifdef CONFIG_FAIR_GROUP_SCHED
5341	/*	5341	/*
5342	* update tg->load_weight by folding this cpu's load_avg	5342	* update tg->load_weight by folding this cpu's load_avg
5343	*/	5343	*/
5344	static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)	5344	static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
5345	{	5345	{
5346	struct sched_entity *se = tg->se[cpu];	5346	struct sched_entity *se = tg->se[cpu];
5347	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];	5347	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
5348		5348
5349	/* throttled entities do not contribute to load */	5349	/* throttled entities do not contribute to load */
5350	if (throttled_hierarchy(cfs_rq))	5350	if (throttled_hierarchy(cfs_rq))
5351	return;	5351	return;
5352		5352
5353	update_cfs_rq_blocked_load(cfs_rq, 1);	5353	update_cfs_rq_blocked_load(cfs_rq, 1);
5354		5354
5355	if (se) {	5355	if (se) {
5356	update_entity_load_avg(se, 1);	5356	update_entity_load_avg(se, 1);
5357	/*	5357	/*
5358	* We pivot on our runnable average having decayed to zero for	5358	* We pivot on our runnable average having decayed to zero for
5359	* list removal. This generally implies that all our children	5359	* list removal. This generally implies that all our children
5360	* have also been removed (modulo rounding error or bandwidth	5360	* have also been removed (modulo rounding error or bandwidth
5361	* control); however, such cases are rare and we can fix these	5361	* control); however, such cases are rare and we can fix these
5362	* at enqueue.	5362	* at enqueue.
5363	*	5363	*
5364	* TODO: fix up out-of-order children on enqueue.	5364	* TODO: fix up out-of-order children on enqueue.
5365	*/	5365	*/
5366	if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)	5366	if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
5367	list_del_leaf_cfs_rq(cfs_rq);	5367	list_del_leaf_cfs_rq(cfs_rq);
5368	} else {	5368	} else {
5369	struct rq *rq = rq_of(cfs_rq);	5369	struct rq *rq = rq_of(cfs_rq);
5370	update_rq_runnable_avg(rq, rq->nr_running);	5370	update_rq_runnable_avg(rq, rq->nr_running);
5371	}	5371	}
5372	}	5372	}
5373		5373
5374	static void update_blocked_averages(int cpu)	5374	static void update_blocked_averages(int cpu)
5375	{	5375	{
5376	struct rq *rq = cpu_rq(cpu);	5376	struct rq *rq = cpu_rq(cpu);
5377	struct cfs_rq *cfs_rq;	5377	struct cfs_rq *cfs_rq;
5378	unsigned long flags;	5378	unsigned long flags;
5379		5379
5380	raw_spin_lock_irqsave(&rq->lock, flags);	5380	raw_spin_lock_irqsave(&rq->lock, flags);
5381	update_rq_clock(rq);	5381	update_rq_clock(rq);
5382	/*	5382	/*
5383	* Iterates the task_group tree in a bottom up fashion, see	5383	* Iterates the task_group tree in a bottom up fashion, see
5384	* list_add_leaf_cfs_rq() for details.	5384	* list_add_leaf_cfs_rq() for details.
5385	*/	5385	*/
5386	for_each_leaf_cfs_rq(rq, cfs_rq) {	5386	for_each_leaf_cfs_rq(rq, cfs_rq) {
5387	/*	5387	/*
5388	* Note: We may want to consider periodically releasing	5388	* Note: We may want to consider periodically releasing
5389	* rq->lock about these updates so that creating many task	5389	* rq->lock about these updates so that creating many task
5390	* groups does not result in continually extending hold time.	5390	* groups does not result in continually extending hold time.
5391	*/	5391	*/
5392	__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);	5392	__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
5393	}	5393	}
5394		5394
5395	raw_spin_unlock_irqrestore(&rq->lock, flags);	5395	raw_spin_unlock_irqrestore(&rq->lock, flags);
5396	}	5396	}
5397		5397
5398	/*	5398	/*
5399	* Compute the hierarchical load factor for cfs_rq and all its ascendants.	5399	* Compute the hierarchical load factor for cfs_rq and all its ascendants.
5400	* This needs to be done in a top-down fashion because the load of a child	5400	* This needs to be done in a top-down fashion because the load of a child
5401	* group is a fraction of its parents load.	5401	* group is a fraction of its parents load.
5402	*/	5402	*/
5403	static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)	5403	static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
5404	{	5404	{
5405	struct rq *rq = rq_of(cfs_rq);	5405	struct rq *rq = rq_of(cfs_rq);
5406	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];	5406	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
5407	unsigned long now = jiffies;	5407	unsigned long now = jiffies;
5408	unsigned long load;	5408	unsigned long load;
5409		5409
5410	if (cfs_rq->last_h_load_update == now)	5410	if (cfs_rq->last_h_load_update == now)
5411	return;	5411	return;
5412		5412
5413	cfs_rq->h_load_next = NULL;	5413	cfs_rq->h_load_next = NULL;
5414	for_each_sched_entity(se) {	5414	for_each_sched_entity(se) {
5415	cfs_rq = cfs_rq_of(se);	5415	cfs_rq = cfs_rq_of(se);
5416	cfs_rq->h_load_next = se;	5416	cfs_rq->h_load_next = se;
5417	if (cfs_rq->last_h_load_update == now)	5417	if (cfs_rq->last_h_load_update == now)
5418	break;	5418	break;
5419	}	5419	}
5420		5420
5421	if (!se) {	5421	if (!se) {
5422	cfs_rq->h_load = cfs_rq->runnable_load_avg;	5422	cfs_rq->h_load = cfs_rq->runnable_load_avg;
5423	cfs_rq->last_h_load_update = now;	5423	cfs_rq->last_h_load_update = now;
5424	}	5424	}
5425		5425
5426	while ((se = cfs_rq->h_load_next) != NULL) {	5426	while ((se = cfs_rq->h_load_next) != NULL) {
5427	load = cfs_rq->h_load;	5427	load = cfs_rq->h_load;
5428	load = div64_ul(load * se->avg.load_avg_contrib,	5428	load = div64_ul(load * se->avg.load_avg_contrib,
5429	cfs_rq->runnable_load_avg + 1);	5429	cfs_rq->runnable_load_avg + 1);
5430	cfs_rq = group_cfs_rq(se);	5430	cfs_rq = group_cfs_rq(se);
5431	cfs_rq->h_load = load;	5431	cfs_rq->h_load = load;
5432	cfs_rq->last_h_load_update = now;	5432	cfs_rq->last_h_load_update = now;
5433	}	5433	}
5434	}	5434	}
5435		5435
5436	static unsigned long task_h_load(struct task_struct *p)	5436	static unsigned long task_h_load(struct task_struct *p)
5437	{	5437	{
5438	struct cfs_rq *cfs_rq = task_cfs_rq(p);	5438	struct cfs_rq *cfs_rq = task_cfs_rq(p);
5439		5439
5440	update_cfs_rq_h_load(cfs_rq);	5440	update_cfs_rq_h_load(cfs_rq);
5441	return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,	5441	return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
5442	cfs_rq->runnable_load_avg + 1);	5442	cfs_rq->runnable_load_avg + 1);
5443	}	5443	}
5444	#else	5444	#else
5445	static inline void update_blocked_averages(int cpu)	5445	static inline void update_blocked_averages(int cpu)
5446	{	5446	{
5447	}	5447	}
5448		5448
5449	static unsigned long task_h_load(struct task_struct *p)	5449	static unsigned long task_h_load(struct task_struct *p)
5450	{	5450	{
5451	return p->se.avg.load_avg_contrib;	5451	return p->se.avg.load_avg_contrib;
5452	}	5452	}
5453	#endif	5453	#endif
5454		5454
5455	/******** Helpers for find_busiest_group **********************/	5455	/******** Helpers for find_busiest_group **********************/
5456	/*	5456	/*
5457	* sg_lb_stats - stats of a sched_group required for load_balancing	5457	* sg_lb_stats - stats of a sched_group required for load_balancing
5458	*/	5458	*/
5459	struct sg_lb_stats {	5459	struct sg_lb_stats {
5460	unsigned long avg_load; /*Avg load across the CPUs of the group */	5460	unsigned long avg_load; /*Avg load across the CPUs of the group */
5461	unsigned long group_load; /* Total load over the CPUs of the group */	5461	unsigned long group_load; /* Total load over the CPUs of the group */
5462	unsigned long sum_weighted_load; /* Weighted load of group's tasks */	5462	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
5463	unsigned long load_per_task;	5463	unsigned long load_per_task;
5464	unsigned long group_power;	5464	unsigned long group_power;
5465	unsigned int sum_nr_running; /* Nr tasks running in the group */	5465	unsigned int sum_nr_running; /* Nr tasks running in the group */
5466	unsigned int group_capacity;	5466	unsigned int group_capacity;
5467	unsigned int idle_cpus;	5467	unsigned int idle_cpus;
5468	unsigned int group_weight;	5468	unsigned int group_weight;
5469	int group_imb; /* Is there an imbalance in the group ? */	5469	int group_imb; /* Is there an imbalance in the group ? */
5470	int group_has_capacity; /* Is there extra capacity in the group? */	5470	int group_has_capacity; /* Is there extra capacity in the group? */
5471	#ifdef CONFIG_NUMA_BALANCING	5471	#ifdef CONFIG_NUMA_BALANCING
5472	unsigned int nr_numa_running;	5472	unsigned int nr_numa_running;
5473	unsigned int nr_preferred_running;	5473	unsigned int nr_preferred_running;
5474	#endif	5474	#endif
5475	};	5475	};
5476		5476
5477	/*	5477	/*
5478	* sd_lb_stats - Structure to store the statistics of a sched_domain	5478	* sd_lb_stats - Structure to store the statistics of a sched_domain
5479	* during load balancing.	5479	* during load balancing.
5480	*/	5480	*/
5481	struct sd_lb_stats {	5481	struct sd_lb_stats {
5482	struct sched_group *busiest; /* Busiest group in this sd */	5482	struct sched_group *busiest; /* Busiest group in this sd */
5483	struct sched_group *local; /* Local group in this sd */	5483	struct sched_group *local; /* Local group in this sd */
5484	unsigned long total_load; /* Total load of all groups in sd */	5484	unsigned long total_load; /* Total load of all groups in sd */
5485	unsigned long total_pwr; /* Total power of all groups in sd */	5485	unsigned long total_pwr; /* Total power of all groups in sd */
5486	unsigned long avg_load; /* Average load across all groups in sd */	5486	unsigned long avg_load; /* Average load across all groups in sd */
5487		5487
5488	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */	5488	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
5489	struct sg_lb_stats local_stat; /* Statistics of the local group */	5489	struct sg_lb_stats local_stat; /* Statistics of the local group */
5490	};	5490	};
5491		5491
5492	static inline void init_sd_lb_stats(struct sd_lb_stats *sds)	5492	static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
5493	{	5493	{
5494	/*	5494	/*
5495	* Skimp on the clearing to avoid duplicate work. We can avoid clearing	5495	* Skimp on the clearing to avoid duplicate work. We can avoid clearing
5496	* local_stat because update_sg_lb_stats() does a full clear/assignment.	5496	* local_stat because update_sg_lb_stats() does a full clear/assignment.
5497	* We must however clear busiest_stat::avg_load because	5497	* We must however clear busiest_stat::avg_load because
5498	* update_sd_pick_busiest() reads this before assignment.	5498	* update_sd_pick_busiest() reads this before assignment.
5499	*/	5499	*/
5500	*sds = (struct sd_lb_stats){	5500	*sds = (struct sd_lb_stats){
5501	.busiest = NULL,	5501	.busiest = NULL,
5502	.local = NULL,	5502	.local = NULL,
5503	.total_load = 0UL,	5503	.total_load = 0UL,
5504	.total_pwr = 0UL,	5504	.total_pwr = 0UL,
5505	.busiest_stat = {	5505	.busiest_stat = {
5506	.avg_load = 0UL,	5506	.avg_load = 0UL,
5507	},	5507	},
5508	};	5508	};
5509	}	5509	}
5510		5510
5511	/**	5511	/**
5512	* get_sd_load_idx - Obtain the load index for a given sched domain.	5512	* get_sd_load_idx - Obtain the load index for a given sched domain.
5513	* @sd: The sched_domain whose load_idx is to be obtained.	5513	* @sd: The sched_domain whose load_idx is to be obtained.
5514	* @idle: The idle status of the CPU for whose sd load_idx is obtained.	5514	* @idle: The idle status of the CPU for whose sd load_idx is obtained.
5515	*	5515	*
5516	* Return: The load index.	5516	* Return: The load index.
5517	*/	5517	*/
5518	static inline int get_sd_load_idx(struct sched_domain *sd,	5518	static inline int get_sd_load_idx(struct sched_domain *sd,
5519	enum cpu_idle_type idle)	5519	enum cpu_idle_type idle)
5520	{	5520	{
5521	int load_idx;	5521	int load_idx;
5522		5522
5523	switch (idle) {	5523	switch (idle) {
5524	case CPU_NOT_IDLE:	5524	case CPU_NOT_IDLE:
5525	load_idx = sd->busy_idx;	5525	load_idx = sd->busy_idx;
5526	break;	5526	break;
5527		5527
5528	case CPU_NEWLY_IDLE:	5528	case CPU_NEWLY_IDLE:
5529	load_idx = sd->newidle_idx;	5529	load_idx = sd->newidle_idx;
5530	break;	5530	break;
5531	default:	5531	default:
5532	load_idx = sd->idle_idx;	5532	load_idx = sd->idle_idx;
5533	break;	5533	break;
5534	}	5534	}
5535		5535
5536	return load_idx;	5536	return load_idx;
5537	}	5537	}
5538		5538
5539	static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)	5539	static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
5540	{	5540	{
5541	return SCHED_POWER_SCALE;	5541	return SCHED_POWER_SCALE;
5542	}	5542	}
5543		5543
5544	unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)	5544	unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
5545	{	5545	{
5546	return default_scale_freq_power(sd, cpu);	5546	return default_scale_freq_power(sd, cpu);
5547	}	5547	}
5548		5548
5549	static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)	5549	static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
5550	{	5550	{
5551	unsigned long weight = sd->span_weight;	5551	unsigned long weight = sd->span_weight;
5552	unsigned long smt_gain = sd->smt_gain;	5552	unsigned long smt_gain = sd->smt_gain;
5553		5553
5554	smt_gain /= weight;	5554	smt_gain /= weight;
5555		5555
5556	return smt_gain;	5556	return smt_gain;
5557	}	5557	}
5558		5558
5559	unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)	5559	unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
5560	{	5560	{
5561	return default_scale_smt_power(sd, cpu);	5561	return default_scale_smt_power(sd, cpu);
5562	}	5562	}
5563		5563
5564	static unsigned long scale_rt_power(int cpu)	5564	static unsigned long scale_rt_power(int cpu)
5565	{	5565	{
5566	struct rq *rq = cpu_rq(cpu);	5566	struct rq *rq = cpu_rq(cpu);
5567	u64 total, available, age_stamp, avg;	5567	u64 total, available, age_stamp, avg;
5568		5568
5569	/*	5569	/*
5570	* Since we're reading these variables without serialization make sure	5570	* Since we're reading these variables without serialization make sure
5571	* we read them once before doing sanity checks on them.	5571	* we read them once before doing sanity checks on them.
5572	*/	5572	*/
5573	age_stamp = ACCESS_ONCE(rq->age_stamp);	5573	age_stamp = ACCESS_ONCE(rq->age_stamp);
5574	avg = ACCESS_ONCE(rq->rt_avg);	5574	avg = ACCESS_ONCE(rq->rt_avg);
5575		5575
5576	total = sched_avg_period() + (rq_clock(rq) - age_stamp);	5576	total = sched_avg_period() + (rq_clock(rq) - age_stamp);
5577		5577
5578	if (unlikely(total < avg)) {	5578	if (unlikely(total < avg)) {
5579	/* Ensures that power won't end up being negative */	5579	/* Ensures that power won't end up being negative */
5580	available = 0;	5580	available = 0;
5581	} else {	5581	} else {
5582	available = total - avg;	5582	available = total - avg;
5583	}	5583	}
5584		5584
5585	if (unlikely((s64)total < SCHED_POWER_SCALE))	5585	if (unlikely((s64)total < SCHED_POWER_SCALE))
5586	total = SCHED_POWER_SCALE;	5586	total = SCHED_POWER_SCALE;
5587		5587
5588	total >>= SCHED_POWER_SHIFT;	5588	total >>= SCHED_POWER_SHIFT;
5589		5589
5590	return div_u64(available, total);	5590	return div_u64(available, total);
5591	}	5591	}
5592		5592
5593	static void update_cpu_power(struct sched_domain *sd, int cpu)	5593	static void update_cpu_power(struct sched_domain *sd, int cpu)
5594	{	5594	{
5595	unsigned long weight = sd->span_weight;	5595	unsigned long weight = sd->span_weight;
5596	unsigned long power = SCHED_POWER_SCALE;	5596	unsigned long power = SCHED_POWER_SCALE;
5597	struct sched_group *sdg = sd->groups;	5597	struct sched_group *sdg = sd->groups;
5598		5598
5599	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {	5599	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
5600	if (sched_feat(ARCH_POWER))	5600	if (sched_feat(ARCH_POWER))
5601	power *= arch_scale_smt_power(sd, cpu);	5601	power *= arch_scale_smt_power(sd, cpu);
5602	else	5602	else
5603	power *= default_scale_smt_power(sd, cpu);	5603	power *= default_scale_smt_power(sd, cpu);
5604		5604
5605	power >>= SCHED_POWER_SHIFT;	5605	power >>= SCHED_POWER_SHIFT;
5606	}	5606	}
5607		5607
5608	sdg->sgp->power_orig = power;	5608	sdg->sgp->power_orig = power;
5609		5609
5610	if (sched_feat(ARCH_POWER))	5610	if (sched_feat(ARCH_POWER))
5611	power *= arch_scale_freq_power(sd, cpu);	5611	power *= arch_scale_freq_power(sd, cpu);
5612	else	5612	else
5613	power *= default_scale_freq_power(sd, cpu);	5613	power *= default_scale_freq_power(sd, cpu);
5614		5614
5615	power >>= SCHED_POWER_SHIFT;	5615	power >>= SCHED_POWER_SHIFT;
5616		5616
5617	power *= scale_rt_power(cpu);	5617	power *= scale_rt_power(cpu);
5618	power >>= SCHED_POWER_SHIFT;	5618	power >>= SCHED_POWER_SHIFT;
5619		5619
5620	if (!power)	5620	if (!power)
5621	power = 1;	5621	power = 1;
5622		5622
5623	cpu_rq(cpu)->cpu_power = power;	5623	cpu_rq(cpu)->cpu_power = power;
5624	sdg->sgp->power = power;	5624	sdg->sgp->power = power;
5625	}	5625	}
5626		5626
5627	void update_group_power(struct sched_domain *sd, int cpu)	5627	void update_group_power(struct sched_domain *sd, int cpu)
5628	{	5628	{
5629	struct sched_domain *child = sd->child;	5629	struct sched_domain *child = sd->child;
5630	struct sched_group *group, *sdg = sd->groups;	5630	struct sched_group *group, *sdg = sd->groups;
5631	unsigned long power, power_orig;	5631	unsigned long power, power_orig;
5632	unsigned long interval;	5632	unsigned long interval;
5633		5633
5634	interval = msecs_to_jiffies(sd->balance_interval);	5634	interval = msecs_to_jiffies(sd->balance_interval);
5635	interval = clamp(interval, 1UL, max_load_balance_interval);	5635	interval = clamp(interval, 1UL, max_load_balance_interval);
5636	sdg->sgp->next_update = jiffies + interval;	5636	sdg->sgp->next_update = jiffies + interval;
5637		5637
5638	if (!child) {	5638	if (!child) {
5639	update_cpu_power(sd, cpu);	5639	update_cpu_power(sd, cpu);
5640	return;	5640	return;
5641	}	5641	}
5642		5642
5643	power_orig = power = 0;	5643	power_orig = power = 0;
5644		5644
5645	if (child->flags & SD_OVERLAP) {	5645	if (child->flags & SD_OVERLAP) {
5646	/*	5646	/*
5647	* SD_OVERLAP domains cannot assume that child groups	5647	* SD_OVERLAP domains cannot assume that child groups
5648	* span the current group.	5648	* span the current group.
5649	*/	5649	*/
5650		5650
5651	for_each_cpu(cpu, sched_group_cpus(sdg)) {	5651	for_each_cpu(cpu, sched_group_cpus(sdg)) {
5652	struct sched_group_power *sgp;	5652	struct sched_group_power *sgp;
5653	struct rq *rq = cpu_rq(cpu);	5653	struct rq *rq = cpu_rq(cpu);
5654		5654
5655	/*	5655	/*
5656	* build_sched_domains() -> init_sched_groups_power()	5656	* build_sched_domains() -> init_sched_groups_power()
5657	* gets here before we've attached the domains to the	5657	* gets here before we've attached the domains to the
5658	* runqueues.	5658	* runqueues.
5659	*	5659	*
5660	* Use power_of(), which is set irrespective of domains	5660	* Use power_of(), which is set irrespective of domains
5661	* in update_cpu_power().	5661	* in update_cpu_power().
5662	*	5662	*
5663	* This avoids power/power_orig from being 0 and	5663	* This avoids power/power_orig from being 0 and
5664	* causing divide-by-zero issues on boot.	5664	* causing divide-by-zero issues on boot.
5665	*	5665	*
5666	* Runtime updates will correct power_orig.	5666	* Runtime updates will correct power_orig.
5667	*/	5667	*/
5668	if (unlikely(!rq->sd)) {	5668	if (unlikely(!rq->sd)) {
5669	power_orig += power_of(cpu);	5669	power_orig += power_of(cpu);
5670	power += power_of(cpu);	5670	power += power_of(cpu);
5671	continue;	5671	continue;
5672	}	5672	}
5673		5673
5674	sgp = rq->sd->groups->sgp;	5674	sgp = rq->sd->groups->sgp;
5675	power_orig += sgp->power_orig;	5675	power_orig += sgp->power_orig;
5676	power += sgp->power;	5676	power += sgp->power;
5677	}	5677	}
5678	} else {	5678	} else {
5679	/*	5679	/*
5680	* !SD_OVERLAP domains can assume that child groups	5680	* !SD_OVERLAP domains can assume that child groups
5681	* span the current group.	5681	* span the current group.
5682	*/	5682	*/
5683		5683
5684	group = child->groups;	5684	group = child->groups;
5685	do {	5685	do {
5686	power_orig += group->sgp->power_orig;	5686	power_orig += group->sgp->power_orig;
5687	power += group->sgp->power;	5687	power += group->sgp->power;
5688	group = group->next;	5688	group = group->next;
5689	} while (group != child->groups);	5689	} while (group != child->groups);
5690	}	5690	}
5691		5691
5692	sdg->sgp->power_orig = power_orig;	5692	sdg->sgp->power_orig = power_orig;
5693	sdg->sgp->power = power;	5693	sdg->sgp->power = power;
5694	}	5694	}
5695		5695
5696	/*	5696	/*
5697	* Try and fix up capacity for tiny siblings, this is needed when	5697	* Try and fix up capacity for tiny siblings, this is needed when
5698	* things like SD_ASYM_PACKING need f_b_g to select another sibling	5698	* things like SD_ASYM_PACKING need f_b_g to select another sibling
5699	* which on its own isn't powerful enough.	5699	* which on its own isn't powerful enough.
5700	*	5700	*
5701	* See update_sd_pick_busiest() and check_asym_packing().	5701	* See update_sd_pick_busiest() and check_asym_packing().
5702	*/	5702	*/
5703	static inline int	5703	static inline int
5704	fix_small_capacity(struct sched_domain *sd, struct sched_group *group)	5704	fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
5705	{	5705	{
5706	/*	5706	/*
5707	* Only siblings can have significantly less than SCHED_POWER_SCALE	5707	* Only siblings can have significantly less than SCHED_POWER_SCALE
5708	*/	5708	*/
5709	if (!(sd->flags & SD_SHARE_CPUPOWER))	5709	if (!(sd->flags & SD_SHARE_CPUPOWER))
5710	return 0;	5710	return 0;
5711		5711
5712	/*	5712	/*
5713	* If ~90% of the cpu_power is still there, we're good.	5713	* If ~90% of the cpu_power is still there, we're good.
5714	*/	5714	*/
5715	if (group->sgp->power * 32 > group->sgp->power_orig * 29)	5715	if (group->sgp->power * 32 > group->sgp->power_orig * 29)
5716	return 1;	5716	return 1;
5717		5717
5718	return 0;	5718	return 0;
5719	}	5719	}
5720		5720
5721	/*	5721	/*
5722	* Group imbalance indicates (and tries to solve) the problem where balancing	5722	* Group imbalance indicates (and tries to solve) the problem where balancing
5723	* groups is inadequate due to tsk_cpus_allowed() constraints.	5723	* groups is inadequate due to tsk_cpus_allowed() constraints.
5724	*	5724	*
5725	* Imagine a situation of two groups of 4 cpus each and 4 tasks each with a	5725	* Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
5726	* cpumask covering 1 cpu of the first group and 3 cpus of the second group.	5726	* cpumask covering 1 cpu of the first group and 3 cpus of the second group.
5727	* Something like:	5727	* Something like:
5728	*	5728	*
5729	* { 0 1 2 3 } { 4 5 6 7 }	5729	* { 0 1 2 3 } { 4 5 6 7 }
5730	*         *       * * *	5730	*         *       * * *
5731	*	5731	*
5732	* If we were to balance group-wise we'd place two tasks in the first group and	5732	* If we were to balance group-wise we'd place two tasks in the first group and
5733	* two tasks in the second group. Clearly this is undesired as it will overload	5733	* two tasks in the second group. Clearly this is undesired as it will overload
5734	* cpu 3 and leave one of the cpus in the second group unused.	5734	* cpu 3 and leave one of the cpus in the second group unused.
5735	*	5735	*
5736	* The current solution to this issue is detecting the skew in the first group	5736	* The current solution to this issue is detecting the skew in the first group
5737	* by noticing the lower domain failed to reach balance and had difficulty	5737	* by noticing the lower domain failed to reach balance and had difficulty
5738	* moving tasks due to affinity constraints.	5738	* moving tasks due to affinity constraints.
5739	*	5739	*
5740	* When this is so detected; this group becomes a candidate for busiest; see	5740	* When this is so detected; this group becomes a candidate for busiest; see
5741	* update_sd_pick_busiest(). And calculate_imbalance() and	5741	* update_sd_pick_busiest(). And calculate_imbalance() and
5742	* find_busiest_group() avoid some of the usual balance conditions to allow it	5742	* find_busiest_group() avoid some of the usual balance conditions to allow it
5743	* to create an effective group imbalance.	5743	* to create an effective group imbalance.
5744	*	5744	*
5745	* This is a somewhat tricky proposition since the next run might not find the	5745	* This is a somewhat tricky proposition since the next run might not find the
5746	* group imbalance and decide the groups need to be balanced again. A most	5746	* group imbalance and decide the groups need to be balanced again. A most
5747	* subtle and fragile situation.	5747	* subtle and fragile situation.
5748	*/	5748	*/
5749		5749
5750	static inline int sg_imbalanced(struct sched_group *group)	5750	static inline int sg_imbalanced(struct sched_group *group)
5751	{	5751	{
5752	return group->sgp->imbalance;	5752	return group->sgp->imbalance;
5753	}	5753	}
5754		5754
5755	/*	5755	/*
5756	* Compute the group capacity.	5756	* Compute the group capacity.
5757	*	5757	*
5758	* Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by	5758	* Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by
5759	* first dividing out the smt factor and computing the actual number of cores	5759	* first dividing out the smt factor and computing the actual number of cores
5760	* and limit power unit capacity with that.	5760	* and limit power unit capacity with that.
5761	*/	5761	*/
5762	static inline int sg_capacity(struct lb_env *env, struct sched_group *group)	5762	static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
5763	{	5763	{
5764	unsigned int capacity, smt, cpus;	5764	unsigned int capacity, smt, cpus;
5765	unsigned int power, power_orig;	5765	unsigned int power, power_orig;
5766		5766
5767	power = group->sgp->power;	5767	power = group->sgp->power;
5768	power_orig = group->sgp->power_orig;	5768	power_orig = group->sgp->power_orig;
5769	cpus = group->group_weight;	5769	cpus = group->group_weight;
5770		5770
5771	/* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */	5771	/* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */
5772	smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);	5772	smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
5773	capacity = cpus / smt; /* cores */	5773	capacity = cpus / smt; /* cores */
5774		5774
5775	capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));	5775	capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));
5776	if (!capacity)	5776	if (!capacity)
5777	capacity = fix_small_capacity(env->sd, group);	5777	capacity = fix_small_capacity(env->sd, group);
5778		5778
5779	return capacity;	5779	return capacity;
5780	}	5780	}
5781		5781
5782	/**	5782	/**
5783	* update_sg_lb_stats - Update sched_group's statistics for load balancing.	5783	* update_sg_lb_stats - Update sched_group's statistics for load balancing.
5784	* @env: The load balancing environment.	5784	* @env: The load balancing environment.
5785	* @group: sched_group whose statistics are to be updated.	5785	* @group: sched_group whose statistics are to be updated.
5786	* @load_idx: Load index of sched_domain of this_cpu for load calc.	5786	* @load_idx: Load index of sched_domain of this_cpu for load calc.
5787	* @local_group: Does group contain this_cpu.	5787	* @local_group: Does group contain this_cpu.
5788	* @sgs: variable to hold the statistics for this group.	5788	* @sgs: variable to hold the statistics for this group.
5789	*/	5789	*/
5790	static inline void update_sg_lb_stats(struct lb_env *env,	5790	static inline void update_sg_lb_stats(struct lb_env *env,
5791	struct sched_group *group, int load_idx,	5791	struct sched_group *group, int load_idx,
5792	int local_group, struct sg_lb_stats *sgs)	5792	int local_group, struct sg_lb_stats *sgs)
5793	{	5793	{
5794	unsigned long load;	5794	unsigned long load;
5795	int i;	5795	int i;
5796		5796
5797	memset(sgs, 0, sizeof(*sgs));	5797	memset(sgs, 0, sizeof(*sgs));
5798		5798
5799	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {	5799	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
5800	struct rq *rq = cpu_rq(i);	5800	struct rq *rq = cpu_rq(i);
5801		5801
5802	/* Bias balancing toward cpus of our domain */	5802	/* Bias balancing toward cpus of our domain */
5803	if (local_group)	5803	if (local_group)
5804	load = target_load(i, load_idx);	5804	load = target_load(i, load_idx);
5805	else	5805	else
5806	load = source_load(i, load_idx);	5806	load = source_load(i, load_idx);
5807		5807
5808	sgs->group_load += load;	5808	sgs->group_load += load;
5809	sgs->sum_nr_running += rq->nr_running;	5809	sgs->sum_nr_running += rq->nr_running;
5810	#ifdef CONFIG_NUMA_BALANCING	5810	#ifdef CONFIG_NUMA_BALANCING
5811	sgs->nr_numa_running += rq->nr_numa_running;	5811	sgs->nr_numa_running += rq->nr_numa_running;
5812	sgs->nr_preferred_running += rq->nr_preferred_running;	5812	sgs->nr_preferred_running += rq->nr_preferred_running;
5813	#endif	5813	#endif
5814	sgs->sum_weighted_load += weighted_cpuload(i);	5814	sgs->sum_weighted_load += weighted_cpuload(i);
5815	if (idle_cpu(i))	5815	if (idle_cpu(i))
5816	sgs->idle_cpus++;	5816	sgs->idle_cpus++;
5817	}	5817	}
5818		5818
5819	/* Adjust by relative CPU power of the group */	5819	/* Adjust by relative CPU power of the group */
5820	sgs->group_power = group->sgp->power;	5820	sgs->group_power = group->sgp->power;
5821	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;	5821	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
5822		5822
5823	if (sgs->sum_nr_running)	5823	if (sgs->sum_nr_running)
5824	sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;	5824	sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
5825		5825
5826	sgs->group_weight = group->group_weight;	5826	sgs->group_weight = group->group_weight;
5827		5827
5828	sgs->group_imb = sg_imbalanced(group);	5828	sgs->group_imb = sg_imbalanced(group);
5829	sgs->group_capacity = sg_capacity(env, group);	5829	sgs->group_capacity = sg_capacity(env, group);
5830		5830
5831	if (sgs->group_capacity > sgs->sum_nr_running)	5831	if (sgs->group_capacity > sgs->sum_nr_running)
5832	sgs->group_has_capacity = 1;	5832	sgs->group_has_capacity = 1;
5833	}	5833	}
5834		5834
5835	/**	5835	/**
5836	* update_sd_pick_busiest - return 1 on busiest group	5836	* update_sd_pick_busiest - return 1 on busiest group
5837	* @env: The load balancing environment.	5837	* @env: The load balancing environment.
5838	* @sds: sched_domain statistics	5838	* @sds: sched_domain statistics
5839	* @sg: sched_group candidate to be checked for being the busiest	5839	* @sg: sched_group candidate to be checked for being the busiest
5840	* @sgs: sched_group statistics	5840	* @sgs: sched_group statistics
5841	*	5841	*
5842	* Determine if @sg is a busier group than the previously selected	5842	* Determine if @sg is a busier group than the previously selected
5843	* busiest group.	5843	* busiest group.
5844	*	5844	*
5845	* Return: %true if @sg is a busier group than the previously selected	5845	* Return: %true if @sg is a busier group than the previously selected
5846	* busiest group. %false otherwise.	5846	* busiest group. %false otherwise.
5847	*/	5847	*/
5848	static bool update_sd_pick_busiest(struct lb_env *env,	5848	static bool update_sd_pick_busiest(struct lb_env *env,
5849	struct sd_lb_stats *sds,	5849	struct sd_lb_stats *sds,
5850	struct sched_group *sg,	5850	struct sched_group *sg,
5851	struct sg_lb_stats *sgs)	5851	struct sg_lb_stats *sgs)
5852	{	5852	{
5853	if (sgs->avg_load <= sds->busiest_stat.avg_load)	5853	if (sgs->avg_load <= sds->busiest_stat.avg_load)
5854	return false;	5854	return false;
5855		5855
5856	if (sgs->sum_nr_running > sgs->group_capacity)	5856	if (sgs->sum_nr_running > sgs->group_capacity)
5857	return true;	5857	return true;
5858		5858
5859	if (sgs->group_imb)	5859	if (sgs->group_imb)
5860	return true;	5860	return true;
5861		5861
5862	/*	5862	/*
5863	* ASYM_PACKING needs to move all the work to the lowest	5863	* ASYM_PACKING needs to move all the work to the lowest
5864	* numbered CPUs in the group, therefore mark all groups	5864	* numbered CPUs in the group, therefore mark all groups
5865	* higher than ourself as busy.	5865	* higher than ourself as busy.
5866	*/	5866	*/
5867	if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&	5867	if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
5868	env->dst_cpu < group_first_cpu(sg)) {	5868	env->dst_cpu < group_first_cpu(sg)) {
5869	if (!sds->busiest)	5869	if (!sds->busiest)
5870	return true;	5870	return true;
5871		5871
5872	if (group_first_cpu(sds->busiest) > group_first_cpu(sg))	5872	if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
5873	return true;	5873	return true;
5874	}	5874	}
5875		5875
5876	return false;	5876	return false;
5877	}	5877	}
5878		5878
5879	#ifdef CONFIG_NUMA_BALANCING	5879	#ifdef CONFIG_NUMA_BALANCING
5880	static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)	5880	static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
5881	{	5881	{
5882	if (sgs->sum_nr_running > sgs->nr_numa_running)	5882	if (sgs->sum_nr_running > sgs->nr_numa_running)
5883	return regular;	5883	return regular;
5884	if (sgs->sum_nr_running > sgs->nr_preferred_running)	5884	if (sgs->sum_nr_running > sgs->nr_preferred_running)
5885	return remote;	5885	return remote;
5886	return all;	5886	return all;
5887	}	5887	}
5888		5888
5889	static inline enum fbq_type fbq_classify_rq(struct rq *rq)	5889	static inline enum fbq_type fbq_classify_rq(struct rq *rq)
5890	{	5890	{
5891	if (rq->nr_running > rq->nr_numa_running)	5891	if (rq->nr_running > rq->nr_numa_running)
5892	return regular;	5892	return regular;
5893	if (rq->nr_running > rq->nr_preferred_running)	5893	if (rq->nr_running > rq->nr_preferred_running)
5894	return remote;	5894	return remote;
5895	return all;	5895	return all;
5896	}	5896	}
5897	#else	5897	#else
5898	static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)	5898	static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
5899	{	5899	{
5900	return all;	5900	return all;
5901	}	5901	}
5902		5902
5903	static inline enum fbq_type fbq_classify_rq(struct rq *rq)	5903	static inline enum fbq_type fbq_classify_rq(struct rq *rq)
5904	{	5904	{
5905	return regular;	5905	return regular;
5906	}	5906	}
5907	#endif /* CONFIG_NUMA_BALANCING */	5907	#endif /* CONFIG_NUMA_BALANCING */
5908		5908
5909	/**	5909	/**
5910	* update_sd_lb_stats - Update sched_domain's statistics for load balancing.	5910	* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
5911	* @env: The load balancing environment.	5911	* @env: The load balancing environment.
5912	* @sds: variable to hold the statistics for this sched_domain.	5912	* @sds: variable to hold the statistics for this sched_domain.
5913	*/	5913	*/
5914	static inline void update_sd_lb_stats(struct lb_env env, struct sd_lb_stats sds)	5914	static inline void update_sd_lb_stats(struct lb_env env, struct sd_lb_stats sds)
5915	{	5915	{
5916	struct sched_domain *child = env->sd->child;	5916	struct sched_domain *child = env->sd->child;
5917	struct sched_group *sg = env->sd->groups;	5917	struct sched_group *sg = env->sd->groups;
5918	struct sg_lb_stats tmp_sgs;	5918	struct sg_lb_stats tmp_sgs;
5919	int load_idx, prefer_sibling = 0;	5919	int load_idx, prefer_sibling = 0;
5920		5920
5921	if (child && child->flags & SD_PREFER_SIBLING)	5921	if (child && child->flags & SD_PREFER_SIBLING)
5922	prefer_sibling = 1;	5922	prefer_sibling = 1;
5923		5923
5924	load_idx = get_sd_load_idx(env->sd, env->idle);	5924	load_idx = get_sd_load_idx(env->sd, env->idle);
5925		5925
5926	do {	5926	do {
5927	struct sg_lb_stats *sgs = &tmp_sgs;	5927	struct sg_lb_stats *sgs = &tmp_sgs;
5928	int local_group;	5928	int local_group;
5929		5929
5930	local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));	5930	local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
5931	if (local_group) {	5931	if (local_group) {
5932	sds->local = sg;	5932	sds->local = sg;
5933	sgs = &sds->local_stat;	5933	sgs = &sds->local_stat;
5934		5934
5935	if (env->idle != CPU_NEWLY_IDLE \|\|	5935	if (env->idle != CPU_NEWLY_IDLE \|\|
5936	time_after_eq(jiffies, sg->sgp->next_update))	5936	time_after_eq(jiffies, sg->sgp->next_update))
5937	update_group_power(env->sd, env->dst_cpu);	5937	update_group_power(env->sd, env->dst_cpu);
5938	}	5938	}
5939		5939
5940	update_sg_lb_stats(env, sg, load_idx, local_group, sgs);	5940	update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
5941		5941
5942	if (local_group)	5942	if (local_group)
5943	goto next_group;	5943	goto next_group;
5944		5944
5945	/*	5945	/*
5946	* In case the child domain prefers tasks go to siblings	5946	* In case the child domain prefers tasks go to siblings
5947	* first, lower the sg capacity to one so that we'll try	5947	* first, lower the sg capacity to one so that we'll try
5948	* and move all the excess tasks away. We lower the capacity	5948	* and move all the excess tasks away. We lower the capacity
5949	* of a group only if the local group has the capacity to fit	5949	* of a group only if the local group has the capacity to fit
5950	* these excess tasks, i.e. nr_running < group_capacity. The	5950	* these excess tasks, i.e. nr_running < group_capacity. The
5951	* extra check prevents the case where you always pull from the	5951	* extra check prevents the case where you always pull from the
5952	* heaviest group when it is already under-utilized (possible	5952	* heaviest group when it is already under-utilized (possible
5953	* with a large weight task outweighs the tasks on the system).	5953	* with a large weight task outweighs the tasks on the system).
5954	*/	5954	*/
5955	if (prefer_sibling && sds->local &&	5955	if (prefer_sibling && sds->local &&
5956	sds->local_stat.group_has_capacity)	5956	sds->local_stat.group_has_capacity)
5957	sgs->group_capacity = min(sgs->group_capacity, 1U);	5957	sgs->group_capacity = min(sgs->group_capacity, 1U);
5958		5958
5959	if (update_sd_pick_busiest(env, sds, sg, sgs)) {	5959	if (update_sd_pick_busiest(env, sds, sg, sgs)) {
5960	sds->busiest = sg;	5960	sds->busiest = sg;
5961	sds->busiest_stat = *sgs;	5961	sds->busiest_stat = *sgs;
5962	}	5962	}
5963		5963
5964	next_group:	5964	next_group:
5965	/* Now, start updating sd_lb_stats */	5965	/* Now, start updating sd_lb_stats */
5966	sds->total_load += sgs->group_load;	5966	sds->total_load += sgs->group_load;
5967	sds->total_pwr += sgs->group_power;	5967	sds->total_pwr += sgs->group_power;
5968		5968
5969	sg = sg->next;	5969	sg = sg->next;
5970	} while (sg != env->sd->groups);	5970	} while (sg != env->sd->groups);
5971		5971
5972	if (env->sd->flags & SD_NUMA)	5972	if (env->sd->flags & SD_NUMA)
5973	env->fbq_type = fbq_classify_group(&sds->busiest_stat);	5973	env->fbq_type = fbq_classify_group(&sds->busiest_stat);
5974	}	5974	}
5975		5975
5976	/**	5976	/**
5977	* check_asym_packing - Check to see if the group is packed into the	5977	* check_asym_packing - Check to see if the group is packed into the
5978	* sched doman.	5978	* sched doman.
5979	*	5979	*
5980	* This is primarily intended to used at the sibling level. Some	5980	* This is primarily intended to used at the sibling level. Some
5981	* cores like POWER7 prefer to use lower numbered SMT threads. In the	5981	* cores like POWER7 prefer to use lower numbered SMT threads. In the
5982	* case of POWER7, it can move to lower SMT modes only when higher	5982	* case of POWER7, it can move to lower SMT modes only when higher
5983	* threads are idle. When in lower SMT modes, the threads will	5983	* threads are idle. When in lower SMT modes, the threads will
5984	* perform better since they share less core resources. Hence when we	5984	* perform better since they share less core resources. Hence when we
5985	* have idle threads, we want them to be the higher ones.	5985	* have idle threads, we want them to be the higher ones.
5986	*	5986	*
5987	* This packing function is run on idle threads. It checks to see if	5987	* This packing function is run on idle threads. It checks to see if
5988	* the busiest CPU in this domain (core in the P7 case) has a higher	5988	* the busiest CPU in this domain (core in the P7 case) has a higher
5989	* CPU number than the packing function is being run on. Here we are	5989	* CPU number than the packing function is being run on. Here we are
5990	* assuming lower CPU number will be equivalent to lower a SMT thread	5990	* assuming lower CPU number will be equivalent to lower a SMT thread
5991	* number.	5991	* number.
5992	*	5992	*
5993	* Return: 1 when packing is required and a task should be moved to	5993	* Return: 1 when packing is required and a task should be moved to
5994	* this CPU. The amount of the imbalance is returned in *imbalance.	5994	* this CPU. The amount of the imbalance is returned in *imbalance.
5995	*	5995	*
5996	* @env: The load balancing environment.	5996	* @env: The load balancing environment.
5997	* @sds: Statistics of the sched_domain which is to be packed	5997	* @sds: Statistics of the sched_domain which is to be packed
5998	*/	5998	*/
5999	static int check_asym_packing(struct lb_env env, struct sd_lb_stats sds)	5999	static int check_asym_packing(struct lb_env env, struct sd_lb_stats sds)
6000	{	6000	{
6001	int busiest_cpu;	6001	int busiest_cpu;
6002		6002
6003	if (!(env->sd->flags & SD_ASYM_PACKING))	6003	if (!(env->sd->flags & SD_ASYM_PACKING))
6004	return 0;	6004	return 0;
6005		6005
6006	if (!sds->busiest)	6006	if (!sds->busiest)
6007	return 0;	6007	return 0;
6008		6008
6009	busiest_cpu = group_first_cpu(sds->busiest);	6009	busiest_cpu = group_first_cpu(sds->busiest);
6010	if (env->dst_cpu > busiest_cpu)	6010	if (env->dst_cpu > busiest_cpu)
6011	return 0;	6011	return 0;
6012		6012
6013	env->imbalance = DIV_ROUND_CLOSEST(	6013	env->imbalance = DIV_ROUND_CLOSEST(
6014	sds->busiest_stat.avg_load * sds->busiest_stat.group_power,	6014	sds->busiest_stat.avg_load * sds->busiest_stat.group_power,
6015	SCHED_POWER_SCALE);	6015	SCHED_POWER_SCALE);
6016		6016
6017	return 1;	6017	return 1;
6018	}	6018	}
6019		6019
6020	/**	6020	/**
6021	* fix_small_imbalance - Calculate the minor imbalance that exists	6021	* fix_small_imbalance - Calculate the minor imbalance that exists
6022	* amongst the groups of a sched_domain, during	6022	* amongst the groups of a sched_domain, during
6023	* load balancing.	6023	* load balancing.
6024	* @env: The load balancing environment.	6024	* @env: The load balancing environment.
6025	* @sds: Statistics of the sched_domain whose imbalance is to be calculated.	6025	* @sds: Statistics of the sched_domain whose imbalance is to be calculated.
6026	*/	6026	*/
6027	static inline	6027	static inline
6028	void fix_small_imbalance(struct lb_env env, struct sd_lb_stats sds)	6028	void fix_small_imbalance(struct lb_env env, struct sd_lb_stats sds)
6029	{	6029	{
6030	unsigned long tmp, pwr_now = 0, pwr_move = 0;	6030	unsigned long tmp, pwr_now = 0, pwr_move = 0;
6031	unsigned int imbn = 2;	6031	unsigned int imbn = 2;
6032	unsigned long scaled_busy_load_per_task;	6032	unsigned long scaled_busy_load_per_task;
6033	struct sg_lb_stats local, busiest;	6033	struct sg_lb_stats local, busiest;
6034		6034
6035	local = &sds->local_stat;	6035	local = &sds->local_stat;
6036	busiest = &sds->busiest_stat;	6036	busiest = &sds->busiest_stat;
6037		6037
6038	if (!local->sum_nr_running)	6038	if (!local->sum_nr_running)
6039	local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);	6039	local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
6040	else if (busiest->load_per_task > local->load_per_task)	6040	else if (busiest->load_per_task > local->load_per_task)
6041	imbn = 1;	6041	imbn = 1;
6042		6042
6043	scaled_busy_load_per_task =	6043	scaled_busy_load_per_task =
6044	(busiest->load_per_task * SCHED_POWER_SCALE) /	6044	(busiest->load_per_task * SCHED_POWER_SCALE) /
6045	busiest->group_power;	6045	busiest->group_power;
6046		6046
6047	if (busiest->avg_load + scaled_busy_load_per_task >=	6047	if (busiest->avg_load + scaled_busy_load_per_task >=
6048	local->avg_load + (scaled_busy_load_per_task * imbn)) {	6048	local->avg_load + (scaled_busy_load_per_task * imbn)) {
6049	env->imbalance = busiest->load_per_task;	6049	env->imbalance = busiest->load_per_task;
6050	return;	6050	return;
6051	}	6051	}
6052		6052
6053	/*	6053	/*
6054	* OK, we don't have enough imbalance to justify moving tasks,	6054	* OK, we don't have enough imbalance to justify moving tasks,
6055	* however we may be able to increase total CPU power used by	6055	* however we may be able to increase total CPU power used by
6056	* moving them.	6056	* moving them.
6057	*/	6057	*/
6058		6058
6059	pwr_now += busiest->group_power *	6059	pwr_now += busiest->group_power *
6060	min(busiest->load_per_task, busiest->avg_load);	6060	min(busiest->load_per_task, busiest->avg_load);
6061	pwr_now += local->group_power *	6061	pwr_now += local->group_power *
6062	min(local->load_per_task, local->avg_load);	6062	min(local->load_per_task, local->avg_load);
6063	pwr_now /= SCHED_POWER_SCALE;	6063	pwr_now /= SCHED_POWER_SCALE;
6064		6064
6065	/* Amount of load we'd subtract */	6065	/* Amount of load we'd subtract */
6066	if (busiest->avg_load > scaled_busy_load_per_task) {	6066	if (busiest->avg_load > scaled_busy_load_per_task) {
6067	pwr_move += busiest->group_power *	6067	pwr_move += busiest->group_power *
6068	min(busiest->load_per_task,	6068	min(busiest->load_per_task,
6069	busiest->avg_load - scaled_busy_load_per_task);	6069	busiest->avg_load - scaled_busy_load_per_task);
6070	}	6070	}
6071		6071
6072	/* Amount of load we'd add */	6072	/* Amount of load we'd add */
6073	if (busiest->avg_load * busiest->group_power <	6073	if (busiest->avg_load * busiest->group_power <
6074	busiest->load_per_task * SCHED_POWER_SCALE) {	6074	busiest->load_per_task * SCHED_POWER_SCALE) {
6075	tmp = (busiest->avg_load * busiest->group_power) /	6075	tmp = (busiest->avg_load * busiest->group_power) /
6076	local->group_power;	6076	local->group_power;
6077	} else {	6077	} else {
6078	tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /	6078	tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
6079	local->group_power;	6079	local->group_power;
6080	}	6080	}
6081	pwr_move += local->group_power *	6081	pwr_move += local->group_power *
6082	min(local->load_per_task, local->avg_load + tmp);	6082	min(local->load_per_task, local->avg_load + tmp);
6083	pwr_move /= SCHED_POWER_SCALE;	6083	pwr_move /= SCHED_POWER_SCALE;
6084		6084
6085	/* Move if we gain throughput */	6085	/* Move if we gain throughput */
6086	if (pwr_move > pwr_now)	6086	if (pwr_move > pwr_now)
6087	env->imbalance = busiest->load_per_task;	6087	env->imbalance = busiest->load_per_task;
6088	}	6088	}
6089		6089
6090	/**	6090	/**
6091	* calculate_imbalance - Calculate the amount of imbalance present within the	6091	* calculate_imbalance - Calculate the amount of imbalance present within the
6092	* groups of a given sched_domain during load balance.	6092	* groups of a given sched_domain during load balance.
6093	* @env: load balance environment	6093	* @env: load balance environment
6094	* @sds: statistics of the sched_domain whose imbalance is to be calculated.	6094	* @sds: statistics of the sched_domain whose imbalance is to be calculated.
6095	*/	6095	*/
6096	static inline void calculate_imbalance(struct lb_env env, struct sd_lb_stats sds)	6096	static inline void calculate_imbalance(struct lb_env env, struct sd_lb_stats sds)
6097	{	6097	{
6098	unsigned long max_pull, load_above_capacity = ~0UL;	6098	unsigned long max_pull, load_above_capacity = ~0UL;
6099	struct sg_lb_stats local, busiest;	6099	struct sg_lb_stats local, busiest;
6100		6100
6101	local = &sds->local_stat;	6101	local = &sds->local_stat;
6102	busiest = &sds->busiest_stat;	6102	busiest = &sds->busiest_stat;
6103		6103
6104	if (busiest->group_imb) {	6104	if (busiest->group_imb) {
6105	/*	6105	/*
6106	* In the group_imb case we cannot rely on group-wide averages	6106	* In the group_imb case we cannot rely on group-wide averages
6107	* to ensure cpu-load equilibrium, look at wider averages. XXX	6107	* to ensure cpu-load equilibrium, look at wider averages. XXX
6108	*/	6108	*/
6109	busiest->load_per_task =	6109	busiest->load_per_task =
6110	min(busiest->load_per_task, sds->avg_load);	6110	min(busiest->load_per_task, sds->avg_load);
6111	}	6111	}
6112		6112
6113	/*	6113	/*
6114	* In the presence of smp nice balancing, certain scenarios can have	6114	* In the presence of smp nice balancing, certain scenarios can have
6115	* max load less than avg load(as we skip the groups at or below	6115	* max load less than avg load(as we skip the groups at or below
6116	* its cpu_power, while calculating max_load..)	6116	* its cpu_power, while calculating max_load..)
6117	*/	6117	*/
6118	if (busiest->avg_load <= sds->avg_load \|\|	6118	if (busiest->avg_load <= sds->avg_load \|\|
6119	local->avg_load >= sds->avg_load) {	6119	local->avg_load >= sds->avg_load) {
6120	env->imbalance = 0;	6120	env->imbalance = 0;
6121	return fix_small_imbalance(env, sds);	6121	return fix_small_imbalance(env, sds);
6122	}	6122	}
6123		6123
6124	if (!busiest->group_imb) {	6124	if (!busiest->group_imb) {
6125	/*	6125	/*
6126	* Don't want to pull so many tasks that a group would go idle.	6126	* Don't want to pull so many tasks that a group would go idle.
6127	* Except of course for the group_imb case, since then we might	6127	* Except of course for the group_imb case, since then we might
6128	* have to drop below capacity to reach cpu-load equilibrium.	6128	* have to drop below capacity to reach cpu-load equilibrium.
6129	*/	6129	*/
6130	load_above_capacity =	6130	load_above_capacity =
6131	(busiest->sum_nr_running - busiest->group_capacity);	6131	(busiest->sum_nr_running - busiest->group_capacity);
6132		6132
6133	load_above_capacity = (SCHED_LOAD_SCALE SCHED_POWER_SCALE);	6133	load_above_capacity = (SCHED_LOAD_SCALE SCHED_POWER_SCALE);
6134	load_above_capacity /= busiest->group_power;	6134	load_above_capacity /= busiest->group_power;
6135	}	6135	}
6136		6136
6137	/*	6137	/*
6138	* We're trying to get all the cpus to the average_load, so we don't	6138	* We're trying to get all the cpus to the average_load, so we don't
6139	* want to push ourselves above the average load, nor do we wish to	6139	* want to push ourselves above the average load, nor do we wish to
6140	* reduce the max loaded cpu below the average load. At the same time,	6140	* reduce the max loaded cpu below the average load. At the same time,
6141	* we also don't want to reduce the group load below the group capacity	6141	* we also don't want to reduce the group load below the group capacity
6142	* (so that we can implement power-savings policies etc). Thus we look	6142	* (so that we can implement power-savings policies etc). Thus we look
6143	* for the minimum possible imbalance.	6143	* for the minimum possible imbalance.
6144	*/	6144	*/
6145	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);	6145	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
6146		6146
6147	/* How much load to actually move to equalise the imbalance */	6147	/* How much load to actually move to equalise the imbalance */
6148	env->imbalance = min(	6148	env->imbalance = min(
6149	max_pull * busiest->group_power,	6149	max_pull * busiest->group_power,
6150	(sds->avg_load - local->avg_load) * local->group_power	6150	(sds->avg_load - local->avg_load) * local->group_power
6151	) / SCHED_POWER_SCALE;	6151	) / SCHED_POWER_SCALE;
6152		6152
6153	/*	6153	/*
6154	* if *imbalance is less than the average load per runnable task	6154	* if *imbalance is less than the average load per runnable task
6155	* there is no guarantee that any tasks will be moved so we'll have	6155	* there is no guarantee that any tasks will be moved so we'll have
6156	* a think about bumping its value to force at least one task to be	6156	* a think about bumping its value to force at least one task to be
6157	* moved	6157	* moved
6158	*/	6158	*/
6159	if (env->imbalance < busiest->load_per_task)	6159	if (env->imbalance < busiest->load_per_task)
6160	return fix_small_imbalance(env, sds);	6160	return fix_small_imbalance(env, sds);
6161	}	6161	}
6162		6162
6163	/***** find_busiest_group() helpers end here *******************/	6163	/***** find_busiest_group() helpers end here *******************/
6164		6164
6165	/**	6165	/**
6166	* find_busiest_group - Returns the busiest group within the sched_domain	6166	* find_busiest_group - Returns the busiest group within the sched_domain
6167	* if there is an imbalance. If there isn't an imbalance, and	6167	* if there is an imbalance. If there isn't an imbalance, and
6168	* the user has opted for power-savings, it returns a group whose	6168	* the user has opted for power-savings, it returns a group whose
6169	* CPUs can be put to idle by rebalancing those tasks elsewhere, if	6169	* CPUs can be put to idle by rebalancing those tasks elsewhere, if
6170	* such a group exists.	6170	* such a group exists.
6171	*	6171	*
6172	* Also calculates the amount of weighted load which should be moved	6172	* Also calculates the amount of weighted load which should be moved
6173	* to restore balance.	6173	* to restore balance.
6174	*	6174	*
6175	* @env: The load balancing environment.	6175	* @env: The load balancing environment.
6176	*	6176	*
6177	* Return: - The busiest group if imbalance exists.	6177	* Return: - The busiest group if imbalance exists.
6178	* - If no imbalance and user has opted for power-savings balance,	6178	* - If no imbalance and user has opted for power-savings balance,
6179	* return the least loaded group whose CPUs can be	6179	* return the least loaded group whose CPUs can be
6180	* put to idle by rebalancing its tasks onto our group.	6180	* put to idle by rebalancing its tasks onto our group.
6181	*/	6181	*/
6182	static struct sched_group find_busiest_group(struct lb_env env)	6182	static struct sched_group find_busiest_group(struct lb_env env)
6183	{	6183	{
6184	struct sg_lb_stats local, busiest;	6184	struct sg_lb_stats local, busiest;
6185	struct sd_lb_stats sds;	6185	struct sd_lb_stats sds;
6186		6186
6187	init_sd_lb_stats(&sds);	6187	init_sd_lb_stats(&sds);
6188		6188
6189	/*	6189	/*
6190	* Compute the various statistics relavent for load balancing at	6190	* Compute the various statistics relavent for load balancing at
6191	* this level.	6191	* this level.
6192	*/	6192	*/
6193	update_sd_lb_stats(env, &sds);	6193	update_sd_lb_stats(env, &sds);
6194	local = &sds.local_stat;	6194	local = &sds.local_stat;
6195	busiest = &sds.busiest_stat;	6195	busiest = &sds.busiest_stat;
6196		6196
6197	if ((env->idle == CPU_IDLE \|\| env->idle == CPU_NEWLY_IDLE) &&	6197	if ((env->idle == CPU_IDLE \|\| env->idle == CPU_NEWLY_IDLE) &&
6198	check_asym_packing(env, &sds))	6198	check_asym_packing(env, &sds))
6199	return sds.busiest;	6199	return sds.busiest;
6200		6200
6201	/* There is no busy sibling group to pull tasks from */	6201	/* There is no busy sibling group to pull tasks from */
6202	if (!sds.busiest \|\| busiest->sum_nr_running == 0)	6202	if (!sds.busiest \|\| busiest->sum_nr_running == 0)
6203	goto out_balanced;	6203	goto out_balanced;
6204		6204
6205	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;	6205	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
6206		6206
6207	/*	6207	/*
6208	* If the busiest group is imbalanced the below checks don't	6208	* If the busiest group is imbalanced the below checks don't
6209	* work because they assume all things are equal, which typically	6209	* work because they assume all things are equal, which typically
6210	* isn't true due to cpus_allowed constraints and the like.	6210	* isn't true due to cpus_allowed constraints and the like.
6211	*/	6211	*/
6212	if (busiest->group_imb)	6212	if (busiest->group_imb)
6213	goto force_balance;	6213	goto force_balance;
6214		6214
6215	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */	6215	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
6216	if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity &&	6216	if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity &&
6217	!busiest->group_has_capacity)	6217	!busiest->group_has_capacity)
6218	goto force_balance;	6218	goto force_balance;
6219		6219
6220	/*	6220	/*
6221	* If the local group is more busy than the selected busiest group	6221	* If the local group is more busy than the selected busiest group
6222	* don't try and pull any tasks.	6222	* don't try and pull any tasks.
6223	*/	6223	*/
6224	if (local->avg_load >= busiest->avg_load)	6224	if (local->avg_load >= busiest->avg_load)
6225	goto out_balanced;	6225	goto out_balanced;
6226		6226
6227	/*	6227	/*
6228	* Don't pull any tasks if this group is already above the domain	6228	* Don't pull any tasks if this group is already above the domain
6229	* average load.	6229	* average load.
6230	*/	6230	*/
6231	if (local->avg_load >= sds.avg_load)	6231	if (local->avg_load >= sds.avg_load)
6232	goto out_balanced;	6232	goto out_balanced;
6233		6233
6234	if (env->idle == CPU_IDLE) {	6234	if (env->idle == CPU_IDLE) {
6235	/*	6235	/*
6236	* This cpu is idle. If the busiest group load doesn't	6236	* This cpu is idle. If the busiest group load doesn't
6237	* have more tasks than the number of available cpu's and	6237	* have more tasks than the number of available cpu's and
6238	* there is no imbalance between this and busiest group	6238	* there is no imbalance between this and busiest group
6239	* wrt to idle cpu's, it is balanced.	6239	* wrt to idle cpu's, it is balanced.
6240	*/	6240	*/
6241	if ((local->idle_cpus < busiest->idle_cpus) &&	6241	if ((local->idle_cpus < busiest->idle_cpus) &&
6242	busiest->sum_nr_running <= busiest->group_weight)	6242	busiest->sum_nr_running <= busiest->group_weight)
6243	goto out_balanced;	6243	goto out_balanced;
6244	} else {	6244	} else {
6245	/*	6245	/*
6246	* In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use	6246	* In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
6247	* imbalance_pct to be conservative.	6247	* imbalance_pct to be conservative.
6248	*/	6248	*/
6249	if (100 * busiest->avg_load <=	6249	if (100 * busiest->avg_load <=
6250	env->sd->imbalance_pct * local->avg_load)	6250	env->sd->imbalance_pct * local->avg_load)
6251	goto out_balanced;	6251	goto out_balanced;
6252	}	6252	}
6253		6253
6254	force_balance:	6254	force_balance:
6255	/* Looks like there is an imbalance. Compute it */	6255	/* Looks like there is an imbalance. Compute it */
6256	calculate_imbalance(env, &sds);	6256	calculate_imbalance(env, &sds);
6257	return sds.busiest;	6257	return sds.busiest;
6258		6258
6259	out_balanced:	6259	out_balanced:
6260	env->imbalance = 0;	6260	env->imbalance = 0;
6261	return NULL;	6261	return NULL;
6262	}	6262	}
6263		6263
6264	/*	6264	/*
6265	* find_busiest_queue - find the busiest runqueue among the cpus in group.	6265	* find_busiest_queue - find the busiest runqueue among the cpus in group.
6266	*/	6266	*/
6267	static struct rq *find_busiest_queue(struct lb_env *env,	6267	static struct rq *find_busiest_queue(struct lb_env *env,
6268	struct sched_group *group)	6268	struct sched_group *group)
6269	{	6269	{
6270	struct rq *busiest = NULL, *rq;	6270	struct rq *busiest = NULL, *rq;
6271	unsigned long busiest_load = 0, busiest_power = 1;	6271	unsigned long busiest_load = 0, busiest_power = 1;
6272	int i;	6272	int i;
6273		6273
6274	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {	6274	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
6275	unsigned long power, capacity, wl;	6275	unsigned long power, capacity, wl;
6276	enum fbq_type rt;	6276	enum fbq_type rt;
6277		6277
6278	rq = cpu_rq(i);	6278	rq = cpu_rq(i);
6279	rt = fbq_classify_rq(rq);	6279	rt = fbq_classify_rq(rq);
6280		6280
6281	/*	6281	/*
6282	* We classify groups/runqueues into three groups:	6282	* We classify groups/runqueues into three groups:
6283	* - regular: there are !numa tasks	6283	* - regular: there are !numa tasks
6284	* - remote: there are numa tasks that run on the 'wrong' node	6284	* - remote: there are numa tasks that run on the 'wrong' node
6285	* - all: there is no distinction	6285	* - all: there is no distinction
6286	*	6286	*
6287	* In order to avoid migrating ideally placed numa tasks,	6287	* In order to avoid migrating ideally placed numa tasks,
6288	* ignore those when there's better options.	6288	* ignore those when there's better options.
6289	*	6289	*
6290	* If we ignore the actual busiest queue to migrate another	6290	* If we ignore the actual busiest queue to migrate another
6291	* task, the next balance pass can still reduce the busiest	6291	* task, the next balance pass can still reduce the busiest
6292	* queue by moving tasks around inside the node.	6292	* queue by moving tasks around inside the node.
6293	*	6293	*
6294	* If we cannot move enough load due to this classification	6294	* If we cannot move enough load due to this classification
6295	* the next pass will adjust the group classification and	6295	* the next pass will adjust the group classification and
6296	* allow migration of more tasks.	6296	* allow migration of more tasks.
6297	*	6297	*
6298	* Both cases only affect the total convergence complexity.	6298	* Both cases only affect the total convergence complexity.
6299	*/	6299	*/
6300	if (rt > env->fbq_type)	6300	if (rt > env->fbq_type)
6301	continue;	6301	continue;
6302		6302
6303	power = power_of(i);	6303	power = power_of(i);
6304	capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);	6304	capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
6305	if (!capacity)	6305	if (!capacity)
6306	capacity = fix_small_capacity(env->sd, group);	6306	capacity = fix_small_capacity(env->sd, group);
6307		6307
6308	wl = weighted_cpuload(i);	6308	wl = weighted_cpuload(i);
6309		6309
6310	/*	6310	/*
6311	* When comparing with imbalance, use weighted_cpuload()	6311	* When comparing with imbalance, use weighted_cpuload()
6312	* which is not scaled with the cpu power.	6312	* which is not scaled with the cpu power.
6313	*/	6313	*/
6314	if (capacity && rq->nr_running == 1 && wl > env->imbalance)	6314	if (capacity && rq->nr_running == 1 && wl > env->imbalance)
6315	continue;	6315	continue;
6316		6316
6317	/*	6317	/*
6318	* For the load comparisons with the other cpu's, consider	6318	* For the load comparisons with the other cpu's, consider
6319	* the weighted_cpuload() scaled with the cpu power, so that	6319	* the weighted_cpuload() scaled with the cpu power, so that
6320	* the load can be moved away from the cpu that is potentially	6320	* the load can be moved away from the cpu that is potentially
6321	* running at a lower capacity.	6321	* running at a lower capacity.
6322	*	6322	*
6323	* Thus we're looking for max(wl_i / power_i), crosswise	6323	* Thus we're looking for max(wl_i / power_i), crosswise
6324	* multiplication to rid ourselves of the division works out	6324	* multiplication to rid ourselves of the division works out
6325	* to: wl_i * power_j > wl_j * power_i; where j is our	6325	* to: wl_i * power_j > wl_j * power_i; where j is our
6326	* previous maximum.	6326	* previous maximum.
6327	*/	6327	*/
6328	if (wl * busiest_power > busiest_load * power) {	6328	if (wl * busiest_power > busiest_load * power) {
6329	busiest_load = wl;	6329	busiest_load = wl;
6330	busiest_power = power;	6330	busiest_power = power;
6331	busiest = rq;	6331	busiest = rq;
6332	}	6332	}
6333	}	6333	}
6334		6334
6335	return busiest;	6335	return busiest;
6336	}	6336	}
6337		6337
6338	/*	6338	/*
6339	* Max backoff if we encounter pinned tasks. Pretty arbitrary value, but	6339	* Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
6340	* so long as it is large enough.	6340	* so long as it is large enough.
6341	*/	6341	*/
6342	#define MAX_PINNED_INTERVAL 512	6342	#define MAX_PINNED_INTERVAL 512
6343		6343
6344	/* Working cpumask for load_balance and load_balance_newidle. */	6344	/* Working cpumask for load_balance and load_balance_newidle. */
6345	DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);	6345	DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
6346		6346
6347	static int need_active_balance(struct lb_env *env)	6347	static int need_active_balance(struct lb_env *env)
6348	{	6348	{
6349	struct sched_domain *sd = env->sd;	6349	struct sched_domain *sd = env->sd;
6350		6350
6351	if (env->idle == CPU_NEWLY_IDLE) {	6351	if (env->idle == CPU_NEWLY_IDLE) {
6352		6352
6353	/*	6353	/*
6354	* ASYM_PACKING needs to force migrate tasks from busy but	6354	* ASYM_PACKING needs to force migrate tasks from busy but
6355	* higher numbered CPUs in order to pack all tasks in the	6355	* higher numbered CPUs in order to pack all tasks in the
6356	* lowest numbered CPUs.	6356	* lowest numbered CPUs.
6357	*/	6357	*/
6358	if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)	6358	if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
6359	return 1;	6359	return 1;
6360	}	6360	}
6361		6361
6362	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);	6362	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
6363	}	6363	}
6364		6364
6365	static int active_load_balance_cpu_stop(void *data);	6365	static int active_load_balance_cpu_stop(void *data);
6366		6366
6367	static int should_we_balance(struct lb_env *env)	6367	static int should_we_balance(struct lb_env *env)
6368	{	6368	{
6369	struct sched_group *sg = env->sd->groups;	6369	struct sched_group *sg = env->sd->groups;
6370	struct cpumask *sg_cpus, *sg_mask;	6370	struct cpumask *sg_cpus, *sg_mask;
6371	int cpu, balance_cpu = -1;	6371	int cpu, balance_cpu = -1;
6372		6372
6373	/*	6373	/*
6374	* In the newly idle case, we will allow all the cpu's	6374	* In the newly idle case, we will allow all the cpu's
6375	* to do the newly idle load balance.	6375	* to do the newly idle load balance.
6376	*/	6376	*/
6377	if (env->idle == CPU_NEWLY_IDLE)	6377	if (env->idle == CPU_NEWLY_IDLE)
6378	return 1;	6378	return 1;
6379		6379
6380	sg_cpus = sched_group_cpus(sg);	6380	sg_cpus = sched_group_cpus(sg);
6381	sg_mask = sched_group_mask(sg);	6381	sg_mask = sched_group_mask(sg);
6382	/* Try to find first idle cpu */	6382	/* Try to find first idle cpu */
6383	for_each_cpu_and(cpu, sg_cpus, env->cpus) {	6383	for_each_cpu_and(cpu, sg_cpus, env->cpus) {
6384	if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))	6384	if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
6385	continue;	6385	continue;
6386		6386
6387	balance_cpu = cpu;	6387	balance_cpu = cpu;
6388	break;	6388	break;
6389	}	6389	}
6390		6390
6391	if (balance_cpu == -1)	6391	if (balance_cpu == -1)
6392	balance_cpu = group_balance_cpu(sg);	6392	balance_cpu = group_balance_cpu(sg);
6393		6393
6394	/*	6394	/*
6395	* First idle cpu or the first cpu(busiest) in this sched group	6395	* First idle cpu or the first cpu(busiest) in this sched group
6396	* is eligible for doing load balancing at this and above domains.	6396	* is eligible for doing load balancing at this and above domains.
6397	*/	6397	*/
6398	return balance_cpu == env->dst_cpu;	6398	return balance_cpu == env->dst_cpu;
6399	}	6399	}
6400		6400
6401	/*	6401	/*
6402	* Check this_cpu to ensure it is balanced within domain. Attempt to move	6402	* Check this_cpu to ensure it is balanced within domain. Attempt to move
6403	* tasks if there is an imbalance.	6403	* tasks if there is an imbalance.
6404	*/	6404	*/
6405	static int load_balance(int this_cpu, struct rq *this_rq,	6405	static int load_balance(int this_cpu, struct rq *this_rq,
6406	struct sched_domain *sd, enum cpu_idle_type idle,	6406	struct sched_domain *sd, enum cpu_idle_type idle,
6407	int *continue_balancing)	6407	int *continue_balancing)
6408	{	6408	{
6409	int ld_moved, cur_ld_moved, active_balance = 0;	6409	int ld_moved, cur_ld_moved, active_balance = 0;
6410	struct sched_domain *sd_parent = sd->parent;	6410	struct sched_domain *sd_parent = sd->parent;
6411	struct sched_group *group;	6411	struct sched_group *group;
6412	struct rq *busiest;	6412	struct rq *busiest;
6413	unsigned long flags;	6413	unsigned long flags;
6414	struct cpumask *cpus = __get_cpu_var(load_balance_mask);	6414	struct cpumask *cpus = __get_cpu_var(load_balance_mask);
6415		6415
6416	struct lb_env env = {	6416	struct lb_env env = {
6417	.sd = sd,	6417	.sd = sd,
6418	.dst_cpu = this_cpu,	6418	.dst_cpu = this_cpu,
6419	.dst_rq = this_rq,	6419	.dst_rq = this_rq,
6420	.dst_grpmask = sched_group_cpus(sd->groups),	6420	.dst_grpmask = sched_group_cpus(sd->groups),
6421	.idle = idle,	6421	.idle = idle,
6422	.loop_break = sched_nr_migrate_break,	6422	.loop_break = sched_nr_migrate_break,
6423	.cpus = cpus,	6423	.cpus = cpus,
6424	.fbq_type = all,	6424	.fbq_type = all,
6425	};	6425	};
6426		6426
6427	/*	6427	/*
6428	* For NEWLY_IDLE load_balancing, we don't need to consider	6428	* For NEWLY_IDLE load_balancing, we don't need to consider
6429	* other cpus in our group	6429	* other cpus in our group
6430	*/	6430	*/
6431	if (idle == CPU_NEWLY_IDLE)	6431	if (idle == CPU_NEWLY_IDLE)
6432	env.dst_grpmask = NULL;	6432	env.dst_grpmask = NULL;
6433		6433
6434	cpumask_copy(cpus, cpu_active_mask);	6434	cpumask_copy(cpus, cpu_active_mask);
6435		6435
6436	schedstat_inc(sd, lb_count[idle]);	6436	schedstat_inc(sd, lb_count[idle]);
6437		6437
6438	redo:	6438	redo:
6439	if (!should_we_balance(&env)) {	6439	if (!should_we_balance(&env)) {
6440	*continue_balancing = 0;	6440	*continue_balancing = 0;
6441	goto out_balanced;	6441	goto out_balanced;
6442	}	6442	}
6443		6443
6444	group = find_busiest_group(&env);	6444	group = find_busiest_group(&env);
6445	if (!group) {	6445	if (!group) {
6446	schedstat_inc(sd, lb_nobusyg[idle]);	6446	schedstat_inc(sd, lb_nobusyg[idle]);
6447	goto out_balanced;	6447	goto out_balanced;
6448	}	6448	}
6449		6449
6450	busiest = find_busiest_queue(&env, group);	6450	busiest = find_busiest_queue(&env, group);
6451	if (!busiest) {	6451	if (!busiest) {
6452	schedstat_inc(sd, lb_nobusyq[idle]);	6452	schedstat_inc(sd, lb_nobusyq[idle]);
6453	goto out_balanced;	6453	goto out_balanced;
6454	}	6454	}
6455		6455
6456	BUG_ON(busiest == env.dst_rq);	6456	BUG_ON(busiest == env.dst_rq);
6457		6457
6458	schedstat_add(sd, lb_imbalance[idle], env.imbalance);	6458	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
6459		6459
6460	ld_moved = 0;	6460	ld_moved = 0;
6461	if (busiest->nr_running > 1) {	6461	if (busiest->nr_running > 1) {
6462	/*	6462	/*
6463	* Attempt to move tasks. If find_busiest_group has found	6463	* Attempt to move tasks. If find_busiest_group has found
6464	* an imbalance but busiest->nr_running <= 1, the group is	6464	* an imbalance but busiest->nr_running <= 1, the group is
6465	* still unbalanced. ld_moved simply stays zero, so it is	6465	* still unbalanced. ld_moved simply stays zero, so it is
6466	* correctly treated as an imbalance.	6466	* correctly treated as an imbalance.
6467	*/	6467	*/
6468	env.flags |= LBF_ALL_PINNED;	6468	env.flags |= LBF_ALL_PINNED;
6469	env.src_cpu = busiest->cpu;	6469	env.src_cpu = busiest->cpu;
6470	env.src_rq = busiest;	6470	env.src_rq = busiest;
6471	env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);	6471	env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
6472		6472
6473	more_balance:	6473	more_balance:
6474	local_irq_save(flags);	6474	local_irq_save(flags);
6475	double_rq_lock(env.dst_rq, busiest);	6475	double_rq_lock(env.dst_rq, busiest);
6476		6476
6477	/*	6477	/*
6478	* cur_ld_moved - load moved in current iteration	6478	* cur_ld_moved - load moved in current iteration
6479	* ld_moved - cumulative load moved across iterations	6479	* ld_moved - cumulative load moved across iterations
6480	*/	6480	*/
6481	cur_ld_moved = move_tasks(&env);	6481	cur_ld_moved = move_tasks(&env);
6482	ld_moved += cur_ld_moved;	6482	ld_moved += cur_ld_moved;
6483	double_rq_unlock(env.dst_rq, busiest);	6483	double_rq_unlock(env.dst_rq, busiest);
6484	local_irq_restore(flags);	6484	local_irq_restore(flags);
6485		6485
6486	/*	6486	/*
6487	* some other cpu did the load balance for us.	6487	* some other cpu did the load balance for us.
6488	*/	6488	*/
6489	if (cur_ld_moved && env.dst_cpu != smp_processor_id())	6489	if (cur_ld_moved && env.dst_cpu != smp_processor_id())
6490	resched_cpu(env.dst_cpu);	6490	resched_cpu(env.dst_cpu);
6491		6491
6492	if (env.flags & LBF_NEED_BREAK) {	6492	if (env.flags & LBF_NEED_BREAK) {
6493	env.flags &= ~LBF_NEED_BREAK;	6493	env.flags &= ~LBF_NEED_BREAK;
6494	goto more_balance;	6494	goto more_balance;
6495	}	6495	}
6496		6496
6497	/*	6497	/*
6498	* Revisit (affine) tasks on src_cpu that couldn't be moved to	6498	* Revisit (affine) tasks on src_cpu that couldn't be moved to
6499	* us and move them to an alternate dst_cpu in our sched_group	6499	* us and move them to an alternate dst_cpu in our sched_group
6500	* where they can run. The upper limit on how many times we	6500	* where they can run. The upper limit on how many times we
6501	* iterate on same src_cpu is dependent on number of cpus in our	6501	* iterate on same src_cpu is dependent on number of cpus in our
6502	* sched_group.	6502	* sched_group.
6503	*	6503	*
6504	* This changes load balance semantics a bit on who can move	6504	* This changes load balance semantics a bit on who can move
6505	* load to a given_cpu. In addition to the given_cpu itself	6505	* load to a given_cpu. In addition to the given_cpu itself
6506	* (or a ilb_cpu acting on its behalf where given_cpu is	6506	* (or a ilb_cpu acting on its behalf where given_cpu is
6507	* nohz-idle), we now have balance_cpu in a position to move	6507	* nohz-idle), we now have balance_cpu in a position to move
6508	* load to given_cpu. In rare situations, this may cause	6508	* load to given_cpu. In rare situations, this may cause
6509	* conflicts (balance_cpu and given_cpu/ilb_cpu deciding	6509	* conflicts (balance_cpu and given_cpu/ilb_cpu deciding
6510	* _independently_ and at _same_ time to move some load to	6510	* _independently_ and at _same_ time to move some load to
6511	* given_cpu) causing excess load to be moved to given_cpu.	6511	* given_cpu) causing excess load to be moved to given_cpu.
6512	* This however should not happen so much in practice and	6512	* This however should not happen so much in practice and
6513	* moreover subsequent load balance cycles should correct the	6513	* moreover subsequent load balance cycles should correct the
6514	* excess load moved.	6514	* excess load moved.
6515	*/	6515	*/
6516	if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {	6516	if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
6517		6517
6518	/* Prevent re-selecting dst_cpu via env's cpus */	6518	/* Prevent re-selecting dst_cpu via env's cpus */
6519	cpumask_clear_cpu(env.dst_cpu, env.cpus);	6519	cpumask_clear_cpu(env.dst_cpu, env.cpus);
6520		6520
6521	env.dst_rq = cpu_rq(env.new_dst_cpu);	6521	env.dst_rq = cpu_rq(env.new_dst_cpu);
6522	env.dst_cpu = env.new_dst_cpu;	6522	env.dst_cpu = env.new_dst_cpu;
6523	env.flags &= ~LBF_DST_PINNED;	6523	env.flags &= ~LBF_DST_PINNED;
6524	env.loop = 0;	6524	env.loop = 0;
6525	env.loop_break = sched_nr_migrate_break;	6525	env.loop_break = sched_nr_migrate_break;
6526		6526
6527	/*	6527	/*
6528	* Go back to "more_balance" rather than "redo" since we	6528	* Go back to "more_balance" rather than "redo" since we
6529	* need to continue with same src_cpu.	6529	* need to continue with same src_cpu.
6530	*/	6530	*/
6531	goto more_balance;	6531	goto more_balance;
6532	}	6532	}
6533		6533
6534	/*	6534	/*
6535	* We failed to reach balance because of affinity.	6535	* We failed to reach balance because of affinity.
6536	*/	6536	*/
6537	if (sd_parent) {	6537	if (sd_parent) {
6538	int *group_imbalance = &sd_parent->groups->sgp->imbalance;	6538	int *group_imbalance = &sd_parent->groups->sgp->imbalance;
6539		6539
6540	if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {	6540	if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
6541	*group_imbalance = 1;	6541	*group_imbalance = 1;
6542	} else if (*group_imbalance)	6542	} else if (*group_imbalance)
6543	*group_imbalance = 0;	6543	*group_imbalance = 0;
6544	}	6544	}
6545		6545
6546	/* All tasks on this runqueue were pinned by CPU affinity */	6546	/* All tasks on this runqueue were pinned by CPU affinity */
6547	if (unlikely(env.flags & LBF_ALL_PINNED)) {	6547	if (unlikely(env.flags & LBF_ALL_PINNED)) {
6548	cpumask_clear_cpu(cpu_of(busiest), cpus);	6548	cpumask_clear_cpu(cpu_of(busiest), cpus);
6549	if (!cpumask_empty(cpus)) {	6549	if (!cpumask_empty(cpus)) {
6550	env.loop = 0;	6550	env.loop = 0;
6551	env.loop_break = sched_nr_migrate_break;	6551	env.loop_break = sched_nr_migrate_break;
6552	goto redo;	6552	goto redo;
6553	}	6553	}
6554	goto out_balanced;	6554	goto out_balanced;
6555	}	6555	}
6556	}	6556	}
6557		6557
6558	if (!ld_moved) {	6558	if (!ld_moved) {
6559	schedstat_inc(sd, lb_failed[idle]);	6559	schedstat_inc(sd, lb_failed[idle]);
6560	/*	6560	/*
6561	* Increment the failure counter only on periodic balance.	6561	* Increment the failure counter only on periodic balance.
6562	* We do not want newidle balance, which can be very	6562	* We do not want newidle balance, which can be very
6563	* frequent, pollute the failure counter causing	6563	* frequent, pollute the failure counter causing
6564	* excessive cache_hot migrations and active balances.	6564	* excessive cache_hot migrations and active balances.
6565	*/	6565	*/
6566	if (idle != CPU_NEWLY_IDLE)	6566	if (idle != CPU_NEWLY_IDLE)
6567	sd->nr_balance_failed++;	6567	sd->nr_balance_failed++;
6568		6568
6569	if (need_active_balance(&env)) {	6569	if (need_active_balance(&env)) {
6570	raw_spin_lock_irqsave(&busiest->lock, flags);	6570	raw_spin_lock_irqsave(&busiest->lock, flags);
6571		6571
6572	/* don't kick the active_load_balance_cpu_stop,	6572	/* don't kick the active_load_balance_cpu_stop,
6573	* if the curr task on busiest cpu can't be	6573	* if the curr task on busiest cpu can't be
6574	* moved to this_cpu	6574	* moved to this_cpu
6575	*/	6575	*/
6576	if (!cpumask_test_cpu(this_cpu,	6576	if (!cpumask_test_cpu(this_cpu,
6577	tsk_cpus_allowed(busiest->curr))) {	6577	tsk_cpus_allowed(busiest->curr))) {
6578	raw_spin_unlock_irqrestore(&busiest->lock,	6578	raw_spin_unlock_irqrestore(&busiest->lock,
6579	flags);	6579	flags);
6580	env.flags |= LBF_ALL_PINNED;	6580	env.flags |= LBF_ALL_PINNED;
6581	goto out_one_pinned;	6581	goto out_one_pinned;
6582	}	6582	}
6583		6583
6584	/*	6584	/*
6585	* ->active_balance synchronizes accesses to	6585	* ->active_balance synchronizes accesses to
6586	* ->active_balance_work. Once set, it's cleared	6586	* ->active_balance_work. Once set, it's cleared
6587	* only after active load balance is finished.	6587	* only after active load balance is finished.
6588	*/	6588	*/
6589	if (!busiest->active_balance) {	6589	if (!busiest->active_balance) {
6590	busiest->active_balance = 1;	6590	busiest->active_balance = 1;
6591	busiest->push_cpu = this_cpu;	6591	busiest->push_cpu = this_cpu;
6592	active_balance = 1;	6592	active_balance = 1;
6593	}	6593	}
6594	raw_spin_unlock_irqrestore(&busiest->lock, flags);	6594	raw_spin_unlock_irqrestore(&busiest->lock, flags);
6595		6595
6596	if (active_balance) {	6596	if (active_balance) {
6597	stop_one_cpu_nowait(cpu_of(busiest),	6597	stop_one_cpu_nowait(cpu_of(busiest),
6598	active_load_balance_cpu_stop, busiest,	6598	active_load_balance_cpu_stop, busiest,
6599	&busiest->active_balance_work);	6599	&busiest->active_balance_work);
6600	}	6600	}
6601		6601
6602	/*	6602	/*
6603	* We've kicked active balancing, reset the failure	6603	* We've kicked active balancing, reset the failure
6604	* counter.	6604	* counter.
6605	*/	6605	*/
6606	sd->nr_balance_failed = sd->cache_nice_tries+1;	6606	sd->nr_balance_failed = sd->cache_nice_tries+1;
6607	}	6607	}
6608	} else	6608	} else
6609	sd->nr_balance_failed = 0;	6609	sd->nr_balance_failed = 0;
6610		6610
6611	if (likely(!active_balance)) {	6611	if (likely(!active_balance)) {
6612	/* We were unbalanced, so reset the balancing interval */	6612	/* We were unbalanced, so reset the balancing interval */
6613	sd->balance_interval = sd->min_interval;	6613	sd->balance_interval = sd->min_interval;
6614	} else {	6614	} else {
6615	/*	6615	/*
6616	* If we've begun active balancing, start to back off. This	6616	* If we've begun active balancing, start to back off. This
6617	* case may not be covered by the all_pinned logic if there	6617	* case may not be covered by the all_pinned logic if there
6618	* is only 1 task on the busy runqueue (because we don't call	6618	* is only 1 task on the busy runqueue (because we don't call
6619	* move_tasks).	6619	* move_tasks).
6620	*/	6620	*/
6621	if (sd->balance_interval < sd->max_interval)	6621	if (sd->balance_interval < sd->max_interval)
6622	sd->balance_interval *= 2;	6622	sd->balance_interval *= 2;
6623	}	6623	}
6624		6624
6625	goto out;	6625	goto out;
6626		6626
6627	out_balanced:	6627	out_balanced:
6628	schedstat_inc(sd, lb_balanced[idle]);	6628	schedstat_inc(sd, lb_balanced[idle]);
6629		6629
6630	sd->nr_balance_failed = 0;	6630	sd->nr_balance_failed = 0;
6631		6631
6632	out_one_pinned:	6632	out_one_pinned:
6633	/* tune up the balancing interval */	6633	/* tune up the balancing interval */
6634	if (((env.flags & LBF_ALL_PINNED) &&	6634	if (((env.flags & LBF_ALL_PINNED) &&
6635	sd->balance_interval < MAX_PINNED_INTERVAL) ||	6635	sd->balance_interval < MAX_PINNED_INTERVAL) ||
6636	(sd->balance_interval < sd->max_interval))	6636	(sd->balance_interval < sd->max_interval))
6637	sd->balance_interval *= 2;	6637	sd->balance_interval *= 2;
6638		6638
6639	ld_moved = 0;	6639	ld_moved = 0;
6640	out:	6640	out:
6641	return ld_moved;	6641	return ld_moved;
6642	}	6642	}
6643		6643
6644	/*	6644	/*
6645	* idle_balance is called by schedule() if this_cpu is about to become	6645	* idle_balance is called by schedule() if this_cpu is about to become
6646	* idle. Attempts to pull tasks from other CPUs.	6646	* idle. Attempts to pull tasks from other CPUs.
6647	*/	6647	*/
6648	static int idle_balance(struct rq *this_rq)	6648	static int idle_balance(struct rq *this_rq)
6649	{	6649	{
6650	struct sched_domain *sd;	6650	struct sched_domain *sd;
6651	int pulled_task = 0;	6651	int pulled_task = 0;
6652	unsigned long next_balance = jiffies + HZ;	6652	unsigned long next_balance = jiffies + HZ;
6653	u64 curr_cost = 0;	6653	u64 curr_cost = 0;
6654	int this_cpu = this_rq->cpu;	6654	int this_cpu = this_rq->cpu;
6655		6655
6656	idle_enter_fair(this_rq);	6656	idle_enter_fair(this_rq);
6657		6657
6658	/*	6658	/*
6659	* We must set idle_stamp _before_ calling idle_balance(), such that we	6659	* We must set idle_stamp _before_ calling idle_balance(), such that we
6660	* measure the duration of idle_balance() as idle time.	6660	* measure the duration of idle_balance() as idle time.
6661	*/	6661	*/
6662	this_rq->idle_stamp = rq_clock(this_rq);	6662	this_rq->idle_stamp = rq_clock(this_rq);
6663		6663
6664	if (this_rq->avg_idle < sysctl_sched_migration_cost)	6664	if (this_rq->avg_idle < sysctl_sched_migration_cost)
6665	goto out;	6665	goto out;
6666		6666
6667	/*	6667	/*
6668	* Drop the rq->lock, but keep IRQ/preempt disabled.	6668	* Drop the rq->lock, but keep IRQ/preempt disabled.
6669	*/	6669	*/
6670	raw_spin_unlock(&this_rq->lock);	6670	raw_spin_unlock(&this_rq->lock);
6671		6671
6672	update_blocked_averages(this_cpu);	6672	update_blocked_averages(this_cpu);
6673	rcu_read_lock();	6673	rcu_read_lock();
6674	for_each_domain(this_cpu, sd) {	6674	for_each_domain(this_cpu, sd) {
6675	unsigned long interval;	6675	unsigned long interval;
6676	int continue_balancing = 1;	6676	int continue_balancing = 1;
6677	u64 t0, domain_cost;	6677	u64 t0, domain_cost;
6678		6678
6679	if (!(sd->flags & SD_LOAD_BALANCE))	6679	if (!(sd->flags & SD_LOAD_BALANCE))
6680	continue;	6680	continue;
6681		6681
6682	if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)	6682	if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
6683	break;	6683	break;
6684		6684
6685	if (sd->flags & SD_BALANCE_NEWIDLE) {	6685	if (sd->flags & SD_BALANCE_NEWIDLE) {
6686	t0 = sched_clock_cpu(this_cpu);	6686	t0 = sched_clock_cpu(this_cpu);
6687		6687
6688	/* If we've pulled tasks over stop searching: */	6688	/* If we've pulled tasks over stop searching: */
6689	pulled_task = load_balance(this_cpu, this_rq,	6689	pulled_task = load_balance(this_cpu, this_rq,
6690	sd, CPU_NEWLY_IDLE,	6690	sd, CPU_NEWLY_IDLE,
6691	&continue_balancing);	6691	&continue_balancing);
6692		6692
6693	domain_cost = sched_clock_cpu(this_cpu) - t0;	6693	domain_cost = sched_clock_cpu(this_cpu) - t0;
6694	if (domain_cost > sd->max_newidle_lb_cost)	6694	if (domain_cost > sd->max_newidle_lb_cost)
6695	sd->max_newidle_lb_cost = domain_cost;	6695	sd->max_newidle_lb_cost = domain_cost;
6696		6696
6697	curr_cost += domain_cost;	6697	curr_cost += domain_cost;
6698	}	6698	}
6699		6699
6700	interval = msecs_to_jiffies(sd->balance_interval);	6700	interval = msecs_to_jiffies(sd->balance_interval);
6701	if (time_after(next_balance, sd->last_balance + interval))	6701	if (time_after(next_balance, sd->last_balance + interval))
6702	next_balance = sd->last_balance + interval;	6702	next_balance = sd->last_balance + interval;
6703	if (pulled_task)	6703	if (pulled_task)
6704	break;	6704	break;
6705	}	6705	}
6706	rcu_read_unlock();	6706	rcu_read_unlock();
6707		6707
6708	raw_spin_lock(&this_rq->lock);	6708	raw_spin_lock(&this_rq->lock);
6709		6709
6710	if (curr_cost > this_rq->max_idle_balance_cost)	6710	if (curr_cost > this_rq->max_idle_balance_cost)
6711	this_rq->max_idle_balance_cost = curr_cost;	6711	this_rq->max_idle_balance_cost = curr_cost;
6712		6712
6713	/*	6713	/*
6714	* While browsing the domains, we released the rq lock, a task could	6714	* While browsing the domains, we released the rq lock, a task could
6715	* have been enqueued in the meantime. Since we're not going idle,	6715	* have been enqueued in the meantime. Since we're not going idle,
6716	* pretend we pulled a task.	6716	* pretend we pulled a task.
6717	*/	6717	*/
6718	if (this_rq->cfs.h_nr_running && !pulled_task)	6718	if (this_rq->cfs.h_nr_running && !pulled_task)
6719	pulled_task = 1;	6719	pulled_task = 1;
6720		6720
6721	if (pulled_task || time_after(jiffies, this_rq->next_balance)) {	6721	if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
6722	/*	6722	/*
6723	* We are going idle. next_balance may be set based on	6723	* We are going idle. next_balance may be set based on
6724	* a busy processor. So reset next_balance.	6724	* a busy processor. So reset next_balance.
6725	*/	6725	*/
6726	this_rq->next_balance = next_balance;	6726	this_rq->next_balance = next_balance;
6727	}	6727	}
6728		6728
6729	out:	6729	out:
6730	/* Is there a task of a high priority class? */	6730	/* Is there a task of a high priority class? */
6731	if (this_rq->nr_running != this_rq->cfs.h_nr_running &&	6731	if (this_rq->nr_running != this_rq->cfs.h_nr_running &&
6732	((this_rq->stop && this_rq->stop->on_rq) ||	6732	((this_rq->stop && this_rq->stop->on_rq) ||
6733	this_rq->dl.dl_nr_running ||	6733	this_rq->dl.dl_nr_running ||
6734	(this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))	6734	(this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
6735	pulled_task = -1;	6735	pulled_task = -1;
6736		6736
6737	if (pulled_task) {	6737	if (pulled_task) {
6738	idle_exit_fair(this_rq);	6738	idle_exit_fair(this_rq);
6739	this_rq->idle_stamp = 0;	6739	this_rq->idle_stamp = 0;
6740	}	6740	}
6741		6741
6742	return pulled_task;	6742	return pulled_task;
6743	}	6743	}
6744		6744
6745	/*	6745	/*
6746	* active_load_balance_cpu_stop is run by cpu stopper. It pushes	6746	* active_load_balance_cpu_stop is run by cpu stopper. It pushes
6747	* running tasks off the busiest CPU onto idle CPUs. It requires at	6747	* running tasks off the busiest CPU onto idle CPUs. It requires at
6748	* least 1 task to be running on each physical CPU where possible, and	6748	* least 1 task to be running on each physical CPU where possible, and
6749	* avoids physical / logical imbalances.	6749	* avoids physical / logical imbalances.
6750	*/	6750	*/
6751	static int active_load_balance_cpu_stop(void *data)	6751	static int active_load_balance_cpu_stop(void *data)
6752	{	6752	{
6753	struct rq *busiest_rq = data;	6753	struct rq *busiest_rq = data;
6754	int busiest_cpu = cpu_of(busiest_rq);	6754	int busiest_cpu = cpu_of(busiest_rq);
6755	int target_cpu = busiest_rq->push_cpu;	6755	int target_cpu = busiest_rq->push_cpu;
6756	struct rq *target_rq = cpu_rq(target_cpu);	6756	struct rq *target_rq = cpu_rq(target_cpu);
6757	struct sched_domain *sd;	6757	struct sched_domain *sd;
6758		6758
6759	raw_spin_lock_irq(&busiest_rq->lock);	6759	raw_spin_lock_irq(&busiest_rq->lock);
6760		6760
6761	/* make sure the requested cpu hasn't gone down in the meantime */	6761	/* make sure the requested cpu hasn't gone down in the meantime */
6762	if (unlikely(busiest_cpu != smp_processor_id() ||	6762	if (unlikely(busiest_cpu != smp_processor_id() ||
6763	!busiest_rq->active_balance))	6763	!busiest_rq->active_balance))
6764	goto out_unlock;	6764	goto out_unlock;
6765		6765
6766	/* Is there any task to move? */	6766	/* Is there any task to move? */
6767	if (busiest_rq->nr_running <= 1)	6767	if (busiest_rq->nr_running <= 1)
6768	goto out_unlock;	6768	goto out_unlock;
6769		6769
6770	/*	6770	/*
6771	* This condition is "impossible", if it occurs	6771	* This condition is "impossible", if it occurs
6772	* we need to fix it. Originally reported by	6772	* we need to fix it. Originally reported by
6773	* Bjorn Helgaas on a 128-cpu setup.	6773	* Bjorn Helgaas on a 128-cpu setup.
6774	*/	6774	*/
6775	BUG_ON(busiest_rq == target_rq);	6775	BUG_ON(busiest_rq == target_rq);
6776		6776
6777	/* move a task from busiest_rq to target_rq */	6777	/* move a task from busiest_rq to target_rq */
6778	double_lock_balance(busiest_rq, target_rq);	6778	double_lock_balance(busiest_rq, target_rq);
6779		6779
6780	/* Search for an sd spanning us and the target CPU. */	6780	/* Search for an sd spanning us and the target CPU. */
6781	rcu_read_lock();	6781	rcu_read_lock();
6782	for_each_domain(target_cpu, sd) {	6782	for_each_domain(target_cpu, sd) {
6783	if ((sd->flags & SD_LOAD_BALANCE) &&	6783	if ((sd->flags & SD_LOAD_BALANCE) &&
6784	cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))	6784	cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
6785	break;	6785	break;
6786	}	6786	}
6787		6787
6788	if (likely(sd)) {	6788	if (likely(sd)) {
6789	struct lb_env env = {	6789	struct lb_env env = {
6790	.sd = sd,	6790	.sd = sd,
6791	.dst_cpu = target_cpu,	6791	.dst_cpu = target_cpu,
6792	.dst_rq = target_rq,	6792	.dst_rq = target_rq,
6793	.src_cpu = busiest_rq->cpu,	6793	.src_cpu = busiest_rq->cpu,
6794	.src_rq = busiest_rq,	6794	.src_rq = busiest_rq,
6795	.idle = CPU_IDLE,	6795	.idle = CPU_IDLE,
6796	};	6796	};
6797		6797
6798	schedstat_inc(sd, alb_count);	6798	schedstat_inc(sd, alb_count);
6799		6799
6800	if (move_one_task(&env))	6800	if (move_one_task(&env))
6801	schedstat_inc(sd, alb_pushed);	6801	schedstat_inc(sd, alb_pushed);
6802	else	6802	else
6803	schedstat_inc(sd, alb_failed);	6803	schedstat_inc(sd, alb_failed);
6804	}	6804	}
6805	rcu_read_unlock();	6805	rcu_read_unlock();
6806	double_unlock_balance(busiest_rq, target_rq);	6806	double_unlock_balance(busiest_rq, target_rq);
6807	out_unlock:	6807	out_unlock:
6808	busiest_rq->active_balance = 0;	6808	busiest_rq->active_balance = 0;
6809	raw_spin_unlock_irq(&busiest_rq->lock);	6809	raw_spin_unlock_irq(&busiest_rq->lock);
6810	return 0;	6810	return 0;
6811	}	6811	}
6812		6812
6813	static inline int on_null_domain(struct rq *rq)	6813	static inline int on_null_domain(struct rq *rq)
6814	{	6814	{
6815	return unlikely(!rcu_dereference_sched(rq->sd));	6815	return unlikely(!rcu_dereference_sched(rq->sd));
6816	}	6816	}
6817		6817
6818	#ifdef CONFIG_NO_HZ_COMMON	6818	#ifdef CONFIG_NO_HZ_COMMON
6819	/*	6819	/*
6820	* idle load balancing details	6820	* idle load balancing details
6821	* - When one of the busy CPUs notice that there may be an idle rebalancing	6821	* - When one of the busy CPUs notice that there may be an idle rebalancing
6822	* needed, they will kick the idle load balancer, which then does idle	6822	* needed, they will kick the idle load balancer, which then does idle
6823	* load balancing for all the idle CPUs.	6823	* load balancing for all the idle CPUs.
6824	*/	6824	*/
6825	static struct {	6825	static struct {
6826	cpumask_var_t idle_cpus_mask;	6826	cpumask_var_t idle_cpus_mask;
6827	atomic_t nr_cpus;	6827	atomic_t nr_cpus;
6828	unsigned long next_balance; /* in jiffy units */	6828	unsigned long next_balance; /* in jiffy units */
6829	} nohz ____cacheline_aligned;	6829	} nohz ____cacheline_aligned;
6830		6830
6831	static inline int find_new_ilb(void)	6831	static inline int find_new_ilb(void)
6832	{	6832	{
6833	int ilb = cpumask_first(nohz.idle_cpus_mask);	6833	int ilb = cpumask_first(nohz.idle_cpus_mask);
6834		6834
6835	if (ilb < nr_cpu_ids && idle_cpu(ilb))	6835	if (ilb < nr_cpu_ids && idle_cpu(ilb))
6836	return ilb;	6836	return ilb;
6837		6837
6838	return nr_cpu_ids;	6838	return nr_cpu_ids;
6839	}	6839	}
6840		6840
6841	/*	6841	/*
6842	* Kick a CPU to do the nohz balancing, if it is time for it. We pick the	6842	* Kick a CPU to do the nohz balancing, if it is time for it. We pick the
6843	* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle	6843	* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
6844	* CPU (if there is one).	6844	* CPU (if there is one).
6845	*/	6845	*/
6846	static void nohz_balancer_kick(void)	6846	static void nohz_balancer_kick(void)
6847	{	6847	{
6848	int ilb_cpu;	6848	int ilb_cpu;
6849		6849
6850	nohz.next_balance++;	6850	nohz.next_balance++;
6851		6851
6852	ilb_cpu = find_new_ilb();	6852	ilb_cpu = find_new_ilb();
6853		6853
6854	if (ilb_cpu >= nr_cpu_ids)	6854	if (ilb_cpu >= nr_cpu_ids)
6855	return;	6855	return;
6856		6856
6857	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))	6857	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
6858	return;	6858	return;
6859	/*	6859	/*
6860	* Use smp_send_reschedule() instead of resched_cpu().	6860	* Use smp_send_reschedule() instead of resched_cpu().
6861	* This way we generate a sched IPI on the target cpu which	6861	* This way we generate a sched IPI on the target cpu which
6862	* is idle. And the softirq performing nohz idle load balance	6862	* is idle. And the softirq performing nohz idle load balance
6863	* will be run before returning from the IPI.	6863	* will be run before returning from the IPI.
6864	*/	6864	*/
6865	smp_send_reschedule(ilb_cpu);	6865	smp_send_reschedule(ilb_cpu);
6866	return;	6866	return;
6867	}	6867	}
6868		6868
6869	static inline void nohz_balance_exit_idle(int cpu)	6869	static inline void nohz_balance_exit_idle(int cpu)
6870	{	6870	{
6871	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {	6871	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
6872	/*	6872	/*
6873	* Completely isolated CPUs don't ever set, so we must test.	6873	* Completely isolated CPUs don't ever set, so we must test.
6874	*/	6874	*/
6875	if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {	6875	if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
6876	cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);	6876	cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
6877	atomic_dec(&nohz.nr_cpus);	6877	atomic_dec(&nohz.nr_cpus);
6878	}	6878	}
6879	clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));	6879	clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
6880	}	6880	}
6881	}	6881	}
6882		6882
6883	static inline void set_cpu_sd_state_busy(void)	6883	static inline void set_cpu_sd_state_busy(void)
6884	{	6884	{
6885	struct sched_domain *sd;	6885	struct sched_domain *sd;
6886	int cpu = smp_processor_id();	6886	int cpu = smp_processor_id();
6887		6887
6888	rcu_read_lock();	6888	rcu_read_lock();
6889	sd = rcu_dereference(per_cpu(sd_busy, cpu));	6889	sd = rcu_dereference(per_cpu(sd_busy, cpu));
6890		6890
6891	if (!sd || !sd->nohz_idle)	6891	if (!sd || !sd->nohz_idle)
6892	goto unlock;	6892	goto unlock;
6893	sd->nohz_idle = 0;	6893	sd->nohz_idle = 0;
6894		6894
6895	atomic_inc(&sd->groups->sgp->nr_busy_cpus);	6895	atomic_inc(&sd->groups->sgp->nr_busy_cpus);
6896	unlock:	6896	unlock:
6897	rcu_read_unlock();	6897	rcu_read_unlock();
6898	}	6898	}
6899		6899
6900	void set_cpu_sd_state_idle(void)	6900	void set_cpu_sd_state_idle(void)
6901	{	6901	{
6902	struct sched_domain *sd;	6902	struct sched_domain *sd;
6903	int cpu = smp_processor_id();	6903	int cpu = smp_processor_id();
6904		6904
6905	rcu_read_lock();	6905	rcu_read_lock();
6906	sd = rcu_dereference(per_cpu(sd_busy, cpu));	6906	sd = rcu_dereference(per_cpu(sd_busy, cpu));
6907		6907
6908	if (!sd || sd->nohz_idle)	6908	if (!sd || sd->nohz_idle)
6909	goto unlock;	6909	goto unlock;
6910	sd->nohz_idle = 1;	6910	sd->nohz_idle = 1;
6911		6911
6912	atomic_dec(&sd->groups->sgp->nr_busy_cpus);	6912	atomic_dec(&sd->groups->sgp->nr_busy_cpus);
6913	unlock:	6913	unlock:
6914	rcu_read_unlock();	6914	rcu_read_unlock();
6915	}	6915	}
6916		6916
6917	/*	6917	/*
6918	* This routine will record that the cpu is going idle with tick stopped.	6918	* This routine will record that the cpu is going idle with tick stopped.
6919	* This info will be used in performing idle load balancing in the future.	6919	* This info will be used in performing idle load balancing in the future.
6920	*/	6920	*/
6921	void nohz_balance_enter_idle(int cpu)	6921	void nohz_balance_enter_idle(int cpu)
6922	{	6922	{
6923	/*	6923	/*
6924	* If this cpu is going down, then nothing needs to be done.	6924	* If this cpu is going down, then nothing needs to be done.
6925	*/	6925	*/
6926	if (!cpu_active(cpu))	6926	if (!cpu_active(cpu))
6927	return;	6927	return;
6928		6928
6929	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))	6929	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
6930	return;	6930	return;
6931		6931
6932	/*	6932	/*
6933	* If we're a completely isolated CPU, we don't play.	6933	* If we're a completely isolated CPU, we don't play.
6934	*/	6934	*/
6935	if (on_null_domain(cpu_rq(cpu)))	6935	if (on_null_domain(cpu_rq(cpu)))
6936	return;	6936	return;
6937		6937
6938	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);	6938	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
6939	atomic_inc(&nohz.nr_cpus);	6939	atomic_inc(&nohz.nr_cpus);
6940	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));	6940	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
6941	}	6941	}
6942		6942
6943	static int sched_ilb_notifier(struct notifier_block *nfb,	6943	static int sched_ilb_notifier(struct notifier_block *nfb,
6944	unsigned long action, void *hcpu)	6944	unsigned long action, void *hcpu)
6945	{	6945	{
6946	switch (action & ~CPU_TASKS_FROZEN) {	6946	switch (action & ~CPU_TASKS_FROZEN) {
6947	case CPU_DYING:	6947	case CPU_DYING:
6948	nohz_balance_exit_idle(smp_processor_id());	6948	nohz_balance_exit_idle(smp_processor_id());
6949	return NOTIFY_OK;	6949	return NOTIFY_OK;
6950	default:	6950	default:
6951	return NOTIFY_DONE;	6951	return NOTIFY_DONE;
6952	}	6952	}
6953	}	6953	}
6954	#endif	6954	#endif
6955		6955
6956	static DEFINE_SPINLOCK(balancing);	6956	static DEFINE_SPINLOCK(balancing);
6957		6957
6958	/*	6958	/*
6959	* Scale the max load_balance interval with the number of CPUs in the system.	6959	* Scale the max load_balance interval with the number of CPUs in the system.
6960	* This trades load-balance latency on larger machines for less cross talk.	6960	* This trades load-balance latency on larger machines for less cross talk.
6961	*/	6961	*/
6962	void update_max_interval(void)	6962	void update_max_interval(void)
6963	{	6963	{
6964	max_load_balance_interval = HZ*num_online_cpus()/10;	6964	max_load_balance_interval = HZ*num_online_cpus()/10;
6965	}	6965	}
6966		6966
6967	/*	6967	/*
6968	* It checks each scheduling domain to see if it is due to be balanced,	6968	* It checks each scheduling domain to see if it is due to be balanced,
6969	* and initiates a balancing operation if so.	6969	* and initiates a balancing operation if so.
6970	*	6970	*
6971	* Balancing parameters are set up in init_sched_domains.	6971	* Balancing parameters are set up in init_sched_domains.
6972	*/	6972	*/
6973	static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)	6973	static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
6974	{	6974	{
6975	int continue_balancing = 1;	6975	int continue_balancing = 1;
6976	int cpu = rq->cpu;	6976	int cpu = rq->cpu;
6977	unsigned long interval;	6977	unsigned long interval;
6978	struct sched_domain *sd;	6978	struct sched_domain *sd;
6979	/* Earliest time when we have to do rebalance again */	6979	/* Earliest time when we have to do rebalance again */
6980	unsigned long next_balance = jiffies + 60*HZ;	6980	unsigned long next_balance = jiffies + 60*HZ;
6981	int update_next_balance = 0;	6981	int update_next_balance = 0;
6982	int need_serialize, need_decay = 0;	6982	int need_serialize, need_decay = 0;
6983	u64 max_cost = 0;	6983	u64 max_cost = 0;
6984		6984
6985	update_blocked_averages(cpu);	6985	update_blocked_averages(cpu);
6986		6986
6987	rcu_read_lock();	6987	rcu_read_lock();
6988	for_each_domain(cpu, sd) {	6988	for_each_domain(cpu, sd) {
6989	/*	6989	/*
6990	* Decay the newidle max times here because this is a regular	6990	* Decay the newidle max times here because this is a regular
6991	* visit to all the domains. Decay ~1% per second.	6991	* visit to all the domains. Decay ~1% per second.
6992	*/	6992	*/
6993	if (time_after(jiffies, sd->next_decay_max_lb_cost)) {	6993	if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
6994	sd->max_newidle_lb_cost =	6994	sd->max_newidle_lb_cost =
6995	(sd->max_newidle_lb_cost * 253) / 256;	6995	(sd->max_newidle_lb_cost * 253) / 256;
6996	sd->next_decay_max_lb_cost = jiffies + HZ;	6996	sd->next_decay_max_lb_cost = jiffies + HZ;
6997	need_decay = 1;	6997	need_decay = 1;
6998	}	6998	}
6999	max_cost += sd->max_newidle_lb_cost;	6999	max_cost += sd->max_newidle_lb_cost;
7000		7000
7001	if (!(sd->flags & SD_LOAD_BALANCE))	7001	if (!(sd->flags & SD_LOAD_BALANCE))
7002	continue;	7002	continue;
7003		7003
7004	/*	7004	/*
7005	* Stop the load balance at this level. There is another	7005	* Stop the load balance at this level. There is another
7006	* CPU in our sched group which is doing load balancing more	7006	* CPU in our sched group which is doing load balancing more
7007	* actively.	7007	* actively.
7008	*/	7008	*/
7009	if (!continue_balancing) {	7009	if (!continue_balancing) {
7010	if (need_decay)	7010	if (need_decay)
7011	continue;	7011	continue;
7012	break;	7012	break;
7013	}	7013	}
7014		7014
7015	interval = sd->balance_interval;	7015	interval = sd->balance_interval;
7016	if (idle != CPU_IDLE)	7016	if (idle != CPU_IDLE)
7017	interval *= sd->busy_factor;	7017	interval *= sd->busy_factor;
7018		7018
7019	/* scale ms to jiffies */	7019	/* scale ms to jiffies */
7020	interval = msecs_to_jiffies(interval);	7020	interval = msecs_to_jiffies(interval);
7021	interval = clamp(interval, 1UL, max_load_balance_interval);	7021	interval = clamp(interval, 1UL, max_load_balance_interval);
7022		7022
7023	need_serialize = sd->flags & SD_SERIALIZE;	7023	need_serialize = sd->flags & SD_SERIALIZE;
7024		7024
7025	if (need_serialize) {	7025	if (need_serialize) {
7026	if (!spin_trylock(&balancing))	7026	if (!spin_trylock(&balancing))
7027	goto out;	7027	goto out;
7028	}	7028	}
7029		7029
7030	if (time_after_eq(jiffies, sd->last_balance + interval)) {	7030	if (time_after_eq(jiffies, sd->last_balance + interval)) {
7031	if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {	7031	if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
7032	/*	7032	/*
7033	* The LBF_DST_PINNED logic could have changed	7033	* The LBF_DST_PINNED logic could have changed
7034	* env->dst_cpu, so we can't know our idle	7034	* env->dst_cpu, so we can't know our idle
7035	* state even if we migrated tasks. Update it.	7035	* state even if we migrated tasks. Update it.
7036	*/	7036	*/
7037	idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;	7037	idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
7038	}	7038	}
7039	sd->last_balance = jiffies;	7039	sd->last_balance = jiffies;
7040	}	7040	}
7041	if (need_serialize)	7041	if (need_serialize)
7042	spin_unlock(&balancing);	7042	spin_unlock(&balancing);
7043	out:	7043	out:
7044	if (time_after(next_balance, sd->last_balance + interval)) {	7044	if (time_after(next_balance, sd->last_balance + interval)) {
7045	next_balance = sd->last_balance + interval;	7045	next_balance = sd->last_balance + interval;
7046	update_next_balance = 1;	7046	update_next_balance = 1;
7047	}	7047	}
7048	}	7048	}
7049	if (need_decay) {	7049	if (need_decay) {
7050	/*	7050	/*
7051	* Ensure the rq-wide value also decays but keep it at a	7051	* Ensure the rq-wide value also decays but keep it at a
7052	* reasonable floor to avoid funnies with rq->avg_idle.	7052	* reasonable floor to avoid funnies with rq->avg_idle.
7053	*/	7053	*/
7054	rq->max_idle_balance_cost =	7054	rq->max_idle_balance_cost =
7055	max((u64)sysctl_sched_migration_cost, max_cost);	7055	max((u64)sysctl_sched_migration_cost, max_cost);
7056	}	7056	}
7057	rcu_read_unlock();	7057	rcu_read_unlock();
7058		7058
7059	/*	7059	/*
7060	* next_balance will be updated only when there is a need.	7060	* next_balance will be updated only when there is a need.
7061	* When the cpu is attached to null domain for ex, it will not be	7061	* When the cpu is attached to null domain for ex, it will not be
7062	* updated.	7062	* updated.
7063	*/	7063	*/
7064	if (likely(update_next_balance))	7064	if (likely(update_next_balance))
7065	rq->next_balance = next_balance;	7065	rq->next_balance = next_balance;
7066	}	7066	}
7067		7067
7068	#ifdef CONFIG_NO_HZ_COMMON	7068	#ifdef CONFIG_NO_HZ_COMMON
7069	/*	7069	/*
7070	* In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the	7070	* In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
7071	* rebalancing for all the cpus for whom scheduler ticks are stopped.	7071	* rebalancing for all the cpus for whom scheduler ticks are stopped.
7072	*/	7072	*/
7073	static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)	7073	static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
7074	{	7074	{
7075	int this_cpu = this_rq->cpu;	7075	int this_cpu = this_rq->cpu;
7076	struct rq *rq;	7076	struct rq *rq;
7077	int balance_cpu;	7077	int balance_cpu;
7078		7078
7079	if (idle != CPU_IDLE ||	7079	if (idle != CPU_IDLE ||
7080	!test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))	7080	!test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
7081	goto end;	7081	goto end;
7082		7082
7083	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {	7083	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
7084	if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))	7084	if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
7085	continue;	7085	continue;
7086		7086
7087	/*	7087	/*
7088	* If this cpu gets work to do, stop the load balancing	7088	* If this cpu gets work to do, stop the load balancing
7089	* work being done for other cpus. Next load	7089	* work being done for other cpus. Next load
7090	* balancing owner will pick it up.	7090	* balancing owner will pick it up.
7091	*/	7091	*/
7092	if (need_resched())	7092	if (need_resched())
7093	break;	7093	break;
7094		7094
7095	rq = cpu_rq(balance_cpu);	7095	rq = cpu_rq(balance_cpu);
7096		7096
7097	raw_spin_lock_irq(&rq->lock);	7097	raw_spin_lock_irq(&rq->lock);
7098	update_rq_clock(rq);	7098	update_rq_clock(rq);
7099	update_idle_cpu_load(rq);	7099	update_idle_cpu_load(rq);
7100	raw_spin_unlock_irq(&rq->lock);	7100	raw_spin_unlock_irq(&rq->lock);
7101		7101
7102	rebalance_domains(rq, CPU_IDLE);	7102	rebalance_domains(rq, CPU_IDLE);
7103		7103
7104	if (time_after(this_rq->next_balance, rq->next_balance))	7104	if (time_after(this_rq->next_balance, rq->next_balance))
7105	this_rq->next_balance = rq->next_balance;	7105	this_rq->next_balance = rq->next_balance;
7106	}	7106	}
7107	nohz.next_balance = this_rq->next_balance;	7107	nohz.next_balance = this_rq->next_balance;
7108	end:	7108	end:
7109	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));	7109	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
7110	}	7110	}
7111		7111
7112	/*	7112	/*
7113	* Current heuristic for kicking the idle load balancer in the presence	7113	* Current heuristic for kicking the idle load balancer in the presence
7114	* of an idle cpu is the system.	7114	* of an idle cpu is the system.
7115	* - This rq has more than one task.	7115	* - This rq has more than one task.
7116	* - At any scheduler domain level, this cpu's scheduler group has multiple	7116	* - At any scheduler domain level, this cpu's scheduler group has multiple
7117	* busy cpu's exceeding the group's power.	7117	* busy cpu's exceeding the group's power.
7118	* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler	7118	* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
7119	* domain span are idle.	7119	* domain span are idle.
7120	*/	7120	*/
7121	static inline int nohz_kick_needed(struct rq *rq)	7121	static inline int nohz_kick_needed(struct rq *rq)
7122	{	7122	{
7123	unsigned long now = jiffies;	7123	unsigned long now = jiffies;
7124	struct sched_domain *sd;	7124	struct sched_domain *sd;
7125	struct sched_group_power *sgp;	7125	struct sched_group_power *sgp;
7126	int nr_busy, cpu = rq->cpu;	7126	int nr_busy, cpu = rq->cpu;
7127		7127
7128	if (unlikely(rq->idle_balance))	7128	if (unlikely(rq->idle_balance))
7129	return 0;	7129	return 0;
7130		7130
7131	/*	7131	/*
7132	* We may be recently in ticked or tickless idle mode. At the first	7132	* We may be recently in ticked or tickless idle mode. At the first
7133	* busy tick after returning from idle, we will update the busy stats.	7133	* busy tick after returning from idle, we will update the busy stats.
7134	*/	7134	*/
7135	set_cpu_sd_state_busy();	7135	set_cpu_sd_state_busy();
7136	nohz_balance_exit_idle(cpu);	7136	nohz_balance_exit_idle(cpu);
7137		7137
7138	/*	7138	/*
7139	* None are in tickless mode and hence no need for NOHZ idle load	7139	* None are in tickless mode and hence no need for NOHZ idle load
7140	* balancing.	7140	* balancing.
7141	*/	7141	*/
7142	if (likely(!atomic_read(&nohz.nr_cpus)))	7142	if (likely(!atomic_read(&nohz.nr_cpus)))
7143	return 0;	7143	return 0;
7144		7144
7145	if (time_before(now, nohz.next_balance))	7145	if (time_before(now, nohz.next_balance))
7146	return 0;	7146	return 0;
7147		7147
7148	if (rq->nr_running >= 2)	7148	if (rq->nr_running >= 2)
7149	goto need_kick;	7149	goto need_kick;
7150		7150
7151	rcu_read_lock();	7151	rcu_read_lock();
7152	sd = rcu_dereference(per_cpu(sd_busy, cpu));	7152	sd = rcu_dereference(per_cpu(sd_busy, cpu));
7153		7153
7154	if (sd) {	7154	if (sd) {
7155	sgp = sd->groups->sgp;	7155	sgp = sd->groups->sgp;
7156	nr_busy = atomic_read(&sgp->nr_busy_cpus);	7156	nr_busy = atomic_read(&sgp->nr_busy_cpus);
7157		7157
7158	if (nr_busy > 1)	7158	if (nr_busy > 1)
7159	goto need_kick_unlock;	7159	goto need_kick_unlock;
7160	}	7160	}
7161		7161
7162	sd = rcu_dereference(per_cpu(sd_asym, cpu));	7162	sd = rcu_dereference(per_cpu(sd_asym, cpu));
7163		7163
7164	if (sd && (cpumask_first_and(nohz.idle_cpus_mask,	7164	if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
7165	sched_domain_span(sd)) < cpu))	7165	sched_domain_span(sd)) < cpu))
7166	goto need_kick_unlock;	7166	goto need_kick_unlock;
7167		7167
7168	rcu_read_unlock();	7168	rcu_read_unlock();
7169	return 0;	7169	return 0;
7170		7170
7171	need_kick_unlock:	7171	need_kick_unlock:
7172	rcu_read_unlock();	7172	rcu_read_unlock();
7173	need_kick:	7173	need_kick:
7174	return 1;	7174	return 1;
7175	}	7175	}
7176	#else	7176	#else
7177	static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }	7177	static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
7178	#endif	7178	#endif
7179		7179
7180	/*	7180	/*
7181	* run_rebalance_domains is triggered when needed from the scheduler tick.	7181	* run_rebalance_domains is triggered when needed from the scheduler tick.
7182	* Also triggered for nohz idle balancing (with nohz_balancing_kick set).	7182	* Also triggered for nohz idle balancing (with nohz_balancing_kick set).
7183	*/	7183	*/
7184	static void run_rebalance_domains(struct softirq_action *h)	7184	static void run_rebalance_domains(struct softirq_action *h)
7185	{	7185	{
7186	struct rq *this_rq = this_rq();	7186	struct rq *this_rq = this_rq();
7187	enum cpu_idle_type idle = this_rq->idle_balance ?	7187	enum cpu_idle_type idle = this_rq->idle_balance ?
7188	CPU_IDLE : CPU_NOT_IDLE;	7188	CPU_IDLE : CPU_NOT_IDLE;
7189		7189
7190	rebalance_domains(this_rq, idle);	7190	rebalance_domains(this_rq, idle);
7191		7191
7192	/*	7192	/*
7193	* If this cpu has a pending nohz_balance_kick, then do the	7193	* If this cpu has a pending nohz_balance_kick, then do the
7194	* balancing on behalf of the other idle cpus whose ticks are	7194	* balancing on behalf of the other idle cpus whose ticks are
7195	* stopped.	7195	* stopped.
7196	*/	7196	*/
7197	nohz_idle_balance(this_rq, idle);	7197	nohz_idle_balance(this_rq, idle);
7198	}	7198	}
7199		7199
7200	/*	7200	/*
7201	* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.	7201	* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
7202	*/	7202	*/
7203	void trigger_load_balance(struct rq *rq)	7203	void trigger_load_balance(struct rq *rq)
7204	{	7204	{
7205	/* Don't need to rebalance while attached to NULL domain */	7205	/* Don't need to rebalance while attached to NULL domain */
7206	if (unlikely(on_null_domain(rq)))	7206	if (unlikely(on_null_domain(rq)))
7207	return;	7207	return;
7208		7208
7209	if (time_after_eq(jiffies, rq->next_balance))	7209	if (time_after_eq(jiffies, rq->next_balance))
7210	raise_softirq(SCHED_SOFTIRQ);	7210	raise_softirq(SCHED_SOFTIRQ);
7211	#ifdef CONFIG_NO_HZ_COMMON	7211	#ifdef CONFIG_NO_HZ_COMMON
7212	if (nohz_kick_needed(rq))	7212	if (nohz_kick_needed(rq))
7213	nohz_balancer_kick();	7213	nohz_balancer_kick();
7214	#endif	7214	#endif
7215	}	7215	}
7216		7216
7217	static void rq_online_fair(struct rq *rq)	7217	static void rq_online_fair(struct rq *rq)
7218	{	7218	{
7219	update_sysctl();	7219	update_sysctl();
7220	}	7220	}
7221		7221
7222	static void rq_offline_fair(struct rq *rq)	7222	static void rq_offline_fair(struct rq *rq)
7223	{	7223	{
7224	update_sysctl();	7224	update_sysctl();
7225		7225
7226	/* Ensure any throttled groups are reachable by pick_next_task */	7226	/* Ensure any throttled groups are reachable by pick_next_task */
7227	unthrottle_offline_cfs_rqs(rq);	7227	unthrottle_offline_cfs_rqs(rq);
7228	}	7228	}
7229		7229
7230	#endif /* CONFIG_SMP */	7230	#endif /* CONFIG_SMP */
7231		7231
7232	/*	7232	/*
7233	* scheduler tick hitting a task of our scheduling class:	7233	* scheduler tick hitting a task of our scheduling class:
7234	*/	7234	*/
7235	static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)	7235	static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
7236	{	7236	{
7237	struct cfs_rq *cfs_rq;	7237	struct cfs_rq *cfs_rq;
7238	struct sched_entity *se = &curr->se;	7238	struct sched_entity *se = &curr->se;
7239		7239
7240	for_each_sched_entity(se) {	7240	for_each_sched_entity(se) {
7241	cfs_rq = cfs_rq_of(se);	7241	cfs_rq = cfs_rq_of(se);
7242	entity_tick(cfs_rq, se, queued);	7242	entity_tick(cfs_rq, se, queued);
7243	}	7243	}
7244		7244
7245	if (numabalancing_enabled)	7245	if (numabalancing_enabled)
7246	task_tick_numa(rq, curr);	7246	task_tick_numa(rq, curr);
7247		7247
7248	update_rq_runnable_avg(rq, 1);	7248	update_rq_runnable_avg(rq, 1);
7249	}	7249	}
7250		7250
7251	/*	7251	/*
7252	* called on fork with the child task as argument from the parent's context	7252	* called on fork with the child task as argument from the parent's context
7253	* - child not yet on the tasklist	7253	* - child not yet on the tasklist
7254	* - preemption disabled	7254	* - preemption disabled
7255	*/	7255	*/
7256	static void task_fork_fair(struct task_struct *p)	7256	static void task_fork_fair(struct task_struct *p)
7257	{	7257	{
7258	struct cfs_rq *cfs_rq;	7258	struct cfs_rq *cfs_rq;
7259	struct sched_entity *se = &p->se, *curr;	7259	struct sched_entity *se = &p->se, *curr;
7260	int this_cpu = smp_processor_id();	7260	int this_cpu = smp_processor_id();
7261	struct rq *rq = this_rq();	7261	struct rq *rq = this_rq();
7262	unsigned long flags;	7262	unsigned long flags;
7263		7263
7264	raw_spin_lock_irqsave(&rq->lock, flags);	7264	raw_spin_lock_irqsave(&rq->lock, flags);
7265		7265
7266	update_rq_clock(rq);	7266	update_rq_clock(rq);
7267		7267
7268	cfs_rq = task_cfs_rq(current);	7268	cfs_rq = task_cfs_rq(current);
7269	curr = cfs_rq->curr;	7269	curr = cfs_rq->curr;
7270		7270
7271	/*	7271	/*
7272	* Not only the cpu but also the task_group of the parent might have	7272	* Not only the cpu but also the task_group of the parent might have
7273	* been changed after parent->se.parent,cfs_rq were copied to	7273	* been changed after parent->se.parent,cfs_rq were copied to
7274	* child->se.parent,cfs_rq. So call __set_task_cpu() to make those	7274	* child->se.parent,cfs_rq. So call __set_task_cpu() to make those
7275	* of child point to valid ones.	7275	* of child point to valid ones.
7276	*/	7276	*/
7277	rcu_read_lock();	7277	rcu_read_lock();
7278	__set_task_cpu(p, this_cpu);	7278	__set_task_cpu(p, this_cpu);
7279	rcu_read_unlock();	7279	rcu_read_unlock();
7280		7280
7281	update_curr(cfs_rq);	7281	update_curr(cfs_rq);
7282		7282
7283	if (curr)	7283	if (curr)
7284	se->vruntime = curr->vruntime;	7284	se->vruntime = curr->vruntime;
7285	place_entity(cfs_rq, se, 1);	7285	place_entity(cfs_rq, se, 1);
7286		7286
7287	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {	7287	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
7288	/*	7288	/*
7289	* Upon rescheduling, sched_class::put_prev_task() will place	7289	* Upon rescheduling, sched_class::put_prev_task() will place
7290	* 'current' within the tree based on its new key value.	7290	* 'current' within the tree based on its new key value.
7291	*/	7291	*/
7292	swap(curr->vruntime, se->vruntime);	7292	swap(curr->vruntime, se->vruntime);
7293	resched_task(rq->curr);	7293	resched_task(rq->curr);
7294	}	7294	}
7295		7295
7296	se->vruntime -= cfs_rq->min_vruntime;	7296	se->vruntime -= cfs_rq->min_vruntime;
7297		7297
7298	raw_spin_unlock_irqrestore(&rq->lock, flags);	7298	raw_spin_unlock_irqrestore(&rq->lock, flags);
7299	}	7299	}
7300		7300
7301	/*	7301	/*
7302	* Priority of the task has changed. Check to see if we preempt	7302	* Priority of the task has changed. Check to see if we preempt
7303	* the current task.	7303	* the current task.
7304	*/	7304	*/
7305	static void	7305	static void
7306	prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)	7306	prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
7307	{	7307	{
7308	if (!p->se.on_rq)	7308	if (!p->se.on_rq)
7309	return;	7309	return;
7310		7310
7311	/*	7311	/*
7312	* Reschedule if we are currently running on this runqueue and	7312	* Reschedule if we are currently running on this runqueue and
7313	* our priority decreased, or if we are not currently running on	7313	* our priority decreased, or if we are not currently running on
7314	* this runqueue and our priority is higher than the current's	7314	* this runqueue and our priority is higher than the current's
7315	*/	7315	*/
7316	if (rq->curr == p) {	7316	if (rq->curr == p) {
7317	if (p->prio > oldprio)	7317	if (p->prio > oldprio)
7318	resched_task(rq->curr);	7318	resched_task(rq->curr);
7319	} else	7319	} else
7320	check_preempt_curr(rq, p, 0);	7320	check_preempt_curr(rq, p, 0);
7321	}	7321	}
7322		7322
7323	static void switched_from_fair(struct rq *rq, struct task_struct *p)	7323	static void switched_from_fair(struct rq *rq, struct task_struct *p)
7324	{	7324	{
7325	struct sched_entity *se = &p->se;	7325	struct sched_entity *se = &p->se;
7326	struct cfs_rq *cfs_rq = cfs_rq_of(se);	7326	struct cfs_rq *cfs_rq = cfs_rq_of(se);
7327		7327
7328	/*	7328	/*
7329	* Ensure the task's vruntime is normalized, so that when it's	7329	* Ensure the task's vruntime is normalized, so that when it's
7330	* switched back to the fair class the enqueue_entity(.flags=0) will	7330	* switched back to the fair class the enqueue_entity(.flags=0) will
7331	* do the right thing.	7331	* do the right thing.
7332	*	7332	*
7333	* If it's on_rq, then the dequeue_entity(.flags=0) will already	7333	* If it's on_rq, then the dequeue_entity(.flags=0) will already
7334	* have normalized the vruntime, if it's !on_rq, then only when	7334	* have normalized the vruntime, if it's !on_rq, then only when
7335	* the task is sleeping will it still have non-normalized vruntime.	7335	* the task is sleeping will it still have non-normalized vruntime.
7336	*/	7336	*/
7337	if (!p->on_rq && p->state != TASK_RUNNING) {	7337	if (!p->on_rq && p->state != TASK_RUNNING) {
7338	/*	7338	/*
7339	* Fix up our vruntime so that the current sleep doesn't	7339	* Fix up our vruntime so that the current sleep doesn't
7340	* cause 'unlimited' sleep bonus.	7340	* cause 'unlimited' sleep bonus.
7341	*/	7341	*/
7342	place_entity(cfs_rq, se, 0);	7342	place_entity(cfs_rq, se, 0);
7343	se->vruntime -= cfs_rq->min_vruntime;	7343	se->vruntime -= cfs_rq->min_vruntime;
7344	}	7344	}
7345		7345
7346	#ifdef CONFIG_SMP	7346	#ifdef CONFIG_SMP
7347	/*	7347	/*
7348	* Remove our load from contribution when we leave sched_fair	7348	* Remove our load from contribution when we leave sched_fair
7349	* and ensure we don't carry in an old decay_count if we	7349	* and ensure we don't carry in an old decay_count if we
7350	* switch back.	7350	* switch back.
7351	*/	7351	*/
7352	if (se->avg.decay_count) {	7352	if (se->avg.decay_count) {
7353	__synchronize_entity_decay(se);	7353	__synchronize_entity_decay(se);
7354	subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);	7354	subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
7355	}	7355	}
7356	#endif	7356	#endif
7357	}	7357	}
7358		7358
7359	/*	7359	/*
7360	* We switched to the sched_fair class.	7360	* We switched to the sched_fair class.
7361	*/	7361	*/
7362	static void switched_to_fair(struct rq *rq, struct task_struct *p)	7362	static void switched_to_fair(struct rq *rq, struct task_struct *p)
7363	{	7363	{
7364	struct sched_entity *se = &p->se;	7364	struct sched_entity *se = &p->se;
7365	#ifdef CONFIG_FAIR_GROUP_SCHED	7365	#ifdef CONFIG_FAIR_GROUP_SCHED
7366	/*	7366	/*
7367	* Since the real-depth could have been changed (only FAIR	7367	* Since the real-depth could have been changed (only FAIR
7368	* class maintain depth value), reset depth properly.	7368	* class maintain depth value), reset depth properly.
7369	*/	7369	*/
7370	se->depth = se->parent ? se->parent->depth + 1 : 0;	7370	se->depth = se->parent ? se->parent->depth + 1 : 0;
7371	#endif	7371	#endif
7372	if (!se->on_rq)	7372	if (!se->on_rq)
7373	return;	7373	return;
7374		7374
7375	/*	7375	/*
7376	* We were most likely switched from sched_rt, so	7376	* We were most likely switched from sched_rt, so
7377	* kick off the schedule if running, otherwise just see	7377	* kick off the schedule if running, otherwise just see
7378	* if we can still preempt the current task.	7378	* if we can still preempt the current task.
7379	*/	7379	*/
7380	if (rq->curr == p)	7380	if (rq->curr == p)
7381	resched_task(rq->curr);	7381	resched_task(rq->curr);
7382	else	7382	else
7383	check_preempt_curr(rq, p, 0);	7383	check_preempt_curr(rq, p, 0);
7384	}	7384	}
7385		7385
7386	/* Account for a task changing its policy or group.	7386	/* Account for a task changing its policy or group.
7387	*	7387	*
7388	* This routine is mostly called to set cfs_rq->curr field when a task	7388	* This routine is mostly called to set cfs_rq->curr field when a task
7389	* migrates between groups/classes.	7389	* migrates between groups/classes.
7390	*/	7390	*/
7391	static void set_curr_task_fair(struct rq *rq)	7391	static void set_curr_task_fair(struct rq *rq)
7392	{	7392	{
7393	struct sched_entity *se = &rq->curr->se;	7393	struct sched_entity *se = &rq->curr->se;
7394		7394
7395	for_each_sched_entity(se) {	7395	for_each_sched_entity(se) {
7396	struct cfs_rq *cfs_rq = cfs_rq_of(se);	7396	struct cfs_rq *cfs_rq = cfs_rq_of(se);
7397		7397
7398	set_next_entity(cfs_rq, se);	7398	set_next_entity(cfs_rq, se);
7399	/* ensure bandwidth has been allocated on our new cfs_rq */	7399	/* ensure bandwidth has been allocated on our new cfs_rq */
7400	account_cfs_rq_runtime(cfs_rq, 0);	7400	account_cfs_rq_runtime(cfs_rq, 0);
7401	}	7401	}
7402	}	7402	}
7403		7403
7404	void init_cfs_rq(struct cfs_rq *cfs_rq)	7404	void init_cfs_rq(struct cfs_rq *cfs_rq)
7405	{	7405	{
7406	cfs_rq->tasks_timeline = RB_ROOT;	7406	cfs_rq->tasks_timeline = RB_ROOT;
7407	cfs_rq->min_vruntime = (u64)(-(1LL << 20));	7407	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7408	#ifndef CONFIG_64BIT	7408	#ifndef CONFIG_64BIT
7409	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;	7409	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
7410	#endif	7410	#endif
7411	#ifdef CONFIG_SMP	7411	#ifdef CONFIG_SMP
7412	atomic64_set(&cfs_rq->decay_counter, 1);	7412	atomic64_set(&cfs_rq->decay_counter, 1);
7413	atomic_long_set(&cfs_rq->removed_load, 0);	7413	atomic_long_set(&cfs_rq->removed_load, 0);
7414	#endif	7414	#endif
7415	}	7415	}
7416		7416
7417	#ifdef CONFIG_FAIR_GROUP_SCHED	7417	#ifdef CONFIG_FAIR_GROUP_SCHED
7418	static void task_move_group_fair(struct task_struct *p, int on_rq)	7418	static void task_move_group_fair(struct task_struct *p, int on_rq)
7419	{	7419	{
7420	struct sched_entity *se = &p->se;	7420	struct sched_entity *se = &p->se;
7421	struct cfs_rq *cfs_rq;	7421	struct cfs_rq *cfs_rq;
7422		7422
7423	/*	7423	/*
7424	* If the task was not on the rq at the time of this cgroup movement	7424	* If the task was not on the rq at the time of this cgroup movement
7425	* it must have been asleep, sleeping tasks keep their ->vruntime	7425	* it must have been asleep, sleeping tasks keep their ->vruntime
7426	* absolute on their old rq until wakeup (needed for the fair sleeper	7426	* absolute on their old rq until wakeup (needed for the fair sleeper
7427	* bonus in place_entity()).	7427	* bonus in place_entity()).
7428	*	7428	*
7429	* If it was on the rq, we've just 'preempted' it, which does convert	7429	* If it was on the rq, we've just 'preempted' it, which does convert
7430	* ->vruntime to a relative base.	7430	* ->vruntime to a relative base.
7431	*	7431	*
7432	* Make sure both cases convert their relative position when migrating	7432	* Make sure both cases convert their relative position when migrating
7433	* to another cgroup's rq. This does somewhat interfere with the	7433	* to another cgroup's rq. This does somewhat interfere with the
7434	* fair sleeper stuff for the first placement, but who cares.	7434	* fair sleeper stuff for the first placement, but who cares.
7435	*/	7435	*/
7436	/*	7436	/*
7437	* When !on_rq, vruntime of the task has usually NOT been normalized.	7437	* When !on_rq, vruntime of the task has usually NOT been normalized.
7438	* But there are some cases where it has already been normalized:	7438	* But there are some cases where it has already been normalized:
7439	*	7439	*
7440	* - Moving a forked child which is waiting for being woken up by	7440	* - Moving a forked child which is waiting for being woken up by
7441	* wake_up_new_task().	7441	* wake_up_new_task().
7442	* - Moving a task which has been woken up by try_to_wake_up() and	7442	* - Moving a task which has been woken up by try_to_wake_up() and
7443	* waiting for actually being woken up by sched_ttwu_pending().	7443	* waiting for actually being woken up by sched_ttwu_pending().
7444	*	7444	*
7445	* To prevent boost or penalty in the new cfs_rq caused by delta	7445	* To prevent boost or penalty in the new cfs_rq caused by delta
7446	* min_vruntime between the two cfs_rqs, we skip vruntime adjustment.	7446	* min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
7447	*/	7447	*/
7448	if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))	7448	if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
7449	on_rq = 1;	7449	on_rq = 1;
7450		7450
7451	if (!on_rq)	7451	if (!on_rq)
7452	se->vruntime -= cfs_rq_of(se)->min_vruntime;	7452	se->vruntime -= cfs_rq_of(se)->min_vruntime;
7453	set_task_rq(p, task_cpu(p));	7453	set_task_rq(p, task_cpu(p));
7454	se->depth = se->parent ? se->parent->depth + 1 : 0;	7454	se->depth = se->parent ? se->parent->depth + 1 : 0;
7455	if (!on_rq) {	7455	if (!on_rq) {
7456	cfs_rq = cfs_rq_of(se);	7456	cfs_rq = cfs_rq_of(se);
7457	se->vruntime += cfs_rq->min_vruntime;	7457	se->vruntime += cfs_rq->min_vruntime;
7458	#ifdef CONFIG_SMP	7458	#ifdef CONFIG_SMP
7459	/*	7459	/*
7460	* migrate_task_rq_fair() will have removed our previous	7460	* migrate_task_rq_fair() will have removed our previous
7461	* contribution, but we must synchronize for ongoing future	7461	* contribution, but we must synchronize for ongoing future
7462	* decay.	7462	* decay.
7463	*/	7463	*/
7464	se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);	7464	se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
7465	cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;	7465	cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
7466	#endif	7466	#endif
7467	}	7467	}
7468	}	7468	}
7469		7469
7470	void free_fair_sched_group(struct task_group *tg)	7470	void free_fair_sched_group(struct task_group *tg)
7471	{	7471	{
7472	int i;	7472	int i;
7473		7473
7474	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));	7474	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
7475		7475
7476	for_each_possible_cpu(i) {	7476	for_each_possible_cpu(i) {
7477	if (tg->cfs_rq)	7477	if (tg->cfs_rq)
7478	kfree(tg->cfs_rq[i]);	7478	kfree(tg->cfs_rq[i]);
7479	if (tg->se)	7479	if (tg->se)
7480	kfree(tg->se[i]);	7480	kfree(tg->se[i]);
7481	}	7481	}
7482		7482
7483	kfree(tg->cfs_rq);	7483	kfree(tg->cfs_rq);
7484	kfree(tg->se);	7484	kfree(tg->se);
7485	}	7485	}
7486		7486
7487	int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)	7487	int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
7488	{	7488	{
7489	struct cfs_rq *cfs_rq;	7489	struct cfs_rq *cfs_rq;
7490	struct sched_entity *se;	7490	struct sched_entity *se;
7491	int i;	7491	int i;
7492		7492
7493	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);	7493	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
7494	if (!tg->cfs_rq)	7494	if (!tg->cfs_rq)
7495	goto err;	7495	goto err;
7496	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);	7496	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
7497	if (!tg->se)	7497	if (!tg->se)
7498	goto err;	7498	goto err;
7499		7499
7500	tg->shares = NICE_0_LOAD;	7500	tg->shares = NICE_0_LOAD;
7501		7501
7502	init_cfs_bandwidth(tg_cfs_bandwidth(tg));	7502	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
7503		7503
7504	for_each_possible_cpu(i) {	7504	for_each_possible_cpu(i) {
7505	cfs_rq = kzalloc_node(sizeof(struct cfs_rq),	7505	cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
7506	GFP_KERNEL, cpu_to_node(i));	7506	GFP_KERNEL, cpu_to_node(i));
7507	if (!cfs_rq)	7507	if (!cfs_rq)
7508	goto err;	7508	goto err;
7509		7509
7510	se = kzalloc_node(sizeof(struct sched_entity),	7510	se = kzalloc_node(sizeof(struct sched_entity),
7511	GFP_KERNEL, cpu_to_node(i));	7511	GFP_KERNEL, cpu_to_node(i));
7512	if (!se)	7512	if (!se)
7513	goto err_free_rq;	7513	goto err_free_rq;
7514		7514
7515	init_cfs_rq(cfs_rq);	7515	init_cfs_rq(cfs_rq);
7516	init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);	7516	init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
7517	}	7517	}
7518		7518
7519	return 1;	7519	return 1;
7520		7520
7521	err_free_rq:	7521	err_free_rq:
7522	kfree(cfs_rq);	7522	kfree(cfs_rq);
7523	err:	7523	err:
7524	return 0;	7524	return 0;
7525	}	7525	}
7526		7526
7527	void unregister_fair_sched_group(struct task_group *tg, int cpu)	7527	void unregister_fair_sched_group(struct task_group *tg, int cpu)
7528	{	7528	{
7529	struct rq *rq = cpu_rq(cpu);	7529	struct rq *rq = cpu_rq(cpu);
7530	unsigned long flags;	7530	unsigned long flags;
7531		7531
7532	/*	7532	/*
7533	* Only empty task groups can be destroyed; so we can speculatively	7533	* Only empty task groups can be destroyed; so we can speculatively
7534	* check on_list without danger of it being re-added.	7534	* check on_list without danger of it being re-added.
7535	*/	7535	*/
7536	if (!tg->cfs_rq[cpu]->on_list)	7536	if (!tg->cfs_rq[cpu]->on_list)
7537	return;	7537	return;
7538		7538
7539	raw_spin_lock_irqsave(&rq->lock, flags);	7539	raw_spin_lock_irqsave(&rq->lock, flags);
7540	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);	7540	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
7541	raw_spin_unlock_irqrestore(&rq->lock, flags);	7541	raw_spin_unlock_irqrestore(&rq->lock, flags);
7542	}	7542	}
7543		7543
7544	void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,	7544	void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7545	struct sched_entity *se, int cpu,	7545	struct sched_entity *se, int cpu,
7546	struct sched_entity *parent)	7546	struct sched_entity *parent)
7547	{	7547	{
7548	struct rq *rq = cpu_rq(cpu);	7548	struct rq *rq = cpu_rq(cpu);
7549		7549
7550	cfs_rq->tg = tg;	7550	cfs_rq->tg = tg;
7551	cfs_rq->rq = rq;	7551	cfs_rq->rq = rq;
7552	init_cfs_rq_runtime(cfs_rq);	7552	init_cfs_rq_runtime(cfs_rq);
7553		7553
7554	tg->cfs_rq[cpu] = cfs_rq;	7554	tg->cfs_rq[cpu] = cfs_rq;
7555	tg->se[cpu] = se;	7555	tg->se[cpu] = se;
7556		7556
7557	/* se could be NULL for root_task_group */	7557	/* se could be NULL for root_task_group */
7558	if (!se)	7558	if (!se)
7559	return;	7559	return;
7560		7560
7561	if (!parent) {	7561	if (!parent) {
7562	se->cfs_rq = &rq->cfs;	7562	se->cfs_rq = &rq->cfs;
7563	se->depth = 0;	7563	se->depth = 0;
7564	} else {	7564	} else {
7565	se->cfs_rq = parent->my_q;	7565	se->cfs_rq = parent->my_q;
7566	se->depth = parent->depth + 1;	7566	se->depth = parent->depth + 1;
7567	}	7567	}
7568		7568
7569	se->my_q = cfs_rq;	7569	se->my_q = cfs_rq;
7570	/* guarantee group entities always have weight */	7570	/* guarantee group entities always have weight */
7571	update_load_set(&se->load, NICE_0_LOAD);	7571	update_load_set(&se->load, NICE_0_LOAD);
7572	se->parent = parent;	7572	se->parent = parent;
7573	}	7573	}
7574		7574
7575	static DEFINE_MUTEX(shares_mutex);	7575	static DEFINE_MUTEX(shares_mutex);
7576		7576
7577	int sched_group_set_shares(struct task_group *tg, unsigned long shares)	7577	int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7578	{	7578	{
7579	int i;	7579	int i;
7580	unsigned long flags;	7580	unsigned long flags;
7581		7581
7582	/*	7582	/*
7583	* We can't change the weight of the root cgroup.	7583	* We can't change the weight of the root cgroup.
7584	*/	7584	*/
7585	if (!tg->se[0])	7585	if (!tg->se[0])
7586	return -EINVAL;	7586	return -EINVAL;
7587		7587
7588	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));	7588	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
7589		7589
7590	mutex_lock(&shares_mutex);	7590	mutex_lock(&shares_mutex);
7591	if (tg->shares == shares)	7591	if (tg->shares == shares)
7592	goto done;	7592	goto done;
7593		7593
7594	tg->shares = shares;	7594	tg->shares = shares;
7595	for_each_possible_cpu(i) {	7595	for_each_possible_cpu(i) {
7596	struct rq *rq = cpu_rq(i);	7596	struct rq *rq = cpu_rq(i);
7597	struct sched_entity *se;	7597	struct sched_entity *se;
7598		7598
7599	se = tg->se[i];	7599	se = tg->se[i];
7600	/* Propagate contribution to hierarchy */	7600	/* Propagate contribution to hierarchy */
7601	raw_spin_lock_irqsave(&rq->lock, flags);	7601	raw_spin_lock_irqsave(&rq->lock, flags);
7602		7602
7603	/* Possible calls to update_curr() need rq clock */	7603	/* Possible calls to update_curr() need rq clock */
7604	update_rq_clock(rq);	7604	update_rq_clock(rq);
7605	for_each_sched_entity(se)	7605	for_each_sched_entity(se)
7606	update_cfs_shares(group_cfs_rq(se));	7606	update_cfs_shares(group_cfs_rq(se));
7607	raw_spin_unlock_irqrestore(&rq->lock, flags);	7607	raw_spin_unlock_irqrestore(&rq->lock, flags);
7608	}	7608	}
7609		7609
7610	done:	7610	done:
7611	mutex_unlock(&shares_mutex);	7611	mutex_unlock(&shares_mutex);
7612	return 0;	7612	return 0;
7613	}	7613	}
7614	#else /* CONFIG_FAIR_GROUP_SCHED */	7614	#else /* CONFIG_FAIR_GROUP_SCHED */
7615		7615
7616	void free_fair_sched_group(struct task_group *tg) { }	7616	void free_fair_sched_group(struct task_group *tg) { }
7617		7617
7618	int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)	7618	int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
7619	{	7619	{
7620	return 1;	7620	return 1;
7621	}	7621	}
7622		7622
7623	void unregister_fair_sched_group(struct task_group *tg, int cpu) { }	7623	void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
7624		7624
7625	#endif /* CONFIG_FAIR_GROUP_SCHED */	7625	#endif /* CONFIG_FAIR_GROUP_SCHED */
7626		7626
7627		7627
7628	static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)	7628	static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
7629	{	7629	{
7630	struct sched_entity *se = &task->se;	7630	struct sched_entity *se = &task->se;
7631	unsigned int rr_interval = 0;	7631	unsigned int rr_interval = 0;
7632		7632
7633	/*	7633	/*
7634	* Time slice is 0 for SCHED_OTHER tasks that are on an otherwise	7634	* Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
7635	* idle runqueue:	7635	* idle runqueue:
7636	*/	7636	*/
7637	if (rq->cfs.load.weight)	7637	if (rq->cfs.load.weight)
7638	rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));	7638	rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
7639		7639
7640	return rr_interval;	7640	return rr_interval;
7641	}	7641	}
7642		7642
7643	/*	7643	/*
7644	* All the scheduling class methods:	7644	* All the scheduling class methods:
7645	*/	7645	*/
7646	const struct sched_class fair_sched_class = {	7646	const struct sched_class fair_sched_class = {
7647	.next = &idle_sched_class,	7647	.next = &idle_sched_class,
7648	.enqueue_task = enqueue_task_fair,	7648	.enqueue_task = enqueue_task_fair,
7649	.dequeue_task = dequeue_task_fair,	7649	.dequeue_task = dequeue_task_fair,
7650	.yield_task = yield_task_fair,	7650	.yield_task = yield_task_fair,
7651	.yield_to_task = yield_to_task_fair,	7651	.yield_to_task = yield_to_task_fair,
7652		7652
7653	.check_preempt_curr = check_preempt_wakeup,	7653	.check_preempt_curr = check_preempt_wakeup,
7654		7654
7655	.pick_next_task = pick_next_task_fair,	7655	.pick_next_task = pick_next_task_fair,
7656	.put_prev_task = put_prev_task_fair,	7656	.put_prev_task = put_prev_task_fair,
7657		7657
7658	#ifdef CONFIG_SMP	7658	#ifdef CONFIG_SMP
7659	.select_task_rq = select_task_rq_fair,	7659	.select_task_rq = select_task_rq_fair,
7660	.migrate_task_rq = migrate_task_rq_fair,	7660	.migrate_task_rq = migrate_task_rq_fair,
7661		7661
7662	.rq_online = rq_online_fair,	7662	.rq_online = rq_online_fair,
7663	.rq_offline = rq_offline_fair,	7663	.rq_offline = rq_offline_fair,
7664		7664
7665	.task_waking = task_waking_fair,	7665	.task_waking = task_waking_fair,
7666	#endif	7666	#endif
7667		7667
7668	.set_curr_task = set_curr_task_fair,	7668	.set_curr_task = set_curr_task_fair,
7669	.task_tick = task_tick_fair,	7669	.task_tick = task_tick_fair,
7670	.task_fork = task_fork_fair,	7670	.task_fork = task_fork_fair,
7671		7671
7672	.prio_changed = prio_changed_fair,	7672	.prio_changed = prio_changed_fair,
7673	.switched_from = switched_from_fair,	7673	.switched_from = switched_from_fair,
7674	.switched_to = switched_to_fair,	7674	.switched_to = switched_to_fair,
7675		7675
7676	.get_rr_interval = get_rr_interval_fair,	7676	.get_rr_interval = get_rr_interval_fair,
7677		7677
7678	#ifdef CONFIG_FAIR_GROUP_SCHED	7678	#ifdef CONFIG_FAIR_GROUP_SCHED
7679	.task_move_group = task_move_group_fair,	7679	.task_move_group = task_move_group_fair,
7680	#endif	7680	#endif
7681	};	7681	};
7682		7682
7683	#ifdef CONFIG_SCHED_DEBUG	7683	#ifdef CONFIG_SCHED_DEBUG
7684	void print_cfs_stats(struct seq_file *m, int cpu)	7684	void print_cfs_stats(struct seq_file *m, int cpu)
7685	{	7685	{
7686	struct cfs_rq *cfs_rq;	7686	struct cfs_rq *cfs_rq;
7687		7687
7688	rcu_read_lock();	7688	rcu_read_lock();
7689	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)	7689	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
7690	print_cfs_rq(m, cpu, cfs_rq);	7690	print_cfs_rq(m, cpu, cfs_rq);
7691	rcu_read_unlock();	7691	rcu_read_unlock();
7692	}	7692	}
7693	#endif	7693	#endif
7694		7694
7695	__init void init_sched_fair_class(void)	7695	__init void init_sched_fair_class(void)
7696	{	7696	{
7697	#ifdef CONFIG_SMP	7697	#ifdef CONFIG_SMP
7698	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);	7698	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
7699		7699
7700	#ifdef CONFIG_NO_HZ_COMMON	7700	#ifdef CONFIG_NO_HZ_COMMON
7701	nohz.next_balance = jiffies;	7701	nohz.next_balance = jiffies;
7702	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);	7702	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
7703	cpu_notifier(sched_ilb_notifier, 0);	7703	cpu_notifier(sched_ilb_notifier, 0);
7704	#endif	7704	#endif
7705	#endif /* SMP */	7705	#endif /* SMP */
7706		7706
7707	}	7707	}
7708		7708

kernel/sched/sched.h

Diff comments View file @ 09dc4ab

1		1
2	#include <linux/sched.h>	2	#include <linux/sched.h>
3	#include <linux/sched/sysctl.h>	3	#include <linux/sched/sysctl.h>
4	#include <linux/sched/rt.h>	4	#include <linux/sched/rt.h>
5	#include <linux/sched/deadline.h>	5	#include <linux/sched/deadline.h>
6	#include <linux/mutex.h>	6	#include <linux/mutex.h>
7	#include <linux/spinlock.h>	7	#include <linux/spinlock.h>
8	#include <linux/stop_machine.h>	8	#include <linux/stop_machine.h>
9	#include <linux/tick.h>	9	#include <linux/tick.h>
10	#include <linux/slab.h>	10	#include <linux/slab.h>
11		11
12	#include "cpupri.h"	12	#include "cpupri.h"
13	#include "cpudeadline.h"	13	#include "cpudeadline.h"
14	#include "cpuacct.h"	14	#include "cpuacct.h"
15		15
16	struct rq;	16	struct rq;
17		17
18	extern __read_mostly int scheduler_running;	18	extern __read_mostly int scheduler_running;
19		19
20	extern unsigned long calc_load_update;	20	extern unsigned long calc_load_update;
21	extern atomic_long_t calc_load_tasks;	21	extern atomic_long_t calc_load_tasks;
22		22
23	extern long calc_load_fold_active(struct rq *this_rq);	23	extern long calc_load_fold_active(struct rq *this_rq);
24	extern void update_cpu_load_active(struct rq *this_rq);	24	extern void update_cpu_load_active(struct rq *this_rq);
25		25
26	/*	26	/*
27	* Helpers for converting nanosecond timing to jiffy resolution	27	* Helpers for converting nanosecond timing to jiffy resolution
28	*/	28	*/
29	#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))	29	#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
30		30
31	/*	31	/*
32	* Increase resolution of nice-level calculations for 64-bit architectures.	32	* Increase resolution of nice-level calculations for 64-bit architectures.
33	* The extra resolution improves shares distribution and load balancing of	33	* The extra resolution improves shares distribution and load balancing of
34	* low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup	34	* low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup
35	* hierarchies, especially on larger systems. This is not a user-visible change	35	* hierarchies, especially on larger systems. This is not a user-visible change
36	* and does not change the user-interface for setting shares/weights.	36	* and does not change the user-interface for setting shares/weights.
37	*	37	*
38	* We increase resolution only if we have enough bits to allow this increased	38	* We increase resolution only if we have enough bits to allow this increased
39	* resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution	39	* resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution
40	* when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the	40	* when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
41	* increased costs.	41	* increased costs.
42	*/	42	*/
43	#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */	43	#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */
44	# define SCHED_LOAD_RESOLUTION 10	44	# define SCHED_LOAD_RESOLUTION 10
45	# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION)	45	# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION)
46	# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION)	46	# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION)
47	#else	47	#else
48	# define SCHED_LOAD_RESOLUTION 0	48	# define SCHED_LOAD_RESOLUTION 0
49	# define scale_load(w) (w)	49	# define scale_load(w) (w)
50	# define scale_load_down(w) (w)	50	# define scale_load_down(w) (w)
51	#endif	51	#endif
52		52
53	#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION)	53	#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION)
54	#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)	54	#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
55		55
56	#define NICE_0_LOAD SCHED_LOAD_SCALE	56	#define NICE_0_LOAD SCHED_LOAD_SCALE
57	#define NICE_0_SHIFT SCHED_LOAD_SHIFT	57	#define NICE_0_SHIFT SCHED_LOAD_SHIFT
58		58
59	/*	59	/*
60	* Single value that decides SCHED_DEADLINE internal math precision.	60	* Single value that decides SCHED_DEADLINE internal math precision.
61	* 10 -> just above 1us	61	* 10 -> just above 1us
62	* 9 -> just above 0.5us	62	* 9 -> just above 0.5us
63	*/	63	*/
64	#define DL_SCALE (10)	64	#define DL_SCALE (10)
65		65
66	/*	66	/*
67	* These are the 'tuning knobs' of the scheduler:	67	* These are the 'tuning knobs' of the scheduler:
68	*/	68	*/
69		69
70	/*	70	/*
71	* single value that denotes runtime == period, ie unlimited time.	71	* single value that denotes runtime == period, ie unlimited time.
72	*/	72	*/
73	#define RUNTIME_INF ((u64)~0ULL)	73	#define RUNTIME_INF ((u64)~0ULL)
74		74
75	static inline int fair_policy(int policy)	75	static inline int fair_policy(int policy)
76	{	76	{
77	return policy == SCHED_NORMAL || policy == SCHED_BATCH;	77	return policy == SCHED_NORMAL || policy == SCHED_BATCH;
78	}	78	}
79		79
80	static inline int rt_policy(int policy)	80	static inline int rt_policy(int policy)
81	{	81	{
82	return policy == SCHED_FIFO || policy == SCHED_RR;	82	return policy == SCHED_FIFO || policy == SCHED_RR;
83	}	83	}
84		84
85	static inline int dl_policy(int policy)	85	static inline int dl_policy(int policy)
86	{	86	{
87	return policy == SCHED_DEADLINE;	87	return policy == SCHED_DEADLINE;
88	}	88	}
89		89
90	static inline int task_has_rt_policy(struct task_struct *p)	90	static inline int task_has_rt_policy(struct task_struct *p)
91	{	91	{
92	return rt_policy(p->policy);	92	return rt_policy(p->policy);
93	}	93	}
94		94
95	static inline int task_has_dl_policy(struct task_struct *p)	95	static inline int task_has_dl_policy(struct task_struct *p)
96	{	96	{
97	return dl_policy(p->policy);	97	return dl_policy(p->policy);
98	}	98	}
99		99
100	static inline bool dl_time_before(u64 a, u64 b)	100	static inline bool dl_time_before(u64 a, u64 b)
101	{	101	{
102	return (s64)(a - b) < 0;	102	return (s64)(a - b) < 0;
103	}	103	}
104		104
105	/*	105	/*
106	* Tells if entity @a should preempt entity @b.	106	* Tells if entity @a should preempt entity @b.
107	*/	107	*/
108	static inline bool	108	static inline bool
109	dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)	109	dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
110	{	110	{
111	return dl_time_before(a->deadline, b->deadline);	111	return dl_time_before(a->deadline, b->deadline);
112	}	112	}
113		113
114	/*	114	/*
115	* This is the priority-queue data structure of the RT scheduling class:	115	* This is the priority-queue data structure of the RT scheduling class:
116	*/	116	*/
117	struct rt_prio_array {	117	struct rt_prio_array {
118	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */	118	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
119	struct list_head queue[MAX_RT_PRIO];	119	struct list_head queue[MAX_RT_PRIO];
120	};	120	};
121		121
122	struct rt_bandwidth {	122	struct rt_bandwidth {
123	/* nests inside the rq lock: */	123	/* nests inside the rq lock: */
124	raw_spinlock_t rt_runtime_lock;	124	raw_spinlock_t rt_runtime_lock;
125	ktime_t rt_period;	125	ktime_t rt_period;
126	u64 rt_runtime;	126	u64 rt_runtime;
127	struct hrtimer rt_period_timer;	127	struct hrtimer rt_period_timer;
128	};	128	};
129	/*	129	/*
130	* To keep the bandwidth of -deadline tasks and groups under control	130	* To keep the bandwidth of -deadline tasks and groups under control
131	* we need some place where:	131	* we need some place where:
132	* - store the maximum -deadline bandwidth of the system (the group);	132	* - store the maximum -deadline bandwidth of the system (the group);
133	* - cache the fraction of that bandwidth that is currently allocated.	133	* - cache the fraction of that bandwidth that is currently allocated.
134	*	134	*
135	* This is all done in the data structure below. It is similar to the	135	* This is all done in the data structure below. It is similar to the
136	* one used for RT-throttling (rt_bandwidth), with the main difference	136	* one used for RT-throttling (rt_bandwidth), with the main difference
137	* that, since here we are only interested in admission control, we	137	* that, since here we are only interested in admission control, we
138	* do not decrease any runtime while the group "executes", neither we	138	* do not decrease any runtime while the group "executes", neither we
139	* need a timer to replenish it.	139	* need a timer to replenish it.
140	*	140	*
141	* With respect to SMP, the bandwidth is given on a per-CPU basis,	141	* With respect to SMP, the bandwidth is given on a per-CPU basis,
142	* meaning that:	142	* meaning that:
143	* - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;	143	* - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
144	* - dl_total_bw array contains, in the i-eth element, the currently	144	* - dl_total_bw array contains, in the i-eth element, the currently
145	* allocated bandwidth on the i-eth CPU.	145	* allocated bandwidth on the i-eth CPU.
146	* Moreover, groups consume bandwidth on each CPU, while tasks only	146	* Moreover, groups consume bandwidth on each CPU, while tasks only
147	* consume bandwidth on the CPU they're running on.	147	* consume bandwidth on the CPU they're running on.
148	* Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw	148	* Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
149	* that will be shown the next time the proc or cgroup controls will	149	* that will be shown the next time the proc or cgroup controls will
150	* be red. It on its turn can be changed by writing on its own	150	* be red. It on its turn can be changed by writing on its own
151	* control.	151	* control.
152	*/	152	*/
153	struct dl_bandwidth {	153	struct dl_bandwidth {
154	raw_spinlock_t dl_runtime_lock;	154	raw_spinlock_t dl_runtime_lock;
155	u64 dl_runtime;	155	u64 dl_runtime;
156	u64 dl_period;	156	u64 dl_period;
157	};	157	};
158		158
159	static inline int dl_bandwidth_enabled(void)	159	static inline int dl_bandwidth_enabled(void)
160	{	160	{
161	return sysctl_sched_rt_runtime >= 0;	161	return sysctl_sched_rt_runtime >= 0;
162	}	162	}
163		163
164	extern struct dl_bw *dl_bw_of(int i);	164	extern struct dl_bw *dl_bw_of(int i);
165		165
166	struct dl_bw {	166	struct dl_bw {
167	raw_spinlock_t lock;	167	raw_spinlock_t lock;
168	u64 bw, total_bw;	168	u64 bw, total_bw;
169	};	169	};
170		170
171	extern struct mutex sched_domains_mutex;	171	extern struct mutex sched_domains_mutex;
172		172
173	#ifdef CONFIG_CGROUP_SCHED	173	#ifdef CONFIG_CGROUP_SCHED
174		174
175	#include <linux/cgroup.h>	175	#include <linux/cgroup.h>
176		176
177	struct cfs_rq;	177	struct cfs_rq;
178	struct rt_rq;	178	struct rt_rq;
179		179
180	extern struct list_head task_groups;	180	extern struct list_head task_groups;
181		181
182	struct cfs_bandwidth {	182	struct cfs_bandwidth {
183	#ifdef CONFIG_CFS_BANDWIDTH	183	#ifdef CONFIG_CFS_BANDWIDTH
184	raw_spinlock_t lock;	184	raw_spinlock_t lock;
185	ktime_t period;	185	ktime_t period;
186	u64 quota, runtime;	186	u64 quota, runtime;
187	s64 hierarchal_quota;	187	s64 hierarchal_quota;
188	u64 runtime_expires;	188	u64 runtime_expires;
189		189
190	int idle, timer_active;	190	int idle, timer_active;
191	struct hrtimer period_timer, slack_timer;	191	struct hrtimer period_timer, slack_timer;
192	struct list_head throttled_cfs_rq;	192	struct list_head throttled_cfs_rq;
193		193
194	/* statistics */	194	/* statistics */
195	int nr_periods, nr_throttled;	195	int nr_periods, nr_throttled;
196	u64 throttled_time;	196	u64 throttled_time;
197	#endif	197	#endif
198	};	198	};
199		199
200	/* task group related information */	200	/* task group related information */
201	struct task_group {	201	struct task_group {
202	struct cgroup_subsys_state css;	202	struct cgroup_subsys_state css;
203		203
204	#ifdef CONFIG_FAIR_GROUP_SCHED	204	#ifdef CONFIG_FAIR_GROUP_SCHED
205	/* schedulable entities of this group on each cpu */	205	/* schedulable entities of this group on each cpu */
206	struct sched_entity **se;	206	struct sched_entity **se;
207	/* runqueue "owned" by this group on each cpu */	207	/* runqueue "owned" by this group on each cpu */
208	struct cfs_rq **cfs_rq;	208	struct cfs_rq **cfs_rq;
209	unsigned long shares;	209	unsigned long shares;
210		210
211	#ifdef CONFIG_SMP	211	#ifdef CONFIG_SMP
212	atomic_long_t load_avg;	212	atomic_long_t load_avg;
213	atomic_t runnable_avg;	213	atomic_t runnable_avg;
214	#endif	214	#endif
215	#endif	215	#endif
216		216
217	#ifdef CONFIG_RT_GROUP_SCHED	217	#ifdef CONFIG_RT_GROUP_SCHED
218	struct sched_rt_entity **rt_se;	218	struct sched_rt_entity **rt_se;
219	struct rt_rq **rt_rq;	219	struct rt_rq **rt_rq;
220		220
221	struct rt_bandwidth rt_bandwidth;	221	struct rt_bandwidth rt_bandwidth;
222	#endif	222	#endif
223		223
224	struct rcu_head rcu;	224	struct rcu_head rcu;
225	struct list_head list;	225	struct list_head list;
226		226
227	struct task_group *parent;	227	struct task_group *parent;
228	struct list_head siblings;	228	struct list_head siblings;
229	struct list_head children;	229	struct list_head children;
230		230
231	#ifdef CONFIG_SCHED_AUTOGROUP	231	#ifdef CONFIG_SCHED_AUTOGROUP
232	struct autogroup *autogroup;	232	struct autogroup *autogroup;
233	#endif	233	#endif
234		234
235	struct cfs_bandwidth cfs_bandwidth;	235	struct cfs_bandwidth cfs_bandwidth;
236	};	236	};
237		237
238	#ifdef CONFIG_FAIR_GROUP_SCHED	238	#ifdef CONFIG_FAIR_GROUP_SCHED
239	#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD	239	#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
240		240
241	/*	241	/*
242	* A weight of 0 or 1 can cause arithmetics problems.	242	* A weight of 0 or 1 can cause arithmetics problems.
243	* A weight of a cfs_rq is the sum of weights of which entities	243	* A weight of a cfs_rq is the sum of weights of which entities
244	* are queued on this cfs_rq, so a weight of a entity should not be	244	* are queued on this cfs_rq, so a weight of a entity should not be
245	* too large, so as the shares value of a task group.	245	* too large, so as the shares value of a task group.
246	* (The default weight is 1024 - so there's no practical	246	* (The default weight is 1024 - so there's no practical
247	* limitation from this.)	247	* limitation from this.)
248	*/	248	*/
249	#define MIN_SHARES (1UL << 1)	249	#define MIN_SHARES (1UL << 1)
250	#define MAX_SHARES (1UL << 18)	250	#define MAX_SHARES (1UL << 18)
251	#endif	251	#endif
252		252
253	typedef int (*tg_visitor)(struct task_group *, void *);	253	typedef int (*tg_visitor)(struct task_group *, void *);
254		254
255	extern int walk_tg_tree_from(struct task_group *from,	255	extern int walk_tg_tree_from(struct task_group *from,
256	tg_visitor down, tg_visitor up, void *data);	256	tg_visitor down, tg_visitor up, void *data);
257		257
258	/*	258	/*
259	* Iterate the full tree, calling @down when first entering a node and @up when	259	* Iterate the full tree, calling @down when first entering a node and @up when
260	* leaving it for the final time.	260	* leaving it for the final time.
261	*	261	*
262	* Caller must hold rcu_lock or sufficient equivalent.	262	* Caller must hold rcu_lock or sufficient equivalent.
263	*/	263	*/
264	static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)	264	static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
265	{	265	{
266	return walk_tg_tree_from(&root_task_group, down, up, data);	266	return walk_tg_tree_from(&root_task_group, down, up, data);
267	}	267	}
268		268
269	extern int tg_nop(struct task_group *tg, void *data);	269	extern int tg_nop(struct task_group *tg, void *data);
270		270
271	extern void free_fair_sched_group(struct task_group *tg);	271	extern void free_fair_sched_group(struct task_group *tg);
272	extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);	272	extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
273	extern void unregister_fair_sched_group(struct task_group *tg, int cpu);	273	extern void unregister_fair_sched_group(struct task_group *tg, int cpu);
274	extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,	274	extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
275	struct sched_entity *se, int cpu,	275	struct sched_entity *se, int cpu,
276	struct sched_entity *parent);	276	struct sched_entity *parent);
277	extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);	277	extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
278	extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);	278	extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
279		279
280	extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);	280	extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
281	extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);	281	extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force);
282	extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);	282	extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
283		283
284	extern void free_rt_sched_group(struct task_group *tg);	284	extern void free_rt_sched_group(struct task_group *tg);
285	extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);	285	extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
286	extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,	286	extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
287	struct sched_rt_entity *rt_se, int cpu,	287	struct sched_rt_entity *rt_se, int cpu,
288	struct sched_rt_entity *parent);	288	struct sched_rt_entity *parent);
289		289
290	extern struct task_group *sched_create_group(struct task_group *parent);	290	extern struct task_group *sched_create_group(struct task_group *parent);
291	extern void sched_online_group(struct task_group *tg,	291	extern void sched_online_group(struct task_group *tg,
292	struct task_group *parent);	292	struct task_group *parent);
293	extern void sched_destroy_group(struct task_group *tg);	293	extern void sched_destroy_group(struct task_group *tg);
294	extern void sched_offline_group(struct task_group *tg);	294	extern void sched_offline_group(struct task_group *tg);
295		295
296	extern void sched_move_task(struct task_struct *tsk);	296	extern void sched_move_task(struct task_struct *tsk);
297		297
298	#ifdef CONFIG_FAIR_GROUP_SCHED	298	#ifdef CONFIG_FAIR_GROUP_SCHED
299	extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);	299	extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
300	#endif	300	#endif
301		301
302	#else /* CONFIG_CGROUP_SCHED */	302	#else /* CONFIG_CGROUP_SCHED */
303		303
304	struct cfs_bandwidth { };	304	struct cfs_bandwidth { };
305		305
306	#endif /* CONFIG_CGROUP_SCHED */	306	#endif /* CONFIG_CGROUP_SCHED */
307		307
308	/* CFS-related fields in a runqueue */	308	/* CFS-related fields in a runqueue */
309	struct cfs_rq {	309	struct cfs_rq {
310	struct load_weight load;	310	struct load_weight load;
311	unsigned int nr_running, h_nr_running;	311	unsigned int nr_running, h_nr_running;
312		312
313	u64 exec_clock;	313	u64 exec_clock;
314	u64 min_vruntime;	314	u64 min_vruntime;
315	#ifndef CONFIG_64BIT	315	#ifndef CONFIG_64BIT
316	u64 min_vruntime_copy;	316	u64 min_vruntime_copy;
317	#endif	317	#endif
318		318
319	struct rb_root tasks_timeline;	319	struct rb_root tasks_timeline;
320	struct rb_node *rb_leftmost;	320	struct rb_node *rb_leftmost;
321		321
322	/*	322	/*
323	* 'curr' points to currently running entity on this cfs_rq.	323	* 'curr' points to currently running entity on this cfs_rq.
324	* It is set to NULL otherwise (i.e when none are currently running).	324	* It is set to NULL otherwise (i.e when none are currently running).
325	*/	325	*/
326	struct sched_entity *curr, *next, *last, *skip;	326	struct sched_entity *curr, *next, *last, *skip;
327		327
328	#ifdef CONFIG_SCHED_DEBUG	328	#ifdef CONFIG_SCHED_DEBUG
329	unsigned int nr_spread_over;	329	unsigned int nr_spread_over;
330	#endif	330	#endif
331		331
332	#ifdef CONFIG_SMP	332	#ifdef CONFIG_SMP
333	/*	333	/*
334	* CFS Load tracking	334	* CFS Load tracking
335	* Under CFS, load is tracked on a per-entity basis and aggregated up.	335	* Under CFS, load is tracked on a per-entity basis and aggregated up.
336	* This allows for the description of both thread and group usage (in	336	* This allows for the description of both thread and group usage (in
337	* the FAIR_GROUP_SCHED case).	337	* the FAIR_GROUP_SCHED case).
338	*/	338	*/
339	unsigned long runnable_load_avg, blocked_load_avg;	339	unsigned long runnable_load_avg, blocked_load_avg;
340	atomic64_t decay_counter;	340	atomic64_t decay_counter;
341	u64 last_decay;	341	u64 last_decay;
342	atomic_long_t removed_load;	342	atomic_long_t removed_load;
343		343
344	#ifdef CONFIG_FAIR_GROUP_SCHED	344	#ifdef CONFIG_FAIR_GROUP_SCHED
345	/* Required to track per-cpu representation of a task_group */	345	/* Required to track per-cpu representation of a task_group */
346	u32 tg_runnable_contrib;	346	u32 tg_runnable_contrib;
347	unsigned long tg_load_contrib;	347	unsigned long tg_load_contrib;
348		348
349	/*	349	/*
350	* h_load = weight * f(tg)	350	* h_load = weight * f(tg)
351	*	351	*
352	* Where f(tg) is the recursive weight fraction assigned to	352	* Where f(tg) is the recursive weight fraction assigned to
353	* this group.	353	* this group.
354	*/	354	*/
355	unsigned long h_load;	355	unsigned long h_load;
356	u64 last_h_load_update;	356	u64 last_h_load_update;
357	struct sched_entity *h_load_next;	357	struct sched_entity *h_load_next;
358	#endif /* CONFIG_FAIR_GROUP_SCHED */	358	#endif /* CONFIG_FAIR_GROUP_SCHED */
359	#endif /* CONFIG_SMP */	359	#endif /* CONFIG_SMP */
360		360
361	#ifdef CONFIG_FAIR_GROUP_SCHED	361	#ifdef CONFIG_FAIR_GROUP_SCHED
362	struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */	362	struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
363		363
364	/*	364	/*
365	* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in	365	* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
366	* a hierarchy). Non-leaf lrqs hold other higher schedulable entities	366	* a hierarchy). Non-leaf lrqs hold other higher schedulable entities
367	* (like users, containers etc.)	367	* (like users, containers etc.)
368	*	368	*
369	* leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This	369	* leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
370	* list is used during load balance.	370	* list is used during load balance.
371	*/	371	*/
372	int on_list;	372	int on_list;
373	struct list_head leaf_cfs_rq_list;	373	struct list_head leaf_cfs_rq_list;
374	struct task_group *tg; /* group that "owns" this runqueue */	374	struct task_group *tg; /* group that "owns" this runqueue */
375		375
376	#ifdef CONFIG_CFS_BANDWIDTH	376	#ifdef CONFIG_CFS_BANDWIDTH
377	int runtime_enabled;	377	int runtime_enabled;
378	u64 runtime_expires;	378	u64 runtime_expires;
379	s64 runtime_remaining;	379	s64 runtime_remaining;
380		380
381	u64 throttled_clock, throttled_clock_task;	381	u64 throttled_clock, throttled_clock_task;
382	u64 throttled_clock_task_time;	382	u64 throttled_clock_task_time;
383	int throttled, throttle_count;	383	int throttled, throttle_count;
384	struct list_head throttled_list;	384	struct list_head throttled_list;
385	#endif /* CONFIG_CFS_BANDWIDTH */	385	#endif /* CONFIG_CFS_BANDWIDTH */
386	#endif /* CONFIG_FAIR_GROUP_SCHED */	386	#endif /* CONFIG_FAIR_GROUP_SCHED */
387	};	387	};
388		388
389	static inline int rt_bandwidth_enabled(void)	389	static inline int rt_bandwidth_enabled(void)
390	{	390	{
391	return sysctl_sched_rt_runtime >= 0;	391	return sysctl_sched_rt_runtime >= 0;
392	}	392	}
393		393
394	/* Real-Time classes' related field in a runqueue: */	394	/* Real-Time classes' related field in a runqueue: */
395	struct rt_rq {	395	struct rt_rq {
396	struct rt_prio_array active;	396	struct rt_prio_array active;
397	unsigned int rt_nr_running;	397	unsigned int rt_nr_running;
398	#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED	398	#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
399	struct {	399	struct {
400	int curr; /* highest queued rt task prio */	400	int curr; /* highest queued rt task prio */
401	#ifdef CONFIG_SMP	401	#ifdef CONFIG_SMP
402	int next; /* next highest */	402	int next; /* next highest */
403	#endif	403	#endif
404	} highest_prio;	404	} highest_prio;
405	#endif	405	#endif
406	#ifdef CONFIG_SMP	406	#ifdef CONFIG_SMP
407	unsigned long rt_nr_migratory;	407	unsigned long rt_nr_migratory;
408	unsigned long rt_nr_total;	408	unsigned long rt_nr_total;
409	int overloaded;	409	int overloaded;
410	struct plist_head pushable_tasks;	410	struct plist_head pushable_tasks;
411	#endif	411	#endif
412	int rt_throttled;	412	int rt_throttled;
413	u64 rt_time;	413	u64 rt_time;
414	u64 rt_runtime;	414	u64 rt_runtime;
415	/* Nests inside the rq lock: */	415	/* Nests inside the rq lock: */
416	raw_spinlock_t rt_runtime_lock;	416	raw_spinlock_t rt_runtime_lock;
417		417
418	#ifdef CONFIG_RT_GROUP_SCHED	418	#ifdef CONFIG_RT_GROUP_SCHED
419	unsigned long rt_nr_boosted;	419	unsigned long rt_nr_boosted;
420		420
421	struct rq *rq;	421	struct rq *rq;
422	struct task_group *tg;	422	struct task_group *tg;
423	#endif	423	#endif
424	};	424	};
425		425
426	#ifdef CONFIG_RT_GROUP_SCHED	426	#ifdef CONFIG_RT_GROUP_SCHED
427	static inline int rt_rq_throttled(struct rt_rq *rt_rq)	427	static inline int rt_rq_throttled(struct rt_rq *rt_rq)
428	{	428	{
429	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;	429	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
430	}	430	}
431	#else	431	#else
432	static inline int rt_rq_throttled(struct rt_rq *rt_rq)	432	static inline int rt_rq_throttled(struct rt_rq *rt_rq)
433	{	433	{
434	return rt_rq->rt_throttled;	434	return rt_rq->rt_throttled;
435	}	435	}
436	#endif	436	#endif
437		437
438	/* Deadline class' related fields in a runqueue */	438	/* Deadline class' related fields in a runqueue */
439	struct dl_rq {	439	struct dl_rq {
440	/* runqueue is an rbtree, ordered by deadline */	440	/* runqueue is an rbtree, ordered by deadline */
441	struct rb_root rb_root;	441	struct rb_root rb_root;
442	struct rb_node *rb_leftmost;	442	struct rb_node *rb_leftmost;
443		443
444	unsigned long dl_nr_running;	444	unsigned long dl_nr_running;
445		445
446	#ifdef CONFIG_SMP	446	#ifdef CONFIG_SMP
447	/*	447	/*
448	* Deadline values of the currently executing and the	448	* Deadline values of the currently executing and the
449	* earliest ready task on this rq. Caching these facilitates	449	* earliest ready task on this rq. Caching these facilitates
450	* the decision wether or not a ready but not running task	450	* the decision wether or not a ready but not running task
451	* should migrate somewhere else.	451	* should migrate somewhere else.
452	*/	452	*/
453	struct {	453	struct {
454	u64 curr;	454	u64 curr;
455	u64 next;	455	u64 next;
456	} earliest_dl;	456	} earliest_dl;
457		457
458	unsigned long dl_nr_migratory;	458	unsigned long dl_nr_migratory;
459	int overloaded;	459	int overloaded;
460		460
461	/*	461	/*
462	* Tasks on this rq that can be pushed away. They are kept in	462	* Tasks on this rq that can be pushed away. They are kept in
463	* an rb-tree, ordered by tasks' deadlines, with caching	463	* an rb-tree, ordered by tasks' deadlines, with caching
464	* of the leftmost (earliest deadline) element.	464	* of the leftmost (earliest deadline) element.
465	*/	465	*/
466	struct rb_root pushable_dl_tasks_root;	466	struct rb_root pushable_dl_tasks_root;
467	struct rb_node *pushable_dl_tasks_leftmost;	467	struct rb_node *pushable_dl_tasks_leftmost;
468	#else	468	#else
469	struct dl_bw dl_bw;	469	struct dl_bw dl_bw;
470	#endif	470	#endif
471	};	471	};
472		472
473	#ifdef CONFIG_SMP	473	#ifdef CONFIG_SMP
474		474
475	/*	475	/*
476	* We add the notion of a root-domain which will be used to define per-domain	476	* We add the notion of a root-domain which will be used to define per-domain
477	* variables. Each exclusive cpuset essentially defines an island domain by	477	* variables. Each exclusive cpuset essentially defines an island domain by
478	* fully partitioning the member cpus from any other cpuset. Whenever a new	478	* fully partitioning the member cpus from any other cpuset. Whenever a new
479	* exclusive cpuset is created, we also create and attach a new root-domain	479	* exclusive cpuset is created, we also create and attach a new root-domain
480	* object.	480	* object.
481	*	481	*
482	*/	482	*/
483	struct root_domain {	483	struct root_domain {
484	atomic_t refcount;	484	atomic_t refcount;
485	atomic_t rto_count;	485	atomic_t rto_count;
486	struct rcu_head rcu;	486	struct rcu_head rcu;
487	cpumask_var_t span;	487	cpumask_var_t span;
488	cpumask_var_t online;	488	cpumask_var_t online;
489		489
490	/*	490	/*
491	* The bit corresponding to a CPU gets set here if such CPU has more	491	* The bit corresponding to a CPU gets set here if such CPU has more
492	* than one runnable -deadline task (as it is below for RT tasks).	492	* than one runnable -deadline task (as it is below for RT tasks).
493	*/	493	*/
494	cpumask_var_t dlo_mask;	494	cpumask_var_t dlo_mask;
495	atomic_t dlo_count;	495	atomic_t dlo_count;
496	struct dl_bw dl_bw;	496	struct dl_bw dl_bw;
497	struct cpudl cpudl;	497	struct cpudl cpudl;
498		498
499	/*	499	/*
500	* The "RT overload" flag: it gets set if a CPU has more than	500	* The "RT overload" flag: it gets set if a CPU has more than
501	* one runnable RT task.	501	* one runnable RT task.
502	*/	502	*/
503	cpumask_var_t rto_mask;	503	cpumask_var_t rto_mask;
504	struct cpupri cpupri;	504	struct cpupri cpupri;
505	};	505	};
506		506
507	extern struct root_domain def_root_domain;	507	extern struct root_domain def_root_domain;
508		508
509	#endif /* CONFIG_SMP */	509	#endif /* CONFIG_SMP */
510		510
511	/*	511	/*
512	* This is the main, per-CPU runqueue data structure.	512	* This is the main, per-CPU runqueue data structure.
513	*	513	*
514	* Locking rule: those places that want to lock multiple runqueues	514	* Locking rule: those places that want to lock multiple runqueues
515	* (such as the load balancing or the thread migration code), lock	515	* (such as the load balancing or the thread migration code), lock
516	* acquire operations must be ordered by ascending &runqueue.	516	* acquire operations must be ordered by ascending &runqueue.
517	*/	517	*/
518	struct rq {	518	struct rq {
519	/* runqueue lock: */	519	/* runqueue lock: */
520	raw_spinlock_t lock;	520	raw_spinlock_t lock;
521		521
522	/*	522	/*
523	* nr_running and cpu_load should be in the same cacheline because	523	* nr_running and cpu_load should be in the same cacheline because
524	* remote CPUs use both these fields when doing load calculation.	524	* remote CPUs use both these fields when doing load calculation.
525	*/	525	*/
526	unsigned int nr_running;	526	unsigned int nr_running;
527	#ifdef CONFIG_NUMA_BALANCING	527	#ifdef CONFIG_NUMA_BALANCING
528	unsigned int nr_numa_running;	528	unsigned int nr_numa_running;
529	unsigned int nr_preferred_running;	529	unsigned int nr_preferred_running;
530	#endif	530	#endif
531	#define CPU_LOAD_IDX_MAX 5	531	#define CPU_LOAD_IDX_MAX 5
532	unsigned long cpu_load[CPU_LOAD_IDX_MAX];	532	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
533	unsigned long last_load_update_tick;	533	unsigned long last_load_update_tick;
534	#ifdef CONFIG_NO_HZ_COMMON	534	#ifdef CONFIG_NO_HZ_COMMON
535	u64 nohz_stamp;	535	u64 nohz_stamp;
536	unsigned long nohz_flags;	536	unsigned long nohz_flags;
537	#endif	537	#endif
538	#ifdef CONFIG_NO_HZ_FULL	538	#ifdef CONFIG_NO_HZ_FULL
539	unsigned long last_sched_tick;	539	unsigned long last_sched_tick;
540	#endif	540	#endif
541	int skip_clock_update;	541	int skip_clock_update;
542		542
543	/* capture load from all tasks on this cpu: */	543	/* capture load from all tasks on this cpu: */
544	struct load_weight load;	544	struct load_weight load;
545	unsigned long nr_load_updates;	545	unsigned long nr_load_updates;
546	u64 nr_switches;	546	u64 nr_switches;
547		547
548	struct cfs_rq cfs;	548	struct cfs_rq cfs;
549	struct rt_rq rt;	549	struct rt_rq rt;
550	struct dl_rq dl;	550	struct dl_rq dl;
551		551
552	#ifdef CONFIG_FAIR_GROUP_SCHED	552	#ifdef CONFIG_FAIR_GROUP_SCHED
553	/* list of leaf cfs_rq on this cpu: */	553	/* list of leaf cfs_rq on this cpu: */
554	struct list_head leaf_cfs_rq_list;	554	struct list_head leaf_cfs_rq_list;
555		555
556	struct sched_avg avg;	556	struct sched_avg avg;
557	#endif /* CONFIG_FAIR_GROUP_SCHED */	557	#endif /* CONFIG_FAIR_GROUP_SCHED */
558		558
559	/*	559	/*
560	* This is part of a global counter where only the total sum	560	* This is part of a global counter where only the total sum
561	* over all CPUs matters. A task can increase this counter on	561	* over all CPUs matters. A task can increase this counter on
562	* one CPU and if it got migrated afterwards it may decrease	562	* one CPU and if it got migrated afterwards it may decrease
563	* it on another CPU. Always updated under the runqueue lock:	563	* it on another CPU. Always updated under the runqueue lock:
564	*/	564	*/
565	unsigned long nr_uninterruptible;	565	unsigned long nr_uninterruptible;
566		566
567	struct task_struct *curr, *idle, *stop;	567	struct task_struct *curr, *idle, *stop;
568	unsigned long next_balance;	568	unsigned long next_balance;
569	struct mm_struct *prev_mm;	569	struct mm_struct *prev_mm;
570		570
571	u64 clock;	571	u64 clock;
572	u64 clock_task;	572	u64 clock_task;
573		573
574	atomic_t nr_iowait;	574	atomic_t nr_iowait;
575		575
576	#ifdef CONFIG_SMP	576	#ifdef CONFIG_SMP
577	struct root_domain *rd;	577	struct root_domain *rd;
578	struct sched_domain *sd;	578	struct sched_domain *sd;
579		579
580	unsigned long cpu_power;	580	unsigned long cpu_power;
581		581
582	unsigned char idle_balance;	582	unsigned char idle_balance;
583	/* For active balancing */	583	/* For active balancing */
584	int post_schedule;	584	int post_schedule;
585	int active_balance;	585	int active_balance;
586	int push_cpu;	586	int push_cpu;
587	struct cpu_stop_work active_balance_work;	587	struct cpu_stop_work active_balance_work;
588	/* cpu of this runqueue: */	588	/* cpu of this runqueue: */
589	int cpu;	589	int cpu;
590	int online;	590	int online;
591		591
592	struct list_head cfs_tasks;	592	struct list_head cfs_tasks;
593		593
594	u64 rt_avg;	594	u64 rt_avg;
595	u64 age_stamp;	595	u64 age_stamp;
596	u64 idle_stamp;	596	u64 idle_stamp;
597	u64 avg_idle;	597	u64 avg_idle;
598		598
599	/* This is used to determine avg_idle's max value */	599	/* This is used to determine avg_idle's max value */
600	u64 max_idle_balance_cost;	600	u64 max_idle_balance_cost;
601	#endif	601	#endif
602		602
603	#ifdef CONFIG_IRQ_TIME_ACCOUNTING	603	#ifdef CONFIG_IRQ_TIME_ACCOUNTING
604	u64 prev_irq_time;	604	u64 prev_irq_time;
605	#endif	605	#endif
606	#ifdef CONFIG_PARAVIRT	606	#ifdef CONFIG_PARAVIRT
607	u64 prev_steal_time;	607	u64 prev_steal_time;
608	#endif	608	#endif
609	#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING	609	#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
610	u64 prev_steal_time_rq;	610	u64 prev_steal_time_rq;
611	#endif	611	#endif
612		612
613	/* calc_load related fields */	613	/* calc_load related fields */
614	unsigned long calc_load_update;	614	unsigned long calc_load_update;
615	long calc_load_active;	615	long calc_load_active;
616		616
617	#ifdef CONFIG_SCHED_HRTICK	617	#ifdef CONFIG_SCHED_HRTICK
618	#ifdef CONFIG_SMP	618	#ifdef CONFIG_SMP
619	int hrtick_csd_pending;	619	int hrtick_csd_pending;
620	struct call_single_data hrtick_csd;	620	struct call_single_data hrtick_csd;
621	#endif	621	#endif
622	struct hrtimer hrtick_timer;	622	struct hrtimer hrtick_timer;
623	#endif	623	#endif
624		624
625	#ifdef CONFIG_SCHEDSTATS	625	#ifdef CONFIG_SCHEDSTATS
626	/* latency stats */	626	/* latency stats */
627	struct sched_info rq_sched_info;	627	struct sched_info rq_sched_info;
628	unsigned long long rq_cpu_time;	628	unsigned long long rq_cpu_time;
629	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */	629	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
630		630
631	/* sys_sched_yield() stats */	631	/* sys_sched_yield() stats */
632	unsigned int yld_count;	632	unsigned int yld_count;
633		633
634	/* schedule() stats */	634	/* schedule() stats */
635	unsigned int sched_count;	635	unsigned int sched_count;
636	unsigned int sched_goidle;	636	unsigned int sched_goidle;
637		637
638	/* try_to_wake_up() stats */	638	/* try_to_wake_up() stats */
639	unsigned int ttwu_count;	639	unsigned int ttwu_count;
640	unsigned int ttwu_local;	640	unsigned int ttwu_local;
641	#endif	641	#endif
642		642
643	#ifdef CONFIG_SMP	643	#ifdef CONFIG_SMP
644	struct llist_head wake_list;	644	struct llist_head wake_list;
645	#endif	645	#endif
646	};	646	};
647		647
648	static inline int cpu_of(struct rq *rq)	648	static inline int cpu_of(struct rq *rq)
649	{	649	{
650	#ifdef CONFIG_SMP	650	#ifdef CONFIG_SMP
651	return rq->cpu;	651	return rq->cpu;
652	#else	652	#else
653	return 0;	653	return 0;
654	#endif	654	#endif
655	}	655	}
656		656
657	DECLARE_PER_CPU(struct rq, runqueues);	657	DECLARE_PER_CPU(struct rq, runqueues);
658		658
659	#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))	659	#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
660	#define this_rq() (&__get_cpu_var(runqueues))	660	#define this_rq() (&__get_cpu_var(runqueues))
661	#define task_rq(p) cpu_rq(task_cpu(p))	661	#define task_rq(p) cpu_rq(task_cpu(p))
662	#define cpu_curr(cpu) (cpu_rq(cpu)->curr)	662	#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
663	#define raw_rq() (&__raw_get_cpu_var(runqueues))	663	#define raw_rq() (&__raw_get_cpu_var(runqueues))
664		664
665	static inline u64 rq_clock(struct rq *rq)	665	static inline u64 rq_clock(struct rq *rq)
666	{	666	{
667	return rq->clock;	667	return rq->clock;
668	}	668	}
669		669
670	static inline u64 rq_clock_task(struct rq *rq)	670	static inline u64 rq_clock_task(struct rq *rq)
671	{	671	{
672	return rq->clock_task;	672	return rq->clock_task;
673	}	673	}
674		674
675	#ifdef CONFIG_NUMA_BALANCING	675	#ifdef CONFIG_NUMA_BALANCING
676	extern void sched_setnuma(struct task_struct *p, int node);	676	extern void sched_setnuma(struct task_struct *p, int node);
677	extern int migrate_task_to(struct task_struct *p, int cpu);	677	extern int migrate_task_to(struct task_struct *p, int cpu);
678	extern int migrate_swap(struct task_struct *, struct task_struct *);	678	extern int migrate_swap(struct task_struct *, struct task_struct *);
679	#endif /* CONFIG_NUMA_BALANCING */	679	#endif /* CONFIG_NUMA_BALANCING */
680		680
681	#ifdef CONFIG_SMP	681	#ifdef CONFIG_SMP
682		682
683	#define rcu_dereference_check_sched_domain(p) \	683	#define rcu_dereference_check_sched_domain(p) \
684	rcu_dereference_check((p), \	684	rcu_dereference_check((p), \
685	lockdep_is_held(&sched_domains_mutex))	685	lockdep_is_held(&sched_domains_mutex))
686		686
687	/*	687	/*
688	* The domain tree (rq->sd) is protected by RCU's quiescent state transition.	688	* The domain tree (rq->sd) is protected by RCU's quiescent state transition.
689	* See detach_destroy_domains: synchronize_sched for details.	689	* See detach_destroy_domains: synchronize_sched for details.
690	*	690	*
691	* The domain tree of any CPU may only be accessed from within	691	* The domain tree of any CPU may only be accessed from within
692	* preempt-disabled sections.	692	* preempt-disabled sections.
693	*/	693	*/
694	#define for_each_domain(cpu, __sd) \	694	#define for_each_domain(cpu, __sd) \
695	for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \	695	for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
696	__sd; __sd = __sd->parent)	696	__sd; __sd = __sd->parent)
697		697
698	#define for_each_lower_domain(sd) for (; sd; sd = sd->child)	698	#define for_each_lower_domain(sd) for (; sd; sd = sd->child)
699		699
700	/**	700	/**
701	* highest_flag_domain - Return highest sched_domain containing flag.	701	* highest_flag_domain - Return highest sched_domain containing flag.
702	* @cpu: The cpu whose highest level of sched domain is to	702	* @cpu: The cpu whose highest level of sched domain is to
703	* be returned.	703	* be returned.
704	* @flag: The flag to check for the highest sched_domain	704	* @flag: The flag to check for the highest sched_domain
705	* for the given cpu.	705	* for the given cpu.
706	*	706	*
707	* Returns the highest sched_domain of a cpu which contains the given flag.	707	* Returns the highest sched_domain of a cpu which contains the given flag.
708	*/	708	*/
709	static inline struct sched_domain *highest_flag_domain(int cpu, int flag)	709	static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
710	{	710	{
711	struct sched_domain *sd, *hsd = NULL;	711	struct sched_domain *sd, *hsd = NULL;
712		712
713	for_each_domain(cpu, sd) {	713	for_each_domain(cpu, sd) {
714	if (!(sd->flags & flag))	714	if (!(sd->flags & flag))
715	break;	715	break;
716	hsd = sd;	716	hsd = sd;
717	}	717	}
718		718
719	return hsd;	719	return hsd;
720	}	720	}
721		721
722	static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)	722	static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
723	{	723	{
724	struct sched_domain *sd;	724	struct sched_domain *sd;
725		725
726	for_each_domain(cpu, sd) {	726	for_each_domain(cpu, sd) {
727	if (sd->flags & flag)	727	if (sd->flags & flag)
728	break;	728	break;
729	}	729	}
730		730
731	return sd;	731	return sd;
732	}	732	}
733		733
734	DECLARE_PER_CPU(struct sched_domain *, sd_llc);	734	DECLARE_PER_CPU(struct sched_domain *, sd_llc);
735	DECLARE_PER_CPU(int, sd_llc_size);	735	DECLARE_PER_CPU(int, sd_llc_size);
736	DECLARE_PER_CPU(int, sd_llc_id);	736	DECLARE_PER_CPU(int, sd_llc_id);
737	DECLARE_PER_CPU(struct sched_domain *, sd_numa);	737	DECLARE_PER_CPU(struct sched_domain *, sd_numa);
738	DECLARE_PER_CPU(struct sched_domain *, sd_busy);	738	DECLARE_PER_CPU(struct sched_domain *, sd_busy);
739	DECLARE_PER_CPU(struct sched_domain *, sd_asym);	739	DECLARE_PER_CPU(struct sched_domain *, sd_asym);
740		740
741	struct sched_group_power {	741	struct sched_group_power {
742	atomic_t ref;	742	atomic_t ref;
743	/*	743	/*
744	* CPU power of this group, SCHED_LOAD_SCALE being max power for a	744	* CPU power of this group, SCHED_LOAD_SCALE being max power for a
745	* single CPU.	745	* single CPU.
746	*/	746	*/
747	unsigned int power, power_orig;	747	unsigned int power, power_orig;
748	unsigned long next_update;	748	unsigned long next_update;
749	int imbalance; /* XXX unrelated to power but shared group state */	749	int imbalance; /* XXX unrelated to power but shared group state */
750	/*	750	/*
751	* Number of busy cpus in this group.	751	* Number of busy cpus in this group.
752	*/	752	*/
753	atomic_t nr_busy_cpus;	753	atomic_t nr_busy_cpus;
754		754
755	unsigned long cpumask[0]; /* iteration mask */	755	unsigned long cpumask[0]; /* iteration mask */
756	};	756	};
757		757
758	struct sched_group {	758	struct sched_group {
759	struct sched_group *next; /* Must be a circular list */	759	struct sched_group *next; /* Must be a circular list */
760	atomic_t ref;	760	atomic_t ref;
761		761
762	unsigned int group_weight;	762	unsigned int group_weight;
763	struct sched_group_power *sgp;	763	struct sched_group_power *sgp;
764		764
765	/*	765	/*
766	* The CPUs this group covers.	766	* The CPUs this group covers.
767	*	767	*
768	* NOTE: this field is variable length. (Allocated dynamically	768	* NOTE: this field is variable length. (Allocated dynamically
769	* by attaching extra space to the end of the structure,	769	* by attaching extra space to the end of the structure,
770	* depending on how many CPUs the kernel has booted up with)	770	* depending on how many CPUs the kernel has booted up with)
771	*/	771	*/
772	unsigned long cpumask[0];	772	unsigned long cpumask[0];
773	};	773	};
774		774
775	static inline struct cpumask sched_group_cpus(struct sched_group sg)	775	static inline struct cpumask sched_group_cpus(struct sched_group sg)
776	{	776	{
777	return to_cpumask(sg->cpumask);	777	return to_cpumask(sg->cpumask);
778	}	778	}
779		779
780	/*	780	/*
781	* cpumask masking which cpus in the group are allowed to iterate up the domain	781	* cpumask masking which cpus in the group are allowed to iterate up the domain
782	* tree.	782	* tree.
783	*/	783	*/
784	static inline struct cpumask sched_group_mask(struct sched_group sg)	784	static inline struct cpumask sched_group_mask(struct sched_group sg)
785	{	785	{
786	return to_cpumask(sg->sgp->cpumask);	786	return to_cpumask(sg->sgp->cpumask);
787	}	787	}
788		788
/**
 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
 * @group: The group whose first cpu is to be returned.
 *
 * The result is whatever cpumask_first() reports for the mask obtained
 * from sched_group_cpus(@group).
 */
static inline unsigned int group_first_cpu(struct sched_group *group)
{
	struct cpumask *span = sched_group_cpus(group);

	return cpumask_first(span);
}
797		797
798	extern int group_balance_cpu(struct sched_group *sg);	798	extern int group_balance_cpu(struct sched_group *sg);
799		799
800	#endif /* CONFIG_SMP */	800	#endif /* CONFIG_SMP */
801		801
802	#include "stats.h"	802	#include "stats.h"
803	#include "auto_group.h"	803	#include "auto_group.h"
804		804
805	#ifdef CONFIG_CGROUP_SCHED	805	#ifdef CONFIG_CGROUP_SCHED
806		806
807	/*	807	/*
808	* Return the group to which this tasks belongs.	808	* Return the group to which this tasks belongs.
809	*	809	*
810	* We cannot use task_css() and friends because the cgroup subsystem	810	* We cannot use task_css() and friends because the cgroup subsystem
811	* changes that value before the cgroup_subsys::attach() method is called,	811	* changes that value before the cgroup_subsys::attach() method is called,
812	* therefore we cannot pin it and might observe the wrong value.	812	* therefore we cannot pin it and might observe the wrong value.
813	*	813	*
814	* The same is true for autogroup's p->signal->autogroup->tg, the autogroup	814	* The same is true for autogroup's p->signal->autogroup->tg, the autogroup
815	* core changes this before calling sched_move_task().	815	* core changes this before calling sched_move_task().
816	*	816	*
817	* Instead we use a 'copy' which is updated from sched_move_task() while	817	* Instead we use a 'copy' which is updated from sched_move_task() while
818	* holding both task_struct::pi_lock and rq::lock.	818	* holding both task_struct::pi_lock and rq::lock.
819	*/	819	*/
820	static inline struct task_group task_group(struct task_struct p)	820	static inline struct task_group task_group(struct task_struct p)
821	{	821	{
822	return p->sched_task_group;	822	return p->sched_task_group;
823	}	823	}
824		824
/*
 * Change a task's cfs_rq and parent entity if it moves across CPUs/groups.
 *
 * @p:   task being re-homed
 * @cpu: index into the task group's per-CPU arrays
 *
 * With CONFIG_FAIR_GROUP_SCHED the fair entity's cfs_rq/parent are taken
 * from tg->cfs_rq[cpu] and tg->se[cpu]; with CONFIG_RT_GROUP_SCHED the RT
 * entity's rt_rq/parent are taken from tg->rt_rq[cpu] and tg->rt_se[cpu].
 * With neither option the function body is empty.
 */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
	/* Only looked up when at least one of the users below is built. */
	struct task_group *tg = task_group(p);
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
	p->se.cfs_rq = tg->cfs_rq[cpu];
	p->se.parent = tg->se[cpu];
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	p->rt.rt_rq = tg->rt_rq[cpu];
	p->rt.parent = tg->rt_se[cpu];
#endif
}
842		842
843	#else /* CONFIG_CGROUP_SCHED */	843	#else /* CONFIG_CGROUP_SCHED */
844		844
/* !CONFIG_CGROUP_SCHED: there is no per-group state to update. */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
}
/*
 * !CONFIG_CGROUP_SCHED stub: @p is ignored and NULL is always returned.
 */
static inline struct task_group *task_group(struct task_struct *p)
{
	return NULL;
}
850		850
851	#endif /* CONFIG_CGROUP_SCHED */	851	#endif /* CONFIG_CGROUP_SCHED */
852		852
/*
 * __set_task_cpu - bind @p to @cpu.
 *
 * Always refreshes the per-group runqueue pointers through set_task_rq().
 * On CONFIG_SMP builds it then records @cpu in the task's thread_info
 * and in p->wake_cpu.
 */
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
	set_task_rq(p, cpu);
#ifdef CONFIG_SMP
	/*
	 * Once ->cpu holds the new value, task_rq_lock(p, ...) can be
	 * successfully executed on another CPU. The write barrier makes
	 * sure the per-task updates above have completed by that point.
	 */
	smp_wmb();
	task_thread_info(p)->cpu = cpu;
	p->wake_cpu = cpu;
#endif
}
867		867
/*
 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
 * const_debug expands to `const` in that case and to __read_mostly
 * when the option is enabled.
 */
#ifndef CONFIG_SCHED_DEBUG
# define const_debug const
#else
# include <linux/static_key.h>
# define const_debug __read_mostly
#endif

extern const_debug unsigned int sysctl_sched_features;
879		879
/*
 * Expand every SCHED_FEAT() entry of features.h into an enumerator named
 * __SCHED_FEAT_<name>. __SCHED_FEAT_NR follows the last entry and so
 * equals the number of entries.
 */
#define SCHED_FEAT(name, enabled) __SCHED_FEAT_##name ,

enum {
#include "features.h"
	__SCHED_FEAT_NR,
};

#undef SCHED_FEAT
889		889
#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
/*
 * Helper picked by static_branch_<name>() when the features.h entry
 * passes `true` as its second SCHED_FEAT() argument.
 */
static __always_inline bool static_branch__true(struct static_key *key)
{
	return static_key_true(key); /* Not out of line branch. */
}

/* Counterpart used when the second SCHED_FEAT() argument is `false`. */
static __always_inline bool static_branch__false(struct static_key *key)
{
	return static_key_false(key); /* Out of line branch. */
}

/* Emit one static_branch_<name>() wrapper per features.h entry. */
#define SCHED_FEAT(name, enabled)					\
static __always_inline bool static_branch_##name(struct static_key *key) \
{									\
	return static_branch__##enabled(key);				\
}

#include "features.h"

#undef SCHED_FEAT

extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
/* Test feature x through its static key. */
#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
/* Test feature x as a bit in sysctl_sched_features. */
#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
916		916
/*
 * NUMA-balancing switches. Without CONFIG_NUMA_BALANCING both evaluate
 * to the constant 0. With it, sched_feat_numa() forwards to sched_feat()
 * and numabalancing_enabled is either the NUMA feature test
 * (CONFIG_SCHED_DEBUG) or an external bool.
 */
#ifndef CONFIG_NUMA_BALANCING
#define sched_feat_numa(x) (0)
#define numabalancing_enabled (0)
#else /* CONFIG_NUMA_BALANCING */
#define sched_feat_numa(x) sched_feat(x)
#ifdef CONFIG_SCHED_DEBUG
#define numabalancing_enabled sched_feat_numa(NUMA)
#else
extern bool numabalancing_enabled;
#endif /* CONFIG_SCHED_DEBUG */
#endif /* CONFIG_NUMA_BALANCING */
928		928
929	static inline u64 global_rt_period(void)	929	static inline u64 global_rt_period(void)
930	{	930	{
931	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;	931	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
932	}	932	}
933		933
934	static inline u64 global_rt_runtime(void)	934	static inline u64 global_rt_runtime(void)
935	{	935	{
936	if (sysctl_sched_rt_runtime < 0)	936	if (sysctl_sched_rt_runtime < 0)
937	return RUNTIME_INF;	937	return RUNTIME_INF;
938		938
939	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;	939	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
940	}	940	}
941		941
942	static inline int task_current(struct rq rq, struct task_struct p)	942	static inline int task_current(struct rq rq, struct task_struct p)
943	{	943	{
944	return rq->curr == p;	944	return rq->curr == p;
945	}	945	}
946		946
/*
 * task_running - report whether @p is running.
 *
 * On CONFIG_SMP this is p->on_cpu (and @rq is not consulted); on
 * uniprocessor builds it falls back to task_current(rq, p).
 */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifdef CONFIG_SMP
	return p->on_cpu;
#else
	return task_current(rq, p);
#endif
}
955		955
956		956
/*
 * Default no-op versions of the context-switch hooks; each is defined
 * only if no definition of that name is already visible.
 */
#if !defined(prepare_arch_switch)
# define prepare_arch_switch(next) do { } while (0)
#endif
#if !defined(finish_arch_switch)
# define finish_arch_switch(prev) do { } while (0)
#endif
#if !defined(finish_arch_post_lock_switch)
# define finish_arch_post_lock_switch() do { } while (0)
#endif
966		966
967	#ifndef __ARCH_WANT_UNLOCKED_CTXSW	967	#ifndef __ARCH_WANT_UNLOCKED_CTXSW
/*
 * prepare_lock_switch - !__ARCH_WANT_UNLOCKED_CTXSW variant.
 *
 * Marks @next as on-CPU for SMP builds; @rq is not touched here.
 */
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
	/*
	 * We can optimise this out completely for !SMP, because the
	 * SMP rebalancing from interrupt is the only thing that cares
	 * here.
	 */
	next->on_cpu = 1;
#endif
}
979		979
980	static inline void finish_lock_switch(struct rq rq, struct task_struct prev)	980	static inline void finish_lock_switch(struct rq rq, struct task_struct prev)
981	{	981	{
982	#ifdef CONFIG_SMP	982	#ifdef CONFIG_SMP
983	/*	983	/*
984	* After ->on_cpu is cleared, the task can be moved to a different CPU.	984	* After ->on_cpu is cleared, the task can be moved to a different CPU.
985	* We must ensure this doesn't happen until the switch is completely	985	* We must ensure this doesn't happen until the switch is completely
986	* finished.	986	* finished.
987	*/	987	*/
988	smp_wmb();	988	smp_wmb();
989	prev->on_cpu = 0;	989	prev->on_cpu = 0;
990	#endif	990	#endif
991	#ifdef CONFIG_DEBUG_SPINLOCK	991	#ifdef CONFIG_DEBUG_SPINLOCK
992	/* this is a valid case when another task releases the spinlock */	992	/* this is a valid case when another task releases the spinlock */
993	rq->lock.owner = current;	993	rq->lock.owner = current;
994	#endif	994	#endif
995	/*	995	/*
996	* If we are tracking spinlock dependencies then we have to	996	* If we are tracking spinlock dependencies then we have to
997	* fix up the runqueue lock - which gets 'carried over' from	997	* fix up the runqueue lock - which gets 'carried over' from
998	* prev into current:	998	* prev into current:
999	*/	999	*/
1000	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);	1000	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
1001		1001
1002	raw_spin_unlock_irq(&rq->lock);	1002	raw_spin_unlock_irq(&rq->lock);
1003	}	1003	}
1004		1004
1005	#else /* __ARCH_WANT_UNLOCKED_CTXSW */	1005	#else /* __ARCH_WANT_UNLOCKED_CTXSW */
1006	static inline void prepare_lock_switch(struct rq rq, struct task_struct next)	1006	static inline void prepare_lock_switch(struct rq rq, struct task_struct next)
1007	{	1007	{
1008	#ifdef CONFIG_SMP	1008	#ifdef CONFIG_SMP
1009	/*	1009	/*
1010	* We can optimise this out completely for !SMP, because the	1010	* We can optimise this out completely for !SMP, because the
1011	* SMP rebalancing from interrupt is the only thing that cares	1011	* SMP rebalancing from interrupt is the only thing that cares
1012	* here.	1012	* here.
1013	*/	1013	*/
1014	next->on_cpu = 1;	1014	next->on_cpu = 1;
1015	#endif	1015	#endif
1016	raw_spin_unlock(&rq->lock);	1016	raw_spin_unlock(&rq->lock);
1017	}	1017	}
1018		1018
/*
 * finish_lock_switch - __ARCH_WANT_UNLOCKED_CTXSW variant.
 *
 * Clears @prev->on_cpu (SMP) and re-enables local interrupts; @rq is
 * not touched in this variant.
 */
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
	/*
	 * After ->on_cpu is cleared, the task can be moved to a different CPU.
	 * We must ensure this doesn't happen until the switch is completely
	 * finished.
	 */
	smp_wmb();
	prev->on_cpu = 0;
#endif
	local_irq_enable();
}
1032	#endif /* __ARCH_WANT_UNLOCKED_CTXSW */	1032	#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
1033		1033
/*
 * wake flags
 */
#define WF_SYNC		0x01	/* waker goes to sleep after wakeup */
#define WF_FORK		0x02	/* child wakeup after fork */
#define WF_MIGRATED	0x04	/* internal use, task got migrated */
1040		1040
/*
 * To aid in avoiding the subversion of "niceness" due to uneven distribution
 * of tasks with abnormal "nice" values across CPUs the contribution that
 * each task makes to its run queue's load is weighted according to its
 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
 * scaled version of the new time slice allocation that they receive on time
 * slice expiry etc.
 */

/* WMULT_IDLEPRIO is 2^32 / WEIGHT_IDLEPRIO, rounded down. */
#define WEIGHT_IDLEPRIO		3
#define WMULT_IDLEPRIO		1431655765
1052		1052
/*
 * Load weight per nice level; entry 0 is nice -20 and entry 39 is
 * nice 19, so nice 0 (weight 1024) sits at index 20.
 *
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
 * If a task goes up by ~10% and another task goes down by ~10% then
 * the relative distance between them is ~25%.)
 */
static const int prio_to_weight[40] = {
	/* nice -20 .. -16 */	88761,	71755,	56483,	46273,	36291,
	/* nice -15 .. -11 */	29154,	23254,	18705,	14949,	11916,
	/* nice -10 ..  -6 */	 9548,	 7620,	 6100,	 4904,	 3906,
	/* nice  -5 ..  -1 */	 3121,	 2501,	 1991,	 1586,	 1277,
	/* nice   0 ..   4 */	 1024,	  820,	  655,	  526,	  423,
	/* nice   5 ..   9 */	  335,	  272,	  215,	  172,	  137,
	/* nice  10 ..  14 */	  110,	   87,	   70,	   56,	   45,
	/* nice  15 ..  19 */	   36,	   29,	   23,	   18,	   15,
};
1075		1075
1076	/*	1076	/*
1077	* Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.	1077	* Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1078	*	1078	*
1079	* In cases where the weight does not change often, we can use the	1079	* In cases where the weight does not change often, we can use the
1080	* precalculated inverse to speed up arithmetics by turning divisions	1080	* precalculated inverse to speed up arithmetics by turning divisions
1081	* into multiplications:	1081	* into multiplications:
1082	*/	1082	*/
1083	static const u32 prio_to_wmult[40] = {	1083	static const u32 prio_to_wmult[40] = {
1084	/* -20 */ 48388, 59856, 76040, 92818, 118348,	1084	/* -20 */ 48388, 59856, 76040, 92818, 118348,
1085	/* -15 */ 147320, 184698, 229616, 287308, 360437,	1085	/* -15 */ 147320, 184698, 229616, 287308, 360437,
1086	/* -10 */ 449829, 563644, 704093, 875809, 1099582,	1086	/* -10 */ 449829, 563644, 704093, 875809, 1099582,
1087	/* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,	1087	/* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
1088	/* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,	1088	/* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
1089	/* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,	1089	/* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
1090	/* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,	1090	/* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
1091	/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,	1091	/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1092	};	1092	};
1093		1093
/* Flag bits for the sched_class enqueue_task() hook's @flags argument. */
#define ENQUEUE_WAKEUP		1
#define ENQUEUE_HEAD		2
#ifndef CONFIG_SMP
#define ENQUEUE_WAKING		0
#else
#define ENQUEUE_WAKING		4	/* sched_class::task_waking was called */
#endif
#define ENQUEUE_REPLENISH	8

/* Flag bit for the dequeue_task() hook's @flags argument. */
#define DEQUEUE_SLEEP		1

/* Sentinel pointer value; see the pick_next_task() hook. */
#define RETRY_TASK		((void *)-1UL)
1106		1106
/*
 * Method table implemented by each scheduling class. Classes are chained
 * through ->next, and every hook is a function pointer invoked through
 * the task's ->sched_class (see put_prev_task()).
 */
struct sched_class {
	const struct sched_class *next;

	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
	void (*yield_task) (struct rq *rq);
	bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);

	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);

	/*
	 * It is the responsibility of the pick_next_task() method that will
	 * return the next task to call put_prev_task() on the @prev task or
	 * something equivalent.
	 *
	 * May return RETRY_TASK when it finds a higher prio class has runnable
	 * tasks.
	 */
	struct task_struct * (*pick_next_task) (struct rq *rq,
						struct task_struct *prev);
	void (*put_prev_task) (struct rq *rq, struct task_struct *p);

#ifdef CONFIG_SMP
	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
	void (*migrate_task_rq)(struct task_struct *p, int next_cpu);

	void (*post_schedule) (struct rq *this_rq);
	void (*task_waking) (struct task_struct *task);
	void (*task_woken) (struct rq *this_rq, struct task_struct *task);

	void (*set_cpus_allowed)(struct task_struct *p,
				 const struct cpumask *newmask);

	void (*rq_online)(struct rq *rq);
	void (*rq_offline)(struct rq *rq);
#endif

	void (*set_curr_task) (struct rq *rq);
	void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
	void (*task_fork) (struct task_struct *p);
	void (*task_dead) (struct task_struct *p);

	void (*switched_from) (struct rq *this_rq, struct task_struct *task);
	void (*switched_to) (struct rq *this_rq, struct task_struct *task);
	void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
			     int oldprio);

	unsigned int (*get_rr_interval) (struct rq *rq,
					 struct task_struct *task);

#ifdef CONFIG_FAIR_GROUP_SCHED
	void (*task_move_group) (struct task_struct *p, int on_rq);
#endif
};
1161		1161
1162	static inline void put_prev_task(struct rq rq, struct task_struct prev)	1162	static inline void put_prev_task(struct rq rq, struct task_struct prev)
1163	{	1163	{
1164	prev->sched_class->put_prev_task(rq, prev);	1164	prev->sched_class->put_prev_task(rq, prev);
1165	}	1165	}
1166		1166
/* Head of the class chain walked by for_each_class(). */
#define sched_class_highest (&stop_sched_class)

/*
 * Walk the scheduling classes from sched_class_highest, following
 * ->next until it is NULL. @class is the iteration variable.
 */
#define for_each_class(class)						\
	for (class = sched_class_highest; class; class = class->next)
1170		1170
1171	extern const struct sched_class stop_sched_class;	1171	extern const struct sched_class stop_sched_class;
1172	extern const struct sched_class dl_sched_class;	1172	extern const struct sched_class dl_sched_class;
1173	extern const struct sched_class rt_sched_class;	1173	extern const struct sched_class rt_sched_class;
1174	extern const struct sched_class fair_sched_class;	1174	extern const struct sched_class fair_sched_class;
1175	extern const struct sched_class idle_sched_class;	1175	extern const struct sched_class idle_sched_class;
1176		1176
1177		1177
1178	#ifdef CONFIG_SMP	1178	#ifdef CONFIG_SMP
1179		1179
1180	extern void update_group_power(struct sched_domain *sd, int cpu);	1180	extern void update_group_power(struct sched_domain *sd, int cpu);
1181		1181
1182	extern void trigger_load_balance(struct rq *rq);	1182	extern void trigger_load_balance(struct rq *rq);
1183		1183
1184	extern void idle_enter_fair(struct rq *this_rq);	1184	extern void idle_enter_fair(struct rq *this_rq);
1185	extern void idle_exit_fair(struct rq *this_rq);	1185	extern void idle_exit_fair(struct rq *this_rq);
1186		1186
1187	#else	1187	#else
1188		1188
/* !CONFIG_SMP: the fair-class idle enter/exit hooks are no-ops. */
static inline void idle_enter_fair(struct rq *rq)
{
}

static inline void idle_exit_fair(struct rq *rq)
{
}
1191		1191
1192	#endif	1192	#endif
1193		1193
1194	extern void sysrq_sched_debug_show(void);	1194	extern void sysrq_sched_debug_show(void);
1195	extern void sched_init_granularity(void);	1195	extern void sched_init_granularity(void);
1196	extern void update_max_interval(void);	1196	extern void update_max_interval(void);
1197		1197
/* Per-class init entry points (each one declared exactly once). */
extern void init_sched_dl_class(void);
extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
1202		1202
1203	extern void resched_task(struct task_struct *p);	1203	extern void resched_task(struct task_struct *p);
1204	extern void resched_cpu(int cpu);	1204	extern void resched_cpu(int cpu);
1205		1205
1206	extern struct rt_bandwidth def_rt_bandwidth;	1206	extern struct rt_bandwidth def_rt_bandwidth;
1207	extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);	1207	extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
1208		1208
1209	extern struct dl_bandwidth def_dl_bandwidth;	1209	extern struct dl_bandwidth def_dl_bandwidth;
1210	extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);	1210	extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
1211	extern void init_dl_task_timer(struct sched_dl_entity *dl_se);	1211	extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
1212		1212
1213	unsigned long to_ratio(u64 period, u64 runtime);	1213	unsigned long to_ratio(u64 period, u64 runtime);
1214		1214
1215	extern void update_idle_cpu_load(struct rq *this_rq);	1215	extern void update_idle_cpu_load(struct rq *this_rq);
1216		1216
1217	extern void init_task_runnable_average(struct task_struct *p);	1217	extern void init_task_runnable_average(struct task_struct *p);
1218		1218
1219	static inline void inc_nr_running(struct rq *rq)	1219	static inline void inc_nr_running(struct rq *rq)
1220	{	1220	{
1221	rq->nr_running++;	1221	rq->nr_running++;
1222		1222
1223	#ifdef CONFIG_NO_HZ_FULL	1223	#ifdef CONFIG_NO_HZ_FULL
1224	if (rq->nr_running == 2) {	1224	if (rq->nr_running == 2) {
1225	if (tick_nohz_full_cpu(rq->cpu)) {	1225	if (tick_nohz_full_cpu(rq->cpu)) {
1226	/* Order rq->nr_running write against the IPI */	1226	/* Order rq->nr_running write against the IPI */
1227	smp_wmb();	1227	smp_wmb();
1228	smp_send_reschedule(rq->cpu);	1228	smp_send_reschedule(rq->cpu);
1229	}	1229	}
1230	}	1230	}
1231	#endif	1231	#endif
1232	}	1232	}
1233		1233
1234	static inline void dec_nr_running(struct rq *rq)	1234	static inline void dec_nr_running(struct rq *rq)
1235	{	1235	{
1236	rq->nr_running--;	1236	rq->nr_running--;
1237	}	1237	}
1238		1238
/*
 * rq_last_tick_reset - stamp @rq->last_sched_tick with the current
 * jiffies value. Compiles to nothing without CONFIG_NO_HZ_FULL.
 */
static inline void rq_last_tick_reset(struct rq *rq)
{
#ifdef CONFIG_NO_HZ_FULL
	rq->last_sched_tick = jiffies;
#endif
}
1245		1245
1246	extern void update_rq_clock(struct rq *rq);	1246	extern void update_rq_clock(struct rq *rq);
1247		1247
/* Defined outside this header; @rq and @p are passed by pointer. */
extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);

extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
1252		1252
1253	extern const_debug unsigned int sysctl_sched_time_avg;	1253	extern const_debug unsigned int sysctl_sched_time_avg;
1254	extern const_debug unsigned int sysctl_sched_nr_migrate;	1254	extern const_debug unsigned int sysctl_sched_nr_migrate;
1255	extern const_debug unsigned int sysctl_sched_migration_cost;	1255	extern const_debug unsigned int sysctl_sched_migration_cost;
1256		1256
1257	static inline u64 sched_avg_period(void)	1257	static inline u64 sched_avg_period(void)
1258	{	1258	{
1259	return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;	1259	return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1260	}	1260	}
1261		1261
1262	#ifdef CONFIG_SCHED_HRTICK	1262	#ifdef CONFIG_SCHED_HRTICK
1263		1263
1264	/*	1264	/*
1265	* Use hrtick when:	1265	* Use hrtick when:
1266	* - enabled by features	1266	* - enabled by features
1267	* - hrtimer is actually high res	1267	* - hrtimer is actually high res
1268	*/	1268	*/
1269	static inline int hrtick_enabled(struct rq *rq)	1269	static inline int hrtick_enabled(struct rq *rq)
1270	{	1270	{
1271	if (!sched_feat(HRTICK))	1271	if (!sched_feat(HRTICK))
1272	return 0;	1272	return 0;
1273	if (!cpu_active(cpu_of(rq)))	1273	if (!cpu_active(cpu_of(rq)))
1274	return 0;	1274	return 0;
1275	return hrtimer_is_hres_active(&rq->hrtick_timer);	1275	return hrtimer_is_hres_active(&rq->hrtick_timer);
1276	}	1276	}
1277		1277
1278	void hrtick_start(struct rq *rq, u64 delay);	1278	void hrtick_start(struct rq *rq, u64 delay);
1279		1279
1280	#else	1280	#else
1281		1281
1282	static inline int hrtick_enabled(struct rq *rq)	1282	static inline int hrtick_enabled(struct rq *rq)
1283	{	1283	{
1284	return 0;	1284	return 0;
1285	}	1285	}
1286		1286
1287	#endif /* CONFIG_SCHED_HRTICK */	1287	#endif /* CONFIG_SCHED_HRTICK */
1288		1288
1289	#ifdef CONFIG_SMP	1289	#ifdef CONFIG_SMP
1290	extern void sched_avg_update(struct rq *rq);	1290	extern void sched_avg_update(struct rq *rq);
1291	static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)	1291	static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1292	{	1292	{
1293	rq->rt_avg += rt_delta;	1293	rq->rt_avg += rt_delta;
1294	sched_avg_update(rq);	1294	sched_avg_update(rq);
1295	}	1295	}
1296	#else	1296	#else
1297	static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }	1297	static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
1298	static inline void sched_avg_update(struct rq *rq) { }	1298	static inline void sched_avg_update(struct rq *rq) { }
1299	#endif	1299	#endif
1300		1300
1301	extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);	1301	extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
1302		1302
1303	#ifdef CONFIG_SMP	1303	#ifdef CONFIG_SMP
1304	#ifdef CONFIG_PREEMPT	1304	#ifdef CONFIG_PREEMPT
1305		1305
1306	static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);	1306	static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
1307		1307
1308	/*	1308	/*
1309	* fair double_lock_balance: Safely acquires both rq->locks in a fair	1309	* fair double_lock_balance: Safely acquires both rq->locks in a fair
1310	* way at the expense of forcing extra atomic operations in all	1310	* way at the expense of forcing extra atomic operations in all
1311	* invocations. This assures that the double_lock is acquired using the	1311	* invocations. This assures that the double_lock is acquired using the
1312	* same underlying policy as the spinlock_t on this architecture, which	1312	* same underlying policy as the spinlock_t on this architecture, which
1313	* reduces latency compared to the unfair variant below. However, it	1313	* reduces latency compared to the unfair variant below. However, it
1314	* also adds more overhead and therefore may reduce throughput.	1314	* also adds more overhead and therefore may reduce throughput.
1315	*/	1315	*/
1316	static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)	1316	static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1317	__releases(this_rq->lock)	1317	__releases(this_rq->lock)
1318	__acquires(busiest->lock)	1318	__acquires(busiest->lock)
1319	__acquires(this_rq->lock)	1319	__acquires(this_rq->lock)
1320	{	1320	{
1321	raw_spin_unlock(&this_rq->lock);	1321	raw_spin_unlock(&this_rq->lock);
1322	double_rq_lock(this_rq, busiest);	1322	double_rq_lock(this_rq, busiest);
1323		1323
1324	return 1;	1324	return 1;
1325	}	1325	}
1326		1326
1327	#else	1327	#else
1328	/*	1328	/*
1329	* Unfair double_lock_balance: Optimizes throughput at the expense of	1329	* Unfair double_lock_balance: Optimizes throughput at the expense of
1330	* latency by eliminating extra atomic operations when the locks are	1330	* latency by eliminating extra atomic operations when the locks are
1331	* already in proper order on entry. This favors lower cpu-ids and will	1331	* already in proper order on entry. This favors lower cpu-ids and will
1332	* grant the double lock to lower cpus over higher ids under contention,	1332	* grant the double lock to lower cpus over higher ids under contention,
1333	* regardless of entry order into the function.	1333	* regardless of entry order into the function.
1334	*/	1334	*/
1335	static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)	1335	static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1336	__releases(this_rq->lock)	1336	__releases(this_rq->lock)
1337	__acquires(busiest->lock)	1337	__acquires(busiest->lock)
1338	__acquires(this_rq->lock)	1338	__acquires(this_rq->lock)
1339	{	1339	{
1340	int ret = 0;	1340	int ret = 0;
1341		1341
1342	if (unlikely(!raw_spin_trylock(&busiest->lock))) {	1342	if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1343	if (busiest < this_rq) {	1343	if (busiest < this_rq) {
1344	raw_spin_unlock(&this_rq->lock);	1344	raw_spin_unlock(&this_rq->lock);
1345	raw_spin_lock(&busiest->lock);	1345	raw_spin_lock(&busiest->lock);
1346	raw_spin_lock_nested(&this_rq->lock,	1346	raw_spin_lock_nested(&this_rq->lock,
1347	SINGLE_DEPTH_NESTING);	1347	SINGLE_DEPTH_NESTING);
1348	ret = 1;	1348	ret = 1;
1349	} else	1349	} else
1350	raw_spin_lock_nested(&busiest->lock,	1350	raw_spin_lock_nested(&busiest->lock,
1351	SINGLE_DEPTH_NESTING);	1351	SINGLE_DEPTH_NESTING);
1352	}	1352	}
1353	return ret;	1353	return ret;
1354	}	1354	}
1355		1355
1356	#endif /* CONFIG_PREEMPT */	1356	#endif /* CONFIG_PREEMPT */
1357		1357
1358	/*	1358	/*
1359	* double_lock_balance - lock the busiest runqueue, this_rq is locked already.	1359	* double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1360	*/	1360	*/
1361	static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)	1361	static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1362	{	1362	{
1363	if (unlikely(!irqs_disabled())) {	1363	if (unlikely(!irqs_disabled())) {
1364	/* printk() doesn't work good under rq->lock */	1364	/* printk() doesn't work good under rq->lock */
1365	raw_spin_unlock(&this_rq->lock);	1365	raw_spin_unlock(&this_rq->lock);
1366	BUG_ON(1);	1366	BUG_ON(1);
1367	}	1367	}
1368		1368
1369	return _double_lock_balance(this_rq, busiest);	1369	return _double_lock_balance(this_rq, busiest);
1370	}	1370	}
1371		1371
1372	static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)	1372	static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1373	__releases(busiest->lock)	1373	__releases(busiest->lock)
1374	{	1374	{
1375	raw_spin_unlock(&busiest->lock);	1375	raw_spin_unlock(&busiest->lock);
1376	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);	1376	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1377	}	1377	}
1378		1378
1379	static inline void double_lock(spinlock_t *l1, spinlock_t *l2)	1379	static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
1380	{	1380	{
1381	if (l1 > l2)	1381	if (l1 > l2)
1382	swap(l1, l2);	1382	swap(l1, l2);
1383		1383
1384	spin_lock(l1);	1384	spin_lock(l1);
1385	spin_lock_nested(l2, SINGLE_DEPTH_NESTING);	1385	spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1386	}	1386	}
1387		1387
1388	static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2)	1388	static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2)
1389	{	1389	{
1390	if (l1 > l2)	1390	if (l1 > l2)
1391	swap(l1, l2);	1391	swap(l1, l2);
1392		1392
1393	spin_lock_irq(l1);	1393	spin_lock_irq(l1);
1394	spin_lock_nested(l2, SINGLE_DEPTH_NESTING);	1394	spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1395	}	1395	}
1396		1396
1397	static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)	1397	static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
1398	{	1398	{
1399	if (l1 > l2)	1399	if (l1 > l2)
1400	swap(l1, l2);	1400	swap(l1, l2);
1401		1401
1402	raw_spin_lock(l1);	1402	raw_spin_lock(l1);
1403	raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);	1403	raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1404	}	1404	}
1405		1405
1406	/*	1406	/*
1407	* double_rq_lock - safely lock two runqueues	1407	* double_rq_lock - safely lock two runqueues
1408	*	1408	*
1409	* Note this does not disable interrupts like task_rq_lock,	1409	* Note this does not disable interrupts like task_rq_lock,
1410	* you need to do so manually before calling.	1410	* you need to do so manually before calling.
1411	*/	1411	*/
1412	static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)	1412	static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
1413	__acquires(rq1->lock)	1413	__acquires(rq1->lock)
1414	__acquires(rq2->lock)	1414	__acquires(rq2->lock)
1415	{	1415	{
1416	BUG_ON(!irqs_disabled());	1416	BUG_ON(!irqs_disabled());
1417	if (rq1 == rq2) {	1417	if (rq1 == rq2) {
1418	raw_spin_lock(&rq1->lock);	1418	raw_spin_lock(&rq1->lock);
1419	__acquire(rq2->lock); /* Fake it out ;) */	1419	__acquire(rq2->lock); /* Fake it out ;) */
1420	} else {	1420	} else {
1421	if (rq1 < rq2) {	1421	if (rq1 < rq2) {
1422	raw_spin_lock(&rq1->lock);	1422	raw_spin_lock(&rq1->lock);
1423	raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);	1423	raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1424	} else {	1424	} else {
1425	raw_spin_lock(&rq2->lock);	1425	raw_spin_lock(&rq2->lock);
1426	raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);	1426	raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1427	}	1427	}
1428	}	1428	}
1429	}	1429	}
1430		1430
1431	/*	1431	/*
1432	* double_rq_unlock - safely unlock two runqueues	1432	* double_rq_unlock - safely unlock two runqueues
1433	*	1433	*
1434	* Note this does not restore interrupts like task_rq_unlock,	1434	* Note this does not restore interrupts like task_rq_unlock,
1435	* you need to do so manually after calling.	1435	* you need to do so manually after calling.
1436	*/	1436	*/
1437	static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)	1437	static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1438	__releases(rq1->lock)	1438	__releases(rq1->lock)
1439	__releases(rq2->lock)	1439	__releases(rq2->lock)
1440	{	1440	{
1441	raw_spin_unlock(&rq1->lock);	1441	raw_spin_unlock(&rq1->lock);
1442	if (rq1 != rq2)	1442	if (rq1 != rq2)
1443	raw_spin_unlock(&rq2->lock);	1443	raw_spin_unlock(&rq2->lock);
1444	else	1444	else
1445	__release(rq2->lock);	1445	__release(rq2->lock);
1446	}	1446	}
1447		1447
1448	#else /* CONFIG_SMP */	1448	#else /* CONFIG_SMP */
1449		1449
1450	/*	1450	/*
1451	* double_rq_lock - safely lock two runqueues	1451	* double_rq_lock - safely lock two runqueues
1452	*	1452	*
1453	* Note this does not disable interrupts like task_rq_lock,	1453	* Note this does not disable interrupts like task_rq_lock,
1454	* you need to do so manually before calling.	1454	* you need to do so manually before calling.
1455	*/	1455	*/
1456	static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)	1456	static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
1457	__acquires(rq1->lock)	1457	__acquires(rq1->lock)
1458	__acquires(rq2->lock)	1458	__acquires(rq2->lock)
1459	{	1459	{
1460	BUG_ON(!irqs_disabled());	1460	BUG_ON(!irqs_disabled());
1461	BUG_ON(rq1 != rq2);	1461	BUG_ON(rq1 != rq2);
1462	raw_spin_lock(&rq1->lock);	1462	raw_spin_lock(&rq1->lock);
1463	__acquire(rq2->lock); /* Fake it out ;) */	1463	__acquire(rq2->lock); /* Fake it out ;) */
1464	}	1464	}
1465		1465
1466	/*	1466	/*
1467	* double_rq_unlock - safely unlock two runqueues	1467	* double_rq_unlock - safely unlock two runqueues
1468	*	1468	*
1469	* Note this does not restore interrupts like task_rq_unlock,	1469	* Note this does not restore interrupts like task_rq_unlock,
1470	* you need to do so manually after calling.	1470	* you need to do so manually after calling.
1471	*/	1471	*/
1472	static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)	1472	static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1473	__releases(rq1->lock)	1473	__releases(rq1->lock)
1474	__releases(rq2->lock)	1474	__releases(rq2->lock)
1475	{	1475	{
1476	BUG_ON(rq1 != rq2);	1476	BUG_ON(rq1 != rq2);
1477	raw_spin_unlock(&rq1->lock);	1477	raw_spin_unlock(&rq1->lock);
1478	__release(rq2->lock);	1478	__release(rq2->lock);
1479	}	1479	}
1480		1480
1481	#endif	1481	#endif
1482		1482
1483	extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);	1483	extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
1484	extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);	1484	extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
1485	extern void print_cfs_stats(struct seq_file *m, int cpu);	1485	extern void print_cfs_stats(struct seq_file *m, int cpu);
1486	extern void print_rt_stats(struct seq_file *m, int cpu);	1486	extern void print_rt_stats(struct seq_file *m, int cpu);
1487		1487
1488	extern void init_cfs_rq(struct cfs_rq *cfs_rq);	1488	extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1489	extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);	1489	extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1490	extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq);	1490	extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq);
1491		1491
1492	extern void cfs_bandwidth_usage_inc(void);	1492	extern void cfs_bandwidth_usage_inc(void);
1493	extern void cfs_bandwidth_usage_dec(void);	1493	extern void cfs_bandwidth_usage_dec(void);
1494		1494
1495	#ifdef CONFIG_NO_HZ_COMMON	1495	#ifdef CONFIG_NO_HZ_COMMON
1496	enum rq_nohz_flag_bits {	1496	enum rq_nohz_flag_bits {
1497	NOHZ_TICK_STOPPED,	1497	NOHZ_TICK_STOPPED,
1498	NOHZ_BALANCE_KICK,	1498	NOHZ_BALANCE_KICK,
1499	};	1499	};
1500		1500
1501	#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)	1501	#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
1502	#endif	1502	#endif
1503		1503
1504	#ifdef CONFIG_IRQ_TIME_ACCOUNTING	1504	#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1505		1505
1506	DECLARE_PER_CPU(u64, cpu_hardirq_time);	1506	DECLARE_PER_CPU(u64, cpu_hardirq_time);
1507	DECLARE_PER_CPU(u64, cpu_softirq_time);	1507	DECLARE_PER_CPU(u64, cpu_softirq_time);
1508		1508
1509	#ifndef CONFIG_64BIT	1509	#ifndef CONFIG_64BIT
1510	DECLARE_PER_CPU(seqcount_t, irq_time_seq);	1510	DECLARE_PER_CPU(seqcount_t, irq_time_seq);
1511		1511
1512	static inline void irq_time_write_begin(void)	1512	static inline void irq_time_write_begin(void)
1513	{	1513	{
1514	__this_cpu_inc(irq_time_seq.sequence);	1514	__this_cpu_inc(irq_time_seq.sequence);
1515	smp_wmb();	1515	smp_wmb();
1516	}	1516	}
1517		1517
1518	static inline void irq_time_write_end(void)	1518	static inline void irq_time_write_end(void)
1519	{	1519	{
1520	smp_wmb();	1520	smp_wmb();
1521	__this_cpu_inc(irq_time_seq.sequence);	1521	__this_cpu_inc(irq_time_seq.sequence);
1522	}	1522	}
1523		1523
1524	static inline u64 irq_time_read(int cpu)	1524	static inline u64 irq_time_read(int cpu)
1525	{	1525	{
1526	u64 irq_time;	1526	u64 irq_time;
1527	unsigned seq;	1527	unsigned seq;
1528		1528
1529	do {	1529	do {
1530	seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));	1530	seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1531	irq_time = per_cpu(cpu_softirq_time, cpu) +	1531	irq_time = per_cpu(cpu_softirq_time, cpu) +
1532	per_cpu(cpu_hardirq_time, cpu);	1532	per_cpu(cpu_hardirq_time, cpu);
1533	} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));	1533	} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1534		1534
1535	return irq_time;	1535	return irq_time;
1536	}	1536	}
1537	#else /* CONFIG_64BIT */	1537	#else /* CONFIG_64BIT */
1538	static inline void irq_time_write_begin(void)	1538	static inline void irq_time_write_begin(void)
1539	{	1539	{
1540	}	1540	}
1541		1541
1542	static inline void irq_time_write_end(void)	1542	static inline void irq_time_write_end(void)
1543	{	1543	{
1544	}	1544	}
1545		1545
1546	static inline u64 irq_time_read(int cpu)	1546	static inline u64 irq_time_read(int cpu)
1547	{	1547	{
1548	return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);	1548	return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1549	}	1549	}
1550	#endif /* CONFIG_64BIT */	1550	#endif /* CONFIG_64BIT */
1551	#endif /* CONFIG_IRQ_TIME_ACCOUNTING */	1551	#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
1552		1552

Link here
Eric Lee 2016-08-17 09:25:49 UTC

mentioned in commit 77a4d1

_mentioned in commit 77a4d1_

Choose File ... File name...
Cancel
Link here
Eric Lee 2016-08-24 03:20:52 UTC

mentioned in commit 77a4d1

_mentioned in commit 77a4d1_

Choose File ... File name...
Cancel
Link here
Eric Lee 2017-03-31 17:20:46 UTC

mentioned in commit 77a4d1

_mentioned in commit 77a4d1_

Choose File ... File name...
Cancel
Link here
Eric Lee 2018-09-03 12:18:38 UTC

mentioned in commit 77a4d1

_mentioned in commit 77a4d1_

Choose File ... File name...
Cancel
Link here
Eric Lee 2018-09-03 13:52:26 UTC

mentioned in commit 77a4d1

_mentioned in commit 77a4d1_

Choose File ... File name...
Cancel