Commit ba74c1448f127649046615ec017bded7b2a76f29

Authored by Thomas Gleixner
Committed by Ingo Molnar
1 parent bd2f55361f

sched/rt: Document scheduler related skip-resched-check sites

Create a distinction between scheduler-related preempt_enable_no_resched()
calls and the nearly one hundred other places in the kernel that do not
want to reschedule, for one reason or another.

This distinction matters for -rt, where the scheduler and the non-scheduler
preempt models (and checks) are different. For upstream it is purely a
documentation change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/n/tip-gs88fvx2mdv5psnzxnv575ke@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>

Showing 5 changed files with 11 additions and 8 deletions
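
The renamed call sites below all follow the same idle-loop shape. As rough
orientation, here is a minimal sketch of that pattern, modelled on the powerpc
and sparc64 hunks in this commit; it is not code from the commit itself, and
do_low_power_wait() is a hypothetical stand-in for the arch-specific
power-save hook. The point is that the preempt count is dropped via the
scheduler-aware variant because the very next step is either cpu_die() or
schedule_preempt_disabled(), so skipping the resched check there is
deliberate.

#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/preempt.h>

static void example_cpu_idle(void)
{
	while (1) {
		/* Poll until work arrives or the CPU is being unplugged. */
		while (!need_resched() && !cpu_is_offline(smp_processor_id()))
			do_low_power_wait();	/* hypothetical arch hook */

		if (cpu_is_offline(smp_processor_id())) {
			/*
			 * Scheduler-related no-resched site: the CPU is about
			 * to die, so re-enable preemption without checking
			 * for a reschedule.
			 */
			sched_preempt_enable_no_resched();
			cpu_die();
		}
		schedule_preempt_disabled();
	}
}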

arch/powerpc/kernel/idle.c
1 /* 1 /*
2 * Idle daemon for PowerPC. Idle daemon will handle any action 2 * Idle daemon for PowerPC. Idle daemon will handle any action
3 * that needs to be taken when the system becomes idle. 3 * that needs to be taken when the system becomes idle.
4 * 4 *
5 * Originally written by Cort Dougan (cort@cs.nmt.edu). 5 * Originally written by Cort Dougan (cort@cs.nmt.edu).
6 * Subsequent 32-bit hacking by Tom Rini, Armin Kuster, 6 * Subsequent 32-bit hacking by Tom Rini, Armin Kuster,
7 * Paul Mackerras and others. 7 * Paul Mackerras and others.
8 * 8 *
9 * iSeries supported added by Mike Corrigan <mikejc@us.ibm.com> 9 * iSeries supported added by Mike Corrigan <mikejc@us.ibm.com>
10 * 10 *
11 * Additional shared processor, SMT, and firmware support 11 * Additional shared processor, SMT, and firmware support
12 * Copyright (c) 2003 Dave Engebretsen <engebret@us.ibm.com> 12 * Copyright (c) 2003 Dave Engebretsen <engebret@us.ibm.com>
13 * 13 *
14 * 32-bit and 64-bit versions merged by Paul Mackerras <paulus@samba.org> 14 * 32-bit and 64-bit versions merged by Paul Mackerras <paulus@samba.org>
15 * 15 *
16 * This program is free software; you can redistribute it and/or 16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License 17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version 18 * as published by the Free Software Foundation; either version
19 * 2 of the License, or (at your option) any later version. 19 * 2 of the License, or (at your option) any later version.
20 */ 20 */
21 21
22 #include <linux/sched.h> 22 #include <linux/sched.h>
23 #include <linux/kernel.h> 23 #include <linux/kernel.h>
24 #include <linux/smp.h> 24 #include <linux/smp.h>
25 #include <linux/cpu.h> 25 #include <linux/cpu.h>
26 #include <linux/sysctl.h> 26 #include <linux/sysctl.h>
27 #include <linux/tick.h> 27 #include <linux/tick.h>
28 28
29 #include <asm/system.h> 29 #include <asm/system.h>
30 #include <asm/processor.h> 30 #include <asm/processor.h>
31 #include <asm/cputable.h> 31 #include <asm/cputable.h>
32 #include <asm/time.h> 32 #include <asm/time.h>
33 #include <asm/machdep.h> 33 #include <asm/machdep.h>
34 #include <asm/smp.h> 34 #include <asm/smp.h>
35 35
36 #ifdef CONFIG_HOTPLUG_CPU 36 #ifdef CONFIG_HOTPLUG_CPU
37 #define cpu_should_die() cpu_is_offline(smp_processor_id()) 37 #define cpu_should_die() cpu_is_offline(smp_processor_id())
38 #else 38 #else
39 #define cpu_should_die() 0 39 #define cpu_should_die() 0
40 #endif 40 #endif
41 41
42 unsigned long cpuidle_disable = IDLE_NO_OVERRIDE; 42 unsigned long cpuidle_disable = IDLE_NO_OVERRIDE;
43 EXPORT_SYMBOL(cpuidle_disable); 43 EXPORT_SYMBOL(cpuidle_disable);
44 44
45 static int __init powersave_off(char *arg) 45 static int __init powersave_off(char *arg)
46 { 46 {
47 ppc_md.power_save = NULL; 47 ppc_md.power_save = NULL;
48 cpuidle_disable = IDLE_POWERSAVE_OFF; 48 cpuidle_disable = IDLE_POWERSAVE_OFF;
49 return 0; 49 return 0;
50 } 50 }
51 __setup("powersave=off", powersave_off); 51 __setup("powersave=off", powersave_off);
52 52
53 /* 53 /*
54 * The body of the idle task. 54 * The body of the idle task.
55 */ 55 */
56 void cpu_idle(void) 56 void cpu_idle(void)
57 { 57 {
58 if (ppc_md.idle_loop) 58 if (ppc_md.idle_loop)
59 ppc_md.idle_loop(); /* doesn't return */ 59 ppc_md.idle_loop(); /* doesn't return */
60 60
61 set_thread_flag(TIF_POLLING_NRFLAG); 61 set_thread_flag(TIF_POLLING_NRFLAG);
62 while (1) { 62 while (1) {
63 tick_nohz_idle_enter(); 63 tick_nohz_idle_enter();
64 rcu_idle_enter(); 64 rcu_idle_enter();
65 65
66 while (!need_resched() && !cpu_should_die()) { 66 while (!need_resched() && !cpu_should_die()) {
67 ppc64_runlatch_off(); 67 ppc64_runlatch_off();
68 68
69 if (ppc_md.power_save) { 69 if (ppc_md.power_save) {
70 clear_thread_flag(TIF_POLLING_NRFLAG); 70 clear_thread_flag(TIF_POLLING_NRFLAG);
71 /* 71 /*
72 * smp_mb is so clearing of TIF_POLLING_NRFLAG 72 * smp_mb is so clearing of TIF_POLLING_NRFLAG
73 * is ordered w.r.t. need_resched() test. 73 * is ordered w.r.t. need_resched() test.
74 */ 74 */
75 smp_mb(); 75 smp_mb();
76 local_irq_disable(); 76 local_irq_disable();
77 77
78 /* Don't trace irqs off for idle */ 78 /* Don't trace irqs off for idle */
79 stop_critical_timings(); 79 stop_critical_timings();
80 80
81 /* check again after disabling irqs */ 81 /* check again after disabling irqs */
82 if (!need_resched() && !cpu_should_die()) 82 if (!need_resched() && !cpu_should_die())
83 ppc_md.power_save(); 83 ppc_md.power_save();
84 84
85 start_critical_timings(); 85 start_critical_timings();
86 86
87 local_irq_enable(); 87 local_irq_enable();
88 set_thread_flag(TIF_POLLING_NRFLAG); 88 set_thread_flag(TIF_POLLING_NRFLAG);
89 89
90 } else { 90 } else {
91 /* 91 /*
92 * Go into low thread priority and possibly 92 * Go into low thread priority and possibly
93 * low power mode. 93 * low power mode.
94 */ 94 */
95 HMT_low(); 95 HMT_low();
96 HMT_very_low(); 96 HMT_very_low();
97 } 97 }
98 } 98 }
99 99
100 HMT_medium(); 100 HMT_medium();
101 ppc64_runlatch_on(); 101 ppc64_runlatch_on();
102 rcu_idle_exit(); 102 rcu_idle_exit();
103 tick_nohz_idle_exit(); 103 tick_nohz_idle_exit();
104 if (cpu_should_die()) { 104 if (cpu_should_die()) {
105 preempt_enable_no_resched(); 105 sched_preempt_enable_no_resched();
106 cpu_die(); 106 cpu_die();
107 } 107 }
108 schedule_preempt_disabled(); 108 schedule_preempt_disabled();
109 } 109 }
110 } 110 }
111 111
112 112
113 /* 113 /*
114 * cpu_idle_wait - Used to ensure that all the CPUs come out of the old 114 * cpu_idle_wait - Used to ensure that all the CPUs come out of the old
115 * idle loop and start using the new idle loop. 115 * idle loop and start using the new idle loop.
116 * Required while changing idle handler on SMP systems. 116 * Required while changing idle handler on SMP systems.
117 * Caller must have changed idle handler to the new value before the call. 117 * Caller must have changed idle handler to the new value before the call.
118 * This window may be larger on shared systems. 118 * This window may be larger on shared systems.
119 */ 119 */
120 void cpu_idle_wait(void) 120 void cpu_idle_wait(void)
121 { 121 {
122 int cpu; 122 int cpu;
123 smp_mb(); 123 smp_mb();
124 124
125 /* kick all the CPUs so that they exit out of old idle routine */ 125 /* kick all the CPUs so that they exit out of old idle routine */
126 get_online_cpus(); 126 get_online_cpus();
127 for_each_online_cpu(cpu) { 127 for_each_online_cpu(cpu) {
128 if (cpu != smp_processor_id()) 128 if (cpu != smp_processor_id())
129 smp_send_reschedule(cpu); 129 smp_send_reschedule(cpu);
130 } 130 }
131 put_online_cpus(); 131 put_online_cpus();
132 } 132 }
133 EXPORT_SYMBOL_GPL(cpu_idle_wait); 133 EXPORT_SYMBOL_GPL(cpu_idle_wait);
134 134
135 int powersave_nap; 135 int powersave_nap;
136 136
137 #ifdef CONFIG_SYSCTL 137 #ifdef CONFIG_SYSCTL
138 /* 138 /*
139 * Register the sysctl to set/clear powersave_nap. 139 * Register the sysctl to set/clear powersave_nap.
140 */ 140 */
141 static ctl_table powersave_nap_ctl_table[]={ 141 static ctl_table powersave_nap_ctl_table[]={
142 { 142 {
143 .procname = "powersave-nap", 143 .procname = "powersave-nap",
144 .data = &powersave_nap, 144 .data = &powersave_nap,
145 .maxlen = sizeof(int), 145 .maxlen = sizeof(int),
146 .mode = 0644, 146 .mode = 0644,
147 .proc_handler = proc_dointvec, 147 .proc_handler = proc_dointvec,
148 }, 148 },
149 {} 149 {}
150 }; 150 };
151 static ctl_table powersave_nap_sysctl_root[] = { 151 static ctl_table powersave_nap_sysctl_root[] = {
152 { 152 {
153 .procname = "kernel", 153 .procname = "kernel",
154 .mode = 0555, 154 .mode = 0555,
155 .child = powersave_nap_ctl_table, 155 .child = powersave_nap_ctl_table,
156 }, 156 },
157 {} 157 {}
158 }; 158 };
159 159
160 static int __init 160 static int __init
161 register_powersave_nap_sysctl(void) 161 register_powersave_nap_sysctl(void)
162 { 162 {
163 register_sysctl_table(powersave_nap_sysctl_root); 163 register_sysctl_table(powersave_nap_sysctl_root);
164 164
165 return 0; 165 return 0;
166 } 166 }
167 __initcall(register_powersave_nap_sysctl); 167 __initcall(register_powersave_nap_sysctl);
168 #endif 168 #endif
169 169
arch/sparc/kernel/process_64.c
1 /* arch/sparc64/kernel/process.c 1 /* arch/sparc64/kernel/process.c
2 * 2 *
3 * Copyright (C) 1995, 1996, 2008 David S. Miller (davem@davemloft.net) 3 * Copyright (C) 1995, 1996, 2008 David S. Miller (davem@davemloft.net)
4 * Copyright (C) 1996 Eddie C. Dost (ecd@skynet.be) 4 * Copyright (C) 1996 Eddie C. Dost (ecd@skynet.be)
5 * Copyright (C) 1997, 1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) 5 * Copyright (C) 1997, 1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
6 */ 6 */
7 7
8 /* 8 /*
9 * This file handles the architecture-dependent parts of process handling.. 9 * This file handles the architecture-dependent parts of process handling..
10 */ 10 */
11 11
12 #include <stdarg.h> 12 #include <stdarg.h>
13 13
14 #include <linux/errno.h> 14 #include <linux/errno.h>
15 #include <linux/export.h> 15 #include <linux/export.h>
16 #include <linux/sched.h> 16 #include <linux/sched.h>
17 #include <linux/kernel.h> 17 #include <linux/kernel.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/fs.h> 19 #include <linux/fs.h>
20 #include <linux/smp.h> 20 #include <linux/smp.h>
21 #include <linux/stddef.h> 21 #include <linux/stddef.h>
22 #include <linux/ptrace.h> 22 #include <linux/ptrace.h>
23 #include <linux/slab.h> 23 #include <linux/slab.h>
24 #include <linux/user.h> 24 #include <linux/user.h>
25 #include <linux/delay.h> 25 #include <linux/delay.h>
26 #include <linux/compat.h> 26 #include <linux/compat.h>
27 #include <linux/tick.h> 27 #include <linux/tick.h>
28 #include <linux/init.h> 28 #include <linux/init.h>
29 #include <linux/cpu.h> 29 #include <linux/cpu.h>
30 #include <linux/elfcore.h> 30 #include <linux/elfcore.h>
31 #include <linux/sysrq.h> 31 #include <linux/sysrq.h>
32 #include <linux/nmi.h> 32 #include <linux/nmi.h>
33 33
34 #include <asm/uaccess.h> 34 #include <asm/uaccess.h>
35 #include <asm/system.h> 35 #include <asm/system.h>
36 #include <asm/page.h> 36 #include <asm/page.h>
37 #include <asm/pgalloc.h> 37 #include <asm/pgalloc.h>
38 #include <asm/pgtable.h> 38 #include <asm/pgtable.h>
39 #include <asm/processor.h> 39 #include <asm/processor.h>
40 #include <asm/pstate.h> 40 #include <asm/pstate.h>
41 #include <asm/elf.h> 41 #include <asm/elf.h>
42 #include <asm/fpumacro.h> 42 #include <asm/fpumacro.h>
43 #include <asm/head.h> 43 #include <asm/head.h>
44 #include <asm/cpudata.h> 44 #include <asm/cpudata.h>
45 #include <asm/mmu_context.h> 45 #include <asm/mmu_context.h>
46 #include <asm/unistd.h> 46 #include <asm/unistd.h>
47 #include <asm/hypervisor.h> 47 #include <asm/hypervisor.h>
48 #include <asm/syscalls.h> 48 #include <asm/syscalls.h>
49 #include <asm/irq_regs.h> 49 #include <asm/irq_regs.h>
50 #include <asm/smp.h> 50 #include <asm/smp.h>
51 51
52 #include "kstack.h" 52 #include "kstack.h"
53 53
54 static void sparc64_yield(int cpu) 54 static void sparc64_yield(int cpu)
55 { 55 {
56 if (tlb_type != hypervisor) { 56 if (tlb_type != hypervisor) {
57 touch_nmi_watchdog(); 57 touch_nmi_watchdog();
58 return; 58 return;
59 } 59 }
60 60
61 clear_thread_flag(TIF_POLLING_NRFLAG); 61 clear_thread_flag(TIF_POLLING_NRFLAG);
62 smp_mb__after_clear_bit(); 62 smp_mb__after_clear_bit();
63 63
64 while (!need_resched() && !cpu_is_offline(cpu)) { 64 while (!need_resched() && !cpu_is_offline(cpu)) {
65 unsigned long pstate; 65 unsigned long pstate;
66 66
67 /* Disable interrupts. */ 67 /* Disable interrupts. */
68 __asm__ __volatile__( 68 __asm__ __volatile__(
69 "rdpr %%pstate, %0\n\t" 69 "rdpr %%pstate, %0\n\t"
70 "andn %0, %1, %0\n\t" 70 "andn %0, %1, %0\n\t"
71 "wrpr %0, %%g0, %%pstate" 71 "wrpr %0, %%g0, %%pstate"
72 : "=&r" (pstate) 72 : "=&r" (pstate)
73 : "i" (PSTATE_IE)); 73 : "i" (PSTATE_IE));
74 74
75 if (!need_resched() && !cpu_is_offline(cpu)) 75 if (!need_resched() && !cpu_is_offline(cpu))
76 sun4v_cpu_yield(); 76 sun4v_cpu_yield();
77 77
78 /* Re-enable interrupts. */ 78 /* Re-enable interrupts. */
79 __asm__ __volatile__( 79 __asm__ __volatile__(
80 "rdpr %%pstate, %0\n\t" 80 "rdpr %%pstate, %0\n\t"
81 "or %0, %1, %0\n\t" 81 "or %0, %1, %0\n\t"
82 "wrpr %0, %%g0, %%pstate" 82 "wrpr %0, %%g0, %%pstate"
83 : "=&r" (pstate) 83 : "=&r" (pstate)
84 : "i" (PSTATE_IE)); 84 : "i" (PSTATE_IE));
85 } 85 }
86 86
87 set_thread_flag(TIF_POLLING_NRFLAG); 87 set_thread_flag(TIF_POLLING_NRFLAG);
88 } 88 }
89 89
90 /* The idle loop on sparc64. */ 90 /* The idle loop on sparc64. */
91 void cpu_idle(void) 91 void cpu_idle(void)
92 { 92 {
93 int cpu = smp_processor_id(); 93 int cpu = smp_processor_id();
94 94
95 set_thread_flag(TIF_POLLING_NRFLAG); 95 set_thread_flag(TIF_POLLING_NRFLAG);
96 96
97 while(1) { 97 while(1) {
98 tick_nohz_idle_enter(); 98 tick_nohz_idle_enter();
99 rcu_idle_enter(); 99 rcu_idle_enter();
100 100
101 while (!need_resched() && !cpu_is_offline(cpu)) 101 while (!need_resched() && !cpu_is_offline(cpu))
102 sparc64_yield(cpu); 102 sparc64_yield(cpu);
103 103
104 rcu_idle_exit(); 104 rcu_idle_exit();
105 tick_nohz_idle_exit(); 105 tick_nohz_idle_exit();
106 106
107 #ifdef CONFIG_HOTPLUG_CPU 107 #ifdef CONFIG_HOTPLUG_CPU
108 if (cpu_is_offline(cpu)) { 108 if (cpu_is_offline(cpu)) {
109 preempt_enable_no_resched(); 109 sched_preempt_enable_no_resched();
110 cpu_play_dead(); 110 cpu_play_dead();
111 } 111 }
112 #endif 112 #endif
113 schedule_preempt_disabled(); 113 schedule_preempt_disabled();
114 } 114 }
115 } 115 }
116 116
117 #ifdef CONFIG_COMPAT 117 #ifdef CONFIG_COMPAT
118 static void show_regwindow32(struct pt_regs *regs) 118 static void show_regwindow32(struct pt_regs *regs)
119 { 119 {
120 struct reg_window32 __user *rw; 120 struct reg_window32 __user *rw;
121 struct reg_window32 r_w; 121 struct reg_window32 r_w;
122 mm_segment_t old_fs; 122 mm_segment_t old_fs;
123 123
124 __asm__ __volatile__ ("flushw"); 124 __asm__ __volatile__ ("flushw");
125 rw = compat_ptr((unsigned)regs->u_regs[14]); 125 rw = compat_ptr((unsigned)regs->u_regs[14]);
126 old_fs = get_fs(); 126 old_fs = get_fs();
127 set_fs (USER_DS); 127 set_fs (USER_DS);
128 if (copy_from_user (&r_w, rw, sizeof(r_w))) { 128 if (copy_from_user (&r_w, rw, sizeof(r_w))) {
129 set_fs (old_fs); 129 set_fs (old_fs);
130 return; 130 return;
131 } 131 }
132 132
133 set_fs (old_fs); 133 set_fs (old_fs);
134 printk("l0: %08x l1: %08x l2: %08x l3: %08x " 134 printk("l0: %08x l1: %08x l2: %08x l3: %08x "
135 "l4: %08x l5: %08x l6: %08x l7: %08x\n", 135 "l4: %08x l5: %08x l6: %08x l7: %08x\n",
136 r_w.locals[0], r_w.locals[1], r_w.locals[2], r_w.locals[3], 136 r_w.locals[0], r_w.locals[1], r_w.locals[2], r_w.locals[3],
137 r_w.locals[4], r_w.locals[5], r_w.locals[6], r_w.locals[7]); 137 r_w.locals[4], r_w.locals[5], r_w.locals[6], r_w.locals[7]);
138 printk("i0: %08x i1: %08x i2: %08x i3: %08x " 138 printk("i0: %08x i1: %08x i2: %08x i3: %08x "
139 "i4: %08x i5: %08x i6: %08x i7: %08x\n", 139 "i4: %08x i5: %08x i6: %08x i7: %08x\n",
140 r_w.ins[0], r_w.ins[1], r_w.ins[2], r_w.ins[3], 140 r_w.ins[0], r_w.ins[1], r_w.ins[2], r_w.ins[3],
141 r_w.ins[4], r_w.ins[5], r_w.ins[6], r_w.ins[7]); 141 r_w.ins[4], r_w.ins[5], r_w.ins[6], r_w.ins[7]);
142 } 142 }
143 #else 143 #else
144 #define show_regwindow32(regs) do { } while (0) 144 #define show_regwindow32(regs) do { } while (0)
145 #endif 145 #endif
146 146
147 static void show_regwindow(struct pt_regs *regs) 147 static void show_regwindow(struct pt_regs *regs)
148 { 148 {
149 struct reg_window __user *rw; 149 struct reg_window __user *rw;
150 struct reg_window *rwk; 150 struct reg_window *rwk;
151 struct reg_window r_w; 151 struct reg_window r_w;
152 mm_segment_t old_fs; 152 mm_segment_t old_fs;
153 153
154 if ((regs->tstate & TSTATE_PRIV) || !(test_thread_flag(TIF_32BIT))) { 154 if ((regs->tstate & TSTATE_PRIV) || !(test_thread_flag(TIF_32BIT))) {
155 __asm__ __volatile__ ("flushw"); 155 __asm__ __volatile__ ("flushw");
156 rw = (struct reg_window __user *) 156 rw = (struct reg_window __user *)
157 (regs->u_regs[14] + STACK_BIAS); 157 (regs->u_regs[14] + STACK_BIAS);
158 rwk = (struct reg_window *) 158 rwk = (struct reg_window *)
159 (regs->u_regs[14] + STACK_BIAS); 159 (regs->u_regs[14] + STACK_BIAS);
160 if (!(regs->tstate & TSTATE_PRIV)) { 160 if (!(regs->tstate & TSTATE_PRIV)) {
161 old_fs = get_fs(); 161 old_fs = get_fs();
162 set_fs (USER_DS); 162 set_fs (USER_DS);
163 if (copy_from_user (&r_w, rw, sizeof(r_w))) { 163 if (copy_from_user (&r_w, rw, sizeof(r_w))) {
164 set_fs (old_fs); 164 set_fs (old_fs);
165 return; 165 return;
166 } 166 }
167 rwk = &r_w; 167 rwk = &r_w;
168 set_fs (old_fs); 168 set_fs (old_fs);
169 } 169 }
170 } else { 170 } else {
171 show_regwindow32(regs); 171 show_regwindow32(regs);
172 return; 172 return;
173 } 173 }
174 printk("l0: %016lx l1: %016lx l2: %016lx l3: %016lx\n", 174 printk("l0: %016lx l1: %016lx l2: %016lx l3: %016lx\n",
175 rwk->locals[0], rwk->locals[1], rwk->locals[2], rwk->locals[3]); 175 rwk->locals[0], rwk->locals[1], rwk->locals[2], rwk->locals[3]);
176 printk("l4: %016lx l5: %016lx l6: %016lx l7: %016lx\n", 176 printk("l4: %016lx l5: %016lx l6: %016lx l7: %016lx\n",
177 rwk->locals[4], rwk->locals[5], rwk->locals[6], rwk->locals[7]); 177 rwk->locals[4], rwk->locals[5], rwk->locals[6], rwk->locals[7]);
178 printk("i0: %016lx i1: %016lx i2: %016lx i3: %016lx\n", 178 printk("i0: %016lx i1: %016lx i2: %016lx i3: %016lx\n",
179 rwk->ins[0], rwk->ins[1], rwk->ins[2], rwk->ins[3]); 179 rwk->ins[0], rwk->ins[1], rwk->ins[2], rwk->ins[3]);
180 printk("i4: %016lx i5: %016lx i6: %016lx i7: %016lx\n", 180 printk("i4: %016lx i5: %016lx i6: %016lx i7: %016lx\n",
181 rwk->ins[4], rwk->ins[5], rwk->ins[6], rwk->ins[7]); 181 rwk->ins[4], rwk->ins[5], rwk->ins[6], rwk->ins[7]);
182 if (regs->tstate & TSTATE_PRIV) 182 if (regs->tstate & TSTATE_PRIV)
183 printk("I7: <%pS>\n", (void *) rwk->ins[7]); 183 printk("I7: <%pS>\n", (void *) rwk->ins[7]);
184 } 184 }
185 185
186 void show_regs(struct pt_regs *regs) 186 void show_regs(struct pt_regs *regs)
187 { 187 {
188 printk("TSTATE: %016lx TPC: %016lx TNPC: %016lx Y: %08x %s\n", regs->tstate, 188 printk("TSTATE: %016lx TPC: %016lx TNPC: %016lx Y: %08x %s\n", regs->tstate,
189 regs->tpc, regs->tnpc, regs->y, print_tainted()); 189 regs->tpc, regs->tnpc, regs->y, print_tainted());
190 printk("TPC: <%pS>\n", (void *) regs->tpc); 190 printk("TPC: <%pS>\n", (void *) regs->tpc);
191 printk("g0: %016lx g1: %016lx g2: %016lx g3: %016lx\n", 191 printk("g0: %016lx g1: %016lx g2: %016lx g3: %016lx\n",
192 regs->u_regs[0], regs->u_regs[1], regs->u_regs[2], 192 regs->u_regs[0], regs->u_regs[1], regs->u_regs[2],
193 regs->u_regs[3]); 193 regs->u_regs[3]);
194 printk("g4: %016lx g5: %016lx g6: %016lx g7: %016lx\n", 194 printk("g4: %016lx g5: %016lx g6: %016lx g7: %016lx\n",
195 regs->u_regs[4], regs->u_regs[5], regs->u_regs[6], 195 regs->u_regs[4], regs->u_regs[5], regs->u_regs[6],
196 regs->u_regs[7]); 196 regs->u_regs[7]);
197 printk("o0: %016lx o1: %016lx o2: %016lx o3: %016lx\n", 197 printk("o0: %016lx o1: %016lx o2: %016lx o3: %016lx\n",
198 regs->u_regs[8], regs->u_regs[9], regs->u_regs[10], 198 regs->u_regs[8], regs->u_regs[9], regs->u_regs[10],
199 regs->u_regs[11]); 199 regs->u_regs[11]);
200 printk("o4: %016lx o5: %016lx sp: %016lx ret_pc: %016lx\n", 200 printk("o4: %016lx o5: %016lx sp: %016lx ret_pc: %016lx\n",
201 regs->u_regs[12], regs->u_regs[13], regs->u_regs[14], 201 regs->u_regs[12], regs->u_regs[13], regs->u_regs[14],
202 regs->u_regs[15]); 202 regs->u_regs[15]);
203 printk("RPC: <%pS>\n", (void *) regs->u_regs[15]); 203 printk("RPC: <%pS>\n", (void *) regs->u_regs[15]);
204 show_regwindow(regs); 204 show_regwindow(regs);
205 show_stack(current, (unsigned long *) regs->u_regs[UREG_FP]); 205 show_stack(current, (unsigned long *) regs->u_regs[UREG_FP]);
206 } 206 }
207 207
208 struct global_reg_snapshot global_reg_snapshot[NR_CPUS]; 208 struct global_reg_snapshot global_reg_snapshot[NR_CPUS];
209 static DEFINE_SPINLOCK(global_reg_snapshot_lock); 209 static DEFINE_SPINLOCK(global_reg_snapshot_lock);
210 210
211 static void __global_reg_self(struct thread_info *tp, struct pt_regs *regs, 211 static void __global_reg_self(struct thread_info *tp, struct pt_regs *regs,
212 int this_cpu) 212 int this_cpu)
213 { 213 {
214 flushw_all(); 214 flushw_all();
215 215
216 global_reg_snapshot[this_cpu].tstate = regs->tstate; 216 global_reg_snapshot[this_cpu].tstate = regs->tstate;
217 global_reg_snapshot[this_cpu].tpc = regs->tpc; 217 global_reg_snapshot[this_cpu].tpc = regs->tpc;
218 global_reg_snapshot[this_cpu].tnpc = regs->tnpc; 218 global_reg_snapshot[this_cpu].tnpc = regs->tnpc;
219 global_reg_snapshot[this_cpu].o7 = regs->u_regs[UREG_I7]; 219 global_reg_snapshot[this_cpu].o7 = regs->u_regs[UREG_I7];
220 220
221 if (regs->tstate & TSTATE_PRIV) { 221 if (regs->tstate & TSTATE_PRIV) {
222 struct reg_window *rw; 222 struct reg_window *rw;
223 223
224 rw = (struct reg_window *) 224 rw = (struct reg_window *)
225 (regs->u_regs[UREG_FP] + STACK_BIAS); 225 (regs->u_regs[UREG_FP] + STACK_BIAS);
226 if (kstack_valid(tp, (unsigned long) rw)) { 226 if (kstack_valid(tp, (unsigned long) rw)) {
227 global_reg_snapshot[this_cpu].i7 = rw->ins[7]; 227 global_reg_snapshot[this_cpu].i7 = rw->ins[7];
228 rw = (struct reg_window *) 228 rw = (struct reg_window *)
229 (rw->ins[6] + STACK_BIAS); 229 (rw->ins[6] + STACK_BIAS);
230 if (kstack_valid(tp, (unsigned long) rw)) 230 if (kstack_valid(tp, (unsigned long) rw))
231 global_reg_snapshot[this_cpu].rpc = rw->ins[7]; 231 global_reg_snapshot[this_cpu].rpc = rw->ins[7];
232 } 232 }
233 } else { 233 } else {
234 global_reg_snapshot[this_cpu].i7 = 0; 234 global_reg_snapshot[this_cpu].i7 = 0;
235 global_reg_snapshot[this_cpu].rpc = 0; 235 global_reg_snapshot[this_cpu].rpc = 0;
236 } 236 }
237 global_reg_snapshot[this_cpu].thread = tp; 237 global_reg_snapshot[this_cpu].thread = tp;
238 } 238 }
239 239
240 /* In order to avoid hangs we do not try to synchronize with the 240 /* In order to avoid hangs we do not try to synchronize with the
241 * global register dump client cpus. The last store they make is to 241 * global register dump client cpus. The last store they make is to
242 * the thread pointer, so do a short poll waiting for that to become 242 * the thread pointer, so do a short poll waiting for that to become
243 * non-NULL. 243 * non-NULL.
244 */ 244 */
245 static void __global_reg_poll(struct global_reg_snapshot *gp) 245 static void __global_reg_poll(struct global_reg_snapshot *gp)
246 { 246 {
247 int limit = 0; 247 int limit = 0;
248 248
249 while (!gp->thread && ++limit < 100) { 249 while (!gp->thread && ++limit < 100) {
250 barrier(); 250 barrier();
251 udelay(1); 251 udelay(1);
252 } 252 }
253 } 253 }
254 254
255 void arch_trigger_all_cpu_backtrace(void) 255 void arch_trigger_all_cpu_backtrace(void)
256 { 256 {
257 struct thread_info *tp = current_thread_info(); 257 struct thread_info *tp = current_thread_info();
258 struct pt_regs *regs = get_irq_regs(); 258 struct pt_regs *regs = get_irq_regs();
259 unsigned long flags; 259 unsigned long flags;
260 int this_cpu, cpu; 260 int this_cpu, cpu;
261 261
262 if (!regs) 262 if (!regs)
263 regs = tp->kregs; 263 regs = tp->kregs;
264 264
265 spin_lock_irqsave(&global_reg_snapshot_lock, flags); 265 spin_lock_irqsave(&global_reg_snapshot_lock, flags);
266 266
267 memset(global_reg_snapshot, 0, sizeof(global_reg_snapshot)); 267 memset(global_reg_snapshot, 0, sizeof(global_reg_snapshot));
268 268
269 this_cpu = raw_smp_processor_id(); 269 this_cpu = raw_smp_processor_id();
270 270
271 __global_reg_self(tp, regs, this_cpu); 271 __global_reg_self(tp, regs, this_cpu);
272 272
273 smp_fetch_global_regs(); 273 smp_fetch_global_regs();
274 274
275 for_each_online_cpu(cpu) { 275 for_each_online_cpu(cpu) {
276 struct global_reg_snapshot *gp = &global_reg_snapshot[cpu]; 276 struct global_reg_snapshot *gp = &global_reg_snapshot[cpu];
277 277
278 __global_reg_poll(gp); 278 __global_reg_poll(gp);
279 279
280 tp = gp->thread; 280 tp = gp->thread;
281 printk("%c CPU[%3d]: TSTATE[%016lx] TPC[%016lx] TNPC[%016lx] TASK[%s:%d]\n", 281 printk("%c CPU[%3d]: TSTATE[%016lx] TPC[%016lx] TNPC[%016lx] TASK[%s:%d]\n",
282 (cpu == this_cpu ? '*' : ' '), cpu, 282 (cpu == this_cpu ? '*' : ' '), cpu,
283 gp->tstate, gp->tpc, gp->tnpc, 283 gp->tstate, gp->tpc, gp->tnpc,
284 ((tp && tp->task) ? tp->task->comm : "NULL"), 284 ((tp && tp->task) ? tp->task->comm : "NULL"),
285 ((tp && tp->task) ? tp->task->pid : -1)); 285 ((tp && tp->task) ? tp->task->pid : -1));
286 286
287 if (gp->tstate & TSTATE_PRIV) { 287 if (gp->tstate & TSTATE_PRIV) {
288 printk(" TPC[%pS] O7[%pS] I7[%pS] RPC[%pS]\n", 288 printk(" TPC[%pS] O7[%pS] I7[%pS] RPC[%pS]\n",
289 (void *) gp->tpc, 289 (void *) gp->tpc,
290 (void *) gp->o7, 290 (void *) gp->o7,
291 (void *) gp->i7, 291 (void *) gp->i7,
292 (void *) gp->rpc); 292 (void *) gp->rpc);
293 } else { 293 } else {
294 printk(" TPC[%lx] O7[%lx] I7[%lx] RPC[%lx]\n", 294 printk(" TPC[%lx] O7[%lx] I7[%lx] RPC[%lx]\n",
295 gp->tpc, gp->o7, gp->i7, gp->rpc); 295 gp->tpc, gp->o7, gp->i7, gp->rpc);
296 } 296 }
297 } 297 }
298 298
299 memset(global_reg_snapshot, 0, sizeof(global_reg_snapshot)); 299 memset(global_reg_snapshot, 0, sizeof(global_reg_snapshot));
300 300
301 spin_unlock_irqrestore(&global_reg_snapshot_lock, flags); 301 spin_unlock_irqrestore(&global_reg_snapshot_lock, flags);
302 } 302 }
303 303
304 #ifdef CONFIG_MAGIC_SYSRQ 304 #ifdef CONFIG_MAGIC_SYSRQ
305 305
306 static void sysrq_handle_globreg(int key) 306 static void sysrq_handle_globreg(int key)
307 { 307 {
308 arch_trigger_all_cpu_backtrace(); 308 arch_trigger_all_cpu_backtrace();
309 } 309 }
310 310
311 static struct sysrq_key_op sparc_globalreg_op = { 311 static struct sysrq_key_op sparc_globalreg_op = {
312 .handler = sysrq_handle_globreg, 312 .handler = sysrq_handle_globreg,
313 .help_msg = "Globalregs", 313 .help_msg = "Globalregs",
314 .action_msg = "Show Global CPU Regs", 314 .action_msg = "Show Global CPU Regs",
315 }; 315 };
316 316
317 static int __init sparc_globreg_init(void) 317 static int __init sparc_globreg_init(void)
318 { 318 {
319 return register_sysrq_key('y', &sparc_globalreg_op); 319 return register_sysrq_key('y', &sparc_globalreg_op);
320 } 320 }
321 321
322 core_initcall(sparc_globreg_init); 322 core_initcall(sparc_globreg_init);
323 323
324 #endif 324 #endif
325 325
326 unsigned long thread_saved_pc(struct task_struct *tsk) 326 unsigned long thread_saved_pc(struct task_struct *tsk)
327 { 327 {
328 struct thread_info *ti = task_thread_info(tsk); 328 struct thread_info *ti = task_thread_info(tsk);
329 unsigned long ret = 0xdeadbeefUL; 329 unsigned long ret = 0xdeadbeefUL;
330 330
331 if (ti && ti->ksp) { 331 if (ti && ti->ksp) {
332 unsigned long *sp; 332 unsigned long *sp;
333 sp = (unsigned long *)(ti->ksp + STACK_BIAS); 333 sp = (unsigned long *)(ti->ksp + STACK_BIAS);
334 if (((unsigned long)sp & (sizeof(long) - 1)) == 0UL && 334 if (((unsigned long)sp & (sizeof(long) - 1)) == 0UL &&
335 sp[14]) { 335 sp[14]) {
336 unsigned long *fp; 336 unsigned long *fp;
337 fp = (unsigned long *)(sp[14] + STACK_BIAS); 337 fp = (unsigned long *)(sp[14] + STACK_BIAS);
338 if (((unsigned long)fp & (sizeof(long) - 1)) == 0UL) 338 if (((unsigned long)fp & (sizeof(long) - 1)) == 0UL)
339 ret = fp[15]; 339 ret = fp[15];
340 } 340 }
341 } 341 }
342 return ret; 342 return ret;
343 } 343 }
344 344
345 /* Free current thread data structures etc.. */ 345 /* Free current thread data structures etc.. */
346 void exit_thread(void) 346 void exit_thread(void)
347 { 347 {
348 struct thread_info *t = current_thread_info(); 348 struct thread_info *t = current_thread_info();
349 349
350 if (t->utraps) { 350 if (t->utraps) {
351 if (t->utraps[0] < 2) 351 if (t->utraps[0] < 2)
352 kfree (t->utraps); 352 kfree (t->utraps);
353 else 353 else
354 t->utraps[0]--; 354 t->utraps[0]--;
355 } 355 }
356 } 356 }
357 357
358 void flush_thread(void) 358 void flush_thread(void)
359 { 359 {
360 struct thread_info *t = current_thread_info(); 360 struct thread_info *t = current_thread_info();
361 struct mm_struct *mm; 361 struct mm_struct *mm;
362 362
363 mm = t->task->mm; 363 mm = t->task->mm;
364 if (mm) 364 if (mm)
365 tsb_context_switch(mm); 365 tsb_context_switch(mm);
366 366
367 set_thread_wsaved(0); 367 set_thread_wsaved(0);
368 368
369 /* Clear FPU register state. */ 369 /* Clear FPU register state. */
370 t->fpsaved[0] = 0; 370 t->fpsaved[0] = 0;
371 } 371 }
372 372
373 /* It's a bit more tricky when 64-bit tasks are involved... */ 373 /* It's a bit more tricky when 64-bit tasks are involved... */
374 static unsigned long clone_stackframe(unsigned long csp, unsigned long psp) 374 static unsigned long clone_stackframe(unsigned long csp, unsigned long psp)
375 { 375 {
376 unsigned long fp, distance, rval; 376 unsigned long fp, distance, rval;
377 377
378 if (!(test_thread_flag(TIF_32BIT))) { 378 if (!(test_thread_flag(TIF_32BIT))) {
379 csp += STACK_BIAS; 379 csp += STACK_BIAS;
380 psp += STACK_BIAS; 380 psp += STACK_BIAS;
381 __get_user(fp, &(((struct reg_window __user *)psp)->ins[6])); 381 __get_user(fp, &(((struct reg_window __user *)psp)->ins[6]));
382 fp += STACK_BIAS; 382 fp += STACK_BIAS;
383 } else 383 } else
384 __get_user(fp, &(((struct reg_window32 __user *)psp)->ins[6])); 384 __get_user(fp, &(((struct reg_window32 __user *)psp)->ins[6]));
385 385
386 /* Now align the stack as this is mandatory in the Sparc ABI 386 /* Now align the stack as this is mandatory in the Sparc ABI
387 * due to how register windows work. This hides the 387 * due to how register windows work. This hides the
388 * restriction from thread libraries etc. 388 * restriction from thread libraries etc.
389 */ 389 */
390 csp &= ~15UL; 390 csp &= ~15UL;
391 391
392 distance = fp - psp; 392 distance = fp - psp;
393 rval = (csp - distance); 393 rval = (csp - distance);
394 if (copy_in_user((void __user *) rval, (void __user *) psp, distance)) 394 if (copy_in_user((void __user *) rval, (void __user *) psp, distance))
395 rval = 0; 395 rval = 0;
396 else if (test_thread_flag(TIF_32BIT)) { 396 else if (test_thread_flag(TIF_32BIT)) {
397 if (put_user(((u32)csp), 397 if (put_user(((u32)csp),
398 &(((struct reg_window32 __user *)rval)->ins[6]))) 398 &(((struct reg_window32 __user *)rval)->ins[6])))
399 rval = 0; 399 rval = 0;
400 } else { 400 } else {
401 if (put_user(((u64)csp - STACK_BIAS), 401 if (put_user(((u64)csp - STACK_BIAS),
402 &(((struct reg_window __user *)rval)->ins[6]))) 402 &(((struct reg_window __user *)rval)->ins[6])))
403 rval = 0; 403 rval = 0;
404 else 404 else
405 rval = rval - STACK_BIAS; 405 rval = rval - STACK_BIAS;
406 } 406 }
407 407
408 return rval; 408 return rval;
409 } 409 }
410 410
411 /* Standard stuff. */ 411 /* Standard stuff. */
412 static inline void shift_window_buffer(int first_win, int last_win, 412 static inline void shift_window_buffer(int first_win, int last_win,
413 struct thread_info *t) 413 struct thread_info *t)
414 { 414 {
415 int i; 415 int i;
416 416
417 for (i = first_win; i < last_win; i++) { 417 for (i = first_win; i < last_win; i++) {
418 t->rwbuf_stkptrs[i] = t->rwbuf_stkptrs[i+1]; 418 t->rwbuf_stkptrs[i] = t->rwbuf_stkptrs[i+1];
419 memcpy(&t->reg_window[i], &t->reg_window[i+1], 419 memcpy(&t->reg_window[i], &t->reg_window[i+1],
420 sizeof(struct reg_window)); 420 sizeof(struct reg_window));
421 } 421 }
422 } 422 }
423 423
424 void synchronize_user_stack(void) 424 void synchronize_user_stack(void)
425 { 425 {
426 struct thread_info *t = current_thread_info(); 426 struct thread_info *t = current_thread_info();
427 unsigned long window; 427 unsigned long window;
428 428
429 flush_user_windows(); 429 flush_user_windows();
430 if ((window = get_thread_wsaved()) != 0) { 430 if ((window = get_thread_wsaved()) != 0) {
431 int winsize = sizeof(struct reg_window); 431 int winsize = sizeof(struct reg_window);
432 int bias = 0; 432 int bias = 0;
433 433
434 if (test_thread_flag(TIF_32BIT)) 434 if (test_thread_flag(TIF_32BIT))
435 winsize = sizeof(struct reg_window32); 435 winsize = sizeof(struct reg_window32);
436 else 436 else
437 bias = STACK_BIAS; 437 bias = STACK_BIAS;
438 438
439 window -= 1; 439 window -= 1;
440 do { 440 do {
441 unsigned long sp = (t->rwbuf_stkptrs[window] + bias); 441 unsigned long sp = (t->rwbuf_stkptrs[window] + bias);
442 struct reg_window *rwin = &t->reg_window[window]; 442 struct reg_window *rwin = &t->reg_window[window];
443 443
444 if (!copy_to_user((char __user *)sp, rwin, winsize)) { 444 if (!copy_to_user((char __user *)sp, rwin, winsize)) {
445 shift_window_buffer(window, get_thread_wsaved() - 1, t); 445 shift_window_buffer(window, get_thread_wsaved() - 1, t);
446 set_thread_wsaved(get_thread_wsaved() - 1); 446 set_thread_wsaved(get_thread_wsaved() - 1);
447 } 447 }
448 } while (window--); 448 } while (window--);
449 } 449 }
450 } 450 }
451 451
452 static void stack_unaligned(unsigned long sp) 452 static void stack_unaligned(unsigned long sp)
453 { 453 {
454 siginfo_t info; 454 siginfo_t info;
455 455
456 info.si_signo = SIGBUS; 456 info.si_signo = SIGBUS;
457 info.si_errno = 0; 457 info.si_errno = 0;
458 info.si_code = BUS_ADRALN; 458 info.si_code = BUS_ADRALN;
459 info.si_addr = (void __user *) sp; 459 info.si_addr = (void __user *) sp;
460 info.si_trapno = 0; 460 info.si_trapno = 0;
461 force_sig_info(SIGBUS, &info, current); 461 force_sig_info(SIGBUS, &info, current);
462 } 462 }
463 463
464 void fault_in_user_windows(void) 464 void fault_in_user_windows(void)
465 { 465 {
466 struct thread_info *t = current_thread_info(); 466 struct thread_info *t = current_thread_info();
467 unsigned long window; 467 unsigned long window;
468 int winsize = sizeof(struct reg_window); 468 int winsize = sizeof(struct reg_window);
469 int bias = 0; 469 int bias = 0;
470 470
471 if (test_thread_flag(TIF_32BIT)) 471 if (test_thread_flag(TIF_32BIT))
472 winsize = sizeof(struct reg_window32); 472 winsize = sizeof(struct reg_window32);
473 else 473 else
474 bias = STACK_BIAS; 474 bias = STACK_BIAS;
475 475
476 flush_user_windows(); 476 flush_user_windows();
477 window = get_thread_wsaved(); 477 window = get_thread_wsaved();
478 478
479 if (likely(window != 0)) { 479 if (likely(window != 0)) {
480 window -= 1; 480 window -= 1;
481 do { 481 do {
482 unsigned long sp = (t->rwbuf_stkptrs[window] + bias); 482 unsigned long sp = (t->rwbuf_stkptrs[window] + bias);
483 struct reg_window *rwin = &t->reg_window[window]; 483 struct reg_window *rwin = &t->reg_window[window];
484 484
485 if (unlikely(sp & 0x7UL)) 485 if (unlikely(sp & 0x7UL))
486 stack_unaligned(sp); 486 stack_unaligned(sp);
487 487
488 if (unlikely(copy_to_user((char __user *)sp, 488 if (unlikely(copy_to_user((char __user *)sp,
489 rwin, winsize))) 489 rwin, winsize)))
490 goto barf; 490 goto barf;
491 } while (window--); 491 } while (window--);
492 } 492 }
493 set_thread_wsaved(0); 493 set_thread_wsaved(0);
494 return; 494 return;
495 495
496 barf: 496 barf:
497 set_thread_wsaved(window + 1); 497 set_thread_wsaved(window + 1);
498 do_exit(SIGILL); 498 do_exit(SIGILL);
499 } 499 }
500 500
501 asmlinkage long sparc_do_fork(unsigned long clone_flags, 501 asmlinkage long sparc_do_fork(unsigned long clone_flags,
502 unsigned long stack_start, 502 unsigned long stack_start,
503 struct pt_regs *regs, 503 struct pt_regs *regs,
504 unsigned long stack_size) 504 unsigned long stack_size)
505 { 505 {
506 int __user *parent_tid_ptr, *child_tid_ptr; 506 int __user *parent_tid_ptr, *child_tid_ptr;
507 unsigned long orig_i1 = regs->u_regs[UREG_I1]; 507 unsigned long orig_i1 = regs->u_regs[UREG_I1];
508 long ret; 508 long ret;
509 509
510 #ifdef CONFIG_COMPAT 510 #ifdef CONFIG_COMPAT
511 if (test_thread_flag(TIF_32BIT)) { 511 if (test_thread_flag(TIF_32BIT)) {
512 parent_tid_ptr = compat_ptr(regs->u_regs[UREG_I2]); 512 parent_tid_ptr = compat_ptr(regs->u_regs[UREG_I2]);
513 child_tid_ptr = compat_ptr(regs->u_regs[UREG_I4]); 513 child_tid_ptr = compat_ptr(regs->u_regs[UREG_I4]);
514 } else 514 } else
515 #endif 515 #endif
516 { 516 {
517 parent_tid_ptr = (int __user *) regs->u_regs[UREG_I2]; 517 parent_tid_ptr = (int __user *) regs->u_regs[UREG_I2];
518 child_tid_ptr = (int __user *) regs->u_regs[UREG_I4]; 518 child_tid_ptr = (int __user *) regs->u_regs[UREG_I4];
519 } 519 }
520 520
521 ret = do_fork(clone_flags, stack_start, 521 ret = do_fork(clone_flags, stack_start,
522 regs, stack_size, 522 regs, stack_size,
523 parent_tid_ptr, child_tid_ptr); 523 parent_tid_ptr, child_tid_ptr);
524 524
525 /* If we get an error and potentially restart the system 525 /* If we get an error and potentially restart the system
526 * call, we're screwed because copy_thread() clobbered 526 * call, we're screwed because copy_thread() clobbered
527 * the parent's %o1. So detect that case and restore it 527 * the parent's %o1. So detect that case and restore it
528 * here. 528 * here.
529 */ 529 */
530 if ((unsigned long)ret >= -ERESTART_RESTARTBLOCK) 530 if ((unsigned long)ret >= -ERESTART_RESTARTBLOCK)
531 regs->u_regs[UREG_I1] = orig_i1; 531 regs->u_regs[UREG_I1] = orig_i1;
532 532
533 return ret; 533 return ret;
534 } 534 }
535 535
536 /* Copy a Sparc thread. The fork() return value conventions 536 /* Copy a Sparc thread. The fork() return value conventions
537 * under SunOS are nothing short of bletcherous: 537 * under SunOS are nothing short of bletcherous:
538 * Parent --> %o0 == childs pid, %o1 == 0 538 * Parent --> %o0 == childs pid, %o1 == 0
539 * Child --> %o0 == parents pid, %o1 == 1 539 * Child --> %o0 == parents pid, %o1 == 1
540 */ 540 */
541 int copy_thread(unsigned long clone_flags, unsigned long sp, 541 int copy_thread(unsigned long clone_flags, unsigned long sp,
542 unsigned long unused, 542 unsigned long unused,
543 struct task_struct *p, struct pt_regs *regs) 543 struct task_struct *p, struct pt_regs *regs)
544 { 544 {
545 struct thread_info *t = task_thread_info(p); 545 struct thread_info *t = task_thread_info(p);
546 struct sparc_stackf *parent_sf; 546 struct sparc_stackf *parent_sf;
547 unsigned long child_stack_sz; 547 unsigned long child_stack_sz;
548 char *child_trap_frame; 548 char *child_trap_frame;
549 int kernel_thread; 549 int kernel_thread;
550 550
551 kernel_thread = (regs->tstate & TSTATE_PRIV) ? 1 : 0; 551 kernel_thread = (regs->tstate & TSTATE_PRIV) ? 1 : 0;
552 parent_sf = ((struct sparc_stackf *) regs) - 1; 552 parent_sf = ((struct sparc_stackf *) regs) - 1;
553 553
554 /* Calculate offset to stack_frame & pt_regs */ 554 /* Calculate offset to stack_frame & pt_regs */
555 child_stack_sz = ((STACKFRAME_SZ + TRACEREG_SZ) + 555 child_stack_sz = ((STACKFRAME_SZ + TRACEREG_SZ) +
556 (kernel_thread ? STACKFRAME_SZ : 0)); 556 (kernel_thread ? STACKFRAME_SZ : 0));
557 child_trap_frame = (task_stack_page(p) + 557 child_trap_frame = (task_stack_page(p) +
558 (THREAD_SIZE - child_stack_sz)); 558 (THREAD_SIZE - child_stack_sz));
559 memcpy(child_trap_frame, parent_sf, child_stack_sz); 559 memcpy(child_trap_frame, parent_sf, child_stack_sz);
560 560
561 t->flags = (t->flags & ~((0xffUL << TI_FLAG_CWP_SHIFT) | 561 t->flags = (t->flags & ~((0xffUL << TI_FLAG_CWP_SHIFT) |
562 (0xffUL << TI_FLAG_CURRENT_DS_SHIFT))) | 562 (0xffUL << TI_FLAG_CURRENT_DS_SHIFT))) |
563 (((regs->tstate + 1) & TSTATE_CWP) << TI_FLAG_CWP_SHIFT); 563 (((regs->tstate + 1) & TSTATE_CWP) << TI_FLAG_CWP_SHIFT);
564 t->new_child = 1; 564 t->new_child = 1;
565 t->ksp = ((unsigned long) child_trap_frame) - STACK_BIAS; 565 t->ksp = ((unsigned long) child_trap_frame) - STACK_BIAS;
566 t->kregs = (struct pt_regs *) (child_trap_frame + 566 t->kregs = (struct pt_regs *) (child_trap_frame +
567 sizeof(struct sparc_stackf)); 567 sizeof(struct sparc_stackf));
568 t->fpsaved[0] = 0; 568 t->fpsaved[0] = 0;
569 569
570 if (kernel_thread) { 570 if (kernel_thread) {
571 struct sparc_stackf *child_sf = (struct sparc_stackf *) 571 struct sparc_stackf *child_sf = (struct sparc_stackf *)
572 (child_trap_frame + (STACKFRAME_SZ + TRACEREG_SZ)); 572 (child_trap_frame + (STACKFRAME_SZ + TRACEREG_SZ));
573 573
574 /* Zero terminate the stack backtrace. */ 574 /* Zero terminate the stack backtrace. */
575 child_sf->fp = NULL; 575 child_sf->fp = NULL;
576 t->kregs->u_regs[UREG_FP] = 576 t->kregs->u_regs[UREG_FP] =
577 ((unsigned long) child_sf) - STACK_BIAS; 577 ((unsigned long) child_sf) - STACK_BIAS;
578 578
579 t->flags |= ((long)ASI_P << TI_FLAG_CURRENT_DS_SHIFT); 579 t->flags |= ((long)ASI_P << TI_FLAG_CURRENT_DS_SHIFT);
580 t->kregs->u_regs[UREG_G6] = (unsigned long) t; 580 t->kregs->u_regs[UREG_G6] = (unsigned long) t;
581 t->kregs->u_regs[UREG_G4] = (unsigned long) t->task; 581 t->kregs->u_regs[UREG_G4] = (unsigned long) t->task;
582 } else { 582 } else {
583 if (t->flags & _TIF_32BIT) { 583 if (t->flags & _TIF_32BIT) {
584 sp &= 0x00000000ffffffffUL; 584 sp &= 0x00000000ffffffffUL;
585 regs->u_regs[UREG_FP] &= 0x00000000ffffffffUL; 585 regs->u_regs[UREG_FP] &= 0x00000000ffffffffUL;
586 } 586 }
587 t->kregs->u_regs[UREG_FP] = sp; 587 t->kregs->u_regs[UREG_FP] = sp;
588 t->flags |= ((long)ASI_AIUS << TI_FLAG_CURRENT_DS_SHIFT); 588 t->flags |= ((long)ASI_AIUS << TI_FLAG_CURRENT_DS_SHIFT);
589 if (sp != regs->u_regs[UREG_FP]) { 589 if (sp != regs->u_regs[UREG_FP]) {
590 unsigned long csp; 590 unsigned long csp;
591 591
592 csp = clone_stackframe(sp, regs->u_regs[UREG_FP]); 592 csp = clone_stackframe(sp, regs->u_regs[UREG_FP]);
593 if (!csp) 593 if (!csp)
594 return -EFAULT; 594 return -EFAULT;
595 t->kregs->u_regs[UREG_FP] = csp; 595 t->kregs->u_regs[UREG_FP] = csp;
596 } 596 }
597 if (t->utraps) 597 if (t->utraps)
598 t->utraps[0]++; 598 t->utraps[0]++;
599 } 599 }
600 600
601 /* Set the return value for the child. */ 601 /* Set the return value for the child. */
602 t->kregs->u_regs[UREG_I0] = current->pid; 602 t->kregs->u_regs[UREG_I0] = current->pid;
603 t->kregs->u_regs[UREG_I1] = 1; 603 t->kregs->u_regs[UREG_I1] = 1;
604 604
605 /* Set the second return value for the parent. */ 605 /* Set the second return value for the parent. */
606 regs->u_regs[UREG_I1] = 0; 606 regs->u_regs[UREG_I1] = 0;
607 607
608 if (clone_flags & CLONE_SETTLS) 608 if (clone_flags & CLONE_SETTLS)
609 t->kregs->u_regs[UREG_G7] = regs->u_regs[UREG_I3]; 609 t->kregs->u_regs[UREG_G7] = regs->u_regs[UREG_I3];
610 610
611 return 0; 611 return 0;
612 } 612 }
613 613
614 /* 614 /*
615 * This is the mechanism for creating a new kernel thread. 615 * This is the mechanism for creating a new kernel thread.
616 * 616 *
617 * NOTE! Only a kernel-only process(ie the swapper or direct descendants 617 * NOTE! Only a kernel-only process(ie the swapper or direct descendants
618 * who haven't done an "execve()") should use this: it will work within 618 * who haven't done an "execve()") should use this: it will work within
619 * a system call from a "real" process, but the process memory space will 619 * a system call from a "real" process, but the process memory space will
620 * not be freed until both the parent and the child have exited. 620 * not be freed until both the parent and the child have exited.
621 */ 621 */
622 pid_t kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) 622 pid_t kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
623 { 623 {
624 long retval; 624 long retval;
625 625
626 /* If the parent runs before fn(arg) is called by the child, 626 /* If the parent runs before fn(arg) is called by the child,
627 * the input registers of this function can be clobbered. 627 * the input registers of this function can be clobbered.
628 * So we stash 'fn' and 'arg' into global registers which 628 * So we stash 'fn' and 'arg' into global registers which
629 * will not be modified by the parent. 629 * will not be modified by the parent.
630 */ 630 */
631 __asm__ __volatile__("mov %4, %%g2\n\t" /* Save FN into global */ 631 __asm__ __volatile__("mov %4, %%g2\n\t" /* Save FN into global */
632 "mov %5, %%g3\n\t" /* Save ARG into global */ 632 "mov %5, %%g3\n\t" /* Save ARG into global */
633 "mov %1, %%g1\n\t" /* Clone syscall nr. */ 633 "mov %1, %%g1\n\t" /* Clone syscall nr. */
634 "mov %2, %%o0\n\t" /* Clone flags. */ 634 "mov %2, %%o0\n\t" /* Clone flags. */
635 "mov 0, %%o1\n\t" /* usp arg == 0 */ 635 "mov 0, %%o1\n\t" /* usp arg == 0 */
636 "t 0x6d\n\t" /* Linux/Sparc clone(). */ 636 "t 0x6d\n\t" /* Linux/Sparc clone(). */
637 "brz,a,pn %%o1, 1f\n\t" /* Parent, just return. */ 637 "brz,a,pn %%o1, 1f\n\t" /* Parent, just return. */
638 " mov %%o0, %0\n\t" 638 " mov %%o0, %0\n\t"
639 "jmpl %%g2, %%o7\n\t" /* Call the function. */ 639 "jmpl %%g2, %%o7\n\t" /* Call the function. */
640 " mov %%g3, %%o0\n\t" /* Set arg in delay. */ 640 " mov %%g3, %%o0\n\t" /* Set arg in delay. */
641 "mov %3, %%g1\n\t" 641 "mov %3, %%g1\n\t"
642 "t 0x6d\n\t" /* Linux/Sparc exit(). */ 642 "t 0x6d\n\t" /* Linux/Sparc exit(). */
643 /* Notreached by child. */ 643 /* Notreached by child. */
644 "1:" : 644 "1:" :
645 "=r" (retval) : 645 "=r" (retval) :
646 "i" (__NR_clone), "r" (flags | CLONE_VM | CLONE_UNTRACED), 646 "i" (__NR_clone), "r" (flags | CLONE_VM | CLONE_UNTRACED),
647 "i" (__NR_exit), "r" (fn), "r" (arg) : 647 "i" (__NR_exit), "r" (fn), "r" (arg) :
648 "g1", "g2", "g3", "o0", "o1", "memory", "cc"); 648 "g1", "g2", "g3", "o0", "o1", "memory", "cc");
649 return retval; 649 return retval;
650 } 650 }
651 EXPORT_SYMBOL(kernel_thread); 651 EXPORT_SYMBOL(kernel_thread);
652 652
653 typedef struct { 653 typedef struct {
654 union { 654 union {
655 unsigned int pr_regs[32]; 655 unsigned int pr_regs[32];
656 unsigned long pr_dregs[16]; 656 unsigned long pr_dregs[16];
657 } pr_fr; 657 } pr_fr;
658 unsigned int __unused; 658 unsigned int __unused;
659 unsigned int pr_fsr; 659 unsigned int pr_fsr;
660 unsigned char pr_qcnt; 660 unsigned char pr_qcnt;
661 unsigned char pr_q_entrysize; 661 unsigned char pr_q_entrysize;
662 unsigned char pr_en; 662 unsigned char pr_en;
663 unsigned int pr_q[64]; 663 unsigned int pr_q[64];
664 } elf_fpregset_t32; 664 } elf_fpregset_t32;
665 665
666 /* 666 /*
667 * fill in the fpu structure for a core dump. 667 * fill in the fpu structure for a core dump.
668 */ 668 */
669 int dump_fpu (struct pt_regs * regs, elf_fpregset_t * fpregs) 669 int dump_fpu (struct pt_regs * regs, elf_fpregset_t * fpregs)
670 { 670 {
671 unsigned long *kfpregs = current_thread_info()->fpregs; 671 unsigned long *kfpregs = current_thread_info()->fpregs;
672 unsigned long fprs = current_thread_info()->fpsaved[0]; 672 unsigned long fprs = current_thread_info()->fpsaved[0];
673 673
674 if (test_thread_flag(TIF_32BIT)) { 674 if (test_thread_flag(TIF_32BIT)) {
675 elf_fpregset_t32 *fpregs32 = (elf_fpregset_t32 *)fpregs; 675 elf_fpregset_t32 *fpregs32 = (elf_fpregset_t32 *)fpregs;
676 676
677 if (fprs & FPRS_DL) 677 if (fprs & FPRS_DL)
678 memcpy(&fpregs32->pr_fr.pr_regs[0], kfpregs, 678 memcpy(&fpregs32->pr_fr.pr_regs[0], kfpregs,
679 sizeof(unsigned int) * 32); 679 sizeof(unsigned int) * 32);
680 else 680 else
681 memset(&fpregs32->pr_fr.pr_regs[0], 0, 681 memset(&fpregs32->pr_fr.pr_regs[0], 0,
682 sizeof(unsigned int) * 32); 682 sizeof(unsigned int) * 32);
683 fpregs32->pr_qcnt = 0; 683 fpregs32->pr_qcnt = 0;
684 fpregs32->pr_q_entrysize = 8; 684 fpregs32->pr_q_entrysize = 8;
685 memset(&fpregs32->pr_q[0], 0, 685 memset(&fpregs32->pr_q[0], 0,
686 (sizeof(unsigned int) * 64)); 686 (sizeof(unsigned int) * 64));
687 if (fprs & FPRS_FEF) { 687 if (fprs & FPRS_FEF) {
688 fpregs32->pr_fsr = (unsigned int) current_thread_info()->xfsr[0]; 688 fpregs32->pr_fsr = (unsigned int) current_thread_info()->xfsr[0];
689 fpregs32->pr_en = 1; 689 fpregs32->pr_en = 1;
690 } else { 690 } else {
691 fpregs32->pr_fsr = 0; 691 fpregs32->pr_fsr = 0;
692 fpregs32->pr_en = 0; 692 fpregs32->pr_en = 0;
693 } 693 }
694 } else { 694 } else {
695 if(fprs & FPRS_DL) 695 if(fprs & FPRS_DL)
696 memcpy(&fpregs->pr_regs[0], kfpregs, 696 memcpy(&fpregs->pr_regs[0], kfpregs,
697 sizeof(unsigned int) * 32); 697 sizeof(unsigned int) * 32);
698 else 698 else
699 memset(&fpregs->pr_regs[0], 0, 699 memset(&fpregs->pr_regs[0], 0,
700 sizeof(unsigned int) * 32); 700 sizeof(unsigned int) * 32);
701 if(fprs & FPRS_DU) 701 if(fprs & FPRS_DU)
702 memcpy(&fpregs->pr_regs[16], kfpregs+16, 702 memcpy(&fpregs->pr_regs[16], kfpregs+16,
703 sizeof(unsigned int) * 32); 703 sizeof(unsigned int) * 32);
704 else 704 else
705 memset(&fpregs->pr_regs[16], 0, 705 memset(&fpregs->pr_regs[16], 0,
706 sizeof(unsigned int) * 32); 706 sizeof(unsigned int) * 32);
707 if(fprs & FPRS_FEF) { 707 if(fprs & FPRS_FEF) {
708 fpregs->pr_fsr = current_thread_info()->xfsr[0]; 708 fpregs->pr_fsr = current_thread_info()->xfsr[0];
709 fpregs->pr_gsr = current_thread_info()->gsr[0]; 709 fpregs->pr_gsr = current_thread_info()->gsr[0];
710 } else { 710 } else {
711 fpregs->pr_fsr = fpregs->pr_gsr = 0; 711 fpregs->pr_fsr = fpregs->pr_gsr = 0;
712 } 712 }
713 fpregs->pr_fprs = fprs; 713 fpregs->pr_fprs = fprs;
714 } 714 }
715 return 1; 715 return 1;
716 } 716 }
717 EXPORT_SYMBOL(dump_fpu); 717 EXPORT_SYMBOL(dump_fpu);
718 718
719 /* 719 /*
720 * sparc_execve() executes a new program after the asm stub has set 720 * sparc_execve() executes a new program after the asm stub has set
721 * things up for us. This should basically do what I want it to. 721 * things up for us. This should basically do what I want it to.
722 */ 722 */
723 asmlinkage int sparc_execve(struct pt_regs *regs) 723 asmlinkage int sparc_execve(struct pt_regs *regs)
724 { 724 {
725 int error, base = 0; 725 int error, base = 0;
726 char *filename; 726 char *filename;
727 727
728 /* User register window flush is done by entry.S */ 728 /* User register window flush is done by entry.S */
729 729
730 /* Check for indirect call. */ 730 /* Check for indirect call. */
731 if (regs->u_regs[UREG_G1] == 0) 731 if (regs->u_regs[UREG_G1] == 0)
732 base = 1; 732 base = 1;
733 733
734 filename = getname((char __user *)regs->u_regs[base + UREG_I0]); 734 filename = getname((char __user *)regs->u_regs[base + UREG_I0]);
735 error = PTR_ERR(filename); 735 error = PTR_ERR(filename);
736 if (IS_ERR(filename)) 736 if (IS_ERR(filename))
737 goto out; 737 goto out;
738 error = do_execve(filename, 738 error = do_execve(filename,
739 (const char __user *const __user *) 739 (const char __user *const __user *)
740 regs->u_regs[base + UREG_I1], 740 regs->u_regs[base + UREG_I1],
741 (const char __user *const __user *) 741 (const char __user *const __user *)
742 regs->u_regs[base + UREG_I2], regs); 742 regs->u_regs[base + UREG_I2], regs);
743 putname(filename); 743 putname(filename);
744 if (!error) { 744 if (!error) {
745 fprs_write(0); 745 fprs_write(0);
746 current_thread_info()->xfsr[0] = 0; 746 current_thread_info()->xfsr[0] = 0;
747 current_thread_info()->fpsaved[0] = 0; 747 current_thread_info()->fpsaved[0] = 0;
748 regs->tstate &= ~TSTATE_PEF; 748 regs->tstate &= ~TSTATE_PEF;
749 } 749 }
750 out: 750 out:
751 return error; 751 return error;
752 } 752 }
753 753
754 unsigned long get_wchan(struct task_struct *task) 754 unsigned long get_wchan(struct task_struct *task)
755 { 755 {
756 unsigned long pc, fp, bias = 0; 756 unsigned long pc, fp, bias = 0;
757 struct thread_info *tp; 757 struct thread_info *tp;
758 struct reg_window *rw; 758 struct reg_window *rw;
759 unsigned long ret = 0; 759 unsigned long ret = 0;
760 int count = 0; 760 int count = 0;
761 761
762 if (!task || task == current || 762 if (!task || task == current ||
763 task->state == TASK_RUNNING) 763 task->state == TASK_RUNNING)
764 goto out; 764 goto out;
765 765
766 tp = task_thread_info(task); 766 tp = task_thread_info(task);
767 bias = STACK_BIAS; 767 bias = STACK_BIAS;
768 fp = task_thread_info(task)->ksp + bias; 768 fp = task_thread_info(task)->ksp + bias;
769 769
770 do { 770 do {
771 if (!kstack_valid(tp, fp)) 771 if (!kstack_valid(tp, fp))
772 break; 772 break;
773 rw = (struct reg_window *) fp; 773 rw = (struct reg_window *) fp;
774 pc = rw->ins[7]; 774 pc = rw->ins[7];
775 if (!in_sched_functions(pc)) { 775 if (!in_sched_functions(pc)) {
776 ret = pc; 776 ret = pc;
777 goto out; 777 goto out;
778 } 778 }
779 fp = rw->ins[6] + bias; 779 fp = rw->ins[6] + bias;
780 } while (++count < 16); 780 } while (++count < 16);
781 781
782 out: 782 out:
783 return ret; 783 return ret;
784 } 784 }
785 785
include/linux/preempt.h
1 #ifndef __LINUX_PREEMPT_H 1 #ifndef __LINUX_PREEMPT_H
2 #define __LINUX_PREEMPT_H 2 #define __LINUX_PREEMPT_H
3 3
4 /* 4 /*
5 * include/linux/preempt.h - macros for accessing and manipulating 5 * include/linux/preempt.h - macros for accessing and manipulating
6 * preempt_count (used for kernel preemption, interrupt count, etc.) 6 * preempt_count (used for kernel preemption, interrupt count, etc.)
7 */ 7 */
8 8
9 #include <linux/thread_info.h> 9 #include <linux/thread_info.h>
10 #include <linux/linkage.h> 10 #include <linux/linkage.h>
11 #include <linux/list.h> 11 #include <linux/list.h>
12 12
13 #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) 13 #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
14 extern void add_preempt_count(int val); 14 extern void add_preempt_count(int val);
15 extern void sub_preempt_count(int val); 15 extern void sub_preempt_count(int val);
16 #else 16 #else
17 # define add_preempt_count(val) do { preempt_count() += (val); } while (0) 17 # define add_preempt_count(val) do { preempt_count() += (val); } while (0)
18 # define sub_preempt_count(val) do { preempt_count() -= (val); } while (0) 18 # define sub_preempt_count(val) do { preempt_count() -= (val); } while (0)
19 #endif 19 #endif
20 20
21 #define inc_preempt_count() add_preempt_count(1) 21 #define inc_preempt_count() add_preempt_count(1)
22 #define dec_preempt_count() sub_preempt_count(1) 22 #define dec_preempt_count() sub_preempt_count(1)
23 23
24 #define preempt_count() (current_thread_info()->preempt_count) 24 #define preempt_count() (current_thread_info()->preempt_count)
25 25
26 #ifdef CONFIG_PREEMPT 26 #ifdef CONFIG_PREEMPT
27 27
28 asmlinkage void preempt_schedule(void); 28 asmlinkage void preempt_schedule(void);
29 29
30 #define preempt_check_resched() \ 30 #define preempt_check_resched() \
31 do { \ 31 do { \
32 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ 32 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
33 preempt_schedule(); \ 33 preempt_schedule(); \
34 } while (0) 34 } while (0)
35 35
36 #else /* !CONFIG_PREEMPT */ 36 #else /* !CONFIG_PREEMPT */
37 37
38 #define preempt_check_resched() do { } while (0) 38 #define preempt_check_resched() do { } while (0)
39 39
40 #endif /* CONFIG_PREEMPT */ 40 #endif /* CONFIG_PREEMPT */
41 41
42 42
43 #ifdef CONFIG_PREEMPT_COUNT 43 #ifdef CONFIG_PREEMPT_COUNT
44 44
45 #define preempt_disable() \ 45 #define preempt_disable() \
46 do { \ 46 do { \
47 inc_preempt_count(); \ 47 inc_preempt_count(); \
48 barrier(); \ 48 barrier(); \
49 } while (0) 49 } while (0)
50 50
51 #define preempt_enable_no_resched() \ 51 #define sched_preempt_enable_no_resched() \
52 do { \ 52 do { \
53 barrier(); \ 53 barrier(); \
54 dec_preempt_count(); \ 54 dec_preempt_count(); \
55 } while (0) 55 } while (0)
56 56
57 #define preempt_enable_no_resched() sched_preempt_enable_no_resched()
58
57 #define preempt_enable() \ 59 #define preempt_enable() \
58 do { \ 60 do { \
59 preempt_enable_no_resched(); \ 61 preempt_enable_no_resched(); \
60 barrier(); \ 62 barrier(); \
61 preempt_check_resched(); \ 63 preempt_check_resched(); \
62 } while (0) 64 } while (0)
63 65
64 /* For debugging and tracer internals only! */ 66 /* For debugging and tracer internals only! */
65 #define add_preempt_count_notrace(val) \ 67 #define add_preempt_count_notrace(val) \
66 do { preempt_count() += (val); } while (0) 68 do { preempt_count() += (val); } while (0)
67 #define sub_preempt_count_notrace(val) \ 69 #define sub_preempt_count_notrace(val) \
68 do { preempt_count() -= (val); } while (0) 70 do { preempt_count() -= (val); } while (0)
69 #define inc_preempt_count_notrace() add_preempt_count_notrace(1) 71 #define inc_preempt_count_notrace() add_preempt_count_notrace(1)
70 #define dec_preempt_count_notrace() sub_preempt_count_notrace(1) 72 #define dec_preempt_count_notrace() sub_preempt_count_notrace(1)
71 73
72 #define preempt_disable_notrace() \ 74 #define preempt_disable_notrace() \
73 do { \ 75 do { \
74 inc_preempt_count_notrace(); \ 76 inc_preempt_count_notrace(); \
75 barrier(); \ 77 barrier(); \
76 } while (0) 78 } while (0)
77 79
78 #define preempt_enable_no_resched_notrace() \ 80 #define preempt_enable_no_resched_notrace() \
79 do { \ 81 do { \
80 barrier(); \ 82 barrier(); \
81 dec_preempt_count_notrace(); \ 83 dec_preempt_count_notrace(); \
82 } while (0) 84 } while (0)
83 85
84 /* preempt_check_resched is OK to trace */ 86 /* preempt_check_resched is OK to trace */
85 #define preempt_enable_notrace() \ 87 #define preempt_enable_notrace() \
86 do { \ 88 do { \
87 preempt_enable_no_resched_notrace(); \ 89 preempt_enable_no_resched_notrace(); \
88 barrier(); \ 90 barrier(); \
89 preempt_check_resched(); \ 91 preempt_check_resched(); \
90 } while (0) 92 } while (0)
91 93
92 #else /* !CONFIG_PREEMPT_COUNT */ 94 #else /* !CONFIG_PREEMPT_COUNT */
93 95
94 #define preempt_disable() do { } while (0) 96 #define preempt_disable() do { } while (0)
97 #define sched_preempt_enable_no_resched() do { } while (0)
95 #define preempt_enable_no_resched() do { } while (0) 98 #define preempt_enable_no_resched() do { } while (0)
96 #define preempt_enable() do { } while (0) 99 #define preempt_enable() do { } while (0)
97 100
98 #define preempt_disable_notrace() do { } while (0) 101 #define preempt_disable_notrace() do { } while (0)
99 #define preempt_enable_no_resched_notrace() do { } while (0) 102 #define preempt_enable_no_resched_notrace() do { } while (0)
100 #define preempt_enable_notrace() do { } while (0) 103 #define preempt_enable_notrace() do { } while (0)
101 104
102 #endif /* CONFIG_PREEMPT_COUNT */ 105 #endif /* CONFIG_PREEMPT_COUNT */
103 106
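The hunk above is the substantive change in this header: the old preempt_enable_no_resched() body is renamed to sched_preempt_enable_no_resched(), and preempt_enable_no_resched() is kept as an alias that expands to it (the !CONFIG_PREEMPT_COUNT branch gains a matching no-op stub). Nothing changes functionally here; the new spelling only marks scheduler-internal call sites that drop the preempt count without re-checking TIF_NEED_RESCHED, typically because schedule() is about to be invoked anyway. A rough sketch of how the two spellings are intended to split across call sites (both example functions are illustrative, not part of this patch):

#include <linux/preempt.h>
#include <linux/sched.h>

/*
 * Scheduler-internal pattern: the preempt count is dropped without the
 * usual resched check because schedule() runs immediately afterwards,
 * so re-checking TIF_NEED_RESCHED here would be redundant.
 */
static void example_sched_side(void)
{
	preempt_disable();
	/* ... scheduler bookkeeping ... */
	sched_preempt_enable_no_resched();
	schedule();
}

/*
 * Everything else keeps the old spelling; it still expands to the same
 * macro here, but stays visually distinct from the scheduler sites.
 */
static void example_other_side(void)
{
	preempt_disable();
	/* ... short section that must not reschedule on the way out ... */
	preempt_enable_no_resched();
}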
104 #ifdef CONFIG_PREEMPT_NOTIFIERS 107 #ifdef CONFIG_PREEMPT_NOTIFIERS
105 108
106 struct preempt_notifier; 109 struct preempt_notifier;
107 110
108 /** 111 /**
109 * preempt_ops - notifiers called when a task is preempted and rescheduled 112 * preempt_ops - notifiers called when a task is preempted and rescheduled
110 * @sched_in: we're about to be rescheduled: 113 * @sched_in: we're about to be rescheduled:
111 * notifier: struct preempt_notifier for the task being scheduled 114 * notifier: struct preempt_notifier for the task being scheduled
112 * cpu: cpu we're scheduled on 115 * cpu: cpu we're scheduled on
113 * @sched_out: we've just been preempted 116 * @sched_out: we've just been preempted
114 * notifier: struct preempt_notifier for the task being preempted 117 * notifier: struct preempt_notifier for the task being preempted
115 * next: the task that's kicking us out 118 * next: the task that's kicking us out
116 * 119 *
117 * Please note that sched_in and out are called under different 120 * Please note that sched_in and out are called under different
118 * contexts. sched_out is called with rq lock held and irq disabled 121 * contexts. sched_out is called with rq lock held and irq disabled
119 * while sched_in is called without rq lock and irq enabled. This 122 * while sched_in is called without rq lock and irq enabled. This
120 * difference is intentional and depended upon by its users. 123 * difference is intentional and depended upon by its users.
121 */ 124 */
122 struct preempt_ops { 125 struct preempt_ops {
123 void (*sched_in)(struct preempt_notifier *notifier, int cpu); 126 void (*sched_in)(struct preempt_notifier *notifier, int cpu);
124 void (*sched_out)(struct preempt_notifier *notifier, 127 void (*sched_out)(struct preempt_notifier *notifier,
125 struct task_struct *next); 128 struct task_struct *next);
126 }; 129 };
127 130
128 /** 131 /**
129 * preempt_notifier - key for installing preemption notifiers 132 * preempt_notifier - key for installing preemption notifiers
130 * @link: internal use 133 * @link: internal use
131 * @ops: defines the notifier functions to be called 134 * @ops: defines the notifier functions to be called
132 * 135 *
133 * Usually used in conjunction with container_of(). 136 * Usually used in conjunction with container_of().
134 */ 137 */
135 struct preempt_notifier { 138 struct preempt_notifier {
136 struct hlist_node link; 139 struct hlist_node link;
137 struct preempt_ops *ops; 140 struct preempt_ops *ops;
138 }; 141 };
139 142
140 void preempt_notifier_register(struct preempt_notifier *notifier); 143 void preempt_notifier_register(struct preempt_notifier *notifier);
141 void preempt_notifier_unregister(struct preempt_notifier *notifier); 144 void preempt_notifier_unregister(struct preempt_notifier *notifier);
142 145
143 static inline void preempt_notifier_init(struct preempt_notifier *notifier, 146 static inline void preempt_notifier_init(struct preempt_notifier *notifier,
144 struct preempt_ops *ops) 147 struct preempt_ops *ops)
145 { 148 {
146 INIT_HLIST_NODE(&notifier->link); 149 INIT_HLIST_NODE(&notifier->link);
147 notifier->ops = ops; 150 notifier->ops = ops;
148 } 151 }
149 152
150 #endif 153 #endif
151 154
152 #endif /* __LINUX_PREEMPT_H */ 155 #endif /* __LINUX_PREEMPT_H */
153 156
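For completeness, the preempt-notifier API declared above (untouched by this patch) is normally used by embedding a struct preempt_notifier inside a larger per-task object and recovering that object with container_of() in the callbacks, as the kernel-doc suggests. A minimal sketch, where my_ctx and both callback bodies are illustrative only:

#include <linux/kernel.h>
#include <linux/preempt.h>
#include <linux/sched.h>

struct my_ctx {
	struct preempt_notifier	notifier;
	unsigned long		times_preempted;	/* illustrative bookkeeping */
};

/* About to run again on @cpu; called without the rq lock, irqs enabled. */
static void my_sched_in(struct preempt_notifier *pn, int cpu)
{
	struct my_ctx *ctx = container_of(pn, struct my_ctx, notifier);

	(void)ctx;	/* e.g. reload per-cpu state for this task */
}

/* Just preempted in favour of @next; called with the rq lock held, irqs off. */
static void my_sched_out(struct preempt_notifier *pn, struct task_struct *next)
{
	struct my_ctx *ctx = container_of(pn, struct my_ctx, notifier);

	ctx->times_preempted++;
}

static struct preempt_ops my_preempt_ops = {
	.sched_in	= my_sched_in,
	.sched_out	= my_sched_out,
};

/* Attach to the current task; undone later with preempt_notifier_unregister(). */
static void my_ctx_attach(struct my_ctx *ctx)
{
	preempt_notifier_init(&ctx->notifier, &my_preempt_ops);
	preempt_notifier_register(&ctx->notifier);
}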
1 /* 1 /*
2 * kernel/sched/core.c 2 * kernel/sched/core.c
3 * 3 *
4 * Kernel scheduler and related syscalls 4 * Kernel scheduler and related syscalls
5 * 5 *
6 * Copyright (C) 1991-2002 Linus Torvalds 6 * Copyright (C) 1991-2002 Linus Torvalds
7 * 7 *
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and 8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe 9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff 10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli 11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: 12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with 13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices 14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions 15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas. 17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin 18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 * 2007-04-15 Work begun on replacing all interactivity tuning with a 19 * 2007-04-15 Work begun on replacing all interactivity tuning with a
20 * fair scheduling design by Con Kolivas. 20 * fair scheduling design by Con Kolivas.
21 * 2007-05-05 Load balancing (smp-nice) and other improvements 21 * 2007-05-05 Load balancing (smp-nice) and other improvements
22 * by Peter Williams 22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith 23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
26 * Thomas Gleixner, Mike Kravetz 26 * Thomas Gleixner, Mike Kravetz
27 */ 27 */
28 28
29 #include <linux/mm.h> 29 #include <linux/mm.h>
30 #include <linux/module.h> 30 #include <linux/module.h>
31 #include <linux/nmi.h> 31 #include <linux/nmi.h>
32 #include <linux/init.h> 32 #include <linux/init.h>
33 #include <linux/uaccess.h> 33 #include <linux/uaccess.h>
34 #include <linux/highmem.h> 34 #include <linux/highmem.h>
35 #include <asm/mmu_context.h> 35 #include <asm/mmu_context.h>
36 #include <linux/interrupt.h> 36 #include <linux/interrupt.h>
37 #include <linux/capability.h> 37 #include <linux/capability.h>
38 #include <linux/completion.h> 38 #include <linux/completion.h>
39 #include <linux/kernel_stat.h> 39 #include <linux/kernel_stat.h>
40 #include <linux/debug_locks.h> 40 #include <linux/debug_locks.h>
41 #include <linux/perf_event.h> 41 #include <linux/perf_event.h>
42 #include <linux/security.h> 42 #include <linux/security.h>
43 #include <linux/notifier.h> 43 #include <linux/notifier.h>
44 #include <linux/profile.h> 44 #include <linux/profile.h>
45 #include <linux/freezer.h> 45 #include <linux/freezer.h>
46 #include <linux/vmalloc.h> 46 #include <linux/vmalloc.h>
47 #include <linux/blkdev.h> 47 #include <linux/blkdev.h>
48 #include <linux/delay.h> 48 #include <linux/delay.h>
49 #include <linux/pid_namespace.h> 49 #include <linux/pid_namespace.h>
50 #include <linux/smp.h> 50 #include <linux/smp.h>
51 #include <linux/threads.h> 51 #include <linux/threads.h>
52 #include <linux/timer.h> 52 #include <linux/timer.h>
53 #include <linux/rcupdate.h> 53 #include <linux/rcupdate.h>
54 #include <linux/cpu.h> 54 #include <linux/cpu.h>
55 #include <linux/cpuset.h> 55 #include <linux/cpuset.h>
56 #include <linux/percpu.h> 56 #include <linux/percpu.h>
57 #include <linux/proc_fs.h> 57 #include <linux/proc_fs.h>
58 #include <linux/seq_file.h> 58 #include <linux/seq_file.h>
59 #include <linux/sysctl.h> 59 #include <linux/sysctl.h>
60 #include <linux/syscalls.h> 60 #include <linux/syscalls.h>
61 #include <linux/times.h> 61 #include <linux/times.h>
62 #include <linux/tsacct_kern.h> 62 #include <linux/tsacct_kern.h>
63 #include <linux/kprobes.h> 63 #include <linux/kprobes.h>
64 #include <linux/delayacct.h> 64 #include <linux/delayacct.h>
65 #include <linux/unistd.h> 65 #include <linux/unistd.h>
66 #include <linux/pagemap.h> 66 #include <linux/pagemap.h>
67 #include <linux/hrtimer.h> 67 #include <linux/hrtimer.h>
68 #include <linux/tick.h> 68 #include <linux/tick.h>
69 #include <linux/debugfs.h> 69 #include <linux/debugfs.h>
70 #include <linux/ctype.h> 70 #include <linux/ctype.h>
71 #include <linux/ftrace.h> 71 #include <linux/ftrace.h>
72 #include <linux/slab.h> 72 #include <linux/slab.h>
73 #include <linux/init_task.h> 73 #include <linux/init_task.h>
74 74
75 #include <asm/tlb.h> 75 #include <asm/tlb.h>
76 #include <asm/irq_regs.h> 76 #include <asm/irq_regs.h>
77 #include <asm/mutex.h> 77 #include <asm/mutex.h>
78 #ifdef CONFIG_PARAVIRT 78 #ifdef CONFIG_PARAVIRT
79 #include <asm/paravirt.h> 79 #include <asm/paravirt.h>
80 #endif 80 #endif
81 81
82 #include "sched.h" 82 #include "sched.h"
83 #include "../workqueue_sched.h" 83 #include "../workqueue_sched.h"
84 84
85 #define CREATE_TRACE_POINTS 85 #define CREATE_TRACE_POINTS
86 #include <trace/events/sched.h> 86 #include <trace/events/sched.h>
87 87
88 void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) 88 void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
89 { 89 {
90 unsigned long delta; 90 unsigned long delta;
91 ktime_t soft, hard, now; 91 ktime_t soft, hard, now;
92 92
93 for (;;) { 93 for (;;) {
94 if (hrtimer_active(period_timer)) 94 if (hrtimer_active(period_timer))
95 break; 95 break;
96 96
97 now = hrtimer_cb_get_time(period_timer); 97 now = hrtimer_cb_get_time(period_timer);
98 hrtimer_forward(period_timer, now, period); 98 hrtimer_forward(period_timer, now, period);
99 99
100 soft = hrtimer_get_softexpires(period_timer); 100 soft = hrtimer_get_softexpires(period_timer);
101 hard = hrtimer_get_expires(period_timer); 101 hard = hrtimer_get_expires(period_timer);
102 delta = ktime_to_ns(ktime_sub(hard, soft)); 102 delta = ktime_to_ns(ktime_sub(hard, soft));
103 __hrtimer_start_range_ns(period_timer, soft, delta, 103 __hrtimer_start_range_ns(period_timer, soft, delta,
104 HRTIMER_MODE_ABS_PINNED, 0); 104 HRTIMER_MODE_ABS_PINNED, 0);
105 } 105 }
106 } 106 }
107 107
108 DEFINE_MUTEX(sched_domains_mutex); 108 DEFINE_MUTEX(sched_domains_mutex);
109 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 109 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
110 110
111 static void update_rq_clock_task(struct rq *rq, s64 delta); 111 static void update_rq_clock_task(struct rq *rq, s64 delta);
112 112
113 void update_rq_clock(struct rq *rq) 113 void update_rq_clock(struct rq *rq)
114 { 114 {
115 s64 delta; 115 s64 delta;
116 116
117 if (rq->skip_clock_update > 0) 117 if (rq->skip_clock_update > 0)
118 return; 118 return;
119 119
120 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 120 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
121 rq->clock += delta; 121 rq->clock += delta;
122 update_rq_clock_task(rq, delta); 122 update_rq_clock_task(rq, delta);
123 } 123 }
124 124
125 /* 125 /*
126 * Debugging: various feature bits 126 * Debugging: various feature bits
127 */ 127 */
128 128
129 #define SCHED_FEAT(name, enabled) \ 129 #define SCHED_FEAT(name, enabled) \
130 (1UL << __SCHED_FEAT_##name) * enabled | 130 (1UL << __SCHED_FEAT_##name) * enabled |
131 131
132 const_debug unsigned int sysctl_sched_features = 132 const_debug unsigned int sysctl_sched_features =
133 #include "features.h" 133 #include "features.h"
134 0; 134 0;
135 135
136 #undef SCHED_FEAT 136 #undef SCHED_FEAT
137 137
138 #ifdef CONFIG_SCHED_DEBUG 138 #ifdef CONFIG_SCHED_DEBUG
139 #define SCHED_FEAT(name, enabled) \ 139 #define SCHED_FEAT(name, enabled) \
140 #name , 140 #name ,
141 141
142 static __read_mostly char *sched_feat_names[] = { 142 static __read_mostly char *sched_feat_names[] = {
143 #include "features.h" 143 #include "features.h"
144 NULL 144 NULL
145 }; 145 };
146 146
147 #undef SCHED_FEAT 147 #undef SCHED_FEAT
148 148
149 static int sched_feat_show(struct seq_file *m, void *v) 149 static int sched_feat_show(struct seq_file *m, void *v)
150 { 150 {
151 int i; 151 int i;
152 152
153 for (i = 0; i < __SCHED_FEAT_NR; i++) { 153 for (i = 0; i < __SCHED_FEAT_NR; i++) {
154 if (!(sysctl_sched_features & (1UL << i))) 154 if (!(sysctl_sched_features & (1UL << i)))
155 seq_puts(m, "NO_"); 155 seq_puts(m, "NO_");
156 seq_printf(m, "%s ", sched_feat_names[i]); 156 seq_printf(m, "%s ", sched_feat_names[i]);
157 } 157 }
158 seq_puts(m, "\n"); 158 seq_puts(m, "\n");
159 159
160 return 0; 160 return 0;
161 } 161 }
162 162
163 #ifdef HAVE_JUMP_LABEL 163 #ifdef HAVE_JUMP_LABEL
164 164
165 #define jump_label_key__true jump_label_key_enabled 165 #define jump_label_key__true jump_label_key_enabled
166 #define jump_label_key__false jump_label_key_disabled 166 #define jump_label_key__false jump_label_key_disabled
167 167
168 #define SCHED_FEAT(name, enabled) \ 168 #define SCHED_FEAT(name, enabled) \
169 jump_label_key__##enabled , 169 jump_label_key__##enabled ,
170 170
171 struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { 171 struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {
172 #include "features.h" 172 #include "features.h"
173 }; 173 };
174 174
175 #undef SCHED_FEAT 175 #undef SCHED_FEAT
176 176
177 static void sched_feat_disable(int i) 177 static void sched_feat_disable(int i)
178 { 178 {
179 if (jump_label_enabled(&sched_feat_keys[i])) 179 if (jump_label_enabled(&sched_feat_keys[i]))
180 jump_label_dec(&sched_feat_keys[i]); 180 jump_label_dec(&sched_feat_keys[i]);
181 } 181 }
182 182
183 static void sched_feat_enable(int i) 183 static void sched_feat_enable(int i)
184 { 184 {
185 if (!jump_label_enabled(&sched_feat_keys[i])) 185 if (!jump_label_enabled(&sched_feat_keys[i]))
186 jump_label_inc(&sched_feat_keys[i]); 186 jump_label_inc(&sched_feat_keys[i]);
187 } 187 }
188 #else 188 #else
189 static void sched_feat_disable(int i) { }; 189 static void sched_feat_disable(int i) { };
190 static void sched_feat_enable(int i) { }; 190 static void sched_feat_enable(int i) { };
191 #endif /* HAVE_JUMP_LABEL */ 191 #endif /* HAVE_JUMP_LABEL */
192 192
193 static ssize_t 193 static ssize_t
194 sched_feat_write(struct file *filp, const char __user *ubuf, 194 sched_feat_write(struct file *filp, const char __user *ubuf,
195 size_t cnt, loff_t *ppos) 195 size_t cnt, loff_t *ppos)
196 { 196 {
197 char buf[64]; 197 char buf[64];
198 char *cmp; 198 char *cmp;
199 int neg = 0; 199 int neg = 0;
200 int i; 200 int i;
201 201
202 if (cnt > 63) 202 if (cnt > 63)
203 cnt = 63; 203 cnt = 63;
204 204
205 if (copy_from_user(&buf, ubuf, cnt)) 205 if (copy_from_user(&buf, ubuf, cnt))
206 return -EFAULT; 206 return -EFAULT;
207 207
208 buf[cnt] = 0; 208 buf[cnt] = 0;
209 cmp = strstrip(buf); 209 cmp = strstrip(buf);
210 210
211 if (strncmp(cmp, "NO_", 3) == 0) { 211 if (strncmp(cmp, "NO_", 3) == 0) {
212 neg = 1; 212 neg = 1;
213 cmp += 3; 213 cmp += 3;
214 } 214 }
215 215
216 for (i = 0; i < __SCHED_FEAT_NR; i++) { 216 for (i = 0; i < __SCHED_FEAT_NR; i++) {
217 if (strcmp(cmp, sched_feat_names[i]) == 0) { 217 if (strcmp(cmp, sched_feat_names[i]) == 0) {
218 if (neg) { 218 if (neg) {
219 sysctl_sched_features &= ~(1UL << i); 219 sysctl_sched_features &= ~(1UL << i);
220 sched_feat_disable(i); 220 sched_feat_disable(i);
221 } else { 221 } else {
222 sysctl_sched_features |= (1UL << i); 222 sysctl_sched_features |= (1UL << i);
223 sched_feat_enable(i); 223 sched_feat_enable(i);
224 } 224 }
225 break; 225 break;
226 } 226 }
227 } 227 }
228 228
229 if (i == __SCHED_FEAT_NR) 229 if (i == __SCHED_FEAT_NR)
230 return -EINVAL; 230 return -EINVAL;
231 231
232 *ppos += cnt; 232 *ppos += cnt;
233 233
234 return cnt; 234 return cnt;
235 } 235 }
236 236
237 static int sched_feat_open(struct inode *inode, struct file *filp) 237 static int sched_feat_open(struct inode *inode, struct file *filp)
238 { 238 {
239 return single_open(filp, sched_feat_show, NULL); 239 return single_open(filp, sched_feat_show, NULL);
240 } 240 }
241 241
242 static const struct file_operations sched_feat_fops = { 242 static const struct file_operations sched_feat_fops = {
243 .open = sched_feat_open, 243 .open = sched_feat_open,
244 .write = sched_feat_write, 244 .write = sched_feat_write,
245 .read = seq_read, 245 .read = seq_read,
246 .llseek = seq_lseek, 246 .llseek = seq_lseek,
247 .release = single_release, 247 .release = single_release,
248 }; 248 };
249 249
250 static __init int sched_init_debug(void) 250 static __init int sched_init_debug(void)
251 { 251 {
252 debugfs_create_file("sched_features", 0644, NULL, NULL, 252 debugfs_create_file("sched_features", 0644, NULL, NULL,
253 &sched_feat_fops); 253 &sched_feat_fops);
254 254
255 return 0; 255 return 0;
256 } 256 }
257 late_initcall(sched_init_debug); 257 late_initcall(sched_init_debug);
258 #endif /* CONFIG_SCHED_DEBUG */ 258 #endif /* CONFIG_SCHED_DEBUG */
259 259
260 /* 260 /*
261 * Number of tasks to iterate in a single balance run. 261 * Number of tasks to iterate in a single balance run.
262 * Limited because this is done with IRQs disabled. 262 * Limited because this is done with IRQs disabled.
263 */ 263 */
264 const_debug unsigned int sysctl_sched_nr_migrate = 32; 264 const_debug unsigned int sysctl_sched_nr_migrate = 32;
265 265
266 /* 266 /*
267 * period over which we average the RT time consumption, measured 267 * period over which we average the RT time consumption, measured
268 * in ms. 268 * in ms.
269 * 269 *
270 * default: 1s 270 * default: 1s
271 */ 271 */
272 const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; 272 const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
273 273
274 /* 274 /*
275 * period over which we measure -rt task cpu usage in us. 275 * period over which we measure -rt task cpu usage in us.
276 * default: 1s 276 * default: 1s
277 */ 277 */
278 unsigned int sysctl_sched_rt_period = 1000000; 278 unsigned int sysctl_sched_rt_period = 1000000;
279 279
280 __read_mostly int scheduler_running; 280 __read_mostly int scheduler_running;
281 281
282 /* 282 /*
283 * part of the period that we allow rt tasks to run in us. 283 * part of the period that we allow rt tasks to run in us.
284 * default: 0.95s 284 * default: 0.95s
285 */ 285 */
286 int sysctl_sched_rt_runtime = 950000; 286 int sysctl_sched_rt_runtime = 950000;
287 287
288 288
289 289
290 /* 290 /*
291 * __task_rq_lock - lock the rq @p resides on. 291 * __task_rq_lock - lock the rq @p resides on.
292 */ 292 */
293 static inline struct rq *__task_rq_lock(struct task_struct *p) 293 static inline struct rq *__task_rq_lock(struct task_struct *p)
294 __acquires(rq->lock) 294 __acquires(rq->lock)
295 { 295 {
296 struct rq *rq; 296 struct rq *rq;
297 297
298 lockdep_assert_held(&p->pi_lock); 298 lockdep_assert_held(&p->pi_lock);
299 299
300 for (;;) { 300 for (;;) {
301 rq = task_rq(p); 301 rq = task_rq(p);
302 raw_spin_lock(&rq->lock); 302 raw_spin_lock(&rq->lock);
303 if (likely(rq == task_rq(p))) 303 if (likely(rq == task_rq(p)))
304 return rq; 304 return rq;
305 raw_spin_unlock(&rq->lock); 305 raw_spin_unlock(&rq->lock);
306 } 306 }
307 } 307 }
308 308
309 /* 309 /*
310 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. 310 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
311 */ 311 */
312 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 312 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
313 __acquires(p->pi_lock) 313 __acquires(p->pi_lock)
314 __acquires(rq->lock) 314 __acquires(rq->lock)
315 { 315 {
316 struct rq *rq; 316 struct rq *rq;
317 317
318 for (;;) { 318 for (;;) {
319 raw_spin_lock_irqsave(&p->pi_lock, *flags); 319 raw_spin_lock_irqsave(&p->pi_lock, *flags);
320 rq = task_rq(p); 320 rq = task_rq(p);
321 raw_spin_lock(&rq->lock); 321 raw_spin_lock(&rq->lock);
322 if (likely(rq == task_rq(p))) 322 if (likely(rq == task_rq(p)))
323 return rq; 323 return rq;
324 raw_spin_unlock(&rq->lock); 324 raw_spin_unlock(&rq->lock);
325 raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 325 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
326 } 326 }
327 } 327 }
328 328
329 static void __task_rq_unlock(struct rq *rq) 329 static void __task_rq_unlock(struct rq *rq)
330 __releases(rq->lock) 330 __releases(rq->lock)
331 { 331 {
332 raw_spin_unlock(&rq->lock); 332 raw_spin_unlock(&rq->lock);
333 } 333 }
334 334
335 static inline void 335 static inline void
336 task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) 336 task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
337 __releases(rq->lock) 337 __releases(rq->lock)
338 __releases(p->pi_lock) 338 __releases(p->pi_lock)
339 { 339 {
340 raw_spin_unlock(&rq->lock); 340 raw_spin_unlock(&rq->lock);
341 raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 341 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
342 } 342 }
343 343
344 /* 344 /*
345 * this_rq_lock - lock this runqueue and disable interrupts. 345 * this_rq_lock - lock this runqueue and disable interrupts.
346 */ 346 */
347 static struct rq *this_rq_lock(void) 347 static struct rq *this_rq_lock(void)
348 __acquires(rq->lock) 348 __acquires(rq->lock)
349 { 349 {
350 struct rq *rq; 350 struct rq *rq;
351 351
352 local_irq_disable(); 352 local_irq_disable();
353 rq = this_rq(); 353 rq = this_rq();
354 raw_spin_lock(&rq->lock); 354 raw_spin_lock(&rq->lock);
355 355
356 return rq; 356 return rq;
357 } 357 }
358 358
359 #ifdef CONFIG_SCHED_HRTICK 359 #ifdef CONFIG_SCHED_HRTICK
360 /* 360 /*
361 * Use HR-timers to deliver accurate preemption points. 361 * Use HR-timers to deliver accurate preemption points.
362 * 362 *
363 * It's all a bit involved since we cannot program an hrt while holding the 363 * It's all a bit involved since we cannot program an hrt while holding the
364 * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a 364 * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a
365 * reschedule event. 365 * reschedule event.
366 * 366 *
367 * When we get rescheduled we reprogram the hrtick_timer outside of the 367 * When we get rescheduled we reprogram the hrtick_timer outside of the
368 * rq->lock. 368 * rq->lock.
369 */ 369 */
370 370
371 static void hrtick_clear(struct rq *rq) 371 static void hrtick_clear(struct rq *rq)
372 { 372 {
373 if (hrtimer_active(&rq->hrtick_timer)) 373 if (hrtimer_active(&rq->hrtick_timer))
374 hrtimer_cancel(&rq->hrtick_timer); 374 hrtimer_cancel(&rq->hrtick_timer);
375 } 375 }
376 376
377 /* 377 /*
378 * High-resolution timer tick. 378 * High-resolution timer tick.
379 * Runs from hardirq context with interrupts disabled. 379 * Runs from hardirq context with interrupts disabled.
380 */ 380 */
381 static enum hrtimer_restart hrtick(struct hrtimer *timer) 381 static enum hrtimer_restart hrtick(struct hrtimer *timer)
382 { 382 {
383 struct rq *rq = container_of(timer, struct rq, hrtick_timer); 383 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
384 384
385 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 385 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
386 386
387 raw_spin_lock(&rq->lock); 387 raw_spin_lock(&rq->lock);
388 update_rq_clock(rq); 388 update_rq_clock(rq);
389 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 389 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
390 raw_spin_unlock(&rq->lock); 390 raw_spin_unlock(&rq->lock);
391 391
392 return HRTIMER_NORESTART; 392 return HRTIMER_NORESTART;
393 } 393 }
394 394
395 #ifdef CONFIG_SMP 395 #ifdef CONFIG_SMP
396 /* 396 /*
397 * called from hardirq (IPI) context 397 * called from hardirq (IPI) context
398 */ 398 */
399 static void __hrtick_start(void *arg) 399 static void __hrtick_start(void *arg)
400 { 400 {
401 struct rq *rq = arg; 401 struct rq *rq = arg;
402 402
403 raw_spin_lock(&rq->lock); 403 raw_spin_lock(&rq->lock);
404 hrtimer_restart(&rq->hrtick_timer); 404 hrtimer_restart(&rq->hrtick_timer);
405 rq->hrtick_csd_pending = 0; 405 rq->hrtick_csd_pending = 0;
406 raw_spin_unlock(&rq->lock); 406 raw_spin_unlock(&rq->lock);
407 } 407 }
408 408
409 /* 409 /*
410 * Called to set the hrtick timer state. 410 * Called to set the hrtick timer state.
411 * 411 *
412 * called with rq->lock held and irqs disabled 412 * called with rq->lock held and irqs disabled
413 */ 413 */
414 void hrtick_start(struct rq *rq, u64 delay) 414 void hrtick_start(struct rq *rq, u64 delay)
415 { 415 {
416 struct hrtimer *timer = &rq->hrtick_timer; 416 struct hrtimer *timer = &rq->hrtick_timer;
417 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 417 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
418 418
419 hrtimer_set_expires(timer, time); 419 hrtimer_set_expires(timer, time);
420 420
421 if (rq == this_rq()) { 421 if (rq == this_rq()) {
422 hrtimer_restart(timer); 422 hrtimer_restart(timer);
423 } else if (!rq->hrtick_csd_pending) { 423 } else if (!rq->hrtick_csd_pending) {
424 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); 424 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
425 rq->hrtick_csd_pending = 1; 425 rq->hrtick_csd_pending = 1;
426 } 426 }
427 } 427 }
428 428
429 static int 429 static int
430 hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) 430 hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
431 { 431 {
432 int cpu = (int)(long)hcpu; 432 int cpu = (int)(long)hcpu;
433 433
434 switch (action) { 434 switch (action) {
435 case CPU_UP_CANCELED: 435 case CPU_UP_CANCELED:
436 case CPU_UP_CANCELED_FROZEN: 436 case CPU_UP_CANCELED_FROZEN:
437 case CPU_DOWN_PREPARE: 437 case CPU_DOWN_PREPARE:
438 case CPU_DOWN_PREPARE_FROZEN: 438 case CPU_DOWN_PREPARE_FROZEN:
439 case CPU_DEAD: 439 case CPU_DEAD:
440 case CPU_DEAD_FROZEN: 440 case CPU_DEAD_FROZEN:
441 hrtick_clear(cpu_rq(cpu)); 441 hrtick_clear(cpu_rq(cpu));
442 return NOTIFY_OK; 442 return NOTIFY_OK;
443 } 443 }
444 444
445 return NOTIFY_DONE; 445 return NOTIFY_DONE;
446 } 446 }
447 447
448 static __init void init_hrtick(void) 448 static __init void init_hrtick(void)
449 { 449 {
450 hotcpu_notifier(hotplug_hrtick, 0); 450 hotcpu_notifier(hotplug_hrtick, 0);
451 } 451 }
452 #else 452 #else
453 /* 453 /*
454 * Called to set the hrtick timer state. 454 * Called to set the hrtick timer state.
455 * 455 *
456 * called with rq->lock held and irqs disabled 456 * called with rq->lock held and irqs disabled
457 */ 457 */
458 void hrtick_start(struct rq *rq, u64 delay) 458 void hrtick_start(struct rq *rq, u64 delay)
459 { 459 {
460 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 460 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
461 HRTIMER_MODE_REL_PINNED, 0); 461 HRTIMER_MODE_REL_PINNED, 0);
462 } 462 }
463 463
464 static inline void init_hrtick(void) 464 static inline void init_hrtick(void)
465 { 465 {
466 } 466 }
467 #endif /* CONFIG_SMP */ 467 #endif /* CONFIG_SMP */
468 468
469 static void init_rq_hrtick(struct rq *rq) 469 static void init_rq_hrtick(struct rq *rq)
470 { 470 {
471 #ifdef CONFIG_SMP 471 #ifdef CONFIG_SMP
472 rq->hrtick_csd_pending = 0; 472 rq->hrtick_csd_pending = 0;
473 473
474 rq->hrtick_csd.flags = 0; 474 rq->hrtick_csd.flags = 0;
475 rq->hrtick_csd.func = __hrtick_start; 475 rq->hrtick_csd.func = __hrtick_start;
476 rq->hrtick_csd.info = rq; 476 rq->hrtick_csd.info = rq;
477 #endif 477 #endif
478 478
479 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 479 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
480 rq->hrtick_timer.function = hrtick; 480 rq->hrtick_timer.function = hrtick;
481 } 481 }
482 #else /* CONFIG_SCHED_HRTICK */ 482 #else /* CONFIG_SCHED_HRTICK */
483 static inline void hrtick_clear(struct rq *rq) 483 static inline void hrtick_clear(struct rq *rq)
484 { 484 {
485 } 485 }
486 486
487 static inline void init_rq_hrtick(struct rq *rq) 487 static inline void init_rq_hrtick(struct rq *rq)
488 { 488 {
489 } 489 }
490 490
491 static inline void init_hrtick(void) 491 static inline void init_hrtick(void)
492 { 492 {
493 } 493 }
494 #endif /* CONFIG_SCHED_HRTICK */ 494 #endif /* CONFIG_SCHED_HRTICK */
495 495
496 /* 496 /*
497 * resched_task - mark a task 'to be rescheduled now'. 497 * resched_task - mark a task 'to be rescheduled now'.
498 * 498 *
499 * On UP this means the setting of the need_resched flag, on SMP it 499 * On UP this means the setting of the need_resched flag, on SMP it
500 * might also involve a cross-CPU call to trigger the scheduler on 500 * might also involve a cross-CPU call to trigger the scheduler on
501 * the target CPU. 501 * the target CPU.
502 */ 502 */
503 #ifdef CONFIG_SMP 503 #ifdef CONFIG_SMP
504 504
505 #ifndef tsk_is_polling 505 #ifndef tsk_is_polling
506 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 506 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
507 #endif 507 #endif
508 508
509 void resched_task(struct task_struct *p) 509 void resched_task(struct task_struct *p)
510 { 510 {
511 int cpu; 511 int cpu;
512 512
513 assert_raw_spin_locked(&task_rq(p)->lock); 513 assert_raw_spin_locked(&task_rq(p)->lock);
514 514
515 if (test_tsk_need_resched(p)) 515 if (test_tsk_need_resched(p))
516 return; 516 return;
517 517
518 set_tsk_need_resched(p); 518 set_tsk_need_resched(p);
519 519
520 cpu = task_cpu(p); 520 cpu = task_cpu(p);
521 if (cpu == smp_processor_id()) 521 if (cpu == smp_processor_id())
522 return; 522 return;
523 523
524 /* NEED_RESCHED must be visible before we test polling */ 524 /* NEED_RESCHED must be visible before we test polling */
525 smp_mb(); 525 smp_mb();
526 if (!tsk_is_polling(p)) 526 if (!tsk_is_polling(p))
527 smp_send_reschedule(cpu); 527 smp_send_reschedule(cpu);
528 } 528 }
529 529
530 void resched_cpu(int cpu) 530 void resched_cpu(int cpu)
531 { 531 {
532 struct rq *rq = cpu_rq(cpu); 532 struct rq *rq = cpu_rq(cpu);
533 unsigned long flags; 533 unsigned long flags;
534 534
535 if (!raw_spin_trylock_irqsave(&rq->lock, flags)) 535 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
536 return; 536 return;
537 resched_task(cpu_curr(cpu)); 537 resched_task(cpu_curr(cpu));
538 raw_spin_unlock_irqrestore(&rq->lock, flags); 538 raw_spin_unlock_irqrestore(&rq->lock, flags);
539 } 539 }
540 540
541 #ifdef CONFIG_NO_HZ 541 #ifdef CONFIG_NO_HZ
542 /* 542 /*
543 * In the semi idle case, use the nearest busy cpu for migrating timers 543 * In the semi idle case, use the nearest busy cpu for migrating timers
544 * from an idle cpu. This is good for power-savings. 544 * from an idle cpu. This is good for power-savings.
545 * 545 *
546 * We don't do similar optimization for completely idle system, as 546 * We don't do similar optimization for completely idle system, as
547 * selecting an idle cpu will add more delays to the timers than intended 547 * selecting an idle cpu will add more delays to the timers than intended
548 * (as that cpu's timer base may not be up to date wrt jiffies etc). 548 * (as that cpu's timer base may not be up to date wrt jiffies etc).
549 */ 549 */
550 int get_nohz_timer_target(void) 550 int get_nohz_timer_target(void)
551 { 551 {
552 int cpu = smp_processor_id(); 552 int cpu = smp_processor_id();
553 int i; 553 int i;
554 struct sched_domain *sd; 554 struct sched_domain *sd;
555 555
556 rcu_read_lock(); 556 rcu_read_lock();
557 for_each_domain(cpu, sd) { 557 for_each_domain(cpu, sd) {
558 for_each_cpu(i, sched_domain_span(sd)) { 558 for_each_cpu(i, sched_domain_span(sd)) {
559 if (!idle_cpu(i)) { 559 if (!idle_cpu(i)) {
560 cpu = i; 560 cpu = i;
561 goto unlock; 561 goto unlock;
562 } 562 }
563 } 563 }
564 } 564 }
565 unlock: 565 unlock:
566 rcu_read_unlock(); 566 rcu_read_unlock();
567 return cpu; 567 return cpu;
568 } 568 }
569 /* 569 /*
570 * When add_timer_on() enqueues a timer into the timer wheel of an 570 * When add_timer_on() enqueues a timer into the timer wheel of an
571 * idle CPU then this timer might expire before the next timer event 571 * idle CPU then this timer might expire before the next timer event
572 * which is scheduled to wake up that CPU. In case of a completely 572 * which is scheduled to wake up that CPU. In case of a completely
573 * idle system the next event might even be infinite time into the 573 * idle system the next event might even be infinite time into the
574 * future. wake_up_idle_cpu() ensures that the CPU is woken up and 574 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
575 * leaves the inner idle loop so the newly added timer is taken into 575 * leaves the inner idle loop so the newly added timer is taken into
576 * account when the CPU goes back to idle and evaluates the timer 576 * account when the CPU goes back to idle and evaluates the timer
577 * wheel for the next timer event. 577 * wheel for the next timer event.
578 */ 578 */
579 void wake_up_idle_cpu(int cpu) 579 void wake_up_idle_cpu(int cpu)
580 { 580 {
581 struct rq *rq = cpu_rq(cpu); 581 struct rq *rq = cpu_rq(cpu);
582 582
583 if (cpu == smp_processor_id()) 583 if (cpu == smp_processor_id())
584 return; 584 return;
585 585
586 /* 586 /*
587 * This is safe, as this function is called with the timer 587 * This is safe, as this function is called with the timer
588 * wheel base lock of (cpu) held. When the CPU is on the way 588 * wheel base lock of (cpu) held. When the CPU is on the way
589 * to idle and has not yet set rq->curr to idle then it will 589 * to idle and has not yet set rq->curr to idle then it will
590 * be serialized on the timer wheel base lock and take the new 590 * be serialized on the timer wheel base lock and take the new
591 * timer into account automatically. 591 * timer into account automatically.
592 */ 592 */
593 if (rq->curr != rq->idle) 593 if (rq->curr != rq->idle)
594 return; 594 return;
595 595
596 /* 596 /*
597 * We can set TIF_RESCHED on the idle task of the other CPU 597 * We can set TIF_RESCHED on the idle task of the other CPU
598 * lockless. The worst case is that the other CPU runs the 598 * lockless. The worst case is that the other CPU runs the
599 * idle task through an additional NOOP schedule() 599 * idle task through an additional NOOP schedule()
600 */ 600 */
601 set_tsk_need_resched(rq->idle); 601 set_tsk_need_resched(rq->idle);
602 602
603 /* NEED_RESCHED must be visible before we test polling */ 603 /* NEED_RESCHED must be visible before we test polling */
604 smp_mb(); 604 smp_mb();
605 if (!tsk_is_polling(rq->idle)) 605 if (!tsk_is_polling(rq->idle))
606 smp_send_reschedule(cpu); 606 smp_send_reschedule(cpu);
607 } 607 }
608 608
609 static inline bool got_nohz_idle_kick(void) 609 static inline bool got_nohz_idle_kick(void)
610 { 610 {
611 int cpu = smp_processor_id(); 611 int cpu = smp_processor_id();
612 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); 612 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
613 } 613 }
614 614
615 #else /* CONFIG_NO_HZ */ 615 #else /* CONFIG_NO_HZ */
616 616
617 static inline bool got_nohz_idle_kick(void) 617 static inline bool got_nohz_idle_kick(void)
618 { 618 {
619 return false; 619 return false;
620 } 620 }
621 621
622 #endif /* CONFIG_NO_HZ */ 622 #endif /* CONFIG_NO_HZ */
623 623
624 void sched_avg_update(struct rq *rq) 624 void sched_avg_update(struct rq *rq)
625 { 625 {
626 s64 period = sched_avg_period(); 626 s64 period = sched_avg_period();
627 627
628 while ((s64)(rq->clock - rq->age_stamp) > period) { 628 while ((s64)(rq->clock - rq->age_stamp) > period) {
629 /* 629 /*
630 * Inline assembly required to prevent the compiler 630 * Inline assembly required to prevent the compiler
631 * optimising this loop into a divmod call. 631 * optimising this loop into a divmod call.
632 * See __iter_div_u64_rem() for another example of this. 632 * See __iter_div_u64_rem() for another example of this.
633 */ 633 */
634 asm("" : "+rm" (rq->age_stamp)); 634 asm("" : "+rm" (rq->age_stamp));
635 rq->age_stamp += period; 635 rq->age_stamp += period;
636 rq->rt_avg /= 2; 636 rq->rt_avg /= 2;
637 } 637 }
638 } 638 }
639 639
640 #else /* !CONFIG_SMP */ 640 #else /* !CONFIG_SMP */
641 void resched_task(struct task_struct *p) 641 void resched_task(struct task_struct *p)
642 { 642 {
643 assert_raw_spin_locked(&task_rq(p)->lock); 643 assert_raw_spin_locked(&task_rq(p)->lock);
644 set_tsk_need_resched(p); 644 set_tsk_need_resched(p);
645 } 645 }
646 #endif /* CONFIG_SMP */ 646 #endif /* CONFIG_SMP */
647 647
648 #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 648 #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
649 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) 649 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
650 /* 650 /*
651 * Iterate task_group tree rooted at *from, calling @down when first entering a 651 * Iterate task_group tree rooted at *from, calling @down when first entering a
652 * node and @up when leaving it for the final time. 652 * node and @up when leaving it for the final time.
653 * 653 *
654 * Caller must hold rcu_lock or sufficient equivalent. 654 * Caller must hold rcu_lock or sufficient equivalent.
655 */ 655 */
656 int walk_tg_tree_from(struct task_group *from, 656 int walk_tg_tree_from(struct task_group *from,
657 tg_visitor down, tg_visitor up, void *data) 657 tg_visitor down, tg_visitor up, void *data)
658 { 658 {
659 struct task_group *parent, *child; 659 struct task_group *parent, *child;
660 int ret; 660 int ret;
661 661
662 parent = from; 662 parent = from;
663 663
664 down: 664 down:
665 ret = (*down)(parent, data); 665 ret = (*down)(parent, data);
666 if (ret) 666 if (ret)
667 goto out; 667 goto out;
668 list_for_each_entry_rcu(child, &parent->children, siblings) { 668 list_for_each_entry_rcu(child, &parent->children, siblings) {
669 parent = child; 669 parent = child;
670 goto down; 670 goto down;
671 671
672 up: 672 up:
673 continue; 673 continue;
674 } 674 }
675 ret = (*up)(parent, data); 675 ret = (*up)(parent, data);
676 if (ret || parent == from) 676 if (ret || parent == from)
677 goto out; 677 goto out;
678 678
679 child = parent; 679 child = parent;
680 parent = parent->parent; 680 parent = parent->parent;
681 if (parent) 681 if (parent)
682 goto up; 682 goto up;
683 out: 683 out:
684 return ret; 684 return ret;
685 } 685 }
686 686
687 int tg_nop(struct task_group *tg, void *data) 687 int tg_nop(struct task_group *tg, void *data)
688 { 688 {
689 return 0; 689 return 0;
690 } 690 }
691 #endif 691 #endif
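walk_tg_tree_from() above is a non-recursive depth-first walk: @down runs when a group is first reached, @up when it is left for the last time, and a non-zero return from either visitor aborts the walk; tg_nop() is the stock do-nothing visitor. A sketch of a caller under those rules (count_group()/count_groups_below() are illustrative and assume the scheduler-internal declarations from "sched.h"):

#include <linux/rcupdate.h>
#include "sched.h"	/* struct task_group, walk_tg_tree_from(), tg_nop() */

/* Down-visitor: count each group once, on the way down. */
static int count_group(struct task_group *tg, void *data)
{
	(*(int *)data)++;
	return 0;	/* returning non-zero would abort the walk */
}

/* Count @from and every group below it. */
static int count_groups_below(struct task_group *from)
{
	int count = 0;

	rcu_read_lock();	/* the walk requires rcu_lock or equivalent */
	walk_tg_tree_from(from, count_group, tg_nop, &count);
	rcu_read_unlock();

	return count;
}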
692 692
693 void update_cpu_load(struct rq *this_rq); 693 void update_cpu_load(struct rq *this_rq);
694 694
695 static void set_load_weight(struct task_struct *p) 695 static void set_load_weight(struct task_struct *p)
696 { 696 {
697 int prio = p->static_prio - MAX_RT_PRIO; 697 int prio = p->static_prio - MAX_RT_PRIO;
698 struct load_weight *load = &p->se.load; 698 struct load_weight *load = &p->se.load;
699 699
700 /* 700 /*
701 * SCHED_IDLE tasks get minimal weight: 701 * SCHED_IDLE tasks get minimal weight:
702 */ 702 */
703 if (p->policy == SCHED_IDLE) { 703 if (p->policy == SCHED_IDLE) {
704 load->weight = scale_load(WEIGHT_IDLEPRIO); 704 load->weight = scale_load(WEIGHT_IDLEPRIO);
705 load->inv_weight = WMULT_IDLEPRIO; 705 load->inv_weight = WMULT_IDLEPRIO;
706 return; 706 return;
707 } 707 }
708 708
709 load->weight = scale_load(prio_to_weight[prio]); 709 load->weight = scale_load(prio_to_weight[prio]);
710 load->inv_weight = prio_to_wmult[prio]; 710 load->inv_weight = prio_to_wmult[prio];
711 } 711 }
712 712
713 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 713 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
714 { 714 {
715 update_rq_clock(rq); 715 update_rq_clock(rq);
716 sched_info_queued(p); 716 sched_info_queued(p);
717 p->sched_class->enqueue_task(rq, p, flags); 717 p->sched_class->enqueue_task(rq, p, flags);
718 } 718 }
719 719
720 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 720 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
721 { 721 {
722 update_rq_clock(rq); 722 update_rq_clock(rq);
723 sched_info_dequeued(p); 723 sched_info_dequeued(p);
724 p->sched_class->dequeue_task(rq, p, flags); 724 p->sched_class->dequeue_task(rq, p, flags);
725 } 725 }
726 726
727 void activate_task(struct rq *rq, struct task_struct *p, int flags) 727 void activate_task(struct rq *rq, struct task_struct *p, int flags)
728 { 728 {
729 if (task_contributes_to_load(p)) 729 if (task_contributes_to_load(p))
730 rq->nr_uninterruptible--; 730 rq->nr_uninterruptible--;
731 731
732 enqueue_task(rq, p, flags); 732 enqueue_task(rq, p, flags);
733 } 733 }
734 734
735 void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 735 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
736 { 736 {
737 if (task_contributes_to_load(p)) 737 if (task_contributes_to_load(p))
738 rq->nr_uninterruptible++; 738 rq->nr_uninterruptible++;
739 739
740 dequeue_task(rq, p, flags); 740 dequeue_task(rq, p, flags);
741 } 741 }
742 742
743 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 743 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
744 744
745 /* 745 /*
746 * There are no locks covering percpu hardirq/softirq time. 746 * There are no locks covering percpu hardirq/softirq time.
747 * They are only modified in account_system_vtime, on corresponding CPU 747 * They are only modified in account_system_vtime, on corresponding CPU
748 * with interrupts disabled. So, writes are safe. 748 * with interrupts disabled. So, writes are safe.
749 * They are read and saved off onto struct rq in update_rq_clock(). 749 * They are read and saved off onto struct rq in update_rq_clock().
750 * This may result in another CPU reading this CPU's irq time and can 750 * This may result in another CPU reading this CPU's irq time and can
751 * race with irq/account_system_vtime on this CPU. We would either get old 751 * race with irq/account_system_vtime on this CPU. We would either get old
752 * or new value with a side effect of accounting a slice of irq time to wrong 752 * or new value with a side effect of accounting a slice of irq time to wrong
753 * task when irq is in progress while we read rq->clock. That is a worthy 753 * task when irq is in progress while we read rq->clock. That is a worthy
754 * compromise in place of having locks on each irq in account_system_time. 754 * compromise in place of having locks on each irq in account_system_time.
755 */ 755 */
756 static DEFINE_PER_CPU(u64, cpu_hardirq_time); 756 static DEFINE_PER_CPU(u64, cpu_hardirq_time);
757 static DEFINE_PER_CPU(u64, cpu_softirq_time); 757 static DEFINE_PER_CPU(u64, cpu_softirq_time);
758 758
759 static DEFINE_PER_CPU(u64, irq_start_time); 759 static DEFINE_PER_CPU(u64, irq_start_time);
760 static int sched_clock_irqtime; 760 static int sched_clock_irqtime;
761 761
762 void enable_sched_clock_irqtime(void) 762 void enable_sched_clock_irqtime(void)
763 { 763 {
764 sched_clock_irqtime = 1; 764 sched_clock_irqtime = 1;
765 } 765 }
766 766
767 void disable_sched_clock_irqtime(void) 767 void disable_sched_clock_irqtime(void)
768 { 768 {
769 sched_clock_irqtime = 0; 769 sched_clock_irqtime = 0;
770 } 770 }
771 771
772 #ifndef CONFIG_64BIT 772 #ifndef CONFIG_64BIT
773 static DEFINE_PER_CPU(seqcount_t, irq_time_seq); 773 static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
774 774
775 static inline void irq_time_write_begin(void) 775 static inline void irq_time_write_begin(void)
776 { 776 {
777 __this_cpu_inc(irq_time_seq.sequence); 777 __this_cpu_inc(irq_time_seq.sequence);
778 smp_wmb(); 778 smp_wmb();
779 } 779 }
780 780
781 static inline void irq_time_write_end(void) 781 static inline void irq_time_write_end(void)
782 { 782 {
783 smp_wmb(); 783 smp_wmb();
784 __this_cpu_inc(irq_time_seq.sequence); 784 __this_cpu_inc(irq_time_seq.sequence);
785 } 785 }
786 786
787 static inline u64 irq_time_read(int cpu) 787 static inline u64 irq_time_read(int cpu)
788 { 788 {
789 u64 irq_time; 789 u64 irq_time;
790 unsigned seq; 790 unsigned seq;
791 791
792 do { 792 do {
793 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); 793 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
794 irq_time = per_cpu(cpu_softirq_time, cpu) + 794 irq_time = per_cpu(cpu_softirq_time, cpu) +
795 per_cpu(cpu_hardirq_time, cpu); 795 per_cpu(cpu_hardirq_time, cpu);
796 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); 796 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
797 797
798 return irq_time; 798 return irq_time;
799 } 799 }
800 #else /* CONFIG_64BIT */ 800 #else /* CONFIG_64BIT */
801 static inline void irq_time_write_begin(void) 801 static inline void irq_time_write_begin(void)
802 { 802 {
803 } 803 }
804 804
805 static inline void irq_time_write_end(void) 805 static inline void irq_time_write_end(void)
806 { 806 {
807 } 807 }
808 808
809 static inline u64 irq_time_read(int cpu) 809 static inline u64 irq_time_read(int cpu)
810 { 810 {
811 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); 811 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
812 } 812 }
813 #endif /* CONFIG_64BIT */ 813 #endif /* CONFIG_64BIT */
814 814
815 /* 815 /*
816 * Called before incrementing preempt_count on {soft,}irq_enter 816 * Called before incrementing preempt_count on {soft,}irq_enter
817 * and before decrementing preempt_count on {soft,}irq_exit. 817 * and before decrementing preempt_count on {soft,}irq_exit.
818 */ 818 */
819 void account_system_vtime(struct task_struct *curr) 819 void account_system_vtime(struct task_struct *curr)
820 { 820 {
821 unsigned long flags; 821 unsigned long flags;
822 s64 delta; 822 s64 delta;
823 int cpu; 823 int cpu;
824 824
825 if (!sched_clock_irqtime) 825 if (!sched_clock_irqtime)
826 return; 826 return;
827 827
828 local_irq_save(flags); 828 local_irq_save(flags);
829 829
830 cpu = smp_processor_id(); 830 cpu = smp_processor_id();
831 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); 831 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
832 __this_cpu_add(irq_start_time, delta); 832 __this_cpu_add(irq_start_time, delta);
833 833
834 irq_time_write_begin(); 834 irq_time_write_begin();
835 /* 835 /*
836 * We do not account for softirq time from ksoftirqd here. 836 * We do not account for softirq time from ksoftirqd here.
837 * We want to continue accounting softirq time to ksoftirqd thread 837 * We want to continue accounting softirq time to ksoftirqd thread
838 * in that case, so as not to confuse scheduler with a special task 838 * in that case, so as not to confuse scheduler with a special task
839 * that do not consume any time, but still wants to run. 839 * that do not consume any time, but still wants to run.
840 */ 840 */
841 if (hardirq_count()) 841 if (hardirq_count())
842 __this_cpu_add(cpu_hardirq_time, delta); 842 __this_cpu_add(cpu_hardirq_time, delta);
843 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) 843 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
844 __this_cpu_add(cpu_softirq_time, delta); 844 __this_cpu_add(cpu_softirq_time, delta);
845 845
846 irq_time_write_end(); 846 irq_time_write_end();
847 local_irq_restore(flags); 847 local_irq_restore(flags);
848 } 848 }
849 EXPORT_SYMBOL_GPL(account_system_vtime); 849 EXPORT_SYMBOL_GPL(account_system_vtime);
850 850
851 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 851 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
852 852
853 #ifdef CONFIG_PARAVIRT 853 #ifdef CONFIG_PARAVIRT
854 static inline u64 steal_ticks(u64 steal) 854 static inline u64 steal_ticks(u64 steal)
855 { 855 {
856 if (unlikely(steal > NSEC_PER_SEC)) 856 if (unlikely(steal > NSEC_PER_SEC))
857 return div_u64(steal, TICK_NSEC); 857 return div_u64(steal, TICK_NSEC);
858 858
859 return __iter_div_u64_rem(steal, TICK_NSEC, &steal); 859 return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
860 } 860 }
861 #endif 861 #endif
862 862
863 static void update_rq_clock_task(struct rq *rq, s64 delta) 863 static void update_rq_clock_task(struct rq *rq, s64 delta)
864 { 864 {
865 /* 865 /*
866 * In theory, the compiler should just see 0 here, and optimize out the call 866 * In theory, the compiler should just see 0 here, and optimize out the call
867 * to sched_rt_avg_update. But I don't trust it... 867 * to sched_rt_avg_update. But I don't trust it...
868 */ 868 */
869 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) 869 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
870 s64 steal = 0, irq_delta = 0; 870 s64 steal = 0, irq_delta = 0;
871 #endif 871 #endif
872 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 872 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
873 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; 873 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
874 874
875 /* 875 /*
876 * Since irq_time is only updated on {soft,}irq_exit, we might run into 876 * Since irq_time is only updated on {soft,}irq_exit, we might run into
877 * this case when a previous update_rq_clock() happened inside a 877 * this case when a previous update_rq_clock() happened inside a
878 * {soft,}irq region. 878 * {soft,}irq region.
879 * 879 *
880 * When this happens, we stop ->clock_task and only update the 880 * When this happens, we stop ->clock_task and only update the
881 * prev_irq_time stamp to account for the part that fit, so that a next 881 * prev_irq_time stamp to account for the part that fit, so that a next
882 * update will consume the rest. This ensures ->clock_task is 882 * update will consume the rest. This ensures ->clock_task is
883 * monotonic. 883 * monotonic.
884 * 884 *
885 * It does however cause some slight mis-attribution of {soft,}irq 885 * It does however cause some slight mis-attribution of {soft,}irq
886 * time; a more accurate solution would be to update the irq_time using 886 * time; a more accurate solution would be to update the irq_time using
887 * the current rq->clock timestamp, except that would require using 887 * the current rq->clock timestamp, except that would require using
888 * atomic ops. 888 * atomic ops.
889 */ 889 */
890 if (irq_delta > delta) 890 if (irq_delta > delta)
891 irq_delta = delta; 891 irq_delta = delta;
892 892
893 rq->prev_irq_time += irq_delta; 893 rq->prev_irq_time += irq_delta;
894 delta -= irq_delta; 894 delta -= irq_delta;
895 #endif 895 #endif
896 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING 896 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
897 if (static_branch((&paravirt_steal_rq_enabled))) { 897 if (static_branch((&paravirt_steal_rq_enabled))) {
898 u64 st; 898 u64 st;
899 899
900 steal = paravirt_steal_clock(cpu_of(rq)); 900 steal = paravirt_steal_clock(cpu_of(rq));
901 steal -= rq->prev_steal_time_rq; 901 steal -= rq->prev_steal_time_rq;
902 902
903 if (unlikely(steal > delta)) 903 if (unlikely(steal > delta))
904 steal = delta; 904 steal = delta;
905 905
906 st = steal_ticks(steal); 906 st = steal_ticks(steal);
907 steal = st * TICK_NSEC; 907 steal = st * TICK_NSEC;
908 908
909 rq->prev_steal_time_rq += steal; 909 rq->prev_steal_time_rq += steal;
910 910
911 delta -= steal; 911 delta -= steal;
912 } 912 }
913 #endif 913 #endif
914 914
915 rq->clock_task += delta; 915 rq->clock_task += delta;
916 916
917 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) 917 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
918 if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) 918 if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
919 sched_rt_avg_update(rq, irq_delta + steal); 919 sched_rt_avg_update(rq, irq_delta + steal);
920 #endif 920 #endif
921 } 921 }
922 922
923 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 923 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
924 static int irqtime_account_hi_update(void) 924 static int irqtime_account_hi_update(void)
925 { 925 {
926 u64 *cpustat = kcpustat_this_cpu->cpustat; 926 u64 *cpustat = kcpustat_this_cpu->cpustat;
927 unsigned long flags; 927 unsigned long flags;
928 u64 latest_ns; 928 u64 latest_ns;
929 int ret = 0; 929 int ret = 0;
930 930
931 local_irq_save(flags); 931 local_irq_save(flags);
932 latest_ns = this_cpu_read(cpu_hardirq_time); 932 latest_ns = this_cpu_read(cpu_hardirq_time);
933 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) 933 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
934 ret = 1; 934 ret = 1;
935 local_irq_restore(flags); 935 local_irq_restore(flags);
936 return ret; 936 return ret;
937 } 937 }
938 938
939 static int irqtime_account_si_update(void) 939 static int irqtime_account_si_update(void)
940 { 940 {
941 u64 *cpustat = kcpustat_this_cpu->cpustat; 941 u64 *cpustat = kcpustat_this_cpu->cpustat;
942 unsigned long flags; 942 unsigned long flags;
943 u64 latest_ns; 943 u64 latest_ns;
944 int ret = 0; 944 int ret = 0;
945 945
946 local_irq_save(flags); 946 local_irq_save(flags);
947 latest_ns = this_cpu_read(cpu_softirq_time); 947 latest_ns = this_cpu_read(cpu_softirq_time);
948 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) 948 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
949 ret = 1; 949 ret = 1;
950 local_irq_restore(flags); 950 local_irq_restore(flags);
951 return ret; 951 return ret;
952 } 952 }
953 953
954 #else /* CONFIG_IRQ_TIME_ACCOUNTING */ 954 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
955 955
956 #define sched_clock_irqtime (0) 956 #define sched_clock_irqtime (0)
957 957
958 #endif 958 #endif
959 959
960 void sched_set_stop_task(int cpu, struct task_struct *stop) 960 void sched_set_stop_task(int cpu, struct task_struct *stop)
961 { 961 {
962 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 962 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
963 struct task_struct *old_stop = cpu_rq(cpu)->stop; 963 struct task_struct *old_stop = cpu_rq(cpu)->stop;
964 964
965 if (stop) { 965 if (stop) {
966 /* 966 /*
967 * Make it appear like a SCHED_FIFO task, it's something 967 * Make it appear like a SCHED_FIFO task, it's something
968 * userspace knows about and won't get confused about. 968 * userspace knows about and won't get confused about.
969 * 969 *
970 * Also, it will make PI more or less work without too 970 * Also, it will make PI more or less work without too
971 * much confusion -- but then, stop work should not 971 * much confusion -- but then, stop work should not
972 * rely on PI working anyway. 972 * rely on PI working anyway.
973 */ 973 */
974 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param); 974 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
975 975
976 stop->sched_class = &stop_sched_class; 976 stop->sched_class = &stop_sched_class;
977 } 977 }
978 978
979 cpu_rq(cpu)->stop = stop; 979 cpu_rq(cpu)->stop = stop;
980 980
981 if (old_stop) { 981 if (old_stop) {
982 /* 982 /*
983 * Reset it back to a normal scheduling class so that 983 * Reset it back to a normal scheduling class so that
984 * it can die in pieces. 984 * it can die in pieces.
985 */ 985 */
986 old_stop->sched_class = &rt_sched_class; 986 old_stop->sched_class = &rt_sched_class;
987 } 987 }
988 } 988 }
989 989
990 /* 990 /*
991 * __normal_prio - return the priority that is based on the static prio 991 * __normal_prio - return the priority that is based on the static prio
992 */ 992 */
993 static inline int __normal_prio(struct task_struct *p) 993 static inline int __normal_prio(struct task_struct *p)
994 { 994 {
995 return p->static_prio; 995 return p->static_prio;
996 } 996 }
997 997
998 /* 998 /*
999 * Calculate the expected normal priority: i.e. priority 999 * Calculate the expected normal priority: i.e. priority
1000 * without taking RT-inheritance into account. Might be 1000 * without taking RT-inheritance into account. Might be
1001 * boosted by interactivity modifiers. Changes upon fork, 1001 * boosted by interactivity modifiers. Changes upon fork,
1002 * setprio syscalls, and whenever the interactivity 1002 * setprio syscalls, and whenever the interactivity
1003 * estimator recalculates. 1003 * estimator recalculates.
1004 */ 1004 */
1005 static inline int normal_prio(struct task_struct *p) 1005 static inline int normal_prio(struct task_struct *p)
1006 { 1006 {
1007 int prio; 1007 int prio;
1008 1008
1009 if (task_has_rt_policy(p)) 1009 if (task_has_rt_policy(p))
1010 prio = MAX_RT_PRIO-1 - p->rt_priority; 1010 prio = MAX_RT_PRIO-1 - p->rt_priority;
1011 else 1011 else
1012 prio = __normal_prio(p); 1012 prio = __normal_prio(p);
1013 return prio; 1013 return prio;
1014 } 1014 }
1015 1015
1016 /* 1016 /*
1017 * Calculate the current priority, i.e. the priority 1017 * Calculate the current priority, i.e. the priority
1018 * taken into account by the scheduler. This value might 1018 * taken into account by the scheduler. This value might
1019 * be boosted by RT tasks, or might be boosted by 1019 * be boosted by RT tasks, or might be boosted by
1020 * interactivity modifiers. Will be RT if the task got 1020 * interactivity modifiers. Will be RT if the task got
1021 * RT-boosted. If not, it returns p->normal_prio. 1021 * RT-boosted. If not, it returns p->normal_prio.
1022 */ 1022 */
1023 static int effective_prio(struct task_struct *p) 1023 static int effective_prio(struct task_struct *p)
1024 { 1024 {
1025 p->normal_prio = normal_prio(p); 1025 p->normal_prio = normal_prio(p);
1026 /* 1026 /*
1027 * If we are an RT task or were boosted to RT priority, 1027 * If we are an RT task or were boosted to RT priority,
1028 * keep the priority unchanged. Otherwise, update priority 1028 * keep the priority unchanged. Otherwise, update priority
1029 * to the normal priority: 1029 * to the normal priority:
1030 */ 1030 */
1031 if (!rt_prio(p->prio)) 1031 if (!rt_prio(p->prio))
1032 return p->normal_prio; 1032 return p->normal_prio;
1033 return p->prio; 1033 return p->prio;
1034 } 1034 }
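For reference, the mapping implemented by __normal_prio()/normal_prio() can be sketched as a small standalone program. The MAX_RT_PRIO and nice-offset constants below are assumptions matching the conventional kernel values, not taken from this file.

/* Editor's sketch of the priority mapping above: RT tasks map to 0..MAX_RT_PRIO-1
 * (lower number = higher priority), everything else keeps its nice-derived
 * static_prio. Constants are assumed, not read from this source. */
#include <stdio.h>

#define MAX_RT_PRIO		100		/* assumed conventional value */
#define NICE_TO_PRIO(nice)	((nice) + 120)	/* assumed: MAX_RT_PRIO + nice + 20 */

static int sketch_normal_prio(int has_rt_policy, int rt_priority, int nice)
{
	if (has_rt_policy)
		return MAX_RT_PRIO - 1 - rt_priority;	/* e.g. rt_priority 50 -> 49 */
	return NICE_TO_PRIO(nice);			/* e.g. nice 0 -> 120 */
}

int main(void)
{
	printf("FIFO/50 -> %d, nice 0 -> %d\n",
	       sketch_normal_prio(1, 50, 0), sketch_normal_prio(0, 0, 0));
	return 0;
}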
1035 1035
1036 /** 1036 /**
1037 * task_curr - is this task currently executing on a CPU? 1037 * task_curr - is this task currently executing on a CPU?
1038 * @p: the task in question. 1038 * @p: the task in question.
1039 */ 1039 */
1040 inline int task_curr(const struct task_struct *p) 1040 inline int task_curr(const struct task_struct *p)
1041 { 1041 {
1042 return cpu_curr(task_cpu(p)) == p; 1042 return cpu_curr(task_cpu(p)) == p;
1043 } 1043 }
1044 1044
1045 static inline void check_class_changed(struct rq *rq, struct task_struct *p, 1045 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1046 const struct sched_class *prev_class, 1046 const struct sched_class *prev_class,
1047 int oldprio) 1047 int oldprio)
1048 { 1048 {
1049 if (prev_class != p->sched_class) { 1049 if (prev_class != p->sched_class) {
1050 if (prev_class->switched_from) 1050 if (prev_class->switched_from)
1051 prev_class->switched_from(rq, p); 1051 prev_class->switched_from(rq, p);
1052 p->sched_class->switched_to(rq, p); 1052 p->sched_class->switched_to(rq, p);
1053 } else if (oldprio != p->prio) 1053 } else if (oldprio != p->prio)
1054 p->sched_class->prio_changed(rq, p, oldprio); 1054 p->sched_class->prio_changed(rq, p, oldprio);
1055 } 1055 }
1056 1056
1057 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 1057 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
1058 { 1058 {
1059 const struct sched_class *class; 1059 const struct sched_class *class;
1060 1060
1061 if (p->sched_class == rq->curr->sched_class) { 1061 if (p->sched_class == rq->curr->sched_class) {
1062 rq->curr->sched_class->check_preempt_curr(rq, p, flags); 1062 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
1063 } else { 1063 } else {
1064 for_each_class(class) { 1064 for_each_class(class) {
1065 if (class == rq->curr->sched_class) 1065 if (class == rq->curr->sched_class)
1066 break; 1066 break;
1067 if (class == p->sched_class) { 1067 if (class == p->sched_class) {
1068 resched_task(rq->curr); 1068 resched_task(rq->curr);
1069 break; 1069 break;
1070 } 1070 }
1071 } 1071 }
1072 } 1072 }
1073 1073
1074 /* 1074 /*
1075 * A queue event has occurred, and we're going to schedule. In 1075 * A queue event has occurred, and we're going to schedule. In
1076 * this case, we can save a useless back-to-back clock update. 1076 * this case, we can save a useless back-to-back clock update.
1077 */ 1077 */
1078 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) 1078 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
1079 rq->skip_clock_update = 1; 1079 rq->skip_clock_update = 1;
1080 } 1080 }
1081 1081
1082 #ifdef CONFIG_SMP 1082 #ifdef CONFIG_SMP
1083 void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 1083 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1084 { 1084 {
1085 #ifdef CONFIG_SCHED_DEBUG 1085 #ifdef CONFIG_SCHED_DEBUG
1086 /* 1086 /*
1087 * We should never call set_task_cpu() on a blocked task, 1087 * We should never call set_task_cpu() on a blocked task,
1088 * ttwu() will sort out the placement. 1088 * ttwu() will sort out the placement.
1089 */ 1089 */
1090 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 1090 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
1091 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 1091 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
1092 1092
1093 #ifdef CONFIG_LOCKDEP 1093 #ifdef CONFIG_LOCKDEP
1094 /* 1094 /*
1095 * The caller should hold either p->pi_lock or rq->lock, when changing 1095 * The caller should hold either p->pi_lock or rq->lock, when changing
1096 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. 1096 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
1097 * 1097 *
1098 * sched_move_task() holds both and thus holding either pins the cgroup, 1098 * sched_move_task() holds both and thus holding either pins the cgroup,
1099 * see set_task_rq(). 1099 * see set_task_rq().
1100 * 1100 *
1101 * Furthermore, all task_rq users should acquire both locks, see 1101 * Furthermore, all task_rq users should acquire both locks, see
1102 * task_rq_lock(). 1102 * task_rq_lock().
1103 */ 1103 */
1104 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || 1104 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
1105 lockdep_is_held(&task_rq(p)->lock))); 1105 lockdep_is_held(&task_rq(p)->lock)));
1106 #endif 1106 #endif
1107 #endif 1107 #endif
1108 1108
1109 trace_sched_migrate_task(p, new_cpu); 1109 trace_sched_migrate_task(p, new_cpu);
1110 1110
1111 if (task_cpu(p) != new_cpu) { 1111 if (task_cpu(p) != new_cpu) {
1112 p->se.nr_migrations++; 1112 p->se.nr_migrations++;
1113 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); 1113 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
1114 } 1114 }
1115 1115
1116 __set_task_cpu(p, new_cpu); 1116 __set_task_cpu(p, new_cpu);
1117 } 1117 }
1118 1118
1119 struct migration_arg { 1119 struct migration_arg {
1120 struct task_struct *task; 1120 struct task_struct *task;
1121 int dest_cpu; 1121 int dest_cpu;
1122 }; 1122 };
1123 1123
1124 static int migration_cpu_stop(void *data); 1124 static int migration_cpu_stop(void *data);
1125 1125
1126 /* 1126 /*
1127 * wait_task_inactive - wait for a thread to unschedule. 1127 * wait_task_inactive - wait for a thread to unschedule.
1128 * 1128 *
1129 * If @match_state is nonzero, it's the @p->state value just checked and 1129 * If @match_state is nonzero, it's the @p->state value just checked and
1130 * not expected to change. If it changes, i.e. @p might have woken up, 1130 * not expected to change. If it changes, i.e. @p might have woken up,
1131 * then return zero. When we succeed in waiting for @p to be off its CPU, 1131 * then return zero. When we succeed in waiting for @p to be off its CPU,
1132 * we return a positive number (its total switch count). If a second call 1132 * we return a positive number (its total switch count). If a second call
1133 * a short while later returns the same number, the caller can be sure that 1133 * a short while later returns the same number, the caller can be sure that
1134 * @p has remained unscheduled the whole time. 1134 * @p has remained unscheduled the whole time.
1135 * 1135 *
1136 * The caller must ensure that the task *will* unschedule sometime soon, 1136 * The caller must ensure that the task *will* unschedule sometime soon,
1137 * else this function might spin for a *long* time. This function can't 1137 * else this function might spin for a *long* time. This function can't
1138 * be called with interrupts off, or it may introduce deadlock with 1138 * be called with interrupts off, or it may introduce deadlock with
1139 * smp_call_function() if an IPI is sent by the same process we are 1139 * smp_call_function() if an IPI is sent by the same process we are
1140 * waiting to become inactive. 1140 * waiting to become inactive.
1141 */ 1141 */
1142 unsigned long wait_task_inactive(struct task_struct *p, long match_state) 1142 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1143 { 1143 {
1144 unsigned long flags; 1144 unsigned long flags;
1145 int running, on_rq; 1145 int running, on_rq;
1146 unsigned long ncsw; 1146 unsigned long ncsw;
1147 struct rq *rq; 1147 struct rq *rq;
1148 1148
1149 for (;;) { 1149 for (;;) {
1150 /* 1150 /*
1151 * We do the initial early heuristics without holding 1151 * We do the initial early heuristics without holding
1152 * any task-queue locks at all. We'll only try to get 1152 * any task-queue locks at all. We'll only try to get
1153 * the runqueue lock when things look like they will 1153 * the runqueue lock when things look like they will
1154 * work out! 1154 * work out!
1155 */ 1155 */
1156 rq = task_rq(p); 1156 rq = task_rq(p);
1157 1157
1158 /* 1158 /*
1159 * If the task is actively running on another CPU 1159 * If the task is actively running on another CPU
1160 * still, just relax and busy-wait without holding 1160 * still, just relax and busy-wait without holding
1161 * any locks. 1161 * any locks.
1162 * 1162 *
1163 * NOTE! Since we don't hold any locks, it's not 1163 * NOTE! Since we don't hold any locks, it's not
1164 * even certain that "rq" is still the right runqueue! 1164 * even certain that "rq" is still the right runqueue!
1165 * But we don't care, since "task_running()" will 1165 * But we don't care, since "task_running()" will
1166 * return false if the runqueue has changed and p 1166 * return false if the runqueue has changed and p
1167 * is actually now running somewhere else! 1167 * is actually now running somewhere else!
1168 */ 1168 */
1169 while (task_running(rq, p)) { 1169 while (task_running(rq, p)) {
1170 if (match_state && unlikely(p->state != match_state)) 1170 if (match_state && unlikely(p->state != match_state))
1171 return 0; 1171 return 0;
1172 cpu_relax(); 1172 cpu_relax();
1173 } 1173 }
1174 1174
1175 /* 1175 /*
1176 * Ok, time to look more closely! We need the rq 1176 * Ok, time to look more closely! We need the rq
1177 * lock now, to be *sure*. If we're wrong, we'll 1177 * lock now, to be *sure*. If we're wrong, we'll
1178 * just go back and repeat. 1178 * just go back and repeat.
1179 */ 1179 */
1180 rq = task_rq_lock(p, &flags); 1180 rq = task_rq_lock(p, &flags);
1181 trace_sched_wait_task(p); 1181 trace_sched_wait_task(p);
1182 running = task_running(rq, p); 1182 running = task_running(rq, p);
1183 on_rq = p->on_rq; 1183 on_rq = p->on_rq;
1184 ncsw = 0; 1184 ncsw = 0;
1185 if (!match_state || p->state == match_state) 1185 if (!match_state || p->state == match_state)
1186 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 1186 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
1187 task_rq_unlock(rq, p, &flags); 1187 task_rq_unlock(rq, p, &flags);
1188 1188
1189 /* 1189 /*
1190 * If it changed from the expected state, bail out now. 1190 * If it changed from the expected state, bail out now.
1191 */ 1191 */
1192 if (unlikely(!ncsw)) 1192 if (unlikely(!ncsw))
1193 break; 1193 break;
1194 1194
1195 /* 1195 /*
1196 * Was it really running after all now that we 1196 * Was it really running after all now that we
1197 * checked with the proper locks actually held? 1197 * checked with the proper locks actually held?
1198 * 1198 *
1199 * Oops. Go back and try again.. 1199 * Oops. Go back and try again..
1200 */ 1200 */
1201 if (unlikely(running)) { 1201 if (unlikely(running)) {
1202 cpu_relax(); 1202 cpu_relax();
1203 continue; 1203 continue;
1204 } 1204 }
1205 1205
1206 /* 1206 /*
1207 * It's not enough that it's not actively running, 1207 * It's not enough that it's not actively running,
1208 * it must be off the runqueue _entirely_, and not 1208 * it must be off the runqueue _entirely_, and not
1209 * preempted! 1209 * preempted!
1210 * 1210 *
1211 * So if it was still runnable (but just not actively 1211 * So if it was still runnable (but just not actively
1212 * running right now), it's preempted, and we should 1212 * running right now), it's preempted, and we should
1213 * yield - it could be a while. 1213 * yield - it could be a while.
1214 */ 1214 */
1215 if (unlikely(on_rq)) { 1215 if (unlikely(on_rq)) {
1216 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); 1216 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
1217 1217
1218 set_current_state(TASK_UNINTERRUPTIBLE); 1218 set_current_state(TASK_UNINTERRUPTIBLE);
1219 schedule_hrtimeout(&to, HRTIMER_MODE_REL); 1219 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
1220 continue; 1220 continue;
1221 } 1221 }
1222 1222
1223 /* 1223 /*
1224 * Ahh, all good. It wasn't running, and it wasn't 1224 * Ahh, all good. It wasn't running, and it wasn't
1225 * runnable, which means that it will never become 1225 * runnable, which means that it will never become
1226 * running in the future either. We're all done! 1226 * running in the future either. We're all done!
1227 */ 1227 */
1228 break; 1228 break;
1229 } 1229 }
1230 1230
1231 return ncsw; 1231 return ncsw;
1232 } 1232 }
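The `p->nvcsw | LONG_MIN` line above guarantees a nonzero return even for a task that has never switched, so callers can reserve 0 for "state changed". A minimal standalone sketch of that convention and of the two-sample caller pattern the comment describes (names here are illustrative, not kernel API):

#include <stdio.h>
#include <limits.h>

/* Editor's sketch: OR-ing in LONG_MIN sets the MSB, so even a switch count of 0
 * is distinguishable from the "state changed, bail out" return value of 0. */
static unsigned long encode_ncsw(unsigned long nvcsw)
{
	return nvcsw | LONG_MIN;
}

int main(void)
{
	unsigned long first = encode_ncsw(0);	/* first wait_task_inactive()-style sample */
	unsigned long again = encode_ncsw(0);	/* second sample a short while later */

	if (!first)
		printf("state changed, task may have woken up\n");
	else if (first == again)
		printf("task stayed unscheduled between the two samples\n");
	return 0;
}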
1233 1233
1234 /*** 1234 /***
1235 * kick_process - kick a running thread to enter/exit the kernel 1235 * kick_process - kick a running thread to enter/exit the kernel
1236 * @p: the to-be-kicked thread 1236 * @p: the to-be-kicked thread
1237 * 1237 *
1238 * Cause a process which is running on another CPU to enter 1238 * Cause a process which is running on another CPU to enter
1239 * kernel-mode, without any delay. (to get signals handled.) 1239 * kernel-mode, without any delay. (to get signals handled.)
1240 * 1240 *
1241 * NOTE: this function doesn't have to take the runqueue lock, 1241 * NOTE: this function doesn't have to take the runqueue lock,
1242 * because all it wants to ensure is that the remote task enters 1242 * because all it wants to ensure is that the remote task enters
1243 * the kernel. If the IPI races and the task has been migrated 1243 * the kernel. If the IPI races and the task has been migrated
1244 * to another CPU then no harm is done and the purpose has been 1244 * to another CPU then no harm is done and the purpose has been
1245 * achieved as well. 1245 * achieved as well.
1246 */ 1246 */
1247 void kick_process(struct task_struct *p) 1247 void kick_process(struct task_struct *p)
1248 { 1248 {
1249 int cpu; 1249 int cpu;
1250 1250
1251 preempt_disable(); 1251 preempt_disable();
1252 cpu = task_cpu(p); 1252 cpu = task_cpu(p);
1253 if ((cpu != smp_processor_id()) && task_curr(p)) 1253 if ((cpu != smp_processor_id()) && task_curr(p))
1254 smp_send_reschedule(cpu); 1254 smp_send_reschedule(cpu);
1255 preempt_enable(); 1255 preempt_enable();
1256 } 1256 }
1257 EXPORT_SYMBOL_GPL(kick_process); 1257 EXPORT_SYMBOL_GPL(kick_process);
1258 #endif /* CONFIG_SMP */ 1258 #endif /* CONFIG_SMP */
1259 1259
1260 #ifdef CONFIG_SMP 1260 #ifdef CONFIG_SMP
1261 /* 1261 /*
1262 * ->cpus_allowed is protected by both rq->lock and p->pi_lock 1262 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
1263 */ 1263 */
1264 static int select_fallback_rq(int cpu, struct task_struct *p) 1264 static int select_fallback_rq(int cpu, struct task_struct *p)
1265 { 1265 {
1266 int dest_cpu; 1266 int dest_cpu;
1267 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); 1267 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
1268 1268
1269 /* Look for allowed, online CPU in same node. */ 1269 /* Look for allowed, online CPU in same node. */
1270 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) 1270 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
1271 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 1271 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1272 return dest_cpu; 1272 return dest_cpu;
1273 1273
1274 /* Any allowed, online CPU? */ 1274 /* Any allowed, online CPU? */
1275 dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask); 1275 dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
1276 if (dest_cpu < nr_cpu_ids) 1276 if (dest_cpu < nr_cpu_ids)
1277 return dest_cpu; 1277 return dest_cpu;
1278 1278
1279 /* No more Mr. Nice Guy. */ 1279 /* No more Mr. Nice Guy. */
1280 dest_cpu = cpuset_cpus_allowed_fallback(p); 1280 dest_cpu = cpuset_cpus_allowed_fallback(p);
1281 /* 1281 /*
1282 * Don't tell them about moving exiting tasks or 1282 * Don't tell them about moving exiting tasks or
1283 * kernel threads (both mm NULL), since they never 1283 * kernel threads (both mm NULL), since they never
1284 * leave the kernel. 1284 * leave the kernel.
1285 */ 1285 */
1286 if (p->mm && printk_ratelimit()) { 1286 if (p->mm && printk_ratelimit()) {
1287 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", 1287 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
1288 task_pid_nr(p), p->comm, cpu); 1288 task_pid_nr(p), p->comm, cpu);
1289 } 1289 }
1290 1290
1291 return dest_cpu; 1291 return dest_cpu;
1292 } 1292 }
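A compact way to see the fallback order above is on plain bitmasks: prefer an allowed CPU in the victim's node, then any allowed active CPU, then take whatever is active (the last tier standing in for cpuset_cpus_allowed_fallback()). A hedged sketch, not the kernel cpumask API:

#include <stdio.h>
#include <stdint.h>

static int first_cpu(uint64_t mask)
{
	return mask ? __builtin_ctzll(mask) : -1;	/* lowest set bit, or -1 */
}

static int pick_fallback_cpu(uint64_t allowed, uint64_t active, uint64_t same_node)
{
	int cpu;

	cpu = first_cpu(allowed & active & same_node);	/* allowed, online, same node */
	if (cpu >= 0)
		return cpu;
	cpu = first_cpu(allowed & active);		/* any allowed, online CPU */
	if (cpu >= 0)
		return cpu;
	return first_cpu(active);			/* no more Mr. Nice Guy */
}

int main(void)
{
	/* allowed = {2,3}, active = {0,1,3}, node of the dead CPU = {0,1} -> CPU 3 */
	printf("fallback cpu: %d\n", pick_fallback_cpu(0xc, 0xb, 0x3));
	return 0;
}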
1293 1293
1294 /* 1294 /*
1295 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 1295 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
1296 */ 1296 */
1297 static inline 1297 static inline
1298 int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 1298 int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
1299 { 1299 {
1300 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 1300 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
1301 1301
1302 /* 1302 /*
1303 * In order not to call set_task_cpu() on a blocking task we need 1303 * In order not to call set_task_cpu() on a blocking task we need
1304 * to rely on ttwu() to place the task on a valid ->cpus_allowed 1304 * to rely on ttwu() to place the task on a valid ->cpus_allowed
1305 * cpu. 1305 * cpu.
1306 * 1306 *
1307 * Since this is common to all placement strategies, this lives here. 1307 * Since this is common to all placement strategies, this lives here.
1308 * 1308 *
1309 * [ this allows ->select_task_rq() to simply return task_cpu(p) and 1309 * [ this allows ->select_task_rq() to simply return task_cpu(p) and
1310 * not worry about this generic constraint ] 1310 * not worry about this generic constraint ]
1311 */ 1311 */
1312 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || 1312 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
1313 !cpu_online(cpu))) 1313 !cpu_online(cpu)))
1314 cpu = select_fallback_rq(task_cpu(p), p); 1314 cpu = select_fallback_rq(task_cpu(p), p);
1315 1315
1316 return cpu; 1316 return cpu;
1317 } 1317 }
1318 1318
1319 static void update_avg(u64 *avg, u64 sample) 1319 static void update_avg(u64 *avg, u64 sample)
1320 { 1320 {
1321 s64 diff = sample - *avg; 1321 s64 diff = sample - *avg;
1322 *avg += diff >> 3; 1322 *avg += diff >> 3;
1323 } 1323 }
1324 #endif 1324 #endif
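update_avg() above is an exponential moving average with weight 1/8 (the `diff >> 3`); the same arithmetic as a standalone sketch:

#include <stdio.h>
#include <stdint.h>

/* Editor's sketch: avg += (sample - avg) / 8, matching update_avg() above. */
static void update_avg(uint64_t *avg, uint64_t sample)
{
	int64_t diff = sample - *avg;

	*avg += diff >> 3;
}

int main(void)
{
	uint64_t avg = 0;
	uint64_t samples[] = { 800, 800, 800, 800, 100 };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		update_avg(&avg, samples[i]);
		printf("sample=%llu avg=%llu\n",
		       (unsigned long long)samples[i], (unsigned long long)avg);
	}
	return 0;
}

This is the same smoothing ttwu_do_wakeup() applies to rq->avg_idle further down, clamped there to 2*sysctl_sched_migration_cost.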
1325 1325
1326 static void 1326 static void
1327 ttwu_stat(struct task_struct *p, int cpu, int wake_flags) 1327 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1328 { 1328 {
1329 #ifdef CONFIG_SCHEDSTATS 1329 #ifdef CONFIG_SCHEDSTATS
1330 struct rq *rq = this_rq(); 1330 struct rq *rq = this_rq();
1331 1331
1332 #ifdef CONFIG_SMP 1332 #ifdef CONFIG_SMP
1333 int this_cpu = smp_processor_id(); 1333 int this_cpu = smp_processor_id();
1334 1334
1335 if (cpu == this_cpu) { 1335 if (cpu == this_cpu) {
1336 schedstat_inc(rq, ttwu_local); 1336 schedstat_inc(rq, ttwu_local);
1337 schedstat_inc(p, se.statistics.nr_wakeups_local); 1337 schedstat_inc(p, se.statistics.nr_wakeups_local);
1338 } else { 1338 } else {
1339 struct sched_domain *sd; 1339 struct sched_domain *sd;
1340 1340
1341 schedstat_inc(p, se.statistics.nr_wakeups_remote); 1341 schedstat_inc(p, se.statistics.nr_wakeups_remote);
1342 rcu_read_lock(); 1342 rcu_read_lock();
1343 for_each_domain(this_cpu, sd) { 1343 for_each_domain(this_cpu, sd) {
1344 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { 1344 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
1345 schedstat_inc(sd, ttwu_wake_remote); 1345 schedstat_inc(sd, ttwu_wake_remote);
1346 break; 1346 break;
1347 } 1347 }
1348 } 1348 }
1349 rcu_read_unlock(); 1349 rcu_read_unlock();
1350 } 1350 }
1351 1351
1352 if (wake_flags & WF_MIGRATED) 1352 if (wake_flags & WF_MIGRATED)
1353 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 1353 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
1354 1354
1355 #endif /* CONFIG_SMP */ 1355 #endif /* CONFIG_SMP */
1356 1356
1357 schedstat_inc(rq, ttwu_count); 1357 schedstat_inc(rq, ttwu_count);
1358 schedstat_inc(p, se.statistics.nr_wakeups); 1358 schedstat_inc(p, se.statistics.nr_wakeups);
1359 1359
1360 if (wake_flags & WF_SYNC) 1360 if (wake_flags & WF_SYNC)
1361 schedstat_inc(p, se.statistics.nr_wakeups_sync); 1361 schedstat_inc(p, se.statistics.nr_wakeups_sync);
1362 1362
1363 #endif /* CONFIG_SCHEDSTATS */ 1363 #endif /* CONFIG_SCHEDSTATS */
1364 } 1364 }
1365 1365
1366 static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) 1366 static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1367 { 1367 {
1368 activate_task(rq, p, en_flags); 1368 activate_task(rq, p, en_flags);
1369 p->on_rq = 1; 1369 p->on_rq = 1;
1370 1370
1371 /* if a worker is waking up, notify workqueue */ 1371 /* if a worker is waking up, notify workqueue */
1372 if (p->flags & PF_WQ_WORKER) 1372 if (p->flags & PF_WQ_WORKER)
1373 wq_worker_waking_up(p, cpu_of(rq)); 1373 wq_worker_waking_up(p, cpu_of(rq));
1374 } 1374 }
1375 1375
1376 /* 1376 /*
1377 * Mark the task runnable and perform wakeup-preemption. 1377 * Mark the task runnable and perform wakeup-preemption.
1378 */ 1378 */
1379 static void 1379 static void
1380 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) 1380 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1381 { 1381 {
1382 trace_sched_wakeup(p, true); 1382 trace_sched_wakeup(p, true);
1383 check_preempt_curr(rq, p, wake_flags); 1383 check_preempt_curr(rq, p, wake_flags);
1384 1384
1385 p->state = TASK_RUNNING; 1385 p->state = TASK_RUNNING;
1386 #ifdef CONFIG_SMP 1386 #ifdef CONFIG_SMP
1387 if (p->sched_class->task_woken) 1387 if (p->sched_class->task_woken)
1388 p->sched_class->task_woken(rq, p); 1388 p->sched_class->task_woken(rq, p);
1389 1389
1390 if (rq->idle_stamp) { 1390 if (rq->idle_stamp) {
1391 u64 delta = rq->clock - rq->idle_stamp; 1391 u64 delta = rq->clock - rq->idle_stamp;
1392 u64 max = 2*sysctl_sched_migration_cost; 1392 u64 max = 2*sysctl_sched_migration_cost;
1393 1393
1394 if (delta > max) 1394 if (delta > max)
1395 rq->avg_idle = max; 1395 rq->avg_idle = max;
1396 else 1396 else
1397 update_avg(&rq->avg_idle, delta); 1397 update_avg(&rq->avg_idle, delta);
1398 rq->idle_stamp = 0; 1398 rq->idle_stamp = 0;
1399 } 1399 }
1400 #endif 1400 #endif
1401 } 1401 }
1402 1402
1403 static void 1403 static void
1404 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) 1404 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
1405 { 1405 {
1406 #ifdef CONFIG_SMP 1406 #ifdef CONFIG_SMP
1407 if (p->sched_contributes_to_load) 1407 if (p->sched_contributes_to_load)
1408 rq->nr_uninterruptible--; 1408 rq->nr_uninterruptible--;
1409 #endif 1409 #endif
1410 1410
1411 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); 1411 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
1412 ttwu_do_wakeup(rq, p, wake_flags); 1412 ttwu_do_wakeup(rq, p, wake_flags);
1413 } 1413 }
1414 1414
1415 /* 1415 /*
1416 * Called in case the task @p isn't fully descheduled from its runqueue; 1416 * Called in case the task @p isn't fully descheduled from its runqueue;
1417 * in this case we must do a remote wakeup. It's a 'light' wakeup though: 1417 * in this case we must do a remote wakeup. It's a 'light' wakeup though:
1418 * all we need to do is flip p->state to TASK_RUNNING, since 1418 * all we need to do is flip p->state to TASK_RUNNING, since
1419 * the task is still ->on_rq. 1419 * the task is still ->on_rq.
1420 */ 1420 */
1421 static int ttwu_remote(struct task_struct *p, int wake_flags) 1421 static int ttwu_remote(struct task_struct *p, int wake_flags)
1422 { 1422 {
1423 struct rq *rq; 1423 struct rq *rq;
1424 int ret = 0; 1424 int ret = 0;
1425 1425
1426 rq = __task_rq_lock(p); 1426 rq = __task_rq_lock(p);
1427 if (p->on_rq) { 1427 if (p->on_rq) {
1428 ttwu_do_wakeup(rq, p, wake_flags); 1428 ttwu_do_wakeup(rq, p, wake_flags);
1429 ret = 1; 1429 ret = 1;
1430 } 1430 }
1431 __task_rq_unlock(rq); 1431 __task_rq_unlock(rq);
1432 1432
1433 return ret; 1433 return ret;
1434 } 1434 }
1435 1435
1436 #ifdef CONFIG_SMP 1436 #ifdef CONFIG_SMP
1437 static void sched_ttwu_pending(void) 1437 static void sched_ttwu_pending(void)
1438 { 1438 {
1439 struct rq *rq = this_rq(); 1439 struct rq *rq = this_rq();
1440 struct llist_node *llist = llist_del_all(&rq->wake_list); 1440 struct llist_node *llist = llist_del_all(&rq->wake_list);
1441 struct task_struct *p; 1441 struct task_struct *p;
1442 1442
1443 raw_spin_lock(&rq->lock); 1443 raw_spin_lock(&rq->lock);
1444 1444
1445 while (llist) { 1445 while (llist) {
1446 p = llist_entry(llist, struct task_struct, wake_entry); 1446 p = llist_entry(llist, struct task_struct, wake_entry);
1447 llist = llist_next(llist); 1447 llist = llist_next(llist);
1448 ttwu_do_activate(rq, p, 0); 1448 ttwu_do_activate(rq, p, 0);
1449 } 1449 }
1450 1450
1451 raw_spin_unlock(&rq->lock); 1451 raw_spin_unlock(&rq->lock);
1452 } 1452 }
1453 1453
1454 void scheduler_ipi(void) 1454 void scheduler_ipi(void)
1455 { 1455 {
1456 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) 1456 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
1457 return; 1457 return;
1458 1458
1459 /* 1459 /*
1460 * Not all reschedule IPI handlers call irq_enter/irq_exit, since 1460 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
1461 * traditionally all their work was done from the interrupt return 1461 * traditionally all their work was done from the interrupt return
1462 * path. Now that we actually do some work, we need to make sure 1462 * path. Now that we actually do some work, we need to make sure
1463 * we do call them. 1463 * we do call them.
1464 * 1464 *
1465 * Some archs already do call them; luckily irq_enter/exit nest 1465 * Some archs already do call them; luckily irq_enter/exit nest
1466 * properly. 1466 * properly.
1467 * 1467 *
1468 * Arguably we should visit all archs and update all handlers, 1468 * Arguably we should visit all archs and update all handlers,
1469 * however, a fair share of IPIs are still resched-only, so this would 1469 * however, a fair share of IPIs are still resched-only, so this would
1470 * somewhat pessimize the simple resched case. 1470 * somewhat pessimize the simple resched case.
1471 */ 1471 */
1472 irq_enter(); 1472 irq_enter();
1473 sched_ttwu_pending(); 1473 sched_ttwu_pending();
1474 1474
1475 /* 1475 /*
1476 * Check if someone kicked us for doing the nohz idle load balance. 1476 * Check if someone kicked us for doing the nohz idle load balance.
1477 */ 1477 */
1478 if (unlikely(got_nohz_idle_kick() && !need_resched())) { 1478 if (unlikely(got_nohz_idle_kick() && !need_resched())) {
1479 this_rq()->idle_balance = 1; 1479 this_rq()->idle_balance = 1;
1480 raise_softirq_irqoff(SCHED_SOFTIRQ); 1480 raise_softirq_irqoff(SCHED_SOFTIRQ);
1481 } 1481 }
1482 irq_exit(); 1482 irq_exit();
1483 } 1483 }
1484 1484
1485 static void ttwu_queue_remote(struct task_struct *p, int cpu) 1485 static void ttwu_queue_remote(struct task_struct *p, int cpu)
1486 { 1486 {
1487 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) 1487 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
1488 smp_send_reschedule(cpu); 1488 smp_send_reschedule(cpu);
1489 } 1489 }
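ttwu_queue_remote() relies on llist_add() returning whether the wake_list was empty, so only the first wakeup of a burst pays for an IPI; the rest are drained by the same sched_ttwu_pending() run. A minimal C11 sketch of that "push and report empty" idea (illustrative names, not the kernel llist API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct wake_node {
	struct wake_node *next;
};

struct wake_list {
	_Atomic(struct wake_node *) first;
};

/* Lock-free push; returns true if the list was empty before this push. */
static bool wake_list_add(struct wake_node *node, struct wake_list *list)
{
	struct wake_node *old = atomic_load(&list->first);

	do {
		node->next = old;
	} while (!atomic_compare_exchange_weak(&list->first, &old, node));

	return old == NULL;
}

int main(void)
{
	struct wake_list list = { ATOMIC_VAR_INIT(NULL) };
	struct wake_node a, b;

	/* Only the push that saw an empty list would send the reschedule IPI. */
	printf("%d %d\n", wake_list_add(&a, &list), wake_list_add(&b, &list));
	return 0;
}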
1490 1490
1491 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 1491 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1492 static int ttwu_activate_remote(struct task_struct *p, int wake_flags) 1492 static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
1493 { 1493 {
1494 struct rq *rq; 1494 struct rq *rq;
1495 int ret = 0; 1495 int ret = 0;
1496 1496
1497 rq = __task_rq_lock(p); 1497 rq = __task_rq_lock(p);
1498 if (p->on_cpu) { 1498 if (p->on_cpu) {
1499 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 1499 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1500 ttwu_do_wakeup(rq, p, wake_flags); 1500 ttwu_do_wakeup(rq, p, wake_flags);
1501 ret = 1; 1501 ret = 1;
1502 } 1502 }
1503 __task_rq_unlock(rq); 1503 __task_rq_unlock(rq);
1504 1504
1505 return ret; 1505 return ret;
1506 1506
1507 } 1507 }
1508 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 1508 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1509 1509
1510 bool cpus_share_cache(int this_cpu, int that_cpu) 1510 bool cpus_share_cache(int this_cpu, int that_cpu)
1511 { 1511 {
1512 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 1512 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1513 } 1513 }
1514 #endif /* CONFIG_SMP */ 1514 #endif /* CONFIG_SMP */
1515 1515
1516 static void ttwu_queue(struct task_struct *p, int cpu) 1516 static void ttwu_queue(struct task_struct *p, int cpu)
1517 { 1517 {
1518 struct rq *rq = cpu_rq(cpu); 1518 struct rq *rq = cpu_rq(cpu);
1519 1519
1520 #if defined(CONFIG_SMP) 1520 #if defined(CONFIG_SMP)
1521 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 1521 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
1522 sched_clock_cpu(cpu); /* sync clocks x-cpu */ 1522 sched_clock_cpu(cpu); /* sync clocks x-cpu */
1523 ttwu_queue_remote(p, cpu); 1523 ttwu_queue_remote(p, cpu);
1524 return; 1524 return;
1525 } 1525 }
1526 #endif 1526 #endif
1527 1527
1528 raw_spin_lock(&rq->lock); 1528 raw_spin_lock(&rq->lock);
1529 ttwu_do_activate(rq, p, 0); 1529 ttwu_do_activate(rq, p, 0);
1530 raw_spin_unlock(&rq->lock); 1530 raw_spin_unlock(&rq->lock);
1531 } 1531 }
1532 1532
1533 /** 1533 /**
1534 * try_to_wake_up - wake up a thread 1534 * try_to_wake_up - wake up a thread
1535 * @p: the thread to be awakened 1535 * @p: the thread to be awakened
1536 * @state: the mask of task states that can be woken 1536 * @state: the mask of task states that can be woken
1537 * @wake_flags: wake modifier flags (WF_*) 1537 * @wake_flags: wake modifier flags (WF_*)
1538 * 1538 *
1539 * Put it on the run-queue if it's not already there. The "current" 1539 * Put it on the run-queue if it's not already there. The "current"
1540 * thread is always on the run-queue (except when the actual 1540 * thread is always on the run-queue (except when the actual
1541 * re-schedule is in progress), and as such you're allowed to do 1541 * re-schedule is in progress), and as such you're allowed to do
1542 * the simpler "current->state = TASK_RUNNING" to mark yourself 1542 * the simpler "current->state = TASK_RUNNING" to mark yourself
1543 * runnable without the overhead of this. 1543 * runnable without the overhead of this.
1544 * 1544 *
1545 * Returns %true if @p was woken up, %false if it was already running 1545 * Returns %true if @p was woken up, %false if it was already running
1546 * or @state didn't match @p's state. 1546 * or @state didn't match @p's state.
1547 */ 1547 */
1548 static int 1548 static int
1549 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) 1549 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1550 { 1550 {
1551 unsigned long flags; 1551 unsigned long flags;
1552 int cpu, success = 0; 1552 int cpu, success = 0;
1553 1553
1554 smp_wmb(); 1554 smp_wmb();
1555 raw_spin_lock_irqsave(&p->pi_lock, flags); 1555 raw_spin_lock_irqsave(&p->pi_lock, flags);
1556 if (!(p->state & state)) 1556 if (!(p->state & state))
1557 goto out; 1557 goto out;
1558 1558
1559 success = 1; /* we're going to change ->state */ 1559 success = 1; /* we're going to change ->state */
1560 cpu = task_cpu(p); 1560 cpu = task_cpu(p);
1561 1561
1562 if (p->on_rq && ttwu_remote(p, wake_flags)) 1562 if (p->on_rq && ttwu_remote(p, wake_flags))
1563 goto stat; 1563 goto stat;
1564 1564
1565 #ifdef CONFIG_SMP 1565 #ifdef CONFIG_SMP
1566 /* 1566 /*
1567 * If the owning (remote) cpu is still in the middle of schedule() with 1567 * If the owning (remote) cpu is still in the middle of schedule() with
1568 * this task as prev, wait until it's done referencing the task. 1568 * this task as prev, wait until it's done referencing the task.
1569 */ 1569 */
1570 while (p->on_cpu) { 1570 while (p->on_cpu) {
1571 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 1571 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1572 /* 1572 /*
1573 * In case the architecture enables interrupts in 1573 * In case the architecture enables interrupts in
1574 * context_switch(), we cannot busy wait, since that 1574 * context_switch(), we cannot busy wait, since that
1575 * would lead to deadlocks when an interrupt hits and 1575 * would lead to deadlocks when an interrupt hits and
1576 * tries to wake up @prev. So bail and do a complete 1576 * tries to wake up @prev. So bail and do a complete
1577 * remote wakeup. 1577 * remote wakeup.
1578 */ 1578 */
1579 if (ttwu_activate_remote(p, wake_flags)) 1579 if (ttwu_activate_remote(p, wake_flags))
1580 goto stat; 1580 goto stat;
1581 #else 1581 #else
1582 cpu_relax(); 1582 cpu_relax();
1583 #endif 1583 #endif
1584 } 1584 }
1585 /* 1585 /*
1586 * Pairs with the smp_wmb() in finish_lock_switch(). 1586 * Pairs with the smp_wmb() in finish_lock_switch().
1587 */ 1587 */
1588 smp_rmb(); 1588 smp_rmb();
1589 1589
1590 p->sched_contributes_to_load = !!task_contributes_to_load(p); 1590 p->sched_contributes_to_load = !!task_contributes_to_load(p);
1591 p->state = TASK_WAKING; 1591 p->state = TASK_WAKING;
1592 1592
1593 if (p->sched_class->task_waking) 1593 if (p->sched_class->task_waking)
1594 p->sched_class->task_waking(p); 1594 p->sched_class->task_waking(p);
1595 1595
1596 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 1596 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
1597 if (task_cpu(p) != cpu) { 1597 if (task_cpu(p) != cpu) {
1598 wake_flags |= WF_MIGRATED; 1598 wake_flags |= WF_MIGRATED;
1599 set_task_cpu(p, cpu); 1599 set_task_cpu(p, cpu);
1600 } 1600 }
1601 #endif /* CONFIG_SMP */ 1601 #endif /* CONFIG_SMP */
1602 1602
1603 ttwu_queue(p, cpu); 1603 ttwu_queue(p, cpu);
1604 stat: 1604 stat:
1605 ttwu_stat(p, cpu, wake_flags); 1605 ttwu_stat(p, cpu, wake_flags);
1606 out: 1606 out:
1607 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 1607 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1608 1608
1609 return success; 1609 return success;
1610 } 1610 }
1611 1611
1612 /** 1612 /**
1613 * try_to_wake_up_local - try to wake up a local task with rq lock held 1613 * try_to_wake_up_local - try to wake up a local task with rq lock held
1614 * @p: the thread to be awakened 1614 * @p: the thread to be awakened
1615 * 1615 *
1616 * Put @p on the run-queue if it's not already there. The caller must 1616 * Put @p on the run-queue if it's not already there. The caller must
1617 * ensure that this_rq() is locked, @p is bound to this_rq() and not 1617 * ensure that this_rq() is locked, @p is bound to this_rq() and not
1618 * the current task. 1618 * the current task.
1619 */ 1619 */
1620 static void try_to_wake_up_local(struct task_struct *p) 1620 static void try_to_wake_up_local(struct task_struct *p)
1621 { 1621 {
1622 struct rq *rq = task_rq(p); 1622 struct rq *rq = task_rq(p);
1623 1623
1624 BUG_ON(rq != this_rq()); 1624 BUG_ON(rq != this_rq());
1625 BUG_ON(p == current); 1625 BUG_ON(p == current);
1626 lockdep_assert_held(&rq->lock); 1626 lockdep_assert_held(&rq->lock);
1627 1627
1628 if (!raw_spin_trylock(&p->pi_lock)) { 1628 if (!raw_spin_trylock(&p->pi_lock)) {
1629 raw_spin_unlock(&rq->lock); 1629 raw_spin_unlock(&rq->lock);
1630 raw_spin_lock(&p->pi_lock); 1630 raw_spin_lock(&p->pi_lock);
1631 raw_spin_lock(&rq->lock); 1631 raw_spin_lock(&rq->lock);
1632 } 1632 }
1633 1633
1634 if (!(p->state & TASK_NORMAL)) 1634 if (!(p->state & TASK_NORMAL))
1635 goto out; 1635 goto out;
1636 1636
1637 if (!p->on_rq) 1637 if (!p->on_rq)
1638 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 1638 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1639 1639
1640 ttwu_do_wakeup(rq, p, 0); 1640 ttwu_do_wakeup(rq, p, 0);
1641 ttwu_stat(p, smp_processor_id(), 0); 1641 ttwu_stat(p, smp_processor_id(), 0);
1642 out: 1642 out:
1643 raw_spin_unlock(&p->pi_lock); 1643 raw_spin_unlock(&p->pi_lock);
1644 } 1644 }
1645 1645
1646 /** 1646 /**
1647 * wake_up_process - Wake up a specific process 1647 * wake_up_process - Wake up a specific process
1648 * @p: The process to be woken up. 1648 * @p: The process to be woken up.
1649 * 1649 *
1650 * Attempt to wake up the nominated process and move it to the set of runnable 1650 * Attempt to wake up the nominated process and move it to the set of runnable
1651 * processes. Returns 1 if the process was woken up, 0 if it was already 1651 * processes. Returns 1 if the process was woken up, 0 if it was already
1652 * running. 1652 * running.
1653 * 1653 *
1654 * It may be assumed that this function implies a write memory barrier before 1654 * It may be assumed that this function implies a write memory barrier before
1655 * changing the task state if and only if any tasks are woken up. 1655 * changing the task state if and only if any tasks are woken up.
1656 */ 1656 */
1657 int wake_up_process(struct task_struct *p) 1657 int wake_up_process(struct task_struct *p)
1658 { 1658 {
1659 return try_to_wake_up(p, TASK_ALL, 0); 1659 return try_to_wake_up(p, TASK_ALL, 0);
1660 } 1660 }
1661 EXPORT_SYMBOL(wake_up_process); 1661 EXPORT_SYMBOL(wake_up_process);
1662 1662
1663 int wake_up_state(struct task_struct *p, unsigned int state) 1663 int wake_up_state(struct task_struct *p, unsigned int state)
1664 { 1664 {
1665 return try_to_wake_up(p, state, 0); 1665 return try_to_wake_up(p, state, 0);
1666 } 1666 }
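Both helpers funnel into try_to_wake_up(), whose @state argument is a mask: the wakeup only proceeds when `p->state & state` is nonzero, which is why wake_up_process() passes TASK_ALL while wake_up_state() lets the caller narrow it. A small sketch of just that mask check; the bit values and the composition of the "all" mask are simplified assumptions.

#include <stdio.h>

#define TASK_INTERRUPTIBLE	0x0001	/* assumed illustrative values */
#define TASK_UNINTERRUPTIBLE	0x0002
#define TASK_ALL_SKETCH		(TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)

static int would_wake(unsigned int task_state, unsigned int wake_mask)
{
	return (task_state & wake_mask) != 0;	/* the !(p->state & state) check */
}

int main(void)
{
	/* wake_up_state(p, TASK_INTERRUPTIBLE) skips an uninterruptible sleeper... */
	printf("%d\n", would_wake(TASK_UNINTERRUPTIBLE, TASK_INTERRUPTIBLE));
	/* ...while wake_up_process(p), using the full mask, wakes it. */
	printf("%d\n", would_wake(TASK_UNINTERRUPTIBLE, TASK_ALL_SKETCH));
	return 0;
}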
1667 1667
1668 /* 1668 /*
1669 * Perform scheduler related setup for a newly forked process p. 1669 * Perform scheduler related setup for a newly forked process p.
1670 * p is forked by current. 1670 * p is forked by current.
1671 * 1671 *
1672 * __sched_fork() is basic setup used by init_idle() too: 1672 * __sched_fork() is basic setup used by init_idle() too:
1673 */ 1673 */
1674 static void __sched_fork(struct task_struct *p) 1674 static void __sched_fork(struct task_struct *p)
1675 { 1675 {
1676 p->on_rq = 0; 1676 p->on_rq = 0;
1677 1677
1678 p->se.on_rq = 0; 1678 p->se.on_rq = 0;
1679 p->se.exec_start = 0; 1679 p->se.exec_start = 0;
1680 p->se.sum_exec_runtime = 0; 1680 p->se.sum_exec_runtime = 0;
1681 p->se.prev_sum_exec_runtime = 0; 1681 p->se.prev_sum_exec_runtime = 0;
1682 p->se.nr_migrations = 0; 1682 p->se.nr_migrations = 0;
1683 p->se.vruntime = 0; 1683 p->se.vruntime = 0;
1684 INIT_LIST_HEAD(&p->se.group_node); 1684 INIT_LIST_HEAD(&p->se.group_node);
1685 1685
1686 #ifdef CONFIG_SCHEDSTATS 1686 #ifdef CONFIG_SCHEDSTATS
1687 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 1687 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1688 #endif 1688 #endif
1689 1689
1690 INIT_LIST_HEAD(&p->rt.run_list); 1690 INIT_LIST_HEAD(&p->rt.run_list);
1691 1691
1692 #ifdef CONFIG_PREEMPT_NOTIFIERS 1692 #ifdef CONFIG_PREEMPT_NOTIFIERS
1693 INIT_HLIST_HEAD(&p->preempt_notifiers); 1693 INIT_HLIST_HEAD(&p->preempt_notifiers);
1694 #endif 1694 #endif
1695 } 1695 }
1696 1696
1697 /* 1697 /*
1698 * fork()/clone()-time setup: 1698 * fork()/clone()-time setup:
1699 */ 1699 */
1700 void sched_fork(struct task_struct *p) 1700 void sched_fork(struct task_struct *p)
1701 { 1701 {
1702 unsigned long flags; 1702 unsigned long flags;
1703 int cpu = get_cpu(); 1703 int cpu = get_cpu();
1704 1704
1705 __sched_fork(p); 1705 __sched_fork(p);
1706 /* 1706 /*
1707 * We mark the process as running here. This guarantees that 1707 * We mark the process as running here. This guarantees that
1708 * nobody will actually run it, and a signal or other external 1708 * nobody will actually run it, and a signal or other external
1709 * event cannot wake it up and insert it on the runqueue either. 1709 * event cannot wake it up and insert it on the runqueue either.
1710 */ 1710 */
1711 p->state = TASK_RUNNING; 1711 p->state = TASK_RUNNING;
1712 1712
1713 /* 1713 /*
1714 * Make sure we do not leak PI boosting priority to the child. 1714 * Make sure we do not leak PI boosting priority to the child.
1715 */ 1715 */
1716 p->prio = current->normal_prio; 1716 p->prio = current->normal_prio;
1717 1717
1718 /* 1718 /*
1719 * Revert to default priority/policy on fork if requested. 1719 * Revert to default priority/policy on fork if requested.
1720 */ 1720 */
1721 if (unlikely(p->sched_reset_on_fork)) { 1721 if (unlikely(p->sched_reset_on_fork)) {
1722 if (task_has_rt_policy(p)) { 1722 if (task_has_rt_policy(p)) {
1723 p->policy = SCHED_NORMAL; 1723 p->policy = SCHED_NORMAL;
1724 p->static_prio = NICE_TO_PRIO(0); 1724 p->static_prio = NICE_TO_PRIO(0);
1725 p->rt_priority = 0; 1725 p->rt_priority = 0;
1726 } else if (PRIO_TO_NICE(p->static_prio) < 0) 1726 } else if (PRIO_TO_NICE(p->static_prio) < 0)
1727 p->static_prio = NICE_TO_PRIO(0); 1727 p->static_prio = NICE_TO_PRIO(0);
1728 1728
1729 p->prio = p->normal_prio = __normal_prio(p); 1729 p->prio = p->normal_prio = __normal_prio(p);
1730 set_load_weight(p); 1730 set_load_weight(p);
1731 1731
1732 /* 1732 /*
1733 * We don't need the reset flag anymore after the fork. It has 1733 * We don't need the reset flag anymore after the fork. It has
1734 * fulfilled its duty: 1734 * fulfilled its duty:
1735 */ 1735 */
1736 p->sched_reset_on_fork = 0; 1736 p->sched_reset_on_fork = 0;
1737 } 1737 }
1738 1738
1739 if (!rt_prio(p->prio)) 1739 if (!rt_prio(p->prio))
1740 p->sched_class = &fair_sched_class; 1740 p->sched_class = &fair_sched_class;
1741 1741
1742 if (p->sched_class->task_fork) 1742 if (p->sched_class->task_fork)
1743 p->sched_class->task_fork(p); 1743 p->sched_class->task_fork(p);
1744 1744
1745 /* 1745 /*
1746 * The child is not yet in the pid-hash so no cgroup attach races, 1746 * The child is not yet in the pid-hash so no cgroup attach races,
1747 * and the cgroup is pinned to this child because cgroup_fork() 1747 * and the cgroup is pinned to this child because cgroup_fork()
1748 * is run before sched_fork(). 1748 * is run before sched_fork().
1749 * 1749 *
1750 * Silence PROVE_RCU. 1750 * Silence PROVE_RCU.
1751 */ 1751 */
1752 raw_spin_lock_irqsave(&p->pi_lock, flags); 1752 raw_spin_lock_irqsave(&p->pi_lock, flags);
1753 set_task_cpu(p, cpu); 1753 set_task_cpu(p, cpu);
1754 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 1754 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1755 1755
1756 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1756 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1757 if (likely(sched_info_on())) 1757 if (likely(sched_info_on()))
1758 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1758 memset(&p->sched_info, 0, sizeof(p->sched_info));
1759 #endif 1759 #endif
1760 #if defined(CONFIG_SMP) 1760 #if defined(CONFIG_SMP)
1761 p->on_cpu = 0; 1761 p->on_cpu = 0;
1762 #endif 1762 #endif
1763 #ifdef CONFIG_PREEMPT_COUNT 1763 #ifdef CONFIG_PREEMPT_COUNT
1764 /* Want to start with kernel preemption disabled. */ 1764 /* Want to start with kernel preemption disabled. */
1765 task_thread_info(p)->preempt_count = 1; 1765 task_thread_info(p)->preempt_count = 1;
1766 #endif 1766 #endif
1767 #ifdef CONFIG_SMP 1767 #ifdef CONFIG_SMP
1768 plist_node_init(&p->pushable_tasks, MAX_PRIO); 1768 plist_node_init(&p->pushable_tasks, MAX_PRIO);
1769 #endif 1769 #endif
1770 1770
1771 put_cpu(); 1771 put_cpu();
1772 } 1772 }
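The sched_reset_on_fork branch above services the SCHED_RESET_ON_FORK flag that userspace can OR into the policy argument of sched_setscheduler(2). A minimal userspace sketch (needs CAP_SYS_NICE or a suitable RLIMIT_RTPRIO to succeed):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

#ifndef SCHED_RESET_ON_FORK
#define SCHED_RESET_ON_FORK	0x40000000	/* kernel UAPI value */
#endif

int main(void)
{
	struct sched_param param = { .sched_priority = 10 };

	/* The parent runs SCHED_FIFO, but children revert to SCHED_NORMAL/nice 0. */
	if (sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &param))
		perror("sched_setscheduler");
	else if (fork() == 0)
		printf("child policy: %d\n", sched_getscheduler(0));
	return 0;
}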
1773 1773
1774 /* 1774 /*
1775 * wake_up_new_task - wake up a newly created task for the first time. 1775 * wake_up_new_task - wake up a newly created task for the first time.
1776 * 1776 *
1777 * This function will do some initial scheduler statistics housekeeping 1777 * This function will do some initial scheduler statistics housekeeping
1778 * that must be done for every newly created context, then puts the task 1778 * that must be done for every newly created context, then puts the task
1779 * on the runqueue and wakes it. 1779 * on the runqueue and wakes it.
1780 */ 1780 */
1781 void wake_up_new_task(struct task_struct *p) 1781 void wake_up_new_task(struct task_struct *p)
1782 { 1782 {
1783 unsigned long flags; 1783 unsigned long flags;
1784 struct rq *rq; 1784 struct rq *rq;
1785 1785
1786 raw_spin_lock_irqsave(&p->pi_lock, flags); 1786 raw_spin_lock_irqsave(&p->pi_lock, flags);
1787 #ifdef CONFIG_SMP 1787 #ifdef CONFIG_SMP
1788 /* 1788 /*
1789 * Fork balancing, do it here and not earlier because: 1789 * Fork balancing, do it here and not earlier because:
1790 * - cpus_allowed can change in the fork path 1790 * - cpus_allowed can change in the fork path
1791 * - any previously selected cpu might disappear through hotplug 1791 * - any previously selected cpu might disappear through hotplug
1792 */ 1792 */
1793 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); 1793 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
1794 #endif 1794 #endif
1795 1795
1796 rq = __task_rq_lock(p); 1796 rq = __task_rq_lock(p);
1797 activate_task(rq, p, 0); 1797 activate_task(rq, p, 0);
1798 p->on_rq = 1; 1798 p->on_rq = 1;
1799 trace_sched_wakeup_new(p, true); 1799 trace_sched_wakeup_new(p, true);
1800 check_preempt_curr(rq, p, WF_FORK); 1800 check_preempt_curr(rq, p, WF_FORK);
1801 #ifdef CONFIG_SMP 1801 #ifdef CONFIG_SMP
1802 if (p->sched_class->task_woken) 1802 if (p->sched_class->task_woken)
1803 p->sched_class->task_woken(rq, p); 1803 p->sched_class->task_woken(rq, p);
1804 #endif 1804 #endif
1805 task_rq_unlock(rq, p, &flags); 1805 task_rq_unlock(rq, p, &flags);
1806 } 1806 }
1807 1807
1808 #ifdef CONFIG_PREEMPT_NOTIFIERS 1808 #ifdef CONFIG_PREEMPT_NOTIFIERS
1809 1809
1810 /** 1810 /**
1811 * preempt_notifier_register - tell me when current is being preempted & rescheduled 1811 * preempt_notifier_register - tell me when current is being preempted & rescheduled
1812 * @notifier: notifier struct to register 1812 * @notifier: notifier struct to register
1813 */ 1813 */
1814 void preempt_notifier_register(struct preempt_notifier *notifier) 1814 void preempt_notifier_register(struct preempt_notifier *notifier)
1815 { 1815 {
1816 hlist_add_head(&notifier->link, &current->preempt_notifiers); 1816 hlist_add_head(&notifier->link, &current->preempt_notifiers);
1817 } 1817 }
1818 EXPORT_SYMBOL_GPL(preempt_notifier_register); 1818 EXPORT_SYMBOL_GPL(preempt_notifier_register);
1819 1819
1820 /** 1820 /**
1821 * preempt_notifier_unregister - no longer interested in preemption notifications 1821 * preempt_notifier_unregister - no longer interested in preemption notifications
1822 * @notifier: notifier struct to unregister 1822 * @notifier: notifier struct to unregister
1823 * 1823 *
1824 * This is safe to call from within a preemption notifier. 1824 * This is safe to call from within a preemption notifier.
1825 */ 1825 */
1826 void preempt_notifier_unregister(struct preempt_notifier *notifier) 1826 void preempt_notifier_unregister(struct preempt_notifier *notifier)
1827 { 1827 {
1828 hlist_del(&notifier->link); 1828 hlist_del(&notifier->link);
1829 } 1829 }
1830 EXPORT_SYMBOL_GPL(preempt_notifier_unregister); 1830 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
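For context, a typical consumer of this API (KVM is the main in-tree user) fills in a struct preempt_ops and attaches a notifier to the current task. The following is a hedged kernel-context sketch, not standalone code, assuming the usual declarations and the preempt_notifier_init() helper from <linux/preempt.h>:

#include <linux/preempt.h>
#include <linux/sched.h>

static void my_sched_in(struct preempt_notifier *pn, int cpu)
{
	/* current is being scheduled back in on @cpu: reload per-task hw state */
}

static void my_sched_out(struct preempt_notifier *pn, struct task_struct *next)
{
	/* current is being preempted in favour of @next: stash hw state */
}

static struct preempt_ops my_preempt_ops = {
	.sched_in	= my_sched_in,
	.sched_out	= my_sched_out,
};

static struct preempt_notifier my_notifier;

static void my_attach_notifier(void)
{
	preempt_notifier_init(&my_notifier, &my_preempt_ops);
	preempt_notifier_register(&my_notifier);	/* registers for current only */
}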
1831 1831
1832 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 1832 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1833 { 1833 {
1834 struct preempt_notifier *notifier; 1834 struct preempt_notifier *notifier;
1835 struct hlist_node *node; 1835 struct hlist_node *node;
1836 1836
1837 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 1837 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
1838 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 1838 notifier->ops->sched_in(notifier, raw_smp_processor_id());
1839 } 1839 }
1840 1840
1841 static void 1841 static void
1842 fire_sched_out_preempt_notifiers(struct task_struct *curr, 1842 fire_sched_out_preempt_notifiers(struct task_struct *curr,
1843 struct task_struct *next) 1843 struct task_struct *next)
1844 { 1844 {
1845 struct preempt_notifier *notifier; 1845 struct preempt_notifier *notifier;
1846 struct hlist_node *node; 1846 struct hlist_node *node;
1847 1847
1848 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 1848 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
1849 notifier->ops->sched_out(notifier, next); 1849 notifier->ops->sched_out(notifier, next);
1850 } 1850 }
1851 1851
1852 #else /* !CONFIG_PREEMPT_NOTIFIERS */ 1852 #else /* !CONFIG_PREEMPT_NOTIFIERS */
1853 1853
1854 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 1854 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1855 { 1855 {
1856 } 1856 }
1857 1857
1858 static void 1858 static void
1859 fire_sched_out_preempt_notifiers(struct task_struct *curr, 1859 fire_sched_out_preempt_notifiers(struct task_struct *curr,
1860 struct task_struct *next) 1860 struct task_struct *next)
1861 { 1861 {
1862 } 1862 }
1863 1863
1864 #endif /* CONFIG_PREEMPT_NOTIFIERS */ 1864 #endif /* CONFIG_PREEMPT_NOTIFIERS */
1865 1865
1866 /** 1866 /**
1867 * prepare_task_switch - prepare to switch tasks 1867 * prepare_task_switch - prepare to switch tasks
1868 * @rq: the runqueue preparing to switch 1868 * @rq: the runqueue preparing to switch
1869 * @prev: the current task that is being switched out 1869 * @prev: the current task that is being switched out
1870 * @next: the task we are going to switch to. 1870 * @next: the task we are going to switch to.
1871 * 1871 *
1872 * This is called with the rq lock held and interrupts off. It must 1872 * This is called with the rq lock held and interrupts off. It must
1873 * be paired with a subsequent finish_task_switch after the context 1873 * be paired with a subsequent finish_task_switch after the context
1874 * switch. 1874 * switch.
1875 * 1875 *
1876 * prepare_task_switch sets up locking and calls architecture specific 1876 * prepare_task_switch sets up locking and calls architecture specific
1877 * hooks. 1877 * hooks.
1878 */ 1878 */
1879 static inline void 1879 static inline void
1880 prepare_task_switch(struct rq *rq, struct task_struct *prev, 1880 prepare_task_switch(struct rq *rq, struct task_struct *prev,
1881 struct task_struct *next) 1881 struct task_struct *next)
1882 { 1882 {
1883 sched_info_switch(prev, next); 1883 sched_info_switch(prev, next);
1884 perf_event_task_sched_out(prev, next); 1884 perf_event_task_sched_out(prev, next);
1885 fire_sched_out_preempt_notifiers(prev, next); 1885 fire_sched_out_preempt_notifiers(prev, next);
1886 prepare_lock_switch(rq, next); 1886 prepare_lock_switch(rq, next);
1887 prepare_arch_switch(next); 1887 prepare_arch_switch(next);
1888 trace_sched_switch(prev, next); 1888 trace_sched_switch(prev, next);
1889 } 1889 }
1890 1890
1891 /** 1891 /**
1892 * finish_task_switch - clean up after a task-switch 1892 * finish_task_switch - clean up after a task-switch
1893 * @rq: runqueue associated with task-switch 1893 * @rq: runqueue associated with task-switch
1894 * @prev: the thread we just switched away from. 1894 * @prev: the thread we just switched away from.
1895 * 1895 *
1896 * finish_task_switch must be called after the context switch, paired 1896 * finish_task_switch must be called after the context switch, paired
1897 * with a prepare_task_switch call before the context switch. 1897 * with a prepare_task_switch call before the context switch.
1898 * finish_task_switch will reconcile locking set up by prepare_task_switch, 1898 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1899 * and do any other architecture-specific cleanup actions. 1899 * and do any other architecture-specific cleanup actions.
1900 * 1900 *
1901 * Note that we may have delayed dropping an mm in context_switch(). If 1901 * Note that we may have delayed dropping an mm in context_switch(). If
1902 * so, we finish that here outside of the runqueue lock. (Doing it 1902 * so, we finish that here outside of the runqueue lock. (Doing it
1903 * with the lock held can cause deadlocks; see schedule() for 1903 * with the lock held can cause deadlocks; see schedule() for
1904 * details.) 1904 * details.)
1905 */ 1905 */
1906 static void finish_task_switch(struct rq *rq, struct task_struct *prev) 1906 static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1907 __releases(rq->lock) 1907 __releases(rq->lock)
1908 { 1908 {
1909 struct mm_struct *mm = rq->prev_mm; 1909 struct mm_struct *mm = rq->prev_mm;
1910 long prev_state; 1910 long prev_state;
1911 1911
1912 rq->prev_mm = NULL; 1912 rq->prev_mm = NULL;
1913 1913
1914 /* 1914 /*
1915 * A task struct has one reference for the use as "current". 1915 * A task struct has one reference for the use as "current".
1916 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 1916 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
1917 * schedule one last time. The schedule call will never return, and 1917 * schedule one last time. The schedule call will never return, and
1918 * the scheduled task must drop that reference. 1918 * the scheduled task must drop that reference.
1919 * The test for TASK_DEAD must occur while the runqueue locks are 1919 * The test for TASK_DEAD must occur while the runqueue locks are
1920 * still held, otherwise prev could be scheduled on another cpu, die 1920 * still held, otherwise prev could be scheduled on another cpu, die
1921 * there before we look at prev->state, and then the reference would 1921 * there before we look at prev->state, and then the reference would
1922 * be dropped twice. 1922 * be dropped twice.
1923 * Manfred Spraul <manfred@colorfullife.com> 1923 * Manfred Spraul <manfred@colorfullife.com>
1924 */ 1924 */
1925 prev_state = prev->state; 1925 prev_state = prev->state;
1926 finish_arch_switch(prev); 1926 finish_arch_switch(prev);
1927 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 1927 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1928 local_irq_disable(); 1928 local_irq_disable();
1929 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 1929 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1930 perf_event_task_sched_in(prev, current); 1930 perf_event_task_sched_in(prev, current);
1931 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 1931 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1932 local_irq_enable(); 1932 local_irq_enable();
1933 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 1933 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1934 finish_lock_switch(rq, prev); 1934 finish_lock_switch(rq, prev);
1935 1935
1936 fire_sched_in_preempt_notifiers(current); 1936 fire_sched_in_preempt_notifiers(current);
1937 if (mm) 1937 if (mm)
1938 mmdrop(mm); 1938 mmdrop(mm);
1939 if (unlikely(prev_state == TASK_DEAD)) { 1939 if (unlikely(prev_state == TASK_DEAD)) {
1940 /* 1940 /*
1941 * Remove function-return probe instances associated with this 1941 * Remove function-return probe instances associated with this
1942 * task and put them back on the free list. 1942 * task and put them back on the free list.
1943 */ 1943 */
1944 kprobe_flush_task(prev); 1944 kprobe_flush_task(prev);
1945 put_task_struct(prev); 1945 put_task_struct(prev);
1946 } 1946 }
1947 } 1947 }
1948 1948
1949 #ifdef CONFIG_SMP 1949 #ifdef CONFIG_SMP
1950 1950
1951 /* assumes rq->lock is held */ 1951 /* assumes rq->lock is held */
1952 static inline void pre_schedule(struct rq *rq, struct task_struct *prev) 1952 static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
1953 { 1953 {
1954 if (prev->sched_class->pre_schedule) 1954 if (prev->sched_class->pre_schedule)
1955 prev->sched_class->pre_schedule(rq, prev); 1955 prev->sched_class->pre_schedule(rq, prev);
1956 } 1956 }
1957 1957
1958 /* rq->lock is NOT held, but preemption is disabled */ 1958 /* rq->lock is NOT held, but preemption is disabled */
1959 static inline void post_schedule(struct rq *rq) 1959 static inline void post_schedule(struct rq *rq)
1960 { 1960 {
1961 if (rq->post_schedule) { 1961 if (rq->post_schedule) {
1962 unsigned long flags; 1962 unsigned long flags;
1963 1963
1964 raw_spin_lock_irqsave(&rq->lock, flags); 1964 raw_spin_lock_irqsave(&rq->lock, flags);
1965 if (rq->curr->sched_class->post_schedule) 1965 if (rq->curr->sched_class->post_schedule)
1966 rq->curr->sched_class->post_schedule(rq); 1966 rq->curr->sched_class->post_schedule(rq);
1967 raw_spin_unlock_irqrestore(&rq->lock, flags); 1967 raw_spin_unlock_irqrestore(&rq->lock, flags);
1968 1968
1969 rq->post_schedule = 0; 1969 rq->post_schedule = 0;
1970 } 1970 }
1971 } 1971 }
1972 1972
1973 #else 1973 #else
1974 1974
1975 static inline void pre_schedule(struct rq *rq, struct task_struct *p) 1975 static inline void pre_schedule(struct rq *rq, struct task_struct *p)
1976 { 1976 {
1977 } 1977 }
1978 1978
1979 static inline void post_schedule(struct rq *rq) 1979 static inline void post_schedule(struct rq *rq)
1980 { 1980 {
1981 } 1981 }
1982 1982
1983 #endif 1983 #endif
1984 1984
1985 /** 1985 /**
1986 * schedule_tail - first thing a freshly forked thread must call. 1986 * schedule_tail - first thing a freshly forked thread must call.
1987 * @prev: the thread we just switched away from. 1987 * @prev: the thread we just switched away from.
1988 */ 1988 */
1989 asmlinkage void schedule_tail(struct task_struct *prev) 1989 asmlinkage void schedule_tail(struct task_struct *prev)
1990 __releases(rq->lock) 1990 __releases(rq->lock)
1991 { 1991 {
1992 struct rq *rq = this_rq(); 1992 struct rq *rq = this_rq();
1993 1993
1994 finish_task_switch(rq, prev); 1994 finish_task_switch(rq, prev);
1995 1995
1996 /* 1996 /*
1997 * FIXME: do we need to worry about rq being invalidated by the 1997 * FIXME: do we need to worry about rq being invalidated by the
1998 * task_switch? 1998 * task_switch?
1999 */ 1999 */
2000 post_schedule(rq); 2000 post_schedule(rq);
2001 2001
2002 #ifdef __ARCH_WANT_UNLOCKED_CTXSW 2002 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
2003 /* In this case, finish_task_switch does not reenable preemption */ 2003 /* In this case, finish_task_switch does not reenable preemption */
2004 preempt_enable(); 2004 preempt_enable();
2005 #endif 2005 #endif
2006 if (current->set_child_tid) 2006 if (current->set_child_tid)
2007 put_user(task_pid_vnr(current), current->set_child_tid); 2007 put_user(task_pid_vnr(current), current->set_child_tid);
2008 } 2008 }
2009 2009
2010 /* 2010 /*
2011 * context_switch - switch to the new MM and the new 2011 * context_switch - switch to the new MM and the new
2012 * thread's register state. 2012 * thread's register state.
2013 */ 2013 */
2014 static inline void 2014 static inline void
2015 context_switch(struct rq *rq, struct task_struct *prev, 2015 context_switch(struct rq *rq, struct task_struct *prev,
2016 struct task_struct *next) 2016 struct task_struct *next)
2017 { 2017 {
2018 struct mm_struct *mm, *oldmm; 2018 struct mm_struct *mm, *oldmm;
2019 2019
2020 prepare_task_switch(rq, prev, next); 2020 prepare_task_switch(rq, prev, next);
2021 2021
2022 mm = next->mm; 2022 mm = next->mm;
2023 oldmm = prev->active_mm; 2023 oldmm = prev->active_mm;
2024 /* 2024 /*
2025 * For paravirt, this is coupled with an exit in switch_to to 2025 * For paravirt, this is coupled with an exit in switch_to to
2026 * combine the page table reload and the switch backend into 2026 * combine the page table reload and the switch backend into
2027 * one hypercall. 2027 * one hypercall.
2028 */ 2028 */
2029 arch_start_context_switch(prev); 2029 arch_start_context_switch(prev);
2030 2030
2031 if (!mm) { 2031 if (!mm) {
2032 next->active_mm = oldmm; 2032 next->active_mm = oldmm;
2033 atomic_inc(&oldmm->mm_count); 2033 atomic_inc(&oldmm->mm_count);
2034 enter_lazy_tlb(oldmm, next); 2034 enter_lazy_tlb(oldmm, next);
2035 } else 2035 } else
2036 switch_mm(oldmm, mm, next); 2036 switch_mm(oldmm, mm, next);
2037 2037
2038 if (!prev->mm) { 2038 if (!prev->mm) {
2039 prev->active_mm = NULL; 2039 prev->active_mm = NULL;
2040 rq->prev_mm = oldmm; 2040 rq->prev_mm = oldmm;
2041 } 2041 }
2042 /* 2042 /*
2043 * Since the runqueue lock will be released by the next 2043 * Since the runqueue lock will be released by the next
2044 * task (which is an invalid locking op but in the case 2044 * task (which is an invalid locking op but in the case
 2045 * of the scheduler it's an obvious special-case), we 2045 * of the scheduler it's an obvious special-case), we
2046 * do an early lockdep release here: 2046 * do an early lockdep release here:
2047 */ 2047 */
2048 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 2048 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
2049 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 2049 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2050 #endif 2050 #endif
2051 2051
2052 /* Here we just switch the register state and the stack. */ 2052 /* Here we just switch the register state and the stack. */
2053 switch_to(prev, next, prev); 2053 switch_to(prev, next, prev);
2054 2054
2055 barrier(); 2055 barrier();
2056 /* 2056 /*
2057 * this_rq must be evaluated again because prev may have moved 2057 * this_rq must be evaluated again because prev may have moved
2058 * CPUs since it called schedule(), thus the 'rq' on its stack 2058 * CPUs since it called schedule(), thus the 'rq' on its stack
2059 * frame will be invalid. 2059 * frame will be invalid.
2060 */ 2060 */
2061 finish_task_switch(this_rq(), prev); 2061 finish_task_switch(this_rq(), prev);
2062 } 2062 }
2063 2063
2064 /* 2064 /*
2065 * nr_running, nr_uninterruptible and nr_context_switches: 2065 * nr_running, nr_uninterruptible and nr_context_switches:
2066 * 2066 *
2067 * externally visible scheduler statistics: current number of runnable 2067 * externally visible scheduler statistics: current number of runnable
2068 * threads, current number of uninterruptible-sleeping threads, total 2068 * threads, current number of uninterruptible-sleeping threads, total
2069 * number of context switches performed since bootup. 2069 * number of context switches performed since bootup.
2070 */ 2070 */
2071 unsigned long nr_running(void) 2071 unsigned long nr_running(void)
2072 { 2072 {
2073 unsigned long i, sum = 0; 2073 unsigned long i, sum = 0;
2074 2074
2075 for_each_online_cpu(i) 2075 for_each_online_cpu(i)
2076 sum += cpu_rq(i)->nr_running; 2076 sum += cpu_rq(i)->nr_running;
2077 2077
2078 return sum; 2078 return sum;
2079 } 2079 }
2080 2080
2081 unsigned long nr_uninterruptible(void) 2081 unsigned long nr_uninterruptible(void)
2082 { 2082 {
2083 unsigned long i, sum = 0; 2083 unsigned long i, sum = 0;
2084 2084
2085 for_each_possible_cpu(i) 2085 for_each_possible_cpu(i)
2086 sum += cpu_rq(i)->nr_uninterruptible; 2086 sum += cpu_rq(i)->nr_uninterruptible;
2087 2087
2088 /* 2088 /*
2089 * Since we read the counters lockless, it might be slightly 2089 * Since we read the counters lockless, it might be slightly
2090 * inaccurate. Do not allow it to go below zero though: 2090 * inaccurate. Do not allow it to go below zero though:
2091 */ 2091 */
2092 if (unlikely((long)sum < 0)) 2092 if (unlikely((long)sum < 0))
2093 sum = 0; 2093 sum = 0;
2094 2094
2095 return sum; 2095 return sum;
2096 } 2096 }
2097 2097
2098 unsigned long long nr_context_switches(void) 2098 unsigned long long nr_context_switches(void)
2099 { 2099 {
2100 int i; 2100 int i;
2101 unsigned long long sum = 0; 2101 unsigned long long sum = 0;
2102 2102
2103 for_each_possible_cpu(i) 2103 for_each_possible_cpu(i)
2104 sum += cpu_rq(i)->nr_switches; 2104 sum += cpu_rq(i)->nr_switches;
2105 2105
2106 return sum; 2106 return sum;
2107 } 2107 }
2108 2108
2109 unsigned long nr_iowait(void) 2109 unsigned long nr_iowait(void)
2110 { 2110 {
2111 unsigned long i, sum = 0; 2111 unsigned long i, sum = 0;
2112 2112
2113 for_each_possible_cpu(i) 2113 for_each_possible_cpu(i)
2114 sum += atomic_read(&cpu_rq(i)->nr_iowait); 2114 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2115 2115
2116 return sum; 2116 return sum;
2117 } 2117 }
2118 2118
2119 unsigned long nr_iowait_cpu(int cpu) 2119 unsigned long nr_iowait_cpu(int cpu)
2120 { 2120 {
2121 struct rq *this = cpu_rq(cpu); 2121 struct rq *this = cpu_rq(cpu);
2122 return atomic_read(&this->nr_iowait); 2122 return atomic_read(&this->nr_iowait);
2123 } 2123 }
2124 2124
2125 unsigned long this_cpu_load(void) 2125 unsigned long this_cpu_load(void)
2126 { 2126 {
2127 struct rq *this = this_rq(); 2127 struct rq *this = this_rq();
2128 return this->cpu_load[0]; 2128 return this->cpu_load[0];
2129 } 2129 }
2130 2130
2131 2131
2132 /* Variables and functions for calc_load */ 2132 /* Variables and functions for calc_load */
2133 static atomic_long_t calc_load_tasks; 2133 static atomic_long_t calc_load_tasks;
2134 static unsigned long calc_load_update; 2134 static unsigned long calc_load_update;
2135 unsigned long avenrun[3]; 2135 unsigned long avenrun[3];
2136 EXPORT_SYMBOL(avenrun); 2136 EXPORT_SYMBOL(avenrun);
2137 2137
2138 static long calc_load_fold_active(struct rq *this_rq) 2138 static long calc_load_fold_active(struct rq *this_rq)
2139 { 2139 {
2140 long nr_active, delta = 0; 2140 long nr_active, delta = 0;
2141 2141
2142 nr_active = this_rq->nr_running; 2142 nr_active = this_rq->nr_running;
2143 nr_active += (long) this_rq->nr_uninterruptible; 2143 nr_active += (long) this_rq->nr_uninterruptible;
2144 2144
2145 if (nr_active != this_rq->calc_load_active) { 2145 if (nr_active != this_rq->calc_load_active) {
2146 delta = nr_active - this_rq->calc_load_active; 2146 delta = nr_active - this_rq->calc_load_active;
2147 this_rq->calc_load_active = nr_active; 2147 this_rq->calc_load_active = nr_active;
2148 } 2148 }
2149 2149
2150 return delta; 2150 return delta;
2151 } 2151 }
2152 2152
2153 static unsigned long 2153 static unsigned long
2154 calc_load(unsigned long load, unsigned long exp, unsigned long active) 2154 calc_load(unsigned long load, unsigned long exp, unsigned long active)
2155 { 2155 {
2156 load *= exp; 2156 load *= exp;
2157 load += active * (FIXED_1 - exp); 2157 load += active * (FIXED_1 - exp);
2158 load += 1UL << (FSHIFT - 1); 2158 load += 1UL << (FSHIFT - 1);
2159 return load >> FSHIFT; 2159 return load >> FSHIFT;
2160 } 2160 }
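
The weighting above is plain fixed-point arithmetic; the constants it relies on (FSHIFT = 11, FIXED_1 = 1 << 11, and EXP_1/EXP_5/EXP_15 = 1884/2014/2037) live in include/linux/sched.h in this era of the tree. A minimal userspace sketch, with those constants copied by hand (illustrative only, not part of this patch), shows how quickly the three averages chase a constant number of active tasks:

/* Userspace sketch of calc_load(); not kernel code. */
#include <stdio.h>

#define FSHIFT   11                     /* nr of bits of precision */
#define FIXED_1  (1UL << FSHIFT)        /* 1.0 as fixed-point */
#define EXP_1    1884UL                 /* 1/exp(5sec/1min) as fixed-point */
#define EXP_5    2014UL                 /* 1/exp(5sec/5min) */
#define EXP_15   2037UL                 /* 1/exp(5sec/15min) */

static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	load += 1UL << (FSHIFT - 1);	/* round to nearest */
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun[3] = { 0, 0, 0 };
	unsigned long active = 2 * FIXED_1;	/* pretend 2 tasks stay runnable */
	int i;

	/* Each iteration corresponds to one LOAD_FREQ (~5 s) interval. */
	for (i = 1; i <= 24; i++) {
		avenrun[0] = calc_load(avenrun[0], EXP_1, active);
		avenrun[1] = calc_load(avenrun[1], EXP_5, active);
		avenrun[2] = calc_load(avenrun[2], EXP_15, active);
		if (i % 6 == 0)	/* print roughly every 30 s */
			printf("%3d: %lu.%02lu %lu.%02lu %lu.%02lu\n", i,
			       avenrun[0] >> FSHIFT,
			       (avenrun[0] & (FIXED_1 - 1)) * 100 / FIXED_1,
			       avenrun[1] >> FSHIFT,
			       (avenrun[1] & (FIXED_1 - 1)) * 100 / FIXED_1,
			       avenrun[2] >> FSHIFT,
			       (avenrun[2] & (FIXED_1 - 1)) * 100 / FIXED_1);
	}
	return 0;
}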
2161 2161
2162 #ifdef CONFIG_NO_HZ 2162 #ifdef CONFIG_NO_HZ
2163 /* 2163 /*
2164 * For NO_HZ we delay the active fold to the next LOAD_FREQ update. 2164 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
2165 * 2165 *
2166 * When making the ILB scale, we should try to pull this in as well. 2166 * When making the ILB scale, we should try to pull this in as well.
2167 */ 2167 */
2168 static atomic_long_t calc_load_tasks_idle; 2168 static atomic_long_t calc_load_tasks_idle;
2169 2169
2170 void calc_load_account_idle(struct rq *this_rq) 2170 void calc_load_account_idle(struct rq *this_rq)
2171 { 2171 {
2172 long delta; 2172 long delta;
2173 2173
2174 delta = calc_load_fold_active(this_rq); 2174 delta = calc_load_fold_active(this_rq);
2175 if (delta) 2175 if (delta)
2176 atomic_long_add(delta, &calc_load_tasks_idle); 2176 atomic_long_add(delta, &calc_load_tasks_idle);
2177 } 2177 }
2178 2178
2179 static long calc_load_fold_idle(void) 2179 static long calc_load_fold_idle(void)
2180 { 2180 {
2181 long delta = 0; 2181 long delta = 0;
2182 2182
2183 /* 2183 /*
 2184 * It's got a race; we don't care... 2184 * It's got a race; we don't care...
2185 */ 2185 */
2186 if (atomic_long_read(&calc_load_tasks_idle)) 2186 if (atomic_long_read(&calc_load_tasks_idle))
2187 delta = atomic_long_xchg(&calc_load_tasks_idle, 0); 2187 delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
2188 2188
2189 return delta; 2189 return delta;
2190 } 2190 }
2191 2191
2192 /** 2192 /**
2193 * fixed_power_int - compute: x^n, in O(log n) time 2193 * fixed_power_int - compute: x^n, in O(log n) time
2194 * 2194 *
2195 * @x: base of the power 2195 * @x: base of the power
2196 * @frac_bits: fractional bits of @x 2196 * @frac_bits: fractional bits of @x
2197 * @n: power to raise @x to. 2197 * @n: power to raise @x to.
2198 * 2198 *
2199 * By exploiting the relation between the definition of the natural power 2199 * By exploiting the relation between the definition of the natural power
2200 * function: x^n := x*x*...*x (x multiplied by itself for n times), and 2200 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
2201 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, 2201 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
2202 * (where: n_i \elem {0, 1}, the binary vector representing n), 2202 * (where: n_i \elem {0, 1}, the binary vector representing n),
2203 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is 2203 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
2204 * of course trivially computable in O(log_2 n), the length of our binary 2204 * of course trivially computable in O(log_2 n), the length of our binary
2205 * vector. 2205 * vector.
2206 */ 2206 */
2207 static unsigned long 2207 static unsigned long
2208 fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) 2208 fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
2209 { 2209 {
2210 unsigned long result = 1UL << frac_bits; 2210 unsigned long result = 1UL << frac_bits;
2211 2211
2212 if (n) for (;;) { 2212 if (n) for (;;) {
2213 if (n & 1) { 2213 if (n & 1) {
2214 result *= x; 2214 result *= x;
2215 result += 1UL << (frac_bits - 1); 2215 result += 1UL << (frac_bits - 1);
2216 result >>= frac_bits; 2216 result >>= frac_bits;
2217 } 2217 }
2218 n >>= 1; 2218 n >>= 1;
2219 if (!n) 2219 if (!n)
2220 break; 2220 break;
2221 x *= x; 2221 x *= x;
2222 x += 1UL << (frac_bits - 1); 2222 x += 1UL << (frac_bits - 1);
2223 x >>= frac_bits; 2223 x >>= frac_bits;
2224 } 2224 }
2225 2225
2226 return result; 2226 return result;
2227 } 2227 }
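
To see that the loop above really is exponentiation by squaring on fixed-point values, it can be lifted into a standalone program and compared against floating point; the columns should agree to within a few least-significant counts, since rounding happens at every step. A sketch (FSHIFT/FIXED_1/EXP_1 copied by hand from include/linux/sched.h; build with -lm):

/* Userspace check of the O(log n) fixed-point power; not kernel code. */
#include <stdio.h>
#include <math.h>

#define FSHIFT	11
#define FIXED_1	(1UL << FSHIFT)
#define EXP_1	1884UL			/* ~0.92 in 11-bit fixed point */

static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
	unsigned long result = 1UL << frac_bits;

	if (n) for (;;) {
		if (n & 1) {
			result *= x;
			result += 1UL << (frac_bits - 1);
			result >>= frac_bits;
		}
		n >>= 1;
		if (!n)
			break;
		x *= x;
		x += 1UL << (frac_bits - 1);
		x >>= frac_bits;
	}

	return result;
}

int main(void)
{
	unsigned int n;

	for (n = 0; n <= 32; n += 4) {
		unsigned long fp = fixed_power_int(EXP_1, FSHIFT, n);
		double exact = pow((double)EXP_1 / FIXED_1, n) * FIXED_1;

		printf("n=%2u  fixed=%4lu  exact=%7.2f\n", n, fp, exact);
	}
	return 0;
}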
2228 2228
2229 /* 2229 /*
2230 * a1 = a0 * e + a * (1 - e) 2230 * a1 = a0 * e + a * (1 - e)
2231 * 2231 *
2232 * a2 = a1 * e + a * (1 - e) 2232 * a2 = a1 * e + a * (1 - e)
2233 * = (a0 * e + a * (1 - e)) * e + a * (1 - e) 2233 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
2234 * = a0 * e^2 + a * (1 - e) * (1 + e) 2234 * = a0 * e^2 + a * (1 - e) * (1 + e)
2235 * 2235 *
2236 * a3 = a2 * e + a * (1 - e) 2236 * a3 = a2 * e + a * (1 - e)
2237 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) 2237 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
2238 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) 2238 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
2239 * 2239 *
2240 * ... 2240 * ...
2241 * 2241 *
2242 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] 2242 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
2243 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) 2243 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
2244 * = a0 * e^n + a * (1 - e^n) 2244 * = a0 * e^n + a * (1 - e^n)
2245 * 2245 *
2246 * [1] application of the geometric series: 2246 * [1] application of the geometric series:
2247 * 2247 *
2248 * n 1 - x^(n+1) 2248 * n 1 - x^(n+1)
2249 * S_n := \Sum x^i = ------------- 2249 * S_n := \Sum x^i = -------------
2250 * i=0 1 - x 2250 * i=0 1 - x
2251 */ 2251 */
2252 static unsigned long 2252 static unsigned long
2253 calc_load_n(unsigned long load, unsigned long exp, 2253 calc_load_n(unsigned long load, unsigned long exp,
2254 unsigned long active, unsigned int n) 2254 unsigned long active, unsigned int n)
2255 { 2255 {
2256 2256
2257 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); 2257 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
2258 } 2258 }
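
The closed form an = a0 * e^n + a * (1 - e^n) derived above is exactly what calc_load_n() evaluates. Numerically, n successive calc_load() steps with a constant 'active' land within a couple of fixed-point counts of one calc_load_n() call; exact equality is not expected because the rounding happens in a different order. A self-contained sketch (constants again copied by hand from include/linux/sched.h):

/* Userspace check of the geometric-series shortcut; not kernel code. */
#include <stdio.h>

#define FSHIFT	11
#define FIXED_1	(1UL << FSHIFT)
#define EXP_1	1884UL

static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	load += 1UL << (FSHIFT - 1);
	return load >> FSHIFT;
}

static unsigned long fixed_power_int(unsigned long x, unsigned int frac_bits,
				     unsigned int n)
{
	unsigned long result = 1UL << frac_bits;

	while (n) {
		if (n & 1) {
			result *= x;
			result += 1UL << (frac_bits - 1);
			result >>= frac_bits;
		}
		n >>= 1;
		if (!n)
			break;
		x *= x;
		x += 1UL << (frac_bits - 1);
		x >>= frac_bits;
	}
	return result;
}

int main(void)
{
	unsigned long a0 = 3 * FIXED_1, active = FIXED_1;	/* 3.00 decaying toward 1.00 */
	unsigned long iter = a0, n;

	for (n = 1; n <= 10; n++) {
		iter = calc_load(iter, EXP_1, active);		/* n single steps */
		/* closed form: one calc_load() with exp raised to the n-th power */
		printf("n=%2lu  iterated=%4lu  closed-form=%4lu\n", n, iter,
		       calc_load(a0, fixed_power_int(EXP_1, FSHIFT, n), active));
	}
	return 0;
}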
2259 2259
2260 /* 2260 /*
2261 * NO_HZ can leave us missing all per-cpu ticks calling 2261 * NO_HZ can leave us missing all per-cpu ticks calling
2262 * calc_load_account_active(), but since an idle CPU folds its delta into 2262 * calc_load_account_active(), but since an idle CPU folds its delta into
2263 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold 2263 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
2264 * in the pending idle delta if our idle period crossed a load cycle boundary. 2264 * in the pending idle delta if our idle period crossed a load cycle boundary.
2265 * 2265 *
2266 * Once we've updated the global active value, we need to apply the exponential 2266 * Once we've updated the global active value, we need to apply the exponential
2267 * weights adjusted to the number of cycles missed. 2267 * weights adjusted to the number of cycles missed.
2268 */ 2268 */
2269 static void calc_global_nohz(unsigned long ticks) 2269 static void calc_global_nohz(unsigned long ticks)
2270 { 2270 {
2271 long delta, active, n; 2271 long delta, active, n;
2272 2272
2273 if (time_before(jiffies, calc_load_update)) 2273 if (time_before(jiffies, calc_load_update))
2274 return; 2274 return;
2275 2275
2276 /* 2276 /*
2277 * If we crossed a calc_load_update boundary, make sure to fold 2277 * If we crossed a calc_load_update boundary, make sure to fold
 2278 * any pending idle changes; the respective CPUs might have 2278 * any pending idle changes; the respective CPUs might have
2279 * missed the tick driven calc_load_account_active() update 2279 * missed the tick driven calc_load_account_active() update
2280 * due to NO_HZ. 2280 * due to NO_HZ.
2281 */ 2281 */
2282 delta = calc_load_fold_idle(); 2282 delta = calc_load_fold_idle();
2283 if (delta) 2283 if (delta)
2284 atomic_long_add(delta, &calc_load_tasks); 2284 atomic_long_add(delta, &calc_load_tasks);
2285 2285
2286 /* 2286 /*
2287 * If we were idle for multiple load cycles, apply them. 2287 * If we were idle for multiple load cycles, apply them.
2288 */ 2288 */
2289 if (ticks >= LOAD_FREQ) { 2289 if (ticks >= LOAD_FREQ) {
2290 n = ticks / LOAD_FREQ; 2290 n = ticks / LOAD_FREQ;
2291 2291
2292 active = atomic_long_read(&calc_load_tasks); 2292 active = atomic_long_read(&calc_load_tasks);
2293 active = active > 0 ? active * FIXED_1 : 0; 2293 active = active > 0 ? active * FIXED_1 : 0;
2294 2294
2295 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); 2295 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
2296 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); 2296 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
2297 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); 2297 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2298 2298
2299 calc_load_update += n * LOAD_FREQ; 2299 calc_load_update += n * LOAD_FREQ;
2300 } 2300 }
2301 2301
2302 /* 2302 /*
 2303 * It's possible that the remainder of the above division also crosses 2303 * It's possible that the remainder of the above division also crosses
 2304 * a LOAD_FREQ period; the regular check in calc_global_load(), 2304 * a LOAD_FREQ period; the regular check in calc_global_load(),
 2305 * which comes after this, will take care of that. 2305 * which comes after this, will take care of that.
 2306 * 2306 *
 2307 * For example, consider us being 11 ticks before a cycle completion and 2307 * For example, consider us being 11 ticks before a cycle completion and
 2308 * sleeping for 4*LOAD_FREQ + 22 ticks; the above code will then 2308 * sleeping for 4*LOAD_FREQ + 22 ticks; the above code will then
 2309 * age us 4 cycles, and the test in calc_global_load() will 2309 * age us 4 cycles, and the test in calc_global_load() will
 2310 * pick up the final one. 2310 * pick up the final one.
2311 */ 2311 */
2312 } 2312 }
2313 #else 2313 #else
2314 void calc_load_account_idle(struct rq *this_rq) 2314 void calc_load_account_idle(struct rq *this_rq)
2315 { 2315 {
2316 } 2316 }
2317 2317
2318 static inline long calc_load_fold_idle(void) 2318 static inline long calc_load_fold_idle(void)
2319 { 2319 {
2320 return 0; 2320 return 0;
2321 } 2321 }
2322 2322
2323 static void calc_global_nohz(unsigned long ticks) 2323 static void calc_global_nohz(unsigned long ticks)
2324 { 2324 {
2325 } 2325 }
2326 #endif 2326 #endif
2327 2327
2328 /** 2328 /**
2329 * get_avenrun - get the load average array 2329 * get_avenrun - get the load average array
2330 * @loads: pointer to dest load array 2330 * @loads: pointer to dest load array
2331 * @offset: offset to add 2331 * @offset: offset to add
2332 * @shift: shift count to shift the result left 2332 * @shift: shift count to shift the result left
2333 * 2333 *
2334 * These values are estimates at best, so no need for locking. 2334 * These values are estimates at best, so no need for locking.
2335 */ 2335 */
2336 void get_avenrun(unsigned long *loads, unsigned long offset, int shift) 2336 void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2337 { 2337 {
2338 loads[0] = (avenrun[0] + offset) << shift; 2338 loads[0] = (avenrun[0] + offset) << shift;
2339 loads[1] = (avenrun[1] + offset) << shift; 2339 loads[1] = (avenrun[1] + offset) << shift;
2340 loads[2] = (avenrun[2] + offset) << shift; 2340 loads[2] = (avenrun[2] + offset) << shift;
2341 } 2341 }
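
For context on how these raw fixed-point values end up as the familiar "0.84 0.44 0.15" strings: fs/proc/loadavg.c in this era passes an offset of FIXED_1/200 (half of one hundredth, for rounding) and formats the result with the LOAD_INT()/LOAD_FRAC() macros from include/linux/sched.h. A userspace sketch of that formatting, with the macros copied by hand and the avenrun values invented for illustration:

/* How a raw avenrun[] value becomes "0.84"-style output; not kernel code.
 * LOAD_INT/LOAD_FRAC and the FIXED_1/200 offset are copied by hand from
 * include/linux/sched.h and fs/proc/loadavg.c of this era.
 */
#include <stdio.h>

#define FSHIFT		11
#define FIXED_1		(1UL << FSHIFT)
#define LOAD_INT(x)	((x) >> FSHIFT)
#define LOAD_FRAC(x)	LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

int main(void)
{
	/* Pretend get_avenrun() handed us these fixed-point values. */
	unsigned long avnrun[3] = { 1720, 901, 311 };
	int i;

	for (i = 0; i < 3; i++) {
		/* same rounding offset /proc/loadavg passes via get_avenrun() */
		unsigned long v = avnrun[i] + FIXED_1 / 200;

		printf("%lu.%02lu ", LOAD_INT(v), LOAD_FRAC(v));
	}
	printf("\n");	/* prints roughly "0.84 0.44 0.15" */
	return 0;
}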
2342 2342
2343 /* 2343 /*
 2344 * calc_global_load - update the avenrun load estimates 10 ticks after the 2344 * calc_global_load - update the avenrun load estimates 10 ticks after the
2345 * CPUs have updated calc_load_tasks. 2345 * CPUs have updated calc_load_tasks.
2346 */ 2346 */
2347 void calc_global_load(unsigned long ticks) 2347 void calc_global_load(unsigned long ticks)
2348 { 2348 {
2349 long active; 2349 long active;
2350 2350
2351 calc_global_nohz(ticks); 2351 calc_global_nohz(ticks);
2352 2352
2353 if (time_before(jiffies, calc_load_update + 10)) 2353 if (time_before(jiffies, calc_load_update + 10))
2354 return; 2354 return;
2355 2355
2356 active = atomic_long_read(&calc_load_tasks); 2356 active = atomic_long_read(&calc_load_tasks);
2357 active = active > 0 ? active * FIXED_1 : 0; 2357 active = active > 0 ? active * FIXED_1 : 0;
2358 2358
2359 avenrun[0] = calc_load(avenrun[0], EXP_1, active); 2359 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
2360 avenrun[1] = calc_load(avenrun[1], EXP_5, active); 2360 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
2361 avenrun[2] = calc_load(avenrun[2], EXP_15, active); 2361 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2362 2362
2363 calc_load_update += LOAD_FREQ; 2363 calc_load_update += LOAD_FREQ;
2364 } 2364 }
2365 2365
2366 /* 2366 /*
 2367 * Called from update_cpu_load_active() to periodically update this CPU's 2367 * Called from update_cpu_load_active() to periodically update this CPU's
2368 * active count. 2368 * active count.
2369 */ 2369 */
2370 static void calc_load_account_active(struct rq *this_rq) 2370 static void calc_load_account_active(struct rq *this_rq)
2371 { 2371 {
2372 long delta; 2372 long delta;
2373 2373
2374 if (time_before(jiffies, this_rq->calc_load_update)) 2374 if (time_before(jiffies, this_rq->calc_load_update))
2375 return; 2375 return;
2376 2376
2377 delta = calc_load_fold_active(this_rq); 2377 delta = calc_load_fold_active(this_rq);
2378 delta += calc_load_fold_idle(); 2378 delta += calc_load_fold_idle();
2379 if (delta) 2379 if (delta)
2380 atomic_long_add(delta, &calc_load_tasks); 2380 atomic_long_add(delta, &calc_load_tasks);
2381 2381
2382 this_rq->calc_load_update += LOAD_FREQ; 2382 this_rq->calc_load_update += LOAD_FREQ;
2383 } 2383 }
2384 2384
2385 /* 2385 /*
2386 * The exact cpuload at various idx values, calculated at every tick would be 2386 * The exact cpuload at various idx values, calculated at every tick would be
2387 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load 2387 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
2388 * 2388 *
 2389 * If a cpu misses updates for n-1 ticks (because it was idle) and the update gets called 2389 * If a cpu misses updates for n-1 ticks (because it was idle) and the update gets called
 2390 * on the nth tick, when the cpu may be busy, then we have: 2390 * on the nth tick, when the cpu may be busy, then we have:
 2391 * load = ((2^idx - 1) / 2^idx)^(n-1) * load 2391 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
 2392 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load 2392 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
2393 * 2393 *
2394 * decay_load_missed() below does efficient calculation of 2394 * decay_load_missed() below does efficient calculation of
2395 * load = ((2^idx - 1) / 2^idx)^(n-1) * load 2395 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
2396 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load 2396 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
2397 * 2397 *
2398 * The calculation is approximated on a 128 point scale. 2398 * The calculation is approximated on a 128 point scale.
2399 * degrade_zero_ticks is the number of ticks after which load at any 2399 * degrade_zero_ticks is the number of ticks after which load at any
2400 * particular idx is approximated to be zero. 2400 * particular idx is approximated to be zero.
2401 * degrade_factor is a precomputed table, a row for each load idx. 2401 * degrade_factor is a precomputed table, a row for each load idx.
2402 * Each column corresponds to degradation factor for a power of two ticks, 2402 * Each column corresponds to degradation factor for a power of two ticks,
2403 * based on 128 point scale. 2403 * based on 128 point scale.
2404 * Example: 2404 * Example:
2405 * row 2, col 3 (=12) says that the degradation at load idx 2 after 2405 * row 2, col 3 (=12) says that the degradation at load idx 2 after
2406 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). 2406 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
2407 * 2407 *
2408 * With this power of 2 load factors, we can degrade the load n times 2408 * With this power of 2 load factors, we can degrade the load n times
2409 * by looking at 1 bits in n and doing as many mult/shift instead of 2409 * by looking at 1 bits in n and doing as many mult/shift instead of
2410 * n mult/shifts needed by the exact degradation. 2410 * n mult/shifts needed by the exact degradation.
2411 */ 2411 */
2412 #define DEGRADE_SHIFT 7 2412 #define DEGRADE_SHIFT 7
2413 static const unsigned char 2413 static const unsigned char
2414 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; 2414 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
2415 static const unsigned char 2415 static const unsigned char
2416 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { 2416 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
2417 {0, 0, 0, 0, 0, 0, 0, 0}, 2417 {0, 0, 0, 0, 0, 0, 0, 0},
2418 {64, 32, 8, 0, 0, 0, 0, 0}, 2418 {64, 32, 8, 0, 0, 0, 0, 0},
2419 {96, 72, 40, 12, 1, 0, 0}, 2419 {96, 72, 40, 12, 1, 0, 0},
2420 {112, 98, 75, 43, 15, 1, 0}, 2420 {112, 98, 75, 43, 15, 1, 0},
2421 {120, 112, 98, 76, 45, 16, 2} }; 2421 {120, 112, 98, 76, 45, 16, 2} };
2422 2422
2423 /* 2423 /*
 2424 * Update cpu_load for any missed ticks due to tickless idle. The backlog 2424 * Update cpu_load for any missed ticks due to tickless idle. The backlog
 2425 * case arises when the CPU was idle, so we just decay the old load without 2425 * case arises when the CPU was idle, so we just decay the old load without
 2426 * adding any new load. 2426 * adding any new load.
2427 */ 2427 */
2428 static unsigned long 2428 static unsigned long
2429 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) 2429 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
2430 { 2430 {
2431 int j = 0; 2431 int j = 0;
2432 2432
2433 if (!missed_updates) 2433 if (!missed_updates)
2434 return load; 2434 return load;
2435 2435
2436 if (missed_updates >= degrade_zero_ticks[idx]) 2436 if (missed_updates >= degrade_zero_ticks[idx])
2437 return 0; 2437 return 0;
2438 2438
2439 if (idx == 1) 2439 if (idx == 1)
2440 return load >> missed_updates; 2440 return load >> missed_updates;
2441 2441
2442 while (missed_updates) { 2442 while (missed_updates) {
2443 if (missed_updates % 2) 2443 if (missed_updates % 2)
2444 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; 2444 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
2445 2445
2446 missed_updates >>= 1; 2446 missed_updates >>= 1;
2447 j++; 2447 j++;
2448 } 2448 }
2449 return load; 2449 return load;
2450 } 2450 }
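
The precomputed entries can be re-derived from the formula in the big comment above: column j of row idx approximates ((2^idx - 1) / 2^idx)^(2^j) on the 128-point scale, so row 2, column 3 is (3/4)^8 * 128 ~= 12.8, stored as 12. A userspace sketch that prints the exact factors next to the table (table copied verbatim from above; build with -lm):

/* Compare the precomputed degrade_factor table with the exact factors
 * ((2^idx - 1) / 2^idx)^(2^j), scaled to 128; not kernel code.
 */
#include <stdio.h>
#include <math.h>

#define DEGRADE_SHIFT		7
#define CPU_LOAD_IDX_MAX	5

static const unsigned char
degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
	{0, 0, 0, 0, 0, 0, 0, 0},
	{64, 32, 8, 0, 0, 0, 0, 0},
	{96, 72, 40, 12, 1, 0, 0},
	{112, 98, 75, 43, 15, 1, 0},
	{120, 112, 98, 76, 45, 16, 2} };

int main(void)
{
	int idx, j;

	for (idx = 1; idx < CPU_LOAD_IDX_MAX; idx++) {
		double base = (double)((1 << idx) - 1) / (1 << idx);

		for (j = 0; j <= DEGRADE_SHIFT; j++) {
			double exact = pow(base, 1 << j) * 128.0;

			printf("idx=%d ticks=%3d table=%3u exact=%6.2f\n",
			       idx, 1 << j, degrade_factor[idx][j], exact);
		}
	}
	return 0;
}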
2451 2451
2452 /* 2452 /*
2453 * Update rq->cpu_load[] statistics. This function is usually called every 2453 * Update rq->cpu_load[] statistics. This function is usually called every
2454 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 2454 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
2455 * every tick. We fix it up based on jiffies. 2455 * every tick. We fix it up based on jiffies.
2456 */ 2456 */
2457 void update_cpu_load(struct rq *this_rq) 2457 void update_cpu_load(struct rq *this_rq)
2458 { 2458 {
2459 unsigned long this_load = this_rq->load.weight; 2459 unsigned long this_load = this_rq->load.weight;
2460 unsigned long curr_jiffies = jiffies; 2460 unsigned long curr_jiffies = jiffies;
2461 unsigned long pending_updates; 2461 unsigned long pending_updates;
2462 int i, scale; 2462 int i, scale;
2463 2463
2464 this_rq->nr_load_updates++; 2464 this_rq->nr_load_updates++;
2465 2465
2466 /* Avoid repeated calls on same jiffy, when moving in and out of idle */ 2466 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
2467 if (curr_jiffies == this_rq->last_load_update_tick) 2467 if (curr_jiffies == this_rq->last_load_update_tick)
2468 return; 2468 return;
2469 2469
2470 pending_updates = curr_jiffies - this_rq->last_load_update_tick; 2470 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2471 this_rq->last_load_update_tick = curr_jiffies; 2471 this_rq->last_load_update_tick = curr_jiffies;
2472 2472
2473 /* Update our load: */ 2473 /* Update our load: */
2474 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ 2474 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
2475 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 2475 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
2476 unsigned long old_load, new_load; 2476 unsigned long old_load, new_load;
2477 2477
2478 /* scale is effectively 1 << i now, and >> i divides by scale */ 2478 /* scale is effectively 1 << i now, and >> i divides by scale */
2479 2479
2480 old_load = this_rq->cpu_load[i]; 2480 old_load = this_rq->cpu_load[i];
2481 old_load = decay_load_missed(old_load, pending_updates - 1, i); 2481 old_load = decay_load_missed(old_load, pending_updates - 1, i);
2482 new_load = this_load; 2482 new_load = this_load;
2483 /* 2483 /*
2484 * Round up the averaging division if load is increasing. This 2484 * Round up the averaging division if load is increasing. This
2485 * prevents us from getting stuck on 9 if the load is 10, for 2485 * prevents us from getting stuck on 9 if the load is 10, for
2486 * example. 2486 * example.
2487 */ 2487 */
2488 if (new_load > old_load) 2488 if (new_load > old_load)
2489 new_load += scale - 1; 2489 new_load += scale - 1;
2490 2490
2491 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; 2491 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
2492 } 2492 }
2493 2493
2494 sched_avg_update(this_rq); 2494 sched_avg_update(this_rq);
2495 } 2495 }
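
The rounding comment inside the loop above is easiest to see with concrete numbers: with a constant instantaneous load of 10 and scale = 2, a plain truncating average climbs 0, 5, 7, 8, 9 and then stays at 9 forever, while adding scale - 1 before the shift lets it reach 10. A minimal sketch of just that arithmetic (not kernel code):

/* Why the averaging division is rounded up when the load is increasing:
 * the "stuck on 9 if the load is 10" case from update_cpu_load().
 */
#include <stdio.h>

static unsigned long step(unsigned long old, unsigned long cur,
			  int idx, int round_up)
{
	unsigned long scale = 1UL << idx;
	unsigned long new_load = cur;

	if (round_up && new_load > old)
		new_load += scale - 1;	/* round the division up */

	return (old * (scale - 1) + new_load) >> idx;
}

int main(void)
{
	unsigned long truncated = 0, rounded = 0;
	int tick;

	/* Constant instantaneous load of 10, tracked at idx = 1 (scale = 2). */
	for (tick = 0; tick < 20; tick++) {
		truncated = step(truncated, 10, 1, 0);
		rounded = step(rounded, 10, 1, 1);
	}

	/* The truncating average stalls at 9; the rounded one reaches 10. */
	printf("truncated=%lu rounded=%lu\n", truncated, rounded);
	return 0;
}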
2496 2496
2497 static void update_cpu_load_active(struct rq *this_rq) 2497 static void update_cpu_load_active(struct rq *this_rq)
2498 { 2498 {
2499 update_cpu_load(this_rq); 2499 update_cpu_load(this_rq);
2500 2500
2501 calc_load_account_active(this_rq); 2501 calc_load_account_active(this_rq);
2502 } 2502 }
2503 2503
2504 #ifdef CONFIG_SMP 2504 #ifdef CONFIG_SMP
2505 2505
2506 /* 2506 /*
2507 * sched_exec - execve() is a valuable balancing opportunity, because at 2507 * sched_exec - execve() is a valuable balancing opportunity, because at
2508 * this point the task has the smallest effective memory and cache footprint. 2508 * this point the task has the smallest effective memory and cache footprint.
2509 */ 2509 */
2510 void sched_exec(void) 2510 void sched_exec(void)
2511 { 2511 {
2512 struct task_struct *p = current; 2512 struct task_struct *p = current;
2513 unsigned long flags; 2513 unsigned long flags;
2514 int dest_cpu; 2514 int dest_cpu;
2515 2515
2516 raw_spin_lock_irqsave(&p->pi_lock, flags); 2516 raw_spin_lock_irqsave(&p->pi_lock, flags);
2517 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); 2517 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
2518 if (dest_cpu == smp_processor_id()) 2518 if (dest_cpu == smp_processor_id())
2519 goto unlock; 2519 goto unlock;
2520 2520
2521 if (likely(cpu_active(dest_cpu))) { 2521 if (likely(cpu_active(dest_cpu))) {
2522 struct migration_arg arg = { p, dest_cpu }; 2522 struct migration_arg arg = { p, dest_cpu };
2523 2523
2524 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2524 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2525 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); 2525 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
2526 return; 2526 return;
2527 } 2527 }
2528 unlock: 2528 unlock:
2529 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2529 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2530 } 2530 }
2531 2531
2532 #endif 2532 #endif
2533 2533
2534 DEFINE_PER_CPU(struct kernel_stat, kstat); 2534 DEFINE_PER_CPU(struct kernel_stat, kstat);
2535 DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); 2535 DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
2536 2536
2537 EXPORT_PER_CPU_SYMBOL(kstat); 2537 EXPORT_PER_CPU_SYMBOL(kstat);
2538 EXPORT_PER_CPU_SYMBOL(kernel_cpustat); 2538 EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
2539 2539
2540 /* 2540 /*
2541 * Return any ns on the sched_clock that have not yet been accounted in 2541 * Return any ns on the sched_clock that have not yet been accounted in
2542 * @p in case that task is currently running. 2542 * @p in case that task is currently running.
2543 * 2543 *
2544 * Called with task_rq_lock() held on @rq. 2544 * Called with task_rq_lock() held on @rq.
2545 */ 2545 */
2546 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) 2546 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2547 { 2547 {
2548 u64 ns = 0; 2548 u64 ns = 0;
2549 2549
2550 if (task_current(rq, p)) { 2550 if (task_current(rq, p)) {
2551 update_rq_clock(rq); 2551 update_rq_clock(rq);
2552 ns = rq->clock_task - p->se.exec_start; 2552 ns = rq->clock_task - p->se.exec_start;
2553 if ((s64)ns < 0) 2553 if ((s64)ns < 0)
2554 ns = 0; 2554 ns = 0;
2555 } 2555 }
2556 2556
2557 return ns; 2557 return ns;
2558 } 2558 }
2559 2559
2560 unsigned long long task_delta_exec(struct task_struct *p) 2560 unsigned long long task_delta_exec(struct task_struct *p)
2561 { 2561 {
2562 unsigned long flags; 2562 unsigned long flags;
2563 struct rq *rq; 2563 struct rq *rq;
2564 u64 ns = 0; 2564 u64 ns = 0;
2565 2565
2566 rq = task_rq_lock(p, &flags); 2566 rq = task_rq_lock(p, &flags);
2567 ns = do_task_delta_exec(p, rq); 2567 ns = do_task_delta_exec(p, rq);
2568 task_rq_unlock(rq, p, &flags); 2568 task_rq_unlock(rq, p, &flags);
2569 2569
2570 return ns; 2570 return ns;
2571 } 2571 }
2572 2572
2573 /* 2573 /*
2574 * Return accounted runtime for the task. 2574 * Return accounted runtime for the task.
2575 * In case the task is currently running, return the runtime plus current's 2575 * In case the task is currently running, return the runtime plus current's
2576 * pending runtime that have not been accounted yet. 2576 * pending runtime that have not been accounted yet.
2577 */ 2577 */
2578 unsigned long long task_sched_runtime(struct task_struct *p) 2578 unsigned long long task_sched_runtime(struct task_struct *p)
2579 { 2579 {
2580 unsigned long flags; 2580 unsigned long flags;
2581 struct rq *rq; 2581 struct rq *rq;
2582 u64 ns = 0; 2582 u64 ns = 0;
2583 2583
2584 rq = task_rq_lock(p, &flags); 2584 rq = task_rq_lock(p, &flags);
2585 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 2585 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
2586 task_rq_unlock(rq, p, &flags); 2586 task_rq_unlock(rq, p, &flags);
2587 2587
2588 return ns; 2588 return ns;
2589 } 2589 }
2590 2590
2591 #ifdef CONFIG_CGROUP_CPUACCT 2591 #ifdef CONFIG_CGROUP_CPUACCT
2592 struct cgroup_subsys cpuacct_subsys; 2592 struct cgroup_subsys cpuacct_subsys;
2593 struct cpuacct root_cpuacct; 2593 struct cpuacct root_cpuacct;
2594 #endif 2594 #endif
2595 2595
2596 static inline void task_group_account_field(struct task_struct *p, int index, 2596 static inline void task_group_account_field(struct task_struct *p, int index,
2597 u64 tmp) 2597 u64 tmp)
2598 { 2598 {
2599 #ifdef CONFIG_CGROUP_CPUACCT 2599 #ifdef CONFIG_CGROUP_CPUACCT
2600 struct kernel_cpustat *kcpustat; 2600 struct kernel_cpustat *kcpustat;
2601 struct cpuacct *ca; 2601 struct cpuacct *ca;
2602 #endif 2602 #endif
2603 /* 2603 /*
2604 * Since all updates are sure to touch the root cgroup, we 2604 * Since all updates are sure to touch the root cgroup, we
2605 * get ourselves ahead and touch it first. If the root cgroup 2605 * get ourselves ahead and touch it first. If the root cgroup
2606 * is the only cgroup, then nothing else should be necessary. 2606 * is the only cgroup, then nothing else should be necessary.
2607 * 2607 *
2608 */ 2608 */
2609 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; 2609 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
2610 2610
2611 #ifdef CONFIG_CGROUP_CPUACCT 2611 #ifdef CONFIG_CGROUP_CPUACCT
2612 if (unlikely(!cpuacct_subsys.active)) 2612 if (unlikely(!cpuacct_subsys.active))
2613 return; 2613 return;
2614 2614
2615 rcu_read_lock(); 2615 rcu_read_lock();
2616 ca = task_ca(p); 2616 ca = task_ca(p);
2617 while (ca && (ca != &root_cpuacct)) { 2617 while (ca && (ca != &root_cpuacct)) {
2618 kcpustat = this_cpu_ptr(ca->cpustat); 2618 kcpustat = this_cpu_ptr(ca->cpustat);
2619 kcpustat->cpustat[index] += tmp; 2619 kcpustat->cpustat[index] += tmp;
2620 ca = parent_ca(ca); 2620 ca = parent_ca(ca);
2621 } 2621 }
2622 rcu_read_unlock(); 2622 rcu_read_unlock();
2623 #endif 2623 #endif
2624 } 2624 }
2625 2625
2626 2626
2627 /* 2627 /*
2628 * Account user cpu time to a process. 2628 * Account user cpu time to a process.
2629 * @p: the process that the cpu time gets accounted to 2629 * @p: the process that the cpu time gets accounted to
2630 * @cputime: the cpu time spent in user space since the last update 2630 * @cputime: the cpu time spent in user space since the last update
2631 * @cputime_scaled: cputime scaled by cpu frequency 2631 * @cputime_scaled: cputime scaled by cpu frequency
2632 */ 2632 */
2633 void account_user_time(struct task_struct *p, cputime_t cputime, 2633 void account_user_time(struct task_struct *p, cputime_t cputime,
2634 cputime_t cputime_scaled) 2634 cputime_t cputime_scaled)
2635 { 2635 {
2636 int index; 2636 int index;
2637 2637
2638 /* Add user time to process. */ 2638 /* Add user time to process. */
2639 p->utime += cputime; 2639 p->utime += cputime;
2640 p->utimescaled += cputime_scaled; 2640 p->utimescaled += cputime_scaled;
2641 account_group_user_time(p, cputime); 2641 account_group_user_time(p, cputime);
2642 2642
2643 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; 2643 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
2644 2644
2645 /* Add user time to cpustat. */ 2645 /* Add user time to cpustat. */
2646 task_group_account_field(p, index, (__force u64) cputime); 2646 task_group_account_field(p, index, (__force u64) cputime);
2647 2647
2648 /* Account for user time used */ 2648 /* Account for user time used */
2649 acct_update_integrals(p); 2649 acct_update_integrals(p);
2650 } 2650 }
2651 2651
2652 /* 2652 /*
2653 * Account guest cpu time to a process. 2653 * Account guest cpu time to a process.
2654 * @p: the process that the cpu time gets accounted to 2654 * @p: the process that the cpu time gets accounted to
2655 * @cputime: the cpu time spent in virtual machine since the last update 2655 * @cputime: the cpu time spent in virtual machine since the last update
2656 * @cputime_scaled: cputime scaled by cpu frequency 2656 * @cputime_scaled: cputime scaled by cpu frequency
2657 */ 2657 */
2658 static void account_guest_time(struct task_struct *p, cputime_t cputime, 2658 static void account_guest_time(struct task_struct *p, cputime_t cputime,
2659 cputime_t cputime_scaled) 2659 cputime_t cputime_scaled)
2660 { 2660 {
2661 u64 *cpustat = kcpustat_this_cpu->cpustat; 2661 u64 *cpustat = kcpustat_this_cpu->cpustat;
2662 2662
2663 /* Add guest time to process. */ 2663 /* Add guest time to process. */
2664 p->utime += cputime; 2664 p->utime += cputime;
2665 p->utimescaled += cputime_scaled; 2665 p->utimescaled += cputime_scaled;
2666 account_group_user_time(p, cputime); 2666 account_group_user_time(p, cputime);
2667 p->gtime += cputime; 2667 p->gtime += cputime;
2668 2668
2669 /* Add guest time to cpustat. */ 2669 /* Add guest time to cpustat. */
2670 if (TASK_NICE(p) > 0) { 2670 if (TASK_NICE(p) > 0) {
2671 cpustat[CPUTIME_NICE] += (__force u64) cputime; 2671 cpustat[CPUTIME_NICE] += (__force u64) cputime;
2672 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; 2672 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
2673 } else { 2673 } else {
2674 cpustat[CPUTIME_USER] += (__force u64) cputime; 2674 cpustat[CPUTIME_USER] += (__force u64) cputime;
2675 cpustat[CPUTIME_GUEST] += (__force u64) cputime; 2675 cpustat[CPUTIME_GUEST] += (__force u64) cputime;
2676 } 2676 }
2677 } 2677 }
2678 2678
2679 /* 2679 /*
2680 * Account system cpu time to a process and desired cpustat field 2680 * Account system cpu time to a process and desired cpustat field
2681 * @p: the process that the cpu time gets accounted to 2681 * @p: the process that the cpu time gets accounted to
2682 * @cputime: the cpu time spent in kernel space since the last update 2682 * @cputime: the cpu time spent in kernel space since the last update
2683 * @cputime_scaled: cputime scaled by cpu frequency 2683 * @cputime_scaled: cputime scaled by cpu frequency
 2684 * @index: index of the cpustat field that has to be updated 2684 * @index: index of the cpustat field that has to be updated
2685 */ 2685 */
2686 static inline 2686 static inline
2687 void __account_system_time(struct task_struct *p, cputime_t cputime, 2687 void __account_system_time(struct task_struct *p, cputime_t cputime,
2688 cputime_t cputime_scaled, int index) 2688 cputime_t cputime_scaled, int index)
2689 { 2689 {
2690 /* Add system time to process. */ 2690 /* Add system time to process. */
2691 p->stime += cputime; 2691 p->stime += cputime;
2692 p->stimescaled += cputime_scaled; 2692 p->stimescaled += cputime_scaled;
2693 account_group_system_time(p, cputime); 2693 account_group_system_time(p, cputime);
2694 2694
2695 /* Add system time to cpustat. */ 2695 /* Add system time to cpustat. */
2696 task_group_account_field(p, index, (__force u64) cputime); 2696 task_group_account_field(p, index, (__force u64) cputime);
2697 2697
2698 /* Account for system time used */ 2698 /* Account for system time used */
2699 acct_update_integrals(p); 2699 acct_update_integrals(p);
2700 } 2700 }
2701 2701
2702 /* 2702 /*
2703 * Account system cpu time to a process. 2703 * Account system cpu time to a process.
2704 * @p: the process that the cpu time gets accounted to 2704 * @p: the process that the cpu time gets accounted to
2705 * @hardirq_offset: the offset to subtract from hardirq_count() 2705 * @hardirq_offset: the offset to subtract from hardirq_count()
2706 * @cputime: the cpu time spent in kernel space since the last update 2706 * @cputime: the cpu time spent in kernel space since the last update
2707 * @cputime_scaled: cputime scaled by cpu frequency 2707 * @cputime_scaled: cputime scaled by cpu frequency
2708 */ 2708 */
2709 void account_system_time(struct task_struct *p, int hardirq_offset, 2709 void account_system_time(struct task_struct *p, int hardirq_offset,
2710 cputime_t cputime, cputime_t cputime_scaled) 2710 cputime_t cputime, cputime_t cputime_scaled)
2711 { 2711 {
2712 int index; 2712 int index;
2713 2713
2714 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 2714 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
2715 account_guest_time(p, cputime, cputime_scaled); 2715 account_guest_time(p, cputime, cputime_scaled);
2716 return; 2716 return;
2717 } 2717 }
2718 2718
2719 if (hardirq_count() - hardirq_offset) 2719 if (hardirq_count() - hardirq_offset)
2720 index = CPUTIME_IRQ; 2720 index = CPUTIME_IRQ;
2721 else if (in_serving_softirq()) 2721 else if (in_serving_softirq())
2722 index = CPUTIME_SOFTIRQ; 2722 index = CPUTIME_SOFTIRQ;
2723 else 2723 else
2724 index = CPUTIME_SYSTEM; 2724 index = CPUTIME_SYSTEM;
2725 2725
2726 __account_system_time(p, cputime, cputime_scaled, index); 2726 __account_system_time(p, cputime, cputime_scaled, index);
2727 } 2727 }
2728 2728
2729 /* 2729 /*
2730 * Account for involuntary wait time. 2730 * Account for involuntary wait time.
2731 * @cputime: the cpu time spent in involuntary wait 2731 * @cputime: the cpu time spent in involuntary wait
2732 */ 2732 */
2733 void account_steal_time(cputime_t cputime) 2733 void account_steal_time(cputime_t cputime)
2734 { 2734 {
2735 u64 *cpustat = kcpustat_this_cpu->cpustat; 2735 u64 *cpustat = kcpustat_this_cpu->cpustat;
2736 2736
2737 cpustat[CPUTIME_STEAL] += (__force u64) cputime; 2737 cpustat[CPUTIME_STEAL] += (__force u64) cputime;
2738 } 2738 }
2739 2739
2740 /* 2740 /*
2741 * Account for idle time. 2741 * Account for idle time.
2742 * @cputime: the cpu time spent in idle wait 2742 * @cputime: the cpu time spent in idle wait
2743 */ 2743 */
2744 void account_idle_time(cputime_t cputime) 2744 void account_idle_time(cputime_t cputime)
2745 { 2745 {
2746 u64 *cpustat = kcpustat_this_cpu->cpustat; 2746 u64 *cpustat = kcpustat_this_cpu->cpustat;
2747 struct rq *rq = this_rq(); 2747 struct rq *rq = this_rq();
2748 2748
2749 if (atomic_read(&rq->nr_iowait) > 0) 2749 if (atomic_read(&rq->nr_iowait) > 0)
2750 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; 2750 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
2751 else 2751 else
2752 cpustat[CPUTIME_IDLE] += (__force u64) cputime; 2752 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
2753 } 2753 }
2754 2754
2755 static __always_inline bool steal_account_process_tick(void) 2755 static __always_inline bool steal_account_process_tick(void)
2756 { 2756 {
2757 #ifdef CONFIG_PARAVIRT 2757 #ifdef CONFIG_PARAVIRT
2758 if (static_branch(&paravirt_steal_enabled)) { 2758 if (static_branch(&paravirt_steal_enabled)) {
2759 u64 steal, st = 0; 2759 u64 steal, st = 0;
2760 2760
2761 steal = paravirt_steal_clock(smp_processor_id()); 2761 steal = paravirt_steal_clock(smp_processor_id());
2762 steal -= this_rq()->prev_steal_time; 2762 steal -= this_rq()->prev_steal_time;
2763 2763
2764 st = steal_ticks(steal); 2764 st = steal_ticks(steal);
2765 this_rq()->prev_steal_time += st * TICK_NSEC; 2765 this_rq()->prev_steal_time += st * TICK_NSEC;
2766 2766
2767 account_steal_time(st); 2767 account_steal_time(st);
2768 return st; 2768 return st;
2769 } 2769 }
2770 #endif 2770 #endif
2771 return false; 2771 return false;
2772 } 2772 }
2773 2773
2774 #ifndef CONFIG_VIRT_CPU_ACCOUNTING 2774 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
2775 2775
2776 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 2776 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
2777 /* 2777 /*
2778 * Account a tick to a process and cpustat 2778 * Account a tick to a process and cpustat
2779 * @p: the process that the cpu time gets accounted to 2779 * @p: the process that the cpu time gets accounted to
2780 * @user_tick: is the tick from userspace 2780 * @user_tick: is the tick from userspace
2781 * @rq: the pointer to rq 2781 * @rq: the pointer to rq
2782 * 2782 *
2783 * Tick demultiplexing follows the order 2783 * Tick demultiplexing follows the order
2784 * - pending hardirq update 2784 * - pending hardirq update
2785 * - pending softirq update 2785 * - pending softirq update
2786 * - user_time 2786 * - user_time
2787 * - idle_time 2787 * - idle_time
2788 * - system time 2788 * - system time
2789 * - check for guest_time 2789 * - check for guest_time
2790 * - else account as system_time 2790 * - else account as system_time
2791 * 2791 *
 2792 * The check for hardirq is done for both system and user time, as there is 2792 * The check for hardirq is done for both system and user time, as there is
 2793 * no timer going off while we are on hardirq and hence we may never get an 2793 * no timer going off while we are on hardirq and hence we may never get an
 2794 * opportunity to update it solely in system time. 2794 * opportunity to update it solely in system time.
 2795 * p->stime and friends are only updated on system time and not on irq or 2795 * p->stime and friends are only updated on system time and not on irq or
 2796 * softirq time, as those no longer count in the task's exec_runtime. 2796 * softirq time, as those no longer count in the task's exec_runtime.
2797 */ 2797 */
2798 static void irqtime_account_process_tick(struct task_struct *p, int user_tick, 2798 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
2799 struct rq *rq) 2799 struct rq *rq)
2800 { 2800 {
2801 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 2801 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
2802 u64 *cpustat = kcpustat_this_cpu->cpustat; 2802 u64 *cpustat = kcpustat_this_cpu->cpustat;
2803 2803
2804 if (steal_account_process_tick()) 2804 if (steal_account_process_tick())
2805 return; 2805 return;
2806 2806
2807 if (irqtime_account_hi_update()) { 2807 if (irqtime_account_hi_update()) {
2808 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; 2808 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
2809 } else if (irqtime_account_si_update()) { 2809 } else if (irqtime_account_si_update()) {
2810 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; 2810 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
2811 } else if (this_cpu_ksoftirqd() == p) { 2811 } else if (this_cpu_ksoftirqd() == p) {
2812 /* 2812 /*
 2813 * ksoftirqd time does not get accounted in cpu_softirq_time. 2813 * ksoftirqd time does not get accounted in cpu_softirq_time.
2814 * So, we have to handle it separately here. 2814 * So, we have to handle it separately here.
2815 * Also, p->stime needs to be updated for ksoftirqd. 2815 * Also, p->stime needs to be updated for ksoftirqd.
2816 */ 2816 */
2817 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 2817 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
2818 CPUTIME_SOFTIRQ); 2818 CPUTIME_SOFTIRQ);
2819 } else if (user_tick) { 2819 } else if (user_tick) {
2820 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 2820 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
2821 } else if (p == rq->idle) { 2821 } else if (p == rq->idle) {
2822 account_idle_time(cputime_one_jiffy); 2822 account_idle_time(cputime_one_jiffy);
2823 } else if (p->flags & PF_VCPU) { /* System time or guest time */ 2823 } else if (p->flags & PF_VCPU) { /* System time or guest time */
2824 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); 2824 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
2825 } else { 2825 } else {
2826 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 2826 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
2827 CPUTIME_SYSTEM); 2827 CPUTIME_SYSTEM);
2828 } 2828 }
2829 } 2829 }
2830 2830
2831 static void irqtime_account_idle_ticks(int ticks) 2831 static void irqtime_account_idle_ticks(int ticks)
2832 { 2832 {
2833 int i; 2833 int i;
2834 struct rq *rq = this_rq(); 2834 struct rq *rq = this_rq();
2835 2835
2836 for (i = 0; i < ticks; i++) 2836 for (i = 0; i < ticks; i++)
2837 irqtime_account_process_tick(current, 0, rq); 2837 irqtime_account_process_tick(current, 0, rq);
2838 } 2838 }
2839 #else /* CONFIG_IRQ_TIME_ACCOUNTING */ 2839 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
2840 static void irqtime_account_idle_ticks(int ticks) {} 2840 static void irqtime_account_idle_ticks(int ticks) {}
2841 static void irqtime_account_process_tick(struct task_struct *p, int user_tick, 2841 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
2842 struct rq *rq) {} 2842 struct rq *rq) {}
2843 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 2843 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
2844 2844
2845 /* 2845 /*
2846 * Account a single tick of cpu time. 2846 * Account a single tick of cpu time.
2847 * @p: the process that the cpu time gets accounted to 2847 * @p: the process that the cpu time gets accounted to
2848 * @user_tick: indicates if the tick is a user or a system tick 2848 * @user_tick: indicates if the tick is a user or a system tick
2849 */ 2849 */
2850 void account_process_tick(struct task_struct *p, int user_tick) 2850 void account_process_tick(struct task_struct *p, int user_tick)
2851 { 2851 {
2852 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 2852 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
2853 struct rq *rq = this_rq(); 2853 struct rq *rq = this_rq();
2854 2854
2855 if (sched_clock_irqtime) { 2855 if (sched_clock_irqtime) {
2856 irqtime_account_process_tick(p, user_tick, rq); 2856 irqtime_account_process_tick(p, user_tick, rq);
2857 return; 2857 return;
2858 } 2858 }
2859 2859
2860 if (steal_account_process_tick()) 2860 if (steal_account_process_tick())
2861 return; 2861 return;
2862 2862
2863 if (user_tick) 2863 if (user_tick)
2864 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 2864 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
2865 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 2865 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
2866 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, 2866 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
2867 one_jiffy_scaled); 2867 one_jiffy_scaled);
2868 else 2868 else
2869 account_idle_time(cputime_one_jiffy); 2869 account_idle_time(cputime_one_jiffy);
2870 } 2870 }
2871 2871
2872 /* 2872 /*
2873 * Account multiple ticks of steal time. 2873 * Account multiple ticks of steal time.
2875 * @ticks: number of stolen ticks 2875 * @ticks: number of stolen ticks
2876 */ 2876 */
2877 void account_steal_ticks(unsigned long ticks) 2877 void account_steal_ticks(unsigned long ticks)
2878 { 2878 {
2879 account_steal_time(jiffies_to_cputime(ticks)); 2879 account_steal_time(jiffies_to_cputime(ticks));
2880 } 2880 }
2881 2881
2882 /* 2882 /*
2883 * Account multiple ticks of idle time. 2883 * Account multiple ticks of idle time.
 2884 * @ticks: number of idle ticks 2884 * @ticks: number of idle ticks
2885 */ 2885 */
2886 void account_idle_ticks(unsigned long ticks) 2886 void account_idle_ticks(unsigned long ticks)
2887 { 2887 {
2888 2888
2889 if (sched_clock_irqtime) { 2889 if (sched_clock_irqtime) {
2890 irqtime_account_idle_ticks(ticks); 2890 irqtime_account_idle_ticks(ticks);
2891 return; 2891 return;
2892 } 2892 }
2893 2893
2894 account_idle_time(jiffies_to_cputime(ticks)); 2894 account_idle_time(jiffies_to_cputime(ticks));
2895 } 2895 }
2896 2896
2897 #endif 2897 #endif
2898 2898
2899 /* 2899 /*
2900 * Use precise platform statistics if available: 2900 * Use precise platform statistics if available:
2901 */ 2901 */
2902 #ifdef CONFIG_VIRT_CPU_ACCOUNTING 2902 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
2903 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 2903 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
2904 { 2904 {
2905 *ut = p->utime; 2905 *ut = p->utime;
2906 *st = p->stime; 2906 *st = p->stime;
2907 } 2907 }
2908 2908
2909 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 2909 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
2910 { 2910 {
2911 struct task_cputime cputime; 2911 struct task_cputime cputime;
2912 2912
2913 thread_group_cputime(p, &cputime); 2913 thread_group_cputime(p, &cputime);
2914 2914
2915 *ut = cputime.utime; 2915 *ut = cputime.utime;
2916 *st = cputime.stime; 2916 *st = cputime.stime;
2917 } 2917 }
2918 #else 2918 #else
2919 2919
2920 #ifndef nsecs_to_cputime 2920 #ifndef nsecs_to_cputime
2921 # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) 2921 # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
2922 #endif 2922 #endif
2923 2923
2924 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 2924 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
2925 { 2925 {
2926 cputime_t rtime, utime = p->utime, total = utime + p->stime; 2926 cputime_t rtime, utime = p->utime, total = utime + p->stime;
2927 2927
2928 /* 2928 /*
2929 * Use CFS's precise accounting: 2929 * Use CFS's precise accounting:
2930 */ 2930 */
2931 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 2931 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
2932 2932
2933 if (total) { 2933 if (total) {
2934 u64 temp = (__force u64) rtime; 2934 u64 temp = (__force u64) rtime;
2935 2935
2936 temp *= (__force u64) utime; 2936 temp *= (__force u64) utime;
2937 do_div(temp, (__force u32) total); 2937 do_div(temp, (__force u32) total);
2938 utime = (__force cputime_t) temp; 2938 utime = (__force cputime_t) temp;
2939 } else 2939 } else
2940 utime = rtime; 2940 utime = rtime;
2941 2941
2942 /* 2942 /*
2943 * Compare with previous values, to keep monotonicity: 2943 * Compare with previous values, to keep monotonicity:
2944 */ 2944 */
2945 p->prev_utime = max(p->prev_utime, utime); 2945 p->prev_utime = max(p->prev_utime, utime);
2946 p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); 2946 p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
2947 2947
2948 *ut = p->prev_utime; 2948 *ut = p->prev_utime;
2949 *st = p->prev_stime; 2949 *st = p->prev_stime;
2950 } 2950 }
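The arithmetic above splits the precise CFS runtime (rtime) between user and system in the same proportion as the sampled utime/stime, and prev_utime/prev_stime only ever grow, so the reported values stay monotonic. A small user-space style sketch of the split, for illustration only:

#include <stdint.h>

/* Hedged sketch: scale rtime by the sampled utime:total ratio. */
static uint64_t example_scale_utime(uint64_t rtime, uint64_t utime,
				    uint64_t total)
{
	return total ? (rtime * utime) / total : rtime;
}

/* e.g. sampled utime=30, stime=10 (total=40), precise rtime=48:
 * example_scale_utime(48, 30, 40) == 36, and stime becomes 48 - 36 = 12. */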
2951 2951
2952 /* 2952 /*
2953 * Must be called with siglock held. 2953 * Must be called with siglock held.
2954 */ 2954 */
2955 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 2955 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
2956 { 2956 {
2957 struct signal_struct *sig = p->signal; 2957 struct signal_struct *sig = p->signal;
2958 struct task_cputime cputime; 2958 struct task_cputime cputime;
2959 cputime_t rtime, utime, total; 2959 cputime_t rtime, utime, total;
2960 2960
2961 thread_group_cputime(p, &cputime); 2961 thread_group_cputime(p, &cputime);
2962 2962
2963 total = cputime.utime + cputime.stime; 2963 total = cputime.utime + cputime.stime;
2964 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 2964 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
2965 2965
2966 if (total) { 2966 if (total) {
2967 u64 temp = (__force u64) rtime; 2967 u64 temp = (__force u64) rtime;
2968 2968
2969 temp *= (__force u64) cputime.utime; 2969 temp *= (__force u64) cputime.utime;
2970 do_div(temp, (__force u32) total); 2970 do_div(temp, (__force u32) total);
2971 utime = (__force cputime_t) temp; 2971 utime = (__force cputime_t) temp;
2972 } else 2972 } else
2973 utime = rtime; 2973 utime = rtime;
2974 2974
2975 sig->prev_utime = max(sig->prev_utime, utime); 2975 sig->prev_utime = max(sig->prev_utime, utime);
2976 sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); 2976 sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
2977 2977
2978 *ut = sig->prev_utime; 2978 *ut = sig->prev_utime;
2979 *st = sig->prev_stime; 2979 *st = sig->prev_stime;
2980 } 2980 }
2981 #endif 2981 #endif
2982 2982
2983 /* 2983 /*
2984 * This function gets called by the timer code, with HZ frequency. 2984 * This function gets called by the timer code, with HZ frequency.
2985 * We call it with interrupts disabled. 2985 * We call it with interrupts disabled.
2986 */ 2986 */
2987 void scheduler_tick(void) 2987 void scheduler_tick(void)
2988 { 2988 {
2989 int cpu = smp_processor_id(); 2989 int cpu = smp_processor_id();
2990 struct rq *rq = cpu_rq(cpu); 2990 struct rq *rq = cpu_rq(cpu);
2991 struct task_struct *curr = rq->curr; 2991 struct task_struct *curr = rq->curr;
2992 2992
2993 sched_clock_tick(); 2993 sched_clock_tick();
2994 2994
2995 raw_spin_lock(&rq->lock); 2995 raw_spin_lock(&rq->lock);
2996 update_rq_clock(rq); 2996 update_rq_clock(rq);
2997 update_cpu_load_active(rq); 2997 update_cpu_load_active(rq);
2998 curr->sched_class->task_tick(rq, curr, 0); 2998 curr->sched_class->task_tick(rq, curr, 0);
2999 raw_spin_unlock(&rq->lock); 2999 raw_spin_unlock(&rq->lock);
3000 3000
3001 perf_event_task_tick(); 3001 perf_event_task_tick();
3002 3002
3003 #ifdef CONFIG_SMP 3003 #ifdef CONFIG_SMP
3004 rq->idle_balance = idle_cpu(cpu); 3004 rq->idle_balance = idle_cpu(cpu);
3005 trigger_load_balance(rq, cpu); 3005 trigger_load_balance(rq, cpu);
3006 #endif 3006 #endif
3007 } 3007 }
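For context, scheduler_tick() runs once per jiffy per CPU from the timer interrupt, with interrupts already disabled, and the per-class work is delegated through the task_tick() hook. A condensed, hedged view of the call chain in this era's kernels:

/*
 * Hedged call-chain sketch:
 *
 *   timer interrupt
 *     -> update_process_times(user_mode(regs))
 *          -> account_process_tick()
 *          -> scheduler_tick()
 *               -> curr->sched_class->task_tick(rq, curr, 0)
 */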
3008 3008
3009 notrace unsigned long get_parent_ip(unsigned long addr) 3009 notrace unsigned long get_parent_ip(unsigned long addr)
3010 { 3010 {
3011 if (in_lock_functions(addr)) { 3011 if (in_lock_functions(addr)) {
3012 addr = CALLER_ADDR2; 3012 addr = CALLER_ADDR2;
3013 if (in_lock_functions(addr)) 3013 if (in_lock_functions(addr))
3014 addr = CALLER_ADDR3; 3014 addr = CALLER_ADDR3;
3015 } 3015 }
3016 return addr; 3016 return addr;
3017 } 3017 }
3018 3018
3019 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 3019 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
3020 defined(CONFIG_PREEMPT_TRACER)) 3020 defined(CONFIG_PREEMPT_TRACER))
3021 3021
3022 void __kprobes add_preempt_count(int val) 3022 void __kprobes add_preempt_count(int val)
3023 { 3023 {
3024 #ifdef CONFIG_DEBUG_PREEMPT 3024 #ifdef CONFIG_DEBUG_PREEMPT
3025 /* 3025 /*
3026 * Underflow? 3026 * Underflow?
3027 */ 3027 */
3028 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 3028 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3029 return; 3029 return;
3030 #endif 3030 #endif
3031 preempt_count() += val; 3031 preempt_count() += val;
3032 #ifdef CONFIG_DEBUG_PREEMPT 3032 #ifdef CONFIG_DEBUG_PREEMPT
3033 /* 3033 /*
3034 * Spinlock count overflowing soon? 3034 * Spinlock count overflowing soon?
3035 */ 3035 */
3036 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 3036 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3037 PREEMPT_MASK - 10); 3037 PREEMPT_MASK - 10);
3038 #endif 3038 #endif
3039 if (preempt_count() == val) 3039 if (preempt_count() == val)
3040 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 3040 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
3041 } 3041 }
3042 EXPORT_SYMBOL(add_preempt_count); 3042 EXPORT_SYMBOL(add_preempt_count);
3043 3043
3044 void __kprobes sub_preempt_count(int val) 3044 void __kprobes sub_preempt_count(int val)
3045 { 3045 {
3046 #ifdef CONFIG_DEBUG_PREEMPT 3046 #ifdef CONFIG_DEBUG_PREEMPT
3047 /* 3047 /*
3048 * Underflow? 3048 * Underflow?
3049 */ 3049 */
3050 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 3050 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3051 return; 3051 return;
3052 /* 3052 /*
3053 * Is the spinlock portion underflowing? 3053 * Is the spinlock portion underflowing?
3054 */ 3054 */
3055 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 3055 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3056 !(preempt_count() & PREEMPT_MASK))) 3056 !(preempt_count() & PREEMPT_MASK)))
3057 return; 3057 return;
3058 #endif 3058 #endif
3059 3059
3060 if (preempt_count() == val) 3060 if (preempt_count() == val)
3061 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 3061 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
3062 preempt_count() -= val; 3062 preempt_count() -= val;
3063 } 3063 }
3064 EXPORT_SYMBOL(sub_preempt_count); 3064 EXPORT_SYMBOL(sub_preempt_count);
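These two helpers are what the generic preempt_disable()/preempt_enable() macros end up calling when CONFIG_DEBUG_PREEMPT or the preempt-off tracer is built in; that is where the underflow checks and the trace_preempt_off()/trace_preempt_on() hooks above come from. A hedged sketch of the mapping (see include/linux/preempt.h for the exact macros):

static void example_no_migrate_section(void)
{
	preempt_disable();	/* -> add_preempt_count(1), then barrier() */

	/* per-CPU work that must not be preempted or migrated */

	preempt_enable();	/* barrier(), sub_preempt_count(1),
				 * then a need_resched() check            */
}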
3065 3065
3066 #endif 3066 #endif
3067 3067
3068 /* 3068 /*
3069 * Print scheduling while atomic bug: 3069 * Print scheduling while atomic bug:
3070 */ 3070 */
3071 static noinline void __schedule_bug(struct task_struct *prev) 3071 static noinline void __schedule_bug(struct task_struct *prev)
3072 { 3072 {
3073 struct pt_regs *regs = get_irq_regs(); 3073 struct pt_regs *regs = get_irq_regs();
3074 3074
3075 if (oops_in_progress) 3075 if (oops_in_progress)
3076 return; 3076 return;
3077 3077
3078 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 3078 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
3079 prev->comm, prev->pid, preempt_count()); 3079 prev->comm, prev->pid, preempt_count());
3080 3080
3081 debug_show_held_locks(prev); 3081 debug_show_held_locks(prev);
3082 print_modules(); 3082 print_modules();
3083 if (irqs_disabled()) 3083 if (irqs_disabled())
3084 print_irqtrace_events(prev); 3084 print_irqtrace_events(prev);
3085 3085
3086 if (regs) 3086 if (regs)
3087 show_regs(regs); 3087 show_regs(regs);
3088 else 3088 else
3089 dump_stack(); 3089 dump_stack();
3090 } 3090 }
3091 3091
3092 /* 3092 /*
3093 * Various schedule()-time debugging checks and statistics: 3093 * Various schedule()-time debugging checks and statistics:
3094 */ 3094 */
3095 static inline void schedule_debug(struct task_struct *prev) 3095 static inline void schedule_debug(struct task_struct *prev)
3096 { 3096 {
3097 /* 3097 /*
3098 * Test if we are atomic. Since do_exit() needs to call into 3098 * Test if we are atomic. Since do_exit() needs to call into
3099 * schedule() atomically, we ignore that path for now. 3099 * schedule() atomically, we ignore that path for now.
3100 * Otherwise, whine if we are scheduling when we should not be. 3100 * Otherwise, whine if we are scheduling when we should not be.
3101 */ 3101 */
3102 if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 3102 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
3103 __schedule_bug(prev); 3103 __schedule_bug(prev);
3104 rcu_sleep_check(); 3104 rcu_sleep_check();
3105 3105
3106 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3106 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3107 3107
3108 schedstat_inc(this_rq(), sched_count); 3108 schedstat_inc(this_rq(), sched_count);
3109 } 3109 }
3110 3110
3111 static void put_prev_task(struct rq *rq, struct task_struct *prev) 3111 static void put_prev_task(struct rq *rq, struct task_struct *prev)
3112 { 3112 {
3113 if (prev->on_rq || rq->skip_clock_update < 0) 3113 if (prev->on_rq || rq->skip_clock_update < 0)
3114 update_rq_clock(rq); 3114 update_rq_clock(rq);
3115 prev->sched_class->put_prev_task(rq, prev); 3115 prev->sched_class->put_prev_task(rq, prev);
3116 } 3116 }
3117 3117
3118 /* 3118 /*
3119 * Pick up the highest-prio task: 3119 * Pick up the highest-prio task:
3120 */ 3120 */
3121 static inline struct task_struct * 3121 static inline struct task_struct *
3122 pick_next_task(struct rq *rq) 3122 pick_next_task(struct rq *rq)
3123 { 3123 {
3124 const struct sched_class *class; 3124 const struct sched_class *class;
3125 struct task_struct *p; 3125 struct task_struct *p;
3126 3126
3127 /* 3127 /*
3128 * Optimization: we know that if all tasks are in 3128 * Optimization: we know that if all tasks are in
3129 * the fair class we can call that function directly: 3129 * the fair class we can call that function directly:
3130 */ 3130 */
3131 if (likely(rq->nr_running == rq->cfs.h_nr_running)) { 3131 if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
3132 p = fair_sched_class.pick_next_task(rq); 3132 p = fair_sched_class.pick_next_task(rq);
3133 if (likely(p)) 3133 if (likely(p))
3134 return p; 3134 return p;
3135 } 3135 }
3136 3136
3137 for_each_class(class) { 3137 for_each_class(class) {
3138 p = class->pick_next_task(rq); 3138 p = class->pick_next_task(rq);
3139 if (p) 3139 if (p)
3140 return p; 3140 return p;
3141 } 3141 }
3142 3142
3143 BUG(); /* the idle class will always have a runnable task */ 3143 BUG(); /* the idle class will always have a runnable task */
3144 } 3144 }
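The loop above walks the scheduling classes in fixed priority order, which is why the idle class makes the BUG() unreachable; the CFS-only fast path simply skips the walk when every runnable task belongs to the fair class. A hedged reminder of the class order and the iterator as defined in this era's kernel/sched.c:

/*
 * Hedged sketch -- highest-priority class first, idle class last:
 *
 *   stop_sched_class -> rt_sched_class -> fair_sched_class -> idle_sched_class
 *
 * #define for_each_class(class) \
 *	for (class = sched_class_highest; class; class = class->next)
 */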
3145 3145
3146 /* 3146 /*
3147 * __schedule() is the main scheduler function. 3147 * __schedule() is the main scheduler function.
3148 */ 3148 */
3149 static void __sched __schedule(void) 3149 static void __sched __schedule(void)
3150 { 3150 {
3151 struct task_struct *prev, *next; 3151 struct task_struct *prev, *next;
3152 unsigned long *switch_count; 3152 unsigned long *switch_count;
3153 struct rq *rq; 3153 struct rq *rq;
3154 int cpu; 3154 int cpu;
3155 3155
3156 need_resched: 3156 need_resched:
3157 preempt_disable(); 3157 preempt_disable();
3158 cpu = smp_processor_id(); 3158 cpu = smp_processor_id();
3159 rq = cpu_rq(cpu); 3159 rq = cpu_rq(cpu);
3160 rcu_note_context_switch(cpu); 3160 rcu_note_context_switch(cpu);
3161 prev = rq->curr; 3161 prev = rq->curr;
3162 3162
3163 schedule_debug(prev); 3163 schedule_debug(prev);
3164 3164
3165 if (sched_feat(HRTICK)) 3165 if (sched_feat(HRTICK))
3166 hrtick_clear(rq); 3166 hrtick_clear(rq);
3167 3167
3168 raw_spin_lock_irq(&rq->lock); 3168 raw_spin_lock_irq(&rq->lock);
3169 3169
3170 switch_count = &prev->nivcsw; 3170 switch_count = &prev->nivcsw;
3171 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3171 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3172 if (unlikely(signal_pending_state(prev->state, prev))) { 3172 if (unlikely(signal_pending_state(prev->state, prev))) {
3173 prev->state = TASK_RUNNING; 3173 prev->state = TASK_RUNNING;
3174 } else { 3174 } else {
3175 deactivate_task(rq, prev, DEQUEUE_SLEEP); 3175 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3176 prev->on_rq = 0; 3176 prev->on_rq = 0;
3177 3177
3178 /* 3178 /*
3179 * If a worker went to sleep, notify and ask workqueue 3179 * If a worker went to sleep, notify and ask workqueue
3180 * whether it wants to wake up a task to maintain 3180 * whether it wants to wake up a task to maintain
3181 * concurrency. 3181 * concurrency.
3182 */ 3182 */
3183 if (prev->flags & PF_WQ_WORKER) { 3183 if (prev->flags & PF_WQ_WORKER) {
3184 struct task_struct *to_wakeup; 3184 struct task_struct *to_wakeup;
3185 3185
3186 to_wakeup = wq_worker_sleeping(prev, cpu); 3186 to_wakeup = wq_worker_sleeping(prev, cpu);
3187 if (to_wakeup) 3187 if (to_wakeup)
3188 try_to_wake_up_local(to_wakeup); 3188 try_to_wake_up_local(to_wakeup);
3189 } 3189 }
3190 } 3190 }
3191 switch_count = &prev->nvcsw; 3191 switch_count = &prev->nvcsw;
3192 } 3192 }
3193 3193
3194 pre_schedule(rq, prev); 3194 pre_schedule(rq, prev);
3195 3195
3196 if (unlikely(!rq->nr_running)) 3196 if (unlikely(!rq->nr_running))
3197 idle_balance(cpu, rq); 3197 idle_balance(cpu, rq);
3198 3198
3199 put_prev_task(rq, prev); 3199 put_prev_task(rq, prev);
3200 next = pick_next_task(rq); 3200 next = pick_next_task(rq);
3201 clear_tsk_need_resched(prev); 3201 clear_tsk_need_resched(prev);
3202 rq->skip_clock_update = 0; 3202 rq->skip_clock_update = 0;
3203 3203
3204 if (likely(prev != next)) { 3204 if (likely(prev != next)) {
3205 rq->nr_switches++; 3205 rq->nr_switches++;
3206 rq->curr = next; 3206 rq->curr = next;
3207 ++*switch_count; 3207 ++*switch_count;
3208 3208
3209 context_switch(rq, prev, next); /* unlocks the rq */ 3209 context_switch(rq, prev, next); /* unlocks the rq */
3210 /* 3210 /*
3211 * The context switch has flipped the stack from under us 3211 * The context switch has flipped the stack from under us
3212 * and restored the local variables which were saved when 3212 * and restored the local variables which were saved when
3213 * this task called schedule() in the past. prev == current 3213 * this task called schedule() in the past. prev == current
3214 * is still correct, but it can be moved to another cpu/rq. 3214 * is still correct, but it can be moved to another cpu/rq.
3215 */ 3215 */
3216 cpu = smp_processor_id(); 3216 cpu = smp_processor_id();
3217 rq = cpu_rq(cpu); 3217 rq = cpu_rq(cpu);
3218 } else 3218 } else
3219 raw_spin_unlock_irq(&rq->lock); 3219 raw_spin_unlock_irq(&rq->lock);
3220 3220
3221 post_schedule(rq); 3221 post_schedule(rq);
3222 3222
3223 preempt_enable_no_resched(); 3223 sched_preempt_enable_no_resched();
3224 if (need_resched()) 3224 if (need_resched())
3225 goto need_resched; 3225 goto need_resched;
3226 } 3226 }
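The preempt_enable_no_resched() to sched_preempt_enable_no_resched() switch above is the point of this patch inside __schedule(): the scheduler deliberately re-enables preemption without an immediate resched check, because the need_resched() loop right below handles it, so the site is renamed to the scheduler-specific variant. On mainline the two stay equivalent; a hedged sketch of the assumed arrangement in include/linux/preempt.h after this series:

/*
 * Hedged sketch (assumed form): the old open-coded body moves under the
 * scheduler-specific name and the generic name becomes an alias, so
 * mainline behaviour is unchanged while -rt is free to diverge.
 */
#define sched_preempt_enable_no_resched() \
do { \
	barrier(); \
	dec_preempt_count(); \
} while (0)

#define preempt_enable_no_resched()	sched_preempt_enable_no_resched()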
3227 3227
3228 static inline void sched_submit_work(struct task_struct *tsk) 3228 static inline void sched_submit_work(struct task_struct *tsk)
3229 { 3229 {
3230 if (!tsk->state) 3230 if (!tsk->state)
3231 return; 3231 return;
3232 /* 3232 /*
3233 * If we are going to sleep and we have plugged IO queued, 3233 * If we are going to sleep and we have plugged IO queued,
3234 * make sure to submit it to avoid deadlocks. 3234 * make sure to submit it to avoid deadlocks.
3235 */ 3235 */
3236 if (blk_needs_flush_plug(tsk)) 3236 if (blk_needs_flush_plug(tsk))
3237 blk_schedule_flush_plug(tsk); 3237 blk_schedule_flush_plug(tsk);
3238 } 3238 }
3239 3239
3240 asmlinkage void __sched schedule(void) 3240 asmlinkage void __sched schedule(void)
3241 { 3241 {
3242 struct task_struct *tsk = current; 3242 struct task_struct *tsk = current;
3243 3243
3244 sched_submit_work(tsk); 3244 sched_submit_work(tsk);
3245 __schedule(); 3245 __schedule();
3246 } 3246 }
3247 EXPORT_SYMBOL(schedule); 3247 EXPORT_SYMBOL(schedule);
3248 3248
3249 /** 3249 /**
3250 * schedule_preempt_disabled - called with preemption disabled 3250 * schedule_preempt_disabled - called with preemption disabled
3251 * 3251 *
3252 * Returns with preemption disabled. Note: preempt_count must be 1 3252 * Returns with preemption disabled. Note: preempt_count must be 1
3253 */ 3253 */
3254 void __sched schedule_preempt_disabled(void) 3254 void __sched schedule_preempt_disabled(void)
3255 { 3255 {
3256 preempt_enable_no_resched(); 3256 sched_preempt_enable_no_resched();
3257 schedule(); 3257 schedule();
3258 preempt_disable(); 3258 preempt_disable();
3259 } 3259 }
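Callers that run with preemption permanently disabled, such as the architecture idle loops converted elsewhere in this series, use this helper instead of open-coding the enable/schedule/disable sequence. A hedged usage sketch:

static void example_idle_loop(void)
{
	while (1) {
		while (!need_resched())
			cpu_relax();
		schedule_preempt_disabled();	/* enable, schedule(), disable */
	}
}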
3260 3260
3261 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER 3261 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
3262 3262
3263 static inline bool owner_running(struct mutex *lock, struct task_struct *owner) 3263 static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
3264 { 3264 {
3265 if (lock->owner != owner) 3265 if (lock->owner != owner)
3266 return false; 3266 return false;
3267 3267
3268 /* 3268 /*
3269 * Ensure we emit the owner->on_cpu dereference _after_ checking 3269 * Ensure we emit the owner->on_cpu dereference _after_ checking
3270 * that lock->owner still matches owner. If that fails, owner might 3270 * that lock->owner still matches owner. If that fails, owner might
3271 * point to free()d memory; if it still matches, the rcu_read_lock() 3271 * point to free()d memory; if it still matches, the rcu_read_lock()
3272 * ensures the memory stays valid. 3272 * ensures the memory stays valid.
3273 */ 3273 */
3274 barrier(); 3274 barrier();
3275 3275
3276 return owner->on_cpu; 3276 return owner->on_cpu;
3277 } 3277 }
3278 3278
3279 /* 3279 /*
3280 * Look out! "owner" is an entirely speculative pointer 3280 * Look out! "owner" is an entirely speculative pointer
3281 * access and not reliable. 3281 * access and not reliable.
3282 */ 3282 */
3283 int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) 3283 int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
3284 { 3284 {
3285 if (!sched_feat(OWNER_SPIN)) 3285 if (!sched_feat(OWNER_SPIN))
3286 return 0; 3286 return 0;
3287 3287
3288 rcu_read_lock(); 3288 rcu_read_lock();
3289 while (owner_running(lock, owner)) { 3289 while (owner_running(lock, owner)) {
3290 if (need_resched()) 3290 if (need_resched())
3291 break; 3291 break;
3292 3292
3293 arch_mutex_cpu_relax(); 3293 arch_mutex_cpu_relax();
3294 } 3294 }
3295 rcu_read_unlock(); 3295 rcu_read_unlock();
3296 3296
3297 /* 3297 /*
3298 * We break out the loop above on need_resched() and when the 3298 * We break out the loop above on need_resched() and when the
3299 * owner changed, which is a sign of heavy contention. Return 3299 * owner changed, which is a sign of heavy contention. Return
3300 * success only when lock->owner is NULL. 3300 * success only when lock->owner is NULL.
3301 */ 3301 */
3302 return lock->owner == NULL; 3302 return lock->owner == NULL;
3303 } 3303 }
3304 #endif 3304 #endif
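mutex_spin_on_owner() is the building block of adaptive mutex spinning: __mutex_lock_common() in kernel/mutex.c keeps spinning only while the current owner is on a CPU and falls back to sleeping otherwise. A condensed, hedged sketch of that caller loop:

static int example_optimistic_spin(struct mutex *lock)
{
	for (;;) {
		struct task_struct *owner;

		owner = ACCESS_ONCE(lock->owner);
		if (owner && !mutex_spin_on_owner(lock, owner))
			break;			/* owner scheduled out: go to sleep */

		if (atomic_cmpxchg(&lock->count, 1, 0) == 1)
			return 1;		/* acquired while spinning */

		if (!owner && need_resched())
			break;			/* don't spin with a pending resched */

		arch_mutex_cpu_relax();
	}
	return 0;				/* fall back to the blocking slow path */
}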
3305 3305
3306 #ifdef CONFIG_PREEMPT 3306 #ifdef CONFIG_PREEMPT
3307 /* 3307 /*
3308 * this is the entry point to schedule() from in-kernel preemption 3308 * this is the entry point to schedule() from in-kernel preemption
3309 * off of preempt_enable. Kernel preemptions off return from interrupt 3309 * off of preempt_enable. Kernel preemptions off return from interrupt
3310 * occur there and call schedule directly. 3310 * occur there and call schedule directly.
3311 */ 3311 */
3312 asmlinkage void __sched notrace preempt_schedule(void) 3312 asmlinkage void __sched notrace preempt_schedule(void)
3313 { 3313 {
3314 struct thread_info *ti = current_thread_info(); 3314 struct thread_info *ti = current_thread_info();
3315 3315
3316 /* 3316 /*
3317 * If there is a non-zero preempt_count or interrupts are disabled, 3317 * If there is a non-zero preempt_count or interrupts are disabled,
3318 * we do not want to preempt the current task. Just return.. 3318 * we do not want to preempt the current task. Just return..
3319 */ 3319 */
3320 if (likely(ti->preempt_count || irqs_disabled())) 3320 if (likely(ti->preempt_count || irqs_disabled()))
3321 return; 3321 return;
3322 3322
3323 do { 3323 do {
3324 add_preempt_count_notrace(PREEMPT_ACTIVE); 3324 add_preempt_count_notrace(PREEMPT_ACTIVE);
3325 __schedule(); 3325 __schedule();
3326 sub_preempt_count_notrace(PREEMPT_ACTIVE); 3326 sub_preempt_count_notrace(PREEMPT_ACTIVE);
3327 3327
3328 /* 3328 /*
3329 * Check again in case we missed a preemption opportunity 3329 * Check again in case we missed a preemption opportunity
3330 * between schedule and now. 3330 * between schedule and now.
3331 */ 3331 */
3332 barrier(); 3332 barrier();
3333 } while (need_resched()); 3333 } while (need_resched());
3334 } 3334 }
3335 EXPORT_SYMBOL(preempt_schedule); 3335 EXPORT_SYMBOL(preempt_schedule);
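This is the path taken when preempt_enable() drops the count to zero with TIF_NEED_RESCHED set: preempt_check_resched() calls into preempt_schedule(), which is why the function bails out above on a non-zero preempt_count or disabled interrupts. A hedged, condensed reminder from include/linux/preempt.h of this era:

#define preempt_check_resched() \
do { \
	if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
		preempt_schedule(); \
} while (0)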
3336 3336
3337 /* 3337 /*
3338 * this is the entry point to schedule() from kernel preemption 3338 * this is the entry point to schedule() from kernel preemption
3339 * off of irq context. 3339 * off of irq context.
3340 * Note that this is called and returns with irqs disabled. This will 3340 * Note that this is called and returns with irqs disabled. This will
3341 * protect us against recursive calling from irq. 3341 * protect us against recursive calling from irq.
3342 */ 3342 */
3343 asmlinkage void __sched preempt_schedule_irq(void) 3343 asmlinkage void __sched preempt_schedule_irq(void)
3344 { 3344 {
3345 struct thread_info *ti = current_thread_info(); 3345 struct thread_info *ti = current_thread_info();
3346 3346
3347 /* Catch callers which need to be fixed */ 3347 /* Catch callers which need to be fixed */
3348 BUG_ON(ti->preempt_count || !irqs_disabled()); 3348 BUG_ON(ti->preempt_count || !irqs_disabled());
3349 3349
3350 do { 3350 do {
3351 add_preempt_count(PREEMPT_ACTIVE); 3351 add_preempt_count(PREEMPT_ACTIVE);
3352 local_irq_enable(); 3352 local_irq_enable();
3353 __schedule(); 3353 __schedule();
3354 local_irq_disable(); 3354 local_irq_disable();
3355 sub_preempt_count(PREEMPT_ACTIVE); 3355 sub_preempt_count(PREEMPT_ACTIVE);
3356 3356
3357 /* 3357 /*
3358 * Check again in case we missed a preemption opportunity 3358 * Check again in case we missed a preemption opportunity
3359 * between schedule and now. 3359 * between schedule and now.
3360 */ 3360 */
3361 barrier(); 3361 barrier();
3362 } while (need_resched()); 3362 } while (need_resched());
3363 } 3363 }
3364 3364
3365 #endif /* CONFIG_PREEMPT */ 3365 #endif /* CONFIG_PREEMPT */
3366 3366
3367 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 3367 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
3368 void *key) 3368 void *key)
3369 { 3369 {
3370 return try_to_wake_up(curr->private, mode, wake_flags); 3370 return try_to_wake_up(curr->private, mode, wake_flags);
3371 } 3371 }
3372 EXPORT_SYMBOL(default_wake_function); 3372 EXPORT_SYMBOL(default_wake_function);
3373 3373
3374 /* 3374 /*
3375 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just 3375 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3376 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve 3376 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3377 * number) then we wake all the non-exclusive tasks and one exclusive task. 3377 * number) then we wake all the non-exclusive tasks and one exclusive task.
3378 * 3378 *
3379 * There are circumstances in which we can try to wake a task which has already 3379 * There are circumstances in which we can try to wake a task which has already
3380 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 3380 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3381 * zero in this (rare) case, and we handle it by continuing to scan the queue. 3381 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3382 */ 3382 */
3383 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 3383 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3384 int nr_exclusive, int wake_flags, void *key) 3384 int nr_exclusive, int wake_flags, void *key)
3385 { 3385 {
3386 wait_queue_t *curr, *next; 3386 wait_queue_t *curr, *next;
3387 3387
3388 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 3388 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
3389 unsigned flags = curr->flags; 3389 unsigned flags = curr->flags;
3390 3390
3391 if (curr->func(curr, mode, wake_flags, key) && 3391 if (curr->func(curr, mode, wake_flags, key) &&
3392 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 3392 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3393 break; 3393 break;
3394 } 3394 }
3395 } 3395 }
3396 3396
3397 /** 3397 /**
3398 * __wake_up - wake up threads blocked on a waitqueue. 3398 * __wake_up - wake up threads blocked on a waitqueue.
3399 * @q: the waitqueue 3399 * @q: the waitqueue
3400 * @mode: which threads 3400 * @mode: which threads
3401 * @nr_exclusive: how many wake-one or wake-many threads to wake up 3401 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3402 * @key: is directly passed to the wakeup function 3402 * @key: is directly passed to the wakeup function
3403 * 3403 *
3404 * It may be assumed that this function implies a write memory barrier before 3404 * It may be assumed that this function implies a write memory barrier before
3405 * changing the task state if and only if any tasks are woken up. 3405 * changing the task state if and only if any tasks are woken up.
3406 */ 3406 */
3407 void __wake_up(wait_queue_head_t *q, unsigned int mode, 3407 void __wake_up(wait_queue_head_t *q, unsigned int mode,
3408 int nr_exclusive, void *key) 3408 int nr_exclusive, void *key)
3409 { 3409 {
3410 unsigned long flags; 3410 unsigned long flags;
3411 3411
3412 spin_lock_irqsave(&q->lock, flags); 3412 spin_lock_irqsave(&q->lock, flags);
3413 __wake_up_common(q, mode, nr_exclusive, 0, key); 3413 __wake_up_common(q, mode, nr_exclusive, 0, key);
3414 spin_unlock_irqrestore(&q->lock, flags); 3414 spin_unlock_irqrestore(&q->lock, flags);
3415 } 3415 }
3416 EXPORT_SYMBOL(__wake_up); 3416 EXPORT_SYMBOL(__wake_up);
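A typical user pairs one of the wake_up*() wrappers with wait_event*() on the same queue; wake_up(q) expands to __wake_up(q, TASK_NORMAL, 1, NULL). A hedged usage sketch (the example_* names are hypothetical):

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(example_wq);
static int example_ready;

static int example_consumer(void)
{
	/* sleeps in TASK_INTERRUPTIBLE until example_ready becomes true */
	return wait_event_interruptible(example_wq, example_ready);
}

static void example_producer(void)
{
	example_ready = 1;
	wake_up(&example_wq);	/* __wake_up(&example_wq, TASK_NORMAL, 1, NULL) */
}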
3417 3417
3418 /* 3418 /*
3419 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 3419 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3420 */ 3420 */
3421 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) 3421 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3422 { 3422 {
3423 __wake_up_common(q, mode, 1, 0, NULL); 3423 __wake_up_common(q, mode, 1, 0, NULL);
3424 } 3424 }
3425 EXPORT_SYMBOL_GPL(__wake_up_locked); 3425 EXPORT_SYMBOL_GPL(__wake_up_locked);
3426 3426
3427 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 3427 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
3428 { 3428 {
3429 __wake_up_common(q, mode, 1, 0, key); 3429 __wake_up_common(q, mode, 1, 0, key);
3430 } 3430 }
3431 EXPORT_SYMBOL_GPL(__wake_up_locked_key); 3431 EXPORT_SYMBOL_GPL(__wake_up_locked_key);
3432 3432
3433 /** 3433 /**
3434 * __wake_up_sync_key - wake up threads blocked on a waitqueue. 3434 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
3435 * @q: the waitqueue 3435 * @q: the waitqueue
3436 * @mode: which threads 3436 * @mode: which threads
3437 * @nr_exclusive: how many wake-one or wake-many threads to wake up 3437 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3438 * @key: opaque value to be passed to wakeup targets 3438 * @key: opaque value to be passed to wakeup targets
3439 * 3439 *
3440 * The sync wakeup differs in that the waker knows that it will schedule 3440 * The sync wakeup differs in that the waker knows that it will schedule
3441 * away soon, so while the target thread will be woken up, it will not 3441 * away soon, so while the target thread will be woken up, it will not
3442 * be migrated to another CPU - ie. the two threads are 'synchronized' 3442 * be migrated to another CPU - ie. the two threads are 'synchronized'
3443 * with each other. This can prevent needless bouncing between CPUs. 3443 * with each other. This can prevent needless bouncing between CPUs.
3444 * 3444 *
3445 * On UP it can prevent extra preemption. 3445 * On UP it can prevent extra preemption.
3446 * 3446 *
3447 * It may be assumed that this function implies a write memory barrier before 3447 * It may be assumed that this function implies a write memory barrier before
3448 * changing the task state if and only if any tasks are woken up. 3448 * changing the task state if and only if any tasks are woken up.
3449 */ 3449 */
3450 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, 3450 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
3451 int nr_exclusive, void *key) 3451 int nr_exclusive, void *key)
3452 { 3452 {
3453 unsigned long flags; 3453 unsigned long flags;
3454 int wake_flags = WF_SYNC; 3454 int wake_flags = WF_SYNC;
3455 3455
3456 if (unlikely(!q)) 3456 if (unlikely(!q))
3457 return; 3457 return;
3458 3458
3459 if (unlikely(!nr_exclusive)) 3459 if (unlikely(!nr_exclusive))
3460 wake_flags = 0; 3460 wake_flags = 0;
3461 3461
3462 spin_lock_irqsave(&q->lock, flags); 3462 spin_lock_irqsave(&q->lock, flags);
3463 __wake_up_common(q, mode, nr_exclusive, wake_flags, key); 3463 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
3464 spin_unlock_irqrestore(&q->lock, flags); 3464 spin_unlock_irqrestore(&q->lock, flags);
3465 } 3465 }
3466 EXPORT_SYMBOL_GPL(__wake_up_sync_key); 3466 EXPORT_SYMBOL_GPL(__wake_up_sync_key);
3467 3467
3468 /* 3468 /*
3469 * __wake_up_sync - see __wake_up_sync_key() 3469 * __wake_up_sync - see __wake_up_sync_key()
3470 */ 3470 */
3471 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 3471 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3472 { 3472 {
3473 __wake_up_sync_key(q, mode, nr_exclusive, NULL); 3473 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
3474 } 3474 }
3475 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 3475 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
3476 3476
3477 /** 3477 /**
3478 * complete: - signals a single thread waiting on this completion 3478 * complete: - signals a single thread waiting on this completion
3479 * @x: holds the state of this particular completion 3479 * @x: holds the state of this particular completion
3480 * 3480 *
3481 * This will wake up a single thread waiting on this completion. Threads will be 3481 * This will wake up a single thread waiting on this completion. Threads will be
3482 * awakened in the same order in which they were queued. 3482 * awakened in the same order in which they were queued.
3483 * 3483 *
3484 * See also complete_all(), wait_for_completion() and related routines. 3484 * See also complete_all(), wait_for_completion() and related routines.
3485 * 3485 *
3486 * It may be assumed that this function implies a write memory barrier before 3486 * It may be assumed that this function implies a write memory barrier before
3487 * changing the task state if and only if any tasks are woken up. 3487 * changing the task state if and only if any tasks are woken up.
3488 */ 3488 */
3489 void complete(struct completion *x) 3489 void complete(struct completion *x)
3490 { 3490 {
3491 unsigned long flags; 3491 unsigned long flags;
3492 3492
3493 spin_lock_irqsave(&x->wait.lock, flags); 3493 spin_lock_irqsave(&x->wait.lock, flags);
3494 x->done++; 3494 x->done++;
3495 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); 3495 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
3496 spin_unlock_irqrestore(&x->wait.lock, flags); 3496 spin_unlock_irqrestore(&x->wait.lock, flags);
3497 } 3497 }
3498 EXPORT_SYMBOL(complete); 3498 EXPORT_SYMBOL(complete);
3499 3499
3500 /** 3500 /**
3501 * complete_all: - signals all threads waiting on this completion 3501 * complete_all: - signals all threads waiting on this completion
3502 * @x: holds the state of this particular completion 3502 * @x: holds the state of this particular completion
3503 * 3503 *
3504 * This will wake up all threads waiting on this particular completion event. 3504 * This will wake up all threads waiting on this particular completion event.
3505 * 3505 *
3506 * It may be assumed that this function implies a write memory barrier before 3506 * It may be assumed that this function implies a write memory barrier before
3507 * changing the task state if and only if any tasks are woken up. 3507 * changing the task state if and only if any tasks are woken up.
3508 */ 3508 */
3509 void complete_all(struct completion *x) 3509 void complete_all(struct completion *x)
3510 { 3510 {
3511 unsigned long flags; 3511 unsigned long flags;
3512 3512
3513 spin_lock_irqsave(&x->wait.lock, flags); 3513 spin_lock_irqsave(&x->wait.lock, flags);
3514 x->done += UINT_MAX/2; 3514 x->done += UINT_MAX/2;
3515 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); 3515 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
3516 spin_unlock_irqrestore(&x->wait.lock, flags); 3516 spin_unlock_irqrestore(&x->wait.lock, flags);
3517 } 3517 }
3518 EXPORT_SYMBOL(complete_all); 3518 EXPORT_SYMBOL(complete_all);
3519 3519
3520 static inline long __sched 3520 static inline long __sched
3521 do_wait_for_common(struct completion *x, long timeout, int state) 3521 do_wait_for_common(struct completion *x, long timeout, int state)
3522 { 3522 {
3523 if (!x->done) { 3523 if (!x->done) {
3524 DECLARE_WAITQUEUE(wait, current); 3524 DECLARE_WAITQUEUE(wait, current);
3525 3525
3526 __add_wait_queue_tail_exclusive(&x->wait, &wait); 3526 __add_wait_queue_tail_exclusive(&x->wait, &wait);
3527 do { 3527 do {
3528 if (signal_pending_state(state, current)) { 3528 if (signal_pending_state(state, current)) {
3529 timeout = -ERESTARTSYS; 3529 timeout = -ERESTARTSYS;
3530 break; 3530 break;
3531 } 3531 }
3532 __set_current_state(state); 3532 __set_current_state(state);
3533 spin_unlock_irq(&x->wait.lock); 3533 spin_unlock_irq(&x->wait.lock);
3534 timeout = schedule_timeout(timeout); 3534 timeout = schedule_timeout(timeout);
3535 spin_lock_irq(&x->wait.lock); 3535 spin_lock_irq(&x->wait.lock);
3536 } while (!x->done && timeout); 3536 } while (!x->done && timeout);
3537 __remove_wait_queue(&x->wait, &wait); 3537 __remove_wait_queue(&x->wait, &wait);
3538 if (!x->done) 3538 if (!x->done)
3539 return timeout; 3539 return timeout;
3540 } 3540 }
3541 x->done--; 3541 x->done--;
3542 return timeout ?: 1; 3542 return timeout ?: 1;
3543 } 3543 }
3544 3544
3545 static long __sched 3545 static long __sched
3546 wait_for_common(struct completion *x, long timeout, int state) 3546 wait_for_common(struct completion *x, long timeout, int state)
3547 { 3547 {
3548 might_sleep(); 3548 might_sleep();
3549 3549
3550 spin_lock_irq(&x->wait.lock); 3550 spin_lock_irq(&x->wait.lock);
3551 timeout = do_wait_for_common(x, timeout, state); 3551 timeout = do_wait_for_common(x, timeout, state);
3552 spin_unlock_irq(&x->wait.lock); 3552 spin_unlock_irq(&x->wait.lock);
3553 return timeout; 3553 return timeout;
3554 } 3554 }
3555 3555
3556 /** 3556 /**
3557 * wait_for_completion: - waits for completion of a task 3557 * wait_for_completion: - waits for completion of a task
3558 * @x: holds the state of this particular completion 3558 * @x: holds the state of this particular completion
3559 * 3559 *
3560 * This waits to be signaled for completion of a specific task. It is NOT 3560 * This waits to be signaled for completion of a specific task. It is NOT
3561 * interruptible and there is no timeout. 3561 * interruptible and there is no timeout.
3562 * 3562 *
3563 * See also similar routines (e.g. wait_for_completion_timeout()) with timeout 3563 * See also similar routines (e.g. wait_for_completion_timeout()) with timeout
3564 * and interrupt capability. Also see complete(). 3564 * and interrupt capability. Also see complete().
3565 */ 3565 */
3566 void __sched wait_for_completion(struct completion *x) 3566 void __sched wait_for_completion(struct completion *x)
3567 { 3567 {
3568 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 3568 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
3569 } 3569 }
3570 EXPORT_SYMBOL(wait_for_completion); 3570 EXPORT_SYMBOL(wait_for_completion);
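The usual pairing is one context calling complete() (or complete_all()) while another blocks in one of the wait_for_completion*() variants. A hedged usage sketch with hypothetical names:

#include <linux/completion.h>
#include <linux/kthread.h>

static DECLARE_COMPLETION(example_done);

static int example_worker(void *unused)
{
	/* ... perform the work ... */
	complete(&example_done);		/* wakes exactly one waiter */
	return 0;
}

static void example_start_and_wait(void)
{
	kthread_run(example_worker, NULL, "example_worker");
	wait_for_completion(&example_done);	/* uninterruptible, no timeout */
}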
3571 3571
3572 /** 3572 /**
3573 * wait_for_completion_timeout: - waits for completion of a task (w/timeout) 3573 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
3574 * @x: holds the state of this particular completion 3574 * @x: holds the state of this particular completion
3575 * @timeout: timeout value in jiffies 3575 * @timeout: timeout value in jiffies
3576 * 3576 *
3577 * This waits for either a completion of a specific task to be signaled or for a 3577 * This waits for either a completion of a specific task to be signaled or for a
3578 * specified timeout to expire. The timeout is in jiffies. It is not 3578 * specified timeout to expire. The timeout is in jiffies. It is not
3579 * interruptible. 3579 * interruptible.
3580 * 3580 *
3581 * The return value is 0 if timed out, and positive (at least 1, or number of 3581 * The return value is 0 if timed out, and positive (at least 1, or number of
3582 * jiffies left till timeout) if completed. 3582 * jiffies left till timeout) if completed.
3583 */ 3583 */
3584 unsigned long __sched 3584 unsigned long __sched
3585 wait_for_completion_timeout(struct completion *x, unsigned long timeout) 3585 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3586 { 3586 {
3587 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); 3587 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
3588 } 3588 }
3589 EXPORT_SYMBOL(wait_for_completion_timeout); 3589 EXPORT_SYMBOL(wait_for_completion_timeout);
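Because the return value is 0 on timeout and at least 1 on completion, callers test for zero rather than for a negative error code. A hedged usage sketch (needs linux/jiffies.h and linux/errno.h):

static int example_wait_half_second(struct completion *x)
{
	unsigned long left;

	left = wait_for_completion_timeout(x, msecs_to_jiffies(500));
	if (!left)
		return -ETIMEDOUT;	/* timed out, completion not signaled */
	return 0;			/* completed with 'left' jiffies to spare */
}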
3590 3590
3591 /** 3591 /**
3592 * wait_for_completion_interruptible: - waits for completion of a task (w/intr) 3592 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
3593 * @x: holds the state of this particular completion 3593 * @x: holds the state of this particular completion
3594 * 3594 *
3595 * This waits for completion of a specific task to be signaled. It is 3595 * This waits for completion of a specific task to be signaled. It is
3596 * interruptible. 3596 * interruptible.
3597 * 3597 *
3598 * The return value is -ERESTARTSYS if interrupted, 0 if completed. 3598 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
3599 */ 3599 */
3600 int __sched wait_for_completion_interruptible(struct completion *x) 3600 int __sched wait_for_completion_interruptible(struct completion *x)
3601 { 3601 {
3602 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 3602 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
3603 if (t == -ERESTARTSYS) 3603 if (t == -ERESTARTSYS)
3604 return t; 3604 return t;
3605 return 0; 3605 return 0;
3606 } 3606 }
3607 EXPORT_SYMBOL(wait_for_completion_interruptible); 3607 EXPORT_SYMBOL(wait_for_completion_interruptible);
3608 3608
3609 /** 3609 /**
3610 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) 3610 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
3611 * @x: holds the state of this particular completion 3611 * @x: holds the state of this particular completion
3612 * @timeout: timeout value in jiffies 3612 * @timeout: timeout value in jiffies
3613 * 3613 *
3614 * This waits for either a completion of a specific task to be signaled or for a 3614 * This waits for either a completion of a specific task to be signaled or for a
3615 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 3615 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
3616 * 3616 *
3617 * The return value is -ERESTARTSYS if interrupted, 0 if timed out, 3617 * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
3618 * positive (at least 1, or number of jiffies left till timeout) if completed. 3618 * positive (at least 1, or number of jiffies left till timeout) if completed.
3619 */ 3619 */
3620 long __sched 3620 long __sched
3621 wait_for_completion_interruptible_timeout(struct completion *x, 3621 wait_for_completion_interruptible_timeout(struct completion *x,
3622 unsigned long timeout) 3622 unsigned long timeout)
3623 { 3623 {
3624 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); 3624 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
3625 } 3625 }
3626 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 3626 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3627 3627
3628 /** 3628 /**
3629 * wait_for_completion_killable: - waits for completion of a task (killable) 3629 * wait_for_completion_killable: - waits for completion of a task (killable)
3630 * @x: holds the state of this particular completion 3630 * @x: holds the state of this particular completion
3631 * 3631 *
3632 * This waits to be signaled for completion of a specific task. It can be 3632 * This waits to be signaled for completion of a specific task. It can be
3633 * interrupted by a kill signal. 3633 * interrupted by a kill signal.
3634 * 3634 *
3635 * The return value is -ERESTARTSYS if interrupted, 0 if completed. 3635 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
3636 */ 3636 */
3637 int __sched wait_for_completion_killable(struct completion *x) 3637 int __sched wait_for_completion_killable(struct completion *x)
3638 { 3638 {
3639 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); 3639 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
3640 if (t == -ERESTARTSYS) 3640 if (t == -ERESTARTSYS)
3641 return t; 3641 return t;
3642 return 0; 3642 return 0;
3643 } 3643 }
3644 EXPORT_SYMBOL(wait_for_completion_killable); 3644 EXPORT_SYMBOL(wait_for_completion_killable);
3645 3645
3646 /** 3646 /**
3647 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) 3647 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
3648 * @x: holds the state of this particular completion 3648 * @x: holds the state of this particular completion
3649 * @timeout: timeout value in jiffies 3649 * @timeout: timeout value in jiffies
3650 * 3650 *
3651 * This waits for either a completion of a specific task to be 3651 * This waits for either a completion of a specific task to be
3652 * signaled or for a specified timeout to expire. It can be 3652 * signaled or for a specified timeout to expire. It can be
3653 * interrupted by a kill signal. The timeout is in jiffies. 3653 * interrupted by a kill signal. The timeout is in jiffies.
3654 * 3654 *
3655 * The return value is -ERESTARTSYS if interrupted, 0 if timed out, 3655 * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
3656 * positive (at least 1, or number of jiffies left till timeout) if completed. 3656 * positive (at least 1, or number of jiffies left till timeout) if completed.
3657 */ 3657 */
3658 long __sched 3658 long __sched
3659 wait_for_completion_killable_timeout(struct completion *x, 3659 wait_for_completion_killable_timeout(struct completion *x,
3660 unsigned long timeout) 3660 unsigned long timeout)
3661 { 3661 {
3662 return wait_for_common(x, timeout, TASK_KILLABLE); 3662 return wait_for_common(x, timeout, TASK_KILLABLE);
3663 } 3663 }
3664 EXPORT_SYMBOL(wait_for_completion_killable_timeout); 3664 EXPORT_SYMBOL(wait_for_completion_killable_timeout);
3665 3665
3666 /** 3666 /**
3667 * try_wait_for_completion - try to decrement a completion without blocking 3667 * try_wait_for_completion - try to decrement a completion without blocking
3668 * @x: completion structure 3668 * @x: completion structure
3669 * 3669 *
3670 * Returns: 0 if a decrement cannot be done without blocking 3670 * Returns: 0 if a decrement cannot be done without blocking
3671 * 1 if a decrement succeeded. 3671 * 1 if a decrement succeeded.
3672 * 3672 *
3673 * If a completion is being used as a counting completion, 3673 * If a completion is being used as a counting completion,
3674 * attempt to decrement the counter without blocking. This 3674 * attempt to decrement the counter without blocking. This
3675 * enables us to avoid waiting if the resource the completion 3675 * enables us to avoid waiting if the resource the completion
3676 * is protecting is not available. 3676 * is protecting is not available.
3677 */ 3677 */
3678 bool try_wait_for_completion(struct completion *x) 3678 bool try_wait_for_completion(struct completion *x)
3679 { 3679 {
3680 unsigned long flags; 3680 unsigned long flags;
3681 int ret = 1; 3681 int ret = 1;
3682 3682
3683 spin_lock_irqsave(&x->wait.lock, flags); 3683 spin_lock_irqsave(&x->wait.lock, flags);
3684 if (!x->done) 3684 if (!x->done)
3685 ret = 0; 3685 ret = 0;
3686 else 3686 else
3687 x->done--; 3687 x->done--;
3688 spin_unlock_irqrestore(&x->wait.lock, flags); 3688 spin_unlock_irqrestore(&x->wait.lock, flags);
3689 return ret; 3689 return ret;
3690 } 3690 }
3691 EXPORT_SYMBOL(try_wait_for_completion); 3691 EXPORT_SYMBOL(try_wait_for_completion);
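When a completion is used as a counting resource, the non-blocking variant lets a caller consume an already-available unit and only fall back to sleeping when nothing is pending. A hedged usage sketch:

static void example_take_unit(struct completion *x)
{
	if (!try_wait_for_completion(x))	/* nothing available yet */
		wait_for_completion(x);		/* block until complete() */
	/* one unit of the resource is now consumed */
}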
3692 3692
3693 /** 3693 /**
3694 * completion_done - Test to see if a completion has any waiters 3694 * completion_done - Test to see if a completion has any waiters
3695 * @x: completion structure 3695 * @x: completion structure
3696 * 3696 *
3697 * Returns: 0 if there are waiters (wait_for_completion() in progress) 3697 * Returns: 0 if there are waiters (wait_for_completion() in progress)
3698 * 1 if there are no waiters. 3698 * 1 if there are no waiters.
3699 * 3699 *
3700 */ 3700 */
3701 bool completion_done(struct completion *x) 3701 bool completion_done(struct completion *x)
3702 { 3702 {
3703 unsigned long flags; 3703 unsigned long flags;
3704 int ret = 1; 3704 int ret = 1;
3705 3705
3706 spin_lock_irqsave(&x->wait.lock, flags); 3706 spin_lock_irqsave(&x->wait.lock, flags);
3707 if (!x->done) 3707 if (!x->done)
3708 ret = 0; 3708 ret = 0;
3709 spin_unlock_irqrestore(&x->wait.lock, flags); 3709 spin_unlock_irqrestore(&x->wait.lock, flags);
3710 return ret; 3710 return ret;
3711 } 3711 }
3712 EXPORT_SYMBOL(completion_done); 3712 EXPORT_SYMBOL(completion_done);
3713 3713
3714 static long __sched 3714 static long __sched
3715 sleep_on_common(wait_queue_head_t *q, int state, long timeout) 3715 sleep_on_common(wait_queue_head_t *q, int state, long timeout)
3716 { 3716 {
3717 unsigned long flags; 3717 unsigned long flags;
3718 wait_queue_t wait; 3718 wait_queue_t wait;
3719 3719
3720 init_waitqueue_entry(&wait, current); 3720 init_waitqueue_entry(&wait, current);
3721 3721
3722 __set_current_state(state); 3722 __set_current_state(state);
3723 3723
3724 spin_lock_irqsave(&q->lock, flags); 3724 spin_lock_irqsave(&q->lock, flags);
3725 __add_wait_queue(q, &wait); 3725 __add_wait_queue(q, &wait);
3726 spin_unlock(&q->lock); 3726 spin_unlock(&q->lock);
3727 timeout = schedule_timeout(timeout); 3727 timeout = schedule_timeout(timeout);
3728 spin_lock_irq(&q->lock); 3728 spin_lock_irq(&q->lock);
3729 __remove_wait_queue(q, &wait); 3729 __remove_wait_queue(q, &wait);
3730 spin_unlock_irqrestore(&q->lock, flags); 3730 spin_unlock_irqrestore(&q->lock, flags);
3731 3731
3732 return timeout; 3732 return timeout;
3733 } 3733 }
3734 3734
3735 void __sched interruptible_sleep_on(wait_queue_head_t *q) 3735 void __sched interruptible_sleep_on(wait_queue_head_t *q)
3736 { 3736 {
3737 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 3737 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3738 } 3738 }
3739 EXPORT_SYMBOL(interruptible_sleep_on); 3739 EXPORT_SYMBOL(interruptible_sleep_on);
3740 3740
3741 long __sched 3741 long __sched
3742 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 3742 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3743 { 3743 {
3744 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); 3744 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
3745 } 3745 }
3746 EXPORT_SYMBOL(interruptible_sleep_on_timeout); 3746 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3747 3747
3748 void __sched sleep_on(wait_queue_head_t *q) 3748 void __sched sleep_on(wait_queue_head_t *q)
3749 { 3749 {
3750 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 3750 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3751 } 3751 }
3752 EXPORT_SYMBOL(sleep_on); 3752 EXPORT_SYMBOL(sleep_on);
3753 3753
3754 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 3754 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3755 { 3755 {
3756 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); 3756 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
3757 } 3757 }
3758 EXPORT_SYMBOL(sleep_on_timeout); 3758 EXPORT_SYMBOL(sleep_on_timeout);
3759 3759
3760 #ifdef CONFIG_RT_MUTEXES 3760 #ifdef CONFIG_RT_MUTEXES
3761 3761
3762 /* 3762 /*
3763 * rt_mutex_setprio - set the current priority of a task 3763 * rt_mutex_setprio - set the current priority of a task
3764 * @p: task 3764 * @p: task
3765 * @prio: prio value (kernel-internal form) 3765 * @prio: prio value (kernel-internal form)
3766 * 3766 *
3767 * This function changes the 'effective' priority of a task. It does 3767 * This function changes the 'effective' priority of a task. It does
3768 * not touch ->normal_prio like __setscheduler(). 3768 * not touch ->normal_prio like __setscheduler().
3769 * 3769 *
3770 * Used by the rt_mutex code to implement priority inheritance logic. 3770 * Used by the rt_mutex code to implement priority inheritance logic.
3771 */ 3771 */
3772 void rt_mutex_setprio(struct task_struct *p, int prio) 3772 void rt_mutex_setprio(struct task_struct *p, int prio)
3773 { 3773 {
3774 int oldprio, on_rq, running; 3774 int oldprio, on_rq, running;
3775 struct rq *rq; 3775 struct rq *rq;
3776 const struct sched_class *prev_class; 3776 const struct sched_class *prev_class;
3777 3777
3778 BUG_ON(prio < 0 || prio > MAX_PRIO); 3778 BUG_ON(prio < 0 || prio > MAX_PRIO);
3779 3779
3780 rq = __task_rq_lock(p); 3780 rq = __task_rq_lock(p);
3781 3781
3782 trace_sched_pi_setprio(p, prio); 3782 trace_sched_pi_setprio(p, prio);
3783 oldprio = p->prio; 3783 oldprio = p->prio;
3784 prev_class = p->sched_class; 3784 prev_class = p->sched_class;
3785 on_rq = p->on_rq; 3785 on_rq = p->on_rq;
3786 running = task_current(rq, p); 3786 running = task_current(rq, p);
3787 if (on_rq) 3787 if (on_rq)
3788 dequeue_task(rq, p, 0); 3788 dequeue_task(rq, p, 0);
3789 if (running) 3789 if (running)
3790 p->sched_class->put_prev_task(rq, p); 3790 p->sched_class->put_prev_task(rq, p);
3791 3791
3792 if (rt_prio(prio)) 3792 if (rt_prio(prio))
3793 p->sched_class = &rt_sched_class; 3793 p->sched_class = &rt_sched_class;
3794 else 3794 else
3795 p->sched_class = &fair_sched_class; 3795 p->sched_class = &fair_sched_class;
3796 3796
3797 p->prio = prio; 3797 p->prio = prio;
3798 3798
3799 if (running) 3799 if (running)
3800 p->sched_class->set_curr_task(rq); 3800 p->sched_class->set_curr_task(rq);
3801 if (on_rq) 3801 if (on_rq)
3802 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 3802 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
3803 3803
3804 check_class_changed(rq, p, prev_class, oldprio); 3804 check_class_changed(rq, p, prev_class, oldprio);
3805 __task_rq_unlock(rq); 3805 __task_rq_unlock(rq);
3806 } 3806 }
3807 3807
3808 #endif 3808 #endif
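rt_mutex_setprio() is the mechanism behind priority inheritance: kernel/rtmutex.c boosts a lock owner to the priority of its highest-priority waiter and drops it back towards normal_prio on release, and a prio below MAX_RT_PRIO selects the RT class while anything else falls back to CFS. A purely conceptual, hedged sketch (the helper names are hypothetical):

static void example_pi_boost(struct task_struct *owner, int waiter_prio)
{
	if (waiter_prio < owner->prio)		/* lower value == higher prio */
		rt_mutex_setprio(owner, waiter_prio);
}

static void example_pi_restore(struct task_struct *owner)
{
	rt_mutex_setprio(owner, owner->normal_prio);
}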
3809 3809
3810 void set_user_nice(struct task_struct *p, long nice) 3810 void set_user_nice(struct task_struct *p, long nice)
3811 { 3811 {
3812 int old_prio, delta, on_rq; 3812 int old_prio, delta, on_rq;
3813 unsigned long flags; 3813 unsigned long flags;
3814 struct rq *rq; 3814 struct rq *rq;
3815 3815
3816 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 3816 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3817 return; 3817 return;
3818 /* 3818 /*
3819 * We have to be careful, if called from sys_setpriority(), 3819 * We have to be careful, if called from sys_setpriority(),
3820 * the task might be in the middle of scheduling on another CPU. 3820 * the task might be in the middle of scheduling on another CPU.
3821 */ 3821 */
3822 rq = task_rq_lock(p, &flags); 3822 rq = task_rq_lock(p, &flags);
3823 /* 3823 /*
3824 * The RT priorities are set via sched_setscheduler(), but we still 3824 * The RT priorities are set via sched_setscheduler(), but we still
3825 * allow the 'normal' nice value to be set - but as expected 3825 * allow the 'normal' nice value to be set - but as expected
3826 * it won't have any effect on scheduling until the task is 3826 * it won't have any effect on scheduling until the task is
3827 * SCHED_FIFO/SCHED_RR: 3827 * SCHED_FIFO/SCHED_RR:
3828 */ 3828 */
3829 if (task_has_rt_policy(p)) { 3829 if (task_has_rt_policy(p)) {
3830 p->static_prio = NICE_TO_PRIO(nice); 3830 p->static_prio = NICE_TO_PRIO(nice);
3831 goto out_unlock; 3831 goto out_unlock;
3832 } 3832 }
3833 on_rq = p->on_rq; 3833 on_rq = p->on_rq;
3834 if (on_rq) 3834 if (on_rq)
3835 dequeue_task(rq, p, 0); 3835 dequeue_task(rq, p, 0);
3836 3836
3837 p->static_prio = NICE_TO_PRIO(nice); 3837 p->static_prio = NICE_TO_PRIO(nice);
3838 set_load_weight(p); 3838 set_load_weight(p);
3839 old_prio = p->prio; 3839 old_prio = p->prio;
3840 p->prio = effective_prio(p); 3840 p->prio = effective_prio(p);
3841 delta = p->prio - old_prio; 3841 delta = p->prio - old_prio;
3842 3842
3843 if (on_rq) { 3843 if (on_rq) {
3844 enqueue_task(rq, p, 0); 3844 enqueue_task(rq, p, 0);
3845 /* 3845 /*
3846 * If the task increased its priority or is running and 3846 * If the task increased its priority or is running and
3847 * lowered its priority, then reschedule its CPU: 3847 * lowered its priority, then reschedule its CPU:
3848 */ 3848 */
3849 if (delta < 0 || (delta > 0 && task_running(rq, p))) 3849 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3850 resched_task(rq->curr); 3850 resched_task(rq->curr);
3851 } 3851 }
3852 out_unlock: 3852 out_unlock:
3853 task_rq_unlock(rq, p, &flags); 3853 task_rq_unlock(rq, p, &flags);
3854 } 3854 }
3855 EXPORT_SYMBOL(set_user_nice); 3855 EXPORT_SYMBOL(set_user_nice);
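The nice range [-20, 19] maps onto static_prio [100, 139] via NICE_TO_PRIO(nice) == MAX_RT_PRIO + nice + 20, with MAX_RT_PRIO == 100. A hedged usage sketch:

static void example_renice_self(void)
{
	/* NICE_TO_PRIO(-5) == 100 + (-5) + 20 == 115 */
	set_user_nice(current, -5);
}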
3856 3856
3857 /* 3857 /*
3858 * can_nice - check if a task can reduce its nice value 3858 * can_nice - check if a task can reduce its nice value
3859 * @p: task 3859 * @p: task
3860 * @nice: nice value 3860 * @nice: nice value
3861 */ 3861 */
3862 int can_nice(const struct task_struct *p, const int nice) 3862 int can_nice(const struct task_struct *p, const int nice)
3863 { 3863 {
3864 /* convert nice value [19,-20] to rlimit style value [1,40] */ 3864 /* convert nice value [19,-20] to rlimit style value [1,40] */
3865 int nice_rlim = 20 - nice; 3865 int nice_rlim = 20 - nice;
3866 3866
3867 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 3867 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3868 capable(CAP_SYS_NICE)); 3868 capable(CAP_SYS_NICE));
3869 } 3869 }
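The rlimit comparison flips the sign: requesting nice 19 needs RLIMIT_NICE >= 1 and requesting nice -20 needs RLIMIT_NICE >= 40, unless the caller has CAP_SYS_NICE. A hedged worked example:

/* RLIMIT_NICE == 25 allows nice values down to 20 - 25 == -5;
 * requesting nice -10 (nice_rlim == 30) then requires CAP_SYS_NICE. */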
3870 3870
3871 #ifdef __ARCH_WANT_SYS_NICE 3871 #ifdef __ARCH_WANT_SYS_NICE
3872 3872
3873 /* 3873 /*
3874 * sys_nice - change the priority of the current process. 3874 * sys_nice - change the priority of the current process.
3875 * @increment: priority increment 3875 * @increment: priority increment
3876 * 3876 *
3877 * sys_setpriority is a more generic, but much slower function that 3877 * sys_setpriority is a more generic, but much slower function that
3878 * does similar things. 3878 * does similar things.
3879 */ 3879 */
3880 SYSCALL_DEFINE1(nice, int, increment) 3880 SYSCALL_DEFINE1(nice, int, increment)
3881 { 3881 {
3882 long nice, retval; 3882 long nice, retval;
3883 3883
3884 /* 3884 /*
3885 * Setpriority might change our priority at the same moment. 3885 * Setpriority might change our priority at the same moment.
3886 * We don't have to worry. Conceptually one call occurs first 3886 * We don't have to worry. Conceptually one call occurs first
3887 * and we have a single winner. 3887 * and we have a single winner.
3888 */ 3888 */
3889 if (increment < -40) 3889 if (increment < -40)
3890 increment = -40; 3890 increment = -40;
3891 if (increment > 40) 3891 if (increment > 40)
3892 increment = 40; 3892 increment = 40;
3893 3893
3894 nice = TASK_NICE(current) + increment; 3894 nice = TASK_NICE(current) + increment;
3895 if (nice < -20) 3895 if (nice < -20)
3896 nice = -20; 3896 nice = -20;
3897 if (nice > 19) 3897 if (nice > 19)
3898 nice = 19; 3898 nice = 19;
3899 3899
3900 if (increment < 0 && !can_nice(current, nice)) 3900 if (increment < 0 && !can_nice(current, nice))
3901 return -EPERM; 3901 return -EPERM;
3902 3902
3903 retval = security_task_setnice(current, nice); 3903 retval = security_task_setnice(current, nice);
3904 if (retval) 3904 if (retval)
3905 return retval; 3905 return retval;
3906 3906
3907 set_user_nice(current, nice); 3907 set_user_nice(current, nice);
3908 return 0; 3908 return 0;
3909 } 3909 }
3910 3910
3911 #endif 3911 #endif
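On the user-space side, the libc nice() wrapper is what typically ends up in sys_nice() above on architectures that provide it. A minimal, illustrative program; errno is cleared first because -1 is a legitimate return value of nice(2):

    #include <stdio.h>
    #include <unistd.h>
    #include <errno.h>

    int main(void)
    {
            int n;

            errno = 0;
            n = nice(5);            /* a positive increment never needs CAP_SYS_NICE */
            if (n == -1 && errno)
                    perror("nice");
            else
                    printf("new nice value: %d\n", n);
            return 0;
    }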
3912 3912
3913 /** 3913 /**
3914 * task_prio - return the priority value of a given task. 3914 * task_prio - return the priority value of a given task.
3915 * @p: the task in question. 3915 * @p: the task in question.
3916 * 3916 *
3917 * This is the priority value as seen by users in /proc. 3917 * This is the priority value as seen by users in /proc.
3918 * RT tasks are offset by -200. Normal tasks are centered 3918 * RT tasks are offset by -200. Normal tasks are centered
3919 * around 0, value goes from -16 to +15. 3919 * around 0, value goes from -16 to +15.
3920 */ 3920 */
3921 int task_prio(const struct task_struct *p) 3921 int task_prio(const struct task_struct *p)
3922 { 3922 {
3923 return p->prio - MAX_RT_PRIO; 3923 return p->prio - MAX_RT_PRIO;
3924 } 3924 }
3925 3925
3926 /** 3926 /**
3927 * task_nice - return the nice value of a given task. 3927 * task_nice - return the nice value of a given task.
3928 * @p: the task in question. 3928 * @p: the task in question.
3929 */ 3929 */
3930 int task_nice(const struct task_struct *p) 3930 int task_nice(const struct task_struct *p)
3931 { 3931 {
3932 return TASK_NICE(p); 3932 return TASK_NICE(p);
3933 } 3933 }
3934 EXPORT_SYMBOL(task_nice); 3934 EXPORT_SYMBOL(task_nice);
3935 3935
3936 /** 3936 /**
3937 * idle_cpu - is a given cpu idle currently? 3937 * idle_cpu - is a given cpu idle currently?
3938 * @cpu: the processor in question. 3938 * @cpu: the processor in question.
3939 */ 3939 */
3940 int idle_cpu(int cpu) 3940 int idle_cpu(int cpu)
3941 { 3941 {
3942 struct rq *rq = cpu_rq(cpu); 3942 struct rq *rq = cpu_rq(cpu);
3943 3943
3944 if (rq->curr != rq->idle) 3944 if (rq->curr != rq->idle)
3945 return 0; 3945 return 0;
3946 3946
3947 if (rq->nr_running) 3947 if (rq->nr_running)
3948 return 0; 3948 return 0;
3949 3949
3950 #ifdef CONFIG_SMP 3950 #ifdef CONFIG_SMP
3951 if (!llist_empty(&rq->wake_list)) 3951 if (!llist_empty(&rq->wake_list))
3952 return 0; 3952 return 0;
3953 #endif 3953 #endif
3954 3954
3955 return 1; 3955 return 1;
3956 } 3956 }
3957 3957
3958 /** 3958 /**
3959 * idle_task - return the idle task for a given cpu. 3959 * idle_task - return the idle task for a given cpu.
3960 * @cpu: the processor in question. 3960 * @cpu: the processor in question.
3961 */ 3961 */
3962 struct task_struct *idle_task(int cpu) 3962 struct task_struct *idle_task(int cpu)
3963 { 3963 {
3964 return cpu_rq(cpu)->idle; 3964 return cpu_rq(cpu)->idle;
3965 } 3965 }
3966 3966
3967 /** 3967 /**
3968 * find_process_by_pid - find a process with a matching PID value. 3968 * find_process_by_pid - find a process with a matching PID value.
3969 * @pid: the pid in question. 3969 * @pid: the pid in question.
3970 */ 3970 */
3971 static struct task_struct *find_process_by_pid(pid_t pid) 3971 static struct task_struct *find_process_by_pid(pid_t pid)
3972 { 3972 {
3973 return pid ? find_task_by_vpid(pid) : current; 3973 return pid ? find_task_by_vpid(pid) : current;
3974 } 3974 }
3975 3975
3976 /* Actually do priority change: must hold rq lock. */ 3976 /* Actually do priority change: must hold rq lock. */
3977 static void 3977 static void
3978 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 3978 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
3979 { 3979 {
3980 p->policy = policy; 3980 p->policy = policy;
3981 p->rt_priority = prio; 3981 p->rt_priority = prio;
3982 p->normal_prio = normal_prio(p); 3982 p->normal_prio = normal_prio(p);
3983 /* we are holding p->pi_lock already */ 3983 /* we are holding p->pi_lock already */
3984 p->prio = rt_mutex_getprio(p); 3984 p->prio = rt_mutex_getprio(p);
3985 if (rt_prio(p->prio)) 3985 if (rt_prio(p->prio))
3986 p->sched_class = &rt_sched_class; 3986 p->sched_class = &rt_sched_class;
3987 else 3987 else
3988 p->sched_class = &fair_sched_class; 3988 p->sched_class = &fair_sched_class;
3989 set_load_weight(p); 3989 set_load_weight(p);
3990 } 3990 }
3991 3991
3992 /* 3992 /*
3993 * check the target process has a UID that matches the current process's 3993 * check the target process has a UID that matches the current process's
3994 */ 3994 */
3995 static bool check_same_owner(struct task_struct *p) 3995 static bool check_same_owner(struct task_struct *p)
3996 { 3996 {
3997 const struct cred *cred = current_cred(), *pcred; 3997 const struct cred *cred = current_cred(), *pcred;
3998 bool match; 3998 bool match;
3999 3999
4000 rcu_read_lock(); 4000 rcu_read_lock();
4001 pcred = __task_cred(p); 4001 pcred = __task_cred(p);
4002 if (cred->user->user_ns == pcred->user->user_ns) 4002 if (cred->user->user_ns == pcred->user->user_ns)
4003 match = (cred->euid == pcred->euid || 4003 match = (cred->euid == pcred->euid ||
4004 cred->euid == pcred->uid); 4004 cred->euid == pcred->uid);
4005 else 4005 else
4006 match = false; 4006 match = false;
4007 rcu_read_unlock(); 4007 rcu_read_unlock();
4008 return match; 4008 return match;
4009 } 4009 }
4010 4010
4011 static int __sched_setscheduler(struct task_struct *p, int policy, 4011 static int __sched_setscheduler(struct task_struct *p, int policy,
4012 const struct sched_param *param, bool user) 4012 const struct sched_param *param, bool user)
4013 { 4013 {
4014 int retval, oldprio, oldpolicy = -1, on_rq, running; 4014 int retval, oldprio, oldpolicy = -1, on_rq, running;
4015 unsigned long flags; 4015 unsigned long flags;
4016 const struct sched_class *prev_class; 4016 const struct sched_class *prev_class;
4017 struct rq *rq; 4017 struct rq *rq;
4018 int reset_on_fork; 4018 int reset_on_fork;
4019 4019
4020 /* may grab non-irq protected spin_locks */ 4020 /* may grab non-irq protected spin_locks */
4021 BUG_ON(in_interrupt()); 4021 BUG_ON(in_interrupt());
4022 recheck: 4022 recheck:
4023 /* double check policy once rq lock held */ 4023 /* double check policy once rq lock held */
4024 if (policy < 0) { 4024 if (policy < 0) {
4025 reset_on_fork = p->sched_reset_on_fork; 4025 reset_on_fork = p->sched_reset_on_fork;
4026 policy = oldpolicy = p->policy; 4026 policy = oldpolicy = p->policy;
4027 } else { 4027 } else {
4028 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); 4028 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
4029 policy &= ~SCHED_RESET_ON_FORK; 4029 policy &= ~SCHED_RESET_ON_FORK;
4030 4030
4031 if (policy != SCHED_FIFO && policy != SCHED_RR && 4031 if (policy != SCHED_FIFO && policy != SCHED_RR &&
4032 policy != SCHED_NORMAL && policy != SCHED_BATCH && 4032 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4033 policy != SCHED_IDLE) 4033 policy != SCHED_IDLE)
4034 return -EINVAL; 4034 return -EINVAL;
4035 } 4035 }
4036 4036
4037 /* 4037 /*
4038 * Valid priorities for SCHED_FIFO and SCHED_RR are 4038 * Valid priorities for SCHED_FIFO and SCHED_RR are
4039 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 4039 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4040 * SCHED_BATCH and SCHED_IDLE is 0. 4040 * SCHED_BATCH and SCHED_IDLE is 0.
4041 */ 4041 */
4042 if (param->sched_priority < 0 || 4042 if (param->sched_priority < 0 ||
4043 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 4043 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4044 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 4044 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4045 return -EINVAL; 4045 return -EINVAL;
4046 if (rt_policy(policy) != (param->sched_priority != 0)) 4046 if (rt_policy(policy) != (param->sched_priority != 0))
4047 return -EINVAL; 4047 return -EINVAL;
4048 4048
4049 /* 4049 /*
4050 * Allow unprivileged RT tasks to decrease priority: 4050 * Allow unprivileged RT tasks to decrease priority:
4051 */ 4051 */
4052 if (user && !capable(CAP_SYS_NICE)) { 4052 if (user && !capable(CAP_SYS_NICE)) {
4053 if (rt_policy(policy)) { 4053 if (rt_policy(policy)) {
4054 unsigned long rlim_rtprio = 4054 unsigned long rlim_rtprio =
4055 task_rlimit(p, RLIMIT_RTPRIO); 4055 task_rlimit(p, RLIMIT_RTPRIO);
4056 4056
4057 /* can't set/change the rt policy */ 4057 /* can't set/change the rt policy */
4058 if (policy != p->policy && !rlim_rtprio) 4058 if (policy != p->policy && !rlim_rtprio)
4059 return -EPERM; 4059 return -EPERM;
4060 4060
4061 /* can't increase priority */ 4061 /* can't increase priority */
4062 if (param->sched_priority > p->rt_priority && 4062 if (param->sched_priority > p->rt_priority &&
4063 param->sched_priority > rlim_rtprio) 4063 param->sched_priority > rlim_rtprio)
4064 return -EPERM; 4064 return -EPERM;
4065 } 4065 }
4066 4066
4067 /* 4067 /*
4068 * Treat SCHED_IDLE as nice 20. Only allow a switch to 4068 * Treat SCHED_IDLE as nice 20. Only allow a switch to
4069 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 4069 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
4070 */ 4070 */
4071 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { 4071 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
4072 if (!can_nice(p, TASK_NICE(p))) 4072 if (!can_nice(p, TASK_NICE(p)))
4073 return -EPERM; 4073 return -EPERM;
4074 } 4074 }
4075 4075
4076 /* can't change other user's priorities */ 4076 /* can't change other user's priorities */
4077 if (!check_same_owner(p)) 4077 if (!check_same_owner(p))
4078 return -EPERM; 4078 return -EPERM;
4079 4079
4080 /* Normal users shall not reset the sched_reset_on_fork flag */ 4080 /* Normal users shall not reset the sched_reset_on_fork flag */
4081 if (p->sched_reset_on_fork && !reset_on_fork) 4081 if (p->sched_reset_on_fork && !reset_on_fork)
4082 return -EPERM; 4082 return -EPERM;
4083 } 4083 }
4084 4084
4085 if (user) { 4085 if (user) {
4086 retval = security_task_setscheduler(p); 4086 retval = security_task_setscheduler(p);
4087 if (retval) 4087 if (retval)
4088 return retval; 4088 return retval;
4089 } 4089 }
4090 4090
4091 /* 4091 /*
4092 * make sure no PI-waiters arrive (or leave) while we are 4092 * make sure no PI-waiters arrive (or leave) while we are
4093 * changing the priority of the task: 4093 * changing the priority of the task:
4094 * 4094 *
4095 * To be able to change p->policy safely, the appropriate 4095 * To be able to change p->policy safely, the appropriate
4096 * runqueue lock must be held. 4096 * runqueue lock must be held.
4097 */ 4097 */
4098 rq = task_rq_lock(p, &flags); 4098 rq = task_rq_lock(p, &flags);
4099 4099
4100 /* 4100 /*
4101 * Changing the policy of the stop threads is a very bad idea 4101 * Changing the policy of the stop threads is a very bad idea
4102 */ 4102 */
4103 if (p == rq->stop) { 4103 if (p == rq->stop) {
4104 task_rq_unlock(rq, p, &flags); 4104 task_rq_unlock(rq, p, &flags);
4105 return -EINVAL; 4105 return -EINVAL;
4106 } 4106 }
4107 4107
4108 /* 4108 /*
4109 * If not changing anything there's no need to proceed further: 4109 * If not changing anything there's no need to proceed further:
4110 */ 4110 */
4111 if (unlikely(policy == p->policy && (!rt_policy(policy) || 4111 if (unlikely(policy == p->policy && (!rt_policy(policy) ||
4112 param->sched_priority == p->rt_priority))) { 4112 param->sched_priority == p->rt_priority))) {
4113 4113
4114 __task_rq_unlock(rq); 4114 __task_rq_unlock(rq);
4115 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4115 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4116 return 0; 4116 return 0;
4117 } 4117 }
4118 4118
4119 #ifdef CONFIG_RT_GROUP_SCHED 4119 #ifdef CONFIG_RT_GROUP_SCHED
4120 if (user) { 4120 if (user) {
4121 /* 4121 /*
4122 * Do not allow realtime tasks into groups that have no runtime 4122 * Do not allow realtime tasks into groups that have no runtime
4123 * assigned. 4123 * assigned.
4124 */ 4124 */
4125 if (rt_bandwidth_enabled() && rt_policy(policy) && 4125 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4126 task_group(p)->rt_bandwidth.rt_runtime == 0 && 4126 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4127 !task_group_is_autogroup(task_group(p))) { 4127 !task_group_is_autogroup(task_group(p))) {
4128 task_rq_unlock(rq, p, &flags); 4128 task_rq_unlock(rq, p, &flags);
4129 return -EPERM; 4129 return -EPERM;
4130 } 4130 }
4131 } 4131 }
4132 #endif 4132 #endif
4133 4133
4134 /* recheck policy now with rq lock held */ 4134 /* recheck policy now with rq lock held */
4135 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4135 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4136 policy = oldpolicy = -1; 4136 policy = oldpolicy = -1;
4137 task_rq_unlock(rq, p, &flags); 4137 task_rq_unlock(rq, p, &flags);
4138 goto recheck; 4138 goto recheck;
4139 } 4139 }
4140 on_rq = p->on_rq; 4140 on_rq = p->on_rq;
4141 running = task_current(rq, p); 4141 running = task_current(rq, p);
4142 if (on_rq) 4142 if (on_rq)
4143 dequeue_task(rq, p, 0); 4143 dequeue_task(rq, p, 0);
4144 if (running) 4144 if (running)
4145 p->sched_class->put_prev_task(rq, p); 4145 p->sched_class->put_prev_task(rq, p);
4146 4146
4147 p->sched_reset_on_fork = reset_on_fork; 4147 p->sched_reset_on_fork = reset_on_fork;
4148 4148
4149 oldprio = p->prio; 4149 oldprio = p->prio;
4150 prev_class = p->sched_class; 4150 prev_class = p->sched_class;
4151 __setscheduler(rq, p, policy, param->sched_priority); 4151 __setscheduler(rq, p, policy, param->sched_priority);
4152 4152
4153 if (running) 4153 if (running)
4154 p->sched_class->set_curr_task(rq); 4154 p->sched_class->set_curr_task(rq);
4155 if (on_rq) 4155 if (on_rq)
4156 enqueue_task(rq, p, 0); 4156 enqueue_task(rq, p, 0);
4157 4157
4158 check_class_changed(rq, p, prev_class, oldprio); 4158 check_class_changed(rq, p, prev_class, oldprio);
4159 task_rq_unlock(rq, p, &flags); 4159 task_rq_unlock(rq, p, &flags);
4160 4160
4161 rt_mutex_adjust_pi(p); 4161 rt_mutex_adjust_pi(p);
4162 4162
4163 return 0; 4163 return 0;
4164 } 4164 }
4165 4165
4166 /** 4166 /**
4167 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 4167 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4168 * @p: the task in question. 4168 * @p: the task in question.
4169 * @policy: new policy. 4169 * @policy: new policy.
4170 * @param: structure containing the new RT priority. 4170 * @param: structure containing the new RT priority.
4171 * 4171 *
4172 * NOTE that the task may be already dead. 4172 * NOTE that the task may be already dead.
4173 */ 4173 */
4174 int sched_setscheduler(struct task_struct *p, int policy, 4174 int sched_setscheduler(struct task_struct *p, int policy,
4175 const struct sched_param *param) 4175 const struct sched_param *param)
4176 { 4176 {
4177 return __sched_setscheduler(p, policy, param, true); 4177 return __sched_setscheduler(p, policy, param, true);
4178 } 4178 }
4179 EXPORT_SYMBOL_GPL(sched_setscheduler); 4179 EXPORT_SYMBOL_GPL(sched_setscheduler);
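A hedged kernel-side usage sketch for the in-kernel API exported above; "my_worker" is a hypothetical task_struct pointer and 50 an arbitrary RT priority:

    static void make_worker_fifo(struct task_struct *my_worker)
    {
            struct sched_param sp = { .sched_priority = 50 };

            if (sched_setscheduler(my_worker, SCHED_FIFO, &sp))
                    pr_warn("could not switch worker to SCHED_FIFO\n");
    }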
4180 4180
4181 /** 4181 /**
4182 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 4182 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
4183 * @p: the task in question. 4183 * @p: the task in question.
4184 * @policy: new policy. 4184 * @policy: new policy.
4185 * @param: structure containing the new RT priority. 4185 * @param: structure containing the new RT priority.
4186 * 4186 *
4187 * Just like sched_setscheduler, only don't bother checking if the 4187 * Just like sched_setscheduler, only don't bother checking if the
4188 * current context has permission. For example, this is needed in 4188 * current context has permission. For example, this is needed in
4189 * stop_machine(): we create temporary high priority worker threads, 4189 * stop_machine(): we create temporary high priority worker threads,
4190 * but our caller might not have that capability. 4190 * but our caller might not have that capability.
4191 */ 4191 */
4192 int sched_setscheduler_nocheck(struct task_struct *p, int policy, 4192 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4193 const struct sched_param *param) 4193 const struct sched_param *param)
4194 { 4194 {
4195 return __sched_setscheduler(p, policy, param, false); 4195 return __sched_setscheduler(p, policy, param, false);
4196 } 4196 }
4197 4197
4198 static int 4198 static int
4199 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 4199 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4200 { 4200 {
4201 struct sched_param lparam; 4201 struct sched_param lparam;
4202 struct task_struct *p; 4202 struct task_struct *p;
4203 int retval; 4203 int retval;
4204 4204
4205 if (!param || pid < 0) 4205 if (!param || pid < 0)
4206 return -EINVAL; 4206 return -EINVAL;
4207 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 4207 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4208 return -EFAULT; 4208 return -EFAULT;
4209 4209
4210 rcu_read_lock(); 4210 rcu_read_lock();
4211 retval = -ESRCH; 4211 retval = -ESRCH;
4212 p = find_process_by_pid(pid); 4212 p = find_process_by_pid(pid);
4213 if (p != NULL) 4213 if (p != NULL)
4214 retval = sched_setscheduler(p, policy, &lparam); 4214 retval = sched_setscheduler(p, policy, &lparam);
4215 rcu_read_unlock(); 4215 rcu_read_unlock();
4216 4216
4217 return retval; 4217 return retval;
4218 } 4218 }
4219 4219
4220 /** 4220 /**
4221 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 4221 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4222 * @pid: the pid in question. 4222 * @pid: the pid in question.
4223 * @policy: new policy. 4223 * @policy: new policy.
4224 * @param: structure containing the new RT priority. 4224 * @param: structure containing the new RT priority.
4225 */ 4225 */
4226 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, 4226 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
4227 struct sched_param __user *, param) 4227 struct sched_param __user *, param)
4228 { 4228 {
4229 /* negative values for policy are not valid */ 4229 /* negative values for policy are not valid */
4230 if (policy < 0) 4230 if (policy < 0)
4231 return -EINVAL; 4231 return -EINVAL;
4232 4232
4233 return do_sched_setscheduler(pid, policy, param); 4233 return do_sched_setscheduler(pid, policy, param);
4234 } 4234 }
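The user-space counterpart, for reference (illustrative; without CAP_SYS_NICE this typically fails with EPERM, as enforced in __sched_setscheduler() above):

    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            struct sched_param sp = { .sched_priority = 10 };

            if (sched_setscheduler(0, SCHED_RR, &sp) == -1)
                    perror("sched_setscheduler");
            else
                    puts("now SCHED_RR at priority 10");
            return 0;
    }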
4235 4235
4236 /** 4236 /**
4237 * sys_sched_setparam - set/change the RT priority of a thread 4237 * sys_sched_setparam - set/change the RT priority of a thread
4238 * @pid: the pid in question. 4238 * @pid: the pid in question.
4239 * @param: structure containing the new RT priority. 4239 * @param: structure containing the new RT priority.
4240 */ 4240 */
4241 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 4241 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
4242 { 4242 {
4243 return do_sched_setscheduler(pid, -1, param); 4243 return do_sched_setscheduler(pid, -1, param);
4244 } 4244 }
4245 4245
4246 /** 4246 /**
4247 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 4247 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4248 * @pid: the pid in question. 4248 * @pid: the pid in question.
4249 */ 4249 */
4250 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 4250 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
4251 { 4251 {
4252 struct task_struct *p; 4252 struct task_struct *p;
4253 int retval; 4253 int retval;
4254 4254
4255 if (pid < 0) 4255 if (pid < 0)
4256 return -EINVAL; 4256 return -EINVAL;
4257 4257
4258 retval = -ESRCH; 4258 retval = -ESRCH;
4259 rcu_read_lock(); 4259 rcu_read_lock();
4260 p = find_process_by_pid(pid); 4260 p = find_process_by_pid(pid);
4261 if (p) { 4261 if (p) {
4262 retval = security_task_getscheduler(p); 4262 retval = security_task_getscheduler(p);
4263 if (!retval) 4263 if (!retval)
4264 retval = p->policy 4264 retval = p->policy
4265 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 4265 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
4266 } 4266 }
4267 rcu_read_unlock(); 4267 rcu_read_unlock();
4268 return retval; 4268 return retval;
4269 } 4269 }
4270 4270
4271 /** 4271 /**
4272 * sys_sched_getparam - get the RT priority of a thread 4272 * sys_sched_getparam - get the RT priority of a thread
4273 * @pid: the pid in question. 4273 * @pid: the pid in question.
4274 * @param: structure containing the RT priority. 4274 * @param: structure containing the RT priority.
4275 */ 4275 */
4276 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 4276 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
4277 { 4277 {
4278 struct sched_param lp; 4278 struct sched_param lp;
4279 struct task_struct *p; 4279 struct task_struct *p;
4280 int retval; 4280 int retval;
4281 4281
4282 if (!param || pid < 0) 4282 if (!param || pid < 0)
4283 return -EINVAL; 4283 return -EINVAL;
4284 4284
4285 rcu_read_lock(); 4285 rcu_read_lock();
4286 p = find_process_by_pid(pid); 4286 p = find_process_by_pid(pid);
4287 retval = -ESRCH; 4287 retval = -ESRCH;
4288 if (!p) 4288 if (!p)
4289 goto out_unlock; 4289 goto out_unlock;
4290 4290
4291 retval = security_task_getscheduler(p); 4291 retval = security_task_getscheduler(p);
4292 if (retval) 4292 if (retval)
4293 goto out_unlock; 4293 goto out_unlock;
4294 4294
4295 lp.sched_priority = p->rt_priority; 4295 lp.sched_priority = p->rt_priority;
4296 rcu_read_unlock(); 4296 rcu_read_unlock();
4297 4297
4298 /* 4298 /*
4299 * This one might sleep, we cannot do it with a spinlock held ... 4299 * This one might sleep, we cannot do it with a spinlock held ...
4300 */ 4300 */
4301 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 4301 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4302 4302
4303 return retval; 4303 return retval;
4304 4304
4305 out_unlock: 4305 out_unlock:
4306 rcu_read_unlock(); 4306 rcu_read_unlock();
4307 return retval; 4307 return retval;
4308 } 4308 }
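And the matching read-back from user space (illustrative):

    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            struct sched_param sp;
            int policy = sched_getscheduler(0);

            if (policy == -1 || sched_getparam(0, &sp) == -1) {
                    perror("sched_get*");
                    return 1;
            }
            printf("policy=%d rt_priority=%d\n", policy, sp.sched_priority);
            return 0;
    }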
4309 4309
4310 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 4310 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4311 { 4311 {
4312 cpumask_var_t cpus_allowed, new_mask; 4312 cpumask_var_t cpus_allowed, new_mask;
4313 struct task_struct *p; 4313 struct task_struct *p;
4314 int retval; 4314 int retval;
4315 4315
4316 get_online_cpus(); 4316 get_online_cpus();
4317 rcu_read_lock(); 4317 rcu_read_lock();
4318 4318
4319 p = find_process_by_pid(pid); 4319 p = find_process_by_pid(pid);
4320 if (!p) { 4320 if (!p) {
4321 rcu_read_unlock(); 4321 rcu_read_unlock();
4322 put_online_cpus(); 4322 put_online_cpus();
4323 return -ESRCH; 4323 return -ESRCH;
4324 } 4324 }
4325 4325
4326 /* Prevent p going away */ 4326 /* Prevent p going away */
4327 get_task_struct(p); 4327 get_task_struct(p);
4328 rcu_read_unlock(); 4328 rcu_read_unlock();
4329 4329
4330 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 4330 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4331 retval = -ENOMEM; 4331 retval = -ENOMEM;
4332 goto out_put_task; 4332 goto out_put_task;
4333 } 4333 }
4334 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { 4334 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
4335 retval = -ENOMEM; 4335 retval = -ENOMEM;
4336 goto out_free_cpus_allowed; 4336 goto out_free_cpus_allowed;
4337 } 4337 }
4338 retval = -EPERM; 4338 retval = -EPERM;
4339 if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) 4339 if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
4340 goto out_unlock; 4340 goto out_unlock;
4341 4341
4342 retval = security_task_setscheduler(p); 4342 retval = security_task_setscheduler(p);
4343 if (retval) 4343 if (retval)
4344 goto out_unlock; 4344 goto out_unlock;
4345 4345
4346 cpuset_cpus_allowed(p, cpus_allowed); 4346 cpuset_cpus_allowed(p, cpus_allowed);
4347 cpumask_and(new_mask, in_mask, cpus_allowed); 4347 cpumask_and(new_mask, in_mask, cpus_allowed);
4348 again: 4348 again:
4349 retval = set_cpus_allowed_ptr(p, new_mask); 4349 retval = set_cpus_allowed_ptr(p, new_mask);
4350 4350
4351 if (!retval) { 4351 if (!retval) {
4352 cpuset_cpus_allowed(p, cpus_allowed); 4352 cpuset_cpus_allowed(p, cpus_allowed);
4353 if (!cpumask_subset(new_mask, cpus_allowed)) { 4353 if (!cpumask_subset(new_mask, cpus_allowed)) {
4354 /* 4354 /*
4355 * We must have raced with a concurrent cpuset 4355 * We must have raced with a concurrent cpuset
4356 * update. Just reset the cpus_allowed to the 4356 * update. Just reset the cpus_allowed to the
4357 * cpuset's cpus_allowed 4357 * cpuset's cpus_allowed
4358 */ 4358 */
4359 cpumask_copy(new_mask, cpus_allowed); 4359 cpumask_copy(new_mask, cpus_allowed);
4360 goto again; 4360 goto again;
4361 } 4361 }
4362 } 4362 }
4363 out_unlock: 4363 out_unlock:
4364 free_cpumask_var(new_mask); 4364 free_cpumask_var(new_mask);
4365 out_free_cpus_allowed: 4365 out_free_cpus_allowed:
4366 free_cpumask_var(cpus_allowed); 4366 free_cpumask_var(cpus_allowed);
4367 out_put_task: 4367 out_put_task:
4368 put_task_struct(p); 4368 put_task_struct(p);
4369 put_online_cpus(); 4369 put_online_cpus();
4370 return retval; 4370 return retval;
4371 } 4371 }
4372 4372
4373 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 4373 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4374 struct cpumask *new_mask) 4374 struct cpumask *new_mask)
4375 { 4375 {
4376 if (len < cpumask_size()) 4376 if (len < cpumask_size())
4377 cpumask_clear(new_mask); 4377 cpumask_clear(new_mask);
4378 else if (len > cpumask_size()) 4378 else if (len > cpumask_size())
4379 len = cpumask_size(); 4379 len = cpumask_size();
4380 4380
4381 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 4381 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4382 } 4382 }
4383 4383
4384 /** 4384 /**
4385 * sys_sched_setaffinity - set the cpu affinity of a process 4385 * sys_sched_setaffinity - set the cpu affinity of a process
4386 * @pid: pid of the process 4386 * @pid: pid of the process
4387 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4387 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4388 * @user_mask_ptr: user-space pointer to the new cpu mask 4388 * @user_mask_ptr: user-space pointer to the new cpu mask
4389 */ 4389 */
4390 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 4390 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
4391 unsigned long __user *, user_mask_ptr) 4391 unsigned long __user *, user_mask_ptr)
4392 { 4392 {
4393 cpumask_var_t new_mask; 4393 cpumask_var_t new_mask;
4394 int retval; 4394 int retval;
4395 4395
4396 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 4396 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
4397 return -ENOMEM; 4397 return -ENOMEM;
4398 4398
4399 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); 4399 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
4400 if (retval == 0) 4400 if (retval == 0)
4401 retval = sched_setaffinity(pid, new_mask); 4401 retval = sched_setaffinity(pid, new_mask);
4402 free_cpumask_var(new_mask); 4402 free_cpumask_var(new_mask);
4403 return retval; 4403 return retval;
4404 } 4404 }
4405 4405
4406 long sched_getaffinity(pid_t pid, struct cpumask *mask) 4406 long sched_getaffinity(pid_t pid, struct cpumask *mask)
4407 { 4407 {
4408 struct task_struct *p; 4408 struct task_struct *p;
4409 unsigned long flags; 4409 unsigned long flags;
4410 int retval; 4410 int retval;
4411 4411
4412 get_online_cpus(); 4412 get_online_cpus();
4413 rcu_read_lock(); 4413 rcu_read_lock();
4414 4414
4415 retval = -ESRCH; 4415 retval = -ESRCH;
4416 p = find_process_by_pid(pid); 4416 p = find_process_by_pid(pid);
4417 if (!p) 4417 if (!p)
4418 goto out_unlock; 4418 goto out_unlock;
4419 4419
4420 retval = security_task_getscheduler(p); 4420 retval = security_task_getscheduler(p);
4421 if (retval) 4421 if (retval)
4422 goto out_unlock; 4422 goto out_unlock;
4423 4423
4424 raw_spin_lock_irqsave(&p->pi_lock, flags); 4424 raw_spin_lock_irqsave(&p->pi_lock, flags);
4425 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 4425 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
4426 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4426 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4427 4427
4428 out_unlock: 4428 out_unlock:
4429 rcu_read_unlock(); 4429 rcu_read_unlock();
4430 put_online_cpus(); 4430 put_online_cpus();
4431 4431
4432 return retval; 4432 return retval;
4433 } 4433 }
4434 4434
4435 /** 4435 /**
4436 * sys_sched_getaffinity - get the cpu affinity of a process 4436 * sys_sched_getaffinity - get the cpu affinity of a process
4437 * @pid: pid of the process 4437 * @pid: pid of the process
4438 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4438 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4439 * @user_mask_ptr: user-space pointer to hold the current cpu mask 4439 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4440 */ 4440 */
4441 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 4441 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4442 unsigned long __user *, user_mask_ptr) 4442 unsigned long __user *, user_mask_ptr)
4443 { 4443 {
4444 int ret; 4444 int ret;
4445 cpumask_var_t mask; 4445 cpumask_var_t mask;
4446 4446
4447 if ((len * BITS_PER_BYTE) < nr_cpu_ids) 4447 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4448 return -EINVAL; 4448 return -EINVAL;
4449 if (len & (sizeof(unsigned long)-1)) 4449 if (len & (sizeof(unsigned long)-1))
4450 return -EINVAL; 4450 return -EINVAL;
4451 4451
4452 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 4452 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
4453 return -ENOMEM; 4453 return -ENOMEM;
4454 4454
4455 ret = sched_getaffinity(pid, mask); 4455 ret = sched_getaffinity(pid, mask);
4456 if (ret == 0) { 4456 if (ret == 0) {
4457 size_t retlen = min_t(size_t, len, cpumask_size()); 4457 size_t retlen = min_t(size_t, len, cpumask_size());
4458 4458
4459 if (copy_to_user(user_mask_ptr, mask, retlen)) 4459 if (copy_to_user(user_mask_ptr, mask, retlen))
4460 ret = -EFAULT; 4460 ret = -EFAULT;
4461 else 4461 else
4462 ret = retlen; 4462 ret = retlen;
4463 } 4463 }
4464 free_cpumask_var(mask); 4464 free_cpumask_var(mask);
4465 4465
4466 return ret; 4466 return ret;
4467 } 4467 }
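For completeness, a minimal user-space affinity round-trip that exercises both syscalls above (illustrative; _GNU_SOURCE is needed for the CPU_* macros):

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            cpu_set_t set;

            CPU_ZERO(&set);
            CPU_SET(0, &set);                       /* pin ourselves to CPU 0 */
            if (sched_setaffinity(0, sizeof(set), &set) == -1)
                    perror("sched_setaffinity");

            CPU_ZERO(&set);
            if (sched_getaffinity(0, sizeof(set), &set) == -1)
                    perror("sched_getaffinity");
            else
                    printf("allowed on %d CPU(s)\n", CPU_COUNT(&set));
            return 0;
    }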
4468 4468
4469 /** 4469 /**
4470 * sys_sched_yield - yield the current processor to other threads. 4470 * sys_sched_yield - yield the current processor to other threads.
4471 * 4471 *
4472 * This function yields the current CPU to other tasks. If there are no 4472 * This function yields the current CPU to other tasks. If there are no
4473 * other threads running on this CPU then this function will return. 4473 * other threads running on this CPU then this function will return.
4474 */ 4474 */
4475 SYSCALL_DEFINE0(sched_yield) 4475 SYSCALL_DEFINE0(sched_yield)
4476 { 4476 {
4477 struct rq *rq = this_rq_lock(); 4477 struct rq *rq = this_rq_lock();
4478 4478
4479 schedstat_inc(rq, yld_count); 4479 schedstat_inc(rq, yld_count);
4480 current->sched_class->yield_task(rq); 4480 current->sched_class->yield_task(rq);
4481 4481
4482 /* 4482 /*
4483 * Since we are going to call schedule() anyway, there's 4483 * Since we are going to call schedule() anyway, there's
4484 * no need to preempt or enable interrupts: 4484 * no need to preempt or enable interrupts:
4485 */ 4485 */
4486 __release(rq->lock); 4486 __release(rq->lock);
4487 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 4487 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4488 do_raw_spin_unlock(&rq->lock); 4488 do_raw_spin_unlock(&rq->lock);
4489 preempt_enable_no_resched(); 4489 sched_preempt_enable_no_resched();
4490 4490
4491 schedule(); 4491 schedule();
4492 4492
4493 return 0; 4493 return 0;
4494 } 4494 }
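The call site above is the one this hunk actually changes: the open-coded unlock sequence now ends in sched_preempt_enable_no_resched(). A hedged sketch of what such a helper boils down to on a mainline (non-rt) kernel, assuming the usual preempt-count based implementation; the _sketch suffix marks it as illustrative, not the real definition:

    #define sched_preempt_enable_no_resched_sketch()                    \
    do {                                                                \
            barrier();                                                  \
            dec_preempt_count();    /* no resched check: schedule() follows */ \
    } while (0)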
4495 4495
4496 static inline int should_resched(void) 4496 static inline int should_resched(void)
4497 { 4497 {
4498 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); 4498 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
4499 } 4499 }
4500 4500
4501 static void __cond_resched(void) 4501 static void __cond_resched(void)
4502 { 4502 {
4503 add_preempt_count(PREEMPT_ACTIVE); 4503 add_preempt_count(PREEMPT_ACTIVE);
4504 __schedule(); 4504 __schedule();
4505 sub_preempt_count(PREEMPT_ACTIVE); 4505 sub_preempt_count(PREEMPT_ACTIVE);
4506 } 4506 }
4507 4507
4508 int __sched _cond_resched(void) 4508 int __sched _cond_resched(void)
4509 { 4509 {
4510 if (should_resched()) { 4510 if (should_resched()) {
4511 __cond_resched(); 4511 __cond_resched();
4512 return 1; 4512 return 1;
4513 } 4513 }
4514 return 0; 4514 return 0;
4515 } 4515 }
4516 EXPORT_SYMBOL(_cond_resched); 4516 EXPORT_SYMBOL(_cond_resched);
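Typical use of the cond_resched() wrapper that funnels into _cond_resched() above (a sketch; struct item and handle_one() are hypothetical):

    struct item { struct list_head node; /* ...payload... */ };        /* hypothetical */

    static void process_all(struct list_head *items)
    {
            struct item *it;

            list_for_each_entry(it, items, node) {
                    handle_one(it);         /* hypothetical per-item work */
                    cond_resched();         /* yield if a reschedule is due */
            }
    }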
4517 4517
4518 /* 4518 /*
4519 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 4519 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
4520 * call schedule, and on return reacquire the lock. 4520 * call schedule, and on return reacquire the lock.
4521 * 4521 *
4522 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 4522 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4523 * operations here to prevent schedule() from being called twice (once via 4523 * operations here to prevent schedule() from being called twice (once via
4524 * spin_unlock(), once by hand). 4524 * spin_unlock(), once by hand).
4525 */ 4525 */
4526 int __cond_resched_lock(spinlock_t *lock) 4526 int __cond_resched_lock(spinlock_t *lock)
4527 { 4527 {
4528 int resched = should_resched(); 4528 int resched = should_resched();
4529 int ret = 0; 4529 int ret = 0;
4530 4530
4531 lockdep_assert_held(lock); 4531 lockdep_assert_held(lock);
4532 4532
4533 if (spin_needbreak(lock) || resched) { 4533 if (spin_needbreak(lock) || resched) {
4534 spin_unlock(lock); 4534 spin_unlock(lock);
4535 if (resched) 4535 if (resched)
4536 __cond_resched(); 4536 __cond_resched();
4537 else 4537 else
4538 cpu_relax(); 4538 cpu_relax();
4539 ret = 1; 4539 ret = 1;
4540 spin_lock(lock); 4540 spin_lock(lock);
4541 } 4541 }
4542 return ret; 4542 return ret;
4543 } 4543 }
4544 EXPORT_SYMBOL(__cond_resched_lock); 4544 EXPORT_SYMBOL(__cond_resched_lock);
4545 4545
4546 int __sched __cond_resched_softirq(void) 4546 int __sched __cond_resched_softirq(void)
4547 { 4547 {
4548 BUG_ON(!in_softirq()); 4548 BUG_ON(!in_softirq());
4549 4549
4550 if (should_resched()) { 4550 if (should_resched()) {
4551 local_bh_enable(); 4551 local_bh_enable();
4552 __cond_resched(); 4552 __cond_resched();
4553 local_bh_disable(); 4553 local_bh_disable();
4554 return 1; 4554 return 1;
4555 } 4555 }
4556 return 0; 4556 return 0;
4557 } 4557 }
4558 EXPORT_SYMBOL(__cond_resched_softirq); 4558 EXPORT_SYMBOL(__cond_resched_softirq);
4559 4559
4560 /** 4560 /**
4561 * yield - yield the current processor to other threads. 4561 * yield - yield the current processor to other threads.
4562 * 4562 *
4563 * This is a shortcut for kernel-space yielding - it marks the 4563 * This is a shortcut for kernel-space yielding - it marks the
4564 * thread runnable and calls sys_sched_yield(). 4564 * thread runnable and calls sys_sched_yield().
4565 */ 4565 */
4566 void __sched yield(void) 4566 void __sched yield(void)
4567 { 4567 {
4568 set_current_state(TASK_RUNNING); 4568 set_current_state(TASK_RUNNING);
4569 sys_sched_yield(); 4569 sys_sched_yield();
4570 } 4570 }
4571 EXPORT_SYMBOL(yield); 4571 EXPORT_SYMBOL(yield);
4572 4572
4573 /** 4573 /**
4574 * yield_to - yield the current processor to another thread in 4574 * yield_to - yield the current processor to another thread in
4575 * your thread group, or accelerate that thread toward the 4575 * your thread group, or accelerate that thread toward the
4576 * processor it's on. 4576 * processor it's on.
4577 * @p: target task 4577 * @p: target task
4578 * @preempt: whether task preemption is allowed or not 4578 * @preempt: whether task preemption is allowed or not
4579 * 4579 *
4580 * It's the caller's job to ensure that the target task struct 4580 * It's the caller's job to ensure that the target task struct
4581 * can't go away on us before we can do any checks. 4581 * can't go away on us before we can do any checks.
4582 * 4582 *
4583 * Returns true if we indeed boosted the target task. 4583 * Returns true if we indeed boosted the target task.
4584 */ 4584 */
4585 bool __sched yield_to(struct task_struct *p, bool preempt) 4585 bool __sched yield_to(struct task_struct *p, bool preempt)
4586 { 4586 {
4587 struct task_struct *curr = current; 4587 struct task_struct *curr = current;
4588 struct rq *rq, *p_rq; 4588 struct rq *rq, *p_rq;
4589 unsigned long flags; 4589 unsigned long flags;
4590 bool yielded = 0; 4590 bool yielded = 0;
4591 4591
4592 local_irq_save(flags); 4592 local_irq_save(flags);
4593 rq = this_rq(); 4593 rq = this_rq();
4594 4594
4595 again: 4595 again:
4596 p_rq = task_rq(p); 4596 p_rq = task_rq(p);
4597 double_rq_lock(rq, p_rq); 4597 double_rq_lock(rq, p_rq);
4598 while (task_rq(p) != p_rq) { 4598 while (task_rq(p) != p_rq) {
4599 double_rq_unlock(rq, p_rq); 4599 double_rq_unlock(rq, p_rq);
4600 goto again; 4600 goto again;
4601 } 4601 }
4602 4602
4603 if (!curr->sched_class->yield_to_task) 4603 if (!curr->sched_class->yield_to_task)
4604 goto out; 4604 goto out;
4605 4605
4606 if (curr->sched_class != p->sched_class) 4606 if (curr->sched_class != p->sched_class)
4607 goto out; 4607 goto out;
4608 4608
4609 if (task_running(p_rq, p) || p->state) 4609 if (task_running(p_rq, p) || p->state)
4610 goto out; 4610 goto out;
4611 4611
4612 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 4612 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
4613 if (yielded) { 4613 if (yielded) {
4614 schedstat_inc(rq, yld_count); 4614 schedstat_inc(rq, yld_count);
4615 /* 4615 /*
4616 * Make p's CPU reschedule; pick_next_entity takes care of 4616 * Make p's CPU reschedule; pick_next_entity takes care of
4617 * fairness. 4617 * fairness.
4618 */ 4618 */
4619 if (preempt && rq != p_rq) 4619 if (preempt && rq != p_rq)
4620 resched_task(p_rq->curr); 4620 resched_task(p_rq->curr);
4621 } else { 4621 } else {
4622 /* 4622 /*
4623 * We might have set it in task_yield_fair(), but are 4623 * We might have set it in task_yield_fair(), but are
4624 * not going to schedule(), so don't want to skip 4624 * not going to schedule(), so don't want to skip
4625 * the next update. 4625 * the next update.
4626 */ 4626 */
4627 rq->skip_clock_update = 0; 4627 rq->skip_clock_update = 0;
4628 } 4628 }
4629 4629
4630 out: 4630 out:
4631 double_rq_unlock(rq, p_rq); 4631 double_rq_unlock(rq, p_rq);
4632 local_irq_restore(flags); 4632 local_irq_restore(flags);
4633 4633
4634 if (yielded) 4634 if (yielded)
4635 schedule(); 4635 schedule();
4636 4636
4637 return yielded; 4637 return yielded;
4638 } 4638 }
4639 EXPORT_SYMBOL_GPL(yield_to); 4639 EXPORT_SYMBOL_GPL(yield_to);
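A hedged caller sketch for yield_to(); virtualization code uses it to push the CPU toward a vCPU believed to hold a lock ("target" is hypothetical here):

    static void boost_probable_lock_holder(struct task_struct *target)
    {
            if (!yield_to(target, true))
                    cpu_relax();    /* target was not boosted; spin politely */
    }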
4640 4640
4641 /* 4641 /*
4642 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 4642 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4643 * that process accounting knows that this is a task in IO wait state. 4643 * that process accounting knows that this is a task in IO wait state.
4644 */ 4644 */
4645 void __sched io_schedule(void) 4645 void __sched io_schedule(void)
4646 { 4646 {
4647 struct rq *rq = raw_rq(); 4647 struct rq *rq = raw_rq();
4648 4648
4649 delayacct_blkio_start(); 4649 delayacct_blkio_start();
4650 atomic_inc(&rq->nr_iowait); 4650 atomic_inc(&rq->nr_iowait);
4651 blk_flush_plug(current); 4651 blk_flush_plug(current);
4652 current->in_iowait = 1; 4652 current->in_iowait = 1;
4653 schedule(); 4653 schedule();
4654 current->in_iowait = 0; 4654 current->in_iowait = 0;
4655 atomic_dec(&rq->nr_iowait); 4655 atomic_dec(&rq->nr_iowait);
4656 delayacct_blkio_end(); 4656 delayacct_blkio_end();
4657 } 4657 }
4658 EXPORT_SYMBOL(io_schedule); 4658 EXPORT_SYMBOL(io_schedule);
4659 4659
4660 long __sched io_schedule_timeout(long timeout) 4660 long __sched io_schedule_timeout(long timeout)
4661 { 4661 {
4662 struct rq *rq = raw_rq(); 4662 struct rq *rq = raw_rq();
4663 long ret; 4663 long ret;
4664 4664
4665 delayacct_blkio_start(); 4665 delayacct_blkio_start();
4666 atomic_inc(&rq->nr_iowait); 4666 atomic_inc(&rq->nr_iowait);
4667 blk_flush_plug(current); 4667 blk_flush_plug(current);
4668 current->in_iowait = 1; 4668 current->in_iowait = 1;
4669 ret = schedule_timeout(timeout); 4669 ret = schedule_timeout(timeout);
4670 current->in_iowait = 0; 4670 current->in_iowait = 0;
4671 atomic_dec(&rq->nr_iowait); 4671 atomic_dec(&rq->nr_iowait);
4672 delayacct_blkio_end(); 4672 delayacct_blkio_end();
4673 return ret; 4673 return ret;
4674 } 4674 }
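Hedged usage sketch: both io_schedule() and io_schedule_timeout() expect the caller to have set its task state first, exactly like schedule()/schedule_timeout():

    static long wait_for_disk(void)
    {
            set_current_state(TASK_UNINTERRUPTIBLE);
            return io_schedule_timeout(HZ);         /* up to 1s, accounted as iowait */
    }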
4675 4675
4676 /** 4676 /**
4677 * sys_sched_get_priority_max - return maximum RT priority. 4677 * sys_sched_get_priority_max - return maximum RT priority.
4678 * @policy: scheduling class. 4678 * @policy: scheduling class.
4679 * 4679 *
4680 * this syscall returns the maximum rt_priority that can be used 4680 * this syscall returns the maximum rt_priority that can be used
4681 * by a given scheduling class. 4681 * by a given scheduling class.
4682 */ 4682 */
4683 SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 4683 SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
4684 { 4684 {
4685 int ret = -EINVAL; 4685 int ret = -EINVAL;
4686 4686
4687 switch (policy) { 4687 switch (policy) {
4688 case SCHED_FIFO: 4688 case SCHED_FIFO:
4689 case SCHED_RR: 4689 case SCHED_RR:
4690 ret = MAX_USER_RT_PRIO-1; 4690 ret = MAX_USER_RT_PRIO-1;
4691 break; 4691 break;
4692 case SCHED_NORMAL: 4692 case SCHED_NORMAL:
4693 case SCHED_BATCH: 4693 case SCHED_BATCH:
4694 case SCHED_IDLE: 4694 case SCHED_IDLE:
4695 ret = 0; 4695 ret = 0;
4696 break; 4696 break;
4697 } 4697 }
4698 return ret; 4698 return ret;
4699 } 4699 }
4700 4700
4701 /** 4701 /**
4702 * sys_sched_get_priority_min - return minimum RT priority. 4702 * sys_sched_get_priority_min - return minimum RT priority.
4703 * @policy: scheduling class. 4703 * @policy: scheduling class.
4704 * 4704 *
4705 * this syscall returns the minimum rt_priority that can be used 4705 * this syscall returns the minimum rt_priority that can be used
4706 * by a given scheduling class. 4706 * by a given scheduling class.
4707 */ 4707 */
4708 SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 4708 SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4709 { 4709 {
4710 int ret = -EINVAL; 4710 int ret = -EINVAL;
4711 4711
4712 switch (policy) { 4712 switch (policy) {
4713 case SCHED_FIFO: 4713 case SCHED_FIFO:
4714 case SCHED_RR: 4714 case SCHED_RR:
4715 ret = 1; 4715 ret = 1;
4716 break; 4716 break;
4717 case SCHED_NORMAL: 4717 case SCHED_NORMAL:
4718 case SCHED_BATCH: 4718 case SCHED_BATCH:
4719 case SCHED_IDLE: 4719 case SCHED_IDLE:
4720 ret = 0; 4720 ret = 0;
4721 } 4721 }
4722 return ret; 4722 return ret;
4723 } 4723 }
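From user space the two syscalls above are usually queried together (illustrative):

    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            printf("SCHED_FIFO priority range: %d..%d\n",
                   sched_get_priority_min(SCHED_FIFO),
                   sched_get_priority_max(SCHED_FIFO));  /* typically 1..99 */
            return 0;
    }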
4724 4724
4725 /** 4725 /**
4726 * sys_sched_rr_get_interval - return the default timeslice of a process. 4726 * sys_sched_rr_get_interval - return the default timeslice of a process.
4727 * @pid: pid of the process. 4727 * @pid: pid of the process.
4728 * @interval: userspace pointer to the timeslice value. 4728 * @interval: userspace pointer to the timeslice value.
4729 * 4729 *
4730 * this syscall writes the default timeslice value of a given process 4730 * this syscall writes the default timeslice value of a given process
4731 * into the user-space timespec buffer. A value of '0' means infinity. 4731 * into the user-space timespec buffer. A value of '0' means infinity.
4732 */ 4732 */
4733 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 4733 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
4734 struct timespec __user *, interval) 4734 struct timespec __user *, interval)
4735 { 4735 {
4736 struct task_struct *p; 4736 struct task_struct *p;
4737 unsigned int time_slice; 4737 unsigned int time_slice;
4738 unsigned long flags; 4738 unsigned long flags;
4739 struct rq *rq; 4739 struct rq *rq;
4740 int retval; 4740 int retval;
4741 struct timespec t; 4741 struct timespec t;
4742 4742
4743 if (pid < 0) 4743 if (pid < 0)
4744 return -EINVAL; 4744 return -EINVAL;
4745 4745
4746 retval = -ESRCH; 4746 retval = -ESRCH;
4747 rcu_read_lock(); 4747 rcu_read_lock();
4748 p = find_process_by_pid(pid); 4748 p = find_process_by_pid(pid);
4749 if (!p) 4749 if (!p)
4750 goto out_unlock; 4750 goto out_unlock;
4751 4751
4752 retval = security_task_getscheduler(p); 4752 retval = security_task_getscheduler(p);
4753 if (retval) 4753 if (retval)
4754 goto out_unlock; 4754 goto out_unlock;
4755 4755
4756 rq = task_rq_lock(p, &flags); 4756 rq = task_rq_lock(p, &flags);
4757 time_slice = p->sched_class->get_rr_interval(rq, p); 4757 time_slice = p->sched_class->get_rr_interval(rq, p);
4758 task_rq_unlock(rq, p, &flags); 4758 task_rq_unlock(rq, p, &flags);
4759 4759
4760 rcu_read_unlock(); 4760 rcu_read_unlock();
4761 jiffies_to_timespec(time_slice, &t); 4761 jiffies_to_timespec(time_slice, &t);
4762 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4762 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4763 return retval; 4763 return retval;
4764 4764
4765 out_unlock: 4765 out_unlock:
4766 rcu_read_unlock(); 4766 rcu_read_unlock();
4767 return retval; 4767 return retval;
4768 } 4768 }
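Illustrative user-space counterpart:

    #include <sched.h>
    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            struct timespec ts;

            if (sched_rr_get_interval(0, &ts) == -1)
                    perror("sched_rr_get_interval");
            else
                    printf("timeslice: %ld.%09ld s\n",
                           (long)ts.tv_sec, ts.tv_nsec);
            return 0;
    }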
4769 4769
4770 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; 4770 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
4771 4771
4772 void sched_show_task(struct task_struct *p) 4772 void sched_show_task(struct task_struct *p)
4773 { 4773 {
4774 unsigned long free = 0; 4774 unsigned long free = 0;
4775 unsigned state; 4775 unsigned state;
4776 4776
4777 state = p->state ? __ffs(p->state) + 1 : 0; 4777 state = p->state ? __ffs(p->state) + 1 : 0;
4778 printk(KERN_INFO "%-15.15s %c", p->comm, 4778 printk(KERN_INFO "%-15.15s %c", p->comm,
4779 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 4779 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4780 #if BITS_PER_LONG == 32 4780 #if BITS_PER_LONG == 32
4781 if (state == TASK_RUNNING) 4781 if (state == TASK_RUNNING)
4782 printk(KERN_CONT " running "); 4782 printk(KERN_CONT " running ");
4783 else 4783 else
4784 printk(KERN_CONT " %08lx ", thread_saved_pc(p)); 4784 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
4785 #else 4785 #else
4786 if (state == TASK_RUNNING) 4786 if (state == TASK_RUNNING)
4787 printk(KERN_CONT " running task "); 4787 printk(KERN_CONT " running task ");
4788 else 4788 else
4789 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 4789 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
4790 #endif 4790 #endif
4791 #ifdef CONFIG_DEBUG_STACK_USAGE 4791 #ifdef CONFIG_DEBUG_STACK_USAGE
4792 free = stack_not_used(p); 4792 free = stack_not_used(p);
4793 #endif 4793 #endif
4794 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4794 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
4795 task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)), 4795 task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
4796 (unsigned long)task_thread_info(p)->flags); 4796 (unsigned long)task_thread_info(p)->flags);
4797 4797
4798 show_stack(p, NULL); 4798 show_stack(p, NULL);
4799 } 4799 }
4800 4800
4801 void show_state_filter(unsigned long state_filter) 4801 void show_state_filter(unsigned long state_filter)
4802 { 4802 {
4803 struct task_struct *g, *p; 4803 struct task_struct *g, *p;
4804 4804
4805 #if BITS_PER_LONG == 32 4805 #if BITS_PER_LONG == 32
4806 printk(KERN_INFO 4806 printk(KERN_INFO
4807 " task PC stack pid father\n"); 4807 " task PC stack pid father\n");
4808 #else 4808 #else
4809 printk(KERN_INFO 4809 printk(KERN_INFO
4810 " task PC stack pid father\n"); 4810 " task PC stack pid father\n");
4811 #endif 4811 #endif
4812 rcu_read_lock(); 4812 rcu_read_lock();
4813 do_each_thread(g, p) { 4813 do_each_thread(g, p) {
4814 /* 4814 /*
4815 * reset the NMI-timeout, listing all files on a slow 4815 * reset the NMI-timeout, listing all files on a slow
4816 * console might take a lot of time: 4816 * console might take a lot of time:
4817 */ 4817 */
4818 touch_nmi_watchdog(); 4818 touch_nmi_watchdog();
4819 if (!state_filter || (p->state & state_filter)) 4819 if (!state_filter || (p->state & state_filter))
4820 sched_show_task(p); 4820 sched_show_task(p);
4821 } while_each_thread(g, p); 4821 } while_each_thread(g, p);
4822 4822
4823 touch_all_softlockup_watchdogs(); 4823 touch_all_softlockup_watchdogs();
4824 4824
4825 #ifdef CONFIG_SCHED_DEBUG 4825 #ifdef CONFIG_SCHED_DEBUG
4826 sysrq_sched_debug_show(); 4826 sysrq_sched_debug_show();
4827 #endif 4827 #endif
4828 rcu_read_unlock(); 4828 rcu_read_unlock();
4829 /* 4829 /*
4830 * Only show locks if all tasks are dumped: 4830 * Only show locks if all tasks are dumped:
4831 */ 4831 */
4832 if (!state_filter) 4832 if (!state_filter)
4833 debug_show_all_locks(); 4833 debug_show_all_locks();
4834 } 4834 }
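Hedged note on the callers: the sysrq handlers are the usual entry points, dumping either every task or only the blocked ones:

    show_state_filter(0);                       /* all tasks */
    show_state_filter(TASK_UNINTERRUPTIBLE);    /* D-state tasks only */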
4835 4835
4836 void __cpuinit init_idle_bootup_task(struct task_struct *idle) 4836 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4837 { 4837 {
4838 idle->sched_class = &idle_sched_class; 4838 idle->sched_class = &idle_sched_class;
4839 } 4839 }
4840 4840
4841 /** 4841 /**
4842 * init_idle - set up an idle thread for a given CPU 4842 * init_idle - set up an idle thread for a given CPU
4843 * @idle: task in question 4843 * @idle: task in question
4844 * @cpu: cpu the idle task belongs to 4844 * @cpu: cpu the idle task belongs to
4845 * 4845 *
4846 * NOTE: this function does not set the idle thread's NEED_RESCHED 4846 * NOTE: this function does not set the idle thread's NEED_RESCHED
4847 * flag, to make booting more robust. 4847 * flag, to make booting more robust.
4848 */ 4848 */
4849 void __cpuinit init_idle(struct task_struct *idle, int cpu) 4849 void __cpuinit init_idle(struct task_struct *idle, int cpu)
4850 { 4850 {
4851 struct rq *rq = cpu_rq(cpu); 4851 struct rq *rq = cpu_rq(cpu);
4852 unsigned long flags; 4852 unsigned long flags;
4853 4853
4854 raw_spin_lock_irqsave(&rq->lock, flags); 4854 raw_spin_lock_irqsave(&rq->lock, flags);
4855 4855
4856 __sched_fork(idle); 4856 __sched_fork(idle);
4857 idle->state = TASK_RUNNING; 4857 idle->state = TASK_RUNNING;
4858 idle->se.exec_start = sched_clock(); 4858 idle->se.exec_start = sched_clock();
4859 4859
4860 do_set_cpus_allowed(idle, cpumask_of(cpu)); 4860 do_set_cpus_allowed(idle, cpumask_of(cpu));
4861 /* 4861 /*
4862 * We're having a chicken and egg problem, even though we are 4862 * We're having a chicken and egg problem, even though we are
4863 * holding rq->lock, the cpu isn't yet set to this cpu so the 4863 * holding rq->lock, the cpu isn't yet set to this cpu so the
4864 * lockdep check in task_group() will fail. 4864 * lockdep check in task_group() will fail.
4865 * 4865 *
4866 * Similar case to sched_fork(). / Alternatively we could 4866 * Similar case to sched_fork(). / Alternatively we could
4867 * use task_rq_lock() here and obtain the other rq->lock. 4867 * use task_rq_lock() here and obtain the other rq->lock.
4868 * 4868 *
4869 * Silence PROVE_RCU 4869 * Silence PROVE_RCU
4870 */ 4870 */
4871 rcu_read_lock(); 4871 rcu_read_lock();
4872 __set_task_cpu(idle, cpu); 4872 __set_task_cpu(idle, cpu);
4873 rcu_read_unlock(); 4873 rcu_read_unlock();
4874 4874
4875 rq->curr = rq->idle = idle; 4875 rq->curr = rq->idle = idle;
4876 #if defined(CONFIG_SMP) 4876 #if defined(CONFIG_SMP)
4877 idle->on_cpu = 1; 4877 idle->on_cpu = 1;
4878 #endif 4878 #endif
4879 raw_spin_unlock_irqrestore(&rq->lock, flags); 4879 raw_spin_unlock_irqrestore(&rq->lock, flags);
4880 4880
4881 /* Set the preempt count _outside_ the spinlocks! */ 4881 /* Set the preempt count _outside_ the spinlocks! */
4882 task_thread_info(idle)->preempt_count = 0; 4882 task_thread_info(idle)->preempt_count = 0;
4883 4883
4884 /* 4884 /*
4885 * The idle tasks have their own, simple scheduling class: 4885 * The idle tasks have their own, simple scheduling class:
4886 */ 4886 */
4887 idle->sched_class = &idle_sched_class; 4887 idle->sched_class = &idle_sched_class;
4888 ftrace_graph_init_idle_task(idle, cpu); 4888 ftrace_graph_init_idle_task(idle, cpu);
4889 #if defined(CONFIG_SMP) 4889 #if defined(CONFIG_SMP)
4890 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); 4890 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
4891 #endif 4891 #endif
4892 } 4892 }
4893 4893
4894 #ifdef CONFIG_SMP 4894 #ifdef CONFIG_SMP
4895 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 4895 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4896 { 4896 {
4897 if (p->sched_class && p->sched_class->set_cpus_allowed) 4897 if (p->sched_class && p->sched_class->set_cpus_allowed)
4898 p->sched_class->set_cpus_allowed(p, new_mask); 4898 p->sched_class->set_cpus_allowed(p, new_mask);
4899 4899
4900 cpumask_copy(&p->cpus_allowed, new_mask); 4900 cpumask_copy(&p->cpus_allowed, new_mask);
4901 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 4901 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
4902 } 4902 }
4903 4903
4904 /* 4904 /*
4905 * This is how migration works: 4905 * This is how migration works:
4906 * 4906 *
4907 * 1) we invoke migration_cpu_stop() on the target CPU using 4907 * 1) we invoke migration_cpu_stop() on the target CPU using
4908 * stop_one_cpu(). 4908 * stop_one_cpu().
4909 * 2) stopper starts to run (implicitly forcing the migrated thread 4909 * 2) stopper starts to run (implicitly forcing the migrated thread
4910 * off the CPU) 4910 * off the CPU)
4911 * 3) it checks whether the migrated task is still in the wrong runqueue. 4911 * 3) it checks whether the migrated task is still in the wrong runqueue.
4912 * 4) if it's in the wrong runqueue then the migration thread removes 4912 * 4) if it's in the wrong runqueue then the migration thread removes
4913 * it and puts it into the right queue. 4913 * it and puts it into the right queue.
4914 * 5) stopper completes and stop_one_cpu() returns and the migration 4914 * 5) stopper completes and stop_one_cpu() returns and the migration
4915 * is done. 4915 * is done.
4916 */ 4916 */
4917 4917
4918 /* 4918 /*
4919 * Change a given task's CPU affinity. Migrate the thread to a 4919 * Change a given task's CPU affinity. Migrate the thread to a
4920 * proper CPU and schedule it away if the CPU it's executing on 4920 * proper CPU and schedule it away if the CPU it's executing on
4921 * is removed from the allowed bitmask. 4921 * is removed from the allowed bitmask.
4922 * 4922 *
4923 * NOTE: the caller must have a valid reference to the task, the 4923 * NOTE: the caller must have a valid reference to the task, the
4924 * task must not exit() & deallocate itself prematurely. The 4924 * task must not exit() & deallocate itself prematurely. The
4925 * call is not atomic; no spinlocks may be held. 4925 * call is not atomic; no spinlocks may be held.
4926 */ 4926 */
4927 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 4927 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
4928 { 4928 {
4929 unsigned long flags; 4929 unsigned long flags;
4930 struct rq *rq; 4930 struct rq *rq;
4931 unsigned int dest_cpu; 4931 unsigned int dest_cpu;
4932 int ret = 0; 4932 int ret = 0;
4933 4933
4934 rq = task_rq_lock(p, &flags); 4934 rq = task_rq_lock(p, &flags);
4935 4935
4936 if (cpumask_equal(&p->cpus_allowed, new_mask)) 4936 if (cpumask_equal(&p->cpus_allowed, new_mask))
4937 goto out; 4937 goto out;
4938 4938
4939 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 4939 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
4940 ret = -EINVAL; 4940 ret = -EINVAL;
4941 goto out; 4941 goto out;
4942 } 4942 }
4943 4943
4944 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { 4944 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
4945 ret = -EINVAL; 4945 ret = -EINVAL;
4946 goto out; 4946 goto out;
4947 } 4947 }
4948 4948
4949 do_set_cpus_allowed(p, new_mask); 4949 do_set_cpus_allowed(p, new_mask);
4950 4950
4951 /* Can the task run on the task's current CPU? If so, we're done */ 4951 /* Can the task run on the task's current CPU? If so, we're done */
4952 if (cpumask_test_cpu(task_cpu(p), new_mask)) 4952 if (cpumask_test_cpu(task_cpu(p), new_mask))
4953 goto out; 4953 goto out;
4954 4954
4955 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 4955 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
4956 if (p->on_rq) { 4956 if (p->on_rq) {
4957 struct migration_arg arg = { p, dest_cpu }; 4957 struct migration_arg arg = { p, dest_cpu };
4958 /* Need help from migration thread: drop lock and wait. */ 4958 /* Need help from migration thread: drop lock and wait. */
4959 task_rq_unlock(rq, p, &flags); 4959 task_rq_unlock(rq, p, &flags);
4960 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 4960 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
4961 tlb_migrate_finish(p->mm); 4961 tlb_migrate_finish(p->mm);
4962 return 0; 4962 return 0;
4963 } 4963 }
4964 out: 4964 out:
4965 task_rq_unlock(rq, p, &flags); 4965 task_rq_unlock(rq, p, &flags);
4966 4966
4967 return ret; 4967 return ret;
4968 } 4968 }
4969 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); 4969 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
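The five-step protocol described above is what set_cpus_allowed_ptr() drives when the task is runnable: it drops the rq lock and hands a migration_arg to the stopper via stop_one_cpu(). A minimal sketch of a caller, say a driver pinning one of its kernel threads to two CPUs (the helper name, the chosen mask and the error handling are illustrative assumptions, not part of this file):

#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/sched.h>

/* Restrict @worker to CPUs 0 and 1; may sleep, so no spinlocks held here. */
static int pin_worker(struct task_struct *worker)
{
	cpumask_var_t mask;
	int ret;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(mask);
	cpumask_set_cpu(0, mask);
	cpumask_set_cpu(1, mask);

	/* Returns -EINVAL if the mask contains no active CPU (see above). */
	ret = set_cpus_allowed_ptr(worker, mask);

	free_cpumask_var(mask);
	return ret;
}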
4970 4970
4971 /* 4971 /*
4972 * Move (not current) task off this cpu, onto dest cpu. We're doing 4972 * Move (not current) task off this cpu, onto dest cpu. We're doing
4973 * this because either it can't run here any more (set_cpus_allowed() 4973 * this because either it can't run here any more (set_cpus_allowed()
4974 * away from this CPU, or CPU going down), or because we're 4974 * away from this CPU, or CPU going down), or because we're
4975 * attempting to rebalance this task on exec (sched_exec). 4975 * attempting to rebalance this task on exec (sched_exec).
4976 * 4976 *
4977 * So we race with normal scheduler movements, but that's OK, as long 4977 * So we race with normal scheduler movements, but that's OK, as long
4978 * as the task is no longer on this CPU. 4978 * as the task is no longer on this CPU.
4979 * 4979 *
4980 * Returns non-zero if task was successfully migrated. 4980 * Returns non-zero if task was successfully migrated.
4981 */ 4981 */
4982 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4982 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4983 { 4983 {
4984 struct rq *rq_dest, *rq_src; 4984 struct rq *rq_dest, *rq_src;
4985 int ret = 0; 4985 int ret = 0;
4986 4986
4987 if (unlikely(!cpu_active(dest_cpu))) 4987 if (unlikely(!cpu_active(dest_cpu)))
4988 return ret; 4988 return ret;
4989 4989
4990 rq_src = cpu_rq(src_cpu); 4990 rq_src = cpu_rq(src_cpu);
4991 rq_dest = cpu_rq(dest_cpu); 4991 rq_dest = cpu_rq(dest_cpu);
4992 4992
4993 raw_spin_lock(&p->pi_lock); 4993 raw_spin_lock(&p->pi_lock);
4994 double_rq_lock(rq_src, rq_dest); 4994 double_rq_lock(rq_src, rq_dest);
4995 /* Already moved. */ 4995 /* Already moved. */
4996 if (task_cpu(p) != src_cpu) 4996 if (task_cpu(p) != src_cpu)
4997 goto done; 4997 goto done;
4998 /* Affinity changed (again). */ 4998 /* Affinity changed (again). */
4999 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 4999 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
5000 goto fail; 5000 goto fail;
5001 5001
5002 /* 5002 /*
5003 * If we're not on a rq, the next wake-up will ensure we're 5003 * If we're not on a rq, the next wake-up will ensure we're
5004 * placed properly. 5004 * placed properly.
5005 */ 5005 */
5006 if (p->on_rq) { 5006 if (p->on_rq) {
5007 dequeue_task(rq_src, p, 0); 5007 dequeue_task(rq_src, p, 0);
5008 set_task_cpu(p, dest_cpu); 5008 set_task_cpu(p, dest_cpu);
5009 enqueue_task(rq_dest, p, 0); 5009 enqueue_task(rq_dest, p, 0);
5010 check_preempt_curr(rq_dest, p, 0); 5010 check_preempt_curr(rq_dest, p, 0);
5011 } 5011 }
5012 done: 5012 done:
5013 ret = 1; 5013 ret = 1;
5014 fail: 5014 fail:
5015 double_rq_unlock(rq_src, rq_dest); 5015 double_rq_unlock(rq_src, rq_dest);
5016 raw_spin_unlock(&p->pi_lock); 5016 raw_spin_unlock(&p->pi_lock);
5017 return ret; 5017 return ret;
5018 } 5018 }
5019 5019
5020 /* 5020 /*
5021 * migration_cpu_stop - this will be executed by a highprio stopper thread 5021 * migration_cpu_stop - this will be executed by a highprio stopper thread
5022 * and performs thread migration by bumping thread off CPU then 5022 * and performs thread migration by bumping thread off CPU then
5023 * 'pushing' onto another runqueue. 5023 * 'pushing' onto another runqueue.
5024 */ 5024 */
5025 static int migration_cpu_stop(void *data) 5025 static int migration_cpu_stop(void *data)
5026 { 5026 {
5027 struct migration_arg *arg = data; 5027 struct migration_arg *arg = data;
5028 5028
5029 /* 5029 /*
5030 * The original target cpu might have gone down and we might 5030 * The original target cpu might have gone down and we might
5031 * be on another cpu but it doesn't matter. 5031 * be on another cpu but it doesn't matter.
5032 */ 5032 */
5033 local_irq_disable(); 5033 local_irq_disable();
5034 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); 5034 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
5035 local_irq_enable(); 5035 local_irq_enable();
5036 return 0; 5036 return 0;
5037 } 5037 }
5038 5038
5039 #ifdef CONFIG_HOTPLUG_CPU 5039 #ifdef CONFIG_HOTPLUG_CPU
5040 5040
5041 /* 5041 /*
5042 * Ensures that the idle task is using init_mm right before its cpu goes 5042 * Ensures that the idle task is using init_mm right before its cpu goes
5043 * offline. 5043 * offline.
5044 */ 5044 */
5045 void idle_task_exit(void) 5045 void idle_task_exit(void)
5046 { 5046 {
5047 struct mm_struct *mm = current->active_mm; 5047 struct mm_struct *mm = current->active_mm;
5048 5048
5049 BUG_ON(cpu_online(smp_processor_id())); 5049 BUG_ON(cpu_online(smp_processor_id()));
5050 5050
5051 if (mm != &init_mm) 5051 if (mm != &init_mm)
5052 switch_mm(mm, &init_mm, current); 5052 switch_mm(mm, &init_mm, current);
5053 mmdrop(mm); 5053 mmdrop(mm);
5054 } 5054 }
5055 5055
5056 /* 5056 /*
5057 * While a dead CPU has no uninterruptible tasks queued at this point, 5057 * While a dead CPU has no uninterruptible tasks queued at this point,
5058 * it might still have a nonzero ->nr_uninterruptible counter, because 5058 * it might still have a nonzero ->nr_uninterruptible counter, because
5059 * for performance reasons the counter is not strictly tracking tasks to 5059 * for performance reasons the counter is not strictly tracking tasks to
5060 * their home CPUs. So we just add the counter to another CPU's counter, 5060 * their home CPUs. So we just add the counter to another CPU's counter,
5061 * to keep the global sum constant after CPU-down: 5061 * to keep the global sum constant after CPU-down:
5062 */ 5062 */
5063 static void migrate_nr_uninterruptible(struct rq *rq_src) 5063 static void migrate_nr_uninterruptible(struct rq *rq_src)
5064 { 5064 {
5065 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); 5065 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
5066 5066
5067 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 5067 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5068 rq_src->nr_uninterruptible = 0; 5068 rq_src->nr_uninterruptible = 0;
5069 } 5069 }
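A small worked example of the invariant being preserved (the counts are hypothetical):

/* Before CPU3 goes down:  nr_uninterruptible = { CPU0: 4, CPU3: -1 }, global sum = 3
 * migrate_nr_uninterruptible(cpu_rq(3)):  CPU0 += -1  ->  3,  CPU3 = 0
 * After:                   nr_uninterruptible = { CPU0: 3, CPU3:  0 }, global sum = 3
 */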
5070 5070
5071 /* 5071 /*
5072 * remove the tasks which were accounted by rq from calc_load_tasks. 5072 * remove the tasks which were accounted by rq from calc_load_tasks.
5073 */ 5073 */
5074 static void calc_global_load_remove(struct rq *rq) 5074 static void calc_global_load_remove(struct rq *rq)
5075 { 5075 {
5076 atomic_long_sub(rq->calc_load_active, &calc_load_tasks); 5076 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5077 rq->calc_load_active = 0; 5077 rq->calc_load_active = 0;
5078 } 5078 }
5079 5079
5080 /* 5080 /*
5081 * Migrate all tasks from the rq, sleeping tasks will be migrated by 5081 * Migrate all tasks from the rq, sleeping tasks will be migrated by
5082 * try_to_wake_up()->select_task_rq(). 5082 * try_to_wake_up()->select_task_rq().
5083 * 5083 *
5084 * Called with rq->lock held even though we're in stop_machine() and 5084 * Called with rq->lock held even though we're in stop_machine() and
5085 * there's no concurrency possible; we hold the required locks anyway 5085 * there's no concurrency possible; we hold the required locks anyway
5086 * because of lock validation efforts. 5086 * because of lock validation efforts.
5087 */ 5087 */
5088 static void migrate_tasks(unsigned int dead_cpu) 5088 static void migrate_tasks(unsigned int dead_cpu)
5089 { 5089 {
5090 struct rq *rq = cpu_rq(dead_cpu); 5090 struct rq *rq = cpu_rq(dead_cpu);
5091 struct task_struct *next, *stop = rq->stop; 5091 struct task_struct *next, *stop = rq->stop;
5092 int dest_cpu; 5092 int dest_cpu;
5093 5093
5094 /* 5094 /*
5095 * Fudge the rq selection such that the below task selection loop 5095 * Fudge the rq selection such that the below task selection loop
5096 * doesn't get stuck on the currently eligible stop task. 5096 * doesn't get stuck on the currently eligible stop task.
5097 * 5097 *
5098 * We're currently inside stop_machine() and the rq is either stuck 5098 * We're currently inside stop_machine() and the rq is either stuck
5099 * in the stop_machine_cpu_stop() loop, or we're executing this code, 5099 * in the stop_machine_cpu_stop() loop, or we're executing this code,
5100 * either way we should never end up calling schedule() until we're 5100 * either way we should never end up calling schedule() until we're
5101 * done here. 5101 * done here.
5102 */ 5102 */
5103 rq->stop = NULL; 5103 rq->stop = NULL;
5104 5104
5105 /* Ensure any throttled groups are reachable by pick_next_task */ 5105 /* Ensure any throttled groups are reachable by pick_next_task */
5106 unthrottle_offline_cfs_rqs(rq); 5106 unthrottle_offline_cfs_rqs(rq);
5107 5107
5108 for ( ; ; ) { 5108 for ( ; ; ) {
5109 /* 5109 /*
5110 * There's this thread running, bail when that's the only 5110 * There's this thread running, bail when that's the only
5111 * remaining thread. 5111 * remaining thread.
5112 */ 5112 */
5113 if (rq->nr_running == 1) 5113 if (rq->nr_running == 1)
5114 break; 5114 break;
5115 5115
5116 next = pick_next_task(rq); 5116 next = pick_next_task(rq);
5117 BUG_ON(!next); 5117 BUG_ON(!next);
5118 next->sched_class->put_prev_task(rq, next); 5118 next->sched_class->put_prev_task(rq, next);
5119 5119
5120 /* Find suitable destination for @next, with force if needed. */ 5120 /* Find suitable destination for @next, with force if needed. */
5121 dest_cpu = select_fallback_rq(dead_cpu, next); 5121 dest_cpu = select_fallback_rq(dead_cpu, next);
5122 raw_spin_unlock(&rq->lock); 5122 raw_spin_unlock(&rq->lock);
5123 5123
5124 __migrate_task(next, dead_cpu, dest_cpu); 5124 __migrate_task(next, dead_cpu, dest_cpu);
5125 5125
5126 raw_spin_lock(&rq->lock); 5126 raw_spin_lock(&rq->lock);
5127 } 5127 }
5128 5128
5129 rq->stop = stop; 5129 rq->stop = stop;
5130 } 5130 }
5131 5131
5132 #endif /* CONFIG_HOTPLUG_CPU */ 5132 #endif /* CONFIG_HOTPLUG_CPU */
5133 5133
5134 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 5134 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
5135 5135
5136 static struct ctl_table sd_ctl_dir[] = { 5136 static struct ctl_table sd_ctl_dir[] = {
5137 { 5137 {
5138 .procname = "sched_domain", 5138 .procname = "sched_domain",
5139 .mode = 0555, 5139 .mode = 0555,
5140 }, 5140 },
5141 {} 5141 {}
5142 }; 5142 };
5143 5143
5144 static struct ctl_table sd_ctl_root[] = { 5144 static struct ctl_table sd_ctl_root[] = {
5145 { 5145 {
5146 .procname = "kernel", 5146 .procname = "kernel",
5147 .mode = 0555, 5147 .mode = 0555,
5148 .child = sd_ctl_dir, 5148 .child = sd_ctl_dir,
5149 }, 5149 },
5150 {} 5150 {}
5151 }; 5151 };
5152 5152
5153 static struct ctl_table *sd_alloc_ctl_entry(int n) 5153 static struct ctl_table *sd_alloc_ctl_entry(int n)
5154 { 5154 {
5155 struct ctl_table *entry = 5155 struct ctl_table *entry =
5156 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); 5156 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
5157 5157
5158 return entry; 5158 return entry;
5159 } 5159 }
5160 5160
5161 static void sd_free_ctl_entry(struct ctl_table **tablep) 5161 static void sd_free_ctl_entry(struct ctl_table **tablep)
5162 { 5162 {
5163 struct ctl_table *entry; 5163 struct ctl_table *entry;
5164 5164
5165 /* 5165 /*
5166 * In the intermediate directories, both the child directory and 5166 * In the intermediate directories, both the child directory and
5167 * procname are dynamically allocated and could fail but the mode 5167 * procname are dynamically allocated and could fail but the mode
5168 * will always be set. In the lowest directory the names are 5168 * will always be set. In the lowest directory the names are
5169 * static strings and all have proc handlers. 5169 * static strings and all have proc handlers.
5170 */ 5170 */
5171 for (entry = *tablep; entry->mode; entry++) { 5171 for (entry = *tablep; entry->mode; entry++) {
5172 if (entry->child) 5172 if (entry->child)
5173 sd_free_ctl_entry(&entry->child); 5173 sd_free_ctl_entry(&entry->child);
5174 if (entry->proc_handler == NULL) 5174 if (entry->proc_handler == NULL)
5175 kfree(entry->procname); 5175 kfree(entry->procname);
5176 } 5176 }
5177 5177
5178 kfree(*tablep); 5178 kfree(*tablep);
5179 *tablep = NULL; 5179 *tablep = NULL;
5180 } 5180 }
5181 5181
5182 static void 5182 static void
5183 set_table_entry(struct ctl_table *entry, 5183 set_table_entry(struct ctl_table *entry,
5184 const char *procname, void *data, int maxlen, 5184 const char *procname, void *data, int maxlen,
5185 umode_t mode, proc_handler *proc_handler) 5185 umode_t mode, proc_handler *proc_handler)
5186 { 5186 {
5187 entry->procname = procname; 5187 entry->procname = procname;
5188 entry->data = data; 5188 entry->data = data;
5189 entry->maxlen = maxlen; 5189 entry->maxlen = maxlen;
5190 entry->mode = mode; 5190 entry->mode = mode;
5191 entry->proc_handler = proc_handler; 5191 entry->proc_handler = proc_handler;
5192 } 5192 }
5193 5193
5194 static struct ctl_table * 5194 static struct ctl_table *
5195 sd_alloc_ctl_domain_table(struct sched_domain *sd) 5195 sd_alloc_ctl_domain_table(struct sched_domain *sd)
5196 { 5196 {
5197 struct ctl_table *table = sd_alloc_ctl_entry(13); 5197 struct ctl_table *table = sd_alloc_ctl_entry(13);
5198 5198
5199 if (table == NULL) 5199 if (table == NULL)
5200 return NULL; 5200 return NULL;
5201 5201
5202 set_table_entry(&table[0], "min_interval", &sd->min_interval, 5202 set_table_entry(&table[0], "min_interval", &sd->min_interval,
5203 sizeof(long), 0644, proc_doulongvec_minmax); 5203 sizeof(long), 0644, proc_doulongvec_minmax);
5204 set_table_entry(&table[1], "max_interval", &sd->max_interval, 5204 set_table_entry(&table[1], "max_interval", &sd->max_interval,
5205 sizeof(long), 0644, proc_doulongvec_minmax); 5205 sizeof(long), 0644, proc_doulongvec_minmax);
5206 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 5206 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
5207 sizeof(int), 0644, proc_dointvec_minmax); 5207 sizeof(int), 0644, proc_dointvec_minmax);
5208 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 5208 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
5209 sizeof(int), 0644, proc_dointvec_minmax); 5209 sizeof(int), 0644, proc_dointvec_minmax);
5210 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 5210 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
5211 sizeof(int), 0644, proc_dointvec_minmax); 5211 sizeof(int), 0644, proc_dointvec_minmax);
5212 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 5212 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
5213 sizeof(int), 0644, proc_dointvec_minmax); 5213 sizeof(int), 0644, proc_dointvec_minmax);
5214 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 5214 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
5215 sizeof(int), 0644, proc_dointvec_minmax); 5215 sizeof(int), 0644, proc_dointvec_minmax);
5216 set_table_entry(&table[7], "busy_factor", &sd->busy_factor, 5216 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
5217 sizeof(int), 0644, proc_dointvec_minmax); 5217 sizeof(int), 0644, proc_dointvec_minmax);
5218 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 5218 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
5219 sizeof(int), 0644, proc_dointvec_minmax); 5219 sizeof(int), 0644, proc_dointvec_minmax);
5220 set_table_entry(&table[9], "cache_nice_tries", 5220 set_table_entry(&table[9], "cache_nice_tries",
5221 &sd->cache_nice_tries, 5221 &sd->cache_nice_tries,
5222 sizeof(int), 0644, proc_dointvec_minmax); 5222 sizeof(int), 0644, proc_dointvec_minmax);
5223 set_table_entry(&table[10], "flags", &sd->flags, 5223 set_table_entry(&table[10], "flags", &sd->flags,
5224 sizeof(int), 0644, proc_dointvec_minmax); 5224 sizeof(int), 0644, proc_dointvec_minmax);
5225 set_table_entry(&table[11], "name", sd->name, 5225 set_table_entry(&table[11], "name", sd->name,
5226 CORENAME_MAX_SIZE, 0444, proc_dostring); 5226 CORENAME_MAX_SIZE, 0444, proc_dostring);
5227 /* &table[12] is terminator */ 5227 /* &table[12] is terminator */
5228 5228
5229 return table; 5229 return table;
5230 } 5230 }
5231 5231
5232 static ctl_table *sd_alloc_ctl_cpu_table(int cpu) 5232 static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5233 { 5233 {
5234 struct ctl_table *entry, *table; 5234 struct ctl_table *entry, *table;
5235 struct sched_domain *sd; 5235 struct sched_domain *sd;
5236 int domain_num = 0, i; 5236 int domain_num = 0, i;
5237 char buf[32]; 5237 char buf[32];
5238 5238
5239 for_each_domain(cpu, sd) 5239 for_each_domain(cpu, sd)
5240 domain_num++; 5240 domain_num++;
5241 entry = table = sd_alloc_ctl_entry(domain_num + 1); 5241 entry = table = sd_alloc_ctl_entry(domain_num + 1);
5242 if (table == NULL) 5242 if (table == NULL)
5243 return NULL; 5243 return NULL;
5244 5244
5245 i = 0; 5245 i = 0;
5246 for_each_domain(cpu, sd) { 5246 for_each_domain(cpu, sd) {
5247 snprintf(buf, 32, "domain%d", i); 5247 snprintf(buf, 32, "domain%d", i);
5248 entry->procname = kstrdup(buf, GFP_KERNEL); 5248 entry->procname = kstrdup(buf, GFP_KERNEL);
5249 entry->mode = 0555; 5249 entry->mode = 0555;
5250 entry->child = sd_alloc_ctl_domain_table(sd); 5250 entry->child = sd_alloc_ctl_domain_table(sd);
5251 entry++; 5251 entry++;
5252 i++; 5252 i++;
5253 } 5253 }
5254 return table; 5254 return table;
5255 } 5255 }
5256 5256
5257 static struct ctl_table_header *sd_sysctl_header; 5257 static struct ctl_table_header *sd_sysctl_header;
5258 static void register_sched_domain_sysctl(void) 5258 static void register_sched_domain_sysctl(void)
5259 { 5259 {
5260 int i, cpu_num = num_possible_cpus(); 5260 int i, cpu_num = num_possible_cpus();
5261 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 5261 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
5262 char buf[32]; 5262 char buf[32];
5263 5263
5264 WARN_ON(sd_ctl_dir[0].child); 5264 WARN_ON(sd_ctl_dir[0].child);
5265 sd_ctl_dir[0].child = entry; 5265 sd_ctl_dir[0].child = entry;
5266 5266
5267 if (entry == NULL) 5267 if (entry == NULL)
5268 return; 5268 return;
5269 5269
5270 for_each_possible_cpu(i) { 5270 for_each_possible_cpu(i) {
5271 snprintf(buf, 32, "cpu%d", i); 5271 snprintf(buf, 32, "cpu%d", i);
5272 entry->procname = kstrdup(buf, GFP_KERNEL); 5272 entry->procname = kstrdup(buf, GFP_KERNEL);
5273 entry->mode = 0555; 5273 entry->mode = 0555;
5274 entry->child = sd_alloc_ctl_cpu_table(i); 5274 entry->child = sd_alloc_ctl_cpu_table(i);
5275 entry++; 5275 entry++;
5276 } 5276 }
5277 5277
5278 WARN_ON(sd_sysctl_header); 5278 WARN_ON(sd_sysctl_header);
5279 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 5279 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
5280 } 5280 }
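Taken together, sd_ctl_root/sd_ctl_dir and the per-cpu and per-domain tables built above show up under procfs as a tree of the following shape (how many cpuN/domainM directories exist depends on the machine's topology):

	/proc/sys/kernel/sched_domain/
		cpu0/
			domain0/{min_interval, max_interval, busy_idx, idle_idx,
				 newidle_idx, wake_idx, forkexec_idx, busy_factor,
				 imbalance_pct, cache_nice_tries, flags, name}
			domain1/...
		cpu1/
			...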
5281 5281
5282 /* may be called multiple times per register */ 5282 /* may be called multiple times per register */
5283 static void unregister_sched_domain_sysctl(void) 5283 static void unregister_sched_domain_sysctl(void)
5284 { 5284 {
5285 if (sd_sysctl_header) 5285 if (sd_sysctl_header)
5286 unregister_sysctl_table(sd_sysctl_header); 5286 unregister_sysctl_table(sd_sysctl_header);
5287 sd_sysctl_header = NULL; 5287 sd_sysctl_header = NULL;
5288 if (sd_ctl_dir[0].child) 5288 if (sd_ctl_dir[0].child)
5289 sd_free_ctl_entry(&sd_ctl_dir[0].child); 5289 sd_free_ctl_entry(&sd_ctl_dir[0].child);
5290 } 5290 }
5291 #else 5291 #else
5292 static void register_sched_domain_sysctl(void) 5292 static void register_sched_domain_sysctl(void)
5293 { 5293 {
5294 } 5294 }
5295 static void unregister_sched_domain_sysctl(void) 5295 static void unregister_sched_domain_sysctl(void)
5296 { 5296 {
5297 } 5297 }
5298 #endif 5298 #endif
5299 5299
5300 static void set_rq_online(struct rq *rq) 5300 static void set_rq_online(struct rq *rq)
5301 { 5301 {
5302 if (!rq->online) { 5302 if (!rq->online) {
5303 const struct sched_class *class; 5303 const struct sched_class *class;
5304 5304
5305 cpumask_set_cpu(rq->cpu, rq->rd->online); 5305 cpumask_set_cpu(rq->cpu, rq->rd->online);
5306 rq->online = 1; 5306 rq->online = 1;
5307 5307
5308 for_each_class(class) { 5308 for_each_class(class) {
5309 if (class->rq_online) 5309 if (class->rq_online)
5310 class->rq_online(rq); 5310 class->rq_online(rq);
5311 } 5311 }
5312 } 5312 }
5313 } 5313 }
5314 5314
5315 static void set_rq_offline(struct rq *rq) 5315 static void set_rq_offline(struct rq *rq)
5316 { 5316 {
5317 if (rq->online) { 5317 if (rq->online) {
5318 const struct sched_class *class; 5318 const struct sched_class *class;
5319 5319
5320 for_each_class(class) { 5320 for_each_class(class) {
5321 if (class->rq_offline) 5321 if (class->rq_offline)
5322 class->rq_offline(rq); 5322 class->rq_offline(rq);
5323 } 5323 }
5324 5324
5325 cpumask_clear_cpu(rq->cpu, rq->rd->online); 5325 cpumask_clear_cpu(rq->cpu, rq->rd->online);
5326 rq->online = 0; 5326 rq->online = 0;
5327 } 5327 }
5328 } 5328 }
5329 5329
5330 /* 5330 /*
5331 * migration_call - callback that gets triggered when a CPU is added. 5331 * migration_call - callback that gets triggered when a CPU is added.
5332 * Here we can start up the necessary migration thread for the new CPU. 5332 * Here we can start up the necessary migration thread for the new CPU.
5333 */ 5333 */
5334 static int __cpuinit 5334 static int __cpuinit
5335 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 5335 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5336 { 5336 {
5337 int cpu = (long)hcpu; 5337 int cpu = (long)hcpu;
5338 unsigned long flags; 5338 unsigned long flags;
5339 struct rq *rq = cpu_rq(cpu); 5339 struct rq *rq = cpu_rq(cpu);
5340 5340
5341 switch (action & ~CPU_TASKS_FROZEN) { 5341 switch (action & ~CPU_TASKS_FROZEN) {
5342 5342
5343 case CPU_UP_PREPARE: 5343 case CPU_UP_PREPARE:
5344 rq->calc_load_update = calc_load_update; 5344 rq->calc_load_update = calc_load_update;
5345 break; 5345 break;
5346 5346
5347 case CPU_ONLINE: 5347 case CPU_ONLINE:
5348 /* Update our root-domain */ 5348 /* Update our root-domain */
5349 raw_spin_lock_irqsave(&rq->lock, flags); 5349 raw_spin_lock_irqsave(&rq->lock, flags);
5350 if (rq->rd) { 5350 if (rq->rd) {
5351 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5351 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5352 5352
5353 set_rq_online(rq); 5353 set_rq_online(rq);
5354 } 5354 }
5355 raw_spin_unlock_irqrestore(&rq->lock, flags); 5355 raw_spin_unlock_irqrestore(&rq->lock, flags);
5356 break; 5356 break;
5357 5357
5358 #ifdef CONFIG_HOTPLUG_CPU 5358 #ifdef CONFIG_HOTPLUG_CPU
5359 case CPU_DYING: 5359 case CPU_DYING:
5360 sched_ttwu_pending(); 5360 sched_ttwu_pending();
5361 /* Update our root-domain */ 5361 /* Update our root-domain */
5362 raw_spin_lock_irqsave(&rq->lock, flags); 5362 raw_spin_lock_irqsave(&rq->lock, flags);
5363 if (rq->rd) { 5363 if (rq->rd) {
5364 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5364 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5365 set_rq_offline(rq); 5365 set_rq_offline(rq);
5366 } 5366 }
5367 migrate_tasks(cpu); 5367 migrate_tasks(cpu);
5368 BUG_ON(rq->nr_running != 1); /* the migration thread */ 5368 BUG_ON(rq->nr_running != 1); /* the migration thread */
5369 raw_spin_unlock_irqrestore(&rq->lock, flags); 5369 raw_spin_unlock_irqrestore(&rq->lock, flags);
5370 5370
5371 migrate_nr_uninterruptible(rq); 5371 migrate_nr_uninterruptible(rq);
5372 calc_global_load_remove(rq); 5372 calc_global_load_remove(rq);
5373 break; 5373 break;
5374 #endif 5374 #endif
5375 } 5375 }
5376 5376
5377 update_max_interval(); 5377 update_max_interval();
5378 5378
5379 return NOTIFY_OK; 5379 return NOTIFY_OK;
5380 } 5380 }
5381 5381
5382 /* 5382 /*
5383 * Register at high priority so that task migration (migrate_all_tasks) 5383 * Register at high priority so that task migration (migrate_all_tasks)
5384 * happens before everything else. This has to be lower priority than 5384 * happens before everything else. This has to be lower priority than
5385 * the notifier in the perf_event subsystem, though. 5385 * the notifier in the perf_event subsystem, though.
5386 */ 5386 */
5387 static struct notifier_block __cpuinitdata migration_notifier = { 5387 static struct notifier_block __cpuinitdata migration_notifier = {
5388 .notifier_call = migration_call, 5388 .notifier_call = migration_call,
5389 .priority = CPU_PRI_MIGRATION, 5389 .priority = CPU_PRI_MIGRATION,
5390 }; 5390 };
5391 5391
5392 static int __cpuinit sched_cpu_active(struct notifier_block *nfb, 5392 static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
5393 unsigned long action, void *hcpu) 5393 unsigned long action, void *hcpu)
5394 { 5394 {
5395 switch (action & ~CPU_TASKS_FROZEN) { 5395 switch (action & ~CPU_TASKS_FROZEN) {
5396 case CPU_ONLINE: 5396 case CPU_ONLINE:
5397 case CPU_DOWN_FAILED: 5397 case CPU_DOWN_FAILED:
5398 set_cpu_active((long)hcpu, true); 5398 set_cpu_active((long)hcpu, true);
5399 return NOTIFY_OK; 5399 return NOTIFY_OK;
5400 default: 5400 default:
5401 return NOTIFY_DONE; 5401 return NOTIFY_DONE;
5402 } 5402 }
5403 } 5403 }
5404 5404
5405 static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, 5405 static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
5406 unsigned long action, void *hcpu) 5406 unsigned long action, void *hcpu)
5407 { 5407 {
5408 switch (action & ~CPU_TASKS_FROZEN) { 5408 switch (action & ~CPU_TASKS_FROZEN) {
5409 case CPU_DOWN_PREPARE: 5409 case CPU_DOWN_PREPARE:
5410 set_cpu_active((long)hcpu, false); 5410 set_cpu_active((long)hcpu, false);
5411 return NOTIFY_OK; 5411 return NOTIFY_OK;
5412 default: 5412 default:
5413 return NOTIFY_DONE; 5413 return NOTIFY_DONE;
5414 } 5414 }
5415 } 5415 }
5416 5416
5417 static int __init migration_init(void) 5417 static int __init migration_init(void)
5418 { 5418 {
5419 void *cpu = (void *)(long)smp_processor_id(); 5419 void *cpu = (void *)(long)smp_processor_id();
5420 int err; 5420 int err;
5421 5421
5422 /* Initialize migration for the boot CPU */ 5422 /* Initialize migration for the boot CPU */
5423 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 5423 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5424 BUG_ON(err == NOTIFY_BAD); 5424 BUG_ON(err == NOTIFY_BAD);
5425 migration_call(&migration_notifier, CPU_ONLINE, cpu); 5425 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5426 register_cpu_notifier(&migration_notifier); 5426 register_cpu_notifier(&migration_notifier);
5427 5427
5428 /* Register cpu active notifiers */ 5428 /* Register cpu active notifiers */
5429 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); 5429 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
5430 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); 5430 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
5431 5431
5432 return 0; 5432 return 0;
5433 } 5433 }
5434 early_initcall(migration_init); 5434 early_initcall(migration_init);
5435 #endif 5435 #endif
5436 5436
5437 #ifdef CONFIG_SMP 5437 #ifdef CONFIG_SMP
5438 5438
5439 static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ 5439 static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
5440 5440
5441 #ifdef CONFIG_SCHED_DEBUG 5441 #ifdef CONFIG_SCHED_DEBUG
5442 5442
5443 static __read_mostly int sched_domain_debug_enabled; 5443 static __read_mostly int sched_domain_debug_enabled;
5444 5444
5445 static int __init sched_domain_debug_setup(char *str) 5445 static int __init sched_domain_debug_setup(char *str)
5446 { 5446 {
5447 sched_domain_debug_enabled = 1; 5447 sched_domain_debug_enabled = 1;
5448 5448
5449 return 0; 5449 return 0;
5450 } 5450 }
5451 early_param("sched_debug", sched_domain_debug_setup); 5451 early_param("sched_debug", sched_domain_debug_setup);
5452 5452
5453 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 5453 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5454 struct cpumask *groupmask) 5454 struct cpumask *groupmask)
5455 { 5455 {
5456 struct sched_group *group = sd->groups; 5456 struct sched_group *group = sd->groups;
5457 char str[256]; 5457 char str[256];
5458 5458
5459 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); 5459 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
5460 cpumask_clear(groupmask); 5460 cpumask_clear(groupmask);
5461 5461
5462 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 5462 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
5463 5463
5464 if (!(sd->flags & SD_LOAD_BALANCE)) { 5464 if (!(sd->flags & SD_LOAD_BALANCE)) {
5465 printk("does not load-balance\n"); 5465 printk("does not load-balance\n");
5466 if (sd->parent) 5466 if (sd->parent)
5467 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 5467 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5468 " has parent"); 5468 " has parent");
5469 return -1; 5469 return -1;
5470 } 5470 }
5471 5471
5472 printk(KERN_CONT "span %s level %s\n", str, sd->name); 5472 printk(KERN_CONT "span %s level %s\n", str, sd->name);
5473 5473
5474 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 5474 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
5475 printk(KERN_ERR "ERROR: domain->span does not contain " 5475 printk(KERN_ERR "ERROR: domain->span does not contain "
5476 "CPU%d\n", cpu); 5476 "CPU%d\n", cpu);
5477 } 5477 }
5478 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { 5478 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
5479 printk(KERN_ERR "ERROR: domain->groups does not contain" 5479 printk(KERN_ERR "ERROR: domain->groups does not contain"
5480 " CPU%d\n", cpu); 5480 " CPU%d\n", cpu);
5481 } 5481 }
5482 5482
5483 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 5483 printk(KERN_DEBUG "%*s groups:", level + 1, "");
5484 do { 5484 do {
5485 if (!group) { 5485 if (!group) {
5486 printk("\n"); 5486 printk("\n");
5487 printk(KERN_ERR "ERROR: group is NULL\n"); 5487 printk(KERN_ERR "ERROR: group is NULL\n");
5488 break; 5488 break;
5489 } 5489 }
5490 5490
5491 if (!group->sgp->power) { 5491 if (!group->sgp->power) {
5492 printk(KERN_CONT "\n"); 5492 printk(KERN_CONT "\n");
5493 printk(KERN_ERR "ERROR: domain->cpu_power not " 5493 printk(KERN_ERR "ERROR: domain->cpu_power not "
5494 "set\n"); 5494 "set\n");
5495 break; 5495 break;
5496 } 5496 }
5497 5497
5498 if (!cpumask_weight(sched_group_cpus(group))) { 5498 if (!cpumask_weight(sched_group_cpus(group))) {
5499 printk(KERN_CONT "\n"); 5499 printk(KERN_CONT "\n");
5500 printk(KERN_ERR "ERROR: empty group\n"); 5500 printk(KERN_ERR "ERROR: empty group\n");
5501 break; 5501 break;
5502 } 5502 }
5503 5503
5504 if (cpumask_intersects(groupmask, sched_group_cpus(group))) { 5504 if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
5505 printk(KERN_CONT "\n"); 5505 printk(KERN_CONT "\n");
5506 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5506 printk(KERN_ERR "ERROR: repeated CPUs\n");
5507 break; 5507 break;
5508 } 5508 }
5509 5509
5510 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 5510 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
5511 5511
5512 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 5512 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
5513 5513
5514 printk(KERN_CONT " %s", str); 5514 printk(KERN_CONT " %s", str);
5515 if (group->sgp->power != SCHED_POWER_SCALE) { 5515 if (group->sgp->power != SCHED_POWER_SCALE) {
5516 printk(KERN_CONT " (cpu_power = %d)", 5516 printk(KERN_CONT " (cpu_power = %d)",
5517 group->sgp->power); 5517 group->sgp->power);
5518 } 5518 }
5519 5519
5520 group = group->next; 5520 group = group->next;
5521 } while (group != sd->groups); 5521 } while (group != sd->groups);
5522 printk(KERN_CONT "\n"); 5522 printk(KERN_CONT "\n");
5523 5523
5524 if (!cpumask_equal(sched_domain_span(sd), groupmask)) 5524 if (!cpumask_equal(sched_domain_span(sd), groupmask))
5525 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 5525 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
5526 5526
5527 if (sd->parent && 5527 if (sd->parent &&
5528 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 5528 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
5529 printk(KERN_ERR "ERROR: parent span is not a superset " 5529 printk(KERN_ERR "ERROR: parent span is not a superset "
5530 "of domain->span\n"); 5530 "of domain->span\n");
5531 return 0; 5531 return 0;
5532 } 5532 }
5533 5533
5534 static void sched_domain_debug(struct sched_domain *sd, int cpu) 5534 static void sched_domain_debug(struct sched_domain *sd, int cpu)
5535 { 5535 {
5536 int level = 0; 5536 int level = 0;
5537 5537
5538 if (!sched_domain_debug_enabled) 5538 if (!sched_domain_debug_enabled)
5539 return; 5539 return;
5540 5540
5541 if (!sd) { 5541 if (!sd) {
5542 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 5542 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5543 return; 5543 return;
5544 } 5544 }
5545 5545
5546 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 5546 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5547 5547
5548 for (;;) { 5548 for (;;) {
5549 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) 5549 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
5550 break; 5550 break;
5551 level++; 5551 level++;
5552 sd = sd->parent; 5552 sd = sd->parent;
5553 if (!sd) 5553 if (!sd)
5554 break; 5554 break;
5555 } 5555 }
5556 } 5556 }
5557 #else /* !CONFIG_SCHED_DEBUG */ 5557 #else /* !CONFIG_SCHED_DEBUG */
5558 # define sched_domain_debug(sd, cpu) do { } while (0) 5558 # define sched_domain_debug(sd, cpu) do { } while (0)
5559 #endif /* CONFIG_SCHED_DEBUG */ 5559 #endif /* CONFIG_SCHED_DEBUG */
5560 5560
5561 static int sd_degenerate(struct sched_domain *sd) 5561 static int sd_degenerate(struct sched_domain *sd)
5562 { 5562 {
5563 if (cpumask_weight(sched_domain_span(sd)) == 1) 5563 if (cpumask_weight(sched_domain_span(sd)) == 1)
5564 return 1; 5564 return 1;
5565 5565
5566 /* Following flags need at least 2 groups */ 5566 /* Following flags need at least 2 groups */
5567 if (sd->flags & (SD_LOAD_BALANCE | 5567 if (sd->flags & (SD_LOAD_BALANCE |
5568 SD_BALANCE_NEWIDLE | 5568 SD_BALANCE_NEWIDLE |
5569 SD_BALANCE_FORK | 5569 SD_BALANCE_FORK |
5570 SD_BALANCE_EXEC | 5570 SD_BALANCE_EXEC |
5571 SD_SHARE_CPUPOWER | 5571 SD_SHARE_CPUPOWER |
5572 SD_SHARE_PKG_RESOURCES)) { 5572 SD_SHARE_PKG_RESOURCES)) {
5573 if (sd->groups != sd->groups->next) 5573 if (sd->groups != sd->groups->next)
5574 return 0; 5574 return 0;
5575 } 5575 }
5576 5576
5577 /* Following flags don't use groups */ 5577 /* Following flags don't use groups */
5578 if (sd->flags & (SD_WAKE_AFFINE)) 5578 if (sd->flags & (SD_WAKE_AFFINE))
5579 return 0; 5579 return 0;
5580 5580
5581 return 1; 5581 return 1;
5582 } 5582 }
5583 5583
5584 static int 5584 static int
5585 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 5585 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5586 { 5586 {
5587 unsigned long cflags = sd->flags, pflags = parent->flags; 5587 unsigned long cflags = sd->flags, pflags = parent->flags;
5588 5588
5589 if (sd_degenerate(parent)) 5589 if (sd_degenerate(parent))
5590 return 1; 5590 return 1;
5591 5591
5592 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 5592 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
5593 return 0; 5593 return 0;
5594 5594
5595 /* Flags needing groups don't count if only 1 group in parent */ 5595 /* Flags needing groups don't count if only 1 group in parent */
5596 if (parent->groups == parent->groups->next) { 5596 if (parent->groups == parent->groups->next) {
5597 pflags &= ~(SD_LOAD_BALANCE | 5597 pflags &= ~(SD_LOAD_BALANCE |
5598 SD_BALANCE_NEWIDLE | 5598 SD_BALANCE_NEWIDLE |
5599 SD_BALANCE_FORK | 5599 SD_BALANCE_FORK |
5600 SD_BALANCE_EXEC | 5600 SD_BALANCE_EXEC |
5601 SD_SHARE_CPUPOWER | 5601 SD_SHARE_CPUPOWER |
5602 SD_SHARE_PKG_RESOURCES); 5602 SD_SHARE_PKG_RESOURCES);
5603 if (nr_node_ids == 1) 5603 if (nr_node_ids == 1)
5604 pflags &= ~SD_SERIALIZE; 5604 pflags &= ~SD_SERIALIZE;
5605 } 5605 }
5606 if (~cflags & pflags) 5606 if (~cflags & pflags)
5607 return 0; 5607 return 0;
5608 5608
5609 return 1; 5609 return 1;
5610 } 5610 }
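Two concrete (hypothetical) cases of how these checks collapse the domain tree in cpu_attach_domain() below:

/* - A SIBLING level on a cpu without hyperthreading spans only that cpu,
 *   so cpumask_weight(span) == 1 and sd_degenerate() drops the level.
 * - If an MC domain and its parent CPU domain span the same cpus and the
 *   parent has a single group, every group-requiring flag is masked out
 *   of pflags; when no remaining parent flag is missing from the child,
 *   sd_parent_degenerate() returns 1 and the parent is unlinked.
 */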
5611 5611
5612 static void free_rootdomain(struct rcu_head *rcu) 5612 static void free_rootdomain(struct rcu_head *rcu)
5613 { 5613 {
5614 struct root_domain *rd = container_of(rcu, struct root_domain, rcu); 5614 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
5615 5615
5616 cpupri_cleanup(&rd->cpupri); 5616 cpupri_cleanup(&rd->cpupri);
5617 free_cpumask_var(rd->rto_mask); 5617 free_cpumask_var(rd->rto_mask);
5618 free_cpumask_var(rd->online); 5618 free_cpumask_var(rd->online);
5619 free_cpumask_var(rd->span); 5619 free_cpumask_var(rd->span);
5620 kfree(rd); 5620 kfree(rd);
5621 } 5621 }
5622 5622
5623 static void rq_attach_root(struct rq *rq, struct root_domain *rd) 5623 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
5624 { 5624 {
5625 struct root_domain *old_rd = NULL; 5625 struct root_domain *old_rd = NULL;
5626 unsigned long flags; 5626 unsigned long flags;
5627 5627
5628 raw_spin_lock_irqsave(&rq->lock, flags); 5628 raw_spin_lock_irqsave(&rq->lock, flags);
5629 5629
5630 if (rq->rd) { 5630 if (rq->rd) {
5631 old_rd = rq->rd; 5631 old_rd = rq->rd;
5632 5632
5633 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 5633 if (cpumask_test_cpu(rq->cpu, old_rd->online))
5634 set_rq_offline(rq); 5634 set_rq_offline(rq);
5635 5635
5636 cpumask_clear_cpu(rq->cpu, old_rd->span); 5636 cpumask_clear_cpu(rq->cpu, old_rd->span);
5637 5637
5638 /* 5638 /*
5639 * If we don't want to free the old_rd yet then 5639 * If we don't want to free the old_rd yet then
5640 * set old_rd to NULL to skip the freeing later 5640 * set old_rd to NULL to skip the freeing later
5641 * in this function: 5641 * in this function:
5642 */ 5642 */
5643 if (!atomic_dec_and_test(&old_rd->refcount)) 5643 if (!atomic_dec_and_test(&old_rd->refcount))
5644 old_rd = NULL; 5644 old_rd = NULL;
5645 } 5645 }
5646 5646
5647 atomic_inc(&rd->refcount); 5647 atomic_inc(&rd->refcount);
5648 rq->rd = rd; 5648 rq->rd = rd;
5649 5649
5650 cpumask_set_cpu(rq->cpu, rd->span); 5650 cpumask_set_cpu(rq->cpu, rd->span);
5651 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 5651 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
5652 set_rq_online(rq); 5652 set_rq_online(rq);
5653 5653
5654 raw_spin_unlock_irqrestore(&rq->lock, flags); 5654 raw_spin_unlock_irqrestore(&rq->lock, flags);
5655 5655
5656 if (old_rd) 5656 if (old_rd)
5657 call_rcu_sched(&old_rd->rcu, free_rootdomain); 5657 call_rcu_sched(&old_rd->rcu, free_rootdomain);
5658 } 5658 }
5659 5659
5660 static int init_rootdomain(struct root_domain *rd) 5660 static int init_rootdomain(struct root_domain *rd)
5661 { 5661 {
5662 memset(rd, 0, sizeof(*rd)); 5662 memset(rd, 0, sizeof(*rd));
5663 5663
5664 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) 5664 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
5665 goto out; 5665 goto out;
5666 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 5666 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
5667 goto free_span; 5667 goto free_span;
5668 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 5668 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
5669 goto free_online; 5669 goto free_online;
5670 5670
5671 if (cpupri_init(&rd->cpupri) != 0) 5671 if (cpupri_init(&rd->cpupri) != 0)
5672 goto free_rto_mask; 5672 goto free_rto_mask;
5673 return 0; 5673 return 0;
5674 5674
5675 free_rto_mask: 5675 free_rto_mask:
5676 free_cpumask_var(rd->rto_mask); 5676 free_cpumask_var(rd->rto_mask);
5677 free_online: 5677 free_online:
5678 free_cpumask_var(rd->online); 5678 free_cpumask_var(rd->online);
5679 free_span: 5679 free_span:
5680 free_cpumask_var(rd->span); 5680 free_cpumask_var(rd->span);
5681 out: 5681 out:
5682 return -ENOMEM; 5682 return -ENOMEM;
5683 } 5683 }
5684 5684
5685 /* 5685 /*
5686 * By default the system creates a single root-domain with all cpus as 5686 * By default the system creates a single root-domain with all cpus as
5687 * members (mimicking the global state we have today). 5687 * members (mimicking the global state we have today).
5688 */ 5688 */
5689 struct root_domain def_root_domain; 5689 struct root_domain def_root_domain;
5690 5690
5691 static void init_defrootdomain(void) 5691 static void init_defrootdomain(void)
5692 { 5692 {
5693 init_rootdomain(&def_root_domain); 5693 init_rootdomain(&def_root_domain);
5694 5694
5695 atomic_set(&def_root_domain.refcount, 1); 5695 atomic_set(&def_root_domain.refcount, 1);
5696 } 5696 }
5697 5697
5698 static struct root_domain *alloc_rootdomain(void) 5698 static struct root_domain *alloc_rootdomain(void)
5699 { 5699 {
5700 struct root_domain *rd; 5700 struct root_domain *rd;
5701 5701
5702 rd = kmalloc(sizeof(*rd), GFP_KERNEL); 5702 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
5703 if (!rd) 5703 if (!rd)
5704 return NULL; 5704 return NULL;
5705 5705
5706 if (init_rootdomain(rd) != 0) { 5706 if (init_rootdomain(rd) != 0) {
5707 kfree(rd); 5707 kfree(rd);
5708 return NULL; 5708 return NULL;
5709 } 5709 }
5710 5710
5711 return rd; 5711 return rd;
5712 } 5712 }
5713 5713
5714 static void free_sched_groups(struct sched_group *sg, int free_sgp) 5714 static void free_sched_groups(struct sched_group *sg, int free_sgp)
5715 { 5715 {
5716 struct sched_group *tmp, *first; 5716 struct sched_group *tmp, *first;
5717 5717
5718 if (!sg) 5718 if (!sg)
5719 return; 5719 return;
5720 5720
5721 first = sg; 5721 first = sg;
5722 do { 5722 do {
5723 tmp = sg->next; 5723 tmp = sg->next;
5724 5724
5725 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) 5725 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
5726 kfree(sg->sgp); 5726 kfree(sg->sgp);
5727 5727
5728 kfree(sg); 5728 kfree(sg);
5729 sg = tmp; 5729 sg = tmp;
5730 } while (sg != first); 5730 } while (sg != first);
5731 } 5731 }
5732 5732
5733 static void free_sched_domain(struct rcu_head *rcu) 5733 static void free_sched_domain(struct rcu_head *rcu)
5734 { 5734 {
5735 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); 5735 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
5736 5736
5737 /* 5737 /*
5738 * If it's an overlapping domain it has private groups, iterate and 5738 * If it's an overlapping domain it has private groups, iterate and
5739 * nuke them all. 5739 * nuke them all.
5740 */ 5740 */
5741 if (sd->flags & SD_OVERLAP) { 5741 if (sd->flags & SD_OVERLAP) {
5742 free_sched_groups(sd->groups, 1); 5742 free_sched_groups(sd->groups, 1);
5743 } else if (atomic_dec_and_test(&sd->groups->ref)) { 5743 } else if (atomic_dec_and_test(&sd->groups->ref)) {
5744 kfree(sd->groups->sgp); 5744 kfree(sd->groups->sgp);
5745 kfree(sd->groups); 5745 kfree(sd->groups);
5746 } 5746 }
5747 kfree(sd); 5747 kfree(sd);
5748 } 5748 }
5749 5749
5750 static void destroy_sched_domain(struct sched_domain *sd, int cpu) 5750 static void destroy_sched_domain(struct sched_domain *sd, int cpu)
5751 { 5751 {
5752 call_rcu(&sd->rcu, free_sched_domain); 5752 call_rcu(&sd->rcu, free_sched_domain);
5753 } 5753 }
5754 5754
5755 static void destroy_sched_domains(struct sched_domain *sd, int cpu) 5755 static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5756 { 5756 {
5757 for (; sd; sd = sd->parent) 5757 for (; sd; sd = sd->parent)
5758 destroy_sched_domain(sd, cpu); 5758 destroy_sched_domain(sd, cpu);
5759 } 5759 }
5760 5760
5761 /* 5761 /*
5762 * Keep a special pointer to the highest sched_domain that has 5762 * Keep a special pointer to the highest sched_domain that has
5763 * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain) for this cpu; 5763 * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain) for this cpu;
5764 * this allows us to avoid some pointer chasing in select_idle_sibling(). 5764 * this allows us to avoid some pointer chasing in select_idle_sibling().
5765 * 5765 *
5766 * Also keep a unique ID per domain (we use the first cpu number in 5766 * Also keep a unique ID per domain (we use the first cpu number in
5767 * the cpumask of the domain); this allows us to quickly tell if 5767 * the cpumask of the domain); this allows us to quickly tell if
5768 * two cpus are in the same cache domain, see cpus_share_cache(). 5768 * two cpus are in the same cache domain, see cpus_share_cache().
5769 */ 5769 */
5770 DEFINE_PER_CPU(struct sched_domain *, sd_llc); 5770 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5771 DEFINE_PER_CPU(int, sd_llc_id); 5771 DEFINE_PER_CPU(int, sd_llc_id);
5772 5772
5773 static void update_top_cache_domain(int cpu) 5773 static void update_top_cache_domain(int cpu)
5774 { 5774 {
5775 struct sched_domain *sd; 5775 struct sched_domain *sd;
5776 int id = cpu; 5776 int id = cpu;
5777 5777
5778 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); 5778 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
5779 if (sd) 5779 if (sd)
5780 id = cpumask_first(sched_domain_span(sd)); 5780 id = cpumask_first(sched_domain_span(sd));
5781 5781
5782 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 5782 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5783 per_cpu(sd_llc_id, cpu) = id; 5783 per_cpu(sd_llc_id, cpu) = id;
5784 } 5784 }
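The id cached here is what makes the shared-cache test cheap; cpus_share_cache(), defined elsewhere in this file, reduces to an id comparison along these lines (a sketch, not part of the hunk above):

bool cpus_share_cache(int this_cpu, int that_cpu)
{
	/* Same first-cpu id of the LLC domain => same last-level cache. */
	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}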
5785 5785
5786 /* 5786 /*
5787 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 5787 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5788 * hold the hotplug lock. 5788 * hold the hotplug lock.
5789 */ 5789 */
5790 static void 5790 static void
5791 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) 5791 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
5792 { 5792 {
5793 struct rq *rq = cpu_rq(cpu); 5793 struct rq *rq = cpu_rq(cpu);
5794 struct sched_domain *tmp; 5794 struct sched_domain *tmp;
5795 5795
5796 /* Remove the sched domains which do not contribute to scheduling. */ 5796 /* Remove the sched domains which do not contribute to scheduling. */
5797 for (tmp = sd; tmp; ) { 5797 for (tmp = sd; tmp; ) {
5798 struct sched_domain *parent = tmp->parent; 5798 struct sched_domain *parent = tmp->parent;
5799 if (!parent) 5799 if (!parent)
5800 break; 5800 break;
5801 5801
5802 if (sd_parent_degenerate(tmp, parent)) { 5802 if (sd_parent_degenerate(tmp, parent)) {
5803 tmp->parent = parent->parent; 5803 tmp->parent = parent->parent;
5804 if (parent->parent) 5804 if (parent->parent)
5805 parent->parent->child = tmp; 5805 parent->parent->child = tmp;
5806 destroy_sched_domain(parent, cpu); 5806 destroy_sched_domain(parent, cpu);
5807 } else 5807 } else
5808 tmp = tmp->parent; 5808 tmp = tmp->parent;
5809 } 5809 }
5810 5810
5811 if (sd && sd_degenerate(sd)) { 5811 if (sd && sd_degenerate(sd)) {
5812 tmp = sd; 5812 tmp = sd;
5813 sd = sd->parent; 5813 sd = sd->parent;
5814 destroy_sched_domain(tmp, cpu); 5814 destroy_sched_domain(tmp, cpu);
5815 if (sd) 5815 if (sd)
5816 sd->child = NULL; 5816 sd->child = NULL;
5817 } 5817 }
5818 5818
5819 sched_domain_debug(sd, cpu); 5819 sched_domain_debug(sd, cpu);
5820 5820
5821 rq_attach_root(rq, rd); 5821 rq_attach_root(rq, rd);
5822 tmp = rq->sd; 5822 tmp = rq->sd;
5823 rcu_assign_pointer(rq->sd, sd); 5823 rcu_assign_pointer(rq->sd, sd);
5824 destroy_sched_domains(tmp, cpu); 5824 destroy_sched_domains(tmp, cpu);
5825 5825
5826 update_top_cache_domain(cpu); 5826 update_top_cache_domain(cpu);
5827 } 5827 }
5828 5828
5829 /* cpus with isolated domains */ 5829 /* cpus with isolated domains */
5830 static cpumask_var_t cpu_isolated_map; 5830 static cpumask_var_t cpu_isolated_map;
5831 5831
5832 /* Setup the mask of cpus configured for isolated domains */ 5832 /* Setup the mask of cpus configured for isolated domains */
5833 static int __init isolated_cpu_setup(char *str) 5833 static int __init isolated_cpu_setup(char *str)
5834 { 5834 {
5835 alloc_bootmem_cpumask_var(&cpu_isolated_map); 5835 alloc_bootmem_cpumask_var(&cpu_isolated_map);
5836 cpulist_parse(str, cpu_isolated_map); 5836 cpulist_parse(str, cpu_isolated_map);
5837 return 1; 5837 return 1;
5838 } 5838 }
5839 5839
5840 __setup("isolcpus=", isolated_cpu_setup); 5840 __setup("isolcpus=", isolated_cpu_setup);
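The parameter takes a cpulist on the kernel command line, for example (values chosen only for illustration):

	isolcpus=1,5-7

cpulist_parse() accepts single cpus and ranges, so CPUs 1 and 5 through 7 end up in cpu_isolated_map and are kept out of the general sched domains built later.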
5841 5841
5842 #ifdef CONFIG_NUMA 5842 #ifdef CONFIG_NUMA
5843 5843
5844 /** 5844 /**
5845 * find_next_best_node - find the next node to include in a sched_domain 5845 * find_next_best_node - find the next node to include in a sched_domain
5846 * @node: node whose sched_domain we're building 5846 * @node: node whose sched_domain we're building
5847 * @used_nodes: nodes already in the sched_domain 5847 * @used_nodes: nodes already in the sched_domain
5848 * 5848 *
5849 * Find the next node to include in a given scheduling domain. Simply 5849 * Find the next node to include in a given scheduling domain. Simply
5850 * finds the closest node not already in the @used_nodes map. 5850 * finds the closest node not already in the @used_nodes map.
5851 * 5851 *
5852 * Should use nodemask_t. 5852 * Should use nodemask_t.
5853 */ 5853 */
5854 static int find_next_best_node(int node, nodemask_t *used_nodes) 5854 static int find_next_best_node(int node, nodemask_t *used_nodes)
5855 { 5855 {
5856 int i, n, val, min_val, best_node = -1; 5856 int i, n, val, min_val, best_node = -1;
5857 5857
5858 min_val = INT_MAX; 5858 min_val = INT_MAX;
5859 5859
5860 for (i = 0; i < nr_node_ids; i++) { 5860 for (i = 0; i < nr_node_ids; i++) {
5861 /* Start at @node */ 5861 /* Start at @node */
5862 n = (node + i) % nr_node_ids; 5862 n = (node + i) % nr_node_ids;
5863 5863
5864 if (!nr_cpus_node(n)) 5864 if (!nr_cpus_node(n))
5865 continue; 5865 continue;
5866 5866
5867 /* Skip already used nodes */ 5867 /* Skip already used nodes */
5868 if (node_isset(n, *used_nodes)) 5868 if (node_isset(n, *used_nodes))
5869 continue; 5869 continue;
5870 5870
5871 /* Simple min distance search */ 5871 /* Simple min distance search */
5872 val = node_distance(node, n); 5872 val = node_distance(node, n);
5873 5873
5874 if (val < min_val) { 5874 if (val < min_val) {
5875 min_val = val; 5875 min_val = val;
5876 best_node = n; 5876 best_node = n;
5877 } 5877 }
5878 } 5878 }
5879 5879
5880 if (best_node != -1) 5880 if (best_node != -1)
5881 node_set(best_node, *used_nodes); 5881 node_set(best_node, *used_nodes);
5882 return best_node; 5882 return best_node;
5883 } 5883 }
5884 5884
5885 /** 5885 /**
5886 * sched_domain_node_span - get a cpumask for a node's sched_domain 5886 * sched_domain_node_span - get a cpumask for a node's sched_domain
5887 * @node: node whose cpumask we're constructing 5887 * @node: node whose cpumask we're constructing
5888 * @span: resulting cpumask 5888 * @span: resulting cpumask
5889 * 5889 *
5890 * Given a node, construct a good cpumask for its sched_domain to span. It 5890 * Given a node, construct a good cpumask for its sched_domain to span. It
5891 * should be one that prevents unnecessary balancing, but also spreads tasks 5891 * should be one that prevents unnecessary balancing, but also spreads tasks
5892 * out optimally. 5892 * out optimally.
5893 */ 5893 */
5894 static void sched_domain_node_span(int node, struct cpumask *span) 5894 static void sched_domain_node_span(int node, struct cpumask *span)
5895 { 5895 {
5896 nodemask_t used_nodes; 5896 nodemask_t used_nodes;
5897 int i; 5897 int i;
5898 5898
5899 cpumask_clear(span); 5899 cpumask_clear(span);
5900 nodes_clear(used_nodes); 5900 nodes_clear(used_nodes);
5901 5901
5902 cpumask_or(span, span, cpumask_of_node(node)); 5902 cpumask_or(span, span, cpumask_of_node(node));
5903 node_set(node, used_nodes); 5903 node_set(node, used_nodes);
5904 5904
5905 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 5905 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5906 int next_node = find_next_best_node(node, &used_nodes); 5906 int next_node = find_next_best_node(node, &used_nodes);
5907 if (next_node < 0) 5907 if (next_node < 0)
5908 break; 5908 break;
5909 cpumask_or(span, span, cpumask_of_node(next_node)); 5909 cpumask_or(span, span, cpumask_of_node(next_node));
5910 } 5910 }
5911 } 5911 }
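A worked example on a hypothetical four-node box, with node_distance(0, n) = {10, 20, 40, 40} (distances invented for illustration); building the span for node 0 proceeds closest-first:

/* find_next_best_node(0, {0})    -> 1  (distance 20)
 * find_next_best_node(0, {0,1})  -> 2  (distance 40; ties go to the node
 *                                       reached first in the scan order)
 * span |= cpumask_of_node(0) | cpumask_of_node(1) | cpumask_of_node(2) ...
 * The loop adds at most SD_NODES_PER_DOMAIN - 1 extra nodes and stops
 * early once find_next_best_node() returns -1 (no unused node with cpus).
 */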
5912 5912
5913 static const struct cpumask *cpu_node_mask(int cpu) 5913 static const struct cpumask *cpu_node_mask(int cpu)
5914 { 5914 {
5915 lockdep_assert_held(&sched_domains_mutex); 5915 lockdep_assert_held(&sched_domains_mutex);
5916 5916
5917 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); 5917 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
5918 5918
5919 return sched_domains_tmpmask; 5919 return sched_domains_tmpmask;
5920 } 5920 }
5921 5921
5922 static const struct cpumask *cpu_allnodes_mask(int cpu) 5922 static const struct cpumask *cpu_allnodes_mask(int cpu)
5923 { 5923 {
5924 return cpu_possible_mask; 5924 return cpu_possible_mask;
5925 } 5925 }
5926 #endif /* CONFIG_NUMA */ 5926 #endif /* CONFIG_NUMA */
5927 5927
5928 static const struct cpumask *cpu_cpu_mask(int cpu) 5928 static const struct cpumask *cpu_cpu_mask(int cpu)
5929 { 5929 {
5930 return cpumask_of_node(cpu_to_node(cpu)); 5930 return cpumask_of_node(cpu_to_node(cpu));
5931 } 5931 }
5932 5932
5933 int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 5933 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5934 5934
5935 struct sd_data { 5935 struct sd_data {
5936 struct sched_domain **__percpu sd; 5936 struct sched_domain **__percpu sd;
5937 struct sched_group **__percpu sg; 5937 struct sched_group **__percpu sg;
5938 struct sched_group_power **__percpu sgp; 5938 struct sched_group_power **__percpu sgp;
5939 }; 5939 };
5940 5940
5941 struct s_data { 5941 struct s_data {
5942 struct sched_domain ** __percpu sd; 5942 struct sched_domain ** __percpu sd;
5943 struct root_domain *rd; 5943 struct root_domain *rd;
5944 }; 5944 };
5945 5945
5946 enum s_alloc { 5946 enum s_alloc {
5947 sa_rootdomain, 5947 sa_rootdomain,
5948 sa_sd, 5948 sa_sd,
5949 sa_sd_storage, 5949 sa_sd_storage,
5950 sa_none, 5950 sa_none,
5951 }; 5951 };
5952 5952
5953 struct sched_domain_topology_level; 5953 struct sched_domain_topology_level;
5954 5954
5955 typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); 5955 typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
5956 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); 5956 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
5957 5957
5958 #define SDTL_OVERLAP 0x01 5958 #define SDTL_OVERLAP 0x01
5959 5959
5960 struct sched_domain_topology_level { 5960 struct sched_domain_topology_level {
5961 sched_domain_init_f init; 5961 sched_domain_init_f init;
5962 sched_domain_mask_f mask; 5962 sched_domain_mask_f mask;
5963 int flags; 5963 int flags;
5964 struct sd_data data; 5964 struct sd_data data;
5965 }; 5965 };
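A topology is described as a NULL-terminated array of these levels, innermost first. The default table lives elsewhere in this file; a rough sketch of its shape (the entry names and config guards here are recalled from memory and may not match this kernel exactly):

static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ sd_init_SIBLING, cpu_smt_mask, },	/* hyperthread siblings */
#endif
#ifdef CONFIG_SCHED_MC
	{ sd_init_MC, cpu_coregroup_mask, },	/* cores sharing a cache */
#endif
	{ sd_init_CPU, cpu_cpu_mask, },		/* all cpus of one node */
#ifdef CONFIG_NUMA
	{ sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
	{ sd_init_ALLNODES, cpu_allnodes_mask, },
#endif
	{ NULL, },
};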
5966 5966
5967 static int 5967 static int
5968 build_overlap_sched_groups(struct sched_domain *sd, int cpu) 5968 build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5969 { 5969 {
5970 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; 5970 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
5971 const struct cpumask *span = sched_domain_span(sd); 5971 const struct cpumask *span = sched_domain_span(sd);
5972 struct cpumask *covered = sched_domains_tmpmask; 5972 struct cpumask *covered = sched_domains_tmpmask;
5973 struct sd_data *sdd = sd->private; 5973 struct sd_data *sdd = sd->private;
5974 struct sched_domain *child; 5974 struct sched_domain *child;
5975 int i; 5975 int i;
5976 5976
5977 cpumask_clear(covered); 5977 cpumask_clear(covered);
5978 5978
5979 for_each_cpu(i, span) { 5979 for_each_cpu(i, span) {
5980 struct cpumask *sg_span; 5980 struct cpumask *sg_span;
5981 5981
5982 if (cpumask_test_cpu(i, covered)) 5982 if (cpumask_test_cpu(i, covered))
5983 continue; 5983 continue;
5984 5984
5985 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 5985 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
5986 GFP_KERNEL, cpu_to_node(cpu)); 5986 GFP_KERNEL, cpu_to_node(cpu));
5987 5987
5988 if (!sg) 5988 if (!sg)
5989 goto fail; 5989 goto fail;
5990 5990
5991 sg_span = sched_group_cpus(sg); 5991 sg_span = sched_group_cpus(sg);
5992 5992
5993 child = *per_cpu_ptr(sdd->sd, i); 5993 child = *per_cpu_ptr(sdd->sd, i);
5994 if (child->child) { 5994 if (child->child) {
5995 child = child->child; 5995 child = child->child;
5996 cpumask_copy(sg_span, sched_domain_span(child)); 5996 cpumask_copy(sg_span, sched_domain_span(child));
5997 } else 5997 } else
5998 cpumask_set_cpu(i, sg_span); 5998 cpumask_set_cpu(i, sg_span);
5999 5999
6000 cpumask_or(covered, covered, sg_span); 6000 cpumask_or(covered, covered, sg_span);
6001 6001
6002 sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); 6002 sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
6003 atomic_inc(&sg->sgp->ref); 6003 atomic_inc(&sg->sgp->ref);
6004 6004
6005 if (cpumask_test_cpu(cpu, sg_span)) 6005 if (cpumask_test_cpu(cpu, sg_span))
6006 groups = sg; 6006 groups = sg;
6007 6007
6008 if (!first) 6008 if (!first)
6009 first = sg; 6009 first = sg;
6010 if (last) 6010 if (last)
6011 last->next = sg; 6011 last->next = sg;
6012 last = sg; 6012 last = sg;
6013 last->next = first; 6013 last->next = first;
6014 } 6014 }
6015 sd->groups = groups; 6015 sd->groups = groups;
6016 6016
6017 return 0; 6017 return 0;
6018 6018
6019 fail: 6019 fail:
6020 free_sched_groups(first, 0); 6020 free_sched_groups(first, 0);
6021 6021
6022 return -ENOMEM; 6022 return -ENOMEM;
6023 } 6023 }
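As a side note on the data layout: build_overlap_sched_groups() above allocates each struct sched_group with cpumask_size() extra bytes, so the group's cpumask lives in trailing storage of the same allocation and sched_group_cpus() simply points into it. A minimal userspace sketch of that trailing-storage idiom follows; struct node and MASK_BYTES are invented stand-ins, not kernel names.

#include <stdlib.h>

#define MASK_BYTES 16			/* stand-in for cpumask_size() */

struct node {
	struct node *next;
	unsigned long mask[];		/* mask storage follows the struct */
};

static unsigned long *node_mask(struct node *n)
{
	return n->mask;			/* analogue of sched_group_cpus() */
}

int main(void)
{
	/* one allocation covers the struct plus its variable-sized mask */
	struct node *n = calloc(1, sizeof(*n) + MASK_BYTES);

	if (!n)
		return 1;

	node_mask(n)[0] |= 1UL;		/* set a bit in the trailing storage */
	free(n);
	return 0;
}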
6024 6024
6025 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) 6025 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
6026 { 6026 {
6027 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); 6027 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6028 struct sched_domain *child = sd->child; 6028 struct sched_domain *child = sd->child;
6029 6029
6030 if (child) 6030 if (child)
6031 cpu = cpumask_first(sched_domain_span(child)); 6031 cpu = cpumask_first(sched_domain_span(child));
6032 6032
6033 if (sg) { 6033 if (sg) {
6034 *sg = *per_cpu_ptr(sdd->sg, cpu); 6034 *sg = *per_cpu_ptr(sdd->sg, cpu);
6035 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); 6035 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
6036 atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ 6036 atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
6037 } 6037 }
6038 6038
6039 return cpu; 6039 return cpu;
6040 } 6040 }
6041 6041
6042 /* 6042 /*
6043 * build_sched_groups will build a circular linked list of the groups 6043 * build_sched_groups will build a circular linked list of the groups
6044 * covered by the given span, and will set each group's ->cpumask correctly, 6044 * covered by the given span, and will set each group's ->cpumask correctly,
6045 * and ->cpu_power to 0. 6045 * and ->cpu_power to 0.
6046 * 6046 *
6047 * Assumes the sched_domain tree is fully constructed 6047 * Assumes the sched_domain tree is fully constructed
6048 */ 6048 */
6049 static int 6049 static int
6050 build_sched_groups(struct sched_domain *sd, int cpu) 6050 build_sched_groups(struct sched_domain *sd, int cpu)
6051 { 6051 {
6052 struct sched_group *first = NULL, *last = NULL; 6052 struct sched_group *first = NULL, *last = NULL;
6053 struct sd_data *sdd = sd->private; 6053 struct sd_data *sdd = sd->private;
6054 const struct cpumask *span = sched_domain_span(sd); 6054 const struct cpumask *span = sched_domain_span(sd);
6055 struct cpumask *covered; 6055 struct cpumask *covered;
6056 int i; 6056 int i;
6057 6057
6058 get_group(cpu, sdd, &sd->groups); 6058 get_group(cpu, sdd, &sd->groups);
6059 atomic_inc(&sd->groups->ref); 6059 atomic_inc(&sd->groups->ref);
6060 6060
6061 if (cpu != cpumask_first(sched_domain_span(sd))) 6061 if (cpu != cpumask_first(sched_domain_span(sd)))
6062 return 0; 6062 return 0;
6063 6063
6064 lockdep_assert_held(&sched_domains_mutex); 6064 lockdep_assert_held(&sched_domains_mutex);
6065 covered = sched_domains_tmpmask; 6065 covered = sched_domains_tmpmask;
6066 6066
6067 cpumask_clear(covered); 6067 cpumask_clear(covered);
6068 6068
6069 for_each_cpu(i, span) { 6069 for_each_cpu(i, span) {
6070 struct sched_group *sg; 6070 struct sched_group *sg;
6071 int group = get_group(i, sdd, &sg); 6071 int group = get_group(i, sdd, &sg);
6072 int j; 6072 int j;
6073 6073
6074 if (cpumask_test_cpu(i, covered)) 6074 if (cpumask_test_cpu(i, covered))
6075 continue; 6075 continue;
6076 6076
6077 cpumask_clear(sched_group_cpus(sg)); 6077 cpumask_clear(sched_group_cpus(sg));
6078 sg->sgp->power = 0; 6078 sg->sgp->power = 0;
6079 6079
6080 for_each_cpu(j, span) { 6080 for_each_cpu(j, span) {
6081 if (get_group(j, sdd, NULL) != group) 6081 if (get_group(j, sdd, NULL) != group)
6082 continue; 6082 continue;
6083 6083
6084 cpumask_set_cpu(j, covered); 6084 cpumask_set_cpu(j, covered);
6085 cpumask_set_cpu(j, sched_group_cpus(sg)); 6085 cpumask_set_cpu(j, sched_group_cpus(sg));
6086 } 6086 }
6087 6087
6088 if (!first) 6088 if (!first)
6089 first = sg; 6089 first = sg;
6090 if (last) 6090 if (last)
6091 last->next = sg; 6091 last->next = sg;
6092 last = sg; 6092 last = sg;
6093 } 6093 }
6094 last->next = first; 6094 last->next = first;
6095 6095
6096 return 0; 6096 return 0;
6097 } 6097 }
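As the comment before build_sched_groups() notes, the groups end up on a circular singly linked list (last->next = first). Below is a small standalone sketch of walking such a ring with the do/while idiom that init_sched_groups_power() uses just after this; struct ring_node is invented for illustration and is not a kernel type.

#include <stdio.h>

struct ring_node {
	int id;
	struct ring_node *next;
};

static void walk_ring(struct ring_node *head)
{
	struct ring_node *n = head;

	if (!head)
		return;

	do {				/* visits every node exactly once */
		printf("group %d\n", n->id);
		n = n->next;
	} while (n != head);		/* stop once we are back at the start */
}

int main(void)
{
	struct ring_node c = { 2, NULL }, b = { 1, &c }, a = { 0, &b };

	c.next = &a;			/* close the ring: last->next = first */
	walk_ring(&a);
	return 0;
}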
6098 6098
6099 /* 6099 /*
6100 * Initialize sched groups cpu_power. 6100 * Initialize sched groups cpu_power.
6101 * 6101 *
6102 * cpu_power indicates the capacity of sched group, which is used while 6102 * cpu_power indicates the capacity of sched group, which is used while
6103 * distributing the load between different sched groups in a sched domain. 6103 * distributing the load between different sched groups in a sched domain.
6104 * Typically cpu_power for all the groups in a sched domain will be the same 6104 * Typically cpu_power for all the groups in a sched domain will be the same
6105 * unless there are asymmetries in the topology. If there are asymmetries, a 6105 * unless there are asymmetries in the topology. If there are asymmetries, a
6106 * group with more cpu_power will pick up more load than a group with 6106 * group with more cpu_power will pick up more load than a group with
6107 * less cpu_power. 6107 * less cpu_power.
6108 */ 6108 */
6109 static void init_sched_groups_power(int cpu, struct sched_domain *sd) 6109 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6110 { 6110 {
6111 struct sched_group *sg = sd->groups; 6111 struct sched_group *sg = sd->groups;
6112 6112
6113 WARN_ON(!sd || !sg); 6113 WARN_ON(!sd || !sg);
6114 6114
6115 do { 6115 do {
6116 sg->group_weight = cpumask_weight(sched_group_cpus(sg)); 6116 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
6117 sg = sg->next; 6117 sg = sg->next;
6118 } while (sg != sd->groups); 6118 } while (sg != sd->groups);
6119 6119
6120 if (cpu != group_first_cpu(sg)) 6120 if (cpu != group_first_cpu(sg))
6121 return; 6121 return;
6122 6122
6123 update_group_power(sd, cpu); 6123 update_group_power(sd, cpu);
6124 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); 6124 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
6125 } 6125 }
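The comment above says load is distributed between groups in proportion to their cpu_power. Here is a tiny worked example of that proportionality only, not of the real load-balancer arithmetic: with powers 1024 and 2048, the second group should carry roughly two thirds of the load.

#include <stdio.h>

int main(void)
{
	unsigned long power[] = { 1024, 2048 };
	unsigned long total = power[0] + power[1];	/* 3072 */
	unsigned long load = 3000;			/* arbitrary load units */
	int i;

	for (i = 0; i < 2; i++)				/* prints 1000, then 2000 */
		printf("group %d picks up ~%lu of %lu\n",
		       i, load * power[i] / total, load);
	return 0;
}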
6126 6126
6127 int __weak arch_sd_sibling_asym_packing(void) 6127 int __weak arch_sd_sibling_asym_packing(void)
6128 { 6128 {
6129 return 0*SD_ASYM_PACKING; 6129 return 0*SD_ASYM_PACKING;
6130 } 6130 }
6131 6131
6132 /* 6132 /*
6133 * Initializers for schedule domains 6133 * Initializers for schedule domains
6134 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 6134 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
6135 */ 6135 */
6136 6136
6137 #ifdef CONFIG_SCHED_DEBUG 6137 #ifdef CONFIG_SCHED_DEBUG
6138 # define SD_INIT_NAME(sd, type) sd->name = #type 6138 # define SD_INIT_NAME(sd, type) sd->name = #type
6139 #else 6139 #else
6140 # define SD_INIT_NAME(sd, type) do { } while (0) 6140 # define SD_INIT_NAME(sd, type) do { } while (0)
6141 #endif 6141 #endif
6142 6142
6143 #define SD_INIT_FUNC(type) \ 6143 #define SD_INIT_FUNC(type) \
6144 static noinline struct sched_domain * \ 6144 static noinline struct sched_domain * \
6145 sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ 6145 sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
6146 { \ 6146 { \
6147 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ 6147 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
6148 *sd = SD_##type##_INIT; \ 6148 *sd = SD_##type##_INIT; \
6149 SD_INIT_NAME(sd, type); \ 6149 SD_INIT_NAME(sd, type); \
6150 sd->private = &tl->data; \ 6150 sd->private = &tl->data; \
6151 return sd; \ 6151 return sd; \
6152 } 6152 }
6153 6153
6154 SD_INIT_FUNC(CPU) 6154 SD_INIT_FUNC(CPU)
6155 #ifdef CONFIG_NUMA 6155 #ifdef CONFIG_NUMA
6156 SD_INIT_FUNC(ALLNODES) 6156 SD_INIT_FUNC(ALLNODES)
6157 SD_INIT_FUNC(NODE) 6157 SD_INIT_FUNC(NODE)
6158 #endif 6158 #endif
6159 #ifdef CONFIG_SCHED_SMT 6159 #ifdef CONFIG_SCHED_SMT
6160 SD_INIT_FUNC(SIBLING) 6160 SD_INIT_FUNC(SIBLING)
6161 #endif 6161 #endif
6162 #ifdef CONFIG_SCHED_MC 6162 #ifdef CONFIG_SCHED_MC
6163 SD_INIT_FUNC(MC) 6163 SD_INIT_FUNC(MC)
6164 #endif 6164 #endif
6165 #ifdef CONFIG_SCHED_BOOK 6165 #ifdef CONFIG_SCHED_BOOK
6166 SD_INIT_FUNC(BOOK) 6166 SD_INIT_FUNC(BOOK)
6167 #endif 6167 #endif
6168 6168
6169 static int default_relax_domain_level = -1; 6169 static int default_relax_domain_level = -1;
6170 int sched_domain_level_max; 6170 int sched_domain_level_max;
6171 6171
6172 static int __init setup_relax_domain_level(char *str) 6172 static int __init setup_relax_domain_level(char *str)
6173 { 6173 {
6174 unsigned long val; 6174 unsigned long val;
6175 6175
6176 val = simple_strtoul(str, NULL, 0); 6176 val = simple_strtoul(str, NULL, 0);
6177 if (val < sched_domain_level_max) 6177 if (val < sched_domain_level_max)
6178 default_relax_domain_level = val; 6178 default_relax_domain_level = val;
6179 6179
6180 return 1; 6180 return 1;
6181 } 6181 }
6182 __setup("relax_domain_level=", setup_relax_domain_level); 6182 __setup("relax_domain_level=", setup_relax_domain_level);
6183 6183
6184 static void set_domain_attribute(struct sched_domain *sd, 6184 static void set_domain_attribute(struct sched_domain *sd,
6185 struct sched_domain_attr *attr) 6185 struct sched_domain_attr *attr)
6186 { 6186 {
6187 int request; 6187 int request;
6188 6188
6189 if (!attr || attr->relax_domain_level < 0) { 6189 if (!attr || attr->relax_domain_level < 0) {
6190 if (default_relax_domain_level < 0) 6190 if (default_relax_domain_level < 0)
6191 return; 6191 return;
6192 else 6192 else
6193 request = default_relax_domain_level; 6193 request = default_relax_domain_level;
6194 } else 6194 } else
6195 request = attr->relax_domain_level; 6195 request = attr->relax_domain_level;
6196 if (request < sd->level) { 6196 if (request < sd->level) {
6197 /* turn off idle balance on this domain */ 6197 /* turn off idle balance on this domain */
6198 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 6198 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
6199 } else { 6199 } else {
6200 /* turn on idle balance on this domain */ 6200 /* turn on idle balance on this domain */
6201 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 6201 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
6202 } 6202 }
6203 } 6203 }
6204 6204
6205 static void __sdt_free(const struct cpumask *cpu_map); 6205 static void __sdt_free(const struct cpumask *cpu_map);
6206 static int __sdt_alloc(const struct cpumask *cpu_map); 6206 static int __sdt_alloc(const struct cpumask *cpu_map);
6207 6207
6208 static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 6208 static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
6209 const struct cpumask *cpu_map) 6209 const struct cpumask *cpu_map)
6210 { 6210 {
6211 switch (what) { 6211 switch (what) {
6212 case sa_rootdomain: 6212 case sa_rootdomain:
6213 if (!atomic_read(&d->rd->refcount)) 6213 if (!atomic_read(&d->rd->refcount))
6214 free_rootdomain(&d->rd->rcu); /* fall through */ 6214 free_rootdomain(&d->rd->rcu); /* fall through */
6215 case sa_sd: 6215 case sa_sd:
6216 free_percpu(d->sd); /* fall through */ 6216 free_percpu(d->sd); /* fall through */
6217 case sa_sd_storage: 6217 case sa_sd_storage:
6218 __sdt_free(cpu_map); /* fall through */ 6218 __sdt_free(cpu_map); /* fall through */
6219 case sa_none: 6219 case sa_none:
6220 break; 6220 break;
6221 } 6221 }
6222 } 6222 }
6223 6223
6224 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 6224 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
6225 const struct cpumask *cpu_map) 6225 const struct cpumask *cpu_map)
6226 { 6226 {
6227 memset(d, 0, sizeof(*d)); 6227 memset(d, 0, sizeof(*d));
6228 6228
6229 if (__sdt_alloc(cpu_map)) 6229 if (__sdt_alloc(cpu_map))
6230 return sa_sd_storage; 6230 return sa_sd_storage;
6231 d->sd = alloc_percpu(struct sched_domain *); 6231 d->sd = alloc_percpu(struct sched_domain *);
6232 if (!d->sd) 6232 if (!d->sd)
6233 return sa_sd_storage; 6233 return sa_sd_storage;
6234 d->rd = alloc_rootdomain(); 6234 d->rd = alloc_rootdomain();
6235 if (!d->rd) 6235 if (!d->rd)
6236 return sa_sd; 6236 return sa_sd;
6237 return sa_rootdomain; 6237 return sa_rootdomain;
6238 } 6238 }
6239 6239
6240 /* 6240 /*
6241 * NULL the sd_data elements we've used to build the sched_domain and 6241 * NULL the sd_data elements we've used to build the sched_domain and
6242 * sched_group structure so that the subsequent __free_domain_allocs() 6242 * sched_group structure so that the subsequent __free_domain_allocs()
6243 * will not free the data we're using. 6243 * will not free the data we're using.
6244 */ 6244 */
6245 static void claim_allocations(int cpu, struct sched_domain *sd) 6245 static void claim_allocations(int cpu, struct sched_domain *sd)
6246 { 6246 {
6247 struct sd_data *sdd = sd->private; 6247 struct sd_data *sdd = sd->private;
6248 6248
6249 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); 6249 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
6250 *per_cpu_ptr(sdd->sd, cpu) = NULL; 6250 *per_cpu_ptr(sdd->sd, cpu) = NULL;
6251 6251
6252 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) 6252 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
6253 *per_cpu_ptr(sdd->sg, cpu) = NULL; 6253 *per_cpu_ptr(sdd->sg, cpu) = NULL;
6254 6254
6255 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) 6255 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
6256 *per_cpu_ptr(sdd->sgp, cpu) = NULL; 6256 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
6257 } 6257 }
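claim_allocations() realizes the pattern its comment describes: ownership is claimed by NULLing the per-cpu slot, and the later blanket teardown frees only what is still in the slots, relying on kfree(NULL) being safe (as the partition_sched_domains() code notes further down). A standalone sketch of the same claim-by-NULL pattern, with invented names:

#include <stdlib.h>

#define NSLOTS 4

static void *slots[NSLOTS];

static void *claim_slot(int i)
{
	void *p = slots[i];

	slots[i] = NULL;		/* the cleanup pass will skip this one */
	return p;
}

static void cleanup_all(void)
{
	int i;

	for (i = 0; i < NSLOTS; i++)
		free(slots[i]);		/* free(NULL) is a no-op */
}

int main(void)
{
	void *kept;
	int i;

	for (i = 0; i < NSLOTS; i++)
		slots[i] = malloc(32);

	kept = claim_slot(1);		/* ownership moves to the caller */
	cleanup_all();			/* frees slots 0, 2 and 3 only */
	free(kept);
	return 0;
}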
6258 6258
6259 #ifdef CONFIG_SCHED_SMT 6259 #ifdef CONFIG_SCHED_SMT
6260 static const struct cpumask *cpu_smt_mask(int cpu) 6260 static const struct cpumask *cpu_smt_mask(int cpu)
6261 { 6261 {
6262 return topology_thread_cpumask(cpu); 6262 return topology_thread_cpumask(cpu);
6263 } 6263 }
6264 #endif 6264 #endif
6265 6265
6266 /* 6266 /*
6267 * Topology list, bottom-up. 6267 * Topology list, bottom-up.
6268 */ 6268 */
6269 static struct sched_domain_topology_level default_topology[] = { 6269 static struct sched_domain_topology_level default_topology[] = {
6270 #ifdef CONFIG_SCHED_SMT 6270 #ifdef CONFIG_SCHED_SMT
6271 { sd_init_SIBLING, cpu_smt_mask, }, 6271 { sd_init_SIBLING, cpu_smt_mask, },
6272 #endif 6272 #endif
6273 #ifdef CONFIG_SCHED_MC 6273 #ifdef CONFIG_SCHED_MC
6274 { sd_init_MC, cpu_coregroup_mask, }, 6274 { sd_init_MC, cpu_coregroup_mask, },
6275 #endif 6275 #endif
6276 #ifdef CONFIG_SCHED_BOOK 6276 #ifdef CONFIG_SCHED_BOOK
6277 { sd_init_BOOK, cpu_book_mask, }, 6277 { sd_init_BOOK, cpu_book_mask, },
6278 #endif 6278 #endif
6279 { sd_init_CPU, cpu_cpu_mask, }, 6279 { sd_init_CPU, cpu_cpu_mask, },
6280 #ifdef CONFIG_NUMA 6280 #ifdef CONFIG_NUMA
6281 { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, 6281 { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
6282 { sd_init_ALLNODES, cpu_allnodes_mask, }, 6282 { sd_init_ALLNODES, cpu_allnodes_mask, },
6283 #endif 6283 #endif
6284 { NULL, }, 6284 { NULL, },
6285 }; 6285 };
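The table above is ordered bottom-up and terminated by an entry whose ->init is NULL, which is what the for (tl = sched_domain_topology; tl->init; tl++) loops below key off. Purely as an illustration, and not something this patch does, a hypothetical alternative table reusing the initializers and mask helpers defined earlier in this file could look like:

static struct sched_domain_topology_level small_box_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ sd_init_SIBLING, cpu_smt_mask, },
#endif
	{ sd_init_CPU, cpu_cpu_mask, },
	{ NULL, },
};

/* hypothetically installed early, e.g.: sched_domain_topology = small_box_topology; */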
6286 6286
6287 static struct sched_domain_topology_level *sched_domain_topology = default_topology; 6287 static struct sched_domain_topology_level *sched_domain_topology = default_topology;
6288 6288
6289 static int __sdt_alloc(const struct cpumask *cpu_map) 6289 static int __sdt_alloc(const struct cpumask *cpu_map)
6290 { 6290 {
6291 struct sched_domain_topology_level *tl; 6291 struct sched_domain_topology_level *tl;
6292 int j; 6292 int j;
6293 6293
6294 for (tl = sched_domain_topology; tl->init; tl++) { 6294 for (tl = sched_domain_topology; tl->init; tl++) {
6295 struct sd_data *sdd = &tl->data; 6295 struct sd_data *sdd = &tl->data;
6296 6296
6297 sdd->sd = alloc_percpu(struct sched_domain *); 6297 sdd->sd = alloc_percpu(struct sched_domain *);
6298 if (!sdd->sd) 6298 if (!sdd->sd)
6299 return -ENOMEM; 6299 return -ENOMEM;
6300 6300
6301 sdd->sg = alloc_percpu(struct sched_group *); 6301 sdd->sg = alloc_percpu(struct sched_group *);
6302 if (!sdd->sg) 6302 if (!sdd->sg)
6303 return -ENOMEM; 6303 return -ENOMEM;
6304 6304
6305 sdd->sgp = alloc_percpu(struct sched_group_power *); 6305 sdd->sgp = alloc_percpu(struct sched_group_power *);
6306 if (!sdd->sgp) 6306 if (!sdd->sgp)
6307 return -ENOMEM; 6307 return -ENOMEM;
6308 6308
6309 for_each_cpu(j, cpu_map) { 6309 for_each_cpu(j, cpu_map) {
6310 struct sched_domain *sd; 6310 struct sched_domain *sd;
6311 struct sched_group *sg; 6311 struct sched_group *sg;
6312 struct sched_group_power *sgp; 6312 struct sched_group_power *sgp;
6313 6313
6314 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), 6314 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
6315 GFP_KERNEL, cpu_to_node(j)); 6315 GFP_KERNEL, cpu_to_node(j));
6316 if (!sd) 6316 if (!sd)
6317 return -ENOMEM; 6317 return -ENOMEM;
6318 6318
6319 *per_cpu_ptr(sdd->sd, j) = sd; 6319 *per_cpu_ptr(sdd->sd, j) = sd;
6320 6320
6321 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 6321 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6322 GFP_KERNEL, cpu_to_node(j)); 6322 GFP_KERNEL, cpu_to_node(j));
6323 if (!sg) 6323 if (!sg)
6324 return -ENOMEM; 6324 return -ENOMEM;
6325 6325
6326 *per_cpu_ptr(sdd->sg, j) = sg; 6326 *per_cpu_ptr(sdd->sg, j) = sg;
6327 6327
6328 sgp = kzalloc_node(sizeof(struct sched_group_power), 6328 sgp = kzalloc_node(sizeof(struct sched_group_power),
6329 GFP_KERNEL, cpu_to_node(j)); 6329 GFP_KERNEL, cpu_to_node(j));
6330 if (!sgp) 6330 if (!sgp)
6331 return -ENOMEM; 6331 return -ENOMEM;
6332 6332
6333 *per_cpu_ptr(sdd->sgp, j) = sgp; 6333 *per_cpu_ptr(sdd->sgp, j) = sgp;
6334 } 6334 }
6335 } 6335 }
6336 6336
6337 return 0; 6337 return 0;
6338 } 6338 }
6339 6339
6340 static void __sdt_free(const struct cpumask *cpu_map) 6340 static void __sdt_free(const struct cpumask *cpu_map)
6341 { 6341 {
6342 struct sched_domain_topology_level *tl; 6342 struct sched_domain_topology_level *tl;
6343 int j; 6343 int j;
6344 6344
6345 for (tl = sched_domain_topology; tl->init; tl++) { 6345 for (tl = sched_domain_topology; tl->init; tl++) {
6346 struct sd_data *sdd = &tl->data; 6346 struct sd_data *sdd = &tl->data;
6347 6347
6348 for_each_cpu(j, cpu_map) { 6348 for_each_cpu(j, cpu_map) {
6349 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); 6349 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
6350 if (sd && (sd->flags & SD_OVERLAP)) 6350 if (sd && (sd->flags & SD_OVERLAP))
6351 free_sched_groups(sd->groups, 0); 6351 free_sched_groups(sd->groups, 0);
6352 kfree(*per_cpu_ptr(sdd->sd, j)); 6352 kfree(*per_cpu_ptr(sdd->sd, j));
6353 kfree(*per_cpu_ptr(sdd->sg, j)); 6353 kfree(*per_cpu_ptr(sdd->sg, j));
6354 kfree(*per_cpu_ptr(sdd->sgp, j)); 6354 kfree(*per_cpu_ptr(sdd->sgp, j));
6355 } 6355 }
6356 free_percpu(sdd->sd); 6356 free_percpu(sdd->sd);
6357 free_percpu(sdd->sg); 6357 free_percpu(sdd->sg);
6358 free_percpu(sdd->sgp); 6358 free_percpu(sdd->sgp);
6359 } 6359 }
6360 } 6360 }
6361 6361
6362 struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, 6362 struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6363 struct s_data *d, const struct cpumask *cpu_map, 6363 struct s_data *d, const struct cpumask *cpu_map,
6364 struct sched_domain_attr *attr, struct sched_domain *child, 6364 struct sched_domain_attr *attr, struct sched_domain *child,
6365 int cpu) 6365 int cpu)
6366 { 6366 {
6367 struct sched_domain *sd = tl->init(tl, cpu); 6367 struct sched_domain *sd = tl->init(tl, cpu);
6368 if (!sd) 6368 if (!sd)
6369 return child; 6369 return child;
6370 6370
6371 set_domain_attribute(sd, attr); 6371 set_domain_attribute(sd, attr);
6372 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); 6372 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6373 if (child) { 6373 if (child) {
6374 sd->level = child->level + 1; 6374 sd->level = child->level + 1;
6375 sched_domain_level_max = max(sched_domain_level_max, sd->level); 6375 sched_domain_level_max = max(sched_domain_level_max, sd->level);
6376 child->parent = sd; 6376 child->parent = sd;
6377 } 6377 }
6378 sd->child = child; 6378 sd->child = child;
6379 6379
6380 return sd; 6380 return sd;
6381 } 6381 }
6382 6382
6383 /* 6383 /*
6384 * Build sched domains for a given set of cpus and attach the sched domains 6384 * Build sched domains for a given set of cpus and attach the sched domains
6385 * to the individual cpus 6385 * to the individual cpus
6386 */ 6386 */
6387 static int build_sched_domains(const struct cpumask *cpu_map, 6387 static int build_sched_domains(const struct cpumask *cpu_map,
6388 struct sched_domain_attr *attr) 6388 struct sched_domain_attr *attr)
6389 { 6389 {
6390 enum s_alloc alloc_state = sa_none; 6390 enum s_alloc alloc_state = sa_none;
6391 struct sched_domain *sd; 6391 struct sched_domain *sd;
6392 struct s_data d; 6392 struct s_data d;
6393 int i, ret = -ENOMEM; 6393 int i, ret = -ENOMEM;
6394 6394
6395 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 6395 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
6396 if (alloc_state != sa_rootdomain) 6396 if (alloc_state != sa_rootdomain)
6397 goto error; 6397 goto error;
6398 6398
6399 /* Set up domains for cpus specified by the cpu_map. */ 6399 /* Set up domains for cpus specified by the cpu_map. */
6400 for_each_cpu(i, cpu_map) { 6400 for_each_cpu(i, cpu_map) {
6401 struct sched_domain_topology_level *tl; 6401 struct sched_domain_topology_level *tl;
6402 6402
6403 sd = NULL; 6403 sd = NULL;
6404 for (tl = sched_domain_topology; tl->init; tl++) { 6404 for (tl = sched_domain_topology; tl->init; tl++) {
6405 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); 6405 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
6406 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) 6406 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
6407 sd->flags |= SD_OVERLAP; 6407 sd->flags |= SD_OVERLAP;
6408 if (cpumask_equal(cpu_map, sched_domain_span(sd))) 6408 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
6409 break; 6409 break;
6410 } 6410 }
6411 6411
6412 while (sd->child) 6412 while (sd->child)
6413 sd = sd->child; 6413 sd = sd->child;
6414 6414
6415 *per_cpu_ptr(d.sd, i) = sd; 6415 *per_cpu_ptr(d.sd, i) = sd;
6416 } 6416 }
6417 6417
6418 /* Build the groups for the domains */ 6418 /* Build the groups for the domains */
6419 for_each_cpu(i, cpu_map) { 6419 for_each_cpu(i, cpu_map) {
6420 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 6420 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6421 sd->span_weight = cpumask_weight(sched_domain_span(sd)); 6421 sd->span_weight = cpumask_weight(sched_domain_span(sd));
6422 if (sd->flags & SD_OVERLAP) { 6422 if (sd->flags & SD_OVERLAP) {
6423 if (build_overlap_sched_groups(sd, i)) 6423 if (build_overlap_sched_groups(sd, i))
6424 goto error; 6424 goto error;
6425 } else { 6425 } else {
6426 if (build_sched_groups(sd, i)) 6426 if (build_sched_groups(sd, i))
6427 goto error; 6427 goto error;
6428 } 6428 }
6429 } 6429 }
6430 } 6430 }
6431 6431
6432 /* Calculate CPU power for physical packages and nodes */ 6432 /* Calculate CPU power for physical packages and nodes */
6433 for (i = nr_cpumask_bits-1; i >= 0; i--) { 6433 for (i = nr_cpumask_bits-1; i >= 0; i--) {
6434 if (!cpumask_test_cpu(i, cpu_map)) 6434 if (!cpumask_test_cpu(i, cpu_map))
6435 continue; 6435 continue;
6436 6436
6437 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 6437 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6438 claim_allocations(i, sd); 6438 claim_allocations(i, sd);
6439 init_sched_groups_power(i, sd); 6439 init_sched_groups_power(i, sd);
6440 } 6440 }
6441 } 6441 }
6442 6442
6443 /* Attach the domains */ 6443 /* Attach the domains */
6444 rcu_read_lock(); 6444 rcu_read_lock();
6445 for_each_cpu(i, cpu_map) { 6445 for_each_cpu(i, cpu_map) {
6446 sd = *per_cpu_ptr(d.sd, i); 6446 sd = *per_cpu_ptr(d.sd, i);
6447 cpu_attach_domain(sd, d.rd, i); 6447 cpu_attach_domain(sd, d.rd, i);
6448 } 6448 }
6449 rcu_read_unlock(); 6449 rcu_read_unlock();
6450 6450
6451 ret = 0; 6451 ret = 0;
6452 error: 6452 error:
6453 __free_domain_allocs(&d, alloc_state, cpu_map); 6453 __free_domain_allocs(&d, alloc_state, cpu_map);
6454 return ret; 6454 return ret;
6455 } 6455 }
6456 6456
6457 static cpumask_var_t *doms_cur; /* current sched domains */ 6457 static cpumask_var_t *doms_cur; /* current sched domains */
6458 static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 6458 static int ndoms_cur; /* number of sched domains in 'doms_cur' */
6459 static struct sched_domain_attr *dattr_cur; 6459 static struct sched_domain_attr *dattr_cur;
6460 /* attributes of custom domains in 'doms_cur' */ 6460 /* attributes of custom domains in 'doms_cur' */
6461 6461
6462 /* 6462 /*
6463 * Special case: If a kmalloc of a doms_cur partition (array of 6463 * Special case: If a kmalloc of a doms_cur partition (array of
6464 * cpumask) fails, then fall back to a single sched domain, 6464 * cpumask) fails, then fall back to a single sched domain,
6465 * as determined by the single cpumask fallback_doms. 6465 * as determined by the single cpumask fallback_doms.
6466 */ 6466 */
6467 static cpumask_var_t fallback_doms; 6467 static cpumask_var_t fallback_doms;
6468 6468
6469 /* 6469 /*
6470 * arch_update_cpu_topology lets virtualized architectures update the 6470 * arch_update_cpu_topology lets virtualized architectures update the
6471 * cpu core maps. It is supposed to return 1 if the topology changed 6471 * cpu core maps. It is supposed to return 1 if the topology changed
6472 * or 0 if it stayed the same. 6472 * or 0 if it stayed the same.
6473 */ 6473 */
6474 int __attribute__((weak)) arch_update_cpu_topology(void) 6474 int __attribute__((weak)) arch_update_cpu_topology(void)
6475 { 6475 {
6476 return 0; 6476 return 0;
6477 } 6477 }
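The weak definition above documents the contract: return 1 when the cpu core maps changed, 0 when they stayed the same. A hypothetical strong definition an architecture might provide is sketched below; topology_generation is invented purely to illustrate the change tracking, and real implementations keep such state in arch-specific ways.

/* hypothetical arch-side sketch, not part of this patch */
static unsigned long topology_generation;	/* bumped by (hypothetical) arch hotplug code */

int arch_update_cpu_topology(void)
{
	static unsigned long seen;

	if (topology_generation == seen)
		return 0;			/* topology stayed the same */

	seen = topology_generation;
	return 1;				/* tell callers to rebuild the domains */
}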
6478 6478
6479 cpumask_var_t *alloc_sched_domains(unsigned int ndoms) 6479 cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
6480 { 6480 {
6481 int i; 6481 int i;
6482 cpumask_var_t *doms; 6482 cpumask_var_t *doms;
6483 6483
6484 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); 6484 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
6485 if (!doms) 6485 if (!doms)
6486 return NULL; 6486 return NULL;
6487 for (i = 0; i < ndoms; i++) { 6487 for (i = 0; i < ndoms; i++) {
6488 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { 6488 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
6489 free_sched_domains(doms, i); 6489 free_sched_domains(doms, i);
6490 return NULL; 6490 return NULL;
6491 } 6491 }
6492 } 6492 }
6493 return doms; 6493 return doms;
6494 } 6494 }
6495 6495
6496 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) 6496 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
6497 { 6497 {
6498 unsigned int i; 6498 unsigned int i;
6499 for (i = 0; i < ndoms; i++) 6499 for (i = 0; i < ndoms; i++)
6500 free_cpumask_var(doms[i]); 6500 free_cpumask_var(doms[i]);
6501 kfree(doms); 6501 kfree(doms);
6502 } 6502 }
6503 6503
6504 /* 6504 /*
6505 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 6505 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6506 * For now this just excludes isolated cpus, but could be used to 6506 * For now this just excludes isolated cpus, but could be used to
6507 * exclude other special cases in the future. 6507 * exclude other special cases in the future.
6508 */ 6508 */
6509 static int init_sched_domains(const struct cpumask *cpu_map) 6509 static int init_sched_domains(const struct cpumask *cpu_map)
6510 { 6510 {
6511 int err; 6511 int err;
6512 6512
6513 arch_update_cpu_topology(); 6513 arch_update_cpu_topology();
6514 ndoms_cur = 1; 6514 ndoms_cur = 1;
6515 doms_cur = alloc_sched_domains(ndoms_cur); 6515 doms_cur = alloc_sched_domains(ndoms_cur);
6516 if (!doms_cur) 6516 if (!doms_cur)
6517 doms_cur = &fallback_doms; 6517 doms_cur = &fallback_doms;
6518 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 6518 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
6519 dattr_cur = NULL; 6519 dattr_cur = NULL;
6520 err = build_sched_domains(doms_cur[0], NULL); 6520 err = build_sched_domains(doms_cur[0], NULL);
6521 register_sched_domain_sysctl(); 6521 register_sched_domain_sysctl();
6522 6522
6523 return err; 6523 return err;
6524 } 6524 }
6525 6525
6526 /* 6526 /*
6527 * Detach sched domains from a group of cpus specified in cpu_map 6527 * Detach sched domains from a group of cpus specified in cpu_map
6528 * These cpus will now be attached to the NULL domain 6528 * These cpus will now be attached to the NULL domain
6529 */ 6529 */
6530 static void detach_destroy_domains(const struct cpumask *cpu_map) 6530 static void detach_destroy_domains(const struct cpumask *cpu_map)
6531 { 6531 {
6532 int i; 6532 int i;
6533 6533
6534 rcu_read_lock(); 6534 rcu_read_lock();
6535 for_each_cpu(i, cpu_map) 6535 for_each_cpu(i, cpu_map)
6536 cpu_attach_domain(NULL, &def_root_domain, i); 6536 cpu_attach_domain(NULL, &def_root_domain, i);
6537 rcu_read_unlock(); 6537 rcu_read_unlock();
6538 } 6538 }
6539 6539
6540 /* handle null as "default" */ 6540 /* handle null as "default" */
6541 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, 6541 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
6542 struct sched_domain_attr *new, int idx_new) 6542 struct sched_domain_attr *new, int idx_new)
6543 { 6543 {
6544 struct sched_domain_attr tmp; 6544 struct sched_domain_attr tmp;
6545 6545
6546 /* fast path */ 6546 /* fast path */
6547 if (!new && !cur) 6547 if (!new && !cur)
6548 return 1; 6548 return 1;
6549 6549
6550 tmp = SD_ATTR_INIT; 6550 tmp = SD_ATTR_INIT;
6551 return !memcmp(cur ? (cur + idx_cur) : &tmp, 6551 return !memcmp(cur ? (cur + idx_cur) : &tmp,
6552 new ? (new + idx_new) : &tmp, 6552 new ? (new + idx_new) : &tmp,
6553 sizeof(struct sched_domain_attr)); 6553 sizeof(struct sched_domain_attr));
6554 } 6554 }
6555 6555
6556 /* 6556 /*
6557 * Partition sched domains as specified by the 'ndoms_new' 6557 * Partition sched domains as specified by the 'ndoms_new'
6558 * cpumasks in the array doms_new[] of cpumasks. This compares 6558 * cpumasks in the array doms_new[] of cpumasks. This compares
6559 * doms_new[] to the current sched domain partitioning, doms_cur[]. 6559 * doms_new[] to the current sched domain partitioning, doms_cur[].
6560 * It destroys each deleted domain and builds each new domain. 6560 * It destroys each deleted domain and builds each new domain.
6561 * 6561 *
6562 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. 6562 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
6563 * The masks don't intersect (don't overlap). We should set up one 6563 * The masks don't intersect (don't overlap). We should set up one
6564 * sched domain for each mask. CPUs not in any of the cpumasks will 6564 * sched domain for each mask. CPUs not in any of the cpumasks will
6565 * not be load balanced. If the same cpumask appears both in the 6565 * not be load balanced. If the same cpumask appears both in the
6566 * current 'doms_cur' domains and in the new 'doms_new', we can leave 6566 * current 'doms_cur' domains and in the new 'doms_new', we can leave
6567 * it as it is. 6567 * it as it is.
6568 * 6568 *
6569 * The passed in 'doms_new' should be allocated using 6569 * The passed in 'doms_new' should be allocated using
6570 * alloc_sched_domains. This routine takes ownership of it and will 6570 * alloc_sched_domains. This routine takes ownership of it and will
6571 * free_sched_domains it when done with it. If the caller failed the 6571 * free_sched_domains it when done with it. If the caller failed the
6572 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, 6572 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
6573 * and partition_sched_domains() will fall back to the single partition 6573 * and partition_sched_domains() will fall back to the single partition
6574 * 'fallback_doms'; this also forces the domains to be rebuilt. 6574 * 'fallback_doms'; this also forces the domains to be rebuilt.
6575 * 6575 *
6576 * If doms_new == NULL it will be replaced with cpu_online_mask. 6576 * If doms_new == NULL it will be replaced with cpu_online_mask.
6577 * ndoms_new == 0 is a special case for destroying existing domains, 6577 * ndoms_new == 0 is a special case for destroying existing domains,
6578 * and it will not create the default domain. 6578 * and it will not create the default domain.
6579 * 6579 *
6580 * Call with hotplug lock held 6580 * Call with hotplug lock held
6581 */ 6581 */
6582 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 6582 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
6583 struct sched_domain_attr *dattr_new) 6583 struct sched_domain_attr *dattr_new)
6584 { 6584 {
6585 int i, j, n; 6585 int i, j, n;
6586 int new_topology; 6586 int new_topology;
6587 6587
6588 mutex_lock(&sched_domains_mutex); 6588 mutex_lock(&sched_domains_mutex);
6589 6589
6590 /* always unregister in case we don't destroy any domains */ 6590 /* always unregister in case we don't destroy any domains */
6591 unregister_sched_domain_sysctl(); 6591 unregister_sched_domain_sysctl();
6592 6592
6593 /* Let architecture update cpu core mappings. */ 6593 /* Let architecture update cpu core mappings. */
6594 new_topology = arch_update_cpu_topology(); 6594 new_topology = arch_update_cpu_topology();
6595 6595
6596 n = doms_new ? ndoms_new : 0; 6596 n = doms_new ? ndoms_new : 0;
6597 6597
6598 /* Destroy deleted domains */ 6598 /* Destroy deleted domains */
6599 for (i = 0; i < ndoms_cur; i++) { 6599 for (i = 0; i < ndoms_cur; i++) {
6600 for (j = 0; j < n && !new_topology; j++) { 6600 for (j = 0; j < n && !new_topology; j++) {
6601 if (cpumask_equal(doms_cur[i], doms_new[j]) 6601 if (cpumask_equal(doms_cur[i], doms_new[j])
6602 && dattrs_equal(dattr_cur, i, dattr_new, j)) 6602 && dattrs_equal(dattr_cur, i, dattr_new, j))
6603 goto match1; 6603 goto match1;
6604 } 6604 }
6605 /* no match - a current sched domain not in new doms_new[] */ 6605 /* no match - a current sched domain not in new doms_new[] */
6606 detach_destroy_domains(doms_cur[i]); 6606 detach_destroy_domains(doms_cur[i]);
6607 match1: 6607 match1:
6608 ; 6608 ;
6609 } 6609 }
6610 6610
6611 if (doms_new == NULL) { 6611 if (doms_new == NULL) {
6612 ndoms_cur = 0; 6612 ndoms_cur = 0;
6613 doms_new = &fallback_doms; 6613 doms_new = &fallback_doms;
6614 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); 6614 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
6615 WARN_ON_ONCE(dattr_new); 6615 WARN_ON_ONCE(dattr_new);
6616 } 6616 }
6617 6617
6618 /* Build new domains */ 6618 /* Build new domains */
6619 for (i = 0; i < ndoms_new; i++) { 6619 for (i = 0; i < ndoms_new; i++) {
6620 for (j = 0; j < ndoms_cur && !new_topology; j++) { 6620 for (j = 0; j < ndoms_cur && !new_topology; j++) {
6621 if (cpumask_equal(doms_new[i], doms_cur[j]) 6621 if (cpumask_equal(doms_new[i], doms_cur[j])
6622 && dattrs_equal(dattr_new, i, dattr_cur, j)) 6622 && dattrs_equal(dattr_new, i, dattr_cur, j))
6623 goto match2; 6623 goto match2;
6624 } 6624 }
6625 /* no match - add a new doms_new */ 6625 /* no match - add a new doms_new */
6626 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); 6626 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
6627 match2: 6627 match2:
6628 ; 6628 ;
6629 } 6629 }
6630 6630
6631 /* Remember the new sched domains */ 6631 /* Remember the new sched domains */
6632 if (doms_cur != &fallback_doms) 6632 if (doms_cur != &fallback_doms)
6633 free_sched_domains(doms_cur, ndoms_cur); 6633 free_sched_domains(doms_cur, ndoms_cur);
6634 kfree(dattr_cur); /* kfree(NULL) is safe */ 6634 kfree(dattr_cur); /* kfree(NULL) is safe */
6635 doms_cur = doms_new; 6635 doms_cur = doms_new;
6636 dattr_cur = dattr_new; 6636 dattr_cur = dattr_new;
6637 ndoms_cur = ndoms_new; 6637 ndoms_cur = ndoms_new;
6638 6638
6639 register_sched_domain_sysctl(); 6639 register_sched_domain_sysctl();
6640 6640
6641 mutex_unlock(&sched_domains_mutex); 6641 mutex_unlock(&sched_domains_mutex);
6642 } 6642 }
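A minimal caller sketch for the contract spelled out in the comment above partition_sched_domains(): the array comes from alloc_sched_domains(), ownership passes to partition_sched_domains(), and doms_new == NULL with ndoms_new == 1 is the documented fallback when allocation fails. The function name and the choice of mask are illustrative only.

static void example_repartition(void)
{
	cpumask_var_t *doms = alloc_sched_domains(1);

	if (doms)
		cpumask_copy(doms[0], cpu_active_mask);

	get_online_cpus();				/* "Call with hotplug lock held" */
	if (doms)
		partition_sched_domains(1, doms, NULL);	/* takes ownership of doms */
	else
		partition_sched_domains(1, NULL, NULL);	/* documented fallback path */
	put_online_cpus();
}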
6643 6643
6644 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 6644 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6645 static void reinit_sched_domains(void) 6645 static void reinit_sched_domains(void)
6646 { 6646 {
6647 get_online_cpus(); 6647 get_online_cpus();
6648 6648
6649 /* Destroy domains first to force the rebuild */ 6649 /* Destroy domains first to force the rebuild */
6650 partition_sched_domains(0, NULL, NULL); 6650 partition_sched_domains(0, NULL, NULL);
6651 6651
6652 rebuild_sched_domains(); 6652 rebuild_sched_domains();
6653 put_online_cpus(); 6653 put_online_cpus();
6654 } 6654 }
6655 6655
6656 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) 6656 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6657 { 6657 {
6658 unsigned int level = 0; 6658 unsigned int level = 0;
6659 6659
6660 if (sscanf(buf, "%u", &level) != 1) 6660 if (sscanf(buf, "%u", &level) != 1)
6661 return -EINVAL; 6661 return -EINVAL;
6662 6662
6663 /* 6663 /*
6664 * level is always positive, so don't check for 6664 * level is always positive, so don't check for
6665 * level < POWERSAVINGS_BALANCE_NONE, which is 0. 6665 * level < POWERSAVINGS_BALANCE_NONE, which is 0.
6666 * What happens on a 0- or 1-byte write? Do we 6666 * What happens on a 0- or 1-byte write? Do we
6667 * need to check count as well? 6667 * need to check count as well?
6668 */ 6668 */
6669 6669
6670 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) 6670 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
6671 return -EINVAL; 6671 return -EINVAL;
6672 6672
6673 if (smt) 6673 if (smt)
6674 sched_smt_power_savings = level; 6674 sched_smt_power_savings = level;
6675 else 6675 else
6676 sched_mc_power_savings = level; 6676 sched_mc_power_savings = level;
6677 6677
6678 reinit_sched_domains(); 6678 reinit_sched_domains();
6679 6679
6680 return count; 6680 return count;
6681 } 6681 }
6682 6682
6683 #ifdef CONFIG_SCHED_MC 6683 #ifdef CONFIG_SCHED_MC
6684 static ssize_t sched_mc_power_savings_show(struct device *dev, 6684 static ssize_t sched_mc_power_savings_show(struct device *dev,
6685 struct device_attribute *attr, 6685 struct device_attribute *attr,
6686 char *buf) 6686 char *buf)
6687 { 6687 {
6688 return sprintf(buf, "%u\n", sched_mc_power_savings); 6688 return sprintf(buf, "%u\n", sched_mc_power_savings);
6689 } 6689 }
6690 static ssize_t sched_mc_power_savings_store(struct device *dev, 6690 static ssize_t sched_mc_power_savings_store(struct device *dev,
6691 struct device_attribute *attr, 6691 struct device_attribute *attr,
6692 const char *buf, size_t count) 6692 const char *buf, size_t count)
6693 { 6693 {
6694 return sched_power_savings_store(buf, count, 0); 6694 return sched_power_savings_store(buf, count, 0);
6695 } 6695 }
6696 static DEVICE_ATTR(sched_mc_power_savings, 0644, 6696 static DEVICE_ATTR(sched_mc_power_savings, 0644,
6697 sched_mc_power_savings_show, 6697 sched_mc_power_savings_show,
6698 sched_mc_power_savings_store); 6698 sched_mc_power_savings_store);
6699 #endif 6699 #endif
6700 6700
6701 #ifdef CONFIG_SCHED_SMT 6701 #ifdef CONFIG_SCHED_SMT
6702 static ssize_t sched_smt_power_savings_show(struct device *dev, 6702 static ssize_t sched_smt_power_savings_show(struct device *dev,
6703 struct device_attribute *attr, 6703 struct device_attribute *attr,
6704 char *buf) 6704 char *buf)
6705 { 6705 {
6706 return sprintf(buf, "%u\n", sched_smt_power_savings); 6706 return sprintf(buf, "%u\n", sched_smt_power_savings);
6707 } 6707 }
6708 static ssize_t sched_smt_power_savings_store(struct device *dev, 6708 static ssize_t sched_smt_power_savings_store(struct device *dev,
6709 struct device_attribute *attr, 6709 struct device_attribute *attr,
6710 const char *buf, size_t count) 6710 const char *buf, size_t count)
6711 { 6711 {
6712 return sched_power_savings_store(buf, count, 1); 6712 return sched_power_savings_store(buf, count, 1);
6713 } 6713 }
6714 static DEVICE_ATTR(sched_smt_power_savings, 0644, 6714 static DEVICE_ATTR(sched_smt_power_savings, 0644,
6715 sched_smt_power_savings_show, 6715 sched_smt_power_savings_show,
6716 sched_smt_power_savings_store); 6716 sched_smt_power_savings_store);
6717 #endif 6717 #endif
6718 6718
6719 int __init sched_create_sysfs_power_savings_entries(struct device *dev) 6719 int __init sched_create_sysfs_power_savings_entries(struct device *dev)
6720 { 6720 {
6721 int err = 0; 6721 int err = 0;
6722 6722
6723 #ifdef CONFIG_SCHED_SMT 6723 #ifdef CONFIG_SCHED_SMT
6724 if (smt_capable()) 6724 if (smt_capable())
6725 err = device_create_file(dev, &dev_attr_sched_smt_power_savings); 6725 err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
6726 #endif 6726 #endif
6727 #ifdef CONFIG_SCHED_MC 6727 #ifdef CONFIG_SCHED_MC
6728 if (!err && mc_capable()) 6728 if (!err && mc_capable())
6729 err = device_create_file(dev, &dev_attr_sched_mc_power_savings); 6729 err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
6730 #endif 6730 #endif
6731 return err; 6731 return err;
6732 } 6732 }
6733 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 6733 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
6734 6734
6735 /* 6735 /*
6736 * Update cpusets according to cpu_active mask. If cpusets are 6736 * Update cpusets according to cpu_active mask. If cpusets are
6737 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 6737 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
6738 * around partition_sched_domains(). 6738 * around partition_sched_domains().
6739 */ 6739 */
6740 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, 6740 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
6741 void *hcpu) 6741 void *hcpu)
6742 { 6742 {
6743 switch (action & ~CPU_TASKS_FROZEN) { 6743 switch (action & ~CPU_TASKS_FROZEN) {
6744 case CPU_ONLINE: 6744 case CPU_ONLINE:
6745 case CPU_DOWN_FAILED: 6745 case CPU_DOWN_FAILED:
6746 cpuset_update_active_cpus(); 6746 cpuset_update_active_cpus();
6747 return NOTIFY_OK; 6747 return NOTIFY_OK;
6748 default: 6748 default:
6749 return NOTIFY_DONE; 6749 return NOTIFY_DONE;
6750 } 6750 }
6751 } 6751 }
6752 6752
6753 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, 6753 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
6754 void *hcpu) 6754 void *hcpu)
6755 { 6755 {
6756 switch (action & ~CPU_TASKS_FROZEN) { 6756 switch (action & ~CPU_TASKS_FROZEN) {
6757 case CPU_DOWN_PREPARE: 6757 case CPU_DOWN_PREPARE:
6758 cpuset_update_active_cpus(); 6758 cpuset_update_active_cpus();
6759 return NOTIFY_OK; 6759 return NOTIFY_OK;
6760 default: 6760 default:
6761 return NOTIFY_DONE; 6761 return NOTIFY_DONE;
6762 } 6762 }
6763 } 6763 }
6764 6764
6765 void __init sched_init_smp(void) 6765 void __init sched_init_smp(void)
6766 { 6766 {
6767 cpumask_var_t non_isolated_cpus; 6767 cpumask_var_t non_isolated_cpus;
6768 6768
6769 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 6769 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
6770 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 6770 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
6771 6771
6772 get_online_cpus(); 6772 get_online_cpus();
6773 mutex_lock(&sched_domains_mutex); 6773 mutex_lock(&sched_domains_mutex);
6774 init_sched_domains(cpu_active_mask); 6774 init_sched_domains(cpu_active_mask);
6775 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 6775 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
6776 if (cpumask_empty(non_isolated_cpus)) 6776 if (cpumask_empty(non_isolated_cpus))
6777 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 6777 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
6778 mutex_unlock(&sched_domains_mutex); 6778 mutex_unlock(&sched_domains_mutex);
6779 put_online_cpus(); 6779 put_online_cpus();
6780 6780
6781 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 6781 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
6782 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); 6782 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
6783 6783
6784 /* RT runtime code needs to handle some hotplug events */ 6784 /* RT runtime code needs to handle some hotplug events */
6785 hotcpu_notifier(update_runtime, 0); 6785 hotcpu_notifier(update_runtime, 0);
6786 6786
6787 init_hrtick(); 6787 init_hrtick();
6788 6788
6789 /* Move init over to a non-isolated CPU */ 6789 /* Move init over to a non-isolated CPU */
6790 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) 6790 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
6791 BUG(); 6791 BUG();
6792 sched_init_granularity(); 6792 sched_init_granularity();
6793 free_cpumask_var(non_isolated_cpus); 6793 free_cpumask_var(non_isolated_cpus);
6794 6794
6795 init_sched_rt_class(); 6795 init_sched_rt_class();
6796 } 6796 }
6797 #else 6797 #else
6798 void __init sched_init_smp(void) 6798 void __init sched_init_smp(void)
6799 { 6799 {
6800 sched_init_granularity(); 6800 sched_init_granularity();
6801 } 6801 }
6802 #endif /* CONFIG_SMP */ 6802 #endif /* CONFIG_SMP */
6803 6803
6804 const_debug unsigned int sysctl_timer_migration = 1; 6804 const_debug unsigned int sysctl_timer_migration = 1;
6805 6805
6806 int in_sched_functions(unsigned long addr) 6806 int in_sched_functions(unsigned long addr)
6807 { 6807 {
6808 return in_lock_functions(addr) || 6808 return in_lock_functions(addr) ||
6809 (addr >= (unsigned long)__sched_text_start 6809 (addr >= (unsigned long)__sched_text_start
6810 && addr < (unsigned long)__sched_text_end); 6810 && addr < (unsigned long)__sched_text_end);
6811 } 6811 }
6812 6812
6813 #ifdef CONFIG_CGROUP_SCHED 6813 #ifdef CONFIG_CGROUP_SCHED
6814 struct task_group root_task_group; 6814 struct task_group root_task_group;
6815 #endif 6815 #endif
6816 6816
6817 DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 6817 DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
6818 6818
6819 void __init sched_init(void) 6819 void __init sched_init(void)
6820 { 6820 {
6821 int i, j; 6821 int i, j;
6822 unsigned long alloc_size = 0, ptr; 6822 unsigned long alloc_size = 0, ptr;
6823 6823
6824 #ifdef CONFIG_FAIR_GROUP_SCHED 6824 #ifdef CONFIG_FAIR_GROUP_SCHED
6825 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 6825 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6826 #endif 6826 #endif
6827 #ifdef CONFIG_RT_GROUP_SCHED 6827 #ifdef CONFIG_RT_GROUP_SCHED
6828 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 6828 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6829 #endif 6829 #endif
6830 #ifdef CONFIG_CPUMASK_OFFSTACK 6830 #ifdef CONFIG_CPUMASK_OFFSTACK
6831 alloc_size += num_possible_cpus() * cpumask_size(); 6831 alloc_size += num_possible_cpus() * cpumask_size();
6832 #endif 6832 #endif
6833 if (alloc_size) { 6833 if (alloc_size) {
6834 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 6834 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
6835 6835
6836 #ifdef CONFIG_FAIR_GROUP_SCHED 6836 #ifdef CONFIG_FAIR_GROUP_SCHED
6837 root_task_group.se = (struct sched_entity **)ptr; 6837 root_task_group.se = (struct sched_entity **)ptr;
6838 ptr += nr_cpu_ids * sizeof(void **); 6838 ptr += nr_cpu_ids * sizeof(void **);
6839 6839
6840 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 6840 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
6841 ptr += nr_cpu_ids * sizeof(void **); 6841 ptr += nr_cpu_ids * sizeof(void **);
6842 6842
6843 #endif /* CONFIG_FAIR_GROUP_SCHED */ 6843 #endif /* CONFIG_FAIR_GROUP_SCHED */
6844 #ifdef CONFIG_RT_GROUP_SCHED 6844 #ifdef CONFIG_RT_GROUP_SCHED
6845 root_task_group.rt_se = (struct sched_rt_entity **)ptr; 6845 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
6846 ptr += nr_cpu_ids * sizeof(void **); 6846 ptr += nr_cpu_ids * sizeof(void **);
6847 6847
6848 root_task_group.rt_rq = (struct rt_rq **)ptr; 6848 root_task_group.rt_rq = (struct rt_rq **)ptr;
6849 ptr += nr_cpu_ids * sizeof(void **); 6849 ptr += nr_cpu_ids * sizeof(void **);
6850 6850
6851 #endif /* CONFIG_RT_GROUP_SCHED */ 6851 #endif /* CONFIG_RT_GROUP_SCHED */
6852 #ifdef CONFIG_CPUMASK_OFFSTACK 6852 #ifdef CONFIG_CPUMASK_OFFSTACK
6853 for_each_possible_cpu(i) { 6853 for_each_possible_cpu(i) {
6854 per_cpu(load_balance_tmpmask, i) = (void *)ptr; 6854 per_cpu(load_balance_tmpmask, i) = (void *)ptr;
6855 ptr += cpumask_size(); 6855 ptr += cpumask_size();
6856 } 6856 }
6857 #endif /* CONFIG_CPUMASK_OFFSTACK */ 6857 #endif /* CONFIG_CPUMASK_OFFSTACK */
6858 } 6858 }
6859 6859
6860 #ifdef CONFIG_SMP 6860 #ifdef CONFIG_SMP
6861 init_defrootdomain(); 6861 init_defrootdomain();
6862 #endif 6862 #endif
6863 6863
6864 init_rt_bandwidth(&def_rt_bandwidth, 6864 init_rt_bandwidth(&def_rt_bandwidth,
6865 global_rt_period(), global_rt_runtime()); 6865 global_rt_period(), global_rt_runtime());
6866 6866
6867 #ifdef CONFIG_RT_GROUP_SCHED 6867 #ifdef CONFIG_RT_GROUP_SCHED
6868 init_rt_bandwidth(&root_task_group.rt_bandwidth, 6868 init_rt_bandwidth(&root_task_group.rt_bandwidth,
6869 global_rt_period(), global_rt_runtime()); 6869 global_rt_period(), global_rt_runtime());
6870 #endif /* CONFIG_RT_GROUP_SCHED */ 6870 #endif /* CONFIG_RT_GROUP_SCHED */
6871 6871
6872 #ifdef CONFIG_CGROUP_SCHED 6872 #ifdef CONFIG_CGROUP_SCHED
6873 list_add(&root_task_group.list, &task_groups); 6873 list_add(&root_task_group.list, &task_groups);
6874 INIT_LIST_HEAD(&root_task_group.children); 6874 INIT_LIST_HEAD(&root_task_group.children);
6875 INIT_LIST_HEAD(&root_task_group.siblings); 6875 INIT_LIST_HEAD(&root_task_group.siblings);
6876 autogroup_init(&init_task); 6876 autogroup_init(&init_task);
6877 6877
6878 #endif /* CONFIG_CGROUP_SCHED */ 6878 #endif /* CONFIG_CGROUP_SCHED */
6879 6879
6880 #ifdef CONFIG_CGROUP_CPUACCT 6880 #ifdef CONFIG_CGROUP_CPUACCT
6881 root_cpuacct.cpustat = &kernel_cpustat; 6881 root_cpuacct.cpustat = &kernel_cpustat;
6882 root_cpuacct.cpuusage = alloc_percpu(u64); 6882 root_cpuacct.cpuusage = alloc_percpu(u64);
6883 /* Too early, not expected to fail */ 6883 /* Too early, not expected to fail */
6884 BUG_ON(!root_cpuacct.cpuusage); 6884 BUG_ON(!root_cpuacct.cpuusage);
6885 #endif 6885 #endif
6886 for_each_possible_cpu(i) { 6886 for_each_possible_cpu(i) {
6887 struct rq *rq; 6887 struct rq *rq;
6888 6888
6889 rq = cpu_rq(i); 6889 rq = cpu_rq(i);
6890 raw_spin_lock_init(&rq->lock); 6890 raw_spin_lock_init(&rq->lock);
6891 rq->nr_running = 0; 6891 rq->nr_running = 0;
6892 rq->calc_load_active = 0; 6892 rq->calc_load_active = 0;
6893 rq->calc_load_update = jiffies + LOAD_FREQ; 6893 rq->calc_load_update = jiffies + LOAD_FREQ;
6894 init_cfs_rq(&rq->cfs); 6894 init_cfs_rq(&rq->cfs);
6895 init_rt_rq(&rq->rt, rq); 6895 init_rt_rq(&rq->rt, rq);
6896 #ifdef CONFIG_FAIR_GROUP_SCHED 6896 #ifdef CONFIG_FAIR_GROUP_SCHED
6897 root_task_group.shares = ROOT_TASK_GROUP_LOAD; 6897 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
6898 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6898 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6899 /* 6899 /*
6900 * How much cpu bandwidth does root_task_group get? 6900 * How much cpu bandwidth does root_task_group get?
6901 * 6901 *
6902 * In case of task-groups formed through the cgroup filesystem, it 6902 * In case of task-groups formed through the cgroup filesystem, it
6903 * gets 100% of the cpu resources in the system. This overall 6903 * gets 100% of the cpu resources in the system. This overall
6904 * system cpu resource is divided among the tasks of 6904 * system cpu resource is divided among the tasks of
6905 * root_task_group and its child task-groups in a fair manner, 6905 * root_task_group and its child task-groups in a fair manner,
6906 * based on each entity's (task or task-group's) weight 6906 * based on each entity's (task or task-group's) weight
6907 * (se->load.weight). 6907 * (se->load.weight).
6908 * 6908 *
6909 * In other words, if root_task_group has 10 tasks (each of weight 6909 * In other words, if root_task_group has 10 tasks (each of weight
6910 * 1024) and two child groups A0 and A1 (of weight 1024 each), 6910 * 1024) and two child groups A0 and A1 (of weight 1024 each),
6911 * then A0's share of the cpu resource is: 6911 * then A0's share of the cpu resource is:
6912 * 6912 *
6913 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 6913 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
6914 * 6914 *
6915 * We achieve this by letting root_task_group's tasks sit 6915 * We achieve this by letting root_task_group's tasks sit
6916 * directly in rq->cfs (i.e root_task_group->se[] = NULL). 6916 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
6917 */ 6917 */
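		/*
		 * A worked-through aside on the example above: the total
		 * weight is 10*1024 + 1024 + 1024 = 12288, so A0's bandwidth
		 * is 1024/12288 = 1/12, i.e. roughly 8.33%, matching the
		 * figure quoted.
		 */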
6918 init_cfs_bandwidth(&root_task_group.cfs_bandwidth); 6918 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
6919 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 6919 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
6920 #endif /* CONFIG_FAIR_GROUP_SCHED */ 6920 #endif /* CONFIG_FAIR_GROUP_SCHED */
6921 6921
6922 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 6922 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
6923 #ifdef CONFIG_RT_GROUP_SCHED 6923 #ifdef CONFIG_RT_GROUP_SCHED
6924 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 6924 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
6925 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 6925 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
6926 #endif 6926 #endif
6927 6927
6928 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 6928 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6929 rq->cpu_load[j] = 0; 6929 rq->cpu_load[j] = 0;
6930 6930
6931 rq->last_load_update_tick = jiffies; 6931 rq->last_load_update_tick = jiffies;
6932 6932
6933 #ifdef CONFIG_SMP 6933 #ifdef CONFIG_SMP
6934 rq->sd = NULL; 6934 rq->sd = NULL;
6935 rq->rd = NULL; 6935 rq->rd = NULL;
6936 rq->cpu_power = SCHED_POWER_SCALE; 6936 rq->cpu_power = SCHED_POWER_SCALE;
6937 rq->post_schedule = 0; 6937 rq->post_schedule = 0;
6938 rq->active_balance = 0; 6938 rq->active_balance = 0;
6939 rq->next_balance = jiffies; 6939 rq->next_balance = jiffies;
6940 rq->push_cpu = 0; 6940 rq->push_cpu = 0;
6941 rq->cpu = i; 6941 rq->cpu = i;
6942 rq->online = 0; 6942 rq->online = 0;
6943 rq->idle_stamp = 0; 6943 rq->idle_stamp = 0;
6944 rq->avg_idle = 2*sysctl_sched_migration_cost; 6944 rq->avg_idle = 2*sysctl_sched_migration_cost;
6945 rq_attach_root(rq, &def_root_domain); 6945 rq_attach_root(rq, &def_root_domain);
6946 #ifdef CONFIG_NO_HZ 6946 #ifdef CONFIG_NO_HZ
6947 rq->nohz_flags = 0; 6947 rq->nohz_flags = 0;
6948 #endif 6948 #endif
6949 #endif 6949 #endif
6950 init_rq_hrtick(rq); 6950 init_rq_hrtick(rq);
6951 atomic_set(&rq->nr_iowait, 0); 6951 atomic_set(&rq->nr_iowait, 0);
6952 } 6952 }
6953 6953
6954 set_load_weight(&init_task); 6954 set_load_weight(&init_task);
6955 6955
6956 #ifdef CONFIG_PREEMPT_NOTIFIERS 6956 #ifdef CONFIG_PREEMPT_NOTIFIERS
6957 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6957 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
6958 #endif 6958 #endif
6959 6959
6960 #ifdef CONFIG_RT_MUTEXES 6960 #ifdef CONFIG_RT_MUTEXES
6961 plist_head_init(&init_task.pi_waiters); 6961 plist_head_init(&init_task.pi_waiters);
6962 #endif 6962 #endif
6963 6963
6964 /* 6964 /*
6965 * The boot idle thread does lazy MMU switching as well: 6965 * The boot idle thread does lazy MMU switching as well:
6966 */ 6966 */
6967 atomic_inc(&init_mm.mm_count); 6967 atomic_inc(&init_mm.mm_count);
6968 enter_lazy_tlb(&init_mm, current); 6968 enter_lazy_tlb(&init_mm, current);
6969 6969
6970 /* 6970 /*
6971 * Make us the idle thread. Technically, schedule() should not be 6971 * Make us the idle thread. Technically, schedule() should not be
6972 * called from this thread; however, somewhere below it might be, 6972 * called from this thread; however, somewhere below it might be,
6973 * but because we are the idle thread, we just pick up running again 6973 * but because we are the idle thread, we just pick up running again
6974 * when this runqueue becomes "idle". 6974 * when this runqueue becomes "idle".
6975 */ 6975 */
6976 init_idle(current, smp_processor_id()); 6976 init_idle(current, smp_processor_id());
6977 6977
6978 calc_load_update = jiffies + LOAD_FREQ; 6978 calc_load_update = jiffies + LOAD_FREQ;
6979 6979
6980 /* 6980 /*
6981 * During early bootup we pretend to be a normal task: 6981 * During early bootup we pretend to be a normal task:
6982 */ 6982 */
6983 current->sched_class = &fair_sched_class; 6983 current->sched_class = &fair_sched_class;
6984 6984
6985 #ifdef CONFIG_SMP 6985 #ifdef CONFIG_SMP
6986 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 6986 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
6987 /* May be allocated at isolcpus cmdline parse time */ 6987 /* May be allocated at isolcpus cmdline parse time */
6988 if (cpu_isolated_map == NULL) 6988 if (cpu_isolated_map == NULL)
6989 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 6989 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
6990 #endif 6990 #endif
6991 init_sched_fair_class(); 6991 init_sched_fair_class();
6992 6992
6993 scheduler_running = 1; 6993 scheduler_running = 1;
6994 } 6994 }
6995 6995
6996 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 6996 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
6997 static inline int preempt_count_equals(int preempt_offset) 6997 static inline int preempt_count_equals(int preempt_offset)
6998 { 6998 {
6999 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 6999 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
7000 7000
7001 return (nested == preempt_offset); 7001 return (nested == preempt_offset);
7002 } 7002 }
7003 7003
7004 void __might_sleep(const char *file, int line, int preempt_offset) 7004 void __might_sleep(const char *file, int line, int preempt_offset)
7005 { 7005 {
7006 static unsigned long prev_jiffy; /* ratelimiting */ 7006 static unsigned long prev_jiffy; /* ratelimiting */
7007 7007
7008 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ 7008 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
7009 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 7009 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
7010 system_state != SYSTEM_RUNNING || oops_in_progress) 7010 system_state != SYSTEM_RUNNING || oops_in_progress)
7011 return; 7011 return;
7012 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 7012 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7013 return; 7013 return;
7014 prev_jiffy = jiffies; 7014 prev_jiffy = jiffies;
7015 7015
7016 printk(KERN_ERR 7016 printk(KERN_ERR
7017 "BUG: sleeping function called from invalid context at %s:%d\n", 7017 "BUG: sleeping function called from invalid context at %s:%d\n",
7018 file, line); 7018 file, line);
7019 printk(KERN_ERR 7019 printk(KERN_ERR
7020 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", 7020 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
7021 in_atomic(), irqs_disabled(), 7021 in_atomic(), irqs_disabled(),
7022 current->pid, current->comm); 7022 current->pid, current->comm);
7023 7023
7024 debug_show_held_locks(current); 7024 debug_show_held_locks(current);
7025 if (irqs_disabled()) 7025 if (irqs_disabled())
7026 print_irqtrace_events(current); 7026 print_irqtrace_events(current);
7027 dump_stack(); 7027 dump_stack();
7028 } 7028 }
7029 EXPORT_SYMBOL(__might_sleep); 7029 EXPORT_SYMBOL(__might_sleep);
7030 #endif 7030 #endif
7031 7031
7032 #ifdef CONFIG_MAGIC_SYSRQ 7032 #ifdef CONFIG_MAGIC_SYSRQ
7033 static void normalize_task(struct rq *rq, struct task_struct *p) 7033 static void normalize_task(struct rq *rq, struct task_struct *p)
7034 { 7034 {
7035 const struct sched_class *prev_class = p->sched_class; 7035 const struct sched_class *prev_class = p->sched_class;
7036 int old_prio = p->prio; 7036 int old_prio = p->prio;
7037 int on_rq; 7037 int on_rq;
7038 7038
7039 on_rq = p->on_rq; 7039 on_rq = p->on_rq;
7040 if (on_rq) 7040 if (on_rq)
7041 dequeue_task(rq, p, 0); 7041 dequeue_task(rq, p, 0);
7042 __setscheduler(rq, p, SCHED_NORMAL, 0); 7042 __setscheduler(rq, p, SCHED_NORMAL, 0);
7043 if (on_rq) { 7043 if (on_rq) {
7044 enqueue_task(rq, p, 0); 7044 enqueue_task(rq, p, 0);
7045 resched_task(rq->curr); 7045 resched_task(rq->curr);
7046 } 7046 }
7047 7047
7048 check_class_changed(rq, p, prev_class, old_prio); 7048 check_class_changed(rq, p, prev_class, old_prio);
7049 } 7049 }
7050 7050
7051 void normalize_rt_tasks(void) 7051 void normalize_rt_tasks(void)
7052 { 7052 {
7053 struct task_struct *g, *p; 7053 struct task_struct *g, *p;
7054 unsigned long flags; 7054 unsigned long flags;
7055 struct rq *rq; 7055 struct rq *rq;
7056 7056
7057 read_lock_irqsave(&tasklist_lock, flags); 7057 read_lock_irqsave(&tasklist_lock, flags);
7058 do_each_thread(g, p) { 7058 do_each_thread(g, p) {
7059 /* 7059 /*
7060 * Only normalize user tasks: 7060 * Only normalize user tasks:
7061 */ 7061 */
7062 if (!p->mm) 7062 if (!p->mm)
7063 continue; 7063 continue;
7064 7064
7065 p->se.exec_start = 0; 7065 p->se.exec_start = 0;
7066 #ifdef CONFIG_SCHEDSTATS 7066 #ifdef CONFIG_SCHEDSTATS
7067 p->se.statistics.wait_start = 0; 7067 p->se.statistics.wait_start = 0;
7068 p->se.statistics.sleep_start = 0; 7068 p->se.statistics.sleep_start = 0;
7069 p->se.statistics.block_start = 0; 7069 p->se.statistics.block_start = 0;
7070 #endif 7070 #endif
7071 7071
7072 if (!rt_task(p)) { 7072 if (!rt_task(p)) {
7073 /* 7073 /*
7074 * Renice negative nice level userspace 7074 * Renice negative nice level userspace
7075 * tasks back to 0: 7075 * tasks back to 0:
7076 */ 7076 */
7077 if (TASK_NICE(p) < 0 && p->mm) 7077 if (TASK_NICE(p) < 0 && p->mm)
7078 set_user_nice(p, 0); 7078 set_user_nice(p, 0);
7079 continue; 7079 continue;
7080 } 7080 }
7081 7081
7082 raw_spin_lock(&p->pi_lock); 7082 raw_spin_lock(&p->pi_lock);
7083 rq = __task_rq_lock(p); 7083 rq = __task_rq_lock(p);
7084 7084
7085 normalize_task(rq, p); 7085 normalize_task(rq, p);
7086 7086
7087 __task_rq_unlock(rq); 7087 __task_rq_unlock(rq);
7088 raw_spin_unlock(&p->pi_lock); 7088 raw_spin_unlock(&p->pi_lock);
7089 } while_each_thread(g, p); 7089 } while_each_thread(g, p);
7090 7090
7091 read_unlock_irqrestore(&tasklist_lock, flags); 7091 read_unlock_irqrestore(&tasklist_lock, flags);
7092 } 7092 }
7093 7093
7094 #endif /* CONFIG_MAGIC_SYSRQ */ 7094 #endif /* CONFIG_MAGIC_SYSRQ */
7095 7095
7096 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) 7096 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
7097 /* 7097 /*
7098 * These functions are only useful for the IA64 MCA handling, or kdb. 7098 * These functions are only useful for the IA64 MCA handling, or kdb.
7099 * 7099 *
7100 * They can only be called when the whole system has been 7100 * They can only be called when the whole system has been
7101 * stopped - every CPU needs to be quiescent, and no scheduling 7101 * stopped - every CPU needs to be quiescent, and no scheduling
7102 * activity can take place. Using them for anything else would 7102 * activity can take place. Using them for anything else would
7103 * be a serious bug, and as a result, they aren't even visible 7103 * be a serious bug, and as a result, they aren't even visible
7104 * under any other configuration. 7104 * under any other configuration.
7105 */ 7105 */
7106 7106
7107 /** 7107 /**
7108 * curr_task - return the current task for a given cpu. 7108 * curr_task - return the current task for a given cpu.
7109 * @cpu: the processor in question. 7109 * @cpu: the processor in question.
7110 * 7110 *
7111 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 7111 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7112 */ 7112 */
7113 struct task_struct *curr_task(int cpu) 7113 struct task_struct *curr_task(int cpu)
7114 { 7114 {
7115 return cpu_curr(cpu); 7115 return cpu_curr(cpu);
7116 } 7116 }
7117 7117
7118 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ 7118 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
7119 7119
7120 #ifdef CONFIG_IA64 7120 #ifdef CONFIG_IA64
7121 /** 7121 /**
7122 * set_curr_task - set the current task for a given cpu. 7122 * set_curr_task - set the current task for a given cpu.
7123 * @cpu: the processor in question. 7123 * @cpu: the processor in question.
7124 * @p: the task pointer to set. 7124 * @p: the task pointer to set.
7125 * 7125 *
7126 * Description: This function must only be used when non-maskable interrupts 7126 * Description: This function must only be used when non-maskable interrupts
7127 * are serviced on a separate stack. It allows the architecture to switch the 7127 * are serviced on a separate stack. It allows the architecture to switch the
7128 * notion of the current task on a cpu in a non-blocking manner. This function 7128 * notion of the current task on a cpu in a non-blocking manner. This function
7129 * must be called with all CPUs synchronized and interrupts disabled; the 7129 * must be called with all CPUs synchronized and interrupts disabled; the
7130 * caller must save the original value of the current task (see 7130 * caller must save the original value of the current task (see
7131 * curr_task() above) and restore that value before reenabling interrupts and 7131 * curr_task() above) and restore that value before reenabling interrupts and
7132 * re-starting the system. 7132 * re-starting the system.
7133 * 7133 *
7134 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 7134 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7135 */ 7135 */
7136 void set_curr_task(int cpu, struct task_struct *p) 7136 void set_curr_task(int cpu, struct task_struct *p)
7137 { 7137 {
7138 cpu_curr(cpu) = p; 7138 cpu_curr(cpu) = p;
7139 } 7139 }
7140 7140
7141 #endif 7141 #endif
7142 7142
7143 #ifdef CONFIG_CGROUP_SCHED 7143 #ifdef CONFIG_CGROUP_SCHED
7144 /* task_group_lock serializes the addition/removal of task groups */ 7144 /* task_group_lock serializes the addition/removal of task groups */
7145 static DEFINE_SPINLOCK(task_group_lock); 7145 static DEFINE_SPINLOCK(task_group_lock);
7146 7146
7147 static void free_sched_group(struct task_group *tg) 7147 static void free_sched_group(struct task_group *tg)
7148 { 7148 {
7149 free_fair_sched_group(tg); 7149 free_fair_sched_group(tg);
7150 free_rt_sched_group(tg); 7150 free_rt_sched_group(tg);
7151 autogroup_free(tg); 7151 autogroup_free(tg);
7152 kfree(tg); 7152 kfree(tg);
7153 } 7153 }
7154 7154
7155 /* allocate runqueue etc for a new task group */ 7155 /* allocate runqueue etc for a new task group */
7156 struct task_group *sched_create_group(struct task_group *parent) 7156 struct task_group *sched_create_group(struct task_group *parent)
7157 { 7157 {
7158 struct task_group *tg; 7158 struct task_group *tg;
7159 unsigned long flags; 7159 unsigned long flags;
7160 7160
7161 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 7161 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7162 if (!tg) 7162 if (!tg)
7163 return ERR_PTR(-ENOMEM); 7163 return ERR_PTR(-ENOMEM);
7164 7164
7165 if (!alloc_fair_sched_group(tg, parent)) 7165 if (!alloc_fair_sched_group(tg, parent))
7166 goto err; 7166 goto err;
7167 7167
7168 if (!alloc_rt_sched_group(tg, parent)) 7168 if (!alloc_rt_sched_group(tg, parent))
7169 goto err; 7169 goto err;
7170 7170
7171 spin_lock_irqsave(&task_group_lock, flags); 7171 spin_lock_irqsave(&task_group_lock, flags);
7172 list_add_rcu(&tg->list, &task_groups); 7172 list_add_rcu(&tg->list, &task_groups);
7173 7173
7174 WARN_ON(!parent); /* root should already exist */ 7174 WARN_ON(!parent); /* root should already exist */
7175 7175
7176 tg->parent = parent; 7176 tg->parent = parent;
7177 INIT_LIST_HEAD(&tg->children); 7177 INIT_LIST_HEAD(&tg->children);
7178 list_add_rcu(&tg->siblings, &parent->children); 7178 list_add_rcu(&tg->siblings, &parent->children);
7179 spin_unlock_irqrestore(&task_group_lock, flags); 7179 spin_unlock_irqrestore(&task_group_lock, flags);
7180 7180
7181 return tg; 7181 return tg;
7182 7182
7183 err: 7183 err:
7184 free_sched_group(tg); 7184 free_sched_group(tg);
7185 return ERR_PTR(-ENOMEM); 7185 return ERR_PTR(-ENOMEM);
7186 } 7186 }
7187 7187
7188 /* rcu callback to free various structures associated with a task group */ 7188 /* rcu callback to free various structures associated with a task group */
7189 static void free_sched_group_rcu(struct rcu_head *rhp) 7189 static void free_sched_group_rcu(struct rcu_head *rhp)
7190 { 7190 {
7191 /* now it should be safe to free those cfs_rqs */ 7191 /* now it should be safe to free those cfs_rqs */
7192 free_sched_group(container_of(rhp, struct task_group, rcu)); 7192 free_sched_group(container_of(rhp, struct task_group, rcu));
7193 } 7193 }
7194 7194
7195 /* Destroy runqueue etc associated with a task group */ 7195 /* Destroy runqueue etc associated with a task group */
7196 void sched_destroy_group(struct task_group *tg) 7196 void sched_destroy_group(struct task_group *tg)
7197 { 7197 {
7198 unsigned long flags; 7198 unsigned long flags;
7199 int i; 7199 int i;
7200 7200
7201 /* end participation in shares distribution */ 7201 /* end participation in shares distribution */
7202 for_each_possible_cpu(i) 7202 for_each_possible_cpu(i)
7203 unregister_fair_sched_group(tg, i); 7203 unregister_fair_sched_group(tg, i);
7204 7204
7205 spin_lock_irqsave(&task_group_lock, flags); 7205 spin_lock_irqsave(&task_group_lock, flags);
7206 list_del_rcu(&tg->list); 7206 list_del_rcu(&tg->list);
7207 list_del_rcu(&tg->siblings); 7207 list_del_rcu(&tg->siblings);
7208 spin_unlock_irqrestore(&task_group_lock, flags); 7208 spin_unlock_irqrestore(&task_group_lock, flags);
7209 7209
7210 /* wait for possible concurrent references to cfs_rqs to complete */ 7210 /* wait for possible concurrent references to cfs_rqs to complete */
7211 call_rcu(&tg->rcu, free_sched_group_rcu); 7211 call_rcu(&tg->rcu, free_sched_group_rcu);
7212 } 7212 }
7213 7213
7214 /* change task's runqueue when it moves between groups. 7214 /* change task's runqueue when it moves between groups.
7215 * The caller of this function should have put the task in its new group 7215 * The caller of this function should have put the task in its new group
7216 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to 7216 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
7217 * reflect its new group. 7217 * reflect its new group.
7218 */ 7218 */
7219 void sched_move_task(struct task_struct *tsk) 7219 void sched_move_task(struct task_struct *tsk)
7220 { 7220 {
7221 int on_rq, running; 7221 int on_rq, running;
7222 unsigned long flags; 7222 unsigned long flags;
7223 struct rq *rq; 7223 struct rq *rq;
7224 7224
7225 rq = task_rq_lock(tsk, &flags); 7225 rq = task_rq_lock(tsk, &flags);
7226 7226
7227 running = task_current(rq, tsk); 7227 running = task_current(rq, tsk);
7228 on_rq = tsk->on_rq; 7228 on_rq = tsk->on_rq;
7229 7229
7230 if (on_rq) 7230 if (on_rq)
7231 dequeue_task(rq, tsk, 0); 7231 dequeue_task(rq, tsk, 0);
7232 if (unlikely(running)) 7232 if (unlikely(running))
7233 tsk->sched_class->put_prev_task(rq, tsk); 7233 tsk->sched_class->put_prev_task(rq, tsk);
7234 7234
7235 #ifdef CONFIG_FAIR_GROUP_SCHED 7235 #ifdef CONFIG_FAIR_GROUP_SCHED
7236 if (tsk->sched_class->task_move_group) 7236 if (tsk->sched_class->task_move_group)
7237 tsk->sched_class->task_move_group(tsk, on_rq); 7237 tsk->sched_class->task_move_group(tsk, on_rq);
7238 else 7238 else
7239 #endif 7239 #endif
7240 set_task_rq(tsk, task_cpu(tsk)); 7240 set_task_rq(tsk, task_cpu(tsk));
7241 7241
7242 if (unlikely(running)) 7242 if (unlikely(running))
7243 tsk->sched_class->set_curr_task(rq); 7243 tsk->sched_class->set_curr_task(rq);
7244 if (on_rq) 7244 if (on_rq)
7245 enqueue_task(rq, tsk, 0); 7245 enqueue_task(rq, tsk, 0);
7246 7246
7247 task_rq_unlock(rq, tsk, &flags); 7247 task_rq_unlock(rq, tsk, &flags);
7248 } 7248 }
7249 #endif /* CONFIG_CGROUP_SCHED */ 7249 #endif /* CONFIG_CGROUP_SCHED */
7250 7250
7251 #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) 7251 #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
7252 static unsigned long to_ratio(u64 period, u64 runtime) 7252 static unsigned long to_ratio(u64 period, u64 runtime)
7253 { 7253 {
7254 if (runtime == RUNTIME_INF) 7254 if (runtime == RUNTIME_INF)
7255 return 1ULL << 20; 7255 return 1ULL << 20;
7256 7256
7257 return div64_u64(runtime << 20, period); 7257 return div64_u64(runtime << 20, period);
7258 } 7258 }
7259 #endif 7259 #endif
7260 7260
7261 #ifdef CONFIG_RT_GROUP_SCHED 7261 #ifdef CONFIG_RT_GROUP_SCHED
7262 /* 7262 /*
7263 * Ensure that the real time constraints are schedulable. 7263 * Ensure that the real time constraints are schedulable.
7264 */ 7264 */
7265 static DEFINE_MUTEX(rt_constraints_mutex); 7265 static DEFINE_MUTEX(rt_constraints_mutex);
7266 7266
7267 /* Must be called with tasklist_lock held */ 7267 /* Must be called with tasklist_lock held */
7268 static inline int tg_has_rt_tasks(struct task_group *tg) 7268 static inline int tg_has_rt_tasks(struct task_group *tg)
7269 { 7269 {
7270 struct task_struct *g, *p; 7270 struct task_struct *g, *p;
7271 7271
7272 do_each_thread(g, p) { 7272 do_each_thread(g, p) {
7273 if (rt_task(p) && task_rq(p)->rt.tg == tg) 7273 if (rt_task(p) && task_rq(p)->rt.tg == tg)
7274 return 1; 7274 return 1;
7275 } while_each_thread(g, p); 7275 } while_each_thread(g, p);
7276 7276
7277 return 0; 7277 return 0;
7278 } 7278 }
7279 7279
7280 struct rt_schedulable_data { 7280 struct rt_schedulable_data {
7281 struct task_group *tg; 7281 struct task_group *tg;
7282 u64 rt_period; 7282 u64 rt_period;
7283 u64 rt_runtime; 7283 u64 rt_runtime;
7284 }; 7284 };
7285 7285
7286 static int tg_rt_schedulable(struct task_group *tg, void *data) 7286 static int tg_rt_schedulable(struct task_group *tg, void *data)
7287 { 7287 {
7288 struct rt_schedulable_data *d = data; 7288 struct rt_schedulable_data *d = data;
7289 struct task_group *child; 7289 struct task_group *child;
7290 unsigned long total, sum = 0; 7290 unsigned long total, sum = 0;
7291 u64 period, runtime; 7291 u64 period, runtime;
7292 7292
7293 period = ktime_to_ns(tg->rt_bandwidth.rt_period); 7293 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7294 runtime = tg->rt_bandwidth.rt_runtime; 7294 runtime = tg->rt_bandwidth.rt_runtime;
7295 7295
7296 if (tg == d->tg) { 7296 if (tg == d->tg) {
7297 period = d->rt_period; 7297 period = d->rt_period;
7298 runtime = d->rt_runtime; 7298 runtime = d->rt_runtime;
7299 } 7299 }
7300 7300
7301 /* 7301 /*
7302 * Cannot have more runtime than the period. 7302 * Cannot have more runtime than the period.
7303 */ 7303 */
7304 if (runtime > period && runtime != RUNTIME_INF) 7304 if (runtime > period && runtime != RUNTIME_INF)
7305 return -EINVAL; 7305 return -EINVAL;
7306 7306
7307 /* 7307 /*
7308 * Ensure we don't starve existing RT tasks. 7308 * Ensure we don't starve existing RT tasks.
7309 */ 7309 */
7310 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) 7310 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
7311 return -EBUSY; 7311 return -EBUSY;
7312 7312
7313 total = to_ratio(period, runtime); 7313 total = to_ratio(period, runtime);
7314 7314
7315 /* 7315 /*
7316 * Nobody can have more than the global setting allows. 7316 * Nobody can have more than the global setting allows.
7317 */ 7317 */
7318 if (total > to_ratio(global_rt_period(), global_rt_runtime())) 7318 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
7319 return -EINVAL; 7319 return -EINVAL;
7320 7320
7321 /* 7321 /*
7322 * The sum of our children's runtime should not exceed our own. 7322 * The sum of our children's runtime should not exceed our own.
7323 */ 7323 */
7324 list_for_each_entry_rcu(child, &tg->children, siblings) { 7324 list_for_each_entry_rcu(child, &tg->children, siblings) {
7325 period = ktime_to_ns(child->rt_bandwidth.rt_period); 7325 period = ktime_to_ns(child->rt_bandwidth.rt_period);
7326 runtime = child->rt_bandwidth.rt_runtime; 7326 runtime = child->rt_bandwidth.rt_runtime;
7327 7327
7328 if (child == d->tg) { 7328 if (child == d->tg) {
7329 period = d->rt_period; 7329 period = d->rt_period;
7330 runtime = d->rt_runtime; 7330 runtime = d->rt_runtime;
7331 } 7331 }
7332 7332
7333 sum += to_ratio(period, runtime); 7333 sum += to_ratio(period, runtime);
7334 } 7334 }
7335 7335
7336 if (sum > total) 7336 if (sum > total)
7337 return -EINVAL; 7337 return -EINVAL;
7338 7338
7339 return 0; 7339 return 0;
7340 } 7340 }
7341 7341
7342 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 7342 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
7343 { 7343 {
7344 int ret; 7344 int ret;
7345 7345
7346 struct rt_schedulable_data data = { 7346 struct rt_schedulable_data data = {
7347 .tg = tg, 7347 .tg = tg,
7348 .rt_period = period, 7348 .rt_period = period,
7349 .rt_runtime = runtime, 7349 .rt_runtime = runtime,
7350 }; 7350 };
7351 7351
7352 rcu_read_lock(); 7352 rcu_read_lock();
7353 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); 7353 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
7354 rcu_read_unlock(); 7354 rcu_read_unlock();
7355 7355
7356 return ret; 7356 return ret;
7357 } 7357 }
7358 7358
7359 static int tg_set_rt_bandwidth(struct task_group *tg, 7359 static int tg_set_rt_bandwidth(struct task_group *tg,
7360 u64 rt_period, u64 rt_runtime) 7360 u64 rt_period, u64 rt_runtime)
7361 { 7361 {
7362 int i, err = 0; 7362 int i, err = 0;
7363 7363
7364 mutex_lock(&rt_constraints_mutex); 7364 mutex_lock(&rt_constraints_mutex);
7365 read_lock(&tasklist_lock); 7365 read_lock(&tasklist_lock);
7366 err = __rt_schedulable(tg, rt_period, rt_runtime); 7366 err = __rt_schedulable(tg, rt_period, rt_runtime);
7367 if (err) 7367 if (err)
7368 goto unlock; 7368 goto unlock;
7369 7369
7370 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 7370 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7371 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 7371 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
7372 tg->rt_bandwidth.rt_runtime = rt_runtime; 7372 tg->rt_bandwidth.rt_runtime = rt_runtime;
7373 7373
7374 for_each_possible_cpu(i) { 7374 for_each_possible_cpu(i) {
7375 struct rt_rq *rt_rq = tg->rt_rq[i]; 7375 struct rt_rq *rt_rq = tg->rt_rq[i];
7376 7376
7377 raw_spin_lock(&rt_rq->rt_runtime_lock); 7377 raw_spin_lock(&rt_rq->rt_runtime_lock);
7378 rt_rq->rt_runtime = rt_runtime; 7378 rt_rq->rt_runtime = rt_runtime;
7379 raw_spin_unlock(&rt_rq->rt_runtime_lock); 7379 raw_spin_unlock(&rt_rq->rt_runtime_lock);
7380 } 7380 }
7381 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 7381 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7382 unlock: 7382 unlock:
7383 read_unlock(&tasklist_lock); 7383 read_unlock(&tasklist_lock);
7384 mutex_unlock(&rt_constraints_mutex); 7384 mutex_unlock(&rt_constraints_mutex);
7385 7385
7386 return err; 7386 return err;
7387 } 7387 }
7388 7388
7389 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 7389 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7390 { 7390 {
7391 u64 rt_runtime, rt_period; 7391 u64 rt_runtime, rt_period;
7392 7392
7393 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 7393 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7394 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 7394 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
7395 if (rt_runtime_us < 0) 7395 if (rt_runtime_us < 0)
7396 rt_runtime = RUNTIME_INF; 7396 rt_runtime = RUNTIME_INF;
7397 7397
7398 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7398 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7399 } 7399 }
7400 7400
7401 long sched_group_rt_runtime(struct task_group *tg) 7401 long sched_group_rt_runtime(struct task_group *tg)
7402 { 7402 {
7403 u64 rt_runtime_us; 7403 u64 rt_runtime_us;
7404 7404
7405 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) 7405 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
7406 return -1; 7406 return -1;
7407 7407
7408 rt_runtime_us = tg->rt_bandwidth.rt_runtime; 7408 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
7409 do_div(rt_runtime_us, NSEC_PER_USEC); 7409 do_div(rt_runtime_us, NSEC_PER_USEC);
7410 return rt_runtime_us; 7410 return rt_runtime_us;
7411 } 7411 }
7412 7412
7413 int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 7413 int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
7414 { 7414 {
7415 u64 rt_runtime, rt_period; 7415 u64 rt_runtime, rt_period;
7416 7416
7417 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 7417 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
7418 rt_runtime = tg->rt_bandwidth.rt_runtime; 7418 rt_runtime = tg->rt_bandwidth.rt_runtime;
7419 7419
7420 if (rt_period == 0) 7420 if (rt_period == 0)
7421 return -EINVAL; 7421 return -EINVAL;
7422 7422
7423 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7423 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7424 } 7424 }
7425 7425
7426 long sched_group_rt_period(struct task_group *tg) 7426 long sched_group_rt_period(struct task_group *tg)
7427 { 7427 {
7428 u64 rt_period_us; 7428 u64 rt_period_us;
7429 7429
7430 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); 7430 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
7431 do_div(rt_period_us, NSEC_PER_USEC); 7431 do_div(rt_period_us, NSEC_PER_USEC);
7432 return rt_period_us; 7432 return rt_period_us;
7433 } 7433 }
7434 7434
7435 static int sched_rt_global_constraints(void) 7435 static int sched_rt_global_constraints(void)
7436 { 7436 {
7437 u64 runtime, period; 7437 u64 runtime, period;
7438 int ret = 0; 7438 int ret = 0;
7439 7439
7440 if (sysctl_sched_rt_period <= 0) 7440 if (sysctl_sched_rt_period <= 0)
7441 return -EINVAL; 7441 return -EINVAL;
7442 7442
7443 runtime = global_rt_runtime(); 7443 runtime = global_rt_runtime();
7444 period = global_rt_period(); 7444 period = global_rt_period();
7445 7445
7446 /* 7446 /*
7447 * Sanity check on the sysctl variables. 7447 * Sanity check on the sysctl variables.
7448 */ 7448 */
7449 if (runtime > period && runtime != RUNTIME_INF) 7449 if (runtime > period && runtime != RUNTIME_INF)
7450 return -EINVAL; 7450 return -EINVAL;
7451 7451
7452 mutex_lock(&rt_constraints_mutex); 7452 mutex_lock(&rt_constraints_mutex);
7453 read_lock(&tasklist_lock); 7453 read_lock(&tasklist_lock);
7454 ret = __rt_schedulable(NULL, 0, 0); 7454 ret = __rt_schedulable(NULL, 0, 0);
7455 read_unlock(&tasklist_lock); 7455 read_unlock(&tasklist_lock);
7456 mutex_unlock(&rt_constraints_mutex); 7456 mutex_unlock(&rt_constraints_mutex);
7457 7457
7458 return ret; 7458 return ret;
7459 } 7459 }
7460 7460
7461 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 7461 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
7462 { 7462 {
7463 /* Don't accept realtime tasks when there is no way for them to run */ 7463 /* Don't accept realtime tasks when there is no way for them to run */
7464 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 7464 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
7465 return 0; 7465 return 0;
7466 7466
7467 return 1; 7467 return 1;
7468 } 7468 }
7469 7469
7470 #else /* !CONFIG_RT_GROUP_SCHED */ 7470 #else /* !CONFIG_RT_GROUP_SCHED */
7471 static int sched_rt_global_constraints(void) 7471 static int sched_rt_global_constraints(void)
7472 { 7472 {
7473 unsigned long flags; 7473 unsigned long flags;
7474 int i; 7474 int i;
7475 7475
7476 if (sysctl_sched_rt_period <= 0) 7476 if (sysctl_sched_rt_period <= 0)
7477 return -EINVAL; 7477 return -EINVAL;
7478 7478
7479 /* 7479 /*
7480 * There are always some RT tasks in the root group 7480 * There are always some RT tasks in the root group
7481 * -- migration, kstopmachine etc. 7481 * -- migration, kstopmachine etc.
7482 */ 7482 */
7483 if (sysctl_sched_rt_runtime == 0) 7483 if (sysctl_sched_rt_runtime == 0)
7484 return -EBUSY; 7484 return -EBUSY;
7485 7485
7486 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 7486 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
7487 for_each_possible_cpu(i) { 7487 for_each_possible_cpu(i) {
7488 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 7488 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
7489 7489
7490 raw_spin_lock(&rt_rq->rt_runtime_lock); 7490 raw_spin_lock(&rt_rq->rt_runtime_lock);
7491 rt_rq->rt_runtime = global_rt_runtime(); 7491 rt_rq->rt_runtime = global_rt_runtime();
7492 raw_spin_unlock(&rt_rq->rt_runtime_lock); 7492 raw_spin_unlock(&rt_rq->rt_runtime_lock);
7493 } 7493 }
7494 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 7494 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
7495 7495
7496 return 0; 7496 return 0;
7497 } 7497 }
7498 #endif /* CONFIG_RT_GROUP_SCHED */ 7498 #endif /* CONFIG_RT_GROUP_SCHED */
7499 7499
7500 int sched_rt_handler(struct ctl_table *table, int write, 7500 int sched_rt_handler(struct ctl_table *table, int write,
7501 void __user *buffer, size_t *lenp, 7501 void __user *buffer, size_t *lenp,
7502 loff_t *ppos) 7502 loff_t *ppos)
7503 { 7503 {
7504 int ret; 7504 int ret;
7505 int old_period, old_runtime; 7505 int old_period, old_runtime;
7506 static DEFINE_MUTEX(mutex); 7506 static DEFINE_MUTEX(mutex);
7507 7507
7508 mutex_lock(&mutex); 7508 mutex_lock(&mutex);
7509 old_period = sysctl_sched_rt_period; 7509 old_period = sysctl_sched_rt_period;
7510 old_runtime = sysctl_sched_rt_runtime; 7510 old_runtime = sysctl_sched_rt_runtime;
7511 7511
7512 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7512 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7513 7513
7514 if (!ret && write) { 7514 if (!ret && write) {
7515 ret = sched_rt_global_constraints(); 7515 ret = sched_rt_global_constraints();
7516 if (ret) { 7516 if (ret) {
7517 sysctl_sched_rt_period = old_period; 7517 sysctl_sched_rt_period = old_period;
7518 sysctl_sched_rt_runtime = old_runtime; 7518 sysctl_sched_rt_runtime = old_runtime;
7519 } else { 7519 } else {
7520 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 7520 def_rt_bandwidth.rt_runtime = global_rt_runtime();
7521 def_rt_bandwidth.rt_period = 7521 def_rt_bandwidth.rt_period =
7522 ns_to_ktime(global_rt_period()); 7522 ns_to_ktime(global_rt_period());
7523 } 7523 }
7524 } 7524 }
7525 mutex_unlock(&mutex); 7525 mutex_unlock(&mutex);
7526 7526
7527 return ret; 7527 return ret;
7528 } 7528 }
7529 7529
7530 #ifdef CONFIG_CGROUP_SCHED 7530 #ifdef CONFIG_CGROUP_SCHED
7531 7531
7532 /* return corresponding task_group object of a cgroup */ 7532 /* return corresponding task_group object of a cgroup */
7533 static inline struct task_group *cgroup_tg(struct cgroup *cgrp) 7533 static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7534 { 7534 {
7535 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), 7535 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
7536 struct task_group, css); 7536 struct task_group, css);
7537 } 7537 }
7538 7538
7539 static struct cgroup_subsys_state * 7539 static struct cgroup_subsys_state *
7540 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) 7540 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
7541 { 7541 {
7542 struct task_group *tg, *parent; 7542 struct task_group *tg, *parent;
7543 7543
7544 if (!cgrp->parent) { 7544 if (!cgrp->parent) {
7545 /* This is early initialization for the top cgroup */ 7545 /* This is early initialization for the top cgroup */
7546 return &root_task_group.css; 7546 return &root_task_group.css;
7547 } 7547 }
7548 7548
7549 parent = cgroup_tg(cgrp->parent); 7549 parent = cgroup_tg(cgrp->parent);
7550 tg = sched_create_group(parent); 7550 tg = sched_create_group(parent);
7551 if (IS_ERR(tg)) 7551 if (IS_ERR(tg))
7552 return ERR_PTR(-ENOMEM); 7552 return ERR_PTR(-ENOMEM);
7553 7553
7554 return &tg->css; 7554 return &tg->css;
7555 } 7555 }
7556 7556
7557 static void 7557 static void
7558 cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 7558 cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
7559 { 7559 {
7560 struct task_group *tg = cgroup_tg(cgrp); 7560 struct task_group *tg = cgroup_tg(cgrp);
7561 7561
7562 sched_destroy_group(tg); 7562 sched_destroy_group(tg);
7563 } 7563 }
7564 7564
7565 static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 7565 static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7566 struct cgroup_taskset *tset) 7566 struct cgroup_taskset *tset)
7567 { 7567 {
7568 struct task_struct *task; 7568 struct task_struct *task;
7569 7569
7570 cgroup_taskset_for_each(task, cgrp, tset) { 7570 cgroup_taskset_for_each(task, cgrp, tset) {
7571 #ifdef CONFIG_RT_GROUP_SCHED 7571 #ifdef CONFIG_RT_GROUP_SCHED
7572 if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) 7572 if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
7573 return -EINVAL; 7573 return -EINVAL;
7574 #else 7574 #else
7575 /* We don't support RT-tasks being in separate groups */ 7575 /* We don't support RT-tasks being in separate groups */
7576 if (task->sched_class != &fair_sched_class) 7576 if (task->sched_class != &fair_sched_class)
7577 return -EINVAL; 7577 return -EINVAL;
7578 #endif 7578 #endif
7579 } 7579 }
7580 return 0; 7580 return 0;
7581 } 7581 }
7582 7582
7583 static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 7583 static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7584 struct cgroup_taskset *tset) 7584 struct cgroup_taskset *tset)
7585 { 7585 {
7586 struct task_struct *task; 7586 struct task_struct *task;
7587 7587
7588 cgroup_taskset_for_each(task, cgrp, tset) 7588 cgroup_taskset_for_each(task, cgrp, tset)
7589 sched_move_task(task); 7589 sched_move_task(task);
7590 } 7590 }
7591 7591
7592 static void 7592 static void
7593 cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, 7593 cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7594 struct cgroup *old_cgrp, struct task_struct *task) 7594 struct cgroup *old_cgrp, struct task_struct *task)
7595 { 7595 {
7596 /* 7596 /*
7597 * cgroup_exit() is called in the copy_process() failure path. 7597 * cgroup_exit() is called in the copy_process() failure path.
7598 * Ignore this case since the task hasn't run yet; this avoids 7598 * Ignore this case since the task hasn't run yet; this avoids
7599 * trying to poke a half freed task state from generic code. 7599 * trying to poke a half freed task state from generic code.
7600 */ 7600 */
7601 if (!(task->flags & PF_EXITING)) 7601 if (!(task->flags & PF_EXITING))
7602 return; 7602 return;
7603 7603
7604 sched_move_task(task); 7604 sched_move_task(task);
7605 } 7605 }
7606 7606
7607 #ifdef CONFIG_FAIR_GROUP_SCHED 7607 #ifdef CONFIG_FAIR_GROUP_SCHED
7608 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 7608 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
7609 u64 shareval) 7609 u64 shareval)
7610 { 7610 {
7611 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); 7611 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
7612 } 7612 }
7613 7613
7614 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 7614 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
7615 { 7615 {
7616 struct task_group *tg = cgroup_tg(cgrp); 7616 struct task_group *tg = cgroup_tg(cgrp);
7617 7617
7618 return (u64) scale_load_down(tg->shares); 7618 return (u64) scale_load_down(tg->shares);
7619 } 7619 }
7620 7620
7621 #ifdef CONFIG_CFS_BANDWIDTH 7621 #ifdef CONFIG_CFS_BANDWIDTH
7622 static DEFINE_MUTEX(cfs_constraints_mutex); 7622 static DEFINE_MUTEX(cfs_constraints_mutex);
7623 7623
7624 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ 7624 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
7625 const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ 7625 const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
7626 7626
7627 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); 7627 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
7628 7628
7629 static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) 7629 static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7630 { 7630 {
7631 int i, ret = 0, runtime_enabled, runtime_was_enabled; 7631 int i, ret = 0, runtime_enabled, runtime_was_enabled;
7632 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7632 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7633 7633
7634 if (tg == &root_task_group) 7634 if (tg == &root_task_group)
7635 return -EINVAL; 7635 return -EINVAL;
7636 7636
7637 /* 7637 /*
7638 * Ensure we have at least some amount of bandwidth every period. This is 7638 * Ensure we have at least some amount of bandwidth every period. This is
7639 * to prevent reaching a state of large arrears when throttled via 7639 * to prevent reaching a state of large arrears when throttled via
7640 * entity_tick() resulting in prolonged exit starvation. 7640 * entity_tick() resulting in prolonged exit starvation.
7641 */ 7641 */
7642 if (quota < min_cfs_quota_period || period < min_cfs_quota_period) 7642 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
7643 return -EINVAL; 7643 return -EINVAL;
7644 7644
7645 /* 7645 /*
7646 * Likewise, bound things on the other side by preventing insane quota 7646 * Likewise, bound things on the other side by preventing insane quota
7647 * periods. This also allows us to normalize in computing quota 7647 * periods. This also allows us to normalize in computing quota
7648 * feasibility. 7648 * feasibility.
7649 */ 7649 */
7650 if (period > max_cfs_quota_period) 7650 if (period > max_cfs_quota_period)
7651 return -EINVAL; 7651 return -EINVAL;
7652 7652
7653 mutex_lock(&cfs_constraints_mutex); 7653 mutex_lock(&cfs_constraints_mutex);
7654 ret = __cfs_schedulable(tg, period, quota); 7654 ret = __cfs_schedulable(tg, period, quota);
7655 if (ret) 7655 if (ret)
7656 goto out_unlock; 7656 goto out_unlock;
7657 7657
7658 runtime_enabled = quota != RUNTIME_INF; 7658 runtime_enabled = quota != RUNTIME_INF;
7659 runtime_was_enabled = cfs_b->quota != RUNTIME_INF; 7659 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7660 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); 7660 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
7661 raw_spin_lock_irq(&cfs_b->lock); 7661 raw_spin_lock_irq(&cfs_b->lock);
7662 cfs_b->period = ns_to_ktime(period); 7662 cfs_b->period = ns_to_ktime(period);
7663 cfs_b->quota = quota; 7663 cfs_b->quota = quota;
7664 7664
7665 __refill_cfs_bandwidth_runtime(cfs_b); 7665 __refill_cfs_bandwidth_runtime(cfs_b);
7666 /* restart the period timer (if active) to handle new period expiry */ 7666 /* restart the period timer (if active) to handle new period expiry */
7667 if (runtime_enabled && cfs_b->timer_active) { 7667 if (runtime_enabled && cfs_b->timer_active) {
7668 /* force a reprogram */ 7668 /* force a reprogram */
7669 cfs_b->timer_active = 0; 7669 cfs_b->timer_active = 0;
7670 __start_cfs_bandwidth(cfs_b); 7670 __start_cfs_bandwidth(cfs_b);
7671 } 7671 }
7672 raw_spin_unlock_irq(&cfs_b->lock); 7672 raw_spin_unlock_irq(&cfs_b->lock);
7673 7673
7674 for_each_possible_cpu(i) { 7674 for_each_possible_cpu(i) {
7675 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 7675 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
7676 struct rq *rq = cfs_rq->rq; 7676 struct rq *rq = cfs_rq->rq;
7677 7677
7678 raw_spin_lock_irq(&rq->lock); 7678 raw_spin_lock_irq(&rq->lock);
7679 cfs_rq->runtime_enabled = runtime_enabled; 7679 cfs_rq->runtime_enabled = runtime_enabled;
7680 cfs_rq->runtime_remaining = 0; 7680 cfs_rq->runtime_remaining = 0;
7681 7681
7682 if (cfs_rq->throttled) 7682 if (cfs_rq->throttled)
7683 unthrottle_cfs_rq(cfs_rq); 7683 unthrottle_cfs_rq(cfs_rq);
7684 raw_spin_unlock_irq(&rq->lock); 7684 raw_spin_unlock_irq(&rq->lock);
7685 } 7685 }
7686 out_unlock: 7686 out_unlock:
7687 mutex_unlock(&cfs_constraints_mutex); 7687 mutex_unlock(&cfs_constraints_mutex);
7688 7688
7689 return ret; 7689 return ret;
7690 } 7690 }
7691 7691
7692 int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) 7692 int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
7693 { 7693 {
7694 u64 quota, period; 7694 u64 quota, period;
7695 7695
7696 period = ktime_to_ns(tg->cfs_bandwidth.period); 7696 period = ktime_to_ns(tg->cfs_bandwidth.period);
7697 if (cfs_quota_us < 0) 7697 if (cfs_quota_us < 0)
7698 quota = RUNTIME_INF; 7698 quota = RUNTIME_INF;
7699 else 7699 else
7700 quota = (u64)cfs_quota_us * NSEC_PER_USEC; 7700 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
7701 7701
7702 return tg_set_cfs_bandwidth(tg, period, quota); 7702 return tg_set_cfs_bandwidth(tg, period, quota);
7703 } 7703 }
7704 7704
7705 long tg_get_cfs_quota(struct task_group *tg) 7705 long tg_get_cfs_quota(struct task_group *tg)
7706 { 7706 {
7707 u64 quota_us; 7707 u64 quota_us;
7708 7708
7709 if (tg->cfs_bandwidth.quota == RUNTIME_INF) 7709 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
7710 return -1; 7710 return -1;
7711 7711
7712 quota_us = tg->cfs_bandwidth.quota; 7712 quota_us = tg->cfs_bandwidth.quota;
7713 do_div(quota_us, NSEC_PER_USEC); 7713 do_div(quota_us, NSEC_PER_USEC);
7714 7714
7715 return quota_us; 7715 return quota_us;
7716 } 7716 }
7717 7717
7718 int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) 7718 int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
7719 { 7719 {
7720 u64 quota, period; 7720 u64 quota, period;
7721 7721
7722 period = (u64)cfs_period_us * NSEC_PER_USEC; 7722 period = (u64)cfs_period_us * NSEC_PER_USEC;
7723 quota = tg->cfs_bandwidth.quota; 7723 quota = tg->cfs_bandwidth.quota;
7724 7724
7725 return tg_set_cfs_bandwidth(tg, period, quota); 7725 return tg_set_cfs_bandwidth(tg, period, quota);
7726 } 7726 }
7727 7727
7728 long tg_get_cfs_period(struct task_group *tg) 7728 long tg_get_cfs_period(struct task_group *tg)
7729 { 7729 {
7730 u64 cfs_period_us; 7730 u64 cfs_period_us;
7731 7731
7732 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); 7732 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
7733 do_div(cfs_period_us, NSEC_PER_USEC); 7733 do_div(cfs_period_us, NSEC_PER_USEC);
7734 7734
7735 return cfs_period_us; 7735 return cfs_period_us;
7736 } 7736 }
7737 7737
7738 static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) 7738 static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
7739 { 7739 {
7740 return tg_get_cfs_quota(cgroup_tg(cgrp)); 7740 return tg_get_cfs_quota(cgroup_tg(cgrp));
7741 } 7741 }
7742 7742
7743 static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, 7743 static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
7744 s64 cfs_quota_us) 7744 s64 cfs_quota_us)
7745 { 7745 {
7746 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); 7746 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
7747 } 7747 }
7748 7748
7749 static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) 7749 static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
7750 { 7750 {
7751 return tg_get_cfs_period(cgroup_tg(cgrp)); 7751 return tg_get_cfs_period(cgroup_tg(cgrp));
7752 } 7752 }
7753 7753
7754 static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, 7754 static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
7755 u64 cfs_period_us) 7755 u64 cfs_period_us)
7756 { 7756 {
7757 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); 7757 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
7758 } 7758 }
7759 7759
7760 struct cfs_schedulable_data { 7760 struct cfs_schedulable_data {
7761 struct task_group *tg; 7761 struct task_group *tg;
7762 u64 period, quota; 7762 u64 period, quota;
7763 }; 7763 };
7764 7764
7765 /* 7765 /*
7766 * normalize group quota/period to be quota/max_period 7766 * normalize group quota/period to be quota/max_period
7767 * note: units are usecs 7767 * note: units are usecs
7768 */ 7768 */
7769 static u64 normalize_cfs_quota(struct task_group *tg, 7769 static u64 normalize_cfs_quota(struct task_group *tg,
7770 struct cfs_schedulable_data *d) 7770 struct cfs_schedulable_data *d)
7771 { 7771 {
7772 u64 quota, period; 7772 u64 quota, period;
7773 7773
7774 if (tg == d->tg) { 7774 if (tg == d->tg) {
7775 period = d->period; 7775 period = d->period;
7776 quota = d->quota; 7776 quota = d->quota;
7777 } else { 7777 } else {
7778 period = tg_get_cfs_period(tg); 7778 period = tg_get_cfs_period(tg);
7779 quota = tg_get_cfs_quota(tg); 7779 quota = tg_get_cfs_quota(tg);
7780 } 7780 }
7781 7781
7782 /* note: these should typically be equivalent */ 7782 /* note: these should typically be equivalent */
7783 if (quota == RUNTIME_INF || quota == -1) 7783 if (quota == RUNTIME_INF || quota == -1)
7784 return RUNTIME_INF; 7784 return RUNTIME_INF;
7785 7785
7786 return to_ratio(period, quota); 7786 return to_ratio(period, quota);
7787 } 7787 }
7788 7788
7789 static int tg_cfs_schedulable_down(struct task_group *tg, void *data) 7789 static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
7790 { 7790 {
7791 struct cfs_schedulable_data *d = data; 7791 struct cfs_schedulable_data *d = data;
7792 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7792 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7793 s64 quota = 0, parent_quota = -1; 7793 s64 quota = 0, parent_quota = -1;
7794 7794
7795 if (!tg->parent) { 7795 if (!tg->parent) {
7796 quota = RUNTIME_INF; 7796 quota = RUNTIME_INF;
7797 } else { 7797 } else {
7798 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; 7798 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
7799 7799
7800 quota = normalize_cfs_quota(tg, d); 7800 quota = normalize_cfs_quota(tg, d);
7801 parent_quota = parent_b->hierarchal_quota; 7801 parent_quota = parent_b->hierarchal_quota;
7802 7802
7803 /* 7803 /*
7804 * ensure max(child_quota) <= parent_quota, inherit when no 7804 * ensure max(child_quota) <= parent_quota, inherit when no
7805 * limit is set 7805 * limit is set
7806 */ 7806 */
7807 if (quota == RUNTIME_INF) 7807 if (quota == RUNTIME_INF)
7808 quota = parent_quota; 7808 quota = parent_quota;
7809 else if (parent_quota != RUNTIME_INF && quota > parent_quota) 7809 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
7810 return -EINVAL; 7810 return -EINVAL;
7811 } 7811 }
7812 cfs_b->hierarchal_quota = quota; 7812 cfs_b->hierarchal_quota = quota;
7813 7813
7814 return 0; 7814 return 0;
7815 } 7815 }
7816 7816
7817 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) 7817 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7818 { 7818 {
7819 int ret; 7819 int ret;
7820 struct cfs_schedulable_data data = { 7820 struct cfs_schedulable_data data = {
7821 .tg = tg, 7821 .tg = tg,
7822 .period = period, 7822 .period = period,
7823 .quota = quota, 7823 .quota = quota,
7824 }; 7824 };
7825 7825
7826 if (quota != RUNTIME_INF) { 7826 if (quota != RUNTIME_INF) {
7827 do_div(data.period, NSEC_PER_USEC); 7827 do_div(data.period, NSEC_PER_USEC);
7828 do_div(data.quota, NSEC_PER_USEC); 7828 do_div(data.quota, NSEC_PER_USEC);
7829 } 7829 }
7830 7830
7831 rcu_read_lock(); 7831 rcu_read_lock();
7832 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); 7832 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
7833 rcu_read_unlock(); 7833 rcu_read_unlock();
7834 7834
7835 return ret; 7835 return ret;
7836 } 7836 }
7837 7837
7838 static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, 7838 static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
7839 struct cgroup_map_cb *cb) 7839 struct cgroup_map_cb *cb)
7840 { 7840 {
7841 struct task_group *tg = cgroup_tg(cgrp); 7841 struct task_group *tg = cgroup_tg(cgrp);
7842 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7842 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7843 7843
7844 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7844 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
7845 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); 7845 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
7846 cb->fill(cb, "throttled_time", cfs_b->throttled_time); 7846 cb->fill(cb, "throttled_time", cfs_b->throttled_time);
7847 7847
7848 return 0; 7848 return 0;
7849 } 7849 }
7850 #endif /* CONFIG_CFS_BANDWIDTH */ 7850 #endif /* CONFIG_CFS_BANDWIDTH */
7851 #endif /* CONFIG_FAIR_GROUP_SCHED */ 7851 #endif /* CONFIG_FAIR_GROUP_SCHED */
7852 7852
7853 #ifdef CONFIG_RT_GROUP_SCHED 7853 #ifdef CONFIG_RT_GROUP_SCHED
7854 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 7854 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
7855 s64 val) 7855 s64 val)
7856 { 7856 {
7857 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); 7857 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
7858 } 7858 }
7859 7859
7860 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) 7860 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
7861 { 7861 {
7862 return sched_group_rt_runtime(cgroup_tg(cgrp)); 7862 return sched_group_rt_runtime(cgroup_tg(cgrp));
7863 } 7863 }
7864 7864
7865 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, 7865 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
7866 u64 rt_period_us) 7866 u64 rt_period_us)
7867 { 7867 {
7868 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); 7868 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
7869 } 7869 }
7870 7870
7871 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) 7871 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
7872 { 7872 {
7873 return sched_group_rt_period(cgroup_tg(cgrp)); 7873 return sched_group_rt_period(cgroup_tg(cgrp));
7874 } 7874 }
7875 #endif /* CONFIG_RT_GROUP_SCHED */ 7875 #endif /* CONFIG_RT_GROUP_SCHED */
7876 7876
7877 static struct cftype cpu_files[] = { 7877 static struct cftype cpu_files[] = {
7878 #ifdef CONFIG_FAIR_GROUP_SCHED 7878 #ifdef CONFIG_FAIR_GROUP_SCHED
7879 { 7879 {
7880 .name = "shares", 7880 .name = "shares",
7881 .read_u64 = cpu_shares_read_u64, 7881 .read_u64 = cpu_shares_read_u64,
7882 .write_u64 = cpu_shares_write_u64, 7882 .write_u64 = cpu_shares_write_u64,
7883 }, 7883 },
7884 #endif 7884 #endif
7885 #ifdef CONFIG_CFS_BANDWIDTH 7885 #ifdef CONFIG_CFS_BANDWIDTH
7886 { 7886 {
7887 .name = "cfs_quota_us", 7887 .name = "cfs_quota_us",
7888 .read_s64 = cpu_cfs_quota_read_s64, 7888 .read_s64 = cpu_cfs_quota_read_s64,
7889 .write_s64 = cpu_cfs_quota_write_s64, 7889 .write_s64 = cpu_cfs_quota_write_s64,
7890 }, 7890 },
7891 { 7891 {
7892 .name = "cfs_period_us", 7892 .name = "cfs_period_us",
7893 .read_u64 = cpu_cfs_period_read_u64, 7893 .read_u64 = cpu_cfs_period_read_u64,
7894 .write_u64 = cpu_cfs_period_write_u64, 7894 .write_u64 = cpu_cfs_period_write_u64,
7895 }, 7895 },
7896 { 7896 {
7897 .name = "stat", 7897 .name = "stat",
7898 .read_map = cpu_stats_show, 7898 .read_map = cpu_stats_show,
7899 }, 7899 },
7900 #endif 7900 #endif
7901 #ifdef CONFIG_RT_GROUP_SCHED 7901 #ifdef CONFIG_RT_GROUP_SCHED
7902 { 7902 {
7903 .name = "rt_runtime_us", 7903 .name = "rt_runtime_us",
7904 .read_s64 = cpu_rt_runtime_read, 7904 .read_s64 = cpu_rt_runtime_read,
7905 .write_s64 = cpu_rt_runtime_write, 7905 .write_s64 = cpu_rt_runtime_write,
7906 }, 7906 },
7907 { 7907 {
7908 .name = "rt_period_us", 7908 .name = "rt_period_us",
7909 .read_u64 = cpu_rt_period_read_uint, 7909 .read_u64 = cpu_rt_period_read_uint,
7910 .write_u64 = cpu_rt_period_write_uint, 7910 .write_u64 = cpu_rt_period_write_uint,
7911 }, 7911 },
7912 #endif 7912 #endif
7913 }; 7913 };
7914 7914
7915 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) 7915 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
7916 { 7916 {
7917 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); 7917 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
7918 } 7918 }
7919 7919
7920 struct cgroup_subsys cpu_cgroup_subsys = { 7920 struct cgroup_subsys cpu_cgroup_subsys = {
7921 .name = "cpu", 7921 .name = "cpu",
7922 .create = cpu_cgroup_create, 7922 .create = cpu_cgroup_create,
7923 .destroy = cpu_cgroup_destroy, 7923 .destroy = cpu_cgroup_destroy,
7924 .can_attach = cpu_cgroup_can_attach, 7924 .can_attach = cpu_cgroup_can_attach,
7925 .attach = cpu_cgroup_attach, 7925 .attach = cpu_cgroup_attach,
7926 .exit = cpu_cgroup_exit, 7926 .exit = cpu_cgroup_exit,
7927 .populate = cpu_cgroup_populate, 7927 .populate = cpu_cgroup_populate,
7928 .subsys_id = cpu_cgroup_subsys_id, 7928 .subsys_id = cpu_cgroup_subsys_id,
7929 .early_init = 1, 7929 .early_init = 1,
7930 }; 7930 };
7931 7931
7932 #endif /* CONFIG_CGROUP_SCHED */ 7932 #endif /* CONFIG_CGROUP_SCHED */
7933 7933
7934 #ifdef CONFIG_CGROUP_CPUACCT 7934 #ifdef CONFIG_CGROUP_CPUACCT
7935 7935
7936 /* 7936 /*
7937 * CPU accounting code for task groups. 7937 * CPU accounting code for task groups.
7938 * 7938 *
7939 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh 7939 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
7940 * (balbir@in.ibm.com). 7940 * (balbir@in.ibm.com).
7941 */ 7941 */
7942 7942
7943 /* create a new cpu accounting group */ 7943 /* create a new cpu accounting group */
7944 static struct cgroup_subsys_state *cpuacct_create( 7944 static struct cgroup_subsys_state *cpuacct_create(
7945 struct cgroup_subsys *ss, struct cgroup *cgrp) 7945 struct cgroup_subsys *ss, struct cgroup *cgrp)
7946 { 7946 {
7947 struct cpuacct *ca; 7947 struct cpuacct *ca;
7948 7948
7949 if (!cgrp->parent) 7949 if (!cgrp->parent)
7950 return &root_cpuacct.css; 7950 return &root_cpuacct.css;
7951 7951
7952 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 7952 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
7953 if (!ca) 7953 if (!ca)
7954 goto out; 7954 goto out;
7955 7955
7956 ca->cpuusage = alloc_percpu(u64); 7956 ca->cpuusage = alloc_percpu(u64);
7957 if (!ca->cpuusage) 7957 if (!ca->cpuusage)
7958 goto out_free_ca; 7958 goto out_free_ca;
7959 7959
7960 ca->cpustat = alloc_percpu(struct kernel_cpustat); 7960 ca->cpustat = alloc_percpu(struct kernel_cpustat);
7961 if (!ca->cpustat) 7961 if (!ca->cpustat)
7962 goto out_free_cpuusage; 7962 goto out_free_cpuusage;
7963 7963
7964 return &ca->css; 7964 return &ca->css;
7965 7965
7966 out_free_cpuusage: 7966 out_free_cpuusage:
7967 free_percpu(ca->cpuusage); 7967 free_percpu(ca->cpuusage);
7968 out_free_ca: 7968 out_free_ca:
7969 kfree(ca); 7969 kfree(ca);
7970 out: 7970 out:
7971 return ERR_PTR(-ENOMEM); 7971 return ERR_PTR(-ENOMEM);
7972 } 7972 }
7973 7973
7974 /* destroy an existing cpu accounting group */ 7974 /* destroy an existing cpu accounting group */
7975 static void 7975 static void
7976 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 7976 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
7977 { 7977 {
7978 struct cpuacct *ca = cgroup_ca(cgrp); 7978 struct cpuacct *ca = cgroup_ca(cgrp);
7979 7979
7980 free_percpu(ca->cpustat); 7980 free_percpu(ca->cpustat);
7981 free_percpu(ca->cpuusage); 7981 free_percpu(ca->cpuusage);
7982 kfree(ca); 7982 kfree(ca);
7983 } 7983 }
7984 7984
7985 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) 7985 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
7986 { 7986 {
7987 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 7987 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
7988 u64 data; 7988 u64 data;
7989 7989
7990 #ifndef CONFIG_64BIT 7990 #ifndef CONFIG_64BIT
7991 /* 7991 /*
7992 * Take rq->lock to make 64-bit read safe on 32-bit platforms. 7992 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
7993 */ 7993 */
7994 raw_spin_lock_irq(&cpu_rq(cpu)->lock); 7994 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
7995 data = *cpuusage; 7995 data = *cpuusage;
7996 raw_spin_unlock_irq(&cpu_rq(cpu)->lock); 7996 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
7997 #else 7997 #else
7998 data = *cpuusage; 7998 data = *cpuusage;
7999 #endif 7999 #endif
8000 8000
8001 return data; 8001 return data;
8002 } 8002 }
8003 8003
8004 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) 8004 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
8005 { 8005 {
8006 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 8006 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8007 8007
8008 #ifndef CONFIG_64BIT 8008 #ifndef CONFIG_64BIT
8009 /* 8009 /*
8010 * Take rq->lock to make 64-bit write safe on 32-bit platforms. 8010 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
8011 */ 8011 */
8012 raw_spin_lock_irq(&cpu_rq(cpu)->lock); 8012 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8013 *cpuusage = val; 8013 *cpuusage = val;
8014 raw_spin_unlock_irq(&cpu_rq(cpu)->lock); 8014 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8015 #else 8015 #else
8016 *cpuusage = val; 8016 *cpuusage = val;
8017 #endif 8017 #endif
8018 } 8018 }
8019 8019
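The locking comments in cpuacct_cpuusage_read()/cpuacct_cpuusage_write() above deserve a concrete picture: on a 32-bit machine a 64-bit counter is accessed as two 32-bit halves, so an unlocked reader can observe a value the writer never stored. The standalone userspace illustration below is not kernel code (all names are invented); it simulates the problematic interleaving explicitly.

    #include <stdio.h>
    #include <stdint.h>

    /* A u64 counter as a 32-bit CPU sees it: two separately loaded halves. */
    struct split_u64 {
            uint32_t lo;
            uint32_t hi;
    };

    int main(void)
    {
            struct split_u64 counter = { .lo = 0xffffffffu, .hi = 0 }; /* 0x00000000ffffffff */

            uint32_t lo = counter.lo;     /* reader loads the low half            */

            counter.lo = 0;               /* writer stores 0x0000000100000000 ... */
            counter.hi = 1;               /* ... between the two reader loads     */

            uint32_t hi = counter.hi;     /* reader loads the high half           */

            uint64_t seen = ((uint64_t)hi << 32) | lo;
            printf("torn read: 0x%016llx (neither the old nor the new value)\n",
                   (unsigned long long)seen);   /* prints 0x00000001ffffffff      */
            return 0;
    }

Holding rq->lock around the access, as the two functions above do, rules out exactly this interleaving.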
8020 /* return total cpu usage (in nanoseconds) of a group */ 8020 /* return total cpu usage (in nanoseconds) of a group */
8021 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) 8021 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
8022 { 8022 {
8023 struct cpuacct *ca = cgroup_ca(cgrp); 8023 struct cpuacct *ca = cgroup_ca(cgrp);
8024 u64 totalcpuusage = 0; 8024 u64 totalcpuusage = 0;
8025 int i; 8025 int i;
8026 8026
8027 for_each_present_cpu(i) 8027 for_each_present_cpu(i)
8028 totalcpuusage += cpuacct_cpuusage_read(ca, i); 8028 totalcpuusage += cpuacct_cpuusage_read(ca, i);
8029 8029
8030 return totalcpuusage; 8030 return totalcpuusage;
8031 } 8031 }
8032 8032
8033 static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, 8033 static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
8034 u64 reset) 8034 u64 reset)
8035 { 8035 {
8036 struct cpuacct *ca = cgroup_ca(cgrp); 8036 struct cpuacct *ca = cgroup_ca(cgrp);
8037 int err = 0; 8037 int err = 0;
8038 int i; 8038 int i;
8039 8039
8040 if (reset) { 8040 if (reset) {
8041 err = -EINVAL; 8041 err = -EINVAL;
8042 goto out; 8042 goto out;
8043 } 8043 }
8044 8044
8045 for_each_present_cpu(i) 8045 for_each_present_cpu(i)
8046 cpuacct_cpuusage_write(ca, i, 0); 8046 cpuacct_cpuusage_write(ca, i, 0);
8047 8047
8048 out: 8048 out:
8049 return err; 8049 return err;
8050 } 8050 }
8051 8051
8052 static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, 8052 static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
8053 struct seq_file *m) 8053 struct seq_file *m)
8054 { 8054 {
8055 struct cpuacct *ca = cgroup_ca(cgroup); 8055 struct cpuacct *ca = cgroup_ca(cgroup);
8056 u64 percpu; 8056 u64 percpu;
8057 int i; 8057 int i;
8058 8058
8059 for_each_present_cpu(i) { 8059 for_each_present_cpu(i) {
8060 percpu = cpuacct_cpuusage_read(ca, i); 8060 percpu = cpuacct_cpuusage_read(ca, i);
8061 seq_printf(m, "%llu ", (unsigned long long) percpu); 8061 seq_printf(m, "%llu ", (unsigned long long) percpu);
8062 } 8062 }
8063 seq_printf(m, "\n"); 8063 seq_printf(m, "\n");
8064 return 0; 8064 return 0;
8065 } 8065 }
8066 8066
8067 static const char *cpuacct_stat_desc[] = { 8067 static const char *cpuacct_stat_desc[] = {
8068 [CPUACCT_STAT_USER] = "user", 8068 [CPUACCT_STAT_USER] = "user",
8069 [CPUACCT_STAT_SYSTEM] = "system", 8069 [CPUACCT_STAT_SYSTEM] = "system",
8070 }; 8070 };
8071 8071
8072 static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 8072 static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
8073 struct cgroup_map_cb *cb) 8073 struct cgroup_map_cb *cb)
8074 { 8074 {
8075 struct cpuacct *ca = cgroup_ca(cgrp); 8075 struct cpuacct *ca = cgroup_ca(cgrp);
8076 int cpu; 8076 int cpu;
8077 s64 val = 0; 8077 s64 val = 0;
8078 8078
8079 for_each_online_cpu(cpu) { 8079 for_each_online_cpu(cpu) {
8080 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); 8080 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8081 val += kcpustat->cpustat[CPUTIME_USER]; 8081 val += kcpustat->cpustat[CPUTIME_USER];
8082 val += kcpustat->cpustat[CPUTIME_NICE]; 8082 val += kcpustat->cpustat[CPUTIME_NICE];
8083 } 8083 }
8084 val = cputime64_to_clock_t(val); 8084 val = cputime64_to_clock_t(val);
8085 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); 8085 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
8086 8086
8087 val = 0; 8087 val = 0;
8088 for_each_online_cpu(cpu) { 8088 for_each_online_cpu(cpu) {
8089 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); 8089 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8090 val += kcpustat->cpustat[CPUTIME_SYSTEM]; 8090 val += kcpustat->cpustat[CPUTIME_SYSTEM];
8091 val += kcpustat->cpustat[CPUTIME_IRQ]; 8091 val += kcpustat->cpustat[CPUTIME_IRQ];
8092 val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; 8092 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
8093 } 8093 }
8094 8094
8095 val = cputime64_to_clock_t(val); 8095 val = cputime64_to_clock_t(val);
8096 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); 8096 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
8097 8097
8098 return 0; 8098 return 0;
8099 } 8099 }
8100 8100
8101 static struct cftype files[] = { 8101 static struct cftype files[] = {
8102 { 8102 {
8103 .name = "usage", 8103 .name = "usage",
8104 .read_u64 = cpuusage_read, 8104 .read_u64 = cpuusage_read,
8105 .write_u64 = cpuusage_write, 8105 .write_u64 = cpuusage_write,
8106 }, 8106 },
8107 { 8107 {
8108 .name = "usage_percpu", 8108 .name = "usage_percpu",
8109 .read_seq_string = cpuacct_percpu_seq_read, 8109 .read_seq_string = cpuacct_percpu_seq_read,
8110 }, 8110 },
8111 { 8111 {
8112 .name = "stat", 8112 .name = "stat",
8113 .read_map = cpuacct_stats_show, 8113 .read_map = cpuacct_stats_show,
8114 }, 8114 },
8115 }; 8115 };
8116 8116
8117 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) 8117 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
8118 { 8118 {
8119 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); 8119 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
8120 } 8120 }
8121 8121
8122 /* 8122 /*
8123 * charge this task's execution time to its accounting group. 8123 * charge this task's execution time to its accounting group.
8124 * 8124 *
8125 * called with rq->lock held. 8125 * called with rq->lock held.
8126 */ 8126 */
8127 void cpuacct_charge(struct task_struct *tsk, u64 cputime) 8127 void cpuacct_charge(struct task_struct *tsk, u64 cputime)
8128 { 8128 {
8129 struct cpuacct *ca; 8129 struct cpuacct *ca;
8130 int cpu; 8130 int cpu;
8131 8131
8132 if (unlikely(!cpuacct_subsys.active)) 8132 if (unlikely(!cpuacct_subsys.active))
8133 return; 8133 return;
8134 8134
8135 cpu = task_cpu(tsk); 8135 cpu = task_cpu(tsk);
8136 8136
8137 rcu_read_lock(); 8137 rcu_read_lock();
8138 8138
8139 ca = task_ca(tsk); 8139 ca = task_ca(tsk);
8140 8140
8141 for (; ca; ca = parent_ca(ca)) { 8141 for (; ca; ca = parent_ca(ca)) {
8142 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 8142 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8143 *cpuusage += cputime; 8143 *cpuusage += cputime;
8144 } 8144 }
8145 8145
8146 rcu_read_unlock(); 8146 rcu_read_unlock();
8147 } 8147 }
8148 8148
8149 struct cgroup_subsys cpuacct_subsys = { 8149 struct cgroup_subsys cpuacct_subsys = {
8150 .name = "cpuacct", 8150 .name = "cpuacct",
8151 .create = cpuacct_create, 8151 .create = cpuacct_create,
8152 .destroy = cpuacct_destroy, 8152 .destroy = cpuacct_destroy,
8153 .populate = cpuacct_populate, 8153 .populate = cpuacct_populate,
8154 .subsys_id = cpuacct_subsys_id, 8154 .subsys_id = cpuacct_subsys_id,
8155 }; 8155 };
8156 #endif /* CONFIG_CGROUP_CPUACCT */ 8156 #endif /* CONFIG_CGROUP_CPUACCT */
8157 8157
1 /* 1 /*
2 * linux/kernel/softirq.c 2 * linux/kernel/softirq.c
3 * 3 *
4 * Copyright (C) 1992 Linus Torvalds 4 * Copyright (C) 1992 Linus Torvalds
5 * 5 *
6 * Distribute under GPLv2. 6 * Distribute under GPLv2.
7 * 7 *
8 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) 8 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
9 * 9 *
10 * Remote softirq infrastructure is by Jens Axboe. 10 * Remote softirq infrastructure is by Jens Axboe.
11 */ 11 */
12 12
13 #include <linux/export.h> 13 #include <linux/export.h>
14 #include <linux/kernel_stat.h> 14 #include <linux/kernel_stat.h>
15 #include <linux/interrupt.h> 15 #include <linux/interrupt.h>
16 #include <linux/init.h> 16 #include <linux/init.h>
17 #include <linux/mm.h> 17 #include <linux/mm.h>
18 #include <linux/notifier.h> 18 #include <linux/notifier.h>
19 #include <linux/percpu.h> 19 #include <linux/percpu.h>
20 #include <linux/cpu.h> 20 #include <linux/cpu.h>
21 #include <linux/freezer.h> 21 #include <linux/freezer.h>
22 #include <linux/kthread.h> 22 #include <linux/kthread.h>
23 #include <linux/rcupdate.h> 23 #include <linux/rcupdate.h>
24 #include <linux/ftrace.h> 24 #include <linux/ftrace.h>
25 #include <linux/smp.h> 25 #include <linux/smp.h>
26 #include <linux/tick.h> 26 #include <linux/tick.h>
27 27
28 #define CREATE_TRACE_POINTS 28 #define CREATE_TRACE_POINTS
29 #include <trace/events/irq.h> 29 #include <trace/events/irq.h>
30 30
31 #include <asm/irq.h> 31 #include <asm/irq.h>
32 /* 32 /*
33 - No shared variables, all the data are CPU local. 33 - No shared variables, all the data are CPU local.
34 - If a softirq needs serialization, let it serialize itself 34 - If a softirq needs serialization, let it serialize itself
35 by its own spinlocks. 35 by its own spinlocks.
36 - Even if softirq is serialized, only local cpu is marked for 36 - Even if softirq is serialized, only local cpu is marked for
37 execution. Hence, we get something sort of weak cpu binding. 37 execution. Hence, we get something sort of weak cpu binding.
38 Though it is still not clear whether it will result in better locality 38 Though it is still not clear whether it will result in better locality
39 or not. 39 or not.
40 40
41 Examples: 41 Examples:
42 - NET RX softirq. It is multithreaded and does not require 42 - NET RX softirq. It is multithreaded and does not require
43 any global serialization. 43 any global serialization.
44 - NET TX softirq. It kicks software netdevice queues, hence 44 - NET TX softirq. It kicks software netdevice queues, hence
45 it is logically serialized per device, but this serialization 45 it is logically serialized per device, but this serialization
46 is invisible to common code. 46 is invisible to common code.
47 - Tasklets: serialized wrt itself. 47 - Tasklets: serialized wrt itself.
48 */ 48 */
49 49
50 #ifndef __ARCH_IRQ_STAT 50 #ifndef __ARCH_IRQ_STAT
51 irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned; 51 irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned;
52 EXPORT_SYMBOL(irq_stat); 52 EXPORT_SYMBOL(irq_stat);
53 #endif 53 #endif
54 54
55 static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; 55 static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
56 56
57 DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 57 DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
58 58
59 char *softirq_to_name[NR_SOFTIRQS] = { 59 char *softirq_to_name[NR_SOFTIRQS] = {
60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", 60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
61 "TASKLET", "SCHED", "HRTIMER", "RCU" 61 "TASKLET", "SCHED", "HRTIMER", "RCU"
62 }; 62 };
63 63
64 /* 64 /*
65 * we cannot loop here indefinitely without risking userspace starvation, 65 * we cannot loop here indefinitely without risking userspace starvation,
66 * but we also don't want to introduce a worst case 1/HZ latency 66 * but we also don't want to introduce a worst case 1/HZ latency
67 * for the pending events, so let the scheduler balance 67 * for the pending events, so let the scheduler balance
68 * the softirq load for us. 68 * the softirq load for us.
69 */ 69 */
70 static void wakeup_softirqd(void) 70 static void wakeup_softirqd(void)
71 { 71 {
72 /* Interrupts are disabled: no need to stop preemption */ 72 /* Interrupts are disabled: no need to stop preemption */
73 struct task_struct *tsk = __this_cpu_read(ksoftirqd); 73 struct task_struct *tsk = __this_cpu_read(ksoftirqd);
74 74
75 if (tsk && tsk->state != TASK_RUNNING) 75 if (tsk && tsk->state != TASK_RUNNING)
76 wake_up_process(tsk); 76 wake_up_process(tsk);
77 } 77 }
78 78
79 /* 79 /*
80 * preempt_count and SOFTIRQ_OFFSET usage: 80 * preempt_count and SOFTIRQ_OFFSET usage:
81 * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving 81 * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
82 * softirq processing. 82 * softirq processing.
83 * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET) 83 * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
84 * on local_bh_disable or local_bh_enable. 84 * on local_bh_disable or local_bh_enable.
85 * This lets us distinguish between whether we are currently processing 85 * This lets us distinguish between whether we are currently processing
86 * softirq and whether we just have bh disabled. 86 * softirq and whether we just have bh disabled.
87 */ 87 */
88 88
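To make the offset scheme above concrete: softirq processing bumps preempt_count by SOFTIRQ_OFFSET while local_bh_disable() bumps it by twice that amount, so the low softirq bit is set only while a softirq handler is actually running. The mock below is a userspace illustration with invented names and values; the real constants and predicates live in the preempt/hardirq headers and are not reproduced here.

    #include <stdio.h>

    #define MOCK_SOFTIRQ_OFFSET          0x100
    #define MOCK_SOFTIRQ_DISABLE_OFFSET  (2 * MOCK_SOFTIRQ_OFFSET)
    #define MOCK_SOFTIRQ_MASK            0xff00

    static unsigned int mock_preempt_count;

    static unsigned int mock_softirq_count(void)
    {
            return mock_preempt_count & MOCK_SOFTIRQ_MASK;
    }

    /* Set only while a softirq runs (the count was bumped by the odd OFFSET). */
    static int mock_in_serving_softirq(void)
    {
            return mock_softirq_count() & MOCK_SOFTIRQ_OFFSET;
    }

    int main(void)
    {
            mock_preempt_count += MOCK_SOFTIRQ_DISABLE_OFFSET;  /* local_bh_disable() */
            printf("bh disabled:     serving=%d\n", !!mock_in_serving_softirq());

            mock_preempt_count += MOCK_SOFTIRQ_OFFSET;          /* enter softirq      */
            printf("serving softirq: serving=%d\n", !!mock_in_serving_softirq());
            return 0;
    }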
89 /* 89 /*
90 * This one is for softirq.c-internal use, 90 * This one is for softirq.c-internal use,
91 * where hardirqs are disabled legitimately: 91 * where hardirqs are disabled legitimately:
92 */ 92 */
93 #ifdef CONFIG_TRACE_IRQFLAGS 93 #ifdef CONFIG_TRACE_IRQFLAGS
94 static void __local_bh_disable(unsigned long ip, unsigned int cnt) 94 static void __local_bh_disable(unsigned long ip, unsigned int cnt)
95 { 95 {
96 unsigned long flags; 96 unsigned long flags;
97 97
98 WARN_ON_ONCE(in_irq()); 98 WARN_ON_ONCE(in_irq());
99 99
100 raw_local_irq_save(flags); 100 raw_local_irq_save(flags);
101 /* 101 /*
102 * The preempt tracer hooks into add_preempt_count and will break 102 * The preempt tracer hooks into add_preempt_count and will break
103 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET 103 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
104 * is set and before current->softirq_enabled is cleared. 104 * is set and before current->softirq_enabled is cleared.
105 * We must manually increment preempt_count here and manually 105 * We must manually increment preempt_count here and manually
106 * call the trace_preempt_off later. 106 * call the trace_preempt_off later.
107 */ 107 */
108 preempt_count() += cnt; 108 preempt_count() += cnt;
109 /* 109 /*
110 * Were softirqs turned off above: 110 * Were softirqs turned off above:
111 */ 111 */
112 if (softirq_count() == cnt) 112 if (softirq_count() == cnt)
113 trace_softirqs_off(ip); 113 trace_softirqs_off(ip);
114 raw_local_irq_restore(flags); 114 raw_local_irq_restore(flags);
115 115
116 if (preempt_count() == cnt) 116 if (preempt_count() == cnt)
117 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 117 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
118 } 118 }
119 #else /* !CONFIG_TRACE_IRQFLAGS */ 119 #else /* !CONFIG_TRACE_IRQFLAGS */
120 static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) 120 static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
121 { 121 {
122 add_preempt_count(cnt); 122 add_preempt_count(cnt);
123 barrier(); 123 barrier();
124 } 124 }
125 #endif /* CONFIG_TRACE_IRQFLAGS */ 125 #endif /* CONFIG_TRACE_IRQFLAGS */
126 126
127 void local_bh_disable(void) 127 void local_bh_disable(void)
128 { 128 {
129 __local_bh_disable((unsigned long)__builtin_return_address(0), 129 __local_bh_disable((unsigned long)__builtin_return_address(0),
130 SOFTIRQ_DISABLE_OFFSET); 130 SOFTIRQ_DISABLE_OFFSET);
131 } 131 }
132 132
133 EXPORT_SYMBOL(local_bh_disable); 133 EXPORT_SYMBOL(local_bh_disable);
134 134
135 static void __local_bh_enable(unsigned int cnt) 135 static void __local_bh_enable(unsigned int cnt)
136 { 136 {
137 WARN_ON_ONCE(in_irq()); 137 WARN_ON_ONCE(in_irq());
138 WARN_ON_ONCE(!irqs_disabled()); 138 WARN_ON_ONCE(!irqs_disabled());
139 139
140 if (softirq_count() == cnt) 140 if (softirq_count() == cnt)
141 trace_softirqs_on((unsigned long)__builtin_return_address(0)); 141 trace_softirqs_on((unsigned long)__builtin_return_address(0));
142 sub_preempt_count(cnt); 142 sub_preempt_count(cnt);
143 } 143 }
144 144
145 /* 145 /*
146 * Special-case - softirqs can safely be enabled in 146 * Special-case - softirqs can safely be enabled in
147 * cond_resched_softirq(), or by __do_softirq(), 147 * cond_resched_softirq(), or by __do_softirq(),
148 * without processing still-pending softirqs: 148 * without processing still-pending softirqs:
149 */ 149 */
150 void _local_bh_enable(void) 150 void _local_bh_enable(void)
151 { 151 {
152 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); 152 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
153 } 153 }
154 154
155 EXPORT_SYMBOL(_local_bh_enable); 155 EXPORT_SYMBOL(_local_bh_enable);
156 156
157 static inline void _local_bh_enable_ip(unsigned long ip) 157 static inline void _local_bh_enable_ip(unsigned long ip)
158 { 158 {
159 WARN_ON_ONCE(in_irq() || irqs_disabled()); 159 WARN_ON_ONCE(in_irq() || irqs_disabled());
160 #ifdef CONFIG_TRACE_IRQFLAGS 160 #ifdef CONFIG_TRACE_IRQFLAGS
161 local_irq_disable(); 161 local_irq_disable();
162 #endif 162 #endif
163 /* 163 /*
164 * Are softirqs going to be turned on now: 164 * Are softirqs going to be turned on now:
165 */ 165 */
166 if (softirq_count() == SOFTIRQ_DISABLE_OFFSET) 166 if (softirq_count() == SOFTIRQ_DISABLE_OFFSET)
167 trace_softirqs_on(ip); 167 trace_softirqs_on(ip);
168 /* 168 /*
169 * Keep preemption disabled until we are done with 169 * Keep preemption disabled until we are done with
170 * softirq processing: 170 * softirq processing:
171 */ 171 */
172 sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); 172 sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1);
173 173
174 if (unlikely(!in_interrupt() && local_softirq_pending())) 174 if (unlikely(!in_interrupt() && local_softirq_pending()))
175 do_softirq(); 175 do_softirq();
176 176
177 dec_preempt_count(); 177 dec_preempt_count();
178 #ifdef CONFIG_TRACE_IRQFLAGS 178 #ifdef CONFIG_TRACE_IRQFLAGS
179 local_irq_enable(); 179 local_irq_enable();
180 #endif 180 #endif
181 preempt_check_resched(); 181 preempt_check_resched();
182 } 182 }
183 183
184 void local_bh_enable(void) 184 void local_bh_enable(void)
185 { 185 {
186 _local_bh_enable_ip((unsigned long)__builtin_return_address(0)); 186 _local_bh_enable_ip((unsigned long)__builtin_return_address(0));
187 } 187 }
188 EXPORT_SYMBOL(local_bh_enable); 188 EXPORT_SYMBOL(local_bh_enable);
189 189
190 void local_bh_enable_ip(unsigned long ip) 190 void local_bh_enable_ip(unsigned long ip)
191 { 191 {
192 _local_bh_enable_ip(ip); 192 _local_bh_enable_ip(ip);
193 } 193 }
194 EXPORT_SYMBOL(local_bh_enable_ip); 194 EXPORT_SYMBOL(local_bh_enable_ip);
195 195
196 /* 196 /*
197 * We restart softirq processing MAX_SOFTIRQ_RESTART times, 197 * We restart softirq processing MAX_SOFTIRQ_RESTART times,
198 * and we fall back to softirqd after that. 198 * and we fall back to softirqd after that.
199 * 199 *
200 * This number has been established via experimentation. 200 * This number has been established via experimentation.
201 * The two things to balance are latency and fairness - 201 * The two things to balance are latency and fairness -
202 * we want to handle softirqs as soon as possible, but they 202 * we want to handle softirqs as soon as possible, but they
203 * should not be able to lock up the box. 203 * should not be able to lock up the box.
204 */ 204 */
205 #define MAX_SOFTIRQ_RESTART 10 205 #define MAX_SOFTIRQ_RESTART 10
206 206
207 asmlinkage void __do_softirq(void) 207 asmlinkage void __do_softirq(void)
208 { 208 {
209 struct softirq_action *h; 209 struct softirq_action *h;
210 __u32 pending; 210 __u32 pending;
211 int max_restart = MAX_SOFTIRQ_RESTART; 211 int max_restart = MAX_SOFTIRQ_RESTART;
212 int cpu; 212 int cpu;
213 213
214 pending = local_softirq_pending(); 214 pending = local_softirq_pending();
215 account_system_vtime(current); 215 account_system_vtime(current);
216 216
217 __local_bh_disable((unsigned long)__builtin_return_address(0), 217 __local_bh_disable((unsigned long)__builtin_return_address(0),
218 SOFTIRQ_OFFSET); 218 SOFTIRQ_OFFSET);
219 lockdep_softirq_enter(); 219 lockdep_softirq_enter();
220 220
221 cpu = smp_processor_id(); 221 cpu = smp_processor_id();
222 restart: 222 restart:
223 /* Reset the pending bitmask before enabling irqs */ 223 /* Reset the pending bitmask before enabling irqs */
224 set_softirq_pending(0); 224 set_softirq_pending(0);
225 225
226 local_irq_enable(); 226 local_irq_enable();
227 227
228 h = softirq_vec; 228 h = softirq_vec;
229 229
230 do { 230 do {
231 if (pending & 1) { 231 if (pending & 1) {
232 unsigned int vec_nr = h - softirq_vec; 232 unsigned int vec_nr = h - softirq_vec;
233 int prev_count = preempt_count(); 233 int prev_count = preempt_count();
234 234
235 kstat_incr_softirqs_this_cpu(vec_nr); 235 kstat_incr_softirqs_this_cpu(vec_nr);
236 236
237 trace_softirq_entry(vec_nr); 237 trace_softirq_entry(vec_nr);
238 h->action(h); 238 h->action(h);
239 trace_softirq_exit(vec_nr); 239 trace_softirq_exit(vec_nr);
240 if (unlikely(prev_count != preempt_count())) { 240 if (unlikely(prev_count != preempt_count())) {
241 printk(KERN_ERR "huh, entered softirq %u %s %p" 241 printk(KERN_ERR "huh, entered softirq %u %s %p"
242 "with preempt_count %08x," 242 "with preempt_count %08x,"
243 " exited with %08x?\n", vec_nr, 243 " exited with %08x?\n", vec_nr,
244 softirq_to_name[vec_nr], h->action, 244 softirq_to_name[vec_nr], h->action,
245 prev_count, preempt_count()); 245 prev_count, preempt_count());
246 preempt_count() = prev_count; 246 preempt_count() = prev_count;
247 } 247 }
248 248
249 rcu_bh_qs(cpu); 249 rcu_bh_qs(cpu);
250 } 250 }
251 h++; 251 h++;
252 pending >>= 1; 252 pending >>= 1;
253 } while (pending); 253 } while (pending);
254 254
255 local_irq_disable(); 255 local_irq_disable();
256 256
257 pending = local_softirq_pending(); 257 pending = local_softirq_pending();
258 if (pending && --max_restart) 258 if (pending && --max_restart)
259 goto restart; 259 goto restart;
260 260
261 if (pending) 261 if (pending)
262 wakeup_softirqd(); 262 wakeup_softirqd();
263 263
264 lockdep_softirq_exit(); 264 lockdep_softirq_exit();
265 265
266 account_system_vtime(current); 266 account_system_vtime(current);
267 __local_bh_enable(SOFTIRQ_OFFSET); 267 __local_bh_enable(SOFTIRQ_OFFSET);
268 } 268 }
269 269
270 #ifndef __ARCH_HAS_DO_SOFTIRQ 270 #ifndef __ARCH_HAS_DO_SOFTIRQ
271 271
272 asmlinkage void do_softirq(void) 272 asmlinkage void do_softirq(void)
273 { 273 {
274 __u32 pending; 274 __u32 pending;
275 unsigned long flags; 275 unsigned long flags;
276 276
277 if (in_interrupt()) 277 if (in_interrupt())
278 return; 278 return;
279 279
280 local_irq_save(flags); 280 local_irq_save(flags);
281 281
282 pending = local_softirq_pending(); 282 pending = local_softirq_pending();
283 283
284 if (pending) 284 if (pending)
285 __do_softirq(); 285 __do_softirq();
286 286
287 local_irq_restore(flags); 287 local_irq_restore(flags);
288 } 288 }
289 289
290 #endif 290 #endif
291 291
292 /* 292 /*
293 * Enter an interrupt context. 293 * Enter an interrupt context.
294 */ 294 */
295 void irq_enter(void) 295 void irq_enter(void)
296 { 296 {
297 int cpu = smp_processor_id(); 297 int cpu = smp_processor_id();
298 298
299 rcu_irq_enter(); 299 rcu_irq_enter();
300 if (idle_cpu(cpu) && !in_interrupt()) { 300 if (idle_cpu(cpu) && !in_interrupt()) {
301 /* 301 /*
302 * Prevent raise_softirq from needlessly waking up ksoftirqd 302 * Prevent raise_softirq from needlessly waking up ksoftirqd
303 * here, as softirq will be serviced on return from interrupt. 303 * here, as softirq will be serviced on return from interrupt.
304 */ 304 */
305 local_bh_disable(); 305 local_bh_disable();
306 tick_check_idle(cpu); 306 tick_check_idle(cpu);
307 _local_bh_enable(); 307 _local_bh_enable();
308 } 308 }
309 309
310 __irq_enter(); 310 __irq_enter();
311 } 311 }
312 312
313 #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 313 #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
314 static inline void invoke_softirq(void) 314 static inline void invoke_softirq(void)
315 { 315 {
316 if (!force_irqthreads) 316 if (!force_irqthreads)
317 __do_softirq(); 317 __do_softirq();
318 else { 318 else {
319 __local_bh_disable((unsigned long)__builtin_return_address(0), 319 __local_bh_disable((unsigned long)__builtin_return_address(0),
320 SOFTIRQ_OFFSET); 320 SOFTIRQ_OFFSET);
321 wakeup_softirqd(); 321 wakeup_softirqd();
322 __local_bh_enable(SOFTIRQ_OFFSET); 322 __local_bh_enable(SOFTIRQ_OFFSET);
323 } 323 }
324 } 324 }
325 #else 325 #else
326 static inline void invoke_softirq(void) 326 static inline void invoke_softirq(void)
327 { 327 {
328 if (!force_irqthreads) 328 if (!force_irqthreads)
329 do_softirq(); 329 do_softirq();
330 else { 330 else {
331 __local_bh_disable((unsigned long)__builtin_return_address(0), 331 __local_bh_disable((unsigned long)__builtin_return_address(0),
332 SOFTIRQ_OFFSET); 332 SOFTIRQ_OFFSET);
333 wakeup_softirqd(); 333 wakeup_softirqd();
334 __local_bh_enable(SOFTIRQ_OFFSET); 334 __local_bh_enable(SOFTIRQ_OFFSET);
335 } 335 }
336 } 336 }
337 #endif 337 #endif
338 338
339 /* 339 /*
340 * Exit an interrupt context. Process softirqs if needed and possible: 340 * Exit an interrupt context. Process softirqs if needed and possible:
341 */ 341 */
342 void irq_exit(void) 342 void irq_exit(void)
343 { 343 {
344 account_system_vtime(current); 344 account_system_vtime(current);
345 trace_hardirq_exit(); 345 trace_hardirq_exit();
346 sub_preempt_count(IRQ_EXIT_OFFSET); 346 sub_preempt_count(IRQ_EXIT_OFFSET);
347 if (!in_interrupt() && local_softirq_pending()) 347 if (!in_interrupt() && local_softirq_pending())
348 invoke_softirq(); 348 invoke_softirq();
349 349
350 #ifdef CONFIG_NO_HZ 350 #ifdef CONFIG_NO_HZ
351 /* Make sure that timer wheel updates are propagated */ 351 /* Make sure that timer wheel updates are propagated */
352 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) 352 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
353 tick_nohz_irq_exit(); 353 tick_nohz_irq_exit();
354 #endif 354 #endif
355 rcu_irq_exit(); 355 rcu_irq_exit();
356 preempt_enable_no_resched(); 356 sched_preempt_enable_no_resched();
357 } 357 }
358 358
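At kernel line 356 above, irq_exit() now ends with sched_preempt_enable_no_resched() rather than preempt_enable_no_resched(); the same substitution appears further down in run_ksoftirqd(). The standalone mock that follows (hypothetical names, userspace only, not the kernel's actual definition) sketches why dropping the preemption count without an immediate resched check is acceptable when the caller is known to reach a reschedule point right afterwards.

    #include <stdio.h>

    static int mock_preempt_count = 1;   /* preemption currently disabled    */
    static int mock_need_resched  = 1;   /* a reschedule has been requested  */

    /* Ordinary enable: drop the count and honour a pending resched request. */
    static void mock_preempt_enable(void)
    {
            if (--mock_preempt_count == 0 && mock_need_resched)
                    printf("preempt_enable: reschedule happens here\n");
    }

    /*
     * Scheduler-side no-resched enable: drop the count but skip the check,
     * because the caller (interrupt return after irq_exit(), or an explicit
     * cond_resched()/schedule() right afterwards) performs the reschedule.
     */
    static void mock_sched_preempt_enable_no_resched(void)
    {
            --mock_preempt_count;
    }

    int main(void)
    {
            mock_sched_preempt_enable_no_resched();
            printf("no-resched enable: count=%d, resched left to the caller\n",
                   mock_preempt_count);

            mock_preempt_count = 1;      /* reset, then show the ordinary path */
            mock_preempt_enable();
            return 0;
    }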
359 /* 359 /*
360 * This function must run with irqs disabled! 360 * This function must run with irqs disabled!
361 */ 361 */
362 inline void raise_softirq_irqoff(unsigned int nr) 362 inline void raise_softirq_irqoff(unsigned int nr)
363 { 363 {
364 __raise_softirq_irqoff(nr); 364 __raise_softirq_irqoff(nr);
365 365
366 /* 366 /*
367 * If we're in an interrupt or softirq, we're done 367 * If we're in an interrupt or softirq, we're done
368 * (this also catches softirq-disabled code). We will 368 * (this also catches softirq-disabled code). We will
369 * actually run the softirq once we return from 369 * actually run the softirq once we return from
370 * the irq or softirq. 370 * the irq or softirq.
371 * 371 *
372 * Otherwise we wake up ksoftirqd to make sure we 372 * Otherwise we wake up ksoftirqd to make sure we
373 * schedule the softirq soon. 373 * schedule the softirq soon.
374 */ 374 */
375 if (!in_interrupt()) 375 if (!in_interrupt())
376 wakeup_softirqd(); 376 wakeup_softirqd();
377 } 377 }
378 378
379 void raise_softirq(unsigned int nr) 379 void raise_softirq(unsigned int nr)
380 { 380 {
381 unsigned long flags; 381 unsigned long flags;
382 382
383 local_irq_save(flags); 383 local_irq_save(flags);
384 raise_softirq_irqoff(nr); 384 raise_softirq_irqoff(nr);
385 local_irq_restore(flags); 385 local_irq_restore(flags);
386 } 386 }
387 387
388 void open_softirq(int nr, void (*action)(struct softirq_action *)) 388 void open_softirq(int nr, void (*action)(struct softirq_action *))
389 { 389 {
390 softirq_vec[nr].action = action; 390 softirq_vec[nr].action = action;
391 } 391 }
392 392
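As a usage illustration of the raise_softirq()/open_softirq() pair above: a softirq vector is registered once with its action and later marked pending from interrupt context. This sketch is not part of this file; MY_SOFTIRQ and the handler names are hypothetical, since real vectors come only from the fixed enum listed near the top of this file.

    /* Hypothetical wiring, for illustration only; MY_SOFTIRQ does not exist. */
    #include <linux/init.h>
    #include <linux/interrupt.h>

    static void my_softirq_action(struct softirq_action *a)
    {
            /* Runs in softirq context on the CPU that raised the vector. */
    }

    static int __init my_softirq_setup(void)
    {
            open_softirq(MY_SOFTIRQ, my_softirq_action);   /* register the action */
            return 0;
    }

    static irqreturn_t my_irq_handler(int irq, void *dev_id)
    {
            /* Mark the vector pending; it runs on irq_exit() or in ksoftirqd. */
            raise_softirq(MY_SOFTIRQ);
            return IRQ_HANDLED;
    }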
393 /* 393 /*
394 * Tasklets 394 * Tasklets
395 */ 395 */
396 struct tasklet_head 396 struct tasklet_head
397 { 397 {
398 struct tasklet_struct *head; 398 struct tasklet_struct *head;
399 struct tasklet_struct **tail; 399 struct tasklet_struct **tail;
400 }; 400 };
401 401
402 static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec); 402 static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
403 static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec); 403 static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
404 404
405 void __tasklet_schedule(struct tasklet_struct *t) 405 void __tasklet_schedule(struct tasklet_struct *t)
406 { 406 {
407 unsigned long flags; 407 unsigned long flags;
408 408
409 local_irq_save(flags); 409 local_irq_save(flags);
410 t->next = NULL; 410 t->next = NULL;
411 *__this_cpu_read(tasklet_vec.tail) = t; 411 *__this_cpu_read(tasklet_vec.tail) = t;
412 __this_cpu_write(tasklet_vec.tail, &(t->next)); 412 __this_cpu_write(tasklet_vec.tail, &(t->next));
413 raise_softirq_irqoff(TASKLET_SOFTIRQ); 413 raise_softirq_irqoff(TASKLET_SOFTIRQ);
414 local_irq_restore(flags); 414 local_irq_restore(flags);
415 } 415 }
416 416
417 EXPORT_SYMBOL(__tasklet_schedule); 417 EXPORT_SYMBOL(__tasklet_schedule);
418 418
419 void __tasklet_hi_schedule(struct tasklet_struct *t) 419 void __tasklet_hi_schedule(struct tasklet_struct *t)
420 { 420 {
421 unsigned long flags; 421 unsigned long flags;
422 422
423 local_irq_save(flags); 423 local_irq_save(flags);
424 t->next = NULL; 424 t->next = NULL;
425 *__this_cpu_read(tasklet_hi_vec.tail) = t; 425 *__this_cpu_read(tasklet_hi_vec.tail) = t;
426 __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); 426 __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
427 raise_softirq_irqoff(HI_SOFTIRQ); 427 raise_softirq_irqoff(HI_SOFTIRQ);
428 local_irq_restore(flags); 428 local_irq_restore(flags);
429 } 429 }
430 430
431 EXPORT_SYMBOL(__tasklet_hi_schedule); 431 EXPORT_SYMBOL(__tasklet_hi_schedule);
432 432
433 void __tasklet_hi_schedule_first(struct tasklet_struct *t) 433 void __tasklet_hi_schedule_first(struct tasklet_struct *t)
434 { 434 {
435 BUG_ON(!irqs_disabled()); 435 BUG_ON(!irqs_disabled());
436 436
437 t->next = __this_cpu_read(tasklet_hi_vec.head); 437 t->next = __this_cpu_read(tasklet_hi_vec.head);
438 __this_cpu_write(tasklet_hi_vec.head, t); 438 __this_cpu_write(tasklet_hi_vec.head, t);
439 __raise_softirq_irqoff(HI_SOFTIRQ); 439 __raise_softirq_irqoff(HI_SOFTIRQ);
440 } 440 }
441 441
442 EXPORT_SYMBOL(__tasklet_hi_schedule_first); 442 EXPORT_SYMBOL(__tasklet_hi_schedule_first);
443 443
444 static void tasklet_action(struct softirq_action *a) 444 static void tasklet_action(struct softirq_action *a)
445 { 445 {
446 struct tasklet_struct *list; 446 struct tasklet_struct *list;
447 447
448 local_irq_disable(); 448 local_irq_disable();
449 list = __this_cpu_read(tasklet_vec.head); 449 list = __this_cpu_read(tasklet_vec.head);
450 __this_cpu_write(tasklet_vec.head, NULL); 450 __this_cpu_write(tasklet_vec.head, NULL);
451 __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head); 451 __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head);
452 local_irq_enable(); 452 local_irq_enable();
453 453
454 while (list) { 454 while (list) {
455 struct tasklet_struct *t = list; 455 struct tasklet_struct *t = list;
456 456
457 list = list->next; 457 list = list->next;
458 458
459 if (tasklet_trylock(t)) { 459 if (tasklet_trylock(t)) {
460 if (!atomic_read(&t->count)) { 460 if (!atomic_read(&t->count)) {
461 if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) 461 if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
462 BUG(); 462 BUG();
463 t->func(t->data); 463 t->func(t->data);
464 tasklet_unlock(t); 464 tasklet_unlock(t);
465 continue; 465 continue;
466 } 466 }
467 tasklet_unlock(t); 467 tasklet_unlock(t);
468 } 468 }
469 469
470 local_irq_disable(); 470 local_irq_disable();
471 t->next = NULL; 471 t->next = NULL;
472 *__this_cpu_read(tasklet_vec.tail) = t; 472 *__this_cpu_read(tasklet_vec.tail) = t;
473 __this_cpu_write(tasklet_vec.tail, &(t->next)); 473 __this_cpu_write(tasklet_vec.tail, &(t->next));
474 __raise_softirq_irqoff(TASKLET_SOFTIRQ); 474 __raise_softirq_irqoff(TASKLET_SOFTIRQ);
475 local_irq_enable(); 475 local_irq_enable();
476 } 476 }
477 } 477 }
478 478
479 static void tasklet_hi_action(struct softirq_action *a) 479 static void tasklet_hi_action(struct softirq_action *a)
480 { 480 {
481 struct tasklet_struct *list; 481 struct tasklet_struct *list;
482 482
483 local_irq_disable(); 483 local_irq_disable();
484 list = __this_cpu_read(tasklet_hi_vec.head); 484 list = __this_cpu_read(tasklet_hi_vec.head);
485 __this_cpu_write(tasklet_hi_vec.head, NULL); 485 __this_cpu_write(tasklet_hi_vec.head, NULL);
486 __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head); 486 __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head);
487 local_irq_enable(); 487 local_irq_enable();
488 488
489 while (list) { 489 while (list) {
490 struct tasklet_struct *t = list; 490 struct tasklet_struct *t = list;
491 491
492 list = list->next; 492 list = list->next;
493 493
494 if (tasklet_trylock(t)) { 494 if (tasklet_trylock(t)) {
495 if (!atomic_read(&t->count)) { 495 if (!atomic_read(&t->count)) {
496 if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) 496 if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
497 BUG(); 497 BUG();
498 t->func(t->data); 498 t->func(t->data);
499 tasklet_unlock(t); 499 tasklet_unlock(t);
500 continue; 500 continue;
501 } 501 }
502 tasklet_unlock(t); 502 tasklet_unlock(t);
503 } 503 }
504 504
505 local_irq_disable(); 505 local_irq_disable();
506 t->next = NULL; 506 t->next = NULL;
507 *__this_cpu_read(tasklet_hi_vec.tail) = t; 507 *__this_cpu_read(tasklet_hi_vec.tail) = t;
508 __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); 508 __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
509 __raise_softirq_irqoff(HI_SOFTIRQ); 509 __raise_softirq_irqoff(HI_SOFTIRQ);
510 local_irq_enable(); 510 local_irq_enable();
511 } 511 }
512 } 512 }
513 513
514 514
515 void tasklet_init(struct tasklet_struct *t, 515 void tasklet_init(struct tasklet_struct *t,
516 void (*func)(unsigned long), unsigned long data) 516 void (*func)(unsigned long), unsigned long data)
517 { 517 {
518 t->next = NULL; 518 t->next = NULL;
519 t->state = 0; 519 t->state = 0;
520 atomic_set(&t->count, 0); 520 atomic_set(&t->count, 0);
521 t->func = func; 521 t->func = func;
522 t->data = data; 522 t->data = data;
523 } 523 }
524 524
525 EXPORT_SYMBOL(tasklet_init); 525 EXPORT_SYMBOL(tasklet_init);
526 526
527 void tasklet_kill(struct tasklet_struct *t) 527 void tasklet_kill(struct tasklet_struct *t)
528 { 528 {
529 if (in_interrupt()) 529 if (in_interrupt())
530 printk("Attempt to kill tasklet from interrupt\n"); 530 printk("Attempt to kill tasklet from interrupt\n");
531 531
532 while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { 532 while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
533 do { 533 do {
534 yield(); 534 yield();
535 } while (test_bit(TASKLET_STATE_SCHED, &t->state)); 535 } while (test_bit(TASKLET_STATE_SCHED, &t->state));
536 } 536 }
537 tasklet_unlock_wait(t); 537 tasklet_unlock_wait(t);
538 clear_bit(TASKLET_STATE_SCHED, &t->state); 538 clear_bit(TASKLET_STATE_SCHED, &t->state);
539 } 539 }
540 540
541 EXPORT_SYMBOL(tasklet_kill); 541 EXPORT_SYMBOL(tasklet_kill);
542 542
543 /* 543 /*
544 * tasklet_hrtimer 544 * tasklet_hrtimer
545 */ 545 */
546 546
547 /* 547 /*
548 * The trampoline is called when the hrtimer expires. It schedules a tasklet 548 * The trampoline is called when the hrtimer expires. It schedules a tasklet
549 * to run __tasklet_hrtimer_trampoline() which in turn will call the intended 549 * to run __tasklet_hrtimer_trampoline() which in turn will call the intended
550 * hrtimer callback, but from softirq context. 550 * hrtimer callback, but from softirq context.
551 */ 551 */
552 static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) 552 static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
553 { 553 {
554 struct tasklet_hrtimer *ttimer = 554 struct tasklet_hrtimer *ttimer =
555 container_of(timer, struct tasklet_hrtimer, timer); 555 container_of(timer, struct tasklet_hrtimer, timer);
556 556
557 tasklet_hi_schedule(&ttimer->tasklet); 557 tasklet_hi_schedule(&ttimer->tasklet);
558 return HRTIMER_NORESTART; 558 return HRTIMER_NORESTART;
559 } 559 }
560 560
561 /* 561 /*
562 * Helper function which calls the hrtimer callback from 562 * Helper function which calls the hrtimer callback from
563 * tasklet/softirq context 563 * tasklet/softirq context
564 */ 564 */
565 static void __tasklet_hrtimer_trampoline(unsigned long data) 565 static void __tasklet_hrtimer_trampoline(unsigned long data)
566 { 566 {
567 struct tasklet_hrtimer *ttimer = (void *)data; 567 struct tasklet_hrtimer *ttimer = (void *)data;
568 enum hrtimer_restart restart; 568 enum hrtimer_restart restart;
569 569
570 restart = ttimer->function(&ttimer->timer); 570 restart = ttimer->function(&ttimer->timer);
571 if (restart != HRTIMER_NORESTART) 571 if (restart != HRTIMER_NORESTART)
572 hrtimer_restart(&ttimer->timer); 572 hrtimer_restart(&ttimer->timer);
573 } 573 }
574 574
575 /** 575 /**
576 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks 576 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
577 * @ttimer: tasklet_hrtimer which is initialized 577 * @ttimer: tasklet_hrtimer which is initialized
578 * @function: hrtimer callback function which gets called from softirq context 578 * @function: hrtimer callback function which gets called from softirq context
579 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) 579 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
580 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) 580 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
581 */ 581 */
582 void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, 582 void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
583 enum hrtimer_restart (*function)(struct hrtimer *), 583 enum hrtimer_restart (*function)(struct hrtimer *),
584 clockid_t which_clock, enum hrtimer_mode mode) 584 clockid_t which_clock, enum hrtimer_mode mode)
585 { 585 {
586 hrtimer_init(&ttimer->timer, which_clock, mode); 586 hrtimer_init(&ttimer->timer, which_clock, mode);
587 ttimer->timer.function = __hrtimer_tasklet_trampoline; 587 ttimer->timer.function = __hrtimer_tasklet_trampoline;
588 tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline, 588 tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline,
589 (unsigned long)ttimer); 589 (unsigned long)ttimer);
590 ttimer->function = function; 590 ttimer->function = function;
591 } 591 }
592 EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); 592 EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
593 593
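A short usage sketch of the tasklet_hrtimer helpers above follows. It is not part of this file: the driver names are invented, and the start call uses tasklet_hrtimer_start(), the companion helper declared alongside tasklet_hrtimer_init() in <linux/interrupt.h>.

    /* Hypothetical driver snippet, for illustration only. */
    #include <linux/interrupt.h>
    #include <linux/hrtimer.h>
    #include <linux/ktime.h>

    static struct tasklet_hrtimer my_poll_timer;

    /* Invoked from softirq context via the trampolines shown above. */
    static enum hrtimer_restart my_poll(struct hrtimer *timer)
    {
            /* ... poll the device, complete finished work ... */
            return HRTIMER_NORESTART;
    }

    static void my_arm_poll_timer(void)
    {
            tasklet_hrtimer_init(&my_poll_timer, my_poll,
                                 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
            tasklet_hrtimer_start(&my_poll_timer, ktime_set(0, 10 * NSEC_PER_MSEC),
                                  HRTIMER_MODE_REL);
    }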
594 /* 594 /*
595 * Remote softirq bits 595 * Remote softirq bits
596 */ 596 */
597 597
598 DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); 598 DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
599 EXPORT_PER_CPU_SYMBOL(softirq_work_list); 599 EXPORT_PER_CPU_SYMBOL(softirq_work_list);
600 600
601 static void __local_trigger(struct call_single_data *cp, int softirq) 601 static void __local_trigger(struct call_single_data *cp, int softirq)
602 { 602 {
603 struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]); 603 struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]);
604 604
605 list_add_tail(&cp->list, head); 605 list_add_tail(&cp->list, head);
606 606
607 /* Trigger the softirq only if the list was previously empty. */ 607 /* Trigger the softirq only if the list was previously empty. */
608 if (head->next == &cp->list) 608 if (head->next == &cp->list)
609 raise_softirq_irqoff(softirq); 609 raise_softirq_irqoff(softirq);
610 } 610 }
611 611
612 #ifdef CONFIG_USE_GENERIC_SMP_HELPERS 612 #ifdef CONFIG_USE_GENERIC_SMP_HELPERS
613 static void remote_softirq_receive(void *data) 613 static void remote_softirq_receive(void *data)
614 { 614 {
615 struct call_single_data *cp = data; 615 struct call_single_data *cp = data;
616 unsigned long flags; 616 unsigned long flags;
617 int softirq; 617 int softirq;
618 618
619 softirq = cp->priv; 619 softirq = cp->priv;
620 620
621 local_irq_save(flags); 621 local_irq_save(flags);
622 __local_trigger(cp, softirq); 622 __local_trigger(cp, softirq);
623 local_irq_restore(flags); 623 local_irq_restore(flags);
624 } 624 }
625 625
626 static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) 626 static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
627 { 627 {
628 if (cpu_online(cpu)) { 628 if (cpu_online(cpu)) {
629 cp->func = remote_softirq_receive; 629 cp->func = remote_softirq_receive;
630 cp->info = cp; 630 cp->info = cp;
631 cp->flags = 0; 631 cp->flags = 0;
632 cp->priv = softirq; 632 cp->priv = softirq;
633 633
634 __smp_call_function_single(cpu, cp, 0); 634 __smp_call_function_single(cpu, cp, 0);
635 return 0; 635 return 0;
636 } 636 }
637 return 1; 637 return 1;
638 } 638 }
639 #else /* CONFIG_USE_GENERIC_SMP_HELPERS */ 639 #else /* CONFIG_USE_GENERIC_SMP_HELPERS */
640 static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) 640 static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
641 { 641 {
642 return 1; 642 return 1;
643 } 643 }
644 #endif 644 #endif
645 645
646 /** 646 /**
647 * __send_remote_softirq - try to schedule softirq work on a remote cpu 647 * __send_remote_softirq - try to schedule softirq work on a remote cpu
648 * @cp: private SMP call function data area 648 * @cp: private SMP call function data area
649 * @cpu: the remote cpu 649 * @cpu: the remote cpu
650 * @this_cpu: the currently executing cpu 650 * @this_cpu: the currently executing cpu
651 * @softirq: the softirq for the work 651 * @softirq: the softirq for the work
652 * 652 *
653 * Attempt to schedule softirq work on a remote cpu. If this cannot be 653 * Attempt to schedule softirq work on a remote cpu. If this cannot be
654 * done, the work is instead queued up on the local cpu. 654 * done, the work is instead queued up on the local cpu.
655 * 655 *
656 * Interrupts must be disabled. 656 * Interrupts must be disabled.
657 */ 657 */
658 void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq) 658 void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq)
659 { 659 {
660 if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq)) 660 if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq))
661 __local_trigger(cp, softirq); 661 __local_trigger(cp, softirq);
662 } 662 }
663 EXPORT_SYMBOL(__send_remote_softirq); 663 EXPORT_SYMBOL(__send_remote_softirq);
664 664
665 /** 665 /**
666 * send_remote_softirq - try to schedule softirq work on a remote cpu 666 * send_remote_softirq - try to schedule softirq work on a remote cpu
667 * @cp: private SMP call function data area 667 * @cp: private SMP call function data area
668 * @cpu: the remote cpu 668 * @cpu: the remote cpu
669 * @softirq: the softirq for the work 669 * @softirq: the softirq for the work
670 * 670 *
671 * Like __send_remote_softirq except that disabling interrupts and 671 * Like __send_remote_softirq except that disabling interrupts and
672 * computing the current cpu is done for the caller. 672 * computing the current cpu is done for the caller.
673 */ 673 */
674 void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq) 674 void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
675 { 675 {
676 unsigned long flags; 676 unsigned long flags;
677 int this_cpu; 677 int this_cpu;
678 678
679 local_irq_save(flags); 679 local_irq_save(flags);
680 this_cpu = smp_processor_id(); 680 this_cpu = smp_processor_id();
681 __send_remote_softirq(cp, cpu, this_cpu, softirq); 681 __send_remote_softirq(cp, cpu, this_cpu, softirq);
682 local_irq_restore(flags); 682 local_irq_restore(flags);
683 } 683 }
684 EXPORT_SYMBOL(send_remote_softirq); 684 EXPORT_SYMBOL(send_remote_softirq);
685 685
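For the remote softirq API documented above, a minimal caller-side sketch is shown below. It is an assumption-laden illustration, not taken from an in-tree user: the structure and function names are invented, the call_single_data must stay valid until the target CPU has run the softirq, and the chosen vector's handler is expected to drain its per-CPU softirq_work_list[] and container_of() back to the enclosing object.

    /* Hypothetical illustration, not part of this file. */
    #include <linux/interrupt.h>
    #include <linux/smp.h>

    struct my_remote_work {
            struct call_single_data csd;  /* csd.list is queued by __local_trigger() */
            int payload;
    };

    static void my_queue_work_on(int cpu, int softirq_nr, struct my_remote_work *w)
    {
            /* Queues on @cpu if it is online, otherwise falls back to this CPU. */
            send_remote_softirq(&w->csd, cpu, softirq_nr);
    }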
686 static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, 686 static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self,
687 unsigned long action, void *hcpu) 687 unsigned long action, void *hcpu)
688 { 688 {
689 /* 689 /*
690 * If a CPU goes away, splice its entries to the current CPU 690 * If a CPU goes away, splice its entries to the current CPU
691 * and trigger a run of the softirq 691 * and trigger a run of the softirq
692 */ 692 */
693 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 693 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
694 int cpu = (unsigned long) hcpu; 694 int cpu = (unsigned long) hcpu;
695 int i; 695 int i;
696 696
697 local_irq_disable(); 697 local_irq_disable();
698 for (i = 0; i < NR_SOFTIRQS; i++) { 698 for (i = 0; i < NR_SOFTIRQS; i++) {
699 struct list_head *head = &per_cpu(softirq_work_list[i], cpu); 699 struct list_head *head = &per_cpu(softirq_work_list[i], cpu);
700 struct list_head *local_head; 700 struct list_head *local_head;
701 701
702 if (list_empty(head)) 702 if (list_empty(head))
703 continue; 703 continue;
704 704
705 local_head = &__get_cpu_var(softirq_work_list[i]); 705 local_head = &__get_cpu_var(softirq_work_list[i]);
706 list_splice_init(head, local_head); 706 list_splice_init(head, local_head);
707 raise_softirq_irqoff(i); 707 raise_softirq_irqoff(i);
708 } 708 }
709 local_irq_enable(); 709 local_irq_enable();
710 } 710 }
711 711
712 return NOTIFY_OK; 712 return NOTIFY_OK;
713 } 713 }
714 714
715 static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = { 715 static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = {
716 .notifier_call = remote_softirq_cpu_notify, 716 .notifier_call = remote_softirq_cpu_notify,
717 }; 717 };
718 718
719 void __init softirq_init(void) 719 void __init softirq_init(void)
720 { 720 {
721 int cpu; 721 int cpu;
722 722
723 for_each_possible_cpu(cpu) { 723 for_each_possible_cpu(cpu) {
724 int i; 724 int i;
725 725
726 per_cpu(tasklet_vec, cpu).tail = 726 per_cpu(tasklet_vec, cpu).tail =
727 &per_cpu(tasklet_vec, cpu).head; 727 &per_cpu(tasklet_vec, cpu).head;
728 per_cpu(tasklet_hi_vec, cpu).tail = 728 per_cpu(tasklet_hi_vec, cpu).tail =
729 &per_cpu(tasklet_hi_vec, cpu).head; 729 &per_cpu(tasklet_hi_vec, cpu).head;
730 for (i = 0; i < NR_SOFTIRQS; i++) 730 for (i = 0; i < NR_SOFTIRQS; i++)
731 INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu)); 731 INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu));
732 } 732 }
733 733
734 register_hotcpu_notifier(&remote_softirq_cpu_notifier); 734 register_hotcpu_notifier(&remote_softirq_cpu_notifier);
735 735
736 open_softirq(TASKLET_SOFTIRQ, tasklet_action); 736 open_softirq(TASKLET_SOFTIRQ, tasklet_action);
737 open_softirq(HI_SOFTIRQ, tasklet_hi_action); 737 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
738 } 738 }
739 739
740 static int run_ksoftirqd(void * __bind_cpu) 740 static int run_ksoftirqd(void * __bind_cpu)
741 { 741 {
742 set_current_state(TASK_INTERRUPTIBLE); 742 set_current_state(TASK_INTERRUPTIBLE);
743 743
744 while (!kthread_should_stop()) { 744 while (!kthread_should_stop()) {
745 preempt_disable(); 745 preempt_disable();
746 if (!local_softirq_pending()) { 746 if (!local_softirq_pending()) {
747 schedule_preempt_disabled(); 747 schedule_preempt_disabled();
748 } 748 }
749 749
750 __set_current_state(TASK_RUNNING); 750 __set_current_state(TASK_RUNNING);
751 751
752 while (local_softirq_pending()) { 752 while (local_softirq_pending()) {
753 /* Preempt disable stops cpu going offline. 753 /* Preempt disable stops cpu going offline.
754 If already offline, we'll be on wrong CPU: 754 If already offline, we'll be on wrong CPU:
755 don't process */ 755 don't process */
756 if (cpu_is_offline((long)__bind_cpu)) 756 if (cpu_is_offline((long)__bind_cpu))
757 goto wait_to_die; 757 goto wait_to_die;
758 local_irq_disable(); 758 local_irq_disable();
759 if (local_softirq_pending()) 759 if (local_softirq_pending())
760 __do_softirq(); 760 __do_softirq();
761 local_irq_enable(); 761 local_irq_enable();
762 preempt_enable_no_resched(); 762 sched_preempt_enable_no_resched();
763 cond_resched(); 763 cond_resched();
764 preempt_disable(); 764 preempt_disable();
765 rcu_note_context_switch((long)__bind_cpu); 765 rcu_note_context_switch((long)__bind_cpu);
766 } 766 }
767 preempt_enable(); 767 preempt_enable();
768 set_current_state(TASK_INTERRUPTIBLE); 768 set_current_state(TASK_INTERRUPTIBLE);
769 } 769 }
770 __set_current_state(TASK_RUNNING); 770 __set_current_state(TASK_RUNNING);
771 return 0; 771 return 0;
772 772
773 wait_to_die: 773 wait_to_die:
774 preempt_enable(); 774 preempt_enable();
775 /* Wait for kthread_stop */ 775 /* Wait for kthread_stop */
776 set_current_state(TASK_INTERRUPTIBLE); 776 set_current_state(TASK_INTERRUPTIBLE);
777 while (!kthread_should_stop()) { 777 while (!kthread_should_stop()) {
778 schedule(); 778 schedule();
779 set_current_state(TASK_INTERRUPTIBLE); 779 set_current_state(TASK_INTERRUPTIBLE);
780 } 780 }
781 __set_current_state(TASK_RUNNING); 781 __set_current_state(TASK_RUNNING);
782 return 0; 782 return 0;
783 } 783 }
784 784
785 #ifdef CONFIG_HOTPLUG_CPU 785 #ifdef CONFIG_HOTPLUG_CPU
786 /* 786 /*
787 * tasklet_kill_immediate is called to remove a tasklet which can already be 787 * tasklet_kill_immediate is called to remove a tasklet which can already be
788 * scheduled for execution on @cpu. 788 * scheduled for execution on @cpu.
789 * 789 *
790 * Unlike tasklet_kill, this function removes the tasklet 790 * Unlike tasklet_kill, this function removes the tasklet
791 * _immediately_, even if the tasklet is in TASKLET_STATE_SCHED state. 791 * _immediately_, even if the tasklet is in TASKLET_STATE_SCHED state.
792 * 792 *
793 * When this function is called, @cpu must be in the CPU_DEAD state. 793 * When this function is called, @cpu must be in the CPU_DEAD state.
794 */ 794 */
795 void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) 795 void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu)
796 { 796 {
797 struct tasklet_struct **i; 797 struct tasklet_struct **i;
798 798
799 BUG_ON(cpu_online(cpu)); 799 BUG_ON(cpu_online(cpu));
800 BUG_ON(test_bit(TASKLET_STATE_RUN, &t->state)); 800 BUG_ON(test_bit(TASKLET_STATE_RUN, &t->state));
801 801
802 if (!test_bit(TASKLET_STATE_SCHED, &t->state)) 802 if (!test_bit(TASKLET_STATE_SCHED, &t->state))
803 return; 803 return;
804 804
805 /* CPU is dead, so no lock needed. */ 805 /* CPU is dead, so no lock needed. */
806 for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) { 806 for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) {
807 if (*i == t) { 807 if (*i == t) {
808 *i = t->next; 808 *i = t->next;
809 /* If this was the tail element, move the tail ptr */ 809 /* If this was the tail element, move the tail ptr */
810 if (*i == NULL) 810 if (*i == NULL)
811 per_cpu(tasklet_vec, cpu).tail = i; 811 per_cpu(tasklet_vec, cpu).tail = i;
812 return; 812 return;
813 } 813 }
814 } 814 }
815 BUG(); 815 BUG();
816 } 816 }
817 817
818 static void takeover_tasklets(unsigned int cpu) 818 static void takeover_tasklets(unsigned int cpu)
819 { 819 {
820 /* CPU is dead, so no lock needed. */ 820 /* CPU is dead, so no lock needed. */
821 local_irq_disable(); 821 local_irq_disable();
822 822
823 /* Find end, append list for that CPU. */ 823 /* Find end, append list for that CPU. */
824 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { 824 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) {
825 *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head; 825 *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head;
826 this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail); 826 this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail);
827 per_cpu(tasklet_vec, cpu).head = NULL; 827 per_cpu(tasklet_vec, cpu).head = NULL;
828 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; 828 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
829 } 829 }
830 raise_softirq_irqoff(TASKLET_SOFTIRQ); 830 raise_softirq_irqoff(TASKLET_SOFTIRQ);
831 831
832 if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) { 832 if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) {
833 *__this_cpu_read(tasklet_hi_vec.tail) = per_cpu(tasklet_hi_vec, cpu).head; 833 *__this_cpu_read(tasklet_hi_vec.tail) = per_cpu(tasklet_hi_vec, cpu).head;
834 __this_cpu_write(tasklet_hi_vec.tail, per_cpu(tasklet_hi_vec, cpu).tail); 834 __this_cpu_write(tasklet_hi_vec.tail, per_cpu(tasklet_hi_vec, cpu).tail);
835 per_cpu(tasklet_hi_vec, cpu).head = NULL; 835 per_cpu(tasklet_hi_vec, cpu).head = NULL;
836 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; 836 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head;
837 } 837 }
838 raise_softirq_irqoff(HI_SOFTIRQ); 838 raise_softirq_irqoff(HI_SOFTIRQ);
839 839
840 local_irq_enable(); 840 local_irq_enable();
841 } 841 }
842 #endif /* CONFIG_HOTPLUG_CPU */ 842 #endif /* CONFIG_HOTPLUG_CPU */
843 843
844 static int __cpuinit cpu_callback(struct notifier_block *nfb, 844 static int __cpuinit cpu_callback(struct notifier_block *nfb,
845 unsigned long action, 845 unsigned long action,
846 void *hcpu) 846 void *hcpu)
847 { 847 {
848 int hotcpu = (unsigned long)hcpu; 848 int hotcpu = (unsigned long)hcpu;
849 struct task_struct *p; 849 struct task_struct *p;
850 850
851 switch (action) { 851 switch (action) {
852 case CPU_UP_PREPARE: 852 case CPU_UP_PREPARE:
853 case CPU_UP_PREPARE_FROZEN: 853 case CPU_UP_PREPARE_FROZEN:
854 p = kthread_create_on_node(run_ksoftirqd, 854 p = kthread_create_on_node(run_ksoftirqd,
855 hcpu, 855 hcpu,
856 cpu_to_node(hotcpu), 856 cpu_to_node(hotcpu),
857 "ksoftirqd/%d", hotcpu); 857 "ksoftirqd/%d", hotcpu);
858 if (IS_ERR(p)) { 858 if (IS_ERR(p)) {
859 printk("ksoftirqd for %i failed\n", hotcpu); 859 printk("ksoftirqd for %i failed\n", hotcpu);
860 return notifier_from_errno(PTR_ERR(p)); 860 return notifier_from_errno(PTR_ERR(p));
861 } 861 }
862 kthread_bind(p, hotcpu); 862 kthread_bind(p, hotcpu);
863 per_cpu(ksoftirqd, hotcpu) = p; 863 per_cpu(ksoftirqd, hotcpu) = p;
864 break; 864 break;
865 case CPU_ONLINE: 865 case CPU_ONLINE:
866 case CPU_ONLINE_FROZEN: 866 case CPU_ONLINE_FROZEN:
867 wake_up_process(per_cpu(ksoftirqd, hotcpu)); 867 wake_up_process(per_cpu(ksoftirqd, hotcpu));
868 break; 868 break;
869 #ifdef CONFIG_HOTPLUG_CPU 869 #ifdef CONFIG_HOTPLUG_CPU
870 case CPU_UP_CANCELED: 870 case CPU_UP_CANCELED:
871 case CPU_UP_CANCELED_FROZEN: 871 case CPU_UP_CANCELED_FROZEN:
872 if (!per_cpu(ksoftirqd, hotcpu)) 872 if (!per_cpu(ksoftirqd, hotcpu))
873 break; 873 break;
874 /* Unbind so it can run. Fall thru. */ 874 /* Unbind so it can run. Fall thru. */
875 kthread_bind(per_cpu(ksoftirqd, hotcpu), 875 kthread_bind(per_cpu(ksoftirqd, hotcpu),
876 cpumask_any(cpu_online_mask)); 876 cpumask_any(cpu_online_mask));
877 case CPU_DEAD: 877 case CPU_DEAD:
878 case CPU_DEAD_FROZEN: { 878 case CPU_DEAD_FROZEN: {
879 static const struct sched_param param = { 879 static const struct sched_param param = {
880 .sched_priority = MAX_RT_PRIO-1 880 .sched_priority = MAX_RT_PRIO-1
881 }; 881 };
882 882
883 p = per_cpu(ksoftirqd, hotcpu); 883 p = per_cpu(ksoftirqd, hotcpu);
884 per_cpu(ksoftirqd, hotcpu) = NULL; 884 per_cpu(ksoftirqd, hotcpu) = NULL;
885 sched_setscheduler_nocheck(p, SCHED_FIFO, &param); 885 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
886 kthread_stop(p); 886 kthread_stop(p);
887 takeover_tasklets(hotcpu); 887 takeover_tasklets(hotcpu);
888 break; 888 break;
889 } 889 }
890 #endif /* CONFIG_HOTPLUG_CPU */ 890 #endif /* CONFIG_HOTPLUG_CPU */
891 } 891 }
892 return NOTIFY_OK; 892 return NOTIFY_OK;
893 } 893 }
894 894
895 static struct notifier_block __cpuinitdata cpu_nfb = { 895 static struct notifier_block __cpuinitdata cpu_nfb = {
896 .notifier_call = cpu_callback 896 .notifier_call = cpu_callback
897 }; 897 };
898 898
899 static __init int spawn_ksoftirqd(void) 899 static __init int spawn_ksoftirqd(void)
900 { 900 {
901 void *cpu = (void *)(long)smp_processor_id(); 901 void *cpu = (void *)(long)smp_processor_id();
902 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 902 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
903 903
904 BUG_ON(err != NOTIFY_OK); 904 BUG_ON(err != NOTIFY_OK);
905 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 905 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
906 register_cpu_notifier(&cpu_nfb); 906 register_cpu_notifier(&cpu_nfb);
907 return 0; 907 return 0;
908 } 908 }
909 early_initcall(spawn_ksoftirqd); 909 early_initcall(spawn_ksoftirqd);
910 910
911 /* 911 /*
912 * [ These __weak aliases are kept in a separate compilation unit, so that 912 * [ These __weak aliases are kept in a separate compilation unit, so that
913 * GCC does not inline them incorrectly. ] 913 * GCC does not inline them incorrectly. ]
914 */ 914 */
915 915
916 int __init __weak early_irq_init(void) 916 int __init __weak early_irq_init(void)
917 { 917 {
918 return 0; 918 return 0;
919 } 919 }
920 920
921 #ifdef CONFIG_GENERIC_HARDIRQS 921 #ifdef CONFIG_GENERIC_HARDIRQS
922 int __init __weak arch_probe_nr_irqs(void) 922 int __init __weak arch_probe_nr_irqs(void)
923 { 923 {
924 return NR_IRQS_LEGACY; 924 return NR_IRQS_LEGACY;
925 } 925 }
926 926
927 int __init __weak arch_early_irq_init(void) 927 int __init __weak arch_early_irq_init(void)
928 { 928 {
929 return 0; 929 return 0;
930 } 930 }
931 #endif 931 #endif
932 932