Commit 39c715b71740c4a78ba4769fb54826929bac03cb

Authored by Ingo Molnar
Committed by Linus Torvalds
1 parent 84929801e1

[PATCH] smp_processor_id() cleanup

This patch implements a number of smp_processor_id() cleanup ideas that
Arjan van de Ven and I came up with.

The previous __smp_processor_id/_smp_processor_id/smp_processor_id API
spaghetti was hard to follow, both on the implementation side and on the
usage side.

Some of the complexity arose from poorly chosen names, and some from the
fact that not all architectures defined __smp_processor_id.

In the new code, there are two externally visible symbols:

 - smp_processor_id(): debug variant.

 - raw_smp_processor_id(): nondebug variant. Replaces all existing
   uses of _smp_processor_id() and __smp_processor_id(). Defined
   by every SMP architecture in include/asm-*/smp.h.

There is one new internal symbol, dependent on DEBUG_PREEMPT:

 - debug_smp_processor_id(): internal debug variant; smp_processor_id()
                             maps to it when DEBUG_PREEMPT is enabled.
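
For reference, the net effect on include/linux/smp.h is roughly the sketch
below (simplified, not the verbatim header; raw_smp_processor_id() itself is
supplied by each SMP architecture's include/asm-*/smp.h and collapses to 0
on UP):

  /* Sketch: how smp_processor_id() resolves after this patch. */
  #ifdef CONFIG_DEBUG_PREEMPT
    extern unsigned int debug_smp_processor_id(void);
  # define smp_processor_id() debug_smp_processor_id()
  #else
  # define smp_processor_id() raw_smp_processor_id()
  #endif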

Also, I moved debug_smp_processor_id() from lib/kernel_lock.c into a new
lib/smp_processor_id.c file.  All related comments got updated and/or
clarified.
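
To give an idea of what the debug variant does, here is a simplified sketch
of the checks in lib/smp_processor_id.c (paraphrased, not the verbatim file;
the real function also rate-limits its output and skips early boot):

  unsigned int debug_smp_processor_id(void)
  {
          int this_cpu = raw_smp_processor_id();

          /* Safe: preemption or interrupts are already off. */
          if (preempt_count() || irqs_disabled())
                  return this_cpu;

          /* Safe: the task is pinned to exactly this CPU. */
          if (cpus_equal(current->cpus_allowed, cpumask_of_cpu(this_cpu)))
                  return this_cpu;

          /* Otherwise the caller could migrate between reading and using the id. */
          preempt_disable();      /* avoid recursing via smp_processor_id() */
          printk(KERN_ERR "BUG: using smp_processor_id() in preemptible code: %s/%d\n",
                 current->comm, current->pid);
          dump_stack();
          preempt_enable_no_resched();

          return this_cpu;
  }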

I have build/boot tested the following 8 .config combinations on x86:

 {SMP,UP} x {PREEMPT,!PREEMPT} x {DEBUG_PREEMPT,!DEBUG_PREEMPT}

I have also build/boot tested x64 on UP/PREEMPT/DEBUG_PREEMPT.  (Other
architectures are untested, but should work just fine.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 37 changed files with 119 additions and 125 deletions.

arch/i386/kernel/traps.c
1 /* 1 /*
2 * linux/arch/i386/traps.c 2 * linux/arch/i386/traps.c
3 * 3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * 5 *
6 * Pentium III FXSR, SSE support 6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000 7 * Gareth Hughes <gareth@valinux.com>, May 2000
8 */ 8 */
9 9
10 /* 10 /*
11 * 'Traps.c' handles hardware traps and faults after we have saved some 11 * 'Traps.c' handles hardware traps and faults after we have saved some
12 * state in 'asm.s'. 12 * state in 'asm.s'.
13 */ 13 */
14 #include <linux/config.h> 14 #include <linux/config.h>
15 #include <linux/sched.h> 15 #include <linux/sched.h>
16 #include <linux/kernel.h> 16 #include <linux/kernel.h>
17 #include <linux/string.h> 17 #include <linux/string.h>
18 #include <linux/errno.h> 18 #include <linux/errno.h>
19 #include <linux/timer.h> 19 #include <linux/timer.h>
20 #include <linux/mm.h> 20 #include <linux/mm.h>
21 #include <linux/init.h> 21 #include <linux/init.h>
22 #include <linux/delay.h> 22 #include <linux/delay.h>
23 #include <linux/spinlock.h> 23 #include <linux/spinlock.h>
24 #include <linux/interrupt.h> 24 #include <linux/interrupt.h>
25 #include <linux/highmem.h> 25 #include <linux/highmem.h>
26 #include <linux/kallsyms.h> 26 #include <linux/kallsyms.h>
27 #include <linux/ptrace.h> 27 #include <linux/ptrace.h>
28 #include <linux/utsname.h> 28 #include <linux/utsname.h>
29 #include <linux/kprobes.h> 29 #include <linux/kprobes.h>
30 30
31 #ifdef CONFIG_EISA 31 #ifdef CONFIG_EISA
32 #include <linux/ioport.h> 32 #include <linux/ioport.h>
33 #include <linux/eisa.h> 33 #include <linux/eisa.h>
34 #endif 34 #endif
35 35
36 #ifdef CONFIG_MCA 36 #ifdef CONFIG_MCA
37 #include <linux/mca.h> 37 #include <linux/mca.h>
38 #endif 38 #endif
39 39
40 #include <asm/processor.h> 40 #include <asm/processor.h>
41 #include <asm/system.h> 41 #include <asm/system.h>
42 #include <asm/uaccess.h> 42 #include <asm/uaccess.h>
43 #include <asm/io.h> 43 #include <asm/io.h>
44 #include <asm/atomic.h> 44 #include <asm/atomic.h>
45 #include <asm/debugreg.h> 45 #include <asm/debugreg.h>
46 #include <asm/desc.h> 46 #include <asm/desc.h>
47 #include <asm/i387.h> 47 #include <asm/i387.h>
48 #include <asm/nmi.h> 48 #include <asm/nmi.h>
49 49
50 #include <asm/smp.h> 50 #include <asm/smp.h>
51 #include <asm/arch_hooks.h> 51 #include <asm/arch_hooks.h>
52 #include <asm/kdebug.h> 52 #include <asm/kdebug.h>
53 53
54 #include <linux/irq.h> 54 #include <linux/irq.h>
55 #include <linux/module.h> 55 #include <linux/module.h>
56 56
57 #include "mach_traps.h" 57 #include "mach_traps.h"
58 58
59 asmlinkage int system_call(void); 59 asmlinkage int system_call(void);
60 60
61 struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 }, 61 struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
62 { 0, 0 }, { 0, 0 } }; 62 { 0, 0 }, { 0, 0 } };
63 63
64 /* Do we ignore FPU interrupts ? */ 64 /* Do we ignore FPU interrupts ? */
65 char ignore_fpu_irq = 0; 65 char ignore_fpu_irq = 0;
66 66
67 /* 67 /*
68 * The IDT has to be page-aligned to simplify the Pentium 68 * The IDT has to be page-aligned to simplify the Pentium
69 * F0 0F bug workaround.. We have a special link segment 69 * F0 0F bug workaround.. We have a special link segment
70 * for this. 70 * for this.
71 */ 71 */
72 struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, }; 72 struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
73 73
74 asmlinkage void divide_error(void); 74 asmlinkage void divide_error(void);
75 asmlinkage void debug(void); 75 asmlinkage void debug(void);
76 asmlinkage void nmi(void); 76 asmlinkage void nmi(void);
77 asmlinkage void int3(void); 77 asmlinkage void int3(void);
78 asmlinkage void overflow(void); 78 asmlinkage void overflow(void);
79 asmlinkage void bounds(void); 79 asmlinkage void bounds(void);
80 asmlinkage void invalid_op(void); 80 asmlinkage void invalid_op(void);
81 asmlinkage void device_not_available(void); 81 asmlinkage void device_not_available(void);
82 asmlinkage void coprocessor_segment_overrun(void); 82 asmlinkage void coprocessor_segment_overrun(void);
83 asmlinkage void invalid_TSS(void); 83 asmlinkage void invalid_TSS(void);
84 asmlinkage void segment_not_present(void); 84 asmlinkage void segment_not_present(void);
85 asmlinkage void stack_segment(void); 85 asmlinkage void stack_segment(void);
86 asmlinkage void general_protection(void); 86 asmlinkage void general_protection(void);
87 asmlinkage void page_fault(void); 87 asmlinkage void page_fault(void);
88 asmlinkage void coprocessor_error(void); 88 asmlinkage void coprocessor_error(void);
89 asmlinkage void simd_coprocessor_error(void); 89 asmlinkage void simd_coprocessor_error(void);
90 asmlinkage void alignment_check(void); 90 asmlinkage void alignment_check(void);
91 asmlinkage void spurious_interrupt_bug(void); 91 asmlinkage void spurious_interrupt_bug(void);
92 asmlinkage void machine_check(void); 92 asmlinkage void machine_check(void);
93 93
94 static int kstack_depth_to_print = 24; 94 static int kstack_depth_to_print = 24;
95 struct notifier_block *i386die_chain; 95 struct notifier_block *i386die_chain;
96 static DEFINE_SPINLOCK(die_notifier_lock); 96 static DEFINE_SPINLOCK(die_notifier_lock);
97 97
98 int register_die_notifier(struct notifier_block *nb) 98 int register_die_notifier(struct notifier_block *nb)
99 { 99 {
100 int err = 0; 100 int err = 0;
101 unsigned long flags; 101 unsigned long flags;
102 spin_lock_irqsave(&die_notifier_lock, flags); 102 spin_lock_irqsave(&die_notifier_lock, flags);
103 err = notifier_chain_register(&i386die_chain, nb); 103 err = notifier_chain_register(&i386die_chain, nb);
104 spin_unlock_irqrestore(&die_notifier_lock, flags); 104 spin_unlock_irqrestore(&die_notifier_lock, flags);
105 return err; 105 return err;
106 } 106 }
107 107
108 static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) 108 static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
109 { 109 {
110 return p > (void *)tinfo && 110 return p > (void *)tinfo &&
111 p < (void *)tinfo + THREAD_SIZE - 3; 111 p < (void *)tinfo + THREAD_SIZE - 3;
112 } 112 }
113 113
114 static inline unsigned long print_context_stack(struct thread_info *tinfo, 114 static inline unsigned long print_context_stack(struct thread_info *tinfo,
115 unsigned long *stack, unsigned long ebp) 115 unsigned long *stack, unsigned long ebp)
116 { 116 {
117 unsigned long addr; 117 unsigned long addr;
118 118
119 #ifdef CONFIG_FRAME_POINTER 119 #ifdef CONFIG_FRAME_POINTER
120 while (valid_stack_ptr(tinfo, (void *)ebp)) { 120 while (valid_stack_ptr(tinfo, (void *)ebp)) {
121 addr = *(unsigned long *)(ebp + 4); 121 addr = *(unsigned long *)(ebp + 4);
122 printk(" [<%08lx>] ", addr); 122 printk(" [<%08lx>] ", addr);
123 print_symbol("%s", addr); 123 print_symbol("%s", addr);
124 printk("\n"); 124 printk("\n");
125 ebp = *(unsigned long *)ebp; 125 ebp = *(unsigned long *)ebp;
126 } 126 }
127 #else 127 #else
128 while (valid_stack_ptr(tinfo, stack)) { 128 while (valid_stack_ptr(tinfo, stack)) {
129 addr = *stack++; 129 addr = *stack++;
130 if (__kernel_text_address(addr)) { 130 if (__kernel_text_address(addr)) {
131 printk(" [<%08lx>]", addr); 131 printk(" [<%08lx>]", addr);
132 print_symbol(" %s", addr); 132 print_symbol(" %s", addr);
133 printk("\n"); 133 printk("\n");
134 } 134 }
135 } 135 }
136 #endif 136 #endif
137 return ebp; 137 return ebp;
138 } 138 }
139 139
140 void show_trace(struct task_struct *task, unsigned long * stack) 140 void show_trace(struct task_struct *task, unsigned long * stack)
141 { 141 {
142 unsigned long ebp; 142 unsigned long ebp;
143 143
144 if (!task) 144 if (!task)
145 task = current; 145 task = current;
146 146
147 if (task == current) { 147 if (task == current) {
148 /* Grab ebp right from our regs */ 148 /* Grab ebp right from our regs */
149 asm ("movl %%ebp, %0" : "=r" (ebp) : ); 149 asm ("movl %%ebp, %0" : "=r" (ebp) : );
150 } else { 150 } else {
151 /* ebp is the last reg pushed by switch_to */ 151 /* ebp is the last reg pushed by switch_to */
152 ebp = *(unsigned long *) task->thread.esp; 152 ebp = *(unsigned long *) task->thread.esp;
153 } 153 }
154 154
155 while (1) { 155 while (1) {
156 struct thread_info *context; 156 struct thread_info *context;
157 context = (struct thread_info *) 157 context = (struct thread_info *)
158 ((unsigned long)stack & (~(THREAD_SIZE - 1))); 158 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
159 ebp = print_context_stack(context, stack, ebp); 159 ebp = print_context_stack(context, stack, ebp);
160 stack = (unsigned long*)context->previous_esp; 160 stack = (unsigned long*)context->previous_esp;
161 if (!stack) 161 if (!stack)
162 break; 162 break;
163 printk(" =======================\n"); 163 printk(" =======================\n");
164 } 164 }
165 } 165 }
166 166
167 void show_stack(struct task_struct *task, unsigned long *esp) 167 void show_stack(struct task_struct *task, unsigned long *esp)
168 { 168 {
169 unsigned long *stack; 169 unsigned long *stack;
170 int i; 170 int i;
171 171
172 if (esp == NULL) { 172 if (esp == NULL) {
173 if (task) 173 if (task)
174 esp = (unsigned long*)task->thread.esp; 174 esp = (unsigned long*)task->thread.esp;
175 else 175 else
176 esp = (unsigned long *)&esp; 176 esp = (unsigned long *)&esp;
177 } 177 }
178 178
179 stack = esp; 179 stack = esp;
180 for(i = 0; i < kstack_depth_to_print; i++) { 180 for(i = 0; i < kstack_depth_to_print; i++) {
181 if (kstack_end(stack)) 181 if (kstack_end(stack))
182 break; 182 break;
183 if (i && ((i % 8) == 0)) 183 if (i && ((i % 8) == 0))
184 printk("\n "); 184 printk("\n ");
185 printk("%08lx ", *stack++); 185 printk("%08lx ", *stack++);
186 } 186 }
187 printk("\nCall Trace:\n"); 187 printk("\nCall Trace:\n");
188 show_trace(task, esp); 188 show_trace(task, esp);
189 } 189 }
190 190
191 /* 191 /*
192 * The architecture-independent dump_stack generator 192 * The architecture-independent dump_stack generator
193 */ 193 */
194 void dump_stack(void) 194 void dump_stack(void)
195 { 195 {
196 unsigned long stack; 196 unsigned long stack;
197 197
198 show_trace(current, &stack); 198 show_trace(current, &stack);
199 } 199 }
200 200
201 EXPORT_SYMBOL(dump_stack); 201 EXPORT_SYMBOL(dump_stack);
202 202
203 void show_registers(struct pt_regs *regs) 203 void show_registers(struct pt_regs *regs)
204 { 204 {
205 int i; 205 int i;
206 int in_kernel = 1; 206 int in_kernel = 1;
207 unsigned long esp; 207 unsigned long esp;
208 unsigned short ss; 208 unsigned short ss;
209 209
210 esp = (unsigned long) (&regs->esp); 210 esp = (unsigned long) (&regs->esp);
211 ss = __KERNEL_DS; 211 ss = __KERNEL_DS;
212 if (regs->xcs & 3) { 212 if (regs->xcs & 3) {
213 in_kernel = 0; 213 in_kernel = 0;
214 esp = regs->esp; 214 esp = regs->esp;
215 ss = regs->xss & 0xffff; 215 ss = regs->xss & 0xffff;
216 } 216 }
217 print_modules(); 217 print_modules();
218 printk("CPU: %d\nEIP: %04x:[<%08lx>] %s VLI\nEFLAGS: %08lx" 218 printk("CPU: %d\nEIP: %04x:[<%08lx>] %s VLI\nEFLAGS: %08lx"
219 " (%s) \n", 219 " (%s) \n",
220 smp_processor_id(), 0xffff & regs->xcs, regs->eip, 220 smp_processor_id(), 0xffff & regs->xcs, regs->eip,
221 print_tainted(), regs->eflags, system_utsname.release); 221 print_tainted(), regs->eflags, system_utsname.release);
222 print_symbol("EIP is at %s\n", regs->eip); 222 print_symbol("EIP is at %s\n", regs->eip);
223 printk("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", 223 printk("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
224 regs->eax, regs->ebx, regs->ecx, regs->edx); 224 regs->eax, regs->ebx, regs->ecx, regs->edx);
225 printk("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", 225 printk("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
226 regs->esi, regs->edi, regs->ebp, esp); 226 regs->esi, regs->edi, regs->ebp, esp);
227 printk("ds: %04x es: %04x ss: %04x\n", 227 printk("ds: %04x es: %04x ss: %04x\n",
228 regs->xds & 0xffff, regs->xes & 0xffff, ss); 228 regs->xds & 0xffff, regs->xes & 0xffff, ss);
229 printk("Process %s (pid: %d, threadinfo=%p task=%p)", 229 printk("Process %s (pid: %d, threadinfo=%p task=%p)",
230 current->comm, current->pid, current_thread_info(), current); 230 current->comm, current->pid, current_thread_info(), current);
231 /* 231 /*
232 * When in-kernel, we also print out the stack and code at the 232 * When in-kernel, we also print out the stack and code at the
233 * time of the fault.. 233 * time of the fault..
234 */ 234 */
235 if (in_kernel) { 235 if (in_kernel) {
236 u8 *eip; 236 u8 *eip;
237 237
238 printk("\nStack: "); 238 printk("\nStack: ");
239 show_stack(NULL, (unsigned long*)esp); 239 show_stack(NULL, (unsigned long*)esp);
240 240
241 printk("Code: "); 241 printk("Code: ");
242 242
243 eip = (u8 *)regs->eip - 43; 243 eip = (u8 *)regs->eip - 43;
244 for (i = 0; i < 64; i++, eip++) { 244 for (i = 0; i < 64; i++, eip++) {
245 unsigned char c; 245 unsigned char c;
246 246
247 if (eip < (u8 *)PAGE_OFFSET || __get_user(c, eip)) { 247 if (eip < (u8 *)PAGE_OFFSET || __get_user(c, eip)) {
248 printk(" Bad EIP value."); 248 printk(" Bad EIP value.");
249 break; 249 break;
250 } 250 }
251 if (eip == (u8 *)regs->eip) 251 if (eip == (u8 *)regs->eip)
252 printk("<%02x> ", c); 252 printk("<%02x> ", c);
253 else 253 else
254 printk("%02x ", c); 254 printk("%02x ", c);
255 } 255 }
256 } 256 }
257 printk("\n"); 257 printk("\n");
258 } 258 }
259 259
260 static void handle_BUG(struct pt_regs *regs) 260 static void handle_BUG(struct pt_regs *regs)
261 { 261 {
262 unsigned short ud2; 262 unsigned short ud2;
263 unsigned short line; 263 unsigned short line;
264 char *file; 264 char *file;
265 char c; 265 char c;
266 unsigned long eip; 266 unsigned long eip;
267 267
268 if (regs->xcs & 3) 268 if (regs->xcs & 3)
269 goto no_bug; /* Not in kernel */ 269 goto no_bug; /* Not in kernel */
270 270
271 eip = regs->eip; 271 eip = regs->eip;
272 272
273 if (eip < PAGE_OFFSET) 273 if (eip < PAGE_OFFSET)
274 goto no_bug; 274 goto no_bug;
275 if (__get_user(ud2, (unsigned short *)eip)) 275 if (__get_user(ud2, (unsigned short *)eip))
276 goto no_bug; 276 goto no_bug;
277 if (ud2 != 0x0b0f) 277 if (ud2 != 0x0b0f)
278 goto no_bug; 278 goto no_bug;
279 if (__get_user(line, (unsigned short *)(eip + 2))) 279 if (__get_user(line, (unsigned short *)(eip + 2)))
280 goto bug; 280 goto bug;
281 if (__get_user(file, (char **)(eip + 4)) || 281 if (__get_user(file, (char **)(eip + 4)) ||
282 (unsigned long)file < PAGE_OFFSET || __get_user(c, file)) 282 (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
283 file = "<bad filename>"; 283 file = "<bad filename>";
284 284
285 printk("------------[ cut here ]------------\n"); 285 printk("------------[ cut here ]------------\n");
286 printk(KERN_ALERT "kernel BUG at %s:%d!\n", file, line); 286 printk(KERN_ALERT "kernel BUG at %s:%d!\n", file, line);
287 287
288 no_bug: 288 no_bug:
289 return; 289 return;
290 290
291 /* Here we know it was a BUG but file-n-line is unavailable */ 291 /* Here we know it was a BUG but file-n-line is unavailable */
292 bug: 292 bug:
293 printk("Kernel BUG\n"); 293 printk("Kernel BUG\n");
294 } 294 }
295 295
296 void die(const char * str, struct pt_regs * regs, long err) 296 void die(const char * str, struct pt_regs * regs, long err)
297 { 297 {
298 static struct { 298 static struct {
299 spinlock_t lock; 299 spinlock_t lock;
300 u32 lock_owner; 300 u32 lock_owner;
301 int lock_owner_depth; 301 int lock_owner_depth;
302 } die = { 302 } die = {
303 .lock = SPIN_LOCK_UNLOCKED, 303 .lock = SPIN_LOCK_UNLOCKED,
304 .lock_owner = -1, 304 .lock_owner = -1,
305 .lock_owner_depth = 0 305 .lock_owner_depth = 0
306 }; 306 };
307 static int die_counter; 307 static int die_counter;
308 308
309 if (die.lock_owner != _smp_processor_id()) { 309 if (die.lock_owner != raw_smp_processor_id()) {
310 console_verbose(); 310 console_verbose();
311 spin_lock_irq(&die.lock); 311 spin_lock_irq(&die.lock);
312 die.lock_owner = smp_processor_id(); 312 die.lock_owner = smp_processor_id();
313 die.lock_owner_depth = 0; 313 die.lock_owner_depth = 0;
314 bust_spinlocks(1); 314 bust_spinlocks(1);
315 } 315 }
316 316
317 if (++die.lock_owner_depth < 3) { 317 if (++die.lock_owner_depth < 3) {
318 int nl = 0; 318 int nl = 0;
319 handle_BUG(regs); 319 handle_BUG(regs);
320 printk(KERN_ALERT "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter); 320 printk(KERN_ALERT "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
321 #ifdef CONFIG_PREEMPT 321 #ifdef CONFIG_PREEMPT
322 printk("PREEMPT "); 322 printk("PREEMPT ");
323 nl = 1; 323 nl = 1;
324 #endif 324 #endif
325 #ifdef CONFIG_SMP 325 #ifdef CONFIG_SMP
326 printk("SMP "); 326 printk("SMP ");
327 nl = 1; 327 nl = 1;
328 #endif 328 #endif
329 #ifdef CONFIG_DEBUG_PAGEALLOC 329 #ifdef CONFIG_DEBUG_PAGEALLOC
330 printk("DEBUG_PAGEALLOC"); 330 printk("DEBUG_PAGEALLOC");
331 nl = 1; 331 nl = 1;
332 #endif 332 #endif
333 if (nl) 333 if (nl)
334 printk("\n"); 334 printk("\n");
335 notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV); 335 notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV);
336 show_registers(regs); 336 show_registers(regs);
337 } else 337 } else
338 printk(KERN_ERR "Recursive die() failure, output suppressed\n"); 338 printk(KERN_ERR "Recursive die() failure, output suppressed\n");
339 339
340 bust_spinlocks(0); 340 bust_spinlocks(0);
341 die.lock_owner = -1; 341 die.lock_owner = -1;
342 spin_unlock_irq(&die.lock); 342 spin_unlock_irq(&die.lock);
343 if (in_interrupt()) 343 if (in_interrupt())
344 panic("Fatal exception in interrupt"); 344 panic("Fatal exception in interrupt");
345 345
346 if (panic_on_oops) { 346 if (panic_on_oops) {
347 printk(KERN_EMERG "Fatal exception: panic in 5 seconds\n"); 347 printk(KERN_EMERG "Fatal exception: panic in 5 seconds\n");
348 ssleep(5); 348 ssleep(5);
349 panic("Fatal exception"); 349 panic("Fatal exception");
350 } 350 }
351 do_exit(SIGSEGV); 351 do_exit(SIGSEGV);
352 } 352 }
353 353
354 static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err) 354 static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
355 { 355 {
356 if (!(regs->eflags & VM_MASK) && !(3 & regs->xcs)) 356 if (!(regs->eflags & VM_MASK) && !(3 & regs->xcs))
357 die(str, regs, err); 357 die(str, regs, err);
358 } 358 }
359 359
360 static void do_trap(int trapnr, int signr, char *str, int vm86, 360 static void do_trap(int trapnr, int signr, char *str, int vm86,
361 struct pt_regs * regs, long error_code, siginfo_t *info) 361 struct pt_regs * regs, long error_code, siginfo_t *info)
362 { 362 {
363 if (regs->eflags & VM_MASK) { 363 if (regs->eflags & VM_MASK) {
364 if (vm86) 364 if (vm86)
365 goto vm86_trap; 365 goto vm86_trap;
366 goto trap_signal; 366 goto trap_signal;
367 } 367 }
368 368
369 if (!(regs->xcs & 3)) 369 if (!(regs->xcs & 3))
370 goto kernel_trap; 370 goto kernel_trap;
371 371
372 trap_signal: { 372 trap_signal: {
373 struct task_struct *tsk = current; 373 struct task_struct *tsk = current;
374 tsk->thread.error_code = error_code; 374 tsk->thread.error_code = error_code;
375 tsk->thread.trap_no = trapnr; 375 tsk->thread.trap_no = trapnr;
376 if (info) 376 if (info)
377 force_sig_info(signr, info, tsk); 377 force_sig_info(signr, info, tsk);
378 else 378 else
379 force_sig(signr, tsk); 379 force_sig(signr, tsk);
380 return; 380 return;
381 } 381 }
382 382
383 kernel_trap: { 383 kernel_trap: {
384 if (!fixup_exception(regs)) 384 if (!fixup_exception(regs))
385 die(str, regs, error_code); 385 die(str, regs, error_code);
386 return; 386 return;
387 } 387 }
388 388
389 vm86_trap: { 389 vm86_trap: {
390 int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr); 390 int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr);
391 if (ret) goto trap_signal; 391 if (ret) goto trap_signal;
392 return; 392 return;
393 } 393 }
394 } 394 }
395 395
396 #define DO_ERROR(trapnr, signr, str, name) \ 396 #define DO_ERROR(trapnr, signr, str, name) \
397 fastcall void do_##name(struct pt_regs * regs, long error_code) \ 397 fastcall void do_##name(struct pt_regs * regs, long error_code) \
398 { \ 398 { \
399 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 399 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
400 == NOTIFY_STOP) \ 400 == NOTIFY_STOP) \
401 return; \ 401 return; \
402 do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ 402 do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
403 } 403 }
404 404
405 #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ 405 #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
406 fastcall void do_##name(struct pt_regs * regs, long error_code) \ 406 fastcall void do_##name(struct pt_regs * regs, long error_code) \
407 { \ 407 { \
408 siginfo_t info; \ 408 siginfo_t info; \
409 info.si_signo = signr; \ 409 info.si_signo = signr; \
410 info.si_errno = 0; \ 410 info.si_errno = 0; \
411 info.si_code = sicode; \ 411 info.si_code = sicode; \
412 info.si_addr = (void __user *)siaddr; \ 412 info.si_addr = (void __user *)siaddr; \
413 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 413 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
414 == NOTIFY_STOP) \ 414 == NOTIFY_STOP) \
415 return; \ 415 return; \
416 do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ 416 do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
417 } 417 }
418 418
419 #define DO_VM86_ERROR(trapnr, signr, str, name) \ 419 #define DO_VM86_ERROR(trapnr, signr, str, name) \
420 fastcall void do_##name(struct pt_regs * regs, long error_code) \ 420 fastcall void do_##name(struct pt_regs * regs, long error_code) \
421 { \ 421 { \
422 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 422 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
423 == NOTIFY_STOP) \ 423 == NOTIFY_STOP) \
424 return; \ 424 return; \
425 do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ 425 do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
426 } 426 }
427 427
428 #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ 428 #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
429 fastcall void do_##name(struct pt_regs * regs, long error_code) \ 429 fastcall void do_##name(struct pt_regs * regs, long error_code) \
430 { \ 430 { \
431 siginfo_t info; \ 431 siginfo_t info; \
432 info.si_signo = signr; \ 432 info.si_signo = signr; \
433 info.si_errno = 0; \ 433 info.si_errno = 0; \
434 info.si_code = sicode; \ 434 info.si_code = sicode; \
435 info.si_addr = (void __user *)siaddr; \ 435 info.si_addr = (void __user *)siaddr; \
436 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 436 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
437 == NOTIFY_STOP) \ 437 == NOTIFY_STOP) \
438 return; \ 438 return; \
439 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ 439 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
440 } 440 }
441 441
442 DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip) 442 DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip)
443 #ifndef CONFIG_KPROBES 443 #ifndef CONFIG_KPROBES
444 DO_VM86_ERROR( 3, SIGTRAP, "int3", int3) 444 DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
445 #endif 445 #endif
446 DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow) 446 DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
447 DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds) 447 DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
448 DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->eip) 448 DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->eip)
449 DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) 449 DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
450 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) 450 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
451 DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) 451 DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
452 DO_ERROR(12, SIGBUS, "stack segment", stack_segment) 452 DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
453 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) 453 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
454 DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0) 454 DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0)
455 455
456 fastcall void do_general_protection(struct pt_regs * regs, long error_code) 456 fastcall void do_general_protection(struct pt_regs * regs, long error_code)
457 { 457 {
458 int cpu = get_cpu(); 458 int cpu = get_cpu();
459 struct tss_struct *tss = &per_cpu(init_tss, cpu); 459 struct tss_struct *tss = &per_cpu(init_tss, cpu);
460 struct thread_struct *thread = &current->thread; 460 struct thread_struct *thread = &current->thread;
461 461
462 /* 462 /*
463 * Perform the lazy TSS's I/O bitmap copy. If the TSS has an 463 * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
464 * invalid offset set (the LAZY one) and the faulting thread has 464 * invalid offset set (the LAZY one) and the faulting thread has
465 * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS 465 * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS
466 * and we set the offset field correctly. Then we let the CPU to 466 * and we set the offset field correctly. Then we let the CPU to
467 * restart the faulting instruction. 467 * restart the faulting instruction.
468 */ 468 */
469 if (tss->io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && 469 if (tss->io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
470 thread->io_bitmap_ptr) { 470 thread->io_bitmap_ptr) {
471 memcpy(tss->io_bitmap, thread->io_bitmap_ptr, 471 memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
472 thread->io_bitmap_max); 472 thread->io_bitmap_max);
473 /* 473 /*
474 * If the previously set map was extending to higher ports 474 * If the previously set map was extending to higher ports
475 * than the current one, pad extra space with 0xff (no access). 475 * than the current one, pad extra space with 0xff (no access).
476 */ 476 */
477 if (thread->io_bitmap_max < tss->io_bitmap_max) 477 if (thread->io_bitmap_max < tss->io_bitmap_max)
478 memset((char *) tss->io_bitmap + 478 memset((char *) tss->io_bitmap +
479 thread->io_bitmap_max, 0xff, 479 thread->io_bitmap_max, 0xff,
480 tss->io_bitmap_max - thread->io_bitmap_max); 480 tss->io_bitmap_max - thread->io_bitmap_max);
481 tss->io_bitmap_max = thread->io_bitmap_max; 481 tss->io_bitmap_max = thread->io_bitmap_max;
482 tss->io_bitmap_base = IO_BITMAP_OFFSET; 482 tss->io_bitmap_base = IO_BITMAP_OFFSET;
483 put_cpu(); 483 put_cpu();
484 return; 484 return;
485 } 485 }
486 put_cpu(); 486 put_cpu();
487 487
488 if (regs->eflags & VM_MASK) 488 if (regs->eflags & VM_MASK)
489 goto gp_in_vm86; 489 goto gp_in_vm86;
490 490
491 if (!(regs->xcs & 3)) 491 if (!(regs->xcs & 3))
492 goto gp_in_kernel; 492 goto gp_in_kernel;
493 493
494 current->thread.error_code = error_code; 494 current->thread.error_code = error_code;
495 current->thread.trap_no = 13; 495 current->thread.trap_no = 13;
496 force_sig(SIGSEGV, current); 496 force_sig(SIGSEGV, current);
497 return; 497 return;
498 498
499 gp_in_vm86: 499 gp_in_vm86:
500 local_irq_enable(); 500 local_irq_enable();
501 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); 501 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
502 return; 502 return;
503 503
504 gp_in_kernel: 504 gp_in_kernel:
505 if (!fixup_exception(regs)) { 505 if (!fixup_exception(regs)) {
506 if (notify_die(DIE_GPF, "general protection fault", regs, 506 if (notify_die(DIE_GPF, "general protection fault", regs,
507 error_code, 13, SIGSEGV) == NOTIFY_STOP) 507 error_code, 13, SIGSEGV) == NOTIFY_STOP)
508 return; 508 return;
509 die("general protection fault", regs, error_code); 509 die("general protection fault", regs, error_code);
510 } 510 }
511 } 511 }
512 512
513 static void mem_parity_error(unsigned char reason, struct pt_regs * regs) 513 static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
514 { 514 {
515 printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n"); 515 printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
516 printk("You probably have a hardware problem with your RAM chips\n"); 516 printk("You probably have a hardware problem with your RAM chips\n");
517 517
518 /* Clear and disable the memory parity error line. */ 518 /* Clear and disable the memory parity error line. */
519 clear_mem_error(reason); 519 clear_mem_error(reason);
520 } 520 }
521 521
522 static void io_check_error(unsigned char reason, struct pt_regs * regs) 522 static void io_check_error(unsigned char reason, struct pt_regs * regs)
523 { 523 {
524 unsigned long i; 524 unsigned long i;
525 525
526 printk("NMI: IOCK error (debug interrupt?)\n"); 526 printk("NMI: IOCK error (debug interrupt?)\n");
527 show_registers(regs); 527 show_registers(regs);
528 528
529 /* Re-enable the IOCK line, wait for a few seconds */ 529 /* Re-enable the IOCK line, wait for a few seconds */
530 reason = (reason & 0xf) | 8; 530 reason = (reason & 0xf) | 8;
531 outb(reason, 0x61); 531 outb(reason, 0x61);
532 i = 2000; 532 i = 2000;
533 while (--i) udelay(1000); 533 while (--i) udelay(1000);
534 reason &= ~8; 534 reason &= ~8;
535 outb(reason, 0x61); 535 outb(reason, 0x61);
536 } 536 }
537 537
538 static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) 538 static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
539 { 539 {
540 #ifdef CONFIG_MCA 540 #ifdef CONFIG_MCA
541 /* Might actually be able to figure out what the guilty party 541 /* Might actually be able to figure out what the guilty party
542 * is. */ 542 * is. */
543 if( MCA_bus ) { 543 if( MCA_bus ) {
544 mca_handle_nmi(); 544 mca_handle_nmi();
545 return; 545 return;
546 } 546 }
547 #endif 547 #endif
548 printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", 548 printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
549 reason, smp_processor_id()); 549 reason, smp_processor_id());
550 printk("Dazed and confused, but trying to continue\n"); 550 printk("Dazed and confused, but trying to continue\n");
551 printk("Do you have a strange power saving mode enabled?\n"); 551 printk("Do you have a strange power saving mode enabled?\n");
552 } 552 }
553 553
554 static DEFINE_SPINLOCK(nmi_print_lock); 554 static DEFINE_SPINLOCK(nmi_print_lock);
555 555
556 void die_nmi (struct pt_regs *regs, const char *msg) 556 void die_nmi (struct pt_regs *regs, const char *msg)
557 { 557 {
558 spin_lock(&nmi_print_lock); 558 spin_lock(&nmi_print_lock);
559 /* 559 /*
560 * We are in trouble anyway, lets at least try 560 * We are in trouble anyway, lets at least try
561 * to get a message out. 561 * to get a message out.
562 */ 562 */
563 bust_spinlocks(1); 563 bust_spinlocks(1);
564 printk(msg); 564 printk(msg);
565 printk(" on CPU%d, eip %08lx, registers:\n", 565 printk(" on CPU%d, eip %08lx, registers:\n",
566 smp_processor_id(), regs->eip); 566 smp_processor_id(), regs->eip);
567 show_registers(regs); 567 show_registers(regs);
568 printk("console shuts up ...\n"); 568 printk("console shuts up ...\n");
569 console_silent(); 569 console_silent();
570 spin_unlock(&nmi_print_lock); 570 spin_unlock(&nmi_print_lock);
571 bust_spinlocks(0); 571 bust_spinlocks(0);
572 do_exit(SIGSEGV); 572 do_exit(SIGSEGV);
573 } 573 }
574 574
575 static void default_do_nmi(struct pt_regs * regs) 575 static void default_do_nmi(struct pt_regs * regs)
576 { 576 {
577 unsigned char reason = 0; 577 unsigned char reason = 0;
578 578
579 /* Only the BSP gets external NMIs from the system. */ 579 /* Only the BSP gets external NMIs from the system. */
580 if (!smp_processor_id()) 580 if (!smp_processor_id())
581 reason = get_nmi_reason(); 581 reason = get_nmi_reason();
582 582
583 if (!(reason & 0xc0)) { 583 if (!(reason & 0xc0)) {
584 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT) 584 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT)
585 == NOTIFY_STOP) 585 == NOTIFY_STOP)
586 return; 586 return;
587 #ifdef CONFIG_X86_LOCAL_APIC 587 #ifdef CONFIG_X86_LOCAL_APIC
588 /* 588 /*
589 * Ok, so this is none of the documented NMI sources, 589 * Ok, so this is none of the documented NMI sources,
590 * so it must be the NMI watchdog. 590 * so it must be the NMI watchdog.
591 */ 591 */
592 if (nmi_watchdog) { 592 if (nmi_watchdog) {
593 nmi_watchdog_tick(regs); 593 nmi_watchdog_tick(regs);
594 return; 594 return;
595 } 595 }
596 #endif 596 #endif
597 unknown_nmi_error(reason, regs); 597 unknown_nmi_error(reason, regs);
598 return; 598 return;
599 } 599 }
600 if (notify_die(DIE_NMI, "nmi", regs, reason, 0, SIGINT) == NOTIFY_STOP) 600 if (notify_die(DIE_NMI, "nmi", regs, reason, 0, SIGINT) == NOTIFY_STOP)
601 return; 601 return;
602 if (reason & 0x80) 602 if (reason & 0x80)
603 mem_parity_error(reason, regs); 603 mem_parity_error(reason, regs);
604 if (reason & 0x40) 604 if (reason & 0x40)
605 io_check_error(reason, regs); 605 io_check_error(reason, regs);
606 /* 606 /*
607 * Reassert NMI in case it became active meanwhile 607 * Reassert NMI in case it became active meanwhile
608 * as it's edge-triggered. 608 * as it's edge-triggered.
609 */ 609 */
610 reassert_nmi(); 610 reassert_nmi();
611 } 611 }
612 612
613 static int dummy_nmi_callback(struct pt_regs * regs, int cpu) 613 static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
614 { 614 {
615 return 0; 615 return 0;
616 } 616 }
617 617
618 static nmi_callback_t nmi_callback = dummy_nmi_callback; 618 static nmi_callback_t nmi_callback = dummy_nmi_callback;
619 619
620 fastcall void do_nmi(struct pt_regs * regs, long error_code) 620 fastcall void do_nmi(struct pt_regs * regs, long error_code)
621 { 621 {
622 int cpu; 622 int cpu;
623 623
624 nmi_enter(); 624 nmi_enter();
625 625
626 cpu = smp_processor_id(); 626 cpu = smp_processor_id();
627 ++nmi_count(cpu); 627 ++nmi_count(cpu);
628 628
629 if (!nmi_callback(regs, cpu)) 629 if (!nmi_callback(regs, cpu))
630 default_do_nmi(regs); 630 default_do_nmi(regs);
631 631
632 nmi_exit(); 632 nmi_exit();
633 } 633 }
634 634
635 void set_nmi_callback(nmi_callback_t callback) 635 void set_nmi_callback(nmi_callback_t callback)
636 { 636 {
637 nmi_callback = callback; 637 nmi_callback = callback;
638 } 638 }
639 639
640 void unset_nmi_callback(void) 640 void unset_nmi_callback(void)
641 { 641 {
642 nmi_callback = dummy_nmi_callback; 642 nmi_callback = dummy_nmi_callback;
643 } 643 }
644 644
645 #ifdef CONFIG_KPROBES 645 #ifdef CONFIG_KPROBES
646 fastcall void do_int3(struct pt_regs *regs, long error_code) 646 fastcall void do_int3(struct pt_regs *regs, long error_code)
647 { 647 {
648 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) 648 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
649 == NOTIFY_STOP) 649 == NOTIFY_STOP)
650 return; 650 return;
651 /* This is an interrupt gate, because kprobes wants interrupts 651 /* This is an interrupt gate, because kprobes wants interrupts
652 disabled. Normal trap handlers don't. */ 652 disabled. Normal trap handlers don't. */
653 restore_interrupts(regs); 653 restore_interrupts(regs);
654 do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); 654 do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
655 } 655 }
656 #endif 656 #endif
657 657
658 /* 658 /*
659 * Our handling of the processor debug registers is non-trivial. 659 * Our handling of the processor debug registers is non-trivial.
660 * We do not clear them on entry and exit from the kernel. Therefore 660 * We do not clear them on entry and exit from the kernel. Therefore
661 * it is possible to get a watchpoint trap here from inside the kernel. 661 * it is possible to get a watchpoint trap here from inside the kernel.
662 * However, the code in ./ptrace.c has ensured that the user can 662 * However, the code in ./ptrace.c has ensured that the user can
663 * only set watchpoints on userspace addresses. Therefore the in-kernel 663 * only set watchpoints on userspace addresses. Therefore the in-kernel
664 * watchpoint trap can only occur in code which is reading/writing 664 * watchpoint trap can only occur in code which is reading/writing
665 * from user space. Such code must not hold kernel locks (since it 665 * from user space. Such code must not hold kernel locks (since it
666 * can equally take a page fault), therefore it is safe to call 666 * can equally take a page fault), therefore it is safe to call
667 * force_sig_info even though that claims and releases locks. 667 * force_sig_info even though that claims and releases locks.
668 * 668 *
669 * Code in ./signal.c ensures that the debug control register 669 * Code in ./signal.c ensures that the debug control register
670 * is restored before we deliver any signal, and therefore that 670 * is restored before we deliver any signal, and therefore that
671 * user code runs with the correct debug control register even though 671 * user code runs with the correct debug control register even though
672 * we clear it here. 672 * we clear it here.
673 * 673 *
674 * Being careful here means that we don't have to be as careful in a 674 * Being careful here means that we don't have to be as careful in a
675 * lot of more complicated places (task switching can be a bit lazy 675 * lot of more complicated places (task switching can be a bit lazy
676 * about restoring all the debug state, and ptrace doesn't have to 676 * about restoring all the debug state, and ptrace doesn't have to
677 * find every occurrence of the TF bit that could be saved away even 677 * find every occurrence of the TF bit that could be saved away even
678 * by user code) 678 * by user code)
679 */ 679 */
680 fastcall void do_debug(struct pt_regs * regs, long error_code) 680 fastcall void do_debug(struct pt_regs * regs, long error_code)
681 { 681 {
682 unsigned int condition; 682 unsigned int condition;
683 struct task_struct *tsk = current; 683 struct task_struct *tsk = current;
684 684
685 __asm__ __volatile__("movl %%db6,%0" : "=r" (condition)); 685 __asm__ __volatile__("movl %%db6,%0" : "=r" (condition));
686 686
687 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, 687 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
688 SIGTRAP) == NOTIFY_STOP) 688 SIGTRAP) == NOTIFY_STOP)
689 return; 689 return;
690 /* It's safe to allow irq's after DR6 has been saved */ 690 /* It's safe to allow irq's after DR6 has been saved */
691 if (regs->eflags & X86_EFLAGS_IF) 691 if (regs->eflags & X86_EFLAGS_IF)
692 local_irq_enable(); 692 local_irq_enable();
693 693
694 /* Mask out spurious debug traps due to lazy DR7 setting */ 694 /* Mask out spurious debug traps due to lazy DR7 setting */
695 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { 695 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
696 if (!tsk->thread.debugreg[7]) 696 if (!tsk->thread.debugreg[7])
697 goto clear_dr7; 697 goto clear_dr7;
698 } 698 }
699 699
700 if (regs->eflags & VM_MASK) 700 if (regs->eflags & VM_MASK)
701 goto debug_vm86; 701 goto debug_vm86;
702 702
703 /* Save debug status register where ptrace can see it */ 703 /* Save debug status register where ptrace can see it */
704 tsk->thread.debugreg[6] = condition; 704 tsk->thread.debugreg[6] = condition;
705 705
706 /* 706 /*
707 * Single-stepping through TF: make sure we ignore any events in 707 * Single-stepping through TF: make sure we ignore any events in
708 * kernel space (but re-enable TF when returning to user mode). 708 * kernel space (but re-enable TF when returning to user mode).
709 */ 709 */
710 if (condition & DR_STEP) { 710 if (condition & DR_STEP) {
711 /* 711 /*
712 * We already checked v86 mode above, so we can 712 * We already checked v86 mode above, so we can
713 * check for kernel mode by just checking the CPL 713 * check for kernel mode by just checking the CPL
714 * of CS. 714 * of CS.
715 */ 715 */
716 if ((regs->xcs & 3) == 0) 716 if ((regs->xcs & 3) == 0)
717 goto clear_TF_reenable; 717 goto clear_TF_reenable;
718 } 718 }
719 719
720 /* Ok, finally something we can handle */ 720 /* Ok, finally something we can handle */
721 send_sigtrap(tsk, regs, error_code); 721 send_sigtrap(tsk, regs, error_code);
722 722
723 /* Disable additional traps. They'll be re-enabled when 723 /* Disable additional traps. They'll be re-enabled when
724 * the signal is delivered. 724 * the signal is delivered.
725 */ 725 */
726 clear_dr7: 726 clear_dr7:
727 __asm__("movl %0,%%db7" 727 __asm__("movl %0,%%db7"
728 : /* no output */ 728 : /* no output */
729 : "r" (0)); 729 : "r" (0));
730 return; 730 return;
731 731
732 debug_vm86: 732 debug_vm86:
733 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); 733 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
734 return; 734 return;
735 735
736 clear_TF_reenable: 736 clear_TF_reenable:
737 set_tsk_thread_flag(tsk, TIF_SINGLESTEP); 737 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
738 regs->eflags &= ~TF_MASK; 738 regs->eflags &= ~TF_MASK;
739 return; 739 return;
740 } 740 }
741 741
742 /* 742 /*
743 * Note that we play around with the 'TS' bit in an attempt to get 743 * Note that we play around with the 'TS' bit in an attempt to get
744 * the correct behaviour even in the presence of the asynchronous 744 * the correct behaviour even in the presence of the asynchronous
745 * IRQ13 behaviour 745 * IRQ13 behaviour
746 */ 746 */
747 void math_error(void __user *eip) 747 void math_error(void __user *eip)
748 { 748 {
749 struct task_struct * task; 749 struct task_struct * task;
750 siginfo_t info; 750 siginfo_t info;
751 unsigned short cwd, swd; 751 unsigned short cwd, swd;
752 752
753 /* 753 /*
754 * Save the info for the exception handler and clear the error. 754 * Save the info for the exception handler and clear the error.
755 */ 755 */
756 task = current; 756 task = current;
757 save_init_fpu(task); 757 save_init_fpu(task);
758 task->thread.trap_no = 16; 758 task->thread.trap_no = 16;
759 task->thread.error_code = 0; 759 task->thread.error_code = 0;
760 info.si_signo = SIGFPE; 760 info.si_signo = SIGFPE;
761 info.si_errno = 0; 761 info.si_errno = 0;
762 info.si_code = __SI_FAULT; 762 info.si_code = __SI_FAULT;
763 info.si_addr = eip; 763 info.si_addr = eip;
764 /* 764 /*
765 * (~cwd & swd) will mask out exceptions that are not set to unmasked 765 * (~cwd & swd) will mask out exceptions that are not set to unmasked
766 * status. 0x3f is the exception bits in these regs, 0x200 is the 766 * status. 0x3f is the exception bits in these regs, 0x200 is the
767 * C1 reg you need in case of a stack fault, 0x040 is the stack 767 * C1 reg you need in case of a stack fault, 0x040 is the stack
768 * fault bit. We should only be taking one exception at a time, 768 * fault bit. We should only be taking one exception at a time,
769 * so if this combination doesn't produce any single exception, 769 * so if this combination doesn't produce any single exception,
770 * then we have a bad program that isn't syncronizing its FPU usage 770 * then we have a bad program that isn't syncronizing its FPU usage
771 * and it will suffer the consequences since we won't be able to 771 * and it will suffer the consequences since we won't be able to
772 * fully reproduce the context of the exception 772 * fully reproduce the context of the exception
773 */ 773 */
774 cwd = get_fpu_cwd(task); 774 cwd = get_fpu_cwd(task);
775 swd = get_fpu_swd(task); 775 swd = get_fpu_swd(task);
776 switch (((~cwd) & swd & 0x3f) | (swd & 0x240)) { 776 switch (((~cwd) & swd & 0x3f) | (swd & 0x240)) {
777 case 0x000: 777 case 0x000:
778 default: 778 default:
779 break; 779 break;
780 case 0x001: /* Invalid Op */ 780 case 0x001: /* Invalid Op */
781 case 0x041: /* Stack Fault */ 781 case 0x041: /* Stack Fault */
782 case 0x241: /* Stack Fault | Direction */ 782 case 0x241: /* Stack Fault | Direction */
783 info.si_code = FPE_FLTINV; 783 info.si_code = FPE_FLTINV;
784 /* Should we clear the SF or let user space do it ???? */ 784 /* Should we clear the SF or let user space do it ???? */
785 break; 785 break;
786 case 0x002: /* Denormalize */ 786 case 0x002: /* Denormalize */
787 case 0x010: /* Underflow */ 787 case 0x010: /* Underflow */
788 info.si_code = FPE_FLTUND; 788 info.si_code = FPE_FLTUND;
789 break; 789 break;
790 case 0x004: /* Zero Divide */ 790 case 0x004: /* Zero Divide */
791 info.si_code = FPE_FLTDIV; 791 info.si_code = FPE_FLTDIV;
792 break; 792 break;
793 case 0x008: /* Overflow */ 793 case 0x008: /* Overflow */
794 info.si_code = FPE_FLTOVF; 794 info.si_code = FPE_FLTOVF;
795 break; 795 break;
796 case 0x020: /* Precision */ 796 case 0x020: /* Precision */
797 info.si_code = FPE_FLTRES; 797 info.si_code = FPE_FLTRES;
798 break; 798 break;
799 } 799 }
800 force_sig_info(SIGFPE, &info, task); 800 force_sig_info(SIGFPE, &info, task);
801 } 801 }
802 802
803 fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code) 803 fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)
804 { 804 {
805 ignore_fpu_irq = 1; 805 ignore_fpu_irq = 1;
806 math_error((void __user *)regs->eip); 806 math_error((void __user *)regs->eip);
807 } 807 }
808 808
809 static void simd_math_error(void __user *eip) 809 static void simd_math_error(void __user *eip)
810 { 810 {
811 struct task_struct * task; 811 struct task_struct * task;
812 siginfo_t info; 812 siginfo_t info;
813 unsigned short mxcsr; 813 unsigned short mxcsr;
814 814
815 /* 815 /*
816 * Save the info for the exception handler and clear the error. 816 * Save the info for the exception handler and clear the error.
817 */ 817 */
818 task = current; 818 task = current;
819 save_init_fpu(task); 819 save_init_fpu(task);
820 task->thread.trap_no = 19; 820 task->thread.trap_no = 19;
821 task->thread.error_code = 0; 821 task->thread.error_code = 0;
822 info.si_signo = SIGFPE; 822 info.si_signo = SIGFPE;
823 info.si_errno = 0; 823 info.si_errno = 0;
824 info.si_code = __SI_FAULT; 824 info.si_code = __SI_FAULT;
825 info.si_addr = eip; 825 info.si_addr = eip;
826 /* 826 /*
827 * The SIMD FPU exceptions are handled a little differently, as there 827 * The SIMD FPU exceptions are handled a little differently, as there
828 * is only a single status/control register. Thus, to determine which 828 * is only a single status/control register. Thus, to determine which
829 * unmasked exception was caught we must mask the exception mask bits 829 * unmasked exception was caught we must mask the exception mask bits
830 * at 0x1f80, and then use these to mask the exception bits at 0x3f. 830 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
831 */ 831 */
832 mxcsr = get_fpu_mxcsr(task); 832 mxcsr = get_fpu_mxcsr(task);
833 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { 833 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
834 case 0x000: 834 case 0x000:
835 default: 835 default:
836 break; 836 break;
837 case 0x001: /* Invalid Op */ 837 case 0x001: /* Invalid Op */
838 info.si_code = FPE_FLTINV; 838 info.si_code = FPE_FLTINV;
839 break; 839 break;
840 case 0x002: /* Denormalize */ 840 case 0x002: /* Denormalize */
841 case 0x010: /* Underflow */ 841 case 0x010: /* Underflow */
842 info.si_code = FPE_FLTUND; 842 info.si_code = FPE_FLTUND;
843 break; 843 break;
844 case 0x004: /* Zero Divide */ 844 case 0x004: /* Zero Divide */
845 info.si_code = FPE_FLTDIV; 845 info.si_code = FPE_FLTDIV;
846 break; 846 break;
847 case 0x008: /* Overflow */ 847 case 0x008: /* Overflow */
848 info.si_code = FPE_FLTOVF; 848 info.si_code = FPE_FLTOVF;
849 break; 849 break;
850 case 0x020: /* Precision */ 850 case 0x020: /* Precision */
851 info.si_code = FPE_FLTRES; 851 info.si_code = FPE_FLTRES;
852 break; 852 break;
853 } 853 }
854 force_sig_info(SIGFPE, &info, task); 854 force_sig_info(SIGFPE, &info, task);
855 } 855 }
856 856
857 fastcall void do_simd_coprocessor_error(struct pt_regs * regs, 857 fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
858 long error_code) 858 long error_code)
859 { 859 {
860 if (cpu_has_xmm) { 860 if (cpu_has_xmm) {
861 /* Handle SIMD FPU exceptions on PIII+ processors. */ 861 /* Handle SIMD FPU exceptions on PIII+ processors. */
862 ignore_fpu_irq = 1; 862 ignore_fpu_irq = 1;
863 simd_math_error((void __user *)regs->eip); 863 simd_math_error((void __user *)regs->eip);
864 } else { 864 } else {
865 /* 865 /*
866 * Handle strange cache flush from user space exception 866 * Handle strange cache flush from user space exception
867 * in all other cases. This is undocumented behaviour. 867 * in all other cases. This is undocumented behaviour.
868 */ 868 */
869 if (regs->eflags & VM_MASK) { 869 if (regs->eflags & VM_MASK) {
870 handle_vm86_fault((struct kernel_vm86_regs *)regs, 870 handle_vm86_fault((struct kernel_vm86_regs *)regs,
871 error_code); 871 error_code);
872 return; 872 return;
873 } 873 }
874 die_if_kernel("cache flush denied", regs, error_code); 874 die_if_kernel("cache flush denied", regs, error_code);
875 current->thread.trap_no = 19; 875 current->thread.trap_no = 19;
876 current->thread.error_code = error_code; 876 current->thread.error_code = error_code;
877 force_sig(SIGSEGV, current); 877 force_sig(SIGSEGV, current);
878 } 878 }
879 } 879 }
880 880
881 fastcall void do_spurious_interrupt_bug(struct pt_regs * regs, 881 fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
882 long error_code) 882 long error_code)
883 { 883 {
884 #if 0 884 #if 0
885 /* No need to warn about this any longer. */ 885 /* No need to warn about this any longer. */
886 printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); 886 printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
887 #endif 887 #endif
888 } 888 }
889 889
890 fastcall void setup_x86_bogus_stack(unsigned char * stk) 890 fastcall void setup_x86_bogus_stack(unsigned char * stk)
891 { 891 {
892 unsigned long *switch16_ptr, *switch32_ptr; 892 unsigned long *switch16_ptr, *switch32_ptr;
893 struct pt_regs *regs; 893 struct pt_regs *regs;
894 unsigned long stack_top, stack_bot; 894 unsigned long stack_top, stack_bot;
895 unsigned short iret_frame16_off; 895 unsigned short iret_frame16_off;
896 int cpu = smp_processor_id(); 896 int cpu = smp_processor_id();
897 /* reserve the space on 32bit stack for the magic switch16 pointer */ 897 /* reserve the space on 32bit stack for the magic switch16 pointer */
898 memmove(stk, stk + 8, sizeof(struct pt_regs)); 898 memmove(stk, stk + 8, sizeof(struct pt_regs));
899 switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs)); 899 switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs));
900 regs = (struct pt_regs *)stk; 900 regs = (struct pt_regs *)stk;
901 /* now the switch32 on 16bit stack */ 901 /* now the switch32 on 16bit stack */
902 stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu); 902 stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
903 stack_top = stack_bot + CPU_16BIT_STACK_SIZE; 903 stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
904 switch32_ptr = (unsigned long *)(stack_top - 8); 904 switch32_ptr = (unsigned long *)(stack_top - 8);
905 iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20; 905 iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20;
906 /* copy iret frame on 16bit stack */ 906 /* copy iret frame on 16bit stack */
907 memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20); 907 memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20);
908 /* fill in the switch pointers */ 908 /* fill in the switch pointers */
909 switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off; 909 switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off;
910 switch16_ptr[1] = __ESPFIX_SS; 910 switch16_ptr[1] = __ESPFIX_SS;
911 switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) + 911 switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
912 8 - CPU_16BIT_STACK_SIZE; 912 8 - CPU_16BIT_STACK_SIZE;
913 switch32_ptr[1] = __KERNEL_DS; 913 switch32_ptr[1] = __KERNEL_DS;
914 } 914 }
915 915
916 fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp) 916 fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
917 { 917 {
918 unsigned long *switch32_ptr; 918 unsigned long *switch32_ptr;
919 unsigned char *stack16, *stack32; 919 unsigned char *stack16, *stack32;
920 unsigned long stack_top, stack_bot; 920 unsigned long stack_top, stack_bot;
921 int len; 921 int len;
922 int cpu = smp_processor_id(); 922 int cpu = smp_processor_id();
923 stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu); 923 stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
924 stack_top = stack_bot + CPU_16BIT_STACK_SIZE; 924 stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
925 switch32_ptr = (unsigned long *)(stack_top - 8); 925 switch32_ptr = (unsigned long *)(stack_top - 8);
926 /* copy the data from 16bit stack to 32bit stack */ 926 /* copy the data from 16bit stack to 32bit stack */
927 len = CPU_16BIT_STACK_SIZE - 8 - sp; 927 len = CPU_16BIT_STACK_SIZE - 8 - sp;
928 stack16 = (unsigned char *)(stack_bot + sp); 928 stack16 = (unsigned char *)(stack_bot + sp);
929 stack32 = (unsigned char *) 929 stack32 = (unsigned char *)
930 (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len); 930 (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len);
931 memcpy(stack32, stack16, len); 931 memcpy(stack32, stack16, len);
932 return stack32; 932 return stack32;
933 } 933 }
934 934
935 /* 935 /*
936 * 'math_state_restore()' saves the current math information in the 936 * 'math_state_restore()' saves the current math information in the
937 * old math state array, and gets the new ones from the current task 937 * old math state array, and gets the new ones from the current task
938 * 938 *
939 * Careful.. There are problems with IBM-designed IRQ13 behaviour. 939 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
940 * Don't touch unless you *really* know how it works. 940 * Don't touch unless you *really* know how it works.
941 * 941 *
942 * Must be called with kernel preemption disabled (in this case, 942 * Must be called with kernel preemption disabled (in this case,
943 * local interrupts are disabled at the call-site in entry.S). 943 * local interrupts are disabled at the call-site in entry.S).
944 */ 944 */
945 asmlinkage void math_state_restore(struct pt_regs regs) 945 asmlinkage void math_state_restore(struct pt_regs regs)
946 { 946 {
947 struct thread_info *thread = current_thread_info(); 947 struct thread_info *thread = current_thread_info();
948 struct task_struct *tsk = thread->task; 948 struct task_struct *tsk = thread->task;
949 949
950 clts(); /* Allow maths ops (or we recurse) */ 950 clts(); /* Allow maths ops (or we recurse) */
951 if (!tsk_used_math(tsk)) 951 if (!tsk_used_math(tsk))
952 init_fpu(tsk); 952 init_fpu(tsk);
953 restore_fpu(tsk); 953 restore_fpu(tsk);
954 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ 954 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
955 } 955 }
956 956
957 #ifndef CONFIG_MATH_EMULATION 957 #ifndef CONFIG_MATH_EMULATION
958 958
959 asmlinkage void math_emulate(long arg) 959 asmlinkage void math_emulate(long arg)
960 { 960 {
961 printk("math-emulation not enabled and no coprocessor found.\n"); 961 printk("math-emulation not enabled and no coprocessor found.\n");
962 printk("killing %s.\n",current->comm); 962 printk("killing %s.\n",current->comm);
963 force_sig(SIGFPE,current); 963 force_sig(SIGFPE,current);
964 schedule(); 964 schedule();
965 } 965 }
966 966
967 #endif /* CONFIG_MATH_EMULATION */ 967 #endif /* CONFIG_MATH_EMULATION */
968 968
969 #ifdef CONFIG_X86_F00F_BUG 969 #ifdef CONFIG_X86_F00F_BUG
970 void __init trap_init_f00f_bug(void) 970 void __init trap_init_f00f_bug(void)
971 { 971 {
972 __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); 972 __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
973 973
974 /* 974 /*
975 * Update the IDT descriptor and reload the IDT so that 975 * Update the IDT descriptor and reload the IDT so that
976 * it uses the read-only mapped virtual address. 976 * it uses the read-only mapped virtual address.
977 */ 977 */
978 idt_descr.address = fix_to_virt(FIX_F00F_IDT); 978 idt_descr.address = fix_to_virt(FIX_F00F_IDT);
979 __asm__ __volatile__("lidt %0" : : "m" (idt_descr)); 979 __asm__ __volatile__("lidt %0" : : "m" (idt_descr));
980 } 980 }
981 #endif 981 #endif
982 982
983 #define _set_gate(gate_addr,type,dpl,addr,seg) \ 983 #define _set_gate(gate_addr,type,dpl,addr,seg) \
984 do { \ 984 do { \
985 int __d0, __d1; \ 985 int __d0, __d1; \
986 __asm__ __volatile__ ("movw %%dx,%%ax\n\t" \ 986 __asm__ __volatile__ ("movw %%dx,%%ax\n\t" \
987 "movw %4,%%dx\n\t" \ 987 "movw %4,%%dx\n\t" \
988 "movl %%eax,%0\n\t" \ 988 "movl %%eax,%0\n\t" \
989 "movl %%edx,%1" \ 989 "movl %%edx,%1" \
990 :"=m" (*((long *) (gate_addr))), \ 990 :"=m" (*((long *) (gate_addr))), \
991 "=m" (*(1+(long *) (gate_addr))), "=&a" (__d0), "=&d" (__d1) \ 991 "=m" (*(1+(long *) (gate_addr))), "=&a" (__d0), "=&d" (__d1) \
992 :"i" ((short) (0x8000+(dpl<<13)+(type<<8))), \ 992 :"i" ((short) (0x8000+(dpl<<13)+(type<<8))), \
993 "3" ((char *) (addr)),"2" ((seg) << 16)); \ 993 "3" ((char *) (addr)),"2" ((seg) << 16)); \
994 } while (0) 994 } while (0)
995 995
996 996
997 /* 997 /*
998 * This needs to use 'idt_table' rather than 'idt', and 998 * This needs to use 'idt_table' rather than 'idt', and
999 * thus use the _nonmapped_ version of the IDT, as the 999 * thus use the _nonmapped_ version of the IDT, as the
1000 * Pentium F0 0F bugfix may have resulted in the mapped 1000 * Pentium F0 0F bugfix may have resulted in the mapped
1001 * IDT being write-protected. 1001 * IDT being write-protected.
1002 */ 1002 */
1003 void set_intr_gate(unsigned int n, void *addr) 1003 void set_intr_gate(unsigned int n, void *addr)
1004 { 1004 {
1005 _set_gate(idt_table+n,14,0,addr,__KERNEL_CS); 1005 _set_gate(idt_table+n,14,0,addr,__KERNEL_CS);
1006 } 1006 }
1007 1007
1008 /* 1008 /*
1009 * This routine sets up an interrupt gate at descriptor privilege level 3. 1009 * This routine sets up an interrupt gate at descriptor privilege level 3.
1010 */ 1010 */
1011 static inline void set_system_intr_gate(unsigned int n, void *addr) 1011 static inline void set_system_intr_gate(unsigned int n, void *addr)
1012 { 1012 {
1013 _set_gate(idt_table+n, 14, 3, addr, __KERNEL_CS); 1013 _set_gate(idt_table+n, 14, 3, addr, __KERNEL_CS);
1014 } 1014 }
1015 1015
1016 static void __init set_trap_gate(unsigned int n, void *addr) 1016 static void __init set_trap_gate(unsigned int n, void *addr)
1017 { 1017 {
1018 _set_gate(idt_table+n,15,0,addr,__KERNEL_CS); 1018 _set_gate(idt_table+n,15,0,addr,__KERNEL_CS);
1019 } 1019 }
1020 1020
1021 static void __init set_system_gate(unsigned int n, void *addr) 1021 static void __init set_system_gate(unsigned int n, void *addr)
1022 { 1022 {
1023 _set_gate(idt_table+n,15,3,addr,__KERNEL_CS); 1023 _set_gate(idt_table+n,15,3,addr,__KERNEL_CS);
1024 } 1024 }
1025 1025
1026 static void __init set_task_gate(unsigned int n, unsigned int gdt_entry) 1026 static void __init set_task_gate(unsigned int n, unsigned int gdt_entry)
1027 { 1027 {
1028 _set_gate(idt_table+n,5,0,0,(gdt_entry<<3)); 1028 _set_gate(idt_table+n,5,0,0,(gdt_entry<<3));
1029 } 1029 }
1030 1030
1031 1031
1032 void __init trap_init(void) 1032 void __init trap_init(void)
1033 { 1033 {
1034 #ifdef CONFIG_EISA 1034 #ifdef CONFIG_EISA
1035 void __iomem *p = ioremap(0x0FFFD9, 4); 1035 void __iomem *p = ioremap(0x0FFFD9, 4);
1036 if (readl(p) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) { 1036 if (readl(p) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) {
1037 EISA_bus = 1; 1037 EISA_bus = 1;
1038 } 1038 }
1039 iounmap(p); 1039 iounmap(p);
1040 #endif 1040 #endif
1041 1041
1042 #ifdef CONFIG_X86_LOCAL_APIC 1042 #ifdef CONFIG_X86_LOCAL_APIC
1043 init_apic_mappings(); 1043 init_apic_mappings();
1044 #endif 1044 #endif
1045 1045
1046 set_trap_gate(0,&divide_error); 1046 set_trap_gate(0,&divide_error);
1047 set_intr_gate(1,&debug); 1047 set_intr_gate(1,&debug);
1048 set_intr_gate(2,&nmi); 1048 set_intr_gate(2,&nmi);
1049 set_system_intr_gate(3, &int3); /* int3-5 can be called from all */ 1049 set_system_intr_gate(3, &int3); /* int3-5 can be called from all */
1050 set_system_gate(4,&overflow); 1050 set_system_gate(4,&overflow);
1051 set_system_gate(5,&bounds); 1051 set_system_gate(5,&bounds);
1052 set_trap_gate(6,&invalid_op); 1052 set_trap_gate(6,&invalid_op);
1053 set_trap_gate(7,&device_not_available); 1053 set_trap_gate(7,&device_not_available);
1054 set_task_gate(8,GDT_ENTRY_DOUBLEFAULT_TSS); 1054 set_task_gate(8,GDT_ENTRY_DOUBLEFAULT_TSS);
1055 set_trap_gate(9,&coprocessor_segment_overrun); 1055 set_trap_gate(9,&coprocessor_segment_overrun);
1056 set_trap_gate(10,&invalid_TSS); 1056 set_trap_gate(10,&invalid_TSS);
1057 set_trap_gate(11,&segment_not_present); 1057 set_trap_gate(11,&segment_not_present);
1058 set_trap_gate(12,&stack_segment); 1058 set_trap_gate(12,&stack_segment);
1059 set_trap_gate(13,&general_protection); 1059 set_trap_gate(13,&general_protection);
1060 set_intr_gate(14,&page_fault); 1060 set_intr_gate(14,&page_fault);
1061 set_trap_gate(15,&spurious_interrupt_bug); 1061 set_trap_gate(15,&spurious_interrupt_bug);
1062 set_trap_gate(16,&coprocessor_error); 1062 set_trap_gate(16,&coprocessor_error);
1063 set_trap_gate(17,&alignment_check); 1063 set_trap_gate(17,&alignment_check);
1064 #ifdef CONFIG_X86_MCE 1064 #ifdef CONFIG_X86_MCE
1065 set_trap_gate(18,&machine_check); 1065 set_trap_gate(18,&machine_check);
1066 #endif 1066 #endif
1067 set_trap_gate(19,&simd_coprocessor_error); 1067 set_trap_gate(19,&simd_coprocessor_error);
1068 1068
1069 set_system_gate(SYSCALL_VECTOR,&system_call); 1069 set_system_gate(SYSCALL_VECTOR,&system_call);
1070 1070
1071 /* 1071 /*
1072 * Should be a barrier for any external CPU state. 1072 * Should be a barrier for any external CPU state.
1073 */ 1073 */
1074 cpu_init(); 1074 cpu_init();
1075 1075
1076 trap_init_hook(); 1076 trap_init_hook();
1077 } 1077 }
1078 1078
1079 static int __init kstack_setup(char *s) 1079 static int __init kstack_setup(char *s)
1080 { 1080 {
1081 kstack_depth_to_print = simple_strtoul(s, NULL, 0); 1081 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
1082 return 0; 1082 return 0;
1083 } 1083 }
1084 __setup("kstack=", kstack_setup); 1084 __setup("kstack=", kstack_setup);
1085 1085
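
A note on the two variants this patch leaves behind: fixup_x86_bogus_stack() above keeps the checked smp_processor_id(), which is only valid where the caller has already ruled out migration, while raw_smp_processor_id() skips the check for places where a momentarily stale CPU number is harmless. The sketch below is a userspace analogy only — sched_getcpu() and sched_setaffinity() stand in for the kernel primitives — of why an unchecked CPU-id read is only trustworthy once migration is ruled out.

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        int before = sched_getcpu();      /* analogue of raw_smp_processor_id() */
        usleep(200 * 1000);               /* the scheduler is free to migrate us here */
        int after = sched_getcpu();

        printf("cpu before sleep: %d, after: %d%s\n", before, after,
               before == after ? "" : "  (migrated - a cached cpu id went stale)");

        if (before >= 0) {
            /* analogue of the safe pattern: get_cpu()/put_cpu() disables preemption
             * in the kernel; pinning affinity is the closest userspace knob */
            cpu_set_t set;
            CPU_ZERO(&set);
            CPU_SET(before, &set);
            if (sched_setaffinity(0, sizeof(set), &set) == 0)
                printf("pinned to cpu %d; the id is now stable\n", sched_getcpu());
        }
        return 0;
    }
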
arch/i386/lib/delay.c
1 /* 1 /*
2 * Precise Delay Loops for i386 2 * Precise Delay Loops for i386
3 * 3 *
4 * Copyright (C) 1993 Linus Torvalds 4 * Copyright (C) 1993 Linus Torvalds
5 * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz> 5 * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
6 * 6 *
7 * The __delay function must _NOT_ be inlined as its execution time 7 * The __delay function must _NOT_ be inlined as its execution time
8 * depends wildly on alignment on many x86 processors. The additional 8 * depends wildly on alignment on many x86 processors. The additional
9 * jump magic is needed to get the timing stable on all the CPUs 9 * jump magic is needed to get the timing stable on all the CPUs
10 * we have to worry about. 10 * we have to worry about.
11 */ 11 */
12 12
13 #include <linux/config.h> 13 #include <linux/config.h>
14 #include <linux/sched.h> 14 #include <linux/sched.h>
15 #include <linux/delay.h> 15 #include <linux/delay.h>
16 #include <asm/processor.h> 16 #include <asm/processor.h>
17 #include <asm/delay.h> 17 #include <asm/delay.h>
18 #include <asm/timer.h> 18 #include <asm/timer.h>
19 19
20 #ifdef CONFIG_SMP 20 #ifdef CONFIG_SMP
21 #include <asm/smp.h> 21 #include <asm/smp.h>
22 #endif 22 #endif
23 23
24 extern struct timer_opts* timer; 24 extern struct timer_opts* timer;
25 25
26 void __delay(unsigned long loops) 26 void __delay(unsigned long loops)
27 { 27 {
28 cur_timer->delay(loops); 28 cur_timer->delay(loops);
29 } 29 }
30 30
31 inline void __const_udelay(unsigned long xloops) 31 inline void __const_udelay(unsigned long xloops)
32 { 32 {
33 int d0; 33 int d0;
34 xloops *= 4; 34 xloops *= 4;
35 __asm__("mull %0" 35 __asm__("mull %0"
36 :"=d" (xloops), "=&a" (d0) 36 :"=d" (xloops), "=&a" (d0)
37 :"1" (xloops),"0" (cpu_data[_smp_processor_id()].loops_per_jiffy * (HZ/4))); 37 :"1" (xloops),"0" (cpu_data[raw_smp_processor_id()].loops_per_jiffy * (HZ/4)));
38 __delay(++xloops); 38 __delay(++xloops);
39 } 39 }
40 40
41 void __udelay(unsigned long usecs) 41 void __udelay(unsigned long usecs)
42 { 42 {
43 __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ 43 __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
44 } 44 }
45 45
46 void __ndelay(unsigned long nsecs) 46 void __ndelay(unsigned long nsecs)
47 { 47 {
48 __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ 48 __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
49 } 49 }
50 50
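
The magic constant in __udelay() above is worth spelling out: 0x000010c7 is ceil(2^32 / 10^6), so multiplying by it and then keeping only the high 32 bits of the mull amounts to computing usecs * loops_per_jiffy * HZ / 10^6 without a division. A small userspace sketch of that fixed-point arithmetic (the loops_per_jiffy and HZ values below are made-up calibration figures; the kernel additionally splits the multiply into *4 and HZ/4 to stay within 32 bits):

    #include <stdint.h>
    #include <stdio.h>

    static unsigned long usecs_to_loops(unsigned long usecs,
                                        unsigned long loops_per_jiffy,
                                        unsigned long hz)
    {
        uint64_t xloops = (uint64_t)usecs * 0x000010c7;   /* ~ usecs * 2^32 / 1e6 */
        /* (xloops * lpj * HZ) >> 32  ==  usecs * lpj * HZ / 1e6, the loop count */
        return (unsigned long)((xloops * loops_per_jiffy * hz) >> 32);
    }

    int main(void)
    {
        /* hypothetical calibration: 4,000,000 loops/jiffy at HZ=1000
         * means 4e9 delay loops per second */
        unsigned long lpj = 4000000, hz = 1000;

        printf("0x10c7 = %lu, ceil(2^32/1e6) = %llu\n",
               0x10c7UL, ((1ULL << 32) + 999999) / 1000000);
        printf("udelay(10)  -> %lu loops (expect ~40000)\n",
               usecs_to_loops(10, lpj, hz));
        printf("udelay(100) -> %lu loops (expect ~400000)\n",
               usecs_to_loops(100, lpj, hz));
        return 0;
    }
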
arch/ppc/lib/locks.c
1 /* 1 /*
2 * Locks for smp ppc 2 * Locks for smp ppc
3 * 3 *
4 * Written by Cort Dougan (cort@cs.nmt.edu) 4 * Written by Cort Dougan (cort@cs.nmt.edu)
5 */ 5 */
6 6
7 #include <linux/config.h> 7 #include <linux/config.h>
8 #include <linux/sched.h> 8 #include <linux/sched.h>
9 #include <linux/spinlock.h> 9 #include <linux/spinlock.h>
10 #include <linux/module.h> 10 #include <linux/module.h>
11 #include <asm/ppc_asm.h> 11 #include <asm/ppc_asm.h>
12 #include <asm/smp.h> 12 #include <asm/smp.h>
13 13
14 #ifdef CONFIG_DEBUG_SPINLOCK 14 #ifdef CONFIG_DEBUG_SPINLOCK
15 15
16 #undef INIT_STUCK 16 #undef INIT_STUCK
17 #define INIT_STUCK 200000000 /*0xffffffff*/ 17 #define INIT_STUCK 200000000 /*0xffffffff*/
18 18
19 /* 19 /*
20 * Try to acquire a spinlock. 20 * Try to acquire a spinlock.
21 * Only does the stwcx. if the load returned 0 - the Programming 21 * Only does the stwcx. if the load returned 0 - the Programming
22 * Environments Manual suggests not doing unnecessary stwcx.'s 22 * Environments Manual suggests not doing unnecessary stwcx.'s
23 * since they may inhibit forward progress by other CPUs in getting 23 * since they may inhibit forward progress by other CPUs in getting
24 * a lock. 24 * a lock.
25 */ 25 */
26 static inline unsigned long __spin_trylock(volatile unsigned long *lock) 26 static inline unsigned long __spin_trylock(volatile unsigned long *lock)
27 { 27 {
28 unsigned long ret; 28 unsigned long ret;
29 29
30 __asm__ __volatile__ ("\n\ 30 __asm__ __volatile__ ("\n\
31 1: lwarx %0,0,%1\n\ 31 1: lwarx %0,0,%1\n\
32 cmpwi 0,%0,0\n\ 32 cmpwi 0,%0,0\n\
33 bne 2f\n" 33 bne 2f\n"
34 PPC405_ERR77(0,%1) 34 PPC405_ERR77(0,%1)
35 " stwcx. %2,0,%1\n\ 35 " stwcx. %2,0,%1\n\
36 bne- 1b\n\ 36 bne- 1b\n\
37 isync\n\ 37 isync\n\
38 2:" 38 2:"
39 : "=&r"(ret) 39 : "=&r"(ret)
40 : "r"(lock), "r"(1) 40 : "r"(lock), "r"(1)
41 : "cr0", "memory"); 41 : "cr0", "memory");
42 42
43 return ret; 43 return ret;
44 } 44 }
45 45
46 void _raw_spin_lock(spinlock_t *lock) 46 void _raw_spin_lock(spinlock_t *lock)
47 { 47 {
48 int cpu = smp_processor_id(); 48 int cpu = smp_processor_id();
49 unsigned int stuck = INIT_STUCK; 49 unsigned int stuck = INIT_STUCK;
50 while (__spin_trylock(&lock->lock)) { 50 while (__spin_trylock(&lock->lock)) {
51 while ((unsigned volatile long)lock->lock != 0) { 51 while ((unsigned volatile long)lock->lock != 0) {
52 if (!--stuck) { 52 if (!--stuck) {
53 printk("_spin_lock(%p) CPU#%d NIP %p" 53 printk("_spin_lock(%p) CPU#%d NIP %p"
54 " holder: cpu %ld pc %08lX\n", 54 " holder: cpu %ld pc %08lX\n",
55 lock, cpu, __builtin_return_address(0), 55 lock, cpu, __builtin_return_address(0),
56 lock->owner_cpu,lock->owner_pc); 56 lock->owner_cpu,lock->owner_pc);
57 stuck = INIT_STUCK; 57 stuck = INIT_STUCK;
58 /* steal the lock */ 58 /* steal the lock */
59 /*xchg_u32((void *)&lock->lock,0);*/ 59 /*xchg_u32((void *)&lock->lock,0);*/
60 } 60 }
61 } 61 }
62 } 62 }
63 lock->owner_pc = (unsigned long)__builtin_return_address(0); 63 lock->owner_pc = (unsigned long)__builtin_return_address(0);
64 lock->owner_cpu = cpu; 64 lock->owner_cpu = cpu;
65 } 65 }
66 EXPORT_SYMBOL(_raw_spin_lock); 66 EXPORT_SYMBOL(_raw_spin_lock);
67 67
68 int _raw_spin_trylock(spinlock_t *lock) 68 int _raw_spin_trylock(spinlock_t *lock)
69 { 69 {
70 if (__spin_trylock(&lock->lock)) 70 if (__spin_trylock(&lock->lock))
71 return 0; 71 return 0;
72 lock->owner_cpu = smp_processor_id(); 72 lock->owner_cpu = smp_processor_id();
73 lock->owner_pc = (unsigned long)__builtin_return_address(0); 73 lock->owner_pc = (unsigned long)__builtin_return_address(0);
74 return 1; 74 return 1;
75 } 75 }
76 EXPORT_SYMBOL(_raw_spin_trylock); 76 EXPORT_SYMBOL(_raw_spin_trylock);
77 77
78 void _raw_spin_unlock(spinlock_t *lp) 78 void _raw_spin_unlock(spinlock_t *lp)
79 { 79 {
80 if ( !lp->lock ) 80 if ( !lp->lock )
81 printk("_spin_unlock(%p): no lock cpu %d curr PC %p %s/%d\n", 81 printk("_spin_unlock(%p): no lock cpu %d curr PC %p %s/%d\n",
82 lp, smp_processor_id(), __builtin_return_address(0), 82 lp, smp_processor_id(), __builtin_return_address(0),
83 current->comm, current->pid); 83 current->comm, current->pid);
84 if ( lp->owner_cpu != smp_processor_id() ) 84 if ( lp->owner_cpu != smp_processor_id() )
85 printk("_spin_unlock(%p): cpu %d trying clear of cpu %d pc %lx val %lx\n", 85 printk("_spin_unlock(%p): cpu %d trying clear of cpu %d pc %lx val %lx\n",
86 lp, smp_processor_id(), (int)lp->owner_cpu, 86 lp, smp_processor_id(), (int)lp->owner_cpu,
87 lp->owner_pc,lp->lock); 87 lp->owner_pc,lp->lock);
88 lp->owner_pc = lp->owner_cpu = 0; 88 lp->owner_pc = lp->owner_cpu = 0;
89 wmb(); 89 wmb();
90 lp->lock = 0; 90 lp->lock = 0;
91 } 91 }
92 EXPORT_SYMBOL(_raw_spin_unlock); 92 EXPORT_SYMBOL(_raw_spin_unlock);
93 93
94 /* 94 /*
95 * For rwlocks, zero is unlocked, -1 is write-locked, 95 * For rwlocks, zero is unlocked, -1 is write-locked,
96 * positive is read-locked. 96 * positive is read-locked.
97 */ 97 */
98 static __inline__ int __read_trylock(rwlock_t *rw) 98 static __inline__ int __read_trylock(rwlock_t *rw)
99 { 99 {
100 signed int tmp; 100 signed int tmp;
101 101
102 __asm__ __volatile__( 102 __asm__ __volatile__(
103 "2: lwarx %0,0,%1 # __read_trylock\n\ 103 "2: lwarx %0,0,%1 # __read_trylock\n\
104 addic. %0,%0,1\n\ 104 addic. %0,%0,1\n\
105 ble- 1f\n" 105 ble- 1f\n"
106 PPC405_ERR77(0,%1) 106 PPC405_ERR77(0,%1)
107 " stwcx. %0,0,%1\n\ 107 " stwcx. %0,0,%1\n\
108 bne- 2b\n\ 108 bne- 2b\n\
109 isync\n\ 109 isync\n\
110 1:" 110 1:"
111 : "=&r"(tmp) 111 : "=&r"(tmp)
112 : "r"(&rw->lock) 112 : "r"(&rw->lock)
113 : "cr0", "memory"); 113 : "cr0", "memory");
114 114
115 return tmp; 115 return tmp;
116 } 116 }
117 117
118 int _raw_read_trylock(rwlock_t *rw) 118 int _raw_read_trylock(rwlock_t *rw)
119 { 119 {
120 return __read_trylock(rw) > 0; 120 return __read_trylock(rw) > 0;
121 } 121 }
122 EXPORT_SYMBOL(_raw_read_trylock); 122 EXPORT_SYMBOL(_raw_read_trylock);
123 123
124 void _raw_read_lock(rwlock_t *rw) 124 void _raw_read_lock(rwlock_t *rw)
125 { 125 {
126 unsigned int stuck; 126 unsigned int stuck;
127 127
128 while (__read_trylock(rw) <= 0) { 128 while (__read_trylock(rw) <= 0) {
129 stuck = INIT_STUCK; 129 stuck = INIT_STUCK;
130 while (!read_can_lock(rw)) { 130 while (!read_can_lock(rw)) {
131 if (--stuck == 0) { 131 if (--stuck == 0) {
132 printk("_read_lock(%p) CPU#%d lock %d\n", 132 printk("_read_lock(%p) CPU#%d lock %d\n",
133 rw, _smp_processor_id(), rw->lock); 133 rw, raw_smp_processor_id(), rw->lock);
134 stuck = INIT_STUCK; 134 stuck = INIT_STUCK;
135 } 135 }
136 } 136 }
137 } 137 }
138 } 138 }
139 EXPORT_SYMBOL(_raw_read_lock); 139 EXPORT_SYMBOL(_raw_read_lock);
140 140
141 void _raw_read_unlock(rwlock_t *rw) 141 void _raw_read_unlock(rwlock_t *rw)
142 { 142 {
143 if ( rw->lock == 0 ) 143 if ( rw->lock == 0 )
144 printk("_read_unlock(): %s/%d (nip %08lX) lock %d\n", 144 printk("_read_unlock(): %s/%d (nip %08lX) lock %d\n",
145 current->comm,current->pid,current->thread.regs->nip, 145 current->comm,current->pid,current->thread.regs->nip,
146 rw->lock); 146 rw->lock);
147 wmb(); 147 wmb();
148 atomic_dec((atomic_t *) &(rw)->lock); 148 atomic_dec((atomic_t *) &(rw)->lock);
149 } 149 }
150 EXPORT_SYMBOL(_raw_read_unlock); 150 EXPORT_SYMBOL(_raw_read_unlock);
151 151
152 void _raw_write_lock(rwlock_t *rw) 152 void _raw_write_lock(rwlock_t *rw)
153 { 153 {
154 unsigned int stuck; 154 unsigned int stuck;
155 155
156 while (cmpxchg(&rw->lock, 0, -1) != 0) { 156 while (cmpxchg(&rw->lock, 0, -1) != 0) {
157 stuck = INIT_STUCK; 157 stuck = INIT_STUCK;
158 while (!write_can_lock(rw)) { 158 while (!write_can_lock(rw)) {
159 if (--stuck == 0) { 159 if (--stuck == 0) {
160 printk("write_lock(%p) CPU#%d lock %d)\n", 160 printk("write_lock(%p) CPU#%d lock %d)\n",
161 rw, _smp_processor_id(), rw->lock); 161 rw, raw_smp_processor_id(), rw->lock);
162 stuck = INIT_STUCK; 162 stuck = INIT_STUCK;
163 } 163 }
164 } 164 }
165 } 165 }
166 wmb(); 166 wmb();
167 } 167 }
168 EXPORT_SYMBOL(_raw_write_lock); 168 EXPORT_SYMBOL(_raw_write_lock);
169 169
170 int _raw_write_trylock(rwlock_t *rw) 170 int _raw_write_trylock(rwlock_t *rw)
171 { 171 {
172 if (cmpxchg(&rw->lock, 0, -1) != 0) 172 if (cmpxchg(&rw->lock, 0, -1) != 0)
173 return 0; 173 return 0;
174 wmb(); 174 wmb();
175 return 1; 175 return 1;
176 } 176 }
177 EXPORT_SYMBOL(_raw_write_trylock); 177 EXPORT_SYMBOL(_raw_write_trylock);
178 178
179 void _raw_write_unlock(rwlock_t *rw) 179 void _raw_write_unlock(rwlock_t *rw)
180 { 180 {
181 if (rw->lock >= 0) 181 if (rw->lock >= 0)
182 printk("_write_lock(): %s/%d (nip %08lX) lock %d\n", 182 printk("_write_lock(): %s/%d (nip %08lX) lock %d\n",
183 current->comm,current->pid,current->thread.regs->nip, 183 current->comm,current->pid,current->thread.regs->nip,
184 rw->lock); 184 rw->lock);
185 wmb(); 185 wmb();
186 rw->lock = 0; 186 rw->lock = 0;
187 } 187 }
188 EXPORT_SYMBOL(_raw_write_unlock); 188 EXPORT_SYMBOL(_raw_write_unlock);
189 189
190 #endif 190 #endif
191 191
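
The comment above _raw_read_trylock() documents the rwlock encoding: 0 means unlocked, -1 write-locked, and a positive value counts readers. A minimal userspace sketch of that encoding with C11 atomics (illustrative only; the kernel implementation above uses lwarx/stwcx. and cmpxchg(), not <stdatomic.h>):

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int lock;              /* starts at 0 = unlocked */

    static int read_trylock(void)
    {
        int old = atomic_load(&lock);
        /* mirrors __read_trylock(): only succeed if incrementing keeps the value > 0 */
        while (old >= 0) {
            if (atomic_compare_exchange_weak(&lock, &old, old + 1))
                return 1;                /* got a read hold */
        }
        return 0;                        /* write-locked (-1), give up */
    }

    static void read_unlock(void)
    {
        atomic_fetch_sub(&lock, 1);      /* mirrors the atomic_dec() above */
    }

    static int write_trylock(void)
    {
        int expected = 0;
        /* mirrors cmpxchg(&rw->lock, 0, -1) in _raw_write_trylock() */
        return atomic_compare_exchange_strong(&lock, &expected, -1);
    }

    int main(void)
    {
        printf("read_trylock: %d (lock=%d)\n", read_trylock(), atomic_load(&lock));
        printf("write_trylock while read-held: %d\n", write_trylock());
        read_unlock();
        printf("write_trylock after unlock: %d (lock=%d)\n",
               write_trylock(), atomic_load(&lock));
        return 0;
    }
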
arch/ppc64/kernel/idle.c
1 /* 1 /*
2 * Idle daemon for PowerPC. Idle daemon will handle any action 2 * Idle daemon for PowerPC. Idle daemon will handle any action
3 * that needs to be taken when the system becomes idle. 3 * that needs to be taken when the system becomes idle.
4 * 4 *
5 * Originally Written by Cort Dougan (cort@cs.nmt.edu) 5 * Originally Written by Cort Dougan (cort@cs.nmt.edu)
6 * 6 *
7 * iSeries support added by Mike Corrigan <mikejc@us.ibm.com> 7 * iSeries support added by Mike Corrigan <mikejc@us.ibm.com>
8 * 8 *
9 * Additional shared processor, SMT, and firmware support 9 * Additional shared processor, SMT, and firmware support
10 * Copyright (c) 2003 Dave Engebretsen <engebret@us.ibm.com> 10 * Copyright (c) 2003 Dave Engebretsen <engebret@us.ibm.com>
11 * 11 *
12 * This program is free software; you can redistribute it and/or 12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License 13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version. 15 * 2 of the License, or (at your option) any later version.
16 */ 16 */
17 17
18 #include <linux/config.h> 18 #include <linux/config.h>
19 #include <linux/sched.h> 19 #include <linux/sched.h>
20 #include <linux/kernel.h> 20 #include <linux/kernel.h>
21 #include <linux/smp.h> 21 #include <linux/smp.h>
22 #include <linux/cpu.h> 22 #include <linux/cpu.h>
23 #include <linux/module.h> 23 #include <linux/module.h>
24 #include <linux/sysctl.h> 24 #include <linux/sysctl.h>
25 #include <linux/smp.h> 25 #include <linux/smp.h>
26 26
27 #include <asm/system.h> 27 #include <asm/system.h>
28 #include <asm/processor.h> 28 #include <asm/processor.h>
29 #include <asm/mmu.h> 29 #include <asm/mmu.h>
30 #include <asm/cputable.h> 30 #include <asm/cputable.h>
31 #include <asm/time.h> 31 #include <asm/time.h>
32 #include <asm/iSeries/HvCall.h> 32 #include <asm/iSeries/HvCall.h>
33 #include <asm/iSeries/ItLpQueue.h> 33 #include <asm/iSeries/ItLpQueue.h>
34 #include <asm/plpar_wrappers.h> 34 #include <asm/plpar_wrappers.h>
35 #include <asm/systemcfg.h> 35 #include <asm/systemcfg.h>
36 36
37 extern void power4_idle(void); 37 extern void power4_idle(void);
38 38
39 static int (*idle_loop)(void); 39 static int (*idle_loop)(void);
40 40
41 #ifdef CONFIG_PPC_ISERIES 41 #ifdef CONFIG_PPC_ISERIES
42 static unsigned long maxYieldTime = 0; 42 static unsigned long maxYieldTime = 0;
43 static unsigned long minYieldTime = 0xffffffffffffffffUL; 43 static unsigned long minYieldTime = 0xffffffffffffffffUL;
44 44
45 static void yield_shared_processor(void) 45 static void yield_shared_processor(void)
46 { 46 {
47 unsigned long tb; 47 unsigned long tb;
48 unsigned long yieldTime; 48 unsigned long yieldTime;
49 49
50 HvCall_setEnabledInterrupts(HvCall_MaskIPI | 50 HvCall_setEnabledInterrupts(HvCall_MaskIPI |
51 HvCall_MaskLpEvent | 51 HvCall_MaskLpEvent |
52 HvCall_MaskLpProd | 52 HvCall_MaskLpProd |
53 HvCall_MaskTimeout); 53 HvCall_MaskTimeout);
54 54
55 tb = get_tb(); 55 tb = get_tb();
56 /* Compute future tb value when yield should expire */ 56 /* Compute future tb value when yield should expire */
57 HvCall_yieldProcessor(HvCall_YieldTimed, tb+tb_ticks_per_jiffy); 57 HvCall_yieldProcessor(HvCall_YieldTimed, tb+tb_ticks_per_jiffy);
58 58
59 yieldTime = get_tb() - tb; 59 yieldTime = get_tb() - tb;
60 if (yieldTime > maxYieldTime) 60 if (yieldTime > maxYieldTime)
61 maxYieldTime = yieldTime; 61 maxYieldTime = yieldTime;
62 62
63 if (yieldTime < minYieldTime) 63 if (yieldTime < minYieldTime)
64 minYieldTime = yieldTime; 64 minYieldTime = yieldTime;
65 65
66 /* 66 /*
67 * The decrementer stops during the yield. Force a fake decrementer 67 * The decrementer stops during the yield. Force a fake decrementer
68 * here and let the timer_interrupt code sort out the actual time. 68 * here and let the timer_interrupt code sort out the actual time.
69 */ 69 */
70 get_paca()->lppaca.int_dword.fields.decr_int = 1; 70 get_paca()->lppaca.int_dword.fields.decr_int = 1;
71 process_iSeries_events(); 71 process_iSeries_events();
72 } 72 }
73 73
74 static int iSeries_idle(void) 74 static int iSeries_idle(void)
75 { 75 {
76 struct paca_struct *lpaca; 76 struct paca_struct *lpaca;
77 long oldval; 77 long oldval;
78 78
79 /* ensure iSeries run light will be out when idle */ 79 /* ensure iSeries run light will be out when idle */
80 ppc64_runlatch_off(); 80 ppc64_runlatch_off();
81 81
82 lpaca = get_paca(); 82 lpaca = get_paca();
83 83
84 while (1) { 84 while (1) {
85 if (lpaca->lppaca.shared_proc) { 85 if (lpaca->lppaca.shared_proc) {
86 if (ItLpQueue_isLpIntPending(lpaca->lpqueue_ptr)) 86 if (ItLpQueue_isLpIntPending(lpaca->lpqueue_ptr))
87 process_iSeries_events(); 87 process_iSeries_events();
88 if (!need_resched()) 88 if (!need_resched())
89 yield_shared_processor(); 89 yield_shared_processor();
90 } else { 90 } else {
91 oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED); 91 oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
92 92
93 if (!oldval) { 93 if (!oldval) {
94 set_thread_flag(TIF_POLLING_NRFLAG); 94 set_thread_flag(TIF_POLLING_NRFLAG);
95 95
96 while (!need_resched()) { 96 while (!need_resched()) {
97 HMT_medium(); 97 HMT_medium();
98 if (ItLpQueue_isLpIntPending(lpaca->lpqueue_ptr)) 98 if (ItLpQueue_isLpIntPending(lpaca->lpqueue_ptr))
99 process_iSeries_events(); 99 process_iSeries_events();
100 HMT_low(); 100 HMT_low();
101 } 101 }
102 102
103 HMT_medium(); 103 HMT_medium();
104 clear_thread_flag(TIF_POLLING_NRFLAG); 104 clear_thread_flag(TIF_POLLING_NRFLAG);
105 } else { 105 } else {
106 set_need_resched(); 106 set_need_resched();
107 } 107 }
108 } 108 }
109 109
110 ppc64_runlatch_on(); 110 ppc64_runlatch_on();
111 schedule(); 111 schedule();
112 ppc64_runlatch_off(); 112 ppc64_runlatch_off();
113 } 113 }
114 114
115 return 0; 115 return 0;
116 } 116 }
117 117
118 #else 118 #else
119 119
120 static int default_idle(void) 120 static int default_idle(void)
121 { 121 {
122 long oldval; 122 long oldval;
123 unsigned int cpu = smp_processor_id(); 123 unsigned int cpu = smp_processor_id();
124 124
125 while (1) { 125 while (1) {
126 oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED); 126 oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
127 127
128 if (!oldval) { 128 if (!oldval) {
129 set_thread_flag(TIF_POLLING_NRFLAG); 129 set_thread_flag(TIF_POLLING_NRFLAG);
130 130
131 while (!need_resched() && !cpu_is_offline(cpu)) { 131 while (!need_resched() && !cpu_is_offline(cpu)) {
132 barrier(); 132 barrier();
133 /* 133 /*
134 * Go into low thread priority and possibly 134 * Go into low thread priority and possibly
135 * low power mode. 135 * low power mode.
136 */ 136 */
137 HMT_low(); 137 HMT_low();
138 HMT_very_low(); 138 HMT_very_low();
139 } 139 }
140 140
141 HMT_medium(); 141 HMT_medium();
142 clear_thread_flag(TIF_POLLING_NRFLAG); 142 clear_thread_flag(TIF_POLLING_NRFLAG);
143 } else { 143 } else {
144 set_need_resched(); 144 set_need_resched();
145 } 145 }
146 146
147 schedule(); 147 schedule();
148 if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING) 148 if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING)
149 cpu_die(); 149 cpu_die();
150 } 150 }
151 151
152 return 0; 152 return 0;
153 } 153 }
154 154
155 #ifdef CONFIG_PPC_PSERIES 155 #ifdef CONFIG_PPC_PSERIES
156 156
157 DECLARE_PER_CPU(unsigned long, smt_snooze_delay); 157 DECLARE_PER_CPU(unsigned long, smt_snooze_delay);
158 158
159 int dedicated_idle(void) 159 int dedicated_idle(void)
160 { 160 {
161 long oldval; 161 long oldval;
162 struct paca_struct *lpaca = get_paca(), *ppaca; 162 struct paca_struct *lpaca = get_paca(), *ppaca;
163 unsigned long start_snooze; 163 unsigned long start_snooze;
164 unsigned long *smt_snooze_delay = &__get_cpu_var(smt_snooze_delay); 164 unsigned long *smt_snooze_delay = &__get_cpu_var(smt_snooze_delay);
165 unsigned int cpu = smp_processor_id(); 165 unsigned int cpu = smp_processor_id();
166 166
167 ppaca = &paca[cpu ^ 1]; 167 ppaca = &paca[cpu ^ 1];
168 168
169 while (1) { 169 while (1) {
170 /* 170 /*
171 * Indicate to the HV that we are idle. Now would be 171 * Indicate to the HV that we are idle. Now would be
172 * a good time to find other work to dispatch. 172 * a good time to find other work to dispatch.
173 */ 173 */
174 lpaca->lppaca.idle = 1; 174 lpaca->lppaca.idle = 1;
175 175
176 oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED); 176 oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
177 if (!oldval) { 177 if (!oldval) {
178 set_thread_flag(TIF_POLLING_NRFLAG); 178 set_thread_flag(TIF_POLLING_NRFLAG);
179 start_snooze = __get_tb() + 179 start_snooze = __get_tb() +
180 *smt_snooze_delay * tb_ticks_per_usec; 180 *smt_snooze_delay * tb_ticks_per_usec;
181 while (!need_resched() && !cpu_is_offline(cpu)) { 181 while (!need_resched() && !cpu_is_offline(cpu)) {
182 /* 182 /*
183 * Go into low thread priority and possibly 183 * Go into low thread priority and possibly
184 * low power mode. 184 * low power mode.
185 */ 185 */
186 HMT_low(); 186 HMT_low();
187 HMT_very_low(); 187 HMT_very_low();
188 188
189 if (*smt_snooze_delay == 0 || 189 if (*smt_snooze_delay == 0 ||
190 __get_tb() < start_snooze) 190 __get_tb() < start_snooze)
191 continue; 191 continue;
192 192
193 HMT_medium(); 193 HMT_medium();
194 194
195 if (!(ppaca->lppaca.idle)) { 195 if (!(ppaca->lppaca.idle)) {
196 local_irq_disable(); 196 local_irq_disable();
197 197
198 /* 198 /*
199 * We are about to sleep the thread 199 * We are about to sleep the thread
200 * and so won't be polling any 200 * and so won't be polling any
201 * more. 201 * more.
202 */ 202 */
203 clear_thread_flag(TIF_POLLING_NRFLAG); 203 clear_thread_flag(TIF_POLLING_NRFLAG);
204 204
205 /* 205 /*
206 * SMT dynamic mode. Cede will result 206 * SMT dynamic mode. Cede will result
207 * in this thread going dormant, if the 207 * in this thread going dormant, if the
208 * partner thread is still doing work. 208 * partner thread is still doing work.
209 * Thread wakes up if partner goes idle, 209 * Thread wakes up if partner goes idle,
210 * an interrupt is presented, or a prod 210 * an interrupt is presented, or a prod
211 * occurs. Returning from the cede 211 * occurs. Returning from the cede
212 * enables external interrupts. 212 * enables external interrupts.
213 */ 213 */
214 if (!need_resched()) 214 if (!need_resched())
215 cede_processor(); 215 cede_processor();
216 else 216 else
217 local_irq_enable(); 217 local_irq_enable();
218 } else { 218 } else {
219 /* 219 /*
220 * Give the HV an opportunity at the 220 * Give the HV an opportunity at the
221 * processor, since we are not doing 221 * processor, since we are not doing
222 * any work. 222 * any work.
223 */ 223 */
224 poll_pending(); 224 poll_pending();
225 } 225 }
226 } 226 }
227 227
228 clear_thread_flag(TIF_POLLING_NRFLAG); 228 clear_thread_flag(TIF_POLLING_NRFLAG);
229 } else { 229 } else {
230 set_need_resched(); 230 set_need_resched();
231 } 231 }
232 232
233 HMT_medium(); 233 HMT_medium();
234 lpaca->lppaca.idle = 0; 234 lpaca->lppaca.idle = 0;
235 schedule(); 235 schedule();
236 if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING) 236 if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING)
237 cpu_die(); 237 cpu_die();
238 } 238 }
239 return 0; 239 return 0;
240 } 240 }
241 241
242 static int shared_idle(void) 242 static int shared_idle(void)
243 { 243 {
244 struct paca_struct *lpaca = get_paca(); 244 struct paca_struct *lpaca = get_paca();
245 unsigned int cpu = smp_processor_id(); 245 unsigned int cpu = smp_processor_id();
246 246
247 while (1) { 247 while (1) {
248 /* 248 /*
249 * Indicate to the HV that we are idle. Now would be 249 * Indicate to the HV that we are idle. Now would be
250 * a good time to find other work to dispatch. 250 * a good time to find other work to dispatch.
251 */ 251 */
252 lpaca->lppaca.idle = 1; 252 lpaca->lppaca.idle = 1;
253 253
254 while (!need_resched() && !cpu_is_offline(cpu)) { 254 while (!need_resched() && !cpu_is_offline(cpu)) {
255 local_irq_disable(); 255 local_irq_disable();
256 256
257 /* 257 /*
258 * Yield the processor to the hypervisor. We return if 258 * Yield the processor to the hypervisor. We return if
259 * an external interrupt occurs (which are driven prior 259 * an external interrupt occurs (which are driven prior
260 * to returning here) or if a prod occurs from another 260 * to returning here) or if a prod occurs from another
261 * processor. When returning here, external interrupts 261 * processor. When returning here, external interrupts
262 * are enabled. 262 * are enabled.
263 * 263 *
264 * Check need_resched() again with interrupts disabled 264 * Check need_resched() again with interrupts disabled
265 * to avoid a race. 265 * to avoid a race.
266 */ 266 */
267 if (!need_resched()) 267 if (!need_resched())
268 cede_processor(); 268 cede_processor();
269 else 269 else
270 local_irq_enable(); 270 local_irq_enable();
271 } 271 }
272 272
273 HMT_medium(); 273 HMT_medium();
274 lpaca->lppaca.idle = 0; 274 lpaca->lppaca.idle = 0;
275 schedule(); 275 schedule();
276 if (cpu_is_offline(smp_processor_id()) && 276 if (cpu_is_offline(smp_processor_id()) &&
277 system_state == SYSTEM_RUNNING) 277 system_state == SYSTEM_RUNNING)
278 cpu_die(); 278 cpu_die();
279 } 279 }
280 280
281 return 0; 281 return 0;
282 } 282 }
283 283
284 #endif /* CONFIG_PPC_PSERIES */ 284 #endif /* CONFIG_PPC_PSERIES */
285 285
286 static int native_idle(void) 286 static int native_idle(void)
287 { 287 {
288 while(1) { 288 while(1) {
289 /* check CPU type here */ 289 /* check CPU type here */
290 if (!need_resched()) 290 if (!need_resched())
291 power4_idle(); 291 power4_idle();
292 if (need_resched()) 292 if (need_resched())
293 schedule(); 293 schedule();
294 294
295 if (cpu_is_offline(_smp_processor_id()) && 295 if (cpu_is_offline(raw_smp_processor_id()) &&
296 system_state == SYSTEM_RUNNING) 296 system_state == SYSTEM_RUNNING)
297 cpu_die(); 297 cpu_die();
298 } 298 }
299 return 0; 299 return 0;
300 } 300 }
301 301
302 #endif /* CONFIG_PPC_ISERIES */ 302 #endif /* CONFIG_PPC_ISERIES */
303 303
304 void cpu_idle(void) 304 void cpu_idle(void)
305 { 305 {
306 idle_loop(); 306 idle_loop();
307 } 307 }
308 308
309 int powersave_nap; 309 int powersave_nap;
310 310
311 #ifdef CONFIG_SYSCTL 311 #ifdef CONFIG_SYSCTL
312 /* 312 /*
313 * Register the sysctl to set/clear powersave_nap. 313 * Register the sysctl to set/clear powersave_nap.
314 */ 314 */
315 static ctl_table powersave_nap_ctl_table[]={ 315 static ctl_table powersave_nap_ctl_table[]={
316 { 316 {
317 .ctl_name = KERN_PPC_POWERSAVE_NAP, 317 .ctl_name = KERN_PPC_POWERSAVE_NAP,
318 .procname = "powersave-nap", 318 .procname = "powersave-nap",
319 .data = &powersave_nap, 319 .data = &powersave_nap,
320 .maxlen = sizeof(int), 320 .maxlen = sizeof(int),
321 .mode = 0644, 321 .mode = 0644,
322 .proc_handler = &proc_dointvec, 322 .proc_handler = &proc_dointvec,
323 }, 323 },
324 { 0, }, 324 { 0, },
325 }; 325 };
326 static ctl_table powersave_nap_sysctl_root[] = { 326 static ctl_table powersave_nap_sysctl_root[] = {
327 { 1, "kernel", NULL, 0, 0755, powersave_nap_ctl_table, }, 327 { 1, "kernel", NULL, 0, 0755, powersave_nap_ctl_table, },
328 { 0,}, 328 { 0,},
329 }; 329 };
330 330
331 static int __init 331 static int __init
332 register_powersave_nap_sysctl(void) 332 register_powersave_nap_sysctl(void)
333 { 333 {
334 register_sysctl_table(powersave_nap_sysctl_root, 0); 334 register_sysctl_table(powersave_nap_sysctl_root, 0);
335 335
336 return 0; 336 return 0;
337 } 337 }
338 __initcall(register_powersave_nap_sysctl); 338 __initcall(register_powersave_nap_sysctl);
339 #endif 339 #endif
340 340
341 int idle_setup(void) 341 int idle_setup(void)
342 { 342 {
343 /* 343 /*
344 * Move that junk to each platform specific file, eventually define 344 * Move that junk to each platform specific file, eventually define
345 * a pSeries_idle for shared processor stuff 345 * a pSeries_idle for shared processor stuff
346 */ 346 */
347 #ifdef CONFIG_PPC_ISERIES 347 #ifdef CONFIG_PPC_ISERIES
348 idle_loop = iSeries_idle; 348 idle_loop = iSeries_idle;
349 return 1; 349 return 1;
350 #else 350 #else
351 idle_loop = default_idle; 351 idle_loop = default_idle;
352 #endif 352 #endif
353 #ifdef CONFIG_PPC_PSERIES 353 #ifdef CONFIG_PPC_PSERIES
354 if (systemcfg->platform & PLATFORM_PSERIES) { 354 if (systemcfg->platform & PLATFORM_PSERIES) {
355 if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) { 355 if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) {
356 if (get_paca()->lppaca.shared_proc) { 356 if (get_paca()->lppaca.shared_proc) {
357 printk(KERN_INFO "Using shared processor idle loop\n"); 357 printk(KERN_INFO "Using shared processor idle loop\n");
358 idle_loop = shared_idle; 358 idle_loop = shared_idle;
359 } else { 359 } else {
360 printk(KERN_INFO "Using dedicated idle loop\n"); 360 printk(KERN_INFO "Using dedicated idle loop\n");
361 idle_loop = dedicated_idle; 361 idle_loop = dedicated_idle;
362 } 362 }
363 } else { 363 } else {
364 printk(KERN_INFO "Using default idle loop\n"); 364 printk(KERN_INFO "Using default idle loop\n");
365 idle_loop = default_idle; 365 idle_loop = default_idle;
366 } 366 }
367 } 367 }
368 #endif /* CONFIG_PPC_PSERIES */ 368 #endif /* CONFIG_PPC_PSERIES */
369 #ifndef CONFIG_PPC_ISERIES 369 #ifndef CONFIG_PPC_ISERIES
370 if (systemcfg->platform == PLATFORM_POWERMAC || 370 if (systemcfg->platform == PLATFORM_POWERMAC ||
371 systemcfg->platform == PLATFORM_MAPLE) { 371 systemcfg->platform == PLATFORM_MAPLE) {
372 printk(KERN_INFO "Using native/NAP idle loop\n"); 372 printk(KERN_INFO "Using native/NAP idle loop\n");
373 idle_loop = native_idle; 373 idle_loop = native_idle;
374 } 374 }
375 #endif /* CONFIG_PPC_ISERIES */ 375 #endif /* CONFIG_PPC_ISERIES */
376 376
377 return 1; 377 return 1;
378 } 378 }
379 379
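
The idle loops above all follow the same polling handshake: atomically consume TIF_NEED_RESCHED, and if it was clear, advertise TIF_POLLING_NRFLAG and spin at low thread priority until the flag is set again, so a remote CPU can simply set the flag instead of sending a reschedule interrupt. A userspace sketch of that handshake, using C11 atomics and a pthread in place of another CPU (illustrative only):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <unistd.h>

    static atomic_int need_resched;   /* stand-in for TIF_NEED_RESCHED */
    static atomic_int polling;        /* stand-in for TIF_POLLING_NRFLAG */

    static void *waker(void *arg)
    {
        (void)arg;
        usleep(100 * 1000);               /* some task becomes runnable... */
        atomic_store(&need_resched, 1);   /* ...and the poller notices without an IPI */
        return NULL;
    }

    int main(void)
    {
        pthread_t t;
        pthread_create(&t, NULL, waker, NULL);

        /* test_and_clear_thread_flag(TIF_NEED_RESCHED) */
        int oldval = atomic_exchange(&need_resched, 0);
        if (!oldval) {
            atomic_store(&polling, 1);    /* set_thread_flag(TIF_POLLING_NRFLAG) */
            while (!atomic_load(&need_resched))
                ;                         /* HMT_low()/HMT_very_low() would go here */
            atomic_store(&polling, 0);    /* clear_thread_flag(TIF_POLLING_NRFLAG) */
        }
        printf("need_resched seen, would call schedule() now\n");

        pthread_join(t, NULL);
        return 0;
    }
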
1 /* 1 /*
2 * Precise Delay Loops for SuperH 2 * Precise Delay Loops for SuperH
3 * 3 *
4 * Copyright (C) 1999 Niibe Yutaka & Kaz Kojima 4 * Copyright (C) 1999 Niibe Yutaka & Kaz Kojima
5 */ 5 */
6 6
7 #include <linux/sched.h> 7 #include <linux/sched.h>
8 #include <linux/delay.h> 8 #include <linux/delay.h>
9 9
10 void __delay(unsigned long loops) 10 void __delay(unsigned long loops)
11 { 11 {
12 __asm__ __volatile__( 12 __asm__ __volatile__(
13 "tst %0, %0\n\t" 13 "tst %0, %0\n\t"
14 "1:\t" 14 "1:\t"
15 "bf/s 1b\n\t" 15 "bf/s 1b\n\t"
16 " dt %0" 16 " dt %0"
17 : "=r" (loops) 17 : "=r" (loops)
18 : "0" (loops) 18 : "0" (loops)
19 : "t"); 19 : "t");
20 } 20 }
21 21
22 inline void __const_udelay(unsigned long xloops) 22 inline void __const_udelay(unsigned long xloops)
23 { 23 {
24 __asm__("dmulu.l %0, %2\n\t" 24 __asm__("dmulu.l %0, %2\n\t"
25 "sts mach, %0" 25 "sts mach, %0"
26 : "=r" (xloops) 26 : "=r" (xloops)
27 : "0" (xloops), "r" (cpu_data[_smp_processor_id()].loops_per_jiffy) 27 : "0" (xloops), "r" (cpu_data[raw_smp_processor_id()].loops_per_jiffy)
28 : "macl", "mach"); 28 : "macl", "mach");
29 __delay(xloops * HZ); 29 __delay(xloops * HZ);
30 } 30 }
31 31
32 void __udelay(unsigned long usecs) 32 void __udelay(unsigned long usecs)
33 { 33 {
34 __const_udelay(usecs * 0x000010c6); /* 2**32 / 1000000 */ 34 __const_udelay(usecs * 0x000010c6); /* 2**32 / 1000000 */
35 } 35 }
36 36
37 void __ndelay(unsigned long nsecs) 37 void __ndelay(unsigned long nsecs)
38 { 38 {
39 __const_udelay(nsecs * 0x00000005); 39 __const_udelay(nsecs * 0x00000005);
40 } 40 }
41 41
42 42
arch/sparc64/lib/delay.c
1 /* delay.c: Delay loops for sparc64 1 /* delay.c: Delay loops for sparc64
2 * 2 *
3 * Copyright (C) 2004 David S. Miller <davem@redhat.com> 3 * Copyright (C) 2004 David S. Miller <davem@redhat.com>
4 * 4 *
5 * Based heavily upon x86 variant which is: 5 * Based heavily upon x86 variant which is:
6 * Copyright (C) 1993 Linus Torvalds 6 * Copyright (C) 1993 Linus Torvalds
7 * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz> 7 * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
8 */ 8 */
9 9
10 #include <linux/delay.h> 10 #include <linux/delay.h>
11 11
12 void __delay(unsigned long loops) 12 void __delay(unsigned long loops)
13 { 13 {
14 __asm__ __volatile__( 14 __asm__ __volatile__(
15 " b,pt %%xcc, 1f\n" 15 " b,pt %%xcc, 1f\n"
16 " cmp %0, 0\n" 16 " cmp %0, 0\n"
17 " .align 32\n" 17 " .align 32\n"
18 "1:\n" 18 "1:\n"
19 " bne,pt %%xcc, 1b\n" 19 " bne,pt %%xcc, 1b\n"
20 " subcc %0, 1, %0\n" 20 " subcc %0, 1, %0\n"
21 : "=&r" (loops) 21 : "=&r" (loops)
22 : "0" (loops) 22 : "0" (loops)
23 : "cc"); 23 : "cc");
24 } 24 }
25 25
26 /* We used to multiply by HZ after shifting down by 32 bits 26 /* We used to multiply by HZ after shifting down by 32 bits
27 * but that runs into problems for higher values of HZ and 27 * but that runs into problems for higher values of HZ and
28 * slow cpus. 28 * slow cpus.
29 */ 29 */
30 void __const_udelay(unsigned long n) 30 void __const_udelay(unsigned long n)
31 { 31 {
32 n *= 4; 32 n *= 4;
33 33
34 n *= (cpu_data(_smp_processor_id()).udelay_val * (HZ/4)); 34 n *= (cpu_data(raw_smp_processor_id()).udelay_val * (HZ/4));
35 n >>= 32; 35 n >>= 32;
36 36
37 __delay(n + 1); 37 __delay(n + 1);
38 } 38 }
39 39
40 void __udelay(unsigned long n) 40 void __udelay(unsigned long n)
41 { 41 {
42 __const_udelay(n * 0x10c7UL); 42 __const_udelay(n * 0x10c7UL);
43 } 43 }
44 44
45 45
46 void __ndelay(unsigned long n) 46 void __ndelay(unsigned long n)
47 { 47 {
48 __const_udelay(n * 0x5UL); 48 __const_udelay(n * 0x5UL);
49 } 49 }
50 50
arch/x86_64/lib/delay.c
1 /* 1 /*
2 * Precise Delay Loops for x86-64 2 * Precise Delay Loops for x86-64
3 * 3 *
4 * Copyright (C) 1993 Linus Torvalds 4 * Copyright (C) 1993 Linus Torvalds
5 * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz> 5 * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
6 * 6 *
7 * The __delay function must _NOT_ be inlined as its execution time 7 * The __delay function must _NOT_ be inlined as its execution time
8 * depends wildly on alignment on many x86 processors. 8 * depends wildly on alignment on many x86 processors.
9 */ 9 */
10 10
11 #include <linux/config.h> 11 #include <linux/config.h>
12 #include <linux/sched.h> 12 #include <linux/sched.h>
13 #include <linux/delay.h> 13 #include <linux/delay.h>
14 #include <asm/delay.h> 14 #include <asm/delay.h>
15 15
16 #ifdef CONFIG_SMP 16 #ifdef CONFIG_SMP
17 #include <asm/smp.h> 17 #include <asm/smp.h>
18 #endif 18 #endif
19 19
20 int x86_udelay_tsc = 0; /* Delay via TSC */ 20 int x86_udelay_tsc = 0; /* Delay via TSC */
21 21
22 void __delay(unsigned long loops) 22 void __delay(unsigned long loops)
23 { 23 {
24 unsigned bclock, now; 24 unsigned bclock, now;
25 25
26 rdtscl(bclock); 26 rdtscl(bclock);
27 do 27 do
28 { 28 {
29 rep_nop(); 29 rep_nop();
30 rdtscl(now); 30 rdtscl(now);
31 } 31 }
32 while((now-bclock) < loops); 32 while((now-bclock) < loops);
33 } 33 }
34 34
35 inline void __const_udelay(unsigned long xloops) 35 inline void __const_udelay(unsigned long xloops)
36 { 36 {
37 __delay(((xloops * cpu_data[_smp_processor_id()].loops_per_jiffy) >> 32) * HZ); 37 __delay(((xloops * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32) * HZ);
38 } 38 }
39 39
40 void __udelay(unsigned long usecs) 40 void __udelay(unsigned long usecs)
41 { 41 {
42 __const_udelay(usecs * 0x000010c6); /* 2**32 / 1000000 */ 42 __const_udelay(usecs * 0x000010c6); /* 2**32 / 1000000 */
43 } 43 }
44 44
45 void __ndelay(unsigned long nsecs) 45 void __ndelay(unsigned long nsecs)
46 { 46 {
47 __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ 47 __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
48 } 48 }
49 49
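
__delay() above busy-waits on the time-stamp counter rather than running a calibrated loop. A userspace sketch of the same idea (x86 only; the cycles-per-microsecond figure is an assumed calibration, standing in for loops_per_jiffy):

    #include <stdio.h>
    #include <x86intrin.h>

    static void tsc_delay(unsigned long long cycles)
    {
        unsigned long long start = __rdtsc();
        while (__rdtsc() - start < cycles)
            _mm_pause();                  /* rep_nop() in the kernel version */
    }

    int main(void)
    {
        const unsigned long long cycles_per_usec = 3000;   /* assumed ~3 GHz TSC */
        tsc_delay(100 * cycles_per_usec);                  /* roughly 100 microseconds */
        printf("busy-waited ~100us worth of TSC cycles\n");
        return 0;
    }
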
drivers/acpi/processor_idle.c
1 /* 1 /*
2 * processor_idle - idle state submodule to the ACPI processor driver 2 * processor_idle - idle state submodule to the ACPI processor driver
3 * 3 *
4 * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com> 4 * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
5 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> 5 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
6 * Copyright (C) 2004 Dominik Brodowski <linux@brodo.de> 6 * Copyright (C) 2004 Dominik Brodowski <linux@brodo.de>
7 * Copyright (C) 2004 Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com> 7 * Copyright (C) 2004 Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
8 * - Added processor hotplug support 8 * - Added processor hotplug support
9 * 9 *
10 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 10 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
11 * 11 *
12 * This program is free software; you can redistribute it and/or modify 12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by 13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or (at 14 * the Free Software Foundation; either version 2 of the License, or (at
15 * your option) any later version. 15 * your option) any later version.
16 * 16 *
17 * This program is distributed in the hope that it will be useful, but 17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of 18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details. 20 * General Public License for more details.
21 * 21 *
22 * You should have received a copy of the GNU General Public License along 22 * You should have received a copy of the GNU General Public License along
23 * with this program; if not, write to the Free Software Foundation, Inc., 23 * with this program; if not, write to the Free Software Foundation, Inc.,
24 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 24 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
25 * 25 *
26 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 26 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
27 */ 27 */
28 28
29 #include <linux/kernel.h> 29 #include <linux/kernel.h>
30 #include <linux/module.h> 30 #include <linux/module.h>
31 #include <linux/init.h> 31 #include <linux/init.h>
32 #include <linux/cpufreq.h> 32 #include <linux/cpufreq.h>
33 #include <linux/proc_fs.h> 33 #include <linux/proc_fs.h>
34 #include <linux/seq_file.h> 34 #include <linux/seq_file.h>
35 #include <linux/acpi.h> 35 #include <linux/acpi.h>
36 #include <linux/dmi.h> 36 #include <linux/dmi.h>
37 #include <linux/moduleparam.h> 37 #include <linux/moduleparam.h>
38 38
39 #include <asm/io.h> 39 #include <asm/io.h>
40 #include <asm/uaccess.h> 40 #include <asm/uaccess.h>
41 41
42 #include <acpi/acpi_bus.h> 42 #include <acpi/acpi_bus.h>
43 #include <acpi/processor.h> 43 #include <acpi/processor.h>
44 44
45 #define ACPI_PROCESSOR_COMPONENT 0x01000000 45 #define ACPI_PROCESSOR_COMPONENT 0x01000000
46 #define ACPI_PROCESSOR_CLASS "processor" 46 #define ACPI_PROCESSOR_CLASS "processor"
47 #define ACPI_PROCESSOR_DRIVER_NAME "ACPI Processor Driver" 47 #define ACPI_PROCESSOR_DRIVER_NAME "ACPI Processor Driver"
48 #define _COMPONENT ACPI_PROCESSOR_COMPONENT 48 #define _COMPONENT ACPI_PROCESSOR_COMPONENT
49 ACPI_MODULE_NAME ("acpi_processor") 49 ACPI_MODULE_NAME ("acpi_processor")
50 50
51 #define ACPI_PROCESSOR_FILE_POWER "power" 51 #define ACPI_PROCESSOR_FILE_POWER "power"
52 52
53 #define US_TO_PM_TIMER_TICKS(t) ((t * (PM_TIMER_FREQUENCY/1000)) / 1000) 53 #define US_TO_PM_TIMER_TICKS(t) ((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
54 #define C2_OVERHEAD 4 /* 1us (3.579 ticks per us) */ 54 #define C2_OVERHEAD 4 /* 1us (3.579 ticks per us) */
55 #define C3_OVERHEAD 4 /* 1us (3.579 ticks per us) */ 55 #define C3_OVERHEAD 4 /* 1us (3.579 ticks per us) */
56 56
57 static void (*pm_idle_save)(void); 57 static void (*pm_idle_save)(void);
58 module_param(max_cstate, uint, 0644); 58 module_param(max_cstate, uint, 0644);
59 59
60 static unsigned int nocst = 0; 60 static unsigned int nocst = 0;
61 module_param(nocst, uint, 0000); 61 module_param(nocst, uint, 0000);
62 62
63 /* 63 /*
64 * bm_history -- bit-mask with a bit per jiffy of bus-master activity 64 * bm_history -- bit-mask with a bit per jiffy of bus-master activity
65 * 1000 HZ: 0xFFFFFFFF: 32 jiffies = 32ms 65 * 1000 HZ: 0xFFFFFFFF: 32 jiffies = 32ms
66 * 800 HZ: 0xFFFFFFFF: 32 jiffies = 40ms 66 * 800 HZ: 0xFFFFFFFF: 32 jiffies = 40ms
67 * 100 HZ: 0x0000000F: 4 jiffies = 40ms 67 * 100 HZ: 0x0000000F: 4 jiffies = 40ms
68 * reduce history for more aggressive entry into C3 68 * reduce history for more aggressive entry into C3
69 */ 69 */
70 static unsigned int bm_history = (HZ >= 800 ? 0xFFFFFFFF : ((1U << (HZ / 25)) - 1)); 70 static unsigned int bm_history = (HZ >= 800 ? 0xFFFFFFFF : ((1U << (HZ / 25)) - 1));
71 module_param(bm_history, uint, 0644); 71 module_param(bm_history, uint, 0644);
72 /* -------------------------------------------------------------------------- 72 /* --------------------------------------------------------------------------
73 Power Management 73 Power Management
74 -------------------------------------------------------------------------- */ 74 -------------------------------------------------------------------------- */
75 75
76 /* 76 /*
77 * IBM ThinkPad R40e crashes mysteriously when going into C2 or C3. 77 * IBM ThinkPad R40e crashes mysteriously when going into C2 or C3.
78 * For now disable this. Probably a bug somewhere else. 78 * For now disable this. Probably a bug somewhere else.
79 * 79 *
80 * To skip this limit, boot/load with a large max_cstate limit. 80 * To skip this limit, boot/load with a large max_cstate limit.
81 */ 81 */
82 static int no_c2c3(struct dmi_system_id *id) 82 static int no_c2c3(struct dmi_system_id *id)
83 { 83 {
84 if (max_cstate > ACPI_PROCESSOR_MAX_POWER) 84 if (max_cstate > ACPI_PROCESSOR_MAX_POWER)
85 return 0; 85 return 0;
86 86
87 printk(KERN_NOTICE PREFIX "%s detected - C2,C3 disabled." 87 printk(KERN_NOTICE PREFIX "%s detected - C2,C3 disabled."
88 " Override with \"processor.max_cstate=%d\"\n", id->ident, 88 " Override with \"processor.max_cstate=%d\"\n", id->ident,
89 ACPI_PROCESSOR_MAX_POWER + 1); 89 ACPI_PROCESSOR_MAX_POWER + 1);
90 90
91 max_cstate = 1; 91 max_cstate = 1;
92 92
93 return 0; 93 return 0;
94 } 94 }
95 95
96 96
97 97
98 98
99 static struct dmi_system_id __initdata processor_power_dmi_table[] = { 99 static struct dmi_system_id __initdata processor_power_dmi_table[] = {
100 { no_c2c3, "IBM ThinkPad R40e", { 100 { no_c2c3, "IBM ThinkPad R40e", {
101 DMI_MATCH(DMI_BIOS_VENDOR,"IBM"), 101 DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
102 DMI_MATCH(DMI_BIOS_VERSION,"1SET60WW") }}, 102 DMI_MATCH(DMI_BIOS_VERSION,"1SET60WW") }},
103 { no_c2c3, "Medion 41700", { 103 { no_c2c3, "Medion 41700", {
104 DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"), 104 DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"),
105 DMI_MATCH(DMI_BIOS_VERSION,"R01-A1J") }}, 105 DMI_MATCH(DMI_BIOS_VERSION,"R01-A1J") }},
106 {}, 106 {},
107 }; 107 };
108 108
109 109
110 static inline u32 110 static inline u32
111 ticks_elapsed ( 111 ticks_elapsed (
112 u32 t1, 112 u32 t1,
113 u32 t2) 113 u32 t2)
114 { 114 {
115 if (t2 >= t1) 115 if (t2 >= t1)
116 return (t2 - t1); 116 return (t2 - t1);
117 else if (!acpi_fadt.tmr_val_ext) 117 else if (!acpi_fadt.tmr_val_ext)
118 return (((0x00FFFFFF - t1) + t2) & 0x00FFFFFF); 118 return (((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
119 else 119 else
120 return ((0xFFFFFFFF - t1) + t2); 120 return ((0xFFFFFFFF - t1) + t2);
121 } 121 }
122 122
123 123
124 static void 124 static void
125 acpi_processor_power_activate ( 125 acpi_processor_power_activate (
126 struct acpi_processor *pr, 126 struct acpi_processor *pr,
127 struct acpi_processor_cx *new) 127 struct acpi_processor_cx *new)
128 { 128 {
129 struct acpi_processor_cx *old; 129 struct acpi_processor_cx *old;
130 130
131 if (!pr || !new) 131 if (!pr || !new)
132 return; 132 return;
133 133
134 old = pr->power.state; 134 old = pr->power.state;
135 135
136 if (old) 136 if (old)
137 old->promotion.count = 0; 137 old->promotion.count = 0;
138 new->demotion.count = 0; 138 new->demotion.count = 0;
139 139
140 /* Cleanup from old state. */ 140 /* Cleanup from old state. */
141 if (old) { 141 if (old) {
142 switch (old->type) { 142 switch (old->type) {
143 case ACPI_STATE_C3: 143 case ACPI_STATE_C3:
144 /* Disable bus master reload */ 144 /* Disable bus master reload */
145 if (new->type != ACPI_STATE_C3) 145 if (new->type != ACPI_STATE_C3)
146 acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0, ACPI_MTX_DO_NOT_LOCK); 146 acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0, ACPI_MTX_DO_NOT_LOCK);
147 break; 147 break;
148 } 148 }
149 } 149 }
150 150
151 /* Prepare to use new state. */ 151 /* Prepare to use new state. */
152 switch (new->type) { 152 switch (new->type) {
153 case ACPI_STATE_C3: 153 case ACPI_STATE_C3:
154 /* Enable bus master reload */ 154 /* Enable bus master reload */
155 if (old->type != ACPI_STATE_C3) 155 if (old->type != ACPI_STATE_C3)
156 acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1, ACPI_MTX_DO_NOT_LOCK); 156 acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1, ACPI_MTX_DO_NOT_LOCK);
157 break; 157 break;
158 } 158 }
159 159
160 pr->power.state = new; 160 pr->power.state = new;
161 161
162 return; 162 return;
163 } 163 }
164 164
165 165
166 static void acpi_processor_idle (void) 166 static void acpi_processor_idle (void)
167 { 167 {
168 struct acpi_processor *pr = NULL; 168 struct acpi_processor *pr = NULL;
169 struct acpi_processor_cx *cx = NULL; 169 struct acpi_processor_cx *cx = NULL;
170 struct acpi_processor_cx *next_state = NULL; 170 struct acpi_processor_cx *next_state = NULL;
171 int sleep_ticks = 0; 171 int sleep_ticks = 0;
172 u32 t1, t2 = 0; 172 u32 t1, t2 = 0;
173 173
174 pr = processors[_smp_processor_id()]; 174 pr = processors[raw_smp_processor_id()];
175 if (!pr) 175 if (!pr)
176 return; 176 return;
177 177
178 /* 178 /*
179 * Interrupts must be disabled during bus mastering calculations and 179 * Interrupts must be disabled during bus mastering calculations and
180 * for C2/C3 transitions. 180 * for C2/C3 transitions.
181 */ 181 */
182 local_irq_disable(); 182 local_irq_disable();
183 183
184 /* 184 /*
185 * Check whether we truly need to go idle, or should 185 * Check whether we truly need to go idle, or should
186 * reschedule: 186 * reschedule:
187 */ 187 */
188 if (unlikely(need_resched())) { 188 if (unlikely(need_resched())) {
189 local_irq_enable(); 189 local_irq_enable();
190 return; 190 return;
191 } 191 }
192 192
193 cx = pr->power.state; 193 cx = pr->power.state;
194 if (!cx) 194 if (!cx)
195 goto easy_out; 195 goto easy_out;
196 196
197 /* 197 /*
198 * Check BM Activity 198 * Check BM Activity
199 * ----------------- 199 * -----------------
200 * Check for bus mastering activity (if required), record, and check 200 * Check for bus mastering activity (if required), record, and check
201 * for demotion. 201 * for demotion.
202 */ 202 */
203 if (pr->flags.bm_check) { 203 if (pr->flags.bm_check) {
204 u32 bm_status = 0; 204 u32 bm_status = 0;
205 unsigned long diff = jiffies - pr->power.bm_check_timestamp; 205 unsigned long diff = jiffies - pr->power.bm_check_timestamp;
206 206
207 if (diff > 32) 207 if (diff > 32)
208 diff = 32; 208 diff = 32;
209 209
210 while (diff) { 210 while (diff) {
211 /* if we didn't get called, assume there was busmaster activity */ 211 /* if we didn't get called, assume there was busmaster activity */
212 diff--; 212 diff--;
213 if (diff) 213 if (diff)
214 pr->power.bm_activity |= 0x1; 214 pr->power.bm_activity |= 0x1;
215 pr->power.bm_activity <<= 1; 215 pr->power.bm_activity <<= 1;
216 } 216 }
217 217
218 acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, 218 acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS,
219 &bm_status, ACPI_MTX_DO_NOT_LOCK); 219 &bm_status, ACPI_MTX_DO_NOT_LOCK);
220 if (bm_status) { 220 if (bm_status) {
221 pr->power.bm_activity++; 221 pr->power.bm_activity++;
222 acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 222 acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS,
223 1, ACPI_MTX_DO_NOT_LOCK); 223 1, ACPI_MTX_DO_NOT_LOCK);
224 } 224 }
225 /* 225 /*
226 * PIIX4 Erratum #18: Note that BM_STS doesn't always reflect 226 * PIIX4 Erratum #18: Note that BM_STS doesn't always reflect
227 * the true state of bus mastering activity; forcing us to 227 * the true state of bus mastering activity; forcing us to
228 * manually check the BMIDEA bit of each IDE channel. 228 * manually check the BMIDEA bit of each IDE channel.
229 */ 229 */
230 else if (errata.piix4.bmisx) { 230 else if (errata.piix4.bmisx) {
231 if ((inb_p(errata.piix4.bmisx + 0x02) & 0x01) 231 if ((inb_p(errata.piix4.bmisx + 0x02) & 0x01)
232 || (inb_p(errata.piix4.bmisx + 0x0A) & 0x01)) 232 || (inb_p(errata.piix4.bmisx + 0x0A) & 0x01))
233 pr->power.bm_activity++; 233 pr->power.bm_activity++;
234 } 234 }
235 235
236 pr->power.bm_check_timestamp = jiffies; 236 pr->power.bm_check_timestamp = jiffies;
237 237
238 /* 238 /*
239 * Apply bus mastering demotion policy. Automatically demote 239 * Apply bus mastering demotion policy. Automatically demote
240 * to avoid a faulty transition. Note that the processor 240 * to avoid a faulty transition. Note that the processor
241 * won't enter a low-power state during this call (to this 241 * won't enter a low-power state during this call (to this
242 * function) but should upon the next. 242 * function) but should upon the next.
243 * 243 *
244 * TBD: A better policy might be to fall back to the demotion 244 * TBD: A better policy might be to fall back to the demotion
245 * state (use it for this quantum only) instead of 245 * state (use it for this quantum only) instead of
246 * demoting -- and rely on duration as our sole demotion 246 * demoting -- and rely on duration as our sole demotion
247 * qualification. This may, however, introduce DMA 247 * qualification. This may, however, introduce DMA
248 * issues (e.g. floppy DMA transfer overrun/underrun). 248 * issues (e.g. floppy DMA transfer overrun/underrun).
249 */ 249 */
250 if (pr->power.bm_activity & cx->demotion.threshold.bm) { 250 if (pr->power.bm_activity & cx->demotion.threshold.bm) {
251 local_irq_enable(); 251 local_irq_enable();
252 next_state = cx->demotion.state; 252 next_state = cx->demotion.state;
253 goto end; 253 goto end;
254 } 254 }
255 } 255 }
256 256
257 cx->usage++; 257 cx->usage++;
258 258
259 /* 259 /*
260 * Sleep: 260 * Sleep:
261 * ------ 261 * ------
262 * Invoke the current Cx state to put the processor to sleep. 262 * Invoke the current Cx state to put the processor to sleep.
263 */ 263 */
264 switch (cx->type) { 264 switch (cx->type) {
265 265
266 case ACPI_STATE_C1: 266 case ACPI_STATE_C1:
267 /* 267 /*
268 * Invoke C1. 268 * Invoke C1.
269 * Use the appropriate idle routine, the one that would 269 * Use the appropriate idle routine, the one that would
270 * be used without acpi C-states. 270 * be used without acpi C-states.
271 */ 271 */
272 if (pm_idle_save) 272 if (pm_idle_save)
273 pm_idle_save(); 273 pm_idle_save();
274 else 274 else
275 safe_halt(); 275 safe_halt();
276 /* 276 /*
277 * TBD: Can't get time duration while in C1, as resumes 277 * TBD: Can't get time duration while in C1, as resumes
278 * go to an ISR rather than here. Need to instrument 278 * go to an ISR rather than here. Need to instrument
279 * base interrupt handler. 279 * base interrupt handler.
280 */ 280 */
281 sleep_ticks = 0xFFFFFFFF; 281 sleep_ticks = 0xFFFFFFFF;
282 break; 282 break;
283 283
284 case ACPI_STATE_C2: 284 case ACPI_STATE_C2:
285 /* Get start time (ticks) */ 285 /* Get start time (ticks) */
286 t1 = inl(acpi_fadt.xpm_tmr_blk.address); 286 t1 = inl(acpi_fadt.xpm_tmr_blk.address);
287 /* Invoke C2 */ 287 /* Invoke C2 */
288 inb(cx->address); 288 inb(cx->address);
289 /* Dummy op - must do something useless after P_LVL2 read */ 289 /* Dummy op - must do something useless after P_LVL2 read */
290 t2 = inl(acpi_fadt.xpm_tmr_blk.address); 290 t2 = inl(acpi_fadt.xpm_tmr_blk.address);
291 /* Get end time (ticks) */ 291 /* Get end time (ticks) */
292 t2 = inl(acpi_fadt.xpm_tmr_blk.address); 292 t2 = inl(acpi_fadt.xpm_tmr_blk.address);
293 /* Re-enable interrupts */ 293 /* Re-enable interrupts */
294 local_irq_enable(); 294 local_irq_enable();
295 /* Compute time (ticks) that we were actually asleep */ 295 /* Compute time (ticks) that we were actually asleep */
296 sleep_ticks = ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD; 296 sleep_ticks = ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD;
297 break; 297 break;
298 298
299 case ACPI_STATE_C3: 299 case ACPI_STATE_C3:
300 /* Disable bus master arbitration */ 300 /* Disable bus master arbitration */
301 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1, ACPI_MTX_DO_NOT_LOCK); 301 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1, ACPI_MTX_DO_NOT_LOCK);
302 /* Get start time (ticks) */ 302 /* Get start time (ticks) */
303 t1 = inl(acpi_fadt.xpm_tmr_blk.address); 303 t1 = inl(acpi_fadt.xpm_tmr_blk.address);
304 /* Invoke C3 */ 304 /* Invoke C3 */
305 inb(cx->address); 305 inb(cx->address);
306 /* Dummy op - must do something useless after P_LVL3 read */ 306 /* Dummy op - must do something useless after P_LVL3 read */
307 t2 = inl(acpi_fadt.xpm_tmr_blk.address); 307 t2 = inl(acpi_fadt.xpm_tmr_blk.address);
308 /* Get end time (ticks) */ 308 /* Get end time (ticks) */
309 t2 = inl(acpi_fadt.xpm_tmr_blk.address); 309 t2 = inl(acpi_fadt.xpm_tmr_blk.address);
310 /* Enable bus master arbitration */ 310 /* Enable bus master arbitration */
311 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0, ACPI_MTX_DO_NOT_LOCK); 311 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0, ACPI_MTX_DO_NOT_LOCK);
312 /* Re-enable interrupts */ 312 /* Re-enable interrupts */
313 local_irq_enable(); 313 local_irq_enable();
314 /* Compute time (ticks) that we were actually asleep */ 314 /* Compute time (ticks) that we were actually asleep */
315 sleep_ticks = ticks_elapsed(t1, t2) - cx->latency_ticks - C3_OVERHEAD; 315 sleep_ticks = ticks_elapsed(t1, t2) - cx->latency_ticks - C3_OVERHEAD;
316 break; 316 break;
317 317
318 default: 318 default:
319 local_irq_enable(); 319 local_irq_enable();
320 return; 320 return;
321 } 321 }
322 322
323 next_state = pr->power.state; 323 next_state = pr->power.state;
324 324
325 /* 325 /*
326 * Promotion? 326 * Promotion?
327 * ---------- 327 * ----------
328 * Track the number of longs (time asleep is greater than threshold) 328 * Track the number of longs (time asleep is greater than threshold)
329 * and promote when the count threshold is reached. Note that bus 329 * and promote when the count threshold is reached. Note that bus
330 * mastering activity may prevent promotions. 330 * mastering activity may prevent promotions.
331 * Do not promote above max_cstate. 331 * Do not promote above max_cstate.
332 */ 332 */
333 if (cx->promotion.state && 333 if (cx->promotion.state &&
334 ((cx->promotion.state - pr->power.states) <= max_cstate)) { 334 ((cx->promotion.state - pr->power.states) <= max_cstate)) {
335 if (sleep_ticks > cx->promotion.threshold.ticks) { 335 if (sleep_ticks > cx->promotion.threshold.ticks) {
336 cx->promotion.count++; 336 cx->promotion.count++;
337 cx->demotion.count = 0; 337 cx->demotion.count = 0;
338 if (cx->promotion.count >= cx->promotion.threshold.count) { 338 if (cx->promotion.count >= cx->promotion.threshold.count) {
339 if (pr->flags.bm_check) { 339 if (pr->flags.bm_check) {
340 if (!(pr->power.bm_activity & cx->promotion.threshold.bm)) { 340 if (!(pr->power.bm_activity & cx->promotion.threshold.bm)) {
341 next_state = cx->promotion.state; 341 next_state = cx->promotion.state;
342 goto end; 342 goto end;
343 } 343 }
344 } 344 }
345 else { 345 else {
346 next_state = cx->promotion.state; 346 next_state = cx->promotion.state;
347 goto end; 347 goto end;
348 } 348 }
349 } 349 }
350 } 350 }
351 } 351 }
352 352
353 /* 353 /*
354 * Demotion? 354 * Demotion?
355 * --------- 355 * ---------
356 * Track the number of shorts (time asleep is less than time threshold) 356 * Track the number of shorts (time asleep is less than time threshold)
357 * and demote when the usage threshold is reached. 357 * and demote when the usage threshold is reached.
358 */ 358 */
359 if (cx->demotion.state) { 359 if (cx->demotion.state) {
360 if (sleep_ticks < cx->demotion.threshold.ticks) { 360 if (sleep_ticks < cx->demotion.threshold.ticks) {
361 cx->demotion.count++; 361 cx->demotion.count++;
362 cx->promotion.count = 0; 362 cx->promotion.count = 0;
363 if (cx->demotion.count >= cx->demotion.threshold.count) { 363 if (cx->demotion.count >= cx->demotion.threshold.count) {
364 next_state = cx->demotion.state; 364 next_state = cx->demotion.state;
365 goto end; 365 goto end;
366 } 366 }
367 } 367 }
368 } 368 }
369 369
370 end: 370 end:
371 /* 371 /*
372 * Demote if current state exceeds max_cstate 372 * Demote if current state exceeds max_cstate
373 */ 373 */
374 if ((pr->power.state - pr->power.states) > max_cstate) { 374 if ((pr->power.state - pr->power.states) > max_cstate) {
375 if (cx->demotion.state) 375 if (cx->demotion.state)
376 next_state = cx->demotion.state; 376 next_state = cx->demotion.state;
377 } 377 }
378 378
379 /* 379 /*
380 * New Cx State? 380 * New Cx State?
381 * ------------- 381 * -------------
382 * If we're going to start using a new Cx state we must clean up 382 * If we're going to start using a new Cx state we must clean up
383 * from the previous and prepare to use the new. 383 * from the previous and prepare to use the new.
384 */ 384 */
385 if (next_state != pr->power.state) 385 if (next_state != pr->power.state)
386 acpi_processor_power_activate(pr, next_state); 386 acpi_processor_power_activate(pr, next_state);
387 387
388 return; 388 return;
389 389
390 easy_out: 390 easy_out:
391 /* do C1 instead of busy loop */ 391 /* do C1 instead of busy loop */
392 if (pm_idle_save) 392 if (pm_idle_save)
393 pm_idle_save(); 393 pm_idle_save();
394 else 394 else
395 safe_halt(); 395 safe_halt();
396 return; 396 return;
397 } 397 }
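
The policy code above is a counting hysteresis: enough consecutive sleeps longer than the promotion threshold move the processor to a deeper C-state (unless recent bus-master activity vetoes it), and enough consecutive sleeps shorter than the demotion threshold move it back, each kind of event resetting the other counter. The following is a minimal, self-contained userspace sketch of that counting scheme only; the structure, thresholds, and sample values are illustrative stand-ins, not the driver's real types.

/* Hysteresis sketch: +1 = promote to a deeper C-state, -1 = demote, 0 = stay. */
#include <stdio.h>

struct cx_policy {
	int promote_ticks;	/* a sleep longer than this counts as a "long" */
	int promote_count;	/* consecutive longs required to promote */
	int demote_ticks;	/* a sleep shorter than this counts as a "short" */
	int demote_count;	/* consecutive shorts required to demote */
	int longs, shorts;	/* running counters; each event resets the other */
};

static int cx_update(struct cx_policy *p, int sleep_ticks, int bm_active)
{
	if (sleep_ticks > p->promote_ticks) {
		p->shorts = 0;
		if (++p->longs >= p->promote_count && !bm_active) {
			p->longs = 0;
			return +1;		/* promote */
		}
	} else if (sleep_ticks < p->demote_ticks) {
		p->longs = 0;
		if (++p->shorts >= p->demote_count) {
			p->shorts = 0;
			return -1;		/* demote */
		}
	}
	return 0;				/* keep the current state */
}

int main(void)
{
	/* Thresholds mirror the defaults set further down in this file:
	 * demote after one short sleep, promote after four long ones. */
	struct cx_policy p = { .promote_ticks = 100, .promote_count = 4,
			       .demote_ticks = 100, .demote_count = 1 };
	int samples[] = { 150, 150, 150, 150, 20 };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("sleep=%3d -> %+d\n", samples[i], cx_update(&p, samples[i], 0));
	return 0;
}

Running the sketch promotes on the fourth long sample and demotes on the single short one, which is the behaviour the promotion/demotion blocks above implement with their count and bm thresholds.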
398 398
399 399
400 static int 400 static int
401 acpi_processor_set_power_policy ( 401 acpi_processor_set_power_policy (
402 struct acpi_processor *pr) 402 struct acpi_processor *pr)
403 { 403 {
404 unsigned int i; 404 unsigned int i;
405 unsigned int state_is_set = 0; 405 unsigned int state_is_set = 0;
406 struct acpi_processor_cx *lower = NULL; 406 struct acpi_processor_cx *lower = NULL;
407 struct acpi_processor_cx *higher = NULL; 407 struct acpi_processor_cx *higher = NULL;
408 struct acpi_processor_cx *cx; 408 struct acpi_processor_cx *cx;
409 409
410 ACPI_FUNCTION_TRACE("acpi_processor_set_power_policy"); 410 ACPI_FUNCTION_TRACE("acpi_processor_set_power_policy");
411 411
412 if (!pr) 412 if (!pr)
413 return_VALUE(-EINVAL); 413 return_VALUE(-EINVAL);
414 414
415 /* 415 /*
416 * This function sets the default Cx state policy (OS idle handler). 416 * This function sets the default Cx state policy (OS idle handler).
417 * Our scheme is to promote quickly to C2 but more conservatively 417 * Our scheme is to promote quickly to C2 but more conservatively
418 * to C3. We're favoring C2 for its characteristics of low latency 418 * to C3. We're favoring C2 for its characteristics of low latency
419 * (quick response), good power savings, and ability to allow bus 419 * (quick response), good power savings, and ability to allow bus
420 * mastering activity. Note that the Cx state policy is completely 420 * mastering activity. Note that the Cx state policy is completely
421 * customizable and can be altered dynamically. 421 * customizable and can be altered dynamically.
422 */ 422 */
423 423
424 /* startup state */ 424 /* startup state */
425 for (i=1; i < ACPI_PROCESSOR_MAX_POWER; i++) { 425 for (i=1; i < ACPI_PROCESSOR_MAX_POWER; i++) {
426 cx = &pr->power.states[i]; 426 cx = &pr->power.states[i];
427 if (!cx->valid) 427 if (!cx->valid)
428 continue; 428 continue;
429 429
430 if (!state_is_set) 430 if (!state_is_set)
431 pr->power.state = cx; 431 pr->power.state = cx;
432 state_is_set++; 432 state_is_set++;
433 break; 433 break;
434 } 434 }
435 435
436 if (!state_is_set) 436 if (!state_is_set)
437 return_VALUE(-ENODEV); 437 return_VALUE(-ENODEV);
438 438
439 /* demotion */ 439 /* demotion */
440 for (i=1; i < ACPI_PROCESSOR_MAX_POWER; i++) { 440 for (i=1; i < ACPI_PROCESSOR_MAX_POWER; i++) {
441 cx = &pr->power.states[i]; 441 cx = &pr->power.states[i];
442 if (!cx->valid) 442 if (!cx->valid)
443 continue; 443 continue;
444 444
445 if (lower) { 445 if (lower) {
446 cx->demotion.state = lower; 446 cx->demotion.state = lower;
447 cx->demotion.threshold.ticks = cx->latency_ticks; 447 cx->demotion.threshold.ticks = cx->latency_ticks;
448 cx->demotion.threshold.count = 1; 448 cx->demotion.threshold.count = 1;
449 if (cx->type == ACPI_STATE_C3) 449 if (cx->type == ACPI_STATE_C3)
450 cx->demotion.threshold.bm = bm_history; 450 cx->demotion.threshold.bm = bm_history;
451 } 451 }
452 452
453 lower = cx; 453 lower = cx;
454 } 454 }
455 455
456 /* promotion */ 456 /* promotion */
457 for (i = (ACPI_PROCESSOR_MAX_POWER - 1); i > 0; i--) { 457 for (i = (ACPI_PROCESSOR_MAX_POWER - 1); i > 0; i--) {
458 cx = &pr->power.states[i]; 458 cx = &pr->power.states[i];
459 if (!cx->valid) 459 if (!cx->valid)
460 continue; 460 continue;
461 461
462 if (higher) { 462 if (higher) {
463 cx->promotion.state = higher; 463 cx->promotion.state = higher;
464 cx->promotion.threshold.ticks = cx->latency_ticks; 464 cx->promotion.threshold.ticks = cx->latency_ticks;
465 if (cx->type >= ACPI_STATE_C2) 465 if (cx->type >= ACPI_STATE_C2)
466 cx->promotion.threshold.count = 4; 466 cx->promotion.threshold.count = 4;
467 else 467 else
468 cx->promotion.threshold.count = 10; 468 cx->promotion.threshold.count = 10;
469 if (higher->type == ACPI_STATE_C3) 469 if (higher->type == ACPI_STATE_C3)
470 cx->promotion.threshold.bm = bm_history; 470 cx->promotion.threshold.bm = bm_history;
471 } 471 }
472 472
473 higher = cx; 473 higher = cx;
474 } 474 }
475 475
476 return_VALUE(0); 476 return_VALUE(0);
477 } 477 }
478 478
479 479
480 static int acpi_processor_get_power_info_fadt (struct acpi_processor *pr) 480 static int acpi_processor_get_power_info_fadt (struct acpi_processor *pr)
481 { 481 {
482 int i; 482 int i;
483 483
484 ACPI_FUNCTION_TRACE("acpi_processor_get_power_info_fadt"); 484 ACPI_FUNCTION_TRACE("acpi_processor_get_power_info_fadt");
485 485
486 if (!pr) 486 if (!pr)
487 return_VALUE(-EINVAL); 487 return_VALUE(-EINVAL);
488 488
489 if (!pr->pblk) 489 if (!pr->pblk)
490 return_VALUE(-ENODEV); 490 return_VALUE(-ENODEV);
491 491
492 for (i = 0; i < ACPI_PROCESSOR_MAX_POWER; i++) 492 for (i = 0; i < ACPI_PROCESSOR_MAX_POWER; i++)
493 memset(pr->power.states, 0, sizeof(struct acpi_processor_cx)); 493 memset(pr->power.states, 0, sizeof(struct acpi_processor_cx));
494 494
495 /* if info is obtained from pblk/fadt, type equals state */ 495 /* if info is obtained from pblk/fadt, type equals state */
496 pr->power.states[ACPI_STATE_C1].type = ACPI_STATE_C1; 496 pr->power.states[ACPI_STATE_C1].type = ACPI_STATE_C1;
497 pr->power.states[ACPI_STATE_C2].type = ACPI_STATE_C2; 497 pr->power.states[ACPI_STATE_C2].type = ACPI_STATE_C2;
498 pr->power.states[ACPI_STATE_C3].type = ACPI_STATE_C3; 498 pr->power.states[ACPI_STATE_C3].type = ACPI_STATE_C3;
499 499
500 /* the C0 state only exists as a filler in our array, 500 /* the C0 state only exists as a filler in our array,
501 * and all processors need to support C1 */ 501 * and all processors need to support C1 */
502 pr->power.states[ACPI_STATE_C0].valid = 1; 502 pr->power.states[ACPI_STATE_C0].valid = 1;
503 pr->power.states[ACPI_STATE_C1].valid = 1; 503 pr->power.states[ACPI_STATE_C1].valid = 1;
504 504
505 /* determine C2 and C3 address from pblk */ 505 /* determine C2 and C3 address from pblk */
506 pr->power.states[ACPI_STATE_C2].address = pr->pblk + 4; 506 pr->power.states[ACPI_STATE_C2].address = pr->pblk + 4;
507 pr->power.states[ACPI_STATE_C3].address = pr->pblk + 5; 507 pr->power.states[ACPI_STATE_C3].address = pr->pblk + 5;
508 508
509 /* determine latencies from FADT */ 509 /* determine latencies from FADT */
510 pr->power.states[ACPI_STATE_C2].latency = acpi_fadt.plvl2_lat; 510 pr->power.states[ACPI_STATE_C2].latency = acpi_fadt.plvl2_lat;
511 pr->power.states[ACPI_STATE_C3].latency = acpi_fadt.plvl3_lat; 511 pr->power.states[ACPI_STATE_C3].latency = acpi_fadt.plvl3_lat;
512 512
513 ACPI_DEBUG_PRINT((ACPI_DB_INFO, 513 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
514 "lvl2[0x%08x] lvl3[0x%08x]\n", 514 "lvl2[0x%08x] lvl3[0x%08x]\n",
515 pr->power.states[ACPI_STATE_C2].address, 515 pr->power.states[ACPI_STATE_C2].address,
516 pr->power.states[ACPI_STATE_C3].address)); 516 pr->power.states[ACPI_STATE_C3].address));
517 517
518 return_VALUE(0); 518 return_VALUE(0);
519 } 519 }
520 520
521 521
522 static int acpi_processor_get_power_info_cst (struct acpi_processor *pr) 522 static int acpi_processor_get_power_info_cst (struct acpi_processor *pr)
523 { 523 {
524 acpi_status status = 0; 524 acpi_status status = 0;
525 acpi_integer count; 525 acpi_integer count;
526 int i; 526 int i;
527 struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; 527 struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
528 union acpi_object *cst; 528 union acpi_object *cst;
529 529
530 ACPI_FUNCTION_TRACE("acpi_processor_get_power_info_cst"); 530 ACPI_FUNCTION_TRACE("acpi_processor_get_power_info_cst");
531 531
532 if (errata.smp) 532 if (errata.smp)
533 return_VALUE(-ENODEV); 533 return_VALUE(-ENODEV);
534 534
535 if (nocst) 535 if (nocst)
536 return_VALUE(-ENODEV); 536 return_VALUE(-ENODEV);
537 537
538 pr->power.count = 0; 538 pr->power.count = 0;
539 for (i = 0; i < ACPI_PROCESSOR_MAX_POWER; i++) 539 for (i = 0; i < ACPI_PROCESSOR_MAX_POWER; i++)
540 memset(pr->power.states, 0, sizeof(struct acpi_processor_cx)); 540 memset(pr->power.states, 0, sizeof(struct acpi_processor_cx));
541 541
542 status = acpi_evaluate_object(pr->handle, "_CST", NULL, &buffer); 542 status = acpi_evaluate_object(pr->handle, "_CST", NULL, &buffer);
543 if (ACPI_FAILURE(status)) { 543 if (ACPI_FAILURE(status)) {
544 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No _CST, giving up\n")); 544 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No _CST, giving up\n"));
545 return_VALUE(-ENODEV); 545 return_VALUE(-ENODEV);
546 } 546 }
547 547
548 cst = (union acpi_object *) buffer.pointer; 548 cst = (union acpi_object *) buffer.pointer;
549 549
550 /* There must be at least 2 elements */ 550 /* There must be at least 2 elements */
551 if (!cst || (cst->type != ACPI_TYPE_PACKAGE) || cst->package.count < 2) { 551 if (!cst || (cst->type != ACPI_TYPE_PACKAGE) || cst->package.count < 2) {
552 ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "not enough elements in _CST\n")); 552 ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "not enough elements in _CST\n"));
553 status = -EFAULT; 553 status = -EFAULT;
554 goto end; 554 goto end;
555 } 555 }
556 556
557 count = cst->package.elements[0].integer.value; 557 count = cst->package.elements[0].integer.value;
558 558
559 /* Validate number of power states. */ 559 /* Validate number of power states. */
560 if (count < 1 || count != cst->package.count - 1) { 560 if (count < 1 || count != cst->package.count - 1) {
561 ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "count given by _CST is not valid\n")); 561 ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "count given by _CST is not valid\n"));
562 status = -EFAULT; 562 status = -EFAULT;
563 goto end; 563 goto end;
564 } 564 }
565 565
566 /* We support up to ACPI_PROCESSOR_MAX_POWER. */ 566 /* We support up to ACPI_PROCESSOR_MAX_POWER. */
567 if (count > ACPI_PROCESSOR_MAX_POWER) { 567 if (count > ACPI_PROCESSOR_MAX_POWER) {
568 printk(KERN_WARNING "Limiting number of power states to max (%d)\n", ACPI_PROCESSOR_MAX_POWER); 568 printk(KERN_WARNING "Limiting number of power states to max (%d)\n", ACPI_PROCESSOR_MAX_POWER);
569 printk(KERN_WARNING "Please increase ACPI_PROCESSOR_MAX_POWER if needed.\n"); 569 printk(KERN_WARNING "Please increase ACPI_PROCESSOR_MAX_POWER if needed.\n");
570 count = ACPI_PROCESSOR_MAX_POWER; 570 count = ACPI_PROCESSOR_MAX_POWER;
571 } 571 }
572 572
573 /* Tell driver that at least _CST is supported. */ 573 /* Tell driver that at least _CST is supported. */
574 pr->flags.has_cst = 1; 574 pr->flags.has_cst = 1;
575 575
576 for (i = 1; i <= count; i++) { 576 for (i = 1; i <= count; i++) {
577 union acpi_object *element; 577 union acpi_object *element;
578 union acpi_object *obj; 578 union acpi_object *obj;
579 struct acpi_power_register *reg; 579 struct acpi_power_register *reg;
580 struct acpi_processor_cx cx; 580 struct acpi_processor_cx cx;
581 581
582 memset(&cx, 0, sizeof(cx)); 582 memset(&cx, 0, sizeof(cx));
583 583
584 element = (union acpi_object *) &(cst->package.elements[i]); 584 element = (union acpi_object *) &(cst->package.elements[i]);
585 if (element->type != ACPI_TYPE_PACKAGE) 585 if (element->type != ACPI_TYPE_PACKAGE)
586 continue; 586 continue;
587 587
588 if (element->package.count != 4) 588 if (element->package.count != 4)
589 continue; 589 continue;
590 590
591 obj = (union acpi_object *) &(element->package.elements[0]); 591 obj = (union acpi_object *) &(element->package.elements[0]);
592 592
593 if (obj->type != ACPI_TYPE_BUFFER) 593 if (obj->type != ACPI_TYPE_BUFFER)
594 continue; 594 continue;
595 595
596 reg = (struct acpi_power_register *) obj->buffer.pointer; 596 reg = (struct acpi_power_register *) obj->buffer.pointer;
597 597
598 if (reg->space_id != ACPI_ADR_SPACE_SYSTEM_IO && 598 if (reg->space_id != ACPI_ADR_SPACE_SYSTEM_IO &&
599 (reg->space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) 599 (reg->space_id != ACPI_ADR_SPACE_FIXED_HARDWARE))
600 continue; 600 continue;
601 601
602 cx.address = (reg->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE) ? 602 cx.address = (reg->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE) ?
603 0 : reg->address; 603 0 : reg->address;
604 604
605 /* There should be an easy way to extract an integer... */ 605 /* There should be an easy way to extract an integer... */
606 obj = (union acpi_object *) &(element->package.elements[1]); 606 obj = (union acpi_object *) &(element->package.elements[1]);
607 if (obj->type != ACPI_TYPE_INTEGER) 607 if (obj->type != ACPI_TYPE_INTEGER)
608 continue; 608 continue;
609 609
610 cx.type = obj->integer.value; 610 cx.type = obj->integer.value;
611 611
612 if ((cx.type != ACPI_STATE_C1) && 612 if ((cx.type != ACPI_STATE_C1) &&
613 (reg->space_id != ACPI_ADR_SPACE_SYSTEM_IO)) 613 (reg->space_id != ACPI_ADR_SPACE_SYSTEM_IO))
614 continue; 614 continue;
615 615
616 if ((cx.type < ACPI_STATE_C1) || 616 if ((cx.type < ACPI_STATE_C1) ||
617 (cx.type > ACPI_STATE_C3)) 617 (cx.type > ACPI_STATE_C3))
618 continue; 618 continue;
619 619
620 obj = (union acpi_object *) &(element->package.elements[2]); 620 obj = (union acpi_object *) &(element->package.elements[2]);
621 if (obj->type != ACPI_TYPE_INTEGER) 621 if (obj->type != ACPI_TYPE_INTEGER)
622 continue; 622 continue;
623 623
624 cx.latency = obj->integer.value; 624 cx.latency = obj->integer.value;
625 625
626 obj = (union acpi_object *) &(element->package.elements[3]); 626 obj = (union acpi_object *) &(element->package.elements[3]);
627 if (obj->type != ACPI_TYPE_INTEGER) 627 if (obj->type != ACPI_TYPE_INTEGER)
628 continue; 628 continue;
629 629
630 cx.power = obj->integer.value; 630 cx.power = obj->integer.value;
631 631
632 (pr->power.count)++; 632 (pr->power.count)++;
633 memcpy(&(pr->power.states[pr->power.count]), &cx, sizeof(cx)); 633 memcpy(&(pr->power.states[pr->power.count]), &cx, sizeof(cx));
634 } 634 }
635 635
636 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found %d power states\n", pr->power.count)); 636 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found %d power states\n", pr->power.count));
637 637
638 /* Validate number of power states discovered */ 638 /* Validate number of power states discovered */
639 if (pr->power.count < 2) 639 if (pr->power.count < 2)
640 status = -ENODEV; 640 status = -ENODEV;
641 641
642 end: 642 end:
643 acpi_os_free(buffer.pointer); 643 acpi_os_free(buffer.pointer);
644 644
645 return_VALUE(status); 645 return_VALUE(status);
646 } 646 }
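
For reference, the _CST object walked above is a package of the form { count, { register, type, latency, power }, ... }: element 0 of each sub-package is a buffer describing the entry method (address space and address), element 1 the C-state type, element 2 the worst-case latency in microseconds, and element 3 the average power. A compact userspace sketch of that walk over a simplified in-memory representation follows; the field names and sample values are illustrative, not ACPICA's union acpi_object.

/* Simplified stand-in for the _CST entries parsed above. */
#include <stdio.h>

struct cst_entry {
	int fixed_hw;		/* entry method: 1 = fixed hardware, 0 = system I/O */
	unsigned int address;	/* P_LVLx I/O port for system-I/O entries */
	int type;		/* 1 = C1, 2 = C2, 3 = C3 */
	unsigned int latency;	/* worst-case exit latency, microseconds */
	unsigned int power;	/* average power consumption, milliwatts */
};

static int parse_cst(const struct cst_entry *e, int count, int max_states)
{
	int i, found = 0;

	if (count > max_states)
		count = max_states;	/* mirrors the "limiting to max" warning */

	for (i = 0; i < count; i++) {
		if (e[i].type < 1 || e[i].type > 3)
			continue;	/* only C1..C3 are accepted */
		if (e[i].type != 1 && e[i].fixed_hw)
			continue;	/* C2/C3 must use a system-I/O register */
		printf("C%d: address=0x%x latency=%uus power=%umW\n",
		       e[i].type, e[i].fixed_hw ? 0 : e[i].address,
		       e[i].latency, e[i].power);
		found++;
	}
	return found;			/* the caller rejects the table if < 2 */
}

int main(void)
{
	const struct cst_entry table[] = {
		{ 1, 0x000, 1,  1, 1000 },	/* C1 via fixed hardware */
		{ 0, 0x514, 2, 20,  500 },	/* C2 via a P_LVL2 read */
		{ 0, 0x515, 3, 85,  250 },	/* C3 via a P_LVL3 read */
	};

	return parse_cst(table, 3, 8) < 2;
}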
647 647
648 648
649 static void acpi_processor_power_verify_c2(struct acpi_processor_cx *cx) 649 static void acpi_processor_power_verify_c2(struct acpi_processor_cx *cx)
650 { 650 {
651 ACPI_FUNCTION_TRACE("acpi_processor_power_verify_c2"); 651 ACPI_FUNCTION_TRACE("acpi_processor_power_verify_c2");
652 652
653 if (!cx->address) 653 if (!cx->address)
654 return_VOID; 654 return_VOID;
655 655
656 /* 656 /*
657 * C2 latency must be less than or equal to 100 657 * C2 latency must be less than or equal to 100
658 * microseconds. 658 * microseconds.
659 */ 659 */
660 else if (cx->latency > ACPI_PROCESSOR_MAX_C2_LATENCY) { 660 else if (cx->latency > ACPI_PROCESSOR_MAX_C2_LATENCY) {
661 ACPI_DEBUG_PRINT((ACPI_DB_INFO, 661 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
662 "latency too large [%d]\n", 662 "latency too large [%d]\n",
663 cx->latency)); 663 cx->latency));
664 return_VOID; 664 return_VOID;
665 } 665 }
666 666
667 /* We're (currently) only supporting C2 on UP */ 667 /* We're (currently) only supporting C2 on UP */
668 else if (errata.smp) { 668 else if (errata.smp) {
669 ACPI_DEBUG_PRINT((ACPI_DB_INFO, 669 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
670 "C2 not supported in SMP mode\n")); 670 "C2 not supported in SMP mode\n"));
671 return_VOID; 671 return_VOID;
672 } 672 }
673 673
674 /* 674 /*
675 * Otherwise we've met all of our C2 requirements. 675 * Otherwise we've met all of our C2 requirements.
676 * Normalize the C2 latency to expedite policy 676 * Normalize the C2 latency to expedite policy
677 */ 677 */
678 cx->valid = 1; 678 cx->valid = 1;
679 cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency); 679 cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency);
680 680
681 return_VOID; 681 return_VOID;
682 } 682 }
683 683
684 684
685 static void acpi_processor_power_verify_c3( 685 static void acpi_processor_power_verify_c3(
686 struct acpi_processor *pr, 686 struct acpi_processor *pr,
687 struct acpi_processor_cx *cx) 687 struct acpi_processor_cx *cx)
688 { 688 {
689 ACPI_FUNCTION_TRACE("acpi_processor_power_verify_c3"); 689 ACPI_FUNCTION_TRACE("acpi_processor_power_verify_c3");
690 690
691 if (!cx->address) 691 if (!cx->address)
692 return_VOID; 692 return_VOID;
693 693
694 /* 694 /*
695 * C3 latency must be less than or equal to 1000 695 * C3 latency must be less than or equal to 1000
696 * microseconds. 696 * microseconds.
697 */ 697 */
698 else if (cx->latency > ACPI_PROCESSOR_MAX_C3_LATENCY) { 698 else if (cx->latency > ACPI_PROCESSOR_MAX_C3_LATENCY) {
699 ACPI_DEBUG_PRINT((ACPI_DB_INFO, 699 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
700 "latency too large [%d]\n", 700 "latency too large [%d]\n",
701 cx->latency)); 701 cx->latency));
702 return_VOID; 702 return_VOID;
703 } 703 }
704 704
705 /* bus mastering control is necessary */ 705 /* bus mastering control is necessary */
706 else if (!pr->flags.bm_control) { 706 else if (!pr->flags.bm_control) {
707 ACPI_DEBUG_PRINT((ACPI_DB_INFO, 707 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
708 "C3 support requires bus mastering control\n")); 708 "C3 support requires bus mastering control\n"));
709 return_VOID; 709 return_VOID;
710 } 710 }
711 711
712 /* We're (currently) only supporting C3 on UP */ 712 /* We're (currently) only supporting C3 on UP */
713 else if (errata.smp) { 713 else if (errata.smp) {
714 ACPI_DEBUG_PRINT((ACPI_DB_INFO, 714 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
715 "C3 not supported in SMP mode\n")); 715 "C3 not supported in SMP mode\n"));
716 return_VOID; 716 return_VOID;
717 } 717 }
718 718
719 /* 719 /*
720 * PIIX4 Erratum #18: We don't support C3 when Type-F (fast) 720 * PIIX4 Erratum #18: We don't support C3 when Type-F (fast)
721 * DMA transfers are used by any ISA device to avoid livelock. 721 * DMA transfers are used by any ISA device to avoid livelock.
722 * Note that we could disable Type-F DMA (as recommended by 722 * Note that we could disable Type-F DMA (as recommended by
723 * the erratum), but this is known to disrupt certain ISA 723 * the erratum), but this is known to disrupt certain ISA
724 * devices thus we take the conservative approach. 724 * devices thus we take the conservative approach.
725 */ 725 */
726 else if (errata.piix4.fdma) { 726 else if (errata.piix4.fdma) {
727 ACPI_DEBUG_PRINT((ACPI_DB_INFO, 727 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
728 "C3 not supported on PIIX4 with Type-F DMA\n")); 728 "C3 not supported on PIIX4 with Type-F DMA\n"));
729 return_VOID; 729 return_VOID;
730 } 730 }
731 731
732 /* 732 /*
733 * Otherwise we've met all of our C3 requirements. 733 * Otherwise we've met all of our C3 requirements.
734 * Normalize the C3 latency to expedite policy. Enable 734 * Normalize the C3 latency to expedite policy. Enable
735 * checking of bus mastering status (bm_check) so we can 735 * checking of bus mastering status (bm_check) so we can
736 * use this in our C3 policy 736 * use this in our C3 policy
737 */ 737 */
738 cx->valid = 1; 738 cx->valid = 1;
739 cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency); 739 cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency);
740 pr->flags.bm_check = 1; 740 pr->flags.bm_check = 1;
741 741
742 return_VOID; 742 return_VOID;
743 } 743 }
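
Both verify paths above normalize the reported latency from microseconds into PM-timer ticks (cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency)) so the idle loop can compare it directly against the measured sleep_ticks. A sketch of that conversion, assuming the standard 3.579545 MHz ACPI power-management timer; the macro name and rounding here are illustrative, not copied from the driver.

/* Microseconds -> ACPI PM-timer ticks (sketch only). */
#include <stdio.h>

#define PM_TIMER_HZ	3579545UL	/* assumed standard ACPI PM timer rate */

static unsigned long us_to_pm_ticks(unsigned long us)
{
	return (us * PM_TIMER_HZ) / 1000000UL;
}

int main(void)
{
	/* The 100 us C2 cap is ~357 ticks; the 1000 us C3 cap is ~3579 ticks. */
	printf("C2 cap: %lu ticks\n", us_to_pm_ticks(100));
	printf("C3 cap: %lu ticks\n", us_to_pm_ticks(1000));
	return 0;
}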
744 744
745 745
746 static int acpi_processor_power_verify(struct acpi_processor *pr) 746 static int acpi_processor_power_verify(struct acpi_processor *pr)
747 { 747 {
748 unsigned int i; 748 unsigned int i;
749 unsigned int working = 0; 749 unsigned int working = 0;
750 750
751 for (i=1; i < ACPI_PROCESSOR_MAX_POWER; i++) { 751 for (i=1; i < ACPI_PROCESSOR_MAX_POWER; i++) {
752 struct acpi_processor_cx *cx = &pr->power.states[i]; 752 struct acpi_processor_cx *cx = &pr->power.states[i];
753 753
754 switch (cx->type) { 754 switch (cx->type) {
755 case ACPI_STATE_C1: 755 case ACPI_STATE_C1:
756 cx->valid = 1; 756 cx->valid = 1;
757 break; 757 break;
758 758
759 case ACPI_STATE_C2: 759 case ACPI_STATE_C2:
760 acpi_processor_power_verify_c2(cx); 760 acpi_processor_power_verify_c2(cx);
761 break; 761 break;
762 762
763 case ACPI_STATE_C3: 763 case ACPI_STATE_C3:
764 acpi_processor_power_verify_c3(pr, cx); 764 acpi_processor_power_verify_c3(pr, cx);
765 break; 765 break;
766 } 766 }
767 767
768 if (cx->valid) 768 if (cx->valid)
769 working++; 769 working++;
770 } 770 }
771 771
772 return (working); 772 return (working);
773 } 773 }
774 774
775 static int acpi_processor_get_power_info ( 775 static int acpi_processor_get_power_info (
776 struct acpi_processor *pr) 776 struct acpi_processor *pr)
777 { 777 {
778 unsigned int i; 778 unsigned int i;
779 int result; 779 int result;
780 780
781 ACPI_FUNCTION_TRACE("acpi_processor_get_power_info"); 781 ACPI_FUNCTION_TRACE("acpi_processor_get_power_info");
782 782
783 /* NOTE: the idle thread may not be running while calling 783 /* NOTE: the idle thread may not be running while calling
784 * this function */ 784 * this function */
785 785
786 result = acpi_processor_get_power_info_cst(pr); 786 result = acpi_processor_get_power_info_cst(pr);
787 if ((result) || (acpi_processor_power_verify(pr) < 2)) { 787 if ((result) || (acpi_processor_power_verify(pr) < 2)) {
788 result = acpi_processor_get_power_info_fadt(pr); 788 result = acpi_processor_get_power_info_fadt(pr);
789 if (result) 789 if (result)
790 return_VALUE(result); 790 return_VALUE(result);
791 791
792 if (acpi_processor_power_verify(pr) < 2) 792 if (acpi_processor_power_verify(pr) < 2)
793 return_VALUE(-ENODEV); 793 return_VALUE(-ENODEV);
794 } 794 }
795 795
796 /* 796 /*
797 * Set Default Policy 797 * Set Default Policy
798 * ------------------ 798 * ------------------
799 * Now that we know which states are supported, set the default 799 * Now that we know which states are supported, set the default
800 * policy. Note that this policy can be changed dynamically 800 * policy. Note that this policy can be changed dynamically
801 * (e.g. encourage deeper sleeps to conserve battery life when 801 * (e.g. encourage deeper sleeps to conserve battery life when
802 * not on AC). 802 * not on AC).
803 */ 803 */
804 result = acpi_processor_set_power_policy(pr); 804 result = acpi_processor_set_power_policy(pr);
805 if (result) 805 if (result)
806 return_VALUE(result); 806 return_VALUE(result);
807 807
808 /* 808 /*
809 * if one state of type C2 or C3 is available, mark this 809 * if one state of type C2 or C3 is available, mark this
810 * CPU as being "idle manageable" 810 * CPU as being "idle manageable"
811 */ 811 */
812 for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) { 812 for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) {
813 if (pr->power.states[i].valid) 813 if (pr->power.states[i].valid)
814 pr->power.count = i; 814 pr->power.count = i;
815 if ((pr->power.states[i].valid) && 815 if ((pr->power.states[i].valid) &&
816 (pr->power.states[i].type >= ACPI_STATE_C2)) 816 (pr->power.states[i].type >= ACPI_STATE_C2))
817 pr->flags.power = 1; 817 pr->flags.power = 1;
818 } 818 }
819 819
820 return_VALUE(0); 820 return_VALUE(0);
821 } 821 }
822 822
823 int acpi_processor_cst_has_changed (struct acpi_processor *pr) 823 int acpi_processor_cst_has_changed (struct acpi_processor *pr)
824 { 824 {
825 int result = 0; 825 int result = 0;
826 826
827 ACPI_FUNCTION_TRACE("acpi_processor_cst_has_changed"); 827 ACPI_FUNCTION_TRACE("acpi_processor_cst_has_changed");
828 828
829 if (!pr) 829 if (!pr)
830 return_VALUE(-EINVAL); 830 return_VALUE(-EINVAL);
831 831
832 if (errata.smp || nocst) { 832 if (errata.smp || nocst) {
833 return_VALUE(-ENODEV); 833 return_VALUE(-ENODEV);
834 } 834 }
835 835
836 if (!pr->flags.power_setup_done) 836 if (!pr->flags.power_setup_done)
837 return_VALUE(-ENODEV); 837 return_VALUE(-ENODEV);
838 838
839 /* Fall back to the default idle loop */ 839 /* Fall back to the default idle loop */
840 pm_idle = pm_idle_save; 840 pm_idle = pm_idle_save;
841 synchronize_sched(); /* Relies on interrupts forcing exit from idle. */ 841 synchronize_sched(); /* Relies on interrupts forcing exit from idle. */
842 842
843 pr->flags.power = 0; 843 pr->flags.power = 0;
844 result = acpi_processor_get_power_info(pr); 844 result = acpi_processor_get_power_info(pr);
845 if ((pr->flags.power == 1) && (pr->flags.power_setup_done)) 845 if ((pr->flags.power == 1) && (pr->flags.power_setup_done))
846 pm_idle = acpi_processor_idle; 846 pm_idle = acpi_processor_idle;
847 847
848 return_VALUE(result); 848 return_VALUE(result);
849 } 849 }
850 850
851 /* proc interface */ 851 /* proc interface */
852 852
853 static int acpi_processor_power_seq_show(struct seq_file *seq, void *offset) 853 static int acpi_processor_power_seq_show(struct seq_file *seq, void *offset)
854 { 854 {
855 struct acpi_processor *pr = (struct acpi_processor *)seq->private; 855 struct acpi_processor *pr = (struct acpi_processor *)seq->private;
856 unsigned int i; 856 unsigned int i;
857 857
858 ACPI_FUNCTION_TRACE("acpi_processor_power_seq_show"); 858 ACPI_FUNCTION_TRACE("acpi_processor_power_seq_show");
859 859
860 if (!pr) 860 if (!pr)
861 goto end; 861 goto end;
862 862
863 seq_printf(seq, "active state: C%zd\n" 863 seq_printf(seq, "active state: C%zd\n"
864 "max_cstate: C%d\n" 864 "max_cstate: C%d\n"
865 "bus master activity: %08x\n", 865 "bus master activity: %08x\n",
866 pr->power.state ? pr->power.state - pr->power.states : 0, 866 pr->power.state ? pr->power.state - pr->power.states : 0,
867 max_cstate, 867 max_cstate,
868 (unsigned)pr->power.bm_activity); 868 (unsigned)pr->power.bm_activity);
869 869
870 seq_puts(seq, "states:\n"); 870 seq_puts(seq, "states:\n");
871 871
872 for (i = 1; i <= pr->power.count; i++) { 872 for (i = 1; i <= pr->power.count; i++) {
873 seq_printf(seq, " %cC%d: ", 873 seq_printf(seq, " %cC%d: ",
874 (&pr->power.states[i] == pr->power.state?'*':' '), i); 874 (&pr->power.states[i] == pr->power.state?'*':' '), i);
875 875
876 if (!pr->power.states[i].valid) { 876 if (!pr->power.states[i].valid) {
877 seq_puts(seq, "<not supported>\n"); 877 seq_puts(seq, "<not supported>\n");
878 continue; 878 continue;
879 } 879 }
880 880
881 switch (pr->power.states[i].type) { 881 switch (pr->power.states[i].type) {
882 case ACPI_STATE_C1: 882 case ACPI_STATE_C1:
883 seq_printf(seq, "type[C1] "); 883 seq_printf(seq, "type[C1] ");
884 break; 884 break;
885 case ACPI_STATE_C2: 885 case ACPI_STATE_C2:
886 seq_printf(seq, "type[C2] "); 886 seq_printf(seq, "type[C2] ");
887 break; 887 break;
888 case ACPI_STATE_C3: 888 case ACPI_STATE_C3:
889 seq_printf(seq, "type[C3] "); 889 seq_printf(seq, "type[C3] ");
890 break; 890 break;
891 default: 891 default:
892 seq_printf(seq, "type[--] "); 892 seq_printf(seq, "type[--] ");
893 break; 893 break;
894 } 894 }
895 895
896 if (pr->power.states[i].promotion.state) 896 if (pr->power.states[i].promotion.state)
897 seq_printf(seq, "promotion[C%zd] ", 897 seq_printf(seq, "promotion[C%zd] ",
898 (pr->power.states[i].promotion.state - 898 (pr->power.states[i].promotion.state -
899 pr->power.states)); 899 pr->power.states));
900 else 900 else
901 seq_puts(seq, "promotion[--] "); 901 seq_puts(seq, "promotion[--] ");
902 902
903 if (pr->power.states[i].demotion.state) 903 if (pr->power.states[i].demotion.state)
904 seq_printf(seq, "demotion[C%zd] ", 904 seq_printf(seq, "demotion[C%zd] ",
905 (pr->power.states[i].demotion.state - 905 (pr->power.states[i].demotion.state -
906 pr->power.states)); 906 pr->power.states));
907 else 907 else
908 seq_puts(seq, "demotion[--] "); 908 seq_puts(seq, "demotion[--] ");
909 909
910 seq_printf(seq, "latency[%03d] usage[%08d]\n", 910 seq_printf(seq, "latency[%03d] usage[%08d]\n",
911 pr->power.states[i].latency, 911 pr->power.states[i].latency,
912 pr->power.states[i].usage); 912 pr->power.states[i].usage);
913 } 913 }
914 914
915 end: 915 end:
916 return_VALUE(0); 916 return_VALUE(0);
917 } 917 }
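
With C1..C3 all accepted, the seq_file handler above emits output along the following lines; every value is illustrative, only the field layout follows the format strings in the code.

active state: C2
max_cstate: C8
bus master activity: 00000000
states:
  C1: type[C1] promotion[C2] demotion[--] latency[000] usage[00000412]
 *C2: type[C2] promotion[C3] demotion[C1] latency[020] usage[00001268]
  C3: type[C3] promotion[--] demotion[C2] latency[085] usage[00000039]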
918 918
919 static int acpi_processor_power_open_fs(struct inode *inode, struct file *file) 919 static int acpi_processor_power_open_fs(struct inode *inode, struct file *file)
920 { 920 {
921 return single_open(file, acpi_processor_power_seq_show, 921 return single_open(file, acpi_processor_power_seq_show,
922 PDE(inode)->data); 922 PDE(inode)->data);
923 } 923 }
924 924
925 static struct file_operations acpi_processor_power_fops = { 925 static struct file_operations acpi_processor_power_fops = {
926 .open = acpi_processor_power_open_fs, 926 .open = acpi_processor_power_open_fs,
927 .read = seq_read, 927 .read = seq_read,
928 .llseek = seq_lseek, 928 .llseek = seq_lseek,
929 .release = single_release, 929 .release = single_release,
930 }; 930 };
931 931
932 932
933 int acpi_processor_power_init(struct acpi_processor *pr, struct acpi_device *device) 933 int acpi_processor_power_init(struct acpi_processor *pr, struct acpi_device *device)
934 { 934 {
935 acpi_status status = 0; 935 acpi_status status = 0;
936 static int first_run = 0; 936 static int first_run = 0;
937 struct proc_dir_entry *entry = NULL; 937 struct proc_dir_entry *entry = NULL;
938 unsigned int i; 938 unsigned int i;
939 939
940 ACPI_FUNCTION_TRACE("acpi_processor_power_init"); 940 ACPI_FUNCTION_TRACE("acpi_processor_power_init");
941 941
942 if (!first_run) { 942 if (!first_run) {
943 dmi_check_system(processor_power_dmi_table); 943 dmi_check_system(processor_power_dmi_table);
944 if (max_cstate < ACPI_C_STATES_MAX) 944 if (max_cstate < ACPI_C_STATES_MAX)
945 printk(KERN_NOTICE "ACPI: processor limited to max C-state %d\n", max_cstate); 945 printk(KERN_NOTICE "ACPI: processor limited to max C-state %d\n", max_cstate);
946 first_run++; 946 first_run++;
947 } 947 }
948 948
949 if (!errata.smp && (pr->id == 0) && acpi_fadt.cst_cnt && !nocst) { 949 if (!errata.smp && (pr->id == 0) && acpi_fadt.cst_cnt && !nocst) {
950 status = acpi_os_write_port(acpi_fadt.smi_cmd, acpi_fadt.cst_cnt, 8); 950 status = acpi_os_write_port(acpi_fadt.smi_cmd, acpi_fadt.cst_cnt, 8);
951 if (ACPI_FAILURE(status)) { 951 if (ACPI_FAILURE(status)) {
952 ACPI_DEBUG_PRINT((ACPI_DB_ERROR, 952 ACPI_DEBUG_PRINT((ACPI_DB_ERROR,
953 "Notifying BIOS of _CST ability failed\n")); 953 "Notifying BIOS of _CST ability failed\n"));
954 } 954 }
955 } 955 }
956 956
957 acpi_processor_get_power_info(pr); 957 acpi_processor_get_power_info(pr);
958 958
959 /* 959 /*
960 * Install the idle handler if processor power management is supported. 960 * Install the idle handler if processor power management is supported.
961 * Note that the previously set idle handler will be used on 961 * Note that the previously set idle handler will be used on
962 * platforms that only support C1. 962 * platforms that only support C1.
963 */ 963 */
964 if ((pr->flags.power) && (!boot_option_idle_override)) { 964 if ((pr->flags.power) && (!boot_option_idle_override)) {
965 printk(KERN_INFO PREFIX "CPU%d (power states:", pr->id); 965 printk(KERN_INFO PREFIX "CPU%d (power states:", pr->id);
966 for (i = 1; i <= pr->power.count; i++) 966 for (i = 1; i <= pr->power.count; i++)
967 if (pr->power.states[i].valid) 967 if (pr->power.states[i].valid)
968 printk(" C%d[C%d]", i, pr->power.states[i].type); 968 printk(" C%d[C%d]", i, pr->power.states[i].type);
969 printk(")\n"); 969 printk(")\n");
970 970
971 if (pr->id == 0) { 971 if (pr->id == 0) {
972 pm_idle_save = pm_idle; 972 pm_idle_save = pm_idle;
973 pm_idle = acpi_processor_idle; 973 pm_idle = acpi_processor_idle;
974 } 974 }
975 } 975 }
976 976
977 /* 'power' [R] */ 977 /* 'power' [R] */
978 entry = create_proc_entry(ACPI_PROCESSOR_FILE_POWER, 978 entry = create_proc_entry(ACPI_PROCESSOR_FILE_POWER,
979 S_IRUGO, acpi_device_dir(device)); 979 S_IRUGO, acpi_device_dir(device));
980 if (!entry) 980 if (!entry)
981 ACPI_DEBUG_PRINT((ACPI_DB_ERROR, 981 ACPI_DEBUG_PRINT((ACPI_DB_ERROR,
982 "Unable to create '%s' fs entry\n", 982 "Unable to create '%s' fs entry\n",
983 ACPI_PROCESSOR_FILE_POWER)); 983 ACPI_PROCESSOR_FILE_POWER));
984 else { 984 else {
985 entry->proc_fops = &acpi_processor_power_fops; 985 entry->proc_fops = &acpi_processor_power_fops;
986 entry->data = acpi_driver_data(device); 986 entry->data = acpi_driver_data(device);
987 entry->owner = THIS_MODULE; 987 entry->owner = THIS_MODULE;
988 } 988 }
989 989
990 pr->flags.power_setup_done = 1; 990 pr->flags.power_setup_done = 1;
991 991
992 return_VALUE(0); 992 return_VALUE(0);
993 } 993 }
994 994
995 int acpi_processor_power_exit(struct acpi_processor *pr, struct acpi_device *device) 995 int acpi_processor_power_exit(struct acpi_processor *pr, struct acpi_device *device)
996 { 996 {
997 ACPI_FUNCTION_TRACE("acpi_processor_power_exit"); 997 ACPI_FUNCTION_TRACE("acpi_processor_power_exit");
998 998
999 pr->flags.power_setup_done = 0; 999 pr->flags.power_setup_done = 0;
1000 1000
1001 if (acpi_device_dir(device)) 1001 if (acpi_device_dir(device))
1002 remove_proc_entry(ACPI_PROCESSOR_FILE_POWER,acpi_device_dir(device)); 1002 remove_proc_entry(ACPI_PROCESSOR_FILE_POWER,acpi_device_dir(device));
1003 1003
1004 /* Unregister the idle handler when processor #0 is removed. */ 1004 /* Unregister the idle handler when processor #0 is removed. */
1005 if (pr->id == 0) { 1005 if (pr->id == 0) {
1006 pm_idle = pm_idle_save; 1006 pm_idle = pm_idle_save;
1007 1007
1008 /* 1008 /*
1009 * We are about to unload the current idle thread pm callback 1009 * We are about to unload the current idle thread pm callback
1010 * (pm_idle). Wait for all processors to update cached/local 1010 * (pm_idle). Wait for all processors to update cached/local
1011 * copies of pm_idle before proceeding. 1011 * copies of pm_idle before proceeding.
1012 */ 1012 */
1013 cpu_idle_wait(); 1013 cpu_idle_wait();
1014 } 1014 }
1015 1015
1016 return_VALUE(0); 1016 return_VALUE(0);
1017 } 1017 }
1018 1018
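
The hunks in this section index per-CPU arrays with raw_smp_processor_id() where _smp_processor_id() was used before (processors[] above, cpu_data[] in gameport_measure_speed() below). As a minimal kernel-style illustration of the same pattern, hypothetical and not part of this patch: the raw accessor is the one to use when the caller can tolerate being migrated to another CPU right after the read, so the value is only a hint and no preemption disabling is wanted.

#include <linux/smp.h>		/* raw_smp_processor_id() */
#include <linux/threads.h>	/* NR_CPUS */

static unsigned long hint_counter[NR_CPUS];	/* illustrative statistic */

static void note_event_hint(void)
{
	/*
	 * A stale CPU number is acceptable here: the counter is only a
	 * heuristic, so the non-debug accessor is used and the access
	 * is done without disabling preemption.
	 */
	hint_counter[raw_smp_processor_id()]++;
}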
drivers/input/gameport/gameport.c
1 /* 1 /*
2 * Generic gameport layer 2 * Generic gameport layer
3 * 3 *
4 * Copyright (c) 1999-2002 Vojtech Pavlik 4 * Copyright (c) 1999-2002 Vojtech Pavlik
5 * Copyright (c) 2005 Dmitry Torokhov 5 * Copyright (c) 2005 Dmitry Torokhov
6 */ 6 */
7 7
8 /* 8 /*
9 * This program is free software; you can redistribute it and/or modify it 9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of the GNU General Public License version 2 as published by 10 * under the terms of the GNU General Public License version 2 as published by
11 * the Free Software Foundation. 11 * the Free Software Foundation.
12 */ 12 */
13 13
14 #include <linux/stddef.h> 14 #include <linux/stddef.h>
15 #include <linux/module.h> 15 #include <linux/module.h>
16 #include <linux/ioport.h> 16 #include <linux/ioport.h>
17 #include <linux/init.h> 17 #include <linux/init.h>
18 #include <linux/gameport.h> 18 #include <linux/gameport.h>
19 #include <linux/wait.h> 19 #include <linux/wait.h>
20 #include <linux/completion.h> 20 #include <linux/completion.h>
21 #include <linux/sched.h> 21 #include <linux/sched.h>
22 #include <linux/smp_lock.h> 22 #include <linux/smp_lock.h>
23 #include <linux/slab.h> 23 #include <linux/slab.h>
24 #include <linux/delay.h> 24 #include <linux/delay.h>
25 25
26 /*#include <asm/io.h>*/ 26 /*#include <asm/io.h>*/
27 27
28 MODULE_AUTHOR("Vojtech Pavlik <vojtech@ucw.cz>"); 28 MODULE_AUTHOR("Vojtech Pavlik <vojtech@ucw.cz>");
29 MODULE_DESCRIPTION("Generic gameport layer"); 29 MODULE_DESCRIPTION("Generic gameport layer");
30 MODULE_LICENSE("GPL"); 30 MODULE_LICENSE("GPL");
31 31
32 EXPORT_SYMBOL(__gameport_register_port); 32 EXPORT_SYMBOL(__gameport_register_port);
33 EXPORT_SYMBOL(gameport_unregister_port); 33 EXPORT_SYMBOL(gameport_unregister_port);
34 EXPORT_SYMBOL(__gameport_register_driver); 34 EXPORT_SYMBOL(__gameport_register_driver);
35 EXPORT_SYMBOL(gameport_unregister_driver); 35 EXPORT_SYMBOL(gameport_unregister_driver);
36 EXPORT_SYMBOL(gameport_open); 36 EXPORT_SYMBOL(gameport_open);
37 EXPORT_SYMBOL(gameport_close); 37 EXPORT_SYMBOL(gameport_close);
38 EXPORT_SYMBOL(gameport_rescan); 38 EXPORT_SYMBOL(gameport_rescan);
39 EXPORT_SYMBOL(gameport_cooked_read); 39 EXPORT_SYMBOL(gameport_cooked_read);
40 EXPORT_SYMBOL(gameport_set_name); 40 EXPORT_SYMBOL(gameport_set_name);
41 EXPORT_SYMBOL(gameport_set_phys); 41 EXPORT_SYMBOL(gameport_set_phys);
42 EXPORT_SYMBOL(gameport_start_polling); 42 EXPORT_SYMBOL(gameport_start_polling);
43 EXPORT_SYMBOL(gameport_stop_polling); 43 EXPORT_SYMBOL(gameport_stop_polling);
44 44
45 /* 45 /*
46 * gameport_sem protects entire gameport subsystem and is taken 46 * gameport_sem protects entire gameport subsystem and is taken
47 * every time a gameport port or driver is registered or unregistered. 47 * every time a gameport port or driver is registered or unregistered.
48 */ 48 */
49 static DECLARE_MUTEX(gameport_sem); 49 static DECLARE_MUTEX(gameport_sem);
50 50
51 static LIST_HEAD(gameport_list); 51 static LIST_HEAD(gameport_list);
52 52
53 static struct bus_type gameport_bus = { 53 static struct bus_type gameport_bus = {
54 .name = "gameport", 54 .name = "gameport",
55 }; 55 };
56 56
57 static void gameport_add_port(struct gameport *gameport); 57 static void gameport_add_port(struct gameport *gameport);
58 static void gameport_destroy_port(struct gameport *gameport); 58 static void gameport_destroy_port(struct gameport *gameport);
59 static void gameport_reconnect_port(struct gameport *gameport); 59 static void gameport_reconnect_port(struct gameport *gameport);
60 static void gameport_disconnect_port(struct gameport *gameport); 60 static void gameport_disconnect_port(struct gameport *gameport);
61 61
62 #if defined(__i386__) 62 #if defined(__i386__)
63 63
64 #define DELTA(x,y) ((y)-(x)+((y)<(x)?1193182/HZ:0)) 64 #define DELTA(x,y) ((y)-(x)+((y)<(x)?1193182/HZ:0))
65 #define GET_TIME(x) do { x = get_time_pit(); } while (0) 65 #define GET_TIME(x) do { x = get_time_pit(); } while (0)
66 66
67 static unsigned int get_time_pit(void) 67 static unsigned int get_time_pit(void)
68 { 68 {
69 extern spinlock_t i8253_lock; 69 extern spinlock_t i8253_lock;
70 unsigned long flags; 70 unsigned long flags;
71 unsigned int count; 71 unsigned int count;
72 72
73 spin_lock_irqsave(&i8253_lock, flags); 73 spin_lock_irqsave(&i8253_lock, flags);
74 outb_p(0x00, 0x43); 74 outb_p(0x00, 0x43);
75 count = inb_p(0x40); 75 count = inb_p(0x40);
76 count |= inb_p(0x40) << 8; 76 count |= inb_p(0x40) << 8;
77 spin_unlock_irqrestore(&i8253_lock, flags); 77 spin_unlock_irqrestore(&i8253_lock, flags);
78 78
79 return count; 79 return count;
80 } 80 }
81 81
82 #endif 82 #endif
83 83
84 84
85 85
86 /* 86 /*
87 * gameport_measure_speed() measures the gameport i/o speed. 87 * gameport_measure_speed() measures the gameport i/o speed.
88 */ 88 */
89 89
90 static int gameport_measure_speed(struct gameport *gameport) 90 static int gameport_measure_speed(struct gameport *gameport)
91 { 91 {
92 #if defined(__i386__) 92 #if defined(__i386__)
93 93
94 unsigned int i, t, t1, t2, t3, tx; 94 unsigned int i, t, t1, t2, t3, tx;
95 unsigned long flags; 95 unsigned long flags;
96 96
97 if (gameport_open(gameport, NULL, GAMEPORT_MODE_RAW)) 97 if (gameport_open(gameport, NULL, GAMEPORT_MODE_RAW))
98 return 0; 98 return 0;
99 99
100 tx = 1 << 30; 100 tx = 1 << 30;
101 101
102 for(i = 0; i < 50; i++) { 102 for(i = 0; i < 50; i++) {
103 local_irq_save(flags); 103 local_irq_save(flags);
104 GET_TIME(t1); 104 GET_TIME(t1);
105 for (t = 0; t < 50; t++) gameport_read(gameport); 105 for (t = 0; t < 50; t++) gameport_read(gameport);
106 GET_TIME(t2); 106 GET_TIME(t2);
107 GET_TIME(t3); 107 GET_TIME(t3);
108 local_irq_restore(flags); 108 local_irq_restore(flags);
109 udelay(i * 10); 109 udelay(i * 10);
110 if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t; 110 if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t;
111 } 111 }
112 112
113 gameport_close(gameport); 113 gameport_close(gameport);
114 return 59659 / (tx < 1 ? 1 : tx); 114 return 59659 / (tx < 1 ? 1 : tx);
115 115
116 #elif defined (__x86_64__) 116 #elif defined (__x86_64__)
117 117
118 unsigned int i, t; 118 unsigned int i, t;
119 unsigned long tx, t1, t2, flags; 119 unsigned long tx, t1, t2, flags;
120 120
121 if (gameport_open(gameport, NULL, GAMEPORT_MODE_RAW)) 121 if (gameport_open(gameport, NULL, GAMEPORT_MODE_RAW))
122 return 0; 122 return 0;
123 123
124 tx = 1 << 30; 124 tx = 1 << 30;
125 125
126 for(i = 0; i < 50; i++) { 126 for(i = 0; i < 50; i++) {
127 local_irq_save(flags); 127 local_irq_save(flags);
128 rdtscl(t1); 128 rdtscl(t1);
129 for (t = 0; t < 50; t++) gameport_read(gameport); 129 for (t = 0; t < 50; t++) gameport_read(gameport);
130 rdtscl(t2); 130 rdtscl(t2);
131 local_irq_restore(flags); 131 local_irq_restore(flags);
132 udelay(i * 10); 132 udelay(i * 10);
133 if (t2 - t1 < tx) tx = t2 - t1; 133 if (t2 - t1 < tx) tx = t2 - t1;
134 } 134 }
135 135
136 gameport_close(gameport); 136 gameport_close(gameport);
137 return (cpu_data[_smp_processor_id()].loops_per_jiffy * (unsigned long)HZ / (1000 / 50)) / (tx < 1 ? 1 : tx); 137 return (cpu_data[raw_smp_processor_id()].loops_per_jiffy * (unsigned long)HZ / (1000 / 50)) / (tx < 1 ? 1 : tx);
138 138
139 #else 139 #else
140 140
141 unsigned int j, t = 0; 141 unsigned int j, t = 0;
142 142
143 if (gameport_open(gameport, NULL, GAMEPORT_MODE_RAW)) 143 if (gameport_open(gameport, NULL, GAMEPORT_MODE_RAW))
144 return 0; 144 return 0;
145 145
146 j = jiffies; while (j == jiffies); 146 j = jiffies; while (j == jiffies);
147 j = jiffies; while (j == jiffies) { t++; gameport_read(gameport); } 147 j = jiffies; while (j == jiffies) { t++; gameport_read(gameport); }
148 148
149 gameport_close(gameport); 149 gameport_close(gameport);
150 return t * HZ / 1000; 150 return t * HZ / 1000;
151 151
152 #endif 152 #endif
153 } 153 }
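
On i386, GET_TIME() above latches and reads the i8253 PIT counter, which rolls over every 1193182/HZ ticks; DELTA() therefore adds one full period whenever the raw difference would go negative. A small userspace sketch of that wrap correction, with HZ fixed at 1000 purely for the example:

/* Wrap-aware difference of two samples of a counter with period PERIOD. */
#include <stdio.h>

#define HZ	1000			/* assumed for the sketch only */
#define PERIOD	(1193182 / HZ)		/* PIT ticks per timer interrupt */

static unsigned int delta(unsigned int x, unsigned int y)
{
	return y - x + (y < x ? PERIOD : 0);
}

int main(void)
{
	printf("%u\n", delta(100, 500));	/* no wrap: 400 */
	printf("%u\n", delta(1100, 50));	/* wrapped once: 143 */
	return 0;
}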
154 154
155 void gameport_start_polling(struct gameport *gameport) 155 void gameport_start_polling(struct gameport *gameport)
156 { 156 {
157 spin_lock(&gameport->timer_lock); 157 spin_lock(&gameport->timer_lock);
158 158
159 if (!gameport->poll_cnt++) { 159 if (!gameport->poll_cnt++) {
160 BUG_ON(!gameport->poll_handler); 160 BUG_ON(!gameport->poll_handler);
161 BUG_ON(!gameport->poll_interval); 161 BUG_ON(!gameport->poll_interval);
162 mod_timer(&gameport->poll_timer, jiffies + msecs_to_jiffies(gameport->poll_interval)); 162 mod_timer(&gameport->poll_timer, jiffies + msecs_to_jiffies(gameport->poll_interval));
163 } 163 }
164 164
165 spin_unlock(&gameport->timer_lock); 165 spin_unlock(&gameport->timer_lock);
166 } 166 }
167 167
168 void gameport_stop_polling(struct gameport *gameport) 168 void gameport_stop_polling(struct gameport *gameport)
169 { 169 {
170 spin_lock(&gameport->timer_lock); 170 spin_lock(&gameport->timer_lock);
171 171
172 if (!--gameport->poll_cnt) 172 if (!--gameport->poll_cnt)
173 del_timer(&gameport->poll_timer); 173 del_timer(&gameport->poll_timer);
174 174
175 spin_unlock(&gameport->timer_lock); 175 spin_unlock(&gameport->timer_lock);
176 } 176 }
177 177
178 static void gameport_run_poll_handler(unsigned long d) 178 static void gameport_run_poll_handler(unsigned long d)
179 { 179 {
180 struct gameport *gameport = (struct gameport *)d; 180 struct gameport *gameport = (struct gameport *)d;
181 181
182 gameport->poll_handler(gameport); 182 gameport->poll_handler(gameport);
183 if (gameport->poll_cnt) 183 if (gameport->poll_cnt)
184 mod_timer(&gameport->poll_timer, jiffies + msecs_to_jiffies(gameport->poll_interval)); 184 mod_timer(&gameport->poll_timer, jiffies + msecs_to_jiffies(gameport->poll_interval));
185 } 185 }
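
gameport_start_polling()/gameport_stop_polling() gate a self-rearming timer on a reference count: the timer is armed only on the 0->1 transition, deleted on the 1->0 transition, and the handler re-arms itself only while the count is non-zero. A userspace sketch of just that counting logic follows; the real code holds timer_lock around each transition and uses mod_timer()/del_timer(), for which a flag stands in here.

/* Refcount-gated periodic poll: arm on 0->1, cancel on 1->0. */
#include <stdio.h>

struct poller {
	int poll_cnt;
	int timer_armed;	/* stand-in for mod_timer()/del_timer() */
};

static void poll_start(struct poller *p)
{
	if (!p->poll_cnt++)
		p->timer_armed = 1;	/* first user arms the timer */
}

static void poll_stop(struct poller *p)
{
	if (!--p->poll_cnt)
		p->timer_armed = 0;	/* last user cancels it */
}

int main(void)
{
	struct poller p = { 0, 0 };

	poll_start(&p); poll_start(&p);	/* two users, armed once */
	poll_stop(&p);			/* one user left: stays armed */
	printf("armed=%d\n", p.timer_armed);
	poll_stop(&p);			/* last user gone: cancelled */
	printf("armed=%d\n", p.timer_armed);
	return 0;
}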
186 186
187 /* 187 /*
188 * Basic gameport -> driver core mappings 188 * Basic gameport -> driver core mappings
189 */ 189 */
190 190
191 static void gameport_bind_driver(struct gameport *gameport, struct gameport_driver *drv) 191 static void gameport_bind_driver(struct gameport *gameport, struct gameport_driver *drv)
192 { 192 {
193 down_write(&gameport_bus.subsys.rwsem); 193 down_write(&gameport_bus.subsys.rwsem);
194 194
195 gameport->dev.driver = &drv->driver; 195 gameport->dev.driver = &drv->driver;
196 if (drv->connect(gameport, drv)) { 196 if (drv->connect(gameport, drv)) {
197 gameport->dev.driver = NULL; 197 gameport->dev.driver = NULL;
198 goto out; 198 goto out;
199 } 199 }
200 device_bind_driver(&gameport->dev); 200 device_bind_driver(&gameport->dev);
201 out: 201 out:
202 up_write(&gameport_bus.subsys.rwsem); 202 up_write(&gameport_bus.subsys.rwsem);
203 } 203 }
204 204
205 static void gameport_release_driver(struct gameport *gameport) 205 static void gameport_release_driver(struct gameport *gameport)
206 { 206 {
207 down_write(&gameport_bus.subsys.rwsem); 207 down_write(&gameport_bus.subsys.rwsem);
208 device_release_driver(&gameport->dev); 208 device_release_driver(&gameport->dev);
209 up_write(&gameport_bus.subsys.rwsem); 209 up_write(&gameport_bus.subsys.rwsem);
210 } 210 }
211 211
212 static void gameport_find_driver(struct gameport *gameport) 212 static void gameport_find_driver(struct gameport *gameport)
213 { 213 {
214 down_write(&gameport_bus.subsys.rwsem); 214 down_write(&gameport_bus.subsys.rwsem);
215 device_attach(&gameport->dev); 215 device_attach(&gameport->dev);
216 up_write(&gameport_bus.subsys.rwsem); 216 up_write(&gameport_bus.subsys.rwsem);
217 } 217 }
218 218
219 219
220 /* 220 /*
221 * Gameport event processing. 221 * Gameport event processing.
222 */ 222 */
223 223
224 enum gameport_event_type { 224 enum gameport_event_type {
225 GAMEPORT_RESCAN, 225 GAMEPORT_RESCAN,
226 GAMEPORT_RECONNECT, 226 GAMEPORT_RECONNECT,
227 GAMEPORT_REGISTER_PORT, 227 GAMEPORT_REGISTER_PORT,
228 GAMEPORT_REGISTER_DRIVER, 228 GAMEPORT_REGISTER_DRIVER,
229 }; 229 };
230 230
231 struct gameport_event { 231 struct gameport_event {
232 enum gameport_event_type type; 232 enum gameport_event_type type;
233 void *object; 233 void *object;
234 struct module *owner; 234 struct module *owner;
235 struct list_head node; 235 struct list_head node;
236 }; 236 };
237 237
238 static DEFINE_SPINLOCK(gameport_event_lock); /* protects gameport_event_list */ 238 static DEFINE_SPINLOCK(gameport_event_lock); /* protects gameport_event_list */
239 static LIST_HEAD(gameport_event_list); 239 static LIST_HEAD(gameport_event_list);
240 static DECLARE_WAIT_QUEUE_HEAD(gameport_wait); 240 static DECLARE_WAIT_QUEUE_HEAD(gameport_wait);
241 static DECLARE_COMPLETION(gameport_exited); 241 static DECLARE_COMPLETION(gameport_exited);
242 static int gameport_pid; 242 static int gameport_pid;
243 243
244 static void gameport_queue_event(void *object, struct module *owner, 244 static void gameport_queue_event(void *object, struct module *owner,
245 enum gameport_event_type event_type) 245 enum gameport_event_type event_type)
246 { 246 {
247 unsigned long flags; 247 unsigned long flags;
248 struct gameport_event *event; 248 struct gameport_event *event;
249 249
250 spin_lock_irqsave(&gameport_event_lock, flags); 250 spin_lock_irqsave(&gameport_event_lock, flags);
251 251
252 /* 252 /*
253 * Scan the event list for other events on the same gameport port, 253 * Scan the event list for other events on the same gameport port,
254 * starting with the most recent one. If the event is the same we 254 * starting with the most recent one. If the event is the same we
255 * do not need to add a new one. If the event is of a different type 255 * do not need to add a new one. If the event is of a different type
256 * we need to add this event and should not look further because 256 * we need to add this event and should not look further because
257 * we need to preserve the sequence of distinct events. 257 * we need to preserve the sequence of distinct events.
258 */ 258 */
259 list_for_each_entry_reverse(event, &gameport_event_list, node) { 259 list_for_each_entry_reverse(event, &gameport_event_list, node) {
260 if (event->object == object) { 260 if (event->object == object) {
261 if (event->type == event_type) 261 if (event->type == event_type)
262 goto out; 262 goto out;
263 break; 263 break;
264 } 264 }
265 } 265 }
266 266
267 if ((event = kmalloc(sizeof(struct gameport_event), GFP_ATOMIC))) { 267 if ((event = kmalloc(sizeof(struct gameport_event), GFP_ATOMIC))) {
268 if (!try_module_get(owner)) { 268 if (!try_module_get(owner)) {
269 printk(KERN_WARNING "gameport: Can't get module reference, dropping event %d\n", event_type); 269 printk(KERN_WARNING "gameport: Can't get module reference, dropping event %d\n", event_type);
270 goto out; 270 goto out;
271 } 271 }
272 272
273 event->type = event_type; 273 event->type = event_type;
274 event->object = object; 274 event->object = object;
275 event->owner = owner; 275 event->owner = owner;
276 276
277 list_add_tail(&event->node, &gameport_event_list); 277 list_add_tail(&event->node, &gameport_event_list);
278 wake_up(&gameport_wait); 278 wake_up(&gameport_wait);
279 } else { 279 } else {
280 printk(KERN_ERR "gameport: Not enough memory to queue event %d\n", event_type); 280 printk(KERN_ERR "gameport: Not enough memory to queue event %d\n", event_type);
281 } 281 }
282 out: 282 out:
283 spin_unlock_irqrestore(&gameport_event_lock, flags); 283 spin_unlock_irqrestore(&gameport_event_lock, flags);
284 } 284 }
285 285
286 static void gameport_free_event(struct gameport_event *event) 286 static void gameport_free_event(struct gameport_event *event)
287 { 287 {
288 module_put(event->owner); 288 module_put(event->owner);
289 kfree(event); 289 kfree(event);
290 } 290 }
291 291
292 static void gameport_remove_duplicate_events(struct gameport_event *event) 292 static void gameport_remove_duplicate_events(struct gameport_event *event)
293 { 293 {
294 struct list_head *node, *next; 294 struct list_head *node, *next;
295 struct gameport_event *e; 295 struct gameport_event *e;
296 unsigned long flags; 296 unsigned long flags;
297 297
298 spin_lock_irqsave(&gameport_event_lock, flags); 298 spin_lock_irqsave(&gameport_event_lock, flags);
299 299
300 list_for_each_safe(node, next, &gameport_event_list) { 300 list_for_each_safe(node, next, &gameport_event_list) {
301 e = list_entry(node, struct gameport_event, node); 301 e = list_entry(node, struct gameport_event, node);
302 if (event->object == e->object) { 302 if (event->object == e->object) {
303 /* 303 /*
304 * If this event is of a different type we should not 304 * If this event is of a different type we should not
305 * look further - we only suppress duplicate events 305 * look further - we only suppress duplicate events
306 * that were sent back-to-back. 306 * that were sent back-to-back.
307 */ 307 */
308 if (event->type != e->type) 308 if (event->type != e->type)
309 break; 309 break;
310 310
311 list_del_init(node); 311 list_del_init(node);
312 gameport_free_event(e); 312 gameport_free_event(e);
313 } 313 }
314 } 314 }
315 315
316 spin_unlock_irqrestore(&gameport_event_lock, flags); 316 spin_unlock_irqrestore(&gameport_event_lock, flags);
317 } 317 }
318 318
319 319
320 static struct gameport_event *gameport_get_event(void) 320 static struct gameport_event *gameport_get_event(void)
321 { 321 {
322 struct gameport_event *event; 322 struct gameport_event *event;
323 struct list_head *node; 323 struct list_head *node;
324 unsigned long flags; 324 unsigned long flags;
325 325
326 spin_lock_irqsave(&gameport_event_lock, flags); 326 spin_lock_irqsave(&gameport_event_lock, flags);
327 327
328 if (list_empty(&gameport_event_list)) { 328 if (list_empty(&gameport_event_list)) {
329 spin_unlock_irqrestore(&gameport_event_lock, flags); 329 spin_unlock_irqrestore(&gameport_event_lock, flags);
330 return NULL; 330 return NULL;
331 } 331 }
332 332
333 node = gameport_event_list.next; 333 node = gameport_event_list.next;
334 event = list_entry(node, struct gameport_event, node); 334 event = list_entry(node, struct gameport_event, node);
335 list_del_init(node); 335 list_del_init(node);
336 336
337 spin_unlock_irqrestore(&gameport_event_lock, flags); 337 spin_unlock_irqrestore(&gameport_event_lock, flags);
338 338
339 return event; 339 return event;
340 } 340 }
341 341
342 static void gameport_handle_events(void) 342 static void gameport_handle_events(void)
343 { 343 {
344 struct gameport_event *event; 344 struct gameport_event *event;
345 struct gameport_driver *gameport_drv; 345 struct gameport_driver *gameport_drv;
346 346
347 down(&gameport_sem); 347 down(&gameport_sem);
348 348
349 while ((event = gameport_get_event())) { 349 while ((event = gameport_get_event())) {
350 350
351 switch (event->type) { 351 switch (event->type) {
352 case GAMEPORT_REGISTER_PORT: 352 case GAMEPORT_REGISTER_PORT:
353 gameport_add_port(event->object); 353 gameport_add_port(event->object);
354 break; 354 break;
355 355
356 case GAMEPORT_RECONNECT: 356 case GAMEPORT_RECONNECT:
357 gameport_reconnect_port(event->object); 357 gameport_reconnect_port(event->object);
358 break; 358 break;
359 359
360 case GAMEPORT_RESCAN: 360 case GAMEPORT_RESCAN:
361 gameport_disconnect_port(event->object); 361 gameport_disconnect_port(event->object);
362 gameport_find_driver(event->object); 362 gameport_find_driver(event->object);
363 break; 363 break;
364 364
365 case GAMEPORT_REGISTER_DRIVER: 365 case GAMEPORT_REGISTER_DRIVER:
366 gameport_drv = event->object; 366 gameport_drv = event->object;
367 driver_register(&gameport_drv->driver); 367 driver_register(&gameport_drv->driver);
368 break; 368 break;
369 369
370 default: 370 default:
371 break; 371 break;
372 } 372 }
373 373
374 gameport_remove_duplicate_events(event); 374 gameport_remove_duplicate_events(event);
375 gameport_free_event(event); 375 gameport_free_event(event);
376 } 376 }
377 377
378 up(&gameport_sem); 378 up(&gameport_sem);
379 } 379 }
380 380
381 /* 381 /*
382 * Remove all events that have been submitted for a given gameport port. 382 * Remove all events that have been submitted for a given gameport port.
383 */ 383 */
384 static void gameport_remove_pending_events(struct gameport *gameport) 384 static void gameport_remove_pending_events(struct gameport *gameport)
385 { 385 {
386 struct list_head *node, *next; 386 struct list_head *node, *next;
387 struct gameport_event *event; 387 struct gameport_event *event;
388 unsigned long flags; 388 unsigned long flags;
389 389
390 spin_lock_irqsave(&gameport_event_lock, flags); 390 spin_lock_irqsave(&gameport_event_lock, flags);
391 391
392 list_for_each_safe(node, next, &gameport_event_list) { 392 list_for_each_safe(node, next, &gameport_event_list) {
393 event = list_entry(node, struct gameport_event, node); 393 event = list_entry(node, struct gameport_event, node);
394 if (event->object == gameport) { 394 if (event->object == gameport) {
395 list_del_init(node); 395 list_del_init(node);
396 gameport_free_event(event); 396 gameport_free_event(event);
397 } 397 }
398 } 398 }
399 399
400 spin_unlock_irqrestore(&gameport_event_lock, flags); 400 spin_unlock_irqrestore(&gameport_event_lock, flags);
401 } 401 }
402 402
403 /* 403 /*
404 * Destroy child gameport port (if any) that has not been fully registered yet. 404 * Destroy child gameport port (if any) that has not been fully registered yet.
405 * 405 *
406 * Note that we rely on the fact that a port can have only one child and therefore 406 * Note that we rely on the fact that a port can have only one child and therefore
407 * only one child registration request can be pending. Additionally, children 407 * only one child registration request can be pending. Additionally, children
408 * are registered by driver's connect() handler so there can't be a grandchild 408 * are registered by driver's connect() handler so there can't be a grandchild
409 * pending registration together with a child. 409 * pending registration together with a child.
410 */ 410 */
411 static struct gameport *gameport_get_pending_child(struct gameport *parent) 411 static struct gameport *gameport_get_pending_child(struct gameport *parent)
412 { 412 {
413 struct gameport_event *event; 413 struct gameport_event *event;
414 struct gameport *gameport, *child = NULL; 414 struct gameport *gameport, *child = NULL;
415 unsigned long flags; 415 unsigned long flags;
416 416
417 spin_lock_irqsave(&gameport_event_lock, flags); 417 spin_lock_irqsave(&gameport_event_lock, flags);
418 418
419 list_for_each_entry(event, &gameport_event_list, node) { 419 list_for_each_entry(event, &gameport_event_list, node) {
420 if (event->type == GAMEPORT_REGISTER_PORT) { 420 if (event->type == GAMEPORT_REGISTER_PORT) {
421 gameport = event->object; 421 gameport = event->object;
422 if (gameport->parent == parent) { 422 if (gameport->parent == parent) {
423 child = gameport; 423 child = gameport;
424 break; 424 break;
425 } 425 }
426 } 426 }
427 } 427 }
428 428
429 spin_unlock_irqrestore(&gameport_event_lock, flags); 429 spin_unlock_irqrestore(&gameport_event_lock, flags);
430 return child; 430 return child;
431 } 431 }
432 432
433 static int gameport_thread(void *nothing) 433 static int gameport_thread(void *nothing)
434 { 434 {
435 lock_kernel(); 435 lock_kernel();
436 daemonize("kgameportd"); 436 daemonize("kgameportd");
437 allow_signal(SIGTERM); 437 allow_signal(SIGTERM);
438 438
439 do { 439 do {
440 gameport_handle_events(); 440 gameport_handle_events();
441 wait_event_interruptible(gameport_wait, !list_empty(&gameport_event_list)); 441 wait_event_interruptible(gameport_wait, !list_empty(&gameport_event_list));
442 try_to_freeze(PF_FREEZE); 442 try_to_freeze(PF_FREEZE);
443 } while (!signal_pending(current)); 443 } while (!signal_pending(current));
444 444
445 printk(KERN_DEBUG "gameport: kgameportd exiting\n"); 445 printk(KERN_DEBUG "gameport: kgameportd exiting\n");
446 446
447 unlock_kernel(); 447 unlock_kernel();
448 complete_and_exit(&gameport_exited, 0); 448 complete_and_exit(&gameport_exited, 0);
449 } 449 }
450 450
451 451
452 /* 452 /*
453 * Gameport port operations 453 * Gameport port operations
454 */ 454 */
455 455
456 static ssize_t gameport_show_description(struct device *dev, struct device_attribute *attr, char *buf) 456 static ssize_t gameport_show_description(struct device *dev, struct device_attribute *attr, char *buf)
457 { 457 {
458 struct gameport *gameport = to_gameport_port(dev); 458 struct gameport *gameport = to_gameport_port(dev);
459 return sprintf(buf, "%s\n", gameport->name); 459 return sprintf(buf, "%s\n", gameport->name);
460 } 460 }
461 461
462 static ssize_t gameport_rebind_driver(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) 462 static ssize_t gameport_rebind_driver(struct device *dev, struct device_attribute *attr, const char *buf, size_t count)
463 { 463 {
464 struct gameport *gameport = to_gameport_port(dev); 464 struct gameport *gameport = to_gameport_port(dev);
465 struct device_driver *drv; 465 struct device_driver *drv;
466 int retval; 466 int retval;
467 467
468 retval = down_interruptible(&gameport_sem); 468 retval = down_interruptible(&gameport_sem);
469 if (retval) 469 if (retval)
470 return retval; 470 return retval;
471 471
472 retval = count; 472 retval = count;
473 if (!strncmp(buf, "none", count)) { 473 if (!strncmp(buf, "none", count)) {
474 gameport_disconnect_port(gameport); 474 gameport_disconnect_port(gameport);
475 } else if (!strncmp(buf, "reconnect", count)) { 475 } else if (!strncmp(buf, "reconnect", count)) {
476 gameport_reconnect_port(gameport); 476 gameport_reconnect_port(gameport);
477 } else if (!strncmp(buf, "rescan", count)) { 477 } else if (!strncmp(buf, "rescan", count)) {
478 gameport_disconnect_port(gameport); 478 gameport_disconnect_port(gameport);
479 gameport_find_driver(gameport); 479 gameport_find_driver(gameport);
480 } else if ((drv = driver_find(buf, &gameport_bus)) != NULL) { 480 } else if ((drv = driver_find(buf, &gameport_bus)) != NULL) {
481 gameport_disconnect_port(gameport); 481 gameport_disconnect_port(gameport);
482 gameport_bind_driver(gameport, to_gameport_driver(drv)); 482 gameport_bind_driver(gameport, to_gameport_driver(drv));
483 put_driver(drv); 483 put_driver(drv);
484 } else { 484 } else {
485 retval = -EINVAL; 485 retval = -EINVAL;
486 } 486 }
487 487
488 up(&gameport_sem); 488 up(&gameport_sem);
489 489
490 return retval; 490 return retval;
491 } 491 }
492 492
493 static struct device_attribute gameport_device_attrs[] = { 493 static struct device_attribute gameport_device_attrs[] = {
494 __ATTR(description, S_IRUGO, gameport_show_description, NULL), 494 __ATTR(description, S_IRUGO, gameport_show_description, NULL),
495 __ATTR(drvctl, S_IWUSR, NULL, gameport_rebind_driver), 495 __ATTR(drvctl, S_IWUSR, NULL, gameport_rebind_driver),
496 __ATTR_NULL 496 __ATTR_NULL
497 }; 497 };
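The drvctl attribute above is what makes gameport_rebind_driver() reachable from user space. A rough user-space illustration follows; the sysfs path is an assumption pieced together from the bus name and the "gameport%lu" bus_id generated later in gameport_init_port(), so the actual device name will vary per system.

#include <stdio.h>

/* Assumed path; the real device name depends on registration order. */
#define DRVCTL "/sys/bus/gameport/devices/gameport0/drvctl"

int main(void)
{
	FILE *f = fopen(DRVCTL, "w");

	if (!f) {
		perror(DRVCTL);
		return 1;
	}
	/* gameport_rebind_driver() accepts "none", "reconnect", "rescan"
	 * or the name of a registered gameport driver. */
	fputs("rescan", f);
	return fclose(f) ? 1 : 0;
}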
498 498
499 static void gameport_release_port(struct device *dev) 499 static void gameport_release_port(struct device *dev)
500 { 500 {
501 struct gameport *gameport = to_gameport_port(dev); 501 struct gameport *gameport = to_gameport_port(dev);
502 502
503 kfree(gameport); 503 kfree(gameport);
504 module_put(THIS_MODULE); 504 module_put(THIS_MODULE);
505 } 505 }
506 506
507 void gameport_set_phys(struct gameport *gameport, const char *fmt, ...) 507 void gameport_set_phys(struct gameport *gameport, const char *fmt, ...)
508 { 508 {
509 va_list args; 509 va_list args;
510 510
511 va_start(args, fmt); 511 va_start(args, fmt);
512 vsnprintf(gameport->phys, sizeof(gameport->phys), fmt, args); 512 vsnprintf(gameport->phys, sizeof(gameport->phys), fmt, args);
513 va_end(args); 513 va_end(args);
514 } 514 }
515 515
516 /* 516 /*
517 * Prepare gameport port for registration. 517 * Prepare gameport port for registration.
518 */ 518 */
519 static void gameport_init_port(struct gameport *gameport) 519 static void gameport_init_port(struct gameport *gameport)
520 { 520 {
521 static atomic_t gameport_no = ATOMIC_INIT(0); 521 static atomic_t gameport_no = ATOMIC_INIT(0);
522 522
523 __module_get(THIS_MODULE); 523 __module_get(THIS_MODULE);
524 524
525 init_MUTEX(&gameport->drv_sem); 525 init_MUTEX(&gameport->drv_sem);
526 device_initialize(&gameport->dev); 526 device_initialize(&gameport->dev);
527 snprintf(gameport->dev.bus_id, sizeof(gameport->dev.bus_id), 527 snprintf(gameport->dev.bus_id, sizeof(gameport->dev.bus_id),
528 "gameport%lu", (unsigned long)atomic_inc_return(&gameport_no) - 1); 528 "gameport%lu", (unsigned long)atomic_inc_return(&gameport_no) - 1);
529 gameport->dev.bus = &gameport_bus; 529 gameport->dev.bus = &gameport_bus;
530 gameport->dev.release = gameport_release_port; 530 gameport->dev.release = gameport_release_port;
531 if (gameport->parent) 531 if (gameport->parent)
532 gameport->dev.parent = &gameport->parent->dev; 532 gameport->dev.parent = &gameport->parent->dev;
533 533
534 spin_lock_init(&gameport->timer_lock); 534 spin_lock_init(&gameport->timer_lock);
535 init_timer(&gameport->poll_timer); 535 init_timer(&gameport->poll_timer);
536 gameport->poll_timer.function = gameport_run_poll_handler; 536 gameport->poll_timer.function = gameport_run_poll_handler;
537 gameport->poll_timer.data = (unsigned long)gameport; 537 gameport->poll_timer.data = (unsigned long)gameport;
538 } 538 }
539 539
540 /* 540 /*
541 * Complete gameport port registration. 541 * Complete gameport port registration.
542 * Driver core will attempt to find appropriate driver for the port. 542 * Driver core will attempt to find appropriate driver for the port.
543 */ 543 */
544 static void gameport_add_port(struct gameport *gameport) 544 static void gameport_add_port(struct gameport *gameport)
545 { 545 {
546 if (gameport->parent) 546 if (gameport->parent)
547 gameport->parent->child = gameport; 547 gameport->parent->child = gameport;
548 548
549 gameport->speed = gameport_measure_speed(gameport); 549 gameport->speed = gameport_measure_speed(gameport);
550 550
551 list_add_tail(&gameport->node, &gameport_list); 551 list_add_tail(&gameport->node, &gameport_list);
552 552
553 if (gameport->io) 553 if (gameport->io)
554 printk(KERN_INFO "gameport: %s is %s, io %#x, speed %dkHz\n", 554 printk(KERN_INFO "gameport: %s is %s, io %#x, speed %dkHz\n",
555 gameport->name, gameport->phys, gameport->io, gameport->speed); 555 gameport->name, gameport->phys, gameport->io, gameport->speed);
556 else 556 else
557 printk(KERN_INFO "gameport: %s is %s, speed %dkHz\n", 557 printk(KERN_INFO "gameport: %s is %s, speed %dkHz\n",
558 gameport->name, gameport->phys, gameport->speed); 558 gameport->name, gameport->phys, gameport->speed);
559 559
560 device_add(&gameport->dev); 560 device_add(&gameport->dev);
561 gameport->registered = 1; 561 gameport->registered = 1;
562 } 562 }
563 563
564 /* 564 /*
565 * gameport_destroy_port() completes deregistration process and removes 565 * gameport_destroy_port() completes deregistration process and removes
566 * port from the system 566 * port from the system
567 */ 567 */
568 static void gameport_destroy_port(struct gameport *gameport) 568 static void gameport_destroy_port(struct gameport *gameport)
569 { 569 {
570 struct gameport *child; 570 struct gameport *child;
571 571
572 child = gameport_get_pending_child(gameport); 572 child = gameport_get_pending_child(gameport);
573 if (child) { 573 if (child) {
574 gameport_remove_pending_events(child); 574 gameport_remove_pending_events(child);
575 put_device(&child->dev); 575 put_device(&child->dev);
576 } 576 }
577 577
578 if (gameport->parent) { 578 if (gameport->parent) {
579 gameport->parent->child = NULL; 579 gameport->parent->child = NULL;
580 gameport->parent = NULL; 580 gameport->parent = NULL;
581 } 581 }
582 582
583 if (gameport->registered) { 583 if (gameport->registered) {
584 device_del(&gameport->dev); 584 device_del(&gameport->dev);
585 list_del_init(&gameport->node); 585 list_del_init(&gameport->node);
586 gameport->registered = 0; 586 gameport->registered = 0;
587 } 587 }
588 588
589 gameport_remove_pending_events(gameport); 589 gameport_remove_pending_events(gameport);
590 put_device(&gameport->dev); 590 put_device(&gameport->dev);
591 } 591 }
592 592
593 /* 593 /*
594 * Reconnect gameport port and all its children (re-initialize attached devices) 594 * Reconnect gameport port and all its children (re-initialize attached devices)
595 */ 595 */
596 static void gameport_reconnect_port(struct gameport *gameport) 596 static void gameport_reconnect_port(struct gameport *gameport)
597 { 597 {
598 do { 598 do {
599 if (!gameport->drv || !gameport->drv->reconnect || gameport->drv->reconnect(gameport)) { 599 if (!gameport->drv || !gameport->drv->reconnect || gameport->drv->reconnect(gameport)) {
600 gameport_disconnect_port(gameport); 600 gameport_disconnect_port(gameport);
601 gameport_find_driver(gameport); 601 gameport_find_driver(gameport);
602 /* Ok, old children are now gone, we are done */ 602 /* Ok, old children are now gone, we are done */
603 break; 603 break;
604 } 604 }
605 gameport = gameport->child; 605 gameport = gameport->child;
606 } while (gameport); 606 } while (gameport);
607 } 607 }
608 608
609 /* 609 /*
610 * gameport_disconnect_port() unbinds a port from its driver. As a side effect 610 * gameport_disconnect_port() unbinds a port from its driver. As a side effect
611 * all child ports are unbound and destroyed. 611 * all child ports are unbound and destroyed.
612 */ 612 */
613 static void gameport_disconnect_port(struct gameport *gameport) 613 static void gameport_disconnect_port(struct gameport *gameport)
614 { 614 {
615 struct gameport *s, *parent; 615 struct gameport *s, *parent;
616 616
617 if (gameport->child) { 617 if (gameport->child) {
618 /* 618 /*
619 * Child ports should be disconnected and destroyed 619 * Child ports should be disconnected and destroyed
620 * first, starting with the leaf one, since we don't 620 * first, starting with the leaf one, since we don't
621 * want to do recursion 621 * want to do recursion
622 */ 622 */
623 for (s = gameport; s->child; s = s->child) 623 for (s = gameport; s->child; s = s->child)
624 /* empty */; 624 /* empty */;
625 625
626 do { 626 do {
627 parent = s->parent; 627 parent = s->parent;
628 628
629 gameport_release_driver(s); 629 gameport_release_driver(s);
630 gameport_destroy_port(s); 630 gameport_destroy_port(s);
631 } while ((s = parent) != gameport); 631 } while ((s = parent) != gameport);
632 } 632 }
633 633
634 /* 634 /*
635 * Ok, no children left, now disconnect this port 635 * Ok, no children left, now disconnect this port
636 */ 636 */
637 gameport_release_driver(gameport); 637 gameport_release_driver(gameport);
638 } 638 }
639 639
640 void gameport_rescan(struct gameport *gameport) 640 void gameport_rescan(struct gameport *gameport)
641 { 641 {
642 gameport_queue_event(gameport, NULL, GAMEPORT_RESCAN); 642 gameport_queue_event(gameport, NULL, GAMEPORT_RESCAN);
643 } 643 }
644 644
645 void gameport_reconnect(struct gameport *gameport) 645 void gameport_reconnect(struct gameport *gameport)
646 { 646 {
647 gameport_queue_event(gameport, NULL, GAMEPORT_RECONNECT); 647 gameport_queue_event(gameport, NULL, GAMEPORT_RECONNECT);
648 } 648 }
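The two wrappers above are the entry points a port-owning driver normally uses to feed the kgameportd event queue. Below is a minimal, hypothetical sketch of how that might look; everything named example_* is invented, and only gameport_rescan()/gameport_reconnect() come from this file. Because gameport_queue_event() collapses back-to-back events of the same type for the same port, requesting a rescan twice before kgameportd wakes up still leaves a single GAMEPORT_RESCAN queued.

#include <linux/gameport.h>

/*
 * Sketch only: the hardware noticed that the attached device changed,
 * so ask kgameportd to rebind the port asynchronously.
 */
static void example_device_changed(struct gameport *gameport, int full_rescan)
{
	if (full_rescan)
		gameport_rescan(gameport);	/* disconnect, then search for a driver */
	else
		gameport_reconnect(gameport);	/* let the bound driver re-init the device */
}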
649 649
650 /* 650 /*
651 * Submits register request to kgameportd for subsequent execution. 651 * Submits register request to kgameportd for subsequent execution.
652 * Note that port registration is always asynchronous. 652 * Note that port registration is always asynchronous.
653 */ 653 */
654 void __gameport_register_port(struct gameport *gameport, struct module *owner) 654 void __gameport_register_port(struct gameport *gameport, struct module *owner)
655 { 655 {
656 gameport_init_port(gameport); 656 gameport_init_port(gameport);
657 gameport_queue_event(gameport, owner, GAMEPORT_REGISTER_PORT); 657 gameport_queue_event(gameport, owner, GAMEPORT_REGISTER_PORT);
658 } 658 }
659 659
660 /* 660 /*
661 * Synchronously unregisters gameport port. 661 * Synchronously unregisters gameport port.
662 */ 662 */
663 void gameport_unregister_port(struct gameport *gameport) 663 void gameport_unregister_port(struct gameport *gameport)
664 { 664 {
665 down(&gameport_sem); 665 down(&gameport_sem);
666 gameport_disconnect_port(gameport); 666 gameport_disconnect_port(gameport);
667 gameport_destroy_port(gameport); 667 gameport_destroy_port(gameport);
668 up(&gameport_sem); 668 up(&gameport_sem);
669 } 669 }
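For the driver that owns the hardware, the pair above is the whole port lifecycle: describe the port, hand it to kgameportd, and tear it down synchronously on removal. The sketch below is hypothetical (every example_* name is invented, the field layout is assumed to match include/linux/gameport.h, and the phys string is arbitrary); in-tree callers may use a wrapper around __gameport_register_port() instead of calling it directly.

#include <linux/gameport.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/string.h>

/* Sketch: allocate, describe and register a port.  Registration is
 * asynchronous and completes in kgameportd via gameport_add_port(). */
static struct gameport *example_create_port(int io)
{
	struct gameport *gp;

	gp = kcalloc(1, sizeof(struct gameport), GFP_KERNEL);
	if (!gp)
		return NULL;

	strlcpy(gp->name, "Example gameport", sizeof(gp->name));
	gameport_set_phys(gp, "example/gameport%d", 0);
	gp->io = io;

	__gameport_register_port(gp, THIS_MODULE);
	return gp;
}

/* Sketch: synchronous teardown on module removal.  The final
 * put_device() in gameport_destroy_port() ends up in
 * gameport_release_port(), which kfree()s the structure, so the
 * caller must not free it again. */
static void example_destroy_port(struct gameport *gp)
{
	gameport_unregister_port(gp);
}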
670 670
671 671
672 /* 672 /*
673 * Gameport driver operations 673 * Gameport driver operations
674 */ 674 */
675 675
676 static ssize_t gameport_driver_show_description(struct device_driver *drv, char *buf) 676 static ssize_t gameport_driver_show_description(struct device_driver *drv, char *buf)
677 { 677 {
678 struct gameport_driver *driver = to_gameport_driver(drv); 678 struct gameport_driver *driver = to_gameport_driver(drv);
679 return sprintf(buf, "%s\n", driver->description ? driver->description : "(none)"); 679 return sprintf(buf, "%s\n", driver->description ? driver->description : "(none)");
680 } 680 }
681 681
682 static struct driver_attribute gameport_driver_attrs[] = { 682 static struct driver_attribute gameport_driver_attrs[] = {
683 __ATTR(description, S_IRUGO, gameport_driver_show_description, NULL), 683 __ATTR(description, S_IRUGO, gameport_driver_show_description, NULL),
684 __ATTR_NULL 684 __ATTR_NULL
685 }; 685 };
686 686
687 static int gameport_driver_probe(struct device *dev) 687 static int gameport_driver_probe(struct device *dev)
688 { 688 {
689 struct gameport *gameport = to_gameport_port(dev); 689 struct gameport *gameport = to_gameport_port(dev);
690 struct gameport_driver *drv = to_gameport_driver(dev->driver); 690 struct gameport_driver *drv = to_gameport_driver(dev->driver);
691 691
692 drv->connect(gameport, drv); 692 drv->connect(gameport, drv);
693 return gameport->drv ? 0 : -ENODEV; 693 return gameport->drv ? 0 : -ENODEV;
694 } 694 }
695 695
696 static int gameport_driver_remove(struct device *dev) 696 static int gameport_driver_remove(struct device *dev)
697 { 697 {
698 struct gameport *gameport = to_gameport_port(dev); 698 struct gameport *gameport = to_gameport_port(dev);
699 struct gameport_driver *drv = to_gameport_driver(dev->driver); 699 struct gameport_driver *drv = to_gameport_driver(dev->driver);
700 700
701 drv->disconnect(gameport); 701 drv->disconnect(gameport);
702 return 0; 702 return 0;
703 } 703 }
704 704
705 void __gameport_register_driver(struct gameport_driver *drv, struct module *owner) 705 void __gameport_register_driver(struct gameport_driver *drv, struct module *owner)
706 { 706 {
707 drv->driver.bus = &gameport_bus; 707 drv->driver.bus = &gameport_bus;
708 drv->driver.probe = gameport_driver_probe; 708 drv->driver.probe = gameport_driver_probe;
709 drv->driver.remove = gameport_driver_remove; 709 drv->driver.remove = gameport_driver_remove;
710 gameport_queue_event(drv, owner, GAMEPORT_REGISTER_DRIVER); 710 gameport_queue_event(drv, owner, GAMEPORT_REGISTER_DRIVER);
711 } 711 }
712 712
713 void gameport_unregister_driver(struct gameport_driver *drv) 713 void gameport_unregister_driver(struct gameport_driver *drv)
714 { 714 {
715 struct gameport *gameport; 715 struct gameport *gameport;
716 716
717 down(&gameport_sem); 717 down(&gameport_sem);
718 drv->ignore = 1; /* so gameport_find_driver ignores it */ 718 drv->ignore = 1; /* so gameport_find_driver ignores it */
719 719
720 start_over: 720 start_over:
721 list_for_each_entry(gameport, &gameport_list, node) { 721 list_for_each_entry(gameport, &gameport_list, node) {
722 if (gameport->drv == drv) { 722 if (gameport->drv == drv) {
723 gameport_disconnect_port(gameport); 723 gameport_disconnect_port(gameport);
724 gameport_find_driver(gameport); 724 gameport_find_driver(gameport);
725 /* we could've deleted some ports, restart */ 725 /* we could've deleted some ports, restart */
726 goto start_over; 726 goto start_over;
727 } 727 }
728 } 728 }
729 729
730 driver_unregister(&drv->driver); 730 driver_unregister(&drv->driver);
731 up(&gameport_sem); 731 up(&gameport_sem);
732 } 732 }
733 733
734 static int gameport_bus_match(struct device *dev, struct device_driver *drv) 734 static int gameport_bus_match(struct device *dev, struct device_driver *drv)
735 { 735 {
736 struct gameport_driver *gameport_drv = to_gameport_driver(drv); 736 struct gameport_driver *gameport_drv = to_gameport_driver(drv);
737 737
738 return !gameport_drv->ignore; 738 return !gameport_drv->ignore;
739 } 739 }
740 740
741 static void gameport_set_drv(struct gameport *gameport, struct gameport_driver *drv) 741 static void gameport_set_drv(struct gameport *gameport, struct gameport_driver *drv)
742 { 742 {
743 down(&gameport->drv_sem); 743 down(&gameport->drv_sem);
744 gameport->drv = drv; 744 gameport->drv = drv;
745 up(&gameport->drv_sem); 745 up(&gameport->drv_sem);
746 } 746 }
747 747
748 int gameport_open(struct gameport *gameport, struct gameport_driver *drv, int mode) 748 int gameport_open(struct gameport *gameport, struct gameport_driver *drv, int mode)
749 { 749 {
750 750
751 if (gameport->open) { 751 if (gameport->open) {
752 if (gameport->open(gameport, mode)) { 752 if (gameport->open(gameport, mode)) {
753 return -1; 753 return -1;
754 } 754 }
755 } else { 755 } else {
756 if (mode != GAMEPORT_MODE_RAW) 756 if (mode != GAMEPORT_MODE_RAW)
757 return -1; 757 return -1;
758 } 758 }
759 759
760 gameport_set_drv(gameport, drv); 760 gameport_set_drv(gameport, drv);
761 return 0; 761 return 0;
762 } 762 }
763 763
764 void gameport_close(struct gameport *gameport) 764 void gameport_close(struct gameport *gameport)
765 { 765 {
766 del_timer_sync(&gameport->poll_timer); 766 del_timer_sync(&gameport->poll_timer);
767 gameport->poll_handler = NULL; 767 gameport->poll_handler = NULL;
768 gameport->poll_interval = 0; 768 gameport->poll_interval = 0;
769 gameport_set_drv(gameport, NULL); 769 gameport_set_drv(gameport, NULL);
770 if (gameport->close) 770 if (gameport->close)
771 gameport->close(gameport); 771 gameport->close(gameport);
772 } 772 }
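On the consuming side, a joystick driver registers a gameport_driver and pairs gameport_open()/gameport_close() in its connect()/disconnect() handlers; gameport_driver_probe() above then reports success only if connect() actually bound the port. The skeleton below is a hedged sketch: all example_* names are invented and the exact callback prototypes should be taken from include/linux/gameport.h.

#include <linux/gameport.h>
#include <linux/init.h>
#include <linux/module.h>

/* Sketch of connect(): claim the port so that gameport->drv is set
 * and gameport_driver_probe() returns 0. */
static int example_connect(struct gameport *gameport, struct gameport_driver *drv)
{
	if (gameport_open(gameport, drv, GAMEPORT_MODE_RAW))
		return -ENODEV;

	/* ... probe the hardware, set up and register an input device ... */
	return 0;
}

static void example_disconnect(struct gameport *gameport)
{
	/* ... unregister the input device ... */
	gameport_close(gameport);
}

static struct gameport_driver example_drv = {
	.driver		= {
		.name	= "example",
	},
	.description	= "Example gameport driver",
	.connect	= example_connect,
	.disconnect	= example_disconnect,
};

static int __init example_init(void)
{
	__gameport_register_driver(&example_drv, THIS_MODULE);
	return 0;
}

static void __exit example_exit(void)
{
	gameport_unregister_driver(&example_drv);
}

module_init(example_init);
module_exit(example_exit);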
773 773
774 static int __init gameport_init(void) 774 static int __init gameport_init(void)
775 { 775 {
776 if (!(gameport_pid = kernel_thread(gameport_thread, NULL, CLONE_KERNEL))) { 776 if (!(gameport_pid = kernel_thread(gameport_thread, NULL, CLONE_KERNEL))) {
777 printk(KERN_ERR "gameport: Failed to start kgameportd\n"); 777 printk(KERN_ERR "gameport: Failed to start kgameportd\n");
778 return -1; 778 return -1;
779 } 779 }
780 780
781 gameport_bus.dev_attrs = gameport_device_attrs; 781 gameport_bus.dev_attrs = gameport_device_attrs;
782 gameport_bus.drv_attrs = gameport_driver_attrs; 782 gameport_bus.drv_attrs = gameport_driver_attrs;
783 gameport_bus.match = gameport_bus_match; 783 gameport_bus.match = gameport_bus_match;
784 bus_register(&gameport_bus); 784 bus_register(&gameport_bus);
785 785
786 return 0; 786 return 0;
787 } 787 }
788 788
789 static void __exit gameport_exit(void) 789 static void __exit gameport_exit(void)
790 { 790 {
791 bus_unregister(&gameport_bus); 791 bus_unregister(&gameport_bus);
792 kill_proc(gameport_pid, SIGTERM, 1); 792 kill_proc(gameport_pid, SIGTERM, 1);
793 wait_for_completion(&gameport_exited); 793 wait_for_completion(&gameport_exited);
794 } 794 }
795 795
796 module_init(gameport_init); 796 module_init(gameport_init);
797 module_exit(gameport_exit); 797 module_exit(gameport_exit);
798 798
drivers/oprofile/buffer_sync.c
1 /** 1 /**
2 * @file buffer_sync.c 2 * @file buffer_sync.c
3 * 3 *
4 * @remark Copyright 2002 OProfile authors 4 * @remark Copyright 2002 OProfile authors
5 * @remark Read the file COPYING 5 * @remark Read the file COPYING
6 * 6 *
7 * @author John Levon <levon@movementarian.org> 7 * @author John Levon <levon@movementarian.org>
8 * 8 *
9 * This is the core of the buffer management. Each 9 * This is the core of the buffer management. Each
10 * CPU buffer is processed and entered into the 10 * CPU buffer is processed and entered into the
11 * global event buffer. Such processing is necessary 11 * global event buffer. Such processing is necessary
12 * in several circumstances, mentioned below. 12 * in several circumstances, mentioned below.
13 * 13 *
14 * The processing does the job of converting the 14 * The processing does the job of converting the
15 * transitory EIP value into a persistent dentry/offset 15 * transitory EIP value into a persistent dentry/offset
16 * value that the profiler can record at its leisure. 16 * value that the profiler can record at its leisure.
17 * 17 *
18 * See fs/dcookies.c for a description of the dentry/offset 18 * See fs/dcookies.c for a description of the dentry/offset
19 * objects. 19 * objects.
20 */ 20 */
21 21
22 #include <linux/mm.h> 22 #include <linux/mm.h>
23 #include <linux/workqueue.h> 23 #include <linux/workqueue.h>
24 #include <linux/notifier.h> 24 #include <linux/notifier.h>
25 #include <linux/dcookies.h> 25 #include <linux/dcookies.h>
26 #include <linux/profile.h> 26 #include <linux/profile.h>
27 #include <linux/module.h> 27 #include <linux/module.h>
28 #include <linux/fs.h> 28 #include <linux/fs.h>
29 29
30 #include "oprofile_stats.h" 30 #include "oprofile_stats.h"
31 #include "event_buffer.h" 31 #include "event_buffer.h"
32 #include "cpu_buffer.h" 32 #include "cpu_buffer.h"
33 #include "buffer_sync.h" 33 #include "buffer_sync.h"
34 34
35 static LIST_HEAD(dying_tasks); 35 static LIST_HEAD(dying_tasks);
36 static LIST_HEAD(dead_tasks); 36 static LIST_HEAD(dead_tasks);
37 static cpumask_t marked_cpus = CPU_MASK_NONE; 37 static cpumask_t marked_cpus = CPU_MASK_NONE;
38 static DEFINE_SPINLOCK(task_mortuary); 38 static DEFINE_SPINLOCK(task_mortuary);
39 static void process_task_mortuary(void); 39 static void process_task_mortuary(void);
40 40
41 41
42 /* Take ownership of the task struct and place it on the 42 /* Take ownership of the task struct and place it on the
43 * list for processing. Only after two full buffer syncs 43 * list for processing. Only after two full buffer syncs
44 * does the task eventually get freed, because by then 44 * does the task eventually get freed, because by then
45 * we are sure we will not reference it again. 45 * we are sure we will not reference it again.
46 */ 46 */
47 static int task_free_notify(struct notifier_block * self, unsigned long val, void * data) 47 static int task_free_notify(struct notifier_block * self, unsigned long val, void * data)
48 { 48 {
49 struct task_struct * task = data; 49 struct task_struct * task = data;
50 spin_lock(&task_mortuary); 50 spin_lock(&task_mortuary);
51 list_add(&task->tasks, &dying_tasks); 51 list_add(&task->tasks, &dying_tasks);
52 spin_unlock(&task_mortuary); 52 spin_unlock(&task_mortuary);
53 return NOTIFY_OK; 53 return NOTIFY_OK;
54 } 54 }
55 55
56 56
57 /* The task is on its way out. A sync of the buffer means we can catch 57 /* The task is on its way out. A sync of the buffer means we can catch
58 * any remaining samples for this task. 58 * any remaining samples for this task.
59 */ 59 */
60 static int task_exit_notify(struct notifier_block * self, unsigned long val, void * data) 60 static int task_exit_notify(struct notifier_block * self, unsigned long val, void * data)
61 { 61 {
62 /* To avoid latency problems, we only process the current CPU, 62 /* To avoid latency problems, we only process the current CPU,
63 * hoping that most samples for the task are on this CPU 63 * hoping that most samples for the task are on this CPU
64 */ 64 */
65 sync_buffer(_smp_processor_id()); 65 sync_buffer(raw_smp_processor_id());
66 return 0; 66 return 0;
67 } 67 }
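This is one of the two call sites in this file that the patch switches over to raw_smp_processor_id(). The CPU number here is only a hint (as the comment says, we merely hope most of the task's samples are on this CPU, and a stale value just leaves some samples for a later sync), so no preemption pinning is needed. For contrast, a hedged sketch of the pinned pattern used when code really must stay on the CPU whose data it touches:

#include <linux/smp.h>

/* Sketch only: pin to the current CPU while updating its slot in a
 * hypothetical per-CPU counter array. */
static void example_pinned_update(int *counters)
{
	int cpu = get_cpu();	/* disables preemption */

	counters[cpu]++;	/* cannot migrate until put_cpu() */
	put_cpu();		/* re-enables preemption */
}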
68 68
69 69
70 /* The task is about to try a do_munmap(). We peek at what it's going to 70 /* The task is about to try a do_munmap(). We peek at what it's going to
71 * do, and if it's an executable region, process the samples first, so 71 * do, and if it's an executable region, process the samples first, so
72 * we don't lose any. This does not have to be exact, it's a QoI issue 72 * we don't lose any. This does not have to be exact, it's a QoI issue
73 * only. 73 * only.
74 */ 74 */
75 static int munmap_notify(struct notifier_block * self, unsigned long val, void * data) 75 static int munmap_notify(struct notifier_block * self, unsigned long val, void * data)
76 { 76 {
77 unsigned long addr = (unsigned long)data; 77 unsigned long addr = (unsigned long)data;
78 struct mm_struct * mm = current->mm; 78 struct mm_struct * mm = current->mm;
79 struct vm_area_struct * mpnt; 79 struct vm_area_struct * mpnt;
80 80
81 down_read(&mm->mmap_sem); 81 down_read(&mm->mmap_sem);
82 82
83 mpnt = find_vma(mm, addr); 83 mpnt = find_vma(mm, addr);
84 if (mpnt && mpnt->vm_file && (mpnt->vm_flags & VM_EXEC)) { 84 if (mpnt && mpnt->vm_file && (mpnt->vm_flags & VM_EXEC)) {
85 up_read(&mm->mmap_sem); 85 up_read(&mm->mmap_sem);
86 /* To avoid latency problems, we only process the current CPU, 86 /* To avoid latency problems, we only process the current CPU,
87 * hoping that most samples for the task are on this CPU 87 * hoping that most samples for the task are on this CPU
88 */ 88 */
89 sync_buffer(_smp_processor_id()); 89 sync_buffer(raw_smp_processor_id());
90 return 0; 90 return 0;
91 } 91 }
92 92
93 up_read(&mm->mmap_sem); 93 up_read(&mm->mmap_sem);
94 return 0; 94 return 0;
95 } 95 }
96 96
97 97
98 /* We need to be told about new modules so we don't attribute to a previously 98 /* We need to be told about new modules so we don't attribute to a previously
99 * loaded module, or drop the samples on the floor. 99 * loaded module, or drop the samples on the floor.
100 */ 100 */
101 static int module_load_notify(struct notifier_block * self, unsigned long val, void * data) 101 static int module_load_notify(struct notifier_block * self, unsigned long val, void * data)
102 { 102 {
103 #ifdef CONFIG_MODULES 103 #ifdef CONFIG_MODULES
104 if (val != MODULE_STATE_COMING) 104 if (val != MODULE_STATE_COMING)
105 return 0; 105 return 0;
106 106
107 /* FIXME: should we process all CPU buffers ? */ 107 /* FIXME: should we process all CPU buffers ? */
108 down(&buffer_sem); 108 down(&buffer_sem);
109 add_event_entry(ESCAPE_CODE); 109 add_event_entry(ESCAPE_CODE);
110 add_event_entry(MODULE_LOADED_CODE); 110 add_event_entry(MODULE_LOADED_CODE);
111 up(&buffer_sem); 111 up(&buffer_sem);
112 #endif 112 #endif
113 return 0; 113 return 0;
114 } 114 }
115 115
116 116
117 static struct notifier_block task_free_nb = { 117 static struct notifier_block task_free_nb = {
118 .notifier_call = task_free_notify, 118 .notifier_call = task_free_notify,
119 }; 119 };
120 120
121 static struct notifier_block task_exit_nb = { 121 static struct notifier_block task_exit_nb = {
122 .notifier_call = task_exit_notify, 122 .notifier_call = task_exit_notify,
123 }; 123 };
124 124
125 static struct notifier_block munmap_nb = { 125 static struct notifier_block munmap_nb = {
126 .notifier_call = munmap_notify, 126 .notifier_call = munmap_notify,
127 }; 127 };
128 128
129 static struct notifier_block module_load_nb = { 129 static struct notifier_block module_load_nb = {
130 .notifier_call = module_load_notify, 130 .notifier_call = module_load_notify,
131 }; 131 };
132 132
133 133
134 static void end_sync(void) 134 static void end_sync(void)
135 { 135 {
136 end_cpu_work(); 136 end_cpu_work();
137 /* make sure we don't leak task structs */ 137 /* make sure we don't leak task structs */
138 process_task_mortuary(); 138 process_task_mortuary();
139 process_task_mortuary(); 139 process_task_mortuary();
140 } 140 }
141 141
142 142
143 int sync_start(void) 143 int sync_start(void)
144 { 144 {
145 int err; 145 int err;
146 146
147 start_cpu_work(); 147 start_cpu_work();
148 148
149 err = task_handoff_register(&task_free_nb); 149 err = task_handoff_register(&task_free_nb);
150 if (err) 150 if (err)
151 goto out1; 151 goto out1;
152 err = profile_event_register(PROFILE_TASK_EXIT, &task_exit_nb); 152 err = profile_event_register(PROFILE_TASK_EXIT, &task_exit_nb);
153 if (err) 153 if (err)
154 goto out2; 154 goto out2;
155 err = profile_event_register(PROFILE_MUNMAP, &munmap_nb); 155 err = profile_event_register(PROFILE_MUNMAP, &munmap_nb);
156 if (err) 156 if (err)
157 goto out3; 157 goto out3;
158 err = register_module_notifier(&module_load_nb); 158 err = register_module_notifier(&module_load_nb);
159 if (err) 159 if (err)
160 goto out4; 160 goto out4;
161 161
162 out: 162 out:
163 return err; 163 return err;
164 out4: 164 out4:
165 profile_event_unregister(PROFILE_MUNMAP, &munmap_nb); 165 profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
166 out3: 166 out3:
167 profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb); 167 profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
168 out2: 168 out2:
169 task_handoff_unregister(&task_free_nb); 169 task_handoff_unregister(&task_free_nb);
170 out1: 170 out1:
171 end_sync(); 171 end_sync();
172 goto out; 172 goto out;
173 } 173 }
174 174
175 175
176 void sync_stop(void) 176 void sync_stop(void)
177 { 177 {
178 unregister_module_notifier(&module_load_nb); 178 unregister_module_notifier(&module_load_nb);
179 profile_event_unregister(PROFILE_MUNMAP, &munmap_nb); 179 profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
180 profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb); 180 profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
181 task_handoff_unregister(&task_free_nb); 181 task_handoff_unregister(&task_free_nb);
182 end_sync(); 182 end_sync();
183 } 183 }
184 184
185 185
186 /* Optimisation. We can manage without taking the dcookie sem 186 /* Optimisation. We can manage without taking the dcookie sem
187 * because we cannot reach this code without at least one 187 * because we cannot reach this code without at least one
188 * dcookie user still being registered (namely, the reader 188 * dcookie user still being registered (namely, the reader
189 * of the event buffer). */ 189 * of the event buffer). */
190 static inline unsigned long fast_get_dcookie(struct dentry * dentry, 190 static inline unsigned long fast_get_dcookie(struct dentry * dentry,
191 struct vfsmount * vfsmnt) 191 struct vfsmount * vfsmnt)
192 { 192 {
193 unsigned long cookie; 193 unsigned long cookie;
194 194
195 if (dentry->d_cookie) 195 if (dentry->d_cookie)
196 return (unsigned long)dentry; 196 return (unsigned long)dentry;
197 get_dcookie(dentry, vfsmnt, &cookie); 197 get_dcookie(dentry, vfsmnt, &cookie);
198 return cookie; 198 return cookie;
199 } 199 }
200 200
201 201
202 /* Look up the dcookie for the task's first VM_EXECUTABLE mapping, 202 /* Look up the dcookie for the task's first VM_EXECUTABLE mapping,
203 * which corresponds loosely to "application name". This is 203 * which corresponds loosely to "application name". This is
204 * not strictly necessary but allows oprofile to associate 204 * not strictly necessary but allows oprofile to associate
205 * shared-library samples with particular applications 205 * shared-library samples with particular applications
206 */ 206 */
207 static unsigned long get_exec_dcookie(struct mm_struct * mm) 207 static unsigned long get_exec_dcookie(struct mm_struct * mm)
208 { 208 {
209 unsigned long cookie = 0; 209 unsigned long cookie = 0;
210 struct vm_area_struct * vma; 210 struct vm_area_struct * vma;
211 211
212 if (!mm) 212 if (!mm)
213 goto out; 213 goto out;
214 214
215 for (vma = mm->mmap; vma; vma = vma->vm_next) { 215 for (vma = mm->mmap; vma; vma = vma->vm_next) {
216 if (!vma->vm_file) 216 if (!vma->vm_file)
217 continue; 217 continue;
218 if (!(vma->vm_flags & VM_EXECUTABLE)) 218 if (!(vma->vm_flags & VM_EXECUTABLE))
219 continue; 219 continue;
220 cookie = fast_get_dcookie(vma->vm_file->f_dentry, 220 cookie = fast_get_dcookie(vma->vm_file->f_dentry,
221 vma->vm_file->f_vfsmnt); 221 vma->vm_file->f_vfsmnt);
222 break; 222 break;
223 } 223 }
224 224
225 out: 225 out:
226 return cookie; 226 return cookie;
227 } 227 }
228 228
229 229
230 /* Convert the EIP value of a sample into a persistent dentry/offset 230 /* Convert the EIP value of a sample into a persistent dentry/offset
231 * pair that can then be added to the global event buffer. We make 231 * pair that can then be added to the global event buffer. We make
232 * sure to do this lookup before a mm->mmap modification happens so 232 * sure to do this lookup before a mm->mmap modification happens so
233 * we don't lose track. 233 * we don't lose track.
234 */ 234 */
235 static unsigned long lookup_dcookie(struct mm_struct * mm, unsigned long addr, off_t * offset) 235 static unsigned long lookup_dcookie(struct mm_struct * mm, unsigned long addr, off_t * offset)
236 { 236 {
237 unsigned long cookie = 0; 237 unsigned long cookie = 0;
238 struct vm_area_struct * vma; 238 struct vm_area_struct * vma;
239 239
240 for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) { 240 for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {
241 241
242 if (!vma->vm_file) 242 if (!vma->vm_file)
243 continue; 243 continue;
244 244
245 if (addr < vma->vm_start || addr >= vma->vm_end) 245 if (addr < vma->vm_start || addr >= vma->vm_end)
246 continue; 246 continue;
247 247
248 cookie = fast_get_dcookie(vma->vm_file->f_dentry, 248 cookie = fast_get_dcookie(vma->vm_file->f_dentry,
249 vma->vm_file->f_vfsmnt); 249 vma->vm_file->f_vfsmnt);
250 *offset = (vma->vm_pgoff << PAGE_SHIFT) + addr - vma->vm_start; 250 *offset = (vma->vm_pgoff << PAGE_SHIFT) + addr - vma->vm_start;
251 break; 251 break;
252 } 252 }
253 253
254 return cookie; 254 return cookie;
255 } 255 }
256 256
257 257
258 static unsigned long last_cookie = ~0UL; 258 static unsigned long last_cookie = ~0UL;
259 259
260 static void add_cpu_switch(int i) 260 static void add_cpu_switch(int i)
261 { 261 {
262 add_event_entry(ESCAPE_CODE); 262 add_event_entry(ESCAPE_CODE);
263 add_event_entry(CPU_SWITCH_CODE); 263 add_event_entry(CPU_SWITCH_CODE);
264 add_event_entry(i); 264 add_event_entry(i);
265 last_cookie = ~0UL; 265 last_cookie = ~0UL;
266 } 266 }
267 267
268 static void add_kernel_ctx_switch(unsigned int in_kernel) 268 static void add_kernel_ctx_switch(unsigned int in_kernel)
269 { 269 {
270 add_event_entry(ESCAPE_CODE); 270 add_event_entry(ESCAPE_CODE);
271 if (in_kernel) 271 if (in_kernel)
272 add_event_entry(KERNEL_ENTER_SWITCH_CODE); 272 add_event_entry(KERNEL_ENTER_SWITCH_CODE);
273 else 273 else
274 add_event_entry(KERNEL_EXIT_SWITCH_CODE); 274 add_event_entry(KERNEL_EXIT_SWITCH_CODE);
275 } 275 }
276 276
277 static void 277 static void
278 add_user_ctx_switch(struct task_struct const * task, unsigned long cookie) 278 add_user_ctx_switch(struct task_struct const * task, unsigned long cookie)
279 { 279 {
280 add_event_entry(ESCAPE_CODE); 280 add_event_entry(ESCAPE_CODE);
281 add_event_entry(CTX_SWITCH_CODE); 281 add_event_entry(CTX_SWITCH_CODE);
282 add_event_entry(task->pid); 282 add_event_entry(task->pid);
283 add_event_entry(cookie); 283 add_event_entry(cookie);
284 /* Another code for daemon back-compat */ 284 /* Another code for daemon back-compat */
285 add_event_entry(ESCAPE_CODE); 285 add_event_entry(ESCAPE_CODE);
286 add_event_entry(CTX_TGID_CODE); 286 add_event_entry(CTX_TGID_CODE);
287 add_event_entry(task->tgid); 287 add_event_entry(task->tgid);
288 } 288 }
289 289
290 290
291 static void add_cookie_switch(unsigned long cookie) 291 static void add_cookie_switch(unsigned long cookie)
292 { 292 {
293 add_event_entry(ESCAPE_CODE); 293 add_event_entry(ESCAPE_CODE);
294 add_event_entry(COOKIE_SWITCH_CODE); 294 add_event_entry(COOKIE_SWITCH_CODE);
295 add_event_entry(cookie); 295 add_event_entry(cookie);
296 } 296 }
297 297
298 298
299 static void add_trace_begin(void) 299 static void add_trace_begin(void)
300 { 300 {
301 add_event_entry(ESCAPE_CODE); 301 add_event_entry(ESCAPE_CODE);
302 add_event_entry(TRACE_BEGIN_CODE); 302 add_event_entry(TRACE_BEGIN_CODE);
303 } 303 }
304 304
305 305
306 static void add_sample_entry(unsigned long offset, unsigned long event) 306 static void add_sample_entry(unsigned long offset, unsigned long event)
307 { 307 {
308 add_event_entry(offset); 308 add_event_entry(offset);
309 add_event_entry(event); 309 add_event_entry(event);
310 } 310 }
311 311
312 312
313 static int add_us_sample(struct mm_struct * mm, struct op_sample * s) 313 static int add_us_sample(struct mm_struct * mm, struct op_sample * s)
314 { 314 {
315 unsigned long cookie; 315 unsigned long cookie;
316 off_t offset; 316 off_t offset;
317 317
318 cookie = lookup_dcookie(mm, s->eip, &offset); 318 cookie = lookup_dcookie(mm, s->eip, &offset);
319 319
320 if (!cookie) { 320 if (!cookie) {
321 atomic_inc(&oprofile_stats.sample_lost_no_mapping); 321 atomic_inc(&oprofile_stats.sample_lost_no_mapping);
322 return 0; 322 return 0;
323 } 323 }
324 324
325 if (cookie != last_cookie) { 325 if (cookie != last_cookie) {
326 add_cookie_switch(cookie); 326 add_cookie_switch(cookie);
327 last_cookie = cookie; 327 last_cookie = cookie;
328 } 328 }
329 329
330 add_sample_entry(offset, s->event); 330 add_sample_entry(offset, s->event);
331 331
332 return 1; 332 return 1;
333 } 333 }
334 334
335 335
336 /* Add a sample to the global event buffer. If possible the 336 /* Add a sample to the global event buffer. If possible the
337 * sample is converted into a persistent dentry/offset pair 337 * sample is converted into a persistent dentry/offset pair
338 * for later lookup from userspace. 338 * for later lookup from userspace.
339 */ 339 */
340 static int 340 static int
341 add_sample(struct mm_struct * mm, struct op_sample * s, int in_kernel) 341 add_sample(struct mm_struct * mm, struct op_sample * s, int in_kernel)
342 { 342 {
343 if (in_kernel) { 343 if (in_kernel) {
344 add_sample_entry(s->eip, s->event); 344 add_sample_entry(s->eip, s->event);
345 return 1; 345 return 1;
346 } else if (mm) { 346 } else if (mm) {
347 return add_us_sample(mm, s); 347 return add_us_sample(mm, s);
348 } else { 348 } else {
349 atomic_inc(&oprofile_stats.sample_lost_no_mm); 349 atomic_inc(&oprofile_stats.sample_lost_no_mm);
350 } 350 }
351 return 0; 351 return 0;
352 } 352 }
353 353
354 354
355 static void release_mm(struct mm_struct * mm) 355 static void release_mm(struct mm_struct * mm)
356 { 356 {
357 if (!mm) 357 if (!mm)
358 return; 358 return;
359 up_read(&mm->mmap_sem); 359 up_read(&mm->mmap_sem);
360 mmput(mm); 360 mmput(mm);
361 } 361 }
362 362
363 363
364 static struct mm_struct * take_tasks_mm(struct task_struct * task) 364 static struct mm_struct * take_tasks_mm(struct task_struct * task)
365 { 365 {
366 struct mm_struct * mm = get_task_mm(task); 366 struct mm_struct * mm = get_task_mm(task);
367 if (mm) 367 if (mm)
368 down_read(&mm->mmap_sem); 368 down_read(&mm->mmap_sem);
369 return mm; 369 return mm;
370 } 370 }
371 371
372 372
373 static inline int is_code(unsigned long val) 373 static inline int is_code(unsigned long val)
374 { 374 {
375 return val == ESCAPE_CODE; 375 return val == ESCAPE_CODE;
376 } 376 }
377 377
378 378
379 /* "acquire" as many cpu buffer slots as we can */ 379 /* "acquire" as many cpu buffer slots as we can */
380 static unsigned long get_slots(struct oprofile_cpu_buffer * b) 380 static unsigned long get_slots(struct oprofile_cpu_buffer * b)
381 { 381 {
382 unsigned long head = b->head_pos; 382 unsigned long head = b->head_pos;
383 unsigned long tail = b->tail_pos; 383 unsigned long tail = b->tail_pos;
384 384
385 /* 385 /*
386 * Subtle. This resets the persistent last_task 386 * Subtle. This resets the persistent last_task
387 * and in_kernel values used for switching notes. 387 * and in_kernel values used for switching notes.
388 * BUT, there is a small window between reading 388 * BUT, there is a small window between reading
389 * head_pos, and this call, that means samples 389 * head_pos, and this call, that means samples
390 * can appear at the new head position, but not 390 * can appear at the new head position, but not
391 * be prefixed with the notes for switching 391 * be prefixed with the notes for switching
392 * kernel mode or a task switch. This small hole 392 * kernel mode or a task switch. This small hole
393 * can lead to mis-attribution or samples where 393 * can lead to mis-attribution or samples where
394 * we don't know if it's in the kernel or not, 394 * we don't know if it's in the kernel or not,
395 * at the start of an event buffer. 395 * at the start of an event buffer.
396 */ 396 */
397 cpu_buffer_reset(b); 397 cpu_buffer_reset(b);
398 398
399 if (head >= tail) 399 if (head >= tail)
400 return head - tail; 400 return head - tail;
401 401
402 return head + (b->buffer_size - tail); 402 return head + (b->buffer_size - tail);
403 } 403 }
404 404
405 405
406 static void increment_tail(struct oprofile_cpu_buffer * b) 406 static void increment_tail(struct oprofile_cpu_buffer * b)
407 { 407 {
408 unsigned long new_tail = b->tail_pos + 1; 408 unsigned long new_tail = b->tail_pos + 1;
409 409
410 rmb(); 410 rmb();
411 411
412 if (new_tail < b->buffer_size) 412 if (new_tail < b->buffer_size)
413 b->tail_pos = new_tail; 413 b->tail_pos = new_tail;
414 else 414 else
415 b->tail_pos = 0; 415 b->tail_pos = 0;
416 } 416 }
417 417
418 418
419 /* Move tasks along towards death. Any tasks on dead_tasks 419 /* Move tasks along towards death. Any tasks on dead_tasks
420 * will definitely have no remaining references in any 420 * will definitely have no remaining references in any
421 * CPU buffers at this point, because we use two lists, 421 * CPU buffers at this point, because we use two lists,
422 * and to have reached the list, it must have gone through 422 * and to have reached the list, it must have gone through
423 * one full sync already. 423 * one full sync already.
424 */ 424 */
425 static void process_task_mortuary(void) 425 static void process_task_mortuary(void)
426 { 426 {
427 struct list_head * pos; 427 struct list_head * pos;
428 struct list_head * pos2; 428 struct list_head * pos2;
429 struct task_struct * task; 429 struct task_struct * task;
430 430
431 spin_lock(&task_mortuary); 431 spin_lock(&task_mortuary);
432 432
433 list_for_each_safe(pos, pos2, &dead_tasks) { 433 list_for_each_safe(pos, pos2, &dead_tasks) {
434 task = list_entry(pos, struct task_struct, tasks); 434 task = list_entry(pos, struct task_struct, tasks);
435 list_del(&task->tasks); 435 list_del(&task->tasks);
436 free_task(task); 436 free_task(task);
437 } 437 }
438 438
439 list_for_each_safe(pos, pos2, &dying_tasks) { 439 list_for_each_safe(pos, pos2, &dying_tasks) {
440 task = list_entry(pos, struct task_struct, tasks); 440 task = list_entry(pos, struct task_struct, tasks);
441 list_del(&task->tasks); 441 list_del(&task->tasks);
442 list_add_tail(&task->tasks, &dead_tasks); 442 list_add_tail(&task->tasks, &dead_tasks);
443 } 443 }
444 444
445 spin_unlock(&task_mortuary); 445 spin_unlock(&task_mortuary);
446 } 446 }
447 447
448 448
449 static void mark_done(int cpu) 449 static void mark_done(int cpu)
450 { 450 {
451 int i; 451 int i;
452 452
453 cpu_set(cpu, marked_cpus); 453 cpu_set(cpu, marked_cpus);
454 454
455 for_each_online_cpu(i) { 455 for_each_online_cpu(i) {
456 if (!cpu_isset(i, marked_cpus)) 456 if (!cpu_isset(i, marked_cpus))
457 return; 457 return;
458 } 458 }
459 459
460 /* All CPUs have been processed at least once, 460 /* All CPUs have been processed at least once,
461 * we can process the mortuary once 461 * we can process the mortuary once
462 */ 462 */
463 process_task_mortuary(); 463 process_task_mortuary();
464 464
465 cpus_clear(marked_cpus); 465 cpus_clear(marked_cpus);
466 } 466 }
467 467
468 468
469 /* FIXME: this is not sufficient if we implement syscall barrier backtrace 469 /* FIXME: this is not sufficient if we implement syscall barrier backtrace
470 * traversal, the code switches to sb_sample_start at the first kernel enter/exit 470 * traversal, the code switches to sb_sample_start at the first kernel enter/exit
471 * switch so we need a fifth state and some special handling in sync_buffer() 471 * switch so we need a fifth state and some special handling in sync_buffer()
472 */ 472 */
473 typedef enum { 473 typedef enum {
474 sb_bt_ignore = -2, 474 sb_bt_ignore = -2,
475 sb_buffer_start, 475 sb_buffer_start,
476 sb_bt_start, 476 sb_bt_start,
477 sb_sample_start, 477 sb_sample_start,
478 } sync_buffer_state; 478 } sync_buffer_state;
479 479
480 /* Sync one of the CPU's buffers into the global event buffer. 480 /* Sync one of the CPU's buffers into the global event buffer.
481 * Here we need to go through each batch of samples punctuated 481 * Here we need to go through each batch of samples punctuated
482 * by context switch notes, taking the task's mmap_sem and doing 482 * by context switch notes, taking the task's mmap_sem and doing
483 * lookup in task->mm->mmap to convert EIP into dcookie/offset 483 * lookup in task->mm->mmap to convert EIP into dcookie/offset
484 * value. 484 * value.
485 */ 485 */
486 void sync_buffer(int cpu) 486 void sync_buffer(int cpu)
487 { 487 {
488 struct oprofile_cpu_buffer * cpu_buf = &cpu_buffer[cpu]; 488 struct oprofile_cpu_buffer * cpu_buf = &cpu_buffer[cpu];
489 struct mm_struct *mm = NULL; 489 struct mm_struct *mm = NULL;
490 struct task_struct * new; 490 struct task_struct * new;
491 unsigned long cookie = 0; 491 unsigned long cookie = 0;
492 int in_kernel = 1; 492 int in_kernel = 1;
493 unsigned int i; 493 unsigned int i;
494 sync_buffer_state state = sb_buffer_start; 494 sync_buffer_state state = sb_buffer_start;
495 unsigned long available; 495 unsigned long available;
496 496
497 down(&buffer_sem); 497 down(&buffer_sem);
498 498
499 add_cpu_switch(cpu); 499 add_cpu_switch(cpu);
500 500
501 /* Remember, only we can modify tail_pos */ 501 /* Remember, only we can modify tail_pos */
502 502
503 available = get_slots(cpu_buf); 503 available = get_slots(cpu_buf);
504 504
505 for (i = 0; i < available; ++i) { 505 for (i = 0; i < available; ++i) {
506 struct op_sample * s = &cpu_buf->buffer[cpu_buf->tail_pos]; 506 struct op_sample * s = &cpu_buf->buffer[cpu_buf->tail_pos];
507 507
508 if (is_code(s->eip)) { 508 if (is_code(s->eip)) {
509 if (s->event <= CPU_IS_KERNEL) { 509 if (s->event <= CPU_IS_KERNEL) {
510 /* kernel/userspace switch */ 510 /* kernel/userspace switch */
511 in_kernel = s->event; 511 in_kernel = s->event;
512 if (state == sb_buffer_start) 512 if (state == sb_buffer_start)
513 state = sb_sample_start; 513 state = sb_sample_start;
514 add_kernel_ctx_switch(s->event); 514 add_kernel_ctx_switch(s->event);
515 } else if (s->event == CPU_TRACE_BEGIN) { 515 } else if (s->event == CPU_TRACE_BEGIN) {
516 state = sb_bt_start; 516 state = sb_bt_start;
517 add_trace_begin(); 517 add_trace_begin();
518 } else { 518 } else {
519 struct mm_struct * oldmm = mm; 519 struct mm_struct * oldmm = mm;
520 520
521 /* userspace context switch */ 521 /* userspace context switch */
522 new = (struct task_struct *)s->event; 522 new = (struct task_struct *)s->event;
523 523
524 release_mm(oldmm); 524 release_mm(oldmm);
525 mm = take_tasks_mm(new); 525 mm = take_tasks_mm(new);
526 if (mm != oldmm) 526 if (mm != oldmm)
527 cookie = get_exec_dcookie(mm); 527 cookie = get_exec_dcookie(mm);
528 add_user_ctx_switch(new, cookie); 528 add_user_ctx_switch(new, cookie);
529 } 529 }
530 } else { 530 } else {
531 if (state >= sb_bt_start && 531 if (state >= sb_bt_start &&
532 !add_sample(mm, s, in_kernel)) { 532 !add_sample(mm, s, in_kernel)) {
533 if (state == sb_bt_start) { 533 if (state == sb_bt_start) {
534 state = sb_bt_ignore; 534 state = sb_bt_ignore;
535 atomic_inc(&oprofile_stats.bt_lost_no_mapping); 535 atomic_inc(&oprofile_stats.bt_lost_no_mapping);
536 } 536 }
537 } 537 }
538 } 538 }
539 539
540 increment_tail(cpu_buf); 540 increment_tail(cpu_buf);
541 } 541 }
542 release_mm(mm); 542 release_mm(mm);
543 543
544 mark_done(cpu); 544 mark_done(cpu);
545 545
546 up(&buffer_sem); 546 up(&buffer_sem);
547 } 547 }
548 548
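The decode loop above relies on an in-band escape convention: an entry whose eip passes is_code() is a control note rather than a sample, and its event field then selects either a kernel/user mode switch, the start of a backtrace, or (any other value) a task pointer announcing a userspace context switch. Below is a minimal standalone model of that state handling. It is not part of the patch; the marker values and the printf reporting are illustrative stand-ins for the real is_code()/CPU_IS_KERNEL/CPU_TRACE_BEGIN machinery.

#include <stdio.h>

#define ESCAPE_EIP   (~0UL)   /* stands in for is_code(eip)             */
#define USER_ENTER   0UL      /* event values <= CPU_IS_KERNEL in the   */
#define KERNEL_ENTER 1UL      /*   real code mean a mode switch         */
#define TRACE_BEGIN  2UL      /* stands in for CPU_TRACE_BEGIN          */

struct sample { unsigned long eip; unsigned long event; };

int main(void)
{
	struct sample stream[] = {
		{ ESCAPE_EIP, KERNEL_ENTER },   /* control note: now in kernel  */
		{ 0xc0100000, 0 },              /* ordinary sample              */
		{ ESCAPE_EIP, TRACE_BEGIN },    /* control note: backtrace next */
		{ ESCAPE_EIP, USER_ENTER },     /* control note: back to user   */
		{ 0x08048000, 0 },              /* ordinary sample              */
	};
	int in_kernel = 1;
	unsigned int i;

	for (i = 0; i < sizeof(stream) / sizeof(stream[0]); i++) {
		struct sample *s = &stream[i];

		if (s->eip == ESCAPE_EIP) {
			if (s->event == TRACE_BEGIN)
				printf("[backtrace follows]\n");
			else
				in_kernel = (int)s->event;
			/* the real code treats any other event value as a
			 * task pointer announcing a user context switch */
		} else {
			printf("sample %#lx attributed to %s\n", s->eip,
			       in_kernel ? "kernel" : "user space");
		}
	}
	return 0;
}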
fs/xfs/linux-2.6/xfs_linux.h
1 /* 1 /*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. 2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify it 4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as 5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation. 6 * published by the Free Software Foundation.
7 * 7 *
8 * This program is distributed in the hope that it would be useful, but 8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of 9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 * 11 *
12 * Further, this software is distributed without any warranty that it is 12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement 13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or 14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if 15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with 16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever. 17 * other software, or any other product whatsoever.
18 * 18 *
19 * You should have received a copy of the GNU General Public License along 19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59 20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA. 21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 * 22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, 23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or: 24 * Mountain View, CA 94043, or:
25 * 25 *
26 * http://www.sgi.com 26 * http://www.sgi.com
27 * 27 *
28 * For further information regarding this notice, see: 28 * For further information regarding this notice, see:
29 * 29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ 30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */ 31 */
32 #ifndef __XFS_LINUX__ 32 #ifndef __XFS_LINUX__
33 #define __XFS_LINUX__ 33 #define __XFS_LINUX__
34 34
35 #include <linux/types.h> 35 #include <linux/types.h>
36 #include <linux/config.h> 36 #include <linux/config.h>
37 37
38 /* 38 /*
39 * Some types are conditional depending on the target system. 39 * Some types are conditional depending on the target system.
40 * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits. 40 * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits.
41 * XFS_BIG_INUMS needs the VFS inode number to be 64 bits, as well 41 * XFS_BIG_INUMS needs the VFS inode number to be 64 bits, as well
42 * as requiring XFS_BIG_BLKNOS to be set. 42 * as requiring XFS_BIG_BLKNOS to be set.
43 */ 43 */
44 #if defined(CONFIG_LBD) || (BITS_PER_LONG == 64) 44 #if defined(CONFIG_LBD) || (BITS_PER_LONG == 64)
45 # define XFS_BIG_BLKNOS 1 45 # define XFS_BIG_BLKNOS 1
46 # if BITS_PER_LONG == 64 46 # if BITS_PER_LONG == 64
47 # define XFS_BIG_INUMS 1 47 # define XFS_BIG_INUMS 1
48 # else 48 # else
49 # define XFS_BIG_INUMS 0 49 # define XFS_BIG_INUMS 0
50 # endif 50 # endif
51 #else 51 #else
52 # define XFS_BIG_BLKNOS 0 52 # define XFS_BIG_BLKNOS 0
53 # define XFS_BIG_INUMS 0 53 # define XFS_BIG_INUMS 0
54 #endif 54 #endif
55 55
56 #include <xfs_types.h> 56 #include <xfs_types.h>
57 #include <xfs_arch.h> 57 #include <xfs_arch.h>
58 58
59 #include <kmem.h> 59 #include <kmem.h>
60 #include <mrlock.h> 60 #include <mrlock.h>
61 #include <spin.h> 61 #include <spin.h>
62 #include <sv.h> 62 #include <sv.h>
63 #include <mutex.h> 63 #include <mutex.h>
64 #include <sema.h> 64 #include <sema.h>
65 #include <time.h> 65 #include <time.h>
66 66
67 #include <support/qsort.h> 67 #include <support/qsort.h>
68 #include <support/ktrace.h> 68 #include <support/ktrace.h>
69 #include <support/debug.h> 69 #include <support/debug.h>
70 #include <support/move.h> 70 #include <support/move.h>
71 #include <support/uuid.h> 71 #include <support/uuid.h>
72 72
73 #include <linux/mm.h> 73 #include <linux/mm.h>
74 #include <linux/kernel.h> 74 #include <linux/kernel.h>
75 #include <linux/blkdev.h> 75 #include <linux/blkdev.h>
76 #include <linux/slab.h> 76 #include <linux/slab.h>
77 #include <linux/module.h> 77 #include <linux/module.h>
78 #include <linux/file.h> 78 #include <linux/file.h>
79 #include <linux/swap.h> 79 #include <linux/swap.h>
80 #include <linux/errno.h> 80 #include <linux/errno.h>
81 #include <linux/sched.h> 81 #include <linux/sched.h>
82 #include <linux/bitops.h> 82 #include <linux/bitops.h>
83 #include <linux/major.h> 83 #include <linux/major.h>
84 #include <linux/pagemap.h> 84 #include <linux/pagemap.h>
85 #include <linux/vfs.h> 85 #include <linux/vfs.h>
86 #include <linux/seq_file.h> 86 #include <linux/seq_file.h>
87 #include <linux/init.h> 87 #include <linux/init.h>
88 #include <linux/list.h> 88 #include <linux/list.h>
89 #include <linux/proc_fs.h> 89 #include <linux/proc_fs.h>
90 #include <linux/version.h> 90 #include <linux/version.h>
91 #include <linux/sort.h> 91 #include <linux/sort.h>
92 92
93 #include <asm/page.h> 93 #include <asm/page.h>
94 #include <asm/div64.h> 94 #include <asm/div64.h>
95 #include <asm/param.h> 95 #include <asm/param.h>
96 #include <asm/uaccess.h> 96 #include <asm/uaccess.h>
97 #include <asm/byteorder.h> 97 #include <asm/byteorder.h>
98 #include <asm/unaligned.h> 98 #include <asm/unaligned.h>
99 99
100 #include <xfs_behavior.h> 100 #include <xfs_behavior.h>
101 #include <xfs_vfs.h> 101 #include <xfs_vfs.h>
102 #include <xfs_cred.h> 102 #include <xfs_cred.h>
103 #include <xfs_vnode.h> 103 #include <xfs_vnode.h>
104 #include <xfs_stats.h> 104 #include <xfs_stats.h>
105 #include <xfs_sysctl.h> 105 #include <xfs_sysctl.h>
106 #include <xfs_iops.h> 106 #include <xfs_iops.h>
107 #include <xfs_super.h> 107 #include <xfs_super.h>
108 #include <xfs_globals.h> 108 #include <xfs_globals.h>
109 #include <xfs_fs_subr.h> 109 #include <xfs_fs_subr.h>
110 #include <xfs_lrw.h> 110 #include <xfs_lrw.h>
111 #include <xfs_buf.h> 111 #include <xfs_buf.h>
112 112
113 /* 113 /*
114 * Feature macros (disable/enable) 114 * Feature macros (disable/enable)
115 */ 115 */
116 #undef HAVE_REFCACHE /* reference cache not needed for NFS in 2.6 */ 116 #undef HAVE_REFCACHE /* reference cache not needed for NFS in 2.6 */
117 #define HAVE_SENDFILE /* sendfile(2) exists in 2.6, but not in 2.4 */ 117 #define HAVE_SENDFILE /* sendfile(2) exists in 2.6, but not in 2.4 */
118 118
119 /* 119 /*
120 * State flag for unwritten extent buffers. 120 * State flag for unwritten extent buffers.
121 * 121 *
122 * We need to be able to distinguish between these and delayed 122 * We need to be able to distinguish between these and delayed
123 * allocate buffers within XFS. The generic IO path code does 123 * allocate buffers within XFS. The generic IO path code does
124 * not need to distinguish - we use the BH_Delay flag for both 124 * not need to distinguish - we use the BH_Delay flag for both
125 * delalloc and these ondisk-uninitialised buffers. 125 * delalloc and these ondisk-uninitialised buffers.
126 */ 126 */
127 BUFFER_FNS(PrivateStart, unwritten); 127 BUFFER_FNS(PrivateStart, unwritten);
128 static inline void set_buffer_unwritten_io(struct buffer_head *bh) 128 static inline void set_buffer_unwritten_io(struct buffer_head *bh)
129 { 129 {
130 bh->b_end_io = linvfs_unwritten_done; 130 bh->b_end_io = linvfs_unwritten_done;
131 } 131 }
132 132
133 #define restricted_chown xfs_params.restrict_chown.val 133 #define restricted_chown xfs_params.restrict_chown.val
134 #define irix_sgid_inherit xfs_params.sgid_inherit.val 134 #define irix_sgid_inherit xfs_params.sgid_inherit.val
135 #define irix_symlink_mode xfs_params.symlink_mode.val 135 #define irix_symlink_mode xfs_params.symlink_mode.val
136 #define xfs_panic_mask xfs_params.panic_mask.val 136 #define xfs_panic_mask xfs_params.panic_mask.val
137 #define xfs_error_level xfs_params.error_level.val 137 #define xfs_error_level xfs_params.error_level.val
138 #define xfs_syncd_centisecs xfs_params.syncd_timer.val 138 #define xfs_syncd_centisecs xfs_params.syncd_timer.val
139 #define xfs_stats_clear xfs_params.stats_clear.val 139 #define xfs_stats_clear xfs_params.stats_clear.val
140 #define xfs_inherit_sync xfs_params.inherit_sync.val 140 #define xfs_inherit_sync xfs_params.inherit_sync.val
141 #define xfs_inherit_nodump xfs_params.inherit_nodump.val 141 #define xfs_inherit_nodump xfs_params.inherit_nodump.val
142 #define xfs_inherit_noatime xfs_params.inherit_noatim.val 142 #define xfs_inherit_noatime xfs_params.inherit_noatim.val
143 #define xfs_buf_timer_centisecs xfs_params.xfs_buf_timer.val 143 #define xfs_buf_timer_centisecs xfs_params.xfs_buf_timer.val
144 #define xfs_buf_age_centisecs xfs_params.xfs_buf_age.val 144 #define xfs_buf_age_centisecs xfs_params.xfs_buf_age.val
145 #define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val 145 #define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val
146 #define xfs_rotorstep xfs_params.rotorstep.val 146 #define xfs_rotorstep xfs_params.rotorstep.val
147 147
148 #ifndef __smp_processor_id 148 #ifndef raw_smp_processor_id
149 #define __smp_processor_id() smp_processor_id() 149 #define raw_smp_processor_id() smp_processor_id()
150 #endif 150 #endif
151 #define current_cpu() __smp_processor_id() 151 #define current_cpu() raw_smp_processor_id()
152 #define current_pid() (current->pid) 152 #define current_pid() (current->pid)
153 #define current_fsuid(cred) (current->fsuid) 153 #define current_fsuid(cred) (current->fsuid)
154 #define current_fsgid(cred) (current->fsgid) 154 #define current_fsgid(cred) (current->fsgid)
155 155
156 #define NBPP PAGE_SIZE 156 #define NBPP PAGE_SIZE
157 #define DPPSHFT (PAGE_SHIFT - 9) 157 #define DPPSHFT (PAGE_SHIFT - 9)
158 #define NDPP (1 << (PAGE_SHIFT - 9)) 158 #define NDPP (1 << (PAGE_SHIFT - 9))
159 #define dtop(DD) (((DD) + NDPP - 1) >> DPPSHFT) 159 #define dtop(DD) (((DD) + NDPP - 1) >> DPPSHFT)
160 #define dtopt(DD) ((DD) >> DPPSHFT) 160 #define dtopt(DD) ((DD) >> DPPSHFT)
161 #define dpoff(DD) ((DD) & (NDPP-1)) 161 #define dpoff(DD) ((DD) & (NDPP-1))
162 162
163 #define NBBY 8 /* number of bits per byte */ 163 #define NBBY 8 /* number of bits per byte */
164 #define NBPC PAGE_SIZE /* Number of bytes per click */ 164 #define NBPC PAGE_SIZE /* Number of bytes per click */
165 #define BPCSHIFT PAGE_SHIFT /* LOG2(NBPC) if exact */ 165 #define BPCSHIFT PAGE_SHIFT /* LOG2(NBPC) if exact */
166 166
167 /* 167 /*
168 * Size of block device i/o is parameterized here. 168 * Size of block device i/o is parameterized here.
169 * Currently the system supports page-sized i/o. 169 * Currently the system supports page-sized i/o.
170 */ 170 */
171 #define BLKDEV_IOSHIFT BPCSHIFT 171 #define BLKDEV_IOSHIFT BPCSHIFT
172 #define BLKDEV_IOSIZE (1<<BLKDEV_IOSHIFT) 172 #define BLKDEV_IOSIZE (1<<BLKDEV_IOSHIFT)
173 /* number of BB's per block device block */ 173 /* number of BB's per block device block */
174 #define BLKDEV_BB BTOBB(BLKDEV_IOSIZE) 174 #define BLKDEV_BB BTOBB(BLKDEV_IOSIZE)
175 175
176 /* bytes to clicks */ 176 /* bytes to clicks */
177 #define btoc(x) (((__psunsigned_t)(x)+(NBPC-1))>>BPCSHIFT) 177 #define btoc(x) (((__psunsigned_t)(x)+(NBPC-1))>>BPCSHIFT)
178 #define btoct(x) ((__psunsigned_t)(x)>>BPCSHIFT) 178 #define btoct(x) ((__psunsigned_t)(x)>>BPCSHIFT)
179 #define btoc64(x) (((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT) 179 #define btoc64(x) (((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT)
180 #define btoct64(x) ((__uint64_t)(x)>>BPCSHIFT) 180 #define btoct64(x) ((__uint64_t)(x)>>BPCSHIFT)
181 #define io_btoc(x) (((__psunsigned_t)(x)+(IO_NBPC-1))>>IO_BPCSHIFT) 181 #define io_btoc(x) (((__psunsigned_t)(x)+(IO_NBPC-1))>>IO_BPCSHIFT)
182 #define io_btoct(x) ((__psunsigned_t)(x)>>IO_BPCSHIFT) 182 #define io_btoct(x) ((__psunsigned_t)(x)>>IO_BPCSHIFT)
183 183
184 /* off_t bytes to clicks */ 184 /* off_t bytes to clicks */
185 #define offtoc(x) (((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT) 185 #define offtoc(x) (((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT)
186 #define offtoct(x) ((xfs_off_t)(x)>>BPCSHIFT) 186 #define offtoct(x) ((xfs_off_t)(x)>>BPCSHIFT)
187 187
188 /* clicks to off_t bytes */ 188 /* clicks to off_t bytes */
189 #define ctooff(x) ((xfs_off_t)(x)<<BPCSHIFT) 189 #define ctooff(x) ((xfs_off_t)(x)<<BPCSHIFT)
190 190
191 /* clicks to bytes */ 191 /* clicks to bytes */
192 #define ctob(x) ((__psunsigned_t)(x)<<BPCSHIFT) 192 #define ctob(x) ((__psunsigned_t)(x)<<BPCSHIFT)
193 #define btoct(x) ((__psunsigned_t)(x)>>BPCSHIFT) 193 #define btoct(x) ((__psunsigned_t)(x)>>BPCSHIFT)
194 #define ctob64(x) ((__uint64_t)(x)<<BPCSHIFT) 194 #define ctob64(x) ((__uint64_t)(x)<<BPCSHIFT)
195 #define io_ctob(x) ((__psunsigned_t)(x)<<IO_BPCSHIFT) 195 #define io_ctob(x) ((__psunsigned_t)(x)<<IO_BPCSHIFT)
196 196
197 /* bytes to clicks */ 197 /* bytes to clicks */
198 #define btoc(x) (((__psunsigned_t)(x)+(NBPC-1))>>BPCSHIFT) 198 #define btoc(x) (((__psunsigned_t)(x)+(NBPC-1))>>BPCSHIFT)
199 199
200 #ifndef CELL_CAPABLE 200 #ifndef CELL_CAPABLE
201 #define FSC_NOTIFY_NAME_CHANGED(vp) 201 #define FSC_NOTIFY_NAME_CHANGED(vp)
202 #endif 202 #endif
203 203
204 #ifndef ENOATTR 204 #ifndef ENOATTR
205 #define ENOATTR ENODATA /* Attribute not found */ 205 #define ENOATTR ENODATA /* Attribute not found */
206 #endif 206 #endif
207 207
208 /* Note: EWRONGFS never visible outside the kernel */ 208 /* Note: EWRONGFS never visible outside the kernel */
209 #define EWRONGFS EINVAL /* Mount with wrong filesystem type */ 209 #define EWRONGFS EINVAL /* Mount with wrong filesystem type */
210 210
211 /* 211 /*
212 * XXX EFSCORRUPTED needs a real value in errno.h. asm-i386/errno.h won't 212 * XXX EFSCORRUPTED needs a real value in errno.h. asm-i386/errno.h won't
213 * return codes out of its known range in errno. 213 * return codes out of its known range in errno.
214 * XXX Also note: needs to be < 1000 and fairly unique on Linux (mustn't 214 * XXX Also note: needs to be < 1000 and fairly unique on Linux (mustn't
215 * conflict with any code we use already or any code a driver may use) 215 * conflict with any code we use already or any code a driver may use)
216 * XXX Some options (currently we do #2): 216 * XXX Some options (currently we do #2):
217 * 1/ New error code ["Filesystem is corrupted", _after_ glibc updated] 217 * 1/ New error code ["Filesystem is corrupted", _after_ glibc updated]
218 * 2/ 990 ["Unknown error 990"] 218 * 2/ 990 ["Unknown error 990"]
219 * 3/ EUCLEAN ["Structure needs cleaning"] 219 * 3/ EUCLEAN ["Structure needs cleaning"]
220 * 4/ Convert EFSCORRUPTED to EIO [just prior to return into userspace] 220 * 4/ Convert EFSCORRUPTED to EIO [just prior to return into userspace]
221 */ 221 */
222 #define EFSCORRUPTED 990 /* Filesystem is corrupted */ 222 #define EFSCORRUPTED 990 /* Filesystem is corrupted */
223 223
224 #define SYNCHRONIZE() barrier() 224 #define SYNCHRONIZE() barrier()
225 #define __return_address __builtin_return_address(0) 225 #define __return_address __builtin_return_address(0)
226 226
227 /* 227 /*
228 * IRIX (BSD) quotactl makes use of separate commands for user/group, 228 * IRIX (BSD) quotactl makes use of separate commands for user/group,
229 * whereas on Linux the syscall encodes this information into the cmd 229 * whereas on Linux the syscall encodes this information into the cmd
230 * field (see the QCMD macro in quota.h). These macros help keep the 230 * field (see the QCMD macro in quota.h). These macros help keep the
231 * code portable - they are not visible from the syscall interface. 231 * code portable - they are not visible from the syscall interface.
232 */ 232 */
233 #define Q_XSETGQLIM XQM_CMD(0x8) /* set groups disk limits */ 233 #define Q_XSETGQLIM XQM_CMD(0x8) /* set groups disk limits */
234 #define Q_XGETGQUOTA XQM_CMD(0x9) /* get groups disk limits */ 234 #define Q_XGETGQUOTA XQM_CMD(0x9) /* get groups disk limits */
235 235
236 /* IRIX uses a dynamic sizing algorithm (ndquot = 200 + numprocs*2) */ 236 /* IRIX uses a dynamic sizing algorithm (ndquot = 200 + numprocs*2) */
237 /* we may well need to fine-tune this if it ever becomes an issue. */ 237 /* we may well need to fine-tune this if it ever becomes an issue. */
238 #define DQUOT_MAX_HEURISTIC 1024 /* NR_DQUOTS */ 238 #define DQUOT_MAX_HEURISTIC 1024 /* NR_DQUOTS */
239 #define ndquot DQUOT_MAX_HEURISTIC 239 #define ndquot DQUOT_MAX_HEURISTIC
240 240
241 /* IRIX uses the current size of the name cache to guess a good value */ 241 /* IRIX uses the current size of the name cache to guess a good value */
242 /* - this isn't the same but is a good enough starting point for now. */ 242 /* - this isn't the same but is a good enough starting point for now. */
243 #define DQUOT_HASH_HEURISTIC files_stat.nr_files 243 #define DQUOT_HASH_HEURISTIC files_stat.nr_files
244 244
245 /* IRIX inodes maintain the project ID also, zero this field on Linux */ 245 /* IRIX inodes maintain the project ID also, zero this field on Linux */
246 #define DEFAULT_PROJID 0 246 #define DEFAULT_PROJID 0
247 #define dfltprid DEFAULT_PROJID 247 #define dfltprid DEFAULT_PROJID
248 248
249 #define MAXPATHLEN 1024 249 #define MAXPATHLEN 1024
250 250
251 #define MIN(a,b) (min(a,b)) 251 #define MIN(a,b) (min(a,b))
252 #define MAX(a,b) (max(a,b)) 252 #define MAX(a,b) (max(a,b))
253 #define howmany(x, y) (((x)+((y)-1))/(y)) 253 #define howmany(x, y) (((x)+((y)-1))/(y))
254 #define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) 254 #define roundup(x, y) ((((x)+((y)-1))/(y))*(y))
255 255
256 #define xfs_stack_trace() dump_stack() 256 #define xfs_stack_trace() dump_stack()
257 257
258 #define xfs_itruncate_data(ip, off) \ 258 #define xfs_itruncate_data(ip, off) \
259 (-vmtruncate(LINVFS_GET_IP(XFS_ITOV(ip)), (off))) 259 (-vmtruncate(LINVFS_GET_IP(XFS_ITOV(ip)), (off)))
260 260
261 261
262 /* Move the kernel do_div definition off to one side */ 262 /* Move the kernel do_div definition off to one side */
263 263
264 #if defined __i386__ 264 #if defined __i386__
265 /* For ia32 we need to pull some tricks to get past various versions 265 /* For ia32 we need to pull some tricks to get past various versions
266 * of the compiler which do not like us using do_div in the middle 266 * of the compiler which do not like us using do_div in the middle
267 * of large functions. 267 * of large functions.
268 */ 268 */
269 static inline __u32 xfs_do_div(void *a, __u32 b, int n) 269 static inline __u32 xfs_do_div(void *a, __u32 b, int n)
270 { 270 {
271 __u32 mod; 271 __u32 mod;
272 272
273 switch (n) { 273 switch (n) {
274 case 4: 274 case 4:
275 mod = *(__u32 *)a % b; 275 mod = *(__u32 *)a % b;
276 *(__u32 *)a = *(__u32 *)a / b; 276 *(__u32 *)a = *(__u32 *)a / b;
277 return mod; 277 return mod;
278 case 8: 278 case 8:
279 { 279 {
280 unsigned long __upper, __low, __high, __mod; 280 unsigned long __upper, __low, __high, __mod;
281 __u64 c = *(__u64 *)a; 281 __u64 c = *(__u64 *)a;
282 __upper = __high = c >> 32; 282 __upper = __high = c >> 32;
283 __low = c; 283 __low = c;
284 if (__high) { 284 if (__high) {
285 __upper = __high % (b); 285 __upper = __high % (b);
286 __high = __high / (b); 286 __high = __high / (b);
287 } 287 }
288 asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper)); 288 asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
289 asm("":"=A" (c):"a" (__low),"d" (__high)); 289 asm("":"=A" (c):"a" (__low),"d" (__high));
290 *(__u64 *)a = c; 290 *(__u64 *)a = c;
291 return __mod; 291 return __mod;
292 } 292 }
293 } 293 }
294 294
295 /* NOTREACHED */ 295 /* NOTREACHED */
296 return 0; 296 return 0;
297 } 297 }
298 298
299 /* Side effect free 64 bit mod operation */ 299 /* Side effect free 64 bit mod operation */
300 static inline __u32 xfs_do_mod(void *a, __u32 b, int n) 300 static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
301 { 301 {
302 switch (n) { 302 switch (n) {
303 case 4: 303 case 4:
304 return *(__u32 *)a % b; 304 return *(__u32 *)a % b;
305 case 8: 305 case 8:
306 { 306 {
307 unsigned long __upper, __low, __high, __mod; 307 unsigned long __upper, __low, __high, __mod;
308 __u64 c = *(__u64 *)a; 308 __u64 c = *(__u64 *)a;
309 __upper = __high = c >> 32; 309 __upper = __high = c >> 32;
310 __low = c; 310 __low = c;
311 if (__high) { 311 if (__high) {
312 __upper = __high % (b); 312 __upper = __high % (b);
313 __high = __high / (b); 313 __high = __high / (b);
314 } 314 }
315 asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper)); 315 asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
316 asm("":"=A" (c):"a" (__low),"d" (__high)); 316 asm("":"=A" (c):"a" (__low),"d" (__high));
317 return __mod; 317 return __mod;
318 } 318 }
319 } 319 }
320 320
321 /* NOTREACHED */ 321 /* NOTREACHED */
322 return 0; 322 return 0;
323 } 323 }
324 #else 324 #else
325 static inline __u32 xfs_do_div(void *a, __u32 b, int n) 325 static inline __u32 xfs_do_div(void *a, __u32 b, int n)
326 { 326 {
327 __u32 mod; 327 __u32 mod;
328 328
329 switch (n) { 329 switch (n) {
330 case 4: 330 case 4:
331 mod = *(__u32 *)a % b; 331 mod = *(__u32 *)a % b;
332 *(__u32 *)a = *(__u32 *)a / b; 332 *(__u32 *)a = *(__u32 *)a / b;
333 return mod; 333 return mod;
334 case 8: 334 case 8:
335 mod = do_div(*(__u64 *)a, b); 335 mod = do_div(*(__u64 *)a, b);
336 return mod; 336 return mod;
337 } 337 }
338 338
339 /* NOTREACHED */ 339 /* NOTREACHED */
340 return 0; 340 return 0;
341 } 341 }
342 342
343 /* Side effect free 64 bit mod operation */ 343 /* Side effect free 64 bit mod operation */
344 static inline __u32 xfs_do_mod(void *a, __u32 b, int n) 344 static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
345 { 345 {
346 switch (n) { 346 switch (n) {
347 case 4: 347 case 4:
348 return *(__u32 *)a % b; 348 return *(__u32 *)a % b;
349 case 8: 349 case 8:
350 { 350 {
351 __u64 c = *(__u64 *)a; 351 __u64 c = *(__u64 *)a;
352 return do_div(c, b); 352 return do_div(c, b);
353 } 353 }
354 } 354 }
355 355
356 /* NOTREACHED */ 356 /* NOTREACHED */
357 return 0; 357 return 0;
358 } 358 }
359 #endif 359 #endif
360 360
361 #undef do_div 361 #undef do_div
362 #define do_div(a, b) xfs_do_div(&(a), (b), sizeof(a)) 362 #define do_div(a, b) xfs_do_div(&(a), (b), sizeof(a))
363 #define do_mod(a, b) xfs_do_mod(&(a), (b), sizeof(a)) 363 #define do_mod(a, b) xfs_do_mod(&(a), (b), sizeof(a))
364 364
365 static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y) 365 static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y)
366 { 366 {
367 x += y - 1; 367 x += y - 1;
368 do_div(x, y); 368 do_div(x, y);
369 return(x * y); 369 return(x * y);
370 } 370 }
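The wrappers above preserve the kernel do_div() calling convention while sidestepping the compiler issue described in the comment: the dividend is updated in place through the pointer and the remainder is returned, which is exactly what roundup_64() depends on. A small userspace check of the same convention, using plain C arithmetic instead of the kernel macro (demo_* names are made up for the sketch), might look like this; it is not part of the header.

#include <stdio.h>
#include <stdint.h>

/* Same contract as do_div(): divide *a by b in place, return remainder. */
static uint32_t demo_do_div(uint64_t *a, uint32_t b)
{
	uint32_t rem = (uint32_t)(*a % b);

	*a /= b;
	return rem;
}

/* Mirrors roundup_64(): round x up to the next multiple of y. */
static uint64_t demo_roundup_64(uint64_t x, uint32_t y)
{
	x += y - 1;
	demo_do_div(&x, y);
	return x * y;
}

int main(void)
{
	uint64_t v = 1000003;

	printf("remainder %u, quotient %llu\n",
	       demo_do_div(&v, 4096), (unsigned long long)v);
	printf("rounded up to a 4096 multiple: %llu\n",
	       (unsigned long long)demo_roundup_64(1000003, 4096));
	return 0;
}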
371 371
372 #define qsort(a, n, s, cmp) sort(a, n, s, cmp, NULL) 372 #define qsort(a, n, s, cmp) sort(a, n, s, cmp, NULL)
373 373
374 #endif /* __XFS_LINUX__ */ 374 #endif /* __XFS_LINUX__ */
375 375
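The current_cpu() switch in this header is representative of why the raw accessor exists: it feeds paths such as the XFS per-cpu statistics macros, which only need a plausible CPU number and do not care if the caller migrates immediately afterwards, so the non-debug-checked variant is the right fit, with the #ifndef above acting as a fallback for configurations that do not provide it. A rough userspace analogy for this kind of advisory CPU identification, assuming glibc's sched_getcpu() is available, is sketched below; it illustrates the idea and is not kernel code.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	/* Like raw_smp_processor_id() in preemptible kernel code, the
	 * answer may be stale the moment it is returned: the thread can
	 * migrate to another CPU at any time.  That is harmless when the
	 * value only selects a statistics bucket, as current_cpu() does
	 * for the XFS per-cpu counters.
	 */
	int cpu = sched_getcpu();

	if (cpu >= 0)
		printf("currently (approximately) on cpu %d\n", cpu);
	else
		perror("sched_getcpu");
	return 0;
}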
include/asm-alpha/smp.h
1 #ifndef __ASM_SMP_H 1 #ifndef __ASM_SMP_H
2 #define __ASM_SMP_H 2 #define __ASM_SMP_H
3 3
4 #include <linux/config.h> 4 #include <linux/config.h>
5 #include <linux/threads.h> 5 #include <linux/threads.h>
6 #include <linux/cpumask.h> 6 #include <linux/cpumask.h>
7 #include <linux/bitops.h> 7 #include <linux/bitops.h>
8 #include <asm/pal.h> 8 #include <asm/pal.h>
9 9
10 /* HACK: Cabrio WHAMI return value is bogus if more than 8 bits used.. :-( */ 10 /* HACK: Cabrio WHAMI return value is bogus if more than 8 bits used.. :-( */
11 11
12 static __inline__ unsigned char 12 static __inline__ unsigned char
13 __hard_smp_processor_id(void) 13 __hard_smp_processor_id(void)
14 { 14 {
15 register unsigned char __r0 __asm__("$0"); 15 register unsigned char __r0 __asm__("$0");
16 __asm__ __volatile__( 16 __asm__ __volatile__(
17 "call_pal %1 #whami" 17 "call_pal %1 #whami"
18 : "=r"(__r0) 18 : "=r"(__r0)
19 :"i" (PAL_whami) 19 :"i" (PAL_whami)
20 : "$1", "$22", "$23", "$24", "$25"); 20 : "$1", "$22", "$23", "$24", "$25");
21 return __r0; 21 return __r0;
22 } 22 }
23 23
24 #ifdef CONFIG_SMP 24 #ifdef CONFIG_SMP
25 25
26 #include <asm/irq.h> 26 #include <asm/irq.h>
27 27
28 struct cpuinfo_alpha { 28 struct cpuinfo_alpha {
29 unsigned long loops_per_jiffy; 29 unsigned long loops_per_jiffy;
30 unsigned long last_asn; 30 unsigned long last_asn;
31 int need_new_asn; 31 int need_new_asn;
32 int asn_lock; 32 int asn_lock;
33 unsigned long ipi_count; 33 unsigned long ipi_count;
34 unsigned long prof_multiplier; 34 unsigned long prof_multiplier;
35 unsigned long prof_counter; 35 unsigned long prof_counter;
36 unsigned char mcheck_expected; 36 unsigned char mcheck_expected;
37 unsigned char mcheck_taken; 37 unsigned char mcheck_taken;
38 unsigned char mcheck_extra; 38 unsigned char mcheck_extra;
39 } __attribute__((aligned(64))); 39 } __attribute__((aligned(64)));
40 40
41 extern struct cpuinfo_alpha cpu_data[NR_CPUS]; 41 extern struct cpuinfo_alpha cpu_data[NR_CPUS];
42 42
43 #define PROC_CHANGE_PENALTY 20 43 #define PROC_CHANGE_PENALTY 20
44 44
45 #define hard_smp_processor_id() __hard_smp_processor_id() 45 #define hard_smp_processor_id() __hard_smp_processor_id()
46 #define smp_processor_id() (current_thread_info()->cpu) 46 #define raw_smp_processor_id() (current_thread_info()->cpu)
47 47
48 extern cpumask_t cpu_present_mask; 48 extern cpumask_t cpu_present_mask;
49 extern cpumask_t cpu_online_map; 49 extern cpumask_t cpu_online_map;
50 extern int smp_num_cpus; 50 extern int smp_num_cpus;
51 #define cpu_possible_map cpu_present_mask 51 #define cpu_possible_map cpu_present_mask
52 52
53 int smp_call_function_on_cpu(void (*func) (void *info), void *info,int retry, int wait, cpumask_t cpu); 53 int smp_call_function_on_cpu(void (*func) (void *info), void *info,int retry, int wait, cpumask_t cpu);
54 54
55 #else /* CONFIG_SMP */ 55 #else /* CONFIG_SMP */
56 56
57 #define smp_call_function_on_cpu(func,info,retry,wait,cpu) ({ 0; }) 57 #define smp_call_function_on_cpu(func,info,retry,wait,cpu) ({ 0; })
58 58
59 #endif /* CONFIG_SMP */ 59 #endif /* CONFIG_SMP */
60 60
61 #define NO_PROC_ID (-1) 61 #define NO_PROC_ID (-1)
62 62
63 #endif 63 #endif
64 64
include/asm-arm/smp.h
1 /* 1 /*
2 * linux/include/asm-arm/smp.h 2 * linux/include/asm-arm/smp.h
3 * 3 *
4 * Copyright (C) 2004-2005 ARM Ltd. 4 * Copyright (C) 2004-2005 ARM Ltd.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as 7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation. 8 * published by the Free Software Foundation.
9 */ 9 */
10 #ifndef __ASM_ARM_SMP_H 10 #ifndef __ASM_ARM_SMP_H
11 #define __ASM_ARM_SMP_H 11 #define __ASM_ARM_SMP_H
12 12
13 #include <linux/config.h> 13 #include <linux/config.h>
14 #include <linux/threads.h> 14 #include <linux/threads.h>
15 #include <linux/cpumask.h> 15 #include <linux/cpumask.h>
16 #include <linux/thread_info.h> 16 #include <linux/thread_info.h>
17 17
18 #include <asm/arch/smp.h> 18 #include <asm/arch/smp.h>
19 19
20 #ifndef CONFIG_SMP 20 #ifndef CONFIG_SMP
21 # error "<asm-arm/smp.h> included in non-SMP build" 21 # error "<asm-arm/smp.h> included in non-SMP build"
22 #endif 22 #endif
23 23
24 #define smp_processor_id() (current_thread_info()->cpu) 24 #define raw_smp_processor_id() (current_thread_info()->cpu)
25 25
26 extern cpumask_t cpu_present_mask; 26 extern cpumask_t cpu_present_mask;
27 #define cpu_possible_map cpu_present_mask 27 #define cpu_possible_map cpu_present_mask
28 28
29 /* 29 /*
30 * at the moment, there's not a big penalty for changing CPUs 30 * at the moment, there's not a big penalty for changing CPUs
31 * (the >big< penalty is running SMP in the first place) 31 * (the >big< penalty is running SMP in the first place)
32 */ 32 */
33 #define PROC_CHANGE_PENALTY 15 33 #define PROC_CHANGE_PENALTY 15
34 34
35 struct seq_file; 35 struct seq_file;
36 36
37 /* 37 /*
38 * generate IPI list text 38 * generate IPI list text
39 */ 39 */
40 extern void show_ipi_list(struct seq_file *p); 40 extern void show_ipi_list(struct seq_file *p);
41 41
42 /* 42 /*
43 * Move global data into per-processor storage. 43 * Move global data into per-processor storage.
44 */ 44 */
45 extern void smp_store_cpu_info(unsigned int cpuid); 45 extern void smp_store_cpu_info(unsigned int cpuid);
46 46
47 /* 47 /*
48 * Raise an IPI cross call on CPUs in callmap. 48 * Raise an IPI cross call on CPUs in callmap.
49 */ 49 */
50 extern void smp_cross_call(cpumask_t callmap); 50 extern void smp_cross_call(cpumask_t callmap);
51 51
52 /* 52 /*
53 * Boot a secondary CPU, and assign it the specified idle task. 53 * Boot a secondary CPU, and assign it the specified idle task.
54 * This also gives us the initial stack to use for this CPU. 54 * This also gives us the initial stack to use for this CPU.
55 */ 55 */
56 extern int boot_secondary(unsigned int cpu, struct task_struct *); 56 extern int boot_secondary(unsigned int cpu, struct task_struct *);
57 57
58 /* 58 /*
59 * Perform platform specific initialisation of the specified CPU. 59 * Perform platform specific initialisation of the specified CPU.
60 */ 60 */
61 extern void platform_secondary_init(unsigned int cpu); 61 extern void platform_secondary_init(unsigned int cpu);
62 62
63 /* 63 /*
64 * Initial data for bringing up a secondary CPU. 64 * Initial data for bringing up a secondary CPU.
65 */ 65 */
66 struct secondary_data { 66 struct secondary_data {
67 unsigned long pgdir; 67 unsigned long pgdir;
68 void *stack; 68 void *stack;
69 }; 69 };
70 extern struct secondary_data secondary_data; 70 extern struct secondary_data secondary_data;
71 71
72 #endif /* ifndef __ASM_ARM_SMP_H */ 72 #endif /* ifndef __ASM_ARM_SMP_H */
73 73
include/asm-i386/smp.h
1 #ifndef __ASM_SMP_H 1 #ifndef __ASM_SMP_H
2 #define __ASM_SMP_H 2 #define __ASM_SMP_H
3 3
4 /* 4 /*
5 * We need the APIC definitions automatically as part of 'smp.h' 5 * We need the APIC definitions automatically as part of 'smp.h'
6 */ 6 */
7 #ifndef __ASSEMBLY__ 7 #ifndef __ASSEMBLY__
8 #include <linux/config.h> 8 #include <linux/config.h>
9 #include <linux/kernel.h> 9 #include <linux/kernel.h>
10 #include <linux/threads.h> 10 #include <linux/threads.h>
11 #include <linux/cpumask.h> 11 #include <linux/cpumask.h>
12 #endif 12 #endif
13 13
14 #ifdef CONFIG_X86_LOCAL_APIC 14 #ifdef CONFIG_X86_LOCAL_APIC
15 #ifndef __ASSEMBLY__ 15 #ifndef __ASSEMBLY__
16 #include <asm/fixmap.h> 16 #include <asm/fixmap.h>
17 #include <asm/bitops.h> 17 #include <asm/bitops.h>
18 #include <asm/mpspec.h> 18 #include <asm/mpspec.h>
19 #ifdef CONFIG_X86_IO_APIC 19 #ifdef CONFIG_X86_IO_APIC
20 #include <asm/io_apic.h> 20 #include <asm/io_apic.h>
21 #endif 21 #endif
22 #include <asm/apic.h> 22 #include <asm/apic.h>
23 #endif 23 #endif
24 #endif 24 #endif
25 25
26 #define BAD_APICID 0xFFu 26 #define BAD_APICID 0xFFu
27 #ifdef CONFIG_SMP 27 #ifdef CONFIG_SMP
28 #ifndef __ASSEMBLY__ 28 #ifndef __ASSEMBLY__
29 29
30 /* 30 /*
31 * Private routines/data 31 * Private routines/data
32 */ 32 */
33 33
34 extern void smp_alloc_memory(void); 34 extern void smp_alloc_memory(void);
35 extern int pic_mode; 35 extern int pic_mode;
36 extern int smp_num_siblings; 36 extern int smp_num_siblings;
37 extern cpumask_t cpu_sibling_map[]; 37 extern cpumask_t cpu_sibling_map[];
38 extern cpumask_t cpu_core_map[]; 38 extern cpumask_t cpu_core_map[];
39 39
40 extern void smp_flush_tlb(void); 40 extern void smp_flush_tlb(void);
41 extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs); 41 extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs);
42 extern void smp_invalidate_rcv(void); /* Process an NMI */ 42 extern void smp_invalidate_rcv(void); /* Process an NMI */
43 extern void (*mtrr_hook) (void); 43 extern void (*mtrr_hook) (void);
44 extern void zap_low_mappings (void); 44 extern void zap_low_mappings (void);
45 45
46 #define MAX_APICID 256 46 #define MAX_APICID 256
47 extern u8 x86_cpu_to_apicid[]; 47 extern u8 x86_cpu_to_apicid[];
48 48
49 /* 49 /*
50 * This function is needed by all SMP systems. It must _always_ be valid 50 * This function is needed by all SMP systems. It must _always_ be valid
51 * from the initial startup. We map APIC_BASE very early in page_setup(), 51 * from the initial startup. We map APIC_BASE very early in page_setup(),
52 * so this is correct in the x86 case. 52 * so this is correct in the x86 case.
53 */ 53 */
54 #define __smp_processor_id() (current_thread_info()->cpu) 54 #define raw_smp_processor_id() (current_thread_info()->cpu)
55 55
56 extern cpumask_t cpu_callout_map; 56 extern cpumask_t cpu_callout_map;
57 extern cpumask_t cpu_callin_map; 57 extern cpumask_t cpu_callin_map;
58 #define cpu_possible_map cpu_callout_map 58 #define cpu_possible_map cpu_callout_map
59 59
60 /* We don't mark CPUs online until __cpu_up(), so we need another measure */ 60 /* We don't mark CPUs online until __cpu_up(), so we need another measure */
61 static inline int num_booting_cpus(void) 61 static inline int num_booting_cpus(void)
62 { 62 {
63 return cpus_weight(cpu_callout_map); 63 return cpus_weight(cpu_callout_map);
64 } 64 }
65 65
66 #ifdef CONFIG_X86_LOCAL_APIC 66 #ifdef CONFIG_X86_LOCAL_APIC
67 67
68 #ifdef APIC_DEFINITION 68 #ifdef APIC_DEFINITION
69 extern int hard_smp_processor_id(void); 69 extern int hard_smp_processor_id(void);
70 #else 70 #else
71 #include <mach_apicdef.h> 71 #include <mach_apicdef.h>
72 static inline int hard_smp_processor_id(void) 72 static inline int hard_smp_processor_id(void)
73 { 73 {
74 /* we don't want to mark this access volatile - bad code generation */ 74 /* we don't want to mark this access volatile - bad code generation */
75 return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID)); 75 return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
76 } 76 }
77 #endif 77 #endif
78 78
79 static __inline int logical_smp_processor_id(void) 79 static __inline int logical_smp_processor_id(void)
80 { 80 {
81 /* we don't want to mark this access volatile - bad code generation */ 81 /* we don't want to mark this access volatile - bad code generation */
82 return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR)); 82 return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
83 } 83 }
84 84
85 #endif 85 #endif
86 #endif /* !__ASSEMBLY__ */ 86 #endif /* !__ASSEMBLY__ */
87 87
88 #define NO_PROC_ID 0xFF /* No processor magic marker */ 88 #define NO_PROC_ID 0xFF /* No processor magic marker */
89 89
90 #endif 90 #endif
91 #endif 91 #endif
92 92
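Two different identities are exposed by this header: raw_smp_processor_id() returns the kernel's logical CPU number cached in thread_info, while hard_smp_processor_id() reads the physical APIC ID. A compressed userspace model of how a debug-checking wrapper can sit on top of the raw per-architecture definition is sketched below; apart from raw_smp_processor_id(), the names and the check itself are simplified stand-ins, and the real checking logic lives elsewhere in the tree rather than in this header.

#include <stdio.h>

static int thread_info_cpu = 3;   /* models current_thread_info()->cpu */
static int preempt_count   = 0;   /* zero means "preemptible" here     */

#define raw_smp_processor_id()  (thread_info_cpu)

/* Debug-flavoured variant: same value, but complain if the caller
 * could migrate before using it. */
static int checked_processor_id(void)
{
	if (preempt_count == 0)
		fprintf(stderr, "warning: CPU number used in preemptible context\n");
	return raw_smp_processor_id();
}

#define smp_processor_id() checked_processor_id()

int main(void)
{
	printf("raw: %d\n", raw_smp_processor_id());
	preempt_count++;                 /* pretend preemption is disabled */
	printf("checked: %d\n", smp_processor_id());
	return 0;
}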
include/asm-ia64/smp.h
1 /* 1 /*
2 * SMP Support 2 * SMP Support
3 * 3 *
4 * Copyright (C) 1999 VA Linux Systems 4 * Copyright (C) 1999 VA Linux Systems
5 * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> 5 * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
6 * (c) Copyright 2001-2003, 2005 Hewlett-Packard Development Company, L.P. 6 * (c) Copyright 2001-2003, 2005 Hewlett-Packard Development Company, L.P.
7 * David Mosberger-Tang <davidm@hpl.hp.com> 7 * David Mosberger-Tang <davidm@hpl.hp.com>
8 * Bjorn Helgaas <bjorn.helgaas@hp.com> 8 * Bjorn Helgaas <bjorn.helgaas@hp.com>
9 */ 9 */
10 #ifndef _ASM_IA64_SMP_H 10 #ifndef _ASM_IA64_SMP_H
11 #define _ASM_IA64_SMP_H 11 #define _ASM_IA64_SMP_H
12 12
13 #include <linux/config.h> 13 #include <linux/config.h>
14 #include <linux/init.h> 14 #include <linux/init.h>
15 #include <linux/threads.h> 15 #include <linux/threads.h>
16 #include <linux/kernel.h> 16 #include <linux/kernel.h>
17 #include <linux/cpumask.h> 17 #include <linux/cpumask.h>
18 18
19 #include <asm/bitops.h> 19 #include <asm/bitops.h>
20 #include <asm/io.h> 20 #include <asm/io.h>
21 #include <asm/param.h> 21 #include <asm/param.h>
22 #include <asm/processor.h> 22 #include <asm/processor.h>
23 #include <asm/ptrace.h> 23 #include <asm/ptrace.h>
24 24
25 static inline unsigned int 25 static inline unsigned int
26 ia64_get_lid (void) 26 ia64_get_lid (void)
27 { 27 {
28 union { 28 union {
29 struct { 29 struct {
30 unsigned long reserved : 16; 30 unsigned long reserved : 16;
31 unsigned long eid : 8; 31 unsigned long eid : 8;
32 unsigned long id : 8; 32 unsigned long id : 8;
33 unsigned long ignored : 32; 33 unsigned long ignored : 32;
34 } f; 34 } f;
35 unsigned long bits; 35 unsigned long bits;
36 } lid; 36 } lid;
37 37
38 lid.bits = ia64_getreg(_IA64_REG_CR_LID); 38 lid.bits = ia64_getreg(_IA64_REG_CR_LID);
39 return lid.f.id << 8 | lid.f.eid; 39 return lid.f.id << 8 | lid.f.eid;
40 } 40 }
41 41
42 #ifdef CONFIG_SMP 42 #ifdef CONFIG_SMP
43 43
44 #define XTP_OFFSET 0x1e0008 44 #define XTP_OFFSET 0x1e0008
45 45
46 #define SMP_IRQ_REDIRECTION (1 << 0) 46 #define SMP_IRQ_REDIRECTION (1 << 0)
47 #define SMP_IPI_REDIRECTION (1 << 1) 47 #define SMP_IPI_REDIRECTION (1 << 1)
48 48
49 #define smp_processor_id() (current_thread_info()->cpu) 49 #define raw_smp_processor_id() (current_thread_info()->cpu)
50 50
51 extern struct smp_boot_data { 51 extern struct smp_boot_data {
52 int cpu_count; 52 int cpu_count;
53 int cpu_phys_id[NR_CPUS]; 53 int cpu_phys_id[NR_CPUS];
54 } smp_boot_data __initdata; 54 } smp_boot_data __initdata;
55 55
56 extern char no_int_routing __devinitdata; 56 extern char no_int_routing __devinitdata;
57 57
58 extern cpumask_t cpu_online_map; 58 extern cpumask_t cpu_online_map;
59 extern cpumask_t cpu_core_map[NR_CPUS]; 59 extern cpumask_t cpu_core_map[NR_CPUS];
60 extern cpumask_t cpu_sibling_map[NR_CPUS]; 60 extern cpumask_t cpu_sibling_map[NR_CPUS];
61 extern int smp_num_siblings; 61 extern int smp_num_siblings;
62 extern int smp_num_cpucores; 62 extern int smp_num_cpucores;
63 extern void __iomem *ipi_base_addr; 63 extern void __iomem *ipi_base_addr;
64 extern unsigned char smp_int_redirect; 64 extern unsigned char smp_int_redirect;
65 65
66 extern volatile int ia64_cpu_to_sapicid[]; 66 extern volatile int ia64_cpu_to_sapicid[];
67 #define cpu_physical_id(i) ia64_cpu_to_sapicid[i] 67 #define cpu_physical_id(i) ia64_cpu_to_sapicid[i]
68 68
69 extern unsigned long ap_wakeup_vector; 69 extern unsigned long ap_wakeup_vector;
70 70
71 /* 71 /*
72 * Function to map hard smp processor id to logical id. Slow, so don't use this in 72 * Function to map hard smp processor id to logical id. Slow, so don't use this in
73 * performance-critical code. 73 * performance-critical code.
74 */ 74 */
75 static inline int 75 static inline int
76 cpu_logical_id (int cpuid) 76 cpu_logical_id (int cpuid)
77 { 77 {
78 int i; 78 int i;
79 79
80 for (i = 0; i < NR_CPUS; ++i) 80 for (i = 0; i < NR_CPUS; ++i)
81 if (cpu_physical_id(i) == cpuid) 81 if (cpu_physical_id(i) == cpuid)
82 break; 82 break;
83 return i; 83 return i;
84 } 84 }
85 85
86 /* 86 /*
87 * XTP control functions: 87 * XTP control functions:
88 * min_xtp : route all interrupts to this CPU 88 * min_xtp : route all interrupts to this CPU
89 * normal_xtp: nominal XTP value 89 * normal_xtp: nominal XTP value
90 * max_xtp : never deliver interrupts to this CPU. 90 * max_xtp : never deliver interrupts to this CPU.
91 */ 91 */
92 92
93 static inline void 93 static inline void
94 min_xtp (void) 94 min_xtp (void)
95 { 95 {
96 if (smp_int_redirect & SMP_IRQ_REDIRECTION) 96 if (smp_int_redirect & SMP_IRQ_REDIRECTION)
97 writeb(0x00, ipi_base_addr + XTP_OFFSET); /* XTP to min */ 97 writeb(0x00, ipi_base_addr + XTP_OFFSET); /* XTP to min */
98 } 98 }
99 99
100 static inline void 100 static inline void
101 normal_xtp (void) 101 normal_xtp (void)
102 { 102 {
103 if (smp_int_redirect & SMP_IRQ_REDIRECTION) 103 if (smp_int_redirect & SMP_IRQ_REDIRECTION)
104 writeb(0x08, ipi_base_addr + XTP_OFFSET); /* XTP normal */ 104 writeb(0x08, ipi_base_addr + XTP_OFFSET); /* XTP normal */
105 } 105 }
106 106
107 static inline void 107 static inline void
108 max_xtp (void) 108 max_xtp (void)
109 { 109 {
110 if (smp_int_redirect & SMP_IRQ_REDIRECTION) 110 if (smp_int_redirect & SMP_IRQ_REDIRECTION)
111 writeb(0x0f, ipi_base_addr + XTP_OFFSET); /* Set XTP to max */ 111 writeb(0x0f, ipi_base_addr + XTP_OFFSET); /* Set XTP to max */
112 } 112 }
113 113
114 #define hard_smp_processor_id() ia64_get_lid() 114 #define hard_smp_processor_id() ia64_get_lid()
115 115
116 /* Upping and downing of CPUs */ 116 /* Upping and downing of CPUs */
117 extern int __cpu_disable (void); 117 extern int __cpu_disable (void);
118 extern void __cpu_die (unsigned int cpu); 118 extern void __cpu_die (unsigned int cpu);
119 extern void cpu_die (void) __attribute__ ((noreturn)); 119 extern void cpu_die (void) __attribute__ ((noreturn));
120 extern int __cpu_up (unsigned int cpu); 120 extern int __cpu_up (unsigned int cpu);
121 extern void __init smp_build_cpu_map(void); 121 extern void __init smp_build_cpu_map(void);
122 122
123 extern void __init init_smp_config (void); 123 extern void __init init_smp_config (void);
124 extern void smp_do_timer (struct pt_regs *regs); 124 extern void smp_do_timer (struct pt_regs *regs);
125 125
126 extern int smp_call_function_single (int cpuid, void (*func) (void *info), void *info, 126 extern int smp_call_function_single (int cpuid, void (*func) (void *info), void *info,
127 int retry, int wait); 127 int retry, int wait);
128 extern void smp_send_reschedule (int cpu); 128 extern void smp_send_reschedule (int cpu);
129 extern void lock_ipi_calllock(void); 129 extern void lock_ipi_calllock(void);
130 extern void unlock_ipi_calllock(void); 130 extern void unlock_ipi_calllock(void);
131 extern void identify_siblings (struct cpuinfo_ia64 *); 131 extern void identify_siblings (struct cpuinfo_ia64 *);
132 132
133 #else 133 #else
134 134
135 #define cpu_logical_id(i) 0 135 #define cpu_logical_id(i) 0
136 #define cpu_physical_id(i) ia64_get_lid() 136 #define cpu_physical_id(i) ia64_get_lid()
137 137
138 #endif /* CONFIG_SMP */ 138 #endif /* CONFIG_SMP */
139 #endif /* _ASM_IA64_SMP_H */ 139 #endif /* _ASM_IA64_SMP_H */
140 140
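The LID decode above packs the physical identity as (id << 8) | eid, with id in bits 24..31 and eid in bits 16..23 of the CR LID register, and cpu_logical_id() then recovers the logical index by a linear scan of ia64_cpu_to_sapicid[], which is why the comment warns against using it in performance-critical code. A small standalone check of the same bit layout and reverse mapping, using a made-up register value and table, is below.

#include <stdio.h>

#define DEMO_NR_CPUS 4

/* Hypothetical logical-index -> physical-id table, in the role of
 * ia64_cpu_to_sapicid[]. */
static const unsigned int cpu_to_physid[DEMO_NR_CPUS] = {
	0x0000, 0x0001, 0x0100, 0x0101
};

/* Mirrors ia64_get_lid(): id lives in bits 24..31, eid in bits 16..23. */
static unsigned int decode_lid(unsigned long lid)
{
	unsigned int eid = (lid >> 16) & 0xff;
	unsigned int id  = (lid >> 24) & 0xff;

	return id << 8 | eid;
}

/* Mirrors cpu_logical_id(): linear search, hence "slow". */
static int logical_id(unsigned int physid)
{
	int i;

	for (i = 0; i < DEMO_NR_CPUS; i++)
		if (cpu_to_physid[i] == physid)
			return i;
	return -1;
}

int main(void)
{
	unsigned long fake_cr_lid = 0x01010000UL;   /* id = 0x01, eid = 0x01 */
	unsigned int physid = decode_lid(fake_cr_lid);

	printf("physical id 0x%04x -> logical cpu %d\n", physid, logical_id(physid));
	return 0;
}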
include/asm-m32r/smp.h
1 #ifndef _ASM_M32R_SMP_H 1 #ifndef _ASM_M32R_SMP_H
2 #define _ASM_M32R_SMP_H 2 #define _ASM_M32R_SMP_H
3 3
4 /* $Id$ */ 4 /* $Id$ */
5 5
6 #include <linux/config.h> 6 #include <linux/config.h>
7 7
8 #ifdef CONFIG_SMP 8 #ifdef CONFIG_SMP
9 #ifndef __ASSEMBLY__ 9 #ifndef __ASSEMBLY__
10 10
11 #include <linux/cpumask.h> 11 #include <linux/cpumask.h>
12 #include <linux/spinlock.h> 12 #include <linux/spinlock.h>
13 #include <linux/threads.h> 13 #include <linux/threads.h>
14 #include <asm/m32r.h> 14 #include <asm/m32r.h>
15 15
16 #define PHYSID_ARRAY_SIZE 1 16 #define PHYSID_ARRAY_SIZE 1
17 17
18 struct physid_mask 18 struct physid_mask
19 { 19 {
20 unsigned long mask[PHYSID_ARRAY_SIZE]; 20 unsigned long mask[PHYSID_ARRAY_SIZE];
21 }; 21 };
22 22
23 typedef struct physid_mask physid_mask_t; 23 typedef struct physid_mask physid_mask_t;
24 24
25 #define physid_set(physid, map) set_bit(physid, (map).mask) 25 #define physid_set(physid, map) set_bit(physid, (map).mask)
26 #define physid_clear(physid, map) clear_bit(physid, (map).mask) 26 #define physid_clear(physid, map) clear_bit(physid, (map).mask)
27 #define physid_isset(physid, map) test_bit(physid, (map).mask) 27 #define physid_isset(physid, map) test_bit(physid, (map).mask)
28 #define physid_test_and_set(physid, map) test_and_set_bit(physid, (map).mask) 28 #define physid_test_and_set(physid, map) test_and_set_bit(physid, (map).mask)
29 29
30 #define physids_and(dst, src1, src2) bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS) 30 #define physids_and(dst, src1, src2) bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
31 #define physids_or(dst, src1, src2) bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS) 31 #define physids_or(dst, src1, src2) bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
32 #define physids_clear(map) bitmap_zero((map).mask, MAX_APICS) 32 #define physids_clear(map) bitmap_zero((map).mask, MAX_APICS)
33 #define physids_complement(dst, src) bitmap_complement((dst).mask,(src).mask, MAX_APICS) 33 #define physids_complement(dst, src) bitmap_complement((dst).mask,(src).mask, MAX_APICS)
34 #define physids_empty(map) bitmap_empty((map).mask, MAX_APICS) 34 #define physids_empty(map) bitmap_empty((map).mask, MAX_APICS)
35 #define physids_equal(map1, map2) bitmap_equal((map1).mask, (map2).mask, MAX_APICS) 35 #define physids_equal(map1, map2) bitmap_equal((map1).mask, (map2).mask, MAX_APICS)
36 #define physids_weight(map) bitmap_weight((map).mask, MAX_APICS) 36 #define physids_weight(map) bitmap_weight((map).mask, MAX_APICS)
37 #define physids_shift_right(d, s, n) bitmap_shift_right((d).mask, (s).mask, n, MAX_APICS) 37 #define physids_shift_right(d, s, n) bitmap_shift_right((d).mask, (s).mask, n, MAX_APICS)
38 #define physids_shift_left(d, s, n) bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS) 38 #define physids_shift_left(d, s, n) bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS)
39 #define physids_coerce(map) ((map).mask[0]) 39 #define physids_coerce(map) ((map).mask[0])
40 40
41 #define physids_promote(physids) \ 41 #define physids_promote(physids) \
42 ({ \ 42 ({ \
43 physid_mask_t __physid_mask = PHYSID_MASK_NONE; \ 43 physid_mask_t __physid_mask = PHYSID_MASK_NONE; \
44 __physid_mask.mask[0] = physids; \ 44 __physid_mask.mask[0] = physids; \
45 __physid_mask; \ 45 __physid_mask; \
46 }) 46 })
47 47
48 #define physid_mask_of_physid(physid) \ 48 #define physid_mask_of_physid(physid) \
49 ({ \ 49 ({ \
50 physid_mask_t __physid_mask = PHYSID_MASK_NONE; \ 50 physid_mask_t __physid_mask = PHYSID_MASK_NONE; \
51 physid_set(physid, __physid_mask); \ 51 physid_set(physid, __physid_mask); \
52 __physid_mask; \ 52 __physid_mask; \
53 }) 53 })
54 54
55 #define PHYSID_MASK_ALL { {[0 ... PHYSID_ARRAY_SIZE-1] = ~0UL} } 55 #define PHYSID_MASK_ALL { {[0 ... PHYSID_ARRAY_SIZE-1] = ~0UL} }
56 #define PHYSID_MASK_NONE { {[0 ... PHYSID_ARRAY_SIZE-1] = 0UL} } 56 #define PHYSID_MASK_NONE { {[0 ... PHYSID_ARRAY_SIZE-1] = 0UL} }
57 57
58 extern physid_mask_t phys_cpu_present_map; 58 extern physid_mask_t phys_cpu_present_map;
59 59
60 /* 60 /*
61 * Some lowlevel functions might want to know about 61 * Some lowlevel functions might want to know about
62 * the real CPU ID <-> CPU # mapping. 62 * the real CPU ID <-> CPU # mapping.
63 */ 63 */
64 extern volatile int physid_2_cpu[NR_CPUS]; 64 extern volatile int physid_2_cpu[NR_CPUS];
65 extern volatile int cpu_2_physid[NR_CPUS]; 65 extern volatile int cpu_2_physid[NR_CPUS];
66 #define physid_to_cpu(physid) physid_2_cpu[physid] 66 #define physid_to_cpu(physid) physid_2_cpu[physid]
67 #define cpu_to_physid(cpu_id) cpu_2_physid[cpu_id] 67 #define cpu_to_physid(cpu_id) cpu_2_physid[cpu_id]
68 68
69 #define smp_processor_id() (current_thread_info()->cpu) 69 #define raw_smp_processor_id() (current_thread_info()->cpu)
70 70
71 extern cpumask_t cpu_callout_map; 71 extern cpumask_t cpu_callout_map;
72 #define cpu_possible_map cpu_callout_map 72 #define cpu_possible_map cpu_callout_map
73 73
74 static __inline__ int hard_smp_processor_id(void) 74 static __inline__ int hard_smp_processor_id(void)
75 { 75 {
76 return (int)*(volatile long *)M32R_CPUID_PORTL; 76 return (int)*(volatile long *)M32R_CPUID_PORTL;
77 } 77 }
78 78
79 static __inline__ int cpu_logical_map(int cpu) 79 static __inline__ int cpu_logical_map(int cpu)
80 { 80 {
81 return cpu; 81 return cpu;
82 } 82 }
83 83
84 static __inline__ int cpu_number_map(int cpu) 84 static __inline__ int cpu_number_map(int cpu)
85 { 85 {
86 return cpu; 86 return cpu;
87 } 87 }
88 88
89 static __inline__ unsigned int num_booting_cpus(void) 89 static __inline__ unsigned int num_booting_cpus(void)
90 { 90 {
91 return cpus_weight(cpu_callout_map); 91 return cpus_weight(cpu_callout_map);
92 } 92 }
93 93
94 extern void smp_send_timer(void); 94 extern void smp_send_timer(void);
95 extern unsigned long send_IPI_mask_phys(cpumask_t, int, int); 95 extern unsigned long send_IPI_mask_phys(cpumask_t, int, int);
96 96
97 #endif /* not __ASSEMBLY__ */ 97 #endif /* not __ASSEMBLY__ */
98 98
99 #define NO_PROC_ID (0xff) /* No processor magic marker */ 99 #define NO_PROC_ID (0xff) /* No processor magic marker */
100 100
101 #define PROC_CHANGE_PENALTY (15) /* Schedule penalty */ 101 #define PROC_CHANGE_PENALTY (15) /* Schedule penalty */
102 102
103 /* 103 /*
104 * M32R-mp IPI 104 * M32R-mp IPI
105 */ 105 */
106 #define RESCHEDULE_IPI (M32R_IRQ_IPI0-M32R_IRQ_IPI0) 106 #define RESCHEDULE_IPI (M32R_IRQ_IPI0-M32R_IRQ_IPI0)
107 #define INVALIDATE_TLB_IPI (M32R_IRQ_IPI1-M32R_IRQ_IPI0) 107 #define INVALIDATE_TLB_IPI (M32R_IRQ_IPI1-M32R_IRQ_IPI0)
108 #define CALL_FUNCTION_IPI (M32R_IRQ_IPI2-M32R_IRQ_IPI0) 108 #define CALL_FUNCTION_IPI (M32R_IRQ_IPI2-M32R_IRQ_IPI0)
109 #define LOCAL_TIMER_IPI (M32R_IRQ_IPI3-M32R_IRQ_IPI0) 109 #define LOCAL_TIMER_IPI (M32R_IRQ_IPI3-M32R_IRQ_IPI0)
110 #define INVALIDATE_CACHE_IPI (M32R_IRQ_IPI4-M32R_IRQ_IPI0) 110 #define INVALIDATE_CACHE_IPI (M32R_IRQ_IPI4-M32R_IRQ_IPI0)
111 #define CPU_BOOT_IPI (M32R_IRQ_IPI5-M32R_IRQ_IPI0) 111 #define CPU_BOOT_IPI (M32R_IRQ_IPI5-M32R_IRQ_IPI0)
112 112
113 #define IPI_SHIFT (0) 113 #define IPI_SHIFT (0)
114 #define NR_IPIS (8) 114 #define NR_IPIS (8)
115 115
116 #endif /* CONFIG_SMP */ 116 #endif /* CONFIG_SMP */
117 117
118 #endif /* _ASM_M32R_SMP_H */ 118 #endif /* _ASM_M32R_SMP_H */
119 119
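physid_mask_t above is just a fixed-size bitmap wrapped in a struct, and the surrounding macros are thin forwards to the generic bitmap helpers. A compact userspace rendering of the same idea, sized for a single word and using plain bit operations instead of the kernel bitmap API (all demo_* names are invented for the sketch), is shown below; like the original, it uses a GCC statement expression.

#include <stdio.h>

/* One-word stand-in for physid_mask_t (PHYSID_ARRAY_SIZE == 1 here too). */
struct demo_physid_mask {
	unsigned long mask[1];
};

#define demo_physid_set(physid, map)    ((map).mask[0] |=  (1UL << (physid)))
#define demo_physid_clear(physid, map)  ((map).mask[0] &= ~(1UL << (physid)))
#define demo_physid_isset(physid, map)  (((map).mask[0] >> (physid)) & 1UL)

/* Like physid_mask_of_physid(): a mask with exactly one bit set. */
#define demo_mask_of_physid(physid)                 \
({                                                  \
	struct demo_physid_mask __m = { { 0UL } };  \
	demo_physid_set((physid), __m);             \
	__m;                                        \
})

int main(void)
{
	struct demo_physid_mask m = demo_mask_of_physid(5);

	demo_physid_set(2, m);
	demo_physid_clear(5, m);
	printf("bit 2: %lu, bit 5: %lu, raw mask: %#lx\n",
	       demo_physid_isset(2, m), demo_physid_isset(5, m), m.mask[0]);
	return 0;
}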
include/asm-mips/smp.h
1 /* 1 /*
2 * This file is subject to the terms and conditions of the GNU General 2 * This file is subject to the terms and conditions of the GNU General
3 * Public License. See the file "COPYING" in the main directory of this 3 * Public License. See the file "COPYING" in the main directory of this
4 * archive for more details. 4 * archive for more details.
5 * 5 *
6 * Copyright (C) 2000 - 2001 by Kanoj Sarcar (kanoj@sgi.com) 6 * Copyright (C) 2000 - 2001 by Kanoj Sarcar (kanoj@sgi.com)
7 * Copyright (C) 2000 - 2001 by Silicon Graphics, Inc. 7 * Copyright (C) 2000 - 2001 by Silicon Graphics, Inc.
8 * Copyright (C) 2000, 2001, 2002 Ralf Baechle 8 * Copyright (C) 2000, 2001, 2002 Ralf Baechle
9 * Copyright (C) 2000, 2001 Broadcom Corporation 9 * Copyright (C) 2000, 2001 Broadcom Corporation
10 */ 10 */
11 #ifndef __ASM_SMP_H 11 #ifndef __ASM_SMP_H
12 #define __ASM_SMP_H 12 #define __ASM_SMP_H
13 13
14 #include <linux/config.h> 14 #include <linux/config.h>
15 15
16 #ifdef CONFIG_SMP 16 #ifdef CONFIG_SMP
17 17
18 #include <linux/bitops.h> 18 #include <linux/bitops.h>
19 #include <linux/linkage.h> 19 #include <linux/linkage.h>
20 #include <linux/threads.h> 20 #include <linux/threads.h>
21 #include <linux/cpumask.h> 21 #include <linux/cpumask.h>
22 #include <asm/atomic.h> 22 #include <asm/atomic.h>
23 23
24 #define smp_processor_id() (current_thread_info()->cpu) 24 #define raw_smp_processor_id() (current_thread_info()->cpu)
25 25
26 /* Map from cpu id to sequential logical cpu number. This will only 26 /* Map from cpu id to sequential logical cpu number. This will only
27 not be idempotent when cpus failed to come on-line. */ 27 not be idempotent when cpus failed to come on-line. */
28 extern int __cpu_number_map[NR_CPUS]; 28 extern int __cpu_number_map[NR_CPUS];
29 #define cpu_number_map(cpu) __cpu_number_map[cpu] 29 #define cpu_number_map(cpu) __cpu_number_map[cpu]
30 30
31 /* The reverse map from sequential logical cpu number to cpu id. */ 31 /* The reverse map from sequential logical cpu number to cpu id. */
32 extern int __cpu_logical_map[NR_CPUS]; 32 extern int __cpu_logical_map[NR_CPUS];
33 #define cpu_logical_map(cpu) __cpu_logical_map[cpu] 33 #define cpu_logical_map(cpu) __cpu_logical_map[cpu]
34 34
35 #define NO_PROC_ID (-1) 35 #define NO_PROC_ID (-1)
36 36
37 struct call_data_struct { 37 struct call_data_struct {
38 void (*func)(void *); 38 void (*func)(void *);
39 void *info; 39 void *info;
40 atomic_t started; 40 atomic_t started;
41 atomic_t finished; 41 atomic_t finished;
42 int wait; 42 int wait;
43 }; 43 };
44 44
45 extern struct call_data_struct *call_data; 45 extern struct call_data_struct *call_data;
46 46
47 #define SMP_RESCHEDULE_YOURSELF 0x1 /* XXX braindead */ 47 #define SMP_RESCHEDULE_YOURSELF 0x1 /* XXX braindead */
48 #define SMP_CALL_FUNCTION 0x2 48 #define SMP_CALL_FUNCTION 0x2
49 49
50 extern cpumask_t phys_cpu_present_map; 50 extern cpumask_t phys_cpu_present_map;
51 extern cpumask_t cpu_online_map; 51 extern cpumask_t cpu_online_map;
52 #define cpu_possible_map phys_cpu_present_map 52 #define cpu_possible_map phys_cpu_present_map
53 53
54 extern cpumask_t cpu_callout_map; 54 extern cpumask_t cpu_callout_map;
55 /* We don't mark CPUs online until __cpu_up(), so we need another measure */ 55 /* We don't mark CPUs online until __cpu_up(), so we need another measure */
56 static inline int num_booting_cpus(void) 56 static inline int num_booting_cpus(void)
57 { 57 {
58 return cpus_weight(cpu_callout_map); 58 return cpus_weight(cpu_callout_map);
59 } 59 }
60 60
61 /* These are defined by the board-specific code. */ 61 /* These are defined by the board-specific code. */
62 62
63 /* 63 /*
64 * Cause the function described by call_data to be executed on the passed 64 * Cause the function described by call_data to be executed on the passed
65 * cpu. When the function has finished, increment the finished field of 65 * cpu. When the function has finished, increment the finished field of
66 * call_data. 66 * call_data.
67 */ 67 */
68 extern void core_send_ipi(int cpu, unsigned int action); 68 extern void core_send_ipi(int cpu, unsigned int action);
69 69
70 /* 70 /*
71 * Firmware CPU startup hook 71 * Firmware CPU startup hook
72 */ 72 */
73 extern void prom_boot_secondary(int cpu, struct task_struct *idle); 73 extern void prom_boot_secondary(int cpu, struct task_struct *idle);
74 74
75 /* 75 /*
76 * After we've done initial boot, this function is called to allow the 76 * After we've done initial boot, this function is called to allow the
77 * board code to clean up state, if needed 77 * board code to clean up state, if needed
78 */ 78 */
79 extern void prom_init_secondary(void); 79 extern void prom_init_secondary(void);
80 80
81 /* 81 /*
82 * Detect available CPUs, populate phys_cpu_present_map before smp_init 82 * Detect available CPUs, populate phys_cpu_present_map before smp_init
83 */ 83 */
84 extern void prom_prepare_cpus(unsigned int max_cpus); 84 extern void prom_prepare_cpus(unsigned int max_cpus);
85 85
86 /* 86 /*
87 * Last chance for the board code to finish SMP initialization before 87 * Last chance for the board code to finish SMP initialization before
88 * the CPU is "online". 88 * the CPU is "online".
89 */ 89 */
90 extern void prom_smp_finish(void); 90 extern void prom_smp_finish(void);
91 91
92 /* Hook for after all CPUs are online */ 92 /* Hook for after all CPUs are online */
93 extern void prom_cpus_done(void); 93 extern void prom_cpus_done(void);
94 94
95 extern void asmlinkage smp_bootstrap(void); 95 extern void asmlinkage smp_bootstrap(void);
96 96
97 /* 97 /*
98 * this function sends a 'reschedule' IPI to another CPU. 98 * this function sends a 'reschedule' IPI to another CPU.
99 * it goes straight through and wastes no time serializing 99 * it goes straight through and wastes no time serializing
100 * anything. Worst case is that we lose a reschedule ... 100 * anything. Worst case is that we lose a reschedule ...
101 */ 101 */
102 static inline void smp_send_reschedule(int cpu) 102 static inline void smp_send_reschedule(int cpu)
103 { 103 {
104 core_send_ipi(cpu, SMP_RESCHEDULE_YOURSELF); 104 core_send_ipi(cpu, SMP_RESCHEDULE_YOURSELF);
105 } 105 }
106 106
107 extern asmlinkage void smp_call_function_interrupt(void); 107 extern asmlinkage void smp_call_function_interrupt(void);
108 108
109 #endif /* CONFIG_SMP */ 109 #endif /* CONFIG_SMP */
110 110
111 #endif /* __ASM_SMP_H */ 111 #endif /* __ASM_SMP_H */
112 112
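The comments in this header describe a small rendezvous protocol: the initiating CPU publishes func/info in call_data, kicks the target with core_send_ipi(), and the target's interrupt handler bumps started, runs the function, and bumps finished so the initiator knows when call_data may be reused. A userspace sketch of that handshake, substituting a thread for the IPI and a mutex-protected counter for atomic_t (all demo names are illustrative), follows; build it with -lpthread.

#include <pthread.h>
#include <stdio.h>

struct demo_call_data {
	void (*func)(void *);
	void *info;
	int started;
	int finished;
	pthread_mutex_t lock;
};

static struct demo_call_data call_data = {
	.lock = PTHREAD_MUTEX_INITIALIZER,
};

static void say_hello(void *info)
{
	printf("running '%s' on the pretend remote cpu\n", (const char *)info);
}

/* What the IPI handler on the target CPU would do. */
static void *ipi_target(void *unused)
{
	(void)unused;
	pthread_mutex_lock(&call_data.lock);
	call_data.started++;
	pthread_mutex_unlock(&call_data.lock);

	call_data.func(call_data.info);

	pthread_mutex_lock(&call_data.lock);
	call_data.finished++;
	pthread_mutex_unlock(&call_data.lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	call_data.func = say_hello;
	call_data.info = "demo payload";

	pthread_create(&t, NULL, ipi_target, NULL);  /* stands in for core_send_ipi() */
	pthread_join(t, NULL);                       /* the kernel polls 'finished' instead */

	printf("started=%d finished=%d\n", call_data.started, call_data.finished);
	return 0;
}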
include/asm-parisc/smp.h
1 #ifndef __ASM_SMP_H 1 #ifndef __ASM_SMP_H
2 #define __ASM_SMP_H 2 #define __ASM_SMP_H
3 3
4 #include <linux/config.h> 4 #include <linux/config.h>
5 5
6 #if defined(CONFIG_SMP) 6 #if defined(CONFIG_SMP)
7 7
8 /* Page Zero Location PDC will look for the address to branch to when we poke 8 /* Page Zero Location PDC will look for the address to branch to when we poke
9 ** slave CPUs still in "Icache loop". 9 ** slave CPUs still in "Icache loop".
10 */ 10 */
11 #define PDC_OS_BOOT_RENDEZVOUS 0x10 11 #define PDC_OS_BOOT_RENDEZVOUS 0x10
12 #define PDC_OS_BOOT_RENDEZVOUS_HI 0x28 12 #define PDC_OS_BOOT_RENDEZVOUS_HI 0x28
13 13
14 #ifndef ASSEMBLY 14 #ifndef ASSEMBLY
15 #include <linux/bitops.h> 15 #include <linux/bitops.h>
16 #include <linux/threads.h> /* for NR_CPUS */ 16 #include <linux/threads.h> /* for NR_CPUS */
17 #include <linux/cpumask.h> 17 #include <linux/cpumask.h>
18 typedef unsigned long address_t; 18 typedef unsigned long address_t;
19 19
20 extern cpumask_t cpu_online_map; 20 extern cpumask_t cpu_online_map;
21 21
22 22
23 /* 23 /*
24 * Private routines/data 24 * Private routines/data
25 * 25 *
26 * physical and logical are equivalent until we support CPU hotplug. 26 * physical and logical are equivalent until we support CPU hotplug.
27 */ 27 */
28 #define cpu_number_map(cpu) (cpu) 28 #define cpu_number_map(cpu) (cpu)
29 #define cpu_logical_map(cpu) (cpu) 29 #define cpu_logical_map(cpu) (cpu)
30 30
31 extern void smp_send_reschedule(int cpu); 31 extern void smp_send_reschedule(int cpu);
32 32
33 #endif /* !ASSEMBLY */ 33 #endif /* !ASSEMBLY */
34 34
35 /* 35 /*
36 * This magic constant controls our willingness to transfer 36 * This magic constant controls our willingness to transfer
37 * a process across CPUs. Such a transfer incurs cache and tlb 37 * a process across CPUs. Such a transfer incurs cache and tlb
38 * misses. The current value is inherited from i386. Still needs 38 * misses. The current value is inherited from i386. Still needs
39 * to be tuned for parisc. 39 * to be tuned for parisc.
40 */ 40 */
41 41
42 #define PROC_CHANGE_PENALTY 15 /* Schedule penalty */ 42 #define PROC_CHANGE_PENALTY 15 /* Schedule penalty */
43 43
44 #undef ENTRY_SYS_CPUS 44 #undef ENTRY_SYS_CPUS
45 #ifdef ENTRY_SYS_CPUS 45 #ifdef ENTRY_SYS_CPUS
46 #define STATE_RENDEZVOUS 0 46 #define STATE_RENDEZVOUS 0
47 #define STATE_STOPPED 1 47 #define STATE_STOPPED 1
48 #define STATE_RUNNING 2 48 #define STATE_RUNNING 2
49 #define STATE_HALTED 3 49 #define STATE_HALTED 3
50 #endif 50 #endif
51 51
52 extern unsigned long cpu_present_mask; 52 extern unsigned long cpu_present_mask;
53 53
54 #define smp_processor_id() (current_thread_info()->cpu) 54 #define raw_smp_processor_id() (current_thread_info()->cpu)
55 55
56 #endif /* CONFIG_SMP */ 56 #endif /* CONFIG_SMP */
57 57
58 #define NO_PROC_ID 0xFF /* No processor magic marker */ 58 #define NO_PROC_ID 0xFF /* No processor magic marker */
59 #define ANY_PROC_ID 0xFF /* Any processor magic marker */ 59 #define ANY_PROC_ID 0xFF /* Any processor magic marker */
60 static inline int __cpu_disable (void) { 60 static inline int __cpu_disable (void) {
61 return 0; 61 return 0;
62 } 62 }
63 static inline void __cpu_die (unsigned int cpu) { 63 static inline void __cpu_die (unsigned int cpu) {
64 while(1) 64 while(1)
65 ; 65 ;
66 } 66 }
67 extern int __cpu_up (unsigned int cpu); 67 extern int __cpu_up (unsigned int cpu);
68 68
69 #endif /* __ASM_SMP_H */ 69 #endif /* __ASM_SMP_H */
70 70
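The change above makes raw_smp_processor_id() the plain, unchecked read of current_thread_info()->cpu on parisc. Below is a minimal usage sketch, not part of the patch: the raw read is enough for a best-effort value such as a log message, while code that must stay on one CPU would pin itself with get_cpu()/put_cpu(). The function name and message text are invented for illustration.

/* Illustrative only: choosing between a best-effort and a stable CPU number. */
static void example_report_cpu(void)
{
        int cpu;

        /* best-effort value, acceptable in a diagnostic message */
        printk(KERN_DEBUG "roughly on CPU %d\n", raw_smp_processor_id());

        /* stable value: pin the task for the duration of the work */
        cpu = get_cpu();
        /* ... work that must not migrate off 'cpu' ... */
        put_cpu();
}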
include/asm-ppc/smp.h
1 /* smp.h: PPC specific SMP stuff. 1 /* smp.h: PPC specific SMP stuff.
2 * 2 *
3 * Original was a copy of sparc smp.h. Now heavily modified 3 * Original was a copy of sparc smp.h. Now heavily modified
4 * for PPC. 4 * for PPC.
5 * 5 *
6 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) 6 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
7 * Copyright (C) 1996-2001 Cort Dougan <cort@fsmlabs.com> 7 * Copyright (C) 1996-2001 Cort Dougan <cort@fsmlabs.com>
8 */ 8 */
9 #ifdef __KERNEL__ 9 #ifdef __KERNEL__
10 #ifndef _PPC_SMP_H 10 #ifndef _PPC_SMP_H
11 #define _PPC_SMP_H 11 #define _PPC_SMP_H
12 12
13 #include <linux/config.h> 13 #include <linux/config.h>
14 #include <linux/kernel.h> 14 #include <linux/kernel.h>
15 #include <linux/bitops.h> 15 #include <linux/bitops.h>
16 #include <linux/errno.h> 16 #include <linux/errno.h>
17 #include <linux/cpumask.h> 17 #include <linux/cpumask.h>
18 #include <linux/threads.h> 18 #include <linux/threads.h>
19 19
20 #ifdef CONFIG_SMP 20 #ifdef CONFIG_SMP
21 21
22 #ifndef __ASSEMBLY__ 22 #ifndef __ASSEMBLY__
23 23
24 struct cpuinfo_PPC { 24 struct cpuinfo_PPC {
25 unsigned long loops_per_jiffy; 25 unsigned long loops_per_jiffy;
26 unsigned long pvr; 26 unsigned long pvr;
27 unsigned long *pgd_cache; 27 unsigned long *pgd_cache;
28 unsigned long *pte_cache; 28 unsigned long *pte_cache;
29 unsigned long pgtable_cache_sz; 29 unsigned long pgtable_cache_sz;
30 }; 30 };
31 31
32 extern struct cpuinfo_PPC cpu_data[]; 32 extern struct cpuinfo_PPC cpu_data[];
33 extern cpumask_t cpu_online_map; 33 extern cpumask_t cpu_online_map;
34 extern cpumask_t cpu_possible_map; 34 extern cpumask_t cpu_possible_map;
35 extern unsigned long smp_proc_in_lock[]; 35 extern unsigned long smp_proc_in_lock[];
36 extern volatile unsigned long cpu_callin_map[]; 36 extern volatile unsigned long cpu_callin_map[];
37 extern int smp_tb_synchronized; 37 extern int smp_tb_synchronized;
38 38
39 extern void smp_send_tlb_invalidate(int); 39 extern void smp_send_tlb_invalidate(int);
40 extern void smp_send_xmon_break(int cpu); 40 extern void smp_send_xmon_break(int cpu);
41 struct pt_regs; 41 struct pt_regs;
42 extern void smp_message_recv(int, struct pt_regs *); 42 extern void smp_message_recv(int, struct pt_regs *);
43 43
44 #define NO_PROC_ID 0xFF /* No processor magic marker */ 44 #define NO_PROC_ID 0xFF /* No processor magic marker */
45 #define PROC_CHANGE_PENALTY 20 45 #define PROC_CHANGE_PENALTY 20
46 46
47 #define smp_processor_id() (current_thread_info()->cpu) 47 #define raw_smp_processor_id() (current_thread_info()->cpu)
48 48
49 extern int __cpu_up(unsigned int cpu); 49 extern int __cpu_up(unsigned int cpu);
50 50
51 extern int smp_hw_index[]; 51 extern int smp_hw_index[];
52 #define hard_smp_processor_id() (smp_hw_index[smp_processor_id()]) 52 #define hard_smp_processor_id() (smp_hw_index[smp_processor_id()])
53 53
54 struct klock_info_struct { 54 struct klock_info_struct {
55 unsigned long kernel_flag; 55 unsigned long kernel_flag;
56 unsigned char akp; 56 unsigned char akp;
57 }; 57 };
58 58
59 extern struct klock_info_struct klock_info; 59 extern struct klock_info_struct klock_info;
60 #define KLOCK_HELD 0xffffffff 60 #define KLOCK_HELD 0xffffffff
61 #define KLOCK_CLEAR 0x0 61 #define KLOCK_CLEAR 0x0
62 62
63 #endif /* __ASSEMBLY__ */ 63 #endif /* __ASSEMBLY__ */
64 64
65 #else /* !(CONFIG_SMP) */ 65 #else /* !(CONFIG_SMP) */
66 66
67 #endif /* !(CONFIG_SMP) */ 67 #endif /* !(CONFIG_SMP) */
68 68
69 #endif /* !(_PPC_SMP_H) */ 69 #endif /* !(_PPC_SMP_H) */
70 #endif /* __KERNEL__ */ 70 #endif /* __KERNEL__ */
71 71
include/asm-ppc64/smp.h
1 /* 1 /*
2 * smp.h: PPC64 specific SMP code. 2 * smp.h: PPC64 specific SMP code.
3 * 3 *
4 * Original was a copy of sparc smp.h. Now heavily modified 4 * Original was a copy of sparc smp.h. Now heavily modified
5 * for PPC. 5 * for PPC.
6 * 6 *
7 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) 7 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
8 * Copyright (C) 1996-2001 Cort Dougan <cort@fsmlabs.com> 8 * Copyright (C) 1996-2001 Cort Dougan <cort@fsmlabs.com>
9 * 9 *
10 * This program is free software; you can redistribute it and/or 10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License 11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version 12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version. 13 * 2 of the License, or (at your option) any later version.
14 */ 14 */
15 15
16 #ifdef __KERNEL__ 16 #ifdef __KERNEL__
17 #ifndef _PPC64_SMP_H 17 #ifndef _PPC64_SMP_H
18 #define _PPC64_SMP_H 18 #define _PPC64_SMP_H
19 19
20 #include <linux/config.h> 20 #include <linux/config.h>
21 #include <linux/threads.h> 21 #include <linux/threads.h>
22 #include <linux/cpumask.h> 22 #include <linux/cpumask.h>
23 #include <linux/kernel.h> 23 #include <linux/kernel.h>
24 24
25 #ifndef __ASSEMBLY__ 25 #ifndef __ASSEMBLY__
26 26
27 #include <asm/paca.h> 27 #include <asm/paca.h>
28 28
29 extern int boot_cpuid; 29 extern int boot_cpuid;
30 extern int boot_cpuid_phys; 30 extern int boot_cpuid_phys;
31 31
32 extern void cpu_die(void); 32 extern void cpu_die(void);
33 33
34 #ifdef CONFIG_SMP 34 #ifdef CONFIG_SMP
35 35
36 extern void smp_send_debugger_break(int cpu); 36 extern void smp_send_debugger_break(int cpu);
37 struct pt_regs; 37 struct pt_regs;
38 extern void smp_message_recv(int, struct pt_regs *); 38 extern void smp_message_recv(int, struct pt_regs *);
39 39
40 #ifdef CONFIG_HOTPLUG_CPU 40 #ifdef CONFIG_HOTPLUG_CPU
41 extern void fixup_irqs(cpumask_t map); 41 extern void fixup_irqs(cpumask_t map);
42 int generic_cpu_disable(void); 42 int generic_cpu_disable(void);
43 int generic_cpu_enable(unsigned int cpu); 43 int generic_cpu_enable(unsigned int cpu);
44 void generic_cpu_die(unsigned int cpu); 44 void generic_cpu_die(unsigned int cpu);
45 void generic_mach_cpu_die(void); 45 void generic_mach_cpu_die(void);
46 #endif 46 #endif
47 47
48 #define __smp_processor_id() (get_paca()->paca_index) 48 #define raw_smp_processor_id() (get_paca()->paca_index)
49 #define hard_smp_processor_id() (get_paca()->hw_cpu_id) 49 #define hard_smp_processor_id() (get_paca()->hw_cpu_id)
50 50
51 extern cpumask_t cpu_sibling_map[NR_CPUS]; 51 extern cpumask_t cpu_sibling_map[NR_CPUS];
52 52
53 /* Since OpenPIC has only 4 IPIs, we use slightly different message numbers. 53 /* Since OpenPIC has only 4 IPIs, we use slightly different message numbers.
54 * 54 *
55 * Make sure this matches openpic_request_IPIs in open_pic.c, or what shows up 55 * Make sure this matches openpic_request_IPIs in open_pic.c, or what shows up
56 * in /proc/interrupts will be wrong!!! --Troy */ 56 * in /proc/interrupts will be wrong!!! --Troy */
57 #define PPC_MSG_CALL_FUNCTION 0 57 #define PPC_MSG_CALL_FUNCTION 0
58 #define PPC_MSG_RESCHEDULE 1 58 #define PPC_MSG_RESCHEDULE 1
59 /* This is unused now */ 59 /* This is unused now */
60 #if 0 60 #if 0
61 #define PPC_MSG_MIGRATE_TASK 2 61 #define PPC_MSG_MIGRATE_TASK 2
62 #endif 62 #endif
63 #define PPC_MSG_DEBUGGER_BREAK 3 63 #define PPC_MSG_DEBUGGER_BREAK 3
64 64
65 void smp_init_iSeries(void); 65 void smp_init_iSeries(void);
66 void smp_init_pSeries(void); 66 void smp_init_pSeries(void);
67 67
68 extern int __cpu_disable(void); 68 extern int __cpu_disable(void);
69 extern void __cpu_die(unsigned int cpu); 69 extern void __cpu_die(unsigned int cpu);
70 #endif /* CONFIG_SMP */ 70 #endif /* CONFIG_SMP */
71 71
72 #define get_hard_smp_processor_id(CPU) (paca[(CPU)].hw_cpu_id) 72 #define get_hard_smp_processor_id(CPU) (paca[(CPU)].hw_cpu_id)
73 #define set_hard_smp_processor_id(CPU, VAL) \ 73 #define set_hard_smp_processor_id(CPU, VAL) \
74 do { (paca[(CPU)].hw_cpu_id = (VAL)); } while (0) 74 do { (paca[(CPU)].hw_cpu_id = (VAL)); } while (0)
75 75
76 extern int smt_enabled_at_boot; 76 extern int smt_enabled_at_boot;
77 77
78 extern int smp_mpic_probe(void); 78 extern int smp_mpic_probe(void);
79 extern void smp_mpic_setup_cpu(int cpu); 79 extern void smp_mpic_setup_cpu(int cpu);
80 extern void smp_mpic_message_pass(int target, int msg); 80 extern void smp_mpic_message_pass(int target, int msg);
81 extern void smp_generic_kick_cpu(int nr); 81 extern void smp_generic_kick_cpu(int nr);
82 82
83 extern void smp_generic_give_timebase(void); 83 extern void smp_generic_give_timebase(void);
84 extern void smp_generic_take_timebase(void); 84 extern void smp_generic_take_timebase(void);
85 85
86 extern struct smp_ops_t *smp_ops; 86 extern struct smp_ops_t *smp_ops;
87 87
88 #endif /* __ASSEMBLY__ */ 88 #endif /* __ASSEMBLY__ */
89 89
90 #endif /* !(_PPC64_SMP_H) */ 90 #endif /* !(_PPC64_SMP_H) */
91 #endif /* __KERNEL__ */ 91 #endif /* __KERNEL__ */
92 92
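The PPC_MSG_* numbers above, limited to four by OpenPIC, are effectively the entire IPI protocol; the receiving side just switches on the message number. The sketch below is illustrative only and is not the real arch/ppc64 handler; the function name is invented.

/* Illustrative dispatch only; the real handler is smp_message_recv(). */
static void example_message_dispatch(int msg)
{
        switch (msg) {
        case PPC_MSG_CALL_FUNCTION:
                /* run the queued cross-CPU function call */
                break;
        case PPC_MSG_RESCHEDULE:
                /* nothing to do: returning from the interrupt reschedules */
                break;
        case PPC_MSG_DEBUGGER_BREAK:
                /* enter the debugger on this CPU */
                break;
        default:
                break;
        }
}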
include/asm-s390/smp.h
1 /* 1 /*
2 * include/asm-s390/smp.h 2 * include/asm-s390/smp.h
3 * 3 *
4 * S390 version 4 * S390 version
5 * Copyright (C) 1999 IBM Deutschland Entwicklung GmbH, IBM Corporation 5 * Copyright (C) 1999 IBM Deutschland Entwicklung GmbH, IBM Corporation
6 * Author(s): Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com), 6 * Author(s): Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com),
7 * Martin Schwidefsky (schwidefsky@de.ibm.com) 7 * Martin Schwidefsky (schwidefsky@de.ibm.com)
8 * Heiko Carstens (heiko.carstens@de.ibm.com) 8 * Heiko Carstens (heiko.carstens@de.ibm.com)
9 */ 9 */
10 #ifndef __ASM_SMP_H 10 #ifndef __ASM_SMP_H
11 #define __ASM_SMP_H 11 #define __ASM_SMP_H
12 12
13 #include <linux/config.h> 13 #include <linux/config.h>
14 #include <linux/threads.h> 14 #include <linux/threads.h>
15 #include <linux/cpumask.h> 15 #include <linux/cpumask.h>
16 #include <linux/bitops.h> 16 #include <linux/bitops.h>
17 17
18 #if defined(__KERNEL__) && defined(CONFIG_SMP) && !defined(__ASSEMBLY__) 18 #if defined(__KERNEL__) && defined(CONFIG_SMP) && !defined(__ASSEMBLY__)
19 19
20 #include <asm/lowcore.h> 20 #include <asm/lowcore.h>
21 #include <asm/sigp.h> 21 #include <asm/sigp.h>
22 22
23 /* 23 /*
24 s390 specific smp.c headers 24 s390 specific smp.c headers
25 */ 25 */
26 typedef struct 26 typedef struct
27 { 27 {
28 int intresting; 28 int intresting;
29 sigp_ccode ccode; 29 sigp_ccode ccode;
30 __u32 status; 30 __u32 status;
31 __u16 cpu; 31 __u16 cpu;
32 } sigp_info; 32 } sigp_info;
33 33
34 extern int smp_call_function_on(void (*func) (void *info), void *info, 34 extern int smp_call_function_on(void (*func) (void *info), void *info,
35 int nonatomic, int wait, int cpu); 35 int nonatomic, int wait, int cpu);
36 #define NO_PROC_ID 0xFF /* No processor magic marker */ 36 #define NO_PROC_ID 0xFF /* No processor magic marker */
37 37
38 /* 38 /*
39 * This magic constant controls our willingness to transfer 39 * This magic constant controls our willingness to transfer
40 * a process across CPUs. Such a transfer incurs misses on the L1 40 * a process across CPUs. Such a transfer incurs misses on the L1
41 * cache, and on a P6 or P5 with multiple L2 caches L2 hits. My 41 * cache, and on a P6 or P5 with multiple L2 caches L2 hits. My
42 * gut feeling is this will vary by board in value. For a board 42 * gut feeling is this will vary by board in value. For a board
43 * with separate L2 cache it probably depends also on the RSS, and 43 * with separate L2 cache it probably depends also on the RSS, and
44 * for a board with shared L2 cache it ought to decay fast as other 44 * for a board with shared L2 cache it ought to decay fast as other
45 * processes are run. 45 * processes are run.
46 */ 46 */
47 47
48 #define PROC_CHANGE_PENALTY 20 /* Schedule penalty */ 48 #define PROC_CHANGE_PENALTY 20 /* Schedule penalty */
49 49
50 #define smp_processor_id() (S390_lowcore.cpu_data.cpu_nr) 50 #define raw_smp_processor_id() (S390_lowcore.cpu_data.cpu_nr)
51 51
52 extern int smp_get_cpu(cpumask_t cpu_map); 52 extern int smp_get_cpu(cpumask_t cpu_map);
53 extern void smp_put_cpu(int cpu); 53 extern void smp_put_cpu(int cpu);
54 54
55 extern __inline__ __u16 hard_smp_processor_id(void) 55 extern __inline__ __u16 hard_smp_processor_id(void)
56 { 56 {
57 __u16 cpu_address; 57 __u16 cpu_address;
58 58
59 __asm__ ("stap %0\n" : "=m" (cpu_address)); 59 __asm__ ("stap %0\n" : "=m" (cpu_address));
60 return cpu_address; 60 return cpu_address;
61 } 61 }
62 62
63 /* 63 /*
64 * returns 1 if cpu is in stopped/check stopped state or not operational 64 * returns 1 if cpu is in stopped/check stopped state or not operational
65 * returns 0 otherwise 65 * returns 0 otherwise
66 */ 66 */
67 static inline int 67 static inline int
68 smp_cpu_not_running(int cpu) 68 smp_cpu_not_running(int cpu)
69 { 69 {
70 __u32 status; 70 __u32 status;
71 71
72 switch (signal_processor_ps(&status, 0, cpu, sigp_sense)) { 72 switch (signal_processor_ps(&status, 0, cpu, sigp_sense)) {
73 case sigp_order_code_accepted: 73 case sigp_order_code_accepted:
74 case sigp_status_stored: 74 case sigp_status_stored:
75 /* Check for stopped and check stop state */ 75 /* Check for stopped and check stop state */
76 if (status & 0x50) 76 if (status & 0x50)
77 return 1; 77 return 1;
78 break; 78 break;
79 case sigp_not_operational: 79 case sigp_not_operational:
80 return 1; 80 return 1;
81 default: 81 default:
82 break; 82 break;
83 } 83 }
84 return 0; 84 return 0;
85 } 85 }
86 86
87 #define cpu_logical_map(cpu) (cpu) 87 #define cpu_logical_map(cpu) (cpu)
88 88
89 extern int __cpu_disable (void); 89 extern int __cpu_disable (void);
90 extern void __cpu_die (unsigned int cpu); 90 extern void __cpu_die (unsigned int cpu);
91 extern void cpu_die (void) __attribute__ ((noreturn)); 91 extern void cpu_die (void) __attribute__ ((noreturn));
92 extern int __cpu_up (unsigned int cpu); 92 extern int __cpu_up (unsigned int cpu);
93 93
94 #endif 94 #endif
95 95
96 #ifndef CONFIG_SMP 96 #ifndef CONFIG_SMP
97 static inline int 97 static inline int
98 smp_call_function_on(void (*func) (void *info), void *info, 98 smp_call_function_on(void (*func) (void *info), void *info,
99 int nonatomic, int wait, int cpu) 99 int nonatomic, int wait, int cpu)
100 { 100 {
101 func(info); 101 func(info);
102 return 0; 102 return 0;
103 } 103 }
104 #define smp_get_cpu(cpu) ({ 0; }) 104 #define smp_get_cpu(cpu) ({ 0; })
105 #define smp_put_cpu(cpu) ({ 0; }) 105 #define smp_put_cpu(cpu) ({ 0; })
106 #endif 106 #endif
107 107
108 #endif 108 #endif
109 109
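smp_call_function_on(), declared above, runs a function on one chosen CPU, and the !CONFIG_SMP fallback at the end simply calls it locally. A hedged caller sketch follows; the callback, the wrapper name and the argument choice (nonatomic=0, wait=1 for a blocking call) are illustrative assumptions, not code from this patch.

/* Illustrative caller only. */
static void example_bump(void *info)
{
        (*(int *)info)++;
}

static int example_run_on_cpu(int cpu)
{
        int counter = 0;

        /* nonatomic=0, wait=1: block until the target CPU has run it */
        return smp_call_function_on(example_bump, &counter, 0, 1, cpu);
}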
include/asm-sh/smp.h
1 /* 1 /*
2 * include/asm-sh/smp.h 2 * include/asm-sh/smp.h
3 * 3 *
4 * Copyright (C) 2002, 2003 Paul Mundt 4 * Copyright (C) 2002, 2003 Paul Mundt
5 * 5 *
6 * This file is subject to the terms and conditions of the GNU General Public 6 * This file is subject to the terms and conditions of the GNU General Public
7 * License. See the file "COPYING" in the main directory of this archive for 7 * License. See the file "COPYING" in the main directory of this archive for
8 * more details. 8 * more details.
9 */ 9 */
10 #ifndef __ASM_SH_SMP_H 10 #ifndef __ASM_SH_SMP_H
11 #define __ASM_SH_SMP_H 11 #define __ASM_SH_SMP_H
12 12
13 #include <linux/config.h> 13 #include <linux/config.h>
14 #include <linux/bitops.h> 14 #include <linux/bitops.h>
15 #include <linux/cpumask.h> 15 #include <linux/cpumask.h>
16 16
17 #ifdef CONFIG_SMP 17 #ifdef CONFIG_SMP
18 18
19 #include <asm/spinlock.h> 19 #include <asm/spinlock.h>
20 #include <asm/atomic.h> 20 #include <asm/atomic.h>
21 #include <asm/current.h> 21 #include <asm/current.h>
22 22
23 extern cpumask_t cpu_online_map; 23 extern cpumask_t cpu_online_map;
24 extern cpumask_t cpu_possible_map; 24 extern cpumask_t cpu_possible_map;
25 25
26 #define cpu_online(cpu) cpu_isset(cpu, cpu_online_map) 26 #define cpu_online(cpu) cpu_isset(cpu, cpu_online_map)
27 27
28 #define smp_processor_id() (current_thread_info()->cpu) 28 #define raw_smp_processor_id() (current_thread_info()->cpu)
29 29
30 /* I've no idea what the real meaning of this is */ 30 /* I've no idea what the real meaning of this is */
31 #define PROC_CHANGE_PENALTY 20 31 #define PROC_CHANGE_PENALTY 20
32 32
33 #define NO_PROC_ID (-1) 33 #define NO_PROC_ID (-1)
34 34
35 struct smp_fn_call_struct { 35 struct smp_fn_call_struct {
36 spinlock_t lock; 36 spinlock_t lock;
37 atomic_t finished; 37 atomic_t finished;
38 void (*fn)(void *); 38 void (*fn)(void *);
39 void *data; 39 void *data;
40 }; 40 };
41 41
42 extern struct smp_fn_call_struct smp_fn_call; 42 extern struct smp_fn_call_struct smp_fn_call;
43 43
44 #define SMP_MSG_RESCHEDULE 0x0001 44 #define SMP_MSG_RESCHEDULE 0x0001
45 45
46 #endif /* CONFIG_SMP */ 46 #endif /* CONFIG_SMP */
47 47
48 #endif /* __ASM_SH_SMP_H */ 48 #endif /* __ASM_SH_SMP_H */
49 49
include/asm-sparc/smp.h
1 /* smp.h: Sparc specific SMP stuff. 1 /* smp.h: Sparc specific SMP stuff.
2 * 2 *
3 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) 3 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
4 */ 4 */
5 5
6 #ifndef _SPARC_SMP_H 6 #ifndef _SPARC_SMP_H
7 #define _SPARC_SMP_H 7 #define _SPARC_SMP_H
8 8
9 #include <linux/config.h> 9 #include <linux/config.h>
10 #include <linux/threads.h> 10 #include <linux/threads.h>
11 #include <asm/head.h> 11 #include <asm/head.h>
12 #include <asm/btfixup.h> 12 #include <asm/btfixup.h>
13 13
14 #ifndef __ASSEMBLY__ 14 #ifndef __ASSEMBLY__
15 15
16 #include <linux/cpumask.h> 16 #include <linux/cpumask.h>
17 17
18 #endif /* __ASSEMBLY__ */ 18 #endif /* __ASSEMBLY__ */
19 19
20 #ifdef CONFIG_SMP 20 #ifdef CONFIG_SMP
21 21
22 #ifndef __ASSEMBLY__ 22 #ifndef __ASSEMBLY__
23 23
24 #include <asm/ptrace.h> 24 #include <asm/ptrace.h>
25 #include <asm/asi.h> 25 #include <asm/asi.h>
26 #include <asm/atomic.h> 26 #include <asm/atomic.h>
27 27
28 /* 28 /*
29 * Private routines/data 29 * Private routines/data
30 */ 30 */
31 31
32 extern unsigned char boot_cpu_id; 32 extern unsigned char boot_cpu_id;
33 extern cpumask_t phys_cpu_present_map; 33 extern cpumask_t phys_cpu_present_map;
34 #define cpu_possible_map phys_cpu_present_map 34 #define cpu_possible_map phys_cpu_present_map
35 35
36 typedef void (*smpfunc_t)(unsigned long, unsigned long, unsigned long, 36 typedef void (*smpfunc_t)(unsigned long, unsigned long, unsigned long,
37 unsigned long, unsigned long); 37 unsigned long, unsigned long);
38 38
39 /* 39 /*
40 * General functions that each host system must provide. 40 * General functions that each host system must provide.
41 */ 41 */
42 42
43 void sun4m_init_smp(void); 43 void sun4m_init_smp(void);
44 void sun4d_init_smp(void); 44 void sun4d_init_smp(void);
45 45
46 void smp_callin(void); 46 void smp_callin(void);
47 void smp_boot_cpus(void); 47 void smp_boot_cpus(void);
48 void smp_store_cpu_info(int); 48 void smp_store_cpu_info(int);
49 49
50 struct seq_file; 50 struct seq_file;
51 void smp_bogo(struct seq_file *); 51 void smp_bogo(struct seq_file *);
52 void smp_info(struct seq_file *); 52 void smp_info(struct seq_file *);
53 53
54 BTFIXUPDEF_CALL(void, smp_cross_call, smpfunc_t, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) 54 BTFIXUPDEF_CALL(void, smp_cross_call, smpfunc_t, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long)
55 BTFIXUPDEF_CALL(void, smp_message_pass, int, int, unsigned long, int) 55 BTFIXUPDEF_CALL(void, smp_message_pass, int, int, unsigned long, int)
56 BTFIXUPDEF_CALL(int, __hard_smp_processor_id, void) 56 BTFIXUPDEF_CALL(int, __hard_smp_processor_id, void)
57 BTFIXUPDEF_BLACKBOX(hard_smp_processor_id) 57 BTFIXUPDEF_BLACKBOX(hard_smp_processor_id)
58 BTFIXUPDEF_BLACKBOX(load_current) 58 BTFIXUPDEF_BLACKBOX(load_current)
59 59
60 #define smp_cross_call(func,arg1,arg2,arg3,arg4,arg5) BTFIXUP_CALL(smp_cross_call)(func,arg1,arg2,arg3,arg4,arg5) 60 #define smp_cross_call(func,arg1,arg2,arg3,arg4,arg5) BTFIXUP_CALL(smp_cross_call)(func,arg1,arg2,arg3,arg4,arg5)
61 #define smp_message_pass(target,msg,data,wait) BTFIXUP_CALL(smp_message_pass)(target,msg,data,wait) 61 #define smp_message_pass(target,msg,data,wait) BTFIXUP_CALL(smp_message_pass)(target,msg,data,wait)
62 62
63 extern __inline__ void xc0(smpfunc_t func) { smp_cross_call(func, 0, 0, 0, 0, 0); } 63 extern __inline__ void xc0(smpfunc_t func) { smp_cross_call(func, 0, 0, 0, 0, 0); }
64 extern __inline__ void xc1(smpfunc_t func, unsigned long arg1) 64 extern __inline__ void xc1(smpfunc_t func, unsigned long arg1)
65 { smp_cross_call(func, arg1, 0, 0, 0, 0); } 65 { smp_cross_call(func, arg1, 0, 0, 0, 0); }
66 extern __inline__ void xc2(smpfunc_t func, unsigned long arg1, unsigned long arg2) 66 extern __inline__ void xc2(smpfunc_t func, unsigned long arg1, unsigned long arg2)
67 { smp_cross_call(func, arg1, arg2, 0, 0, 0); } 67 { smp_cross_call(func, arg1, arg2, 0, 0, 0); }
68 extern __inline__ void xc3(smpfunc_t func, unsigned long arg1, unsigned long arg2, 68 extern __inline__ void xc3(smpfunc_t func, unsigned long arg1, unsigned long arg2,
69 unsigned long arg3) 69 unsigned long arg3)
70 { smp_cross_call(func, arg1, arg2, arg3, 0, 0); } 70 { smp_cross_call(func, arg1, arg2, arg3, 0, 0); }
71 extern __inline__ void xc4(smpfunc_t func, unsigned long arg1, unsigned long arg2, 71 extern __inline__ void xc4(smpfunc_t func, unsigned long arg1, unsigned long arg2,
72 unsigned long arg3, unsigned long arg4) 72 unsigned long arg3, unsigned long arg4)
73 { smp_cross_call(func, arg1, arg2, arg3, arg4, 0); } 73 { smp_cross_call(func, arg1, arg2, arg3, arg4, 0); }
74 extern __inline__ void xc5(smpfunc_t func, unsigned long arg1, unsigned long arg2, 74 extern __inline__ void xc5(smpfunc_t func, unsigned long arg1, unsigned long arg2,
75 unsigned long arg3, unsigned long arg4, unsigned long arg5) 75 unsigned long arg3, unsigned long arg4, unsigned long arg5)
76 { smp_cross_call(func, arg1, arg2, arg3, arg4, arg5); } 76 { smp_cross_call(func, arg1, arg2, arg3, arg4, arg5); }
77 77
78 extern __inline__ int smp_call_function(void (*func)(void *info), void *info, int nonatomic, int wait) 78 extern __inline__ int smp_call_function(void (*func)(void *info), void *info, int nonatomic, int wait)
79 { 79 {
80 xc1((smpfunc_t)func, (unsigned long)info); 80 xc1((smpfunc_t)func, (unsigned long)info);
81 return 0; 81 return 0;
82 } 82 }
83 83
84 extern __volatile__ int __cpu_number_map[NR_CPUS]; 84 extern __volatile__ int __cpu_number_map[NR_CPUS];
85 extern __volatile__ int __cpu_logical_map[NR_CPUS]; 85 extern __volatile__ int __cpu_logical_map[NR_CPUS];
86 86
87 extern __inline__ int cpu_logical_map(int cpu) 87 extern __inline__ int cpu_logical_map(int cpu)
88 { 88 {
89 return __cpu_logical_map[cpu]; 89 return __cpu_logical_map[cpu];
90 } 90 }
91 extern __inline__ int cpu_number_map(int cpu) 91 extern __inline__ int cpu_number_map(int cpu)
92 { 92 {
93 return __cpu_number_map[cpu]; 93 return __cpu_number_map[cpu];
94 } 94 }
95 95
96 extern __inline__ int hard_smp4m_processor_id(void) 96 extern __inline__ int hard_smp4m_processor_id(void)
97 { 97 {
98 int cpuid; 98 int cpuid;
99 99
100 __asm__ __volatile__("rd %%tbr, %0\n\t" 100 __asm__ __volatile__("rd %%tbr, %0\n\t"
101 "srl %0, 12, %0\n\t" 101 "srl %0, 12, %0\n\t"
102 "and %0, 3, %0\n\t" : 102 "and %0, 3, %0\n\t" :
103 "=&r" (cpuid)); 103 "=&r" (cpuid));
104 return cpuid; 104 return cpuid;
105 } 105 }
106 106
107 extern __inline__ int hard_smp4d_processor_id(void) 107 extern __inline__ int hard_smp4d_processor_id(void)
108 { 108 {
109 int cpuid; 109 int cpuid;
110 110
111 __asm__ __volatile__("lda [%%g0] %1, %0\n\t" : 111 __asm__ __volatile__("lda [%%g0] %1, %0\n\t" :
112 "=&r" (cpuid) : "i" (ASI_M_VIKING_TMP1)); 112 "=&r" (cpuid) : "i" (ASI_M_VIKING_TMP1));
113 return cpuid; 113 return cpuid;
114 } 114 }
115 115
116 #ifndef MODULE 116 #ifndef MODULE
117 extern __inline__ int hard_smp_processor_id(void) 117 extern __inline__ int hard_smp_processor_id(void)
118 { 118 {
119 int cpuid; 119 int cpuid;
120 120
121 /* Black box - sun4m 121 /* Black box - sun4m
122 __asm__ __volatile__("rd %%tbr, %0\n\t" 122 __asm__ __volatile__("rd %%tbr, %0\n\t"
123 "srl %0, 12, %0\n\t" 123 "srl %0, 12, %0\n\t"
124 "and %0, 3, %0\n\t" : 124 "and %0, 3, %0\n\t" :
125 "=&r" (cpuid)); 125 "=&r" (cpuid));
126 - sun4d 126 - sun4d
127 __asm__ __volatile__("lda [%g0] ASI_M_VIKING_TMP1, %0\n\t" 127 __asm__ __volatile__("lda [%g0] ASI_M_VIKING_TMP1, %0\n\t"
128 "nop; nop" : 128 "nop; nop" :
129 "=&r" (cpuid)); 129 "=&r" (cpuid));
130 See btfixup.h and btfixupprep.c to understand how a blackbox works. 130 See btfixup.h and btfixupprep.c to understand how a blackbox works.
131 */ 131 */
132 __asm__ __volatile__("sethi %%hi(___b_hard_smp_processor_id), %0\n\t" 132 __asm__ __volatile__("sethi %%hi(___b_hard_smp_processor_id), %0\n\t"
133 "sethi %%hi(boot_cpu_id), %0\n\t" 133 "sethi %%hi(boot_cpu_id), %0\n\t"
134 "ldub [%0 + %%lo(boot_cpu_id)], %0\n\t" : 134 "ldub [%0 + %%lo(boot_cpu_id)], %0\n\t" :
135 "=&r" (cpuid)); 135 "=&r" (cpuid));
136 return cpuid; 136 return cpuid;
137 } 137 }
138 #else 138 #else
139 extern __inline__ int hard_smp_processor_id(void) 139 extern __inline__ int hard_smp_processor_id(void)
140 { 140 {
141 int cpuid; 141 int cpuid;
142 142
143 __asm__ __volatile__("mov %%o7, %%g1\n\t" 143 __asm__ __volatile__("mov %%o7, %%g1\n\t"
144 "call ___f___hard_smp_processor_id\n\t" 144 "call ___f___hard_smp_processor_id\n\t"
145 " nop\n\t" 145 " nop\n\t"
146 "mov %%g2, %0\n\t" : "=r"(cpuid) : : "g1", "g2"); 146 "mov %%g2, %0\n\t" : "=r"(cpuid) : : "g1", "g2");
147 return cpuid; 147 return cpuid;
148 } 148 }
149 #endif 149 #endif
150 150
151 #define smp_processor_id() (current_thread_info()->cpu) 151 #define raw_smp_processor_id() (current_thread_info()->cpu)
152 152
153 #define prof_multiplier(__cpu) cpu_data(__cpu).multiplier 153 #define prof_multiplier(__cpu) cpu_data(__cpu).multiplier
154 #define prof_counter(__cpu) cpu_data(__cpu).counter 154 #define prof_counter(__cpu) cpu_data(__cpu).counter
155 155
156 #endif /* !(__ASSEMBLY__) */ 156 #endif /* !(__ASSEMBLY__) */
157 157
158 /* Sparc specific messages. */ 158 /* Sparc specific messages. */
159 #define MSG_CROSS_CALL 0x0005 /* run func on cpus */ 159 #define MSG_CROSS_CALL 0x0005 /* run func on cpus */
160 160
161 /* Empirical PROM processor mailbox constants. If the per-cpu mailbox 161 /* Empirical PROM processor mailbox constants. If the per-cpu mailbox
162 * contains something other than one of these then the ipi is from 162 * contains something other than one of these then the ipi is from
163 * Linux's active_kernel_processor. This facility exists so that 163 * Linux's active_kernel_processor. This facility exists so that
164 * the boot monitor can capture all the other cpus when one catches 164 * the boot monitor can capture all the other cpus when one catches
165 * a watchdog reset or the user enters the monitor using L1-A keys. 165 * a watchdog reset or the user enters the monitor using L1-A keys.
166 */ 166 */
167 #define MBOX_STOPCPU 0xFB 167 #define MBOX_STOPCPU 0xFB
168 #define MBOX_IDLECPU 0xFC 168 #define MBOX_IDLECPU 0xFC
169 #define MBOX_IDLECPU2 0xFD 169 #define MBOX_IDLECPU2 0xFD
170 #define MBOX_STOPCPU2 0xFE 170 #define MBOX_STOPCPU2 0xFE
171 171
172 #endif /* SMP */ 172 #endif /* SMP */
173 173
174 #define NO_PROC_ID 0xFF 174 #define NO_PROC_ID 0xFF
175 175
176 #endif /* !(_SPARC_SMP_H) */ 176 #endif /* !(_SPARC_SMP_H) */
177 177
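On sparc32 a cross call is just a broadcast of a function pointer plus up to five unsigned long arguments, which is why smp_call_function() above reduces to xc1() with the info pointer cast. A hedged caller sketch; the handler and wrapper below are invented.

/* Illustrative only: a handler matching smpfunc_t and a one-argument broadcast. */
static void example_cross_handler(unsigned long ctx, unsigned long unused2,
                                  unsigned long unused3, unsigned long unused4,
                                  unsigned long unused5)
{
        /* per-CPU work keyed on 'ctx'; the remaining arguments are unused */
}

static void example_broadcast(unsigned long ctx)
{
        xc1(example_cross_handler, ctx);
}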
include/asm-sparc64/smp.h
1 /* smp.h: Sparc64 specific SMP stuff. 1 /* smp.h: Sparc64 specific SMP stuff.
2 * 2 *
3 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) 3 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
4 */ 4 */
5 5
6 #ifndef _SPARC64_SMP_H 6 #ifndef _SPARC64_SMP_H
7 #define _SPARC64_SMP_H 7 #define _SPARC64_SMP_H
8 8
9 #include <linux/config.h> 9 #include <linux/config.h>
10 #include <linux/threads.h> 10 #include <linux/threads.h>
11 #include <asm/asi.h> 11 #include <asm/asi.h>
12 #include <asm/starfire.h> 12 #include <asm/starfire.h>
13 #include <asm/spitfire.h> 13 #include <asm/spitfire.h>
14 14
15 #ifndef __ASSEMBLY__ 15 #ifndef __ASSEMBLY__
16 16
17 #include <linux/cpumask.h> 17 #include <linux/cpumask.h>
18 #include <linux/cache.h> 18 #include <linux/cache.h>
19 19
20 #endif /* !(__ASSEMBLY__) */ 20 #endif /* !(__ASSEMBLY__) */
21 21
22 #ifdef CONFIG_SMP 22 #ifdef CONFIG_SMP
23 23
24 #ifndef __ASSEMBLY__ 24 #ifndef __ASSEMBLY__
25 25
26 /* 26 /*
27 * Private routines/data 27 * Private routines/data
28 */ 28 */
29 29
30 #include <asm/bitops.h> 30 #include <asm/bitops.h>
31 #include <asm/atomic.h> 31 #include <asm/atomic.h>
32 32
33 extern cpumask_t phys_cpu_present_map; 33 extern cpumask_t phys_cpu_present_map;
34 #define cpu_possible_map phys_cpu_present_map 34 #define cpu_possible_map phys_cpu_present_map
35 35
36 /* 36 /*
37 * General functions that each host system must provide. 37 * General functions that each host system must provide.
38 */ 38 */
39 39
40 static __inline__ int hard_smp_processor_id(void) 40 static __inline__ int hard_smp_processor_id(void)
41 { 41 {
42 if (tlb_type == cheetah || tlb_type == cheetah_plus) { 42 if (tlb_type == cheetah || tlb_type == cheetah_plus) {
43 unsigned long cfg, ver; 43 unsigned long cfg, ver;
44 __asm__ __volatile__("rdpr %%ver, %0" : "=r" (ver)); 44 __asm__ __volatile__("rdpr %%ver, %0" : "=r" (ver));
45 if ((ver >> 32) == 0x003e0016) { 45 if ((ver >> 32) == 0x003e0016) {
46 __asm__ __volatile__("ldxa [%%g0] %1, %0" 46 __asm__ __volatile__("ldxa [%%g0] %1, %0"
47 : "=r" (cfg) 47 : "=r" (cfg)
48 : "i" (ASI_JBUS_CONFIG)); 48 : "i" (ASI_JBUS_CONFIG));
49 return ((cfg >> 17) & 0x1f); 49 return ((cfg >> 17) & 0x1f);
50 } else { 50 } else {
51 __asm__ __volatile__("ldxa [%%g0] %1, %0" 51 __asm__ __volatile__("ldxa [%%g0] %1, %0"
52 : "=r" (cfg) 52 : "=r" (cfg)
53 : "i" (ASI_SAFARI_CONFIG)); 53 : "i" (ASI_SAFARI_CONFIG));
54 return ((cfg >> 17) & 0x3ff); 54 return ((cfg >> 17) & 0x3ff);
55 } 55 }
56 } else if (this_is_starfire != 0) { 56 } else if (this_is_starfire != 0) {
57 return starfire_hard_smp_processor_id(); 57 return starfire_hard_smp_processor_id();
58 } else { 58 } else {
59 unsigned long upaconfig; 59 unsigned long upaconfig;
60 __asm__ __volatile__("ldxa [%%g0] %1, %0" 60 __asm__ __volatile__("ldxa [%%g0] %1, %0"
61 : "=r" (upaconfig) 61 : "=r" (upaconfig)
62 : "i" (ASI_UPA_CONFIG)); 62 : "i" (ASI_UPA_CONFIG));
63 return ((upaconfig >> 17) & 0x1f); 63 return ((upaconfig >> 17) & 0x1f);
64 } 64 }
65 } 65 }
66 66
67 #define smp_processor_id() (current_thread_info()->cpu) 67 #define raw_smp_processor_id() (current_thread_info()->cpu)
68 68
69 #endif /* !(__ASSEMBLY__) */ 69 #endif /* !(__ASSEMBLY__) */
70 70
71 #endif /* !(CONFIG_SMP) */ 71 #endif /* !(CONFIG_SMP) */
72 72
73 #define NO_PROC_ID 0xFF 73 #define NO_PROC_ID 0xFF
74 74
75 #endif /* !(_SPARC64_SMP_H) */ 75 #endif /* !(_SPARC64_SMP_H) */
76 76
include/asm-um/smp.h
1 #ifndef __UM_SMP_H 1 #ifndef __UM_SMP_H
2 #define __UM_SMP_H 2 #define __UM_SMP_H
3 3
4 #ifdef CONFIG_SMP 4 #ifdef CONFIG_SMP
5 5
6 #include "linux/config.h" 6 #include "linux/config.h"
7 #include "linux/bitops.h" 7 #include "linux/bitops.h"
8 #include "asm/current.h" 8 #include "asm/current.h"
9 #include "linux/cpumask.h" 9 #include "linux/cpumask.h"
10 10
11 #define smp_processor_id() (current_thread->cpu) 11 #define raw_smp_processor_id() (current_thread->cpu)
12
12 #define cpu_logical_map(n) (n) 13 #define cpu_logical_map(n) (n)
13 #define cpu_number_map(n) (n) 14 #define cpu_number_map(n) (n)
14 #define PROC_CHANGE_PENALTY 15 /* Pick a number, any number */ 15 #define PROC_CHANGE_PENALTY 15 /* Pick a number, any number */
15 extern int hard_smp_processor_id(void); 16 extern int hard_smp_processor_id(void);
16 #define NO_PROC_ID -1 17 #define NO_PROC_ID -1
17 18
18 extern int ncpus; 19 extern int ncpus;
19 20
20 21
21 extern inline void smp_cpus_done(unsigned int maxcpus) 22 extern inline void smp_cpus_done(unsigned int maxcpus)
22 { 23 {
23 } 24 }
24 25
25 #endif 26 #endif
26 27
27 #endif 28 #endif
28 29
include/asm-x86_64/smp.h
1 #ifndef __ASM_SMP_H 1 #ifndef __ASM_SMP_H
2 #define __ASM_SMP_H 2 #define __ASM_SMP_H
3 3
4 /* 4 /*
5 * We need the APIC definitions automatically as part of 'smp.h' 5 * We need the APIC definitions automatically as part of 'smp.h'
6 */ 6 */
7 #ifndef __ASSEMBLY__ 7 #ifndef __ASSEMBLY__
8 #include <linux/config.h> 8 #include <linux/config.h>
9 #include <linux/threads.h> 9 #include <linux/threads.h>
10 #include <linux/cpumask.h> 10 #include <linux/cpumask.h>
11 #include <linux/bitops.h> 11 #include <linux/bitops.h>
12 extern int disable_apic; 12 extern int disable_apic;
13 #endif 13 #endif
14 14
15 #ifdef CONFIG_X86_LOCAL_APIC 15 #ifdef CONFIG_X86_LOCAL_APIC
16 #ifndef __ASSEMBLY__ 16 #ifndef __ASSEMBLY__
17 #include <asm/fixmap.h> 17 #include <asm/fixmap.h>
18 #include <asm/mpspec.h> 18 #include <asm/mpspec.h>
19 #ifdef CONFIG_X86_IO_APIC 19 #ifdef CONFIG_X86_IO_APIC
20 #include <asm/io_apic.h> 20 #include <asm/io_apic.h>
21 #endif 21 #endif
22 #include <asm/apic.h> 22 #include <asm/apic.h>
23 #include <asm/thread_info.h> 23 #include <asm/thread_info.h>
24 #endif 24 #endif
25 #endif 25 #endif
26 26
27 #ifdef CONFIG_SMP 27 #ifdef CONFIG_SMP
28 #ifndef ASSEMBLY 28 #ifndef ASSEMBLY
29 29
30 #include <asm/pda.h> 30 #include <asm/pda.h>
31 31
32 struct pt_regs; 32 struct pt_regs;
33 33
34 extern cpumask_t cpu_present_mask; 34 extern cpumask_t cpu_present_mask;
35 extern cpumask_t cpu_possible_map; 35 extern cpumask_t cpu_possible_map;
36 extern cpumask_t cpu_online_map; 36 extern cpumask_t cpu_online_map;
37 extern cpumask_t cpu_callout_map; 37 extern cpumask_t cpu_callout_map;
38 38
39 /* 39 /*
40 * Private routines/data 40 * Private routines/data
41 */ 41 */
42 42
43 extern void smp_alloc_memory(void); 43 extern void smp_alloc_memory(void);
44 extern volatile unsigned long smp_invalidate_needed; 44 extern volatile unsigned long smp_invalidate_needed;
45 extern int pic_mode; 45 extern int pic_mode;
46 extern int smp_num_siblings; 46 extern int smp_num_siblings;
47 extern void smp_flush_tlb(void); 47 extern void smp_flush_tlb(void);
48 extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs); 48 extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs);
49 extern void smp_send_reschedule(int cpu); 49 extern void smp_send_reschedule(int cpu);
50 extern void smp_invalidate_rcv(void); /* Process an NMI */ 50 extern void smp_invalidate_rcv(void); /* Process an NMI */
51 extern void zap_low_mappings(void); 51 extern void zap_low_mappings(void);
52 void smp_stop_cpu(void); 52 void smp_stop_cpu(void);
53 extern cpumask_t cpu_sibling_map[NR_CPUS]; 53 extern cpumask_t cpu_sibling_map[NR_CPUS];
54 extern cpumask_t cpu_core_map[NR_CPUS]; 54 extern cpumask_t cpu_core_map[NR_CPUS];
55 extern u8 phys_proc_id[NR_CPUS]; 55 extern u8 phys_proc_id[NR_CPUS];
56 extern u8 cpu_core_id[NR_CPUS]; 56 extern u8 cpu_core_id[NR_CPUS];
57 57
58 #define SMP_TRAMPOLINE_BASE 0x6000 58 #define SMP_TRAMPOLINE_BASE 0x6000
59 59
60 /* 60 /*
61 * On x86 all CPUs are mapped 1:1 to the APIC space. 61 * On x86 all CPUs are mapped 1:1 to the APIC space.
62 * This simplifies scheduling and IPI sending and 62 * This simplifies scheduling and IPI sending and
63 * compresses data structures. 63 * compresses data structures.
64 */ 64 */
65 65
66 static inline int num_booting_cpus(void) 66 static inline int num_booting_cpus(void)
67 { 67 {
68 return cpus_weight(cpu_callout_map); 68 return cpus_weight(cpu_callout_map);
69 } 69 }
70 70
71 #define __smp_processor_id() read_pda(cpunumber) 71 #define raw_smp_processor_id() read_pda(cpunumber)
72 72
73 extern __inline int hard_smp_processor_id(void) 73 extern __inline int hard_smp_processor_id(void)
74 { 74 {
75 /* we don't want to mark this access volatile - bad code generation */ 75 /* we don't want to mark this access volatile - bad code generation */
76 return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID)); 76 return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
77 } 77 }
78 78
79 extern int safe_smp_processor_id(void); 79 extern int safe_smp_processor_id(void);
80 80
81 #endif /* !ASSEMBLY */ 81 #endif /* !ASSEMBLY */
82 82
83 #define NO_PROC_ID 0xFF /* No processor magic marker */ 83 #define NO_PROC_ID 0xFF /* No processor magic marker */
84 84
85 #endif 85 #endif
86 86
87 #ifndef ASSEMBLY 87 #ifndef ASSEMBLY
88 /* 88 /*
89 * Some lowlevel functions might want to know about 89 * Some lowlevel functions might want to know about
90 * the real APIC ID <-> CPU # mapping. 90 * the real APIC ID <-> CPU # mapping.
91 */ 91 */
92 extern u8 x86_cpu_to_apicid[NR_CPUS]; /* physical ID */ 92 extern u8 x86_cpu_to_apicid[NR_CPUS]; /* physical ID */
93 extern u8 x86_cpu_to_log_apicid[NR_CPUS]; 93 extern u8 x86_cpu_to_log_apicid[NR_CPUS];
94 extern u8 bios_cpu_apicid[]; 94 extern u8 bios_cpu_apicid[];
95 95
96 static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) 96 static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
97 { 97 {
98 return cpus_addr(cpumask)[0]; 98 return cpus_addr(cpumask)[0];
99 } 99 }
100 100
101 static inline int cpu_present_to_apicid(int mps_cpu) 101 static inline int cpu_present_to_apicid(int mps_cpu)
102 { 102 {
103 if (mps_cpu < NR_CPUS) 103 if (mps_cpu < NR_CPUS)
104 return (int)bios_cpu_apicid[mps_cpu]; 104 return (int)bios_cpu_apicid[mps_cpu];
105 else 105 else
106 return BAD_APICID; 106 return BAD_APICID;
107 } 107 }
108 108
109 #endif /* !ASSEMBLY */ 109 #endif /* !ASSEMBLY */
110 110
111 #ifndef CONFIG_SMP 111 #ifndef CONFIG_SMP
112 #define stack_smp_processor_id() 0 112 #define stack_smp_processor_id() 0
113 #define safe_smp_processor_id() 0 113 #define safe_smp_processor_id() 0
114 #define cpu_logical_map(x) (x) 114 #define cpu_logical_map(x) (x)
115 #else 115 #else
116 #include <asm/thread_info.h> 116 #include <asm/thread_info.h>
117 #define stack_smp_processor_id() \ 117 #define stack_smp_processor_id() \
118 ({ \ 118 ({ \
119 struct thread_info *ti; \ 119 struct thread_info *ti; \
120 __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \ 120 __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
121 ti->cpu; \ 121 ti->cpu; \
122 }) 122 })
123 #endif 123 #endif
124 124
125 #ifndef __ASSEMBLY__ 125 #ifndef __ASSEMBLY__
126 static __inline int logical_smp_processor_id(void) 126 static __inline int logical_smp_processor_id(void)
127 { 127 {
128 /* we don't want to mark this access volatile - bad code generation */ 128 /* we don't want to mark this access volatile - bad code generation */
129 return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR)); 129 return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
130 } 130 }
131 #endif 131 #endif
132 132
133 #endif 133 #endif
134 134
135 135
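stack_smp_processor_id() above recovers struct thread_info by masking %rsp, relying on thread_info sitting at the base of a THREAD_SIZE-aligned kernel stack. The standalone C below only illustrates that masking arithmetic; the stack size, the addresses and the EXAMPLE_* names are assumptions for the sketch, not the kernel's constants.

#include <stdio.h>
#include <stdint.h>

#define EXAMPLE_THREAD_SIZE     (8 * 1024UL)            /* assumed stack size */
#define EXAMPLE_CURRENT_MASK    (~(EXAMPLE_THREAD_SIZE - 1))

int main(void)
{
        uint64_t stack_base = 0xffff810012340000ULL;    /* made-up, THREAD_SIZE-aligned */
        uint64_t rsp        = stack_base + 0x1a38;      /* some address on that stack */
        uint64_t masked     = rsp & EXAMPLE_CURRENT_MASK;

        printf("rsp %#llx masks back to %#llx (base match: %d)\n",
               (unsigned long long)rsp, (unsigned long long)masked,
               masked == stack_base);
        return 0;
}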
include/linux/mmzone.h
1 #ifndef _LINUX_MMZONE_H 1 #ifndef _LINUX_MMZONE_H
2 #define _LINUX_MMZONE_H 2 #define _LINUX_MMZONE_H
3 3
4 #ifdef __KERNEL__ 4 #ifdef __KERNEL__
5 #ifndef __ASSEMBLY__ 5 #ifndef __ASSEMBLY__
6 6
7 #include <linux/config.h> 7 #include <linux/config.h>
8 #include <linux/spinlock.h> 8 #include <linux/spinlock.h>
9 #include <linux/list.h> 9 #include <linux/list.h>
10 #include <linux/wait.h> 10 #include <linux/wait.h>
11 #include <linux/cache.h> 11 #include <linux/cache.h>
12 #include <linux/threads.h> 12 #include <linux/threads.h>
13 #include <linux/numa.h> 13 #include <linux/numa.h>
14 #include <linux/init.h> 14 #include <linux/init.h>
15 #include <asm/atomic.h> 15 #include <asm/atomic.h>
16 16
17 /* Free memory management - zoned buddy allocator. */ 17 /* Free memory management - zoned buddy allocator. */
18 #ifndef CONFIG_FORCE_MAX_ZONEORDER 18 #ifndef CONFIG_FORCE_MAX_ZONEORDER
19 #define MAX_ORDER 11 19 #define MAX_ORDER 11
20 #else 20 #else
21 #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER 21 #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
22 #endif 22 #endif
23 23
24 struct free_area { 24 struct free_area {
25 struct list_head free_list; 25 struct list_head free_list;
26 unsigned long nr_free; 26 unsigned long nr_free;
27 }; 27 };
28 28
29 struct pglist_data; 29 struct pglist_data;
30 30
31 /* 31 /*
32 * zone->lock and zone->lru_lock are two of the hottest locks in the kernel. 32 * zone->lock and zone->lru_lock are two of the hottest locks in the kernel.
33 * So add a wild amount of padding here to ensure that they fall into separate 33 * So add a wild amount of padding here to ensure that they fall into separate
34 * cachelines. There are very few zone structures in the machine, so space 34 * cachelines. There are very few zone structures in the machine, so space
35 * consumption is not a concern here. 35 * consumption is not a concern here.
36 */ 36 */
37 #if defined(CONFIG_SMP) 37 #if defined(CONFIG_SMP)
38 struct zone_padding { 38 struct zone_padding {
39 char x[0]; 39 char x[0];
40 } ____cacheline_maxaligned_in_smp; 40 } ____cacheline_maxaligned_in_smp;
41 #define ZONE_PADDING(name) struct zone_padding name; 41 #define ZONE_PADDING(name) struct zone_padding name;
42 #else 42 #else
43 #define ZONE_PADDING(name) 43 #define ZONE_PADDING(name)
44 #endif 44 #endif
45 45
46 struct per_cpu_pages { 46 struct per_cpu_pages {
47 int count; /* number of pages in the list */ 47 int count; /* number of pages in the list */
48 int low; /* low watermark, refill needed */ 48 int low; /* low watermark, refill needed */
49 int high; /* high watermark, emptying needed */ 49 int high; /* high watermark, emptying needed */
50 int batch; /* chunk size for buddy add/remove */ 50 int batch; /* chunk size for buddy add/remove */
51 struct list_head list; /* the list of pages */ 51 struct list_head list; /* the list of pages */
52 }; 52 };
53 53
54 struct per_cpu_pageset { 54 struct per_cpu_pageset {
55 struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */ 55 struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */
56 #ifdef CONFIG_NUMA 56 #ifdef CONFIG_NUMA
57 unsigned long numa_hit; /* allocated in intended node */ 57 unsigned long numa_hit; /* allocated in intended node */
58 unsigned long numa_miss; /* allocated in non intended node */ 58 unsigned long numa_miss; /* allocated in non intended node */
59 unsigned long numa_foreign; /* was intended here, hit elsewhere */ 59 unsigned long numa_foreign; /* was intended here, hit elsewhere */
60 unsigned long interleave_hit; /* interleaver preferred this zone */ 60 unsigned long interleave_hit; /* interleaver preferred this zone */

61 unsigned long local_node; /* allocation from local node */ 61 unsigned long local_node; /* allocation from local node */
62 unsigned long other_node; /* allocation from other node */ 62 unsigned long other_node; /* allocation from other node */
63 #endif 63 #endif
64 } ____cacheline_aligned_in_smp; 64 } ____cacheline_aligned_in_smp;
65 65
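The count/low/high/batch fields above describe a simple hysteresis for the per-CPU page lists. Below is a hedged sketch of the policy those comments imply; the helper names are invented and the real logic, including the actual buddy-allocator calls, lives in mm/page_alloc.c.

/* Illustrative refill/drain policy only. */
static int example_needs_refill(struct per_cpu_pages *pcp)
{
        return pcp->count < pcp->low;   /* pull pcp->batch pages from the buddy lists */
}

static int example_needs_drain(struct per_cpu_pages *pcp)
{
        return pcp->count > pcp->high;  /* push pcp->batch pages back to the buddy lists */
}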
66 #define ZONE_DMA 0 66 #define ZONE_DMA 0
67 #define ZONE_NORMAL 1 67 #define ZONE_NORMAL 1
68 #define ZONE_HIGHMEM 2 68 #define ZONE_HIGHMEM 2
69 69
70 #define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT */ 70 #define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT */
71 #define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */ 71 #define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */
72 72
73 73
74 /* 74 /*
75 * When a memory allocation must conform to specific limitations (such 75 * When a memory allocation must conform to specific limitations (such
76 * as being suitable for DMA) the caller will pass in hints to the 76 * as being suitable for DMA) the caller will pass in hints to the
77 * allocator in the gfp_mask, in the zone modifier bits. These bits 77 * allocator in the gfp_mask, in the zone modifier bits. These bits
78 * are used to select a priority ordered list of memory zones which 78 * are used to select a priority ordered list of memory zones which
79 * match the requested limits. GFP_ZONEMASK defines which bits within 79 * match the requested limits. GFP_ZONEMASK defines which bits within
80 * the gfp_mask should be considered as zone modifiers. Each valid 80 * the gfp_mask should be considered as zone modifiers. Each valid
81 * combination of the zone modifier bits has a corresponding list 81 * combination of the zone modifier bits has a corresponding list
82 * of zones (in node_zonelists). Thus for two zone modifiers there 82 * of zones (in node_zonelists). Thus for two zone modifiers there
83 * will be a maximum of 4 (2 ** 2) zonelists, for 3 modifiers there will 83 * will be a maximum of 4 (2 ** 2) zonelists, for 3 modifiers there will
84 * be 8 (2 ** 3) zonelists. GFP_ZONETYPES defines the number of possible 84 * be 8 (2 ** 3) zonelists. GFP_ZONETYPES defines the number of possible
85 * combinations of zone modifiers in "zone modifier space". 85 * combinations of zone modifiers in "zone modifier space".
86 */ 86 */
87 #define GFP_ZONEMASK 0x03 87 #define GFP_ZONEMASK 0x03
88 /* 88 /*
89 * As an optimisation any zone modifier bits which are only valid when 89 * As an optimisation any zone modifier bits which are only valid when
90 * no other zone modifier bits are set (loners) should be placed in 90 * no other zone modifier bits are set (loners) should be placed in
91 * the highest order bits of this field. This allows us to reduce the 91 * the highest order bits of this field. This allows us to reduce the
92 * extent of the zonelists thus saving space. For example in the case 92 * extent of the zonelists thus saving space. For example in the case
93 * of three zone modifier bits, we could require up to eight zonelists. 93 * of three zone modifier bits, we could require up to eight zonelists.
94 * If the left most zone modifier is a "loner" then the highest valid 94 * If the left most zone modifier is a "loner" then the highest valid
95 * zonelist would be four allowing us to allocate only five zonelists. 95 * zonelist would be four allowing us to allocate only five zonelists.
96 * Use the first form when the left most bit is not a "loner", otherwise 96 * Use the first form when the left most bit is not a "loner", otherwise
97 * use the second. 97 * use the second.
98 */ 98 */
99 /* #define GFP_ZONETYPES (GFP_ZONEMASK + 1) */ /* Non-loner */ 99 /* #define GFP_ZONETYPES (GFP_ZONEMASK + 1) */ /* Non-loner */
100 #define GFP_ZONETYPES ((GFP_ZONEMASK + 1) / 2 + 1) /* Loner */ 100 #define GFP_ZONETYPES ((GFP_ZONEMASK + 1) / 2 + 1) /* Loner */
101 101
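As a quick worked check of the comment above: with the current two zone modifier bits (GFP_ZONEMASK = 0x03) the non-loner form would give 4 zonelists and the loner form gives (4 / 2) + 1 = 3; for the three-bit case mentioned in the comment the same formulas give 8 and 5. The standalone snippet below just prints both; the EX_ prefix marks it as an illustration, not kernel code.

#include <stdio.h>

#define EX_GFP_ZONEMASK 0x03    /* two zone modifier bits, as above */

int main(void)
{
        int non_loner = EX_GFP_ZONEMASK + 1;            /* 2^2 = 4 zonelists */
        int loner     = (EX_GFP_ZONEMASK + 1) / 2 + 1;  /* 4/2 + 1 = 3 zonelists */

        printf("non-loner form: %d, loner form: %d\n", non_loner, loner);
        return 0;
}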
102 /* 102 /*
103 * On machines where it is needed (eg PCs) we divide physical memory 103 * On machines where it is needed (eg PCs) we divide physical memory
104 * into multiple physical zones. On a PC we have 3 zones: 104 * into multiple physical zones. On a PC we have 3 zones:
105 * 105 *
106 * ZONE_DMA < 16 MB ISA DMA capable memory 106 * ZONE_DMA < 16 MB ISA DMA capable memory
107 * ZONE_NORMAL 16-896 MB direct mapped by the kernel 107 * ZONE_NORMAL 16-896 MB direct mapped by the kernel
108 * ZONE_HIGHMEM > 896 MB only page cache and user processes 108 * ZONE_HIGHMEM > 896 MB only page cache and user processes
109 */ 109 */
110 110
111 struct zone { 111 struct zone {
112 /* Fields commonly accessed by the page allocator */ 112 /* Fields commonly accessed by the page allocator */
113 unsigned long free_pages; 113 unsigned long free_pages;
114 unsigned long pages_min, pages_low, pages_high; 114 unsigned long pages_min, pages_low, pages_high;
115 /* 115 /*
116 * We don't know if the memory that we're going to allocate will be freeable 116 * We don't know if the memory that we're going to allocate will be freeable
 117 * and/or released eventually, so to avoid totally wasting several 117 * and/or released eventually, so to avoid totally wasting several
 118 * GB of RAM we must reserve some of the lower zone memory (otherwise we risk 118 * GB of RAM we must reserve some of the lower zone memory (otherwise we risk
 119 * running OOM on the lower zones despite tons of freeable RAM remaining 119 * running OOM on the lower zones despite tons of freeable RAM remaining
120 * on the higher zones). This array is recalculated at runtime if the 120 * on the higher zones). This array is recalculated at runtime if the
121 * sysctl_lowmem_reserve_ratio sysctl changes. 121 * sysctl_lowmem_reserve_ratio sysctl changes.
122 */ 122 */
123 unsigned long lowmem_reserve[MAX_NR_ZONES]; 123 unsigned long lowmem_reserve[MAX_NR_ZONES];
124 124
125 struct per_cpu_pageset pageset[NR_CPUS]; 125 struct per_cpu_pageset pageset[NR_CPUS];
126 126
127 /* 127 /*
128 * free areas of different sizes 128 * free areas of different sizes
129 */ 129 */
130 spinlock_t lock; 130 spinlock_t lock;
131 struct free_area free_area[MAX_ORDER]; 131 struct free_area free_area[MAX_ORDER];
132 132
133 133
134 ZONE_PADDING(_pad1_) 134 ZONE_PADDING(_pad1_)
135 135
136 /* Fields commonly accessed by the page reclaim scanner */ 136 /* Fields commonly accessed by the page reclaim scanner */
137 spinlock_t lru_lock; 137 spinlock_t lru_lock;
138 struct list_head active_list; 138 struct list_head active_list;
139 struct list_head inactive_list; 139 struct list_head inactive_list;
140 unsigned long nr_scan_active; 140 unsigned long nr_scan_active;
141 unsigned long nr_scan_inactive; 141 unsigned long nr_scan_inactive;
142 unsigned long nr_active; 142 unsigned long nr_active;
143 unsigned long nr_inactive; 143 unsigned long nr_inactive;
144 unsigned long pages_scanned; /* since last reclaim */ 144 unsigned long pages_scanned; /* since last reclaim */
145 int all_unreclaimable; /* All pages pinned */ 145 int all_unreclaimable; /* All pages pinned */
146 146
147 /* 147 /*
148 * prev_priority holds the scanning priority for this zone. It is 148 * prev_priority holds the scanning priority for this zone. It is
149 * defined as the scanning priority at which we achieved our reclaim 149 * defined as the scanning priority at which we achieved our reclaim
150 * target at the previous try_to_free_pages() or balance_pgdat() 150 * target at the previous try_to_free_pages() or balance_pgdat()
 151 * invocation. 151 * invocation.
152 * 152 *
153 * We use prev_priority as a measure of how much stress page reclaim is 153 * We use prev_priority as a measure of how much stress page reclaim is
154 * under - it drives the swappiness decision: whether to unmap mapped 154 * under - it drives the swappiness decision: whether to unmap mapped
155 * pages. 155 * pages.
156 * 156 *
157 * temp_priority is used to remember the scanning priority at which 157 * temp_priority is used to remember the scanning priority at which
158 * this zone was successfully refilled to free_pages == pages_high. 158 * this zone was successfully refilled to free_pages == pages_high.
159 * 159 *
160 * Access to both these fields is quite racy even on uniprocessor. But 160 * Access to both these fields is quite racy even on uniprocessor. But
161 * it is expected to average out OK. 161 * it is expected to average out OK.
162 */ 162 */
163 int temp_priority; 163 int temp_priority;
164 int prev_priority; 164 int prev_priority;
165 165
166 166
167 ZONE_PADDING(_pad2_) 167 ZONE_PADDING(_pad2_)
168 /* Rarely used or read-mostly fields */ 168 /* Rarely used or read-mostly fields */
169 169
170 /* 170 /*
171 * wait_table -- the array holding the hash table 171 * wait_table -- the array holding the hash table
172 * wait_table_size -- the size of the hash table array 172 * wait_table_size -- the size of the hash table array
173 * wait_table_bits -- wait_table_size == (1 << wait_table_bits) 173 * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
174 * 174 *
175 * The purpose of all these is to keep track of the people 175 * The purpose of all these is to keep track of the people
176 * waiting for a page to become available and make them 176 * waiting for a page to become available and make them
177 * runnable again when possible. The trouble is that this 177 * runnable again when possible. The trouble is that this
178 * consumes a lot of space, especially when so few things 178 * consumes a lot of space, especially when so few things
179 * wait on pages at a given time. So instead of using 179 * wait on pages at a given time. So instead of using
180 * per-page waitqueues, we use a waitqueue hash table. 180 * per-page waitqueues, we use a waitqueue hash table.
181 * 181 *
182 * The bucket discipline is to sleep on the same queue when 182 * The bucket discipline is to sleep on the same queue when
183 * colliding and wake all in that wait queue when removing. 183 * colliding and wake all in that wait queue when removing.
184 * When something wakes, it must check to be sure its page is 184 * When something wakes, it must check to be sure its page is
185 * truly available, a la thundering herd. The cost of a 185 * truly available, a la thundering herd. The cost of a
186 * collision is great, but given the expected load of the 186 * collision is great, but given the expected load of the
187 * table, they should be so rare as to be outweighed by the 187 * table, they should be so rare as to be outweighed by the
188 * benefits from the saved space. 188 * benefits from the saved space.
189 * 189 *
190 * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the 190 * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
191 * primary users of these fields, and in mm/page_alloc.c 191 * primary users of these fields, and in mm/page_alloc.c
192 * free_area_init_core() performs the initialization of them. 192 * free_area_init_core() performs the initialization of them.
193 */ 193 */
194 wait_queue_head_t * wait_table; 194 wait_queue_head_t * wait_table;
195 unsigned long wait_table_size; 195 unsigned long wait_table_size;
196 unsigned long wait_table_bits; 196 unsigned long wait_table_bits;
197 197
198 /* 198 /*
199 * Discontig memory support fields. 199 * Discontig memory support fields.
200 */ 200 */
201 struct pglist_data *zone_pgdat; 201 struct pglist_data *zone_pgdat;
202 struct page *zone_mem_map; 202 struct page *zone_mem_map;
203 /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ 203 /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
204 unsigned long zone_start_pfn; 204 unsigned long zone_start_pfn;
205 205
206 unsigned long spanned_pages; /* total size, including holes */ 206 unsigned long spanned_pages; /* total size, including holes */
207 unsigned long present_pages; /* amount of memory (excluding holes) */ 207 unsigned long present_pages; /* amount of memory (excluding holes) */
208 208
209 /* 209 /*
210 * rarely used fields: 210 * rarely used fields:
211 */ 211 */
212 char *name; 212 char *name;
213 } ____cacheline_maxaligned_in_smp; 213 } ____cacheline_maxaligned_in_smp;
214 214
215 215
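For illustration of the hashed wait table described in the struct zone comments above (wait_table, wait_table_size, wait_table_bits): the real lookup lives in mm/filemap.c, but the idea is simply to hash the page to one of the zone's buckets so that colliding pages share a queue. The sketch below is not the kernel's code; the function name and the multiplicative hash are stand-ins.

```c
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/wait.h>

/*
 * Illustration only: map a page to one of the zone's wait_table buckets.
 * wait_table_size is a power of two (1 << wait_table_bits), so masking
 * with size - 1 selects a bucket; colliding pages simply share it.
 */
static wait_queue_head_t *example_page_waitqueue(struct zone *zone,
						 struct page *page)
{
	unsigned long hash = (unsigned long)page * 2654435761UL;

	return &zone->wait_table[hash & (zone->wait_table_size - 1)];
}
```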
216 /* 216 /*
217 * The "priority" of VM scanning is how much of the queues we will scan in one 217 * The "priority" of VM scanning is how much of the queues we will scan in one
218 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the 218 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
219 * queues ("queue_length >> 12") during an aging round. 219 * queues ("queue_length >> 12") during an aging round.
220 */ 220 */
221 #define DEF_PRIORITY 12 221 #define DEF_PRIORITY 12
222 222
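A quick arithmetic illustration of the DEF_PRIORITY comment above: at priority 12 a queue of 4096 pages yields a scan target of one page, and each step down in priority doubles the target until priority 0 scans the whole queue. The helper below is made up for the example, not a kernel function.

```c
/*
 * Illustration only: number of pages the reclaim scanner would consider
 * from a queue of 'queue_length' pages at a given priority level.
 * queue_length >> DEF_PRIORITY == queue_length / 4096.
 */
static inline unsigned long example_scan_target(unsigned long queue_length,
						int priority)
{
	return queue_length >> priority;
}
```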
223 /* 223 /*
224 * One allocation request operates on a zonelist. A zonelist 224 * One allocation request operates on a zonelist. A zonelist
225 * is a list of zones, the first one is the 'goal' of the 225 * is a list of zones, the first one is the 'goal' of the
226 * allocation, the other zones are fallback zones, in decreasing 226 * allocation, the other zones are fallback zones, in decreasing
227 * priority. 227 * priority.
228 * 228 *
229 * Right now a zonelist takes up less than a cacheline. We never 229 * Right now a zonelist takes up less than a cacheline. We never
230 * modify it apart from boot-up, and only a few indices are used, 230 * modify it apart from boot-up, and only a few indices are used,
231 * so despite the zonelist table being relatively big, the cache 231 * so despite the zonelist table being relatively big, the cache
232 * footprint of this construct is very small. 232 * footprint of this construct is very small.
233 */ 233 */
234 struct zonelist { 234 struct zonelist {
235 struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited 235 struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited
236 }; 236 };
237 237
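Because the zones[] array is NULL-terminated, a caller just walks it in fallback order until it hits the sentinel. The helper below is a simplified sketch of that walk (the real allocator loop is in mm/page_alloc.c; the function name is invented).

```c
#include <linux/mmzone.h>

/*
 * Illustration only: return the first zone in the zonelist that still has
 * free pages, or NULL if every fallback zone is exhausted.
 */
static struct zone *example_first_usable_zone(struct zonelist *zonelist)
{
	int i;

	for (i = 0; zonelist->zones[i] != NULL; i++)
		if (zonelist->zones[i]->free_pages)
			return zonelist->zones[i];
	return NULL;
}
```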
238 238
239 /* 239 /*
240 * The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM 240 * The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM
241 * (mostly NUMA machines?) to denote a higher-level memory zone than the 241 * (mostly NUMA machines?) to denote a higher-level memory zone than the
242 * zone denotes. 242 * zone denotes.
243 * 243 *
244 * On NUMA machines, each NUMA node would have a pg_data_t to describe 244 * On NUMA machines, each NUMA node would have a pg_data_t to describe
245 * its memory layout. 245 * its memory layout.
246 * 246 *
247 * Memory statistics and page replacement data structures are maintained on a 247 * Memory statistics and page replacement data structures are maintained on a
248 * per-zone basis. 248 * per-zone basis.
249 */ 249 */
250 struct bootmem_data; 250 struct bootmem_data;
251 typedef struct pglist_data { 251 typedef struct pglist_data {
252 struct zone node_zones[MAX_NR_ZONES]; 252 struct zone node_zones[MAX_NR_ZONES];
253 struct zonelist node_zonelists[GFP_ZONETYPES]; 253 struct zonelist node_zonelists[GFP_ZONETYPES];
254 int nr_zones; 254 int nr_zones;
255 struct page *node_mem_map; 255 struct page *node_mem_map;
256 struct bootmem_data *bdata; 256 struct bootmem_data *bdata;
257 unsigned long node_start_pfn; 257 unsigned long node_start_pfn;
258 unsigned long node_present_pages; /* total number of physical pages */ 258 unsigned long node_present_pages; /* total number of physical pages */
259 unsigned long node_spanned_pages; /* total size of physical page 259 unsigned long node_spanned_pages; /* total size of physical page
260 range, including holes */ 260 range, including holes */
261 int node_id; 261 int node_id;
262 struct pglist_data *pgdat_next; 262 struct pglist_data *pgdat_next;
263 wait_queue_head_t kswapd_wait; 263 wait_queue_head_t kswapd_wait;
264 struct task_struct *kswapd; 264 struct task_struct *kswapd;
265 int kswapd_max_order; 265 int kswapd_max_order;
266 } pg_data_t; 266 } pg_data_t;
267 267
268 #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) 268 #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
269 #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) 269 #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages)
270 270
271 extern struct pglist_data *pgdat_list; 271 extern struct pglist_data *pgdat_list;
272 272
273 void __get_zone_counts(unsigned long *active, unsigned long *inactive, 273 void __get_zone_counts(unsigned long *active, unsigned long *inactive,
274 unsigned long *free, struct pglist_data *pgdat); 274 unsigned long *free, struct pglist_data *pgdat);
275 void get_zone_counts(unsigned long *active, unsigned long *inactive, 275 void get_zone_counts(unsigned long *active, unsigned long *inactive,
276 unsigned long *free); 276 unsigned long *free);
277 void build_all_zonelists(void); 277 void build_all_zonelists(void);
278 void wakeup_kswapd(struct zone *zone, int order); 278 void wakeup_kswapd(struct zone *zone, int order);
279 int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 279 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
280 int alloc_type, int can_try_harder, int gfp_high); 280 int alloc_type, int can_try_harder, int gfp_high);
281 281
282 #ifdef CONFIG_HAVE_MEMORY_PRESENT 282 #ifdef CONFIG_HAVE_MEMORY_PRESENT
283 void memory_present(int nid, unsigned long start, unsigned long end); 283 void memory_present(int nid, unsigned long start, unsigned long end);
284 #else 284 #else
285 static inline void memory_present(int nid, unsigned long start, unsigned long end) {} 285 static inline void memory_present(int nid, unsigned long start, unsigned long end) {}
286 #endif 286 #endif
287 287
288 #ifdef CONFIG_NEED_NODE_MEMMAP_SIZE 288 #ifdef CONFIG_NEED_NODE_MEMMAP_SIZE
289 unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); 289 unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long);
290 #endif 290 #endif
291 291
292 /* 292 /*
293 * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. 293 * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
294 */ 294 */
295 #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) 295 #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
296 296
297 /** 297 /**
298 * for_each_pgdat - helper macro to iterate over all nodes 298 * for_each_pgdat - helper macro to iterate over all nodes
299 * @pgdat - pointer to a pg_data_t variable 299 * @pgdat - pointer to a pg_data_t variable
300 * 300 *
301 * Meant to help with common loops of the form 301 * Meant to help with common loops of the form
302 * pgdat = pgdat_list; 302 * pgdat = pgdat_list;
303 * while(pgdat) { 303 * while(pgdat) {
304 * ... 304 * ...
305 * pgdat = pgdat->pgdat_next; 305 * pgdat = pgdat->pgdat_next;
306 * } 306 * }
307 */ 307 */
308 #define for_each_pgdat(pgdat) \ 308 #define for_each_pgdat(pgdat) \
309 for (pgdat = pgdat_list; pgdat; pgdat = pgdat->pgdat_next) 309 for (pgdat = pgdat_list; pgdat; pgdat = pgdat->pgdat_next)
310 310
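As a usage sketch of for_each_pgdat(): summing the per-node page counts looks like the helper below (the function name is made up for the example).

```c
#include <linux/mmzone.h>

/*
 * Illustration only: total up node_present_pages over every node by
 * walking the pgdat list with for_each_pgdat().
 */
static unsigned long example_total_present_pages(void)
{
	struct pglist_data *pgdat;
	unsigned long total = 0;

	for_each_pgdat(pgdat)
		total += pgdat->node_present_pages;

	return total;
}
```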
311 /* 311 /*
312 * next_zone - helper magic for for_each_zone() 312 * next_zone - helper magic for for_each_zone()
313 * Thanks to William Lee Irwin III for this piece of ingenuity. 313 * Thanks to William Lee Irwin III for this piece of ingenuity.
314 */ 314 */
315 static inline struct zone *next_zone(struct zone *zone) 315 static inline struct zone *next_zone(struct zone *zone)
316 { 316 {
317 pg_data_t *pgdat = zone->zone_pgdat; 317 pg_data_t *pgdat = zone->zone_pgdat;
318 318
319 if (zone < pgdat->node_zones + MAX_NR_ZONES - 1) 319 if (zone < pgdat->node_zones + MAX_NR_ZONES - 1)
320 zone++; 320 zone++;
321 else if (pgdat->pgdat_next) { 321 else if (pgdat->pgdat_next) {
322 pgdat = pgdat->pgdat_next; 322 pgdat = pgdat->pgdat_next;
323 zone = pgdat->node_zones; 323 zone = pgdat->node_zones;
324 } else 324 } else
325 zone = NULL; 325 zone = NULL;
326 326
327 return zone; 327 return zone;
328 } 328 }
329 329
330 /** 330 /**
331 * for_each_zone - helper macro to iterate over all memory zones 331 * for_each_zone - helper macro to iterate over all memory zones
332 * @zone - pointer to struct zone variable 332 * @zone - pointer to struct zone variable
333 * 333 *
334 * The user only needs to declare the zone variable, for_each_zone 334 * The user only needs to declare the zone variable, for_each_zone
335 * fills it in. This basically means for_each_zone() is an 335 * fills it in. This basically means for_each_zone() is an
336 * easier to read version of this piece of code: 336 * easier to read version of this piece of code:
337 * 337 *
338 * for (pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next) 338 * for (pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next)
339 * for (i = 0; i < MAX_NR_ZONES; ++i) { 339 * for (i = 0; i < MAX_NR_ZONES; ++i) {
340 * struct zone * z = pgdat->node_zones + i; 340 * struct zone * z = pgdat->node_zones + i;
341 * ... 341 * ...
342 * } 342 * }
343 * } 343 * }
344 */ 344 */
345 #define for_each_zone(zone) \ 345 #define for_each_zone(zone) \
346 for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone)) 346 for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone))
347 347
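A corresponding usage sketch for for_each_zone(): the macro hides the pgdat-crossing logic of next_zone(), so a caller only declares the zone pointer. The function name and printk format below are made up for the example.

```c
#include <linux/kernel.h>
#include <linux/mmzone.h>

/*
 * Illustration only: walk every zone in the system and report its size,
 * letting for_each_zone() cross pgdat boundaries transparently.
 */
static void example_dump_zone_sizes(void)
{
	struct zone *zone;

	for_each_zone(zone)
		printk(KERN_DEBUG "%s: spanned %lu, present %lu pages\n",
		       zone->name, zone->spanned_pages, zone->present_pages);
}
```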
348 static inline int is_highmem_idx(int idx) 348 static inline int is_highmem_idx(int idx)
349 { 349 {
350 return (idx == ZONE_HIGHMEM); 350 return (idx == ZONE_HIGHMEM);
351 } 351 }
352 352
353 static inline int is_normal_idx(int idx) 353 static inline int is_normal_idx(int idx)
354 { 354 {
355 return (idx == ZONE_NORMAL); 355 return (idx == ZONE_NORMAL);
356 } 356 }
357 /** 357 /**
358 * is_highmem - helper function to quickly check if a struct zone is a 358 * is_highmem - helper function to quickly check if a struct zone is a
359 * highmem zone or not. This is an attempt to keep references 359 * highmem zone or not. This is an attempt to keep references
360 * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. 360 * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum.
361 * @zone - pointer to struct zone variable 361 * @zone - pointer to struct zone variable
362 */ 362 */
363 static inline int is_highmem(struct zone *zone) 363 static inline int is_highmem(struct zone *zone)
364 { 364 {
365 return zone == zone->zone_pgdat->node_zones + ZONE_HIGHMEM; 365 return zone == zone->zone_pgdat->node_zones + ZONE_HIGHMEM;
366 } 366 }
367 367
368 static inline int is_normal(struct zone *zone) 368 static inline int is_normal(struct zone *zone)
369 { 369 {
370 return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL; 370 return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL;
371 } 371 }
372 372
373 /* These two functions are used to setup the per zone pages min values */ 373 /* These two functions are used to setup the per zone pages min values */
374 struct ctl_table; 374 struct ctl_table;
375 struct file; 375 struct file;
376 int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *, 376 int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *,
377 void __user *, size_t *, loff_t *); 377 void __user *, size_t *, loff_t *);
378 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1]; 378 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
379 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *, 379 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
380 void __user *, size_t *, loff_t *); 380 void __user *, size_t *, loff_t *);
381 381
382 #include <linux/topology.h> 382 #include <linux/topology.h>
383 /* Returns the number of the current Node. */ 383 /* Returns the number of the current Node. */
384 #define numa_node_id() (cpu_to_node(_smp_processor_id())) 384 #define numa_node_id() (cpu_to_node(raw_smp_processor_id()))
385 385
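Note that numa_node_id() is now built on raw_smp_processor_id(), so the caller decides how precise it needs to be: an allocation hint can tolerate the task migrating mid-lookup, while anything that must match the CPU actually executing should hold preemption off. A small sketch (function name invented) of the strict variant:

```c
#include <linux/kernel.h>
#include <linux/mmzone.h>
#include <linux/preempt.h>

/*
 * Illustration only: the reported node is guaranteed to be the executing
 * CPU's node only while preemption stays disabled.
 */
static void example_report_current_node(void)
{
	preempt_disable();
	printk(KERN_DEBUG "running on node %d\n", numa_node_id());
	preempt_enable();
}
```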
386 #ifndef CONFIG_DISCONTIGMEM 386 #ifndef CONFIG_DISCONTIGMEM
387 387
388 extern struct pglist_data contig_page_data; 388 extern struct pglist_data contig_page_data;
389 #define NODE_DATA(nid) (&contig_page_data) 389 #define NODE_DATA(nid) (&contig_page_data)
390 #define NODE_MEM_MAP(nid) mem_map 390 #define NODE_MEM_MAP(nid) mem_map
391 #define MAX_NODES_SHIFT 1 391 #define MAX_NODES_SHIFT 1
392 #define pfn_to_nid(pfn) (0) 392 #define pfn_to_nid(pfn) (0)
393 393
394 #else /* CONFIG_DISCONTIGMEM */ 394 #else /* CONFIG_DISCONTIGMEM */
395 395
396 #include <asm/mmzone.h> 396 #include <asm/mmzone.h>
397 397
398 #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED) 398 #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED)
399 /* 399 /*
400 * with 32 bit page->flags field, we reserve 8 bits for node/zone info. 400 * with 32 bit page->flags field, we reserve 8 bits for node/zone info.
401 * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes. 401 * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes.
402 */ 402 */
403 #define MAX_NODES_SHIFT 6 403 #define MAX_NODES_SHIFT 6
404 #elif BITS_PER_LONG == 64 404 #elif BITS_PER_LONG == 64
405 /* 405 /*
406 * with 64 bit flags field, there's plenty of room. 406 * with 64 bit flags field, there's plenty of room.
407 */ 407 */
408 #define MAX_NODES_SHIFT 10 408 #define MAX_NODES_SHIFT 10
409 #endif 409 #endif
410 410
411 #endif /* !CONFIG_DISCONTIGMEM */ 411 #endif /* !CONFIG_DISCONTIGMEM */
412 412
413 #if NODES_SHIFT > MAX_NODES_SHIFT 413 #if NODES_SHIFT > MAX_NODES_SHIFT
414 #error NODES_SHIFT > MAX_NODES_SHIFT 414 #error NODES_SHIFT > MAX_NODES_SHIFT
415 #endif 415 #endif
416 416
417 /* There are currently 3 zones: DMA, Normal & Highmem, thus we need 2 bits */ 417 /* There are currently 3 zones: DMA, Normal & Highmem, thus we need 2 bits */
418 #define MAX_ZONES_SHIFT 2 418 #define MAX_ZONES_SHIFT 2
419 419
420 #if ZONES_SHIFT > MAX_ZONES_SHIFT 420 #if ZONES_SHIFT > MAX_ZONES_SHIFT
421 #error ZONES_SHIFT > MAX_ZONES_SHIFT 421 #error ZONES_SHIFT > MAX_ZONES_SHIFT
422 #endif 422 #endif
423 423
424 #endif /* !__ASSEMBLY__ */ 424 #endif /* !__ASSEMBLY__ */
425 #endif /* __KERNEL__ */ 425 #endif /* __KERNEL__ */
426 #endif /* _LINUX_MMZONE_H */ 426 #endif /* _LINUX_MMZONE_H */
427 427
1 #ifndef __LINUX_SMP_H 1 #ifndef __LINUX_SMP_H
2 #define __LINUX_SMP_H 2 #define __LINUX_SMP_H
3 3
4 /* 4 /*
5 * Generic SMP support 5 * Generic SMP support
6 * Alan Cox. <alan@redhat.com> 6 * Alan Cox. <alan@redhat.com>
7 */ 7 */
8 8
9 #include <linux/config.h> 9 #include <linux/config.h>
10 10
11 extern void cpu_idle(void); 11 extern void cpu_idle(void);
12 12
13 #ifdef CONFIG_SMP 13 #ifdef CONFIG_SMP
14 14
15 #include <linux/preempt.h> 15 #include <linux/preempt.h>
16 #include <linux/kernel.h> 16 #include <linux/kernel.h>
17 #include <linux/compiler.h> 17 #include <linux/compiler.h>
18 #include <linux/thread_info.h> 18 #include <linux/thread_info.h>
19 #include <asm/smp.h> 19 #include <asm/smp.h>
20 #include <asm/bug.h> 20 #include <asm/bug.h>
21 21
22 /* 22 /*
23 * main cross-CPU interfaces, handles INIT, TLB flush, STOP, etc. 23 * main cross-CPU interfaces, handles INIT, TLB flush, STOP, etc.
24 * (defined in asm header): 24 * (defined in asm header):
25 */ 25 */
26 26
27 /* 27 /*
28 * stops all CPUs but the current one: 28 * stops all CPUs but the current one:
29 */ 29 */
30 extern void smp_send_stop(void); 30 extern void smp_send_stop(void);
31 31
32 /* 32 /*
33 * sends a 'reschedule' event to another CPU: 33 * sends a 'reschedule' event to another CPU:
34 */ 34 */
35 extern void smp_send_reschedule(int cpu); 35 extern void smp_send_reschedule(int cpu);
36 36
37 37
38 /* 38 /*
39 * Prepare machine for booting other CPUs. 39 * Prepare machine for booting other CPUs.
40 */ 40 */
41 extern void smp_prepare_cpus(unsigned int max_cpus); 41 extern void smp_prepare_cpus(unsigned int max_cpus);
42 42
43 /* 43 /*
44 * Bring a CPU up 44 * Bring a CPU up
45 */ 45 */
46 extern int __cpu_up(unsigned int cpunum); 46 extern int __cpu_up(unsigned int cpunum);
47 47
48 /* 48 /*
49 * Final polishing of CPUs 49 * Final polishing of CPUs
50 */ 50 */
51 extern void smp_cpus_done(unsigned int max_cpus); 51 extern void smp_cpus_done(unsigned int max_cpus);
52 52
53 /* 53 /*
54 * Call a function on all other processors 54 * Call a function on all other processors
55 */ 55 */
56 extern int smp_call_function (void (*func) (void *info), void *info, 56 extern int smp_call_function (void (*func) (void *info), void *info,
57 int retry, int wait); 57 int retry, int wait);
58 58
59 /* 59 /*
60 * Call a function on all processors 60 * Call a function on all processors
61 */ 61 */
62 static inline int on_each_cpu(void (*func) (void *info), void *info, 62 static inline int on_each_cpu(void (*func) (void *info), void *info,
63 int retry, int wait) 63 int retry, int wait)
64 { 64 {
65 int ret = 0; 65 int ret = 0;
66 66
67 preempt_disable(); 67 preempt_disable();
68 ret = smp_call_function(func, info, retry, wait); 68 ret = smp_call_function(func, info, retry, wait);
69 func(info); 69 func(info);
70 preempt_enable(); 70 preempt_enable();
71 return ret; 71 return ret;
72 } 72 }
73 73
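A typical on_each_cpu() caller passes a small worker and lets the helper run it both remotely and locally; it disables preemption around the local call itself, and wait=1 makes it return only after every CPU has finished. The sketch below uses invented function names and assumes process context with interrupts enabled.

```c
#include <linux/kernel.h>
#include <linux/smp.h>

/* Illustration only: print a line from every CPU, including the caller's.
 * smp_processor_id() is safe here because the worker runs either with
 * preemption disabled (locally) or in interrupt context (remotely). */
static void example_say_hello(void *info)
{
	printk(KERN_DEBUG "hello from CPU %d\n", smp_processor_id());
}

static void example_greet_all_cpus(void)
{
	on_each_cpu(example_say_hello, NULL, 0, 1);
}
```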
74 #define MSG_ALL_BUT_SELF 0x8000 /* Assume <32768 CPU's */ 74 #define MSG_ALL_BUT_SELF 0x8000 /* Assume <32768 CPU's */
75 #define MSG_ALL 0x8001 75 #define MSG_ALL 0x8001
76 76
77 #define MSG_INVALIDATE_TLB 0x0001 /* Remote processor TLB invalidate */ 77 #define MSG_INVALIDATE_TLB 0x0001 /* Remote processor TLB invalidate */
78 #define MSG_STOP_CPU 0x0002 /* Sent to shut down slave CPU's 78 #define MSG_STOP_CPU 0x0002 /* Sent to shut down slave CPU's
79 * when rebooting 79 * when rebooting
80 */ 80 */
81 #define MSG_RESCHEDULE 0x0003 /* Reschedule request from master CPU*/ 81 #define MSG_RESCHEDULE 0x0003 /* Reschedule request from master CPU*/
82 #define MSG_CALL_FUNCTION 0x0004 /* Call function on all other CPUs */ 82 #define MSG_CALL_FUNCTION 0x0004 /* Call function on all other CPUs */
83 83
84 /* 84 /*
85 * Mark the boot cpu "online" so that it can call console drivers in 85 * Mark the boot cpu "online" so that it can call console drivers in
86 * printk() and can access its per-cpu storage. 86 * printk() and can access its per-cpu storage.
87 */ 87 */
88 void smp_prepare_boot_cpu(void); 88 void smp_prepare_boot_cpu(void);
89 89
90 #else /* !SMP */ 90 #else /* !SMP */
91 91
92 /* 92 /*
93 * These macros fold the SMP functionality into a single CPU system 93 * These macros fold the SMP functionality into a single CPU system
94 */ 94 */
95 95 #define raw_smp_processor_id() 0
96 #if !defined(__smp_processor_id) || !defined(CONFIG_PREEMPT)
97 # define smp_processor_id() 0
98 #endif
99 #define hard_smp_processor_id() 0 96 #define hard_smp_processor_id() 0
100 #define smp_call_function(func,info,retry,wait) ({ 0; }) 97 #define smp_call_function(func,info,retry,wait) ({ 0; })
101 #define on_each_cpu(func,info,retry,wait) ({ func(info); 0; }) 98 #define on_each_cpu(func,info,retry,wait) ({ func(info); 0; })
102 static inline void smp_send_reschedule(int cpu) { } 99 static inline void smp_send_reschedule(int cpu) { }
103 #define num_booting_cpus() 1 100 #define num_booting_cpus() 1
104 #define smp_prepare_boot_cpu() do {} while (0) 101 #define smp_prepare_boot_cpu() do {} while (0)
105 102
106 #endif /* !SMP */ 103 #endif /* !SMP */
107 104
108 /* 105 /*
109 * DEBUG_PREEMPT support: check whether smp_processor_id() is being 106 * smp_processor_id(): get the current CPU ID.
110 * used in a preemption-safe way.
111 * 107 *
112 * An architecture has to enable this debugging code explicitly. 108 * if DEBUG_PREEMPT is enabled then we check whether it is
113 * It can do so by renaming the smp_processor_id() macro to 109 * used in a preemption-safe way. (smp_processor_id() is safe
114 * __smp_processor_id(). This should only be done after some minimal 110 * if it's used in a preemption-off critical section, or in
115 * testing, because usually there are a number of false positives 111 * a thread that is bound to the current CPU.)
116 * that an architecture will trigger.
117 * 112 *
118 * To fix a false positive (i.e. smp_processor_id() use that the 113 * NOTE: raw_smp_processor_id() is for internal use only
119 * debugging code reports but which use for some reason is legal), 114 * (smp_processor_id() is the preferred variant), but in rare
120 * change the smp_processor_id() reference to _smp_processor_id(), 115 * instances it might also be used to turn off false positives
121 * which is the nondebug variant. NOTE: don't use this to hack around 116 * (i.e. smp_processor_id() use that the debugging code reports but
122 * real bugs. 117 * which use for some reason is legal). Don't use this to hack around
118 * the warning message, as your code might not work under PREEMPT.
123 */ 119 */
124 #ifdef __smp_processor_id 120 #ifdef CONFIG_DEBUG_PREEMPT
125 # if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) 121 extern unsigned int debug_smp_processor_id(void);
126 extern unsigned int smp_processor_id(void); 122 # define smp_processor_id() debug_smp_processor_id()
127 # else
128 # define smp_processor_id() __smp_processor_id()
129 # endif
130 # define _smp_processor_id() __smp_processor_id()
131 #else 123 #else
132 # define _smp_processor_id() smp_processor_id() 124 # define smp_processor_id() raw_smp_processor_id()
133 #endif 125 #endif
134 126
135 #define get_cpu() ({ preempt_disable(); smp_processor_id(); }) 127 #define get_cpu() ({ preempt_disable(); smp_processor_id(); })
136 #define put_cpu() preempt_enable() 128 #define put_cpu() preempt_enable()
137 #define put_cpu_no_resched() preempt_enable_no_resched() 129 #define put_cpu_no_resched() preempt_enable_no_resched()
138 130
139 #endif /* __LINUX_SMP_H */ 131 #endif /* __LINUX_SMP_H */
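The practical upshot of the new smp.h interface: code that genuinely needs to stay on one CPU pins itself with get_cpu()/put_cpu() (or an explicit preempt_disable()) and uses smp_processor_id(), while purely statistical per-CPU accesses use raw_smp_processor_id() and accept the rare migration race. A sketch of both patterns, with invented names and a per-CPU counter declared via <linux/percpu.h>:

```c
#include <linux/percpu.h>
#include <linux/smp.h>

static DEFINE_PER_CPU(unsigned long, example_counter);

/*
 * Pattern 1: we must stay on this CPU for the duration, so preemption is
 * disabled.  smp_processor_id() is legal here and DEBUG_PREEMPT stays quiet.
 */
static void example_count_strict(void)
{
	int cpu = get_cpu();	/* preempt_disable() + smp_processor_id() */

	per_cpu(example_counter, cpu)++;
	put_cpu();		/* preempt_enable() */
}

/*
 * Pattern 2: a statistical counter where being migrated between reading the
 * CPU number and doing the increment is harmless.  raw_smp_processor_id()
 * documents that the race is understood and accepted.
 */
static void example_count_relaxed(void)
{
	per_cpu(example_counter, raw_smp_processor_id())++;
}
```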
1 /* 1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX 2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket 3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level. 4 * interface as the means of communication with the user level.
5 * 5 *
6 * Definitions for the IP router. 6 * Definitions for the IP router.
7 * 7 *
8 * Version: @(#)route.h 1.0.4 05/27/93 8 * Version: @(#)route.h 1.0.4 05/27/93
9 * 9 *
10 * Authors: Ross Biro 10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Fixes: 12 * Fixes:
13 * Alan Cox : Reformatted. Added ip_rt_local() 13 * Alan Cox : Reformatted. Added ip_rt_local()
14 * Alan Cox : Support for TCP parameters. 14 * Alan Cox : Support for TCP parameters.
15 * Alexey Kuznetsov: Major changes for new routing code. 15 * Alexey Kuznetsov: Major changes for new routing code.
16 * Mike McLagan : Routing by source 16 * Mike McLagan : Routing by source
17 * Robert Olsson : Added rt_cache statistics 17 * Robert Olsson : Added rt_cache statistics
18 * 18 *
19 * This program is free software; you can redistribute it and/or 19 * This program is free software; you can redistribute it and/or
20 * modify it under the terms of the GNU General Public License 20 * modify it under the terms of the GNU General Public License
21 * as published by the Free Software Foundation; either version 21 * as published by the Free Software Foundation; either version
22 * 2 of the License, or (at your option) any later version. 22 * 2 of the License, or (at your option) any later version.
23 */ 23 */
24 #ifndef _ROUTE_H 24 #ifndef _ROUTE_H
25 #define _ROUTE_H 25 #define _ROUTE_H
26 26
27 #include <linux/config.h> 27 #include <linux/config.h>
28 #include <net/dst.h> 28 #include <net/dst.h>
29 #include <net/inetpeer.h> 29 #include <net/inetpeer.h>
30 #include <net/flow.h> 30 #include <net/flow.h>
31 #include <linux/in_route.h> 31 #include <linux/in_route.h>
32 #include <linux/rtnetlink.h> 32 #include <linux/rtnetlink.h>
33 #include <linux/route.h> 33 #include <linux/route.h>
34 #include <linux/ip.h> 34 #include <linux/ip.h>
35 #include <linux/cache.h> 35 #include <linux/cache.h>
36 36
37 #ifndef __KERNEL__ 37 #ifndef __KERNEL__
38 #warning This file is not supposed to be used outside of kernel. 38 #warning This file is not supposed to be used outside of kernel.
39 #endif 39 #endif
40 40
41 #define RTO_ONLINK 0x01 41 #define RTO_ONLINK 0x01
42 42
43 #define RTO_CONN 0 43 #define RTO_CONN 0
44 /* RTO_CONN is not used (being alias for 0), but preserved not to break 44 /* RTO_CONN is not used (being alias for 0), but preserved not to break
45 * some modules referring to it. */ 45 * some modules referring to it. */
46 46
47 #define RT_CONN_FLAGS(sk) (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE)) 47 #define RT_CONN_FLAGS(sk) (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE))
48 48
49 struct fib_nh; 49 struct fib_nh;
50 struct inet_peer; 50 struct inet_peer;
51 struct rtable 51 struct rtable
52 { 52 {
53 union 53 union
54 { 54 {
55 struct dst_entry dst; 55 struct dst_entry dst;
56 struct rtable *rt_next; 56 struct rtable *rt_next;
57 } u; 57 } u;
58 58
59 struct in_device *idev; 59 struct in_device *idev;
60 60
61 unsigned rt_flags; 61 unsigned rt_flags;
62 __u16 rt_type; 62 __u16 rt_type;
63 __u16 rt_multipath_alg; 63 __u16 rt_multipath_alg;
64 64
65 __u32 rt_dst; /* Path destination */ 65 __u32 rt_dst; /* Path destination */
66 __u32 rt_src; /* Path source */ 66 __u32 rt_src; /* Path source */
67 int rt_iif; 67 int rt_iif;
68 68
69 /* Info on neighbour */ 69 /* Info on neighbour */
70 __u32 rt_gateway; 70 __u32 rt_gateway;
71 71
72 /* Cache lookup keys */ 72 /* Cache lookup keys */
73 struct flowi fl; 73 struct flowi fl;
74 74
75 /* Miscellaneous cached information */ 75 /* Miscellaneous cached information */
76 __u32 rt_spec_dst; /* RFC1122 specific destination */ 76 __u32 rt_spec_dst; /* RFC1122 specific destination */
77 struct inet_peer *peer; /* long-living peer info */ 77 struct inet_peer *peer; /* long-living peer info */
78 }; 78 };
79 79
80 struct ip_rt_acct 80 struct ip_rt_acct
81 { 81 {
82 __u32 o_bytes; 82 __u32 o_bytes;
83 __u32 o_packets; 83 __u32 o_packets;
84 __u32 i_bytes; 84 __u32 i_bytes;
85 __u32 i_packets; 85 __u32 i_packets;
86 }; 86 };
87 87
88 struct rt_cache_stat 88 struct rt_cache_stat
89 { 89 {
90 unsigned int in_hit; 90 unsigned int in_hit;
91 unsigned int in_slow_tot; 91 unsigned int in_slow_tot;
92 unsigned int in_slow_mc; 92 unsigned int in_slow_mc;
93 unsigned int in_no_route; 93 unsigned int in_no_route;
94 unsigned int in_brd; 94 unsigned int in_brd;
95 unsigned int in_martian_dst; 95 unsigned int in_martian_dst;
96 unsigned int in_martian_src; 96 unsigned int in_martian_src;
97 unsigned int out_hit; 97 unsigned int out_hit;
98 unsigned int out_slow_tot; 98 unsigned int out_slow_tot;
99 unsigned int out_slow_mc; 99 unsigned int out_slow_mc;
100 unsigned int gc_total; 100 unsigned int gc_total;
101 unsigned int gc_ignored; 101 unsigned int gc_ignored;
102 unsigned int gc_goal_miss; 102 unsigned int gc_goal_miss;
103 unsigned int gc_dst_overflow; 103 unsigned int gc_dst_overflow;
104 unsigned int in_hlist_search; 104 unsigned int in_hlist_search;
105 unsigned int out_hlist_search; 105 unsigned int out_hlist_search;
106 }; 106 };
107 107
108 extern struct rt_cache_stat *rt_cache_stat; 108 extern struct rt_cache_stat *rt_cache_stat;
109 #define RT_CACHE_STAT_INC(field) \ 109 #define RT_CACHE_STAT_INC(field) \
110 (per_cpu_ptr(rt_cache_stat, _smp_processor_id())->field++) 110 (per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++)
111 111
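RT_CACHE_STAT_INC() is a good example of the intended raw_smp_processor_id() use: the counters are approximate per-CPU statistics, so under PREEMPT an occasional increment of a neighbouring CPU's counter is an accepted, harmless race. A minimal usage sketch (the wrapper function is invented; real callers are inline in net/ipv4/route.c):

```c
#include <net/route.h>

/* Illustration only: bump a per-CPU routing cache statistic. */
static void example_note_route_cache_hit(void)
{
	RT_CACHE_STAT_INC(in_hit);
}
```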
112 extern struct ip_rt_acct *ip_rt_acct; 112 extern struct ip_rt_acct *ip_rt_acct;
113 113
114 struct in_device; 114 struct in_device;
115 extern int ip_rt_init(void); 115 extern int ip_rt_init(void);
116 extern void ip_rt_redirect(u32 old_gw, u32 dst, u32 new_gw, 116 extern void ip_rt_redirect(u32 old_gw, u32 dst, u32 new_gw,
117 u32 src, u8 tos, struct net_device *dev); 117 u32 src, u8 tos, struct net_device *dev);
118 extern void ip_rt_advice(struct rtable **rp, int advice); 118 extern void ip_rt_advice(struct rtable **rp, int advice);
119 extern void rt_cache_flush(int how); 119 extern void rt_cache_flush(int how);
120 extern int __ip_route_output_key(struct rtable **, const struct flowi *flp); 120 extern int __ip_route_output_key(struct rtable **, const struct flowi *flp);
121 extern int ip_route_output_key(struct rtable **, struct flowi *flp); 121 extern int ip_route_output_key(struct rtable **, struct flowi *flp);
122 extern int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags); 122 extern int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags);
123 extern int ip_route_input(struct sk_buff*, u32 dst, u32 src, u8 tos, struct net_device *devin); 123 extern int ip_route_input(struct sk_buff*, u32 dst, u32 src, u8 tos, struct net_device *devin);
124 extern unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu); 124 extern unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu);
125 extern void ip_rt_send_redirect(struct sk_buff *skb); 125 extern void ip_rt_send_redirect(struct sk_buff *skb);
126 126
127 extern unsigned inet_addr_type(u32 addr); 127 extern unsigned inet_addr_type(u32 addr);
128 extern void ip_rt_multicast_event(struct in_device *); 128 extern void ip_rt_multicast_event(struct in_device *);
129 extern int ip_rt_ioctl(unsigned int cmd, void __user *arg); 129 extern int ip_rt_ioctl(unsigned int cmd, void __user *arg);
130 extern void ip_rt_get_source(u8 *src, struct rtable *rt); 130 extern void ip_rt_get_source(u8 *src, struct rtable *rt);
131 extern int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb); 131 extern int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb);
132 132
133 static inline void ip_rt_put(struct rtable * rt) 133 static inline void ip_rt_put(struct rtable * rt)
134 { 134 {
135 if (rt) 135 if (rt)
136 dst_release(&rt->u.dst); 136 dst_release(&rt->u.dst);
137 } 137 }
138 138
139 #define IPTOS_RT_MASK (IPTOS_TOS_MASK & ~3) 139 #define IPTOS_RT_MASK (IPTOS_TOS_MASK & ~3)
140 140
141 extern __u8 ip_tos2prio[16]; 141 extern __u8 ip_tos2prio[16];
142 142
143 static inline char rt_tos2priority(u8 tos) 143 static inline char rt_tos2priority(u8 tos)
144 { 144 {
145 return ip_tos2prio[IPTOS_TOS(tos)>>1]; 145 return ip_tos2prio[IPTOS_TOS(tos)>>1];
146 } 146 }
147 147
148 static inline int ip_route_connect(struct rtable **rp, u32 dst, 148 static inline int ip_route_connect(struct rtable **rp, u32 dst,
149 u32 src, u32 tos, int oif, u8 protocol, 149 u32 src, u32 tos, int oif, u8 protocol,
150 u16 sport, u16 dport, struct sock *sk) 150 u16 sport, u16 dport, struct sock *sk)
151 { 151 {
152 struct flowi fl = { .oif = oif, 152 struct flowi fl = { .oif = oif,
153 .nl_u = { .ip4_u = { .daddr = dst, 153 .nl_u = { .ip4_u = { .daddr = dst,
154 .saddr = src, 154 .saddr = src,
155 .tos = tos } }, 155 .tos = tos } },
156 .proto = protocol, 156 .proto = protocol,
157 .uli_u = { .ports = 157 .uli_u = { .ports =
158 { .sport = sport, 158 { .sport = sport,
159 .dport = dport } } }; 159 .dport = dport } } };
160 160
161 int err; 161 int err;
162 if (!dst || !src) { 162 if (!dst || !src) {
163 err = __ip_route_output_key(rp, &fl); 163 err = __ip_route_output_key(rp, &fl);
164 if (err) 164 if (err)
165 return err; 165 return err;
166 fl.fl4_dst = (*rp)->rt_dst; 166 fl.fl4_dst = (*rp)->rt_dst;
167 fl.fl4_src = (*rp)->rt_src; 167 fl.fl4_src = (*rp)->rt_src;
168 ip_rt_put(*rp); 168 ip_rt_put(*rp);
169 *rp = NULL; 169 *rp = NULL;
170 } 170 }
171 return ip_route_output_flow(rp, &fl, sk, 0); 171 return ip_route_output_flow(rp, &fl, sk, 0);
172 } 172 }
173 173
174 static inline int ip_route_newports(struct rtable **rp, u16 sport, u16 dport, 174 static inline int ip_route_newports(struct rtable **rp, u16 sport, u16 dport,
175 struct sock *sk) 175 struct sock *sk)
176 { 176 {
177 if (sport != (*rp)->fl.fl_ip_sport || 177 if (sport != (*rp)->fl.fl_ip_sport ||
178 dport != (*rp)->fl.fl_ip_dport) { 178 dport != (*rp)->fl.fl_ip_dport) {
179 struct flowi fl; 179 struct flowi fl;
180 180
181 memcpy(&fl, &(*rp)->fl, sizeof(fl)); 181 memcpy(&fl, &(*rp)->fl, sizeof(fl));
182 fl.fl_ip_sport = sport; 182 fl.fl_ip_sport = sport;
183 fl.fl_ip_dport = dport; 183 fl.fl_ip_dport = dport;
184 ip_rt_put(*rp); 184 ip_rt_put(*rp);
185 *rp = NULL; 185 *rp = NULL;
186 return ip_route_output_flow(rp, &fl, sk, 0); 186 return ip_route_output_flow(rp, &fl, sk, 0);
187 } 187 }
188 return 0; 188 return 0;
189 } 189 }
190 190
191 extern void rt_bind_peer(struct rtable *rt, int create); 191 extern void rt_bind_peer(struct rtable *rt, int create);
192 192
193 static inline struct inet_peer *rt_get_peer(struct rtable *rt) 193 static inline struct inet_peer *rt_get_peer(struct rtable *rt)
194 { 194 {
195 if (rt->peer) 195 if (rt->peer)
196 return rt->peer; 196 return rt->peer;
197 197
198 rt_bind_peer(rt, 0); 198 rt_bind_peer(rt, 0);
199 return rt->peer; 199 return rt->peer;
200 } 200 }
201 201
202 #endif /* _ROUTE_H */ 202 #endif /* _ROUTE_H */
203 203
1 /* 1 /*
2 * 2 *
3 * SNMP MIB entries for the IP subsystem. 3 * SNMP MIB entries for the IP subsystem.
4 * 4 *
5 * Alan Cox <gw4pts@gw4pts.ampr.org> 5 * Alan Cox <gw4pts@gw4pts.ampr.org>
6 * 6 *
7 * We don't choose to implement SNMP in the kernel (this would 7 * We don't choose to implement SNMP in the kernel (this would
8 * be silly as SNMP is a pain in the backside in places). We do 8 * be silly as SNMP is a pain in the backside in places). We do
9 * however need to collect the MIB statistics and export them 9 * however need to collect the MIB statistics and export them
10 * out of /proc (eventually) 10 * out of /proc (eventually)
11 * 11 *
12 * This program is free software; you can redistribute it and/or 12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License 13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version. 15 * 2 of the License, or (at your option) any later version.
16 * 16 *
17 * $Id: snmp.h,v 1.19 2001/06/14 13:40:46 davem Exp $ 17 * $Id: snmp.h,v 1.19 2001/06/14 13:40:46 davem Exp $
18 * 18 *
19 */ 19 */
20 20
21 #ifndef _SNMP_H 21 #ifndef _SNMP_H
22 #define _SNMP_H 22 #define _SNMP_H
23 23
24 #include <linux/cache.h> 24 #include <linux/cache.h>
25 #include <linux/snmp.h> 25 #include <linux/snmp.h>
26 26
27 /* 27 /*
28 * Mibs are stored in array of unsigned long. 28 * Mibs are stored in array of unsigned long.
29 */ 29 */
30 /* 30 /*
31 * struct snmp_mib{} 31 * struct snmp_mib{}
32 * - list of entries for particular API (such as /proc/net/snmp) 32 * - list of entries for particular API (such as /proc/net/snmp)
33 * - name of entries. 33 * - name of entries.
34 */ 34 */
35 struct snmp_mib { 35 struct snmp_mib {
36 char *name; 36 char *name;
37 int entry; 37 int entry;
38 }; 38 };
39 39
40 #define SNMP_MIB_ITEM(_name,_entry) { \ 40 #define SNMP_MIB_ITEM(_name,_entry) { \
41 .name = _name, \ 41 .name = _name, \
42 .entry = _entry, \ 42 .entry = _entry, \
43 } 43 }
44 44
45 #define SNMP_MIB_SENTINEL { \ 45 #define SNMP_MIB_SENTINEL { \
46 .name = NULL, \ 46 .name = NULL, \
47 .entry = 0, \ 47 .entry = 0, \
48 } 48 }
49 49
50 /* 50 /*
51 * We use all unsigned longs. Linux will soon be so reliable that even 51 * We use all unsigned longs. Linux will soon be so reliable that even
52 * these will rapidly get too small 8-). Seriously consider the IpInReceives 52 * these will rapidly get too small 8-). Seriously consider the IpInReceives
53 * count on the 20Gb/s + networks people expect in a few years time! 53 * count on the 20Gb/s + networks people expect in a few years time!
54 */ 54 */
55 55
56 /* 56 /*
57 * The rule for padding: 57 * The rule for padding:
58 * Best is power of two because then the right structure can be found by a 58 * Best is power of two because then the right structure can be found by a
59 * simple shift. The structure should be always cache line aligned. 59 * simple shift. The structure should be always cache line aligned.
60 * gcc needs n=alignto(cachelinesize, popcnt(sizeof(bla_mib))) shift/add 60 * gcc needs n=alignto(cachelinesize, popcnt(sizeof(bla_mib))) shift/add
61 * instructions to emulate multiply in case it is not power-of-two. 61 * instructions to emulate multiply in case it is not power-of-two.
62 * Currently n is always <=3 for all sizes so simple cache line alignment 62 * Currently n is always <=3 for all sizes so simple cache line alignment
63 * is enough. 63 * is enough.
64 * 64 *
65 * The best solution would be a global CPU local area , especially on 64 65 * The best solution would be a global CPU local area , especially on 64
66 * and 128byte cacheline machine it makes a *lot* of sense -AK 66 * and 128byte cacheline machine it makes a *lot* of sense -AK
67 */ 67 */
68 68
69 #define __SNMP_MIB_ALIGN__ ____cacheline_aligned 69 #define __SNMP_MIB_ALIGN__ ____cacheline_aligned
70 70
71 /* IPstats */ 71 /* IPstats */
72 #define IPSTATS_MIB_MAX __IPSTATS_MIB_MAX 72 #define IPSTATS_MIB_MAX __IPSTATS_MIB_MAX
73 struct ipstats_mib { 73 struct ipstats_mib {
74 unsigned long mibs[IPSTATS_MIB_MAX]; 74 unsigned long mibs[IPSTATS_MIB_MAX];
75 } __SNMP_MIB_ALIGN__; 75 } __SNMP_MIB_ALIGN__;
76 76
77 /* ICMP */ 77 /* ICMP */
78 #define ICMP_MIB_DUMMY __ICMP_MIB_MAX 78 #define ICMP_MIB_DUMMY __ICMP_MIB_MAX
79 #define ICMP_MIB_MAX (__ICMP_MIB_MAX + 1) 79 #define ICMP_MIB_MAX (__ICMP_MIB_MAX + 1)
80 80
81 struct icmp_mib { 81 struct icmp_mib {
82 unsigned long mibs[ICMP_MIB_MAX]; 82 unsigned long mibs[ICMP_MIB_MAX];
83 } __SNMP_MIB_ALIGN__; 83 } __SNMP_MIB_ALIGN__;
84 84
85 /* ICMP6 (IPv6-ICMP) */ 85 /* ICMP6 (IPv6-ICMP) */
86 #define ICMP6_MIB_MAX __ICMP6_MIB_MAX 86 #define ICMP6_MIB_MAX __ICMP6_MIB_MAX
87 struct icmpv6_mib { 87 struct icmpv6_mib {
88 unsigned long mibs[ICMP6_MIB_MAX]; 88 unsigned long mibs[ICMP6_MIB_MAX];
89 } __SNMP_MIB_ALIGN__; 89 } __SNMP_MIB_ALIGN__;
90 90
91 /* TCP */ 91 /* TCP */
92 #define TCP_MIB_MAX __TCP_MIB_MAX 92 #define TCP_MIB_MAX __TCP_MIB_MAX
93 struct tcp_mib { 93 struct tcp_mib {
94 unsigned long mibs[TCP_MIB_MAX]; 94 unsigned long mibs[TCP_MIB_MAX];
95 } __SNMP_MIB_ALIGN__; 95 } __SNMP_MIB_ALIGN__;
96 96
97 /* UDP */ 97 /* UDP */
98 #define UDP_MIB_MAX __UDP_MIB_MAX 98 #define UDP_MIB_MAX __UDP_MIB_MAX
99 struct udp_mib { 99 struct udp_mib {
100 unsigned long mibs[UDP_MIB_MAX]; 100 unsigned long mibs[UDP_MIB_MAX];
101 } __SNMP_MIB_ALIGN__; 101 } __SNMP_MIB_ALIGN__;
102 102
103 /* SCTP */ 103 /* SCTP */
104 #define SCTP_MIB_MAX __SCTP_MIB_MAX 104 #define SCTP_MIB_MAX __SCTP_MIB_MAX
105 struct sctp_mib { 105 struct sctp_mib {
106 unsigned long mibs[SCTP_MIB_MAX]; 106 unsigned long mibs[SCTP_MIB_MAX];
107 } __SNMP_MIB_ALIGN__; 107 } __SNMP_MIB_ALIGN__;
108 108
109 /* Linux */ 109 /* Linux */
110 #define LINUX_MIB_MAX __LINUX_MIB_MAX 110 #define LINUX_MIB_MAX __LINUX_MIB_MAX
111 struct linux_mib { 111 struct linux_mib {
112 unsigned long mibs[LINUX_MIB_MAX]; 112 unsigned long mibs[LINUX_MIB_MAX];
113 }; 113 };
114 114
115 115
116 /* 116 /*
117 * FIXME: On x86 and some other CPUs the split into user and softirq parts 117 * FIXME: On x86 and some other CPUs the split into user and softirq parts
118 * is not needed because addl $1,memory is atomic against interrupts (but 118 * is not needed because addl $1,memory is atomic against interrupts (but
119 * atomic_inc would be overkill because of the lock cycles). Wants new 119 * atomic_inc would be overkill because of the lock cycles). Wants new
120 * nonlocked_atomic_inc() primitives -AK 120 * nonlocked_atomic_inc() primitives -AK
121 */ 121 */
122 #define DEFINE_SNMP_STAT(type, name) \ 122 #define DEFINE_SNMP_STAT(type, name) \
123 __typeof__(type) *name[2] 123 __typeof__(type) *name[2]
124 #define DECLARE_SNMP_STAT(type, name) \ 124 #define DECLARE_SNMP_STAT(type, name) \
125 extern __typeof__(type) *name[2] 125 extern __typeof__(type) *name[2]
126 126
127 #define SNMP_STAT_BHPTR(name) (name[0]) 127 #define SNMP_STAT_BHPTR(name) (name[0])
128 #define SNMP_STAT_USRPTR(name) (name[1]) 128 #define SNMP_STAT_USRPTR(name) (name[1])
129 129
130 #define SNMP_INC_STATS_BH(mib, field) \ 130 #define SNMP_INC_STATS_BH(mib, field) \
131 (per_cpu_ptr(mib[0], _smp_processor_id())->mibs[field]++) 131 (per_cpu_ptr(mib[0], raw_smp_processor_id())->mibs[field]++)
132 #define SNMP_INC_STATS_OFFSET_BH(mib, field, offset) \ 132 #define SNMP_INC_STATS_OFFSET_BH(mib, field, offset) \
133 (per_cpu_ptr(mib[0], _smp_processor_id())->mibs[field + (offset)]++) 133 (per_cpu_ptr(mib[0], raw_smp_processor_id())->mibs[field + (offset)]++)
134 #define SNMP_INC_STATS_USER(mib, field) \ 134 #define SNMP_INC_STATS_USER(mib, field) \
135 (per_cpu_ptr(mib[1], _smp_processor_id())->mibs[field]++) 135 (per_cpu_ptr(mib[1], raw_smp_processor_id())->mibs[field]++)
136 #define SNMP_INC_STATS(mib, field) \ 136 #define SNMP_INC_STATS(mib, field) \
137 (per_cpu_ptr(mib[!in_softirq()], _smp_processor_id())->mibs[field]++) 137 (per_cpu_ptr(mib[!in_softirq()], raw_smp_processor_id())->mibs[field]++)
138 #define SNMP_DEC_STATS(mib, field) \ 138 #define SNMP_DEC_STATS(mib, field) \
139 (per_cpu_ptr(mib[!in_softirq()], _smp_processor_id())->mibs[field]--) 139 (per_cpu_ptr(mib[!in_softirq()], raw_smp_processor_id())->mibs[field]--)
140 #define SNMP_ADD_STATS_BH(mib, field, addend) \ 140 #define SNMP_ADD_STATS_BH(mib, field, addend) \
141 (per_cpu_ptr(mib[0], _smp_processor_id())->mibs[field] += addend) 141 (per_cpu_ptr(mib[0], raw_smp_processor_id())->mibs[field] += addend)
142 #define SNMP_ADD_STATS_USER(mib, field, addend) \ 142 #define SNMP_ADD_STATS_USER(mib, field, addend) \
143 (per_cpu_ptr(mib[1], _smp_processor_id())->mibs[field] += addend) 143 (per_cpu_ptr(mib[1], raw_smp_processor_id())->mibs[field] += addend)
144 144
145 #endif 145 #endif
146 146
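The same "statistics may be slightly off" reasoning applies to these MIB macros, which is why they can index the per-CPU copy with raw_smp_processor_id(). The sketch below declares a private MIB and bumps it from softirq and process context; the MIB name and functions are invented, and allocation of the two per-CPU copies (normally done with alloc_percpu() at init time) is omitted.

```c
#include <linux/snmp.h>
#include <net/snmp.h>

/*
 * Illustration only: index [0] is the softirq copy, [1] the user-context
 * copy, so the _BH and _USER variants never collide on the same counter.
 */
static DEFINE_SNMP_STAT(struct udp_mib, example_udp_stats);

static void example_count_in_softirq(void)
{
	SNMP_INC_STATS_BH(example_udp_stats, UDP_MIB_INDATAGRAMS);
}

static void example_count_in_user_context(void)
{
	SNMP_INC_STATS_USER(example_udp_stats, UDP_MIB_OUTDATAGRAMS);
}
```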
1 /* Rewritten by Rusty Russell, on the backs of many others... 1 /* Rewritten by Rusty Russell, on the backs of many others...
2 Copyright (C) 2002 Richard Henderson 2 Copyright (C) 2002 Richard Henderson
3 Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. 3 Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
4 4
5 This program is free software; you can redistribute it and/or modify 5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by 6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2 of the License, or 7 the Free Software Foundation; either version 2 of the License, or
8 (at your option) any later version. 8 (at your option) any later version.
9 9
10 This program is distributed in the hope that it will be useful, 10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details. 13 GNU General Public License for more details.
14 14
15 You should have received a copy of the GNU General Public License 15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software 16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */ 18 */
19 #include <linux/config.h> 19 #include <linux/config.h>
20 #include <linux/module.h> 20 #include <linux/module.h>
21 #include <linux/moduleloader.h> 21 #include <linux/moduleloader.h>
22 #include <linux/init.h> 22 #include <linux/init.h>
23 #include <linux/slab.h> 23 #include <linux/slab.h>
24 #include <linux/vmalloc.h> 24 #include <linux/vmalloc.h>
25 #include <linux/elf.h> 25 #include <linux/elf.h>
26 #include <linux/seq_file.h> 26 #include <linux/seq_file.h>
27 #include <linux/syscalls.h> 27 #include <linux/syscalls.h>
28 #include <linux/fcntl.h> 28 #include <linux/fcntl.h>
29 #include <linux/rcupdate.h> 29 #include <linux/rcupdate.h>
30 #include <linux/cpu.h> 30 #include <linux/cpu.h>
31 #include <linux/moduleparam.h> 31 #include <linux/moduleparam.h>
32 #include <linux/errno.h> 32 #include <linux/errno.h>
33 #include <linux/err.h> 33 #include <linux/err.h>
34 #include <linux/vermagic.h> 34 #include <linux/vermagic.h>
35 #include <linux/notifier.h> 35 #include <linux/notifier.h>
36 #include <linux/stop_machine.h> 36 #include <linux/stop_machine.h>
37 #include <linux/device.h> 37 #include <linux/device.h>
38 #include <asm/uaccess.h> 38 #include <asm/uaccess.h>
39 #include <asm/semaphore.h> 39 #include <asm/semaphore.h>
40 #include <asm/cacheflush.h> 40 #include <asm/cacheflush.h>
41 41
42 #if 0 42 #if 0
43 #define DEBUGP printk 43 #define DEBUGP printk
44 #else 44 #else
45 #define DEBUGP(fmt , a...) 45 #define DEBUGP(fmt , a...)
46 #endif 46 #endif
47 47
48 #ifndef ARCH_SHF_SMALL 48 #ifndef ARCH_SHF_SMALL
49 #define ARCH_SHF_SMALL 0 49 #define ARCH_SHF_SMALL 0
50 #endif 50 #endif
51 51
52 /* If this is set, the section belongs in the init part of the module */ 52 /* If this is set, the section belongs in the init part of the module */
53 #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) 53 #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
54 54
55 /* Protects module list */ 55 /* Protects module list */
56 static DEFINE_SPINLOCK(modlist_lock); 56 static DEFINE_SPINLOCK(modlist_lock);
57 57
58 /* List of modules, protected by module_mutex AND modlist_lock */ 58 /* List of modules, protected by module_mutex AND modlist_lock */
59 static DECLARE_MUTEX(module_mutex); 59 static DECLARE_MUTEX(module_mutex);
60 static LIST_HEAD(modules); 60 static LIST_HEAD(modules);
61 61
62 static DECLARE_MUTEX(notify_mutex); 62 static DECLARE_MUTEX(notify_mutex);
63 static struct notifier_block * module_notify_list; 63 static struct notifier_block * module_notify_list;
64 64
65 int register_module_notifier(struct notifier_block * nb) 65 int register_module_notifier(struct notifier_block * nb)
66 { 66 {
67 int err; 67 int err;
68 down(&notify_mutex); 68 down(&notify_mutex);
69 err = notifier_chain_register(&module_notify_list, nb); 69 err = notifier_chain_register(&module_notify_list, nb);
70 up(&notify_mutex); 70 up(&notify_mutex);
71 return err; 71 return err;
72 } 72 }
73 EXPORT_SYMBOL(register_module_notifier); 73 EXPORT_SYMBOL(register_module_notifier);
74 74
75 int unregister_module_notifier(struct notifier_block * nb) 75 int unregister_module_notifier(struct notifier_block * nb)
76 { 76 {
77 int err; 77 int err;
78 down(&notify_mutex); 78 down(&notify_mutex);
79 err = notifier_chain_unregister(&module_notify_list, nb); 79 err = notifier_chain_unregister(&module_notify_list, nb);
80 up(&notify_mutex); 80 up(&notify_mutex);
81 return err; 81 return err;
82 } 82 }
83 EXPORT_SYMBOL(unregister_module_notifier); 83 EXPORT_SYMBOL(unregister_module_notifier);
84 84
85 /* We require a truly strong try_module_get() */ 85 /* We require a truly strong try_module_get() */
86 static inline int strong_try_module_get(struct module *mod) 86 static inline int strong_try_module_get(struct module *mod)
87 { 87 {
88 if (mod && mod->state == MODULE_STATE_COMING) 88 if (mod && mod->state == MODULE_STATE_COMING)
89 return 0; 89 return 0;
90 return try_module_get(mod); 90 return try_module_get(mod);
91 } 91 }
92 92
93 /* A thread that wants to hold a reference to a module only while it 93 /* A thread that wants to hold a reference to a module only while it
94 * is running can call this to safely exit. 94 * is running can call this to safely exit.
95 * nfsd and lockd use this. 95 * nfsd and lockd use this.
96 */ 96 */
97 void __module_put_and_exit(struct module *mod, long code) 97 void __module_put_and_exit(struct module *mod, long code)
98 { 98 {
99 module_put(mod); 99 module_put(mod);
100 do_exit(code); 100 do_exit(code);
101 } 101 }
102 EXPORT_SYMBOL(__module_put_and_exit); 102 EXPORT_SYMBOL(__module_put_and_exit);
103 103
104 /* Find a module section: 0 means not found. */ 104 /* Find a module section: 0 means not found. */
105 static unsigned int find_sec(Elf_Ehdr *hdr, 105 static unsigned int find_sec(Elf_Ehdr *hdr,
106 Elf_Shdr *sechdrs, 106 Elf_Shdr *sechdrs,
107 const char *secstrings, 107 const char *secstrings,
108 const char *name) 108 const char *name)
109 { 109 {
110 unsigned int i; 110 unsigned int i;
111 111
112 for (i = 1; i < hdr->e_shnum; i++) 112 for (i = 1; i < hdr->e_shnum; i++)
113 /* Alloc bit cleared means "ignore it." */ 113 /* Alloc bit cleared means "ignore it." */
114 if ((sechdrs[i].sh_flags & SHF_ALLOC) 114 if ((sechdrs[i].sh_flags & SHF_ALLOC)
115 && strcmp(secstrings+sechdrs[i].sh_name, name) == 0) 115 && strcmp(secstrings+sechdrs[i].sh_name, name) == 0)
116 return i; 116 return i;
117 return 0; 117 return 0;
118 } 118 }
119 119
120 /* Provided by the linker */ 120 /* Provided by the linker */
121 extern const struct kernel_symbol __start___ksymtab[]; 121 extern const struct kernel_symbol __start___ksymtab[];
122 extern const struct kernel_symbol __stop___ksymtab[]; 122 extern const struct kernel_symbol __stop___ksymtab[];
123 extern const struct kernel_symbol __start___ksymtab_gpl[]; 123 extern const struct kernel_symbol __start___ksymtab_gpl[];
124 extern const struct kernel_symbol __stop___ksymtab_gpl[]; 124 extern const struct kernel_symbol __stop___ksymtab_gpl[];
125 extern const unsigned long __start___kcrctab[]; 125 extern const unsigned long __start___kcrctab[];
126 extern const unsigned long __start___kcrctab_gpl[]; 126 extern const unsigned long __start___kcrctab_gpl[];
127 127
128 #ifndef CONFIG_MODVERSIONS 128 #ifndef CONFIG_MODVERSIONS
129 #define symversion(base, idx) NULL 129 #define symversion(base, idx) NULL
130 #else 130 #else
131 #define symversion(base, idx) ((base) ? ((base) + (idx)) : NULL) 131 #define symversion(base, idx) ((base) ? ((base) + (idx)) : NULL)
132 #endif 132 #endif
133 133
134 /* Find a symbol, return value, crc and module which owns it */ 134 /* Find a symbol, return value, crc and module which owns it */
135 static unsigned long __find_symbol(const char *name, 135 static unsigned long __find_symbol(const char *name,
136 struct module **owner, 136 struct module **owner,
137 const unsigned long **crc, 137 const unsigned long **crc,
138 int gplok) 138 int gplok)
139 { 139 {
140 struct module *mod; 140 struct module *mod;
141 unsigned int i; 141 unsigned int i;
142 142
143 /* Core kernel first. */ 143 /* Core kernel first. */
144 *owner = NULL; 144 *owner = NULL;
145 for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++) { 145 for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++) {
146 if (strcmp(__start___ksymtab[i].name, name) == 0) { 146 if (strcmp(__start___ksymtab[i].name, name) == 0) {
147 *crc = symversion(__start___kcrctab, i); 147 *crc = symversion(__start___kcrctab, i);
148 return __start___ksymtab[i].value; 148 return __start___ksymtab[i].value;
149 } 149 }
150 } 150 }
151 if (gplok) { 151 if (gplok) {
152 for (i = 0; __start___ksymtab_gpl+i<__stop___ksymtab_gpl; i++) 152 for (i = 0; __start___ksymtab_gpl+i<__stop___ksymtab_gpl; i++)
153 if (strcmp(__start___ksymtab_gpl[i].name, name) == 0) { 153 if (strcmp(__start___ksymtab_gpl[i].name, name) == 0) {
154 *crc = symversion(__start___kcrctab_gpl, i); 154 *crc = symversion(__start___kcrctab_gpl, i);
155 return __start___ksymtab_gpl[i].value; 155 return __start___ksymtab_gpl[i].value;
156 } 156 }
157 } 157 }
158 158
159 /* Now try modules. */ 159 /* Now try modules. */
160 list_for_each_entry(mod, &modules, list) { 160 list_for_each_entry(mod, &modules, list) {
161 *owner = mod; 161 *owner = mod;
162 for (i = 0; i < mod->num_syms; i++) 162 for (i = 0; i < mod->num_syms; i++)
163 if (strcmp(mod->syms[i].name, name) == 0) { 163 if (strcmp(mod->syms[i].name, name) == 0) {
164 *crc = symversion(mod->crcs, i); 164 *crc = symversion(mod->crcs, i);
165 return mod->syms[i].value; 165 return mod->syms[i].value;
166 } 166 }
167 167
168 if (gplok) { 168 if (gplok) {
169 for (i = 0; i < mod->num_gpl_syms; i++) { 169 for (i = 0; i < mod->num_gpl_syms; i++) {
170 if (strcmp(mod->gpl_syms[i].name, name) == 0) { 170 if (strcmp(mod->gpl_syms[i].name, name) == 0) {
171 *crc = symversion(mod->gpl_crcs, i); 171 *crc = symversion(mod->gpl_crcs, i);
172 return mod->gpl_syms[i].value; 172 return mod->gpl_syms[i].value;
173 } 173 }
174 } 174 }
175 } 175 }
176 } 176 }
177 DEBUGP("Failed to find symbol %s\n", name); 177 DEBUGP("Failed to find symbol %s\n", name);
178 return 0; 178 return 0;
179 } 179 }
180 180
181 /* Find a symbol in this elf symbol table */ 181 /* Find a symbol in this elf symbol table */
182 static unsigned long find_local_symbol(Elf_Shdr *sechdrs, 182 static unsigned long find_local_symbol(Elf_Shdr *sechdrs,
183 unsigned int symindex, 183 unsigned int symindex,
184 const char *strtab, 184 const char *strtab,
185 const char *name) 185 const char *name)
186 { 186 {
187 unsigned int i; 187 unsigned int i;
188 Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr; 188 Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr;
189 189
190 /* Search (defined) internal symbols first. */ 190 /* Search (defined) internal symbols first. */
191 for (i = 1; i < sechdrs[symindex].sh_size/sizeof(*sym); i++) { 191 for (i = 1; i < sechdrs[symindex].sh_size/sizeof(*sym); i++) {
192 if (sym[i].st_shndx != SHN_UNDEF 192 if (sym[i].st_shndx != SHN_UNDEF
193 && strcmp(name, strtab + sym[i].st_name) == 0) 193 && strcmp(name, strtab + sym[i].st_name) == 0)
194 return sym[i].st_value; 194 return sym[i].st_value;
195 } 195 }
196 return 0; 196 return 0;
197 } 197 }
198 198
199 /* Search for module by name: must hold module_mutex. */ 199 /* Search for module by name: must hold module_mutex. */
200 static struct module *find_module(const char *name) 200 static struct module *find_module(const char *name)
201 { 201 {
202 struct module *mod; 202 struct module *mod;
203 203
204 list_for_each_entry(mod, &modules, list) { 204 list_for_each_entry(mod, &modules, list) {
205 if (strcmp(mod->name, name) == 0) 205 if (strcmp(mod->name, name) == 0)
206 return mod; 206 return mod;
207 } 207 }
208 return NULL; 208 return NULL;
209 } 209 }
210 210
211 #ifdef CONFIG_SMP 211 #ifdef CONFIG_SMP
212 /* Number of blocks used and allocated. */ 212 /* Number of blocks used and allocated. */
213 static unsigned int pcpu_num_used, pcpu_num_allocated; 213 static unsigned int pcpu_num_used, pcpu_num_allocated;
214 /* Size of each block. -ve means used. */ 214 /* Size of each block. -ve means used. */
215 static int *pcpu_size; 215 static int *pcpu_size;
216 216
217 static int split_block(unsigned int i, unsigned short size) 217 static int split_block(unsigned int i, unsigned short size)
218 { 218 {
219 /* Reallocation required? */ 219 /* Reallocation required? */
220 if (pcpu_num_used + 1 > pcpu_num_allocated) { 220 if (pcpu_num_used + 1 > pcpu_num_allocated) {
221 int *new = kmalloc(sizeof(new[0]) * pcpu_num_allocated*2, 221 int *new = kmalloc(sizeof(new[0]) * pcpu_num_allocated*2,
222 GFP_KERNEL); 222 GFP_KERNEL);
223 if (!new) 223 if (!new)
224 return 0; 224 return 0;
225 225
226 memcpy(new, pcpu_size, sizeof(new[0])*pcpu_num_allocated); 226 memcpy(new, pcpu_size, sizeof(new[0])*pcpu_num_allocated);
227 pcpu_num_allocated *= 2; 227 pcpu_num_allocated *= 2;
228 kfree(pcpu_size); 228 kfree(pcpu_size);
229 pcpu_size = new; 229 pcpu_size = new;
230 } 230 }
231 231
232 /* Insert a new subblock */ 232 /* Insert a new subblock */
233 memmove(&pcpu_size[i+1], &pcpu_size[i], 233 memmove(&pcpu_size[i+1], &pcpu_size[i],
234 sizeof(pcpu_size[0]) * (pcpu_num_used - i)); 234 sizeof(pcpu_size[0]) * (pcpu_num_used - i));
235 pcpu_num_used++; 235 pcpu_num_used++;
236 236
237 pcpu_size[i+1] -= size; 237 pcpu_size[i+1] -= size;
238 pcpu_size[i] = size; 238 pcpu_size[i] = size;
239 return 1; 239 return 1;
240 } 240 }
241 241
242 static inline unsigned int block_size(int val) 242 static inline unsigned int block_size(int val)
243 { 243 {
244 if (val < 0) 244 if (val < 0)
245 return -val; 245 return -val;
246 return val; 246 return val;
247 } 247 }
248 248
249 /* Created by linker magic */ 249 /* Created by linker magic */
250 extern char __per_cpu_start[], __per_cpu_end[]; 250 extern char __per_cpu_start[], __per_cpu_end[];
251 251
252 static void *percpu_modalloc(unsigned long size, unsigned long align) 252 static void *percpu_modalloc(unsigned long size, unsigned long align)
253 { 253 {
254 unsigned long extra; 254 unsigned long extra;
255 unsigned int i; 255 unsigned int i;
256 void *ptr; 256 void *ptr;
257 257
258 BUG_ON(align > SMP_CACHE_BYTES); 258 BUG_ON(align > SMP_CACHE_BYTES);
259 259
260 ptr = __per_cpu_start; 260 ptr = __per_cpu_start;
261 for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { 261 for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
262 /* Extra for alignment requirement. */ 262 /* Extra for alignment requirement. */
263 extra = ALIGN((unsigned long)ptr, align) - (unsigned long)ptr; 263 extra = ALIGN((unsigned long)ptr, align) - (unsigned long)ptr;
264 BUG_ON(i == 0 && extra != 0); 264 BUG_ON(i == 0 && extra != 0);
265 265
266 if (pcpu_size[i] < 0 || pcpu_size[i] < extra + size) 266 if (pcpu_size[i] < 0 || pcpu_size[i] < extra + size)
267 continue; 267 continue;
268 268
269 /* Transfer extra to previous block. */ 269 /* Transfer extra to previous block. */
270 if (pcpu_size[i-1] < 0) 270 if (pcpu_size[i-1] < 0)
271 pcpu_size[i-1] -= extra; 271 pcpu_size[i-1] -= extra;
272 else 272 else
273 pcpu_size[i-1] += extra; 273 pcpu_size[i-1] += extra;
274 pcpu_size[i] -= extra; 274 pcpu_size[i] -= extra;
275 ptr += extra; 275 ptr += extra;
276 276
277 /* Split block if warranted */ 277 /* Split block if warranted */
278 if (pcpu_size[i] - size > sizeof(unsigned long)) 278 if (pcpu_size[i] - size > sizeof(unsigned long))
279 if (!split_block(i, size)) 279 if (!split_block(i, size))
280 return NULL; 280 return NULL;
281 281
282 /* Mark allocated */ 282 /* Mark allocated */
283 pcpu_size[i] = -pcpu_size[i]; 283 pcpu_size[i] = -pcpu_size[i];
284 return ptr; 284 return ptr;
285 } 285 }
286 286
287 printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n", 287 printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n",
288 size); 288 size);
289 return NULL; 289 return NULL;
290 } 290 }
291 291
292 static void percpu_modfree(void *freeme) 292 static void percpu_modfree(void *freeme)
293 { 293 {
294 unsigned int i; 294 unsigned int i;
295 void *ptr = __per_cpu_start + block_size(pcpu_size[0]); 295 void *ptr = __per_cpu_start + block_size(pcpu_size[0]);
296 296
297 /* First entry is core kernel percpu data. */ 297 /* First entry is core kernel percpu data. */
298 for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { 298 for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
299 if (ptr == freeme) { 299 if (ptr == freeme) {
300 pcpu_size[i] = -pcpu_size[i]; 300 pcpu_size[i] = -pcpu_size[i];
301 goto free; 301 goto free;
302 } 302 }
303 } 303 }
304 BUG(); 304 BUG();
305 305
306 free: 306 free:
307 /* Merge with previous? */ 307 /* Merge with previous? */
308 if (pcpu_size[i-1] >= 0) { 308 if (pcpu_size[i-1] >= 0) {
309 pcpu_size[i-1] += pcpu_size[i]; 309 pcpu_size[i-1] += pcpu_size[i];
310 pcpu_num_used--; 310 pcpu_num_used--;
311 memmove(&pcpu_size[i], &pcpu_size[i+1], 311 memmove(&pcpu_size[i], &pcpu_size[i+1],
312 (pcpu_num_used - i) * sizeof(pcpu_size[0])); 312 (pcpu_num_used - i) * sizeof(pcpu_size[0]));
313 i--; 313 i--;
314 } 314 }
315 /* Merge with next? */ 315 /* Merge with next? */
316 if (i+1 < pcpu_num_used && pcpu_size[i+1] >= 0) { 316 if (i+1 < pcpu_num_used && pcpu_size[i+1] >= 0) {
317 pcpu_size[i] += pcpu_size[i+1]; 317 pcpu_size[i] += pcpu_size[i+1];
318 pcpu_num_used--; 318 pcpu_num_used--;
319 memmove(&pcpu_size[i+1], &pcpu_size[i+2], 319 memmove(&pcpu_size[i+1], &pcpu_size[i+2],
320 (pcpu_num_used - (i+1)) * sizeof(pcpu_size[0])); 320 (pcpu_num_used - (i+1)) * sizeof(pcpu_size[0]));
321 } 321 }
322 } 322 }
323 323
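percpu_modalloc()/percpu_modfree() above manage the module per-cpu area as a flat list of signed block sizes: positive entries are free, negative entries are in use, allocation may split a free block, and freeing flips the sign back and merges with a free neighbour. A user-space toy of the same sign encoding (alignment handling and the merge-with-previous case are omitted; this is an illustration, not kernel code):

#include <stdio.h>
#include <string.h>

static int blocks[16] = { -64, 192 };   /* block 0: 64 bytes used, block 1: 192 bytes free */
static unsigned int nblocks = 2;

static int toy_alloc(int size)
{
        unsigned int i;

        for (i = 0; i < nblocks; i++) {
                if (blocks[i] < size)           /* used (negative) or too small */
                        continue;
                if (blocks[i] - size > 0) {     /* split off the remainder */
                        memmove(&blocks[i + 1], &blocks[i],
                                (nblocks - i) * sizeof(blocks[0]));
                        nblocks++;
                        blocks[i + 1] -= size;
                        blocks[i] = size;
                }
                blocks[i] = -blocks[i];         /* mark used */
                return i;
        }
        return -1;
}

static void toy_free(unsigned int i)
{
        blocks[i] = -blocks[i];                 /* mark free */
        if (i + 1 < nblocks && blocks[i + 1] >= 0) {    /* merge with next? */
                blocks[i] += blocks[i + 1];
                nblocks--;
                memmove(&blocks[i + 1], &blocks[i + 2],
                        (nblocks - (i + 1)) * sizeof(blocks[0]));
        }
}

int main(void)
{
        int a = toy_alloc(32), b = toy_alloc(32);
        unsigned int i;

        toy_free(a);
        toy_free(b);
        for (i = 0; i < nblocks; i++)
                printf("block %u: %d\n", i, blocks[i]);
        return 0;
}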
324 static unsigned int find_pcpusec(Elf_Ehdr *hdr, 324 static unsigned int find_pcpusec(Elf_Ehdr *hdr,
325 Elf_Shdr *sechdrs, 325 Elf_Shdr *sechdrs,
326 const char *secstrings) 326 const char *secstrings)
327 { 327 {
328 return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); 328 return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
329 } 329 }
330 330
331 static int percpu_modinit(void) 331 static int percpu_modinit(void)
332 { 332 {
333 pcpu_num_used = 2; 333 pcpu_num_used = 2;
334 pcpu_num_allocated = 2; 334 pcpu_num_allocated = 2;
335 pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated, 335 pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated,
336 GFP_KERNEL); 336 GFP_KERNEL);
337 /* Static in-kernel percpu data (used). */ 337 /* Static in-kernel percpu data (used). */
338 pcpu_size[0] = -ALIGN(__per_cpu_end-__per_cpu_start, SMP_CACHE_BYTES); 338 pcpu_size[0] = -ALIGN(__per_cpu_end-__per_cpu_start, SMP_CACHE_BYTES);
339 /* Free room. */ 339 /* Free room. */
340 pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0]; 340 pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0];
341 if (pcpu_size[1] < 0) { 341 if (pcpu_size[1] < 0) {
342 printk(KERN_ERR "No per-cpu room for modules.\n"); 342 printk(KERN_ERR "No per-cpu room for modules.\n");
343 pcpu_num_used = 1; 343 pcpu_num_used = 1;
344 } 344 }
345 345
346 return 0; 346 return 0;
347 } 347 }
348 __initcall(percpu_modinit); 348 __initcall(percpu_modinit);
349 #else /* ... !CONFIG_SMP */ 349 #else /* ... !CONFIG_SMP */
350 static inline void *percpu_modalloc(unsigned long size, unsigned long align) 350 static inline void *percpu_modalloc(unsigned long size, unsigned long align)
351 { 351 {
352 return NULL; 352 return NULL;
353 } 353 }
354 static inline void percpu_modfree(void *pcpuptr) 354 static inline void percpu_modfree(void *pcpuptr)
355 { 355 {
356 BUG(); 356 BUG();
357 } 357 }
358 static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, 358 static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,
359 Elf_Shdr *sechdrs, 359 Elf_Shdr *sechdrs,
360 const char *secstrings) 360 const char *secstrings)
361 { 361 {
362 return 0; 362 return 0;
363 } 363 }
364 static inline void percpu_modcopy(void *pcpudst, const void *src, 364 static inline void percpu_modcopy(void *pcpudst, const void *src,
365 unsigned long size) 365 unsigned long size)
366 { 366 {
367 /* pcpusec should be 0, and size of that section should be 0. */ 367 /* pcpusec should be 0, and size of that section should be 0. */
368 BUG_ON(size != 0); 368 BUG_ON(size != 0);
369 } 369 }
370 #endif /* CONFIG_SMP */ 370 #endif /* CONFIG_SMP */
371 371
372 #ifdef CONFIG_MODULE_UNLOAD 372 #ifdef CONFIG_MODULE_UNLOAD
373 /* Init the unload section of the module. */ 373 /* Init the unload section of the module. */
374 static void module_unload_init(struct module *mod) 374 static void module_unload_init(struct module *mod)
375 { 375 {
376 unsigned int i; 376 unsigned int i;
377 377
378 INIT_LIST_HEAD(&mod->modules_which_use_me); 378 INIT_LIST_HEAD(&mod->modules_which_use_me);
379 for (i = 0; i < NR_CPUS; i++) 379 for (i = 0; i < NR_CPUS; i++)
380 local_set(&mod->ref[i].count, 0); 380 local_set(&mod->ref[i].count, 0);
381 /* Hold reference count during initialization. */ 381 /* Hold reference count during initialization. */
382 local_set(&mod->ref[_smp_processor_id()].count, 1); 382 local_set(&mod->ref[raw_smp_processor_id()].count, 1);
383 /* Backwards compatibility macros put refcount during init. */ 383 /* Backwards compatibility macros put refcount during init. */
384 mod->waiter = current; 384 mod->waiter = current;
385 } 385 }
386 386
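The hunk above is where this file meets the patch: module_unload_init() only needs some CPU's slot to carry the initial reference (module_refcount() sums every slot anyway), so the nondebug raw_smp_processor_id() is correct here even if the task migrates a moment later. A user-space analogy of why a "raw" CPU id has to be treated as a hint, using sched_getcpu() (an assumption-laden sketch, not kernel code):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        int before = sched_getcpu();

        /* Anything that sleeps or gets rescheduled may run here. */
        sched_yield();

        if (sched_getcpu() != before)
                printf("migrated away from CPU %d\n", before);
        else
                printf("still on CPU %d (this time)\n", before);
        return 0;
}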
387 /* modules using other modules */ 387 /* modules using other modules */
388 struct module_use 388 struct module_use
389 { 389 {
390 struct list_head list; 390 struct list_head list;
391 struct module *module_which_uses; 391 struct module *module_which_uses;
392 }; 392 };
393 393
394 /* Does a already use b? */ 394 /* Does a already use b? */
395 static int already_uses(struct module *a, struct module *b) 395 static int already_uses(struct module *a, struct module *b)
396 { 396 {
397 struct module_use *use; 397 struct module_use *use;
398 398
399 list_for_each_entry(use, &b->modules_which_use_me, list) { 399 list_for_each_entry(use, &b->modules_which_use_me, list) {
400 if (use->module_which_uses == a) { 400 if (use->module_which_uses == a) {
401 DEBUGP("%s uses %s!\n", a->name, b->name); 401 DEBUGP("%s uses %s!\n", a->name, b->name);
402 return 1; 402 return 1;
403 } 403 }
404 } 404 }
405 DEBUGP("%s does not use %s!\n", a->name, b->name); 405 DEBUGP("%s does not use %s!\n", a->name, b->name);
406 return 0; 406 return 0;
407 } 407 }
408 408
409 /* Module a uses b */ 409 /* Module a uses b */
410 static int use_module(struct module *a, struct module *b) 410 static int use_module(struct module *a, struct module *b)
411 { 411 {
412 struct module_use *use; 412 struct module_use *use;
413 if (b == NULL || already_uses(a, b)) return 1; 413 if (b == NULL || already_uses(a, b)) return 1;
414 414
415 if (!strong_try_module_get(b)) 415 if (!strong_try_module_get(b))
416 return 0; 416 return 0;
417 417
418 DEBUGP("Allocating new usage for %s.\n", a->name); 418 DEBUGP("Allocating new usage for %s.\n", a->name);
419 use = kmalloc(sizeof(*use), GFP_ATOMIC); 419 use = kmalloc(sizeof(*use), GFP_ATOMIC);
420 if (!use) { 420 if (!use) {
421 printk("%s: out of memory loading\n", a->name); 421 printk("%s: out of memory loading\n", a->name);
422 module_put(b); 422 module_put(b);
423 return 0; 423 return 0;
424 } 424 }
425 425
426 use->module_which_uses = a; 426 use->module_which_uses = a;
427 list_add(&use->list, &b->modules_which_use_me); 427 list_add(&use->list, &b->modules_which_use_me);
428 return 1; 428 return 1;
429 } 429 }
430 430
431 /* Clear the unload stuff of the module. */ 431 /* Clear the unload stuff of the module. */
432 static void module_unload_free(struct module *mod) 432 static void module_unload_free(struct module *mod)
433 { 433 {
434 struct module *i; 434 struct module *i;
435 435
436 list_for_each_entry(i, &modules, list) { 436 list_for_each_entry(i, &modules, list) {
437 struct module_use *use; 437 struct module_use *use;
438 438
439 list_for_each_entry(use, &i->modules_which_use_me, list) { 439 list_for_each_entry(use, &i->modules_which_use_me, list) {
440 if (use->module_which_uses == mod) { 440 if (use->module_which_uses == mod) {
441 DEBUGP("%s unusing %s\n", mod->name, i->name); 441 DEBUGP("%s unusing %s\n", mod->name, i->name);
442 module_put(i); 442 module_put(i);
443 list_del(&use->list); 443 list_del(&use->list);
444 kfree(use); 444 kfree(use);
445 /* There can be at most one match. */ 445 /* There can be at most one match. */
446 break; 446 break;
447 } 447 }
448 } 448 }
449 } 449 }
450 } 450 }
451 451
452 #ifdef CONFIG_MODULE_FORCE_UNLOAD 452 #ifdef CONFIG_MODULE_FORCE_UNLOAD
453 static inline int try_force(unsigned int flags) 453 static inline int try_force(unsigned int flags)
454 { 454 {
455 int ret = (flags & O_TRUNC); 455 int ret = (flags & O_TRUNC);
456 if (ret) 456 if (ret)
457 tainted |= TAINT_FORCED_MODULE; 457 tainted |= TAINT_FORCED_MODULE;
458 return ret; 458 return ret;
459 } 459 }
460 #else 460 #else
461 static inline int try_force(unsigned int flags) 461 static inline int try_force(unsigned int flags)
462 { 462 {
463 return 0; 463 return 0;
464 } 464 }
465 #endif /* CONFIG_MODULE_FORCE_UNLOAD */ 465 #endif /* CONFIG_MODULE_FORCE_UNLOAD */
466 466
467 struct stopref 467 struct stopref
468 { 468 {
469 struct module *mod; 469 struct module *mod;
470 int flags; 470 int flags;
471 int *forced; 471 int *forced;
472 }; 472 };
473 473
474 /* Whole machine is stopped with interrupts off when this runs. */ 474 /* Whole machine is stopped with interrupts off when this runs. */
475 static int __try_stop_module(void *_sref) 475 static int __try_stop_module(void *_sref)
476 { 476 {
477 struct stopref *sref = _sref; 477 struct stopref *sref = _sref;
478 478
479 /* If it's not unused, quit unless we are told to block. */ 479 /* If it's not unused, quit unless we are told to block. */
480 if ((sref->flags & O_NONBLOCK) && module_refcount(sref->mod) != 0) { 480 if ((sref->flags & O_NONBLOCK) && module_refcount(sref->mod) != 0) {
481 if (!(*sref->forced = try_force(sref->flags))) 481 if (!(*sref->forced = try_force(sref->flags)))
482 return -EWOULDBLOCK; 482 return -EWOULDBLOCK;
483 } 483 }
484 484
485 /* Mark it as dying. */ 485 /* Mark it as dying. */
486 sref->mod->state = MODULE_STATE_GOING; 486 sref->mod->state = MODULE_STATE_GOING;
487 return 0; 487 return 0;
488 } 488 }
489 489
490 static int try_stop_module(struct module *mod, int flags, int *forced) 490 static int try_stop_module(struct module *mod, int flags, int *forced)
491 { 491 {
492 struct stopref sref = { mod, flags, forced }; 492 struct stopref sref = { mod, flags, forced };
493 493
494 return stop_machine_run(__try_stop_module, &sref, NR_CPUS); 494 return stop_machine_run(__try_stop_module, &sref, NR_CPUS);
495 } 495 }
496 496
497 unsigned int module_refcount(struct module *mod) 497 unsigned int module_refcount(struct module *mod)
498 { 498 {
499 unsigned int i, total = 0; 499 unsigned int i, total = 0;
500 500
501 for (i = 0; i < NR_CPUS; i++) 501 for (i = 0; i < NR_CPUS; i++)
502 total += local_read(&mod->ref[i].count); 502 total += local_read(&mod->ref[i].count);
503 return total; 503 return total;
504 } 504 }
505 EXPORT_SYMBOL(module_refcount); 505 EXPORT_SYMBOL(module_refcount);
506 506
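module_refcount() above sums one local_t per CPU: module_get/module_put only touch the calling CPU's own slot, so the hot path never bounces a shared cache line, and module_unload_init() seeds one slot with the initial reference. A minimal user-space sketch of the same bookkeeping (plain longs instead of local_t, fixed CPU count, no concurrency):

#include <stdio.h>

#define TOY_NR_CPUS 4

static long ref[TOY_NR_CPUS];

static void toy_get(int cpu) { ref[cpu]++; }
static void toy_put(int cpu) { ref[cpu]--; }

static long toy_refcount(void)
{
        long total = 0;
        int i;

        for (i = 0; i < TOY_NR_CPUS; i++)
                total += ref[i];
        return total;
}

int main(void)
{
        ref[0] = 1;             /* the +1 taken at load time, as above */
        toy_get(2);             /* a user grabs the module on CPU 2... */
        toy_put(3);             /* ...and may drop it on CPU 3 */
        printf("refcount = %ld\n", toy_refcount());     /* prints 1 */
        return 0;
}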
507 /* This exists whether we can unload or not */ 507 /* This exists whether we can unload or not */
508 static void free_module(struct module *mod); 508 static void free_module(struct module *mod);
509 509
510 static void wait_for_zero_refcount(struct module *mod) 510 static void wait_for_zero_refcount(struct module *mod)
511 { 511 {
512 /* Since we might sleep for some time, drop the semaphore first */ 512 /* Since we might sleep for some time, drop the semaphore first */
513 up(&module_mutex); 513 up(&module_mutex);
514 for (;;) { 514 for (;;) {
515 DEBUGP("Looking at refcount...\n"); 515 DEBUGP("Looking at refcount...\n");
516 set_current_state(TASK_UNINTERRUPTIBLE); 516 set_current_state(TASK_UNINTERRUPTIBLE);
517 if (module_refcount(mod) == 0) 517 if (module_refcount(mod) == 0)
518 break; 518 break;
519 schedule(); 519 schedule();
520 } 520 }
521 current->state = TASK_RUNNING; 521 current->state = TASK_RUNNING;
522 down(&module_mutex); 522 down(&module_mutex);
523 } 523 }
524 524
525 asmlinkage long 525 asmlinkage long
526 sys_delete_module(const char __user *name_user, unsigned int flags) 526 sys_delete_module(const char __user *name_user, unsigned int flags)
527 { 527 {
528 struct module *mod; 528 struct module *mod;
529 char name[MODULE_NAME_LEN]; 529 char name[MODULE_NAME_LEN];
530 int ret, forced = 0; 530 int ret, forced = 0;
531 531
532 if (!capable(CAP_SYS_MODULE)) 532 if (!capable(CAP_SYS_MODULE))
533 return -EPERM; 533 return -EPERM;
534 534
535 if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0) 535 if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
536 return -EFAULT; 536 return -EFAULT;
537 name[MODULE_NAME_LEN-1] = '\0'; 537 name[MODULE_NAME_LEN-1] = '\0';
538 538
539 if (down_interruptible(&module_mutex) != 0) 539 if (down_interruptible(&module_mutex) != 0)
540 return -EINTR; 540 return -EINTR;
541 541
542 mod = find_module(name); 542 mod = find_module(name);
543 if (!mod) { 543 if (!mod) {
544 ret = -ENOENT; 544 ret = -ENOENT;
545 goto out; 545 goto out;
546 } 546 }
547 547
548 if (!list_empty(&mod->modules_which_use_me)) { 548 if (!list_empty(&mod->modules_which_use_me)) {
549 /* Other modules depend on us: get rid of them first. */ 549 /* Other modules depend on us: get rid of them first. */
550 ret = -EWOULDBLOCK; 550 ret = -EWOULDBLOCK;
551 goto out; 551 goto out;
552 } 552 }
553 553
554 /* Doing init or already dying? */ 554 /* Doing init or already dying? */
555 if (mod->state != MODULE_STATE_LIVE) { 555 if (mod->state != MODULE_STATE_LIVE) {
556 /* FIXME: if (force), slam module count and wake up 556 /* FIXME: if (force), slam module count and wake up
557 waiter --RR */ 557 waiter --RR */
558 DEBUGP("%s already dying\n", mod->name); 558 DEBUGP("%s already dying\n", mod->name);
559 ret = -EBUSY; 559 ret = -EBUSY;
560 goto out; 560 goto out;
561 } 561 }
562 562
563 /* If it has an init func, it must have an exit func to unload */ 563 /* If it has an init func, it must have an exit func to unload */
564 if ((mod->init != NULL && mod->exit == NULL) 564 if ((mod->init != NULL && mod->exit == NULL)
565 || mod->unsafe) { 565 || mod->unsafe) {
566 forced = try_force(flags); 566 forced = try_force(flags);
567 if (!forced) { 567 if (!forced) {
568 /* This module can't be removed */ 568 /* This module can't be removed */
569 ret = -EBUSY; 569 ret = -EBUSY;
570 goto out; 570 goto out;
571 } 571 }
572 } 572 }
573 573
574 /* Set this up before setting mod->state */ 574 /* Set this up before setting mod->state */
575 mod->waiter = current; 575 mod->waiter = current;
576 576
577 /* Stop the machine so refcounts can't move and disable module. */ 577 /* Stop the machine so refcounts can't move and disable module. */
578 ret = try_stop_module(mod, flags, &forced); 578 ret = try_stop_module(mod, flags, &forced);
579 if (ret != 0) 579 if (ret != 0)
580 goto out; 580 goto out;
581 581
582 /* Never wait if forced. */ 582 /* Never wait if forced. */
583 if (!forced && module_refcount(mod) != 0) 583 if (!forced && module_refcount(mod) != 0)
584 wait_for_zero_refcount(mod); 584 wait_for_zero_refcount(mod);
585 585
586 /* Final destruction now no one is using it. */ 586 /* Final destruction now no one is using it. */
587 if (mod->exit != NULL) { 587 if (mod->exit != NULL) {
588 up(&module_mutex); 588 up(&module_mutex);
589 mod->exit(); 589 mod->exit();
590 down(&module_mutex); 590 down(&module_mutex);
591 } 591 }
592 free_module(mod); 592 free_module(mod);
593 593
594 out: 594 out:
595 up(&module_mutex); 595 up(&module_mutex);
596 return ret; 596 return ret;
597 } 597 }
598 598
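For context, sys_delete_module() above is reached from user space via the delete_module(2) system call (what rmmod ultimately issues): O_NONBLOCK asks for -EWOULDBLOCK instead of waiting for the refcount to drop, and O_TRUNC is the force flag checked by try_force(). A hedged stand-alone caller (the module name is made up; CAP_SYS_MODULE is required):

#include <errno.h>
#include <fcntl.h>              /* O_NONBLOCK, O_TRUNC */
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        /* Fail immediately rather than sleep in wait_for_zero_refcount(). */
        if (syscall(SYS_delete_module, "example_mod", O_NONBLOCK) != 0)
                fprintf(stderr, "delete_module: %s\n", strerror(errno));
        return 0;
}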
599 static void print_unload_info(struct seq_file *m, struct module *mod) 599 static void print_unload_info(struct seq_file *m, struct module *mod)
600 { 600 {
601 struct module_use *use; 601 struct module_use *use;
602 int printed_something = 0; 602 int printed_something = 0;
603 603
604 seq_printf(m, " %u ", module_refcount(mod)); 604 seq_printf(m, " %u ", module_refcount(mod));
605 605
606 /* Always include a trailing , so userspace can differentiate 606 /* Always include a trailing , so userspace can differentiate
607 between this and the old multi-field proc format. */ 607 between this and the old multi-field proc format. */
608 list_for_each_entry(use, &mod->modules_which_use_me, list) { 608 list_for_each_entry(use, &mod->modules_which_use_me, list) {
609 printed_something = 1; 609 printed_something = 1;
610 seq_printf(m, "%s,", use->module_which_uses->name); 610 seq_printf(m, "%s,", use->module_which_uses->name);
611 } 611 }
612 612
613 if (mod->unsafe) { 613 if (mod->unsafe) {
614 printed_something = 1; 614 printed_something = 1;
615 seq_printf(m, "[unsafe],"); 615 seq_printf(m, "[unsafe],");
616 } 616 }
617 617
618 if (mod->init != NULL && mod->exit == NULL) { 618 if (mod->init != NULL && mod->exit == NULL) {
619 printed_something = 1; 619 printed_something = 1;
620 seq_printf(m, "[permanent],"); 620 seq_printf(m, "[permanent],");
621 } 621 }
622 622
623 if (!printed_something) 623 if (!printed_something)
624 seq_printf(m, "-"); 624 seq_printf(m, "-");
625 } 625 }
626 626
627 void __symbol_put(const char *symbol) 627 void __symbol_put(const char *symbol)
628 { 628 {
629 struct module *owner; 629 struct module *owner;
630 unsigned long flags; 630 unsigned long flags;
631 const unsigned long *crc; 631 const unsigned long *crc;
632 632
633 spin_lock_irqsave(&modlist_lock, flags); 633 spin_lock_irqsave(&modlist_lock, flags);
634 if (!__find_symbol(symbol, &owner, &crc, 1)) 634 if (!__find_symbol(symbol, &owner, &crc, 1))
635 BUG(); 635 BUG();
636 module_put(owner); 636 module_put(owner);
637 spin_unlock_irqrestore(&modlist_lock, flags); 637 spin_unlock_irqrestore(&modlist_lock, flags);
638 } 638 }
639 EXPORT_SYMBOL(__symbol_put); 639 EXPORT_SYMBOL(__symbol_put);
640 640
641 void symbol_put_addr(void *addr) 641 void symbol_put_addr(void *addr)
642 { 642 {
643 unsigned long flags; 643 unsigned long flags;
644 644
645 spin_lock_irqsave(&modlist_lock, flags); 645 spin_lock_irqsave(&modlist_lock, flags);
646 if (!kernel_text_address((unsigned long)addr)) 646 if (!kernel_text_address((unsigned long)addr))
647 BUG(); 647 BUG();
648 648
649 module_put(module_text_address((unsigned long)addr)); 649 module_put(module_text_address((unsigned long)addr));
650 spin_unlock_irqrestore(&modlist_lock, flags); 650 spin_unlock_irqrestore(&modlist_lock, flags);
651 } 651 }
652 EXPORT_SYMBOL_GPL(symbol_put_addr); 652 EXPORT_SYMBOL_GPL(symbol_put_addr);
653 653
654 static ssize_t show_refcnt(struct module_attribute *mattr, 654 static ssize_t show_refcnt(struct module_attribute *mattr,
655 struct module *mod, char *buffer) 655 struct module *mod, char *buffer)
656 { 656 {
657 /* sysfs holds a reference */ 657 /* sysfs holds a reference */
658 return sprintf(buffer, "%u\n", module_refcount(mod)-1); 658 return sprintf(buffer, "%u\n", module_refcount(mod)-1);
659 } 659 }
660 660
661 static struct module_attribute refcnt = { 661 static struct module_attribute refcnt = {
662 .attr = { .name = "refcnt", .mode = 0444, .owner = THIS_MODULE }, 662 .attr = { .name = "refcnt", .mode = 0444, .owner = THIS_MODULE },
663 .show = show_refcnt, 663 .show = show_refcnt,
664 }; 664 };
665 665
666 #else /* !CONFIG_MODULE_UNLOAD */ 666 #else /* !CONFIG_MODULE_UNLOAD */
667 static void print_unload_info(struct seq_file *m, struct module *mod) 667 static void print_unload_info(struct seq_file *m, struct module *mod)
668 { 668 {
669 /* We don't know the usage count, or what modules are using. */ 669 /* We don't know the usage count, or what modules are using. */
670 seq_printf(m, " - -"); 670 seq_printf(m, " - -");
671 } 671 }
672 672
673 static inline void module_unload_free(struct module *mod) 673 static inline void module_unload_free(struct module *mod)
674 { 674 {
675 } 675 }
676 676
677 static inline int use_module(struct module *a, struct module *b) 677 static inline int use_module(struct module *a, struct module *b)
678 { 678 {
679 return strong_try_module_get(b); 679 return strong_try_module_get(b);
680 } 680 }
681 681
682 static inline void module_unload_init(struct module *mod) 682 static inline void module_unload_init(struct module *mod)
683 { 683 {
684 } 684 }
685 #endif /* CONFIG_MODULE_UNLOAD */ 685 #endif /* CONFIG_MODULE_UNLOAD */
686 686
687 #ifdef CONFIG_OBSOLETE_MODPARM 687 #ifdef CONFIG_OBSOLETE_MODPARM
688 /* Bounds checking done below */ 688 /* Bounds checking done below */
689 static int obsparm_copy_string(const char *val, struct kernel_param *kp) 689 static int obsparm_copy_string(const char *val, struct kernel_param *kp)
690 { 690 {
691 strcpy(kp->arg, val); 691 strcpy(kp->arg, val);
692 return 0; 692 return 0;
693 } 693 }
694 694
695 int set_obsolete(const char *val, struct kernel_param *kp) 695 int set_obsolete(const char *val, struct kernel_param *kp)
696 { 696 {
697 unsigned int min, max; 697 unsigned int min, max;
698 unsigned int size, maxsize; 698 unsigned int size, maxsize;
699 int dummy; 699 int dummy;
700 char *endp; 700 char *endp;
701 const char *p; 701 const char *p;
702 struct obsolete_modparm *obsparm = kp->arg; 702 struct obsolete_modparm *obsparm = kp->arg;
703 703
704 if (!val) { 704 if (!val) {
705 printk(KERN_ERR "Parameter %s needs an argument\n", kp->name); 705 printk(KERN_ERR "Parameter %s needs an argument\n", kp->name);
706 return -EINVAL; 706 return -EINVAL;
707 } 707 }
708 708
709 /* type is: [min[-max]]{b,h,i,l,s} */ 709 /* type is: [min[-max]]{b,h,i,l,s} */
710 p = obsparm->type; 710 p = obsparm->type;
711 min = simple_strtol(p, &endp, 10); 711 min = simple_strtol(p, &endp, 10);
712 if (endp == obsparm->type) 712 if (endp == obsparm->type)
713 min = max = 1; 713 min = max = 1;
714 else if (*endp == '-') { 714 else if (*endp == '-') {
715 p = endp+1; 715 p = endp+1;
716 max = simple_strtol(p, &endp, 10); 716 max = simple_strtol(p, &endp, 10);
717 } else 717 } else
718 max = min; 718 max = min;
719 switch (*endp) { 719 switch (*endp) {
720 case 'b': 720 case 'b':
721 return param_array(kp->name, val, min, max, obsparm->addr, 721 return param_array(kp->name, val, min, max, obsparm->addr,
722 1, param_set_byte, &dummy); 722 1, param_set_byte, &dummy);
723 case 'h': 723 case 'h':
724 return param_array(kp->name, val, min, max, obsparm->addr, 724 return param_array(kp->name, val, min, max, obsparm->addr,
725 sizeof(short), param_set_short, &dummy); 725 sizeof(short), param_set_short, &dummy);
726 case 'i': 726 case 'i':
727 return param_array(kp->name, val, min, max, obsparm->addr, 727 return param_array(kp->name, val, min, max, obsparm->addr,
728 sizeof(int), param_set_int, &dummy); 728 sizeof(int), param_set_int, &dummy);
729 case 'l': 729 case 'l':
730 return param_array(kp->name, val, min, max, obsparm->addr, 730 return param_array(kp->name, val, min, max, obsparm->addr,
731 sizeof(long), param_set_long, &dummy); 731 sizeof(long), param_set_long, &dummy);
732 case 's': 732 case 's':
733 return param_array(kp->name, val, min, max, obsparm->addr, 733 return param_array(kp->name, val, min, max, obsparm->addr,
734 sizeof(char *), param_set_charp, &dummy); 734 sizeof(char *), param_set_charp, &dummy);
735 735
736 case 'c': 736 case 'c':
737 /* Undocumented: 1-5c50 means 1-5 strings of up to 49 chars, 737 /* Undocumented: 1-5c50 means 1-5 strings of up to 49 chars,
738 and the decl is "char xxx[5][50];" */ 738 and the decl is "char xxx[5][50];" */
739 p = endp+1; 739 p = endp+1;
740 maxsize = simple_strtol(p, &endp, 10); 740 maxsize = simple_strtol(p, &endp, 10);
741 /* We check lengths here (yes, this is a hack). */ 741 /* We check lengths here (yes, this is a hack). */
742 p = val; 742 p = val;
743 while (p[size = strcspn(p, ",")]) { 743 while (p[size = strcspn(p, ",")]) {
744 if (size >= maxsize) 744 if (size >= maxsize)
745 goto oversize; 745 goto oversize;
746 p += size+1; 746 p += size+1;
747 } 747 }
748 if (size >= maxsize) 748 if (size >= maxsize)
749 goto oversize; 749 goto oversize;
750 return param_array(kp->name, val, min, max, obsparm->addr, 750 return param_array(kp->name, val, min, max, obsparm->addr,
751 maxsize, obsparm_copy_string, &dummy); 751 maxsize, obsparm_copy_string, &dummy);
752 } 752 }
753 printk(KERN_ERR "Unknown obsolete parameter type %s\n", obsparm->type); 753 printk(KERN_ERR "Unknown obsolete parameter type %s\n", obsparm->type);
754 return -EINVAL; 754 return -EINVAL;
755 oversize: 755 oversize:
756 printk(KERN_ERR 756 printk(KERN_ERR
757 "Parameter %s doesn't fit in %u chars.\n", kp->name, maxsize); 757 "Parameter %s doesn't fit in %u chars.\n", kp->name, maxsize);
758 return -EINVAL; 758 return -EINVAL;
759 } 759 }
760 760
761 static int obsolete_params(const char *name, 761 static int obsolete_params(const char *name,
762 char *args, 762 char *args,
763 struct obsolete_modparm obsparm[], 763 struct obsolete_modparm obsparm[],
764 unsigned int num, 764 unsigned int num,
765 Elf_Shdr *sechdrs, 765 Elf_Shdr *sechdrs,
766 unsigned int symindex, 766 unsigned int symindex,
767 const char *strtab) 767 const char *strtab)
768 { 768 {
769 struct kernel_param *kp; 769 struct kernel_param *kp;
770 unsigned int i; 770 unsigned int i;
771 int ret; 771 int ret;
772 772
773 kp = kmalloc(sizeof(kp[0]) * num, GFP_KERNEL); 773 kp = kmalloc(sizeof(kp[0]) * num, GFP_KERNEL);
774 if (!kp) 774 if (!kp)
775 return -ENOMEM; 775 return -ENOMEM;
776 776
777 for (i = 0; i < num; i++) { 777 for (i = 0; i < num; i++) {
778 char sym_name[128 + sizeof(MODULE_SYMBOL_PREFIX)]; 778 char sym_name[128 + sizeof(MODULE_SYMBOL_PREFIX)];
779 779
780 snprintf(sym_name, sizeof(sym_name), "%s%s", 780 snprintf(sym_name, sizeof(sym_name), "%s%s",
781 MODULE_SYMBOL_PREFIX, obsparm[i].name); 781 MODULE_SYMBOL_PREFIX, obsparm[i].name);
782 782
783 kp[i].name = obsparm[i].name; 783 kp[i].name = obsparm[i].name;
784 kp[i].perm = 000; 784 kp[i].perm = 000;
785 kp[i].set = set_obsolete; 785 kp[i].set = set_obsolete;
786 kp[i].get = NULL; 786 kp[i].get = NULL;
787 obsparm[i].addr 787 obsparm[i].addr
788 = (void *)find_local_symbol(sechdrs, symindex, strtab, 788 = (void *)find_local_symbol(sechdrs, symindex, strtab,
789 sym_name); 789 sym_name);
790 if (!obsparm[i].addr) { 790 if (!obsparm[i].addr) {
791 printk("%s: falsely claims to have parameter %s\n", 791 printk("%s: falsely claims to have parameter %s\n",
792 name, obsparm[i].name); 792 name, obsparm[i].name);
793 ret = -EINVAL; 793 ret = -EINVAL;
794 goto out; 794 goto out;
795 } 795 }
796 kp[i].arg = &obsparm[i]; 796 kp[i].arg = &obsparm[i];
797 } 797 }
798 798
799 ret = parse_args(name, args, kp, num, NULL); 799 ret = parse_args(name, args, kp, num, NULL);
800 out: 800 out:
801 kfree(kp); 801 kfree(kp);
802 return ret; 802 return ret;
803 } 803 }
804 #else 804 #else
805 static int obsolete_params(const char *name, 805 static int obsolete_params(const char *name,
806 char *args, 806 char *args,
807 struct obsolete_modparm obsparm[], 807 struct obsolete_modparm obsparm[],
808 unsigned int num, 808 unsigned int num,
809 Elf_Shdr *sechdrs, 809 Elf_Shdr *sechdrs,
810 unsigned int symindex, 810 unsigned int symindex,
811 const char *strtab) 811 const char *strtab)
812 { 812 {
813 if (num != 0) 813 if (num != 0)
814 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", 814 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
815 name); 815 name);
816 return 0; 816 return 0;
817 } 817 }
818 #endif /* CONFIG_OBSOLETE_MODPARM */ 818 #endif /* CONFIG_OBSOLETE_MODPARM */
819 819
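set_obsolete() above decodes type strings of the form [min[-max]]{b,h,i,l,s}. A small stand-alone sketch of just the count/type parsing (the 'c' fixed-string form and the actual array filling are left out):

#include <stdio.h>
#include <stdlib.h>

static void parse_type(const char *type)
{
        char *endp;
        long min, max;

        min = strtol(type, &endp, 10);
        if (endp == type)               /* no count prefix: exactly one value */
                min = max = 1;
        else if (*endp == '-')
                max = strtol(endp + 1, &endp, 10);
        else
                max = min;

        printf("\"%s\": %ld..%ld values of type '%c'\n", type, min, max, *endp);
}

int main(void)
{
        parse_type("i");        /* 1..1 int           */
        parse_type("1-4i");     /* 1..4 ints          */
        parse_type("16s");      /* 16..16 string ptrs */
        return 0;
}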
820 static const char vermagic[] = VERMAGIC_STRING; 820 static const char vermagic[] = VERMAGIC_STRING;
821 821
822 #ifdef CONFIG_MODVERSIONS 822 #ifdef CONFIG_MODVERSIONS
823 static int check_version(Elf_Shdr *sechdrs, 823 static int check_version(Elf_Shdr *sechdrs,
824 unsigned int versindex, 824 unsigned int versindex,
825 const char *symname, 825 const char *symname,
826 struct module *mod, 826 struct module *mod,
827 const unsigned long *crc) 827 const unsigned long *crc)
828 { 828 {
829 unsigned int i, num_versions; 829 unsigned int i, num_versions;
830 struct modversion_info *versions; 830 struct modversion_info *versions;
831 831
832 /* Exporting module didn't supply crcs? OK, we're already tainted. */ 832 /* Exporting module didn't supply crcs? OK, we're already tainted. */
833 if (!crc) 833 if (!crc)
834 return 1; 834 return 1;
835 835
836 versions = (void *) sechdrs[versindex].sh_addr; 836 versions = (void *) sechdrs[versindex].sh_addr;
837 num_versions = sechdrs[versindex].sh_size 837 num_versions = sechdrs[versindex].sh_size
838 / sizeof(struct modversion_info); 838 / sizeof(struct modversion_info);
839 839
840 for (i = 0; i < num_versions; i++) { 840 for (i = 0; i < num_versions; i++) {
841 if (strcmp(versions[i].name, symname) != 0) 841 if (strcmp(versions[i].name, symname) != 0)
842 continue; 842 continue;
843 843
844 if (versions[i].crc == *crc) 844 if (versions[i].crc == *crc)
845 return 1; 845 return 1;
846 printk("%s: disagrees about version of symbol %s\n", 846 printk("%s: disagrees about version of symbol %s\n",
847 mod->name, symname); 847 mod->name, symname);
848 DEBUGP("Found checksum %lX vs module %lX\n", 848 DEBUGP("Found checksum %lX vs module %lX\n",
849 *crc, versions[i].crc); 849 *crc, versions[i].crc);
850 return 0; 850 return 0;
851 } 851 }
852 /* Not in module's version table. OK, but that taints the kernel. */ 852 /* Not in module's version table. OK, but that taints the kernel. */
853 if (!(tainted & TAINT_FORCED_MODULE)) { 853 if (!(tainted & TAINT_FORCED_MODULE)) {
854 printk("%s: no version for \"%s\" found: kernel tainted.\n", 854 printk("%s: no version for \"%s\" found: kernel tainted.\n",
855 mod->name, symname); 855 mod->name, symname);
856 tainted |= TAINT_FORCED_MODULE; 856 tainted |= TAINT_FORCED_MODULE;
857 } 857 }
858 return 1; 858 return 1;
859 } 859 }
860 860
861 static inline int check_modstruct_version(Elf_Shdr *sechdrs, 861 static inline int check_modstruct_version(Elf_Shdr *sechdrs,
862 unsigned int versindex, 862 unsigned int versindex,
863 struct module *mod) 863 struct module *mod)
864 { 864 {
865 const unsigned long *crc; 865 const unsigned long *crc;
866 struct module *owner; 866 struct module *owner;
867 867
868 if (!__find_symbol("struct_module", &owner, &crc, 1)) 868 if (!__find_symbol("struct_module", &owner, &crc, 1))
869 BUG(); 869 BUG();
870 return check_version(sechdrs, versindex, "struct_module", mod, 870 return check_version(sechdrs, versindex, "struct_module", mod,
871 crc); 871 crc);
872 } 872 }
873 873
874 /* First part is kernel version, which we ignore. */ 874 /* First part is kernel version, which we ignore. */
875 static inline int same_magic(const char *amagic, const char *bmagic) 875 static inline int same_magic(const char *amagic, const char *bmagic)
876 { 876 {
877 amagic += strcspn(amagic, " "); 877 amagic += strcspn(amagic, " ");
878 bmagic += strcspn(bmagic, " "); 878 bmagic += strcspn(bmagic, " ");
879 return strcmp(amagic, bmagic) == 0; 879 return strcmp(amagic, bmagic) == 0;
880 } 880 }
881 #else 881 #else
882 static inline int check_version(Elf_Shdr *sechdrs, 882 static inline int check_version(Elf_Shdr *sechdrs,
883 unsigned int versindex, 883 unsigned int versindex,
884 const char *symname, 884 const char *symname,
885 struct module *mod, 885 struct module *mod,
886 const unsigned long *crc) 886 const unsigned long *crc)
887 { 887 {
888 return 1; 888 return 1;
889 } 889 }
890 890
891 static inline int check_modstruct_version(Elf_Shdr *sechdrs, 891 static inline int check_modstruct_version(Elf_Shdr *sechdrs,
892 unsigned int versindex, 892 unsigned int versindex,
893 struct module *mod) 893 struct module *mod)
894 { 894 {
895 return 1; 895 return 1;
896 } 896 }
897 897
898 static inline int same_magic(const char *amagic, const char *bmagic) 898 static inline int same_magic(const char *amagic, const char *bmagic)
899 { 899 {
900 return strcmp(amagic, bmagic) == 0; 900 return strcmp(amagic, bmagic) == 0;
901 } 901 }
902 #endif /* CONFIG_MODVERSIONS */ 902 #endif /* CONFIG_MODVERSIONS */
903 903
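With CONFIG_MODVERSIONS, same_magic() above skips the leading kernel-version token and requires the rest of the vermagic string (SMP, preempt, compiler, ...) to match exactly. A stand-alone illustration with made-up vermagic strings:

#include <stdio.h>
#include <string.h>

static int toy_same_magic(const char *a, const char *b)
{
        a += strcspn(a, " ");           /* drop the kernel version */
        b += strcspn(b, " ");
        return strcmp(a, b) == 0;
}

int main(void)
{
        printf("%d\n", toy_same_magic("2.6.12 SMP preempt gcc-3.4",
                                      "2.6.11 SMP preempt gcc-3.4"));   /* 1 */
        printf("%d\n", toy_same_magic("2.6.12 SMP preempt gcc-3.4",
                                      "2.6.12 preempt gcc-3.4"));       /* 0 */
        return 0;
}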
904 /* Resolve a symbol for this module. I.e. if we find one, record usage. 904 /* Resolve a symbol for this module. I.e. if we find one, record usage.
905 Must be holding module_mutex. */ 905 Must be holding module_mutex. */
906 static unsigned long resolve_symbol(Elf_Shdr *sechdrs, 906 static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
907 unsigned int versindex, 907 unsigned int versindex,
908 const char *name, 908 const char *name,
909 struct module *mod) 909 struct module *mod)
910 { 910 {
911 struct module *owner; 911 struct module *owner;
912 unsigned long ret; 912 unsigned long ret;
913 const unsigned long *crc; 913 const unsigned long *crc;
914 914
915 spin_lock_irq(&modlist_lock); 915 spin_lock_irq(&modlist_lock);
916 ret = __find_symbol(name, &owner, &crc, mod->license_gplok); 916 ret = __find_symbol(name, &owner, &crc, mod->license_gplok);
917 if (ret) { 917 if (ret) {
918 /* use_module can fail due to OOM, or module unloading */ 918 /* use_module can fail due to OOM, or module unloading */
919 if (!check_version(sechdrs, versindex, name, mod, crc) || 919 if (!check_version(sechdrs, versindex, name, mod, crc) ||
920 !use_module(mod, owner)) 920 !use_module(mod, owner))
921 ret = 0; 921 ret = 0;
922 } 922 }
923 spin_unlock_irq(&modlist_lock); 923 spin_unlock_irq(&modlist_lock);
924 return ret; 924 return ret;
925 } 925 }
926 926
927 927
928 /* 928 /*
929 * /sys/module/foo/sections stuff 929 * /sys/module/foo/sections stuff
930 * J. Corbet <corbet@lwn.net> 930 * J. Corbet <corbet@lwn.net>
931 */ 931 */
932 #ifdef CONFIG_KALLSYMS 932 #ifdef CONFIG_KALLSYMS
933 static ssize_t module_sect_show(struct module_attribute *mattr, 933 static ssize_t module_sect_show(struct module_attribute *mattr,
934 struct module *mod, char *buf) 934 struct module *mod, char *buf)
935 { 935 {
936 struct module_sect_attr *sattr = 936 struct module_sect_attr *sattr =
937 container_of(mattr, struct module_sect_attr, mattr); 937 container_of(mattr, struct module_sect_attr, mattr);
938 return sprintf(buf, "0x%lx\n", sattr->address); 938 return sprintf(buf, "0x%lx\n", sattr->address);
939 } 939 }
940 940
941 static void add_sect_attrs(struct module *mod, unsigned int nsect, 941 static void add_sect_attrs(struct module *mod, unsigned int nsect,
942 char *secstrings, Elf_Shdr *sechdrs) 942 char *secstrings, Elf_Shdr *sechdrs)
943 { 943 {
944 unsigned int nloaded = 0, i, size[2]; 944 unsigned int nloaded = 0, i, size[2];
945 struct module_sect_attrs *sect_attrs; 945 struct module_sect_attrs *sect_attrs;
946 struct module_sect_attr *sattr; 946 struct module_sect_attr *sattr;
947 struct attribute **gattr; 947 struct attribute **gattr;
948 948
949 /* Count loaded sections and allocate structures */ 949 /* Count loaded sections and allocate structures */
950 for (i = 0; i < nsect; i++) 950 for (i = 0; i < nsect; i++)
951 if (sechdrs[i].sh_flags & SHF_ALLOC) 951 if (sechdrs[i].sh_flags & SHF_ALLOC)
952 nloaded++; 952 nloaded++;
953 size[0] = ALIGN(sizeof(*sect_attrs) 953 size[0] = ALIGN(sizeof(*sect_attrs)
954 + nloaded * sizeof(sect_attrs->attrs[0]), 954 + nloaded * sizeof(sect_attrs->attrs[0]),
955 sizeof(sect_attrs->grp.attrs[0])); 955 sizeof(sect_attrs->grp.attrs[0]));
956 size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]); 956 size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]);
957 if (! (sect_attrs = kmalloc(size[0] + size[1], GFP_KERNEL))) 957 if (! (sect_attrs = kmalloc(size[0] + size[1], GFP_KERNEL)))
958 return; 958 return;
959 959
960 /* Setup section attributes. */ 960 /* Setup section attributes. */
961 sect_attrs->grp.name = "sections"; 961 sect_attrs->grp.name = "sections";
962 sect_attrs->grp.attrs = (void *)sect_attrs + size[0]; 962 sect_attrs->grp.attrs = (void *)sect_attrs + size[0];
963 963
964 sattr = &sect_attrs->attrs[0]; 964 sattr = &sect_attrs->attrs[0];
965 gattr = &sect_attrs->grp.attrs[0]; 965 gattr = &sect_attrs->grp.attrs[0];
966 for (i = 0; i < nsect; i++) { 966 for (i = 0; i < nsect; i++) {
967 if (! (sechdrs[i].sh_flags & SHF_ALLOC)) 967 if (! (sechdrs[i].sh_flags & SHF_ALLOC))
968 continue; 968 continue;
969 sattr->address = sechdrs[i].sh_addr; 969 sattr->address = sechdrs[i].sh_addr;
970 strlcpy(sattr->name, secstrings + sechdrs[i].sh_name, 970 strlcpy(sattr->name, secstrings + sechdrs[i].sh_name,
971 MODULE_SECT_NAME_LEN); 971 MODULE_SECT_NAME_LEN);
972 sattr->mattr.show = module_sect_show; 972 sattr->mattr.show = module_sect_show;
973 sattr->mattr.store = NULL; 973 sattr->mattr.store = NULL;
974 sattr->mattr.attr.name = sattr->name; 974 sattr->mattr.attr.name = sattr->name;
975 sattr->mattr.attr.owner = mod; 975 sattr->mattr.attr.owner = mod;
976 sattr->mattr.attr.mode = S_IRUGO; 976 sattr->mattr.attr.mode = S_IRUGO;
977 *(gattr++) = &(sattr++)->mattr.attr; 977 *(gattr++) = &(sattr++)->mattr.attr;
978 } 978 }
979 *gattr = NULL; 979 *gattr = NULL;
980 980
981 if (sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp)) 981 if (sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp))
982 goto out; 982 goto out;
983 983
984 mod->sect_attrs = sect_attrs; 984 mod->sect_attrs = sect_attrs;
985 return; 985 return;
986 out: 986 out:
987 kfree(sect_attrs); 987 kfree(sect_attrs);
988 } 988 }
989 989
990 static void remove_sect_attrs(struct module *mod) 990 static void remove_sect_attrs(struct module *mod)
991 { 991 {
992 if (mod->sect_attrs) { 992 if (mod->sect_attrs) {
993 sysfs_remove_group(&mod->mkobj.kobj, 993 sysfs_remove_group(&mod->mkobj.kobj,
994 &mod->sect_attrs->grp); 994 &mod->sect_attrs->grp);
995 /* We are positive that no one is using any sect attrs 995 /* We are positive that no one is using any sect attrs
996 * at this point. Deallocate immediately. */ 996 * at this point. Deallocate immediately. */
997 kfree(mod->sect_attrs); 997 kfree(mod->sect_attrs);
998 mod->sect_attrs = NULL; 998 mod->sect_attrs = NULL;
999 } 999 }
1000 } 1000 }
1001 1001
1002 1002
1003 #else 1003 #else
1004 static inline void add_sect_attrs(struct module *mod, unsigned int nsect, 1004 static inline void add_sect_attrs(struct module *mod, unsigned int nsect,
1005 char *sectstrings, Elf_Shdr *sechdrs) 1005 char *sectstrings, Elf_Shdr *sechdrs)
1006 { 1006 {
1007 } 1007 }
1008 1008
1009 static inline void remove_sect_attrs(struct module *mod) 1009 static inline void remove_sect_attrs(struct module *mod)
1010 { 1010 {
1011 } 1011 }
1012 #endif /* CONFIG_KALLSYMS */ 1012 #endif /* CONFIG_KALLSYMS */
1013 1013
1014 1014
1015 #ifdef CONFIG_MODULE_UNLOAD 1015 #ifdef CONFIG_MODULE_UNLOAD
1016 static inline int module_add_refcnt_attr(struct module *mod) 1016 static inline int module_add_refcnt_attr(struct module *mod)
1017 { 1017 {
1018 return sysfs_create_file(&mod->mkobj.kobj, &refcnt.attr); 1018 return sysfs_create_file(&mod->mkobj.kobj, &refcnt.attr);
1019 } 1019 }
1020 static void module_remove_refcnt_attr(struct module *mod) 1020 static void module_remove_refcnt_attr(struct module *mod)
1021 { 1021 {
1022 return sysfs_remove_file(&mod->mkobj.kobj, &refcnt.attr); 1022 return sysfs_remove_file(&mod->mkobj.kobj, &refcnt.attr);
1023 } 1023 }
1024 #else 1024 #else
1025 static inline int module_add_refcnt_attr(struct module *mod) 1025 static inline int module_add_refcnt_attr(struct module *mod)
1026 { 1026 {
1027 return 0; 1027 return 0;
1028 } 1028 }
1029 static void module_remove_refcnt_attr(struct module *mod) 1029 static void module_remove_refcnt_attr(struct module *mod)
1030 { 1030 {
1031 } 1031 }
1032 #endif 1032 #endif
1033 1033
1034 1034
1035 static int mod_sysfs_setup(struct module *mod, 1035 static int mod_sysfs_setup(struct module *mod,
1036 struct kernel_param *kparam, 1036 struct kernel_param *kparam,
1037 unsigned int num_params) 1037 unsigned int num_params)
1038 { 1038 {
1039 int err; 1039 int err;
1040 1040
1041 memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); 1041 memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
1042 err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name); 1042 err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name);
1043 if (err) 1043 if (err)
1044 goto out; 1044 goto out;
1045 kobj_set_kset_s(&mod->mkobj, module_subsys); 1045 kobj_set_kset_s(&mod->mkobj, module_subsys);
1046 mod->mkobj.mod = mod; 1046 mod->mkobj.mod = mod;
1047 err = kobject_register(&mod->mkobj.kobj); 1047 err = kobject_register(&mod->mkobj.kobj);
1048 if (err) 1048 if (err)
1049 goto out; 1049 goto out;
1050 1050
1051 err = module_add_refcnt_attr(mod); 1051 err = module_add_refcnt_attr(mod);
1052 if (err) 1052 if (err)
1053 goto out_unreg; 1053 goto out_unreg;
1054 1054
1055 err = module_param_sysfs_setup(mod, kparam, num_params); 1055 err = module_param_sysfs_setup(mod, kparam, num_params);
1056 if (err) 1056 if (err)
1057 goto out_unreg; 1057 goto out_unreg;
1058 1058
1059 return 0; 1059 return 0;
1060 1060
1061 out_unreg: 1061 out_unreg:
1062 kobject_unregister(&mod->mkobj.kobj); 1062 kobject_unregister(&mod->mkobj.kobj);
1063 out: 1063 out:
1064 return err; 1064 return err;
1065 } 1065 }
1066 1066
1067 static void mod_kobject_remove(struct module *mod) 1067 static void mod_kobject_remove(struct module *mod)
1068 { 1068 {
1069 module_remove_refcnt_attr(mod); 1069 module_remove_refcnt_attr(mod);
1070 module_param_sysfs_remove(mod); 1070 module_param_sysfs_remove(mod);
1071 1071
1072 kobject_unregister(&mod->mkobj.kobj); 1072 kobject_unregister(&mod->mkobj.kobj);
1073 } 1073 }
1074 1074
1075 /* 1075 /*
1076 * unlink the module while the whole machine is stopped with interrupts off 1076 * unlink the module while the whole machine is stopped with interrupts off
1077 * - this defends against kallsyms not taking locks 1077 * - this defends against kallsyms not taking locks
1078 */ 1078 */
1079 static int __unlink_module(void *_mod) 1079 static int __unlink_module(void *_mod)
1080 { 1080 {
1081 struct module *mod = _mod; 1081 struct module *mod = _mod;
1082 list_del(&mod->list); 1082 list_del(&mod->list);
1083 return 0; 1083 return 0;
1084 } 1084 }
1085 1085
1086 /* Free a module, remove from lists, etc (must hold module mutex). */ 1086 /* Free a module, remove from lists, etc (must hold module mutex). */
1087 static void free_module(struct module *mod) 1087 static void free_module(struct module *mod)
1088 { 1088 {
1089 /* Delete from various lists */ 1089 /* Delete from various lists */
1090 stop_machine_run(__unlink_module, mod, NR_CPUS); 1090 stop_machine_run(__unlink_module, mod, NR_CPUS);
1091 remove_sect_attrs(mod); 1091 remove_sect_attrs(mod);
1092 mod_kobject_remove(mod); 1092 mod_kobject_remove(mod);
1093 1093
1094 /* Arch-specific cleanup. */ 1094 /* Arch-specific cleanup. */
1095 module_arch_cleanup(mod); 1095 module_arch_cleanup(mod);
1096 1096
1097 /* Module unload stuff */ 1097 /* Module unload stuff */
1098 module_unload_free(mod); 1098 module_unload_free(mod);
1099 1099
1100 /* This may be NULL, but that's OK */ 1100 /* This may be NULL, but that's OK */
1101 module_free(mod, mod->module_init); 1101 module_free(mod, mod->module_init);
1102 kfree(mod->args); 1102 kfree(mod->args);
1103 if (mod->percpu) 1103 if (mod->percpu)
1104 percpu_modfree(mod->percpu); 1104 percpu_modfree(mod->percpu);
1105 1105
1106 /* Finally, free the core (containing the module structure) */ 1106 /* Finally, free the core (containing the module structure) */
1107 module_free(mod, mod->module_core); 1107 module_free(mod, mod->module_core);
1108 } 1108 }
1109 1109
1110 void *__symbol_get(const char *symbol) 1110 void *__symbol_get(const char *symbol)
1111 { 1111 {
1112 struct module *owner; 1112 struct module *owner;
1113 unsigned long value, flags; 1113 unsigned long value, flags;
1114 const unsigned long *crc; 1114 const unsigned long *crc;
1115 1115
1116 spin_lock_irqsave(&modlist_lock, flags); 1116 spin_lock_irqsave(&modlist_lock, flags);
1117 value = __find_symbol(symbol, &owner, &crc, 1); 1117 value = __find_symbol(symbol, &owner, &crc, 1);
1118 if (value && !strong_try_module_get(owner)) 1118 if (value && !strong_try_module_get(owner))
1119 value = 0; 1119 value = 0;
1120 spin_unlock_irqrestore(&modlist_lock, flags); 1120 spin_unlock_irqrestore(&modlist_lock, flags);
1121 1121
1122 return (void *)value; 1122 return (void *)value;
1123 } 1123 }
1124 EXPORT_SYMBOL_GPL(__symbol_get); 1124 EXPORT_SYMBOL_GPL(__symbol_get);
1125 1125
1126 /* Change all symbols so that sh_value encodes the pointer directly. */ 1126 /* Change all symbols so that sh_value encodes the pointer directly. */
1127 static int simplify_symbols(Elf_Shdr *sechdrs, 1127 static int simplify_symbols(Elf_Shdr *sechdrs,
1128 unsigned int symindex, 1128 unsigned int symindex,
1129 const char *strtab, 1129 const char *strtab,
1130 unsigned int versindex, 1130 unsigned int versindex,
1131 unsigned int pcpuindex, 1131 unsigned int pcpuindex,
1132 struct module *mod) 1132 struct module *mod)
1133 { 1133 {
1134 Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr; 1134 Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr;
1135 unsigned long secbase; 1135 unsigned long secbase;
1136 unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym); 1136 unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
1137 int ret = 0; 1137 int ret = 0;
1138 1138
1139 for (i = 1; i < n; i++) { 1139 for (i = 1; i < n; i++) {
1140 switch (sym[i].st_shndx) { 1140 switch (sym[i].st_shndx) {
1141 case SHN_COMMON: 1141 case SHN_COMMON:
1142 /* We compiled with -fno-common. These are not 1142 /* We compiled with -fno-common. These are not
1143 supposed to happen. */ 1143 supposed to happen. */
1144 DEBUGP("Common symbol: %s\n", strtab + sym[i].st_name); 1144 DEBUGP("Common symbol: %s\n", strtab + sym[i].st_name);
1145 printk("%s: please compile with -fno-common\n", 1145 printk("%s: please compile with -fno-common\n",
1146 mod->name); 1146 mod->name);
1147 ret = -ENOEXEC; 1147 ret = -ENOEXEC;
1148 break; 1148 break;
1149 1149
1150 case SHN_ABS: 1150 case SHN_ABS:
1151 /* Don't need to do anything */ 1151 /* Don't need to do anything */
1152 DEBUGP("Absolute symbol: 0x%08lx\n", 1152 DEBUGP("Absolute symbol: 0x%08lx\n",
1153 (long)sym[i].st_value); 1153 (long)sym[i].st_value);
1154 break; 1154 break;
1155 1155
1156 case SHN_UNDEF: 1156 case SHN_UNDEF:
1157 sym[i].st_value 1157 sym[i].st_value
1158 = resolve_symbol(sechdrs, versindex, 1158 = resolve_symbol(sechdrs, versindex,
1159 strtab + sym[i].st_name, mod); 1159 strtab + sym[i].st_name, mod);
1160 1160
1161 /* Ok if resolved. */ 1161 /* Ok if resolved. */
1162 if (sym[i].st_value != 0) 1162 if (sym[i].st_value != 0)
1163 break; 1163 break;
1164 /* Ok if weak. */ 1164 /* Ok if weak. */
1165 if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK) 1165 if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
1166 break; 1166 break;
1167 1167
1168 printk(KERN_WARNING "%s: Unknown symbol %s\n", 1168 printk(KERN_WARNING "%s: Unknown symbol %s\n",
1169 mod->name, strtab + sym[i].st_name); 1169 mod->name, strtab + sym[i].st_name);
1170 ret = -ENOENT; 1170 ret = -ENOENT;
1171 break; 1171 break;
1172 1172
1173 default: 1173 default:
1174 /* Divert to percpu allocation if a percpu var. */ 1174 /* Divert to percpu allocation if a percpu var. */
1175 if (sym[i].st_shndx == pcpuindex) 1175 if (sym[i].st_shndx == pcpuindex)
1176 secbase = (unsigned long)mod->percpu; 1176 secbase = (unsigned long)mod->percpu;
1177 else 1177 else
1178 secbase = sechdrs[sym[i].st_shndx].sh_addr; 1178 secbase = sechdrs[sym[i].st_shndx].sh_addr;
1179 sym[i].st_value += secbase; 1179 sym[i].st_value += secbase;
1180 break; 1180 break;
1181 } 1181 }
1182 } 1182 }
1183 1183
1184 return ret; 1184 return ret;
1185 } 1185 }
1186 1186
1187 /* Update size with this section: return offset. */ 1187 /* Update size with this section: return offset. */
1188 static long get_offset(unsigned long *size, Elf_Shdr *sechdr) 1188 static long get_offset(unsigned long *size, Elf_Shdr *sechdr)
1189 { 1189 {
1190 long ret; 1190 long ret;
1191 1191
1192 ret = ALIGN(*size, sechdr->sh_addralign ?: 1); 1192 ret = ALIGN(*size, sechdr->sh_addralign ?: 1);
1193 *size = ret + sechdr->sh_size; 1193 *size = ret + sechdr->sh_size;
1194 return ret; 1194 return ret;
1195 } 1195 }
1196 1196
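get_offset() above rounds the running size up to the section's alignment, returns that rounded value as the section's offset, then grows the running size by the section length. A worked example with assumed section sizes and alignments:

#include <stdio.h>

#define TOY_ALIGN(x, a) (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

/* Mirrors get_offset(): bump *size to the alignment, hand back the slot,
 * then account for the section's length. */
static unsigned long toy_get_offset(unsigned long *size, unsigned long sh_size,
                                    unsigned long sh_addralign)
{
        unsigned long ret = TOY_ALIGN(*size, sh_addralign ? sh_addralign : 1);
        *size = ret + sh_size;
        return ret;
}

int main(void)
{
        unsigned long core_size = 0;

        printf("%#lx\n", toy_get_offset(&core_size, 0x1234, 16));  /* 0      */
        printf("%#lx\n", toy_get_offset(&core_size, 0x20,   32));  /* 0x1240 */
        printf("%#lx\n", toy_get_offset(&core_size, 0x5,     1));  /* 0x1260 */
        printf("core_size = %#lx\n", core_size);                   /* 0x1265 */
        return 0;
}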
1197 /* Lay out the SHF_ALLOC sections in a way not dissimilar to how ld 1197 /* Lay out the SHF_ALLOC sections in a way not dissimilar to how ld
1198 might -- code, read-only data, read-write data, small data. Tally 1198 might -- code, read-only data, read-write data, small data. Tally
1199 sizes, and place the offsets into sh_entsize fields: high bit means it 1199 sizes, and place the offsets into sh_entsize fields: high bit means it
1200 belongs in init. */ 1200 belongs in init. */
1201 static void layout_sections(struct module *mod, 1201 static void layout_sections(struct module *mod,
1202 const Elf_Ehdr *hdr, 1202 const Elf_Ehdr *hdr,
1203 Elf_Shdr *sechdrs, 1203 Elf_Shdr *sechdrs,
1204 const char *secstrings) 1204 const char *secstrings)
1205 { 1205 {
1206 static unsigned long const masks[][2] = { 1206 static unsigned long const masks[][2] = {
1207 /* NOTE: all executable code must be the first section 1207 /* NOTE: all executable code must be the first section
1208 * in this array; otherwise modify the text_size 1208 * in this array; otherwise modify the text_size
1209 * finder in the two loops below */ 1209 * finder in the two loops below */
1210 { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL }, 1210 { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL },
1211 { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL }, 1211 { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL },
1212 { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL }, 1212 { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL },
1213 { ARCH_SHF_SMALL | SHF_ALLOC, 0 } 1213 { ARCH_SHF_SMALL | SHF_ALLOC, 0 }
1214 }; 1214 };
1215 unsigned int m, i; 1215 unsigned int m, i;
1216 1216
1217 for (i = 0; i < hdr->e_shnum; i++) 1217 for (i = 0; i < hdr->e_shnum; i++)
1218 sechdrs[i].sh_entsize = ~0UL; 1218 sechdrs[i].sh_entsize = ~0UL;
1219 1219
1220 DEBUGP("Core section allocation order:\n"); 1220 DEBUGP("Core section allocation order:\n");
1221 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 1221 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
1222 for (i = 0; i < hdr->e_shnum; ++i) { 1222 for (i = 0; i < hdr->e_shnum; ++i) {
1223 Elf_Shdr *s = &sechdrs[i]; 1223 Elf_Shdr *s = &sechdrs[i];
1224 1224
1225 if ((s->sh_flags & masks[m][0]) != masks[m][0] 1225 if ((s->sh_flags & masks[m][0]) != masks[m][0]
1226 || (s->sh_flags & masks[m][1]) 1226 || (s->sh_flags & masks[m][1])
1227 || s->sh_entsize != ~0UL 1227 || s->sh_entsize != ~0UL
1228 || strncmp(secstrings + s->sh_name, 1228 || strncmp(secstrings + s->sh_name,
1229 ".init", 5) == 0) 1229 ".init", 5) == 0)
1230 continue; 1230 continue;
1231 s->sh_entsize = get_offset(&mod->core_size, s); 1231 s->sh_entsize = get_offset(&mod->core_size, s);
1232 DEBUGP("\t%s\n", secstrings + s->sh_name); 1232 DEBUGP("\t%s\n", secstrings + s->sh_name);
1233 } 1233 }
1234 if (m == 0) 1234 if (m == 0)
1235 mod->core_text_size = mod->core_size; 1235 mod->core_text_size = mod->core_size;
1236 } 1236 }
1237 1237
1238 DEBUGP("Init section allocation order:\n"); 1238 DEBUGP("Init section allocation order:\n");
1239 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 1239 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
1240 for (i = 0; i < hdr->e_shnum; ++i) { 1240 for (i = 0; i < hdr->e_shnum; ++i) {
1241 Elf_Shdr *s = &sechdrs[i]; 1241 Elf_Shdr *s = &sechdrs[i];
1242 1242
1243 if ((s->sh_flags & masks[m][0]) != masks[m][0] 1243 if ((s->sh_flags & masks[m][0]) != masks[m][0]
1244 || (s->sh_flags & masks[m][1]) 1244 || (s->sh_flags & masks[m][1])
1245 || s->sh_entsize != ~0UL 1245 || s->sh_entsize != ~0UL
1246 || strncmp(secstrings + s->sh_name, 1246 || strncmp(secstrings + s->sh_name,
1247 ".init", 5) != 0) 1247 ".init", 5) != 0)
1248 continue; 1248 continue;
1249 s->sh_entsize = (get_offset(&mod->init_size, s) 1249 s->sh_entsize = (get_offset(&mod->init_size, s)
1250 | INIT_OFFSET_MASK); 1250 | INIT_OFFSET_MASK);
1251 DEBUGP("\t%s\n", secstrings + s->sh_name); 1251 DEBUGP("\t%s\n", secstrings + s->sh_name);
1252 } 1252 }
1253 if (m == 0) 1253 if (m == 0)
1254 mod->init_text_size = mod->init_size; 1254 mod->init_text_size = mod->init_size;
1255 } 1255 }
1256 } 1256 }
1257 1257
1258 static inline int license_is_gpl_compatible(const char *license) 1258 static inline int license_is_gpl_compatible(const char *license)
1259 { 1259 {
1260 return (strcmp(license, "GPL") == 0 1260 return (strcmp(license, "GPL") == 0
1261 || strcmp(license, "GPL v2") == 0 1261 || strcmp(license, "GPL v2") == 0
1262 || strcmp(license, "GPL and additional rights") == 0 1262 || strcmp(license, "GPL and additional rights") == 0
1263 || strcmp(license, "Dual BSD/GPL") == 0 1263 || strcmp(license, "Dual BSD/GPL") == 0
1264 || strcmp(license, "Dual MPL/GPL") == 0); 1264 || strcmp(license, "Dual MPL/GPL") == 0);
1265 } 1265 }
1266 1266
1267 static void set_license(struct module *mod, const char *license) 1267 static void set_license(struct module *mod, const char *license)
1268 { 1268 {
1269 if (!license) 1269 if (!license)
1270 license = "unspecified"; 1270 license = "unspecified";
1271 1271
1272 mod->license_gplok = license_is_gpl_compatible(license); 1272 mod->license_gplok = license_is_gpl_compatible(license);
1273 if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) { 1273 if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) {
1274 printk(KERN_WARNING "%s: module license '%s' taints kernel.\n", 1274 printk(KERN_WARNING "%s: module license '%s' taints kernel.\n",
1275 mod->name, license); 1275 mod->name, license);
1276 tainted |= TAINT_PROPRIETARY_MODULE; 1276 tainted |= TAINT_PROPRIETARY_MODULE;
1277 } 1277 }
1278 } 1278 }
1279 1279
1280 /* Parse tag=value strings from .modinfo section */ 1280 /* Parse tag=value strings from .modinfo section */
1281 static char *next_string(char *string, unsigned long *secsize) 1281 static char *next_string(char *string, unsigned long *secsize)
1282 { 1282 {
1283 /* Skip non-zero chars */ 1283 /* Skip non-zero chars */
1284 while (string[0]) { 1284 while (string[0]) {
1285 string++; 1285 string++;
1286 if ((*secsize)-- <= 1) 1286 if ((*secsize)-- <= 1)
1287 return NULL; 1287 return NULL;
1288 } 1288 }
1289 1289
1290 /* Skip any zero padding. */ 1290 /* Skip any zero padding. */
1291 while (!string[0]) { 1291 while (!string[0]) {
1292 string++; 1292 string++;
1293 if ((*secsize)-- <= 1) 1293 if ((*secsize)-- <= 1)
1294 return NULL; 1294 return NULL;
1295 } 1295 }
1296 return string; 1296 return string;
1297 } 1297 }
1298 1298
1299 static char *get_modinfo(Elf_Shdr *sechdrs, 1299 static char *get_modinfo(Elf_Shdr *sechdrs,
1300 unsigned int info, 1300 unsigned int info,
1301 const char *tag) 1301 const char *tag)
1302 { 1302 {
1303 char *p; 1303 char *p;
1304 unsigned int taglen = strlen(tag); 1304 unsigned int taglen = strlen(tag);
1305 unsigned long size = sechdrs[info].sh_size; 1305 unsigned long size = sechdrs[info].sh_size;
1306 1306
1307 for (p = (char *)sechdrs[info].sh_addr; p; p = next_string(p, &size)) { 1307 for (p = (char *)sechdrs[info].sh_addr; p; p = next_string(p, &size)) {
1308 if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') 1308 if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
1309 return p + taglen + 1; 1309 return p + taglen + 1;
1310 } 1310 }
1311 return NULL; 1311 return NULL;
1312 } 1312 }
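The .modinfo section is simply a run of NUL-terminated "tag=value" strings, which is all next_string() and get_modinfo() rely on. A small userspace sketch of the same walk (standalone code, not kernel API):

    #include <stdio.h>
    #include <string.h>

    /* Walk a "tag=value\0tag=value\0..." blob the way get_modinfo() does. */
    static const char *find_tag(const char *p, unsigned long size, const char *tag)
    {
            unsigned long taglen = strlen(tag);

            while (size) {
                    if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
                            return p + taglen + 1;
                    while (size && *p)  { p++; size--; }   /* skip this string */
                    while (size && !*p) { p++; size--; }   /* skip NUL padding */
            }
            return NULL;
    }

    int main(void)
    {
            static const char blob[] = "license=GPL\0vermagic=2.6.12 SMP\0";

            printf("%s\n", find_tag(blob, sizeof(blob), "vermagic"));
            return 0;
    }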
1313 1313
1314 #ifdef CONFIG_KALLSYMS 1314 #ifdef CONFIG_KALLSYMS
1315 int is_exported(const char *name, const struct module *mod) 1315 int is_exported(const char *name, const struct module *mod)
1316 { 1316 {
1317 unsigned int i; 1317 unsigned int i;
1318 1318
1319 if (!mod) { 1319 if (!mod) {
1320 for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++) 1320 for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++)
1321 if (strcmp(__start___ksymtab[i].name, name) == 0) 1321 if (strcmp(__start___ksymtab[i].name, name) == 0)
1322 return 1; 1322 return 1;
1323 return 0; 1323 return 0;
1324 } 1324 }
1325 for (i = 0; i < mod->num_syms; i++) 1325 for (i = 0; i < mod->num_syms; i++)
1326 if (strcmp(mod->syms[i].name, name) == 0) 1326 if (strcmp(mod->syms[i].name, name) == 0)
1327 return 1; 1327 return 1;
1328 return 0; 1328 return 0;
1329 } 1329 }
1330 1330
1331 /* As per nm */ 1331 /* As per nm */
1332 static char elf_type(const Elf_Sym *sym, 1332 static char elf_type(const Elf_Sym *sym,
1333 Elf_Shdr *sechdrs, 1333 Elf_Shdr *sechdrs,
1334 const char *secstrings, 1334 const char *secstrings,
1335 struct module *mod) 1335 struct module *mod)
1336 { 1336 {
1337 if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { 1337 if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
1338 if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) 1338 if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT)
1339 return 'v'; 1339 return 'v';
1340 else 1340 else
1341 return 'w'; 1341 return 'w';
1342 } 1342 }
1343 if (sym->st_shndx == SHN_UNDEF) 1343 if (sym->st_shndx == SHN_UNDEF)
1344 return 'U'; 1344 return 'U';
1345 if (sym->st_shndx == SHN_ABS) 1345 if (sym->st_shndx == SHN_ABS)
1346 return 'a'; 1346 return 'a';
1347 if (sym->st_shndx >= SHN_LORESERVE) 1347 if (sym->st_shndx >= SHN_LORESERVE)
1348 return '?'; 1348 return '?';
1349 if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR) 1349 if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR)
1350 return 't'; 1350 return 't';
1351 if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC 1351 if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC
1352 && sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) { 1352 && sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) {
1353 if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE)) 1353 if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE))
1354 return 'r'; 1354 return 'r';
1355 else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL) 1355 else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
1356 return 'g'; 1356 return 'g';
1357 else 1357 else
1358 return 'd'; 1358 return 'd';
1359 } 1359 }
1360 if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { 1360 if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
1361 if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL) 1361 if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
1362 return 's'; 1362 return 's';
1363 else 1363 else
1364 return 'b'; 1364 return 'b';
1365 } 1365 }
1366 if (strncmp(secstrings + sechdrs[sym->st_shndx].sh_name, 1366 if (strncmp(secstrings + sechdrs[sym->st_shndx].sh_name,
1367 ".debug", strlen(".debug")) == 0) 1367 ".debug", strlen(".debug")) == 0)
1368 return 'n'; 1368 return 'n';
1369 return '?'; 1369 return '?';
1370 } 1370 }
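A quick legend for the characters returned above, which follow nm(1)-style conventions:

    /*
     *   v / w   weak object / weak symbol        U   undefined
     *   a       absolute                         t   text (SHF_EXECINSTR)
     *   r       read-only data                   d   writable data
     *   g / s   small data / small bss           b   bss (SHT_NOBITS)
     *           (ARCH_SHF_SMALL sections)        n   .debug* sections
     *   ?       anything else
     */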
1371 1371
1372 static void add_kallsyms(struct module *mod, 1372 static void add_kallsyms(struct module *mod,
1373 Elf_Shdr *sechdrs, 1373 Elf_Shdr *sechdrs,
1374 unsigned int symindex, 1374 unsigned int symindex,
1375 unsigned int strindex, 1375 unsigned int strindex,
1376 const char *secstrings) 1376 const char *secstrings)
1377 { 1377 {
1378 unsigned int i; 1378 unsigned int i;
1379 1379
1380 mod->symtab = (void *)sechdrs[symindex].sh_addr; 1380 mod->symtab = (void *)sechdrs[symindex].sh_addr;
1381 mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); 1381 mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
1382 mod->strtab = (void *)sechdrs[strindex].sh_addr; 1382 mod->strtab = (void *)sechdrs[strindex].sh_addr;
1383 1383
1384 /* Set types up while we still have access to sections. */ 1384 /* Set types up while we still have access to sections. */
1385 for (i = 0; i < mod->num_symtab; i++) 1385 for (i = 0; i < mod->num_symtab; i++)
1386 mod->symtab[i].st_info 1386 mod->symtab[i].st_info
1387 = elf_type(&mod->symtab[i], sechdrs, secstrings, mod); 1387 = elf_type(&mod->symtab[i], sechdrs, secstrings, mod);
1388 } 1388 }
1389 #else 1389 #else
1390 static inline void add_kallsyms(struct module *mod, 1390 static inline void add_kallsyms(struct module *mod,
1391 Elf_Shdr *sechdrs, 1391 Elf_Shdr *sechdrs,
1392 unsigned int symindex, 1392 unsigned int symindex,
1393 unsigned int strindex, 1393 unsigned int strindex,
1394 const char *secstrings) 1394 const char *secstrings)
1395 { 1395 {
1396 } 1396 }
1397 #endif /* CONFIG_KALLSYMS */ 1397 #endif /* CONFIG_KALLSYMS */
1398 1398
1399 /* Allocate and load the module: note that size of section 0 is always 1399 /* Allocate and load the module: note that size of section 0 is always
1400 zero, and we rely on this for optional sections. */ 1400 zero, and we rely on this for optional sections. */
1401 static struct module *load_module(void __user *umod, 1401 static struct module *load_module(void __user *umod,
1402 unsigned long len, 1402 unsigned long len,
1403 const char __user *uargs) 1403 const char __user *uargs)
1404 { 1404 {
1405 Elf_Ehdr *hdr; 1405 Elf_Ehdr *hdr;
1406 Elf_Shdr *sechdrs; 1406 Elf_Shdr *sechdrs;
1407 char *secstrings, *args, *modmagic, *strtab = NULL; 1407 char *secstrings, *args, *modmagic, *strtab = NULL;
1408 unsigned int i, symindex = 0, strindex = 0, setupindex, exindex, 1408 unsigned int i, symindex = 0, strindex = 0, setupindex, exindex,
1409 exportindex, modindex, obsparmindex, infoindex, gplindex, 1409 exportindex, modindex, obsparmindex, infoindex, gplindex,
1410 crcindex, gplcrcindex, versindex, pcpuindex; 1410 crcindex, gplcrcindex, versindex, pcpuindex;
1411 long arglen; 1411 long arglen;
1412 struct module *mod; 1412 struct module *mod;
1413 long err = 0; 1413 long err = 0;
1414 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1414 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
1415 struct exception_table_entry *extable; 1415 struct exception_table_entry *extable;
1416 1416
1417 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", 1417 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
1418 umod, len, uargs); 1418 umod, len, uargs);
1419 if (len < sizeof(*hdr)) 1419 if (len < sizeof(*hdr))
1420 return ERR_PTR(-ENOEXEC); 1420 return ERR_PTR(-ENOEXEC);
1421 1421
1422 /* Suck in entire file: we'll want most of it. */ 1422 /* Suck in entire file: we'll want most of it. */
1423 /* vmalloc barfs on "unusual" numbers. Check here */ 1423 /* vmalloc barfs on "unusual" numbers. Check here */
1424 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) 1424 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
1425 return ERR_PTR(-ENOMEM); 1425 return ERR_PTR(-ENOMEM);
1426 if (copy_from_user(hdr, umod, len) != 0) { 1426 if (copy_from_user(hdr, umod, len) != 0) {
1427 err = -EFAULT; 1427 err = -EFAULT;
1428 goto free_hdr; 1428 goto free_hdr;
1429 } 1429 }
1430 1430
1431 /* Sanity checks against insmoding binaries or wrong arch, 1431 /* Sanity checks against insmoding binaries or wrong arch,
1432 weird elf version */ 1432 weird elf version */
1433 if (memcmp(hdr->e_ident, ELFMAG, 4) != 0 1433 if (memcmp(hdr->e_ident, ELFMAG, 4) != 0
1434 || hdr->e_type != ET_REL 1434 || hdr->e_type != ET_REL
1435 || !elf_check_arch(hdr) 1435 || !elf_check_arch(hdr)
1436 || hdr->e_shentsize != sizeof(*sechdrs)) { 1436 || hdr->e_shentsize != sizeof(*sechdrs)) {
1437 err = -ENOEXEC; 1437 err = -ENOEXEC;
1438 goto free_hdr; 1438 goto free_hdr;
1439 } 1439 }
1440 1440
1441 if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) 1441 if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr))
1442 goto truncated; 1442 goto truncated;
1443 1443
1444 /* Convenience variables */ 1444 /* Convenience variables */
1445 sechdrs = (void *)hdr + hdr->e_shoff; 1445 sechdrs = (void *)hdr + hdr->e_shoff;
1446 secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; 1446 secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
1447 sechdrs[0].sh_addr = 0; 1447 sechdrs[0].sh_addr = 0;
1448 1448
1449 for (i = 1; i < hdr->e_shnum; i++) { 1449 for (i = 1; i < hdr->e_shnum; i++) {
1450 if (sechdrs[i].sh_type != SHT_NOBITS 1450 if (sechdrs[i].sh_type != SHT_NOBITS
1451 && len < sechdrs[i].sh_offset + sechdrs[i].sh_size) 1451 && len < sechdrs[i].sh_offset + sechdrs[i].sh_size)
1452 goto truncated; 1452 goto truncated;
1453 1453
1454 /* Mark all sections sh_addr with their address in the 1454 /* Mark all sections sh_addr with their address in the
1455 temporary image. */ 1455 temporary image. */
1456 sechdrs[i].sh_addr = (size_t)hdr + sechdrs[i].sh_offset; 1456 sechdrs[i].sh_addr = (size_t)hdr + sechdrs[i].sh_offset;
1457 1457
1458 /* Internal symbols and strings. */ 1458 /* Internal symbols and strings. */
1459 if (sechdrs[i].sh_type == SHT_SYMTAB) { 1459 if (sechdrs[i].sh_type == SHT_SYMTAB) {
1460 symindex = i; 1460 symindex = i;
1461 strindex = sechdrs[i].sh_link; 1461 strindex = sechdrs[i].sh_link;
1462 strtab = (char *)hdr + sechdrs[strindex].sh_offset; 1462 strtab = (char *)hdr + sechdrs[strindex].sh_offset;
1463 } 1463 }
1464 #ifndef CONFIG_MODULE_UNLOAD 1464 #ifndef CONFIG_MODULE_UNLOAD
1465 /* Don't load .exit sections */ 1465 /* Don't load .exit sections */
1466 if (strncmp(secstrings+sechdrs[i].sh_name, ".exit", 5) == 0) 1466 if (strncmp(secstrings+sechdrs[i].sh_name, ".exit", 5) == 0)
1467 sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; 1467 sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
1468 #endif 1468 #endif
1469 } 1469 }
1470 1470
1471 modindex = find_sec(hdr, sechdrs, secstrings, 1471 modindex = find_sec(hdr, sechdrs, secstrings,
1472 ".gnu.linkonce.this_module"); 1472 ".gnu.linkonce.this_module");
1473 if (!modindex) { 1473 if (!modindex) {
1474 printk(KERN_WARNING "No module found in object\n"); 1474 printk(KERN_WARNING "No module found in object\n");
1475 err = -ENOEXEC; 1475 err = -ENOEXEC;
1476 goto free_hdr; 1476 goto free_hdr;
1477 } 1477 }
1478 mod = (void *)sechdrs[modindex].sh_addr; 1478 mod = (void *)sechdrs[modindex].sh_addr;
1479 1479
1480 if (symindex == 0) { 1480 if (symindex == 0) {
1481 printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", 1481 printk(KERN_WARNING "%s: module has no symbols (stripped?)\n",
1482 mod->name); 1482 mod->name);
1483 err = -ENOEXEC; 1483 err = -ENOEXEC;
1484 goto free_hdr; 1484 goto free_hdr;
1485 } 1485 }
1486 1486
1487 /* Optional sections */ 1487 /* Optional sections */
1488 exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab"); 1488 exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab");
1489 gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); 1489 gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl");
1490 crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab"); 1490 crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab");
1491 gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl"); 1491 gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl");
1492 setupindex = find_sec(hdr, sechdrs, secstrings, "__param"); 1492 setupindex = find_sec(hdr, sechdrs, secstrings, "__param");
1493 exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table"); 1493 exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table");
1494 obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm"); 1494 obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm");
1495 versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); 1495 versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
1496 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); 1496 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
1497 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); 1497 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
1498 1498
1499 /* Don't keep modinfo section */ 1499 /* Don't keep modinfo section */
1500 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 1500 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
1501 #ifdef CONFIG_KALLSYMS 1501 #ifdef CONFIG_KALLSYMS
1502 /* Keep symbol and string tables for decoding later. */ 1502 /* Keep symbol and string tables for decoding later. */
1503 sechdrs[symindex].sh_flags |= SHF_ALLOC; 1503 sechdrs[symindex].sh_flags |= SHF_ALLOC;
1504 sechdrs[strindex].sh_flags |= SHF_ALLOC; 1504 sechdrs[strindex].sh_flags |= SHF_ALLOC;
1505 #endif 1505 #endif
1506 1506
1507 /* Check module struct version now, before we try to use module. */ 1507 /* Check module struct version now, before we try to use module. */
1508 if (!check_modstruct_version(sechdrs, versindex, mod)) { 1508 if (!check_modstruct_version(sechdrs, versindex, mod)) {
1509 err = -ENOEXEC; 1509 err = -ENOEXEC;
1510 goto free_hdr; 1510 goto free_hdr;
1511 } 1511 }
1512 1512
1513 modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); 1513 modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
1514 /* This is allowed: modprobe --force will invalidate it. */ 1514 /* This is allowed: modprobe --force will invalidate it. */
1515 if (!modmagic) { 1515 if (!modmagic) {
1516 tainted |= TAINT_FORCED_MODULE; 1516 tainted |= TAINT_FORCED_MODULE;
1517 printk(KERN_WARNING "%s: no version magic, tainting kernel.\n", 1517 printk(KERN_WARNING "%s: no version magic, tainting kernel.\n",
1518 mod->name); 1518 mod->name);
1519 } else if (!same_magic(modmagic, vermagic)) { 1519 } else if (!same_magic(modmagic, vermagic)) {
1520 printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", 1520 printk(KERN_ERR "%s: version magic '%s' should be '%s'\n",
1521 mod->name, modmagic, vermagic); 1521 mod->name, modmagic, vermagic);
1522 err = -ENOEXEC; 1522 err = -ENOEXEC;
1523 goto free_hdr; 1523 goto free_hdr;
1524 } 1524 }
1525 1525
1526 /* Now copy in args */ 1526 /* Now copy in args */
1527 arglen = strlen_user(uargs); 1527 arglen = strlen_user(uargs);
1528 if (!arglen) { 1528 if (!arglen) {
1529 err = -EFAULT; 1529 err = -EFAULT;
1530 goto free_hdr; 1530 goto free_hdr;
1531 } 1531 }
1532 args = kmalloc(arglen, GFP_KERNEL); 1532 args = kmalloc(arglen, GFP_KERNEL);
1533 if (!args) { 1533 if (!args) {
1534 err = -ENOMEM; 1534 err = -ENOMEM;
1535 goto free_hdr; 1535 goto free_hdr;
1536 } 1536 }
1537 if (copy_from_user(args, uargs, arglen) != 0) { 1537 if (copy_from_user(args, uargs, arglen) != 0) {
1538 err = -EFAULT; 1538 err = -EFAULT;
1539 goto free_mod; 1539 goto free_mod;
1540 } 1540 }
1541 1541
1542 if (find_module(mod->name)) { 1542 if (find_module(mod->name)) {
1543 err = -EEXIST; 1543 err = -EEXIST;
1544 goto free_mod; 1544 goto free_mod;
1545 } 1545 }
1546 1546
1547 mod->state = MODULE_STATE_COMING; 1547 mod->state = MODULE_STATE_COMING;
1548 1548
1549 /* Allow arches to frob section contents and sizes. */ 1549 /* Allow arches to frob section contents and sizes. */
1550 err = module_frob_arch_sections(hdr, sechdrs, secstrings, mod); 1550 err = module_frob_arch_sections(hdr, sechdrs, secstrings, mod);
1551 if (err < 0) 1551 if (err < 0)
1552 goto free_mod; 1552 goto free_mod;
1553 1553
1554 if (pcpuindex) { 1554 if (pcpuindex) {
1555 /* We have a special allocation for this section. */ 1555 /* We have a special allocation for this section. */
1556 percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, 1556 percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
1557 sechdrs[pcpuindex].sh_addralign); 1557 sechdrs[pcpuindex].sh_addralign);
1558 if (!percpu) { 1558 if (!percpu) {
1559 err = -ENOMEM; 1559 err = -ENOMEM;
1560 goto free_mod; 1560 goto free_mod;
1561 } 1561 }
1562 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 1562 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
1563 mod->percpu = percpu; 1563 mod->percpu = percpu;
1564 } 1564 }
1565 1565
1566 /* Determine total sizes, and put offsets in sh_entsize. For now 1566 /* Determine total sizes, and put offsets in sh_entsize. For now
1567 this is done generically; there doesn't appear to be any 1567 this is done generically; there doesn't appear to be any
1568 special cases for the architectures. */ 1568 special cases for the architectures. */
1569 layout_sections(mod, hdr, sechdrs, secstrings); 1569 layout_sections(mod, hdr, sechdrs, secstrings);
1570 1570
1571 /* Do the allocs. */ 1571 /* Do the allocs. */
1572 ptr = module_alloc(mod->core_size); 1572 ptr = module_alloc(mod->core_size);
1573 if (!ptr) { 1573 if (!ptr) {
1574 err = -ENOMEM; 1574 err = -ENOMEM;
1575 goto free_percpu; 1575 goto free_percpu;
1576 } 1576 }
1577 memset(ptr, 0, mod->core_size); 1577 memset(ptr, 0, mod->core_size);
1578 mod->module_core = ptr; 1578 mod->module_core = ptr;
1579 1579
1580 ptr = module_alloc(mod->init_size); 1580 ptr = module_alloc(mod->init_size);
1581 if (!ptr && mod->init_size) { 1581 if (!ptr && mod->init_size) {
1582 err = -ENOMEM; 1582 err = -ENOMEM;
1583 goto free_core; 1583 goto free_core;
1584 } 1584 }
1585 memset(ptr, 0, mod->init_size); 1585 memset(ptr, 0, mod->init_size);
1586 mod->module_init = ptr; 1586 mod->module_init = ptr;
1587 1587
1588 /* Transfer each section which specifies SHF_ALLOC */ 1588 /* Transfer each section which specifies SHF_ALLOC */
1589 DEBUGP("final section addresses:\n"); 1589 DEBUGP("final section addresses:\n");
1590 for (i = 0; i < hdr->e_shnum; i++) { 1590 for (i = 0; i < hdr->e_shnum; i++) {
1591 void *dest; 1591 void *dest;
1592 1592
1593 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 1593 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
1594 continue; 1594 continue;
1595 1595
1596 if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK) 1596 if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK)
1597 dest = mod->module_init 1597 dest = mod->module_init
1598 + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK); 1598 + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK);
1599 else 1599 else
1600 dest = mod->module_core + sechdrs[i].sh_entsize; 1600 dest = mod->module_core + sechdrs[i].sh_entsize;
1601 1601
1602 if (sechdrs[i].sh_type != SHT_NOBITS) 1602 if (sechdrs[i].sh_type != SHT_NOBITS)
1603 memcpy(dest, (void *)sechdrs[i].sh_addr, 1603 memcpy(dest, (void *)sechdrs[i].sh_addr,
1604 sechdrs[i].sh_size); 1604 sechdrs[i].sh_size);
1605 /* Update sh_addr to point to copy in image. */ 1605 /* Update sh_addr to point to copy in image. */
1606 sechdrs[i].sh_addr = (unsigned long)dest; 1606 sechdrs[i].sh_addr = (unsigned long)dest;
1607 DEBUGP("\t0x%lx %s\n", sechdrs[i].sh_addr, secstrings + sechdrs[i].sh_name); 1607 DEBUGP("\t0x%lx %s\n", sechdrs[i].sh_addr, secstrings + sechdrs[i].sh_name);
1608 } 1608 }
1609 /* Module has been moved. */ 1609 /* Module has been moved. */
1610 mod = (void *)sechdrs[modindex].sh_addr; 1610 mod = (void *)sechdrs[modindex].sh_addr;
1611 1611
1612 /* Now we've moved module, initialize linked lists, etc. */ 1612 /* Now we've moved module, initialize linked lists, etc. */
1613 module_unload_init(mod); 1613 module_unload_init(mod);
1614 1614
1615 /* Set up license info based on the info section */ 1615 /* Set up license info based on the info section */
1616 set_license(mod, get_modinfo(sechdrs, infoindex, "license")); 1616 set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
1617 1617
1618 /* Fix up syms, so that st_value is a pointer to location. */ 1618 /* Fix up syms, so that st_value is a pointer to location. */
1619 err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex, 1619 err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex,
1620 mod); 1620 mod);
1621 if (err < 0) 1621 if (err < 0)
1622 goto cleanup; 1622 goto cleanup;
1623 1623
1624 /* Set up EXPORTed & EXPORT_GPLed symbols (section 0 is 0 length) */ 1624 /* Set up EXPORTed & EXPORT_GPLed symbols (section 0 is 0 length) */
1625 mod->num_syms = sechdrs[exportindex].sh_size / sizeof(*mod->syms); 1625 mod->num_syms = sechdrs[exportindex].sh_size / sizeof(*mod->syms);
1626 mod->syms = (void *)sechdrs[exportindex].sh_addr; 1626 mod->syms = (void *)sechdrs[exportindex].sh_addr;
1627 if (crcindex) 1627 if (crcindex)
1628 mod->crcs = (void *)sechdrs[crcindex].sh_addr; 1628 mod->crcs = (void *)sechdrs[crcindex].sh_addr;
1629 mod->num_gpl_syms = sechdrs[gplindex].sh_size / sizeof(*mod->gpl_syms); 1629 mod->num_gpl_syms = sechdrs[gplindex].sh_size / sizeof(*mod->gpl_syms);
1630 mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr; 1630 mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr;
1631 if (gplcrcindex) 1631 if (gplcrcindex)
1632 mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; 1632 mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr;
1633 1633
1634 #ifdef CONFIG_MODVERSIONS 1634 #ifdef CONFIG_MODVERSIONS
1635 if ((mod->num_syms && !crcindex) || 1635 if ((mod->num_syms && !crcindex) ||
1636 (mod->num_gpl_syms && !gplcrcindex)) { 1636 (mod->num_gpl_syms && !gplcrcindex)) {
1637 printk(KERN_WARNING "%s: No versions for exported symbols." 1637 printk(KERN_WARNING "%s: No versions for exported symbols."
1638 " Tainting kernel.\n", mod->name); 1638 " Tainting kernel.\n", mod->name);
1639 tainted |= TAINT_FORCED_MODULE; 1639 tainted |= TAINT_FORCED_MODULE;
1640 } 1640 }
1641 #endif 1641 #endif
1642 1642
1643 /* Now do relocations. */ 1643 /* Now do relocations. */
1644 for (i = 1; i < hdr->e_shnum; i++) { 1644 for (i = 1; i < hdr->e_shnum; i++) {
1645 const char *strtab = (char *)sechdrs[strindex].sh_addr; 1645 const char *strtab = (char *)sechdrs[strindex].sh_addr;
1646 unsigned int info = sechdrs[i].sh_info; 1646 unsigned int info = sechdrs[i].sh_info;
1647 1647
1648 /* Not a valid relocation section? */ 1648 /* Not a valid relocation section? */
1649 if (info >= hdr->e_shnum) 1649 if (info >= hdr->e_shnum)
1650 continue; 1650 continue;
1651 1651
1652 /* Don't bother with non-allocated sections */ 1652 /* Don't bother with non-allocated sections */
1653 if (!(sechdrs[info].sh_flags & SHF_ALLOC)) 1653 if (!(sechdrs[info].sh_flags & SHF_ALLOC))
1654 continue; 1654 continue;
1655 1655
1656 if (sechdrs[i].sh_type == SHT_REL) 1656 if (sechdrs[i].sh_type == SHT_REL)
1657 err = apply_relocate(sechdrs, strtab, symindex, i,mod); 1657 err = apply_relocate(sechdrs, strtab, symindex, i,mod);
1658 else if (sechdrs[i].sh_type == SHT_RELA) 1658 else if (sechdrs[i].sh_type == SHT_RELA)
1659 err = apply_relocate_add(sechdrs, strtab, symindex, i, 1659 err = apply_relocate_add(sechdrs, strtab, symindex, i,
1660 mod); 1660 mod);
1661 if (err < 0) 1661 if (err < 0)
1662 goto cleanup; 1662 goto cleanup;
1663 } 1663 }
1664 1664
1665 /* Set up and sort exception table */ 1665 /* Set up and sort exception table */
1666 mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable); 1666 mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable);
1667 mod->extable = extable = (void *)sechdrs[exindex].sh_addr; 1667 mod->extable = extable = (void *)sechdrs[exindex].sh_addr;
1668 sort_extable(extable, extable + mod->num_exentries); 1668 sort_extable(extable, extable + mod->num_exentries);
1669 1669
1670 /* Finally, copy percpu area over. */ 1670 /* Finally, copy percpu area over. */
1671 percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, 1671 percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr,
1672 sechdrs[pcpuindex].sh_size); 1672 sechdrs[pcpuindex].sh_size);
1673 1673
1674 add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); 1674 add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
1675 1675
1676 err = module_finalize(hdr, sechdrs, mod); 1676 err = module_finalize(hdr, sechdrs, mod);
1677 if (err < 0) 1677 if (err < 0)
1678 goto cleanup; 1678 goto cleanup;
1679 1679
1680 mod->args = args; 1680 mod->args = args;
1681 if (obsparmindex) { 1681 if (obsparmindex) {
1682 err = obsolete_params(mod->name, mod->args, 1682 err = obsolete_params(mod->name, mod->args,
1683 (struct obsolete_modparm *) 1683 (struct obsolete_modparm *)
1684 sechdrs[obsparmindex].sh_addr, 1684 sechdrs[obsparmindex].sh_addr,
1685 sechdrs[obsparmindex].sh_size 1685 sechdrs[obsparmindex].sh_size
1686 / sizeof(struct obsolete_modparm), 1686 / sizeof(struct obsolete_modparm),
1687 sechdrs, symindex, 1687 sechdrs, symindex,
1688 (char *)sechdrs[strindex].sh_addr); 1688 (char *)sechdrs[strindex].sh_addr);
1689 if (setupindex) 1689 if (setupindex)
1690 printk(KERN_WARNING "%s: Ignoring new-style " 1690 printk(KERN_WARNING "%s: Ignoring new-style "
1691 "parameters in presence of obsolete ones\n", 1691 "parameters in presence of obsolete ones\n",
1692 mod->name); 1692 mod->name);
1693 } else { 1693 } else {
1694 /* Size of section 0 is 0, so this works well if no params */ 1694 /* Size of section 0 is 0, so this works well if no params */
1695 err = parse_args(mod->name, mod->args, 1695 err = parse_args(mod->name, mod->args,
1696 (struct kernel_param *) 1696 (struct kernel_param *)
1697 sechdrs[setupindex].sh_addr, 1697 sechdrs[setupindex].sh_addr,
1698 sechdrs[setupindex].sh_size 1698 sechdrs[setupindex].sh_size
1699 / sizeof(struct kernel_param), 1699 / sizeof(struct kernel_param),
1700 NULL); 1700 NULL);
1701 } 1701 }
1702 if (err < 0) 1702 if (err < 0)
1703 goto arch_cleanup; 1703 goto arch_cleanup;
1704 1704
1705 err = mod_sysfs_setup(mod, 1705 err = mod_sysfs_setup(mod,
1706 (struct kernel_param *) 1706 (struct kernel_param *)
1707 sechdrs[setupindex].sh_addr, 1707 sechdrs[setupindex].sh_addr,
1708 sechdrs[setupindex].sh_size 1708 sechdrs[setupindex].sh_size
1709 / sizeof(struct kernel_param)); 1709 / sizeof(struct kernel_param));
1710 if (err < 0) 1710 if (err < 0)
1711 goto arch_cleanup; 1711 goto arch_cleanup;
1712 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 1712 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
1713 1713
1714 /* Get rid of temporary copy */ 1714 /* Get rid of temporary copy */
1715 vfree(hdr); 1715 vfree(hdr);
1716 1716
1717 /* Done! */ 1717 /* Done! */
1718 return mod; 1718 return mod;
1719 1719
1720 arch_cleanup: 1720 arch_cleanup:
1721 module_arch_cleanup(mod); 1721 module_arch_cleanup(mod);
1722 cleanup: 1722 cleanup:
1723 module_unload_free(mod); 1723 module_unload_free(mod);
1724 module_free(mod, mod->module_init); 1724 module_free(mod, mod->module_init);
1725 free_core: 1725 free_core:
1726 module_free(mod, mod->module_core); 1726 module_free(mod, mod->module_core);
1727 free_percpu: 1727 free_percpu:
1728 if (percpu) 1728 if (percpu)
1729 percpu_modfree(percpu); 1729 percpu_modfree(percpu);
1730 free_mod: 1730 free_mod:
1731 kfree(args); 1731 kfree(args);
1732 free_hdr: 1732 free_hdr:
1733 vfree(hdr); 1733 vfree(hdr);
1734 if (err < 0) return ERR_PTR(err); 1734 if (err < 0) return ERR_PTR(err);
1735 else return ptr; 1735 else return ptr;
1736 1736
1737 truncated: 1737 truncated:
1738 printk(KERN_ERR "Module len %lu truncated\n", len); 1738 printk(KERN_ERR "Module len %lu truncated\n", len);
1739 err = -ENOEXEC; 1739 err = -ENOEXEC;
1740 goto free_hdr; 1740 goto free_hdr;
1741 } 1741 }
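In outline, load_module() above walks through a fixed sequence; a comment-only summary of the steps as written (nothing new, just the order):

    /*
     * 1. copy the whole ELF image from userspace into a vmalloc'd buffer
     * 2. sanity-check the ELF header and locate the section headers
     * 3. find the special sections (__ksymtab*, __kcrctab*, __param,
     *    __ex_table, .modinfo, the per-cpu section, ...)
     * 4. check struct module version and vermagic, then copy in the args
     * 5. lay out core/init regions, allocate them, copy the SHF_ALLOC
     *    sections across, and apply relocations against the new addresses
     * 6. sort the exception table, copy the per-cpu data, register
     *    kallsyms/sysfs, parse parameters, and free the temporary image
     */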
1742 1742
1743 /* 1743 /*
1744 * link the module while the whole machine is stopped with interrupts off 1744 * link the module while the whole machine is stopped with interrupts off
1745 * - this defends against kallsyms not taking locks 1745 * - this defends against kallsyms not taking locks
1746 */ 1746 */
1747 static int __link_module(void *_mod) 1747 static int __link_module(void *_mod)
1748 { 1748 {
1749 struct module *mod = _mod; 1749 struct module *mod = _mod;
1750 list_add(&mod->list, &modules); 1750 list_add(&mod->list, &modules);
1751 return 0; 1751 return 0;
1752 } 1752 }
1753 1753
1754 /* This is where the real work happens */ 1754 /* This is where the real work happens */
1755 asmlinkage long 1755 asmlinkage long
1756 sys_init_module(void __user *umod, 1756 sys_init_module(void __user *umod,
1757 unsigned long len, 1757 unsigned long len,
1758 const char __user *uargs) 1758 const char __user *uargs)
1759 { 1759 {
1760 struct module *mod; 1760 struct module *mod;
1761 mm_segment_t old_fs = get_fs(); 1761 mm_segment_t old_fs = get_fs();
1762 int ret = 0; 1762 int ret = 0;
1763 1763
1764 /* Must have permission */ 1764 /* Must have permission */
1765 if (!capable(CAP_SYS_MODULE)) 1765 if (!capable(CAP_SYS_MODULE))
1766 return -EPERM; 1766 return -EPERM;
1767 1767
1768 /* Only one module load at a time, please */ 1768 /* Only one module load at a time, please */
1769 if (down_interruptible(&module_mutex) != 0) 1769 if (down_interruptible(&module_mutex) != 0)
1770 return -EINTR; 1770 return -EINTR;
1771 1771
1772 /* Do all the hard work */ 1772 /* Do all the hard work */
1773 mod = load_module(umod, len, uargs); 1773 mod = load_module(umod, len, uargs);
1774 if (IS_ERR(mod)) { 1774 if (IS_ERR(mod)) {
1775 up(&module_mutex); 1775 up(&module_mutex);
1776 return PTR_ERR(mod); 1776 return PTR_ERR(mod);
1777 } 1777 }
1778 1778
1779 /* flush the icache in correct context */ 1779 /* flush the icache in correct context */
1780 set_fs(KERNEL_DS); 1780 set_fs(KERNEL_DS);
1781 1781
1782 /* Flush the instruction cache, since we've played with text */ 1782 /* Flush the instruction cache, since we've played with text */
1783 if (mod->module_init) 1783 if (mod->module_init)
1784 flush_icache_range((unsigned long)mod->module_init, 1784 flush_icache_range((unsigned long)mod->module_init,
1785 (unsigned long)mod->module_init 1785 (unsigned long)mod->module_init
1786 + mod->init_size); 1786 + mod->init_size);
1787 flush_icache_range((unsigned long)mod->module_core, 1787 flush_icache_range((unsigned long)mod->module_core,
1788 (unsigned long)mod->module_core + mod->core_size); 1788 (unsigned long)mod->module_core + mod->core_size);
1789 1789
1790 set_fs(old_fs); 1790 set_fs(old_fs);
1791 1791
1792 /* Now sew it into the lists. They won't access us, since 1792 /* Now sew it into the lists. They won't access us, since
1793 strong_try_module_get() will fail. */ 1793 strong_try_module_get() will fail. */
1794 stop_machine_run(__link_module, mod, NR_CPUS); 1794 stop_machine_run(__link_module, mod, NR_CPUS);
1795 1795
1796 /* Drop lock so they can recurse */ 1796 /* Drop lock so they can recurse */
1797 up(&module_mutex); 1797 up(&module_mutex);
1798 1798
1799 down(&notify_mutex); 1799 down(&notify_mutex);
1800 notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); 1800 notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod);
1801 up(&notify_mutex); 1801 up(&notify_mutex);
1802 1802
1803 /* Start the module */ 1803 /* Start the module */
1804 if (mod->init != NULL) 1804 if (mod->init != NULL)
1805 ret = mod->init(); 1805 ret = mod->init();
1806 if (ret < 0) { 1806 if (ret < 0) {
1807 /* Init routine failed: abort. Try to protect us from 1807 /* Init routine failed: abort. Try to protect us from
1808 buggy refcounters. */ 1808 buggy refcounters. */
1809 mod->state = MODULE_STATE_GOING; 1809 mod->state = MODULE_STATE_GOING;
1810 synchronize_sched(); 1810 synchronize_sched();
1811 if (mod->unsafe) 1811 if (mod->unsafe)
1812 printk(KERN_ERR "%s: module is now stuck!\n", 1812 printk(KERN_ERR "%s: module is now stuck!\n",
1813 mod->name); 1813 mod->name);
1814 else { 1814 else {
1815 module_put(mod); 1815 module_put(mod);
1816 down(&module_mutex); 1816 down(&module_mutex);
1817 free_module(mod); 1817 free_module(mod);
1818 up(&module_mutex); 1818 up(&module_mutex);
1819 } 1819 }
1820 return ret; 1820 return ret;
1821 } 1821 }
1822 1822
1823 /* Now it's a first class citizen! */ 1823 /* Now it's a first class citizen! */
1824 down(&module_mutex); 1824 down(&module_mutex);
1825 mod->state = MODULE_STATE_LIVE; 1825 mod->state = MODULE_STATE_LIVE;
1826 /* Drop initial reference. */ 1826 /* Drop initial reference. */
1827 module_put(mod); 1827 module_put(mod);
1828 module_free(mod, mod->module_init); 1828 module_free(mod, mod->module_init);
1829 mod->module_init = NULL; 1829 mod->module_init = NULL;
1830 mod->init_size = 0; 1830 mod->init_size = 0;
1831 mod->init_text_size = 0; 1831 mod->init_text_size = 0;
1832 up(&module_mutex); 1832 up(&module_mutex);
1833 1833
1834 return 0; 1834 return 0;
1835 } 1835 }
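From userspace this entry point is reached via the init_module(2) system call, which is what insmod does after reading the .ko file. A minimal, error-handling-trimmed sketch of such a loader (file name and arguments are illustrative):

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/stat.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
            struct stat st;
            void *image;
            int fd;

            if (argc < 2)
                    return 1;
            fd = open(argv[1], O_RDONLY);          /* e.g. ./example.ko */
            if (fd < 0 || fstat(fd, &st) < 0)
                    return 1;
            image = malloc(st.st_size);
            if (!image || read(fd, image, st.st_size) != (ssize_t)st.st_size)
                    return 1;

            /* umod, len and uargs match sys_init_module() above. */
            if (syscall(SYS_init_module, image, (unsigned long)st.st_size,
                        argc > 2 ? argv[2] : "") != 0) {
                    perror("init_module");
                    return 1;
            }
            return 0;
    }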
1836 1836
1837 static inline int within(unsigned long addr, void *start, unsigned long size) 1837 static inline int within(unsigned long addr, void *start, unsigned long size)
1838 { 1838 {
1839 return ((void *)addr >= start && (void *)addr < start + size); 1839 return ((void *)addr >= start && (void *)addr < start + size);
1840 } 1840 }
1841 1841
1842 #ifdef CONFIG_KALLSYMS 1842 #ifdef CONFIG_KALLSYMS
1843 /* 1843 /*
1844 * This ignores the intensely annoying "mapping symbols" found 1844 * This ignores the intensely annoying "mapping symbols" found
1845 * in ARM ELF files: $a, $t and $d. 1845 * in ARM ELF files: $a, $t and $d.
1846 */ 1846 */
1847 static inline int is_arm_mapping_symbol(const char *str) 1847 static inline int is_arm_mapping_symbol(const char *str)
1848 { 1848 {
1849 return str[0] == '$' && strchr("atd", str[1]) 1849 return str[0] == '$' && strchr("atd", str[1])
1850 && (str[2] == '\0' || str[2] == '.'); 1850 && (str[2] == '\0' || str[2] == '.');
1851 } 1851 }
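For reference, the symbols filtered here are the ARM ELF "mapping symbols", which mark what kind of contents follow rather than naming anything:

    /*
     *   $a   start of a sequence of ARM instructions
     *   $t   start of a sequence of Thumb instructions
     *   $d   start of a sequence of data
     *
     * Toolchains may append a numbered suffix such as "$a.1", which is
     * why a '.' in the third position is accepted as well.
     */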
1852 1852
1853 static const char *get_ksymbol(struct module *mod, 1853 static const char *get_ksymbol(struct module *mod,
1854 unsigned long addr, 1854 unsigned long addr,
1855 unsigned long *size, 1855 unsigned long *size,
1856 unsigned long *offset) 1856 unsigned long *offset)
1857 { 1857 {
1858 unsigned int i, best = 0; 1858 unsigned int i, best = 0;
1859 unsigned long nextval; 1859 unsigned long nextval;
1860 1860
1861 /* At worst, next value is at end of module */ 1861 /* At worst, next value is at end of module */
1862 if (within(addr, mod->module_init, mod->init_size)) 1862 if (within(addr, mod->module_init, mod->init_size))
1863 nextval = (unsigned long)mod->module_init+mod->init_text_size; 1863 nextval = (unsigned long)mod->module_init+mod->init_text_size;
1864 else 1864 else
1865 nextval = (unsigned long)mod->module_core+mod->core_text_size; 1865 nextval = (unsigned long)mod->module_core+mod->core_text_size;
1866 1866
1867 /* Scan for closest preceding symbol, and next symbol. (ELF 1867 /* Scan for closest preceding symbol, and next symbol. (ELF
1868 starts real symbols at 1). */ 1868 starts real symbols at 1). */
1869 for (i = 1; i < mod->num_symtab; i++) { 1869 for (i = 1; i < mod->num_symtab; i++) {
1870 if (mod->symtab[i].st_shndx == SHN_UNDEF) 1870 if (mod->symtab[i].st_shndx == SHN_UNDEF)
1871 continue; 1871 continue;
1872 1872
1873 /* We ignore unnamed symbols: they're uninformative 1873 /* We ignore unnamed symbols: they're uninformative
1874 * and inserted at a whim. */ 1874 * and inserted at a whim. */
1875 if (mod->symtab[i].st_value <= addr 1875 if (mod->symtab[i].st_value <= addr
1876 && mod->symtab[i].st_value > mod->symtab[best].st_value 1876 && mod->symtab[i].st_value > mod->symtab[best].st_value
1877 && *(mod->strtab + mod->symtab[i].st_name) != '\0' 1877 && *(mod->strtab + mod->symtab[i].st_name) != '\0'
1878 && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) 1878 && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name))
1879 best = i; 1879 best = i;
1880 if (mod->symtab[i].st_value > addr 1880 if (mod->symtab[i].st_value > addr
1881 && mod->symtab[i].st_value < nextval 1881 && mod->symtab[i].st_value < nextval
1882 && *(mod->strtab + mod->symtab[i].st_name) != '\0' 1882 && *(mod->strtab + mod->symtab[i].st_name) != '\0'
1883 && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) 1883 && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name))
1884 nextval = mod->symtab[i].st_value; 1884 nextval = mod->symtab[i].st_value;
1885 } 1885 }
1886 1886
1887 if (!best) 1887 if (!best)
1888 return NULL; 1888 return NULL;
1889 1889
1890 *size = nextval - mod->symtab[best].st_value; 1890 *size = nextval - mod->symtab[best].st_value;
1891 *offset = addr - mod->symtab[best].st_value; 1891 *offset = addr - mod->symtab[best].st_value;
1892 return mod->strtab + mod->symtab[best].st_name; 1892 return mod->strtab + mod->symtab[best].st_name;
1893 } 1893 }
1894 1894
1895 /* For kallsyms to ask for address resolution. NULL means not found. 1895 /* For kallsyms to ask for address resolution. NULL means not found.
1896 We don't lock, as this is used for oops resolution and races are a 1896 We don't lock, as this is used for oops resolution and races are a
1897 lesser concern. */ 1897 lesser concern. */
1898 const char *module_address_lookup(unsigned long addr, 1898 const char *module_address_lookup(unsigned long addr,
1899 unsigned long *size, 1899 unsigned long *size,
1900 unsigned long *offset, 1900 unsigned long *offset,
1901 char **modname) 1901 char **modname)
1902 { 1902 {
1903 struct module *mod; 1903 struct module *mod;
1904 1904
1905 list_for_each_entry(mod, &modules, list) { 1905 list_for_each_entry(mod, &modules, list) {
1906 if (within(addr, mod->module_init, mod->init_size) 1906 if (within(addr, mod->module_init, mod->init_size)
1907 || within(addr, mod->module_core, mod->core_size)) { 1907 || within(addr, mod->module_core, mod->core_size)) {
1908 *modname = mod->name; 1908 *modname = mod->name;
1909 return get_ksymbol(mod, addr, size, offset); 1909 return get_ksymbol(mod, addr, size, offset);
1910 } 1910 }
1911 } 1911 }
1912 return NULL; 1912 return NULL;
1913 } 1913 }
1914 1914
1915 struct module *module_get_kallsym(unsigned int symnum, 1915 struct module *module_get_kallsym(unsigned int symnum,
1916 unsigned long *value, 1916 unsigned long *value,
1917 char *type, 1917 char *type,
1918 char namebuf[128]) 1918 char namebuf[128])
1919 { 1919 {
1920 struct module *mod; 1920 struct module *mod;
1921 1921
1922 down(&module_mutex); 1922 down(&module_mutex);
1923 list_for_each_entry(mod, &modules, list) { 1923 list_for_each_entry(mod, &modules, list) {
1924 if (symnum < mod->num_symtab) { 1924 if (symnum < mod->num_symtab) {
1925 *value = mod->symtab[symnum].st_value; 1925 *value = mod->symtab[symnum].st_value;
1926 *type = mod->symtab[symnum].st_info; 1926 *type = mod->symtab[symnum].st_info;
1927 strncpy(namebuf, 1927 strncpy(namebuf,
1928 mod->strtab + mod->symtab[symnum].st_name, 1928 mod->strtab + mod->symtab[symnum].st_name,
1929 127); 1929 127);
1930 up(&module_mutex); 1930 up(&module_mutex);
1931 return mod; 1931 return mod;
1932 } 1932 }
1933 symnum -= mod->num_symtab; 1933 symnum -= mod->num_symtab;
1934 } 1934 }
1935 up(&module_mutex); 1935 up(&module_mutex);
1936 return NULL; 1936 return NULL;
1937 } 1937 }
1938 1938
1939 static unsigned long mod_find_symname(struct module *mod, const char *name) 1939 static unsigned long mod_find_symname(struct module *mod, const char *name)
1940 { 1940 {
1941 unsigned int i; 1941 unsigned int i;
1942 1942
1943 for (i = 0; i < mod->num_symtab; i++) 1943 for (i = 0; i < mod->num_symtab; i++)
1944 if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0) 1944 if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0)
1945 return mod->symtab[i].st_value; 1945 return mod->symtab[i].st_value;
1946 return 0; 1946 return 0;
1947 } 1947 }
1948 1948
1949 /* Look for this name: can be of form module:name. */ 1949 /* Look for this name: can be of form module:name. */
1950 unsigned long module_kallsyms_lookup_name(const char *name) 1950 unsigned long module_kallsyms_lookup_name(const char *name)
1951 { 1951 {
1952 struct module *mod; 1952 struct module *mod;
1953 char *colon; 1953 char *colon;
1954 unsigned long ret = 0; 1954 unsigned long ret = 0;
1955 1955
1956 /* Don't lock: we're in enough trouble already. */ 1956 /* Don't lock: we're in enough trouble already. */
1957 if ((colon = strchr(name, ':')) != NULL) { 1957 if ((colon = strchr(name, ':')) != NULL) {
1958 *colon = '\0'; 1958 *colon = '\0';
1959 if ((mod = find_module(name)) != NULL) 1959 if ((mod = find_module(name)) != NULL)
1960 ret = mod_find_symname(mod, colon+1); 1960 ret = mod_find_symname(mod, colon+1);
1961 *colon = ':'; 1961 *colon = ':';
1962 } else { 1962 } else {
1963 list_for_each_entry(mod, &modules, list) 1963 list_for_each_entry(mod, &modules, list)
1964 if ((ret = mod_find_symname(mod, name)) != 0) 1964 if ((ret = mod_find_symname(mod, name)) != 0)
1965 break; 1965 break;
1966 } 1966 }
1967 return ret; 1967 return ret;
1968 } 1968 }
1969 #endif /* CONFIG_KALLSYMS */ 1969 #endif /* CONFIG_KALLSYMS */
1970 1970
1971 /* Called by the /proc file system to return a list of modules. */ 1971 /* Called by the /proc file system to return a list of modules. */
1972 static void *m_start(struct seq_file *m, loff_t *pos) 1972 static void *m_start(struct seq_file *m, loff_t *pos)
1973 { 1973 {
1974 struct list_head *i; 1974 struct list_head *i;
1975 loff_t n = 0; 1975 loff_t n = 0;
1976 1976
1977 down(&module_mutex); 1977 down(&module_mutex);
1978 list_for_each(i, &modules) { 1978 list_for_each(i, &modules) {
1979 if (n++ == *pos) 1979 if (n++ == *pos)
1980 break; 1980 break;
1981 } 1981 }
1982 if (i == &modules) 1982 if (i == &modules)
1983 return NULL; 1983 return NULL;
1984 return i; 1984 return i;
1985 } 1985 }
1986 1986
1987 static void *m_next(struct seq_file *m, void *p, loff_t *pos) 1987 static void *m_next(struct seq_file *m, void *p, loff_t *pos)
1988 { 1988 {
1989 struct list_head *i = p; 1989 struct list_head *i = p;
1990 (*pos)++; 1990 (*pos)++;
1991 if (i->next == &modules) 1991 if (i->next == &modules)
1992 return NULL; 1992 return NULL;
1993 return i->next; 1993 return i->next;
1994 } 1994 }
1995 1995
1996 static void m_stop(struct seq_file *m, void *p) 1996 static void m_stop(struct seq_file *m, void *p)
1997 { 1997 {
1998 up(&module_mutex); 1998 up(&module_mutex);
1999 } 1999 }
2000 2000
2001 static int m_show(struct seq_file *m, void *p) 2001 static int m_show(struct seq_file *m, void *p)
2002 { 2002 {
2003 struct module *mod = list_entry(p, struct module, list); 2003 struct module *mod = list_entry(p, struct module, list);
2004 seq_printf(m, "%s %lu", 2004 seq_printf(m, "%s %lu",
2005 mod->name, mod->init_size + mod->core_size); 2005 mod->name, mod->init_size + mod->core_size);
2006 print_unload_info(m, mod); 2006 print_unload_info(m, mod);
2007 2007
2008 /* Informative for users. */ 2008 /* Informative for users. */
2009 seq_printf(m, " %s", 2009 seq_printf(m, " %s",
2010 mod->state == MODULE_STATE_GOING ? "Unloading": 2010 mod->state == MODULE_STATE_GOING ? "Unloading":
2011 mod->state == MODULE_STATE_COMING ? "Loading": 2011 mod->state == MODULE_STATE_COMING ? "Loading":
2012 "Live"); 2012 "Live");
2013 /* Used by oprofile and other similar tools. */ 2013 /* Used by oprofile and other similar tools. */
2014 seq_printf(m, " 0x%p", mod->module_core); 2014 seq_printf(m, " 0x%p", mod->module_core);
2015 2015
2016 seq_printf(m, "\n"); 2016 seq_printf(m, "\n");
2017 return 0; 2017 return 0;
2018 } 2018 }
2019 2019
2020 /* Format: modulename size refcount deps address 2020 /* Format: modulename size refcount deps address
2021 2021
2022 Where refcount is a number or -, and deps is a comma-separated list 2022 Where refcount is a number or -, and deps is a comma-separated list
2023 of depends or -. 2023 of depends or -.
2024 */ 2024 */
2025 struct seq_operations modules_op = { 2025 struct seq_operations modules_op = {
2026 .start = m_start, 2026 .start = m_start,
2027 .next = m_next, 2027 .next = m_next,
2028 .stop = m_stop, 2028 .stop = m_stop,
2029 .show = m_show 2029 .show = m_show
2030 }; 2030 };
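Putting the pieces of m_show() and print_unload_info() together, a /proc/modules line looks roughly like the following (module names, sizes and addresses here are made up):

    example_mod 12288 2 first_user,second_user, Live 0xf8a4b000
    standalone_mod 4096 0 - Live 0xf8a51000

The middle columns come from print_unload_info(); without CONFIG_MODULE_UNLOAD the refcount and deps columns are just printed as dashes.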
2031 2031
2032 /* Given an address, look for it in the module exception tables. */ 2032 /* Given an address, look for it in the module exception tables. */
2033 const struct exception_table_entry *search_module_extables(unsigned long addr) 2033 const struct exception_table_entry *search_module_extables(unsigned long addr)
2034 { 2034 {
2035 unsigned long flags; 2035 unsigned long flags;
2036 const struct exception_table_entry *e = NULL; 2036 const struct exception_table_entry *e = NULL;
2037 struct module *mod; 2037 struct module *mod;
2038 2038
2039 spin_lock_irqsave(&modlist_lock, flags); 2039 spin_lock_irqsave(&modlist_lock, flags);
2040 list_for_each_entry(mod, &modules, list) { 2040 list_for_each_entry(mod, &modules, list) {
2041 if (mod->num_exentries == 0) 2041 if (mod->num_exentries == 0)
2042 continue; 2042 continue;
2043 2043
2044 e = search_extable(mod->extable, 2044 e = search_extable(mod->extable,
2045 mod->extable + mod->num_exentries - 1, 2045 mod->extable + mod->num_exentries - 1,
2046 addr); 2046 addr);
2047 if (e) 2047 if (e)
2048 break; 2048 break;
2049 } 2049 }
2050 spin_unlock_irqrestore(&modlist_lock, flags); 2050 spin_unlock_irqrestore(&modlist_lock, flags);
2051 2051
2052 /* Now, if we found one, we are running inside it now, hence 2052 /* Now, if we found one, we are running inside it now, hence
2053 we cannot unload the module, hence no refcnt needed. */ 2053 we cannot unload the module, hence no refcnt needed. */
2054 return e; 2054 return e;
2055 } 2055 }
2056 2056
2057 /* Is this a valid kernel address? We don't grab the lock: we are oopsing. */ 2057 /* Is this a valid kernel address? We don't grab the lock: we are oopsing. */
2058 struct module *__module_text_address(unsigned long addr) 2058 struct module *__module_text_address(unsigned long addr)
2059 { 2059 {
2060 struct module *mod; 2060 struct module *mod;
2061 2061
2062 list_for_each_entry(mod, &modules, list) 2062 list_for_each_entry(mod, &modules, list)
2063 if (within(addr, mod->module_init, mod->init_text_size) 2063 if (within(addr, mod->module_init, mod->init_text_size)
2064 || within(addr, mod->module_core, mod->core_text_size)) 2064 || within(addr, mod->module_core, mod->core_text_size))
2065 return mod; 2065 return mod;
2066 return NULL; 2066 return NULL;
2067 } 2067 }
2068 2068
2069 struct module *module_text_address(unsigned long addr) 2069 struct module *module_text_address(unsigned long addr)
2070 { 2070 {
2071 struct module *mod; 2071 struct module *mod;
2072 unsigned long flags; 2072 unsigned long flags;
2073 2073
2074 spin_lock_irqsave(&modlist_lock, flags); 2074 spin_lock_irqsave(&modlist_lock, flags);
2075 mod = __module_text_address(addr); 2075 mod = __module_text_address(addr);
2076 spin_unlock_irqrestore(&modlist_lock, flags); 2076 spin_unlock_irqrestore(&modlist_lock, flags);
2077 2077
2078 return mod; 2078 return mod;
2079 } 2079 }
2080 2080
2081 /* Don't grab lock, we're oopsing. */ 2081 /* Don't grab lock, we're oopsing. */
2082 void print_modules(void) 2082 void print_modules(void)
2083 { 2083 {
2084 struct module *mod; 2084 struct module *mod;
2085 2085
2086 printk("Modules linked in:"); 2086 printk("Modules linked in:");
2087 list_for_each_entry(mod, &modules, list) 2087 list_for_each_entry(mod, &modules, list)
2088 printk(" %s", mod->name); 2088 printk(" %s", mod->name);
2089 printk("\n"); 2089 printk("\n");
2090 } 2090 }
2091 2091
2092 void module_add_driver(struct module *mod, struct device_driver *drv) 2092 void module_add_driver(struct module *mod, struct device_driver *drv)
2093 { 2093 {
2094 if (!mod || !drv) 2094 if (!mod || !drv)
2095 return; 2095 return;
2096 2096
2097 /* Don't check return code; this call is idempotent */ 2097 /* Don't check return code; this call is idempotent */
2098 sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module"); 2098 sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module");
2099 } 2099 }
2100 EXPORT_SYMBOL(module_add_driver); 2100 EXPORT_SYMBOL(module_add_driver);
2101 2101
2102 void module_remove_driver(struct device_driver *drv) 2102 void module_remove_driver(struct device_driver *drv)
2103 { 2103 {
2104 if (!drv) 2104 if (!drv)
2105 return; 2105 return;
2106 sysfs_remove_link(&drv->kobj, "module"); 2106 sysfs_remove_link(&drv->kobj, "module");
2107 } 2107 }
2108 EXPORT_SYMBOL(module_remove_driver); 2108 EXPORT_SYMBOL(module_remove_driver);
2109 2109
2110 #ifdef CONFIG_MODVERSIONS 2110 #ifdef CONFIG_MODVERSIONS
2111 /* Generate the signature for struct module here, too, for modversions. */ 2111 /* Generate the signature for struct module here, too, for modversions. */
2112 void struct_module(struct module *mod) { return; } 2112 void struct_module(struct module *mod) { return; }
2113 EXPORT_SYMBOL(struct_module); 2113 EXPORT_SYMBOL(struct_module);
2114 #endif 2114 #endif
2115 2115
1 /* 1 /*
2 * drivers/power/smp.c - Functions for stopping other CPUs. 2 * drivers/power/smp.c - Functions for stopping other CPUs.
3 * 3 *
4 * Copyright 2004 Pavel Machek <pavel@suse.cz> 4 * Copyright 2004 Pavel Machek <pavel@suse.cz>
5 * Copyright (C) 2002-2003 Nigel Cunningham <ncunningham@clear.net.nz> 5 * Copyright (C) 2002-2003 Nigel Cunningham <ncunningham@clear.net.nz>
6 * 6 *
7 * This file is released under the GPLv2. 7 * This file is released under the GPLv2.
8 */ 8 */
9 9
10 #undef DEBUG 10 #undef DEBUG
11 11
12 #include <linux/smp_lock.h> 12 #include <linux/smp_lock.h>
13 #include <linux/interrupt.h> 13 #include <linux/interrupt.h>
14 #include <linux/suspend.h> 14 #include <linux/suspend.h>
15 #include <linux/module.h> 15 #include <linux/module.h>
16 #include <asm/atomic.h> 16 #include <asm/atomic.h>
17 #include <asm/tlbflush.h> 17 #include <asm/tlbflush.h>
18 18
19 static atomic_t cpu_counter, freeze; 19 static atomic_t cpu_counter, freeze;
20 20
21 21
22 static void smp_pause(void * data) 22 static void smp_pause(void * data)
23 { 23 {
24 struct saved_context ctxt; 24 struct saved_context ctxt;
25 __save_processor_state(&ctxt); 25 __save_processor_state(&ctxt);
26 printk("Sleeping in:\n"); 26 printk("Sleeping in:\n");
27 dump_stack(); 27 dump_stack();
28 atomic_inc(&cpu_counter); 28 atomic_inc(&cpu_counter);
29 while (atomic_read(&freeze)) { 29 while (atomic_read(&freeze)) {
30 /* FIXME: restore takes place at a random point inside this. 30 /* FIXME: restore takes place at a random point inside this.
31 This should probably be written in assembly, and 31 This should probably be written in assembly, and
32 preserve general-purpose registers, too 32 preserve general-purpose registers, too
33 33
34 What about the stack? We may need to move to a new stack here. 34 What about the stack? We may need to move to a new stack here.
35 35
36 This had better be run with interrupts disabled. 36 This had better be run with interrupts disabled.
37 */ 37 */
38 cpu_relax(); 38 cpu_relax();
39 barrier(); 39 barrier();
40 } 40 }
41 atomic_dec(&cpu_counter); 41 atomic_dec(&cpu_counter);
42 __restore_processor_state(&ctxt); 42 __restore_processor_state(&ctxt);
43 } 43 }
44 44
45 static cpumask_t oldmask; 45 static cpumask_t oldmask;
46 46
47 void disable_nonboot_cpus(void) 47 void disable_nonboot_cpus(void)
48 { 48 {
49 oldmask = current->cpus_allowed; 49 oldmask = current->cpus_allowed;
50 set_cpus_allowed(current, cpumask_of_cpu(0)); 50 set_cpus_allowed(current, cpumask_of_cpu(0));
51 printk("Freezing CPUs (at %d)", _smp_processor_id()); 51 printk("Freezing CPUs (at %d)", raw_smp_processor_id());
52 current->state = TASK_INTERRUPTIBLE; 52 current->state = TASK_INTERRUPTIBLE;
53 schedule_timeout(HZ); 53 schedule_timeout(HZ);
54 printk("..."); 54 printk("...");
55 BUG_ON(_smp_processor_id() != 0); 55 BUG_ON(raw_smp_processor_id() != 0);
56 56
57 /* FIXME: for this to work, all the CPUs must be running 57 /* FIXME: for this to work, all the CPUs must be running
58 * "idle" thread (or we deadlock). Is that guaranteed? */ 58 * "idle" thread (or we deadlock). Is that guaranteed? */
59 59
60 atomic_set(&cpu_counter, 0); 60 atomic_set(&cpu_counter, 0);
61 atomic_set(&freeze, 1); 61 atomic_set(&freeze, 1);
62 smp_call_function(smp_pause, NULL, 0, 0); 62 smp_call_function(smp_pause, NULL, 0, 0);
63 while (atomic_read(&cpu_counter) < (num_online_cpus() - 1)) { 63 while (atomic_read(&cpu_counter) < (num_online_cpus() - 1)) {
64 cpu_relax(); 64 cpu_relax();
65 barrier(); 65 barrier();
66 } 66 }
67 printk("ok\n"); 67 printk("ok\n");
68 } 68 }
69 69
70 void enable_nonboot_cpus(void) 70 void enable_nonboot_cpus(void)
71 { 71 {
72 printk("Restarting CPUs"); 72 printk("Restarting CPUs");
73 atomic_set(&freeze, 0); 73 atomic_set(&freeze, 0);
74 while (atomic_read(&cpu_counter)) { 74 while (atomic_read(&cpu_counter)) {
75 cpu_relax(); 75 cpu_relax();
76 barrier(); 76 barrier();
77 } 77 }
78 printk("..."); 78 printk("...");
79 set_cpus_allowed(current, oldmask); 79 set_cpus_allowed(current, oldmask);
80 schedule(); 80 schedule();
81 printk("ok\n"); 81 printk("ok\n");
82 82
83 } 83 }
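The two conversions in this file show the intent of the new API: raw_smp_processor_id() is the unchecked variant for callers that can tolerate a momentarily stale CPU number (such as the informational printk()s above), while smp_processor_id() remains the checked one. A minimal sketch of the distinction, not taken from this file:

    #include <linux/kernel.h>
    #include <linux/preempt.h>
    #include <linux/smp.h>

    static void cpu_id_demo(void)
    {
            int cpu;

            /*
             * Checked variant: with CONFIG_DEBUG_PREEMPT this warns if the
             * caller could migrate between CPUs, so pin preemption first.
             */
            preempt_disable();
            cpu = smp_processor_id();
            /* ... strictly per-CPU work using "cpu" ... */
            preempt_enable();

            /*
             * Unchecked variant: fine where an occasionally stale ID is
             * harmless, e.g. debugging or statistics output.
             */
            printk(KERN_DEBUG "last ran on CPU %d\n", raw_smp_processor_id());
            (void)cpu;
    }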
84 84
85 85
86 86
1 /* 1 /*
2 * kernel/sched.c 2 * kernel/sched.c
3 * 3 *
4 * Kernel scheduler and related syscalls 4 * Kernel scheduler and related syscalls
5 * 5 *
6 * Copyright (C) 1991-2002 Linus Torvalds 6 * Copyright (C) 1991-2002 Linus Torvalds
7 * 7 *
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and 8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe 9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff 10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli 11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: 12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with 13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices 14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions 15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas. 17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin 18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 */ 19 */
20 20
21 #include <linux/mm.h> 21 #include <linux/mm.h>
22 #include <linux/module.h> 22 #include <linux/module.h>
23 #include <linux/nmi.h> 23 #include <linux/nmi.h>
24 #include <linux/init.h> 24 #include <linux/init.h>
25 #include <asm/uaccess.h> 25 #include <asm/uaccess.h>
26 #include <linux/highmem.h> 26 #include <linux/highmem.h>
27 #include <linux/smp_lock.h> 27 #include <linux/smp_lock.h>
28 #include <asm/mmu_context.h> 28 #include <asm/mmu_context.h>
29 #include <linux/interrupt.h> 29 #include <linux/interrupt.h>
30 #include <linux/completion.h> 30 #include <linux/completion.h>
31 #include <linux/kernel_stat.h> 31 #include <linux/kernel_stat.h>
32 #include <linux/security.h> 32 #include <linux/security.h>
33 #include <linux/notifier.h> 33 #include <linux/notifier.h>
34 #include <linux/profile.h> 34 #include <linux/profile.h>
35 #include <linux/suspend.h> 35 #include <linux/suspend.h>
36 #include <linux/blkdev.h> 36 #include <linux/blkdev.h>
37 #include <linux/delay.h> 37 #include <linux/delay.h>
38 #include <linux/smp.h> 38 #include <linux/smp.h>
39 #include <linux/threads.h> 39 #include <linux/threads.h>
40 #include <linux/timer.h> 40 #include <linux/timer.h>
41 #include <linux/rcupdate.h> 41 #include <linux/rcupdate.h>
42 #include <linux/cpu.h> 42 #include <linux/cpu.h>
43 #include <linux/cpuset.h> 43 #include <linux/cpuset.h>
44 #include <linux/percpu.h> 44 #include <linux/percpu.h>
45 #include <linux/kthread.h> 45 #include <linux/kthread.h>
46 #include <linux/seq_file.h> 46 #include <linux/seq_file.h>
47 #include <linux/syscalls.h> 47 #include <linux/syscalls.h>
48 #include <linux/times.h> 48 #include <linux/times.h>
49 #include <linux/acct.h> 49 #include <linux/acct.h>
50 #include <asm/tlb.h> 50 #include <asm/tlb.h>
51 51
52 #include <asm/unistd.h> 52 #include <asm/unistd.h>
53 53
54 /* 54 /*
55 * Convert user-nice values [ -20 ... 0 ... 19 ] 55 * Convert user-nice values [ -20 ... 0 ... 19 ]
56 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 56 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
57 * and back. 57 * and back.
58 */ 58 */
59 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) 59 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
60 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) 60 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
61 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) 61 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
62 62
63 /* 63 /*
64 * 'User priority' is the nice value converted to something we 64 * 'User priority' is the nice value converted to something we
65 * can work with better when scaling various scheduler parameters; 65 * can work with better when scaling various scheduler parameters;
66 * it's a [ 0 ... 39 ] range. 66 * it's a [ 0 ... 39 ] range.
67 */ 67 */
68 #define USER_PRIO(p) ((p)-MAX_RT_PRIO) 68 #define USER_PRIO(p) ((p)-MAX_RT_PRIO)
69 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) 69 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
70 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) 70 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
71 71
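
For concreteness, here is a small user-space sketch of what these mappings work out to, assuming the usual MAX_RT_PRIO of 100 and MAX_PRIO of 140 for this kernel; the constants and macros are re-declared locally for illustration only, they are not taken from the kernel headers:

#include <assert.h>

/* Assumed constants; in the kernel they come from the scheduler headers. */
#define MAX_RT_PRIO	100
#define MAX_PRIO	(MAX_RT_PRIO + 40)

#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define USER_PRIO(p)		((p) - MAX_RT_PRIO)
#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))

int main(void)
{
	assert(NICE_TO_PRIO(-20) == 100);		/* strongest nice level */
	assert(NICE_TO_PRIO(0)   == 120);		/* default */
	assert(NICE_TO_PRIO(19)  == 139);		/* weakest nice level */
	assert(PRIO_TO_NICE(NICE_TO_PRIO(7)) == 7);	/* the two round-trip */
	assert(USER_PRIO(NICE_TO_PRIO(0)) == 20);	/* middle of 0..39 */
	assert(MAX_USER_PRIO == 40);
	return 0;
}
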
72 /* 72 /*
73 * Some helpers for converting nanosecond timing to jiffy resolution 73 * Some helpers for converting nanosecond timing to jiffy resolution
74 */ 74 */
75 #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) 75 #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
76 #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) 76 #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
77 77
78 /* 78 /*
79 * These are the 'tuning knobs' of the scheduler: 79 * These are the 'tuning knobs' of the scheduler:
80 * 80 *
81 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), 81 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
82 * default timeslice is 100 msecs, maximum timeslice is 800 msecs. 82 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
83 * Timeslices get refilled after they expire. 83 * Timeslices get refilled after they expire.
84 */ 84 */
85 #define MIN_TIMESLICE max(5 * HZ / 1000, 1) 85 #define MIN_TIMESLICE max(5 * HZ / 1000, 1)
86 #define DEF_TIMESLICE (100 * HZ / 1000) 86 #define DEF_TIMESLICE (100 * HZ / 1000)
87 #define ON_RUNQUEUE_WEIGHT 30 87 #define ON_RUNQUEUE_WEIGHT 30
88 #define CHILD_PENALTY 95 88 #define CHILD_PENALTY 95
89 #define PARENT_PENALTY 100 89 #define PARENT_PENALTY 100
90 #define EXIT_WEIGHT 3 90 #define EXIT_WEIGHT 3
91 #define PRIO_BONUS_RATIO 25 91 #define PRIO_BONUS_RATIO 25
92 #define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) 92 #define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
93 #define INTERACTIVE_DELTA 2 93 #define INTERACTIVE_DELTA 2
94 #define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) 94 #define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
95 #define STARVATION_LIMIT (MAX_SLEEP_AVG) 95 #define STARVATION_LIMIT (MAX_SLEEP_AVG)
96 #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) 96 #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
97 97
98 /* 98 /*
99 * If a task is 'interactive' then we reinsert it in the active 99 * If a task is 'interactive' then we reinsert it in the active
100 * array after it has expired its current timeslice. (it will not 100 * array after it has expired its current timeslice. (it will not
101 * continue to run immediately; it will still round-robin with 101 * continue to run immediately; it will still round-robin with
102 * other interactive tasks.) 102 * other interactive tasks.)
103 * 103 *
104 * This part scales the interactivity limit depending on niceness. 104 * This part scales the interactivity limit depending on niceness.
105 * 105 *
106 * We scale it linearly, offset by the INTERACTIVE_DELTA delta. 106 * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
107 * Here are a few examples of different nice levels: 107 * Here are a few examples of different nice levels:
108 * 108 *
109 * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] 109 * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
110 * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] 110 * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
111 * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] 111 * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
112 * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] 112 * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
113 * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] 113 * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
114 * 114 *
115 * (the X axis represents the possible -5 ... 0 ... +5 dynamic 115 * (the X axis represents the possible -5 ... 0 ... +5 dynamic
116 * priority range a task can explore; a value of '1' means the 116 * priority range a task can explore; a value of '1' means the
117 * task is rated interactive.) 117 * task is rated interactive.)
118 * 118 *
119 * I.e. nice +19 tasks can never get 'interactive' enough to be 119 * I.e. nice +19 tasks can never get 'interactive' enough to be
120 * reinserted into the active array, and only nice -20 tasks that are 120 * reinserted into the active array, and only nice -20 tasks that are
121 * heavy CPU hogs will be expired. Default nice 0 tasks are somewhere 121 * heavy CPU hogs will be expired. Default nice 0 tasks are somewhere
122 * in between; it takes some effort for them to get interactive, but 122 * in between; it takes some effort for them to get interactive, but
123 * it's not too hard. 123 * it's not too hard.
124 */ 124 */
125 125
126 #define CURRENT_BONUS(p) \ 126 #define CURRENT_BONUS(p) \
127 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ 127 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
128 MAX_SLEEP_AVG) 128 MAX_SLEEP_AVG)
129 129
130 #define GRANULARITY (10 * HZ / 1000 ? : 1) 130 #define GRANULARITY (10 * HZ / 1000 ? : 1)
131 131
132 #ifdef CONFIG_SMP 132 #ifdef CONFIG_SMP
133 #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ 133 #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
134 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ 134 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
135 num_online_cpus()) 135 num_online_cpus())
136 #else 136 #else
137 #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ 137 #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
138 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) 138 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
139 #endif 139 #endif
140 140
141 #define SCALE(v1,v1_max,v2_max) \ 141 #define SCALE(v1,v1_max,v2_max) \
142 (v1) * (v2_max) / (v1_max) 142 (v1) * (v2_max) / (v1_max)
143 143
144 #define DELTA(p) \ 144 #define DELTA(p) \
145 (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) 145 (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA)
146 146
147 #define TASK_INTERACTIVE(p) \ 147 #define TASK_INTERACTIVE(p) \
148 ((p)->prio <= (p)->static_prio - DELTA(p)) 148 ((p)->prio <= (p)->static_prio - DELTA(p))
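
As a worked check of the nice-0 row in the table above, here is a standalone sketch of the same arithmetic; the constants (MAX_RT_PRIO 100, hence MAX_BONUS 10) are assumptions re-declared locally for illustration:

#include <assert.h>

/* Assumed constants, re-declared locally for illustration only. */
#define MAX_RT_PRIO		100
#define MAX_USER_PRIO		40
#define PRIO_BONUS_RATIO	25
#define MAX_BONUS		(MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
#define INTERACTIVE_DELTA	2
#define SCALE(v1, v1_max, v2_max)	((v1) * (v2_max) / (v1_max))

int main(void)
{
	int static_prio = 120;	/* nice 0 */
	int nice = 0;
	int delta = SCALE(nice, 40, MAX_BONUS) + INTERACTIVE_DELTA;
	int bonus, interactive = 0;

	/* Walk the -5 ... +5 bonus range and count interactive ratings. */
	for (bonus = -MAX_BONUS / 2; bonus <= MAX_BONUS / 2; bonus++) {
		int prio = static_prio - bonus;

		if (prio < MAX_RT_PRIO)
			prio = MAX_RT_PRIO;
		if (prio <= static_prio - delta)
			interactive++;
	}
	assert(delta == 2);
	/* Only the four most-boosted levels qualify, matching the
	 * [1,1,1,1,0,0,0,0,0,0,0] row for TASK_INTERACTIVE( 0) above. */
	assert(interactive == 4);
	return 0;
}
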
149 149
150 #define INTERACTIVE_SLEEP(p) \ 150 #define INTERACTIVE_SLEEP(p) \
151 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ 151 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
152 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) 152 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
153 153
154 #define TASK_PREEMPTS_CURR(p, rq) \ 154 #define TASK_PREEMPTS_CURR(p, rq) \
155 ((p)->prio < (rq)->curr->prio) 155 ((p)->prio < (rq)->curr->prio)
156 156
157 /* 157 /*
158 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] 158 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
159 * to time slice values: [800ms ... 100ms ... 5ms] 159 * to time slice values: [800ms ... 100ms ... 5ms]
160 * 160 *
161 * The higher a thread's priority, the bigger timeslices 161 * The higher a thread's priority, the bigger timeslices
162 * it gets during one round of execution. But even the lowest 162 * it gets during one round of execution. But even the lowest
163 * priority thread gets MIN_TIMESLICE worth of execution time. 163 * priority thread gets MIN_TIMESLICE worth of execution time.
164 */ 164 */
165 165
166 #define SCALE_PRIO(x, prio) \ 166 #define SCALE_PRIO(x, prio) \
167 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) 167 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
168 168
169 static inline unsigned int task_timeslice(task_t *p) 169 static inline unsigned int task_timeslice(task_t *p)
170 { 170 {
171 if (p->static_prio < NICE_TO_PRIO(0)) 171 if (p->static_prio < NICE_TO_PRIO(0))
172 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); 172 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
173 else 173 else
174 return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); 174 return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
175 } 175 }
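
To make the [800ms ... 100ms ... 5ms] endpoints concrete, a standalone sketch of the same scaling follows, under the assumption HZ == 1000 so that one jiffy is one millisecond; the constants are re-declared locally, max() is replaced by a ternary, and sketch_timeslice() is a hypothetical stand-in that takes static_prio directly:

#include <assert.h>

/* Assumed configuration: HZ == 1000. Constants re-declared locally. */
#define HZ		1000
#define MAX_RT_PRIO	100
#define MAX_PRIO	(MAX_RT_PRIO + 40)
#define MAX_USER_PRIO	40
#define MIN_TIMESLICE	(5 * HZ / 1000 > 1 ? 5 * HZ / 1000 : 1)
#define DEF_TIMESLICE	(100 * HZ / 1000)
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)

#define SCALE_PRIO(x, prio) \
	((x) * (MAX_PRIO - (prio)) / (MAX_USER_PRIO / 2) > MIN_TIMESLICE ? \
	 (x) * (MAX_PRIO - (prio)) / (MAX_USER_PRIO / 2) : MIN_TIMESLICE)

static unsigned int sketch_timeslice(int static_prio)
{
	if (static_prio < NICE_TO_PRIO(0))
		return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
	return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}

int main(void)
{
	assert(sketch_timeslice(NICE_TO_PRIO(-20)) == 800);	/* 800 jiffies = 800ms */
	assert(sketch_timeslice(NICE_TO_PRIO(0))   == 100);	/* 100ms */
	assert(sketch_timeslice(NICE_TO_PRIO(19))  == 5);	/* 5ms */
	return 0;
}
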
176 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ 176 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
177 < (long long) (sd)->cache_hot_time) 177 < (long long) (sd)->cache_hot_time)
178 178
179 /* 179 /*
180 * These are the runqueue data structures: 180 * These are the runqueue data structures:
181 */ 181 */
182 182
183 #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) 183 #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
184 184
185 typedef struct runqueue runqueue_t; 185 typedef struct runqueue runqueue_t;
186 186
187 struct prio_array { 187 struct prio_array {
188 unsigned int nr_active; 188 unsigned int nr_active;
189 unsigned long bitmap[BITMAP_SIZE]; 189 unsigned long bitmap[BITMAP_SIZE];
190 struct list_head queue[MAX_PRIO]; 190 struct list_head queue[MAX_PRIO];
191 }; 191 };
192 192
193 /* 193 /*
194 * This is the main, per-CPU runqueue data structure. 194 * This is the main, per-CPU runqueue data structure.
195 * 195 *
196 * Locking rule: code that needs to lock multiple runqueues (such as 196 * Locking rule: code that needs to lock multiple runqueues (such as
197 * the load balancing or the thread migration code) must acquire the 197 * the load balancing or the thread migration code) must acquire the
198 * locks in ascending &runqueue order (sketched below). 198 * locks in ascending &runqueue order (sketched below).
199 */ 199 */
200 struct runqueue { 200 struct runqueue {
201 spinlock_t lock; 201 spinlock_t lock;
202 202
203 /* 203 /*
204 * nr_running and cpu_load should be in the same cacheline because 204 * nr_running and cpu_load should be in the same cacheline because
205 * remote CPUs use both these fields when doing load calculation. 205 * remote CPUs use both these fields when doing load calculation.
206 */ 206 */
207 unsigned long nr_running; 207 unsigned long nr_running;
208 #ifdef CONFIG_SMP 208 #ifdef CONFIG_SMP
209 unsigned long cpu_load; 209 unsigned long cpu_load;
210 #endif 210 #endif
211 unsigned long long nr_switches; 211 unsigned long long nr_switches;
212 212
213 /* 213 /*
214 * This is part of a global counter where only the total sum 214 * This is part of a global counter where only the total sum
215 * over all CPUs matters. A task can increase this counter on 215 * over all CPUs matters. A task can increase this counter on
216 * one CPU and if it got migrated afterwards it may decrease 216 * one CPU and if it got migrated afterwards it may decrease
217 * it on another CPU. Always updated under the runqueue lock: 217 * it on another CPU. Always updated under the runqueue lock:
218 */ 218 */
219 unsigned long nr_uninterruptible; 219 unsigned long nr_uninterruptible;
220 220
221 unsigned long expired_timestamp; 221 unsigned long expired_timestamp;
222 unsigned long long timestamp_last_tick; 222 unsigned long long timestamp_last_tick;
223 task_t *curr, *idle; 223 task_t *curr, *idle;
224 struct mm_struct *prev_mm; 224 struct mm_struct *prev_mm;
225 prio_array_t *active, *expired, arrays[2]; 225 prio_array_t *active, *expired, arrays[2];
226 int best_expired_prio; 226 int best_expired_prio;
227 atomic_t nr_iowait; 227 atomic_t nr_iowait;
228 228
229 #ifdef CONFIG_SMP 229 #ifdef CONFIG_SMP
230 struct sched_domain *sd; 230 struct sched_domain *sd;
231 231
232 /* For active balancing */ 232 /* For active balancing */
233 int active_balance; 233 int active_balance;
234 int push_cpu; 234 int push_cpu;
235 235
236 task_t *migration_thread; 236 task_t *migration_thread;
237 struct list_head migration_queue; 237 struct list_head migration_queue;
238 #endif 238 #endif
239 239
240 #ifdef CONFIG_SCHEDSTATS 240 #ifdef CONFIG_SCHEDSTATS
241 /* latency stats */ 241 /* latency stats */
242 struct sched_info rq_sched_info; 242 struct sched_info rq_sched_info;
243 243
244 /* sys_sched_yield() stats */ 244 /* sys_sched_yield() stats */
245 unsigned long yld_exp_empty; 245 unsigned long yld_exp_empty;
246 unsigned long yld_act_empty; 246 unsigned long yld_act_empty;
247 unsigned long yld_both_empty; 247 unsigned long yld_both_empty;
248 unsigned long yld_cnt; 248 unsigned long yld_cnt;
249 249
250 /* schedule() stats */ 250 /* schedule() stats */
251 unsigned long sched_switch; 251 unsigned long sched_switch;
252 unsigned long sched_cnt; 252 unsigned long sched_cnt;
253 unsigned long sched_goidle; 253 unsigned long sched_goidle;
254 254
255 /* try_to_wake_up() stats */ 255 /* try_to_wake_up() stats */
256 unsigned long ttwu_cnt; 256 unsigned long ttwu_cnt;
257 unsigned long ttwu_local; 257 unsigned long ttwu_local;
258 #endif 258 #endif
259 }; 259 };
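
A minimal sketch of the address-ordering rule stated in the comment above; lock_two_rqs() is a hypothetical helper name used only for illustration, and callers are assumed to have interrupts disabled already:

/*
 * Illustrative only: take two runqueue locks in ascending address
 * order, so concurrent callers can never acquire them in opposite
 * orders and deadlock against each other.
 */
static void lock_two_rqs(runqueue_t *rq1, runqueue_t *rq2)
{
	if (rq1 == rq2) {
		spin_lock(&rq1->lock);
	} else if (rq1 < rq2) {
		spin_lock(&rq1->lock);
		spin_lock(&rq2->lock);
	} else {
		spin_lock(&rq2->lock);
		spin_lock(&rq1->lock);
	}
}
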
260 260
261 static DEFINE_PER_CPU(struct runqueue, runqueues); 261 static DEFINE_PER_CPU(struct runqueue, runqueues);
262 262
263 #define for_each_domain(cpu, domain) \ 263 #define for_each_domain(cpu, domain) \
264 for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) 264 for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
265 265
266 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 266 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
267 #define this_rq() (&__get_cpu_var(runqueues)) 267 #define this_rq() (&__get_cpu_var(runqueues))
268 #define task_rq(p) cpu_rq(task_cpu(p)) 268 #define task_rq(p) cpu_rq(task_cpu(p))
269 #define cpu_curr(cpu) (cpu_rq(cpu)->curr) 269 #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
270 270
271 /* 271 /*
272 * Default context-switch locking: 272 * Default context-switch locking:
273 */ 273 */
274 #ifndef prepare_arch_switch 274 #ifndef prepare_arch_switch
275 # define prepare_arch_switch(rq, next) do { } while (0) 275 # define prepare_arch_switch(rq, next) do { } while (0)
276 # define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) 276 # define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock)
277 # define task_running(rq, p) ((rq)->curr == (p)) 277 # define task_running(rq, p) ((rq)->curr == (p))
278 #endif 278 #endif
279 279
280 /* 280 /*
281 * task_rq_lock - lock the runqueue a given task resides on and disable 281 * task_rq_lock - lock the runqueue a given task resides on and disable
282 * interrupts. Note the ordering: we can safely lookup the task_rq without 282 * interrupts. Note the ordering: we can safely lookup the task_rq without
283 * explicitly disabling preemption. 283 * explicitly disabling preemption.
284 */ 284 */
285 static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) 285 static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
286 __acquires(rq->lock) 286 __acquires(rq->lock)
287 { 287 {
288 struct runqueue *rq; 288 struct runqueue *rq;
289 289
290 repeat_lock_task: 290 repeat_lock_task:
291 local_irq_save(*flags); 291 local_irq_save(*flags);
292 rq = task_rq(p); 292 rq = task_rq(p);
293 spin_lock(&rq->lock); 293 spin_lock(&rq->lock);
294 if (unlikely(rq != task_rq(p))) { 294 if (unlikely(rq != task_rq(p))) {
295 spin_unlock_irqrestore(&rq->lock, *flags); 295 spin_unlock_irqrestore(&rq->lock, *flags);
296 goto repeat_lock_task; 296 goto repeat_lock_task;
297 } 297 }
298 return rq; 298 return rq;
299 } 299 }
300 300
301 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) 301 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
302 __releases(rq->lock) 302 __releases(rq->lock)
303 { 303 {
304 spin_unlock_irqrestore(&rq->lock, *flags); 304 spin_unlock_irqrestore(&rq->lock, *flags);
305 } 305 }
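
A short usage sketch of this lock/unlock pair; the function name below is hypothetical, and the point is that the returned rq is guaranteed to still be p's runqueue, because task_rq_lock() re-checks task_rq(p) after taking the lock:

/* Illustrative caller, sketch only. */
static void inspect_task_example(task_t *p)
{
	unsigned long flags;
	runqueue_t *rq;

	rq = task_rq_lock(p, &flags);
	/* ... read or modify p's scheduling state under rq->lock ... */
	task_rq_unlock(rq, &flags);
}
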
306 306
307 #ifdef CONFIG_SCHEDSTATS 307 #ifdef CONFIG_SCHEDSTATS
308 /* 308 /*
309 * bump this up when changing the output format or the meaning of an existing 309 * bump this up when changing the output format or the meaning of an existing
310 * format, so that tools can adapt (or abort) 310 * format, so that tools can adapt (or abort)
311 */ 311 */
312 #define SCHEDSTAT_VERSION 11 312 #define SCHEDSTAT_VERSION 11
313 313
314 static int show_schedstat(struct seq_file *seq, void *v) 314 static int show_schedstat(struct seq_file *seq, void *v)
315 { 315 {
316 int cpu; 316 int cpu;
317 317
318 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); 318 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
319 seq_printf(seq, "timestamp %lu\n", jiffies); 319 seq_printf(seq, "timestamp %lu\n", jiffies);
320 for_each_online_cpu(cpu) { 320 for_each_online_cpu(cpu) {
321 runqueue_t *rq = cpu_rq(cpu); 321 runqueue_t *rq = cpu_rq(cpu);
322 #ifdef CONFIG_SMP 322 #ifdef CONFIG_SMP
323 struct sched_domain *sd; 323 struct sched_domain *sd;
324 int dcnt = 0; 324 int dcnt = 0;
325 #endif 325 #endif
326 326
327 /* runqueue-specific stats */ 327 /* runqueue-specific stats */
328 seq_printf(seq, 328 seq_printf(seq,
329 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", 329 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
330 cpu, rq->yld_both_empty, 330 cpu, rq->yld_both_empty,
331 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, 331 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
332 rq->sched_switch, rq->sched_cnt, rq->sched_goidle, 332 rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
333 rq->ttwu_cnt, rq->ttwu_local, 333 rq->ttwu_cnt, rq->ttwu_local,
334 rq->rq_sched_info.cpu_time, 334 rq->rq_sched_info.cpu_time,
335 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); 335 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
336 336
337 seq_printf(seq, "\n"); 337 seq_printf(seq, "\n");
338 338
339 #ifdef CONFIG_SMP 339 #ifdef CONFIG_SMP
340 /* domain-specific stats */ 340 /* domain-specific stats */
341 for_each_domain(cpu, sd) { 341 for_each_domain(cpu, sd) {
342 enum idle_type itype; 342 enum idle_type itype;
343 char mask_str[NR_CPUS]; 343 char mask_str[NR_CPUS];
344 344
345 cpumask_scnprintf(mask_str, NR_CPUS, sd->span); 345 cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
346 seq_printf(seq, "domain%d %s", dcnt++, mask_str); 346 seq_printf(seq, "domain%d %s", dcnt++, mask_str);
347 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; 347 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
348 itype++) { 348 itype++) {
349 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", 349 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu",
350 sd->lb_cnt[itype], 350 sd->lb_cnt[itype],
351 sd->lb_balanced[itype], 351 sd->lb_balanced[itype],
352 sd->lb_failed[itype], 352 sd->lb_failed[itype],
353 sd->lb_imbalance[itype], 353 sd->lb_imbalance[itype],
354 sd->lb_gained[itype], 354 sd->lb_gained[itype],
355 sd->lb_hot_gained[itype], 355 sd->lb_hot_gained[itype],
356 sd->lb_nobusyq[itype], 356 sd->lb_nobusyq[itype],
357 sd->lb_nobusyg[itype]); 357 sd->lb_nobusyg[itype]);
358 } 358 }
359 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n", 359 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n",
360 sd->alb_cnt, sd->alb_failed, sd->alb_pushed, 360 sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
361 sd->sbe_pushed, sd->sbe_attempts, 361 sd->sbe_pushed, sd->sbe_attempts,
362 sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); 362 sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
363 } 363 }
364 #endif 364 #endif
365 } 365 }
366 return 0; 366 return 0;
367 } 367 }
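
For reference, a hedged user-space sketch of consuming the per-CPU lines emitted above; the field order is taken from the seq_printf() call, and the /proc/schedstat path is an assumption about where this file is exposed (that wiring is not part of this hunk):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/schedstat", "r");	/* assumed path */
	char line[512];
	unsigned long v[12];
	int cpu;

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		/* Field order follows show_schedstat() above. */
		if (sscanf(line,
			   "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
			   &cpu, &v[0], &v[1], &v[2], &v[3], &v[4], &v[5],
			   &v[6], &v[7], &v[8], &v[9], &v[10], &v[11]) == 13)
			printf("cpu%d: ttwu_cnt=%lu ttwu_local=%lu\n",
			       cpu, v[7], v[8]);
	}
	fclose(f);
	return 0;
}
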
368 368
369 static int schedstat_open(struct inode *inode, struct file *file) 369 static int schedstat_open(struct inode *inode, struct file *file)
370 { 370 {
371 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); 371 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
372 char *buf = kmalloc(size, GFP_KERNEL); 372 char *buf = kmalloc(size, GFP_KERNEL);
373 struct seq_file *m; 373 struct seq_file *m;
374 int res; 374 int res;
375 375
376 if (!buf) 376 if (!buf)
377 return -ENOMEM; 377 return -ENOMEM;
378 res = single_open(file, show_schedstat, NULL); 378 res = single_open(file, show_schedstat, NULL);
379 if (!res) { 379 if (!res) {
380 m = file->private_data; 380 m = file->private_data;
381 m->buf = buf; 381 m->buf = buf;
382 m->size = size; 382 m->size = size;
383 } else 383 } else
384 kfree(buf); 384 kfree(buf);
385 return res; 385 return res;
386 } 386 }
387 387
388 struct file_operations proc_schedstat_operations = { 388 struct file_operations proc_schedstat_operations = {
389 .open = schedstat_open, 389 .open = schedstat_open,
390 .read = seq_read, 390 .read = seq_read,
391 .llseek = seq_lseek, 391 .llseek = seq_lseek,
392 .release = single_release, 392 .release = single_release,
393 }; 393 };
394 394
395 # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) 395 # define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
396 # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) 396 # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
397 #else /* !CONFIG_SCHEDSTATS */ 397 #else /* !CONFIG_SCHEDSTATS */
398 # define schedstat_inc(rq, field) do { } while (0) 398 # define schedstat_inc(rq, field) do { } while (0)
399 # define schedstat_add(rq, field, amt) do { } while (0) 399 # define schedstat_add(rq, field, amt) do { } while (0)
400 #endif 400 #endif
401 401
402 /* 402 /*
403 * this_rq_lock - lock this CPU's runqueue and disable interrupts. 403 * this_rq_lock - lock this CPU's runqueue and disable interrupts.
404 */ 404 */
405 static inline runqueue_t *this_rq_lock(void) 405 static inline runqueue_t *this_rq_lock(void)
406 __acquires(rq->lock) 406 __acquires(rq->lock)
407 { 407 {
408 runqueue_t *rq; 408 runqueue_t *rq;
409 409
410 local_irq_disable(); 410 local_irq_disable();
411 rq = this_rq(); 411 rq = this_rq();
412 spin_lock(&rq->lock); 412 spin_lock(&rq->lock);
413 413
414 return rq; 414 return rq;
415 } 415 }
416 416
417 #ifdef CONFIG_SCHED_SMT 417 #ifdef CONFIG_SCHED_SMT
418 static int cpu_and_siblings_are_idle(int cpu) 418 static int cpu_and_siblings_are_idle(int cpu)
419 { 419 {
420 int sib; 420 int sib;
421 for_each_cpu_mask(sib, cpu_sibling_map[cpu]) { 421 for_each_cpu_mask(sib, cpu_sibling_map[cpu]) {
422 if (idle_cpu(sib)) 422 if (idle_cpu(sib))
423 continue; 423 continue;
424 return 0; 424 return 0;
425 } 425 }
426 426
427 return 1; 427 return 1;
428 } 428 }
429 #else 429 #else
430 #define cpu_and_siblings_are_idle(A) idle_cpu(A) 430 #define cpu_and_siblings_are_idle(A) idle_cpu(A)
431 #endif 431 #endif
432 432
433 #ifdef CONFIG_SCHEDSTATS 433 #ifdef CONFIG_SCHEDSTATS
434 /* 434 /*
435 * Called when a process is dequeued from the active array and given 435 * Called when a process is dequeued from the active array and given
436 * the cpu. We should note that with the exception of interactive 436 * the cpu. We should note that with the exception of interactive
437 * tasks, the expired queue will become the active queue after the active 437 * tasks, the expired queue will become the active queue after the active
438 * queue is empty, without explicitly dequeuing and requeuing tasks in the 438 * queue is empty, without explicitly dequeuing and requeuing tasks in the
439 * expired queue. (Interactive tasks may be requeued directly to the 439 * expired queue. (Interactive tasks may be requeued directly to the
440 * active queue, thus delaying tasks in the expired queue from running; 440 * active queue, thus delaying tasks in the expired queue from running;
441 * see scheduler_tick()). 441 * see scheduler_tick()).
442 * 442 *
443 * This function is only called from sched_info_arrive(), rather than 443 * This function is only called from sched_info_arrive(), rather than
444 * dequeue_task(). Even though a task may be queued and dequeued multiple 444 * dequeue_task(). Even though a task may be queued and dequeued multiple
445 * times as it is shuffled about, we're really interested in knowing how 445 * times as it is shuffled about, we're really interested in knowing how
446 * long it was from the *first* time it was queued to the time that it 446 * long it was from the *first* time it was queued to the time that it
447 * finally hit a cpu. 447 * finally hit a cpu.
448 */ 448 */
449 static inline void sched_info_dequeued(task_t *t) 449 static inline void sched_info_dequeued(task_t *t)
450 { 450 {
451 t->sched_info.last_queued = 0; 451 t->sched_info.last_queued = 0;
452 } 452 }
453 453
454 /* 454 /*
455 * Called when a task finally hits the cpu. We can now calculate how 455 * Called when a task finally hits the cpu. We can now calculate how
456 * long it was waiting to run. We also note when it began so that we 456 * long it was waiting to run. We also note when it began so that we
457 * can keep stats on how long its timeslice is. 457 * can keep stats on how long its timeslice is.
458 */ 458 */
459 static inline void sched_info_arrive(task_t *t) 459 static inline void sched_info_arrive(task_t *t)
460 { 460 {
461 unsigned long now = jiffies, diff = 0; 461 unsigned long now = jiffies, diff = 0;
462 struct runqueue *rq = task_rq(t); 462 struct runqueue *rq = task_rq(t);
463 463
464 if (t->sched_info.last_queued) 464 if (t->sched_info.last_queued)
465 diff = now - t->sched_info.last_queued; 465 diff = now - t->sched_info.last_queued;
466 sched_info_dequeued(t); 466 sched_info_dequeued(t);
467 t->sched_info.run_delay += diff; 467 t->sched_info.run_delay += diff;
468 t->sched_info.last_arrival = now; 468 t->sched_info.last_arrival = now;
469 t->sched_info.pcnt++; 469 t->sched_info.pcnt++;
470 470
471 if (!rq) 471 if (!rq)
472 return; 472 return;
473 473
474 rq->rq_sched_info.run_delay += diff; 474 rq->rq_sched_info.run_delay += diff;
475 rq->rq_sched_info.pcnt++; 475 rq->rq_sched_info.pcnt++;
476 } 476 }
477 477
478 /* 478 /*
479 * Called when a process is queued into either the active or expired 479 * Called when a process is queued into either the active or expired
480 * array. The time is noted and later used to determine how long the 480 * array. The time is noted and later used to determine how long the
481 * task had to wait to reach the cpu. Since the expired queue will 481 * task had to wait to reach the cpu. Since the expired queue will
482 * become the active queue after active queue is empty, without dequeuing 482 * become the active queue after active queue is empty, without dequeuing
483 * and requeuing any tasks, we are interested in queuing to either. It 483 * and requeuing any tasks, we are interested in queuing to either. It
484 * is unusual but not impossible for tasks to be dequeued and immediately 484 * is unusual but not impossible for tasks to be dequeued and immediately
485 * requeued in the same or another array: this can happen in sched_yield(), 485 * requeued in the same or another array: this can happen in sched_yield(),
486 * set_user_nice(), and even load_balance() as it moves tasks from runqueue 486 * set_user_nice(), and even load_balance() as it moves tasks from runqueue
487 * to runqueue. 487 * to runqueue.
488 * 488 *
489 * This function is only called from enqueue_task(), but also only updates 489 * This function is only called from enqueue_task(), but also only updates
490 * the timestamp if it is not already set. It's assumed that 490 * the timestamp if it is not already set. It's assumed that
491 * sched_info_dequeued() will clear that stamp when appropriate. 491 * sched_info_dequeued() will clear that stamp when appropriate.
492 */ 492 */
493 static inline void sched_info_queued(task_t *t) 493 static inline void sched_info_queued(task_t *t)
494 { 494 {
495 if (!t->sched_info.last_queued) 495 if (!t->sched_info.last_queued)
496 t->sched_info.last_queued = jiffies; 496 t->sched_info.last_queued = jiffies;
497 } 497 }
498 498
499 /* 499 /*
500 * Called when a process ceases being the active-running process, either 500 * Called when a process ceases being the active-running process, either
501 * voluntarily or involuntarily. Now we can calculate how long it ran. 501 * voluntarily or involuntarily. Now we can calculate how long it ran.
502 */ 502 */
503 static inline void sched_info_depart(task_t *t) 503 static inline void sched_info_depart(task_t *t)
504 { 504 {
505 struct runqueue *rq = task_rq(t); 505 struct runqueue *rq = task_rq(t);
506 unsigned long diff = jiffies - t->sched_info.last_arrival; 506 unsigned long diff = jiffies - t->sched_info.last_arrival;
507 507
508 t->sched_info.cpu_time += diff; 508 t->sched_info.cpu_time += diff;
509 509
510 if (rq) 510 if (rq)
511 rq->rq_sched_info.cpu_time += diff; 511 rq->rq_sched_info.cpu_time += diff;
512 } 512 }
513 513
514 /* 514 /*
515 * Called when tasks are switched involuntarily due, typically, to expiring 515 * Called when tasks are switched involuntarily due, typically, to expiring
516 * their time slice. (This may also be called when switching to or from 516 * their time slice. (This may also be called when switching to or from
517 * the idle task.) We are only called when prev != next. 517 * the idle task.) We are only called when prev != next.
518 */ 518 */
519 static inline void sched_info_switch(task_t *prev, task_t *next) 519 static inline void sched_info_switch(task_t *prev, task_t *next)
520 { 520 {
521 struct runqueue *rq = task_rq(prev); 521 struct runqueue *rq = task_rq(prev);
522 522
523 /* 523 /*
524 * prev now departs the cpu. It's not interesting to record 524 * prev now departs the cpu. It's not interesting to record
525 * stats about how efficient we were at scheduling the idle 525 * stats about how efficient we were at scheduling the idle
526 * process, however. 526 * process, however.
527 */ 527 */
528 if (prev != rq->idle) 528 if (prev != rq->idle)
529 sched_info_depart(prev); 529 sched_info_depart(prev);
530 530
531 if (next != rq->idle) 531 if (next != rq->idle)
532 sched_info_arrive(next); 532 sched_info_arrive(next);
533 } 533 }
534 #else 534 #else
535 #define sched_info_queued(t) do { } while (0) 535 #define sched_info_queued(t) do { } while (0)
536 #define sched_info_switch(t, next) do { } while (0) 536 #define sched_info_switch(t, next) do { } while (0)
537 #endif /* CONFIG_SCHEDSTATS */ 537 #endif /* CONFIG_SCHEDSTATS */
538 538
539 /* 539 /*
540 * Adding/removing a task to/from a priority array: 540 * Adding/removing a task to/from a priority array:
541 */ 541 */
542 static void dequeue_task(struct task_struct *p, prio_array_t *array) 542 static void dequeue_task(struct task_struct *p, prio_array_t *array)
543 { 543 {
544 array->nr_active--; 544 array->nr_active--;
545 list_del(&p->run_list); 545 list_del(&p->run_list);
546 if (list_empty(array->queue + p->prio)) 546 if (list_empty(array->queue + p->prio))
547 __clear_bit(p->prio, array->bitmap); 547 __clear_bit(p->prio, array->bitmap);
548 } 548 }
549 549
550 static void enqueue_task(struct task_struct *p, prio_array_t *array) 550 static void enqueue_task(struct task_struct *p, prio_array_t *array)
551 { 551 {
552 sched_info_queued(p); 552 sched_info_queued(p);
553 list_add_tail(&p->run_list, array->queue + p->prio); 553 list_add_tail(&p->run_list, array->queue + p->prio);
554 __set_bit(p->prio, array->bitmap); 554 __set_bit(p->prio, array->bitmap);
555 array->nr_active++; 555 array->nr_active++;
556 p->array = array; 556 p->array = array;
557 } 557 }
558 558
559 /* 559 /*
560 * Put the task at the end of the run list without the overhead of dequeue 560 * Put the task at the end of the run list without the overhead of dequeue
561 * followed by enqueue. 561 * followed by enqueue.
562 */ 562 */
563 static void requeue_task(struct task_struct *p, prio_array_t *array) 563 static void requeue_task(struct task_struct *p, prio_array_t *array)
564 { 564 {
565 list_move_tail(&p->run_list, array->queue + p->prio); 565 list_move_tail(&p->run_list, array->queue + p->prio);
566 } 566 }
567 567
568 static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) 568 static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
569 { 569 {
570 list_add(&p->run_list, array->queue + p->prio); 570 list_add(&p->run_list, array->queue + p->prio);
571 __set_bit(p->prio, array->bitmap); 571 __set_bit(p->prio, array->bitmap);
572 array->nr_active++; 572 array->nr_active++;
573 p->array = array; 573 p->array = array;
574 } 574 }
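
These helpers keep the per-priority bitmap and lists consistent so that selecting the next task stays O(1). A hedged sketch of that lookup follows; pick_next_sketch() is a hypothetical helper, the real selection is done later in schedule() using the kernel's sched_find_first_bit():

/*
 * Illustrative only: callers must know the array is non-empty
 * (array->nr_active != 0), otherwise there is no set bit to find.
 */
static task_t *pick_next_sketch(prio_array_t *array)
{
	int idx = sched_find_first_bit(array->bitmap);

	return list_entry(array->queue[idx].next, task_t, run_list);
}
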
575 575
576 /* 576 /*
577 * effective_prio - return the priority that is based on the static 577 * effective_prio - return the priority that is based on the static
578 * priority but is modified by bonuses/penalties. 578 * priority but is modified by bonuses/penalties.
579 * 579 *
580 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] 580 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
581 * into the -5 ... 0 ... +5 bonus/penalty range. 581 * into the -5 ... 0 ... +5 bonus/penalty range.
582 * 582 *
583 * We use 25% of the full 0...39 priority range so that: 583 * We use 25% of the full 0...39 priority range so that:
584 * 584 *
585 * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. 585 * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
586 * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. 586 * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
587 * 587 *
588 * Both properties are important to certain workloads. 588 * Both properties are important to certain workloads.
589 */ 589 */
590 static int effective_prio(task_t *p) 590 static int effective_prio(task_t *p)
591 { 591 {
592 int bonus, prio; 592 int bonus, prio;
593 593
594 if (rt_task(p)) 594 if (rt_task(p))
595 return p->prio; 595 return p->prio;
596 596
597 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; 597 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
598 598
599 prio = p->static_prio - bonus; 599 prio = p->static_prio - bonus;
600 if (prio < MAX_RT_PRIO) 600 if (prio < MAX_RT_PRIO)
601 prio = MAX_RT_PRIO; 601 prio = MAX_RT_PRIO;
602 if (prio > MAX_PRIO-1) 602 if (prio > MAX_PRIO-1)
603 prio = MAX_PRIO-1; 603 prio = MAX_PRIO-1;
604 return prio; 604 return prio;
605 } 605 }
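
As a numeric example (assuming HZ 1000, DEF_TIMESLICE 100 and MAX_BONUS 10, so MAX_SLEEP_AVG is 1000 jiffies): a nice-0 task whose sleep average has reached 700ms gets a +2 bonus and therefore priority 118, which is exactly the TASK_INTERACTIVE threshold computed earlier. A standalone sketch of that arithmetic, with the constants re-declared locally as assumptions:

#include <assert.h>

/* Assumed constants, re-declared locally for illustration only. */
#define MAX_BONUS	10
#define MAX_SLEEP_AVG	1000				/* jiffies */
#define NS_TO_JIFFIES(ns)	((ns) / 1000000)	/* HZ == 1000 */

int main(void)
{
	unsigned long long sleep_avg = 700000000ULL;	/* 700ms in ns */
	int bonus = (int)(NS_TO_JIFFIES(sleep_avg) * MAX_BONUS / MAX_SLEEP_AVG)
		    - MAX_BONUS / 2;

	assert(bonus == 2);		/* 700/1000 * 10 - 5 */
	assert(120 - bonus == 118);	/* nice-0 task lands on prio 118 */
	return 0;
}
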
606 606
607 /* 607 /*
608 * __activate_task - move a task to the runqueue. 608 * __activate_task - move a task to the runqueue.
609 */ 609 */
610 static inline void __activate_task(task_t *p, runqueue_t *rq) 610 static inline void __activate_task(task_t *p, runqueue_t *rq)
611 { 611 {
612 enqueue_task(p, rq->active); 612 enqueue_task(p, rq->active);
613 rq->nr_running++; 613 rq->nr_running++;
614 } 614 }
615 615
616 /* 616 /*
617 * __activate_idle_task - move idle task to the _front_ of runqueue. 617 * __activate_idle_task - move idle task to the _front_ of runqueue.
618 */ 618 */
619 static inline void __activate_idle_task(task_t *p, runqueue_t *rq) 619 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
620 { 620 {
621 enqueue_task_head(p, rq->active); 621 enqueue_task_head(p, rq->active);
622 rq->nr_running++; 622 rq->nr_running++;
623 } 623 }
624 624
625 static void recalc_task_prio(task_t *p, unsigned long long now) 625 static void recalc_task_prio(task_t *p, unsigned long long now)
626 { 626 {
627 /* Caller must always ensure 'now >= p->timestamp' */ 627 /* Caller must always ensure 'now >= p->timestamp' */
628 unsigned long long __sleep_time = now - p->timestamp; 628 unsigned long long __sleep_time = now - p->timestamp;
629 unsigned long sleep_time; 629 unsigned long sleep_time;
630 630
631 if (__sleep_time > NS_MAX_SLEEP_AVG) 631 if (__sleep_time > NS_MAX_SLEEP_AVG)
632 sleep_time = NS_MAX_SLEEP_AVG; 632 sleep_time = NS_MAX_SLEEP_AVG;
633 else 633 else
634 sleep_time = (unsigned long)__sleep_time; 634 sleep_time = (unsigned long)__sleep_time;
635 635
636 if (likely(sleep_time > 0)) { 636 if (likely(sleep_time > 0)) {
637 /* 637 /*
638 * User tasks that sleep a long time are categorised as 638 * User tasks that sleep a long time are categorised as
639 * idle and will get just interactive status to stay active & 639 * idle and will get just interactive status to stay active &
640 * prevent them from suddenly becoming cpu hogs and starving 640 * prevent them from suddenly becoming cpu hogs and starving
641 * other processes. 641 * other processes.
642 */ 642 */
643 if (p->mm && p->activated != -1 && 643 if (p->mm && p->activated != -1 &&
644 sleep_time > INTERACTIVE_SLEEP(p)) { 644 sleep_time > INTERACTIVE_SLEEP(p)) {
645 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - 645 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
646 DEF_TIMESLICE); 646 DEF_TIMESLICE);
647 } else { 647 } else {
648 /* 648 /*
649 * The lower the sleep avg a task has the more 649 * The lower the sleep avg a task has the more
650 * rapidly it will rise with sleep time. 650 * rapidly it will rise with sleep time.
651 */ 651 */
652 sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; 652 sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
653 653
654 /* 654 /*
655 * Tasks waking from uninterruptible sleep are 655 * Tasks waking from uninterruptible sleep are
656 * limited in their sleep_avg rise as they 656 * limited in their sleep_avg rise as they
657 * are likely to be waiting on I/O 657 * are likely to be waiting on I/O
658 */ 658 */
659 if (p->activated == -1 && p->mm) { 659 if (p->activated == -1 && p->mm) {
660 if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) 660 if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
661 sleep_time = 0; 661 sleep_time = 0;
662 else if (p->sleep_avg + sleep_time >= 662 else if (p->sleep_avg + sleep_time >=
663 INTERACTIVE_SLEEP(p)) { 663 INTERACTIVE_SLEEP(p)) {
664 p->sleep_avg = INTERACTIVE_SLEEP(p); 664 p->sleep_avg = INTERACTIVE_SLEEP(p);
665 sleep_time = 0; 665 sleep_time = 0;
666 } 666 }
667 } 667 }
668 668
669 /* 669 /*
670 * This code gives a bonus to interactive tasks. 670 * This code gives a bonus to interactive tasks.
671 * 671 *
672 * The boost works by updating the 'average sleep time' 672 * The boost works by updating the 'average sleep time'
673 * value here, based on ->timestamp. The more time a 673 * value here, based on ->timestamp. The more time a
674 * task spends sleeping, the higher the average gets - 674 * task spends sleeping, the higher the average gets -
675 * and the higher the priority boost gets as well. 675 * and the higher the priority boost gets as well.
676 */ 676 */
677 p->sleep_avg += sleep_time; 677 p->sleep_avg += sleep_time;
678 678
679 if (p->sleep_avg > NS_MAX_SLEEP_AVG) 679 if (p->sleep_avg > NS_MAX_SLEEP_AVG)
680 p->sleep_avg = NS_MAX_SLEEP_AVG; 680 p->sleep_avg = NS_MAX_SLEEP_AVG;
681 } 681 }
682 } 682 }
683 683
684 p->prio = effective_prio(p); 684 p->prio = effective_prio(p);
685 } 685 }
686 686
687 /* 687 /*
688 * activate_task - move a task to the runqueue and do priority recalculation 688 * activate_task - move a task to the runqueue and do priority recalculation
689 * 689 *
690 * Update all the scheduling statistics stuff. (sleep average 690 * Update all the scheduling statistics stuff. (sleep average
691 * calculation, priority modifiers, etc.) 691 * calculation, priority modifiers, etc.)
692 */ 692 */
693 static void activate_task(task_t *p, runqueue_t *rq, int local) 693 static void activate_task(task_t *p, runqueue_t *rq, int local)
694 { 694 {
695 unsigned long long now; 695 unsigned long long now;
696 696
697 now = sched_clock(); 697 now = sched_clock();
698 #ifdef CONFIG_SMP 698 #ifdef CONFIG_SMP
699 if (!local) { 699 if (!local) {
700 /* Compensate for drifting sched_clock */ 700 /* Compensate for drifting sched_clock */
701 runqueue_t *this_rq = this_rq(); 701 runqueue_t *this_rq = this_rq();
702 now = (now - this_rq->timestamp_last_tick) 702 now = (now - this_rq->timestamp_last_tick)
703 + rq->timestamp_last_tick; 703 + rq->timestamp_last_tick;
704 } 704 }
705 #endif 705 #endif
706 706
707 recalc_task_prio(p, now); 707 recalc_task_prio(p, now);
708 708
709 /* 709 /*
710 * This checks to make sure it's not an uninterruptible task 710 * This checks to make sure it's not an uninterruptible task
711 * that is now waking up. 711 * that is now waking up.
712 */ 712 */
713 if (!p->activated) { 713 if (!p->activated) {
714 /* 714 /*
715 * Tasks which were woken up by interrupts (i.e. hw events) 715 * Tasks which were woken up by interrupts (i.e. hw events)
716 * are most likely of interactive nature. So we give them 716 * are most likely of interactive nature. So we give them
717 * the credit of extending their sleep time to the period 717 * the credit of extending their sleep time to the period
718 * of time they spend on the runqueue, waiting for execution 718 * of time they spend on the runqueue, waiting for execution
719 * on a CPU, first time around: 719 * on a CPU, first time around:
720 */ 720 */
721 if (in_interrupt()) 721 if (in_interrupt())
722 p->activated = 2; 722 p->activated = 2;
723 else { 723 else {
724 /* 724 /*
725 * Normal first-time wakeups get a credit too for 725 * Normal first-time wakeups get a credit too for
726 * on-runqueue time, but it will be weighted down: 726 * on-runqueue time, but it will be weighted down:
727 */ 727 */
728 p->activated = 1; 728 p->activated = 1;
729 } 729 }
730 } 730 }
731 p->timestamp = now; 731 p->timestamp = now;
732 732
733 __activate_task(p, rq); 733 __activate_task(p, rq);
734 } 734 }
735 735
736 /* 736 /*
737 * deactivate_task - remove a task from the runqueue. 737 * deactivate_task - remove a task from the runqueue.
738 */ 738 */
739 static void deactivate_task(struct task_struct *p, runqueue_t *rq) 739 static void deactivate_task(struct task_struct *p, runqueue_t *rq)
740 { 740 {
741 rq->nr_running--; 741 rq->nr_running--;
742 dequeue_task(p, p->array); 742 dequeue_task(p, p->array);
743 p->array = NULL; 743 p->array = NULL;
744 } 744 }
745 745
746 /* 746 /*
747 * resched_task - mark a task 'to be rescheduled now'. 747 * resched_task - mark a task 'to be rescheduled now'.
748 * 748 *
749 * On UP this means the setting of the need_resched flag; on SMP it 749 * On UP this means the setting of the need_resched flag; on SMP it
750 * might also involve a cross-CPU call to trigger the scheduler on 750 * might also involve a cross-CPU call to trigger the scheduler on
751 * the target CPU. 751 * the target CPU.
752 */ 752 */
753 #ifdef CONFIG_SMP 753 #ifdef CONFIG_SMP
754 static void resched_task(task_t *p) 754 static void resched_task(task_t *p)
755 { 755 {
756 int need_resched, nrpolling; 756 int need_resched, nrpolling;
757 757
758 assert_spin_locked(&task_rq(p)->lock); 758 assert_spin_locked(&task_rq(p)->lock);
759 759
760 /* minimise the chance of sending an interrupt to poll_idle() */ 760 /* minimise the chance of sending an interrupt to poll_idle() */
761 nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); 761 nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
762 need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED); 762 need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
763 nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); 763 nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
764 764
765 if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id())) 765 if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
766 smp_send_reschedule(task_cpu(p)); 766 smp_send_reschedule(task_cpu(p));
767 } 767 }
768 #else 768 #else
769 static inline void resched_task(task_t *p) 769 static inline void resched_task(task_t *p)
770 { 770 {
771 set_tsk_need_resched(p); 771 set_tsk_need_resched(p);
772 } 772 }
773 #endif 773 #endif
774 774
775 /** 775 /**
776 * task_curr - is this task currently executing on a CPU? 776 * task_curr - is this task currently executing on a CPU?
777 * @p: the task in question. 777 * @p: the task in question.
778 */ 778 */
779 inline int task_curr(const task_t *p) 779 inline int task_curr(const task_t *p)
780 { 780 {
781 return cpu_curr(task_cpu(p)) == p; 781 return cpu_curr(task_cpu(p)) == p;
782 } 782 }
783 783
784 #ifdef CONFIG_SMP 784 #ifdef CONFIG_SMP
785 enum request_type { 785 enum request_type {
786 REQ_MOVE_TASK, 786 REQ_MOVE_TASK,
787 REQ_SET_DOMAIN, 787 REQ_SET_DOMAIN,
788 }; 788 };
789 789
790 typedef struct { 790 typedef struct {
791 struct list_head list; 791 struct list_head list;
792 enum request_type type; 792 enum request_type type;
793 793
794 /* For REQ_MOVE_TASK */ 794 /* For REQ_MOVE_TASK */
795 task_t *task; 795 task_t *task;
796 int dest_cpu; 796 int dest_cpu;
797 797
798 /* For REQ_SET_DOMAIN */ 798 /* For REQ_SET_DOMAIN */
799 struct sched_domain *sd; 799 struct sched_domain *sd;
800 800
801 struct completion done; 801 struct completion done;
802 } migration_req_t; 802 } migration_req_t;
803 803
804 /* 804 /*
805 * The task's runqueue lock must be held. 805 * The task's runqueue lock must be held.
806 * Returns true if you have to wait for migration thread. 806 * Returns true if you have to wait for migration thread.
807 */ 807 */
808 static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) 808 static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
809 { 809 {
810 runqueue_t *rq = task_rq(p); 810 runqueue_t *rq = task_rq(p);
811 811
812 /* 812 /*
813 * If the task is not on a runqueue (and not running), then 813 * If the task is not on a runqueue (and not running), then
814 * it is sufficient to simply update the task's cpu field. 814 * it is sufficient to simply update the task's cpu field.
815 */ 815 */
816 if (!p->array && !task_running(rq, p)) { 816 if (!p->array && !task_running(rq, p)) {
817 set_task_cpu(p, dest_cpu); 817 set_task_cpu(p, dest_cpu);
818 return 0; 818 return 0;
819 } 819 }
820 820
821 init_completion(&req->done); 821 init_completion(&req->done);
822 req->type = REQ_MOVE_TASK; 822 req->type = REQ_MOVE_TASK;
823 req->task = p; 823 req->task = p;
824 req->dest_cpu = dest_cpu; 824 req->dest_cpu = dest_cpu;
825 list_add(&req->list, &rq->migration_queue); 825 list_add(&req->list, &rq->migration_queue);
826 return 1; 826 return 1;
827 } 827 }
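
A hedged sketch of the calling convention described above; the function name is hypothetical, and the pattern mirrors how the CPU-affinity code in this file uses the migration thread: if migrate_task() returns 1, drop the runqueue lock, wake the migration thread and wait on the completion.

/* Illustrative caller, sketch only. */
static void move_task_example(task_t *p, int dest_cpu)
{
	migration_req_t req;
	unsigned long flags;
	runqueue_t *rq;

	rq = task_rq_lock(p, &flags);
	if (migrate_task(p, dest_cpu, &req)) {
		/* Need help from the migration thread: drop lock and wait. */
		task_rq_unlock(rq, &flags);
		wake_up_process(rq->migration_thread);
		wait_for_completion(&req.done);
		return;
	}
	task_rq_unlock(rq, &flags);
}
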
828 828
829 /* 829 /*
830 * wait_task_inactive - wait for a thread to unschedule. 830 * wait_task_inactive - wait for a thread to unschedule.
831 * 831 *
832 * The caller must ensure that the task *will* unschedule sometime soon, 832 * The caller must ensure that the task *will* unschedule sometime soon,
833 * else this function might spin for a *long* time. This function can't 833 * else this function might spin for a *long* time. This function can't
834 * be called with interrupts off, or it may introduce deadlock with 834 * be called with interrupts off, or it may introduce deadlock with
835 * smp_call_function() if an IPI is sent by the same process we are 835 * smp_call_function() if an IPI is sent by the same process we are
836 * waiting to become inactive. 836 * waiting to become inactive.
837 */ 837 */
838 void wait_task_inactive(task_t * p) 838 void wait_task_inactive(task_t * p)
839 { 839 {
840 unsigned long flags; 840 unsigned long flags;
841 runqueue_t *rq; 841 runqueue_t *rq;
842 int preempted; 842 int preempted;
843 843
844 repeat: 844 repeat:
845 rq = task_rq_lock(p, &flags); 845 rq = task_rq_lock(p, &flags);
846 /* Must be off runqueue entirely, not preempted. */ 846 /* Must be off runqueue entirely, not preempted. */
847 if (unlikely(p->array || task_running(rq, p))) { 847 if (unlikely(p->array || task_running(rq, p))) {
848 /* If it's preempted, we yield. It could be a while. */ 848 /* If it's preempted, we yield. It could be a while. */
849 preempted = !task_running(rq, p); 849 preempted = !task_running(rq, p);
850 task_rq_unlock(rq, &flags); 850 task_rq_unlock(rq, &flags);
851 cpu_relax(); 851 cpu_relax();
852 if (preempted) 852 if (preempted)
853 yield(); 853 yield();
854 goto repeat; 854 goto repeat;
855 } 855 }
856 task_rq_unlock(rq, &flags); 856 task_rq_unlock(rq, &flags);
857 } 857 }
858 858
859 /*** 859 /***
860 * kick_process - kick a running thread to enter/exit the kernel 860 * kick_process - kick a running thread to enter/exit the kernel
861 * @p: the to-be-kicked thread 861 * @p: the to-be-kicked thread
862 * 862 *
863 * Cause a process which is running on another CPU to enter 863 * Cause a process which is running on another CPU to enter
864 * kernel-mode, without any delay. (to get signals handled.) 864 * kernel-mode, without any delay. (to get signals handled.)
865 * 865 *
866 * NOTE: this function doesn't have to take the runqueue lock, 866 * NOTE: this function doesn't have to take the runqueue lock,
867 * because all it wants to ensure is that the remote task enters 867 * because all it wants to ensure is that the remote task enters
868 * the kernel. If the IPI races and the task has been migrated 868 * the kernel. If the IPI races and the task has been migrated
869 * to another CPU then no harm is done and the purpose has been 869 * to another CPU then no harm is done and the purpose has been
870 * achieved as well. 870 * achieved as well.
871 */ 871 */
872 void kick_process(task_t *p) 872 void kick_process(task_t *p)
873 { 873 {
874 int cpu; 874 int cpu;
875 875
876 preempt_disable(); 876 preempt_disable();
877 cpu = task_cpu(p); 877 cpu = task_cpu(p);
878 if ((cpu != smp_processor_id()) && task_curr(p)) 878 if ((cpu != smp_processor_id()) && task_curr(p))
879 smp_send_reschedule(cpu); 879 smp_send_reschedule(cpu);
880 preempt_enable(); 880 preempt_enable();
881 } 881 }
882 882
883 /* 883 /*
884 * Return a low guess at the load of a migration-source cpu. 884 * Return a low guess at the load of a migration-source cpu.
885 * 885 *
886 * We want to under-estimate the load of migration sources, to 886 * We want to under-estimate the load of migration sources, to
887 * balance conservatively. 887 * balance conservatively.
888 */ 888 */
889 static inline unsigned long source_load(int cpu) 889 static inline unsigned long source_load(int cpu)
890 { 890 {
891 runqueue_t *rq = cpu_rq(cpu); 891 runqueue_t *rq = cpu_rq(cpu);
892 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 892 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
893 893
894 return min(rq->cpu_load, load_now); 894 return min(rq->cpu_load, load_now);
895 } 895 }
896 896
897 /* 897 /*
898 * Return a high guess at the load of a migration-target cpu 898 * Return a high guess at the load of a migration-target cpu
899 */ 899 */
900 static inline unsigned long target_load(int cpu) 900 static inline unsigned long target_load(int cpu)
901 { 901 {
902 runqueue_t *rq = cpu_rq(cpu); 902 runqueue_t *rq = cpu_rq(cpu);
903 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 903 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
904 904
905 return max(rq->cpu_load, load_now); 905 return max(rq->cpu_load, load_now);
906 } 906 }
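
A worked example of why the pair is asymmetric, with SCHED_LOAD_SCALE treated as an abstract unit; the value 128 below is only a placeholder for illustration, not taken from this hunk:

#include <assert.h>

#define SCHED_LOAD_SCALE 128UL		/* placeholder unit, assumption */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

static unsigned long max_ul(unsigned long a, unsigned long b)
{
	return a > b ? a : b;
}

int main(void)
{
	unsigned long cpu_load = 1 * SCHED_LOAD_SCALE;	/* decayed history */
	unsigned long load_now = 3 * SCHED_LOAD_SCALE;	/* 3 runnable right now */

	/* A momentary spike is under-reported when the cpu is a source... */
	assert(min_ul(cpu_load, load_now) == 1 * SCHED_LOAD_SCALE);
	/* ...and over-reported when it is a target, so balancing stays cautious. */
	assert(max_ul(cpu_load, load_now) == 3 * SCHED_LOAD_SCALE);
	return 0;
}
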
907 907
908 #endif 908 #endif
909 909
910 /* 910 /*
911 * wake_idle() will wake a task on an idle cpu if task->cpu is 911 * wake_idle() will wake a task on an idle cpu if task->cpu is
912 * not idle and an idle cpu is available. The span of cpus to 912 * not idle and an idle cpu is available. The span of cpus to
913 * search starts with the closest cpus and widens outward as needed, 913 * search starts with the closest cpus and widens outward as needed,
914 * so we always favor a closer, idle cpu. 914 * so we always favor a closer, idle cpu.
915 * 915 *
916 * Returns the CPU we should wake onto. 916 * Returns the CPU we should wake onto.
917 */ 917 */
918 #if defined(ARCH_HAS_SCHED_WAKE_IDLE) 918 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
919 static int wake_idle(int cpu, task_t *p) 919 static int wake_idle(int cpu, task_t *p)
920 { 920 {
921 cpumask_t tmp; 921 cpumask_t tmp;
922 struct sched_domain *sd; 922 struct sched_domain *sd;
923 int i; 923 int i;
924 924
925 if (idle_cpu(cpu)) 925 if (idle_cpu(cpu))
926 return cpu; 926 return cpu;
927 927
928 for_each_domain(cpu, sd) { 928 for_each_domain(cpu, sd) {
929 if (sd->flags & SD_WAKE_IDLE) { 929 if (sd->flags & SD_WAKE_IDLE) {
930 cpus_and(tmp, sd->span, cpu_online_map); 930 cpus_and(tmp, sd->span, cpu_online_map);
931 cpus_and(tmp, tmp, p->cpus_allowed); 931 cpus_and(tmp, tmp, p->cpus_allowed);
932 for_each_cpu_mask(i, tmp) { 932 for_each_cpu_mask(i, tmp) {
933 if (idle_cpu(i)) 933 if (idle_cpu(i))
934 return i; 934 return i;
935 } 935 }
936 } 936 }
937 else break; 937 else break;
938 } 938 }
939 return cpu; 939 return cpu;
940 } 940 }
941 #else 941 #else
942 static inline int wake_idle(int cpu, task_t *p) 942 static inline int wake_idle(int cpu, task_t *p)
943 { 943 {
944 return cpu; 944 return cpu;
945 } 945 }
946 #endif 946 #endif
947 947
948 /*** 948 /***
949 * try_to_wake_up - wake up a thread 949 * try_to_wake_up - wake up a thread
950 * @p: the to-be-woken-up thread 950 * @p: the to-be-woken-up thread
951 * @state: the mask of task states that can be woken 951 * @state: the mask of task states that can be woken
952 * @sync: do a synchronous wakeup? 952 * @sync: do a synchronous wakeup?
953 * 953 *
954 * Put it on the run-queue if it's not already there. The "current" 954 * Put it on the run-queue if it's not already there. The "current"
955 * thread is always on the run-queue (except when the actual 955 * thread is always on the run-queue (except when the actual
956 * re-schedule is in progress), and as such you're allowed to do 956 * re-schedule is in progress), and as such you're allowed to do
957 * the simpler "current->state = TASK_RUNNING" to mark yourself 957 * the simpler "current->state = TASK_RUNNING" to mark yourself
958 * runnable without the overhead of this. 958 * runnable without the overhead of this.
959 * 959 *
960 * returns failure only if the task is already active. 960 * returns failure only if the task is already active.
961 */ 961 */
962 static int try_to_wake_up(task_t * p, unsigned int state, int sync) 962 static int try_to_wake_up(task_t * p, unsigned int state, int sync)
963 { 963 {
964 int cpu, this_cpu, success = 0; 964 int cpu, this_cpu, success = 0;
965 unsigned long flags; 965 unsigned long flags;
966 long old_state; 966 long old_state;
967 runqueue_t *rq; 967 runqueue_t *rq;
968 #ifdef CONFIG_SMP 968 #ifdef CONFIG_SMP
969 unsigned long load, this_load; 969 unsigned long load, this_load;
970 struct sched_domain *sd; 970 struct sched_domain *sd;
971 int new_cpu; 971 int new_cpu;
972 #endif 972 #endif
973 973
974 rq = task_rq_lock(p, &flags); 974 rq = task_rq_lock(p, &flags);
975 old_state = p->state; 975 old_state = p->state;
976 if (!(old_state & state)) 976 if (!(old_state & state))
977 goto out; 977 goto out;
978 978
979 if (p->array) 979 if (p->array)
980 goto out_running; 980 goto out_running;
981 981
982 cpu = task_cpu(p); 982 cpu = task_cpu(p);
983 this_cpu = smp_processor_id(); 983 this_cpu = smp_processor_id();
984 984
985 #ifdef CONFIG_SMP 985 #ifdef CONFIG_SMP
986 if (unlikely(task_running(rq, p))) 986 if (unlikely(task_running(rq, p)))
987 goto out_activate; 987 goto out_activate;
988 988
989 #ifdef CONFIG_SCHEDSTATS 989 #ifdef CONFIG_SCHEDSTATS
990 schedstat_inc(rq, ttwu_cnt); 990 schedstat_inc(rq, ttwu_cnt);
991 if (cpu == this_cpu) { 991 if (cpu == this_cpu) {
992 schedstat_inc(rq, ttwu_local); 992 schedstat_inc(rq, ttwu_local);
993 } else { 993 } else {
994 for_each_domain(this_cpu, sd) { 994 for_each_domain(this_cpu, sd) {
995 if (cpu_isset(cpu, sd->span)) { 995 if (cpu_isset(cpu, sd->span)) {
996 schedstat_inc(sd, ttwu_wake_remote); 996 schedstat_inc(sd, ttwu_wake_remote);
997 break; 997 break;
998 } 998 }
999 } 999 }
1000 } 1000 }
1001 #endif 1001 #endif
1002 1002
1003 new_cpu = cpu; 1003 new_cpu = cpu;
1004 if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) 1004 if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1005 goto out_set_cpu; 1005 goto out_set_cpu;
1006 1006
1007 load = source_load(cpu); 1007 load = source_load(cpu);
1008 this_load = target_load(this_cpu); 1008 this_load = target_load(this_cpu);
1009 1009
1010 /* 1010 /*
1011 * If sync wakeup then subtract the (maximum possible) effect of 1011 * If sync wakeup then subtract the (maximum possible) effect of
1012 * the currently running task from the load of the current CPU: 1012 * the currently running task from the load of the current CPU:
1013 */ 1013 */
1014 if (sync) 1014 if (sync)
1015 this_load -= SCHED_LOAD_SCALE; 1015 this_load -= SCHED_LOAD_SCALE;
1016 1016
1017 /* Don't pull the task off an idle CPU to a busy one */ 1017 /* Don't pull the task off an idle CPU to a busy one */
1018 if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2) 1018 if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
1019 goto out_set_cpu; 1019 goto out_set_cpu;
1020 1020
1021 new_cpu = this_cpu; /* Wake to this CPU if we can */ 1021 new_cpu = this_cpu; /* Wake to this CPU if we can */
1022 1022
1023 /* 1023 /*
1024 * Scan domains for affine wakeup and passive balancing 1024 * Scan domains for affine wakeup and passive balancing
1025 * possibilities. 1025 * possibilities.
1026 */ 1026 */
1027 for_each_domain(this_cpu, sd) { 1027 for_each_domain(this_cpu, sd) {
1028 unsigned int imbalance; 1028 unsigned int imbalance;
1029 /* 1029 /*
1030 * Start passive balancing when half the imbalance_pct 1030 * Start passive balancing when half the imbalance_pct
1031 * limit is reached. 1031 * limit is reached.
1032 */ 1032 */
1033 imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2; 1033 imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
1034 1034
1035 if ((sd->flags & SD_WAKE_AFFINE) && 1035 if ((sd->flags & SD_WAKE_AFFINE) &&
1036 !task_hot(p, rq->timestamp_last_tick, sd)) { 1036 !task_hot(p, rq->timestamp_last_tick, sd)) {
1037 /* 1037 /*
1038 * This domain has SD_WAKE_AFFINE and p is cache cold 1038 * This domain has SD_WAKE_AFFINE and p is cache cold
1039 * in this domain. 1039 * in this domain.
1040 */ 1040 */
1041 if (cpu_isset(cpu, sd->span)) { 1041 if (cpu_isset(cpu, sd->span)) {
1042 schedstat_inc(sd, ttwu_move_affine); 1042 schedstat_inc(sd, ttwu_move_affine);
1043 goto out_set_cpu; 1043 goto out_set_cpu;
1044 } 1044 }
1045 } else if ((sd->flags & SD_WAKE_BALANCE) && 1045 } else if ((sd->flags & SD_WAKE_BALANCE) &&
1046 imbalance*this_load <= 100*load) { 1046 imbalance*this_load <= 100*load) {
1047 /* 1047 /*
1048 * This domain has SD_WAKE_BALANCE and there is 1048 * This domain has SD_WAKE_BALANCE and there is
1049 * an imbalance. 1049 * an imbalance.
1050 */ 1050 */
1051 if (cpu_isset(cpu, sd->span)) { 1051 if (cpu_isset(cpu, sd->span)) {
1052 schedstat_inc(sd, ttwu_move_balance); 1052 schedstat_inc(sd, ttwu_move_balance);
1053 goto out_set_cpu; 1053 goto out_set_cpu;
1054 } 1054 }
1055 } 1055 }
1056 } 1056 }
1057 1057
1058 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ 1058 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1059 out_set_cpu: 1059 out_set_cpu:
1060 new_cpu = wake_idle(new_cpu, p); 1060 new_cpu = wake_idle(new_cpu, p);
1061 if (new_cpu != cpu) { 1061 if (new_cpu != cpu) {
1062 set_task_cpu(p, new_cpu); 1062 set_task_cpu(p, new_cpu);
1063 task_rq_unlock(rq, &flags); 1063 task_rq_unlock(rq, &flags);
1064 /* might preempt at this point */ 1064 /* might preempt at this point */
1065 rq = task_rq_lock(p, &flags); 1065 rq = task_rq_lock(p, &flags);
1066 old_state = p->state; 1066 old_state = p->state;
1067 if (!(old_state & state)) 1067 if (!(old_state & state))
1068 goto out; 1068 goto out;
1069 if (p->array) 1069 if (p->array)
1070 goto out_running; 1070 goto out_running;
1071 1071
1072 this_cpu = smp_processor_id(); 1072 this_cpu = smp_processor_id();
1073 cpu = task_cpu(p); 1073 cpu = task_cpu(p);
1074 } 1074 }
1075 1075
1076 out_activate: 1076 out_activate:
1077 #endif /* CONFIG_SMP */ 1077 #endif /* CONFIG_SMP */
1078 if (old_state == TASK_UNINTERRUPTIBLE) { 1078 if (old_state == TASK_UNINTERRUPTIBLE) {
1079 rq->nr_uninterruptible--; 1079 rq->nr_uninterruptible--;
1080 /* 1080 /*
1081 * Tasks on involuntary sleep don't earn 1081 * Tasks on involuntary sleep don't earn
1082 * sleep_avg beyond just interactive state. 1082 * sleep_avg beyond just interactive state.
1083 */ 1083 */
1084 p->activated = -1; 1084 p->activated = -1;
1085 } 1085 }
1086 1086
1087 /* 1087 /*
1088 * Sync wakeups (i.e. those types of wakeups where the waker 1088 * Sync wakeups (i.e. those types of wakeups where the waker
1089 * has indicated that it will leave the CPU in short order) 1089 * has indicated that it will leave the CPU in short order)
1090 * don't trigger a preemption, if the woken up task will run on 1090 * don't trigger a preemption, if the woken up task will run on
1091 * this cpu. (in this case the 'I will reschedule' promise of 1091 * this cpu. (in this case the 'I will reschedule' promise of
1092 * the waker guarantees that the freshly woken up task is going 1092 * the waker guarantees that the freshly woken up task is going
1093 * to be considered on this CPU.) 1093 * to be considered on this CPU.)
1094 */ 1094 */
1095 activate_task(p, rq, cpu == this_cpu); 1095 activate_task(p, rq, cpu == this_cpu);
1096 if (!sync || cpu != this_cpu) { 1096 if (!sync || cpu != this_cpu) {
1097 if (TASK_PREEMPTS_CURR(p, rq)) 1097 if (TASK_PREEMPTS_CURR(p, rq))
1098 resched_task(rq->curr); 1098 resched_task(rq->curr);
1099 } 1099 }
1100 success = 1; 1100 success = 1;
1101 1101
1102 out_running: 1102 out_running:
1103 p->state = TASK_RUNNING; 1103 p->state = TASK_RUNNING;
1104 out: 1104 out:
1105 task_rq_unlock(rq, &flags); 1105 task_rq_unlock(rq, &flags);
1106 1106
1107 return success; 1107 return success;
1108 } 1108 }
1109 1109
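The SD_WAKE_BALANCE test above boils down to a percentage threshold: the woken task is pulled over to the waking CPU only if its old CPU carries enough extra load (and the domain spans both CPUs). A small user-space sketch of just that arithmetic, assuming an illustrative imbalance_pct of 125; none of this is taken from the patch itself:

/*
 * Illustrative only: the "half the imbalance_pct limit" comparison from
 * try_to_wake_up().  load/this_load mirror the variables above.
 */
#include <stdio.h>

static int wake_balance_pulls(unsigned long load, unsigned long this_load,
                              unsigned int imbalance_pct)
{
        /* start passive balancing when half the imbalance_pct limit is hit */
        unsigned int imbalance = imbalance_pct + (imbalance_pct - 100) / 2;

        return imbalance * this_load <= 100 * load;
}

int main(void)
{
        /* e.g. old CPU load 1500, waker's (sync-adjusted) load 1000 */
        printf("pull to waker's CPU: %d\n", wake_balance_pulls(1500, 1000, 125));
        return 0;
}

With these example numbers the old CPU is 50% busier than the waker's, which clears the ~37% threshold implied by an imbalance_pct of 125, so the wakeup would migrate the task.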
1110 int fastcall wake_up_process(task_t * p) 1110 int fastcall wake_up_process(task_t * p)
1111 { 1111 {
1112 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | 1112 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1113 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); 1113 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1114 } 1114 }
1115 1115
1116 EXPORT_SYMBOL(wake_up_process); 1116 EXPORT_SYMBOL(wake_up_process);
1117 1117
1118 int fastcall wake_up_state(task_t *p, unsigned int state) 1118 int fastcall wake_up_state(task_t *p, unsigned int state)
1119 { 1119 {
1120 return try_to_wake_up(p, state, 0); 1120 return try_to_wake_up(p, state, 0);
1121 } 1121 }
1122 1122
1123 #ifdef CONFIG_SMP 1123 #ifdef CONFIG_SMP
1124 static int find_idlest_cpu(struct task_struct *p, int this_cpu, 1124 static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1125 struct sched_domain *sd); 1125 struct sched_domain *sd);
1126 #endif 1126 #endif
1127 1127
1128 /* 1128 /*
1129 * Perform scheduler related setup for a newly forked process p. 1129 * Perform scheduler related setup for a newly forked process p.
1130 * p is forked by current. 1130 * p is forked by current.
1131 */ 1131 */
1132 void fastcall sched_fork(task_t *p) 1132 void fastcall sched_fork(task_t *p)
1133 { 1133 {
1134 /* 1134 /*
1135 * We mark the process as running here, but have not actually 1135 * We mark the process as running here, but have not actually
1136 * inserted it onto the runqueue yet. This guarantees that 1136 * inserted it onto the runqueue yet. This guarantees that
1137 * nobody will actually run it, and a signal or other external 1137 * nobody will actually run it, and a signal or other external
1138 * event cannot wake it up and insert it on the runqueue either. 1138 * event cannot wake it up and insert it on the runqueue either.
1139 */ 1139 */
1140 p->state = TASK_RUNNING; 1140 p->state = TASK_RUNNING;
1141 INIT_LIST_HEAD(&p->run_list); 1141 INIT_LIST_HEAD(&p->run_list);
1142 p->array = NULL; 1142 p->array = NULL;
1143 spin_lock_init(&p->switch_lock); 1143 spin_lock_init(&p->switch_lock);
1144 #ifdef CONFIG_SCHEDSTATS 1144 #ifdef CONFIG_SCHEDSTATS
1145 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1145 memset(&p->sched_info, 0, sizeof(p->sched_info));
1146 #endif 1146 #endif
1147 #ifdef CONFIG_PREEMPT 1147 #ifdef CONFIG_PREEMPT
1148 /* 1148 /*
1149 * During context-switch we hold precisely one spinlock, which 1149 * During context-switch we hold precisely one spinlock, which
1150 * schedule_tail drops. (in the common case it's this_rq()->lock, 1150 * schedule_tail drops. (in the common case it's this_rq()->lock,
1151 * but it also can be p->switch_lock.) So we compensate with a count 1151 * but it also can be p->switch_lock.) So we compensate with a count
1152 * of 1. Also, we want to start with kernel preemption disabled. 1152 * of 1. Also, we want to start with kernel preemption disabled.
1153 */ 1153 */
1154 p->thread_info->preempt_count = 1; 1154 p->thread_info->preempt_count = 1;
1155 #endif 1155 #endif
1156 /* 1156 /*
1157 * Share the timeslice between parent and child, thus the 1157 * Share the timeslice between parent and child, thus the
1158 * total amount of pending timeslices in the system doesn't change, 1158 * total amount of pending timeslices in the system doesn't change,
1159 * resulting in more scheduling fairness. 1159 * resulting in more scheduling fairness.
1160 */ 1160 */
1161 local_irq_disable(); 1161 local_irq_disable();
1162 p->time_slice = (current->time_slice + 1) >> 1; 1162 p->time_slice = (current->time_slice + 1) >> 1;
1163 /* 1163 /*
1164 * The remainder of the first timeslice might be recovered by 1164 * The remainder of the first timeslice might be recovered by
1165 * the parent if the child exits early enough. 1165 * the parent if the child exits early enough.
1166 */ 1166 */
1167 p->first_time_slice = 1; 1167 p->first_time_slice = 1;
1168 current->time_slice >>= 1; 1168 current->time_slice >>= 1;
1169 p->timestamp = sched_clock(); 1169 p->timestamp = sched_clock();
1170 if (unlikely(!current->time_slice)) { 1170 if (unlikely(!current->time_slice)) {
1171 /* 1171 /*
1172 * This case is rare, it happens when the parent has only 1172 * This case is rare, it happens when the parent has only
1173 * a single jiffy left from its timeslice. Taking the 1173 * a single jiffy left from its timeslice. Taking the
1174 * runqueue lock is not a problem. 1174 * runqueue lock is not a problem.
1175 */ 1175 */
1176 current->time_slice = 1; 1176 current->time_slice = 1;
1177 preempt_disable(); 1177 preempt_disable();
1178 scheduler_tick(); 1178 scheduler_tick();
1179 local_irq_enable(); 1179 local_irq_enable();
1180 preempt_enable(); 1180 preempt_enable();
1181 } else 1181 } else
1182 local_irq_enable(); 1182 local_irq_enable();
1183 } 1183 }
1184 1184
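The timeslice split in sched_fork() rounds in the child's favour but never creates time out of thin air: the child's and parent's slices always add back up to the parent's original slice. A quick user-space check of that, with made-up slice lengths (not taken from the patch):

/* Illustrative only: the (s + 1) >> 1 vs s >> 1 split preserves the total. */
#include <assert.h>
#include <stdio.h>

int main(void)
{
        unsigned int slice, child, parent;

        for (slice = 1; slice <= 10; slice++) {
                child  = (slice + 1) >> 1;      /* p->time_slice */
                parent = slice >> 1;            /* current->time_slice */
                assert(child + parent == slice);
                printf("slice %2u -> child %u, parent %u\n", slice, child, parent);
        }
        return 0;
}

slice == 1 is the case where the parent is left with 0: that is exactly the unlikely() branch above, which gives the parent one jiffy back and lets scheduler_tick() expire it cleanly.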
1185 /* 1185 /*
1186 * wake_up_new_task - wake up a newly created task for the first time. 1186 * wake_up_new_task - wake up a newly created task for the first time.
1187 * 1187 *
1188 * This function will do some initial scheduler statistics housekeeping 1188 * This function will do some initial scheduler statistics housekeeping
1189 * that must be done for every newly created context, then puts the task 1189 * that must be done for every newly created context, then puts the task
1190 * on the runqueue and wakes it. 1190 * on the runqueue and wakes it.
1191 */ 1191 */
1192 void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) 1192 void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
1193 { 1193 {
1194 unsigned long flags; 1194 unsigned long flags;
1195 int this_cpu, cpu; 1195 int this_cpu, cpu;
1196 runqueue_t *rq, *this_rq; 1196 runqueue_t *rq, *this_rq;
1197 1197
1198 rq = task_rq_lock(p, &flags); 1198 rq = task_rq_lock(p, &flags);
1199 cpu = task_cpu(p); 1199 cpu = task_cpu(p);
1200 this_cpu = smp_processor_id(); 1200 this_cpu = smp_processor_id();
1201 1201
1202 BUG_ON(p->state != TASK_RUNNING); 1202 BUG_ON(p->state != TASK_RUNNING);
1203 1203
1204 /* 1204 /*
1205 * We decrease the sleep average of forking parents 1205 * We decrease the sleep average of forking parents
1206 * and children as well, to keep max-interactive tasks 1206 * and children as well, to keep max-interactive tasks
1207 * from forking tasks that are max-interactive. The parent 1207 * from forking tasks that are max-interactive. The parent
1208 * (current) is done further down, under its lock. 1208 * (current) is done further down, under its lock.
1209 */ 1209 */
1210 p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * 1210 p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
1211 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); 1211 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1212 1212
1213 p->prio = effective_prio(p); 1213 p->prio = effective_prio(p);
1214 1214
1215 if (likely(cpu == this_cpu)) { 1215 if (likely(cpu == this_cpu)) {
1216 if (!(clone_flags & CLONE_VM)) { 1216 if (!(clone_flags & CLONE_VM)) {
1217 /* 1217 /*
1218 * The VM isn't cloned, so we're in a good position to 1218 * The VM isn't cloned, so we're in a good position to
1219 * do child-runs-first in anticipation of an exec. This 1219 * do child-runs-first in anticipation of an exec. This
1220 * usually avoids a lot of COW overhead. 1220 * usually avoids a lot of COW overhead.
1221 */ 1221 */
1222 if (unlikely(!current->array)) 1222 if (unlikely(!current->array))
1223 __activate_task(p, rq); 1223 __activate_task(p, rq);
1224 else { 1224 else {
1225 p->prio = current->prio; 1225 p->prio = current->prio;
1226 list_add_tail(&p->run_list, &current->run_list); 1226 list_add_tail(&p->run_list, &current->run_list);
1227 p->array = current->array; 1227 p->array = current->array;
1228 p->array->nr_active++; 1228 p->array->nr_active++;
1229 rq->nr_running++; 1229 rq->nr_running++;
1230 } 1230 }
1231 set_need_resched(); 1231 set_need_resched();
1232 } else 1232 } else
1233 /* Run child last */ 1233 /* Run child last */
1234 __activate_task(p, rq); 1234 __activate_task(p, rq);
1235 /* 1235 /*
1236 * We skip the following code due to cpu == this_cpu 1236 * We skip the following code due to cpu == this_cpu
1237 * 1237 *
1238 * task_rq_unlock(rq, &flags); 1238 * task_rq_unlock(rq, &flags);
1239 * this_rq = task_rq_lock(current, &flags); 1239 * this_rq = task_rq_lock(current, &flags);
1240 */ 1240 */
1241 this_rq = rq; 1241 this_rq = rq;
1242 } else { 1242 } else {
1243 this_rq = cpu_rq(this_cpu); 1243 this_rq = cpu_rq(this_cpu);
1244 1244
1245 /* 1245 /*
1246 * Not the local CPU - must adjust timestamp. This should 1246 * Not the local CPU - must adjust timestamp. This should
1247 * get optimised away in the !CONFIG_SMP case. 1247 * get optimised away in the !CONFIG_SMP case.
1248 */ 1248 */
1249 p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) 1249 p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
1250 + rq->timestamp_last_tick; 1250 + rq->timestamp_last_tick;
1251 __activate_task(p, rq); 1251 __activate_task(p, rq);
1252 if (TASK_PREEMPTS_CURR(p, rq)) 1252 if (TASK_PREEMPTS_CURR(p, rq))
1253 resched_task(rq->curr); 1253 resched_task(rq->curr);
1254 1254
1255 /* 1255 /*
1256 * Parent and child are on different CPUs, now get the 1256 * Parent and child are on different CPUs, now get the
1257 * parent runqueue to update the parent's ->sleep_avg: 1257 * parent runqueue to update the parent's ->sleep_avg:
1258 */ 1258 */
1259 task_rq_unlock(rq, &flags); 1259 task_rq_unlock(rq, &flags);
1260 this_rq = task_rq_lock(current, &flags); 1260 this_rq = task_rq_lock(current, &flags);
1261 } 1261 }
1262 current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * 1262 current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
1263 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); 1263 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1264 task_rq_unlock(this_rq, &flags); 1264 task_rq_unlock(this_rq, &flags);
1265 } 1265 }
1266 1266
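The sleep_avg assignment above is mostly integer scaling: the child's interactivity bonus is cut to a percentage (CHILD_PENALTY) and mapped back into sleep_avg units. A rough user-space model of the shape of that arithmetic; the constants below are stand-ins for illustration, not the values from this sched.c, and JIFFIES_TO_NS is omitted:

/* Illustrative only: how the bonus -> sleep_avg rescaling truncates. */
#include <stdio.h>

#define MAX_BONUS       10              /* assumed for this sketch */
#define MAX_SLEEP_AVG   10000000UL      /* assumed; units don't matter here */
#define CHILD_PENALTY   95              /* assumed percentage */

int main(void)
{
        unsigned int bonus;

        for (bonus = 0; bonus <= MAX_BONUS; bonus++)
                printf("bonus %2u -> child sleep_avg %lu\n", bonus,
                       bonus * CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
        return 0;
}

Because every division truncates, the child can only lose credit here, which is the "decrease" the comment above refers to.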
1267 /* 1267 /*
1268 * Potentially available exiting-child timeslices are 1268 * Potentially available exiting-child timeslices are
1269 * retrieved here - this way the parent does not get 1269 * retrieved here - this way the parent does not get
1270 * penalized for creating too many threads. 1270 * penalized for creating too many threads.
1271 * 1271 *
1272 * (this cannot be used to 'generate' timeslices 1272 * (this cannot be used to 'generate' timeslices
1273 * artificially, because any timeslice recovered here 1273 * artificially, because any timeslice recovered here
1274 * was given away by the parent in the first place.) 1274 * was given away by the parent in the first place.)
1275 */ 1275 */
1276 void fastcall sched_exit(task_t * p) 1276 void fastcall sched_exit(task_t * p)
1277 { 1277 {
1278 unsigned long flags; 1278 unsigned long flags;
1279 runqueue_t *rq; 1279 runqueue_t *rq;
1280 1280
1281 /* 1281 /*
1282 * If the child was a (relative-) CPU hog then decrease 1282 * If the child was a (relative-) CPU hog then decrease
1283 * the sleep_avg of the parent as well. 1283 * the sleep_avg of the parent as well.
1284 */ 1284 */
1285 rq = task_rq_lock(p->parent, &flags); 1285 rq = task_rq_lock(p->parent, &flags);
1286 if (p->first_time_slice) { 1286 if (p->first_time_slice) {
1287 p->parent->time_slice += p->time_slice; 1287 p->parent->time_slice += p->time_slice;
1288 if (unlikely(p->parent->time_slice > task_timeslice(p))) 1288 if (unlikely(p->parent->time_slice > task_timeslice(p)))
1289 p->parent->time_slice = task_timeslice(p); 1289 p->parent->time_slice = task_timeslice(p);
1290 } 1290 }
1291 if (p->sleep_avg < p->parent->sleep_avg) 1291 if (p->sleep_avg < p->parent->sleep_avg)
1292 p->parent->sleep_avg = p->parent->sleep_avg / 1292 p->parent->sleep_avg = p->parent->sleep_avg /
1293 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / 1293 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
1294 (EXIT_WEIGHT + 1); 1294 (EXIT_WEIGHT + 1);
1295 task_rq_unlock(rq, &flags); 1295 task_rq_unlock(rq, &flags);
1296 } 1296 }
1297 1297
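The parent/child blend in sched_exit() is a plain weighted average, weighted EXIT_WEIGHT-to-1 in the parent's favour. A user-space rendering with an assumed EXIT_WEIGHT of 3 and invented sleep_avg values (not taken from the patch):

/* Illustrative only: the weighted average applied when a CPU-hog child exits. */
#include <stdio.h>

#define EXIT_WEIGHT 3   /* assumed for illustration */

int main(void)
{
        unsigned long parent = 8000000, child = 4000000;

        if (child < parent)
                parent = parent / (EXIT_WEIGHT + 1) * EXIT_WEIGHT +
                         child / (EXIT_WEIGHT + 1);

        printf("parent sleep_avg after exit: %lu\n", parent);   /* 7000000 */
        return 0;
}

So a child with half the parent's sleep_avg drags the parent down by an eighth in this example.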
1298 /** 1298 /**
1299 * finish_task_switch - clean up after a task-switch 1299 * finish_task_switch - clean up after a task-switch
1300 * @prev: the thread we just switched away from. 1300 * @prev: the thread we just switched away from.
1301 * 1301 *
1302 * We enter this with the runqueue still locked, and finish_arch_switch() 1302 * We enter this with the runqueue still locked, and finish_arch_switch()
1303 * will unlock it along with doing any other architecture-specific cleanup 1303 * will unlock it along with doing any other architecture-specific cleanup
1304 * actions. 1304 * actions.
1305 * 1305 *
1306 * Note that we may have delayed dropping an mm in context_switch(). If 1306 * Note that we may have delayed dropping an mm in context_switch(). If
1307 * so, we finish that here outside of the runqueue lock. (Doing it 1307 * so, we finish that here outside of the runqueue lock. (Doing it
1308 * with the lock held can cause deadlocks; see schedule() for 1308 * with the lock held can cause deadlocks; see schedule() for
1309 * details.) 1309 * details.)
1310 */ 1310 */
1311 static inline void finish_task_switch(task_t *prev) 1311 static inline void finish_task_switch(task_t *prev)
1312 __releases(rq->lock) 1312 __releases(rq->lock)
1313 { 1313 {
1314 runqueue_t *rq = this_rq(); 1314 runqueue_t *rq = this_rq();
1315 struct mm_struct *mm = rq->prev_mm; 1315 struct mm_struct *mm = rq->prev_mm;
1316 unsigned long prev_task_flags; 1316 unsigned long prev_task_flags;
1317 1317
1318 rq->prev_mm = NULL; 1318 rq->prev_mm = NULL;
1319 1319
1320 /* 1320 /*
1321 * A task struct has one reference for the use as "current". 1321 * A task struct has one reference for the use as "current".
1322 * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and 1322 * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and
1323 * calls schedule one last time. The schedule call will never return, 1323 * calls schedule one last time. The schedule call will never return,
1324 * and the scheduled task must drop that reference. 1324 * and the scheduled task must drop that reference.
1325 * The test for EXIT_ZOMBIE must occur while the runqueue locks are 1325 * The test for EXIT_ZOMBIE must occur while the runqueue locks are
1326 * still held, otherwise prev could be scheduled on another cpu, die 1326 * still held, otherwise prev could be scheduled on another cpu, die
1327 * there before we look at prev->state, and then the reference would 1327 * there before we look at prev->state, and then the reference would
1328 * be dropped twice. 1328 * be dropped twice.
1329 * Manfred Spraul <manfred@colorfullife.com> 1329 * Manfred Spraul <manfred@colorfullife.com>
1330 */ 1330 */
1331 prev_task_flags = prev->flags; 1331 prev_task_flags = prev->flags;
1332 finish_arch_switch(rq, prev); 1332 finish_arch_switch(rq, prev);
1333 if (mm) 1333 if (mm)
1334 mmdrop(mm); 1334 mmdrop(mm);
1335 if (unlikely(prev_task_flags & PF_DEAD)) 1335 if (unlikely(prev_task_flags & PF_DEAD))
1336 put_task_struct(prev); 1336 put_task_struct(prev);
1337 } 1337 }
1338 1338
1339 /** 1339 /**
1340 * schedule_tail - first thing a freshly forked thread must call. 1340 * schedule_tail - first thing a freshly forked thread must call.
1341 * @prev: the thread we just switched away from. 1341 * @prev: the thread we just switched away from.
1342 */ 1342 */
1343 asmlinkage void schedule_tail(task_t *prev) 1343 asmlinkage void schedule_tail(task_t *prev)
1344 __releases(rq->lock) 1344 __releases(rq->lock)
1345 { 1345 {
1346 finish_task_switch(prev); 1346 finish_task_switch(prev);
1347 1347
1348 if (current->set_child_tid) 1348 if (current->set_child_tid)
1349 put_user(current->pid, current->set_child_tid); 1349 put_user(current->pid, current->set_child_tid);
1350 } 1350 }
1351 1351
1352 /* 1352 /*
1353 * context_switch - switch to the new MM and the new 1353 * context_switch - switch to the new MM and the new
1354 * thread's register state. 1354 * thread's register state.
1355 */ 1355 */
1356 static inline 1356 static inline
1357 task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) 1357 task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
1358 { 1358 {
1359 struct mm_struct *mm = next->mm; 1359 struct mm_struct *mm = next->mm;
1360 struct mm_struct *oldmm = prev->active_mm; 1360 struct mm_struct *oldmm = prev->active_mm;
1361 1361
1362 if (unlikely(!mm)) { 1362 if (unlikely(!mm)) {
1363 next->active_mm = oldmm; 1363 next->active_mm = oldmm;
1364 atomic_inc(&oldmm->mm_count); 1364 atomic_inc(&oldmm->mm_count);
1365 enter_lazy_tlb(oldmm, next); 1365 enter_lazy_tlb(oldmm, next);
1366 } else 1366 } else
1367 switch_mm(oldmm, mm, next); 1367 switch_mm(oldmm, mm, next);
1368 1368
1369 if (unlikely(!prev->mm)) { 1369 if (unlikely(!prev->mm)) {
1370 prev->active_mm = NULL; 1370 prev->active_mm = NULL;
1371 WARN_ON(rq->prev_mm); 1371 WARN_ON(rq->prev_mm);
1372 rq->prev_mm = oldmm; 1372 rq->prev_mm = oldmm;
1373 } 1373 }
1374 1374
1375 /* Here we just switch the register state and the stack. */ 1375 /* Here we just switch the register state and the stack. */
1376 switch_to(prev, next, prev); 1376 switch_to(prev, next, prev);
1377 1377
1378 return prev; 1378 return prev;
1379 } 1379 }
1380 1380
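context_switch() lets a kernel thread (mm == NULL) borrow the outgoing task's active_mm and takes a reference on it; finish_task_switch() above is where that reference is finally dropped, outside the runqueue lock. A toy user-space model of just the reference bookkeeping; the struct and helpers are invented for illustration and are not kernel APIs:

/* Illustrative only: the grab-on-borrow / drop-after-switch pairing. */
#include <assert.h>
#include <stdio.h>

struct toy_mm { int mm_count; };                        /* stand-in for mm_struct */

static void toy_grab(struct toy_mm *mm) { mm->mm_count++; }  /* atomic_inc() above */
static void toy_drop(struct toy_mm *mm) { mm->mm_count--; }  /* mmdrop() above */

int main(void)
{
        struct toy_mm user_mm = { 1 };  /* the user task's mm */
        struct toy_mm *prev_mm;

        /* user task -> kernel thread: the kernel thread borrows active_mm */
        toy_grab(&user_mm);

        /* kernel thread -> someone else: context_switch() parks the mm on the rq */
        prev_mm = &user_mm;             /* rq->prev_mm = oldmm */

        /* finish_task_switch() drops it once the runqueue lock is released */
        toy_drop(prev_mm);

        assert(user_mm.mm_count == 1);
        printf("mm_count back to %d\n", user_mm.mm_count);
        return 0;
}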
1381 /* 1381 /*
1382 * nr_running, nr_uninterruptible and nr_context_switches: 1382 * nr_running, nr_uninterruptible and nr_context_switches:
1383 * 1383 *
1384 * externally visible scheduler statistics: current number of runnable 1384 * externally visible scheduler statistics: current number of runnable
1385 * threads, current number of uninterruptible-sleeping threads, total 1385 * threads, current number of uninterruptible-sleeping threads, total
1386 * number of context switches performed since bootup. 1386 * number of context switches performed since bootup.
1387 */ 1387 */
1388 unsigned long nr_running(void) 1388 unsigned long nr_running(void)
1389 { 1389 {
1390 unsigned long i, sum = 0; 1390 unsigned long i, sum = 0;
1391 1391
1392 for_each_online_cpu(i) 1392 for_each_online_cpu(i)
1393 sum += cpu_rq(i)->nr_running; 1393 sum += cpu_rq(i)->nr_running;
1394 1394
1395 return sum; 1395 return sum;
1396 } 1396 }
1397 1397
1398 unsigned long nr_uninterruptible(void) 1398 unsigned long nr_uninterruptible(void)
1399 { 1399 {
1400 unsigned long i, sum = 0; 1400 unsigned long i, sum = 0;
1401 1401
1402 for_each_cpu(i) 1402 for_each_cpu(i)
1403 sum += cpu_rq(i)->nr_uninterruptible; 1403 sum += cpu_rq(i)->nr_uninterruptible;
1404 1404
1405 /* 1405 /*
1406 * Since we read the counters lockless, it might be slightly 1406 * Since we read the counters lockless, it might be slightly
1407 * inaccurate. Do not allow it to go below zero though: 1407 * inaccurate. Do not allow it to go below zero though:
1408 */ 1408 */
1409 if (unlikely((long)sum < 0)) 1409 if (unlikely((long)sum < 0))
1410 sum = 0; 1410 sum = 0;
1411 1411
1412 return sum; 1412 return sum;
1413 } 1413 }
1414 1414
1415 unsigned long long nr_context_switches(void) 1415 unsigned long long nr_context_switches(void)
1416 { 1416 {
1417 unsigned long long i, sum = 0; 1417 unsigned long long i, sum = 0;
1418 1418
1419 for_each_cpu(i) 1419 for_each_cpu(i)
1420 sum += cpu_rq(i)->nr_switches; 1420 sum += cpu_rq(i)->nr_switches;
1421 1421
1422 return sum; 1422 return sum;
1423 } 1423 }
1424 1424
1425 unsigned long nr_iowait(void) 1425 unsigned long nr_iowait(void)
1426 { 1426 {
1427 unsigned long i, sum = 0; 1427 unsigned long i, sum = 0;
1428 1428
1429 for_each_cpu(i) 1429 for_each_cpu(i)
1430 sum += atomic_read(&cpu_rq(i)->nr_iowait); 1430 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1431 1431
1432 return sum; 1432 return sum;
1433 } 1433 }
1434 1434
1435 #ifdef CONFIG_SMP 1435 #ifdef CONFIG_SMP
1436 1436
1437 /* 1437 /*
1438 * double_rq_lock - safely lock two runqueues 1438 * double_rq_lock - safely lock two runqueues
1439 * 1439 *
1440 * Note this does not disable interrupts like task_rq_lock, 1440 * Note this does not disable interrupts like task_rq_lock,
1441 * you need to do so manually before calling. 1441 * you need to do so manually before calling.
1442 */ 1442 */
1443 static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) 1443 static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1444 __acquires(rq1->lock) 1444 __acquires(rq1->lock)
1445 __acquires(rq2->lock) 1445 __acquires(rq2->lock)
1446 { 1446 {
1447 if (rq1 == rq2) { 1447 if (rq1 == rq2) {
1448 spin_lock(&rq1->lock); 1448 spin_lock(&rq1->lock);
1449 __acquire(rq2->lock); /* Fake it out ;) */ 1449 __acquire(rq2->lock); /* Fake it out ;) */
1450 } else { 1450 } else {
1451 if (rq1 < rq2) { 1451 if (rq1 < rq2) {
1452 spin_lock(&rq1->lock); 1452 spin_lock(&rq1->lock);
1453 spin_lock(&rq2->lock); 1453 spin_lock(&rq2->lock);
1454 } else { 1454 } else {
1455 spin_lock(&rq2->lock); 1455 spin_lock(&rq2->lock);
1456 spin_lock(&rq1->lock); 1456 spin_lock(&rq1->lock);
1457 } 1457 }
1458 } 1458 }
1459 } 1459 }
1460 1460
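double_rq_lock() and double_lock_balance() below rely on the same trick: when two runqueue locks are needed, always take the lower-addressed one first, so two CPUs grabbing the same pair can never deadlock against each other. The same idea with plain pthread mutexes, as an illustrative sketch rather than kernel code:

/* Illustrative only: address-ordered acquisition of a pair of locks. */
#include <pthread.h>

static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a == b) {
                pthread_mutex_lock(a);          /* only one lock to take */
        } else if (a < b) {                     /* same test as rq1 < rq2 above */
                pthread_mutex_lock(a);
                pthread_mutex_lock(b);
        } else {
                pthread_mutex_lock(b);
                pthread_mutex_lock(a);
        }
}

static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
        pthread_mutex_unlock(a);
        if (a != b)
                pthread_mutex_unlock(b);
}

int main(void)
{
        pthread_mutex_t rq1 = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_t rq2 = PTHREAD_MUTEX_INITIALIZER;

        /* either call order acquires the mutexes in the same underlying order */
        lock_pair(&rq1, &rq2);
        unlock_pair(&rq1, &rq2);
        lock_pair(&rq2, &rq1);
        unlock_pair(&rq2, &rq1);
        return 0;
}

Callers can name the two queues in either order; the acquisition order comes out the same, which is what rules out the AB-BA deadlock.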
1461 /* 1461 /*
1462 * double_rq_unlock - safely unlock two runqueues 1462 * double_rq_unlock - safely unlock two runqueues
1463 * 1463 *
1464 * Note this does not restore interrupts like task_rq_unlock, 1464 * Note this does not restore interrupts like task_rq_unlock,
1465 * you need to do so manually after calling. 1465 * you need to do so manually after calling.
1466 */ 1466 */
1467 static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) 1467 static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
1468 __releases(rq1->lock) 1468 __releases(rq1->lock)
1469 __releases(rq2->lock) 1469 __releases(rq2->lock)
1470 { 1470 {
1471 spin_unlock(&rq1->lock); 1471 spin_unlock(&rq1->lock);
1472 if (rq1 != rq2) 1472 if (rq1 != rq2)
1473 spin_unlock(&rq2->lock); 1473 spin_unlock(&rq2->lock);
1474 else 1474 else
1475 __release(rq2->lock); 1475 __release(rq2->lock);
1476 } 1476 }
1477 1477
1478 /* 1478 /*
1479 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 1479 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1480 */ 1480 */
1481 static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) 1481 static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1482 __releases(this_rq->lock) 1482 __releases(this_rq->lock)
1483 __acquires(busiest->lock) 1483 __acquires(busiest->lock)
1484 __acquires(this_rq->lock) 1484 __acquires(this_rq->lock)
1485 { 1485 {
1486 if (unlikely(!spin_trylock(&busiest->lock))) { 1486 if (unlikely(!spin_trylock(&busiest->lock))) {
1487 if (busiest < this_rq) { 1487 if (busiest < this_rq) {
1488 spin_unlock(&this_rq->lock); 1488 spin_unlock(&this_rq->lock);
1489 spin_lock(&busiest->lock); 1489 spin_lock(&busiest->lock);
1490 spin_lock(&this_rq->lock); 1490 spin_lock(&this_rq->lock);
1491 } else 1491 } else
1492 spin_lock(&busiest->lock); 1492 spin_lock(&busiest->lock);
1493 } 1493 }
1494 } 1494 }
1495 1495
1496 /* 1496 /*
1497 * find_idlest_cpu - find the least busy runqueue. 1497 * find_idlest_cpu - find the least busy runqueue.
1498 */ 1498 */
1499 static int find_idlest_cpu(struct task_struct *p, int this_cpu, 1499 static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1500 struct sched_domain *sd) 1500 struct sched_domain *sd)
1501 { 1501 {
1502 unsigned long load, min_load, this_load; 1502 unsigned long load, min_load, this_load;
1503 int i, min_cpu; 1503 int i, min_cpu;
1504 cpumask_t mask; 1504 cpumask_t mask;
1505 1505
1506 min_cpu = UINT_MAX; 1506 min_cpu = UINT_MAX;
1507 min_load = ULONG_MAX; 1507 min_load = ULONG_MAX;
1508 1508
1509 cpus_and(mask, sd->span, p->cpus_allowed); 1509 cpus_and(mask, sd->span, p->cpus_allowed);
1510 1510
1511 for_each_cpu_mask(i, mask) { 1511 for_each_cpu_mask(i, mask) {
1512 load = target_load(i); 1512 load = target_load(i);
1513 1513
1514 if (load < min_load) { 1514 if (load < min_load) {
1515 min_cpu = i; 1515 min_cpu = i;
1516 min_load = load; 1516 min_load = load;
1517 1517
1518 /* break out early on an idle CPU: */ 1518 /* break out early on an idle CPU: */
1519 if (!min_load) 1519 if (!min_load)
1520 break; 1520 break;
1521 } 1521 }
1522 } 1522 }
1523 1523
1524 /* add +1 to account for the new task */ 1524 /* add +1 to account for the new task */
1525 this_load = source_load(this_cpu) + SCHED_LOAD_SCALE; 1525 this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
1526 1526
1527 /* 1527 /*
1528 * Would with the addition of the new task to the 1528 * Would with the addition of the new task to the
1529 * current CPU there be an imbalance between this 1529 * current CPU there be an imbalance between this
1530 * CPU and the idlest CPU? 1530 * CPU and the idlest CPU?
1531 * 1531 *
1532 * Use half of the balancing threshold - new-context is 1532 * Use half of the balancing threshold - new-context is
1533 * a good opportunity to balance. 1533 * a good opportunity to balance.
1534 */ 1534 */
1535 if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100) 1535 if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
1536 return min_cpu; 1536 return min_cpu;
1537 1537
1538 return this_cpu; 1538 return this_cpu;
1539 } 1539 }
1540 1540
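find_idlest_cpu() only places the new task away from this_cpu when doing so clears half of the domain's usual imbalance threshold. A user-space helper showing that comparison, with an assumed imbalance_pct of 125 and an assumed SCHED_LOAD_SCALE of 128 (not taken from the patch):

/* Illustrative only: the half-threshold test at the end of find_idlest_cpu(). */
#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL  /* assumed scale for this sketch */

static int prefer_idlest(unsigned long min_load, unsigned long this_load,
                         unsigned int imbalance_pct)
{
        /* this_load already includes +SCHED_LOAD_SCALE for the new task */
        return min_load * (100 + (imbalance_pct - 100) / 2) < this_load * 100;
}

int main(void)
{
        unsigned long this_load = 2 * SCHED_LOAD_SCALE + SCHED_LOAD_SCALE;
        unsigned long min_load  = 2 * SCHED_LOAD_SCALE;

        /* 256 * 112 = 28672 < 384 * 100 = 38400 -> spread to the idlest CPU */
        printf("use idlest cpu: %d\n", prefer_idlest(min_load, this_load, 125));
        return 0;
}

With an imbalance_pct of 125 the factor is 112, i.e. this_cpu (counting the new task) has to look a bit over 12% busier than the idlest candidate before the task is placed remotely.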
1541 /* 1541 /*
1542 * If dest_cpu is allowed for this process, migrate the task to it. 1542 * If dest_cpu is allowed for this process, migrate the task to it.
1543 * This is accomplished by forcing the cpu_allowed mask to only 1543 * This is accomplished by forcing the cpu_allowed mask to only
1544 * allow dest_cpu, which will force the cpu onto dest_cpu. Then 1544 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
1545 * the cpu_allowed mask is restored. 1545 * the cpu_allowed mask is restored.
1546 */ 1546 */
1547 static void sched_migrate_task(task_t *p, int dest_cpu) 1547 static void sched_migrate_task(task_t *p, int dest_cpu)
1548 { 1548 {
1549 migration_req_t req; 1549 migration_req_t req;
1550 runqueue_t *rq; 1550 runqueue_t *rq;
1551 unsigned long flags; 1551 unsigned long flags;
1552 1552
1553 rq = task_rq_lock(p, &flags); 1553 rq = task_rq_lock(p, &flags);
1554 if (!cpu_isset(dest_cpu, p->cpus_allowed) 1554 if (!cpu_isset(dest_cpu, p->cpus_allowed)
1555 || unlikely(cpu_is_offline(dest_cpu))) 1555 || unlikely(cpu_is_offline(dest_cpu)))
1556 goto out; 1556 goto out;
1557 1557
1558 /* force the process onto the specified CPU */ 1558 /* force the process onto the specified CPU */
1559 if (migrate_task(p, dest_cpu, &req)) { 1559 if (migrate_task(p, dest_cpu, &req)) {
1560 /* Need to wait for migration thread (might exit: take ref). */ 1560 /* Need to wait for migration thread (might exit: take ref). */
1561 struct task_struct *mt = rq->migration_thread; 1561 struct task_struct *mt = rq->migration_thread;
1562 get_task_struct(mt); 1562 get_task_struct(mt);
1563 task_rq_unlock(rq, &flags); 1563 task_rq_unlock(rq, &flags);
1564 wake_up_process(mt); 1564 wake_up_process(mt);
1565 put_task_struct(mt); 1565 put_task_struct(mt);
1566 wait_for_completion(&req.done); 1566 wait_for_completion(&req.done);
1567 return; 1567 return;
1568 } 1568 }
1569 out: 1569 out:
1570 task_rq_unlock(rq, &flags); 1570 task_rq_unlock(rq, &flags);
1571 } 1571 }
1572 1572
1573 /* 1573 /*
1574 * sched_exec(): find the highest-level, exec-balance-capable 1574 * sched_exec(): find the highest-level, exec-balance-capable
1575 * domain and try to migrate the task to the least loaded CPU. 1575 * domain and try to migrate the task to the least loaded CPU.
1576 * 1576 *
1577 * execve() is a valuable balancing opportunity, because at this point 1577 * execve() is a valuable balancing opportunity, because at this point
1578 * the task has the smallest effective memory and cache footprint. 1578 * the task has the smallest effective memory and cache footprint.
1579 */ 1579 */
1580 void sched_exec(void) 1580 void sched_exec(void)
1581 { 1581 {
1582 struct sched_domain *tmp, *sd = NULL; 1582 struct sched_domain *tmp, *sd = NULL;
1583 int new_cpu, this_cpu = get_cpu(); 1583 int new_cpu, this_cpu = get_cpu();
1584 1584
1585 /* Prefer the current CPU if there's only this task running */ 1585 /* Prefer the current CPU if there's only this task running */
1586 if (this_rq()->nr_running <= 1) 1586 if (this_rq()->nr_running <= 1)
1587 goto out; 1587 goto out;
1588 1588
1589 for_each_domain(this_cpu, tmp) 1589 for_each_domain(this_cpu, tmp)
1590 if (tmp->flags & SD_BALANCE_EXEC) 1590 if (tmp->flags & SD_BALANCE_EXEC)
1591 sd = tmp; 1591 sd = tmp;
1592 1592
1593 if (sd) { 1593 if (sd) {
1594 schedstat_inc(sd, sbe_attempts); 1594 schedstat_inc(sd, sbe_attempts);
1595 new_cpu = find_idlest_cpu(current, this_cpu, sd); 1595 new_cpu = find_idlest_cpu(current, this_cpu, sd);
1596 if (new_cpu != this_cpu) { 1596 if (new_cpu != this_cpu) {
1597 schedstat_inc(sd, sbe_pushed); 1597 schedstat_inc(sd, sbe_pushed);
1598 put_cpu(); 1598 put_cpu();
1599 sched_migrate_task(current, new_cpu); 1599 sched_migrate_task(current, new_cpu);
1600 return; 1600 return;
1601 } 1601 }
1602 } 1602 }
1603 out: 1603 out:
1604 put_cpu(); 1604 put_cpu();
1605 } 1605 }
1606 1606
1607 /* 1607 /*
1608 * pull_task - move a task from a remote runqueue to the local runqueue. 1608 * pull_task - move a task from a remote runqueue to the local runqueue.
1609 * Both runqueues must be locked. 1609 * Both runqueues must be locked.
1610 */ 1610 */
1611 static inline 1611 static inline
1612 void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, 1612 void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1613 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) 1613 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1614 { 1614 {
1615 dequeue_task(p, src_array); 1615 dequeue_task(p, src_array);
1616 src_rq->nr_running--; 1616 src_rq->nr_running--;
1617 set_task_cpu(p, this_cpu); 1617 set_task_cpu(p, this_cpu);
1618 this_rq->nr_running++; 1618 this_rq->nr_running++;
1619 enqueue_task(p, this_array); 1619 enqueue_task(p, this_array);
1620 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) 1620 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1621 + this_rq->timestamp_last_tick; 1621 + this_rq->timestamp_last_tick;
1622 /* 1622 /*
1623 * Note that idle threads have a prio of MAX_PRIO, for this test 1623 * Note that idle threads have a prio of MAX_PRIO, for this test
1624 * to be always true for them. 1624 * to be always true for them.
1625 */ 1625 */
1626 if (TASK_PREEMPTS_CURR(p, this_rq)) 1626 if (TASK_PREEMPTS_CURR(p, this_rq))
1627 resched_task(this_rq->curr); 1627 resched_task(this_rq->curr);
1628 } 1628 }
1629 1629
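The timestamp fixup in pull_task() (the same expression appears in wake_up_new_task() above) converts p->timestamp from the source CPU's sched_clock() domain into the destination's by preserving its offset from timestamp_last_tick. A small user-space check with invented clock values, not taken from the patch:

/* Illustrative only: rebasing a timestamp between two per-CPU clocks. */
#include <assert.h>
#include <stdio.h>

int main(void)
{
        unsigned long long src_last_tick = 5000000000ULL;  /* source CPU clock */
        unsigned long long dst_last_tick = 7300000000ULL;  /* destination clock */
        unsigned long long timestamp     = 4999000000ULL;  /* 1ms before src tick */

        unsigned long long rebased = (timestamp - src_last_tick) + dst_last_tick;

        /* the offset from the last tick is what survives the move */
        assert(dst_last_tick - rebased == src_last_tick - timestamp);
        printf("rebased timestamp: %llu\n", rebased);   /* 7299000000 */
        return 0;
}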
1630 /* 1630 /*
1631 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 1631 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1632 */ 1632 */
1633 static inline 1633 static inline
1634 int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, 1634 int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1635 struct sched_domain *sd, enum idle_type idle) 1635 struct sched_domain *sd, enum idle_type idle)
1636 { 1636 {
1637 /* 1637 /*
1638 * We do not migrate tasks that are: 1638 * We do not migrate tasks that are:
1639 * 1) running (obviously), or 1639 * 1) running (obviously), or
1640 * 2) cannot be migrated to this CPU due to cpus_allowed, or 1640 * 2) cannot be migrated to this CPU due to cpus_allowed, or
1641 * 3) are cache-hot on their current CPU. 1641 * 3) are cache-hot on their current CPU.
1642 */ 1642 */
1643 if (task_running(rq, p)) 1643 if (task_running(rq, p))
1644 return 0; 1644 return 0;
1645 if (!cpu_isset(this_cpu, p->cpus_allowed)) 1645 if (!cpu_isset(this_cpu, p->cpus_allowed))
1646 return 0; 1646 return 0;
1647 1647
1648 /* 1648 /*
1649 * Aggressive migration if: 1649 * Aggressive migration if:
1650 * 1) the [whole] cpu is idle, or 1650 * 1) the [whole] cpu is idle, or
1651 * 2) too many balance attempts have failed. 1651 * 2) too many balance attempts have failed.
1652 */ 1652 */
1653 1653
1654 if (cpu_and_siblings_are_idle(this_cpu) || \ 1654 if (cpu_and_siblings_are_idle(this_cpu) || \
1655 sd->nr_balance_failed > sd->cache_nice_tries) 1655 sd->nr_balance_failed > sd->cache_nice_tries)
1656 return 1; 1656 return 1;
1657 1657
1658 if (task_hot(p, rq->timestamp_last_tick, sd)) 1658 if (task_hot(p, rq->timestamp_last_tick, sd))
1659 return 0; 1659 return 0;
1660 return 1; 1660 return 1;
1661 } 1661 }
1662 1662
1663 /* 1663 /*
1664 * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, 1664 * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
1665 * as part of a balancing operation within "domain". Returns the number of 1665 * as part of a balancing operation within "domain". Returns the number of
1666 * tasks moved. 1666 * tasks moved.
1667 * 1667 *
1668 * Called with both runqueues locked. 1668 * Called with both runqueues locked.
1669 */ 1669 */
1670 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, 1670 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
1671 unsigned long max_nr_move, struct sched_domain *sd, 1671 unsigned long max_nr_move, struct sched_domain *sd,
1672 enum idle_type idle) 1672 enum idle_type idle)
1673 { 1673 {
1674 prio_array_t *array, *dst_array; 1674 prio_array_t *array, *dst_array;
1675 struct list_head *head, *curr; 1675 struct list_head *head, *curr;
1676 int idx, pulled = 0; 1676 int idx, pulled = 0;
1677 task_t *tmp; 1677 task_t *tmp;
1678 1678
1679 if (max_nr_move <= 0 || busiest->nr_running <= 1) 1679 if (max_nr_move <= 0 || busiest->nr_running <= 1)
1680 goto out; 1680 goto out;
1681 1681
1682 /* 1682 /*
1683 * We first consider expired tasks. Those will likely not be 1683 * We first consider expired tasks. Those will likely not be
1684 * executed in the near future, and they are most likely to 1684 * executed in the near future, and they are most likely to
1685 * be cache-cold, thus switching CPUs has the least effect 1685 * be cache-cold, thus switching CPUs has the least effect
1686 * on them. 1686 * on them.
1687 */ 1687 */
1688 if (busiest->expired->nr_active) { 1688 if (busiest->expired->nr_active) {
1689 array = busiest->expired; 1689 array = busiest->expired;
1690 dst_array = this_rq->expired; 1690 dst_array = this_rq->expired;
1691 } else { 1691 } else {
1692 array = busiest->active; 1692 array = busiest->active;
1693 dst_array = this_rq->active; 1693 dst_array = this_rq->active;
1694 } 1694 }
1695 1695
1696 new_array: 1696 new_array:
1697 /* Start searching at priority 0: */ 1697 /* Start searching at priority 0: */
1698 idx = 0; 1698 idx = 0;
1699 skip_bitmap: 1699 skip_bitmap:
1700 if (!idx) 1700 if (!idx)
1701 idx = sched_find_first_bit(array->bitmap); 1701 idx = sched_find_first_bit(array->bitmap);
1702 else 1702 else
1703 idx = find_next_bit(array->bitmap, MAX_PRIO, idx); 1703 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1704 if (idx >= MAX_PRIO) { 1704 if (idx >= MAX_PRIO) {
1705 if (array == busiest->expired && busiest->active->nr_active) { 1705 if (array == busiest->expired && busiest->active->nr_active) {
1706 array = busiest->active; 1706 array = busiest->active;
1707 dst_array = this_rq->active; 1707 dst_array = this_rq->active;
1708 goto new_array; 1708 goto new_array;
1709 } 1709 }
1710 goto out; 1710 goto out;
1711 } 1711 }
1712 1712
1713 head = array->queue + idx; 1713 head = array->queue + idx;
1714 curr = head->prev; 1714 curr = head->prev;
1715 skip_queue: 1715 skip_queue:
1716 tmp = list_entry(curr, task_t, run_list); 1716 tmp = list_entry(curr, task_t, run_list);
1717 1717
1718 curr = curr->prev; 1718 curr = curr->prev;
1719 1719
1720 if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { 1720 if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
1721 if (curr != head) 1721 if (curr != head)
1722 goto skip_queue; 1722 goto skip_queue;
1723 idx++; 1723 idx++;
1724 goto skip_bitmap; 1724 goto skip_bitmap;
1725 } 1725 }
1726 1726
1727 #ifdef CONFIG_SCHEDSTATS 1727 #ifdef CONFIG_SCHEDSTATS
1728 if (task_hot(tmp, busiest->timestamp_last_tick, sd)) 1728 if (task_hot(tmp, busiest->timestamp_last_tick, sd))
1729 schedstat_inc(sd, lb_hot_gained[idle]); 1729 schedstat_inc(sd, lb_hot_gained[idle]);
1730 #endif 1730 #endif
1731 1731
1732 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); 1732 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
1733 pulled++; 1733 pulled++;
1734 1734
1735 /* We only want to steal up to the prescribed number of tasks. */ 1735 /* We only want to steal up to the prescribed number of tasks. */
1736 if (pulled < max_nr_move) { 1736 if (pulled < max_nr_move) {
1737 if (curr != head) 1737 if (curr != head)
1738 goto skip_queue; 1738 goto skip_queue;
1739 idx++; 1739 idx++;
1740 goto skip_bitmap; 1740 goto skip_bitmap;
1741 } 1741 }
1742 out: 1742 out:
1743 /* 1743 /*
1744 * Right now, this is the only place pull_task() is called, 1744 * Right now, this is the only place pull_task() is called,
1745 * so we can safely collect pull_task() stats here rather than 1745 * so we can safely collect pull_task() stats here rather than
1746 * inside pull_task(). 1746 * inside pull_task().
1747 */ 1747 */
1748 schedstat_add(sd, lb_gained[idle], pulled); 1748 schedstat_add(sd, lb_gained[idle], pulled);
1749 return pulled; 1749 return pulled;
1750 } 1750 }
1751 1751
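The skip_bitmap/skip_queue loop in move_tasks() walks the priority bitmap from bit 0 upwards, so the highest-priority non-empty queue is considered first, and walks each queue from its tail end, i.e. starting with the tasks furthest from being scheduled next. A shrunken user-space sketch of just the bitmap walk, using one 64-bit word instead of the real MAX_PRIO-sized bitmap and simple shifts instead of sched_find_first_bit()/find_next_bit():

/* Illustrative only: scanning non-empty priority queues, lowest index first. */
#include <stdio.h>

#define MAX_PRIO 64     /* shrunk to one word for the sketch */

int main(void)
{
        unsigned long long bitmap = 0;
        int idx;

        bitmap |= 1ULL << 3;    /* pretend prio 3 and prio 42 are non-empty */
        bitmap |= 1ULL << 42;

        for (idx = 0; idx < MAX_PRIO; idx++) {
                if (!(bitmap & (1ULL << idx)))
                        continue;
                printf("would scan prio queue %d from its tail\n", idx);
        }
        return 0;
}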
1752 /* 1752 /*
1753 * find_busiest_group finds and returns the busiest CPU group within the 1753 * find_busiest_group finds and returns the busiest CPU group within the
1754 * domain. It calculates and returns the number of tasks which should be 1754 * domain. It calculates and returns the number of tasks which should be
1755 * moved to restore balance via the imbalance parameter. 1755 * moved to restore balance via the imbalance parameter.
1756 */ 1756 */
1757 static struct sched_group * 1757 static struct sched_group *
1758 find_busiest_group(struct sched_domain *sd, int this_cpu, 1758 find_busiest_group(struct sched_domain *sd, int this_cpu,
1759 unsigned long *imbalance, enum idle_type idle) 1759 unsigned long *imbalance, enum idle_type idle)
1760 { 1760 {
1761 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 1761 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1762 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 1762 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1763 1763
1764 max_load = this_load = total_load = total_pwr = 0; 1764 max_load = this_load = total_load = total_pwr = 0;
1765 1765
1766 do { 1766 do {
1767 unsigned long load; 1767 unsigned long load;
1768 int local_group; 1768 int local_group;
1769 int i; 1769 int i;
1770 1770
1771 local_group = cpu_isset(this_cpu, group->cpumask); 1771 local_group = cpu_isset(this_cpu, group->cpumask);
1772 1772
1773 /* Tally up the load of all CPUs in the group */ 1773 /* Tally up the load of all CPUs in the group */
1774 avg_load = 0; 1774 avg_load = 0;
1775 1775
1776 for_each_cpu_mask(i, group->cpumask) { 1776 for_each_cpu_mask(i, group->cpumask) {
1777 /* Bias balancing toward cpus of our domain */ 1777 /* Bias balancing toward cpus of our domain */
1778 if (local_group) 1778 if (local_group)
1779 load = target_load(i); 1779 load = target_load(i);
1780 else 1780 else
1781 load = source_load(i); 1781 load = source_load(i);
1782 1782
1783 avg_load += load; 1783 avg_load += load;
1784 } 1784 }
1785 1785
1786 total_load += avg_load; 1786 total_load += avg_load;
1787 total_pwr += group->cpu_power; 1787 total_pwr += group->cpu_power;
1788 1788
1789 /* Adjust by relative CPU power of the group */ 1789 /* Adjust by relative CPU power of the group */
1790 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; 1790 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1791 1791
1792 if (local_group) { 1792 if (local_group) {
1793 this_load = avg_load; 1793 this_load = avg_load;
1794 this = group; 1794 this = group;
1795 goto nextgroup; 1795 goto nextgroup;
1796 } else if (avg_load > max_load) { 1796 } else if (avg_load > max_load) {
1797 max_load = avg_load; 1797 max_load = avg_load;
1798 busiest = group; 1798 busiest = group;
1799 } 1799 }
1800 nextgroup: 1800 nextgroup:
1801 group = group->next; 1801 group = group->next;
1802 } while (group != sd->groups); 1802 } while (group != sd->groups);
1803 1803
1804 if (!busiest || this_load >= max_load) 1804 if (!busiest || this_load >= max_load)
1805 goto out_balanced; 1805 goto out_balanced;
1806 1806
1807 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; 1807 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
1808 1808
1809 if (this_load >= avg_load || 1809 if (this_load >= avg_load ||
1810 100*max_load <= sd->imbalance_pct*this_load) 1810 100*max_load <= sd->imbalance_pct*this_load)
1811 goto out_balanced; 1811 goto out_balanced;
1812 1812
1813 /* 1813 /*
1814 * We're trying to get all the cpus to the average_load, so we don't 1814 * We're trying to get all the cpus to the average_load, so we don't
1815 * want to push ourselves above the average load, nor do we wish to 1815 * want to push ourselves above the average load, nor do we wish to
1816 * reduce the max loaded cpu below the average load, as either of these 1816 * reduce the max loaded cpu below the average load, as either of these
1817 * actions would just result in more rebalancing later, and ping-pong 1817 * actions would just result in more rebalancing later, and ping-pong
1818 * tasks around. Thus we look for the minimum possible imbalance. 1818 * tasks around. Thus we look for the minimum possible imbalance.
1819 * Negative imbalances (*we* are more loaded than anyone else) will 1819 * Negative imbalances (*we* are more loaded than anyone else) will
1820 * be counted as no imbalance for these purposes -- we can't fix that 1820 * be counted as no imbalance for these purposes -- we can't fix that
1821 * by pulling tasks to us. Be careful of negative numbers as they'll 1821 * by pulling tasks to us. Be careful of negative numbers as they'll
1822 * appear as very large values with unsigned longs. 1822 * appear as very large values with unsigned longs.
1823 */ 1823 */
1824 /* How much load to actually move to equalise the imbalance */ 1824 /* How much load to actually move to equalise the imbalance */
1825 *imbalance = min((max_load - avg_load) * busiest->cpu_power, 1825 *imbalance = min((max_load - avg_load) * busiest->cpu_power,
1826 (avg_load - this_load) * this->cpu_power) 1826 (avg_load - this_load) * this->cpu_power)
1827 / SCHED_LOAD_SCALE; 1827 / SCHED_LOAD_SCALE;
1828 1828
1829 if (*imbalance < SCHED_LOAD_SCALE) { 1829 if (*imbalance < SCHED_LOAD_SCALE) {
1830 unsigned long pwr_now = 0, pwr_move = 0; 1830 unsigned long pwr_now = 0, pwr_move = 0;
1831 unsigned long tmp; 1831 unsigned long tmp;
1832 1832
1833 if (max_load - this_load >= SCHED_LOAD_SCALE*2) { 1833 if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
1834 *imbalance = 1; 1834 *imbalance = 1;
1835 return busiest; 1835 return busiest;
1836 } 1836 }
1837 1837
1838 /* 1838 /*
1839 * OK, we don't have enough imbalance to justify moving tasks, 1839 * OK, we don't have enough imbalance to justify moving tasks,
1840 * however we may be able to increase total CPU power used by 1840 * however we may be able to increase total CPU power used by
1841 * moving them. 1841 * moving them.
1842 */ 1842 */
1843 1843
1844 pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); 1844 pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
1845 pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); 1845 pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
1846 pwr_now /= SCHED_LOAD_SCALE; 1846 pwr_now /= SCHED_LOAD_SCALE;
1847 1847
1848 /* Amount of load we'd subtract */ 1848 /* Amount of load we'd subtract */
1849 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; 1849 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
1850 if (max_load > tmp) 1850 if (max_load > tmp)
1851 pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, 1851 pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
1852 max_load - tmp); 1852 max_load - tmp);
1853 1853
1854 /* Amount of load we'd add */ 1854 /* Amount of load we'd add */
1855 if (max_load*busiest->cpu_power < 1855 if (max_load*busiest->cpu_power <
1856 SCHED_LOAD_SCALE*SCHED_LOAD_SCALE) 1856 SCHED_LOAD_SCALE*SCHED_LOAD_SCALE)
1857 tmp = max_load*busiest->cpu_power/this->cpu_power; 1857 tmp = max_load*busiest->cpu_power/this->cpu_power;
1858 else 1858 else
1859 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; 1859 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
1860 pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); 1860 pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
1861 pwr_move /= SCHED_LOAD_SCALE; 1861 pwr_move /= SCHED_LOAD_SCALE;
1862 1862
1863 /* Move if we gain throughput */ 1863 /* Move if we gain throughput */
1864 if (pwr_move <= pwr_now) 1864 if (pwr_move <= pwr_now)
1865 goto out_balanced; 1865 goto out_balanced;
1866 1866
1867 *imbalance = 1; 1867 *imbalance = 1;
1868 return busiest; 1868 return busiest;
1869 } 1869 }
1870 1870
1871 /* Get rid of the scaling factor, rounding down as we divide */ 1871 /* Get rid of the scaling factor, rounding down as we divide */
1872 *imbalance = *imbalance / SCHED_LOAD_SCALE; 1872 *imbalance = *imbalance / SCHED_LOAD_SCALE;
1873 1873
1874 return busiest; 1874 return busiest;
1875 1875
1876 out_balanced: 1876 out_balanced:
1877 if (busiest && (idle == NEWLY_IDLE || 1877 if (busiest && (idle == NEWLY_IDLE ||
1878 (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) { 1878 (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) {
1879 *imbalance = 1; 1879 *imbalance = 1;
1880 return busiest; 1880 return busiest;
1881 } 1881 }
1882 1882
1883 *imbalance = 0; 1883 *imbalance = 0;
1884 return NULL; 1884 return NULL;
1885 } 1885 }
1886 1886
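The imbalance find_busiest_group() hands back is capped by min(): never more than the busiest group sits above the average, and never more than this group sits below it. A user-space version of that expression with invented loads, equal cpu_power (so the power scaling drops out), and an assumed SCHED_LOAD_SCALE of 128; this is a sketch, not the kernel code path:

/* Illustrative only: the min() at the heart of the imbalance calculation. */
#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL  /* assumed scale, as elsewhere in these sketches */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        unsigned long cpu_power = SCHED_LOAD_SCALE;     /* one plain CPU per group */
        unsigned long this_load = 1 * SCHED_LOAD_SCALE; /* ~1 runnable task here */
        unsigned long max_load  = 4 * SCHED_LOAD_SCALE; /* ~4 on the busiest */
        unsigned long avg_load  = (this_load + max_load) / 2;

        unsigned long imbalance =
                min_ul((max_load - avg_load) * cpu_power,
                       (avg_load - this_load) * cpu_power) / SCHED_LOAD_SCALE;

        printf("move about %lu tasks' worth of load\n",
               imbalance / SCHED_LOAD_SCALE);   /* ~1 */
        return 0;
}

Here the busiest group looks like ~4 tasks and ours like ~1, so about one-and-a-half tasks' worth of load sits on either side of the average, and the final rounding division in the code above turns that into one task to move.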
1887 /* 1887 /*
1888 * find_busiest_queue - find the busiest runqueue among the cpus in group. 1888 * find_busiest_queue - find the busiest runqueue among the cpus in group.
1889 */ 1889 */
1890 static runqueue_t *find_busiest_queue(struct sched_group *group) 1890 static runqueue_t *find_busiest_queue(struct sched_group *group)
1891 { 1891 {
1892 unsigned long load, max_load = 0; 1892 unsigned long load, max_load = 0;
1893 runqueue_t *busiest = NULL; 1893 runqueue_t *busiest = NULL;
1894 int i; 1894 int i;
1895 1895
1896 for_each_cpu_mask(i, group->cpumask) { 1896 for_each_cpu_mask(i, group->cpumask) {
1897 load = source_load(i); 1897 load = source_load(i);
1898 1898
1899 if (load > max_load) { 1899 if (load > max_load) {
1900 max_load = load; 1900 max_load = load;
1901 busiest = cpu_rq(i); 1901 busiest = cpu_rq(i);
1902 } 1902 }
1903 } 1903 }
1904 1904
1905 return busiest; 1905 return busiest;
1906 } 1906 }
1907 1907
1908 /* 1908 /*
1909 * Check this_cpu to ensure it is balanced within domain. Attempt to move 1909 * Check this_cpu to ensure it is balanced within domain. Attempt to move
1910 * tasks if there is an imbalance. 1910 * tasks if there is an imbalance.
1911 * 1911 *
1912 * Called with this_rq unlocked. 1912 * Called with this_rq unlocked.
1913 */ 1913 */
1914 static int load_balance(int this_cpu, runqueue_t *this_rq, 1914 static int load_balance(int this_cpu, runqueue_t *this_rq,
1915 struct sched_domain *sd, enum idle_type idle) 1915 struct sched_domain *sd, enum idle_type idle)
1916 { 1916 {
1917 struct sched_group *group; 1917 struct sched_group *group;
1918 runqueue_t *busiest; 1918 runqueue_t *busiest;
1919 unsigned long imbalance; 1919 unsigned long imbalance;
1920 int nr_moved; 1920 int nr_moved;
1921 1921
1922 spin_lock(&this_rq->lock); 1922 spin_lock(&this_rq->lock);
1923 schedstat_inc(sd, lb_cnt[idle]); 1923 schedstat_inc(sd, lb_cnt[idle]);
1924 1924
1925 group = find_busiest_group(sd, this_cpu, &imbalance, idle); 1925 group = find_busiest_group(sd, this_cpu, &imbalance, idle);
1926 if (!group) { 1926 if (!group) {
1927 schedstat_inc(sd, lb_nobusyg[idle]); 1927 schedstat_inc(sd, lb_nobusyg[idle]);
1928 goto out_balanced; 1928 goto out_balanced;
1929 } 1929 }
1930 1930
1931 busiest = find_busiest_queue(group); 1931 busiest = find_busiest_queue(group);
1932 if (!busiest) { 1932 if (!busiest) {
1933 schedstat_inc(sd, lb_nobusyq[idle]); 1933 schedstat_inc(sd, lb_nobusyq[idle]);
1934 goto out_balanced; 1934 goto out_balanced;
1935 } 1935 }
1936 1936
1937 /* 1937 /*
1938 * This should be "impossible", but since load 1938 * This should be "impossible", but since load
1939 * balancing is inherently racy and statistical, 1939 * balancing is inherently racy and statistical,
1940 * it could happen in theory. 1940 * it could happen in theory.
1941 */ 1941 */
1942 if (unlikely(busiest == this_rq)) { 1942 if (unlikely(busiest == this_rq)) {
1943 WARN_ON(1); 1943 WARN_ON(1);
1944 goto out_balanced; 1944 goto out_balanced;
1945 } 1945 }
1946 1946
1947 schedstat_add(sd, lb_imbalance[idle], imbalance); 1947 schedstat_add(sd, lb_imbalance[idle], imbalance);
1948 1948
1949 nr_moved = 0; 1949 nr_moved = 0;
1950 if (busiest->nr_running > 1) { 1950 if (busiest->nr_running > 1) {
1951 /* 1951 /*
1952 * Attempt to move tasks. If find_busiest_group has found 1952 * Attempt to move tasks. If find_busiest_group has found
1953 * an imbalance but busiest->nr_running <= 1, the group is 1953 * an imbalance but busiest->nr_running <= 1, the group is
1954 * still unbalanced. nr_moved simply stays zero, so it is 1954 * still unbalanced. nr_moved simply stays zero, so it is
1955 * correctly treated as an imbalance. 1955 * correctly treated as an imbalance.
1956 */ 1956 */
1957 double_lock_balance(this_rq, busiest); 1957 double_lock_balance(this_rq, busiest);
1958 nr_moved = move_tasks(this_rq, this_cpu, busiest, 1958 nr_moved = move_tasks(this_rq, this_cpu, busiest,
1959 imbalance, sd, idle); 1959 imbalance, sd, idle);
1960 spin_unlock(&busiest->lock); 1960 spin_unlock(&busiest->lock);
1961 } 1961 }
1962 spin_unlock(&this_rq->lock); 1962 spin_unlock(&this_rq->lock);
1963 1963
1964 if (!nr_moved) { 1964 if (!nr_moved) {
1965 schedstat_inc(sd, lb_failed[idle]); 1965 schedstat_inc(sd, lb_failed[idle]);
1966 sd->nr_balance_failed++; 1966 sd->nr_balance_failed++;
1967 1967
1968 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 1968 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
1969 int wake = 0; 1969 int wake = 0;
1970 1970
1971 spin_lock(&busiest->lock); 1971 spin_lock(&busiest->lock);
1972 if (!busiest->active_balance) { 1972 if (!busiest->active_balance) {
1973 busiest->active_balance = 1; 1973 busiest->active_balance = 1;
1974 busiest->push_cpu = this_cpu; 1974 busiest->push_cpu = this_cpu;
1975 wake = 1; 1975 wake = 1;
1976 } 1976 }
1977 spin_unlock(&busiest->lock); 1977 spin_unlock(&busiest->lock);
1978 if (wake) 1978 if (wake)
1979 wake_up_process(busiest->migration_thread); 1979 wake_up_process(busiest->migration_thread);
1980 1980
1981 /* 1981 /*
1982 * We've kicked active balancing, reset the failure 1982 * We've kicked active balancing, reset the failure
1983 * counter. 1983 * counter.
1984 */ 1984 */
1985 sd->nr_balance_failed = sd->cache_nice_tries; 1985 sd->nr_balance_failed = sd->cache_nice_tries;
1986 } 1986 }
1987 1987
1988 /* 1988 /*
1989 * We were unbalanced, but unsuccessful in move_tasks(), 1989 * We were unbalanced, but unsuccessful in move_tasks(),
1990 * so bump the balance_interval to lessen the lock contention. 1990 * so bump the balance_interval to lessen the lock contention.
1991 */ 1991 */
1992 if (sd->balance_interval < sd->max_interval) 1992 if (sd->balance_interval < sd->max_interval)
1993 sd->balance_interval++; 1993 sd->balance_interval++;
1994 } else { 1994 } else {
1995 sd->nr_balance_failed = 0; 1995 sd->nr_balance_failed = 0;
1996 1996
1997 /* We were unbalanced, so reset the balancing interval */ 1997 /* We were unbalanced, so reset the balancing interval */
1998 sd->balance_interval = sd->min_interval; 1998 sd->balance_interval = sd->min_interval;
1999 } 1999 }
2000 2000
2001 return nr_moved; 2001 return nr_moved;
2002 2002
2003 out_balanced: 2003 out_balanced:
2004 spin_unlock(&this_rq->lock); 2004 spin_unlock(&this_rq->lock);
2005 2005
2006 schedstat_inc(sd, lb_balanced[idle]); 2006 schedstat_inc(sd, lb_balanced[idle]);
2007 2007
2008 /* tune up the balancing interval */ 2008 /* tune up the balancing interval */
2009 if (sd->balance_interval < sd->max_interval) 2009 if (sd->balance_interval < sd->max_interval)
2010 sd->balance_interval *= 2; 2010 sd->balance_interval *= 2;
2011 2011
2012 return 0; 2012 return 0;
2013 } 2013 }
2014 2014
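load_balance() also tunes sd->balance_interval: it keeps doubling while it is still below max_interval when the domain comes up balanced, creeps up by one when a move attempt fails, and snaps back to min_interval after a successful move. A tiny user-space illustration of that back-off, with invented interval bounds (not taken from the patch):

/* Illustrative only: how balance_interval backs off and resets. */
#include <stdio.h>

int main(void)
{
        unsigned int interval = 8, min_interval = 8, max_interval = 128;
        int pass;

        /* an unbalanced pass where move_tasks() moved nothing: nudge upwards */
        if (interval < max_interval)
                interval++;
        printf("failed move   -> interval %u\n", interval);

        /* a run of "already balanced" passes: keep doubling below the cap */
        for (pass = 0; pass < 5; pass++) {
                if (interval < max_interval)
                        interval *= 2;
                printf("balanced pass -> interval %u\n", interval);
        }

        /* a pass that actually moved tasks: back to the minimum */
        interval = min_interval;
        printf("moved tasks   -> interval %u\n", interval);
        return 0;
}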
2015 /* 2015 /*
2016 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2016 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2017 * tasks if there is an imbalance. 2017 * tasks if there is an imbalance.
2018 * 2018 *
2019 * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). 2019 * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
2020 * this_rq is locked. 2020 * this_rq is locked.
2021 */ 2021 */
2022 static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, 2022 static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2023 struct sched_domain *sd) 2023 struct sched_domain *sd)
2024 { 2024 {
2025 struct sched_group *group; 2025 struct sched_group *group;
2026 runqueue_t *busiest = NULL; 2026 runqueue_t *busiest = NULL;
2027 unsigned long imbalance; 2027 unsigned long imbalance;
2028 int nr_moved = 0; 2028 int nr_moved = 0;
2029 2029
2030 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2030 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2031 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); 2031 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
2032 if (!group) { 2032 if (!group) {
2033 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); 2033 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2034 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2034 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2035 goto out; 2035 goto out;
2036 } 2036 }
2037 2037
2038 busiest = find_busiest_queue(group); 2038 busiest = find_busiest_queue(group);
2039 if (!busiest || busiest == this_rq) { 2039 if (!busiest || busiest == this_rq) {
2040 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); 2040 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2041 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); 2041 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2042 goto out; 2042 goto out;
2043 } 2043 }
2044 2044
2045 /* Attempt to move tasks */ 2045 /* Attempt to move tasks */
2046 double_lock_balance(this_rq, busiest); 2046 double_lock_balance(this_rq, busiest);
2047 2047
2048 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); 2048 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
2049 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2049 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2050 imbalance, sd, NEWLY_IDLE); 2050 imbalance, sd, NEWLY_IDLE);
2051 if (!nr_moved) 2051 if (!nr_moved)
2052 schedstat_inc(sd, lb_failed[NEWLY_IDLE]); 2052 schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
2053 2053
2054 spin_unlock(&busiest->lock); 2054 spin_unlock(&busiest->lock);
2055 2055
2056 out: 2056 out:
2057 return nr_moved; 2057 return nr_moved;
2058 } 2058 }
2059 2059
2060 /* 2060 /*
2061 * idle_balance is called by schedule() if this_cpu is about to become 2061 * idle_balance is called by schedule() if this_cpu is about to become
2062 * idle. Attempts to pull tasks from other CPUs. 2062 * idle. Attempts to pull tasks from other CPUs.
2063 */ 2063 */
2064 static inline void idle_balance(int this_cpu, runqueue_t *this_rq) 2064 static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
2065 { 2065 {
2066 struct sched_domain *sd; 2066 struct sched_domain *sd;
2067 2067
2068 for_each_domain(this_cpu, sd) { 2068 for_each_domain(this_cpu, sd) {
2069 if (sd->flags & SD_BALANCE_NEWIDLE) { 2069 if (sd->flags & SD_BALANCE_NEWIDLE) {
2070 if (load_balance_newidle(this_cpu, this_rq, sd)) { 2070 if (load_balance_newidle(this_cpu, this_rq, sd)) {
2071 /* We've pulled tasks over so stop searching */ 2071 /* We've pulled tasks over so stop searching */
2072 break; 2072 break;
2073 } 2073 }
2074 } 2074 }
2075 } 2075 }
2076 } 2076 }
2077 2077
2078 /* 2078 /*
2079 * active_load_balance is run by migration threads. It pushes running tasks 2079 * active_load_balance is run by migration threads. It pushes running tasks
2080 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be 2080 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2081 * running on each physical CPU where possible, and avoids physical / 2081 * running on each physical CPU where possible, and avoids physical /
2082 * logical imbalances. 2082 * logical imbalances.
2083 * 2083 *
2084 * Called with busiest_rq locked. 2084 * Called with busiest_rq locked.
2085 */ 2085 */
2086 static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) 2086 static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
2087 { 2087 {
2088 struct sched_domain *sd; 2088 struct sched_domain *sd;
2089 struct sched_group *cpu_group; 2089 struct sched_group *cpu_group;
2090 runqueue_t *target_rq; 2090 runqueue_t *target_rq;
2091 cpumask_t visited_cpus; 2091 cpumask_t visited_cpus;
2092 int cpu; 2092 int cpu;
2093 2093
2094 /* 2094 /*
2095 * Search for suitable CPUs to push tasks to in successively higher 2095 * Search for suitable CPUs to push tasks to in successively higher
2096 * domains with SD_LOAD_BALANCE set. 2096 * domains with SD_LOAD_BALANCE set.
2097 */ 2097 */
2098 visited_cpus = CPU_MASK_NONE; 2098 visited_cpus = CPU_MASK_NONE;
2099 for_each_domain(busiest_cpu, sd) { 2099 for_each_domain(busiest_cpu, sd) {
2100 if (!(sd->flags & SD_LOAD_BALANCE)) 2100 if (!(sd->flags & SD_LOAD_BALANCE))
2101 /* no more domains to search */ 2101 /* no more domains to search */
2102 break; 2102 break;
2103 2103
2104 schedstat_inc(sd, alb_cnt); 2104 schedstat_inc(sd, alb_cnt);
2105 2105
2106 cpu_group = sd->groups; 2106 cpu_group = sd->groups;
2107 do { 2107 do {
2108 for_each_cpu_mask(cpu, cpu_group->cpumask) { 2108 for_each_cpu_mask(cpu, cpu_group->cpumask) {
2109 if (busiest_rq->nr_running <= 1) 2109 if (busiest_rq->nr_running <= 1)
2110 /* no more tasks left to move */ 2110 /* no more tasks left to move */
2111 return; 2111 return;
2112 if (cpu_isset(cpu, visited_cpus)) 2112 if (cpu_isset(cpu, visited_cpus))
2113 continue; 2113 continue;
2114 cpu_set(cpu, visited_cpus); 2114 cpu_set(cpu, visited_cpus);
2115 if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu) 2115 if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu)
2116 continue; 2116 continue;
2117 2117
2118 target_rq = cpu_rq(cpu); 2118 target_rq = cpu_rq(cpu);
2119 /* 2119 /*
2120 * This condition is "impossible", if it occurs 2120 * This condition is "impossible", if it occurs
2121 * we need to fix it. Originally reported by 2121 * we need to fix it. Originally reported by
2122 * Bjorn Helgaas on a 128-cpu setup. 2122 * Bjorn Helgaas on a 128-cpu setup.
2123 */ 2123 */
2124 BUG_ON(busiest_rq == target_rq); 2124 BUG_ON(busiest_rq == target_rq);
2125 2125
2126 /* move a task from busiest_rq to target_rq */ 2126 /* move a task from busiest_rq to target_rq */
2127 double_lock_balance(busiest_rq, target_rq); 2127 double_lock_balance(busiest_rq, target_rq);
2128 if (move_tasks(target_rq, cpu, busiest_rq, 2128 if (move_tasks(target_rq, cpu, busiest_rq,
2129 1, sd, SCHED_IDLE)) { 2129 1, sd, SCHED_IDLE)) {
2130 schedstat_inc(sd, alb_pushed); 2130 schedstat_inc(sd, alb_pushed);
2131 } else { 2131 } else {
2132 schedstat_inc(sd, alb_failed); 2132 schedstat_inc(sd, alb_failed);
2133 } 2133 }
2134 spin_unlock(&target_rq->lock); 2134 spin_unlock(&target_rq->lock);
2135 } 2135 }
2136 cpu_group = cpu_group->next; 2136 cpu_group = cpu_group->next;
2137 } while (cpu_group != sd->groups); 2137 } while (cpu_group != sd->groups);
2138 } 2138 }
2139 } 2139 }
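
active_load_balance() above climbs from the smallest domain to larger ones, and within each domain walks the circular group list, skipping CPUs already tried at a lower level via visited_cpus. The sketch below models just that traversal shape with a plain bitmask in place of cpumask_t; mini_domain and its fields are hypothetical simplifications (the per-domain group list is collapsed into a single span), not kernel structures.

#include <stdio.h>

#define MAX_CPUS 8

/* Hypothetical miniature of a domain hierarchy: each level lists its CPUs. */
struct mini_domain {
    unsigned int span;              /* bitmask of CPUs in this domain */
    struct mini_domain *parent;     /* next, larger domain (or NULL) */
};

/* Visit each CPU at most once while climbing the domain hierarchy. */
static void scan_domains(const struct mini_domain *sd, int this_cpu)
{
    unsigned int visited = 0;

    for (; sd; sd = sd->parent) {
        for (int cpu = 0; cpu < MAX_CPUS; cpu++) {
            if (!(sd->span & (1u << cpu)))
                continue;
            if (visited & (1u << cpu))
                continue;           /* already considered in a smaller domain */
            visited |= 1u << cpu;
            if (cpu == this_cpu)
                continue;           /* never push tasks to ourselves */
            printf("candidate target cpu %d\n", cpu);
        }
    }
}

int main(void)
{
    struct mini_domain top = { 0xff, NULL };    /* CPUs 0-7 */
    struct mini_domain smt = { 0x03, &top };    /* CPUs 0-1 share a core */

    scan_domains(&smt, 0);
    return 0;
}
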
2140 2140
2141 /* 2141 /*
2142 * rebalance_tick will get called every timer tick, on every CPU. 2142 * rebalance_tick will get called every timer tick, on every CPU.
2143 * 2143 *
2144 * It checks each scheduling domain to see if it is due to be balanced, 2144 * It checks each scheduling domain to see if it is due to be balanced,
2145 * and initiates a balancing operation if so. 2145 * and initiates a balancing operation if so.
2146 * 2146 *
2147 * Balancing parameters are set up in arch_init_sched_domains. 2147 * Balancing parameters are set up in arch_init_sched_domains.
2148 */ 2148 */
2149 2149
2150 /* Don't have all balancing operations going off at once */ 2150 /* Don't have all balancing operations going off at once */
2151 #define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) 2151 #define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
2152 2152
2153 static void rebalance_tick(int this_cpu, runqueue_t *this_rq, 2153 static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2154 enum idle_type idle) 2154 enum idle_type idle)
2155 { 2155 {
2156 unsigned long old_load, this_load; 2156 unsigned long old_load, this_load;
2157 unsigned long j = jiffies + CPU_OFFSET(this_cpu); 2157 unsigned long j = jiffies + CPU_OFFSET(this_cpu);
2158 struct sched_domain *sd; 2158 struct sched_domain *sd;
2159 2159
2160 /* Update our load */ 2160 /* Update our load */
2161 old_load = this_rq->cpu_load; 2161 old_load = this_rq->cpu_load;
2162 this_load = this_rq->nr_running * SCHED_LOAD_SCALE; 2162 this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
2163 /* 2163 /*
2164 * Round up the averaging division if load is increasing. This 2164 * Round up the averaging division if load is increasing. This
2165 * prevents us from getting stuck on 9 if the load is 10, for 2165 * prevents us from getting stuck on 9 if the load is 10, for
2166 * example. 2166 * example.
2167 */ 2167 */
2168 if (this_load > old_load) 2168 if (this_load > old_load)
2169 old_load++; 2169 old_load++;
2170 this_rq->cpu_load = (old_load + this_load) / 2; 2170 this_rq->cpu_load = (old_load + this_load) / 2;
2171 2171
2172 for_each_domain(this_cpu, sd) { 2172 for_each_domain(this_cpu, sd) {
2173 unsigned long interval; 2173 unsigned long interval;
2174 2174
2175 if (!(sd->flags & SD_LOAD_BALANCE)) 2175 if (!(sd->flags & SD_LOAD_BALANCE))
2176 continue; 2176 continue;
2177 2177
2178 interval = sd->balance_interval; 2178 interval = sd->balance_interval;
2179 if (idle != SCHED_IDLE) 2179 if (idle != SCHED_IDLE)
2180 interval *= sd->busy_factor; 2180 interval *= sd->busy_factor;
2181 2181
2182 /* scale ms to jiffies */ 2182 /* scale ms to jiffies */
2183 interval = msecs_to_jiffies(interval); 2183 interval = msecs_to_jiffies(interval);
2184 if (unlikely(!interval)) 2184 if (unlikely(!interval))
2185 interval = 1; 2185 interval = 1;
2186 2186
2187 if (j - sd->last_balance >= interval) { 2187 if (j - sd->last_balance >= interval) {
2188 if (load_balance(this_cpu, this_rq, sd, idle)) { 2188 if (load_balance(this_cpu, this_rq, sd, idle)) {
2189 /* We've pulled tasks over so no longer idle */ 2189 /* We've pulled tasks over so no longer idle */
2190 idle = NOT_IDLE; 2190 idle = NOT_IDLE;
2191 } 2191 }
2192 sd->last_balance += interval; 2192 sd->last_balance += interval;
2193 } 2193 }
2194 } 2194 }
2195 } 2195 }
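
Two details of rebalance_tick() above are worth isolating: CPU_OFFSET() staggers each CPU's notion of the current jiffy so that all runqueues do not rebalance on the same tick, and the cpu_load average is rounded up whenever load is rising so the average can actually reach the new value instead of hovering one below it. The stand-alone sketch below reproduces both calculations; the HZ and NR_CPUS values are assumptions chosen for illustration.

#include <stdio.h>

#define HZ      1000        /* assumed for the sketch */
#define NR_CPUS 4           /* assumed for the sketch */

/* Same staggering idea as CPU_OFFSET() above: spread CPUs across one HZ. */
#define CPU_OFFSET(cpu) (HZ * (cpu) / NR_CPUS)

/* Rounded-up averaging of the per-CPU load, as in rebalance_tick(). */
static unsigned long average_load(unsigned long old_load, unsigned long this_load)
{
    if (this_load > old_load)
        old_load++;     /* round up so a rising load actually converges */
    return (old_load + this_load) / 2;
}

int main(void)
{
    for (int cpu = 0; cpu < NR_CPUS; cpu++)
        printf("cpu%d balances at jiffies offset %d\n", cpu, CPU_OFFSET(cpu));

    unsigned long load = 0;
    for (int tick = 0; tick < 5; tick++) {
        load = average_load(load, 10);  /* reaches 10 instead of stalling at 9 */
        printf("tick %d: cpu_load=%lu\n", tick, load);
    }
    return 0;
}
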
2196 #else 2196 #else
2197 /* 2197 /*
2198 * on UP we do not need to balance between CPUs: 2198 * on UP we do not need to balance between CPUs:
2199 */ 2199 */
2200 static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) 2200 static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
2201 { 2201 {
2202 } 2202 }
2203 static inline void idle_balance(int cpu, runqueue_t *rq) 2203 static inline void idle_balance(int cpu, runqueue_t *rq)
2204 { 2204 {
2205 } 2205 }
2206 #endif 2206 #endif
2207 2207
2208 static inline int wake_priority_sleeper(runqueue_t *rq) 2208 static inline int wake_priority_sleeper(runqueue_t *rq)
2209 { 2209 {
2210 int ret = 0; 2210 int ret = 0;
2211 #ifdef CONFIG_SCHED_SMT 2211 #ifdef CONFIG_SCHED_SMT
2212 spin_lock(&rq->lock); 2212 spin_lock(&rq->lock);
2213 /* 2213 /*
2214 * If an SMT sibling task has been put to sleep for priority 2214 * If an SMT sibling task has been put to sleep for priority
2215 * reasons, reschedule the idle task to see if it can now run. 2215 * reasons, reschedule the idle task to see if it can now run.
2216 */ 2216 */
2217 if (rq->nr_running) { 2217 if (rq->nr_running) {
2218 resched_task(rq->idle); 2218 resched_task(rq->idle);
2219 ret = 1; 2219 ret = 1;
2220 } 2220 }
2221 spin_unlock(&rq->lock); 2221 spin_unlock(&rq->lock);
2222 #endif 2222 #endif
2223 return ret; 2223 return ret;
2224 } 2224 }
2225 2225
2226 DEFINE_PER_CPU(struct kernel_stat, kstat); 2226 DEFINE_PER_CPU(struct kernel_stat, kstat);
2227 2227
2228 EXPORT_PER_CPU_SYMBOL(kstat); 2228 EXPORT_PER_CPU_SYMBOL(kstat);
2229 2229
2230 /* 2230 /*
2231 * This is called on clock ticks and on context switches. 2231 * This is called on clock ticks and on context switches.
2232 * Bank in p->sched_time the ns elapsed since the last tick or switch. 2232 * Bank in p->sched_time the ns elapsed since the last tick or switch.
2233 */ 2233 */
2234 static inline void update_cpu_clock(task_t *p, runqueue_t *rq, 2234 static inline void update_cpu_clock(task_t *p, runqueue_t *rq,
2235 unsigned long long now) 2235 unsigned long long now)
2236 { 2236 {
2237 unsigned long long last = max(p->timestamp, rq->timestamp_last_tick); 2237 unsigned long long last = max(p->timestamp, rq->timestamp_last_tick);
2238 p->sched_time += now - last; 2238 p->sched_time += now - last;
2239 } 2239 }
2240 2240
2241 /* 2241 /*
2242 * Return current->sched_time plus any more ns on the sched_clock 2242 * Return current->sched_time plus any more ns on the sched_clock
2243 * that have not yet been banked. 2243 * that have not yet been banked.
2244 */ 2244 */
2245 unsigned long long current_sched_time(const task_t *tsk) 2245 unsigned long long current_sched_time(const task_t *tsk)
2246 { 2246 {
2247 unsigned long long ns; 2247 unsigned long long ns;
2248 unsigned long flags; 2248 unsigned long flags;
2249 local_irq_save(flags); 2249 local_irq_save(flags);
2250 ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick); 2250 ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick);
2251 ns = tsk->sched_time + (sched_clock() - ns); 2251 ns = tsk->sched_time + (sched_clock() - ns);
2252 local_irq_restore(flags); 2252 local_irq_restore(flags);
2253 return ns; 2253 return ns;
2254 } 2254 }
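
update_cpu_clock() banks into p->sched_time only the nanoseconds since the later of the task's own timestamp and the runqueue's last tick, and current_sched_time() returns the banked total plus whatever has accumulated since. The sketch below models that bookkeeping with plain integers; mini_task and the explicit 'now' arguments are stand-ins for the kernel's task fields and sched_clock(), and the model folds the tick-timestamp update into bank() for brevity.

#include <stdio.h>

/* Hypothetical stand-ins for the fields the banking logic above uses. */
struct mini_task {
    unsigned long long sched_time;  /* banked ns */
    unsigned long long timestamp;   /* last context-switch time */
};

static unsigned long long timestamp_last_tick;  /* per-runqueue in the kernel */

static unsigned long long max_ull(unsigned long long a, unsigned long long b)
{
    return a > b ? a : b;
}

/* Mirror of update_cpu_clock(): bank the ns since the last tick or switch. */
static void bank(struct mini_task *p, unsigned long long now)
{
    unsigned long long last = max_ull(p->timestamp, timestamp_last_tick);

    p->sched_time += now - last;
    timestamp_last_tick = now;  /* folded in here; scheduler_tick() does this */
}

/* Mirror of current_sched_time(): banked time plus the unbanked remainder. */
static unsigned long long read_time(const struct mini_task *p, unsigned long long now)
{
    unsigned long long last = max_ull(p->timestamp, timestamp_last_tick);

    return p->sched_time + (now - last);
}

int main(void)
{
    struct mini_task t = { 0, 100 };

    timestamp_last_tick = 100;
    bank(&t, 250);                                      /* banks 150 ns */
    printf("at 300: %llu ns\n", read_time(&t, 300));    /* 150 banked + 50 pending = 200 */
    return 0;
}
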
2255 2255
2256 /* 2256 /*
2257 * We place interactive tasks back into the active array, if possible. 2257 * We place interactive tasks back into the active array, if possible.
2258 * 2258 *
2259 * To guarantee that this does not starve expired tasks we ignore the 2259 * To guarantee that this does not starve expired tasks we ignore the
2260 * interactivity of a task if the first expired task had to wait more 2260 * interactivity of a task if the first expired task had to wait more
2261 * than a 'reasonable' amount of time. This deadline timeout is 2261 * than a 'reasonable' amount of time. This deadline timeout is
2262 * load-dependent, as the frequency of array switches decreases with 2262 * load-dependent, as the frequency of array switches decreases with
2263 * an increasing number of running tasks. We also ignore the interactivity 2263 * an increasing number of running tasks. We also ignore the interactivity
2264 * if a better static_prio task has expired: 2264 * if a better static_prio task has expired:
2265 */ 2265 */
2266 #define EXPIRED_STARVING(rq) \ 2266 #define EXPIRED_STARVING(rq) \
2267 ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ 2267 ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
2268 (jiffies - (rq)->expired_timestamp >= \ 2268 (jiffies - (rq)->expired_timestamp >= \
2269 STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ 2269 STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
2270 ((rq)->curr->static_prio > (rq)->best_expired_prio)) 2270 ((rq)->curr->static_prio > (rq)->best_expired_prio))
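
EXPIRED_STARVING() packs its logic into one macro: starvation detection is enabled, the first expired task has waited past a deadline that scales with the number of runners, or a task with a better static priority than the current one has already expired. Spelled out as a function the same test reads as below; this is an illustrative rewrite with simplified stand-in types, not a replacement for the macro.

#include <stdio.h>
#include <stdbool.h>

#define STARVATION_LIMIT 100    /* jiffies per running task; assumed value */

/* Only the fields the macro reads, as a hypothetical miniature of the runqueue. */
struct mini_rq {
    unsigned long expired_timestamp;    /* when the first task expired, or 0 */
    unsigned long nr_running;
    int curr_static_prio;               /* rq->curr->static_prio */
    int best_expired_prio;
};

static bool expired_starving(const struct mini_rq *rq, unsigned long jiffies_now)
{
    /* Deadline scales with load: more runners means array switches are rarer. */
    if (STARVATION_LIMIT && rq->expired_timestamp &&
        jiffies_now - rq->expired_timestamp >=
            STARVATION_LIMIT * rq->nr_running + 1)
        return true;

    /* A better (numerically lower) static priority already sits in expired. */
    return rq->curr_static_prio > rq->best_expired_prio;
}

int main(void)
{
    struct mini_rq rq = { 0, 2, 120, 115 };

    printf("starving: %d\n", expired_starving(&rq, 1500)); /* 1: a better static_prio task expired */
    return 0;
}
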
2271 2271
2272 /* 2272 /*
2273 * Account user cpu time to a process. 2273 * Account user cpu time to a process.
2274 * @p: the process that the cpu time gets accounted to 2274 * @p: the process that the cpu time gets accounted to
2275 * @hardirq_offset: the offset to subtract from hardirq_count() 2275 * @hardirq_offset: the offset to subtract from hardirq_count()
2276 * @cputime: the cpu time spent in user space since the last update 2276 * @cputime: the cpu time spent in user space since the last update
2277 */ 2277 */
2278 void account_user_time(struct task_struct *p, cputime_t cputime) 2278 void account_user_time(struct task_struct *p, cputime_t cputime)
2279 { 2279 {
2280 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2280 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2281 cputime64_t tmp; 2281 cputime64_t tmp;
2282 2282
2283 p->utime = cputime_add(p->utime, cputime); 2283 p->utime = cputime_add(p->utime, cputime);
2284 2284
2285 /* Add user time to cpustat. */ 2285 /* Add user time to cpustat. */
2286 tmp = cputime_to_cputime64(cputime); 2286 tmp = cputime_to_cputime64(cputime);
2287 if (TASK_NICE(p) > 0) 2287 if (TASK_NICE(p) > 0)
2288 cpustat->nice = cputime64_add(cpustat->nice, tmp); 2288 cpustat->nice = cputime64_add(cpustat->nice, tmp);
2289 else 2289 else
2290 cpustat->user = cputime64_add(cpustat->user, tmp); 2290 cpustat->user = cputime64_add(cpustat->user, tmp);
2291 } 2291 }
2292 2292
2293 /* 2293 /*
2294 * Account system cpu time to a process. 2294 * Account system cpu time to a process.
2295 * @p: the process that the cpu time gets accounted to 2295 * @p: the process that the cpu time gets accounted to
2296 * @hardirq_offset: the offset to subtract from hardirq_count() 2296 * @hardirq_offset: the offset to subtract from hardirq_count()
2297 * @cputime: the cpu time spent in kernel space since the last update 2297 * @cputime: the cpu time spent in kernel space since the last update
2298 */ 2298 */
2299 void account_system_time(struct task_struct *p, int hardirq_offset, 2299 void account_system_time(struct task_struct *p, int hardirq_offset,
2300 cputime_t cputime) 2300 cputime_t cputime)
2301 { 2301 {
2302 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2302 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2303 runqueue_t *rq = this_rq(); 2303 runqueue_t *rq = this_rq();
2304 cputime64_t tmp; 2304 cputime64_t tmp;
2305 2305
2306 p->stime = cputime_add(p->stime, cputime); 2306 p->stime = cputime_add(p->stime, cputime);
2307 2307
2308 /* Add system time to cpustat. */ 2308 /* Add system time to cpustat. */
2309 tmp = cputime_to_cputime64(cputime); 2309 tmp = cputime_to_cputime64(cputime);
2310 if (hardirq_count() - hardirq_offset) 2310 if (hardirq_count() - hardirq_offset)
2311 cpustat->irq = cputime64_add(cpustat->irq, tmp); 2311 cpustat->irq = cputime64_add(cpustat->irq, tmp);
2312 else if (softirq_count()) 2312 else if (softirq_count())
2313 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 2313 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
2314 else if (p != rq->idle) 2314 else if (p != rq->idle)
2315 cpustat->system = cputime64_add(cpustat->system, tmp); 2315 cpustat->system = cputime64_add(cpustat->system, tmp);
2316 else if (atomic_read(&rq->nr_iowait) > 0) 2316 else if (atomic_read(&rq->nr_iowait) > 0)
2317 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 2317 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
2318 else 2318 else
2319 cpustat->idle = cputime64_add(cpustat->idle, tmp); 2319 cpustat->idle = cputime64_add(cpustat->idle, tmp);
2320 /* Account for system time used */ 2320 /* Account for system time used */
2321 acct_update_integrals(p); 2321 acct_update_integrals(p);
2322 /* Update rss highwater mark */ 2322 /* Update rss highwater mark */
2323 update_mem_hiwater(p); 2323 update_mem_hiwater(p);
2324 } 2324 }
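
account_system_time() charges each chunk of system time to exactly one bucket, tested in a fixed order: hardirq context wins over softirq, softirq over ordinary system time for non-idle tasks, and the idle task's time becomes iowait if anything is waiting on I/O, otherwise idle. That precedence is restated below as a small stand-alone function; the enum and boolean inputs are illustrative simplifications of the kernel's context checks.

#include <stdio.h>
#include <stdbool.h>

enum bucket { BUCKET_IRQ, BUCKET_SOFTIRQ, BUCKET_SYSTEM, BUCKET_IOWAIT, BUCKET_IDLE };

/* Same precedence as the if/else ladder in account_system_time() above. */
static enum bucket classify(bool in_hardirq, bool in_softirq,
                            bool task_is_idle, bool io_in_flight)
{
    if (in_hardirq)
        return BUCKET_IRQ;
    if (in_softirq)
        return BUCKET_SOFTIRQ;
    if (!task_is_idle)
        return BUCKET_SYSTEM;
    return io_in_flight ? BUCKET_IOWAIT : BUCKET_IDLE;
}

int main(void)
{
    static const char *name[] = { "irq", "softirq", "system", "iowait", "idle" };

    printf("%s\n", name[classify(false, true, false, false)]);  /* softirq */
    printf("%s\n", name[classify(false, false, true, true)]);   /* iowait  */
    return 0;
}
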
2325 2325
2326 /* 2326 /*
2327 * Account for involuntary wait time. 2327 * Account for involuntary wait time.
2328 * @p: the process from which the cpu time has been stolen 2328 * @p: the process from which the cpu time has been stolen
2329 * @steal: the cpu time spent in involuntary wait 2329 * @steal: the cpu time spent in involuntary wait
2330 */ 2330 */
2331 void account_steal_time(struct task_struct *p, cputime_t steal) 2331 void account_steal_time(struct task_struct *p, cputime_t steal)
2332 { 2332 {
2333 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2333 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2334 cputime64_t tmp = cputime_to_cputime64(steal); 2334 cputime64_t tmp = cputime_to_cputime64(steal);
2335 runqueue_t *rq = this_rq(); 2335 runqueue_t *rq = this_rq();
2336 2336
2337 if (p == rq->idle) { 2337 if (p == rq->idle) {
2338 p->stime = cputime_add(p->stime, steal); 2338 p->stime = cputime_add(p->stime, steal);
2339 if (atomic_read(&rq->nr_iowait) > 0) 2339 if (atomic_read(&rq->nr_iowait) > 0)
2340 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 2340 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
2341 else 2341 else
2342 cpustat->idle = cputime64_add(cpustat->idle, tmp); 2342 cpustat->idle = cputime64_add(cpustat->idle, tmp);
2343 } else 2343 } else
2344 cpustat->steal = cputime64_add(cpustat->steal, tmp); 2344 cpustat->steal = cputime64_add(cpustat->steal, tmp);
2345 } 2345 }
2346 2346
2347 /* 2347 /*
2348 * This function gets called by the timer code, with HZ frequency. 2348 * This function gets called by the timer code, with HZ frequency.
2349 * We call it with interrupts disabled. 2349 * We call it with interrupts disabled.
2350 * 2350 *
2351 * It also gets called by the fork code, when changing the parent's 2351 * It also gets called by the fork code, when changing the parent's
2352 * timeslices. 2352 * timeslices.
2353 */ 2353 */
2354 void scheduler_tick(void) 2354 void scheduler_tick(void)
2355 { 2355 {
2356 int cpu = smp_processor_id(); 2356 int cpu = smp_processor_id();
2357 runqueue_t *rq = this_rq(); 2357 runqueue_t *rq = this_rq();
2358 task_t *p = current; 2358 task_t *p = current;
2359 unsigned long long now = sched_clock(); 2359 unsigned long long now = sched_clock();
2360 2360
2361 update_cpu_clock(p, rq, now); 2361 update_cpu_clock(p, rq, now);
2362 2362
2363 rq->timestamp_last_tick = now; 2363 rq->timestamp_last_tick = now;
2364 2364
2365 if (p == rq->idle) { 2365 if (p == rq->idle) {
2366 if (wake_priority_sleeper(rq)) 2366 if (wake_priority_sleeper(rq))
2367 goto out; 2367 goto out;
2368 rebalance_tick(cpu, rq, SCHED_IDLE); 2368 rebalance_tick(cpu, rq, SCHED_IDLE);
2369 return; 2369 return;
2370 } 2370 }
2371 2371
2372 /* Task might have expired already, but not scheduled off yet */ 2372 /* Task might have expired already, but not scheduled off yet */
2373 if (p->array != rq->active) { 2373 if (p->array != rq->active) {
2374 set_tsk_need_resched(p); 2374 set_tsk_need_resched(p);
2375 goto out; 2375 goto out;
2376 } 2376 }
2377 spin_lock(&rq->lock); 2377 spin_lock(&rq->lock);
2378 /* 2378 /*
2379 * The task was running during this tick - update the 2379 * The task was running during this tick - update the
2380 * time slice counter. Note: we do not update a thread's 2380 * time slice counter. Note: we do not update a thread's
2381 * priority until it either goes to sleep or uses up its 2381 * priority until it either goes to sleep or uses up its
2382 * timeslice. This makes it possible for interactive tasks 2382 * timeslice. This makes it possible for interactive tasks
2383 * to use up their timeslices at their highest priority levels. 2383 * to use up their timeslices at their highest priority levels.
2384 */ 2384 */
2385 if (rt_task(p)) { 2385 if (rt_task(p)) {
2386 /* 2386 /*
2387 * RR tasks need a special form of timeslice management. 2387 * RR tasks need a special form of timeslice management.
2388 * FIFO tasks have no timeslices. 2388 * FIFO tasks have no timeslices.
2389 */ 2389 */
2390 if ((p->policy == SCHED_RR) && !--p->time_slice) { 2390 if ((p->policy == SCHED_RR) && !--p->time_slice) {
2391 p->time_slice = task_timeslice(p); 2391 p->time_slice = task_timeslice(p);
2392 p->first_time_slice = 0; 2392 p->first_time_slice = 0;
2393 set_tsk_need_resched(p); 2393 set_tsk_need_resched(p);
2394 2394
2395 /* put it at the end of the queue: */ 2395 /* put it at the end of the queue: */
2396 requeue_task(p, rq->active); 2396 requeue_task(p, rq->active);
2397 } 2397 }
2398 goto out_unlock; 2398 goto out_unlock;
2399 } 2399 }
2400 if (!--p->time_slice) { 2400 if (!--p->time_slice) {
2401 dequeue_task(p, rq->active); 2401 dequeue_task(p, rq->active);
2402 set_tsk_need_resched(p); 2402 set_tsk_need_resched(p);
2403 p->prio = effective_prio(p); 2403 p->prio = effective_prio(p);
2404 p->time_slice = task_timeslice(p); 2404 p->time_slice = task_timeslice(p);
2405 p->first_time_slice = 0; 2405 p->first_time_slice = 0;
2406 2406
2407 if (!rq->expired_timestamp) 2407 if (!rq->expired_timestamp)
2408 rq->expired_timestamp = jiffies; 2408 rq->expired_timestamp = jiffies;
2409 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { 2409 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
2410 enqueue_task(p, rq->expired); 2410 enqueue_task(p, rq->expired);
2411 if (p->static_prio < rq->best_expired_prio) 2411 if (p->static_prio < rq->best_expired_prio)
2412 rq->best_expired_prio = p->static_prio; 2412 rq->best_expired_prio = p->static_prio;
2413 } else 2413 } else
2414 enqueue_task(p, rq->active); 2414 enqueue_task(p, rq->active);
2415 } else { 2415 } else {
2416 /* 2416 /*
2417 * Prevent too long a timeslice from allowing a task to monopolize 2417 * Prevent too long a timeslice from allowing a task to monopolize
2418 * the CPU. We do this by splitting up the timeslice into 2418 * the CPU. We do this by splitting up the timeslice into
2419 * smaller pieces. 2419 * smaller pieces.
2420 * 2420 *
2421 * Note: this does not mean the task's timeslices expire or 2421 * Note: this does not mean the task's timeslices expire or
2422 * get lost in any way, they just might be preempted by 2422 * get lost in any way, they just might be preempted by
2423 * another task of equal priority. (one with higher 2423 * another task of equal priority. (one with higher
2424 * priority would have preempted this task already.) We 2424 * priority would have preempted this task already.) We
2425 * requeue this task to the end of the list on this priority 2425 * requeue this task to the end of the list on this priority
2426 * level, which is in essence a round-robin of tasks with 2426 * level, which is in essence a round-robin of tasks with
2427 * equal priority. 2427 * equal priority.
2428 * 2428 *
2429 * This only applies to tasks in the interactive 2429 * This only applies to tasks in the interactive
2430 * delta range with at least TIMESLICE_GRANULARITY to requeue. 2430 * delta range with at least TIMESLICE_GRANULARITY to requeue.
2431 */ 2431 */
2432 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - 2432 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
2433 p->time_slice) % TIMESLICE_GRANULARITY(p)) && 2433 p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
2434 (p->time_slice >= TIMESLICE_GRANULARITY(p)) && 2434 (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
2435 (p->array == rq->active)) { 2435 (p->array == rq->active)) {
2436 2436
2437 requeue_task(p, rq->active); 2437 requeue_task(p, rq->active);
2438 set_tsk_need_resched(p); 2438 set_tsk_need_resched(p);
2439 } 2439 }
2440 } 2440 }
2441 out_unlock: 2441 out_unlock:
2442 spin_unlock(&rq->lock); 2442 spin_unlock(&rq->lock);
2443 out: 2443 out:
2444 rebalance_tick(cpu, rq, NOT_IDLE); 2444 rebalance_tick(cpu, rq, NOT_IDLE);
2445 } 2445 }
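
The final branch of scheduler_tick() round-robins an interactive task among its priority peers before the slice expires: it requeues only on ticks where the consumed part of the slice is an exact multiple of TIMESLICE_GRANULARITY and at least one full granule remains. The sketch below models just that arithmetic (the interactivity and active-array checks are omitted), with an assumed constant granularity.

#include <stdio.h>
#include <stdbool.h>

#define TIMESLICE_GRANULARITY 25    /* ticks; assumed constant for the sketch */

/*
 * Mirror of the requeue test in scheduler_tick(): split a long timeslice
 * into granules without ever shortening it.
 */
static bool should_requeue(int full_timeslice, int time_slice_left)
{
    int consumed = full_timeslice - time_slice_left;

    return (consumed % TIMESLICE_GRANULARITY) == 0 &&
           time_slice_left >= TIMESLICE_GRANULARITY;
}

int main(void)
{
    int full = 100;

    for (int left = full - 1; left > 0; left--)
        if (should_requeue(full, left))
            printf("requeue with %d ticks left\n", left);   /* 75, 50, 25 */
    return 0;
}
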
2446 2446
2447 #ifdef CONFIG_SCHED_SMT 2447 #ifdef CONFIG_SCHED_SMT
2448 static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 2448 static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2449 { 2449 {
2450 struct sched_domain *sd = this_rq->sd; 2450 struct sched_domain *sd = this_rq->sd;
2451 cpumask_t sibling_map; 2451 cpumask_t sibling_map;
2452 int i; 2452 int i;
2453 2453
2454 if (!(sd->flags & SD_SHARE_CPUPOWER)) 2454 if (!(sd->flags & SD_SHARE_CPUPOWER))
2455 return; 2455 return;
2456 2456
2457 /* 2457 /*
2458 * Unlock the current runqueue because we have to lock in 2458 * Unlock the current runqueue because we have to lock in
2459 * CPU order to avoid deadlocks. Caller knows that we might 2459 * CPU order to avoid deadlocks. Caller knows that we might
2460 * unlock. We keep IRQs disabled. 2460 * unlock. We keep IRQs disabled.
2461 */ 2461 */
2462 spin_unlock(&this_rq->lock); 2462 spin_unlock(&this_rq->lock);
2463 2463
2464 sibling_map = sd->span; 2464 sibling_map = sd->span;
2465 2465
2466 for_each_cpu_mask(i, sibling_map) 2466 for_each_cpu_mask(i, sibling_map)
2467 spin_lock(&cpu_rq(i)->lock); 2467 spin_lock(&cpu_rq(i)->lock);
2468 /* 2468 /*
2469 * We clear this CPU from the mask. This both simplifies the 2469 * We clear this CPU from the mask. This both simplifies the
2470 * inner loop and keeps this_rq locked when we exit: 2470 * inner loop and keeps this_rq locked when we exit:
2471 */ 2471 */
2472 cpu_clear(this_cpu, sibling_map); 2472 cpu_clear(this_cpu, sibling_map);
2473 2473
2474 for_each_cpu_mask(i, sibling_map) { 2474 for_each_cpu_mask(i, sibling_map) {
2475 runqueue_t *smt_rq = cpu_rq(i); 2475 runqueue_t *smt_rq = cpu_rq(i);
2476 2476
2477 /* 2477 /*
2478 * If an SMT sibling task is sleeping due to priority 2478 * If an SMT sibling task is sleeping due to priority
2479 * reasons, wake it up now. 2479 * reasons, wake it up now.
2480 */ 2480 */
2481 if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running) 2481 if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
2482 resched_task(smt_rq->idle); 2482 resched_task(smt_rq->idle);
2483 } 2483 }
2484 2484
2485 for_each_cpu_mask(i, sibling_map) 2485 for_each_cpu_mask(i, sibling_map)
2486 spin_unlock(&cpu_rq(i)->lock); 2486 spin_unlock(&cpu_rq(i)->lock);
2487 /* 2487 /*
2488 * We exit with this_cpu's rq still held and IRQs 2488 * We exit with this_cpu's rq still held and IRQs
2489 * still disabled: 2489 * still disabled:
2490 */ 2490 */
2491 } 2491 }
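
wake_sleeping_dependent() first drops this_rq->lock and then takes every sibling runqueue lock in ascending CPU order; using one global ordering for nested locks is what rules out an AB-BA deadlock when two siblings run this path at the same time. The user-space fragment below illustrates the same rule with pthread mutexes; it is a model of the locking discipline, not kernel code.

#include <pthread.h>
#include <stdio.h>

#define NCPUS 4

static pthread_mutex_t rq_lock[NCPUS] = {
    PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
    PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

/* Lock every runqueue in a sibling mask in ascending index order. */
static void lock_siblings(unsigned int mask)
{
    for (int cpu = 0; cpu < NCPUS; cpu++)
        if (mask & (1u << cpu))
            pthread_mutex_lock(&rq_lock[cpu]);
}

static void unlock_siblings(unsigned int mask)
{
    for (int cpu = 0; cpu < NCPUS; cpu++)
        if (mask & (1u << cpu))
            pthread_mutex_unlock(&rq_lock[cpu]);
}

static void *balancer(void *arg)
{
    unsigned int mask = (unsigned int)(unsigned long)arg;

    /* Both threads use the same global order, so they cannot deadlock. */
    lock_siblings(mask);
    printf("thread holding mask 0x%x\n", mask);
    unlock_siblings(mask);
    return NULL;
}

int main(void)
{
    pthread_t a, b;

    pthread_create(&a, NULL, balancer, (void *)0x3ul);  /* CPUs 0 and 1 */
    pthread_create(&b, NULL, balancer, (void *)0x6ul);  /* CPUs 1 and 2 */
    pthread_join(a, NULL);
    pthread_join(b, NULL);
    return 0;
}

If one thread locked 0 then 1 while the other locked 1 then 0, each could block holding the lock the other needs; the shared ascending order makes that interleaving impossible.
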
2492 2492
2493 static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 2493 static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2494 { 2494 {
2495 struct sched_domain *sd = this_rq->sd; 2495 struct sched_domain *sd = this_rq->sd;
2496 cpumask_t sibling_map; 2496 cpumask_t sibling_map;
2497 prio_array_t *array; 2497 prio_array_t *array;
2498 int ret = 0, i; 2498 int ret = 0, i;
2499 task_t *p; 2499 task_t *p;
2500 2500
2501 if (!(sd->flags & SD_SHARE_CPUPOWER)) 2501 if (!(sd->flags & SD_SHARE_CPUPOWER))
2502 return 0; 2502 return 0;
2503 2503
2504 /* 2504 /*
2505 * The same locking rules and details apply as for 2505 * The same locking rules and details apply as for
2506 * wake_sleeping_dependent(): 2506 * wake_sleeping_dependent():
2507 */ 2507 */
2508 spin_unlock(&this_rq->lock); 2508 spin_unlock(&this_rq->lock);
2509 sibling_map = sd->span; 2509 sibling_map = sd->span;
2510 for_each_cpu_mask(i, sibling_map) 2510 for_each_cpu_mask(i, sibling_map)
2511 spin_lock(&cpu_rq(i)->lock); 2511 spin_lock(&cpu_rq(i)->lock);
2512 cpu_clear(this_cpu, sibling_map); 2512 cpu_clear(this_cpu, sibling_map);
2513 2513
2514 /* 2514 /*
2515 * Establish next task to be run - it might have gone away because 2515 * Establish next task to be run - it might have gone away because
2516 * we released the runqueue lock above: 2516 * we released the runqueue lock above:
2517 */ 2517 */
2518 if (!this_rq->nr_running) 2518 if (!this_rq->nr_running)
2519 goto out_unlock; 2519 goto out_unlock;
2520 array = this_rq->active; 2520 array = this_rq->active;
2521 if (!array->nr_active) 2521 if (!array->nr_active)
2522 array = this_rq->expired; 2522 array = this_rq->expired;
2523 BUG_ON(!array->nr_active); 2523 BUG_ON(!array->nr_active);
2524 2524
2525 p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, 2525 p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next,
2526 task_t, run_list); 2526 task_t, run_list);
2527 2527
2528 for_each_cpu_mask(i, sibling_map) { 2528 for_each_cpu_mask(i, sibling_map) {
2529 runqueue_t *smt_rq = cpu_rq(i); 2529 runqueue_t *smt_rq = cpu_rq(i);
2530 task_t *smt_curr = smt_rq->curr; 2530 task_t *smt_curr = smt_rq->curr;
2531 2531
2532 /* 2532 /*
2533 * If a user task with lower static priority than the 2533 * If a user task with lower static priority than the
2534 * running task on the SMT sibling is trying to schedule, 2534 * running task on the SMT sibling is trying to schedule,
2535 * delay it till there is proportionately less timeslice 2535 * delay it till there is proportionately less timeslice
2536 * left of the sibling task to prevent a lower priority 2536 * left of the sibling task to prevent a lower priority
2537 * task from using an unfair proportion of the 2537 * task from using an unfair proportion of the
2538 * physical cpu's resources. -ck 2538 * physical cpu's resources. -ck
2539 */ 2539 */
2540 if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > 2540 if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) >
2541 task_timeslice(p) || rt_task(smt_curr)) && 2541 task_timeslice(p) || rt_task(smt_curr)) &&
2542 p->mm && smt_curr->mm && !rt_task(p)) 2542 p->mm && smt_curr->mm && !rt_task(p))
2543 ret = 1; 2543 ret = 1;
2544 2544
2545 /* 2545 /*
2546 * Reschedule a lower priority task on the SMT sibling, 2546 * Reschedule a lower priority task on the SMT sibling,
2547 * or wake it up if it has been put to sleep for priority 2547 * or wake it up if it has been put to sleep for priority
2548 * reasons. 2548 * reasons.
2549 */ 2549 */
2550 if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > 2550 if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) >
2551 task_timeslice(smt_curr) || rt_task(p)) && 2551 task_timeslice(smt_curr) || rt_task(p)) &&
2552 smt_curr->mm && p->mm && !rt_task(smt_curr)) || 2552 smt_curr->mm && p->mm && !rt_task(smt_curr)) ||
2553 (smt_curr == smt_rq->idle && smt_rq->nr_running)) 2553 (smt_curr == smt_rq->idle && smt_rq->nr_running))
2554 resched_task(smt_curr); 2554 resched_task(smt_curr);
2555 } 2555 }
2556 out_unlock: 2556 out_unlock:
2557 for_each_cpu_mask(i, sibling_map) 2557 for_each_cpu_mask(i, sibling_map)
2558 spin_unlock(&cpu_rq(i)->lock); 2558 spin_unlock(&cpu_rq(i)->lock);
2559 return ret; 2559 return ret;
2560 } 2560 }
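
The core of dependent_sleeper() is a throttling comparison: a newly picked task is held back when the task on the SMT sibling, after discounting its remaining slice by per_cpu_gain percent, still has more left than the newcomer's entire timeslice, and a real-time sibling always wins. The sketch below shows only that comparison; the checks that both tasks have a user mm and that the newcomer is not itself real-time are omitted, and the per_cpu_gain value is assumed for illustration.

#include <stdio.h>
#include <stdbool.h>

#define PER_CPU_GAIN 25     /* percent; value assumed for this sketch */

/*
 * Should the newcomer (whose full timeslice is new_slice) be delayed because
 * the sibling's current task still has sibling_left ticks of its slice?
 * Mirrors the first comparison in dependent_sleeper().
 */
static bool delay_newcomer(int sibling_left, int new_slice, bool sibling_is_rt)
{
    return sibling_is_rt ||
           sibling_left * (100 - PER_CPU_GAIN) / 100 > new_slice;
}

int main(void)
{
    printf("%d\n", delay_newcomer(200, 100, false));    /* 1: 150 > 100 */
    printf("%d\n", delay_newcomer(120, 100, false));    /* 0:  90 <= 100 */
    return 0;
}
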
2561 #else 2561 #else
2562 static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 2562 static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2563 { 2563 {
2564 } 2564 }
2565 2565
2566 static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 2566 static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2567 { 2567 {
2568 return 0; 2568 return 0;
2569 } 2569 }
2570 #endif 2570 #endif
2571 2571
2572 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) 2572 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
2573 2573
2574 void fastcall add_preempt_count(int val) 2574 void fastcall add_preempt_count(int val)
2575 { 2575 {
2576 /* 2576 /*
2577 * Underflow? 2577 * Underflow?
2578 */ 2578 */
2579 BUG_ON(((int)preempt_count() < 0)); 2579 BUG_ON(((int)preempt_count() < 0));
2580 preempt_count() += val; 2580 preempt_count() += val;
2581 /* 2581 /*
2582 * Spinlock count overflowing soon? 2582 * Spinlock count overflowing soon?
2583 */ 2583 */
2584 BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); 2584 BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10);
2585 } 2585 }
2586 EXPORT_SYMBOL(add_preempt_count); 2586 EXPORT_SYMBOL(add_preempt_count);
2587 2587
2588 void fastcall sub_preempt_count(int val) 2588 void fastcall sub_preempt_count(int val)
2589 { 2589 {
2590 /* 2590 /*
2591 * Underflow? 2591 * Underflow?
2592 */ 2592 */
2593 BUG_ON(val > preempt_count()); 2593 BUG_ON(val > preempt_count());
2594 /* 2594 /*
2595 * Is the spinlock portion underflowing? 2595 * Is the spinlock portion underflowing?
2596 */ 2596 */
2597 BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); 2597 BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK));
2598 preempt_count() -= val; 2598 preempt_count() -= val;
2599 } 2599 }
2600 EXPORT_SYMBOL(sub_preempt_count); 2600 EXPORT_SYMBOL(sub_preempt_count);
2601 2601
2602 #endif 2602 #endif
2603 2603
2604 /* 2604 /*
2605 * schedule() is the main scheduler function. 2605 * schedule() is the main scheduler function.
2606 */ 2606 */
2607 asmlinkage void __sched schedule(void) 2607 asmlinkage void __sched schedule(void)
2608 { 2608 {
2609 long *switch_count; 2609 long *switch_count;
2610 task_t *prev, *next; 2610 task_t *prev, *next;
2611 runqueue_t *rq; 2611 runqueue_t *rq;
2612 prio_array_t *array; 2612 prio_array_t *array;
2613 struct list_head *queue; 2613 struct list_head *queue;
2614 unsigned long long now; 2614 unsigned long long now;
2615 unsigned long run_time; 2615 unsigned long run_time;
2616 int cpu, idx; 2616 int cpu, idx;
2617 2617
2618 /* 2618 /*
2619 * Test if we are atomic. Since do_exit() needs to call into 2619 * Test if we are atomic. Since do_exit() needs to call into
2620 * schedule() atomically, we ignore that path for now. 2620 * schedule() atomically, we ignore that path for now.
2621 * Otherwise, whine if we are scheduling when we should not be. 2621 * Otherwise, whine if we are scheduling when we should not be.
2622 */ 2622 */
2623 if (likely(!current->exit_state)) { 2623 if (likely(!current->exit_state)) {
2624 if (unlikely(in_atomic())) { 2624 if (unlikely(in_atomic())) {
2625 printk(KERN_ERR "scheduling while atomic: " 2625 printk(KERN_ERR "scheduling while atomic: "
2626 "%s/0x%08x/%d\n", 2626 "%s/0x%08x/%d\n",
2627 current->comm, preempt_count(), current->pid); 2627 current->comm, preempt_count(), current->pid);
2628 dump_stack(); 2628 dump_stack();
2629 } 2629 }
2630 } 2630 }
2631 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 2631 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
2632 2632
2633 need_resched: 2633 need_resched:
2634 preempt_disable(); 2634 preempt_disable();
2635 prev = current; 2635 prev = current;
2636 release_kernel_lock(prev); 2636 release_kernel_lock(prev);
2637 need_resched_nonpreemptible: 2637 need_resched_nonpreemptible:
2638 rq = this_rq(); 2638 rq = this_rq();
2639 2639
2640 /* 2640 /*
2641 * The idle thread is not allowed to schedule! 2641 * The idle thread is not allowed to schedule!
2642 * Remove this check after it has been exercised a bit. 2642 * Remove this check after it has been exercised a bit.
2643 */ 2643 */
2644 if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) { 2644 if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
2645 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 2645 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
2646 dump_stack(); 2646 dump_stack();
2647 } 2647 }
2648 2648
2649 schedstat_inc(rq, sched_cnt); 2649 schedstat_inc(rq, sched_cnt);
2650 now = sched_clock(); 2650 now = sched_clock();
2651 if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { 2651 if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
2652 run_time = now - prev->timestamp; 2652 run_time = now - prev->timestamp;
2653 if (unlikely((long long)(now - prev->timestamp) < 0)) 2653 if (unlikely((long long)(now - prev->timestamp) < 0))
2654 run_time = 0; 2654 run_time = 0;
2655 } else 2655 } else
2656 run_time = NS_MAX_SLEEP_AVG; 2656 run_time = NS_MAX_SLEEP_AVG;
2657 2657
2658 /* 2658 /*
2659 * Tasks are charged proportionately less run_time at high sleep_avg to 2659 * Tasks are charged proportionately less run_time at high sleep_avg to
2660 * delay them losing their interactive status. 2660 * delay them losing their interactive status.
2661 */ 2661 */
2662 run_time /= (CURRENT_BONUS(prev) ? : 1); 2662 run_time /= (CURRENT_BONUS(prev) ? : 1);
2663 2663
2664 spin_lock_irq(&rq->lock); 2664 spin_lock_irq(&rq->lock);
2665 2665
2666 if (unlikely(prev->flags & PF_DEAD)) 2666 if (unlikely(prev->flags & PF_DEAD))
2667 prev->state = EXIT_DEAD; 2667 prev->state = EXIT_DEAD;
2668 2668
2669 switch_count = &prev->nivcsw; 2669 switch_count = &prev->nivcsw;
2670 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 2670 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2671 switch_count = &prev->nvcsw; 2671 switch_count = &prev->nvcsw;
2672 if (unlikely((prev->state & TASK_INTERRUPTIBLE) && 2672 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
2673 unlikely(signal_pending(prev)))) 2673 unlikely(signal_pending(prev))))
2674 prev->state = TASK_RUNNING; 2674 prev->state = TASK_RUNNING;
2675 else { 2675 else {
2676 if (prev->state == TASK_UNINTERRUPTIBLE) 2676 if (prev->state == TASK_UNINTERRUPTIBLE)
2677 rq->nr_uninterruptible++; 2677 rq->nr_uninterruptible++;
2678 deactivate_task(prev, rq); 2678 deactivate_task(prev, rq);
2679 } 2679 }
2680 } 2680 }
2681 2681
2682 cpu = smp_processor_id(); 2682 cpu = smp_processor_id();
2683 if (unlikely(!rq->nr_running)) { 2683 if (unlikely(!rq->nr_running)) {
2684 go_idle: 2684 go_idle:
2685 idle_balance(cpu, rq); 2685 idle_balance(cpu, rq);
2686 if (!rq->nr_running) { 2686 if (!rq->nr_running) {
2687 next = rq->idle; 2687 next = rq->idle;
2688 rq->expired_timestamp = 0; 2688 rq->expired_timestamp = 0;
2689 wake_sleeping_dependent(cpu, rq); 2689 wake_sleeping_dependent(cpu, rq);
2690 /* 2690 /*
2691 * wake_sleeping_dependent() might have released 2691 * wake_sleeping_dependent() might have released
2692 * the runqueue, so break out if we got new 2692 * the runqueue, so break out if we got new
2693 * tasks meanwhile: 2693 * tasks meanwhile:
2694 */ 2694 */
2695 if (!rq->nr_running) 2695 if (!rq->nr_running)
2696 goto switch_tasks; 2696 goto switch_tasks;
2697 } 2697 }
2698 } else { 2698 } else {
2699 if (dependent_sleeper(cpu, rq)) { 2699 if (dependent_sleeper(cpu, rq)) {
2700 next = rq->idle; 2700 next = rq->idle;
2701 goto switch_tasks; 2701 goto switch_tasks;
2702 } 2702 }
2703 /* 2703 /*
2704 * dependent_sleeper() releases and reacquires the runqueue 2704 * dependent_sleeper() releases and reacquires the runqueue
2705 * lock, hence go into the idle loop if the rq went 2705 * lock, hence go into the idle loop if the rq went
2706 * empty meanwhile: 2706 * empty meanwhile:
2707 */ 2707 */
2708 if (unlikely(!rq->nr_running)) 2708 if (unlikely(!rq->nr_running))
2709 goto go_idle; 2709 goto go_idle;
2710 } 2710 }
2711 2711
2712 array = rq->active; 2712 array = rq->active;
2713 if (unlikely(!array->nr_active)) { 2713 if (unlikely(!array->nr_active)) {
2714 /* 2714 /*
2715 * Switch the active and expired arrays. 2715 * Switch the active and expired arrays.
2716 */ 2716 */
2717 schedstat_inc(rq, sched_switch); 2717 schedstat_inc(rq, sched_switch);
2718 rq->active = rq->expired; 2718 rq->active = rq->expired;
2719 rq->expired = array; 2719 rq->expired = array;
2720 array = rq->active; 2720 array = rq->active;
2721 rq->expired_timestamp = 0; 2721 rq->expired_timestamp = 0;
2722 rq->best_expired_prio = MAX_PRIO; 2722 rq->best_expired_prio = MAX_PRIO;
2723 } 2723 }
2724 2724
2725 idx = sched_find_first_bit(array->bitmap); 2725 idx = sched_find_first_bit(array->bitmap);
2726 queue = array->queue + idx; 2726 queue = array->queue + idx;
2727 next = list_entry(queue->next, task_t, run_list); 2727 next = list_entry(queue->next, task_t, run_list);
2728 2728
2729 if (!rt_task(next) && next->activated > 0) { 2729 if (!rt_task(next) && next->activated > 0) {
2730 unsigned long long delta = now - next->timestamp; 2730 unsigned long long delta = now - next->timestamp;
2731 if (unlikely((long long)(now - next->timestamp) < 0)) 2731 if (unlikely((long long)(now - next->timestamp) < 0))
2732 delta = 0; 2732 delta = 0;
2733 2733
2734 if (next->activated == 1) 2734 if (next->activated == 1)
2735 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; 2735 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
2736 2736
2737 array = next->array; 2737 array = next->array;
2738 dequeue_task(next, array); 2738 dequeue_task(next, array);
2739 recalc_task_prio(next, next->timestamp + delta); 2739 recalc_task_prio(next, next->timestamp + delta);
2740 enqueue_task(next, array); 2740 enqueue_task(next, array);
2741 } 2741 }
2742 next->activated = 0; 2742 next->activated = 0;
2743 switch_tasks: 2743 switch_tasks:
2744 if (next == rq->idle) 2744 if (next == rq->idle)
2745 schedstat_inc(rq, sched_goidle); 2745 schedstat_inc(rq, sched_goidle);
2746 prefetch(next); 2746 prefetch(next);
2747 clear_tsk_need_resched(prev); 2747 clear_tsk_need_resched(prev);
2748 rcu_qsctr_inc(task_cpu(prev)); 2748 rcu_qsctr_inc(task_cpu(prev));
2749 2749
2750 update_cpu_clock(prev, rq, now); 2750 update_cpu_clock(prev, rq, now);
2751 2751
2752 prev->sleep_avg -= run_time; 2752 prev->sleep_avg -= run_time;
2753 if ((long)prev->sleep_avg <= 0) 2753 if ((long)prev->sleep_avg <= 0)
2754 prev->sleep_avg = 0; 2754 prev->sleep_avg = 0;
2755 prev->timestamp = prev->last_ran = now; 2755 prev->timestamp = prev->last_ran = now;
2756 2756
2757 sched_info_switch(prev, next); 2757 sched_info_switch(prev, next);
2758 if (likely(prev != next)) { 2758 if (likely(prev != next)) {
2759 next->timestamp = now; 2759 next->timestamp = now;
2760 rq->nr_switches++; 2760 rq->nr_switches++;
2761 rq->curr = next; 2761 rq->curr = next;
2762 ++*switch_count; 2762 ++*switch_count;
2763 2763
2764 prepare_arch_switch(rq, next); 2764 prepare_arch_switch(rq, next);
2765 prev = context_switch(rq, prev, next); 2765 prev = context_switch(rq, prev, next);
2766 barrier(); 2766 barrier();
2767 2767
2768 finish_task_switch(prev); 2768 finish_task_switch(prev);
2769 } else 2769 } else
2770 spin_unlock_irq(&rq->lock); 2770 spin_unlock_irq(&rq->lock);
2771 2771
2772 prev = current; 2772 prev = current;
2773 if (unlikely(reacquire_kernel_lock(prev) < 0)) 2773 if (unlikely(reacquire_kernel_lock(prev) < 0))
2774 goto need_resched_nonpreemptible; 2774 goto need_resched_nonpreemptible;
2775 preempt_enable_no_resched(); 2775 preempt_enable_no_resched();
2776 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 2776 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
2777 goto need_resched; 2777 goto need_resched;
2778 } 2778 }
2779 2779
2780 EXPORT_SYMBOL(schedule); 2780 EXPORT_SYMBOL(schedule);
2781 2781
2782 #ifdef CONFIG_PREEMPT 2782 #ifdef CONFIG_PREEMPT
2783 /* 2783 /*
2784 * this is the entry point to schedule() from in-kernel preemption 2784 * this is the entry point to schedule() from in-kernel preemption
2785 * off of preempt_enable. Kernel preemptions off of return-from-interrupt 2785 * off of preempt_enable. Kernel preemptions off of return-from-interrupt
2786 * occur there and call schedule() directly. 2786 * occur there and call schedule() directly.
2787 */ 2787 */
2788 asmlinkage void __sched preempt_schedule(void) 2788 asmlinkage void __sched preempt_schedule(void)
2789 { 2789 {
2790 struct thread_info *ti = current_thread_info(); 2790 struct thread_info *ti = current_thread_info();
2791 #ifdef CONFIG_PREEMPT_BKL 2791 #ifdef CONFIG_PREEMPT_BKL
2792 struct task_struct *task = current; 2792 struct task_struct *task = current;
2793 int saved_lock_depth; 2793 int saved_lock_depth;
2794 #endif 2794 #endif
2795 /* 2795 /*
2796 * If there is a non-zero preempt_count or interrupts are disabled, 2796 * If there is a non-zero preempt_count or interrupts are disabled,
2797 * we do not want to preempt the current task. Just return.. 2797 * we do not want to preempt the current task. Just return..
2798 */ 2798 */
2799 if (unlikely(ti->preempt_count || irqs_disabled())) 2799 if (unlikely(ti->preempt_count || irqs_disabled()))
2800 return; 2800 return;
2801 2801
2802 need_resched: 2802 need_resched:
2803 add_preempt_count(PREEMPT_ACTIVE); 2803 add_preempt_count(PREEMPT_ACTIVE);
2804 /* 2804 /*
2805 * We keep the big kernel semaphore locked, but we 2805 * We keep the big kernel semaphore locked, but we
2806 * clear ->lock_depth so that schedule() doesn't 2806 * clear ->lock_depth so that schedule() doesn't
2807 * auto-release the semaphore: 2807 * auto-release the semaphore:
2808 */ 2808 */
2809 #ifdef CONFIG_PREEMPT_BKL 2809 #ifdef CONFIG_PREEMPT_BKL
2810 saved_lock_depth = task->lock_depth; 2810 saved_lock_depth = task->lock_depth;
2811 task->lock_depth = -1; 2811 task->lock_depth = -1;
2812 #endif 2812 #endif
2813 schedule(); 2813 schedule();
2814 #ifdef CONFIG_PREEMPT_BKL 2814 #ifdef CONFIG_PREEMPT_BKL
2815 task->lock_depth = saved_lock_depth; 2815 task->lock_depth = saved_lock_depth;
2816 #endif 2816 #endif
2817 sub_preempt_count(PREEMPT_ACTIVE); 2817 sub_preempt_count(PREEMPT_ACTIVE);
2818 2818
2819 /* we could miss a preemption opportunity between schedule and now */ 2819 /* we could miss a preemption opportunity between schedule and now */
2820 barrier(); 2820 barrier();
2821 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 2821 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
2822 goto need_resched; 2822 goto need_resched;
2823 } 2823 }
2824 2824
2825 EXPORT_SYMBOL(preempt_schedule); 2825 EXPORT_SYMBOL(preempt_schedule);
2826 2826
2827 /* 2827 /*
2828 * this is the entry point to schedule() from kernel preemption 2828 * this is the entry point to schedule() from kernel preemption
2829 * off of irq context. 2829 * off of irq context.
2830 * Note that this is called and returns with irqs disabled. This will 2830 * Note that this is called and returns with irqs disabled. This will
2831 * protect us against recursive calling from irq. 2831 * protect us against recursive calling from irq.
2832 */ 2832 */
2833 asmlinkage void __sched preempt_schedule_irq(void) 2833 asmlinkage void __sched preempt_schedule_irq(void)
2834 { 2834 {
2835 struct thread_info *ti = current_thread_info(); 2835 struct thread_info *ti = current_thread_info();
2836 #ifdef CONFIG_PREEMPT_BKL 2836 #ifdef CONFIG_PREEMPT_BKL
2837 struct task_struct *task = current; 2837 struct task_struct *task = current;
2838 int saved_lock_depth; 2838 int saved_lock_depth;
2839 #endif 2839 #endif
2840 /* Catch callers which need to be fixed */ 2840 /* Catch callers which need to be fixed */
2841 BUG_ON(ti->preempt_count || !irqs_disabled()); 2841 BUG_ON(ti->preempt_count || !irqs_disabled());
2842 2842
2843 need_resched: 2843 need_resched:
2844 add_preempt_count(PREEMPT_ACTIVE); 2844 add_preempt_count(PREEMPT_ACTIVE);
2845 /* 2845 /*
2846 * We keep the big kernel semaphore locked, but we 2846 * We keep the big kernel semaphore locked, but we
2847 * clear ->lock_depth so that schedule() doesn't 2847 * clear ->lock_depth so that schedule() doesn't
2848 * auto-release the semaphore: 2848 * auto-release the semaphore:
2849 */ 2849 */
2850 #ifdef CONFIG_PREEMPT_BKL 2850 #ifdef CONFIG_PREEMPT_BKL
2851 saved_lock_depth = task->lock_depth; 2851 saved_lock_depth = task->lock_depth;
2852 task->lock_depth = -1; 2852 task->lock_depth = -1;
2853 #endif 2853 #endif
2854 local_irq_enable(); 2854 local_irq_enable();
2855 schedule(); 2855 schedule();
2856 local_irq_disable(); 2856 local_irq_disable();
2857 #ifdef CONFIG_PREEMPT_BKL 2857 #ifdef CONFIG_PREEMPT_BKL
2858 task->lock_depth = saved_lock_depth; 2858 task->lock_depth = saved_lock_depth;
2859 #endif 2859 #endif
2860 sub_preempt_count(PREEMPT_ACTIVE); 2860 sub_preempt_count(PREEMPT_ACTIVE);
2861 2861
2862 /* we could miss a preemption opportunity between schedule and now */ 2862 /* we could miss a preemption opportunity between schedule and now */
2863 barrier(); 2863 barrier();
2864 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 2864 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
2865 goto need_resched; 2865 goto need_resched;
2866 } 2866 }
2867 2867
2868 #endif /* CONFIG_PREEMPT */ 2868 #endif /* CONFIG_PREEMPT */
2869 2869
2870 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) 2870 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
2871 { 2871 {
2872 task_t *p = curr->task; 2872 task_t *p = curr->task;
2873 return try_to_wake_up(p, mode, sync); 2873 return try_to_wake_up(p, mode, sync);
2874 } 2874 }
2875 2875
2876 EXPORT_SYMBOL(default_wake_function); 2876 EXPORT_SYMBOL(default_wake_function);
2877 2877
2878 /* 2878 /*
2879 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just 2879 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
2880 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve 2880 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
2881 * number) then we wake all the non-exclusive tasks and one exclusive task. 2881 * number) then we wake all the non-exclusive tasks and one exclusive task.
2882 * 2882 *
2883 * There are circumstances in which we can try to wake a task which has already 2883 * There are circumstances in which we can try to wake a task which has already
2884 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 2884 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
2885 * zero in this (rare) case, and we handle it by continuing to scan the queue. 2885 * zero in this (rare) case, and we handle it by continuing to scan the queue.
2886 */ 2886 */
2887 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 2887 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
2888 int nr_exclusive, int sync, void *key) 2888 int nr_exclusive, int sync, void *key)
2889 { 2889 {
2890 struct list_head *tmp, *next; 2890 struct list_head *tmp, *next;
2891 2891
2892 list_for_each_safe(tmp, next, &q->task_list) { 2892 list_for_each_safe(tmp, next, &q->task_list) {
2893 wait_queue_t *curr; 2893 wait_queue_t *curr;
2894 unsigned flags; 2894 unsigned flags;
2895 curr = list_entry(tmp, wait_queue_t, task_list); 2895 curr = list_entry(tmp, wait_queue_t, task_list);
2896 flags = curr->flags; 2896 flags = curr->flags;
2897 if (curr->func(curr, mode, sync, key) && 2897 if (curr->func(curr, mode, sync, key) &&
2898 (flags & WQ_FLAG_EXCLUSIVE) && 2898 (flags & WQ_FLAG_EXCLUSIVE) &&
2899 !--nr_exclusive) 2899 !--nr_exclusive)
2900 break; 2900 break;
2901 } 2901 }
2902 } 2902 }
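
In __wake_up_common() above, nr_exclusive is decremented only when a waiter marked WQ_FLAG_EXCLUSIVE is successfully woken, so a small positive count wakes every non-exclusive waiter plus that many exclusive ones, while nr_exclusive == 0 never satisfies the !--nr_exclusive stop test and therefore wakes everybody. The stand-alone model below reproduces that counting rule; the waiter array is a hypothetical stand-in for the wait queue, with exclusive waiters at the tail as __add_wait_queue_tail() arranges in practice.

#include <stdio.h>
#include <stdbool.h>

#define WQ_FLAG_EXCLUSIVE 0x01

struct waiter {
    unsigned int flags;
    bool woken;
};

/* Same stop rule as __wake_up_common(): only woken exclusive waiters count. */
static void wake_up_common(struct waiter *q, int n, int nr_exclusive)
{
    for (int i = 0; i < n; i++) {
        q[i].woken = true;      /* model: every wakeup callback succeeds */
        if ((q[i].flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
            break;
    }
}

int main(void)
{
    /* Exclusive waiters queue at the tail, behind the non-exclusive ones. */
    struct waiter q[4] = {
        { 0, false },
        { 0, false },
        { WQ_FLAG_EXCLUSIVE, false },
        { WQ_FLAG_EXCLUSIVE, false },
    };

    wake_up_common(q, 4, 1);    /* wakes waiters 0, 1 and one exclusive (2) */
    for (int i = 0; i < 4; i++)
        printf("waiter %d woken=%d\n", i, q[i].woken);
    return 0;
}
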
2903 2903
2904 /** 2904 /**
2905 * __wake_up - wake up threads blocked on a waitqueue. 2905 * __wake_up - wake up threads blocked on a waitqueue.
2906 * @q: the waitqueue 2906 * @q: the waitqueue
2907 * @mode: which threads 2907 * @mode: which threads
2908 * @nr_exclusive: how many wake-one or wake-many threads to wake up 2908 * @nr_exclusive: how many wake-one or wake-many threads to wake up
2909 * @key: is directly passed to the wakeup function 2909 * @key: is directly passed to the wakeup function
2910 */ 2910 */
2911 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, 2911 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
2912 int nr_exclusive, void *key) 2912 int nr_exclusive, void *key)
2913 { 2913 {
2914 unsigned long flags; 2914 unsigned long flags;
2915 2915
2916 spin_lock_irqsave(&q->lock, flags); 2916 spin_lock_irqsave(&q->lock, flags);
2917 __wake_up_common(q, mode, nr_exclusive, 0, key); 2917 __wake_up_common(q, mode, nr_exclusive, 0, key);
2918 spin_unlock_irqrestore(&q->lock, flags); 2918 spin_unlock_irqrestore(&q->lock, flags);
2919 } 2919 }
2920 2920
2921 EXPORT_SYMBOL(__wake_up); 2921 EXPORT_SYMBOL(__wake_up);
2922 2922
2923 /* 2923 /*
2924 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 2924 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
2925 */ 2925 */
2926 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) 2926 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
2927 { 2927 {
2928 __wake_up_common(q, mode, 1, 0, NULL); 2928 __wake_up_common(q, mode, 1, 0, NULL);
2929 } 2929 }
2930 2930
2931 /** 2931 /**
2932 * __wake_up_sync - wake up threads blocked on a waitqueue. 2932 * __wake_up_sync - wake up threads blocked on a waitqueue.
2933 * @q: the waitqueue 2933 * @q: the waitqueue
2934 * @mode: which threads 2934 * @mode: which threads
2935 * @nr_exclusive: how many wake-one or wake-many threads to wake up 2935 * @nr_exclusive: how many wake-one or wake-many threads to wake up
2936 * 2936 *
2937 * The sync wakeup differs in that the waker knows that it will schedule 2937 * The sync wakeup differs in that the waker knows that it will schedule
2938 * away soon, so while the target thread will be woken up, it will not 2938 * away soon, so while the target thread will be woken up, it will not
2939 * be migrated to another CPU - i.e. the two threads are 'synchronized' 2939 * be migrated to another CPU - i.e. the two threads are 'synchronized'
2940 * with each other. This can prevent needless bouncing between CPUs. 2940 * with each other. This can prevent needless bouncing between CPUs.
2941 * 2941 *
2942 * On UP it can prevent extra preemption. 2942 * On UP it can prevent extra preemption.
2943 */ 2943 */
2944 void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 2944 void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
2945 { 2945 {
2946 unsigned long flags; 2946 unsigned long flags;
2947 int sync = 1; 2947 int sync = 1;
2948 2948
2949 if (unlikely(!q)) 2949 if (unlikely(!q))
2950 return; 2950 return;
2951 2951
2952 if (unlikely(!nr_exclusive)) 2952 if (unlikely(!nr_exclusive))
2953 sync = 0; 2953 sync = 0;
2954 2954
2955 spin_lock_irqsave(&q->lock, flags); 2955 spin_lock_irqsave(&q->lock, flags);
2956 __wake_up_common(q, mode, nr_exclusive, sync, NULL); 2956 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
2957 spin_unlock_irqrestore(&q->lock, flags); 2957 spin_unlock_irqrestore(&q->lock, flags);
2958 } 2958 }
2959 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 2959 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
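
For illustration (not part of this commit): the sync variant is normally reached through the wake_up_interruptible_sync() wrapper when the waker is itself about to block, so migrating the woken task to another CPU would be wasted work. A sketch with hypothetical names:

#include <linux/wait.h>

/* waker that will sleep shortly afterwards: use the sync variant so the
 * wakee is not pushed to another CPU just before we go idle here */
static void my_signal_then_sleep(wait_queue_head_t *wq, int *cond)
{
        *cond = 1;
        wake_up_interruptible_sync(wq);
        /* ... caller proceeds to wait/schedule() soon after ... */
}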
2960 2960
2961 void fastcall complete(struct completion *x) 2961 void fastcall complete(struct completion *x)
2962 { 2962 {
2963 unsigned long flags; 2963 unsigned long flags;
2964 2964
2965 spin_lock_irqsave(&x->wait.lock, flags); 2965 spin_lock_irqsave(&x->wait.lock, flags);
2966 x->done++; 2966 x->done++;
2967 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 2967 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
2968 1, 0, NULL); 2968 1, 0, NULL);
2969 spin_unlock_irqrestore(&x->wait.lock, flags); 2969 spin_unlock_irqrestore(&x->wait.lock, flags);
2970 } 2970 }
2971 EXPORT_SYMBOL(complete); 2971 EXPORT_SYMBOL(complete);
2972 2972
2973 void fastcall complete_all(struct completion *x) 2973 void fastcall complete_all(struct completion *x)
2974 { 2974 {
2975 unsigned long flags; 2975 unsigned long flags;
2976 2976
2977 spin_lock_irqsave(&x->wait.lock, flags); 2977 spin_lock_irqsave(&x->wait.lock, flags);
2978 x->done += UINT_MAX/2; 2978 x->done += UINT_MAX/2;
2979 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 2979 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
2980 0, 0, NULL); 2980 0, 0, NULL);
2981 spin_unlock_irqrestore(&x->wait.lock, flags); 2981 spin_unlock_irqrestore(&x->wait.lock, flags);
2982 } 2982 }
2983 EXPORT_SYMBOL(complete_all); 2983 EXPORT_SYMBOL(complete_all);
2984 2984
2985 void fastcall __sched wait_for_completion(struct completion *x) 2985 void fastcall __sched wait_for_completion(struct completion *x)
2986 { 2986 {
2987 might_sleep(); 2987 might_sleep();
2988 spin_lock_irq(&x->wait.lock); 2988 spin_lock_irq(&x->wait.lock);
2989 if (!x->done) { 2989 if (!x->done) {
2990 DECLARE_WAITQUEUE(wait, current); 2990 DECLARE_WAITQUEUE(wait, current);
2991 2991
2992 wait.flags |= WQ_FLAG_EXCLUSIVE; 2992 wait.flags |= WQ_FLAG_EXCLUSIVE;
2993 __add_wait_queue_tail(&x->wait, &wait); 2993 __add_wait_queue_tail(&x->wait, &wait);
2994 do { 2994 do {
2995 __set_current_state(TASK_UNINTERRUPTIBLE); 2995 __set_current_state(TASK_UNINTERRUPTIBLE);
2996 spin_unlock_irq(&x->wait.lock); 2996 spin_unlock_irq(&x->wait.lock);
2997 schedule(); 2997 schedule();
2998 spin_lock_irq(&x->wait.lock); 2998 spin_lock_irq(&x->wait.lock);
2999 } while (!x->done); 2999 } while (!x->done);
3000 __remove_wait_queue(&x->wait, &wait); 3000 __remove_wait_queue(&x->wait, &wait);
3001 } 3001 }
3002 x->done--; 3002 x->done--;
3003 spin_unlock_irq(&x->wait.lock); 3003 spin_unlock_irq(&x->wait.lock);
3004 } 3004 }
3005 EXPORT_SYMBOL(wait_for_completion); 3005 EXPORT_SYMBOL(wait_for_completion);
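
For illustration (not part of this commit): the classic pattern built on complete()/wait_for_completion() - one side announces that an event has happened, the other blocks uninterruptibly until it has. The setup_done name is hypothetical.

#include <linux/completion.h>

static DECLARE_COMPLETION(setup_done);

/* worker side: signal that initialisation has finished */
static void my_worker_init(void)
{
        /* ... perform the setup work ... */
        complete(&setup_done);          /* bumps ->done and wakes one waiter */
}

/* caller side: block until the worker has signalled */
static void my_wait_for_worker(void)
{
        wait_for_completion(&setup_done);
}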
3006 3006
3007 unsigned long fastcall __sched 3007 unsigned long fastcall __sched
3008 wait_for_completion_timeout(struct completion *x, unsigned long timeout) 3008 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3009 { 3009 {
3010 might_sleep(); 3010 might_sleep();
3011 3011
3012 spin_lock_irq(&x->wait.lock); 3012 spin_lock_irq(&x->wait.lock);
3013 if (!x->done) { 3013 if (!x->done) {
3014 DECLARE_WAITQUEUE(wait, current); 3014 DECLARE_WAITQUEUE(wait, current);
3015 3015
3016 wait.flags |= WQ_FLAG_EXCLUSIVE; 3016 wait.flags |= WQ_FLAG_EXCLUSIVE;
3017 __add_wait_queue_tail(&x->wait, &wait); 3017 __add_wait_queue_tail(&x->wait, &wait);
3018 do { 3018 do {
3019 __set_current_state(TASK_UNINTERRUPTIBLE); 3019 __set_current_state(TASK_UNINTERRUPTIBLE);
3020 spin_unlock_irq(&x->wait.lock); 3020 spin_unlock_irq(&x->wait.lock);
3021 timeout = schedule_timeout(timeout); 3021 timeout = schedule_timeout(timeout);
3022 spin_lock_irq(&x->wait.lock); 3022 spin_lock_irq(&x->wait.lock);
3023 if (!timeout) { 3023 if (!timeout) {
3024 __remove_wait_queue(&x->wait, &wait); 3024 __remove_wait_queue(&x->wait, &wait);
3025 goto out; 3025 goto out;
3026 } 3026 }
3027 } while (!x->done); 3027 } while (!x->done);
3028 __remove_wait_queue(&x->wait, &wait); 3028 __remove_wait_queue(&x->wait, &wait);
3029 } 3029 }
3030 x->done--; 3030 x->done--;
3031 out: 3031 out:
3032 spin_unlock_irq(&x->wait.lock); 3032 spin_unlock_irq(&x->wait.lock);
3033 return timeout; 3033 return timeout;
3034 } 3034 }
3035 EXPORT_SYMBOL(wait_for_completion_timeout); 3035 EXPORT_SYMBOL(wait_for_completion_timeout);
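
For illustration (not part of this commit): the timeout variant returns the remaining jiffies, or 0 if the completion never arrived, so callers typically translate a zero return into an error. A sketch with a hypothetical one-second bound:

#include <linux/completion.h>
#include <linux/errno.h>
#include <linux/param.h>        /* HZ */

/* wait at most one second for *done to be completed */
static int my_wait_one_second(struct completion *done)
{
        unsigned long left = wait_for_completion_timeout(done, HZ);

        return left ? 0 : -ETIMEDOUT;
}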
3036 3036
3037 int fastcall __sched wait_for_completion_interruptible(struct completion *x) 3037 int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3038 { 3038 {
3039 int ret = 0; 3039 int ret = 0;
3040 3040
3041 might_sleep(); 3041 might_sleep();
3042 3042
3043 spin_lock_irq(&x->wait.lock); 3043 spin_lock_irq(&x->wait.lock);
3044 if (!x->done) { 3044 if (!x->done) {
3045 DECLARE_WAITQUEUE(wait, current); 3045 DECLARE_WAITQUEUE(wait, current);
3046 3046
3047 wait.flags |= WQ_FLAG_EXCLUSIVE; 3047 wait.flags |= WQ_FLAG_EXCLUSIVE;
3048 __add_wait_queue_tail(&x->wait, &wait); 3048 __add_wait_queue_tail(&x->wait, &wait);
3049 do { 3049 do {
3050 if (signal_pending(current)) { 3050 if (signal_pending(current)) {
3051 ret = -ERESTARTSYS; 3051 ret = -ERESTARTSYS;
3052 __remove_wait_queue(&x->wait, &wait); 3052 __remove_wait_queue(&x->wait, &wait);
3053 goto out; 3053 goto out;
3054 } 3054 }
3055 __set_current_state(TASK_INTERRUPTIBLE); 3055 __set_current_state(TASK_INTERRUPTIBLE);
3056 spin_unlock_irq(&x->wait.lock); 3056 spin_unlock_irq(&x->wait.lock);
3057 schedule(); 3057 schedule();
3058 spin_lock_irq(&x->wait.lock); 3058 spin_lock_irq(&x->wait.lock);
3059 } while (!x->done); 3059 } while (!x->done);
3060 __remove_wait_queue(&x->wait, &wait); 3060 __remove_wait_queue(&x->wait, &wait);
3061 } 3061 }
3062 x->done--; 3062 x->done--;
3063 out: 3063 out:
3064 spin_unlock_irq(&x->wait.lock); 3064 spin_unlock_irq(&x->wait.lock);
3065 3065
3066 return ret; 3066 return ret;
3067 } 3067 }
3068 EXPORT_SYMBOL(wait_for_completion_interruptible); 3068 EXPORT_SYMBOL(wait_for_completion_interruptible);
3069 3069
3070 unsigned long fastcall __sched 3070 unsigned long fastcall __sched
3071 wait_for_completion_interruptible_timeout(struct completion *x, 3071 wait_for_completion_interruptible_timeout(struct completion *x,
3072 unsigned long timeout) 3072 unsigned long timeout)
3073 { 3073 {
3074 might_sleep(); 3074 might_sleep();
3075 3075
3076 spin_lock_irq(&x->wait.lock); 3076 spin_lock_irq(&x->wait.lock);
3077 if (!x->done) { 3077 if (!x->done) {
3078 DECLARE_WAITQUEUE(wait, current); 3078 DECLARE_WAITQUEUE(wait, current);
3079 3079
3080 wait.flags |= WQ_FLAG_EXCLUSIVE; 3080 wait.flags |= WQ_FLAG_EXCLUSIVE;
3081 __add_wait_queue_tail(&x->wait, &wait); 3081 __add_wait_queue_tail(&x->wait, &wait);
3082 do { 3082 do {
3083 if (signal_pending(current)) { 3083 if (signal_pending(current)) {
3084 timeout = -ERESTARTSYS; 3084 timeout = -ERESTARTSYS;
3085 __remove_wait_queue(&x->wait, &wait); 3085 __remove_wait_queue(&x->wait, &wait);
3086 goto out; 3086 goto out;
3087 } 3087 }
3088 __set_current_state(TASK_INTERRUPTIBLE); 3088 __set_current_state(TASK_INTERRUPTIBLE);
3089 spin_unlock_irq(&x->wait.lock); 3089 spin_unlock_irq(&x->wait.lock);
3090 timeout = schedule_timeout(timeout); 3090 timeout = schedule_timeout(timeout);
3091 spin_lock_irq(&x->wait.lock); 3091 spin_lock_irq(&x->wait.lock);
3092 if (!timeout) { 3092 if (!timeout) {
3093 __remove_wait_queue(&x->wait, &wait); 3093 __remove_wait_queue(&x->wait, &wait);
3094 goto out; 3094 goto out;
3095 } 3095 }
3096 } while (!x->done); 3096 } while (!x->done);
3097 __remove_wait_queue(&x->wait, &wait); 3097 __remove_wait_queue(&x->wait, &wait);
3098 } 3098 }
3099 x->done--; 3099 x->done--;
3100 out: 3100 out:
3101 spin_unlock_irq(&x->wait.lock); 3101 spin_unlock_irq(&x->wait.lock);
3102 return timeout; 3102 return timeout;
3103 } 3103 }
3104 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 3104 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3105 3105
3106 3106
3107 #define SLEEP_ON_VAR \ 3107 #define SLEEP_ON_VAR \
3108 unsigned long flags; \ 3108 unsigned long flags; \
3109 wait_queue_t wait; \ 3109 wait_queue_t wait; \
3110 init_waitqueue_entry(&wait, current); 3110 init_waitqueue_entry(&wait, current);
3111 3111
3112 #define SLEEP_ON_HEAD \ 3112 #define SLEEP_ON_HEAD \
3113 spin_lock_irqsave(&q->lock,flags); \ 3113 spin_lock_irqsave(&q->lock,flags); \
3114 __add_wait_queue(q, &wait); \ 3114 __add_wait_queue(q, &wait); \
3115 spin_unlock(&q->lock); 3115 spin_unlock(&q->lock);
3116 3116
3117 #define SLEEP_ON_TAIL \ 3117 #define SLEEP_ON_TAIL \
3118 spin_lock_irq(&q->lock); \ 3118 spin_lock_irq(&q->lock); \
3119 __remove_wait_queue(q, &wait); \ 3119 __remove_wait_queue(q, &wait); \
3120 spin_unlock_irqrestore(&q->lock, flags); 3120 spin_unlock_irqrestore(&q->lock, flags);
3121 3121
3122 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) 3122 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3123 { 3123 {
3124 SLEEP_ON_VAR 3124 SLEEP_ON_VAR
3125 3125
3126 current->state = TASK_INTERRUPTIBLE; 3126 current->state = TASK_INTERRUPTIBLE;
3127 3127
3128 SLEEP_ON_HEAD 3128 SLEEP_ON_HEAD
3129 schedule(); 3129 schedule();
3130 SLEEP_ON_TAIL 3130 SLEEP_ON_TAIL
3131 } 3131 }
3132 3132
3133 EXPORT_SYMBOL(interruptible_sleep_on); 3133 EXPORT_SYMBOL(interruptible_sleep_on);
3134 3134
3135 long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 3135 long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3136 { 3136 {
3137 SLEEP_ON_VAR 3137 SLEEP_ON_VAR
3138 3138
3139 current->state = TASK_INTERRUPTIBLE; 3139 current->state = TASK_INTERRUPTIBLE;
3140 3140
3141 SLEEP_ON_HEAD 3141 SLEEP_ON_HEAD
3142 timeout = schedule_timeout(timeout); 3142 timeout = schedule_timeout(timeout);
3143 SLEEP_ON_TAIL 3143 SLEEP_ON_TAIL
3144 3144
3145 return timeout; 3145 return timeout;
3146 } 3146 }
3147 3147
3148 EXPORT_SYMBOL(interruptible_sleep_on_timeout); 3148 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3149 3149
3150 void fastcall __sched sleep_on(wait_queue_head_t *q) 3150 void fastcall __sched sleep_on(wait_queue_head_t *q)
3151 { 3151 {
3152 SLEEP_ON_VAR 3152 SLEEP_ON_VAR
3153 3153
3154 current->state = TASK_UNINTERRUPTIBLE; 3154 current->state = TASK_UNINTERRUPTIBLE;
3155 3155
3156 SLEEP_ON_HEAD 3156 SLEEP_ON_HEAD
3157 schedule(); 3157 schedule();
3158 SLEEP_ON_TAIL 3158 SLEEP_ON_TAIL
3159 } 3159 }
3160 3160
3161 EXPORT_SYMBOL(sleep_on); 3161 EXPORT_SYMBOL(sleep_on);
3162 3162
3163 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 3163 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3164 { 3164 {
3165 SLEEP_ON_VAR 3165 SLEEP_ON_VAR
3166 3166
3167 current->state = TASK_UNINTERRUPTIBLE; 3167 current->state = TASK_UNINTERRUPTIBLE;
3168 3168
3169 SLEEP_ON_HEAD 3169 SLEEP_ON_HEAD
3170 timeout = schedule_timeout(timeout); 3170 timeout = schedule_timeout(timeout);
3171 SLEEP_ON_TAIL 3171 SLEEP_ON_TAIL
3172 3172
3173 return timeout; 3173 return timeout;
3174 } 3174 }
3175 3175
3176 EXPORT_SYMBOL(sleep_on_timeout); 3176 EXPORT_SYMBOL(sleep_on_timeout);
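
For illustration (not part of this commit): substituting the three SLEEP_ON_* macros above shows what interruptible_sleep_on() boils down to. No condition is checked anywhere, so a wakeup that fires before the task has queued itself is simply lost, which is why the sleep_on() family is considered racy and avoided in new code.

/* roughly what interruptible_sleep_on(q) expands to */
static void interruptible_sleep_on_expanded(wait_queue_head_t *q)
{
        unsigned long flags;                            /* SLEEP_ON_VAR */
        wait_queue_t wait;

        init_waitqueue_entry(&wait, current);

        current->state = TASK_INTERRUPTIBLE;

        spin_lock_irqsave(&q->lock, flags);             /* SLEEP_ON_HEAD */
        __add_wait_queue(q, &wait);
        spin_unlock(&q->lock);

        schedule();

        spin_lock_irq(&q->lock);                        /* SLEEP_ON_TAIL */
        __remove_wait_queue(q, &wait);
        spin_unlock_irqrestore(&q->lock, flags);
}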
3177 3177
3178 void set_user_nice(task_t *p, long nice) 3178 void set_user_nice(task_t *p, long nice)
3179 { 3179 {
3180 unsigned long flags; 3180 unsigned long flags;
3181 prio_array_t *array; 3181 prio_array_t *array;
3182 runqueue_t *rq; 3182 runqueue_t *rq;
3183 int old_prio, new_prio, delta; 3183 int old_prio, new_prio, delta;
3184 3184
3185 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 3185 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3186 return; 3186 return;
3187 /* 3187 /*
3188 * We have to be careful, if called from sys_setpriority(), 3188 * We have to be careful, if called from sys_setpriority(),
3189 * the task might be in the middle of scheduling on another CPU. 3189 * the task might be in the middle of scheduling on another CPU.
3190 */ 3190 */
3191 rq = task_rq_lock(p, &flags); 3191 rq = task_rq_lock(p, &flags);
3192 /* 3192 /*
3193 * The RT priorities are set via sched_setscheduler(), but we still 3193 * The RT priorities are set via sched_setscheduler(), but we still
3194 * allow the 'normal' nice value to be set - but as expected 3194 * allow the 'normal' nice value to be set - but as expected
3195 * it won't have any effect on scheduling as long as the task is 3195 * it won't have any effect on scheduling as long as the task is
3196 * not SCHED_NORMAL: 3196 * not SCHED_NORMAL:
3197 */ 3197 */
3198 if (rt_task(p)) { 3198 if (rt_task(p)) {
3199 p->static_prio = NICE_TO_PRIO(nice); 3199 p->static_prio = NICE_TO_PRIO(nice);
3200 goto out_unlock; 3200 goto out_unlock;
3201 } 3201 }
3202 array = p->array; 3202 array = p->array;
3203 if (array) 3203 if (array)
3204 dequeue_task(p, array); 3204 dequeue_task(p, array);
3205 3205
3206 old_prio = p->prio; 3206 old_prio = p->prio;
3207 new_prio = NICE_TO_PRIO(nice); 3207 new_prio = NICE_TO_PRIO(nice);
3208 delta = new_prio - old_prio; 3208 delta = new_prio - old_prio;
3209 p->static_prio = NICE_TO_PRIO(nice); 3209 p->static_prio = NICE_TO_PRIO(nice);
3210 p->prio += delta; 3210 p->prio += delta;
3211 3211
3212 if (array) { 3212 if (array) {
3213 enqueue_task(p, array); 3213 enqueue_task(p, array);
3214 /* 3214 /*
3215 * If the task increased its priority or is running and 3215 * If the task increased its priority or is running and
3216 * lowered its priority, then reschedule its CPU: 3216 * lowered its priority, then reschedule its CPU:
3217 */ 3217 */
3218 if (delta < 0 || (delta > 0 && task_running(rq, p))) 3218 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3219 resched_task(rq->curr); 3219 resched_task(rq->curr);
3220 } 3220 }
3221 out_unlock: 3221 out_unlock:
3222 task_rq_unlock(rq, &flags); 3222 task_rq_unlock(rq, &flags);
3223 } 3223 }
3224 3224
3225 EXPORT_SYMBOL(set_user_nice); 3225 EXPORT_SYMBOL(set_user_nice);
3226 3226
3227 /* 3227 /*
3228 * can_nice - check if a task can reduce its nice value 3228 * can_nice - check if a task can reduce its nice value
3229 * @p: task 3229 * @p: task
3230 * @nice: nice value 3230 * @nice: nice value
3231 */ 3231 */
3232 int can_nice(const task_t *p, const int nice) 3232 int can_nice(const task_t *p, const int nice)
3233 { 3233 {
3234 /* convert nice value [19,-20] to rlimit style value [0,39] */ 3234 /* convert nice value [19,-20] to rlimit style value [0,39] */
3235 int nice_rlim = 19 - nice; 3235 int nice_rlim = 19 - nice;
3236 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || 3236 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3237 capable(CAP_SYS_NICE)); 3237 capable(CAP_SYS_NICE));
3238 } 3238 }
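
A quick worked example of the conversion above: nice values run from 19 (weakest) down to -20 (strongest), while the rlimit-style scale runs the other way from 0 to 39. Asking for nice -10 therefore gives nice_rlim = 19 - (-10) = 29, so the request is allowed only if RLIMIT_NICE's current limit is at least 29 or the caller holds CAP_SYS_NICE.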
3239 3239
3240 #ifdef __ARCH_WANT_SYS_NICE 3240 #ifdef __ARCH_WANT_SYS_NICE
3241 3241
3242 /* 3242 /*
3243 * sys_nice - change the priority of the current process. 3243 * sys_nice - change the priority of the current process.
3244 * @increment: priority increment 3244 * @increment: priority increment
3245 * 3245 *
3246 * sys_setpriority is a more generic, but much slower function that 3246 * sys_setpriority is a more generic, but much slower function that
3247 * does similar things. 3247 * does similar things.
3248 */ 3248 */
3249 asmlinkage long sys_nice(int increment) 3249 asmlinkage long sys_nice(int increment)
3250 { 3250 {
3251 int retval; 3251 int retval;
3252 long nice; 3252 long nice;
3253 3253
3254 /* 3254 /*
3255 * Setpriority might change our priority at the same moment. 3255 * Setpriority might change our priority at the same moment.
3256 * We don't have to worry. Conceptually one call occurs first 3256 * We don't have to worry. Conceptually one call occurs first
3257 * and we have a single winner. 3257 * and we have a single winner.
3258 */ 3258 */
3259 if (increment < -40) 3259 if (increment < -40)
3260 increment = -40; 3260 increment = -40;
3261 if (increment > 40) 3261 if (increment > 40)
3262 increment = 40; 3262 increment = 40;
3263 3263
3264 nice = PRIO_TO_NICE(current->static_prio) + increment; 3264 nice = PRIO_TO_NICE(current->static_prio) + increment;
3265 if (nice < -20) 3265 if (nice < -20)
3266 nice = -20; 3266 nice = -20;
3267 if (nice > 19) 3267 if (nice > 19)
3268 nice = 19; 3268 nice = 19;
3269 3269
3270 if (increment < 0 && !can_nice(current, nice)) 3270 if (increment < 0 && !can_nice(current, nice))
3271 return -EPERM; 3271 return -EPERM;
3272 3272
3273 retval = security_task_setnice(current, nice); 3273 retval = security_task_setnice(current, nice);
3274 if (retval) 3274 if (retval)
3275 return retval; 3275 return retval;
3276 3276
3277 set_user_nice(current, nice); 3277 set_user_nice(current, nice);
3278 return 0; 3278 return 0;
3279 } 3279 }
3280 3280
3281 #endif 3281 #endif
3282 3282
3283 /** 3283 /**
3284 * task_prio - return the priority value of a given task. 3284 * task_prio - return the priority value of a given task.
3285 * @p: the task in question. 3285 * @p: the task in question.
3286 * 3286 *
3287 * This is the priority value as seen by users in /proc. 3287 * This is the priority value as seen by users in /proc.
3288 * RT tasks are offset by -200. Normal tasks are centered 3288 * RT tasks are offset by -200. Normal tasks are centered
3289 * around 0, value goes from -16 to +15. 3289 * around 0, value goes from -16 to +15.
3290 */ 3290 */
3291 int task_prio(const task_t *p) 3291 int task_prio(const task_t *p)
3292 { 3292 {
3293 return p->prio - MAX_RT_PRIO; 3293 return p->prio - MAX_RT_PRIO;
3294 } 3294 }
3295 3295
3296 /** 3296 /**
3297 * task_nice - return the nice value of a given task. 3297 * task_nice - return the nice value of a given task.
3298 * @p: the task in question. 3298 * @p: the task in question.
3299 */ 3299 */
3300 int task_nice(const task_t *p) 3300 int task_nice(const task_t *p)
3301 { 3301 {
3302 return TASK_NICE(p); 3302 return TASK_NICE(p);
3303 } 3303 }
3304 3304
3305 /* 3305 /*
3306 * The only users of task_nice are binfmt_elf and binfmt_elf32. 3306 * The only users of task_nice are binfmt_elf and binfmt_elf32.
3307 * binfmt_elf is no longer modular, but binfmt_elf32 still is. 3307 * binfmt_elf is no longer modular, but binfmt_elf32 still is.
3308 * Therefore, task_nice is needed if there is a compat_mode. 3308 * Therefore, task_nice is needed if there is a compat_mode.
3309 */ 3309 */
3310 #ifdef CONFIG_COMPAT 3310 #ifdef CONFIG_COMPAT
3311 EXPORT_SYMBOL_GPL(task_nice); 3311 EXPORT_SYMBOL_GPL(task_nice);
3312 #endif 3312 #endif
3313 3313
3314 /** 3314 /**
3315 * idle_cpu - is a given cpu idle currently? 3315 * idle_cpu - is a given cpu idle currently?
3316 * @cpu: the processor in question. 3316 * @cpu: the processor in question.
3317 */ 3317 */
3318 int idle_cpu(int cpu) 3318 int idle_cpu(int cpu)
3319 { 3319 {
3320 return cpu_curr(cpu) == cpu_rq(cpu)->idle; 3320 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
3321 } 3321 }
3322 3322
3323 EXPORT_SYMBOL_GPL(idle_cpu); 3323 EXPORT_SYMBOL_GPL(idle_cpu);
3324 3324
3325 /** 3325 /**
3326 * idle_task - return the idle task for a given cpu. 3326 * idle_task - return the idle task for a given cpu.
3327 * @cpu: the processor in question. 3327 * @cpu: the processor in question.
3328 */ 3328 */
3329 task_t *idle_task(int cpu) 3329 task_t *idle_task(int cpu)
3330 { 3330 {
3331 return cpu_rq(cpu)->idle; 3331 return cpu_rq(cpu)->idle;
3332 } 3332 }
3333 3333
3334 /** 3334 /**
3335 * find_process_by_pid - find a process with a matching PID value. 3335 * find_process_by_pid - find a process with a matching PID value.
3336 * @pid: the pid in question. 3336 * @pid: the pid in question.
3337 */ 3337 */
3338 static inline task_t *find_process_by_pid(pid_t pid) 3338 static inline task_t *find_process_by_pid(pid_t pid)
3339 { 3339 {
3340 return pid ? find_task_by_pid(pid) : current; 3340 return pid ? find_task_by_pid(pid) : current;
3341 } 3341 }
3342 3342
3343 /* Actually do priority change: must hold rq lock. */ 3343 /* Actually do priority change: must hold rq lock. */
3344 static void __setscheduler(struct task_struct *p, int policy, int prio) 3344 static void __setscheduler(struct task_struct *p, int policy, int prio)
3345 { 3345 {
3346 BUG_ON(p->array); 3346 BUG_ON(p->array);
3347 p->policy = policy; 3347 p->policy = policy;
3348 p->rt_priority = prio; 3348 p->rt_priority = prio;
3349 if (policy != SCHED_NORMAL) 3349 if (policy != SCHED_NORMAL)
3350 p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority; 3350 p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
3351 else 3351 else
3352 p->prio = p->static_prio; 3352 p->prio = p->static_prio;
3353 } 3353 }
3354 3354
3355 /** 3355 /**
3356 * sched_setscheduler - change the scheduling policy and/or RT priority of 3356 * sched_setscheduler - change the scheduling policy and/or RT priority of
3357 * a thread. 3357 * a thread.
3358 * @p: the task in question. 3358 * @p: the task in question.
3359 * @policy: new policy. 3359 * @policy: new policy.
3360 * @param: structure containing the new RT priority. 3360 * @param: structure containing the new RT priority.
3361 */ 3361 */
3362 int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) 3362 int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param)
3363 { 3363 {
3364 int retval; 3364 int retval;
3365 int oldprio, oldpolicy = -1; 3365 int oldprio, oldpolicy = -1;
3366 prio_array_t *array; 3366 prio_array_t *array;
3367 unsigned long flags; 3367 unsigned long flags;
3368 runqueue_t *rq; 3368 runqueue_t *rq;
3369 3369
3370 recheck: 3370 recheck:
3371 /* double check policy once rq lock held */ 3371 /* double check policy once rq lock held */
3372 if (policy < 0) 3372 if (policy < 0)
3373 policy = oldpolicy = p->policy; 3373 policy = oldpolicy = p->policy;
3374 else if (policy != SCHED_FIFO && policy != SCHED_RR && 3374 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
3375 policy != SCHED_NORMAL) 3375 policy != SCHED_NORMAL)
3376 return -EINVAL; 3376 return -EINVAL;
3377 /* 3377 /*
3378 * Valid priorities for SCHED_FIFO and SCHED_RR are 3378 * Valid priorities for SCHED_FIFO and SCHED_RR are
3379 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. 3379 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
3380 */ 3380 */
3381 if (param->sched_priority < 0 || 3381 if (param->sched_priority < 0 ||
3382 param->sched_priority > MAX_USER_RT_PRIO-1) 3382 param->sched_priority > MAX_USER_RT_PRIO-1)
3383 return -EINVAL; 3383 return -EINVAL;
3384 if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) 3384 if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
3385 return -EINVAL; 3385 return -EINVAL;
3386 3386
3387 if ((policy == SCHED_FIFO || policy == SCHED_RR) && 3387 if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
3388 param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur && 3388 param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur &&
3389 !capable(CAP_SYS_NICE)) 3389 !capable(CAP_SYS_NICE))
3390 return -EPERM; 3390 return -EPERM;
3391 if ((current->euid != p->euid) && (current->euid != p->uid) && 3391 if ((current->euid != p->euid) && (current->euid != p->uid) &&
3392 !capable(CAP_SYS_NICE)) 3392 !capable(CAP_SYS_NICE))
3393 return -EPERM; 3393 return -EPERM;
3394 3394
3395 retval = security_task_setscheduler(p, policy, param); 3395 retval = security_task_setscheduler(p, policy, param);
3396 if (retval) 3396 if (retval)
3397 return retval; 3397 return retval;
3398 /* 3398 /*
3399 * To be able to change p->policy safely, the appropriate 3399 * To be able to change p->policy safely, the appropriate
3400 * runqueue lock must be held. 3400 * runqueue lock must be held.
3401 */ 3401 */
3402 rq = task_rq_lock(p, &flags); 3402 rq = task_rq_lock(p, &flags);
3403 /* recheck policy now with rq lock held */ 3403 /* recheck policy now with rq lock held */
3404 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 3404 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
3405 policy = oldpolicy = -1; 3405 policy = oldpolicy = -1;
3406 task_rq_unlock(rq, &flags); 3406 task_rq_unlock(rq, &flags);
3407 goto recheck; 3407 goto recheck;
3408 } 3408 }
3409 array = p->array; 3409 array = p->array;
3410 if (array) 3410 if (array)
3411 deactivate_task(p, rq); 3411 deactivate_task(p, rq);
3412 oldprio = p->prio; 3412 oldprio = p->prio;
3413 __setscheduler(p, policy, param->sched_priority); 3413 __setscheduler(p, policy, param->sched_priority);
3414 if (array) { 3414 if (array) {
3415 __activate_task(p, rq); 3415 __activate_task(p, rq);
3416 /* 3416 /*
3417 * Reschedule if we are currently running on this runqueue and 3417 * Reschedule if we are currently running on this runqueue and
3418 * our priority decreased, or if we are not currently running on 3418 * our priority decreased, or if we are not currently running on
3419 * this runqueue and our priority is higher than the current's 3419 * this runqueue and our priority is higher than the current's
3420 */ 3420 */
3421 if (task_running(rq, p)) { 3421 if (task_running(rq, p)) {
3422 if (p->prio > oldprio) 3422 if (p->prio > oldprio)
3423 resched_task(rq->curr); 3423 resched_task(rq->curr);
3424 } else if (TASK_PREEMPTS_CURR(p, rq)) 3424 } else if (TASK_PREEMPTS_CURR(p, rq))
3425 resched_task(rq->curr); 3425 resched_task(rq->curr);
3426 } 3426 }
3427 task_rq_unlock(rq, &flags); 3427 task_rq_unlock(rq, &flags);
3428 return 0; 3428 return 0;
3429 } 3429 }
3430 EXPORT_SYMBOL_GPL(sched_setscheduler); 3430 EXPORT_SYMBOL_GPL(sched_setscheduler);
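
For illustration (not part of this commit): an in-kernel caller of the exported sched_setscheduler() fills a struct sched_param and picks a priority in the 1..MAX_USER_RT_PRIO-1 range documented above. The helper name and the choice of priority 50 are arbitrary.

#include <linux/sched.h>

/* make the given task an RT round-robin task at priority 50 */
static int my_make_rt(struct task_struct *task)
{
        struct sched_param param = { .sched_priority = 50 };

        return sched_setscheduler(task, SCHED_RR, &param);
}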
3431 3431
3432 static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 3432 static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3433 { 3433 {
3434 int retval; 3434 int retval;
3435 struct sched_param lparam; 3435 struct sched_param lparam;
3436 struct task_struct *p; 3436 struct task_struct *p;
3437 3437
3438 if (!param || pid < 0) 3438 if (!param || pid < 0)
3439 return -EINVAL; 3439 return -EINVAL;
3440 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 3440 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
3441 return -EFAULT; 3441 return -EFAULT;
3442 read_lock_irq(&tasklist_lock); 3442 read_lock_irq(&tasklist_lock);
3443 p = find_process_by_pid(pid); 3443 p = find_process_by_pid(pid);
3444 if (!p) { 3444 if (!p) {
3445 read_unlock_irq(&tasklist_lock); 3445 read_unlock_irq(&tasklist_lock);
3446 return -ESRCH; 3446 return -ESRCH;
3447 } 3447 }
3448 retval = sched_setscheduler(p, policy, &lparam); 3448 retval = sched_setscheduler(p, policy, &lparam);
3449 read_unlock_irq(&tasklist_lock); 3449 read_unlock_irq(&tasklist_lock);
3450 return retval; 3450 return retval;
3451 } 3451 }
3452 3452
3453 /** 3453 /**
3454 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 3454 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
3455 * @pid: the pid in question. 3455 * @pid: the pid in question.
3456 * @policy: new policy. 3456 * @policy: new policy.
3457 * @param: structure containing the new RT priority. 3457 * @param: structure containing the new RT priority.
3458 */ 3458 */
3459 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, 3459 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
3460 struct sched_param __user *param) 3460 struct sched_param __user *param)
3461 { 3461 {
3462 return do_sched_setscheduler(pid, policy, param); 3462 return do_sched_setscheduler(pid, policy, param);
3463 } 3463 }
3464 3464
3465 /** 3465 /**
3466 * sys_sched_setparam - set/change the RT priority of a thread 3466 * sys_sched_setparam - set/change the RT priority of a thread
3467 * @pid: the pid in question. 3467 * @pid: the pid in question.
3468 * @param: structure containing the new RT priority. 3468 * @param: structure containing the new RT priority.
3469 */ 3469 */
3470 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) 3470 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
3471 { 3471 {
3472 return do_sched_setscheduler(pid, -1, param); 3472 return do_sched_setscheduler(pid, -1, param);
3473 } 3473 }
3474 3474
3475 /** 3475 /**
3476 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 3476 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
3477 * @pid: the pid in question. 3477 * @pid: the pid in question.
3478 */ 3478 */
3479 asmlinkage long sys_sched_getscheduler(pid_t pid) 3479 asmlinkage long sys_sched_getscheduler(pid_t pid)
3480 { 3480 {
3481 int retval = -EINVAL; 3481 int retval = -EINVAL;
3482 task_t *p; 3482 task_t *p;
3483 3483
3484 if (pid < 0) 3484 if (pid < 0)
3485 goto out_nounlock; 3485 goto out_nounlock;
3486 3486
3487 retval = -ESRCH; 3487 retval = -ESRCH;
3488 read_lock(&tasklist_lock); 3488 read_lock(&tasklist_lock);
3489 p = find_process_by_pid(pid); 3489 p = find_process_by_pid(pid);
3490 if (p) { 3490 if (p) {
3491 retval = security_task_getscheduler(p); 3491 retval = security_task_getscheduler(p);
3492 if (!retval) 3492 if (!retval)
3493 retval = p->policy; 3493 retval = p->policy;
3494 } 3494 }
3495 read_unlock(&tasklist_lock); 3495 read_unlock(&tasklist_lock);
3496 3496
3497 out_nounlock: 3497 out_nounlock:
3498 return retval; 3498 return retval;
3499 } 3499 }
3500 3500
3501 /** 3501 /**
3502 * sys_sched_getparam - get the RT priority of a thread 3502 * sys_sched_getparam - get the RT priority of a thread
3503 * @pid: the pid in question. 3503 * @pid: the pid in question.
3504 * @param: structure containing the RT priority. 3504 * @param: structure containing the RT priority.
3505 */ 3505 */
3506 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) 3506 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
3507 { 3507 {
3508 struct sched_param lp; 3508 struct sched_param lp;
3509 int retval = -EINVAL; 3509 int retval = -EINVAL;
3510 task_t *p; 3510 task_t *p;
3511 3511
3512 if (!param || pid < 0) 3512 if (!param || pid < 0)
3513 goto out_nounlock; 3513 goto out_nounlock;
3514 3514
3515 read_lock(&tasklist_lock); 3515 read_lock(&tasklist_lock);
3516 p = find_process_by_pid(pid); 3516 p = find_process_by_pid(pid);
3517 retval = -ESRCH; 3517 retval = -ESRCH;
3518 if (!p) 3518 if (!p)
3519 goto out_unlock; 3519 goto out_unlock;
3520 3520
3521 retval = security_task_getscheduler(p); 3521 retval = security_task_getscheduler(p);
3522 if (retval) 3522 if (retval)
3523 goto out_unlock; 3523 goto out_unlock;
3524 3524
3525 lp.sched_priority = p->rt_priority; 3525 lp.sched_priority = p->rt_priority;
3526 read_unlock(&tasklist_lock); 3526 read_unlock(&tasklist_lock);
3527 3527
3528 /* 3528 /*
3529 * This one might sleep, so we cannot do it with a spinlock held ... 3529 * This one might sleep, so we cannot do it with a spinlock held ...
3530 */ 3530 */
3531 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 3531 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
3532 3532
3533 out_nounlock: 3533 out_nounlock:
3534 return retval; 3534 return retval;
3535 3535
3536 out_unlock: 3536 out_unlock:
3537 read_unlock(&tasklist_lock); 3537 read_unlock(&tasklist_lock);
3538 return retval; 3538 return retval;
3539 } 3539 }
3540 3540
3541 long sched_setaffinity(pid_t pid, cpumask_t new_mask) 3541 long sched_setaffinity(pid_t pid, cpumask_t new_mask)
3542 { 3542 {
3543 task_t *p; 3543 task_t *p;
3544 int retval; 3544 int retval;
3545 cpumask_t cpus_allowed; 3545 cpumask_t cpus_allowed;
3546 3546
3547 lock_cpu_hotplug(); 3547 lock_cpu_hotplug();
3548 read_lock(&tasklist_lock); 3548 read_lock(&tasklist_lock);
3549 3549
3550 p = find_process_by_pid(pid); 3550 p = find_process_by_pid(pid);
3551 if (!p) { 3551 if (!p) {
3552 read_unlock(&tasklist_lock); 3552 read_unlock(&tasklist_lock);
3553 unlock_cpu_hotplug(); 3553 unlock_cpu_hotplug();
3554 return -ESRCH; 3554 return -ESRCH;
3555 } 3555 }
3556 3556
3557 /* 3557 /*
3558 * It is not safe to call set_cpus_allowed with the 3558 * It is not safe to call set_cpus_allowed with the
3559 * tasklist_lock held. We will bump the task_struct's 3559 * tasklist_lock held. We will bump the task_struct's
3560 * usage count and then drop tasklist_lock. 3560 * usage count and then drop tasklist_lock.
3561 */ 3561 */
3562 get_task_struct(p); 3562 get_task_struct(p);
3563 read_unlock(&tasklist_lock); 3563 read_unlock(&tasklist_lock);
3564 3564
3565 retval = -EPERM; 3565 retval = -EPERM;
3566 if ((current->euid != p->euid) && (current->euid != p->uid) && 3566 if ((current->euid != p->euid) && (current->euid != p->uid) &&
3567 !capable(CAP_SYS_NICE)) 3567 !capable(CAP_SYS_NICE))
3568 goto out_unlock; 3568 goto out_unlock;
3569 3569
3570 cpus_allowed = cpuset_cpus_allowed(p); 3570 cpus_allowed = cpuset_cpus_allowed(p);
3571 cpus_and(new_mask, new_mask, cpus_allowed); 3571 cpus_and(new_mask, new_mask, cpus_allowed);
3572 retval = set_cpus_allowed(p, new_mask); 3572 retval = set_cpus_allowed(p, new_mask);
3573 3573
3574 out_unlock: 3574 out_unlock:
3575 put_task_struct(p); 3575 put_task_struct(p);
3576 unlock_cpu_hotplug(); 3576 unlock_cpu_hotplug();
3577 return retval; 3577 return retval;
3578 } 3578 }
3579 3579
3580 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 3580 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
3581 cpumask_t *new_mask) 3581 cpumask_t *new_mask)
3582 { 3582 {
3583 if (len < sizeof(cpumask_t)) { 3583 if (len < sizeof(cpumask_t)) {
3584 memset(new_mask, 0, sizeof(cpumask_t)); 3584 memset(new_mask, 0, sizeof(cpumask_t));
3585 } else if (len > sizeof(cpumask_t)) { 3585 } else if (len > sizeof(cpumask_t)) {
3586 len = sizeof(cpumask_t); 3586 len = sizeof(cpumask_t);
3587 } 3587 }
3588 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 3588 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
3589 } 3589 }
3590 3590
3591 /** 3591 /**
3592 * sys_sched_setaffinity - set the cpu affinity of a process 3592 * sys_sched_setaffinity - set the cpu affinity of a process
3593 * @pid: pid of the process 3593 * @pid: pid of the process
3594 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 3594 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3595 * @user_mask_ptr: user-space pointer to the new cpu mask 3595 * @user_mask_ptr: user-space pointer to the new cpu mask
3596 */ 3596 */
3597 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, 3597 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
3598 unsigned long __user *user_mask_ptr) 3598 unsigned long __user *user_mask_ptr)
3599 { 3599 {
3600 cpumask_t new_mask; 3600 cpumask_t new_mask;
3601 int retval; 3601 int retval;
3602 3602
3603 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); 3603 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
3604 if (retval) 3604 if (retval)
3605 return retval; 3605 return retval;
3606 3606
3607 return sched_setaffinity(pid, new_mask); 3607 return sched_setaffinity(pid, new_mask);
3608 } 3608 }
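
For illustration (not part of this commit): from userspace this syscall is normally reached through the glibc wrapper of the same name. A sketch that pins a process to CPU 0; the helper name is hypothetical, and pid 0 means the calling process.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/types.h>

/* pin the given process to CPU 0 */
int pin_to_cpu0(pid_t pid)
{
        cpu_set_t mask;

        CPU_ZERO(&mask);
        CPU_SET(0, &mask);

        if (sched_setaffinity(pid, sizeof(mask), &mask) < 0) {
                perror("sched_setaffinity");
                return -1;
        }
        return 0;
}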
3609 3609
3610 /* 3610 /*
3611 * Represents all CPUs present in the system 3611 * Represents all CPUs present in the system
3612 * In systems capable of hotplug, this map could dynamically grow 3612 * In systems capable of hotplug, this map could dynamically grow
3613 * as new CPUs are detected in the system via any platform-specific 3613 * as new CPUs are detected in the system via any platform-specific
3614 * method, such as ACPI, for example. 3614 * method, such as ACPI, for example.
3615 */ 3615 */
3616 3616
3617 cpumask_t cpu_present_map; 3617 cpumask_t cpu_present_map;
3618 EXPORT_SYMBOL(cpu_present_map); 3618 EXPORT_SYMBOL(cpu_present_map);
3619 3619
3620 #ifndef CONFIG_SMP 3620 #ifndef CONFIG_SMP
3621 cpumask_t cpu_online_map = CPU_MASK_ALL; 3621 cpumask_t cpu_online_map = CPU_MASK_ALL;
3622 cpumask_t cpu_possible_map = CPU_MASK_ALL; 3622 cpumask_t cpu_possible_map = CPU_MASK_ALL;
3623 #endif 3623 #endif
3624 3624
3625 long sched_getaffinity(pid_t pid, cpumask_t *mask) 3625 long sched_getaffinity(pid_t pid, cpumask_t *mask)
3626 { 3626 {
3627 int retval; 3627 int retval;
3628 task_t *p; 3628 task_t *p;
3629 3629
3630 lock_cpu_hotplug(); 3630 lock_cpu_hotplug();
3631 read_lock(&tasklist_lock); 3631 read_lock(&tasklist_lock);
3632 3632
3633 retval = -ESRCH; 3633 retval = -ESRCH;
3634 p = find_process_by_pid(pid); 3634 p = find_process_by_pid(pid);
3635 if (!p) 3635 if (!p)
3636 goto out_unlock; 3636 goto out_unlock;
3637 3637
3638 retval = 0; 3638 retval = 0;
3639 cpus_and(*mask, p->cpus_allowed, cpu_possible_map); 3639 cpus_and(*mask, p->cpus_allowed, cpu_possible_map);
3640 3640
3641 out_unlock: 3641 out_unlock:
3642 read_unlock(&tasklist_lock); 3642 read_unlock(&tasklist_lock);
3643 unlock_cpu_hotplug(); 3643 unlock_cpu_hotplug();
3644 if (retval) 3644 if (retval)
3645 return retval; 3645 return retval;
3646 3646
3647 return 0; 3647 return 0;
3648 } 3648 }
3649 3649
3650 /** 3650 /**
3651 * sys_sched_getaffinity - get the cpu affinity of a process 3651 * sys_sched_getaffinity - get the cpu affinity of a process
3652 * @pid: pid of the process 3652 * @pid: pid of the process
3653 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 3653 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3654 * @user_mask_ptr: user-space pointer to hold the current cpu mask 3654 * @user_mask_ptr: user-space pointer to hold the current cpu mask
3655 */ 3655 */
3656 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, 3656 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
3657 unsigned long __user *user_mask_ptr) 3657 unsigned long __user *user_mask_ptr)
3658 { 3658 {
3659 int ret; 3659 int ret;
3660 cpumask_t mask; 3660 cpumask_t mask;
3661 3661
3662 if (len < sizeof(cpumask_t)) 3662 if (len < sizeof(cpumask_t))
3663 return -EINVAL; 3663 return -EINVAL;
3664 3664
3665 ret = sched_getaffinity(pid, &mask); 3665 ret = sched_getaffinity(pid, &mask);
3666 if (ret < 0) 3666 if (ret < 0)
3667 return ret; 3667 return ret;
3668 3668
3669 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) 3669 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
3670 return -EFAULT; 3670 return -EFAULT;
3671 3671
3672 return sizeof(cpumask_t); 3672 return sizeof(cpumask_t);
3673 } 3673 }
3674 3674
3675 /** 3675 /**
3676 * sys_sched_yield - yield the current processor to other threads. 3676 * sys_sched_yield - yield the current processor to other threads.
3677 * 3677 *
3678 * this function yields the current CPU by moving the calling thread 3678 * this function yields the current CPU by moving the calling thread
3679 * to the expired array. If there are no other threads running on this 3679 * to the expired array. If there are no other threads running on this
3680 * CPU then this function will return. 3680 * CPU then this function will return.
3681 */ 3681 */
3682 asmlinkage long sys_sched_yield(void) 3682 asmlinkage long sys_sched_yield(void)
3683 { 3683 {
3684 runqueue_t *rq = this_rq_lock(); 3684 runqueue_t *rq = this_rq_lock();
3685 prio_array_t *array = current->array; 3685 prio_array_t *array = current->array;
3686 prio_array_t *target = rq->expired; 3686 prio_array_t *target = rq->expired;
3687 3687
3688 schedstat_inc(rq, yld_cnt); 3688 schedstat_inc(rq, yld_cnt);
3689 /* 3689 /*
3690 * We implement yielding by moving the task into the expired 3690 * We implement yielding by moving the task into the expired
3691 * queue. 3691 * queue.
3692 * 3692 *
3693 * (special rule: RT tasks will just roundrobin in the active 3693 * (special rule: RT tasks will just roundrobin in the active
3694 * array.) 3694 * array.)
3695 */ 3695 */
3696 if (rt_task(current)) 3696 if (rt_task(current))
3697 target = rq->active; 3697 target = rq->active;
3698 3698
3699 if (current->array->nr_active == 1) { 3699 if (current->array->nr_active == 1) {
3700 schedstat_inc(rq, yld_act_empty); 3700 schedstat_inc(rq, yld_act_empty);
3701 if (!rq->expired->nr_active) 3701 if (!rq->expired->nr_active)
3702 schedstat_inc(rq, yld_both_empty); 3702 schedstat_inc(rq, yld_both_empty);
3703 } else if (!rq->expired->nr_active) 3703 } else if (!rq->expired->nr_active)
3704 schedstat_inc(rq, yld_exp_empty); 3704 schedstat_inc(rq, yld_exp_empty);
3705 3705
3706 if (array != target) { 3706 if (array != target) {
3707 dequeue_task(current, array); 3707 dequeue_task(current, array);
3708 enqueue_task(current, target); 3708 enqueue_task(current, target);
3709 } else 3709 } else
3710 /* 3710 /*
3711 * requeue_task is cheaper so perform that if possible. 3711 * requeue_task is cheaper so perform that if possible.
3712 */ 3712 */
3713 requeue_task(current, array); 3713 requeue_task(current, array);
3714 3714
3715 /* 3715 /*
3716 * Since we are going to call schedule() anyway, there's 3716 * Since we are going to call schedule() anyway, there's
3717 * no need to preempt or enable interrupts: 3717 * no need to preempt or enable interrupts:
3718 */ 3718 */
3719 __release(rq->lock); 3719 __release(rq->lock);
3720 _raw_spin_unlock(&rq->lock); 3720 _raw_spin_unlock(&rq->lock);
3721 preempt_enable_no_resched(); 3721 preempt_enable_no_resched();
3722 3722
3723 schedule(); 3723 schedule();
3724 3724
3725 return 0; 3725 return 0;
3726 } 3726 }
3727 3727
3728 static inline void __cond_resched(void) 3728 static inline void __cond_resched(void)
3729 { 3729 {
3730 do { 3730 do {
3731 add_preempt_count(PREEMPT_ACTIVE); 3731 add_preempt_count(PREEMPT_ACTIVE);
3732 schedule(); 3732 schedule();
3733 sub_preempt_count(PREEMPT_ACTIVE); 3733 sub_preempt_count(PREEMPT_ACTIVE);
3734 } while (need_resched()); 3734 } while (need_resched());
3735 } 3735 }
3736 3736
3737 int __sched cond_resched(void) 3737 int __sched cond_resched(void)
3738 { 3738 {
3739 if (need_resched()) { 3739 if (need_resched()) {
3740 __cond_resched(); 3740 __cond_resched();
3741 return 1; 3741 return 1;
3742 } 3742 }
3743 return 0; 3743 return 0;
3744 } 3744 }
3745 3745
3746 EXPORT_SYMBOL(cond_resched); 3746 EXPORT_SYMBOL(cond_resched);
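
For illustration (not part of this commit): cond_resched() is the usual way to break up a long-running loop in process context, especially on non-preemptible kernels. The my_item type and my_process_one() helper are hypothetical.

#include <linux/sched.h>

struct my_item;                                 /* hypothetical payload */
static void my_process_one(struct my_item *item);

/* process a large batch without monopolising the CPU */
static void my_process_many(struct my_item **items, int nr)
{
        int i;

        for (i = 0; i < nr; i++) {
                my_process_one(items[i]);
                cond_resched();         /* voluntary preemption point */
        }
}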
3747 3747
3748 /* 3748 /*
3749 * cond_resched_lock() - if a reschedule is pending, drop the given lock, 3749 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
3750 * call schedule, and on return reacquire the lock. 3750 * call schedule, and on return reacquire the lock.
3751 * 3751 *
3752 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 3752 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
3753 * operations here to prevent schedule() from being called twice (once via 3753 * operations here to prevent schedule() from being called twice (once via
3754 * spin_unlock(), once by hand). 3754 * spin_unlock(), once by hand).
3755 */ 3755 */
3756 int cond_resched_lock(spinlock_t * lock) 3756 int cond_resched_lock(spinlock_t * lock)
3757 { 3757 {
3758 int ret = 0; 3758 int ret = 0;
3759 3759
3760 if (need_lockbreak(lock)) { 3760 if (need_lockbreak(lock)) {
3761 spin_unlock(lock); 3761 spin_unlock(lock);
3762 cpu_relax(); 3762 cpu_relax();
3763 ret = 1; 3763 ret = 1;
3764 spin_lock(lock); 3764 spin_lock(lock);
3765 } 3765 }
3766 if (need_resched()) { 3766 if (need_resched()) {
3767 _raw_spin_unlock(lock); 3767 _raw_spin_unlock(lock);
3768 preempt_enable_no_resched(); 3768 preempt_enable_no_resched();
3769 __cond_resched(); 3769 __cond_resched();
3770 ret = 1; 3770 ret = 1;
3771 spin_lock(lock); 3771 spin_lock(lock);
3772 } 3772 }
3773 return ret; 3773 return ret;
3774 } 3774 }
3775 3775
3776 EXPORT_SYMBOL(cond_resched_lock); 3776 EXPORT_SYMBOL(cond_resched_lock);
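
For illustration (not part of this commit): cond_resched_lock() suits long operations performed under a spinlock, since it drops and re-takes the lock when a reschedule or lock contention is pending. The bitmap-clearing helper below is a hypothetical example.

#include <linux/spinlock.h>
#include <linux/bitops.h>
#include <linux/sched.h>

/* clear a large bitmap under its lock without hogging the CPU or the lock */
static void my_clear_all(spinlock_t *lock, unsigned long *map, unsigned long bits)
{
        unsigned long i;

        spin_lock(lock);
        for (i = 0; i < bits; i++) {
                clear_bit(i, map);
                cond_resched_lock(lock);        /* may drop and re-take *lock */
        }
        spin_unlock(lock);
}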
3777 3777
3778 int __sched cond_resched_softirq(void) 3778 int __sched cond_resched_softirq(void)
3779 { 3779 {
3780 BUG_ON(!in_softirq()); 3780 BUG_ON(!in_softirq());
3781 3781
3782 if (need_resched()) { 3782 if (need_resched()) {
3783 __local_bh_enable(); 3783 __local_bh_enable();
3784 __cond_resched(); 3784 __cond_resched();
3785 local_bh_disable(); 3785 local_bh_disable();
3786 return 1; 3786 return 1;
3787 } 3787 }
3788 return 0; 3788 return 0;
3789 } 3789 }
3790 3790
3791 EXPORT_SYMBOL(cond_resched_softirq); 3791 EXPORT_SYMBOL(cond_resched_softirq);
3792 3792
3793 3793
3794 /** 3794 /**
3795 * yield - yield the current processor to other threads. 3795 * yield - yield the current processor to other threads.
3796 * 3796 *
3797 * this is a shortcut for kernel-space yielding - it marks the 3797 * this is a shortcut for kernel-space yielding - it marks the
3798 * thread runnable and calls sys_sched_yield(). 3798 * thread runnable and calls sys_sched_yield().
3799 */ 3799 */
3800 void __sched yield(void) 3800 void __sched yield(void)
3801 { 3801 {
3802 set_current_state(TASK_RUNNING); 3802 set_current_state(TASK_RUNNING);
3803 sys_sched_yield(); 3803 sys_sched_yield();
3804 } 3804 }
3805 3805
3806 EXPORT_SYMBOL(yield); 3806 EXPORT_SYMBOL(yield);
3807 3807
3808 /* 3808 /*
3809 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 3809 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
3810 * that process accounting knows that this is a task in IO wait state. 3810 * that process accounting knows that this is a task in IO wait state.
3811 * 3811 *
3812 * But don't do that if it is a deliberate, throttling IO wait (this task 3812 * But don't do that if it is a deliberate, throttling IO wait (this task
3813 * has set its backing_dev_info: the queue against which it should throttle) 3813 * has set its backing_dev_info: the queue against which it should throttle)
3814 */ 3814 */
3815 void __sched io_schedule(void) 3815 void __sched io_schedule(void)
3816 { 3816 {
3817 struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); 3817 struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
3818 3818
3819 atomic_inc(&rq->nr_iowait); 3819 atomic_inc(&rq->nr_iowait);
3820 schedule(); 3820 schedule();
3821 atomic_dec(&rq->nr_iowait); 3821 atomic_dec(&rq->nr_iowait);
3822 } 3822 }
3823 3823
3824 EXPORT_SYMBOL(io_schedule); 3824 EXPORT_SYMBOL(io_schedule);
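
For illustration (not part of this commit): io_schedule() is used exactly like schedule() in a wait loop, the only difference being the nr_iowait accounting above. A sketch using the standard prepare_to_wait()/finish_wait() helpers, with hypothetical wq/done arguments:

#include <linux/sched.h>
#include <linux/wait.h>

/* sleep until an I/O completion handler sets *done; time spent here is
 * accounted as iowait rather than plain sleep */
static void my_wait_for_io(wait_queue_head_t *wq, int *done)
{
        DEFINE_WAIT(wait);

        while (!*done) {
                prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
                if (!*done)
                        io_schedule();
                finish_wait(wq, &wait);
        }
}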
3825 3825
3826 long __sched io_schedule_timeout(long timeout) 3826 long __sched io_schedule_timeout(long timeout)
3827 { 3827 {
3828 struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); 3828 struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
3829 long ret; 3829 long ret;
3830 3830
3831 atomic_inc(&rq->nr_iowait); 3831 atomic_inc(&rq->nr_iowait);
3832 ret = schedule_timeout(timeout); 3832 ret = schedule_timeout(timeout);
3833 atomic_dec(&rq->nr_iowait); 3833 atomic_dec(&rq->nr_iowait);
3834 return ret; 3834 return ret;
3835 } 3835 }
3836 3836
3837 /** 3837 /**
3838 * sys_sched_get_priority_max - return maximum RT priority. 3838 * sys_sched_get_priority_max - return maximum RT priority.
3839 * @policy: scheduling class. 3839 * @policy: scheduling class.
3840 * 3840 *
3841 * this syscall returns the maximum rt_priority that can be used 3841 * this syscall returns the maximum rt_priority that can be used
3842 * by a given scheduling class. 3842 * by a given scheduling class.
3843 */ 3843 */
3844 asmlinkage long sys_sched_get_priority_max(int policy) 3844 asmlinkage long sys_sched_get_priority_max(int policy)
3845 { 3845 {
3846 int ret = -EINVAL; 3846 int ret = -EINVAL;
3847 3847
3848 switch (policy) { 3848 switch (policy) {
3849 case SCHED_FIFO: 3849 case SCHED_FIFO:
3850 case SCHED_RR: 3850 case SCHED_RR:
3851 ret = MAX_USER_RT_PRIO-1; 3851 ret = MAX_USER_RT_PRIO-1;
3852 break; 3852 break;
3853 case SCHED_NORMAL: 3853 case SCHED_NORMAL:
3854 ret = 0; 3854 ret = 0;
3855 break; 3855 break;
3856 } 3856 }
3857 return ret; 3857 return ret;
3858 } 3858 }
3859 3859
3860 /** 3860 /**
3861 * sys_sched_get_priority_min - return minimum RT priority. 3861 * sys_sched_get_priority_min - return minimum RT priority.
3862 * @policy: scheduling class. 3862 * @policy: scheduling class.
3863 * 3863 *
3864 * this syscall returns the minimum rt_priority that can be used 3864 * this syscall returns the minimum rt_priority that can be used
3865 * by a given scheduling class. 3865 * by a given scheduling class.
3866 */ 3866 */
3867 asmlinkage long sys_sched_get_priority_min(int policy) 3867 asmlinkage long sys_sched_get_priority_min(int policy)
3868 { 3868 {
3869 int ret = -EINVAL; 3869 int ret = -EINVAL;
3870 3870
3871 switch (policy) { 3871 switch (policy) {
3872 case SCHED_FIFO: 3872 case SCHED_FIFO:
3873 case SCHED_RR: 3873 case SCHED_RR:
3874 ret = 1; 3874 ret = 1;
3875 break; 3875 break;
3876 case SCHED_NORMAL: 3876 case SCHED_NORMAL:
3877 ret = 0; 3877 ret = 0;
3878 } 3878 }
3879 return ret; 3879 return ret;
3880 } 3880 }
3881 3881
3882 /** 3882 /**
3883 * sys_sched_rr_get_interval - return the default timeslice of a process. 3883 * sys_sched_rr_get_interval - return the default timeslice of a process.
3884 * @pid: pid of the process. 3884 * @pid: pid of the process.
3885 * @interval: userspace pointer to the timeslice value. 3885 * @interval: userspace pointer to the timeslice value.
3886 * 3886 *
3887 * this syscall writes the default timeslice value of a given process 3887 * this syscall writes the default timeslice value of a given process
3888 * into the user-space timespec buffer. A value of '0' means infinity. 3888 * into the user-space timespec buffer. A value of '0' means infinity.
3889 */ 3889 */
3890 asmlinkage 3890 asmlinkage
3891 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) 3891 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
3892 { 3892 {
3893 int retval = -EINVAL; 3893 int retval = -EINVAL;
3894 struct timespec t; 3894 struct timespec t;
3895 task_t *p; 3895 task_t *p;
3896 3896
3897 if (pid < 0) 3897 if (pid < 0)
3898 goto out_nounlock; 3898 goto out_nounlock;
3899 3899
3900 retval = -ESRCH; 3900 retval = -ESRCH;
3901 read_lock(&tasklist_lock); 3901 read_lock(&tasklist_lock);
3902 p = find_process_by_pid(pid); 3902 p = find_process_by_pid(pid);
3903 if (!p) 3903 if (!p)
3904 goto out_unlock; 3904 goto out_unlock;
3905 3905
3906 retval = security_task_getscheduler(p); 3906 retval = security_task_getscheduler(p);
3907 if (retval) 3907 if (retval)
3908 goto out_unlock; 3908 goto out_unlock;
3909 3909
3910 jiffies_to_timespec(p->policy & SCHED_FIFO ? 3910 jiffies_to_timespec(p->policy & SCHED_FIFO ?
3911 0 : task_timeslice(p), &t); 3911 0 : task_timeslice(p), &t);
3912 read_unlock(&tasklist_lock); 3912 read_unlock(&tasklist_lock);
3913 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 3913 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
3914 out_nounlock: 3914 out_nounlock:
3915 return retval; 3915 return retval;
3916 out_unlock: 3916 out_unlock:
3917 read_unlock(&tasklist_lock); 3917 read_unlock(&tasklist_lock);
3918 return retval; 3918 return retval;
3919 } 3919 }
3920 3920
3921 static inline struct task_struct *eldest_child(struct task_struct *p) 3921 static inline struct task_struct *eldest_child(struct task_struct *p)
3922 { 3922 {
3923 if (list_empty(&p->children)) return NULL; 3923 if (list_empty(&p->children)) return NULL;
3924 return list_entry(p->children.next,struct task_struct,sibling); 3924 return list_entry(p->children.next,struct task_struct,sibling);
3925 } 3925 }
3926 3926
3927 static inline struct task_struct *older_sibling(struct task_struct *p) 3927 static inline struct task_struct *older_sibling(struct task_struct *p)
3928 { 3928 {
3929 if (p->sibling.prev==&p->parent->children) return NULL; 3929 if (p->sibling.prev==&p->parent->children) return NULL;
3930 return list_entry(p->sibling.prev,struct task_struct,sibling); 3930 return list_entry(p->sibling.prev,struct task_struct,sibling);
3931 } 3931 }
3932 3932
3933 static inline struct task_struct *younger_sibling(struct task_struct *p) 3933 static inline struct task_struct *younger_sibling(struct task_struct *p)
3934 { 3934 {
3935 if (p->sibling.next==&p->parent->children) return NULL; 3935 if (p->sibling.next==&p->parent->children) return NULL;
3936 return list_entry(p->sibling.next,struct task_struct,sibling); 3936 return list_entry(p->sibling.next,struct task_struct,sibling);
3937 } 3937 }
3938 3938
3939 static void show_task(task_t * p) 3939 static void show_task(task_t * p)
3940 { 3940 {
3941 task_t *relative; 3941 task_t *relative;
3942 unsigned state; 3942 unsigned state;
3943 unsigned long free = 0; 3943 unsigned long free = 0;
3944 static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; 3944 static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" };
3945 3945
3946 printk("%-13.13s ", p->comm); 3946 printk("%-13.13s ", p->comm);
3947 state = p->state ? __ffs(p->state) + 1 : 0; 3947 state = p->state ? __ffs(p->state) + 1 : 0;
3948 if (state < ARRAY_SIZE(stat_nam)) 3948 if (state < ARRAY_SIZE(stat_nam))
3949 printk(stat_nam[state]); 3949 printk(stat_nam[state]);
3950 else 3950 else
3951 printk("?"); 3951 printk("?");
3952 #if (BITS_PER_LONG == 32) 3952 #if (BITS_PER_LONG == 32)
3953 if (state == TASK_RUNNING) 3953 if (state == TASK_RUNNING)
3954 printk(" running "); 3954 printk(" running ");
3955 else 3955 else
3956 printk(" %08lX ", thread_saved_pc(p)); 3956 printk(" %08lX ", thread_saved_pc(p));
3957 #else 3957 #else
3958 if (state == TASK_RUNNING) 3958 if (state == TASK_RUNNING)
3959 printk(" running task "); 3959 printk(" running task ");
3960 else 3960 else
3961 printk(" %016lx ", thread_saved_pc(p)); 3961 printk(" %016lx ", thread_saved_pc(p));
3962 #endif 3962 #endif
3963 #ifdef CONFIG_DEBUG_STACK_USAGE 3963 #ifdef CONFIG_DEBUG_STACK_USAGE
3964 { 3964 {
3965 unsigned long * n = (unsigned long *) (p->thread_info+1); 3965 unsigned long * n = (unsigned long *) (p->thread_info+1);
3966 while (!*n) 3966 while (!*n)
3967 n++; 3967 n++;
3968 free = (unsigned long) n - (unsigned long)(p->thread_info+1); 3968 free = (unsigned long) n - (unsigned long)(p->thread_info+1);
3969 } 3969 }
3970 #endif 3970 #endif
3971 printk("%5lu %5d %6d ", free, p->pid, p->parent->pid); 3971 printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
3972 if ((relative = eldest_child(p))) 3972 if ((relative = eldest_child(p)))
3973 printk("%5d ", relative->pid); 3973 printk("%5d ", relative->pid);
3974 else 3974 else
3975 printk(" "); 3975 printk(" ");
3976 if ((relative = younger_sibling(p))) 3976 if ((relative = younger_sibling(p)))
3977 printk("%7d", relative->pid); 3977 printk("%7d", relative->pid);
3978 else 3978 else
3979 printk(" "); 3979 printk(" ");
3980 if ((relative = older_sibling(p))) 3980 if ((relative = older_sibling(p)))
3981 printk(" %5d", relative->pid); 3981 printk(" %5d", relative->pid);
3982 else 3982 else
3983 printk(" "); 3983 printk(" ");
3984 if (!p->mm) 3984 if (!p->mm)
3985 printk(" (L-TLB)\n"); 3985 printk(" (L-TLB)\n");
3986 else 3986 else
3987 printk(" (NOTLB)\n"); 3987 printk(" (NOTLB)\n");
3988 3988
3989 if (state != TASK_RUNNING) 3989 if (state != TASK_RUNNING)
3990 show_stack(p, NULL); 3990 show_stack(p, NULL);
3991 } 3991 }
3992 3992
3993 void show_state(void) 3993 void show_state(void)
3994 { 3994 {
3995 task_t *g, *p; 3995 task_t *g, *p;
3996 3996
3997 #if (BITS_PER_LONG == 32) 3997 #if (BITS_PER_LONG == 32)
3998 printk("\n" 3998 printk("\n"
3999 " sibling\n"); 3999 " sibling\n");
4000 printk(" task PC pid father child younger older\n"); 4000 printk(" task PC pid father child younger older\n");
4001 #else 4001 #else
4002 printk("\n" 4002 printk("\n"
4003 " sibling\n"); 4003 " sibling\n");
4004 printk(" task PC pid father child younger older\n"); 4004 printk(" task PC pid father child younger older\n");
4005 #endif 4005 #endif
4006 read_lock(&tasklist_lock); 4006 read_lock(&tasklist_lock);
4007 do_each_thread(g, p) { 4007 do_each_thread(g, p) {
4008 /* 4008 /*
4009 * reset the NMI-timeout, listing all files on a slow 4009 * reset the NMI-timeout, listing all files on a slow
4010 * console might take a lot of time: 4010 * console might take a lot of time:
4011 */ 4011 */
4012 touch_nmi_watchdog(); 4012 touch_nmi_watchdog();
4013 show_task(p); 4013 show_task(p);
4014 } while_each_thread(g, p); 4014 } while_each_thread(g, p);
4015 4015
4016 read_unlock(&tasklist_lock); 4016 read_unlock(&tasklist_lock);
4017 } 4017 }
4018 4018
4019 void __devinit init_idle(task_t *idle, int cpu) 4019 void __devinit init_idle(task_t *idle, int cpu)
4020 { 4020 {
4021 runqueue_t *rq = cpu_rq(cpu); 4021 runqueue_t *rq = cpu_rq(cpu);
4022 unsigned long flags; 4022 unsigned long flags;
4023 4023
4024 idle->sleep_avg = 0; 4024 idle->sleep_avg = 0;
4025 idle->array = NULL; 4025 idle->array = NULL;
4026 idle->prio = MAX_PRIO; 4026 idle->prio = MAX_PRIO;
4027 idle->state = TASK_RUNNING; 4027 idle->state = TASK_RUNNING;
4028 idle->cpus_allowed = cpumask_of_cpu(cpu); 4028 idle->cpus_allowed = cpumask_of_cpu(cpu);
4029 set_task_cpu(idle, cpu); 4029 set_task_cpu(idle, cpu);
4030 4030
4031 spin_lock_irqsave(&rq->lock, flags); 4031 spin_lock_irqsave(&rq->lock, flags);
4032 rq->curr = rq->idle = idle; 4032 rq->curr = rq->idle = idle;
4033 set_tsk_need_resched(idle); 4033 set_tsk_need_resched(idle);
4034 spin_unlock_irqrestore(&rq->lock, flags); 4034 spin_unlock_irqrestore(&rq->lock, flags);
4035 4035
4036 /* Set the preempt count _outside_ the spinlocks! */ 4036 /* Set the preempt count _outside_ the spinlocks! */
4037 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) 4037 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4038 idle->thread_info->preempt_count = (idle->lock_depth >= 0); 4038 idle->thread_info->preempt_count = (idle->lock_depth >= 0);
4039 #else 4039 #else
4040 idle->thread_info->preempt_count = 0; 4040 idle->thread_info->preempt_count = 0;
4041 #endif 4041 #endif
4042 } 4042 }
4043 4043
4044 /* 4044 /*
4045 * In a system that switches off the HZ timer nohz_cpu_mask 4045 * In a system that switches off the HZ timer nohz_cpu_mask
4046 * indicates which cpus entered this state. This is used 4046 * indicates which cpus entered this state. This is used
4047 * in the rcu update to wait only for active cpus. For systems 4047 * in the rcu update to wait only for active cpus. For systems
4048 * which do not switch off the HZ timer, nohz_cpu_mask should 4048 * which do not switch off the HZ timer, nohz_cpu_mask should
4049 * always be CPU_MASK_NONE. 4049 * always be CPU_MASK_NONE.
4050 */ 4050 */
4051 cpumask_t nohz_cpu_mask = CPU_MASK_NONE; 4051 cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
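As a hedged illustration of how an architecture's tickless idle loop is expected to use nohz_cpu_mask (the stop_hz_timer()/start_hz_timer() hooks are assumed, arch-specific names, not part of this patch):

    /* Sketch of a tickless idle loop; stop_hz_timer()/start_hz_timer()
     * are hypothetical arch hooks. */
    int cpu = smp_processor_id();

    cpu_set(cpu, nohz_cpu_mask);        /* tell RCU not to wait on this CPU */
    stop_hz_timer();
    /* ... sleep until the next interrupt ... */
    start_hz_timer();
    cpu_clear(cpu, nohz_cpu_mask);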
4052 4052
4053 #ifdef CONFIG_SMP 4053 #ifdef CONFIG_SMP
4054 /* 4054 /*
4055 * This is how migration works: 4055 * This is how migration works:
4056 * 4056 *
4057 * 1) we queue a migration_req_t structure in the source CPU's 4057 * 1) we queue a migration_req_t structure in the source CPU's
4058 * runqueue and wake up that CPU's migration thread. 4058 * runqueue and wake up that CPU's migration thread.
4059 * 2) we down() the locked semaphore => thread blocks. 4059 * 2) we down() the locked semaphore => thread blocks.
4060 * 3) migration thread wakes up (implicitly it forces the migrated 4060 * 3) migration thread wakes up (implicitly it forces the migrated
4061 * thread off the CPU) 4061 * thread off the CPU)
4062 * 4) it gets the migration request and checks whether the migrated 4062 * 4) it gets the migration request and checks whether the migrated
4063 * task is still in the wrong runqueue. 4063 * task is still in the wrong runqueue.
4064 * 5) if it's in the wrong runqueue then the migration thread removes 4064 * 5) if it's in the wrong runqueue then the migration thread removes
4065 * it and puts it into the right queue. 4065 * it and puts it into the right queue.
4066 * 6) migration thread up()s the semaphore. 4066 * 6) migration thread up()s the semaphore.
4067 * 7) we wake up and the migration is done. 4067 * 7) we wake up and the migration is done.
4068 */ 4068 */
4069 4069
4070 /* 4070 /*
4071 * Change a given task's CPU affinity. Migrate the thread to a 4071 * Change a given task's CPU affinity. Migrate the thread to a
4072 * proper CPU and schedule it away if the CPU it's executing on 4072 * proper CPU and schedule it away if the CPU it's executing on
4073 * is removed from the allowed bitmask. 4073 * is removed from the allowed bitmask.
4074 * 4074 *
4075 * NOTE: the caller must have a valid reference to the task, the 4075 * NOTE: the caller must have a valid reference to the task, the
4076 * task must not exit() & deallocate itself prematurely. The 4076 * task must not exit() & deallocate itself prematurely. The
4077 * call is not atomic; no spinlocks may be held. 4077 * call is not atomic; no spinlocks may be held.
4078 */ 4078 */
4079 int set_cpus_allowed(task_t *p, cpumask_t new_mask) 4079 int set_cpus_allowed(task_t *p, cpumask_t new_mask)
4080 { 4080 {
4081 unsigned long flags; 4081 unsigned long flags;
4082 int ret = 0; 4082 int ret = 0;
4083 migration_req_t req; 4083 migration_req_t req;
4084 runqueue_t *rq; 4084 runqueue_t *rq;
4085 4085
4086 rq = task_rq_lock(p, &flags); 4086 rq = task_rq_lock(p, &flags);
4087 if (!cpus_intersects(new_mask, cpu_online_map)) { 4087 if (!cpus_intersects(new_mask, cpu_online_map)) {
4088 ret = -EINVAL; 4088 ret = -EINVAL;
4089 goto out; 4089 goto out;
4090 } 4090 }
4091 4091
4092 p->cpus_allowed = new_mask; 4092 p->cpus_allowed = new_mask;
4093 /* Can the task run on the task's current CPU? If so, we're done */ 4093 /* Can the task run on the task's current CPU? If so, we're done */
4094 if (cpu_isset(task_cpu(p), new_mask)) 4094 if (cpu_isset(task_cpu(p), new_mask))
4095 goto out; 4095 goto out;
4096 4096
4097 if (migrate_task(p, any_online_cpu(new_mask), &req)) { 4097 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4098 /* Need help from migration thread: drop lock and wait. */ 4098 /* Need help from migration thread: drop lock and wait. */
4099 task_rq_unlock(rq, &flags); 4099 task_rq_unlock(rq, &flags);
4100 wake_up_process(rq->migration_thread); 4100 wake_up_process(rq->migration_thread);
4101 wait_for_completion(&req.done); 4101 wait_for_completion(&req.done);
4102 tlb_migrate_finish(p->mm); 4102 tlb_migrate_finish(p->mm);
4103 return 0; 4103 return 0;
4104 } 4104 }
4105 out: 4105 out:
4106 task_rq_unlock(rq, &flags); 4106 task_rq_unlock(rq, &flags);
4107 return ret; 4107 return ret;
4108 } 4108 }
4109 4109
4110 EXPORT_SYMBOL_GPL(set_cpus_allowed); 4110 EXPORT_SYMBOL_GPL(set_cpus_allowed);
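For context, a minimal (hypothetical) caller of set_cpus_allowed() looks roughly like the sketch below; the CPU number and the error handling are illustrative, not taken from this patch:

    /* Hypothetical caller: pin the current task to CPU 2. */
    cpumask_t mask = cpumask_of_cpu(2);

    if (set_cpus_allowed(current, mask) < 0)
        printk(KERN_WARNING "could not move task onto CPU 2\n");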
4111 4111
4112 /* 4112 /*
4113 * Move (not current) task off this cpu, onto dest cpu. We're doing 4113 * Move (not current) task off this cpu, onto dest cpu. We're doing
4114 * this because either it can no longer run here (set_cpus_allowed() 4114 * this because either it can no longer run here (set_cpus_allowed()
4115 * moved it away from this CPU, or the CPU is going down), or because we're 4115 * moved it away from this CPU, or the CPU is going down), or because we're
4116 * attempting to rebalance this task on exec (sched_exec). 4116 * attempting to rebalance this task on exec (sched_exec).
4117 * 4117 *
4118 * So we race with normal scheduler movements, but that's OK, as long 4118 * So we race with normal scheduler movements, but that's OK, as long
4119 * as the task is no longer on this CPU. 4119 * as the task is no longer on this CPU.
4120 */ 4120 */
4121 static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4121 static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4122 { 4122 {
4123 runqueue_t *rq_dest, *rq_src; 4123 runqueue_t *rq_dest, *rq_src;
4124 4124
4125 if (unlikely(cpu_is_offline(dest_cpu))) 4125 if (unlikely(cpu_is_offline(dest_cpu)))
4126 return; 4126 return;
4127 4127
4128 rq_src = cpu_rq(src_cpu); 4128 rq_src = cpu_rq(src_cpu);
4129 rq_dest = cpu_rq(dest_cpu); 4129 rq_dest = cpu_rq(dest_cpu);
4130 4130
4131 double_rq_lock(rq_src, rq_dest); 4131 double_rq_lock(rq_src, rq_dest);
4132 /* Already moved. */ 4132 /* Already moved. */
4133 if (task_cpu(p) != src_cpu) 4133 if (task_cpu(p) != src_cpu)
4134 goto out; 4134 goto out;
4135 /* Affinity changed (again). */ 4135 /* Affinity changed (again). */
4136 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 4136 if (!cpu_isset(dest_cpu, p->cpus_allowed))
4137 goto out; 4137 goto out;
4138 4138
4139 set_task_cpu(p, dest_cpu); 4139 set_task_cpu(p, dest_cpu);
4140 if (p->array) { 4140 if (p->array) {
4141 /* 4141 /*
4142 * Sync timestamp with rq_dest's before activating. 4142 * Sync timestamp with rq_dest's before activating.
4143 * The same thing could be achieved by doing this step 4143 * The same thing could be achieved by doing this step
4144 * afterwards, and pretending it was a local activate. 4144 * afterwards, and pretending it was a local activate.
4145 * This way is cleaner and logically correct. 4145 * This way is cleaner and logically correct.
4146 */ 4146 */
4147 p->timestamp = p->timestamp - rq_src->timestamp_last_tick 4147 p->timestamp = p->timestamp - rq_src->timestamp_last_tick
4148 + rq_dest->timestamp_last_tick; 4148 + rq_dest->timestamp_last_tick;
4149 deactivate_task(p, rq_src); 4149 deactivate_task(p, rq_src);
4150 activate_task(p, rq_dest, 0); 4150 activate_task(p, rq_dest, 0);
4151 if (TASK_PREEMPTS_CURR(p, rq_dest)) 4151 if (TASK_PREEMPTS_CURR(p, rq_dest))
4152 resched_task(rq_dest->curr); 4152 resched_task(rq_dest->curr);
4153 } 4153 }
4154 4154
4155 out: 4155 out:
4156 double_rq_unlock(rq_src, rq_dest); 4156 double_rq_unlock(rq_src, rq_dest);
4157 } 4157 }
4158 4158
4159 /* 4159 /*
4160 * migration_thread - this is a highprio system thread that performs 4160 * migration_thread - this is a highprio system thread that performs
4161 * thread migration by bumping thread off CPU then 'pushing' onto 4161 * thread migration by bumping thread off CPU then 'pushing' onto
4162 * another runqueue. 4162 * another runqueue.
4163 */ 4163 */
4164 static int migration_thread(void * data) 4164 static int migration_thread(void * data)
4165 { 4165 {
4166 runqueue_t *rq; 4166 runqueue_t *rq;
4167 int cpu = (long)data; 4167 int cpu = (long)data;
4168 4168
4169 rq = cpu_rq(cpu); 4169 rq = cpu_rq(cpu);
4170 BUG_ON(rq->migration_thread != current); 4170 BUG_ON(rq->migration_thread != current);
4171 4171
4172 set_current_state(TASK_INTERRUPTIBLE); 4172 set_current_state(TASK_INTERRUPTIBLE);
4173 while (!kthread_should_stop()) { 4173 while (!kthread_should_stop()) {
4174 struct list_head *head; 4174 struct list_head *head;
4175 migration_req_t *req; 4175 migration_req_t *req;
4176 4176
4177 if (current->flags & PF_FREEZE) 4177 if (current->flags & PF_FREEZE)
4178 refrigerator(PF_FREEZE); 4178 refrigerator(PF_FREEZE);
4179 4179
4180 spin_lock_irq(&rq->lock); 4180 spin_lock_irq(&rq->lock);
4181 4181
4182 if (cpu_is_offline(cpu)) { 4182 if (cpu_is_offline(cpu)) {
4183 spin_unlock_irq(&rq->lock); 4183 spin_unlock_irq(&rq->lock);
4184 goto wait_to_die; 4184 goto wait_to_die;
4185 } 4185 }
4186 4186
4187 if (rq->active_balance) { 4187 if (rq->active_balance) {
4188 active_load_balance(rq, cpu); 4188 active_load_balance(rq, cpu);
4189 rq->active_balance = 0; 4189 rq->active_balance = 0;
4190 } 4190 }
4191 4191
4192 head = &rq->migration_queue; 4192 head = &rq->migration_queue;
4193 4193
4194 if (list_empty(head)) { 4194 if (list_empty(head)) {
4195 spin_unlock_irq(&rq->lock); 4195 spin_unlock_irq(&rq->lock);
4196 schedule(); 4196 schedule();
4197 set_current_state(TASK_INTERRUPTIBLE); 4197 set_current_state(TASK_INTERRUPTIBLE);
4198 continue; 4198 continue;
4199 } 4199 }
4200 req = list_entry(head->next, migration_req_t, list); 4200 req = list_entry(head->next, migration_req_t, list);
4201 list_del_init(head->next); 4201 list_del_init(head->next);
4202 4202
4203 if (req->type == REQ_MOVE_TASK) { 4203 if (req->type == REQ_MOVE_TASK) {
4204 spin_unlock(&rq->lock); 4204 spin_unlock(&rq->lock);
4205 __migrate_task(req->task, cpu, req->dest_cpu); 4205 __migrate_task(req->task, cpu, req->dest_cpu);
4206 local_irq_enable(); 4206 local_irq_enable();
4207 } else if (req->type == REQ_SET_DOMAIN) { 4207 } else if (req->type == REQ_SET_DOMAIN) {
4208 rq->sd = req->sd; 4208 rq->sd = req->sd;
4209 spin_unlock_irq(&rq->lock); 4209 spin_unlock_irq(&rq->lock);
4210 } else { 4210 } else {
4211 spin_unlock_irq(&rq->lock); 4211 spin_unlock_irq(&rq->lock);
4212 WARN_ON(1); 4212 WARN_ON(1);
4213 } 4213 }
4214 4214
4215 complete(&req->done); 4215 complete(&req->done);
4216 } 4216 }
4217 __set_current_state(TASK_RUNNING); 4217 __set_current_state(TASK_RUNNING);
4218 return 0; 4218 return 0;
4219 4219
4220 wait_to_die: 4220 wait_to_die:
4221 /* Wait for kthread_stop */ 4221 /* Wait for kthread_stop */
4222 set_current_state(TASK_INTERRUPTIBLE); 4222 set_current_state(TASK_INTERRUPTIBLE);
4223 while (!kthread_should_stop()) { 4223 while (!kthread_should_stop()) {
4224 schedule(); 4224 schedule();
4225 set_current_state(TASK_INTERRUPTIBLE); 4225 set_current_state(TASK_INTERRUPTIBLE);
4226 } 4226 }
4227 __set_current_state(TASK_RUNNING); 4227 __set_current_state(TASK_RUNNING);
4228 return 0; 4228 return 0;
4229 } 4229 }
4230 4230
4231 #ifdef CONFIG_HOTPLUG_CPU 4231 #ifdef CONFIG_HOTPLUG_CPU
4232 /* Figure out where task on dead CPU should go, use force if necessary. */ 4232 /* Figure out where task on dead CPU should go, use force if necessary. */
4233 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) 4233 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
4234 { 4234 {
4235 int dest_cpu; 4235 int dest_cpu;
4236 cpumask_t mask; 4236 cpumask_t mask;
4237 4237
4238 /* On same node? */ 4238 /* On same node? */
4239 mask = node_to_cpumask(cpu_to_node(dead_cpu)); 4239 mask = node_to_cpumask(cpu_to_node(dead_cpu));
4240 cpus_and(mask, mask, tsk->cpus_allowed); 4240 cpus_and(mask, mask, tsk->cpus_allowed);
4241 dest_cpu = any_online_cpu(mask); 4241 dest_cpu = any_online_cpu(mask);
4242 4242
4243 /* On any allowed CPU? */ 4243 /* On any allowed CPU? */
4244 if (dest_cpu == NR_CPUS) 4244 if (dest_cpu == NR_CPUS)
4245 dest_cpu = any_online_cpu(tsk->cpus_allowed); 4245 dest_cpu = any_online_cpu(tsk->cpus_allowed);
4246 4246
4247 /* No more Mr. Nice Guy. */ 4247 /* No more Mr. Nice Guy. */
4248 if (dest_cpu == NR_CPUS) { 4248 if (dest_cpu == NR_CPUS) {
4249 cpus_setall(tsk->cpus_allowed); 4249 cpus_setall(tsk->cpus_allowed);
4250 dest_cpu = any_online_cpu(tsk->cpus_allowed); 4250 dest_cpu = any_online_cpu(tsk->cpus_allowed);
4251 4251
4252 /* 4252 /*
4253 * Don't tell them about moving exiting tasks or 4253 * Don't tell them about moving exiting tasks or
4254 * kernel threads (both mm NULL), since they never 4254 * kernel threads (both mm NULL), since they never
4255 * leave kernel. 4255 * leave kernel.
4256 */ 4256 */
4257 if (tsk->mm && printk_ratelimit()) 4257 if (tsk->mm && printk_ratelimit())
4258 printk(KERN_INFO "process %d (%s) no " 4258 printk(KERN_INFO "process %d (%s) no "
4259 "longer affine to cpu%d\n", 4259 "longer affine to cpu%d\n",
4260 tsk->pid, tsk->comm, dead_cpu); 4260 tsk->pid, tsk->comm, dead_cpu);
4261 } 4261 }
4262 __migrate_task(tsk, dead_cpu, dest_cpu); 4262 __migrate_task(tsk, dead_cpu, dest_cpu);
4263 } 4263 }
4264 4264
4265 /* 4265 /*
4266 * While a dead CPU has no uninterruptible tasks queued at this point, 4266 * While a dead CPU has no uninterruptible tasks queued at this point,
4267 * it might still have a nonzero ->nr_uninterruptible counter, because 4267 * it might still have a nonzero ->nr_uninterruptible counter, because
4268 * for performance reasons the counter is not strictly tracking tasks to 4268 * for performance reasons the counter is not strictly tracking tasks to
4269 * their home CPUs. So we just add the counter to another CPU's counter, 4269 * their home CPUs. So we just add the counter to another CPU's counter,
4270 * to keep the global sum constant after CPU-down: 4270 * to keep the global sum constant after CPU-down:
4271 */ 4271 */
4272 static void migrate_nr_uninterruptible(runqueue_t *rq_src) 4272 static void migrate_nr_uninterruptible(runqueue_t *rq_src)
4273 { 4273 {
4274 runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); 4274 runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
4275 unsigned long flags; 4275 unsigned long flags;
4276 4276
4277 local_irq_save(flags); 4277 local_irq_save(flags);
4278 double_rq_lock(rq_src, rq_dest); 4278 double_rq_lock(rq_src, rq_dest);
4279 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 4279 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
4280 rq_src->nr_uninterruptible = 0; 4280 rq_src->nr_uninterruptible = 0;
4281 double_rq_unlock(rq_src, rq_dest); 4281 double_rq_unlock(rq_src, rq_dest);
4282 local_irq_restore(flags); 4282 local_irq_restore(flags);
4283 } 4283 }
4284 4284
4285 /* Run through task list and migrate tasks from the dead cpu. */ 4285 /* Run through task list and migrate tasks from the dead cpu. */
4286 static void migrate_live_tasks(int src_cpu) 4286 static void migrate_live_tasks(int src_cpu)
4287 { 4287 {
4288 struct task_struct *tsk, *t; 4288 struct task_struct *tsk, *t;
4289 4289
4290 write_lock_irq(&tasklist_lock); 4290 write_lock_irq(&tasklist_lock);
4291 4291
4292 do_each_thread(t, tsk) { 4292 do_each_thread(t, tsk) {
4293 if (tsk == current) 4293 if (tsk == current)
4294 continue; 4294 continue;
4295 4295
4296 if (task_cpu(tsk) == src_cpu) 4296 if (task_cpu(tsk) == src_cpu)
4297 move_task_off_dead_cpu(src_cpu, tsk); 4297 move_task_off_dead_cpu(src_cpu, tsk);
4298 } while_each_thread(t, tsk); 4298 } while_each_thread(t, tsk);
4299 4299
4300 write_unlock_irq(&tasklist_lock); 4300 write_unlock_irq(&tasklist_lock);
4301 } 4301 }
4302 4302
4303 /* Schedules idle task to be the next runnable task on current CPU. 4303 /* Schedules idle task to be the next runnable task on current CPU.
4304 * It does so by boosting its priority to highest possible and adding it to 4304 * It does so by boosting its priority to highest possible and adding it to
4305 * the _front_ of runqueue. Used by CPU offline code. 4305 * the _front_ of runqueue. Used by CPU offline code.
4306 */ 4306 */
4307 void sched_idle_next(void) 4307 void sched_idle_next(void)
4308 { 4308 {
4309 int cpu = smp_processor_id(); 4309 int cpu = smp_processor_id();
4310 runqueue_t *rq = this_rq(); 4310 runqueue_t *rq = this_rq();
4311 struct task_struct *p = rq->idle; 4311 struct task_struct *p = rq->idle;
4312 unsigned long flags; 4312 unsigned long flags;
4313 4313
4314 /* cpu has to be offline */ 4314 /* cpu has to be offline */
4315 BUG_ON(cpu_online(cpu)); 4315 BUG_ON(cpu_online(cpu));
4316 4316
4317 /* Strictly not necessary since the rest of the CPUs are stopped by now 4317 /* Strictly not necessary since the rest of the CPUs are stopped by now
4318 * and interrupts are disabled on the current cpu. 4318 * and interrupts are disabled on the current cpu.
4319 */ 4319 */
4320 spin_lock_irqsave(&rq->lock, flags); 4320 spin_lock_irqsave(&rq->lock, flags);
4321 4321
4322 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 4322 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
4323 /* Add idle task to _front_ of its priority queue */ 4323 /* Add idle task to _front_ of its priority queue */
4324 __activate_idle_task(p, rq); 4324 __activate_idle_task(p, rq);
4325 4325
4326 spin_unlock_irqrestore(&rq->lock, flags); 4326 spin_unlock_irqrestore(&rq->lock, flags);
4327 } 4327 }
4328 4328
4329 /* Ensures that the idle task is using init_mm right before its cpu goes 4329 /* Ensures that the idle task is using init_mm right before its cpu goes
4330 * offline. 4330 * offline.
4331 */ 4331 */
4332 void idle_task_exit(void) 4332 void idle_task_exit(void)
4333 { 4333 {
4334 struct mm_struct *mm = current->active_mm; 4334 struct mm_struct *mm = current->active_mm;
4335 4335
4336 BUG_ON(cpu_online(smp_processor_id())); 4336 BUG_ON(cpu_online(smp_processor_id()));
4337 4337
4338 if (mm != &init_mm) 4338 if (mm != &init_mm)
4339 switch_mm(mm, &init_mm, current); 4339 switch_mm(mm, &init_mm, current);
4340 mmdrop(mm); 4340 mmdrop(mm);
4341 } 4341 }
4342 4342
4343 static void migrate_dead(unsigned int dead_cpu, task_t *tsk) 4343 static void migrate_dead(unsigned int dead_cpu, task_t *tsk)
4344 { 4344 {
4345 struct runqueue *rq = cpu_rq(dead_cpu); 4345 struct runqueue *rq = cpu_rq(dead_cpu);
4346 4346
4347 /* Must be exiting, otherwise would be on tasklist. */ 4347 /* Must be exiting, otherwise would be on tasklist. */
4348 BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD); 4348 BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD);
4349 4349
4350 /* Cannot have done final schedule yet: would have vanished. */ 4350 /* Cannot have done final schedule yet: would have vanished. */
4351 BUG_ON(tsk->flags & PF_DEAD); 4351 BUG_ON(tsk->flags & PF_DEAD);
4352 4352
4353 get_task_struct(tsk); 4353 get_task_struct(tsk);
4354 4354
4355 /* 4355 /*
4356 * Drop lock around migration; if someone else moves it, 4356 * Drop lock around migration; if someone else moves it,
4357 * that's OK. No task can be added to this CPU, so iteration is 4357 * that's OK. No task can be added to this CPU, so iteration is
4358 * fine. 4358 * fine.
4359 */ 4359 */
4360 spin_unlock_irq(&rq->lock); 4360 spin_unlock_irq(&rq->lock);
4361 move_task_off_dead_cpu(dead_cpu, tsk); 4361 move_task_off_dead_cpu(dead_cpu, tsk);
4362 spin_lock_irq(&rq->lock); 4362 spin_lock_irq(&rq->lock);
4363 4363
4364 put_task_struct(tsk); 4364 put_task_struct(tsk);
4365 } 4365 }
4366 4366
4367 /* release_task() removes task from tasklist, so we won't find dead tasks. */ 4367 /* release_task() removes task from tasklist, so we won't find dead tasks. */
4368 static void migrate_dead_tasks(unsigned int dead_cpu) 4368 static void migrate_dead_tasks(unsigned int dead_cpu)
4369 { 4369 {
4370 unsigned arr, i; 4370 unsigned arr, i;
4371 struct runqueue *rq = cpu_rq(dead_cpu); 4371 struct runqueue *rq = cpu_rq(dead_cpu);
4372 4372
4373 for (arr = 0; arr < 2; arr++) { 4373 for (arr = 0; arr < 2; arr++) {
4374 for (i = 0; i < MAX_PRIO; i++) { 4374 for (i = 0; i < MAX_PRIO; i++) {
4375 struct list_head *list = &rq->arrays[arr].queue[i]; 4375 struct list_head *list = &rq->arrays[arr].queue[i];
4376 while (!list_empty(list)) 4376 while (!list_empty(list))
4377 migrate_dead(dead_cpu, 4377 migrate_dead(dead_cpu,
4378 list_entry(list->next, task_t, 4378 list_entry(list->next, task_t,
4379 run_list)); 4379 run_list));
4380 } 4380 }
4381 } 4381 }
4382 } 4382 }
4383 #endif /* CONFIG_HOTPLUG_CPU */ 4383 #endif /* CONFIG_HOTPLUG_CPU */
4384 4384
4385 /* 4385 /*
4386 * migration_call - callback that gets triggered when a CPU is added. 4386 * migration_call - callback that gets triggered when a CPU is added.
4387 * Here we can start up the necessary migration thread for the new CPU. 4387 * Here we can start up the necessary migration thread for the new CPU.
4388 */ 4388 */
4389 static int migration_call(struct notifier_block *nfb, unsigned long action, 4389 static int migration_call(struct notifier_block *nfb, unsigned long action,
4390 void *hcpu) 4390 void *hcpu)
4391 { 4391 {
4392 int cpu = (long)hcpu; 4392 int cpu = (long)hcpu;
4393 struct task_struct *p; 4393 struct task_struct *p;
4394 struct runqueue *rq; 4394 struct runqueue *rq;
4395 unsigned long flags; 4395 unsigned long flags;
4396 4396
4397 switch (action) { 4397 switch (action) {
4398 case CPU_UP_PREPARE: 4398 case CPU_UP_PREPARE:
4399 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); 4399 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
4400 if (IS_ERR(p)) 4400 if (IS_ERR(p))
4401 return NOTIFY_BAD; 4401 return NOTIFY_BAD;
4402 p->flags |= PF_NOFREEZE; 4402 p->flags |= PF_NOFREEZE;
4403 kthread_bind(p, cpu); 4403 kthread_bind(p, cpu);
4404 /* Must be high prio: stop_machine expects to yield to it. */ 4404 /* Must be high prio: stop_machine expects to yield to it. */
4405 rq = task_rq_lock(p, &flags); 4405 rq = task_rq_lock(p, &flags);
4406 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 4406 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
4407 task_rq_unlock(rq, &flags); 4407 task_rq_unlock(rq, &flags);
4408 cpu_rq(cpu)->migration_thread = p; 4408 cpu_rq(cpu)->migration_thread = p;
4409 break; 4409 break;
4410 case CPU_ONLINE: 4410 case CPU_ONLINE:
4411 /* Strictly unnecessary, as first user will wake it. */ 4411 /* Strictly unnecessary, as first user will wake it. */
4412 wake_up_process(cpu_rq(cpu)->migration_thread); 4412 wake_up_process(cpu_rq(cpu)->migration_thread);
4413 break; 4413 break;
4414 #ifdef CONFIG_HOTPLUG_CPU 4414 #ifdef CONFIG_HOTPLUG_CPU
4415 case CPU_UP_CANCELED: 4415 case CPU_UP_CANCELED:
4416 /* Unbind it from offline cpu so it can run. Fall thru. */ 4416 /* Unbind it from offline cpu so it can run. Fall thru. */
4417 kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id()); 4417 kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
4418 kthread_stop(cpu_rq(cpu)->migration_thread); 4418 kthread_stop(cpu_rq(cpu)->migration_thread);
4419 cpu_rq(cpu)->migration_thread = NULL; 4419 cpu_rq(cpu)->migration_thread = NULL;
4420 break; 4420 break;
4421 case CPU_DEAD: 4421 case CPU_DEAD:
4422 migrate_live_tasks(cpu); 4422 migrate_live_tasks(cpu);
4423 rq = cpu_rq(cpu); 4423 rq = cpu_rq(cpu);
4424 kthread_stop(rq->migration_thread); 4424 kthread_stop(rq->migration_thread);
4425 rq->migration_thread = NULL; 4425 rq->migration_thread = NULL;
4426 /* Idle task back to normal (off runqueue, low prio) */ 4426 /* Idle task back to normal (off runqueue, low prio) */
4427 rq = task_rq_lock(rq->idle, &flags); 4427 rq = task_rq_lock(rq->idle, &flags);
4428 deactivate_task(rq->idle, rq); 4428 deactivate_task(rq->idle, rq);
4429 rq->idle->static_prio = MAX_PRIO; 4429 rq->idle->static_prio = MAX_PRIO;
4430 __setscheduler(rq->idle, SCHED_NORMAL, 0); 4430 __setscheduler(rq->idle, SCHED_NORMAL, 0);
4431 migrate_dead_tasks(cpu); 4431 migrate_dead_tasks(cpu);
4432 task_rq_unlock(rq, &flags); 4432 task_rq_unlock(rq, &flags);
4433 migrate_nr_uninterruptible(rq); 4433 migrate_nr_uninterruptible(rq);
4434 BUG_ON(rq->nr_running != 0); 4434 BUG_ON(rq->nr_running != 0);
4435 4435
4436 /* No need to migrate the tasks: it was best-effort if 4436 /* No need to migrate the tasks: it was best-effort if
4437 * they didn't do lock_cpu_hotplug(). Just wake up 4437 * they didn't do lock_cpu_hotplug(). Just wake up
4438 * the requestors. */ 4438 * the requestors. */
4439 spin_lock_irq(&rq->lock); 4439 spin_lock_irq(&rq->lock);
4440 while (!list_empty(&rq->migration_queue)) { 4440 while (!list_empty(&rq->migration_queue)) {
4441 migration_req_t *req; 4441 migration_req_t *req;
4442 req = list_entry(rq->migration_queue.next, 4442 req = list_entry(rq->migration_queue.next,
4443 migration_req_t, list); 4443 migration_req_t, list);
4444 BUG_ON(req->type != REQ_MOVE_TASK); 4444 BUG_ON(req->type != REQ_MOVE_TASK);
4445 list_del_init(&req->list); 4445 list_del_init(&req->list);
4446 complete(&req->done); 4446 complete(&req->done);
4447 } 4447 }
4448 spin_unlock_irq(&rq->lock); 4448 spin_unlock_irq(&rq->lock);
4449 break; 4449 break;
4450 #endif 4450 #endif
4451 } 4451 }
4452 return NOTIFY_OK; 4452 return NOTIFY_OK;
4453 } 4453 }
4454 4454
4455 /* Register at highest priority so that task migration (migrate_all_tasks) 4455 /* Register at highest priority so that task migration (migrate_all_tasks)
4456 * happens before everything else. 4456 * happens before everything else.
4457 */ 4457 */
4458 static struct notifier_block __devinitdata migration_notifier = { 4458 static struct notifier_block __devinitdata migration_notifier = {
4459 .notifier_call = migration_call, 4459 .notifier_call = migration_call,
4460 .priority = 10 4460 .priority = 10
4461 }; 4461 };
4462 4462
4463 int __init migration_init(void) 4463 int __init migration_init(void)
4464 { 4464 {
4465 void *cpu = (void *)(long)smp_processor_id(); 4465 void *cpu = (void *)(long)smp_processor_id();
4466 /* Start one for boot CPU. */ 4466 /* Start one for boot CPU. */
4467 migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 4467 migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
4468 migration_call(&migration_notifier, CPU_ONLINE, cpu); 4468 migration_call(&migration_notifier, CPU_ONLINE, cpu);
4469 register_cpu_notifier(&migration_notifier); 4469 register_cpu_notifier(&migration_notifier);
4470 return 0; 4470 return 0;
4471 } 4471 }
4472 #endif 4472 #endif
4473 4473
4474 #ifdef CONFIG_SMP 4474 #ifdef CONFIG_SMP
4475 #define SCHED_DOMAIN_DEBUG 4475 #define SCHED_DOMAIN_DEBUG
4476 #ifdef SCHED_DOMAIN_DEBUG 4476 #ifdef SCHED_DOMAIN_DEBUG
4477 static void sched_domain_debug(struct sched_domain *sd, int cpu) 4477 static void sched_domain_debug(struct sched_domain *sd, int cpu)
4478 { 4478 {
4479 int level = 0; 4479 int level = 0;
4480 4480
4481 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 4481 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
4482 4482
4483 do { 4483 do {
4484 int i; 4484 int i;
4485 char str[NR_CPUS]; 4485 char str[NR_CPUS];
4486 struct sched_group *group = sd->groups; 4486 struct sched_group *group = sd->groups;
4487 cpumask_t groupmask; 4487 cpumask_t groupmask;
4488 4488
4489 cpumask_scnprintf(str, NR_CPUS, sd->span); 4489 cpumask_scnprintf(str, NR_CPUS, sd->span);
4490 cpus_clear(groupmask); 4490 cpus_clear(groupmask);
4491 4491
4492 printk(KERN_DEBUG); 4492 printk(KERN_DEBUG);
4493 for (i = 0; i < level + 1; i++) 4493 for (i = 0; i < level + 1; i++)
4494 printk(" "); 4494 printk(" ");
4495 printk("domain %d: ", level); 4495 printk("domain %d: ", level);
4496 4496
4497 if (!(sd->flags & SD_LOAD_BALANCE)) { 4497 if (!(sd->flags & SD_LOAD_BALANCE)) {
4498 printk("does not load-balance\n"); 4498 printk("does not load-balance\n");
4499 if (sd->parent) 4499 if (sd->parent)
4500 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); 4500 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
4501 break; 4501 break;
4502 } 4502 }
4503 4503
4504 printk("span %s\n", str); 4504 printk("span %s\n", str);
4505 4505
4506 if (!cpu_isset(cpu, sd->span)) 4506 if (!cpu_isset(cpu, sd->span))
4507 printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); 4507 printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
4508 if (!cpu_isset(cpu, group->cpumask)) 4508 if (!cpu_isset(cpu, group->cpumask))
4509 printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); 4509 printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
4510 4510
4511 printk(KERN_DEBUG); 4511 printk(KERN_DEBUG);
4512 for (i = 0; i < level + 2; i++) 4512 for (i = 0; i < level + 2; i++)
4513 printk(" "); 4513 printk(" ");
4514 printk("groups:"); 4514 printk("groups:");
4515 do { 4515 do {
4516 if (!group) { 4516 if (!group) {
4517 printk("\n"); 4517 printk("\n");
4518 printk(KERN_ERR "ERROR: group is NULL\n"); 4518 printk(KERN_ERR "ERROR: group is NULL\n");
4519 break; 4519 break;
4520 } 4520 }
4521 4521
4522 if (!group->cpu_power) { 4522 if (!group->cpu_power) {
4523 printk("\n"); 4523 printk("\n");
4524 printk(KERN_ERR "ERROR: domain->cpu_power not set\n"); 4524 printk(KERN_ERR "ERROR: domain->cpu_power not set\n");
4525 } 4525 }
4526 4526
4527 if (!cpus_weight(group->cpumask)) { 4527 if (!cpus_weight(group->cpumask)) {
4528 printk("\n"); 4528 printk("\n");
4529 printk(KERN_ERR "ERROR: empty group\n"); 4529 printk(KERN_ERR "ERROR: empty group\n");
4530 } 4530 }
4531 4531
4532 if (cpus_intersects(groupmask, group->cpumask)) { 4532 if (cpus_intersects(groupmask, group->cpumask)) {
4533 printk("\n"); 4533 printk("\n");
4534 printk(KERN_ERR "ERROR: repeated CPUs\n"); 4534 printk(KERN_ERR "ERROR: repeated CPUs\n");
4535 } 4535 }
4536 4536
4537 cpus_or(groupmask, groupmask, group->cpumask); 4537 cpus_or(groupmask, groupmask, group->cpumask);
4538 4538
4539 cpumask_scnprintf(str, NR_CPUS, group->cpumask); 4539 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
4540 printk(" %s", str); 4540 printk(" %s", str);
4541 4541
4542 group = group->next; 4542 group = group->next;
4543 } while (group != sd->groups); 4543 } while (group != sd->groups);
4544 printk("\n"); 4544 printk("\n");
4545 4545
4546 if (!cpus_equal(sd->span, groupmask)) 4546 if (!cpus_equal(sd->span, groupmask))
4547 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 4547 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
4548 4548
4549 level++; 4549 level++;
4550 sd = sd->parent; 4550 sd = sd->parent;
4551 4551
4552 if (sd) { 4552 if (sd) {
4553 if (!cpus_subset(groupmask, sd->span)) 4553 if (!cpus_subset(groupmask, sd->span))
4554 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); 4554 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
4555 } 4555 }
4556 4556
4557 } while (sd); 4557 } while (sd);
4558 } 4558 }
4559 #else 4559 #else
4560 #define sched_domain_debug(sd, cpu) {} 4560 #define sched_domain_debug(sd, cpu) {}
4561 #endif 4561 #endif
4562 4562
4563 /* 4563 /*
4564 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 4564 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
4565 * hold the hotplug lock. 4565 * hold the hotplug lock.
4566 */ 4566 */
4567 void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) 4567 void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu)
4568 { 4568 {
4569 migration_req_t req; 4569 migration_req_t req;
4570 unsigned long flags; 4570 unsigned long flags;
4571 runqueue_t *rq = cpu_rq(cpu); 4571 runqueue_t *rq = cpu_rq(cpu);
4572 int local = 1; 4572 int local = 1;
4573 4573
4574 sched_domain_debug(sd, cpu); 4574 sched_domain_debug(sd, cpu);
4575 4575
4576 spin_lock_irqsave(&rq->lock, flags); 4576 spin_lock_irqsave(&rq->lock, flags);
4577 4577
4578 if (cpu == smp_processor_id() || !cpu_online(cpu)) { 4578 if (cpu == smp_processor_id() || !cpu_online(cpu)) {
4579 rq->sd = sd; 4579 rq->sd = sd;
4580 } else { 4580 } else {
4581 init_completion(&req.done); 4581 init_completion(&req.done);
4582 req.type = REQ_SET_DOMAIN; 4582 req.type = REQ_SET_DOMAIN;
4583 req.sd = sd; 4583 req.sd = sd;
4584 list_add(&req.list, &rq->migration_queue); 4584 list_add(&req.list, &rq->migration_queue);
4585 local = 0; 4585 local = 0;
4586 } 4586 }
4587 4587
4588 spin_unlock_irqrestore(&rq->lock, flags); 4588 spin_unlock_irqrestore(&rq->lock, flags);
4589 4589
4590 if (!local) { 4590 if (!local) {
4591 wake_up_process(rq->migration_thread); 4591 wake_up_process(rq->migration_thread);
4592 wait_for_completion(&req.done); 4592 wait_for_completion(&req.done);
4593 } 4593 }
4594 } 4594 }
4595 4595
4596 /* cpus with isolated domains */ 4596 /* cpus with isolated domains */
4597 cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; 4597 cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
4598 4598
4599 /* Setup the mask of cpus configured for isolated domains */ 4599 /* Setup the mask of cpus configured for isolated domains */
4600 static int __init isolated_cpu_setup(char *str) 4600 static int __init isolated_cpu_setup(char *str)
4601 { 4601 {
4602 int ints[NR_CPUS], i; 4602 int ints[NR_CPUS], i;
4603 4603
4604 str = get_options(str, ARRAY_SIZE(ints), ints); 4604 str = get_options(str, ARRAY_SIZE(ints), ints);
4605 cpus_clear(cpu_isolated_map); 4605 cpus_clear(cpu_isolated_map);
4606 for (i = 1; i <= ints[0]; i++) 4606 for (i = 1; i <= ints[0]; i++)
4607 if (ints[i] < NR_CPUS) 4607 if (ints[i] < NR_CPUS)
4608 cpu_set(ints[i], cpu_isolated_map); 4608 cpu_set(ints[i], cpu_isolated_map);
4609 return 1; 4609 return 1;
4610 } 4610 }
4611 4611
4612 __setup ("isolcpus=", isolated_cpu_setup); 4612 __setup ("isolcpus=", isolated_cpu_setup);
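Usage note (illustrative, not part of the diff): booting with, for example,

    isolcpus=2,3

on the kernel command line keeps CPUs 2 and 3 out of the default map built below, so they stay attached to the dummy domain and the load balancer ignores them; tasks can still be placed there explicitly via set_cpus_allowed()/sched_setaffinity().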
4613 4613
4614 /* 4614 /*
4615 * init_sched_build_groups takes an array of groups, the cpumask we wish 4615 * init_sched_build_groups takes an array of groups, the cpumask we wish
4616 * to span, and a pointer to a function which identifies what group a CPU 4616 * to span, and a pointer to a function which identifies what group a CPU
4617 * belongs to. The return value of group_fn must be a valid index into the 4617 * belongs to. The return value of group_fn must be a valid index into the
4618 * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we 4618 * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we
4619 * keep track of groups covered with a cpumask_t). 4619 * keep track of groups covered with a cpumask_t).
4620 * 4620 *
4621 * init_sched_build_groups will build a circular linked list of the groups 4621 * init_sched_build_groups will build a circular linked list of the groups
4622 * covered by the given span, and will set each group's ->cpumask correctly, 4622 * covered by the given span, and will set each group's ->cpumask correctly,
4623 * and ->cpu_power to 0. 4623 * and ->cpu_power to 0.
4624 */ 4624 */
4625 void __devinit init_sched_build_groups(struct sched_group groups[], 4625 void __devinit init_sched_build_groups(struct sched_group groups[],
4626 cpumask_t span, int (*group_fn)(int cpu)) 4626 cpumask_t span, int (*group_fn)(int cpu))
4627 { 4627 {
4628 struct sched_group *first = NULL, *last = NULL; 4628 struct sched_group *first = NULL, *last = NULL;
4629 cpumask_t covered = CPU_MASK_NONE; 4629 cpumask_t covered = CPU_MASK_NONE;
4630 int i; 4630 int i;
4631 4631
4632 for_each_cpu_mask(i, span) { 4632 for_each_cpu_mask(i, span) {
4633 int group = group_fn(i); 4633 int group = group_fn(i);
4634 struct sched_group *sg = &groups[group]; 4634 struct sched_group *sg = &groups[group];
4635 int j; 4635 int j;
4636 4636
4637 if (cpu_isset(i, covered)) 4637 if (cpu_isset(i, covered))
4638 continue; 4638 continue;
4639 4639
4640 sg->cpumask = CPU_MASK_NONE; 4640 sg->cpumask = CPU_MASK_NONE;
4641 sg->cpu_power = 0; 4641 sg->cpu_power = 0;
4642 4642
4643 for_each_cpu_mask(j, span) { 4643 for_each_cpu_mask(j, span) {
4644 if (group_fn(j) != group) 4644 if (group_fn(j) != group)
4645 continue; 4645 continue;
4646 4646
4647 cpu_set(j, covered); 4647 cpu_set(j, covered);
4648 cpu_set(j, sg->cpumask); 4648 cpu_set(j, sg->cpumask);
4649 } 4649 }
4650 if (!first) 4650 if (!first)
4651 first = sg; 4651 first = sg;
4652 if (last) 4652 if (last)
4653 last->next = sg; 4653 last->next = sg;
4654 last = sg; 4654 last = sg;
4655 } 4655 }
4656 last->next = first; 4656 last->next = first;
4657 } 4657 }
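A hedged illustration of the group_fn contract described above: the trivial mapping below puts every CPU in its own group (much like the SMT setup later in this file); the names and the call site are hypothetical.

    /* Hypothetical: one group per CPU over the online map. */
    static struct sched_group example_groups[NR_CPUS];

    static int example_group_fn(int cpu)
    {
        return cpu;             /* valid index: 0 <= cpu < NR_CPUS */
    }

    /* ...at domain-setup time, with the hotplug lock held: */
    init_sched_build_groups(example_groups, cpu_online_map,
                            &example_group_fn);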
4658 4658
4659 4659
4660 #ifdef ARCH_HAS_SCHED_DOMAIN 4660 #ifdef ARCH_HAS_SCHED_DOMAIN
4661 extern void __devinit arch_init_sched_domains(void); 4661 extern void __devinit arch_init_sched_domains(void);
4662 extern void __devinit arch_destroy_sched_domains(void); 4662 extern void __devinit arch_destroy_sched_domains(void);
4663 #else 4663 #else
4664 #ifdef CONFIG_SCHED_SMT 4664 #ifdef CONFIG_SCHED_SMT
4665 static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 4665 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
4666 static struct sched_group sched_group_cpus[NR_CPUS]; 4666 static struct sched_group sched_group_cpus[NR_CPUS];
4667 static int __devinit cpu_to_cpu_group(int cpu) 4667 static int __devinit cpu_to_cpu_group(int cpu)
4668 { 4668 {
4669 return cpu; 4669 return cpu;
4670 } 4670 }
4671 #endif 4671 #endif
4672 4672
4673 static DEFINE_PER_CPU(struct sched_domain, phys_domains); 4673 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
4674 static struct sched_group sched_group_phys[NR_CPUS]; 4674 static struct sched_group sched_group_phys[NR_CPUS];
4675 static int __devinit cpu_to_phys_group(int cpu) 4675 static int __devinit cpu_to_phys_group(int cpu)
4676 { 4676 {
4677 #ifdef CONFIG_SCHED_SMT 4677 #ifdef CONFIG_SCHED_SMT
4678 return first_cpu(cpu_sibling_map[cpu]); 4678 return first_cpu(cpu_sibling_map[cpu]);
4679 #else 4679 #else
4680 return cpu; 4680 return cpu;
4681 #endif 4681 #endif
4682 } 4682 }
4683 4683
4684 #ifdef CONFIG_NUMA 4684 #ifdef CONFIG_NUMA
4685 4685
4686 static DEFINE_PER_CPU(struct sched_domain, node_domains); 4686 static DEFINE_PER_CPU(struct sched_domain, node_domains);
4687 static struct sched_group sched_group_nodes[MAX_NUMNODES]; 4687 static struct sched_group sched_group_nodes[MAX_NUMNODES];
4688 static int __devinit cpu_to_node_group(int cpu) 4688 static int __devinit cpu_to_node_group(int cpu)
4689 { 4689 {
4690 return cpu_to_node(cpu); 4690 return cpu_to_node(cpu);
4691 } 4691 }
4692 #endif 4692 #endif
4693 4693
4694 #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) 4694 #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
4695 /* 4695 /*
4696 * The domains setup code relies on siblings not spanning 4696 * The domains setup code relies on siblings not spanning
4697 * multiple nodes. Make sure the architecture has a proper 4697 * multiple nodes. Make sure the architecture has a proper
4698 * siblings map: 4698 * siblings map:
4699 */ 4699 */
4700 static void check_sibling_maps(void) 4700 static void check_sibling_maps(void)
4701 { 4701 {
4702 int i, j; 4702 int i, j;
4703 4703
4704 for_each_online_cpu(i) { 4704 for_each_online_cpu(i) {
4705 for_each_cpu_mask(j, cpu_sibling_map[i]) { 4705 for_each_cpu_mask(j, cpu_sibling_map[i]) {
4706 if (cpu_to_node(i) != cpu_to_node(j)) { 4706 if (cpu_to_node(i) != cpu_to_node(j)) {
4707 printk(KERN_INFO "warning: CPU %d siblings map " 4707 printk(KERN_INFO "warning: CPU %d siblings map "
4708 "to different node - isolating " 4708 "to different node - isolating "
4709 "them.\n", i); 4709 "them.\n", i);
4710 cpu_sibling_map[i] = cpumask_of_cpu(i); 4710 cpu_sibling_map[i] = cpumask_of_cpu(i);
4711 break; 4711 break;
4712 } 4712 }
4713 } 4713 }
4714 } 4714 }
4715 } 4715 }
4716 #endif 4716 #endif
4717 4717
4718 /* 4718 /*
4719 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 4719 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
4720 */ 4720 */
4721 static void __devinit arch_init_sched_domains(void) 4721 static void __devinit arch_init_sched_domains(void)
4722 { 4722 {
4723 int i; 4723 int i;
4724 cpumask_t cpu_default_map; 4724 cpumask_t cpu_default_map;
4725 4725
4726 #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) 4726 #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
4727 check_sibling_maps(); 4727 check_sibling_maps();
4728 #endif 4728 #endif
4729 /* 4729 /*
4730 * Setup mask for cpus without special case scheduling requirements. 4730 * Setup mask for cpus without special case scheduling requirements.
4731 * For now this just excludes isolated cpus, but could be used to 4731 * For now this just excludes isolated cpus, but could be used to
4732 * exclude other special cases in the future. 4732 * exclude other special cases in the future.
4733 */ 4733 */
4734 cpus_complement(cpu_default_map, cpu_isolated_map); 4734 cpus_complement(cpu_default_map, cpu_isolated_map);
4735 cpus_and(cpu_default_map, cpu_default_map, cpu_online_map); 4735 cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
4736 4736
4737 /* 4737 /*
4738 * Set up domains. Isolated domains just stay on the dummy domain. 4738 * Set up domains. Isolated domains just stay on the dummy domain.
4739 */ 4739 */
4740 for_each_cpu_mask(i, cpu_default_map) { 4740 for_each_cpu_mask(i, cpu_default_map) {
4741 int group; 4741 int group;
4742 struct sched_domain *sd = NULL, *p; 4742 struct sched_domain *sd = NULL, *p;
4743 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); 4743 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
4744 4744
4745 cpus_and(nodemask, nodemask, cpu_default_map); 4745 cpus_and(nodemask, nodemask, cpu_default_map);
4746 4746
4747 #ifdef CONFIG_NUMA 4747 #ifdef CONFIG_NUMA
4748 sd = &per_cpu(node_domains, i); 4748 sd = &per_cpu(node_domains, i);
4749 group = cpu_to_node_group(i); 4749 group = cpu_to_node_group(i);
4750 *sd = SD_NODE_INIT; 4750 *sd = SD_NODE_INIT;
4751 sd->span = cpu_default_map; 4751 sd->span = cpu_default_map;
4752 sd->groups = &sched_group_nodes[group]; 4752 sd->groups = &sched_group_nodes[group];
4753 #endif 4753 #endif
4754 4754
4755 p = sd; 4755 p = sd;
4756 sd = &per_cpu(phys_domains, i); 4756 sd = &per_cpu(phys_domains, i);
4757 group = cpu_to_phys_group(i); 4757 group = cpu_to_phys_group(i);
4758 *sd = SD_CPU_INIT; 4758 *sd = SD_CPU_INIT;
4759 sd->span = nodemask; 4759 sd->span = nodemask;
4760 sd->parent = p; 4760 sd->parent = p;
4761 sd->groups = &sched_group_phys[group]; 4761 sd->groups = &sched_group_phys[group];
4762 4762
4763 #ifdef CONFIG_SCHED_SMT 4763 #ifdef CONFIG_SCHED_SMT
4764 p = sd; 4764 p = sd;
4765 sd = &per_cpu(cpu_domains, i); 4765 sd = &per_cpu(cpu_domains, i);
4766 group = cpu_to_cpu_group(i); 4766 group = cpu_to_cpu_group(i);
4767 *sd = SD_SIBLING_INIT; 4767 *sd = SD_SIBLING_INIT;
4768 sd->span = cpu_sibling_map[i]; 4768 sd->span = cpu_sibling_map[i];
4769 cpus_and(sd->span, sd->span, cpu_default_map); 4769 cpus_and(sd->span, sd->span, cpu_default_map);
4770 sd->parent = p; 4770 sd->parent = p;
4771 sd->groups = &sched_group_cpus[group]; 4771 sd->groups = &sched_group_cpus[group];
4772 #endif 4772 #endif
4773 } 4773 }
4774 4774
4775 #ifdef CONFIG_SCHED_SMT 4775 #ifdef CONFIG_SCHED_SMT
4776 /* Set up CPU (sibling) groups */ 4776 /* Set up CPU (sibling) groups */
4777 for_each_online_cpu(i) { 4777 for_each_online_cpu(i) {
4778 cpumask_t this_sibling_map = cpu_sibling_map[i]; 4778 cpumask_t this_sibling_map = cpu_sibling_map[i];
4779 cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); 4779 cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
4780 if (i != first_cpu(this_sibling_map)) 4780 if (i != first_cpu(this_sibling_map))
4781 continue; 4781 continue;
4782 4782
4783 init_sched_build_groups(sched_group_cpus, this_sibling_map, 4783 init_sched_build_groups(sched_group_cpus, this_sibling_map,
4784 &cpu_to_cpu_group); 4784 &cpu_to_cpu_group);
4785 } 4785 }
4786 #endif 4786 #endif
4787 4787
4788 /* Set up physical groups */ 4788 /* Set up physical groups */
4789 for (i = 0; i < MAX_NUMNODES; i++) { 4789 for (i = 0; i < MAX_NUMNODES; i++) {
4790 cpumask_t nodemask = node_to_cpumask(i); 4790 cpumask_t nodemask = node_to_cpumask(i);
4791 4791
4792 cpus_and(nodemask, nodemask, cpu_default_map); 4792 cpus_and(nodemask, nodemask, cpu_default_map);
4793 if (cpus_empty(nodemask)) 4793 if (cpus_empty(nodemask))
4794 continue; 4794 continue;
4795 4795
4796 init_sched_build_groups(sched_group_phys, nodemask, 4796 init_sched_build_groups(sched_group_phys, nodemask,
4797 &cpu_to_phys_group); 4797 &cpu_to_phys_group);
4798 } 4798 }
4799 4799
4800 #ifdef CONFIG_NUMA 4800 #ifdef CONFIG_NUMA
4801 /* Set up node groups */ 4801 /* Set up node groups */
4802 init_sched_build_groups(sched_group_nodes, cpu_default_map, 4802 init_sched_build_groups(sched_group_nodes, cpu_default_map,
4803 &cpu_to_node_group); 4803 &cpu_to_node_group);
4804 #endif 4804 #endif
4805 4805
4806 /* Calculate CPU power for physical packages and nodes */ 4806 /* Calculate CPU power for physical packages and nodes */
4807 for_each_cpu_mask(i, cpu_default_map) { 4807 for_each_cpu_mask(i, cpu_default_map) {
4808 int power; 4808 int power;
4809 struct sched_domain *sd; 4809 struct sched_domain *sd;
4810 #ifdef CONFIG_SCHED_SMT 4810 #ifdef CONFIG_SCHED_SMT
4811 sd = &per_cpu(cpu_domains, i); 4811 sd = &per_cpu(cpu_domains, i);
4812 power = SCHED_LOAD_SCALE; 4812 power = SCHED_LOAD_SCALE;
4813 sd->groups->cpu_power = power; 4813 sd->groups->cpu_power = power;
4814 #endif 4814 #endif
4815 4815
4816 sd = &per_cpu(phys_domains, i); 4816 sd = &per_cpu(phys_domains, i);
4817 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * 4817 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
4818 (cpus_weight(sd->groups->cpumask)-1) / 10; 4818 (cpus_weight(sd->groups->cpumask)-1) / 10;
4819 sd->groups->cpu_power = power; 4819 sd->groups->cpu_power = power;
4820 4820
4821 #ifdef CONFIG_NUMA 4821 #ifdef CONFIG_NUMA
4822 if (i == first_cpu(sd->groups->cpumask)) { 4822 if (i == first_cpu(sd->groups->cpumask)) {
4823 /* Only add "power" once for each physical package. */ 4823 /* Only add "power" once for each physical package. */
4824 sd = &per_cpu(node_domains, i); 4824 sd = &per_cpu(node_domains, i);
4825 sd->groups->cpu_power += power; 4825 sd->groups->cpu_power += power;
4826 } 4826 }
4827 #endif 4827 #endif
4828 } 4828 }
4829 4829
4830 /* Attach the domains */ 4830 /* Attach the domains */
4831 for_each_online_cpu(i) { 4831 for_each_online_cpu(i) {
4832 struct sched_domain *sd; 4832 struct sched_domain *sd;
4833 #ifdef CONFIG_SCHED_SMT 4833 #ifdef CONFIG_SCHED_SMT
4834 sd = &per_cpu(cpu_domains, i); 4834 sd = &per_cpu(cpu_domains, i);
4835 #else 4835 #else
4836 sd = &per_cpu(phys_domains, i); 4836 sd = &per_cpu(phys_domains, i);
4837 #endif 4837 #endif
4838 cpu_attach_domain(sd, i); 4838 cpu_attach_domain(sd, i);
4839 } 4839 }
4840 } 4840 }
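Worked example of the physical-package cpu_power formula above, assuming SCHED_LOAD_SCALE is 128 (its conventional value; check the tree you are reading):

    /*
     *   2 CPUs in the package: 128 + 128 * (2 - 1) / 10 = 128 + 12 = 140
     *   4 CPUs in the package: 128 + 128 * (4 - 1) / 10 = 128 + 38 = 166
     *
     * Extra siblings add only ~10% each, so the balancer prefers spreading
     * load across packages before stacking it onto siblings of one package.
     */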
4841 4841
4842 #ifdef CONFIG_HOTPLUG_CPU 4842 #ifdef CONFIG_HOTPLUG_CPU
4843 static void __devinit arch_destroy_sched_domains(void) 4843 static void __devinit arch_destroy_sched_domains(void)
4844 { 4844 {
4845 /* Do nothing: everything is statically allocated. */ 4845 /* Do nothing: everything is statically allocated. */
4846 } 4846 }
4847 #endif 4847 #endif
4848 4848
4849 #endif /* ARCH_HAS_SCHED_DOMAIN */ 4849 #endif /* ARCH_HAS_SCHED_DOMAIN */
4850 4850
4851 /* 4851 /*
4852 * Initial dummy domain for early boot and for hotplug cpu. Being static, 4852 * Initial dummy domain for early boot and for hotplug cpu. Being static,
4853 * it is initialized to zero, so all balancing flags are cleared which is 4853 * it is initialized to zero, so all balancing flags are cleared which is
4854 * what we want. 4854 * what we want.
4855 */ 4855 */
4856 static struct sched_domain sched_domain_dummy; 4856 static struct sched_domain sched_domain_dummy;
4857 4857
4858 #ifdef CONFIG_HOTPLUG_CPU 4858 #ifdef CONFIG_HOTPLUG_CPU
4859 /* 4859 /*
4860 * Force a reinitialization of the sched domains hierarchy. The domains 4860 * Force a reinitialization of the sched domains hierarchy. The domains
4861 * and groups cannot be updated in place without racing with the balancing 4861 * and groups cannot be updated in place without racing with the balancing
4862 * code, so we temporarily attach all running cpus to a "dummy" domain 4862 * code, so we temporarily attach all running cpus to a "dummy" domain
4863 * which will prevent rebalancing while the sched domains are recalculated. 4863 * which will prevent rebalancing while the sched domains are recalculated.
4864 */ 4864 */
4865 static int update_sched_domains(struct notifier_block *nfb, 4865 static int update_sched_domains(struct notifier_block *nfb,
4866 unsigned long action, void *hcpu) 4866 unsigned long action, void *hcpu)
4867 { 4867 {
4868 int i; 4868 int i;
4869 4869
4870 switch (action) { 4870 switch (action) {
4871 case CPU_UP_PREPARE: 4871 case CPU_UP_PREPARE:
4872 case CPU_DOWN_PREPARE: 4872 case CPU_DOWN_PREPARE:
4873 for_each_online_cpu(i) 4873 for_each_online_cpu(i)
4874 cpu_attach_domain(&sched_domain_dummy, i); 4874 cpu_attach_domain(&sched_domain_dummy, i);
4875 arch_destroy_sched_domains(); 4875 arch_destroy_sched_domains();
4876 return NOTIFY_OK; 4876 return NOTIFY_OK;
4877 4877
4878 case CPU_UP_CANCELED: 4878 case CPU_UP_CANCELED:
4879 case CPU_DOWN_FAILED: 4879 case CPU_DOWN_FAILED:
4880 case CPU_ONLINE: 4880 case CPU_ONLINE:
4881 case CPU_DEAD: 4881 case CPU_DEAD:
4882 /* 4882 /*
4883 * Fall through and re-initialise the domains. 4883 * Fall through and re-initialise the domains.
4884 */ 4884 */
4885 break; 4885 break;
4886 default: 4886 default:
4887 return NOTIFY_DONE; 4887 return NOTIFY_DONE;
4888 } 4888 }
4889 4889
4890 /* The hotplug lock is already held by cpu_up/cpu_down */ 4890 /* The hotplug lock is already held by cpu_up/cpu_down */
4891 arch_init_sched_domains(); 4891 arch_init_sched_domains();
4892 4892
4893 return NOTIFY_OK; 4893 return NOTIFY_OK;
4894 } 4894 }
4895 #endif 4895 #endif
4896 4896
4897 void __init sched_init_smp(void) 4897 void __init sched_init_smp(void)
4898 { 4898 {
4899 lock_cpu_hotplug(); 4899 lock_cpu_hotplug();
4900 arch_init_sched_domains(); 4900 arch_init_sched_domains();
4901 unlock_cpu_hotplug(); 4901 unlock_cpu_hotplug();
4902 /* XXX: Theoretical race here - CPU may be hotplugged now */ 4902 /* XXX: Theoretical race here - CPU may be hotplugged now */
4903 hotcpu_notifier(update_sched_domains, 0); 4903 hotcpu_notifier(update_sched_domains, 0);
4904 } 4904 }
4905 #else 4905 #else
4906 void __init sched_init_smp(void) 4906 void __init sched_init_smp(void)
4907 { 4907 {
4908 } 4908 }
4909 #endif /* CONFIG_SMP */ 4909 #endif /* CONFIG_SMP */
4910 4910
4911 int in_sched_functions(unsigned long addr) 4911 int in_sched_functions(unsigned long addr)
4912 { 4912 {
4913 /* Linker adds these: start and end of __sched functions */ 4913 /* Linker adds these: start and end of __sched functions */
4914 extern char __sched_text_start[], __sched_text_end[]; 4914 extern char __sched_text_start[], __sched_text_end[];
4915 return in_lock_functions(addr) || 4915 return in_lock_functions(addr) ||
4916 (addr >= (unsigned long)__sched_text_start 4916 (addr >= (unsigned long)__sched_text_start
4917 && addr < (unsigned long)__sched_text_end); 4917 && addr < (unsigned long)__sched_text_end);
4918 } 4918 }
4919 4919
4920 void __init sched_init(void) 4920 void __init sched_init(void)
4921 { 4921 {
4922 runqueue_t *rq; 4922 runqueue_t *rq;
4923 int i, j, k; 4923 int i, j, k;
4924 4924
4925 for (i = 0; i < NR_CPUS; i++) { 4925 for (i = 0; i < NR_CPUS; i++) {
4926 prio_array_t *array; 4926 prio_array_t *array;
4927 4927
4928 rq = cpu_rq(i); 4928 rq = cpu_rq(i);
4929 spin_lock_init(&rq->lock); 4929 spin_lock_init(&rq->lock);
4930 rq->active = rq->arrays; 4930 rq->active = rq->arrays;
4931 rq->expired = rq->arrays + 1; 4931 rq->expired = rq->arrays + 1;
4932 rq->best_expired_prio = MAX_PRIO; 4932 rq->best_expired_prio = MAX_PRIO;
4933 4933
4934 #ifdef CONFIG_SMP 4934 #ifdef CONFIG_SMP
4935 rq->sd = &sched_domain_dummy; 4935 rq->sd = &sched_domain_dummy;
4936 rq->cpu_load = 0; 4936 rq->cpu_load = 0;
4937 rq->active_balance = 0; 4937 rq->active_balance = 0;
4938 rq->push_cpu = 0; 4938 rq->push_cpu = 0;
4939 rq->migration_thread = NULL; 4939 rq->migration_thread = NULL;
4940 INIT_LIST_HEAD(&rq->migration_queue); 4940 INIT_LIST_HEAD(&rq->migration_queue);
4941 #endif 4941 #endif
4942 atomic_set(&rq->nr_iowait, 0); 4942 atomic_set(&rq->nr_iowait, 0);
4943 4943
4944 for (j = 0; j < 2; j++) { 4944 for (j = 0; j < 2; j++) {
4945 array = rq->arrays + j; 4945 array = rq->arrays + j;
4946 for (k = 0; k < MAX_PRIO; k++) { 4946 for (k = 0; k < MAX_PRIO; k++) {
4947 INIT_LIST_HEAD(array->queue + k); 4947 INIT_LIST_HEAD(array->queue + k);
4948 __clear_bit(k, array->bitmap); 4948 __clear_bit(k, array->bitmap);
4949 } 4949 }
4950 // delimiter for bitsearch 4950 // delimiter for bitsearch
4951 __set_bit(MAX_PRIO, array->bitmap); 4951 __set_bit(MAX_PRIO, array->bitmap);
4952 } 4952 }
4953 } 4953 }
4954 4954
4955 /* 4955 /*
4956 * The boot idle thread does lazy MMU switching as well: 4956 * The boot idle thread does lazy MMU switching as well:
4957 */ 4957 */
4958 atomic_inc(&init_mm.mm_count); 4958 atomic_inc(&init_mm.mm_count);
4959 enter_lazy_tlb(&init_mm, current); 4959 enter_lazy_tlb(&init_mm, current);
4960 4960
4961 /* 4961 /*
4962 * Make us the idle thread. Technically, schedule() should not be 4962 * Make us the idle thread. Technically, schedule() should not be
4963 * called from this thread, however somewhere below it might be, 4963 * called from this thread, however somewhere below it might be,
4964 * but because we are the idle thread, we just pick up running again 4964 * but because we are the idle thread, we just pick up running again
4965 * when this runqueue becomes "idle". 4965 * when this runqueue becomes "idle".
4966 */ 4966 */
4967 init_idle(current, smp_processor_id()); 4967 init_idle(current, smp_processor_id());
4968 } 4968 }
4969 4969
4970 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 4970 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4971 void __might_sleep(char *file, int line) 4971 void __might_sleep(char *file, int line)
4972 { 4972 {
4973 #if defined(in_atomic) 4973 #if defined(in_atomic)
4974 static unsigned long prev_jiffy; /* ratelimiting */ 4974 static unsigned long prev_jiffy; /* ratelimiting */
4975 4975
4976 if ((in_atomic() || irqs_disabled()) && 4976 if ((in_atomic() || irqs_disabled()) &&
4977 system_state == SYSTEM_RUNNING && !oops_in_progress) { 4977 system_state == SYSTEM_RUNNING && !oops_in_progress) {
4978 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 4978 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
4979 return; 4979 return;
4980 prev_jiffy = jiffies; 4980 prev_jiffy = jiffies;
4981 printk(KERN_ERR "Debug: sleeping function called from invalid" 4981 printk(KERN_ERR "Debug: sleeping function called from invalid"
4982 " context at %s:%d\n", file, line); 4982 " context at %s:%d\n", file, line);
4983 printk("in_atomic():%d, irqs_disabled():%d\n", 4983 printk("in_atomic():%d, irqs_disabled():%d\n",
4984 in_atomic(), irqs_disabled()); 4984 in_atomic(), irqs_disabled());
4985 dump_stack(); 4985 dump_stack();
4986 } 4986 }
4987 #endif 4987 #endif
4988 } 4988 }
4989 EXPORT_SYMBOL(__might_sleep); 4989 EXPORT_SYMBOL(__might_sleep);
4990 #endif 4990 #endif
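A hedged usage sketch of the check above: callers use the might_sleep() macro, which expands to __might_sleep(__FILE__, __LINE__) when CONFIG_DEBUG_SPINLOCK_SLEEP is set; the helper below is hypothetical.

    #include <linux/kernel.h>
    #include <linux/slab.h>

    /* Hypothetical helper: documents (and, with the debug option enabled,
     * verifies) that this function may sleep. */
    static void *my_buf_alloc(size_t size)
    {
        might_sleep();                  /* warns if called in atomic context */
        return kmalloc(size, GFP_KERNEL);
    }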
4991 4991
4992 #ifdef CONFIG_MAGIC_SYSRQ 4992 #ifdef CONFIG_MAGIC_SYSRQ
4993 void normalize_rt_tasks(void) 4993 void normalize_rt_tasks(void)
4994 { 4994 {
4995 struct task_struct *p; 4995 struct task_struct *p;
4996 prio_array_t *array; 4996 prio_array_t *array;
4997 unsigned long flags; 4997 unsigned long flags;
4998 runqueue_t *rq; 4998 runqueue_t *rq;
4999 4999
5000 read_lock_irq(&tasklist_lock); 5000 read_lock_irq(&tasklist_lock);
5001 for_each_process (p) { 5001 for_each_process (p) {
5002 if (!rt_task(p)) 5002 if (!rt_task(p))
5003 continue; 5003 continue;
5004 5004
5005 rq = task_rq_lock(p, &flags); 5005 rq = task_rq_lock(p, &flags);
5006 5006
5007 array = p->array; 5007 array = p->array;
5008 if (array) 5008 if (array)
5009 deactivate_task(p, task_rq(p)); 5009 deactivate_task(p, task_rq(p));
5010 __setscheduler(p, SCHED_NORMAL, 0); 5010 __setscheduler(p, SCHED_NORMAL, 0);
5011 if (array) { 5011 if (array) {
5012 __activate_task(p, task_rq(p)); 5012 __activate_task(p, task_rq(p));
5013 resched_task(rq->curr); 5013 resched_task(rq->curr);
5014 } 5014 }
5015 5015
5016 task_rq_unlock(rq, &flags); 5016 task_rq_unlock(rq, &flags);
5017 } 5017 }
5018 read_unlock_irq(&tasklist_lock); 5018 read_unlock_irq(&tasklist_lock);
5019 } 5019 }
5020 5020
5021 #endif /* CONFIG_MAGIC_SYSRQ */ 5021 #endif /* CONFIG_MAGIC_SYSRQ */
5022 5022
kernel/stop_machine.c
1 #include <linux/stop_machine.h> 1 #include <linux/stop_machine.h>
2 #include <linux/kthread.h> 2 #include <linux/kthread.h>
3 #include <linux/sched.h> 3 #include <linux/sched.h>
4 #include <linux/cpu.h> 4 #include <linux/cpu.h>
5 #include <linux/err.h> 5 #include <linux/err.h>
6 #include <linux/syscalls.h> 6 #include <linux/syscalls.h>
7 #include <asm/atomic.h> 7 #include <asm/atomic.h>
8 #include <asm/semaphore.h> 8 #include <asm/semaphore.h>
9 #include <asm/uaccess.h> 9 #include <asm/uaccess.h>
10 10
11 /* Since we affect priority and affinity (both of which are visible 11 /* Since we affect priority and affinity (both of which are visible
12 * to, and settable by outside processes) we do indirection via a 12 * to, and settable by outside processes) we do indirection via a
13 * kthread. */ 13 * kthread. */
14 14
15 /* Thread to stop each CPU in user context. */ 15 /* Thread to stop each CPU in user context. */
16 enum stopmachine_state { 16 enum stopmachine_state {
17 STOPMACHINE_WAIT, 17 STOPMACHINE_WAIT,
18 STOPMACHINE_PREPARE, 18 STOPMACHINE_PREPARE,
19 STOPMACHINE_DISABLE_IRQ, 19 STOPMACHINE_DISABLE_IRQ,
20 STOPMACHINE_EXIT, 20 STOPMACHINE_EXIT,
21 }; 21 };
22 22
23 static enum stopmachine_state stopmachine_state; 23 static enum stopmachine_state stopmachine_state;
24 static unsigned int stopmachine_num_threads; 24 static unsigned int stopmachine_num_threads;
25 static atomic_t stopmachine_thread_ack; 25 static atomic_t stopmachine_thread_ack;
26 static DECLARE_MUTEX(stopmachine_mutex); 26 static DECLARE_MUTEX(stopmachine_mutex);
27 27
28 static int stopmachine(void *cpu) 28 static int stopmachine(void *cpu)
29 { 29 {
30 int irqs_disabled = 0; 30 int irqs_disabled = 0;
31 int prepared = 0; 31 int prepared = 0;
32 32
33 set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu)); 33 set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu));
34 34
35 /* Ack: we are alive */ 35 /* Ack: we are alive */
36 smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ 36 smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */
37 atomic_inc(&stopmachine_thread_ack); 37 atomic_inc(&stopmachine_thread_ack);
38 38
39 /* Simple state machine */ 39 /* Simple state machine */
40 while (stopmachine_state != STOPMACHINE_EXIT) { 40 while (stopmachine_state != STOPMACHINE_EXIT) {
41 if (stopmachine_state == STOPMACHINE_DISABLE_IRQ 41 if (stopmachine_state == STOPMACHINE_DISABLE_IRQ
42 && !irqs_disabled) { 42 && !irqs_disabled) {
43 local_irq_disable(); 43 local_irq_disable();
44 irqs_disabled = 1; 44 irqs_disabled = 1;
45 /* Ack: irqs disabled. */ 45 /* Ack: irqs disabled. */
46 smp_mb(); /* Must read state first. */ 46 smp_mb(); /* Must read state first. */
47 atomic_inc(&stopmachine_thread_ack); 47 atomic_inc(&stopmachine_thread_ack);
48 } else if (stopmachine_state == STOPMACHINE_PREPARE 48 } else if (stopmachine_state == STOPMACHINE_PREPARE
49 && !prepared) { 49 && !prepared) {
50 /* Everyone is in place, hold CPU. */ 50 /* Everyone is in place, hold CPU. */
51 preempt_disable(); 51 preempt_disable();
52 prepared = 1; 52 prepared = 1;
53 smp_mb(); /* Must read state first. */ 53 smp_mb(); /* Must read state first. */
54 atomic_inc(&stopmachine_thread_ack); 54 atomic_inc(&stopmachine_thread_ack);
55 } 55 }
56 /* Yield in first stage: migration threads need to 56 /* Yield in first stage: migration threads need to
57 * help our sisters onto their CPUs. */ 57 * help our sisters onto their CPUs. */
58 if (!prepared && !irqs_disabled) 58 if (!prepared && !irqs_disabled)
59 yield(); 59 yield();
60 else 60 else
61 cpu_relax(); 61 cpu_relax();
62 } 62 }
63 63
64 /* Ack: we are exiting. */ 64 /* Ack: we are exiting. */
65 smp_mb(); /* Must read state first. */ 65 smp_mb(); /* Must read state first. */
66 atomic_inc(&stopmachine_thread_ack); 66 atomic_inc(&stopmachine_thread_ack);
67 67
68 if (irqs_disabled) 68 if (irqs_disabled)
69 local_irq_enable(); 69 local_irq_enable();
70 if (prepared) 70 if (prepared)
71 preempt_enable(); 71 preempt_enable();
72 72
73 return 0; 73 return 0;
74 } 74 }
75 75
76 /* Change the thread state */ 76 /* Change the thread state */
77 static void stopmachine_set_state(enum stopmachine_state state) 77 static void stopmachine_set_state(enum stopmachine_state state)
78 { 78 {
79 atomic_set(&stopmachine_thread_ack, 0); 79 atomic_set(&stopmachine_thread_ack, 0);
80 smp_wmb(); 80 smp_wmb();
81 stopmachine_state = state; 81 stopmachine_state = state;
82 while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) 82 while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads)
83 cpu_relax(); 83 cpu_relax();
84 } 84 }
85 85
86 static int stop_machine(void) 86 static int stop_machine(void)
87 { 87 {
88 int i, ret = 0; 88 int i, ret = 0;
89 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 89 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
90 mm_segment_t old_fs = get_fs(); 90 mm_segment_t old_fs = get_fs();
91 91
92 /* One high-prio thread per cpu. We'll do this one. */ 92 /* One high-prio thread per cpu. We'll do this one. */
93 set_fs(KERNEL_DS); 93 set_fs(KERNEL_DS);
94 sys_sched_setscheduler(current->pid, SCHED_FIFO, 94 sys_sched_setscheduler(current->pid, SCHED_FIFO,
95 (struct sched_param __user *)&param); 95 (struct sched_param __user *)&param);
96 set_fs(old_fs); 96 set_fs(old_fs);
97 97
98 atomic_set(&stopmachine_thread_ack, 0); 98 atomic_set(&stopmachine_thread_ack, 0);
99 stopmachine_num_threads = 0; 99 stopmachine_num_threads = 0;
100 stopmachine_state = STOPMACHINE_WAIT; 100 stopmachine_state = STOPMACHINE_WAIT;
101 101
102 for_each_online_cpu(i) { 102 for_each_online_cpu(i) {
103 if (i == _smp_processor_id()) 103 if (i == raw_smp_processor_id())
104 continue; 104 continue;
105 ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); 105 ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL);
106 if (ret < 0) 106 if (ret < 0)
107 break; 107 break;
108 stopmachine_num_threads++; 108 stopmachine_num_threads++;
109 } 109 }
110 110
111 /* Wait for them all to come to life. */ 111 /* Wait for them all to come to life. */
112 while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) 112 while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads)
113 yield(); 113 yield();
114 114
115 /* If some failed, kill them all. */ 115 /* If some failed, kill them all. */
116 if (ret < 0) { 116 if (ret < 0) {
117 stopmachine_set_state(STOPMACHINE_EXIT); 117 stopmachine_set_state(STOPMACHINE_EXIT);
118 up(&stopmachine_mutex); 118 up(&stopmachine_mutex);
119 return ret; 119 return ret;
120 } 120 }
121 121
122 /* Don't schedule us away at this point, please. */ 122 /* Don't schedule us away at this point, please. */
123 local_irq_disable(); 123 local_irq_disable();
124 124
125 /* Now they are all started, make them hold the CPUs, ready. */ 125 /* Now they are all started, make them hold the CPUs, ready. */
126 stopmachine_set_state(STOPMACHINE_PREPARE); 126 stopmachine_set_state(STOPMACHINE_PREPARE);
127 127
128 /* Make them disable irqs. */ 128 /* Make them disable irqs. */
129 stopmachine_set_state(STOPMACHINE_DISABLE_IRQ); 129 stopmachine_set_state(STOPMACHINE_DISABLE_IRQ);
130 130
131 return 0; 131 return 0;
132 } 132 }
133 133
134 static void restart_machine(void) 134 static void restart_machine(void)
135 { 135 {
136 stopmachine_set_state(STOPMACHINE_EXIT); 136 stopmachine_set_state(STOPMACHINE_EXIT);
137 local_irq_enable(); 137 local_irq_enable();
138 } 138 }
139 139
140 struct stop_machine_data 140 struct stop_machine_data
141 { 141 {
142 int (*fn)(void *); 142 int (*fn)(void *);
143 void *data; 143 void *data;
144 struct completion done; 144 struct completion done;
145 }; 145 };
146 146
147 static int do_stop(void *_smdata) 147 static int do_stop(void *_smdata)
148 { 148 {
149 struct stop_machine_data *smdata = _smdata; 149 struct stop_machine_data *smdata = _smdata;
150 int ret; 150 int ret;
151 151
152 ret = stop_machine(); 152 ret = stop_machine();
153 if (ret == 0) { 153 if (ret == 0) {
154 ret = smdata->fn(smdata->data); 154 ret = smdata->fn(smdata->data);
155 restart_machine(); 155 restart_machine();
156 } 156 }
157 157
158 /* We're done: you can kthread_stop us now */ 158 /* We're done: you can kthread_stop us now */
159 complete(&smdata->done); 159 complete(&smdata->done);
160 160
161 /* Wait for kthread_stop */ 161 /* Wait for kthread_stop */
162 set_current_state(TASK_INTERRUPTIBLE); 162 set_current_state(TASK_INTERRUPTIBLE);
163 while (!kthread_should_stop()) { 163 while (!kthread_should_stop()) {
164 schedule(); 164 schedule();
165 set_current_state(TASK_INTERRUPTIBLE); 165 set_current_state(TASK_INTERRUPTIBLE);
166 } 166 }
167 __set_current_state(TASK_RUNNING); 167 __set_current_state(TASK_RUNNING);
168 return ret; 168 return ret;
169 } 169 }
170 170
171 struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, 171 struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
172 unsigned int cpu) 172 unsigned int cpu)
173 { 173 {
174 struct stop_machine_data smdata; 174 struct stop_machine_data smdata;
175 struct task_struct *p; 175 struct task_struct *p;
176 176
177 smdata.fn = fn; 177 smdata.fn = fn;
178 smdata.data = data; 178 smdata.data = data;
179 init_completion(&smdata.done); 179 init_completion(&smdata.done);
180 180
181 down(&stopmachine_mutex); 181 down(&stopmachine_mutex);
182 182
183 /* If they don't care which CPU fn runs on, bind to any online one. */ 183 /* If they don't care which CPU fn runs on, bind to any online one. */
184 if (cpu == NR_CPUS) 184 if (cpu == NR_CPUS)
185 cpu = _smp_processor_id(); 185 cpu = raw_smp_processor_id();
186 186
187 p = kthread_create(do_stop, &smdata, "kstopmachine"); 187 p = kthread_create(do_stop, &smdata, "kstopmachine");
188 if (!IS_ERR(p)) { 188 if (!IS_ERR(p)) {
189 kthread_bind(p, cpu); 189 kthread_bind(p, cpu);
190 wake_up_process(p); 190 wake_up_process(p);
191 wait_for_completion(&smdata.done); 191 wait_for_completion(&smdata.done);
192 } 192 }
193 up(&stopmachine_mutex); 193 up(&stopmachine_mutex);
194 return p; 194 return p;
195 } 195 }
196 196
197 int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) 197 int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
198 { 198 {
199 struct task_struct *p; 199 struct task_struct *p;
200 int ret; 200 int ret;
201 201
202 /* No CPUs can come up or down during this. */ 202 /* No CPUs can come up or down during this. */
203 lock_cpu_hotplug(); 203 lock_cpu_hotplug();
204 p = __stop_machine_run(fn, data, cpu); 204 p = __stop_machine_run(fn, data, cpu);
205 if (!IS_ERR(p)) 205 if (!IS_ERR(p))
206 ret = kthread_stop(p); 206 ret = kthread_stop(p);
207 else 207 else
208 ret = PTR_ERR(p); 208 ret = PTR_ERR(p);
209 unlock_cpu_hotplug(); 209 unlock_cpu_hotplug();
210 210
211 return ret; 211 return ret;
212 } 212 }
213 213
lib/Makefile
1 # 1 #
2 # Makefile for some libs needed in the kernel. 2 # Makefile for some libs needed in the kernel.
3 # 3 #
4 4
5 lib-y := errno.o ctype.o string.o vsprintf.o cmdline.o \ 5 lib-y := errno.o ctype.o string.o vsprintf.o cmdline.o \
6 bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o \ 6 bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o \
7 idr.o div64.o int_sqrt.o bitmap.o extable.o prio_tree.o \ 7 idr.o div64.o int_sqrt.o bitmap.o extable.o prio_tree.o \
8 sha1.o halfmd4.o 8 sha1.o halfmd4.o
9 9
10 lib-y += kobject.o kref.o kobject_uevent.o klist.o 10 lib-y += kobject.o kref.o kobject_uevent.o klist.o
11 11
12 obj-y += sort.o parser.o 12 obj-y += sort.o parser.o
13 13
14 ifeq ($(CONFIG_DEBUG_KOBJECT),y) 14 ifeq ($(CONFIG_DEBUG_KOBJECT),y)
15 CFLAGS_kobject.o += -DDEBUG 15 CFLAGS_kobject.o += -DDEBUG
16 CFLAGS_kobject_uevent.o += -DDEBUG 16 CFLAGS_kobject_uevent.o += -DDEBUG
17 endif 17 endif
18 18
19 lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o 19 lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
20 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o 20 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
21 lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o 21 lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
22 obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o 22 obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
23 obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
23 24
24 ifneq ($(CONFIG_HAVE_DEC_LOCK),y) 25 ifneq ($(CONFIG_HAVE_DEC_LOCK),y)
25 lib-y += dec_and_lock.o 26 lib-y += dec_and_lock.o
26 endif 27 endif
27 28
28 obj-$(CONFIG_CRC_CCITT) += crc-ccitt.o 29 obj-$(CONFIG_CRC_CCITT) += crc-ccitt.o
29 obj-$(CONFIG_CRC32) += crc32.o 30 obj-$(CONFIG_CRC32) += crc32.o
30 obj-$(CONFIG_LIBCRC32C) += libcrc32c.o 31 obj-$(CONFIG_LIBCRC32C) += libcrc32c.o
31 obj-$(CONFIG_GENERIC_IOMAP) += iomap.o 32 obj-$(CONFIG_GENERIC_IOMAP) += iomap.o
32 33
33 obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/ 34 obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/
34 obj-$(CONFIG_ZLIB_DEFLATE) += zlib_deflate/ 35 obj-$(CONFIG_ZLIB_DEFLATE) += zlib_deflate/
35 obj-$(CONFIG_REED_SOLOMON) += reed_solomon/ 36 obj-$(CONFIG_REED_SOLOMON) += reed_solomon/
36 37
37 hostprogs-y := gen_crc32table 38 hostprogs-y := gen_crc32table
38 clean-files := crc32table.h 39 clean-files := crc32table.h
39 40
40 $(obj)/crc32.o: $(obj)/crc32table.h 41 $(obj)/crc32.o: $(obj)/crc32table.h
41 42
42 quiet_cmd_crc32 = GEN $@ 43 quiet_cmd_crc32 = GEN $@
43 cmd_crc32 = $< > $@ 44 cmd_crc32 = $< > $@
44 45
45 $(obj)/crc32table.h: $(obj)/gen_crc32table 46 $(obj)/crc32table.h: $(obj)/gen_crc32table
46 $(call cmd,crc32) 47 $(call cmd,crc32)
47 48
lib/kernel_lock.c
1 /* 1 /*
2 * lib/kernel_lock.c 2 * lib/kernel_lock.c
3 * 3 *
4 * This is the traditional BKL - big kernel lock. Largely 4 * This is the traditional BKL - big kernel lock. Largely
5 * relegated to obsolescense, but used by various less 5 * relegated to obsolescense, but used by various less
6 * important (or lazy) subsystems. 6 * important (or lazy) subsystems.
7 */ 7 */
8 #include <linux/smp_lock.h> 8 #include <linux/smp_lock.h>
9 #include <linux/module.h> 9 #include <linux/module.h>
10 #include <linux/kallsyms.h> 10 #include <linux/kallsyms.h>
11 11
12 #if defined(CONFIG_PREEMPT) && defined(__smp_processor_id) && \
13 defined(CONFIG_DEBUG_PREEMPT)
14
15 /*
16 * Debugging check.
17 */
18 unsigned int smp_processor_id(void)
19 {
20 unsigned long preempt_count = preempt_count();
21 int this_cpu = __smp_processor_id();
22 cpumask_t this_mask;
23
24 if (likely(preempt_count))
25 goto out;
26
27 if (irqs_disabled())
28 goto out;
29
30 /*
31 * Kernel threads bound to a single CPU can safely use
32 * smp_processor_id():
33 */
34 this_mask = cpumask_of_cpu(this_cpu);
35
36 if (cpus_equal(current->cpus_allowed, this_mask))
37 goto out;
38
39 /*
40 * It is valid to assume CPU-locality during early bootup:
41 */
42 if (system_state != SYSTEM_RUNNING)
43 goto out;
44
45 /*
46 * Avoid recursion:
47 */
48 preempt_disable();
49
50 if (!printk_ratelimit())
51 goto out_enable;
52
53 printk(KERN_ERR "BUG: using smp_processor_id() in preemptible [%08x] code: %s/%d\n", preempt_count(), current->comm, current->pid);
54 print_symbol("caller is %s\n", (long)__builtin_return_address(0));
55 dump_stack();
56
57 out_enable:
58 preempt_enable_no_resched();
59 out:
60 return this_cpu;
61 }
62
63 EXPORT_SYMBOL(smp_processor_id);
64
65 #endif /* PREEMPT && __smp_processor_id && DEBUG_PREEMPT */
66
67 #ifdef CONFIG_PREEMPT_BKL 12 #ifdef CONFIG_PREEMPT_BKL
68 /* 13 /*
69 * The 'big kernel semaphore' 14 * The 'big kernel semaphore'
70 * 15 *
71 * This mutex is taken and released recursively by lock_kernel() 16 * This mutex is taken and released recursively by lock_kernel()
72 * and unlock_kernel(). It is transparently dropped and reacquired 17 * and unlock_kernel(). It is transparently dropped and reacquired
73 * over schedule(). It is used to protect legacy code that hasn't 18 * over schedule(). It is used to protect legacy code that hasn't
74 * been migrated to a proper locking design yet. 19 * been migrated to a proper locking design yet.
75 * 20 *
76 * Note: code locked by this semaphore will only be serialized against 21 * Note: code locked by this semaphore will only be serialized against
77 * other code using the same locking facility. The code guarantees that 22 * other code using the same locking facility. The code guarantees that
78 * the task remains on the same CPU. 23 * the task remains on the same CPU.
79 * 24 *
80 * Don't use in new code. 25 * Don't use in new code.
81 */ 26 */
82 static DECLARE_MUTEX(kernel_sem); 27 static DECLARE_MUTEX(kernel_sem);
83 28
84 /* 29 /*
85 * Re-acquire the kernel semaphore. 30 * Re-acquire the kernel semaphore.
86 * 31 *
87 * This function is called with preemption off. 32 * This function is called with preemption off.
88 * 33 *
89 * We are executing in schedule() so the code must be extremely careful 34 * We are executing in schedule() so the code must be extremely careful
90 * about recursion, both due to the down() and due to the enabling of 35 * about recursion, both due to the down() and due to the enabling of
91 * preemption. schedule() will re-check the preemption flag after 36 * preemption. schedule() will re-check the preemption flag after
92 * reacquiring the semaphore. 37 * reacquiring the semaphore.
93 */ 38 */
94 int __lockfunc __reacquire_kernel_lock(void) 39 int __lockfunc __reacquire_kernel_lock(void)
95 { 40 {
96 struct task_struct *task = current; 41 struct task_struct *task = current;
97 int saved_lock_depth = task->lock_depth; 42 int saved_lock_depth = task->lock_depth;
98 43
99 BUG_ON(saved_lock_depth < 0); 44 BUG_ON(saved_lock_depth < 0);
100 45
101 task->lock_depth = -1; 46 task->lock_depth = -1;
102 preempt_enable_no_resched(); 47 preempt_enable_no_resched();
103 48
104 down(&kernel_sem); 49 down(&kernel_sem);
105 50
106 preempt_disable(); 51 preempt_disable();
107 task->lock_depth = saved_lock_depth; 52 task->lock_depth = saved_lock_depth;
108 53
109 return 0; 54 return 0;
110 } 55 }
111 56
112 void __lockfunc __release_kernel_lock(void) 57 void __lockfunc __release_kernel_lock(void)
113 { 58 {
114 up(&kernel_sem); 59 up(&kernel_sem);
115 } 60 }
116 61
117 /* 62 /*
118 * Getting the big kernel semaphore. 63 * Getting the big kernel semaphore.
119 */ 64 */
120 void __lockfunc lock_kernel(void) 65 void __lockfunc lock_kernel(void)
121 { 66 {
122 struct task_struct *task = current; 67 struct task_struct *task = current;
123 int depth = task->lock_depth + 1; 68 int depth = task->lock_depth + 1;
124 69
125 if (likely(!depth)) 70 if (likely(!depth))
126 /* 71 /*
127 * No recursion worries - we set up lock_depth _after_ 72 * No recursion worries - we set up lock_depth _after_
128 */ 73 */
129 down(&kernel_sem); 74 down(&kernel_sem);
130 75
131 task->lock_depth = depth; 76 task->lock_depth = depth;
132 } 77 }
133 78
134 void __lockfunc unlock_kernel(void) 79 void __lockfunc unlock_kernel(void)
135 { 80 {
136 struct task_struct *task = current; 81 struct task_struct *task = current;
137 82
138 BUG_ON(task->lock_depth < 0); 83 BUG_ON(task->lock_depth < 0);
139 84
140 if (likely(--task->lock_depth < 0)) 85 if (likely(--task->lock_depth < 0))
141 up(&kernel_sem); 86 up(&kernel_sem);
142 } 87 }
143 88
144 #else 89 #else
145 90
146 /* 91 /*
147 * The 'big kernel lock' 92 * The 'big kernel lock'
148 * 93 *
149 * This spinlock is taken and released recursively by lock_kernel() 94 * This spinlock is taken and released recursively by lock_kernel()
150 * and unlock_kernel(). It is transparently dropped and reacquired 95 * and unlock_kernel(). It is transparently dropped and reacquired
151 * over schedule(). It is used to protect legacy code that hasn't 96 * over schedule(). It is used to protect legacy code that hasn't
152 * been migrated to a proper locking design yet. 97 * been migrated to a proper locking design yet.
153 * 98 *
154 * Don't use in new code. 99 * Don't use in new code.
155 */ 100 */
156 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kernel_flag); 101 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kernel_flag);
157 102
158 103
159 /* 104 /*
160 * Acquire/release the underlying lock from the scheduler. 105 * Acquire/release the underlying lock from the scheduler.
161 * 106 *
162 * This is called with preemption disabled, and should 107 * This is called with preemption disabled, and should
163 * return an error value if it cannot get the lock and 108 * return an error value if it cannot get the lock and
164 * TIF_NEED_RESCHED gets set. 109 * TIF_NEED_RESCHED gets set.
165 * 110 *
166 * If it successfully gets the lock, it should increment 111 * If it successfully gets the lock, it should increment
167 * the preemption count like any spinlock does. 112 * the preemption count like any spinlock does.
168 * 113 *
169 * (This works on UP too - _raw_spin_trylock will never 114 * (This works on UP too - _raw_spin_trylock will never
170 * return false in that case) 115 * return false in that case)
171 */ 116 */
172 int __lockfunc __reacquire_kernel_lock(void) 117 int __lockfunc __reacquire_kernel_lock(void)
173 { 118 {
174 while (!_raw_spin_trylock(&kernel_flag)) { 119 while (!_raw_spin_trylock(&kernel_flag)) {
175 if (test_thread_flag(TIF_NEED_RESCHED)) 120 if (test_thread_flag(TIF_NEED_RESCHED))
176 return -EAGAIN; 121 return -EAGAIN;
177 cpu_relax(); 122 cpu_relax();
178 } 123 }
179 preempt_disable(); 124 preempt_disable();
180 return 0; 125 return 0;
181 } 126 }
182 127
183 void __lockfunc __release_kernel_lock(void) 128 void __lockfunc __release_kernel_lock(void)
184 { 129 {
185 _raw_spin_unlock(&kernel_flag); 130 _raw_spin_unlock(&kernel_flag);
186 preempt_enable_no_resched(); 131 preempt_enable_no_resched();
187 } 132 }
188 133
189 /* 134 /*
190 * These are the BKL spinlocks - we try to be polite about preemption. 135 * These are the BKL spinlocks - we try to be polite about preemption.
191 * If SMP is not on (ie UP preemption), this all goes away because the 136 * If SMP is not on (ie UP preemption), this all goes away because the
192 * _raw_spin_trylock() will always succeed. 137 * _raw_spin_trylock() will always succeed.
193 */ 138 */
194 #ifdef CONFIG_PREEMPT 139 #ifdef CONFIG_PREEMPT
195 static inline void __lock_kernel(void) 140 static inline void __lock_kernel(void)
196 { 141 {
197 preempt_disable(); 142 preempt_disable();
198 if (unlikely(!_raw_spin_trylock(&kernel_flag))) { 143 if (unlikely(!_raw_spin_trylock(&kernel_flag))) {
199 /* 144 /*
200 * If preemption was disabled even before this 145 * If preemption was disabled even before this
201 * was called, there's nothing we can be polite 146 * was called, there's nothing we can be polite
202 * about - just spin. 147 * about - just spin.
203 */ 148 */
204 if (preempt_count() > 1) { 149 if (preempt_count() > 1) {
205 _raw_spin_lock(&kernel_flag); 150 _raw_spin_lock(&kernel_flag);
206 return; 151 return;
207 } 152 }
208 153
209 /* 154 /*
210 * Otherwise, let's wait for the kernel lock 155 * Otherwise, let's wait for the kernel lock
211 * with preemption enabled.. 156 * with preemption enabled..
212 */ 157 */
213 do { 158 do {
214 preempt_enable(); 159 preempt_enable();
215 while (spin_is_locked(&kernel_flag)) 160 while (spin_is_locked(&kernel_flag))
216 cpu_relax(); 161 cpu_relax();
217 preempt_disable(); 162 preempt_disable();
218 } while (!_raw_spin_trylock(&kernel_flag)); 163 } while (!_raw_spin_trylock(&kernel_flag));
219 } 164 }
220 } 165 }
221 166
222 #else 167 #else
223 168
224 /* 169 /*
225 * Non-preemption case - just get the spinlock 170 * Non-preemption case - just get the spinlock
226 */ 171 */
227 static inline void __lock_kernel(void) 172 static inline void __lock_kernel(void)
228 { 173 {
229 _raw_spin_lock(&kernel_flag); 174 _raw_spin_lock(&kernel_flag);
230 } 175 }
231 #endif 176 #endif
232 177
233 static inline void __unlock_kernel(void) 178 static inline void __unlock_kernel(void)
234 { 179 {
235 _raw_spin_unlock(&kernel_flag); 180 _raw_spin_unlock(&kernel_flag);
236 preempt_enable(); 181 preempt_enable();
237 } 182 }
238 183
239 /* 184 /*
240 * Getting the big kernel lock. 185 * Getting the big kernel lock.
241 * 186 *
242 * This cannot happen asynchronously, so we only need to 187 * This cannot happen asynchronously, so we only need to
243 * worry about other CPUs. 188 * worry about other CPUs.
244 */ 189 */
245 void __lockfunc lock_kernel(void) 190 void __lockfunc lock_kernel(void)
246 { 191 {
247 int depth = current->lock_depth+1; 192 int depth = current->lock_depth+1;
248 if (likely(!depth)) 193 if (likely(!depth))
249 __lock_kernel(); 194 __lock_kernel();
250 current->lock_depth = depth; 195 current->lock_depth = depth;
251 } 196 }
252 197
253 void __lockfunc unlock_kernel(void) 198 void __lockfunc unlock_kernel(void)
254 { 199 {
255 BUG_ON(current->lock_depth < 0); 200 BUG_ON(current->lock_depth < 0);
256 if (likely(--current->lock_depth < 0)) 201 if (likely(--current->lock_depth < 0))
257 __unlock_kernel(); 202 __unlock_kernel();
258 } 203 }
259 204
260 #endif 205 #endif
261 206
262 EXPORT_SYMBOL(lock_kernel); 207 EXPORT_SYMBOL(lock_kernel);
263 EXPORT_SYMBOL(unlock_kernel); 208 EXPORT_SYMBOL(unlock_kernel);
264 209
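As a side note, a minimal sketch (hypothetical code, not from this patch) of the recursion semantics that both BKL variants implement via current->lock_depth: only the outermost lock_kernel()/unlock_kernel() pair really acquires and releases the lock.

#include <linux/smp_lock.h>

/* Hypothetical legacy path that still relies on the BKL. */
static void legacy_ioctl_path(void)
{
	lock_kernel();		/* lock_depth -1 -> 0: actually takes the BKL  */
	lock_kernel();		/* lock_depth  0 -> 1: nested, nothing acquired */
	unlock_kernel();	/* lock_depth  1 -> 0: still held               */
	unlock_kernel();	/* lock_depth  0 -> -1: actually released       */
}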
265 210
lib/smp_processor_id.c
File was created 1 /*
2 * lib/smp_processor_id.c
3 *
4 * DEBUG_PREEMPT variant of smp_processor_id().
5 */
6 #include <linux/module.h>
7 #include <linux/kallsyms.h>
8
9 unsigned int debug_smp_processor_id(void)
10 {
11 unsigned long preempt_count = preempt_count();
12 int this_cpu = raw_smp_processor_id();
13 cpumask_t this_mask;
14
15 if (likely(preempt_count))
16 goto out;
17
18 if (irqs_disabled())
19 goto out;
20
21 /*
22 * Kernel threads bound to a single CPU can safely use
23 * smp_processor_id():
24 */
25 this_mask = cpumask_of_cpu(this_cpu);
26
27 if (cpus_equal(current->cpus_allowed, this_mask))
28 goto out;
29
30 /*
31 * It is valid to assume CPU-locality during early bootup:
32 */
33 if (system_state != SYSTEM_RUNNING)
34 goto out;
35
36 /*
37 * Avoid recursion:
38 */
39 preempt_disable();
40
41 if (!printk_ratelimit())
42 goto out_enable;
43
44 printk(KERN_ERR "BUG: using smp_processor_id() in preemptible [%08x] code: %s/%d\n", preempt_count(), current->comm, current->pid);
45 print_symbol("caller is %s\n", (long)__builtin_return_address(0));
46 dump_stack();
47
48 out_enable:
49 preempt_enable_no_resched();
50 out:
51 return this_cpu;
52 }
53
54 EXPORT_SYMBOL(debug_smp_processor_id);
55
56
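To round off the new naming scheme, a hypothetical usage sketch (not part of the patch) contrasting the two externally visible symbols: smp_processor_id() is the checked variant and wants preemption or interrupts off, while raw_smp_processor_id() bypasses debug_smp_processor_id() for callers that can tolerate being migrated right after the read.

#include <linux/smp.h>
#include <linux/preempt.h>

/* Hypothetical: per-CPU work that must stay on one CPU while it runs. */
static void bump_local_counter(void)
{
	int cpu;

	preempt_disable();
	cpu = smp_processor_id();	/* debug variant; warns if preemptible */
	/* ... update per-CPU state for 'cpu' ... */
	preempt_enable();
}

/*
 * Hypothetical: only a hint, e.g. for a balancing heuristic.  A stale
 * answer is harmless, so the raw (nondebug) variant is the right choice.
 */
static int current_cpu_hint(void)
{
	return raw_smp_processor_id();
}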