Commit 39c715b71740c4a78ba4769fb54826929bac03cb

Authored by Ingo Molnar
Committed by Linus Torvalds
1 parent 84929801e1

[PATCH] smp_processor_id() cleanup

This patch implements a number of smp_processor_id() cleanup ideas that
Arjan van de Ven and I came up with.

The previous __smp_processor_id/_smp_processor_id/smp_processor_id API
spaghetti was hard to follow, both on the implementation side and on the
usage side.

Some of the complexity arose from poorly chosen names, and some from the
fact that not all architectures defined __smp_processor_id.

In the new code, there are two externally visible symbols:

 - smp_processor_id(): debug variant.

 - raw_smp_processor_id(): nondebug variant. Replaces all existing
   uses of _smp_processor_id() and __smp_processor_id(). Defined
   by every SMP architecture in include/asm-*/smp.h.

There is one new internal symbol, dependent on DEBUG_PREEMPT:

 - debug_smp_processor_id(): internal debug variant; smp_processor_id()
                             maps to it when DEBUG_PREEMPT is enabled.
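
For reference, the net effect on include/linux/smp.h is roughly the sketch
below (simplified, not the verbatim header; raw_smp_processor_id() itself is
supplied by each SMP architecture's include/asm-*/smp.h and collapses to 0
on UP):

  /* Sketch: how smp_processor_id() resolves after this patch. */
  #ifdef CONFIG_DEBUG_PREEMPT
    extern unsigned int debug_smp_processor_id(void);
  # define smp_processor_id() debug_smp_processor_id()
  #else
  # define smp_processor_id() raw_smp_processor_id()
  #endif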

Also, I moved debug_smp_processor_id() from lib/kernel_lock.c into a new
lib/smp_processor_id.c file.  All related comments got updated and/or
clarified.
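
To give an idea of what the debug variant does, here is a simplified sketch
of the checks in lib/smp_processor_id.c (paraphrased, not the verbatim file;
the real function also rate-limits its output and skips early boot):

  unsigned int debug_smp_processor_id(void)
  {
          int this_cpu = raw_smp_processor_id();

          /* Safe: preemption or interrupts are already off. */
          if (preempt_count() || irqs_disabled())
                  return this_cpu;

          /* Safe: the task is pinned to exactly this CPU. */
          if (cpus_equal(current->cpus_allowed, cpumask_of_cpu(this_cpu)))
                  return this_cpu;

          /* Otherwise the caller could migrate between reading and using the id. */
          preempt_disable();      /* avoid recursing via smp_processor_id() */
          printk(KERN_ERR "BUG: using smp_processor_id() in preemptible code: %s/%d\n",
                 current->comm, current->pid);
          dump_stack();
          preempt_enable_no_resched();

          return this_cpu;
  }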

I have build/boot tested the following 8 .config combinations on x86:

 {SMP,UP} x {PREEMPT,!PREEMPT} x {DEBUG_PREEMPT,!DEBUG_PREEMPT}

I have also build/boot tested x64 on UP/PREEMPT/DEBUG_PREEMPT.  (Other
architectures are untested, but should work just fine.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 37 changed files with 119 additions and 125 deletions.

arch/i386/kernel/traps.c
1 /* 1 /*
2 * linux/arch/i386/traps.c 2 * linux/arch/i386/traps.c
3 * 3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * 5 *
6 * Pentium III FXSR, SSE support 6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000 7 * Gareth Hughes <gareth@valinux.com>, May 2000
8 */ 8 */
9 9
10 /* 10 /*
11 * 'Traps.c' handles hardware traps and faults after we have saved some 11 * 'Traps.c' handles hardware traps and faults after we have saved some
12 * state in 'asm.s'. 12 * state in 'asm.s'.
13 */ 13 */
14 #include <linux/config.h> 14 #include <linux/config.h>
15 #include <linux/sched.h> 15 #include <linux/sched.h>
16 #include <linux/kernel.h> 16 #include <linux/kernel.h>
17 #include <linux/string.h> 17 #include <linux/string.h>
18 #include <linux/errno.h> 18 #include <linux/errno.h>
19 #include <linux/timer.h> 19 #include <linux/timer.h>
20 #include <linux/mm.h> 20 #include <linux/mm.h>
21 #include <linux/init.h> 21 #include <linux/init.h>
22 #include <linux/delay.h> 22 #include <linux/delay.h>
23 #include <linux/spinlock.h> 23 #include <linux/spinlock.h>
24 #include <linux/interrupt.h> 24 #include <linux/interrupt.h>
25 #include <linux/highmem.h> 25 #include <linux/highmem.h>
26 #include <linux/kallsyms.h> 26 #include <linux/kallsyms.h>
27 #include <linux/ptrace.h> 27 #include <linux/ptrace.h>
28 #include <linux/utsname.h> 28 #include <linux/utsname.h>
29 #include <linux/kprobes.h> 29 #include <linux/kprobes.h>
30 30
31 #ifdef CONFIG_EISA 31 #ifdef CONFIG_EISA
32 #include <linux/ioport.h> 32 #include <linux/ioport.h>
33 #include <linux/eisa.h> 33 #include <linux/eisa.h>
34 #endif 34 #endif
35 35
36 #ifdef CONFIG_MCA 36 #ifdef CONFIG_MCA
37 #include <linux/mca.h> 37 #include <linux/mca.h>
38 #endif 38 #endif
39 39
40 #include <asm/processor.h> 40 #include <asm/processor.h>
41 #include <asm/system.h> 41 #include <asm/system.h>
42 #include <asm/uaccess.h> 42 #include <asm/uaccess.h>
43 #include <asm/io.h> 43 #include <asm/io.h>
44 #include <asm/atomic.h> 44 #include <asm/atomic.h>
45 #include <asm/debugreg.h> 45 #include <asm/debugreg.h>
46 #include <asm/desc.h> 46 #include <asm/desc.h>
47 #include <asm/i387.h> 47 #include <asm/i387.h>
48 #include <asm/nmi.h> 48 #include <asm/nmi.h>
49 49
50 #include <asm/smp.h> 50 #include <asm/smp.h>
51 #include <asm/arch_hooks.h> 51 #include <asm/arch_hooks.h>
52 #include <asm/kdebug.h> 52 #include <asm/kdebug.h>
53 53
54 #include <linux/irq.h> 54 #include <linux/irq.h>
55 #include <linux/module.h> 55 #include <linux/module.h>
56 56
57 #include "mach_traps.h" 57 #include "mach_traps.h"
58 58
59 asmlinkage int system_call(void); 59 asmlinkage int system_call(void);
60 60
61 struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 }, 61 struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
62 { 0, 0 }, { 0, 0 } }; 62 { 0, 0 }, { 0, 0 } };
63 63
64 /* Do we ignore FPU interrupts ? */ 64 /* Do we ignore FPU interrupts ? */
65 char ignore_fpu_irq = 0; 65 char ignore_fpu_irq = 0;
66 66
67 /* 67 /*
68 * The IDT has to be page-aligned to simplify the Pentium 68 * The IDT has to be page-aligned to simplify the Pentium
69 * F0 0F bug workaround.. We have a special link segment 69 * F0 0F bug workaround.. We have a special link segment
70 * for this. 70 * for this.
71 */ 71 */
72 struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, }; 72 struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
73 73
74 asmlinkage void divide_error(void); 74 asmlinkage void divide_error(void);
75 asmlinkage void debug(void); 75 asmlinkage void debug(void);
76 asmlinkage void nmi(void); 76 asmlinkage void nmi(void);
77 asmlinkage void int3(void); 77 asmlinkage void int3(void);
78 asmlinkage void overflow(void); 78 asmlinkage void overflow(void);
79 asmlinkage void bounds(void); 79 asmlinkage void bounds(void);
80 asmlinkage void invalid_op(void); 80 asmlinkage void invalid_op(void);
81 asmlinkage void device_not_available(void); 81 asmlinkage void device_not_available(void);
82 asmlinkage void coprocessor_segment_overrun(void); 82 asmlinkage void coprocessor_segment_overrun(void);
83 asmlinkage void invalid_TSS(void); 83 asmlinkage void invalid_TSS(void);
84 asmlinkage void segment_not_present(void); 84 asmlinkage void segment_not_present(void);
85 asmlinkage void stack_segment(void); 85 asmlinkage void stack_segment(void);
86 asmlinkage void general_protection(void); 86 asmlinkage void general_protection(void);
87 asmlinkage void page_fault(void); 87 asmlinkage void page_fault(void);
88 asmlinkage void coprocessor_error(void); 88 asmlinkage void coprocessor_error(void);
89 asmlinkage void simd_coprocessor_error(void); 89 asmlinkage void simd_coprocessor_error(void);
90 asmlinkage void alignment_check(void); 90 asmlinkage void alignment_check(void);
91 asmlinkage void spurious_interrupt_bug(void); 91 asmlinkage void spurious_interrupt_bug(void);
92 asmlinkage void machine_check(void); 92 asmlinkage void machine_check(void);
93 93
94 static int kstack_depth_to_print = 24; 94 static int kstack_depth_to_print = 24;
95 struct notifier_block *i386die_chain; 95 struct notifier_block *i386die_chain;
96 static DEFINE_SPINLOCK(die_notifier_lock); 96 static DEFINE_SPINLOCK(die_notifier_lock);
97 97
98 int register_die_notifier(struct notifier_block *nb) 98 int register_die_notifier(struct notifier_block *nb)
99 { 99 {
100 int err = 0; 100 int err = 0;
101 unsigned long flags; 101 unsigned long flags;
102 spin_lock_irqsave(&die_notifier_lock, flags); 102 spin_lock_irqsave(&die_notifier_lock, flags);
103 err = notifier_chain_register(&i386die_chain, nb); 103 err = notifier_chain_register(&i386die_chain, nb);
104 spin_unlock_irqrestore(&die_notifier_lock, flags); 104 spin_unlock_irqrestore(&die_notifier_lock, flags);
105 return err; 105 return err;
106 } 106 }
107 107
108 static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) 108 static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
109 { 109 {
110 return p > (void *)tinfo && 110 return p > (void *)tinfo &&
111 p < (void *)tinfo + THREAD_SIZE - 3; 111 p < (void *)tinfo + THREAD_SIZE - 3;
112 } 112 }
113 113
114 static inline unsigned long print_context_stack(struct thread_info *tinfo, 114 static inline unsigned long print_context_stack(struct thread_info *tinfo,
115 unsigned long *stack, unsigned long ebp) 115 unsigned long *stack, unsigned long ebp)
116 { 116 {
117 unsigned long addr; 117 unsigned long addr;
118 118
119 #ifdef CONFIG_FRAME_POINTER 119 #ifdef CONFIG_FRAME_POINTER
120 while (valid_stack_ptr(tinfo, (void *)ebp)) { 120 while (valid_stack_ptr(tinfo, (void *)ebp)) {
121 addr = *(unsigned long *)(ebp + 4); 121 addr = *(unsigned long *)(ebp + 4);
122 printk(" [<%08lx>] ", addr); 122 printk(" [<%08lx>] ", addr);
123 print_symbol("%s", addr); 123 print_symbol("%s", addr);
124 printk("\n"); 124 printk("\n");
125 ebp = *(unsigned long *)ebp; 125 ebp = *(unsigned long *)ebp;
126 } 126 }
127 #else 127 #else
128 while (valid_stack_ptr(tinfo, stack)) { 128 while (valid_stack_ptr(tinfo, stack)) {
129 addr = *stack++; 129 addr = *stack++;
130 if (__kernel_text_address(addr)) { 130 if (__kernel_text_address(addr)) {
131 printk(" [<%08lx>]", addr); 131 printk(" [<%08lx>]", addr);
132 print_symbol(" %s", addr); 132 print_symbol(" %s", addr);
133 printk("\n"); 133 printk("\n");
134 } 134 }
135 } 135 }
136 #endif 136 #endif
137 return ebp; 137 return ebp;
138 } 138 }
139 139
140 void show_trace(struct task_struct *task, unsigned long * stack) 140 void show_trace(struct task_struct *task, unsigned long * stack)
141 { 141 {
142 unsigned long ebp; 142 unsigned long ebp;
143 143
144 if (!task) 144 if (!task)
145 task = current; 145 task = current;
146 146
147 if (task == current) { 147 if (task == current) {
148 /* Grab ebp right from our regs */ 148 /* Grab ebp right from our regs */
149 asm ("movl %%ebp, %0" : "=r" (ebp) : ); 149 asm ("movl %%ebp, %0" : "=r" (ebp) : );
150 } else { 150 } else {
151 /* ebp is the last reg pushed by switch_to */ 151 /* ebp is the last reg pushed by switch_to */
152 ebp = *(unsigned long *) task->thread.esp; 152 ebp = *(unsigned long *) task->thread.esp;
153 } 153 }
154 154
155 while (1) { 155 while (1) {
156 struct thread_info *context; 156 struct thread_info *context;
157 context = (struct thread_info *) 157 context = (struct thread_info *)
158 ((unsigned long)stack & (~(THREAD_SIZE - 1))); 158 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
159 ebp = print_context_stack(context, stack, ebp); 159 ebp = print_context_stack(context, stack, ebp);
160 stack = (unsigned long*)context->previous_esp; 160 stack = (unsigned long*)context->previous_esp;
161 if (!stack) 161 if (!stack)
162 break; 162 break;
163 printk(" =======================\n"); 163 printk(" =======================\n");
164 } 164 }
165 } 165 }
166 166
167 void show_stack(struct task_struct *task, unsigned long *esp) 167 void show_stack(struct task_struct *task, unsigned long *esp)
168 { 168 {
169 unsigned long *stack; 169 unsigned long *stack;
170 int i; 170 int i;
171 171
172 if (esp == NULL) { 172 if (esp == NULL) {
173 if (task) 173 if (task)
174 esp = (unsigned long*)task->thread.esp; 174 esp = (unsigned long*)task->thread.esp;
175 else 175 else
176 esp = (unsigned long *)&esp; 176 esp = (unsigned long *)&esp;
177 } 177 }
178 178
179 stack = esp; 179 stack = esp;
180 for(i = 0; i < kstack_depth_to_print; i++) { 180 for(i = 0; i < kstack_depth_to_print; i++) {
181 if (kstack_end(stack)) 181 if (kstack_end(stack))
182 break; 182 break;
183 if (i && ((i % 8) == 0)) 183 if (i && ((i % 8) == 0))
184 printk("\n "); 184 printk("\n ");
185 printk("%08lx ", *stack++); 185 printk("%08lx ", *stack++);
186 } 186 }
187 printk("\nCall Trace:\n"); 187 printk("\nCall Trace:\n");
188 show_trace(task, esp); 188 show_trace(task, esp);
189 } 189 }
190 190
191 /* 191 /*
192 * The architecture-independent dump_stack generator 192 * The architecture-independent dump_stack generator
193 */ 193 */
194 void dump_stack(void) 194 void dump_stack(void)
195 { 195 {
196 unsigned long stack; 196 unsigned long stack;
197 197
198 show_trace(current, &stack); 198 show_trace(current, &stack);
199 } 199 }
200 200
201 EXPORT_SYMBOL(dump_stack); 201 EXPORT_SYMBOL(dump_stack);
202 202
203 void show_registers(struct pt_regs *regs) 203 void show_registers(struct pt_regs *regs)
204 { 204 {
205 int i; 205 int i;
206 int in_kernel = 1; 206 int in_kernel = 1;
207 unsigned long esp; 207 unsigned long esp;
208 unsigned short ss; 208 unsigned short ss;
209 209
210 esp = (unsigned long) (&regs->esp); 210 esp = (unsigned long) (&regs->esp);
211 ss = __KERNEL_DS; 211 ss = __KERNEL_DS;
212 if (regs->xcs & 3) { 212 if (regs->xcs & 3) {
213 in_kernel = 0; 213 in_kernel = 0;
214 esp = regs->esp; 214 esp = regs->esp;
215 ss = regs->xss & 0xffff; 215 ss = regs->xss & 0xffff;
216 } 216 }
217 print_modules(); 217 print_modules();
218 printk("CPU: %d\nEIP: %04x:[<%08lx>] %s VLI\nEFLAGS: %08lx" 218 printk("CPU: %d\nEIP: %04x:[<%08lx>] %s VLI\nEFLAGS: %08lx"
219 " (%s) \n", 219 " (%s) \n",
220 smp_processor_id(), 0xffff & regs->xcs, regs->eip, 220 smp_processor_id(), 0xffff & regs->xcs, regs->eip,
221 print_tainted(), regs->eflags, system_utsname.release); 221 print_tainted(), regs->eflags, system_utsname.release);
222 print_symbol("EIP is at %s\n", regs->eip); 222 print_symbol("EIP is at %s\n", regs->eip);
223 printk("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", 223 printk("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
224 regs->eax, regs->ebx, regs->ecx, regs->edx); 224 regs->eax, regs->ebx, regs->ecx, regs->edx);
225 printk("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", 225 printk("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
226 regs->esi, regs->edi, regs->ebp, esp); 226 regs->esi, regs->edi, regs->ebp, esp);
227 printk("ds: %04x es: %04x ss: %04x\n", 227 printk("ds: %04x es: %04x ss: %04x\n",
228 regs->xds & 0xffff, regs->xes & 0xffff, ss); 228 regs->xds & 0xffff, regs->xes & 0xffff, ss);
229 printk("Process %s (pid: %d, threadinfo=%p task=%p)", 229 printk("Process %s (pid: %d, threadinfo=%p task=%p)",
230 current->comm, current->pid, current_thread_info(), current); 230 current->comm, current->pid, current_thread_info(), current);
231 /* 231 /*
232 * When in-kernel, we also print out the stack and code at the 232 * When in-kernel, we also print out the stack and code at the
233 * time of the fault.. 233 * time of the fault..
234 */ 234 */
235 if (in_kernel) { 235 if (in_kernel) {
236 u8 *eip; 236 u8 *eip;
237 237
238 printk("\nStack: "); 238 printk("\nStack: ");
239 show_stack(NULL, (unsigned long*)esp); 239 show_stack(NULL, (unsigned long*)esp);
240 240
241 printk("Code: "); 241 printk("Code: ");
242 242
243 eip = (u8 *)regs->eip - 43; 243 eip = (u8 *)regs->eip - 43;
244 for (i = 0; i < 64; i++, eip++) { 244 for (i = 0; i < 64; i++, eip++) {
245 unsigned char c; 245 unsigned char c;
246 246
247 if (eip < (u8 *)PAGE_OFFSET || __get_user(c, eip)) { 247 if (eip < (u8 *)PAGE_OFFSET || __get_user(c, eip)) {
248 printk(" Bad EIP value."); 248 printk(" Bad EIP value.");
249 break; 249 break;
250 } 250 }
251 if (eip == (u8 *)regs->eip) 251 if (eip == (u8 *)regs->eip)
252 printk("<%02x> ", c); 252 printk("<%02x> ", c);
253 else 253 else
254 printk("%02x ", c); 254 printk("%02x ", c);
255 } 255 }
256 } 256 }
257 printk("\n"); 257 printk("\n");
258 } 258 }
259 259
260 static void handle_BUG(struct pt_regs *regs) 260 static void handle_BUG(struct pt_regs *regs)
261 { 261 {
262 unsigned short ud2; 262 unsigned short ud2;
263 unsigned short line; 263 unsigned short line;
264 char *file; 264 char *file;
265 char c; 265 char c;
266 unsigned long eip; 266 unsigned long eip;
267 267
268 if (regs->xcs & 3) 268 if (regs->xcs & 3)
269 goto no_bug; /* Not in kernel */ 269 goto no_bug; /* Not in kernel */
270 270
271 eip = regs->eip; 271 eip = regs->eip;
272 272
273 if (eip < PAGE_OFFSET) 273 if (eip < PAGE_OFFSET)
274 goto no_bug; 274 goto no_bug;
275 if (__get_user(ud2, (unsigned short *)eip)) 275 if (__get_user(ud2, (unsigned short *)eip))
276 goto no_bug; 276 goto no_bug;
277 if (ud2 != 0x0b0f) 277 if (ud2 != 0x0b0f)
278 goto no_bug; 278 goto no_bug;
279 if (__get_user(line, (unsigned short *)(eip + 2))) 279 if (__get_user(line, (unsigned short *)(eip + 2)))
280 goto bug; 280 goto bug;
281 if (__get_user(file, (char **)(eip + 4)) || 281 if (__get_user(file, (char **)(eip + 4)) ||
282 (unsigned long)file < PAGE_OFFSET || __get_user(c, file)) 282 (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
283 file = "<bad filename>"; 283 file = "<bad filename>";
284 284
285 printk("------------[ cut here ]------------\n"); 285 printk("------------[ cut here ]------------\n");
286 printk(KERN_ALERT "kernel BUG at %s:%d!\n", file, line); 286 printk(KERN_ALERT "kernel BUG at %s:%d!\n", file, line);
287 287
288 no_bug: 288 no_bug:
289 return; 289 return;
290 290
291 /* Here we know it was a BUG but file-n-line is unavailable */ 291 /* Here we know it was a BUG but file-n-line is unavailable */
292 bug: 292 bug:
293 printk("Kernel BUG\n"); 293 printk("Kernel BUG\n");
294 } 294 }
295 295
296 void die(const char * str, struct pt_regs * regs, long err) 296 void die(const char * str, struct pt_regs * regs, long err)
297 { 297 {
298 static struct { 298 static struct {
299 spinlock_t lock; 299 spinlock_t lock;
300 u32 lock_owner; 300 u32 lock_owner;
301 int lock_owner_depth; 301 int lock_owner_depth;
302 } die = { 302 } die = {
303 .lock = SPIN_LOCK_UNLOCKED, 303 .lock = SPIN_LOCK_UNLOCKED,
304 .lock_owner = -1, 304 .lock_owner = -1,
305 .lock_owner_depth = 0 305 .lock_owner_depth = 0
306 }; 306 };
307 static int die_counter; 307 static int die_counter;
308 308
309 if (die.lock_owner != _smp_processor_id()) { 309 if (die.lock_owner != raw_smp_processor_id()) {
310 console_verbose(); 310 console_verbose();
311 spin_lock_irq(&die.lock); 311 spin_lock_irq(&die.lock);
312 die.lock_owner = smp_processor_id(); 312 die.lock_owner = smp_processor_id();
313 die.lock_owner_depth = 0; 313 die.lock_owner_depth = 0;
314 bust_spinlocks(1); 314 bust_spinlocks(1);
315 } 315 }
316 316
317 if (++die.lock_owner_depth < 3) { 317 if (++die.lock_owner_depth < 3) {
318 int nl = 0; 318 int nl = 0;
319 handle_BUG(regs); 319 handle_BUG(regs);
320 printk(KERN_ALERT "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter); 320 printk(KERN_ALERT "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
321 #ifdef CONFIG_PREEMPT 321 #ifdef CONFIG_PREEMPT
322 printk("PREEMPT "); 322 printk("PREEMPT ");
323 nl = 1; 323 nl = 1;
324 #endif 324 #endif
325 #ifdef CONFIG_SMP 325 #ifdef CONFIG_SMP
326 printk("SMP "); 326 printk("SMP ");
327 nl = 1; 327 nl = 1;
328 #endif 328 #endif
329 #ifdef CONFIG_DEBUG_PAGEALLOC 329 #ifdef CONFIG_DEBUG_PAGEALLOC
330 printk("DEBUG_PAGEALLOC"); 330 printk("DEBUG_PAGEALLOC");
331 nl = 1; 331 nl = 1;
332 #endif 332 #endif
333 if (nl) 333 if (nl)
334 printk("\n"); 334 printk("\n");
335 notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV); 335 notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV);
336 show_registers(regs); 336 show_registers(regs);
337 } else 337 } else
338 printk(KERN_ERR "Recursive die() failure, output suppressed\n"); 338 printk(KERN_ERR "Recursive die() failure, output suppressed\n");
339 339
340 bust_spinlocks(0); 340 bust_spinlocks(0);
341 die.lock_owner = -1; 341 die.lock_owner = -1;
342 spin_unlock_irq(&die.lock); 342 spin_unlock_irq(&die.lock);
343 if (in_interrupt()) 343 if (in_interrupt())
344 panic("Fatal exception in interrupt"); 344 panic("Fatal exception in interrupt");
345 345
346 if (panic_on_oops) { 346 if (panic_on_oops) {
347 printk(KERN_EMERG "Fatal exception: panic in 5 seconds\n"); 347 printk(KERN_EMERG "Fatal exception: panic in 5 seconds\n");
348 ssleep(5); 348 ssleep(5);
349 panic("Fatal exception"); 349 panic("Fatal exception");
350 } 350 }
351 do_exit(SIGSEGV); 351 do_exit(SIGSEGV);
352 } 352 }
353 353
354 static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err) 354 static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
355 { 355 {
356 if (!(regs->eflags & VM_MASK) && !(3 & regs->xcs)) 356 if (!(regs->eflags & VM_MASK) && !(3 & regs->xcs))
357 die(str, regs, err); 357 die(str, regs, err);
358 } 358 }
359 359
360 static void do_trap(int trapnr, int signr, char *str, int vm86, 360 static void do_trap(int trapnr, int signr, char *str, int vm86,
361 struct pt_regs * regs, long error_code, siginfo_t *info) 361 struct pt_regs * regs, long error_code, siginfo_t *info)
362 { 362 {
363 if (regs->eflags & VM_MASK) { 363 if (regs->eflags & VM_MASK) {
364 if (vm86) 364 if (vm86)
365 goto vm86_trap; 365 goto vm86_trap;
366 goto trap_signal; 366 goto trap_signal;
367 } 367 }
368 368
369 if (!(regs->xcs & 3)) 369 if (!(regs->xcs & 3))
370 goto kernel_trap; 370 goto kernel_trap;
371 371
372 trap_signal: { 372 trap_signal: {
373 struct task_struct *tsk = current; 373 struct task_struct *tsk = current;
374 tsk->thread.error_code = error_code; 374 tsk->thread.error_code = error_code;
375 tsk->thread.trap_no = trapnr; 375 tsk->thread.trap_no = trapnr;
376 if (info) 376 if (info)
377 force_sig_info(signr, info, tsk); 377 force_sig_info(signr, info, tsk);
378 else 378 else
379 force_sig(signr, tsk); 379 force_sig(signr, tsk);
380 return; 380 return;
381 } 381 }
382 382
383 kernel_trap: { 383 kernel_trap: {
384 if (!fixup_exception(regs)) 384 if (!fixup_exception(regs))
385 die(str, regs, error_code); 385 die(str, regs, error_code);
386 return; 386 return;
387 } 387 }
388 388
389 vm86_trap: { 389 vm86_trap: {
390 int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr); 390 int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr);
391 if (ret) goto trap_signal; 391 if (ret) goto trap_signal;
392 return; 392 return;
393 } 393 }
394 } 394 }
395 395
396 #define DO_ERROR(trapnr, signr, str, name) \ 396 #define DO_ERROR(trapnr, signr, str, name) \
397 fastcall void do_##name(struct pt_regs * regs, long error_code) \ 397 fastcall void do_##name(struct pt_regs * regs, long error_code) \
398 { \ 398 { \
399 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 399 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
400 == NOTIFY_STOP) \ 400 == NOTIFY_STOP) \
401 return; \ 401 return; \
402 do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ 402 do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
403 } 403 }
404 404
405 #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ 405 #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
406 fastcall void do_##name(struct pt_regs * regs, long error_code) \ 406 fastcall void do_##name(struct pt_regs * regs, long error_code) \
407 { \ 407 { \
408 siginfo_t info; \ 408 siginfo_t info; \
409 info.si_signo = signr; \ 409 info.si_signo = signr; \
410 info.si_errno = 0; \ 410 info.si_errno = 0; \
411 info.si_code = sicode; \ 411 info.si_code = sicode; \
412 info.si_addr = (void __user *)siaddr; \ 412 info.si_addr = (void __user *)siaddr; \
413 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 413 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
414 == NOTIFY_STOP) \ 414 == NOTIFY_STOP) \
415 return; \ 415 return; \
416 do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ 416 do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
417 } 417 }
418 418
419 #define DO_VM86_ERROR(trapnr, signr, str, name) \ 419 #define DO_VM86_ERROR(trapnr, signr, str, name) \
420 fastcall void do_##name(struct pt_regs * regs, long error_code) \ 420 fastcall void do_##name(struct pt_regs * regs, long error_code) \
421 { \ 421 { \
422 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 422 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
423 == NOTIFY_STOP) \ 423 == NOTIFY_STOP) \
424 return; \ 424 return; \
425 do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ 425 do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
426 } 426 }
427 427
428 #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ 428 #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
429 fastcall void do_##name(struct pt_regs * regs, long error_code) \ 429 fastcall void do_##name(struct pt_regs * regs, long error_code) \
430 { \ 430 { \
431 siginfo_t info; \ 431 siginfo_t info; \
432 info.si_signo = signr; \ 432 info.si_signo = signr; \
433 info.si_errno = 0; \ 433 info.si_errno = 0; \
434 info.si_code = sicode; \ 434 info.si_code = sicode; \
435 info.si_addr = (void __user *)siaddr; \ 435 info.si_addr = (void __user *)siaddr; \
436 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 436 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
437 == NOTIFY_STOP) \ 437 == NOTIFY_STOP) \
438 return; \ 438 return; \
439 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ 439 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
440 } 440 }
441 441
442 DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip) 442 DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip)
443 #ifndef CONFIG_KPROBES 443 #ifndef CONFIG_KPROBES
444 DO_VM86_ERROR( 3, SIGTRAP, "int3", int3) 444 DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
445 #endif 445 #endif
446 DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow) 446 DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
447 DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds) 447 DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
448 DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->eip) 448 DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->eip)
449 DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) 449 DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
450 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) 450 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
451 DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) 451 DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
452 DO_ERROR(12, SIGBUS, "stack segment", stack_segment) 452 DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
453 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) 453 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
454 DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0) 454 DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0)
455 455
456 fastcall void do_general_protection(struct pt_regs * regs, long error_code) 456 fastcall void do_general_protection(struct pt_regs * regs, long error_code)
457 { 457 {
458 int cpu = get_cpu(); 458 int cpu = get_cpu();
459 struct tss_struct *tss = &per_cpu(init_tss, cpu); 459 struct tss_struct *tss = &per_cpu(init_tss, cpu);
460 struct thread_struct *thread = &current->thread; 460 struct thread_struct *thread = &current->thread;
461 461
462 /* 462 /*
463 * Perform the lazy TSS's I/O bitmap copy. If the TSS has an 463 * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
464 * invalid offset set (the LAZY one) and the faulting thread has 464 * invalid offset set (the LAZY one) and the faulting thread has
465 * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS 465 * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS
466 * and we set the offset field correctly. Then we let the CPU to 466 * and we set the offset field correctly. Then we let the CPU to
467 * restart the faulting instruction. 467 * restart the faulting instruction.
468 */ 468 */
469 if (tss->io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && 469 if (tss->io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
470 thread->io_bitmap_ptr) { 470 thread->io_bitmap_ptr) {
471 memcpy(tss->io_bitmap, thread->io_bitmap_ptr, 471 memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
472 thread->io_bitmap_max); 472 thread->io_bitmap_max);
473 /* 473 /*
474 * If the previously set map was extending to higher ports 474 * If the previously set map was extending to higher ports
475 * than the current one, pad extra space with 0xff (no access). 475 * than the current one, pad extra space with 0xff (no access).
476 */ 476 */
477 if (thread->io_bitmap_max < tss->io_bitmap_max) 477 if (thread->io_bitmap_max < tss->io_bitmap_max)
478 memset((char *) tss->io_bitmap + 478 memset((char *) tss->io_bitmap +
479 thread->io_bitmap_max, 0xff, 479 thread->io_bitmap_max, 0xff,
480 tss->io_bitmap_max - thread->io_bitmap_max); 480 tss->io_bitmap_max - thread->io_bitmap_max);
481 tss->io_bitmap_max = thread->io_bitmap_max; 481 tss->io_bitmap_max = thread->io_bitmap_max;
482 tss->io_bitmap_base = IO_BITMAP_OFFSET; 482 tss->io_bitmap_base = IO_BITMAP_OFFSET;
483 put_cpu(); 483 put_cpu();
484 return; 484 return;
485 } 485 }
486 put_cpu(); 486 put_cpu();
487 487
488 if (regs->eflags & VM_MASK) 488 if (regs->eflags & VM_MASK)
489 goto gp_in_vm86; 489 goto gp_in_vm86;
490 490
491 if (!(regs->xcs & 3)) 491 if (!(regs->xcs & 3))
492 goto gp_in_kernel; 492 goto gp_in_kernel;
493 493
494 current->thread.error_code = error_code; 494 current->thread.error_code = error_code;
495 current->thread.trap_no = 13; 495 current->thread.trap_no = 13;
496 force_sig(SIGSEGV, current); 496 force_sig(SIGSEGV, current);
497 return; 497 return;
498 498
499 gp_in_vm86: 499 gp_in_vm86:
500 local_irq_enable(); 500 local_irq_enable();
501 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); 501 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
502 return; 502 return;
503 503
504 gp_in_kernel: 504 gp_in_kernel:
505 if (!fixup_exception(regs)) { 505 if (!fixup_exception(regs)) {
506 if (notify_die(DIE_GPF, "general protection fault", regs, 506 if (notify_die(DIE_GPF, "general protection fault", regs,
507 error_code, 13, SIGSEGV) == NOTIFY_STOP) 507 error_code, 13, SIGSEGV) == NOTIFY_STOP)
508 return; 508 return;
509 die("general protection fault", regs, error_code); 509 die("general protection fault", regs, error_code);
510 } 510 }
511 } 511 }
512 512
513 static void mem_parity_error(unsigned char reason, struct pt_regs * regs) 513 static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
514 { 514 {
515 printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n"); 515 printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
516 printk("You probably have a hardware problem with your RAM chips\n"); 516 printk("You probably have a hardware problem with your RAM chips\n");
517 517
518 /* Clear and disable the memory parity error line. */ 518 /* Clear and disable the memory parity error line. */
519 clear_mem_error(reason); 519 clear_mem_error(reason);
520 } 520 }
521 521
522 static void io_check_error(unsigned char reason, struct pt_regs * regs) 522 static void io_check_error(unsigned char reason, struct pt_regs * regs)
523 { 523 {
524 unsigned long i; 524 unsigned long i;
525 525
526 printk("NMI: IOCK error (debug interrupt?)\n"); 526 printk("NMI: IOCK error (debug interrupt?)\n");
527 show_registers(regs); 527 show_registers(regs);
528 528
529 /* Re-enable the IOCK line, wait for a few seconds */ 529 /* Re-enable the IOCK line, wait for a few seconds */
530 reason = (reason & 0xf) | 8; 530 reason = (reason & 0xf) | 8;
531 outb(reason, 0x61); 531 outb(reason, 0x61);
532 i = 2000; 532 i = 2000;
533 while (--i) udelay(1000); 533 while (--i) udelay(1000);
534 reason &= ~8; 534 reason &= ~8;
535 outb(reason, 0x61); 535 outb(reason, 0x61);
536 } 536 }
537 537
538 static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) 538 static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
539 { 539 {
540 #ifdef CONFIG_MCA 540 #ifdef CONFIG_MCA
541 /* Might actually be able to figure out what the guilty party 541 /* Might actually be able to figure out what the guilty party
542 * is. */ 542 * is. */
543 if( MCA_bus ) { 543 if( MCA_bus ) {
544 mca_handle_nmi(); 544 mca_handle_nmi();
545 return; 545 return;
546 } 546 }
547 #endif 547 #endif
548 printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", 548 printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
549 reason, smp_processor_id()); 549 reason, smp_processor_id());
550 printk("Dazed and confused, but trying to continue\n"); 550 printk("Dazed and confused, but trying to continue\n");
551 printk("Do you have a strange power saving mode enabled?\n"); 551 printk("Do you have a strange power saving mode enabled?\n");
552 } 552 }
553 553
554 static DEFINE_SPINLOCK(nmi_print_lock); 554 static DEFINE_SPINLOCK(nmi_print_lock);
555 555
556 void die_nmi (struct pt_regs *regs, const char *msg) 556 void die_nmi (struct pt_regs *regs, const char *msg)
557 { 557 {
558 spin_lock(&nmi_print_lock); 558 spin_lock(&nmi_print_lock);
559 /* 559 /*
560 * We are in trouble anyway, lets at least try 560 * We are in trouble anyway, lets at least try
561 * to get a message out. 561 * to get a message out.
562 */ 562 */
563 bust_spinlocks(1); 563 bust_spinlocks(1);
564 printk(msg); 564 printk(msg);
565 printk(" on CPU%d, eip %08lx, registers:\n", 565 printk(" on CPU%d, eip %08lx, registers:\n",
566 smp_processor_id(), regs->eip); 566 smp_processor_id(), regs->eip);
567 show_registers(regs); 567 show_registers(regs);
568 printk("console shuts up ...\n"); 568 printk("console shuts up ...\n");
569 console_silent(); 569 console_silent();
570 spin_unlock(&nmi_print_lock); 570 spin_unlock(&nmi_print_lock);
571 bust_spinlocks(0); 571 bust_spinlocks(0);
572 do_exit(SIGSEGV); 572 do_exit(SIGSEGV);
573 } 573 }
574 574
575 static void default_do_nmi(struct pt_regs * regs) 575 static void default_do_nmi(struct pt_regs * regs)
576 { 576 {
577 unsigned char reason = 0; 577 unsigned char reason = 0;
578 578
579 /* Only the BSP gets external NMIs from the system. */ 579 /* Only the BSP gets external NMIs from the system. */
580 if (!smp_processor_id()) 580 if (!smp_processor_id())
581 reason = get_nmi_reason(); 581 reason = get_nmi_reason();
582 582
583 if (!(reason & 0xc0)) { 583 if (!(reason & 0xc0)) {
584 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT) 584 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT)
585 == NOTIFY_STOP) 585 == NOTIFY_STOP)
586 return; 586 return;
587 #ifdef CONFIG_X86_LOCAL_APIC 587 #ifdef CONFIG_X86_LOCAL_APIC
588 /* 588 /*
589 * Ok, so this is none of the documented NMI sources, 589 * Ok, so this is none of the documented NMI sources,
590 * so it must be the NMI watchdog. 590 * so it must be the NMI watchdog.
591 */ 591 */
592 if (nmi_watchdog) { 592 if (nmi_watchdog) {
593 nmi_watchdog_tick(regs); 593 nmi_watchdog_tick(regs);
594 return; 594 return;
595 } 595 }
596 #endif 596 #endif
597 unknown_nmi_error(reason, regs); 597 unknown_nmi_error(reason, regs);
598 return; 598 return;
599 } 599 }
600 if (notify_die(DIE_NMI, "nmi", regs, reason, 0, SIGINT) == NOTIFY_STOP) 600 if (notify_die(DIE_NMI, "nmi", regs, reason, 0, SIGINT) == NOTIFY_STOP)
601 return; 601 return;
602 if (reason & 0x80) 602 if (reason & 0x80)
603 mem_parity_error(reason, regs); 603 mem_parity_error(reason, regs);
604 if (reason & 0x40) 604 if (reason & 0x40)
605 io_check_error(reason, regs); 605 io_check_error(reason, regs);
606 /* 606 /*
607 * Reassert NMI in case it became active meanwhile 607 * Reassert NMI in case it became active meanwhile
608 * as it's edge-triggered. 608 * as it's edge-triggered.
609 */ 609 */
610 reassert_nmi(); 610 reassert_nmi();
611 } 611 }
612 612
613 static int dummy_nmi_callback(struct pt_regs * regs, int cpu) 613 static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
614 { 614 {
615 return 0; 615 return 0;
616 } 616 }
617 617
618 static nmi_callback_t nmi_callback = dummy_nmi_callback; 618 static nmi_callback_t nmi_callback = dummy_nmi_callback;
619 619
620 fastcall void do_nmi(struct pt_regs * regs, long error_code) 620 fastcall void do_nmi(struct pt_regs * regs, long error_code)
621 { 621 {
622 int cpu; 622 int cpu;
623 623
624 nmi_enter(); 624 nmi_enter();
625 625
626 cpu = smp_processor_id(); 626 cpu = smp_processor_id();
627 ++nmi_count(cpu); 627 ++nmi_count(cpu);
628 628
629 if (!nmi_callback(regs, cpu)) 629 if (!nmi_callback(regs, cpu))
630 default_do_nmi(regs); 630 default_do_nmi(regs);
631 631
632 nmi_exit(); 632 nmi_exit();
633 } 633 }
634 634
635 void set_nmi_callback(nmi_callback_t callback) 635 void set_nmi_callback(nmi_callback_t callback)
636 { 636 {
637 nmi_callback = callback; 637 nmi_callback = callback;
638 } 638 }
639 639
640 void unset_nmi_callback(void) 640 void unset_nmi_callback(void)
641 { 641 {
642 nmi_callback = dummy_nmi_callback; 642 nmi_callback = dummy_nmi_callback;
643 } 643 }
644 644
645 #ifdef CONFIG_KPROBES 645 #ifdef CONFIG_KPROBES
646 fastcall void do_int3(struct pt_regs *regs, long error_code) 646 fastcall void do_int3(struct pt_regs *regs, long error_code)
647 { 647 {
648 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) 648 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
649 == NOTIFY_STOP) 649 == NOTIFY_STOP)
650 return; 650 return;
651 /* This is an interrupt gate, because kprobes wants interrupts 651 /* This is an interrupt gate, because kprobes wants interrupts
652 disabled. Normal trap handlers don't. */ 652 disabled. Normal trap handlers don't. */
653 restore_interrupts(regs); 653 restore_interrupts(regs);
654 do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); 654 do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
655 } 655 }
656 #endif 656 #endif
657 657
658 /* 658 /*
659 * Our handling of the processor debug registers is non-trivial. 659 * Our handling of the processor debug registers is non-trivial.
660 * We do not clear them on entry and exit from the kernel. Therefore 660 * We do not clear them on entry and exit from the kernel. Therefore
661 * it is possible to get a watchpoint trap here from inside the kernel. 661 * it is possible to get a watchpoint trap here from inside the kernel.
662 * However, the code in ./ptrace.c has ensured that the user can 662 * However, the code in ./ptrace.c has ensured that the user can
663 * only set watchpoints on userspace addresses. Therefore the in-kernel 663 * only set watchpoints on userspace addresses. Therefore the in-kernel
664 * watchpoint trap can only occur in code which is reading/writing 664 * watchpoint trap can only occur in code which is reading/writing
665 * from user space. Such code must not hold kernel locks (since it 665 * from user space. Such code must not hold kernel locks (since it
666 * can equally take a page fault), therefore it is safe to call 666 * can equally take a page fault), therefore it is safe to call
667 * force_sig_info even though that claims and releases locks. 667 * force_sig_info even though that claims and releases locks.
668 * 668 *
669 * Code in ./signal.c ensures that the debug control register 669 * Code in ./signal.c ensures that the debug control register
670 * is restored before we deliver any signal, and therefore that 670 * is restored before we deliver any signal, and therefore that
671 * user code runs with the correct debug control register even though 671 * user code runs with the correct debug control register even though
672 * we clear it here. 672 * we clear it here.
673 * 673 *
674 * Being careful here means that we don't have to be as careful in a 674 * Being careful here means that we don't have to be as careful in a
675 * lot of more complicated places (task switching can be a bit lazy 675 * lot of more complicated places (task switching can be a bit lazy
676 * about restoring all the debug state, and ptrace doesn't have to 676 * about restoring all the debug state, and ptrace doesn't have to
677 * find every occurrence of the TF bit that could be saved away even 677 * find every occurrence of the TF bit that could be saved away even
678 * by user code) 678 * by user code)
679 */ 679 */
680 fastcall void do_debug(struct pt_regs * regs, long error_code) 680 fastcall void do_debug(struct pt_regs * regs, long error_code)
681 { 681 {
682 unsigned int condition; 682 unsigned int condition;
683 struct task_struct *tsk = current; 683 struct task_struct *tsk = current;
684 684
685 __asm__ __volatile__("movl %%db6,%0" : "=r" (condition)); 685 __asm__ __volatile__("movl %%db6,%0" : "=r" (condition));
686 686
687 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, 687 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
688 SIGTRAP) == NOTIFY_STOP) 688 SIGTRAP) == NOTIFY_STOP)
689 return; 689 return;
690 /* It's safe to allow irq's after DR6 has been saved */ 690 /* It's safe to allow irq's after DR6 has been saved */
691 if (regs->eflags & X86_EFLAGS_IF) 691 if (regs->eflags & X86_EFLAGS_IF)
692 local_irq_enable(); 692 local_irq_enable();
693 693
694 /* Mask out spurious debug traps due to lazy DR7 setting */ 694 /* Mask out spurious debug traps due to lazy DR7 setting */
695 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { 695 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
696 if (!tsk->thread.debugreg[7]) 696 if (!tsk->thread.debugreg[7])
697 goto clear_dr7; 697 goto clear_dr7;
698 } 698 }
699 699
700 if (regs->eflags & VM_MASK) 700 if (regs->eflags & VM_MASK)
701 goto debug_vm86; 701 goto debug_vm86;
702 702
703 /* Save debug status register where ptrace can see it */ 703 /* Save debug status register where ptrace can see it */
704 tsk->thread.debugreg[6] = condition; 704 tsk->thread.debugreg[6] = condition;
705 705
706 /* 706 /*
707 * Single-stepping through TF: make sure we ignore any events in 707 * Single-stepping through TF: make sure we ignore any events in
708 * kernel space (but re-enable TF when returning to user mode). 708 * kernel space (but re-enable TF when returning to user mode).
709 */ 709 */
710 if (condition & DR_STEP) { 710 if (condition & DR_STEP) {
711 /* 711 /*
712 * We already checked v86 mode above, so we can 712 * We already checked v86 mode above, so we can
713 * check for kernel mode by just checking the CPL 713 * check for kernel mode by just checking the CPL
714 * of CS. 714 * of CS.
715 */ 715 */
716 if ((regs->xcs & 3) == 0) 716 if ((regs->xcs & 3) == 0)
717 goto clear_TF_reenable; 717 goto clear_TF_reenable;
718 } 718 }
719 719
720 /* Ok, finally something we can handle */ 720 /* Ok, finally something we can handle */
721 send_sigtrap(tsk, regs, error_code); 721 send_sigtrap(tsk, regs, error_code);
722 722
723 /* Disable additional traps. They'll be re-enabled when 723 /* Disable additional traps. They'll be re-enabled when
724 * the signal is delivered. 724 * the signal is delivered.
725 */ 725 */
726 clear_dr7: 726 clear_dr7:
727 __asm__("movl %0,%%db7" 727 __asm__("movl %0,%%db7"
728 : /* no output */ 728 : /* no output */
729 : "r" (0)); 729 : "r" (0));
730 return; 730 return;
731 731
732 debug_vm86: 732 debug_vm86:
733 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); 733 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
734 return; 734 return;
735 735
736 clear_TF_reenable: 736 clear_TF_reenable:
737 set_tsk_thread_flag(tsk, TIF_SINGLESTEP); 737 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
738 regs->eflags &= ~TF_MASK; 738 regs->eflags &= ~TF_MASK;
739 return; 739 return;
740 } 740 }
741 741
742 /* 742 /*
743 * Note that we play around with the 'TS' bit in an attempt to get 743 * Note that we play around with the 'TS' bit in an attempt to get
744 * the correct behaviour even in the presence of the asynchronous 744 * the correct behaviour even in the presence of the asynchronous
745 * IRQ13 behaviour 745 * IRQ13 behaviour
746 */ 746 */
747 void math_error(void __user *eip) 747 void math_error(void __user *eip)
748 { 748 {
749 struct task_struct * task; 749 struct task_struct * task;
750 siginfo_t info; 750 siginfo_t info;
751 unsigned short cwd, swd; 751 unsigned short cwd, swd;
752 752
753 /* 753 /*
754 * Save the info for the exception handler and clear the error. 754 * Save the info for the exception handler and clear the error.
755 */ 755 */
756 task = current; 756 task = current;
757 save_init_fpu(task); 757 save_init_fpu(task);
758 task->thread.trap_no = 16; 758 task->thread.trap_no = 16;
759 task->thread.error_code = 0; 759 task->thread.error_code = 0;
760 info.si_signo = SIGFPE; 760 info.si_signo = SIGFPE;
761 info.si_errno = 0; 761 info.si_errno = 0;
762 info.si_code = __SI_FAULT; 762 info.si_code = __SI_FAULT;
763 info.si_addr = eip; 763 info.si_addr = eip;
764 /* 764 /*
765 * (~cwd & swd) will mask out exceptions that are not set to unmasked 765 * (~cwd & swd) will mask out exceptions that are not set to unmasked
766 * status. 0x3f is the exception bits in these regs, 0x200 is the 766 * status. 0x3f is the exception bits in these regs, 0x200 is the
767 * C1 reg you need in case of a stack fault, 0x040 is the stack 767 * C1 reg you need in case of a stack fault, 0x040 is the stack
768 * fault bit. We should only be taking one exception at a time, 768 * fault bit. We should only be taking one exception at a time,
769 * so if this combination doesn't produce any single exception, 769 * so if this combination doesn't produce any single exception,
770 * then we have a bad program that isn't syncronizing its FPU usage 770 * then we have a bad program that isn't syncronizing its FPU usage
771 * and it will suffer the consequences since we won't be able to 771 * and it will suffer the consequences since we won't be able to
772 * fully reproduce the context of the exception 772 * fully reproduce the context of the exception
773 */ 773 */
774 cwd = get_fpu_cwd(task); 774 cwd = get_fpu_cwd(task);
775 swd = get_fpu_swd(task); 775 swd = get_fpu_swd(task);
776 switch (((~cwd) & swd & 0x3f) | (swd & 0x240)) { 776 switch (((~cwd) & swd & 0x3f) | (swd & 0x240)) {
777 case 0x000: 777 case 0x000:
778 default: 778 default:
779 break; 779 break;
780 case 0x001: /* Invalid Op */ 780 case 0x001: /* Invalid Op */
781 case 0x041: /* Stack Fault */ 781 case 0x041: /* Stack Fault */
782 case 0x241: /* Stack Fault | Direction */ 782 case 0x241: /* Stack Fault | Direction */
783 info.si_code = FPE_FLTINV; 783 info.si_code = FPE_FLTINV;
784 /* Should we clear the SF or let user space do it ???? */ 784 /* Should we clear the SF or let user space do it ???? */
785 break; 785 break;
786 case 0x002: /* Denormalize */ 786 case 0x002: /* Denormalize */
787 case 0x010: /* Underflow */ 787 case 0x010: /* Underflow */
788 info.si_code = FPE_FLTUND; 788 info.si_code = FPE_FLTUND;
789 break; 789 break;
790 case 0x004: /* Zero Divide */ 790 case 0x004: /* Zero Divide */
791 info.si_code = FPE_FLTDIV; 791 info.si_code = FPE_FLTDIV;
792 break; 792 break;
793 case 0x008: /* Overflow */ 793 case 0x008: /* Overflow */
794 info.si_code = FPE_FLTOVF; 794 info.si_code = FPE_FLTOVF;
795 break; 795 break;
796 case 0x020: /* Precision */ 796 case 0x020: /* Precision */
797 info.si_code = FPE_FLTRES; 797 info.si_code = FPE_FLTRES;
798 break; 798 break;
799 } 799 }
800 force_sig_info(SIGFPE, &info, task); 800 force_sig_info(SIGFPE, &info, task);
801 } 801 }
802 802
803 fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code) 803 fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)
804 { 804 {
805 ignore_fpu_irq = 1; 805 ignore_fpu_irq = 1;
806 math_error((void __user *)regs->eip); 806 math_error((void __user *)regs->eip);
807 } 807 }
808 808
809 static void simd_math_error(void __user *eip) 809 static void simd_math_error(void __user *eip)
810 { 810 {
811 struct task_struct * task; 811 struct task_struct * task;
812 siginfo_t info; 812 siginfo_t info;
813 unsigned short mxcsr; 813 unsigned short mxcsr;
814 814
815 /* 815 /*
816 * Save the info for the exception handler and clear the error. 816 * Save the info for the exception handler and clear the error.
817 */ 817 */
818 task = current; 818 task = current;
819 save_init_fpu(task); 819 save_init_fpu(task);
820 task->thread.trap_no = 19; 820 task->thread.trap_no = 19;
821 task->thread.error_code = 0; 821 task->thread.error_code = 0;
822 info.si_signo = SIGFPE; 822 info.si_signo = SIGFPE;
823 info.si_errno = 0; 823 info.si_errno = 0;
824 info.si_code = __SI_FAULT; 824 info.si_code = __SI_FAULT;
825 info.si_addr = eip; 825 info.si_addr = eip;
826 /* 826 /*
827 * The SIMD FPU exceptions are handled a little differently, as there 827 * The SIMD FPU exceptions are handled a little differently, as there
828 * is only a single status/control register. Thus, to determine which 828 * is only a single status/control register. Thus, to determine which
829 * unmasked exception was caught we must mask the exception mask bits 829 * unmasked exception was caught we must mask the exception mask bits
830 * at 0x1f80, and then use these to mask the exception bits at 0x3f. 830 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
831 */ 831 */
832 mxcsr = get_fpu_mxcsr(task); 832 mxcsr = get_fpu_mxcsr(task);
833 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { 833 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
834 case 0x000: 834 case 0x000:
835 default: 835 default:
836 break; 836 break;
837 case 0x001: /* Invalid Op */ 837 case 0x001: /* Invalid Op */
838 info.si_code = FPE_FLTINV; 838 info.si_code = FPE_FLTINV;
839 break; 839 break;
840 case 0x002: /* Denormalize */ 840 case 0x002: /* Denormalize */
841 case 0x010: /* Underflow */ 841 case 0x010: /* Underflow */
842 info.si_code = FPE_FLTUND; 842 info.si_code = FPE_FLTUND;
843 break; 843 break;
844 case 0x004: /* Zero Divide */ 844 case 0x004: /* Zero Divide */
845 info.si_code = FPE_FLTDIV; 845 info.si_code = FPE_FLTDIV;
846 break; 846 break;
847 case 0x008: /* Overflow */ 847 case 0x008: /* Overflow */
848 info.si_code = FPE_FLTOVF; 848 info.si_code = FPE_FLTOVF;
849 break; 849 break;
850 case 0x020: /* Precision */ 850 case 0x020: /* Precision */
851 info.si_code = FPE_FLTRES; 851 info.si_code = FPE_FLTRES;
852 break; 852 break;
853 } 853 }
854 force_sig_info(SIGFPE, &info, task); 854 force_sig_info(SIGFPE, &info, task);
855 } 855 }
856 856
857 fastcall void do_simd_coprocessor_error(struct pt_regs * regs, 857 fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
858 long error_code) 858 long error_code)
859 { 859 {
860 if (cpu_has_xmm) { 860 if (cpu_has_xmm) {
861 /* Handle SIMD FPU exceptions on PIII+ processors. */ 861 /* Handle SIMD FPU exceptions on PIII+ processors. */
862 ignore_fpu_irq = 1; 862 ignore_fpu_irq = 1;
863 simd_math_error((void __user *)regs->eip); 863 simd_math_error((void __user *)regs->eip);
864 } else { 864 } else {
865 /* 865 /*
866 * Handle strange cache flush from user space exception 866 * Handle strange cache flush from user space exception
867 * in all other cases. This is undocumented behaviour. 867 * in all other cases. This is undocumented behaviour.
868 */ 868 */
869 if (regs->eflags & VM_MASK) { 869 if (regs->eflags & VM_MASK) {
870 handle_vm86_fault((struct kernel_vm86_regs *)regs, 870 handle_vm86_fault((struct kernel_vm86_regs *)regs,
871 error_code); 871 error_code);
872 return; 872 return;
873 } 873 }
874 die_if_kernel("cache flush denied", regs, error_code); 874 die_if_kernel("cache flush denied", regs, error_code);
875 current->thread.trap_no = 19; 875 current->thread.trap_no = 19;
876 current->thread.error_code = error_code; 876 current->thread.error_code = error_code;
877 force_sig(SIGSEGV, current); 877 force_sig(SIGSEGV, current);
878 } 878 }
879 } 879 }
880 880
881 fastcall void do_spurious_interrupt_bug(struct pt_regs * regs, 881 fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
882 long error_code) 882 long error_code)
883 { 883 {
884 #if 0 884 #if 0
885 /* No need to warn about this any longer. */ 885 /* No need to warn about this any longer. */
886 printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); 886 printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
887 #endif 887 #endif
888 } 888 }
889 889
890 fastcall void setup_x86_bogus_stack(unsigned char * stk) 890 fastcall void setup_x86_bogus_stack(unsigned char * stk)
891 { 891 {
892 unsigned long *switch16_ptr, *switch32_ptr; 892 unsigned long *switch16_ptr, *switch32_ptr;
893 struct pt_regs *regs; 893 struct pt_regs *regs;
894 unsigned long stack_top, stack_bot; 894 unsigned long stack_top, stack_bot;
895 unsigned short iret_frame16_off; 895 unsigned short iret_frame16_off;
896 int cpu = smp_processor_id(); 896 int cpu = smp_processor_id();
897 /* reserve the space on 32bit stack for the magic switch16 pointer */ 897 /* reserve the space on 32bit stack for the magic switch16 pointer */
898 memmove(stk, stk + 8, sizeof(struct pt_regs)); 898 memmove(stk, stk + 8, sizeof(struct pt_regs));
899 switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs)); 899 switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs));
900 regs = (struct pt_regs *)stk; 900 regs = (struct pt_regs *)stk;
901 /* now the switch32 on 16bit stack */ 901 /* now the switch32 on 16bit stack */
902 stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu); 902 stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
903 stack_top = stack_bot + CPU_16BIT_STACK_SIZE; 903 stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
904 switch32_ptr = (unsigned long *)(stack_top - 8); 904 switch32_ptr = (unsigned long *)(stack_top - 8);
905 iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20; 905 iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20;
906 /* copy iret frame on 16bit stack */ 906 /* copy iret frame on 16bit stack */
907 memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20); 907 memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20);
908 /* fill in the switch pointers */ 908 /* fill in the switch pointers */
909 switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off; 909 switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off;
910 switch16_ptr[1] = __ESPFIX_SS; 910 switch16_ptr[1] = __ESPFIX_SS;
911 switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) + 911 switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
912 8 - CPU_16BIT_STACK_SIZE; 912 8 - CPU_16BIT_STACK_SIZE;
913 switch32_ptr[1] = __KERNEL_DS; 913 switch32_ptr[1] = __KERNEL_DS;
914 } 914 }
915 915
916 fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp) 916 fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
917 { 917 {
918 unsigned long *switch32_ptr; 918 unsigned long *switch32_ptr;
919 unsigned char *stack16, *stack32; 919 unsigned char *stack16, *stack32;
920 unsigned long stack_top, stack_bot; 920 unsigned long stack_top, stack_bot;
921 int len; 921 int len;
922 int cpu = smp_processor_id(); 922 int cpu = smp_processor_id();
923 stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu); 923 stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
924 stack_top = stack_bot + CPU_16BIT_STACK_SIZE; 924 stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
925 switch32_ptr = (unsigned long *)(stack_top - 8); 925 switch32_ptr = (unsigned long *)(stack_top - 8);
926 /* copy the data from 16bit stack to 32bit stack */ 926 /* copy the data from 16bit stack to 32bit stack */
927 len = CPU_16BIT_STACK_SIZE - 8 - sp; 927 len = CPU_16BIT_STACK_SIZE - 8 - sp;
928 stack16 = (unsigned char *)(stack_bot + sp); 928 stack16 = (unsigned char *)(stack_bot + sp);
929 stack32 = (unsigned char *) 929 stack32 = (unsigned char *)
930 (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len); 930 (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len);
931 memcpy(stack32, stack16, len); 931 memcpy(stack32, stack16, len);
932 return stack32; 932 return stack32;
933 } 933 }
934 934
935 /* 935 /*
936 * 'math_state_restore()' saves the current math information in the 936 * 'math_state_restore()' saves the current math information in the
937 * old math state array, and gets the new ones from the current task 937 * old math state array, and gets the new ones from the current task
938 * 938 *
939 * Careful.. There are problems with IBM-designed IRQ13 behaviour. 939 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
940 * Don't touch unless you *really* know how it works. 940 * Don't touch unless you *really* know how it works.
941 * 941 *
942 * Must be called with kernel preemption disabled (in this case, 942 * Must be called with kernel preemption disabled (in this case,
943 * local interrupts are disabled at the call-site in entry.S). 943 * local interrupts are disabled at the call-site in entry.S).
944 */ 944 */
945 asmlinkage void math_state_restore(struct pt_regs regs) 945 asmlinkage void math_state_restore(struct pt_regs regs)
946 { 946 {
947 struct thread_info *thread = current_thread_info(); 947 struct thread_info *thread = current_thread_info();
948 struct task_struct *tsk = thread->task; 948 struct task_struct *tsk = thread->task;
949 949
950 clts(); /* Allow maths ops (or we recurse) */ 950 clts(); /* Allow maths ops (or we recurse) */
951 if (!tsk_used_math(tsk)) 951 if (!tsk_used_math(tsk))
952 init_fpu(tsk); 952 init_fpu(tsk);
953 restore_fpu(tsk); 953 restore_fpu(tsk);
954 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ 954 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
955 } 955 }
956 956
957 #ifndef CONFIG_MATH_EMULATION 957 #ifndef CONFIG_MATH_EMULATION
958 958
959 asmlinkage void math_emulate(long arg) 959 asmlinkage void math_emulate(long arg)
960 { 960 {
961 printk("math-emulation not enabled and no coprocessor found.\n"); 961 printk("math-emulation not enabled and no coprocessor found.\n");
962 printk("killing %s.\n",current->comm); 962 printk("killing %s.\n",current->comm);
963 force_sig(SIGFPE,current); 963 force_sig(SIGFPE,current);
964 schedule(); 964 schedule();
965 } 965 }
966 966
967 #endif /* CONFIG_MATH_EMULATION */ 967 #endif /* CONFIG_MATH_EMULATION */
968 968
969 #ifdef CONFIG_X86_F00F_BUG 969 #ifdef CONFIG_X86_F00F_BUG
970 void __init trap_init_f00f_bug(void) 970 void __init trap_init_f00f_bug(void)
971 { 971 {
972 __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); 972 __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
973 973
974 /* 974 /*
975 * Update the IDT descriptor and reload the IDT so that 975 * Update the IDT descriptor and reload the IDT so that
976 * it uses the read-only mapped virtual address. 976 * it uses the read-only mapped virtual address.
977 */ 977 */
978 idt_descr.address = fix_to_virt(FIX_F00F_IDT); 978 idt_descr.address = fix_to_virt(FIX_F00F_IDT);
979 __asm__ __volatile__("lidt %0" : : "m" (idt_descr)); 979 __asm__ __volatile__("lidt %0" : : "m" (idt_descr));
980 } 980 }
981 #endif 981 #endif
982 982
983 #define _set_gate(gate_addr,type,dpl,addr,seg) \ 983 #define _set_gate(gate_addr,type,dpl,addr,seg) \
984 do { \ 984 do { \
985 int __d0, __d1; \ 985 int __d0, __d1; \
986 __asm__ __volatile__ ("movw %%dx,%%ax\n\t" \ 986 __asm__ __volatile__ ("movw %%dx,%%ax\n\t" \
987 "movw %4,%%dx\n\t" \ 987 "movw %4,%%dx\n\t" \
988 "movl %%eax,%0\n\t" \ 988 "movl %%eax,%0\n\t" \
989 "movl %%edx,%1" \ 989 "movl %%edx,%1" \
990 :"=m" (*((long *) (gate_addr))), \ 990 :"=m" (*((long *) (gate_addr))), \
991 "=m" (*(1+(long *) (gate_addr))), "=&a" (__d0), "=&d" (__d1) \ 991 "=m" (*(1+(long *) (gate_addr))), "=&a" (__d0), "=&d" (__d1) \
992 :"i" ((short) (0x8000+(dpl<<13)+(type<<8))), \ 992 :"i" ((short) (0x8000+(dpl<<13)+(type<<8))), \
993 "3" ((char *) (addr)),"2" ((seg) << 16)); \ 993 "3" ((char *) (addr)),"2" ((seg) << 16)); \
994 } while (0) 994 } while (0)
995 995
996 996
997 /* 997 /*
998 * This needs to use 'idt_table' rather than 'idt', and 998 * This needs to use 'idt_table' rather than 'idt', and
999 * thus use the _nonmapped_ version of the IDT, as the 999 * thus use the _nonmapped_ version of the IDT, as the
1000 * Pentium F0 0F bugfix may have resulted in the mapped 1000 * Pentium F0 0F bugfix may have resulted in the mapped
1001 * IDT being write-protected. 1001 * IDT being write-protected.
1002 */ 1002 */
1003 void set_intr_gate(unsigned int n, void *addr) 1003 void set_intr_gate(unsigned int n, void *addr)
1004 { 1004 {
1005 _set_gate(idt_table+n,14,0,addr,__KERNEL_CS); 1005 _set_gate(idt_table+n,14,0,addr,__KERNEL_CS);
1006 } 1006 }
1007 1007
1008 /* 1008 /*
1009 * This routine sets up an interrupt gate at descriptor privilege level 3. 1009 * This routine sets up an interrupt gate at descriptor privilege level 3.
1010 */ 1010 */
1011 static inline void set_system_intr_gate(unsigned int n, void *addr) 1011 static inline void set_system_intr_gate(unsigned int n, void *addr)
1012 { 1012 {
1013 _set_gate(idt_table+n, 14, 3, addr, __KERNEL_CS); 1013 _set_gate(idt_table+n, 14, 3, addr, __KERNEL_CS);
1014 } 1014 }
1015 1015
1016 static void __init set_trap_gate(unsigned int n, void *addr) 1016 static void __init set_trap_gate(unsigned int n, void *addr)
1017 { 1017 {
1018 _set_gate(idt_table+n,15,0,addr,__KERNEL_CS); 1018 _set_gate(idt_table+n,15,0,addr,__KERNEL_CS);
1019 } 1019 }
1020 1020
1021 static void __init set_system_gate(unsigned int n, void *addr) 1021 static void __init set_system_gate(unsigned int n, void *addr)
1022 { 1022 {
1023 _set_gate(idt_table+n,15,3,addr,__KERNEL_CS); 1023 _set_gate(idt_table+n,15,3,addr,__KERNEL_CS);
1024 } 1024 }
1025 1025
1026 static void __init set_task_gate(unsigned int n, unsigned int gdt_entry) 1026 static void __init set_task_gate(unsigned int n, unsigned int gdt_entry)
1027 { 1027 {
1028 _set_gate(idt_table+n,5,0,0,(gdt_entry<<3)); 1028 _set_gate(idt_table+n,5,0,0,(gdt_entry<<3));
1029 } 1029 }
1030 1030
1031 1031
1032 void __init trap_init(void) 1032 void __init trap_init(void)
1033 { 1033 {
1034 #ifdef CONFIG_EISA 1034 #ifdef CONFIG_EISA
1035 void __iomem *p = ioremap(0x0FFFD9, 4); 1035 void __iomem *p = ioremap(0x0FFFD9, 4);
1036 if (readl(p) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) { 1036 if (readl(p) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) {
1037 EISA_bus = 1; 1037 EISA_bus = 1;
1038 } 1038 }
1039 iounmap(p); 1039 iounmap(p);
1040 #endif 1040 #endif
1041 1041
1042 #ifdef CONFIG_X86_LOCAL_APIC 1042 #ifdef CONFIG_X86_LOCAL_APIC
1043 init_apic_mappings(); 1043 init_apic_mappings();
1044 #endif 1044 #endif
1045 1045
1046 set_trap_gate(0,&divide_error); 1046 set_trap_gate(0,&divide_error);
1047 set_intr_gate(1,&debug); 1047 set_intr_gate(1,&debug);
1048 set_intr_gate(2,&nmi); 1048 set_intr_gate(2,&nmi);
1049 set_system_intr_gate(3, &int3); /* int3-5 can be called from all */ 1049 set_system_intr_gate(3, &int3); /* int3-5 can be called from all */
1050 set_system_gate(4,&overflow); 1050 set_system_gate(4,&overflow);
1051 set_system_gate(5,&bounds); 1051 set_system_gate(5,&bounds);
1052 set_trap_gate(6,&invalid_op); 1052 set_trap_gate(6,&invalid_op);
1053 set_trap_gate(7,&device_not_available); 1053 set_trap_gate(7,&device_not_available);
1054 set_task_gate(8,GDT_ENTRY_DOUBLEFAULT_TSS); 1054 set_task_gate(8,GDT_ENTRY_DOUBLEFAULT_TSS);
1055 set_trap_gate(9,&coprocessor_segment_overrun); 1055 set_trap_gate(9,&coprocessor_segment_overrun);
1056 set_trap_gate(10,&invalid_TSS); 1056 set_trap_gate(10,&invalid_TSS);
1057 set_trap_gate(11,&segment_not_present); 1057 set_trap_gate(11,&segment_not_present);
1058 set_trap_gate(12,&stack_segment); 1058 set_trap_gate(12,&stack_segment);
1059 set_trap_gate(13,&general_protection); 1059 set_trap_gate(13,&general_protection);
1060 set_intr_gate(14,&page_fault); 1060 set_intr_gate(14,&page_fault);
1061 set_trap_gate(15,&spurious_interrupt_bug); 1061 set_trap_gate(15,&spurious_interrupt_bug);
1062 set_trap_gate(16,&coprocessor_error); 1062 set_trap_gate(16,&coprocessor_error);
1063 set_trap_gate(17,&alignment_check); 1063 set_trap_gate(17,&alignment_check);
1064 #ifdef CONFIG_X86_MCE 1064 #ifdef CONFIG_X86_MCE
1065 set_trap_gate(18,&machine_check); 1065 set_trap_gate(18,&machine_check);
1066 #endif 1066 #endif
1067 set_trap_gate(19,&simd_coprocessor_error); 1067 set_trap_gate(19,&simd_coprocessor_error);
1068 1068
1069 set_system_gate(SYSCALL_VECTOR,&system_call); 1069 set_system_gate(SYSCALL_VECTOR,&system_call);
1070 1070
1071 /* 1071 /*
1072 * Should be a barrier for any external CPU state. 1072 * Should be a barrier for any external CPU state.
1073 */ 1073 */
1074 cpu_init(); 1074 cpu_init();
1075 1075
1076 trap_init_hook(); 1076 trap_init_hook();
1077 } 1077 }
1078 1078
1079 static int __init kstack_setup(char *s) 1079 static int __init kstack_setup(char *s)
1080 { 1080 {
1081 kstack_depth_to_print = simple_strtoul(s, NULL, 0); 1081 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
1082 return 0; 1082 return 0;
1083 } 1083 }
1084 __setup("kstack=", kstack_setup); 1084 __setup("kstack=", kstack_setup);
1085 1085
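
A note on the two variants this patch leaves behind: fixup_x86_bogus_stack() above keeps the checked smp_processor_id(), which is only valid where the caller has already ruled out migration, while raw_smp_processor_id() skips the check for places where a momentarily stale CPU number is harmless. The sketch below is a userspace analogy only — sched_getcpu() and sched_setaffinity() stand in for the kernel primitives — of why an unchecked CPU-id read is only trustworthy once migration is ruled out.

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        int before = sched_getcpu();      /* analogue of raw_smp_processor_id() */
        usleep(200 * 1000);               /* the scheduler is free to migrate us here */
        int after = sched_getcpu();

        printf("cpu before sleep: %d, after: %d%s\n", before, after,
               before == after ? "" : "  (migrated - a cached cpu id went stale)");

        if (before >= 0) {
            /* analogue of the safe pattern: get_cpu()/put_cpu() disables preemption
             * in the kernel; pinning affinity is the closest userspace knob */
            cpu_set_t set;
            CPU_ZERO(&set);
            CPU_SET(before, &set);
            if (sched_setaffinity(0, sizeof(set), &set) == 0)
                printf("pinned to cpu %d; the id is now stable\n", sched_getcpu());
        }
        return 0;
    }
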
arch/i386/lib/delay.c
1 /* 1 /*
2 * Precise Delay Loops for i386 2 * Precise Delay Loops for i386
3 * 3 *
4 * Copyright (C) 1993 Linus Torvalds 4 * Copyright (C) 1993 Linus Torvalds
5 * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz> 5 * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
6 * 6 *
7 * The __delay function must _NOT_ be inlined as its execution time 7 * The __delay function must _NOT_ be inlined as its execution time
8 * depends wildly on alignment on many x86 processors. The additional 8 * depends wildly on alignment on many x86 processors. The additional
9 * jump magic is needed to get the timing stable on all the CPUs 9 * jump magic is needed to get the timing stable on all the CPUs
10 * we have to worry about. 10 * we have to worry about.
11 */ 11 */
12 12
13 #include <linux/config.h> 13 #include <linux/config.h>
14 #include <linux/sched.h> 14 #include <linux/sched.h>
15 #include <linux/delay.h> 15 #include <linux/delay.h>
16 #include <asm/processor.h> 16 #include <asm/processor.h>
17 #include <asm/delay.h> 17 #include <asm/delay.h>
18 #include <asm/timer.h> 18 #include <asm/timer.h>
19 19
20 #ifdef CONFIG_SMP 20 #ifdef CONFIG_SMP
21 #include <asm/smp.h> 21 #include <asm/smp.h>
22 #endif 22 #endif
23 23
24 extern struct timer_opts* timer; 24 extern struct timer_opts* timer;
25 25
26 void __delay(unsigned long loops) 26 void __delay(unsigned long loops)
27 { 27 {
28 cur_timer->delay(loops); 28 cur_timer->delay(loops);
29 } 29 }
30 30
31 inline void __const_udelay(unsigned long xloops) 31 inline void __const_udelay(unsigned long xloops)
32 { 32 {
33 int d0; 33 int d0;
34 xloops *= 4; 34 xloops *= 4;
35 __asm__("mull %0" 35 __asm__("mull %0"
36 :"=d" (xloops), "=&a" (d0) 36 :"=d" (xloops), "=&a" (d0)
37 :"1" (xloops),"0" (cpu_data[_smp_processor_id()].loops_per_jiffy * (HZ/4))); 37 :"1" (xloops),"0" (cpu_data[raw_smp_processor_id()].loops_per_jiffy * (HZ/4)));
38 __delay(++xloops); 38 __delay(++xloops);
39 } 39 }
40 40
41 void __udelay(unsigned long usecs) 41 void __udelay(unsigned long usecs)
42 { 42 {
43 __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ 43 __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
44 } 44 }
45 45
46 void __ndelay(unsigned long nsecs) 46 void __ndelay(unsigned long nsecs)
47 { 47 {
48 __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ 48 __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
49 } 49 }
50 50
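
The magic constant in __udelay() above is worth spelling out: 0x000010c7 is ceil(2^32 / 10^6), so multiplying by it and then keeping only the high 32 bits of the mull amounts to computing usecs * loops_per_jiffy * HZ / 10^6 without a division. A small userspace sketch of that fixed-point arithmetic (the loops_per_jiffy and HZ values below are made-up calibration figures; the kernel additionally splits the multiply into *4 and HZ/4 to stay within 32 bits):

    #include <stdint.h>
    #include <stdio.h>

    static unsigned long usecs_to_loops(unsigned long usecs,
                                        unsigned long loops_per_jiffy,
                                        unsigned long hz)
    {
        uint64_t xloops = (uint64_t)usecs * 0x000010c7;   /* ~ usecs * 2^32 / 1e6 */
        /* (xloops * lpj * HZ) >> 32  ==  usecs * lpj * HZ / 1e6, the loop count */
        return (unsigned long)((xloops * loops_per_jiffy * hz) >> 32);
    }

    int main(void)
    {
        /* hypothetical calibration: 4,000,000 loops/jiffy at HZ=1000
         * means 4e9 delay loops per second */
        unsigned long lpj = 4000000, hz = 1000;

        printf("0x10c7 = %lu, ceil(2^32/1e6) = %llu\n",
               0x10c7UL, ((1ULL << 32) + 999999) / 1000000);
        printf("udelay(10)  -> %lu loops (expect ~40000)\n",
               usecs_to_loops(10, lpj, hz));
        printf("udelay(100) -> %lu loops (expect ~400000)\n",
               usecs_to_loops(100, lpj, hz));
        return 0;
    }
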
arch/ppc/lib/locks.c
1 /* 1 /*
2 * Locks for smp ppc 2 * Locks for smp ppc
3 * 3 *
4 * Written by Cort Dougan (cort@cs.nmt.edu) 4 * Written by Cort Dougan (cort@cs.nmt.edu)
5 */ 5 */
6 6
7 #include <linux/config.h> 7 #include <linux/config.h>
8 #include <linux/sched.h> 8 #include <linux/sched.h>
9 #include <linux/spinlock.h> 9 #include <linux/spinlock.h>
10 #include <linux/module.h> 10 #include <linux/module.h>
11 #include <asm/ppc_asm.h> 11 #include <asm/ppc_asm.h>
12 #include <asm/smp.h> 12 #include <asm/smp.h>
13 13
14 #ifdef CONFIG_DEBUG_SPINLOCK 14 #ifdef CONFIG_DEBUG_SPINLOCK
15 15
16 #undef INIT_STUCK 16 #undef INIT_STUCK
17 #define INIT_STUCK 200000000 /*0xffffffff*/ 17 #define INIT_STUCK 200000000 /*0xffffffff*/
18 18
19 /* 19 /*
20 * Try to acquire a spinlock. 20 * Try to acquire a spinlock.
21 * Only does the stwcx. if the load returned 0 - the Programming 21 * Only does the stwcx. if the load returned 0 - the Programming
22 * Environments Manual suggests not doing unnecessary stwcx.'s 22 * Environments Manual suggests not doing unnecessary stwcx.'s
23 * since they may inhibit forward progress by other CPUs in getting 23 * since they may inhibit forward progress by other CPUs in getting
24 * a lock. 24 * a lock.
25 */ 25 */
26 static inline unsigned long __spin_trylock(volatile unsigned long *lock) 26 static inline unsigned long __spin_trylock(volatile unsigned long *lock)
27 { 27 {
28 unsigned long ret; 28 unsigned long ret;
29 29
30 __asm__ __volatile__ ("\n\ 30 __asm__ __volatile__ ("\n\
31 1: lwarx %0,0,%1\n\ 31 1: lwarx %0,0,%1\n\
32 cmpwi 0,%0,0\n\ 32 cmpwi 0,%0,0\n\
33 bne 2f\n" 33 bne 2f\n"
34 PPC405_ERR77(0,%1) 34 PPC405_ERR77(0,%1)
35 " stwcx. %2,0,%1\n\ 35 " stwcx. %2,0,%1\n\
36 bne- 1b\n\ 36 bne- 1b\n\
37 isync\n\ 37 isync\n\
38 2:" 38 2:"
39 : "=&r"(ret) 39 : "=&r"(ret)
40 : "r"(lock), "r"(1) 40 : "r"(lock), "r"(1)
41 : "cr0", "memory"); 41 : "cr0", "memory");
42 42
43 return ret; 43 return ret;
44 } 44 }
45 45
46 void _raw_spin_lock(spinlock_t *lock) 46 void _raw_spin_lock(spinlock_t *lock)
47 { 47 {
48 int cpu = smp_processor_id(); 48 int cpu = smp_processor_id();
49 unsigned int stuck = INIT_STUCK; 49 unsigned int stuck = INIT_STUCK;
50 while (__spin_trylock(&lock->lock)) { 50 while (__spin_trylock(&lock->lock)) {
51 while ((unsigned volatile long)lock->lock != 0) { 51 while ((unsigned volatile long)lock->lock != 0) {
52 if (!--stuck) { 52 if (!--stuck) {
53 printk("_spin_lock(%p) CPU#%d NIP %p" 53 printk("_spin_lock(%p) CPU#%d NIP %p"
54 " holder: cpu %ld pc %08lX\n", 54 " holder: cpu %ld pc %08lX\n",
55 lock, cpu, __builtin_return_address(0), 55 lock, cpu, __builtin_return_address(0),
56 lock->owner_cpu,lock->owner_pc); 56 lock->owner_cpu,lock->owner_pc);
57 stuck = INIT_STUCK; 57 stuck = INIT_STUCK;
58 /* steal the lock */ 58 /* steal the lock */
59 /*xchg_u32((void *)&lock->lock,0);*/ 59 /*xchg_u32((void *)&lock->lock,0);*/
60 } 60 }
61 } 61 }
62 } 62 }
63 lock->owner_pc = (unsigned long)__builtin_return_address(0); 63 lock->owner_pc = (unsigned long)__builtin_return_address(0);
64 lock->owner_cpu = cpu; 64 lock->owner_cpu = cpu;
65 } 65 }
66 EXPORT_SYMBOL(_raw_spin_lock); 66 EXPORT_SYMBOL(_raw_spin_lock);
67 67
68 int _raw_spin_trylock(spinlock_t *lock) 68 int _raw_spin_trylock(spinlock_t *lock)
69 { 69 {
70 if (__spin_trylock(&lock->lock)) 70 if (__spin_trylock(&lock->lock))
71 return 0; 71 return 0;
72 lock->owner_cpu = smp_processor_id(); 72 lock->owner_cpu = smp_processor_id();
73 lock->owner_pc = (unsigned long)__builtin_return_address(0); 73 lock->owner_pc = (unsigned long)__builtin_return_address(0);
74 return 1; 74 return 1;
75 } 75 }
76 EXPORT_SYMBOL(_raw_spin_trylock); 76 EXPORT_SYMBOL(_raw_spin_trylock);
77 77
78 void _raw_spin_unlock(spinlock_t *lp) 78 void _raw_spin_unlock(spinlock_t *lp)
79 { 79 {
80 if ( !lp->lock ) 80 if ( !lp->lock )
81 printk("_spin_unlock(%p): no lock cpu %d curr PC %p %s/%d\n", 81 printk("_spin_unlock(%p): no lock cpu %d curr PC %p %s/%d\n",
82 lp, smp_processor_id(), __builtin_return_address(0), 82 lp, smp_processor_id(), __builtin_return_address(0),
83 current->comm, current->pid); 83 current->comm, current->pid);
84 if ( lp->owner_cpu != smp_processor_id() ) 84 if ( lp->owner_cpu != smp_processor_id() )
85 printk("_spin_unlock(%p): cpu %d trying clear of cpu %d pc %lx val %lx\n", 85 printk("_spin_unlock(%p): cpu %d trying clear of cpu %d pc %lx val %lx\n",
86 lp, smp_processor_id(), (int)lp->owner_cpu, 86 lp, smp_processor_id(), (int)lp->owner_cpu,
87 lp->owner_pc,lp->lock); 87 lp->owner_pc,lp->lock);
88 lp->owner_pc = lp->owner_cpu = 0; 88 lp->owner_pc = lp->owner_cpu = 0;
89 wmb(); 89 wmb();
90 lp->lock = 0; 90 lp->lock = 0;
91 } 91 }
92 EXPORT_SYMBOL(_raw_spin_unlock); 92 EXPORT_SYMBOL(_raw_spin_unlock);
93 93
94 /* 94 /*
95 * For rwlocks, zero is unlocked, -1 is write-locked, 95 * For rwlocks, zero is unlocked, -1 is write-locked,
96 * positive is read-locked. 96 * positive is read-locked.
97 */ 97 */
98 static __inline__ int __read_trylock(rwlock_t *rw) 98 static __inline__ int __read_trylock(rwlock_t *rw)
99 { 99 {
100 signed int tmp; 100 signed int tmp;
101 101
102 __asm__ __volatile__( 102 __asm__ __volatile__(
103 "2: lwarx %0,0,%1 # __read_trylock\n\ 103 "2: lwarx %0,0,%1 # __read_trylock\n\
104 addic. %0,%0,1\n\ 104 addic. %0,%0,1\n\
105 ble- 1f\n" 105 ble- 1f\n"
106 PPC405_ERR77(0,%1) 106 PPC405_ERR77(0,%1)
107 " stwcx. %0,0,%1\n\ 107 " stwcx. %0,0,%1\n\
108 bne- 2b\n\ 108 bne- 2b\n\
109 isync\n\ 109 isync\n\
110 1:" 110 1:"
111 : "=&r"(tmp) 111 : "=&r"(tmp)
112 : "r"(&rw->lock) 112 : "r"(&rw->lock)
113 : "cr0", "memory"); 113 : "cr0", "memory");
114 114
115 return tmp; 115 return tmp;
116 } 116 }
117 117
118 int _raw_read_trylock(rwlock_t *rw) 118 int _raw_read_trylock(rwlock_t *rw)
119 { 119 {
120 return __read_trylock(rw) > 0; 120 return __read_trylock(rw) > 0;
121 } 121 }
122 EXPORT_SYMBOL(_raw_read_trylock); 122 EXPORT_SYMBOL(_raw_read_trylock);
123 123
124 void _raw_read_lock(rwlock_t *rw) 124 void _raw_read_lock(rwlock_t *rw)
125 { 125 {
126 unsigned int stuck; 126 unsigned int stuck;
127 127
128 while (__read_trylock(rw) <= 0) { 128 while (__read_trylock(rw) <= 0) {
129 stuck = INIT_STUCK; 129 stuck = INIT_STUCK;
130 while (!read_can_lock(rw)) { 130 while (!read_can_lock(rw)) {
131 if (--stuck == 0) { 131 if (--stuck == 0) {
132 printk("_read_lock(%p) CPU#%d lock %d\n", 132 printk("_read_lock(%p) CPU#%d lock %d\n",
133 rw, _smp_processor_id(), rw->lock); 133 rw, raw_smp_processor_id(), rw->lock);
134 stuck = INIT_STUCK; 134 stuck = INIT_STUCK;
135 } 135 }
136 } 136 }
137 } 137 }
138 } 138 }
139 EXPORT_SYMBOL(_raw_read_lock); 139 EXPORT_SYMBOL(_raw_read_lock);
140 140
141 void _raw_read_unlock(rwlock_t *rw) 141 void _raw_read_unlock(rwlock_t *rw)
142 { 142 {
143 if ( rw->lock == 0 ) 143 if ( rw->lock == 0 )
144 printk("_read_unlock(): %s/%d (nip %08lX) lock %d\n", 144 printk("_read_unlock(): %s/%d (nip %08lX) lock %d\n",
145 current->comm,current->pid,current->thread.regs->nip, 145 current->comm,current->pid,current->thread.regs->nip,
146 rw->lock); 146 rw->lock);
147 wmb(); 147 wmb();
148 atomic_dec((atomic_t *) &(rw)->lock); 148 atomic_dec((atomic_t *) &(rw)->lock);
149 } 149 }
150 EXPORT_SYMBOL(_raw_read_unlock); 150 EXPORT_SYMBOL(_raw_read_unlock);
151 151
152 void _raw_write_lock(rwlock_t *rw) 152 void _raw_write_lock(rwlock_t *rw)
153 { 153 {
154 unsigned int stuck; 154 unsigned int stuck;
155 155
156 while (cmpxchg(&rw->lock, 0, -1) != 0) { 156 while (cmpxchg(&rw->lock, 0, -1) != 0) {
157 stuck = INIT_STUCK; 157 stuck = INIT_STUCK;
158 while (!write_can_lock(rw)) { 158 while (!write_can_lock(rw)) {
159 if (--stuck == 0) { 159 if (--stuck == 0) {
160 printk("write_lock(%p) CPU#%d lock %d)\n", 160 printk("write_lock(%p) CPU#%d lock %d)\n",
161 rw, _smp_processor_id(), rw->lock); 161 rw, raw_smp_processor_id(), rw->lock);
162 stuck = INIT_STUCK; 162 stuck = INIT_STUCK;
163 } 163 }
164 } 164 }
165 } 165 }
166 wmb(); 166 wmb();
167 } 167 }
168 EXPORT_SYMBOL(_raw_write_lock); 168 EXPORT_SYMBOL(_raw_write_lock);
169 169
170 int _raw_write_trylock(rwlock_t *rw) 170 int _raw_write_trylock(rwlock_t *rw)
171 { 171 {
172 if (cmpxchg(&rw->lock, 0, -1) != 0) 172 if (cmpxchg(&rw->lock, 0, -1) != 0)
173 return 0; 173 return 0;
174 wmb(); 174 wmb();
175 return 1; 175 return 1;
176 } 176 }
177 EXPORT_SYMBOL(_raw_write_trylock); 177 EXPORT_SYMBOL(_raw_write_trylock);
178 178
179 void _raw_write_unlock(rwlock_t *rw) 179 void _raw_write_unlock(rwlock_t *rw)
180 { 180 {
181 if (rw->lock >= 0) 181 if (rw->lock >= 0)
182 printk("_write_lock(): %s/%d (nip %08lX) lock %d\n", 182 printk("_write_lock(): %s/%d (nip %08lX) lock %d\n",
183 current->comm,current->pid,current->thread.regs->nip, 183 current->comm,current->pid,current->thread.regs->nip,
184 rw->lock); 184 rw->lock);
185 wmb(); 185 wmb();
186 rw->lock = 0; 186 rw->lock = 0;
187 } 187 }
188 EXPORT_SYMBOL(_raw_write_unlock); 188 EXPORT_SYMBOL(_raw_write_unlock);
189 189
190 #endif 190 #endif
191 191
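
The comment above _raw_read_trylock() documents the rwlock encoding: 0 means unlocked, -1 write-locked, and a positive value counts readers. A minimal userspace sketch of that encoding with C11 atomics (illustrative only; the kernel implementation above uses lwarx/stwcx. and cmpxchg(), not <stdatomic.h>):

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int lock;              /* starts at 0 = unlocked */

    static int read_trylock(void)
    {
        int old = atomic_load(&lock);
        /* mirrors __read_trylock(): only succeed if incrementing keeps the value > 0 */
        while (old >= 0) {
            if (atomic_compare_exchange_weak(&lock, &old, old + 1))
                return 1;                /* got a read hold */
        }
        return 0;                        /* write-locked (-1), give up */
    }

    static void read_unlock(void)
    {
        atomic_fetch_sub(&lock, 1);      /* mirrors the atomic_dec() above */
    }

    static int write_trylock(void)
    {
        int expected = 0;
        /* mirrors cmpxchg(&rw->lock, 0, -1) in _raw_write_trylock() */
        return atomic_compare_exchange_strong(&lock, &expected, -1);
    }

    int main(void)
    {
        printf("read_trylock: %d (lock=%d)\n", read_trylock(), atomic_load(&lock));
        printf("write_trylock while read-held: %d\n", write_trylock());
        read_unlock();
        printf("write_trylock after unlock: %d (lock=%d)\n",
               write_trylock(), atomic_load(&lock));
        return 0;
    }
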
arch/ppc64/kernel/idle.c
1 /* 1 /*
2 * Idle daemon for PowerPC. Idle daemon will handle any action 2 * Idle daemon for PowerPC. Idle daemon will handle any action
3 * that needs to be taken when the system becomes idle. 3 * that needs to be taken when the system becomes idle.
4 * 4 *
5 * Originally Written by Cort Dougan (cort@cs.nmt.edu) 5 * Originally Written by Cort Dougan (cort@cs.nmt.edu)
6 * 6 *
7 * iSeries support added by Mike Corrigan <mikejc@us.ibm.com> 7 * iSeries support added by Mike Corrigan <mikejc@us.ibm.com>
8 * 8 *
9 * Additional shared processor, SMT, and firmware support 9 * Additional shared processor, SMT, and firmware support
10 * Copyright (c) 2003 Dave Engebretsen <engebret@us.ibm.com> 10 * Copyright (c) 2003 Dave Engebretsen <engebret@us.ibm.com>
11 * 11 *
12 * This program is free software; you can redistribute it and/or 12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License 13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version. 15 * 2 of the License, or (at your option) any later version.
16 */ 16 */
17 17
18 #include <linux/config.h> 18 #include <linux/config.h>
19 #include <linux/sched.h> 19 #include <linux/sched.h>
20 #include <linux/kernel.h> 20 #include <linux/kernel.h>
21 #include <linux/smp.h> 21 #include <linux/smp.h>
22 #include <linux/cpu.h> 22 #include <linux/cpu.h>
23 #include <linux/module.h> 23 #include <linux/module.h>
24 #include <linux/sysctl.h> 24 #include <linux/sysctl.h>
25 #include <linux/smp.h> 25 #include <linux/smp.h>
26 26
27 #include <asm/system.h> 27 #include <asm/system.h>
28 #include <asm/processor.h> 28 #include <asm/processor.h>
29 #include <asm/mmu.h> 29 #include <asm/mmu.h>
30 #include <asm/cputable.h> 30 #include <asm/cputable.h>
31 #include <asm/time.h> 31 #include <asm/time.h>
32 #include <asm/iSeries/HvCall.h> 32 #include <asm/iSeries/HvCall.h>
33 #include <asm/iSeries/ItLpQueue.h> 33 #include <asm/iSeries/ItLpQueue.h>
34 #include <asm/plpar_wrappers.h> 34 #include <asm/plpar_wrappers.h>
35 #include <asm/systemcfg.h> 35 #include <asm/systemcfg.h>
36 36
37 extern void power4_idle(void); 37 extern void power4_idle(void);
38 38
39 static int (*idle_loop)(void); 39 static int (*idle_loop)(void);
40 40
41 #ifdef CONFIG_PPC_ISERIES 41 #ifdef CONFIG_PPC_ISERIES
42 static unsigned long maxYieldTime = 0; 42 static unsigned long maxYieldTime = 0;
43 static unsigned long minYieldTime = 0xffffffffffffffffUL; 43 static unsigned long minYieldTime = 0xffffffffffffffffUL;
44 44
45 static void yield_shared_processor(void) 45 static void yield_shared_processor(void)
46 { 46 {
47 unsigned long tb; 47 unsigned long tb;
48 unsigned long yieldTime; 48 unsigned long yieldTime;
49 49
50 HvCall_setEnabledInterrupts(HvCall_MaskIPI | 50 HvCall_setEnabledInterrupts(HvCall_MaskIPI |
51 HvCall_MaskLpEvent | 51 HvCall_MaskLpEvent |
52 HvCall_MaskLpProd | 52 HvCall_MaskLpProd |
53 HvCall_MaskTimeout); 53 HvCall_MaskTimeout);
54 54
55 tb = get_tb(); 55 tb = get_tb();
56 /* Compute future tb value when yield should expire */ 56 /* Compute future tb value when yield should expire */
57 HvCall_yieldProcessor(HvCall_YieldTimed, tb+tb_ticks_per_jiffy); 57 HvCall_yieldProcessor(HvCall_YieldTimed, tb+tb_ticks_per_jiffy);
58 58
59 yieldTime = get_tb() - tb; 59 yieldTime = get_tb() - tb;
60 if (yieldTime > maxYieldTime) 60 if (yieldTime > maxYieldTime)
61 maxYieldTime = yieldTime; 61 maxYieldTime = yieldTime;
62 62
63 if (yieldTime < minYieldTime) 63 if (yieldTime < minYieldTime)
64 minYieldTime = yieldTime; 64 minYieldTime = yieldTime;
65 65
66 /* 66 /*
67 * The decrementer stops during the yield. Force a fake decrementer 67 * The decrementer stops during the yield. Force a fake decrementer
68 * here and let the timer_interrupt code sort out the actual time. 68 * here and let the timer_interrupt code sort out the actual time.
69 */ 69 */
70 get_paca()->lppaca.int_dword.fields.decr_int = 1; 70 get_paca()->lppaca.int_dword.fields.decr_int = 1;
71 process_iSeries_events(); 71 process_iSeries_events();
72 } 72 }
73 73
74 static int iSeries_idle(void) 74 static int iSeries_idle(void)
75 { 75 {
76 struct paca_struct *lpaca; 76 struct paca_struct *lpaca;
77 long oldval; 77 long oldval;
78 78
79 /* ensure iSeries run light will be out when idle */ 79 /* ensure iSeries run light will be out when idle */
80 ppc64_runlatch_off(); 80 ppc64_runlatch_off();
81 81
82 lpaca = get_paca(); 82 lpaca = get_paca();
83 83
84 while (1) { 84 while (1) {
85 if (lpaca->lppaca.shared_proc) { 85 if (lpaca->lppaca.shared_proc) {
86 if (ItLpQueue_isLpIntPending(lpaca->lpqueue_ptr)) 86 if (ItLpQueue_isLpIntPending(lpaca->lpqueue_ptr))
87 process_iSeries_events(); 87 process_iSeries_events();
88 if (!need_resched()) 88 if (!need_resched())
89 yield_shared_processor(); 89 yield_shared_processor();
90 } else { 90 } else {
91 oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED); 91 oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
92 92
93 if (!oldval) { 93 if (!oldval) {
94 set_thread_flag(TIF_POLLING_NRFLAG); 94 set_thread_flag(TIF_POLLING_NRFLAG);
95 95
96 while (!need_resched()) { 96 while (!need_resched()) {
97 HMT_medium(); 97 HMT_medium();
98 if (ItLpQueue_isLpIntPending(lpaca->lpqueue_ptr)) 98 if (ItLpQueue_isLpIntPending(lpaca->lpqueue_ptr))
99 process_iSeries_events(); 99 process_iSeries_events();
100 HMT_low(); 100 HMT_low();
101 } 101 }
102 102
103 HMT_medium(); 103 HMT_medium();
104 clear_thread_flag(TIF_POLLING_NRFLAG); 104 clear_thread_flag(TIF_POLLING_NRFLAG);
105 } else { 105 } else {
106 set_need_resched(); 106 set_need_resched();
107 } 107 }
108 } 108 }
109 109
110 ppc64_runlatch_on(); 110 ppc64_runlatch_on();
111 schedule(); 111 schedule();
112 ppc64_runlatch_off(); 112 ppc64_runlatch_off();
113 } 113 }
114 114
115 return 0; 115 return 0;
116 } 116 }
117 117
118 #else 118 #else
119 119
120 static int default_idle(void) 120 static int default_idle(void)
121 { 121 {
122 long oldval; 122 long oldval;
123 unsigned int cpu = smp_processor_id(); 123 unsigned int cpu = smp_processor_id();
124 124
125 while (1) { 125 while (1) {
126 oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED); 126 oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
127 127
128 if (!oldval) { 128 if (!oldval) {
129 set_thread_flag(TIF_POLLING_NRFLAG); 129 set_thread_flag(TIF_POLLING_NRFLAG);
130 130
131 while (!need_resched() && !cpu_is_offline(cpu)) { 131 while (!need_resched() && !cpu_is_offline(cpu)) {
132 barrier(); 132 barrier();
133 /* 133 /*
134 * Go into low thread priority and possibly 134 * Go into low thread priority and possibly
135 * low power mode. 135 * low power mode.
136 */ 136 */
137 HMT_low(); 137 HMT_low();
138 HMT_very_low(); 138 HMT_very_low();
139 } 139 }
140 140
141 HMT_medium(); 141 HMT_medium();
142 clear_thread_flag(TIF_POLLING_NRFLAG); 142 clear_thread_flag(TIF_POLLING_NRFLAG);
143 } else { 143 } else {
144 set_need_resched(); 144 set_need_resched();
145 } 145 }
146 146
147 schedule(); 147 schedule();
148 if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING) 148 if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING)
149 cpu_die(); 149 cpu_die();
150 } 150 }
151 151
152 return 0; 152 return 0;
153 } 153 }
154 154
155 #ifdef CONFIG_PPC_PSERIES 155 #ifdef CONFIG_PPC_PSERIES
156 156
157 DECLARE_PER_CPU(unsigned long, smt_snooze_delay); 157 DECLARE_PER_CPU(unsigned long, smt_snooze_delay);
158 158
159 int dedicated_idle(void) 159 int dedicated_idle(void)
160 { 160 {
161 long oldval; 161 long oldval;
162 struct paca_struct *lpaca = get_paca(), *ppaca; 162 struct paca_struct *lpaca = get_paca(), *ppaca;
163 unsigned long start_snooze; 163 unsigned long start_snooze;
164 unsigned long *smt_snooze_delay = &__get_cpu_var(smt_snooze_delay); 164 unsigned long *smt_snooze_delay = &__get_cpu_var(smt_snooze_delay);
165 unsigned int cpu = smp_processor_id(); 165 unsigned int cpu = smp_processor_id();
166 166
167 ppaca = &paca[cpu ^ 1]; 167 ppaca = &paca[cpu ^ 1];
168 168
169 while (1) { 169 while (1) {
170 /* 170 /*
171 * Indicate to the HV that we are idle. Now would be 171 * Indicate to the HV that we are idle. Now would be
172 * a good time to find other work to dispatch. 172 * a good time to find other work to dispatch.
173 */ 173 */
174 lpaca->lppaca.idle = 1; 174 lpaca->lppaca.idle = 1;
175 175
176 oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED); 176 oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
177 if (!oldval) { 177 if (!oldval) {
178 set_thread_flag(TIF_POLLING_NRFLAG); 178 set_thread_flag(TIF_POLLING_NRFLAG);
179 start_snooze = __get_tb() + 179 start_snooze = __get_tb() +
180 *smt_snooze_delay * tb_ticks_per_usec; 180 *smt_snooze_delay * tb_ticks_per_usec;
181 while (!need_resched() && !cpu_is_offline(cpu)) { 181 while (!need_resched() && !cpu_is_offline(cpu)) {
182 /* 182 /*
183 * Go into low thread priority and possibly 183 * Go into low thread priority and possibly
184 * low power mode. 184 * low power mode.
185 */ 185 */
186 HMT_low(); 186 HMT_low();
187 HMT_very_low(); 187 HMT_very_low();
188 188
189 if (*smt_snooze_delay == 0 || 189 if (*smt_snooze_delay == 0 ||
190 __get_tb() < start_snooze) 190 __get_tb() < start_snooze)
191 continue; 191 continue;
192 192
193 HMT_medium(); 193 HMT_medium();
194 194
195 if (!(ppaca->lppaca.idle)) { 195 if (!(ppaca->lppaca.idle)) {
196 local_irq_disable(); 196 local_irq_disable();
197 197
198 /* 198 /*
199 * We are about to sleep the thread 199 * We are about to sleep the thread
200 * and so won't be polling any 200 * and so won't be polling any
201 * more. 201 * more.
202 */ 202 */
203 clear_thread_flag(TIF_POLLING_NRFLAG); 203 clear_thread_flag(TIF_POLLING_NRFLAG);
204 204
205 /* 205 /*
206 * SMT dynamic mode. Cede will result 206 * SMT dynamic mode. Cede will result
207 * in this thread going dormant, if the 207 * in this thread going dormant, if the
208 * partner thread is still doing work. 208 * partner thread is still doing work.
209 * Thread wakes up if partner goes idle, 209 * Thread wakes up if partner goes idle,
210 * an interrupt is presented, or a prod 210 * an interrupt is presented, or a prod
211 * occurs. Returning from the cede 211 * occurs. Returning from the cede
212 * enables external interrupts. 212 * enables external interrupts.
213 */ 213 */
214 if (!need_resched()) 214 if (!need_resched())
215 cede_processor(); 215 cede_processor();
216 else 216 else
217 local_irq_enable(); 217 local_irq_enable();
218 } else { 218 } else {
219 /* 219 /*
220 * Give the HV an opportunity at the 220 * Give the HV an opportunity at the
221 * processor, since we are not doing 221 * processor, since we are not doing
222 * any work. 222 * any work.
223 */ 223 */
224 poll_pending(); 224 poll_pending();
225 } 225 }
226 } 226 }
227 227
228 clear_thread_flag(TIF_POLLING_NRFLAG); 228 clear_thread_flag(TIF_POLLING_NRFLAG);
229 } else { 229 } else {
230 set_need_resched(); 230 set_need_resched();
231 } 231 }
232 232
233 HMT_medium(); 233 HMT_medium();
234 lpaca->lppaca.idle = 0; 234 lpaca->lppaca.idle = 0;
235 schedule(); 235 schedule();
236 if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING) 236 if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING)
237 cpu_die(); 237 cpu_die();
238 } 238 }
239 return 0; 239 return 0;
240 } 240 }
241 241
242 static int shared_idle(void) 242 static int shared_idle(void)
243 { 243 {
244 struct paca_struct *lpaca = get_paca(); 244 struct paca_struct *lpaca = get_paca();
245 unsigned int cpu = smp_processor_id(); 245 unsigned int cpu = smp_processor_id();
246 246
247 while (1) { 247 while (1) {
248 /* 248 /*
249 * Indicate to the HV that we are idle. Now would be 249 * Indicate to the HV that we are idle. Now would be
250 * a good time to find other work to dispatch. 250 * a good time to find other work to dispatch.
251 */ 251 */
252 lpaca->lppaca.idle = 1; 252 lpaca->lppaca.idle = 1;
253 253
254 while (!need_resched() && !cpu_is_offline(cpu)) { 254 while (!need_resched() && !cpu_is_offline(cpu)) {
255 local_irq_disable(); 255 local_irq_disable();
256 256
257 /* 257 /*
258 * Yield the processor to the hypervisor. We return if 258 * Yield the processor to the hypervisor. We return if
259 * an external interrupt occurs (which are driven prior 259 * an external interrupt occurs (which are driven prior
260 * to returning here) or if a prod occurs from another 260 * to returning here) or if a prod occurs from another
261 * processor. When returning here, external interrupts 261 * processor. When returning here, external interrupts
262 * are enabled. 262 * are enabled.
263 * 263 *
264 * Check need_resched() again with interrupts disabled 264 * Check need_resched() again with interrupts disabled
265 * to avoid a race. 265 * to avoid a race.
266 */ 266 */
267 if (!need_resched()) 267 if (!need_resched())
268 cede_processor(); 268 cede_processor();
269 else 269 else
270 local_irq_enable(); 270 local_irq_enable();
271 } 271 }
272 272
273 HMT_medium(); 273 HMT_medium();
274 lpaca->lppaca.idle = 0; 274 lpaca->lppaca.idle = 0;
275 schedule(); 275 schedule();
276 if (cpu_is_offline(smp_processor_id()) && 276 if (cpu_is_offline(smp_processor_id()) &&
277 system_state == SYSTEM_RUNNING) 277 system_state == SYSTEM_RUNNING)
278 cpu_die(); 278 cpu_die();
279 } 279 }
280 280
281 return 0; 281 return 0;
282 } 282 }
283 283
284 #endif /* CONFIG_PPC_PSERIES */ 284 #endif /* CONFIG_PPC_PSERIES */
285 285
286 static int native_idle(void) 286 static int native_idle(void)
287 { 287 {
288 while(1) { 288 while(1) {
289 /* check CPU type here */ 289 /* check CPU type here */
290 if (!need_resched()) 290 if (!need_resched())
291 power4_idle(); 291 power4_idle();
292 if (need_resched()) 292 if (need_resched())
293 schedule(); 293 schedule();
294 294
295 if (cpu_is_offline(_smp_processor_id()) && 295 if (cpu_is_offline(raw_smp_processor_id()) &&
296 system_state == SYSTEM_RUNNING) 296 system_state == SYSTEM_RUNNING)
297 cpu_die(); 297 cpu_die();
298 } 298 }
299 return 0; 299 return 0;
300 } 300 }
301 301
302 #endif /* CONFIG_PPC_ISERIES */ 302 #endif /* CONFIG_PPC_ISERIES */
303 303
304 void cpu_idle(void) 304 void cpu_idle(void)
305 { 305 {
306 idle_loop(); 306 idle_loop();
307 } 307 }
308 308
309 int powersave_nap; 309 int powersave_nap;
310 310
311 #ifdef CONFIG_SYSCTL 311 #ifdef CONFIG_SYSCTL
312 /* 312 /*
313 * Register the sysctl to set/clear powersave_nap. 313 * Register the sysctl to set/clear powersave_nap.
314 */ 314 */
315 static ctl_table powersave_nap_ctl_table[]={ 315 static ctl_table powersave_nap_ctl_table[]={
316 { 316 {
317 .ctl_name = KERN_PPC_POWERSAVE_NAP, 317 .ctl_name = KERN_PPC_POWERSAVE_NAP,
318 .procname = "powersave-nap", 318 .procname = "powersave-nap",
319 .data = &powersave_nap, 319 .data = &powersave_nap,
320 .maxlen = sizeof(int), 320 .maxlen = sizeof(int),
321 .mode = 0644, 321 .mode = 0644,
322 .proc_handler = &proc_dointvec, 322 .proc_handler = &proc_dointvec,
323 }, 323 },
324 { 0, }, 324 { 0, },
325 }; 325 };
326 static ctl_table powersave_nap_sysctl_root[] = { 326 static ctl_table powersave_nap_sysctl_root[] = {
327 { 1, "kernel", NULL, 0, 0755, powersave_nap_ctl_table, }, 327 { 1, "kernel", NULL, 0, 0755, powersave_nap_ctl_table, },
328 { 0,}, 328 { 0,},
329 }; 329 };
330 330
331 static int __init 331 static int __init
332 register_powersave_nap_sysctl(void) 332 register_powersave_nap_sysctl(void)
333 { 333 {
334 register_sysctl_table(powersave_nap_sysctl_root, 0); 334 register_sysctl_table(powersave_nap_sysctl_root, 0);
335 335
336 return 0; 336 return 0;
337 } 337 }
338 __initcall(register_powersave_nap_sysctl); 338 __initcall(register_powersave_nap_sysctl);
339 #endif 339 #endif
340 340
341 int idle_setup(void) 341 int idle_setup(void)
342 { 342 {
343 /* 343 /*
344 * Move that junk to each platform specific file, eventually define 344 * Move that junk to each platform specific file, eventually define
345 * a pSeries_idle for shared processor stuff 345 * a pSeries_idle for shared processor stuff
346 */ 346 */
347 #ifdef CONFIG_PPC_ISERIES 347 #ifdef CONFIG_PPC_ISERIES
348 idle_loop = iSeries_idle; 348 idle_loop = iSeries_idle;
349 return 1; 349 return 1;
350 #else 350 #else
351 idle_loop = default_idle; 351 idle_loop = default_idle;
352 #endif 352 #endif
353 #ifdef CONFIG_PPC_PSERIES 353 #ifdef CONFIG_PPC_PSERIES
354 if (systemcfg->platform & PLATFORM_PSERIES) { 354 if (systemcfg->platform & PLATFORM_PSERIES) {
355 if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) { 355 if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) {
356 if (get_paca()->lppaca.shared_proc) { 356 if (get_paca()->lppaca.shared_proc) {
357 printk(KERN_INFO "Using shared processor idle loop\n"); 357 printk(KERN_INFO "Using shared processor idle loop\n");
358 idle_loop = shared_idle; 358 idle_loop = shared_idle;
359 } else { 359 } else {
360 printk(KERN_INFO "Using dedicated idle loop\n"); 360 printk(KERN_INFO "Using dedicated idle loop\n");
361 idle_loop = dedicated_idle; 361 idle_loop = dedicated_idle;
362 } 362 }
363 } else { 363 } else {
364 printk(KERN_INFO "Using default idle loop\n"); 364 printk(KERN_INFO "Using default idle loop\n");
365 idle_loop = default_idle; 365 idle_loop = default_idle;
366 } 366 }
367 } 367 }
368 #endif /* CONFIG_PPC_PSERIES */ 368 #endif /* CONFIG_PPC_PSERIES */
369 #ifndef CONFIG_PPC_ISERIES 369 #ifndef CONFIG_PPC_ISERIES
370 if (systemcfg->platform == PLATFORM_POWERMAC || 370 if (systemcfg->platform == PLATFORM_POWERMAC ||
371 systemcfg->platform == PLATFORM_MAPLE) { 371 systemcfg->platform == PLATFORM_MAPLE) {
372 printk(KERN_INFO "Using native/NAP idle loop\n"); 372 printk(KERN_INFO "Using native/NAP idle loop\n");
373 idle_loop = native_idle; 373 idle_loop = native_idle;
374 } 374 }
375 #endif /* CONFIG_PPC_ISERIES */ 375 #endif /* CONFIG_PPC_ISERIES */
376 376
377 return 1; 377 return 1;
378 } 378 }
379 379
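
The idle loops above all follow the same polling handshake: atomically consume TIF_NEED_RESCHED, and if it was clear, advertise TIF_POLLING_NRFLAG and spin at low thread priority until the flag is set again, so a remote CPU can simply set the flag instead of sending a reschedule interrupt. A userspace sketch of that handshake, using C11 atomics and a pthread in place of another CPU (illustrative only):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <unistd.h>

    static atomic_int need_resched;   /* stand-in for TIF_NEED_RESCHED */
    static atomic_int polling;        /* stand-in for TIF_POLLING_NRFLAG */

    static void *waker(void *arg)
    {
        (void)arg;
        usleep(100 * 1000);               /* some task becomes runnable... */
        atomic_store(&need_resched, 1);   /* ...and the poller notices without an IPI */
        return NULL;
    }

    int main(void)
    {
        pthread_t t;
        pthread_create(&t, NULL, waker, NULL);

        /* test_and_clear_thread_flag(TIF_NEED_RESCHED) */
        int oldval = atomic_exchange(&need_resched, 0);
        if (!oldval) {
            atomic_store(&polling, 1);    /* set_thread_flag(TIF_POLLING_NRFLAG) */
            while (!atomic_load(&need_resched))
                ;                         /* HMT_low()/HMT_very_low() would go here */
            atomic_store(&polling, 0);    /* clear_thread_flag(TIF_POLLING_NRFLAG) */
        }
        printf("need_resched seen, would call schedule() now\n");

        pthread_join(t, NULL);
        return 0;
    }
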
1 /* 1 /*
2 * Precise Delay Loops for SuperH 2 * Precise Delay Loops for SuperH
3 * 3 *
4 * Copyright (C) 1999 Niibe Yutaka & Kaz Kojima 4 * Copyright (C) 1999 Niibe Yutaka & Kaz Kojima
5 */ 5 */
6 6
7 #include <linux/sched.h> 7 #include <linux/sched.h>
8 #include <linux/delay.h> 8 #include <linux/delay.h>
9 9
10 void __delay(unsigned long loops) 10 void __delay(unsigned long loops)
11 { 11 {
12 __asm__ __volatile__( 12 __asm__ __volatile__(
13 "tst %0, %0\n\t" 13 "tst %0, %0\n\t"
14 "1:\t" 14 "1:\t"
15 "bf/s 1b\n\t" 15 "bf/s 1b\n\t"
16 " dt %0" 16 " dt %0"
17 : "=r" (loops) 17 : "=r" (loops)
18 : "0" (loops) 18 : "0" (loops)
19 : "t"); 19 : "t");
20 } 20 }
21 21
22 inline void __const_udelay(unsigned long xloops) 22 inline void __const_udelay(unsigned long xloops)
23 { 23 {
24 __asm__("dmulu.l %0, %2\n\t" 24 __asm__("dmulu.l %0, %2\n\t"
25 "sts mach, %0" 25 "sts mach, %0"
26 : "=r" (xloops) 26 : "=r" (xloops)
27 : "0" (xloops), "r" (cpu_data[_smp_processor_id()].loops_per_jiffy) 27 : "0" (xloops), "r" (cpu_data[raw_smp_processor_id()].loops_per_jiffy)
28 : "macl", "mach"); 28 : "macl", "mach");
29 __delay(xloops * HZ); 29 __delay(xloops * HZ);
30 } 30 }
31 31
32 void __udelay(unsigned long usecs) 32 void __udelay(unsigned long usecs)
33 { 33 {
34 __const_udelay(usecs * 0x000010c6); /* 2**32 / 1000000 */ 34 __const_udelay(usecs * 0x000010c6); /* 2**32 / 1000000 */
35 } 35 }
36 36
37 void __ndelay(unsigned long nsecs) 37 void __ndelay(unsigned long nsecs)
38 { 38 {
39 __const_udelay(nsecs * 0x00000005); 39 __const_udelay(nsecs * 0x00000005);
40 } 40 }
41 41
42 42
arch/sparc64/lib/delay.c
1 /* delay.c: Delay loops for sparc64 1 /* delay.c: Delay loops for sparc64
2 * 2 *
3 * Copyright (C) 2004 David S. Miller <davem@redhat.com> 3 * Copyright (C) 2004 David S. Miller <davem@redhat.com>
4 * 4 *
5 * Based heavily upon x86 variant which is: 5 * Based heavily upon x86 variant which is:
6 * Copyright (C) 1993 Linus Torvalds 6 * Copyright (C) 1993 Linus Torvalds
7 * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz> 7 * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
8 */ 8 */
9 9
10 #include <linux/delay.h> 10 #include <linux/delay.h>
11 11
12 void __delay(unsigned long loops) 12 void __delay(unsigned long loops)
13 { 13 {
14 __asm__ __volatile__( 14 __asm__ __volatile__(
15 " b,pt %%xcc, 1f\n" 15 " b,pt %%xcc, 1f\n"
16 " cmp %0, 0\n" 16 " cmp %0, 0\n"
17 " .align 32\n" 17 " .align 32\n"
18 "1:\n" 18 "1:\n"
19 " bne,pt %%xcc, 1b\n" 19 " bne,pt %%xcc, 1b\n"
20 " subcc %0, 1, %0\n" 20 " subcc %0, 1, %0\n"
21 : "=&r" (loops) 21 : "=&r" (loops)
22 : "0" (loops) 22 : "0" (loops)
23 : "cc"); 23 : "cc");
24 } 24 }
25 25
26 /* We used to multiply by HZ after shifting down by 32 bits 26 /* We used to multiply by HZ after shifting down by 32 bits
27 * but that runs into problems for higher values of HZ and 27 * but that runs into problems for higher values of HZ and
28 * slow cpus. 28 * slow cpus.
29 */ 29 */
30 void __const_udelay(unsigned long n) 30 void __const_udelay(unsigned long n)
31 { 31 {
32 n *= 4; 32 n *= 4;
33 33
34 n *= (cpu_data(_smp_processor_id()).udelay_val * (HZ/4)); 34 n *= (cpu_data(raw_smp_processor_id()).udelay_val * (HZ/4));
35 n >>= 32; 35 n >>= 32;
36 36
37 __delay(n + 1); 37 __delay(n + 1);
38 } 38 }
39 39
40 void __udelay(unsigned long n) 40 void __udelay(unsigned long n)
41 { 41 {
42 __const_udelay(n * 0x10c7UL); 42 __const_udelay(n * 0x10c7UL);
43 } 43 }
44 44
45 45
46 void __ndelay(unsigned long n) 46 void __ndelay(unsigned long n)
47 { 47 {
48 __const_udelay(n * 0x5UL); 48 __const_udelay(n * 0x5UL);
49 } 49 }
50 50
arch/x86_64/lib/delay.c
1 /* 1 /*
2 * Precise Delay Loops for x86-64 2 * Precise Delay Loops for x86-64
3 * 3 *
4 * Copyright (C) 1993 Linus Torvalds 4 * Copyright (C) 1993 Linus Torvalds
5 * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz> 5 * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
6 * 6 *
7 * The __delay function must _NOT_ be inlined as its execution time 7 * The __delay function must _NOT_ be inlined as its execution time
8 * depends wildly on alignment on many x86 processors. 8 * depends wildly on alignment on many x86 processors.
9 */ 9 */
10 10
11 #include <linux/config.h> 11 #include <linux/config.h>
12 #include <linux/sched.h> 12 #include <linux/sched.h>
13 #include <linux/delay.h> 13 #include <linux/delay.h>
14 #include <asm/delay.h> 14 #include <asm/delay.h>
15 15
16 #ifdef CONFIG_SMP 16 #ifdef CONFIG_SMP
17 #include <asm/smp.h> 17 #include <asm/smp.h>
18 #endif 18 #endif
19 19
20 int x86_udelay_tsc = 0; /* Delay via TSC */ 20 int x86_udelay_tsc = 0; /* Delay via TSC */
21 21
22 void __delay(unsigned long loops) 22 void __delay(unsigned long loops)
23 { 23 {
24 unsigned bclock, now; 24 unsigned bclock, now;
25 25
26 rdtscl(bclock); 26 rdtscl(bclock);
27 do 27 do
28 { 28 {
29 rep_nop(); 29 rep_nop();
30 rdtscl(now); 30 rdtscl(now);
31 } 31 }
32 while((now-bclock) < loops); 32 while((now-bclock) < loops);
33 } 33 }
34 34
35 inline void __const_udelay(unsigned long xloops) 35 inline void __const_udelay(unsigned long xloops)
36 { 36 {
37 __delay(((xloops * cpu_data[_smp_processor_id()].loops_per_jiffy) >> 32) * HZ); 37 __delay(((xloops * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32) * HZ);
38 } 38 }
39 39
40 void __udelay(unsigned long usecs) 40 void __udelay(unsigned long usecs)
41 { 41 {
42 __const_udelay(usecs * 0x000010c6); /* 2**32 / 1000000 */ 42 __const_udelay(usecs * 0x000010c6); /* 2**32 / 1000000 */
43 } 43 }
44 44
45 void __ndelay(unsigned long nsecs) 45 void __ndelay(unsigned long nsecs)
46 { 46 {
47 __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ 47 __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
48 } 48 }
49 49
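
__delay() above busy-waits on the time-stamp counter rather than running a calibrated loop. A userspace sketch of the same idea (x86 only; the cycles-per-microsecond figure is an assumed calibration, standing in for loops_per_jiffy):

    #include <stdio.h>
    #include <x86intrin.h>

    static void tsc_delay(unsigned long long cycles)
    {
        unsigned long long start = __rdtsc();
        while (__rdtsc() - start < cycles)
            _mm_pause();                  /* rep_nop() in the kernel version */
    }

    int main(void)
    {
        const unsigned long long cycles_per_usec = 3000;   /* assumed ~3 GHz TSC */
        tsc_delay(100 * cycles_per_usec);                  /* roughly 100 microseconds */
        printf("busy-waited ~100us worth of TSC cycles\n");
        return 0;
    }
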
drivers/acpi/processor_idle.c
1 /* 1 /*
2 * processor_idle - idle state submodule to the ACPI processor driver 2 * processor_idle - idle state submodule to the ACPI processor driver
3 * 3 *
4 * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com> 4 * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
5 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> 5 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
6 * Copyright (C) 2004 Dominik Brodowski <linux@brodo.de> 6 * Copyright (C) 2004 Dominik Brodowski <linux@brodo.de>
7 * Copyright (C) 2004 Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com> 7 * Copyright (C) 2004 Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
8 * - Added processor hotplug support 8 * - Added processor hotplug support
9 * 9 *
10 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 10 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
11 * 11 *
12 * This program is free software; you can redistribute it and/or modify 12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by 13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or (at 14 * the Free Software Foundation; either version 2 of the License, or (at
15 * your option) any later version. 15 * your option) any later version.
16 * 16 *
17 * This program is distributed in the hope that it will be useful, but 17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of 18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details. 20 * General Public License for more details.
21 * 21 *
22 * You should have received a copy of the GNU General Public License along 22 * You should have received a copy of the GNU General Public License along
23 * with this program; if not, write to the Free Software Foundation, Inc., 23 * with this program; if not, write to the Free Software Foundation, Inc.,
24 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 24 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
25 * 25 *
26 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 26 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
27 */ 27 */
28 28
29 #include <linux/kernel.h> 29 #include <linux/kernel.h>
30 #include <linux/module.h> 30 #include <linux/module.h>
31 #include <linux/init.h> 31 #include <linux/init.h>
32 #include <linux/cpufreq.h> 32 #include <linux/cpufreq.h>
33 #include <linux/proc_fs.h> 33 #include <linux/proc_fs.h>
34 #include <linux/seq_file.h> 34 #include <linux/seq_file.h>
35 #include <linux/acpi.h> 35 #include <linux/acpi.h>
36 #include <linux/dmi.h> 36 #include <linux/dmi.h>
37 #include <linux/moduleparam.h> 37 #include <linux/moduleparam.h>
38 38
39 #include <asm/io.h> 39 #include <asm/io.h>
40 #include <asm/uaccess.h> 40 #include <asm/uaccess.h>
41 41
42 #include <acpi/acpi_bus.h> 42 #include <acpi/acpi_bus.h>
43 #include <acpi/processor.h> 43 #include <acpi/processor.h>
44 44
45 #define ACPI_PROCESSOR_COMPONENT 0x01000000 45 #define ACPI_PROCESSOR_COMPONENT 0x01000000
46 #define ACPI_PROCESSOR_CLASS "processor" 46 #define ACPI_PROCESSOR_CLASS "processor"
47 #define ACPI_PROCESSOR_DRIVER_NAME "ACPI Processor Driver" 47 #define ACPI_PROCESSOR_DRIVER_NAME "ACPI Processor Driver"
48 #define _COMPONENT ACPI_PROCESSOR_COMPONENT 48 #define _COMPONENT ACPI_PROCESSOR_COMPONENT
49 ACPI_MODULE_NAME ("acpi_processor") 49 ACPI_MODULE_NAME ("acpi_processor")
50 50
51 #define ACPI_PROCESSOR_FILE_POWER "power" 51 #define ACPI_PROCESSOR_FILE_POWER "power"
52 52
53 #define US_TO_PM_TIMER_TICKS(t) ((t * (PM_TIMER_FREQUENCY/1000)) / 1000) 53 #define US_TO_PM_TIMER_TICKS(t) ((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
54 #define C2_OVERHEAD 4 /* 1us (3.579 ticks per us) */ 54 #define C2_OVERHEAD 4 /* 1us (3.579 ticks per us) */
55 #define C3_OVERHEAD 4 /* 1us (3.579 ticks per us) */ 55 #define C3_OVERHEAD 4 /* 1us (3.579 ticks per us) */
56 56
57 static void (*pm_idle_save)(void); 57 static void (*pm_idle_save)(void);
58 module_param(max_cstate, uint, 0644); 58 module_param(max_cstate, uint, 0644);
59 59
60 static unsigned int nocst = 0; 60 static unsigned int nocst = 0;
61 module_param(nocst, uint, 0000); 61 module_param(nocst, uint, 0000);
62 62
63 /* 63 /*
64 * bm_history -- bit-mask with a bit per jiffy of bus-master activity 64 * bm_history -- bit-mask with a bit per jiffy of bus-master activity
65 * 1000 HZ: 0xFFFFFFFF: 32 jiffies = 32ms 65 * 1000 HZ: 0xFFFFFFFF: 32 jiffies = 32ms
66 * 800 HZ: 0xFFFFFFFF: 32 jiffies = 40ms 66 * 800 HZ: 0xFFFFFFFF: 32 jiffies = 40ms
67 * 100 HZ: 0x0000000F: 4 jiffies = 40ms 67 * 100 HZ: 0x0000000F: 4 jiffies = 40ms
68 * reduce history for more aggressive entry into C3 68 * reduce history for more aggressive entry into C3
69 */ 69 */
70 static unsigned int bm_history = (HZ >= 800 ? 0xFFFFFFFF : ((1U << (HZ / 25)) - 1)); 70 static unsigned int bm_history = (HZ >= 800 ? 0xFFFFFFFF : ((1U << (HZ / 25)) - 1));
71 module_param(bm_history, uint, 0644); 71 module_param(bm_history, uint, 0644);
72 /* -------------------------------------------------------------------------- 72 /* --------------------------------------------------------------------------
73 Power Management 73 Power Management
74 -------------------------------------------------------------------------- */ 74 -------------------------------------------------------------------------- */
75 75
76 /* 76 /*
77 * IBM ThinkPad R40e crashes mysteriously when going into C2 or C3. 77 * IBM ThinkPad R40e crashes mysteriously when going into C2 or C3.
78 * For now disable this. Probably a bug somewhere else. 78 * For now disable this. Probably a bug somewhere else.
79 * 79 *
80 * To skip this limit, boot/load with a large max_cstate limit. 80 * To skip this limit, boot/load with a large max_cstate limit.
81 */ 81 */
82 static int no_c2c3(struct dmi_system_id *id) 82 static int no_c2c3(struct dmi_system_id *id)
83 { 83 {
84 if (max_cstate > ACPI_PROCESSOR_MAX_POWER) 84 if (max_cstate > ACPI_PROCESSOR_MAX_POWER)
85 return 0; 85 return 0;
86 86
87 printk(KERN_NOTICE PREFIX "%s detected - C2,C3 disabled." 87 printk(KERN_NOTICE PREFIX "%s detected - C2,C3 disabled."
88 " Override with \"processor.max_cstate=%d\"\n", id->ident, 88 " Override with \"processor.max_cstate=%d\"\n", id->ident,
89 ACPI_PROCESSOR_MAX_POWER + 1); 89 ACPI_PROCESSOR_MAX_POWER + 1);
90 90
91 max_cstate = 1; 91 max_cstate = 1;
92 92
93 return 0; 93 return 0;
94 } 94 }
95 95
96 96
97 97
98 98
99 static struct dmi_system_id __initdata processor_power_dmi_table[] = { 99 static struct dmi_system_id __initdata processor_power_dmi_table[] = {
100 { no_c2c3, "IBM ThinkPad R40e", { 100 { no_c2c3, "IBM ThinkPad R40e", {
101 DMI_MATCH(DMI_BIOS_VENDOR,"IBM"), 101 DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
102 DMI_MATCH(DMI_BIOS_VERSION,"1SET60WW") }}, 102 DMI_MATCH(DMI_BIOS_VERSION,"1SET60WW") }},
103 { no_c2c3, "Medion 41700", { 103 { no_c2c3, "Medion 41700", {
104 DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"), 104 DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"),
105 DMI_MATCH(DMI_BIOS_VERSION,"R01-A1J") }}, 105 DMI_MATCH(DMI_BIOS_VERSION,"R01-A1J") }},
106 {}, 106 {},
107 }; 107 };
108 108
109 109
110 static inline u32 110 static inline u32
111 ticks_elapsed ( 111 ticks_elapsed (
112 u32 t1, 112 u32 t1,
113 u32 t2) 113 u32 t2)
114 { 114 {
115 if (t2 >= t1) 115 if (t2 >= t1)
116 return (t2 - t1); 116 return (t2 - t1);
117 else if (!acpi_fadt.tmr_val_ext) 117 else if (!acpi_fadt.tmr_val_ext)
118 return (((0x00FFFFFF - t1) + t2) & 0x00FFFFFF); 118 return (((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
119 else 119 else
120 return ((0xFFFFFFFF - t1) + t2); 120 return ((0xFFFFFFFF - t1) + t2);
121 } 121 }
122 122
123 123
124 static void 124 static void
125 acpi_processor_power_activate ( 125 acpi_processor_power_activate (
126 struct acpi_processor *pr, 126 struct acpi_processor *pr,
127 struct acpi_processor_cx *new) 127 struct acpi_processor_cx *new)
128 { 128 {
129 struct acpi_processor_cx *old; 129 struct acpi_processor_cx *old;
130 130
131 if (!pr || !new) 131 if (!pr || !new)
132 return; 132 return;
133 133
134 old = pr->power.state; 134 old = pr->power.state;
135 135
136 if (old) 136 if (old)
137 old->promotion.count = 0; 137 old->promotion.count = 0;
138 new->demotion.count = 0; 138 new->demotion.count = 0;
139 139
140 /* Cleanup from old state. */ 140 /* Cleanup from old state. */
141 if (old) { 141 if (old) {
142 switch (old->type) { 142 switch (old->type) {
143 case ACPI_STATE_C3: 143 case ACPI_STATE_C3:
144 /* Disable bus master reload */ 144 /* Disable bus master reload */
145 if (new->type != ACPI_STATE_C3) 145 if (new->type != ACPI_STATE_C3)
146 acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0, ACPI_MTX_DO_NOT_LOCK); 146 acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0, ACPI_MTX_DO_NOT_LOCK);
147 break; 147 break;
148 } 148 }
149 } 149 }
150 150
151 /* Prepare to use new state. */ 151 /* Prepare to use new state. */
152 switch (new->type) { 152 switch (new->type) {
153 case ACPI_STATE_C3: 153 case ACPI_STATE_C3:
154 /* Enable bus master reload */ 154 /* Enable bus master reload */
155 if (old->type != ACPI_STATE_C3) 155 if (old->type != ACPI_STATE_C3)
156 acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1, ACPI_MTX_DO_NOT_LOCK); 156 acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1, ACPI_MTX_DO_NOT_LOCK);
157 break; 157 break;
158 } 158 }
159 159
160 pr->power.state = new; 160 pr->power.state = new;
161 161
162 return; 162 return;
163 } 163 }
164 164
165 165
166 static void acpi_processor_idle (void) 166 static void acpi_processor_idle (void)
167 { 167 {
168 struct acpi_processor *pr = NULL; 168 struct acpi_processor *pr = NULL;
169 struct acpi_processor_cx *cx = NULL; 169 struct acpi_processor_cx *cx = NULL;
170 struct acpi_processor_cx *next_state = NULL; 170 struct acpi_processor_cx *next_state = NULL;
171 int sleep_ticks = 0; 171 int sleep_ticks = 0;
172 u32 t1, t2 = 0; 172 u32 t1, t2 = 0;
173 173
174 pr = processors[_smp_processor_id()]; 174 pr = processors[raw_smp_processor_id()];
175 if (!pr) 175 if (!pr)
176 return; 176 return;
177 177
178 /* 178 /*
179 * Interrupts must be disabled during bus mastering calculations and 179 * Interrupts must be disabled during bus mastering calculations and
180 * for C2/C3 transitions. 180 * for C2/C3 transitions.
181 */ 181 */
182 local_irq_disable(); 182 local_irq_disable();
183 183
184 /* 184 /*
185 * Check whether we truly need to go idle, or should 185 * Check whether we truly need to go idle, or should
186 * reschedule: 186 * reschedule:
187 */ 187 */
188 if (unlikely(need_resched())) { 188 if (unlikely(need_resched())) {
189 local_irq_enable(); 189 local_irq_enable();
190 return; 190 return;
191 } 191 }
192 192
193 cx = pr->power.state; 193 cx = pr->power.state;
194 if (!cx) 194 if (!cx)
195 goto easy_out; 195 goto easy_out;
196 196
197 /* 197 /*
198 * Check BM Activity 198 * Check BM Activity
199 * ----------------- 199 * -----------------
200 * Check for bus mastering activity (if required), record, and check 200 * Check for bus mastering activity (if required), record, and check
201 * for demotion. 201 * for demotion.
202 */ 202 */
203 if (pr->flags.bm_check) { 203 if (pr->flags.bm_check) {
204 u32 bm_status = 0; 204 u32 bm_status = 0;
205 unsigned long diff = jiffies - pr->power.bm_check_timestamp; 205 unsigned long diff = jiffies - pr->power.bm_check_timestamp;
206 206
207 if (diff > 32) 207 if (diff > 32)
208 diff = 32; 208 diff = 32;
209 209
210 while (diff) { 210 while (diff) {
211 /* if we didn't get called, assume there was busmaster activity */ 211 /* if we didn't get called, assume there was busmaster activity */
212 diff--; 212 diff--;
213 if (diff) 213 if (diff)
214 pr->power.bm_activity |= 0x1; 214 pr->power.bm_activity |= 0x1;
215 pr->power.bm_activity <<= 1; 215 pr->power.bm_activity <<= 1;
216 } 216 }
217 217
218 acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, 218 acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS,
219 &bm_status, ACPI_MTX_DO_NOT_LOCK); 219 &bm_status, ACPI_MTX_DO_NOT_LOCK);
220 if (bm_status) { 220 if (bm_status) {
221 pr->power.bm_activity++; 221 pr->power.bm_activity++;
222 acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 222 acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS,
223 1, ACPI_MTX_DO_NOT_LOCK); 223 1, ACPI_MTX_DO_NOT_LOCK);
224 } 224 }
225 /* 225 /*
226 * PIIX4 Erratum #18: Note that BM_STS doesn't always reflect 226 * PIIX4 Erratum #18: Note that BM_STS doesn't always reflect
227 * the true state of bus mastering activity; forcing us to 227 * the true state of bus mastering activity; forcing us to
228 * manually check the BMIDEA bit of each IDE channel. 228 * manually check the BMIDEA bit of each IDE channel.
229 */ 229 */
230 else if (errata.piix4.bmisx) { 230 else if (errata.piix4.bmisx) {
231 if ((inb_p(errata.piix4.bmisx + 0x02) & 0x01) 231 if ((inb_p(errata.piix4.bmisx + 0x02) & 0x01)
232 || (inb_p(errata.piix4.bmisx + 0x0A) & 0x01)) 232 || (inb_p(errata.piix4.bmisx + 0x0A) & 0x01))
233 pr->power.bm_activity++; 233 pr->power.bm_activity++;
234 } 234 }
235 235
236 pr->power.bm_check_timestamp = jiffies; 236 pr->power.bm_check_timestamp = jiffies;
237 237
238 /* 238 /*
239 * Apply bus mastering demotion policy. Automatically demote 239 * Apply bus mastering demotion policy. Automatically demote
240 * to avoid a faulty transition. Note that the processor 240 * to avoid a faulty transition. Note that the processor
241 * won't enter a low-power state during this call (to this 241 * won't enter a low-power state during this call (to this
242 * function) but should upon the next. 242 * function) but should upon the next.
243 * 243 *
244 * TBD: A better policy might be to fall back to the demotion 244 * TBD: A better policy might be to fall back to the demotion
245 * state (use it for this quantum only) instead of 245 * state (use it for this quantum only) instead of
246 * demoting -- and rely on duration as our sole demotion 246 * demoting -- and rely on duration as our sole demotion
247 * qualification. This may, however, introduce DMA 247 * qualification. This may, however, introduce DMA
248 * issues (e.g. floppy DMA transfer overrun/underrun). 248 * issues (e.g. floppy DMA transfer overrun/underrun).
249 */ 249 */
250 if (pr->power.bm_activity & cx->demotion.threshold.bm) { 250 if (pr->power.bm_activity & cx->demotion.threshold.bm) {
251 local_irq_enable(); 251 local_irq_enable();
252 next_state = cx->demotion.state; 252 next_state = cx->demotion.state;
253 goto end; 253 goto end;
254 } 254 }
255 } 255 }
256 256
257 cx->usage++; 257 cx->usage++;
258 258
259 /* 259 /*
260 * Sleep: 260 * Sleep:
261 * ------ 261 * ------
262 * Invoke the current Cx state to put the processor to sleep. 262 * Invoke the current Cx state to put the processor to sleep.
263 */ 263 */
264 switch (cx->type) { 264 switch (cx->type) {
265 265
266 case ACPI_STATE_C1: 266 case ACPI_STATE_C1:
267 /* 267 /*
268 * Invoke C1. 268 * Invoke C1.
269 * Use the appropriate idle routine, the one that would 269 * Use the appropriate idle routine, the one that would
270 * be used without acpi C-states. 270 * be used without acpi C-states.
271 */ 271 */
272 if (pm_idle_save) 272 if (pm_idle_save)
273 pm_idle_save(); 273 pm_idle_save();
274 else 274 else
275 safe_halt(); 275 safe_halt();
276 /* 276 /*
277 * TBD: Can't get time duration while in C1, as resumes 277 * TBD: Can't get time duration while in C1, as resumes
278 * go to an ISR rather than here. Need to instrument 278 * go to an ISR rather than here. Need to instrument
279 * base interrupt handler. 279 * base interrupt handler.
280 */ 280 */
281 sleep_ticks = 0xFFFFFFFF; 281 sleep_ticks = 0xFFFFFFFF;
282 break; 282 break;
283 283
284 case ACPI_STATE_C2: 284 case ACPI_STATE_C2:
285 /* Get start time (ticks) */ 285 /* Get start time (ticks) */
286 t1 = inl(acpi_fadt.xpm_tmr_blk.address); 286 t1 = inl(acpi_fadt.xpm_tmr_blk.address);
287 /* Invoke C2 */ 287 /* Invoke C2 */
288 inb(cx->address); 288 inb(cx->address);
289 /* Dummy op - must do something useless after P_LVL2 read */ 289 /* Dummy op - must do something useless after P_LVL2 read */
290 t2 = inl(acpi_fadt.xpm_tmr_blk.address); 290 t2 = inl(acpi_fadt.xpm_tmr_blk.address);
291 /* Get end time (ticks) */ 291 /* Get end time (ticks) */
292 t2 = inl(acpi_fadt.xpm_tmr_blk.address); 292 t2 = inl(acpi_fadt.xpm_tmr_blk.address);
293 /* Re-enable interrupts */ 293 /* Re-enable interrupts */
294 local_irq_enable(); 294 local_irq_enable();
295 /* Compute time (ticks) that we were actually asleep */ 295 /* Compute time (ticks) that we were actually asleep */
296 sleep_ticks = ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD; 296 sleep_ticks = ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD;
297 break; 297 break;
298 298
299 case ACPI_STATE_C3: 299 case ACPI_STATE_C3:
300 /* Disable bus master arbitration */ 300 /* Disable bus master arbitration */
301 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1, ACPI_MTX_DO_NOT_LOCK); 301 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1, ACPI_MTX_DO_NOT_LOCK);
302 /* Get start time (ticks) */ 302 /* Get start time (ticks) */
303 t1 = inl(acpi_fadt.xpm_tmr_blk.address); 303 t1 = inl(acpi_fadt.xpm_tmr_blk.address);
304 /* Invoke C3 */ 304 /* Invoke C3 */
305 inb(cx->address); 305 inb(cx->address);
306 /* Dummy op - must do something useless after P_LVL3 read */ 306 /* Dummy op - must do something useless after P_LVL3 read */
307 t2 = inl(acpi_fadt.xpm_tmr_blk.address); 307 t2 = inl(acpi_fadt.xpm_tmr_blk.address);
308 /* Get end time (ticks) */ 308 /* Get end time (ticks) */
309 t2 = inl(acpi_fadt.xpm_tmr_blk.address); 309 t2 = inl(acpi_fadt.xpm_tmr_blk.address);
310 /* Enable bus master arbitration */ 310 /* Enable bus master arbitration */
311 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0, ACPI_MTX_DO_NOT_LOCK); 311 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0, ACPI_MTX_DO_NOT_LOCK);
312 /* Re-enable interrupts */ 312 /* Re-enable interrupts */
313 local_irq_enable(); 313 local_irq_enable();
314 /* Compute time (ticks) that we were actually asleep */ 314 /* Compute time (ticks) that we were actually asleep */
315 sleep_ticks = ticks_elapsed(t1, t2) - cx->latency_ticks - C3_OVERHEAD; 315 sleep_ticks = ticks_elapsed(t1, t2) - cx->latency_ticks - C3_OVERHEAD;
316 break; 316 break;
317 317
318 default: 318 default:
319 local_irq_enable(); 319 local_irq_enable();
320 return; 320 return;
321 } 321 }
322 322
323 next_state = pr->power.state; 323 next_state = pr->power.state;
324 324
325 /* 325 /*
326 * Promotion? 326 * Promotion?
327 * ---------- 327 * ----------
328 * Track the number of longs (time asleep is greater than threshold) 328 * Track the number of longs (time asleep is greater than threshold)
329 * and promote when the count threshold is reached. Note that bus 329 * and promote when the count threshold is reached. Note that bus
330 * mastering activity may prevent promotions. 330 * mastering activity may prevent promotions.
331 * Do not promote above max_cstate. 331 * Do not promote above max_cstate.
332 */ 332 */
333 if (cx->promotion.state && 333 if (cx->promotion.state &&
334 ((cx->promotion.state - pr->power.states) <= max_cstate)) { 334 ((cx->promotion.state - pr->power.states) <= max_cstate)) {
335 if (sleep_ticks > cx->promotion.threshold.ticks) { 335 if (sleep_ticks > cx->promotion.threshold.ticks) {
336 cx->promotion.count++; 336 cx->promotion.count++;
337 cx->demotion.count = 0; 337 cx->demotion.count = 0;
338 if (cx->promotion.count >= cx->promotion.threshold.count) { 338 if (cx->promotion.count >= cx->promotion.threshold.count) {
339 if (pr->flags.bm_check) { 339 if (pr->flags.bm_check) {
340 if (!(pr->power.bm_activity & cx->promotion.threshold.bm)) { 340 if (!(pr->power.bm_activity & cx->promotion.threshold.bm)) {
341 next_state = cx->promotion.state; 341 next_state = cx->promotion.state;
342 goto end; 342 goto end;
343 } 343 }
344 } 344 }
345 else { 345 else {
346 next_state = cx->promotion.state; 346 next_state = cx->promotion.state;
347 goto end; 347 goto end;
348 } 348 }
349 } 349 }
350 } 350 }
351 } 351 }
352 352
353 /* 353 /*
354 * Demotion? 354 * Demotion?
355 * --------- 355 * ---------
356 * Track the number of shorts (time asleep is less than time threshold) 356 * Track the number of shorts (time asleep is less than time threshold)
357 * and demote when the usage threshold is reached. 357 * and demote when the usage threshold is reached.
358 */ 358 */
359 if (cx->demotion.state) { 359 if (cx->demotion.state) {
360 if (sleep_ticks < cx->demotion.threshold.ticks) { 360 if (sleep_ticks < cx->demotion.threshold.ticks) {
361 cx->demotion.count++; 361 cx->demotion.count++;
362 cx->promotion.count = 0; 362 cx->promotion.count = 0;
363 if (cx->demotion.count >= cx->demotion.threshold.count) { 363 if (cx->demotion.count >= cx->demotion.threshold.count) {
364 next_state = cx->demotion.state; 364 next_state = cx->demotion.state;
365 goto end; 365 goto end;
366 } 366 }
367 } 367 }
368 } 368 }
369 369
370 end: 370 end:
371 /* 371 /*
372 * Demote if current state exceeds max_cstate 372 * Demote if current state exceeds max_cstate
373 */ 373 */
374 if ((pr->power.state - pr->power.states) > max_cstate) { 374 if ((pr->power.state - pr->power.states) > max_cstate) {
375 if (cx->demotion.state) 375 if (cx->demotion.state)
376 next_state = cx->demotion.state; 376 next_state = cx->demotion.state;
377 } 377 }
378 378
379 /* 379 /*
380 * New Cx State? 380 * New Cx State?
381 * ------------- 381 * -------------
382 * If we're going to start using a new Cx state we must clean up 382 * If we're going to start using a new Cx state we must clean up
383 * from the previous and prepare to use the new. 383 * from the previous and prepare to use the new.
384 */ 384 */
385 if (next_state != pr->power.state) 385 if (next_state != pr->power.state)
386 acpi_processor_power_activate(pr, next_state); 386 acpi_processor_power_activate(pr, next_state);
387 387
388 return; 388 return;
389 389
390 easy_out: 390 easy_out:
391 /* do C1 instead of busy loop */ 391 /* do C1 instead of busy loop */
392 if (pm_idle_save) 392 if (pm_idle_save)
393 pm_idle_save(); 393 pm_idle_save();
394 else 394 else
395 safe_halt(); 395 safe_halt();
396 return; 396 return;
397 } 397 }
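
The policy code above is a counting hysteresis: enough consecutive sleeps longer than the promotion threshold move the processor to a deeper C-state (unless recent bus-master activity vetoes it), and enough consecutive sleeps shorter than the demotion threshold move it back, each kind of event resetting the other counter. The following is a minimal, self-contained userspace sketch of that counting scheme only; the structure, thresholds, and sample values are illustrative stand-ins, not the driver's real types.

/* Hysteresis sketch: +1 = promote to a deeper C-state, -1 = demote, 0 = stay. */
#include <stdio.h>

struct cx_policy {
	int promote_ticks;	/* a sleep longer than this counts as a "long" */
	int promote_count;	/* consecutive longs required to promote */
	int demote_ticks;	/* a sleep shorter than this counts as a "short" */
	int demote_count;	/* consecutive shorts required to demote */
	int longs, shorts;	/* running counters; each event resets the other */
};

static int cx_update(struct cx_policy *p, int sleep_ticks, int bm_active)
{
	if (sleep_ticks > p->promote_ticks) {
		p->shorts = 0;
		if (++p->longs >= p->promote_count && !bm_active) {
			p->longs = 0;
			return +1;		/* promote */
		}
	} else if (sleep_ticks < p->demote_ticks) {
		p->longs = 0;
		if (++p->shorts >= p->demote_count) {
			p->shorts = 0;
			return -1;		/* demote */
		}
	}
	return 0;				/* keep the current state */
}

int main(void)
{
	/* Thresholds mirror the defaults set further down in this file:
	 * demote after one short sleep, promote after four long ones. */
	struct cx_policy p = { .promote_ticks = 100, .promote_count = 4,
			       .demote_ticks = 100, .demote_count = 1 };
	int samples[] = { 150, 150, 150, 150, 20 };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("sleep=%3d -> %+d\n", samples[i], cx_update(&p, samples[i], 0));
	return 0;
}

Running the sketch promotes on the fourth long sample and demotes on the single short one, which is the behaviour the promotion/demotion blocks above implement with their count and bm thresholds.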
398 398
399 399
400 static int 400 static int
401 acpi_processor_set_power_policy ( 401 acpi_processor_set_power_policy (
402 struct acpi_processor *pr) 402 struct acpi_processor *pr)
403 { 403 {
404 unsigned int i; 404 unsigned int i;
405 unsigned int state_is_set = 0; 405 unsigned int state_is_set = 0;
406 struct acpi_processor_cx *lower = NULL; 406 struct acpi_processor_cx *lower = NULL;
407 struct acpi_processor_cx *higher = NULL; 407 struct acpi_processor_cx *higher = NULL;
408 struct acpi_processor_cx *cx; 408 struct acpi_processor_cx *cx;
409 409
410 ACPI_FUNCTION_TRACE("acpi_processor_set_power_policy"); 410 ACPI_FUNCTION_TRACE("acpi_processor_set_power_policy");
411 411
412 if (!pr) 412 if (!pr)
413 return_VALUE(-EINVAL); 413 return_VALUE(-EINVAL);
414 414
415 /* 415 /*
416 * This function sets the default Cx state policy (OS idle handler). 416 * This function sets the default Cx state policy (OS idle handler).
417 * Our scheme is to promote quickly to C2 but more conservatively 417 * Our scheme is to promote quickly to C2 but more conservatively
418 * to C3. We're favoring C2 for its characteristics of low latency 418 * to C3. We're favoring C2 for its characteristics of low latency
419 * (quick response), good power savings, and ability to allow bus 419 * (quick response), good power savings, and ability to allow bus
420 * mastering activity. Note that the Cx state policy is completely 420 * mastering activity. Note that the Cx state policy is completely
421 * customizable and can be altered dynamically. 421 * customizable and can be altered dynamically.
422 */ 422 */
423 423
424 /* startup state */ 424 /* startup state */
425 for (i=1; i < ACPI_PROCESSOR_MAX_POWER; i++) { 425 for (i=1; i < ACPI_PROCESSOR_MAX_POWER; i++) {
426 cx = &pr->power.states[i]; 426 cx = &pr->power.states[i];
427 if (!cx->valid) 427 if (!cx->valid)
428 continue; 428 continue;
429 429
430 if (!state_is_set) 430 if (!state_is_set)
431 pr->power.state = cx; 431 pr->power.state = cx;
432 state_is_set++; 432 state_is_set++;
433 break; 433 break;
434 } 434 }
435 435
436 if (!state_is_set) 436 if (!state_is_set)
437 return_VALUE(-ENODEV); 437 return_VALUE(-ENODEV);
438 438
439 /* demotion */ 439 /* demotion */
440 for (i=1; i < ACPI_PROCESSOR_MAX_POWER; i++) { 440 for (i=1; i < ACPI_PROCESSOR_MAX_POWER; i++) {
441 cx = &pr->power.states[i]; 441 cx = &pr->power.states[i];
442 if (!cx->valid) 442 if (!cx->valid)
443 continue; 443 continue;
444 444
445 if (lower) { 445 if (lower) {
446 cx->demotion.state = lower; 446 cx->demotion.state = lower;
447 cx->demotion.threshold.ticks = cx->latency_ticks; 447 cx->demotion.threshold.ticks = cx->latency_ticks;
448 cx->demotion.threshold.count = 1; 448 cx->demotion.threshold.count = 1;
449 if (cx->type == ACPI_STATE_C3) 449 if (cx->type == ACPI_STATE_C3)
450 cx->demotion.threshold.bm = bm_history; 450 cx->demotion.threshold.bm = bm_history;
451 } 451 }
452 452
453 lower = cx; 453 lower = cx;
454 } 454 }
455 455
456 /* promotion */ 456 /* promotion */
457 for (i = (ACPI_PROCESSOR_MAX_POWER - 1); i > 0; i--) { 457 for (i = (ACPI_PROCESSOR_MAX_POWER - 1); i > 0; i--) {
458 cx = &pr->power.states[i]; 458 cx = &pr->power.states[i];
459 if (!cx->valid) 459 if (!cx->valid)
460 continue; 460 continue;
461 461
462 if (higher) { 462 if (higher) {
463 cx->promotion.state = higher; 463 cx->promotion.state = higher;
464 cx->promotion.threshold.ticks = cx->latency_ticks; 464 cx->promotion.threshold.ticks = cx->latency_ticks;
465 if (cx->type >= ACPI_STATE_C2) 465 if (cx->type >= ACPI_STATE_C2)
466 cx->promotion.threshold.count = 4; 466 cx->promotion.threshold.count = 4;
467 else 467 else
468 cx->promotion.threshold.count = 10; 468 cx->promotion.threshold.count = 10;
469 if (higher->type == ACPI_STATE_C3) 469 if (higher->type == ACPI_STATE_C3)
470 cx->promotion.threshold.bm = bm_history; 470 cx->promotion.threshold.bm = bm_history;
471 } 471 }
472 472
473 higher = cx; 473 higher = cx;
474 } 474 }
475 475
476 return_VALUE(0); 476 return_VALUE(0);
477 } 477 }
478 478
479 479
480 static int acpi_processor_get_power_info_fadt (struct acpi_processor *pr) 480 static int acpi_processor_get_power_info_fadt (struct acpi_processor *pr)
481 { 481 {
482 int i; 482 int i;
483 483
484 ACPI_FUNCTION_TRACE("acpi_processor_get_power_info_fadt"); 484 ACPI_FUNCTION_TRACE("acpi_processor_get_power_info_fadt");
485 485
486 if (!pr) 486 if (!pr)
487 return_VALUE(-EINVAL); 487 return_VALUE(-EINVAL);
488 488
489 if (!pr->pblk) 489 if (!pr->pblk)
490 return_VALUE(-ENODEV); 490 return_VALUE(-ENODEV);
491 491
492 for (i = 0; i < ACPI_PROCESSOR_MAX_POWER; i++) 492 for (i = 0; i < ACPI_PROCESSOR_MAX_POWER; i++)
493 memset(pr->power.states, 0, sizeof(struct acpi_processor_cx)); 493 memset(pr->power.states, 0, sizeof(struct acpi_processor_cx));
494 494
495 /* if info is obtained from pblk/fadt, type equals state */ 495 /* if info is obtained from pblk/fadt, type equals state */
496 pr->power.states[ACPI_STATE_C1].type = ACPI_STATE_C1; 496 pr->power.states[ACPI_STATE_C1].type = ACPI_STATE_C1;
497 pr->power.states[ACPI_STATE_C2].type = ACPI_STATE_C2; 497 pr->power.states[ACPI_STATE_C2].type = ACPI_STATE_C2;
498 pr->power.states[ACPI_STATE_C3].type = ACPI_STATE_C3; 498 pr->power.states[ACPI_STATE_C3].type = ACPI_STATE_C3;
499 499
500 /* the C0 state only exists as a filler in our array, 500 /* the C0 state only exists as a filler in our array,
501 * and all processors need to support C1 */ 501 * and all processors need to support C1 */
502 pr->power.states[ACPI_STATE_C0].valid = 1; 502 pr->power.states[ACPI_STATE_C0].valid = 1;
503 pr->power.states[ACPI_STATE_C1].valid = 1; 503 pr->power.states[ACPI_STATE_C1].valid = 1;
504 504
505 /* determine C2 and C3 address from pblk */ 505 /* determine C2 and C3 address from pblk */
506 pr->power.states[ACPI_STATE_C2].address = pr->pblk + 4; 506 pr->power.states[ACPI_STATE_C2].address = pr->pblk + 4;
507 pr->power.states[ACPI_STATE_C3].address = pr->pblk + 5; 507 pr->power.states[ACPI_STATE_C3].address = pr->pblk + 5;
508 508
509 /* determine latencies from FADT */ 509 /* determine latencies from FADT */
510 pr->power.states[ACPI_STATE_C2].latency = acpi_fadt.plvl2_lat; 510 pr->power.states[ACPI_STATE_C2].latency = acpi_fadt.plvl2_lat;
511 pr->power.states[ACPI_STATE_C3].latency = acpi_fadt.plvl3_lat; 511 pr->power.states[ACPI_STATE_C3].latency = acpi_fadt.plvl3_lat;
512 512
513 ACPI_DEBUG_PRINT((ACPI_DB_INFO, 513 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
514 "lvl2[0x%08x] lvl3[0x%08x]\n", 514 "lvl2[0x%08x] lvl3[0x%08x]\n",
515 pr->power.states[ACPI_STATE_C2].address, 515 pr->power.states[ACPI_STATE_C2].address,
516 pr->power.states[ACPI_STATE_C3].address)); 516 pr->power.states[ACPI_STATE_C3].address));
517 517
518 return_VALUE(0); 518 return_VALUE(0);
519 } 519 }
520 520
521 521
522 static int acpi_processor_get_power_info_cst (struct acpi_processor *pr) 522 static int acpi_processor_get_power_info_cst (struct acpi_processor *pr)
523 { 523 {
524 acpi_status status = 0; 524 acpi_status status = 0;
525 acpi_integer count; 525 acpi_integer count;
526 int i; 526 int i;
527 struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; 527 struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
528 union acpi_object *cst; 528 union acpi_object *cst;
529 529
530 ACPI_FUNCTION_TRACE("acpi_processor_get_power_info_cst"); 530 ACPI_FUNCTION_TRACE("acpi_processor_get_power_info_cst");
531 531
532 if (errata.smp) 532 if (errata.smp)
533 return_VALUE(-ENODEV); 533 return_VALUE(-ENODEV);
534 534
535 if (nocst) 535 if (nocst)
536 return_VALUE(-ENODEV); 536 return_VALUE(-ENODEV);
537 537
538 pr->power.count = 0; 538 pr->power.count = 0;
539 for (i = 0; i < ACPI_PROCESSOR_MAX_POWER; i++) 539 for (i = 0; i < ACPI_PROCESSOR_MAX_POWER; i++)
540 memset(pr->power.states, 0, sizeof(struct acpi_processor_cx)); 540 memset(pr->power.states, 0, sizeof(struct acpi_processor_cx));
541 541
542 status = acpi_evaluate_object(pr->handle, "_CST", NULL, &buffer); 542 status = acpi_evaluate_object(pr->handle, "_CST", NULL, &buffer);
543 if (ACPI_FAILURE(status)) { 543 if (ACPI_FAILURE(status)) {
544 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No _CST, giving up\n")); 544 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No _CST, giving up\n"));
545 return_VALUE(-ENODEV); 545 return_VALUE(-ENODEV);
546 } 546 }
547 547
548 cst = (union acpi_object *) buffer.pointer; 548 cst = (union acpi_object *) buffer.pointer;
549 549
550 /* There must be at least 2 elements */ 550 /* There must be at least 2 elements */
551 if (!cst || (cst->type != ACPI_TYPE_PACKAGE) || cst->package.count < 2) { 551 if (!cst || (cst->type != ACPI_TYPE_PACKAGE) || cst->package.count < 2) {
552 ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "not enough elements in _CST\n")); 552 ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "not enough elements in _CST\n"));
553 status = -EFAULT; 553 status = -EFAULT;
554 goto end; 554 goto end;
555 } 555 }
556 556
557 count = cst->package.elements[0].integer.value; 557 count = cst->package.elements[0].integer.value;
558 558
559 /* Validate number of power states. */ 559 /* Validate number of power states. */
560 if (count < 1 || count != cst->package.count - 1) { 560 if (count < 1 || count != cst->package.count - 1) {
561 ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "count given by _CST is not valid\n")); 561 ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "count given by _CST is not valid\n"));
562 status = -EFAULT; 562 status = -EFAULT;
563 goto end; 563 goto end;
564 } 564 }
565 565
566 /* We support up to ACPI_PROCESSOR_MAX_POWER. */ 566 /* We support up to ACPI_PROCESSOR_MAX_POWER. */
567 if (count > ACPI_PROCESSOR_MAX_POWER) { 567 if (count > ACPI_PROCESSOR_MAX_POWER) {
568 printk(KERN_WARNING "Limiting number of power states to max (%d)\n", ACPI_PROCESSOR_MAX_POWER); 568 printk(KERN_WARNING "Limiting number of power states to max (%d)\n", ACPI_PROCESSOR_MAX_POWER);
569 printk(KERN_WARNING "Please increase ACPI_PROCESSOR_MAX_POWER if needed.\n"); 569 printk(KERN_WARNING "Please increase ACPI_PROCESSOR_MAX_POWER if needed.\n");
570 count = ACPI_PROCESSOR_MAX_POWER; 570 count = ACPI_PROCESSOR_MAX_POWER;
571 } 571 }
572 572
573 /* Tell driver that at least _CST is supported. */ 573 /* Tell driver that at least _CST is supported. */
574 pr->flags.has_cst = 1; 574 pr->flags.has_cst = 1;
575 575
576 for (i = 1; i <= count; i++) { 576 for (i = 1; i <= count; i++) {
577 union acpi_object *element; 577 union acpi_object *element;
578 union acpi_object *obj; 578 union acpi_object *obj;
579 struct acpi_power_register *reg; 579 struct acpi_power_register *reg;
580 struct acpi_processor_cx cx; 580 struct acpi_processor_cx cx;
581 581
582 memset(&cx, 0, sizeof(cx)); 582 memset(&cx, 0, sizeof(cx));
583 583
584 element = (union acpi_object *) &(cst->package.elements[i]); 584 element = (union acpi_object *) &(cst->package.elements[i]);
585 if (element->type != ACPI_TYPE_PACKAGE) 585 if (element->type != ACPI_TYPE_PACKAGE)
586 continue; 586 continue;
587 587
588 if (element->package.count != 4) 588 if (element->package.count != 4)
589 continue; 589 continue;
590 590
591 obj = (union acpi_object *) &(element->package.elements[0]); 591 obj = (union acpi_object *) &(element->package.elements[0]);
592 592
593 if (obj->type != ACPI_TYPE_BUFFER) 593 if (obj->type != ACPI_TYPE_BUFFER)
594 continue; 594 continue;
595 595
596 reg = (struct acpi_power_register *) obj->buffer.pointer; 596 reg = (struct acpi_power_register *) obj->buffer.pointer;
597 597
598 if (reg->space_id != ACPI_ADR_SPACE_SYSTEM_IO && 598 if (reg->space_id != ACPI_ADR_SPACE_SYSTEM_IO &&
599 (reg->space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) 599 (reg->space_id != ACPI_ADR_SPACE_FIXED_HARDWARE))
600 continue; 600 continue;
601 601
602 cx.address = (reg->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE) ? 602 cx.address = (reg->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE) ?
603 0 : reg->address; 603 0 : reg->address;
604 604
605 /* There should be an easy way to extract an integer... */ 605 /* There should be an easy way to extract an integer... */
606 obj = (union acpi_object *) &(element->package.elements[1]); 606 obj = (union acpi_object *) &(element->package.elements[1]);
607 if (obj->type != ACPI_TYPE_INTEGER) 607 if (obj->type != ACPI_TYPE_INTEGER)
608 continue; 608 continue;
609 609
610 cx.type = obj->integer.value; 610 cx.type = obj->integer.value;
611 611
612 if ((cx.type != ACPI_STATE_C1) && 612 if ((cx.type != ACPI_STATE_C1) &&
613 (reg->space_id != ACPI_ADR_SPACE_SYSTEM_IO)) 613 (reg->space_id != ACPI_ADR_SPACE_SYSTEM_IO))
614 continue; 614 continue;
615 615
616 if ((cx.type < ACPI_STATE_C1) || 616 if ((cx.type < ACPI_STATE_C1) ||
617 (cx.type > ACPI_STATE_C3)) 617 (cx.type > ACPI_STATE_C3))
618 continue; 618 continue;
619 619
620 obj = (union acpi_object *) &(element->package.elements[2]); 620 obj = (union acpi_object *) &(element->package.elements[2]);
621 if (obj->type != ACPI_TYPE_INTEGER) 621 if (obj->type != ACPI_TYPE_INTEGER)
622 continue; 622 continue;
623 623
624 cx.latency = obj->integer.value; 624 cx.latency = obj->integer.value;
625 625
626 obj = (union acpi_object *) &(element->package.elements[3]); 626 obj = (union acpi_object *) &(element->package.elements[3]);
627 if (obj->type != ACPI_TYPE_INTEGER) 627 if (obj->type != ACPI_TYPE_INTEGER)
628 continue; 628 continue;
629 629
630 cx.power = obj->integer.value; 630 cx.power = obj->integer.value;
631 631
632 (pr->power.count)++; 632 (pr->power.count)++;
633 memcpy(&(pr->power.states[pr->power.count]), &cx, sizeof(cx)); 633 memcpy(&(pr->power.states[pr->power.count]), &cx, sizeof(cx));
634 } 634 }
635 635
636 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found %d power states\n", pr->power.count)); 636 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found %d power states\n", pr->power.count));
637 637
638 /* Validate number of power states discovered */ 638 /* Validate number of power states discovered */
639 if (pr->power.count < 2) 639 if (pr->power.count < 2)
640 status = -ENODEV; 640 status = -ENODEV;
641 641
642 end: 642 end:
643 acpi_os_free(buffer.pointer); 643 acpi_os_free(buffer.pointer);
644 644
645 return_VALUE(status); 645 return_VALUE(status);
646 } 646 }
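
For reference, the _CST object walked above is a package of the form { count, { register, type, latency, power }, ... }: element 0 of each sub-package is a buffer describing the entry method (address space and address), element 1 the C-state type, element 2 the worst-case latency in microseconds, and element 3 the average power. A compact userspace sketch of that walk over a simplified in-memory representation follows; the field names and sample values are illustrative, not ACPICA's union acpi_object.

/* Simplified stand-in for the _CST entries parsed above. */
#include <stdio.h>

struct cst_entry {
	int fixed_hw;		/* entry method: 1 = fixed hardware, 0 = system I/O */
	unsigned int address;	/* P_LVLx I/O port for system-I/O entries */
	int type;		/* 1 = C1, 2 = C2, 3 = C3 */
	unsigned int latency;	/* worst-case exit latency, microseconds */
	unsigned int power;	/* average power consumption, milliwatts */
};

static int parse_cst(const struct cst_entry *e, int count, int max_states)
{
	int i, found = 0;

	if (count > max_states)
		count = max_states;	/* mirrors the "limiting to max" warning */

	for (i = 0; i < count; i++) {
		if (e[i].type < 1 || e[i].type > 3)
			continue;	/* only C1..C3 are accepted */
		if (e[i].type != 1 && e[i].fixed_hw)
			continue;	/* C2/C3 must use a system-I/O register */
		printf("C%d: address=0x%x latency=%uus power=%umW\n",
		       e[i].type, e[i].fixed_hw ? 0 : e[i].address,
		       e[i].latency, e[i].power);
		found++;
	}
	return found;			/* the caller rejects the table if < 2 */
}

int main(void)
{
	const struct cst_entry table[] = {
		{ 1, 0x000, 1,  1, 1000 },	/* C1 via fixed hardware */
		{ 0, 0x514, 2, 20,  500 },	/* C2 via a P_LVL2 read */
		{ 0, 0x515, 3, 85,  250 },	/* C3 via a P_LVL3 read */
	};

	return parse_cst(table, 3, 8) < 2;
}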
647 647
648 648
649 static void acpi_processor_power_verify_c2(struct acpi_processor_cx *cx) 649 static void acpi_processor_power_verify_c2(struct acpi_processor_cx *cx)
650 { 650 {
651 ACPI_FUNCTION_TRACE("acpi_processor_power_verify_c2"); 651 ACPI_FUNCTION_TRACE("acpi_processor_power_verify_c2");
652 652
653 if (!cx->address) 653 if (!cx->address)
654 return_VOID; 654 return_VOID;
655 655
656 /* 656 /*
657 * C2 latency must be less than or equal to 100 657 * C2 latency must be less than or equal to 100
658 * microseconds. 658 * microseconds.
659 */ 659 */
660 else if (cx->latency > ACPI_PROCESSOR_MAX_C2_LATENCY) { 660 else if (cx->latency > ACPI_PROCESSOR_MAX_C2_LATENCY) {
661 ACPI_DEBUG_PRINT((ACPI_DB_INFO, 661 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
662 "latency too large [%d]\n", 662 "latency too large [%d]\n",
663 cx->latency)); 663 cx->latency));
664 return_VOID; 664 return_VOID;
665 } 665 }
666 666
667 /* We're (currently) only supporting C2 on UP */ 667 /* We're (currently) only supporting C2 on UP */
668 else if (errata.smp) { 668 else if (errata.smp) {
669 ACPI_DEBUG_PRINT((ACPI_DB_INFO, 669 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
670 "C2 not supported in SMP mode\n")); 670 "C2 not supported in SMP mode\n"));
671 return_VOID; 671 return_VOID;
672 } 672 }
673 673
674 /* 674 /*
675 * Otherwise we've met all of our C2 requirements. 675 * Otherwise we've met all of our C2 requirements.
676 * Normalize the C2 latency to expedite policy 676 * Normalize the C2 latency to expedite policy
677 */ 677 */
678 cx->valid = 1; 678 cx->valid = 1;
679 cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency); 679 cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency);
680 680
681 return_VOID; 681 return_VOID;
682 } 682 }
683 683
684 684
685 static void acpi_processor_power_verify_c3( 685 static void acpi_processor_power_verify_c3(
686 struct acpi_processor *pr, 686 struct acpi_processor *pr,
687 struct acpi_processor_cx *cx) 687 struct acpi_processor_cx *cx)
688 { 688 {
689 ACPI_FUNCTION_TRACE("acpi_processor_power_verify_c3"); 689 ACPI_FUNCTION_TRACE("acpi_processor_power_verify_c3");
690 690
691 if (!cx->address) 691 if (!cx->address)
692 return_VOID; 692 return_VOID;
693 693
694 /* 694 /*
695 * C3 latency must be less than or equal to 1000 695 * C3 latency must be less than or equal to 1000
696 * microseconds. 696 * microseconds.
697 */ 697 */
698 else if (cx->latency > ACPI_PROCESSOR_MAX_C3_LATENCY) { 698 else if (cx->latency > ACPI_PROCESSOR_MAX_C3_LATENCY) {
699 ACPI_DEBUG_PRINT((ACPI_DB_INFO, 699 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
700 "latency too large [%d]\n", 700 "latency too large [%d]\n",
701 cx->latency)); 701 cx->latency));
702 return_VOID; 702 return_VOID;
703 } 703 }
704 704
705 /* bus mastering control is necessary */ 705 /* bus mastering control is necessary */
706 else if (!pr->flags.bm_control) { 706 else if (!pr->flags.bm_control) {
707 ACPI_DEBUG_PRINT((ACPI_DB_INFO, 707 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
708 "C3 support requires bus mastering control\n")); 708 "C3 support requires bus mastering control\n"));
709 return_VOID; 709 return_VOID;
710 } 710 }
711 711
712 /* We're (currently) only supporting C3 on UP */ 712 /* We're (currently) only supporting C3 on UP */
713 else if (errata.smp) { 713 else if (errata.smp) {
714 ACPI_DEBUG_PRINT((ACPI_DB_INFO, 714 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
715 "C3 not supported in SMP mode\n")); 715 "C3 not supported in SMP mode\n"));
716 return_VOID; 716 return_VOID;
717 } 717 }
718 718
719 /* 719 /*
720 * PIIX4 Erratum #18: We don't support C3 when Type-F (fast) 720 * PIIX4 Erratum #18: We don't support C3 when Type-F (fast)
721 * DMA transfers are used by any ISA device to avoid livelock. 721 * DMA transfers are used by any ISA device to avoid livelock.
722 * Note that we could disable Type-F DMA (as recommended by 722 * Note that we could disable Type-F DMA (as recommended by
723 * the erratum), but this is known to disrupt certain ISA 723 * the erratum), but this is known to disrupt certain ISA
724 * devices thus we take the conservative approach. 724 * devices thus we take the conservative approach.
725 */ 725 */
726 else if (errata.piix4.fdma) { 726 else if (errata.piix4.fdma) {
727 ACPI_DEBUG_PRINT((ACPI_DB_INFO, 727 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
728 "C3 not supported on PIIX4 with Type-F DMA\n")); 728 "C3 not supported on PIIX4 with Type-F DMA\n"));
729 return_VOID; 729 return_VOID;
730 } 730 }
731 731
732 /* 732 /*
733 * Otherwise we've met all of our C3 requirements. 733 * Otherwise we've met all of our C3 requirements.
734 * Normalize the C3 latency to expedite policy. Enable 734 * Normalize the C3 latency to expedite policy. Enable
735 * checking of bus mastering status (bm_check) so we can 735 * checking of bus mastering status (bm_check) so we can
736 * use this in our C3 policy 736 * use this in our C3 policy
737 */ 737 */
738 cx->valid = 1; 738 cx->valid = 1;
739 cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency); 739 cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency);
740 pr->flags.bm_check = 1; 740 pr->flags.bm_check = 1;
741 741
742 return_VOID; 742 return_VOID;
743 } 743 }
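
Both verify paths above normalize the reported latency from microseconds into PM-timer ticks (cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency)) so the idle loop can compare it directly against the measured sleep_ticks. A sketch of that conversion, assuming the standard 3.579545 MHz ACPI power-management timer; the macro name and rounding here are illustrative, not copied from the driver.

/* Microseconds -> ACPI PM-timer ticks (sketch only). */
#include <stdio.h>

#define PM_TIMER_HZ	3579545UL	/* assumed standard ACPI PM timer rate */

static unsigned long us_to_pm_ticks(unsigned long us)
{
	return (us * PM_TIMER_HZ) / 1000000UL;
}

int main(void)
{
	/* The 100 us C2 cap is ~357 ticks; the 1000 us C3 cap is ~3579 ticks. */
	printf("C2 cap: %lu ticks\n", us_to_pm_ticks(100));
	printf("C3 cap: %lu ticks\n", us_to_pm_ticks(1000));
	return 0;
}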
744 744
745 745
746 static int acpi_processor_power_verify(struct acpi_processor *pr) 746 static int acpi_processor_power_verify(struct acpi_processor *pr)
747 { 747 {
748 unsigned int i; 748 unsigned int i;
749 unsigned int working = 0; 749 unsigned int working = 0;
750 750
751 for (i=1; i < ACPI_PROCESSOR_MAX_POWER; i++) { 751 for (i=1; i < ACPI_PROCESSOR_MAX_POWER; i++) {
752 struct acpi_processor_cx *cx = &pr->power.states[i]; 752 struct acpi_processor_cx *cx = &pr->power.states[i];
753 753
754 switch (cx->type) { 754 switch (cx->type) {
755 case ACPI_STATE_C1: 755 case ACPI_STATE_C1:
756 cx->valid = 1; 756 cx->valid = 1;
757 break; 757 break;
758 758
759 case ACPI_STATE_C2: 759 case ACPI_STATE_C2:
760 acpi_processor_power_verify_c2(cx); 760 acpi_processor_power_verify_c2(cx);
761 break; 761 break;
762 762
763 case ACPI_STATE_C3: 763 case ACPI_STATE_C3:
764 acpi_processor_power_verify_c3(pr, cx); 764 acpi_processor_power_verify_c3(pr, cx);
765 break; 765 break;
766 } 766 }
767 767
768 if (cx->valid) 768 if (cx->valid)
769 working++; 769 working++;
770 } 770 }
771 771
772 return (working); 772 return (working);
773 } 773 }
774 774
775 static int acpi_processor_get_power_info ( 775 static int acpi_processor_get_power_info (
776 struct acpi_processor *pr) 776 struct acpi_processor *pr)
777 { 777 {
778 unsigned int i; 778 unsigned int i;
779 int result; 779 int result;
780 780
781 ACPI_FUNCTION_TRACE("acpi_processor_get_power_info"); 781 ACPI_FUNCTION_TRACE("acpi_processor_get_power_info");
782 782
783 /* NOTE: the idle thread may not be running while calling 783 /* NOTE: the idle thread may not be running while calling
784 * this function */ 784 * this function */
785 785
786 result = acpi_processor_get_power_info_cst(pr); 786 result = acpi_processor_get_power_info_cst(pr);
787 if ((result) || (acpi_processor_power_verify(pr) < 2)) { 787 if ((result) || (acpi_processor_power_verify(pr) < 2)) {
788 result = acpi_processor_get_power_info_fadt(pr); 788 result = acpi_processor_get_power_info_fadt(pr);
789 if (result) 789 if (result)
790 return_VALUE(result); 790 return_VALUE(result);
791 791
792 if (acpi_processor_power_verify(pr) < 2) 792 if (acpi_processor_power_verify(pr) < 2)
793 return_VALUE(-ENODEV); 793 return_VALUE(-ENODEV);
794 } 794 }
795 795
796 /* 796 /*
797 * Set Default Policy 797 * Set Default Policy
798 * ------------------ 798 * ------------------
799 * Now that we know which states are supported, set the default 799 * Now that we know which states are supported, set the default
800 * policy. Note that this policy can be changed dynamically 800 * policy. Note that this policy can be changed dynamically
801 * (e.g. encourage deeper sleeps to conserve battery life when 801 * (e.g. encourage deeper sleeps to conserve battery life when
802 * not on AC). 802 * not on AC).
803 */ 803 */
804 result = acpi_processor_set_power_policy(pr); 804 result = acpi_processor_set_power_policy(pr);
805 if (result) 805 if (result)
806 return_VALUE(result); 806 return_VALUE(result);
807 807
808 /* 808 /*
809 * if one state of type C2 or C3 is available, mark this 809 * if one state of type C2 or C3 is available, mark this
810 * CPU as being "idle manageable" 810 * CPU as being "idle manageable"
811 */ 811 */
812 for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) { 812 for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) {
813 if (pr->power.states[i].valid) 813 if (pr->power.states[i].valid)
814 pr->power.count = i; 814 pr->power.count = i;
815 if ((pr->power.states[i].valid) && 815 if ((pr->power.states[i].valid) &&
816 (pr->power.states[i].type >= ACPI_STATE_C2)) 816 (pr->power.states[i].type >= ACPI_STATE_C2))
817 pr->flags.power = 1; 817 pr->flags.power = 1;
818 } 818 }
819 819
820 return_VALUE(0); 820 return_VALUE(0);
821 } 821 }
822 822
823 int acpi_processor_cst_has_changed (struct acpi_processor *pr) 823 int acpi_processor_cst_has_changed (struct acpi_processor *pr)
824 { 824 {
825 int result = 0; 825 int result = 0;
826 826
827 ACPI_FUNCTION_TRACE("acpi_processor_cst_has_changed"); 827 ACPI_FUNCTION_TRACE("acpi_processor_cst_has_changed");
828 828
829 if (!pr) 829 if (!pr)
830 return_VALUE(-EINVAL); 830 return_VALUE(-EINVAL);
831 831
832 if (errata.smp || nocst) { 832 if (errata.smp || nocst) {
833 return_VALUE(-ENODEV); 833 return_VALUE(-ENODEV);
834 } 834 }
835 835
836 if (!pr->flags.power_setup_done) 836 if (!pr->flags.power_setup_done)
837 return_VALUE(-ENODEV); 837 return_VALUE(-ENODEV);
838 838
839 /* Fall back to the default idle loop */ 839 /* Fall back to the default idle loop */
840 pm_idle = pm_idle_save; 840 pm_idle = pm_idle_save;
841 synchronize_sched(); /* Relies on interrupts forcing exit from idle. */ 841 synchronize_sched(); /* Relies on interrupts forcing exit from idle. */
842 842
843 pr->flags.power = 0; 843 pr->flags.power = 0;
844 result = acpi_processor_get_power_info(pr); 844 result = acpi_processor_get_power_info(pr);
845 if ((pr->flags.power == 1) && (pr->flags.power_setup_done)) 845 if ((pr->flags.power == 1) && (pr->flags.power_setup_done))
846 pm_idle = acpi_processor_idle; 846 pm_idle = acpi_processor_idle;
847 847
848 return_VALUE(result); 848 return_VALUE(result);
849 } 849 }
850 850
851 /* proc interface */ 851 /* proc interface */
852 852
853 static int acpi_processor_power_seq_show(struct seq_file *seq, void *offset) 853 static int acpi_processor_power_seq_show(struct seq_file *seq, void *offset)
854 { 854 {
855 struct acpi_processor *pr = (struct acpi_processor *)seq->private; 855 struct acpi_processor *pr = (struct acpi_processor *)seq->private;
856 unsigned int i; 856 unsigned int i;
857 857
858 ACPI_FUNCTION_TRACE("acpi_processor_power_seq_show"); 858 ACPI_FUNCTION_TRACE("acpi_processor_power_seq_show");
859 859
860 if (!pr) 860 if (!pr)
861 goto end; 861 goto end;
862 862
863 seq_printf(seq, "active state: C%zd\n" 863 seq_printf(seq, "active state: C%zd\n"
864 "max_cstate: C%d\n" 864 "max_cstate: C%d\n"
865 "bus master activity: %08x\n", 865 "bus master activity: %08x\n",
866 pr->power.state ? pr->power.state - pr->power.states : 0, 866 pr->power.state ? pr->power.state - pr->power.states : 0,
867 max_cstate, 867 max_cstate,
868 (unsigned)pr->power.bm_activity); 868 (unsigned)pr->power.bm_activity);
869 869
870 seq_puts(seq, "states:\n"); 870 seq_puts(seq, "states:\n");
871 871
872 for (i = 1; i <= pr->power.count; i++) { 872 for (i = 1; i <= pr->power.count; i++) {
873 seq_printf(seq, " %cC%d: ", 873 seq_printf(seq, " %cC%d: ",
874 (&pr->power.states[i] == pr->power.state?'*':' '), i); 874 (&pr->power.states[i] == pr->power.state?'*':' '), i);
875 875
876 if (!pr->power.states[i].valid) { 876 if (!pr->power.states[i].valid) {
877 seq_puts(seq, "<not supported>\n"); 877 seq_puts(seq, "<not supported>\n");
878 continue; 878 continue;
879 } 879 }
880 880
881 switch (pr->power.states[i].type) { 881 switch (pr->power.states[i].type) {
882 case ACPI_STATE_C1: 882 case ACPI_STATE_C1:
883 seq_printf(seq, "type[C1] "); 883 seq_printf(seq, "type[C1] ");
884 break; 884 break;
885 case ACPI_STATE_C2: 885 case ACPI_STATE_C2:
886 seq_printf(seq, "type[C2] "); 886 seq_printf(seq, "type[C2] ");
887 break; 887 break;
888 case ACPI_STATE_C3: 888 case ACPI_STATE_C3:
889 seq_printf(seq, "type[C3] "); 889 seq_printf(seq, "type[C3] ");
890 break; 890 break;
891 default: 891 default:
892 seq_printf(seq, "type[--] "); 892 seq_printf(seq, "type[--] ");
893 break; 893 break;
894 } 894 }
895 895
896 if (pr->power.states[i].promotion.state) 896 if (pr->power.states[i].promotion.state)
897 seq_printf(seq, "promotion[C%zd] ", 897 seq_printf(seq, "promotion[C%zd] ",
898 (pr->power.states[i].promotion.state - 898 (pr->power.states[i].promotion.state -
899 pr->power.states)); 899 pr->power.states));
900 else 900 else
901 seq_puts(seq, "promotion[--] "); 901 seq_puts(seq, "promotion[--] ");
902 902
903 if (pr->power.states[i].demotion.state) 903 if (pr->power.states[i].demotion.state)
904 seq_printf(seq, "demotion[C%zd] ", 904 seq_printf(seq, "demotion[C%zd] ",
905 (pr->power.states[i].demotion.state - 905 (pr->power.states[i].demotion.state -
906 pr->power.states)); 906 pr->power.states));
907 else 907 else
908 seq_puts(seq, "demotion[--] "); 908 seq_puts(seq, "demotion[--] ");
909 909
910 seq_printf(seq, "latency[%03d] usage[%08d]\n", 910 seq_printf(seq, "latency[%03d] usage[%08d]\n",
911 pr->power.states[i].latency, 911 pr->power.states[i].latency,
912 pr->power.states[i].usage); 912 pr->power.states[i].usage);
913 } 913 }
914 914
915 end: 915 end:
916 return_VALUE(0); 916 return_VALUE(0);
917 } 917 }
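
With C1..C3 all accepted, the seq_file handler above emits output along the following lines; every value is illustrative, only the field layout follows the format strings in the code.

active state: C2
max_cstate: C8
bus master activity: 00000000
states:
  C1: type[C1] promotion[C2] demotion[--] latency[000] usage[00000412]
 *C2: type[C2] promotion[C3] demotion[C1] latency[020] usage[00001268]
  C3: type[C3] promotion[--] demotion[C2] latency[085] usage[00000039]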
918 918
919 static int acpi_processor_power_open_fs(struct inode *inode, struct file *file) 919 static int acpi_processor_power_open_fs(struct inode *inode, struct file *file)
920 { 920 {
921 return single_open(file, acpi_processor_power_seq_show, 921 return single_open(file, acpi_processor_power_seq_show,
922 PDE(inode)->data); 922 PDE(inode)->data);
923 } 923 }
924 924
925 static struct file_operations acpi_processor_power_fops = { 925 static struct file_operations acpi_processor_power_fops = {
926 .open = acpi_processor_power_open_fs, 926 .open = acpi_processor_power_open_fs,
927 .read = seq_read, 927 .read = seq_read,
928 .llseek = seq_lseek, 928 .llseek = seq_lseek,
929 .release = single_release, 929 .release = single_release,
930 }; 930 };
931 931
932 932
933 int acpi_processor_power_init(struct acpi_processor *pr, struct acpi_device *device) 933 int acpi_processor_power_init(struct acpi_processor *pr, struct acpi_device *device)
934 { 934 {
935 acpi_status status = 0; 935 acpi_status status = 0;
936 static int first_run = 0; 936 static int first_run = 0;
937 struct proc_dir_entry *entry = NULL; 937 struct proc_dir_entry *entry = NULL;
938 unsigned int i; 938 unsigned int i;
939 939
940 ACPI_FUNCTION_TRACE("acpi_processor_power_init"); 940 ACPI_FUNCTION_TRACE("acpi_processor_power_init");
941 941
942 if (!first_run) { 942 if (!first_run) {
943 dmi_check_system(processor_power_dmi_table); 943 dmi_check_system(processor_power_dmi_table);
944 if (max_cstate < ACPI_C_STATES_MAX) 944 if (max_cstate < ACPI_C_STATES_MAX)
945 printk(KERN_NOTICE "ACPI: processor limited to max C-state %d\n", max_cstate); 945 printk(KERN_NOTICE "ACPI: processor limited to max C-state %d\n", max_cstate);
946 first_run++; 946 first_run++;
947 } 947 }
948 948
949 if (!errata.smp && (pr->id == 0) && acpi_fadt.cst_cnt && !nocst) { 949 if (!errata.smp && (pr->id == 0) && acpi_fadt.cst_cnt && !nocst) {
950 status = acpi_os_write_port(acpi_fadt.smi_cmd, acpi_fadt.cst_cnt, 8); 950 status = acpi_os_write_port(acpi_fadt.smi_cmd, acpi_fadt.cst_cnt, 8);
951 if (ACPI_FAILURE(status)) { 951 if (ACPI_FAILURE(status)) {
952 ACPI_DEBUG_PRINT((ACPI_DB_ERROR, 952 ACPI_DEBUG_PRINT((ACPI_DB_ERROR,
953 "Notifying BIOS of _CST ability failed\n")); 953 "Notifying BIOS of _CST ability failed\n"));
954 } 954 }
955 } 955 }
956 956
957 acpi_processor_get_power_info(pr); 957 acpi_processor_get_power_info(pr);
958 958
959 /* 959 /*
960 * Install the idle handler if processor power management is supported. 960 * Install the idle handler if processor power management is supported.
961 * Note that the previously set idle handler will be used on 961 * Note that the previously set idle handler will be used on
962 * platforms that only support C1. 962 * platforms that only support C1.
963 */ 963 */
964 if ((pr->flags.power) && (!boot_option_idle_override)) { 964 if ((pr->flags.power) && (!boot_option_idle_override)) {
965 printk(KERN_INFO PREFIX "CPU%d (power states:", pr->id); 965 printk(KERN_INFO PREFIX "CPU%d (power states:", pr->id);
966 for (i = 1; i <= pr->power.count; i++) 966 for (i = 1; i <= pr->power.count; i++)
967 if (pr->power.states[i].valid) 967 if (pr->power.states[i].valid)
968 printk(" C%d[C%d]", i, pr->power.states[i].type); 968 printk(" C%d[C%d]", i, pr->power.states[i].type);
969 printk(")\n"); 969 printk(")\n");
970 970
971 if (pr->id == 0) { 971 if (pr->id == 0) {
972 pm_idle_save = pm_idle; 972 pm_idle_save = pm_idle;
973 pm_idle = acpi_processor_idle; 973 pm_idle = acpi_processor_idle;
974 } 974 }
975 } 975 }
976 976
977 /* 'power' [R] */ 977 /* 'power' [R] */
978 entry = create_proc_entry(ACPI_PROCESSOR_FILE_POWER, 978 entry = create_proc_entry(ACPI_PROCESSOR_FILE_POWER,
979 S_IRUGO, acpi_device_dir(device)); 979 S_IRUGO, acpi_device_dir(device));
980 if (!entry) 980 if (!entry)
981 ACPI_DEBUG_PRINT((ACPI_DB_ERROR, 981 ACPI_DEBUG_PRINT((ACPI_DB_ERROR,
982 "Unable to create '%s' fs entry\n", 982 "Unable to create '%s' fs entry\n",
983 ACPI_PROCESSOR_FILE_POWER)); 983 ACPI_PROCESSOR_FILE_POWER));
984 else { 984 else {
985 entry->proc_fops = &acpi_processor_power_fops; 985 entry->proc_fops = &acpi_processor_power_fops;
986 entry->data = acpi_driver_data(device); 986 entry->data = acpi_driver_data(device);
987 entry->owner = THIS_MODULE; 987 entry->owner = THIS_MODULE;
988 } 988 }
989 989
990 pr->flags.power_setup_done = 1; 990 pr->flags.power_setup_done = 1;
991 991
992 return_VALUE(0); 992 return_VALUE(0);
993 } 993 }
994 994
995 int acpi_processor_power_exit(struct acpi_processor *pr, struct acpi_device *device) 995 int acpi_processor_power_exit(struct acpi_processor *pr, struct acpi_device *device)
996 { 996 {
997 ACPI_FUNCTION_TRACE("acpi_processor_power_exit"); 997 ACPI_FUNCTION_TRACE("acpi_processor_power_exit");
998 998
999 pr->flags.power_setup_done = 0; 999 pr->flags.power_setup_done = 0;
1000 1000
1001 if (acpi_device_dir(device)) 1001 if (acpi_device_dir(device))
1002 remove_proc_entry(ACPI_PROCESSOR_FILE_POWER,acpi_device_dir(device)); 1002 remove_proc_entry(ACPI_PROCESSOR_FILE_POWER,acpi_device_dir(device));
1003 1003
1004 /* Unregister the idle handler when processor #0 is removed. */ 1004 /* Unregister the idle handler when processor #0 is removed. */
1005 if (pr->id == 0) { 1005 if (pr->id == 0) {
1006 pm_idle = pm_idle_save; 1006 pm_idle = pm_idle_save;
1007 1007
1008 /* 1008 /*
1009 * We are about to unload the current idle thread pm callback 1009 * We are about to unload the current idle thread pm callback
1010 * (pm_idle). Wait for all processors to update cached/local 1010 * (pm_idle). Wait for all processors to update cached/local
1011 * copies of pm_idle before proceeding. 1011 * copies of pm_idle before proceeding.
1012 */ 1012 */
1013 cpu_idle_wait(); 1013 cpu_idle_wait();
1014 } 1014 }
1015 1015
1016 return_VALUE(0); 1016 return_VALUE(0);
1017 } 1017 }
1018 1018
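
The hunks in this section index per-CPU arrays with raw_smp_processor_id() where _smp_processor_id() was used before (processors[] above, cpu_data[] in gameport_measure_speed() below). As a minimal kernel-style illustration of the same pattern, hypothetical and not part of this patch: the raw accessor is the one to use when the caller can tolerate being migrated to another CPU right after the read, so the value is only a hint and no preemption disabling is wanted.

#include <linux/smp.h>		/* raw_smp_processor_id() */
#include <linux/threads.h>	/* NR_CPUS */

static unsigned long hint_counter[NR_CPUS];	/* illustrative statistic */

static void note_event_hint(void)
{
	/*
	 * A stale CPU number is acceptable here: the counter is only a
	 * heuristic, so the non-debug accessor is used and the access
	 * is done without disabling preemption.
	 */
	hint_counter[raw_smp_processor_id()]++;
}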
drivers/input/gameport/gameport.c
1 /* 1 /*
2 * Generic gameport layer 2 * Generic gameport layer
3 * 3 *
4 * Copyright (c) 1999-2002 Vojtech Pavlik 4 * Copyright (c) 1999-2002 Vojtech Pavlik
5 * Copyright (c) 2005 Dmitry Torokhov 5 * Copyright (c) 2005 Dmitry Torokhov
6 */ 6 */
7 7
8 /* 8 /*
9 * This program is free software; you can redistribute it and/or modify it 9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of the GNU General Public License version 2 as published by 10 * under the terms of the GNU General Public License version 2 as published by
11 * the Free Software Foundation. 11 * the Free Software Foundation.
12 */ 12 */
13 13
14 #include <linux/stddef.h> 14 #include <linux/stddef.h>
15 #include <linux/module.h> 15 #include <linux/module.h>
16 #include <linux/ioport.h> 16 #include <linux/ioport.h>
17 #include <linux/init.h> 17 #include <linux/init.h>
18 #include <linux/gameport.h> 18 #include <linux/gameport.h>
19 #include <linux/wait.h> 19 #include <linux/wait.h>
20 #include <linux/completion.h> 20 #include <linux/completion.h>
21 #include <linux/sched.h> 21 #include <linux/sched.h>
22 #include <linux/smp_lock.h> 22 #include <linux/smp_lock.h>
23 #include <linux/slab.h> 23 #include <linux/slab.h>
24 #include <linux/delay.h> 24 #include <linux/delay.h>
25 25
26 /*#include <asm/io.h>*/ 26 /*#include <asm/io.h>*/
27 27
28 MODULE_AUTHOR("Vojtech Pavlik <vojtech@ucw.cz>"); 28 MODULE_AUTHOR("Vojtech Pavlik <vojtech@ucw.cz>");
29 MODULE_DESCRIPTION("Generic gameport layer"); 29 MODULE_DESCRIPTION("Generic gameport layer");
30 MODULE_LICENSE("GPL"); 30 MODULE_LICENSE("GPL");
31 31
32 EXPORT_SYMBOL(__gameport_register_port); 32 EXPORT_SYMBOL(__gameport_register_port);
33 EXPORT_SYMBOL(gameport_unregister_port); 33 EXPORT_SYMBOL(gameport_unregister_port);
34 EXPORT_SYMBOL(__gameport_register_driver); 34 EXPORT_SYMBOL(__gameport_register_driver);
35 EXPORT_SYMBOL(gameport_unregister_driver); 35 EXPORT_SYMBOL(gameport_unregister_driver);
36 EXPORT_SYMBOL(gameport_open); 36 EXPORT_SYMBOL(gameport_open);
37 EXPORT_SYMBOL(gameport_close); 37 EXPORT_SYMBOL(gameport_close);
38 EXPORT_SYMBOL(gameport_rescan); 38 EXPORT_SYMBOL(gameport_rescan);
39 EXPORT_SYMBOL(gameport_cooked_read); 39 EXPORT_SYMBOL(gameport_cooked_read);
40 EXPORT_SYMBOL(gameport_set_name); 40 EXPORT_SYMBOL(gameport_set_name);
41 EXPORT_SYMBOL(gameport_set_phys); 41 EXPORT_SYMBOL(gameport_set_phys);
42 EXPORT_SYMBOL(gameport_start_polling); 42 EXPORT_SYMBOL(gameport_start_polling);
43 EXPORT_SYMBOL(gameport_stop_polling); 43 EXPORT_SYMBOL(gameport_stop_polling);
44 44
45 /* 45 /*
46 * gameport_sem protects entire gameport subsystem and is taken 46 * gameport_sem protects entire gameport subsystem and is taken
47 * every time a gameport port or driver is registered or unregistered. 47 * every time a gameport port or driver is registered or unregistered.
48 */ 48 */
49 static DECLARE_MUTEX(gameport_sem); 49 static DECLARE_MUTEX(gameport_sem);
50 50
51 static LIST_HEAD(gameport_list); 51 static LIST_HEAD(gameport_list);
52 52
53 static struct bus_type gameport_bus = { 53 static struct bus_type gameport_bus = {
54 .name = "gameport", 54 .name = "gameport",
55 }; 55 };
56 56
57 static void gameport_add_port(struct gameport *gameport); 57 static void gameport_add_port(struct gameport *gameport);
58 static void gameport_destroy_port(struct gameport *gameport); 58 static void gameport_destroy_port(struct gameport *gameport);
59 static void gameport_reconnect_port(struct gameport *gameport); 59 static void gameport_reconnect_port(struct gameport *gameport);
60 static void gameport_disconnect_port(struct gameport *gameport); 60 static void gameport_disconnect_port(struct gameport *gameport);
61 61
62 #if defined(__i386__) 62 #if defined(__i386__)
63 63
64 #define DELTA(x,y) ((y)-(x)+((y)<(x)?1193182/HZ:0)) 64 #define DELTA(x,y) ((y)-(x)+((y)<(x)?1193182/HZ:0))
65 #define GET_TIME(x) do { x = get_time_pit(); } while (0) 65 #define GET_TIME(x) do { x = get_time_pit(); } while (0)
66 66
67 static unsigned int get_time_pit(void) 67 static unsigned int get_time_pit(void)
68 { 68 {
69 extern spinlock_t i8253_lock; 69 extern spinlock_t i8253_lock;
70 unsigned long flags; 70 unsigned long flags;
71 unsigned int count; 71 unsigned int count;
72 72
73 spin_lock_irqsave(&i8253_lock, flags); 73 spin_lock_irqsave(&i8253_lock, flags);
74 outb_p(0x00, 0x43); 74 outb_p(0x00, 0x43);
75 count = inb_p(0x40); 75 count = inb_p(0x40);
76 count |= inb_p(0x40) << 8; 76 count |= inb_p(0x40) << 8;
77 spin_unlock_irqrestore(&i8253_lock, flags); 77 spin_unlock_irqrestore(&i8253_lock, flags);
78 78
79 return count; 79 return count;
80 } 80 }
81 81
82 #endif 82 #endif
83 83
84 84
85 85
86 /* 86 /*
87 * gameport_measure_speed() measures the gameport i/o speed. 87 * gameport_measure_speed() measures the gameport i/o speed.
88 */ 88 */
89 89
90 static int gameport_measure_speed(struct gameport *gameport) 90 static int gameport_measure_speed(struct gameport *gameport)
91 { 91 {
92 #if defined(__i386__) 92 #if defined(__i386__)
93 93
94 unsigned int i, t, t1, t2, t3, tx; 94 unsigned int i, t, t1, t2, t3, tx;
95 unsigned long flags; 95 unsigned long flags;
96 96
97 if (gameport_open(gameport, NULL, GAMEPORT_MODE_RAW)) 97 if (gameport_open(gameport, NULL, GAMEPORT_MODE_RAW))
98 return 0; 98 return 0;
99 99
100 tx = 1 << 30; 100 tx = 1 << 30;
101 101
102 for(i = 0; i < 50; i++) { 102 for(i = 0; i < 50; i++) {
103 local_irq_save(flags); 103 local_irq_save(flags);
104 GET_TIME(t1); 104 GET_TIME(t1);
105 for (t = 0; t < 50; t++) gameport_read(gameport); 105 for (t = 0; t < 50; t++) gameport_read(gameport);
106 GET_TIME(t2); 106 GET_TIME(t2);
107 GET_TIME(t3); 107 GET_TIME(t3);
108 local_irq_restore(flags); 108 local_irq_restore(flags);
109 udelay(i * 10); 109 udelay(i * 10);
110 if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t; 110 if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t;
111 } 111 }
112 112
113 gameport_close(gameport); 113 gameport_close(gameport);
114 return 59659 / (tx < 1 ? 1 : tx); 114 return 59659 / (tx < 1 ? 1 : tx);
115 115
116 #elif defined (__x86_64__) 116 #elif defined (__x86_64__)
117 117
118 unsigned int i, t; 118 unsigned int i, t;
119 unsigned long tx, t1, t2, flags; 119 unsigned long tx, t1, t2, flags;
120 120
121 if (gameport_open(gameport, NULL, GAMEPORT_MODE_RAW)) 121 if (gameport_open(gameport, NULL, GAMEPORT_MODE_RAW))
122 return 0; 122 return 0;
123 123
124 tx = 1 << 30; 124 tx = 1 << 30;
125 125
126 for(i = 0; i < 50; i++) { 126 for(i = 0; i < 50; i++) {
127 local_irq_save(flags); 127 local_irq_save(flags);
128 rdtscl(t1); 128 rdtscl(t1);
129 for (t = 0; t < 50; t++) gameport_read(gameport); 129 for (t = 0; t < 50; t++) gameport_read(gameport);
130 rdtscl(t2); 130 rdtscl(t2);
131 local_irq_restore(flags); 131 local_irq_restore(flags);
132 udelay(i * 10); 132 udelay(i * 10);
133 if (t2 - t1 < tx) tx = t2 - t1; 133 if (t2 - t1 < tx) tx = t2 - t1;
134 } 134 }
135 135
136 gameport_close(gameport); 136 gameport_close(gameport);
137 return (cpu_data[_smp_processor_id()].loops_per_jiffy * (unsigned long)HZ / (1000 / 50)) / (tx < 1 ? 1 : tx); 137 return (cpu_data[raw_smp_processor_id()].loops_per_jiffy * (unsigned long)HZ / (1000 / 50)) / (tx < 1 ? 1 : tx);
138 138
139 #else 139 #else
140 140
141 unsigned int j, t = 0; 141 unsigned int j, t = 0;
142 142
143 if (gameport_open(gameport, NULL, GAMEPORT_MODE_RAW)) 143 if (gameport_open(gameport, NULL, GAMEPORT_MODE_RAW))
144 return 0; 144 return 0;
145 145
146 j = jiffies; while (j == jiffies); 146 j = jiffies; while (j == jiffies);
147 j = jiffies; while (j == jiffies) { t++; gameport_read(gameport); } 147 j = jiffies; while (j == jiffies) { t++; gameport_read(gameport); }
148 148
149 gameport_close(gameport); 149 gameport_close(gameport);
150 return t * HZ / 1000; 150 return t * HZ / 1000;
151 151
152 #endif 152 #endif
153 } 153 }
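
On i386, GET_TIME() above latches and reads the i8253 PIT counter, which rolls over every 1193182/HZ ticks; DELTA() therefore adds one full period whenever the raw difference would go negative. A small userspace sketch of that wrap correction, with HZ fixed at 1000 purely for the example:

/* Wrap-aware difference of two samples of a counter with period PERIOD. */
#include <stdio.h>

#define HZ	1000			/* assumed for the sketch only */
#define PERIOD	(1193182 / HZ)		/* PIT ticks per timer interrupt */

static unsigned int delta(unsigned int x, unsigned int y)
{
	return y - x + (y < x ? PERIOD : 0);
}

int main(void)
{
	printf("%u\n", delta(100, 500));	/* no wrap: 400 */
	printf("%u\n", delta(1100, 50));	/* wrapped once: 143 */
	return 0;
}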
154 154
155 void gameport_start_polling(struct gameport *gameport) 155 void gameport_start_polling(struct gameport *gameport)
156 { 156 {
157 spin_lock(&gameport->timer_lock); 157 spin_lock(&gameport->timer_lock);
158 158
159 if (!gameport->poll_cnt++) { 159 if (!gameport->poll_cnt++) {
160 BUG_ON(!gameport->poll_handler); 160 BUG_ON(!gameport->poll_handler);
161 BUG_ON(!gameport->poll_interval); 161 BUG_ON(!gameport->poll_interval);
162 mod_timer(&gameport->poll_timer, jiffies + msecs_to_jiffies(gameport->poll_interval)); 162 mod_timer(&gameport->poll_timer, jiffies + msecs_to_jiffies(gameport->poll_interval));
163 } 163 }
164 164
165 spin_unlock(&gameport->timer_lock); 165 spin_unlock(&gameport->timer_lock);
166 } 166 }
167 167
168 void gameport_stop_polling(struct gameport *gameport) 168 void gameport_stop_polling(struct gameport *gameport)
169 { 169 {
170 spin_lock(&gameport->timer_lock); 170 spin_lock(&gameport->timer_lock);
171 171
172 if (!--gameport->poll_cnt) 172 if (!--gameport->poll_cnt)
173 del_timer(&gameport->poll_timer); 173 del_timer(&gameport->poll_timer);
174 174
175 spin_unlock(&gameport->timer_lock); 175 spin_unlock(&gameport->timer_lock);
176 } 176 }
177 177
178 static void gameport_run_poll_handler(unsigned long d) 178 static void gameport_run_poll_handler(unsigned long d)
179 { 179 {
180 struct gameport *gameport = (struct gameport *)d; 180 struct gameport *gameport = (struct gameport *)d;
181 181
182 gameport->poll_handler(gameport); 182 gameport->poll_handler(gameport);
183 if (gameport->poll_cnt) 183 if (gameport->poll_cnt)
184 mod_timer(&gameport->poll_timer, jiffies + msecs_to_jiffies(gameport->poll_interval)); 184 mod_timer(&gameport->poll_timer, jiffies + msecs_to_jiffies(gameport->poll_interval));
185 } 185 }
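
gameport_start_polling()/gameport_stop_polling() gate a self-rearming timer on a reference count: the timer is armed only on the 0->1 transition, deleted on the 1->0 transition, and the handler re-arms itself only while the count is non-zero. A userspace sketch of just that counting logic follows; the real code holds timer_lock around each transition and uses mod_timer()/del_timer(), for which a flag stands in here.

/* Refcount-gated periodic poll: arm on 0->1, cancel on 1->0. */
#include <stdio.h>

struct poller {
	int poll_cnt;
	int timer_armed;	/* stand-in for mod_timer()/del_timer() */
};

static void poll_start(struct poller *p)
{
	if (!p->poll_cnt++)
		p->timer_armed = 1;	/* first user arms the timer */
}

static void poll_stop(struct poller *p)
{
	if (!--p->poll_cnt)
		p->timer_armed = 0;	/* last user cancels it */
}

int main(void)
{
	struct poller p = { 0, 0 };

	poll_start(&p); poll_start(&p);	/* two users, armed once */
	poll_stop(&p);			/* one user left: stays armed */
	printf("armed=%d\n", p.timer_armed);
	poll_stop(&p);			/* last user gone: cancelled */
	printf("armed=%d\n", p.timer_armed);
	return 0;
}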
186 186
187 /* 187 /*
188 * Basic gameport -> driver core mappings 188 * Basic gameport -> driver core mappings
189 */ 189 */
190 190
191 static void gameport_bind_driver(struct gameport *gameport, struct gameport_driver *drv) 191 static void gameport_bind_driver(struct gameport *gameport, struct gameport_driver *drv)
192 { 192 {
193 down_write(&gameport_bus.subsys.rwsem); 193 down_write(&gameport_bus.subsys.rwsem);
194 194
195 gameport->dev.driver = &drv->driver; 195 gameport->dev.driver = &drv->driver;
196 if (drv->connect(gameport, drv)) { 196 if (drv->connect(gameport, drv)) {
197 gameport->dev.driver = NULL; 197 gameport->dev.driver = NULL;
198 goto out; 198 goto out;
199 } 199 }
200 device_bind_driver(&gameport->dev); 200 device_bind_driver(&gameport->dev);
201 out: 201 out:
202 up_write(&gameport_bus.subsys.rwsem); 202 up_write(&gameport_bus.subsys.rwsem);
203 } 203 }
204 204
205 static void gameport_release_driver(struct gameport *gameport) 205 static void gameport_release_driver(struct gameport *gameport)
206 { 206 {
207 down_write(&gameport_bus.subsys.rwsem); 207 down_write(&gameport_bus.subsys.rwsem);
208 device_release_driver(&gameport->dev); 208 device_release_driver(&gameport->dev);
209 up_write(&gameport_bus.subsys.rwsem); 209 up_write(&gameport_bus.subsys.rwsem);
210 } 210 }
211 211
212 static void gameport_find_driver(struct gameport *gameport) 212 static void gameport_find_driver(struct gameport *gameport)
213 { 213 {
214 down_write(&gameport_bus.subsys.rwsem); 214 down_write(&gameport_bus.subsys.rwsem);
215 device_attach(&gameport->dev); 215 device_attach(&gameport->dev);
216 up_write(&gameport_bus.subsys.rwsem); 216 up_write(&gameport_bus.subsys.rwsem);
217 } 217 }
218 218
219 219
220 /* 220 /*
221 * Gameport event processing. 221 * Gameport event processing.
222 */ 222 */
223 223
224 enum gameport_event_type { 224 enum gameport_event_type {
225 GAMEPORT_RESCAN, 225 GAMEPORT_RESCAN,
226 GAMEPORT_RECONNECT, 226 GAMEPORT_RECONNECT,
227 GAMEPORT_REGISTER_PORT, 227 GAMEPORT_REGISTER_PORT,
228 GAMEPORT_REGISTER_DRIVER, 228 GAMEPORT_REGISTER_DRIVER,
229 }; 229 };
230 230
231 struct gameport_event { 231 struct gameport_event {
232 enum gameport_event_type type; 232 enum gameport_event_type type;
233 void *object; 233 void *object;
234 struct module *owner; 234 struct module *owner;
235 struct list_head node; 235 struct list_head node;
236 }; 236 };
237 237
238 static DEFINE_SPINLOCK(gameport_event_lock); /* protects gameport_event_list */ 238 static DEFINE_SPINLOCK(gameport_event_lock); /* protects gameport_event_list */
239 static LIST_HEAD(gameport_event_list); 239 static LIST_HEAD(gameport_event_list);
240 static DECLARE_WAIT_QUEUE_HEAD(gameport_wait); 240 static DECLARE_WAIT_QUEUE_HEAD(gameport_wait);
241 static DECLARE_COMPLETION(gameport_exited); 241 static DECLARE_COMPLETION(gameport_exited);
242 static int gameport_pid; 242 static int gameport_pid;
243 243
244 static void gameport_queue_event(void *object, struct module *owner, 244 static void gameport_queue_event(void *object, struct module *owner,
245 enum gameport_event_type event_type) 245 enum gameport_event_type event_type)
246 { 246 {
247 unsigned long flags; 247 unsigned long flags;
248 struct gameport_event *event; 248 struct gameport_event *event;
249 249
250 spin_lock_irqsave(&gameport_event_lock, flags); 250 spin_lock_irqsave(&gameport_event_lock, flags);
251 251
252 /* 252 /*
253 * Scan the event list for other events on the same gameport port, 253 * Scan the event list for other events on the same gameport port,
254 * starting with the most recent one. If the event is the same we 254 * starting with the most recent one. If the event is the same we
255 * do not need to add a new one. If the event is of a different type 255 * do not need to add a new one. If the event is of a different type
256 * we need to add this event and should not look further because 256 * we need to add this event and should not look further because
257 * we need to preserve the sequence of distinct events. 257 * we need to preserve the sequence of distinct events.
258 */ 258 */
259 list_for_each_entry_reverse(event, &gameport_event_list, node) { 259 list_for_each_entry_reverse(event, &gameport_event_list, node) {
260 if (event->object == object) { 260 if (event->object == object) {
261 if (event->type == event_type) 261 if (event->type == event_type)
262 goto out; 262 goto out;
263 break; 263 break;
264 } 264 }
265 } 265 }
266 266
267 if ((event = kmalloc(sizeof(struct gameport_event), GFP_ATOMIC))) { 267 if ((event = kmalloc(sizeof(struct gameport_event), GFP_ATOMIC))) {
268 if (!try_module_get(owner)) { 268 if (!try_module_get(owner)) {
269 printk(KERN_WARNING "gameport: Can't get module reference, dropping event %d\n", event_type); 269 printk(KERN_WARNING "gameport: Can't get module reference, dropping event %d\n", event_type);
270 goto out; 270 goto out;
271 } 271 }
272 272
273 event->type = event_type; 273 event->type = event_type;
274 event->object = object; 274 event->object = object;
275 event->owner = owner; 275 event->owner = owner;
276 276
277 list_add_tail(&event->node, &gameport_event_list); 277 list_add_tail(&event->node, &gameport_event_list);
278 wake_up(&gameport_wait); 278 wake_up(&gameport_wait);
279 } else { 279 } else {
280 printk(KERN_ERR "gameport: Not enough memory to queue event %d\n", event_type); 280 printk(KERN_ERR "gameport: Not enough memory to queue event %d\n", event_type);
281 } 281 }
282 out: 282 out:
283 spin_unlock_irqrestore(&gameport_event_lock, flags); 283 spin_unlock_irqrestore(&gameport_event_lock, flags);
284 } 284 }
285 285
286 static void gameport_free_event(struct gameport_event *event) 286 static void gameport_free_event(struct gameport_event *event)
287 { 287 {
288 module_put(event->owner); 288 module_put(event->owner);
289 kfree(event); 289 kfree(event);
290 } 290 }
291 291
292 static void gameport_remove_duplicate_events(struct gameport_event *event) 292 static void gameport_remove_duplicate_events(struct gameport_event *event)
293 { 293 {
294 struct list_head *node, *next; 294 struct list_head *node, *next;
295 struct gameport_event *e; 295 struct gameport_event *e;
296 unsigned long flags; 296 unsigned long flags;
297 297
298 spin_lock_irqsave(&gameport_event_lock, flags); 298 spin_lock_irqsave(&gameport_event_lock, flags);
299 299
300 list_for_each_safe(node, next, &gameport_event_list) { 300 list_for_each_safe(node, next, &gameport_event_list) {
301 e = list_entry(node, struct gameport_event, node); 301 e = list_entry(node, struct gameport_event, node);
302 if (event->object == e->object) { 302 if (event->object == e->object) {
303 /* 303 /*
304 * If this event is of a different type we should not 304 * If this event is of a different type we should not
305 * look further - we only suppress duplicate events 305 * look further - we only suppress duplicate events
306 * that were sent back-to-back. 306 * that were sent back-to-back.
307 */ 307 */
308 if (event->type != e->type) 308 if (event->type != e->type)
309 break; 309 break;
310 310
311 list_del_init(node); 311 list_del_init(node);
312 gameport_free_event(e); 312 gameport_free_event(e);
313 } 313 }
314 } 314 }
315 315
316 spin_unlock_irqrestore(&gameport_event_lock, flags); 316 spin_unlock_irqrestore(&gameport_event_lock, flags);
317 } 317 }
318 318
319 319
320 static struct gameport_event *gameport_get_event(void) 320 static struct gameport_event *gameport_get_event(void)
321 { 321 {
322 struct gameport_event *event; 322 struct gameport_event *event;
323 struct list_head *node; 323 struct list_head *node;
324 unsigned long flags; 324 unsigned long flags;
325 325
326 spin_lock_irqsave(&gameport_event_lock, flags); 326 spin_lock_irqsave(&gameport_event_lock, flags);
327 327
328 if (list_empty(&gameport_event_list)) { 328 if (list_empty(&gameport_event_list)) {
329 spin_unlock_irqrestore(&gameport_event_lock, flags); 329 spin_unlock_irqrestore(&gameport_event_lock, flags);
330 return NULL; 330 return NULL;
331 } 331 }
332 332
333 node = gameport_event_list.next; 333 node = gameport_event_list.next;
334 event = list_entry(node, struct gameport_event, node); 334 event = list_entry(node, struct gameport_event, node);
335 list_del_init(node); 335 list_del_init(node);
336 336
337 spin_unlock_irqrestore(&gameport_event_lock, flags); 337 spin_unlock_irqrestore(&gameport_event_lock, flags);
338 338
339 return event; 339 return event;
340 } 340 }
341 341
342 static void gameport_handle_events(void) 342 static void gameport_handle_events(void)
343 { 343 {
344 struct gameport_event *event; 344 struct gameport_event *event;
345 struct gameport_driver *gameport_drv; 345 struct gameport_driver *gameport_drv;
346 346
347 down(&gameport_sem); 347 down(&gameport_sem);
348 348
349 while ((event = gameport_get_event())) { 349 while ((event = gameport_get_event())) {
350 350
351 switch (event->type) { 351 switch (event->type) {
352 case GAMEPORT_REGISTER_PORT: 352 case GAMEPORT_REGISTER_PORT:
353 gameport_add_port(event->object); 353 gameport_add_port(event->object);
354 break; 354 break;
355 355
356 case GAMEPORT_RECONNECT: 356 case GAMEPORT_RECONNECT:
357 gameport_reconnect_port(event->object); 357 gameport_reconnect_port(event->object);
358 break; 358 break;
359 359
360 case GAMEPORT_RESCAN: 360 case GAMEPORT_RESCAN:
361 gameport_disconnect_port(event->object); 361 gameport_disconnect_port(event->object);
362 gameport_find_driver(event->object); 362 gameport_find_driver(event->object);
363 break; 363 break;
364 364
365 case GAMEPORT_REGISTER_DRIVER: 365 case GAMEPORT_REGISTER_DRIVER:
366 gameport_drv = event->object; 366 gameport_drv = event->object;
367 driver_register(&gameport_drv->driver); 367 driver_register(&gameport_drv->driver);
368 break; 368 break;
369 369
370 default: 370 default:
371 break; 371 break;
372 } 372 }
373 373
374 gameport_remove_duplicate_events(event); 374 gameport_remove_duplicate_events(event);
375 gameport_free_event(event); 375 gameport_free_event(event);
376 } 376 }
377 377
378 up(&gameport_sem); 378 up(&gameport_sem);
379 } 379 }
380 380
381 /* 381 /*
382 * Remove all events that have been submitted for a given gameport port. 382 * Remove all events that have been submitted for a given gameport port.
383 */ 383 */
384 static void gameport_remove_pending_events(struct gameport *gameport) 384 static void gameport_remove_pending_events(struct gameport *gameport)
385 { 385 {
386 struct list_head *node, *next; 386 struct list_head *node, *next;
387 struct gameport_event *event; 387 struct gameport_event *event;
388 unsigned long flags; 388 unsigned long flags;
389 389
390 spin_lock_irqsave(&gameport_event_lock, flags); 390 spin_lock_irqsave(&gameport_event_lock, flags);
391 391
392 list_for_each_safe(node, next, &gameport_event_list) { 392 list_for_each_safe(node, next, &gameport_event_list) {
393 event = list_entry(node, struct gameport_event, node); 393 event = list_entry(node, struct gameport_event, node);
394 if (event->object == gameport) { 394 if (event->object == gameport) {
395 list_del_init(node); 395 list_del_init(node);
396 gameport_free_event(event); 396 gameport_free_event(event);
397 } 397 }
398 } 398 }
399 399
400 spin_unlock_irqrestore(&gameport_event_lock, flags); 400 spin_unlock_irqrestore(&gameport_event_lock, flags);
401 } 401 }
402 402
403 /* 403 /*
404 * Destroy child gameport port (if any) that has not been fully registered yet. 404 * Destroy child gameport port (if any) that has not been fully registered yet.
405 * 405 *
406 * Note that we rely on the fact that a port can have only one child and therefore 406 * Note that we rely on the fact that a port can have only one child and therefore
407 * only one child registration request can be pending. Additionally, children 407 * only one child registration request can be pending. Additionally, children
408 * are registered by driver's connect() handler so there can't be a grandchild 408 * are registered by driver's connect() handler so there can't be a grandchild
409 * pending registration together with a child. 409 * pending registration together with a child.
410 */ 410 */
411 static struct gameport *gameport_get_pending_child(struct gameport *parent) 411 static struct gameport *gameport_get_pending_child(struct gameport *parent)
412 { 412 {
413 struct gameport_event *event; 413 struct gameport_event *event;
414 struct gameport *gameport, *child = NULL; 414 struct gameport *gameport, *child = NULL;
415 unsigned long flags; 415 unsigned long flags;
416 416
417 spin_lock_irqsave(&gameport_event_lock, flags); 417 spin_lock_irqsave(&gameport_event_lock, flags);
418 418
419 list_for_each_entry(event, &gameport_event_list, node) { 419 list_for_each_entry(event, &gameport_event_list, node) {
420 if (event->type == GAMEPORT_REGISTER_PORT) { 420 if (event->type == GAMEPORT_REGISTER_PORT) {
421 gameport = event->object; 421 gameport = event->object;
422 if (gameport->parent == parent) { 422 if (gameport->parent == parent) {
423 child = gameport; 423 child = gameport;
424 break; 424 break;
425 } 425 }
426 } 426 }
427 } 427 }
428 428
429 spin_unlock_irqrestore(&gameport_event_lock, flags); 429 spin_unlock_irqrestore(&gameport_event_lock, flags);
430 return child; 430 return child;
431 } 431 }
432 432
433 static int gameport_thread(void *nothing) 433 static int gameport_thread(void *nothing)
434 { 434 {
435 lock_kernel(); 435 lock_kernel();
436 daemonize("kgameportd"); 436 daemonize("kgameportd");
437 allow_signal(SIGTERM); 437 allow_signal(SIGTERM);
438 438
439 do { 439 do {
440 gameport_handle_events(); 440 gameport_handle_events();
441 wait_event_interruptible(gameport_wait, !list_empty(&gameport_event_list)); 441 wait_event_interruptible(gameport_wait, !list_empty(&gameport_event_list));
442 try_to_freeze(PF_FREEZE); 442 try_to_freeze(PF_FREEZE);
443 } while (!signal_pending(current)); 443 } while (!signal_pending(current));
444 444
445 printk(KERN_DEBUG "gameport: kgameportd exiting\n"); 445 printk(KERN_DEBUG "gameport: kgameportd exiting\n");
446 446
447 unlock_kernel(); 447 unlock_kernel();
448 complete_and_exit(&gameport_exited, 0); 448 complete_and_exit(&gameport_exited, 0);
449 } 449 }
450 450
451 451
452 /* 452 /*
453 * Gameport port operations 453 * Gameport port operations
454 */ 454 */
455 455
456 static ssize_t gameport_show_description(struct device *dev, struct device_attribute *attr, char *buf) 456 static ssize_t gameport_show_description(struct device *dev, struct device_attribute *attr, char *buf)
457 { 457 {
458 struct gameport *gameport = to_gameport_port(dev); 458 struct gameport *gameport = to_gameport_port(dev);
459 return sprintf(buf, "%s\n", gameport->name); 459 return sprintf(buf, "%s\n", gameport->name);
460 } 460 }
461 461
462 static ssize_t gameport_rebind_driver(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) 462 static ssize_t gameport_rebind_driver(struct device *dev, struct device_attribute *attr, const char *buf, size_t count)
463 { 463 {
464 struct gameport *gameport = to_gameport_port(dev); 464 struct gameport *gameport = to_gameport_port(dev);
465 struct device_driver *drv; 465 struct device_driver *drv;
466 int retval; 466 int retval;
467 467
468 retval = down_interruptible(&gameport_sem); 468 retval = down_interruptible(&gameport_sem);
469 if (retval) 469 if (retval)
470 return retval; 470 return retval;
471 471
472 retval = count; 472 retval = count;
473 if (!strncmp(buf, "none", count)) { 473 if (!strncmp(buf, "none", count)) {
474 gameport_disconnect_port(gameport); 474 gameport_disconnect_port(gameport);
475 } else if (!strncmp(buf, "reconnect", count)) { 475 } else if (!strncmp(buf, "reconnect", count)) {
476 gameport_reconnect_port(gameport); 476 gameport_reconnect_port(gameport);
477 } else if (!strncmp(buf, "rescan", count)) { 477 } else if (!strncmp(buf, "rescan", count)) {
478 gameport_disconnect_port(gameport); 478 gameport_disconnect_port(gameport);
479 gameport_find_driver(gameport); 479 gameport_find_driver(gameport);
480 } else if ((drv = driver_find(buf, &gameport_bus)) != NULL) { 480 } else if ((drv = driver_find(buf, &gameport_bus)) != NULL) {
481 gameport_disconnect_port(gameport); 481 gameport_disconnect_port(gameport);
482 gameport_bind_driver(gameport, to_gameport_driver(drv)); 482 gameport_bind_driver(gameport, to_gameport_driver(drv));
483 put_driver(drv); 483 put_driver(drv);
484 } else { 484 } else {
485 retval = -EINVAL; 485 retval = -EINVAL;
486 } 486 }
487 487
488 up(&gameport_sem); 488 up(&gameport_sem);
489 489
490 return retval; 490 return retval;
491 } 491 }
492 492
493 static struct device_attribute gameport_device_attrs[] = { 493 static struct device_attribute gameport_device_attrs[] = {
494 __ATTR(description, S_IRUGO, gameport_show_description, NULL), 494 __ATTR(description, S_IRUGO, gameport_show_description, NULL),
495 __ATTR(drvctl, S_IWUSR, NULL, gameport_rebind_driver), 495 __ATTR(drvctl, S_IWUSR, NULL, gameport_rebind_driver),
496 __ATTR_NULL 496 __ATTR_NULL
497 }; 497 };
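The drvctl attribute above is what makes gameport_rebind_driver() reachable from user space. A rough user-space illustration follows; the sysfs path is an assumption pieced together from the bus name and the "gameport%lu" bus_id generated later in gameport_init_port(), so the actual device name will vary per system.

#include <stdio.h>

/* Assumed path; the real device name depends on registration order. */
#define DRVCTL "/sys/bus/gameport/devices/gameport0/drvctl"

int main(void)
{
	FILE *f = fopen(DRVCTL, "w");

	if (!f) {
		perror(DRVCTL);
		return 1;
	}
	/* gameport_rebind_driver() accepts "none", "reconnect", "rescan"
	 * or the name of a registered gameport driver. */
	fputs("rescan", f);
	return fclose(f) ? 1 : 0;
}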
498 498
499 static void gameport_release_port(struct device *dev) 499 static void gameport_release_port(struct device *dev)
500 { 500 {
501 struct gameport *gameport = to_gameport_port(dev); 501 struct gameport *gameport = to_gameport_port(dev);
502 502
503 kfree(gameport); 503 kfree(gameport);
504 module_put(THIS_MODULE); 504 module_put(THIS_MODULE);
505 } 505 }
506 506
507 void gameport_set_phys(struct gameport *gameport, const char *fmt, ...) 507 void gameport_set_phys(struct gameport *gameport, const char *fmt, ...)
508 { 508 {
509 va_list args; 509 va_list args;
510 510
511 va_start(args, fmt); 511 va_start(args, fmt);
512 vsnprintf(gameport->phys, sizeof(gameport->phys), fmt, args); 512 vsnprintf(gameport->phys, sizeof(gameport->phys), fmt, args);
513 va_end(args); 513 va_end(args);
514 } 514 }
515 515
516 /* 516 /*
517 * Prepare gameport port for registration. 517 * Prepare gameport port for registration.
518 */ 518 */
519 static void gameport_init_port(struct gameport *gameport) 519 static void gameport_init_port(struct gameport *gameport)
520 { 520 {
521 static atomic_t gameport_no = ATOMIC_INIT(0); 521 static atomic_t gameport_no = ATOMIC_INIT(0);
522 522
523 __module_get(THIS_MODULE); 523 __module_get(THIS_MODULE);
524 524
525 init_MUTEX(&gameport->drv_sem); 525 init_MUTEX(&gameport->drv_sem);
526 device_initialize(&gameport->dev); 526 device_initialize(&gameport->dev);
527 snprintf(gameport->dev.bus_id, sizeof(gameport->dev.bus_id), 527 snprintf(gameport->dev.bus_id, sizeof(gameport->dev.bus_id),
528 "gameport%lu", (unsigned long)atomic_inc_return(&gameport_no) - 1); 528 "gameport%lu", (unsigned long)atomic_inc_return(&gameport_no) - 1);
529 gameport->dev.bus = &gameport_bus; 529 gameport->dev.bus = &gameport_bus;
530 gameport->dev.release = gameport_release_port; 530 gameport->dev.release = gameport_release_port;
531 if (gameport->parent) 531 if (gameport->parent)
532 gameport->dev.parent = &gameport->parent->dev; 532 gameport->dev.parent = &gameport->parent->dev;
533 533
534 spin_lock_init(&gameport->timer_lock); 534 spin_lock_init(&gameport->timer_lock);
535 init_timer(&gameport->poll_timer); 535 init_timer(&gameport->poll_timer);
536 gameport->poll_timer.function = gameport_run_poll_handler; 536 gameport->poll_timer.function = gameport_run_poll_handler;
537 gameport->poll_timer.data = (unsigned long)gameport; 537 gameport->poll_timer.data = (unsigned long)gameport;
538 } 538 }
539 539
540 /* 540 /*
541 * Complete gameport port registration. 541 * Complete gameport port registration.
542 * Driver core will attempt to find appropriate driver for the port. 542 * Driver core will attempt to find appropriate driver for the port.
543 */ 543 */
544 static void gameport_add_port(struct gameport *gameport) 544 static void gameport_add_port(struct gameport *gameport)
545 { 545 {
546 if (gameport->parent) 546 if (gameport->parent)
547 gameport->parent->child = gameport; 547 gameport->parent->child = gameport;
548 548
549 gameport->speed = gameport_measure_speed(gameport); 549 gameport->speed = gameport_measure_speed(gameport);
550 550
551 list_add_tail(&gameport->node, &gameport_list); 551 list_add_tail(&gameport->node, &gameport_list);
552 552
553 if (gameport->io) 553 if (gameport->io)
554 printk(KERN_INFO "gameport: %s is %s, io %#x, speed %dkHz\n", 554 printk(KERN_INFO "gameport: %s is %s, io %#x, speed %dkHz\n",
555 gameport->name, gameport->phys, gameport->io, gameport->speed); 555 gameport->name, gameport->phys, gameport->io, gameport->speed);
556 else 556 else
557 printk(KERN_INFO "gameport: %s is %s, speed %dkHz\n", 557 printk(KERN_INFO "gameport: %s is %s, speed %dkHz\n",
558 gameport->name, gameport->phys, gameport->speed); 558 gameport->name, gameport->phys, gameport->speed);
559 559
560 device_add(&gameport->dev); 560 device_add(&gameport->dev);
561 gameport->registered = 1; 561 gameport->registered = 1;
562 } 562 }
563 563
564 /* 564 /*
565 * gameport_destroy_port() completes deregistration process and removes 565 * gameport_destroy_port() completes deregistration process and removes
566 * port from the system 566 * port from the system
567 */ 567 */
568 static void gameport_destroy_port(struct gameport *gameport) 568 static void gameport_destroy_port(struct gameport *gameport)
569 { 569 {
570 struct gameport *child; 570 struct gameport *child;
571 571
572 child = gameport_get_pending_child(gameport); 572 child = gameport_get_pending_child(gameport);
573 if (child) { 573 if (child) {
574 gameport_remove_pending_events(child); 574 gameport_remove_pending_events(child);
575 put_device(&child->dev); 575 put_device(&child->dev);
576 } 576 }
577 577
578 if (gameport->parent) { 578 if (gameport->parent) {
579 gameport->parent->child = NULL; 579 gameport->parent->child = NULL;
580 gameport->parent = NULL; 580 gameport->parent = NULL;
581 } 581 }
582 582
583 if (gameport->registered) { 583 if (gameport->registered) {
584 device_del(&gameport->dev); 584 device_del(&gameport->dev);
585 list_del_init(&gameport->node); 585 list_del_init(&gameport->node);
586 gameport->registered = 0; 586 gameport->registered = 0;
587 } 587 }
588 588
589 gameport_remove_pending_events(gameport); 589 gameport_remove_pending_events(gameport);
590 put_device(&gameport->dev); 590 put_device(&gameport->dev);
591 } 591 }
592 592
593 /* 593 /*
594 * Reconnect gameport port and all its children (re-initialize attached devices) 594 * Reconnect gameport port and all its children (re-initialize attached devices)
595 */ 595 */
596 static void gameport_reconnect_port(struct gameport *gameport) 596 static void gameport_reconnect_port(struct gameport *gameport)
597 { 597 {
598 do { 598 do {
599 if (!gameport->drv || !gameport->drv->reconnect || gameport->drv->reconnect(gameport)) { 599 if (!gameport->drv || !gameport->drv->reconnect || gameport->drv->reconnect(gameport)) {
600 gameport_disconnect_port(gameport); 600 gameport_disconnect_port(gameport);
601 gameport_find_driver(gameport); 601 gameport_find_driver(gameport);
602 /* Ok, old children are now gone, we are done */ 602 /* Ok, old children are now gone, we are done */
603 break; 603 break;
604 } 604 }
605 gameport = gameport->child; 605 gameport = gameport->child;
606 } while (gameport); 606 } while (gameport);
607 } 607 }
608 608
609 /* 609 /*
610 * gameport_disconnect_port() unbinds a port from its driver. As a side effect 610 * gameport_disconnect_port() unbinds a port from its driver. As a side effect
611 * all child ports are unbound and destroyed. 611 * all child ports are unbound and destroyed.
612 */ 612 */
613 static void gameport_disconnect_port(struct gameport *gameport) 613 static void gameport_disconnect_port(struct gameport *gameport)
614 { 614 {
615 struct gameport *s, *parent; 615 struct gameport *s, *parent;
616 616
617 if (gameport->child) { 617 if (gameport->child) {
618 /* 618 /*
619 * Child ports should be disconnected and destroyed 619 * Child ports should be disconnected and destroyed
620 * first, starting with the leaf one, since we don't 620 * first, starting with the leaf one, since we don't
621 * want to do recursion 621 * want to do recursion
622 */ 622 */
623 for (s = gameport; s->child; s = s->child) 623 for (s = gameport; s->child; s = s->child)
624 /* empty */; 624 /* empty */;
625 625
626 do { 626 do {
627 parent = s->parent; 627 parent = s->parent;
628 628
629 gameport_release_driver(s); 629 gameport_release_driver(s);
630 gameport_destroy_port(s); 630 gameport_destroy_port(s);
631 } while ((s = parent) != gameport); 631 } while ((s = parent) != gameport);
632 } 632 }
633 633
634 /* 634 /*
635 * Ok, no children left, now disconnect this port 635 * Ok, no children left, now disconnect this port
636 */ 636 */
637 gameport_release_driver(gameport); 637 gameport_release_driver(gameport);
638 } 638 }
639 639
640 void gameport_rescan(struct gameport *gameport) 640 void gameport_rescan(struct gameport *gameport)
641 { 641 {
642 gameport_queue_event(gameport, NULL, GAMEPORT_RESCAN); 642 gameport_queue_event(gameport, NULL, GAMEPORT_RESCAN);
643 } 643 }
644 644
645 void gameport_reconnect(struct gameport *gameport) 645 void gameport_reconnect(struct gameport *gameport)
646 { 646 {
647 gameport_queue_event(gameport, NULL, GAMEPORT_RECONNECT); 647 gameport_queue_event(gameport, NULL, GAMEPORT_RECONNECT);
648 } 648 }
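The two wrappers above are the entry points a port-owning driver normally uses to feed the kgameportd event queue. Below is a minimal, hypothetical sketch of how that might look; everything named example_* is invented, and only gameport_rescan()/gameport_reconnect() come from this file. Because gameport_queue_event() collapses back-to-back events of the same type for the same port, requesting a rescan twice before kgameportd wakes up still leaves a single GAMEPORT_RESCAN queued.

#include <linux/gameport.h>

/*
 * Sketch only: the hardware noticed that the attached device changed,
 * so ask kgameportd to rebind the port asynchronously.
 */
static void example_device_changed(struct gameport *gameport, int full_rescan)
{
	if (full_rescan)
		gameport_rescan(gameport);	/* disconnect, then search for a driver */
	else
		gameport_reconnect(gameport);	/* let the bound driver re-init the device */
}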
649 649
650 /* 650 /*
651 * Submits register request to kgameportd for subsequent execution. 651 * Submits register request to kgameportd for subsequent execution.
652 * Note that port registration is always asynchronous. 652 * Note that port registration is always asynchronous.
653 */ 653 */
654 void __gameport_register_port(struct gameport *gameport, struct module *owner) 654 void __gameport_register_port(struct gameport *gameport, struct module *owner)
655 { 655 {
656 gameport_init_port(gameport); 656 gameport_init_port(gameport);
657 gameport_queue_event(gameport, owner, GAMEPORT_REGISTER_PORT); 657 gameport_queue_event(gameport, owner, GAMEPORT_REGISTER_PORT);
658 } 658 }
659 659
660 /* 660 /*
661 * Synchronously unregisters gameport port. 661 * Synchronously unregisters gameport port.
662 */ 662 */
663 void gameport_unregister_port(struct gameport *gameport) 663 void gameport_unregister_port(struct gameport *gameport)
664 { 664 {
665 down(&gameport_sem); 665 down(&gameport_sem);
666 gameport_disconnect_port(gameport); 666 gameport_disconnect_port(gameport);
667 gameport_destroy_port(gameport); 667 gameport_destroy_port(gameport);
668 up(&gameport_sem); 668 up(&gameport_sem);
669 } 669 }
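For the driver that owns the hardware, the pair above is the whole port lifecycle: describe the port, hand it to kgameportd, and tear it down synchronously on removal. The sketch below is hypothetical (every example_* name is invented, the field layout is assumed to match include/linux/gameport.h, and the phys string is arbitrary); in-tree callers may use a wrapper around __gameport_register_port() instead of calling it directly.

#include <linux/gameport.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/string.h>

/* Sketch: allocate, describe and register a port.  Registration is
 * asynchronous and completes in kgameportd via gameport_add_port(). */
static struct gameport *example_create_port(int io)
{
	struct gameport *gp;

	gp = kcalloc(1, sizeof(struct gameport), GFP_KERNEL);
	if (!gp)
		return NULL;

	strlcpy(gp->name, "Example gameport", sizeof(gp->name));
	gameport_set_phys(gp, "example/gameport%d", 0);
	gp->io = io;

	__gameport_register_port(gp, THIS_MODULE);
	return gp;
}

/* Sketch: synchronous teardown on module removal.  The final
 * put_device() in gameport_destroy_port() ends up in
 * gameport_release_port(), which kfree()s the structure, so the
 * caller must not free it again. */
static void example_destroy_port(struct gameport *gp)
{
	gameport_unregister_port(gp);
}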
670 670
671 671
672 /* 672 /*
673 * Gameport driver operations 673 * Gameport driver operations
674 */ 674 */
675 675
676 static ssize_t gameport_driver_show_description(struct device_driver *drv, char *buf) 676 static ssize_t gameport_driver_show_description(struct device_driver *drv, char *buf)
677 { 677 {
678 struct gameport_driver *driver = to_gameport_driver(drv); 678 struct gameport_driver *driver = to_gameport_driver(drv);
679 return sprintf(buf, "%s\n", driver->description ? driver->description : "(none)"); 679 return sprintf(buf, "%s\n", driver->description ? driver->description : "(none)");
680 } 680 }
681 681
682 static struct driver_attribute gameport_driver_attrs[] = { 682 static struct driver_attribute gameport_driver_attrs[] = {
683 __ATTR(description, S_IRUGO, gameport_driver_show_description, NULL), 683 __ATTR(description, S_IRUGO, gameport_driver_show_description, NULL),
684 __ATTR_NULL 684 __ATTR_NULL
685 }; 685 };
686 686
687 static int gameport_driver_probe(struct device *dev) 687 static int gameport_driver_probe(struct device *dev)
688 { 688 {
689 struct gameport *gameport = to_gameport_port(dev); 689 struct gameport *gameport = to_gameport_port(dev);
690 struct gameport_driver *drv = to_gameport_driver(dev->driver); 690 struct gameport_driver *drv = to_gameport_driver(dev->driver);
691 691
692 drv->connect(gameport, drv); 692 drv->connect(gameport, drv);
693 return gameport->drv ? 0 : -ENODEV; 693 return gameport->drv ? 0 : -ENODEV;
694 } 694 }
695 695
696 static int gameport_driver_remove(struct device *dev) 696 static int gameport_driver_remove(struct device *dev)
697 { 697 {
698 struct gameport *gameport = to_gameport_port(dev); 698 struct gameport *gameport = to_gameport_port(dev);
699 struct gameport_driver *drv = to_gameport_driver(dev->driver); 699 struct gameport_driver *drv = to_gameport_driver(dev->driver);
700 700
701 drv->disconnect(gameport); 701 drv->disconnect(gameport);
702 return 0; 702 return 0;
703 } 703 }
704 704
705 void __gameport_register_driver(struct gameport_driver *drv, struct module *owner) 705 void __gameport_register_driver(struct gameport_driver *drv, struct module *owner)
706 { 706 {
707 drv->driver.bus = &gameport_bus; 707 drv->driver.bus = &gameport_bus;
708 drv->driver.probe = gameport_driver_probe; 708 drv->driver.probe = gameport_driver_probe;
709 drv->driver.remove = gameport_driver_remove; 709 drv->driver.remove = gameport_driver_remove;
710 gameport_queue_event(drv, owner, GAMEPORT_REGISTER_DRIVER); 710 gameport_queue_event(drv, owner, GAMEPORT_REGISTER_DRIVER);
711 } 711 }
712 712
713 void gameport_unregister_driver(struct gameport_driver *drv) 713 void gameport_unregister_driver(struct gameport_driver *drv)
714 { 714 {
715 struct gameport *gameport; 715 struct gameport *gameport;
716 716
717 down(&gameport_sem); 717 down(&gameport_sem);
718 drv->ignore = 1; /* so gameport_find_driver ignores it */ 718 drv->ignore = 1; /* so gameport_find_driver ignores it */
719 719
720 start_over: 720 start_over:
721 list_for_each_entry(gameport, &gameport_list, node) { 721 list_for_each_entry(gameport, &gameport_list, node) {
722 if (gameport->drv == drv) { 722 if (gameport->drv == drv) {
723 gameport_disconnect_port(gameport); 723 gameport_disconnect_port(gameport);
724 gameport_find_driver(gameport); 724 gameport_find_driver(gameport);
725 /* we could've deleted some ports, restart */ 725 /* we could've deleted some ports, restart */
726 goto start_over; 726 goto start_over;
727 } 727 }
728 } 728 }
729 729
730 driver_unregister(&drv->driver); 730 driver_unregister(&drv->driver);
731 up(&gameport_sem); 731 up(&gameport_sem);
732 } 732 }
733 733
734 static int gameport_bus_match(struct device *dev, struct device_driver *drv) 734 static int gameport_bus_match(struct device *dev, struct device_driver *drv)
735 { 735 {
736 struct gameport_driver *gameport_drv = to_gameport_driver(drv); 736 struct gameport_driver *gameport_drv = to_gameport_driver(drv);
737 737
738 return !gameport_drv->ignore; 738 return !gameport_drv->ignore;
739 } 739 }
740 740
741 static void gameport_set_drv(struct gameport *gameport, struct gameport_driver *drv) 741 static void gameport_set_drv(struct gameport *gameport, struct gameport_driver *drv)
742 { 742 {
743 down(&gameport->drv_sem); 743 down(&gameport->drv_sem);
744 gameport->drv = drv; 744 gameport->drv = drv;
745 up(&gameport->drv_sem); 745 up(&gameport->drv_sem);
746 } 746 }
747 747
748 int gameport_open(struct gameport *gameport, struct gameport_driver *drv, int mode) 748 int gameport_open(struct gameport *gameport, struct gameport_driver *drv, int mode)
749 { 749 {
750 750
751 if (gameport->open) { 751 if (gameport->open) {
752 if (gameport->open(gameport, mode)) { 752 if (gameport->open(gameport, mode)) {
753 return -1; 753 return -1;
754 } 754 }
755 } else { 755 } else {
756 if (mode != GAMEPORT_MODE_RAW) 756 if (mode != GAMEPORT_MODE_RAW)
757 return -1; 757 return -1;
758 } 758 }
759 759
760 gameport_set_drv(gameport, drv); 760 gameport_set_drv(gameport, drv);
761 return 0; 761 return 0;
762 } 762 }
763 763
764 void gameport_close(struct gameport *gameport) 764 void gameport_close(struct gameport *gameport)
765 { 765 {
766 del_timer_sync(&gameport->poll_timer); 766 del_timer_sync(&gameport->poll_timer);
767 gameport->poll_handler = NULL; 767 gameport->poll_handler = NULL;
768 gameport->poll_interval = 0; 768 gameport->poll_interval = 0;
769 gameport_set_drv(gameport, NULL); 769 gameport_set_drv(gameport, NULL);
770 if (gameport->close) 770 if (gameport->close)
771 gameport->close(gameport); 771 gameport->close(gameport);
772 } 772 }
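On the consuming side, a joystick driver registers a gameport_driver and pairs gameport_open()/gameport_close() in its connect()/disconnect() handlers; gameport_driver_probe() above then reports success only if connect() actually bound the port. The skeleton below is a hedged sketch: all example_* names are invented and the exact callback prototypes should be taken from include/linux/gameport.h.

#include <linux/gameport.h>
#include <linux/init.h>
#include <linux/module.h>

/* Sketch of connect(): claim the port so that gameport->drv is set
 * and gameport_driver_probe() returns 0. */
static int example_connect(struct gameport *gameport, struct gameport_driver *drv)
{
	if (gameport_open(gameport, drv, GAMEPORT_MODE_RAW))
		return -ENODEV;

	/* ... probe the hardware, set up and register an input device ... */
	return 0;
}

static void example_disconnect(struct gameport *gameport)
{
	/* ... unregister the input device ... */
	gameport_close(gameport);
}

static struct gameport_driver example_drv = {
	.driver		= {
		.name	= "example",
	},
	.description	= "Example gameport driver",
	.connect	= example_connect,
	.disconnect	= example_disconnect,
};

static int __init example_init(void)
{
	__gameport_register_driver(&example_drv, THIS_MODULE);
	return 0;
}

static void __exit example_exit(void)
{
	gameport_unregister_driver(&example_drv);
}

module_init(example_init);
module_exit(example_exit);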
773 773
774 static int __init gameport_init(void) 774 static int __init gameport_init(void)
775 { 775 {
776 if (!(gameport_pid = kernel_thread(gameport_thread, NULL, CLONE_KERNEL))) { 776 if (!(gameport_pid = kernel_thread(gameport_thread, NULL, CLONE_KERNEL))) {
777 printk(KERN_ERR "gameport: Failed to start kgameportd\n"); 777 printk(KERN_ERR "gameport: Failed to start kgameportd\n");
778 return -1; 778 return -1;
779 } 779 }
780 780
781 gameport_bus.dev_attrs = gameport_device_attrs; 781 gameport_bus.dev_attrs = gameport_device_attrs;
782 gameport_bus.drv_attrs = gameport_driver_attrs; 782 gameport_bus.drv_attrs = gameport_driver_attrs;
783 gameport_bus.match = gameport_bus_match; 783 gameport_bus.match = gameport_bus_match;
784 bus_register(&gameport_bus); 784 bus_register(&gameport_bus);
785 785
786 return 0; 786 return 0;
787 } 787 }
788 788
789 static void __exit gameport_exit(void) 789 static void __exit gameport_exit(void)
790 { 790 {
791 bus_unregister(&gameport_bus); 791 bus_unregister(&gameport_bus);
792 kill_proc(gameport_pid, SIGTERM, 1); 792 kill_proc(gameport_pid, SIGTERM, 1);
793 wait_for_completion(&gameport_exited); 793 wait_for_completion(&gameport_exited);
794 } 794 }
795 795
796 module_init(gameport_init); 796 module_init(gameport_init);
797 module_exit(gameport_exit); 797 module_exit(gameport_exit);
798 798
drivers/oprofile/buffer_sync.c
1 /** 1 /**
2 * @file buffer_sync.c 2 * @file buffer_sync.c
3 * 3 *
4 * @remark Copyright 2002 OProfile authors 4 * @remark Copyright 2002 OProfile authors
5 * @remark Read the file COPYING 5 * @remark Read the file COPYING
6 * 6 *
7 * @author John Levon <levon@movementarian.org> 7 * @author John Levon <levon@movementarian.org>
8 * 8 *
9 * This is the core of the buffer management. Each 9 * This is the core of the buffer management. Each
10 * CPU buffer is processed and entered into the 10 * CPU buffer is processed and entered into the
11 * global event buffer. Such processing is necessary 11 * global event buffer. Such processing is necessary
12 * in several circumstances, mentioned below. 12 * in several circumstances, mentioned below.
13 * 13 *
14 * The processing does the job of converting the 14 * The processing does the job of converting the
15 * transitory EIP value into a persistent dentry/offset 15 * transitory EIP value into a persistent dentry/offset
16 * value that the profiler can record at its leisure. 16 * value that the profiler can record at its leisure.
17 * 17 *
18 * See fs/dcookies.c for a description of the dentry/offset 18 * See fs/dcookies.c for a description of the dentry/offset
19 * objects. 19 * objects.
20 */ 20 */
21 21
22 #include <linux/mm.h> 22 #include <linux/mm.h>
23 #include <linux/workqueue.h> 23 #include <linux/workqueue.h>
24 #include <linux/notifier.h> 24 #include <linux/notifier.h>
25 #include <linux/dcookies.h> 25 #include <linux/dcookies.h>
26 #include <linux/profile.h> 26 #include <linux/profile.h>
27 #include <linux/module.h> 27 #include <linux/module.h>
28 #include <linux/fs.h> 28 #include <linux/fs.h>
29 29
30 #include "oprofile_stats.h" 30 #include "oprofile_stats.h"
31 #include "event_buffer.h" 31 #include "event_buffer.h"
32 #include "cpu_buffer.h" 32 #include "cpu_buffer.h"
33 #include "buffer_sync.h" 33 #include "buffer_sync.h"
34 34
35 static LIST_HEAD(dying_tasks); 35 static LIST_HEAD(dying_tasks);
36 static LIST_HEAD(dead_tasks); 36 static LIST_HEAD(dead_tasks);
37 static cpumask_t marked_cpus = CPU_MASK_NONE; 37 static cpumask_t marked_cpus = CPU_MASK_NONE;
38 static DEFINE_SPINLOCK(task_mortuary); 38 static DEFINE_SPINLOCK(task_mortuary);
39 static void process_task_mortuary(void); 39 static void process_task_mortuary(void);
40 40
41 41
42 /* Take ownership of the task struct and place it on the 42 /* Take ownership of the task struct and place it on the
43 * list for processing. Only after two full buffer syncs 43 * list for processing. Only after two full buffer syncs
44 * does the task eventually get freed, because by then 44 * does the task eventually get freed, because by then
45 * we are sure we will not reference it again. 45 * we are sure we will not reference it again.
46 */ 46 */
47 static int task_free_notify(struct notifier_block * self, unsigned long val, void * data) 47 static int task_free_notify(struct notifier_block * self, unsigned long val, void * data)
48 { 48 {
49 struct task_struct * task = data; 49 struct task_struct * task = data;
50 spin_lock(&task_mortuary); 50 spin_lock(&task_mortuary);
51 list_add(&task->tasks, &dying_tasks); 51 list_add(&task->tasks, &dying_tasks);
52 spin_unlock(&task_mortuary); 52 spin_unlock(&task_mortuary);
53 return NOTIFY_OK; 53 return NOTIFY_OK;
54 } 54 }
55 55
56 56
57 /* The task is on its way out. A sync of the buffer means we can catch 57 /* The task is on its way out. A sync of the buffer means we can catch
58 * any remaining samples for this task. 58 * any remaining samples for this task.
59 */ 59 */
60 static int task_exit_notify(struct notifier_block * self, unsigned long val, void * data) 60 static int task_exit_notify(struct notifier_block * self, unsigned long val, void * data)
61 { 61 {
62 /* To avoid latency problems, we only process the current CPU, 62 /* To avoid latency problems, we only process the current CPU,
63 * hoping that most samples for the task are on this CPU 63 * hoping that most samples for the task are on this CPU
64 */ 64 */
65 sync_buffer(_smp_processor_id()); 65 sync_buffer(raw_smp_processor_id());
66 return 0; 66 return 0;
67 } 67 }
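This is one of the two call sites in this file that the patch switches over to raw_smp_processor_id(). The CPU number here is only a hint (as the comment says, we merely hope most of the task's samples are on this CPU, and a stale value just leaves some samples for a later sync), so no preemption pinning is needed. For contrast, a hedged sketch of the pinned pattern used when code really must stay on the CPU whose data it touches:

#include <linux/smp.h>

/* Sketch only: pin to the current CPU while updating its slot in a
 * hypothetical per-CPU counter array. */
static void example_pinned_update(int *counters)
{
	int cpu = get_cpu();	/* disables preemption */

	counters[cpu]++;	/* cannot migrate until put_cpu() */
	put_cpu();		/* re-enables preemption */
}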
68 68
69 69
70 /* The task is about to try a do_munmap(). We peek at what it's going to 70 /* The task is about to try a do_munmap(). We peek at what it's going to
71 * do, and if it's an executable region, process the samples first, so 71 * do, and if it's an executable region, process the samples first, so
72 * we don't lose any. This does not have to be exact, it's a QoI issue 72 * we don't lose any. This does not have to be exact, it's a QoI issue
73 * only. 73 * only.
74 */ 74 */
75 static int munmap_notify(struct notifier_block * self, unsigned long val, void * data) 75 static int munmap_notify(struct notifier_block * self, unsigned long val, void * data)
76 { 76 {
77 unsigned long addr = (unsigned long)data; 77 unsigned long addr = (unsigned long)data;
78 struct mm_struct * mm = current->mm; 78 struct mm_struct * mm = current->mm;
79 struct vm_area_struct * mpnt; 79 struct vm_area_struct * mpnt;
80 80
81 down_read(&mm->mmap_sem); 81 down_read(&mm->mmap_sem);
82 82
83 mpnt = find_vma(mm, addr); 83 mpnt = find_vma(mm, addr);
84 if (mpnt && mpnt->vm_file && (mpnt->vm_flags & VM_EXEC)) { 84 if (mpnt && mpnt->vm_file && (mpnt->vm_flags & VM_EXEC)) {
85 up_read(&mm->mmap_sem); 85 up_read(&mm->mmap_sem);
86 /* To avoid latency problems, we only process the current CPU, 86 /* To avoid latency problems, we only process the current CPU,
87 * hoping that most samples for the task are on this CPU 87 * hoping that most samples for the task are on this CPU
88 */ 88 */
89 sync_buffer(_smp_processor_id()); 89 sync_buffer(raw_smp_processor_id());
90 return 0; 90 return 0;
91 } 91 }
92 92
93 up_read(&mm->mmap_sem); 93 up_read(&mm->mmap_sem);
94 return 0; 94 return 0;
95 } 95 }
96 96
97 97
98 /* We need to be told about new modules so we don't attribute to a previously 98 /* We need to be told about new modules so we don't attribute to a previously
99 * loaded module, or drop the samples on the floor. 99 * loaded module, or drop the samples on the floor.
100 */ 100 */
101 static int module_load_notify(struct notifier_block * self, unsigned long val, void * data) 101 static int module_load_notify(struct notifier_block * self, unsigned long val, void * data)
102 { 102 {
103 #ifdef CONFIG_MODULES 103 #ifdef CONFIG_MODULES
104 if (val != MODULE_STATE_COMING) 104 if (val != MODULE_STATE_COMING)
105 return 0; 105 return 0;
106 106
107 /* FIXME: should we process all CPU buffers ? */ 107 /* FIXME: should we process all CPU buffers ? */
108 down(&buffer_sem); 108 down(&buffer_sem);
109 add_event_entry(ESCAPE_CODE); 109 add_event_entry(ESCAPE_CODE);
110 add_event_entry(MODULE_LOADED_CODE); 110 add_event_entry(MODULE_LOADED_CODE);
111 up(&buffer_sem); 111 up(&buffer_sem);
112 #endif 112 #endif
113 return 0; 113 return 0;
114 } 114 }
115 115
116 116
117 static struct notifier_block task_free_nb = { 117 static struct notifier_block task_free_nb = {
118 .notifier_call = task_free_notify, 118 .notifier_call = task_free_notify,
119 }; 119 };
120 120
121 static struct notifier_block task_exit_nb = { 121 static struct notifier_block task_exit_nb = {
122 .notifier_call = task_exit_notify, 122 .notifier_call = task_exit_notify,
123 }; 123 };
124 124
125 static struct notifier_block munmap_nb = { 125 static struct notifier_block munmap_nb = {
126 .notifier_call = munmap_notify, 126 .notifier_call = munmap_notify,
127 }; 127 };
128 128
129 static struct notifier_block module_load_nb = { 129 static struct notifier_block module_load_nb = {
130 .notifier_call = module_load_notify, 130 .notifier_call = module_load_notify,
131 }; 131 };
132 132
133 133
134 static void end_sync(void) 134 static void end_sync(void)
135 { 135 {
136 end_cpu_work(); 136 end_cpu_work();
137 /* make sure we don't leak task structs */ 137 /* make sure we don't leak task structs */
138 process_task_mortuary(); 138 process_task_mortuary();
139 process_task_mortuary(); 139 process_task_mortuary();
140 } 140 }
141 141
142 142
143 int sync_start(void) 143 int sync_start(void)
144 { 144 {
145 int err; 145 int err;
146 146
147 start_cpu_work(); 147 start_cpu_work();
148 148
149 err = task_handoff_register(&task_free_nb); 149 err = task_handoff_register(&task_free_nb);
150 if (err) 150 if (err)
151 goto out1; 151 goto out1;
152 err = profile_event_register(PROFILE_TASK_EXIT, &task_exit_nb); 152 err = profile_event_register(PROFILE_TASK_EXIT, &task_exit_nb);
153 if (err) 153 if (err)
154 goto out2; 154 goto out2;
155 err = profile_event_register(PROFILE_MUNMAP, &munmap_nb); 155 err = profile_event_register(PROFILE_MUNMAP, &munmap_nb);
156 if (err) 156 if (err)
157 goto out3; 157 goto out3;
158 err = register_module_notifier(&module_load_nb); 158 err = register_module_notifier(&module_load_nb);
159 if (err) 159 if (err)
160 goto out4; 160 goto out4;
161 161
162 out: 162 out:
163 return err; 163 return err;
164 out4: 164 out4:
165 profile_event_unregister(PROFILE_MUNMAP, &munmap_nb); 165 profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
166 out3: 166 out3:
167 profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb); 167 profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
168 out2: 168 out2:
169 task_handoff_unregister(&task_free_nb); 169 task_handoff_unregister(&task_free_nb);
170 out1: 170 out1:
171 end_sync(); 171 end_sync();
172 goto out; 172 goto out;
173 } 173 }
174 174
175 175
176 void sync_stop(void) 176 void sync_stop(void)
177 { 177 {
178 unregister_module_notifier(&module_load_nb); 178 unregister_module_notifier(&module_load_nb);
179 profile_event_unregister(PROFILE_MUNMAP, &munmap_nb); 179 profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
180 profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb); 180 profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
181 task_handoff_unregister(&task_free_nb); 181 task_handoff_unregister(&task_free_nb);
182 end_sync(); 182 end_sync();
183 } 183 }
184 184
185 185
186 /* Optimisation. We can manage without taking the dcookie sem 186 /* Optimisation. We can manage without taking the dcookie sem
187 * because we cannot reach this code without at least one 187 * because we cannot reach this code without at least one
188 * dcookie user still being registered (namely, the reader 188 * dcookie user still being registered (namely, the reader
189 * of the event buffer). */ 189 * of the event buffer). */
190 static inline unsigned long fast_get_dcookie(struct dentry * dentry, 190 static inline unsigned long fast_get_dcookie(struct dentry * dentry,
191 struct vfsmount * vfsmnt) 191 struct vfsmount * vfsmnt)
192 { 192 {
193 unsigned long cookie; 193 unsigned long cookie;
194 194
195 if (dentry->d_cookie) 195 if (dentry->d_cookie)
196 return (unsigned long)dentry; 196 return (unsigned long)dentry;
197 get_dcookie(dentry, vfsmnt, &cookie); 197 get_dcookie(dentry, vfsmnt, &cookie);
198 return cookie; 198 return cookie;
199 } 199 }
200 200
201 201
202 /* Look up the dcookie for the task's first VM_EXECUTABLE mapping, 202 /* Look up the dcookie for the task's first VM_EXECUTABLE mapping,
203 * which corresponds loosely to "application name". This is 203 * which corresponds loosely to "application name". This is
204 * not strictly necessary but allows oprofile to associate 204 * not strictly necessary but allows oprofile to associate
205 * shared-library samples with particular applications 205 * shared-library samples with particular applications
206 */ 206 */
207 static unsigned long get_exec_dcookie(struct mm_struct * mm) 207 static unsigned long get_exec_dcookie(struct mm_struct * mm)
208 { 208 {
209 unsigned long cookie = 0; 209 unsigned long cookie = 0;
210 struct vm_area_struct * vma; 210 struct vm_area_struct * vma;
211 211
212 if (!mm) 212 if (!mm)
213 goto out; 213 goto out;
214 214
215 for (vma = mm->mmap; vma; vma = vma->vm_next) { 215 for (vma = mm->mmap; vma; vma = vma->vm_next) {
216 if (!vma->vm_file) 216 if (!vma->vm_file)
217 continue; 217 continue;
218 if (!(vma->vm_flags & VM_EXECUTABLE)) 218 if (!(vma->vm_flags & VM_EXECUTABLE))
219 continue; 219 continue;
220 cookie = fast_get_dcookie(vma->vm_file->f_dentry, 220 cookie = fast_get_dcookie(vma->vm_file->f_dentry,
221 vma->vm_file->f_vfsmnt); 221 vma->vm_file->f_vfsmnt);
222 break; 222 break;
223 } 223 }
224 224
225 out: 225 out:
226 return cookie; 226 return cookie;
227 } 227 }
228 228
229 229
230 /* Convert the EIP value of a sample into a persistent dentry/offset 230 /* Convert the EIP value of a sample into a persistent dentry/offset
231 * pair that can then be added to the global event buffer. We make 231 * pair that can then be added to the global event buffer. We make
232 * sure to do this lookup before a mm->mmap modification happens so 232 * sure to do this lookup before a mm->mmap modification happens so
233 * we don't lose track. 233 * we don't lose track.
234 */ 234 */
235 static unsigned long lookup_dcookie(struct mm_struct * mm, unsigned long addr, off_t * offset) 235 static unsigned long lookup_dcookie(struct mm_struct * mm, unsigned long addr, off_t * offset)
236 { 236 {
237 unsigned long cookie = 0; 237 unsigned long cookie = 0;
238 struct vm_area_struct * vma; 238 struct vm_area_struct * vma;
239 239
240 for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) { 240 for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {
241 241
242 if (!vma->vm_file) 242 if (!vma->vm_file)
243 continue; 243 continue;
244 244
245 if (addr < vma->vm_start || addr >= vma->vm_end) 245 if (addr < vma->vm_start || addr >= vma->vm_end)
246 continue; 246 continue;
247 247
248 cookie = fast_get_dcookie(vma->vm_file->f_dentry, 248 cookie = fast_get_dcookie(vma->vm_file->f_dentry,
249 vma->vm_file->f_vfsmnt); 249 vma->vm_file->f_vfsmnt);
250 *offset = (vma->vm_pgoff << PAGE_SHIFT) + addr - vma->vm_start; 250 *offset = (vma->vm_pgoff << PAGE_SHIFT) + addr - vma->vm_start;
251 break; 251 break;
252 } 252 }
253 253
254 return cookie; 254 return cookie;
255 } 255 }
256 256
257 257
258 static unsigned long last_cookie = ~0UL; 258 static unsigned long last_cookie = ~0UL;
259 259
260 static void add_cpu_switch(int i) 260 static void add_cpu_switch(int i)
261 { 261 {
262 add_event_entry(ESCAPE_CODE); 262 add_event_entry(ESCAPE_CODE);
263 add_event_entry(CPU_SWITCH_CODE); 263 add_event_entry(CPU_SWITCH_CODE);
264 add_event_entry(i); 264 add_event_entry(i);
265 last_cookie = ~0UL; 265 last_cookie = ~0UL;
266 } 266 }
267 267
268 static void add_kernel_ctx_switch(unsigned int in_kernel) 268 static void add_kernel_ctx_switch(unsigned int in_kernel)
269 { 269 {
270 add_event_entry(ESCAPE_CODE); 270 add_event_entry(ESCAPE_CODE);
271 if (in_kernel) 271 if (in_kernel)
272 add_event_entry(KERNEL_ENTER_SWITCH_CODE); 272 add_event_entry(KERNEL_ENTER_SWITCH_CODE);
273 else 273 else
274 add_event_entry(KERNEL_EXIT_SWITCH_CODE); 274 add_event_entry(KERNEL_EXIT_SWITCH_CODE);
275 } 275 }
276 276
277 static void 277 static void
278 add_user_ctx_switch(struct task_struct const * task, unsigned long cookie) 278 add_user_ctx_switch(struct task_struct const * task, unsigned long cookie)
279 { 279 {
280 add_event_entry(ESCAPE_CODE); 280 add_event_entry(ESCAPE_CODE);
281 add_event_entry(CTX_SWITCH_CODE); 281 add_event_entry(CTX_SWITCH_CODE);
282 add_event_entry(task->pid); 282 add_event_entry(task->pid);
283 add_event_entry(cookie); 283 add_event_entry(cookie);
284 /* Another code for daemon back-compat */ 284 /* Another code for daemon back-compat */
285 add_event_entry(ESCAPE_CODE); 285 add_event_entry(ESCAPE_CODE);
286 add_event_entry(CTX_TGID_CODE); 286 add_event_entry(CTX_TGID_CODE);
287 add_event_entry(task->tgid); 287 add_event_entry(task->tgid);
288 } 288 }
289 289
290 290
291 static void add_cookie_switch(unsigned long cookie) 291 static void add_cookie_switch(unsigned long cookie)
292 { 292 {
293 add_event_entry(ESCAPE_CODE); 293 add_event_entry(ESCAPE_CODE);
294 add_event_entry(COOKIE_SWITCH_CODE); 294 add_event_entry(COOKIE_SWITCH_CODE);
295 add_event_entry(cookie); 295 add_event_entry(cookie);
296 } 296 }
297 297
298 298
299 static void add_trace_begin(void) 299 static void add_trace_begin(void)
300 { 300 {
301 add_event_entry(ESCAPE_CODE); 301 add_event_entry(ESCAPE_CODE);
302 add_event_entry(TRACE_BEGIN_CODE); 302 add_event_entry(TRACE_BEGIN_CODE);
303 } 303 }
304 304
305 305
306 static void add_sample_entry(unsigned long offset, unsigned long event) 306 static void add_sample_entry(unsigned long offset, unsigned long event)
307 { 307 {
308 add_event_entry(offset); 308 add_event_entry(offset);
309 add_event_entry(event); 309 add_event_entry(event);
310 } 310 }
311 311
312 312
313 static int add_us_sample(struct mm_struct * mm, struct op_sample * s) 313 static int add_us_sample(struct mm_struct * mm, struct op_sample * s)
314 { 314 {
315 unsigned long cookie; 315 unsigned long cookie;
316 off_t offset; 316 off_t offset;
317 317
318 cookie = lookup_dcookie(mm, s->eip, &offset); 318 cookie = lookup_dcookie(mm, s->eip, &offset);
319 319
320 if (!cookie) { 320 if (!cookie) {
321 atomic_inc(&oprofile_stats.sample_lost_no_mapping); 321 atomic_inc(&oprofile_stats.sample_lost_no_mapping);
322 return 0; 322 return 0;
323 } 323 }
324 324
325 if (cookie != last_cookie) { 325 if (cookie != last_cookie) {
326 add_cookie_switch(cookie); 326 add_cookie_switch(cookie);
327 last_cookie = cookie; 327 last_cookie = cookie;
328 } 328 }
329 329
330 add_sample_entry(offset, s->event); 330 add_sample_entry(offset, s->event);
331 331
332 return 1; 332 return 1;
333 } 333 }
334 334
335 335
336 /* Add a sample to the global event buffer. If possible the 336 /* Add a sample to the global event buffer. If possible the
337 * sample is converted into a persistent dentry/offset pair 337 * sample is converted into a persistent dentry/offset pair
338 * for later lookup from userspace. 338 * for later lookup from userspace.
339 */ 339 */
340 static int 340 static int
341 add_sample(struct mm_struct * mm, struct op_sample * s, int in_kernel) 341 add_sample(struct mm_struct * mm, struct op_sample * s, int in_kernel)
342 { 342 {
343 if (in_kernel) { 343 if (in_kernel) {
344 add_sample_entry(s->eip, s->event); 344 add_sample_entry(s->eip, s->event);
345 return 1; 345 return 1;
346 } else if (mm) { 346 } else if (mm) {
347 return add_us_sample(mm, s); 347 return add_us_sample(mm, s);
348 } else { 348 } else {
349 atomic_inc(&oprofile_stats.sample_lost_no_mm); 349 atomic_inc(&oprofile_stats.sample_lost_no_mm);
350 } 350 }
351 return 0; 351 return 0;
352 } 352 }
353 353
354 354
355 static void release_mm(struct mm_struct * mm) 355 static void release_mm(struct mm_struct * mm)
356 { 356 {
357 if (!mm) 357 if (!mm)
358 return; 358 return;
359 up_read(&mm->mmap_sem); 359 up_read(&mm->mmap_sem);
360 mmput(mm); 360 mmput(mm);
361 } 361 }
362 362
363 363
364 static struct mm_struct * take_tasks_mm(struct task_struct * task) 364 static struct mm_struct * take_tasks_mm(struct task_struct * task)
365 { 365 {
366 struct mm_struct * mm = get_task_mm(task); 366 struct mm_struct * mm = get_task_mm(task);
367 if (mm) 367 if (mm)
368 down_read(&mm->mmap_sem); 368 down_read(&mm->mmap_sem);
369 return mm; 369 return mm;
370 } 370 }
371 371
372 372
373 static inline int is_code(unsigned long val) 373 static inline int is_code(unsigned long val)
374 { 374 {
375 return val == ESCAPE_CODE; 375 return val == ESCAPE_CODE;
376 } 376 }
377 377
378 378
379 /* "acquire" as many cpu buffer slots as we can */ 379 /* "acquire" as many cpu buffer slots as we can */
380 static unsigned long get_slots(struct oprofile_cpu_buffer * b) 380 static unsigned long get_slots(struct oprofile_cpu_buffer * b)
381 { 381 {
382 unsigned long head = b->head_pos; 382 unsigned long head = b->head_pos;
383 unsigned long tail = b->tail_pos; 383 unsigned long tail = b->tail_pos;
384 384
385 /* 385 /*
386 * Subtle. This resets the persistent last_task 386 * Subtle. This resets the persistent last_task
387 * and in_kernel values used for switching notes. 387 * and in_kernel values used for switching notes.
388 * BUT, there is a small window between reading 388 * BUT, there is a small window between reading
389 * head_pos, and this call, that means samples 389 * head_pos, and this call, that means samples
390 * can appear at the new head position, but not 390 * can appear at the new head position, but not
391 * be prefixed with the notes for switching 391 * be prefixed with the notes for switching
392 * kernel mode or a task switch. This small hole 392 * kernel mode or a task switch. This small hole
393 * can lead to mis-attribution or samples where 393 * can lead to mis-attribution or samples where
394 * we don't know if it's in the kernel or not, 394 * we don't know if it's in the kernel or not,
395 * at the start of an event buffer. 395 * at the start of an event buffer.
396 */ 396 */
397 cpu_buffer_reset(b); 397 cpu_buffer_reset(b);
398 398
399 if (head >= tail) 399 if (head >= tail)
400 return head - tail; 400 return head - tail;
401 401
402 return head + (b->buffer_size - tail); 402 return head + (b->buffer_size - tail);
403 } 403 }
404 404
405 405
406 static void increment_tail(struct oprofile_cpu_buffer * b) 406 static void increment_tail(struct oprofile_cpu_buffer * b)
407 { 407 {
408 unsigned long new_tail = b->tail_pos + 1; 408 unsigned long new_tail = b->tail_pos + 1;
409 409
410 rmb(); 410 rmb();
411 411
412 if (new_tail < b->buffer_size) 412 if (new_tail < b->buffer_size)
413 b->tail_pos = new_tail; 413 b->tail_pos = new_tail;
414 else 414 else
415 b->tail_pos = 0; 415 b->tail_pos = 0;
416 } 416 }
417 417
418 418
419 /* Move tasks along towards death. Any tasks on dead_tasks 419 /* Move tasks along towards death. Any tasks on dead_tasks
420 * will definitely have no remaining references in any 420 * will definitely have no remaining references in any
421 * CPU buffers at this point, because we use two lists, 421 * CPU buffers at this point, because we use two lists,
422 * and to have reached the list, it must have gone through 422 * and to have reached the list, it must have gone through
423 * one full sync already. 423 * one full sync already.
424 */ 424 */
425 static void process_task_mortuary(void) 425 static void process_task_mortuary(void)
426 { 426 {
427 struct list_head * pos; 427 struct list_head * pos;
428 struct list_head * pos2; 428 struct list_head * pos2;
429 struct task_struct * task; 429 struct task_struct * task;
430 430
431 spin_lock(&task_mortuary); 431 spin_lock(&task_mortuary);
432 432
433 list_for_each_safe(pos, pos2, &dead_tasks) { 433 list_for_each_safe(pos, pos2, &dead_tasks) {
434 task = list_entry(pos, struct task_struct, tasks); 434 task = list_entry(pos, struct task_struct, tasks);
435 list_del(&task->tasks); 435 list_del(&task->tasks);
436 free_task(task); 436 free_task(task);
437 } 437 }
438 438
439 list_for_each_safe(pos, pos2, &dying_tasks) { 439 list_for_each_safe(pos, pos2, &dying_tasks) {
440 task = list_entry(pos, struct task_struct, tasks); 440 task = list_entry(pos, struct task_struct, tasks);
441 list_del(&task->tasks); 441 list_del(&task->tasks);
442 list_add_tail(&task->tasks, &dead_tasks); 442 list_add_tail(&task->tasks, &dead_tasks);
443 } 443 }
444 444
445 spin_unlock(&task_mortuary); 445 spin_unlock(&task_mortuary);
446 } 446 }
447 447
448 448
449 static void mark_done(int cpu) 449 static void mark_done(int cpu)
450 { 450 {
451 int i; 451 int i;
452 452
453 cpu_set(cpu, marked_cpus); 453 cpu_set(cpu, marked_cpus);
454 454
455 for_each_online_cpu(i) { 455 for_each_online_cpu(i) {
456 if (!cpu_isset(i, marked_cpus)) 456 if (!cpu_isset(i, marked_cpus))
457 return; 457 return;
458 } 458 }
459 459
460 /* All CPUs have been processed at least once, 460 /* All CPUs have been processed at least once,
461 * we can process the mortuary once 461 * we can process the mortuary once
462 */ 462 */
463 process_task_mortuary(); 463 process_task_mortuary();
464 464
465 cpus_clear(marked_cpus); 465 cpus_clear(marked_cpus);
466 } 466 }
467 467
468 468
469 /* FIXME: this is not sufficient if we implement syscall barrier backtrace 469 /* FIXME: this is not sufficient if we implement syscall barrier backtrace
470 * traversal, the code switches to sb_sample_start at the first kernel enter/exit 470 * traversal, the code switches to sb_sample_start at the first kernel enter/exit
471 * switch so we need a fifth state and some special handling in sync_buffer() 471 * switch so we need a fifth state and some special handling in sync_buffer()
472 */ 472 */
473 typedef enum { 473 typedef enum {
474 sb_bt_ignore = -2, 474 sb_bt_ignore = -2,
475 sb_buffer_start, 475 sb_buffer_start,
476 sb_bt_start, 476 sb_bt_start,
477 sb_sample_start, 477 sb_sample_start,
478 } sync_buffer_state; 478 } sync_buffer_state;
479 479
480 /* Sync one of the CPU's buffers into the global event buffer. 480 /* Sync one of the CPU's buffers into the global event buffer.
481 * Here we need to go through each batch of samples punctuated 481 * Here we need to go through each batch of samples punctuated
482 * by context switch notes, taking the task's mmap_sem and doing 482 * by context switch notes, taking the task's mmap_sem and doing
483 * lookup in task->mm->mmap to convert EIP into dcookie/offset 483 * lookup in task->mm->mmap to convert EIP into dcookie/offset
484 * value. 484 * value.
485 */ 485 */
486 void sync_buffer(int cpu) 486 void sync_buffer(int cpu)
487 { 487 {
488 struct oprofile_cpu_buffer * cpu_buf = &cpu_buffer[cpu]; 488 struct oprofile_cpu_buffer * cpu_buf = &cpu_buffer[cpu];
489 struct mm_struct *mm = NULL; 489 struct mm_struct *mm = NULL;
490 struct task_struct * new; 490 struct task_struct * new;
491 unsigned long cookie = 0; 491 unsigned long cookie = 0;
492 int in_kernel = 1; 492 int in_kernel = 1;
493 unsigned int i; 493 unsigned int i;
494 sync_buffer_state state = sb_buffer_start; 494 sync_buffer_state state = sb_buffer_start;
495 unsigned long available; 495 unsigned long available;
496 496
497 down(&buffer_sem); 497 down(&buffer_sem);
498 498
499 add_cpu_switch(cpu); 499 add_cpu_switch(cpu);
500 500
501 /* Remember, only we can modify tail_pos */ 501 /* Remember, only we can modify tail_pos */
502 502
503 available = get_slots(cpu_buf); 503 available = get_slots(cpu_buf);
504 504
505 for (i = 0; i < available; ++i) { 505 for (i = 0; i < available; ++i) {
506 struct op_sample * s = &cpu_buf->buffer[cpu_buf->tail_pos]; 506 struct op_sample * s = &cpu_buf->buffer[cpu_buf->tail_pos];
507 507
508 if (is_code(s->eip)) { 508 if (is_code(s->eip)) {
509 if (s->event <= CPU_IS_KERNEL) { 509 if (s->event <= CPU_IS_KERNEL) {
510 /* kernel/userspace switch */ 510 /* kernel/userspace switch */
511 in_kernel = s->event; 511 in_kernel = s->event;
512 if (state == sb_buffer_start) 512 if (state == sb_buffer_start)
513 state = sb_sample_start; 513 state = sb_sample_start;
514 add_kernel_ctx_switch(s->event); 514 add_kernel_ctx_switch(s->event);
515 } else if (s->event == CPU_TRACE_BEGIN) { 515 } else if (s->event == CPU_TRACE_BEGIN) {
516 state = sb_bt_start; 516 state = sb_bt_start;
517 add_trace_begin(); 517 add_trace_begin();
518 } else { 518 } else {
519 struct mm_struct * oldmm = mm; 519 struct mm_struct * oldmm = mm;
520 520
521 /* userspace context switch */ 521 /* userspace context switch */
522 new = (struct task_struct *)s->event; 522 new = (struct task_struct *)s->event;
523 523
524 release_mm(oldmm); 524 release_mm(oldmm);
525 mm = take_tasks_mm(new); 525 mm = take_tasks_mm(new);
526 if (mm != oldmm) 526 if (mm != oldmm)
527 cookie = get_exec_dcookie(mm); 527 cookie = get_exec_dcookie(mm);
528 add_user_ctx_switch(new, cookie); 528 add_user_ctx_switch(new, cookie);
529 } 529 }
530 } else { 530 } else {
531 if (state >= sb_bt_start && 531 if (state >= sb_bt_start &&
532 !add_sample(mm, s, in_kernel)) { 532 !add_sample(mm, s, in_kernel)) {
533 if (state == sb_bt_start) { 533 if (state == sb_bt_start) {
534 state = sb_bt_ignore; 534 state = sb_bt_ignore;
535 atomic_inc(&oprofile_stats.bt_lost_no_mapping); 535 atomic_inc(&oprofile_stats.bt_lost_no_mapping);
536 } 536 }
537 } 537 }
538 } 538 }
539 539
540 increment_tail(cpu_buf); 540 increment_tail(cpu_buf);
541 } 541 }
542 release_mm(mm); 542 release_mm(mm);
543 543
544 mark_done(cpu); 544 mark_done(cpu);
545 545
546 up(&buffer_sem); 546 up(&buffer_sem);
547 } 547 }
548 548
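The decode loop above relies on an in-band escape convention: an entry whose eip passes is_code() is a control note rather than a sample, and its event field then selects either a kernel/user mode switch, the start of a backtrace, or (any other value) a task pointer announcing a userspace context switch. Below is a minimal standalone model of that state handling. It is not part of the patch; the marker values and the printf reporting are illustrative stand-ins for the real is_code()/CPU_IS_KERNEL/CPU_TRACE_BEGIN machinery.

#include <stdio.h>

#define ESCAPE_EIP   (~0UL)   /* stands in for is_code(eip)             */
#define USER_ENTER   0UL      /* event values <= CPU_IS_KERNEL in the   */
#define KERNEL_ENTER 1UL      /*   real code mean a mode switch         */
#define TRACE_BEGIN  2UL      /* stands in for CPU_TRACE_BEGIN          */

struct sample { unsigned long eip; unsigned long event; };

int main(void)
{
	struct sample stream[] = {
		{ ESCAPE_EIP, KERNEL_ENTER },   /* control note: now in kernel  */
		{ 0xc0100000, 0 },              /* ordinary sample              */
		{ ESCAPE_EIP, TRACE_BEGIN },    /* control note: backtrace next */
		{ ESCAPE_EIP, USER_ENTER },     /* control note: back to user   */
		{ 0x08048000, 0 },              /* ordinary sample              */
	};
	int in_kernel = 1;
	unsigned int i;

	for (i = 0; i < sizeof(stream) / sizeof(stream[0]); i++) {
		struct sample *s = &stream[i];

		if (s->eip == ESCAPE_EIP) {
			if (s->event == TRACE_BEGIN)
				printf("[backtrace follows]\n");
			else
				in_kernel = (int)s->event;
			/* the real code treats any other event value as a
			 * task pointer announcing a user context switch */
		} else {
			printf("sample %#lx attributed to %s\n", s->eip,
			       in_kernel ? "kernel" : "user space");
		}
	}
	return 0;
}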
fs/xfs/linux-2.6/xfs_linux.h
1 /* 1 /*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. 2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify it 4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as 5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation. 6 * published by the Free Software Foundation.
7 * 7 *
8 * This program is distributed in the hope that it would be useful, but 8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of 9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 * 11 *
12 * Further, this software is distributed without any warranty that it is 12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement 13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or 14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if 15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with 16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever. 17 * other software, or any other product whatsoever.
18 * 18 *
19 * You should have received a copy of the GNU General Public License along 19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59 20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA. 21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 * 22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, 23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or: 24 * Mountain View, CA 94043, or:
25 * 25 *
26 * http://www.sgi.com 26 * http://www.sgi.com
27 * 27 *
28 * For further information regarding this notice, see: 28 * For further information regarding this notice, see:
29 * 29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ 30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */ 31 */
32 #ifndef __XFS_LINUX__ 32 #ifndef __XFS_LINUX__
33 #define __XFS_LINUX__ 33 #define __XFS_LINUX__
34 34
35 #include <linux/types.h> 35 #include <linux/types.h>
36 #include <linux/config.h> 36 #include <linux/config.h>
37 37
38 /* 38 /*
39 * Some types are conditional depending on the target system. 39 * Some types are conditional depending on the target system.
40 * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits. 40 * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits.
41 * XFS_BIG_INUMS needs the VFS inode number to be 64 bits, as well 41 * XFS_BIG_INUMS needs the VFS inode number to be 64 bits, as well
42 * as requiring XFS_BIG_BLKNOS to be set. 42 * as requiring XFS_BIG_BLKNOS to be set.
43 */ 43 */
44 #if defined(CONFIG_LBD) || (BITS_PER_LONG == 64) 44 #if defined(CONFIG_LBD) || (BITS_PER_LONG == 64)
45 # define XFS_BIG_BLKNOS 1 45 # define XFS_BIG_BLKNOS 1
46 # if BITS_PER_LONG == 64 46 # if BITS_PER_LONG == 64
47 # define XFS_BIG_INUMS 1 47 # define XFS_BIG_INUMS 1
48 # else 48 # else
49 # define XFS_BIG_INUMS 0 49 # define XFS_BIG_INUMS 0
50 # endif 50 # endif
51 #else 51 #else
52 # define XFS_BIG_BLKNOS 0 52 # define XFS_BIG_BLKNOS 0
53 # define XFS_BIG_INUMS 0 53 # define XFS_BIG_INUMS 0
54 #endif 54 #endif
55 55
56 #include <xfs_types.h> 56 #include <xfs_types.h>
57 #include <xfs_arch.h> 57 #include <xfs_arch.h>
58 58
59 #include <kmem.h> 59 #include <kmem.h>
60 #include <mrlock.h> 60 #include <mrlock.h>
61 #include <spin.h> 61 #include <spin.h>
62 #include <sv.h> 62 #include <sv.h>
63 #include <mutex.h> 63 #include <mutex.h>
64 #include <sema.h> 64 #include <sema.h>
65 #include <time.h> 65 #include <time.h>
66 66
67 #include <support/qsort.h> 67 #include <support/qsort.h>
68 #include <support/ktrace.h> 68 #include <support/ktrace.h>
69 #include <support/debug.h> 69 #include <support/debug.h>
70 #include <support/move.h> 70 #include <support/move.h>
71 #include <support/uuid.h> 71 #include <support/uuid.h>
72 72
73 #include <linux/mm.h> 73 #include <linux/mm.h>
74 #include <linux/kernel.h> 74 #include <linux/kernel.h>
75 #include <linux/blkdev.h> 75 #include <linux/blkdev.h>
76 #include <linux/slab.h> 76 #include <linux/slab.h>
77 #include <linux/module.h> 77 #include <linux/module.h>
78 #include <linux/file.h> 78 #include <linux/file.h>
79 #include <linux/swap.h> 79 #include <linux/swap.h>
80 #include <linux/errno.h> 80 #include <linux/errno.h>
81 #include <linux/sched.h> 81 #include <linux/sched.h>
82 #include <linux/bitops.h> 82 #include <linux/bitops.h>
83 #include <linux/major.h> 83 #include <linux/major.h>
84 #include <linux/pagemap.h> 84 #include <linux/pagemap.h>
85 #include <linux/vfs.h> 85 #include <linux/vfs.h>
86 #include <linux/seq_file.h> 86 #include <linux/seq_file.h>
87 #include <linux/init.h> 87 #include <linux/init.h>
88 #include <linux/list.h> 88 #include <linux/list.h>
89 #include <linux/proc_fs.h> 89 #include <linux/proc_fs.h>
90 #include <linux/version.h> 90 #include <linux/version.h>
91 #include <linux/sort.h> 91 #include <linux/sort.h>
92 92
93 #include <asm/page.h> 93 #include <asm/page.h>
94 #include <asm/div64.h> 94 #include <asm/div64.h>
95 #include <asm/param.h> 95 #include <asm/param.h>
96 #include <asm/uaccess.h> 96 #include <asm/uaccess.h>
97 #include <asm/byteorder.h> 97 #include <asm/byteorder.h>
98 #include <asm/unaligned.h> 98 #include <asm/unaligned.h>
99 99
100 #include <xfs_behavior.h> 100 #include <xfs_behavior.h>
101 #include <xfs_vfs.h> 101 #include <xfs_vfs.h>
102 #include <xfs_cred.h> 102 #include <xfs_cred.h>
103 #include <xfs_vnode.h> 103 #include <xfs_vnode.h>
104 #include <xfs_stats.h> 104 #include <xfs_stats.h>
105 #include <xfs_sysctl.h> 105 #include <xfs_sysctl.h>
106 #include <xfs_iops.h> 106 #include <xfs_iops.h>
107 #include <xfs_super.h> 107 #include <xfs_super.h>
108 #include <xfs_globals.h> 108 #include <xfs_globals.h>
109 #include <xfs_fs_subr.h> 109 #include <xfs_fs_subr.h>
110 #include <xfs_lrw.h> 110 #include <xfs_lrw.h>
111 #include <xfs_buf.h> 111 #include <xfs_buf.h>
112 112
113 /* 113 /*
114 * Feature macros (disable/enable) 114 * Feature macros (disable/enable)
115 */ 115 */
116 #undef HAVE_REFCACHE /* reference cache not needed for NFS in 2.6 */ 116 #undef HAVE_REFCACHE /* reference cache not needed for NFS in 2.6 */
117 #define HAVE_SENDFILE /* sendfile(2) exists in 2.6, but not in 2.4 */ 117 #define HAVE_SENDFILE /* sendfile(2) exists in 2.6, but not in 2.4 */
118 118
119 /* 119 /*
120 * State flag for unwritten extent buffers. 120 * State flag for unwritten extent buffers.
121 * 121 *
122 * We need to be able to distinguish between these and delayed 122 * We need to be able to distinguish between these and delayed
123 * allocate buffers within XFS. The generic IO path code does 123 * allocate buffers within XFS. The generic IO path code does
124 * not need to distinguish - we use the BH_Delay flag for both 124 * not need to distinguish - we use the BH_Delay flag for both
125 * delalloc and these ondisk-uninitialised buffers. 125 * delalloc and these ondisk-uninitialised buffers.
126 */ 126 */
127 BUFFER_FNS(PrivateStart, unwritten); 127 BUFFER_FNS(PrivateStart, unwritten);
128 static inline void set_buffer_unwritten_io(struct buffer_head *bh) 128 static inline void set_buffer_unwritten_io(struct buffer_head *bh)
129 { 129 {
130 bh->b_end_io = linvfs_unwritten_done; 130 bh->b_end_io = linvfs_unwritten_done;
131 } 131 }
132 132
133 #define restricted_chown xfs_params.restrict_chown.val 133 #define restricted_chown xfs_params.restrict_chown.val
134 #define irix_sgid_inherit xfs_params.sgid_inherit.val 134 #define irix_sgid_inherit xfs_params.sgid_inherit.val
135 #define irix_symlink_mode xfs_params.symlink_mode.val 135 #define irix_symlink_mode xfs_params.symlink_mode.val
136 #define xfs_panic_mask xfs_params.panic_mask.val 136 #define xfs_panic_mask xfs_params.panic_mask.val
137 #define xfs_error_level xfs_params.error_level.val 137 #define xfs_error_level xfs_params.error_level.val
138 #define xfs_syncd_centisecs xfs_params.syncd_timer.val 138 #define xfs_syncd_centisecs xfs_params.syncd_timer.val
139 #define xfs_stats_clear xfs_params.stats_clear.val 139 #define xfs_stats_clear xfs_params.stats_clear.val
140 #define xfs_inherit_sync xfs_params.inherit_sync.val 140 #define xfs_inherit_sync xfs_params.inherit_sync.val
141 #define xfs_inherit_nodump xfs_params.inherit_nodump.val 141 #define xfs_inherit_nodump xfs_params.inherit_nodump.val
142 #define xfs_inherit_noatime xfs_params.inherit_noatim.val 142 #define xfs_inherit_noatime xfs_params.inherit_noatim.val
143 #define xfs_buf_timer_centisecs xfs_params.xfs_buf_timer.val 143 #define xfs_buf_timer_centisecs xfs_params.xfs_buf_timer.val
144 #define xfs_buf_age_centisecs xfs_params.xfs_buf_age.val 144 #define xfs_buf_age_centisecs xfs_params.xfs_buf_age.val
145 #define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val 145 #define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val
146 #define xfs_rotorstep xfs_params.rotorstep.val 146 #define xfs_rotorstep xfs_params.rotorstep.val
147 147
148 #ifndef __smp_processor_id 148 #ifndef raw_smp_processor_id
149 #define __smp_processor_id() smp_processor_id() 149 #define raw_smp_processor_id() smp_processor_id()
150 #endif 150 #endif
151 #define current_cpu() __smp_processor_id() 151 #define current_cpu() raw_smp_processor_id()
152 #define current_pid() (current->pid) 152 #define current_pid() (current->pid)
153 #define current_fsuid(cred) (current->fsuid) 153 #define current_fsuid(cred) (current->fsuid)
154 #define current_fsgid(cred) (current->fsgid) 154 #define current_fsgid(cred) (current->fsgid)
155 155
156 #define NBPP PAGE_SIZE 156 #define NBPP PAGE_SIZE
157 #define DPPSHFT (PAGE_SHIFT - 9) 157 #define DPPSHFT (PAGE_SHIFT - 9)
158 #define NDPP (1 << (PAGE_SHIFT - 9)) 158 #define NDPP (1 << (PAGE_SHIFT - 9))
159 #define dtop(DD) (((DD) + NDPP - 1) >> DPPSHFT) 159 #define dtop(DD) (((DD) + NDPP - 1) >> DPPSHFT)
160 #define dtopt(DD) ((DD) >> DPPSHFT) 160 #define dtopt(DD) ((DD) >> DPPSHFT)
161 #define dpoff(DD) ((DD) & (NDPP-1)) 161 #define dpoff(DD) ((DD) & (NDPP-1))
162 162
163 #define NBBY 8 /* number of bits per byte */ 163 #define NBBY 8 /* number of bits per byte */
164 #define NBPC PAGE_SIZE /* Number of bytes per click */ 164 #define NBPC PAGE_SIZE /* Number of bytes per click */
165 #define BPCSHIFT PAGE_SHIFT /* LOG2(NBPC) if exact */ 165 #define BPCSHIFT PAGE_SHIFT /* LOG2(NBPC) if exact */
166 166
167 /* 167 /*
168 * Size of block device i/o is parameterized here. 168 * Size of block device i/o is parameterized here.
169 * Currently the system supports page-sized i/o. 169 * Currently the system supports page-sized i/o.
170 */ 170 */
171 #define BLKDEV_IOSHIFT BPCSHIFT 171 #define BLKDEV_IOSHIFT BPCSHIFT
172 #define BLKDEV_IOSIZE (1<<BLKDEV_IOSHIFT) 172 #define BLKDEV_IOSIZE (1<<BLKDEV_IOSHIFT)
173 /* number of BB's per block device block */ 173 /* number of BB's per block device block */
174 #define BLKDEV_BB BTOBB(BLKDEV_IOSIZE) 174 #define BLKDEV_BB BTOBB(BLKDEV_IOSIZE)
175 175
176 /* bytes to clicks */ 176 /* bytes to clicks */
177 #define btoc(x) (((__psunsigned_t)(x)+(NBPC-1))>>BPCSHIFT) 177 #define btoc(x) (((__psunsigned_t)(x)+(NBPC-1))>>BPCSHIFT)
178 #define btoct(x) ((__psunsigned_t)(x)>>BPCSHIFT) 178 #define btoct(x) ((__psunsigned_t)(x)>>BPCSHIFT)
179 #define btoc64(x) (((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT) 179 #define btoc64(x) (((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT)
180 #define btoct64(x) ((__uint64_t)(x)>>BPCSHIFT) 180 #define btoct64(x) ((__uint64_t)(x)>>BPCSHIFT)
181 #define io_btoc(x) (((__psunsigned_t)(x)+(IO_NBPC-1))>>IO_BPCSHIFT) 181 #define io_btoc(x) (((__psunsigned_t)(x)+(IO_NBPC-1))>>IO_BPCSHIFT)
182 #define io_btoct(x) ((__psunsigned_t)(x)>>IO_BPCSHIFT) 182 #define io_btoct(x) ((__psunsigned_t)(x)>>IO_BPCSHIFT)
183 183
184 /* off_t bytes to clicks */ 184 /* off_t bytes to clicks */
185 #define offtoc(x) (((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT) 185 #define offtoc(x) (((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT)
186 #define offtoct(x) ((xfs_off_t)(x)>>BPCSHIFT) 186 #define offtoct(x) ((xfs_off_t)(x)>>BPCSHIFT)
187 187
188 /* clicks to off_t bytes */ 188 /* clicks to off_t bytes */
189 #define ctooff(x) ((xfs_off_t)(x)<<BPCSHIFT) 189 #define ctooff(x) ((xfs_off_t)(x)<<BPCSHIFT)
190 190
191 /* clicks to bytes */ 191 /* clicks to bytes */
192 #define ctob(x) ((__psunsigned_t)(x)<<BPCSHIFT) 192 #define ctob(x) ((__psunsigned_t)(x)<<BPCSHIFT)
193 #define btoct(x) ((__psunsigned_t)(x)>>BPCSHIFT) 193 #define btoct(x) ((__psunsigned_t)(x)>>BPCSHIFT)
194 #define ctob64(x) ((__uint64_t)(x)<<BPCSHIFT) 194 #define ctob64(x) ((__uint64_t)(x)<<BPCSHIFT)
195 #define io_ctob(x) ((__psunsigned_t)(x)<<IO_BPCSHIFT) 195 #define io_ctob(x) ((__psunsigned_t)(x)<<IO_BPCSHIFT)
196 196
197 /* bytes to clicks */ 197 /* bytes to clicks */
198 #define btoc(x) (((__psunsigned_t)(x)+(NBPC-1))>>BPCSHIFT) 198 #define btoc(x) (((__psunsigned_t)(x)+(NBPC-1))>>BPCSHIFT)
199 199
200 #ifndef CELL_CAPABLE 200 #ifndef CELL_CAPABLE
201 #define FSC_NOTIFY_NAME_CHANGED(vp) 201 #define FSC_NOTIFY_NAME_CHANGED(vp)
202 #endif 202 #endif
203 203
204 #ifndef ENOATTR 204 #ifndef ENOATTR
205 #define ENOATTR ENODATA /* Attribute not found */ 205 #define ENOATTR ENODATA /* Attribute not found */
206 #endif 206 #endif
207 207
208 /* Note: EWRONGFS never visible outside the kernel */ 208 /* Note: EWRONGFS never visible outside the kernel */
209 #define EWRONGFS EINVAL /* Mount with wrong filesystem type */ 209 #define EWRONGFS EINVAL /* Mount with wrong filesystem type */
210 210
211 /* 211 /*
212 * XXX EFSCORRUPTED needs a real value in errno.h. asm-i386/errno.h won't 212 * XXX EFSCORRUPTED needs a real value in errno.h. asm-i386/errno.h won't
213 * return codes out of its known range in errno. 213 * return codes out of its known range in errno.
214 * XXX Also note: needs to be < 1000 and fairly unique on Linux (mustn't 214 * XXX Also note: needs to be < 1000 and fairly unique on Linux (mustn't
215 * conflict with any code we use already or any code a driver may use) 215 * conflict with any code we use already or any code a driver may use)
216 * XXX Some options (currently we do #2): 216 * XXX Some options (currently we do #2):
217 * 1/ New error code ["Filesystem is corrupted", _after_ glibc updated] 217 * 1/ New error code ["Filesystem is corrupted", _after_ glibc updated]
218 * 2/ 990 ["Unknown error 990"] 218 * 2/ 990 ["Unknown error 990"]
219 * 3/ EUCLEAN ["Structure needs cleaning"] 219 * 3/ EUCLEAN ["Structure needs cleaning"]
220 * 4/ Convert EFSCORRUPTED to EIO [just prior to return into userspace] 220 * 4/ Convert EFSCORRUPTED to EIO [just prior to return into userspace]
221 */ 221 */
222 #define EFSCORRUPTED 990 /* Filesystem is corrupted */ 222 #define EFSCORRUPTED 990 /* Filesystem is corrupted */
223 223
224 #define SYNCHRONIZE() barrier() 224 #define SYNCHRONIZE() barrier()
225 #define __return_address __builtin_return_address(0) 225 #define __return_address __builtin_return_address(0)
226 226
227 /* 227 /*
228 * IRIX (BSD) quotactl makes use of separate commands for user/group, 228 * IRIX (BSD) quotactl makes use of separate commands for user/group,
229 * whereas on Linux the syscall encodes this information into the cmd 229 * whereas on Linux the syscall encodes this information into the cmd
230 * field (see the QCMD macro in quota.h). These macros help keep the 230 * field (see the QCMD macro in quota.h). These macros help keep the
231 * code portable - they are not visible from the syscall interface. 231 * code portable - they are not visible from the syscall interface.
232 */ 232 */
233 #define Q_XSETGQLIM XQM_CMD(0x8) /* set groups disk limits */ 233 #define Q_XSETGQLIM XQM_CMD(0x8) /* set groups disk limits */
234 #define Q_XGETGQUOTA XQM_CMD(0x9) /* get groups disk limits */ 234 #define Q_XGETGQUOTA XQM_CMD(0x9) /* get groups disk limits */
235 235
236 /* IRIX uses a dynamic sizing algorithm (ndquot = 200 + numprocs*2) */ 236 /* IRIX uses a dynamic sizing algorithm (ndquot = 200 + numprocs*2) */
237 /* we may well need to fine-tune this if it ever becomes an issue. */ 237 /* we may well need to fine-tune this if it ever becomes an issue. */
238 #define DQUOT_MAX_HEURISTIC 1024 /* NR_DQUOTS */ 238 #define DQUOT_MAX_HEURISTIC 1024 /* NR_DQUOTS */
239 #define ndquot DQUOT_MAX_HEURISTIC 239 #define ndquot DQUOT_MAX_HEURISTIC
240 240
241 /* IRIX uses the current size of the name cache to guess a good value */ 241 /* IRIX uses the current size of the name cache to guess a good value */
242 /* - this isn't the same but is a good enough starting point for now. */ 242 /* - this isn't the same but is a good enough starting point for now. */
243 #define DQUOT_HASH_HEURISTIC files_stat.nr_files 243 #define DQUOT_HASH_HEURISTIC files_stat.nr_files
244 244
245 /* IRIX inodes maintain the project ID also, zero this field on Linux */ 245 /* IRIX inodes maintain the project ID also, zero this field on Linux */
246 #define DEFAULT_PROJID 0 246 #define DEFAULT_PROJID 0
247 #define dfltprid DEFAULT_PROJID 247 #define dfltprid DEFAULT_PROJID
248 248
249 #define MAXPATHLEN 1024 249 #define MAXPATHLEN 1024
250 250
251 #define MIN(a,b) (min(a,b)) 251 #define MIN(a,b) (min(a,b))
252 #define MAX(a,b) (max(a,b)) 252 #define MAX(a,b) (max(a,b))
253 #define howmany(x, y) (((x)+((y)-1))/(y)) 253 #define howmany(x, y) (((x)+((y)-1))/(y))
254 #define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) 254 #define roundup(x, y) ((((x)+((y)-1))/(y))*(y))
255 255
256 #define xfs_stack_trace() dump_stack() 256 #define xfs_stack_trace() dump_stack()
257 257
258 #define xfs_itruncate_data(ip, off) \ 258 #define xfs_itruncate_data(ip, off) \
259 (-vmtruncate(LINVFS_GET_IP(XFS_ITOV(ip)), (off))) 259 (-vmtruncate(LINVFS_GET_IP(XFS_ITOV(ip)), (off)))
260 260
261 261
262 /* Move the kernel do_div definition off to one side */ 262 /* Move the kernel do_div definition off to one side */
263 263
264 #if defined __i386__ 264 #if defined __i386__
265 /* For ia32 we need to pull some tricks to get past various versions 265 /* For ia32 we need to pull some tricks to get past various versions
266 * of the compiler which do not like us using do_div in the middle 266 * of the compiler which do not like us using do_div in the middle
267 * of large functions. 267 * of large functions.
268 */ 268 */
269 static inline __u32 xfs_do_div(void *a, __u32 b, int n) 269 static inline __u32 xfs_do_div(void *a, __u32 b, int n)
270 { 270 {
271 __u32 mod; 271 __u32 mod;
272 272
273 switch (n) { 273 switch (n) {
274 case 4: 274 case 4:
275 mod = *(__u32 *)a % b; 275 mod = *(__u32 *)a % b;
276 *(__u32 *)a = *(__u32 *)a / b; 276 *(__u32 *)a = *(__u32 *)a / b;
277 return mod; 277 return mod;
278 case 8: 278 case 8:
279 { 279 {
280 unsigned long __upper, __low, __high, __mod; 280 unsigned long __upper, __low, __high, __mod;
281 __u64 c = *(__u64 *)a; 281 __u64 c = *(__u64 *)a;
282 __upper = __high = c >> 32; 282 __upper = __high = c >> 32;
283 __low = c; 283 __low = c;
284 if (__high) { 284 if (__high) {
285 __upper = __high % (b); 285 __upper = __high % (b);
286 __high = __high / (b); 286 __high = __high / (b);
287 } 287 }
288 asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper)); 288 asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
289 asm("":"=A" (c):"a" (__low),"d" (__high)); 289 asm("":"=A" (c):"a" (__low),"d" (__high));
290 *(__u64 *)a = c; 290 *(__u64 *)a = c;
291 return __mod; 291 return __mod;
292 } 292 }
293 } 293 }
294 294
295 /* NOTREACHED */ 295 /* NOTREACHED */
296 return 0; 296 return 0;
297 } 297 }
298 298
299 /* Side effect free 64 bit mod operation */ 299 /* Side effect free 64 bit mod operation */
300 static inline __u32 xfs_do_mod(void *a, __u32 b, int n) 300 static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
301 { 301 {
302 switch (n) { 302 switch (n) {
303 case 4: 303 case 4:
304 return *(__u32 *)a % b; 304 return *(__u32 *)a % b;
305 case 8: 305 case 8:
306 { 306 {
307 unsigned long __upper, __low, __high, __mod; 307 unsigned long __upper, __low, __high, __mod;
308 __u64 c = *(__u64 *)a; 308 __u64 c = *(__u64 *)a;
309 __upper = __high = c >> 32; 309 __upper = __high = c >> 32;
310 __low = c; 310 __low = c;
311 if (__high) { 311 if (__high) {
312 __upper = __high % (b); 312 __upper = __high % (b);
313 __high = __high / (b); 313 __high = __high / (b);
314 } 314 }
315 asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper)); 315 asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
316 asm("":"=A" (c):"a" (__low),"d" (__high)); 316 asm("":"=A" (c):"a" (__low),"d" (__high));
317 return __mod; 317 return __mod;
318 } 318 }
319 } 319 }
320 320
321 /* NOTREACHED */ 321 /* NOTREACHED */
322 return 0; 322 return 0;
323 } 323 }
324 #else 324 #else
325 static inline __u32 xfs_do_div(void *a, __u32 b, int n) 325 static inline __u32 xfs_do_div(void *a, __u32 b, int n)
326 { 326 {
327 __u32 mod; 327 __u32 mod;
328 328
329 switch (n) { 329 switch (n) {
330 case 4: 330 case 4:
331 mod = *(__u32 *)a % b; 331 mod = *(__u32 *)a % b;
332 *(__u32 *)a = *(__u32 *)a / b; 332 *(__u32 *)a = *(__u32 *)a / b;
333 return mod; 333 return mod;
334 case 8: 334 case 8:
335 mod = do_div(*(__u64 *)a, b); 335 mod = do_div(*(__u64 *)a, b);
336 return mod; 336 return mod;
337 } 337 }
338 338
339 /* NOTREACHED */ 339 /* NOTREACHED */
340 return 0; 340 return 0;
341 } 341 }
342 342
343 /* Side effect free 64 bit mod operation */ 343 /* Side effect free 64 bit mod operation */
344 static inline __u32 xfs_do_mod(void *a, __u32 b, int n) 344 static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
345 { 345 {
346 switch (n) { 346 switch (n) {
347 case 4: 347 case 4:
348 return *(__u32 *)a % b; 348 return *(__u32 *)a % b;
349 case 8: 349 case 8:
350 { 350 {
351 __u64 c = *(__u64 *)a; 351 __u64 c = *(__u64 *)a;
352 return do_div(c, b); 352 return do_div(c, b);
353 } 353 }
354 } 354 }
355 355
356 /* NOTREACHED */ 356 /* NOTREACHED */
357 return 0; 357 return 0;
358 } 358 }
359 #endif 359 #endif
360 360
361 #undef do_div 361 #undef do_div
362 #define do_div(a, b) xfs_do_div(&(a), (b), sizeof(a)) 362 #define do_div(a, b) xfs_do_div(&(a), (b), sizeof(a))
363 #define do_mod(a, b) xfs_do_mod(&(a), (b), sizeof(a)) 363 #define do_mod(a, b) xfs_do_mod(&(a), (b), sizeof(a))
364 364
365 static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y) 365 static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y)
366 { 366 {
367 x += y - 1; 367 x += y - 1;
368 do_div(x, y); 368 do_div(x, y);
369 return(x * y); 369 return(x * y);
370 } 370 }
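The wrappers above preserve the kernel do_div() calling convention while sidestepping the compiler issue described in the comment: the dividend is updated in place through the pointer and the remainder is returned, which is exactly what roundup_64() depends on. A small userspace check of the same convention, using plain C arithmetic instead of the kernel macro (demo_* names are made up for the sketch), might look like this; it is not part of the header.

#include <stdio.h>
#include <stdint.h>

/* Same contract as do_div(): divide *a by b in place, return remainder. */
static uint32_t demo_do_div(uint64_t *a, uint32_t b)
{
	uint32_t rem = (uint32_t)(*a % b);

	*a /= b;
	return rem;
}

/* Mirrors roundup_64(): round x up to the next multiple of y. */
static uint64_t demo_roundup_64(uint64_t x, uint32_t y)
{
	x += y - 1;
	demo_do_div(&x, y);
	return x * y;
}

int main(void)
{
	uint64_t v = 1000003;

	printf("remainder %u, quotient %llu\n",
	       demo_do_div(&v, 4096), (unsigned long long)v);
	printf("rounded up to a 4096 multiple: %llu\n",
	       (unsigned long long)demo_roundup_64(1000003, 4096));
	return 0;
}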
371 371
372 #define qsort(a, n, s, cmp) sort(a, n, s, cmp, NULL) 372 #define qsort(a, n, s, cmp) sort(a, n, s, cmp, NULL)
373 373
374 #endif /* __XFS_LINUX__ */ 374 #endif /* __XFS_LINUX__ */
375 375
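The current_cpu() switch in this header is representative of why the raw accessor exists: it feeds paths such as the XFS per-cpu statistics macros, which only need a plausible CPU number and do not care if the caller migrates immediately afterwards, so the non-debug-checked variant is the right fit, with the #ifndef above acting as a fallback for configurations that do not provide it. A rough userspace analogy for this kind of advisory CPU identification, assuming glibc's sched_getcpu() is available, is sketched below; it illustrates the idea and is not kernel code.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	/* Like raw_smp_processor_id() in preemptible kernel code, the
	 * answer may be stale the moment it is returned: the thread can
	 * migrate to another CPU at any time.  That is harmless when the
	 * value only selects a statistics bucket, as current_cpu() does
	 * for the XFS per-cpu counters.
	 */
	int cpu = sched_getcpu();

	if (cpu >= 0)
		printf("currently (approximately) on cpu %d\n", cpu);
	else
		perror("sched_getcpu");
	return 0;
}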
include/asm-alpha/smp.h
1 #ifndef __ASM_SMP_H 1 #ifndef __ASM_SMP_H
2 #define __ASM_SMP_H 2 #define __ASM_SMP_H
3 3
4 #include <linux/config.h> 4 #include <linux/config.h>
5 #include <linux/threads.h> 5 #include <linux/threads.h>
6 #include <linux/cpumask.h> 6 #include <linux/cpumask.h>
7 #include <linux/bitops.h> 7 #include <linux/bitops.h>
8 #include <asm/pal.h> 8 #include <asm/pal.h>
9 9
10 /* HACK: Cabrio WHAMI return value is bogus if more than 8 bits used.. :-( */ 10 /* HACK: Cabrio WHAMI return value is bogus if more than 8 bits used.. :-( */
11 11
12 static __inline__ unsigned char 12 static __inline__ unsigned char
13 __hard_smp_processor_id(void) 13 __hard_smp_processor_id(void)
14 { 14 {
15 register unsigned char __r0 __asm__("$0"); 15 register unsigned char __r0 __asm__("$0");
16 __asm__ __volatile__( 16 __asm__ __volatile__(
17 "call_pal %1 #whami" 17 "call_pal %1 #whami"
18 : "=r"(__r0) 18 : "=r"(__r0)
19 :"i" (PAL_whami) 19 :"i" (PAL_whami)
20 : "$1", "$22", "$23", "$24", "$25"); 20 : "$1", "$22", "$23", "$24", "$25");
21 return __r0; 21 return __r0;
22 } 22 }
23 23
24 #ifdef CONFIG_SMP 24 #ifdef CONFIG_SMP
25 25
26 #include <asm/irq.h> 26 #include <asm/irq.h>
27 27
28 struct cpuinfo_alpha { 28 struct cpuinfo_alpha {
29 unsigned long loops_per_jiffy; 29 unsigned long loops_per_jiffy;
30 unsigned long last_asn; 30 unsigned long last_asn;
31 int need_new_asn; 31 int need_new_asn;
32 int asn_lock; 32 int asn_lock;
33 unsigned long ipi_count; 33 unsigned long ipi_count;
34 unsigned long prof_multiplier; 34 unsigned long prof_multiplier;
35 unsigned long prof_counter; 35 unsigned long prof_counter;
36 unsigned char mcheck_expected; 36 unsigned char mcheck_expected;
37 unsigned char mcheck_taken; 37 unsigned char mcheck_taken;
38 unsigned char mcheck_extra; 38 unsigned char mcheck_extra;
39 } __attribute__((aligned(64))); 39 } __attribute__((aligned(64)));
40 40
41 extern struct cpuinfo_alpha cpu_data[NR_CPUS]; 41 extern struct cpuinfo_alpha cpu_data[NR_CPUS];
42 42
43 #define PROC_CHANGE_PENALTY 20 43 #define PROC_CHANGE_PENALTY 20
44 44
45 #define hard_smp_processor_id() __hard_smp_processor_id() 45 #define hard_smp_processor_id() __hard_smp_processor_id()
46 #define smp_processor_id() (current_thread_info()->cpu) 46 #define raw_smp_processor_id() (current_thread_info()->cpu)
47 47
48 extern cpumask_t cpu_present_mask; 48 extern cpumask_t cpu_present_mask;
49 extern cpumask_t cpu_online_map; 49 extern cpumask_t cpu_online_map;
50 extern int smp_num_cpus; 50 extern int smp_num_cpus;
51 #define cpu_possible_map cpu_present_mask 51 #define cpu_possible_map cpu_present_mask
52 52
53 int smp_call_function_on_cpu(void (*func) (void *info), void *info,int retry, int wait, cpumask_t cpu); 53 int smp_call_function_on_cpu(void (*func) (void *info), void *info,int retry, int wait, cpumask_t cpu);
54 54
55 #else /* CONFIG_SMP */ 55 #else /* CONFIG_SMP */
56 56
57 #define smp_call_function_on_cpu(func,info,retry,wait,cpu) ({ 0; }) 57 #define smp_call_function_on_cpu(func,info,retry,wait,cpu) ({ 0; })
58 58
59 #endif /* CONFIG_SMP */ 59 #endif /* CONFIG_SMP */
60 60
61 #define NO_PROC_ID (-1) 61 #define NO_PROC_ID (-1)
62 62
63 #endif 63 #endif
64 64
include/asm-arm/smp.h
1 /* 1 /*
2 * linux/include/asm-arm/smp.h 2 * linux/include/asm-arm/smp.h
3 * 3 *
4 * Copyright (C) 2004-2005 ARM Ltd. 4 * Copyright (C) 2004-2005 ARM Ltd.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as 7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation. 8 * published by the Free Software Foundation.
9 */ 9 */
10 #ifndef __ASM_ARM_SMP_H 10 #ifndef __ASM_ARM_SMP_H
11 #define __ASM_ARM_SMP_H 11 #define __ASM_ARM_SMP_H
12 12
13 #include <linux/config.h> 13 #include <linux/config.h>
14 #include <linux/threads.h> 14 #include <linux/threads.h>
15 #include <linux/cpumask.h> 15 #include <linux/cpumask.h>
16 #include <linux/thread_info.h> 16 #include <linux/thread_info.h>
17 17
18 #include <asm/arch/smp.h> 18 #include <asm/arch/smp.h>
19 19
20 #ifndef CONFIG_SMP 20 #ifndef CONFIG_SMP
21 # error "<asm-arm/smp.h> included in non-SMP build" 21 # error "<asm-arm/smp.h> included in non-SMP build"
22 #endif 22 #endif
23 23
24 #define smp_processor_id() (current_thread_info()->cpu) 24 #define raw_smp_processor_id() (current_thread_info()->cpu)
25 25
26 extern cpumask_t cpu_present_mask; 26 extern cpumask_t cpu_present_mask;
27 #define cpu_possible_map cpu_present_mask 27 #define cpu_possible_map cpu_present_mask
28 28
29 /* 29 /*
30 * at the moment, there's not a big penalty for changing CPUs 30 * at the moment, there's not a big penalty for changing CPUs
31 * (the >big< penalty is running SMP in the first place) 31 * (the >big< penalty is running SMP in the first place)
32 */ 32 */
33 #define PROC_CHANGE_PENALTY 15 33 #define PROC_CHANGE_PENALTY 15
34 34
35 struct seq_file; 35 struct seq_file;
36 36
37 /* 37 /*
38 * generate IPI list text 38 * generate IPI list text
39 */ 39 */
40 extern void show_ipi_list(struct seq_file *p); 40 extern void show_ipi_list(struct seq_file *p);
41 41
42 /* 42 /*
43 * Move global data into per-processor storage. 43 * Move global data into per-processor storage.
44 */ 44 */
45 extern void smp_store_cpu_info(unsigned int cpuid); 45 extern void smp_store_cpu_info(unsigned int cpuid);
46 46
47 /* 47 /*
48 * Raise an IPI cross call on CPUs in callmap. 48 * Raise an IPI cross call on CPUs in callmap.
49 */ 49 */
50 extern void smp_cross_call(cpumask_t callmap); 50 extern void smp_cross_call(cpumask_t callmap);
51 51
52 /* 52 /*
53 * Boot a secondary CPU, and assign it the specified idle task. 53 * Boot a secondary CPU, and assign it the specified idle task.
54 * This also gives us the initial stack to use for this CPU. 54 * This also gives us the initial stack to use for this CPU.
55 */ 55 */
56 extern int boot_secondary(unsigned int cpu, struct task_struct *); 56 extern int boot_secondary(unsigned int cpu, struct task_struct *);
57 57
58 /* 58 /*
59 * Perform platform specific initialisation of the specified CPU. 59 * Perform platform specific initialisation of the specified CPU.
60 */ 60 */
61 extern void platform_secondary_init(unsigned int cpu); 61 extern void platform_secondary_init(unsigned int cpu);
62 62
63 /* 63 /*
64 * Initial data for bringing up a secondary CPU. 64 * Initial data for bringing up a secondary CPU.
65 */ 65 */
66 struct secondary_data { 66 struct secondary_data {
67 unsigned long pgdir; 67 unsigned long pgdir;
68 void *stack; 68 void *stack;
69 }; 69 };
70 extern struct secondary_data secondary_data; 70 extern struct secondary_data secondary_data;
71 71
72 #endif /* ifndef __ASM_ARM_SMP_H */ 72 #endif /* ifndef __ASM_ARM_SMP_H */
73 73
include/asm-i386/smp.h
1 #ifndef __ASM_SMP_H 1 #ifndef __ASM_SMP_H
2 #define __ASM_SMP_H 2 #define __ASM_SMP_H
3 3
4 /* 4 /*
5 * We need the APIC definitions automatically as part of 'smp.h' 5 * We need the APIC definitions automatically as part of 'smp.h'
6 */ 6 */
7 #ifndef __ASSEMBLY__ 7 #ifndef __ASSEMBLY__
8 #include <linux/config.h> 8 #include <linux/config.h>
9 #include <linux/kernel.h> 9 #include <linux/kernel.h>
10 #include <linux/threads.h> 10 #include <linux/threads.h>
11 #include <linux/cpumask.h> 11 #include <linux/cpumask.h>
12 #endif 12 #endif
13 13
14 #ifdef CONFIG_X86_LOCAL_APIC 14 #ifdef CONFIG_X86_LOCAL_APIC
15 #ifndef __ASSEMBLY__ 15 #ifndef __ASSEMBLY__
16 #include <asm/fixmap.h> 16 #include <asm/fixmap.h>
17 #include <asm/bitops.h> 17 #include <asm/bitops.h>
18 #include <asm/mpspec.h> 18 #include <asm/mpspec.h>
19 #ifdef CONFIG_X86_IO_APIC 19 #ifdef CONFIG_X86_IO_APIC
20 #include <asm/io_apic.h> 20 #include <asm/io_apic.h>
21 #endif 21 #endif
22 #include <asm/apic.h> 22 #include <asm/apic.h>
23 #endif 23 #endif
24 #endif 24 #endif
25 25
26 #define BAD_APICID 0xFFu 26 #define BAD_APICID 0xFFu
27 #ifdef CONFIG_SMP 27 #ifdef CONFIG_SMP
28 #ifndef __ASSEMBLY__ 28 #ifndef __ASSEMBLY__
29 29
30 /* 30 /*
31 * Private routines/data 31 * Private routines/data
32 */ 32 */
33 33
34 extern void smp_alloc_memory(void); 34 extern void smp_alloc_memory(void);
35 extern int pic_mode; 35 extern int pic_mode;
36 extern int smp_num_siblings; 36 extern int smp_num_siblings;
37 extern cpumask_t cpu_sibling_map[]; 37 extern cpumask_t cpu_sibling_map[];
38 extern cpumask_t cpu_core_map[]; 38 extern cpumask_t cpu_core_map[];
39 39
40 extern void smp_flush_tlb(void); 40 extern void smp_flush_tlb(void);
41 extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs); 41 extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs);
42 extern void smp_invalidate_rcv(void); /* Process an NMI */ 42 extern void smp_invalidate_rcv(void); /* Process an NMI */
43 extern void (*mtrr_hook) (void); 43 extern void (*mtrr_hook) (void);
44 extern void zap_low_mappings (void); 44 extern void zap_low_mappings (void);
45 45
46 #define MAX_APICID 256 46 #define MAX_APICID 256
47 extern u8 x86_cpu_to_apicid[]; 47 extern u8 x86_cpu_to_apicid[];
48 48
49 /* 49 /*
50 * This function is needed by all SMP systems. It must _always_ be valid 50 * This function is needed by all SMP systems. It must _always_ be valid
51 * from the initial startup. We map APIC_BASE very early in page_setup(), 51 * from the initial startup. We map APIC_BASE very early in page_setup(),
52 * so this is correct in the x86 case. 52 * so this is correct in the x86 case.
53 */ 53 */
54 #define __smp_processor_id() (current_thread_info()->cpu) 54 #define raw_smp_processor_id() (current_thread_info()->cpu)
55 55
56 extern cpumask_t cpu_callout_map; 56 extern cpumask_t cpu_callout_map;
57 extern cpumask_t cpu_callin_map; 57 extern cpumask_t cpu_callin_map;
58 #define cpu_possible_map cpu_callout_map 58 #define cpu_possible_map cpu_callout_map
59 59
60 /* We don't mark CPUs online until __cpu_up(), so we need another measure */ 60 /* We don't mark CPUs online until __cpu_up(), so we need another measure */
61 static inline int num_booting_cpus(void) 61 static inline int num_booting_cpus(void)
62 { 62 {
63 return cpus_weight(cpu_callout_map); 63 return cpus_weight(cpu_callout_map);
64 } 64 }
65 65
66 #ifdef CONFIG_X86_LOCAL_APIC 66 #ifdef CONFIG_X86_LOCAL_APIC
67 67
68 #ifdef APIC_DEFINITION 68 #ifdef APIC_DEFINITION
69 extern int hard_smp_processor_id(void); 69 extern int hard_smp_processor_id(void);
70 #else 70 #else
71 #include <mach_apicdef.h> 71 #include <mach_apicdef.h>
72 static inline int hard_smp_processor_id(void) 72 static inline int hard_smp_processor_id(void)
73 { 73 {
74 /* we don't want to mark this access volatile - bad code generation */ 74 /* we don't want to mark this access volatile - bad code generation */
75 return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID)); 75 return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
76 } 76 }
77 #endif 77 #endif
78 78
79 static __inline int logical_smp_processor_id(void) 79 static __inline int logical_smp_processor_id(void)
80 { 80 {
81 /* we don't want to mark this access volatile - bad code generation */ 81 /* we don't want to mark this access volatile - bad code generation */
82 return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR)); 82 return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
83 } 83 }
84 84
85 #endif 85 #endif
86 #endif /* !__ASSEMBLY__ */ 86 #endif /* !__ASSEMBLY__ */
87 87
88 #define NO_PROC_ID 0xFF /* No processor magic marker */ 88 #define NO_PROC_ID 0xFF /* No processor magic marker */
89 89
90 #endif 90 #endif
91 #endif 91 #endif
92 92
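Two different identities are exposed by this header: raw_smp_processor_id() returns the kernel's logical CPU number cached in thread_info, while hard_smp_processor_id() reads the physical APIC ID. A compressed userspace model of how a debug-checking wrapper can sit on top of the raw per-architecture definition is sketched below; apart from raw_smp_processor_id(), the names and the check itself are simplified stand-ins, and the real checking logic lives elsewhere in the tree rather than in this header.

#include <stdio.h>

static int thread_info_cpu = 3;   /* models current_thread_info()->cpu */
static int preempt_count   = 0;   /* zero means "preemptible" here     */

#define raw_smp_processor_id()  (thread_info_cpu)

/* Debug-flavoured variant: same value, but complain if the caller
 * could migrate before using it. */
static int checked_processor_id(void)
{
	if (preempt_count == 0)
		fprintf(stderr, "warning: CPU number used in preemptible context\n");
	return raw_smp_processor_id();
}

#define smp_processor_id() checked_processor_id()

int main(void)
{
	printf("raw: %d\n", raw_smp_processor_id());
	preempt_count++;                 /* pretend preemption is disabled */
	printf("checked: %d\n", smp_processor_id());
	return 0;
}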
include/asm-ia64/smp.h
1 /* 1 /*
2 * SMP Support 2 * SMP Support
3 * 3 *
4 * Copyright (C) 1999 VA Linux Systems 4 * Copyright (C) 1999 VA Linux Systems
5 * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> 5 * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
6 * (c) Copyright 2001-2003, 2005 Hewlett-Packard Development Company, L.P. 6 * (c) Copyright 2001-2003, 2005 Hewlett-Packard Development Company, L.P.
7 * David Mosberger-Tang <davidm@hpl.hp.com> 7 * David Mosberger-Tang <davidm@hpl.hp.com>
8 * Bjorn Helgaas <bjorn.helgaas@hp.com> 8 * Bjorn Helgaas <bjorn.helgaas@hp.com>
9 */ 9 */
10 #ifndef _ASM_IA64_SMP_H 10 #ifndef _ASM_IA64_SMP_H
11 #define _ASM_IA64_SMP_H 11 #define _ASM_IA64_SMP_H
12 12
13 #include <linux/config.h> 13 #include <linux/config.h>
14 #include <linux/init.h> 14 #include <linux/init.h>
15 #include <linux/threads.h> 15 #include <linux/threads.h>
16 #include <linux/kernel.h> 16 #include <linux/kernel.h>
17 #include <linux/cpumask.h> 17 #include <linux/cpumask.h>
18 18
19 #include <asm/bitops.h> 19 #include <asm/bitops.h>
20 #include <asm/io.h> 20 #include <asm/io.h>
21 #include <asm/param.h> 21 #include <asm/param.h>
22 #include <asm/processor.h> 22 #include <asm/processor.h>
23 #include <asm/ptrace.h> 23 #include <asm/ptrace.h>
24 24
25 static inline unsigned int 25 static inline unsigned int
26 ia64_get_lid (void) 26 ia64_get_lid (void)
27 { 27 {
28 union { 28 union {
29 struct { 29 struct {
30 unsigned long reserved : 16; 30 unsigned long reserved : 16;
31 unsigned long eid : 8; 31 unsigned long eid : 8;
32 unsigned long id : 8; 32 unsigned long id : 8;
33 unsigned long ignored : 32; 33 unsigned long ignored : 32;
34 } f; 34 } f;
35 unsigned long bits; 35 unsigned long bits;
36 } lid; 36 } lid;
37 37
38 lid.bits = ia64_getreg(_IA64_REG_CR_LID); 38 lid.bits = ia64_getreg(_IA64_REG_CR_LID);
39 return lid.f.id << 8 | lid.f.eid; 39 return lid.f.id << 8 | lid.f.eid;
40 } 40 }
41 41
42 #ifdef CONFIG_SMP 42 #ifdef CONFIG_SMP
43 43
44 #define XTP_OFFSET 0x1e0008 44 #define XTP_OFFSET 0x1e0008
45 45
46 #define SMP_IRQ_REDIRECTION (1 << 0) 46 #define SMP_IRQ_REDIRECTION (1 << 0)
47 #define SMP_IPI_REDIRECTION (1 << 1) 47 #define SMP_IPI_REDIRECTION (1 << 1)
48 48
49 #define smp_processor_id() (current_thread_info()->cpu) 49 #define raw_smp_processor_id() (current_thread_info()->cpu)
50 50
51 extern struct smp_boot_data { 51 extern struct smp_boot_data {
52 int cpu_count; 52 int cpu_count;
53 int cpu_phys_id[NR_CPUS]; 53 int cpu_phys_id[NR_CPUS];
54 } smp_boot_data __initdata; 54 } smp_boot_data __initdata;
55 55
56 extern char no_int_routing __devinitdata; 56 extern char no_int_routing __devinitdata;
57 57
58 extern cpumask_t cpu_online_map; 58 extern cpumask_t cpu_online_map;
59 extern cpumask_t cpu_core_map[NR_CPUS]; 59 extern cpumask_t cpu_core_map[NR_CPUS];
60 extern cpumask_t cpu_sibling_map[NR_CPUS]; 60 extern cpumask_t cpu_sibling_map[NR_CPUS];
61 extern int smp_num_siblings; 61 extern int smp_num_siblings;
62 extern int smp_num_cpucores; 62 extern int smp_num_cpucores;
63 extern void __iomem *ipi_base_addr; 63 extern void __iomem *ipi_base_addr;
64 extern unsigned char smp_int_redirect; 64 extern unsigned char smp_int_redirect;
65 65
66 extern volatile int ia64_cpu_to_sapicid[]; 66 extern volatile int ia64_cpu_to_sapicid[];
67 #define cpu_physical_id(i) ia64_cpu_to_sapicid[i] 67 #define cpu_physical_id(i) ia64_cpu_to_sapicid[i]
68 68
69 extern unsigned long ap_wakeup_vector; 69 extern unsigned long ap_wakeup_vector;
70 70
71 /* 71 /*
72 * Function to map hard smp processor id to logical id. Slow, so don't use this in 72 * Function to map hard smp processor id to logical id. Slow, so don't use this in
73 * performance-critical code. 73 * performance-critical code.
74 */ 74 */
75 static inline int 75 static inline int
76 cpu_logical_id (int cpuid) 76 cpu_logical_id (int cpuid)
77 { 77 {
78 int i; 78 int i;
79 79
80 for (i = 0; i < NR_CPUS; ++i) 80 for (i = 0; i < NR_CPUS; ++i)
81 if (cpu_physical_id(i) == cpuid) 81 if (cpu_physical_id(i) == cpuid)
82 break; 82 break;
83 return i; 83 return i;
84 } 84 }
85 85
86 /* 86 /*
87 * XTP control functions: 87 * XTP control functions:
88 * min_xtp : route all interrupts to this CPU 88 * min_xtp : route all interrupts to this CPU
89 * normal_xtp: nominal XTP value 89 * normal_xtp: nominal XTP value
90 * max_xtp : never deliver interrupts to this CPU. 90 * max_xtp : never deliver interrupts to this CPU.
91 */ 91 */
92 92
93 static inline void 93 static inline void
94 min_xtp (void) 94 min_xtp (void)
95 { 95 {
96 if (smp_int_redirect & SMP_IRQ_REDIRECTION) 96 if (smp_int_redirect & SMP_IRQ_REDIRECTION)
97 writeb(0x00, ipi_base_addr + XTP_OFFSET); /* XTP to min */ 97 writeb(0x00, ipi_base_addr + XTP_OFFSET); /* XTP to min */
98 } 98 }
99 99
100 static inline void 100 static inline void
101 normal_xtp (void) 101 normal_xtp (void)
102 { 102 {
103 if (smp_int_redirect & SMP_IRQ_REDIRECTION) 103 if (smp_int_redirect & SMP_IRQ_REDIRECTION)
104 writeb(0x08, ipi_base_addr + XTP_OFFSET); /* XTP normal */ 104 writeb(0x08, ipi_base_addr + XTP_OFFSET); /* XTP normal */
105 } 105 }
106 106
107 static inline void 107 static inline void
108 max_xtp (void) 108 max_xtp (void)
109 { 109 {
110 if (smp_int_redirect & SMP_IRQ_REDIRECTION) 110 if (smp_int_redirect & SMP_IRQ_REDIRECTION)
111 writeb(0x0f, ipi_base_addr + XTP_OFFSET); /* Set XTP to max */ 111 writeb(0x0f, ipi_base_addr + XTP_OFFSET); /* Set XTP to max */
112 } 112 }
113 113
114 #define hard_smp_processor_id() ia64_get_lid() 114 #define hard_smp_processor_id() ia64_get_lid()
115 115
116 /* Upping and downing of CPUs */ 116 /* Upping and downing of CPUs */
117 extern int __cpu_disable (void); 117 extern int __cpu_disable (void);
118 extern void __cpu_die (unsigned int cpu); 118 extern void __cpu_die (unsigned int cpu);
119 extern void cpu_die (void) __attribute__ ((noreturn)); 119 extern void cpu_die (void) __attribute__ ((noreturn));
120 extern int __cpu_up (unsigned int cpu); 120 extern int __cpu_up (unsigned int cpu);
121 extern void __init smp_build_cpu_map(void); 121 extern void __init smp_build_cpu_map(void);
122 122
123 extern void __init init_smp_config (void); 123 extern void __init init_smp_config (void);
124 extern void smp_do_timer (struct pt_regs *regs); 124 extern void smp_do_timer (struct pt_regs *regs);
125 125
126 extern int smp_call_function_single (int cpuid, void (*func) (void *info), void *info, 126 extern int smp_call_function_single (int cpuid, void (*func) (void *info), void *info,
127 int retry, int wait); 127 int retry, int wait);
128 extern void smp_send_reschedule (int cpu); 128 extern void smp_send_reschedule (int cpu);
129 extern void lock_ipi_calllock(void); 129 extern void lock_ipi_calllock(void);
130 extern void unlock_ipi_calllock(void); 130 extern void unlock_ipi_calllock(void);
131 extern void identify_siblings (struct cpuinfo_ia64 *); 131 extern void identify_siblings (struct cpuinfo_ia64 *);
132 132
133 #else 133 #else
134 134
135 #define cpu_logical_id(i) 0 135 #define cpu_logical_id(i) 0
136 #define cpu_physical_id(i) ia64_get_lid() 136 #define cpu_physical_id(i) ia64_get_lid()
137 137
138 #endif /* CONFIG_SMP */ 138 #endif /* CONFIG_SMP */
139 #endif /* _ASM_IA64_SMP_H */ 139 #endif /* _ASM_IA64_SMP_H */
140 140
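The LID decode above packs the physical identity as (id << 8) | eid, with id in bits 24..31 and eid in bits 16..23 of the CR LID register, and cpu_logical_id() then recovers the logical index by a linear scan of ia64_cpu_to_sapicid[], which is why the comment warns against using it in performance-critical code. A small standalone check of the same bit layout and reverse mapping, using a made-up register value and table, is below.

#include <stdio.h>

#define DEMO_NR_CPUS 4

/* Hypothetical logical-index -> physical-id table, in the role of
 * ia64_cpu_to_sapicid[]. */
static const unsigned int cpu_to_physid[DEMO_NR_CPUS] = {
	0x0000, 0x0001, 0x0100, 0x0101
};

/* Mirrors ia64_get_lid(): id lives in bits 24..31, eid in bits 16..23. */
static unsigned int decode_lid(unsigned long lid)
{
	unsigned int eid = (lid >> 16) & 0xff;
	unsigned int id  = (lid >> 24) & 0xff;

	return id << 8 | eid;
}

/* Mirrors cpu_logical_id(): linear search, hence "slow". */
static int logical_id(unsigned int physid)
{
	int i;

	for (i = 0; i < DEMO_NR_CPUS; i++)
		if (cpu_to_physid[i] == physid)
			return i;
	return -1;
}

int main(void)
{
	unsigned long fake_cr_lid = 0x01010000UL;   /* id = 0x01, eid = 0x01 */
	unsigned int physid = decode_lid(fake_cr_lid);

	printf("physical id 0x%04x -> logical cpu %d\n", physid, logical_id(physid));
	return 0;
}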
include/asm-m32r/smp.h
1 #ifndef _ASM_M32R_SMP_H 1 #ifndef _ASM_M32R_SMP_H
2 #define _ASM_M32R_SMP_H 2 #define _ASM_M32R_SMP_H
3 3
4 /* $Id$ */ 4 /* $Id$ */
5 5
6 #include <linux/config.h> 6 #include <linux/config.h>
7 7
8 #ifdef CONFIG_SMP 8 #ifdef CONFIG_SMP
9 #ifndef __ASSEMBLY__ 9 #ifndef __ASSEMBLY__
10 10
11 #include <linux/cpumask.h> 11 #include <linux/cpumask.h>
12 #include <linux/spinlock.h> 12 #include <linux/spinlock.h>
13 #include <linux/threads.h> 13 #include <linux/threads.h>
14 #include <asm/m32r.h> 14 #include <asm/m32r.h>
15 15
16 #define PHYSID_ARRAY_SIZE 1 16 #define PHYSID_ARRAY_SIZE 1
17 17
18 struct physid_mask 18 struct physid_mask
19 { 19 {
20 unsigned long mask[PHYSID_ARRAY_SIZE]; 20 unsigned long mask[PHYSID_ARRAY_SIZE];
21 }; 21 };
22 22
23 typedef struct physid_mask physid_mask_t; 23 typedef struct physid_mask physid_mask_t;
24 24
25 #define physid_set(physid, map) set_bit(physid, (map).mask) 25 #define physid_set(physid, map) set_bit(physid, (map).mask)
26 #define physid_clear(physid, map) clear_bit(physid, (map).mask) 26 #define physid_clear(physid, map) clear_bit(physid, (map).mask)
27 #define physid_isset(physid, map) test_bit(physid, (map).mask) 27 #define physid_isset(physid, map) test_bit(physid, (map).mask)
28 #define physid_test_and_set(physid, map) test_and_set_bit(physid, (map).mask) 28 #define physid_test_and_set(physid, map) test_and_set_bit(physid, (map).mask)
29 29
30 #define physids_and(dst, src1, src2) bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS) 30 #define physids_and(dst, src1, src2) bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
31 #define physids_or(dst, src1, src2) bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS) 31 #define physids_or(dst, src1, src2) bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
32 #define physids_clear(map) bitmap_zero((map).mask, MAX_APICS) 32 #define physids_clear(map) bitmap_zero((map).mask, MAX_APICS)
33 #define physids_complement(dst, src) bitmap_complement((dst).mask,(src).mask, MAX_APICS) 33 #define physids_complement(dst, src) bitmap_complement((dst).mask,(src).mask, MAX_APICS)
34 #define physids_empty(map) bitmap_empty((map).mask, MAX_APICS) 34 #define physids_empty(map) bitmap_empty((map).mask, MAX_APICS)
35 #define physids_equal(map1, map2) bitmap_equal((map1).mask, (map2).mask, MAX_APICS) 35 #define physids_equal(map1, map2) bitmap_equal((map1).mask, (map2).mask, MAX_APICS)
36 #define physids_weight(map) bitmap_weight((map).mask, MAX_APICS) 36 #define physids_weight(map) bitmap_weight((map).mask, MAX_APICS)
37 #define physids_shift_right(d, s, n) bitmap_shift_right((d).mask, (s).mask, n, MAX_APICS) 37 #define physids_shift_right(d, s, n) bitmap_shift_right((d).mask, (s).mask, n, MAX_APICS)
38 #define physids_shift_left(d, s, n) bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS) 38 #define physids_shift_left(d, s, n) bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS)
39 #define physids_coerce(map) ((map).mask[0]) 39 #define physids_coerce(map) ((map).mask[0])
40 40
41 #define physids_promote(physids) \ 41 #define physids_promote(physids) \
42 ({ \ 42 ({ \
43 physid_mask_t __physid_mask = PHYSID_MASK_NONE; \ 43 physid_mask_t __physid_mask = PHYSID_MASK_NONE; \
44 __physid_mask.mask[0] = physids; \ 44 __physid_mask.mask[0] = physids; \
45 __physid_mask; \ 45 __physid_mask; \
46 }) 46 })
47 47
48 #define physid_mask_of_physid(physid) \ 48 #define physid_mask_of_physid(physid) \
49 ({ \ 49 ({ \
50 physid_mask_t __physid_mask = PHYSID_MASK_NONE; \ 50 physid_mask_t __physid_mask = PHYSID_MASK_NONE; \
51 physid_set(physid, __physid_mask); \ 51 physid_set(physid, __physid_mask); \
52 __physid_mask; \ 52 __physid_mask; \
53 }) 53 })
54 54
55 #define PHYSID_MASK_ALL { {[0 ... PHYSID_ARRAY_SIZE-1] = ~0UL} } 55 #define PHYSID_MASK_ALL { {[0 ... PHYSID_ARRAY_SIZE-1] = ~0UL} }
56 #define PHYSID_MASK_NONE { {[0 ... PHYSID_ARRAY_SIZE-1] = 0UL} } 56 #define PHYSID_MASK_NONE { {[0 ... PHYSID_ARRAY_SIZE-1] = 0UL} }
57 57
58 extern physid_mask_t phys_cpu_present_map; 58 extern physid_mask_t phys_cpu_present_map;
59 59
60 /* 60 /*
61 * Some lowlevel functions might want to know about 61 * Some lowlevel functions might want to know about
62 * the real CPU ID <-> CPU # mapping. 62 * the real CPU ID <-> CPU # mapping.
63 */ 63 */
64 extern volatile int physid_2_cpu[NR_CPUS]; 64 extern volatile int physid_2_cpu[NR_CPUS];
65 extern volatile int cpu_2_physid[NR_CPUS]; 65 extern volatile int cpu_2_physid[NR_CPUS];
66 #define physid_to_cpu(physid) physid_2_cpu[physid] 66 #define physid_to_cpu(physid) physid_2_cpu[physid]
67 #define cpu_to_physid(cpu_id) cpu_2_physid[cpu_id] 67 #define cpu_to_physid(cpu_id) cpu_2_physid[cpu_id]
68 68
69 #define smp_processor_id() (current_thread_info()->cpu) 69 #define raw_smp_processor_id() (current_thread_info()->cpu)
70 70
71 extern cpumask_t cpu_callout_map; 71 extern cpumask_t cpu_callout_map;
72 #define cpu_possible_map cpu_callout_map 72 #define cpu_possible_map cpu_callout_map
73 73
74 static __inline__ int hard_smp_processor_id(void) 74 static __inline__ int hard_smp_processor_id(void)
75 { 75 {
76 return (int)*(volatile long *)M32R_CPUID_PORTL; 76 return (int)*(volatile long *)M32R_CPUID_PORTL;
77 } 77 }
78 78
79 static __inline__ int cpu_logical_map(int cpu) 79 static __inline__ int cpu_logical_map(int cpu)
80 { 80 {
81 return cpu; 81 return cpu;
82 } 82 }
83 83
84 static __inline__ int cpu_number_map(int cpu) 84 static __inline__ int cpu_number_map(int cpu)
85 { 85 {
86 return cpu; 86 return cpu;
87 } 87 }
88 88
89 static __inline__ unsigned int num_booting_cpus(void) 89 static __inline__ unsigned int num_booting_cpus(void)
90 { 90 {
91 return cpus_weight(cpu_callout_map); 91 return cpus_weight(cpu_callout_map);
92 } 92 }
93 93
94 extern void smp_send_timer(void); 94 extern void smp_send_timer(void);
95 extern unsigned long send_IPI_mask_phys(cpumask_t, int, int); 95 extern unsigned long send_IPI_mask_phys(cpumask_t, int, int);
96 96
97 #endif /* not __ASSEMBLY__ */ 97 #endif /* not __ASSEMBLY__ */
98 98
99 #define NO_PROC_ID (0xff) /* No processor magic marker */ 99 #define NO_PROC_ID (0xff) /* No processor magic marker */
100 100
101 #define PROC_CHANGE_PENALTY (15) /* Schedule penalty */ 101 #define PROC_CHANGE_PENALTY (15) /* Schedule penalty */
102 102
103 /* 103 /*
104 * M32R-mp IPI 104 * M32R-mp IPI
105 */ 105 */
106 #define RESCHEDULE_IPI (M32R_IRQ_IPI0-M32R_IRQ_IPI0) 106 #define RESCHEDULE_IPI (M32R_IRQ_IPI0-M32R_IRQ_IPI0)
107 #define INVALIDATE_TLB_IPI (M32R_IRQ_IPI1-M32R_IRQ_IPI0) 107 #define INVALIDATE_TLB_IPI (M32R_IRQ_IPI1-M32R_IRQ_IPI0)
108 #define CALL_FUNCTION_IPI (M32R_IRQ_IPI2-M32R_IRQ_IPI0) 108 #define CALL_FUNCTION_IPI (M32R_IRQ_IPI2-M32R_IRQ_IPI0)
109 #define LOCAL_TIMER_IPI (M32R_IRQ_IPI3-M32R_IRQ_IPI0) 109 #define LOCAL_TIMER_IPI (M32R_IRQ_IPI3-M32R_IRQ_IPI0)
110 #define INVALIDATE_CACHE_IPI (M32R_IRQ_IPI4-M32R_IRQ_IPI0) 110 #define INVALIDATE_CACHE_IPI (M32R_IRQ_IPI4-M32R_IRQ_IPI0)
111 #define CPU_BOOT_IPI (M32R_IRQ_IPI5-M32R_IRQ_IPI0) 111 #define CPU_BOOT_IPI (M32R_IRQ_IPI5-M32R_IRQ_IPI0)
112 112
113 #define IPI_SHIFT (0) 113 #define IPI_SHIFT (0)
114 #define NR_IPIS (8) 114 #define NR_IPIS (8)
115 115
116 #endif /* CONFIG_SMP */ 116 #endif /* CONFIG_SMP */
117 117
118 #endif /* _ASM_M32R_SMP_H */ 118 #endif /* _ASM_M32R_SMP_H */
119 119
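physid_mask_t above is just a fixed-size bitmap wrapped in a struct, and the surrounding macros are thin forwards to the generic bitmap helpers. A compact userspace rendering of the same idea, sized for a single word and using plain bit operations instead of the kernel bitmap API (all demo_* names are invented for the sketch), is shown below; like the original, it uses a GCC statement expression.

#include <stdio.h>

/* One-word stand-in for physid_mask_t (PHYSID_ARRAY_SIZE == 1 here too). */
struct demo_physid_mask {
	unsigned long mask[1];
};

#define demo_physid_set(physid, map)    ((map).mask[0] |=  (1UL << (physid)))
#define demo_physid_clear(physid, map)  ((map).mask[0] &= ~(1UL << (physid)))
#define demo_physid_isset(physid, map)  (((map).mask[0] >> (physid)) & 1UL)

/* Like physid_mask_of_physid(): a mask with exactly one bit set. */
#define demo_mask_of_physid(physid)                 \
({                                                  \
	struct demo_physid_mask __m = { { 0UL } };  \
	demo_physid_set((physid), __m);             \
	__m;                                        \
})

int main(void)
{
	struct demo_physid_mask m = demo_mask_of_physid(5);

	demo_physid_set(2, m);
	demo_physid_clear(5, m);
	printf("bit 2: %lu, bit 5: %lu, raw mask: %#lx\n",
	       demo_physid_isset(2, m), demo_physid_isset(5, m), m.mask[0]);
	return 0;
}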
include/asm-mips/smp.h
1 /* 1 /*
2 * This file is subject to the terms and conditions of the GNU General 2 * This file is subject to the terms and conditions of the GNU General
3 * Public License. See the file "COPYING" in the main directory of this 3 * Public License. See the file "COPYING" in the main directory of this
4 * archive for more details. 4 * archive for more details.
5 * 5 *
6 * Copyright (C) 2000 - 2001 by Kanoj Sarcar (kanoj@sgi.com) 6 * Copyright (C) 2000 - 2001 by Kanoj Sarcar (kanoj@sgi.com)
7 * Copyright (C) 2000 - 2001 by Silicon Graphics, Inc. 7 * Copyright (C) 2000 - 2001 by Silicon Graphics, Inc.
8 * Copyright (C) 2000, 2001, 2002 Ralf Baechle 8 * Copyright (C) 2000, 2001, 2002 Ralf Baechle
9 * Copyright (C) 2000, 2001 Broadcom Corporation 9 * Copyright (C) 2000, 2001 Broadcom Corporation
10 */ 10 */
11 #ifndef __ASM_SMP_H 11 #ifndef __ASM_SMP_H
12 #define __ASM_SMP_H 12 #define __ASM_SMP_H
13 13
14 #include <linux/config.h> 14 #include <linux/config.h>
15 15
16 #ifdef CONFIG_SMP 16 #ifdef CONFIG_SMP
17 17
18 #include <linux/bitops.h> 18 #include <linux/bitops.h>
19 #include <linux/linkage.h> 19 #include <linux/linkage.h>
20 #include <linux/threads.h> 20 #include <linux/threads.h>
21 #include <linux/cpumask.h> 21 #include <linux/cpumask.h>
22 #include <asm/atomic.h> 22 #include <asm/atomic.h>
23 23
24 #define smp_processor_id() (current_thread_info()->cpu) 24 #define raw_smp_processor_id() (current_thread_info()->cpu)
25 25
26 /* Map from cpu id to sequential logical cpu number. This will only 26 /* Map from cpu id to sequential logical cpu number. This will only
27 not be idempotent when cpus failed to come on-line. */ 27 not be idempotent when cpus failed to come on-line. */
28 extern int __cpu_number_map[NR_CPUS]; 28 extern int __cpu_number_map[NR_CPUS];
29 #define cpu_number_map(cpu) __cpu_number_map[cpu] 29 #define cpu_number_map(cpu) __cpu_number_map[cpu]
30 30
31 /* The reverse map from sequential logical cpu number to cpu id. */ 31 /* The reverse map from sequential logical cpu number to cpu id. */
32 extern int __cpu_logical_map[NR_CPUS]; 32 extern int __cpu_logical_map[NR_CPUS];
33 #define cpu_logical_map(cpu) __cpu_logical_map[cpu] 33 #define cpu_logical_map(cpu) __cpu_logical_map[cpu]
34 34
35 #define NO_PROC_ID (-1) 35 #define NO_PROC_ID (-1)
36 36
37 struct call_data_struct { 37 struct call_data_struct {
38 void (*func)(void *); 38 void (*func)(void *);
39 void *info; 39 void *info;
40 atomic_t started; 40 atomic_t started;
41 atomic_t finished; 41 atomic_t finished;
42 int wait; 42 int wait;
43 }; 43 };
44 44
45 extern struct call_data_struct *call_data; 45 extern struct call_data_struct *call_data;
46 46
47 #define SMP_RESCHEDULE_YOURSELF 0x1 /* XXX braindead */ 47 #define SMP_RESCHEDULE_YOURSELF 0x1 /* XXX braindead */
48 #define SMP_CALL_FUNCTION 0x2 48 #define SMP_CALL_FUNCTION 0x2
49 49
50 extern cpumask_t phys_cpu_present_map; 50 extern cpumask_t phys_cpu_present_map;
51 extern cpumask_t cpu_online_map; 51 extern cpumask_t cpu_online_map;
52 #define cpu_possible_map phys_cpu_present_map 52 #define cpu_possible_map phys_cpu_present_map
53 53
54 extern cpumask_t cpu_callout_map; 54 extern cpumask_t cpu_callout_map;
55 /* We don't mark CPUs online until __cpu_up(), so we need another measure */ 55 /* We don't mark CPUs online until __cpu_up(), so we need another measure */
56 static inline int num_booting_cpus(void) 56 static inline int num_booting_cpus(void)
57 { 57 {
58 return cpus_weight(cpu_callout_map); 58 return cpus_weight(cpu_callout_map);
59 } 59 }
60 60
61 /* These are defined by the board-specific code. */ 61 /* These are defined by the board-specific code. */
62 62
63 /* 63 /*
64 * Cause the function described by call_data to be executed on the passed 64 * Cause the function described by call_data to be executed on the passed
65 * cpu. When the function has finished, increment the finished field of 65 * cpu. When the function has finished, increment the finished field of
66 * call_data. 66 * call_data.
67 */ 67 */
68 extern void core_send_ipi(int cpu, unsigned int action); 68 extern void core_send_ipi(int cpu, unsigned int action);
69 69
70 /* 70 /*
71 * Firmware CPU startup hook 71 * Firmware CPU startup hook
72 */ 72 */
73 extern void prom_boot_secondary(int cpu, struct task_struct *idle); 73 extern void prom_boot_secondary(int cpu, struct task_struct *idle);
74 74
75 /* 75 /*
76 * After we've done initial boot, this function is called to allow the 76 * After we've done initial boot, this function is called to allow the
77 * board code to clean up state, if needed 77 * board code to clean up state, if needed
78 */ 78 */
79 extern void prom_init_secondary(void); 79 extern void prom_init_secondary(void);
80 80
81 /* 81 /*
82 * Detect available CPUs, populate phys_cpu_present_map before smp_init 82 * Detect available CPUs, populate phys_cpu_present_map before smp_init
83 */ 83 */
84 extern void prom_prepare_cpus(unsigned int max_cpus); 84 extern void prom_prepare_cpus(unsigned int max_cpus);
85 85
86 /* 86 /*
87 * Last chance for the board code to finish SMP initialization before 87 * Last chance for the board code to finish SMP initialization before
88 * the CPU is "online". 88 * the CPU is "online".
89 */ 89 */
90 extern void prom_smp_finish(void); 90 extern void prom_smp_finish(void);
91 91
92 /* Hook for after all CPUs are online */ 92 /* Hook for after all CPUs are online */
93 extern void prom_cpus_done(void); 93 extern void prom_cpus_done(void);
94 94
95 extern void asmlinkage smp_bootstrap(void); 95 extern void asmlinkage smp_bootstrap(void);
96 96
97 /* 97 /*
98 * this function sends a 'reschedule' IPI to another CPU. 98 * this function sends a 'reschedule' IPI to another CPU.
99 * it goes straight through and wastes no time serializing 99 * it goes straight through and wastes no time serializing
100 * anything. Worst case is that we lose a reschedule ... 100 * anything. Worst case is that we lose a reschedule ...
101 */ 101 */
102 static inline void smp_send_reschedule(int cpu) 102 static inline void smp_send_reschedule(int cpu)
103 { 103 {
104 core_send_ipi(cpu, SMP_RESCHEDULE_YOURSELF); 104 core_send_ipi(cpu, SMP_RESCHEDULE_YOURSELF);
105 } 105 }
106 106
107 extern asmlinkage void smp_call_function_interrupt(void); 107 extern asmlinkage void smp_call_function_interrupt(void);
108 108
109 #endif /* CONFIG_SMP */ 109 #endif /* CONFIG_SMP */
110 110
111 #endif /* __ASM_SMP_H */ 111 #endif /* __ASM_SMP_H */
112 112
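The comments in this header describe a small rendezvous protocol: the initiating CPU publishes func/info in call_data, kicks the target with core_send_ipi(), and the target's interrupt handler bumps started, runs the function, and bumps finished so the initiator knows when call_data may be reused. A userspace sketch of that handshake, substituting a thread for the IPI and a mutex-protected counter for atomic_t (all demo names are illustrative), follows; build it with -lpthread.

#include <pthread.h>
#include <stdio.h>

struct demo_call_data {
	void (*func)(void *);
	void *info;
	int started;
	int finished;
	pthread_mutex_t lock;
};

static struct demo_call_data call_data = {
	.lock = PTHREAD_MUTEX_INITIALIZER,
};

static void say_hello(void *info)
{
	printf("running '%s' on the pretend remote cpu\n", (const char *)info);
}

/* What the IPI handler on the target CPU would do. */
static void *ipi_target(void *unused)
{
	(void)unused;
	pthread_mutex_lock(&call_data.lock);
	call_data.started++;
	pthread_mutex_unlock(&call_data.lock);

	call_data.func(call_data.info);

	pthread_mutex_lock(&call_data.lock);
	call_data.finished++;
	pthread_mutex_unlock(&call_data.lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	call_data.func = say_hello;
	call_data.info = "demo payload";

	pthread_create(&t, NULL, ipi_target, NULL);  /* stands in for core_send_ipi() */
	pthread_join(t, NULL);                       /* the kernel polls 'finished' instead */

	printf("started=%d finished=%d\n", call_data.started, call_data.finished);
	return 0;
}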
include/asm-parisc/smp.h
1 #ifndef __ASM_SMP_H 1 #ifndef __ASM_SMP_H
2 #define __ASM_SMP_H 2 #define __ASM_SMP_H
3 3
4 #include <linux/config.h> 4 #include <linux/config.h>
5 5
6 #if defined(CONFIG_SMP) 6 #if defined(CONFIG_SMP)
7 7
8 /* Page Zero Location PDC will look for the address to branch to when we poke 8 /* Page Zero Location PDC will look for the address to branch to when we poke
9 ** slave CPUs still in "Icache loop". 9 ** slave CPUs still in "Icache loop".
10 */ 10 */
11 #define PDC_OS_BOOT_RENDEZVOUS 0x10 11 #define PDC_OS_BOOT_RENDEZVOUS 0x10
12 #define PDC_OS_BOOT_RENDEZVOUS_HI 0x28 12 #define PDC_OS_BOOT_RENDEZVOUS_HI 0x28
13 13
14 #ifndef ASSEMBLY 14 #ifndef ASSEMBLY
15 #include <linux/bitops.h> 15 #include <linux/bitops.h>
16 #include <linux/threads.h> /* for NR_CPUS */ 16 #include <linux/threads.h> /* for NR_CPUS */
17 #include <linux/cpumask.h> 17 #include <linux/cpumask.h>
18 typedef unsigned long address_t; 18 typedef unsigned long address_t;
19 19
20 extern cpumask_t cpu_online_map; 20 extern cpumask_t cpu_online_map;
21 21
22 22
23 /* 23 /*
24 * Private routines/data 24 * Private routines/data
25 * 25 *
26 * physical and logical are equivalent until we support CPU hotplug. 26 * physical and logical are equivalent until we support CPU hotplug.
27 */ 27 */
28 #define cpu_number_map(cpu) (cpu) 28 #define cpu_number_map(cpu) (cpu)
29 #define cpu_logical_map(cpu) (cpu) 29 #define cpu_logical_map(cpu) (cpu)
30 30
31 extern void smp_send_reschedule(int cpu); 31 extern void smp_send_reschedule(int cpu);
32 32
33 #endif /* !ASSEMBLY */ 33 #endif /* !ASSEMBLY */
34 34
35 /* 35 /*
36 * This magic constant controls our willingness to transfer 36 * This magic constant controls our willingness to transfer
37 * a process across CPUs. Such a transfer incurs cache and tlb 37 * a process across CPUs. Such a transfer incurs cache and tlb
38 * misses. The current value is inherited from i386. Still needs 38 * misses. The current value is inherited from i386. Still needs
39 * to be tuned for parisc. 39 * to be tuned for parisc.
40 */ 40 */
41 41
42 #define PROC_CHANGE_PENALTY 15 /* Schedule penalty */ 42 #define PROC_CHANGE_PENALTY 15 /* Schedule penalty */
43 43
44 #undef ENTRY_SYS_CPUS 44 #undef ENTRY_SYS_CPUS
45 #ifdef ENTRY_SYS_CPUS 45 #ifdef ENTRY_SYS_CPUS
46 #define STATE_RENDEZVOUS 0 46 #define STATE_RENDEZVOUS 0
47 #define STATE_STOPPED 1 47 #define STATE_STOPPED 1
48 #define STATE_RUNNING 2 48 #define STATE_RUNNING 2
49 #define STATE_HALTED 3 49 #define STATE_HALTED 3
50 #endif 50 #endif
51 51
52 extern unsigned long cpu_present_mask; 52 extern unsigned long cpu_present_mask;
53 53
54 #define smp_processor_id() (current_thread_info()->cpu) 54 #define raw_smp_processor_id() (current_thread_info()->cpu)
55 55
56 #endif /* CONFIG_SMP */ 56 #endif /* CONFIG_SMP */
57 57
58 #define NO_PROC_ID 0xFF /* No processor magic marker */ 58 #define NO_PROC_ID 0xFF /* No processor magic marker */
59 #define ANY_PROC_ID 0xFF /* Any processor magic marker */ 59 #define ANY_PROC_ID 0xFF /* Any processor magic marker */
60 static inline int __cpu_disable (void) { 60 static inline int __cpu_disable (void) {
61 return 0; 61 return 0;
62 } 62 }
63 static inline void __cpu_die (unsigned int cpu) { 63 static inline void __cpu_die (unsigned int cpu) {
64 while(1) 64 while(1)
65 ; 65 ;
66 } 66 }
67 extern int __cpu_up (unsigned int cpu); 67 extern int __cpu_up (unsigned int cpu);
68 68
69 #endif /* __ASM_SMP_H */ 69 #endif /* __ASM_SMP_H */
70 70
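The change above makes raw_smp_processor_id() the plain, unchecked read of current_thread_info()->cpu on parisc. Below is a minimal usage sketch, not part of the patch: the raw read is enough for a best-effort value such as a log message, while code that must stay on one CPU would pin itself with get_cpu()/put_cpu(). The function name and message text are invented for illustration.

/* Illustrative only: choosing between a best-effort and a stable CPU number. */
static void example_report_cpu(void)
{
        int cpu;

        /* best-effort value, acceptable in a diagnostic message */
        printk(KERN_DEBUG "roughly on CPU %d\n", raw_smp_processor_id());

        /* stable value: pin the task for the duration of the work */
        cpu = get_cpu();
        /* ... work that must not migrate off 'cpu' ... */
        put_cpu();
}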
include/asm-ppc/smp.h
1 /* smp.h: PPC specific SMP stuff. 1 /* smp.h: PPC specific SMP stuff.
2 * 2 *
3 * Original was a copy of sparc smp.h. Now heavily modified 3 * Original was a copy of sparc smp.h. Now heavily modified
4 * for PPC. 4 * for PPC.
5 * 5 *
6 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) 6 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
7 * Copyright (C) 1996-2001 Cort Dougan <cort@fsmlabs.com> 7 * Copyright (C) 1996-2001 Cort Dougan <cort@fsmlabs.com>
8 */ 8 */
9 #ifdef __KERNEL__ 9 #ifdef __KERNEL__
10 #ifndef _PPC_SMP_H 10 #ifndef _PPC_SMP_H
11 #define _PPC_SMP_H 11 #define _PPC_SMP_H
12 12
13 #include <linux/config.h> 13 #include <linux/config.h>
14 #include <linux/kernel.h> 14 #include <linux/kernel.h>
15 #include <linux/bitops.h> 15 #include <linux/bitops.h>
16 #include <linux/errno.h> 16 #include <linux/errno.h>
17 #include <linux/cpumask.h> 17 #include <linux/cpumask.h>
18 #include <linux/threads.h> 18 #include <linux/threads.h>
19 19
20 #ifdef CONFIG_SMP 20 #ifdef CONFIG_SMP
21 21
22 #ifndef __ASSEMBLY__ 22 #ifndef __ASSEMBLY__
23 23
24 struct cpuinfo_PPC { 24 struct cpuinfo_PPC {
25 unsigned long loops_per_jiffy; 25 unsigned long loops_per_jiffy;
26 unsigned long pvr; 26 unsigned long pvr;
27 unsigned long *pgd_cache; 27 unsigned long *pgd_cache;
28 unsigned long *pte_cache; 28 unsigned long *pte_cache;
29 unsigned long pgtable_cache_sz; 29 unsigned long pgtable_cache_sz;
30 }; 30 };
31 31
32 extern struct cpuinfo_PPC cpu_data[]; 32 extern struct cpuinfo_PPC cpu_data[];
33 extern cpumask_t cpu_online_map; 33 extern cpumask_t cpu_online_map;
34 extern cpumask_t cpu_possible_map; 34 extern cpumask_t cpu_possible_map;
35 extern unsigned long smp_proc_in_lock[]; 35 extern unsigned long smp_proc_in_lock[];
36 extern volatile unsigned long cpu_callin_map[]; 36 extern volatile unsigned long cpu_callin_map[];
37 extern int smp_tb_synchronized; 37 extern int smp_tb_synchronized;
38 38
39 extern void smp_send_tlb_invalidate(int); 39 extern void smp_send_tlb_invalidate(int);
40 extern void smp_send_xmon_break(int cpu); 40 extern void smp_send_xmon_break(int cpu);
41 struct pt_regs; 41 struct pt_regs;
42 extern void smp_message_recv(int, struct pt_regs *); 42 extern void smp_message_recv(int, struct pt_regs *);
43 43
44 #define NO_PROC_ID 0xFF /* No processor magic marker */ 44 #define NO_PROC_ID 0xFF /* No processor magic marker */
45 #define PROC_CHANGE_PENALTY 20 45 #define PROC_CHANGE_PENALTY 20
46 46
47 #define smp_processor_id() (current_thread_info()->cpu) 47 #define raw_smp_processor_id() (current_thread_info()->cpu)
48 48
49 extern int __cpu_up(unsigned int cpu); 49 extern int __cpu_up(unsigned int cpu);
50 50
51 extern int smp_hw_index[]; 51 extern int smp_hw_index[];
52 #define hard_smp_processor_id() (smp_hw_index[smp_processor_id()]) 52 #define hard_smp_processor_id() (smp_hw_index[smp_processor_id()])
53 53
54 struct klock_info_struct { 54 struct klock_info_struct {
55 unsigned long kernel_flag; 55 unsigned long kernel_flag;
56 unsigned char akp; 56 unsigned char akp;
57 }; 57 };
58 58
59 extern struct klock_info_struct klock_info; 59 extern struct klock_info_struct klock_info;
60 #define KLOCK_HELD 0xffffffff 60 #define KLOCK_HELD 0xffffffff
61 #define KLOCK_CLEAR 0x0 61 #define KLOCK_CLEAR 0x0
62 62
63 #endif /* __ASSEMBLY__ */ 63 #endif /* __ASSEMBLY__ */
64 64
65 #else /* !(CONFIG_SMP) */ 65 #else /* !(CONFIG_SMP) */
66 66
67 #endif /* !(CONFIG_SMP) */ 67 #endif /* !(CONFIG_SMP) */
68 68
69 #endif /* !(_PPC_SMP_H) */ 69 #endif /* !(_PPC_SMP_H) */
70 #endif /* __KERNEL__ */ 70 #endif /* __KERNEL__ */
71 71
include/asm-ppc64/smp.h
1 /* 1 /*
2 * smp.h: PPC64 specific SMP code. 2 * smp.h: PPC64 specific SMP code.
3 * 3 *
4 * Original was a copy of sparc smp.h. Now heavily modified 4 * Original was a copy of sparc smp.h. Now heavily modified
5 * for PPC. 5 * for PPC.
6 * 6 *
7 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) 7 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
8 * Copyright (C) 1996-2001 Cort Dougan <cort@fsmlabs.com> 8 * Copyright (C) 1996-2001 Cort Dougan <cort@fsmlabs.com>
9 * 9 *
10 * This program is free software; you can redistribute it and/or 10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License 11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version 12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version. 13 * 2 of the License, or (at your option) any later version.
14 */ 14 */
15 15
16 #ifdef __KERNEL__ 16 #ifdef __KERNEL__
17 #ifndef _PPC64_SMP_H 17 #ifndef _PPC64_SMP_H
18 #define _PPC64_SMP_H 18 #define _PPC64_SMP_H
19 19
20 #include <linux/config.h> 20 #include <linux/config.h>
21 #include <linux/threads.h> 21 #include <linux/threads.h>
22 #include <linux/cpumask.h> 22 #include <linux/cpumask.h>
23 #include <linux/kernel.h> 23 #include <linux/kernel.h>
24 24
25 #ifndef __ASSEMBLY__ 25 #ifndef __ASSEMBLY__
26 26
27 #include <asm/paca.h> 27 #include <asm/paca.h>
28 28
29 extern int boot_cpuid; 29 extern int boot_cpuid;
30 extern int boot_cpuid_phys; 30 extern int boot_cpuid_phys;
31 31
32 extern void cpu_die(void); 32 extern void cpu_die(void);
33 33
34 #ifdef CONFIG_SMP 34 #ifdef CONFIG_SMP
35 35
36 extern void smp_send_debugger_break(int cpu); 36 extern void smp_send_debugger_break(int cpu);
37 struct pt_regs; 37 struct pt_regs;
38 extern void smp_message_recv(int, struct pt_regs *); 38 extern void smp_message_recv(int, struct pt_regs *);
39 39
40 #ifdef CONFIG_HOTPLUG_CPU 40 #ifdef CONFIG_HOTPLUG_CPU
41 extern void fixup_irqs(cpumask_t map); 41 extern void fixup_irqs(cpumask_t map);
42 int generic_cpu_disable(void); 42 int generic_cpu_disable(void);
43 int generic_cpu_enable(unsigned int cpu); 43 int generic_cpu_enable(unsigned int cpu);
44 void generic_cpu_die(unsigned int cpu); 44 void generic_cpu_die(unsigned int cpu);
45 void generic_mach_cpu_die(void); 45 void generic_mach_cpu_die(void);
46 #endif 46 #endif
47 47
48 #define __smp_processor_id() (get_paca()->paca_index) 48 #define raw_smp_processor_id() (get_paca()->paca_index)
49 #define hard_smp_processor_id() (get_paca()->hw_cpu_id) 49 #define hard_smp_processor_id() (get_paca()->hw_cpu_id)
50 50
51 extern cpumask_t cpu_sibling_map[NR_CPUS]; 51 extern cpumask_t cpu_sibling_map[NR_CPUS];
52 52
53 /* Since OpenPIC has only 4 IPIs, we use slightly different message numbers. 53 /* Since OpenPIC has only 4 IPIs, we use slightly different message numbers.
54 * 54 *
55 * Make sure this matches openpic_request_IPIs in open_pic.c, or what shows up 55 * Make sure this matches openpic_request_IPIs in open_pic.c, or what shows up
56 * in /proc/interrupts will be wrong!!! --Troy */ 56 * in /proc/interrupts will be wrong!!! --Troy */
57 #define PPC_MSG_CALL_FUNCTION 0 57 #define PPC_MSG_CALL_FUNCTION 0
58 #define PPC_MSG_RESCHEDULE 1 58 #define PPC_MSG_RESCHEDULE 1
59 /* This is unused now */ 59 /* This is unused now */
60 #if 0 60 #if 0
61 #define PPC_MSG_MIGRATE_TASK 2 61 #define PPC_MSG_MIGRATE_TASK 2
62 #endif 62 #endif
63 #define PPC_MSG_DEBUGGER_BREAK 3 63 #define PPC_MSG_DEBUGGER_BREAK 3
64 64
65 void smp_init_iSeries(void); 65 void smp_init_iSeries(void);
66 void smp_init_pSeries(void); 66 void smp_init_pSeries(void);
67 67
68 extern int __cpu_disable(void); 68 extern int __cpu_disable(void);
69 extern void __cpu_die(unsigned int cpu); 69 extern void __cpu_die(unsigned int cpu);
70 #endif /* CONFIG_SMP */ 70 #endif /* CONFIG_SMP */
71 71
72 #define get_hard_smp_processor_id(CPU) (paca[(CPU)].hw_cpu_id) 72 #define get_hard_smp_processor_id(CPU) (paca[(CPU)].hw_cpu_id)
73 #define set_hard_smp_processor_id(CPU, VAL) \ 73 #define set_hard_smp_processor_id(CPU, VAL) \
74 do { (paca[(CPU)].hw_cpu_id = (VAL)); } while (0) 74 do { (paca[(CPU)].hw_cpu_id = (VAL)); } while (0)
75 75
76 extern int smt_enabled_at_boot; 76 extern int smt_enabled_at_boot;
77 77
78 extern int smp_mpic_probe(void); 78 extern int smp_mpic_probe(void);
79 extern void smp_mpic_setup_cpu(int cpu); 79 extern void smp_mpic_setup_cpu(int cpu);
80 extern void smp_mpic_message_pass(int target, int msg); 80 extern void smp_mpic_message_pass(int target, int msg);
81 extern void smp_generic_kick_cpu(int nr); 81 extern void smp_generic_kick_cpu(int nr);
82 82
83 extern void smp_generic_give_timebase(void); 83 extern void smp_generic_give_timebase(void);
84 extern void smp_generic_take_timebase(void); 84 extern void smp_generic_take_timebase(void);
85 85
86 extern struct smp_ops_t *smp_ops; 86 extern struct smp_ops_t *smp_ops;
87 87
88 #endif /* __ASSEMBLY__ */ 88 #endif /* __ASSEMBLY__ */
89 89
90 #endif /* !(_PPC64_SMP_H) */ 90 #endif /* !(_PPC64_SMP_H) */
91 #endif /* __KERNEL__ */ 91 #endif /* __KERNEL__ */
92 92
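The PPC_MSG_* numbers above, limited to four by OpenPIC, are effectively the entire IPI protocol; the receiving side just switches on the message number. The sketch below is illustrative only and is not the real arch/ppc64 handler; the function name is invented.

/* Illustrative dispatch only; the real handler is smp_message_recv(). */
static void example_message_dispatch(int msg)
{
        switch (msg) {
        case PPC_MSG_CALL_FUNCTION:
                /* run the queued cross-CPU function call */
                break;
        case PPC_MSG_RESCHEDULE:
                /* nothing to do: returning from the interrupt reschedules */
                break;
        case PPC_MSG_DEBUGGER_BREAK:
                /* enter the debugger on this CPU */
                break;
        default:
                break;
        }
}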
include/asm-s390/smp.h
1 /* 1 /*
2 * include/asm-s390/smp.h 2 * include/asm-s390/smp.h
3 * 3 *
4 * S390 version 4 * S390 version
5 * Copyright (C) 1999 IBM Deutschland Entwicklung GmbH, IBM Corporation 5 * Copyright (C) 1999 IBM Deutschland Entwicklung GmbH, IBM Corporation
6 * Author(s): Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com), 6 * Author(s): Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com),
7 * Martin Schwidefsky (schwidefsky@de.ibm.com) 7 * Martin Schwidefsky (schwidefsky@de.ibm.com)
8 * Heiko Carstens (heiko.carstens@de.ibm.com) 8 * Heiko Carstens (heiko.carstens@de.ibm.com)
9 */ 9 */
10 #ifndef __ASM_SMP_H 10 #ifndef __ASM_SMP_H
11 #define __ASM_SMP_H 11 #define __ASM_SMP_H
12 12
13 #include <linux/config.h> 13 #include <linux/config.h>
14 #include <linux/threads.h> 14 #include <linux/threads.h>
15 #include <linux/cpumask.h> 15 #include <linux/cpumask.h>
16 #include <linux/bitops.h> 16 #include <linux/bitops.h>
17 17
18 #if defined(__KERNEL__) && defined(CONFIG_SMP) && !defined(__ASSEMBLY__) 18 #if defined(__KERNEL__) && defined(CONFIG_SMP) && !defined(__ASSEMBLY__)
19 19
20 #include <asm/lowcore.h> 20 #include <asm/lowcore.h>
21 #include <asm/sigp.h> 21 #include <asm/sigp.h>
22 22
23 /* 23 /*
24 s390 specific smp.c headers 24 s390 specific smp.c headers
25 */ 25 */
26 typedef struct 26 typedef struct
27 { 27 {
28 int intresting; 28 int intresting;
29 sigp_ccode ccode; 29 sigp_ccode ccode;
30 __u32 status; 30 __u32 status;
31 __u16 cpu; 31 __u16 cpu;
32 } sigp_info; 32 } sigp_info;
33 33
34 extern int smp_call_function_on(void (*func) (void *info), void *info, 34 extern int smp_call_function_on(void (*func) (void *info), void *info,
35 int nonatomic, int wait, int cpu); 35 int nonatomic, int wait, int cpu);
36 #define NO_PROC_ID 0xFF /* No processor magic marker */ 36 #define NO_PROC_ID 0xFF /* No processor magic marker */
37 37
38 /* 38 /*
39 * This magic constant controls our willingness to transfer 39 * This magic constant controls our willingness to transfer
40 * a process across CPUs. Such a transfer incurs misses on the L1 40 * a process across CPUs. Such a transfer incurs misses on the L1
41 * cache, and on a P6 or P5 with multiple L2 caches L2 hits. My 41 * cache, and on a P6 or P5 with multiple L2 caches L2 hits. My
42 * gut feeling is this will vary by board in value. For a board 42 * gut feeling is this will vary by board in value. For a board
43 * with separate L2 cache it probably depends also on the RSS, and 43 * with separate L2 cache it probably depends also on the RSS, and
44 * for a board with shared L2 cache it ought to decay fast as other 44 * for a board with shared L2 cache it ought to decay fast as other
45 * processes are run. 45 * processes are run.
46 */ 46 */
47 47
48 #define PROC_CHANGE_PENALTY 20 /* Schedule penalty */ 48 #define PROC_CHANGE_PENALTY 20 /* Schedule penalty */
49 49
50 #define smp_processor_id() (S390_lowcore.cpu_data.cpu_nr) 50 #define raw_smp_processor_id() (S390_lowcore.cpu_data.cpu_nr)
51 51
52 extern int smp_get_cpu(cpumask_t cpu_map); 52 extern int smp_get_cpu(cpumask_t cpu_map);
53 extern void smp_put_cpu(int cpu); 53 extern void smp_put_cpu(int cpu);
54 54
55 extern __inline__ __u16 hard_smp_processor_id(void) 55 extern __inline__ __u16 hard_smp_processor_id(void)
56 { 56 {
57 __u16 cpu_address; 57 __u16 cpu_address;
58 58
59 __asm__ ("stap %0\n" : "=m" (cpu_address)); 59 __asm__ ("stap %0\n" : "=m" (cpu_address));
60 return cpu_address; 60 return cpu_address;
61 } 61 }
62 62
63 /* 63 /*
64 * returns 1 if cpu is in stopped/check stopped state or not operational 64 * returns 1 if cpu is in stopped/check stopped state or not operational
65 * returns 0 otherwise 65 * returns 0 otherwise
66 */ 66 */
67 static inline int 67 static inline int
68 smp_cpu_not_running(int cpu) 68 smp_cpu_not_running(int cpu)
69 { 69 {
70 __u32 status; 70 __u32 status;
71 71
72 switch (signal_processor_ps(&status, 0, cpu, sigp_sense)) { 72 switch (signal_processor_ps(&status, 0, cpu, sigp_sense)) {
73 case sigp_order_code_accepted: 73 case sigp_order_code_accepted:
74 case sigp_status_stored: 74 case sigp_status_stored:
75 /* Check for stopped and check stop state */ 75 /* Check for stopped and check stop state */
76 if (status & 0x50) 76 if (status & 0x50)
77 return 1; 77 return 1;
78 break; 78 break;
79 case sigp_not_operational: 79 case sigp_not_operational:
80 return 1; 80 return 1;
81 default: 81 default:
82 break; 82 break;
83 } 83 }
84 return 0; 84 return 0;
85 } 85 }
86 86
87 #define cpu_logical_map(cpu) (cpu) 87 #define cpu_logical_map(cpu) (cpu)
88 88
89 extern int __cpu_disable (void); 89 extern int __cpu_disable (void);
90 extern void __cpu_die (unsigned int cpu); 90 extern void __cpu_die (unsigned int cpu);
91 extern void cpu_die (void) __attribute__ ((noreturn)); 91 extern void cpu_die (void) __attribute__ ((noreturn));
92 extern int __cpu_up (unsigned int cpu); 92 extern int __cpu_up (unsigned int cpu);
93 93
94 #endif 94 #endif
95 95
96 #ifndef CONFIG_SMP 96 #ifndef CONFIG_SMP
97 static inline int 97 static inline int
98 smp_call_function_on(void (*func) (void *info), void *info, 98 smp_call_function_on(void (*func) (void *info), void *info,
99 int nonatomic, int wait, int cpu) 99 int nonatomic, int wait, int cpu)
100 { 100 {
101 func(info); 101 func(info);
102 return 0; 102 return 0;
103 } 103 }
104 #define smp_get_cpu(cpu) ({ 0; }) 104 #define smp_get_cpu(cpu) ({ 0; })
105 #define smp_put_cpu(cpu) ({ 0; }) 105 #define smp_put_cpu(cpu) ({ 0; })
106 #endif 106 #endif
107 107
108 #endif 108 #endif
109 109
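smp_call_function_on(), declared above, runs a function on one chosen CPU, and the !CONFIG_SMP fallback at the end simply calls it locally. A hedged caller sketch follows; the callback, the wrapper name and the argument choice (nonatomic=0, wait=1 for a blocking call) are illustrative assumptions, not code from this patch.

/* Illustrative caller only. */
static void example_bump(void *info)
{
        (*(int *)info)++;
}

static int example_run_on_cpu(int cpu)
{
        int counter = 0;

        /* nonatomic=0, wait=1: block until the target CPU has run it */
        return smp_call_function_on(example_bump, &counter, 0, 1, cpu);
}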
include/asm-sh/smp.h
1 /* 1 /*
2 * include/asm-sh/smp.h 2 * include/asm-sh/smp.h
3 * 3 *
4 * Copyright (C) 2002, 2003 Paul Mundt 4 * Copyright (C) 2002, 2003 Paul Mundt
5 * 5 *
6 * This file is subject to the terms and conditions of the GNU General Public 6 * This file is subject to the terms and conditions of the GNU General Public
7 * License. See the file "COPYING" in the main directory of this archive for 7 * License. See the file "COPYING" in the main directory of this archive for
8 * more details. 8 * more details.
9 */ 9 */
10 #ifndef __ASM_SH_SMP_H 10 #ifndef __ASM_SH_SMP_H
11 #define __ASM_SH_SMP_H 11 #define __ASM_SH_SMP_H
12 12
13 #include <linux/config.h> 13 #include <linux/config.h>
14 #include <linux/bitops.h> 14 #include <linux/bitops.h>
15 #include <linux/cpumask.h> 15 #include <linux/cpumask.h>
16 16
17 #ifdef CONFIG_SMP 17 #ifdef CONFIG_SMP
18 18
19 #include <asm/spinlock.h> 19 #include <asm/spinlock.h>
20 #include <asm/atomic.h> 20 #include <asm/atomic.h>
21 #include <asm/current.h> 21 #include <asm/current.h>
22 22
23 extern cpumask_t cpu_online_map; 23 extern cpumask_t cpu_online_map;
24 extern cpumask_t cpu_possible_map; 24 extern cpumask_t cpu_possible_map;
25 25
26 #define cpu_online(cpu) cpu_isset(cpu, cpu_online_map) 26 #define cpu_online(cpu) cpu_isset(cpu, cpu_online_map)
27 27
28 #define smp_processor_id() (current_thread_info()->cpu) 28 #define raw_smp_processor_id() (current_thread_info()->cpu)
29 29
30 /* I've no idea what the real meaning of this is */ 30 /* I've no idea what the real meaning of this is */
31 #define PROC_CHANGE_PENALTY 20 31 #define PROC_CHANGE_PENALTY 20
32 32
33 #define NO_PROC_ID (-1) 33 #define NO_PROC_ID (-1)
34 34
35 struct smp_fn_call_struct { 35 struct smp_fn_call_struct {
36 spinlock_t lock; 36 spinlock_t lock;
37 atomic_t finished; 37 atomic_t finished;
38 void (*fn)(void *); 38 void (*fn)(void *);
39 void *data; 39 void *data;
40 }; 40 };
41 41
42 extern struct smp_fn_call_struct smp_fn_call; 42 extern struct smp_fn_call_struct smp_fn_call;
43 43
44 #define SMP_MSG_RESCHEDULE 0x0001 44 #define SMP_MSG_RESCHEDULE 0x0001
45 45
46 #endif /* CONFIG_SMP */ 46 #endif /* CONFIG_SMP */
47 47
48 #endif /* __ASM_SH_SMP_H */ 48 #endif /* __ASM_SH_SMP_H */
49 49
include/asm-sparc/smp.h
1 /* smp.h: Sparc specific SMP stuff. 1 /* smp.h: Sparc specific SMP stuff.
2 * 2 *
3 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) 3 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
4 */ 4 */
5 5
6 #ifndef _SPARC_SMP_H 6 #ifndef _SPARC_SMP_H
7 #define _SPARC_SMP_H 7 #define _SPARC_SMP_H
8 8
9 #include <linux/config.h> 9 #include <linux/config.h>
10 #include <linux/threads.h> 10 #include <linux/threads.h>
11 #include <asm/head.h> 11 #include <asm/head.h>
12 #include <asm/btfixup.h> 12 #include <asm/btfixup.h>
13 13
14 #ifndef __ASSEMBLY__ 14 #ifndef __ASSEMBLY__
15 15
16 #include <linux/cpumask.h> 16 #include <linux/cpumask.h>
17 17
18 #endif /* __ASSEMBLY__ */ 18 #endif /* __ASSEMBLY__ */
19 19
20 #ifdef CONFIG_SMP 20 #ifdef CONFIG_SMP
21 21
22 #ifndef __ASSEMBLY__ 22 #ifndef __ASSEMBLY__
23 23
24 #include <asm/ptrace.h> 24 #include <asm/ptrace.h>
25 #include <asm/asi.h> 25 #include <asm/asi.h>
26 #include <asm/atomic.h> 26 #include <asm/atomic.h>
27 27
28 /* 28 /*
29 * Private routines/data 29 * Private routines/data
30 */ 30 */
31 31
32 extern unsigned char boot_cpu_id; 32 extern unsigned char boot_cpu_id;
33 extern cpumask_t phys_cpu_present_map; 33 extern cpumask_t phys_cpu_present_map;
34 #define cpu_possible_map phys_cpu_present_map 34 #define cpu_possible_map phys_cpu_present_map
35 35
36 typedef void (*smpfunc_t)(unsigned long, unsigned long, unsigned long, 36 typedef void (*smpfunc_t)(unsigned long, unsigned long, unsigned long,
37 unsigned long, unsigned long); 37 unsigned long, unsigned long);
38 38
39 /* 39 /*
40 * General functions that each host system must provide. 40 * General functions that each host system must provide.
41 */ 41 */
42 42
43 void sun4m_init_smp(void); 43 void sun4m_init_smp(void);
44 void sun4d_init_smp(void); 44 void sun4d_init_smp(void);
45 45
46 void smp_callin(void); 46 void smp_callin(void);
47 void smp_boot_cpus(void); 47 void smp_boot_cpus(void);
48 void smp_store_cpu_info(int); 48 void smp_store_cpu_info(int);
49 49
50 struct seq_file; 50 struct seq_file;
51 void smp_bogo(struct seq_file *); 51 void smp_bogo(struct seq_file *);
52 void smp_info(struct seq_file *); 52 void smp_info(struct seq_file *);
53 53
54 BTFIXUPDEF_CALL(void, smp_cross_call, smpfunc_t, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) 54 BTFIXUPDEF_CALL(void, smp_cross_call, smpfunc_t, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long)
55 BTFIXUPDEF_CALL(void, smp_message_pass, int, int, unsigned long, int) 55 BTFIXUPDEF_CALL(void, smp_message_pass, int, int, unsigned long, int)
56 BTFIXUPDEF_CALL(int, __hard_smp_processor_id, void) 56 BTFIXUPDEF_CALL(int, __hard_smp_processor_id, void)
57 BTFIXUPDEF_BLACKBOX(hard_smp_processor_id) 57 BTFIXUPDEF_BLACKBOX(hard_smp_processor_id)
58 BTFIXUPDEF_BLACKBOX(load_current) 58 BTFIXUPDEF_BLACKBOX(load_current)
59 59
60 #define smp_cross_call(func,arg1,arg2,arg3,arg4,arg5) BTFIXUP_CALL(smp_cross_call)(func,arg1,arg2,arg3,arg4,arg5) 60 #define smp_cross_call(func,arg1,arg2,arg3,arg4,arg5) BTFIXUP_CALL(smp_cross_call)(func,arg1,arg2,arg3,arg4,arg5)
61 #define smp_message_pass(target,msg,data,wait) BTFIXUP_CALL(smp_message_pass)(target,msg,data,wait) 61 #define smp_message_pass(target,msg,data,wait) BTFIXUP_CALL(smp_message_pass)(target,msg,data,wait)
62 62
63 extern __inline__ void xc0(smpfunc_t func) { smp_cross_call(func, 0, 0, 0, 0, 0); } 63 extern __inline__ void xc0(smpfunc_t func) { smp_cross_call(func, 0, 0, 0, 0, 0); }
64 extern __inline__ void xc1(smpfunc_t func, unsigned long arg1) 64 extern __inline__ void xc1(smpfunc_t func, unsigned long arg1)
65 { smp_cross_call(func, arg1, 0, 0, 0, 0); } 65 { smp_cross_call(func, arg1, 0, 0, 0, 0); }
66 extern __inline__ void xc2(smpfunc_t func, unsigned long arg1, unsigned long arg2) 66 extern __inline__ void xc2(smpfunc_t func, unsigned long arg1, unsigned long arg2)
67 { smp_cross_call(func, arg1, arg2, 0, 0, 0); } 67 { smp_cross_call(func, arg1, arg2, 0, 0, 0); }
68 extern __inline__ void xc3(smpfunc_t func, unsigned long arg1, unsigned long arg2, 68 extern __inline__ void xc3(smpfunc_t func, unsigned long arg1, unsigned long arg2,
69 unsigned long arg3) 69 unsigned long arg3)
70 { smp_cross_call(func, arg1, arg2, arg3, 0, 0); } 70 { smp_cross_call(func, arg1, arg2, arg3, 0, 0); }
71 extern __inline__ void xc4(smpfunc_t func, unsigned long arg1, unsigned long arg2, 71 extern __inline__ void xc4(smpfunc_t func, unsigned long arg1, unsigned long arg2,
72 unsigned long arg3, unsigned long arg4) 72 unsigned long arg3, unsigned long arg4)
73 { smp_cross_call(func, arg1, arg2, arg3, arg4, 0); } 73 { smp_cross_call(func, arg1, arg2, arg3, arg4, 0); }
74 extern __inline__ void xc5(smpfunc_t func, unsigned long arg1, unsigned long arg2, 74 extern __inline__ void xc5(smpfunc_t func, unsigned long arg1, unsigned long arg2,
75 unsigned long arg3, unsigned long arg4, unsigned long arg5) 75 unsigned long arg3, unsigned long arg4, unsigned long arg5)
76 { smp_cross_call(func, arg1, arg2, arg3, arg4, arg5); } 76 { smp_cross_call(func, arg1, arg2, arg3, arg4, arg5); }
77 77
78 extern __inline__ int smp_call_function(void (*func)(void *info), void *info, int nonatomic, int wait) 78 extern __inline__ int smp_call_function(void (*func)(void *info), void *info, int nonatomic, int wait)
79 { 79 {
80 xc1((smpfunc_t)func, (unsigned long)info); 80 xc1((smpfunc_t)func, (unsigned long)info);
81 return 0; 81 return 0;
82 } 82 }
83 83
84 extern __volatile__ int __cpu_number_map[NR_CPUS]; 84 extern __volatile__ int __cpu_number_map[NR_CPUS];
85 extern __volatile__ int __cpu_logical_map[NR_CPUS]; 85 extern __volatile__ int __cpu_logical_map[NR_CPUS];
86 86
87 extern __inline__ int cpu_logical_map(int cpu) 87 extern __inline__ int cpu_logical_map(int cpu)
88 { 88 {
89 return __cpu_logical_map[cpu]; 89 return __cpu_logical_map[cpu];
90 } 90 }
91 extern __inline__ int cpu_number_map(int cpu) 91 extern __inline__ int cpu_number_map(int cpu)
92 { 92 {
93 return __cpu_number_map[cpu]; 93 return __cpu_number_map[cpu];
94 } 94 }
95 95
96 extern __inline__ int hard_smp4m_processor_id(void) 96 extern __inline__ int hard_smp4m_processor_id(void)
97 { 97 {
98 int cpuid; 98 int cpuid;
99 99
100 __asm__ __volatile__("rd %%tbr, %0\n\t" 100 __asm__ __volatile__("rd %%tbr, %0\n\t"
101 "srl %0, 12, %0\n\t" 101 "srl %0, 12, %0\n\t"
102 "and %0, 3, %0\n\t" : 102 "and %0, 3, %0\n\t" :
103 "=&r" (cpuid)); 103 "=&r" (cpuid));
104 return cpuid; 104 return cpuid;
105 } 105 }
106 106
107 extern __inline__ int hard_smp4d_processor_id(void) 107 extern __inline__ int hard_smp4d_processor_id(void)
108 { 108 {
109 int cpuid; 109 int cpuid;
110 110
111 __asm__ __volatile__("lda [%%g0] %1, %0\n\t" : 111 __asm__ __volatile__("lda [%%g0] %1, %0\n\t" :
112 "=&r" (cpuid) : "i" (ASI_M_VIKING_TMP1)); 112 "=&r" (cpuid) : "i" (ASI_M_VIKING_TMP1));
113 return cpuid; 113 return cpuid;
114 } 114 }
115 115
116 #ifndef MODULE 116 #ifndef MODULE
117 extern __inline__ int hard_smp_processor_id(void) 117 extern __inline__ int hard_smp_processor_id(void)
118 { 118 {
119 int cpuid; 119 int cpuid;
120 120
121 /* Black box - sun4m 121 /* Black box - sun4m
122 __asm__ __volatile__("rd %%tbr, %0\n\t" 122 __asm__ __volatile__("rd %%tbr, %0\n\t"
123 "srl %0, 12, %0\n\t" 123 "srl %0, 12, %0\n\t"
124 "and %0, 3, %0\n\t" : 124 "and %0, 3, %0\n\t" :
125 "=&r" (cpuid)); 125 "=&r" (cpuid));
126 - sun4d 126 - sun4d
127 __asm__ __volatile__("lda [%g0] ASI_M_VIKING_TMP1, %0\n\t" 127 __asm__ __volatile__("lda [%g0] ASI_M_VIKING_TMP1, %0\n\t"
128 "nop; nop" : 128 "nop; nop" :
129 "=&r" (cpuid)); 129 "=&r" (cpuid));
130 See btfixup.h and btfixupprep.c to understand how a blackbox works. 130 See btfixup.h and btfixupprep.c to understand how a blackbox works.
131 */ 131 */
132 __asm__ __volatile__("sethi %%hi(___b_hard_smp_processor_id), %0\n\t" 132 __asm__ __volatile__("sethi %%hi(___b_hard_smp_processor_id), %0\n\t"
133 "sethi %%hi(boot_cpu_id), %0\n\t" 133 "sethi %%hi(boot_cpu_id), %0\n\t"
134 "ldub [%0 + %%lo(boot_cpu_id)], %0\n\t" : 134 "ldub [%0 + %%lo(boot_cpu_id)], %0\n\t" :
135 "=&r" (cpuid)); 135 "=&r" (cpuid));
136 return cpuid; 136 return cpuid;
137 } 137 }
138 #else 138 #else
139 extern __inline__ int hard_smp_processor_id(void) 139 extern __inline__ int hard_smp_processor_id(void)
140 { 140 {
141 int cpuid; 141 int cpuid;
142 142
143 __asm__ __volatile__("mov %%o7, %%g1\n\t" 143 __asm__ __volatile__("mov %%o7, %%g1\n\t"
144 "call ___f___hard_smp_processor_id\n\t" 144 "call ___f___hard_smp_processor_id\n\t"
145 " nop\n\t" 145 " nop\n\t"
146 "mov %%g2, %0\n\t" : "=r"(cpuid) : : "g1", "g2"); 146 "mov %%g2, %0\n\t" : "=r"(cpuid) : : "g1", "g2");
147 return cpuid; 147 return cpuid;
148 } 148 }
149 #endif 149 #endif
150 150
151 #define smp_processor_id() (current_thread_info()->cpu) 151 #define raw_smp_processor_id() (current_thread_info()->cpu)
152 152
153 #define prof_multiplier(__cpu) cpu_data(__cpu).multiplier 153 #define prof_multiplier(__cpu) cpu_data(__cpu).multiplier
154 #define prof_counter(__cpu) cpu_data(__cpu).counter 154 #define prof_counter(__cpu) cpu_data(__cpu).counter
155 155
156 #endif /* !(__ASSEMBLY__) */ 156 #endif /* !(__ASSEMBLY__) */
157 157
158 /* Sparc specific messages. */ 158 /* Sparc specific messages. */
159 #define MSG_CROSS_CALL 0x0005 /* run func on cpus */ 159 #define MSG_CROSS_CALL 0x0005 /* run func on cpus */
160 160
161 /* Empirical PROM processor mailbox constants. If the per-cpu mailbox 161 /* Empirical PROM processor mailbox constants. If the per-cpu mailbox
162 * contains something other than one of these then the ipi is from 162 * contains something other than one of these then the ipi is from
163 * Linux's active_kernel_processor. This facility exists so that 163 * Linux's active_kernel_processor. This facility exists so that
164 * the boot monitor can capture all the other cpus when one catches 164 * the boot monitor can capture all the other cpus when one catches
165 * a watchdog reset or the user enters the monitor using L1-A keys. 165 * a watchdog reset or the user enters the monitor using L1-A keys.
166 */ 166 */
167 #define MBOX_STOPCPU 0xFB 167 #define MBOX_STOPCPU 0xFB
168 #define MBOX_IDLECPU 0xFC 168 #define MBOX_IDLECPU 0xFC
169 #define MBOX_IDLECPU2 0xFD 169 #define MBOX_IDLECPU2 0xFD
170 #define MBOX_STOPCPU2 0xFE 170 #define MBOX_STOPCPU2 0xFE
171 171
172 #endif /* SMP */ 172 #endif /* SMP */
173 173
174 #define NO_PROC_ID 0xFF 174 #define NO_PROC_ID 0xFF
175 175
176 #endif /* !(_SPARC_SMP_H) */ 176 #endif /* !(_SPARC_SMP_H) */
177 177
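On sparc32 a cross call is just a broadcast of a function pointer plus up to five unsigned long arguments, which is why smp_call_function() above reduces to xc1() with the info pointer cast. A hedged caller sketch; the handler and wrapper below are invented.

/* Illustrative only: a handler matching smpfunc_t and a one-argument broadcast. */
static void example_cross_handler(unsigned long ctx, unsigned long unused2,
                                  unsigned long unused3, unsigned long unused4,
                                  unsigned long unused5)
{
        /* per-CPU work keyed on 'ctx'; the remaining arguments are unused */
}

static void example_broadcast(unsigned long ctx)
{
        xc1(example_cross_handler, ctx);
}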
include/asm-sparc64/smp.h
1 /* smp.h: Sparc64 specific SMP stuff. 1 /* smp.h: Sparc64 specific SMP stuff.
2 * 2 *
3 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) 3 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
4 */ 4 */
5 5
6 #ifndef _SPARC64_SMP_H 6 #ifndef _SPARC64_SMP_H
7 #define _SPARC64_SMP_H 7 #define _SPARC64_SMP_H
8 8
9 #include <linux/config.h> 9 #include <linux/config.h>
10 #include <linux/threads.h> 10 #include <linux/threads.h>
11 #include <asm/asi.h> 11 #include <asm/asi.h>
12 #include <asm/starfire.h> 12 #include <asm/starfire.h>
13 #include <asm/spitfire.h> 13 #include <asm/spitfire.h>
14 14
15 #ifndef __ASSEMBLY__ 15 #ifndef __ASSEMBLY__
16 16
17 #include <linux/cpumask.h> 17 #include <linux/cpumask.h>
18 #include <linux/cache.h> 18 #include <linux/cache.h>
19 19
20 #endif /* !(__ASSEMBLY__) */ 20 #endif /* !(__ASSEMBLY__) */
21 21
22 #ifdef CONFIG_SMP 22 #ifdef CONFIG_SMP
23 23
24 #ifndef __ASSEMBLY__ 24 #ifndef __ASSEMBLY__
25 25
26 /* 26 /*
27 * Private routines/data 27 * Private routines/data
28 */ 28 */
29 29
30 #include <asm/bitops.h> 30 #include <asm/bitops.h>
31 #include <asm/atomic.h> 31 #include <asm/atomic.h>
32 32
33 extern cpumask_t phys_cpu_present_map; 33 extern cpumask_t phys_cpu_present_map;
34 #define cpu_possible_map phys_cpu_present_map 34 #define cpu_possible_map phys_cpu_present_map
35 35
36 /* 36 /*
37 * General functions that each host system must provide. 37 * General functions that each host system must provide.
38 */ 38 */
39 39
40 static __inline__ int hard_smp_processor_id(void) 40 static __inline__ int hard_smp_processor_id(void)
41 { 41 {
42 if (tlb_type == cheetah || tlb_type == cheetah_plus) { 42 if (tlb_type == cheetah || tlb_type == cheetah_plus) {
43 unsigned long cfg, ver; 43 unsigned long cfg, ver;
44 __asm__ __volatile__("rdpr %%ver, %0" : "=r" (ver)); 44 __asm__ __volatile__("rdpr %%ver, %0" : "=r" (ver));
45 if ((ver >> 32) == 0x003e0016) { 45 if ((ver >> 32) == 0x003e0016) {
46 __asm__ __volatile__("ldxa [%%g0] %1, %0" 46 __asm__ __volatile__("ldxa [%%g0] %1, %0"
47 : "=r" (cfg) 47 : "=r" (cfg)
48 : "i" (ASI_JBUS_CONFIG)); 48 : "i" (ASI_JBUS_CONFIG));
49 return ((cfg >> 17) & 0x1f); 49 return ((cfg >> 17) & 0x1f);
50 } else { 50 } else {
51 __asm__ __volatile__("ldxa [%%g0] %1, %0" 51 __asm__ __volatile__("ldxa [%%g0] %1, %0"
52 : "=r" (cfg) 52 : "=r" (cfg)
53 : "i" (ASI_SAFARI_CONFIG)); 53 : "i" (ASI_SAFARI_CONFIG));
54 return ((cfg >> 17) & 0x3ff); 54 return ((cfg >> 17) & 0x3ff);
55 } 55 }
56 } else if (this_is_starfire != 0) { 56 } else if (this_is_starfire != 0) {
57 return starfire_hard_smp_processor_id(); 57 return starfire_hard_smp_processor_id();
58 } else { 58 } else {
59 unsigned long upaconfig; 59 unsigned long upaconfig;
60 __asm__ __volatile__("ldxa [%%g0] %1, %0" 60 __asm__ __volatile__("ldxa [%%g0] %1, %0"
61 : "=r" (upaconfig) 61 : "=r" (upaconfig)
62 : "i" (ASI_UPA_CONFIG)); 62 : "i" (ASI_UPA_CONFIG));
63 return ((upaconfig >> 17) & 0x1f); 63 return ((upaconfig >> 17) & 0x1f);
64 } 64 }
65 } 65 }
66 66
67 #define smp_processor_id() (current_thread_info()->cpu) 67 #define raw_smp_processor_id() (current_thread_info()->cpu)
68 68
69 #endif /* !(__ASSEMBLY__) */ 69 #endif /* !(__ASSEMBLY__) */
70 70
71 #endif /* !(CONFIG_SMP) */ 71 #endif /* !(CONFIG_SMP) */
72 72
73 #define NO_PROC_ID 0xFF 73 #define NO_PROC_ID 0xFF
74 74
75 #endif /* !(_SPARC64_SMP_H) */ 75 #endif /* !(_SPARC64_SMP_H) */
76 76
include/asm-um/smp.h
1 #ifndef __UM_SMP_H 1 #ifndef __UM_SMP_H
2 #define __UM_SMP_H 2 #define __UM_SMP_H
3 3
4 #ifdef CONFIG_SMP 4 #ifdef CONFIG_SMP
5 5
6 #include "linux/config.h" 6 #include "linux/config.h"
7 #include "linux/bitops.h" 7 #include "linux/bitops.h"
8 #include "asm/current.h" 8 #include "asm/current.h"
9 #include "linux/cpumask.h" 9 #include "linux/cpumask.h"
10 10
11 #define smp_processor_id() (current_thread->cpu) 11 #define raw_smp_processor_id() (current_thread->cpu)
12
12 #define cpu_logical_map(n) (n) 13 #define cpu_logical_map(n) (n)
13 #define cpu_number_map(n) (n) 14 #define cpu_number_map(n) (n)
14 #define PROC_CHANGE_PENALTY 15 /* Pick a number, any number */ 15 #define PROC_CHANGE_PENALTY 15 /* Pick a number, any number */
15 extern int hard_smp_processor_id(void); 16 extern int hard_smp_processor_id(void);
16 #define NO_PROC_ID -1 17 #define NO_PROC_ID -1
17 18
18 extern int ncpus; 19 extern int ncpus;
19 20
20 21
21 extern inline void smp_cpus_done(unsigned int maxcpus) 22 extern inline void smp_cpus_done(unsigned int maxcpus)
22 { 23 {
23 } 24 }
24 25
25 #endif 26 #endif
26 27
27 #endif 28 #endif
28 29
include/asm-x86_64/smp.h
1 #ifndef __ASM_SMP_H 1 #ifndef __ASM_SMP_H
2 #define __ASM_SMP_H 2 #define __ASM_SMP_H
3 3
4 /* 4 /*
5 * We need the APIC definitions automatically as part of 'smp.h' 5 * We need the APIC definitions automatically as part of 'smp.h'
6 */ 6 */
7 #ifndef __ASSEMBLY__ 7 #ifndef __ASSEMBLY__
8 #include <linux/config.h> 8 #include <linux/config.h>
9 #include <linux/threads.h> 9 #include <linux/threads.h>
10 #include <linux/cpumask.h> 10 #include <linux/cpumask.h>
11 #include <linux/bitops.h> 11 #include <linux/bitops.h>
12 extern int disable_apic; 12 extern int disable_apic;
13 #endif 13 #endif
14 14
15 #ifdef CONFIG_X86_LOCAL_APIC 15 #ifdef CONFIG_X86_LOCAL_APIC
16 #ifndef __ASSEMBLY__ 16 #ifndef __ASSEMBLY__
17 #include <asm/fixmap.h> 17 #include <asm/fixmap.h>
18 #include <asm/mpspec.h> 18 #include <asm/mpspec.h>
19 #ifdef CONFIG_X86_IO_APIC 19 #ifdef CONFIG_X86_IO_APIC
20 #include <asm/io_apic.h> 20 #include <asm/io_apic.h>
21 #endif 21 #endif
22 #include <asm/apic.h> 22 #include <asm/apic.h>
23 #include <asm/thread_info.h> 23 #include <asm/thread_info.h>
24 #endif 24 #endif
25 #endif 25 #endif
26 26
27 #ifdef CONFIG_SMP 27 #ifdef CONFIG_SMP
28 #ifndef ASSEMBLY 28 #ifndef ASSEMBLY
29 29
30 #include <asm/pda.h> 30 #include <asm/pda.h>
31 31
32 struct pt_regs; 32 struct pt_regs;
33 33
34 extern cpumask_t cpu_present_mask; 34 extern cpumask_t cpu_present_mask;
35 extern cpumask_t cpu_possible_map; 35 extern cpumask_t cpu_possible_map;
36 extern cpumask_t cpu_online_map; 36 extern cpumask_t cpu_online_map;
37 extern cpumask_t cpu_callout_map; 37 extern cpumask_t cpu_callout_map;
38 38
39 /* 39 /*
40 * Private routines/data 40 * Private routines/data
41 */ 41 */
42 42
43 extern void smp_alloc_memory(void); 43 extern void smp_alloc_memory(void);
44 extern volatile unsigned long smp_invalidate_needed; 44 extern volatile unsigned long smp_invalidate_needed;
45 extern int pic_mode; 45 extern int pic_mode;
46 extern int smp_num_siblings; 46 extern int smp_num_siblings;
47 extern void smp_flush_tlb(void); 47 extern void smp_flush_tlb(void);
48 extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs); 48 extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs);
49 extern void smp_send_reschedule(int cpu); 49 extern void smp_send_reschedule(int cpu);
50 extern void smp_invalidate_rcv(void); /* Process an NMI */ 50 extern void smp_invalidate_rcv(void); /* Process an NMI */
51 extern void zap_low_mappings(void); 51 extern void zap_low_mappings(void);
52 void smp_stop_cpu(void); 52 void smp_stop_cpu(void);
53 extern cpumask_t cpu_sibling_map[NR_CPUS]; 53 extern cpumask_t cpu_sibling_map[NR_CPUS];
54 extern cpumask_t cpu_core_map[NR_CPUS]; 54 extern cpumask_t cpu_core_map[NR_CPUS];
55 extern u8 phys_proc_id[NR_CPUS]; 55 extern u8 phys_proc_id[NR_CPUS];
56 extern u8 cpu_core_id[NR_CPUS]; 56 extern u8 cpu_core_id[NR_CPUS];
57 57
58 #define SMP_TRAMPOLINE_BASE 0x6000 58 #define SMP_TRAMPOLINE_BASE 0x6000
59 59
60 /* 60 /*
61 * On x86 all CPUs are mapped 1:1 to the APIC space. 61 * On x86 all CPUs are mapped 1:1 to the APIC space.
62 * This simplifies scheduling and IPI sending and 62 * This simplifies scheduling and IPI sending and
63 * compresses data structures. 63 * compresses data structures.
64 */ 64 */
65 65
66 static inline int num_booting_cpus(void) 66 static inline int num_booting_cpus(void)
67 { 67 {
68 return cpus_weight(cpu_callout_map); 68 return cpus_weight(cpu_callout_map);
69 } 69 }
70 70
71 #define __smp_processor_id() read_pda(cpunumber) 71 #define raw_smp_processor_id() read_pda(cpunumber)
72 72
73 extern __inline int hard_smp_processor_id(void) 73 extern __inline int hard_smp_processor_id(void)
74 { 74 {
75 /* we don't want to mark this access volatile - bad code generation */ 75 /* we don't want to mark this access volatile - bad code generation */
76 return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID)); 76 return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
77 } 77 }
78 78
79 extern int safe_smp_processor_id(void); 79 extern int safe_smp_processor_id(void);
80 80
81 #endif /* !ASSEMBLY */ 81 #endif /* !ASSEMBLY */
82 82
83 #define NO_PROC_ID 0xFF /* No processor magic marker */ 83 #define NO_PROC_ID 0xFF /* No processor magic marker */
84 84
85 #endif 85 #endif
86 86
87 #ifndef ASSEMBLY 87 #ifndef ASSEMBLY
88 /* 88 /*
89 * Some lowlevel functions might want to know about 89 * Some lowlevel functions might want to know about
90 * the real APIC ID <-> CPU # mapping. 90 * the real APIC ID <-> CPU # mapping.
91 */ 91 */
92 extern u8 x86_cpu_to_apicid[NR_CPUS]; /* physical ID */ 92 extern u8 x86_cpu_to_apicid[NR_CPUS]; /* physical ID */
93 extern u8 x86_cpu_to_log_apicid[NR_CPUS]; 93 extern u8 x86_cpu_to_log_apicid[NR_CPUS];
94 extern u8 bios_cpu_apicid[]; 94 extern u8 bios_cpu_apicid[];
95 95
96 static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) 96 static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
97 { 97 {
98 return cpus_addr(cpumask)[0]; 98 return cpus_addr(cpumask)[0];
99 } 99 }
100 100
101 static inline int cpu_present_to_apicid(int mps_cpu) 101 static inline int cpu_present_to_apicid(int mps_cpu)
102 { 102 {
103 if (mps_cpu < NR_CPUS) 103 if (mps_cpu < NR_CPUS)
104 return (int)bios_cpu_apicid[mps_cpu]; 104 return (int)bios_cpu_apicid[mps_cpu];
105 else 105 else
106 return BAD_APICID; 106 return BAD_APICID;
107 } 107 }
108 108
109 #endif /* !ASSEMBLY */ 109 #endif /* !ASSEMBLY */
110 110
111 #ifndef CONFIG_SMP 111 #ifndef CONFIG_SMP
112 #define stack_smp_processor_id() 0 112 #define stack_smp_processor_id() 0
113 #define safe_smp_processor_id() 0 113 #define safe_smp_processor_id() 0
114 #define cpu_logical_map(x) (x) 114 #define cpu_logical_map(x) (x)
115 #else 115 #else
116 #include <asm/thread_info.h> 116 #include <asm/thread_info.h>
117 #define stack_smp_processor_id() \ 117 #define stack_smp_processor_id() \
118 ({ \ 118 ({ \
119 struct thread_info *ti; \ 119 struct thread_info *ti; \
120 __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \ 120 __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
121 ti->cpu; \ 121 ti->cpu; \
122 }) 122 })
123 #endif 123 #endif
124 124
125 #ifndef __ASSEMBLY__ 125 #ifndef __ASSEMBLY__
126 static __inline int logical_smp_processor_id(void) 126 static __inline int logical_smp_processor_id(void)
127 { 127 {
128 /* we don't want to mark this access volatile - bad code generation */ 128 /* we don't want to mark this access volatile - bad code generation */
129 return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR)); 129 return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
130 } 130 }
131 #endif 131 #endif
132 132
133 #endif 133 #endif
134 134
135 135
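stack_smp_processor_id() above recovers struct thread_info by masking %rsp, relying on thread_info sitting at the base of a THREAD_SIZE-aligned kernel stack. The standalone C below only illustrates that masking arithmetic; the stack size, the addresses and the EXAMPLE_* names are assumptions for the sketch, not the kernel's constants.

#include <stdio.h>
#include <stdint.h>

#define EXAMPLE_THREAD_SIZE     (8 * 1024UL)            /* assumed stack size */
#define EXAMPLE_CURRENT_MASK    (~(EXAMPLE_THREAD_SIZE - 1))

int main(void)
{
        uint64_t stack_base = 0xffff810012340000ULL;    /* made-up, THREAD_SIZE-aligned */
        uint64_t rsp        = stack_base + 0x1a38;      /* some address on that stack */
        uint64_t masked     = rsp & EXAMPLE_CURRENT_MASK;

        printf("rsp %#llx masks back to %#llx (base match: %d)\n",
               (unsigned long long)rsp, (unsigned long long)masked,
               masked == stack_base);
        return 0;
}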
include/linux/mmzone.h
1 #ifndef _LINUX_MMZONE_H 1 #ifndef _LINUX_MMZONE_H
2 #define _LINUX_MMZONE_H 2 #define _LINUX_MMZONE_H
3 3
4 #ifdef __KERNEL__ 4 #ifdef __KERNEL__
5 #ifndef __ASSEMBLY__ 5 #ifndef __ASSEMBLY__
6 6
7 #include <linux/config.h> 7 #include <linux/config.h>
8 #include <linux/spinlock.h> 8 #include <linux/spinlock.h>
9 #include <linux/list.h> 9 #include <linux/list.h>
10 #include <linux/wait.h> 10 #include <linux/wait.h>
11 #include <linux/cache.h> 11 #include <linux/cache.h>
12 #include <linux/threads.h> 12 #include <linux/threads.h>
13 #include <linux/numa.h> 13 #include <linux/numa.h>
14 #include <linux/init.h> 14 #include <linux/init.h>
15 #include <asm/atomic.h> 15 #include <asm/atomic.h>
16 16
17 /* Free memory management - zoned buddy allocator. */ 17 /* Free memory management - zoned buddy allocator. */
18 #ifndef CONFIG_FORCE_MAX_ZONEORDER 18 #ifndef CONFIG_FORCE_MAX_ZONEORDER
19 #define MAX_ORDER 11 19 #define MAX_ORDER 11
20 #else 20 #else
21 #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER 21 #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
22 #endif 22 #endif
23 23
24 struct free_area { 24 struct free_area {
25 struct list_head free_list; 25 struct list_head free_list;
26 unsigned long nr_free; 26 unsigned long nr_free;
27 }; 27 };
28 28
29 struct pglist_data; 29 struct pglist_data;
30 30
31 /* 31 /*
32 * zone->lock and zone->lru_lock are two of the hottest locks in the kernel. 32 * zone->lock and zone->lru_lock are two of the hottest locks in the kernel.
33 * So add a wild amount of padding here to ensure that they fall into separate 33 * So add a wild amount of padding here to ensure that they fall into separate
34 * cachelines. There are very few zone structures in the machine, so space 34 * cachelines. There are very few zone structures in the machine, so space
35 * consumption is not a concern here. 35 * consumption is not a concern here.
36 */ 36 */
37 #if defined(CONFIG_SMP) 37 #if defined(CONFIG_SMP)
38 struct zone_padding { 38 struct zone_padding {
39 char x[0]; 39 char x[0];
40 } ____cacheline_maxaligned_in_smp; 40 } ____cacheline_maxaligned_in_smp;
41 #define ZONE_PADDING(name) struct zone_padding name; 41 #define ZONE_PADDING(name) struct zone_padding name;
42 #else 42 #else
43 #define ZONE_PADDING(name) 43 #define ZONE_PADDING(name)
44 #endif 44 #endif
45 45
46 struct per_cpu_pages { 46 struct per_cpu_pages {
47 int count; /* number of pages in the list */ 47 int count; /* number of pages in the list */
48 int low; /* low watermark, refill needed */ 48 int low; /* low watermark, refill needed */
49 int high; /* high watermark, emptying needed */ 49 int high; /* high watermark, emptying needed */
50 int batch; /* chunk size for buddy add/remove */ 50 int batch; /* chunk size for buddy add/remove */
51 struct list_head list; /* the list of pages */ 51 struct list_head list; /* the list of pages */
52 }; 52 };
53 53
54 struct per_cpu_pageset { 54 struct per_cpu_pageset {
55 struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */ 55 struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */
56 #ifdef CONFIG_NUMA 56 #ifdef CONFIG_NUMA
57 unsigned long numa_hit; /* allocated in intended node */ 57 unsigned long numa_hit; /* allocated in intended node */
58 unsigned long numa_miss; /* allocated in non intended node */ 58 unsigned long numa_miss; /* allocated in non intended node */
59 unsigned long numa_foreign; /* was intended here, hit elsewhere */ 59 unsigned long numa_foreign; /* was intended here, hit elsewhere */
60 unsigned long interleave_hit; /* interleaver preferred this zone */ 60 unsigned long interleave_hit; /* interleaver preferred this zone */

61 unsigned long local_node; /* allocation from local node */ 61 unsigned long local_node; /* allocation from local node */
62 unsigned long other_node; /* allocation from other node */ 62 unsigned long other_node; /* allocation from other node */
63 #endif 63 #endif
64 } ____cacheline_aligned_in_smp; 64 } ____cacheline_aligned_in_smp;
65 65
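The count/low/high/batch fields above describe a simple hysteresis for the per-CPU page lists. Below is a hedged sketch of the policy those comments imply; the helper names are invented and the real logic, including the actual buddy-allocator calls, lives in mm/page_alloc.c.

/* Illustrative refill/drain policy only. */
static int example_needs_refill(struct per_cpu_pages *pcp)
{
        return pcp->count < pcp->low;   /* pull pcp->batch pages from the buddy lists */
}

static int example_needs_drain(struct per_cpu_pages *pcp)
{
        return pcp->count > pcp->high;  /* push pcp->batch pages back to the buddy lists */
}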
66 #define ZONE_DMA 0 66 #define ZONE_DMA 0
67 #define ZONE_NORMAL 1 67 #define ZONE_NORMAL 1
68 #define ZONE_HIGHMEM 2 68 #define ZONE_HIGHMEM 2
69 69
70 #define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT */ 70 #define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT */
71 #define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */ 71 #define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */
72 72
73 73
74 /* 74 /*
75 * When a memory allocation must conform to specific limitations (such 75 * When a memory allocation must conform to specific limitations (such
76 * as being suitable for DMA) the caller will pass in hints to the 76 * as being suitable for DMA) the caller will pass in hints to the
77 * allocator in the gfp_mask, in the zone modifier bits. These bits 77 * allocator in the gfp_mask, in the zone modifier bits. These bits
78 * are used to select a priority ordered list of memory zones which 78 * are used to select a priority ordered list of memory zones which
79 * match the requested limits. GFP_ZONEMASK defines which bits within 79 * match the requested limits. GFP_ZONEMASK defines which bits within
80 * the gfp_mask should be considered as zone modifiers. Each valid 80 * the gfp_mask should be considered as zone modifiers. Each valid
81 * combination of the zone modifier bits has a corresponding list 81 * combination of the zone modifier bits has a corresponding list
82 * of zones (in node_zonelists). Thus for two zone modifiers there 82 * of zones (in node_zonelists). Thus for two zone modifiers there
83 * will be a maximum of 4 (2 ** 2) zonelists, for 3 modifiers there will 83 * will be a maximum of 4 (2 ** 2) zonelists, for 3 modifiers there will
84 * be 8 (2 ** 3) zonelists. GFP_ZONETYPES defines the number of possible 84 * be 8 (2 ** 3) zonelists. GFP_ZONETYPES defines the number of possible
85 * combinations of zone modifiers in "zone modifier space". 85 * combinations of zone modifiers in "zone modifier space".
86 */ 86 */
87 #define GFP_ZONEMASK 0x03 87 #define GFP_ZONEMASK 0x03
88 /* 88 /*
89 * As an optimisation any zone modifier bits which are only valid when 89 * As an optimisation any zone modifier bits which are only valid when
90 * no other zone modifier bits are set (loners) should be placed in 90 * no other zone modifier bits are set (loners) should be placed in
91 * the highest order bits of this field. This allows us to reduce the 91 * the highest order bits of this field. This allows us to reduce the
92 * extent of the zonelists thus saving space. For example in the case 92 * extent of the zonelists thus saving space. For example in the case
93 * of three zone modifier bits, we could require up to eight zonelists. 93 * of three zone modifier bits, we could require up to eight zonelists.
94 * If the left most zone modifier is a "loner" then the highest valid 94 * If the left most zone modifier is a "loner" then the highest valid
95 * zonelist would be four allowing us to allocate only five zonelists. 95 * zonelist would be four allowing us to allocate only five zonelists.
96 * Use the first form when the left most bit is not a "loner", otherwise 96 * Use the first form when the left most bit is not a "loner", otherwise
97 * use the second. 97 * use the second.
98 */ 98 */
99 /* #define GFP_ZONETYPES (GFP_ZONEMASK + 1) */ /* Non-loner */ 99 /* #define GFP_ZONETYPES (GFP_ZONEMASK + 1) */ /* Non-loner */
100 #define GFP_ZONETYPES ((GFP_ZONEMASK + 1) / 2 + 1) /* Loner */ 100 #define GFP_ZONETYPES ((GFP_ZONEMASK + 1) / 2 + 1) /* Loner */
101 101
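As a quick worked check of the comment above: with the current two zone modifier bits (GFP_ZONEMASK = 0x03) the non-loner form would give 4 zonelists and the loner form gives (4 / 2) + 1 = 3; for the three-bit case mentioned in the comment the same formulas give 8 and 5. The standalone snippet below just prints both; the EX_ prefix marks it as an illustration, not kernel code.

#include <stdio.h>

#define EX_GFP_ZONEMASK 0x03    /* two zone modifier bits, as above */

int main(void)
{
        int non_loner = EX_GFP_ZONEMASK + 1;            /* 2^2 = 4 zonelists */
        int loner     = (EX_GFP_ZONEMASK + 1) / 2 + 1;  /* 4/2 + 1 = 3 zonelists */

        printf("non-loner form: %d, loner form: %d\n", non_loner, loner);
        return 0;
}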
102 /* 102 /*
103 * On machines where it is needed (eg PCs) we divide physical memory 103 * On machines where it is needed (eg PCs) we divide physical memory
104 * into multiple physical zones. On a PC we have 3 zones: 104 * into multiple physical zones. On a PC we have 3 zones:
105 * 105 *
106 * ZONE_DMA < 16 MB ISA DMA capable memory 106 * ZONE_DMA < 16 MB ISA DMA capable memory
107 * ZONE_NORMAL 16-896 MB direct mapped by the kernel 107 * ZONE_NORMAL 16-896 MB direct mapped by the kernel
108 * ZONE_HIGHMEM > 896 MB only page cache and user processes 108 * ZONE_HIGHMEM > 896 MB only page cache and user processes
109 */ 109 */
110 110
111 struct zone { 111 struct zone {
112 /* Fields commonly accessed by the page allocator */ 112 /* Fields commonly accessed by the page allocator */
113 unsigned long free_pages; 113 unsigned long free_pages;
114 unsigned long pages_min, pages_low, pages_high; 114 unsigned long pages_min, pages_low, pages_high;
115 /* 115 /*
116 * We don't know if the memory that we're going to allocate will be freeable 116 * We don't know if the memory that we're going to allocate will be freeable
 117 * and/or released eventually, so to avoid totally wasting several 117 * and/or released eventually, so to avoid totally wasting several
 118 * GB of RAM we must reserve some of the lower zone memory (otherwise we risk 118 * GB of RAM we must reserve some of the lower zone memory (otherwise we risk
 119 * running OOM on the lower zones despite tons of freeable RAM remaining 119 * running OOM on the lower zones despite tons of freeable RAM remaining
120 * on the higher zones). This array is recalculated at runtime if the 120 * on the higher zones). This array is recalculated at runtime if the
121 * sysctl_lowmem_reserve_ratio sysctl changes. 121 * sysctl_lowmem_reserve_ratio sysctl changes.
122 */ 122 */
123 unsigned long lowmem_reserve[MAX_NR_ZONES]; 123 unsigned long lowmem_reserve[MAX_NR_ZONES];
124 124
125 struct per_cpu_pageset pageset[NR_CPUS]; 125 struct per_cpu_pageset pageset[NR_CPUS];
126 126
127 /* 127 /*
128 * free areas of different sizes 128 * free areas of different sizes
129 */ 129 */
130 spinlock_t lock; 130 spinlock_t lock;
131 struct free_area free_area[MAX_ORDER]; 131 struct free_area free_area[MAX_ORDER];
132 132
133 133
134 ZONE_PADDING(_pad1_) 134 ZONE_PADDING(_pad1_)
135 135
136 /* Fields commonly accessed by the page reclaim scanner */ 136 /* Fields commonly accessed by the page reclaim scanner */
137 spinlock_t lru_lock; 137 spinlock_t lru_lock;
138 struct list_head active_list; 138 struct list_head active_list;
139 struct list_head inactive_list; 139 struct list_head inactive_list;
140 unsigned long nr_scan_active; 140 unsigned long nr_scan_active;
141 unsigned long nr_scan_inactive; 141 unsigned long nr_scan_inactive;
142 unsigned long nr_active; 142 unsigned long nr_active;
143 unsigned long nr_inactive; 143 unsigned long nr_inactive;
144 unsigned long pages_scanned; /* since last reclaim */ 144 unsigned long pages_scanned; /* since last reclaim */
145 int all_unreclaimable; /* All pages pinned */ 145 int all_unreclaimable; /* All pages pinned */
146 146
147 /* 147 /*
148 * prev_priority holds the scanning priority for this zone. It is 148 * prev_priority holds the scanning priority for this zone. It is
149 * defined as the scanning priority at which we achieved our reclaim 149 * defined as the scanning priority at which we achieved our reclaim
150 * target at the previous try_to_free_pages() or balance_pgdat() 150 * target at the previous try_to_free_pages() or balance_pgdat()
 151 * invocation. 151 * invocation.
152 * 152 *
153 * We use prev_priority as a measure of how much stress page reclaim is 153 * We use prev_priority as a measure of how much stress page reclaim is
154 * under - it drives the swappiness decision: whether to unmap mapped 154 * under - it drives the swappiness decision: whether to unmap mapped
155 * pages. 155 * pages.
156 * 156 *
157 * temp_priority is used to remember the scanning priority at which 157 * temp_priority is used to remember the scanning priority at which
158 * this zone was successfully refilled to free_pages == pages_high. 158 * this zone was successfully refilled to free_pages == pages_high.
159 * 159 *
160 * Access to both these fields is quite racy even on uniprocessor. But 160 * Access to both these fields is quite racy even on uniprocessor. But
161 * it is expected to average out OK. 161 * it is expected to average out OK.
162 */ 162 */
163 int temp_priority; 163 int temp_priority;
164 int prev_priority; 164 int prev_priority;
165 165
166 166
167 ZONE_PADDING(_pad2_) 167 ZONE_PADDING(_pad2_)
168 /* Rarely used or read-mostly fields */ 168 /* Rarely used or read-mostly fields */
169 169
170 /* 170 /*
171 * wait_table -- the array holding the hash table 171 * wait_table -- the array holding the hash table
172 * wait_table_size -- the size of the hash table array 172 * wait_table_size -- the size of the hash table array
173 * wait_table_bits -- wait_table_size == (1 << wait_table_bits) 173 * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
174 * 174 *
175 * The purpose of all these is to keep track of the people 175 * The purpose of all these is to keep track of the people
176 * waiting for a page to become available and make them 176 * waiting for a page to become available and make them
177 * runnable again when possible. The trouble is that this 177 * runnable again when possible. The trouble is that this
178 * consumes a lot of space, especially when so few things 178 * consumes a lot of space, especially when so few things
179 * wait on pages at a given time. So instead of using 179 * wait on pages at a given time. So instead of using
180 * per-page waitqueues, we use a waitqueue hash table. 180 * per-page waitqueues, we use a waitqueue hash table.
181 * 181 *
182 * The bucket discipline is to sleep on the same queue when 182 * The bucket discipline is to sleep on the same queue when
183 * colliding and wake all in that wait queue when removing. 183 * colliding and wake all in that wait queue when removing.
184 * When something wakes, it must check to be sure its page is 184 * When something wakes, it must check to be sure its page is
185 * truly available, a la thundering herd. The cost of a 185 * truly available, a la thundering herd. The cost of a
186 * collision is great, but given the expected load of the 186 * collision is great, but given the expected load of the
187 * table, they should be so rare as to be outweighed by the 187 * table, they should be so rare as to be outweighed by the
188 * benefits from the saved space. 188 * benefits from the saved space.
189 * 189 *
190 * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the 190 * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
191 * primary users of these fields, and in mm/page_alloc.c 191 * primary users of these fields, and in mm/page_alloc.c
192 * free_area_init_core() performs the initialization of them. 192 * free_area_init_core() performs the initialization of them.
193 */ 193 */
194 wait_queue_head_t * wait_table; 194 wait_queue_head_t * wait_table;
195 unsigned long wait_table_size; 195 unsigned long wait_table_size;
196 unsigned long wait_table_bits; 196 unsigned long wait_table_bits;
197 197
198 /* 198 /*
199 * Discontig memory support fields. 199 * Discontig memory support fields.
200 */ 200 */
201 struct pglist_data *zone_pgdat; 201 struct pglist_data *zone_pgdat;
202 struct page *zone_mem_map; 202 struct page *zone_mem_map;
203 /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ 203 /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
204 unsigned long zone_start_pfn; 204 unsigned long zone_start_pfn;
205 205
206 unsigned long spanned_pages; /* total size, including holes */ 206 unsigned long spanned_pages; /* total size, including holes */
207 unsigned long present_pages; /* amount of memory (excluding holes) */ 207 unsigned long present_pages; /* amount of memory (excluding holes) */
208 208
209 /* 209 /*
210 * rarely used fields: 210 * rarely used fields:
211 */ 211 */
212 char *name; 212 char *name;
213 } ____cacheline_maxaligned_in_smp; 213 } ____cacheline_maxaligned_in_smp;
214 214
215 215
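For illustration of the hashed wait table described in the struct zone comments above (wait_table, wait_table_size, wait_table_bits): the real lookup lives in mm/filemap.c, but the idea is simply to hash the page to one of the zone's buckets so that colliding pages share a queue. The sketch below is not the kernel's code; the function name and the multiplicative hash are stand-ins.

```c
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/wait.h>

/*
 * Illustration only: map a page to one of the zone's wait_table buckets.
 * wait_table_size is a power of two (1 << wait_table_bits), so masking
 * with size - 1 selects a bucket; colliding pages simply share it.
 */
static wait_queue_head_t *example_page_waitqueue(struct zone *zone,
						 struct page *page)
{
	unsigned long hash = (unsigned long)page * 2654435761UL;

	return &zone->wait_table[hash & (zone->wait_table_size - 1)];
}
```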
216 /* 216 /*
217 * The "priority" of VM scanning is how much of the queues we will scan in one 217 * The "priority" of VM scanning is how much of the queues we will scan in one
218 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the 218 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
219 * queues ("queue_length >> 12") during an aging round. 219 * queues ("queue_length >> 12") during an aging round.
220 */ 220 */
221 #define DEF_PRIORITY 12 221 #define DEF_PRIORITY 12
222 222
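A quick arithmetic illustration of the DEF_PRIORITY comment above: at priority 12 a queue of 4096 pages yields a scan target of one page, and each step down in priority doubles the target until priority 0 scans the whole queue. The helper below is made up for the example, not a kernel function.

```c
/*
 * Illustration only: number of pages the reclaim scanner would consider
 * from a queue of 'queue_length' pages at a given priority level.
 * queue_length >> DEF_PRIORITY == queue_length / 4096.
 */
static inline unsigned long example_scan_target(unsigned long queue_length,
						int priority)
{
	return queue_length >> priority;
}
```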
223 /* 223 /*
224 * One allocation request operates on a zonelist. A zonelist 224 * One allocation request operates on a zonelist. A zonelist
225 * is a list of zones, the first one is the 'goal' of the 225 * is a list of zones, the first one is the 'goal' of the
226 * allocation, the other zones are fallback zones, in decreasing 226 * allocation, the other zones are fallback zones, in decreasing
227 * priority. 227 * priority.
228 * 228 *
229 * Right now a zonelist takes up less than a cacheline. We never 229 * Right now a zonelist takes up less than a cacheline. We never
230 * modify it apart from boot-up, and only a few indices are used, 230 * modify it apart from boot-up, and only a few indices are used,
231 * so despite the zonelist table being relatively big, the cache 231 * so despite the zonelist table being relatively big, the cache
232 * footprint of this construct is very small. 232 * footprint of this construct is very small.
233 */ 233 */
234 struct zonelist { 234 struct zonelist {
235 struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited 235 struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited
236 }; 236 };
237 237
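Because the zones[] array is NULL-terminated, a caller just walks it in fallback order until it hits the sentinel. The helper below is a simplified sketch of that walk (the real allocator loop is in mm/page_alloc.c; the function name is invented).

```c
#include <linux/mmzone.h>

/*
 * Illustration only: return the first zone in the zonelist that still has
 * free pages, or NULL if every fallback zone is exhausted.
 */
static struct zone *example_first_usable_zone(struct zonelist *zonelist)
{
	int i;

	for (i = 0; zonelist->zones[i] != NULL; i++)
		if (zonelist->zones[i]->free_pages)
			return zonelist->zones[i];
	return NULL;
}
```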
238 238
239 /* 239 /*
240 * The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM 240 * The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM
241 * (mostly NUMA machines?) to denote a higher-level memory zone than the 241 * (mostly NUMA machines?) to denote a higher-level memory zone than the
242 * zone denotes. 242 * zone denotes.
243 * 243 *
244 * On NUMA machines, each NUMA node would have a pg_data_t to describe 244 * On NUMA machines, each NUMA node would have a pg_data_t to describe
245 * its memory layout. 245 * its memory layout.
246 * 246 *
247 * Memory statistics and page replacement data structures are maintained on a 247 * Memory statistics and page replacement data structures are maintained on a
248 * per-zone basis. 248 * per-zone basis.
249 */ 249 */
250 struct bootmem_data; 250 struct bootmem_data;
251 typedef struct pglist_data { 251 typedef struct pglist_data {
252 struct zone node_zones[MAX_NR_ZONES]; 252 struct zone node_zones[MAX_NR_ZONES];
253 struct zonelist node_zonelists[GFP_ZONETYPES]; 253 struct zonelist node_zonelists[GFP_ZONETYPES];
254 int nr_zones; 254 int nr_zones;
255 struct page *node_mem_map; 255 struct page *node_mem_map;
256 struct bootmem_data *bdata; 256 struct bootmem_data *bdata;
257 unsigned long node_start_pfn; 257 unsigned long node_start_pfn;
258 unsigned long node_present_pages; /* total number of physical pages */ 258 unsigned long node_present_pages; /* total number of physical pages */
259 unsigned long node_spanned_pages; /* total size of physical page 259 unsigned long node_spanned_pages; /* total size of physical page
260 range, including holes */ 260 range, including holes */
261 int node_id; 261 int node_id;
262 struct pglist_data *pgdat_next; 262 struct pglist_data *pgdat_next;
263 wait_queue_head_t kswapd_wait; 263 wait_queue_head_t kswapd_wait;
264 struct task_struct *kswapd; 264 struct task_struct *kswapd;
265 int kswapd_max_order; 265 int kswapd_max_order;
266 } pg_data_t; 266 } pg_data_t;
267 267
268 #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) 268 #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
269 #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) 269 #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages)
270 270
271 extern struct pglist_data *pgdat_list; 271 extern struct pglist_data *pgdat_list;
272 272
273 void __get_zone_counts(unsigned long *active, unsigned long *inactive, 273 void __get_zone_counts(unsigned long *active, unsigned long *inactive,
274 unsigned long *free, struct pglist_data *pgdat); 274 unsigned long *free, struct pglist_data *pgdat);
275 void get_zone_counts(unsigned long *active, unsigned long *inactive, 275 void get_zone_counts(unsigned long *active, unsigned long *inactive,
276 unsigned long *free); 276 unsigned long *free);
277 void build_all_zonelists(void); 277 void build_all_zonelists(void);
278 void wakeup_kswapd(struct zone *zone, int order); 278 void wakeup_kswapd(struct zone *zone, int order);
279 int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 279 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
280 int alloc_type, int can_try_harder, int gfp_high); 280 int alloc_type, int can_try_harder, int gfp_high);
281 281
282 #ifdef CONFIG_HAVE_MEMORY_PRESENT 282 #ifdef CONFIG_HAVE_MEMORY_PRESENT
283 void memory_present(int nid, unsigned long start, unsigned long end); 283 void memory_present(int nid, unsigned long start, unsigned long end);
284 #else 284 #else
285 static inline void memory_present(int nid, unsigned long start, unsigned long end) {} 285 static inline void memory_present(int nid, unsigned long start, unsigned long end) {}
286 #endif 286 #endif
287 287
288 #ifdef CONFIG_NEED_NODE_MEMMAP_SIZE 288 #ifdef CONFIG_NEED_NODE_MEMMAP_SIZE
289 unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); 289 unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long);
290 #endif 290 #endif
291 291
292 /* 292 /*
293 * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. 293 * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
294 */ 294 */
295 #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) 295 #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
296 296
297 /** 297 /**
298 * for_each_pgdat - helper macro to iterate over all nodes 298 * for_each_pgdat - helper macro to iterate over all nodes
299 * @pgdat - pointer to a pg_data_t variable 299 * @pgdat - pointer to a pg_data_t variable
300 * 300 *
301 * Meant to help with common loops of the form 301 * Meant to help with common loops of the form
302 * pgdat = pgdat_list; 302 * pgdat = pgdat_list;
303 * while(pgdat) { 303 * while(pgdat) {
304 * ... 304 * ...
305 * pgdat = pgdat->pgdat_next; 305 * pgdat = pgdat->pgdat_next;
306 * } 306 * }
307 */ 307 */
308 #define for_each_pgdat(pgdat) \ 308 #define for_each_pgdat(pgdat) \
309 for (pgdat = pgdat_list; pgdat; pgdat = pgdat->pgdat_next) 309 for (pgdat = pgdat_list; pgdat; pgdat = pgdat->pgdat_next)
310 310
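As a usage sketch of for_each_pgdat(): summing the per-node page counts looks like the helper below (the function name is made up for the example).

```c
#include <linux/mmzone.h>

/*
 * Illustration only: total up node_present_pages over every node by
 * walking the pgdat list with for_each_pgdat().
 */
static unsigned long example_total_present_pages(void)
{
	struct pglist_data *pgdat;
	unsigned long total = 0;

	for_each_pgdat(pgdat)
		total += pgdat->node_present_pages;

	return total;
}
```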
311 /* 311 /*
312 * next_zone - helper magic for for_each_zone() 312 * next_zone - helper magic for for_each_zone()
313 * Thanks to William Lee Irwin III for this piece of ingenuity. 313 * Thanks to William Lee Irwin III for this piece of ingenuity.
314 */ 314 */
315 static inline struct zone *next_zone(struct zone *zone) 315 static inline struct zone *next_zone(struct zone *zone)
316 { 316 {
317 pg_data_t *pgdat = zone->zone_pgdat; 317 pg_data_t *pgdat = zone->zone_pgdat;
318 318
319 if (zone < pgdat->node_zones + MAX_NR_ZONES - 1) 319 if (zone < pgdat->node_zones + MAX_NR_ZONES - 1)
320 zone++; 320 zone++;
321 else if (pgdat->pgdat_next) { 321 else if (pgdat->pgdat_next) {
322 pgdat = pgdat->pgdat_next; 322 pgdat = pgdat->pgdat_next;
323 zone = pgdat->node_zones; 323 zone = pgdat->node_zones;
324 } else 324 } else
325 zone = NULL; 325 zone = NULL;
326 326
327 return zone; 327 return zone;
328 } 328 }
329 329
330 /** 330 /**
331 * for_each_zone - helper macro to iterate over all memory zones 331 * for_each_zone - helper macro to iterate over all memory zones
332 * @zone - pointer to struct zone variable 332 * @zone - pointer to struct zone variable
333 * 333 *
334 * The user only needs to declare the zone variable, for_each_zone 334 * The user only needs to declare the zone variable, for_each_zone
335 * fills it in. This basically means for_each_zone() is an 335 * fills it in. This basically means for_each_zone() is an
336 * easier to read version of this piece of code: 336 * easier to read version of this piece of code:
337 * 337 *
338 * for (pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next) 338 * for (pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next)
339 * for (i = 0; i < MAX_NR_ZONES; ++i) { 339 * for (i = 0; i < MAX_NR_ZONES; ++i) {
340 * struct zone * z = pgdat->node_zones + i; 340 * struct zone * z = pgdat->node_zones + i;
341 * ... 341 * ...
342 * } 342 * }
343 * } 343 * }
344 */ 344 */
345 #define for_each_zone(zone) \ 345 #define for_each_zone(zone) \
346 for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone)) 346 for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone))
347 347
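A corresponding usage sketch for for_each_zone(): the macro hides the pgdat-crossing logic of next_zone(), so a caller only declares the zone pointer. The function name and printk format below are made up for the example.

```c
#include <linux/kernel.h>
#include <linux/mmzone.h>

/*
 * Illustration only: walk every zone in the system and report its size,
 * letting for_each_zone() cross pgdat boundaries transparently.
 */
static void example_dump_zone_sizes(void)
{
	struct zone *zone;

	for_each_zone(zone)
		printk(KERN_DEBUG "%s: spanned %lu, present %lu pages\n",
		       zone->name, zone->spanned_pages, zone->present_pages);
}
```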
348 static inline int is_highmem_idx(int idx) 348 static inline int is_highmem_idx(int idx)
349 { 349 {
350 return (idx == ZONE_HIGHMEM); 350 return (idx == ZONE_HIGHMEM);
351 } 351 }
352 352
353 static inline int is_normal_idx(int idx) 353 static inline int is_normal_idx(int idx)
354 { 354 {
355 return (idx == ZONE_NORMAL); 355 return (idx == ZONE_NORMAL);
356 } 356 }
357 /** 357 /**
358 * is_highmem - helper function to quickly check if a struct zone is a 358 * is_highmem - helper function to quickly check if a struct zone is a
359 * highmem zone or not. This is an attempt to keep references 359 * highmem zone or not. This is an attempt to keep references
360 * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. 360 * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum.
361 * @zone - pointer to struct zone variable 361 * @zone - pointer to struct zone variable
362 */ 362 */
363 static inline int is_highmem(struct zone *zone) 363 static inline int is_highmem(struct zone *zone)
364 { 364 {
365 return zone == zone->zone_pgdat->node_zones + ZONE_HIGHMEM; 365 return zone == zone->zone_pgdat->node_zones + ZONE_HIGHMEM;
366 } 366 }
367 367
368 static inline int is_normal(struct zone *zone) 368 static inline int is_normal(struct zone *zone)
369 { 369 {
370 return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL; 370 return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL;
371 } 371 }
372 372
373 /* These two functions are used to setup the per zone pages min values */ 373 /* These two functions are used to setup the per zone pages min values */
374 struct ctl_table; 374 struct ctl_table;
375 struct file; 375 struct file;
376 int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *, 376 int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *,
377 void __user *, size_t *, loff_t *); 377 void __user *, size_t *, loff_t *);
378 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1]; 378 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
379 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *, 379 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
380 void __user *, size_t *, loff_t *); 380 void __user *, size_t *, loff_t *);
381 381
382 #include <linux/topology.h> 382 #include <linux/topology.h>
383 /* Returns the number of the current Node. */ 383 /* Returns the number of the current Node. */
384 #define numa_node_id() (cpu_to_node(_smp_processor_id())) 384 #define numa_node_id() (cpu_to_node(raw_smp_processor_id()))
385 385
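Note that numa_node_id() is now built on raw_smp_processor_id(), so the caller decides how precise it needs to be: an allocation hint can tolerate the task migrating mid-lookup, while anything that must match the CPU actually executing should hold preemption off. A small sketch (function name invented) of the strict variant:

```c
#include <linux/kernel.h>
#include <linux/mmzone.h>
#include <linux/preempt.h>

/*
 * Illustration only: the reported node is guaranteed to be the executing
 * CPU's node only while preemption stays disabled.
 */
static void example_report_current_node(void)
{
	preempt_disable();
	printk(KERN_DEBUG "running on node %d\n", numa_node_id());
	preempt_enable();
}
```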
386 #ifndef CONFIG_DISCONTIGMEM 386 #ifndef CONFIG_DISCONTIGMEM
387 387
388 extern struct pglist_data contig_page_data; 388 extern struct pglist_data contig_page_data;
389 #define NODE_DATA(nid) (&contig_page_data) 389 #define NODE_DATA(nid) (&contig_page_data)
390 #define NODE_MEM_MAP(nid) mem_map 390 #define NODE_MEM_MAP(nid) mem_map
391 #define MAX_NODES_SHIFT 1 391 #define MAX_NODES_SHIFT 1
392 #define pfn_to_nid(pfn) (0) 392 #define pfn_to_nid(pfn) (0)
393 393
394 #else /* CONFIG_DISCONTIGMEM */ 394 #else /* CONFIG_DISCONTIGMEM */
395 395
396 #include <asm/mmzone.h> 396 #include <asm/mmzone.h>
397 397
398 #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED) 398 #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED)
399 /* 399 /*
400 * with 32 bit page->flags field, we reserve 8 bits for node/zone info. 400 * with 32 bit page->flags field, we reserve 8 bits for node/zone info.
401 * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes. 401 * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes.
402 */ 402 */
403 #define MAX_NODES_SHIFT 6 403 #define MAX_NODES_SHIFT 6
404 #elif BITS_PER_LONG == 64 404 #elif BITS_PER_LONG == 64
405 /* 405 /*
406 * with 64 bit flags field, there's plenty of room. 406 * with 64 bit flags field, there's plenty of room.
407 */ 407 */
408 #define MAX_NODES_SHIFT 10 408 #define MAX_NODES_SHIFT 10
409 #endif 409 #endif
410 410
411 #endif /* !CONFIG_DISCONTIGMEM */ 411 #endif /* !CONFIG_DISCONTIGMEM */
412 412
413 #if NODES_SHIFT > MAX_NODES_SHIFT 413 #if NODES_SHIFT > MAX_NODES_SHIFT
414 #error NODES_SHIFT > MAX_NODES_SHIFT 414 #error NODES_SHIFT > MAX_NODES_SHIFT
415 #endif 415 #endif
416 416
417 /* There are currently 3 zones: DMA, Normal & Highmem, thus we need 2 bits */ 417 /* There are currently 3 zones: DMA, Normal & Highmem, thus we need 2 bits */
418 #define MAX_ZONES_SHIFT 2 418 #define MAX_ZONES_SHIFT 2
419 419
420 #if ZONES_SHIFT > MAX_ZONES_SHIFT 420 #if ZONES_SHIFT > MAX_ZONES_SHIFT
421 #error ZONES_SHIFT > MAX_ZONES_SHIFT 421 #error ZONES_SHIFT > MAX_ZONES_SHIFT
422 #endif 422 #endif
423 423
424 #endif /* !__ASSEMBLY__ */ 424 #endif /* !__ASSEMBLY__ */
425 #endif /* __KERNEL__ */ 425 #endif /* __KERNEL__ */
426 #endif /* _LINUX_MMZONE_H */ 426 #endif /* _LINUX_MMZONE_H */
427 427
1 #ifndef __LINUX_SMP_H 1 #ifndef __LINUX_SMP_H
2 #define __LINUX_SMP_H 2 #define __LINUX_SMP_H
3 3
4 /* 4 /*
5 * Generic SMP support 5 * Generic SMP support
6 * Alan Cox. <alan@redhat.com> 6 * Alan Cox. <alan@redhat.com>
7 */ 7 */
8 8
9 #include <linux/config.h> 9 #include <linux/config.h>
10 10
11 extern void cpu_idle(void); 11 extern void cpu_idle(void);
12 12
13 #ifdef CONFIG_SMP 13 #ifdef CONFIG_SMP
14 14
15 #include <linux/preempt.h> 15 #include <linux/preempt.h>
16 #include <linux/kernel.h> 16 #include <linux/kernel.h>
17 #include <linux/compiler.h> 17 #include <linux/compiler.h>
18 #include <linux/thread_info.h> 18 #include <linux/thread_info.h>
19 #include <asm/smp.h> 19 #include <asm/smp.h>
20 #include <asm/bug.h> 20 #include <asm/bug.h>
21 21
22 /* 22 /*
23 * main cross-CPU interfaces, handles INIT, TLB flush, STOP, etc. 23 * main cross-CPU interfaces, handles INIT, TLB flush, STOP, etc.
24 * (defined in asm header): 24 * (defined in asm header):
25 */ 25 */
26 26
27 /* 27 /*
28 * stops all CPUs but the current one: 28 * stops all CPUs but the current one:
29 */ 29 */
30 extern void smp_send_stop(void); 30 extern void smp_send_stop(void);
31 31
32 /* 32 /*
33 * sends a 'reschedule' event to another CPU: 33 * sends a 'reschedule' event to another CPU:
34 */ 34 */
35 extern void smp_send_reschedule(int cpu); 35 extern void smp_send_reschedule(int cpu);
36 36
37 37
38 /* 38 /*
39 * Prepare machine for booting other CPUs. 39 * Prepare machine for booting other CPUs.
40 */ 40 */
41 extern void smp_prepare_cpus(unsigned int max_cpus); 41 extern void smp_prepare_cpus(unsigned int max_cpus);
42 42
43 /* 43 /*
44 * Bring a CPU up 44 * Bring a CPU up
45 */ 45 */
46 extern int __cpu_up(unsigned int cpunum); 46 extern int __cpu_up(unsigned int cpunum);
47 47
48 /* 48 /*
49 * Final polishing of CPUs 49 * Final polishing of CPUs
50 */ 50 */
51 extern void smp_cpus_done(unsigned int max_cpus); 51 extern void smp_cpus_done(unsigned int max_cpus);
52 52
53 /* 53 /*
54 * Call a function on all other processors 54 * Call a function on all other processors
55 */ 55 */
56 extern int smp_call_function (void (*func) (void *info), void *info, 56 extern int smp_call_function (void (*func) (void *info), void *info,
57 int retry, int wait); 57 int retry, int wait);
58 58
59 /* 59 /*
60 * Call a function on all processors 60 * Call a function on all processors
61 */ 61 */
62 static inline int on_each_cpu(void (*func) (void *info), void *info, 62 static inline int on_each_cpu(void (*func) (void *info), void *info,
63 int retry, int wait) 63 int retry, int wait)
64 { 64 {
65 int ret = 0; 65 int ret = 0;
66 66
67 preempt_disable(); 67 preempt_disable();
68 ret = smp_call_function(func, info, retry, wait); 68 ret = smp_call_function(func, info, retry, wait);
69 func(info); 69 func(info);
70 preempt_enable(); 70 preempt_enable();
71 return ret; 71 return ret;
72 } 72 }
73 73
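A typical on_each_cpu() caller passes a small worker and lets the helper run it both remotely and locally; it disables preemption around the local call itself, and wait=1 makes it return only after every CPU has finished. The sketch below uses invented function names and assumes process context with interrupts enabled.

```c
#include <linux/kernel.h>
#include <linux/smp.h>

/* Illustration only: print a line from every CPU, including the caller's.
 * smp_processor_id() is safe here because the worker runs either with
 * preemption disabled (locally) or in interrupt context (remotely). */
static void example_say_hello(void *info)
{
	printk(KERN_DEBUG "hello from CPU %d\n", smp_processor_id());
}

static void example_greet_all_cpus(void)
{
	on_each_cpu(example_say_hello, NULL, 0, 1);
}
```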
74 #define MSG_ALL_BUT_SELF 0x8000 /* Assume <32768 CPU's */ 74 #define MSG_ALL_BUT_SELF 0x8000 /* Assume <32768 CPU's */
75 #define MSG_ALL 0x8001 75 #define MSG_ALL 0x8001
76 76
77 #define MSG_INVALIDATE_TLB 0x0001 /* Remote processor TLB invalidate */ 77 #define MSG_INVALIDATE_TLB 0x0001 /* Remote processor TLB invalidate */
78 #define MSG_STOP_CPU 0x0002 /* Sent to shut down slave CPU's 78 #define MSG_STOP_CPU 0x0002 /* Sent to shut down slave CPU's
79 * when rebooting 79 * when rebooting
80 */ 80 */
81 #define MSG_RESCHEDULE 0x0003 /* Reschedule request from master CPU*/ 81 #define MSG_RESCHEDULE 0x0003 /* Reschedule request from master CPU*/
82 #define MSG_CALL_FUNCTION 0x0004 /* Call function on all other CPUs */ 82 #define MSG_CALL_FUNCTION 0x0004 /* Call function on all other CPUs */
83 83
84 /* 84 /*
85 * Mark the boot cpu "online" so that it can call console drivers in 85 * Mark the boot cpu "online" so that it can call console drivers in
86 * printk() and can access its per-cpu storage. 86 * printk() and can access its per-cpu storage.
87 */ 87 */
88 void smp_prepare_boot_cpu(void); 88 void smp_prepare_boot_cpu(void);
89 89
90 #else /* !SMP */ 90 #else /* !SMP */
91 91
92 /* 92 /*
93 * These macros fold the SMP functionality into a single CPU system 93 * These macros fold the SMP functionality into a single CPU system
94 */ 94 */
95 95 #define raw_smp_processor_id() 0
96 #if !defined(__smp_processor_id) || !defined(CONFIG_PREEMPT)
97 # define smp_processor_id() 0
98 #endif
99 #define hard_smp_processor_id() 0 96 #define hard_smp_processor_id() 0
100 #define smp_call_function(func,info,retry,wait) ({ 0; }) 97 #define smp_call_function(func,info,retry,wait) ({ 0; })
101 #define on_each_cpu(func,info,retry,wait) ({ func(info); 0; }) 98 #define on_each_cpu(func,info,retry,wait) ({ func(info); 0; })
102 static inline void smp_send_reschedule(int cpu) { } 99 static inline void smp_send_reschedule(int cpu) { }
103 #define num_booting_cpus() 1 100 #define num_booting_cpus() 1
104 #define smp_prepare_boot_cpu() do {} while (0) 101 #define smp_prepare_boot_cpu() do {} while (0)
105 102
106 #endif /* !SMP */ 103 #endif /* !SMP */
107 104
108 /* 105 /*
109 * DEBUG_PREEMPT support: check whether smp_processor_id() is being 106 * smp_processor_id(): get the current CPU ID.
110 * used in a preemption-safe way.
111 * 107 *
112 * An architecture has to enable this debugging code explicitly. 108 * if DEBUG_PREEMPT is enabled then we check whether it is
113 * It can do so by renaming the smp_processor_id() macro to 109 * used in a preemption-safe way. (smp_processor_id() is safe
114 * __smp_processor_id(). This should only be done after some minimal 110 * if it's used in a preemption-off critical section, or in
115 * testing, because usually there are a number of false positives 111 * a thread that is bound to the current CPU.)
116 * that an architecture will trigger.
117 * 112 *
118 * To fix a false positive (i.e. smp_processor_id() use that the 113 * NOTE: raw_smp_processor_id() is for internal use only
119 * debugging code reports but which use for some reason is legal), 114 * (smp_processor_id() is the preferred variant), but in rare
120 * change the smp_processor_id() reference to _smp_processor_id(), 115 * instances it might also be used to turn off false positives
121 * which is the nondebug variant. NOTE: don't use this to hack around 116 * (i.e. smp_processor_id() use that the debugging code reports but
122 * real bugs. 117 * which use for some reason is legal). Don't use this to hack around
118 * the warning message, as your code might not work under PREEMPT.
123 */ 119 */
124 #ifdef __smp_processor_id 120 #ifdef CONFIG_DEBUG_PREEMPT
125 # if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) 121 extern unsigned int debug_smp_processor_id(void);
126 extern unsigned int smp_processor_id(void); 122 # define smp_processor_id() debug_smp_processor_id()
127 # else
128 # define smp_processor_id() __smp_processor_id()
129 # endif
130 # define _smp_processor_id() __smp_processor_id()
131 #else 123 #else
132 # define _smp_processor_id() smp_processor_id() 124 # define smp_processor_id() raw_smp_processor_id()
133 #endif 125 #endif
134 126
135 #define get_cpu() ({ preempt_disable(); smp_processor_id(); }) 127 #define get_cpu() ({ preempt_disable(); smp_processor_id(); })
136 #define put_cpu() preempt_enable() 128 #define put_cpu() preempt_enable()
137 #define put_cpu_no_resched() preempt_enable_no_resched() 129 #define put_cpu_no_resched() preempt_enable_no_resched()
138 130
139 #endif /* __LINUX_SMP_H */ 131 #endif /* __LINUX_SMP_H */
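The practical upshot of the new smp.h interface: code that genuinely needs to stay on one CPU pins itself with get_cpu()/put_cpu() (or an explicit preempt_disable()) and uses smp_processor_id(), while purely statistical per-CPU accesses use raw_smp_processor_id() and accept the rare migration race. A sketch of both patterns, with invented names and a per-CPU counter declared via <linux/percpu.h>:

```c
#include <linux/percpu.h>
#include <linux/smp.h>

static DEFINE_PER_CPU(unsigned long, example_counter);

/*
 * Pattern 1: we must stay on this CPU for the duration, so preemption is
 * disabled.  smp_processor_id() is legal here and DEBUG_PREEMPT stays quiet.
 */
static void example_count_strict(void)
{
	int cpu = get_cpu();	/* preempt_disable() + smp_processor_id() */

	per_cpu(example_counter, cpu)++;
	put_cpu();		/* preempt_enable() */
}

/*
 * Pattern 2: a statistical counter where being migrated between reading the
 * CPU number and doing the increment is harmless.  raw_smp_processor_id()
 * documents that the race is understood and accepted.
 */
static void example_count_relaxed(void)
{
	per_cpu(example_counter, raw_smp_processor_id())++;
}
```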
1 /* 1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX 2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket 3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level. 4 * interface as the means of communication with the user level.
5 * 5 *
6 * Definitions for the IP router. 6 * Definitions for the IP router.
7 * 7 *
8 * Version: @(#)route.h 1.0.4 05/27/93 8 * Version: @(#)route.h 1.0.4 05/27/93
9 * 9 *
10 * Authors: Ross Biro 10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Fixes: 12 * Fixes:
13 * Alan Cox : Reformatted. Added ip_rt_local() 13 * Alan Cox : Reformatted. Added ip_rt_local()
14 * Alan Cox : Support for TCP parameters. 14 * Alan Cox : Support for TCP parameters.
15 * Alexey Kuznetsov: Major changes for new routing code. 15 * Alexey Kuznetsov: Major changes for new routing code.
16 * Mike McLagan : Routing by source 16 * Mike McLagan : Routing by source
17 * Robert Olsson : Added rt_cache statistics 17 * Robert Olsson : Added rt_cache statistics
18 * 18 *
19 * This program is free software; you can redistribute it and/or 19 * This program is free software; you can redistribute it and/or
20 * modify it under the terms of the GNU General Public License 20 * modify it under the terms of the GNU General Public License
21 * as published by the Free Software Foundation; either version 21 * as published by the Free Software Foundation; either version
22 * 2 of the License, or (at your option) any later version. 22 * 2 of the License, or (at your option) any later version.
23 */ 23 */
24 #ifndef _ROUTE_H 24 #ifndef _ROUTE_H
25 #define _ROUTE_H 25 #define _ROUTE_H
26 26
27 #include <linux/config.h> 27 #include <linux/config.h>
28 #include <net/dst.h> 28 #include <net/dst.h>
29 #include <net/inetpeer.h> 29 #include <net/inetpeer.h>
30 #include <net/flow.h> 30 #include <net/flow.h>
31 #include <linux/in_route.h> 31 #include <linux/in_route.h>
32 #include <linux/rtnetlink.h> 32 #include <linux/rtnetlink.h>
33 #include <linux/route.h> 33 #include <linux/route.h>
34 #include <linux/ip.h> 34 #include <linux/ip.h>
35 #include <linux/cache.h> 35 #include <linux/cache.h>
36 36
37 #ifndef __KERNEL__ 37 #ifndef __KERNEL__
38 #warning This file is not supposed to be used outside of kernel. 38 #warning This file is not supposed to be used outside of kernel.
39 #endif 39 #endif
40 40
41 #define RTO_ONLINK 0x01 41 #define RTO_ONLINK 0x01
42 42
43 #define RTO_CONN 0 43 #define RTO_CONN 0
44 /* RTO_CONN is not used (being alias for 0), but preserved not to break 44 /* RTO_CONN is not used (being alias for 0), but preserved not to break
45 * some modules referring to it. */ 45 * some modules referring to it. */
46 46
47 #define RT_CONN_FLAGS(sk) (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE)) 47 #define RT_CONN_FLAGS(sk) (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE))
48 48
49 struct fib_nh; 49 struct fib_nh;
50 struct inet_peer; 50 struct inet_peer;
51 struct rtable 51 struct rtable
52 { 52 {
53 union 53 union
54 { 54 {
55 struct dst_entry dst; 55 struct dst_entry dst;
56 struct rtable *rt_next; 56 struct rtable *rt_next;
57 } u; 57 } u;
58 58
59 struct in_device *idev; 59 struct in_device *idev;
60 60
61 unsigned rt_flags; 61 unsigned rt_flags;
62 __u16 rt_type; 62 __u16 rt_type;
63 __u16 rt_multipath_alg; 63 __u16 rt_multipath_alg;
64 64
65 __u32 rt_dst; /* Path destination */ 65 __u32 rt_dst; /* Path destination */
66 __u32 rt_src; /* Path source */ 66 __u32 rt_src; /* Path source */
67 int rt_iif; 67 int rt_iif;
68 68
69 /* Info on neighbour */ 69 /* Info on neighbour */
70 __u32 rt_gateway; 70 __u32 rt_gateway;
71 71
72 /* Cache lookup keys */ 72 /* Cache lookup keys */
73 struct flowi fl; 73 struct flowi fl;
74 74
75 /* Miscellaneous cached information */ 75 /* Miscellaneous cached information */
76 __u32 rt_spec_dst; /* RFC1122 specific destination */ 76 __u32 rt_spec_dst; /* RFC1122 specific destination */
77 struct inet_peer *peer; /* long-living peer info */ 77 struct inet_peer *peer; /* long-living peer info */
78 }; 78 };
79 79
80 struct ip_rt_acct 80 struct ip_rt_acct
81 { 81 {
82 __u32 o_bytes; 82 __u32 o_bytes;
83 __u32 o_packets; 83 __u32 o_packets;
84 __u32 i_bytes; 84 __u32 i_bytes;
85 __u32 i_packets; 85 __u32 i_packets;
86 }; 86 };
87 87
88 struct rt_cache_stat 88 struct rt_cache_stat
89 { 89 {
90 unsigned int in_hit; 90 unsigned int in_hit;
91 unsigned int in_slow_tot; 91 unsigned int in_slow_tot;
92 unsigned int in_slow_mc; 92 unsigned int in_slow_mc;
93 unsigned int in_no_route; 93 unsigned int in_no_route;
94 unsigned int in_brd; 94 unsigned int in_brd;
95 unsigned int in_martian_dst; 95 unsigned int in_martian_dst;
96 unsigned int in_martian_src; 96 unsigned int in_martian_src;
97 unsigned int out_hit; 97 unsigned int out_hit;
98 unsigned int out_slow_tot; 98 unsigned int out_slow_tot;
99 unsigned int out_slow_mc; 99 unsigned int out_slow_mc;
100 unsigned int gc_total; 100 unsigned int gc_total;
101 unsigned int gc_ignored; 101 unsigned int gc_ignored;
102 unsigned int gc_goal_miss; 102 unsigned int gc_goal_miss;
103 unsigned int gc_dst_overflow; 103 unsigned int gc_dst_overflow;
104 unsigned int in_hlist_search; 104 unsigned int in_hlist_search;
105 unsigned int out_hlist_search; 105 unsigned int out_hlist_search;
106 }; 106 };
107 107
108 extern struct rt_cache_stat *rt_cache_stat; 108 extern struct rt_cache_stat *rt_cache_stat;
109 #define RT_CACHE_STAT_INC(field) \ 109 #define RT_CACHE_STAT_INC(field) \
110 (per_cpu_ptr(rt_cache_stat, _smp_processor_id())->field++) 110 (per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++)
111 111
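RT_CACHE_STAT_INC() is a good example of the intended raw_smp_processor_id() use: the counters are approximate per-CPU statistics, so under PREEMPT an occasional increment of a neighbouring CPU's counter is an accepted, harmless race. A minimal usage sketch (the wrapper function is invented; real callers are inline in net/ipv4/route.c):

```c
#include <net/route.h>

/* Illustration only: bump a per-CPU routing cache statistic. */
static void example_note_route_cache_hit(void)
{
	RT_CACHE_STAT_INC(in_hit);
}
```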
112 extern struct ip_rt_acct *ip_rt_acct; 112 extern struct ip_rt_acct *ip_rt_acct;
113 113
114 struct in_device; 114 struct in_device;
115 extern int ip_rt_init(void); 115 extern int ip_rt_init(void);
116 extern void ip_rt_redirect(u32 old_gw, u32 dst, u32 new_gw, 116 extern void ip_rt_redirect(u32 old_gw, u32 dst, u32 new_gw,
117 u32 src, u8 tos, struct net_device *dev); 117 u32 src, u8 tos, struct net_device *dev);
118 extern void ip_rt_advice(struct rtable **rp, int advice); 118 extern void ip_rt_advice(struct rtable **rp, int advice);
119 extern void rt_cache_flush(int how); 119 extern void rt_cache_flush(int how);
120 extern int __ip_route_output_key(struct rtable **, const struct flowi *flp); 120 extern int __ip_route_output_key(struct rtable **, const struct flowi *flp);
121 extern int ip_route_output_key(struct rtable **, struct flowi *flp); 121 extern int ip_route_output_key(struct rtable **, struct flowi *flp);
122 extern int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags); 122 extern int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags);
123 extern int ip_route_input(struct sk_buff*, u32 dst, u32 src, u8 tos, struct net_device *devin); 123 extern int ip_route_input(struct sk_buff*, u32 dst, u32 src, u8 tos, struct net_device *devin);
124 extern unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu); 124 extern unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu);
125 extern void ip_rt_send_redirect(struct sk_buff *skb); 125 extern void ip_rt_send_redirect(struct sk_buff *skb);
126 126
127 extern unsigned inet_addr_type(u32 addr); 127 extern unsigned inet_addr_type(u32 addr);
128 extern void ip_rt_multicast_event(struct in_device *); 128 extern void ip_rt_multicast_event(struct in_device *);
129 extern int ip_rt_ioctl(unsigned int cmd, void __user *arg); 129 extern int ip_rt_ioctl(unsigned int cmd, void __user *arg);
130 extern void ip_rt_get_source(u8 *src, struct rtable *rt); 130 extern void ip_rt_get_source(u8 *src, struct rtable *rt);
131 extern int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb); 131 extern int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb);
132 132
133 static inline void ip_rt_put(struct rtable * rt) 133 static inline void ip_rt_put(struct rtable * rt)
134 { 134 {
135 if (rt) 135 if (rt)
136 dst_release(&rt->u.dst); 136 dst_release(&rt->u.dst);
137 } 137 }
138 138
139 #define IPTOS_RT_MASK (IPTOS_TOS_MASK & ~3) 139 #define IPTOS_RT_MASK (IPTOS_TOS_MASK & ~3)
140 140
141 extern __u8 ip_tos2prio[16]; 141 extern __u8 ip_tos2prio[16];
142 142
143 static inline char rt_tos2priority(u8 tos) 143 static inline char rt_tos2priority(u8 tos)
144 { 144 {
145 return ip_tos2prio[IPTOS_TOS(tos)>>1]; 145 return ip_tos2prio[IPTOS_TOS(tos)>>1];
146 } 146 }
147 147
148 static inline int ip_route_connect(struct rtable **rp, u32 dst, 148 static inline int ip_route_connect(struct rtable **rp, u32 dst,
149 u32 src, u32 tos, int oif, u8 protocol, 149 u32 src, u32 tos, int oif, u8 protocol,
150 u16 sport, u16 dport, struct sock *sk) 150 u16 sport, u16 dport, struct sock *sk)
151 { 151 {
152 struct flowi fl = { .oif = oif, 152 struct flowi fl = { .oif = oif,
153 .nl_u = { .ip4_u = { .daddr = dst, 153 .nl_u = { .ip4_u = { .daddr = dst,
154 .saddr = src, 154 .saddr = src,
155 .tos = tos } }, 155 .tos = tos } },
156 .proto = protocol, 156 .proto = protocol,
157 .uli_u = { .ports = 157 .uli_u = { .ports =
158 { .sport = sport, 158 { .sport = sport,
159 .dport = dport } } }; 159 .dport = dport } } };
160 160
161 int err; 161 int err;
162 if (!dst || !src) { 162 if (!dst || !src) {
163 err = __ip_route_output_key(rp, &fl); 163 err = __ip_route_output_key(rp, &fl);
164 if (err) 164 if (err)
165 return err; 165 return err;
166 fl.fl4_dst = (*rp)->rt_dst; 166 fl.fl4_dst = (*rp)->rt_dst;
167 fl.fl4_src = (*rp)->rt_src; 167 fl.fl4_src = (*rp)->rt_src;
168 ip_rt_put(*rp); 168 ip_rt_put(*rp);
169 *rp = NULL; 169 *rp = NULL;
170 } 170 }
171 return ip_route_output_flow(rp, &fl, sk, 0); 171 return ip_route_output_flow(rp, &fl, sk, 0);
172 } 172 }
173 173
174 static inline int ip_route_newports(struct rtable **rp, u16 sport, u16 dport, 174 static inline int ip_route_newports(struct rtable **rp, u16 sport, u16 dport,
175 struct sock *sk) 175 struct sock *sk)
176 { 176 {
177 if (sport != (*rp)->fl.fl_ip_sport || 177 if (sport != (*rp)->fl.fl_ip_sport ||
178 dport != (*rp)->fl.fl_ip_dport) { 178 dport != (*rp)->fl.fl_ip_dport) {
179 struct flowi fl; 179 struct flowi fl;
180 180
181 memcpy(&fl, &(*rp)->fl, sizeof(fl)); 181 memcpy(&fl, &(*rp)->fl, sizeof(fl));
182 fl.fl_ip_sport = sport; 182 fl.fl_ip_sport = sport;
183 fl.fl_ip_dport = dport; 183 fl.fl_ip_dport = dport;
184 ip_rt_put(*rp); 184 ip_rt_put(*rp);
185 *rp = NULL; 185 *rp = NULL;
186 return ip_route_output_flow(rp, &fl, sk, 0); 186 return ip_route_output_flow(rp, &fl, sk, 0);
187 } 187 }
188 return 0; 188 return 0;
189 } 189 }
190 190
191 extern void rt_bind_peer(struct rtable *rt, int create); 191 extern void rt_bind_peer(struct rtable *rt, int create);
192 192
193 static inline struct inet_peer *rt_get_peer(struct rtable *rt) 193 static inline struct inet_peer *rt_get_peer(struct rtable *rt)
194 { 194 {
195 if (rt->peer) 195 if (rt->peer)
196 return rt->peer; 196 return rt->peer;
197 197
198 rt_bind_peer(rt, 0); 198 rt_bind_peer(rt, 0);
199 return rt->peer; 199 return rt->peer;
200 } 200 }
201 201
202 #endif /* _ROUTE_H */ 202 #endif /* _ROUTE_H */
203 203
1 /* 1 /*
2 * 2 *
3 * SNMP MIB entries for the IP subsystem. 3 * SNMP MIB entries for the IP subsystem.
4 * 4 *
5 * Alan Cox <gw4pts@gw4pts.ampr.org> 5 * Alan Cox <gw4pts@gw4pts.ampr.org>
6 * 6 *
7 * We don't choose to implement SNMP in the kernel (this would 7 * We don't choose to implement SNMP in the kernel (this would
8 * be silly as SNMP is a pain in the backside in places). We do 8 * be silly as SNMP is a pain in the backside in places). We do
9 * however need to collect the MIB statistics and export them 9 * however need to collect the MIB statistics and export them
10 * out of /proc (eventually) 10 * out of /proc (eventually)
11 * 11 *
12 * This program is free software; you can redistribute it and/or 12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License 13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version. 15 * 2 of the License, or (at your option) any later version.
16 * 16 *
17 * $Id: snmp.h,v 1.19 2001/06/14 13:40:46 davem Exp $ 17 * $Id: snmp.h,v 1.19 2001/06/14 13:40:46 davem Exp $
18 * 18 *
19 */ 19 */
20 20
21 #ifndef _SNMP_H 21 #ifndef _SNMP_H
22 #define _SNMP_H 22 #define _SNMP_H
23 23
24 #include <linux/cache.h> 24 #include <linux/cache.h>
25 #include <linux/snmp.h> 25 #include <linux/snmp.h>
26 26
27 /* 27 /*
28 * Mibs are stored in array of unsigned long. 28 * Mibs are stored in array of unsigned long.
29 */ 29 */
30 /* 30 /*
31 * struct snmp_mib{} 31 * struct snmp_mib{}
32 * - list of entries for particular API (such as /proc/net/snmp) 32 * - list of entries for particular API (such as /proc/net/snmp)
33 * - name of entries. 33 * - name of entries.
34 */ 34 */
35 struct snmp_mib { 35 struct snmp_mib {
36 char *name; 36 char *name;
37 int entry; 37 int entry;
38 }; 38 };
39 39
40 #define SNMP_MIB_ITEM(_name,_entry) { \ 40 #define SNMP_MIB_ITEM(_name,_entry) { \
41 .name = _name, \ 41 .name = _name, \
42 .entry = _entry, \ 42 .entry = _entry, \
43 } 43 }
44 44
45 #define SNMP_MIB_SENTINEL { \ 45 #define SNMP_MIB_SENTINEL { \
46 .name = NULL, \ 46 .name = NULL, \
47 .entry = 0, \ 47 .entry = 0, \
48 } 48 }
49 49
50 /* 50 /*
51 * We use all unsigned longs. Linux will soon be so reliable that even 51 * We use all unsigned longs. Linux will soon be so reliable that even
52 * these will rapidly get too small 8-). Seriously consider the IpInReceives 52 * these will rapidly get too small 8-). Seriously consider the IpInReceives
53 * count on the 20Gb/s + networks people expect in a few years time! 53 * count on the 20Gb/s + networks people expect in a few years time!
54 */ 54 */
55 55
56 /* 56 /*
57 * The rule for padding: 57 * The rule for padding:
58 * Best is power of two because then the right structure can be found by a 58 * Best is power of two because then the right structure can be found by a
59 * simple shift. The structure should be always cache line aligned. 59 * simple shift. The structure should be always cache line aligned.
60 * gcc needs n=alignto(cachelinesize, popcnt(sizeof(bla_mib))) shift/add 60 * gcc needs n=alignto(cachelinesize, popcnt(sizeof(bla_mib))) shift/add
61 * instructions to emulate multiply in case it is not power-of-two. 61 * instructions to emulate multiply in case it is not power-of-two.
62 * Currently n is always <=3 for all sizes so simple cache line alignment 62 * Currently n is always <=3 for all sizes so simple cache line alignment
63 * is enough. 63 * is enough.
64 * 64 *
65 * The best solution would be a global CPU local area , especially on 64 65 * The best solution would be a global CPU local area , especially on 64
66 * and 128byte cacheline machine it makes a *lot* of sense -AK 66 * and 128byte cacheline machine it makes a *lot* of sense -AK
67 */ 67 */
68 68
69 #define __SNMP_MIB_ALIGN__ ____cacheline_aligned 69 #define __SNMP_MIB_ALIGN__ ____cacheline_aligned
70 70
71 /* IPstats */ 71 /* IPstats */
72 #define IPSTATS_MIB_MAX __IPSTATS_MIB_MAX 72 #define IPSTATS_MIB_MAX __IPSTATS_MIB_MAX
73 struct ipstats_mib { 73 struct ipstats_mib {
74 unsigned long mibs[IPSTATS_MIB_MAX]; 74 unsigned long mibs[IPSTATS_MIB_MAX];
75 } __SNMP_MIB_ALIGN__; 75 } __SNMP_MIB_ALIGN__;
76 76
77 /* ICMP */ 77 /* ICMP */
78 #define ICMP_MIB_DUMMY __ICMP_MIB_MAX 78 #define ICMP_MIB_DUMMY __ICMP_MIB_MAX
79 #define ICMP_MIB_MAX (__ICMP_MIB_MAX + 1) 79 #define ICMP_MIB_MAX (__ICMP_MIB_MAX + 1)
80 80
81 struct icmp_mib { 81 struct icmp_mib {
82 unsigned long mibs[ICMP_MIB_MAX]; 82 unsigned long mibs[ICMP_MIB_MAX];
83 } __SNMP_MIB_ALIGN__; 83 } __SNMP_MIB_ALIGN__;
84 84
85 /* ICMP6 (IPv6-ICMP) */ 85 /* ICMP6 (IPv6-ICMP) */
86 #define ICMP6_MIB_MAX __ICMP6_MIB_MAX 86 #define ICMP6_MIB_MAX __ICMP6_MIB_MAX
87 struct icmpv6_mib { 87 struct icmpv6_mib {
88 unsigned long mibs[ICMP6_MIB_MAX]; 88 unsigned long mibs[ICMP6_MIB_MAX];
89 } __SNMP_MIB_ALIGN__; 89 } __SNMP_MIB_ALIGN__;
90 90
91 /* TCP */ 91 /* TCP */
92 #define TCP_MIB_MAX __TCP_MIB_MAX 92 #define TCP_MIB_MAX __TCP_MIB_MAX
93 struct tcp_mib { 93 struct tcp_mib {
94 unsigned long mibs[TCP_MIB_MAX]; 94 unsigned long mibs[TCP_MIB_MAX];
95 } __SNMP_MIB_ALIGN__; 95 } __SNMP_MIB_ALIGN__;
96 96
97 /* UDP */ 97 /* UDP */
98 #define UDP_MIB_MAX __UDP_MIB_MAX 98 #define UDP_MIB_MAX __UDP_MIB_MAX
99 struct udp_mib { 99 struct udp_mib {
100 unsigned long mibs[UDP_MIB_MAX]; 100 unsigned long mibs[UDP_MIB_MAX];
101 } __SNMP_MIB_ALIGN__; 101 } __SNMP_MIB_ALIGN__;
102 102
103 /* SCTP */ 103 /* SCTP */
104 #define SCTP_MIB_MAX __SCTP_MIB_MAX 104 #define SCTP_MIB_MAX __SCTP_MIB_MAX
105 struct sctp_mib { 105 struct sctp_mib {
106 unsigned long mibs[SCTP_MIB_MAX]; 106 unsigned long mibs[SCTP_MIB_MAX];
107 } __SNMP_MIB_ALIGN__; 107 } __SNMP_MIB_ALIGN__;
108 108
109 /* Linux */ 109 /* Linux */
110 #define LINUX_MIB_MAX __LINUX_MIB_MAX 110 #define LINUX_MIB_MAX __LINUX_MIB_MAX
111 struct linux_mib { 111 struct linux_mib {
112 unsigned long mibs[LINUX_MIB_MAX]; 112 unsigned long mibs[LINUX_MIB_MAX];
113 }; 113 };
114 114
115 115
116 /* 116 /*
117 * FIXME: On x86 and some other CPUs the split into user and softirq parts 117 * FIXME: On x86 and some other CPUs the split into user and softirq parts
118 * is not needed because addl $1,memory is atomic against interrupts (but 118 * is not needed because addl $1,memory is atomic against interrupts (but
119 * atomic_inc would be overkill because of the lock cycles). Wants new 119 * atomic_inc would be overkill because of the lock cycles). Wants new
120 * nonlocked_atomic_inc() primitives -AK 120 * nonlocked_atomic_inc() primitives -AK
121 */ 121 */
122 #define DEFINE_SNMP_STAT(type, name) \ 122 #define DEFINE_SNMP_STAT(type, name) \
123 __typeof__(type) *name[2] 123 __typeof__(type) *name[2]
124 #define DECLARE_SNMP_STAT(type, name) \ 124 #define DECLARE_SNMP_STAT(type, name) \
125 extern __typeof__(type) *name[2] 125 extern __typeof__(type) *name[2]
126 126
127 #define SNMP_STAT_BHPTR(name) (name[0]) 127 #define SNMP_STAT_BHPTR(name) (name[0])
128 #define SNMP_STAT_USRPTR(name) (name[1]) 128 #define SNMP_STAT_USRPTR(name) (name[1])
129 129
130 #define SNMP_INC_STATS_BH(mib, field) \ 130 #define SNMP_INC_STATS_BH(mib, field) \
131 (per_cpu_ptr(mib[0], _smp_processor_id())->mibs[field]++) 131 (per_cpu_ptr(mib[0], raw_smp_processor_id())->mibs[field]++)
132 #define SNMP_INC_STATS_OFFSET_BH(mib, field, offset) \ 132 #define SNMP_INC_STATS_OFFSET_BH(mib, field, offset) \
133 (per_cpu_ptr(mib[0], _smp_processor_id())->mibs[field + (offset)]++) 133 (per_cpu_ptr(mib[0], raw_smp_processor_id())->mibs[field + (offset)]++)
134 #define SNMP_INC_STATS_USER(mib, field) \ 134 #define SNMP_INC_STATS_USER(mib, field) \
135 (per_cpu_ptr(mib[1], _smp_processor_id())->mibs[field]++) 135 (per_cpu_ptr(mib[1], raw_smp_processor_id())->mibs[field]++)
136 #define SNMP_INC_STATS(mib, field) \ 136 #define SNMP_INC_STATS(mib, field) \
137 (per_cpu_ptr(mib[!in_softirq()], _smp_processor_id())->mibs[field]++) 137 (per_cpu_ptr(mib[!in_softirq()], raw_smp_processor_id())->mibs[field]++)
138 #define SNMP_DEC_STATS(mib, field) \ 138 #define SNMP_DEC_STATS(mib, field) \
139 (per_cpu_ptr(mib[!in_softirq()], _smp_processor_id())->mibs[field]--) 139 (per_cpu_ptr(mib[!in_softirq()], raw_smp_processor_id())->mibs[field]--)
140 #define SNMP_ADD_STATS_BH(mib, field, addend) \ 140 #define SNMP_ADD_STATS_BH(mib, field, addend) \
141 (per_cpu_ptr(mib[0], _smp_processor_id())->mibs[field] += addend) 141 (per_cpu_ptr(mib[0], raw_smp_processor_id())->mibs[field] += addend)
142 #define SNMP_ADD_STATS_USER(mib, field, addend) \ 142 #define SNMP_ADD_STATS_USER(mib, field, addend) \
143 (per_cpu_ptr(mib[1], _smp_processor_id())->mibs[field] += addend) 143 (per_cpu_ptr(mib[1], raw_smp_processor_id())->mibs[field] += addend)
144 144
145 #endif 145 #endif
146 146
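The same "statistics may be slightly off" reasoning applies to these MIB macros, which is why they can index the per-CPU copy with raw_smp_processor_id(). The sketch below declares a private MIB and bumps it from softirq and process context; the MIB name and functions are invented, and allocation of the two per-CPU copies (normally done with alloc_percpu() at init time) is omitted.

```c
#include <linux/snmp.h>
#include <net/snmp.h>

/*
 * Illustration only: index [0] is the softirq copy, [1] the user-context
 * copy, so the _BH and _USER variants never collide on the same counter.
 */
static DEFINE_SNMP_STAT(struct udp_mib, example_udp_stats);

static void example_count_in_softirq(void)
{
	SNMP_INC_STATS_BH(example_udp_stats, UDP_MIB_INDATAGRAMS);
}

static void example_count_in_user_context(void)
{
	SNMP_INC_STATS_USER(example_udp_stats, UDP_MIB_OUTDATAGRAMS);
}
```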
1 /* Rewritten by Rusty Russell, on the backs of many others... 1 /* Rewritten by Rusty Russell, on the backs of many others...
2 Copyright (C) 2002 Richard Henderson 2 Copyright (C) 2002 Richard Henderson
3 Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. 3 Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
4 4
5 This program is free software; you can redistribute it and/or modify 5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by 6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2 of the License, or 7 the Free Software Foundation; either version 2 of the License, or
8 (at your option) any later version. 8 (at your option) any later version.
9 9
10 This program is distributed in the hope that it will be useful, 10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details. 13 GNU General Public License for more details.
14 14
15 You should have received a copy of the GNU General Public License 15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software 16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */ 18 */
19 #include <linux/config.h> 19 #include <linux/config.h>
20 #include <linux/module.h> 20 #include <linux/module.h>
21 #include <linux/moduleloader.h> 21 #include <linux/moduleloader.h>
22 #include <linux/init.h> 22 #include <linux/init.h>
23 #include <linux/slab.h> 23 #include <linux/slab.h>
24 #include <linux/vmalloc.h> 24 #include <linux/vmalloc.h>
25 #include <linux/elf.h> 25 #include <linux/elf.h>
26 #include <linux/seq_file.h> 26 #include <linux/seq_file.h>
27 #include <linux/syscalls.h> 27 #include <linux/syscalls.h>
28 #include <linux/fcntl.h> 28 #include <linux/fcntl.h>
29 #include <linux/rcupdate.h> 29 #include <linux/rcupdate.h>
30 #include <linux/cpu.h> 30 #include <linux/cpu.h>
31 #include <linux/moduleparam.h> 31 #include <linux/moduleparam.h>
32 #include <linux/errno.h> 32 #include <linux/errno.h>
33 #include <linux/err.h> 33 #include <linux/err.h>
34 #include <linux/vermagic.h> 34 #include <linux/vermagic.h>
35 #include <linux/notifier.h> 35 #include <linux/notifier.h>
36 #include <linux/stop_machine.h> 36 #include <linux/stop_machine.h>
37 #include <linux/device.h> 37 #include <linux/device.h>
38 #include <asm/uaccess.h> 38 #include <asm/uaccess.h>
39 #include <asm/semaphore.h> 39 #include <asm/semaphore.h>
40 #include <asm/cacheflush.h> 40 #include <asm/cacheflush.h>
41 41
42 #if 0 42 #if 0
43 #define DEBUGP printk 43 #define DEBUGP printk
44 #else 44 #else
45 #define DEBUGP(fmt , a...) 45 #define DEBUGP(fmt , a...)
46 #endif 46 #endif
47 47
48 #ifndef ARCH_SHF_SMALL 48 #ifndef ARCH_SHF_SMALL
49 #define ARCH_SHF_SMALL 0 49 #define ARCH_SHF_SMALL 0
50 #endif 50 #endif
51 51
52 /* If this is set, the section belongs in the init part of the module */ 52 /* If this is set, the section belongs in the init part of the module */
53 #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) 53 #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
54 54
55 /* Protects module list */ 55 /* Protects module list */
56 static DEFINE_SPINLOCK(modlist_lock); 56 static DEFINE_SPINLOCK(modlist_lock);
57 57
58 /* List of modules, protected by module_mutex AND modlist_lock */ 58 /* List of modules, protected by module_mutex AND modlist_lock */
59 static DECLARE_MUTEX(module_mutex); 59 static DECLARE_MUTEX(module_mutex);
60 static LIST_HEAD(modules); 60 static LIST_HEAD(modules);
61 61
62 static DECLARE_MUTEX(notify_mutex); 62 static DECLARE_MUTEX(notify_mutex);
63 static struct notifier_block * module_notify_list; 63 static struct notifier_block * module_notify_list;
64 64
65 int register_module_notifier(struct notifier_block * nb) 65 int register_module_notifier(struct notifier_block * nb)
66 { 66 {
67 int err; 67 int err;
68 down(&notify_mutex); 68 down(&notify_mutex);
69 err = notifier_chain_register(&module_notify_list, nb); 69 err = notifier_chain_register(&module_notify_list, nb);
70 up(&notify_mutex); 70 up(&notify_mutex);
71 return err; 71 return err;
72 } 72 }
73 EXPORT_SYMBOL(register_module_notifier); 73 EXPORT_SYMBOL(register_module_notifier);
74 74
75 int unregister_module_notifier(struct notifier_block * nb) 75 int unregister_module_notifier(struct notifier_block * nb)
76 { 76 {
77 int err; 77 int err;
78 down(&notify_mutex); 78 down(&notify_mutex);
79 err = notifier_chain_unregister(&module_notify_list, nb); 79 err = notifier_chain_unregister(&module_notify_list, nb);
80 up(&notify_mutex); 80 up(&notify_mutex);
81 return err; 81 return err;
82 } 82 }
83 EXPORT_SYMBOL(unregister_module_notifier); 83 EXPORT_SYMBOL(unregister_module_notifier);
84 84
85 /* We require a truly strong try_module_get() */ 85 /* We require a truly strong try_module_get() */
86 static inline int strong_try_module_get(struct module *mod) 86 static inline int strong_try_module_get(struct module *mod)
87 { 87 {
88 if (mod && mod->state == MODULE_STATE_COMING) 88 if (mod && mod->state == MODULE_STATE_COMING)
89 return 0; 89 return 0;
90 return try_module_get(mod); 90 return try_module_get(mod);
91 } 91 }
92 92
93 /* A thread that wants to hold a reference to a module only while it 93 /* A thread that wants to hold a reference to a module only while it
94 * is running can call this to safely exit. 94 * is running can call this to safely exit.
95 * nfsd and lockd use this. 95 * nfsd and lockd use this.
96 */ 96 */
97 void __module_put_and_exit(struct module *mod, long code) 97 void __module_put_and_exit(struct module *mod, long code)
98 { 98 {
99 module_put(mod); 99 module_put(mod);
100 do_exit(code); 100 do_exit(code);
101 } 101 }
102 EXPORT_SYMBOL(__module_put_and_exit); 102 EXPORT_SYMBOL(__module_put_and_exit);
103 103
104 /* Find a module section: 0 means not found. */ 104 /* Find a module section: 0 means not found. */
105 static unsigned int find_sec(Elf_Ehdr *hdr, 105 static unsigned int find_sec(Elf_Ehdr *hdr,
106 Elf_Shdr *sechdrs, 106 Elf_Shdr *sechdrs,
107 const char *secstrings, 107 const char *secstrings,
108 const char *name) 108 const char *name)
109 { 109 {
110 unsigned int i; 110 unsigned int i;
111 111
112 for (i = 1; i < hdr->e_shnum; i++) 112 for (i = 1; i < hdr->e_shnum; i++)
113 /* Alloc bit cleared means "ignore it." */ 113 /* Alloc bit cleared means "ignore it." */
114 if ((sechdrs[i].sh_flags & SHF_ALLOC) 114 if ((sechdrs[i].sh_flags & SHF_ALLOC)
115 && strcmp(secstrings+sechdrs[i].sh_name, name) == 0) 115 && strcmp(secstrings+sechdrs[i].sh_name, name) == 0)
116 return i; 116 return i;
117 return 0; 117 return 0;
118 } 118 }
119 119
120 /* Provided by the linker */ 120 /* Provided by the linker */
121 extern const struct kernel_symbol __start___ksymtab[]; 121 extern const struct kernel_symbol __start___ksymtab[];
122 extern const struct kernel_symbol __stop___ksymtab[]; 122 extern const struct kernel_symbol __stop___ksymtab[];
123 extern const struct kernel_symbol __start___ksymtab_gpl[]; 123 extern const struct kernel_symbol __start___ksymtab_gpl[];
124 extern const struct kernel_symbol __stop___ksymtab_gpl[]; 124 extern const struct kernel_symbol __stop___ksymtab_gpl[];
125 extern const unsigned long __start___kcrctab[]; 125 extern const unsigned long __start___kcrctab[];
126 extern const unsigned long __start___kcrctab_gpl[]; 126 extern const unsigned long __start___kcrctab_gpl[];
127 127
128 #ifndef CONFIG_MODVERSIONS 128 #ifndef CONFIG_MODVERSIONS
129 #define symversion(base, idx) NULL 129 #define symversion(base, idx) NULL
130 #else 130 #else
131 #define symversion(base, idx) ((base) ? ((base) + (idx)) : NULL) 131 #define symversion(base, idx) ((base) ? ((base) + (idx)) : NULL)
132 #endif 132 #endif
133 133
134 /* Find a symbol, return value, crc and module which owns it */ 134 /* Find a symbol, return value, crc and module which owns it */
135 static unsigned long __find_symbol(const char *name, 135 static unsigned long __find_symbol(const char *name,
136 struct module **owner, 136 struct module **owner,
137 const unsigned long **crc, 137 const unsigned long **crc,
138 int gplok) 138 int gplok)
139 { 139 {
140 struct module *mod; 140 struct module *mod;
141 unsigned int i; 141 unsigned int i;
142 142
143 /* Core kernel first. */ 143 /* Core kernel first. */
144 *owner = NULL; 144 *owner = NULL;
145 for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++) { 145 for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++) {
146 if (strcmp(__start___ksymtab[i].name, name) == 0) { 146 if (strcmp(__start___ksymtab[i].name, name) == 0) {
147 *crc = symversion(__start___kcrctab, i); 147 *crc = symversion(__start___kcrctab, i);
148 return __start___ksymtab[i].value; 148 return __start___ksymtab[i].value;
149 } 149 }
150 } 150 }
151 if (gplok) { 151 if (gplok) {
152 for (i = 0; __start___ksymtab_gpl+i<__stop___ksymtab_gpl; i++) 152 for (i = 0; __start___ksymtab_gpl+i<__stop___ksymtab_gpl; i++)
153 if (strcmp(__start___ksymtab_gpl[i].name, name) == 0) { 153 if (strcmp(__start___ksymtab_gpl[i].name, name) == 0) {
154 *crc = symversion(__start___kcrctab_gpl, i); 154 *crc = symversion(__start___kcrctab_gpl, i);
155 return __start___ksymtab_gpl[i].value; 155 return __start___ksymtab_gpl[i].value;
156 } 156 }
157 } 157 }
158 158
159 /* Now try modules. */ 159 /* Now try modules. */
160 list_for_each_entry(mod, &modules, list) { 160 list_for_each_entry(mod, &modules, list) {
161 *owner = mod; 161 *owner = mod;
162 for (i = 0; i < mod->num_syms; i++) 162 for (i = 0; i < mod->num_syms; i++)
163 if (strcmp(mod->syms[i].name, name) == 0) { 163 if (strcmp(mod->syms[i].name, name) == 0) {
164 *crc = symversion(mod->crcs, i); 164 *crc = symversion(mod->crcs, i);
165 return mod->syms[i].value; 165 return mod->syms[i].value;
166 } 166 }
167 167
168 if (gplok) { 168 if (gplok) {
169 for (i = 0; i < mod->num_gpl_syms; i++) { 169 for (i = 0; i < mod->num_gpl_syms; i++) {
170 if (strcmp(mod->gpl_syms[i].name, name) == 0) { 170 if (strcmp(mod->gpl_syms[i].name, name) == 0) {
171 *crc = symversion(mod->gpl_crcs, i); 171 *crc = symversion(mod->gpl_crcs, i);
172 return mod->gpl_syms[i].value; 172 return mod->gpl_syms[i].value;
173 } 173 }
174 } 174 }
175 } 175 }
176 } 176 }
177 DEBUGP("Failed to find symbol %s\n", name); 177 DEBUGP("Failed to find symbol %s\n", name);
178 return 0; 178 return 0;
179 } 179 }
180 180
181 /* Find a symbol in this elf symbol table */ 181 /* Find a symbol in this elf symbol table */
182 static unsigned long find_local_symbol(Elf_Shdr *sechdrs, 182 static unsigned long find_local_symbol(Elf_Shdr *sechdrs,
183 unsigned int symindex, 183 unsigned int symindex,
184 const char *strtab, 184 const char *strtab,
185 const char *name) 185 const char *name)
186 { 186 {
187 unsigned int i; 187 unsigned int i;
188 Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr; 188 Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr;
189 189
190 /* Search (defined) internal symbols first. */ 190 /* Search (defined) internal symbols first. */
191 for (i = 1; i < sechdrs[symindex].sh_size/sizeof(*sym); i++) { 191 for (i = 1; i < sechdrs[symindex].sh_size/sizeof(*sym); i++) {
192 if (sym[i].st_shndx != SHN_UNDEF 192 if (sym[i].st_shndx != SHN_UNDEF
193 && strcmp(name, strtab + sym[i].st_name) == 0) 193 && strcmp(name, strtab + sym[i].st_name) == 0)
194 return sym[i].st_value; 194 return sym[i].st_value;
195 } 195 }
196 return 0; 196 return 0;
197 } 197 }
198 198
199 /* Search for module by name: must hold module_mutex. */ 199 /* Search for module by name: must hold module_mutex. */
200 static struct module *find_module(const char *name) 200 static struct module *find_module(const char *name)
201 { 201 {
202 struct module *mod; 202 struct module *mod;
203 203
204 list_for_each_entry(mod, &modules, list) { 204 list_for_each_entry(mod, &modules, list) {
205 if (strcmp(mod->name, name) == 0) 205 if (strcmp(mod->name, name) == 0)
206 return mod; 206 return mod;
207 } 207 }
208 return NULL; 208 return NULL;
209 } 209 }
210 210
211 #ifdef CONFIG_SMP 211 #ifdef CONFIG_SMP
212 /* Number of blocks used and allocated. */ 212 /* Number of blocks used and allocated. */
213 static unsigned int pcpu_num_used, pcpu_num_allocated; 213 static unsigned int pcpu_num_used, pcpu_num_allocated;
214 /* Size of each block. -ve means used. */ 214 /* Size of each block. -ve means used. */
215 static int *pcpu_size; 215 static int *pcpu_size;
216 216
217 static int split_block(unsigned int i, unsigned short size) 217 static int split_block(unsigned int i, unsigned short size)
218 { 218 {
219 /* Reallocation required? */ 219 /* Reallocation required? */
220 if (pcpu_num_used + 1 > pcpu_num_allocated) { 220 if (pcpu_num_used + 1 > pcpu_num_allocated) {
221 int *new = kmalloc(sizeof(new[0]) * pcpu_num_allocated*2, 221 int *new = kmalloc(sizeof(new[0]) * pcpu_num_allocated*2,
222 GFP_KERNEL); 222 GFP_KERNEL);
223 if (!new) 223 if (!new)
224 return 0; 224 return 0;
225 225
226 memcpy(new, pcpu_size, sizeof(new[0])*pcpu_num_allocated); 226 memcpy(new, pcpu_size, sizeof(new[0])*pcpu_num_allocated);
227 pcpu_num_allocated *= 2; 227 pcpu_num_allocated *= 2;
228 kfree(pcpu_size); 228 kfree(pcpu_size);
229 pcpu_size = new; 229 pcpu_size = new;
230 } 230 }
231 231
232 /* Insert a new subblock */ 232 /* Insert a new subblock */
233 memmove(&pcpu_size[i+1], &pcpu_size[i], 233 memmove(&pcpu_size[i+1], &pcpu_size[i],
234 sizeof(pcpu_size[0]) * (pcpu_num_used - i)); 234 sizeof(pcpu_size[0]) * (pcpu_num_used - i));
235 pcpu_num_used++; 235 pcpu_num_used++;
236 236
237 pcpu_size[i+1] -= size; 237 pcpu_size[i+1] -= size;
238 pcpu_size[i] = size; 238 pcpu_size[i] = size;
239 return 1; 239 return 1;
240 } 240 }
241 241
242 static inline unsigned int block_size(int val) 242 static inline unsigned int block_size(int val)
243 { 243 {
244 if (val < 0) 244 if (val < 0)
245 return -val; 245 return -val;
246 return val; 246 return val;
247 } 247 }
248 248
249 /* Created by linker magic */ 249 /* Created by linker magic */
250 extern char __per_cpu_start[], __per_cpu_end[]; 250 extern char __per_cpu_start[], __per_cpu_end[];
251 251
252 static void *percpu_modalloc(unsigned long size, unsigned long align) 252 static void *percpu_modalloc(unsigned long size, unsigned long align)
253 { 253 {
254 unsigned long extra; 254 unsigned long extra;
255 unsigned int i; 255 unsigned int i;
256 void *ptr; 256 void *ptr;
257 257
258 BUG_ON(align > SMP_CACHE_BYTES); 258 BUG_ON(align > SMP_CACHE_BYTES);
259 259
260 ptr = __per_cpu_start; 260 ptr = __per_cpu_start;
261 for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { 261 for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
262 /* Extra for alignment requirement. */ 262 /* Extra for alignment requirement. */
263 extra = ALIGN((unsigned long)ptr, align) - (unsigned long)ptr; 263 extra = ALIGN((unsigned long)ptr, align) - (unsigned long)ptr;
264 BUG_ON(i == 0 && extra != 0); 264 BUG_ON(i == 0 && extra != 0);
265 265
266 if (pcpu_size[i] < 0 || pcpu_size[i] < extra + size) 266 if (pcpu_size[i] < 0 || pcpu_size[i] < extra + size)
267 continue; 267 continue;
268 268
269 /* Transfer extra to previous block. */ 269 /* Transfer extra to previous block. */
270 if (pcpu_size[i-1] < 0) 270 if (pcpu_size[i-1] < 0)
271 pcpu_size[i-1] -= extra; 271 pcpu_size[i-1] -= extra;
272 else 272 else
273 pcpu_size[i-1] += extra; 273 pcpu_size[i-1] += extra;
274 pcpu_size[i] -= extra; 274 pcpu_size[i] -= extra;
275 ptr += extra; 275 ptr += extra;
276 276
277 /* Split block if warranted */ 277 /* Split block if warranted */
278 if (pcpu_size[i] - size > sizeof(unsigned long)) 278 if (pcpu_size[i] - size > sizeof(unsigned long))
279 if (!split_block(i, size)) 279 if (!split_block(i, size))
280 return NULL; 280 return NULL;
281 281
282 /* Mark allocated */ 282 /* Mark allocated */
283 pcpu_size[i] = -pcpu_size[i]; 283 pcpu_size[i] = -pcpu_size[i];
284 return ptr; 284 return ptr;
285 } 285 }
286 286
287 printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n", 287 printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n",
288 size); 288 size);
289 return NULL; 289 return NULL;
290 } 290 }
291 291
292 static void percpu_modfree(void *freeme) 292 static void percpu_modfree(void *freeme)
293 { 293 {
294 unsigned int i; 294 unsigned int i;
295 void *ptr = __per_cpu_start + block_size(pcpu_size[0]); 295 void *ptr = __per_cpu_start + block_size(pcpu_size[0]);
296 296
297 /* First entry is core kernel percpu data. */ 297 /* First entry is core kernel percpu data. */
298 for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { 298 for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
299 if (ptr == freeme) { 299 if (ptr == freeme) {
300 pcpu_size[i] = -pcpu_size[i]; 300 pcpu_size[i] = -pcpu_size[i];
301 goto free; 301 goto free;
302 } 302 }
303 } 303 }
304 BUG(); 304 BUG();
305 305
306 free: 306 free:
307 /* Merge with previous? */ 307 /* Merge with previous? */
308 if (pcpu_size[i-1] >= 0) { 308 if (pcpu_size[i-1] >= 0) {
309 pcpu_size[i-1] += pcpu_size[i]; 309 pcpu_size[i-1] += pcpu_size[i];
310 pcpu_num_used--; 310 pcpu_num_used--;
311 memmove(&pcpu_size[i], &pcpu_size[i+1], 311 memmove(&pcpu_size[i], &pcpu_size[i+1],
312 (pcpu_num_used - i) * sizeof(pcpu_size[0])); 312 (pcpu_num_used - i) * sizeof(pcpu_size[0]));
313 i--; 313 i--;
314 } 314 }
315 /* Merge with next? */ 315 /* Merge with next? */
316 if (i+1 < pcpu_num_used && pcpu_size[i+1] >= 0) { 316 if (i+1 < pcpu_num_used && pcpu_size[i+1] >= 0) {
317 pcpu_size[i] += pcpu_size[i+1]; 317 pcpu_size[i] += pcpu_size[i+1];
318 pcpu_num_used--; 318 pcpu_num_used--;
319 memmove(&pcpu_size[i+1], &pcpu_size[i+2], 319 memmove(&pcpu_size[i+1], &pcpu_size[i+2],
320 (pcpu_num_used - (i+1)) * sizeof(pcpu_size[0])); 320 (pcpu_num_used - (i+1)) * sizeof(pcpu_size[0]));
321 } 321 }
322 } 322 }
323 323
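percpu_modalloc()/percpu_modfree() above manage the module per-cpu area as a flat list of signed block sizes: positive entries are free, negative entries are in use, allocation may split a free block, and freeing flips the sign back and merges with a free neighbour. A user-space toy of the same sign encoding (alignment handling and the merge-with-previous case are omitted; this is an illustration, not kernel code):

#include <stdio.h>
#include <string.h>

static int blocks[16] = { -64, 192 };   /* block 0: 64 bytes used, block 1: 192 bytes free */
static unsigned int nblocks = 2;

static int toy_alloc(int size)
{
        unsigned int i;

        for (i = 0; i < nblocks; i++) {
                if (blocks[i] < size)           /* used (negative) or too small */
                        continue;
                if (blocks[i] - size > 0) {     /* split off the remainder */
                        memmove(&blocks[i + 1], &blocks[i],
                                (nblocks - i) * sizeof(blocks[0]));
                        nblocks++;
                        blocks[i + 1] -= size;
                        blocks[i] = size;
                }
                blocks[i] = -blocks[i];         /* mark used */
                return i;
        }
        return -1;
}

static void toy_free(unsigned int i)
{
        blocks[i] = -blocks[i];                 /* mark free */
        if (i + 1 < nblocks && blocks[i + 1] >= 0) {    /* merge with next? */
                blocks[i] += blocks[i + 1];
                nblocks--;
                memmove(&blocks[i + 1], &blocks[i + 2],
                        (nblocks - (i + 1)) * sizeof(blocks[0]));
        }
}

int main(void)
{
        int a = toy_alloc(32), b = toy_alloc(32);
        unsigned int i;

        toy_free(a);
        toy_free(b);
        for (i = 0; i < nblocks; i++)
                printf("block %u: %d\n", i, blocks[i]);
        return 0;
}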
324 static unsigned int find_pcpusec(Elf_Ehdr *hdr, 324 static unsigned int find_pcpusec(Elf_Ehdr *hdr,
325 Elf_Shdr *sechdrs, 325 Elf_Shdr *sechdrs,
326 const char *secstrings) 326 const char *secstrings)
327 { 327 {
328 return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); 328 return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
329 } 329 }
330 330
331 static int percpu_modinit(void) 331 static int percpu_modinit(void)
332 { 332 {
333 pcpu_num_used = 2; 333 pcpu_num_used = 2;
334 pcpu_num_allocated = 2; 334 pcpu_num_allocated = 2;
335 pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated, 335 pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated,
336 GFP_KERNEL); 336 GFP_KERNEL);
337 /* Static in-kernel percpu data (used). */ 337 /* Static in-kernel percpu data (used). */
338 pcpu_size[0] = -ALIGN(__per_cpu_end-__per_cpu_start, SMP_CACHE_BYTES); 338 pcpu_size[0] = -ALIGN(__per_cpu_end-__per_cpu_start, SMP_CACHE_BYTES);
339 /* Free room. */ 339 /* Free room. */
340 pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0]; 340 pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0];
341 if (pcpu_size[1] < 0) { 341 if (pcpu_size[1] < 0) {
342 printk(KERN_ERR "No per-cpu room for modules.\n"); 342 printk(KERN_ERR "No per-cpu room for modules.\n");
343 pcpu_num_used = 1; 343 pcpu_num_used = 1;
344 } 344 }
345 345
346 return 0; 346 return 0;
347 } 347 }
348 __initcall(percpu_modinit); 348 __initcall(percpu_modinit);
349 #else /* ... !CONFIG_SMP */ 349 #else /* ... !CONFIG_SMP */
350 static inline void *percpu_modalloc(unsigned long size, unsigned long align) 350 static inline void *percpu_modalloc(unsigned long size, unsigned long align)
351 { 351 {
352 return NULL; 352 return NULL;
353 } 353 }
354 static inline void percpu_modfree(void *pcpuptr) 354 static inline void percpu_modfree(void *pcpuptr)
355 { 355 {
356 BUG(); 356 BUG();
357 } 357 }
358 static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, 358 static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,
359 Elf_Shdr *sechdrs, 359 Elf_Shdr *sechdrs,
360 const char *secstrings) 360 const char *secstrings)
361 { 361 {
362 return 0; 362 return 0;
363 } 363 }
364 static inline void percpu_modcopy(void *pcpudst, const void *src, 364 static inline void percpu_modcopy(void *pcpudst, const void *src,
365 unsigned long size) 365 unsigned long size)
366 { 366 {
367 /* pcpusec should be 0, and size of that section should be 0. */ 367 /* pcpusec should be 0, and size of that section should be 0. */
368 BUG_ON(size != 0); 368 BUG_ON(size != 0);
369 } 369 }
370 #endif /* CONFIG_SMP */ 370 #endif /* CONFIG_SMP */
371 371
372 #ifdef CONFIG_MODULE_UNLOAD 372 #ifdef CONFIG_MODULE_UNLOAD
373 /* Init the unload section of the module. */ 373 /* Init the unload section of the module. */
374 static void module_unload_init(struct module *mod) 374 static void module_unload_init(struct module *mod)
375 { 375 {
376 unsigned int i; 376 unsigned int i;
377 377
378 INIT_LIST_HEAD(&mod->modules_which_use_me); 378 INIT_LIST_HEAD(&mod->modules_which_use_me);
379 for (i = 0; i < NR_CPUS; i++) 379 for (i = 0; i < NR_CPUS; i++)
380 local_set(&mod->ref[i].count, 0); 380 local_set(&mod->ref[i].count, 0);
381 /* Hold reference count during initialization. */ 381 /* Hold reference count during initialization. */
382 local_set(&mod->ref[_smp_processor_id()].count, 1); 382 local_set(&mod->ref[raw_smp_processor_id()].count, 1);
383 /* Backwards compatibility macros put refcount during init. */ 383 /* Backwards compatibility macros put refcount during init. */
384 mod->waiter = current; 384 mod->waiter = current;
385 } 385 }
386 386
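The hunk above is where this file meets the patch: module_unload_init() only needs some CPU's slot to carry the initial reference (module_refcount() sums every slot anyway), so the nondebug raw_smp_processor_id() is correct here even if the task migrates a moment later. A user-space analogy of why a "raw" CPU id has to be treated as a hint, using sched_getcpu() (an assumption-laden sketch, not kernel code):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        int before = sched_getcpu();

        /* Anything that sleeps or gets rescheduled may run here. */
        sched_yield();

        if (sched_getcpu() != before)
                printf("migrated away from CPU %d\n", before);
        else
                printf("still on CPU %d (this time)\n", before);
        return 0;
}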
387 /* modules using other modules */ 387 /* modules using other modules */
388 struct module_use 388 struct module_use
389 { 389 {
390 struct list_head list; 390 struct list_head list;
391 struct module *module_which_uses; 391 struct module *module_which_uses;
392 }; 392 };
393 393
394 /* Does a already use b? */ 394 /* Does a already use b? */
395 static int already_uses(struct module *a, struct module *b) 395 static int already_uses(struct module *a, struct module *b)
396 { 396 {
397 struct module_use *use; 397 struct module_use *use;
398 398
399 list_for_each_entry(use, &b->modules_which_use_me, list) { 399 list_for_each_entry(use, &b->modules_which_use_me, list) {
400 if (use->module_which_uses == a) { 400 if (use->module_which_uses == a) {
401 DEBUGP("%s uses %s!\n", a->name, b->name); 401 DEBUGP("%s uses %s!\n", a->name, b->name);
402 return 1; 402 return 1;
403 } 403 }
404 } 404 }
405 DEBUGP("%s does not use %s!\n", a->name, b->name); 405 DEBUGP("%s does not use %s!\n", a->name, b->name);
406 return 0; 406 return 0;
407 } 407 }
408 408
409 /* Module a uses b */ 409 /* Module a uses b */
410 static int use_module(struct module *a, struct module *b) 410 static int use_module(struct module *a, struct module *b)
411 { 411 {
412 struct module_use *use; 412 struct module_use *use;
413 if (b == NULL || already_uses(a, b)) return 1; 413 if (b == NULL || already_uses(a, b)) return 1;
414 414
415 if (!strong_try_module_get(b)) 415 if (!strong_try_module_get(b))
416 return 0; 416 return 0;
417 417
418 DEBUGP("Allocating new usage for %s.\n", a->name); 418 DEBUGP("Allocating new usage for %s.\n", a->name);
419 use = kmalloc(sizeof(*use), GFP_ATOMIC); 419 use = kmalloc(sizeof(*use), GFP_ATOMIC);
420 if (!use) { 420 if (!use) {
421 printk("%s: out of memory loading\n", a->name); 421 printk("%s: out of memory loading\n", a->name);
422 module_put(b); 422 module_put(b);
423 return 0; 423 return 0;
424 } 424 }
425 425
426 use->module_which_uses = a; 426 use->module_which_uses = a;
427 list_add(&use->list, &b->modules_which_use_me); 427 list_add(&use->list, &b->modules_which_use_me);
428 return 1; 428 return 1;
429 } 429 }
430 430
431 /* Clear the unload stuff of the module. */ 431 /* Clear the unload stuff of the module. */
432 static void module_unload_free(struct module *mod) 432 static void module_unload_free(struct module *mod)
433 { 433 {
434 struct module *i; 434 struct module *i;
435 435
436 list_for_each_entry(i, &modules, list) { 436 list_for_each_entry(i, &modules, list) {
437 struct module_use *use; 437 struct module_use *use;
438 438
439 list_for_each_entry(use, &i->modules_which_use_me, list) { 439 list_for_each_entry(use, &i->modules_which_use_me, list) {
440 if (use->module_which_uses == mod) { 440 if (use->module_which_uses == mod) {
441 DEBUGP("%s unusing %s\n", mod->name, i->name); 441 DEBUGP("%s unusing %s\n", mod->name, i->name);
442 module_put(i); 442 module_put(i);
443 list_del(&use->list); 443 list_del(&use->list);
444 kfree(use); 444 kfree(use);
445 /* There can be at most one match. */ 445 /* There can be at most one match. */
446 break; 446 break;
447 } 447 }
448 } 448 }
449 } 449 }
450 } 450 }
451 451
452 #ifdef CONFIG_MODULE_FORCE_UNLOAD 452 #ifdef CONFIG_MODULE_FORCE_UNLOAD
453 static inline int try_force(unsigned int flags) 453 static inline int try_force(unsigned int flags)
454 { 454 {
455 int ret = (flags & O_TRUNC); 455 int ret = (flags & O_TRUNC);
456 if (ret) 456 if (ret)
457 tainted |= TAINT_FORCED_MODULE; 457 tainted |= TAINT_FORCED_MODULE;
458 return ret; 458 return ret;
459 } 459 }
460 #else 460 #else
461 static inline int try_force(unsigned int flags) 461 static inline int try_force(unsigned int flags)
462 { 462 {
463 return 0; 463 return 0;
464 } 464 }
465 #endif /* CONFIG_MODULE_FORCE_UNLOAD */ 465 #endif /* CONFIG_MODULE_FORCE_UNLOAD */
466 466
467 struct stopref 467 struct stopref
468 { 468 {
469 struct module *mod; 469 struct module *mod;
470 int flags; 470 int flags;
471 int *forced; 471 int *forced;
472 }; 472 };
473 473
474 /* Whole machine is stopped with interrupts off when this runs. */ 474 /* Whole machine is stopped with interrupts off when this runs. */
475 static int __try_stop_module(void *_sref) 475 static int __try_stop_module(void *_sref)
476 { 476 {
477 struct stopref *sref = _sref; 477 struct stopref *sref = _sref;
478 478
479 /* If it's not unused, quit unless we are told to block. */ 479 /* If it's not unused, quit unless we are told to block. */
480 if ((sref->flags & O_NONBLOCK) && module_refcount(sref->mod) != 0) { 480 if ((sref->flags & O_NONBLOCK) && module_refcount(sref->mod) != 0) {
481 if (!(*sref->forced = try_force(sref->flags))) 481 if (!(*sref->forced = try_force(sref->flags)))
482 return -EWOULDBLOCK; 482 return -EWOULDBLOCK;
483 } 483 }
484 484
485 /* Mark it as dying. */ 485 /* Mark it as dying. */
486 sref->mod->state = MODULE_STATE_GOING; 486 sref->mod->state = MODULE_STATE_GOING;
487 return 0; 487 return 0;
488 } 488 }
489 489
490 static int try_stop_module(struct module *mod, int flags, int *forced) 490 static int try_stop_module(struct module *mod, int flags, int *forced)
491 { 491 {
492 struct stopref sref = { mod, flags, forced }; 492 struct stopref sref = { mod, flags, forced };
493 493
494 return stop_machine_run(__try_stop_module, &sref, NR_CPUS); 494 return stop_machine_run(__try_stop_module, &sref, NR_CPUS);
495 } 495 }
496 496
497 unsigned int module_refcount(struct module *mod) 497 unsigned int module_refcount(struct module *mod)
498 { 498 {
499 unsigned int i, total = 0; 499 unsigned int i, total = 0;
500 500
501 for (i = 0; i < NR_CPUS; i++) 501 for (i = 0; i < NR_CPUS; i++)
502 total += local_read(&mod->ref[i].count); 502 total += local_read(&mod->ref[i].count);
503 return total; 503 return total;
504 } 504 }
505 EXPORT_SYMBOL(module_refcount); 505 EXPORT_SYMBOL(module_refcount);
506 506
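module_refcount() above sums one local_t per CPU: module_get/module_put only touch the calling CPU's own slot, so the hot path never bounces a shared cache line, and module_unload_init() seeds one slot with the initial reference. A minimal user-space sketch of the same bookkeeping (plain longs instead of local_t, fixed CPU count, no concurrency):

#include <stdio.h>

#define TOY_NR_CPUS 4

static long ref[TOY_NR_CPUS];

static void toy_get(int cpu) { ref[cpu]++; }
static void toy_put(int cpu) { ref[cpu]--; }

static long toy_refcount(void)
{
        long total = 0;
        int i;

        for (i = 0; i < TOY_NR_CPUS; i++)
                total += ref[i];
        return total;
}

int main(void)
{
        ref[0] = 1;             /* the +1 taken at load time, as above */
        toy_get(2);             /* a user grabs the module on CPU 2... */
        toy_put(3);             /* ...and may drop it on CPU 3 */
        printf("refcount = %ld\n", toy_refcount());     /* prints 1 */
        return 0;
}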
507 /* This exists whether we can unload or not */ 507 /* This exists whether we can unload or not */
508 static void free_module(struct module *mod); 508 static void free_module(struct module *mod);
509 509
510 static void wait_for_zero_refcount(struct module *mod) 510 static void wait_for_zero_refcount(struct module *mod)
511 { 511 {
512 /* Since we might sleep for some time, drop the semaphore first */ 512 /* Since we might sleep for some time, drop the semaphore first */
513 up(&module_mutex); 513 up(&module_mutex);
514 for (;;) { 514 for (;;) {
515 DEBUGP("Looking at refcount...\n"); 515 DEBUGP("Looking at refcount...\n");
516 set_current_state(TASK_UNINTERRUPTIBLE); 516 set_current_state(TASK_UNINTERRUPTIBLE);
517 if (module_refcount(mod) == 0) 517 if (module_refcount(mod) == 0)
518 break; 518 break;
519 schedule(); 519 schedule();
520 } 520 }
521 current->state = TASK_RUNNING; 521 current->state = TASK_RUNNING;
522 down(&module_mutex); 522 down(&module_mutex);
523 } 523 }
524 524
525 asmlinkage long 525 asmlinkage long
526 sys_delete_module(const char __user *name_user, unsigned int flags) 526 sys_delete_module(const char __user *name_user, unsigned int flags)
527 { 527 {
528 struct module *mod; 528 struct module *mod;
529 char name[MODULE_NAME_LEN]; 529 char name[MODULE_NAME_LEN];
530 int ret, forced = 0; 530 int ret, forced = 0;
531 531
532 if (!capable(CAP_SYS_MODULE)) 532 if (!capable(CAP_SYS_MODULE))
533 return -EPERM; 533 return -EPERM;
534 534
535 if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0) 535 if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
536 return -EFAULT; 536 return -EFAULT;
537 name[MODULE_NAME_LEN-1] = '\0'; 537 name[MODULE_NAME_LEN-1] = '\0';
538 538
539 if (down_interruptible(&module_mutex) != 0) 539 if (down_interruptible(&module_mutex) != 0)
540 return -EINTR; 540 return -EINTR;
541 541
542 mod = find_module(name); 542 mod = find_module(name);
543 if (!mod) { 543 if (!mod) {
544 ret = -ENOENT; 544 ret = -ENOENT;
545 goto out; 545 goto out;
546 } 546 }
547 547
548 if (!list_empty(&mod->modules_which_use_me)) { 548 if (!list_empty(&mod->modules_which_use_me)) {
549 /* Other modules depend on us: get rid of them first. */ 549 /* Other modules depend on us: get rid of them first. */
550 ret = -EWOULDBLOCK; 550 ret = -EWOULDBLOCK;
551 goto out; 551 goto out;
552 } 552 }
553 553
554 /* Doing init or already dying? */ 554 /* Doing init or already dying? */
555 if (mod->state != MODULE_STATE_LIVE) { 555 if (mod->state != MODULE_STATE_LIVE) {
556 /* FIXME: if (force), slam module count and wake up 556 /* FIXME: if (force), slam module count and wake up
557 waiter --RR */ 557 waiter --RR */
558 DEBUGP("%s already dying\n", mod->name); 558 DEBUGP("%s already dying\n", mod->name);
559 ret = -EBUSY; 559 ret = -EBUSY;
560 goto out; 560 goto out;
561 } 561 }
562 562
563 /* If it has an init func, it must have an exit func to unload */ 563 /* If it has an init func, it must have an exit func to unload */
564 if ((mod->init != NULL && mod->exit == NULL) 564 if ((mod->init != NULL && mod->exit == NULL)
565 || mod->unsafe) { 565 || mod->unsafe) {
566 forced = try_force(flags); 566 forced = try_force(flags);
567 if (!forced) { 567 if (!forced) {
568 /* This module can't be removed */ 568 /* This module can't be removed */
569 ret = -EBUSY; 569 ret = -EBUSY;
570 goto out; 570 goto out;
571 } 571 }
572 } 572 }
573 573
574 /* Set this up before setting mod->state */ 574 /* Set this up before setting mod->state */
575 mod->waiter = current; 575 mod->waiter = current;
576 576
577 /* Stop the machine so refcounts can't move and disable module. */ 577 /* Stop the machine so refcounts can't move and disable module. */
578 ret = try_stop_module(mod, flags, &forced); 578 ret = try_stop_module(mod, flags, &forced);
579 if (ret != 0) 579 if (ret != 0)
580 goto out; 580 goto out;
581 581
582 /* Never wait if forced. */ 582 /* Never wait if forced. */
583 if (!forced && module_refcount(mod) != 0) 583 if (!forced && module_refcount(mod) != 0)
584 wait_for_zero_refcount(mod); 584 wait_for_zero_refcount(mod);
585 585
586 /* Final destruction now no one is using it. */ 586 /* Final destruction now no one is using it. */
587 if (mod->exit != NULL) { 587 if (mod->exit != NULL) {
588 up(&module_mutex); 588 up(&module_mutex);
589 mod->exit(); 589 mod->exit();
590 down(&module_mutex); 590 down(&module_mutex);
591 } 591 }
592 free_module(mod); 592 free_module(mod);
593 593
594 out: 594 out:
595 up(&module_mutex); 595 up(&module_mutex);
596 return ret; 596 return ret;
597 } 597 }
598 598
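For context, sys_delete_module() above is reached from user space via the delete_module(2) system call (what rmmod ultimately issues): O_NONBLOCK asks for -EWOULDBLOCK instead of waiting for the refcount to drop, and O_TRUNC is the force flag checked by try_force(). A hedged stand-alone caller (the module name is made up; CAP_SYS_MODULE is required):

#include <errno.h>
#include <fcntl.h>              /* O_NONBLOCK, O_TRUNC */
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        /* Fail immediately rather than sleep in wait_for_zero_refcount(). */
        if (syscall(SYS_delete_module, "example_mod", O_NONBLOCK) != 0)
                fprintf(stderr, "delete_module: %s\n", strerror(errno));
        return 0;
}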
599 static void print_unload_info(struct seq_file *m, struct module *mod) 599 static void print_unload_info(struct seq_file *m, struct module *mod)
600 { 600 {
601 struct module_use *use; 601 struct module_use *use;
602 int printed_something = 0; 602 int printed_something = 0;
603 603
604 seq_printf(m, " %u ", module_refcount(mod)); 604 seq_printf(m, " %u ", module_refcount(mod));
605 605
606 /* Always include a trailing , so userspace can differentiate 606 /* Always include a trailing , so userspace can differentiate
607 between this and the old multi-field proc format. */ 607 between this and the old multi-field proc format. */
608 list_for_each_entry(use, &mod->modules_which_use_me, list) { 608 list_for_each_entry(use, &mod->modules_which_use_me, list) {
609 printed_something = 1; 609 printed_something = 1;
610 seq_printf(m, "%s,", use->module_which_uses->name); 610 seq_printf(m, "%s,", use->module_which_uses->name);
611 } 611 }
612 612
613 if (mod->unsafe) { 613 if (mod->unsafe) {
614 printed_something = 1; 614 printed_something = 1;
615 seq_printf(m, "[unsafe],"); 615 seq_printf(m, "[unsafe],");
616 } 616 }
617 617
618 if (mod->init != NULL && mod->exit == NULL) { 618 if (mod->init != NULL && mod->exit == NULL) {
619 printed_something = 1; 619 printed_something = 1;
620 seq_printf(m, "[permanent],"); 620 seq_printf(m, "[permanent],");
621 } 621 }
622 622
623 if (!printed_something) 623 if (!printed_something)
624 seq_printf(m, "-"); 624 seq_printf(m, "-");
625 } 625 }
626 626
627 void __symbol_put(const char *symbol) 627 void __symbol_put(const char *symbol)
628 { 628 {
629 struct module *owner; 629 struct module *owner;
630 unsigned long flags; 630 unsigned long flags;
631 const unsigned long *crc; 631 const unsigned long *crc;
632 632
633 spin_lock_irqsave(&modlist_lock, flags); 633 spin_lock_irqsave(&modlist_lock, flags);
634 if (!__find_symbol(symbol, &owner, &crc, 1)) 634 if (!__find_symbol(symbol, &owner, &crc, 1))
635 BUG(); 635 BUG();
636 module_put(owner); 636 module_put(owner);
637 spin_unlock_irqrestore(&modlist_lock, flags); 637 spin_unlock_irqrestore(&modlist_lock, flags);
638 } 638 }
639 EXPORT_SYMBOL(__symbol_put); 639 EXPORT_SYMBOL(__symbol_put);
640 640
641 void symbol_put_addr(void *addr) 641 void symbol_put_addr(void *addr)
642 { 642 {
643 unsigned long flags; 643 unsigned long flags;
644 644
645 spin_lock_irqsave(&modlist_lock, flags); 645 spin_lock_irqsave(&modlist_lock, flags);
646 if (!kernel_text_address((unsigned long)addr)) 646 if (!kernel_text_address((unsigned long)addr))
647 BUG(); 647 BUG();
648 648
649 module_put(module_text_address((unsigned long)addr)); 649 module_put(module_text_address((unsigned long)addr));
650 spin_unlock_irqrestore(&modlist_lock, flags); 650 spin_unlock_irqrestore(&modlist_lock, flags);
651 } 651 }
652 EXPORT_SYMBOL_GPL(symbol_put_addr); 652 EXPORT_SYMBOL_GPL(symbol_put_addr);
653 653
654 static ssize_t show_refcnt(struct module_attribute *mattr, 654 static ssize_t show_refcnt(struct module_attribute *mattr,
655 struct module *mod, char *buffer) 655 struct module *mod, char *buffer)
656 { 656 {
657 /* sysfs holds a reference */ 657 /* sysfs holds a reference */
658 return sprintf(buffer, "%u\n", module_refcount(mod)-1); 658 return sprintf(buffer, "%u\n", module_refcount(mod)-1);
659 } 659 }
660 660
661 static struct module_attribute refcnt = { 661 static struct module_attribute refcnt = {
662 .attr = { .name = "refcnt", .mode = 0444, .owner = THIS_MODULE }, 662 .attr = { .name = "refcnt", .mode = 0444, .owner = THIS_MODULE },
663 .show = show_refcnt, 663 .show = show_refcnt,
664 }; 664 };
665 665
666 #else /* !CONFIG_MODULE_UNLOAD */ 666 #else /* !CONFIG_MODULE_UNLOAD */
667 static void print_unload_info(struct seq_file *m, struct module *mod) 667 static void print_unload_info(struct seq_file *m, struct module *mod)
668 { 668 {
669 /* We don't know the usage count, or what modules are using. */ 669 /* We don't know the usage count, or what modules are using. */
670 seq_printf(m, " - -"); 670 seq_printf(m, " - -");
671 } 671 }
672 672
673 static inline void module_unload_free(struct module *mod) 673 static inline void module_unload_free(struct module *mod)
674 { 674 {
675 } 675 }
676 676
677 static inline int use_module(struct module *a, struct module *b) 677 static inline int use_module(struct module *a, struct module *b)
678 { 678 {
679 return strong_try_module_get(b); 679 return strong_try_module_get(b);
680 } 680 }
681 681
682 static inline void module_unload_init(struct module *mod) 682 static inline void module_unload_init(struct module *mod)
683 { 683 {
684 } 684 }
685 #endif /* CONFIG_MODULE_UNLOAD */ 685 #endif /* CONFIG_MODULE_UNLOAD */
686 686
687 #ifdef CONFIG_OBSOLETE_MODPARM 687 #ifdef CONFIG_OBSOLETE_MODPARM
688 /* Bounds checking done below */ 688 /* Bounds checking done below */
689 static int obsparm_copy_string(const char *val, struct kernel_param *kp) 689 static int obsparm_copy_string(const char *val, struct kernel_param *kp)
690 { 690 {
691 strcpy(kp->arg, val); 691 strcpy(kp->arg, val);
692 return 0; 692 return 0;
693 } 693 }
694 694
695 int set_obsolete(const char *val, struct kernel_param *kp) 695 int set_obsolete(const char *val, struct kernel_param *kp)
696 { 696 {
697 unsigned int min, max; 697 unsigned int min, max;
698 unsigned int size, maxsize; 698 unsigned int size, maxsize;
699 int dummy; 699 int dummy;
700 char *endp; 700 char *endp;
701 const char *p; 701 const char *p;
702 struct obsolete_modparm *obsparm = kp->arg; 702 struct obsolete_modparm *obsparm = kp->arg;
703 703
704 if (!val) { 704 if (!val) {
705 printk(KERN_ERR "Parameter %s needs an argument\n", kp->name); 705 printk(KERN_ERR "Parameter %s needs an argument\n", kp->name);
706 return -EINVAL; 706 return -EINVAL;
707 } 707 }
708 708
709 /* type is: [min[-max]]{b,h,i,l,s} */ 709 /* type is: [min[-max]]{b,h,i,l,s} */
710 p = obsparm->type; 710 p = obsparm->type;
711 min = simple_strtol(p, &endp, 10); 711 min = simple_strtol(p, &endp, 10);
712 if (endp == obsparm->type) 712 if (endp == obsparm->type)
713 min = max = 1; 713 min = max = 1;
714 else if (*endp == '-') { 714 else if (*endp == '-') {
715 p = endp+1; 715 p = endp+1;
716 max = simple_strtol(p, &endp, 10); 716 max = simple_strtol(p, &endp, 10);
717 } else 717 } else
718 max = min; 718 max = min;
719 switch (*endp) { 719 switch (*endp) {
720 case 'b': 720 case 'b':
721 return param_array(kp->name, val, min, max, obsparm->addr, 721 return param_array(kp->name, val, min, max, obsparm->addr,
722 1, param_set_byte, &dummy); 722 1, param_set_byte, &dummy);
723 case 'h': 723 case 'h':
724 return param_array(kp->name, val, min, max, obsparm->addr, 724 return param_array(kp->name, val, min, max, obsparm->addr,
725 sizeof(short), param_set_short, &dummy); 725 sizeof(short), param_set_short, &dummy);
726 case 'i': 726 case 'i':
727 return param_array(kp->name, val, min, max, obsparm->addr, 727 return param_array(kp->name, val, min, max, obsparm->addr,
728 sizeof(int), param_set_int, &dummy); 728 sizeof(int), param_set_int, &dummy);
729 case 'l': 729 case 'l':
730 return param_array(kp->name, val, min, max, obsparm->addr, 730 return param_array(kp->name, val, min, max, obsparm->addr,
731 sizeof(long), param_set_long, &dummy); 731 sizeof(long), param_set_long, &dummy);
732 case 's': 732 case 's':
733 return param_array(kp->name, val, min, max, obsparm->addr, 733 return param_array(kp->name, val, min, max, obsparm->addr,
734 sizeof(char *), param_set_charp, &dummy); 734 sizeof(char *), param_set_charp, &dummy);
735 735
736 case 'c': 736 case 'c':
737 /* Undocumented: 1-5c50 means 1-5 strings of up to 49 chars, 737 /* Undocumented: 1-5c50 means 1-5 strings of up to 49 chars,
738 and the decl is "char xxx[5][50];" */ 738 and the decl is "char xxx[5][50];" */
739 p = endp+1; 739 p = endp+1;
740 maxsize = simple_strtol(p, &endp, 10); 740 maxsize = simple_strtol(p, &endp, 10);
741 /* We check lengths here (yes, this is a hack). */ 741 /* We check lengths here (yes, this is a hack). */
742 p = val; 742 p = val;
743 while (p[size = strcspn(p, ",")]) { 743 while (p[size = strcspn(p, ",")]) {
744 if (size >= maxsize) 744 if (size >= maxsize)
745 goto oversize; 745 goto oversize;
746 p += size+1; 746 p += size+1;
747 } 747 }
748 if (size >= maxsize) 748 if (size >= maxsize)
749 goto oversize; 749 goto oversize;
750 return param_array(kp->name, val, min, max, obsparm->addr, 750 return param_array(kp->name, val, min, max, obsparm->addr,
751 maxsize, obsparm_copy_string, &dummy); 751 maxsize, obsparm_copy_string, &dummy);
752 } 752 }
753 printk(KERN_ERR "Unknown obsolete parameter type %s\n", obsparm->type); 753 printk(KERN_ERR "Unknown obsolete parameter type %s\n", obsparm->type);
754 return -EINVAL; 754 return -EINVAL;
755 oversize: 755 oversize:
756 printk(KERN_ERR 756 printk(KERN_ERR
757 "Parameter %s doesn't fit in %u chars.\n", kp->name, maxsize); 757 "Parameter %s doesn't fit in %u chars.\n", kp->name, maxsize);
758 return -EINVAL; 758 return -EINVAL;
759 } 759 }
760 760
761 static int obsolete_params(const char *name, 761 static int obsolete_params(const char *name,
762 char *args, 762 char *args,
763 struct obsolete_modparm obsparm[], 763 struct obsolete_modparm obsparm[],
764 unsigned int num, 764 unsigned int num,
765 Elf_Shdr *sechdrs, 765 Elf_Shdr *sechdrs,
766 unsigned int symindex, 766 unsigned int symindex,
767 const char *strtab) 767 const char *strtab)
768 { 768 {
769 struct kernel_param *kp; 769 struct kernel_param *kp;
770 unsigned int i; 770 unsigned int i;
771 int ret; 771 int ret;
772 772
773 kp = kmalloc(sizeof(kp[0]) * num, GFP_KERNEL); 773 kp = kmalloc(sizeof(kp[0]) * num, GFP_KERNEL);
774 if (!kp) 774 if (!kp)
775 return -ENOMEM; 775 return -ENOMEM;
776 776
777 for (i = 0; i < num; i++) { 777 for (i = 0; i < num; i++) {
778 char sym_name[128 + sizeof(MODULE_SYMBOL_PREFIX)]; 778 char sym_name[128 + sizeof(MODULE_SYMBOL_PREFIX)];
779 779
780 snprintf(sym_name, sizeof(sym_name), "%s%s", 780 snprintf(sym_name, sizeof(sym_name), "%s%s",
781 MODULE_SYMBOL_PREFIX, obsparm[i].name); 781 MODULE_SYMBOL_PREFIX, obsparm[i].name);
782 782
783 kp[i].name = obsparm[i].name; 783 kp[i].name = obsparm[i].name;
784 kp[i].perm = 000; 784 kp[i].perm = 000;
785 kp[i].set = set_obsolete; 785 kp[i].set = set_obsolete;
786 kp[i].get = NULL; 786 kp[i].get = NULL;
787 obsparm[i].addr 787 obsparm[i].addr
788 = (void *)find_local_symbol(sechdrs, symindex, strtab, 788 = (void *)find_local_symbol(sechdrs, symindex, strtab,
789 sym_name); 789 sym_name);
790 if (!obsparm[i].addr) { 790 if (!obsparm[i].addr) {
791 printk("%s: falsely claims to have parameter %s\n", 791 printk("%s: falsely claims to have parameter %s\n",
792 name, obsparm[i].name); 792 name, obsparm[i].name);
793 ret = -EINVAL; 793 ret = -EINVAL;
794 goto out; 794 goto out;
795 } 795 }
796 kp[i].arg = &obsparm[i]; 796 kp[i].arg = &obsparm[i];
797 } 797 }
798 798
799 ret = parse_args(name, args, kp, num, NULL); 799 ret = parse_args(name, args, kp, num, NULL);
800 out: 800 out:
801 kfree(kp); 801 kfree(kp);
802 return ret; 802 return ret;
803 } 803 }
804 #else 804 #else
805 static int obsolete_params(const char *name, 805 static int obsolete_params(const char *name,
806 char *args, 806 char *args,
807 struct obsolete_modparm obsparm[], 807 struct obsolete_modparm obsparm[],
808 unsigned int num, 808 unsigned int num,
809 Elf_Shdr *sechdrs, 809 Elf_Shdr *sechdrs,
810 unsigned int symindex, 810 unsigned int symindex,
811 const char *strtab) 811 const char *strtab)
812 { 812 {
813 if (num != 0) 813 if (num != 0)
814 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", 814 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
815 name); 815 name);
816 return 0; 816 return 0;
817 } 817 }
818 #endif /* CONFIG_OBSOLETE_MODPARM */ 818 #endif /* CONFIG_OBSOLETE_MODPARM */
819 819
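set_obsolete() above decodes type strings of the form [min[-max]]{b,h,i,l,s}. A small stand-alone sketch of just the count/type parsing (the 'c' fixed-string form and the actual array filling are left out):

#include <stdio.h>
#include <stdlib.h>

static void parse_type(const char *type)
{
        char *endp;
        long min, max;

        min = strtol(type, &endp, 10);
        if (endp == type)               /* no count prefix: exactly one value */
                min = max = 1;
        else if (*endp == '-')
                max = strtol(endp + 1, &endp, 10);
        else
                max = min;

        printf("\"%s\": %ld..%ld values of type '%c'\n", type, min, max, *endp);
}

int main(void)
{
        parse_type("i");        /* 1..1 int           */
        parse_type("1-4i");     /* 1..4 ints          */
        parse_type("16s");      /* 16..16 string ptrs */
        return 0;
}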
820 static const char vermagic[] = VERMAGIC_STRING; 820 static const char vermagic[] = VERMAGIC_STRING;
821 821
822 #ifdef CONFIG_MODVERSIONS 822 #ifdef CONFIG_MODVERSIONS
823 static int check_version(Elf_Shdr *sechdrs, 823 static int check_version(Elf_Shdr *sechdrs,
824 unsigned int versindex, 824 unsigned int versindex,
825 const char *symname, 825 const char *symname,
826 struct module *mod, 826 struct module *mod,
827 const unsigned long *crc) 827 const unsigned long *crc)
828 { 828 {
829 unsigned int i, num_versions; 829 unsigned int i, num_versions;
830 struct modversion_info *versions; 830 struct modversion_info *versions;
831 831
832 /* Exporting module didn't supply crcs? OK, we're already tainted. */ 832 /* Exporting module didn't supply crcs? OK, we're already tainted. */
833 if (!crc) 833 if (!crc)
834 return 1; 834 return 1;
835 835
836 versions = (void *) sechdrs[versindex].sh_addr; 836 versions = (void *) sechdrs[versindex].sh_addr;
837 num_versions = sechdrs[versindex].sh_size 837 num_versions = sechdrs[versindex].sh_size
838 / sizeof(struct modversion_info); 838 / sizeof(struct modversion_info);
839 839
840 for (i = 0; i < num_versions; i++) { 840 for (i = 0; i < num_versions; i++) {
841 if (strcmp(versions[i].name, symname) != 0) 841 if (strcmp(versions[i].name, symname) != 0)
842 continue; 842 continue;
843 843
844 if (versions[i].crc == *crc) 844 if (versions[i].crc == *crc)
845 return 1; 845 return 1;
846 printk("%s: disagrees about version of symbol %s\n", 846 printk("%s: disagrees about version of symbol %s\n",
847 mod->name, symname); 847 mod->name, symname);
848 DEBUGP("Found checksum %lX vs module %lX\n", 848 DEBUGP("Found checksum %lX vs module %lX\n",
849 *crc, versions[i].crc); 849 *crc, versions[i].crc);
850 return 0; 850 return 0;
851 } 851 }
852 /* Not in module's version table. OK, but that taints the kernel. */ 852 /* Not in module's version table. OK, but that taints the kernel. */
853 if (!(tainted & TAINT_FORCED_MODULE)) { 853 if (!(tainted & TAINT_FORCED_MODULE)) {
854 printk("%s: no version for \"%s\" found: kernel tainted.\n", 854 printk("%s: no version for \"%s\" found: kernel tainted.\n",
855 mod->name, symname); 855 mod->name, symname);
856 tainted |= TAINT_FORCED_MODULE; 856 tainted |= TAINT_FORCED_MODULE;
857 } 857 }
858 return 1; 858 return 1;
859 } 859 }
860 860
861 static inline int check_modstruct_version(Elf_Shdr *sechdrs, 861 static inline int check_modstruct_version(Elf_Shdr *sechdrs,
862 unsigned int versindex, 862 unsigned int versindex,
863 struct module *mod) 863 struct module *mod)
864 { 864 {
865 const unsigned long *crc; 865 const unsigned long *crc;
866 struct module *owner; 866 struct module *owner;
867 867
868 if (!__find_symbol("struct_module", &owner, &crc, 1)) 868 if (!__find_symbol("struct_module", &owner, &crc, 1))
869 BUG(); 869 BUG();
870 return check_version(sechdrs, versindex, "struct_module", mod, 870 return check_version(sechdrs, versindex, "struct_module", mod,
871 crc); 871 crc);
872 } 872 }
873 873
874 /* First part is kernel version, which we ignore. */ 874 /* First part is kernel version, which we ignore. */
875 static inline int same_magic(const char *amagic, const char *bmagic) 875 static inline int same_magic(const char *amagic, const char *bmagic)
876 { 876 {
877 amagic += strcspn(amagic, " "); 877 amagic += strcspn(amagic, " ");
878 bmagic += strcspn(bmagic, " "); 878 bmagic += strcspn(bmagic, " ");
879 return strcmp(amagic, bmagic) == 0; 879 return strcmp(amagic, bmagic) == 0;
880 } 880 }
881 #else 881 #else
882 static inline int check_version(Elf_Shdr *sechdrs, 882 static inline int check_version(Elf_Shdr *sechdrs,
883 unsigned int versindex, 883 unsigned int versindex,
884 const char *symname, 884 const char *symname,
885 struct module *mod, 885 struct module *mod,
886 const unsigned long *crc) 886 const unsigned long *crc)
887 { 887 {
888 return 1; 888 return 1;
889 } 889 }
890 890
891 static inline int check_modstruct_version(Elf_Shdr *sechdrs, 891 static inline int check_modstruct_version(Elf_Shdr *sechdrs,
892 unsigned int versindex, 892 unsigned int versindex,
893 struct module *mod) 893 struct module *mod)
894 { 894 {
895 return 1; 895 return 1;
896 } 896 }
897 897
898 static inline int same_magic(const char *amagic, const char *bmagic) 898 static inline int same_magic(const char *amagic, const char *bmagic)
899 { 899 {
900 return strcmp(amagic, bmagic) == 0; 900 return strcmp(amagic, bmagic) == 0;
901 } 901 }
902 #endif /* CONFIG_MODVERSIONS */ 902 #endif /* CONFIG_MODVERSIONS */
903 903
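With CONFIG_MODVERSIONS, same_magic() above skips the leading kernel-version token and requires the rest of the vermagic string (SMP, preempt, compiler, ...) to match exactly. A stand-alone illustration with made-up vermagic strings:

#include <stdio.h>
#include <string.h>

static int toy_same_magic(const char *a, const char *b)
{
        a += strcspn(a, " ");           /* drop the kernel version */
        b += strcspn(b, " ");
        return strcmp(a, b) == 0;
}

int main(void)
{
        printf("%d\n", toy_same_magic("2.6.12 SMP preempt gcc-3.4",
                                      "2.6.11 SMP preempt gcc-3.4"));   /* 1 */
        printf("%d\n", toy_same_magic("2.6.12 SMP preempt gcc-3.4",
                                      "2.6.12 preempt gcc-3.4"));       /* 0 */
        return 0;
}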
904 /* Resolve a symbol for this module. I.e. if we find one, record usage. 904 /* Resolve a symbol for this module. I.e. if we find one, record usage.
905 Must be holding module_mutex. */ 905 Must be holding module_mutex. */
906 static unsigned long resolve_symbol(Elf_Shdr *sechdrs, 906 static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
907 unsigned int versindex, 907 unsigned int versindex,
908 const char *name, 908 const char *name,
909 struct module *mod) 909 struct module *mod)
910 { 910 {
911 struct module *owner; 911 struct module *owner;
912 unsigned long ret; 912 unsigned long ret;
913 const unsigned long *crc; 913 const unsigned long *crc;
914 914
915 spin_lock_irq(&modlist_lock); 915 spin_lock_irq(&modlist_lock);
916 ret = __find_symbol(name, &owner, &crc, mod->license_gplok); 916 ret = __find_symbol(name, &owner, &crc, mod->license_gplok);
917 if (ret) { 917 if (ret) {
918 /* use_module can fail due to OOM, or module unloading */ 918 /* use_module can fail due to OOM, or module unloading */
919 if (!check_version(sechdrs, versindex, name, mod, crc) || 919 if (!check_version(sechdrs, versindex, name, mod, crc) ||
920 !use_module(mod, owner)) 920 !use_module(mod, owner))
921 ret = 0; 921 ret = 0;
922 } 922 }
923 spin_unlock_irq(&modlist_lock); 923 spin_unlock_irq(&modlist_lock);
924 return ret; 924 return ret;
925 } 925 }
926 926
927 927
928 /* 928 /*
929 * /sys/module/foo/sections stuff 929 * /sys/module/foo/sections stuff
930 * J. Corbet <corbet@lwn.net> 930 * J. Corbet <corbet@lwn.net>
931 */ 931 */
932 #ifdef CONFIG_KALLSYMS 932 #ifdef CONFIG_KALLSYMS
933 static ssize_t module_sect_show(struct module_attribute *mattr, 933 static ssize_t module_sect_show(struct module_attribute *mattr,
934 struct module *mod, char *buf) 934 struct module *mod, char *buf)
935 { 935 {
936 struct module_sect_attr *sattr = 936 struct module_sect_attr *sattr =
937 container_of(mattr, struct module_sect_attr, mattr); 937 container_of(mattr, struct module_sect_attr, mattr);
938 return sprintf(buf, "0x%lx\n", sattr->address); 938 return sprintf(buf, "0x%lx\n", sattr->address);
939 } 939 }
940 940
941 static void add_sect_attrs(struct module *mod, unsigned int nsect, 941 static void add_sect_attrs(struct module *mod, unsigned int nsect,
942 char *secstrings, Elf_Shdr *sechdrs) 942 char *secstrings, Elf_Shdr *sechdrs)
943 { 943 {
944 unsigned int nloaded = 0, i, size[2]; 944 unsigned int nloaded = 0, i, size[2];
945 struct module_sect_attrs *sect_attrs; 945 struct module_sect_attrs *sect_attrs;
946 struct module_sect_attr *sattr; 946 struct module_sect_attr *sattr;
947 struct attribute **gattr; 947 struct attribute **gattr;
948 948
949 /* Count loaded sections and allocate structures */ 949 /* Count loaded sections and allocate structures */
950 for (i = 0; i < nsect; i++) 950 for (i = 0; i < nsect; i++)
951 if (sechdrs[i].sh_flags & SHF_ALLOC) 951 if (sechdrs[i].sh_flags & SHF_ALLOC)
952 nloaded++; 952 nloaded++;
953 size[0] = ALIGN(sizeof(*sect_attrs) 953 size[0] = ALIGN(sizeof(*sect_attrs)
954 + nloaded * sizeof(sect_attrs->attrs[0]), 954 + nloaded * sizeof(sect_attrs->attrs[0]),
955 sizeof(sect_attrs->grp.attrs[0])); 955 sizeof(sect_attrs->grp.attrs[0]));
956 size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]); 956 size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]);
957 if (! (sect_attrs = kmalloc(size[0] + size[1], GFP_KERNEL))) 957 if (! (sect_attrs = kmalloc(size[0] + size[1], GFP_KERNEL)))
958 return; 958 return;
959 959
960 /* Setup section attributes. */ 960 /* Setup section attributes. */
961 sect_attrs->grp.name = "sections"; 961 sect_attrs->grp.name = "sections";
962 sect_attrs->grp.attrs = (void *)sect_attrs + size[0]; 962 sect_attrs->grp.attrs = (void *)sect_attrs + size[0];
963 963
964 sattr = &sect_attrs->attrs[0]; 964 sattr = &sect_attrs->attrs[0];
965 gattr = &sect_attrs->grp.attrs[0]; 965 gattr = &sect_attrs->grp.attrs[0];
966 for (i = 0; i < nsect; i++) { 966 for (i = 0; i < nsect; i++) {
967 if (! (sechdrs[i].sh_flags & SHF_ALLOC)) 967 if (! (sechdrs[i].sh_flags & SHF_ALLOC))
968 continue; 968 continue;
969 sattr->address = sechdrs[i].sh_addr; 969 sattr->address = sechdrs[i].sh_addr;
970 strlcpy(sattr->name, secstrings + sechdrs[i].sh_name, 970 strlcpy(sattr->name, secstrings + sechdrs[i].sh_name,
971 MODULE_SECT_NAME_LEN); 971 MODULE_SECT_NAME_LEN);
972 sattr->mattr.show = module_sect_show; 972 sattr->mattr.show = module_sect_show;
973 sattr->mattr.store = NULL; 973 sattr->mattr.store = NULL;
974 sattr->mattr.attr.name = sattr->name; 974 sattr->mattr.attr.name = sattr->name;
975 sattr->mattr.attr.owner = mod; 975 sattr->mattr.attr.owner = mod;
976 sattr->mattr.attr.mode = S_IRUGO; 976 sattr->mattr.attr.mode = S_IRUGO;
977 *(gattr++) = &(sattr++)->mattr.attr; 977 *(gattr++) = &(sattr++)->mattr.attr;
978 } 978 }
979 *gattr = NULL; 979 *gattr = NULL;
980 980
981 if (sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp)) 981 if (sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp))
982 goto out; 982 goto out;
983 983
984 mod->sect_attrs = sect_attrs; 984 mod->sect_attrs = sect_attrs;
985 return; 985 return;
986 out: 986 out:
987 kfree(sect_attrs); 987 kfree(sect_attrs);
988 } 988 }
989 989
990 static void remove_sect_attrs(struct module *mod) 990 static void remove_sect_attrs(struct module *mod)
991 { 991 {
992 if (mod->sect_attrs) { 992 if (mod->sect_attrs) {
993 sysfs_remove_group(&mod->mkobj.kobj, 993 sysfs_remove_group(&mod->mkobj.kobj,
994 &mod->sect_attrs->grp); 994 &mod->sect_attrs->grp);
995 /* We are positive that no one is using any sect attrs 995 /* We are positive that no one is using any sect attrs
996 * at this point. Deallocate immediately. */ 996 * at this point. Deallocate immediately. */
997 kfree(mod->sect_attrs); 997 kfree(mod->sect_attrs);
998 mod->sect_attrs = NULL; 998 mod->sect_attrs = NULL;
999 } 999 }
1000 } 1000 }
1001 1001
1002 1002
1003 #else 1003 #else
1004 static inline void add_sect_attrs(struct module *mod, unsigned int nsect, 1004 static inline void add_sect_attrs(struct module *mod, unsigned int nsect,
1005 char *sectstrings, Elf_Shdr *sechdrs) 1005 char *sectstrings, Elf_Shdr *sechdrs)
1006 { 1006 {
1007 } 1007 }
1008 1008
1009 static inline void remove_sect_attrs(struct module *mod) 1009 static inline void remove_sect_attrs(struct module *mod)
1010 { 1010 {
1011 } 1011 }
1012 #endif /* CONFIG_KALLSYMS */ 1012 #endif /* CONFIG_KALLSYMS */
1013 1013
1014 1014
1015 #ifdef CONFIG_MODULE_UNLOAD 1015 #ifdef CONFIG_MODULE_UNLOAD
1016 static inline int module_add_refcnt_attr(struct module *mod) 1016 static inline int module_add_refcnt_attr(struct module *mod)
1017 { 1017 {
1018 return sysfs_create_file(&mod->mkobj.kobj, &refcnt.attr); 1018 return sysfs_create_file(&mod->mkobj.kobj, &refcnt.attr);
1019 } 1019 }
1020 static void module_remove_refcnt_attr(struct module *mod) 1020 static void module_remove_refcnt_attr(struct module *mod)
1021 { 1021 {
1022 return sysfs_remove_file(&mod->mkobj.kobj, &refcnt.attr); 1022 return sysfs_remove_file(&mod->mkobj.kobj, &refcnt.attr);
1023 } 1023 }
1024 #else 1024 #else
1025 static inline int module_add_refcnt_attr(struct module *mod) 1025 static inline int module_add_refcnt_attr(struct module *mod)
1026 { 1026 {
1027 return 0; 1027 return 0;
1028 } 1028 }
1029 static void module_remove_refcnt_attr(struct module *mod) 1029 static void module_remove_refcnt_attr(struct module *mod)
1030 { 1030 {
1031 } 1031 }
1032 #endif 1032 #endif
1033 1033
1034 1034
1035 static int mod_sysfs_setup(struct module *mod, 1035 static int mod_sysfs_setup(struct module *mod,
1036 struct kernel_param *kparam, 1036 struct kernel_param *kparam,
1037 unsigned int num_params) 1037 unsigned int num_params)
1038 { 1038 {
1039 int err; 1039 int err;
1040 1040
1041 memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); 1041 memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
1042 err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name); 1042 err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name);
1043 if (err) 1043 if (err)
1044 goto out; 1044 goto out;
1045 kobj_set_kset_s(&mod->mkobj, module_subsys); 1045 kobj_set_kset_s(&mod->mkobj, module_subsys);
1046 mod->mkobj.mod = mod; 1046 mod->mkobj.mod = mod;
1047 err = kobject_register(&mod->mkobj.kobj); 1047 err = kobject_register(&mod->mkobj.kobj);
1048 if (err) 1048 if (err)
1049 goto out; 1049 goto out;
1050 1050
1051 err = module_add_refcnt_attr(mod); 1051 err = module_add_refcnt_attr(mod);
1052 if (err) 1052 if (err)
1053 goto out_unreg; 1053 goto out_unreg;
1054 1054
1055 err = module_param_sysfs_setup(mod, kparam, num_params); 1055 err = module_param_sysfs_setup(mod, kparam, num_params);
1056 if (err) 1056 if (err)
1057 goto out_unreg; 1057 goto out_unreg;
1058 1058
1059 return 0; 1059 return 0;
1060 1060
1061 out_unreg: 1061 out_unreg:
1062 kobject_unregister(&mod->mkobj.kobj); 1062 kobject_unregister(&mod->mkobj.kobj);
1063 out: 1063 out:
1064 return err; 1064 return err;
1065 } 1065 }
1066 1066
1067 static void mod_kobject_remove(struct module *mod) 1067 static void mod_kobject_remove(struct module *mod)
1068 { 1068 {
1069 module_remove_refcnt_attr(mod); 1069 module_remove_refcnt_attr(mod);
1070 module_param_sysfs_remove(mod); 1070 module_param_sysfs_remove(mod);
1071 1071
1072 kobject_unregister(&mod->mkobj.kobj); 1072 kobject_unregister(&mod->mkobj.kobj);
1073 } 1073 }
1074 1074
1075 /* 1075 /*
1076 * unlink the module while the whole machine is stopped with interrupts off 1076 * unlink the module while the whole machine is stopped with interrupts off
1077 * - this defends against kallsyms not taking locks 1077 * - this defends against kallsyms not taking locks
1078 */ 1078 */
1079 static int __unlink_module(void *_mod) 1079 static int __unlink_module(void *_mod)
1080 { 1080 {
1081 struct module *mod = _mod; 1081 struct module *mod = _mod;
1082 list_del(&mod->list); 1082 list_del(&mod->list);
1083 return 0; 1083 return 0;
1084 } 1084 }
1085 1085
1086 /* Free a module, remove from lists, etc (must hold module mutex). */ 1086 /* Free a module, remove from lists, etc (must hold module mutex). */
1087 static void free_module(struct module *mod) 1087 static void free_module(struct module *mod)
1088 { 1088 {
1089 /* Delete from various lists */ 1089 /* Delete from various lists */
1090 stop_machine_run(__unlink_module, mod, NR_CPUS); 1090 stop_machine_run(__unlink_module, mod, NR_CPUS);
1091 remove_sect_attrs(mod); 1091 remove_sect_attrs(mod);
1092 mod_kobject_remove(mod); 1092 mod_kobject_remove(mod);
1093 1093
1094 /* Arch-specific cleanup. */ 1094 /* Arch-specific cleanup. */
1095 module_arch_cleanup(mod); 1095 module_arch_cleanup(mod);
1096 1096
1097 /* Module unload stuff */ 1097 /* Module unload stuff */
1098 module_unload_free(mod); 1098 module_unload_free(mod);
1099 1099
1100 /* This may be NULL, but that's OK */ 1100 /* This may be NULL, but that's OK */
1101 module_free(mod, mod->module_init); 1101 module_free(mod, mod->module_init);
1102 kfree(mod->args); 1102 kfree(mod->args);
1103 if (mod->percpu) 1103 if (mod->percpu)
1104 percpu_modfree(mod->percpu); 1104 percpu_modfree(mod->percpu);
1105 1105
1106 /* Finally, free the core (containing the module structure) */ 1106 /* Finally, free the core (containing the module structure) */
1107 module_free(mod, mod->module_core); 1107 module_free(mod, mod->module_core);
1108 } 1108 }
1109 1109
1110 void *__symbol_get(const char *symbol) 1110 void *__symbol_get(const char *symbol)
1111 { 1111 {
1112 struct module *owner; 1112 struct module *owner;
1113 unsigned long value, flags; 1113 unsigned long value, flags;
1114 const unsigned long *crc; 1114 const unsigned long *crc;
1115 1115
1116 spin_lock_irqsave(&modlist_lock, flags); 1116 spin_lock_irqsave(&modlist_lock, flags);
1117 value = __find_symbol(symbol, &owner, &crc, 1); 1117 value = __find_symbol(symbol, &owner, &crc, 1);
1118 if (value && !strong_try_module_get(owner)) 1118 if (value && !strong_try_module_get(owner))
1119 value = 0; 1119 value = 0;
1120 spin_unlock_irqrestore(&modlist_lock, flags); 1120 spin_unlock_irqrestore(&modlist_lock, flags);
1121 1121
1122 return (void *)value; 1122 return (void *)value;
1123 } 1123 }
1124 EXPORT_SYMBOL_GPL(__symbol_get); 1124 EXPORT_SYMBOL_GPL(__symbol_get);
1125 1125
1126 /* Change all symbols so that sh_value encodes the pointer directly. */ 1126 /* Change all symbols so that sh_value encodes the pointer directly. */
1127 static int simplify_symbols(Elf_Shdr *sechdrs, 1127 static int simplify_symbols(Elf_Shdr *sechdrs,
1128 unsigned int symindex, 1128 unsigned int symindex,
1129 const char *strtab, 1129 const char *strtab,
1130 unsigned int versindex, 1130 unsigned int versindex,
1131 unsigned int pcpuindex, 1131 unsigned int pcpuindex,
1132 struct module *mod) 1132 struct module *mod)
1133 { 1133 {
1134 Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr; 1134 Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr;
1135 unsigned long secbase; 1135 unsigned long secbase;
1136 unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym); 1136 unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
1137 int ret = 0; 1137 int ret = 0;
1138 1138
1139 for (i = 1; i < n; i++) { 1139 for (i = 1; i < n; i++) {
1140 switch (sym[i].st_shndx) { 1140 switch (sym[i].st_shndx) {
1141 case SHN_COMMON: 1141 case SHN_COMMON:
1142 /* We compiled with -fno-common. These are not 1142 /* We compiled with -fno-common. These are not
1143 supposed to happen. */ 1143 supposed to happen. */
1144 DEBUGP("Common symbol: %s\n", strtab + sym[i].st_name); 1144 DEBUGP("Common symbol: %s\n", strtab + sym[i].st_name);
1145 printk("%s: please compile with -fno-common\n", 1145 printk("%s: please compile with -fno-common\n",
1146 mod->name); 1146 mod->name);
1147 ret = -ENOEXEC; 1147 ret = -ENOEXEC;
1148 break; 1148 break;
1149 1149
1150 case SHN_ABS: 1150 case SHN_ABS:
1151 /* Don't need to do anything */ 1151 /* Don't need to do anything */
1152 DEBUGP("Absolute symbol: 0x%08lx\n", 1152 DEBUGP("Absolute symbol: 0x%08lx\n",
1153 (long)sym[i].st_value); 1153 (long)sym[i].st_value);
1154 break; 1154 break;
1155 1155
1156 case SHN_UNDEF: 1156 case SHN_UNDEF:
1157 sym[i].st_value 1157 sym[i].st_value
1158 = resolve_symbol(sechdrs, versindex, 1158 = resolve_symbol(sechdrs, versindex,
1159 strtab + sym[i].st_name, mod); 1159 strtab + sym[i].st_name, mod);
1160 1160
1161 /* Ok if resolved. */ 1161 /* Ok if resolved. */
1162 if (sym[i].st_value != 0) 1162 if (sym[i].st_value != 0)
1163 break; 1163 break;
1164 /* Ok if weak. */ 1164 /* Ok if weak. */
1165 if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK) 1165 if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
1166 break; 1166 break;
1167 1167
1168 printk(KERN_WARNING "%s: Unknown symbol %s\n", 1168 printk(KERN_WARNING "%s: Unknown symbol %s\n",
1169 mod->name, strtab + sym[i].st_name); 1169 mod->name, strtab + sym[i].st_name);
1170 ret = -ENOENT; 1170 ret = -ENOENT;
1171 break; 1171 break;
1172 1172
1173 default: 1173 default:
1174 /* Divert to percpu allocation if a percpu var. */ 1174 /* Divert to percpu allocation if a percpu var. */
1175 if (sym[i].st_shndx == pcpuindex) 1175 if (sym[i].st_shndx == pcpuindex)
1176 secbase = (unsigned long)mod->percpu; 1176 secbase = (unsigned long)mod->percpu;
1177 else 1177 else
1178 secbase = sechdrs[sym[i].st_shndx].sh_addr; 1178 secbase = sechdrs[sym[i].st_shndx].sh_addr;
1179 sym[i].st_value += secbase; 1179 sym[i].st_value += secbase;
1180 break; 1180 break;
1181 } 1181 }
1182 } 1182 }
1183 1183
1184 return ret; 1184 return ret;
1185 } 1185 }
1186 1186
1187 /* Update size with this section: return offset. */ 1187 /* Update size with this section: return offset. */
1188 static long get_offset(unsigned long *size, Elf_Shdr *sechdr) 1188 static long get_offset(unsigned long *size, Elf_Shdr *sechdr)
1189 { 1189 {
1190 long ret; 1190 long ret;
1191 1191
1192 ret = ALIGN(*size, sechdr->sh_addralign ?: 1); 1192 ret = ALIGN(*size, sechdr->sh_addralign ?: 1);
1193 *size = ret + sechdr->sh_size; 1193 *size = ret + sechdr->sh_size;
1194 return ret; 1194 return ret;
1195 } 1195 }
1196 1196
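get_offset() above rounds the running size up to the section's alignment, returns that rounded value as the section's offset, then grows the running size by the section length. A worked example with assumed section sizes and alignments:

#include <stdio.h>

#define TOY_ALIGN(x, a) (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

/* Mirrors get_offset(): bump *size to the alignment, hand back the slot,
 * then account for the section's length. */
static unsigned long toy_get_offset(unsigned long *size, unsigned long sh_size,
                                    unsigned long sh_addralign)
{
        unsigned long ret = TOY_ALIGN(*size, sh_addralign ? sh_addralign : 1);
        *size = ret + sh_size;
        return ret;
}

int main(void)
{
        unsigned long core_size = 0;

        printf("%#lx\n", toy_get_offset(&core_size, 0x1234, 16));  /* 0      */
        printf("%#lx\n", toy_get_offset(&core_size, 0x20,   32));  /* 0x1240 */
        printf("%#lx\n", toy_get_offset(&core_size, 0x5,     1));  /* 0x1260 */
        printf("core_size = %#lx\n", core_size);                   /* 0x1265 */
        return 0;
}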
1197 /* Lay out the SHF_ALLOC sections in a way not dissimilar to how ld 1197 /* Lay out the SHF_ALLOC sections in a way not dissimilar to how ld
1198 might -- code, read-only data, read-write data, small data. Tally 1198 might -- code, read-only data, read-write data, small data. Tally
1199 sizes, and place the offsets into sh_entsize fields: high bit means it 1199 sizes, and place the offsets into sh_entsize fields: high bit means it
1200 belongs in init. */ 1200 belongs in init. */
1201 static void layout_sections(struct module *mod, 1201 static void layout_sections(struct module *mod,
1202 const Elf_Ehdr *hdr, 1202 const Elf_Ehdr *hdr,
1203 Elf_Shdr *sechdrs, 1203 Elf_Shdr *sechdrs,
1204 const char *secstrings) 1204 const char *secstrings)
1205 { 1205 {
1206 static unsigned long const masks[][2] = { 1206 static unsigned long const masks[][2] = {
1207 /* NOTE: all executable code must be the first section 1207 /* NOTE: all executable code must be the first section
1208 * in this array; otherwise modify the text_size 1208 * in this array; otherwise modify the text_size
1209 * finder in the two loops below */ 1209 * finder in the two loops below */
1210 { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL }, 1210 { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL },
1211 { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL }, 1211 { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL },
1212 { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL }, 1212 { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL },
1213 { ARCH_SHF_SMALL | SHF_ALLOC, 0 } 1213 { ARCH_SHF_SMALL | SHF_ALLOC, 0 }
1214 }; 1214 };
1215 unsigned int m, i; 1215 unsigned int m, i;
1216 1216
1217 for (i = 0; i < hdr->e_shnum; i++) 1217 for (i = 0; i < hdr->e_shnum; i++)
1218 sechdrs[i].sh_entsize = ~0UL; 1218 sechdrs[i].sh_entsize = ~0UL;
1219 1219
1220 DEBUGP("Core section allocation order:\n"); 1220 DEBUGP("Core section allocation order:\n");
1221 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 1221 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
1222 for (i = 0; i < hdr->e_shnum; ++i) { 1222 for (i = 0; i < hdr->e_shnum; ++i) {
1223 Elf_Shdr *s = &sechdrs[i]; 1223 Elf_Shdr *s = &sechdrs[i];
1224 1224
1225 if ((s->sh_flags & masks[m][0]) != masks[m][0] 1225 if ((s->sh_flags & masks[m][0]) != masks[m][0]
1226 || (s->sh_flags & masks[m][1]) 1226 || (s->sh_flags & masks[m][1])
1227 || s->sh_entsize != ~0UL 1227 || s->sh_entsize != ~0UL
1228 || strncmp(secstrings + s->sh_name, 1228 || strncmp(secstrings + s->sh_name,
1229 ".init", 5) == 0) 1229 ".init", 5) == 0)
1230 continue; 1230 continue;
1231 s->sh_entsize = get_offset(&mod->core_size, s); 1231 s->sh_entsize = get_offset(&mod->core_size, s);
1232 DEBUGP("\t%s\n", secstrings + s->sh_name); 1232 DEBUGP("\t%s\n", secstrings + s->sh_name);
1233 } 1233 }
1234 if (m == 0) 1234 if (m == 0)
1235 mod->core_text_size = mod->core_size; 1235 mod->core_text_size = mod->core_size;
1236 } 1236 }
1237 1237
1238 DEBUGP("Init section allocation order:\n"); 1238 DEBUGP("Init section allocation order:\n");
1239 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 1239 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
1240 for (i = 0; i < hdr->e_shnum; ++i) { 1240 for (i = 0; i < hdr->e_shnum; ++i) {
1241 Elf_Shdr *s = &sechdrs[i]; 1241 Elf_Shdr *s = &sechdrs[i];
1242 1242
1243 if ((s->sh_flags & masks[m][0]) != masks[m][0] 1243 if ((s->sh_flags & masks[m][0]) != masks[m][0]
1244 || (s->sh_flags & masks[m][1]) 1244 || (s->sh_flags & masks[m][1])
1245 || s->sh_entsize != ~0UL 1245 || s->sh_entsize != ~0UL
1246 || strncmp(secstrings + s->sh_name, 1246 || strncmp(secstrings + s->sh_name,
1247 ".init", 5) != 0) 1247 ".init", 5) != 0)
1248 continue; 1248 continue;
1249 s->sh_entsize = (get_offset(&mod->init_size, s) 1249 s->sh_entsize = (get_offset(&mod->init_size, s)
1250 | INIT_OFFSET_MASK); 1250 | INIT_OFFSET_MASK);
1251 DEBUGP("\t%s\n", secstrings + s->sh_name); 1251 DEBUGP("\t%s\n", secstrings + s->sh_name);
1252 } 1252 }
1253 if (m == 0) 1253 if (m == 0)
1254 mod->init_text_size = mod->init_size; 1254 mod->init_text_size = mod->init_size;
1255 } 1255 }
1256 } 1256 }
1257 1257
1258 static inline int license_is_gpl_compatible(const char *license) 1258 static inline int license_is_gpl_compatible(const char *license)
1259 { 1259 {
1260 return (strcmp(license, "GPL") == 0 1260 return (strcmp(license, "GPL") == 0
1261 || strcmp(license, "GPL v2") == 0 1261 || strcmp(license, "GPL v2") == 0
1262 || strcmp(license, "GPL and additional rights") == 0 1262 || strcmp(license, "GPL and additional rights") == 0
1263 || strcmp(license, "Dual BSD/GPL") == 0 1263 || strcmp(license, "Dual BSD/GPL") == 0
1264 || strcmp(license, "Dual MPL/GPL") == 0); 1264 || strcmp(license, "Dual MPL/GPL") == 0);
1265 } 1265 }
1266 1266
1267 static void set_license(struct module *mod, const char *license) 1267 static void set_license(struct module *mod, const char *license)
1268 { 1268 {
1269 if (!license) 1269 if (!license)
1270 license = "unspecified"; 1270 license = "unspecified";
1271 1271
1272 mod->license_gplok = license_is_gpl_compatible(license); 1272 mod->license_gplok = license_is_gpl_compatible(license);
1273 if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) { 1273 if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) {
1274 printk(KERN_WARNING "%s: module license '%s' taints kernel.\n", 1274 printk(KERN_WARNING "%s: module license '%s' taints kernel.\n",
1275 mod->name, license); 1275 mod->name, license);
1276 tainted |= TAINT_PROPRIETARY_MODULE; 1276 tainted |= TAINT_PROPRIETARY_MODULE;
1277 } 1277 }
1278 } 1278 }
1279 1279
1280 /* Parse tag=value strings from .modinfo section */ 1280 /* Parse tag=value strings from .modinfo section */
1281 static char *next_string(char *string, unsigned long *secsize) 1281 static char *next_string(char *string, unsigned long *secsize)
1282 { 1282 {
1283 /* Skip non-zero chars */ 1283 /* Skip non-zero chars */
1284 while (string[0]) { 1284 while (string[0]) {
1285 string++; 1285 string++;
1286 if ((*secsize)-- <= 1) 1286 if ((*secsize)-- <= 1)
1287 return NULL; 1287 return NULL;
1288 } 1288 }
1289 1289
1290 /* Skip any zero padding. */ 1290 /* Skip any zero padding. */
1291 while (!string[0]) { 1291 while (!string[0]) {
1292 string++; 1292 string++;
1293 if ((*secsize)-- <= 1) 1293 if ((*secsize)-- <= 1)
1294 return NULL; 1294 return NULL;
1295 } 1295 }
1296 return string; 1296 return string;
1297 } 1297 }
1298 1298
1299 static char *get_modinfo(Elf_Shdr *sechdrs, 1299 static char *get_modinfo(Elf_Shdr *sechdrs,
1300 unsigned int info, 1300 unsigned int info,
1301 const char *tag) 1301 const char *tag)
1302 { 1302 {
1303 char *p; 1303 char *p;
1304 unsigned int taglen = strlen(tag); 1304 unsigned int taglen = strlen(tag);
1305 unsigned long size = sechdrs[info].sh_size; 1305 unsigned long size = sechdrs[info].sh_size;
1306 1306
1307 for (p = (char *)sechdrs[info].sh_addr; p; p = next_string(p, &size)) { 1307 for (p = (char *)sechdrs[info].sh_addr; p; p = next_string(p, &size)) {
1308 if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') 1308 if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
1309 return p + taglen + 1; 1309 return p + taglen + 1;
1310 } 1310 }
1311 return NULL; 1311 return NULL;
1312 } 1312 }
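The .modinfo section is simply a run of NUL-terminated "tag=value" strings, which is all next_string() and get_modinfo() rely on. A small userspace sketch of the same walk (standalone code, not kernel API):

    #include <stdio.h>
    #include <string.h>

    /* Walk a "tag=value\0tag=value\0..." blob the way get_modinfo() does. */
    static const char *find_tag(const char *p, unsigned long size, const char *tag)
    {
            unsigned long taglen = strlen(tag);

            while (size) {
                    if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
                            return p + taglen + 1;
                    while (size && *p)  { p++; size--; }   /* skip this string */
                    while (size && !*p) { p++; size--; }   /* skip NUL padding */
            }
            return NULL;
    }

    int main(void)
    {
            static const char blob[] = "license=GPL\0vermagic=2.6.12 SMP\0";

            printf("%s\n", find_tag(blob, sizeof(blob), "vermagic"));
            return 0;
    }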
1313 1313
1314 #ifdef CONFIG_KALLSYMS 1314 #ifdef CONFIG_KALLSYMS
1315 int is_exported(const char *name, const struct module *mod) 1315 int is_exported(const char *name, const struct module *mod)
1316 { 1316 {
1317 unsigned int i; 1317 unsigned int i;
1318 1318
1319 if (!mod) { 1319 if (!mod) {
1320 for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++) 1320 for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++)
1321 if (strcmp(__start___ksymtab[i].name, name) == 0) 1321 if (strcmp(__start___ksymtab[i].name, name) == 0)
1322 return 1; 1322 return 1;
1323 return 0; 1323 return 0;
1324 } 1324 }
1325 for (i = 0; i < mod->num_syms; i++) 1325 for (i = 0; i < mod->num_syms; i++)
1326 if (strcmp(mod->syms[i].name, name) == 0) 1326 if (strcmp(mod->syms[i].name, name) == 0)
1327 return 1; 1327 return 1;
1328 return 0; 1328 return 0;
1329 } 1329 }
1330 1330
1331 /* As per nm */ 1331 /* As per nm */
1332 static char elf_type(const Elf_Sym *sym, 1332 static char elf_type(const Elf_Sym *sym,
1333 Elf_Shdr *sechdrs, 1333 Elf_Shdr *sechdrs,
1334 const char *secstrings, 1334 const char *secstrings,
1335 struct module *mod) 1335 struct module *mod)
1336 { 1336 {
1337 if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { 1337 if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
1338 if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) 1338 if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT)
1339 return 'v'; 1339 return 'v';
1340 else 1340 else
1341 return 'w'; 1341 return 'w';
1342 } 1342 }
1343 if (sym->st_shndx == SHN_UNDEF) 1343 if (sym->st_shndx == SHN_UNDEF)
1344 return 'U'; 1344 return 'U';
1345 if (sym->st_shndx == SHN_ABS) 1345 if (sym->st_shndx == SHN_ABS)
1346 return 'a'; 1346 return 'a';
1347 if (sym->st_shndx >= SHN_LORESERVE) 1347 if (sym->st_shndx >= SHN_LORESERVE)
1348 return '?'; 1348 return '?';
1349 if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR) 1349 if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR)
1350 return 't'; 1350 return 't';
1351 if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC 1351 if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC
1352 && sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) { 1352 && sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) {
1353 if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE)) 1353 if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE))
1354 return 'r'; 1354 return 'r';
1355 else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL) 1355 else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
1356 return 'g'; 1356 return 'g';
1357 else 1357 else
1358 return 'd'; 1358 return 'd';
1359 } 1359 }
1360 if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { 1360 if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
1361 if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL) 1361 if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
1362 return 's'; 1362 return 's';
1363 else 1363 else
1364 return 'b'; 1364 return 'b';
1365 } 1365 }
1366 if (strncmp(secstrings + sechdrs[sym->st_shndx].sh_name, 1366 if (strncmp(secstrings + sechdrs[sym->st_shndx].sh_name,
1367 ".debug", strlen(".debug")) == 0) 1367 ".debug", strlen(".debug")) == 0)
1368 return 'n'; 1368 return 'n';
1369 return '?'; 1369 return '?';
1370 } 1370 }
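A quick legend for the characters returned above, which follow nm(1)-style conventions:

    /*
     *   v / w   weak object / weak symbol        U   undefined
     *   a       absolute                         t   text (SHF_EXECINSTR)
     *   r       read-only data                   d   writable data
     *   g / s   small data / small bss           b   bss (SHT_NOBITS)
     *           (ARCH_SHF_SMALL sections)        n   .debug* sections
     *   ?       anything else
     */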
1371 1371
1372 static void add_kallsyms(struct module *mod, 1372 static void add_kallsyms(struct module *mod,
1373 Elf_Shdr *sechdrs, 1373 Elf_Shdr *sechdrs,
1374 unsigned int symindex, 1374 unsigned int symindex,
1375 unsigned int strindex, 1375 unsigned int strindex,
1376 const char *secstrings) 1376 const char *secstrings)
1377 { 1377 {
1378 unsigned int i; 1378 unsigned int i;
1379 1379
1380 mod->symtab = (void *)sechdrs[symindex].sh_addr; 1380 mod->symtab = (void *)sechdrs[symindex].sh_addr;
1381 mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); 1381 mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
1382 mod->strtab = (void *)sechdrs[strindex].sh_addr; 1382 mod->strtab = (void *)sechdrs[strindex].sh_addr;
1383 1383
1384 /* Set types up while we still have access to sections. */ 1384 /* Set types up while we still have access to sections. */
1385 for (i = 0; i < mod->num_symtab; i++) 1385 for (i = 0; i < mod->num_symtab; i++)
1386 mod->symtab[i].st_info 1386 mod->symtab[i].st_info
1387 = elf_type(&mod->symtab[i], sechdrs, secstrings, mod); 1387 = elf_type(&mod->symtab[i], sechdrs, secstrings, mod);
1388 } 1388 }
1389 #else 1389 #else
1390 static inline void add_kallsyms(struct module *mod, 1390 static inline void add_kallsyms(struct module *mod,
1391 Elf_Shdr *sechdrs, 1391 Elf_Shdr *sechdrs,
1392 unsigned int symindex, 1392 unsigned int symindex,
1393 unsigned int strindex, 1393 unsigned int strindex,
1394 const char *secstrings) 1394 const char *secstrings)
1395 { 1395 {
1396 } 1396 }
1397 #endif /* CONFIG_KALLSYMS */ 1397 #endif /* CONFIG_KALLSYMS */
1398 1398
1399 /* Allocate and load the module: note that size of section 0 is always 1399 /* Allocate and load the module: note that size of section 0 is always
1400 zero, and we rely on this for optional sections. */ 1400 zero, and we rely on this for optional sections. */
1401 static struct module *load_module(void __user *umod, 1401 static struct module *load_module(void __user *umod,
1402 unsigned long len, 1402 unsigned long len,
1403 const char __user *uargs) 1403 const char __user *uargs)
1404 { 1404 {
1405 Elf_Ehdr *hdr; 1405 Elf_Ehdr *hdr;
1406 Elf_Shdr *sechdrs; 1406 Elf_Shdr *sechdrs;
1407 char *secstrings, *args, *modmagic, *strtab = NULL; 1407 char *secstrings, *args, *modmagic, *strtab = NULL;
1408 unsigned int i, symindex = 0, strindex = 0, setupindex, exindex, 1408 unsigned int i, symindex = 0, strindex = 0, setupindex, exindex,
1409 exportindex, modindex, obsparmindex, infoindex, gplindex, 1409 exportindex, modindex, obsparmindex, infoindex, gplindex,
1410 crcindex, gplcrcindex, versindex, pcpuindex; 1410 crcindex, gplcrcindex, versindex, pcpuindex;
1411 long arglen; 1411 long arglen;
1412 struct module *mod; 1412 struct module *mod;
1413 long err = 0; 1413 long err = 0;
1414 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1414 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
1415 struct exception_table_entry *extable; 1415 struct exception_table_entry *extable;
1416 1416
1417 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", 1417 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
1418 umod, len, uargs); 1418 umod, len, uargs);
1419 if (len < sizeof(*hdr)) 1419 if (len < sizeof(*hdr))
1420 return ERR_PTR(-ENOEXEC); 1420 return ERR_PTR(-ENOEXEC);
1421 1421
1422 /* Suck in entire file: we'll want most of it. */ 1422 /* Suck in entire file: we'll want most of it. */
1423 /* vmalloc barfs on "unusual" numbers. Check here */ 1423 /* vmalloc barfs on "unusual" numbers. Check here */
1424 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) 1424 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
1425 return ERR_PTR(-ENOMEM); 1425 return ERR_PTR(-ENOMEM);
1426 if (copy_from_user(hdr, umod, len) != 0) { 1426 if (copy_from_user(hdr, umod, len) != 0) {
1427 err = -EFAULT; 1427 err = -EFAULT;
1428 goto free_hdr; 1428 goto free_hdr;
1429 } 1429 }
1430 1430
1431 /* Sanity checks against insmoding binaries or wrong arch, 1431 /* Sanity checks against insmoding binaries or wrong arch,
1432 weird elf version */ 1432 weird elf version */
1433 if (memcmp(hdr->e_ident, ELFMAG, 4) != 0 1433 if (memcmp(hdr->e_ident, ELFMAG, 4) != 0
1434 || hdr->e_type != ET_REL 1434 || hdr->e_type != ET_REL
1435 || !elf_check_arch(hdr) 1435 || !elf_check_arch(hdr)
1436 || hdr->e_shentsize != sizeof(*sechdrs)) { 1436 || hdr->e_shentsize != sizeof(*sechdrs)) {
1437 err = -ENOEXEC; 1437 err = -ENOEXEC;
1438 goto free_hdr; 1438 goto free_hdr;
1439 } 1439 }
1440 1440
1441 if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) 1441 if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr))
1442 goto truncated; 1442 goto truncated;
1443 1443
1444 /* Convenience variables */ 1444 /* Convenience variables */
1445 sechdrs = (void *)hdr + hdr->e_shoff; 1445 sechdrs = (void *)hdr + hdr->e_shoff;
1446 secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; 1446 secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
1447 sechdrs[0].sh_addr = 0; 1447 sechdrs[0].sh_addr = 0;
1448 1448
1449 for (i = 1; i < hdr->e_shnum; i++) { 1449 for (i = 1; i < hdr->e_shnum; i++) {
1450 if (sechdrs[i].sh_type != SHT_NOBITS 1450 if (sechdrs[i].sh_type != SHT_NOBITS
1451 && len < sechdrs[i].sh_offset + sechdrs[i].sh_size) 1451 && len < sechdrs[i].sh_offset + sechdrs[i].sh_size)
1452 goto truncated; 1452 goto truncated;
1453 1453
1454 /* Mark all sections sh_addr with their address in the 1454 /* Mark all sections sh_addr with their address in the
1455 temporary image. */ 1455 temporary image. */
1456 sechdrs[i].sh_addr = (size_t)hdr + sechdrs[i].sh_offset; 1456 sechdrs[i].sh_addr = (size_t)hdr + sechdrs[i].sh_offset;
1457 1457
1458 /* Internal symbols and strings. */ 1458 /* Internal symbols and strings. */
1459 if (sechdrs[i].sh_type == SHT_SYMTAB) { 1459 if (sechdrs[i].sh_type == SHT_SYMTAB) {
1460 symindex = i; 1460 symindex = i;
1461 strindex = sechdrs[i].sh_link; 1461 strindex = sechdrs[i].sh_link;
1462 strtab = (char *)hdr + sechdrs[strindex].sh_offset; 1462 strtab = (char *)hdr + sechdrs[strindex].sh_offset;
1463 } 1463 }
1464 #ifndef CONFIG_MODULE_UNLOAD 1464 #ifndef CONFIG_MODULE_UNLOAD
1465 /* Don't load .exit sections */ 1465 /* Don't load .exit sections */
1466 if (strncmp(secstrings+sechdrs[i].sh_name, ".exit", 5) == 0) 1466 if (strncmp(secstrings+sechdrs[i].sh_name, ".exit", 5) == 0)
1467 sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; 1467 sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
1468 #endif 1468 #endif
1469 } 1469 }
1470 1470
1471 modindex = find_sec(hdr, sechdrs, secstrings, 1471 modindex = find_sec(hdr, sechdrs, secstrings,
1472 ".gnu.linkonce.this_module"); 1472 ".gnu.linkonce.this_module");
1473 if (!modindex) { 1473 if (!modindex) {
1474 printk(KERN_WARNING "No module found in object\n"); 1474 printk(KERN_WARNING "No module found in object\n");
1475 err = -ENOEXEC; 1475 err = -ENOEXEC;
1476 goto free_hdr; 1476 goto free_hdr;
1477 } 1477 }
1478 mod = (void *)sechdrs[modindex].sh_addr; 1478 mod = (void *)sechdrs[modindex].sh_addr;
1479 1479
1480 if (symindex == 0) { 1480 if (symindex == 0) {
1481 printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", 1481 printk(KERN_WARNING "%s: module has no symbols (stripped?)\n",
1482 mod->name); 1482 mod->name);
1483 err = -ENOEXEC; 1483 err = -ENOEXEC;
1484 goto free_hdr; 1484 goto free_hdr;
1485 } 1485 }
1486 1486
1487 /* Optional sections */ 1487 /* Optional sections */
1488 exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab"); 1488 exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab");
1489 gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); 1489 gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl");
1490 crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab"); 1490 crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab");
1491 gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl"); 1491 gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl");
1492 setupindex = find_sec(hdr, sechdrs, secstrings, "__param"); 1492 setupindex = find_sec(hdr, sechdrs, secstrings, "__param");
1493 exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table"); 1493 exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table");
1494 obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm"); 1494 obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm");
1495 versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); 1495 versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
1496 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); 1496 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
1497 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); 1497 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
1498 1498
1499 /* Don't keep modinfo section */ 1499 /* Don't keep modinfo section */
1500 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 1500 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
1501 #ifdef CONFIG_KALLSYMS 1501 #ifdef CONFIG_KALLSYMS
1502 /* Keep symbol and string tables for decoding later. */ 1502 /* Keep symbol and string tables for decoding later. */
1503 sechdrs[symindex].sh_flags |= SHF_ALLOC; 1503 sechdrs[symindex].sh_flags |= SHF_ALLOC;
1504 sechdrs[strindex].sh_flags |= SHF_ALLOC; 1504 sechdrs[strindex].sh_flags |= SHF_ALLOC;
1505 #endif 1505 #endif
1506 1506
1507 /* Check module struct version now, before we try to use module. */ 1507 /* Check module struct version now, before we try to use module. */
1508 if (!check_modstruct_version(sechdrs, versindex, mod)) { 1508 if (!check_modstruct_version(sechdrs, versindex, mod)) {
1509 err = -ENOEXEC; 1509 err = -ENOEXEC;
1510 goto free_hdr; 1510 goto free_hdr;
1511 } 1511 }
1512 1512
1513 modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); 1513 modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
1514 /* This is allowed: modprobe --force will invalidate it. */ 1514 /* This is allowed: modprobe --force will invalidate it. */
1515 if (!modmagic) { 1515 if (!modmagic) {
1516 tainted |= TAINT_FORCED_MODULE; 1516 tainted |= TAINT_FORCED_MODULE;
1517 printk(KERN_WARNING "%s: no version magic, tainting kernel.\n", 1517 printk(KERN_WARNING "%s: no version magic, tainting kernel.\n",
1518 mod->name); 1518 mod->name);
1519 } else if (!same_magic(modmagic, vermagic)) { 1519 } else if (!same_magic(modmagic, vermagic)) {
1520 printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", 1520 printk(KERN_ERR "%s: version magic '%s' should be '%s'\n",
1521 mod->name, modmagic, vermagic); 1521 mod->name, modmagic, vermagic);
1522 err = -ENOEXEC; 1522 err = -ENOEXEC;
1523 goto free_hdr; 1523 goto free_hdr;
1524 } 1524 }
1525 1525
1526 /* Now copy in args */ 1526 /* Now copy in args */
1527 arglen = strlen_user(uargs); 1527 arglen = strlen_user(uargs);
1528 if (!arglen) { 1528 if (!arglen) {
1529 err = -EFAULT; 1529 err = -EFAULT;
1530 goto free_hdr; 1530 goto free_hdr;
1531 } 1531 }
1532 args = kmalloc(arglen, GFP_KERNEL); 1532 args = kmalloc(arglen, GFP_KERNEL);
1533 if (!args) { 1533 if (!args) {
1534 err = -ENOMEM; 1534 err = -ENOMEM;
1535 goto free_hdr; 1535 goto free_hdr;
1536 } 1536 }
1537 if (copy_from_user(args, uargs, arglen) != 0) { 1537 if (copy_from_user(args, uargs, arglen) != 0) {
1538 err = -EFAULT; 1538 err = -EFAULT;
1539 goto free_mod; 1539 goto free_mod;
1540 } 1540 }
1541 1541
1542 if (find_module(mod->name)) { 1542 if (find_module(mod->name)) {
1543 err = -EEXIST; 1543 err = -EEXIST;
1544 goto free_mod; 1544 goto free_mod;
1545 } 1545 }
1546 1546
1547 mod->state = MODULE_STATE_COMING; 1547 mod->state = MODULE_STATE_COMING;
1548 1548
1549 /* Allow arches to frob section contents and sizes. */ 1549 /* Allow arches to frob section contents and sizes. */
1550 err = module_frob_arch_sections(hdr, sechdrs, secstrings, mod); 1550 err = module_frob_arch_sections(hdr, sechdrs, secstrings, mod);
1551 if (err < 0) 1551 if (err < 0)
1552 goto free_mod; 1552 goto free_mod;
1553 1553
1554 if (pcpuindex) { 1554 if (pcpuindex) {
1555 /* We have a special allocation for this section. */ 1555 /* We have a special allocation for this section. */
1556 percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, 1556 percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
1557 sechdrs[pcpuindex].sh_addralign); 1557 sechdrs[pcpuindex].sh_addralign);
1558 if (!percpu) { 1558 if (!percpu) {
1559 err = -ENOMEM; 1559 err = -ENOMEM;
1560 goto free_mod; 1560 goto free_mod;
1561 } 1561 }
1562 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 1562 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
1563 mod->percpu = percpu; 1563 mod->percpu = percpu;
1564 } 1564 }
1565 1565
1566 /* Determine total sizes, and put offsets in sh_entsize. For now 1566 /* Determine total sizes, and put offsets in sh_entsize. For now
1567 this is done generically; there doesn't appear to be any 1567 this is done generically; there doesn't appear to be any
1568 special cases for the architectures. */ 1568 special cases for the architectures. */
1569 layout_sections(mod, hdr, sechdrs, secstrings); 1569 layout_sections(mod, hdr, sechdrs, secstrings);
1570 1570
1571 /* Do the allocs. */ 1571 /* Do the allocs. */
1572 ptr = module_alloc(mod->core_size); 1572 ptr = module_alloc(mod->core_size);
1573 if (!ptr) { 1573 if (!ptr) {
1574 err = -ENOMEM; 1574 err = -ENOMEM;
1575 goto free_percpu; 1575 goto free_percpu;
1576 } 1576 }
1577 memset(ptr, 0, mod->core_size); 1577 memset(ptr, 0, mod->core_size);
1578 mod->module_core = ptr; 1578 mod->module_core = ptr;
1579 1579
1580 ptr = module_alloc(mod->init_size); 1580 ptr = module_alloc(mod->init_size);
1581 if (!ptr && mod->init_size) { 1581 if (!ptr && mod->init_size) {
1582 err = -ENOMEM; 1582 err = -ENOMEM;
1583 goto free_core; 1583 goto free_core;
1584 } 1584 }
1585 memset(ptr, 0, mod->init_size); 1585 memset(ptr, 0, mod->init_size);
1586 mod->module_init = ptr; 1586 mod->module_init = ptr;
1587 1587
1588 /* Transfer each section which specifies SHF_ALLOC */ 1588 /* Transfer each section which specifies SHF_ALLOC */
1589 DEBUGP("final section addresses:\n"); 1589 DEBUGP("final section addresses:\n");
1590 for (i = 0; i < hdr->e_shnum; i++) { 1590 for (i = 0; i < hdr->e_shnum; i++) {
1591 void *dest; 1591 void *dest;
1592 1592
1593 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 1593 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
1594 continue; 1594 continue;
1595 1595
1596 if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK) 1596 if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK)
1597 dest = mod->module_init 1597 dest = mod->module_init
1598 + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK); 1598 + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK);
1599 else 1599 else
1600 dest = mod->module_core + sechdrs[i].sh_entsize; 1600 dest = mod->module_core + sechdrs[i].sh_entsize;
1601 1601
1602 if (sechdrs[i].sh_type != SHT_NOBITS) 1602 if (sechdrs[i].sh_type != SHT_NOBITS)
1603 memcpy(dest, (void *)sechdrs[i].sh_addr, 1603 memcpy(dest, (void *)sechdrs[i].sh_addr,
1604 sechdrs[i].sh_size); 1604 sechdrs[i].sh_size);
1605 /* Update sh_addr to point to copy in image. */ 1605 /* Update sh_addr to point to copy in image. */
1606 sechdrs[i].sh_addr = (unsigned long)dest; 1606 sechdrs[i].sh_addr = (unsigned long)dest;
1607 DEBUGP("\t0x%lx %s\n", sechdrs[i].sh_addr, secstrings + sechdrs[i].sh_name); 1607 DEBUGP("\t0x%lx %s\n", sechdrs[i].sh_addr, secstrings + sechdrs[i].sh_name);
1608 } 1608 }
1609 /* Module has been moved. */ 1609 /* Module has been moved. */
1610 mod = (void *)sechdrs[modindex].sh_addr; 1610 mod = (void *)sechdrs[modindex].sh_addr;
1611 1611
1612 /* Now we've moved module, initialize linked lists, etc. */ 1612 /* Now we've moved module, initialize linked lists, etc. */
1613 module_unload_init(mod); 1613 module_unload_init(mod);
1614 1614
1615 /* Set up license info based on the info section */ 1615 /* Set up license info based on the info section */
1616 set_license(mod, get_modinfo(sechdrs, infoindex, "license")); 1616 set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
1617 1617
1618 /* Fix up syms, so that st_value is a pointer to location. */ 1618 /* Fix up syms, so that st_value is a pointer to location. */
1619 err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex, 1619 err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex,
1620 mod); 1620 mod);
1621 if (err < 0) 1621 if (err < 0)
1622 goto cleanup; 1622 goto cleanup;
1623 1623
1624 /* Set up EXPORTed & EXPORT_GPLed symbols (section 0 is 0 length) */ 1624 /* Set up EXPORTed & EXPORT_GPLed symbols (section 0 is 0 length) */
1625 mod->num_syms = sechdrs[exportindex].sh_size / sizeof(*mod->syms); 1625 mod->num_syms = sechdrs[exportindex].sh_size / sizeof(*mod->syms);
1626 mod->syms = (void *)sechdrs[exportindex].sh_addr; 1626 mod->syms = (void *)sechdrs[exportindex].sh_addr;
1627 if (crcindex) 1627 if (crcindex)
1628 mod->crcs = (void *)sechdrs[crcindex].sh_addr; 1628 mod->crcs = (void *)sechdrs[crcindex].sh_addr;
1629 mod->num_gpl_syms = sechdrs[gplindex].sh_size / sizeof(*mod->gpl_syms); 1629 mod->num_gpl_syms = sechdrs[gplindex].sh_size / sizeof(*mod->gpl_syms);
1630 mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr; 1630 mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr;
1631 if (gplcrcindex) 1631 if (gplcrcindex)
1632 mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; 1632 mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr;
1633 1633
1634 #ifdef CONFIG_MODVERSIONS 1634 #ifdef CONFIG_MODVERSIONS
1635 if ((mod->num_syms && !crcindex) || 1635 if ((mod->num_syms && !crcindex) ||
1636 (mod->num_gpl_syms && !gplcrcindex)) { 1636 (mod->num_gpl_syms && !gplcrcindex)) {
1637 printk(KERN_WARNING "%s: No versions for exported symbols." 1637 printk(KERN_WARNING "%s: No versions for exported symbols."
1638 " Tainting kernel.\n", mod->name); 1638 " Tainting kernel.\n", mod->name);
1639 tainted |= TAINT_FORCED_MODULE; 1639 tainted |= TAINT_FORCED_MODULE;
1640 } 1640 }
1641 #endif 1641 #endif
1642 1642
1643 /* Now do relocations. */ 1643 /* Now do relocations. */
1644 for (i = 1; i < hdr->e_shnum; i++) { 1644 for (i = 1; i < hdr->e_shnum; i++) {
1645 const char *strtab = (char *)sechdrs[strindex].sh_addr; 1645 const char *strtab = (char *)sechdrs[strindex].sh_addr;
1646 unsigned int info = sechdrs[i].sh_info; 1646 unsigned int info = sechdrs[i].sh_info;
1647 1647
1648 /* Not a valid relocation section? */ 1648 /* Not a valid relocation section? */
1649 if (info >= hdr->e_shnum) 1649 if (info >= hdr->e_shnum)
1650 continue; 1650 continue;
1651 1651
1652 /* Don't bother with non-allocated sections */ 1652 /* Don't bother with non-allocated sections */
1653 if (!(sechdrs[info].sh_flags & SHF_ALLOC)) 1653 if (!(sechdrs[info].sh_flags & SHF_ALLOC))
1654 continue; 1654 continue;
1655 1655
1656 if (sechdrs[i].sh_type == SHT_REL) 1656 if (sechdrs[i].sh_type == SHT_REL)
1657 err = apply_relocate(sechdrs, strtab, symindex, i,mod); 1657 err = apply_relocate(sechdrs, strtab, symindex, i,mod);
1658 else if (sechdrs[i].sh_type == SHT_RELA) 1658 else if (sechdrs[i].sh_type == SHT_RELA)
1659 err = apply_relocate_add(sechdrs, strtab, symindex, i, 1659 err = apply_relocate_add(sechdrs, strtab, symindex, i,
1660 mod); 1660 mod);
1661 if (err < 0) 1661 if (err < 0)
1662 goto cleanup; 1662 goto cleanup;
1663 } 1663 }
1664 1664
1665 /* Set up and sort exception table */ 1665 /* Set up and sort exception table */
1666 mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable); 1666 mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable);
1667 mod->extable = extable = (void *)sechdrs[exindex].sh_addr; 1667 mod->extable = extable = (void *)sechdrs[exindex].sh_addr;
1668 sort_extable(extable, extable + mod->num_exentries); 1668 sort_extable(extable, extable + mod->num_exentries);
1669 1669
1670 /* Finally, copy percpu area over. */ 1670 /* Finally, copy percpu area over. */
1671 percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, 1671 percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr,
1672 sechdrs[pcpuindex].sh_size); 1672 sechdrs[pcpuindex].sh_size);
1673 1673
1674 add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); 1674 add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
1675 1675
1676 err = module_finalize(hdr, sechdrs, mod); 1676 err = module_finalize(hdr, sechdrs, mod);
1677 if (err < 0) 1677 if (err < 0)
1678 goto cleanup; 1678 goto cleanup;
1679 1679
1680 mod->args = args; 1680 mod->args = args;
1681 if (obsparmindex) { 1681 if (obsparmindex) {
1682 err = obsolete_params(mod->name, mod->args, 1682 err = obsolete_params(mod->name, mod->args,
1683 (struct obsolete_modparm *) 1683 (struct obsolete_modparm *)
1684 sechdrs[obsparmindex].sh_addr, 1684 sechdrs[obsparmindex].sh_addr,
1685 sechdrs[obsparmindex].sh_size 1685 sechdrs[obsparmindex].sh_size
1686 / sizeof(struct obsolete_modparm), 1686 / sizeof(struct obsolete_modparm),
1687 sechdrs, symindex, 1687 sechdrs, symindex,
1688 (char *)sechdrs[strindex].sh_addr); 1688 (char *)sechdrs[strindex].sh_addr);
1689 if (setupindex) 1689 if (setupindex)
1690 printk(KERN_WARNING "%s: Ignoring new-style " 1690 printk(KERN_WARNING "%s: Ignoring new-style "
1691 "parameters in presence of obsolete ones\n", 1691 "parameters in presence of obsolete ones\n",
1692 mod->name); 1692 mod->name);
1693 } else { 1693 } else {
1694 /* Size of section 0 is 0, so this works well if no params */ 1694 /* Size of section 0 is 0, so this works well if no params */
1695 err = parse_args(mod->name, mod->args, 1695 err = parse_args(mod->name, mod->args,
1696 (struct kernel_param *) 1696 (struct kernel_param *)
1697 sechdrs[setupindex].sh_addr, 1697 sechdrs[setupindex].sh_addr,
1698 sechdrs[setupindex].sh_size 1698 sechdrs[setupindex].sh_size
1699 / sizeof(struct kernel_param), 1699 / sizeof(struct kernel_param),
1700 NULL); 1700 NULL);
1701 } 1701 }
1702 if (err < 0) 1702 if (err < 0)
1703 goto arch_cleanup; 1703 goto arch_cleanup;
1704 1704
1705 err = mod_sysfs_setup(mod, 1705 err = mod_sysfs_setup(mod,
1706 (struct kernel_param *) 1706 (struct kernel_param *)
1707 sechdrs[setupindex].sh_addr, 1707 sechdrs[setupindex].sh_addr,
1708 sechdrs[setupindex].sh_size 1708 sechdrs[setupindex].sh_size
1709 / sizeof(struct kernel_param)); 1709 / sizeof(struct kernel_param));
1710 if (err < 0) 1710 if (err < 0)
1711 goto arch_cleanup; 1711 goto arch_cleanup;
1712 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 1712 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
1713 1713
1714 /* Get rid of temporary copy */ 1714 /* Get rid of temporary copy */
1715 vfree(hdr); 1715 vfree(hdr);
1716 1716
1717 /* Done! */ 1717 /* Done! */
1718 return mod; 1718 return mod;
1719 1719
1720 arch_cleanup: 1720 arch_cleanup:
1721 module_arch_cleanup(mod); 1721 module_arch_cleanup(mod);
1722 cleanup: 1722 cleanup:
1723 module_unload_free(mod); 1723 module_unload_free(mod);
1724 module_free(mod, mod->module_init); 1724 module_free(mod, mod->module_init);
1725 free_core: 1725 free_core:
1726 module_free(mod, mod->module_core); 1726 module_free(mod, mod->module_core);
1727 free_percpu: 1727 free_percpu:
1728 if (percpu) 1728 if (percpu)
1729 percpu_modfree(percpu); 1729 percpu_modfree(percpu);
1730 free_mod: 1730 free_mod:
1731 kfree(args); 1731 kfree(args);
1732 free_hdr: 1732 free_hdr:
1733 vfree(hdr); 1733 vfree(hdr);
1734 if (err < 0) return ERR_PTR(err); 1734 if (err < 0) return ERR_PTR(err);
1735 else return ptr; 1735 else return ptr;
1736 1736
1737 truncated: 1737 truncated:
1738 printk(KERN_ERR "Module len %lu truncated\n", len); 1738 printk(KERN_ERR "Module len %lu truncated\n", len);
1739 err = -ENOEXEC; 1739 err = -ENOEXEC;
1740 goto free_hdr; 1740 goto free_hdr;
1741 } 1741 }
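In outline, load_module() above walks through a fixed sequence; a comment-only summary of the steps as written (nothing new, just the order):

    /*
     * 1. copy the whole ELF image from userspace into a vmalloc'd buffer
     * 2. sanity-check the ELF header and locate the section headers
     * 3. find the special sections (__ksymtab*, __kcrctab*, __param,
     *    __ex_table, .modinfo, the per-cpu section, ...)
     * 4. check struct module version and vermagic, then copy in the args
     * 5. lay out core/init regions, allocate them, copy the SHF_ALLOC
     *    sections across, and apply relocations against the new addresses
     * 6. sort the exception table, copy the per-cpu data, register
     *    kallsyms/sysfs, parse parameters, and free the temporary image
     */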
1742 1742
1743 /* 1743 /*
1744 * link the module while the whole machine is stopped with interrupts off 1744 * link the module while the whole machine is stopped with interrupts off
1745 * - this defends against kallsyms not taking locks 1745 * - this defends against kallsyms not taking locks
1746 */ 1746 */
1747 static int __link_module(void *_mod) 1747 static int __link_module(void *_mod)
1748 { 1748 {
1749 struct module *mod = _mod; 1749 struct module *mod = _mod;
1750 list_add(&mod->list, &modules); 1750 list_add(&mod->list, &modules);
1751 return 0; 1751 return 0;
1752 } 1752 }
1753 1753
1754 /* This is where the real work happens */ 1754 /* This is where the real work happens */
1755 asmlinkage long 1755 asmlinkage long
1756 sys_init_module(void __user *umod, 1756 sys_init_module(void __user *umod,
1757 unsigned long len, 1757 unsigned long len,
1758 const char __user *uargs) 1758 const char __user *uargs)
1759 { 1759 {
1760 struct module *mod; 1760 struct module *mod;
1761 mm_segment_t old_fs = get_fs(); 1761 mm_segment_t old_fs = get_fs();
1762 int ret = 0; 1762 int ret = 0;
1763 1763
1764 /* Must have permission */ 1764 /* Must have permission */
1765 if (!capable(CAP_SYS_MODULE)) 1765 if (!capable(CAP_SYS_MODULE))
1766 return -EPERM; 1766 return -EPERM;
1767 1767
1768 /* Only one module load at a time, please */ 1768 /* Only one module load at a time, please */
1769 if (down_interruptible(&module_mutex) != 0) 1769 if (down_interruptible(&module_mutex) != 0)
1770 return -EINTR; 1770 return -EINTR;
1771 1771
1772 /* Do all the hard work */ 1772 /* Do all the hard work */
1773 mod = load_module(umod, len, uargs); 1773 mod = load_module(umod, len, uargs);
1774 if (IS_ERR(mod)) { 1774 if (IS_ERR(mod)) {
1775 up(&module_mutex); 1775 up(&module_mutex);
1776 return PTR_ERR(mod); 1776 return PTR_ERR(mod);
1777 } 1777 }
1778 1778
1779 /* flush the icache in correct context */ 1779 /* flush the icache in correct context */
1780 set_fs(KERNEL_DS); 1780 set_fs(KERNEL_DS);
1781 1781
1782 /* Flush the instruction cache, since we've played with text */ 1782 /* Flush the instruction cache, since we've played with text */
1783 if (mod->module_init) 1783 if (mod->module_init)
1784 flush_icache_range((unsigned long)mod->module_init, 1784 flush_icache_range((unsigned long)mod->module_init,
1785 (unsigned long)mod->module_init 1785 (unsigned long)mod->module_init
1786 + mod->init_size); 1786 + mod->init_size);
1787 flush_icache_range((unsigned long)mod->module_core, 1787 flush_icache_range((unsigned long)mod->module_core,
1788 (unsigned long)mod->module_core + mod->core_size); 1788 (unsigned long)mod->module_core + mod->core_size);
1789 1789
1790 set_fs(old_fs); 1790 set_fs(old_fs);
1791 1791
1792 /* Now sew it into the lists. They won't access us, since 1792 /* Now sew it into the lists. They won't access us, since
1793 strong_try_module_get() will fail. */ 1793 strong_try_module_get() will fail. */
1794 stop_machine_run(__link_module, mod, NR_CPUS); 1794 stop_machine_run(__link_module, mod, NR_CPUS);
1795 1795
1796 /* Drop lock so they can recurse */ 1796 /* Drop lock so they can recurse */
1797 up(&module_mutex); 1797 up(&module_mutex);
1798 1798
1799 down(&notify_mutex); 1799 down(&notify_mutex);
1800 notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); 1800 notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod);
1801 up(&notify_mutex); 1801 up(&notify_mutex);
1802 1802
1803 /* Start the module */ 1803 /* Start the module */
1804 if (mod->init != NULL) 1804 if (mod->init != NULL)
1805 ret = mod->init(); 1805 ret = mod->init();
1806 if (ret < 0) { 1806 if (ret < 0) {
1807 /* Init routine failed: abort. Try to protect us from 1807 /* Init routine failed: abort. Try to protect us from
1808 buggy refcounters. */ 1808 buggy refcounters. */
1809 mod->state = MODULE_STATE_GOING; 1809 mod->state = MODULE_STATE_GOING;
1810 synchronize_sched(); 1810 synchronize_sched();
1811 if (mod->unsafe) 1811 if (mod->unsafe)
1812 printk(KERN_ERR "%s: module is now stuck!\n", 1812 printk(KERN_ERR "%s: module is now stuck!\n",
1813 mod->name); 1813 mod->name);
1814 else { 1814 else {
1815 module_put(mod); 1815 module_put(mod);
1816 down(&module_mutex); 1816 down(&module_mutex);
1817 free_module(mod); 1817 free_module(mod);
1818 up(&module_mutex); 1818 up(&module_mutex);
1819 } 1819 }
1820 return ret; 1820 return ret;
1821 } 1821 }
1822 1822
1823 /* Now it's a first class citizen! */ 1823 /* Now it's a first class citizen! */
1824 down(&module_mutex); 1824 down(&module_mutex);
1825 mod->state = MODULE_STATE_LIVE; 1825 mod->state = MODULE_STATE_LIVE;
1826 /* Drop initial reference. */ 1826 /* Drop initial reference. */
1827 module_put(mod); 1827 module_put(mod);
1828 module_free(mod, mod->module_init); 1828 module_free(mod, mod->module_init);
1829 mod->module_init = NULL; 1829 mod->module_init = NULL;
1830 mod->init_size = 0; 1830 mod->init_size = 0;
1831 mod->init_text_size = 0; 1831 mod->init_text_size = 0;
1832 up(&module_mutex); 1832 up(&module_mutex);
1833 1833
1834 return 0; 1834 return 0;
1835 } 1835 }
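From userspace this entry point is reached via the init_module(2) system call, which is what insmod does after reading the .ko file. A minimal, error-handling-trimmed sketch of such a loader (file name and arguments are illustrative):

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/stat.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
            struct stat st;
            void *image;
            int fd;

            if (argc < 2)
                    return 1;
            fd = open(argv[1], O_RDONLY);          /* e.g. ./example.ko */
            if (fd < 0 || fstat(fd, &st) < 0)
                    return 1;
            image = malloc(st.st_size);
            if (!image || read(fd, image, st.st_size) != (ssize_t)st.st_size)
                    return 1;

            /* umod, len and uargs match sys_init_module() above. */
            if (syscall(SYS_init_module, image, (unsigned long)st.st_size,
                        argc > 2 ? argv[2] : "") != 0) {
                    perror("init_module");
                    return 1;
            }
            return 0;
    }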
1836 1836
1837 static inline int within(unsigned long addr, void *start, unsigned long size) 1837 static inline int within(unsigned long addr, void *start, unsigned long size)
1838 { 1838 {
1839 return ((void *)addr >= start && (void *)addr < start + size); 1839 return ((void *)addr >= start && (void *)addr < start + size);
1840 } 1840 }
1841 1841
1842 #ifdef CONFIG_KALLSYMS 1842 #ifdef CONFIG_KALLSYMS
1843 /* 1843 /*
1844 * This ignores the intensely annoying "mapping symbols" found 1844 * This ignores the intensely annoying "mapping symbols" found
1845 * in ARM ELF files: $a, $t and $d. 1845 * in ARM ELF files: $a, $t and $d.
1846 */ 1846 */
1847 static inline int is_arm_mapping_symbol(const char *str) 1847 static inline int is_arm_mapping_symbol(const char *str)
1848 { 1848 {
1849 return str[0] == '$' && strchr("atd", str[1]) 1849 return str[0] == '$' && strchr("atd", str[1])
1850 && (str[2] == '\0' || str[2] == '.'); 1850 && (str[2] == '\0' || str[2] == '.');
1851 } 1851 }
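For reference, the symbols filtered here are the ARM ELF "mapping symbols", which mark what kind of contents follow rather than naming anything:

    /*
     *   $a   start of a sequence of ARM instructions
     *   $t   start of a sequence of Thumb instructions
     *   $d   start of a sequence of data
     *
     * Toolchains may append a numbered suffix such as "$a.1", which is
     * why a '.' in the third position is accepted as well.
     */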
1852 1852
1853 static const char *get_ksymbol(struct module *mod, 1853 static const char *get_ksymbol(struct module *mod,
1854 unsigned long addr, 1854 unsigned long addr,
1855 unsigned long *size, 1855 unsigned long *size,
1856 unsigned long *offset) 1856 unsigned long *offset)
1857 { 1857 {
1858 unsigned int i, best = 0; 1858 unsigned int i, best = 0;
1859 unsigned long nextval; 1859 unsigned long nextval;
1860 1860
1861 /* At worst, next value is at end of module */ 1861 /* At worst, next value is at end of module */
1862 if (within(addr, mod->module_init, mod->init_size)) 1862 if (within(addr, mod->module_init, mod->init_size))
1863 nextval = (unsigned long)mod->module_init+mod->init_text_size; 1863 nextval = (unsigned long)mod->module_init+mod->init_text_size;
1864 else 1864 else
1865 nextval = (unsigned long)mod->module_core+mod->core_text_size; 1865 nextval = (unsigned long)mod->module_core+mod->core_text_size;
1866 1866
1867 /* Scan for closest preceding symbol, and next symbol. (ELF 1867 /* Scan for closest preceding symbol, and next symbol. (ELF
1868 starts real symbols at 1). */ 1868 starts real symbols at 1). */
1869 for (i = 1; i < mod->num_symtab; i++) { 1869 for (i = 1; i < mod->num_symtab; i++) {
1870 if (mod->symtab[i].st_shndx == SHN_UNDEF) 1870 if (mod->symtab[i].st_shndx == SHN_UNDEF)
1871 continue; 1871 continue;
1872 1872
1873 /* We ignore unnamed symbols: they're uninformative 1873 /* We ignore unnamed symbols: they're uninformative
1874 * and inserted at a whim. */ 1874 * and inserted at a whim. */
1875 if (mod->symtab[i].st_value <= addr 1875 if (mod->symtab[i].st_value <= addr
1876 && mod->symtab[i].st_value > mod->symtab[best].st_value 1876 && mod->symtab[i].st_value > mod->symtab[best].st_value
1877 && *(mod->strtab + mod->symtab[i].st_name) != '\0' 1877 && *(mod->strtab + mod->symtab[i].st_name) != '\0'
1878 && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) 1878 && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name))
1879 best = i; 1879 best = i;
1880 if (mod->symtab[i].st_value > addr 1880 if (mod->symtab[i].st_value > addr
1881 && mod->symtab[i].st_value < nextval 1881 && mod->symtab[i].st_value < nextval
1882 && *(mod->strtab + mod->symtab[i].st_name) != '\0' 1882 && *(mod->strtab + mod->symtab[i].st_name) != '\0'
1883 && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) 1883 && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name))
1884 nextval = mod->symtab[i].st_value; 1884 nextval = mod->symtab[i].st_value;
1885 } 1885 }
1886 1886
1887 if (!best) 1887 if (!best)
1888 return NULL; 1888 return NULL;
1889 1889
1890 *size = nextval - mod->symtab[best].st_value; 1890 *size = nextval - mod->symtab[best].st_value;
1891 *offset = addr - mod->symtab[best].st_value; 1891 *offset = addr - mod->symtab[best].st_value;
1892 return mod->strtab + mod->symtab[best].st_name; 1892 return mod->strtab + mod->symtab[best].st_name;
1893 } 1893 }
1894 1894
1895 /* For kallsyms to ask for address resolution. NULL means not found. 1895 /* For kallsyms to ask for address resolution. NULL means not found.
1896 We don't lock, as this is used for oops resolution and races are a 1896 We don't lock, as this is used for oops resolution and races are a
1897 lesser concern. */ 1897 lesser concern. */
1898 const char *module_address_lookup(unsigned long addr, 1898 const char *module_address_lookup(unsigned long addr,
1899 unsigned long *size, 1899 unsigned long *size,
1900 unsigned long *offset, 1900 unsigned long *offset,
1901 char **modname) 1901 char **modname)
1902 { 1902 {
1903 struct module *mod; 1903 struct module *mod;
1904 1904
1905 list_for_each_entry(mod, &modules, list) { 1905 list_for_each_entry(mod, &modules, list) {
1906 if (within(addr, mod->module_init, mod->init_size) 1906 if (within(addr, mod->module_init, mod->init_size)
1907 || within(addr, mod->module_core, mod->core_size)) { 1907 || within(addr, mod->module_core, mod->core_size)) {
1908 *modname = mod->name; 1908 *modname = mod->name;
1909 return get_ksymbol(mod, addr, size, offset); 1909 return get_ksymbol(mod, addr, size, offset);
1910 } 1910 }
1911 } 1911 }
1912 return NULL; 1912 return NULL;
1913 } 1913 }
1914 1914
1915 struct module *module_get_kallsym(unsigned int symnum, 1915 struct module *module_get_kallsym(unsigned int symnum,
1916 unsigned long *value, 1916 unsigned long *value,
1917 char *type, 1917 char *type,
1918 char namebuf[128]) 1918 char namebuf[128])
1919 { 1919 {
1920 struct module *mod; 1920 struct module *mod;
1921 1921
1922 down(&module_mutex); 1922 down(&module_mutex);
1923 list_for_each_entry(mod, &modules, list) { 1923 list_for_each_entry(mod, &modules, list) {
1924 if (symnum < mod->num_symtab) { 1924 if (symnum < mod->num_symtab) {
1925 *value = mod->symtab[symnum].st_value; 1925 *value = mod->symtab[symnum].st_value;
1926 *type = mod->symtab[symnum].st_info; 1926 *type = mod->symtab[symnum].st_info;
1927 strncpy(namebuf, 1927 strncpy(namebuf,
1928 mod->strtab + mod->symtab[symnum].st_name, 1928 mod->strtab + mod->symtab[symnum].st_name,
1929 127); 1929 127);
1930 up(&module_mutex); 1930 up(&module_mutex);
1931 return mod; 1931 return mod;
1932 } 1932 }
1933 symnum -= mod->num_symtab; 1933 symnum -= mod->num_symtab;
1934 } 1934 }
1935 up(&module_mutex); 1935 up(&module_mutex);
1936 return NULL; 1936 return NULL;
1937 } 1937 }
1938 1938
1939 static unsigned long mod_find_symname(struct module *mod, const char *name) 1939 static unsigned long mod_find_symname(struct module *mod, const char *name)
1940 { 1940 {
1941 unsigned int i; 1941 unsigned int i;
1942 1942
1943 for (i = 0; i < mod->num_symtab; i++) 1943 for (i = 0; i < mod->num_symtab; i++)
1944 if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0) 1944 if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0)
1945 return mod->symtab[i].st_value; 1945 return mod->symtab[i].st_value;
1946 return 0; 1946 return 0;
1947 } 1947 }
1948 1948
1949 /* Look for this name: can be of form module:name. */ 1949 /* Look for this name: can be of form module:name. */
1950 unsigned long module_kallsyms_lookup_name(const char *name) 1950 unsigned long module_kallsyms_lookup_name(const char *name)
1951 { 1951 {
1952 struct module *mod; 1952 struct module *mod;
1953 char *colon; 1953 char *colon;
1954 unsigned long ret = 0; 1954 unsigned long ret = 0;
1955 1955
1956 /* Don't lock: we're in enough trouble already. */ 1956 /* Don't lock: we're in enough trouble already. */
1957 if ((colon = strchr(name, ':')) != NULL) { 1957 if ((colon = strchr(name, ':')) != NULL) {
1958 *colon = '\0'; 1958 *colon = '\0';
1959 if ((mod = find_module(name)) != NULL) 1959 if ((mod = find_module(name)) != NULL)
1960 ret = mod_find_symname(mod, colon+1); 1960 ret = mod_find_symname(mod, colon+1);
1961 *colon = ':'; 1961 *colon = ':';
1962 } else { 1962 } else {
1963 list_for_each_entry(mod, &modules, list) 1963 list_for_each_entry(mod, &modules, list)
1964 if ((ret = mod_find_symname(mod, name)) != 0) 1964 if ((ret = mod_find_symname(mod, name)) != 0)
1965 break; 1965 break;
1966 } 1966 }
1967 return ret; 1967 return ret;
1968 } 1968 }
1969 #endif /* CONFIG_KALLSYMS */ 1969 #endif /* CONFIG_KALLSYMS */
1970 1970
1971 /* Called by the /proc file system to return a list of modules. */ 1971 /* Called by the /proc file system to return a list of modules. */
1972 static void *m_start(struct seq_file *m, loff_t *pos) 1972 static void *m_start(struct seq_file *m, loff_t *pos)
1973 { 1973 {
1974 struct list_head *i; 1974 struct list_head *i;
1975 loff_t n = 0; 1975 loff_t n = 0;
1976 1976
1977 down(&module_mutex); 1977 down(&module_mutex);
1978 list_for_each(i, &modules) { 1978 list_for_each(i, &modules) {
1979 if (n++ == *pos) 1979 if (n++ == *pos)
1980 break; 1980 break;
1981 } 1981 }
1982 if (i == &modules) 1982 if (i == &modules)
1983 return NULL; 1983 return NULL;
1984 return i; 1984 return i;
1985 } 1985 }
1986 1986
1987 static void *m_next(struct seq_file *m, void *p, loff_t *pos) 1987 static void *m_next(struct seq_file *m, void *p, loff_t *pos)
1988 { 1988 {
1989 struct list_head *i = p; 1989 struct list_head *i = p;
1990 (*pos)++; 1990 (*pos)++;
1991 if (i->next == &modules) 1991 if (i->next == &modules)
1992 return NULL; 1992 return NULL;
1993 return i->next; 1993 return i->next;
1994 } 1994 }
1995 1995
1996 static void m_stop(struct seq_file *m, void *p) 1996 static void m_stop(struct seq_file *m, void *p)
1997 { 1997 {
1998 up(&module_mutex); 1998 up(&module_mutex);
1999 } 1999 }
2000 2000
2001 static int m_show(struct seq_file *m, void *p) 2001 static int m_show(struct seq_file *m, void *p)
2002 { 2002 {
2003 struct module *mod = list_entry(p, struct module, list); 2003 struct module *mod = list_entry(p, struct module, list);
2004 seq_printf(m, "%s %lu", 2004 seq_printf(m, "%s %lu",
2005 mod->name, mod->init_size + mod->core_size); 2005 mod->name, mod->init_size + mod->core_size);
2006 print_unload_info(m, mod); 2006 print_unload_info(m, mod);
2007 2007
2008 /* Informative for users. */ 2008 /* Informative for users. */
2009 seq_printf(m, " %s", 2009 seq_printf(m, " %s",
2010 mod->state == MODULE_STATE_GOING ? "Unloading": 2010 mod->state == MODULE_STATE_GOING ? "Unloading":
2011 mod->state == MODULE_STATE_COMING ? "Loading": 2011 mod->state == MODULE_STATE_COMING ? "Loading":
2012 "Live"); 2012 "Live");
2013 /* Used by oprofile and other similar tools. */ 2013 /* Used by oprofile and other similar tools. */
2014 seq_printf(m, " 0x%p", mod->module_core); 2014 seq_printf(m, " 0x%p", mod->module_core);
2015 2015
2016 seq_printf(m, "\n"); 2016 seq_printf(m, "\n");
2017 return 0; 2017 return 0;
2018 } 2018 }
2019 2019
2020 /* Format: modulename size refcount deps address 2020 /* Format: modulename size refcount deps address
2021 2021
2022 Where refcount is a number or -, and deps is a comma-separated list 2022 Where refcount is a number or -, and deps is a comma-separated list
2023 of depends or -. 2023 of depends or -.
2024 */ 2024 */
2025 struct seq_operations modules_op = { 2025 struct seq_operations modules_op = {
2026 .start = m_start, 2026 .start = m_start,
2027 .next = m_next, 2027 .next = m_next,
2028 .stop = m_stop, 2028 .stop = m_stop,
2029 .show = m_show 2029 .show = m_show
2030 }; 2030 };
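Putting the pieces of m_show() and print_unload_info() together, a /proc/modules line looks roughly like the following (module names, sizes and addresses here are made up):

    example_mod 12288 2 first_user,second_user, Live 0xf8a4b000
    standalone_mod 4096 0 - Live 0xf8a51000

The middle columns come from print_unload_info(); without CONFIG_MODULE_UNLOAD the refcount and deps columns are just printed as dashes.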
2031 2031
2032 /* Given an address, look for it in the module exception tables. */ 2032 /* Given an address, look for it in the module exception tables. */
2033 const struct exception_table_entry *search_module_extables(unsigned long addr) 2033 const struct exception_table_entry *search_module_extables(unsigned long addr)
2034 { 2034 {
2035 unsigned long flags; 2035 unsigned long flags;
2036 const struct exception_table_entry *e = NULL; 2036 const struct exception_table_entry *e = NULL;
2037 struct module *mod; 2037 struct module *mod;
2038 2038
2039 spin_lock_irqsave(&modlist_lock, flags); 2039 spin_lock_irqsave(&modlist_lock, flags);
2040 list_for_each_entry(mod, &modules, list) { 2040 list_for_each_entry(mod, &modules, list) {
2041 if (mod->num_exentries == 0) 2041 if (mod->num_exentries == 0)
2042 continue; 2042 continue;
2043 2043
2044 e = search_extable(mod->extable, 2044 e = search_extable(mod->extable,
2045 mod->extable + mod->num_exentries - 1, 2045 mod->extable + mod->num_exentries - 1,
2046 addr); 2046 addr);
2047 if (e) 2047 if (e)
2048 break; 2048 break;
2049 } 2049 }
2050 spin_unlock_irqrestore(&modlist_lock, flags); 2050 spin_unlock_irqrestore(&modlist_lock, flags);
2051 2051
2052 /* Now, if we found one, we are running inside it now, hence 2052 /* Now, if we found one, we are running inside it now, hence
2053 we cannot unload the module, hence no refcnt needed. */ 2053 we cannot unload the module, hence no refcnt needed. */
2054 return e; 2054 return e;
2055 } 2055 }
2056 2056
2057 /* Is this a valid kernel address? We don't grab the lock: we are oopsing. */ 2057 /* Is this a valid kernel address? We don't grab the lock: we are oopsing. */
2058 struct module *__module_text_address(unsigned long addr) 2058 struct module *__module_text_address(unsigned long addr)
2059 { 2059 {
2060 struct module *mod; 2060 struct module *mod;
2061 2061
2062 list_for_each_entry(mod, &modules, list) 2062 list_for_each_entry(mod, &modules, list)
2063 if (within(addr, mod->module_init, mod->init_text_size) 2063 if (within(addr, mod->module_init, mod->init_text_size)
2064 || within(addr, mod->module_core, mod->core_text_size)) 2064 || within(addr, mod->module_core, mod->core_text_size))
2065 return mod; 2065 return mod;
2066 return NULL; 2066 return NULL;
2067 } 2067 }
2068 2068
2069 struct module *module_text_address(unsigned long addr) 2069 struct module *module_text_address(unsigned long addr)
2070 { 2070 {
2071 struct module *mod; 2071 struct module *mod;
2072 unsigned long flags; 2072 unsigned long flags;
2073 2073
2074 spin_lock_irqsave(&modlist_lock, flags); 2074 spin_lock_irqsave(&modlist_lock, flags);
2075 mod = __module_text_address(addr); 2075 mod = __module_text_address(addr);
2076 spin_unlock_irqrestore(&modlist_lock, flags); 2076 spin_unlock_irqrestore(&modlist_lock, flags);
2077 2077
2078 return mod; 2078 return mod;
2079 } 2079 }
2080 2080
2081 /* Don't grab lock, we're oopsing. */ 2081 /* Don't grab lock, we're oopsing. */
2082 void print_modules(void) 2082 void print_modules(void)
2083 { 2083 {
2084 struct module *mod; 2084 struct module *mod;
2085 2085
2086 printk("Modules linked in:"); 2086 printk("Modules linked in:");
2087 list_for_each_entry(mod, &modules, list) 2087 list_for_each_entry(mod, &modules, list)
2088 printk(" %s", mod->name); 2088 printk(" %s", mod->name);
2089 printk("\n"); 2089 printk("\n");
2090 } 2090 }
2091 2091
2092 void module_add_driver(struct module *mod, struct device_driver *drv) 2092 void module_add_driver(struct module *mod, struct device_driver *drv)
2093 { 2093 {
2094 if (!mod || !drv) 2094 if (!mod || !drv)
2095 return; 2095 return;
2096 2096
2097 /* Don't check return code; this call is idempotent */ 2097 /* Don't check return code; this call is idempotent */
2098 sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module"); 2098 sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module");
2099 } 2099 }
2100 EXPORT_SYMBOL(module_add_driver); 2100 EXPORT_SYMBOL(module_add_driver);
2101 2101
2102 void module_remove_driver(struct device_driver *drv) 2102 void module_remove_driver(struct device_driver *drv)
2103 { 2103 {
2104 if (!drv) 2104 if (!drv)
2105 return; 2105 return;
2106 sysfs_remove_link(&drv->kobj, "module"); 2106 sysfs_remove_link(&drv->kobj, "module");
2107 } 2107 }
2108 EXPORT_SYMBOL(module_remove_driver); 2108 EXPORT_SYMBOL(module_remove_driver);
2109 2109
2110 #ifdef CONFIG_MODVERSIONS 2110 #ifdef CONFIG_MODVERSIONS
2111 /* Generate the signature for struct module here, too, for modversions. */ 2111 /* Generate the signature for struct module here, too, for modversions. */
2112 void struct_module(struct module *mod) { return; } 2112 void struct_module(struct module *mod) { return; }
2113 EXPORT_SYMBOL(struct_module); 2113 EXPORT_SYMBOL(struct_module);
2114 #endif 2114 #endif
2115 2115
1 /* 1 /*
2 * drivers/power/smp.c - Functions for stopping other CPUs. 2 * drivers/power/smp.c - Functions for stopping other CPUs.
3 * 3 *
4 * Copyright 2004 Pavel Machek <pavel@suse.cz> 4 * Copyright 2004 Pavel Machek <pavel@suse.cz>
5 * Copyright (C) 2002-2003 Nigel Cunningham <ncunningham@clear.net.nz> 5 * Copyright (C) 2002-2003 Nigel Cunningham <ncunningham@clear.net.nz>
6 * 6 *
7 * This file is released under the GPLv2. 7 * This file is released under the GPLv2.
8 */ 8 */
9 9
10 #undef DEBUG 10 #undef DEBUG
11 11
12 #include <linux/smp_lock.h> 12 #include <linux/smp_lock.h>
13 #include <linux/interrupt.h> 13 #include <linux/interrupt.h>
14 #include <linux/suspend.h> 14 #include <linux/suspend.h>
15 #include <linux/module.h> 15 #include <linux/module.h>
16 #include <asm/atomic.h> 16 #include <asm/atomic.h>
17 #include <asm/tlbflush.h> 17 #include <asm/tlbflush.h>
18 18
19 static atomic_t cpu_counter, freeze; 19 static atomic_t cpu_counter, freeze;
20 20
21 21
22 static void smp_pause(void * data) 22 static void smp_pause(void * data)
23 { 23 {
24 struct saved_context ctxt; 24 struct saved_context ctxt;
25 __save_processor_state(&ctxt); 25 __save_processor_state(&ctxt);
26 printk("Sleeping in:\n"); 26 printk("Sleeping in:\n");
27 dump_stack(); 27 dump_stack();
28 atomic_inc(&cpu_counter); 28 atomic_inc(&cpu_counter);
29 while (atomic_read(&freeze)) { 29 while (atomic_read(&freeze)) {
30 /* FIXME: restore takes place at a random point inside this. 30 /* FIXME: restore takes place at a random point inside this.
31 This should probably be written in assembly, and 31 This should probably be written in assembly, and
32 preserve general-purpose registers, too 32 preserve general-purpose registers, too
33 33
34 What about the stack? We may need to move to a new stack here. 34 What about the stack? We may need to move to a new stack here.
35 35
36 This had better be run with interrupts disabled. 36 This had better be run with interrupts disabled.
37 */ 37 */
38 cpu_relax(); 38 cpu_relax();
39 barrier(); 39 barrier();
40 } 40 }
41 atomic_dec(&cpu_counter); 41 atomic_dec(&cpu_counter);
42 __restore_processor_state(&ctxt); 42 __restore_processor_state(&ctxt);
43 } 43 }
44 44
45 static cpumask_t oldmask; 45 static cpumask_t oldmask;
46 46
47 void disable_nonboot_cpus(void) 47 void disable_nonboot_cpus(void)
48 { 48 {
49 oldmask = current->cpus_allowed; 49 oldmask = current->cpus_allowed;
50 set_cpus_allowed(current, cpumask_of_cpu(0)); 50 set_cpus_allowed(current, cpumask_of_cpu(0));
51 printk("Freezing CPUs (at %d)", _smp_processor_id()); 51 printk("Freezing CPUs (at %d)", raw_smp_processor_id());
52 current->state = TASK_INTERRUPTIBLE; 52 current->state = TASK_INTERRUPTIBLE;
53 schedule_timeout(HZ); 53 schedule_timeout(HZ);
54 printk("..."); 54 printk("...");
55 BUG_ON(_smp_processor_id() != 0); 55 BUG_ON(raw_smp_processor_id() != 0);
56 56
57 /* FIXME: for this to work, all the CPUs must be running 57 /* FIXME: for this to work, all the CPUs must be running
58 * "idle" thread (or we deadlock). Is that guaranteed? */ 58 * "idle" thread (or we deadlock). Is that guaranteed? */
59 59
60 atomic_set(&cpu_counter, 0); 60 atomic_set(&cpu_counter, 0);
61 atomic_set(&freeze, 1); 61 atomic_set(&freeze, 1);
62 smp_call_function(smp_pause, NULL, 0, 0); 62 smp_call_function(smp_pause, NULL, 0, 0);
63 while (atomic_read(&cpu_counter) < (num_online_cpus() - 1)) { 63 while (atomic_read(&cpu_counter) < (num_online_cpus() - 1)) {
64 cpu_relax(); 64 cpu_relax();
65 barrier(); 65 barrier();
66 } 66 }
67 printk("ok\n"); 67 printk("ok\n");
68 } 68 }
69 69
70 void enable_nonboot_cpus(void) 70 void enable_nonboot_cpus(void)
71 { 71 {
72 printk("Restarting CPUs"); 72 printk("Restarting CPUs");
73 atomic_set(&freeze, 0); 73 atomic_set(&freeze, 0);
74 while (atomic_read(&cpu_counter)) { 74 while (atomic_read(&cpu_counter)) {
75 cpu_relax(); 75 cpu_relax();
76 barrier(); 76 barrier();
77 } 77 }
78 printk("..."); 78 printk("...");
79 set_cpus_allowed(current, oldmask); 79 set_cpus_allowed(current, oldmask);
80 schedule(); 80 schedule();
81 printk("ok\n"); 81 printk("ok\n");
82 82
83 } 83 }
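The two conversions in this file show the intent of the new API: raw_smp_processor_id() is the unchecked variant for callers that can tolerate a momentarily stale CPU number (such as the informational printk()s above), while smp_processor_id() remains the checked one. A minimal sketch of the distinction, not taken from this file:

    #include <linux/kernel.h>
    #include <linux/preempt.h>
    #include <linux/smp.h>

    static void cpu_id_demo(void)
    {
            int cpu;

            /*
             * Checked variant: with CONFIG_DEBUG_PREEMPT this warns if the
             * caller could migrate between CPUs, so pin preemption first.
             */
            preempt_disable();
            cpu = smp_processor_id();
            /* ... strictly per-CPU work using "cpu" ... */
            preempt_enable();

            /*
             * Unchecked variant: fine where an occasionally stale ID is
             * harmless, e.g. debugging or statistics output.
             */
            printk(KERN_DEBUG "last ran on CPU %d\n", raw_smp_processor_id());
            (void)cpu;
    }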
84 84
85 85
86 86
1 /* 1 /*
2 * kernel/sched.c 2 * kernel/sched.c
3 * 3 *
4 * Kernel scheduler and related syscalls 4 * Kernel scheduler and related syscalls
5 * 5 *
6 * Copyright (C) 1991-2002 Linus Torvalds 6 * Copyright (C) 1991-2002 Linus Torvalds
7 * 7 *
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and 8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe 9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff 10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli 11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: 12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with 13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices 14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions 15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas. 17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin 18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 */ 19 */
20 20
21 #include <linux/mm.h> 21 #include <linux/mm.h>
22 #include <linux/module.h> 22 #include <linux/module.h>
23 #include <linux/nmi.h> 23 #include <linux/nmi.h>
24 #include <linux/init.h> 24 #include <linux/init.h>
25 #include <asm/uaccess.h> 25 #include <asm/uaccess.h>
26 #include <linux/highmem.h> 26 #include <linux/highmem.h>
27 #include <linux/smp_lock.h> 27 #include <linux/smp_lock.h>
28 #include <asm/mmu_context.h> 28 #include <asm/mmu_context.h>
29 #include <linux/interrupt.h> 29 #include <linux/interrupt.h>
30 #include <linux/completion.h> 30 #include <linux/completion.h>
31 #include <linux/kernel_stat.h> 31 #include <linux/kernel_stat.h>
32 #include <linux/security.h> 32 #include <linux/security.h>
33 #include <linux/notifier.h> 33 #include <linux/notifier.h>
34 #include <linux/profile.h> 34 #include <linux/profile.h>
35 #include <linux/suspend.h> 35 #include <linux/suspend.h>
36 #include <linux/blkdev.h> 36 #include <linux/blkdev.h>
37 #include <linux/delay.h> 37 #include <linux/delay.h>
38 #include <linux/smp.h> 38 #include <linux/smp.h>
39 #include <linux/threads.h> 39 #include <linux/threads.h>
40 #include <linux/timer.h> 40 #include <linux/timer.h>
41 #include <linux/rcupdate.h> 41 #include <linux/rcupdate.h>
42 #include <linux/cpu.h> 42 #include <linux/cpu.h>
43 #include <linux/cpuset.h> 43 #include <linux/cpuset.h>
44 #include <linux/percpu.h> 44 #include <linux/percpu.h>
45 #include <linux/kthread.h> 45 #include <linux/kthread.h>
46 #include <linux/seq_file.h> 46 #include <linux/seq_file.h>
47 #include <linux/syscalls.h> 47 #include <linux/syscalls.h>
48 #include <linux/times.h> 48 #include <linux/times.h>
49 #include <linux/acct.h> 49 #include <linux/acct.h>
50 #include <asm/tlb.h> 50 #include <asm/tlb.h>
51 51
52 #include <asm/unistd.h> 52 #include <asm/unistd.h>
53 53
54 /* 54 /*
55 * Convert user-nice values [ -20 ... 0 ... 19 ] 55 * Convert user-nice values [ -20 ... 0 ... 19 ]
56 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 56 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
57 * and back. 57 * and back.
58 */ 58 */
59 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) 59 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
60 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) 60 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
61 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) 61 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
62 62
63 /* 63 /*
64 * 'User priority' is the nice value converted to something we 64 * 'User priority' is the nice value converted to something we
65 * can work with better when scaling various scheduler parameters; 65 * can work with better when scaling various scheduler parameters;
66 * it's a [ 0 ... 39 ] range. 66 * it's a [ 0 ... 39 ] range.
67 */ 67 */
68 #define USER_PRIO(p) ((p)-MAX_RT_PRIO) 68 #define USER_PRIO(p) ((p)-MAX_RT_PRIO)
69 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) 69 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
70 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) 70 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
71 71
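
For concreteness, here is a small user-space sketch of what these mappings work out to, assuming the usual MAX_RT_PRIO of 100 and MAX_PRIO of 140 for this kernel; the constants and macros are re-declared locally for illustration only, they are not taken from the kernel headers:

#include <assert.h>

/* Assumed constants; in the kernel they come from the scheduler headers. */
#define MAX_RT_PRIO	100
#define MAX_PRIO	(MAX_RT_PRIO + 40)

#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define USER_PRIO(p)		((p) - MAX_RT_PRIO)
#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))

int main(void)
{
	assert(NICE_TO_PRIO(-20) == 100);		/* strongest nice level */
	assert(NICE_TO_PRIO(0)   == 120);		/* default */
	assert(NICE_TO_PRIO(19)  == 139);		/* weakest nice level */
	assert(PRIO_TO_NICE(NICE_TO_PRIO(7)) == 7);	/* the two round-trip */
	assert(USER_PRIO(NICE_TO_PRIO(0)) == 20);	/* middle of 0..39 */
	assert(MAX_USER_PRIO == 40);
	return 0;
}
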
72 /* 72 /*
73 * Some helpers for converting nanosecond timing to jiffy resolution 73 * Some helpers for converting nanosecond timing to jiffy resolution
74 */ 74 */
75 #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) 75 #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
76 #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) 76 #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
77 77
78 /* 78 /*
79 * These are the 'tuning knobs' of the scheduler: 79 * These are the 'tuning knobs' of the scheduler:
80 * 80 *
81 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), 81 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
82 * default timeslice is 100 msecs, maximum timeslice is 800 msecs. 82 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
83 * Timeslices get refilled after they expire. 83 * Timeslices get refilled after they expire.
84 */ 84 */
85 #define MIN_TIMESLICE max(5 * HZ / 1000, 1) 85 #define MIN_TIMESLICE max(5 * HZ / 1000, 1)
86 #define DEF_TIMESLICE (100 * HZ / 1000) 86 #define DEF_TIMESLICE (100 * HZ / 1000)
87 #define ON_RUNQUEUE_WEIGHT 30 87 #define ON_RUNQUEUE_WEIGHT 30
88 #define CHILD_PENALTY 95 88 #define CHILD_PENALTY 95
89 #define PARENT_PENALTY 100 89 #define PARENT_PENALTY 100
90 #define EXIT_WEIGHT 3 90 #define EXIT_WEIGHT 3
91 #define PRIO_BONUS_RATIO 25 91 #define PRIO_BONUS_RATIO 25
92 #define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) 92 #define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
93 #define INTERACTIVE_DELTA 2 93 #define INTERACTIVE_DELTA 2
94 #define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) 94 #define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
95 #define STARVATION_LIMIT (MAX_SLEEP_AVG) 95 #define STARVATION_LIMIT (MAX_SLEEP_AVG)
96 #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) 96 #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
97 97
98 /* 98 /*
99 * If a task is 'interactive' then we reinsert it in the active 99 * If a task is 'interactive' then we reinsert it in the active
100 * array after it has expired its current timeslice. (it will not 100 * array after it has expired its current timeslice. (it will not
101 * continue to run immediately; it will still round-robin with 101 * continue to run immediately; it will still round-robin with
102 * other interactive tasks.) 102 * other interactive tasks.)
103 * 103 *
104 * This part scales the interactivity limit depending on niceness. 104 * This part scales the interactivity limit depending on niceness.
105 * 105 *
106 * We scale it linearly, offset by the INTERACTIVE_DELTA delta. 106 * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
107 * Here are a few examples of different nice levels: 107 * Here are a few examples of different nice levels:
108 * 108 *
109 * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] 109 * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
110 * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] 110 * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
111 * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] 111 * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
112 * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] 112 * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
113 * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] 113 * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
114 * 114 *
115 * (the X axis represents the possible -5 ... 0 ... +5 dynamic 115 * (the X axis represents the possible -5 ... 0 ... +5 dynamic
116 * priority range a task can explore; a value of '1' means the 116 * priority range a task can explore; a value of '1' means the
117 * task is rated interactive.) 117 * task is rated interactive.)
118 * 118 *
119 * I.e. nice +19 tasks can never get 'interactive' enough to be 119 * I.e. nice +19 tasks can never get 'interactive' enough to be
120 * reinserted into the active array, and only nice -20 tasks that are 120 * reinserted into the active array, and only nice -20 tasks that are
121 * heavy CPU hogs will be expired. Default nice 0 tasks are somewhere 121 * heavy CPU hogs will be expired. Default nice 0 tasks are somewhere
122 * in between; it takes some effort for them to get interactive, but 122 * in between; it takes some effort for them to get interactive, but
123 * it's not too hard. 123 * it's not too hard.
124 */ 124 */
125 125
126 #define CURRENT_BONUS(p) \ 126 #define CURRENT_BONUS(p) \
127 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ 127 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
128 MAX_SLEEP_AVG) 128 MAX_SLEEP_AVG)
129 129
130 #define GRANULARITY (10 * HZ / 1000 ? : 1) 130 #define GRANULARITY (10 * HZ / 1000 ? : 1)
131 131
132 #ifdef CONFIG_SMP 132 #ifdef CONFIG_SMP
133 #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ 133 #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
134 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ 134 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
135 num_online_cpus()) 135 num_online_cpus())
136 #else 136 #else
137 #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ 137 #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
138 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) 138 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
139 #endif 139 #endif
140 140
141 #define SCALE(v1,v1_max,v2_max) \ 141 #define SCALE(v1,v1_max,v2_max) \
142 (v1) * (v2_max) / (v1_max) 142 (v1) * (v2_max) / (v1_max)
143 143
144 #define DELTA(p) \ 144 #define DELTA(p) \
145 (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) 145 (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA)
146 146
147 #define TASK_INTERACTIVE(p) \ 147 #define TASK_INTERACTIVE(p) \
148 ((p)->prio <= (p)->static_prio - DELTA(p)) 148 ((p)->prio <= (p)->static_prio - DELTA(p))
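
As a worked check of the nice-0 row in the table above, here is a standalone sketch of the same arithmetic; the constants (MAX_RT_PRIO 100, hence MAX_BONUS 10) are assumptions re-declared locally for illustration:

#include <assert.h>

/* Assumed constants, re-declared locally for illustration only. */
#define MAX_RT_PRIO		100
#define MAX_USER_PRIO		40
#define PRIO_BONUS_RATIO	25
#define MAX_BONUS		(MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
#define INTERACTIVE_DELTA	2
#define SCALE(v1, v1_max, v2_max)	((v1) * (v2_max) / (v1_max))

int main(void)
{
	int static_prio = 120;	/* nice 0 */
	int nice = 0;
	int delta = SCALE(nice, 40, MAX_BONUS) + INTERACTIVE_DELTA;
	int bonus, interactive = 0;

	/* Walk the -5 ... +5 bonus range and count interactive ratings. */
	for (bonus = -MAX_BONUS / 2; bonus <= MAX_BONUS / 2; bonus++) {
		int prio = static_prio - bonus;

		if (prio < MAX_RT_PRIO)
			prio = MAX_RT_PRIO;
		if (prio <= static_prio - delta)
			interactive++;
	}
	assert(delta == 2);
	/* Only the four most-boosted levels qualify, matching the
	 * [1,1,1,1,0,0,0,0,0,0,0] row for TASK_INTERACTIVE( 0) above. */
	assert(interactive == 4);
	return 0;
}
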
149 149
150 #define INTERACTIVE_SLEEP(p) \ 150 #define INTERACTIVE_SLEEP(p) \
151 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ 151 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
152 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) 152 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
153 153
154 #define TASK_PREEMPTS_CURR(p, rq) \ 154 #define TASK_PREEMPTS_CURR(p, rq) \
155 ((p)->prio < (rq)->curr->prio) 155 ((p)->prio < (rq)->curr->prio)
156 156
157 /* 157 /*
158 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] 158 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
159 * to time slice values: [800ms ... 100ms ... 5ms] 159 * to time slice values: [800ms ... 100ms ... 5ms]
160 * 160 *
161 * The higher a thread's priority, the bigger timeslices 161 * The higher a thread's priority, the bigger timeslices
162 * it gets during one round of execution. But even the lowest 162 * it gets during one round of execution. But even the lowest
163 * priority thread gets MIN_TIMESLICE worth of execution time. 163 * priority thread gets MIN_TIMESLICE worth of execution time.
164 */ 164 */
165 165
166 #define SCALE_PRIO(x, prio) \ 166 #define SCALE_PRIO(x, prio) \
167 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) 167 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
168 168
169 static inline unsigned int task_timeslice(task_t *p) 169 static inline unsigned int task_timeslice(task_t *p)
170 { 170 {
171 if (p->static_prio < NICE_TO_PRIO(0)) 171 if (p->static_prio < NICE_TO_PRIO(0))
172 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); 172 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
173 else 173 else
174 return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); 174 return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
175 } 175 }
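
To make the [800ms ... 100ms ... 5ms] endpoints concrete, a standalone sketch of the same scaling follows, under the assumption HZ == 1000 so that one jiffy is one millisecond; the constants are re-declared locally, max() is replaced by a ternary, and sketch_timeslice() is a hypothetical stand-in that takes static_prio directly:

#include <assert.h>

/* Assumed configuration: HZ == 1000. Constants re-declared locally. */
#define HZ		1000
#define MAX_RT_PRIO	100
#define MAX_PRIO	(MAX_RT_PRIO + 40)
#define MAX_USER_PRIO	40
#define MIN_TIMESLICE	(5 * HZ / 1000 > 1 ? 5 * HZ / 1000 : 1)
#define DEF_TIMESLICE	(100 * HZ / 1000)
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)

#define SCALE_PRIO(x, prio) \
	((x) * (MAX_PRIO - (prio)) / (MAX_USER_PRIO / 2) > MIN_TIMESLICE ? \
	 (x) * (MAX_PRIO - (prio)) / (MAX_USER_PRIO / 2) : MIN_TIMESLICE)

static unsigned int sketch_timeslice(int static_prio)
{
	if (static_prio < NICE_TO_PRIO(0))
		return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
	return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}

int main(void)
{
	assert(sketch_timeslice(NICE_TO_PRIO(-20)) == 800);	/* 800 jiffies = 800ms */
	assert(sketch_timeslice(NICE_TO_PRIO(0))   == 100);	/* 100ms */
	assert(sketch_timeslice(NICE_TO_PRIO(19))  == 5);	/* 5ms */
	return 0;
}
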
176 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ 176 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
177 < (long long) (sd)->cache_hot_time) 177 < (long long) (sd)->cache_hot_time)
178 178
179 /* 179 /*
180 * These are the runqueue data structures: 180 * These are the runqueue data structures:
181 */ 181 */
182 182
183 #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) 183 #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
184 184
185 typedef struct runqueue runqueue_t; 185 typedef struct runqueue runqueue_t;
186 186
187 struct prio_array { 187 struct prio_array {
188 unsigned int nr_active; 188 unsigned int nr_active;
189 unsigned long bitmap[BITMAP_SIZE]; 189 unsigned long bitmap[BITMAP_SIZE];
190 struct list_head queue[MAX_PRIO]; 190 struct list_head queue[MAX_PRIO];
191 }; 191 };
192 192
193 /* 193 /*
194 * This is the main, per-CPU runqueue data structure. 194 * This is the main, per-CPU runqueue data structure.
195 * 195 *
196 * Locking rule: code that needs to lock multiple runqueues (such as 196 * Locking rule: code that needs to lock multiple runqueues (such as
197 * the load balancing or the thread migration code) must acquire the 197 * the load balancing or the thread migration code) must acquire the
198 * locks in ascending &runqueue order (sketched below). 198 * locks in ascending &runqueue order (sketched below).
199 */ 199 */
200 struct runqueue { 200 struct runqueue {
201 spinlock_t lock; 201 spinlock_t lock;
202 202
203 /* 203 /*
204 * nr_running and cpu_load should be in the same cacheline because 204 * nr_running and cpu_load should be in the same cacheline because
205 * remote CPUs use both these fields when doing load calculation. 205 * remote CPUs use both these fields when doing load calculation.
206 */ 206 */
207 unsigned long nr_running; 207 unsigned long nr_running;
208 #ifdef CONFIG_SMP 208 #ifdef CONFIG_SMP
209 unsigned long cpu_load; 209 unsigned long cpu_load;
210 #endif 210 #endif
211 unsigned long long nr_switches; 211 unsigned long long nr_switches;
212 212
213 /* 213 /*
214 * This is part of a global counter where only the total sum 214 * This is part of a global counter where only the total sum
215 * over all CPUs matters. A task can increase this counter on 215 * over all CPUs matters. A task can increase this counter on
216 * one CPU and if it got migrated afterwards it may decrease 216 * one CPU and if it got migrated afterwards it may decrease
217 * it on another CPU. Always updated under the runqueue lock: 217 * it on another CPU. Always updated under the runqueue lock:
218 */ 218 */
219 unsigned long nr_uninterruptible; 219 unsigned long nr_uninterruptible;
220 220
221 unsigned long expired_timestamp; 221 unsigned long expired_timestamp;
222 unsigned long long timestamp_last_tick; 222 unsigned long long timestamp_last_tick;
223 task_t *curr, *idle; 223 task_t *curr, *idle;
224 struct mm_struct *prev_mm; 224 struct mm_struct *prev_mm;
225 prio_array_t *active, *expired, arrays[2]; 225 prio_array_t *active, *expired, arrays[2];
226 int best_expired_prio; 226 int best_expired_prio;
227 atomic_t nr_iowait; 227 atomic_t nr_iowait;
228 228
229 #ifdef CONFIG_SMP 229 #ifdef CONFIG_SMP
230 struct sched_domain *sd; 230 struct sched_domain *sd;
231 231
232 /* For active balancing */ 232 /* For active balancing */
233 int active_balance; 233 int active_balance;
234 int push_cpu; 234 int push_cpu;
235 235
236 task_t *migration_thread; 236 task_t *migration_thread;
237 struct list_head migration_queue; 237 struct list_head migration_queue;
238 #endif 238 #endif
239 239
240 #ifdef CONFIG_SCHEDSTATS 240 #ifdef CONFIG_SCHEDSTATS
241 /* latency stats */ 241 /* latency stats */
242 struct sched_info rq_sched_info; 242 struct sched_info rq_sched_info;
243 243
244 /* sys_sched_yield() stats */ 244 /* sys_sched_yield() stats */
245 unsigned long yld_exp_empty; 245 unsigned long yld_exp_empty;
246 unsigned long yld_act_empty; 246 unsigned long yld_act_empty;
247 unsigned long yld_both_empty; 247 unsigned long yld_both_empty;
248 unsigned long yld_cnt; 248 unsigned long yld_cnt;
249 249
250 /* schedule() stats */ 250 /* schedule() stats */
251 unsigned long sched_switch; 251 unsigned long sched_switch;
252 unsigned long sched_cnt; 252 unsigned long sched_cnt;
253 unsigned long sched_goidle; 253 unsigned long sched_goidle;
254 254
255 /* try_to_wake_up() stats */ 255 /* try_to_wake_up() stats */
256 unsigned long ttwu_cnt; 256 unsigned long ttwu_cnt;
257 unsigned long ttwu_local; 257 unsigned long ttwu_local;
258 #endif 258 #endif
259 }; 259 };
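
A minimal sketch of the address-ordering rule stated in the comment above; lock_two_rqs() is a hypothetical helper name used only for illustration, and callers are assumed to have interrupts disabled already:

/*
 * Illustrative only: take two runqueue locks in ascending address
 * order, so concurrent callers can never acquire them in opposite
 * orders and deadlock against each other.
 */
static void lock_two_rqs(runqueue_t *rq1, runqueue_t *rq2)
{
	if (rq1 == rq2) {
		spin_lock(&rq1->lock);
	} else if (rq1 < rq2) {
		spin_lock(&rq1->lock);
		spin_lock(&rq2->lock);
	} else {
		spin_lock(&rq2->lock);
		spin_lock(&rq1->lock);
	}
}
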
260 260
261 static DEFINE_PER_CPU(struct runqueue, runqueues); 261 static DEFINE_PER_CPU(struct runqueue, runqueues);
262 262
263 #define for_each_domain(cpu, domain) \ 263 #define for_each_domain(cpu, domain) \
264 for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) 264 for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
265 265
266 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 266 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
267 #define this_rq() (&__get_cpu_var(runqueues)) 267 #define this_rq() (&__get_cpu_var(runqueues))
268 #define task_rq(p) cpu_rq(task_cpu(p)) 268 #define task_rq(p) cpu_rq(task_cpu(p))
269 #define cpu_curr(cpu) (cpu_rq(cpu)->curr) 269 #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
270 270
271 /* 271 /*
272 * Default context-switch locking: 272 * Default context-switch locking:
273 */ 273 */
274 #ifndef prepare_arch_switch 274 #ifndef prepare_arch_switch
275 # define prepare_arch_switch(rq, next) do { } while (0) 275 # define prepare_arch_switch(rq, next) do { } while (0)
276 # define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) 276 # define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock)
277 # define task_running(rq, p) ((rq)->curr == (p)) 277 # define task_running(rq, p) ((rq)->curr == (p))
278 #endif 278 #endif
279 279
280 /* 280 /*
281 * task_rq_lock - lock the runqueue a given task resides on and disable 281 * task_rq_lock - lock the runqueue a given task resides on and disable
282 * interrupts. Note the ordering: we can safely lookup the task_rq without 282 * interrupts. Note the ordering: we can safely lookup the task_rq without
283 * explicitly disabling preemption. 283 * explicitly disabling preemption.
284 */ 284 */
285 static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) 285 static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
286 __acquires(rq->lock) 286 __acquires(rq->lock)
287 { 287 {
288 struct runqueue *rq; 288 struct runqueue *rq;
289 289
290 repeat_lock_task: 290 repeat_lock_task:
291 local_irq_save(*flags); 291 local_irq_save(*flags);
292 rq = task_rq(p); 292 rq = task_rq(p);
293 spin_lock(&rq->lock); 293 spin_lock(&rq->lock);
294 if (unlikely(rq != task_rq(p))) { 294 if (unlikely(rq != task_rq(p))) {
295 spin_unlock_irqrestore(&rq->lock, *flags); 295 spin_unlock_irqrestore(&rq->lock, *flags);
296 goto repeat_lock_task; 296 goto repeat_lock_task;
297 } 297 }
298 return rq; 298 return rq;
299 } 299 }
300 300
301 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) 301 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
302 __releases(rq->lock) 302 __releases(rq->lock)
303 { 303 {
304 spin_unlock_irqrestore(&rq->lock, *flags); 304 spin_unlock_irqrestore(&rq->lock, *flags);
305 } 305 }
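
A short usage sketch of this lock/unlock pair; the function name below is hypothetical, and the point is that the returned rq is guaranteed to still be p's runqueue, because task_rq_lock() re-checks task_rq(p) after taking the lock:

/* Illustrative caller, sketch only. */
static void inspect_task_example(task_t *p)
{
	unsigned long flags;
	runqueue_t *rq;

	rq = task_rq_lock(p, &flags);
	/* ... read or modify p's scheduling state under rq->lock ... */
	task_rq_unlock(rq, &flags);
}
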
306 306
307 #ifdef CONFIG_SCHEDSTATS 307 #ifdef CONFIG_SCHEDSTATS
308 /* 308 /*
309 * bump this up when changing the output format or the meaning of an existing 309 * bump this up when changing the output format or the meaning of an existing
310 * format, so that tools can adapt (or abort) 310 * format, so that tools can adapt (or abort)
311 */ 311 */
312 #define SCHEDSTAT_VERSION 11 312 #define SCHEDSTAT_VERSION 11
313 313
314 static int show_schedstat(struct seq_file *seq, void *v) 314 static int show_schedstat(struct seq_file *seq, void *v)
315 { 315 {
316 int cpu; 316 int cpu;
317 317
318 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); 318 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
319 seq_printf(seq, "timestamp %lu\n", jiffies); 319 seq_printf(seq, "timestamp %lu\n", jiffies);
320 for_each_online_cpu(cpu) { 320 for_each_online_cpu(cpu) {
321 runqueue_t *rq = cpu_rq(cpu); 321 runqueue_t *rq = cpu_rq(cpu);
322 #ifdef CONFIG_SMP 322 #ifdef CONFIG_SMP
323 struct sched_domain *sd; 323 struct sched_domain *sd;
324 int dcnt = 0; 324 int dcnt = 0;
325 #endif 325 #endif
326 326
327 /* runqueue-specific stats */ 327 /* runqueue-specific stats */
328 seq_printf(seq, 328 seq_printf(seq,
329 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", 329 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
330 cpu, rq->yld_both_empty, 330 cpu, rq->yld_both_empty,
331 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, 331 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
332 rq->sched_switch, rq->sched_cnt, rq->sched_goidle, 332 rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
333 rq->ttwu_cnt, rq->ttwu_local, 333 rq->ttwu_cnt, rq->ttwu_local,
334 rq->rq_sched_info.cpu_time, 334 rq->rq_sched_info.cpu_time,
335 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); 335 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
336 336
337 seq_printf(seq, "\n"); 337 seq_printf(seq, "\n");
338 338
339 #ifdef CONFIG_SMP 339 #ifdef CONFIG_SMP
340 /* domain-specific stats */ 340 /* domain-specific stats */
341 for_each_domain(cpu, sd) { 341 for_each_domain(cpu, sd) {
342 enum idle_type itype; 342 enum idle_type itype;
343 char mask_str[NR_CPUS]; 343 char mask_str[NR_CPUS];
344 344
345 cpumask_scnprintf(mask_str, NR_CPUS, sd->span); 345 cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
346 seq_printf(seq, "domain%d %s", dcnt++, mask_str); 346 seq_printf(seq, "domain%d %s", dcnt++, mask_str);
347 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; 347 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
348 itype++) { 348 itype++) {
349 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", 349 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu",
350 sd->lb_cnt[itype], 350 sd->lb_cnt[itype],
351 sd->lb_balanced[itype], 351 sd->lb_balanced[itype],
352 sd->lb_failed[itype], 352 sd->lb_failed[itype],
353 sd->lb_imbalance[itype], 353 sd->lb_imbalance[itype],
354 sd->lb_gained[itype], 354 sd->lb_gained[itype],
355 sd->lb_hot_gained[itype], 355 sd->lb_hot_gained[itype],
356 sd->lb_nobusyq[itype], 356 sd->lb_nobusyq[itype],
357 sd->lb_nobusyg[itype]); 357 sd->lb_nobusyg[itype]);
358 } 358 }
359 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n", 359 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n",
360 sd->alb_cnt, sd->alb_failed, sd->alb_pushed, 360 sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
361 sd->sbe_pushed, sd->sbe_attempts, 361 sd->sbe_pushed, sd->sbe_attempts,
362 sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); 362 sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
363 } 363 }
364 #endif 364 #endif
365 } 365 }
366 return 0; 366 return 0;
367 } 367 }
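
For reference, a hedged user-space sketch of consuming the per-CPU lines emitted above; the field order is taken from the seq_printf() call, and the /proc/schedstat path is an assumption about where this file is exposed (that wiring is not part of this hunk):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/schedstat", "r");	/* assumed path */
	char line[512];
	unsigned long v[12];
	int cpu;

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		/* Field order follows show_schedstat() above. */
		if (sscanf(line,
			   "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
			   &cpu, &v[0], &v[1], &v[2], &v[3], &v[4], &v[5],
			   &v[6], &v[7], &v[8], &v[9], &v[10], &v[11]) == 13)
			printf("cpu%d: ttwu_cnt=%lu ttwu_local=%lu\n",
			       cpu, v[7], v[8]);
	}
	fclose(f);
	return 0;
}
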
368 368
369 static int schedstat_open(struct inode *inode, struct file *file) 369 static int schedstat_open(struct inode *inode, struct file *file)
370 { 370 {
371 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); 371 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
372 char *buf = kmalloc(size, GFP_KERNEL); 372 char *buf = kmalloc(size, GFP_KERNEL);
373 struct seq_file *m; 373 struct seq_file *m;
374 int res; 374 int res;
375 375
376 if (!buf) 376 if (!buf)
377 return -ENOMEM; 377 return -ENOMEM;
378 res = single_open(file, show_schedstat, NULL); 378 res = single_open(file, show_schedstat, NULL);
379 if (!res) { 379 if (!res) {
380 m = file->private_data; 380 m = file->private_data;
381 m->buf = buf; 381 m->buf = buf;
382 m->size = size; 382 m->size = size;
383 } else 383 } else
384 kfree(buf); 384 kfree(buf);
385 return res; 385 return res;
386 } 386 }
387 387
388 struct file_operations proc_schedstat_operations = { 388 struct file_operations proc_schedstat_operations = {
389 .open = schedstat_open, 389 .open = schedstat_open,
390 .read = seq_read, 390 .read = seq_read,
391 .llseek = seq_lseek, 391 .llseek = seq_lseek,
392 .release = single_release, 392 .release = single_release,
393 }; 393 };
394 394
395 # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) 395 # define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
396 # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) 396 # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
397 #else /* !CONFIG_SCHEDSTATS */ 397 #else /* !CONFIG_SCHEDSTATS */
398 # define schedstat_inc(rq, field) do { } while (0) 398 # define schedstat_inc(rq, field) do { } while (0)
399 # define schedstat_add(rq, field, amt) do { } while (0) 399 # define schedstat_add(rq, field, amt) do { } while (0)
400 #endif 400 #endif
401 401
402 /* 402 /*
403 * this_rq_lock - lock this CPU's runqueue and disable interrupts. 403 * this_rq_lock - lock this CPU's runqueue and disable interrupts.
404 */ 404 */
405 static inline runqueue_t *this_rq_lock(void) 405 static inline runqueue_t *this_rq_lock(void)
406 __acquires(rq->lock) 406 __acquires(rq->lock)
407 { 407 {
408 runqueue_t *rq; 408 runqueue_t *rq;
409 409
410 local_irq_disable(); 410 local_irq_disable();
411 rq = this_rq(); 411 rq = this_rq();
412 spin_lock(&rq->lock); 412 spin_lock(&rq->lock);
413 413
414 return rq; 414 return rq;
415 } 415 }
416 416
417 #ifdef CONFIG_SCHED_SMT 417 #ifdef CONFIG_SCHED_SMT
418 static int cpu_and_siblings_are_idle(int cpu) 418 static int cpu_and_siblings_are_idle(int cpu)
419 { 419 {
420 int sib; 420 int sib;
421 for_each_cpu_mask(sib, cpu_sibling_map[cpu]) { 421 for_each_cpu_mask(sib, cpu_sibling_map[cpu]) {
422 if (idle_cpu(sib)) 422 if (idle_cpu(sib))
423 continue; 423 continue;
424 return 0; 424 return 0;
425 } 425 }
426 426
427 return 1; 427 return 1;
428 } 428 }
429 #else 429 #else
430 #define cpu_and_siblings_are_idle(A) idle_cpu(A) 430 #define cpu_and_siblings_are_idle(A) idle_cpu(A)
431 #endif 431 #endif
432 432
433 #ifdef CONFIG_SCHEDSTATS 433 #ifdef CONFIG_SCHEDSTATS
434 /* 434 /*
435 * Called when a process is dequeued from the active array and given 435 * Called when a process is dequeued from the active array and given
436 * the cpu. We should note that with the exception of interactive 436 * the cpu. We should note that with the exception of interactive
437 * tasks, the expired queue will become the active queue after the active 437 * tasks, the expired queue will become the active queue after the active
438 * queue is empty, without explicitly dequeuing and requeuing tasks in the 438 * queue is empty, without explicitly dequeuing and requeuing tasks in the
439 * expired queue. (Interactive tasks may be requeued directly to the 439 * expired queue. (Interactive tasks may be requeued directly to the
440 * active queue, thus delaying tasks in the expired queue from running; 440 * active queue, thus delaying tasks in the expired queue from running;
441 * see scheduler_tick()). 441 * see scheduler_tick()).
442 * 442 *
443 * This function is only called from sched_info_arrive(), rather than 443 * This function is only called from sched_info_arrive(), rather than
444 * dequeue_task(). Even though a task may be queued and dequeued multiple 444 * dequeue_task(). Even though a task may be queued and dequeued multiple
445 * times as it is shuffled about, we're really interested in knowing how 445 * times as it is shuffled about, we're really interested in knowing how
446 * long it was from the *first* time it was queued to the time that it 446 * long it was from the *first* time it was queued to the time that it
447 * finally hit a cpu. 447 * finally hit a cpu.
448 */ 448 */
449 static inline void sched_info_dequeued(task_t *t) 449 static inline void sched_info_dequeued(task_t *t)
450 { 450 {
451 t->sched_info.last_queued = 0; 451 t->sched_info.last_queued = 0;
452 } 452 }
453 453
454 /* 454 /*
455 * Called when a task finally hits the cpu. We can now calculate how 455 * Called when a task finally hits the cpu. We can now calculate how
456 * long it was waiting to run. We also note when it began so that we 456 * long it was waiting to run. We also note when it began so that we
457 * can keep stats on how long its timeslice is. 457 * can keep stats on how long its timeslice is.
458 */ 458 */
459 static inline void sched_info_arrive(task_t *t) 459 static inline void sched_info_arrive(task_t *t)
460 { 460 {
461 unsigned long now = jiffies, diff = 0; 461 unsigned long now = jiffies, diff = 0;
462 struct runqueue *rq = task_rq(t); 462 struct runqueue *rq = task_rq(t);
463 463
464 if (t->sched_info.last_queued) 464 if (t->sched_info.last_queued)
465 diff = now - t->sched_info.last_queued; 465 diff = now - t->sched_info.last_queued;
466 sched_info_dequeued(t); 466 sched_info_dequeued(t);
467 t->sched_info.run_delay += diff; 467 t->sched_info.run_delay += diff;
468 t->sched_info.last_arrival = now; 468 t->sched_info.last_arrival = now;
469 t->sched_info.pcnt++; 469 t->sched_info.pcnt++;
470 470
471 if (!rq) 471 if (!rq)
472 return; 472 return;
473 473
474 rq->rq_sched_info.run_delay += diff; 474 rq->rq_sched_info.run_delay += diff;
475 rq->rq_sched_info.pcnt++; 475 rq->rq_sched_info.pcnt++;
476 } 476 }
477 477
478 /* 478 /*
479 * Called when a process is queued into either the active or expired 479 * Called when a process is queued into either the active or expired
480 * array. The time is noted and later used to determine how long the 480 * array. The time is noted and later used to determine how long the
481 * task had to wait to reach the cpu. Since the expired queue will 481 * task had to wait to reach the cpu. Since the expired queue will
482 * become the active queue after active queue is empty, without dequeuing 482 * become the active queue after active queue is empty, without dequeuing
483 * and requeuing any tasks, we are interested in queuing to either. It 483 * and requeuing any tasks, we are interested in queuing to either. It
484 * is unusual but not impossible for tasks to be dequeued and immediately 484 * is unusual but not impossible for tasks to be dequeued and immediately
485 * requeued in the same or another array: this can happen in sched_yield(), 485 * requeued in the same or another array: this can happen in sched_yield(),
486 * set_user_nice(), and even load_balance() as it moves tasks from runqueue 486 * set_user_nice(), and even load_balance() as it moves tasks from runqueue
487 * to runqueue. 487 * to runqueue.
488 * 488 *
489 * This function is only called from enqueue_task(), but also only updates 489 * This function is only called from enqueue_task(), but also only updates
490 * the timestamp if it is not already set. It's assumed that 490 * the timestamp if it is not already set. It's assumed that
491 * sched_info_dequeued() will clear that stamp when appropriate. 491 * sched_info_dequeued() will clear that stamp when appropriate.
492 */ 492 */
493 static inline void sched_info_queued(task_t *t) 493 static inline void sched_info_queued(task_t *t)
494 { 494 {
495 if (!t->sched_info.last_queued) 495 if (!t->sched_info.last_queued)
496 t->sched_info.last_queued = jiffies; 496 t->sched_info.last_queued = jiffies;
497 } 497 }
498 498
499 /* 499 /*
500 * Called when a process ceases being the active-running process, either 500 * Called when a process ceases being the active-running process, either
501 * voluntarily or involuntarily. Now we can calculate how long it ran. 501 * voluntarily or involuntarily. Now we can calculate how long it ran.
502 */ 502 */
503 static inline void sched_info_depart(task_t *t) 503 static inline void sched_info_depart(task_t *t)
504 { 504 {
505 struct runqueue *rq = task_rq(t); 505 struct runqueue *rq = task_rq(t);
506 unsigned long diff = jiffies - t->sched_info.last_arrival; 506 unsigned long diff = jiffies - t->sched_info.last_arrival;
507 507
508 t->sched_info.cpu_time += diff; 508 t->sched_info.cpu_time += diff;
509 509
510 if (rq) 510 if (rq)
511 rq->rq_sched_info.cpu_time += diff; 511 rq->rq_sched_info.cpu_time += diff;
512 } 512 }
513 513
514 /* 514 /*
515 * Called when tasks are switched involuntarily due, typically, to expiring 515 * Called when tasks are switched involuntarily due, typically, to expiring
516 * their time slice. (This may also be called when switching to or from 516 * their time slice. (This may also be called when switching to or from
517 * the idle task.) We are only called when prev != next. 517 * the idle task.) We are only called when prev != next.
518 */ 518 */
519 static inline void sched_info_switch(task_t *prev, task_t *next) 519 static inline void sched_info_switch(task_t *prev, task_t *next)
520 { 520 {
521 struct runqueue *rq = task_rq(prev); 521 struct runqueue *rq = task_rq(prev);
522 522
523 /* 523 /*
524 * prev now departs the cpu. It's not interesting to record 524 * prev now departs the cpu. It's not interesting to record
525 * stats about how efficient we were at scheduling the idle 525 * stats about how efficient we were at scheduling the idle
526 * process, however. 526 * process, however.
527 */ 527 */
528 if (prev != rq->idle) 528 if (prev != rq->idle)
529 sched_info_depart(prev); 529 sched_info_depart(prev);
530 530
531 if (next != rq->idle) 531 if (next != rq->idle)
532 sched_info_arrive(next); 532 sched_info_arrive(next);
533 } 533 }
534 #else 534 #else
535 #define sched_info_queued(t) do { } while (0) 535 #define sched_info_queued(t) do { } while (0)
536 #define sched_info_switch(t, next) do { } while (0) 536 #define sched_info_switch(t, next) do { } while (0)
537 #endif /* CONFIG_SCHEDSTATS */ 537 #endif /* CONFIG_SCHEDSTATS */
538 538
539 /* 539 /*
540 * Adding/removing a task to/from a priority array: 540 * Adding/removing a task to/from a priority array:
541 */ 541 */
542 static void dequeue_task(struct task_struct *p, prio_array_t *array) 542 static void dequeue_task(struct task_struct *p, prio_array_t *array)
543 { 543 {
544 array->nr_active--; 544 array->nr_active--;
545 list_del(&p->run_list); 545 list_del(&p->run_list);
546 if (list_empty(array->queue + p->prio)) 546 if (list_empty(array->queue + p->prio))
547 __clear_bit(p->prio, array->bitmap); 547 __clear_bit(p->prio, array->bitmap);
548 } 548 }
549 549
550 static void enqueue_task(struct task_struct *p, prio_array_t *array) 550 static void enqueue_task(struct task_struct *p, prio_array_t *array)
551 { 551 {
552 sched_info_queued(p); 552 sched_info_queued(p);
553 list_add_tail(&p->run_list, array->queue + p->prio); 553 list_add_tail(&p->run_list, array->queue + p->prio);
554 __set_bit(p->prio, array->bitmap); 554 __set_bit(p->prio, array->bitmap);
555 array->nr_active++; 555 array->nr_active++;
556 p->array = array; 556 p->array = array;
557 } 557 }
558 558
559 /* 559 /*
560 * Put the task at the end of the run list without the overhead of dequeue 560 * Put the task at the end of the run list without the overhead of dequeue
561 * followed by enqueue. 561 * followed by enqueue.
562 */ 562 */
563 static void requeue_task(struct task_struct *p, prio_array_t *array) 563 static void requeue_task(struct task_struct *p, prio_array_t *array)
564 { 564 {
565 list_move_tail(&p->run_list, array->queue + p->prio); 565 list_move_tail(&p->run_list, array->queue + p->prio);
566 } 566 }
567 567
568 static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) 568 static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
569 { 569 {
570 list_add(&p->run_list, array->queue + p->prio); 570 list_add(&p->run_list, array->queue + p->prio);
571 __set_bit(p->prio, array->bitmap); 571 __set_bit(p->prio, array->bitmap);
572 array->nr_active++; 572 array->nr_active++;
573 p->array = array; 573 p->array = array;
574 } 574 }
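
These helpers keep the per-priority bitmap and lists consistent so that selecting the next task stays O(1). A hedged sketch of that lookup follows; pick_next_sketch() is a hypothetical helper, the real selection is done later in schedule() using the kernel's sched_find_first_bit():

/*
 * Illustrative only: callers must know the array is non-empty
 * (array->nr_active != 0), otherwise there is no set bit to find.
 */
static task_t *pick_next_sketch(prio_array_t *array)
{
	int idx = sched_find_first_bit(array->bitmap);

	return list_entry(array->queue[idx].next, task_t, run_list);
}
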
575 575
576 /* 576 /*
577 * effective_prio - return the priority that is based on the static 577 * effective_prio - return the priority that is based on the static
578 * priority but is modified by bonuses/penalties. 578 * priority but is modified by bonuses/penalties.
579 * 579 *
580 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] 580 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
581 * into the -5 ... 0 ... +5 bonus/penalty range. 581 * into the -5 ... 0 ... +5 bonus/penalty range.
582 * 582 *
583 * We use 25% of the full 0...39 priority range so that: 583 * We use 25% of the full 0...39 priority range so that:
584 * 584 *
585 * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. 585 * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
586 * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. 586 * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
587 * 587 *
588 * Both properties are important to certain workloads. 588 * Both properties are important to certain workloads.
589 */ 589 */
590 static int effective_prio(task_t *p) 590 static int effective_prio(task_t *p)
591 { 591 {
592 int bonus, prio; 592 int bonus, prio;
593 593
594 if (rt_task(p)) 594 if (rt_task(p))
595 return p->prio; 595 return p->prio;
596 596
597 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; 597 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
598 598
599 prio = p->static_prio - bonus; 599 prio = p->static_prio - bonus;
600 if (prio < MAX_RT_PRIO) 600 if (prio < MAX_RT_PRIO)
601 prio = MAX_RT_PRIO; 601 prio = MAX_RT_PRIO;
602 if (prio > MAX_PRIO-1) 602 if (prio > MAX_PRIO-1)
603 prio = MAX_PRIO-1; 603 prio = MAX_PRIO-1;
604 return prio; 604 return prio;
605 } 605 }
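
As a numeric example (assuming HZ 1000, DEF_TIMESLICE 100 and MAX_BONUS 10, so MAX_SLEEP_AVG is 1000 jiffies): a nice-0 task whose sleep average has reached 700ms gets a +2 bonus and therefore priority 118, which is exactly the TASK_INTERACTIVE threshold computed earlier. A standalone sketch of that arithmetic, with the constants re-declared locally as assumptions:

#include <assert.h>

/* Assumed constants, re-declared locally for illustration only. */
#define MAX_BONUS	10
#define MAX_SLEEP_AVG	1000				/* jiffies */
#define NS_TO_JIFFIES(ns)	((ns) / 1000000)	/* HZ == 1000 */

int main(void)
{
	unsigned long long sleep_avg = 700000000ULL;	/* 700ms in ns */
	int bonus = (int)(NS_TO_JIFFIES(sleep_avg) * MAX_BONUS / MAX_SLEEP_AVG)
		    - MAX_BONUS / 2;

	assert(bonus == 2);		/* 700/1000 * 10 - 5 */
	assert(120 - bonus == 118);	/* nice-0 task lands on prio 118 */
	return 0;
}
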
606 606
607 /* 607 /*
608 * __activate_task - move a task to the runqueue. 608 * __activate_task - move a task to the runqueue.
609 */ 609 */
610 static inline void __activate_task(task_t *p, runqueue_t *rq) 610 static inline void __activate_task(task_t *p, runqueue_t *rq)
611 { 611 {
612 enqueue_task(p, rq->active); 612 enqueue_task(p, rq->active);
613 rq->nr_running++; 613 rq->nr_running++;
614 } 614 }
615 615
616 /* 616 /*
617 * __activate_idle_task - move idle task to the _front_ of runqueue. 617 * __activate_idle_task - move idle task to the _front_ of runqueue.
618 */ 618 */
619 static inline void __activate_idle_task(task_t *p, runqueue_t *rq) 619 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
620 { 620 {
621 enqueue_task_head(p, rq->active); 621 enqueue_task_head(p, rq->active);
622 rq->nr_running++; 622 rq->nr_running++;
623 } 623 }
624 624
625 static void recalc_task_prio(task_t *p, unsigned long long now) 625 static void recalc_task_prio(task_t *p, unsigned long long now)
626 { 626 {
627 /* Caller must always ensure 'now >= p->timestamp' */ 627 /* Caller must always ensure 'now >= p->timestamp' */
628 unsigned long long __sleep_time = now - p->timestamp; 628 unsigned long long __sleep_time = now - p->timestamp;
629 unsigned long sleep_time; 629 unsigned long sleep_time;
630 630
631 if (__sleep_time > NS_MAX_SLEEP_AVG) 631 if (__sleep_time > NS_MAX_SLEEP_AVG)
632 sleep_time = NS_MAX_SLEEP_AVG; 632 sleep_time = NS_MAX_SLEEP_AVG;
633 else 633 else
634 sleep_time = (unsigned long)__sleep_time; 634 sleep_time = (unsigned long)__sleep_time;
635 635
636 if (likely(sleep_time > 0)) { 636 if (likely(sleep_time > 0)) {
637 /* 637 /*
638 * User tasks that sleep a long time are categorised as 638 * User tasks that sleep a long time are categorised as
639 * idle and will get just interactive status to stay active & 639 * idle and will get just interactive status to stay active &
640 * prevent them from suddenly becoming cpu hogs and starving 640 * prevent them from suddenly becoming cpu hogs and starving
641 * other processes. 641 * other processes.
642 */ 642 */
643 if (p->mm && p->activated != -1 && 643 if (p->mm && p->activated != -1 &&
644 sleep_time > INTERACTIVE_SLEEP(p)) { 644 sleep_time > INTERACTIVE_SLEEP(p)) {
645 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - 645 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
646 DEF_TIMESLICE); 646 DEF_TIMESLICE);
647 } else { 647 } else {
648 /* 648 /*
649 * The lower the sleep avg a task has the more 649 * The lower the sleep avg a task has the more
650 * rapidly it will rise with sleep time. 650 * rapidly it will rise with sleep time.
651 */ 651 */
652 sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; 652 sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
653 653
654 /* 654 /*
655 * Tasks waking from uninterruptible sleep are 655 * Tasks waking from uninterruptible sleep are
656 * limited in their sleep_avg rise as they 656 * limited in their sleep_avg rise as they
657 * are likely to be waiting on I/O 657 * are likely to be waiting on I/O
658 */ 658 */
659 if (p->activated == -1 && p->mm) { 659 if (p->activated == -1 && p->mm) {
660 if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) 660 if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
661 sleep_time = 0; 661 sleep_time = 0;
662 else if (p->sleep_avg + sleep_time >= 662 else if (p->sleep_avg + sleep_time >=
663 INTERACTIVE_SLEEP(p)) { 663 INTERACTIVE_SLEEP(p)) {
664 p->sleep_avg = INTERACTIVE_SLEEP(p); 664 p->sleep_avg = INTERACTIVE_SLEEP(p);
665 sleep_time = 0; 665 sleep_time = 0;
666 } 666 }
667 } 667 }
668 668
669 /* 669 /*
670 * This code gives a bonus to interactive tasks. 670 * This code gives a bonus to interactive tasks.
671 * 671 *
672 * The boost works by updating the 'average sleep time' 672 * The boost works by updating the 'average sleep time'
673 * value here, based on ->timestamp. The more time a 673 * value here, based on ->timestamp. The more time a
674 * task spends sleeping, the higher the average gets - 674 * task spends sleeping, the higher the average gets -
675 * and the higher the priority boost gets as well. 675 * and the higher the priority boost gets as well.
676 */ 676 */
677 p->sleep_avg += sleep_time; 677 p->sleep_avg += sleep_time;
678 678
679 if (p->sleep_avg > NS_MAX_SLEEP_AVG) 679 if (p->sleep_avg > NS_MAX_SLEEP_AVG)
680 p->sleep_avg = NS_MAX_SLEEP_AVG; 680 p->sleep_avg = NS_MAX_SLEEP_AVG;
681 } 681 }
682 } 682 }
683 683
684 p->prio = effective_prio(p); 684 p->prio = effective_prio(p);
685 } 685 }
686 686
687 /* 687 /*
688 * activate_task - move a task to the runqueue and do priority recalculation 688 * activate_task - move a task to the runqueue and do priority recalculation
689 * 689 *
690 * Update all the scheduling statistics stuff. (sleep average 690 * Update all the scheduling statistics stuff. (sleep average
691 * calculation, priority modifiers, etc.) 691 * calculation, priority modifiers, etc.)
692 */ 692 */
693 static void activate_task(task_t *p, runqueue_t *rq, int local) 693 static void activate_task(task_t *p, runqueue_t *rq, int local)
694 { 694 {
695 unsigned long long now; 695 unsigned long long now;
696 696
697 now = sched_clock(); 697 now = sched_clock();
698 #ifdef CONFIG_SMP 698 #ifdef CONFIG_SMP
699 if (!local) { 699 if (!local) {
700 /* Compensate for drifting sched_clock */ 700 /* Compensate for drifting sched_clock */
701 runqueue_t *this_rq = this_rq(); 701 runqueue_t *this_rq = this_rq();
702 now = (now - this_rq->timestamp_last_tick) 702 now = (now - this_rq->timestamp_last_tick)
703 + rq->timestamp_last_tick; 703 + rq->timestamp_last_tick;
704 } 704 }
705 #endif 705 #endif
706 706
707 recalc_task_prio(p, now); 707 recalc_task_prio(p, now);
708 708
709 /* 709 /*
710 * This checks to make sure it's not an uninterruptible task 710 * This checks to make sure it's not an uninterruptible task
711 * that is now waking up. 711 * that is now waking up.
712 */ 712 */
713 if (!p->activated) { 713 if (!p->activated) {
714 /* 714 /*
715 * Tasks which were woken up by interrupts (i.e. hw events) 715 * Tasks which were woken up by interrupts (i.e. hw events)
716 * are most likely of interactive nature. So we give them 716 * are most likely of interactive nature. So we give them
717 * the credit of extending their sleep time to the period 717 * the credit of extending their sleep time to the period
718 * of time they spend on the runqueue, waiting for execution 718 * of time they spend on the runqueue, waiting for execution
719 * on a CPU, first time around: 719 * on a CPU, first time around:
720 */ 720 */
721 if (in_interrupt()) 721 if (in_interrupt())
722 p->activated = 2; 722 p->activated = 2;
723 else { 723 else {
724 /* 724 /*
725 * Normal first-time wakeups get a credit too for 725 * Normal first-time wakeups get a credit too for
726 * on-runqueue time, but it will be weighted down: 726 * on-runqueue time, but it will be weighted down:
727 */ 727 */
728 p->activated = 1; 728 p->activated = 1;
729 } 729 }
730 } 730 }
731 p->timestamp = now; 731 p->timestamp = now;
732 732
733 __activate_task(p, rq); 733 __activate_task(p, rq);
734 } 734 }
735 735
736 /* 736 /*
737 * deactivate_task - remove a task from the runqueue. 737 * deactivate_task - remove a task from the runqueue.
738 */ 738 */
739 static void deactivate_task(struct task_struct *p, runqueue_t *rq) 739 static void deactivate_task(struct task_struct *p, runqueue_t *rq)
740 { 740 {
741 rq->nr_running--; 741 rq->nr_running--;
742 dequeue_task(p, p->array); 742 dequeue_task(p, p->array);
743 p->array = NULL; 743 p->array = NULL;
744 } 744 }
745 745
746 /* 746 /*
747 * resched_task - mark a task 'to be rescheduled now'. 747 * resched_task - mark a task 'to be rescheduled now'.
748 * 748 *
749 * On UP this means the setting of the need_resched flag; on SMP it 749 * On UP this means the setting of the need_resched flag; on SMP it
750 * might also involve a cross-CPU call to trigger the scheduler on 750 * might also involve a cross-CPU call to trigger the scheduler on
751 * the target CPU. 751 * the target CPU.
752 */ 752 */
753 #ifdef CONFIG_SMP 753 #ifdef CONFIG_SMP
754 static void resched_task(task_t *p) 754 static void resched_task(task_t *p)
755 { 755 {
756 int need_resched, nrpolling; 756 int need_resched, nrpolling;
757 757
758 assert_spin_locked(&task_rq(p)->lock); 758 assert_spin_locked(&task_rq(p)->lock);
759 759
760 /* minimise the chance of sending an interrupt to poll_idle() */ 760 /* minimise the chance of sending an interrupt to poll_idle() */
761 nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); 761 nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
762 need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED); 762 need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
763 nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); 763 nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
764 764
765 if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id())) 765 if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
766 smp_send_reschedule(task_cpu(p)); 766 smp_send_reschedule(task_cpu(p));
767 } 767 }
768 #else 768 #else
769 static inline void resched_task(task_t *p) 769 static inline void resched_task(task_t *p)
770 { 770 {
771 set_tsk_need_resched(p); 771 set_tsk_need_resched(p);
772 } 772 }
773 #endif 773 #endif
774 774
775 /** 775 /**
776 * task_curr - is this task currently executing on a CPU? 776 * task_curr - is this task currently executing on a CPU?
777 * @p: the task in question. 777 * @p: the task in question.
778 */ 778 */
779 inline int task_curr(const task_t *p) 779 inline int task_curr(const task_t *p)
780 { 780 {
781 return cpu_curr(task_cpu(p)) == p; 781 return cpu_curr(task_cpu(p)) == p;
782 } 782 }
783 783
784 #ifdef CONFIG_SMP 784 #ifdef CONFIG_SMP
785 enum request_type { 785 enum request_type {
786 REQ_MOVE_TASK, 786 REQ_MOVE_TASK,
787 REQ_SET_DOMAIN, 787 REQ_SET_DOMAIN,
788 }; 788 };
789 789
790 typedef struct { 790 typedef struct {
791 struct list_head list; 791 struct list_head list;
792 enum request_type type; 792 enum request_type type;
793 793
794 /* For REQ_MOVE_TASK */ 794 /* For REQ_MOVE_TASK */
795 task_t *task; 795 task_t *task;
796 int dest_cpu; 796 int dest_cpu;
797 797
798 /* For REQ_SET_DOMAIN */ 798 /* For REQ_SET_DOMAIN */
799 struct sched_domain *sd; 799 struct sched_domain *sd;
800 800
801 struct completion done; 801 struct completion done;
802 } migration_req_t; 802 } migration_req_t;
803 803
804 /* 804 /*
805 * The task's runqueue lock must be held. 805 * The task's runqueue lock must be held.
806 * Returns true if you have to wait for migration thread. 806 * Returns true if you have to wait for migration thread.
807 */ 807 */
808 static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) 808 static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
809 { 809 {
810 runqueue_t *rq = task_rq(p); 810 runqueue_t *rq = task_rq(p);
811 811
812 /* 812 /*
813 * If the task is not on a runqueue (and not running), then 813 * If the task is not on a runqueue (and not running), then
814 * it is sufficient to simply update the task's cpu field. 814 * it is sufficient to simply update the task's cpu field.
815 */ 815 */
816 if (!p->array && !task_running(rq, p)) { 816 if (!p->array && !task_running(rq, p)) {
817 set_task_cpu(p, dest_cpu); 817 set_task_cpu(p, dest_cpu);
818 return 0; 818 return 0;
819 } 819 }
820 820
821 init_completion(&req->done); 821 init_completion(&req->done);
822 req->type = REQ_MOVE_TASK; 822 req->type = REQ_MOVE_TASK;
823 req->task = p; 823 req->task = p;
824 req->dest_cpu = dest_cpu; 824 req->dest_cpu = dest_cpu;
825 list_add(&req->list, &rq->migration_queue); 825 list_add(&req->list, &rq->migration_queue);
826 return 1; 826 return 1;
827 } 827 }
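
A hedged sketch of the calling convention described above; the function name is hypothetical, and the pattern mirrors how the CPU-affinity code in this file uses the migration thread: if migrate_task() returns 1, drop the runqueue lock, wake the migration thread and wait on the completion.

/* Illustrative caller, sketch only. */
static void move_task_example(task_t *p, int dest_cpu)
{
	migration_req_t req;
	unsigned long flags;
	runqueue_t *rq;

	rq = task_rq_lock(p, &flags);
	if (migrate_task(p, dest_cpu, &req)) {
		/* Need help from the migration thread: drop lock and wait. */
		task_rq_unlock(rq, &flags);
		wake_up_process(rq->migration_thread);
		wait_for_completion(&req.done);
		return;
	}
	task_rq_unlock(rq, &flags);
}
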
828 828
829 /* 829 /*
830 * wait_task_inactive - wait for a thread to unschedule. 830 * wait_task_inactive - wait for a thread to unschedule.
831 * 831 *
832 * The caller must ensure that the task *will* unschedule sometime soon, 832 * The caller must ensure that the task *will* unschedule sometime soon,
833 * else this function might spin for a *long* time. This function can't 833 * else this function might spin for a *long* time. This function can't
834 * be called with interrupts off, or it may introduce deadlock with 834 * be called with interrupts off, or it may introduce deadlock with
835 * smp_call_function() if an IPI is sent by the same process we are 835 * smp_call_function() if an IPI is sent by the same process we are
836 * waiting to become inactive. 836 * waiting to become inactive.
837 */ 837 */
838 void wait_task_inactive(task_t * p) 838 void wait_task_inactive(task_t * p)
839 { 839 {
840 unsigned long flags; 840 unsigned long flags;
841 runqueue_t *rq; 841 runqueue_t *rq;
842 int preempted; 842 int preempted;
843 843
844 repeat: 844 repeat:
845 rq = task_rq_lock(p, &flags); 845 rq = task_rq_lock(p, &flags);
846 /* Must be off runqueue entirely, not preempted. */ 846 /* Must be off runqueue entirely, not preempted. */
847 if (unlikely(p->array || task_running(rq, p))) { 847 if (unlikely(p->array || task_running(rq, p))) {
848 /* If it's preempted, we yield. It could be a while. */ 848 /* If it's preempted, we yield. It could be a while. */
849 preempted = !task_running(rq, p); 849 preempted = !task_running(rq, p);
850 task_rq_unlock(rq, &flags); 850 task_rq_unlock(rq, &flags);
851 cpu_relax(); 851 cpu_relax();
852 if (preempted) 852 if (preempted)
853 yield(); 853 yield();
854 goto repeat; 854 goto repeat;
855 } 855 }
856 task_rq_unlock(rq, &flags); 856 task_rq_unlock(rq, &flags);
857 } 857 }
858 858
859 /*** 859 /***
860 * kick_process - kick a running thread to enter/exit the kernel 860 * kick_process - kick a running thread to enter/exit the kernel
861 * @p: the to-be-kicked thread 861 * @p: the to-be-kicked thread
862 * 862 *
863 * Cause a process which is running on another CPU to enter 863 * Cause a process which is running on another CPU to enter
864 * kernel-mode, without any delay. (to get signals handled.) 864 * kernel-mode, without any delay. (to get signals handled.)
865 * 865 *
866 * NOTE: this function doesn't have to take the runqueue lock, 866 * NOTE: this function doesn't have to take the runqueue lock,
867 * because all it wants to ensure is that the remote task enters 867 * because all it wants to ensure is that the remote task enters
868 * the kernel. If the IPI races and the task has been migrated 868 * the kernel. If the IPI races and the task has been migrated
869 * to another CPU then no harm is done and the purpose has been 869 * to another CPU then no harm is done and the purpose has been
870 * achieved as well. 870 * achieved as well.
871 */ 871 */
872 void kick_process(task_t *p) 872 void kick_process(task_t *p)
873 { 873 {
874 int cpu; 874 int cpu;
875 875
876 preempt_disable(); 876 preempt_disable();
877 cpu = task_cpu(p); 877 cpu = task_cpu(p);
878 if ((cpu != smp_processor_id()) && task_curr(p)) 878 if ((cpu != smp_processor_id()) && task_curr(p))
879 smp_send_reschedule(cpu); 879 smp_send_reschedule(cpu);
880 preempt_enable(); 880 preempt_enable();
881 } 881 }
882 882
883 /* 883 /*
884 * Return a low guess at the load of a migration-source cpu. 884 * Return a low guess at the load of a migration-source cpu.
885 * 885 *
886 * We want to under-estimate the load of migration sources, to 886 * We want to under-estimate the load of migration sources, to
887 * balance conservatively. 887 * balance conservatively.
888 */ 888 */
889 static inline unsigned long source_load(int cpu) 889 static inline unsigned long source_load(int cpu)
890 { 890 {
891 runqueue_t *rq = cpu_rq(cpu); 891 runqueue_t *rq = cpu_rq(cpu);
892 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 892 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
893 893
894 return min(rq->cpu_load, load_now); 894 return min(rq->cpu_load, load_now);
895 } 895 }
896 896
897 /* 897 /*
898 * Return a high guess at the load of a migration-target cpu 898 * Return a high guess at the load of a migration-target cpu
899 */ 899 */
900 static inline unsigned long target_load(int cpu) 900 static inline unsigned long target_load(int cpu)
901 { 901 {
902 runqueue_t *rq = cpu_rq(cpu); 902 runqueue_t *rq = cpu_rq(cpu);
903 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 903 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
904 904
905 return max(rq->cpu_load, load_now); 905 return max(rq->cpu_load, load_now);
906 } 906 }
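
A worked example of why the pair is asymmetric, with SCHED_LOAD_SCALE treated as an abstract unit; the value 128 below is only a placeholder for illustration, not taken from this hunk:

#include <assert.h>

#define SCHED_LOAD_SCALE 128UL		/* placeholder unit, assumption */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

static unsigned long max_ul(unsigned long a, unsigned long b)
{
	return a > b ? a : b;
}

int main(void)
{
	unsigned long cpu_load = 1 * SCHED_LOAD_SCALE;	/* decayed history */
	unsigned long load_now = 3 * SCHED_LOAD_SCALE;	/* 3 runnable right now */

	/* A momentary spike is under-reported when the cpu is a source... */
	assert(min_ul(cpu_load, load_now) == 1 * SCHED_LOAD_SCALE);
	/* ...and over-reported when it is a target, so balancing stays cautious. */
	assert(max_ul(cpu_load, load_now) == 3 * SCHED_LOAD_SCALE);
	return 0;
}
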
907 907
908 #endif 908 #endif
909 909
910 /* 910 /*
911 * wake_idle() will wake a task on an idle cpu if task->cpu is 911 * wake_idle() will wake a task on an idle cpu if task->cpu is
912 * not idle and an idle cpu is available. The span of cpus to 912 * not idle and an idle cpu is available. The span of cpus to
913 * search starts with the closest cpus and widens outward as needed, 913 * search starts with the closest cpus and widens outward as needed,
914 * so we always favor a closer, idle cpu. 914 * so we always favor a closer, idle cpu.
915 * 915 *
916 * Returns the CPU we should wake onto. 916 * Returns the CPU we should wake onto.
917 */ 917 */
918 #if defined(ARCH_HAS_SCHED_WAKE_IDLE) 918 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
919 static int wake_idle(int cpu, task_t *p) 919 static int wake_idle(int cpu, task_t *p)
920 { 920 {
921 cpumask_t tmp; 921 cpumask_t tmp;
922 struct sched_domain *sd; 922 struct sched_domain *sd;
923 int i; 923 int i;
924 924
925 if (idle_cpu(cpu)) 925 if (idle_cpu(cpu))
926 return cpu; 926 return cpu;
927 927
928 for_each_domain(cpu, sd) { 928 for_each_domain(cpu, sd) {
929 if (sd->flags & SD_WAKE_IDLE) { 929 if (sd->flags & SD_WAKE_IDLE) {
930 cpus_and(tmp, sd->span, cpu_online_map); 930 cpus_and(tmp, sd->span, cpu_online_map);
931 cpus_and(tmp, tmp, p->cpus_allowed); 931 cpus_and(tmp, tmp, p->cpus_allowed);
932 for_each_cpu_mask(i, tmp) { 932 for_each_cpu_mask(i, tmp) {
933 if (idle_cpu(i)) 933 if (idle_cpu(i))
934 return i; 934 return i;
935 } 935 }
936 } 936 }
937 else break; 937 else break;
938 } 938 }
939 return cpu; 939 return cpu;
940 } 940 }
941 #else 941 #else
942 static inline int wake_idle(int cpu, task_t *p) 942 static inline int wake_idle(int cpu, task_t *p)
943 { 943 {
944 return cpu; 944 return cpu;
945 } 945 }
946 #endif 946 #endif
947 947
948 /*** 948 /***
949 * try_to_wake_up - wake up a thread 949 * try_to_wake_up - wake up a thread
950 * @p: the to-be-woken-up thread 950 * @p: the to-be-woken-up thread
951 * @state: the mask of task states that can be woken 951 * @state: the mask of task states that can be woken
952 * @sync: do a synchronous wakeup? 952 * @sync: do a synchronous wakeup?
953 * 953 *
954 * Put it on the run-queue if it's not already there. The "current" 954 * Put it on the run-queue if it's not already there. The "current"
955 * thread is always on the run-queue (except when the actual 955 * thread is always on the run-queue (except when the actual
956 * re-schedule is in progress), and as such you're allowed to do 956 * re-schedule is in progress), and as such you're allowed to do
957 * the simpler "current->state = TASK_RUNNING" to mark yourself 957 * the simpler "current->state = TASK_RUNNING" to mark yourself
958 * runnable without the overhead of this. 958 * runnable without the overhead of this.
959 * 959 *
960 * returns failure only if the task is already active. 960 * returns failure only if the task is already active.
961 */ 961 */
962 static int try_to_wake_up(task_t * p, unsigned int state, int sync) 962 static int try_to_wake_up(task_t * p, unsigned int state, int sync)
963 { 963 {
964 int cpu, this_cpu, success = 0; 964 int cpu, this_cpu, success = 0;
965 unsigned long flags; 965 unsigned long flags;
966 long old_state; 966 long old_state;
967 runqueue_t *rq; 967 runqueue_t *rq;
968 #ifdef CONFIG_SMP 968 #ifdef CONFIG_SMP
969 unsigned long load, this_load; 969 unsigned long load, this_load;
970 struct sched_domain *sd; 970 struct sched_domain *sd;
971 int new_cpu; 971 int new_cpu;
972 #endif 972 #endif
973 973
974 rq = task_rq_lock(p, &flags); 974 rq = task_rq_lock(p, &flags);
975 old_state = p->state; 975 old_state = p->state;
976 if (!(old_state & state)) 976 if (!(old_state & state))
977 goto out; 977 goto out;
978 978
979 if (p->array) 979 if (p->array)
980 goto out_running; 980 goto out_running;
981 981
982 cpu = task_cpu(p); 982 cpu = task_cpu(p);
983 this_cpu = smp_processor_id(); 983 this_cpu = smp_processor_id();
984 984
985 #ifdef CONFIG_SMP 985 #ifdef CONFIG_SMP
986 if (unlikely(task_running(rq, p))) 986 if (unlikely(task_running(rq, p)))
987 goto out_activate; 987 goto out_activate;
988 988
989 #ifdef CONFIG_SCHEDSTATS 989 #ifdef CONFIG_SCHEDSTATS
990 schedstat_inc(rq, ttwu_cnt); 990 schedstat_inc(rq, ttwu_cnt);
991 if (cpu == this_cpu) { 991 if (cpu == this_cpu) {
992 schedstat_inc(rq, ttwu_local); 992 schedstat_inc(rq, ttwu_local);
993 } else { 993 } else {
994 for_each_domain(this_cpu, sd) { 994 for_each_domain(this_cpu, sd) {
995 if (cpu_isset(cpu, sd->span)) { 995 if (cpu_isset(cpu, sd->span)) {
996 schedstat_inc(sd, ttwu_wake_remote); 996 schedstat_inc(sd, ttwu_wake_remote);
997 break; 997 break;
998 } 998 }
999 } 999 }
1000 } 1000 }
1001 #endif 1001 #endif
1002 1002
1003 new_cpu = cpu; 1003 new_cpu = cpu;
1004 if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) 1004 if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1005 goto out_set_cpu; 1005 goto out_set_cpu;
1006 1006
1007 load = source_load(cpu); 1007 load = source_load(cpu);
1008 this_load = target_load(this_cpu); 1008 this_load = target_load(this_cpu);
1009 1009
1010 /* 1010 /*
1011 * If sync wakeup then subtract the (maximum possible) effect of 1011 * If sync wakeup then subtract the (maximum possible) effect of
1012 * the currently running task from the load of the current CPU: 1012 * the currently running task from the load of the current CPU:
1013 */ 1013 */
1014 if (sync) 1014 if (sync)
1015 this_load -= SCHED_LOAD_SCALE; 1015 this_load -= SCHED_LOAD_SCALE;
1016 1016
1017 /* Don't pull the task off an idle CPU to a busy one */ 1017 /* Don't pull the task off an idle CPU to a busy one */
1018 if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2) 1018 if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
1019 goto out_set_cpu; 1019 goto out_set_cpu;
1020 1020
1021 new_cpu = this_cpu; /* Wake to this CPU if we can */ 1021 new_cpu = this_cpu; /* Wake to this CPU if we can */
1022 1022
1023 /* 1023 /*
1024 * Scan domains for affine wakeup and passive balancing 1024 * Scan domains for affine wakeup and passive balancing
1025 * possibilities. 1025 * possibilities.
1026 */ 1026 */
1027 for_each_domain(this_cpu, sd) { 1027 for_each_domain(this_cpu, sd) {
1028 unsigned int imbalance; 1028 unsigned int imbalance;
1029 /* 1029 /*
1030 * Start passive balancing when half the imbalance_pct 1030 * Start passive balancing when half the imbalance_pct
1031 * limit is reached. 1031 * limit is reached.
1032 */ 1032 */
1033 imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2; 1033 imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
1034 1034
1035 if ((sd->flags & SD_WAKE_AFFINE) && 1035 if ((sd->flags & SD_WAKE_AFFINE) &&
1036 !task_hot(p, rq->timestamp_last_tick, sd)) { 1036 !task_hot(p, rq->timestamp_last_tick, sd)) {
1037 /* 1037 /*
1038 * This domain has SD_WAKE_AFFINE and p is cache cold 1038 * This domain has SD_WAKE_AFFINE and p is cache cold
1039 * in this domain. 1039 * in this domain.
1040 */ 1040 */
1041 if (cpu_isset(cpu, sd->span)) { 1041 if (cpu_isset(cpu, sd->span)) {
1042 schedstat_inc(sd, ttwu_move_affine); 1042 schedstat_inc(sd, ttwu_move_affine);
1043 goto out_set_cpu; 1043 goto out_set_cpu;
1044 } 1044 }
1045 } else if ((sd->flags & SD_WAKE_BALANCE) && 1045 } else if ((sd->flags & SD_WAKE_BALANCE) &&
1046 imbalance*this_load <= 100*load) { 1046 imbalance*this_load <= 100*load) {
1047 /* 1047 /*
1048 * This domain has SD_WAKE_BALANCE and there is 1048 * This domain has SD_WAKE_BALANCE and there is
1049 * an imbalance. 1049 * an imbalance.
1050 */ 1050 */
1051 if (cpu_isset(cpu, sd->span)) { 1051 if (cpu_isset(cpu, sd->span)) {
1052 schedstat_inc(sd, ttwu_move_balance); 1052 schedstat_inc(sd, ttwu_move_balance);
1053 goto out_set_cpu; 1053 goto out_set_cpu;
1054 } 1054 }
1055 } 1055 }
1056 } 1056 }
1057 1057
1058 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ 1058 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1059 out_set_cpu: 1059 out_set_cpu:
1060 new_cpu = wake_idle(new_cpu, p); 1060 new_cpu = wake_idle(new_cpu, p);
1061 if (new_cpu != cpu) { 1061 if (new_cpu != cpu) {
1062 set_task_cpu(p, new_cpu); 1062 set_task_cpu(p, new_cpu);
1063 task_rq_unlock(rq, &flags); 1063 task_rq_unlock(rq, &flags);
1064 /* might preempt at this point */ 1064 /* might preempt at this point */
1065 rq = task_rq_lock(p, &flags); 1065 rq = task_rq_lock(p, &flags);
1066 old_state = p->state; 1066 old_state = p->state;
1067 if (!(old_state & state)) 1067 if (!(old_state & state))
1068 goto out; 1068 goto out;
1069 if (p->array) 1069 if (p->array)
1070 goto out_running; 1070 goto out_running;
1071 1071
1072 this_cpu = smp_processor_id(); 1072 this_cpu = smp_processor_id();
1073 cpu = task_cpu(p); 1073 cpu = task_cpu(p);
1074 } 1074 }
1075 1075
1076 out_activate: 1076 out_activate:
1077 #endif /* CONFIG_SMP */ 1077 #endif /* CONFIG_SMP */
1078 if (old_state == TASK_UNINTERRUPTIBLE) { 1078 if (old_state == TASK_UNINTERRUPTIBLE) {
1079 rq->nr_uninterruptible--; 1079 rq->nr_uninterruptible--;
1080 /* 1080 /*
1081 * Tasks on involuntary sleep don't earn 1081 * Tasks on involuntary sleep don't earn
1082 * sleep_avg beyond just interactive state. 1082 * sleep_avg beyond just interactive state.
1083 */ 1083 */
1084 p->activated = -1; 1084 p->activated = -1;
1085 } 1085 }
1086 1086
1087 /* 1087 /*
1088 * Sync wakeups (i.e. those types of wakeups where the waker 1088 * Sync wakeups (i.e. those types of wakeups where the waker
1089 * has indicated that it will leave the CPU in short order) 1089 * has indicated that it will leave the CPU in short order)
1090 * don't trigger a preemption, if the woken up task will run on 1090 * don't trigger a preemption, if the woken up task will run on
1091 * this cpu. (in this case the 'I will reschedule' promise of 1091 * this cpu. (in this case the 'I will reschedule' promise of
1092 * the waker guarantees that the freshly woken up task is going 1092 * the waker guarantees that the freshly woken up task is going
1093 * to be considered on this CPU.) 1093 * to be considered on this CPU.)
1094 */ 1094 */
1095 activate_task(p, rq, cpu == this_cpu); 1095 activate_task(p, rq, cpu == this_cpu);
1096 if (!sync || cpu != this_cpu) { 1096 if (!sync || cpu != this_cpu) {
1097 if (TASK_PREEMPTS_CURR(p, rq)) 1097 if (TASK_PREEMPTS_CURR(p, rq))
1098 resched_task(rq->curr); 1098 resched_task(rq->curr);
1099 } 1099 }
1100 success = 1; 1100 success = 1;
1101 1101
1102 out_running: 1102 out_running:
1103 p->state = TASK_RUNNING; 1103 p->state = TASK_RUNNING;
1104 out: 1104 out:
1105 task_rq_unlock(rq, &flags); 1105 task_rq_unlock(rq, &flags);
1106 1106
1107 return success; 1107 return success;
1108 } 1108 }
1109 1109
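The SD_WAKE_BALANCE test above boils down to a percentage threshold: the woken task is pulled over to the waking CPU only if its old CPU carries enough extra load (and the domain spans both CPUs). A small user-space sketch of just that arithmetic, assuming an illustrative imbalance_pct of 125; none of this is taken from the patch itself:

/*
 * Illustrative only: the "half the imbalance_pct limit" comparison from
 * try_to_wake_up().  load/this_load mirror the variables above.
 */
#include <stdio.h>

static int wake_balance_pulls(unsigned long load, unsigned long this_load,
                              unsigned int imbalance_pct)
{
        /* start passive balancing when half the imbalance_pct limit is hit */
        unsigned int imbalance = imbalance_pct + (imbalance_pct - 100) / 2;

        return imbalance * this_load <= 100 * load;
}

int main(void)
{
        /* e.g. old CPU load 1500, waker's (sync-adjusted) load 1000 */
        printf("pull to waker's CPU: %d\n", wake_balance_pulls(1500, 1000, 125));
        return 0;
}

With these example numbers the old CPU is 50% busier than the waker's, which clears the ~37% threshold implied by an imbalance_pct of 125, so the wakeup would migrate the task.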
1110 int fastcall wake_up_process(task_t * p) 1110 int fastcall wake_up_process(task_t * p)
1111 { 1111 {
1112 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | 1112 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1113 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); 1113 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1114 } 1114 }
1115 1115
1116 EXPORT_SYMBOL(wake_up_process); 1116 EXPORT_SYMBOL(wake_up_process);
1117 1117
1118 int fastcall wake_up_state(task_t *p, unsigned int state) 1118 int fastcall wake_up_state(task_t *p, unsigned int state)
1119 { 1119 {
1120 return try_to_wake_up(p, state, 0); 1120 return try_to_wake_up(p, state, 0);
1121 } 1121 }
1122 1122
1123 #ifdef CONFIG_SMP 1123 #ifdef CONFIG_SMP
1124 static int find_idlest_cpu(struct task_struct *p, int this_cpu, 1124 static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1125 struct sched_domain *sd); 1125 struct sched_domain *sd);
1126 #endif 1126 #endif
1127 1127
1128 /* 1128 /*
1129 * Perform scheduler related setup for a newly forked process p. 1129 * Perform scheduler related setup for a newly forked process p.
1130 * p is forked by current. 1130 * p is forked by current.
1131 */ 1131 */
1132 void fastcall sched_fork(task_t *p) 1132 void fastcall sched_fork(task_t *p)
1133 { 1133 {
1134 /* 1134 /*
1135 * We mark the process as running here, but have not actually 1135 * We mark the process as running here, but have not actually
1136 * inserted it onto the runqueue yet. This guarantees that 1136 * inserted it onto the runqueue yet. This guarantees that
1137 * nobody will actually run it, and a signal or other external 1137 * nobody will actually run it, and a signal or other external
1138 * event cannot wake it up and insert it on the runqueue either. 1138 * event cannot wake it up and insert it on the runqueue either.
1139 */ 1139 */
1140 p->state = TASK_RUNNING; 1140 p->state = TASK_RUNNING;
1141 INIT_LIST_HEAD(&p->run_list); 1141 INIT_LIST_HEAD(&p->run_list);
1142 p->array = NULL; 1142 p->array = NULL;
1143 spin_lock_init(&p->switch_lock); 1143 spin_lock_init(&p->switch_lock);
1144 #ifdef CONFIG_SCHEDSTATS 1144 #ifdef CONFIG_SCHEDSTATS
1145 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1145 memset(&p->sched_info, 0, sizeof(p->sched_info));
1146 #endif 1146 #endif
1147 #ifdef CONFIG_PREEMPT 1147 #ifdef CONFIG_PREEMPT
1148 /* 1148 /*
1149 * During context-switch we hold precisely one spinlock, which 1149 * During context-switch we hold precisely one spinlock, which
1150 * schedule_tail drops. (in the common case it's this_rq()->lock, 1150 * schedule_tail drops. (in the common case it's this_rq()->lock,
1151 * but it also can be p->switch_lock.) So we compensate with a count 1151 * but it also can be p->switch_lock.) So we compensate with a count
1152 * of 1. Also, we want to start with kernel preemption disabled. 1152 * of 1. Also, we want to start with kernel preemption disabled.
1153 */ 1153 */
1154 p->thread_info->preempt_count = 1; 1154 p->thread_info->preempt_count = 1;
1155 #endif 1155 #endif
1156 /* 1156 /*
1157 * Share the timeslice between parent and child, thus the 1157 * Share the timeslice between parent and child, thus the
1158 * total amount of pending timeslices in the system doesn't change, 1158 * total amount of pending timeslices in the system doesn't change,
1159 * resulting in more scheduling fairness. 1159 * resulting in more scheduling fairness.
1160 */ 1160 */
1161 local_irq_disable(); 1161 local_irq_disable();
1162 p->time_slice = (current->time_slice + 1) >> 1; 1162 p->time_slice = (current->time_slice + 1) >> 1;
1163 /* 1163 /*
1164 * The remainder of the first timeslice might be recovered by 1164 * The remainder of the first timeslice might be recovered by
1165 * the parent if the child exits early enough. 1165 * the parent if the child exits early enough.
1166 */ 1166 */
1167 p->first_time_slice = 1; 1167 p->first_time_slice = 1;
1168 current->time_slice >>= 1; 1168 current->time_slice >>= 1;
1169 p->timestamp = sched_clock(); 1169 p->timestamp = sched_clock();
1170 if (unlikely(!current->time_slice)) { 1170 if (unlikely(!current->time_slice)) {
1171 /* 1171 /*
1172 * This case is rare, it happens when the parent has only 1172 * This case is rare, it happens when the parent has only
1173 * a single jiffy left from its timeslice. Taking the 1173 * a single jiffy left from its timeslice. Taking the
1174 * runqueue lock is not a problem. 1174 * runqueue lock is not a problem.
1175 */ 1175 */
1176 current->time_slice = 1; 1176 current->time_slice = 1;
1177 preempt_disable(); 1177 preempt_disable();
1178 scheduler_tick(); 1178 scheduler_tick();
1179 local_irq_enable(); 1179 local_irq_enable();
1180 preempt_enable(); 1180 preempt_enable();
1181 } else 1181 } else
1182 local_irq_enable(); 1182 local_irq_enable();
1183 } 1183 }
1184 1184
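The timeslice split in sched_fork() rounds in the child's favour but never creates time out of thin air: the child's and parent's slices always add back up to the parent's original slice. A quick user-space check of that, with made-up slice lengths (not taken from the patch):

/* Illustrative only: the (s + 1) >> 1 vs s >> 1 split preserves the total. */
#include <assert.h>
#include <stdio.h>

int main(void)
{
        unsigned int slice, child, parent;

        for (slice = 1; slice <= 10; slice++) {
                child  = (slice + 1) >> 1;      /* p->time_slice */
                parent = slice >> 1;            /* current->time_slice */
                assert(child + parent == slice);
                printf("slice %2u -> child %u, parent %u\n", slice, child, parent);
        }
        return 0;
}

slice == 1 is the case where the parent is left with 0: that is exactly the unlikely() branch above, which gives the parent one jiffy back and lets scheduler_tick() expire it cleanly.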
1185 /* 1185 /*
1186 * wake_up_new_task - wake up a newly created task for the first time. 1186 * wake_up_new_task - wake up a newly created task for the first time.
1187 * 1187 *
1188 * This function will do some initial scheduler statistics housekeeping 1188 * This function will do some initial scheduler statistics housekeeping
1189 * that must be done for every newly created context, then puts the task 1189 * that must be done for every newly created context, then puts the task
1190 * on the runqueue and wakes it. 1190 * on the runqueue and wakes it.
1191 */ 1191 */
1192 void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) 1192 void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
1193 { 1193 {
1194 unsigned long flags; 1194 unsigned long flags;
1195 int this_cpu, cpu; 1195 int this_cpu, cpu;
1196 runqueue_t *rq, *this_rq; 1196 runqueue_t *rq, *this_rq;
1197 1197
1198 rq = task_rq_lock(p, &flags); 1198 rq = task_rq_lock(p, &flags);
1199 cpu = task_cpu(p); 1199 cpu = task_cpu(p);
1200 this_cpu = smp_processor_id(); 1200 this_cpu = smp_processor_id();
1201 1201
1202 BUG_ON(p->state != TASK_RUNNING); 1202 BUG_ON(p->state != TASK_RUNNING);
1203 1203
1204 /* 1204 /*
1205 * We decrease the sleep average of forking parents 1205 * We decrease the sleep average of forking parents
1206 * and children as well, to keep max-interactive tasks 1206 * and children as well, to keep max-interactive tasks
1207 * from forking tasks that are max-interactive. The parent 1207 * from forking tasks that are max-interactive. The parent
1208 * (current) is done further down, under its lock. 1208 * (current) is done further down, under its lock.
1209 */ 1209 */
1210 p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * 1210 p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
1211 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); 1211 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1212 1212
1213 p->prio = effective_prio(p); 1213 p->prio = effective_prio(p);
1214 1214
1215 if (likely(cpu == this_cpu)) { 1215 if (likely(cpu == this_cpu)) {
1216 if (!(clone_flags & CLONE_VM)) { 1216 if (!(clone_flags & CLONE_VM)) {
1217 /* 1217 /*
1218 * The VM isn't cloned, so we're in a good position to 1218 * The VM isn't cloned, so we're in a good position to
1219 * do child-runs-first in anticipation of an exec. This 1219 * do child-runs-first in anticipation of an exec. This
1220 * usually avoids a lot of COW overhead. 1220 * usually avoids a lot of COW overhead.
1221 */ 1221 */
1222 if (unlikely(!current->array)) 1222 if (unlikely(!current->array))
1223 __activate_task(p, rq); 1223 __activate_task(p, rq);
1224 else { 1224 else {
1225 p->prio = current->prio; 1225 p->prio = current->prio;
1226 list_add_tail(&p->run_list, &current->run_list); 1226 list_add_tail(&p->run_list, &current->run_list);
1227 p->array = current->array; 1227 p->array = current->array;
1228 p->array->nr_active++; 1228 p->array->nr_active++;
1229 rq->nr_running++; 1229 rq->nr_running++;
1230 } 1230 }
1231 set_need_resched(); 1231 set_need_resched();
1232 } else 1232 } else
1233 /* Run child last */ 1233 /* Run child last */
1234 __activate_task(p, rq); 1234 __activate_task(p, rq);
1235 /* 1235 /*
1236 * We skip the following code due to cpu == this_cpu 1236 * We skip the following code due to cpu == this_cpu
1237 * 1237 *
1238 * task_rq_unlock(rq, &flags); 1238 * task_rq_unlock(rq, &flags);
1239 * this_rq = task_rq_lock(current, &flags); 1239 * this_rq = task_rq_lock(current, &flags);
1240 */ 1240 */
1241 this_rq = rq; 1241 this_rq = rq;
1242 } else { 1242 } else {
1243 this_rq = cpu_rq(this_cpu); 1243 this_rq = cpu_rq(this_cpu);
1244 1244
1245 /* 1245 /*
1246 * Not the local CPU - must adjust timestamp. This should 1246 * Not the local CPU - must adjust timestamp. This should
1247 * get optimised away in the !CONFIG_SMP case. 1247 * get optimised away in the !CONFIG_SMP case.
1248 */ 1248 */
1249 p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) 1249 p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
1250 + rq->timestamp_last_tick; 1250 + rq->timestamp_last_tick;
1251 __activate_task(p, rq); 1251 __activate_task(p, rq);
1252 if (TASK_PREEMPTS_CURR(p, rq)) 1252 if (TASK_PREEMPTS_CURR(p, rq))
1253 resched_task(rq->curr); 1253 resched_task(rq->curr);
1254 1254
1255 /* 1255 /*
1256 * Parent and child are on different CPUs, now get the 1256 * Parent and child are on different CPUs, now get the
1257 * parent runqueue to update the parent's ->sleep_avg: 1257 * parent runqueue to update the parent's ->sleep_avg:
1258 */ 1258 */
1259 task_rq_unlock(rq, &flags); 1259 task_rq_unlock(rq, &flags);
1260 this_rq = task_rq_lock(current, &flags); 1260 this_rq = task_rq_lock(current, &flags);
1261 } 1261 }
1262 current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * 1262 current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
1263 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); 1263 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1264 task_rq_unlock(this_rq, &flags); 1264 task_rq_unlock(this_rq, &flags);
1265 } 1265 }
1266 1266
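The sleep_avg assignment above is mostly integer scaling: the child's interactivity bonus is cut to a percentage (CHILD_PENALTY) and mapped back into sleep_avg units. A rough user-space model of the shape of that arithmetic; the constants below are stand-ins for illustration, not the values from this sched.c, and JIFFIES_TO_NS is omitted:

/* Illustrative only: how the bonus -> sleep_avg rescaling truncates. */
#include <stdio.h>

#define MAX_BONUS       10              /* assumed for this sketch */
#define MAX_SLEEP_AVG   10000000UL      /* assumed; units don't matter here */
#define CHILD_PENALTY   95              /* assumed percentage */

int main(void)
{
        unsigned int bonus;

        for (bonus = 0; bonus <= MAX_BONUS; bonus++)
                printf("bonus %2u -> child sleep_avg %lu\n", bonus,
                       bonus * CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
        return 0;
}

Because every division truncates, the child can only lose credit here, which is the "decrease" the comment above refers to.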
1267 /* 1267 /*
1268 * Potentially available exiting-child timeslices are 1268 * Potentially available exiting-child timeslices are
1269 * retrieved here - this way the parent does not get 1269 * retrieved here - this way the parent does not get
1270 * penalized for creating too many threads. 1270 * penalized for creating too many threads.
1271 * 1271 *
1272 * (this cannot be used to 'generate' timeslices 1272 * (this cannot be used to 'generate' timeslices
1273 * artificially, because any timeslice recovered here 1273 * artificially, because any timeslice recovered here
1274 * was given away by the parent in the first place.) 1274 * was given away by the parent in the first place.)
1275 */ 1275 */
1276 void fastcall sched_exit(task_t * p) 1276 void fastcall sched_exit(task_t * p)
1277 { 1277 {
1278 unsigned long flags; 1278 unsigned long flags;
1279 runqueue_t *rq; 1279 runqueue_t *rq;
1280 1280
1281 /* 1281 /*
1282 * If the child was a (relative-) CPU hog then decrease 1282 * If the child was a (relative-) CPU hog then decrease
1283 * the sleep_avg of the parent as well. 1283 * the sleep_avg of the parent as well.
1284 */ 1284 */
1285 rq = task_rq_lock(p->parent, &flags); 1285 rq = task_rq_lock(p->parent, &flags);
1286 if (p->first_time_slice) { 1286 if (p->first_time_slice) {
1287 p->parent->time_slice += p->time_slice; 1287 p->parent->time_slice += p->time_slice;
1288 if (unlikely(p->parent->time_slice > task_timeslice(p))) 1288 if (unlikely(p->parent->time_slice > task_timeslice(p)))
1289 p->parent->time_slice = task_timeslice(p); 1289 p->parent->time_slice = task_timeslice(p);
1290 } 1290 }
1291 if (p->sleep_avg < p->parent->sleep_avg) 1291 if (p->sleep_avg < p->parent->sleep_avg)
1292 p->parent->sleep_avg = p->parent->sleep_avg / 1292 p->parent->sleep_avg = p->parent->sleep_avg /
1293 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / 1293 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
1294 (EXIT_WEIGHT + 1); 1294 (EXIT_WEIGHT + 1);
1295 task_rq_unlock(rq, &flags); 1295 task_rq_unlock(rq, &flags);
1296 } 1296 }
1297 1297
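The parent/child blend in sched_exit() is a plain weighted average, weighted EXIT_WEIGHT-to-1 in the parent's favour. A user-space rendering with an assumed EXIT_WEIGHT of 3 and invented sleep_avg values (not taken from the patch):

/* Illustrative only: the weighted average applied when a CPU-hog child exits. */
#include <stdio.h>

#define EXIT_WEIGHT 3   /* assumed for illustration */

int main(void)
{
        unsigned long parent = 8000000, child = 4000000;

        if (child < parent)
                parent = parent / (EXIT_WEIGHT + 1) * EXIT_WEIGHT +
                         child / (EXIT_WEIGHT + 1);

        printf("parent sleep_avg after exit: %lu\n", parent);   /* 7000000 */
        return 0;
}

So a child with half the parent's sleep_avg drags the parent down by an eighth in this example.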
1298 /** 1298 /**
1299 * finish_task_switch - clean up after a task-switch 1299 * finish_task_switch - clean up after a task-switch
1300 * @prev: the thread we just switched away from. 1300 * @prev: the thread we just switched away from.
1301 * 1301 *
1302 * We enter this with the runqueue still locked, and finish_arch_switch() 1302 * We enter this with the runqueue still locked, and finish_arch_switch()
1303 * will unlock it along with doing any other architecture-specific cleanup 1303 * will unlock it along with doing any other architecture-specific cleanup
1304 * actions. 1304 * actions.
1305 * 1305 *
1306 * Note that we may have delayed dropping an mm in context_switch(). If 1306 * Note that we may have delayed dropping an mm in context_switch(). If
1307 * so, we finish that here outside of the runqueue lock. (Doing it 1307 * so, we finish that here outside of the runqueue lock. (Doing it
1308 * with the lock held can cause deadlocks; see schedule() for 1308 * with the lock held can cause deadlocks; see schedule() for
1309 * details.) 1309 * details.)
1310 */ 1310 */
1311 static inline void finish_task_switch(task_t *prev) 1311 static inline void finish_task_switch(task_t *prev)
1312 __releases(rq->lock) 1312 __releases(rq->lock)
1313 { 1313 {
1314 runqueue_t *rq = this_rq(); 1314 runqueue_t *rq = this_rq();
1315 struct mm_struct *mm = rq->prev_mm; 1315 struct mm_struct *mm = rq->prev_mm;
1316 unsigned long prev_task_flags; 1316 unsigned long prev_task_flags;
1317 1317
1318 rq->prev_mm = NULL; 1318 rq->prev_mm = NULL;
1319 1319
1320 /* 1320 /*
1321 * A task struct has one reference for the use as "current". 1321 * A task struct has one reference for the use as "current".
1322 * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and 1322 * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and
1323 * calls schedule one last time. The schedule call will never return, 1323 * calls schedule one last time. The schedule call will never return,
1324 * and the scheduled task must drop that reference. 1324 * and the scheduled task must drop that reference.
1325 * The test for EXIT_ZOMBIE must occur while the runqueue locks are 1325 * The test for EXIT_ZOMBIE must occur while the runqueue locks are
1326 * still held, otherwise prev could be scheduled on another cpu, die 1326 * still held, otherwise prev could be scheduled on another cpu, die
1327 * there before we look at prev->state, and then the reference would 1327 * there before we look at prev->state, and then the reference would
1328 * be dropped twice. 1328 * be dropped twice.
1329 * Manfred Spraul <manfred@colorfullife.com> 1329 * Manfred Spraul <manfred@colorfullife.com>
1330 */ 1330 */
1331 prev_task_flags = prev->flags; 1331 prev_task_flags = prev->flags;
1332 finish_arch_switch(rq, prev); 1332 finish_arch_switch(rq, prev);
1333 if (mm) 1333 if (mm)
1334 mmdrop(mm); 1334 mmdrop(mm);
1335 if (unlikely(prev_task_flags & PF_DEAD)) 1335 if (unlikely(prev_task_flags & PF_DEAD))
1336 put_task_struct(prev); 1336 put_task_struct(prev);
1337 } 1337 }
1338 1338
1339 /** 1339 /**
1340 * schedule_tail - first thing a freshly forked thread must call. 1340 * schedule_tail - first thing a freshly forked thread must call.
1341 * @prev: the thread we just switched away from. 1341 * @prev: the thread we just switched away from.
1342 */ 1342 */
1343 asmlinkage void schedule_tail(task_t *prev) 1343 asmlinkage void schedule_tail(task_t *prev)
1344 __releases(rq->lock) 1344 __releases(rq->lock)
1345 { 1345 {
1346 finish_task_switch(prev); 1346 finish_task_switch(prev);
1347 1347
1348 if (current->set_child_tid) 1348 if (current->set_child_tid)
1349 put_user(current->pid, current->set_child_tid); 1349 put_user(current->pid, current->set_child_tid);
1350 } 1350 }
1351 1351
1352 /* 1352 /*
1353 * context_switch - switch to the new MM and the new 1353 * context_switch - switch to the new MM and the new
1354 * thread's register state. 1354 * thread's register state.
1355 */ 1355 */
1356 static inline 1356 static inline
1357 task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) 1357 task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
1358 { 1358 {
1359 struct mm_struct *mm = next->mm; 1359 struct mm_struct *mm = next->mm;
1360 struct mm_struct *oldmm = prev->active_mm; 1360 struct mm_struct *oldmm = prev->active_mm;
1361 1361
1362 if (unlikely(!mm)) { 1362 if (unlikely(!mm)) {
1363 next->active_mm = oldmm; 1363 next->active_mm = oldmm;
1364 atomic_inc(&oldmm->mm_count); 1364 atomic_inc(&oldmm->mm_count);
1365 enter_lazy_tlb(oldmm, next); 1365 enter_lazy_tlb(oldmm, next);
1366 } else 1366 } else
1367 switch_mm(oldmm, mm, next); 1367 switch_mm(oldmm, mm, next);
1368 1368
1369 if (unlikely(!prev->mm)) { 1369 if (unlikely(!prev->mm)) {
1370 prev->active_mm = NULL; 1370 prev->active_mm = NULL;
1371 WARN_ON(rq->prev_mm); 1371 WARN_ON(rq->prev_mm);
1372 rq->prev_mm = oldmm; 1372 rq->prev_mm = oldmm;
1373 } 1373 }
1374 1374
1375 /* Here we just switch the register state and the stack. */ 1375 /* Here we just switch the register state and the stack. */
1376 switch_to(prev, next, prev); 1376 switch_to(prev, next, prev);
1377 1377
1378 return prev; 1378 return prev;
1379 } 1379 }
1380 1380
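context_switch() lets a kernel thread (mm == NULL) borrow the outgoing task's active_mm and takes a reference on it; finish_task_switch() above is where that reference is finally dropped, outside the runqueue lock. A toy user-space model of just the reference bookkeeping; the struct and helpers are invented for illustration and are not kernel APIs:

/* Illustrative only: the grab-on-borrow / drop-after-switch pairing. */
#include <assert.h>
#include <stdio.h>

struct toy_mm { int mm_count; };                        /* stand-in for mm_struct */

static void toy_grab(struct toy_mm *mm) { mm->mm_count++; }  /* atomic_inc() above */
static void toy_drop(struct toy_mm *mm) { mm->mm_count--; }  /* mmdrop() above */

int main(void)
{
        struct toy_mm user_mm = { 1 };  /* the user task's mm */
        struct toy_mm *prev_mm;

        /* user task -> kernel thread: the kernel thread borrows active_mm */
        toy_grab(&user_mm);

        /* kernel thread -> someone else: context_switch() parks the mm on the rq */
        prev_mm = &user_mm;             /* rq->prev_mm = oldmm */

        /* finish_task_switch() drops it once the runqueue lock is released */
        toy_drop(prev_mm);

        assert(user_mm.mm_count == 1);
        printf("mm_count back to %d\n", user_mm.mm_count);
        return 0;
}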
1381 /* 1381 /*
1382 * nr_running, nr_uninterruptible and nr_context_switches: 1382 * nr_running, nr_uninterruptible and nr_context_switches:
1383 * 1383 *
1384 * externally visible scheduler statistics: current number of runnable 1384 * externally visible scheduler statistics: current number of runnable
1385 * threads, current number of uninterruptible-sleeping threads, total 1385 * threads, current number of uninterruptible-sleeping threads, total
1386 * number of context switches performed since bootup. 1386 * number of context switches performed since bootup.
1387 */ 1387 */
1388 unsigned long nr_running(void) 1388 unsigned long nr_running(void)
1389 { 1389 {
1390 unsigned long i, sum = 0; 1390 unsigned long i, sum = 0;
1391 1391
1392 for_each_online_cpu(i) 1392 for_each_online_cpu(i)
1393 sum += cpu_rq(i)->nr_running; 1393 sum += cpu_rq(i)->nr_running;
1394 1394
1395 return sum; 1395 return sum;
1396 } 1396 }
1397 1397
1398 unsigned long nr_uninterruptible(void) 1398 unsigned long nr_uninterruptible(void)
1399 { 1399 {
1400 unsigned long i, sum = 0; 1400 unsigned long i, sum = 0;
1401 1401
1402 for_each_cpu(i) 1402 for_each_cpu(i)
1403 sum += cpu_rq(i)->nr_uninterruptible; 1403 sum += cpu_rq(i)->nr_uninterruptible;
1404 1404
1405 /* 1405 /*
1406 * Since we read the counters lockless, it might be slightly 1406 * Since we read the counters lockless, it might be slightly
1407 * inaccurate. Do not allow it to go below zero though: 1407 * inaccurate. Do not allow it to go below zero though:
1408 */ 1408 */
1409 if (unlikely((long)sum < 0)) 1409 if (unlikely((long)sum < 0))
1410 sum = 0; 1410 sum = 0;
1411 1411
1412 return sum; 1412 return sum;
1413 } 1413 }
1414 1414
1415 unsigned long long nr_context_switches(void) 1415 unsigned long long nr_context_switches(void)
1416 { 1416 {
1417 unsigned long long i, sum = 0; 1417 unsigned long long i, sum = 0;
1418 1418
1419 for_each_cpu(i) 1419 for_each_cpu(i)
1420 sum += cpu_rq(i)->nr_switches; 1420 sum += cpu_rq(i)->nr_switches;
1421 1421
1422 return sum; 1422 return sum;
1423 } 1423 }
1424 1424
1425 unsigned long nr_iowait(void) 1425 unsigned long nr_iowait(void)
1426 { 1426 {
1427 unsigned long i, sum = 0; 1427 unsigned long i, sum = 0;
1428 1428
1429 for_each_cpu(i) 1429 for_each_cpu(i)
1430 sum += atomic_read(&cpu_rq(i)->nr_iowait); 1430 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1431 1431
1432 return sum; 1432 return sum;
1433 } 1433 }
1434 1434
1435 #ifdef CONFIG_SMP 1435 #ifdef CONFIG_SMP
1436 1436
1437 /* 1437 /*
1438 * double_rq_lock - safely lock two runqueues 1438 * double_rq_lock - safely lock two runqueues
1439 * 1439 *
1440 * Note this does not disable interrupts like task_rq_lock, 1440 * Note this does not disable interrupts like task_rq_lock,
1441 * you need to do so manually before calling. 1441 * you need to do so manually before calling.
1442 */ 1442 */
1443 static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) 1443 static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1444 __acquires(rq1->lock) 1444 __acquires(rq1->lock)
1445 __acquires(rq2->lock) 1445 __acquires(rq2->lock)
1446 { 1446 {
1447 if (rq1 == rq2) { 1447 if (rq1 == rq2) {
1448 spin_lock(&rq1->lock); 1448 spin_lock(&rq1->lock);
1449 __acquire(rq2->lock); /* Fake it out ;) */ 1449 __acquire(rq2->lock); /* Fake it out ;) */
1450 } else { 1450 } else {
1451 if (rq1 < rq2) { 1451 if (rq1 < rq2) {
1452 spin_lock(&rq1->lock); 1452 spin_lock(&rq1->lock);
1453 spin_lock(&rq2->lock); 1453 spin_lock(&rq2->lock);
1454 } else { 1454 } else {
1455 spin_lock(&rq2->lock); 1455 spin_lock(&rq2->lock);
1456 spin_lock(&rq1->lock); 1456 spin_lock(&rq1->lock);
1457 } 1457 }
1458 } 1458 }
1459 } 1459 }
1460 1460
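double_rq_lock() and double_lock_balance() below rely on the same trick: when two runqueue locks are needed, always take the lower-addressed one first, so two CPUs grabbing the same pair can never deadlock against each other. The same idea with plain pthread mutexes, as an illustrative sketch rather than kernel code:

/* Illustrative only: address-ordered acquisition of a pair of locks. */
#include <pthread.h>

static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a == b) {
                pthread_mutex_lock(a);          /* only one lock to take */
        } else if (a < b) {                     /* same test as rq1 < rq2 above */
                pthread_mutex_lock(a);
                pthread_mutex_lock(b);
        } else {
                pthread_mutex_lock(b);
                pthread_mutex_lock(a);
        }
}

static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
        pthread_mutex_unlock(a);
        if (a != b)
                pthread_mutex_unlock(b);
}

int main(void)
{
        pthread_mutex_t rq1 = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_t rq2 = PTHREAD_MUTEX_INITIALIZER;

        /* either call order acquires the mutexes in the same underlying order */
        lock_pair(&rq1, &rq2);
        unlock_pair(&rq1, &rq2);
        lock_pair(&rq2, &rq1);
        unlock_pair(&rq2, &rq1);
        return 0;
}

Callers can name the two queues in either order; the acquisition order comes out the same, which is what rules out the AB-BA deadlock.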
1461 /* 1461 /*
1462 * double_rq_unlock - safely unlock two runqueues 1462 * double_rq_unlock - safely unlock two runqueues
1463 * 1463 *
1464 * Note this does not restore interrupts like task_rq_unlock, 1464 * Note this does not restore interrupts like task_rq_unlock,
1465 * you need to do so manually after calling. 1465 * you need to do so manually after calling.
1466 */ 1466 */
1467 static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) 1467 static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
1468 __releases(rq1->lock) 1468 __releases(rq1->lock)
1469 __releases(rq2->lock) 1469 __releases(rq2->lock)
1470 { 1470 {
1471 spin_unlock(&rq1->lock); 1471 spin_unlock(&rq1->lock);
1472 if (rq1 != rq2) 1472 if (rq1 != rq2)
1473 spin_unlock(&rq2->lock); 1473 spin_unlock(&rq2->lock);
1474 else 1474 else
1475 __release(rq2->lock); 1475 __release(rq2->lock);
1476 } 1476 }
1477 1477
1478 /* 1478 /*
1479 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 1479 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1480 */ 1480 */
1481 static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) 1481 static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1482 __releases(this_rq->lock) 1482 __releases(this_rq->lock)
1483 __acquires(busiest->lock) 1483 __acquires(busiest->lock)
1484 __acquires(this_rq->lock) 1484 __acquires(this_rq->lock)
1485 { 1485 {
1486 if (unlikely(!spin_trylock(&busiest->lock))) { 1486 if (unlikely(!spin_trylock(&busiest->lock))) {
1487 if (busiest < this_rq) { 1487 if (busiest < this_rq) {
1488 spin_unlock(&this_rq->lock); 1488 spin_unlock(&this_rq->lock);
1489 spin_lock(&busiest->lock); 1489 spin_lock(&busiest->lock);
1490 spin_lock(&this_rq->lock); 1490 spin_lock(&this_rq->lock);
1491 } else 1491 } else
1492 spin_lock(&busiest->lock); 1492 spin_lock(&busiest->lock);
1493 } 1493 }
1494 } 1494 }
1495 1495
1496 /* 1496 /*
1497 * find_idlest_cpu - find the least busy runqueue. 1497 * find_idlest_cpu - find the least busy runqueue.
1498 */ 1498 */
1499 static int find_idlest_cpu(struct task_struct *p, int this_cpu, 1499 static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1500 struct sched_domain *sd) 1500 struct sched_domain *sd)
1501 { 1501 {
1502 unsigned long load, min_load, this_load; 1502 unsigned long load, min_load, this_load;
1503 int i, min_cpu; 1503 int i, min_cpu;
1504 cpumask_t mask; 1504 cpumask_t mask;
1505 1505
1506 min_cpu = UINT_MAX; 1506 min_cpu = UINT_MAX;
1507 min_load = ULONG_MAX; 1507 min_load = ULONG_MAX;
1508 1508
1509 cpus_and(mask, sd->span, p->cpus_allowed); 1509 cpus_and(mask, sd->span, p->cpus_allowed);
1510 1510
1511 for_each_cpu_mask(i, mask) { 1511 for_each_cpu_mask(i, mask) {
1512 load = target_load(i); 1512 load = target_load(i);
1513 1513
1514 if (load < min_load) { 1514 if (load < min_load) {
1515 min_cpu = i; 1515 min_cpu = i;
1516 min_load = load; 1516 min_load = load;
1517 1517
1518 /* break out early on an idle CPU: */ 1518 /* break out early on an idle CPU: */
1519 if (!min_load) 1519 if (!min_load)
1520 break; 1520 break;
1521 } 1521 }
1522 } 1522 }
1523 1523
1524 /* add +1 to account for the new task */ 1524 /* add +1 to account for the new task */
1525 this_load = source_load(this_cpu) + SCHED_LOAD_SCALE; 1525 this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
1526 1526
1527 /* 1527 /*
1528 * Would with the addition of the new task to the 1528 * Would with the addition of the new task to the
1529 * current CPU there be an imbalance between this 1529 * current CPU there be an imbalance between this
1530 * CPU and the idlest CPU? 1530 * CPU and the idlest CPU?
1531 * 1531 *
1532 * Use half of the balancing threshold - new-context is 1532 * Use half of the balancing threshold - new-context is
1533 * a good opportunity to balance. 1533 * a good opportunity to balance.
1534 */ 1534 */
1535 if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100) 1535 if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
1536 return min_cpu; 1536 return min_cpu;
1537 1537
1538 return this_cpu; 1538 return this_cpu;
1539 } 1539 }
1540 1540
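find_idlest_cpu() only places the new task away from this_cpu when doing so clears half of the domain's usual imbalance threshold. A user-space helper showing that comparison, with an assumed imbalance_pct of 125 and an assumed SCHED_LOAD_SCALE of 128 (not taken from the patch):

/* Illustrative only: the half-threshold test at the end of find_idlest_cpu(). */
#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL  /* assumed scale for this sketch */

static int prefer_idlest(unsigned long min_load, unsigned long this_load,
                         unsigned int imbalance_pct)
{
        /* this_load already includes +SCHED_LOAD_SCALE for the new task */
        return min_load * (100 + (imbalance_pct - 100) / 2) < this_load * 100;
}

int main(void)
{
        unsigned long this_load = 2 * SCHED_LOAD_SCALE + SCHED_LOAD_SCALE;
        unsigned long min_load  = 2 * SCHED_LOAD_SCALE;

        /* 256 * 112 = 28672 < 384 * 100 = 38400 -> spread to the idlest CPU */
        printf("use idlest cpu: %d\n", prefer_idlest(min_load, this_load, 125));
        return 0;
}

With an imbalance_pct of 125 the factor is 112, i.e. this_cpu (counting the new task) has to look a bit over 12% busier than the idlest candidate before the task is placed remotely.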
1541 /* 1541 /*
1542 * If dest_cpu is allowed for this process, migrate the task to it. 1542 * If dest_cpu is allowed for this process, migrate the task to it.
1543 * This is accomplished by forcing the cpu_allowed mask to only 1543 * This is accomplished by forcing the cpu_allowed mask to only
1544 * allow dest_cpu, which will force the cpu onto dest_cpu. Then 1544 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
1545 * the cpu_allowed mask is restored. 1545 * the cpu_allowed mask is restored.
1546 */ 1546 */
1547 static void sched_migrate_task(task_t *p, int dest_cpu) 1547 static void sched_migrate_task(task_t *p, int dest_cpu)
1548 { 1548 {
1549 migration_req_t req; 1549 migration_req_t req;
1550 runqueue_t *rq; 1550 runqueue_t *rq;
1551 unsigned long flags; 1551 unsigned long flags;
1552 1552
1553 rq = task_rq_lock(p, &flags); 1553 rq = task_rq_lock(p, &flags);
1554 if (!cpu_isset(dest_cpu, p->cpus_allowed) 1554 if (!cpu_isset(dest_cpu, p->cpus_allowed)
1555 || unlikely(cpu_is_offline(dest_cpu))) 1555 || unlikely(cpu_is_offline(dest_cpu)))
1556 goto out; 1556 goto out;
1557 1557
1558 /* force the process onto the specified CPU */ 1558 /* force the process onto the specified CPU */
1559 if (migrate_task(p, dest_cpu, &req)) { 1559 if (migrate_task(p, dest_cpu, &req)) {
1560 /* Need to wait for migration thread (might exit: take ref). */ 1560 /* Need to wait for migration thread (might exit: take ref). */
1561 struct task_struct *mt = rq->migration_thread; 1561 struct task_struct *mt = rq->migration_thread;
1562 get_task_struct(mt); 1562 get_task_struct(mt);
1563 task_rq_unlock(rq, &flags); 1563 task_rq_unlock(rq, &flags);
1564 wake_up_process(mt); 1564 wake_up_process(mt);
1565 put_task_struct(mt); 1565 put_task_struct(mt);
1566 wait_for_completion(&req.done); 1566 wait_for_completion(&req.done);
1567 return; 1567 return;
1568 } 1568 }
1569 out: 1569 out:
1570 task_rq_unlock(rq, &flags); 1570 task_rq_unlock(rq, &flags);
1571 } 1571 }
1572 1572
1573 /* 1573 /*
1574 * sched_exec(): find the highest-level, exec-balance-capable 1574 * sched_exec(): find the highest-level, exec-balance-capable
1575 * domain and try to migrate the task to the least loaded CPU. 1575 * domain and try to migrate the task to the least loaded CPU.
1576 * 1576 *
1577 * execve() is a valuable balancing opportunity, because at this point 1577 * execve() is a valuable balancing opportunity, because at this point
1578 * the task has the smallest effective memory and cache footprint. 1578 * the task has the smallest effective memory and cache footprint.
1579 */ 1579 */
1580 void sched_exec(void) 1580 void sched_exec(void)
1581 { 1581 {
1582 struct sched_domain *tmp, *sd = NULL; 1582 struct sched_domain *tmp, *sd = NULL;
1583 int new_cpu, this_cpu = get_cpu(); 1583 int new_cpu, this_cpu = get_cpu();
1584 1584
1585 /* Prefer the current CPU if there's only this task running */ 1585 /* Prefer the current CPU if there's only this task running */
1586 if (this_rq()->nr_running <= 1) 1586 if (this_rq()->nr_running <= 1)
1587 goto out; 1587 goto out;
1588 1588
1589 for_each_domain(this_cpu, tmp) 1589 for_each_domain(this_cpu, tmp)
1590 if (tmp->flags & SD_BALANCE_EXEC) 1590 if (tmp->flags & SD_BALANCE_EXEC)
1591 sd = tmp; 1591 sd = tmp;
1592 1592
1593 if (sd) { 1593 if (sd) {
1594 schedstat_inc(sd, sbe_attempts); 1594 schedstat_inc(sd, sbe_attempts);
1595 new_cpu = find_idlest_cpu(current, this_cpu, sd); 1595 new_cpu = find_idlest_cpu(current, this_cpu, sd);
1596 if (new_cpu != this_cpu) { 1596 if (new_cpu != this_cpu) {
1597 schedstat_inc(sd, sbe_pushed); 1597 schedstat_inc(sd, sbe_pushed);
1598 put_cpu(); 1598 put_cpu();
1599 sched_migrate_task(current, new_cpu); 1599 sched_migrate_task(current, new_cpu);
1600 return; 1600 return;
1601 } 1601 }
1602 } 1602 }
1603 out: 1603 out:
1604 put_cpu(); 1604 put_cpu();
1605 } 1605 }
1606 1606
1607 /* 1607 /*
1608 * pull_task - move a task from a remote runqueue to the local runqueue. 1608 * pull_task - move a task from a remote runqueue to the local runqueue.
1609 * Both runqueues must be locked. 1609 * Both runqueues must be locked.
1610 */ 1610 */
1611 static inline 1611 static inline
1612 void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, 1612 void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1613 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) 1613 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1614 { 1614 {
1615 dequeue_task(p, src_array); 1615 dequeue_task(p, src_array);
1616 src_rq->nr_running--; 1616 src_rq->nr_running--;
1617 set_task_cpu(p, this_cpu); 1617 set_task_cpu(p, this_cpu);
1618 this_rq->nr_running++; 1618 this_rq->nr_running++;
1619 enqueue_task(p, this_array); 1619 enqueue_task(p, this_array);
1620 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) 1620 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1621 + this_rq->timestamp_last_tick; 1621 + this_rq->timestamp_last_tick;
1622 /* 1622 /*
1623 * Note that idle threads have a prio of MAX_PRIO, for this test 1623 * Note that idle threads have a prio of MAX_PRIO, for this test
1624 * to be always true for them. 1624 * to be always true for them.
1625 */ 1625 */
1626 if (TASK_PREEMPTS_CURR(p, this_rq)) 1626 if (TASK_PREEMPTS_CURR(p, this_rq))
1627 resched_task(this_rq->curr); 1627 resched_task(this_rq->curr);
1628 } 1628 }
1629 1629
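The timestamp fixup in pull_task() (the same expression appears in wake_up_new_task() above) converts p->timestamp from the source CPU's sched_clock() domain into the destination's by preserving its offset from timestamp_last_tick. A small user-space check with invented clock values, not taken from the patch:

/* Illustrative only: rebasing a timestamp between two per-CPU clocks. */
#include <assert.h>
#include <stdio.h>

int main(void)
{
        unsigned long long src_last_tick = 5000000000ULL;  /* source CPU clock */
        unsigned long long dst_last_tick = 7300000000ULL;  /* destination clock */
        unsigned long long timestamp     = 4999000000ULL;  /* 1ms before src tick */

        unsigned long long rebased = (timestamp - src_last_tick) + dst_last_tick;

        /* the offset from the last tick is what survives the move */
        assert(dst_last_tick - rebased == src_last_tick - timestamp);
        printf("rebased timestamp: %llu\n", rebased);   /* 7299000000 */
        return 0;
}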
1630 /* 1630 /*
1631 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 1631 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1632 */ 1632 */
1633 static inline 1633 static inline
1634 int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, 1634 int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1635 struct sched_domain *sd, enum idle_type idle) 1635 struct sched_domain *sd, enum idle_type idle)
1636 { 1636 {
1637 /* 1637 /*
1638 * We do not migrate tasks that are: 1638 * We do not migrate tasks that are:
1639 * 1) running (obviously), or 1639 * 1) running (obviously), or
1640 * 2) cannot be migrated to this CPU due to cpus_allowed, or 1640 * 2) cannot be migrated to this CPU due to cpus_allowed, or
1641 * 3) are cache-hot on their current CPU. 1641 * 3) are cache-hot on their current CPU.
1642 */ 1642 */
1643 if (task_running(rq, p)) 1643 if (task_running(rq, p))
1644 return 0; 1644 return 0;
1645 if (!cpu_isset(this_cpu, p->cpus_allowed)) 1645 if (!cpu_isset(this_cpu, p->cpus_allowed))
1646 return 0; 1646 return 0;
1647 1647
1648 /* 1648 /*
1649 * Aggressive migration if: 1649 * Aggressive migration if:
1650 * 1) the [whole] cpu is idle, or 1650 * 1) the [whole] cpu is idle, or
1651 * 2) too many balance attempts have failed. 1651 * 2) too many balance attempts have failed.
1652 */ 1652 */
1653 1653
1654 if (cpu_and_siblings_are_idle(this_cpu) || \ 1654 if (cpu_and_siblings_are_idle(this_cpu) || \
1655 sd->nr_balance_failed > sd->cache_nice_tries) 1655 sd->nr_balance_failed > sd->cache_nice_tries)
1656 return 1; 1656 return 1;
1657 1657
1658 if (task_hot(p, rq->timestamp_last_tick, sd)) 1658 if (task_hot(p, rq->timestamp_last_tick, sd))
1659 return 0; 1659 return 0;
1660 return 1; 1660 return 1;
1661 } 1661 }
1662 1662
1663 /* 1663 /*
1664 * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, 1664 * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
1665 * as part of a balancing operation within "domain". Returns the number of 1665 * as part of a balancing operation within "domain". Returns the number of
1666 * tasks moved. 1666 * tasks moved.
1667 * 1667 *
1668 * Called with both runqueues locked. 1668 * Called with both runqueues locked.
1669 */ 1669 */
1670 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, 1670 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
1671 unsigned long max_nr_move, struct sched_domain *sd, 1671 unsigned long max_nr_move, struct sched_domain *sd,
1672 enum idle_type idle) 1672 enum idle_type idle)
1673 { 1673 {
1674 prio_array_t *array, *dst_array; 1674 prio_array_t *array, *dst_array;
1675 struct list_head *head, *curr; 1675 struct list_head *head, *curr;
1676 int idx, pulled = 0; 1676 int idx, pulled = 0;
1677 task_t *tmp; 1677 task_t *tmp;
1678 1678
1679 if (max_nr_move <= 0 || busiest->nr_running <= 1) 1679 if (max_nr_move <= 0 || busiest->nr_running <= 1)
1680 goto out; 1680 goto out;
1681 1681
1682 /* 1682 /*
1683 * We first consider expired tasks. Those will likely not be 1683 * We first consider expired tasks. Those will likely not be
1684 * executed in the near future, and they are most likely to 1684 * executed in the near future, and they are most likely to
1685 * be cache-cold, thus switching CPUs has the least effect 1685 * be cache-cold, thus switching CPUs has the least effect
1686 * on them. 1686 * on them.
1687 */ 1687 */
1688 if (busiest->expired->nr_active) { 1688 if (busiest->expired->nr_active) {
1689 array = busiest->expired; 1689 array = busiest->expired;
1690 dst_array = this_rq->expired; 1690 dst_array = this_rq->expired;
1691 } else { 1691 } else {
1692 array = busiest->active; 1692 array = busiest->active;
1693 dst_array = this_rq->active; 1693 dst_array = this_rq->active;
1694 } 1694 }
1695 1695
1696 new_array: 1696 new_array:
1697 /* Start searching at priority 0: */ 1697 /* Start searching at priority 0: */
1698 idx = 0; 1698 idx = 0;
1699 skip_bitmap: 1699 skip_bitmap:
1700 if (!idx) 1700 if (!idx)
1701 idx = sched_find_first_bit(array->bitmap); 1701 idx = sched_find_first_bit(array->bitmap);
1702 else 1702 else
1703 idx = find_next_bit(array->bitmap, MAX_PRIO, idx); 1703 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1704 if (idx >= MAX_PRIO) { 1704 if (idx >= MAX_PRIO) {
1705 if (array == busiest->expired && busiest->active->nr_active) { 1705 if (array == busiest->expired && busiest->active->nr_active) {
1706 array = busiest->active; 1706 array = busiest->active;
1707 dst_array = this_rq->active; 1707 dst_array = this_rq->active;
1708 goto new_array; 1708 goto new_array;
1709 } 1709 }
1710 goto out; 1710 goto out;
1711 } 1711 }
1712 1712
1713 head = array->queue + idx; 1713 head = array->queue + idx;
1714 curr = head->prev; 1714 curr = head->prev;
1715 skip_queue: 1715 skip_queue:
1716 tmp = list_entry(curr, task_t, run_list); 1716 tmp = list_entry(curr, task_t, run_list);
1717 1717
1718 curr = curr->prev; 1718 curr = curr->prev;
1719 1719
1720 if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { 1720 if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
1721 if (curr != head) 1721 if (curr != head)
1722 goto skip_queue; 1722 goto skip_queue;
1723 idx++; 1723 idx++;
1724 goto skip_bitmap; 1724 goto skip_bitmap;
1725 } 1725 }
1726 1726
1727 #ifdef CONFIG_SCHEDSTATS 1727 #ifdef CONFIG_SCHEDSTATS
1728 if (task_hot(tmp, busiest->timestamp_last_tick, sd)) 1728 if (task_hot(tmp, busiest->timestamp_last_tick, sd))
1729 schedstat_inc(sd, lb_hot_gained[idle]); 1729 schedstat_inc(sd, lb_hot_gained[idle]);
1730 #endif 1730 #endif
1731 1731
1732 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); 1732 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
1733 pulled++; 1733 pulled++;
1734 1734
1735 /* We only want to steal up to the prescribed number of tasks. */ 1735 /* We only want to steal up to the prescribed number of tasks. */
1736 if (pulled < max_nr_move) { 1736 if (pulled < max_nr_move) {
1737 if (curr != head) 1737 if (curr != head)
1738 goto skip_queue; 1738 goto skip_queue;
1739 idx++; 1739 idx++;
1740 goto skip_bitmap; 1740 goto skip_bitmap;
1741 } 1741 }
1742 out: 1742 out:
1743 /* 1743 /*
1744 * Right now, this is the only place pull_task() is called, 1744 * Right now, this is the only place pull_task() is called,
1745 * so we can safely collect pull_task() stats here rather than 1745 * so we can safely collect pull_task() stats here rather than
1746 * inside pull_task(). 1746 * inside pull_task().
1747 */ 1747 */
1748 schedstat_add(sd, lb_gained[idle], pulled); 1748 schedstat_add(sd, lb_gained[idle], pulled);
1749 return pulled; 1749 return pulled;
1750 } 1750 }
1751 1751
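The skip_bitmap/skip_queue loop in move_tasks() walks the priority bitmap from bit 0 upwards, so the highest-priority non-empty queue is considered first, and walks each queue from its tail end, i.e. starting with the tasks furthest from being scheduled next. A shrunken user-space sketch of just the bitmap walk, using one 64-bit word instead of the real MAX_PRIO-sized bitmap and simple shifts instead of sched_find_first_bit()/find_next_bit():

/* Illustrative only: scanning non-empty priority queues, lowest index first. */
#include <stdio.h>

#define MAX_PRIO 64     /* shrunk to one word for the sketch */

int main(void)
{
        unsigned long long bitmap = 0;
        int idx;

        bitmap |= 1ULL << 3;    /* pretend prio 3 and prio 42 are non-empty */
        bitmap |= 1ULL << 42;

        for (idx = 0; idx < MAX_PRIO; idx++) {
                if (!(bitmap & (1ULL << idx)))
                        continue;
                printf("would scan prio queue %d from its tail\n", idx);
        }
        return 0;
}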
1752 /* 1752 /*
1753 * find_busiest_group finds and returns the busiest CPU group within the 1753 * find_busiest_group finds and returns the busiest CPU group within the
1754 * domain. It calculates and returns the number of tasks which should be 1754 * domain. It calculates and returns the number of tasks which should be
1755 * moved to restore balance via the imbalance parameter. 1755 * moved to restore balance via the imbalance parameter.
1756 */ 1756 */
1757 static struct sched_group * 1757 static struct sched_group *
1758 find_busiest_group(struct sched_domain *sd, int this_cpu, 1758 find_busiest_group(struct sched_domain *sd, int this_cpu,
1759 unsigned long *imbalance, enum idle_type idle) 1759 unsigned long *imbalance, enum idle_type idle)
1760 { 1760 {
1761 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 1761 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1762 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 1762 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1763 1763
1764 max_load = this_load = total_load = total_pwr = 0; 1764 max_load = this_load = total_load = total_pwr = 0;
1765 1765
1766 do { 1766 do {
1767 unsigned long load; 1767 unsigned long load;
1768 int local_group; 1768 int local_group;
1769 int i; 1769 int i;
1770 1770
1771 local_group = cpu_isset(this_cpu, group->cpumask); 1771 local_group = cpu_isset(this_cpu, group->cpumask);
1772 1772
1773 /* Tally up the load of all CPUs in the group */ 1773 /* Tally up the load of all CPUs in the group */
1774 avg_load = 0; 1774 avg_load = 0;
1775 1775
1776 for_each_cpu_mask(i, group->cpumask) { 1776 for_each_cpu_mask(i, group->cpumask) {
1777 /* Bias balancing toward cpus of our domain */ 1777 /* Bias balancing toward cpus of our domain */
1778 if (local_group) 1778 if (local_group)
1779 load = target_load(i); 1779 load = target_load(i);
1780 else 1780 else
1781 load = source_load(i); 1781 load = source_load(i);
1782 1782
1783 avg_load += load; 1783 avg_load += load;
1784 } 1784 }
1785 1785
1786 total_load += avg_load; 1786 total_load += avg_load;
1787 total_pwr += group->cpu_power; 1787 total_pwr += group->cpu_power;
1788 1788
1789 /* Adjust by relative CPU power of the group */ 1789 /* Adjust by relative CPU power of the group */
1790 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; 1790 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1791 1791
1792 if (local_group) { 1792 if (local_group) {
1793 this_load = avg_load; 1793 this_load = avg_load;
1794 this = group; 1794 this = group;
1795 goto nextgroup; 1795 goto nextgroup;
1796 } else if (avg_load > max_load) { 1796 } else if (avg_load > max_load) {
1797 max_load = avg_load; 1797 max_load = avg_load;
1798 busiest = group; 1798 busiest = group;
1799 } 1799 }
1800 nextgroup: 1800 nextgroup:
1801 group = group->next; 1801 group = group->next;
1802 } while (group != sd->groups); 1802 } while (group != sd->groups);
1803 1803
1804 if (!busiest || this_load >= max_load) 1804 if (!busiest || this_load >= max_load)
1805 goto out_balanced; 1805 goto out_balanced;
1806 1806
1807 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; 1807 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
1808 1808
1809 if (this_load >= avg_load || 1809 if (this_load >= avg_load ||
1810 100*max_load <= sd->imbalance_pct*this_load) 1810 100*max_load <= sd->imbalance_pct*this_load)
1811 goto out_balanced; 1811 goto out_balanced;
1812 1812
1813 /* 1813 /*
1814 * We're trying to get all the cpus to the average_load, so we don't 1814 * We're trying to get all the cpus to the average_load, so we don't
1815 * want to push ourselves above the average load, nor do we wish to 1815 * want to push ourselves above the average load, nor do we wish to
1816 * reduce the max loaded cpu below the average load, as either of these 1816 * reduce the max loaded cpu below the average load, as either of these
1817 * actions would just result in more rebalancing later, and ping-pong 1817 * actions would just result in more rebalancing later, and ping-pong
1818 * tasks around. Thus we look for the minimum possible imbalance. 1818 * tasks around. Thus we look for the minimum possible imbalance.
1819 * Negative imbalances (*we* are more loaded than anyone else) will 1819 * Negative imbalances (*we* are more loaded than anyone else) will
1820 * be counted as no imbalance for these purposes -- we can't fix that 1820 * be counted as no imbalance for these purposes -- we can't fix that
1821 * by pulling tasks to us. Be careful of negative numbers as they'll 1821 * by pulling tasks to us. Be careful of negative numbers as they'll
1822 * appear as very large values with unsigned longs. 1822 * appear as very large values with unsigned longs.
1823 */ 1823 */
1824 /* How much load to actually move to equalise the imbalance */ 1824 /* How much load to actually move to equalise the imbalance */
1825 *imbalance = min((max_load - avg_load) * busiest->cpu_power, 1825 *imbalance = min((max_load - avg_load) * busiest->cpu_power,
1826 (avg_load - this_load) * this->cpu_power) 1826 (avg_load - this_load) * this->cpu_power)
1827 / SCHED_LOAD_SCALE; 1827 / SCHED_LOAD_SCALE;
1828 1828
1829 if (*imbalance < SCHED_LOAD_SCALE) { 1829 if (*imbalance < SCHED_LOAD_SCALE) {
1830 unsigned long pwr_now = 0, pwr_move = 0; 1830 unsigned long pwr_now = 0, pwr_move = 0;
1831 unsigned long tmp; 1831 unsigned long tmp;
1832 1832
1833 if (max_load - this_load >= SCHED_LOAD_SCALE*2) { 1833 if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
1834 *imbalance = 1; 1834 *imbalance = 1;
1835 return busiest; 1835 return busiest;
1836 } 1836 }
1837 1837
1838 /* 1838 /*
1839 * OK, we don't have enough imbalance to justify moving tasks, 1839 * OK, we don't have enough imbalance to justify moving tasks,
1840 * however we may be able to increase total CPU power used by 1840 * however we may be able to increase total CPU power used by
1841 * moving them. 1841 * moving them.
1842 */ 1842 */
1843 1843
1844 pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); 1844 pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
1845 pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); 1845 pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
1846 pwr_now /= SCHED_LOAD_SCALE; 1846 pwr_now /= SCHED_LOAD_SCALE;
1847 1847
1848 /* Amount of load we'd subtract */ 1848 /* Amount of load we'd subtract */
1849 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; 1849 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
1850 if (max_load > tmp) 1850 if (max_load > tmp)
1851 pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, 1851 pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
1852 max_load - tmp); 1852 max_load - tmp);
1853 1853
1854 /* Amount of load we'd add */ 1854 /* Amount of load we'd add */
1855 if (max_load*busiest->cpu_power < 1855 if (max_load*busiest->cpu_power <
1856 SCHED_LOAD_SCALE*SCHED_LOAD_SCALE) 1856 SCHED_LOAD_SCALE*SCHED_LOAD_SCALE)
1857 tmp = max_load*busiest->cpu_power/this->cpu_power; 1857 tmp = max_load*busiest->cpu_power/this->cpu_power;
1858 else 1858 else
1859 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; 1859 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
1860 pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); 1860 pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
1861 pwr_move /= SCHED_LOAD_SCALE; 1861 pwr_move /= SCHED_LOAD_SCALE;
1862 1862
1863 /* Move if we gain throughput */ 1863 /* Move if we gain throughput */
1864 if (pwr_move <= pwr_now) 1864 if (pwr_move <= pwr_now)
1865 goto out_balanced; 1865 goto out_balanced;
1866 1866
1867 *imbalance = 1; 1867 *imbalance = 1;
1868 return busiest; 1868 return busiest;
1869 } 1869 }
1870 1870
1871 /* Get rid of the scaling factor, rounding down as we divide */ 1871 /* Get rid of the scaling factor, rounding down as we divide */
1872 *imbalance = *imbalance / SCHED_LOAD_SCALE; 1872 *imbalance = *imbalance / SCHED_LOAD_SCALE;
1873 1873
1874 return busiest; 1874 return busiest;
1875 1875
1876 out_balanced: 1876 out_balanced:
1877 if (busiest && (idle == NEWLY_IDLE || 1877 if (busiest && (idle == NEWLY_IDLE ||
1878 (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) { 1878 (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) {
1879 *imbalance = 1; 1879 *imbalance = 1;
1880 return busiest; 1880 return busiest;
1881 } 1881 }
1882 1882
1883 *imbalance = 0; 1883 *imbalance = 0;
1884 return NULL; 1884 return NULL;
1885 } 1885 }
1886 1886
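The imbalance find_busiest_group() hands back is capped by min(): never more than the busiest group sits above the average, and never more than this group sits below it. A user-space version of that expression with invented loads, equal cpu_power (so the power scaling drops out), and an assumed SCHED_LOAD_SCALE of 128; this is a sketch, not the kernel code path:

/* Illustrative only: the min() at the heart of the imbalance calculation. */
#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL  /* assumed scale, as elsewhere in these sketches */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        unsigned long cpu_power = SCHED_LOAD_SCALE;     /* one plain CPU per group */
        unsigned long this_load = 1 * SCHED_LOAD_SCALE; /* ~1 runnable task here */
        unsigned long max_load  = 4 * SCHED_LOAD_SCALE; /* ~4 on the busiest */
        unsigned long avg_load  = (this_load + max_load) / 2;

        unsigned long imbalance =
                min_ul((max_load - avg_load) * cpu_power,
                       (avg_load - this_load) * cpu_power) / SCHED_LOAD_SCALE;

        printf("move about %lu tasks' worth of load\n",
               imbalance / SCHED_LOAD_SCALE);   /* ~1 */
        return 0;
}

Here the busiest group looks like ~4 tasks and ours like ~1, so about one-and-a-half tasks' worth of load sits on either side of the average, and the final rounding division in the code above turns that into one task to move.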
1887 /* 1887 /*
1888 * find_busiest_queue - find the busiest runqueue among the cpus in group. 1888 * find_busiest_queue - find the busiest runqueue among the cpus in group.
1889 */ 1889 */
1890 static runqueue_t *find_busiest_queue(struct sched_group *group) 1890 static runqueue_t *find_busiest_queue(struct sched_group *group)
1891 { 1891 {
1892 unsigned long load, max_load = 0; 1892 unsigned long load, max_load = 0;
1893 runqueue_t *busiest = NULL; 1893 runqueue_t *busiest = NULL;
1894 int i; 1894 int i;
1895 1895
1896 for_each_cpu_mask(i, group->cpumask) { 1896 for_each_cpu_mask(i, group->cpumask) {
1897 load = source_load(i); 1897 load = source_load(i);
1898 1898
1899 if (load > max_load) { 1899 if (load > max_load) {
1900 max_load = load; 1900 max_load = load;
1901 busiest = cpu_rq(i); 1901 busiest = cpu_rq(i);
1902 } 1902 }
1903 } 1903 }
1904 1904
1905 return busiest; 1905 return busiest;
1906 } 1906 }
1907 1907
1908 /* 1908 /*
1909 * Check this_cpu to ensure it is balanced within domain. Attempt to move 1909 * Check this_cpu to ensure it is balanced within domain. Attempt to move
1910 * tasks if there is an imbalance. 1910 * tasks if there is an imbalance.
1911 * 1911 *
1912 * Called with this_rq unlocked. 1912 * Called with this_rq unlocked.
1913 */ 1913 */
1914 static int load_balance(int this_cpu, runqueue_t *this_rq, 1914 static int load_balance(int this_cpu, runqueue_t *this_rq,
1915 struct sched_domain *sd, enum idle_type idle) 1915 struct sched_domain *sd, enum idle_type idle)
1916 { 1916 {
1917 struct sched_group *group; 1917 struct sched_group *group;
1918 runqueue_t *busiest; 1918 runqueue_t *busiest;
1919 unsigned long imbalance; 1919 unsigned long imbalance;
1920 int nr_moved; 1920 int nr_moved;
1921 1921
1922 spin_lock(&this_rq->lock); 1922 spin_lock(&this_rq->lock);
1923 schedstat_inc(sd, lb_cnt[idle]); 1923 schedstat_inc(sd, lb_cnt[idle]);
1924 1924
1925 group = find_busiest_group(sd, this_cpu, &imbalance, idle); 1925 group = find_busiest_group(sd, this_cpu, &imbalance, idle);
1926 if (!group) { 1926 if (!group) {
1927 schedstat_inc(sd, lb_nobusyg[idle]); 1927 schedstat_inc(sd, lb_nobusyg[idle]);
1928 goto out_balanced; 1928 goto out_balanced;
1929 } 1929 }
1930 1930
1931 busiest = find_busiest_queue(group); 1931 busiest = find_busiest_queue(group);
1932 if (!busiest) { 1932 if (!busiest) {
1933 schedstat_inc(sd, lb_nobusyq[idle]); 1933 schedstat_inc(sd, lb_nobusyq[idle]);
1934 goto out_balanced; 1934 goto out_balanced;
1935 } 1935 }
1936 1936
1937 /* 1937 /*
1938 * This should be "impossible", but since load 1938 * This should be "impossible", but since load
1939 * balancing is inherently racy and statistical, 1939 * balancing is inherently racy and statistical,
1940 * it could happen in theory. 1940 * it could happen in theory.
1941 */ 1941 */
1942 if (unlikely(busiest == this_rq)) { 1942 if (unlikely(busiest == this_rq)) {
1943 WARN_ON(1); 1943 WARN_ON(1);
1944 goto out_balanced; 1944 goto out_balanced;
1945 } 1945 }
1946 1946
1947 schedstat_add(sd, lb_imbalance[idle], imbalance); 1947 schedstat_add(sd, lb_imbalance[idle], imbalance);
1948 1948
1949 nr_moved = 0; 1949 nr_moved = 0;
1950 if (busiest->nr_running > 1) { 1950 if (busiest->nr_running > 1) {
1951 /* 1951 /*
1952 * Attempt to move tasks. If find_busiest_group has found 1952 * Attempt to move tasks. If find_busiest_group has found
1953 * an imbalance but busiest->nr_running <= 1, the group is 1953 * an imbalance but busiest->nr_running <= 1, the group is
1954 * still unbalanced. nr_moved simply stays zero, so it is 1954 * still unbalanced. nr_moved simply stays zero, so it is
1955 * correctly treated as an imbalance. 1955 * correctly treated as an imbalance.
1956 */ 1956 */
1957 double_lock_balance(this_rq, busiest); 1957 double_lock_balance(this_rq, busiest);
1958 nr_moved = move_tasks(this_rq, this_cpu, busiest, 1958 nr_moved = move_tasks(this_rq, this_cpu, busiest,
1959 imbalance, sd, idle); 1959 imbalance, sd, idle);
1960 spin_unlock(&busiest->lock); 1960 spin_unlock(&busiest->lock);
1961 } 1961 }
1962 spin_unlock(&this_rq->lock); 1962 spin_unlock(&this_rq->lock);
1963 1963
1964 if (!nr_moved) { 1964 if (!nr_moved) {
1965 schedstat_inc(sd, lb_failed[idle]); 1965 schedstat_inc(sd, lb_failed[idle]);
1966 sd->nr_balance_failed++; 1966 sd->nr_balance_failed++;
1967 1967
1968 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 1968 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
1969 int wake = 0; 1969 int wake = 0;
1970 1970
1971 spin_lock(&busiest->lock); 1971 spin_lock(&busiest->lock);
1972 if (!busiest->active_balance) { 1972 if (!busiest->active_balance) {
1973 busiest->active_balance = 1; 1973 busiest->active_balance = 1;
1974 busiest->push_cpu = this_cpu; 1974 busiest->push_cpu = this_cpu;
1975 wake = 1; 1975 wake = 1;
1976 } 1976 }
1977 spin_unlock(&busiest->lock); 1977 spin_unlock(&busiest->lock);
1978 if (wake) 1978 if (wake)
1979 wake_up_process(busiest->migration_thread); 1979 wake_up_process(busiest->migration_thread);
1980 1980
1981 /* 1981 /*
1982 * We've kicked active balancing, reset the failure 1982 * We've kicked active balancing, reset the failure
1983 * counter. 1983 * counter.
1984 */ 1984 */
1985 sd->nr_balance_failed = sd->cache_nice_tries; 1985 sd->nr_balance_failed = sd->cache_nice_tries;
1986 } 1986 }
1987 1987
1988 /* 1988 /*
1989 * We were unbalanced, but unsuccessful in move_tasks(), 1989 * We were unbalanced, but unsuccessful in move_tasks(),
1990 * so bump the balance_interval to lessen the lock contention. 1990 * so bump the balance_interval to lessen the lock contention.
1991 */ 1991 */
1992 if (sd->balance_interval < sd->max_interval) 1992 if (sd->balance_interval < sd->max_interval)
1993 sd->balance_interval++; 1993 sd->balance_interval++;
1994 } else { 1994 } else {
1995 sd->nr_balance_failed = 0; 1995 sd->nr_balance_failed = 0;
1996 1996
1997 /* We were unbalanced, so reset the balancing interval */ 1997 /* We were unbalanced, so reset the balancing interval */
1998 sd->balance_interval = sd->min_interval; 1998 sd->balance_interval = sd->min_interval;
1999 } 1999 }
2000 2000
2001 return nr_moved; 2001 return nr_moved;
2002 2002
2003 out_balanced: 2003 out_balanced:
2004 spin_unlock(&this_rq->lock); 2004 spin_unlock(&this_rq->lock);
2005 2005
2006 schedstat_inc(sd, lb_balanced[idle]); 2006 schedstat_inc(sd, lb_balanced[idle]);
2007 2007
2008 /* tune up the balancing interval */ 2008 /* tune up the balancing interval */
2009 if (sd->balance_interval < sd->max_interval) 2009 if (sd->balance_interval < sd->max_interval)
2010 sd->balance_interval *= 2; 2010 sd->balance_interval *= 2;
2011 2011
2012 return 0; 2012 return 0;
2013 } 2013 }
2014 2014
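load_balance() also tunes sd->balance_interval: it keeps doubling while it is still below max_interval when the domain comes up balanced, creeps up by one when a move attempt fails, and snaps back to min_interval after a successful move. A tiny user-space illustration of that back-off, with invented interval bounds (not taken from the patch):

/* Illustrative only: how balance_interval backs off and resets. */
#include <stdio.h>

int main(void)
{
        unsigned int interval = 8, min_interval = 8, max_interval = 128;
        int pass;

        /* an unbalanced pass where move_tasks() moved nothing: nudge upwards */
        if (interval < max_interval)
                interval++;
        printf("failed move   -> interval %u\n", interval);

        /* a run of "already balanced" passes: keep doubling below the cap */
        for (pass = 0; pass < 5; pass++) {
                if (interval < max_interval)
                        interval *= 2;
                printf("balanced pass -> interval %u\n", interval);
        }

        /* a pass that actually moved tasks: back to the minimum */
        interval = min_interval;
        printf("moved tasks   -> interval %u\n", interval);
        return 0;
}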
2015 /* 2015 /*
2016 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2016 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2017 * tasks if there is an imbalance. 2017 * tasks if there is an imbalance.
2018 * 2018 *
2019 * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). 2019 * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
2020 * this_rq is locked. 2020 * this_rq is locked.
2021 */ 2021 */
2022 static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, 2022 static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2023 struct sched_domain *sd) 2023 struct sched_domain *sd)
2024 { 2024 {
2025 struct sched_group *group; 2025 struct sched_group *group;
2026 runqueue_t *busiest = NULL; 2026 runqueue_t *busiest = NULL;
2027 unsigned long imbalance; 2027 unsigned long imbalance;
2028 int nr_moved = 0; 2028 int nr_moved = 0;
2029 2029
2030 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2030 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2031 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); 2031 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
2032 if (!group) { 2032 if (!group) {
2033 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); 2033 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2034 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2034 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2035 goto out; 2035 goto out;
2036 } 2036 }
2037 2037
2038 busiest = find_busiest_queue(group); 2038 busiest = find_busiest_queue(group);
2039 if (!busiest || busiest == this_rq) { 2039 if (!busiest || busiest == this_rq) {
2040 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); 2040 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2041 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); 2041 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2042 goto out; 2042 goto out;
2043 } 2043 }
2044 2044
2045 /* Attempt to move tasks */ 2045 /* Attempt to move tasks */
2046 double_lock_balance(this_rq, busiest); 2046 double_lock_balance(this_rq, busiest);
2047 2047
2048 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); 2048 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
2049 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2049 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2050 imbalance, sd, NEWLY_IDLE); 2050 imbalance, sd, NEWLY_IDLE);
2051 if (!nr_moved) 2051 if (!nr_moved)
2052 schedstat_inc(sd, lb_failed[NEWLY_IDLE]); 2052 schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
2053 2053
2054 spin_unlock(&busiest->lock); 2054 spin_unlock(&busiest->lock);
2055 2055
2056 out: 2056 out:
2057 return nr_moved; 2057 return nr_moved;
2058 } 2058 }
2059 2059
2060 /* 2060 /*
2061 * idle_balance is called by schedule() if this_cpu is about to become 2061 * idle_balance is called by schedule() if this_cpu is about to become
2062 * idle. Attempts to pull tasks from other CPUs. 2062 * idle. Attempts to pull tasks from other CPUs.
2063 */ 2063 */
2064 static inline void idle_balance(int this_cpu, runqueue_t *this_rq) 2064 static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
2065 { 2065 {
2066 struct sched_domain *sd; 2066 struct sched_domain *sd;
2067 2067
2068 for_each_domain(this_cpu, sd) { 2068 for_each_domain(this_cpu, sd) {
2069 if (sd->flags & SD_BALANCE_NEWIDLE) { 2069 if (sd->flags & SD_BALANCE_NEWIDLE) {
2070 if (load_balance_newidle(this_cpu, this_rq, sd)) { 2070 if (load_balance_newidle(this_cpu, this_rq, sd)) {
2071 /* We've pulled tasks over so stop searching */ 2071 /* We've pulled tasks over so stop searching */
2072 break; 2072 break;
2073 } 2073 }
2074 } 2074 }
2075 } 2075 }
2076 } 2076 }
2077 2077
2078 /* 2078 /*
2079 * active_load_balance is run by migration threads. It pushes running tasks 2079 * active_load_balance is run by migration threads. It pushes running tasks
2080 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be 2080 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2081 * running on each physical CPU where possible, and avoids physical / 2081 * running on each physical CPU where possible, and avoids physical /
2082 * logical imbalances. 2082 * logical imbalances.
2083 * 2083 *
2084 * Called with busiest_rq locked. 2084 * Called with busiest_rq locked.
2085 */ 2085 */
2086 static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) 2086 static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
2087 { 2087 {
2088 struct sched_domain *sd; 2088 struct sched_domain *sd;
2089 struct sched_group *cpu_group; 2089 struct sched_group *cpu_group;
2090 runqueue_t *target_rq; 2090 runqueue_t *target_rq;
2091 cpumask_t visited_cpus; 2091 cpumask_t visited_cpus;
2092 int cpu; 2092 int cpu;
2093 2093
2094 /* 2094 /*
2095 * Search for suitable CPUs to push tasks to in successively higher 2095 * Search for suitable CPUs to push tasks to in successively higher
2096 * domains with SD_LOAD_BALANCE set. 2096 * domains with SD_LOAD_BALANCE set.
2097 */ 2097 */
2098 visited_cpus = CPU_MASK_NONE; 2098 visited_cpus = CPU_MASK_NONE;
2099 for_each_domain(busiest_cpu, sd) { 2099 for_each_domain(busiest_cpu, sd) {
2100 if (!(sd->flags & SD_LOAD_BALANCE)) 2100 if (!(sd->flags & SD_LOAD_BALANCE))
2101 /* no more domains to search */ 2101 /* no more domains to search */
2102 break; 2102 break;
2103 2103
2104 schedstat_inc(sd, alb_cnt); 2104 schedstat_inc(sd, alb_cnt);
2105 2105
2106 cpu_group = sd->groups; 2106 cpu_group = sd->groups;
2107 do { 2107 do {
2108 for_each_cpu_mask(cpu, cpu_group->cpumask) { 2108 for_each_cpu_mask(cpu, cpu_group->cpumask) {
2109 if (busiest_rq->nr_running <= 1) 2109 if (busiest_rq->nr_running <= 1)
2110 /* no more tasks left to move */ 2110 /* no more tasks left to move */
2111 return; 2111 return;
2112 if (cpu_isset(cpu, visited_cpus)) 2112 if (cpu_isset(cpu, visited_cpus))
2113 continue; 2113 continue;
2114 cpu_set(cpu, visited_cpus); 2114 cpu_set(cpu, visited_cpus);
2115 if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu) 2115 if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu)
2116 continue; 2116 continue;
2117 2117
2118 target_rq = cpu_rq(cpu); 2118 target_rq = cpu_rq(cpu);
2119 /* 2119 /*
2120 * This condition is "impossible", if it occurs 2120 * This condition is "impossible", if it occurs
2121 * we need to fix it. Originally reported by 2121 * we need to fix it. Originally reported by
2122 * Bjorn Helgaas on a 128-cpu setup. 2122 * Bjorn Helgaas on a 128-cpu setup.
2123 */ 2123 */
2124 BUG_ON(busiest_rq == target_rq); 2124 BUG_ON(busiest_rq == target_rq);
2125 2125
2126 /* move a task from busiest_rq to target_rq */ 2126 /* move a task from busiest_rq to target_rq */
2127 double_lock_balance(busiest_rq, target_rq); 2127 double_lock_balance(busiest_rq, target_rq);
2128 if (move_tasks(target_rq, cpu, busiest_rq, 2128 if (move_tasks(target_rq, cpu, busiest_rq,
2129 1, sd, SCHED_IDLE)) { 2129 1, sd, SCHED_IDLE)) {
2130 schedstat_inc(sd, alb_pushed); 2130 schedstat_inc(sd, alb_pushed);
2131 } else { 2131 } else {
2132 schedstat_inc(sd, alb_failed); 2132 schedstat_inc(sd, alb_failed);
2133 } 2133 }
2134 spin_unlock(&target_rq->lock); 2134 spin_unlock(&target_rq->lock);
2135 } 2135 }
2136 cpu_group = cpu_group->next; 2136 cpu_group = cpu_group->next;
2137 } while (cpu_group != sd->groups); 2137 } while (cpu_group != sd->groups);
2138 } 2138 }
2139 } 2139 }
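
active_load_balance() above climbs from the smallest domain to larger ones, and within each domain walks the circular group list, skipping CPUs already tried at a lower level via visited_cpus. The sketch below models just that traversal shape with a plain bitmask in place of cpumask_t; mini_domain and its fields are hypothetical simplifications (the per-domain group list is collapsed into a single span), not kernel structures.

#include <stdio.h>

#define MAX_CPUS 8

/* Hypothetical miniature of a domain hierarchy: each level lists its CPUs. */
struct mini_domain {
    unsigned int span;              /* bitmask of CPUs in this domain */
    struct mini_domain *parent;     /* next, larger domain (or NULL) */
};

/* Visit each CPU at most once while climbing the domain hierarchy. */
static void scan_domains(const struct mini_domain *sd, int this_cpu)
{
    unsigned int visited = 0;

    for (; sd; sd = sd->parent) {
        for (int cpu = 0; cpu < MAX_CPUS; cpu++) {
            if (!(sd->span & (1u << cpu)))
                continue;
            if (visited & (1u << cpu))
                continue;           /* already considered in a smaller domain */
            visited |= 1u << cpu;
            if (cpu == this_cpu)
                continue;           /* never push tasks to ourselves */
            printf("candidate target cpu %d\n", cpu);
        }
    }
}

int main(void)
{
    struct mini_domain top = { 0xff, NULL };    /* CPUs 0-7 */
    struct mini_domain smt = { 0x03, &top };    /* CPUs 0-1 share a core */

    scan_domains(&smt, 0);
    return 0;
}
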
2140 2140
2141 /* 2141 /*
2142 * rebalance_tick will get called every timer tick, on every CPU. 2142 * rebalance_tick will get called every timer tick, on every CPU.
2143 * 2143 *
2144 * It checks each scheduling domain to see if it is due to be balanced, 2144 * It checks each scheduling domain to see if it is due to be balanced,
2145 * and initiates a balancing operation if so. 2145 * and initiates a balancing operation if so.
2146 * 2146 *
2147 * Balancing parameters are set up in arch_init_sched_domains. 2147 * Balancing parameters are set up in arch_init_sched_domains.
2148 */ 2148 */
2149 2149
2150 /* Don't have all balancing operations going off at once */ 2150 /* Don't have all balancing operations going off at once */
2151 #define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) 2151 #define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
2152 2152
2153 static void rebalance_tick(int this_cpu, runqueue_t *this_rq, 2153 static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2154 enum idle_type idle) 2154 enum idle_type idle)
2155 { 2155 {
2156 unsigned long old_load, this_load; 2156 unsigned long old_load, this_load;
2157 unsigned long j = jiffies + CPU_OFFSET(this_cpu); 2157 unsigned long j = jiffies + CPU_OFFSET(this_cpu);
2158 struct sched_domain *sd; 2158 struct sched_domain *sd;
2159 2159
2160 /* Update our load */ 2160 /* Update our load */
2161 old_load = this_rq->cpu_load; 2161 old_load = this_rq->cpu_load;
2162 this_load = this_rq->nr_running * SCHED_LOAD_SCALE; 2162 this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
2163 /* 2163 /*
2164 * Round up the averaging division if load is increasing. This 2164 * Round up the averaging division if load is increasing. This
2165 * prevents us from getting stuck on 9 if the load is 10, for 2165 * prevents us from getting stuck on 9 if the load is 10, for
2166 * example. 2166 * example.
2167 */ 2167 */
2168 if (this_load > old_load) 2168 if (this_load > old_load)
2169 old_load++; 2169 old_load++;
2170 this_rq->cpu_load = (old_load + this_load) / 2; 2170 this_rq->cpu_load = (old_load + this_load) / 2;
2171 2171
2172 for_each_domain(this_cpu, sd) { 2172 for_each_domain(this_cpu, sd) {
2173 unsigned long interval; 2173 unsigned long interval;
2174 2174
2175 if (!(sd->flags & SD_LOAD_BALANCE)) 2175 if (!(sd->flags & SD_LOAD_BALANCE))
2176 continue; 2176 continue;
2177 2177
2178 interval = sd->balance_interval; 2178 interval = sd->balance_interval;
2179 if (idle != SCHED_IDLE) 2179 if (idle != SCHED_IDLE)
2180 interval *= sd->busy_factor; 2180 interval *= sd->busy_factor;
2181 2181
2182 /* scale ms to jiffies */ 2182 /* scale ms to jiffies */
2183 interval = msecs_to_jiffies(interval); 2183 interval = msecs_to_jiffies(interval);
2184 if (unlikely(!interval)) 2184 if (unlikely(!interval))
2185 interval = 1; 2185 interval = 1;
2186 2186
2187 if (j - sd->last_balance >= interval) { 2187 if (j - sd->last_balance >= interval) {
2188 if (load_balance(this_cpu, this_rq, sd, idle)) { 2188 if (load_balance(this_cpu, this_rq, sd, idle)) {
2189 /* We've pulled tasks over so no longer idle */ 2189 /* We've pulled tasks over so no longer idle */
2190 idle = NOT_IDLE; 2190 idle = NOT_IDLE;
2191 } 2191 }
2192 sd->last_balance += interval; 2192 sd->last_balance += interval;
2193 } 2193 }
2194 } 2194 }
2195 } 2195 }
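
Two details of rebalance_tick() above are worth isolating: CPU_OFFSET() staggers each CPU's notion of the current jiffy so that all runqueues do not rebalance on the same tick, and the cpu_load average is rounded up whenever load is rising so the average can actually reach the new value instead of hovering one below it. The stand-alone sketch below reproduces both calculations; the HZ and NR_CPUS values are assumptions chosen for illustration.

#include <stdio.h>

#define HZ      1000        /* assumed for the sketch */
#define NR_CPUS 4           /* assumed for the sketch */

/* Same staggering idea as CPU_OFFSET() above: spread CPUs across one HZ. */
#define CPU_OFFSET(cpu) (HZ * (cpu) / NR_CPUS)

/* Rounded-up averaging of the per-CPU load, as in rebalance_tick(). */
static unsigned long average_load(unsigned long old_load, unsigned long this_load)
{
    if (this_load > old_load)
        old_load++;     /* round up so a rising load actually converges */
    return (old_load + this_load) / 2;
}

int main(void)
{
    for (int cpu = 0; cpu < NR_CPUS; cpu++)
        printf("cpu%d balances at jiffies offset %d\n", cpu, CPU_OFFSET(cpu));

    unsigned long load = 0;
    for (int tick = 0; tick < 5; tick++) {
        load = average_load(load, 10);  /* reaches 10 instead of stalling at 9 */
        printf("tick %d: cpu_load=%lu\n", tick, load);
    }
    return 0;
}
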
2196 #else 2196 #else
2197 /* 2197 /*
2198 * on UP we do not need to balance between CPUs: 2198 * on UP we do not need to balance between CPUs:
2199 */ 2199 */
2200 static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) 2200 static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
2201 { 2201 {
2202 } 2202 }
2203 static inline void idle_balance(int cpu, runqueue_t *rq) 2203 static inline void idle_balance(int cpu, runqueue_t *rq)
2204 { 2204 {
2205 } 2205 }
2206 #endif 2206 #endif
2207 2207
2208 static inline int wake_priority_sleeper(runqueue_t *rq) 2208 static inline int wake_priority_sleeper(runqueue_t *rq)
2209 { 2209 {
2210 int ret = 0; 2210 int ret = 0;
2211 #ifdef CONFIG_SCHED_SMT 2211 #ifdef CONFIG_SCHED_SMT
2212 spin_lock(&rq->lock); 2212 spin_lock(&rq->lock);
2213 /* 2213 /*
2214 * If an SMT sibling task has been put to sleep for priority 2214 * If an SMT sibling task has been put to sleep for priority
2215 * reasons, reschedule the idle task to see if it can now run. 2215 * reasons, reschedule the idle task to see if it can now run.
2216 */ 2216 */
2217 if (rq->nr_running) { 2217 if (rq->nr_running) {
2218 resched_task(rq->idle); 2218 resched_task(rq->idle);
2219 ret = 1; 2219 ret = 1;
2220 } 2220 }
2221 spin_unlock(&rq->lock); 2221 spin_unlock(&rq->lock);
2222 #endif 2222 #endif
2223 return ret; 2223 return ret;
2224 } 2224 }
2225 2225
2226 DEFINE_PER_CPU(struct kernel_stat, kstat); 2226 DEFINE_PER_CPU(struct kernel_stat, kstat);
2227 2227
2228 EXPORT_PER_CPU_SYMBOL(kstat); 2228 EXPORT_PER_CPU_SYMBOL(kstat);
2229 2229
2230 /* 2230 /*
2231 * This is called on clock ticks and on context switches. 2231 * This is called on clock ticks and on context switches.
2232 * Bank in p->sched_time the ns elapsed since the last tick or switch. 2232 * Bank in p->sched_time the ns elapsed since the last tick or switch.
2233 */ 2233 */
2234 static inline void update_cpu_clock(task_t *p, runqueue_t *rq, 2234 static inline void update_cpu_clock(task_t *p, runqueue_t *rq,
2235 unsigned long long now) 2235 unsigned long long now)
2236 { 2236 {
2237 unsigned long long last = max(p->timestamp, rq->timestamp_last_tick); 2237 unsigned long long last = max(p->timestamp, rq->timestamp_last_tick);
2238 p->sched_time += now - last; 2238 p->sched_time += now - last;
2239 } 2239 }
2240 2240
2241 /* 2241 /*
2242 * Return current->sched_time plus any more ns on the sched_clock 2242 * Return current->sched_time plus any more ns on the sched_clock
2243 * that have not yet been banked. 2243 * that have not yet been banked.
2244 */ 2244 */
2245 unsigned long long current_sched_time(const task_t *tsk) 2245 unsigned long long current_sched_time(const task_t *tsk)
2246 { 2246 {
2247 unsigned long long ns; 2247 unsigned long long ns;
2248 unsigned long flags; 2248 unsigned long flags;
2249 local_irq_save(flags); 2249 local_irq_save(flags);
2250 ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick); 2250 ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick);
2251 ns = tsk->sched_time + (sched_clock() - ns); 2251 ns = tsk->sched_time + (sched_clock() - ns);
2252 local_irq_restore(flags); 2252 local_irq_restore(flags);
2253 return ns; 2253 return ns;
2254 } 2254 }
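
update_cpu_clock() banks into p->sched_time only the nanoseconds since the later of the task's own timestamp and the runqueue's last tick, and current_sched_time() returns the banked total plus whatever has accumulated since. The sketch below models that bookkeeping with plain integers; mini_task and the explicit 'now' arguments are stand-ins for the kernel's task fields and sched_clock(), and the model folds the tick-timestamp update into bank() for brevity.

#include <stdio.h>

/* Hypothetical stand-ins for the fields the banking logic above uses. */
struct mini_task {
    unsigned long long sched_time;  /* banked ns */
    unsigned long long timestamp;   /* last context-switch time */
};

static unsigned long long timestamp_last_tick;  /* per-runqueue in the kernel */

static unsigned long long max_ull(unsigned long long a, unsigned long long b)
{
    return a > b ? a : b;
}

/* Mirror of update_cpu_clock(): bank the ns since the last tick or switch. */
static void bank(struct mini_task *p, unsigned long long now)
{
    unsigned long long last = max_ull(p->timestamp, timestamp_last_tick);

    p->sched_time += now - last;
    timestamp_last_tick = now;  /* folded in here; scheduler_tick() does this */
}

/* Mirror of current_sched_time(): banked time plus the unbanked remainder. */
static unsigned long long read_time(const struct mini_task *p, unsigned long long now)
{
    unsigned long long last = max_ull(p->timestamp, timestamp_last_tick);

    return p->sched_time + (now - last);
}

int main(void)
{
    struct mini_task t = { 0, 100 };

    timestamp_last_tick = 100;
    bank(&t, 250);                                      /* banks 150 ns */
    printf("at 300: %llu ns\n", read_time(&t, 300));    /* 150 banked + 50 pending = 200 */
    return 0;
}
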
2255 2255
2256 /* 2256 /*
2257 * We place interactive tasks back into the active array, if possible. 2257 * We place interactive tasks back into the active array, if possible.
2258 * 2258 *
2259 * To guarantee that this does not starve expired tasks we ignore the 2259 * To guarantee that this does not starve expired tasks we ignore the
2260 * interactivity of a task if the first expired task had to wait more 2260 * interactivity of a task if the first expired task had to wait more
2261 * than a 'reasonable' amount of time. This deadline timeout is 2261 * than a 'reasonable' amount of time. This deadline timeout is
2262 * load-dependent, as the frequency of array switches decreases with 2262 * load-dependent, as the frequency of array switches decreases with
2263 * an increasing number of running tasks. We also ignore the interactivity 2263 * an increasing number of running tasks. We also ignore the interactivity
2264 * if a better static_prio task has expired: 2264 * if a better static_prio task has expired:
2265 */ 2265 */
2266 #define EXPIRED_STARVING(rq) \ 2266 #define EXPIRED_STARVING(rq) \
2267 ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ 2267 ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
2268 (jiffies - (rq)->expired_timestamp >= \ 2268 (jiffies - (rq)->expired_timestamp >= \
2269 STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ 2269 STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
2270 ((rq)->curr->static_prio > (rq)->best_expired_prio)) 2270 ((rq)->curr->static_prio > (rq)->best_expired_prio))
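
EXPIRED_STARVING() packs its logic into one macro: starvation detection is enabled, the first expired task has waited past a deadline that scales with the number of runners, or a task with a better static priority than the current one has already expired. Spelled out as a function the same test reads as below; this is an illustrative rewrite with simplified stand-in types, not a replacement for the macro.

#include <stdio.h>
#include <stdbool.h>

#define STARVATION_LIMIT 100    /* jiffies per running task; assumed value */

/* Only the fields the macro reads, as a hypothetical miniature of the runqueue. */
struct mini_rq {
    unsigned long expired_timestamp;    /* when the first task expired, or 0 */
    unsigned long nr_running;
    int curr_static_prio;               /* rq->curr->static_prio */
    int best_expired_prio;
};

static bool expired_starving(const struct mini_rq *rq, unsigned long jiffies_now)
{
    /* Deadline scales with load: more runners means array switches are rarer. */
    if (STARVATION_LIMIT && rq->expired_timestamp &&
        jiffies_now - rq->expired_timestamp >=
            STARVATION_LIMIT * rq->nr_running + 1)
        return true;

    /* A better (numerically lower) static priority already sits in expired. */
    return rq->curr_static_prio > rq->best_expired_prio;
}

int main(void)
{
    struct mini_rq rq = { 0, 2, 120, 115 };

    printf("starving: %d\n", expired_starving(&rq, 1500)); /* 1: a better static_prio task expired */
    return 0;
}
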
2271 2271
2272 /* 2272 /*
2273 * Account user cpu time to a process. 2273 * Account user cpu time to a process.
2274 * @p: the process that the cpu time gets accounted to 2274 * @p: the process that the cpu time gets accounted to
2275 * @hardirq_offset: the offset to subtract from hardirq_count() 2275 * @hardirq_offset: the offset to subtract from hardirq_count()
2276 * @cputime: the cpu time spent in user space since the last update 2276 * @cputime: the cpu time spent in user space since the last update
2277 */ 2277 */
2278 void account_user_time(struct task_struct *p, cputime_t cputime) 2278 void account_user_time(struct task_struct *p, cputime_t cputime)
2279 { 2279 {
2280 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2280 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2281 cputime64_t tmp; 2281 cputime64_t tmp;
2282 2282
2283 p->utime = cputime_add(p->utime, cputime); 2283 p->utime = cputime_add(p->utime, cputime);
2284 2284
2285 /* Add user time to cpustat. */ 2285 /* Add user time to cpustat. */
2286 tmp = cputime_to_cputime64(cputime); 2286 tmp = cputime_to_cputime64(cputime);
2287 if (TASK_NICE(p) > 0) 2287 if (TASK_NICE(p) > 0)
2288 cpustat->nice = cputime64_add(cpustat->nice, tmp); 2288 cpustat->nice = cputime64_add(cpustat->nice, tmp);
2289 else 2289 else
2290 cpustat->user = cputime64_add(cpustat->user, tmp); 2290 cpustat->user = cputime64_add(cpustat->user, tmp);
2291 } 2291 }
2292 2292
2293 /* 2293 /*
2294 * Account system cpu time to a process. 2294 * Account system cpu time to a process.
2295 * @p: the process that the cpu time gets accounted to 2295 * @p: the process that the cpu time gets accounted to
2296 * @hardirq_offset: the offset to subtract from hardirq_count() 2296 * @hardirq_offset: the offset to subtract from hardirq_count()
2297 * @cputime: the cpu time spent in kernel space since the last update 2297 * @cputime: the cpu time spent in kernel space since the last update
2298 */ 2298 */
2299 void account_system_time(struct task_struct *p, int hardirq_offset, 2299 void account_system_time(struct task_struct *p, int hardirq_offset,
2300 cputime_t cputime) 2300 cputime_t cputime)
2301 { 2301 {
2302 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2302 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2303 runqueue_t *rq = this_rq(); 2303 runqueue_t *rq = this_rq();
2304 cputime64_t tmp; 2304 cputime64_t tmp;
2305 2305
2306 p->stime = cputime_add(p->stime, cputime); 2306 p->stime = cputime_add(p->stime, cputime);
2307 2307
2308 /* Add system time to cpustat. */ 2308 /* Add system time to cpustat. */
2309 tmp = cputime_to_cputime64(cputime); 2309 tmp = cputime_to_cputime64(cputime);
2310 if (hardirq_count() - hardirq_offset) 2310 if (hardirq_count() - hardirq_offset)
2311 cpustat->irq = cputime64_add(cpustat->irq, tmp); 2311 cpustat->irq = cputime64_add(cpustat->irq, tmp);
2312 else if (softirq_count()) 2312 else if (softirq_count())
2313 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 2313 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
2314 else if (p != rq->idle) 2314 else if (p != rq->idle)
2315 cpustat->system = cputime64_add(cpustat->system, tmp); 2315 cpustat->system = cputime64_add(cpustat->system, tmp);
2316 else if (atomic_read(&rq->nr_iowait) > 0) 2316 else if (atomic_read(&rq->nr_iowait) > 0)
2317 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 2317 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
2318 else 2318 else
2319 cpustat->idle = cputime64_add(cpustat->idle, tmp); 2319 cpustat->idle = cputime64_add(cpustat->idle, tmp);
2320 /* Account for system time used */ 2320 /* Account for system time used */
2321 acct_update_integrals(p); 2321 acct_update_integrals(p);
2322 /* Update rss highwater mark */ 2322 /* Update rss highwater mark */
2323 update_mem_hiwater(p); 2323 update_mem_hiwater(p);
2324 } 2324 }
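
account_system_time() charges each chunk of system time to exactly one bucket, tested in a fixed order: hardirq context wins over softirq, softirq over ordinary system time for non-idle tasks, and the idle task's time becomes iowait if anything is waiting on I/O, otherwise idle. That precedence is restated below as a small stand-alone function; the enum and boolean inputs are illustrative simplifications of the kernel's context checks.

#include <stdio.h>
#include <stdbool.h>

enum bucket { BUCKET_IRQ, BUCKET_SOFTIRQ, BUCKET_SYSTEM, BUCKET_IOWAIT, BUCKET_IDLE };

/* Same precedence as the if/else ladder in account_system_time() above. */
static enum bucket classify(bool in_hardirq, bool in_softirq,
                            bool task_is_idle, bool io_in_flight)
{
    if (in_hardirq)
        return BUCKET_IRQ;
    if (in_softirq)
        return BUCKET_SOFTIRQ;
    if (!task_is_idle)
        return BUCKET_SYSTEM;
    return io_in_flight ? BUCKET_IOWAIT : BUCKET_IDLE;
}

int main(void)
{
    static const char *name[] = { "irq", "softirq", "system", "iowait", "idle" };

    printf("%s\n", name[classify(false, true, false, false)]);  /* softirq */
    printf("%s\n", name[classify(false, false, true, true)]);   /* iowait  */
    return 0;
}
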
2325 2325
2326 /* 2326 /*
2327 * Account for involuntary wait time. 2327 * Account for involuntary wait time.
2328 * @p: the process from which the cpu time has been stolen 2328 * @p: the process from which the cpu time has been stolen
2329 * @steal: the cpu time spent in involuntary wait 2329 * @steal: the cpu time spent in involuntary wait
2330 */ 2330 */
2331 void account_steal_time(struct task_struct *p, cputime_t steal) 2331 void account_steal_time(struct task_struct *p, cputime_t steal)
2332 { 2332 {
2333 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2333 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2334 cputime64_t tmp = cputime_to_cputime64(steal); 2334 cputime64_t tmp = cputime_to_cputime64(steal);
2335 runqueue_t *rq = this_rq(); 2335 runqueue_t *rq = this_rq();
2336 2336
2337 if (p == rq->idle) { 2337 if (p == rq->idle) {
2338 p->stime = cputime_add(p->stime, steal); 2338 p->stime = cputime_add(p->stime, steal);
2339 if (atomic_read(&rq->nr_iowait) > 0) 2339 if (atomic_read(&rq->nr_iowait) > 0)
2340 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 2340 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
2341 else 2341 else
2342 cpustat->idle = cputime64_add(cpustat->idle, tmp); 2342 cpustat->idle = cputime64_add(cpustat->idle, tmp);
2343 } else 2343 } else
2344 cpustat->steal = cputime64_add(cpustat->steal, tmp); 2344 cpustat->steal = cputime64_add(cpustat->steal, tmp);
2345 } 2345 }
2346 2346
2347 /* 2347 /*
2348 * This function gets called by the timer code, with HZ frequency. 2348 * This function gets called by the timer code, with HZ frequency.
2349 * We call it with interrupts disabled. 2349 * We call it with interrupts disabled.
2350 * 2350 *
2351 * It also gets called by the fork code, when changing the parent's 2351 * It also gets called by the fork code, when changing the parent's
2352 * timeslices. 2352 * timeslices.
2353 */ 2353 */
2354 void scheduler_tick(void) 2354 void scheduler_tick(void)
2355 { 2355 {
2356 int cpu = smp_processor_id(); 2356 int cpu = smp_processor_id();
2357 runqueue_t *rq = this_rq(); 2357 runqueue_t *rq = this_rq();
2358 task_t *p = current; 2358 task_t *p = current;
2359 unsigned long long now = sched_clock(); 2359 unsigned long long now = sched_clock();
2360 2360
2361 update_cpu_clock(p, rq, now); 2361 update_cpu_clock(p, rq, now);
2362 2362
2363 rq->timestamp_last_tick = now; 2363 rq->timestamp_last_tick = now;
2364 2364
2365 if (p == rq->idle) { 2365 if (p == rq->idle) {
2366 if (wake_priority_sleeper(rq)) 2366 if (wake_priority_sleeper(rq))
2367 goto out; 2367 goto out;
2368 rebalance_tick(cpu, rq, SCHED_IDLE); 2368 rebalance_tick(cpu, rq, SCHED_IDLE);
2369 return; 2369 return;
2370 } 2370 }
2371 2371
2372 /* Task might have expired already, but not scheduled off yet */ 2372 /* Task might have expired already, but not scheduled off yet */
2373 if (p->array != rq->active) { 2373 if (p->array != rq->active) {
2374 set_tsk_need_resched(p); 2374 set_tsk_need_resched(p);
2375 goto out; 2375 goto out;
2376 } 2376 }
2377 spin_lock(&rq->lock); 2377 spin_lock(&rq->lock);
2378 /* 2378 /*
2379 * The task was running during this tick - update the 2379 * The task was running during this tick - update the
2380 * time slice counter. Note: we do not update a thread's 2380 * time slice counter. Note: we do not update a thread's
2381 * priority until it either goes to sleep or uses up its 2381 * priority until it either goes to sleep or uses up its
2382 * timeslice. This makes it possible for interactive tasks 2382 * timeslice. This makes it possible for interactive tasks
2383 * to use up their timeslices at their highest priority levels. 2383 * to use up their timeslices at their highest priority levels.
2384 */ 2384 */
2385 if (rt_task(p)) { 2385 if (rt_task(p)) {
2386 /* 2386 /*
2387 * RR tasks need a special form of timeslice management. 2387 * RR tasks need a special form of timeslice management.
2388 * FIFO tasks have no timeslices. 2388 * FIFO tasks have no timeslices.
2389 */ 2389 */
2390 if ((p->policy == SCHED_RR) && !--p->time_slice) { 2390 if ((p->policy == SCHED_RR) && !--p->time_slice) {
2391 p->time_slice = task_timeslice(p); 2391 p->time_slice = task_timeslice(p);
2392 p->first_time_slice = 0; 2392 p->first_time_slice = 0;
2393 set_tsk_need_resched(p); 2393 set_tsk_need_resched(p);
2394 2394
2395 /* put it at the end of the queue: */ 2395 /* put it at the end of the queue: */
2396 requeue_task(p, rq->active); 2396 requeue_task(p, rq->active);
2397 } 2397 }
2398 goto out_unlock; 2398 goto out_unlock;
2399 } 2399 }
2400 if (!--p->time_slice) { 2400 if (!--p->time_slice) {
2401 dequeue_task(p, rq->active); 2401 dequeue_task(p, rq->active);
2402 set_tsk_need_resched(p); 2402 set_tsk_need_resched(p);
2403 p->prio = effective_prio(p); 2403 p->prio = effective_prio(p);
2404 p->time_slice = task_timeslice(p); 2404 p->time_slice = task_timeslice(p);
2405 p->first_time_slice = 0; 2405 p->first_time_slice = 0;
2406 2406
2407 if (!rq->expired_timestamp) 2407 if (!rq->expired_timestamp)
2408 rq->expired_timestamp = jiffies; 2408 rq->expired_timestamp = jiffies;
2409 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { 2409 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
2410 enqueue_task(p, rq->expired); 2410 enqueue_task(p, rq->expired);
2411 if (p->static_prio < rq->best_expired_prio) 2411 if (p->static_prio < rq->best_expired_prio)
2412 rq->best_expired_prio = p->static_prio; 2412 rq->best_expired_prio = p->static_prio;
2413 } else 2413 } else
2414 enqueue_task(p, rq->active); 2414 enqueue_task(p, rq->active);
2415 } else { 2415 } else {
2416 /* 2416 /*
2417 * Prevent too long a timeslice from allowing a task to monopolize 2417 * Prevent too long a timeslice from allowing a task to monopolize
2418 * the CPU. We do this by splitting up the timeslice into 2418 * the CPU. We do this by splitting up the timeslice into
2419 * smaller pieces. 2419 * smaller pieces.
2420 * 2420 *
2421 * Note: this does not mean the task's timeslices expire or 2421 * Note: this does not mean the task's timeslices expire or
2422 * get lost in any way, they just might be preempted by 2422 * get lost in any way, they just might be preempted by
2423 * another task of equal priority. (one with higher 2423 * another task of equal priority. (one with higher
2424 * priority would have preempted this task already.) We 2424 * priority would have preempted this task already.) We
2425 * requeue this task to the end of the list on this priority 2425 * requeue this task to the end of the list on this priority
2426 * level, which is in essence a round-robin of tasks with 2426 * level, which is in essence a round-robin of tasks with
2427 * equal priority. 2427 * equal priority.
2428 * 2428 *
2429 * This only applies to tasks in the interactive 2429 * This only applies to tasks in the interactive
2430 * delta range with at least TIMESLICE_GRANULARITY to requeue. 2430 * delta range with at least TIMESLICE_GRANULARITY to requeue.
2431 */ 2431 */
2432 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - 2432 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
2433 p->time_slice) % TIMESLICE_GRANULARITY(p)) && 2433 p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
2434 (p->time_slice >= TIMESLICE_GRANULARITY(p)) && 2434 (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
2435 (p->array == rq->active)) { 2435 (p->array == rq->active)) {
2436 2436
2437 requeue_task(p, rq->active); 2437 requeue_task(p, rq->active);
2438 set_tsk_need_resched(p); 2438 set_tsk_need_resched(p);
2439 } 2439 }
2440 } 2440 }
2441 out_unlock: 2441 out_unlock:
2442 spin_unlock(&rq->lock); 2442 spin_unlock(&rq->lock);
2443 out: 2443 out:
2444 rebalance_tick(cpu, rq, NOT_IDLE); 2444 rebalance_tick(cpu, rq, NOT_IDLE);
2445 } 2445 }
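
The final branch of scheduler_tick() round-robins an interactive task among its priority peers before the slice expires: it requeues only on ticks where the consumed part of the slice is an exact multiple of TIMESLICE_GRANULARITY and at least one full granule remains. The sketch below models just that arithmetic (the interactivity and active-array checks are omitted), with an assumed constant granularity.

#include <stdio.h>
#include <stdbool.h>

#define TIMESLICE_GRANULARITY 25    /* ticks; assumed constant for the sketch */

/*
 * Mirror of the requeue test in scheduler_tick(): split a long timeslice
 * into granules without ever shortening it.
 */
static bool should_requeue(int full_timeslice, int time_slice_left)
{
    int consumed = full_timeslice - time_slice_left;

    return (consumed % TIMESLICE_GRANULARITY) == 0 &&
           time_slice_left >= TIMESLICE_GRANULARITY;
}

int main(void)
{
    int full = 100;

    for (int left = full - 1; left > 0; left--)
        if (should_requeue(full, left))
            printf("requeue with %d ticks left\n", left);   /* 75, 50, 25 */
    return 0;
}
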
2446 2446
2447 #ifdef CONFIG_SCHED_SMT 2447 #ifdef CONFIG_SCHED_SMT
2448 static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 2448 static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2449 { 2449 {
2450 struct sched_domain *sd = this_rq->sd; 2450 struct sched_domain *sd = this_rq->sd;
2451 cpumask_t sibling_map; 2451 cpumask_t sibling_map;
2452 int i; 2452 int i;
2453 2453
2454 if (!(sd->flags & SD_SHARE_CPUPOWER)) 2454 if (!(sd->flags & SD_SHARE_CPUPOWER))
2455 return; 2455 return;
2456 2456
2457 /* 2457 /*
2458 * Unlock the current runqueue because we have to lock in 2458 * Unlock the current runqueue because we have to lock in
2459 * CPU order to avoid deadlocks. Caller knows that we might 2459 * CPU order to avoid deadlocks. Caller knows that we might
2460 * unlock. We keep IRQs disabled. 2460 * unlock. We keep IRQs disabled.
2461 */ 2461 */
2462 spin_unlock(&this_rq->lock); 2462 spin_unlock(&this_rq->lock);
2463 2463
2464 sibling_map = sd->span; 2464 sibling_map = sd->span;
2465 2465
2466 for_each_cpu_mask(i, sibling_map) 2466 for_each_cpu_mask(i, sibling_map)
2467 spin_lock(&cpu_rq(i)->lock); 2467 spin_lock(&cpu_rq(i)->lock);
2468 /* 2468 /*
2469 * We clear this CPU from the mask. This both simplifies the 2469 * We clear this CPU from the mask. This both simplifies the
2470 * inner loop and keeps this_rq locked when we exit: 2470 * inner loop and keeps this_rq locked when we exit:
2471 */ 2471 */
2472 cpu_clear(this_cpu, sibling_map); 2472 cpu_clear(this_cpu, sibling_map);
2473 2473
2474 for_each_cpu_mask(i, sibling_map) { 2474 for_each_cpu_mask(i, sibling_map) {
2475 runqueue_t *smt_rq = cpu_rq(i); 2475 runqueue_t *smt_rq = cpu_rq(i);
2476 2476
2477 /* 2477 /*
2478 * If an SMT sibling task is sleeping due to priority 2478 * If an SMT sibling task is sleeping due to priority
2479 * reasons, wake it up now. 2479 * reasons, wake it up now.
2480 */ 2480 */
2481 if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running) 2481 if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
2482 resched_task(smt_rq->idle); 2482 resched_task(smt_rq->idle);
2483 } 2483 }
2484 2484
2485 for_each_cpu_mask(i, sibling_map) 2485 for_each_cpu_mask(i, sibling_map)
2486 spin_unlock(&cpu_rq(i)->lock); 2486 spin_unlock(&cpu_rq(i)->lock);
2487 /* 2487 /*
2488 * We exit with this_cpu's rq still held and IRQs 2488 * We exit with this_cpu's rq still held and IRQs
2489 * still disabled: 2489 * still disabled:
2490 */ 2490 */
2491 } 2491 }
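
wake_sleeping_dependent() first drops this_rq->lock and then takes every sibling runqueue lock in ascending CPU order; using one global ordering for nested locks is what rules out an AB-BA deadlock when two siblings run this path at the same time. The user-space fragment below illustrates the same rule with pthread mutexes; it is a model of the locking discipline, not kernel code.

#include <pthread.h>
#include <stdio.h>

#define NCPUS 4

static pthread_mutex_t rq_lock[NCPUS] = {
    PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
    PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

/* Lock every runqueue in a sibling mask in ascending index order. */
static void lock_siblings(unsigned int mask)
{
    for (int cpu = 0; cpu < NCPUS; cpu++)
        if (mask & (1u << cpu))
            pthread_mutex_lock(&rq_lock[cpu]);
}

static void unlock_siblings(unsigned int mask)
{
    for (int cpu = 0; cpu < NCPUS; cpu++)
        if (mask & (1u << cpu))
            pthread_mutex_unlock(&rq_lock[cpu]);
}

static void *balancer(void *arg)
{
    unsigned int mask = (unsigned int)(unsigned long)arg;

    /* Both threads use the same global order, so they cannot deadlock. */
    lock_siblings(mask);
    printf("thread holding mask 0x%x\n", mask);
    unlock_siblings(mask);
    return NULL;
}

int main(void)
{
    pthread_t a, b;

    pthread_create(&a, NULL, balancer, (void *)0x3ul);  /* CPUs 0 and 1 */
    pthread_create(&b, NULL, balancer, (void *)0x6ul);  /* CPUs 1 and 2 */
    pthread_join(a, NULL);
    pthread_join(b, NULL);
    return 0;
}

If one thread locked 0 then 1 while the other locked 1 then 0, each could block holding the lock the other needs; the shared ascending order makes that interleaving impossible.
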
2492 2492
2493 static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 2493 static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2494 { 2494 {
2495 struct sched_domain *sd = this_rq->sd; 2495 struct sched_domain *sd = this_rq->sd;
2496 cpumask_t sibling_map; 2496 cpumask_t sibling_map;
2497 prio_array_t *array; 2497 prio_array_t *array;
2498 int ret = 0, i; 2498 int ret = 0, i;
2499 task_t *p; 2499 task_t *p;
2500 2500
2501 if (!(sd->flags & SD_SHARE_CPUPOWER)) 2501 if (!(sd->flags & SD_SHARE_CPUPOWER))
2502 return 0; 2502 return 0;
2503 2503
2504 /* 2504 /*
2505 * The same locking rules and details apply as for 2505 * The same locking rules and details apply as for
2506 * wake_sleeping_dependent(): 2506 * wake_sleeping_dependent():
2507 */ 2507 */
2508 spin_unlock(&this_rq->lock); 2508 spin_unlock(&this_rq->lock);
2509 sibling_map = sd->span; 2509 sibling_map = sd->span;
2510 for_each_cpu_mask(i, sibling_map) 2510 for_each_cpu_mask(i, sibling_map)
2511 spin_lock(&cpu_rq(i)->lock); 2511 spin_lock(&cpu_rq(i)->lock);
2512 cpu_clear(this_cpu, sibling_map); 2512 cpu_clear(this_cpu, sibling_map);
2513 2513
2514 /* 2514 /*
2515 * Establish next task to be run - it might have gone away because 2515 * Establish next task to be run - it might have gone away because
2516 * we released the runqueue lock above: 2516 * we released the runqueue lock above:
2517 */ 2517 */
2518 if (!this_rq->nr_running) 2518 if (!this_rq->nr_running)
2519 goto out_unlock; 2519 goto out_unlock;
2520 array = this_rq->active; 2520 array = this_rq->active;
2521 if (!array->nr_active) 2521 if (!array->nr_active)
2522 array = this_rq->expired; 2522 array = this_rq->expired;
2523 BUG_ON(!array->nr_active); 2523 BUG_ON(!array->nr_active);
2524 2524
2525 p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, 2525 p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next,
2526 task_t, run_list); 2526 task_t, run_list);
2527 2527
2528 for_each_cpu_mask(i, sibling_map) { 2528 for_each_cpu_mask(i, sibling_map) {
2529 runqueue_t *smt_rq = cpu_rq(i); 2529 runqueue_t *smt_rq = cpu_rq(i);
2530 task_t *smt_curr = smt_rq->curr; 2530 task_t *smt_curr = smt_rq->curr;
2531 2531
2532 /* 2532 /*
2533 * If a user task with lower static priority than the 2533 * If a user task with lower static priority than the
2534 * running task on the SMT sibling is trying to schedule, 2534 * running task on the SMT sibling is trying to schedule,
2535 * delay it till there is proportionately less timeslice 2535 * delay it till there is proportionately less timeslice
2536 * left of the sibling task to prevent a lower priority 2536 * left of the sibling task to prevent a lower priority
2537 * task from using an unfair proportion of the 2537 * task from using an unfair proportion of the
2538 * physical cpu's resources. -ck 2538 * physical cpu's resources. -ck
2539 */ 2539 */
2540 if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > 2540 if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) >
2541 task_timeslice(p) || rt_task(smt_curr)) && 2541 task_timeslice(p) || rt_task(smt_curr)) &&
2542 p->mm && smt_curr->mm && !rt_task(p)) 2542 p->mm && smt_curr->mm && !rt_task(p))
2543 ret = 1; 2543 ret = 1;
2544 2544
2545 /* 2545 /*
2546 * Reschedule a lower priority task on the SMT sibling, 2546 * Reschedule a lower priority task on the SMT sibling,
2547 * or wake it up if it has been put to sleep for priority 2547 * or wake it up if it has been put to sleep for priority
2548 * reasons. 2548 * reasons.
2549 */ 2549 */
2550 if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > 2550 if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) >
2551 task_timeslice(smt_curr) || rt_task(p)) && 2551 task_timeslice(smt_curr) || rt_task(p)) &&
2552 smt_curr->mm && p->mm && !rt_task(smt_curr)) || 2552 smt_curr->mm && p->mm && !rt_task(smt_curr)) ||
2553 (smt_curr == smt_rq->idle && smt_rq->nr_running)) 2553 (smt_curr == smt_rq->idle && smt_rq->nr_running))
2554 resched_task(smt_curr); 2554 resched_task(smt_curr);
2555 } 2555 }
2556 out_unlock: 2556 out_unlock:
2557 for_each_cpu_mask(i, sibling_map) 2557 for_each_cpu_mask(i, sibling_map)
2558 spin_unlock(&cpu_rq(i)->lock); 2558 spin_unlock(&cpu_rq(i)->lock);
2559 return ret; 2559 return ret;
2560 } 2560 }
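
The core of dependent_sleeper() is a throttling comparison: a newly picked task is held back when the task on the SMT sibling, after discounting its remaining slice by per_cpu_gain percent, still has more left than the newcomer's entire timeslice, and a real-time sibling always wins. The sketch below shows only that comparison; the checks that both tasks have a user mm and that the newcomer is not itself real-time are omitted, and the per_cpu_gain value is assumed for illustration.

#include <stdio.h>
#include <stdbool.h>

#define PER_CPU_GAIN 25     /* percent; value assumed for this sketch */

/*
 * Should the newcomer (whose full timeslice is new_slice) be delayed because
 * the sibling's current task still has sibling_left ticks of its slice?
 * Mirrors the first comparison in dependent_sleeper().
 */
static bool delay_newcomer(int sibling_left, int new_slice, bool sibling_is_rt)
{
    return sibling_is_rt ||
           sibling_left * (100 - PER_CPU_GAIN) / 100 > new_slice;
}

int main(void)
{
    printf("%d\n", delay_newcomer(200, 100, false));    /* 1: 150 > 100 */
    printf("%d\n", delay_newcomer(120, 100, false));    /* 0:  90 <= 100 */
    return 0;
}
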
2561 #else 2561 #else
2562 static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 2562 static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2563 { 2563 {
2564 } 2564 }
2565 2565
2566 static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 2566 static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2567 { 2567 {
2568 return 0; 2568 return 0;
2569 } 2569 }
2570 #endif 2570 #endif
2571 2571
2572 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) 2572 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
2573 2573
2574 void fastcall add_preempt_count(int val) 2574 void fastcall add_preempt_count(int val)
2575 { 2575 {
2576 /* 2576 /*
2577 * Underflow? 2577 * Underflow?
2578 */ 2578 */
2579 BUG_ON(((int)preempt_count() < 0)); 2579 BUG_ON(((int)preempt_count() < 0));
2580 preempt_count() += val; 2580 preempt_count() += val;
2581 /* 2581 /*
2582 * Spinlock count overflowing soon? 2582 * Spinlock count overflowing soon?
2583 */ 2583 */
2584 BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); 2584 BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10);
2585 } 2585 }
2586 EXPORT_SYMBOL(add_preempt_count); 2586 EXPORT_SYMBOL(add_preempt_count);
2587 2587
2588 void fastcall sub_preempt_count(int val) 2588 void fastcall sub_preempt_count(int val)
2589 { 2589 {
2590 /* 2590 /*
2591 * Underflow? 2591 * Underflow?
2592 */ 2592 */
2593 BUG_ON(val > preempt_count()); 2593 BUG_ON(val > preempt_count());
2594 /* 2594 /*
2595 * Is the spinlock portion underflowing? 2595 * Is the spinlock portion underflowing?
2596 */ 2596 */
2597 BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); 2597 BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK));
2598 preempt_count() -= val; 2598 preempt_count() -= val;
2599 } 2599 }
2600 EXPORT_SYMBOL(sub_preempt_count); 2600 EXPORT_SYMBOL(sub_preempt_count);
2601 2601
2602 #endif 2602 #endif
2603 2603
2604 /* 2604 /*
2605 * schedule() is the main scheduler function. 2605 * schedule() is the main scheduler function.
2606 */ 2606 */
2607 asmlinkage void __sched schedule(void) 2607 asmlinkage void __sched schedule(void)
2608 { 2608 {
2609 long *switch_count; 2609 long *switch_count;
2610 task_t *prev, *next; 2610 task_t *prev, *next;
2611 runqueue_t *rq; 2611 runqueue_t *rq;
2612 prio_array_t *array; 2612 prio_array_t *array;
2613 struct list_head *queue; 2613 struct list_head *queue;
2614 unsigned long long now; 2614 unsigned long long now;
2615 unsigned long run_time; 2615 unsigned long run_time;
2616 int cpu, idx; 2616 int cpu, idx;
2617 2617
2618 /* 2618 /*
2619 * Test if we are atomic. Since do_exit() needs to call into 2619 * Test if we are atomic. Since do_exit() needs to call into
2620 * schedule() atomically, we ignore that path for now. 2620 * schedule() atomically, we ignore that path for now.
2621 * Otherwise, whine if we are scheduling when we should not be. 2621 * Otherwise, whine if we are scheduling when we should not be.
2622 */ 2622 */
2623 if (likely(!current->exit_state)) { 2623 if (likely(!current->exit_state)) {
2624 if (unlikely(in_atomic())) { 2624 if (unlikely(in_atomic())) {
2625 printk(KERN_ERR "scheduling while atomic: " 2625 printk(KERN_ERR "scheduling while atomic: "
2626 "%s/0x%08x/%d\n", 2626 "%s/0x%08x/%d\n",
2627 current->comm, preempt_count(), current->pid); 2627 current->comm, preempt_count(), current->pid);
2628 dump_stack(); 2628 dump_stack();
2629 } 2629 }
2630 } 2630 }
2631 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 2631 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
2632 2632
2633 need_resched: 2633 need_resched:
2634 preempt_disable(); 2634 preempt_disable();
2635 prev = current; 2635 prev = current;
2636 release_kernel_lock(prev); 2636 release_kernel_lock(prev);
2637 need_resched_nonpreemptible: 2637 need_resched_nonpreemptible:
2638 rq = this_rq(); 2638 rq = this_rq();
2639 2639
2640 /* 2640 /*
2641 * The idle thread is not allowed to schedule! 2641 * The idle thread is not allowed to schedule!
2642 * Remove this check after it has been exercised a bit. 2642 * Remove this check after it has been exercised a bit.
2643 */ 2643 */
2644 if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) { 2644 if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
2645 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 2645 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
2646 dump_stack(); 2646 dump_stack();
2647 } 2647 }
2648 2648
2649 schedstat_inc(rq, sched_cnt); 2649 schedstat_inc(rq, sched_cnt);
2650 now = sched_clock(); 2650 now = sched_clock();
2651 if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { 2651 if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
2652 run_time = now - prev->timestamp; 2652 run_time = now - prev->timestamp;
2653 if (unlikely((long long)(now - prev->timestamp) < 0)) 2653 if (unlikely((long long)(now - prev->timestamp) < 0))
2654 run_time = 0; 2654 run_time = 0;
2655 } else 2655 } else
2656 run_time = NS_MAX_SLEEP_AVG; 2656 run_time = NS_MAX_SLEEP_AVG;
2657 2657
2658 /* 2658 /*
2659 * Tasks are charged proportionately less run_time at high sleep_avg to 2659 * Tasks are charged proportionately less run_time at high sleep_avg to
2660 * delay them losing their interactive status. 2660 * delay them losing their interactive status.
2661 */ 2661 */
2662 run_time /= (CURRENT_BONUS(prev) ? : 1); 2662 run_time /= (CURRENT_BONUS(prev) ? : 1);
2663 2663
2664 spin_lock_irq(&rq->lock); 2664 spin_lock_irq(&rq->lock);
2665 2665
2666 if (unlikely(prev->flags & PF_DEAD)) 2666 if (unlikely(prev->flags & PF_DEAD))
2667 prev->state = EXIT_DEAD; 2667 prev->state = EXIT_DEAD;
2668 2668
2669 switch_count = &prev->nivcsw; 2669 switch_count = &prev->nivcsw;
2670 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 2670 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2671 switch_count = &prev->nvcsw; 2671 switch_count = &prev->nvcsw;
2672 if (unlikely((prev->state & TASK_INTERRUPTIBLE) && 2672 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
2673 unlikely(signal_pending(prev)))) 2673 unlikely(signal_pending(prev))))
2674 prev->state = TASK_RUNNING; 2674 prev->state = TASK_RUNNING;
2675 else { 2675 else {
2676 if (prev->state == TASK_UNINTERRUPTIBLE) 2676 if (prev->state == TASK_UNINTERRUPTIBLE)
2677 rq->nr_uninterruptible++; 2677 rq->nr_uninterruptible++;
2678 deactivate_task(prev, rq); 2678 deactivate_task(prev, rq);
2679 } 2679 }
2680 } 2680 }
2681 2681
2682 cpu = smp_processor_id(); 2682 cpu = smp_processor_id();
2683 if (unlikely(!rq->nr_running)) { 2683 if (unlikely(!rq->nr_running)) {
2684 go_idle: 2684 go_idle:
2685 idle_balance(cpu, rq); 2685 idle_balance(cpu, rq);
2686 if (!rq->nr_running) { 2686 if (!rq->nr_running) {
2687 next = rq->idle; 2687 next = rq->idle;
2688 rq->expired_timestamp = 0; 2688 rq->expired_timestamp = 0;
2689 wake_sleeping_dependent(cpu, rq); 2689 wake_sleeping_dependent(cpu, rq);
2690 /* 2690 /*
2691 * wake_sleeping_dependent() might have released 2691 * wake_sleeping_dependent() might have released
2692 * the runqueue, so break out if we got new 2692 * the runqueue, so break out if we got new
2693 * tasks meanwhile: 2693 * tasks meanwhile:
2694 */ 2694 */
2695 if (!rq->nr_running) 2695 if (!rq->nr_running)
2696 goto switch_tasks; 2696 goto switch_tasks;
2697 } 2697 }
2698 } else { 2698 } else {
2699 if (dependent_sleeper(cpu, rq)) { 2699 if (dependent_sleeper(cpu, rq)) {
2700 next = rq->idle; 2700 next = rq->idle;
2701 goto switch_tasks; 2701 goto switch_tasks;
2702 } 2702 }
2703 /* 2703 /*
2704 * dependent_sleeper() releases and reacquires the runqueue 2704 * dependent_sleeper() releases and reacquires the runqueue
2705 * lock, hence go into the idle loop if the rq went 2705 * lock, hence go into the idle loop if the rq went
2706 * empty meanwhile: 2706 * empty meanwhile:
2707 */ 2707 */
2708 if (unlikely(!rq->nr_running)) 2708 if (unlikely(!rq->nr_running))
2709 goto go_idle; 2709 goto go_idle;
2710 } 2710 }
2711 2711
2712 array = rq->active; 2712 array = rq->active;
2713 if (unlikely(!array->nr_active)) { 2713 if (unlikely(!array->nr_active)) {
2714 /* 2714 /*
2715 * Switch the active and expired arrays. 2715 * Switch the active and expired arrays.
2716 */ 2716 */
2717 schedstat_inc(rq, sched_switch); 2717 schedstat_inc(rq, sched_switch);
2718 rq->active = rq->expired; 2718 rq->active = rq->expired;
2719 rq->expired = array; 2719 rq->expired = array;
2720 array = rq->active; 2720 array = rq->active;
2721 rq->expired_timestamp = 0; 2721 rq->expired_timestamp = 0;
2722 rq->best_expired_prio = MAX_PRIO; 2722 rq->best_expired_prio = MAX_PRIO;
2723 } 2723 }
2724 2724
2725 idx = sched_find_first_bit(array->bitmap); 2725 idx = sched_find_first_bit(array->bitmap);
2726 queue = array->queue + idx; 2726 queue = array->queue + idx;
2727 next = list_entry(queue->next, task_t, run_list); 2727 next = list_entry(queue->next, task_t, run_list);
2728 2728
2729 if (!rt_task(next) && next->activated > 0) { 2729 if (!rt_task(next) && next->activated > 0) {
2730 unsigned long long delta = now - next->timestamp; 2730 unsigned long long delta = now - next->timestamp;
2731 if (unlikely((long long)(now - next->timestamp) < 0)) 2731 if (unlikely((long long)(now - next->timestamp) < 0))
2732 delta = 0; 2732 delta = 0;
2733 2733
2734 if (next->activated == 1) 2734 if (next->activated == 1)
2735 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; 2735 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
2736 2736
2737 array = next->array; 2737 array = next->array;
2738 dequeue_task(next, array); 2738 dequeue_task(next, array);
2739 recalc_task_prio(next, next->timestamp + delta); 2739 recalc_task_prio(next, next->timestamp + delta);
2740 enqueue_task(next, array); 2740 enqueue_task(next, array);
2741 } 2741 }
2742 next->activated = 0; 2742 next->activated = 0;
2743 switch_tasks: 2743 switch_tasks:
2744 if (next == rq->idle) 2744 if (next == rq->idle)
2745 schedstat_inc(rq, sched_goidle); 2745 schedstat_inc(rq, sched_goidle);
2746 prefetch(next); 2746 prefetch(next);
2747 clear_tsk_need_resched(prev); 2747 clear_tsk_need_resched(prev);
2748 rcu_qsctr_inc(task_cpu(prev)); 2748 rcu_qsctr_inc(task_cpu(prev));
2749 2749
2750 update_cpu_clock(prev, rq, now); 2750 update_cpu_clock(prev, rq, now);
2751 2751
2752 prev->sleep_avg -= run_time; 2752 prev->sleep_avg -= run_time;
2753 if ((long)prev->sleep_avg <= 0) 2753 if ((long)prev->sleep_avg <= 0)
2754 prev->sleep_avg = 0; 2754 prev->sleep_avg = 0;
2755 prev->timestamp = prev->last_ran = now; 2755 prev->timestamp = prev->last_ran = now;
2756 2756
2757 sched_info_switch(prev, next); 2757 sched_info_switch(prev, next);
2758 if (likely(prev != next)) { 2758 if (likely(prev != next)) {
2759 next->timestamp = now; 2759 next->timestamp = now;
2760 rq->nr_switches++; 2760 rq->nr_switches++;
2761 rq->curr = next; 2761 rq->curr = next;
2762 ++*switch_count; 2762 ++*switch_count;
2763 2763
2764 prepare_arch_switch(rq, next); 2764 prepare_arch_switch(rq, next);
2765 prev = context_switch(rq, prev, next); 2765 prev = context_switch(rq, prev, next);
2766 barrier(); 2766 barrier();
2767 2767
2768 finish_task_switch(prev); 2768 finish_task_switch(prev);
2769 } else 2769 } else
2770 spin_unlock_irq(&rq->lock); 2770 spin_unlock_irq(&rq->lock);
2771 2771
2772 prev = current; 2772 prev = current;
2773 if (unlikely(reacquire_kernel_lock(prev) < 0)) 2773 if (unlikely(reacquire_kernel_lock(prev) < 0))
2774 goto need_resched_nonpreemptible; 2774 goto need_resched_nonpreemptible;
2775 preempt_enable_no_resched(); 2775 preempt_enable_no_resched();
2776 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 2776 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
2777 goto need_resched; 2777 goto need_resched;
2778 } 2778 }
2779 2779
2780 EXPORT_SYMBOL(schedule); 2780 EXPORT_SYMBOL(schedule);
2781 2781
2782 #ifdef CONFIG_PREEMPT 2782 #ifdef CONFIG_PREEMPT
2783 /* 2783 /*
2784 * this is the entry point to schedule() from in-kernel preemption 2784 * this is the entry point to schedule() from in-kernel preemption
2785 * off of preempt_enable. Kernel preemptions off of return-from-interrupt 2785 * off of preempt_enable. Kernel preemptions off of return-from-interrupt
2786 * occur there and call schedule() directly. 2786 * occur there and call schedule() directly.
2787 */ 2787 */
2788 asmlinkage void __sched preempt_schedule(void) 2788 asmlinkage void __sched preempt_schedule(void)
2789 { 2789 {
2790 struct thread_info *ti = current_thread_info(); 2790 struct thread_info *ti = current_thread_info();
2791 #ifdef CONFIG_PREEMPT_BKL 2791 #ifdef CONFIG_PREEMPT_BKL
2792 struct task_struct *task = current; 2792 struct task_struct *task = current;
2793 int saved_lock_depth; 2793 int saved_lock_depth;
2794 #endif 2794 #endif
2795 /* 2795 /*
2796 * If there is a non-zero preempt_count or interrupts are disabled, 2796 * If there is a non-zero preempt_count or interrupts are disabled,
2797 * we do not want to preempt the current task. Just return.. 2797 * we do not want to preempt the current task. Just return..
2798 */ 2798 */
2799 if (unlikely(ti->preempt_count || irqs_disabled())) 2799 if (unlikely(ti->preempt_count || irqs_disabled()))
2800 return; 2800 return;
2801 2801
2802 need_resched: 2802 need_resched:
2803 add_preempt_count(PREEMPT_ACTIVE); 2803 add_preempt_count(PREEMPT_ACTIVE);
2804 /* 2804 /*
2805 * We keep the big kernel semaphore locked, but we 2805 * We keep the big kernel semaphore locked, but we
2806 * clear ->lock_depth so that schedule() doesn't 2806 * clear ->lock_depth so that schedule() doesn't
2807 * auto-release the semaphore: 2807 * auto-release the semaphore:
2808 */ 2808 */
2809 #ifdef CONFIG_PREEMPT_BKL 2809 #ifdef CONFIG_PREEMPT_BKL
2810 saved_lock_depth = task->lock_depth; 2810 saved_lock_depth = task->lock_depth;
2811 task->lock_depth = -1; 2811 task->lock_depth = -1;
2812 #endif 2812 #endif
2813 schedule(); 2813 schedule();
2814 #ifdef CONFIG_PREEMPT_BKL 2814 #ifdef CONFIG_PREEMPT_BKL
2815 task->lock_depth = saved_lock_depth; 2815 task->lock_depth = saved_lock_depth;
2816 #endif 2816 #endif
2817 sub_preempt_count(PREEMPT_ACTIVE); 2817 sub_preempt_count(PREEMPT_ACTIVE);
2818 2818
2819 /* we could miss a preemption opportunity between schedule and now */ 2819 /* we could miss a preemption opportunity between schedule and now */
2820 barrier(); 2820 barrier();
2821 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 2821 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
2822 goto need_resched; 2822 goto need_resched;
2823 } 2823 }
2824 2824
2825 EXPORT_SYMBOL(preempt_schedule); 2825 EXPORT_SYMBOL(preempt_schedule);
2826 2826
2827 /* 2827 /*
2828 * this is the entry point to schedule() from kernel preemption 2828 * this is the entry point to schedule() from kernel preemption
2829 * off of irq context. 2829 * off of irq context.
2830 * Note that this is called and returns with irqs disabled. This will 2830 * Note that this is called and returns with irqs disabled. This will
2831 * protect us against recursive calling from irq. 2831 * protect us against recursive calling from irq.
2832 */ 2832 */
2833 asmlinkage void __sched preempt_schedule_irq(void) 2833 asmlinkage void __sched preempt_schedule_irq(void)
2834 { 2834 {
2835 struct thread_info *ti = current_thread_info(); 2835 struct thread_info *ti = current_thread_info();
2836 #ifdef CONFIG_PREEMPT_BKL 2836 #ifdef CONFIG_PREEMPT_BKL
2837 struct task_struct *task = current; 2837 struct task_struct *task = current;
2838 int saved_lock_depth; 2838 int saved_lock_depth;
2839 #endif 2839 #endif
2840 /* Catch callers which need to be fixed */ 2840 /* Catch callers which need to be fixed */
2841 BUG_ON(ti->preempt_count || !irqs_disabled()); 2841 BUG_ON(ti->preempt_count || !irqs_disabled());
2842 2842
2843 need_resched: 2843 need_resched:
2844 add_preempt_count(PREEMPT_ACTIVE); 2844 add_preempt_count(PREEMPT_ACTIVE);
2845 /* 2845 /*
2846 * We keep the big kernel semaphore locked, but we 2846 * We keep the big kernel semaphore locked, but we
2847 * clear ->lock_depth so that schedule() doesn't 2847 * clear ->lock_depth so that schedule() doesn't
2848 * auto-release the semaphore: 2848 * auto-release the semaphore:
2849 */ 2849 */
2850 #ifdef CONFIG_PREEMPT_BKL 2850 #ifdef CONFIG_PREEMPT_BKL
2851 saved_lock_depth = task->lock_depth; 2851 saved_lock_depth = task->lock_depth;
2852 task->lock_depth = -1; 2852 task->lock_depth = -1;
2853 #endif 2853 #endif
2854 local_irq_enable(); 2854 local_irq_enable();
2855 schedule(); 2855 schedule();
2856 local_irq_disable(); 2856 local_irq_disable();
2857 #ifdef CONFIG_PREEMPT_BKL 2857 #ifdef CONFIG_PREEMPT_BKL
2858 task->lock_depth = saved_lock_depth; 2858 task->lock_depth = saved_lock_depth;
2859 #endif 2859 #endif
2860 sub_preempt_count(PREEMPT_ACTIVE); 2860 sub_preempt_count(PREEMPT_ACTIVE);
2861 2861
2862 /* we could miss a preemption opportunity between schedule and now */ 2862 /* we could miss a preemption opportunity between schedule and now */
2863 barrier(); 2863 barrier();
2864 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 2864 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
2865 goto need_resched; 2865 goto need_resched;
2866 } 2866 }
2867 2867
2868 #endif /* CONFIG_PREEMPT */ 2868 #endif /* CONFIG_PREEMPT */
2869 2869
2870 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) 2870 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
2871 { 2871 {
2872 task_t *p = curr->task; 2872 task_t *p = curr->task;
2873 return try_to_wake_up(p, mode, sync); 2873 return try_to_wake_up(p, mode, sync);
2874 } 2874 }
2875 2875
2876 EXPORT_SYMBOL(default_wake_function); 2876 EXPORT_SYMBOL(default_wake_function);
2877 2877
2878 /* 2878 /*
2879 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just 2879 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
2880 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve 2880 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
2881 * number) then we wake all the non-exclusive tasks and one exclusive task. 2881 * number) then we wake all the non-exclusive tasks and one exclusive task.
2882 * 2882 *
2883 * There are circumstances in which we can try to wake a task which has already 2883 * There are circumstances in which we can try to wake a task which has already
2884 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 2884 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
2885 * zero in this (rare) case, and we handle it by continuing to scan the queue. 2885 * zero in this (rare) case, and we handle it by continuing to scan the queue.
2886 */ 2886 */
2887 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 2887 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
2888 int nr_exclusive, int sync, void *key) 2888 int nr_exclusive, int sync, void *key)
2889 { 2889 {
2890 struct list_head *tmp, *next; 2890 struct list_head *tmp, *next;
2891 2891
2892 list_for_each_safe(tmp, next, &q->task_list) { 2892 list_for_each_safe(tmp, next, &q->task_list) {
2893 wait_queue_t *curr; 2893 wait_queue_t *curr;
2894 unsigned flags; 2894 unsigned flags;
2895 curr = list_entry(tmp, wait_queue_t, task_list); 2895 curr = list_entry(tmp, wait_queue_t, task_list);
2896 flags = curr->flags; 2896 flags = curr->flags;
2897 if (curr->func(curr, mode, sync, key) && 2897 if (curr->func(curr, mode, sync, key) &&
2898 (flags & WQ_FLAG_EXCLUSIVE) && 2898 (flags & WQ_FLAG_EXCLUSIVE) &&
2899 !--nr_exclusive) 2899 !--nr_exclusive)
2900 break; 2900 break;
2901 } 2901 }
2902 } 2902 }
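
In __wake_up_common() above, nr_exclusive is decremented only when a waiter marked WQ_FLAG_EXCLUSIVE is successfully woken, so a small positive count wakes every non-exclusive waiter plus that many exclusive ones, while nr_exclusive == 0 never satisfies the !--nr_exclusive stop test and therefore wakes everybody. The stand-alone model below reproduces that counting rule; the waiter array is a hypothetical stand-in for the wait queue, with exclusive waiters at the tail as __add_wait_queue_tail() arranges in practice.

#include <stdio.h>
#include <stdbool.h>

#define WQ_FLAG_EXCLUSIVE 0x01

struct waiter {
    unsigned int flags;
    bool woken;
};

/* Same stop rule as __wake_up_common(): only woken exclusive waiters count. */
static void wake_up_common(struct waiter *q, int n, int nr_exclusive)
{
    for (int i = 0; i < n; i++) {
        q[i].woken = true;      /* model: every wakeup callback succeeds */
        if ((q[i].flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
            break;
    }
}

int main(void)
{
    /* Exclusive waiters queue at the tail, behind the non-exclusive ones. */
    struct waiter q[4] = {
        { 0, false },
        { 0, false },
        { WQ_FLAG_EXCLUSIVE, false },
        { WQ_FLAG_EXCLUSIVE, false },
    };

    wake_up_common(q, 4, 1);    /* wakes waiters 0, 1 and one exclusive (2) */
    for (int i = 0; i < 4; i++)
        printf("waiter %d woken=%d\n", i, q[i].woken);
    return 0;
}
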
2903 2903
2904 /** 2904 /**
2905 * __wake_up - wake up threads blocked on a waitqueue. 2905 * __wake_up - wake up threads blocked on a waitqueue.
2906 * @q: the waitqueue 2906 * @q: the waitqueue
2907 * @mode: which threads 2907 * @mode: which threads
2908 * @nr_exclusive: how many wake-one or wake-many threads to wake up 2908 * @nr_exclusive: how many wake-one or wake-many threads to wake up
2909 * @key: is directly passed to the wakeup function 2909 * @key: is directly passed to the wakeup function
2910 */ 2910 */
2911 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, 2911 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
2912 int nr_exclusive, void *key) 2912 int nr_exclusive, void *key)
2913 { 2913 {
2914 unsigned long flags; 2914 unsigned long flags;
2915 2915
2916 spin_lock_irqsave(&q->lock, flags); 2916 spin_lock_irqsave(&q->lock, flags);
2917 __wake_up_common(q, mode, nr_exclusive, 0, key); 2917 __wake_up_common(q, mode, nr_exclusive, 0, key);
2918 spin_unlock_irqrestore(&q->lock, flags); 2918 spin_unlock_irqrestore(&q->lock, flags);
2919 } 2919 }
2920 2920
2921 EXPORT_SYMBOL(__wake_up); 2921 EXPORT_SYMBOL(__wake_up);
2922 2922
2923 /* 2923 /*
2924 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 2924 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
2925 */ 2925 */
2926 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) 2926 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
2927 { 2927 {
2928 __wake_up_common(q, mode, 1, 0, NULL); 2928 __wake_up_common(q, mode, 1, 0, NULL);
2929 } 2929 }
2930 2930
2931 /** 2931 /**
2932 * __wake_up_sync - wake up threads blocked on a waitqueue. 2932 * __wake_up_sync - wake up threads blocked on a waitqueue.
2933 * @q: the waitqueue 2933 * @q: the waitqueue
2934 * @mode: which threads 2934 * @mode: which threads
2935 * @nr_exclusive: how many wake-one or wake-many threads to wake up 2935 * @nr_exclusive: how many wake-one or wake-many threads to wake up
2936 * 2936 *
2937 * The sync wakeup differs in that the waker knows that it will schedule 2937 * The sync wakeup differs in that the waker knows that it will schedule
2938 * away soon, so while the target thread will be woken up, it will not 2938 * away soon, so while the target thread will be woken up, it will not
2939 * be migrated to another CPU - i.e. the two threads are 'synchronized' 2939 * be migrated to another CPU - i.e. the two threads are 'synchronized'
2940 * with each other. This can prevent needless bouncing between CPUs. 2940 * with each other. This can prevent needless bouncing between CPUs.
2941 * 2941 *
2942 * On UP it can prevent extra preemption. 2942 * On UP it can prevent extra preemption.
2943 */ 2943 */
2944 void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 2944 void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
2945 { 2945 {
2946 unsigned long flags; 2946 unsigned long flags;
2947 int sync = 1; 2947 int sync = 1;
2948 2948
2949 if (unlikely(!q)) 2949 if (unlikely(!q))
2950 return; 2950 return;
2951 2951
2952 if (unlikely(!nr_exclusive)) 2952 if (unlikely(!nr_exclusive))
2953 sync = 0; 2953 sync = 0;
2954 2954
2955 spin_lock_irqsave(&q->lock, flags); 2955 spin_lock_irqsave(&q->lock, flags);
2956 __wake_up_common(q, mode, nr_exclusive, sync, NULL); 2956 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
2957 spin_unlock_irqrestore(&q->lock, flags); 2957 spin_unlock_irqrestore(&q->lock, flags);
2958 } 2958 }
2959 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 2959 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
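
For illustration (not part of this commit): the sync variant is normally reached through the wake_up_interruptible_sync() wrapper when the waker is itself about to block, so migrating the woken task to another CPU would be wasted work. A sketch with hypothetical names:

#include <linux/wait.h>

/* waker that will sleep shortly afterwards: use the sync variant so the
 * wakee is not pushed to another CPU just before we go idle here */
static void my_signal_then_sleep(wait_queue_head_t *wq, int *cond)
{
        *cond = 1;
        wake_up_interruptible_sync(wq);
        /* ... caller proceeds to wait/schedule() soon after ... */
}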
2960 2960
2961 void fastcall complete(struct completion *x) 2961 void fastcall complete(struct completion *x)
2962 { 2962 {
2963 unsigned long flags; 2963 unsigned long flags;
2964 2964
2965 spin_lock_irqsave(&x->wait.lock, flags); 2965 spin_lock_irqsave(&x->wait.lock, flags);
2966 x->done++; 2966 x->done++;
2967 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 2967 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
2968 1, 0, NULL); 2968 1, 0, NULL);
2969 spin_unlock_irqrestore(&x->wait.lock, flags); 2969 spin_unlock_irqrestore(&x->wait.lock, flags);
2970 } 2970 }
2971 EXPORT_SYMBOL(complete); 2971 EXPORT_SYMBOL(complete);
2972 2972
2973 void fastcall complete_all(struct completion *x) 2973 void fastcall complete_all(struct completion *x)
2974 { 2974 {
2975 unsigned long flags; 2975 unsigned long flags;
2976 2976
2977 spin_lock_irqsave(&x->wait.lock, flags); 2977 spin_lock_irqsave(&x->wait.lock, flags);
2978 x->done += UINT_MAX/2; 2978 x->done += UINT_MAX/2;
2979 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 2979 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
2980 0, 0, NULL); 2980 0, 0, NULL);
2981 spin_unlock_irqrestore(&x->wait.lock, flags); 2981 spin_unlock_irqrestore(&x->wait.lock, flags);
2982 } 2982 }
2983 EXPORT_SYMBOL(complete_all); 2983 EXPORT_SYMBOL(complete_all);
2984 2984
2985 void fastcall __sched wait_for_completion(struct completion *x) 2985 void fastcall __sched wait_for_completion(struct completion *x)
2986 { 2986 {
2987 might_sleep(); 2987 might_sleep();
2988 spin_lock_irq(&x->wait.lock); 2988 spin_lock_irq(&x->wait.lock);
2989 if (!x->done) { 2989 if (!x->done) {
2990 DECLARE_WAITQUEUE(wait, current); 2990 DECLARE_WAITQUEUE(wait, current);
2991 2991
2992 wait.flags |= WQ_FLAG_EXCLUSIVE; 2992 wait.flags |= WQ_FLAG_EXCLUSIVE;
2993 __add_wait_queue_tail(&x->wait, &wait); 2993 __add_wait_queue_tail(&x->wait, &wait);
2994 do { 2994 do {
2995 __set_current_state(TASK_UNINTERRUPTIBLE); 2995 __set_current_state(TASK_UNINTERRUPTIBLE);
2996 spin_unlock_irq(&x->wait.lock); 2996 spin_unlock_irq(&x->wait.lock);
2997 schedule(); 2997 schedule();
2998 spin_lock_irq(&x->wait.lock); 2998 spin_lock_irq(&x->wait.lock);
2999 } while (!x->done); 2999 } while (!x->done);
3000 __remove_wait_queue(&x->wait, &wait); 3000 __remove_wait_queue(&x->wait, &wait);
3001 } 3001 }
3002 x->done--; 3002 x->done--;
3003 spin_unlock_irq(&x->wait.lock); 3003 spin_unlock_irq(&x->wait.lock);
3004 } 3004 }
3005 EXPORT_SYMBOL(wait_for_completion); 3005 EXPORT_SYMBOL(wait_for_completion);
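
For illustration (not part of this commit): the classic pattern built on complete()/wait_for_completion() - one side announces that an event has happened, the other blocks uninterruptibly until it has. The setup_done name is hypothetical.

#include <linux/completion.h>

static DECLARE_COMPLETION(setup_done);

/* worker side: signal that initialisation has finished */
static void my_worker_init(void)
{
        /* ... perform the setup work ... */
        complete(&setup_done);          /* bumps ->done and wakes one waiter */
}

/* caller side: block until the worker has signalled */
static void my_wait_for_worker(void)
{
        wait_for_completion(&setup_done);
}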
3006 3006
3007 unsigned long fastcall __sched 3007 unsigned long fastcall __sched
3008 wait_for_completion_timeout(struct completion *x, unsigned long timeout) 3008 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3009 { 3009 {
3010 might_sleep(); 3010 might_sleep();
3011 3011
3012 spin_lock_irq(&x->wait.lock); 3012 spin_lock_irq(&x->wait.lock);
3013 if (!x->done) { 3013 if (!x->done) {
3014 DECLARE_WAITQUEUE(wait, current); 3014 DECLARE_WAITQUEUE(wait, current);
3015 3015
3016 wait.flags |= WQ_FLAG_EXCLUSIVE; 3016 wait.flags |= WQ_FLAG_EXCLUSIVE;
3017 __add_wait_queue_tail(&x->wait, &wait); 3017 __add_wait_queue_tail(&x->wait, &wait);
3018 do { 3018 do {
3019 __set_current_state(TASK_UNINTERRUPTIBLE); 3019 __set_current_state(TASK_UNINTERRUPTIBLE);
3020 spin_unlock_irq(&x->wait.lock); 3020 spin_unlock_irq(&x->wait.lock);
3021 timeout = schedule_timeout(timeout); 3021 timeout = schedule_timeout(timeout);
3022 spin_lock_irq(&x->wait.lock); 3022 spin_lock_irq(&x->wait.lock);
3023 if (!timeout) { 3023 if (!timeout) {
3024 __remove_wait_queue(&x->wait, &wait); 3024 __remove_wait_queue(&x->wait, &wait);
3025 goto out; 3025 goto out;
3026 } 3026 }
3027 } while (!x->done); 3027 } while (!x->done);
3028 __remove_wait_queue(&x->wait, &wait); 3028 __remove_wait_queue(&x->wait, &wait);
3029 } 3029 }
3030 x->done--; 3030 x->done--;
3031 out: 3031 out:
3032 spin_unlock_irq(&x->wait.lock); 3032 spin_unlock_irq(&x->wait.lock);
3033 return timeout; 3033 return timeout;
3034 } 3034 }
3035 EXPORT_SYMBOL(wait_for_completion_timeout); 3035 EXPORT_SYMBOL(wait_for_completion_timeout);
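
For illustration (not part of this commit): the timeout variant returns the remaining jiffies, or 0 if the completion never arrived, so callers typically translate a zero return into an error. A sketch with a hypothetical one-second bound:

#include <linux/completion.h>
#include <linux/errno.h>
#include <linux/param.h>        /* HZ */

/* wait at most one second for *done to be completed */
static int my_wait_one_second(struct completion *done)
{
        unsigned long left = wait_for_completion_timeout(done, HZ);

        return left ? 0 : -ETIMEDOUT;
}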
3036 3036
3037 int fastcall __sched wait_for_completion_interruptible(struct completion *x) 3037 int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3038 { 3038 {
3039 int ret = 0; 3039 int ret = 0;
3040 3040
3041 might_sleep(); 3041 might_sleep();
3042 3042
3043 spin_lock_irq(&x->wait.lock); 3043 spin_lock_irq(&x->wait.lock);
3044 if (!x->done) { 3044 if (!x->done) {
3045 DECLARE_WAITQUEUE(wait, current); 3045 DECLARE_WAITQUEUE(wait, current);
3046 3046
3047 wait.flags |= WQ_FLAG_EXCLUSIVE; 3047 wait.flags |= WQ_FLAG_EXCLUSIVE;
3048 __add_wait_queue_tail(&x->wait, &wait); 3048 __add_wait_queue_tail(&x->wait, &wait);
3049 do { 3049 do {
3050 if (signal_pending(current)) { 3050 if (signal_pending(current)) {
3051 ret = -ERESTARTSYS; 3051 ret = -ERESTARTSYS;
3052 __remove_wait_queue(&x->wait, &wait); 3052 __remove_wait_queue(&x->wait, &wait);
3053 goto out; 3053 goto out;
3054 } 3054 }
3055 __set_current_state(TASK_INTERRUPTIBLE); 3055 __set_current_state(TASK_INTERRUPTIBLE);
3056 spin_unlock_irq(&x->wait.lock); 3056 spin_unlock_irq(&x->wait.lock);
3057 schedule(); 3057 schedule();
3058 spin_lock_irq(&x->wait.lock); 3058 spin_lock_irq(&x->wait.lock);
3059 } while (!x->done); 3059 } while (!x->done);
3060 __remove_wait_queue(&x->wait, &wait); 3060 __remove_wait_queue(&x->wait, &wait);
3061 } 3061 }
3062 x->done--; 3062 x->done--;
3063 out: 3063 out:
3064 spin_unlock_irq(&x->wait.lock); 3064 spin_unlock_irq(&x->wait.lock);
3065 3065
3066 return ret; 3066 return ret;
3067 } 3067 }
3068 EXPORT_SYMBOL(wait_for_completion_interruptible); 3068 EXPORT_SYMBOL(wait_for_completion_interruptible);
3069 3069
3070 unsigned long fastcall __sched 3070 unsigned long fastcall __sched
3071 wait_for_completion_interruptible_timeout(struct completion *x, 3071 wait_for_completion_interruptible_timeout(struct completion *x,
3072 unsigned long timeout) 3072 unsigned long timeout)
3073 { 3073 {
3074 might_sleep(); 3074 might_sleep();
3075 3075
3076 spin_lock_irq(&x->wait.lock); 3076 spin_lock_irq(&x->wait.lock);
3077 if (!x->done) { 3077 if (!x->done) {
3078 DECLARE_WAITQUEUE(wait, current); 3078 DECLARE_WAITQUEUE(wait, current);
3079 3079
3080 wait.flags |= WQ_FLAG_EXCLUSIVE; 3080 wait.flags |= WQ_FLAG_EXCLUSIVE;
3081 __add_wait_queue_tail(&x->wait, &wait); 3081 __add_wait_queue_tail(&x->wait, &wait);
3082 do { 3082 do {
3083 if (signal_pending(current)) { 3083 if (signal_pending(current)) {
3084 timeout = -ERESTARTSYS; 3084 timeout = -ERESTARTSYS;
3085 __remove_wait_queue(&x->wait, &wait); 3085 __remove_wait_queue(&x->wait, &wait);
3086 goto out; 3086 goto out;
3087 } 3087 }
3088 __set_current_state(TASK_INTERRUPTIBLE); 3088 __set_current_state(TASK_INTERRUPTIBLE);
3089 spin_unlock_irq(&x->wait.lock); 3089 spin_unlock_irq(&x->wait.lock);
3090 timeout = schedule_timeout(timeout); 3090 timeout = schedule_timeout(timeout);
3091 spin_lock_irq(&x->wait.lock); 3091 spin_lock_irq(&x->wait.lock);
3092 if (!timeout) { 3092 if (!timeout) {
3093 __remove_wait_queue(&x->wait, &wait); 3093 __remove_wait_queue(&x->wait, &wait);
3094 goto out; 3094 goto out;
3095 } 3095 }
3096 } while (!x->done); 3096 } while (!x->done);
3097 __remove_wait_queue(&x->wait, &wait); 3097 __remove_wait_queue(&x->wait, &wait);
3098 } 3098 }
3099 x->done--; 3099 x->done--;
3100 out: 3100 out:
3101 spin_unlock_irq(&x->wait.lock); 3101 spin_unlock_irq(&x->wait.lock);
3102 return timeout; 3102 return timeout;
3103 } 3103 }
3104 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 3104 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3105 3105
3106 3106
3107 #define SLEEP_ON_VAR \ 3107 #define SLEEP_ON_VAR \
3108 unsigned long flags; \ 3108 unsigned long flags; \
3109 wait_queue_t wait; \ 3109 wait_queue_t wait; \
3110 init_waitqueue_entry(&wait, current); 3110 init_waitqueue_entry(&wait, current);
3111 3111
3112 #define SLEEP_ON_HEAD \ 3112 #define SLEEP_ON_HEAD \
3113 spin_lock_irqsave(&q->lock,flags); \ 3113 spin_lock_irqsave(&q->lock,flags); \
3114 __add_wait_queue(q, &wait); \ 3114 __add_wait_queue(q, &wait); \
3115 spin_unlock(&q->lock); 3115 spin_unlock(&q->lock);
3116 3116
3117 #define SLEEP_ON_TAIL \ 3117 #define SLEEP_ON_TAIL \
3118 spin_lock_irq(&q->lock); \ 3118 spin_lock_irq(&q->lock); \
3119 __remove_wait_queue(q, &wait); \ 3119 __remove_wait_queue(q, &wait); \
3120 spin_unlock_irqrestore(&q->lock, flags); 3120 spin_unlock_irqrestore(&q->lock, flags);
3121 3121
3122 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) 3122 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3123 { 3123 {
3124 SLEEP_ON_VAR 3124 SLEEP_ON_VAR
3125 3125
3126 current->state = TASK_INTERRUPTIBLE; 3126 current->state = TASK_INTERRUPTIBLE;
3127 3127
3128 SLEEP_ON_HEAD 3128 SLEEP_ON_HEAD
3129 schedule(); 3129 schedule();
3130 SLEEP_ON_TAIL 3130 SLEEP_ON_TAIL
3131 } 3131 }
3132 3132
3133 EXPORT_SYMBOL(interruptible_sleep_on); 3133 EXPORT_SYMBOL(interruptible_sleep_on);
3134 3134
3135 long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 3135 long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3136 { 3136 {
3137 SLEEP_ON_VAR 3137 SLEEP_ON_VAR
3138 3138
3139 current->state = TASK_INTERRUPTIBLE; 3139 current->state = TASK_INTERRUPTIBLE;
3140 3140
3141 SLEEP_ON_HEAD 3141 SLEEP_ON_HEAD
3142 timeout = schedule_timeout(timeout); 3142 timeout = schedule_timeout(timeout);
3143 SLEEP_ON_TAIL 3143 SLEEP_ON_TAIL
3144 3144
3145 return timeout; 3145 return timeout;
3146 } 3146 }
3147 3147
3148 EXPORT_SYMBOL(interruptible_sleep_on_timeout); 3148 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3149 3149
3150 void fastcall __sched sleep_on(wait_queue_head_t *q) 3150 void fastcall __sched sleep_on(wait_queue_head_t *q)
3151 { 3151 {
3152 SLEEP_ON_VAR 3152 SLEEP_ON_VAR
3153 3153
3154 current->state = TASK_UNINTERRUPTIBLE; 3154 current->state = TASK_UNINTERRUPTIBLE;
3155 3155
3156 SLEEP_ON_HEAD 3156 SLEEP_ON_HEAD
3157 schedule(); 3157 schedule();
3158 SLEEP_ON_TAIL 3158 SLEEP_ON_TAIL
3159 } 3159 }
3160 3160
3161 EXPORT_SYMBOL(sleep_on); 3161 EXPORT_SYMBOL(sleep_on);
3162 3162
3163 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 3163 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3164 { 3164 {
3165 SLEEP_ON_VAR 3165 SLEEP_ON_VAR
3166 3166
3167 current->state = TASK_UNINTERRUPTIBLE; 3167 current->state = TASK_UNINTERRUPTIBLE;
3168 3168
3169 SLEEP_ON_HEAD 3169 SLEEP_ON_HEAD
3170 timeout = schedule_timeout(timeout); 3170 timeout = schedule_timeout(timeout);
3171 SLEEP_ON_TAIL 3171 SLEEP_ON_TAIL
3172 3172
3173 return timeout; 3173 return timeout;
3174 } 3174 }
3175 3175
3176 EXPORT_SYMBOL(sleep_on_timeout); 3176 EXPORT_SYMBOL(sleep_on_timeout);
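
For illustration (not part of this commit): substituting the three SLEEP_ON_* macros above shows what interruptible_sleep_on() boils down to. No condition is checked anywhere, so a wakeup that fires before the task has queued itself is simply lost, which is why the sleep_on() family is considered racy and avoided in new code.

/* roughly what interruptible_sleep_on(q) expands to */
static void interruptible_sleep_on_expanded(wait_queue_head_t *q)
{
        unsigned long flags;                            /* SLEEP_ON_VAR */
        wait_queue_t wait;

        init_waitqueue_entry(&wait, current);

        current->state = TASK_INTERRUPTIBLE;

        spin_lock_irqsave(&q->lock, flags);             /* SLEEP_ON_HEAD */
        __add_wait_queue(q, &wait);
        spin_unlock(&q->lock);

        schedule();

        spin_lock_irq(&q->lock);                        /* SLEEP_ON_TAIL */
        __remove_wait_queue(q, &wait);
        spin_unlock_irqrestore(&q->lock, flags);
}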
3177 3177
3178 void set_user_nice(task_t *p, long nice) 3178 void set_user_nice(task_t *p, long nice)
3179 { 3179 {
3180 unsigned long flags; 3180 unsigned long flags;
3181 prio_array_t *array; 3181 prio_array_t *array;
3182 runqueue_t *rq; 3182 runqueue_t *rq;
3183 int old_prio, new_prio, delta; 3183 int old_prio, new_prio, delta;
3184 3184
3185 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 3185 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3186 return; 3186 return;
3187 /* 3187 /*
3188 * We have to be careful, if called from sys_setpriority(), 3188 * We have to be careful, if called from sys_setpriority(),
3189 * the task might be in the middle of scheduling on another CPU. 3189 * the task might be in the middle of scheduling on another CPU.
3190 */ 3190 */
3191 rq = task_rq_lock(p, &flags); 3191 rq = task_rq_lock(p, &flags);
3192 /* 3192 /*
3193 * The RT priorities are set via sched_setscheduler(), but we still 3193 * The RT priorities are set via sched_setscheduler(), but we still
3194 * allow the 'normal' nice value to be set - but as expected 3194 * allow the 'normal' nice value to be set - but as expected
3195 * it won't have any effect on scheduling as long as the task is 3195 * it won't have any effect on scheduling as long as the task is
3196 * not SCHED_NORMAL: 3196 * not SCHED_NORMAL:
3197 */ 3197 */
3198 if (rt_task(p)) { 3198 if (rt_task(p)) {
3199 p->static_prio = NICE_TO_PRIO(nice); 3199 p->static_prio = NICE_TO_PRIO(nice);
3200 goto out_unlock; 3200 goto out_unlock;
3201 } 3201 }
3202 array = p->array; 3202 array = p->array;
3203 if (array) 3203 if (array)
3204 dequeue_task(p, array); 3204 dequeue_task(p, array);
3205 3205
3206 old_prio = p->prio; 3206 old_prio = p->prio;
3207 new_prio = NICE_TO_PRIO(nice); 3207 new_prio = NICE_TO_PRIO(nice);
3208 delta = new_prio - old_prio; 3208 delta = new_prio - old_prio;
3209 p->static_prio = NICE_TO_PRIO(nice); 3209 p->static_prio = NICE_TO_PRIO(nice);
3210 p->prio += delta; 3210 p->prio += delta;
3211 3211
3212 if (array) { 3212 if (array) {
3213 enqueue_task(p, array); 3213 enqueue_task(p, array);
3214 /* 3214 /*
3215 * If the task increased its priority or is running and 3215 * If the task increased its priority or is running and
3216 * lowered its priority, then reschedule its CPU: 3216 * lowered its priority, then reschedule its CPU:
3217 */ 3217 */
3218 if (delta < 0 || (delta > 0 && task_running(rq, p))) 3218 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3219 resched_task(rq->curr); 3219 resched_task(rq->curr);
3220 } 3220 }
3221 out_unlock: 3221 out_unlock:
3222 task_rq_unlock(rq, &flags); 3222 task_rq_unlock(rq, &flags);
3223 } 3223 }
3224 3224
3225 EXPORT_SYMBOL(set_user_nice); 3225 EXPORT_SYMBOL(set_user_nice);
3226 3226
3227 /* 3227 /*
3228 * can_nice - check if a task can reduce its nice value 3228 * can_nice - check if a task can reduce its nice value
3229 * @p: task 3229 * @p: task
3230 * @nice: nice value 3230 * @nice: nice value
3231 */ 3231 */
3232 int can_nice(const task_t *p, const int nice) 3232 int can_nice(const task_t *p, const int nice)
3233 { 3233 {
3234 /* convert nice value [19,-20] to rlimit style value [0,39] */ 3234 /* convert nice value [19,-20] to rlimit style value [0,39] */
3235 int nice_rlim = 19 - nice; 3235 int nice_rlim = 19 - nice;
3236 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || 3236 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3237 capable(CAP_SYS_NICE)); 3237 capable(CAP_SYS_NICE));
3238 } 3238 }
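
A quick worked example of the conversion above: nice values run from 19 (weakest) down to -20 (strongest), while the rlimit-style scale runs the other way from 0 to 39. Asking for nice -10 therefore gives nice_rlim = 19 - (-10) = 29, so the request is allowed only if RLIMIT_NICE's current limit is at least 29 or the caller holds CAP_SYS_NICE.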
3239 3239
3240 #ifdef __ARCH_WANT_SYS_NICE 3240 #ifdef __ARCH_WANT_SYS_NICE
3241 3241
3242 /* 3242 /*
3243 * sys_nice - change the priority of the current process. 3243 * sys_nice - change the priority of the current process.
3244 * @increment: priority increment 3244 * @increment: priority increment
3245 * 3245 *
3246 * sys_setpriority is a more generic, but much slower function that 3246 * sys_setpriority is a more generic, but much slower function that
3247 * does similar things. 3247 * does similar things.
3248 */ 3248 */
3249 asmlinkage long sys_nice(int increment) 3249 asmlinkage long sys_nice(int increment)
3250 { 3250 {
3251 int retval; 3251 int retval;
3252 long nice; 3252 long nice;
3253 3253
3254 /* 3254 /*
3255 * Setpriority might change our priority at the same moment. 3255 * Setpriority might change our priority at the same moment.
3256 * We don't have to worry. Conceptually one call occurs first 3256 * We don't have to worry. Conceptually one call occurs first
3257 * and we have a single winner. 3257 * and we have a single winner.
3258 */ 3258 */
3259 if (increment < -40) 3259 if (increment < -40)
3260 increment = -40; 3260 increment = -40;
3261 if (increment > 40) 3261 if (increment > 40)
3262 increment = 40; 3262 increment = 40;
3263 3263
3264 nice = PRIO_TO_NICE(current->static_prio) + increment; 3264 nice = PRIO_TO_NICE(current->static_prio) + increment;
3265 if (nice < -20) 3265 if (nice < -20)
3266 nice = -20; 3266 nice = -20;
3267 if (nice > 19) 3267 if (nice > 19)
3268 nice = 19; 3268 nice = 19;
3269 3269
3270 if (increment < 0 && !can_nice(current, nice)) 3270 if (increment < 0 && !can_nice(current, nice))
3271 return -EPERM; 3271 return -EPERM;
3272 3272
3273 retval = security_task_setnice(current, nice); 3273 retval = security_task_setnice(current, nice);
3274 if (retval) 3274 if (retval)
3275 return retval; 3275 return retval;
3276 3276
3277 set_user_nice(current, nice); 3277 set_user_nice(current, nice);
3278 return 0; 3278 return 0;
3279 } 3279 }
3280 3280
3281 #endif 3281 #endif
3282 3282
3283 /** 3283 /**
3284 * task_prio - return the priority value of a given task. 3284 * task_prio - return the priority value of a given task.
3285 * @p: the task in question. 3285 * @p: the task in question.
3286 * 3286 *
3287 * This is the priority value as seen by users in /proc. 3287 * This is the priority value as seen by users in /proc.
3288 * RT tasks are offset by -200. Normal tasks are centered 3288 * RT tasks are offset by -200. Normal tasks are centered
3289 * around 0, value goes from -16 to +15. 3289 * around 0, value goes from -16 to +15.
3290 */ 3290 */
3291 int task_prio(const task_t *p) 3291 int task_prio(const task_t *p)
3292 { 3292 {
3293 return p->prio - MAX_RT_PRIO; 3293 return p->prio - MAX_RT_PRIO;
3294 } 3294 }
3295 3295
3296 /** 3296 /**
3297 * task_nice - return the nice value of a given task. 3297 * task_nice - return the nice value of a given task.
3298 * @p: the task in question. 3298 * @p: the task in question.
3299 */ 3299 */
3300 int task_nice(const task_t *p) 3300 int task_nice(const task_t *p)
3301 { 3301 {
3302 return TASK_NICE(p); 3302 return TASK_NICE(p);
3303 } 3303 }
3304 3304
3305 /* 3305 /*
3306 * The only users of task_nice are binfmt_elf and binfmt_elf32. 3306 * The only users of task_nice are binfmt_elf and binfmt_elf32.
3307 * binfmt_elf is no longer modular, but binfmt_elf32 still is. 3307 * binfmt_elf is no longer modular, but binfmt_elf32 still is.
3308 * Therefore, task_nice is needed if there is a compat_mode. 3308 * Therefore, task_nice is needed if there is a compat_mode.
3309 */ 3309 */
3310 #ifdef CONFIG_COMPAT 3310 #ifdef CONFIG_COMPAT
3311 EXPORT_SYMBOL_GPL(task_nice); 3311 EXPORT_SYMBOL_GPL(task_nice);
3312 #endif 3312 #endif
3313 3313
3314 /** 3314 /**
3315 * idle_cpu - is a given cpu idle currently? 3315 * idle_cpu - is a given cpu idle currently?
3316 * @cpu: the processor in question. 3316 * @cpu: the processor in question.
3317 */ 3317 */
3318 int idle_cpu(int cpu) 3318 int idle_cpu(int cpu)
3319 { 3319 {
3320 return cpu_curr(cpu) == cpu_rq(cpu)->idle; 3320 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
3321 } 3321 }
3322 3322
3323 EXPORT_SYMBOL_GPL(idle_cpu); 3323 EXPORT_SYMBOL_GPL(idle_cpu);
3324 3324
3325 /** 3325 /**
3326 * idle_task - return the idle task for a given cpu. 3326 * idle_task - return the idle task for a given cpu.
3327 * @cpu: the processor in question. 3327 * @cpu: the processor in question.
3328 */ 3328 */
3329 task_t *idle_task(int cpu) 3329 task_t *idle_task(int cpu)
3330 { 3330 {
3331 return cpu_rq(cpu)->idle; 3331 return cpu_rq(cpu)->idle;
3332 } 3332 }
3333 3333
3334 /** 3334 /**
3335 * find_process_by_pid - find a process with a matching PID value. 3335 * find_process_by_pid - find a process with a matching PID value.
3336 * @pid: the pid in question. 3336 * @pid: the pid in question.
3337 */ 3337 */
3338 static inline task_t *find_process_by_pid(pid_t pid) 3338 static inline task_t *find_process_by_pid(pid_t pid)
3339 { 3339 {
3340 return pid ? find_task_by_pid(pid) : current; 3340 return pid ? find_task_by_pid(pid) : current;
3341 } 3341 }
3342 3342
3343 /* Actually do priority change: must hold rq lock. */ 3343 /* Actually do priority change: must hold rq lock. */
3344 static void __setscheduler(struct task_struct *p, int policy, int prio) 3344 static void __setscheduler(struct task_struct *p, int policy, int prio)
3345 { 3345 {
3346 BUG_ON(p->array); 3346 BUG_ON(p->array);
3347 p->policy = policy; 3347 p->policy = policy;
3348 p->rt_priority = prio; 3348 p->rt_priority = prio;
3349 if (policy != SCHED_NORMAL) 3349 if (policy != SCHED_NORMAL)
3350 p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority; 3350 p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
3351 else 3351 else
3352 p->prio = p->static_prio; 3352 p->prio = p->static_prio;
3353 } 3353 }
3354 3354
3355 /** 3355 /**
3356 * sched_setscheduler - change the scheduling policy and/or RT priority of 3356 * sched_setscheduler - change the scheduling policy and/or RT priority of
3357 * a thread. 3357 * a thread.
3358 * @p: the task in question. 3358 * @p: the task in question.
3359 * @policy: new policy. 3359 * @policy: new policy.
3360 * @param: structure containing the new RT priority. 3360 * @param: structure containing the new RT priority.
3361 */ 3361 */
3362 int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) 3362 int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param)
3363 { 3363 {
3364 int retval; 3364 int retval;
3365 int oldprio, oldpolicy = -1; 3365 int oldprio, oldpolicy = -1;
3366 prio_array_t *array; 3366 prio_array_t *array;
3367 unsigned long flags; 3367 unsigned long flags;
3368 runqueue_t *rq; 3368 runqueue_t *rq;
3369 3369
3370 recheck: 3370 recheck:
3371 /* double check policy once rq lock held */ 3371 /* double check policy once rq lock held */
3372 if (policy < 0) 3372 if (policy < 0)
3373 policy = oldpolicy = p->policy; 3373 policy = oldpolicy = p->policy;
3374 else if (policy != SCHED_FIFO && policy != SCHED_RR && 3374 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
3375 policy != SCHED_NORMAL) 3375 policy != SCHED_NORMAL)
3376 return -EINVAL; 3376 return -EINVAL;
3377 /* 3377 /*
3378 * Valid priorities for SCHED_FIFO and SCHED_RR are 3378 * Valid priorities for SCHED_FIFO and SCHED_RR are
3379 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. 3379 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
3380 */ 3380 */
3381 if (param->sched_priority < 0 || 3381 if (param->sched_priority < 0 ||
3382 param->sched_priority > MAX_USER_RT_PRIO-1) 3382 param->sched_priority > MAX_USER_RT_PRIO-1)
3383 return -EINVAL; 3383 return -EINVAL;
3384 if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) 3384 if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
3385 return -EINVAL; 3385 return -EINVAL;
3386 3386
3387 if ((policy == SCHED_FIFO || policy == SCHED_RR) && 3387 if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
3388 param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur && 3388 param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur &&
3389 !capable(CAP_SYS_NICE)) 3389 !capable(CAP_SYS_NICE))
3390 return -EPERM; 3390 return -EPERM;
3391 if ((current->euid != p->euid) && (current->euid != p->uid) && 3391 if ((current->euid != p->euid) && (current->euid != p->uid) &&
3392 !capable(CAP_SYS_NICE)) 3392 !capable(CAP_SYS_NICE))
3393 return -EPERM; 3393 return -EPERM;
3394 3394
3395 retval = security_task_setscheduler(p, policy, param); 3395 retval = security_task_setscheduler(p, policy, param);
3396 if (retval) 3396 if (retval)
3397 return retval; 3397 return retval;
3398 /* 3398 /*
3399 * To be able to change p->policy safely, the appropriate 3399 * To be able to change p->policy safely, the appropriate
3400 * runqueue lock must be held. 3400 * runqueue lock must be held.
3401 */ 3401 */
3402 rq = task_rq_lock(p, &flags); 3402 rq = task_rq_lock(p, &flags);
3403 /* recheck policy now with rq lock held */ 3403 /* recheck policy now with rq lock held */
3404 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 3404 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
3405 policy = oldpolicy = -1; 3405 policy = oldpolicy = -1;
3406 task_rq_unlock(rq, &flags); 3406 task_rq_unlock(rq, &flags);
3407 goto recheck; 3407 goto recheck;
3408 } 3408 }
3409 array = p->array; 3409 array = p->array;
3410 if (array) 3410 if (array)
3411 deactivate_task(p, rq); 3411 deactivate_task(p, rq);
3412 oldprio = p->prio; 3412 oldprio = p->prio;
3413 __setscheduler(p, policy, param->sched_priority); 3413 __setscheduler(p, policy, param->sched_priority);
3414 if (array) { 3414 if (array) {
3415 __activate_task(p, rq); 3415 __activate_task(p, rq);
3416 /* 3416 /*
3417 * Reschedule if we are currently running on this runqueue and 3417 * Reschedule if we are currently running on this runqueue and
3418 * our priority decreased, or if we are not currently running on 3418 * our priority decreased, or if we are not currently running on
3419 * this runqueue and our priority is higher than the current's 3419 * this runqueue and our priority is higher than the current's
3420 */ 3420 */
3421 if (task_running(rq, p)) { 3421 if (task_running(rq, p)) {
3422 if (p->prio > oldprio) 3422 if (p->prio > oldprio)
3423 resched_task(rq->curr); 3423 resched_task(rq->curr);
3424 } else if (TASK_PREEMPTS_CURR(p, rq)) 3424 } else if (TASK_PREEMPTS_CURR(p, rq))
3425 resched_task(rq->curr); 3425 resched_task(rq->curr);
3426 } 3426 }
3427 task_rq_unlock(rq, &flags); 3427 task_rq_unlock(rq, &flags);
3428 return 0; 3428 return 0;
3429 } 3429 }
3430 EXPORT_SYMBOL_GPL(sched_setscheduler); 3430 EXPORT_SYMBOL_GPL(sched_setscheduler);
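
For illustration (not part of this commit): an in-kernel caller of the exported sched_setscheduler() fills a struct sched_param and picks a priority in the 1..MAX_USER_RT_PRIO-1 range documented above. The helper name and the choice of priority 50 are arbitrary.

#include <linux/sched.h>

/* make the given task an RT round-robin task at priority 50 */
static int my_make_rt(struct task_struct *task)
{
        struct sched_param param = { .sched_priority = 50 };

        return sched_setscheduler(task, SCHED_RR, &param);
}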
3431 3431
3432 static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 3432 static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3433 { 3433 {
3434 int retval; 3434 int retval;
3435 struct sched_param lparam; 3435 struct sched_param lparam;
3436 struct task_struct *p; 3436 struct task_struct *p;
3437 3437
3438 if (!param || pid < 0) 3438 if (!param || pid < 0)
3439 return -EINVAL; 3439 return -EINVAL;
3440 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 3440 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
3441 return -EFAULT; 3441 return -EFAULT;
3442 read_lock_irq(&tasklist_lock); 3442 read_lock_irq(&tasklist_lock);
3443 p = find_process_by_pid(pid); 3443 p = find_process_by_pid(pid);
3444 if (!p) { 3444 if (!p) {
3445 read_unlock_irq(&tasklist_lock); 3445 read_unlock_irq(&tasklist_lock);
3446 return -ESRCH; 3446 return -ESRCH;
3447 } 3447 }
3448 retval = sched_setscheduler(p, policy, &lparam); 3448 retval = sched_setscheduler(p, policy, &lparam);
3449 read_unlock_irq(&tasklist_lock); 3449 read_unlock_irq(&tasklist_lock);
3450 return retval; 3450 return retval;
3451 } 3451 }
3452 3452
3453 /** 3453 /**
3454 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 3454 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
3455 * @pid: the pid in question. 3455 * @pid: the pid in question.
3456 * @policy: new policy. 3456 * @policy: new policy.
3457 * @param: structure containing the new RT priority. 3457 * @param: structure containing the new RT priority.
3458 */ 3458 */
3459 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, 3459 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
3460 struct sched_param __user *param) 3460 struct sched_param __user *param)
3461 { 3461 {
3462 return do_sched_setscheduler(pid, policy, param); 3462 return do_sched_setscheduler(pid, policy, param);
3463 } 3463 }
3464 3464
3465 /** 3465 /**
3466 * sys_sched_setparam - set/change the RT priority of a thread 3466 * sys_sched_setparam - set/change the RT priority of a thread
3467 * @pid: the pid in question. 3467 * @pid: the pid in question.
3468 * @param: structure containing the new RT priority. 3468 * @param: structure containing the new RT priority.
3469 */ 3469 */
3470 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) 3470 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
3471 { 3471 {
3472 return do_sched_setscheduler(pid, -1, param); 3472 return do_sched_setscheduler(pid, -1, param);
3473 } 3473 }
3474 3474
3475 /** 3475 /**
3476 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 3476 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
3477 * @pid: the pid in question. 3477 * @pid: the pid in question.
3478 */ 3478 */
3479 asmlinkage long sys_sched_getscheduler(pid_t pid) 3479 asmlinkage long sys_sched_getscheduler(pid_t pid)
3480 { 3480 {
3481 int retval = -EINVAL; 3481 int retval = -EINVAL;
3482 task_t *p; 3482 task_t *p;
3483 3483
3484 if (pid < 0) 3484 if (pid < 0)
3485 goto out_nounlock; 3485 goto out_nounlock;
3486 3486
3487 retval = -ESRCH; 3487 retval = -ESRCH;
3488 read_lock(&tasklist_lock); 3488 read_lock(&tasklist_lock);
3489 p = find_process_by_pid(pid); 3489 p = find_process_by_pid(pid);
3490 if (p) { 3490 if (p) {
3491 retval = security_task_getscheduler(p); 3491 retval = security_task_getscheduler(p);
3492 if (!retval) 3492 if (!retval)
3493 retval = p->policy; 3493 retval = p->policy;
3494 } 3494 }
3495 read_unlock(&tasklist_lock); 3495 read_unlock(&tasklist_lock);
3496 3496
3497 out_nounlock: 3497 out_nounlock:
3498 return retval; 3498 return retval;
3499 } 3499 }
3500 3500
3501 /** 3501 /**
3502 * sys_sched_getparam - get the RT priority of a thread 3502 * sys_sched_getparam - get the RT priority of a thread
3503 * @pid: the pid in question. 3503 * @pid: the pid in question.
3504 * @param: structure containing the RT priority. 3504 * @param: structure containing the RT priority.
3505 */ 3505 */
3506 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) 3506 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
3507 { 3507 {
3508 struct sched_param lp; 3508 struct sched_param lp;
3509 int retval = -EINVAL; 3509 int retval = -EINVAL;
3510 task_t *p; 3510 task_t *p;
3511 3511
3512 if (!param || pid < 0) 3512 if (!param || pid < 0)
3513 goto out_nounlock; 3513 goto out_nounlock;
3514 3514
3515 read_lock(&tasklist_lock); 3515 read_lock(&tasklist_lock);
3516 p = find_process_by_pid(pid); 3516 p = find_process_by_pid(pid);
3517 retval = -ESRCH; 3517 retval = -ESRCH;
3518 if (!p) 3518 if (!p)
3519 goto out_unlock; 3519 goto out_unlock;
3520 3520
3521 retval = security_task_getscheduler(p); 3521 retval = security_task_getscheduler(p);
3522 if (retval) 3522 if (retval)
3523 goto out_unlock; 3523 goto out_unlock;
3524 3524
3525 lp.sched_priority = p->rt_priority; 3525 lp.sched_priority = p->rt_priority;
3526 read_unlock(&tasklist_lock); 3526 read_unlock(&tasklist_lock);
3527 3527
3528 /* 3528 /*
3529 * This one might sleep, so we cannot do it with a spinlock held ... 3529 * This one might sleep, so we cannot do it with a spinlock held ...
3530 */ 3530 */
3531 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 3531 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
3532 3532
3533 out_nounlock: 3533 out_nounlock:
3534 return retval; 3534 return retval;
3535 3535
3536 out_unlock: 3536 out_unlock:
3537 read_unlock(&tasklist_lock); 3537 read_unlock(&tasklist_lock);
3538 return retval; 3538 return retval;
3539 } 3539 }
3540 3540
3541 long sched_setaffinity(pid_t pid, cpumask_t new_mask) 3541 long sched_setaffinity(pid_t pid, cpumask_t new_mask)
3542 { 3542 {
3543 task_t *p; 3543 task_t *p;
3544 int retval; 3544 int retval;
3545 cpumask_t cpus_allowed; 3545 cpumask_t cpus_allowed;
3546 3546
3547 lock_cpu_hotplug(); 3547 lock_cpu_hotplug();
3548 read_lock(&tasklist_lock); 3548 read_lock(&tasklist_lock);
3549 3549
3550 p = find_process_by_pid(pid); 3550 p = find_process_by_pid(pid);
3551 if (!p) { 3551 if (!p) {
3552 read_unlock(&tasklist_lock); 3552 read_unlock(&tasklist_lock);
3553 unlock_cpu_hotplug(); 3553 unlock_cpu_hotplug();
3554 return -ESRCH; 3554 return -ESRCH;
3555 } 3555 }
3556 3556
3557 /* 3557 /*
3558 * It is not safe to call set_cpus_allowed with the 3558 * It is not safe to call set_cpus_allowed with the
3559 * tasklist_lock held. We will bump the task_struct's 3559 * tasklist_lock held. We will bump the task_struct's
3560 * usage count and then drop tasklist_lock. 3560 * usage count and then drop tasklist_lock.
3561 */ 3561 */
3562 get_task_struct(p); 3562 get_task_struct(p);
3563 read_unlock(&tasklist_lock); 3563 read_unlock(&tasklist_lock);
3564 3564
3565 retval = -EPERM; 3565 retval = -EPERM;
3566 if ((current->euid != p->euid) && (current->euid != p->uid) && 3566 if ((current->euid != p->euid) && (current->euid != p->uid) &&
3567 !capable(CAP_SYS_NICE)) 3567 !capable(CAP_SYS_NICE))
3568 goto out_unlock; 3568 goto out_unlock;
3569 3569
3570 cpus_allowed = cpuset_cpus_allowed(p); 3570 cpus_allowed = cpuset_cpus_allowed(p);
3571 cpus_and(new_mask, new_mask, cpus_allowed); 3571 cpus_and(new_mask, new_mask, cpus_allowed);
3572 retval = set_cpus_allowed(p, new_mask); 3572 retval = set_cpus_allowed(p, new_mask);
3573 3573
3574 out_unlock: 3574 out_unlock:
3575 put_task_struct(p); 3575 put_task_struct(p);
3576 unlock_cpu_hotplug(); 3576 unlock_cpu_hotplug();
3577 return retval; 3577 return retval;
3578 } 3578 }
3579 3579
3580 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 3580 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
3581 cpumask_t *new_mask) 3581 cpumask_t *new_mask)
3582 { 3582 {
3583 if (len < sizeof(cpumask_t)) { 3583 if (len < sizeof(cpumask_t)) {
3584 memset(new_mask, 0, sizeof(cpumask_t)); 3584 memset(new_mask, 0, sizeof(cpumask_t));
3585 } else if (len > sizeof(cpumask_t)) { 3585 } else if (len > sizeof(cpumask_t)) {
3586 len = sizeof(cpumask_t); 3586 len = sizeof(cpumask_t);
3587 } 3587 }
3588 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 3588 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
3589 } 3589 }
3590 3590
3591 /** 3591 /**
3592 * sys_sched_setaffinity - set the cpu affinity of a process 3592 * sys_sched_setaffinity - set the cpu affinity of a process
3593 * @pid: pid of the process 3593 * @pid: pid of the process
3594 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 3594 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3595 * @user_mask_ptr: user-space pointer to the new cpu mask 3595 * @user_mask_ptr: user-space pointer to the new cpu mask
3596 */ 3596 */
3597 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, 3597 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
3598 unsigned long __user *user_mask_ptr) 3598 unsigned long __user *user_mask_ptr)
3599 { 3599 {
3600 cpumask_t new_mask; 3600 cpumask_t new_mask;
3601 int retval; 3601 int retval;
3602 3602
3603 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); 3603 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
3604 if (retval) 3604 if (retval)
3605 return retval; 3605 return retval;
3606 3606
3607 return sched_setaffinity(pid, new_mask); 3607 return sched_setaffinity(pid, new_mask);
3608 } 3608 }
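
For illustration (not part of this commit): from userspace this syscall is normally reached through the glibc wrapper of the same name. A sketch that pins a process to CPU 0; the helper name is hypothetical, and pid 0 means the calling process.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/types.h>

/* pin the given process to CPU 0 */
int pin_to_cpu0(pid_t pid)
{
        cpu_set_t mask;

        CPU_ZERO(&mask);
        CPU_SET(0, &mask);

        if (sched_setaffinity(pid, sizeof(mask), &mask) < 0) {
                perror("sched_setaffinity");
                return -1;
        }
        return 0;
}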
3609 3609
3610 /* 3610 /*
3611 * Represents all CPUs present in the system 3611 * Represents all CPUs present in the system
3612 * In systems capable of hotplug, this map could dynamically grow 3612 * In systems capable of hotplug, this map could dynamically grow
3613 * as new CPUs are detected in the system via any platform-specific 3613 * as new CPUs are detected in the system via any platform-specific
3614 * method, such as ACPI, for example. 3614 * method, such as ACPI, for example.
3615 */ 3615 */
3616 3616
3617 cpumask_t cpu_present_map; 3617 cpumask_t cpu_present_map;
3618 EXPORT_SYMBOL(cpu_present_map); 3618 EXPORT_SYMBOL(cpu_present_map);
3619 3619
3620 #ifndef CONFIG_SMP 3620 #ifndef CONFIG_SMP
3621 cpumask_t cpu_online_map = CPU_MASK_ALL; 3621 cpumask_t cpu_online_map = CPU_MASK_ALL;
3622 cpumask_t cpu_possible_map = CPU_MASK_ALL; 3622 cpumask_t cpu_possible_map = CPU_MASK_ALL;
3623 #endif 3623 #endif
3624 3624
3625 long sched_getaffinity(pid_t pid, cpumask_t *mask) 3625 long sched_getaffinity(pid_t pid, cpumask_t *mask)
3626 { 3626 {
3627 int retval; 3627 int retval;
3628 task_t *p; 3628 task_t *p;
3629 3629
3630 lock_cpu_hotplug(); 3630 lock_cpu_hotplug();
3631 read_lock(&tasklist_lock); 3631 read_lock(&tasklist_lock);
3632 3632
3633 retval = -ESRCH; 3633 retval = -ESRCH;
3634 p = find_process_by_pid(pid); 3634 p = find_process_by_pid(pid);
3635 if (!p) 3635 if (!p)
3636 goto out_unlock; 3636 goto out_unlock;
3637 3637
3638 retval = 0; 3638 retval = 0;
3639 cpus_and(*mask, p->cpus_allowed, cpu_possible_map); 3639 cpus_and(*mask, p->cpus_allowed, cpu_possible_map);
3640 3640
3641 out_unlock: 3641 out_unlock:
3642 read_unlock(&tasklist_lock); 3642 read_unlock(&tasklist_lock);
3643 unlock_cpu_hotplug(); 3643 unlock_cpu_hotplug();
3644 if (retval) 3644 if (retval)
3645 return retval; 3645 return retval;
3646 3646
3647 return 0; 3647 return 0;
3648 } 3648 }
3649 3649
3650 /** 3650 /**
3651 * sys_sched_getaffinity - get the cpu affinity of a process 3651 * sys_sched_getaffinity - get the cpu affinity of a process
3652 * @pid: pid of the process 3652 * @pid: pid of the process
3653 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 3653 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3654 * @user_mask_ptr: user-space pointer to hold the current cpu mask 3654 * @user_mask_ptr: user-space pointer to hold the current cpu mask
3655 */ 3655 */
3656 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, 3656 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
3657 unsigned long __user *user_mask_ptr) 3657 unsigned long __user *user_mask_ptr)
3658 { 3658 {
3659 int ret; 3659 int ret;
3660 cpumask_t mask; 3660 cpumask_t mask;
3661 3661
3662 if (len < sizeof(cpumask_t)) 3662 if (len < sizeof(cpumask_t))
3663 return -EINVAL; 3663 return -EINVAL;
3664 3664
3665 ret = sched_getaffinity(pid, &mask); 3665 ret = sched_getaffinity(pid, &mask);
3666 if (ret < 0) 3666 if (ret < 0)
3667 return ret; 3667 return ret;
3668 3668
3669 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) 3669 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
3670 return -EFAULT; 3670 return -EFAULT;
3671 3671
3672 return sizeof(cpumask_t); 3672 return sizeof(cpumask_t);
3673 } 3673 }
3674 3674
3675 /** 3675 /**
3676 * sys_sched_yield - yield the current processor to other threads. 3676 * sys_sched_yield - yield the current processor to other threads.
3677 * 3677 *
3678 * this function yields the current CPU by moving the calling thread 3678 * this function yields the current CPU by moving the calling thread
3679 * to the expired array. If there are no other threads running on this 3679 * to the expired array. If there are no other threads running on this
3680 * CPU then this function will return. 3680 * CPU then this function will return.
3681 */ 3681 */
3682 asmlinkage long sys_sched_yield(void) 3682 asmlinkage long sys_sched_yield(void)
3683 { 3683 {
3684 runqueue_t *rq = this_rq_lock(); 3684 runqueue_t *rq = this_rq_lock();
3685 prio_array_t *array = current->array; 3685 prio_array_t *array = current->array;
3686 prio_array_t *target = rq->expired; 3686 prio_array_t *target = rq->expired;
3687 3687
3688 schedstat_inc(rq, yld_cnt); 3688 schedstat_inc(rq, yld_cnt);
3689 /* 3689 /*
3690 * We implement yielding by moving the task into the expired 3690 * We implement yielding by moving the task into the expired
3691 * queue. 3691 * queue.
3692 * 3692 *
3693 * (special rule: RT tasks will just roundrobin in the active 3693 * (special rule: RT tasks will just roundrobin in the active
3694 * array.) 3694 * array.)
3695 */ 3695 */
3696 if (rt_task(current)) 3696 if (rt_task(current))
3697 target = rq->active; 3697 target = rq->active;
3698 3698
3699 if (current->array->nr_active == 1) { 3699 if (current->array->nr_active == 1) {
3700 schedstat_inc(rq, yld_act_empty); 3700 schedstat_inc(rq, yld_act_empty);
3701 if (!rq->expired->nr_active) 3701 if (!rq->expired->nr_active)
3702 schedstat_inc(rq, yld_both_empty); 3702 schedstat_inc(rq, yld_both_empty);
3703 } else if (!rq->expired->nr_active) 3703 } else if (!rq->expired->nr_active)
3704 schedstat_inc(rq, yld_exp_empty); 3704 schedstat_inc(rq, yld_exp_empty);
3705 3705
3706 if (array != target) { 3706 if (array != target) {
3707 dequeue_task(current, array); 3707 dequeue_task(current, array);
3708 enqueue_task(current, target); 3708 enqueue_task(current, target);
3709 } else 3709 } else
3710 /* 3710 /*
3711 * requeue_task is cheaper so perform that if possible. 3711 * requeue_task is cheaper so perform that if possible.
3712 */ 3712 */
3713 requeue_task(current, array); 3713 requeue_task(current, array);
3714 3714
3715 /* 3715 /*
3716 * Since we are going to call schedule() anyway, there's 3716 * Since we are going to call schedule() anyway, there's
3717 * no need to preempt or enable interrupts: 3717 * no need to preempt or enable interrupts:
3718 */ 3718 */
3719 __release(rq->lock); 3719 __release(rq->lock);
3720 _raw_spin_unlock(&rq->lock); 3720 _raw_spin_unlock(&rq->lock);
3721 preempt_enable_no_resched(); 3721 preempt_enable_no_resched();
3722 3722
3723 schedule(); 3723 schedule();
3724 3724
3725 return 0; 3725 return 0;
3726 } 3726 }
3727 3727
3728 static inline void __cond_resched(void) 3728 static inline void __cond_resched(void)
3729 { 3729 {
3730 do { 3730 do {
3731 add_preempt_count(PREEMPT_ACTIVE); 3731 add_preempt_count(PREEMPT_ACTIVE);
3732 schedule(); 3732 schedule();
3733 sub_preempt_count(PREEMPT_ACTIVE); 3733 sub_preempt_count(PREEMPT_ACTIVE);
3734 } while (need_resched()); 3734 } while (need_resched());
3735 } 3735 }
3736 3736
3737 int __sched cond_resched(void) 3737 int __sched cond_resched(void)
3738 { 3738 {
3739 if (need_resched()) { 3739 if (need_resched()) {
3740 __cond_resched(); 3740 __cond_resched();
3741 return 1; 3741 return 1;
3742 } 3742 }
3743 return 0; 3743 return 0;
3744 } 3744 }
3745 3745
3746 EXPORT_SYMBOL(cond_resched); 3746 EXPORT_SYMBOL(cond_resched);
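
For illustration (not part of this commit): cond_resched() is the usual way to break up a long-running loop in process context, especially on non-preemptible kernels. The my_item type and my_process_one() helper are hypothetical.

#include <linux/sched.h>

struct my_item;                                 /* hypothetical payload */
static void my_process_one(struct my_item *item);

/* process a large batch without monopolising the CPU */
static void my_process_many(struct my_item **items, int nr)
{
        int i;

        for (i = 0; i < nr; i++) {
                my_process_one(items[i]);
                cond_resched();         /* voluntary preemption point */
        }
}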
3747 3747
3748 /* 3748 /*
3749 * cond_resched_lock() - if a reschedule is pending, drop the given lock, 3749 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
3750 * call schedule, and on return reacquire the lock. 3750 * call schedule, and on return reacquire the lock.
3751 * 3751 *
3752 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 3752 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
3753 * operations here to prevent schedule() from being called twice (once via 3753 * operations here to prevent schedule() from being called twice (once via
3754 * spin_unlock(), once by hand). 3754 * spin_unlock(), once by hand).
3755 */ 3755 */
3756 int cond_resched_lock(spinlock_t * lock) 3756 int cond_resched_lock(spinlock_t * lock)
3757 { 3757 {
3758 int ret = 0; 3758 int ret = 0;
3759 3759
3760 if (need_lockbreak(lock)) { 3760 if (need_lockbreak(lock)) {
3761 spin_unlock(lock); 3761 spin_unlock(lock);
3762 cpu_relax(); 3762 cpu_relax();
3763 ret = 1; 3763 ret = 1;
3764 spin_lock(lock); 3764 spin_lock(lock);
3765 } 3765 }
3766 if (need_resched()) { 3766 if (need_resched()) {
3767 _raw_spin_unlock(lock); 3767 _raw_spin_unlock(lock);
3768 preempt_enable_no_resched(); 3768 preempt_enable_no_resched();
3769 __cond_resched(); 3769 __cond_resched();
3770 ret = 1; 3770 ret = 1;
3771 spin_lock(lock); 3771 spin_lock(lock);
3772 } 3772 }
3773 return ret; 3773 return ret;
3774 } 3774 }
3775 3775
3776 EXPORT_SYMBOL(cond_resched_lock); 3776 EXPORT_SYMBOL(cond_resched_lock);
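
For illustration (not part of this commit): cond_resched_lock() suits long operations performed under a spinlock, since it drops and re-takes the lock when a reschedule or lock contention is pending. The bitmap-clearing helper below is a hypothetical example.

#include <linux/spinlock.h>
#include <linux/bitops.h>
#include <linux/sched.h>

/* clear a large bitmap under its lock without hogging the CPU or the lock */
static void my_clear_all(spinlock_t *lock, unsigned long *map, unsigned long bits)
{
        unsigned long i;

        spin_lock(lock);
        for (i = 0; i < bits; i++) {
                clear_bit(i, map);
                cond_resched_lock(lock);        /* may drop and re-take *lock */
        }
        spin_unlock(lock);
}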
3777 3777
3778 int __sched cond_resched_softirq(void) 3778 int __sched cond_resched_softirq(void)
3779 { 3779 {
3780 BUG_ON(!in_softirq()); 3780 BUG_ON(!in_softirq());
3781 3781
3782 if (need_resched()) { 3782 if (need_resched()) {
3783 __local_bh_enable(); 3783 __local_bh_enable();
3784 __cond_resched(); 3784 __cond_resched();
3785 local_bh_disable(); 3785 local_bh_disable();
3786 return 1; 3786 return 1;
3787 } 3787 }
3788 return 0; 3788 return 0;
3789 } 3789 }
3790 3790
3791 EXPORT_SYMBOL(cond_resched_softirq); 3791 EXPORT_SYMBOL(cond_resched_softirq);
3792 3792
3793 3793
3794 /** 3794 /**
3795 * yield - yield the current processor to other threads. 3795 * yield - yield the current processor to other threads.
3796 * 3796 *
3797 * this is a shortcut for kernel-space yielding - it marks the 3797 * this is a shortcut for kernel-space yielding - it marks the
3798 * thread runnable and calls sys_sched_yield(). 3798 * thread runnable and calls sys_sched_yield().
3799 */ 3799 */
3800 void __sched yield(void) 3800 void __sched yield(void)
3801 { 3801 {
3802 set_current_state(TASK_RUNNING); 3802 set_current_state(TASK_RUNNING);
3803 sys_sched_yield(); 3803 sys_sched_yield();
3804 } 3804 }
3805 3805
3806 EXPORT_SYMBOL(yield); 3806 EXPORT_SYMBOL(yield);
3807 3807
3808 /* 3808 /*
3809 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 3809 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
3810 * that process accounting knows that this is a task in IO wait state. 3810 * that process accounting knows that this is a task in IO wait state.
3811 * 3811 *
3812 * But don't do that if it is a deliberate, throttling IO wait (this task 3812 * But don't do that if it is a deliberate, throttling IO wait (this task
3813 * has set its backing_dev_info: the queue against which it should throttle) 3813 * has set its backing_dev_info: the queue against which it should throttle)
3814 */ 3814 */
3815 void __sched io_schedule(void) 3815 void __sched io_schedule(void)
3816 { 3816 {
3817 struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); 3817 struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
3818 3818
3819 atomic_inc(&rq->nr_iowait); 3819 atomic_inc(&rq->nr_iowait);
3820 schedule(); 3820 schedule();
3821 atomic_dec(&rq->nr_iowait); 3821 atomic_dec(&rq->nr_iowait);
3822 } 3822 }
3823 3823
3824 EXPORT_SYMBOL(io_schedule); 3824 EXPORT_SYMBOL(io_schedule);
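
For illustration (not part of this commit): io_schedule() is used exactly like schedule() in a wait loop, the only difference being the nr_iowait accounting above. A sketch using the standard prepare_to_wait()/finish_wait() helpers, with hypothetical wq/done arguments:

#include <linux/sched.h>
#include <linux/wait.h>

/* sleep until an I/O completion handler sets *done; time spent here is
 * accounted as iowait rather than plain sleep */
static void my_wait_for_io(wait_queue_head_t *wq, int *done)
{
        DEFINE_WAIT(wait);

        while (!*done) {
                prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
                if (!*done)
                        io_schedule();
                finish_wait(wq, &wait);
        }
}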
3825 3825
3826 long __sched io_schedule_timeout(long timeout) 3826 long __sched io_schedule_timeout(long timeout)
3827 { 3827 {
3828 struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); 3828 struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
3829 long ret; 3829 long ret;
3830 3830
3831 atomic_inc(&rq->nr_iowait); 3831 atomic_inc(&rq->nr_iowait);
3832 ret = schedule_timeout(timeout); 3832 ret = schedule_timeout(timeout);
3833 atomic_dec(&rq->nr_iowait); 3833 atomic_dec(&rq->nr_iowait);
3834 return ret; 3834 return ret;
3835 } 3835 }
3836 3836
3837 /** 3837 /**
3838 * sys_sched_get_priority_max - return maximum RT priority. 3838 * sys_sched_get_priority_max - return maximum RT priority.
3839 * @policy: scheduling class. 3839 * @policy: scheduling class.
3840 * 3840 *
3841 * this syscall returns the maximum rt_priority that can be used 3841 * this syscall returns the maximum rt_priority that can be used
3842 * by a given scheduling class. 3842 * by a given scheduling class.
3843 */ 3843 */
3844 asmlinkage long sys_sched_get_priority_max(int policy) 3844 asmlinkage long sys_sched_get_priority_max(int policy)
3845 { 3845 {
3846 int ret = -EINVAL; 3846 int ret = -EINVAL;
3847 3847
3848 switch (policy) { 3848 switch (policy) {
3849 case SCHED_FIFO: 3849 case SCHED_FIFO:
3850 case SCHED_RR: 3850 case SCHED_RR:
3851 ret = MAX_USER_RT_PRIO-1; 3851 ret = MAX_USER_RT_PRIO-1;
3852 break; 3852 break;
3853 case SCHED_NORMAL: 3853 case SCHED_NORMAL:
3854 ret = 0; 3854 ret = 0;
3855 break; 3855 break;
3856 } 3856 }
3857 return ret; 3857 return ret;
3858 } 3858 }
3859 3859
3860 /** 3860 /**
3861 * sys_sched_get_priority_min - return minimum RT priority. 3861 * sys_sched_get_priority_min - return minimum RT priority.
3862 * @policy: scheduling class. 3862 * @policy: scheduling class.
3863 * 3863 *
3864 * this syscall returns the minimum rt_priority that can be used 3864 * this syscall returns the minimum rt_priority that can be used
3865 * by a given scheduling class. 3865 * by a given scheduling class.
3866 */ 3866 */
3867 asmlinkage long sys_sched_get_priority_min(int policy) 3867 asmlinkage long sys_sched_get_priority_min(int policy)
3868 { 3868 {
3869 int ret = -EINVAL; 3869 int ret = -EINVAL;
3870 3870
3871 switch (policy) { 3871 switch (policy) {
3872 case SCHED_FIFO: 3872 case SCHED_FIFO:
3873 case SCHED_RR: 3873 case SCHED_RR:
3874 ret = 1; 3874 ret = 1;
3875 break; 3875 break;
3876 case SCHED_NORMAL: 3876 case SCHED_NORMAL:
3877 ret = 0; 3877 ret = 0;
3878 } 3878 }
3879 return ret; 3879 return ret;
3880 } 3880 }
3881 3881
3882 /** 3882 /**
3883 * sys_sched_rr_get_interval - return the default timeslice of a process. 3883 * sys_sched_rr_get_interval - return the default timeslice of a process.
3884 * @pid: pid of the process. 3884 * @pid: pid of the process.
3885 * @interval: userspace pointer to the timeslice value. 3885 * @interval: userspace pointer to the timeslice value.
3886 * 3886 *
3887 * this syscall writes the default timeslice value of a given process 3887 * this syscall writes the default timeslice value of a given process
3888 * into the user-space timespec buffer. A value of '0' means infinity. 3888 * into the user-space timespec buffer. A value of '0' means infinity.
3889 */ 3889 */
3890 asmlinkage 3890 asmlinkage
3891 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) 3891 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
3892 { 3892 {
3893 int retval = -EINVAL; 3893 int retval = -EINVAL;
3894 struct timespec t; 3894 struct timespec t;
3895 task_t *p; 3895 task_t *p;
3896 3896
3897 if (pid < 0) 3897 if (pid < 0)
3898 goto out_nounlock; 3898 goto out_nounlock;
3899 3899
3900 retval = -ESRCH; 3900 retval = -ESRCH;
3901 read_lock(&tasklist_lock); 3901 read_lock(&tasklist_lock);
3902 p = find_process_by_pid(pid); 3902 p = find_process_by_pid(pid);
3903 if (!p) 3903 if (!p)
3904 goto out_unlock; 3904 goto out_unlock;
3905 3905
3906 retval = security_task_getscheduler(p); 3906 retval = security_task_getscheduler(p);
3907 if (retval) 3907 if (retval)
3908 goto out_unlock; 3908 goto out_unlock;
3909 3909
3910 jiffies_to_timespec(p->policy & SCHED_FIFO ? 3910 jiffies_to_timespec(p->policy & SCHED_FIFO ?
3911 0 : task_timeslice(p), &t); 3911 0 : task_timeslice(p), &t);
3912 read_unlock(&tasklist_lock); 3912 read_unlock(&tasklist_lock);
3913 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 3913 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
3914 out_nounlock: 3914 out_nounlock:
3915 return retval; 3915 return retval;
3916 out_unlock: 3916 out_unlock:
3917 read_unlock(&tasklist_lock); 3917 read_unlock(&tasklist_lock);
3918 return retval; 3918 return retval;
3919 } 3919 }
3920 3920
3921 static inline struct task_struct *eldest_child(struct task_struct *p) 3921 static inline struct task_struct *eldest_child(struct task_struct *p)
3922 { 3922 {
3923 if (list_empty(&p->children)) return NULL; 3923 if (list_empty(&p->children)) return NULL;
3924 return list_entry(p->children.next,struct task_struct,sibling); 3924 return list_entry(p->children.next,struct task_struct,sibling);
3925 } 3925 }
3926 3926
3927 static inline struct task_struct *older_sibling(struct task_struct *p) 3927 static inline struct task_struct *older_sibling(struct task_struct *p)
3928 { 3928 {
3929 if (p->sibling.prev==&p->parent->children) return NULL; 3929 if (p->sibling.prev==&p->parent->children) return NULL;
3930 return list_entry(p->sibling.prev,struct task_struct,sibling); 3930 return list_entry(p->sibling.prev,struct task_struct,sibling);
3931 } 3931 }
3932 3932
3933 static inline struct task_struct *younger_sibling(struct task_struct *p) 3933 static inline struct task_struct *younger_sibling(struct task_struct *p)
3934 { 3934 {
3935 if (p->sibling.next==&p->parent->children) return NULL; 3935 if (p->sibling.next==&p->parent->children) return NULL;
3936 return list_entry(p->sibling.next,struct task_struct,sibling); 3936 return list_entry(p->sibling.next,struct task_struct,sibling);
3937 } 3937 }
3938 3938
3939 static void show_task(task_t * p) 3939 static void show_task(task_t * p)
3940 { 3940 {
3941 task_t *relative; 3941 task_t *relative;
3942 unsigned state; 3942 unsigned state;
3943 unsigned long free = 0; 3943 unsigned long free = 0;
3944 static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; 3944 static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" };
3945 3945
3946 printk("%-13.13s ", p->comm); 3946 printk("%-13.13s ", p->comm);
3947 state = p->state ? __ffs(p->state) + 1 : 0; 3947 state = p->state ? __ffs(p->state) + 1 : 0;
3948 if (state < ARRAY_SIZE(stat_nam)) 3948 if (state < ARRAY_SIZE(stat_nam))
3949 printk(stat_nam[state]); 3949 printk(stat_nam[state]);
3950 else 3950 else
3951 printk("?"); 3951 printk("?");
3952 #if (BITS_PER_LONG == 32) 3952 #if (BITS_PER_LONG == 32)
3953 if (state == TASK_RUNNING) 3953 if (state == TASK_RUNNING)
3954 printk(" running "); 3954 printk(" running ");
3955 else 3955 else
3956 printk(" %08lX ", thread_saved_pc(p)); 3956 printk(" %08lX ", thread_saved_pc(p));
3957 #else 3957 #else
3958 if (state == TASK_RUNNING) 3958 if (state == TASK_RUNNING)
3959 printk(" running task "); 3959 printk(" running task ");
3960 else 3960 else
3961 printk(" %016lx ", thread_saved_pc(p)); 3961 printk(" %016lx ", thread_saved_pc(p));
3962 #endif 3962 #endif
3963 #ifdef CONFIG_DEBUG_STACK_USAGE 3963 #ifdef CONFIG_DEBUG_STACK_USAGE
3964 { 3964 {
3965 unsigned long * n = (unsigned long *) (p->thread_info+1); 3965 unsigned long * n = (unsigned long *) (p->thread_info+1);
3966 while (!*n) 3966 while (!*n)
3967 n++; 3967 n++;
3968 free = (unsigned long) n - (unsigned long)(p->thread_info+1); 3968 free = (unsigned long) n - (unsigned long)(p->thread_info+1);
3969 } 3969 }
3970 #endif 3970 #endif
3971 printk("%5lu %5d %6d ", free, p->pid, p->parent->pid); 3971 printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
3972 if ((relative = eldest_child(p))) 3972 if ((relative = eldest_child(p)))
3973 printk("%5d ", relative->pid); 3973 printk("%5d ", relative->pid);
3974 else 3974 else
3975 printk(" "); 3975 printk(" ");
3976 if ((relative = younger_sibling(p))) 3976 if ((relative = younger_sibling(p)))
3977 printk("%7d", relative->pid); 3977 printk("%7d", relative->pid);
3978 else 3978 else
3979 printk(" "); 3979 printk(" ");
3980 if ((relative = older_sibling(p))) 3980 if ((relative = older_sibling(p)))
3981 printk(" %5d", relative->pid); 3981 printk(" %5d", relative->pid);
3982 else 3982 else
3983 printk(" "); 3983 printk(" ");
3984 if (!p->mm) 3984 if (!p->mm)
3985 printk(" (L-TLB)\n"); 3985 printk(" (L-TLB)\n");
3986 else 3986 else
3987 printk(" (NOTLB)\n"); 3987 printk(" (NOTLB)\n");
3988 3988
3989 if (state != TASK_RUNNING) 3989 if (state != TASK_RUNNING)
3990 show_stack(p, NULL); 3990 show_stack(p, NULL);
3991 } 3991 }
3992 3992
3993 void show_state(void) 3993 void show_state(void)
3994 { 3994 {
3995 task_t *g, *p; 3995 task_t *g, *p;
3996 3996
3997 #if (BITS_PER_LONG == 32) 3997 #if (BITS_PER_LONG == 32)
3998 printk("\n" 3998 printk("\n"
3999 " sibling\n"); 3999 " sibling\n");
4000 printk(" task PC pid father child younger older\n"); 4000 printk(" task PC pid father child younger older\n");
4001 #else 4001 #else
4002 printk("\n" 4002 printk("\n"
4003 " sibling\n"); 4003 " sibling\n");
4004 printk(" task PC pid father child younger older\n"); 4004 printk(" task PC pid father child younger older\n");
4005 #endif 4005 #endif
4006 read_lock(&tasklist_lock); 4006 read_lock(&tasklist_lock);
4007 do_each_thread(g, p) { 4007 do_each_thread(g, p) {
4008 /* 4008 /*
4009 * reset the NMI-timeout, listing all files on a slow 4009 * reset the NMI-timeout, listing all files on a slow
4010 * console might take a lot of time: 4010 * console might take a lot of time:
4011 */ 4011 */
4012 touch_nmi_watchdog(); 4012 touch_nmi_watchdog();
4013 show_task(p); 4013 show_task(p);
4014 } while_each_thread(g, p); 4014 } while_each_thread(g, p);
4015 4015
4016 read_unlock(&tasklist_lock); 4016 read_unlock(&tasklist_lock);
4017 } 4017 }
4018 4018
4019 void __devinit init_idle(task_t *idle, int cpu) 4019 void __devinit init_idle(task_t *idle, int cpu)
4020 { 4020 {
4021 runqueue_t *rq = cpu_rq(cpu); 4021 runqueue_t *rq = cpu_rq(cpu);
4022 unsigned long flags; 4022 unsigned long flags;
4023 4023
4024 idle->sleep_avg = 0; 4024 idle->sleep_avg = 0;
4025 idle->array = NULL; 4025 idle->array = NULL;
4026 idle->prio = MAX_PRIO; 4026 idle->prio = MAX_PRIO;
4027 idle->state = TASK_RUNNING; 4027 idle->state = TASK_RUNNING;
4028 idle->cpus_allowed = cpumask_of_cpu(cpu); 4028 idle->cpus_allowed = cpumask_of_cpu(cpu);
4029 set_task_cpu(idle, cpu); 4029 set_task_cpu(idle, cpu);
4030 4030
4031 spin_lock_irqsave(&rq->lock, flags); 4031 spin_lock_irqsave(&rq->lock, flags);
4032 rq->curr = rq->idle = idle; 4032 rq->curr = rq->idle = idle;
4033 set_tsk_need_resched(idle); 4033 set_tsk_need_resched(idle);
4034 spin_unlock_irqrestore(&rq->lock, flags); 4034 spin_unlock_irqrestore(&rq->lock, flags);
4035 4035
4036 /* Set the preempt count _outside_ the spinlocks! */ 4036 /* Set the preempt count _outside_ the spinlocks! */
4037 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) 4037 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4038 idle->thread_info->preempt_count = (idle->lock_depth >= 0); 4038 idle->thread_info->preempt_count = (idle->lock_depth >= 0);
4039 #else 4039 #else
4040 idle->thread_info->preempt_count = 0; 4040 idle->thread_info->preempt_count = 0;
4041 #endif 4041 #endif
4042 } 4042 }
4043 4043
4044 /* 4044 /*
4045 * In a system that switches off the HZ timer nohz_cpu_mask 4045 * In a system that switches off the HZ timer nohz_cpu_mask
4046 * indicates which cpus entered this state. This is used 4046 * indicates which cpus entered this state. This is used
4047 * in the rcu update to wait only for active cpus. For systems 4047 * in the rcu update to wait only for active cpus. For systems
4048 * which do not switch off the HZ timer, nohz_cpu_mask should 4048 * which do not switch off the HZ timer, nohz_cpu_mask should
4049 * always be CPU_MASK_NONE. 4049 * always be CPU_MASK_NONE.
4050 */ 4050 */
4051 cpumask_t nohz_cpu_mask = CPU_MASK_NONE; 4051 cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
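As a hedged illustration of how an architecture's tickless idle loop is expected to use nohz_cpu_mask (the stop_hz_timer()/start_hz_timer() hooks are assumed, arch-specific names, not part of this patch):

    /* Sketch of a tickless idle loop; stop_hz_timer()/start_hz_timer()
     * are hypothetical arch hooks. */
    int cpu = smp_processor_id();

    cpu_set(cpu, nohz_cpu_mask);        /* tell RCU not to wait on this CPU */
    stop_hz_timer();
    /* ... sleep until the next interrupt ... */
    start_hz_timer();
    cpu_clear(cpu, nohz_cpu_mask);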
4052 4052
4053 #ifdef CONFIG_SMP 4053 #ifdef CONFIG_SMP
4054 /* 4054 /*
4055 * This is how migration works: 4055 * This is how migration works:
4056 * 4056 *
4057 * 1) we queue a migration_req_t structure in the source CPU's 4057 * 1) we queue a migration_req_t structure in the source CPU's
4058 * runqueue and wake up that CPU's migration thread. 4058 * runqueue and wake up that CPU's migration thread.
4059 * 2) we down() the locked semaphore => thread blocks. 4059 * 2) we down() the locked semaphore => thread blocks.
4060 * 3) migration thread wakes up (implicitly it forces the migrated 4060 * 3) migration thread wakes up (implicitly it forces the migrated
4061 * thread off the CPU) 4061 * thread off the CPU)
4062 * 4) it gets the migration request and checks whether the migrated 4062 * 4) it gets the migration request and checks whether the migrated
4063 * task is still in the wrong runqueue. 4063 * task is still in the wrong runqueue.
4064 * 5) if it's in the wrong runqueue then the migration thread removes 4064 * 5) if it's in the wrong runqueue then the migration thread removes
4065 * it and puts it into the right queue. 4065 * it and puts it into the right queue.
4066 * 6) migration thread up()s the semaphore. 4066 * 6) migration thread up()s the semaphore.
4067 * 7) we wake up and the migration is done. 4067 * 7) we wake up and the migration is done.
4068 */ 4068 */
4069 4069
4070 /* 4070 /*
4071 * Change a given task's CPU affinity. Migrate the thread to a 4071 * Change a given task's CPU affinity. Migrate the thread to a
4072 * proper CPU and schedule it away if the CPU it's executing on 4072 * proper CPU and schedule it away if the CPU it's executing on
4073 * is removed from the allowed bitmask. 4073 * is removed from the allowed bitmask.
4074 * 4074 *
4075 * NOTE: the caller must have a valid reference to the task, the 4075 * NOTE: the caller must have a valid reference to the task, the
4076 * task must not exit() & deallocate itself prematurely. The 4076 * task must not exit() & deallocate itself prematurely. The
4077 * call is not atomic; no spinlocks may be held. 4077 * call is not atomic; no spinlocks may be held.
4078 */ 4078 */
4079 int set_cpus_allowed(task_t *p, cpumask_t new_mask) 4079 int set_cpus_allowed(task_t *p, cpumask_t new_mask)
4080 { 4080 {
4081 unsigned long flags; 4081 unsigned long flags;
4082 int ret = 0; 4082 int ret = 0;
4083 migration_req_t req; 4083 migration_req_t req;
4084 runqueue_t *rq; 4084 runqueue_t *rq;
4085 4085
4086 rq = task_rq_lock(p, &flags); 4086 rq = task_rq_lock(p, &flags);
4087 if (!cpus_intersects(new_mask, cpu_online_map)) { 4087 if (!cpus_intersects(new_mask, cpu_online_map)) {
4088 ret = -EINVAL; 4088 ret = -EINVAL;
4089 goto out; 4089 goto out;
4090 } 4090 }
4091 4091
4092 p->cpus_allowed = new_mask; 4092 p->cpus_allowed = new_mask;
4093 /* Can the task run on the task's current CPU? If so, we're done */ 4093 /* Can the task run on the task's current CPU? If so, we're done */
4094 if (cpu_isset(task_cpu(p), new_mask)) 4094 if (cpu_isset(task_cpu(p), new_mask))
4095 goto out; 4095 goto out;
4096 4096
4097 if (migrate_task(p, any_online_cpu(new_mask), &req)) { 4097 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4098 /* Need help from migration thread: drop lock and wait. */ 4098 /* Need help from migration thread: drop lock and wait. */
4099 task_rq_unlock(rq, &flags); 4099 task_rq_unlock(rq, &flags);
4100 wake_up_process(rq->migration_thread); 4100 wake_up_process(rq->migration_thread);
4101 wait_for_completion(&req.done); 4101 wait_for_completion(&req.done);
4102 tlb_migrate_finish(p->mm); 4102 tlb_migrate_finish(p->mm);
4103 return 0; 4103 return 0;
4104 } 4104 }
4105 out: 4105 out:
4106 task_rq_unlock(rq, &flags); 4106 task_rq_unlock(rq, &flags);
4107 return ret; 4107 return ret;
4108 } 4108 }
4109 4109
4110 EXPORT_SYMBOL_GPL(set_cpus_allowed); 4110 EXPORT_SYMBOL_GPL(set_cpus_allowed);
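For context, a minimal (hypothetical) caller of set_cpus_allowed() looks roughly like the sketch below; the CPU number and the error handling are illustrative, not taken from this patch:

    /* Hypothetical caller: pin the current task to CPU 2. */
    cpumask_t mask = cpumask_of_cpu(2);

    if (set_cpus_allowed(current, mask) < 0)
        printk(KERN_WARNING "could not move task onto CPU 2\n");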
4111 4111
4112 /* 4112 /*
4113 * Move (not current) task off this cpu, onto dest cpu. We're doing 4113 * Move (not current) task off this cpu, onto dest cpu. We're doing
4114 * this because either it can no longer run here (set_cpus_allowed() 4114 * this because either it can no longer run here (set_cpus_allowed()
4115 * moved it away from this CPU, or the CPU is going down), or because we're 4115 * moved it away from this CPU, or the CPU is going down), or because we're
4116 * attempting to rebalance this task on exec (sched_exec). 4116 * attempting to rebalance this task on exec (sched_exec).
4117 * 4117 *
4118 * So we race with normal scheduler movements, but that's OK, as long 4118 * So we race with normal scheduler movements, but that's OK, as long
4119 * as the task is no longer on this CPU. 4119 * as the task is no longer on this CPU.
4120 */ 4120 */
4121 static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4121 static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4122 { 4122 {
4123 runqueue_t *rq_dest, *rq_src; 4123 runqueue_t *rq_dest, *rq_src;
4124 4124
4125 if (unlikely(cpu_is_offline(dest_cpu))) 4125 if (unlikely(cpu_is_offline(dest_cpu)))
4126 return; 4126 return;
4127 4127
4128 rq_src = cpu_rq(src_cpu); 4128 rq_src = cpu_rq(src_cpu);
4129 rq_dest = cpu_rq(dest_cpu); 4129 rq_dest = cpu_rq(dest_cpu);
4130 4130
4131 double_rq_lock(rq_src, rq_dest); 4131 double_rq_lock(rq_src, rq_dest);
4132 /* Already moved. */ 4132 /* Already moved. */
4133 if (task_cpu(p) != src_cpu) 4133 if (task_cpu(p) != src_cpu)
4134 goto out; 4134 goto out;
4135 /* Affinity changed (again). */ 4135 /* Affinity changed (again). */
4136 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 4136 if (!cpu_isset(dest_cpu, p->cpus_allowed))
4137 goto out; 4137 goto out;
4138 4138
4139 set_task_cpu(p, dest_cpu); 4139 set_task_cpu(p, dest_cpu);
4140 if (p->array) { 4140 if (p->array) {
4141 /* 4141 /*
4142 * Sync timestamp with rq_dest's before activating. 4142 * Sync timestamp with rq_dest's before activating.
4143 * The same thing could be achieved by doing this step 4143 * The same thing could be achieved by doing this step
4144 * afterwards, and pretending it was a local activate. 4144 * afterwards, and pretending it was a local activate.
4145 * This way is cleaner and logically correct. 4145 * This way is cleaner and logically correct.
4146 */ 4146 */
4147 p->timestamp = p->timestamp - rq_src->timestamp_last_tick 4147 p->timestamp = p->timestamp - rq_src->timestamp_last_tick
4148 + rq_dest->timestamp_last_tick; 4148 + rq_dest->timestamp_last_tick;
4149 deactivate_task(p, rq_src); 4149 deactivate_task(p, rq_src);
4150 activate_task(p, rq_dest, 0); 4150 activate_task(p, rq_dest, 0);
4151 if (TASK_PREEMPTS_CURR(p, rq_dest)) 4151 if (TASK_PREEMPTS_CURR(p, rq_dest))
4152 resched_task(rq_dest->curr); 4152 resched_task(rq_dest->curr);
4153 } 4153 }
4154 4154
4155 out: 4155 out:
4156 double_rq_unlock(rq_src, rq_dest); 4156 double_rq_unlock(rq_src, rq_dest);
4157 } 4157 }
4158 4158
4159 /* 4159 /*
4160 * migration_thread - this is a highprio system thread that performs 4160 * migration_thread - this is a highprio system thread that performs
4161 * thread migration by bumping thread off CPU then 'pushing' onto 4161 * thread migration by bumping thread off CPU then 'pushing' onto
4162 * another runqueue. 4162 * another runqueue.
4163 */ 4163 */
4164 static int migration_thread(void * data) 4164 static int migration_thread(void * data)
4165 { 4165 {
4166 runqueue_t *rq; 4166 runqueue_t *rq;
4167 int cpu = (long)data; 4167 int cpu = (long)data;
4168 4168
4169 rq = cpu_rq(cpu); 4169 rq = cpu_rq(cpu);
4170 BUG_ON(rq->migration_thread != current); 4170 BUG_ON(rq->migration_thread != current);
4171 4171
4172 set_current_state(TASK_INTERRUPTIBLE); 4172 set_current_state(TASK_INTERRUPTIBLE);
4173 while (!kthread_should_stop()) { 4173 while (!kthread_should_stop()) {
4174 struct list_head *head; 4174 struct list_head *head;
4175 migration_req_t *req; 4175 migration_req_t *req;
4176 4176
4177 if (current->flags & PF_FREEZE) 4177 if (current->flags & PF_FREEZE)
4178 refrigerator(PF_FREEZE); 4178 refrigerator(PF_FREEZE);
4179 4179
4180 spin_lock_irq(&rq->lock); 4180 spin_lock_irq(&rq->lock);
4181 4181
4182 if (cpu_is_offline(cpu)) { 4182 if (cpu_is_offline(cpu)) {
4183 spin_unlock_irq(&rq->lock); 4183 spin_unlock_irq(&rq->lock);
4184 goto wait_to_die; 4184 goto wait_to_die;
4185 } 4185 }
4186 4186
4187 if (rq->active_balance) { 4187 if (rq->active_balance) {
4188 active_load_balance(rq, cpu); 4188 active_load_balance(rq, cpu);
4189 rq->active_balance = 0; 4189 rq->active_balance = 0;
4190 } 4190 }
4191 4191
4192 head = &rq->migration_queue; 4192 head = &rq->migration_queue;
4193 4193
4194 if (list_empty(head)) { 4194 if (list_empty(head)) {
4195 spin_unlock_irq(&rq->lock); 4195 spin_unlock_irq(&rq->lock);
4196 schedule(); 4196 schedule();
4197 set_current_state(TASK_INTERRUPTIBLE); 4197 set_current_state(TASK_INTERRUPTIBLE);
4198 continue; 4198 continue;
4199 } 4199 }
4200 req = list_entry(head->next, migration_req_t, list); 4200 req = list_entry(head->next, migration_req_t, list);
4201 list_del_init(head->next); 4201 list_del_init(head->next);
4202 4202
4203 if (req->type == REQ_MOVE_TASK) { 4203 if (req->type == REQ_MOVE_TASK) {
4204 spin_unlock(&rq->lock); 4204 spin_unlock(&rq->lock);
4205 __migrate_task(req->task, cpu, req->dest_cpu); 4205 __migrate_task(req->task, cpu, req->dest_cpu);
4206 local_irq_enable(); 4206 local_irq_enable();
4207 } else if (req->type == REQ_SET_DOMAIN) { 4207 } else if (req->type == REQ_SET_DOMAIN) {
4208 rq->sd = req->sd; 4208 rq->sd = req->sd;
4209 spin_unlock_irq(&rq->lock); 4209 spin_unlock_irq(&rq->lock);
4210 } else { 4210 } else {
4211 spin_unlock_irq(&rq->lock); 4211 spin_unlock_irq(&rq->lock);
4212 WARN_ON(1); 4212 WARN_ON(1);
4213 } 4213 }
4214 4214
4215 complete(&req->done); 4215 complete(&req->done);
4216 } 4216 }
4217 __set_current_state(TASK_RUNNING); 4217 __set_current_state(TASK_RUNNING);
4218 return 0; 4218 return 0;
4219 4219
4220 wait_to_die: 4220 wait_to_die:
4221 /* Wait for kthread_stop */ 4221 /* Wait for kthread_stop */
4222 set_current_state(TASK_INTERRUPTIBLE); 4222 set_current_state(TASK_INTERRUPTIBLE);
4223 while (!kthread_should_stop()) { 4223 while (!kthread_should_stop()) {
4224 schedule(); 4224 schedule();
4225 set_current_state(TASK_INTERRUPTIBLE); 4225 set_current_state(TASK_INTERRUPTIBLE);
4226 } 4226 }
4227 __set_current_state(TASK_RUNNING); 4227 __set_current_state(TASK_RUNNING);
4228 return 0; 4228 return 0;
4229 } 4229 }
4230 4230
4231 #ifdef CONFIG_HOTPLUG_CPU 4231 #ifdef CONFIG_HOTPLUG_CPU
4232 /* Figure out where task on dead CPU should go, use force if necessary. */ 4232 /* Figure out where task on dead CPU should go, use force if necessary. */
4233 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) 4233 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
4234 { 4234 {
4235 int dest_cpu; 4235 int dest_cpu;
4236 cpumask_t mask; 4236 cpumask_t mask;
4237 4237
4238 /* On same node? */ 4238 /* On same node? */
4239 mask = node_to_cpumask(cpu_to_node(dead_cpu)); 4239 mask = node_to_cpumask(cpu_to_node(dead_cpu));
4240 cpus_and(mask, mask, tsk->cpus_allowed); 4240 cpus_and(mask, mask, tsk->cpus_allowed);
4241 dest_cpu = any_online_cpu(mask); 4241 dest_cpu = any_online_cpu(mask);
4242 4242
4243 /* On any allowed CPU? */ 4243 /* On any allowed CPU? */
4244 if (dest_cpu == NR_CPUS) 4244 if (dest_cpu == NR_CPUS)
4245 dest_cpu = any_online_cpu(tsk->cpus_allowed); 4245 dest_cpu = any_online_cpu(tsk->cpus_allowed);
4246 4246
4247 /* No more Mr. Nice Guy. */ 4247 /* No more Mr. Nice Guy. */
4248 if (dest_cpu == NR_CPUS) { 4248 if (dest_cpu == NR_CPUS) {
4249 cpus_setall(tsk->cpus_allowed); 4249 cpus_setall(tsk->cpus_allowed);
4250 dest_cpu = any_online_cpu(tsk->cpus_allowed); 4250 dest_cpu = any_online_cpu(tsk->cpus_allowed);
4251 4251
4252 /* 4252 /*
4253 * Don't tell them about moving exiting tasks or 4253 * Don't tell them about moving exiting tasks or
4254 * kernel threads (both mm NULL), since they never 4254 * kernel threads (both mm NULL), since they never
4255 * leave kernel. 4255 * leave kernel.
4256 */ 4256 */
4257 if (tsk->mm && printk_ratelimit()) 4257 if (tsk->mm && printk_ratelimit())
4258 printk(KERN_INFO "process %d (%s) no " 4258 printk(KERN_INFO "process %d (%s) no "
4259 "longer affine to cpu%d\n", 4259 "longer affine to cpu%d\n",
4260 tsk->pid, tsk->comm, dead_cpu); 4260 tsk->pid, tsk->comm, dead_cpu);
4261 } 4261 }
4262 __migrate_task(tsk, dead_cpu, dest_cpu); 4262 __migrate_task(tsk, dead_cpu, dest_cpu);
4263 } 4263 }
4264 4264
4265 /* 4265 /*
4266 * While a dead CPU has no uninterruptible tasks queued at this point, 4266 * While a dead CPU has no uninterruptible tasks queued at this point,
4267 * it might still have a nonzero ->nr_uninterruptible counter, because 4267 * it might still have a nonzero ->nr_uninterruptible counter, because
4268 * for performance reasons the counter is not strictly tracking tasks to 4268 * for performance reasons the counter is not strictly tracking tasks to
4269 * their home CPUs. So we just add the counter to another CPU's counter, 4269 * their home CPUs. So we just add the counter to another CPU's counter,
4270 * to keep the global sum constant after CPU-down: 4270 * to keep the global sum constant after CPU-down:
4271 */ 4271 */
4272 static void migrate_nr_uninterruptible(runqueue_t *rq_src) 4272 static void migrate_nr_uninterruptible(runqueue_t *rq_src)
4273 { 4273 {
4274 runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); 4274 runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
4275 unsigned long flags; 4275 unsigned long flags;
4276 4276
4277 local_irq_save(flags); 4277 local_irq_save(flags);
4278 double_rq_lock(rq_src, rq_dest); 4278 double_rq_lock(rq_src, rq_dest);
4279 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 4279 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
4280 rq_src->nr_uninterruptible = 0; 4280 rq_src->nr_uninterruptible = 0;
4281 double_rq_unlock(rq_src, rq_dest); 4281 double_rq_unlock(rq_src, rq_dest);
4282 local_irq_restore(flags); 4282 local_irq_restore(flags);
4283 } 4283 }
4284 4284
4285 /* Run through task list and migrate tasks from the dead cpu. */ 4285 /* Run through task list and migrate tasks from the dead cpu. */
4286 static void migrate_live_tasks(int src_cpu) 4286 static void migrate_live_tasks(int src_cpu)
4287 { 4287 {
4288 struct task_struct *tsk, *t; 4288 struct task_struct *tsk, *t;
4289 4289
4290 write_lock_irq(&tasklist_lock); 4290 write_lock_irq(&tasklist_lock);
4291 4291
4292 do_each_thread(t, tsk) { 4292 do_each_thread(t, tsk) {
4293 if (tsk == current) 4293 if (tsk == current)
4294 continue; 4294 continue;
4295 4295
4296 if (task_cpu(tsk) == src_cpu) 4296 if (task_cpu(tsk) == src_cpu)
4297 move_task_off_dead_cpu(src_cpu, tsk); 4297 move_task_off_dead_cpu(src_cpu, tsk);
4298 } while_each_thread(t, tsk); 4298 } while_each_thread(t, tsk);
4299 4299
4300 write_unlock_irq(&tasklist_lock); 4300 write_unlock_irq(&tasklist_lock);
4301 } 4301 }
4302 4302
4303 /* Schedules idle task to be the next runnable task on current CPU. 4303 /* Schedules idle task to be the next runnable task on current CPU.
4304 * It does so by boosting its priority to highest possible and adding it to 4304 * It does so by boosting its priority to highest possible and adding it to
4305 * the _front_ of runqueue. Used by CPU offline code. 4305 * the _front_ of runqueue. Used by CPU offline code.
4306 */ 4306 */
4307 void sched_idle_next(void) 4307 void sched_idle_next(void)
4308 { 4308 {
4309 int cpu = smp_processor_id(); 4309 int cpu = smp_processor_id();
4310 runqueue_t *rq = this_rq(); 4310 runqueue_t *rq = this_rq();
4311 struct task_struct *p = rq->idle; 4311 struct task_struct *p = rq->idle;
4312 unsigned long flags; 4312 unsigned long flags;
4313 4313
4314 /* cpu has to be offline */ 4314 /* cpu has to be offline */
4315 BUG_ON(cpu_online(cpu)); 4315 BUG_ON(cpu_online(cpu));
4316 4316
4317 /* Strictly not necessary since the rest of the CPUs are stopped by now 4317 /* Strictly not necessary since the rest of the CPUs are stopped by now
4318 * and interrupts are disabled on the current cpu. 4318 * and interrupts are disabled on the current cpu.
4319 */ 4319 */
4320 spin_lock_irqsave(&rq->lock, flags); 4320 spin_lock_irqsave(&rq->lock, flags);
4321 4321
4322 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 4322 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
4323 /* Add idle task to _front_ of its priority queue */ 4323 /* Add idle task to _front_ of its priority queue */
4324 __activate_idle_task(p, rq); 4324 __activate_idle_task(p, rq);
4325 4325
4326 spin_unlock_irqrestore(&rq->lock, flags); 4326 spin_unlock_irqrestore(&rq->lock, flags);
4327 } 4327 }
4328 4328
4329 /* Ensures that the idle task is using init_mm right before its cpu goes 4329 /* Ensures that the idle task is using init_mm right before its cpu goes
4330 * offline. 4330 * offline.
4331 */ 4331 */
4332 void idle_task_exit(void) 4332 void idle_task_exit(void)
4333 { 4333 {
4334 struct mm_struct *mm = current->active_mm; 4334 struct mm_struct *mm = current->active_mm;
4335 4335
4336 BUG_ON(cpu_online(smp_processor_id())); 4336 BUG_ON(cpu_online(smp_processor_id()));
4337 4337
4338 if (mm != &init_mm) 4338 if (mm != &init_mm)
4339 switch_mm(mm, &init_mm, current); 4339 switch_mm(mm, &init_mm, current);
4340 mmdrop(mm); 4340 mmdrop(mm);
4341 } 4341 }
4342 4342
4343 static void migrate_dead(unsigned int dead_cpu, task_t *tsk) 4343 static void migrate_dead(unsigned int dead_cpu, task_t *tsk)
4344 { 4344 {
4345 struct runqueue *rq = cpu_rq(dead_cpu); 4345 struct runqueue *rq = cpu_rq(dead_cpu);
4346 4346
4347 /* Must be exiting, otherwise would be on tasklist. */ 4347 /* Must be exiting, otherwise would be on tasklist. */
4348 BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD); 4348 BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD);
4349 4349
4350 /* Cannot have done final schedule yet: would have vanished. */ 4350 /* Cannot have done final schedule yet: would have vanished. */
4351 BUG_ON(tsk->flags & PF_DEAD); 4351 BUG_ON(tsk->flags & PF_DEAD);
4352 4352
4353 get_task_struct(tsk); 4353 get_task_struct(tsk);
4354 4354
4355 /* 4355 /*
4356 * Drop lock around migration; if someone else moves it, 4356 * Drop lock around migration; if someone else moves it,
4357 * that's OK. No task can be added to this CPU, so iteration is 4357 * that's OK. No task can be added to this CPU, so iteration is
4358 * fine. 4358 * fine.
4359 */ 4359 */
4360 spin_unlock_irq(&rq->lock); 4360 spin_unlock_irq(&rq->lock);
4361 move_task_off_dead_cpu(dead_cpu, tsk); 4361 move_task_off_dead_cpu(dead_cpu, tsk);
4362 spin_lock_irq(&rq->lock); 4362 spin_lock_irq(&rq->lock);
4363 4363
4364 put_task_struct(tsk); 4364 put_task_struct(tsk);
4365 } 4365 }
4366 4366
4367 /* release_task() removes task from tasklist, so we won't find dead tasks. */ 4367 /* release_task() removes task from tasklist, so we won't find dead tasks. */
4368 static void migrate_dead_tasks(unsigned int dead_cpu) 4368 static void migrate_dead_tasks(unsigned int dead_cpu)
4369 { 4369 {
4370 unsigned arr, i; 4370 unsigned arr, i;
4371 struct runqueue *rq = cpu_rq(dead_cpu); 4371 struct runqueue *rq = cpu_rq(dead_cpu);
4372 4372
4373 for (arr = 0; arr < 2; arr++) { 4373 for (arr = 0; arr < 2; arr++) {
4374 for (i = 0; i < MAX_PRIO; i++) { 4374 for (i = 0; i < MAX_PRIO; i++) {
4375 struct list_head *list = &rq->arrays[arr].queue[i]; 4375 struct list_head *list = &rq->arrays[arr].queue[i];
4376 while (!list_empty(list)) 4376 while (!list_empty(list))
4377 migrate_dead(dead_cpu, 4377 migrate_dead(dead_cpu,
4378 list_entry(list->next, task_t, 4378 list_entry(list->next, task_t,
4379 run_list)); 4379 run_list));
4380 } 4380 }
4381 } 4381 }
4382 } 4382 }
4383 #endif /* CONFIG_HOTPLUG_CPU */ 4383 #endif /* CONFIG_HOTPLUG_CPU */
4384 4384
4385 /* 4385 /*
4386 * migration_call - callback that gets triggered when a CPU is added. 4386 * migration_call - callback that gets triggered when a CPU is added.
4387 * Here we can start up the necessary migration thread for the new CPU. 4387 * Here we can start up the necessary migration thread for the new CPU.
4388 */ 4388 */
4389 static int migration_call(struct notifier_block *nfb, unsigned long action, 4389 static int migration_call(struct notifier_block *nfb, unsigned long action,
4390 void *hcpu) 4390 void *hcpu)
4391 { 4391 {
4392 int cpu = (long)hcpu; 4392 int cpu = (long)hcpu;
4393 struct task_struct *p; 4393 struct task_struct *p;
4394 struct runqueue *rq; 4394 struct runqueue *rq;
4395 unsigned long flags; 4395 unsigned long flags;
4396 4396
4397 switch (action) { 4397 switch (action) {
4398 case CPU_UP_PREPARE: 4398 case CPU_UP_PREPARE:
4399 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); 4399 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
4400 if (IS_ERR(p)) 4400 if (IS_ERR(p))
4401 return NOTIFY_BAD; 4401 return NOTIFY_BAD;
4402 p->flags |= PF_NOFREEZE; 4402 p->flags |= PF_NOFREEZE;
4403 kthread_bind(p, cpu); 4403 kthread_bind(p, cpu);
4404 /* Must be high prio: stop_machine expects to yield to it. */ 4404 /* Must be high prio: stop_machine expects to yield to it. */
4405 rq = task_rq_lock(p, &flags); 4405 rq = task_rq_lock(p, &flags);
4406 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 4406 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
4407 task_rq_unlock(rq, &flags); 4407 task_rq_unlock(rq, &flags);
4408 cpu_rq(cpu)->migration_thread = p; 4408 cpu_rq(cpu)->migration_thread = p;
4409 break; 4409 break;
4410 case CPU_ONLINE: 4410 case CPU_ONLINE:
4411 /* Strictly unnecessary, as first user will wake it. */ 4411 /* Strictly unnecessary, as first user will wake it. */
4412 wake_up_process(cpu_rq(cpu)->migration_thread); 4412 wake_up_process(cpu_rq(cpu)->migration_thread);
4413 break; 4413 break;
4414 #ifdef CONFIG_HOTPLUG_CPU 4414 #ifdef CONFIG_HOTPLUG_CPU
4415 case CPU_UP_CANCELED: 4415 case CPU_UP_CANCELED:
4416 /* Unbind it from offline cpu so it can run. Fall thru. */ 4416 /* Unbind it from offline cpu so it can run. Fall thru. */
4417 kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id()); 4417 kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
4418 kthread_stop(cpu_rq(cpu)->migration_thread); 4418 kthread_stop(cpu_rq(cpu)->migration_thread);
4419 cpu_rq(cpu)->migration_thread = NULL; 4419 cpu_rq(cpu)->migration_thread = NULL;
4420 break; 4420 break;
4421 case CPU_DEAD: 4421 case CPU_DEAD:
4422 migrate_live_tasks(cpu); 4422 migrate_live_tasks(cpu);
4423 rq = cpu_rq(cpu); 4423 rq = cpu_rq(cpu);
4424 kthread_stop(rq->migration_thread); 4424 kthread_stop(rq->migration_thread);
4425 rq->migration_thread = NULL; 4425 rq->migration_thread = NULL;
4426 /* Idle task back to normal (off runqueue, low prio) */ 4426 /* Idle task back to normal (off runqueue, low prio) */
4427 rq = task_rq_lock(rq->idle, &flags); 4427 rq = task_rq_lock(rq->idle, &flags);
4428 deactivate_task(rq->idle, rq); 4428 deactivate_task(rq->idle, rq);
4429 rq->idle->static_prio = MAX_PRIO; 4429 rq->idle->static_prio = MAX_PRIO;
4430 __setscheduler(rq->idle, SCHED_NORMAL, 0); 4430 __setscheduler(rq->idle, SCHED_NORMAL, 0);
4431 migrate_dead_tasks(cpu); 4431 migrate_dead_tasks(cpu);
4432 task_rq_unlock(rq, &flags); 4432 task_rq_unlock(rq, &flags);
4433 migrate_nr_uninterruptible(rq); 4433 migrate_nr_uninterruptible(rq);
4434 BUG_ON(rq->nr_running != 0); 4434 BUG_ON(rq->nr_running != 0);
4435 4435
4436 /* No need to migrate the tasks: it was best-effort if 4436 /* No need to migrate the tasks: it was best-effort if
4437 * they didn't do lock_cpu_hotplug(). Just wake up 4437 * they didn't do lock_cpu_hotplug(). Just wake up
4438 * the requestors. */ 4438 * the requestors. */
4439 spin_lock_irq(&rq->lock); 4439 spin_lock_irq(&rq->lock);
4440 while (!list_empty(&rq->migration_queue)) { 4440 while (!list_empty(&rq->migration_queue)) {
4441 migration_req_t *req; 4441 migration_req_t *req;
4442 req = list_entry(rq->migration_queue.next, 4442 req = list_entry(rq->migration_queue.next,
4443 migration_req_t, list); 4443 migration_req_t, list);
4444 BUG_ON(req->type != REQ_MOVE_TASK); 4444 BUG_ON(req->type != REQ_MOVE_TASK);
4445 list_del_init(&req->list); 4445 list_del_init(&req->list);
4446 complete(&req->done); 4446 complete(&req->done);
4447 } 4447 }
4448 spin_unlock_irq(&rq->lock); 4448 spin_unlock_irq(&rq->lock);
4449 break; 4449 break;
4450 #endif 4450 #endif
4451 } 4451 }
4452 return NOTIFY_OK; 4452 return NOTIFY_OK;
4453 } 4453 }
4454 4454
4455 /* Register at highest priority so that task migration (migrate_all_tasks) 4455 /* Register at highest priority so that task migration (migrate_all_tasks)
4456 * happens before everything else. 4456 * happens before everything else.
4457 */ 4457 */
4458 static struct notifier_block __devinitdata migration_notifier = { 4458 static struct notifier_block __devinitdata migration_notifier = {
4459 .notifier_call = migration_call, 4459 .notifier_call = migration_call,
4460 .priority = 10 4460 .priority = 10
4461 }; 4461 };
4462 4462
4463 int __init migration_init(void) 4463 int __init migration_init(void)
4464 { 4464 {
4465 void *cpu = (void *)(long)smp_processor_id(); 4465 void *cpu = (void *)(long)smp_processor_id();
4466 /* Start one for boot CPU. */ 4466 /* Start one for boot CPU. */
4467 migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 4467 migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
4468 migration_call(&migration_notifier, CPU_ONLINE, cpu); 4468 migration_call(&migration_notifier, CPU_ONLINE, cpu);
4469 register_cpu_notifier(&migration_notifier); 4469 register_cpu_notifier(&migration_notifier);
4470 return 0; 4470 return 0;
4471 } 4471 }
4472 #endif 4472 #endif
4473 4473
4474 #ifdef CONFIG_SMP 4474 #ifdef CONFIG_SMP
4475 #define SCHED_DOMAIN_DEBUG 4475 #define SCHED_DOMAIN_DEBUG
4476 #ifdef SCHED_DOMAIN_DEBUG 4476 #ifdef SCHED_DOMAIN_DEBUG
4477 static void sched_domain_debug(struct sched_domain *sd, int cpu) 4477 static void sched_domain_debug(struct sched_domain *sd, int cpu)
4478 { 4478 {
4479 int level = 0; 4479 int level = 0;
4480 4480
4481 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 4481 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
4482 4482
4483 do { 4483 do {
4484 int i; 4484 int i;
4485 char str[NR_CPUS]; 4485 char str[NR_CPUS];
4486 struct sched_group *group = sd->groups; 4486 struct sched_group *group = sd->groups;
4487 cpumask_t groupmask; 4487 cpumask_t groupmask;
4488 4488
4489 cpumask_scnprintf(str, NR_CPUS, sd->span); 4489 cpumask_scnprintf(str, NR_CPUS, sd->span);
4490 cpus_clear(groupmask); 4490 cpus_clear(groupmask);
4491 4491
4492 printk(KERN_DEBUG); 4492 printk(KERN_DEBUG);
4493 for (i = 0; i < level + 1; i++) 4493 for (i = 0; i < level + 1; i++)
4494 printk(" "); 4494 printk(" ");
4495 printk("domain %d: ", level); 4495 printk("domain %d: ", level);
4496 4496
4497 if (!(sd->flags & SD_LOAD_BALANCE)) { 4497 if (!(sd->flags & SD_LOAD_BALANCE)) {
4498 printk("does not load-balance\n"); 4498 printk("does not load-balance\n");
4499 if (sd->parent) 4499 if (sd->parent)
4500 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); 4500 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
4501 break; 4501 break;
4502 } 4502 }
4503 4503
4504 printk("span %s\n", str); 4504 printk("span %s\n", str);
4505 4505
4506 if (!cpu_isset(cpu, sd->span)) 4506 if (!cpu_isset(cpu, sd->span))
4507 printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); 4507 printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
4508 if (!cpu_isset(cpu, group->cpumask)) 4508 if (!cpu_isset(cpu, group->cpumask))
4509 printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); 4509 printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
4510 4510
4511 printk(KERN_DEBUG); 4511 printk(KERN_DEBUG);
4512 for (i = 0; i < level + 2; i++) 4512 for (i = 0; i < level + 2; i++)
4513 printk(" "); 4513 printk(" ");
4514 printk("groups:"); 4514 printk("groups:");
4515 do { 4515 do {
4516 if (!group) { 4516 if (!group) {
4517 printk("\n"); 4517 printk("\n");
4518 printk(KERN_ERR "ERROR: group is NULL\n"); 4518 printk(KERN_ERR "ERROR: group is NULL\n");
4519 break; 4519 break;
4520 } 4520 }
4521 4521
4522 if (!group->cpu_power) { 4522 if (!group->cpu_power) {
4523 printk("\n"); 4523 printk("\n");
4524 printk(KERN_ERR "ERROR: domain->cpu_power not set\n"); 4524 printk(KERN_ERR "ERROR: domain->cpu_power not set\n");
4525 } 4525 }
4526 4526
4527 if (!cpus_weight(group->cpumask)) { 4527 if (!cpus_weight(group->cpumask)) {
4528 printk("\n"); 4528 printk("\n");
4529 printk(KERN_ERR "ERROR: empty group\n"); 4529 printk(KERN_ERR "ERROR: empty group\n");
4530 } 4530 }
4531 4531
4532 if (cpus_intersects(groupmask, group->cpumask)) { 4532 if (cpus_intersects(groupmask, group->cpumask)) {
4533 printk("\n"); 4533 printk("\n");
4534 printk(KERN_ERR "ERROR: repeated CPUs\n"); 4534 printk(KERN_ERR "ERROR: repeated CPUs\n");
4535 } 4535 }
4536 4536
4537 cpus_or(groupmask, groupmask, group->cpumask); 4537 cpus_or(groupmask, groupmask, group->cpumask);
4538 4538
4539 cpumask_scnprintf(str, NR_CPUS, group->cpumask); 4539 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
4540 printk(" %s", str); 4540 printk(" %s", str);
4541 4541
4542 group = group->next; 4542 group = group->next;
4543 } while (group != sd->groups); 4543 } while (group != sd->groups);
4544 printk("\n"); 4544 printk("\n");
4545 4545
4546 if (!cpus_equal(sd->span, groupmask)) 4546 if (!cpus_equal(sd->span, groupmask))
4547 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 4547 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
4548 4548
4549 level++; 4549 level++;
4550 sd = sd->parent; 4550 sd = sd->parent;
4551 4551
4552 if (sd) { 4552 if (sd) {
4553 if (!cpus_subset(groupmask, sd->span)) 4553 if (!cpus_subset(groupmask, sd->span))
4554 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); 4554 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
4555 } 4555 }
4556 4556
4557 } while (sd); 4557 } while (sd);
4558 } 4558 }
4559 #else 4559 #else
4560 #define sched_domain_debug(sd, cpu) {} 4560 #define sched_domain_debug(sd, cpu) {}
4561 #endif 4561 #endif
4562 4562
4563 /* 4563 /*
4564 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 4564 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
4565 * hold the hotplug lock. 4565 * hold the hotplug lock.
4566 */ 4566 */
4567 void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) 4567 void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu)
4568 { 4568 {
4569 migration_req_t req; 4569 migration_req_t req;
4570 unsigned long flags; 4570 unsigned long flags;
4571 runqueue_t *rq = cpu_rq(cpu); 4571 runqueue_t *rq = cpu_rq(cpu);
4572 int local = 1; 4572 int local = 1;
4573 4573
4574 sched_domain_debug(sd, cpu); 4574 sched_domain_debug(sd, cpu);
4575 4575
4576 spin_lock_irqsave(&rq->lock, flags); 4576 spin_lock_irqsave(&rq->lock, flags);
4577 4577
4578 if (cpu == smp_processor_id() || !cpu_online(cpu)) { 4578 if (cpu == smp_processor_id() || !cpu_online(cpu)) {
4579 rq->sd = sd; 4579 rq->sd = sd;
4580 } else { 4580 } else {
4581 init_completion(&req.done); 4581 init_completion(&req.done);
4582 req.type = REQ_SET_DOMAIN; 4582 req.type = REQ_SET_DOMAIN;
4583 req.sd = sd; 4583 req.sd = sd;
4584 list_add(&req.list, &rq->migration_queue); 4584 list_add(&req.list, &rq->migration_queue);
4585 local = 0; 4585 local = 0;
4586 } 4586 }
4587 4587
4588 spin_unlock_irqrestore(&rq->lock, flags); 4588 spin_unlock_irqrestore(&rq->lock, flags);
4589 4589
4590 if (!local) { 4590 if (!local) {
4591 wake_up_process(rq->migration_thread); 4591 wake_up_process(rq->migration_thread);
4592 wait_for_completion(&req.done); 4592 wait_for_completion(&req.done);
4593 } 4593 }
4594 } 4594 }
4595 4595
4596 /* cpus with isolated domains */ 4596 /* cpus with isolated domains */
4597 cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; 4597 cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
4598 4598
4599 /* Setup the mask of cpus configured for isolated domains */ 4599 /* Setup the mask of cpus configured for isolated domains */
4600 static int __init isolated_cpu_setup(char *str) 4600 static int __init isolated_cpu_setup(char *str)
4601 { 4601 {
4602 int ints[NR_CPUS], i; 4602 int ints[NR_CPUS], i;
4603 4603
4604 str = get_options(str, ARRAY_SIZE(ints), ints); 4604 str = get_options(str, ARRAY_SIZE(ints), ints);
4605 cpus_clear(cpu_isolated_map); 4605 cpus_clear(cpu_isolated_map);
4606 for (i = 1; i <= ints[0]; i++) 4606 for (i = 1; i <= ints[0]; i++)
4607 if (ints[i] < NR_CPUS) 4607 if (ints[i] < NR_CPUS)
4608 cpu_set(ints[i], cpu_isolated_map); 4608 cpu_set(ints[i], cpu_isolated_map);
4609 return 1; 4609 return 1;
4610 } 4610 }
4611 4611
4612 __setup ("isolcpus=", isolated_cpu_setup); 4612 __setup ("isolcpus=", isolated_cpu_setup);
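Usage note (illustrative, not part of the diff): booting with, for example,

    isolcpus=2,3

on the kernel command line keeps CPUs 2 and 3 out of the default map built below, so they stay attached to the dummy domain and the load balancer ignores them; tasks can still be placed there explicitly via set_cpus_allowed()/sched_setaffinity().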
4613 4613
4614 /* 4614 /*
4615 * init_sched_build_groups takes an array of groups, the cpumask we wish 4615 * init_sched_build_groups takes an array of groups, the cpumask we wish
4616 * to span, and a pointer to a function which identifies what group a CPU 4616 * to span, and a pointer to a function which identifies what group a CPU
4617 * belongs to. The return value of group_fn must be a valid index into the 4617 * belongs to. The return value of group_fn must be a valid index into the
4618 * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we 4618 * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we
4619 * keep track of groups covered with a cpumask_t). 4619 * keep track of groups covered with a cpumask_t).
4620 * 4620 *
4621 * init_sched_build_groups will build a circular linked list of the groups 4621 * init_sched_build_groups will build a circular linked list of the groups
4622 * covered by the given span, and will set each group's ->cpumask correctly, 4622 * covered by the given span, and will set each group's ->cpumask correctly,
4623 * and ->cpu_power to 0. 4623 * and ->cpu_power to 0.
4624 */ 4624 */
4625 void __devinit init_sched_build_groups(struct sched_group groups[], 4625 void __devinit init_sched_build_groups(struct sched_group groups[],
4626 cpumask_t span, int (*group_fn)(int cpu)) 4626 cpumask_t span, int (*group_fn)(int cpu))
4627 { 4627 {
4628 struct sched_group *first = NULL, *last = NULL; 4628 struct sched_group *first = NULL, *last = NULL;
4629 cpumask_t covered = CPU_MASK_NONE; 4629 cpumask_t covered = CPU_MASK_NONE;
4630 int i; 4630 int i;
4631 4631
4632 for_each_cpu_mask(i, span) { 4632 for_each_cpu_mask(i, span) {
4633 int group = group_fn(i); 4633 int group = group_fn(i);
4634 struct sched_group *sg = &groups[group]; 4634 struct sched_group *sg = &groups[group];
4635 int j; 4635 int j;
4636 4636
4637 if (cpu_isset(i, covered)) 4637 if (cpu_isset(i, covered))
4638 continue; 4638 continue;
4639 4639
4640 sg->cpumask = CPU_MASK_NONE; 4640 sg->cpumask = CPU_MASK_NONE;
4641 sg->cpu_power = 0; 4641 sg->cpu_power = 0;
4642 4642
4643 for_each_cpu_mask(j, span) { 4643 for_each_cpu_mask(j, span) {
4644 if (group_fn(j) != group) 4644 if (group_fn(j) != group)
4645 continue; 4645 continue;
4646 4646
4647 cpu_set(j, covered); 4647 cpu_set(j, covered);
4648 cpu_set(j, sg->cpumask); 4648 cpu_set(j, sg->cpumask);
4649 } 4649 }
4650 if (!first) 4650 if (!first)
4651 first = sg; 4651 first = sg;
4652 if (last) 4652 if (last)
4653 last->next = sg; 4653 last->next = sg;
4654 last = sg; 4654 last = sg;
4655 } 4655 }
4656 last->next = first; 4656 last->next = first;
4657 } 4657 }
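A hedged illustration of the group_fn contract described above: the trivial mapping below puts every CPU in its own group (much like the SMT setup later in this file); the names and the call site are hypothetical.

    /* Hypothetical: one group per CPU over the online map. */
    static struct sched_group example_groups[NR_CPUS];

    static int example_group_fn(int cpu)
    {
        return cpu;             /* valid index: 0 <= cpu < NR_CPUS */
    }

    /* ...at domain-setup time, with the hotplug lock held: */
    init_sched_build_groups(example_groups, cpu_online_map,
                            &example_group_fn);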
4658 4658
4659 4659
4660 #ifdef ARCH_HAS_SCHED_DOMAIN 4660 #ifdef ARCH_HAS_SCHED_DOMAIN
4661 extern void __devinit arch_init_sched_domains(void); 4661 extern void __devinit arch_init_sched_domains(void);
4662 extern void __devinit arch_destroy_sched_domains(void); 4662 extern void __devinit arch_destroy_sched_domains(void);
4663 #else 4663 #else
4664 #ifdef CONFIG_SCHED_SMT 4664 #ifdef CONFIG_SCHED_SMT
4665 static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 4665 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
4666 static struct sched_group sched_group_cpus[NR_CPUS]; 4666 static struct sched_group sched_group_cpus[NR_CPUS];
4667 static int __devinit cpu_to_cpu_group(int cpu) 4667 static int __devinit cpu_to_cpu_group(int cpu)
4668 { 4668 {
4669 return cpu; 4669 return cpu;
4670 } 4670 }
4671 #endif 4671 #endif
4672 4672
4673 static DEFINE_PER_CPU(struct sched_domain, phys_domains); 4673 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
4674 static struct sched_group sched_group_phys[NR_CPUS]; 4674 static struct sched_group sched_group_phys[NR_CPUS];
4675 static int __devinit cpu_to_phys_group(int cpu) 4675 static int __devinit cpu_to_phys_group(int cpu)
4676 { 4676 {
4677 #ifdef CONFIG_SCHED_SMT 4677 #ifdef CONFIG_SCHED_SMT
4678 return first_cpu(cpu_sibling_map[cpu]); 4678 return first_cpu(cpu_sibling_map[cpu]);
4679 #else 4679 #else
4680 return cpu; 4680 return cpu;
4681 #endif 4681 #endif
4682 } 4682 }
4683 4683
4684 #ifdef CONFIG_NUMA 4684 #ifdef CONFIG_NUMA
4685 4685
4686 static DEFINE_PER_CPU(struct sched_domain, node_domains); 4686 static DEFINE_PER_CPU(struct sched_domain, node_domains);
4687 static struct sched_group sched_group_nodes[MAX_NUMNODES]; 4687 static struct sched_group sched_group_nodes[MAX_NUMNODES];
4688 static int __devinit cpu_to_node_group(int cpu) 4688 static int __devinit cpu_to_node_group(int cpu)
4689 { 4689 {
4690 return cpu_to_node(cpu); 4690 return cpu_to_node(cpu);
4691 } 4691 }
4692 #endif 4692 #endif
4693 4693
4694 #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) 4694 #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
4695 /* 4695 /*
4696 * The domains setup code relies on siblings not spanning 4696 * The domains setup code relies on siblings not spanning
4697 * multiple nodes. Make sure the architecture has a proper 4697 * multiple nodes. Make sure the architecture has a proper
4698 * siblings map: 4698 * siblings map:
4699 */ 4699 */
4700 static void check_sibling_maps(void) 4700 static void check_sibling_maps(void)
4701 { 4701 {
4702 int i, j; 4702 int i, j;
4703 4703
4704 for_each_online_cpu(i) { 4704 for_each_online_cpu(i) {
4705 for_each_cpu_mask(j, cpu_sibling_map[i]) { 4705 for_each_cpu_mask(j, cpu_sibling_map[i]) {
4706 if (cpu_to_node(i) != cpu_to_node(j)) { 4706 if (cpu_to_node(i) != cpu_to_node(j)) {
4707 printk(KERN_INFO "warning: CPU %d siblings map " 4707 printk(KERN_INFO "warning: CPU %d siblings map "
4708 "to different node - isolating " 4708 "to different node - isolating "
4709 "them.\n", i); 4709 "them.\n", i);
4710 cpu_sibling_map[i] = cpumask_of_cpu(i); 4710 cpu_sibling_map[i] = cpumask_of_cpu(i);
4711 break; 4711 break;
4712 } 4712 }
4713 } 4713 }
4714 } 4714 }
4715 } 4715 }
4716 #endif 4716 #endif
4717 4717
4718 /* 4718 /*
4719 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 4719 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
4720 */ 4720 */
4721 static void __devinit arch_init_sched_domains(void) 4721 static void __devinit arch_init_sched_domains(void)
4722 { 4722 {
4723 int i; 4723 int i;
4724 cpumask_t cpu_default_map; 4724 cpumask_t cpu_default_map;
4725 4725
4726 #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) 4726 #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
4727 check_sibling_maps(); 4727 check_sibling_maps();
4728 #endif 4728 #endif
4729 /* 4729 /*
4730 * Setup mask for cpus without special case scheduling requirements. 4730 * Setup mask for cpus without special case scheduling requirements.
4731 * For now this just excludes isolated cpus, but could be used to 4731 * For now this just excludes isolated cpus, but could be used to
4732 * exclude other special cases in the future. 4732 * exclude other special cases in the future.
4733 */ 4733 */
4734 cpus_complement(cpu_default_map, cpu_isolated_map); 4734 cpus_complement(cpu_default_map, cpu_isolated_map);
4735 cpus_and(cpu_default_map, cpu_default_map, cpu_online_map); 4735 cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
4736 4736
4737 /* 4737 /*
4738 * Set up domains. Isolated domains just stay on the dummy domain. 4738 * Set up domains. Isolated domains just stay on the dummy domain.
4739 */ 4739 */
4740 for_each_cpu_mask(i, cpu_default_map) { 4740 for_each_cpu_mask(i, cpu_default_map) {
4741 int group; 4741 int group;
4742 struct sched_domain *sd = NULL, *p; 4742 struct sched_domain *sd = NULL, *p;
4743 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); 4743 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
4744 4744
4745 cpus_and(nodemask, nodemask, cpu_default_map); 4745 cpus_and(nodemask, nodemask, cpu_default_map);
4746 4746
4747 #ifdef CONFIG_NUMA 4747 #ifdef CONFIG_NUMA
4748 sd = &per_cpu(node_domains, i); 4748 sd = &per_cpu(node_domains, i);
4749 group = cpu_to_node_group(i); 4749 group = cpu_to_node_group(i);
4750 *sd = SD_NODE_INIT; 4750 *sd = SD_NODE_INIT;
4751 sd->span = cpu_default_map; 4751 sd->span = cpu_default_map;
4752 sd->groups = &sched_group_nodes[group]; 4752 sd->groups = &sched_group_nodes[group];
4753 #endif 4753 #endif
4754 4754
4755 p = sd; 4755 p = sd;
4756 sd = &per_cpu(phys_domains, i); 4756 sd = &per_cpu(phys_domains, i);
4757 group = cpu_to_phys_group(i); 4757 group = cpu_to_phys_group(i);
4758 *sd = SD_CPU_INIT; 4758 *sd = SD_CPU_INIT;
4759 sd->span = nodemask; 4759 sd->span = nodemask;
4760 sd->parent = p; 4760 sd->parent = p;
4761 sd->groups = &sched_group_phys[group]; 4761 sd->groups = &sched_group_phys[group];
4762 4762
4763 #ifdef CONFIG_SCHED_SMT 4763 #ifdef CONFIG_SCHED_SMT
4764 p = sd; 4764 p = sd;
4765 sd = &per_cpu(cpu_domains, i); 4765 sd = &per_cpu(cpu_domains, i);
4766 group = cpu_to_cpu_group(i); 4766 group = cpu_to_cpu_group(i);
4767 *sd = SD_SIBLING_INIT; 4767 *sd = SD_SIBLING_INIT;
4768 sd->span = cpu_sibling_map[i]; 4768 sd->span = cpu_sibling_map[i];
4769 cpus_and(sd->span, sd->span, cpu_default_map); 4769 cpus_and(sd->span, sd->span, cpu_default_map);
4770 sd->parent = p; 4770 sd->parent = p;
4771 sd->groups = &sched_group_cpus[group]; 4771 sd->groups = &sched_group_cpus[group];
4772 #endif 4772 #endif
4773 } 4773 }
4774 4774
4775 #ifdef CONFIG_SCHED_SMT 4775 #ifdef CONFIG_SCHED_SMT
4776 /* Set up CPU (sibling) groups */ 4776 /* Set up CPU (sibling) groups */
4777 for_each_online_cpu(i) { 4777 for_each_online_cpu(i) {
4778 cpumask_t this_sibling_map = cpu_sibling_map[i]; 4778 cpumask_t this_sibling_map = cpu_sibling_map[i];
4779 cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); 4779 cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
4780 if (i != first_cpu(this_sibling_map)) 4780 if (i != first_cpu(this_sibling_map))
4781 continue; 4781 continue;
4782 4782
4783 init_sched_build_groups(sched_group_cpus, this_sibling_map, 4783 init_sched_build_groups(sched_group_cpus, this_sibling_map,
4784 &cpu_to_cpu_group); 4784 &cpu_to_cpu_group);
4785 } 4785 }
4786 #endif 4786 #endif
4787 4787
4788 /* Set up physical groups */ 4788 /* Set up physical groups */
4789 for (i = 0; i < MAX_NUMNODES; i++) { 4789 for (i = 0; i < MAX_NUMNODES; i++) {
4790 cpumask_t nodemask = node_to_cpumask(i); 4790 cpumask_t nodemask = node_to_cpumask(i);
4791 4791
4792 cpus_and(nodemask, nodemask, cpu_default_map); 4792 cpus_and(nodemask, nodemask, cpu_default_map);
4793 if (cpus_empty(nodemask)) 4793 if (cpus_empty(nodemask))
4794 continue; 4794 continue;
4795 4795
4796 init_sched_build_groups(sched_group_phys, nodemask, 4796 init_sched_build_groups(sched_group_phys, nodemask,
4797 &cpu_to_phys_group); 4797 &cpu_to_phys_group);
4798 } 4798 }
4799 4799
4800 #ifdef CONFIG_NUMA 4800 #ifdef CONFIG_NUMA
4801 /* Set up node groups */ 4801 /* Set up node groups */
4802 init_sched_build_groups(sched_group_nodes, cpu_default_map, 4802 init_sched_build_groups(sched_group_nodes, cpu_default_map,
4803 &cpu_to_node_group); 4803 &cpu_to_node_group);
4804 #endif 4804 #endif
4805 4805
4806 /* Calculate CPU power for physical packages and nodes */ 4806 /* Calculate CPU power for physical packages and nodes */
4807 for_each_cpu_mask(i, cpu_default_map) { 4807 for_each_cpu_mask(i, cpu_default_map) {
4808 int power; 4808 int power;
4809 struct sched_domain *sd; 4809 struct sched_domain *sd;
4810 #ifdef CONFIG_SCHED_SMT 4810 #ifdef CONFIG_SCHED_SMT
4811 sd = &per_cpu(cpu_domains, i); 4811 sd = &per_cpu(cpu_domains, i);
4812 power = SCHED_LOAD_SCALE; 4812 power = SCHED_LOAD_SCALE;
4813 sd->groups->cpu_power = power; 4813 sd->groups->cpu_power = power;
4814 #endif 4814 #endif
4815 4815
4816 sd = &per_cpu(phys_domains, i); 4816 sd = &per_cpu(phys_domains, i);
4817 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * 4817 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
4818 (cpus_weight(sd->groups->cpumask)-1) / 10; 4818 (cpus_weight(sd->groups->cpumask)-1) / 10;
4819 sd->groups->cpu_power = power; 4819 sd->groups->cpu_power = power;
4820 4820
4821 #ifdef CONFIG_NUMA 4821 #ifdef CONFIG_NUMA
4822 if (i == first_cpu(sd->groups->cpumask)) { 4822 if (i == first_cpu(sd->groups->cpumask)) {
4823 /* Only add "power" once for each physical package. */ 4823 /* Only add "power" once for each physical package. */
4824 sd = &per_cpu(node_domains, i); 4824 sd = &per_cpu(node_domains, i);
4825 sd->groups->cpu_power += power; 4825 sd->groups->cpu_power += power;
4826 } 4826 }
4827 #endif 4827 #endif
4828 } 4828 }
4829 4829
4830 /* Attach the domains */ 4830 /* Attach the domains */
4831 for_each_online_cpu(i) { 4831 for_each_online_cpu(i) {
4832 struct sched_domain *sd; 4832 struct sched_domain *sd;
4833 #ifdef CONFIG_SCHED_SMT 4833 #ifdef CONFIG_SCHED_SMT
4834 sd = &per_cpu(cpu_domains, i); 4834 sd = &per_cpu(cpu_domains, i);
4835 #else 4835 #else
4836 sd = &per_cpu(phys_domains, i); 4836 sd = &per_cpu(phys_domains, i);
4837 #endif 4837 #endif
4838 cpu_attach_domain(sd, i); 4838 cpu_attach_domain(sd, i);
4839 } 4839 }
4840 } 4840 }
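Worked example of the physical-package cpu_power formula above, assuming SCHED_LOAD_SCALE is 128 (its conventional value; check the tree you are reading):

    /*
     *   2 CPUs in the package: 128 + 128 * (2 - 1) / 10 = 128 + 12 = 140
     *   4 CPUs in the package: 128 + 128 * (4 - 1) / 10 = 128 + 38 = 166
     *
     * Extra siblings add only ~10% each, so the balancer prefers spreading
     * load across packages before stacking it onto siblings of one package.
     */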
4841 4841
4842 #ifdef CONFIG_HOTPLUG_CPU 4842 #ifdef CONFIG_HOTPLUG_CPU
4843 static void __devinit arch_destroy_sched_domains(void) 4843 static void __devinit arch_destroy_sched_domains(void)
4844 { 4844 {
4845 /* Do nothing: everything is statically allocated. */ 4845 /* Do nothing: everything is statically allocated. */
4846 } 4846 }
4847 #endif 4847 #endif
4848 4848
4849 #endif /* ARCH_HAS_SCHED_DOMAIN */ 4849 #endif /* ARCH_HAS_SCHED_DOMAIN */
4850 4850
4851 /* 4851 /*
4852 * Initial dummy domain for early boot and for hotplug cpu. Being static, 4852 * Initial dummy domain for early boot and for hotplug cpu. Being static,
4853 * it is initialized to zero, so all balancing flags are cleared which is 4853 * it is initialized to zero, so all balancing flags are cleared which is
4854 * what we want. 4854 * what we want.
4855 */ 4855 */
4856 static struct sched_domain sched_domain_dummy; 4856 static struct sched_domain sched_domain_dummy;
4857 4857
4858 #ifdef CONFIG_HOTPLUG_CPU 4858 #ifdef CONFIG_HOTPLUG_CPU
4859 /* 4859 /*
4860 * Force a reinitialization of the sched domains hierarchy. The domains 4860 * Force a reinitialization of the sched domains hierarchy. The domains
4861 * and groups cannot be updated in place without racing with the balancing 4861 * and groups cannot be updated in place without racing with the balancing
4862 * code, so we temporarily attach all running cpus to a "dummy" domain 4862 * code, so we temporarily attach all running cpus to a "dummy" domain
4863 * which will prevent rebalancing while the sched domains are recalculated. 4863 * which will prevent rebalancing while the sched domains are recalculated.
4864 */ 4864 */
4865 static int update_sched_domains(struct notifier_block *nfb, 4865 static int update_sched_domains(struct notifier_block *nfb,
4866 unsigned long action, void *hcpu) 4866 unsigned long action, void *hcpu)
4867 { 4867 {
4868 int i; 4868 int i;
4869 4869
4870 switch (action) { 4870 switch (action) {
4871 case CPU_UP_PREPARE: 4871 case CPU_UP_PREPARE:
4872 case CPU_DOWN_PREPARE: 4872 case CPU_DOWN_PREPARE:
4873 for_each_online_cpu(i) 4873 for_each_online_cpu(i)
4874 cpu_attach_domain(&sched_domain_dummy, i); 4874 cpu_attach_domain(&sched_domain_dummy, i);
4875 arch_destroy_sched_domains(); 4875 arch_destroy_sched_domains();
4876 return NOTIFY_OK; 4876 return NOTIFY_OK;
4877 4877
4878 case CPU_UP_CANCELED: 4878 case CPU_UP_CANCELED:
4879 case CPU_DOWN_FAILED: 4879 case CPU_DOWN_FAILED:
4880 case CPU_ONLINE: 4880 case CPU_ONLINE:
4881 case CPU_DEAD: 4881 case CPU_DEAD:
4882 /* 4882 /*
4883 * Fall through and re-initialise the domains. 4883 * Fall through and re-initialise the domains.
4884 */ 4884 */
4885 break; 4885 break;
4886 default: 4886 default:
4887 return NOTIFY_DONE; 4887 return NOTIFY_DONE;
4888 } 4888 }
4889 4889
4890 /* The hotplug lock is already held by cpu_up/cpu_down */ 4890 /* The hotplug lock is already held by cpu_up/cpu_down */
4891 arch_init_sched_domains(); 4891 arch_init_sched_domains();
4892 4892
4893 return NOTIFY_OK; 4893 return NOTIFY_OK;
4894 } 4894 }
4895 #endif 4895 #endif
4896 4896
4897 void __init sched_init_smp(void) 4897 void __init sched_init_smp(void)
4898 { 4898 {
4899 lock_cpu_hotplug(); 4899 lock_cpu_hotplug();
4900 arch_init_sched_domains(); 4900 arch_init_sched_domains();
4901 unlock_cpu_hotplug(); 4901 unlock_cpu_hotplug();
4902 /* XXX: Theoretical race here - CPU may be hotplugged now */ 4902 /* XXX: Theoretical race here - CPU may be hotplugged now */
4903 hotcpu_notifier(update_sched_domains, 0); 4903 hotcpu_notifier(update_sched_domains, 0);
4904 } 4904 }
4905 #else 4905 #else
4906 void __init sched_init_smp(void) 4906 void __init sched_init_smp(void)
4907 { 4907 {
4908 } 4908 }
4909 #endif /* CONFIG_SMP */ 4909 #endif /* CONFIG_SMP */
4910 4910
4911 int in_sched_functions(unsigned long addr) 4911 int in_sched_functions(unsigned long addr)
4912 { 4912 {
4913 /* Linker adds these: start and end of __sched functions */ 4913 /* Linker adds these: start and end of __sched functions */
4914 extern char __sched_text_start[], __sched_text_end[]; 4914 extern char __sched_text_start[], __sched_text_end[];
4915 return in_lock_functions(addr) || 4915 return in_lock_functions(addr) ||
4916 (addr >= (unsigned long)__sched_text_start 4916 (addr >= (unsigned long)__sched_text_start
4917 && addr < (unsigned long)__sched_text_end); 4917 && addr < (unsigned long)__sched_text_end);
4918 } 4918 }
4919 4919
4920 void __init sched_init(void) 4920 void __init sched_init(void)
4921 { 4921 {
4922 runqueue_t *rq; 4922 runqueue_t *rq;
4923 int i, j, k; 4923 int i, j, k;
4924 4924
4925 for (i = 0; i < NR_CPUS; i++) { 4925 for (i = 0; i < NR_CPUS; i++) {
4926 prio_array_t *array; 4926 prio_array_t *array;
4927 4927
4928 rq = cpu_rq(i); 4928 rq = cpu_rq(i);
4929 spin_lock_init(&rq->lock); 4929 spin_lock_init(&rq->lock);
4930 rq->active = rq->arrays; 4930 rq->active = rq->arrays;
4931 rq->expired = rq->arrays + 1; 4931 rq->expired = rq->arrays + 1;
4932 rq->best_expired_prio = MAX_PRIO; 4932 rq->best_expired_prio = MAX_PRIO;
4933 4933
4934 #ifdef CONFIG_SMP 4934 #ifdef CONFIG_SMP
4935 rq->sd = &sched_domain_dummy; 4935 rq->sd = &sched_domain_dummy;
4936 rq->cpu_load = 0; 4936 rq->cpu_load = 0;
4937 rq->active_balance = 0; 4937 rq->active_balance = 0;
4938 rq->push_cpu = 0; 4938 rq->push_cpu = 0;
4939 rq->migration_thread = NULL; 4939 rq->migration_thread = NULL;
4940 INIT_LIST_HEAD(&rq->migration_queue); 4940 INIT_LIST_HEAD(&rq->migration_queue);
4941 #endif 4941 #endif
4942 atomic_set(&rq->nr_iowait, 0); 4942 atomic_set(&rq->nr_iowait, 0);
4943 4943
4944 for (j = 0; j < 2; j++) { 4944 for (j = 0; j < 2; j++) {
4945 array = rq->arrays + j; 4945 array = rq->arrays + j;
4946 for (k = 0; k < MAX_PRIO; k++) { 4946 for (k = 0; k < MAX_PRIO; k++) {
4947 INIT_LIST_HEAD(array->queue + k); 4947 INIT_LIST_HEAD(array->queue + k);
4948 __clear_bit(k, array->bitmap); 4948 __clear_bit(k, array->bitmap);
4949 } 4949 }
4950 // delimiter for bitsearch 4950 // delimiter for bitsearch
4951 __set_bit(MAX_PRIO, array->bitmap); 4951 __set_bit(MAX_PRIO, array->bitmap);
4952 } 4952 }
4953 } 4953 }
4954 4954
4955 /* 4955 /*
4956 * The boot idle thread does lazy MMU switching as well: 4956 * The boot idle thread does lazy MMU switching as well:
4957 */ 4957 */
4958 atomic_inc(&init_mm.mm_count); 4958 atomic_inc(&init_mm.mm_count);
4959 enter_lazy_tlb(&init_mm, current); 4959 enter_lazy_tlb(&init_mm, current);
4960 4960
4961 /* 4961 /*
4962 * Make us the idle thread. Technically, schedule() should not be 4962 * Make us the idle thread. Technically, schedule() should not be
4963 * called from this thread, however somewhere below it might be, 4963 * called from this thread, however somewhere below it might be,
4964 * but because we are the idle thread, we just pick up running again 4964 * but because we are the idle thread, we just pick up running again
4965 * when this runqueue becomes "idle". 4965 * when this runqueue becomes "idle".
4966 */ 4966 */
4967 init_idle(current, smp_processor_id()); 4967 init_idle(current, smp_processor_id());
4968 } 4968 }
4969 4969
4970 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 4970 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4971 void __might_sleep(char *file, int line) 4971 void __might_sleep(char *file, int line)
4972 { 4972 {
4973 #if defined(in_atomic) 4973 #if defined(in_atomic)
4974 static unsigned long prev_jiffy; /* ratelimiting */ 4974 static unsigned long prev_jiffy; /* ratelimiting */
4975 4975
4976 if ((in_atomic() || irqs_disabled()) && 4976 if ((in_atomic() || irqs_disabled()) &&
4977 system_state == SYSTEM_RUNNING && !oops_in_progress) { 4977 system_state == SYSTEM_RUNNING && !oops_in_progress) {
4978 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 4978 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
4979 return; 4979 return;
4980 prev_jiffy = jiffies; 4980 prev_jiffy = jiffies;
4981 printk(KERN_ERR "Debug: sleeping function called from invalid" 4981 printk(KERN_ERR "Debug: sleeping function called from invalid"
4982 " context at %s:%d\n", file, line); 4982 " context at %s:%d\n", file, line);
4983 printk("in_atomic():%d, irqs_disabled():%d\n", 4983 printk("in_atomic():%d, irqs_disabled():%d\n",
4984 in_atomic(), irqs_disabled()); 4984 in_atomic(), irqs_disabled());
4985 dump_stack(); 4985 dump_stack();
4986 } 4986 }
4987 #endif 4987 #endif
4988 } 4988 }
4989 EXPORT_SYMBOL(__might_sleep); 4989 EXPORT_SYMBOL(__might_sleep);
4990 #endif 4990 #endif
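A hedged usage sketch of the check above: callers use the might_sleep() macro, which expands to __might_sleep(__FILE__, __LINE__) when CONFIG_DEBUG_SPINLOCK_SLEEP is set; the helper below is hypothetical.

    #include <linux/kernel.h>
    #include <linux/slab.h>

    /* Hypothetical helper: documents (and, with the debug option enabled,
     * verifies) that this function may sleep. */
    static void *my_buf_alloc(size_t size)
    {
        might_sleep();                  /* warns if called in atomic context */
        return kmalloc(size, GFP_KERNEL);
    }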
4991 4991
4992 #ifdef CONFIG_MAGIC_SYSRQ 4992 #ifdef CONFIG_MAGIC_SYSRQ
4993 void normalize_rt_tasks(void) 4993 void normalize_rt_tasks(void)
4994 { 4994 {
4995 struct task_struct *p; 4995 struct task_struct *p;
4996 prio_array_t *array; 4996 prio_array_t *array;
4997 unsigned long flags; 4997 unsigned long flags;
4998 runqueue_t *rq; 4998 runqueue_t *rq;
4999 4999
5000 read_lock_irq(&tasklist_lock); 5000 read_lock_irq(&tasklist_lock);
5001 for_each_process (p) { 5001 for_each_process (p) {
5002 if (!rt_task(p)) 5002 if (!rt_task(p))
5003 continue; 5003 continue;
5004 5004
5005 rq = task_rq_lock(p, &flags); 5005 rq = task_rq_lock(p, &flags);
5006 5006
5007 array = p->array; 5007 array = p->array;
5008 if (array) 5008 if (array)
5009 deactivate_task(p, task_rq(p)); 5009 deactivate_task(p, task_rq(p));
5010 __setscheduler(p, SCHED_NORMAL, 0); 5010 __setscheduler(p, SCHED_NORMAL, 0);
5011 if (array) { 5011 if (array) {
5012 __activate_task(p, task_rq(p)); 5012 __activate_task(p, task_rq(p));
5013 resched_task(rq->curr); 5013 resched_task(rq->curr);
5014 } 5014 }
5015 5015
5016 task_rq_unlock(rq, &flags); 5016 task_rq_unlock(rq, &flags);
5017 } 5017 }
5018 read_unlock_irq(&tasklist_lock); 5018 read_unlock_irq(&tasklist_lock);
5019 } 5019 }
5020 5020
5021 #endif /* CONFIG_MAGIC_SYSRQ */ 5021 #endif /* CONFIG_MAGIC_SYSRQ */
5022 5022
kernel/stop_machine.c
1 #include <linux/stop_machine.h> 1 #include <linux/stop_machine.h>
2 #include <linux/kthread.h> 2 #include <linux/kthread.h>
3 #include <linux/sched.h> 3 #include <linux/sched.h>
4 #include <linux/cpu.h> 4 #include <linux/cpu.h>
5 #include <linux/err.h> 5 #include <linux/err.h>
6 #include <linux/syscalls.h> 6 #include <linux/syscalls.h>
7 #include <asm/atomic.h> 7 #include <asm/atomic.h>
8 #include <asm/semaphore.h> 8 #include <asm/semaphore.h>
9 #include <asm/uaccess.h> 9 #include <asm/uaccess.h>
10 10
11 /* Since we affect priority and affinity (both of which are visible 11 /* Since we affect priority and affinity (both of which are visible
12 * to, and settable by outside processes) we do indirection via a 12 * to, and settable by outside processes) we do indirection via a
13 * kthread. */ 13 * kthread. */
14 14
15 /* Thread to stop each CPU in user context. */ 15 /* Thread to stop each CPU in user context. */
16 enum stopmachine_state { 16 enum stopmachine_state {
17 STOPMACHINE_WAIT, 17 STOPMACHINE_WAIT,
18 STOPMACHINE_PREPARE, 18 STOPMACHINE_PREPARE,
19 STOPMACHINE_DISABLE_IRQ, 19 STOPMACHINE_DISABLE_IRQ,
20 STOPMACHINE_EXIT, 20 STOPMACHINE_EXIT,
21 }; 21 };
22 22
23 static enum stopmachine_state stopmachine_state; 23 static enum stopmachine_state stopmachine_state;
24 static unsigned int stopmachine_num_threads; 24 static unsigned int stopmachine_num_threads;
25 static atomic_t stopmachine_thread_ack; 25 static atomic_t stopmachine_thread_ack;
26 static DECLARE_MUTEX(stopmachine_mutex); 26 static DECLARE_MUTEX(stopmachine_mutex);
27 27
28 static int stopmachine(void *cpu) 28 static int stopmachine(void *cpu)
29 { 29 {
30 int irqs_disabled = 0; 30 int irqs_disabled = 0;
31 int prepared = 0; 31 int prepared = 0;
32 32
33 set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu)); 33 set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu));
34 34
35 /* Ack: we are alive */ 35 /* Ack: we are alive */
36 smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ 36 smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */
37 atomic_inc(&stopmachine_thread_ack); 37 atomic_inc(&stopmachine_thread_ack);
38 38
39 /* Simple state machine */ 39 /* Simple state machine */
40 while (stopmachine_state != STOPMACHINE_EXIT) { 40 while (stopmachine_state != STOPMACHINE_EXIT) {
41 if (stopmachine_state == STOPMACHINE_DISABLE_IRQ 41 if (stopmachine_state == STOPMACHINE_DISABLE_IRQ
42 && !irqs_disabled) { 42 && !irqs_disabled) {
43 local_irq_disable(); 43 local_irq_disable();
44 irqs_disabled = 1; 44 irqs_disabled = 1;
45 /* Ack: irqs disabled. */ 45 /* Ack: irqs disabled. */
46 smp_mb(); /* Must read state first. */ 46 smp_mb(); /* Must read state first. */
47 atomic_inc(&stopmachine_thread_ack); 47 atomic_inc(&stopmachine_thread_ack);
48 } else if (stopmachine_state == STOPMACHINE_PREPARE 48 } else if (stopmachine_state == STOPMACHINE_PREPARE
49 && !prepared) { 49 && !prepared) {
50 /* Everyone is in place, hold CPU. */ 50 /* Everyone is in place, hold CPU. */
51 preempt_disable(); 51 preempt_disable();
52 prepared = 1; 52 prepared = 1;
53 smp_mb(); /* Must read state first. */ 53 smp_mb(); /* Must read state first. */
54 atomic_inc(&stopmachine_thread_ack); 54 atomic_inc(&stopmachine_thread_ack);
55 } 55 }
56 /* Yield in first stage: migration threads need to 56 /* Yield in first stage: migration threads need to
57 * help our sisters onto their CPUs. */ 57 * help our sisters onto their CPUs. */
58 if (!prepared && !irqs_disabled) 58 if (!prepared && !irqs_disabled)
59 yield(); 59 yield();
60 else 60 else
61 cpu_relax(); 61 cpu_relax();
62 } 62 }
63 63
64 /* Ack: we are exiting. */ 64 /* Ack: we are exiting. */
65 smp_mb(); /* Must read state first. */ 65 smp_mb(); /* Must read state first. */
66 atomic_inc(&stopmachine_thread_ack); 66 atomic_inc(&stopmachine_thread_ack);
67 67
68 if (irqs_disabled) 68 if (irqs_disabled)
69 local_irq_enable(); 69 local_irq_enable();
70 if (prepared) 70 if (prepared)
71 preempt_enable(); 71 preempt_enable();
72 72
73 return 0; 73 return 0;
74 } 74 }
75 75
76 /* Change the thread state */ 76 /* Change the thread state */
77 static void stopmachine_set_state(enum stopmachine_state state) 77 static void stopmachine_set_state(enum stopmachine_state state)
78 { 78 {
79 atomic_set(&stopmachine_thread_ack, 0); 79 atomic_set(&stopmachine_thread_ack, 0);
80 smp_wmb(); 80 smp_wmb();
81 stopmachine_state = state; 81 stopmachine_state = state;
82 while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) 82 while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads)
83 cpu_relax(); 83 cpu_relax();
84 } 84 }
85 85
86 static int stop_machine(void) 86 static int stop_machine(void)
87 { 87 {
88 int i, ret = 0; 88 int i, ret = 0;
89 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 89 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
90 mm_segment_t old_fs = get_fs(); 90 mm_segment_t old_fs = get_fs();
91 91
92 /* One high-prio thread per cpu. We'll do this one. */ 92 /* One high-prio thread per cpu. We'll do this one. */
93 set_fs(KERNEL_DS); 93 set_fs(KERNEL_DS);
94 sys_sched_setscheduler(current->pid, SCHED_FIFO, 94 sys_sched_setscheduler(current->pid, SCHED_FIFO,
95 (struct sched_param __user *)&param); 95 (struct sched_param __user *)&param);
96 set_fs(old_fs); 96 set_fs(old_fs);
97 97
98 atomic_set(&stopmachine_thread_ack, 0); 98 atomic_set(&stopmachine_thread_ack, 0);
99 stopmachine_num_threads = 0; 99 stopmachine_num_threads = 0;
100 stopmachine_state = STOPMACHINE_WAIT; 100 stopmachine_state = STOPMACHINE_WAIT;
101 101
102 for_each_online_cpu(i) { 102 for_each_online_cpu(i) {
103 if (i == _smp_processor_id()) 103 if (i == raw_smp_processor_id())
104 continue; 104 continue;
105 ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); 105 ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL);
106 if (ret < 0) 106 if (ret < 0)
107 break; 107 break;
108 stopmachine_num_threads++; 108 stopmachine_num_threads++;
109 } 109 }
110 110
111 /* Wait for them all to come to life. */ 111 /* Wait for them all to come to life. */
112 while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) 112 while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads)
113 yield(); 113 yield();
114 114
115 /* If some failed, kill them all. */ 115 /* If some failed, kill them all. */
116 if (ret < 0) { 116 if (ret < 0) {
117 stopmachine_set_state(STOPMACHINE_EXIT); 117 stopmachine_set_state(STOPMACHINE_EXIT);
118 up(&stopmachine_mutex); 118 up(&stopmachine_mutex);
119 return ret; 119 return ret;
120 } 120 }
121 121
122 /* Don't schedule us away at this point, please. */ 122 /* Don't schedule us away at this point, please. */
123 local_irq_disable(); 123 local_irq_disable();
124 124
125 /* Now they are all started, make them hold the CPUs, ready. */ 125 /* Now they are all started, make them hold the CPUs, ready. */
126 stopmachine_set_state(STOPMACHINE_PREPARE); 126 stopmachine_set_state(STOPMACHINE_PREPARE);
127 127
128 /* Make them disable irqs. */ 128 /* Make them disable irqs. */
129 stopmachine_set_state(STOPMACHINE_DISABLE_IRQ); 129 stopmachine_set_state(STOPMACHINE_DISABLE_IRQ);
130 130
131 return 0; 131 return 0;
132 } 132 }
133 133
134 static void restart_machine(void) 134 static void restart_machine(void)
135 { 135 {
136 stopmachine_set_state(STOPMACHINE_EXIT); 136 stopmachine_set_state(STOPMACHINE_EXIT);
137 local_irq_enable(); 137 local_irq_enable();
138 } 138 }
139 139
140 struct stop_machine_data 140 struct stop_machine_data
141 { 141 {
142 int (*fn)(void *); 142 int (*fn)(void *);
143 void *data; 143 void *data;
144 struct completion done; 144 struct completion done;
145 }; 145 };
146 146
147 static int do_stop(void *_smdata) 147 static int do_stop(void *_smdata)
148 { 148 {
149 struct stop_machine_data *smdata = _smdata; 149 struct stop_machine_data *smdata = _smdata;
150 int ret; 150 int ret;
151 151
152 ret = stop_machine(); 152 ret = stop_machine();
153 if (ret == 0) { 153 if (ret == 0) {
154 ret = smdata->fn(smdata->data); 154 ret = smdata->fn(smdata->data);
155 restart_machine(); 155 restart_machine();
156 } 156 }
157 157
158 /* We're done: you can kthread_stop us now */ 158 /* We're done: you can kthread_stop us now */
159 complete(&smdata->done); 159 complete(&smdata->done);
160 160
161 /* Wait for kthread_stop */ 161 /* Wait for kthread_stop */
162 set_current_state(TASK_INTERRUPTIBLE); 162 set_current_state(TASK_INTERRUPTIBLE);
163 while (!kthread_should_stop()) { 163 while (!kthread_should_stop()) {
164 schedule(); 164 schedule();
165 set_current_state(TASK_INTERRUPTIBLE); 165 set_current_state(TASK_INTERRUPTIBLE);
166 } 166 }
167 __set_current_state(TASK_RUNNING); 167 __set_current_state(TASK_RUNNING);
168 return ret; 168 return ret;
169 } 169 }
170 170
171 struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, 171 struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
172 unsigned int cpu) 172 unsigned int cpu)
173 { 173 {
174 struct stop_machine_data smdata; 174 struct stop_machine_data smdata;
175 struct task_struct *p; 175 struct task_struct *p;
176 176
177 smdata.fn = fn; 177 smdata.fn = fn;
178 smdata.data = data; 178 smdata.data = data;
179 init_completion(&smdata.done); 179 init_completion(&smdata.done);
180 180
181 down(&stopmachine_mutex); 181 down(&stopmachine_mutex);
182 182
183 /* If they don't care which CPU fn runs on, bind to any online one. */ 183 /* If they don't care which CPU fn runs on, bind to any online one. */
184 if (cpu == NR_CPUS) 184 if (cpu == NR_CPUS)
185 cpu = _smp_processor_id(); 185 cpu = raw_smp_processor_id();
186 186
187 p = kthread_create(do_stop, &smdata, "kstopmachine"); 187 p = kthread_create(do_stop, &smdata, "kstopmachine");
188 if (!IS_ERR(p)) { 188 if (!IS_ERR(p)) {
189 kthread_bind(p, cpu); 189 kthread_bind(p, cpu);
190 wake_up_process(p); 190 wake_up_process(p);
191 wait_for_completion(&smdata.done); 191 wait_for_completion(&smdata.done);
192 } 192 }
193 up(&stopmachine_mutex); 193 up(&stopmachine_mutex);
194 return p; 194 return p;
195 } 195 }
196 196
197 int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) 197 int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
198 { 198 {
199 struct task_struct *p; 199 struct task_struct *p;
200 int ret; 200 int ret;
201 201
202 /* No CPUs can come up or down during this. */ 202 /* No CPUs can come up or down during this. */
203 lock_cpu_hotplug(); 203 lock_cpu_hotplug();
204 p = __stop_machine_run(fn, data, cpu); 204 p = __stop_machine_run(fn, data, cpu);
205 if (!IS_ERR(p)) 205 if (!IS_ERR(p))
206 ret = kthread_stop(p); 206 ret = kthread_stop(p);
207 else 207 else
208 ret = PTR_ERR(p); 208 ret = PTR_ERR(p);
209 unlock_cpu_hotplug(); 209 unlock_cpu_hotplug();
210 210
211 return ret; 211 return ret;
212 } 212 }
213 213
lib/Makefile
1 # 1 #
2 # Makefile for some libs needed in the kernel. 2 # Makefile for some libs needed in the kernel.
3 # 3 #
4 4
5 lib-y := errno.o ctype.o string.o vsprintf.o cmdline.o \ 5 lib-y := errno.o ctype.o string.o vsprintf.o cmdline.o \
6 bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o \ 6 bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o \
7 idr.o div64.o int_sqrt.o bitmap.o extable.o prio_tree.o \ 7 idr.o div64.o int_sqrt.o bitmap.o extable.o prio_tree.o \
8 sha1.o halfmd4.o 8 sha1.o halfmd4.o
9 9
10 lib-y += kobject.o kref.o kobject_uevent.o klist.o 10 lib-y += kobject.o kref.o kobject_uevent.o klist.o
11 11
12 obj-y += sort.o parser.o 12 obj-y += sort.o parser.o
13 13
14 ifeq ($(CONFIG_DEBUG_KOBJECT),y) 14 ifeq ($(CONFIG_DEBUG_KOBJECT),y)
15 CFLAGS_kobject.o += -DDEBUG 15 CFLAGS_kobject.o += -DDEBUG
16 CFLAGS_kobject_uevent.o += -DDEBUG 16 CFLAGS_kobject_uevent.o += -DDEBUG
17 endif 17 endif
18 18
19 lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o 19 lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
20 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o 20 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
21 lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o 21 lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
22 obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o 22 obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
23 obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
23 24
24 ifneq ($(CONFIG_HAVE_DEC_LOCK),y) 25 ifneq ($(CONFIG_HAVE_DEC_LOCK),y)
25 lib-y += dec_and_lock.o 26 lib-y += dec_and_lock.o
26 endif 27 endif
27 28
28 obj-$(CONFIG_CRC_CCITT) += crc-ccitt.o 29 obj-$(CONFIG_CRC_CCITT) += crc-ccitt.o
29 obj-$(CONFIG_CRC32) += crc32.o 30 obj-$(CONFIG_CRC32) += crc32.o
30 obj-$(CONFIG_LIBCRC32C) += libcrc32c.o 31 obj-$(CONFIG_LIBCRC32C) += libcrc32c.o
31 obj-$(CONFIG_GENERIC_IOMAP) += iomap.o 32 obj-$(CONFIG_GENERIC_IOMAP) += iomap.o
32 33
33 obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/ 34 obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/
34 obj-$(CONFIG_ZLIB_DEFLATE) += zlib_deflate/ 35 obj-$(CONFIG_ZLIB_DEFLATE) += zlib_deflate/
35 obj-$(CONFIG_REED_SOLOMON) += reed_solomon/ 36 obj-$(CONFIG_REED_SOLOMON) += reed_solomon/
36 37
37 hostprogs-y := gen_crc32table 38 hostprogs-y := gen_crc32table
38 clean-files := crc32table.h 39 clean-files := crc32table.h
39 40
40 $(obj)/crc32.o: $(obj)/crc32table.h 41 $(obj)/crc32.o: $(obj)/crc32table.h
41 42
42 quiet_cmd_crc32 = GEN $@ 43 quiet_cmd_crc32 = GEN $@
43 cmd_crc32 = $< > $@ 44 cmd_crc32 = $< > $@
44 45
45 $(obj)/crc32table.h: $(obj)/gen_crc32table 46 $(obj)/crc32table.h: $(obj)/gen_crc32table
46 $(call cmd,crc32) 47 $(call cmd,crc32)
47 48
lib/kernel_lock.c
1 /* 1 /*
2 * lib/kernel_lock.c 2 * lib/kernel_lock.c
3 * 3 *
4 * This is the traditional BKL - big kernel lock. Largely 4 * This is the traditional BKL - big kernel lock. Largely
5 * relegated to obsolescense, but used by various less 5 * relegated to obsolescense, but used by various less
6 * important (or lazy) subsystems. 6 * important (or lazy) subsystems.
7 */ 7 */
8 #include <linux/smp_lock.h> 8 #include <linux/smp_lock.h>
9 #include <linux/module.h> 9 #include <linux/module.h>
10 #include <linux/kallsyms.h> 10 #include <linux/kallsyms.h>
11 11
12 #if defined(CONFIG_PREEMPT) && defined(__smp_processor_id) && \
13 defined(CONFIG_DEBUG_PREEMPT)
14
15 /*
16 * Debugging check.
17 */
18 unsigned int smp_processor_id(void)
19 {
20 unsigned long preempt_count = preempt_count();
21 int this_cpu = __smp_processor_id();
22 cpumask_t this_mask;
23
24 if (likely(preempt_count))
25 goto out;
26
27 if (irqs_disabled())
28 goto out;
29
30 /*
31 * Kernel threads bound to a single CPU can safely use
32 * smp_processor_id():
33 */
34 this_mask = cpumask_of_cpu(this_cpu);
35
36 if (cpus_equal(current->cpus_allowed, this_mask))
37 goto out;
38
39 /*
40 * It is valid to assume CPU-locality during early bootup:
41 */
42 if (system_state != SYSTEM_RUNNING)
43 goto out;
44
45 /*
46 * Avoid recursion:
47 */
48 preempt_disable();
49
50 if (!printk_ratelimit())
51 goto out_enable;
52
53 printk(KERN_ERR "BUG: using smp_processor_id() in preemptible [%08x] code: %s/%d\n", preempt_count(), current->comm, current->pid);
54 print_symbol("caller is %s\n", (long)__builtin_return_address(0));
55 dump_stack();
56
57 out_enable:
58 preempt_enable_no_resched();
59 out:
60 return this_cpu;
61 }
62
63 EXPORT_SYMBOL(smp_processor_id);
64
65 #endif /* PREEMPT && __smp_processor_id && DEBUG_PREEMPT */
66
67 #ifdef CONFIG_PREEMPT_BKL 12 #ifdef CONFIG_PREEMPT_BKL
68 /* 13 /*
69 * The 'big kernel semaphore' 14 * The 'big kernel semaphore'
70 * 15 *
71 * This mutex is taken and released recursively by lock_kernel() 16 * This mutex is taken and released recursively by lock_kernel()
72 * and unlock_kernel(). It is transparently dropped and reacquired 17 * and unlock_kernel(). It is transparently dropped and reacquired
73 * over schedule(). It is used to protect legacy code that hasn't 18 * over schedule(). It is used to protect legacy code that hasn't
74 * been migrated to a proper locking design yet. 19 * been migrated to a proper locking design yet.
75 * 20 *
76 * Note: code locked by this semaphore will only be serialized against 21 * Note: code locked by this semaphore will only be serialized against
77 * other code using the same locking facility. The code guarantees that 22 * other code using the same locking facility. The code guarantees that
78 * the task remains on the same CPU. 23 * the task remains on the same CPU.
79 * 24 *
80 * Don't use in new code. 25 * Don't use in new code.
81 */ 26 */
82 static DECLARE_MUTEX(kernel_sem); 27 static DECLARE_MUTEX(kernel_sem);
83 28
84 /* 29 /*
85 * Re-acquire the kernel semaphore. 30 * Re-acquire the kernel semaphore.
86 * 31 *
87 * This function is called with preemption off. 32 * This function is called with preemption off.
88 * 33 *
89 * We are executing in schedule() so the code must be extremely careful 34 * We are executing in schedule() so the code must be extremely careful
90 * about recursion, both due to the down() and due to the enabling of 35 * about recursion, both due to the down() and due to the enabling of
91 * preemption. schedule() will re-check the preemption flag after 36 * preemption. schedule() will re-check the preemption flag after
92 * reacquiring the semaphore. 37 * reacquiring the semaphore.
93 */ 38 */
94 int __lockfunc __reacquire_kernel_lock(void) 39 int __lockfunc __reacquire_kernel_lock(void)
95 { 40 {
96 struct task_struct *task = current; 41 struct task_struct *task = current;
97 int saved_lock_depth = task->lock_depth; 42 int saved_lock_depth = task->lock_depth;
98 43
99 BUG_ON(saved_lock_depth < 0); 44 BUG_ON(saved_lock_depth < 0);
100 45
101 task->lock_depth = -1; 46 task->lock_depth = -1;
102 preempt_enable_no_resched(); 47 preempt_enable_no_resched();
103 48
104 down(&kernel_sem); 49 down(&kernel_sem);
105 50
106 preempt_disable(); 51 preempt_disable();
107 task->lock_depth = saved_lock_depth; 52 task->lock_depth = saved_lock_depth;
108 53
109 return 0; 54 return 0;
110 } 55 }
111 56
112 void __lockfunc __release_kernel_lock(void) 57 void __lockfunc __release_kernel_lock(void)
113 { 58 {
114 up(&kernel_sem); 59 up(&kernel_sem);
115 } 60 }
116 61
117 /* 62 /*
118 * Getting the big kernel semaphore. 63 * Getting the big kernel semaphore.
119 */ 64 */
120 void __lockfunc lock_kernel(void) 65 void __lockfunc lock_kernel(void)
121 { 66 {
122 struct task_struct *task = current; 67 struct task_struct *task = current;
123 int depth = task->lock_depth + 1; 68 int depth = task->lock_depth + 1;
124 69
125 if (likely(!depth)) 70 if (likely(!depth))
126 /* 71 /*
127 * No recursion worries - we set up lock_depth _after_ 72 * No recursion worries - we set up lock_depth _after_
128 */ 73 */
129 down(&kernel_sem); 74 down(&kernel_sem);
130 75
131 task->lock_depth = depth; 76 task->lock_depth = depth;
132 } 77 }
133 78
134 void __lockfunc unlock_kernel(void) 79 void __lockfunc unlock_kernel(void)
135 { 80 {
136 struct task_struct *task = current; 81 struct task_struct *task = current;
137 82
138 BUG_ON(task->lock_depth < 0); 83 BUG_ON(task->lock_depth < 0);
139 84
140 if (likely(--task->lock_depth < 0)) 85 if (likely(--task->lock_depth < 0))
141 up(&kernel_sem); 86 up(&kernel_sem);
142 } 87 }
143 88
144 #else 89 #else
145 90
146 /* 91 /*
147 * The 'big kernel lock' 92 * The 'big kernel lock'
148 * 93 *
149 * This spinlock is taken and released recursively by lock_kernel() 94 * This spinlock is taken and released recursively by lock_kernel()
150 * and unlock_kernel(). It is transparently dropped and reacquired 95 * and unlock_kernel(). It is transparently dropped and reacquired
151 * over schedule(). It is used to protect legacy code that hasn't 96 * over schedule(). It is used to protect legacy code that hasn't
152 * been migrated to a proper locking design yet. 97 * been migrated to a proper locking design yet.
153 * 98 *
154 * Don't use in new code. 99 * Don't use in new code.
155 */ 100 */
156 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kernel_flag); 101 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kernel_flag);
157 102
158 103
159 /* 104 /*
160 * Acquire/release the underlying lock from the scheduler. 105 * Acquire/release the underlying lock from the scheduler.
161 * 106 *
162 * This is called with preemption disabled, and should 107 * This is called with preemption disabled, and should
163 * return an error value if it cannot get the lock and 108 * return an error value if it cannot get the lock and
164 * TIF_NEED_RESCHED gets set. 109 * TIF_NEED_RESCHED gets set.
165 * 110 *
166 * If it successfully gets the lock, it should increment 111 * If it successfully gets the lock, it should increment
167 * the preemption count like any spinlock does. 112 * the preemption count like any spinlock does.
168 * 113 *
169 * (This works on UP too - _raw_spin_trylock will never 114 * (This works on UP too - _raw_spin_trylock will never
170 * return false in that case) 115 * return false in that case)
171 */ 116 */
172 int __lockfunc __reacquire_kernel_lock(void) 117 int __lockfunc __reacquire_kernel_lock(void)
173 { 118 {
174 while (!_raw_spin_trylock(&kernel_flag)) { 119 while (!_raw_spin_trylock(&kernel_flag)) {
175 if (test_thread_flag(TIF_NEED_RESCHED)) 120 if (test_thread_flag(TIF_NEED_RESCHED))
176 return -EAGAIN; 121 return -EAGAIN;
177 cpu_relax(); 122 cpu_relax();
178 } 123 }
179 preempt_disable(); 124 preempt_disable();
180 return 0; 125 return 0;
181 } 126 }
182 127
183 void __lockfunc __release_kernel_lock(void) 128 void __lockfunc __release_kernel_lock(void)
184 { 129 {
185 _raw_spin_unlock(&kernel_flag); 130 _raw_spin_unlock(&kernel_flag);
186 preempt_enable_no_resched(); 131 preempt_enable_no_resched();
187 } 132 }
188 133
189 /* 134 /*
190 * These are the BKL spinlocks - we try to be polite about preemption. 135 * These are the BKL spinlocks - we try to be polite about preemption.
191 * If SMP is not on (ie UP preemption), this all goes away because the 136 * If SMP is not on (ie UP preemption), this all goes away because the
192 * _raw_spin_trylock() will always succeed. 137 * _raw_spin_trylock() will always succeed.
193 */ 138 */
194 #ifdef CONFIG_PREEMPT 139 #ifdef CONFIG_PREEMPT
195 static inline void __lock_kernel(void) 140 static inline void __lock_kernel(void)
196 { 141 {
197 preempt_disable(); 142 preempt_disable();
198 if (unlikely(!_raw_spin_trylock(&kernel_flag))) { 143 if (unlikely(!_raw_spin_trylock(&kernel_flag))) {
199 /* 144 /*
200 * If preemption was disabled even before this 145 * If preemption was disabled even before this
201 * was called, there's nothing we can be polite 146 * was called, there's nothing we can be polite
202 * about - just spin. 147 * about - just spin.
203 */ 148 */
204 if (preempt_count() > 1) { 149 if (preempt_count() > 1) {
205 _raw_spin_lock(&kernel_flag); 150 _raw_spin_lock(&kernel_flag);
206 return; 151 return;
207 } 152 }
208 153
209 /* 154 /*
210 * Otherwise, let's wait for the kernel lock 155 * Otherwise, let's wait for the kernel lock
211 * with preemption enabled.. 156 * with preemption enabled..
212 */ 157 */
213 do { 158 do {
214 preempt_enable(); 159 preempt_enable();
215 while (spin_is_locked(&kernel_flag)) 160 while (spin_is_locked(&kernel_flag))
216 cpu_relax(); 161 cpu_relax();
217 preempt_disable(); 162 preempt_disable();
218 } while (!_raw_spin_trylock(&kernel_flag)); 163 } while (!_raw_spin_trylock(&kernel_flag));
219 } 164 }
220 } 165 }
221 166
222 #else 167 #else
223 168
224 /* 169 /*
225 * Non-preemption case - just get the spinlock 170 * Non-preemption case - just get the spinlock
226 */ 171 */
227 static inline void __lock_kernel(void) 172 static inline void __lock_kernel(void)
228 { 173 {
229 _raw_spin_lock(&kernel_flag); 174 _raw_spin_lock(&kernel_flag);
230 } 175 }
231 #endif 176 #endif
232 177
233 static inline void __unlock_kernel(void) 178 static inline void __unlock_kernel(void)
234 { 179 {
235 _raw_spin_unlock(&kernel_flag); 180 _raw_spin_unlock(&kernel_flag);
236 preempt_enable(); 181 preempt_enable();
237 } 182 }
238 183
239 /* 184 /*
240 * Getting the big kernel lock. 185 * Getting the big kernel lock.
241 * 186 *
242 * This cannot happen asynchronously, so we only need to 187 * This cannot happen asynchronously, so we only need to
243 * worry about other CPUs. 188 * worry about other CPUs.
244 */ 189 */
245 void __lockfunc lock_kernel(void) 190 void __lockfunc lock_kernel(void)
246 { 191 {
247 int depth = current->lock_depth+1; 192 int depth = current->lock_depth+1;
248 if (likely(!depth)) 193 if (likely(!depth))
249 __lock_kernel(); 194 __lock_kernel();
250 current->lock_depth = depth; 195 current->lock_depth = depth;
251 } 196 }
252 197
253 void __lockfunc unlock_kernel(void) 198 void __lockfunc unlock_kernel(void)
254 { 199 {
255 BUG_ON(current->lock_depth < 0); 200 BUG_ON(current->lock_depth < 0);
256 if (likely(--current->lock_depth < 0)) 201 if (likely(--current->lock_depth < 0))
257 __unlock_kernel(); 202 __unlock_kernel();
258 } 203 }
259 204
260 #endif 205 #endif
261 206
262 EXPORT_SYMBOL(lock_kernel); 207 EXPORT_SYMBOL(lock_kernel);
263 EXPORT_SYMBOL(unlock_kernel); 208 EXPORT_SYMBOL(unlock_kernel);
264 209
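As a side note, a minimal sketch (hypothetical code, not from this patch) of the recursion semantics that both BKL variants implement via current->lock_depth: only the outermost lock_kernel()/unlock_kernel() pair really acquires and releases the lock.

#include <linux/smp_lock.h>

/* Hypothetical legacy path that still relies on the BKL. */
static void legacy_ioctl_path(void)
{
	lock_kernel();		/* lock_depth -1 -> 0: actually takes the BKL  */
	lock_kernel();		/* lock_depth  0 -> 1: nested, nothing acquired */
	unlock_kernel();	/* lock_depth  1 -> 0: still held               */
	unlock_kernel();	/* lock_depth  0 -> -1: actually released       */
}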
265 210
lib/smp_processor_id.c
File was created 1 /*
2 * lib/smp_processor_id.c
3 *
4 * DEBUG_PREEMPT variant of smp_processor_id().
5 */
6 #include <linux/module.h>
7 #include <linux/kallsyms.h>
8
9 unsigned int debug_smp_processor_id(void)
10 {
11 unsigned long preempt_count = preempt_count();
12 int this_cpu = raw_smp_processor_id();
13 cpumask_t this_mask;
14
15 if (likely(preempt_count))
16 goto out;
17
18 if (irqs_disabled())
19 goto out;
20
21 /*
22 * Kernel threads bound to a single CPU can safely use
23 * smp_processor_id():
24 */
25 this_mask = cpumask_of_cpu(this_cpu);
26
27 if (cpus_equal(current->cpus_allowed, this_mask))
28 goto out;
29
30 /*
31 * It is valid to assume CPU-locality during early bootup:
32 */
33 if (system_state != SYSTEM_RUNNING)
34 goto out;
35
36 /*
37 * Avoid recursion:
38 */
39 preempt_disable();
40
41 if (!printk_ratelimit())
42 goto out_enable;
43
44 printk(KERN_ERR "BUG: using smp_processor_id() in preemptible [%08x] code: %s/%d\n", preempt_count(), current->comm, current->pid);
45 print_symbol("caller is %s\n", (long)__builtin_return_address(0));
46 dump_stack();
47
48 out_enable:
49 preempt_enable_no_resched();
50 out:
51 return this_cpu;
52 }
53
54 EXPORT_SYMBOL(debug_smp_processor_id);
55
56
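To round off the new naming scheme, a hypothetical usage sketch (not part of the patch) contrasting the two externally visible symbols: smp_processor_id() is the checked variant and wants preemption or interrupts off, while raw_smp_processor_id() bypasses debug_smp_processor_id() for callers that can tolerate being migrated right after the read.

#include <linux/smp.h>
#include <linux/preempt.h>

/* Hypothetical: per-CPU work that must stay on one CPU while it runs. */
static void bump_local_counter(void)
{
	int cpu;

	preempt_disable();
	cpu = smp_processor_id();	/* debug variant; warns if preemptible */
	/* ... update per-CPU state for 'cpu' ... */
	preempt_enable();
}

/*
 * Hypothetical: only a hint, e.g. for a balancing heuristic.  A stale
 * answer is harmless, so the raw (nondebug) variant is the right choice.
 */
static int current_cpu_hint(void)
{
	return raw_smp_processor_id();
}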