Commit c6fd91f0bdcd294a0ae0ba2b2a7f7456ef4b7144

Authored by bibo mao
Committed by Linus Torvalds
1 parent c9becf58d9

[PATCH] kretprobe instance recycled by parent process

When a kretprobe is registered on the schedule() function and a probed
process exits, schedule() never returns for that process, so the
kretprobe instances held for it are never recycled.

With this patch the parent process recycles the kretprobe instances
left behind by the exited task, so kretprobe instances no longer leak.

Signed-off-by: bibo mao <bibo.mao@intel.com>
Cc: Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp>
Cc: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

6 changed files with 14 additions and 32 deletions
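
For context, the leak described in the commit message arises when a kretprobe
is attached to schedule(). A minimal, illustrative module that sets up such a
probe is sketched below (it is not part of this patch; the names sched_rp and
sched_ret_handler and the maxactive value are arbitrary). Each time a probed
task enters schedule(), one kretprobe instance is taken from the probe's free
list and is only returned when schedule() returns to that task -- which never
happens if the task exits inside schedule().

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kprobes.h>
#include <linux/sched.h>

/*
 * Runs when schedule() returns to the probed task.  If the task exits
 * inside schedule() instead, this handler never runs and the instance
 * grabbed on entry is never recycled -- the leak this patch closes by
 * letting the parent recycle the instances of an exited child.
 */
static int sched_ret_handler(struct kretprobe_instance *ri,
                             struct pt_regs *regs)
{
        return 0;
}

static struct kretprobe sched_rp = {
        .handler   = sched_ret_handler,
        .maxactive = 20,        /* size of the per-probe instance free list */
};

static int __init kret_example_init(void)
{
        sched_rp.kp.addr = (kprobe_opcode_t *) schedule;
        return register_kretprobe(&sched_rp);
}

static void __exit kret_example_exit(void)
{
        unregister_kretprobe(&sched_rp);
}

module_init(kret_example_init);
module_exit(kret_example_exit);
MODULE_LICENSE("GPL");
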

arch/i386/kernel/process.c
1 /* 1 /*
2 * linux/arch/i386/kernel/process.c 2 * linux/arch/i386/kernel/process.c
3 * 3 *
4 * Copyright (C) 1995 Linus Torvalds 4 * Copyright (C) 1995 Linus Torvalds
5 * 5 *
6 * Pentium III FXSR, SSE support 6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000 7 * Gareth Hughes <gareth@valinux.com>, May 2000
8 */ 8 */
9 9
10 /* 10 /*
11 * This file handles the architecture-dependent parts of process handling.. 11 * This file handles the architecture-dependent parts of process handling..
12 */ 12 */
13 13
14 #include <stdarg.h> 14 #include <stdarg.h>
15 15
16 #include <linux/cpu.h> 16 #include <linux/cpu.h>
17 #include <linux/errno.h> 17 #include <linux/errno.h>
18 #include <linux/sched.h> 18 #include <linux/sched.h>
19 #include <linux/fs.h> 19 #include <linux/fs.h>
20 #include <linux/kernel.h> 20 #include <linux/kernel.h>
21 #include <linux/mm.h> 21 #include <linux/mm.h>
22 #include <linux/elfcore.h> 22 #include <linux/elfcore.h>
23 #include <linux/smp.h> 23 #include <linux/smp.h>
24 #include <linux/smp_lock.h> 24 #include <linux/smp_lock.h>
25 #include <linux/stddef.h> 25 #include <linux/stddef.h>
26 #include <linux/slab.h> 26 #include <linux/slab.h>
27 #include <linux/vmalloc.h> 27 #include <linux/vmalloc.h>
28 #include <linux/user.h> 28 #include <linux/user.h>
29 #include <linux/a.out.h> 29 #include <linux/a.out.h>
30 #include <linux/interrupt.h> 30 #include <linux/interrupt.h>
31 #include <linux/config.h> 31 #include <linux/config.h>
32 #include <linux/utsname.h> 32 #include <linux/utsname.h>
33 #include <linux/delay.h> 33 #include <linux/delay.h>
34 #include <linux/reboot.h> 34 #include <linux/reboot.h>
35 #include <linux/init.h> 35 #include <linux/init.h>
36 #include <linux/mc146818rtc.h> 36 #include <linux/mc146818rtc.h>
37 #include <linux/module.h> 37 #include <linux/module.h>
38 #include <linux/kallsyms.h> 38 #include <linux/kallsyms.h>
39 #include <linux/ptrace.h> 39 #include <linux/ptrace.h>
40 #include <linux/random.h> 40 #include <linux/random.h>
41 #include <linux/kprobes.h>
42 41
43 #include <asm/uaccess.h> 42 #include <asm/uaccess.h>
44 #include <asm/pgtable.h> 43 #include <asm/pgtable.h>
45 #include <asm/system.h> 44 #include <asm/system.h>
46 #include <asm/io.h> 45 #include <asm/io.h>
47 #include <asm/ldt.h> 46 #include <asm/ldt.h>
48 #include <asm/processor.h> 47 #include <asm/processor.h>
49 #include <asm/i387.h> 48 #include <asm/i387.h>
50 #include <asm/desc.h> 49 #include <asm/desc.h>
51 #include <asm/vm86.h> 50 #include <asm/vm86.h>
52 #ifdef CONFIG_MATH_EMULATION 51 #ifdef CONFIG_MATH_EMULATION
53 #include <asm/math_emu.h> 52 #include <asm/math_emu.h>
54 #endif 53 #endif
55 54
56 #include <linux/err.h> 55 #include <linux/err.h>
57 56
58 #include <asm/tlbflush.h> 57 #include <asm/tlbflush.h>
59 #include <asm/cpu.h> 58 #include <asm/cpu.h>
60 59
61 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 60 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
62 61
63 static int hlt_counter; 62 static int hlt_counter;
64 63
65 unsigned long boot_option_idle_override = 0; 64 unsigned long boot_option_idle_override = 0;
66 EXPORT_SYMBOL(boot_option_idle_override); 65 EXPORT_SYMBOL(boot_option_idle_override);
67 66
68 /* 67 /*
69 * Return saved PC of a blocked thread. 68 * Return saved PC of a blocked thread.
70 */ 69 */
71 unsigned long thread_saved_pc(struct task_struct *tsk) 70 unsigned long thread_saved_pc(struct task_struct *tsk)
72 { 71 {
73 return ((unsigned long *)tsk->thread.esp)[3]; 72 return ((unsigned long *)tsk->thread.esp)[3];
74 } 73 }
75 74
76 /* 75 /*
77 * Powermanagement idle function, if any.. 76 * Powermanagement idle function, if any..
78 */ 77 */
79 void (*pm_idle)(void); 78 void (*pm_idle)(void);
80 EXPORT_SYMBOL(pm_idle); 79 EXPORT_SYMBOL(pm_idle);
81 static DEFINE_PER_CPU(unsigned int, cpu_idle_state); 80 static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
82 81
83 void disable_hlt(void) 82 void disable_hlt(void)
84 { 83 {
85 hlt_counter++; 84 hlt_counter++;
86 } 85 }
87 86
88 EXPORT_SYMBOL(disable_hlt); 87 EXPORT_SYMBOL(disable_hlt);
89 88
90 void enable_hlt(void) 89 void enable_hlt(void)
91 { 90 {
92 hlt_counter--; 91 hlt_counter--;
93 } 92 }
94 93
95 EXPORT_SYMBOL(enable_hlt); 94 EXPORT_SYMBOL(enable_hlt);
96 95
97 /* 96 /*
98 * We use this if we don't have any better 97 * We use this if we don't have any better
99 * idle routine.. 98 * idle routine..
100 */ 99 */
101 void default_idle(void) 100 void default_idle(void)
102 { 101 {
103 local_irq_enable(); 102 local_irq_enable();
104 103
105 if (!hlt_counter && boot_cpu_data.hlt_works_ok) { 104 if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
106 clear_thread_flag(TIF_POLLING_NRFLAG); 105 clear_thread_flag(TIF_POLLING_NRFLAG);
107 smp_mb__after_clear_bit(); 106 smp_mb__after_clear_bit();
108 while (!need_resched()) { 107 while (!need_resched()) {
109 local_irq_disable(); 108 local_irq_disable();
110 if (!need_resched()) 109 if (!need_resched())
111 safe_halt(); 110 safe_halt();
112 else 111 else
113 local_irq_enable(); 112 local_irq_enable();
114 } 113 }
115 set_thread_flag(TIF_POLLING_NRFLAG); 114 set_thread_flag(TIF_POLLING_NRFLAG);
116 } else { 115 } else {
117 while (!need_resched()) 116 while (!need_resched())
118 cpu_relax(); 117 cpu_relax();
119 } 118 }
120 } 119 }
121 #ifdef CONFIG_APM_MODULE 120 #ifdef CONFIG_APM_MODULE
122 EXPORT_SYMBOL(default_idle); 121 EXPORT_SYMBOL(default_idle);
123 #endif 122 #endif
124 123
125 /* 124 /*
126 * On SMP it's slightly faster (but much more power-consuming!) 125 * On SMP it's slightly faster (but much more power-consuming!)
127 * to poll the ->work.need_resched flag instead of waiting for the 126 * to poll the ->work.need_resched flag instead of waiting for the
128 * cross-CPU IPI to arrive. Use this option with caution. 127 * cross-CPU IPI to arrive. Use this option with caution.
129 */ 128 */
130 static void poll_idle (void) 129 static void poll_idle (void)
131 { 130 {
132 local_irq_enable(); 131 local_irq_enable();
133 132
134 asm volatile( 133 asm volatile(
135 "2:" 134 "2:"
136 "testl %0, %1;" 135 "testl %0, %1;"
137 "rep; nop;" 136 "rep; nop;"
138 "je 2b;" 137 "je 2b;"
139 : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags)); 138 : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags));
140 } 139 }
141 140
142 #ifdef CONFIG_HOTPLUG_CPU 141 #ifdef CONFIG_HOTPLUG_CPU
143 #include <asm/nmi.h> 142 #include <asm/nmi.h>
144 /* We don't actually take CPU down, just spin without interrupts. */ 143 /* We don't actually take CPU down, just spin without interrupts. */
145 static inline void play_dead(void) 144 static inline void play_dead(void)
146 { 145 {
147 /* This must be done before dead CPU ack */ 146 /* This must be done before dead CPU ack */
148 cpu_exit_clear(); 147 cpu_exit_clear();
149 wbinvd(); 148 wbinvd();
150 mb(); 149 mb();
151 /* Ack it */ 150 /* Ack it */
152 __get_cpu_var(cpu_state) = CPU_DEAD; 151 __get_cpu_var(cpu_state) = CPU_DEAD;
153 152
154 /* 153 /*
155 * With physical CPU hotplug, we should halt the cpu 154 * With physical CPU hotplug, we should halt the cpu
156 */ 155 */
157 local_irq_disable(); 156 local_irq_disable();
158 while (1) 157 while (1)
159 halt(); 158 halt();
160 } 159 }
161 #else 160 #else
162 static inline void play_dead(void) 161 static inline void play_dead(void)
163 { 162 {
164 BUG(); 163 BUG();
165 } 164 }
166 #endif /* CONFIG_HOTPLUG_CPU */ 165 #endif /* CONFIG_HOTPLUG_CPU */
167 166
168 /* 167 /*
169 * The idle thread. There's no useful work to be 168 * The idle thread. There's no useful work to be
170 * done, so just try to conserve power and have a 169 * done, so just try to conserve power and have a
171 * low exit latency (ie sit in a loop waiting for 170 * low exit latency (ie sit in a loop waiting for
172 * somebody to say that they'd like to reschedule) 171 * somebody to say that they'd like to reschedule)
173 */ 172 */
174 void cpu_idle(void) 173 void cpu_idle(void)
175 { 174 {
176 int cpu = smp_processor_id(); 175 int cpu = smp_processor_id();
177 176
178 set_thread_flag(TIF_POLLING_NRFLAG); 177 set_thread_flag(TIF_POLLING_NRFLAG);
179 178
180 /* endless idle loop with no priority at all */ 179 /* endless idle loop with no priority at all */
181 while (1) { 180 while (1) {
182 while (!need_resched()) { 181 while (!need_resched()) {
183 void (*idle)(void); 182 void (*idle)(void);
184 183
185 if (__get_cpu_var(cpu_idle_state)) 184 if (__get_cpu_var(cpu_idle_state))
186 __get_cpu_var(cpu_idle_state) = 0; 185 __get_cpu_var(cpu_idle_state) = 0;
187 186
188 rmb(); 187 rmb();
189 idle = pm_idle; 188 idle = pm_idle;
190 189
191 if (!idle) 190 if (!idle)
192 idle = default_idle; 191 idle = default_idle;
193 192
194 if (cpu_is_offline(cpu)) 193 if (cpu_is_offline(cpu))
195 play_dead(); 194 play_dead();
196 195
197 __get_cpu_var(irq_stat).idle_timestamp = jiffies; 196 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
198 idle(); 197 idle();
199 } 198 }
200 preempt_enable_no_resched(); 199 preempt_enable_no_resched();
201 schedule(); 200 schedule();
202 preempt_disable(); 201 preempt_disable();
203 } 202 }
204 } 203 }
205 204
206 void cpu_idle_wait(void) 205 void cpu_idle_wait(void)
207 { 206 {
208 unsigned int cpu, this_cpu = get_cpu(); 207 unsigned int cpu, this_cpu = get_cpu();
209 cpumask_t map; 208 cpumask_t map;
210 209
211 set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); 210 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
212 put_cpu(); 211 put_cpu();
213 212
214 cpus_clear(map); 213 cpus_clear(map);
215 for_each_online_cpu(cpu) { 214 for_each_online_cpu(cpu) {
216 per_cpu(cpu_idle_state, cpu) = 1; 215 per_cpu(cpu_idle_state, cpu) = 1;
217 cpu_set(cpu, map); 216 cpu_set(cpu, map);
218 } 217 }
219 218
220 __get_cpu_var(cpu_idle_state) = 0; 219 __get_cpu_var(cpu_idle_state) = 0;
221 220
222 wmb(); 221 wmb();
223 do { 222 do {
224 ssleep(1); 223 ssleep(1);
225 for_each_online_cpu(cpu) { 224 for_each_online_cpu(cpu) {
226 if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) 225 if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
227 cpu_clear(cpu, map); 226 cpu_clear(cpu, map);
228 } 227 }
229 cpus_and(map, map, cpu_online_map); 228 cpus_and(map, map, cpu_online_map);
230 } while (!cpus_empty(map)); 229 } while (!cpus_empty(map));
231 } 230 }
232 EXPORT_SYMBOL_GPL(cpu_idle_wait); 231 EXPORT_SYMBOL_GPL(cpu_idle_wait);
233 232
234 /* 233 /*
235 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, 234 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
236 * which can obviate IPI to trigger checking of need_resched. 235 * which can obviate IPI to trigger checking of need_resched.
237 * We execute MONITOR against need_resched and enter optimized wait state 236 * We execute MONITOR against need_resched and enter optimized wait state
238 * through MWAIT. Whenever someone changes need_resched, we would be woken 237 * through MWAIT. Whenever someone changes need_resched, we would be woken
239 * up from MWAIT (without an IPI). 238 * up from MWAIT (without an IPI).
240 */ 239 */
241 static void mwait_idle(void) 240 static void mwait_idle(void)
242 { 241 {
243 local_irq_enable(); 242 local_irq_enable();
244 243
245 while (!need_resched()) { 244 while (!need_resched()) {
246 __monitor((void *)&current_thread_info()->flags, 0, 0); 245 __monitor((void *)&current_thread_info()->flags, 0, 0);
247 smp_mb(); 246 smp_mb();
248 if (need_resched()) 247 if (need_resched())
249 break; 248 break;
250 __mwait(0, 0); 249 __mwait(0, 0);
251 } 250 }
252 } 251 }
253 252
254 void __devinit select_idle_routine(const struct cpuinfo_x86 *c) 253 void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
255 { 254 {
256 if (cpu_has(c, X86_FEATURE_MWAIT)) { 255 if (cpu_has(c, X86_FEATURE_MWAIT)) {
257 printk("monitor/mwait feature present.\n"); 256 printk("monitor/mwait feature present.\n");
258 /* 257 /*
259 * Skip, if setup has overridden idle. 258 * Skip, if setup has overridden idle.
260 * One CPU supports mwait => All CPUs supports mwait 259 * One CPU supports mwait => All CPUs supports mwait
261 */ 260 */
262 if (!pm_idle) { 261 if (!pm_idle) {
263 printk("using mwait in idle threads.\n"); 262 printk("using mwait in idle threads.\n");
264 pm_idle = mwait_idle; 263 pm_idle = mwait_idle;
265 } 264 }
266 } 265 }
267 } 266 }
268 267
269 static int __init idle_setup (char *str) 268 static int __init idle_setup (char *str)
270 { 269 {
271 if (!strncmp(str, "poll", 4)) { 270 if (!strncmp(str, "poll", 4)) {
272 printk("using polling idle threads.\n"); 271 printk("using polling idle threads.\n");
273 pm_idle = poll_idle; 272 pm_idle = poll_idle;
274 #ifdef CONFIG_X86_SMP 273 #ifdef CONFIG_X86_SMP
275 if (smp_num_siblings > 1) 274 if (smp_num_siblings > 1)
276 printk("WARNING: polling idle and HT enabled, performance may degrade.\n"); 275 printk("WARNING: polling idle and HT enabled, performance may degrade.\n");
277 #endif 276 #endif
278 } else if (!strncmp(str, "halt", 4)) { 277 } else if (!strncmp(str, "halt", 4)) {
279 printk("using halt in idle threads.\n"); 278 printk("using halt in idle threads.\n");
280 pm_idle = default_idle; 279 pm_idle = default_idle;
281 } 280 }
282 281
283 boot_option_idle_override = 1; 282 boot_option_idle_override = 1;
284 return 1; 283 return 1;
285 } 284 }
286 285
287 __setup("idle=", idle_setup); 286 __setup("idle=", idle_setup);
288 287
289 void show_regs(struct pt_regs * regs) 288 void show_regs(struct pt_regs * regs)
290 { 289 {
291 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; 290 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
292 291
293 printk("\n"); 292 printk("\n");
294 printk("Pid: %d, comm: %20s\n", current->pid, current->comm); 293 printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
295 printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id()); 294 printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id());
296 print_symbol("EIP is at %s\n", regs->eip); 295 print_symbol("EIP is at %s\n", regs->eip);
297 296
298 if (user_mode_vm(regs)) 297 if (user_mode_vm(regs))
299 printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); 298 printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
300 printk(" EFLAGS: %08lx %s (%s %.*s)\n", 299 printk(" EFLAGS: %08lx %s (%s %.*s)\n",
301 regs->eflags, print_tainted(), system_utsname.release, 300 regs->eflags, print_tainted(), system_utsname.release,
302 (int)strcspn(system_utsname.version, " "), 301 (int)strcspn(system_utsname.version, " "),
303 system_utsname.version); 302 system_utsname.version);
304 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", 303 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
305 regs->eax,regs->ebx,regs->ecx,regs->edx); 304 regs->eax,regs->ebx,regs->ecx,regs->edx);
306 printk("ESI: %08lx EDI: %08lx EBP: %08lx", 305 printk("ESI: %08lx EDI: %08lx EBP: %08lx",
307 regs->esi, regs->edi, regs->ebp); 306 regs->esi, regs->edi, regs->ebp);
308 printk(" DS: %04x ES: %04x\n", 307 printk(" DS: %04x ES: %04x\n",
309 0xffff & regs->xds,0xffff & regs->xes); 308 0xffff & regs->xds,0xffff & regs->xes);
310 309
311 cr0 = read_cr0(); 310 cr0 = read_cr0();
312 cr2 = read_cr2(); 311 cr2 = read_cr2();
313 cr3 = read_cr3(); 312 cr3 = read_cr3();
314 cr4 = read_cr4_safe(); 313 cr4 = read_cr4_safe();
315 printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); 314 printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
316 show_trace(NULL, &regs->esp); 315 show_trace(NULL, &regs->esp);
317 } 316 }
318 317
319 /* 318 /*
320 * This gets run with %ebx containing the 319 * This gets run with %ebx containing the
321 * function to call, and %edx containing 320 * function to call, and %edx containing
322 * the "args". 321 * the "args".
323 */ 322 */
324 extern void kernel_thread_helper(void); 323 extern void kernel_thread_helper(void);
325 __asm__(".section .text\n" 324 __asm__(".section .text\n"
326 ".align 4\n" 325 ".align 4\n"
327 "kernel_thread_helper:\n\t" 326 "kernel_thread_helper:\n\t"
328 "movl %edx,%eax\n\t" 327 "movl %edx,%eax\n\t"
329 "pushl %edx\n\t" 328 "pushl %edx\n\t"
330 "call *%ebx\n\t" 329 "call *%ebx\n\t"
331 "pushl %eax\n\t" 330 "pushl %eax\n\t"
332 "call do_exit\n" 331 "call do_exit\n"
333 ".previous"); 332 ".previous");
334 333
335 /* 334 /*
336 * Create a kernel thread 335 * Create a kernel thread
337 */ 336 */
338 int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) 337 int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
339 { 338 {
340 struct pt_regs regs; 339 struct pt_regs regs;
341 340
342 memset(&regs, 0, sizeof(regs)); 341 memset(&regs, 0, sizeof(regs));
343 342
344 regs.ebx = (unsigned long) fn; 343 regs.ebx = (unsigned long) fn;
345 regs.edx = (unsigned long) arg; 344 regs.edx = (unsigned long) arg;
346 345
347 regs.xds = __USER_DS; 346 regs.xds = __USER_DS;
348 regs.xes = __USER_DS; 347 regs.xes = __USER_DS;
349 regs.orig_eax = -1; 348 regs.orig_eax = -1;
350 regs.eip = (unsigned long) kernel_thread_helper; 349 regs.eip = (unsigned long) kernel_thread_helper;
351 regs.xcs = __KERNEL_CS; 350 regs.xcs = __KERNEL_CS;
352 regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; 351 regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
353 352
354 /* Ok, create the new process.. */ 353 /* Ok, create the new process.. */
355 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL); 354 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
356 } 355 }
357 EXPORT_SYMBOL(kernel_thread); 356 EXPORT_SYMBOL(kernel_thread);
358 357
359 /* 358 /*
360 * Free current thread data structures etc.. 359 * Free current thread data structures etc..
361 */ 360 */
362 void exit_thread(void) 361 void exit_thread(void)
363 { 362 {
364 struct task_struct *tsk = current; 363 struct task_struct *tsk = current;
365 struct thread_struct *t = &tsk->thread; 364 struct thread_struct *t = &tsk->thread;
366
367 /*
368 * Remove function-return probe instances associated with this task
369 * and put them back on the free list. Do not insert an exit probe for
370 * this function, it will be disabled by kprobe_flush_task if you do.
371 */
372 kprobe_flush_task(tsk);
373 365
374 /* The process may have allocated an io port bitmap... nuke it. */ 366 /* The process may have allocated an io port bitmap... nuke it. */
375 if (unlikely(NULL != t->io_bitmap_ptr)) { 367 if (unlikely(NULL != t->io_bitmap_ptr)) {
376 int cpu = get_cpu(); 368 int cpu = get_cpu();
377 struct tss_struct *tss = &per_cpu(init_tss, cpu); 369 struct tss_struct *tss = &per_cpu(init_tss, cpu);
378 370
379 kfree(t->io_bitmap_ptr); 371 kfree(t->io_bitmap_ptr);
380 t->io_bitmap_ptr = NULL; 372 t->io_bitmap_ptr = NULL;
381 /* 373 /*
382 * Careful, clear this in the TSS too: 374 * Careful, clear this in the TSS too:
383 */ 375 */
384 memset(tss->io_bitmap, 0xff, tss->io_bitmap_max); 376 memset(tss->io_bitmap, 0xff, tss->io_bitmap_max);
385 t->io_bitmap_max = 0; 377 t->io_bitmap_max = 0;
386 tss->io_bitmap_owner = NULL; 378 tss->io_bitmap_owner = NULL;
387 tss->io_bitmap_max = 0; 379 tss->io_bitmap_max = 0;
388 tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; 380 tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
389 put_cpu(); 381 put_cpu();
390 } 382 }
391 } 383 }
392 384
393 void flush_thread(void) 385 void flush_thread(void)
394 { 386 {
395 struct task_struct *tsk = current; 387 struct task_struct *tsk = current;
396 388
397 memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); 389 memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
398 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); 390 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
399 /* 391 /*
400 * Forget coprocessor state.. 392 * Forget coprocessor state..
401 */ 393 */
402 clear_fpu(tsk); 394 clear_fpu(tsk);
403 clear_used_math(); 395 clear_used_math();
404 } 396 }
405 397
406 void release_thread(struct task_struct *dead_task) 398 void release_thread(struct task_struct *dead_task)
407 { 399 {
408 BUG_ON(dead_task->mm); 400 BUG_ON(dead_task->mm);
409 release_vm86_irqs(dead_task); 401 release_vm86_irqs(dead_task);
410 } 402 }
411 403
412 /* 404 /*
413 * This gets called before we allocate a new thread and copy 405 * This gets called before we allocate a new thread and copy
414 * the current task into it. 406 * the current task into it.
415 */ 407 */
416 void prepare_to_copy(struct task_struct *tsk) 408 void prepare_to_copy(struct task_struct *tsk)
417 { 409 {
418 unlazy_fpu(tsk); 410 unlazy_fpu(tsk);
419 } 411 }
420 412
421 int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, 413 int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
422 unsigned long unused, 414 unsigned long unused,
423 struct task_struct * p, struct pt_regs * regs) 415 struct task_struct * p, struct pt_regs * regs)
424 { 416 {
425 struct pt_regs * childregs; 417 struct pt_regs * childregs;
426 struct task_struct *tsk; 418 struct task_struct *tsk;
427 int err; 419 int err;
428 420
429 childregs = task_pt_regs(p); 421 childregs = task_pt_regs(p);
430 *childregs = *regs; 422 *childregs = *regs;
431 childregs->eax = 0; 423 childregs->eax = 0;
432 childregs->esp = esp; 424 childregs->esp = esp;
433 425
434 p->thread.esp = (unsigned long) childregs; 426 p->thread.esp = (unsigned long) childregs;
435 p->thread.esp0 = (unsigned long) (childregs+1); 427 p->thread.esp0 = (unsigned long) (childregs+1);
436 428
437 p->thread.eip = (unsigned long) ret_from_fork; 429 p->thread.eip = (unsigned long) ret_from_fork;
438 430
439 savesegment(fs,p->thread.fs); 431 savesegment(fs,p->thread.fs);
440 savesegment(gs,p->thread.gs); 432 savesegment(gs,p->thread.gs);
441 433
442 tsk = current; 434 tsk = current;
443 if (unlikely(NULL != tsk->thread.io_bitmap_ptr)) { 435 if (unlikely(NULL != tsk->thread.io_bitmap_ptr)) {
444 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); 436 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
445 if (!p->thread.io_bitmap_ptr) { 437 if (!p->thread.io_bitmap_ptr) {
446 p->thread.io_bitmap_max = 0; 438 p->thread.io_bitmap_max = 0;
447 return -ENOMEM; 439 return -ENOMEM;
448 } 440 }
449 memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr, 441 memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr,
450 IO_BITMAP_BYTES); 442 IO_BITMAP_BYTES);
451 } 443 }
452 444
453 /* 445 /*
454 * Set a new TLS for the child thread? 446 * Set a new TLS for the child thread?
455 */ 447 */
456 if (clone_flags & CLONE_SETTLS) { 448 if (clone_flags & CLONE_SETTLS) {
457 struct desc_struct *desc; 449 struct desc_struct *desc;
458 struct user_desc info; 450 struct user_desc info;
459 int idx; 451 int idx;
460 452
461 err = -EFAULT; 453 err = -EFAULT;
462 if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info))) 454 if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
463 goto out; 455 goto out;
464 err = -EINVAL; 456 err = -EINVAL;
465 if (LDT_empty(&info)) 457 if (LDT_empty(&info))
466 goto out; 458 goto out;
467 459
468 idx = info.entry_number; 460 idx = info.entry_number;
469 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) 461 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
470 goto out; 462 goto out;
471 463
472 desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; 464 desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
473 desc->a = LDT_entry_a(&info); 465 desc->a = LDT_entry_a(&info);
474 desc->b = LDT_entry_b(&info); 466 desc->b = LDT_entry_b(&info);
475 } 467 }
476 468
477 err = 0; 469 err = 0;
478 out: 470 out:
479 if (err && p->thread.io_bitmap_ptr) { 471 if (err && p->thread.io_bitmap_ptr) {
480 kfree(p->thread.io_bitmap_ptr); 472 kfree(p->thread.io_bitmap_ptr);
481 p->thread.io_bitmap_max = 0; 473 p->thread.io_bitmap_max = 0;
482 } 474 }
483 return err; 475 return err;
484 } 476 }
485 477
486 /* 478 /*
487 * fill in the user structure for a core dump.. 479 * fill in the user structure for a core dump..
488 */ 480 */
489 void dump_thread(struct pt_regs * regs, struct user * dump) 481 void dump_thread(struct pt_regs * regs, struct user * dump)
490 { 482 {
491 int i; 483 int i;
492 484
493 /* changed the size calculations - should hopefully work better. lbt */ 485 /* changed the size calculations - should hopefully work better. lbt */
494 dump->magic = CMAGIC; 486 dump->magic = CMAGIC;
495 dump->start_code = 0; 487 dump->start_code = 0;
496 dump->start_stack = regs->esp & ~(PAGE_SIZE - 1); 488 dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
497 dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; 489 dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
498 dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; 490 dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
499 dump->u_dsize -= dump->u_tsize; 491 dump->u_dsize -= dump->u_tsize;
500 dump->u_ssize = 0; 492 dump->u_ssize = 0;
501 for (i = 0; i < 8; i++) 493 for (i = 0; i < 8; i++)
502 dump->u_debugreg[i] = current->thread.debugreg[i]; 494 dump->u_debugreg[i] = current->thread.debugreg[i];
503 495
504 if (dump->start_stack < TASK_SIZE) 496 if (dump->start_stack < TASK_SIZE)
505 dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; 497 dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
506 498
507 dump->regs.ebx = regs->ebx; 499 dump->regs.ebx = regs->ebx;
508 dump->regs.ecx = regs->ecx; 500 dump->regs.ecx = regs->ecx;
509 dump->regs.edx = regs->edx; 501 dump->regs.edx = regs->edx;
510 dump->regs.esi = regs->esi; 502 dump->regs.esi = regs->esi;
511 dump->regs.edi = regs->edi; 503 dump->regs.edi = regs->edi;
512 dump->regs.ebp = regs->ebp; 504 dump->regs.ebp = regs->ebp;
513 dump->regs.eax = regs->eax; 505 dump->regs.eax = regs->eax;
514 dump->regs.ds = regs->xds; 506 dump->regs.ds = regs->xds;
515 dump->regs.es = regs->xes; 507 dump->regs.es = regs->xes;
516 savesegment(fs,dump->regs.fs); 508 savesegment(fs,dump->regs.fs);
517 savesegment(gs,dump->regs.gs); 509 savesegment(gs,dump->regs.gs);
518 dump->regs.orig_eax = regs->orig_eax; 510 dump->regs.orig_eax = regs->orig_eax;
519 dump->regs.eip = regs->eip; 511 dump->regs.eip = regs->eip;
520 dump->regs.cs = regs->xcs; 512 dump->regs.cs = regs->xcs;
521 dump->regs.eflags = regs->eflags; 513 dump->regs.eflags = regs->eflags;
522 dump->regs.esp = regs->esp; 514 dump->regs.esp = regs->esp;
523 dump->regs.ss = regs->xss; 515 dump->regs.ss = regs->xss;
524 516
525 dump->u_fpvalid = dump_fpu (regs, &dump->i387); 517 dump->u_fpvalid = dump_fpu (regs, &dump->i387);
526 } 518 }
527 EXPORT_SYMBOL(dump_thread); 519 EXPORT_SYMBOL(dump_thread);
528 520
529 /* 521 /*
530 * Capture the user space registers if the task is not running (in user space) 522 * Capture the user space registers if the task is not running (in user space)
531 */ 523 */
532 int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) 524 int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
533 { 525 {
534 struct pt_regs ptregs = *task_pt_regs(tsk); 526 struct pt_regs ptregs = *task_pt_regs(tsk);
535 ptregs.xcs &= 0xffff; 527 ptregs.xcs &= 0xffff;
536 ptregs.xds &= 0xffff; 528 ptregs.xds &= 0xffff;
537 ptregs.xes &= 0xffff; 529 ptregs.xes &= 0xffff;
538 ptregs.xss &= 0xffff; 530 ptregs.xss &= 0xffff;
539 531
540 elf_core_copy_regs(regs, &ptregs); 532 elf_core_copy_regs(regs, &ptregs);
541 533
542 return 1; 534 return 1;
543 } 535 }
544 536
545 static inline void 537 static inline void
546 handle_io_bitmap(struct thread_struct *next, struct tss_struct *tss) 538 handle_io_bitmap(struct thread_struct *next, struct tss_struct *tss)
547 { 539 {
548 if (!next->io_bitmap_ptr) { 540 if (!next->io_bitmap_ptr) {
549 /* 541 /*
550 * Disable the bitmap via an invalid offset. We still cache 542 * Disable the bitmap via an invalid offset. We still cache
551 * the previous bitmap owner and the IO bitmap contents: 543 * the previous bitmap owner and the IO bitmap contents:
552 */ 544 */
553 tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; 545 tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
554 return; 546 return;
555 } 547 }
556 if (likely(next == tss->io_bitmap_owner)) { 548 if (likely(next == tss->io_bitmap_owner)) {
557 /* 549 /*
558 * Previous owner of the bitmap (hence the bitmap content) 550 * Previous owner of the bitmap (hence the bitmap content)
559 * matches the next task, we dont have to do anything but 551 * matches the next task, we dont have to do anything but
560 * to set a valid offset in the TSS: 552 * to set a valid offset in the TSS:
561 */ 553 */
562 tss->io_bitmap_base = IO_BITMAP_OFFSET; 554 tss->io_bitmap_base = IO_BITMAP_OFFSET;
563 return; 555 return;
564 } 556 }
565 /* 557 /*
566 * Lazy TSS's I/O bitmap copy. We set an invalid offset here 558 * Lazy TSS's I/O bitmap copy. We set an invalid offset here
567 * and we let the task to get a GPF in case an I/O instruction 559 * and we let the task to get a GPF in case an I/O instruction
568 * is performed. The handler of the GPF will verify that the 560 * is performed. The handler of the GPF will verify that the
569 * faulting task has a valid I/O bitmap and, it true, does the 561 * faulting task has a valid I/O bitmap and, it true, does the
570 * real copy and restart the instruction. This will save us 562 * real copy and restart the instruction. This will save us
571 * redundant copies when the currently switched task does not 563 * redundant copies when the currently switched task does not
572 * perform any I/O during its timeslice. 564 * perform any I/O during its timeslice.
573 */ 565 */
574 tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; 566 tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
575 } 567 }
576 568
577 /* 569 /*
578 * This function selects if the context switch from prev to next 570 * This function selects if the context switch from prev to next
579 * has to tweak the TSC disable bit in the cr4. 571 * has to tweak the TSC disable bit in the cr4.
580 */ 572 */
581 static inline void disable_tsc(struct task_struct *prev_p, 573 static inline void disable_tsc(struct task_struct *prev_p,
582 struct task_struct *next_p) 574 struct task_struct *next_p)
583 { 575 {
584 struct thread_info *prev, *next; 576 struct thread_info *prev, *next;
585 577
586 /* 578 /*
587 * gcc should eliminate the ->thread_info dereference if 579 * gcc should eliminate the ->thread_info dereference if
588 * has_secure_computing returns 0 at compile time (SECCOMP=n). 580 * has_secure_computing returns 0 at compile time (SECCOMP=n).
589 */ 581 */
590 prev = task_thread_info(prev_p); 582 prev = task_thread_info(prev_p);
591 next = task_thread_info(next_p); 583 next = task_thread_info(next_p);
592 584
593 if (has_secure_computing(prev) || has_secure_computing(next)) { 585 if (has_secure_computing(prev) || has_secure_computing(next)) {
594 /* slow path here */ 586 /* slow path here */
595 if (has_secure_computing(prev) && 587 if (has_secure_computing(prev) &&
596 !has_secure_computing(next)) { 588 !has_secure_computing(next)) {
597 write_cr4(read_cr4() & ~X86_CR4_TSD); 589 write_cr4(read_cr4() & ~X86_CR4_TSD);
598 } else if (!has_secure_computing(prev) && 590 } else if (!has_secure_computing(prev) &&
599 has_secure_computing(next)) 591 has_secure_computing(next))
600 write_cr4(read_cr4() | X86_CR4_TSD); 592 write_cr4(read_cr4() | X86_CR4_TSD);
601 } 593 }
602 } 594 }
603 595
604 /* 596 /*
605 * switch_to(x,yn) should switch tasks from x to y. 597 * switch_to(x,yn) should switch tasks from x to y.
606 * 598 *
607 * We fsave/fwait so that an exception goes off at the right time 599 * We fsave/fwait so that an exception goes off at the right time
608 * (as a call from the fsave or fwait in effect) rather than to 600 * (as a call from the fsave or fwait in effect) rather than to
609 * the wrong process. Lazy FP saving no longer makes any sense 601 * the wrong process. Lazy FP saving no longer makes any sense
610 * with modern CPU's, and this simplifies a lot of things (SMP 602 * with modern CPU's, and this simplifies a lot of things (SMP
611 * and UP become the same). 603 * and UP become the same).
612 * 604 *
613 * NOTE! We used to use the x86 hardware context switching. The 605 * NOTE! We used to use the x86 hardware context switching. The
614 * reason for not using it any more becomes apparent when you 606 * reason for not using it any more becomes apparent when you
615 * try to recover gracefully from saved state that is no longer 607 * try to recover gracefully from saved state that is no longer
616 * valid (stale segment register values in particular). With the 608 * valid (stale segment register values in particular). With the
617 * hardware task-switch, there is no way to fix up bad state in 609 * hardware task-switch, there is no way to fix up bad state in
618 * a reasonable manner. 610 * a reasonable manner.
619 * 611 *
620 * The fact that Intel documents the hardware task-switching to 612 * The fact that Intel documents the hardware task-switching to
621 * be slow is a fairly red herring - this code is not noticeably 613 * be slow is a fairly red herring - this code is not noticeably
622 * faster. However, there _is_ some room for improvement here, 614 * faster. However, there _is_ some room for improvement here,
623 * so the performance issues may eventually be a valid point. 615 * so the performance issues may eventually be a valid point.
624 * More important, however, is the fact that this allows us much 616 * More important, however, is the fact that this allows us much
625 * more flexibility. 617 * more flexibility.
626 * 618 *
627 * The return value (in %eax) will be the "prev" task after 619 * The return value (in %eax) will be the "prev" task after
628 * the task-switch, and shows up in ret_from_fork in entry.S, 620 * the task-switch, and shows up in ret_from_fork in entry.S,
629 * for example. 621 * for example.
630 */ 622 */
631 struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) 623 struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
632 { 624 {
633 struct thread_struct *prev = &prev_p->thread, 625 struct thread_struct *prev = &prev_p->thread,
634 *next = &next_p->thread; 626 *next = &next_p->thread;
635 int cpu = smp_processor_id(); 627 int cpu = smp_processor_id();
636 struct tss_struct *tss = &per_cpu(init_tss, cpu); 628 struct tss_struct *tss = &per_cpu(init_tss, cpu);
637 629
638 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ 630 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
639 631
640 __unlazy_fpu(prev_p); 632 __unlazy_fpu(prev_p);
641 633
642 /* 634 /*
643 * Reload esp0. 635 * Reload esp0.
644 */ 636 */
645 load_esp0(tss, next); 637 load_esp0(tss, next);
646 638
647 /* 639 /*
648 * Save away %fs and %gs. No need to save %es and %ds, as 640 * Save away %fs and %gs. No need to save %es and %ds, as
649 * those are always kernel segments while inside the kernel. 641 * those are always kernel segments while inside the kernel.
650 * Doing this before setting the new TLS descriptors avoids 642 * Doing this before setting the new TLS descriptors avoids
651 * the situation where we temporarily have non-reloadable 643 * the situation where we temporarily have non-reloadable
652 * segments in %fs and %gs. This could be an issue if the 644 * segments in %fs and %gs. This could be an issue if the
653 * NMI handler ever used %fs or %gs (it does not today), or 645 * NMI handler ever used %fs or %gs (it does not today), or
654 * if the kernel is running inside of a hypervisor layer. 646 * if the kernel is running inside of a hypervisor layer.
655 */ 647 */
656 savesegment(fs, prev->fs); 648 savesegment(fs, prev->fs);
657 savesegment(gs, prev->gs); 649 savesegment(gs, prev->gs);
658 650
659 /* 651 /*
660 * Load the per-thread Thread-Local Storage descriptor. 652 * Load the per-thread Thread-Local Storage descriptor.
661 */ 653 */
662 load_TLS(next, cpu); 654 load_TLS(next, cpu);
663 655
664 /* 656 /*
665 * Restore %fs and %gs if needed. 657 * Restore %fs and %gs if needed.
666 * 658 *
667 * Glibc normally makes %fs be zero, and %gs is one of 659 * Glibc normally makes %fs be zero, and %gs is one of
668 * the TLS segments. 660 * the TLS segments.
669 */ 661 */
670 if (unlikely(prev->fs | next->fs)) 662 if (unlikely(prev->fs | next->fs))
671 loadsegment(fs, next->fs); 663 loadsegment(fs, next->fs);
672 664
673 if (prev->gs | next->gs) 665 if (prev->gs | next->gs)
674 loadsegment(gs, next->gs); 666 loadsegment(gs, next->gs);
675 667
676 /* 668 /*
677 * Restore IOPL if needed. 669 * Restore IOPL if needed.
678 */ 670 */
679 if (unlikely(prev->iopl != next->iopl)) 671 if (unlikely(prev->iopl != next->iopl))
680 set_iopl_mask(next->iopl); 672 set_iopl_mask(next->iopl);
681 673
682 /* 674 /*
683 * Now maybe reload the debug registers 675 * Now maybe reload the debug registers
684 */ 676 */
685 if (unlikely(next->debugreg[7])) { 677 if (unlikely(next->debugreg[7])) {
686 set_debugreg(next->debugreg[0], 0); 678 set_debugreg(next->debugreg[0], 0);
687 set_debugreg(next->debugreg[1], 1); 679 set_debugreg(next->debugreg[1], 1);
688 set_debugreg(next->debugreg[2], 2); 680 set_debugreg(next->debugreg[2], 2);
689 set_debugreg(next->debugreg[3], 3); 681 set_debugreg(next->debugreg[3], 3);
690 /* no 4 and 5 */ 682 /* no 4 and 5 */
691 set_debugreg(next->debugreg[6], 6); 683 set_debugreg(next->debugreg[6], 6);
692 set_debugreg(next->debugreg[7], 7); 684 set_debugreg(next->debugreg[7], 7);
693 } 685 }
694 686
695 if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) 687 if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr))
696 handle_io_bitmap(next, tss); 688 handle_io_bitmap(next, tss);
697 689
698 disable_tsc(prev_p, next_p); 690 disable_tsc(prev_p, next_p);
699 691
700 return prev_p; 692 return prev_p;
701 } 693 }
702 694
703 asmlinkage int sys_fork(struct pt_regs regs) 695 asmlinkage int sys_fork(struct pt_regs regs)
704 { 696 {
705 return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL); 697 return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
706 } 698 }
707 699
708 asmlinkage int sys_clone(struct pt_regs regs) 700 asmlinkage int sys_clone(struct pt_regs regs)
709 { 701 {
710 unsigned long clone_flags; 702 unsigned long clone_flags;
711 unsigned long newsp; 703 unsigned long newsp;
712 int __user *parent_tidptr, *child_tidptr; 704 int __user *parent_tidptr, *child_tidptr;
713 705
714 clone_flags = regs.ebx; 706 clone_flags = regs.ebx;
715 newsp = regs.ecx; 707 newsp = regs.ecx;
716 parent_tidptr = (int __user *)regs.edx; 708 parent_tidptr = (int __user *)regs.edx;
717 child_tidptr = (int __user *)regs.edi; 709 child_tidptr = (int __user *)regs.edi;
718 if (!newsp) 710 if (!newsp)
719 newsp = regs.esp; 711 newsp = regs.esp;
720 return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr); 712 return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
721 } 713 }
722 714
723 /* 715 /*
724 * This is trivial, and on the face of it looks like it 716 * This is trivial, and on the face of it looks like it
725 * could equally well be done in user mode. 717 * could equally well be done in user mode.
726 * 718 *
727 * Not so, for quite unobvious reasons - register pressure. 719 * Not so, for quite unobvious reasons - register pressure.
728 * In user mode vfork() cannot have a stack frame, and if 720 * In user mode vfork() cannot have a stack frame, and if
729 * done by calling the "clone()" system call directly, you 721 * done by calling the "clone()" system call directly, you
730 * do not have enough call-clobbered registers to hold all 722 * do not have enough call-clobbered registers to hold all
731 * the information you need. 723 * the information you need.
732 */ 724 */
733 asmlinkage int sys_vfork(struct pt_regs regs) 725 asmlinkage int sys_vfork(struct pt_regs regs)
734 { 726 {
735 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL); 727 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
736 } 728 }
737 729
738 /* 730 /*
739 * sys_execve() executes a new program. 731 * sys_execve() executes a new program.
740 */ 732 */
741 asmlinkage int sys_execve(struct pt_regs regs) 733 asmlinkage int sys_execve(struct pt_regs regs)
742 { 734 {
743 int error; 735 int error;
744 char * filename; 736 char * filename;
745 737
746 filename = getname((char __user *) regs.ebx); 738 filename = getname((char __user *) regs.ebx);
747 error = PTR_ERR(filename); 739 error = PTR_ERR(filename);
748 if (IS_ERR(filename)) 740 if (IS_ERR(filename))
749 goto out; 741 goto out;
750 error = do_execve(filename, 742 error = do_execve(filename,
751 (char __user * __user *) regs.ecx, 743 (char __user * __user *) regs.ecx,
752 (char __user * __user *) regs.edx, 744 (char __user * __user *) regs.edx,
753 &regs); 745 &regs);
754 if (error == 0) { 746 if (error == 0) {
755 task_lock(current); 747 task_lock(current);
756 current->ptrace &= ~PT_DTRACE; 748 current->ptrace &= ~PT_DTRACE;
757 task_unlock(current); 749 task_unlock(current);
758 /* Make sure we don't return using sysenter.. */ 750 /* Make sure we don't return using sysenter.. */
759 set_thread_flag(TIF_IRET); 751 set_thread_flag(TIF_IRET);
760 } 752 }
761 putname(filename); 753 putname(filename);
762 out: 754 out:
763 return error; 755 return error;
764 } 756 }
765 757
766 #define top_esp (THREAD_SIZE - sizeof(unsigned long)) 758 #define top_esp (THREAD_SIZE - sizeof(unsigned long))
767 #define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) 759 #define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long))
768 760
769 unsigned long get_wchan(struct task_struct *p) 761 unsigned long get_wchan(struct task_struct *p)
770 { 762 {
771 unsigned long ebp, esp, eip; 763 unsigned long ebp, esp, eip;
772 unsigned long stack_page; 764 unsigned long stack_page;
773 int count = 0; 765 int count = 0;
774 if (!p || p == current || p->state == TASK_RUNNING) 766 if (!p || p == current || p->state == TASK_RUNNING)
775 return 0; 767 return 0;
776 stack_page = (unsigned long)task_stack_page(p); 768 stack_page = (unsigned long)task_stack_page(p);
777 esp = p->thread.esp; 769 esp = p->thread.esp;
778 if (!stack_page || esp < stack_page || esp > top_esp+stack_page) 770 if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
779 return 0; 771 return 0;
780 /* include/asm-i386/system.h:switch_to() pushes ebp last. */ 772 /* include/asm-i386/system.h:switch_to() pushes ebp last. */
781 ebp = *(unsigned long *) esp; 773 ebp = *(unsigned long *) esp;
782 do { 774 do {
783 if (ebp < stack_page || ebp > top_ebp+stack_page) 775 if (ebp < stack_page || ebp > top_ebp+stack_page)
784 return 0; 776 return 0;
785 eip = *(unsigned long *) (ebp+4); 777 eip = *(unsigned long *) (ebp+4);
786 if (!in_sched_functions(eip)) 778 if (!in_sched_functions(eip))
787 return eip; 779 return eip;
788 ebp = *(unsigned long *) ebp; 780 ebp = *(unsigned long *) ebp;
789 } while (count++ < 16); 781 } while (count++ < 16);
790 return 0; 782 return 0;
791 } 783 }
792 EXPORT_SYMBOL(get_wchan); 784 EXPORT_SYMBOL(get_wchan);
793 785
794 /* 786 /*
795 * sys_alloc_thread_area: get a yet unused TLS descriptor index. 787 * sys_alloc_thread_area: get a yet unused TLS descriptor index.
796 */ 788 */
797 static int get_free_idx(void) 789 static int get_free_idx(void)
798 { 790 {
799 struct thread_struct *t = &current->thread; 791 struct thread_struct *t = &current->thread;
800 int idx; 792 int idx;
801 793
802 for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) 794 for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
803 if (desc_empty(t->tls_array + idx)) 795 if (desc_empty(t->tls_array + idx))
804 return idx + GDT_ENTRY_TLS_MIN; 796 return idx + GDT_ENTRY_TLS_MIN;
805 return -ESRCH; 797 return -ESRCH;
806 } 798 }
807 799
808 /* 800 /*
809 * Set a given TLS descriptor: 801 * Set a given TLS descriptor:
810 */ 802 */
811 asmlinkage int sys_set_thread_area(struct user_desc __user *u_info) 803 asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
812 { 804 {
813 struct thread_struct *t = &current->thread; 805 struct thread_struct *t = &current->thread;
814 struct user_desc info; 806 struct user_desc info;
815 struct desc_struct *desc; 807 struct desc_struct *desc;
816 int cpu, idx; 808 int cpu, idx;
817 809
818 if (copy_from_user(&info, u_info, sizeof(info))) 810 if (copy_from_user(&info, u_info, sizeof(info)))
819 return -EFAULT; 811 return -EFAULT;
820 idx = info.entry_number; 812 idx = info.entry_number;
821 813
822 /* 814 /*
823 * index -1 means the kernel should try to find and 815 * index -1 means the kernel should try to find and
824 * allocate an empty descriptor: 816 * allocate an empty descriptor:
825 */ 817 */
826 if (idx == -1) { 818 if (idx == -1) {
827 idx = get_free_idx(); 819 idx = get_free_idx();
828 if (idx < 0) 820 if (idx < 0)
829 return idx; 821 return idx;
830 if (put_user(idx, &u_info->entry_number)) 822 if (put_user(idx, &u_info->entry_number))
831 return -EFAULT; 823 return -EFAULT;
832 } 824 }
833 825
834 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) 826 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
835 return -EINVAL; 827 return -EINVAL;
836 828
837 desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN; 829 desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
838 830
839 /* 831 /*
840 * We must not get preempted while modifying the TLS. 832 * We must not get preempted while modifying the TLS.
841 */ 833 */
842 cpu = get_cpu(); 834 cpu = get_cpu();
843 835
844 if (LDT_empty(&info)) { 836 if (LDT_empty(&info)) {
845 desc->a = 0; 837 desc->a = 0;
846 desc->b = 0; 838 desc->b = 0;
847 } else { 839 } else {
848 desc->a = LDT_entry_a(&info); 840 desc->a = LDT_entry_a(&info);
849 desc->b = LDT_entry_b(&info); 841 desc->b = LDT_entry_b(&info);
850 } 842 }
851 load_TLS(t, cpu); 843 load_TLS(t, cpu);
852 844
853 put_cpu(); 845 put_cpu();
854 846
855 return 0; 847 return 0;
856 } 848 }
857 849
858 /* 850 /*
859 * Get the current Thread-Local Storage area: 851 * Get the current Thread-Local Storage area:
860 */ 852 */
861 853
862 #define GET_BASE(desc) ( \ 854 #define GET_BASE(desc) ( \
863 (((desc)->a >> 16) & 0x0000ffff) | \ 855 (((desc)->a >> 16) & 0x0000ffff) | \
864 (((desc)->b << 16) & 0x00ff0000) | \ 856 (((desc)->b << 16) & 0x00ff0000) | \
865 ( (desc)->b & 0xff000000) ) 857 ( (desc)->b & 0xff000000) )
866 858
867 #define GET_LIMIT(desc) ( \ 859 #define GET_LIMIT(desc) ( \
868 ((desc)->a & 0x0ffff) | \ 860 ((desc)->a & 0x0ffff) | \
869 ((desc)->b & 0xf0000) ) 861 ((desc)->b & 0xf0000) )
870 862
871 #define GET_32BIT(desc) (((desc)->b >> 22) & 1) 863 #define GET_32BIT(desc) (((desc)->b >> 22) & 1)
872 #define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) 864 #define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
873 #define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) 865 #define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
874 #define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) 866 #define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
875 #define GET_PRESENT(desc) (((desc)->b >> 15) & 1) 867 #define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
876 #define GET_USEABLE(desc) (((desc)->b >> 20) & 1) 868 #define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
877 869
878 asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) 870 asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
879 { 871 {
880 struct user_desc info; 872 struct user_desc info;
881 struct desc_struct *desc; 873 struct desc_struct *desc;
882 int idx; 874 int idx;
883 875
884 if (get_user(idx, &u_info->entry_number)) 876 if (get_user(idx, &u_info->entry_number))
885 return -EFAULT; 877 return -EFAULT;
886 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) 878 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
887 return -EINVAL; 879 return -EINVAL;
888 880
889 memset(&info, 0, sizeof(info)); 881 memset(&info, 0, sizeof(info));
890 882
891 desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; 883 desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
892 884
893 info.entry_number = idx; 885 info.entry_number = idx;
894 info.base_addr = GET_BASE(desc); 886 info.base_addr = GET_BASE(desc);
895 info.limit = GET_LIMIT(desc); 887 info.limit = GET_LIMIT(desc);
896 info.seg_32bit = GET_32BIT(desc); 888 info.seg_32bit = GET_32BIT(desc);
897 info.contents = GET_CONTENTS(desc); 889 info.contents = GET_CONTENTS(desc);
898 info.read_exec_only = !GET_WRITABLE(desc); 890 info.read_exec_only = !GET_WRITABLE(desc);
899 info.limit_in_pages = GET_LIMIT_PAGES(desc); 891 info.limit_in_pages = GET_LIMIT_PAGES(desc);
900 info.seg_not_present = !GET_PRESENT(desc); 892 info.seg_not_present = !GET_PRESENT(desc);
901 info.useable = GET_USEABLE(desc); 893 info.useable = GET_USEABLE(desc);
902 894
903 if (copy_to_user(u_info, &info, sizeof(info))) 895 if (copy_to_user(u_info, &info, sizeof(info)))
904 return -EFAULT; 896 return -EFAULT;
905 return 0; 897 return 0;
906 } 898 }
907 899
908 unsigned long arch_align_stack(unsigned long sp) 900 unsigned long arch_align_stack(unsigned long sp)
909 { 901 {
910 if (randomize_va_space) 902 if (randomize_va_space)
911 sp -= get_random_int() % 8192; 903 sp -= get_random_int() % 8192;
912 return sp & ~0xf; 904 return sp & ~0xf;
913 } 905 }
914 906
arch/ia64/kernel/process.c
1 /* 1 /*
2 * Architecture-specific setup. 2 * Architecture-specific setup.
3 * 3 *
4 * Copyright (C) 1998-2003 Hewlett-Packard Co 4 * Copyright (C) 1998-2003 Hewlett-Packard Co
5 * David Mosberger-Tang <davidm@hpl.hp.com> 5 * David Mosberger-Tang <davidm@hpl.hp.com>
6 * 04/11/17 Ashok Raj <ashok.raj@intel.com> Added CPU Hotplug Support 6 * 04/11/17 Ashok Raj <ashok.raj@intel.com> Added CPU Hotplug Support
7 * 7 *
8 * 2005-10-07 Keith Owens <kaos@sgi.com> 8 * 2005-10-07 Keith Owens <kaos@sgi.com>
9 * Add notify_die() hooks. 9 * Add notify_die() hooks.
10 */ 10 */
11 #define __KERNEL_SYSCALLS__ /* see <asm/unistd.h> */ 11 #define __KERNEL_SYSCALLS__ /* see <asm/unistd.h> */
12 #include <linux/config.h> 12 #include <linux/config.h>
13 13
14 #include <linux/cpu.h> 14 #include <linux/cpu.h>
15 #include <linux/pm.h> 15 #include <linux/pm.h>
16 #include <linux/elf.h> 16 #include <linux/elf.h>
17 #include <linux/errno.h> 17 #include <linux/errno.h>
18 #include <linux/kallsyms.h> 18 #include <linux/kallsyms.h>
19 #include <linux/kernel.h> 19 #include <linux/kernel.h>
20 #include <linux/mm.h> 20 #include <linux/mm.h>
21 #include <linux/module.h> 21 #include <linux/module.h>
22 #include <linux/notifier.h> 22 #include <linux/notifier.h>
23 #include <linux/personality.h> 23 #include <linux/personality.h>
24 #include <linux/sched.h> 24 #include <linux/sched.h>
25 #include <linux/slab.h> 25 #include <linux/slab.h>
26 #include <linux/smp_lock.h> 26 #include <linux/smp_lock.h>
27 #include <linux/stddef.h> 27 #include <linux/stddef.h>
28 #include <linux/thread_info.h> 28 #include <linux/thread_info.h>
29 #include <linux/unistd.h> 29 #include <linux/unistd.h>
30 #include <linux/efi.h> 30 #include <linux/efi.h>
31 #include <linux/interrupt.h> 31 #include <linux/interrupt.h>
32 #include <linux/delay.h> 32 #include <linux/delay.h>
33 #include <linux/kprobes.h>
34 33
35 #include <asm/cpu.h> 34 #include <asm/cpu.h>
36 #include <asm/delay.h> 35 #include <asm/delay.h>
37 #include <asm/elf.h> 36 #include <asm/elf.h>
38 #include <asm/ia32.h> 37 #include <asm/ia32.h>
39 #include <asm/irq.h> 38 #include <asm/irq.h>
40 #include <asm/kdebug.h> 39 #include <asm/kdebug.h>
41 #include <asm/pgalloc.h> 40 #include <asm/pgalloc.h>
42 #include <asm/processor.h> 41 #include <asm/processor.h>
43 #include <asm/sal.h> 42 #include <asm/sal.h>
44 #include <asm/tlbflush.h> 43 #include <asm/tlbflush.h>
45 #include <asm/uaccess.h> 44 #include <asm/uaccess.h>
46 #include <asm/unwind.h> 45 #include <asm/unwind.h>
47 #include <asm/user.h> 46 #include <asm/user.h>
48 47
49 #include "entry.h" 48 #include "entry.h"
50 49
51 #ifdef CONFIG_PERFMON 50 #ifdef CONFIG_PERFMON
52 # include <asm/perfmon.h> 51 # include <asm/perfmon.h>
53 #endif 52 #endif
54 53
55 #include "sigframe.h" 54 #include "sigframe.h"
56 55
57 void (*ia64_mark_idle)(int); 56 void (*ia64_mark_idle)(int);
58 static DEFINE_PER_CPU(unsigned int, cpu_idle_state); 57 static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
59 58
60 unsigned long boot_option_idle_override = 0; 59 unsigned long boot_option_idle_override = 0;
61 EXPORT_SYMBOL(boot_option_idle_override); 60 EXPORT_SYMBOL(boot_option_idle_override);
62 61
63 void 62 void
64 ia64_do_show_stack (struct unw_frame_info *info, void *arg) 63 ia64_do_show_stack (struct unw_frame_info *info, void *arg)
65 { 64 {
66 unsigned long ip, sp, bsp; 65 unsigned long ip, sp, bsp;
67 char buf[128]; /* don't make it so big that it overflows the stack! */ 66 char buf[128]; /* don't make it so big that it overflows the stack! */
68 67
69 printk("\nCall Trace:\n"); 68 printk("\nCall Trace:\n");
70 do { 69 do {
71 unw_get_ip(info, &ip); 70 unw_get_ip(info, &ip);
72 if (ip == 0) 71 if (ip == 0)
73 break; 72 break;
74 73
75 unw_get_sp(info, &sp); 74 unw_get_sp(info, &sp);
76 unw_get_bsp(info, &bsp); 75 unw_get_bsp(info, &bsp);
77 snprintf(buf, sizeof(buf), 76 snprintf(buf, sizeof(buf),
78 " [<%016lx>] %%s\n" 77 " [<%016lx>] %%s\n"
79 " sp=%016lx bsp=%016lx\n", 78 " sp=%016lx bsp=%016lx\n",
80 ip, sp, bsp); 79 ip, sp, bsp);
81 print_symbol(buf, ip); 80 print_symbol(buf, ip);
82 } while (unw_unwind(info) >= 0); 81 } while (unw_unwind(info) >= 0);
83 } 82 }
84 83
85 void 84 void
86 show_stack (struct task_struct *task, unsigned long *sp) 85 show_stack (struct task_struct *task, unsigned long *sp)
87 { 86 {
88 if (!task) 87 if (!task)
89 unw_init_running(ia64_do_show_stack, NULL); 88 unw_init_running(ia64_do_show_stack, NULL);
90 else { 89 else {
91 struct unw_frame_info info; 90 struct unw_frame_info info;
92 91
93 unw_init_from_blocked_task(&info, task); 92 unw_init_from_blocked_task(&info, task);
94 ia64_do_show_stack(&info, NULL); 93 ia64_do_show_stack(&info, NULL);
95 } 94 }
96 } 95 }
97 96
98 void 97 void
99 dump_stack (void) 98 dump_stack (void)
100 { 99 {
101 show_stack(NULL, NULL); 100 show_stack(NULL, NULL);
102 } 101 }
103 102
104 EXPORT_SYMBOL(dump_stack); 103 EXPORT_SYMBOL(dump_stack);
105 104
106 void 105 void
107 show_regs (struct pt_regs *regs) 106 show_regs (struct pt_regs *regs)
108 { 107 {
109 unsigned long ip = regs->cr_iip + ia64_psr(regs)->ri; 108 unsigned long ip = regs->cr_iip + ia64_psr(regs)->ri;
110 109
111 print_modules(); 110 print_modules();
112 printk("\nPid: %d, CPU %d, comm: %20s\n", current->pid, smp_processor_id(), current->comm); 111 printk("\nPid: %d, CPU %d, comm: %20s\n", current->pid, smp_processor_id(), current->comm);
113 printk("psr : %016lx ifs : %016lx ip : [<%016lx>] %s\n", 112 printk("psr : %016lx ifs : %016lx ip : [<%016lx>] %s\n",
114 regs->cr_ipsr, regs->cr_ifs, ip, print_tainted()); 113 regs->cr_ipsr, regs->cr_ifs, ip, print_tainted());
115 print_symbol("ip is at %s\n", ip); 114 print_symbol("ip is at %s\n", ip);
116 printk("unat: %016lx pfs : %016lx rsc : %016lx\n", 115 printk("unat: %016lx pfs : %016lx rsc : %016lx\n",
117 regs->ar_unat, regs->ar_pfs, regs->ar_rsc); 116 regs->ar_unat, regs->ar_pfs, regs->ar_rsc);
118 printk("rnat: %016lx bsps: %016lx pr : %016lx\n", 117 printk("rnat: %016lx bsps: %016lx pr : %016lx\n",
119 regs->ar_rnat, regs->ar_bspstore, regs->pr); 118 regs->ar_rnat, regs->ar_bspstore, regs->pr);
120 printk("ldrs: %016lx ccv : %016lx fpsr: %016lx\n", 119 printk("ldrs: %016lx ccv : %016lx fpsr: %016lx\n",
121 regs->loadrs, regs->ar_ccv, regs->ar_fpsr); 120 regs->loadrs, regs->ar_ccv, regs->ar_fpsr);
122 printk("csd : %016lx ssd : %016lx\n", regs->ar_csd, regs->ar_ssd); 121 printk("csd : %016lx ssd : %016lx\n", regs->ar_csd, regs->ar_ssd);
123 printk("b0 : %016lx b6 : %016lx b7 : %016lx\n", regs->b0, regs->b6, regs->b7); 122 printk("b0 : %016lx b6 : %016lx b7 : %016lx\n", regs->b0, regs->b6, regs->b7);
124 printk("f6 : %05lx%016lx f7 : %05lx%016lx\n", 123 printk("f6 : %05lx%016lx f7 : %05lx%016lx\n",
125 regs->f6.u.bits[1], regs->f6.u.bits[0], 124 regs->f6.u.bits[1], regs->f6.u.bits[0],
126 regs->f7.u.bits[1], regs->f7.u.bits[0]); 125 regs->f7.u.bits[1], regs->f7.u.bits[0]);
127 printk("f8 : %05lx%016lx f9 : %05lx%016lx\n", 126 printk("f8 : %05lx%016lx f9 : %05lx%016lx\n",
128 regs->f8.u.bits[1], regs->f8.u.bits[0], 127 regs->f8.u.bits[1], regs->f8.u.bits[0],
129 regs->f9.u.bits[1], regs->f9.u.bits[0]); 128 regs->f9.u.bits[1], regs->f9.u.bits[0]);
130 printk("f10 : %05lx%016lx f11 : %05lx%016lx\n", 129 printk("f10 : %05lx%016lx f11 : %05lx%016lx\n",
131 regs->f10.u.bits[1], regs->f10.u.bits[0], 130 regs->f10.u.bits[1], regs->f10.u.bits[0],
132 regs->f11.u.bits[1], regs->f11.u.bits[0]); 131 regs->f11.u.bits[1], regs->f11.u.bits[0]);
133 132
134 printk("r1 : %016lx r2 : %016lx r3 : %016lx\n", regs->r1, regs->r2, regs->r3); 133 printk("r1 : %016lx r2 : %016lx r3 : %016lx\n", regs->r1, regs->r2, regs->r3);
135 printk("r8 : %016lx r9 : %016lx r10 : %016lx\n", regs->r8, regs->r9, regs->r10); 134 printk("r8 : %016lx r9 : %016lx r10 : %016lx\n", regs->r8, regs->r9, regs->r10);
136 printk("r11 : %016lx r12 : %016lx r13 : %016lx\n", regs->r11, regs->r12, regs->r13); 135 printk("r11 : %016lx r12 : %016lx r13 : %016lx\n", regs->r11, regs->r12, regs->r13);
137 printk("r14 : %016lx r15 : %016lx r16 : %016lx\n", regs->r14, regs->r15, regs->r16); 136 printk("r14 : %016lx r15 : %016lx r16 : %016lx\n", regs->r14, regs->r15, regs->r16);
138 printk("r17 : %016lx r18 : %016lx r19 : %016lx\n", regs->r17, regs->r18, regs->r19); 137 printk("r17 : %016lx r18 : %016lx r19 : %016lx\n", regs->r17, regs->r18, regs->r19);
139 printk("r20 : %016lx r21 : %016lx r22 : %016lx\n", regs->r20, regs->r21, regs->r22); 138 printk("r20 : %016lx r21 : %016lx r22 : %016lx\n", regs->r20, regs->r21, regs->r22);
140 printk("r23 : %016lx r24 : %016lx r25 : %016lx\n", regs->r23, regs->r24, regs->r25); 139 printk("r23 : %016lx r24 : %016lx r25 : %016lx\n", regs->r23, regs->r24, regs->r25);
141 printk("r26 : %016lx r27 : %016lx r28 : %016lx\n", regs->r26, regs->r27, regs->r28); 140 printk("r26 : %016lx r27 : %016lx r28 : %016lx\n", regs->r26, regs->r27, regs->r28);
142 printk("r29 : %016lx r30 : %016lx r31 : %016lx\n", regs->r29, regs->r30, regs->r31); 141 printk("r29 : %016lx r30 : %016lx r31 : %016lx\n", regs->r29, regs->r30, regs->r31);
143 142
144 if (user_mode(regs)) { 143 if (user_mode(regs)) {
145 /* print the stacked registers */ 144 /* print the stacked registers */
146 unsigned long val, *bsp, ndirty; 145 unsigned long val, *bsp, ndirty;
147 int i, sof, is_nat = 0; 146 int i, sof, is_nat = 0;
148 147
149 sof = regs->cr_ifs & 0x7f; /* size of frame */ 148 sof = regs->cr_ifs & 0x7f; /* size of frame */
150 ndirty = (regs->loadrs >> 19); 149 ndirty = (regs->loadrs >> 19);
151 bsp = ia64_rse_skip_regs((unsigned long *) regs->ar_bspstore, ndirty); 150 bsp = ia64_rse_skip_regs((unsigned long *) regs->ar_bspstore, ndirty);
152 for (i = 0; i < sof; ++i) { 151 for (i = 0; i < sof; ++i) {
153 get_user(val, (unsigned long __user *) ia64_rse_skip_regs(bsp, i)); 152 get_user(val, (unsigned long __user *) ia64_rse_skip_regs(bsp, i));
154 printk("r%-3u:%c%016lx%s", 32 + i, is_nat ? '*' : ' ', val, 153 printk("r%-3u:%c%016lx%s", 32 + i, is_nat ? '*' : ' ', val,
155 ((i == sof - 1) || (i % 3) == 2) ? "\n" : " "); 154 ((i == sof - 1) || (i % 3) == 2) ? "\n" : " ");
156 } 155 }
157 } else 156 } else
158 show_stack(NULL, NULL); 157 show_stack(NULL, NULL);
159 } 158 }
160 159
161 void 160 void
162 do_notify_resume_user (sigset_t *oldset, struct sigscratch *scr, long in_syscall) 161 do_notify_resume_user (sigset_t *oldset, struct sigscratch *scr, long in_syscall)
163 { 162 {
164 if (fsys_mode(current, &scr->pt)) { 163 if (fsys_mode(current, &scr->pt)) {
165 /* defer signal-handling etc. until we return to privilege-level 0. */ 164 /* defer signal-handling etc. until we return to privilege-level 0. */
166 if (!ia64_psr(&scr->pt)->lp) 165 if (!ia64_psr(&scr->pt)->lp)
167 ia64_psr(&scr->pt)->lp = 1; 166 ia64_psr(&scr->pt)->lp = 1;
168 return; 167 return;
169 } 168 }
170 169
171 #ifdef CONFIG_PERFMON 170 #ifdef CONFIG_PERFMON
172 if (current->thread.pfm_needs_checking) 171 if (current->thread.pfm_needs_checking)
173 pfm_handle_work(); 172 pfm_handle_work();
174 #endif 173 #endif
175 174
176 /* deal with pending signal delivery */ 175 /* deal with pending signal delivery */
177 if (test_thread_flag(TIF_SIGPENDING)) 176 if (test_thread_flag(TIF_SIGPENDING))
178 ia64_do_signal(oldset, scr, in_syscall); 177 ia64_do_signal(oldset, scr, in_syscall);
179 } 178 }
180 179
181 static int pal_halt = 1; 180 static int pal_halt = 1;
182 static int can_do_pal_halt = 1; 181 static int can_do_pal_halt = 1;
183 182
184 static int __init nohalt_setup(char * str) 183 static int __init nohalt_setup(char * str)
185 { 184 {
186 pal_halt = can_do_pal_halt = 0; 185 pal_halt = can_do_pal_halt = 0;
187 return 1; 186 return 1;
188 } 187 }
189 __setup("nohalt", nohalt_setup); 188 __setup("nohalt", nohalt_setup);
190 189
191 void 190 void
192 update_pal_halt_status(int status) 191 update_pal_halt_status(int status)
193 { 192 {
194 can_do_pal_halt = pal_halt && status; 193 can_do_pal_halt = pal_halt && status;
195 } 194 }
196 195
197 /* 196 /*
198 * We use this if we don't have any better idle routine. 197 * We use this if we don't have any better idle routine.
199 */ 198 */
200 void 199 void
201 default_idle (void) 200 default_idle (void)
202 { 201 {
203 local_irq_enable(); 202 local_irq_enable();
204 while (!need_resched()) { 203 while (!need_resched()) {
205 if (can_do_pal_halt) 204 if (can_do_pal_halt)
206 safe_halt(); 205 safe_halt();
207 else 206 else
208 cpu_relax(); 207 cpu_relax();
209 } 208 }
210 } 209 }
211 210
212 #ifdef CONFIG_HOTPLUG_CPU 211 #ifdef CONFIG_HOTPLUG_CPU
213 /* We don't actually take the CPU down, just spin without interrupts. */ 212 /* We don't actually take the CPU down, just spin without interrupts. */
214 static inline void play_dead(void) 213 static inline void play_dead(void)
215 { 214 {
216 extern void ia64_cpu_local_tick (void); 215 extern void ia64_cpu_local_tick (void);
217 unsigned int this_cpu = smp_processor_id(); 216 unsigned int this_cpu = smp_processor_id();
218 217
219 /* Ack it */ 218 /* Ack it */
220 __get_cpu_var(cpu_state) = CPU_DEAD; 219 __get_cpu_var(cpu_state) = CPU_DEAD;
221 220
222 max_xtp(); 221 max_xtp();
223 local_irq_disable(); 222 local_irq_disable();
224 idle_task_exit(); 223 idle_task_exit();
225 ia64_jump_to_sal(&sal_boot_rendez_state[this_cpu]); 224 ia64_jump_to_sal(&sal_boot_rendez_state[this_cpu]);
226 /* 225 /*
227 * The above is a point of no return; the processor is 226 * The above is a point of no return; the processor is
228 * expected to be in the SAL loop now. 227 * expected to be in the SAL loop now.
229 */ 228 */
230 BUG(); 229 BUG();
231 } 230 }
232 #else 231 #else
233 static inline void play_dead(void) 232 static inline void play_dead(void)
234 { 233 {
235 BUG(); 234 BUG();
236 } 235 }
237 #endif /* CONFIG_HOTPLUG_CPU */ 236 #endif /* CONFIG_HOTPLUG_CPU */
238 237
239 void cpu_idle_wait(void) 238 void cpu_idle_wait(void)
240 { 239 {
241 unsigned int cpu, this_cpu = get_cpu(); 240 unsigned int cpu, this_cpu = get_cpu();
242 cpumask_t map; 241 cpumask_t map;
243 242
244 set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); 243 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
245 put_cpu(); 244 put_cpu();
246 245
247 cpus_clear(map); 246 cpus_clear(map);
248 for_each_online_cpu(cpu) { 247 for_each_online_cpu(cpu) {
249 per_cpu(cpu_idle_state, cpu) = 1; 248 per_cpu(cpu_idle_state, cpu) = 1;
250 cpu_set(cpu, map); 249 cpu_set(cpu, map);
251 } 250 }
252 251
253 __get_cpu_var(cpu_idle_state) = 0; 252 __get_cpu_var(cpu_idle_state) = 0;
254 253
255 wmb(); 254 wmb();
256 do { 255 do {
257 ssleep(1); 256 ssleep(1);
258 for_each_online_cpu(cpu) { 257 for_each_online_cpu(cpu) {
259 if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) 258 if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
260 cpu_clear(cpu, map); 259 cpu_clear(cpu, map);
261 } 260 }
262 cpus_and(map, map, cpu_online_map); 261 cpus_and(map, map, cpu_online_map);
263 } while (!cpus_empty(map)); 262 } while (!cpus_empty(map));
264 } 263 }
265 EXPORT_SYMBOL_GPL(cpu_idle_wait); 264 EXPORT_SYMBOL_GPL(cpu_idle_wait);
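
cpu_idle_wait() above lets a caller that swaps out the pm_idle routine wait until every online CPU has taken at least one pass through the idle loop and therefore can no longer be executing the old routine. A minimal caller sketch, assuming a hypothetical replacement routine my_idle() that is not part of this file:

	static void my_idle(void)		/* hypothetical replacement idle routine */
	{
		while (!need_resched())
			cpu_relax();
	}

	static void install_my_idle(void)	/* illustrative only */
	{
		pm_idle = my_idle;		/* publish the new routine ... */
		cpu_idle_wait();		/* ... then wait until no CPU still runs the old one */
	}
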
266 265
267 void __attribute__((noreturn)) 266 void __attribute__((noreturn))
268 cpu_idle (void) 267 cpu_idle (void)
269 { 268 {
270 void (*mark_idle)(int) = ia64_mark_idle; 269 void (*mark_idle)(int) = ia64_mark_idle;
271 int cpu = smp_processor_id(); 270 int cpu = smp_processor_id();
272 271
273 /* endless idle loop with no priority at all */ 272 /* endless idle loop with no priority at all */
274 while (1) { 273 while (1) {
275 if (can_do_pal_halt) 274 if (can_do_pal_halt)
276 clear_thread_flag(TIF_POLLING_NRFLAG); 275 clear_thread_flag(TIF_POLLING_NRFLAG);
277 else 276 else
278 set_thread_flag(TIF_POLLING_NRFLAG); 277 set_thread_flag(TIF_POLLING_NRFLAG);
279 278
280 if (!need_resched()) { 279 if (!need_resched()) {
281 void (*idle)(void); 280 void (*idle)(void);
282 #ifdef CONFIG_SMP 281 #ifdef CONFIG_SMP
283 min_xtp(); 282 min_xtp();
284 #endif 283 #endif
285 if (__get_cpu_var(cpu_idle_state)) 284 if (__get_cpu_var(cpu_idle_state))
286 __get_cpu_var(cpu_idle_state) = 0; 285 __get_cpu_var(cpu_idle_state) = 0;
287 286
288 rmb(); 287 rmb();
289 if (mark_idle) 288 if (mark_idle)
290 (*mark_idle)(1); 289 (*mark_idle)(1);
291 290
292 idle = pm_idle; 291 idle = pm_idle;
293 if (!idle) 292 if (!idle)
294 idle = default_idle; 293 idle = default_idle;
295 (*idle)(); 294 (*idle)();
296 if (mark_idle) 295 if (mark_idle)
297 (*mark_idle)(0); 296 (*mark_idle)(0);
298 #ifdef CONFIG_SMP 297 #ifdef CONFIG_SMP
299 normal_xtp(); 298 normal_xtp();
300 #endif 299 #endif
301 } 300 }
302 preempt_enable_no_resched(); 301 preempt_enable_no_resched();
303 schedule(); 302 schedule();
304 preempt_disable(); 303 preempt_disable();
305 check_pgt_cache(); 304 check_pgt_cache();
306 if (cpu_is_offline(cpu)) 305 if (cpu_is_offline(cpu))
307 play_dead(); 306 play_dead();
308 } 307 }
309 } 308 }
310 309
311 void 310 void
312 ia64_save_extra (struct task_struct *task) 311 ia64_save_extra (struct task_struct *task)
313 { 312 {
314 #ifdef CONFIG_PERFMON 313 #ifdef CONFIG_PERFMON
315 unsigned long info; 314 unsigned long info;
316 #endif 315 #endif
317 316
318 if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0) 317 if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0)
319 ia64_save_debug_regs(&task->thread.dbr[0]); 318 ia64_save_debug_regs(&task->thread.dbr[0]);
320 319
321 #ifdef CONFIG_PERFMON 320 #ifdef CONFIG_PERFMON
322 if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0) 321 if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0)
323 pfm_save_regs(task); 322 pfm_save_regs(task);
324 323
325 info = __get_cpu_var(pfm_syst_info); 324 info = __get_cpu_var(pfm_syst_info);
326 if (info & PFM_CPUINFO_SYST_WIDE) 325 if (info & PFM_CPUINFO_SYST_WIDE)
327 pfm_syst_wide_update_task(task, info, 0); 326 pfm_syst_wide_update_task(task, info, 0);
328 #endif 327 #endif
329 328
330 #ifdef CONFIG_IA32_SUPPORT 329 #ifdef CONFIG_IA32_SUPPORT
331 if (IS_IA32_PROCESS(task_pt_regs(task))) 330 if (IS_IA32_PROCESS(task_pt_regs(task)))
332 ia32_save_state(task); 331 ia32_save_state(task);
333 #endif 332 #endif
334 } 333 }
335 334
336 void 335 void
337 ia64_load_extra (struct task_struct *task) 336 ia64_load_extra (struct task_struct *task)
338 { 337 {
339 #ifdef CONFIG_PERFMON 338 #ifdef CONFIG_PERFMON
340 unsigned long info; 339 unsigned long info;
341 #endif 340 #endif
342 341
343 if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0) 342 if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0)
344 ia64_load_debug_regs(&task->thread.dbr[0]); 343 ia64_load_debug_regs(&task->thread.dbr[0]);
345 344
346 #ifdef CONFIG_PERFMON 345 #ifdef CONFIG_PERFMON
347 if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0) 346 if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0)
348 pfm_load_regs(task); 347 pfm_load_regs(task);
349 348
350 info = __get_cpu_var(pfm_syst_info); 349 info = __get_cpu_var(pfm_syst_info);
351 if (info & PFM_CPUINFO_SYST_WIDE) 350 if (info & PFM_CPUINFO_SYST_WIDE)
352 pfm_syst_wide_update_task(task, info, 1); 351 pfm_syst_wide_update_task(task, info, 1);
353 #endif 352 #endif
354 353
355 #ifdef CONFIG_IA32_SUPPORT 354 #ifdef CONFIG_IA32_SUPPORT
356 if (IS_IA32_PROCESS(task_pt_regs(task))) 355 if (IS_IA32_PROCESS(task_pt_regs(task)))
357 ia32_load_state(task); 356 ia32_load_state(task);
358 #endif 357 #endif
359 } 358 }
360 359
361 /* 360 /*
362 * Copy the state of an ia-64 thread. 361 * Copy the state of an ia-64 thread.
363 * 362 *
364 * We get here through the following call chain: 363 * We get here through the following call chain:
365 * 364 *
366 * from user-level: from kernel: 365 * from user-level: from kernel:
367 * 366 *
368 * <clone syscall> <some kernel call frames> 367 * <clone syscall> <some kernel call frames>
369 * sys_clone : 368 * sys_clone :
370 * do_fork do_fork 369 * do_fork do_fork
371 * copy_thread copy_thread 370 * copy_thread copy_thread
372 * 371 *
373 * This means that the stack layout is as follows: 372 * This means that the stack layout is as follows:
374 * 373 *
375 * +---------------------+ (highest addr) 374 * +---------------------+ (highest addr)
376 * | struct pt_regs | 375 * | struct pt_regs |
377 * +---------------------+ 376 * +---------------------+
378 * | struct switch_stack | 377 * | struct switch_stack |
379 * +---------------------+ 378 * +---------------------+
380 * | | 379 * | |
381 * | memory stack | 380 * | memory stack |
382 * | | <-- sp (lowest addr) 381 * | | <-- sp (lowest addr)
383 * +---------------------+ 382 * +---------------------+
384 * 383 *
385 * Observe that we copy the unat values that are in pt_regs and switch_stack. Spilling an 384 * Observe that we copy the unat values that are in pt_regs and switch_stack. Spilling an
386 * integer to address X causes bit N in ar.unat to be set to the NaT bit of the register, 385 * integer to address X causes bit N in ar.unat to be set to the NaT bit of the register,
387 * with N=(X & 0x1ff)/8. Thus, copying the unat value preserves the NaT bits ONLY if the 386 * with N=(X & 0x1ff)/8. Thus, copying the unat value preserves the NaT bits ONLY if the
388 * pt_regs structure in the parent is congruent to that of the child, modulo 512. Since 387 * pt_regs structure in the parent is congruent to that of the child, modulo 512. Since
389 * the stack is page aligned and the page size is at least 4KB, this is always the case, 388 * the stack is page aligned and the page size is at least 4KB, this is always the case,
390 * so there is nothing to worry about. 389 * so there is nothing to worry about.
391 */ 390 */
392 int 391 int
393 copy_thread (int nr, unsigned long clone_flags, 392 copy_thread (int nr, unsigned long clone_flags,
394 unsigned long user_stack_base, unsigned long user_stack_size, 393 unsigned long user_stack_base, unsigned long user_stack_size,
395 struct task_struct *p, struct pt_regs *regs) 394 struct task_struct *p, struct pt_regs *regs)
396 { 395 {
397 extern char ia64_ret_from_clone, ia32_ret_from_clone; 396 extern char ia64_ret_from_clone, ia32_ret_from_clone;
398 struct switch_stack *child_stack, *stack; 397 struct switch_stack *child_stack, *stack;
399 unsigned long rbs, child_rbs, rbs_size; 398 unsigned long rbs, child_rbs, rbs_size;
400 struct pt_regs *child_ptregs; 399 struct pt_regs *child_ptregs;
401 int retval = 0; 400 int retval = 0;
402 401
403 #ifdef CONFIG_SMP 402 #ifdef CONFIG_SMP
404 /* 403 /*
405 * For SMP idle threads, fork_by_hand() calls do_fork with 404 * For SMP idle threads, fork_by_hand() calls do_fork with
406 * NULL regs. 405 * NULL regs.
407 */ 406 */
408 if (!regs) 407 if (!regs)
409 return 0; 408 return 0;
410 #endif 409 #endif
411 410
412 stack = ((struct switch_stack *) regs) - 1; 411 stack = ((struct switch_stack *) regs) - 1;
413 412
414 child_ptregs = (struct pt_regs *) ((unsigned long) p + IA64_STK_OFFSET) - 1; 413 child_ptregs = (struct pt_regs *) ((unsigned long) p + IA64_STK_OFFSET) - 1;
415 child_stack = (struct switch_stack *) child_ptregs - 1; 414 child_stack = (struct switch_stack *) child_ptregs - 1;
416 415
417 /* copy parent's switch_stack & pt_regs to child: */ 416 /* copy parent's switch_stack & pt_regs to child: */
418 memcpy(child_stack, stack, sizeof(*child_ptregs) + sizeof(*child_stack)); 417 memcpy(child_stack, stack, sizeof(*child_ptregs) + sizeof(*child_stack));
419 418
420 rbs = (unsigned long) current + IA64_RBS_OFFSET; 419 rbs = (unsigned long) current + IA64_RBS_OFFSET;
421 child_rbs = (unsigned long) p + IA64_RBS_OFFSET; 420 child_rbs = (unsigned long) p + IA64_RBS_OFFSET;
422 rbs_size = stack->ar_bspstore - rbs; 421 rbs_size = stack->ar_bspstore - rbs;
423 422
424 /* copy the parent's register backing store to the child: */ 423 /* copy the parent's register backing store to the child: */
425 memcpy((void *) child_rbs, (void *) rbs, rbs_size); 424 memcpy((void *) child_rbs, (void *) rbs, rbs_size);
426 425
427 if (likely(user_mode(child_ptregs))) { 426 if (likely(user_mode(child_ptregs))) {
428 if ((clone_flags & CLONE_SETTLS) && !IS_IA32_PROCESS(regs)) 427 if ((clone_flags & CLONE_SETTLS) && !IS_IA32_PROCESS(regs))
429 child_ptregs->r13 = regs->r16; /* see sys_clone2() in entry.S */ 428 child_ptregs->r13 = regs->r16; /* see sys_clone2() in entry.S */
430 if (user_stack_base) { 429 if (user_stack_base) {
431 child_ptregs->r12 = user_stack_base + user_stack_size - 16; 430 child_ptregs->r12 = user_stack_base + user_stack_size - 16;
432 child_ptregs->ar_bspstore = user_stack_base; 431 child_ptregs->ar_bspstore = user_stack_base;
433 child_ptregs->ar_rnat = 0; 432 child_ptregs->ar_rnat = 0;
434 child_ptregs->loadrs = 0; 433 child_ptregs->loadrs = 0;
435 } 434 }
436 } else { 435 } else {
437 /* 436 /*
438 * Note: we simply preserve the relative position of 437 * Note: we simply preserve the relative position of
439 * the stack pointer here. There is no need to 438 * the stack pointer here. There is no need to
440 * allocate a scratch area here, since that will have 439 * allocate a scratch area here, since that will have
441 * been taken care of by the caller of sys_clone() 440 * been taken care of by the caller of sys_clone()
442 * already. 441 * already.
443 */ 442 */
444 child_ptregs->r12 = (unsigned long) child_ptregs - 16; /* kernel sp */ 443 child_ptregs->r12 = (unsigned long) child_ptregs - 16; /* kernel sp */
445 child_ptregs->r13 = (unsigned long) p; /* set `current' pointer */ 444 child_ptregs->r13 = (unsigned long) p; /* set `current' pointer */
446 } 445 }
447 child_stack->ar_bspstore = child_rbs + rbs_size; 446 child_stack->ar_bspstore = child_rbs + rbs_size;
448 if (IS_IA32_PROCESS(regs)) 447 if (IS_IA32_PROCESS(regs))
449 child_stack->b0 = (unsigned long) &ia32_ret_from_clone; 448 child_stack->b0 = (unsigned long) &ia32_ret_from_clone;
450 else 449 else
451 child_stack->b0 = (unsigned long) &ia64_ret_from_clone; 450 child_stack->b0 = (unsigned long) &ia64_ret_from_clone;
452 451
453 /* copy parts of thread_struct: */ 452 /* copy parts of thread_struct: */
454 p->thread.ksp = (unsigned long) child_stack - 16; 453 p->thread.ksp = (unsigned long) child_stack - 16;
455 454
456 /* stop some PSR bits from being inherited. 455 /* stop some PSR bits from being inherited.
457 * the psr.up/psr.pp bits must be cleared on fork but inherited on execve() 456 * the psr.up/psr.pp bits must be cleared on fork but inherited on execve()
458 * therefore we must specify them explicitly here and not include them in 457 * therefore we must specify them explicitly here and not include them in
459 * IA64_PSR_BITS_TO_CLEAR. 458 * IA64_PSR_BITS_TO_CLEAR.
460 */ 459 */
461 child_ptregs->cr_ipsr = ((child_ptregs->cr_ipsr | IA64_PSR_BITS_TO_SET) 460 child_ptregs->cr_ipsr = ((child_ptregs->cr_ipsr | IA64_PSR_BITS_TO_SET)
462 & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_PP | IA64_PSR_UP)); 461 & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_PP | IA64_PSR_UP));
463 462
464 /* 463 /*
465 * NOTE: The calling convention considers all floating point 464 * NOTE: The calling convention considers all floating point
466 * registers in the high partition (fph) to be scratch. Since 465 * registers in the high partition (fph) to be scratch. Since
467 * the only way to get to this point is through a system call, 466 * the only way to get to this point is through a system call,
468 * we know that the values in fph are all dead. Hence, there 467 * we know that the values in fph are all dead. Hence, there
469 * is no need to inherit the fph state from the parent to the 468 * is no need to inherit the fph state from the parent to the
470 * child and all we have to do is to make sure that 469 * child and all we have to do is to make sure that
471 * IA64_THREAD_FPH_VALID is cleared in the child. 470 * IA64_THREAD_FPH_VALID is cleared in the child.
472 * 471 *
473 * XXX We could push this optimization a bit further by 472 * XXX We could push this optimization a bit further by
474 * clearing IA64_THREAD_FPH_VALID on ANY system call. 473 * clearing IA64_THREAD_FPH_VALID on ANY system call.
475 * However, it's not clear this is worth doing. Also, it 474 * However, it's not clear this is worth doing. Also, it
476 * would be a slight deviation from the normal Linux system 475 * would be a slight deviation from the normal Linux system
477 * call behavior where scratch registers are preserved across 476 * call behavior where scratch registers are preserved across
478 * system calls (unless used by the system call itself). 477 * system calls (unless used by the system call itself).
479 */ 478 */
480 # define THREAD_FLAGS_TO_CLEAR (IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID \ 479 # define THREAD_FLAGS_TO_CLEAR (IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID \
481 | IA64_THREAD_PM_VALID) 480 | IA64_THREAD_PM_VALID)
482 # define THREAD_FLAGS_TO_SET 0 481 # define THREAD_FLAGS_TO_SET 0
483 p->thread.flags = ((current->thread.flags & ~THREAD_FLAGS_TO_CLEAR) 482 p->thread.flags = ((current->thread.flags & ~THREAD_FLAGS_TO_CLEAR)
484 | THREAD_FLAGS_TO_SET); 483 | THREAD_FLAGS_TO_SET);
485 ia64_drop_fpu(p); /* don't pick up stale state from a CPU's fph */ 484 ia64_drop_fpu(p); /* don't pick up stale state from a CPU's fph */
486 #ifdef CONFIG_IA32_SUPPORT 485 #ifdef CONFIG_IA32_SUPPORT
487 /* 486 /*
488 * If we're cloning an IA32 task then save the IA32 extra 487 * If we're cloning an IA32 task then save the IA32 extra
489 * state from the current task to the new task 488 * state from the current task to the new task
490 */ 489 */
491 if (IS_IA32_PROCESS(task_pt_regs(current))) { 490 if (IS_IA32_PROCESS(task_pt_regs(current))) {
492 ia32_save_state(p); 491 ia32_save_state(p);
493 if (clone_flags & CLONE_SETTLS) 492 if (clone_flags & CLONE_SETTLS)
494 retval = ia32_clone_tls(p, child_ptregs); 493 retval = ia32_clone_tls(p, child_ptregs);
495 494
496 /* Copy partially mapped page list */ 495 /* Copy partially mapped page list */
497 if (!retval) 496 if (!retval)
498 retval = ia32_copy_partial_page_list(p, clone_flags); 497 retval = ia32_copy_partial_page_list(p, clone_flags);
499 } 498 }
500 #endif 499 #endif
501 500
502 #ifdef CONFIG_PERFMON 501 #ifdef CONFIG_PERFMON
503 if (current->thread.pfm_context) 502 if (current->thread.pfm_context)
504 pfm_inherit(p, child_ptregs); 503 pfm_inherit(p, child_ptregs);
505 #endif 504 #endif
506 return retval; 505 return retval;
507 } 506 }
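
As a quick check of the NaT-bit bookkeeping described in the comment above copy_thread(): the ar.unat bit for a spill to address X is (X & 0x1ff)/8, so any two addresses that are congruent modulo 512 land on the same bit. The addresses below are made up purely for illustration:

	/* illustrative only: the child's pt_regs is a whole number of pages away
	 * from the parent's, and a page is a multiple of 512 bytes, so every
	 * spill address maps to the same ar.unat bit in parent and child. */
	static void nat_bit_example(void)
	{
		unsigned long parent_spill = 0xa000000000010140UL;	/* made-up address */
		unsigned long child_spill  = parent_spill + 4096;	/* one page away */

		unsigned long parent_bit = (parent_spill & 0x1ff) / 8;	/* (0x140 & 0x1ff)/8 == 40 */
		unsigned long child_bit  = (child_spill  & 0x1ff) / 8;	/* 4096 % 512 == 0, so also 40 */

		BUG_ON(parent_bit != child_bit);
	}
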
508 507
509 static void 508 static void
510 do_copy_task_regs (struct task_struct *task, struct unw_frame_info *info, void *arg) 509 do_copy_task_regs (struct task_struct *task, struct unw_frame_info *info, void *arg)
511 { 510 {
512 unsigned long mask, sp, nat_bits = 0, ip, ar_rnat, urbs_end, cfm; 511 unsigned long mask, sp, nat_bits = 0, ip, ar_rnat, urbs_end, cfm;
513 elf_greg_t *dst = arg; 512 elf_greg_t *dst = arg;
514 struct pt_regs *pt; 513 struct pt_regs *pt;
515 char nat; 514 char nat;
516 int i; 515 int i;
517 516
518 memset(dst, 0, sizeof(elf_gregset_t)); /* don't leak any kernel bits to user-level */ 517 memset(dst, 0, sizeof(elf_gregset_t)); /* don't leak any kernel bits to user-level */
519 518
520 if (unw_unwind_to_user(info) < 0) 519 if (unw_unwind_to_user(info) < 0)
521 return; 520 return;
522 521
523 unw_get_sp(info, &sp); 522 unw_get_sp(info, &sp);
524 pt = (struct pt_regs *) (sp + 16); 523 pt = (struct pt_regs *) (sp + 16);
525 524
526 urbs_end = ia64_get_user_rbs_end(task, pt, &cfm); 525 urbs_end = ia64_get_user_rbs_end(task, pt, &cfm);
527 526
528 if (ia64_sync_user_rbs(task, info->sw, pt->ar_bspstore, urbs_end) < 0) 527 if (ia64_sync_user_rbs(task, info->sw, pt->ar_bspstore, urbs_end) < 0)
529 return; 528 return;
530 529
531 ia64_peek(task, info->sw, urbs_end, (long) ia64_rse_rnat_addr((long *) urbs_end), 530 ia64_peek(task, info->sw, urbs_end, (long) ia64_rse_rnat_addr((long *) urbs_end),
532 &ar_rnat); 531 &ar_rnat);
533 532
534 /* 533 /*
535 * coredump format: 534 * coredump format:
536 * r0-r31 535 * r0-r31
537 * NaT bits (for r0-r31; bit N == 1 iff rN is a NaT) 536 * NaT bits (for r0-r31; bit N == 1 iff rN is a NaT)
538 * predicate registers (p0-p63) 537 * predicate registers (p0-p63)
539 * b0-b7 538 * b0-b7
540 * ip cfm user-mask 539 * ip cfm user-mask
541 * ar.rsc ar.bsp ar.bspstore ar.rnat 540 * ar.rsc ar.bsp ar.bspstore ar.rnat
542 * ar.ccv ar.unat ar.fpsr ar.pfs ar.lc ar.ec 541 * ar.ccv ar.unat ar.fpsr ar.pfs ar.lc ar.ec
543 */ 542 */
544 543
545 /* r0 is zero */ 544 /* r0 is zero */
546 for (i = 1, mask = (1UL << i); i < 32; ++i) { 545 for (i = 1, mask = (1UL << i); i < 32; ++i) {
547 unw_get_gr(info, i, &dst[i], &nat); 546 unw_get_gr(info, i, &dst[i], &nat);
548 if (nat) 547 if (nat)
549 nat_bits |= mask; 548 nat_bits |= mask;
550 mask <<= 1; 549 mask <<= 1;
551 } 550 }
552 dst[32] = nat_bits; 551 dst[32] = nat_bits;
553 unw_get_pr(info, &dst[33]); 552 unw_get_pr(info, &dst[33]);
554 553
555 for (i = 0; i < 8; ++i) 554 for (i = 0; i < 8; ++i)
556 unw_get_br(info, i, &dst[34 + i]); 555 unw_get_br(info, i, &dst[34 + i]);
557 556
558 unw_get_rp(info, &ip); 557 unw_get_rp(info, &ip);
559 dst[42] = ip + ia64_psr(pt)->ri; 558 dst[42] = ip + ia64_psr(pt)->ri;
560 dst[43] = cfm; 559 dst[43] = cfm;
561 dst[44] = pt->cr_ipsr & IA64_PSR_UM; 560 dst[44] = pt->cr_ipsr & IA64_PSR_UM;
562 561
563 unw_get_ar(info, UNW_AR_RSC, &dst[45]); 562 unw_get_ar(info, UNW_AR_RSC, &dst[45]);
564 /* 563 /*
565 * For bsp and bspstore, unw_get_ar() would return the kernel 564 * For bsp and bspstore, unw_get_ar() would return the kernel
566 * addresses, but we need the user-level addresses instead: 565 * addresses, but we need the user-level addresses instead:
567 */ 566 */
568 dst[46] = urbs_end; /* note: by convention PT_AR_BSP points to the end of the urbs! */ 567 dst[46] = urbs_end; /* note: by convention PT_AR_BSP points to the end of the urbs! */
569 dst[47] = pt->ar_bspstore; 568 dst[47] = pt->ar_bspstore;
570 dst[48] = ar_rnat; 569 dst[48] = ar_rnat;
571 unw_get_ar(info, UNW_AR_CCV, &dst[49]); 570 unw_get_ar(info, UNW_AR_CCV, &dst[49]);
572 unw_get_ar(info, UNW_AR_UNAT, &dst[50]); 571 unw_get_ar(info, UNW_AR_UNAT, &dst[50]);
573 unw_get_ar(info, UNW_AR_FPSR, &dst[51]); 572 unw_get_ar(info, UNW_AR_FPSR, &dst[51]);
574 dst[52] = pt->ar_pfs; /* UNW_AR_PFS is == to pt->cr_ifs for interrupt frames */ 573 dst[52] = pt->ar_pfs; /* UNW_AR_PFS is == to pt->cr_ifs for interrupt frames */
575 unw_get_ar(info, UNW_AR_LC, &dst[53]); 574 unw_get_ar(info, UNW_AR_LC, &dst[53]);
576 unw_get_ar(info, UNW_AR_EC, &dst[54]); 575 unw_get_ar(info, UNW_AR_EC, &dst[54]);
577 unw_get_ar(info, UNW_AR_CSD, &dst[55]); 576 unw_get_ar(info, UNW_AR_CSD, &dst[55]);
578 unw_get_ar(info, UNW_AR_SSD, &dst[56]); 577 unw_get_ar(info, UNW_AR_SSD, &dst[56]);
579 } 578 }
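
The comment inside do_copy_task_regs() pins down the coredump layout; purely as a reading aid, this is the slot-to-register map implied by the assignments above (the enum names are editorial, not taken from the kernel headers):

	/* illustrative index map, derived from the dst[] assignments above */
	enum {
		DUMP_GR1	= 1,	/* ... through DUMP_GR31 = 31 (r0 is hardwired to zero) */
		DUMP_NAT_BITS	= 32,
		DUMP_PR		= 33,
		DUMP_BR0	= 34,	/* ... through DUMP_BR7 = 41 */
		DUMP_IP		= 42,
		DUMP_CFM	= 43,
		DUMP_USER_UM	= 44,
		DUMP_AR_RSC	= 45,
		DUMP_AR_BSP	= 46,	/* user-level end of the RBS (urbs_end) */
		DUMP_AR_BSPSTORE = 47,
		DUMP_AR_RNAT	= 48,
		DUMP_AR_CCV	= 49,
		DUMP_AR_UNAT	= 50,
		DUMP_AR_FPSR	= 51,
		DUMP_AR_PFS	= 52,
		DUMP_AR_LC	= 53,
		DUMP_AR_EC	= 54,
		DUMP_AR_CSD	= 55,
		DUMP_AR_SSD	= 56,
	};
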
580 579
581 void 580 void
582 do_dump_task_fpu (struct task_struct *task, struct unw_frame_info *info, void *arg) 581 do_dump_task_fpu (struct task_struct *task, struct unw_frame_info *info, void *arg)
583 { 582 {
584 elf_fpreg_t *dst = arg; 583 elf_fpreg_t *dst = arg;
585 int i; 584 int i;
586 585
587 memset(dst, 0, sizeof(elf_fpregset_t)); /* don't leak any "random" bits */ 586 memset(dst, 0, sizeof(elf_fpregset_t)); /* don't leak any "random" bits */
588 587
589 if (unw_unwind_to_user(info) < 0) 588 if (unw_unwind_to_user(info) < 0)
590 return; 589 return;
591 590
592 /* f0 is 0.0, f1 is 1.0 */ 591 /* f0 is 0.0, f1 is 1.0 */
593 592
594 for (i = 2; i < 32; ++i) 593 for (i = 2; i < 32; ++i)
595 unw_get_fr(info, i, dst + i); 594 unw_get_fr(info, i, dst + i);
596 595
597 ia64_flush_fph(task); 596 ia64_flush_fph(task);
598 if ((task->thread.flags & IA64_THREAD_FPH_VALID) != 0) 597 if ((task->thread.flags & IA64_THREAD_FPH_VALID) != 0)
599 memcpy(dst + 32, task->thread.fph, 96*16); 598 memcpy(dst + 32, task->thread.fph, 96*16);
600 } 599 }
601 600
602 void 601 void
603 do_copy_regs (struct unw_frame_info *info, void *arg) 602 do_copy_regs (struct unw_frame_info *info, void *arg)
604 { 603 {
605 do_copy_task_regs(current, info, arg); 604 do_copy_task_regs(current, info, arg);
606 } 605 }
607 606
608 void 607 void
609 do_dump_fpu (struct unw_frame_info *info, void *arg) 608 do_dump_fpu (struct unw_frame_info *info, void *arg)
610 { 609 {
611 do_dump_task_fpu(current, info, arg); 610 do_dump_task_fpu(current, info, arg);
612 } 611 }
613 612
614 int 613 int
615 dump_task_regs(struct task_struct *task, elf_gregset_t *regs) 614 dump_task_regs(struct task_struct *task, elf_gregset_t *regs)
616 { 615 {
617 struct unw_frame_info tcore_info; 616 struct unw_frame_info tcore_info;
618 617
619 if (current == task) { 618 if (current == task) {
620 unw_init_running(do_copy_regs, regs); 619 unw_init_running(do_copy_regs, regs);
621 } else { 620 } else {
622 memset(&tcore_info, 0, sizeof(tcore_info)); 621 memset(&tcore_info, 0, sizeof(tcore_info));
623 unw_init_from_blocked_task(&tcore_info, task); 622 unw_init_from_blocked_task(&tcore_info, task);
624 do_copy_task_regs(task, &tcore_info, regs); 623 do_copy_task_regs(task, &tcore_info, regs);
625 } 624 }
626 return 1; 625 return 1;
627 } 626 }
628 627
629 void 628 void
630 ia64_elf_core_copy_regs (struct pt_regs *pt, elf_gregset_t dst) 629 ia64_elf_core_copy_regs (struct pt_regs *pt, elf_gregset_t dst)
631 { 630 {
632 unw_init_running(do_copy_regs, dst); 631 unw_init_running(do_copy_regs, dst);
633 } 632 }
634 633
635 int 634 int
636 dump_task_fpu (struct task_struct *task, elf_fpregset_t *dst) 635 dump_task_fpu (struct task_struct *task, elf_fpregset_t *dst)
637 { 636 {
638 struct unw_frame_info tcore_info; 637 struct unw_frame_info tcore_info;
639 638
640 if (current == task) { 639 if (current == task) {
641 unw_init_running(do_dump_fpu, dst); 640 unw_init_running(do_dump_fpu, dst);
642 } else { 641 } else {
643 memset(&tcore_info, 0, sizeof(tcore_info)); 642 memset(&tcore_info, 0, sizeof(tcore_info));
644 unw_init_from_blocked_task(&tcore_info, task); 643 unw_init_from_blocked_task(&tcore_info, task);
645 do_dump_task_fpu(task, &tcore_info, dst); 644 do_dump_task_fpu(task, &tcore_info, dst);
646 } 645 }
647 return 1; 646 return 1;
648 } 647 }
649 648
650 int 649 int
651 dump_fpu (struct pt_regs *pt, elf_fpregset_t dst) 650 dump_fpu (struct pt_regs *pt, elf_fpregset_t dst)
652 { 651 {
653 unw_init_running(do_dump_fpu, dst); 652 unw_init_running(do_dump_fpu, dst);
654 return 1; /* f0-f31 are always valid so we always return 1 */ 653 return 1; /* f0-f31 are always valid so we always return 1 */
655 } 654 }
656 655
657 long 656 long
658 sys_execve (char __user *filename, char __user * __user *argv, char __user * __user *envp, 657 sys_execve (char __user *filename, char __user * __user *argv, char __user * __user *envp,
659 struct pt_regs *regs) 658 struct pt_regs *regs)
660 { 659 {
661 char *fname; 660 char *fname;
662 int error; 661 int error;
663 662
664 fname = getname(filename); 663 fname = getname(filename);
665 error = PTR_ERR(fname); 664 error = PTR_ERR(fname);
666 if (IS_ERR(fname)) 665 if (IS_ERR(fname))
667 goto out; 666 goto out;
668 error = do_execve(fname, argv, envp, regs); 667 error = do_execve(fname, argv, envp, regs);
669 putname(fname); 668 putname(fname);
670 out: 669 out:
671 return error; 670 return error;
672 } 671 }
673 672
674 pid_t 673 pid_t
675 kernel_thread (int (*fn)(void *), void *arg, unsigned long flags) 674 kernel_thread (int (*fn)(void *), void *arg, unsigned long flags)
676 { 675 {
677 extern void start_kernel_thread (void); 676 extern void start_kernel_thread (void);
678 unsigned long *helper_fptr = (unsigned long *) &start_kernel_thread; 677 unsigned long *helper_fptr = (unsigned long *) &start_kernel_thread;
679 struct { 678 struct {
680 struct switch_stack sw; 679 struct switch_stack sw;
681 struct pt_regs pt; 680 struct pt_regs pt;
682 } regs; 681 } regs;
683 682
684 memset(&regs, 0, sizeof(regs)); 683 memset(&regs, 0, sizeof(regs));
685 regs.pt.cr_iip = helper_fptr[0]; /* set entry point (IP) */ 684 regs.pt.cr_iip = helper_fptr[0]; /* set entry point (IP) */
686 regs.pt.r1 = helper_fptr[1]; /* set GP */ 685 regs.pt.r1 = helper_fptr[1]; /* set GP */
687 regs.pt.r9 = (unsigned long) fn; /* 1st argument */ 686 regs.pt.r9 = (unsigned long) fn; /* 1st argument */
688 regs.pt.r11 = (unsigned long) arg; /* 2nd argument */ 687 regs.pt.r11 = (unsigned long) arg; /* 2nd argument */
689 /* Preserve PSR bits, except for bits 32-34 and 37-45, which we can't read. */ 688 /* Preserve PSR bits, except for bits 32-34 and 37-45, which we can't read. */
690 regs.pt.cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN; 689 regs.pt.cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN;
691 regs.pt.cr_ifs = 1UL << 63; /* mark as valid, empty frame */ 690 regs.pt.cr_ifs = 1UL << 63; /* mark as valid, empty frame */
692 regs.sw.ar_fpsr = regs.pt.ar_fpsr = ia64_getreg(_IA64_REG_AR_FPSR); 691 regs.sw.ar_fpsr = regs.pt.ar_fpsr = ia64_getreg(_IA64_REG_AR_FPSR);
693 regs.sw.ar_bspstore = (unsigned long) current + IA64_RBS_OFFSET; 692 regs.sw.ar_bspstore = (unsigned long) current + IA64_RBS_OFFSET;
694 regs.sw.pr = (1 << PRED_KERNEL_STACK); 693 regs.sw.pr = (1 << PRED_KERNEL_STACK);
695 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs.pt, 0, NULL, NULL); 694 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs.pt, 0, NULL, NULL);
696 } 695 }
697 EXPORT_SYMBOL(kernel_thread); 696 EXPORT_SYMBOL(kernel_thread);
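
kernel_thread() above packages its fn/arg/flags into a fake register frame and hands it to do_fork(); a minimal usage sketch, with the worker function and flag choice invented for illustration:

	static int my_worker(void *arg)		/* hypothetical thread body */
	{
		/* ... do some in-kernel work, then exit with a status ... */
		return 0;
	}

	static void start_worker(void)		/* illustrative caller */
	{
		pid_t pid = kernel_thread(my_worker, NULL, CLONE_FS | CLONE_FILES);

		if (pid < 0)
			printk(KERN_ERR "failed to start worker thread: %d\n", (int) pid);
	}
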
698 697
699 /* This gets called from kernel_thread() via ia64_invoke_thread_helper(). */ 698 /* This gets called from kernel_thread() via ia64_invoke_thread_helper(). */
700 int 699 int
701 kernel_thread_helper (int (*fn)(void *), void *arg) 700 kernel_thread_helper (int (*fn)(void *), void *arg)
702 { 701 {
703 #ifdef CONFIG_IA32_SUPPORT 702 #ifdef CONFIG_IA32_SUPPORT
704 if (IS_IA32_PROCESS(task_pt_regs(current))) { 703 if (IS_IA32_PROCESS(task_pt_regs(current))) {
705 /* A kernel thread is always a 64-bit process. */ 704 /* A kernel thread is always a 64-bit process. */
706 current->thread.map_base = DEFAULT_MAP_BASE; 705 current->thread.map_base = DEFAULT_MAP_BASE;
707 current->thread.task_size = DEFAULT_TASK_SIZE; 706 current->thread.task_size = DEFAULT_TASK_SIZE;
708 ia64_set_kr(IA64_KR_IO_BASE, current->thread.old_iob); 707 ia64_set_kr(IA64_KR_IO_BASE, current->thread.old_iob);
709 ia64_set_kr(IA64_KR_TSSD, current->thread.old_k1); 708 ia64_set_kr(IA64_KR_TSSD, current->thread.old_k1);
710 } 709 }
711 #endif 710 #endif
712 return (*fn)(arg); 711 return (*fn)(arg);
713 } 712 }
714 713
715 /* 714 /*
716 * Flush thread state. This is called when a thread does an execve(). 715 * Flush thread state. This is called when a thread does an execve().
717 */ 716 */
718 void 717 void
719 flush_thread (void) 718 flush_thread (void)
720 { 719 {
721 /* drop floating-point and debug-register state if it exists: */ 720 /* drop floating-point and debug-register state if it exists: */
722 current->thread.flags &= ~(IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID); 721 current->thread.flags &= ~(IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID);
723 ia64_drop_fpu(current); 722 ia64_drop_fpu(current);
724 #ifdef CONFIG_IA32_SUPPORT 723 #ifdef CONFIG_IA32_SUPPORT
725 if (IS_IA32_PROCESS(task_pt_regs(current))) { 724 if (IS_IA32_PROCESS(task_pt_regs(current))) {
726 ia32_drop_partial_page_list(current); 725 ia32_drop_partial_page_list(current);
727 current->thread.task_size = IA32_PAGE_OFFSET; 726 current->thread.task_size = IA32_PAGE_OFFSET;
728 set_fs(USER_DS); 727 set_fs(USER_DS);
729 } 728 }
730 #endif 729 #endif
731 } 730 }
732 731
733 /* 732 /*
734 * Clean up state associated with current thread. This is called when 733 * Clean up state associated with current thread. This is called when
735 * the thread calls exit(). 734 * the thread calls exit().
736 */ 735 */
737 void 736 void
738 exit_thread (void) 737 exit_thread (void)
739 { 738 {
740
741 /*
742 * Remove function-return probe instances associated with this task
743 * and put them back on the free list. Do not insert an exit probe for
744 * this function, it will be disabled by kprobe_flush_task if you do.
745 */
746 kprobe_flush_task(current);
747 739
748 ia64_drop_fpu(current); 740 ia64_drop_fpu(current);
749 #ifdef CONFIG_PERFMON 741 #ifdef CONFIG_PERFMON
750 /* if needed, stop monitoring and flush state to perfmon context */ 742 /* if needed, stop monitoring and flush state to perfmon context */
751 if (current->thread.pfm_context) 743 if (current->thread.pfm_context)
752 pfm_exit_thread(current); 744 pfm_exit_thread(current);
753 745
754 /* free debug register resources */ 746 /* free debug register resources */
755 if (current->thread.flags & IA64_THREAD_DBG_VALID) 747 if (current->thread.flags & IA64_THREAD_DBG_VALID)
756 pfm_release_debug_registers(current); 748 pfm_release_debug_registers(current);
757 #endif 749 #endif
758 if (IS_IA32_PROCESS(task_pt_regs(current))) 750 if (IS_IA32_PROCESS(task_pt_regs(current)))
759 ia32_drop_partial_page_list(current); 751 ia32_drop_partial_page_list(current);
760 } 752 }
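
The hunk above is this patch's ia64 piece: exit_thread() stops calling kprobe_flush_task(), because a task that dies while a kretprobe'd schedule() is still on its stack never returns through exit_thread(), so its kretprobe instances used to leak; the recycling now happens after the task has run for the last time (the hunk that adds the new call site is outside this excerpt). Conceptually the flush just moves the task's outstanding return-probe instances back onto their kretprobe's free list; a simplified sketch of that idea, with the lock and hash-lookup names taken as assumptions about kernel/kprobes.c of this era rather than a copy of it:

	#include <linux/kprobes.h>

	/* sketch only: give every kretprobe_instance still owned by @tsk back
	 * to its kretprobe so it can be reused on a later function entry. */
	static void recycle_task_rp_instances(struct task_struct *tsk)
	{
		struct kretprobe_instance *ri;
		struct hlist_node *node, *tmp;
		unsigned long flags;

		spin_lock_irqsave(&kretprobe_lock, flags);		/* assumed lock name */
		hlist_for_each_entry_safe(ri, node, tmp,
					  kretprobe_inst_table_head(tsk), hlist) {	/* assumed helper */
			hlist_del(&ri->hlist);				/* off the per-task table */
			hlist_del(&ri->uflist);				/* off the used list */
			hlist_add_head(&ri->uflist, &ri->rp->free_instances);	/* back on the free list */
		}
		spin_unlock_irqrestore(&kretprobe_lock, flags);
	}
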
761 753
762 unsigned long 754 unsigned long
763 get_wchan (struct task_struct *p) 755 get_wchan (struct task_struct *p)
764 { 756 {
765 struct unw_frame_info info; 757 struct unw_frame_info info;
766 unsigned long ip; 758 unsigned long ip;
767 int count = 0; 759 int count = 0;
768 760
769 /* 761 /*
770 * Note: p may not be a blocked task (it could be current or 762 * Note: p may not be a blocked task (it could be current or
771 * another process running on some other CPU). Rather than 763 * another process running on some other CPU). Rather than
772 * trying to determine if p is really blocked, we just assume 764 * trying to determine if p is really blocked, we just assume
773 * it's blocked and rely on the unwind routines to fail 765 * it's blocked and rely on the unwind routines to fail
774 * gracefully if the process wasn't really blocked after all. 766 * gracefully if the process wasn't really blocked after all.
775 * --davidm 99/12/15 767 * --davidm 99/12/15
776 */ 768 */
777 unw_init_from_blocked_task(&info, p); 769 unw_init_from_blocked_task(&info, p);
778 do { 770 do {
779 if (unw_unwind(&info) < 0) 771 if (unw_unwind(&info) < 0)
780 return 0; 772 return 0;
781 unw_get_ip(&info, &ip); 773 unw_get_ip(&info, &ip);
782 if (!in_sched_functions(ip)) 774 if (!in_sched_functions(ip))
783 return ip; 775 return ip;
784 } while (count++ < 16); 776 } while (count++ < 16);
785 return 0; 777 return 0;
786 } 778 }
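
get_wchan() is what lets /proc-style tooling report which non-scheduler function a sleeping task is blocked in; a tiny illustrative consumer, reusing the print_symbol() helper already used in show_regs() above:

	static void report_wchan(struct task_struct *p)		/* illustrative helper */
	{
		unsigned long wchan = get_wchan(p);

		if (wchan)
			print_symbol("blocked in %s\n", wchan);
		else
			printk("running, or the unwind gave up\n");
	}
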
787 779
788 void 780 void
789 cpu_halt (void) 781 cpu_halt (void)
790 { 782 {
791 pal_power_mgmt_info_u_t power_info[8]; 783 pal_power_mgmt_info_u_t power_info[8];
792 unsigned long min_power; 784 unsigned long min_power;
793 int i, min_power_state; 785 int i, min_power_state;
794 786
795 if (ia64_pal_halt_info(power_info) != 0) 787 if (ia64_pal_halt_info(power_info) != 0)
796 return; 788 return;
797 789
798 min_power_state = 0; 790 min_power_state = 0;
799 min_power = power_info[0].pal_power_mgmt_info_s.power_consumption; 791 min_power = power_info[0].pal_power_mgmt_info_s.power_consumption;
800 for (i = 1; i < 8; ++i) 792 for (i = 1; i < 8; ++i)
801 if (power_info[i].pal_power_mgmt_info_s.im 793 if (power_info[i].pal_power_mgmt_info_s.im
802 && power_info[i].pal_power_mgmt_info_s.power_consumption < min_power) { 794 && power_info[i].pal_power_mgmt_info_s.power_consumption < min_power) {
803 min_power = power_info[i].pal_power_mgmt_info_s.power_consumption; 795 min_power = power_info[i].pal_power_mgmt_info_s.power_consumption;
804 min_power_state = i; 796 min_power_state = i;
805 } 797 }
806 798
807 while (1) 799 while (1)
808 ia64_pal_halt(min_power_state); 800 ia64_pal_halt(min_power_state);
809 } 801 }
810 802
811 void 803 void
812 machine_restart (char *restart_cmd) 804 machine_restart (char *restart_cmd)
813 { 805 {
814 (void) notify_die(DIE_MACHINE_RESTART, restart_cmd, NULL, 0, 0, 0); 806 (void) notify_die(DIE_MACHINE_RESTART, restart_cmd, NULL, 0, 0, 0);
815 (*efi.reset_system)(EFI_RESET_WARM, 0, 0, NULL); 807 (*efi.reset_system)(EFI_RESET_WARM, 0, 0, NULL);
816 } 808 }
817 809
818 void 810 void
819 machine_halt (void) 811 machine_halt (void)
820 { 812 {
821 (void) notify_die(DIE_MACHINE_HALT, "", NULL, 0, 0, 0); 813 (void) notify_die(DIE_MACHINE_HALT, "", NULL, 0, 0, 0);
822 cpu_halt(); 814 cpu_halt();
823 } 815 }
824 816
825 void 817 void
826 machine_power_off (void) 818 machine_power_off (void)
827 { 819 {
828 if (pm_power_off) 820 if (pm_power_off)
829 pm_power_off(); 821 pm_power_off();
830 machine_halt(); 822 machine_halt();
831 } 823 }
832 824
833 825
arch/powerpc/kernel/process.c
1 /* 1 /*
2 * Derived from "arch/i386/kernel/process.c" 2 * Derived from "arch/i386/kernel/process.c"
3 * Copyright (C) 1995 Linus Torvalds 3 * Copyright (C) 1995 Linus Torvalds
4 * 4 *
5 * Updated and modified by Cort Dougan (cort@cs.nmt.edu) and 5 * Updated and modified by Cort Dougan (cort@cs.nmt.edu) and
6 * Paul Mackerras (paulus@cs.anu.edu.au) 6 * Paul Mackerras (paulus@cs.anu.edu.au)
7 * 7 *
8 * PowerPC version 8 * PowerPC version
9 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) 9 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
10 * 10 *
11 * This program is free software; you can redistribute it and/or 11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License 12 * modify it under the terms of the GNU General Public License
13 * as published by the Free Software Foundation; either version 13 * as published by the Free Software Foundation; either version
14 * 2 of the License, or (at your option) any later version. 14 * 2 of the License, or (at your option) any later version.
15 */ 15 */
16 16
17 #include <linux/config.h> 17 #include <linux/config.h>
18 #include <linux/errno.h> 18 #include <linux/errno.h>
19 #include <linux/sched.h> 19 #include <linux/sched.h>
20 #include <linux/kernel.h> 20 #include <linux/kernel.h>
21 #include <linux/mm.h> 21 #include <linux/mm.h>
22 #include <linux/smp.h> 22 #include <linux/smp.h>
23 #include <linux/smp_lock.h> 23 #include <linux/smp_lock.h>
24 #include <linux/stddef.h> 24 #include <linux/stddef.h>
25 #include <linux/unistd.h> 25 #include <linux/unistd.h>
26 #include <linux/ptrace.h> 26 #include <linux/ptrace.h>
27 #include <linux/slab.h> 27 #include <linux/slab.h>
28 #include <linux/user.h> 28 #include <linux/user.h>
29 #include <linux/elf.h> 29 #include <linux/elf.h>
30 #include <linux/init.h> 30 #include <linux/init.h>
31 #include <linux/prctl.h> 31 #include <linux/prctl.h>
32 #include <linux/init_task.h> 32 #include <linux/init_task.h>
33 #include <linux/module.h> 33 #include <linux/module.h>
34 #include <linux/kallsyms.h> 34 #include <linux/kallsyms.h>
35 #include <linux/mqueue.h> 35 #include <linux/mqueue.h>
36 #include <linux/hardirq.h> 36 #include <linux/hardirq.h>
37 #include <linux/utsname.h> 37 #include <linux/utsname.h>
38 #include <linux/kprobes.h>
39 38
40 #include <asm/pgtable.h> 39 #include <asm/pgtable.h>
41 #include <asm/uaccess.h> 40 #include <asm/uaccess.h>
42 #include <asm/system.h> 41 #include <asm/system.h>
43 #include <asm/io.h> 42 #include <asm/io.h>
44 #include <asm/processor.h> 43 #include <asm/processor.h>
45 #include <asm/mmu.h> 44 #include <asm/mmu.h>
46 #include <asm/prom.h> 45 #include <asm/prom.h>
47 #include <asm/machdep.h> 46 #include <asm/machdep.h>
48 #include <asm/time.h> 47 #include <asm/time.h>
49 #ifdef CONFIG_PPC64 48 #ifdef CONFIG_PPC64
50 #include <asm/firmware.h> 49 #include <asm/firmware.h>
51 #endif 50 #endif
52 51
53 extern unsigned long _get_SP(void); 52 extern unsigned long _get_SP(void);
54 53
55 #ifndef CONFIG_SMP 54 #ifndef CONFIG_SMP
56 struct task_struct *last_task_used_math = NULL; 55 struct task_struct *last_task_used_math = NULL;
57 struct task_struct *last_task_used_altivec = NULL; 56 struct task_struct *last_task_used_altivec = NULL;
58 struct task_struct *last_task_used_spe = NULL; 57 struct task_struct *last_task_used_spe = NULL;
59 #endif 58 #endif
60 59
61 /* 60 /*
62 * Make sure the floating-point register state in the 61 * Make sure the floating-point register state in the
63 * thread_struct is up to date for task tsk. 62 * thread_struct is up to date for task tsk.
64 */ 63 */
65 void flush_fp_to_thread(struct task_struct *tsk) 64 void flush_fp_to_thread(struct task_struct *tsk)
66 { 65 {
67 if (tsk->thread.regs) { 66 if (tsk->thread.regs) {
68 /* 67 /*
69 * We need to disable preemption here because if we didn't, 68 * We need to disable preemption here because if we didn't,
70 * another process could get scheduled after the regs->msr 69 * another process could get scheduled after the regs->msr
71 * test but before we have finished saving the FP registers 70 * test but before we have finished saving the FP registers
72 * to the thread_struct. That process could take over the 71 * to the thread_struct. That process could take over the
73 * FPU, and then when we get scheduled again we would store 72 * FPU, and then when we get scheduled again we would store
74 * bogus values for the remaining FP registers. 73 * bogus values for the remaining FP registers.
75 */ 74 */
76 preempt_disable(); 75 preempt_disable();
77 if (tsk->thread.regs->msr & MSR_FP) { 76 if (tsk->thread.regs->msr & MSR_FP) {
78 #ifdef CONFIG_SMP 77 #ifdef CONFIG_SMP
79 /* 78 /*
80 * This should only ever be called for current or 79 * This should only ever be called for current or
81 * for a stopped child process. Since we save away 80 * for a stopped child process. Since we save away
82 * the FP register state on context switch on SMP, 81 * the FP register state on context switch on SMP,
83 * there is something wrong if a stopped child appears 82 * there is something wrong if a stopped child appears
84 * to still have its FP state in the CPU registers. 83 * to still have its FP state in the CPU registers.
85 */ 84 */
86 BUG_ON(tsk != current); 85 BUG_ON(tsk != current);
87 #endif 86 #endif
88 giveup_fpu(current); 87 giveup_fpu(current);
89 } 88 }
90 preempt_enable(); 89 preempt_enable();
91 } 90 }
92 } 91 }
93 92
94 void enable_kernel_fp(void) 93 void enable_kernel_fp(void)
95 { 94 {
96 WARN_ON(preemptible()); 95 WARN_ON(preemptible());
97 96
98 #ifdef CONFIG_SMP 97 #ifdef CONFIG_SMP
99 if (current->thread.regs && (current->thread.regs->msr & MSR_FP)) 98 if (current->thread.regs && (current->thread.regs->msr & MSR_FP))
100 giveup_fpu(current); 99 giveup_fpu(current);
101 else 100 else
102 giveup_fpu(NULL); /* just enables FP for kernel */ 101 giveup_fpu(NULL); /* just enables FP for kernel */
103 #else 102 #else
104 giveup_fpu(last_task_used_math); 103 giveup_fpu(last_task_used_math);
105 #endif /* CONFIG_SMP */ 104 #endif /* CONFIG_SMP */
106 } 105 }
107 EXPORT_SYMBOL(enable_kernel_fp); 106 EXPORT_SYMBOL(enable_kernel_fp);
108 107
109 int dump_task_fpu(struct task_struct *tsk, elf_fpregset_t *fpregs) 108 int dump_task_fpu(struct task_struct *tsk, elf_fpregset_t *fpregs)
110 { 109 {
111 if (!tsk->thread.regs) 110 if (!tsk->thread.regs)
112 return 0; 111 return 0;
113 flush_fp_to_thread(current); 112 flush_fp_to_thread(current);
114 113
115 memcpy(fpregs, &tsk->thread.fpr[0], sizeof(*fpregs)); 114 memcpy(fpregs, &tsk->thread.fpr[0], sizeof(*fpregs));
116 115
117 return 1; 116 return 1;
118 } 117 }
119 118
120 #ifdef CONFIG_ALTIVEC 119 #ifdef CONFIG_ALTIVEC
121 void enable_kernel_altivec(void) 120 void enable_kernel_altivec(void)
122 { 121 {
123 WARN_ON(preemptible()); 122 WARN_ON(preemptible());
124 123
125 #ifdef CONFIG_SMP 124 #ifdef CONFIG_SMP
126 if (current->thread.regs && (current->thread.regs->msr & MSR_VEC)) 125 if (current->thread.regs && (current->thread.regs->msr & MSR_VEC))
127 giveup_altivec(current); 126 giveup_altivec(current);
128 else 127 else
129 giveup_altivec(NULL); /* just enable AltiVec for kernel - force */ 128 giveup_altivec(NULL); /* just enable AltiVec for kernel - force */
130 #else 129 #else
131 giveup_altivec(last_task_used_altivec); 130 giveup_altivec(last_task_used_altivec);
132 #endif /* CONFIG_SMP */ 131 #endif /* CONFIG_SMP */
133 } 132 }
134 EXPORT_SYMBOL(enable_kernel_altivec); 133 EXPORT_SYMBOL(enable_kernel_altivec);
135 134
136 /* 135 /*
137 * Make sure the VMX/Altivec register state in the 136 * Make sure the VMX/Altivec register state in the
138 * thread_struct is up to date for task tsk. 137 * thread_struct is up to date for task tsk.
139 */ 138 */
140 void flush_altivec_to_thread(struct task_struct *tsk) 139 void flush_altivec_to_thread(struct task_struct *tsk)
141 { 140 {
142 if (tsk->thread.regs) { 141 if (tsk->thread.regs) {
143 preempt_disable(); 142 preempt_disable();
144 if (tsk->thread.regs->msr & MSR_VEC) { 143 if (tsk->thread.regs->msr & MSR_VEC) {
145 #ifdef CONFIG_SMP 144 #ifdef CONFIG_SMP
146 BUG_ON(tsk != current); 145 BUG_ON(tsk != current);
147 #endif 146 #endif
148 giveup_altivec(current); 147 giveup_altivec(current);
149 } 148 }
150 preempt_enable(); 149 preempt_enable();
151 } 150 }
152 } 151 }
153 152
154 int dump_task_altivec(struct pt_regs *regs, elf_vrregset_t *vrregs) 153 int dump_task_altivec(struct pt_regs *regs, elf_vrregset_t *vrregs)
155 { 154 {
156 flush_altivec_to_thread(current); 155 flush_altivec_to_thread(current);
157 memcpy(vrregs, &current->thread.vr[0], sizeof(*vrregs)); 156 memcpy(vrregs, &current->thread.vr[0], sizeof(*vrregs));
158 return 1; 157 return 1;
159 } 158 }
160 #endif /* CONFIG_ALTIVEC */ 159 #endif /* CONFIG_ALTIVEC */
161 160
162 #ifdef CONFIG_SPE 161 #ifdef CONFIG_SPE
163 162
164 void enable_kernel_spe(void) 163 void enable_kernel_spe(void)
165 { 164 {
166 WARN_ON(preemptible()); 165 WARN_ON(preemptible());
167 166
168 #ifdef CONFIG_SMP 167 #ifdef CONFIG_SMP
169 if (current->thread.regs && (current->thread.regs->msr & MSR_SPE)) 168 if (current->thread.regs && (current->thread.regs->msr & MSR_SPE))
170 giveup_spe(current); 169 giveup_spe(current);
171 else 170 else
172 giveup_spe(NULL); /* just enable SPE for kernel - force */ 171 giveup_spe(NULL); /* just enable SPE for kernel - force */
173 #else 172 #else
174 giveup_spe(last_task_used_spe); 173 giveup_spe(last_task_used_spe);
175 #endif /* CONFIG_SMP */ 174 #endif /* CONFIG_SMP */
176 } 175 }
177 EXPORT_SYMBOL(enable_kernel_spe); 176 EXPORT_SYMBOL(enable_kernel_spe);
178 177
179 void flush_spe_to_thread(struct task_struct *tsk) 178 void flush_spe_to_thread(struct task_struct *tsk)
180 { 179 {
181 if (tsk->thread.regs) { 180 if (tsk->thread.regs) {
182 preempt_disable(); 181 preempt_disable();
183 if (tsk->thread.regs->msr & MSR_SPE) { 182 if (tsk->thread.regs->msr & MSR_SPE) {
184 #ifdef CONFIG_SMP 183 #ifdef CONFIG_SMP
185 BUG_ON(tsk != current); 184 BUG_ON(tsk != current);
186 #endif 185 #endif
187 giveup_spe(current); 186 giveup_spe(current);
188 } 187 }
189 preempt_enable(); 188 preempt_enable();
190 } 189 }
191 } 190 }
192 191
193 int dump_spe(struct pt_regs *regs, elf_vrregset_t *evrregs) 192 int dump_spe(struct pt_regs *regs, elf_vrregset_t *evrregs)
194 { 193 {
195 flush_spe_to_thread(current); 194 flush_spe_to_thread(current);
196 /* We copy u32 evr[32] + u64 acc + u32 spefscr -> 35 */ 195 /* We copy u32 evr[32] + u64 acc + u32 spefscr -> 35 */
197 memcpy(evrregs, &current->thread.evr[0], sizeof(u32) * 35); 196 memcpy(evrregs, &current->thread.evr[0], sizeof(u32) * 35);
198 return 1; 197 return 1;
199 } 198 }
200 #endif /* CONFIG_SPE */ 199 #endif /* CONFIG_SPE */
201 200
202 #ifndef CONFIG_SMP 201 #ifndef CONFIG_SMP
203 /* 202 /*
204 * If we are doing lazy switching of CPU state (FP, altivec or SPE), 203 * If we are doing lazy switching of CPU state (FP, altivec or SPE),
205 * and the current task has some state, discard it. 204 * and the current task has some state, discard it.
206 */ 205 */
207 void discard_lazy_cpu_state(void) 206 void discard_lazy_cpu_state(void)
208 { 207 {
209 preempt_disable(); 208 preempt_disable();
210 if (last_task_used_math == current) 209 if (last_task_used_math == current)
211 last_task_used_math = NULL; 210 last_task_used_math = NULL;
212 #ifdef CONFIG_ALTIVEC 211 #ifdef CONFIG_ALTIVEC
213 if (last_task_used_altivec == current) 212 if (last_task_used_altivec == current)
214 last_task_used_altivec = NULL; 213 last_task_used_altivec = NULL;
215 #endif /* CONFIG_ALTIVEC */ 214 #endif /* CONFIG_ALTIVEC */
216 #ifdef CONFIG_SPE 215 #ifdef CONFIG_SPE
217 if (last_task_used_spe == current) 216 if (last_task_used_spe == current)
218 last_task_used_spe = NULL; 217 last_task_used_spe = NULL;
219 #endif 218 #endif
220 preempt_enable(); 219 preempt_enable();
221 } 220 }
222 #endif /* CONFIG_SMP */ 221 #endif /* CONFIG_SMP */
223 222
224 #ifdef CONFIG_PPC_MERGE /* XXX for now */ 223 #ifdef CONFIG_PPC_MERGE /* XXX for now */
225 int set_dabr(unsigned long dabr) 224 int set_dabr(unsigned long dabr)
226 { 225 {
227 if (ppc_md.set_dabr) 226 if (ppc_md.set_dabr)
228 return ppc_md.set_dabr(dabr); 227 return ppc_md.set_dabr(dabr);
229 228
230 mtspr(SPRN_DABR, dabr); 229 mtspr(SPRN_DABR, dabr);
231 return 0; 230 return 0;
232 } 231 }
233 #endif 232 #endif
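
set_dabr(), together with the per-CPU current_dabr cache consulted in __switch_to() below, is how a hardware data breakpoint stays programmed across context switches; the low-order bits of the DABR value select what kind of access matches. A hedged sketch of arming a watchpoint, where the value passed in is assumed to already carry those control bits:

	/* sketch only: arm a data breakpoint for the current task. */
	static int watch_address(unsigned long dabr_value)	/* illustrative helper */
	{
		current->thread.dabr = dabr_value;	/* so __switch_to() keeps it programmed later */
		return set_dabr(dabr_value);		/* program the register on this CPU right away */
	}
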
234 233
235 #ifdef CONFIG_PPC64 234 #ifdef CONFIG_PPC64
236 DEFINE_PER_CPU(struct cpu_usage, cpu_usage_array); 235 DEFINE_PER_CPU(struct cpu_usage, cpu_usage_array);
237 static DEFINE_PER_CPU(unsigned long, current_dabr); 236 static DEFINE_PER_CPU(unsigned long, current_dabr);
238 #endif 237 #endif
239 238
240 struct task_struct *__switch_to(struct task_struct *prev, 239 struct task_struct *__switch_to(struct task_struct *prev,
241 struct task_struct *new) 240 struct task_struct *new)
242 { 241 {
243 struct thread_struct *new_thread, *old_thread; 242 struct thread_struct *new_thread, *old_thread;
244 unsigned long flags; 243 unsigned long flags;
245 struct task_struct *last; 244 struct task_struct *last;
246 245
247 #ifdef CONFIG_SMP 246 #ifdef CONFIG_SMP
248 /* avoid complexity of lazy save/restore of fpu 247 /* avoid complexity of lazy save/restore of fpu
249 * by just saving it every time we switch out if 248 * by just saving it every time we switch out if
250 * this task used the fpu during the last quantum. 249 * this task used the fpu during the last quantum.
251 * 250 *
252 * If it tries to use the fpu again, it'll trap and 251 * If it tries to use the fpu again, it'll trap and
253 * reload its fp regs. So we don't have to do a restore 252 * reload its fp regs. So we don't have to do a restore
254 * every switch, just a save. 253 * every switch, just a save.
255 * -- Cort 254 * -- Cort
256 */ 255 */
257 if (prev->thread.regs && (prev->thread.regs->msr & MSR_FP)) 256 if (prev->thread.regs && (prev->thread.regs->msr & MSR_FP))
258 giveup_fpu(prev); 257 giveup_fpu(prev);
259 #ifdef CONFIG_ALTIVEC 258 #ifdef CONFIG_ALTIVEC
260 /* 259 /*
261 * If the previous thread used altivec in the last quantum 260 * If the previous thread used altivec in the last quantum
262 * (thus changing altivec regs) then save them. 261 * (thus changing altivec regs) then save them.
263 * We used to check the VRSAVE register but not all apps 262 * We used to check the VRSAVE register but not all apps
264 * set it, so we don't rely on it now (and in fact we need 263 * set it, so we don't rely on it now (and in fact we need
265 * to save & restore VSCR even if VRSAVE == 0). -- paulus 264 * to save & restore VSCR even if VRSAVE == 0). -- paulus
266 * 265 *
267 * On SMP we always save/restore altivec regs just to avoid the 266 * On SMP we always save/restore altivec regs just to avoid the
268 * complexity of changing processors. 267 * complexity of changing processors.
269 * -- Cort 268 * -- Cort
270 */ 269 */
271 if (prev->thread.regs && (prev->thread.regs->msr & MSR_VEC)) 270 if (prev->thread.regs && (prev->thread.regs->msr & MSR_VEC))
272 giveup_altivec(prev); 271 giveup_altivec(prev);
273 #endif /* CONFIG_ALTIVEC */ 272 #endif /* CONFIG_ALTIVEC */
274 #ifdef CONFIG_SPE 273 #ifdef CONFIG_SPE
275 /* 274 /*
276 * If the previous thread used spe in the last quantum 275 * If the previous thread used spe in the last quantum
277 * (thus changing spe regs) then save them. 276 * (thus changing spe regs) then save them.
278 * 277 *
279 * On SMP we always save/restore spe regs just to avoid the 278 * On SMP we always save/restore spe regs just to avoid the
280 * complexity of changing processors. 279 * complexity of changing processors.
281 */ 280 */
282 if ((prev->thread.regs && (prev->thread.regs->msr & MSR_SPE))) 281 if ((prev->thread.regs && (prev->thread.regs->msr & MSR_SPE)))
283 giveup_spe(prev); 282 giveup_spe(prev);
284 #endif /* CONFIG_SPE */ 283 #endif /* CONFIG_SPE */
285 284
286 #else /* CONFIG_SMP */ 285 #else /* CONFIG_SMP */
287 #ifdef CONFIG_ALTIVEC 286 #ifdef CONFIG_ALTIVEC
288 /* Avoid the trap. On smp this never happens since 287 /* Avoid the trap. On smp this never happens since
289 * we don't set last_task_used_altivec -- Cort 288 * we don't set last_task_used_altivec -- Cort
290 */ 289 */
291 if (new->thread.regs && last_task_used_altivec == new) 290 if (new->thread.regs && last_task_used_altivec == new)
292 new->thread.regs->msr |= MSR_VEC; 291 new->thread.regs->msr |= MSR_VEC;
293 #endif /* CONFIG_ALTIVEC */ 292 #endif /* CONFIG_ALTIVEC */
294 #ifdef CONFIG_SPE 293 #ifdef CONFIG_SPE
295 /* Avoid the trap. On smp this never happens since 294 /* Avoid the trap. On smp this never happens since
296 * we don't set last_task_used_spe 295 * we don't set last_task_used_spe
297 */ 296 */
298 if (new->thread.regs && last_task_used_spe == new) 297 if (new->thread.regs && last_task_used_spe == new)
299 new->thread.regs->msr |= MSR_SPE; 298 new->thread.regs->msr |= MSR_SPE;
300 #endif /* CONFIG_SPE */ 299 #endif /* CONFIG_SPE */
301 300
302 #endif /* CONFIG_SMP */ 301 #endif /* CONFIG_SMP */
303 302
304 #ifdef CONFIG_PPC64 /* for now */ 303 #ifdef CONFIG_PPC64 /* for now */
305 if (unlikely(__get_cpu_var(current_dabr) != new->thread.dabr)) { 304 if (unlikely(__get_cpu_var(current_dabr) != new->thread.dabr)) {
306 set_dabr(new->thread.dabr); 305 set_dabr(new->thread.dabr);
307 __get_cpu_var(current_dabr) = new->thread.dabr; 306 __get_cpu_var(current_dabr) = new->thread.dabr;
308 } 307 }
309 308
310 flush_tlb_pending(); 309 flush_tlb_pending();
311 #endif 310 #endif
312 311
313 new_thread = &new->thread; 312 new_thread = &new->thread;
314 old_thread = &current->thread; 313 old_thread = &current->thread;
315 314
316 #ifdef CONFIG_PPC64 315 #ifdef CONFIG_PPC64
317 /* 316 /*
318 * Collect processor utilization data per process 317 * Collect processor utilization data per process
319 */ 318 */
320 if (firmware_has_feature(FW_FEATURE_SPLPAR)) { 319 if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
321 struct cpu_usage *cu = &__get_cpu_var(cpu_usage_array); 320 struct cpu_usage *cu = &__get_cpu_var(cpu_usage_array);
322 long unsigned start_tb, current_tb; 321 long unsigned start_tb, current_tb;
323 start_tb = old_thread->start_tb; 322 start_tb = old_thread->start_tb;
324 cu->current_tb = current_tb = mfspr(SPRN_PURR); 323 cu->current_tb = current_tb = mfspr(SPRN_PURR);
325 old_thread->accum_tb += (current_tb - start_tb); 324 old_thread->accum_tb += (current_tb - start_tb);
326 new_thread->start_tb = current_tb; 325 new_thread->start_tb = current_tb;
327 } 326 }
328 #endif 327 #endif
329 328
330 local_irq_save(flags); 329 local_irq_save(flags);
331 330
332 account_system_vtime(current); 331 account_system_vtime(current);
333 account_process_vtime(current); 332 account_process_vtime(current);
334 calculate_steal_time(); 333 calculate_steal_time();
335 334
336 last = _switch(old_thread, new_thread); 335 last = _switch(old_thread, new_thread);
337 336
338 local_irq_restore(flags); 337 local_irq_restore(flags);
339 338
340 return last; 339 return last;
341 } 340 }
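
The CONFIG_SMP block above is the "save eagerly on switch-out, restore lazily on first use" policy the comments describe: if the outgoing task's saved MSR says it touched the FPU, AltiVec or SPE during its last quantum, the live registers are flushed into its thread_struct before the switch, and the next use traps and reloads them. A minimal user-space sketch of that policy, with toy names standing in for giveup_fpu() and thread_struct (only giveup_fpu() itself appears in the hunk; everything else here is illustrative):

    #include <stdio.h>

    #define TOY_MSR_FP 0x2000UL          /* stand-in for MSR_FP from asm/reg.h */

    struct toy_task {
        unsigned long msr;               /* saved user MSR bits               */
        double live_fpr;                 /* the "live" FP register file       */
        double saved_fpr;                /* where thread_struct keeps it      */
    };

    /* Eager save on switch-out, analogous to giveup_fpu(prev) above. */
    static void toy_giveup_fpu(struct toy_task *t)
    {
        t->saved_fpr = t->live_fpr;      /* flush live state to the task      */
        t->msr &= ~TOY_MSR_FP;           /* next FP use traps and reloads     */
    }

    static void toy_switch_out(struct toy_task *prev)
    {
        if (prev->msr & TOY_MSR_FP)      /* used the FPU in its last quantum? */
            toy_giveup_fpu(prev);
    }

    int main(void)
    {
        struct toy_task t = { .msr = TOY_MSR_FP, .live_fpr = 3.14 };
        toy_switch_out(&t);
        printf("saved=%.2f msr=%#lx\n", t.saved_fpr, t.msr);
        return 0;
    }

The non-SMP branch is the lazy-restore half: if the incoming task was also the last user of the unit, its MSR bit is simply turned back on and the registers still sitting in the hardware are reused without a reload.
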
342 341
343 static int instructions_to_print = 16; 342 static int instructions_to_print = 16;
344 343
345 #ifdef CONFIG_PPC64 344 #ifdef CONFIG_PPC64
346 #define BAD_PC(pc) ((REGION_ID(pc) != KERNEL_REGION_ID) && \ 345 #define BAD_PC(pc) ((REGION_ID(pc) != KERNEL_REGION_ID) && \
347 (REGION_ID(pc) != VMALLOC_REGION_ID)) 346 (REGION_ID(pc) != VMALLOC_REGION_ID))
348 #else 347 #else
349 #define BAD_PC(pc) ((pc) < KERNELBASE) 348 #define BAD_PC(pc) ((pc) < KERNELBASE)
350 #endif 349 #endif
351 350
352 static void show_instructions(struct pt_regs *regs) 351 static void show_instructions(struct pt_regs *regs)
353 { 352 {
354 int i; 353 int i;
355 unsigned long pc = regs->nip - (instructions_to_print * 3 / 4 * 354 unsigned long pc = regs->nip - (instructions_to_print * 3 / 4 *
356 sizeof(int)); 355 sizeof(int));
357 356
358 printk("Instruction dump:"); 357 printk("Instruction dump:");
359 358
360 for (i = 0; i < instructions_to_print; i++) { 359 for (i = 0; i < instructions_to_print; i++) {
361 int instr; 360 int instr;
362 361
363 if (!(i % 8)) 362 if (!(i % 8))
364 printk("\n"); 363 printk("\n");
365 364
366 if (BAD_PC(pc) || __get_user(instr, (unsigned int *)pc)) { 365 if (BAD_PC(pc) || __get_user(instr, (unsigned int *)pc)) {
367 printk("XXXXXXXX "); 366 printk("XXXXXXXX ");
368 } else { 367 } else {
369 if (regs->nip == pc) 368 if (regs->nip == pc)
370 printk("<%08x> ", instr); 369 printk("<%08x> ", instr);
371 else 370 else
372 printk("%08x ", instr); 371 printk("%08x ", instr);
373 } 372 }
374 373
375 pc += sizeof(int); 374 pc += sizeof(int);
376 } 375 }
377 376
378 printk("\n"); 377 printk("\n");
379 } 378 }
380 379
381 static struct regbit { 380 static struct regbit {
382 unsigned long bit; 381 unsigned long bit;
383 const char *name; 382 const char *name;
384 } msr_bits[] = { 383 } msr_bits[] = {
385 {MSR_EE, "EE"}, 384 {MSR_EE, "EE"},
386 {MSR_PR, "PR"}, 385 {MSR_PR, "PR"},
387 {MSR_FP, "FP"}, 386 {MSR_FP, "FP"},
388 {MSR_ME, "ME"}, 387 {MSR_ME, "ME"},
389 {MSR_IR, "IR"}, 388 {MSR_IR, "IR"},
390 {MSR_DR, "DR"}, 389 {MSR_DR, "DR"},
391 {0, NULL} 390 {0, NULL}
392 }; 391 };
393 392
394 static void printbits(unsigned long val, struct regbit *bits) 393 static void printbits(unsigned long val, struct regbit *bits)
395 { 394 {
396 const char *sep = ""; 395 const char *sep = "";
397 396
398 printk("<"); 397 printk("<");
399 for (; bits->bit; ++bits) 398 for (; bits->bit; ++bits)
400 if (val & bits->bit) { 399 if (val & bits->bit) {
401 printk("%s%s", sep, bits->name); 400 printk("%s%s", sep, bits->name);
402 sep = ","; 401 sep = ",";
403 } 402 }
404 printk(">"); 403 printk(">");
405 } 404 }
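
printbits() just walks the msr_bits table and prints the names of whichever bits are set, comma-separated between angle brackets; show_regs() below calls it right after printing the raw MSR value. A stand-alone copy for illustration, with made-up bit values (the real ones come from asm/reg.h):

    #include <stdio.h>

    struct regbit { unsigned long bit; const char *name; };

    static const struct regbit demo_bits[] = {
        {0x8000, "EE"}, {0x4000, "PR"}, {0x2000, "FP"},
        {0x1000, "ME"}, {0x0020, "IR"}, {0x0010, "DR"}, {0, NULL}
    };

    static void demo_printbits(unsigned long val, const struct regbit *bits)
    {
        const char *sep = "";
        printf("<");
        for (; bits->bit; ++bits)
            if (val & bits->bit) {
                printf("%s%s", sep, bits->name);
                sep = ",";
            }
        printf(">");
    }

    int main(void)
    {
        demo_printbits(0x9030, demo_bits);   /* prints <EE,ME,IR,DR> */
        printf("\n");
        return 0;
    }

So a typical kernel-mode MSR renders as something like <EE,ME,IR,DR> on the MSR: line of an oops.
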
406 405
407 #ifdef CONFIG_PPC64 406 #ifdef CONFIG_PPC64
408 #define REG "%016lX" 407 #define REG "%016lX"
409 #define REGS_PER_LINE 4 408 #define REGS_PER_LINE 4
410 #define LAST_VOLATILE 13 409 #define LAST_VOLATILE 13
411 #else 410 #else
412 #define REG "%08lX" 411 #define REG "%08lX"
413 #define REGS_PER_LINE 8 412 #define REGS_PER_LINE 8
414 #define LAST_VOLATILE 12 413 #define LAST_VOLATILE 12
415 #endif 414 #endif
416 415
417 void show_regs(struct pt_regs * regs) 416 void show_regs(struct pt_regs * regs)
418 { 417 {
419 int i, trap; 418 int i, trap;
420 419
421 printk("NIP: "REG" LR: "REG" CTR: "REG"\n", 420 printk("NIP: "REG" LR: "REG" CTR: "REG"\n",
422 regs->nip, regs->link, regs->ctr); 421 regs->nip, regs->link, regs->ctr);
423 printk("REGS: %p TRAP: %04lx %s (%s)\n", 422 printk("REGS: %p TRAP: %04lx %s (%s)\n",
424 regs, regs->trap, print_tainted(), system_utsname.release); 423 regs, regs->trap, print_tainted(), system_utsname.release);
425 printk("MSR: "REG" ", regs->msr); 424 printk("MSR: "REG" ", regs->msr);
426 printbits(regs->msr, msr_bits); 425 printbits(regs->msr, msr_bits);
427 printk(" CR: %08lX XER: %08lX\n", regs->ccr, regs->xer); 426 printk(" CR: %08lX XER: %08lX\n", regs->ccr, regs->xer);
428 trap = TRAP(regs); 427 trap = TRAP(regs);
429 if (trap == 0x300 || trap == 0x600) 428 if (trap == 0x300 || trap == 0x600)
430 printk("DAR: "REG", DSISR: "REG"\n", regs->dar, regs->dsisr); 429 printk("DAR: "REG", DSISR: "REG"\n", regs->dar, regs->dsisr);
431 printk("TASK = %p[%d] '%s' THREAD: %p", 430 printk("TASK = %p[%d] '%s' THREAD: %p",
432 current, current->pid, current->comm, task_thread_info(current)); 431 current, current->pid, current->comm, task_thread_info(current));
433 432
434 #ifdef CONFIG_SMP 433 #ifdef CONFIG_SMP
435 printk(" CPU: %d", smp_processor_id()); 434 printk(" CPU: %d", smp_processor_id());
436 #endif /* CONFIG_SMP */ 435 #endif /* CONFIG_SMP */
437 436
438 for (i = 0; i < 32; i++) { 437 for (i = 0; i < 32; i++) {
439 if ((i % REGS_PER_LINE) == 0) 438 if ((i % REGS_PER_LINE) == 0)
440 printk("\n" KERN_INFO "GPR%02d: ", i); 439 printk("\n" KERN_INFO "GPR%02d: ", i);
441 printk(REG " ", regs->gpr[i]); 440 printk(REG " ", regs->gpr[i]);
442 if (i == LAST_VOLATILE && !FULL_REGS(regs)) 441 if (i == LAST_VOLATILE && !FULL_REGS(regs))
443 break; 442 break;
444 } 443 }
445 printk("\n"); 444 printk("\n");
446 #ifdef CONFIG_KALLSYMS 445 #ifdef CONFIG_KALLSYMS
447 /* 446 /*
448 * Lookup NIP late so we have the best chance of getting the 447 * Lookup NIP late so we have the best chance of getting the
449 * above info out without failing 448 * above info out without failing
450 */ 449 */
451 printk("NIP ["REG"] ", regs->nip); 450 printk("NIP ["REG"] ", regs->nip);
452 print_symbol("%s\n", regs->nip); 451 print_symbol("%s\n", regs->nip);
453 printk("LR ["REG"] ", regs->link); 452 printk("LR ["REG"] ", regs->link);
454 print_symbol("%s\n", regs->link); 453 print_symbol("%s\n", regs->link);
455 #endif 454 #endif
456 show_stack(current, (unsigned long *) regs->gpr[1]); 455 show_stack(current, (unsigned long *) regs->gpr[1]);
457 if (!user_mode(regs)) 456 if (!user_mode(regs))
458 show_instructions(regs); 457 show_instructions(regs);
459 } 458 }
460 459
461 void exit_thread(void) 460 void exit_thread(void)
462 { 461 {
463 kprobe_flush_task(current);
464 discard_lazy_cpu_state(); 462 discard_lazy_cpu_state();
465 } 463 }
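
This one-line deletion is the powerpc half of what the changelog describes: exit_thread() no longer flushes the task's kretprobe instances, because a probed task that exits inside schedule() never returns through its return-probe trampoline, and the unreturned instances are now recycled on the exiting task's behalf (the x86_64 hunk below drops the same call; the new recycling site is in parts of this commit that are not shown here). The sketch below only illustrates the bookkeeping involved; the names and lists are simplified stand-ins, not the kernel's kretprobe implementation:

    #include <stdio.h>

    /* Each dispatched return-probe instance remembers which task it was
     * handed to; "flushing" a task moves its unreturned instances back onto
     * the owning probe's free list so they can be reused instead of leaked. */
    struct toy_instance {
        int task_id;
        struct toy_instance *next;
    };

    struct toy_kretprobe {
        struct toy_instance *used;   /* waiting for a return that may never come */
        struct toy_instance *free;   /* available for new hits                   */
    };

    static void toy_flush_task(struct toy_kretprobe *rp, int task_id)
    {
        struct toy_instance **pp = &rp->used;

        while (*pp) {
            if ((*pp)->task_id == task_id) {
                struct toy_instance *ri = *pp;
                *pp = ri->next;          /* unlink from the used list  */
                ri->next = rp->free;     /* recycle onto the free list */
                rp->free = ri;
            } else {
                pp = &(*pp)->next;
            }
        }
    }

    int main(void)
    {
        struct toy_instance a = { .task_id = 42, .next = NULL };
        struct toy_kretprobe rp = { .used = &a, .free = NULL };

        toy_flush_task(&rp, 42);
        printf("recycled: %s\n", rp.free == &a ? "yes" : "no");
        return 0;
    }
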
466 464
467 void flush_thread(void) 465 void flush_thread(void)
468 { 466 {
469 #ifdef CONFIG_PPC64 467 #ifdef CONFIG_PPC64
470 struct thread_info *t = current_thread_info(); 468 struct thread_info *t = current_thread_info();
471 469
472 if (t->flags & _TIF_ABI_PENDING) 470 if (t->flags & _TIF_ABI_PENDING)
473 t->flags ^= (_TIF_ABI_PENDING | _TIF_32BIT); 471 t->flags ^= (_TIF_ABI_PENDING | _TIF_32BIT);
474 #endif 472 #endif
475 473
476 discard_lazy_cpu_state(); 474 discard_lazy_cpu_state();
477 475
478 #ifdef CONFIG_PPC64 /* for now */ 476 #ifdef CONFIG_PPC64 /* for now */
479 if (current->thread.dabr) { 477 if (current->thread.dabr) {
480 current->thread.dabr = 0; 478 current->thread.dabr = 0;
481 set_dabr(0); 479 set_dabr(0);
482 } 480 }
483 #endif 481 #endif
484 } 482 }
485 483
486 void 484 void
487 release_thread(struct task_struct *t) 485 release_thread(struct task_struct *t)
488 { 486 {
489 } 487 }
490 488
491 /* 489 /*
492 * This gets called before we allocate a new thread and copy 490 * This gets called before we allocate a new thread and copy
493 * the current task into it. 491 * the current task into it.
494 */ 492 */
495 void prepare_to_copy(struct task_struct *tsk) 493 void prepare_to_copy(struct task_struct *tsk)
496 { 494 {
497 flush_fp_to_thread(current); 495 flush_fp_to_thread(current);
498 flush_altivec_to_thread(current); 496 flush_altivec_to_thread(current);
499 flush_spe_to_thread(current); 497 flush_spe_to_thread(current);
500 } 498 }
501 499
502 /* 500 /*
503 * Copy a thread.. 501 * Copy a thread..
504 */ 502 */
505 int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, 503 int copy_thread(int nr, unsigned long clone_flags, unsigned long usp,
506 unsigned long unused, struct task_struct *p, 504 unsigned long unused, struct task_struct *p,
507 struct pt_regs *regs) 505 struct pt_regs *regs)
508 { 506 {
509 struct pt_regs *childregs, *kregs; 507 struct pt_regs *childregs, *kregs;
510 extern void ret_from_fork(void); 508 extern void ret_from_fork(void);
511 unsigned long sp = (unsigned long)task_stack_page(p) + THREAD_SIZE; 509 unsigned long sp = (unsigned long)task_stack_page(p) + THREAD_SIZE;
512 510
513 CHECK_FULL_REGS(regs); 511 CHECK_FULL_REGS(regs);
514 /* Copy registers */ 512 /* Copy registers */
515 sp -= sizeof(struct pt_regs); 513 sp -= sizeof(struct pt_regs);
516 childregs = (struct pt_regs *) sp; 514 childregs = (struct pt_regs *) sp;
517 *childregs = *regs; 515 *childregs = *regs;
518 if ((childregs->msr & MSR_PR) == 0) { 516 if ((childregs->msr & MSR_PR) == 0) {
519 /* for kernel thread, set `current' and stackptr in new task */ 517 /* for kernel thread, set `current' and stackptr in new task */
520 childregs->gpr[1] = sp + sizeof(struct pt_regs); 518 childregs->gpr[1] = sp + sizeof(struct pt_regs);
521 #ifdef CONFIG_PPC32 519 #ifdef CONFIG_PPC32
522 childregs->gpr[2] = (unsigned long) p; 520 childregs->gpr[2] = (unsigned long) p;
523 #else 521 #else
524 clear_tsk_thread_flag(p, TIF_32BIT); 522 clear_tsk_thread_flag(p, TIF_32BIT);
525 #endif 523 #endif
526 p->thread.regs = NULL; /* no user register state */ 524 p->thread.regs = NULL; /* no user register state */
527 } else { 525 } else {
528 childregs->gpr[1] = usp; 526 childregs->gpr[1] = usp;
529 p->thread.regs = childregs; 527 p->thread.regs = childregs;
530 if (clone_flags & CLONE_SETTLS) { 528 if (clone_flags & CLONE_SETTLS) {
531 #ifdef CONFIG_PPC64 529 #ifdef CONFIG_PPC64
532 if (!test_thread_flag(TIF_32BIT)) 530 if (!test_thread_flag(TIF_32BIT))
533 childregs->gpr[13] = childregs->gpr[6]; 531 childregs->gpr[13] = childregs->gpr[6];
534 else 532 else
535 #endif 533 #endif
536 childregs->gpr[2] = childregs->gpr[6]; 534 childregs->gpr[2] = childregs->gpr[6];
537 } 535 }
538 } 536 }
539 childregs->gpr[3] = 0; /* Result from fork() */ 537 childregs->gpr[3] = 0; /* Result from fork() */
540 sp -= STACK_FRAME_OVERHEAD; 538 sp -= STACK_FRAME_OVERHEAD;
541 539
542 /* 540 /*
543 * The way this works is that at some point in the future 541 * The way this works is that at some point in the future
544 * some task will call _switch to switch to the new task. 542 * some task will call _switch to switch to the new task.
545 * That will pop off the stack frame created below and start 543 * That will pop off the stack frame created below and start
546 * the new task running at ret_from_fork. The new task will 544 * the new task running at ret_from_fork. The new task will
547 * do some house keeping and then return from the fork or clone 545 * do some house keeping and then return from the fork or clone
548 * system call, using the stack frame created above. 546 * system call, using the stack frame created above.
549 */ 547 */
550 sp -= sizeof(struct pt_regs); 548 sp -= sizeof(struct pt_regs);
551 kregs = (struct pt_regs *) sp; 549 kregs = (struct pt_regs *) sp;
552 sp -= STACK_FRAME_OVERHEAD; 550 sp -= STACK_FRAME_OVERHEAD;
553 p->thread.ksp = sp; 551 p->thread.ksp = sp;
554 552
555 #ifdef CONFIG_PPC64 553 #ifdef CONFIG_PPC64
556 if (cpu_has_feature(CPU_FTR_SLB)) { 554 if (cpu_has_feature(CPU_FTR_SLB)) {
557 unsigned long sp_vsid = get_kernel_vsid(sp); 555 unsigned long sp_vsid = get_kernel_vsid(sp);
558 unsigned long llp = mmu_psize_defs[mmu_linear_psize].sllp; 556 unsigned long llp = mmu_psize_defs[mmu_linear_psize].sllp;
559 557
560 sp_vsid <<= SLB_VSID_SHIFT; 558 sp_vsid <<= SLB_VSID_SHIFT;
561 sp_vsid |= SLB_VSID_KERNEL | llp; 559 sp_vsid |= SLB_VSID_KERNEL | llp;
562 p->thread.ksp_vsid = sp_vsid; 560 p->thread.ksp_vsid = sp_vsid;
563 } 561 }
564 562
565 /* 563 /*
566 * The PPC64 ABI makes use of a TOC to contain function 564 * The PPC64 ABI makes use of a TOC to contain function
567 * pointers. The function (ret_from_except) is actually a pointer 565 * pointers. The function (ret_from_except) is actually a pointer
568 * to the TOC entry. The first entry is a pointer to the actual 566 * to the TOC entry. The first entry is a pointer to the actual
569 * function. 567 * function.
570 */ 568 */
571 kregs->nip = *((unsigned long *)ret_from_fork); 569 kregs->nip = *((unsigned long *)ret_from_fork);
572 #else 570 #else
573 kregs->nip = (unsigned long)ret_from_fork; 571 kregs->nip = (unsigned long)ret_from_fork;
574 p->thread.last_syscall = -1; 572 p->thread.last_syscall = -1;
575 #endif 573 #endif
576 574
577 return 0; 575 return 0;
578 } 576 }
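
The PPC64 branch of copy_thread() cannot simply take the address of ret_from_fork: as the comment says, under the 64-bit PowerPC ELF ABI a function symbol names a descriptor whose first doubleword is the real entry address and whose second is the TOC pointer, so *((unsigned long *)ret_from_fork) fetches the entry point. start_thread() further down reads the user program's _start descriptor the same way. A stand-alone illustration of that layout, with a descriptor built by hand rather than by a ppc64 toolchain:

    #include <stdio.h>

    /* Shape of a 64-bit PowerPC (ELFv1) function descriptor: a function's
     * "address" points at this triple. The values here are made up. */
    struct func_desc {
        unsigned long entry;    /* address of the first instruction     */
        unsigned long toc;      /* TOC (r2) value the function expects  */
        unsigned long env;      /* environment pointer, normally unused */
    };

    int main(void)
    {
        struct func_desc fake = { .entry = 0x10000000, .toc = 0x10018000, .env = 0 };
        unsigned long *fp = (unsigned long *)&fake;

        /* Same idiom as kregs->nip = *((unsigned long *)ret_from_fork); */
        printf("entry=%#lx toc=%#lx\n", fp[0], fp[1]);
        return 0;
    }
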
579 577
580 /* 578 /*
581 * Set up a thread for executing a new program 579 * Set up a thread for executing a new program
582 */ 580 */
583 void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) 581 void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp)
584 { 582 {
585 #ifdef CONFIG_PPC64 583 #ifdef CONFIG_PPC64
586 unsigned long load_addr = regs->gpr[2]; /* saved by ELF_PLAT_INIT */ 584 unsigned long load_addr = regs->gpr[2]; /* saved by ELF_PLAT_INIT */
587 #endif 585 #endif
588 586
589 set_fs(USER_DS); 587 set_fs(USER_DS);
590 588
591 /* 589 /*
592 * If we exec out of a kernel thread then thread.regs will not be 590 * If we exec out of a kernel thread then thread.regs will not be
593 * set. Do it now. 591 * set. Do it now.
594 */ 592 */
595 if (!current->thread.regs) { 593 if (!current->thread.regs) {
596 struct pt_regs *regs = task_stack_page(current) + THREAD_SIZE; 594 struct pt_regs *regs = task_stack_page(current) + THREAD_SIZE;
597 current->thread.regs = regs - 1; 595 current->thread.regs = regs - 1;
598 } 596 }
599 597
600 memset(regs->gpr, 0, sizeof(regs->gpr)); 598 memset(regs->gpr, 0, sizeof(regs->gpr));
601 regs->ctr = 0; 599 regs->ctr = 0;
602 regs->link = 0; 600 regs->link = 0;
603 regs->xer = 0; 601 regs->xer = 0;
604 regs->ccr = 0; 602 regs->ccr = 0;
605 regs->gpr[1] = sp; 603 regs->gpr[1] = sp;
606 604
607 #ifdef CONFIG_PPC32 605 #ifdef CONFIG_PPC32
608 regs->mq = 0; 606 regs->mq = 0;
609 regs->nip = start; 607 regs->nip = start;
610 regs->msr = MSR_USER; 608 regs->msr = MSR_USER;
611 #else 609 #else
612 if (!test_thread_flag(TIF_32BIT)) { 610 if (!test_thread_flag(TIF_32BIT)) {
613 unsigned long entry, toc; 611 unsigned long entry, toc;
614 612
615 /* start is a relocated pointer to the function descriptor for 613 /* start is a relocated pointer to the function descriptor for
616 * the elf _start routine. The first entry in the function 614 * the elf _start routine. The first entry in the function
617 * descriptor is the entry address of _start and the second 615 * descriptor is the entry address of _start and the second
618 * entry is the TOC value we need to use. 616 * entry is the TOC value we need to use.
619 */ 617 */
620 __get_user(entry, (unsigned long __user *)start); 618 __get_user(entry, (unsigned long __user *)start);
621 __get_user(toc, (unsigned long __user *)start+1); 619 __get_user(toc, (unsigned long __user *)start+1);
622 620
623 /* Check whether the e_entry function descriptor entries 621 /* Check whether the e_entry function descriptor entries
624 * need to be relocated before we can use them. 622 * need to be relocated before we can use them.
625 */ 623 */
626 if (load_addr != 0) { 624 if (load_addr != 0) {
627 entry += load_addr; 625 entry += load_addr;
628 toc += load_addr; 626 toc += load_addr;
629 } 627 }
630 regs->nip = entry; 628 regs->nip = entry;
631 regs->gpr[2] = toc; 629 regs->gpr[2] = toc;
632 regs->msr = MSR_USER64; 630 regs->msr = MSR_USER64;
633 } else { 631 } else {
634 regs->nip = start; 632 regs->nip = start;
635 regs->gpr[2] = 0; 633 regs->gpr[2] = 0;
636 regs->msr = MSR_USER32; 634 regs->msr = MSR_USER32;
637 } 635 }
638 #endif 636 #endif
639 637
640 discard_lazy_cpu_state(); 638 discard_lazy_cpu_state();
641 memset(current->thread.fpr, 0, sizeof(current->thread.fpr)); 639 memset(current->thread.fpr, 0, sizeof(current->thread.fpr));
642 current->thread.fpscr.val = 0; 640 current->thread.fpscr.val = 0;
643 #ifdef CONFIG_ALTIVEC 641 #ifdef CONFIG_ALTIVEC
644 memset(current->thread.vr, 0, sizeof(current->thread.vr)); 642 memset(current->thread.vr, 0, sizeof(current->thread.vr));
645 memset(&current->thread.vscr, 0, sizeof(current->thread.vscr)); 643 memset(&current->thread.vscr, 0, sizeof(current->thread.vscr));
646 current->thread.vscr.u[3] = 0x00010000; /* Java mode disabled */ 644 current->thread.vscr.u[3] = 0x00010000; /* Java mode disabled */
647 current->thread.vrsave = 0; 645 current->thread.vrsave = 0;
648 current->thread.used_vr = 0; 646 current->thread.used_vr = 0;
649 #endif /* CONFIG_ALTIVEC */ 647 #endif /* CONFIG_ALTIVEC */
650 #ifdef CONFIG_SPE 648 #ifdef CONFIG_SPE
651 memset(current->thread.evr, 0, sizeof(current->thread.evr)); 649 memset(current->thread.evr, 0, sizeof(current->thread.evr));
652 current->thread.acc = 0; 650 current->thread.acc = 0;
653 current->thread.spefscr = 0; 651 current->thread.spefscr = 0;
654 current->thread.used_spe = 0; 652 current->thread.used_spe = 0;
655 #endif /* CONFIG_SPE */ 653 #endif /* CONFIG_SPE */
656 } 654 }
657 655
658 #define PR_FP_ALL_EXCEPT (PR_FP_EXC_DIV | PR_FP_EXC_OVF | PR_FP_EXC_UND \ 656 #define PR_FP_ALL_EXCEPT (PR_FP_EXC_DIV | PR_FP_EXC_OVF | PR_FP_EXC_UND \
659 | PR_FP_EXC_RES | PR_FP_EXC_INV) 657 | PR_FP_EXC_RES | PR_FP_EXC_INV)
660 658
661 int set_fpexc_mode(struct task_struct *tsk, unsigned int val) 659 int set_fpexc_mode(struct task_struct *tsk, unsigned int val)
662 { 660 {
663 struct pt_regs *regs = tsk->thread.regs; 661 struct pt_regs *regs = tsk->thread.regs;
664 662
665 /* This is a bit hairy. If we are an SPE enabled processor 663 /* This is a bit hairy. If we are an SPE enabled processor
666 * (have embedded fp) we store the IEEE exception enable flags in 664 * (have embedded fp) we store the IEEE exception enable flags in
667 * fpexc_mode. fpexc_mode is also used for setting FP exception 665 * fpexc_mode. fpexc_mode is also used for setting FP exception
668 * mode (async, precise, disabled) for 'Classic' FP. */ 666 * mode (async, precise, disabled) for 'Classic' FP. */
669 if (val & PR_FP_EXC_SW_ENABLE) { 667 if (val & PR_FP_EXC_SW_ENABLE) {
670 #ifdef CONFIG_SPE 668 #ifdef CONFIG_SPE
671 tsk->thread.fpexc_mode = val & 669 tsk->thread.fpexc_mode = val &
672 (PR_FP_EXC_SW_ENABLE | PR_FP_ALL_EXCEPT); 670 (PR_FP_EXC_SW_ENABLE | PR_FP_ALL_EXCEPT);
673 return 0; 671 return 0;
674 #else 672 #else
675 return -EINVAL; 673 return -EINVAL;
676 #endif 674 #endif
677 } 675 }
678 676
679 /* on a CONFIG_SPE this does not hurt us. The bits that 677 /* on a CONFIG_SPE this does not hurt us. The bits that
680 * __pack_fe01 use do not overlap with bits used for 678 * __pack_fe01 use do not overlap with bits used for
681 * PR_FP_EXC_SW_ENABLE. Additionally, the MSR[FE0,FE1] bits 679 * PR_FP_EXC_SW_ENABLE. Additionally, the MSR[FE0,FE1] bits
682 * on CONFIG_SPE implementations are reserved so writing to 680 * on CONFIG_SPE implementations are reserved so writing to
683 * them does not change anything */ 681 * them does not change anything */
684 if (val > PR_FP_EXC_PRECISE) 682 if (val > PR_FP_EXC_PRECISE)
685 return -EINVAL; 683 return -EINVAL;
686 tsk->thread.fpexc_mode = __pack_fe01(val); 684 tsk->thread.fpexc_mode = __pack_fe01(val);
687 if (regs != NULL && (regs->msr & MSR_FP) != 0) 685 if (regs != NULL && (regs->msr & MSR_FP) != 0)
688 regs->msr = (regs->msr & ~(MSR_FE0|MSR_FE1)) 686 regs->msr = (regs->msr & ~(MSR_FE0|MSR_FE1))
689 | tsk->thread.fpexc_mode; 687 | tsk->thread.fpexc_mode;
690 return 0; 688 return 0;
691 } 689 }
692 690
693 int get_fpexc_mode(struct task_struct *tsk, unsigned long adr) 691 int get_fpexc_mode(struct task_struct *tsk, unsigned long adr)
694 { 692 {
695 unsigned int val; 693 unsigned int val;
696 694
697 if (tsk->thread.fpexc_mode & PR_FP_EXC_SW_ENABLE) 695 if (tsk->thread.fpexc_mode & PR_FP_EXC_SW_ENABLE)
698 #ifdef CONFIG_SPE 696 #ifdef CONFIG_SPE
699 val = tsk->thread.fpexc_mode; 697 val = tsk->thread.fpexc_mode;
700 #else 698 #else
701 return -EINVAL; 699 return -EINVAL;
702 #endif 700 #endif
703 else 701 else
704 val = __unpack_fe01(tsk->thread.fpexc_mode); 702 val = __unpack_fe01(tsk->thread.fpexc_mode);
705 return put_user(val, (unsigned int __user *) adr); 703 return put_user(val, (unsigned int __user *) adr);
706 } 704 }
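
set_fpexc_mode() and get_fpexc_mode() are the powerpc backends behind the PR_SET_FPEXC / PR_GET_FPEXC prctl(2) options (that routing is asserted here, it is not visible in this hunk): classic FP parts pack the requested mode into the MSR FE0/FE1 bits via __pack_fe01(), while SPE parts keep the raw IEEE enable flags in fpexc_mode. A small user-space illustration, error handling mostly elided:

    #include <stdio.h>
    #include <sys/prctl.h>          /* pulls in the PR_* constants */

    int main(void)
    {
        unsigned int mode;

        /* Ask for precise FP exception reporting... */
        if (prctl(PR_SET_FPEXC, PR_FP_EXC_PRECISE, 0, 0, 0) != 0)
            perror("PR_SET_FPEXC");

        /* ...and read the mode back; PR_GET_FPEXC writes through the
         * pointer argument, matching the put_user() in get_fpexc_mode(). */
        if (prctl(PR_GET_FPEXC, (unsigned long)&mode, 0, 0, 0) == 0)
            printf("fpexc mode: %#x\n", mode);
        return 0;
    }
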
707 705
708 #define TRUNC_PTR(x) ((typeof(x))(((unsigned long)(x)) & 0xffffffff)) 706 #define TRUNC_PTR(x) ((typeof(x))(((unsigned long)(x)) & 0xffffffff))
709 707
710 int sys_clone(unsigned long clone_flags, unsigned long usp, 708 int sys_clone(unsigned long clone_flags, unsigned long usp,
711 int __user *parent_tidp, void __user *child_threadptr, 709 int __user *parent_tidp, void __user *child_threadptr,
712 int __user *child_tidp, int p6, 710 int __user *child_tidp, int p6,
713 struct pt_regs *regs) 711 struct pt_regs *regs)
714 { 712 {
715 CHECK_FULL_REGS(regs); 713 CHECK_FULL_REGS(regs);
716 if (usp == 0) 714 if (usp == 0)
717 usp = regs->gpr[1]; /* stack pointer for child */ 715 usp = regs->gpr[1]; /* stack pointer for child */
718 #ifdef CONFIG_PPC64 716 #ifdef CONFIG_PPC64
719 if (test_thread_flag(TIF_32BIT)) { 717 if (test_thread_flag(TIF_32BIT)) {
720 parent_tidp = TRUNC_PTR(parent_tidp); 718 parent_tidp = TRUNC_PTR(parent_tidp);
721 child_tidp = TRUNC_PTR(child_tidp); 719 child_tidp = TRUNC_PTR(child_tidp);
722 } 720 }
723 #endif 721 #endif
724 return do_fork(clone_flags, usp, regs, 0, parent_tidp, child_tidp); 722 return do_fork(clone_flags, usp, regs, 0, parent_tidp, child_tidp);
725 } 723 }
726 724
727 int sys_fork(unsigned long p1, unsigned long p2, unsigned long p3, 725 int sys_fork(unsigned long p1, unsigned long p2, unsigned long p3,
728 unsigned long p4, unsigned long p5, unsigned long p6, 726 unsigned long p4, unsigned long p5, unsigned long p6,
729 struct pt_regs *regs) 727 struct pt_regs *regs)
730 { 728 {
731 CHECK_FULL_REGS(regs); 729 CHECK_FULL_REGS(regs);
732 return do_fork(SIGCHLD, regs->gpr[1], regs, 0, NULL, NULL); 730 return do_fork(SIGCHLD, regs->gpr[1], regs, 0, NULL, NULL);
733 } 731 }
734 732
735 int sys_vfork(unsigned long p1, unsigned long p2, unsigned long p3, 733 int sys_vfork(unsigned long p1, unsigned long p2, unsigned long p3,
736 unsigned long p4, unsigned long p5, unsigned long p6, 734 unsigned long p4, unsigned long p5, unsigned long p6,
737 struct pt_regs *regs) 735 struct pt_regs *regs)
738 { 736 {
739 CHECK_FULL_REGS(regs); 737 CHECK_FULL_REGS(regs);
740 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->gpr[1], 738 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->gpr[1],
741 regs, 0, NULL, NULL); 739 regs, 0, NULL, NULL);
742 } 740 }
743 741
744 int sys_execve(unsigned long a0, unsigned long a1, unsigned long a2, 742 int sys_execve(unsigned long a0, unsigned long a1, unsigned long a2,
745 unsigned long a3, unsigned long a4, unsigned long a5, 743 unsigned long a3, unsigned long a4, unsigned long a5,
746 struct pt_regs *regs) 744 struct pt_regs *regs)
747 { 745 {
748 int error; 746 int error;
749 char *filename; 747 char *filename;
750 748
751 filename = getname((char __user *) a0); 749 filename = getname((char __user *) a0);
752 error = PTR_ERR(filename); 750 error = PTR_ERR(filename);
753 if (IS_ERR(filename)) 751 if (IS_ERR(filename))
754 goto out; 752 goto out;
755 flush_fp_to_thread(current); 753 flush_fp_to_thread(current);
756 flush_altivec_to_thread(current); 754 flush_altivec_to_thread(current);
757 flush_spe_to_thread(current); 755 flush_spe_to_thread(current);
758 error = do_execve(filename, (char __user * __user *) a1, 756 error = do_execve(filename, (char __user * __user *) a1,
759 (char __user * __user *) a2, regs); 757 (char __user * __user *) a2, regs);
760 if (error == 0) { 758 if (error == 0) {
761 task_lock(current); 759 task_lock(current);
762 current->ptrace &= ~PT_DTRACE; 760 current->ptrace &= ~PT_DTRACE;
763 task_unlock(current); 761 task_unlock(current);
764 } 762 }
765 putname(filename); 763 putname(filename);
766 out: 764 out:
767 return error; 765 return error;
768 } 766 }
769 767
770 static int validate_sp(unsigned long sp, struct task_struct *p, 768 static int validate_sp(unsigned long sp, struct task_struct *p,
771 unsigned long nbytes) 769 unsigned long nbytes)
772 { 770 {
773 unsigned long stack_page = (unsigned long)task_stack_page(p); 771 unsigned long stack_page = (unsigned long)task_stack_page(p);
774 772
775 if (sp >= stack_page + sizeof(struct thread_struct) 773 if (sp >= stack_page + sizeof(struct thread_struct)
776 && sp <= stack_page + THREAD_SIZE - nbytes) 774 && sp <= stack_page + THREAD_SIZE - nbytes)
777 return 1; 775 return 1;
778 776
779 #ifdef CONFIG_IRQSTACKS 777 #ifdef CONFIG_IRQSTACKS
780 stack_page = (unsigned long) hardirq_ctx[task_cpu(p)]; 778 stack_page = (unsigned long) hardirq_ctx[task_cpu(p)];
781 if (sp >= stack_page + sizeof(struct thread_struct) 779 if (sp >= stack_page + sizeof(struct thread_struct)
782 && sp <= stack_page + THREAD_SIZE - nbytes) 780 && sp <= stack_page + THREAD_SIZE - nbytes)
783 return 1; 781 return 1;
784 782
785 stack_page = (unsigned long) softirq_ctx[task_cpu(p)]; 783 stack_page = (unsigned long) softirq_ctx[task_cpu(p)];
786 if (sp >= stack_page + sizeof(struct thread_struct) 784 if (sp >= stack_page + sizeof(struct thread_struct)
787 && sp <= stack_page + THREAD_SIZE - nbytes) 785 && sp <= stack_page + THREAD_SIZE - nbytes)
788 return 1; 786 return 1;
789 #endif 787 #endif
790 788
791 return 0; 789 return 0;
792 } 790 }
793 791
794 #ifdef CONFIG_PPC64 792 #ifdef CONFIG_PPC64
795 #define MIN_STACK_FRAME 112 /* same as STACK_FRAME_OVERHEAD, in fact */ 793 #define MIN_STACK_FRAME 112 /* same as STACK_FRAME_OVERHEAD, in fact */
796 #define FRAME_LR_SAVE 2 794 #define FRAME_LR_SAVE 2
797 #define INT_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD + 288) 795 #define INT_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD + 288)
798 #define REGS_MARKER 0x7265677368657265ul 796 #define REGS_MARKER 0x7265677368657265ul
799 #define FRAME_MARKER 12 797 #define FRAME_MARKER 12
800 #else 798 #else
801 #define MIN_STACK_FRAME 16 799 #define MIN_STACK_FRAME 16
802 #define FRAME_LR_SAVE 1 800 #define FRAME_LR_SAVE 1
803 #define INT_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD) 801 #define INT_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD)
804 #define REGS_MARKER 0x72656773ul 802 #define REGS_MARKER 0x72656773ul
805 #define FRAME_MARKER 2 803 #define FRAME_MARKER 2
806 #endif 804 #endif
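
REGS_MARKER is less magic than it looks: 0x7265677368657265 is simply the ASCII bytes of "regshere" (and the 32-bit 0x72656773 is "regs"), which is the signature the "regshere" comment in show_stack() below looks for at FRAME_MARKER to decide that a frame holds a saved pt_regs. A quick check:

    #include <stdio.h>

    int main(void)
    {
        unsigned long long marker = 0x7265677368657265ULL;   /* 64-bit REGS_MARKER */
        char buf[9] = { 0 };

        for (int i = 0; i < 8; i++)
            buf[i] = (char)(marker >> (56 - 8 * i));  /* most significant byte first */
        printf("%s\n", buf);                          /* prints: regshere */
        return 0;
    }
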
807 805
808 unsigned long get_wchan(struct task_struct *p) 806 unsigned long get_wchan(struct task_struct *p)
809 { 807 {
810 unsigned long ip, sp; 808 unsigned long ip, sp;
811 int count = 0; 809 int count = 0;
812 810
813 if (!p || p == current || p->state == TASK_RUNNING) 811 if (!p || p == current || p->state == TASK_RUNNING)
814 return 0; 812 return 0;
815 813
816 sp = p->thread.ksp; 814 sp = p->thread.ksp;
817 if (!validate_sp(sp, p, MIN_STACK_FRAME)) 815 if (!validate_sp(sp, p, MIN_STACK_FRAME))
818 return 0; 816 return 0;
819 817
820 do { 818 do {
821 sp = *(unsigned long *)sp; 819 sp = *(unsigned long *)sp;
822 if (!validate_sp(sp, p, MIN_STACK_FRAME)) 820 if (!validate_sp(sp, p, MIN_STACK_FRAME))
823 return 0; 821 return 0;
824 if (count > 0) { 822 if (count > 0) {
825 ip = ((unsigned long *)sp)[FRAME_LR_SAVE]; 823 ip = ((unsigned long *)sp)[FRAME_LR_SAVE];
826 if (!in_sched_functions(ip)) 824 if (!in_sched_functions(ip))
827 return ip; 825 return ip;
828 } 826 }
829 } while (count++ < 16); 827 } while (count++ < 16);
830 return 0; 828 return 0;
831 } 829 }
832 EXPORT_SYMBOL(get_wchan); 830 EXPORT_SYMBOL(get_wchan);
833 831
834 static int kstack_depth_to_print = 64; 832 static int kstack_depth_to_print = 64;
835 833
836 void show_stack(struct task_struct *tsk, unsigned long *stack) 834 void show_stack(struct task_struct *tsk, unsigned long *stack)
837 { 835 {
838 unsigned long sp, ip, lr, newsp; 836 unsigned long sp, ip, lr, newsp;
839 int count = 0; 837 int count = 0;
840 int firstframe = 1; 838 int firstframe = 1;
841 839
842 sp = (unsigned long) stack; 840 sp = (unsigned long) stack;
843 if (tsk == NULL) 841 if (tsk == NULL)
844 tsk = current; 842 tsk = current;
845 if (sp == 0) { 843 if (sp == 0) {
846 if (tsk == current) 844 if (tsk == current)
847 asm("mr %0,1" : "=r" (sp)); 845 asm("mr %0,1" : "=r" (sp));
848 else 846 else
849 sp = tsk->thread.ksp; 847 sp = tsk->thread.ksp;
850 } 848 }
851 849
852 lr = 0; 850 lr = 0;
853 printk("Call Trace:\n"); 851 printk("Call Trace:\n");
854 do { 852 do {
855 if (!validate_sp(sp, tsk, MIN_STACK_FRAME)) 853 if (!validate_sp(sp, tsk, MIN_STACK_FRAME))
856 return; 854 return;
857 855
858 stack = (unsigned long *) sp; 856 stack = (unsigned long *) sp;
859 newsp = stack[0]; 857 newsp = stack[0];
860 ip = stack[FRAME_LR_SAVE]; 858 ip = stack[FRAME_LR_SAVE];
861 if (!firstframe || ip != lr) { 859 if (!firstframe || ip != lr) {
862 printk("["REG"] ["REG"] ", sp, ip); 860 printk("["REG"] ["REG"] ", sp, ip);
863 print_symbol("%s", ip); 861 print_symbol("%s", ip);
864 if (firstframe) 862 if (firstframe)
865 printk(" (unreliable)"); 863 printk(" (unreliable)");
866 printk("\n"); 864 printk("\n");
867 } 865 }
868 firstframe = 0; 866 firstframe = 0;
869 867
870 /* 868 /*
871 * See if this is an exception frame. 869 * See if this is an exception frame.
872 * We look for the "regshere" marker in the current frame. 870 * We look for the "regshere" marker in the current frame.
873 */ 871 */
874 if (validate_sp(sp, tsk, INT_FRAME_SIZE) 872 if (validate_sp(sp, tsk, INT_FRAME_SIZE)
875 && stack[FRAME_MARKER] == REGS_MARKER) { 873 && stack[FRAME_MARKER] == REGS_MARKER) {
876 struct pt_regs *regs = (struct pt_regs *) 874 struct pt_regs *regs = (struct pt_regs *)
877 (sp + STACK_FRAME_OVERHEAD); 875 (sp + STACK_FRAME_OVERHEAD);
878 printk("--- Exception: %lx", regs->trap); 876 printk("--- Exception: %lx", regs->trap);
879 print_symbol(" at %s\n", regs->nip); 877 print_symbol(" at %s\n", regs->nip);
880 lr = regs->link; 878 lr = regs->link;
881 print_symbol(" LR = %s\n", lr); 879 print_symbol(" LR = %s\n", lr);
882 firstframe = 1; 880 firstframe = 1;
883 } 881 }
884 882
885 sp = newsp; 883 sp = newsp;
886 } while (count++ < kstack_depth_to_print); 884 } while (count++ < kstack_depth_to_print);
887 } 885 }
888 886
889 void dump_stack(void) 887 void dump_stack(void)
890 { 888 {
891 show_stack(current, NULL); 889 show_stack(current, NULL);
892 } 890 }
893 EXPORT_SYMBOL(dump_stack); 891 EXPORT_SYMBOL(dump_stack);
894 892
895 #ifdef CONFIG_PPC64 893 #ifdef CONFIG_PPC64
896 void ppc64_runlatch_on(void) 894 void ppc64_runlatch_on(void)
897 { 895 {
898 unsigned long ctrl; 896 unsigned long ctrl;
899 897
900 if (cpu_has_feature(CPU_FTR_CTRL) && !test_thread_flag(TIF_RUNLATCH)) { 898 if (cpu_has_feature(CPU_FTR_CTRL) && !test_thread_flag(TIF_RUNLATCH)) {
901 HMT_medium(); 899 HMT_medium();
902 900
903 ctrl = mfspr(SPRN_CTRLF); 901 ctrl = mfspr(SPRN_CTRLF);
904 ctrl |= CTRL_RUNLATCH; 902 ctrl |= CTRL_RUNLATCH;
905 mtspr(SPRN_CTRLT, ctrl); 903 mtspr(SPRN_CTRLT, ctrl);
906 904
907 set_thread_flag(TIF_RUNLATCH); 905 set_thread_flag(TIF_RUNLATCH);
908 } 906 }
909 } 907 }
910 908
911 void ppc64_runlatch_off(void) 909 void ppc64_runlatch_off(void)
912 { 910 {
913 unsigned long ctrl; 911 unsigned long ctrl;
914 912
915 if (cpu_has_feature(CPU_FTR_CTRL) && test_thread_flag(TIF_RUNLATCH)) { 913 if (cpu_has_feature(CPU_FTR_CTRL) && test_thread_flag(TIF_RUNLATCH)) {
916 HMT_medium(); 914 HMT_medium();
917 915
918 clear_thread_flag(TIF_RUNLATCH); 916 clear_thread_flag(TIF_RUNLATCH);
919 917
920 ctrl = mfspr(SPRN_CTRLF); 918 ctrl = mfspr(SPRN_CTRLF);
921 ctrl &= ~CTRL_RUNLATCH; 919 ctrl &= ~CTRL_RUNLATCH;
922 mtspr(SPRN_CTRLT, ctrl); 920 mtspr(SPRN_CTRLT, ctrl);
923 } 921 }
924 } 922 }
925 #endif 923 #endif
926 924
arch/x86_64/kernel/process.c
1 /* 1 /*
2 * linux/arch/x86-64/kernel/process.c 2 * linux/arch/x86-64/kernel/process.c
3 * 3 *
4 * Copyright (C) 1995 Linus Torvalds 4 * Copyright (C) 1995 Linus Torvalds
5 * 5 *
6 * Pentium III FXSR, SSE support 6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000 7 * Gareth Hughes <gareth@valinux.com>, May 2000
8 * 8 *
9 * X86-64 port 9 * X86-64 port
10 * Andi Kleen. 10 * Andi Kleen.
11 * 11 *
12 * CPU hotplug support - ashok.raj@intel.com 12 * CPU hotplug support - ashok.raj@intel.com
13 * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $ 13 * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $
14 */ 14 */
15 15
16 /* 16 /*
17 * This file handles the architecture-dependent parts of process handling.. 17 * This file handles the architecture-dependent parts of process handling..
18 */ 18 */
19 19
20 #include <stdarg.h> 20 #include <stdarg.h>
21 21
22 #include <linux/cpu.h> 22 #include <linux/cpu.h>
23 #include <linux/errno.h> 23 #include <linux/errno.h>
24 #include <linux/sched.h> 24 #include <linux/sched.h>
25 #include <linux/kernel.h> 25 #include <linux/kernel.h>
26 #include <linux/mm.h> 26 #include <linux/mm.h>
27 #include <linux/elfcore.h> 27 #include <linux/elfcore.h>
28 #include <linux/smp.h> 28 #include <linux/smp.h>
29 #include <linux/slab.h> 29 #include <linux/slab.h>
30 #include <linux/user.h> 30 #include <linux/user.h>
31 #include <linux/module.h> 31 #include <linux/module.h>
32 #include <linux/a.out.h> 32 #include <linux/a.out.h>
33 #include <linux/interrupt.h> 33 #include <linux/interrupt.h>
34 #include <linux/delay.h> 34 #include <linux/delay.h>
35 #include <linux/ptrace.h> 35 #include <linux/ptrace.h>
36 #include <linux/utsname.h> 36 #include <linux/utsname.h>
37 #include <linux/random.h> 37 #include <linux/random.h>
38 #include <linux/kprobes.h>
39 #include <linux/notifier.h> 38 #include <linux/notifier.h>
39 #include <linux/kprobes.h>
40 40
41 #include <asm/uaccess.h> 41 #include <asm/uaccess.h>
42 #include <asm/pgtable.h> 42 #include <asm/pgtable.h>
43 #include <asm/system.h> 43 #include <asm/system.h>
44 #include <asm/io.h> 44 #include <asm/io.h>
45 #include <asm/processor.h> 45 #include <asm/processor.h>
46 #include <asm/i387.h> 46 #include <asm/i387.h>
47 #include <asm/mmu_context.h> 47 #include <asm/mmu_context.h>
48 #include <asm/pda.h> 48 #include <asm/pda.h>
49 #include <asm/prctl.h> 49 #include <asm/prctl.h>
50 #include <asm/kdebug.h> 50 #include <asm/kdebug.h>
51 #include <asm/desc.h> 51 #include <asm/desc.h>
52 #include <asm/proto.h> 52 #include <asm/proto.h>
53 #include <asm/ia32.h> 53 #include <asm/ia32.h>
54 #include <asm/idle.h> 54 #include <asm/idle.h>
55 55
56 asmlinkage extern void ret_from_fork(void); 56 asmlinkage extern void ret_from_fork(void);
57 57
58 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; 58 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
59 59
60 unsigned long boot_option_idle_override = 0; 60 unsigned long boot_option_idle_override = 0;
61 EXPORT_SYMBOL(boot_option_idle_override); 61 EXPORT_SYMBOL(boot_option_idle_override);
62 62
63 /* 63 /*
64 * Powermanagement idle function, if any.. 64 * Powermanagement idle function, if any..
65 */ 65 */
66 void (*pm_idle)(void); 66 void (*pm_idle)(void);
67 static DEFINE_PER_CPU(unsigned int, cpu_idle_state); 67 static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
68 68
69 static struct notifier_block *idle_notifier; 69 static struct notifier_block *idle_notifier;
70 static DEFINE_SPINLOCK(idle_notifier_lock); 70 static DEFINE_SPINLOCK(idle_notifier_lock);
71 71
72 void idle_notifier_register(struct notifier_block *n) 72 void idle_notifier_register(struct notifier_block *n)
73 { 73 {
74 unsigned long flags; 74 unsigned long flags;
75 spin_lock_irqsave(&idle_notifier_lock, flags); 75 spin_lock_irqsave(&idle_notifier_lock, flags);
76 notifier_chain_register(&idle_notifier, n); 76 notifier_chain_register(&idle_notifier, n);
77 spin_unlock_irqrestore(&idle_notifier_lock, flags); 77 spin_unlock_irqrestore(&idle_notifier_lock, flags);
78 } 78 }
79 EXPORT_SYMBOL_GPL(idle_notifier_register); 79 EXPORT_SYMBOL_GPL(idle_notifier_register);
80 80
81 void idle_notifier_unregister(struct notifier_block *n) 81 void idle_notifier_unregister(struct notifier_block *n)
82 { 82 {
83 unsigned long flags; 83 unsigned long flags;
84 spin_lock_irqsave(&idle_notifier_lock, flags); 84 spin_lock_irqsave(&idle_notifier_lock, flags);
85 notifier_chain_unregister(&idle_notifier, n); 85 notifier_chain_unregister(&idle_notifier, n);
86 spin_unlock_irqrestore(&idle_notifier_lock, flags); 86 spin_unlock_irqrestore(&idle_notifier_lock, flags);
87 } 87 }
88 EXPORT_SYMBOL(idle_notifier_unregister); 88 EXPORT_SYMBOL(idle_notifier_unregister);
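
idle_notifier_register()/idle_notifier_unregister() let other kernel code hook the idle transitions that enter_idle() and __exit_idle() below broadcast as IDLE_START and IDLE_END. A hedged sketch of a client, kernel-style pseudocode rather than a complete module (the callback signature is the standard notifier_block one; the hook bodies are placeholders):

    /* Sketch of an idle-notifier client built on the API above. */
    static int my_idle_event(struct notifier_block *nb, unsigned long action,
                             void *data)
    {
        if (action == IDLE_START) {
            /* this CPU is about to run its idle routine */
        } else if (action == IDLE_END) {
            /* this CPU left idle, e.g. to service an interrupt */
        }
        return 0;                        /* i.e. NOTIFY_DONE */
    }

    static struct notifier_block my_idle_nb = {
        .notifier_call = my_idle_event,
    };

    static void my_idle_hook_init(void) { idle_notifier_register(&my_idle_nb); }
    static void my_idle_hook_exit(void) { idle_notifier_unregister(&my_idle_nb); }
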
89 89
90 enum idle_state { CPU_IDLE, CPU_NOT_IDLE }; 90 enum idle_state { CPU_IDLE, CPU_NOT_IDLE };
91 static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE; 91 static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE;
92 92
93 void enter_idle(void) 93 void enter_idle(void)
94 { 94 {
95 __get_cpu_var(idle_state) = CPU_IDLE; 95 __get_cpu_var(idle_state) = CPU_IDLE;
96 notifier_call_chain(&idle_notifier, IDLE_START, NULL); 96 notifier_call_chain(&idle_notifier, IDLE_START, NULL);
97 } 97 }
98 98
99 static void __exit_idle(void) 99 static void __exit_idle(void)
100 { 100 {
101 __get_cpu_var(idle_state) = CPU_NOT_IDLE; 101 __get_cpu_var(idle_state) = CPU_NOT_IDLE;
102 notifier_call_chain(&idle_notifier, IDLE_END, NULL); 102 notifier_call_chain(&idle_notifier, IDLE_END, NULL);
103 } 103 }
104 104
105 /* Called from interrupts to signify idle end */ 105 /* Called from interrupts to signify idle end */
106 void exit_idle(void) 106 void exit_idle(void)
107 { 107 {
108 if (current->pid | read_pda(irqcount)) 108 if (current->pid | read_pda(irqcount))
109 return; 109 return;
110 __exit_idle(); 110 __exit_idle();
111 } 111 }
112 112
113 /* 113 /*
114 * We use this if we don't have any better 114 * We use this if we don't have any better
115 * idle routine.. 115 * idle routine..
116 */ 116 */
117 static void default_idle(void) 117 static void default_idle(void)
118 { 118 {
119 local_irq_enable(); 119 local_irq_enable();
120 120
121 clear_thread_flag(TIF_POLLING_NRFLAG); 121 clear_thread_flag(TIF_POLLING_NRFLAG);
122 smp_mb__after_clear_bit(); 122 smp_mb__after_clear_bit();
123 while (!need_resched()) { 123 while (!need_resched()) {
124 local_irq_disable(); 124 local_irq_disable();
125 if (!need_resched()) 125 if (!need_resched())
126 safe_halt(); 126 safe_halt();
127 else 127 else
128 local_irq_enable(); 128 local_irq_enable();
129 } 129 }
130 set_thread_flag(TIF_POLLING_NRFLAG); 130 set_thread_flag(TIF_POLLING_NRFLAG);
131 } 131 }
132 132
133 /* 133 /*
134 * On SMP it's slightly faster (but much more power-consuming!) 134 * On SMP it's slightly faster (but much more power-consuming!)
135 * to poll the ->need_resched flag instead of waiting for the 135 * to poll the ->need_resched flag instead of waiting for the
136 * cross-CPU IPI to arrive. Use this option with caution. 136 * cross-CPU IPI to arrive. Use this option with caution.
137 */ 137 */
138 static void poll_idle (void) 138 static void poll_idle (void)
139 { 139 {
140 local_irq_enable(); 140 local_irq_enable();
141 141
142 asm volatile( 142 asm volatile(
143 "2:" 143 "2:"
144 "testl %0,%1;" 144 "testl %0,%1;"
145 "rep; nop;" 145 "rep; nop;"
146 "je 2b;" 146 "je 2b;"
147 : : 147 : :
148 "i" (_TIF_NEED_RESCHED), 148 "i" (_TIF_NEED_RESCHED),
149 "m" (current_thread_info()->flags)); 149 "m" (current_thread_info()->flags));
150 } 150 }
151 151
152 void cpu_idle_wait(void) 152 void cpu_idle_wait(void)
153 { 153 {
154 unsigned int cpu, this_cpu = get_cpu(); 154 unsigned int cpu, this_cpu = get_cpu();
155 cpumask_t map; 155 cpumask_t map;
156 156
157 set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); 157 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
158 put_cpu(); 158 put_cpu();
159 159
160 cpus_clear(map); 160 cpus_clear(map);
161 for_each_online_cpu(cpu) { 161 for_each_online_cpu(cpu) {
162 per_cpu(cpu_idle_state, cpu) = 1; 162 per_cpu(cpu_idle_state, cpu) = 1;
163 cpu_set(cpu, map); 163 cpu_set(cpu, map);
164 } 164 }
165 165
166 __get_cpu_var(cpu_idle_state) = 0; 166 __get_cpu_var(cpu_idle_state) = 0;
167 167
168 wmb(); 168 wmb();
169 do { 169 do {
170 ssleep(1); 170 ssleep(1);
171 for_each_online_cpu(cpu) { 171 for_each_online_cpu(cpu) {
172 if (cpu_isset(cpu, map) && 172 if (cpu_isset(cpu, map) &&
173 !per_cpu(cpu_idle_state, cpu)) 173 !per_cpu(cpu_idle_state, cpu))
174 cpu_clear(cpu, map); 174 cpu_clear(cpu, map);
175 } 175 }
176 cpus_and(map, map, cpu_online_map); 176 cpus_and(map, map, cpu_online_map);
177 } while (!cpus_empty(map)); 177 } while (!cpus_empty(map));
178 } 178 }
179 EXPORT_SYMBOL_GPL(cpu_idle_wait); 179 EXPORT_SYMBOL_GPL(cpu_idle_wait);
180 180
181 #ifdef CONFIG_HOTPLUG_CPU 181 #ifdef CONFIG_HOTPLUG_CPU
182 DECLARE_PER_CPU(int, cpu_state); 182 DECLARE_PER_CPU(int, cpu_state);
183 183
184 #include <asm/nmi.h> 184 #include <asm/nmi.h>
185 /* We halt the CPU with physical CPU hotplug */ 185 /* We halt the CPU with physical CPU hotplug */
186 static inline void play_dead(void) 186 static inline void play_dead(void)
187 { 187 {
188 idle_task_exit(); 188 idle_task_exit();
189 wbinvd(); 189 wbinvd();
190 mb(); 190 mb();
191 /* Ack it */ 191 /* Ack it */
192 __get_cpu_var(cpu_state) = CPU_DEAD; 192 __get_cpu_var(cpu_state) = CPU_DEAD;
193 193
194 local_irq_disable(); 194 local_irq_disable();
195 while (1) 195 while (1)
196 halt(); 196 halt();
197 } 197 }
198 #else 198 #else
199 static inline void play_dead(void) 199 static inline void play_dead(void)
200 { 200 {
201 BUG(); 201 BUG();
202 } 202 }
203 #endif /* CONFIG_HOTPLUG_CPU */ 203 #endif /* CONFIG_HOTPLUG_CPU */
204 204
205 /* 205 /*
206 * The idle thread. There's no useful work to be 206 * The idle thread. There's no useful work to be
207 * done, so just try to conserve power and have a 207 * done, so just try to conserve power and have a
208 * low exit latency (ie sit in a loop waiting for 208 * low exit latency (ie sit in a loop waiting for
209 * somebody to say that they'd like to reschedule) 209 * somebody to say that they'd like to reschedule)
210 */ 210 */
211 void cpu_idle (void) 211 void cpu_idle (void)
212 { 212 {
213 set_thread_flag(TIF_POLLING_NRFLAG); 213 set_thread_flag(TIF_POLLING_NRFLAG);
214 214
215 /* endless idle loop with no priority at all */ 215 /* endless idle loop with no priority at all */
216 while (1) { 216 while (1) {
217 while (!need_resched()) { 217 while (!need_resched()) {
218 void (*idle)(void); 218 void (*idle)(void);
219 219
220 if (__get_cpu_var(cpu_idle_state)) 220 if (__get_cpu_var(cpu_idle_state))
221 __get_cpu_var(cpu_idle_state) = 0; 221 __get_cpu_var(cpu_idle_state) = 0;
222 222
223 rmb(); 223 rmb();
224 idle = pm_idle; 224 idle = pm_idle;
225 if (!idle) 225 if (!idle)
226 idle = default_idle; 226 idle = default_idle;
227 if (cpu_is_offline(smp_processor_id())) 227 if (cpu_is_offline(smp_processor_id()))
228 play_dead(); 228 play_dead();
229 enter_idle(); 229 enter_idle();
230 idle(); 230 idle();
231 __exit_idle(); 231 __exit_idle();
232 } 232 }
233 233
234 preempt_enable_no_resched(); 234 preempt_enable_no_resched();
235 schedule(); 235 schedule();
236 preempt_disable(); 236 preempt_disable();
237 } 237 }
238 } 238 }
239 239
240 /* 240 /*
241 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, 241 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
242 * which can obviate IPI to trigger checking of need_resched. 242 * which can obviate IPI to trigger checking of need_resched.
243 * We execute MONITOR against need_resched and enter optimized wait state 243 * We execute MONITOR against need_resched and enter optimized wait state
244 * through MWAIT. Whenever someone changes need_resched, we would be woken 244 * through MWAIT. Whenever someone changes need_resched, we would be woken
245 * up from MWAIT (without an IPI). 245 * up from MWAIT (without an IPI).
246 */ 246 */
247 static void mwait_idle(void) 247 static void mwait_idle(void)
248 { 248 {
249 local_irq_enable(); 249 local_irq_enable();
250 250
251 while (!need_resched()) { 251 while (!need_resched()) {
252 __monitor((void *)&current_thread_info()->flags, 0, 0); 252 __monitor((void *)&current_thread_info()->flags, 0, 0);
253 smp_mb(); 253 smp_mb();
254 if (need_resched()) 254 if (need_resched())
255 break; 255 break;
256 __mwait(0, 0); 256 __mwait(0, 0);
257 } 257 }
258 } 258 }
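
The ordering inside mwait_idle() is the interesting part: MONITOR is armed on the need_resched flag word first, need_resched() is re-checked, and only then does the CPU drop into MWAIT. A wakeup that arrived before the monitor was armed is caught by the re-check; one that arrives afterwards is a write to the monitored line and breaks MWAIT, so no window is left in between. Annotated restatement of the loop body above (comments added here, code unchanged):

    __monitor((void *)&current_thread_info()->flags, 0, 0); /* arm monitor on the flag word   */
    smp_mb();                                               /* order the arm vs. the re-check */
    if (need_resched())                                     /* set BEFORE arming? caught here */
        break;
    __mwait(0, 0);                                          /* set AFTER arming? the write wakes us */
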
259 259
260 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) 260 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
261 { 261 {
262 static int printed; 262 static int printed;
263 if (cpu_has(c, X86_FEATURE_MWAIT)) { 263 if (cpu_has(c, X86_FEATURE_MWAIT)) {
264 /* 264 /*
265 * Skip, if setup has overridden idle. 265 * Skip, if setup has overridden idle.
266 * One CPU supports mwait => All CPUs supports mwait 266 * One CPU supports mwait => All CPUs supports mwait
267 */ 267 */
268 if (!pm_idle) { 268 if (!pm_idle) {
269 if (!printed) { 269 if (!printed) {
270 printk("using mwait in idle threads.\n"); 270 printk("using mwait in idle threads.\n");
271 printed = 1; 271 printed = 1;
272 } 272 }
273 pm_idle = mwait_idle; 273 pm_idle = mwait_idle;
274 } 274 }
275 } 275 }
276 } 276 }
277 277
278 static int __init idle_setup (char *str) 278 static int __init idle_setup (char *str)
279 { 279 {
280 if (!strncmp(str, "poll", 4)) { 280 if (!strncmp(str, "poll", 4)) {
281 printk("using polling idle threads.\n"); 281 printk("using polling idle threads.\n");
282 pm_idle = poll_idle; 282 pm_idle = poll_idle;
283 } 283 }
284 284
285 boot_option_idle_override = 1; 285 boot_option_idle_override = 1;
286 return 1; 286 return 1;
287 } 287 }
288 288
289 __setup("idle=", idle_setup); 289 __setup("idle=", idle_setup);
290 290
291 /* Prints also some state that isn't saved in the pt_regs */ 291 /* Prints also some state that isn't saved in the pt_regs */
292 void __show_regs(struct pt_regs * regs) 292 void __show_regs(struct pt_regs * regs)
293 { 293 {
294 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; 294 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
295 unsigned int fsindex,gsindex; 295 unsigned int fsindex,gsindex;
296 unsigned int ds,cs,es; 296 unsigned int ds,cs,es;
297 297
298 printk("\n"); 298 printk("\n");
299 print_modules(); 299 print_modules();
300 printk("Pid: %d, comm: %.20s %s %s %.*s\n", 300 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
301 current->pid, current->comm, print_tainted(), 301 current->pid, current->comm, print_tainted(),
302 system_utsname.release, 302 system_utsname.release,
303 (int)strcspn(system_utsname.version, " "), 303 (int)strcspn(system_utsname.version, " "),
304 system_utsname.version); 304 system_utsname.version);
305 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); 305 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
306 printk_address(regs->rip); 306 printk_address(regs->rip);
307 printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, 307 printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
308 regs->eflags); 308 regs->eflags);
309 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", 309 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
310 regs->rax, regs->rbx, regs->rcx); 310 regs->rax, regs->rbx, regs->rcx);
311 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", 311 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
312 regs->rdx, regs->rsi, regs->rdi); 312 regs->rdx, regs->rsi, regs->rdi);
313 printk("RBP: %016lx R08: %016lx R09: %016lx\n", 313 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
314 regs->rbp, regs->r8, regs->r9); 314 regs->rbp, regs->r8, regs->r9);
315 printk("R10: %016lx R11: %016lx R12: %016lx\n", 315 printk("R10: %016lx R11: %016lx R12: %016lx\n",
316 regs->r10, regs->r11, regs->r12); 316 regs->r10, regs->r11, regs->r12);
317 printk("R13: %016lx R14: %016lx R15: %016lx\n", 317 printk("R13: %016lx R14: %016lx R15: %016lx\n",
318 regs->r13, regs->r14, regs->r15); 318 regs->r13, regs->r14, regs->r15);
319 319
320 asm("movl %%ds,%0" : "=r" (ds)); 320 asm("movl %%ds,%0" : "=r" (ds));
321 asm("movl %%cs,%0" : "=r" (cs)); 321 asm("movl %%cs,%0" : "=r" (cs));
322 asm("movl %%es,%0" : "=r" (es)); 322 asm("movl %%es,%0" : "=r" (es));
323 asm("movl %%fs,%0" : "=r" (fsindex)); 323 asm("movl %%fs,%0" : "=r" (fsindex));
324 asm("movl %%gs,%0" : "=r" (gsindex)); 324 asm("movl %%gs,%0" : "=r" (gsindex));
325 325
326 rdmsrl(MSR_FS_BASE, fs); 326 rdmsrl(MSR_FS_BASE, fs);
327 rdmsrl(MSR_GS_BASE, gs); 327 rdmsrl(MSR_GS_BASE, gs);
328 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 328 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
329 329
330 asm("movq %%cr0, %0": "=r" (cr0)); 330 asm("movq %%cr0, %0": "=r" (cr0));
331 asm("movq %%cr2, %0": "=r" (cr2)); 331 asm("movq %%cr2, %0": "=r" (cr2));
332 asm("movq %%cr3, %0": "=r" (cr3)); 332 asm("movq %%cr3, %0": "=r" (cr3));
333 asm("movq %%cr4, %0": "=r" (cr4)); 333 asm("movq %%cr4, %0": "=r" (cr4));
334 334
335 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 335 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
336 fs,fsindex,gs,gsindex,shadowgs); 336 fs,fsindex,gs,gsindex,shadowgs);
337 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); 337 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
338 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); 338 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
339 } 339 }
340 340
341 void show_regs(struct pt_regs *regs) 341 void show_regs(struct pt_regs *regs)
342 { 342 {
343 printk("CPU %d:", smp_processor_id()); 343 printk("CPU %d:", smp_processor_id());
344 __show_regs(regs); 344 __show_regs(regs);
345 show_trace(&regs->rsp); 345 show_trace(&regs->rsp);
346 } 346 }
347 347
348 /* 348 /*
349 * Free current thread data structures etc.. 349 * Free current thread data structures etc..
350 */ 350 */
351 void exit_thread(void) 351 void exit_thread(void)
352 { 352 {
353 struct task_struct *me = current; 353 struct task_struct *me = current;
354 struct thread_struct *t = &me->thread; 354 struct thread_struct *t = &me->thread;
355
356 /*
357 * Remove function-return probe instances associated with this task
358 * and put them back on the free list. Do not insert an exit probe for
359 * this function, it will be disabled by kprobe_flush_task if you do.
360 */
361 kprobe_flush_task(me);
362 355
363 if (me->thread.io_bitmap_ptr) { 356 if (me->thread.io_bitmap_ptr) {
364 struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); 357 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
365 358
366 kfree(t->io_bitmap_ptr); 359 kfree(t->io_bitmap_ptr);
367 t->io_bitmap_ptr = NULL; 360 t->io_bitmap_ptr = NULL;
368 /* 361 /*
369 * Careful, clear this in the TSS too: 362 * Careful, clear this in the TSS too:
370 */ 363 */
371 memset(tss->io_bitmap, 0xff, t->io_bitmap_max); 364 memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
372 t->io_bitmap_max = 0; 365 t->io_bitmap_max = 0;
373 put_cpu(); 366 put_cpu();
374 } 367 }
375 } 368 }
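
The x86_64 exit_thread() loses the same kprobe_flush_task() call, and its explanatory comment, as the powerpc hunk above, for the same reason. What remains is the ioperm() cleanup, and the memset of the stale TSS range to 0xff is deliberate: in the x86 I/O permission bitmap a set bit means the port is NOT accessible, so filling with 0xff revokes everything rather than leaking the dead thread's port grants. A tiny user-space illustration of that bit sense (the real bitmap of course lives in the TSS; sizes here are arbitrary):

    #include <stdio.h>
    #include <string.h>

    /* 1 bit = access denied, 0 bit = access allowed, as in the hardware
     * I/O permission bitmap. */
    static int port_allowed(const unsigned char *bitmap, unsigned int port)
    {
        return !(bitmap[port / 8] & (1u << (port % 8)));
    }

    int main(void)
    {
        unsigned char bitmap[1024];

        memset(bitmap, 0xff, sizeof(bitmap));   /* same pattern as above: deny all */
        printf("port 0x3f8 allowed: %d\n", port_allowed(bitmap, 0x3f8));  /* 0 */
        return 0;
    }
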
376 369
377 void flush_thread(void) 370 void flush_thread(void)
378 { 371 {
379 struct task_struct *tsk = current; 372 struct task_struct *tsk = current;
380 struct thread_info *t = current_thread_info(); 373 struct thread_info *t = current_thread_info();
381 374
382 if (t->flags & _TIF_ABI_PENDING) 375 if (t->flags & _TIF_ABI_PENDING)
383 t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32); 376 t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
384 377
385 tsk->thread.debugreg0 = 0; 378 tsk->thread.debugreg0 = 0;
386 tsk->thread.debugreg1 = 0; 379 tsk->thread.debugreg1 = 0;
387 tsk->thread.debugreg2 = 0; 380 tsk->thread.debugreg2 = 0;
388 tsk->thread.debugreg3 = 0; 381 tsk->thread.debugreg3 = 0;
389 tsk->thread.debugreg6 = 0; 382 tsk->thread.debugreg6 = 0;
390 tsk->thread.debugreg7 = 0; 383 tsk->thread.debugreg7 = 0;
391 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); 384 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
392 /* 385 /*
393 * Forget coprocessor state.. 386 * Forget coprocessor state..
394 */ 387 */
395 clear_fpu(tsk); 388 clear_fpu(tsk);
396 clear_used_math(); 389 clear_used_math();
397 } 390 }
398 391
399 void release_thread(struct task_struct *dead_task) 392 void release_thread(struct task_struct *dead_task)
400 { 393 {
401 if (dead_task->mm) { 394 if (dead_task->mm) {
402 if (dead_task->mm->context.size) { 395 if (dead_task->mm->context.size) {
403 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", 396 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
404 dead_task->comm, 397 dead_task->comm,
405 dead_task->mm->context.ldt, 398 dead_task->mm->context.ldt,
406 dead_task->mm->context.size); 399 dead_task->mm->context.size);
407 BUG(); 400 BUG();
408 } 401 }
409 } 402 }
410 } 403 }
411 404
412 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) 405 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
413 { 406 {
414 struct user_desc ud = { 407 struct user_desc ud = {
415 .base_addr = addr, 408 .base_addr = addr,
416 .limit = 0xfffff, 409 .limit = 0xfffff,
417 .seg_32bit = 1, 410 .seg_32bit = 1,
418 .limit_in_pages = 1, 411 .limit_in_pages = 1,
419 .useable = 1, 412 .useable = 1,
420 }; 413 };
421 struct n_desc_struct *desc = (void *)t->thread.tls_array; 414 struct n_desc_struct *desc = (void *)t->thread.tls_array;
422 desc += tls; 415 desc += tls;
423 desc->a = LDT_entry_a(&ud); 416 desc->a = LDT_entry_a(&ud);
424 desc->b = LDT_entry_b(&ud); 417 desc->b = LDT_entry_b(&ud);
425 } 418 }
426 419
427 static inline u32 read_32bit_tls(struct task_struct *t, int tls) 420 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
428 { 421 {
429 struct desc_struct *desc = (void *)t->thread.tls_array; 422 struct desc_struct *desc = (void *)t->thread.tls_array;
430 desc += tls; 423 desc += tls;
431 return desc->base0 | 424 return desc->base0 |
432 (((u32)desc->base1) << 16) | 425 (((u32)desc->base1) << 16) |
433 (((u32)desc->base2) << 24); 426 (((u32)desc->base2) << 24);
434 } 427 }
435 428
436 /* 429 /*
437 * This gets called before we allocate a new thread and copy 430 * This gets called before we allocate a new thread and copy
438 * the current task into it. 431 * the current task into it.
439 */ 432 */
440 void prepare_to_copy(struct task_struct *tsk) 433 void prepare_to_copy(struct task_struct *tsk)
441 { 434 {
442 unlazy_fpu(tsk); 435 unlazy_fpu(tsk);
443 } 436 }
444 437
445 int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, 438 int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
446 unsigned long unused, 439 unsigned long unused,
447 struct task_struct * p, struct pt_regs * regs) 440 struct task_struct * p, struct pt_regs * regs)
448 { 441 {
449 int err; 442 int err;
450 struct pt_regs * childregs; 443 struct pt_regs * childregs;
451 struct task_struct *me = current; 444 struct task_struct *me = current;
452 445
453 childregs = ((struct pt_regs *) 446 childregs = ((struct pt_regs *)
454 (THREAD_SIZE + task_stack_page(p))) - 1; 447 (THREAD_SIZE + task_stack_page(p))) - 1;
455 *childregs = *regs; 448 *childregs = *regs;
456 449
457 childregs->rax = 0; 450 childregs->rax = 0;
458 childregs->rsp = rsp; 451 childregs->rsp = rsp;
459 if (rsp == ~0UL) 452 if (rsp == ~0UL)
460 childregs->rsp = (unsigned long)childregs; 453 childregs->rsp = (unsigned long)childregs;
461 454
462 p->thread.rsp = (unsigned long) childregs; 455 p->thread.rsp = (unsigned long) childregs;
463 p->thread.rsp0 = (unsigned long) (childregs+1); 456 p->thread.rsp0 = (unsigned long) (childregs+1);
464 p->thread.userrsp = me->thread.userrsp; 457 p->thread.userrsp = me->thread.userrsp;
465 458
466 set_tsk_thread_flag(p, TIF_FORK); 459 set_tsk_thread_flag(p, TIF_FORK);
467 460
468 p->thread.fs = me->thread.fs; 461 p->thread.fs = me->thread.fs;
469 p->thread.gs = me->thread.gs; 462 p->thread.gs = me->thread.gs;
470 463
471 asm("mov %%gs,%0" : "=m" (p->thread.gsindex)); 464 asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
472 asm("mov %%fs,%0" : "=m" (p->thread.fsindex)); 465 asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
473 asm("mov %%es,%0" : "=m" (p->thread.es)); 466 asm("mov %%es,%0" : "=m" (p->thread.es));
474 asm("mov %%ds,%0" : "=m" (p->thread.ds)); 467 asm("mov %%ds,%0" : "=m" (p->thread.ds));
475 468
476 if (unlikely(me->thread.io_bitmap_ptr != NULL)) { 469 if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
477 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); 470 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
478 if (!p->thread.io_bitmap_ptr) { 471 if (!p->thread.io_bitmap_ptr) {
479 p->thread.io_bitmap_max = 0; 472 p->thread.io_bitmap_max = 0;
480 return -ENOMEM; 473 return -ENOMEM;
481 } 474 }
482 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, 475 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
483 IO_BITMAP_BYTES); 476 IO_BITMAP_BYTES);
484 } 477 }
485 478
486 /* 479 /*
487 * Set a new TLS for the child thread? 480 * Set a new TLS for the child thread?
488 */ 481 */
489 if (clone_flags & CLONE_SETTLS) { 482 if (clone_flags & CLONE_SETTLS) {
490 #ifdef CONFIG_IA32_EMULATION 483 #ifdef CONFIG_IA32_EMULATION
491 if (test_thread_flag(TIF_IA32)) 484 if (test_thread_flag(TIF_IA32))
492 err = ia32_child_tls(p, childregs); 485 err = ia32_child_tls(p, childregs);
493 else 486 else
494 #endif 487 #endif
495 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); 488 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
496 if (err) 489 if (err)
497 goto out; 490 goto out;
498 } 491 }
499 err = 0; 492 err = 0;
500 out: 493 out:
501 if (err && p->thread.io_bitmap_ptr) { 494 if (err && p->thread.io_bitmap_ptr) {
502 kfree(p->thread.io_bitmap_ptr); 495 kfree(p->thread.io_bitmap_ptr);
503 p->thread.io_bitmap_max = 0; 496 p->thread.io_bitmap_max = 0;
504 } 497 }
505 return err; 498 return err;
506 } 499 }
507 500
508 /* 501 /*
509 * This special macro can be used to load a debugging register 502 * This special macro can be used to load a debugging register
510 */ 503 */
511 #define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r) 504 #define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
512 505
513 /* 506 /*
514 * switch_to(x,y) should switch tasks from x to y. 507 * switch_to(x,y) should switch tasks from x to y.
515 * 508 *
516 * This could still be optimized: 509 * This could still be optimized:
517 * - fold all the options into a flag word and test it with a single test. 510 * - fold all the options into a flag word and test it with a single test.
518 * - could test fs/gs bitsliced 511 * - could test fs/gs bitsliced
519 * 512 *
520 * Kprobes not supported here. Set the probe on schedule instead. 513 * Kprobes not supported here. Set the probe on schedule instead.
521 */ 514 */
522 __kprobes struct task_struct * 515 __kprobes struct task_struct *
523 __switch_to(struct task_struct *prev_p, struct task_struct *next_p) 516 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
524 { 517 {
525 struct thread_struct *prev = &prev_p->thread, 518 struct thread_struct *prev = &prev_p->thread,
526 *next = &next_p->thread; 519 *next = &next_p->thread;
527 int cpu = smp_processor_id(); 520 int cpu = smp_processor_id();
528 struct tss_struct *tss = &per_cpu(init_tss, cpu); 521 struct tss_struct *tss = &per_cpu(init_tss, cpu);
529 522
530 /* 523 /*
531 * Reload esp0, LDT and the page table pointer: 524 * Reload esp0, LDT and the page table pointer:
532 */ 525 */
533 tss->rsp0 = next->rsp0; 526 tss->rsp0 = next->rsp0;
534 527
535 /* 528 /*
536 * Switch DS and ES. 529 * Switch DS and ES.
537 * This won't pick up thread selector changes, but I guess that is ok. 530 * This won't pick up thread selector changes, but I guess that is ok.
538 */ 531 */
539 asm volatile("mov %%es,%0" : "=m" (prev->es)); 532 asm volatile("mov %%es,%0" : "=m" (prev->es));
540 if (unlikely(next->es | prev->es)) 533 if (unlikely(next->es | prev->es))
541 loadsegment(es, next->es); 534 loadsegment(es, next->es);
542 535
543 asm volatile ("mov %%ds,%0" : "=m" (prev->ds)); 536 asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
544 if (unlikely(next->ds | prev->ds)) 537 if (unlikely(next->ds | prev->ds))
545 loadsegment(ds, next->ds); 538 loadsegment(ds, next->ds);
546 539
547 load_TLS(next, cpu); 540 load_TLS(next, cpu);
548 541
549 /* 542 /*
550 * Switch FS and GS. 543 * Switch FS and GS.
551 */ 544 */
552 { 545 {
553 unsigned fsindex; 546 unsigned fsindex;
554 asm volatile("movl %%fs,%0" : "=r" (fsindex)); 547 asm volatile("movl %%fs,%0" : "=r" (fsindex));
555 /* segment register != 0 always requires a reload. 548 /* segment register != 0 always requires a reload.
556 also reload when it has changed. 549 also reload when it has changed.
557 when prev process used 64bit base always reload 550 when prev process used 64bit base always reload
558 to avoid an information leak. */ 551 to avoid an information leak. */
559 if (unlikely(fsindex | next->fsindex | prev->fs)) { 552 if (unlikely(fsindex | next->fsindex | prev->fs)) {
560 loadsegment(fs, next->fsindex); 553 loadsegment(fs, next->fsindex);
561 /* check if the user used a selector != 0 554 /* check if the user used a selector != 0
562 * if yes clear 64bit base, since overloaded base 555 * if yes clear 64bit base, since overloaded base
563 * is always mapped to the Null selector 556 * is always mapped to the Null selector
564 */ 557 */
565 if (fsindex) 558 if (fsindex)
566 prev->fs = 0; 559 prev->fs = 0;
567 } 560 }
568 /* when next process has a 64bit base use it */ 561 /* when next process has a 64bit base use it */
569 if (next->fs) 562 if (next->fs)
570 wrmsrl(MSR_FS_BASE, next->fs); 563 wrmsrl(MSR_FS_BASE, next->fs);
571 prev->fsindex = fsindex; 564 prev->fsindex = fsindex;
572 } 565 }
573 { 566 {
574 unsigned gsindex; 567 unsigned gsindex;
575 asm volatile("movl %%gs,%0" : "=r" (gsindex)); 568 asm volatile("movl %%gs,%0" : "=r" (gsindex));
576 if (unlikely(gsindex | next->gsindex | prev->gs)) { 569 if (unlikely(gsindex | next->gsindex | prev->gs)) {
577 load_gs_index(next->gsindex); 570 load_gs_index(next->gsindex);
578 if (gsindex) 571 if (gsindex)
579 prev->gs = 0; 572 prev->gs = 0;
580 } 573 }
581 if (next->gs) 574 if (next->gs)
582 wrmsrl(MSR_KERNEL_GS_BASE, next->gs); 575 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
583 prev->gsindex = gsindex; 576 prev->gsindex = gsindex;
584 } 577 }
585 578
586 /* 579 /*
587 * Switch the PDA and FPU contexts. 580 * Switch the PDA and FPU contexts.
588 */ 581 */
589 prev->userrsp = read_pda(oldrsp); 582 prev->userrsp = read_pda(oldrsp);
590 write_pda(oldrsp, next->userrsp); 583 write_pda(oldrsp, next->userrsp);
591 write_pda(pcurrent, next_p); 584 write_pda(pcurrent, next_p);
592 /* This must be here to ensure both math_state_restore() and 585 /* This must be here to ensure both math_state_restore() and
593 kernel_fpu_begin() work consistently. */ 586 kernel_fpu_begin() work consistently. */
594 unlazy_fpu(prev_p); 587 unlazy_fpu(prev_p);
595 write_pda(kernelstack, 588 write_pda(kernelstack,
596 task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); 589 task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
597 590
598 /* 591 /*
599 * Now maybe reload the debug registers 592 * Now maybe reload the debug registers
600 */ 593 */
601 if (unlikely(next->debugreg7)) { 594 if (unlikely(next->debugreg7)) {
602 loaddebug(next, 0); 595 loaddebug(next, 0);
603 loaddebug(next, 1); 596 loaddebug(next, 1);
604 loaddebug(next, 2); 597 loaddebug(next, 2);
605 loaddebug(next, 3); 598 loaddebug(next, 3);
606 /* no 4 and 5 */ 599 /* no 4 and 5 */
607 loaddebug(next, 6); 600 loaddebug(next, 6);
608 loaddebug(next, 7); 601 loaddebug(next, 7);
609 } 602 }
610 603
611 604
612 /* 605 /*
613 * Handle the IO bitmap 606 * Handle the IO bitmap
614 */ 607 */
615 if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { 608 if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
616 if (next->io_bitmap_ptr) 609 if (next->io_bitmap_ptr)
617 /* 610 /*
618 * Copy the relevant range of the IO bitmap. 611 * Copy the relevant range of the IO bitmap.
619 * Normally this is 128 bytes or less: 612 * Normally this is 128 bytes or less:
620 */ 613 */
621 memcpy(tss->io_bitmap, next->io_bitmap_ptr, 614 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
622 max(prev->io_bitmap_max, next->io_bitmap_max)); 615 max(prev->io_bitmap_max, next->io_bitmap_max));
623 else { 616 else {
624 /* 617 /*
625 * Clear any possible leftover bits: 618 * Clear any possible leftover bits:
626 */ 619 */
627 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 620 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
628 } 621 }
629 } 622 }
630 623
631 return prev_p; 624 return prev_p;
632 } 625 }
633 626
634 /* 627 /*
635 * sys_execve() executes a new program. 628 * sys_execve() executes a new program.
636 */ 629 */
637 asmlinkage 630 asmlinkage
638 long sys_execve(char __user *name, char __user * __user *argv, 631 long sys_execve(char __user *name, char __user * __user *argv,
639 char __user * __user *envp, struct pt_regs regs) 632 char __user * __user *envp, struct pt_regs regs)
640 { 633 {
641 long error; 634 long error;
642 char * filename; 635 char * filename;
643 636
644 filename = getname(name); 637 filename = getname(name);
645 error = PTR_ERR(filename); 638 error = PTR_ERR(filename);
646 if (IS_ERR(filename)) 639 if (IS_ERR(filename))
647 return error; 640 return error;
648 error = do_execve(filename, argv, envp, &regs); 641 error = do_execve(filename, argv, envp, &regs);
649 if (error == 0) { 642 if (error == 0) {
650 task_lock(current); 643 task_lock(current);
651 current->ptrace &= ~PT_DTRACE; 644 current->ptrace &= ~PT_DTRACE;
652 task_unlock(current); 645 task_unlock(current);
653 } 646 }
654 putname(filename); 647 putname(filename);
655 return error; 648 return error;
656 } 649 }
657 650
658 void set_personality_64bit(void) 651 void set_personality_64bit(void)
659 { 652 {
660 /* inherit personality from parent */ 653 /* inherit personality from parent */
661 654
662 /* Make sure to be in 64bit mode */ 655 /* Make sure to be in 64bit mode */
663 clear_thread_flag(TIF_IA32); 656 clear_thread_flag(TIF_IA32);
664 657
665 /* TBD: overwrites user setup. Should have two bits. 658 /* TBD: overwrites user setup. Should have two bits.
666 But 64bit processes have always behaved this way, 659 But 64bit processes have always behaved this way,
667 so it's not too bad. The main problem is just that 660 so it's not too bad. The main problem is just that
668 32bit children are affected again. */ 661 32bit children are affected again. */
669 current->personality &= ~READ_IMPLIES_EXEC; 662 current->personality &= ~READ_IMPLIES_EXEC;
670 } 663 }
671 664
672 asmlinkage long sys_fork(struct pt_regs *regs) 665 asmlinkage long sys_fork(struct pt_regs *regs)
673 { 666 {
674 return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL); 667 return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
675 } 668 }
676 669
677 asmlinkage long 670 asmlinkage long
678 sys_clone(unsigned long clone_flags, unsigned long newsp, 671 sys_clone(unsigned long clone_flags, unsigned long newsp,
679 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) 672 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
680 { 673 {
681 if (!newsp) 674 if (!newsp)
682 newsp = regs->rsp; 675 newsp = regs->rsp;
683 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); 676 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
684 } 677 }
685 678
686 /* 679 /*
687 * This is trivial, and on the face of it looks like it 680 * This is trivial, and on the face of it looks like it
688 * could equally well be done in user mode. 681 * could equally well be done in user mode.
689 * 682 *
690 * Not so, for quite unobvious reasons - register pressure. 683 * Not so, for quite unobvious reasons - register pressure.
691 * In user mode vfork() cannot have a stack frame, and if 684 * In user mode vfork() cannot have a stack frame, and if
692 * done by calling the "clone()" system call directly, you 685 * done by calling the "clone()" system call directly, you
693 * do not have enough call-clobbered registers to hold all 686 * do not have enough call-clobbered registers to hold all
694 * the information you need. 687 * the information you need.
695 */ 688 */
696 asmlinkage long sys_vfork(struct pt_regs *regs) 689 asmlinkage long sys_vfork(struct pt_regs *regs)
697 { 690 {
698 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0, 691 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
699 NULL, NULL); 692 NULL, NULL);
700 } 693 }
701 694
702 unsigned long get_wchan(struct task_struct *p) 695 unsigned long get_wchan(struct task_struct *p)
703 { 696 {
704 unsigned long stack; 697 unsigned long stack;
705 u64 fp,rip; 698 u64 fp,rip;
706 int count = 0; 699 int count = 0;
707 700
708 if (!p || p == current || p->state==TASK_RUNNING) 701 if (!p || p == current || p->state==TASK_RUNNING)
709 return 0; 702 return 0;
710 stack = (unsigned long)task_stack_page(p); 703 stack = (unsigned long)task_stack_page(p);
711 if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE) 704 if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
712 return 0; 705 return 0;
713 fp = *(u64 *)(p->thread.rsp); 706 fp = *(u64 *)(p->thread.rsp);
714 do { 707 do {
715 if (fp < (unsigned long)stack || 708 if (fp < (unsigned long)stack ||
716 fp > (unsigned long)stack+THREAD_SIZE) 709 fp > (unsigned long)stack+THREAD_SIZE)
717 return 0; 710 return 0;
718 rip = *(u64 *)(fp+8); 711 rip = *(u64 *)(fp+8);
719 if (!in_sched_functions(rip)) 712 if (!in_sched_functions(rip))
720 return rip; 713 return rip;
721 fp = *(u64 *)fp; 714 fp = *(u64 *)fp;
722 } while (count++ < 16); 715 } while (count++ < 16);
723 return 0; 716 return 0;
724 } 717 }
725 718
726 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) 719 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
727 { 720 {
728 int ret = 0; 721 int ret = 0;
729 int doit = task == current; 722 int doit = task == current;
730 int cpu; 723 int cpu;
731 724
732 switch (code) { 725 switch (code) {
733 case ARCH_SET_GS: 726 case ARCH_SET_GS:
734 if (addr >= TASK_SIZE_OF(task)) 727 if (addr >= TASK_SIZE_OF(task))
735 return -EPERM; 728 return -EPERM;
736 cpu = get_cpu(); 729 cpu = get_cpu();
737 /* handle small bases via the GDT because that's faster to 730 /* handle small bases via the GDT because that's faster to
738 switch. */ 731 switch. */
739 if (addr <= 0xffffffff) { 732 if (addr <= 0xffffffff) {
740 set_32bit_tls(task, GS_TLS, addr); 733 set_32bit_tls(task, GS_TLS, addr);
741 if (doit) { 734 if (doit) {
742 load_TLS(&task->thread, cpu); 735 load_TLS(&task->thread, cpu);
743 load_gs_index(GS_TLS_SEL); 736 load_gs_index(GS_TLS_SEL);
744 } 737 }
745 task->thread.gsindex = GS_TLS_SEL; 738 task->thread.gsindex = GS_TLS_SEL;
746 task->thread.gs = 0; 739 task->thread.gs = 0;
747 } else { 740 } else {
748 task->thread.gsindex = 0; 741 task->thread.gsindex = 0;
749 task->thread.gs = addr; 742 task->thread.gs = addr;
750 if (doit) { 743 if (doit) {
751 load_gs_index(0); 744 load_gs_index(0);
752 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); 745 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
753 } 746 }
754 } 747 }
755 put_cpu(); 748 put_cpu();
756 break; 749 break;
757 case ARCH_SET_FS: 750 case ARCH_SET_FS:
758 /* Not strictly needed for fs, but do it for symmetry 751 /* Not strictly needed for fs, but do it for symmetry
759 with gs */ 752 with gs */
760 if (addr >= TASK_SIZE_OF(task)) 753 if (addr >= TASK_SIZE_OF(task))
761 return -EPERM; 754 return -EPERM;
762 cpu = get_cpu(); 755 cpu = get_cpu();
763 /* handle small bases via the GDT because that's faster to 756 /* handle small bases via the GDT because that's faster to
764 switch. */ 757 switch. */
765 if (addr <= 0xffffffff) { 758 if (addr <= 0xffffffff) {
766 set_32bit_tls(task, FS_TLS, addr); 759 set_32bit_tls(task, FS_TLS, addr);
767 if (doit) { 760 if (doit) {
768 load_TLS(&task->thread, cpu); 761 load_TLS(&task->thread, cpu);
769 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL)); 762 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
770 } 763 }
771 task->thread.fsindex = FS_TLS_SEL; 764 task->thread.fsindex = FS_TLS_SEL;
772 task->thread.fs = 0; 765 task->thread.fs = 0;
773 } else { 766 } else {
774 task->thread.fsindex = 0; 767 task->thread.fsindex = 0;
775 task->thread.fs = addr; 768 task->thread.fs = addr;
776 if (doit) { 769 if (doit) {
777 /* set the selector to 0 to not confuse 770 /* set the selector to 0 to not confuse
778 __switch_to */ 771 __switch_to */
779 asm volatile("movl %0,%%fs" :: "r" (0)); 772 asm volatile("movl %0,%%fs" :: "r" (0));
780 ret = checking_wrmsrl(MSR_FS_BASE, addr); 773 ret = checking_wrmsrl(MSR_FS_BASE, addr);
781 } 774 }
782 } 775 }
783 put_cpu(); 776 put_cpu();
784 break; 777 break;
785 case ARCH_GET_FS: { 778 case ARCH_GET_FS: {
786 unsigned long base; 779 unsigned long base;
787 if (task->thread.fsindex == FS_TLS_SEL) 780 if (task->thread.fsindex == FS_TLS_SEL)
788 base = read_32bit_tls(task, FS_TLS); 781 base = read_32bit_tls(task, FS_TLS);
789 else if (doit) 782 else if (doit)
790 rdmsrl(MSR_FS_BASE, base); 783 rdmsrl(MSR_FS_BASE, base);
791 else 784 else
792 base = task->thread.fs; 785 base = task->thread.fs;
793 ret = put_user(base, (unsigned long __user *)addr); 786 ret = put_user(base, (unsigned long __user *)addr);
794 break; 787 break;
795 } 788 }
796 case ARCH_GET_GS: { 789 case ARCH_GET_GS: {
797 unsigned long base; 790 unsigned long base;
798 if (task->thread.gsindex == GS_TLS_SEL) 791 if (task->thread.gsindex == GS_TLS_SEL)
799 base = read_32bit_tls(task, GS_TLS); 792 base = read_32bit_tls(task, GS_TLS);
800 else if (doit) 793 else if (doit)
801 rdmsrl(MSR_KERNEL_GS_BASE, base); 794 rdmsrl(MSR_KERNEL_GS_BASE, base);
802 else 795 else
803 base = task->thread.gs; 796 base = task->thread.gs;
804 ret = put_user(base, (unsigned long __user *)addr); 797 ret = put_user(base, (unsigned long __user *)addr);
805 break; 798 break;
806 } 799 }
807 800
808 default: 801 default:
809 ret = -EINVAL; 802 ret = -EINVAL;
810 break; 803 break;
811 } 804 }
812 805
813 return ret; 806 return ret;
814 } 807 }
815 808
816 long sys_arch_prctl(int code, unsigned long addr) 809 long sys_arch_prctl(int code, unsigned long addr)
817 { 810 {
818 return do_arch_prctl(current, code, addr); 811 return do_arch_prctl(current, code, addr);
819 } 812 }
820 813
821 /* 814 /*
822 * Capture the user space registers if the task is not running (in user space) 815 * Capture the user space registers if the task is not running (in user space)
823 */ 816 */
824 int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) 817 int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
825 { 818 {
826 struct pt_regs *pp, ptregs; 819 struct pt_regs *pp, ptregs;
827 820
828 pp = task_pt_regs(tsk); 821 pp = task_pt_regs(tsk);
829 822
830 ptregs = *pp; 823 ptregs = *pp;
831 ptregs.cs &= 0xffff; 824 ptregs.cs &= 0xffff;
832 ptregs.ss &= 0xffff; 825 ptregs.ss &= 0xffff;
833 826
834 elf_core_copy_regs(regs, &ptregs); 827 elf_core_copy_regs(regs, &ptregs);
835 828
836 return 1; 829 return 1;
837 } 830 }
838 831
839 unsigned long arch_align_stack(unsigned long sp) 832 unsigned long arch_align_stack(unsigned long sp)
840 { 833 {
841 if (randomize_va_space) 834 if (randomize_va_space)
842 sp -= get_random_int() % 8192; 835 sp -= get_random_int() % 8192;
843 return sp & ~0xf; 836 return sp & ~0xf;
844 } 837 }
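
The hunk above removes the kprobe_flush_task() call from exit_thread(); per the updated comment in kernel/kprobes.c below, the recycling now happens when the scheduler finishes switching away from a task that has died. The kernel/sched.c hunk is not part of this excerpt, so the following is only a rough sketch of that call site: the PF_DEAD flag check and the helper name are assumptions inferred from the comment above kprobe_flush_task(), not quoted from the patch.

#include <linux/sched.h>
#include <linux/kprobes.h>

/* Sketch only: recycle retprobe instances of a task that exited inside
 * schedule().  This runs in the context of the *next* task, after the
 * final switch away from prev, so prev's probed calls are known never
 * to return. */
static void recycle_dead_task_probes(struct task_struct *prev)
{
	if (unlikely(prev->flags & PF_DEAD)) {
		kprobe_flush_task(prev);	/* put instances back on the free list */
		put_task_struct(prev);		/* usual reference drop for a dead prev */
	}
}
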
kernel/kprobes.c
1 /* 1 /*
2 * Kernel Probes (KProbes) 2 * Kernel Probes (KProbes)
3 * kernel/kprobes.c 3 * kernel/kprobes.c
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or 7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version. 8 * (at your option) any later version.
9 * 9 *
10 * This program is distributed in the hope that it will be useful, 10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details. 13 * GNU General Public License for more details.
14 * 14 *
15 * You should have received a copy of the GNU General Public License 15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software 16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 * 18 *
19 * Copyright (C) IBM Corporation, 2002, 2004 19 * Copyright (C) IBM Corporation, 2002, 2004
20 * 20 *
21 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel 21 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
22 * Probes initial implementation (includes suggestions from 22 * Probes initial implementation (includes suggestions from
23 * Rusty Russell). 23 * Rusty Russell).
24 * 2004-Aug Updated by Prasanna S Panchamukhi <prasanna@in.ibm.com> with 24 * 2004-Aug Updated by Prasanna S Panchamukhi <prasanna@in.ibm.com> with
25 * hlists and exceptions notifier as suggested by Andi Kleen. 25 * hlists and exceptions notifier as suggested by Andi Kleen.
26 * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes 26 * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
27 * interface to access function arguments. 27 * interface to access function arguments.
28 * 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes 28 * 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes
29 * exceptions notifier to be first on the priority list. 29 * exceptions notifier to be first on the priority list.
30 * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston 30 * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston
31 * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi 31 * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
32 * <prasanna@in.ibm.com> added function-return probes. 32 * <prasanna@in.ibm.com> added function-return probes.
33 */ 33 */
34 #include <linux/kprobes.h> 34 #include <linux/kprobes.h>
35 #include <linux/hash.h> 35 #include <linux/hash.h>
36 #include <linux/init.h> 36 #include <linux/init.h>
37 #include <linux/slab.h> 37 #include <linux/slab.h>
38 #include <linux/module.h> 38 #include <linux/module.h>
39 #include <linux/moduleloader.h> 39 #include <linux/moduleloader.h>
40 #include <asm-generic/sections.h> 40 #include <asm-generic/sections.h>
41 #include <asm/cacheflush.h> 41 #include <asm/cacheflush.h>
42 #include <asm/errno.h> 42 #include <asm/errno.h>
43 #include <asm/kdebug.h> 43 #include <asm/kdebug.h>
44 44
45 #define KPROBE_HASH_BITS 6 45 #define KPROBE_HASH_BITS 6
46 #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) 46 #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
47 47
48 static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; 48 static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
49 static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; 49 static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
50 50
51 DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 51 DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
52 DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ 52 DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
53 static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 53 static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
54 54
55 #ifdef __ARCH_WANT_KPROBES_INSN_SLOT 55 #ifdef __ARCH_WANT_KPROBES_INSN_SLOT
56 /* 56 /*
57 * kprobe->ainsn.insn points to the copy of the instruction to be 57 * kprobe->ainsn.insn points to the copy of the instruction to be
58 * single-stepped. x86_64, POWER4 and above have no-exec support and 58 * single-stepped. x86_64, POWER4 and above have no-exec support and
59 * stepping on the instruction on a vmalloced/kmalloced/data page 59 * stepping on the instruction on a vmalloced/kmalloced/data page
60 * is a recipe for disaster 60 * is a recipe for disaster
61 */ 61 */
62 #define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) 62 #define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
63 63
64 struct kprobe_insn_page { 64 struct kprobe_insn_page {
65 struct hlist_node hlist; 65 struct hlist_node hlist;
66 kprobe_opcode_t *insns; /* Page of instruction slots */ 66 kprobe_opcode_t *insns; /* Page of instruction slots */
67 char slot_used[INSNS_PER_PAGE]; 67 char slot_used[INSNS_PER_PAGE];
68 int nused; 68 int nused;
69 }; 69 };
70 70
71 static struct hlist_head kprobe_insn_pages; 71 static struct hlist_head kprobe_insn_pages;
72 72
73 /** 73 /**
74 * get_insn_slot() - Find a slot on an executable page for an instruction. 74 * get_insn_slot() - Find a slot on an executable page for an instruction.
75 * We allocate an executable page if there's no room on existing ones. 75 * We allocate an executable page if there's no room on existing ones.
76 */ 76 */
77 kprobe_opcode_t __kprobes *get_insn_slot(void) 77 kprobe_opcode_t __kprobes *get_insn_slot(void)
78 { 78 {
79 struct kprobe_insn_page *kip; 79 struct kprobe_insn_page *kip;
80 struct hlist_node *pos; 80 struct hlist_node *pos;
81 81
82 hlist_for_each(pos, &kprobe_insn_pages) { 82 hlist_for_each(pos, &kprobe_insn_pages) {
83 kip = hlist_entry(pos, struct kprobe_insn_page, hlist); 83 kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
84 if (kip->nused < INSNS_PER_PAGE) { 84 if (kip->nused < INSNS_PER_PAGE) {
85 int i; 85 int i;
86 for (i = 0; i < INSNS_PER_PAGE; i++) { 86 for (i = 0; i < INSNS_PER_PAGE; i++) {
87 if (!kip->slot_used[i]) { 87 if (!kip->slot_used[i]) {
88 kip->slot_used[i] = 1; 88 kip->slot_used[i] = 1;
89 kip->nused++; 89 kip->nused++;
90 return kip->insns + (i * MAX_INSN_SIZE); 90 return kip->insns + (i * MAX_INSN_SIZE);
91 } 91 }
92 } 92 }
93 /* Surprise! No unused slots. Fix kip->nused. */ 93 /* Surprise! No unused slots. Fix kip->nused. */
94 kip->nused = INSNS_PER_PAGE; 94 kip->nused = INSNS_PER_PAGE;
95 } 95 }
96 } 96 }
97 97
98 /* All out of space. Need to allocate a new page. Use slot 0.*/ 98 /* All out of space. Need to allocate a new page. Use slot 0.*/
99 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); 99 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
100 if (!kip) { 100 if (!kip) {
101 return NULL; 101 return NULL;
102 } 102 }
103 103
104 /* 104 /*
105 * Use module_alloc so this page is within +/- 2GB of where the 105 * Use module_alloc so this page is within +/- 2GB of where the
106 * kernel image and loaded module images reside. This is required 106 * kernel image and loaded module images reside. This is required
107 * so x86_64 can correctly handle the %rip-relative fixups. 107 * so x86_64 can correctly handle the %rip-relative fixups.
108 */ 108 */
109 kip->insns = module_alloc(PAGE_SIZE); 109 kip->insns = module_alloc(PAGE_SIZE);
110 if (!kip->insns) { 110 if (!kip->insns) {
111 kfree(kip); 111 kfree(kip);
112 return NULL; 112 return NULL;
113 } 113 }
114 INIT_HLIST_NODE(&kip->hlist); 114 INIT_HLIST_NODE(&kip->hlist);
115 hlist_add_head(&kip->hlist, &kprobe_insn_pages); 115 hlist_add_head(&kip->hlist, &kprobe_insn_pages);
116 memset(kip->slot_used, 0, INSNS_PER_PAGE); 116 memset(kip->slot_used, 0, INSNS_PER_PAGE);
117 kip->slot_used[0] = 1; 117 kip->slot_used[0] = 1;
118 kip->nused = 1; 118 kip->nused = 1;
119 return kip->insns; 119 return kip->insns;
120 } 120 }
121 121
122 void __kprobes free_insn_slot(kprobe_opcode_t *slot) 122 void __kprobes free_insn_slot(kprobe_opcode_t *slot)
123 { 123 {
124 struct kprobe_insn_page *kip; 124 struct kprobe_insn_page *kip;
125 struct hlist_node *pos; 125 struct hlist_node *pos;
126 126
127 hlist_for_each(pos, &kprobe_insn_pages) { 127 hlist_for_each(pos, &kprobe_insn_pages) {
128 kip = hlist_entry(pos, struct kprobe_insn_page, hlist); 128 kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
129 if (kip->insns <= slot && 129 if (kip->insns <= slot &&
130 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 130 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
131 int i = (slot - kip->insns) / MAX_INSN_SIZE; 131 int i = (slot - kip->insns) / MAX_INSN_SIZE;
132 kip->slot_used[i] = 0; 132 kip->slot_used[i] = 0;
133 kip->nused--; 133 kip->nused--;
134 if (kip->nused == 0) { 134 if (kip->nused == 0) {
135 /* 135 /*
136 * Page is no longer in use. Free it unless 136 * Page is no longer in use. Free it unless
137 * it's the last one. We keep the last one 137 * it's the last one. We keep the last one
138 * so as not to have to set it up again the 138 * so as not to have to set it up again the
139 * next time somebody inserts a probe. 139 * next time somebody inserts a probe.
140 */ 140 */
141 hlist_del(&kip->hlist); 141 hlist_del(&kip->hlist);
142 if (hlist_empty(&kprobe_insn_pages)) { 142 if (hlist_empty(&kprobe_insn_pages)) {
143 INIT_HLIST_NODE(&kip->hlist); 143 INIT_HLIST_NODE(&kip->hlist);
144 hlist_add_head(&kip->hlist, 144 hlist_add_head(&kip->hlist,
145 &kprobe_insn_pages); 145 &kprobe_insn_pages);
146 } else { 146 } else {
147 module_free(NULL, kip->insns); 147 module_free(NULL, kip->insns);
148 kfree(kip); 148 kfree(kip);
149 } 149 }
150 } 150 }
151 return; 151 return;
152 } 152 }
153 } 153 }
154 } 154 }
155 #endif 155 #endif
156 156
157 /* We have preemption disabled.. so it is safe to use __ versions */ 157 /* We have preemption disabled.. so it is safe to use __ versions */
158 static inline void set_kprobe_instance(struct kprobe *kp) 158 static inline void set_kprobe_instance(struct kprobe *kp)
159 { 159 {
160 __get_cpu_var(kprobe_instance) = kp; 160 __get_cpu_var(kprobe_instance) = kp;
161 } 161 }
162 162
163 static inline void reset_kprobe_instance(void) 163 static inline void reset_kprobe_instance(void)
164 { 164 {
165 __get_cpu_var(kprobe_instance) = NULL; 165 __get_cpu_var(kprobe_instance) = NULL;
166 } 166 }
167 167
168 /* 168 /*
169 * This routine is called either: 169 * This routine is called either:
170 * - under the kprobe_mutex - during kprobe_[un]register() 170 * - under the kprobe_mutex - during kprobe_[un]register()
171 * OR 171 * OR
172 * - with preemption disabled - from arch/xxx/kernel/kprobes.c 172 * - with preemption disabled - from arch/xxx/kernel/kprobes.c
173 */ 173 */
174 struct kprobe __kprobes *get_kprobe(void *addr) 174 struct kprobe __kprobes *get_kprobe(void *addr)
175 { 175 {
176 struct hlist_head *head; 176 struct hlist_head *head;
177 struct hlist_node *node; 177 struct hlist_node *node;
178 struct kprobe *p; 178 struct kprobe *p;
179 179
180 head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; 180 head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
181 hlist_for_each_entry_rcu(p, node, head, hlist) { 181 hlist_for_each_entry_rcu(p, node, head, hlist) {
182 if (p->addr == addr) 182 if (p->addr == addr)
183 return p; 183 return p;
184 } 184 }
185 return NULL; 185 return NULL;
186 } 186 }
187 187
188 /* 188 /*
189 * Aggregate handlers for multiple kprobes support - these handlers 189 * Aggregate handlers for multiple kprobes support - these handlers
190 * take care of invoking the individual kprobe handlers on p->list 190 * take care of invoking the individual kprobe handlers on p->list
191 */ 191 */
192 static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) 192 static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
193 { 193 {
194 struct kprobe *kp; 194 struct kprobe *kp;
195 195
196 list_for_each_entry_rcu(kp, &p->list, list) { 196 list_for_each_entry_rcu(kp, &p->list, list) {
197 if (kp->pre_handler) { 197 if (kp->pre_handler) {
198 set_kprobe_instance(kp); 198 set_kprobe_instance(kp);
199 if (kp->pre_handler(kp, regs)) 199 if (kp->pre_handler(kp, regs))
200 return 1; 200 return 1;
201 } 201 }
202 reset_kprobe_instance(); 202 reset_kprobe_instance();
203 } 203 }
204 return 0; 204 return 0;
205 } 205 }
206 206
207 static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, 207 static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
208 unsigned long flags) 208 unsigned long flags)
209 { 209 {
210 struct kprobe *kp; 210 struct kprobe *kp;
211 211
212 list_for_each_entry_rcu(kp, &p->list, list) { 212 list_for_each_entry_rcu(kp, &p->list, list) {
213 if (kp->post_handler) { 213 if (kp->post_handler) {
214 set_kprobe_instance(kp); 214 set_kprobe_instance(kp);
215 kp->post_handler(kp, regs, flags); 215 kp->post_handler(kp, regs, flags);
216 reset_kprobe_instance(); 216 reset_kprobe_instance();
217 } 217 }
218 } 218 }
219 return; 219 return;
220 } 220 }
221 221
222 static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, 222 static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
223 int trapnr) 223 int trapnr)
224 { 224 {
225 struct kprobe *cur = __get_cpu_var(kprobe_instance); 225 struct kprobe *cur = __get_cpu_var(kprobe_instance);
226 226
227 /* 227 /*
228 * if we faulted "during" the execution of a user specified 228 * if we faulted "during" the execution of a user specified
229 * probe handler, invoke just that probe's fault handler 229 * probe handler, invoke just that probe's fault handler
230 */ 230 */
231 if (cur && cur->fault_handler) { 231 if (cur && cur->fault_handler) {
232 if (cur->fault_handler(cur, regs, trapnr)) 232 if (cur->fault_handler(cur, regs, trapnr))
233 return 1; 233 return 1;
234 } 234 }
235 return 0; 235 return 0;
236 } 236 }
237 237
238 static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) 238 static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
239 { 239 {
240 struct kprobe *cur = __get_cpu_var(kprobe_instance); 240 struct kprobe *cur = __get_cpu_var(kprobe_instance);
241 int ret = 0; 241 int ret = 0;
242 242
243 if (cur && cur->break_handler) { 243 if (cur && cur->break_handler) {
244 if (cur->break_handler(cur, regs)) 244 if (cur->break_handler(cur, regs))
245 ret = 1; 245 ret = 1;
246 } 246 }
247 reset_kprobe_instance(); 247 reset_kprobe_instance();
248 return ret; 248 return ret;
249 } 249 }
250 250
251 /* Walks the list and increments nmissed count for multiprobe case */ 251 /* Walks the list and increments nmissed count for multiprobe case */
252 void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) 252 void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
253 { 253 {
254 struct kprobe *kp; 254 struct kprobe *kp;
255 if (p->pre_handler != aggr_pre_handler) { 255 if (p->pre_handler != aggr_pre_handler) {
256 p->nmissed++; 256 p->nmissed++;
257 } else { 257 } else {
258 list_for_each_entry_rcu(kp, &p->list, list) 258 list_for_each_entry_rcu(kp, &p->list, list)
259 kp->nmissed++; 259 kp->nmissed++;
260 } 260 }
261 return; 261 return;
262 } 262 }
263 263
264 /* Called with kretprobe_lock held */ 264 /* Called with kretprobe_lock held */
265 struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp) 265 struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp)
266 { 266 {
267 struct hlist_node *node; 267 struct hlist_node *node;
268 struct kretprobe_instance *ri; 268 struct kretprobe_instance *ri;
269 hlist_for_each_entry(ri, node, &rp->free_instances, uflist) 269 hlist_for_each_entry(ri, node, &rp->free_instances, uflist)
270 return ri; 270 return ri;
271 return NULL; 271 return NULL;
272 } 272 }
273 273
274 /* Called with kretprobe_lock held */ 274 /* Called with kretprobe_lock held */
275 static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe 275 static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe
276 *rp) 276 *rp)
277 { 277 {
278 struct hlist_node *node; 278 struct hlist_node *node;
279 struct kretprobe_instance *ri; 279 struct kretprobe_instance *ri;
280 hlist_for_each_entry(ri, node, &rp->used_instances, uflist) 280 hlist_for_each_entry(ri, node, &rp->used_instances, uflist)
281 return ri; 281 return ri;
282 return NULL; 282 return NULL;
283 } 283 }
284 284
285 /* Called with kretprobe_lock held */ 285 /* Called with kretprobe_lock held */
286 void __kprobes add_rp_inst(struct kretprobe_instance *ri) 286 void __kprobes add_rp_inst(struct kretprobe_instance *ri)
287 { 287 {
288 /* 288 /*
289 * Remove rp inst off the free list - 289 * Remove rp inst off the free list -
290 * Add it back when probed function returns 290 * Add it back when probed function returns
291 */ 291 */
292 hlist_del(&ri->uflist); 292 hlist_del(&ri->uflist);
293 293
294 /* Add rp inst onto table */ 294 /* Add rp inst onto table */
295 INIT_HLIST_NODE(&ri->hlist); 295 INIT_HLIST_NODE(&ri->hlist);
296 hlist_add_head(&ri->hlist, 296 hlist_add_head(&ri->hlist,
297 &kretprobe_inst_table[hash_ptr(ri->task, KPROBE_HASH_BITS)]); 297 &kretprobe_inst_table[hash_ptr(ri->task, KPROBE_HASH_BITS)]);
298 298
299 /* Also add this rp inst to the used list. */ 299 /* Also add this rp inst to the used list. */
300 INIT_HLIST_NODE(&ri->uflist); 300 INIT_HLIST_NODE(&ri->uflist);
301 hlist_add_head(&ri->uflist, &ri->rp->used_instances); 301 hlist_add_head(&ri->uflist, &ri->rp->used_instances);
302 } 302 }
303 303
304 /* Called with kretprobe_lock held */ 304 /* Called with kretprobe_lock held */
305 void __kprobes recycle_rp_inst(struct kretprobe_instance *ri) 305 void __kprobes recycle_rp_inst(struct kretprobe_instance *ri)
306 { 306 {
307 /* remove rp inst off the rprobe_inst_table */ 307 /* remove rp inst off the rprobe_inst_table */
308 hlist_del(&ri->hlist); 308 hlist_del(&ri->hlist);
309 if (ri->rp) { 309 if (ri->rp) {
310 /* remove rp inst off the used list */ 310 /* remove rp inst off the used list */
311 hlist_del(&ri->uflist); 311 hlist_del(&ri->uflist);
312 /* put rp inst back onto the free list */ 312 /* put rp inst back onto the free list */
313 INIT_HLIST_NODE(&ri->uflist); 313 INIT_HLIST_NODE(&ri->uflist);
314 hlist_add_head(&ri->uflist, &ri->rp->free_instances); 314 hlist_add_head(&ri->uflist, &ri->rp->free_instances);
315 } else 315 } else
316 /* Unregistering */ 316 /* Unregistering */
317 kfree(ri); 317 kfree(ri);
318 } 318 }
319 319
320 struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) 320 struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
321 { 321 {
322 return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; 322 return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
323 } 323 }
324 324
325 /* 325 /*
326 * This function is called from exit_thread or flush_thread when task tk's 326 * This function is called from finish_task_switch when task tk becomes dead,
327 * stack is being recycled so that we can recycle any function-return probe 327 * so that we can recycle any function-return probe instances associated
328 * instances associated with this task. These left over instances represent 328 * with this task. These left over instances represent probed functions
329 * probed functions that have been called but will never return. 329 * that have been called but will never return.
330 */ 330 */
331 void __kprobes kprobe_flush_task(struct task_struct *tk) 331 void __kprobes kprobe_flush_task(struct task_struct *tk)
332 { 332 {
333 struct kretprobe_instance *ri; 333 struct kretprobe_instance *ri;
334 struct hlist_head *head; 334 struct hlist_head *head;
335 struct hlist_node *node, *tmp; 335 struct hlist_node *node, *tmp;
336 unsigned long flags = 0; 336 unsigned long flags = 0;
337 337
338 spin_lock_irqsave(&kretprobe_lock, flags); 338 spin_lock_irqsave(&kretprobe_lock, flags);
339 head = kretprobe_inst_table_head(current); 339 head = kretprobe_inst_table_head(tk);
340 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { 340 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
341 if (ri->task == tk) 341 if (ri->task == tk)
342 recycle_rp_inst(ri); 342 recycle_rp_inst(ri);
343 } 343 }
344 spin_unlock_irqrestore(&kretprobe_lock, flags); 344 spin_unlock_irqrestore(&kretprobe_lock, flags);
345 } 345 }
346 346
347 static inline void free_rp_inst(struct kretprobe *rp) 347 static inline void free_rp_inst(struct kretprobe *rp)
348 { 348 {
349 struct kretprobe_instance *ri; 349 struct kretprobe_instance *ri;
350 while ((ri = get_free_rp_inst(rp)) != NULL) { 350 while ((ri = get_free_rp_inst(rp)) != NULL) {
351 hlist_del(&ri->uflist); 351 hlist_del(&ri->uflist);
352 kfree(ri); 352 kfree(ri);
353 } 353 }
354 } 354 }
355 355
356 /* 356 /*
357 * Keep all fields in the kprobe consistent 357 * Keep all fields in the kprobe consistent
358 */ 358 */
359 static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) 359 static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
360 { 360 {
361 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); 361 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
362 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); 362 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
363 } 363 }
364 364
365 /* 365 /*
366 * Add the new probe to old_p->list. Fail if this is the 366 * Add the new probe to old_p->list. Fail if this is the
367 * second jprobe at the address - two jprobes can't coexist 367 * second jprobe at the address - two jprobes can't coexist
368 */ 368 */
369 static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) 369 static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
370 { 370 {
371 struct kprobe *kp; 371 struct kprobe *kp;
372 372
373 if (p->break_handler) { 373 if (p->break_handler) {
374 list_for_each_entry_rcu(kp, &old_p->list, list) { 374 list_for_each_entry_rcu(kp, &old_p->list, list) {
375 if (kp->break_handler) 375 if (kp->break_handler)
376 return -EEXIST; 376 return -EEXIST;
377 } 377 }
378 list_add_tail_rcu(&p->list, &old_p->list); 378 list_add_tail_rcu(&p->list, &old_p->list);
379 } else 379 } else
380 list_add_rcu(&p->list, &old_p->list); 380 list_add_rcu(&p->list, &old_p->list);
381 return 0; 381 return 0;
382 } 382 }
383 383
384 /* 384 /*
385 * Fill in the required fields of the "manager kprobe". Replace the 385 * Fill in the required fields of the "manager kprobe". Replace the
386 * earlier kprobe in the hlist with the manager kprobe 386 * earlier kprobe in the hlist with the manager kprobe
387 */ 387 */
388 static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) 388 static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
389 { 389 {
390 copy_kprobe(p, ap); 390 copy_kprobe(p, ap);
391 ap->addr = p->addr; 391 ap->addr = p->addr;
392 ap->pre_handler = aggr_pre_handler; 392 ap->pre_handler = aggr_pre_handler;
393 ap->post_handler = aggr_post_handler; 393 ap->post_handler = aggr_post_handler;
394 ap->fault_handler = aggr_fault_handler; 394 ap->fault_handler = aggr_fault_handler;
395 ap->break_handler = aggr_break_handler; 395 ap->break_handler = aggr_break_handler;
396 396
397 INIT_LIST_HEAD(&ap->list); 397 INIT_LIST_HEAD(&ap->list);
398 list_add_rcu(&p->list, &ap->list); 398 list_add_rcu(&p->list, &ap->list);
399 399
400 hlist_replace_rcu(&p->hlist, &ap->hlist); 400 hlist_replace_rcu(&p->hlist, &ap->hlist);
401 } 401 }
402 402
403 /* 403 /*
404 * This is the second or subsequent kprobe at the address - handle 404 * This is the second or subsequent kprobe at the address - handle
405 * the intricacies 405 * the intricacies
406 */ 406 */
407 static int __kprobes register_aggr_kprobe(struct kprobe *old_p, 407 static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
408 struct kprobe *p) 408 struct kprobe *p)
409 { 409 {
410 int ret = 0; 410 int ret = 0;
411 struct kprobe *ap; 411 struct kprobe *ap;
412 412
413 if (old_p->pre_handler == aggr_pre_handler) { 413 if (old_p->pre_handler == aggr_pre_handler) {
414 copy_kprobe(old_p, p); 414 copy_kprobe(old_p, p);
415 ret = add_new_kprobe(old_p, p); 415 ret = add_new_kprobe(old_p, p);
416 } else { 416 } else {
417 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); 417 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
418 if (!ap) 418 if (!ap)
419 return -ENOMEM; 419 return -ENOMEM;
420 add_aggr_kprobe(ap, old_p); 420 add_aggr_kprobe(ap, old_p);
421 copy_kprobe(ap, p); 421 copy_kprobe(ap, p);
422 ret = add_new_kprobe(ap, p); 422 ret = add_new_kprobe(ap, p);
423 } 423 }
424 return ret; 424 return ret;
425 } 425 }
426 426
427 static int __kprobes in_kprobes_functions(unsigned long addr) 427 static int __kprobes in_kprobes_functions(unsigned long addr)
428 { 428 {
429 if (addr >= (unsigned long)__kprobes_text_start 429 if (addr >= (unsigned long)__kprobes_text_start
430 && addr < (unsigned long)__kprobes_text_end) 430 && addr < (unsigned long)__kprobes_text_end)
431 return -EINVAL; 431 return -EINVAL;
432 return 0; 432 return 0;
433 } 433 }
434 434
435 static int __kprobes __register_kprobe(struct kprobe *p, 435 static int __kprobes __register_kprobe(struct kprobe *p,
436 unsigned long called_from) 436 unsigned long called_from)
437 { 437 {
438 int ret = 0; 438 int ret = 0;
439 struct kprobe *old_p; 439 struct kprobe *old_p;
440 struct module *probed_mod; 440 struct module *probed_mod;
441 441
442 if ((!kernel_text_address((unsigned long) p->addr)) || 442 if ((!kernel_text_address((unsigned long) p->addr)) ||
443 in_kprobes_functions((unsigned long) p->addr)) 443 in_kprobes_functions((unsigned long) p->addr))
444 return -EINVAL; 444 return -EINVAL;
445 445
446 p->mod_refcounted = 0; 446 p->mod_refcounted = 0;
447 /* Check if we are probing a module */ 447 /* Check if we are probing a module */
448 if ((probed_mod = module_text_address((unsigned long) p->addr))) { 448 if ((probed_mod = module_text_address((unsigned long) p->addr))) {
449 struct module *calling_mod = module_text_address(called_from); 449 struct module *calling_mod = module_text_address(called_from);
450 /* We must allow modules to probe themselves and 450 /* We must allow modules to probe themselves and
451 * in this case avoid incrementing the module refcount, 451 * in this case avoid incrementing the module refcount,
452 * so as to allow unloading of self probing modules. 452 * so as to allow unloading of self probing modules.
453 */ 453 */
454 if (calling_mod && (calling_mod != probed_mod)) { 454 if (calling_mod && (calling_mod != probed_mod)) {
455 if (unlikely(!try_module_get(probed_mod))) 455 if (unlikely(!try_module_get(probed_mod)))
456 return -EINVAL; 456 return -EINVAL;
457 p->mod_refcounted = 1; 457 p->mod_refcounted = 1;
458 } else 458 } else
459 probed_mod = NULL; 459 probed_mod = NULL;
460 } 460 }
461 461
462 p->nmissed = 0; 462 p->nmissed = 0;
463 mutex_lock(&kprobe_mutex); 463 mutex_lock(&kprobe_mutex);
464 old_p = get_kprobe(p->addr); 464 old_p = get_kprobe(p->addr);
465 if (old_p) { 465 if (old_p) {
466 ret = register_aggr_kprobe(old_p, p); 466 ret = register_aggr_kprobe(old_p, p);
467 goto out; 467 goto out;
468 } 468 }
469 469
470 if ((ret = arch_prepare_kprobe(p)) != 0) 470 if ((ret = arch_prepare_kprobe(p)) != 0)
471 goto out; 471 goto out;
472 472
473 INIT_HLIST_NODE(&p->hlist); 473 INIT_HLIST_NODE(&p->hlist);
474 hlist_add_head_rcu(&p->hlist, 474 hlist_add_head_rcu(&p->hlist,
475 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 475 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
476 476
477 arch_arm_kprobe(p); 477 arch_arm_kprobe(p);
478 478
479 out: 479 out:
480 mutex_unlock(&kprobe_mutex); 480 mutex_unlock(&kprobe_mutex);
481 481
482 if (ret && probed_mod) 482 if (ret && probed_mod)
483 module_put(probed_mod); 483 module_put(probed_mod);
484 return ret; 484 return ret;
485 } 485 }
486 486
487 int __kprobes register_kprobe(struct kprobe *p) 487 int __kprobes register_kprobe(struct kprobe *p)
488 { 488 {
489 return __register_kprobe(p, 489 return __register_kprobe(p,
490 (unsigned long)__builtin_return_address(0)); 490 (unsigned long)__builtin_return_address(0));
491 } 491 }
492 492
493 void __kprobes unregister_kprobe(struct kprobe *p) 493 void __kprobes unregister_kprobe(struct kprobe *p)
494 { 494 {
495 struct module *mod; 495 struct module *mod;
496 struct kprobe *old_p, *list_p; 496 struct kprobe *old_p, *list_p;
497 int cleanup_p; 497 int cleanup_p;
498 498
499 mutex_lock(&kprobe_mutex); 499 mutex_lock(&kprobe_mutex);
500 old_p = get_kprobe(p->addr); 500 old_p = get_kprobe(p->addr);
501 if (unlikely(!old_p)) { 501 if (unlikely(!old_p)) {
502 mutex_unlock(&kprobe_mutex); 502 mutex_unlock(&kprobe_mutex);
503 return; 503 return;
504 } 504 }
505 if (p != old_p) { 505 if (p != old_p) {
506 list_for_each_entry_rcu(list_p, &old_p->list, list) 506 list_for_each_entry_rcu(list_p, &old_p->list, list)
507 if (list_p == p) 507 if (list_p == p)
508 /* kprobe p is a valid probe */ 508 /* kprobe p is a valid probe */
509 goto valid_p; 509 goto valid_p;
510 mutex_unlock(&kprobe_mutex); 510 mutex_unlock(&kprobe_mutex);
511 return; 511 return;
512 } 512 }
513 valid_p: 513 valid_p:
514 if ((old_p == p) || ((old_p->pre_handler == aggr_pre_handler) && 514 if ((old_p == p) || ((old_p->pre_handler == aggr_pre_handler) &&
515 (p->list.next == &old_p->list) && 515 (p->list.next == &old_p->list) &&
516 (p->list.prev == &old_p->list))) { 516 (p->list.prev == &old_p->list))) {
517 /* Only probe on the hash list */ 517 /* Only probe on the hash list */
518 arch_disarm_kprobe(p); 518 arch_disarm_kprobe(p);
519 hlist_del_rcu(&old_p->hlist); 519 hlist_del_rcu(&old_p->hlist);
520 cleanup_p = 1; 520 cleanup_p = 1;
521 } else { 521 } else {
522 list_del_rcu(&p->list); 522 list_del_rcu(&p->list);
523 cleanup_p = 0; 523 cleanup_p = 0;
524 } 524 }
525 525
526 mutex_unlock(&kprobe_mutex); 526 mutex_unlock(&kprobe_mutex);
527 527
528 synchronize_sched(); 528 synchronize_sched();
529 if (p->mod_refcounted && 529 if (p->mod_refcounted &&
530 (mod = module_text_address((unsigned long)p->addr))) 530 (mod = module_text_address((unsigned long)p->addr)))
531 module_put(mod); 531 module_put(mod);
532 532
533 if (cleanup_p) { 533 if (cleanup_p) {
534 if (p != old_p) { 534 if (p != old_p) {
535 list_del_rcu(&p->list); 535 list_del_rcu(&p->list);
536 kfree(old_p); 536 kfree(old_p);
537 } 537 }
538 arch_remove_kprobe(p); 538 arch_remove_kprobe(p);
539 } 539 }
540 } 540 }
541 541
542 static struct notifier_block kprobe_exceptions_nb = { 542 static struct notifier_block kprobe_exceptions_nb = {
543 .notifier_call = kprobe_exceptions_notify, 543 .notifier_call = kprobe_exceptions_notify,
544 .priority = 0x7fffffff /* we need to be notified first */ 544 .priority = 0x7fffffff /* we need to be notified first */
545 }; 545 };
546 546
547 int __kprobes register_jprobe(struct jprobe *jp) 547 int __kprobes register_jprobe(struct jprobe *jp)
548 { 548 {
549 /* Todo: Verify probepoint is a function entry point */ 549 /* Todo: Verify probepoint is a function entry point */
550 jp->kp.pre_handler = setjmp_pre_handler; 550 jp->kp.pre_handler = setjmp_pre_handler;
551 jp->kp.break_handler = longjmp_break_handler; 551 jp->kp.break_handler = longjmp_break_handler;
552 552
553 return __register_kprobe(&jp->kp, 553 return __register_kprobe(&jp->kp,
554 (unsigned long)__builtin_return_address(0)); 554 (unsigned long)__builtin_return_address(0));
555 } 555 }
556 556
557 void __kprobes unregister_jprobe(struct jprobe *jp) 557 void __kprobes unregister_jprobe(struct jprobe *jp)
558 { 558 {
559 unregister_kprobe(&jp->kp); 559 unregister_kprobe(&jp->kp);
560 } 560 }
561 561
562 #ifdef ARCH_SUPPORTS_KRETPROBES 562 #ifdef ARCH_SUPPORTS_KRETPROBES
563 563
564 /* 564 /*
565 * This kprobe pre_handler is registered with every kretprobe. When probe 565 * This kprobe pre_handler is registered with every kretprobe. When probe
566 * hits it will set up the return probe. 566 * hits it will set up the return probe.
567 */ 567 */
568 static int __kprobes pre_handler_kretprobe(struct kprobe *p, 568 static int __kprobes pre_handler_kretprobe(struct kprobe *p,
569 struct pt_regs *regs) 569 struct pt_regs *regs)
570 { 570 {
571 struct kretprobe *rp = container_of(p, struct kretprobe, kp); 571 struct kretprobe *rp = container_of(p, struct kretprobe, kp);
572 unsigned long flags = 0; 572 unsigned long flags = 0;
573 573
574 /*TODO: consider to only swap the RA after the last pre_handler fired */ 574 /*TODO: consider to only swap the RA after the last pre_handler fired */
575 spin_lock_irqsave(&kretprobe_lock, flags); 575 spin_lock_irqsave(&kretprobe_lock, flags);
576 arch_prepare_kretprobe(rp, regs); 576 arch_prepare_kretprobe(rp, regs);
577 spin_unlock_irqrestore(&kretprobe_lock, flags); 577 spin_unlock_irqrestore(&kretprobe_lock, flags);
578 return 0; 578 return 0;
579 } 579 }
580 580
581 int __kprobes register_kretprobe(struct kretprobe *rp) 581 int __kprobes register_kretprobe(struct kretprobe *rp)
582 { 582 {
583 int ret = 0; 583 int ret = 0;
584 struct kretprobe_instance *inst; 584 struct kretprobe_instance *inst;
585 int i; 585 int i;
586 586
587 rp->kp.pre_handler = pre_handler_kretprobe; 587 rp->kp.pre_handler = pre_handler_kretprobe;
588 588
589 /* Pre-allocate memory for max kretprobe instances */ 589 /* Pre-allocate memory for max kretprobe instances */
590 if (rp->maxactive <= 0) { 590 if (rp->maxactive <= 0) {
591 #ifdef CONFIG_PREEMPT 591 #ifdef CONFIG_PREEMPT
592 rp->maxactive = max(10, 2 * NR_CPUS); 592 rp->maxactive = max(10, 2 * NR_CPUS);
593 #else 593 #else
594 rp->maxactive = NR_CPUS; 594 rp->maxactive = NR_CPUS;
595 #endif 595 #endif
596 } 596 }
597 INIT_HLIST_HEAD(&rp->used_instances); 597 INIT_HLIST_HEAD(&rp->used_instances);
598 INIT_HLIST_HEAD(&rp->free_instances); 598 INIT_HLIST_HEAD(&rp->free_instances);
599 for (i = 0; i < rp->maxactive; i++) { 599 for (i = 0; i < rp->maxactive; i++) {
600 inst = kmalloc(sizeof(struct kretprobe_instance), GFP_KERNEL); 600 inst = kmalloc(sizeof(struct kretprobe_instance), GFP_KERNEL);
601 if (inst == NULL) { 601 if (inst == NULL) {
602 free_rp_inst(rp); 602 free_rp_inst(rp);
603 return -ENOMEM; 603 return -ENOMEM;
604 } 604 }
605 INIT_HLIST_NODE(&inst->uflist); 605 INIT_HLIST_NODE(&inst->uflist);
606 hlist_add_head(&inst->uflist, &rp->free_instances); 606 hlist_add_head(&inst->uflist, &rp->free_instances);
607 } 607 }
608 608
609 rp->nmissed = 0; 609 rp->nmissed = 0;
610 /* Establish function entry probe point */ 610 /* Establish function entry probe point */
611 if ((ret = __register_kprobe(&rp->kp, 611 if ((ret = __register_kprobe(&rp->kp,
612 (unsigned long)__builtin_return_address(0))) != 0) 612 (unsigned long)__builtin_return_address(0))) != 0)
613 free_rp_inst(rp); 613 free_rp_inst(rp);
614 return ret; 614 return ret;
615 } 615 }
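For orientation, a minimal, hypothetical consumer of the API above (not part of this patch): the probed address, the module names and the use of %eax as the i386 return-value register are assumptions of this sketch; the instance pre-allocation and recycling shown in register_kretprobe()/unregister_kretprobe() happen behind these two calls.

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kprobes.h>

static unsigned long probed_addr;	/* hypothetical: filled in from System.map or kallsyms */

static int my_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	/* on i386 the probed function's return value is in %eax */
	printk(KERN_INFO "probed function returned %ld\n", regs->eax);
	return 0;
}

static struct kretprobe my_rp = {
	.handler   = my_ret_handler,
	.maxactive = 20,	/* pre-allocates 20 kretprobe_instance objects */
};

static int __init kret_example_init(void)
{
	my_rp.kp.addr = (kprobe_opcode_t *)probed_addr;
	return register_kretprobe(&my_rp);
}

static void __exit kret_example_exit(void)
{
	unregister_kretprobe(&my_rp);	/* drains used_instances, frees free_instances */
}

module_init(kret_example_init);
module_exit(kret_example_exit);
MODULE_LICENSE("GPL");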
616 616
617 #else /* ARCH_SUPPORTS_KRETPROBES */ 617 #else /* ARCH_SUPPORTS_KRETPROBES */
618 618
619 int __kprobes register_kretprobe(struct kretprobe *rp) 619 int __kprobes register_kretprobe(struct kretprobe *rp)
620 { 620 {
621 return -ENOSYS; 621 return -ENOSYS;
622 } 622 }
623 623
624 #endif /* ARCH_SUPPORTS_KRETPROBES */ 624 #endif /* ARCH_SUPPORTS_KRETPROBES */
625 625
626 void __kprobes unregister_kretprobe(struct kretprobe *rp) 626 void __kprobes unregister_kretprobe(struct kretprobe *rp)
627 { 627 {
628 unsigned long flags; 628 unsigned long flags;
629 struct kretprobe_instance *ri; 629 struct kretprobe_instance *ri;
630 630
631 unregister_kprobe(&rp->kp); 631 unregister_kprobe(&rp->kp);
632 /* No race here */ 632 /* No race here */
633 spin_lock_irqsave(&kretprobe_lock, flags); 633 spin_lock_irqsave(&kretprobe_lock, flags);
634 while ((ri = get_used_rp_inst(rp)) != NULL) { 634 while ((ri = get_used_rp_inst(rp)) != NULL) {
635 ri->rp = NULL; 635 ri->rp = NULL;
636 hlist_del(&ri->uflist); 636 hlist_del(&ri->uflist);
637 } 637 }
638 spin_unlock_irqrestore(&kretprobe_lock, flags); 638 spin_unlock_irqrestore(&kretprobe_lock, flags);
639 free_rp_inst(rp); 639 free_rp_inst(rp);
640 } 640 }
641 641
642 static int __init init_kprobes(void) 642 static int __init init_kprobes(void)
643 { 643 {
644 int i, err = 0; 644 int i, err = 0;
645 645
646 /* FIXME allocate the probe table, currently defined statically */ 646 /* FIXME allocate the probe table, currently defined statically */
647 /* initialize all list heads */ 647 /* initialize all list heads */
648 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 648 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
649 INIT_HLIST_HEAD(&kprobe_table[i]); 649 INIT_HLIST_HEAD(&kprobe_table[i]);
650 INIT_HLIST_HEAD(&kretprobe_inst_table[i]); 650 INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
651 } 651 }
652 652
653 err = arch_init_kprobes(); 653 err = arch_init_kprobes();
654 if (!err) 654 if (!err)
655 err = register_die_notifier(&kprobe_exceptions_nb); 655 err = register_die_notifier(&kprobe_exceptions_nb);
656 656
657 return err; 657 return err;
658 } 658 }
659 659
660 __initcall(init_kprobes); 660 __initcall(init_kprobes);
661 661
662 EXPORT_SYMBOL_GPL(register_kprobe); 662 EXPORT_SYMBOL_GPL(register_kprobe);
663 EXPORT_SYMBOL_GPL(unregister_kprobe); 663 EXPORT_SYMBOL_GPL(unregister_kprobe);
664 EXPORT_SYMBOL_GPL(register_jprobe); 664 EXPORT_SYMBOL_GPL(register_jprobe);
665 EXPORT_SYMBOL_GPL(unregister_jprobe); 665 EXPORT_SYMBOL_GPL(unregister_jprobe);
666 EXPORT_SYMBOL_GPL(jprobe_return); 666 EXPORT_SYMBOL_GPL(jprobe_return);
667 EXPORT_SYMBOL_GPL(register_kretprobe); 667 EXPORT_SYMBOL_GPL(register_kretprobe);
668 EXPORT_SYMBOL_GPL(unregister_kretprobe); 668 EXPORT_SYMBOL_GPL(unregister_kretprobe);
669 669
670 670
kernel/sched.c
1 /* 1 /*
2 * kernel/sched.c 2 * kernel/sched.c
3 * 3 *
4 * Kernel scheduler and related syscalls 4 * Kernel scheduler and related syscalls
5 * 5 *
6 * Copyright (C) 1991-2002 Linus Torvalds 6 * Copyright (C) 1991-2002 Linus Torvalds
7 * 7 *
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and 8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe 9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff 10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli 11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: 12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with 13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices 14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions 15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas. 17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin 18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 */ 19 */
20 20
21 #include <linux/mm.h> 21 #include <linux/mm.h>
22 #include <linux/module.h> 22 #include <linux/module.h>
23 #include <linux/nmi.h> 23 #include <linux/nmi.h>
24 #include <linux/init.h> 24 #include <linux/init.h>
25 #include <asm/uaccess.h> 25 #include <asm/uaccess.h>
26 #include <linux/highmem.h> 26 #include <linux/highmem.h>
27 #include <linux/smp_lock.h> 27 #include <linux/smp_lock.h>
28 #include <asm/mmu_context.h> 28 #include <asm/mmu_context.h>
29 #include <linux/interrupt.h> 29 #include <linux/interrupt.h>
30 #include <linux/capability.h> 30 #include <linux/capability.h>
31 #include <linux/completion.h> 31 #include <linux/completion.h>
32 #include <linux/kernel_stat.h> 32 #include <linux/kernel_stat.h>
33 #include <linux/security.h> 33 #include <linux/security.h>
34 #include <linux/notifier.h> 34 #include <linux/notifier.h>
35 #include <linux/profile.h> 35 #include <linux/profile.h>
36 #include <linux/suspend.h> 36 #include <linux/suspend.h>
37 #include <linux/vmalloc.h> 37 #include <linux/vmalloc.h>
38 #include <linux/blkdev.h> 38 #include <linux/blkdev.h>
39 #include <linux/delay.h> 39 #include <linux/delay.h>
40 #include <linux/smp.h> 40 #include <linux/smp.h>
41 #include <linux/threads.h> 41 #include <linux/threads.h>
42 #include <linux/timer.h> 42 #include <linux/timer.h>
43 #include <linux/rcupdate.h> 43 #include <linux/rcupdate.h>
44 #include <linux/cpu.h> 44 #include <linux/cpu.h>
45 #include <linux/cpuset.h> 45 #include <linux/cpuset.h>
46 #include <linux/percpu.h> 46 #include <linux/percpu.h>
47 #include <linux/kthread.h> 47 #include <linux/kthread.h>
48 #include <linux/seq_file.h> 48 #include <linux/seq_file.h>
49 #include <linux/syscalls.h> 49 #include <linux/syscalls.h>
50 #include <linux/times.h> 50 #include <linux/times.h>
51 #include <linux/acct.h> 51 #include <linux/acct.h>
52 #include <linux/kprobes.h>
52 #include <asm/tlb.h> 53 #include <asm/tlb.h>
53 54
54 #include <asm/unistd.h> 55 #include <asm/unistd.h>
55 56
56 /* 57 /*
57 * Convert user-nice values [ -20 ... 0 ... 19 ] 58 * Convert user-nice values [ -20 ... 0 ... 19 ]
58 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 59 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
59 * and back. 60 * and back.
60 */ 61 */
61 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) 62 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
62 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) 63 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
63 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) 64 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
64 65
65 /* 66 /*
66 * 'User priority' is the nice value converted to something we 67 * 'User priority' is the nice value converted to something we
67 * can work with better when scaling various scheduler parameters, 68 * can work with better when scaling various scheduler parameters,
68 * it's a [ 0 ... 39 ] range. 69 * it's a [ 0 ... 39 ] range.
69 */ 70 */
70 #define USER_PRIO(p) ((p)-MAX_RT_PRIO) 71 #define USER_PRIO(p) ((p)-MAX_RT_PRIO)
71 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) 72 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
72 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) 73 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
73 74
74 /* 75 /*
75 * Some helpers for converting nanosecond timing to jiffy resolution 76 * Some helpers for converting nanosecond timing to jiffy resolution
76 */ 77 */
77 #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) 78 #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
78 #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) 79 #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
79 80
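A few spot checks of the conversions above, assuming the usual values MAX_RT_PRIO = 100, MAX_PRIO = 140 and HZ = 1000:

NICE_TO_PRIO(-20) = 100 - 20 + 20  = 100   /* best user priority  */
NICE_TO_PRIO(0)   = 100 +  0 + 20  = 120   /* default             */
NICE_TO_PRIO(19)  = 100 + 19 + 20  = 139   /* worst user priority */
USER_PRIO(120)    = 120 - 100      = 20
MAX_USER_PRIO     = USER_PRIO(140) = 40
NS_TO_JIFFIES(10000000) = 10000000 / (1000000000 / 1000) = 10 jiffies (10 ms)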
80 /* 81 /*
81 * These are the 'tuning knobs' of the scheduler: 82 * These are the 'tuning knobs' of the scheduler:
82 * 83 *
83 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), 84 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
84 * default timeslice is 100 msecs, maximum timeslice is 800 msecs. 85 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
85 * Timeslices get refilled after they expire. 86 * Timeslices get refilled after they expire.
86 */ 87 */
87 #define MIN_TIMESLICE max(5 * HZ / 1000, 1) 88 #define MIN_TIMESLICE max(5 * HZ / 1000, 1)
88 #define DEF_TIMESLICE (100 * HZ / 1000) 89 #define DEF_TIMESLICE (100 * HZ / 1000)
89 #define ON_RUNQUEUE_WEIGHT 30 90 #define ON_RUNQUEUE_WEIGHT 30
90 #define CHILD_PENALTY 95 91 #define CHILD_PENALTY 95
91 #define PARENT_PENALTY 100 92 #define PARENT_PENALTY 100
92 #define EXIT_WEIGHT 3 93 #define EXIT_WEIGHT 3
93 #define PRIO_BONUS_RATIO 25 94 #define PRIO_BONUS_RATIO 25
94 #define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) 95 #define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
95 #define INTERACTIVE_DELTA 2 96 #define INTERACTIVE_DELTA 2
96 #define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) 97 #define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
97 #define STARVATION_LIMIT (MAX_SLEEP_AVG) 98 #define STARVATION_LIMIT (MAX_SLEEP_AVG)
98 #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) 99 #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
99 100
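Plugging HZ = 1000 and MAX_USER_PRIO = 40 into the knobs above gives the concrete values the rest of this file is tuned around (a worked example, not new code):

MIN_TIMESLICE    = max(5 * 1000 / 1000, 1) = 5 jiffies   =   5 ms
DEF_TIMESLICE    = 100 * 1000 / 1000       = 100 jiffies = 100 ms
MAX_BONUS        = 40 * 25 / 100           = 10
MAX_SLEEP_AVG    = 100 * 10                = 1000 jiffies = 1 s
NS_MAX_SLEEP_AVG = JIFFIES_TO_NS(1000)     = 1000000000 ns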
100 /* 101 /*
101 * If a task is 'interactive' then we reinsert it in the active 102 * If a task is 'interactive' then we reinsert it in the active
102 * array after it has expired its current timeslice. (it will not 103 * array after it has expired its current timeslice. (it will not
103 * continue to run immediately, it will still roundrobin with 104 * continue to run immediately, it will still roundrobin with
104 * other interactive tasks.) 105 * other interactive tasks.)
105 * 106 *
106 * This part scales the interactivity limit depending on niceness. 107 * This part scales the interactivity limit depending on niceness.
107 * 108 *
108 * We scale it linearly, offset by the INTERACTIVE_DELTA delta. 109 * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
109 * Here are a few examples of different nice levels: 110 * Here are a few examples of different nice levels:
110 * 111 *
111 * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] 112 * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
112 * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] 113 * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
113 * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] 114 * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
114 * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] 115 * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
115 * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] 116 * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
116 * 117 *
117 * (the X axis represents the possible -5 ... 0 ... +5 dynamic 118 * (the X axis represents the possible -5 ... 0 ... +5 dynamic
118 * priority range a task can explore, a value of '1' means the 119 * priority range a task can explore, a value of '1' means the
119 * task is rated interactive.) 120 * task is rated interactive.)
120 * 121 *
121 * Ie. nice +19 tasks can never get 'interactive' enough to be 122 * Ie. nice +19 tasks can never get 'interactive' enough to be
122 * reinserted into the active array. And only heavily CPU-hog nice -20 123 * reinserted into the active array. And only heavily CPU-hog nice -20
123 * tasks will be expired. Default nice 0 tasks are somewhere between, 124 * tasks will be expired. Default nice 0 tasks are somewhere between,
124 * it takes some effort for them to get interactive, but it's not 125 * it takes some effort for them to get interactive, but it's not
125 * too hard. 126 * too hard.
126 */ 127 */
127 128
128 #define CURRENT_BONUS(p) \ 129 #define CURRENT_BONUS(p) \
129 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ 130 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
130 MAX_SLEEP_AVG) 131 MAX_SLEEP_AVG)
131 132
132 #define GRANULARITY (10 * HZ / 1000 ? : 1) 133 #define GRANULARITY (10 * HZ / 1000 ? : 1)
133 134
134 #ifdef CONFIG_SMP 135 #ifdef CONFIG_SMP
135 #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ 136 #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
136 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ 137 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
137 num_online_cpus()) 138 num_online_cpus())
138 #else 139 #else
139 #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ 140 #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
140 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) 141 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
141 #endif 142 #endif
142 143
143 #define SCALE(v1,v1_max,v2_max) \ 144 #define SCALE(v1,v1_max,v2_max) \
144 (v1) * (v2_max) / (v1_max) 145 (v1) * (v2_max) / (v1_max)
145 146
146 #define DELTA(p) \ 147 #define DELTA(p) \
147 (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) 148 (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA)
148 149
149 #define TASK_INTERACTIVE(p) \ 150 #define TASK_INTERACTIVE(p) \
150 ((p)->prio <= (p)->static_prio - DELTA(p)) 151 ((p)->prio <= (p)->static_prio - DELTA(p))
151 152
152 #define INTERACTIVE_SLEEP(p) \ 153 #define INTERACTIVE_SLEEP(p) \
153 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ 154 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
154 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) 155 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
155 156
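Continuing the HZ = 1000 example for a default nice-0 task (static_prio 120):

DELTA(p)             = SCALE(0, 40, 10) + 2 = 2
TASK_INTERACTIVE(p)  = (p->prio <= 120 - 2 = 118)
INTERACTIVE_SLEEP(p) = JIFFIES_TO_NS(1000 * (5 + 2 + 1) / 10 - 1) = JIFFIES_TO_NS(799), about 799 ms

Since effective_prio() further down computes p->prio as 120 - (CURRENT_BONUS(p) - 5), the interactivity test is equivalent to CURRENT_BONUS(p) >= 7, i.e. roughly 700 ms of accumulated sleep_avg.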
156 #define TASK_PREEMPTS_CURR(p, rq) \ 157 #define TASK_PREEMPTS_CURR(p, rq) \
157 ((p)->prio < (rq)->curr->prio) 158 ((p)->prio < (rq)->curr->prio)
158 159
159 /* 160 /*
160 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] 161 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
161 * to time slice values: [800ms ... 100ms ... 5ms] 162 * to time slice values: [800ms ... 100ms ... 5ms]
162 * 163 *
163 * The higher a thread's priority, the bigger timeslices 164 * The higher a thread's priority, the bigger timeslices
164 * it gets during one round of execution. But even the lowest 165 * it gets during one round of execution. But even the lowest
165 * priority thread gets MIN_TIMESLICE worth of execution time. 166 * priority thread gets MIN_TIMESLICE worth of execution time.
166 */ 167 */
167 168
168 #define SCALE_PRIO(x, prio) \ 169 #define SCALE_PRIO(x, prio) \
169 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) 170 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
170 171
171 static unsigned int task_timeslice(task_t *p) 172 static unsigned int task_timeslice(task_t *p)
172 { 173 {
173 if (p->static_prio < NICE_TO_PRIO(0)) 174 if (p->static_prio < NICE_TO_PRIO(0))
174 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); 175 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
175 else 176 else
176 return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); 177 return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
177 } 178 }
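With the same HZ = 1000 values, task_timeslice() reproduces the 800 ms / 100 ms / 5 ms range quoted above:

nice -20 (static_prio 100): SCALE_PRIO(400, 100) = max(400 * (140 - 100) / 20, 5) = 800 jiffies = 800 ms
nice   0 (static_prio 120): SCALE_PRIO(100, 120) = max(100 * (140 - 120) / 20, 5) = 100 jiffies = 100 ms
nice +19 (static_prio 139): SCALE_PRIO(100, 139) = max(100 * (140 - 139) / 20, 5) =   5 jiffies =   5 ms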
178 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ 179 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
179 < (long long) (sd)->cache_hot_time) 180 < (long long) (sd)->cache_hot_time)
180 181
181 /* 182 /*
182 * These are the runqueue data structures: 183 * These are the runqueue data structures:
183 */ 184 */
184 185
185 #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) 186 #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
186 187
187 typedef struct runqueue runqueue_t; 188 typedef struct runqueue runqueue_t;
188 189
189 struct prio_array { 190 struct prio_array {
190 unsigned int nr_active; 191 unsigned int nr_active;
191 unsigned long bitmap[BITMAP_SIZE]; 192 unsigned long bitmap[BITMAP_SIZE];
192 struct list_head queue[MAX_PRIO]; 193 struct list_head queue[MAX_PRIO];
193 }; 194 };
194 195
195 /* 196 /*
196 * This is the main, per-CPU runqueue data structure. 197 * This is the main, per-CPU runqueue data structure.
197 * 198 *
198 * Locking rule: those places that want to lock multiple runqueues 199 * Locking rule: those places that want to lock multiple runqueues
199 * (such as the load balancing or the thread migration code), lock 200 * (such as the load balancing or the thread migration code), lock
200 * acquire operations must be ordered by ascending &runqueue. 201 * acquire operations must be ordered by ascending &runqueue.
201 */ 202 */
202 struct runqueue { 203 struct runqueue {
203 spinlock_t lock; 204 spinlock_t lock;
204 205
205 /* 206 /*
206 * nr_running and cpu_load should be in the same cacheline because 207 * nr_running and cpu_load should be in the same cacheline because
207 * remote CPUs use both these fields when doing load calculation. 208 * remote CPUs use both these fields when doing load calculation.
208 */ 209 */
209 unsigned long nr_running; 210 unsigned long nr_running;
210 #ifdef CONFIG_SMP 211 #ifdef CONFIG_SMP
211 unsigned long cpu_load[3]; 212 unsigned long cpu_load[3];
212 #endif 213 #endif
213 unsigned long long nr_switches; 214 unsigned long long nr_switches;
214 215
215 /* 216 /*
216 * This is part of a global counter where only the total sum 217 * This is part of a global counter where only the total sum
217 * over all CPUs matters. A task can increase this counter on 218 * over all CPUs matters. A task can increase this counter on
218 * one CPU and if it got migrated afterwards it may decrease 219 * one CPU and if it got migrated afterwards it may decrease
219 * it on another CPU. Always updated under the runqueue lock: 220 * it on another CPU. Always updated under the runqueue lock:
220 */ 221 */
221 unsigned long nr_uninterruptible; 222 unsigned long nr_uninterruptible;
222 223
223 unsigned long expired_timestamp; 224 unsigned long expired_timestamp;
224 unsigned long long timestamp_last_tick; 225 unsigned long long timestamp_last_tick;
225 task_t *curr, *idle; 226 task_t *curr, *idle;
226 struct mm_struct *prev_mm; 227 struct mm_struct *prev_mm;
227 prio_array_t *active, *expired, arrays[2]; 228 prio_array_t *active, *expired, arrays[2];
228 int best_expired_prio; 229 int best_expired_prio;
229 atomic_t nr_iowait; 230 atomic_t nr_iowait;
230 231
231 #ifdef CONFIG_SMP 232 #ifdef CONFIG_SMP
232 struct sched_domain *sd; 233 struct sched_domain *sd;
233 234
234 /* For active balancing */ 235 /* For active balancing */
235 int active_balance; 236 int active_balance;
236 int push_cpu; 237 int push_cpu;
237 238
238 task_t *migration_thread; 239 task_t *migration_thread;
239 struct list_head migration_queue; 240 struct list_head migration_queue;
240 int cpu; 241 int cpu;
241 #endif 242 #endif
242 243
243 #ifdef CONFIG_SCHEDSTATS 244 #ifdef CONFIG_SCHEDSTATS
244 /* latency stats */ 245 /* latency stats */
245 struct sched_info rq_sched_info; 246 struct sched_info rq_sched_info;
246 247
247 /* sys_sched_yield() stats */ 248 /* sys_sched_yield() stats */
248 unsigned long yld_exp_empty; 249 unsigned long yld_exp_empty;
249 unsigned long yld_act_empty; 250 unsigned long yld_act_empty;
250 unsigned long yld_both_empty; 251 unsigned long yld_both_empty;
251 unsigned long yld_cnt; 252 unsigned long yld_cnt;
252 253
253 /* schedule() stats */ 254 /* schedule() stats */
254 unsigned long sched_switch; 255 unsigned long sched_switch;
255 unsigned long sched_cnt; 256 unsigned long sched_cnt;
256 unsigned long sched_goidle; 257 unsigned long sched_goidle;
257 258
258 /* try_to_wake_up() stats */ 259 /* try_to_wake_up() stats */
259 unsigned long ttwu_cnt; 260 unsigned long ttwu_cnt;
260 unsigned long ttwu_local; 261 unsigned long ttwu_local;
261 #endif 262 #endif
262 }; 263 };
263 264
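The 'ascending &runqueue' rule in the locking comment above is what the double-runqueue helpers elsewhere in this file follow; a minimal illustrative sketch of the idea (not code added by this patch):

static void lock_two_runqueues(runqueue_t *rq1, runqueue_t *rq2)
{
	if (rq1 == rq2) {
		spin_lock(&rq1->lock);		/* same queue: take it only once */
	} else if (rq1 < rq2) {			/* lower address is always locked first */
		spin_lock(&rq1->lock);
		spin_lock(&rq2->lock);
	} else {
		spin_lock(&rq2->lock);
		spin_lock(&rq1->lock);
	}
}

Taking the locks in a single global order (here, by runqueue address) is what prevents an AB-BA deadlock between two CPUs balancing against each other.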
264 static DEFINE_PER_CPU(struct runqueue, runqueues); 265 static DEFINE_PER_CPU(struct runqueue, runqueues);
265 266
266 /* 267 /*
267 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 268 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
268 * See detach_destroy_domains: synchronize_sched for details. 269 * See detach_destroy_domains: synchronize_sched for details.
269 * 270 *
270 * The domain tree of any CPU may only be accessed from within 271 * The domain tree of any CPU may only be accessed from within
271 * preempt-disabled sections. 272 * preempt-disabled sections.
272 */ 273 */
273 #define for_each_domain(cpu, domain) \ 274 #define for_each_domain(cpu, domain) \
274 for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent) 275 for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent)
275 276
276 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 277 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
277 #define this_rq() (&__get_cpu_var(runqueues)) 278 #define this_rq() (&__get_cpu_var(runqueues))
278 #define task_rq(p) cpu_rq(task_cpu(p)) 279 #define task_rq(p) cpu_rq(task_cpu(p))
279 #define cpu_curr(cpu) (cpu_rq(cpu)->curr) 280 #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
280 281
281 #ifndef prepare_arch_switch 282 #ifndef prepare_arch_switch
282 # define prepare_arch_switch(next) do { } while (0) 283 # define prepare_arch_switch(next) do { } while (0)
283 #endif 284 #endif
284 #ifndef finish_arch_switch 285 #ifndef finish_arch_switch
285 # define finish_arch_switch(prev) do { } while (0) 286 # define finish_arch_switch(prev) do { } while (0)
286 #endif 287 #endif
287 288
288 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 289 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
289 static inline int task_running(runqueue_t *rq, task_t *p) 290 static inline int task_running(runqueue_t *rq, task_t *p)
290 { 291 {
291 return rq->curr == p; 292 return rq->curr == p;
292 } 293 }
293 294
294 static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) 295 static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
295 { 296 {
296 } 297 }
297 298
298 static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) 299 static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
299 { 300 {
300 #ifdef CONFIG_DEBUG_SPINLOCK 301 #ifdef CONFIG_DEBUG_SPINLOCK
301 /* this is a valid case when another task releases the spinlock */ 302 /* this is a valid case when another task releases the spinlock */
302 rq->lock.owner = current; 303 rq->lock.owner = current;
303 #endif 304 #endif
304 spin_unlock_irq(&rq->lock); 305 spin_unlock_irq(&rq->lock);
305 } 306 }
306 307
307 #else /* __ARCH_WANT_UNLOCKED_CTXSW */ 308 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
308 static inline int task_running(runqueue_t *rq, task_t *p) 309 static inline int task_running(runqueue_t *rq, task_t *p)
309 { 310 {
310 #ifdef CONFIG_SMP 311 #ifdef CONFIG_SMP
311 return p->oncpu; 312 return p->oncpu;
312 #else 313 #else
313 return rq->curr == p; 314 return rq->curr == p;
314 #endif 315 #endif
315 } 316 }
316 317
317 static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) 318 static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
318 { 319 {
319 #ifdef CONFIG_SMP 320 #ifdef CONFIG_SMP
320 /* 321 /*
321 * We can optimise this out completely for !SMP, because the 322 * We can optimise this out completely for !SMP, because the
322 * SMP rebalancing from interrupt is the only thing that cares 323 * SMP rebalancing from interrupt is the only thing that cares
323 * here. 324 * here.
324 */ 325 */
325 next->oncpu = 1; 326 next->oncpu = 1;
326 #endif 327 #endif
327 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 328 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
328 spin_unlock_irq(&rq->lock); 329 spin_unlock_irq(&rq->lock);
329 #else 330 #else
330 spin_unlock(&rq->lock); 331 spin_unlock(&rq->lock);
331 #endif 332 #endif
332 } 333 }
333 334
334 static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) 335 static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
335 { 336 {
336 #ifdef CONFIG_SMP 337 #ifdef CONFIG_SMP
337 /* 338 /*
338 * After ->oncpu is cleared, the task can be moved to a different CPU. 339 * After ->oncpu is cleared, the task can be moved to a different CPU.
339 * We must ensure this doesn't happen until the switch is completely 340 * We must ensure this doesn't happen until the switch is completely
340 * finished. 341 * finished.
341 */ 342 */
342 smp_wmb(); 343 smp_wmb();
343 prev->oncpu = 0; 344 prev->oncpu = 0;
344 #endif 345 #endif
345 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 346 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
346 local_irq_enable(); 347 local_irq_enable();
347 #endif 348 #endif
348 } 349 }
349 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 350 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
350 351
351 /* 352 /*
352 * task_rq_lock - lock the runqueue a given task resides on and disable 353 * task_rq_lock - lock the runqueue a given task resides on and disable
353 * interrupts. Note the ordering: we can safely lookup the task_rq without 354 * interrupts. Note the ordering: we can safely lookup the task_rq without
354 * explicitly disabling preemption. 355 * explicitly disabling preemption.
355 */ 356 */
356 static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) 357 static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
357 __acquires(rq->lock) 358 __acquires(rq->lock)
358 { 359 {
359 struct runqueue *rq; 360 struct runqueue *rq;
360 361
361 repeat_lock_task: 362 repeat_lock_task:
362 local_irq_save(*flags); 363 local_irq_save(*flags);
363 rq = task_rq(p); 364 rq = task_rq(p);
364 spin_lock(&rq->lock); 365 spin_lock(&rq->lock);
365 if (unlikely(rq != task_rq(p))) { 366 if (unlikely(rq != task_rq(p))) {
366 spin_unlock_irqrestore(&rq->lock, *flags); 367 spin_unlock_irqrestore(&rq->lock, *flags);
367 goto repeat_lock_task; 368 goto repeat_lock_task;
368 } 369 }
369 return rq; 370 return rq;
370 } 371 }
371 372
372 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) 373 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
373 __releases(rq->lock) 374 __releases(rq->lock)
374 { 375 {
375 spin_unlock_irqrestore(&rq->lock, *flags); 376 spin_unlock_irqrestore(&rq->lock, *flags);
376 } 377 }
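A typical caller of this pair looks like the following (illustrative pattern; the real callers appear later in this file):

	unsigned long flags;
	runqueue_t *rq;

	rq = task_rq_lock(p, &flags);
	/* while the lock is held, p cannot be moved to another runqueue */
	/* ... inspect or modify p's scheduling state ... */
	task_rq_unlock(rq, &flags);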
377 378
378 #ifdef CONFIG_SCHEDSTATS 379 #ifdef CONFIG_SCHEDSTATS
379 /* 380 /*
380 * bump this up when changing the output format or the meaning of an existing 381 * bump this up when changing the output format or the meaning of an existing
381 * format, so that tools can adapt (or abort) 382 * format, so that tools can adapt (or abort)
382 */ 383 */
383 #define SCHEDSTAT_VERSION 12 384 #define SCHEDSTAT_VERSION 12
384 385
385 static int show_schedstat(struct seq_file *seq, void *v) 386 static int show_schedstat(struct seq_file *seq, void *v)
386 { 387 {
387 int cpu; 388 int cpu;
388 389
389 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); 390 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
390 seq_printf(seq, "timestamp %lu\n", jiffies); 391 seq_printf(seq, "timestamp %lu\n", jiffies);
391 for_each_online_cpu(cpu) { 392 for_each_online_cpu(cpu) {
392 runqueue_t *rq = cpu_rq(cpu); 393 runqueue_t *rq = cpu_rq(cpu);
393 #ifdef CONFIG_SMP 394 #ifdef CONFIG_SMP
394 struct sched_domain *sd; 395 struct sched_domain *sd;
395 int dcnt = 0; 396 int dcnt = 0;
396 #endif 397 #endif
397 398
398 /* runqueue-specific stats */ 399 /* runqueue-specific stats */
399 seq_printf(seq, 400 seq_printf(seq,
400 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", 401 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
401 cpu, rq->yld_both_empty, 402 cpu, rq->yld_both_empty,
402 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, 403 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
403 rq->sched_switch, rq->sched_cnt, rq->sched_goidle, 404 rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
404 rq->ttwu_cnt, rq->ttwu_local, 405 rq->ttwu_cnt, rq->ttwu_local,
405 rq->rq_sched_info.cpu_time, 406 rq->rq_sched_info.cpu_time,
406 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); 407 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
407 408
408 seq_printf(seq, "\n"); 409 seq_printf(seq, "\n");
409 410
410 #ifdef CONFIG_SMP 411 #ifdef CONFIG_SMP
411 /* domain-specific stats */ 412 /* domain-specific stats */
412 preempt_disable(); 413 preempt_disable();
413 for_each_domain(cpu, sd) { 414 for_each_domain(cpu, sd) {
414 enum idle_type itype; 415 enum idle_type itype;
415 char mask_str[NR_CPUS]; 416 char mask_str[NR_CPUS];
416 417
417 cpumask_scnprintf(mask_str, NR_CPUS, sd->span); 418 cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
418 seq_printf(seq, "domain%d %s", dcnt++, mask_str); 419 seq_printf(seq, "domain%d %s", dcnt++, mask_str);
419 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; 420 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
420 itype++) { 421 itype++) {
421 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", 422 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu",
422 sd->lb_cnt[itype], 423 sd->lb_cnt[itype],
423 sd->lb_balanced[itype], 424 sd->lb_balanced[itype],
424 sd->lb_failed[itype], 425 sd->lb_failed[itype],
425 sd->lb_imbalance[itype], 426 sd->lb_imbalance[itype],
426 sd->lb_gained[itype], 427 sd->lb_gained[itype],
427 sd->lb_hot_gained[itype], 428 sd->lb_hot_gained[itype],
428 sd->lb_nobusyq[itype], 429 sd->lb_nobusyq[itype],
429 sd->lb_nobusyg[itype]); 430 sd->lb_nobusyg[itype]);
430 } 431 }
431 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", 432 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
432 sd->alb_cnt, sd->alb_failed, sd->alb_pushed, 433 sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
433 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, 434 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
434 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, 435 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
435 sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); 436 sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
436 } 437 }
437 preempt_enable(); 438 preempt_enable();
438 #endif 439 #endif
439 } 440 }
440 return 0; 441 return 0;
441 } 442 }
442 443
443 static int schedstat_open(struct inode *inode, struct file *file) 444 static int schedstat_open(struct inode *inode, struct file *file)
444 { 445 {
445 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); 446 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
446 char *buf = kmalloc(size, GFP_KERNEL); 447 char *buf = kmalloc(size, GFP_KERNEL);
447 struct seq_file *m; 448 struct seq_file *m;
448 int res; 449 int res;
449 450
450 if (!buf) 451 if (!buf)
451 return -ENOMEM; 452 return -ENOMEM;
452 res = single_open(file, show_schedstat, NULL); 453 res = single_open(file, show_schedstat, NULL);
453 if (!res) { 454 if (!res) {
454 m = file->private_data; 455 m = file->private_data;
455 m->buf = buf; 456 m->buf = buf;
456 m->size = size; 457 m->size = size;
457 } else 458 } else
458 kfree(buf); 459 kfree(buf);
459 return res; 460 return res;
460 } 461 }
461 462
462 struct file_operations proc_schedstat_operations = { 463 struct file_operations proc_schedstat_operations = {
463 .open = schedstat_open, 464 .open = schedstat_open,
464 .read = seq_read, 465 .read = seq_read,
465 .llseek = seq_lseek, 466 .llseek = seq_lseek,
466 .release = single_release, 467 .release = single_release,
467 }; 468 };
468 469
469 # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) 470 # define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
470 # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) 471 # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
471 #else /* !CONFIG_SCHEDSTATS */ 472 #else /* !CONFIG_SCHEDSTATS */
472 # define schedstat_inc(rq, field) do { } while (0) 473 # define schedstat_inc(rq, field) do { } while (0)
473 # define schedstat_add(rq, field, amt) do { } while (0) 474 # define schedstat_add(rq, field, amt) do { } while (0)
474 #endif 475 #endif
475 476
476 /* 477 /*
477 * this_rq_lock - lock this CPU's runqueue and disable interrupts. 478 * this_rq_lock - lock this CPU's runqueue and disable interrupts.
478 */ 479 */
479 static inline runqueue_t *this_rq_lock(void) 480 static inline runqueue_t *this_rq_lock(void)
480 __acquires(rq->lock) 481 __acquires(rq->lock)
481 { 482 {
482 runqueue_t *rq; 483 runqueue_t *rq;
483 484
484 local_irq_disable(); 485 local_irq_disable();
485 rq = this_rq(); 486 rq = this_rq();
486 spin_lock(&rq->lock); 487 spin_lock(&rq->lock);
487 488
488 return rq; 489 return rq;
489 } 490 }
490 491
491 #ifdef CONFIG_SCHEDSTATS 492 #ifdef CONFIG_SCHEDSTATS
492 /* 493 /*
493 * Called when a process is dequeued from the active array and given 494 * Called when a process is dequeued from the active array and given
494 * the cpu. We should note that with the exception of interactive 495 * the cpu. We should note that with the exception of interactive
495 * tasks, the expired queue will become the active queue after the active 496 * tasks, the expired queue will become the active queue after the active
496 * queue is empty, without explicitly dequeuing and requeuing tasks in the 497 * queue is empty, without explicitly dequeuing and requeuing tasks in the
497 * expired queue. (Interactive tasks may be requeued directly to the 498 * expired queue. (Interactive tasks may be requeued directly to the
498 * active queue, thus delaying tasks in the expired queue from running; 499 * active queue, thus delaying tasks in the expired queue from running;
499 * see scheduler_tick()). 500 * see scheduler_tick()).
500 * 501 *
501 * This function is only called from sched_info_arrive(), rather than 502 * This function is only called from sched_info_arrive(), rather than
502 * dequeue_task(). Even though a task may be queued and dequeued multiple 503 * dequeue_task(). Even though a task may be queued and dequeued multiple
503 * times as it is shuffled about, we're really interested in knowing how 504 * times as it is shuffled about, we're really interested in knowing how
504 * long it was from the *first* time it was queued to the time that it 505 * long it was from the *first* time it was queued to the time that it
505 * finally hit a cpu. 506 * finally hit a cpu.
506 */ 507 */
507 static inline void sched_info_dequeued(task_t *t) 508 static inline void sched_info_dequeued(task_t *t)
508 { 509 {
509 t->sched_info.last_queued = 0; 510 t->sched_info.last_queued = 0;
510 } 511 }
511 512
512 /* 513 /*
513 * Called when a task finally hits the cpu. We can now calculate how 514 * Called when a task finally hits the cpu. We can now calculate how
514 * long it was waiting to run. We also note when it began so that we 515 * long it was waiting to run. We also note when it began so that we
515 * can keep stats on how long its timeslice is. 516 * can keep stats on how long its timeslice is.
516 */ 517 */
517 static void sched_info_arrive(task_t *t) 518 static void sched_info_arrive(task_t *t)
518 { 519 {
519 unsigned long now = jiffies, diff = 0; 520 unsigned long now = jiffies, diff = 0;
520 struct runqueue *rq = task_rq(t); 521 struct runqueue *rq = task_rq(t);
521 522
522 if (t->sched_info.last_queued) 523 if (t->sched_info.last_queued)
523 diff = now - t->sched_info.last_queued; 524 diff = now - t->sched_info.last_queued;
524 sched_info_dequeued(t); 525 sched_info_dequeued(t);
525 t->sched_info.run_delay += diff; 526 t->sched_info.run_delay += diff;
526 t->sched_info.last_arrival = now; 527 t->sched_info.last_arrival = now;
527 t->sched_info.pcnt++; 528 t->sched_info.pcnt++;
528 529
529 if (!rq) 530 if (!rq)
530 return; 531 return;
531 532
532 rq->rq_sched_info.run_delay += diff; 533 rq->rq_sched_info.run_delay += diff;
533 rq->rq_sched_info.pcnt++; 534 rq->rq_sched_info.pcnt++;
534 } 535 }
535 536
536 /* 537 /*
537 * Called when a process is queued into either the active or expired 538 * Called when a process is queued into either the active or expired
538 * array. The time is noted and later used to determine how long we 539 * array. The time is noted and later used to determine how long we
539 * had to wait for us to reach the cpu. Since the expired queue will 540 * had to wait for us to reach the cpu. Since the expired queue will
540 * become the active queue after active queue is empty, without dequeuing 541 * become the active queue after active queue is empty, without dequeuing
541 * and requeuing any tasks, we are interested in queuing to either. It 542 * and requeuing any tasks, we are interested in queuing to either. It
542 * is unusual but not impossible for tasks to be dequeued and immediately 543 * is unusual but not impossible for tasks to be dequeued and immediately
543 * requeued in the same or another array: this can happen in sched_yield(), 544 * requeued in the same or another array: this can happen in sched_yield(),
544 * set_user_nice(), and even load_balance() as it moves tasks from runqueue 545 * set_user_nice(), and even load_balance() as it moves tasks from runqueue
545 * to runqueue. 546 * to runqueue.
546 * 547 *
547 * This function is only called from enqueue_task(), but also only updates 548 * This function is only called from enqueue_task(), but also only updates
548 * the timestamp if it is not already set. It's assumed that 549 * the timestamp if it is not already set. It's assumed that
549 * sched_info_dequeued() will clear that stamp when appropriate. 550 * sched_info_dequeued() will clear that stamp when appropriate.
550 */ 551 */
551 static inline void sched_info_queued(task_t *t) 552 static inline void sched_info_queued(task_t *t)
552 { 553 {
553 if (!t->sched_info.last_queued) 554 if (!t->sched_info.last_queued)
554 t->sched_info.last_queued = jiffies; 555 t->sched_info.last_queued = jiffies;
555 } 556 }
556 557
557 /* 558 /*
558 * Called when a process ceases being the active-running process, either 559 * Called when a process ceases being the active-running process, either
559 * voluntarily or involuntarily. Now we can calculate how long we ran. 560 * voluntarily or involuntarily. Now we can calculate how long we ran.
560 */ 561 */
561 static inline void sched_info_depart(task_t *t) 562 static inline void sched_info_depart(task_t *t)
562 { 563 {
563 struct runqueue *rq = task_rq(t); 564 struct runqueue *rq = task_rq(t);
564 unsigned long diff = jiffies - t->sched_info.last_arrival; 565 unsigned long diff = jiffies - t->sched_info.last_arrival;
565 566
566 t->sched_info.cpu_time += diff; 567 t->sched_info.cpu_time += diff;
567 568
568 if (rq) 569 if (rq)
569 rq->rq_sched_info.cpu_time += diff; 570 rq->rq_sched_info.cpu_time += diff;
570 } 571 }
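A worked timeline for the hooks above, in jiffies (hypothetical numbers): a task is queued at t = 1000, first reaches a CPU at t = 1040 and leaves it at t = 1100.

sched_info_queued()  at 1000:  last_queued = 1000
sched_info_arrive()  at 1040:  run_delay += 1040 - 1000 = 40;  last_arrival = 1040;  pcnt++
sched_info_depart()  at 1100:  cpu_time  += 1100 - 1040 = 60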
571 572
572 /* 573 /*
573 * Called when tasks are switched involuntarily due, typically, to expiring 574 * Called when tasks are switched involuntarily due, typically, to expiring
574 * their time slice. (This may also be called when switching to or from 575 * their time slice. (This may also be called when switching to or from
575 * the idle task.) We are only called when prev != next. 576 * the idle task.) We are only called when prev != next.
576 */ 577 */
577 static inline void sched_info_switch(task_t *prev, task_t *next) 578 static inline void sched_info_switch(task_t *prev, task_t *next)
578 { 579 {
579 struct runqueue *rq = task_rq(prev); 580 struct runqueue *rq = task_rq(prev);
580 581
581 /* 582 /*
582 * prev now departs the cpu. It's not interesting to record 583 * prev now departs the cpu. It's not interesting to record
583 * stats about how efficient we were at scheduling the idle 584 * stats about how efficient we were at scheduling the idle
584 * process, however. 585 * process, however.
585 */ 586 */
586 if (prev != rq->idle) 587 if (prev != rq->idle)
587 sched_info_depart(prev); 588 sched_info_depart(prev);
588 589
589 if (next != rq->idle) 590 if (next != rq->idle)
590 sched_info_arrive(next); 591 sched_info_arrive(next);
591 } 592 }
592 #else 593 #else
593 #define sched_info_queued(t) do { } while (0) 594 #define sched_info_queued(t) do { } while (0)
594 #define sched_info_switch(t, next) do { } while (0) 595 #define sched_info_switch(t, next) do { } while (0)
595 #endif /* CONFIG_SCHEDSTATS */ 596 #endif /* CONFIG_SCHEDSTATS */
596 597
597 /* 598 /*
598 * Adding/removing a task to/from a priority array: 599 * Adding/removing a task to/from a priority array:
599 */ 600 */
600 static void dequeue_task(struct task_struct *p, prio_array_t *array) 601 static void dequeue_task(struct task_struct *p, prio_array_t *array)
601 { 602 {
602 array->nr_active--; 603 array->nr_active--;
603 list_del(&p->run_list); 604 list_del(&p->run_list);
604 if (list_empty(array->queue + p->prio)) 605 if (list_empty(array->queue + p->prio))
605 __clear_bit(p->prio, array->bitmap); 606 __clear_bit(p->prio, array->bitmap);
606 } 607 }
607 608
608 static void enqueue_task(struct task_struct *p, prio_array_t *array) 609 static void enqueue_task(struct task_struct *p, prio_array_t *array)
609 { 610 {
610 sched_info_queued(p); 611 sched_info_queued(p);
611 list_add_tail(&p->run_list, array->queue + p->prio); 612 list_add_tail(&p->run_list, array->queue + p->prio);
612 __set_bit(p->prio, array->bitmap); 613 __set_bit(p->prio, array->bitmap);
613 array->nr_active++; 614 array->nr_active++;
614 p->array = array; 615 p->array = array;
615 } 616 }
616 617
617 /* 618 /*
618 * Put task to the end of the run list without the overhead of dequeue 619 * Put task to the end of the run list without the overhead of dequeue
619 * followed by enqueue. 620 * followed by enqueue.
620 */ 621 */
621 static void requeue_task(struct task_struct *p, prio_array_t *array) 622 static void requeue_task(struct task_struct *p, prio_array_t *array)
622 { 623 {
623 list_move_tail(&p->run_list, array->queue + p->prio); 624 list_move_tail(&p->run_list, array->queue + p->prio);
624 } 625 }
625 626
626 static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) 627 static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
627 { 628 {
628 list_add(&p->run_list, array->queue + p->prio); 629 list_add(&p->run_list, array->queue + p->prio);
629 __set_bit(p->prio, array->bitmap); 630 __set_bit(p->prio, array->bitmap);
630 array->nr_active++; 631 array->nr_active++;
631 p->array = array; 632 p->array = array;
632 } 633 }
633 634
634 /* 635 /*
635 * effective_prio - return the priority that is based on the static 636 * effective_prio - return the priority that is based on the static
636 * priority but is modified by bonuses/penalties. 637 * priority but is modified by bonuses/penalties.
637 * 638 *
638 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] 639 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
639 * into the -5 ... 0 ... +5 bonus/penalty range. 640 * into the -5 ... 0 ... +5 bonus/penalty range.
640 * 641 *
641 * We use 25% of the full 0...39 priority range so that: 642 * We use 25% of the full 0...39 priority range so that:
642 * 643 *
643 * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. 644 * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
644 * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. 645 * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
645 * 646 *
646 * Both properties are important to certain workloads. 647 * Both properties are important to certain workloads.
647 */ 648 */
648 static int effective_prio(task_t *p) 649 static int effective_prio(task_t *p)
649 { 650 {
650 int bonus, prio; 651 int bonus, prio;
651 652
652 if (rt_task(p)) 653 if (rt_task(p))
653 return p->prio; 654 return p->prio;
654 655
655 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; 656 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
656 657
657 prio = p->static_prio - bonus; 658 prio = p->static_prio - bonus;
658 if (prio < MAX_RT_PRIO) 659 if (prio < MAX_RT_PRIO)
659 prio = MAX_RT_PRIO; 660 prio = MAX_RT_PRIO;
660 if (prio > MAX_PRIO-1) 661 if (prio > MAX_PRIO-1)
661 prio = MAX_PRIO-1; 662 prio = MAX_PRIO-1;
662 return prio; 663 return prio;
663 } 664 }
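Worked values of effective_prio() for a nice-0 task (static_prio 120), again assuming HZ = 1000 so that MAX_SLEEP_AVG is 1000 jiffies:

sleep_avg =    0 ms:  CURRENT_BONUS =  0,  bonus = -5,  prio = 125   /* penalised CPU hog  */
sleep_avg =  500 ms:  CURRENT_BONUS =  5,  bonus =  0,  prio = 120   /* neutral            */
sleep_avg = 1000 ms:  CURRENT_BONUS = 10,  bonus = +5,  prio = 115   /* fully interactive  */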
664 665
665 /* 666 /*
666 * __activate_task - move a task to the runqueue. 667 * __activate_task - move a task to the runqueue.
667 */ 668 */
668 static inline void __activate_task(task_t *p, runqueue_t *rq) 669 static inline void __activate_task(task_t *p, runqueue_t *rq)
669 { 670 {
670 enqueue_task(p, rq->active); 671 enqueue_task(p, rq->active);
671 rq->nr_running++; 672 rq->nr_running++;
672 } 673 }
673 674
674 /* 675 /*
675 * __activate_idle_task - move idle task to the _front_ of runqueue. 676 * __activate_idle_task - move idle task to the _front_ of runqueue.
676 */ 677 */
677 static inline void __activate_idle_task(task_t *p, runqueue_t *rq) 678 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
678 { 679 {
679 enqueue_task_head(p, rq->active); 680 enqueue_task_head(p, rq->active);
680 rq->nr_running++; 681 rq->nr_running++;
681 } 682 }
682 683
683 static int recalc_task_prio(task_t *p, unsigned long long now) 684 static int recalc_task_prio(task_t *p, unsigned long long now)
684 { 685 {
685 /* Caller must always ensure 'now >= p->timestamp' */ 686 /* Caller must always ensure 'now >= p->timestamp' */
686 unsigned long long __sleep_time = now - p->timestamp; 687 unsigned long long __sleep_time = now - p->timestamp;
687 unsigned long sleep_time; 688 unsigned long sleep_time;
688 689
689 if (unlikely(p->policy == SCHED_BATCH)) 690 if (unlikely(p->policy == SCHED_BATCH))
690 sleep_time = 0; 691 sleep_time = 0;
691 else { 692 else {
692 if (__sleep_time > NS_MAX_SLEEP_AVG) 693 if (__sleep_time > NS_MAX_SLEEP_AVG)
693 sleep_time = NS_MAX_SLEEP_AVG; 694 sleep_time = NS_MAX_SLEEP_AVG;
694 else 695 else
695 sleep_time = (unsigned long)__sleep_time; 696 sleep_time = (unsigned long)__sleep_time;
696 } 697 }
697 698
698 if (likely(sleep_time > 0)) { 699 if (likely(sleep_time > 0)) {
699 /* 700 /*
700 * User tasks that sleep a long time are categorised as 701 * User tasks that sleep a long time are categorised as
701 * idle and will get just interactive status to stay active & 702 * idle and will get just interactive status to stay active &
702 * prevent them suddenly becoming cpu hogs and starving 703 * prevent them suddenly becoming cpu hogs and starving
703 * other processes. 704 * other processes.
704 */ 705 */
705 if (p->mm && p->activated != -1 && 706 if (p->mm && p->activated != -1 &&
706 sleep_time > INTERACTIVE_SLEEP(p)) { 707 sleep_time > INTERACTIVE_SLEEP(p)) {
707 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - 708 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
708 DEF_TIMESLICE); 709 DEF_TIMESLICE);
709 } else { 710 } else {
710 /* 711 /*
711 * Tasks waking from uninterruptible sleep are 712 * Tasks waking from uninterruptible sleep are
712 * limited in their sleep_avg rise as they 713 * limited in their sleep_avg rise as they
713 * are likely to be waiting on I/O 714 * are likely to be waiting on I/O
714 */ 715 */
715 if (p->activated == -1 && p->mm) { 716 if (p->activated == -1 && p->mm) {
716 if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) 717 if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
717 sleep_time = 0; 718 sleep_time = 0;
718 else if (p->sleep_avg + sleep_time >= 719 else if (p->sleep_avg + sleep_time >=
719 INTERACTIVE_SLEEP(p)) { 720 INTERACTIVE_SLEEP(p)) {
720 p->sleep_avg = INTERACTIVE_SLEEP(p); 721 p->sleep_avg = INTERACTIVE_SLEEP(p);
721 sleep_time = 0; 722 sleep_time = 0;
722 } 723 }
723 } 724 }
724 725
725 /* 726 /*
726 * This code gives a bonus to interactive tasks. 727 * This code gives a bonus to interactive tasks.
727 * 728 *
728 * The boost works by updating the 'average sleep time' 729 * The boost works by updating the 'average sleep time'
729 * value here, based on ->timestamp. The more time a 730 * value here, based on ->timestamp. The more time a
730 * task spends sleeping, the higher the average gets - 731 * task spends sleeping, the higher the average gets -
731 * and the higher the priority boost gets as well. 732 * and the higher the priority boost gets as well.
732 */ 733 */
733 p->sleep_avg += sleep_time; 734 p->sleep_avg += sleep_time;
734 735
735 if (p->sleep_avg > NS_MAX_SLEEP_AVG) 736 if (p->sleep_avg > NS_MAX_SLEEP_AVG)
736 p->sleep_avg = NS_MAX_SLEEP_AVG; 737 p->sleep_avg = NS_MAX_SLEEP_AVG;
737 } 738 }
738 } 739 }
739 740
740 return effective_prio(p); 741 return effective_prio(p);
741 } 742 }
742 743
743 /* 744 /*
744 * activate_task - move a task to the runqueue and do priority recalculation 745 * activate_task - move a task to the runqueue and do priority recalculation
745 * 746 *
746 * Update all the scheduling statistics stuff. (sleep average 747 * Update all the scheduling statistics stuff. (sleep average
747 * calculation, priority modifiers, etc.) 748 * calculation, priority modifiers, etc.)
748 */ 749 */
749 static void activate_task(task_t *p, runqueue_t *rq, int local) 750 static void activate_task(task_t *p, runqueue_t *rq, int local)
750 { 751 {
751 unsigned long long now; 752 unsigned long long now;
752 753
753 now = sched_clock(); 754 now = sched_clock();
754 #ifdef CONFIG_SMP 755 #ifdef CONFIG_SMP
755 if (!local) { 756 if (!local) {
756 /* Compensate for drifting sched_clock */ 757 /* Compensate for drifting sched_clock */
757 runqueue_t *this_rq = this_rq(); 758 runqueue_t *this_rq = this_rq();
758 now = (now - this_rq->timestamp_last_tick) 759 now = (now - this_rq->timestamp_last_tick)
759 + rq->timestamp_last_tick; 760 + rq->timestamp_last_tick;
760 } 761 }
761 #endif 762 #endif
762 763
763 if (!rt_task(p)) 764 if (!rt_task(p))
764 p->prio = recalc_task_prio(p, now); 765 p->prio = recalc_task_prio(p, now);
765 766
766 /* 767 /*
767 * This checks to make sure it's not an uninterruptible task 768 * This checks to make sure it's not an uninterruptible task
768 * that is now waking up. 769 * that is now waking up.
769 */ 770 */
770 if (!p->activated) { 771 if (!p->activated) {
771 /* 772 /*
772 * Tasks which were woken up by interrupts (ie. hw events) 773 * Tasks which were woken up by interrupts (ie. hw events)
773 * are most likely of interactive nature. So we give them 774 * are most likely of interactive nature. So we give them
774 * the credit of extending their sleep time to the period 775 * the credit of extending their sleep time to the period
775 * of time they spend on the runqueue, waiting for execution 776 * of time they spend on the runqueue, waiting for execution
776 * on a CPU, first time around: 777 * on a CPU, first time around:
777 */ 778 */
778 if (in_interrupt()) 779 if (in_interrupt())
779 p->activated = 2; 780 p->activated = 2;
780 else { 781 else {
781 /* 782 /*
782 * Normal first-time wakeups get a credit too for 783 * Normal first-time wakeups get a credit too for
783 * on-runqueue time, but it will be weighted down: 784 * on-runqueue time, but it will be weighted down:
784 */ 785 */
785 p->activated = 1; 786 p->activated = 1;
786 } 787 }
787 } 788 }
788 p->timestamp = now; 789 p->timestamp = now;
789 790
790 __activate_task(p, rq); 791 __activate_task(p, rq);
791 } 792 }
792 793
793 /* 794 /*
794 * deactivate_task - remove a task from the runqueue. 795 * deactivate_task - remove a task from the runqueue.
795 */ 796 */
796 static void deactivate_task(struct task_struct *p, runqueue_t *rq) 797 static void deactivate_task(struct task_struct *p, runqueue_t *rq)
797 { 798 {
798 rq->nr_running--; 799 rq->nr_running--;
799 dequeue_task(p, p->array); 800 dequeue_task(p, p->array);
800 p->array = NULL; 801 p->array = NULL;
801 } 802 }
802 803
803 /* 804 /*
804 * resched_task - mark a task 'to be rescheduled now'. 805 * resched_task - mark a task 'to be rescheduled now'.
805 * 806 *
806 * On UP this means the setting of the need_resched flag, on SMP it 807 * On UP this means the setting of the need_resched flag, on SMP it
807 * might also involve a cross-CPU call to trigger the scheduler on 808 * might also involve a cross-CPU call to trigger the scheduler on
808 * the target CPU. 809 * the target CPU.
809 */ 810 */
810 #ifdef CONFIG_SMP 811 #ifdef CONFIG_SMP
811 static void resched_task(task_t *p) 812 static void resched_task(task_t *p)
812 { 813 {
813 int cpu; 814 int cpu;
814 815
815 assert_spin_locked(&task_rq(p)->lock); 816 assert_spin_locked(&task_rq(p)->lock);
816 817
817 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) 818 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
818 return; 819 return;
819 820
820 set_tsk_thread_flag(p, TIF_NEED_RESCHED); 821 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
821 822
822 cpu = task_cpu(p); 823 cpu = task_cpu(p);
823 if (cpu == smp_processor_id()) 824 if (cpu == smp_processor_id())
824 return; 825 return;
825 826
826 /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */ 827 /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */
827 smp_mb(); 828 smp_mb();
828 if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG)) 829 if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG))
829 smp_send_reschedule(cpu); 830 smp_send_reschedule(cpu);
830 } 831 }
831 #else 832 #else
832 static inline void resched_task(task_t *p) 833 static inline void resched_task(task_t *p)
833 { 834 {
834 assert_spin_locked(&task_rq(p)->lock); 835 assert_spin_locked(&task_rq(p)->lock);
835 set_tsk_need_resched(p); 836 set_tsk_need_resched(p);
836 } 837 }
837 #endif 838 #endif
838 839
839 /** 840 /**
840 * task_curr - is this task currently executing on a CPU? 841 * task_curr - is this task currently executing on a CPU?
841 * @p: the task in question. 842 * @p: the task in question.
842 */ 843 */
843 inline int task_curr(const task_t *p) 844 inline int task_curr(const task_t *p)
844 { 845 {
845 return cpu_curr(task_cpu(p)) == p; 846 return cpu_curr(task_cpu(p)) == p;
846 } 847 }
847 848
848 #ifdef CONFIG_SMP 849 #ifdef CONFIG_SMP
849 typedef struct { 850 typedef struct {
850 struct list_head list; 851 struct list_head list;
851 852
852 task_t *task; 853 task_t *task;
853 int dest_cpu; 854 int dest_cpu;
854 855
855 struct completion done; 856 struct completion done;
856 } migration_req_t; 857 } migration_req_t;
857 858
858 /* 859 /*
859 * The task's runqueue lock must be held. 860 * The task's runqueue lock must be held.
860 * Returns true if you have to wait for migration thread. 861 * Returns true if you have to wait for migration thread.
861 */ 862 */
862 static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) 863 static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
863 { 864 {
864 runqueue_t *rq = task_rq(p); 865 runqueue_t *rq = task_rq(p);
865 866
866 /* 867 /*
867 * If the task is not on a runqueue (and not running), then 868 * If the task is not on a runqueue (and not running), then
868 * it is sufficient to simply update the task's cpu field. 869 * it is sufficient to simply update the task's cpu field.
869 */ 870 */
870 if (!p->array && !task_running(rq, p)) { 871 if (!p->array && !task_running(rq, p)) {
871 set_task_cpu(p, dest_cpu); 872 set_task_cpu(p, dest_cpu);
872 return 0; 873 return 0;
873 } 874 }
874 875
875 init_completion(&req->done); 876 init_completion(&req->done);
876 req->task = p; 877 req->task = p;
877 req->dest_cpu = dest_cpu; 878 req->dest_cpu = dest_cpu;
878 list_add(&req->list, &rq->migration_queue); 879 list_add(&req->list, &rq->migration_queue);
879 return 1; 880 return 1;
880 } 881 }
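The return value of migrate_task() tells the caller whether the per-CPU migration thread has to finish the job; the usual calling pattern (mirroring what set_cpus_allowed() does later in this file, shown here only as an illustration) is:

	migration_req_t req;
	unsigned long flags;
	runqueue_t *rq = task_rq_lock(p, &flags);

	if (migrate_task(p, dest_cpu, &req)) {
		/* task is running or runnable: let the migration thread move it */
		task_rq_unlock(rq, &flags);
		wake_up_process(rq->migration_thread);
		wait_for_completion(&req.done);
	} else {
		task_rq_unlock(rq, &flags);
	}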
881 882
882 /* 883 /*
883 * wait_task_inactive - wait for a thread to unschedule. 884 * wait_task_inactive - wait for a thread to unschedule.
884 * 885 *
885 * The caller must ensure that the task *will* unschedule sometime soon, 886 * The caller must ensure that the task *will* unschedule sometime soon,
886 * else this function might spin for a *long* time. This function can't 887 * else this function might spin for a *long* time. This function can't
887 * be called with interrupts off, or it may introduce deadlock with 888 * be called with interrupts off, or it may introduce deadlock with
888 * smp_call_function() if an IPI is sent by the same process we are 889 * smp_call_function() if an IPI is sent by the same process we are
889 * waiting to become inactive. 890 * waiting to become inactive.
890 */ 891 */
891 void wait_task_inactive(task_t *p) 892 void wait_task_inactive(task_t *p)
892 { 893 {
893 unsigned long flags; 894 unsigned long flags;
894 runqueue_t *rq; 895 runqueue_t *rq;
895 int preempted; 896 int preempted;
896 897
897 repeat: 898 repeat:
898 rq = task_rq_lock(p, &flags); 899 rq = task_rq_lock(p, &flags);
899 /* Must be off runqueue entirely, not preempted. */ 900 /* Must be off runqueue entirely, not preempted. */
900 if (unlikely(p->array || task_running(rq, p))) { 901 if (unlikely(p->array || task_running(rq, p))) {
901 /* If it's preempted, we yield. It could be a while. */ 902 /* If it's preempted, we yield. It could be a while. */
902 preempted = !task_running(rq, p); 903 preempted = !task_running(rq, p);
903 task_rq_unlock(rq, &flags); 904 task_rq_unlock(rq, &flags);
904 cpu_relax(); 905 cpu_relax();
905 if (preempted) 906 if (preempted)
906 yield(); 907 yield();
907 goto repeat; 908 goto repeat;
908 } 909 }
909 task_rq_unlock(rq, &flags); 910 task_rq_unlock(rq, &flags);
910 } 911 }
911 912
912 /*** 913 /***
913 * kick_process - kick a running thread to enter/exit the kernel 914 * kick_process - kick a running thread to enter/exit the kernel
914 * @p: the to-be-kicked thread 915 * @p: the to-be-kicked thread
915 * 916 *
916 * Cause a process which is running on another CPU to enter 917 * Cause a process which is running on another CPU to enter
917 * kernel-mode, without any delay. (to get signals handled.) 918 * kernel-mode, without any delay. (to get signals handled.)
918 * 919 *
919 * NOTE: this function doesn't have to take the runqueue lock, 920 * NOTE: this function doesn't have to take the runqueue lock,
920 * because all it wants to ensure is that the remote task enters 921 * because all it wants to ensure is that the remote task enters
921 * the kernel. If the IPI races and the task has been migrated 922 * the kernel. If the IPI races and the task has been migrated
922 * to another CPU then no harm is done and the purpose has been 923 * to another CPU then no harm is done and the purpose has been
923 * achieved as well. 924 * achieved as well.
924 */ 925 */
925 void kick_process(task_t *p) 926 void kick_process(task_t *p)
926 { 927 {
927 int cpu; 928 int cpu;
928 929
929 preempt_disable(); 930 preempt_disable();
930 cpu = task_cpu(p); 931 cpu = task_cpu(p);
931 if ((cpu != smp_processor_id()) && task_curr(p)) 932 if ((cpu != smp_processor_id()) && task_curr(p))
932 smp_send_reschedule(cpu); 933 smp_send_reschedule(cpu);
933 preempt_enable(); 934 preempt_enable();
934 } 935 }
935 936
936 /* 937 /*
937 * Return a low guess at the load of a migration-source cpu. 938 * Return a low guess at the load of a migration-source cpu.
938 * 939 *
939 * We want to under-estimate the load of migration sources, to 940 * We want to under-estimate the load of migration sources, to
940 * balance conservatively. 941 * balance conservatively.
941 */ 942 */
942 static inline unsigned long source_load(int cpu, int type) 943 static inline unsigned long source_load(int cpu, int type)
943 { 944 {
944 runqueue_t *rq = cpu_rq(cpu); 945 runqueue_t *rq = cpu_rq(cpu);
945 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 946 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
946 if (type == 0) 947 if (type == 0)
947 return load_now; 948 return load_now;
948 949
949 return min(rq->cpu_load[type-1], load_now); 950 return min(rq->cpu_load[type-1], load_now);
950 } 951 }
951 952
952 /* 953 /*
953 * Return a high guess at the load of a migration-target cpu 954 * Return a high guess at the load of a migration-target cpu
954 */ 955 */
955 static inline unsigned long target_load(int cpu, int type) 956 static inline unsigned long target_load(int cpu, int type)
956 { 957 {
957 runqueue_t *rq = cpu_rq(cpu); 958 runqueue_t *rq = cpu_rq(cpu);
958 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 959 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
959 if (type == 0) 960 if (type == 0)
960 return load_now; 961 return load_now;
961 962
962 return max(rq->cpu_load[type-1], load_now); 963 return max(rq->cpu_load[type-1], load_now);
963 } 964 }
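
source_load() and target_load() above deliberately bias their guesses in opposite directions: a migration source reports the smaller of its decayed cpu_load history and its instantaneous load, while a migration target reports the larger, so balancing decisions err on the conservative side. A minimal userspace sketch of that min/max biasing; the SCHED_LOAD_SCALE value and the load figures below are illustrative, not taken from a real runqueue:

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL	/* illustrative scale factor */

static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }
static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }

/* low guess: under-estimate a migration source, as source_load() does */
static unsigned long source_load_model(unsigned long nr_running, unsigned long hist_load)
{
	unsigned long load_now = nr_running * SCHED_LOAD_SCALE;
	return min_ul(hist_load, load_now);
}

/* high guess: over-estimate a migration target, as target_load() does */
static unsigned long target_load_model(unsigned long nr_running, unsigned long hist_load)
{
	unsigned long load_now = nr_running * SCHED_LOAD_SCALE;
	return max_ul(hist_load, load_now);
}

int main(void)
{
	/* a cpu with 2 runnable tasks but a decayed load history of ~1.5 tasks */
	unsigned long hist = 3 * SCHED_LOAD_SCALE / 2;

	printf("source guess: %lu, target guess: %lu\n",
	       source_load_model(2, hist), target_load_model(2, hist));
	return 0;
}
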
964 965
965 /* 966 /*
966 * find_idlest_group finds and returns the least busy CPU group within the 967 * find_idlest_group finds and returns the least busy CPU group within the
967 * domain. 968 * domain.
968 */ 969 */
969 static struct sched_group * 970 static struct sched_group *
970 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) 971 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
971 { 972 {
972 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; 973 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
973 unsigned long min_load = ULONG_MAX, this_load = 0; 974 unsigned long min_load = ULONG_MAX, this_load = 0;
974 int load_idx = sd->forkexec_idx; 975 int load_idx = sd->forkexec_idx;
975 int imbalance = 100 + (sd->imbalance_pct-100)/2; 976 int imbalance = 100 + (sd->imbalance_pct-100)/2;
976 977
977 do { 978 do {
978 unsigned long load, avg_load; 979 unsigned long load, avg_load;
979 int local_group; 980 int local_group;
980 int i; 981 int i;
981 982
982 /* Skip over this group if it has no CPUs allowed */ 983 /* Skip over this group if it has no CPUs allowed */
983 if (!cpus_intersects(group->cpumask, p->cpus_allowed)) 984 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
984 goto nextgroup; 985 goto nextgroup;
985 986
986 local_group = cpu_isset(this_cpu, group->cpumask); 987 local_group = cpu_isset(this_cpu, group->cpumask);
987 988
988 /* Tally up the load of all CPUs in the group */ 989 /* Tally up the load of all CPUs in the group */
989 avg_load = 0; 990 avg_load = 0;
990 991
991 for_each_cpu_mask(i, group->cpumask) { 992 for_each_cpu_mask(i, group->cpumask) {
992 /* Bias balancing toward cpus of our domain */ 993 /* Bias balancing toward cpus of our domain */
993 if (local_group) 994 if (local_group)
994 load = source_load(i, load_idx); 995 load = source_load(i, load_idx);
995 else 996 else
996 load = target_load(i, load_idx); 997 load = target_load(i, load_idx);
997 998
998 avg_load += load; 999 avg_load += load;
999 } 1000 }
1000 1001
1001 /* Adjust by relative CPU power of the group */ 1002 /* Adjust by relative CPU power of the group */
1002 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; 1003 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1003 1004
1004 if (local_group) { 1005 if (local_group) {
1005 this_load = avg_load; 1006 this_load = avg_load;
1006 this = group; 1007 this = group;
1007 } else if (avg_load < min_load) { 1008 } else if (avg_load < min_load) {
1008 min_load = avg_load; 1009 min_load = avg_load;
1009 idlest = group; 1010 idlest = group;
1010 } 1011 }
1011 nextgroup: 1012 nextgroup:
1012 group = group->next; 1013 group = group->next;
1013 } while (group != sd->groups); 1014 } while (group != sd->groups);
1014 1015
1015 if (!idlest || 100*this_load < imbalance*min_load) 1016 if (!idlest || 100*this_load < imbalance*min_load)
1016 return NULL; 1017 return NULL;
1017 return idlest; 1018 return idlest;
1018 } 1019 }
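
The closing test, 100*this_load < imbalance*min_load with imbalance = 100 + (imbalance_pct-100)/2, means find_idlest_group() only nominates a remote group when it is lighter than the local one by more than half of the domain's imbalance_pct margin. A worked example of that threshold, using made-up loads and an assumed imbalance_pct of 125:

#include <stdio.h>

int main(void)
{
	unsigned long imbalance_pct = 125;			/* assumed domain setting */
	unsigned long imbalance = 100 + (imbalance_pct - 100) / 2;	/* = 112 */
	unsigned long this_load = 300;				/* local group's scaled load */
	unsigned long remote[] = { 280, 250 };			/* candidate idlest-group loads */

	for (int i = 0; i < 2; i++) {
		unsigned long min_load = remote[i];
		int keep_local = 100 * this_load < imbalance * min_load;

		printf("min_load=%lu -> %s\n", min_load,
		       keep_local ? "return NULL (stay in local group)"
				  : "return idlest (balance away)");
	}
	return 0;
}
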
1019 1020
1020 /* 1021 /*
1021 * find_idlest_cpu - find the idlest runqueue among the cpus in group. 1022 * find_idlest_cpu - find the idlest runqueue among the cpus in group.
1022 */ 1023 */
1023 static int 1024 static int
1024 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) 1025 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1025 { 1026 {
1026 cpumask_t tmp; 1027 cpumask_t tmp;
1027 unsigned long load, min_load = ULONG_MAX; 1028 unsigned long load, min_load = ULONG_MAX;
1028 int idlest = -1; 1029 int idlest = -1;
1029 int i; 1030 int i;
1030 1031
1031 /* Traverse only the allowed CPUs */ 1032 /* Traverse only the allowed CPUs */
1032 cpus_and(tmp, group->cpumask, p->cpus_allowed); 1033 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1033 1034
1034 for_each_cpu_mask(i, tmp) { 1035 for_each_cpu_mask(i, tmp) {
1035 load = source_load(i, 0); 1036 load = source_load(i, 0);
1036 1037
1037 if (load < min_load || (load == min_load && i == this_cpu)) { 1038 if (load < min_load || (load == min_load && i == this_cpu)) {
1038 min_load = load; 1039 min_load = load;
1039 idlest = i; 1040 idlest = i;
1040 } 1041 }
1041 } 1042 }
1042 1043
1043 return idlest; 1044 return idlest;
1044 } 1045 }
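
Within the chosen group, find_idlest_cpu() considers only the task's allowed CPUs and breaks load ties in favour of this_cpu, so the task stays put unless some CPU is strictly lighter. A small standalone model of that tie-break; the per-cpu loads and the allowed mask are invented:

#include <stdio.h>
#include <limits.h>

/* Pick the least-loaded allowed cpu, preferring this_cpu on a tie. */
static int idlest_cpu(const unsigned long *load, const int *allowed,
		      int ncpus, int this_cpu)
{
	unsigned long min_load = ULONG_MAX;
	int idlest = -1;

	for (int i = 0; i < ncpus; i++) {
		if (!allowed[i])
			continue;
		if (load[i] < min_load || (load[i] == min_load && i == this_cpu)) {
			min_load = load[i];
			idlest = i;
		}
	}
	return idlest;
}

int main(void)
{
	unsigned long load[] = { 256, 128, 128, 512 };	/* invented per-cpu loads */
	int allowed[] = { 1, 1, 1, 1 };

	/* cpu2 ties cpu1 at 128, but the caller runs on cpu2, so cpu2 wins */
	printf("idlest cpu: %d\n", idlest_cpu(load, allowed, 4, 2));
	return 0;
}
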
1045 1046
1046 /* 1047 /*
1047 * sched_balance_self: balance the current task (running on cpu) in domains 1048 * sched_balance_self: balance the current task (running on cpu) in domains
1048 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and 1049 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1049 * SD_BALANCE_EXEC. 1050 * SD_BALANCE_EXEC.
1050 * 1051 *
1051 * Balance, ie. select the least loaded group. 1052 * Balance, ie. select the least loaded group.
1052 * 1053 *
1053 * Returns the target CPU number, or the same CPU if no balancing is needed. 1054 * Returns the target CPU number, or the same CPU if no balancing is needed.
1054 * 1055 *
1055 * preempt must be disabled. 1056 * preempt must be disabled.
1056 */ 1057 */
1057 static int sched_balance_self(int cpu, int flag) 1058 static int sched_balance_self(int cpu, int flag)
1058 { 1059 {
1059 struct task_struct *t = current; 1060 struct task_struct *t = current;
1060 struct sched_domain *tmp, *sd = NULL; 1061 struct sched_domain *tmp, *sd = NULL;
1061 1062
1062 for_each_domain(cpu, tmp) 1063 for_each_domain(cpu, tmp)
1063 if (tmp->flags & flag) 1064 if (tmp->flags & flag)
1064 sd = tmp; 1065 sd = tmp;
1065 1066
1066 while (sd) { 1067 while (sd) {
1067 cpumask_t span; 1068 cpumask_t span;
1068 struct sched_group *group; 1069 struct sched_group *group;
1069 int new_cpu; 1070 int new_cpu;
1070 int weight; 1071 int weight;
1071 1072
1072 span = sd->span; 1073 span = sd->span;
1073 group = find_idlest_group(sd, t, cpu); 1074 group = find_idlest_group(sd, t, cpu);
1074 if (!group) 1075 if (!group)
1075 goto nextlevel; 1076 goto nextlevel;
1076 1077
1077 new_cpu = find_idlest_cpu(group, t, cpu); 1078 new_cpu = find_idlest_cpu(group, t, cpu);
1078 if (new_cpu == -1 || new_cpu == cpu) 1079 if (new_cpu == -1 || new_cpu == cpu)
1079 goto nextlevel; 1080 goto nextlevel;
1080 1081
1081 /* Now try balancing at a lower domain level */ 1082 /* Now try balancing at a lower domain level */
1082 cpu = new_cpu; 1083 cpu = new_cpu;
1083 nextlevel: 1084 nextlevel:
1084 sd = NULL; 1085 sd = NULL;
1085 weight = cpus_weight(span); 1086 weight = cpus_weight(span);
1086 for_each_domain(cpu, tmp) { 1087 for_each_domain(cpu, tmp) {
1087 if (weight <= cpus_weight(tmp->span)) 1088 if (weight <= cpus_weight(tmp->span))
1088 break; 1089 break;
1089 if (tmp->flags & flag) 1090 if (tmp->flags & flag)
1090 sd = tmp; 1091 sd = tmp;
1091 } 1092 }
1092 /* while loop will break here if sd == NULL */ 1093 /* while loop will break here if sd == NULL */
1093 } 1094 }
1094 1095
1095 return cpu; 1096 return cpu;
1096 } 1097 }
1097 1098
1098 #endif /* CONFIG_SMP */ 1099 #endif /* CONFIG_SMP */
1099 1100
1100 /* 1101 /*
1101 * wake_idle() will wake a task on an idle cpu if task->cpu is 1102 * wake_idle() will wake a task on an idle cpu if task->cpu is
1102 * not idle and an idle cpu is available. The span of cpus to 1103 * not idle and an idle cpu is available. The span of cpus to
1103 * search starts with cpus closest then further out as needed, 1104 * search starts with cpus closest then further out as needed,
1104 * so we always favor a closer, idle cpu. 1105 * so we always favor a closer, idle cpu.
1105 * 1106 *
1106 * Returns the CPU we should wake onto. 1107 * Returns the CPU we should wake onto.
1107 */ 1108 */
1108 #if defined(ARCH_HAS_SCHED_WAKE_IDLE) 1109 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1109 static int wake_idle(int cpu, task_t *p) 1110 static int wake_idle(int cpu, task_t *p)
1110 { 1111 {
1111 cpumask_t tmp; 1112 cpumask_t tmp;
1112 struct sched_domain *sd; 1113 struct sched_domain *sd;
1113 int i; 1114 int i;
1114 1115
1115 if (idle_cpu(cpu)) 1116 if (idle_cpu(cpu))
1116 return cpu; 1117 return cpu;
1117 1118
1118 for_each_domain(cpu, sd) { 1119 for_each_domain(cpu, sd) {
1119 if (sd->flags & SD_WAKE_IDLE) { 1120 if (sd->flags & SD_WAKE_IDLE) {
1120 cpus_and(tmp, sd->span, p->cpus_allowed); 1121 cpus_and(tmp, sd->span, p->cpus_allowed);
1121 for_each_cpu_mask(i, tmp) { 1122 for_each_cpu_mask(i, tmp) {
1122 if (idle_cpu(i)) 1123 if (idle_cpu(i))
1123 return i; 1124 return i;
1124 } 1125 }
1125 } 1126 }
1126 else 1127 else
1127 break; 1128 break;
1128 } 1129 }
1129 return cpu; 1130 return cpu;
1130 } 1131 }
1131 #else 1132 #else
1132 static inline int wake_idle(int cpu, task_t *p) 1133 static inline int wake_idle(int cpu, task_t *p)
1133 { 1134 {
1134 return cpu; 1135 return cpu;
1135 } 1136 }
1136 #endif 1137 #endif
1137 1138
1138 /*** 1139 /***
1139 * try_to_wake_up - wake up a thread 1140 * try_to_wake_up - wake up a thread
1140 * @p: the to-be-woken-up thread 1141 * @p: the to-be-woken-up thread
1141 * @state: the mask of task states that can be woken 1142 * @state: the mask of task states that can be woken
1142 * @sync: do a synchronous wakeup? 1143 * @sync: do a synchronous wakeup?
1143 * 1144 *
1144 * Put it on the run-queue if it's not already there. The "current" 1145 * Put it on the run-queue if it's not already there. The "current"
1145 * thread is always on the run-queue (except when the actual 1146 * thread is always on the run-queue (except when the actual
1146 * re-schedule is in progress), and as such you're allowed to do 1147 * re-schedule is in progress), and as such you're allowed to do
1147 * the simpler "current->state = TASK_RUNNING" to mark yourself 1148 * the simpler "current->state = TASK_RUNNING" to mark yourself
1148 * runnable without the overhead of this. 1149 * runnable without the overhead of this.
1149 * 1150 *
1150 * returns failure only if the task is already active. 1151 * returns failure only if the task is already active.
1151 */ 1152 */
1152 static int try_to_wake_up(task_t *p, unsigned int state, int sync) 1153 static int try_to_wake_up(task_t *p, unsigned int state, int sync)
1153 { 1154 {
1154 int cpu, this_cpu, success = 0; 1155 int cpu, this_cpu, success = 0;
1155 unsigned long flags; 1156 unsigned long flags;
1156 long old_state; 1157 long old_state;
1157 runqueue_t *rq; 1158 runqueue_t *rq;
1158 #ifdef CONFIG_SMP 1159 #ifdef CONFIG_SMP
1159 unsigned long load, this_load; 1160 unsigned long load, this_load;
1160 struct sched_domain *sd, *this_sd = NULL; 1161 struct sched_domain *sd, *this_sd = NULL;
1161 int new_cpu; 1162 int new_cpu;
1162 #endif 1163 #endif
1163 1164
1164 rq = task_rq_lock(p, &flags); 1165 rq = task_rq_lock(p, &flags);
1165 old_state = p->state; 1166 old_state = p->state;
1166 if (!(old_state & state)) 1167 if (!(old_state & state))
1167 goto out; 1168 goto out;
1168 1169
1169 if (p->array) 1170 if (p->array)
1170 goto out_running; 1171 goto out_running;
1171 1172
1172 cpu = task_cpu(p); 1173 cpu = task_cpu(p);
1173 this_cpu = smp_processor_id(); 1174 this_cpu = smp_processor_id();
1174 1175
1175 #ifdef CONFIG_SMP 1176 #ifdef CONFIG_SMP
1176 if (unlikely(task_running(rq, p))) 1177 if (unlikely(task_running(rq, p)))
1177 goto out_activate; 1178 goto out_activate;
1178 1179
1179 new_cpu = cpu; 1180 new_cpu = cpu;
1180 1181
1181 schedstat_inc(rq, ttwu_cnt); 1182 schedstat_inc(rq, ttwu_cnt);
1182 if (cpu == this_cpu) { 1183 if (cpu == this_cpu) {
1183 schedstat_inc(rq, ttwu_local); 1184 schedstat_inc(rq, ttwu_local);
1184 goto out_set_cpu; 1185 goto out_set_cpu;
1185 } 1186 }
1186 1187
1187 for_each_domain(this_cpu, sd) { 1188 for_each_domain(this_cpu, sd) {
1188 if (cpu_isset(cpu, sd->span)) { 1189 if (cpu_isset(cpu, sd->span)) {
1189 schedstat_inc(sd, ttwu_wake_remote); 1190 schedstat_inc(sd, ttwu_wake_remote);
1190 this_sd = sd; 1191 this_sd = sd;
1191 break; 1192 break;
1192 } 1193 }
1193 } 1194 }
1194 1195
1195 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) 1196 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1196 goto out_set_cpu; 1197 goto out_set_cpu;
1197 1198
1198 /* 1199 /*
1199 * Check for affine wakeup and passive balancing possibilities. 1200 * Check for affine wakeup and passive balancing possibilities.
1200 */ 1201 */
1201 if (this_sd) { 1202 if (this_sd) {
1202 int idx = this_sd->wake_idx; 1203 int idx = this_sd->wake_idx;
1203 unsigned int imbalance; 1204 unsigned int imbalance;
1204 1205
1205 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; 1206 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1206 1207
1207 load = source_load(cpu, idx); 1208 load = source_load(cpu, idx);
1208 this_load = target_load(this_cpu, idx); 1209 this_load = target_load(this_cpu, idx);
1209 1210
1210 new_cpu = this_cpu; /* Wake to this CPU if we can */ 1211 new_cpu = this_cpu; /* Wake to this CPU if we can */
1211 1212
1212 if (this_sd->flags & SD_WAKE_AFFINE) { 1213 if (this_sd->flags & SD_WAKE_AFFINE) {
1213 unsigned long tl = this_load; 1214 unsigned long tl = this_load;
1214 /* 1215 /*
1215 * If sync wakeup then subtract the (maximum possible) 1216 * If sync wakeup then subtract the (maximum possible)
1216 * effect of the currently running task from the load 1217 * effect of the currently running task from the load
1217 * of the current CPU: 1218 * of the current CPU:
1218 */ 1219 */
1219 if (sync) 1220 if (sync)
1220 tl -= SCHED_LOAD_SCALE; 1221 tl -= SCHED_LOAD_SCALE;
1221 1222
1222 if ((tl <= load && 1223 if ((tl <= load &&
1223 tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || 1224 tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
1224 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { 1225 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) {
1225 /* 1226 /*
1226 * This domain has SD_WAKE_AFFINE and 1227 * This domain has SD_WAKE_AFFINE and
1227 * p is cache cold in this domain, and 1228 * p is cache cold in this domain, and
1228 * there is no bad imbalance. 1229 * there is no bad imbalance.
1229 */ 1230 */
1230 schedstat_inc(this_sd, ttwu_move_affine); 1231 schedstat_inc(this_sd, ttwu_move_affine);
1231 goto out_set_cpu; 1232 goto out_set_cpu;
1232 } 1233 }
1233 } 1234 }
1234 1235
1235 /* 1236 /*
1236 * Start passive balancing when half the imbalance_pct 1237 * Start passive balancing when half the imbalance_pct
1237 * limit is reached. 1238 * limit is reached.
1238 */ 1239 */
1239 if (this_sd->flags & SD_WAKE_BALANCE) { 1240 if (this_sd->flags & SD_WAKE_BALANCE) {
1240 if (imbalance*this_load <= 100*load) { 1241 if (imbalance*this_load <= 100*load) {
1241 schedstat_inc(this_sd, ttwu_move_balance); 1242 schedstat_inc(this_sd, ttwu_move_balance);
1242 goto out_set_cpu; 1243 goto out_set_cpu;
1243 } 1244 }
1244 } 1245 }
1245 } 1246 }
1246 1247
1247 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ 1248 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1248 out_set_cpu: 1249 out_set_cpu:
1249 new_cpu = wake_idle(new_cpu, p); 1250 new_cpu = wake_idle(new_cpu, p);
1250 if (new_cpu != cpu) { 1251 if (new_cpu != cpu) {
1251 set_task_cpu(p, new_cpu); 1252 set_task_cpu(p, new_cpu);
1252 task_rq_unlock(rq, &flags); 1253 task_rq_unlock(rq, &flags);
1253 /* might preempt at this point */ 1254 /* might preempt at this point */
1254 rq = task_rq_lock(p, &flags); 1255 rq = task_rq_lock(p, &flags);
1255 old_state = p->state; 1256 old_state = p->state;
1256 if (!(old_state & state)) 1257 if (!(old_state & state))
1257 goto out; 1258 goto out;
1258 if (p->array) 1259 if (p->array)
1259 goto out_running; 1260 goto out_running;
1260 1261
1261 this_cpu = smp_processor_id(); 1262 this_cpu = smp_processor_id();
1262 cpu = task_cpu(p); 1263 cpu = task_cpu(p);
1263 } 1264 }
1264 1265
1265 out_activate: 1266 out_activate:
1266 #endif /* CONFIG_SMP */ 1267 #endif /* CONFIG_SMP */
1267 if (old_state == TASK_UNINTERRUPTIBLE) { 1268 if (old_state == TASK_UNINTERRUPTIBLE) {
1268 rq->nr_uninterruptible--; 1269 rq->nr_uninterruptible--;
1269 /* 1270 /*
1270 * Tasks on involuntary sleep don't earn 1271 * Tasks on involuntary sleep don't earn
1271 * sleep_avg beyond just interactive state. 1272 * sleep_avg beyond just interactive state.
1272 */ 1273 */
1273 p->activated = -1; 1274 p->activated = -1;
1274 } 1275 }
1275 1276
1276 /* 1277 /*
1277 * Tasks that have marked their sleep as noninteractive get 1278 * Tasks that have marked their sleep as noninteractive get
1278 * woken up without updating their sleep average. (i.e. their 1279 * woken up without updating their sleep average. (i.e. their
1279 * sleep is handled in a priority-neutral manner, no priority 1280 * sleep is handled in a priority-neutral manner, no priority
1280 * boost and no penalty.) 1281 * boost and no penalty.)
1281 */ 1282 */
1282 if (old_state & TASK_NONINTERACTIVE) 1283 if (old_state & TASK_NONINTERACTIVE)
1283 __activate_task(p, rq); 1284 __activate_task(p, rq);
1284 else 1285 else
1285 activate_task(p, rq, cpu == this_cpu); 1286 activate_task(p, rq, cpu == this_cpu);
1286 /* 1287 /*
1287 * Sync wakeups (i.e. those types of wakeups where the waker 1288 * Sync wakeups (i.e. those types of wakeups where the waker
1288 * has indicated that it will leave the CPU in short order) 1289 * has indicated that it will leave the CPU in short order)
1289 * don't trigger a preemption, if the woken up task will run on 1290 * don't trigger a preemption, if the woken up task will run on
1290 * this cpu. (in this case the 'I will reschedule' promise of 1291 * this cpu. (in this case the 'I will reschedule' promise of
1291 * the waker guarantees that the freshly woken up task is going 1292 * the waker guarantees that the freshly woken up task is going
1292 * to be considered on this CPU.) 1293 * to be considered on this CPU.)
1293 */ 1294 */
1294 if (!sync || cpu != this_cpu) { 1295 if (!sync || cpu != this_cpu) {
1295 if (TASK_PREEMPTS_CURR(p, rq)) 1296 if (TASK_PREEMPTS_CURR(p, rq))
1296 resched_task(rq->curr); 1297 resched_task(rq->curr);
1297 } 1298 }
1298 success = 1; 1299 success = 1;
1299 1300
1300 out_running: 1301 out_running:
1301 p->state = TASK_RUNNING; 1302 p->state = TASK_RUNNING;
1302 out: 1303 out:
1303 task_rq_unlock(rq, &flags); 1304 task_rq_unlock(rq, &flags);
1304 1305
1305 return success; 1306 return success;
1306 } 1307 }
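
The SD_WAKE_AFFINE branch above pulls the wakee to the waking CPU only when that does not create a bad imbalance: either both CPUs are nearly idle, or the waker's load (with one SCHED_LOAD_SCALE subtracted for a sync wakeup, and one added back for the wakee it is about to gain) stays within the imbalance margin of the wakee's old CPU. A standalone model of that test; the load values, imbalance_pct and scale below are invented for illustration:

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL	/* illustrative scale factor */

/* Standalone model of the SD_WAKE_AFFINE test; every input is invented. */
static int wake_affine(unsigned long this_load, unsigned long remote_load,
		       unsigned long remote_target_load,
		       unsigned int imbalance_pct, int sync)
{
	unsigned long imbalance = 100 + (imbalance_pct - 100) / 2;
	unsigned long tl = this_load;

	if (sync)			/* waker has promised to sleep shortly */
		tl -= SCHED_LOAD_SCALE;

	return (tl <= remote_load &&
		tl + remote_target_load <= SCHED_LOAD_SCALE) ||
	       100 * (tl + SCHED_LOAD_SCALE) <= imbalance * remote_load;
}

int main(void)
{
	/* both CPUs currently carry two tasks' worth of load */
	unsigned long here = 2 * SCHED_LOAD_SCALE, there = 2 * SCHED_LOAD_SCALE;

	printf("sync wakeup:  %s\n", wake_affine(here, there, there, 125, 1) ?
	       "pull to waking CPU" : "leave on old CPU");
	printf("async wakeup: %s\n", wake_affine(here, there, there, 125, 0) ?
	       "pull to waking CPU" : "leave on old CPU");
	return 0;
}

With these numbers only the sync wakeup is pulled to the waking CPU, which is the point of the sync hint: the waker's own load is about to disappear, so pulling the wakee there is cheap.
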
1307 1308
1308 int fastcall wake_up_process(task_t *p) 1309 int fastcall wake_up_process(task_t *p)
1309 { 1310 {
1310 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | 1311 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1311 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); 1312 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1312 } 1313 }
1313 1314
1314 EXPORT_SYMBOL(wake_up_process); 1315 EXPORT_SYMBOL(wake_up_process);
1315 1316
1316 int fastcall wake_up_state(task_t *p, unsigned int state) 1317 int fastcall wake_up_state(task_t *p, unsigned int state)
1317 { 1318 {
1318 return try_to_wake_up(p, state, 0); 1319 return try_to_wake_up(p, state, 0);
1319 } 1320 }
1320 1321
1321 /* 1322 /*
1322 * Perform scheduler related setup for a newly forked process p. 1323 * Perform scheduler related setup for a newly forked process p.
1323 * p is forked by current. 1324 * p is forked by current.
1324 */ 1325 */
1325 void fastcall sched_fork(task_t *p, int clone_flags) 1326 void fastcall sched_fork(task_t *p, int clone_flags)
1326 { 1327 {
1327 int cpu = get_cpu(); 1328 int cpu = get_cpu();
1328 1329
1329 #ifdef CONFIG_SMP 1330 #ifdef CONFIG_SMP
1330 cpu = sched_balance_self(cpu, SD_BALANCE_FORK); 1331 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1331 #endif 1332 #endif
1332 set_task_cpu(p, cpu); 1333 set_task_cpu(p, cpu);
1333 1334
1334 /* 1335 /*
1335 * We mark the process as running here, but have not actually 1336 * We mark the process as running here, but have not actually
1336 * inserted it onto the runqueue yet. This guarantees that 1337 * inserted it onto the runqueue yet. This guarantees that
1337 * nobody will actually run it, and a signal or other external 1338 * nobody will actually run it, and a signal or other external
1338 * event cannot wake it up and insert it on the runqueue either. 1339 * event cannot wake it up and insert it on the runqueue either.
1339 */ 1340 */
1340 p->state = TASK_RUNNING; 1341 p->state = TASK_RUNNING;
1341 INIT_LIST_HEAD(&p->run_list); 1342 INIT_LIST_HEAD(&p->run_list);
1342 p->array = NULL; 1343 p->array = NULL;
1343 #ifdef CONFIG_SCHEDSTATS 1344 #ifdef CONFIG_SCHEDSTATS
1344 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1345 memset(&p->sched_info, 0, sizeof(p->sched_info));
1345 #endif 1346 #endif
1346 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 1347 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1347 p->oncpu = 0; 1348 p->oncpu = 0;
1348 #endif 1349 #endif
1349 #ifdef CONFIG_PREEMPT 1350 #ifdef CONFIG_PREEMPT
1350 /* Want to start with kernel preemption disabled. */ 1351 /* Want to start with kernel preemption disabled. */
1351 task_thread_info(p)->preempt_count = 1; 1352 task_thread_info(p)->preempt_count = 1;
1352 #endif 1353 #endif
1353 /* 1354 /*
1354 * Share the timeslice between parent and child, thus the 1355 * Share the timeslice between parent and child, thus the
1355 * total amount of pending timeslices in the system doesn't change, 1356 * total amount of pending timeslices in the system doesn't change,
1356 * resulting in more scheduling fairness. 1357 * resulting in more scheduling fairness.
1357 */ 1358 */
1358 local_irq_disable(); 1359 local_irq_disable();
1359 p->time_slice = (current->time_slice + 1) >> 1; 1360 p->time_slice = (current->time_slice + 1) >> 1;
1360 /* 1361 /*
1361 * The remainder of the first timeslice might be recovered by 1362 * The remainder of the first timeslice might be recovered by
1362 * the parent if the child exits early enough. 1363 * the parent if the child exits early enough.
1363 */ 1364 */
1364 p->first_time_slice = 1; 1365 p->first_time_slice = 1;
1365 current->time_slice >>= 1; 1366 current->time_slice >>= 1;
1366 p->timestamp = sched_clock(); 1367 p->timestamp = sched_clock();
1367 if (unlikely(!current->time_slice)) { 1368 if (unlikely(!current->time_slice)) {
1368 /* 1369 /*
1369 * This case is rare, it happens when the parent has only 1370 * This case is rare, it happens when the parent has only
1370 * a single jiffy left from its timeslice. Taking the 1371 * a single jiffy left from its timeslice. Taking the
1371 * runqueue lock is not a problem. 1372 * runqueue lock is not a problem.
1372 */ 1373 */
1373 current->time_slice = 1; 1374 current->time_slice = 1;
1374 scheduler_tick(); 1375 scheduler_tick();
1375 } 1376 }
1376 local_irq_enable(); 1377 local_irq_enable();
1377 put_cpu(); 1378 put_cpu();
1378 } 1379 }
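
The timeslice split in sched_fork() gives the child the rounded-up half of the parent's remaining slice and leaves the parent the rounded-down half, so no ticks are invented; when the parent ends up with nothing, it is granted a single jiffy that is immediately charged via scheduler_tick(). A few lines of arithmetic, with made-up tick counts, showing the split:

#include <stdio.h>

/* Models sched_fork()'s parent/child split for a few made-up timeslices. */
int main(void)
{
	int samples[] = { 7, 2, 1 };

	for (int i = 0; i < 3; i++) {
		int parent = samples[i];
		int child = (parent + 1) >> 1;	/* rounded-up half goes to the child */

		parent >>= 1;			/* rounded-down half stays with the parent */
		printf("parent had %d -> child %d, parent %d%s\n",
		       samples[i], child, parent,
		       parent ? "" : " (parent is given 1 tick and scheduler_tick() runs)");
	}
	return 0;
}
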
1379 1380
1380 /* 1381 /*
1381 * wake_up_new_task - wake up a newly created task for the first time. 1382 * wake_up_new_task - wake up a newly created task for the first time.
1382 * 1383 *
1383 * This function will do some initial scheduler statistics housekeeping 1384 * This function will do some initial scheduler statistics housekeeping
1384 * that must be done for every newly created context, then puts the task 1385 * that must be done for every newly created context, then puts the task
1385 * on the runqueue and wakes it. 1386 * on the runqueue and wakes it.
1386 */ 1387 */
1387 void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) 1388 void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
1388 { 1389 {
1389 unsigned long flags; 1390 unsigned long flags;
1390 int this_cpu, cpu; 1391 int this_cpu, cpu;
1391 runqueue_t *rq, *this_rq; 1392 runqueue_t *rq, *this_rq;
1392 1393
1393 rq = task_rq_lock(p, &flags); 1394 rq = task_rq_lock(p, &flags);
1394 BUG_ON(p->state != TASK_RUNNING); 1395 BUG_ON(p->state != TASK_RUNNING);
1395 this_cpu = smp_processor_id(); 1396 this_cpu = smp_processor_id();
1396 cpu = task_cpu(p); 1397 cpu = task_cpu(p);
1397 1398
1398 /* 1399 /*
1399 * We decrease the sleep average of forking parents 1400 * We decrease the sleep average of forking parents
1400 * and children as well, to keep max-interactive tasks 1401 * and children as well, to keep max-interactive tasks
1401 * from forking tasks that are max-interactive. The parent 1402 * from forking tasks that are max-interactive. The parent
1402 * (current) is done further down, under its lock. 1403 * (current) is done further down, under its lock.
1403 */ 1404 */
1404 p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * 1405 p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
1405 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); 1406 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1406 1407
1407 p->prio = effective_prio(p); 1408 p->prio = effective_prio(p);
1408 1409
1409 if (likely(cpu == this_cpu)) { 1410 if (likely(cpu == this_cpu)) {
1410 if (!(clone_flags & CLONE_VM)) { 1411 if (!(clone_flags & CLONE_VM)) {
1411 /* 1412 /*
1412 * The VM isn't cloned, so we're in a good position to 1413 * The VM isn't cloned, so we're in a good position to
1413 * do child-runs-first in anticipation of an exec. This 1414 * do child-runs-first in anticipation of an exec. This
1414 * usually avoids a lot of COW overhead. 1415 * usually avoids a lot of COW overhead.
1415 */ 1416 */
1416 if (unlikely(!current->array)) 1417 if (unlikely(!current->array))
1417 __activate_task(p, rq); 1418 __activate_task(p, rq);
1418 else { 1419 else {
1419 p->prio = current->prio; 1420 p->prio = current->prio;
1420 list_add_tail(&p->run_list, &current->run_list); 1421 list_add_tail(&p->run_list, &current->run_list);
1421 p->array = current->array; 1422 p->array = current->array;
1422 p->array->nr_active++; 1423 p->array->nr_active++;
1423 rq->nr_running++; 1424 rq->nr_running++;
1424 } 1425 }
1425 set_need_resched(); 1426 set_need_resched();
1426 } else 1427 } else
1427 /* Run child last */ 1428 /* Run child last */
1428 __activate_task(p, rq); 1429 __activate_task(p, rq);
1429 /* 1430 /*
1430 * We skip the following code due to cpu == this_cpu 1431 * We skip the following code due to cpu == this_cpu
1431 * 1432 *
1432 * task_rq_unlock(rq, &flags); 1433 * task_rq_unlock(rq, &flags);
1433 * this_rq = task_rq_lock(current, &flags); 1434 * this_rq = task_rq_lock(current, &flags);
1434 */ 1435 */
1435 this_rq = rq; 1436 this_rq = rq;
1436 } else { 1437 } else {
1437 this_rq = cpu_rq(this_cpu); 1438 this_rq = cpu_rq(this_cpu);
1438 1439
1439 /* 1440 /*
1440 * Not the local CPU - must adjust timestamp. This should 1441 * Not the local CPU - must adjust timestamp. This should
1441 * get optimised away in the !CONFIG_SMP case. 1442 * get optimised away in the !CONFIG_SMP case.
1442 */ 1443 */
1443 p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) 1444 p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
1444 + rq->timestamp_last_tick; 1445 + rq->timestamp_last_tick;
1445 __activate_task(p, rq); 1446 __activate_task(p, rq);
1446 if (TASK_PREEMPTS_CURR(p, rq)) 1447 if (TASK_PREEMPTS_CURR(p, rq))
1447 resched_task(rq->curr); 1448 resched_task(rq->curr);
1448 1449
1449 /* 1450 /*
1450 * Parent and child are on different CPUs, now get the 1451 * Parent and child are on different CPUs, now get the
1451 * parent runqueue to update the parent's ->sleep_avg: 1452 * parent runqueue to update the parent's ->sleep_avg:
1452 */ 1453 */
1453 task_rq_unlock(rq, &flags); 1454 task_rq_unlock(rq, &flags);
1454 this_rq = task_rq_lock(current, &flags); 1455 this_rq = task_rq_lock(current, &flags);
1455 } 1456 }
1456 current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * 1457 current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
1457 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); 1458 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1458 task_rq_unlock(this_rq, &flags); 1459 task_rq_unlock(this_rq, &flags);
1459 } 1460 }
1460 1461
1461 /* 1462 /*
1462 * Potentially available exiting-child timeslices are 1463 * Potentially available exiting-child timeslices are
1463 * retrieved here - this way the parent does not get 1464 * retrieved here - this way the parent does not get
1464 * penalized for creating too many threads. 1465 * penalized for creating too many threads.
1465 * 1466 *
1466 * (this cannot be used to 'generate' timeslices 1467 * (this cannot be used to 'generate' timeslices
1467 * artificially, because any timeslice recovered here 1468 * artificially, because any timeslice recovered here
1468 * was given away by the parent in the first place.) 1469 * was given away by the parent in the first place.)
1469 */ 1470 */
1470 void fastcall sched_exit(task_t *p) 1471 void fastcall sched_exit(task_t *p)
1471 { 1472 {
1472 unsigned long flags; 1473 unsigned long flags;
1473 runqueue_t *rq; 1474 runqueue_t *rq;
1474 1475
1475 /* 1476 /*
1476 * If the child was a (relative-) CPU hog then decrease 1477 * If the child was a (relative-) CPU hog then decrease
1477 * the sleep_avg of the parent as well. 1478 * the sleep_avg of the parent as well.
1478 */ 1479 */
1479 rq = task_rq_lock(p->parent, &flags); 1480 rq = task_rq_lock(p->parent, &flags);
1480 if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { 1481 if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
1481 p->parent->time_slice += p->time_slice; 1482 p->parent->time_slice += p->time_slice;
1482 if (unlikely(p->parent->time_slice > task_timeslice(p))) 1483 if (unlikely(p->parent->time_slice > task_timeslice(p)))
1483 p->parent->time_slice = task_timeslice(p); 1484 p->parent->time_slice = task_timeslice(p);
1484 } 1485 }
1485 if (p->sleep_avg < p->parent->sleep_avg) 1486 if (p->sleep_avg < p->parent->sleep_avg)
1486 p->parent->sleep_avg = p->parent->sleep_avg / 1487 p->parent->sleep_avg = p->parent->sleep_avg /
1487 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / 1488 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
1488 (EXIT_WEIGHT + 1); 1489 (EXIT_WEIGHT + 1);
1489 task_rq_unlock(rq, &flags); 1490 task_rq_unlock(rq, &flags);
1490 } 1491 }
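
When the exiting child looks like a CPU hog (its sleep_avg is below the parent's), sched_exit() drags the parent's interactivity average toward the child's with an EXIT_WEIGHT:1 weighted mean. A quick numeric illustration; EXIT_WEIGHT = 3 is assumed here, since the macro is defined elsewhere in this file, and the sleep_avg figures are arbitrary:

#include <stdio.h>

#define EXIT_WEIGHT 3	/* assumed value; the real constant is defined earlier in sched.c */

int main(void)
{
	unsigned long parent = 800, child = 200;	/* arbitrary sleep_avg figures */

	if (child < parent)
		parent = parent / (EXIT_WEIGHT + 1) * EXIT_WEIGHT +
			 child / (EXIT_WEIGHT + 1);

	/* 800 -> 650: three parts old parent average, one part hoggish child */
	printf("parent sleep_avg after child exit: %lu\n", parent);
	return 0;
}
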
1491 1492
1492 /** 1493 /**
1493 * prepare_task_switch - prepare to switch tasks 1494 * prepare_task_switch - prepare to switch tasks
1494 * @rq: the runqueue preparing to switch 1495 * @rq: the runqueue preparing to switch
1495 * @next: the task we are going to switch to. 1496 * @next: the task we are going to switch to.
1496 * 1497 *
1497 * This is called with the rq lock held and interrupts off. It must 1498 * This is called with the rq lock held and interrupts off. It must
1498 * be paired with a subsequent finish_task_switch after the context 1499 * be paired with a subsequent finish_task_switch after the context
1499 * switch. 1500 * switch.
1500 * 1501 *
1501 * prepare_task_switch sets up locking and calls architecture specific 1502 * prepare_task_switch sets up locking and calls architecture specific
1502 * hooks. 1503 * hooks.
1503 */ 1504 */
1504 static inline void prepare_task_switch(runqueue_t *rq, task_t *next) 1505 static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
1505 { 1506 {
1506 prepare_lock_switch(rq, next); 1507 prepare_lock_switch(rq, next);
1507 prepare_arch_switch(next); 1508 prepare_arch_switch(next);
1508 } 1509 }
1509 1510
1510 /** 1511 /**
1511 * finish_task_switch - clean up after a task-switch 1512 * finish_task_switch - clean up after a task-switch
1512 * @rq: runqueue associated with task-switch 1513 * @rq: runqueue associated with task-switch
1513 * @prev: the thread we just switched away from. 1514 * @prev: the thread we just switched away from.
1514 * 1515 *
1515 * finish_task_switch must be called after the context switch, paired 1516 * finish_task_switch must be called after the context switch, paired
1516 * with a prepare_task_switch call before the context switch. 1517 * with a prepare_task_switch call before the context switch.
1517 * finish_task_switch will reconcile locking set up by prepare_task_switch, 1518 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1518 * and do any other architecture-specific cleanup actions. 1519 * and do any other architecture-specific cleanup actions.
1519 * 1520 *
1520 * Note that we may have delayed dropping an mm in context_switch(). If 1521 * Note that we may have delayed dropping an mm in context_switch(). If
1521 * so, we finish that here outside of the runqueue lock. (Doing it 1522 * so, we finish that here outside of the runqueue lock. (Doing it
1522 * with the lock held can cause deadlocks; see schedule() for 1523 * with the lock held can cause deadlocks; see schedule() for
1523 * details.) 1524 * details.)
1524 */ 1525 */
1525 static inline void finish_task_switch(runqueue_t *rq, task_t *prev) 1526 static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
1526 __releases(rq->lock) 1527 __releases(rq->lock)
1527 { 1528 {
1528 struct mm_struct *mm = rq->prev_mm; 1529 struct mm_struct *mm = rq->prev_mm;
1529 unsigned long prev_task_flags; 1530 unsigned long prev_task_flags;
1530 1531
1531 rq->prev_mm = NULL; 1532 rq->prev_mm = NULL;
1532 1533
1533 /* 1534 /*
1534 * A task struct has one reference for the use as "current". 1535 * A task struct has one reference for the use as "current".
1535 * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and 1536 * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and
1536 * calls schedule one last time. The schedule call will never return, 1537 * calls schedule one last time. The schedule call will never return,
1537 * and the scheduled task must drop that reference. 1538 * and the scheduled task must drop that reference.
1538 * The test for EXIT_ZOMBIE must occur while the runqueue locks are 1539 * The test for EXIT_ZOMBIE must occur while the runqueue locks are
1539 * still held, otherwise prev could be scheduled on another cpu, die 1540 * still held, otherwise prev could be scheduled on another cpu, die
1540 * there before we look at prev->state, and then the reference would 1541 * there before we look at prev->state, and then the reference would
1541 * be dropped twice. 1542 * be dropped twice.
1542 * Manfred Spraul <manfred@colorfullife.com> 1543 * Manfred Spraul <manfred@colorfullife.com>
1543 */ 1544 */
1544 prev_task_flags = prev->flags; 1545 prev_task_flags = prev->flags;
1545 finish_arch_switch(prev); 1546 finish_arch_switch(prev);
1546 finish_lock_switch(rq, prev); 1547 finish_lock_switch(rq, prev);
1547 if (mm) 1548 if (mm)
1548 mmdrop(mm); 1549 mmdrop(mm);
1549 if (unlikely(prev_task_flags & PF_DEAD)) 1550 if (unlikely(prev_task_flags & PF_DEAD)) {
1551 /*
1552 * Remove function-return probe instances associated with this
1553 * task and put them back on the free list.
1554 */
1555 kprobe_flush_task(prev);
1550 put_task_struct(prev); 1556 put_task_struct(prev);
1557 }
1551 } 1558 }
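
This hunk is the scheduler half of the fix described in the commit message: once a PF_DEAD task is switched away from for the last time, kprobe_flush_task() returns any function-return probe instances still charged to that task to the kretprobe free list before the final task reference is dropped, so an exiting task that never returns from a probed function (schedule() being the motivating case) cannot leak instances. A rough userspace model of that flush-on-final-switch idea; the type names and list handling below are illustrative only, not the actual kprobes implementation:

#include <stdio.h>

/* Illustrative stand-ins only; these are not the real kernel types. */
struct task { int pid; };

struct ri {				/* models one kretprobe instance */
	struct task *owner;
	struct ri *next;
};

static struct ri *free_list;		/* models the kretprobe free list */
static struct ri *used_list;		/* instances handed out at function entry */

static void flush_task(struct task *tk)
{
	struct ri **pp = &used_list;

	/* recycle every in-use instance still owned by the dead task */
	while (*pp) {
		struct ri *ri = *pp;

		if (ri->owner == tk) {
			*pp = ri->next;
			ri->owner = NULL;
			ri->next = free_list;
			free_list = ri;
		} else {
			pp = &ri->next;
		}
	}
}

int main(void)
{
	struct task dead = { .pid = 42 };
	struct ri a = { .owner = &dead, .next = NULL };

	used_list = &a;			/* grabbed on entry, never returned */
	flush_task(&dead);		/* what finish_task_switch() now triggers */
	printf("instance recycled: %s\n", free_list == &a ? "yes" : "no");
	return 0;
}

Note that in the hunk above the flush runs before put_task_struct(), while prev is still guaranteed to be a valid task pointer on this CPU.
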
1552 1559
1553 /** 1560 /**
1554 * schedule_tail - first thing a freshly forked thread must call. 1561 * schedule_tail - first thing a freshly forked thread must call.
1555 * @prev: the thread we just switched away from. 1562 * @prev: the thread we just switched away from.
1556 */ 1563 */
1557 asmlinkage void schedule_tail(task_t *prev) 1564 asmlinkage void schedule_tail(task_t *prev)
1558 __releases(rq->lock) 1565 __releases(rq->lock)
1559 { 1566 {
1560 runqueue_t *rq = this_rq(); 1567 runqueue_t *rq = this_rq();
1561 finish_task_switch(rq, prev); 1568 finish_task_switch(rq, prev);
1562 #ifdef __ARCH_WANT_UNLOCKED_CTXSW 1569 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
1563 /* In this case, finish_task_switch does not reenable preemption */ 1570 /* In this case, finish_task_switch does not reenable preemption */
1564 preempt_enable(); 1571 preempt_enable();
1565 #endif 1572 #endif
1566 if (current->set_child_tid) 1573 if (current->set_child_tid)
1567 put_user(current->pid, current->set_child_tid); 1574 put_user(current->pid, current->set_child_tid);
1568 } 1575 }
1569 1576
1570 /* 1577 /*
1571 * context_switch - switch to the new MM and the new 1578 * context_switch - switch to the new MM and the new
1572 * thread's register state. 1579 * thread's register state.
1573 */ 1580 */
1574 static inline 1581 static inline
1575 task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) 1582 task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
1576 { 1583 {
1577 struct mm_struct *mm = next->mm; 1584 struct mm_struct *mm = next->mm;
1578 struct mm_struct *oldmm = prev->active_mm; 1585 struct mm_struct *oldmm = prev->active_mm;
1579 1586
1580 if (unlikely(!mm)) { 1587 if (unlikely(!mm)) {
1581 next->active_mm = oldmm; 1588 next->active_mm = oldmm;
1582 atomic_inc(&oldmm->mm_count); 1589 atomic_inc(&oldmm->mm_count);
1583 enter_lazy_tlb(oldmm, next); 1590 enter_lazy_tlb(oldmm, next);
1584 } else 1591 } else
1585 switch_mm(oldmm, mm, next); 1592 switch_mm(oldmm, mm, next);
1586 1593
1587 if (unlikely(!prev->mm)) { 1594 if (unlikely(!prev->mm)) {
1588 prev->active_mm = NULL; 1595 prev->active_mm = NULL;
1589 WARN_ON(rq->prev_mm); 1596 WARN_ON(rq->prev_mm);
1590 rq->prev_mm = oldmm; 1597 rq->prev_mm = oldmm;
1591 } 1598 }
1592 1599
1593 /* Here we just switch the register state and the stack. */ 1600 /* Here we just switch the register state and the stack. */
1594 switch_to(prev, next, prev); 1601 switch_to(prev, next, prev);
1595 1602
1596 return prev; 1603 return prev;
1597 } 1604 }
1598 1605
1599 /* 1606 /*
1600 * nr_running, nr_uninterruptible and nr_context_switches: 1607 * nr_running, nr_uninterruptible and nr_context_switches:
1601 * 1608 *
1602 * externally visible scheduler statistics: current number of runnable 1609 * externally visible scheduler statistics: current number of runnable
1603 * threads, current number of uninterruptible-sleeping threads, total 1610 * threads, current number of uninterruptible-sleeping threads, total
1604 * number of context switches performed since bootup. 1611 * number of context switches performed since bootup.
1605 */ 1612 */
1606 unsigned long nr_running(void) 1613 unsigned long nr_running(void)
1607 { 1614 {
1608 unsigned long i, sum = 0; 1615 unsigned long i, sum = 0;
1609 1616
1610 for_each_online_cpu(i) 1617 for_each_online_cpu(i)
1611 sum += cpu_rq(i)->nr_running; 1618 sum += cpu_rq(i)->nr_running;
1612 1619
1613 return sum; 1620 return sum;
1614 } 1621 }
1615 1622
1616 unsigned long nr_uninterruptible(void) 1623 unsigned long nr_uninterruptible(void)
1617 { 1624 {
1618 unsigned long i, sum = 0; 1625 unsigned long i, sum = 0;
1619 1626
1620 for_each_cpu(i) 1627 for_each_cpu(i)
1621 sum += cpu_rq(i)->nr_uninterruptible; 1628 sum += cpu_rq(i)->nr_uninterruptible;
1622 1629
1623 /* 1630 /*
1624 * Since we read the counters lockless, it might be slightly 1631 * Since we read the counters lockless, it might be slightly
1625 * inaccurate. Do not allow it to go below zero though: 1632 * inaccurate. Do not allow it to go below zero though:
1626 */ 1633 */
1627 if (unlikely((long)sum < 0)) 1634 if (unlikely((long)sum < 0))
1628 sum = 0; 1635 sum = 0;
1629 1636
1630 return sum; 1637 return sum;
1631 } 1638 }
1632 1639
1633 unsigned long long nr_context_switches(void) 1640 unsigned long long nr_context_switches(void)
1634 { 1641 {
1635 unsigned long long i, sum = 0; 1642 unsigned long long i, sum = 0;
1636 1643
1637 for_each_cpu(i) 1644 for_each_cpu(i)
1638 sum += cpu_rq(i)->nr_switches; 1645 sum += cpu_rq(i)->nr_switches;
1639 1646
1640 return sum; 1647 return sum;
1641 } 1648 }
1642 1649
1643 unsigned long nr_iowait(void) 1650 unsigned long nr_iowait(void)
1644 { 1651 {
1645 unsigned long i, sum = 0; 1652 unsigned long i, sum = 0;
1646 1653
1647 for_each_cpu(i) 1654 for_each_cpu(i)
1648 sum += atomic_read(&cpu_rq(i)->nr_iowait); 1655 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1649 1656
1650 return sum; 1657 return sum;
1651 } 1658 }
1652 1659
1653 #ifdef CONFIG_SMP 1660 #ifdef CONFIG_SMP
1654 1661
1655 /* 1662 /*
1656 * double_rq_lock - safely lock two runqueues 1663 * double_rq_lock - safely lock two runqueues
1657 * 1664 *
1658 * We must take them in cpu order to match code in 1665 * We must take them in cpu order to match code in
1659 * dependent_sleeper and wake_dependent_sleeper. 1666 * dependent_sleeper and wake_dependent_sleeper.
1660 * 1667 *
1661 * Note this does not disable interrupts like task_rq_lock, 1668 * Note this does not disable interrupts like task_rq_lock,
1662 * you need to do so manually before calling. 1669 * you need to do so manually before calling.
1663 */ 1670 */
1664 static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) 1671 static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1665 __acquires(rq1->lock) 1672 __acquires(rq1->lock)
1666 __acquires(rq2->lock) 1673 __acquires(rq2->lock)
1667 { 1674 {
1668 if (rq1 == rq2) { 1675 if (rq1 == rq2) {
1669 spin_lock(&rq1->lock); 1676 spin_lock(&rq1->lock);
1670 __acquire(rq2->lock); /* Fake it out ;) */ 1677 __acquire(rq2->lock); /* Fake it out ;) */
1671 } else { 1678 } else {
1672 if (rq1->cpu < rq2->cpu) { 1679 if (rq1->cpu < rq2->cpu) {
1673 spin_lock(&rq1->lock); 1680 spin_lock(&rq1->lock);
1674 spin_lock(&rq2->lock); 1681 spin_lock(&rq2->lock);
1675 } else { 1682 } else {
1676 spin_lock(&rq2->lock); 1683 spin_lock(&rq2->lock);
1677 spin_lock(&rq1->lock); 1684 spin_lock(&rq1->lock);
1678 } 1685 }
1679 } 1686 }
1680 } 1687 }
1681 1688
1682 /* 1689 /*
1683 * double_rq_unlock - safely unlock two runqueues 1690 * double_rq_unlock - safely unlock two runqueues
1684 * 1691 *
1685 * Note this does not restore interrupts like task_rq_unlock, 1692 * Note this does not restore interrupts like task_rq_unlock,
1686 * you need to do so manually after calling. 1693 * you need to do so manually after calling.
1687 */ 1694 */
1688 static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) 1695 static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
1689 __releases(rq1->lock) 1696 __releases(rq1->lock)
1690 __releases(rq2->lock) 1697 __releases(rq2->lock)
1691 { 1698 {
1692 spin_unlock(&rq1->lock); 1699 spin_unlock(&rq1->lock);
1693 if (rq1 != rq2) 1700 if (rq1 != rq2)
1694 spin_unlock(&rq2->lock); 1701 spin_unlock(&rq2->lock);
1695 else 1702 else
1696 __release(rq2->lock); 1703 __release(rq2->lock);
1697 } 1704 }
1698 1705
1699 /* 1706 /*
1700 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 1707 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1701 */ 1708 */
1702 static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) 1709 static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1703 __releases(this_rq->lock) 1710 __releases(this_rq->lock)
1704 __acquires(busiest->lock) 1711 __acquires(busiest->lock)
1705 __acquires(this_rq->lock) 1712 __acquires(this_rq->lock)
1706 { 1713 {
1707 if (unlikely(!spin_trylock(&busiest->lock))) { 1714 if (unlikely(!spin_trylock(&busiest->lock))) {
1708 if (busiest->cpu < this_rq->cpu) { 1715 if (busiest->cpu < this_rq->cpu) {
1709 spin_unlock(&this_rq->lock); 1716 spin_unlock(&this_rq->lock);
1710 spin_lock(&busiest->lock); 1717 spin_lock(&busiest->lock);
1711 spin_lock(&this_rq->lock); 1718 spin_lock(&this_rq->lock);
1712 } else 1719 } else
1713 spin_lock(&busiest->lock); 1720 spin_lock(&busiest->lock);
1714 } 1721 }
1715 } 1722 }
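
double_rq_lock() and double_lock_balance() above avoid an AB/BA deadlock the same way: whenever two runqueue locks are needed, they are taken in ascending cpu order, so two CPUs locking each other's queues can never end up holding one lock apiece and spinning forever. A minimal pthread sketch of that ordering rule; the rq structure and lock pair are invented for the example:

#include <pthread.h>
#include <stdio.h>

/* Invented stand-in for a runqueue: just an index and a lock. */
struct rq {
	int cpu;
	pthread_mutex_t lock;
};

/* Always lock the lower-numbered queue first, as double_rq_lock() does. */
static void double_lock(struct rq *a, struct rq *b)
{
	if (a == b) {
		pthread_mutex_lock(&a->lock);
	} else if (a->cpu < b->cpu) {
		pthread_mutex_lock(&a->lock);
		pthread_mutex_lock(&b->lock);
	} else {
		pthread_mutex_lock(&b->lock);
		pthread_mutex_lock(&a->lock);
	}
}

static void double_unlock(struct rq *a, struct rq *b)
{
	pthread_mutex_unlock(&a->lock);
	if (a != b)
		pthread_mutex_unlock(&b->lock);
}

int main(void)
{
	struct rq rq0 = { 0, PTHREAD_MUTEX_INITIALIZER };
	struct rq rq1 = { 1, PTHREAD_MUTEX_INITIALIZER };

	/* whichever order the caller names them in, rq0 is locked first */
	double_lock(&rq1, &rq0);
	puts("both runqueue locks held, in cpu order");
	double_unlock(&rq1, &rq0);
	return 0;
}

The same ascending-cpu rule is why double_lock_balance() drops this_rq->lock and reacquires both locks when its trylock fails against a lower-numbered busiest queue.
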
1716 1723
1717 /* 1724 /*
1718 * If dest_cpu is allowed for this process, migrate the task to it. 1725 * If dest_cpu is allowed for this process, migrate the task to it.
1719 * This is accomplished by forcing the cpu_allowed mask to only 1726 * This is accomplished by forcing the cpu_allowed mask to only
1720 * allow dest_cpu, which will force the cpu onto dest_cpu. Then 1727 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
1721 * the cpu_allowed mask is restored. 1728 * the cpu_allowed mask is restored.
1722 */ 1729 */
1723 static void sched_migrate_task(task_t *p, int dest_cpu) 1730 static void sched_migrate_task(task_t *p, int dest_cpu)
1724 { 1731 {
1725 migration_req_t req; 1732 migration_req_t req;
1726 runqueue_t *rq; 1733 runqueue_t *rq;
1727 unsigned long flags; 1734 unsigned long flags;
1728 1735
1729 rq = task_rq_lock(p, &flags); 1736 rq = task_rq_lock(p, &flags);
1730 if (!cpu_isset(dest_cpu, p->cpus_allowed) 1737 if (!cpu_isset(dest_cpu, p->cpus_allowed)
1731 || unlikely(cpu_is_offline(dest_cpu))) 1738 || unlikely(cpu_is_offline(dest_cpu)))
1732 goto out; 1739 goto out;
1733 1740
1734 /* force the process onto the specified CPU */ 1741 /* force the process onto the specified CPU */
1735 if (migrate_task(p, dest_cpu, &req)) { 1742 if (migrate_task(p, dest_cpu, &req)) {
1736 /* Need to wait for migration thread (might exit: take ref). */ 1743 /* Need to wait for migration thread (might exit: take ref). */
1737 struct task_struct *mt = rq->migration_thread; 1744 struct task_struct *mt = rq->migration_thread;
1738 get_task_struct(mt); 1745 get_task_struct(mt);
1739 task_rq_unlock(rq, &flags); 1746 task_rq_unlock(rq, &flags);
1740 wake_up_process(mt); 1747 wake_up_process(mt);
1741 put_task_struct(mt); 1748 put_task_struct(mt);
1742 wait_for_completion(&req.done); 1749 wait_for_completion(&req.done);
1743 return; 1750 return;
1744 } 1751 }
1745 out: 1752 out:
1746 task_rq_unlock(rq, &flags); 1753 task_rq_unlock(rq, &flags);
1747 } 1754 }
1748 1755
1749 /* 1756 /*
1750 * sched_exec - execve() is a valuable balancing opportunity, because at 1757 * sched_exec - execve() is a valuable balancing opportunity, because at
1751 * this point the task has the smallest effective memory and cache footprint. 1758 * this point the task has the smallest effective memory and cache footprint.
1752 */ 1759 */
1753 void sched_exec(void) 1760 void sched_exec(void)
1754 { 1761 {
1755 int new_cpu, this_cpu = get_cpu(); 1762 int new_cpu, this_cpu = get_cpu();
1756 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); 1763 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
1757 put_cpu(); 1764 put_cpu();
1758 if (new_cpu != this_cpu) 1765 if (new_cpu != this_cpu)
1759 sched_migrate_task(current, new_cpu); 1766 sched_migrate_task(current, new_cpu);
1760 } 1767 }
1761 1768
1762 /* 1769 /*
1763 * pull_task - move a task from a remote runqueue to the local runqueue. 1770 * pull_task - move a task from a remote runqueue to the local runqueue.
1764 * Both runqueues must be locked. 1771 * Both runqueues must be locked.
1765 */ 1772 */
1766 static 1773 static
1767 void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, 1774 void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1768 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) 1775 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1769 { 1776 {
1770 dequeue_task(p, src_array); 1777 dequeue_task(p, src_array);
1771 src_rq->nr_running--; 1778 src_rq->nr_running--;
1772 set_task_cpu(p, this_cpu); 1779 set_task_cpu(p, this_cpu);
1773 this_rq->nr_running++; 1780 this_rq->nr_running++;
1774 enqueue_task(p, this_array); 1781 enqueue_task(p, this_array);
1775 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) 1782 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1776 + this_rq->timestamp_last_tick; 1783 + this_rq->timestamp_last_tick;
1777 /* 1784 /*
1778 * Note that idle threads have a prio of MAX_PRIO, for this test 1785 * Note that idle threads have a prio of MAX_PRIO, for this test
1779 * to be always true for them. 1786 * to be always true for them.
1780 */ 1787 */
1781 if (TASK_PREEMPTS_CURR(p, this_rq)) 1788 if (TASK_PREEMPTS_CURR(p, this_rq))
1782 resched_task(this_rq->curr); 1789 resched_task(this_rq->curr);
1783 } 1790 }
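
pull_task() (like wake_up_new_task() earlier) rebases the task's timestamp when it changes runqueues, because each runqueue carries its own timestamp_last_tick reference clock; the task keeps the same age relative to the new queue's last tick as it had relative to the old one's. A two-line illustration with invented clock values:

#include <stdio.h>

int main(void)
{
	/* invented nanosecond clocks for source/destination runqueues and one task */
	long long src_last_tick = 1000000, dst_last_tick = 1003000;
	long long timestamp = 999000;	/* task last ran 1000ns before src's last tick */

	timestamp = (timestamp - src_last_tick) + dst_last_tick;

	/* prints 1002000: still 1000ns before the destination's last tick */
	printf("rebased timestamp: %lld\n", timestamp);
	return 0;
}
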
1784 1791
1785 /* 1792 /*
1786 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 1793 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1787 */ 1794 */
1788 static 1795 static
1789 int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, 1796 int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1790 struct sched_domain *sd, enum idle_type idle, 1797 struct sched_domain *sd, enum idle_type idle,
1791 int *all_pinned) 1798 int *all_pinned)
1792 { 1799 {
1793 /* 1800 /*
1794 * We do not migrate tasks that are: 1801 * We do not migrate tasks that are:
1795 * 1) running (obviously), or 1802 * 1) running (obviously), or
1796 * 2) cannot be migrated to this CPU due to cpus_allowed, or 1803 * 2) cannot be migrated to this CPU due to cpus_allowed, or
1797 * 3) are cache-hot on their current CPU. 1804 * 3) are cache-hot on their current CPU.
1798 */ 1805 */
1799 if (!cpu_isset(this_cpu, p->cpus_allowed)) 1806 if (!cpu_isset(this_cpu, p->cpus_allowed))
1800 return 0; 1807 return 0;
1801 *all_pinned = 0; 1808 *all_pinned = 0;
1802 1809
1803 if (task_running(rq, p)) 1810 if (task_running(rq, p))
1804 return 0; 1811 return 0;
1805 1812
1806 /* 1813 /*
1807 * Aggressive migration if: 1814 * Aggressive migration if:
1808 * 1) task is cache cold, or 1815 * 1) task is cache cold, or
1809 * 2) too many balance attempts have failed. 1816 * 2) too many balance attempts have failed.
1810 */ 1817 */
1811 1818
1812 if (sd->nr_balance_failed > sd->cache_nice_tries) 1819 if (sd->nr_balance_failed > sd->cache_nice_tries)
1813 return 1; 1820 return 1;
1814 1821
1815 if (task_hot(p, rq->timestamp_last_tick, sd)) 1822 if (task_hot(p, rq->timestamp_last_tick, sd))
1816 return 0; 1823 return 0;
1817 return 1; 1824 return 1;
1818 } 1825 }
1819 1826
1820 /* 1827 /*
1821 * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, 1828 * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
1822 * as part of a balancing operation within "domain". Returns the number of 1829 * as part of a balancing operation within "domain". Returns the number of
1823 * tasks moved. 1830 * tasks moved.
1824 * 1831 *
1825 * Called with both runqueues locked. 1832 * Called with both runqueues locked.
1826 */ 1833 */
1827 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, 1834 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
1828 unsigned long max_nr_move, struct sched_domain *sd, 1835 unsigned long max_nr_move, struct sched_domain *sd,
1829 enum idle_type idle, int *all_pinned) 1836 enum idle_type idle, int *all_pinned)
1830 { 1837 {
1831 prio_array_t *array, *dst_array; 1838 prio_array_t *array, *dst_array;
1832 struct list_head *head, *curr; 1839 struct list_head *head, *curr;
1833 int idx, pulled = 0, pinned = 0; 1840 int idx, pulled = 0, pinned = 0;
1834 task_t *tmp; 1841 task_t *tmp;
1835 1842
1836 if (max_nr_move == 0) 1843 if (max_nr_move == 0)
1837 goto out; 1844 goto out;
1838 1845
1839 pinned = 1; 1846 pinned = 1;
1840 1847
1841 /* 1848 /*
1842 * We first consider expired tasks. Those will likely not be 1849 * We first consider expired tasks. Those will likely not be
1843 * executed in the near future, and they are most likely to 1850 * executed in the near future, and they are most likely to
1844 * be cache-cold, thus switching CPUs has the least effect 1851 * be cache-cold, thus switching CPUs has the least effect
1845 * on them. 1852 * on them.
1846 */ 1853 */
1847 if (busiest->expired->nr_active) { 1854 if (busiest->expired->nr_active) {
1848 array = busiest->expired; 1855 array = busiest->expired;
1849 dst_array = this_rq->expired; 1856 dst_array = this_rq->expired;
1850 } else { 1857 } else {
1851 array = busiest->active; 1858 array = busiest->active;
1852 dst_array = this_rq->active; 1859 dst_array = this_rq->active;
1853 } 1860 }
1854 1861
1855 new_array: 1862 new_array:
1856 /* Start searching at priority 0: */ 1863 /* Start searching at priority 0: */
1857 idx = 0; 1864 idx = 0;
1858 skip_bitmap: 1865 skip_bitmap:
1859 if (!idx) 1866 if (!idx)
1860 idx = sched_find_first_bit(array->bitmap); 1867 idx = sched_find_first_bit(array->bitmap);
1861 else 1868 else
1862 idx = find_next_bit(array->bitmap, MAX_PRIO, idx); 1869 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1863 if (idx >= MAX_PRIO) { 1870 if (idx >= MAX_PRIO) {
1864 if (array == busiest->expired && busiest->active->nr_active) { 1871 if (array == busiest->expired && busiest->active->nr_active) {
1865 array = busiest->active; 1872 array = busiest->active;
1866 dst_array = this_rq->active; 1873 dst_array = this_rq->active;
1867 goto new_array; 1874 goto new_array;
1868 } 1875 }
1869 goto out; 1876 goto out;
1870 } 1877 }
1871 1878
1872 head = array->queue + idx; 1879 head = array->queue + idx;
1873 curr = head->prev; 1880 curr = head->prev;
1874 skip_queue: 1881 skip_queue:
1875 tmp = list_entry(curr, task_t, run_list); 1882 tmp = list_entry(curr, task_t, run_list);
1876 1883
1877 curr = curr->prev; 1884 curr = curr->prev;
1878 1885
1879 if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { 1886 if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
1880 if (curr != head) 1887 if (curr != head)
1881 goto skip_queue; 1888 goto skip_queue;
1882 idx++; 1889 idx++;
1883 goto skip_bitmap; 1890 goto skip_bitmap;
1884 } 1891 }
1885 1892
1886 #ifdef CONFIG_SCHEDSTATS 1893 #ifdef CONFIG_SCHEDSTATS
1887 if (task_hot(tmp, busiest->timestamp_last_tick, sd)) 1894 if (task_hot(tmp, busiest->timestamp_last_tick, sd))
1888 schedstat_inc(sd, lb_hot_gained[idle]); 1895 schedstat_inc(sd, lb_hot_gained[idle]);
1889 #endif 1896 #endif
1890 1897
1891 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); 1898 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
1892 pulled++; 1899 pulled++;
1893 1900
1894 /* We only want to steal up to the prescribed number of tasks. */ 1901 /* We only want to steal up to the prescribed number of tasks. */
1895 if (pulled < max_nr_move) { 1902 if (pulled < max_nr_move) {
1896 if (curr != head) 1903 if (curr != head)
1897 goto skip_queue; 1904 goto skip_queue;
1898 idx++; 1905 idx++;
1899 goto skip_bitmap; 1906 goto skip_bitmap;
1900 } 1907 }
1901 out: 1908 out:
1902 /* 1909 /*
1903 * Right now, this is the only place pull_task() is called, 1910 * Right now, this is the only place pull_task() is called,
1904 * so we can safely collect pull_task() stats here rather than 1911 * so we can safely collect pull_task() stats here rather than
1905 * inside pull_task(). 1912 * inside pull_task().
1906 */ 1913 */
1907 schedstat_add(sd, lb_gained[idle], pulled); 1914 schedstat_add(sd, lb_gained[idle], pulled);
1908 1915
1909 if (all_pinned) 1916 if (all_pinned)
1910 *all_pinned = pinned; 1917 *all_pinned = pinned;
1911 return pulled; 1918 return pulled;
1912 } 1919 }
1913 1920
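For readers unfamiliar with the O(1) scheduler's per-priority queues, here is a stand-alone sketch (illustration only, not part of this diff) of the scan order move_tasks() uses above: drain the expired array first, then the active one, lowest priority index first, stopping at max_nr_move. The toy_array type and the linear find_next_nonempty() helper are hypothetical stand-ins for the real prio_array_t and its bitmap search.

#include <stdio.h>

#define MAX_PRIO 140

struct toy_array {
        const char *name;
        int nr_queued[MAX_PRIO];        /* tasks queued at each priority */
};

/* linear scan standing in for sched_find_first_bit()/find_next_bit() */
static int find_next_nonempty(struct toy_array *a, int idx)
{
        while (idx < MAX_PRIO && !a->nr_queued[idx])
                idx++;
        return idx;
}

int main(void)
{
        struct toy_array expired = { "expired", { 0 } };
        struct toy_array active  = { "active",  { 0 } };
        struct toy_array *order[2] = { &expired, &active };
        int max_nr_move = 3, pulled = 0;
        int i, idx;

        expired.nr_queued[120] = 2;     /* two cache-cold tasks at prio 120 */
        active.nr_queued[110] = 1;
        active.nr_queued[130] = 4;

        for (i = 0; i < 2 && pulled < max_nr_move; i++) {
                for (idx = find_next_nonempty(order[i], 0);
                     idx < MAX_PRIO && pulled < max_nr_move;
                     idx = find_next_nonempty(order[i], idx)) {
                        order[i]->nr_queued[idx]--;
                        pulled++;
                        printf("pulled prio %d from %s array\n",
                               idx, order[i]->name);
                }
        }
        printf("moved %d task(s)\n", pulled);
        return 0;
}

The expired tasks go first for the cache-coldness reason given in the comment above; the active array is only touched once the expired one runs dry.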
1914 /* 1921 /*
1915 * find_busiest_group finds and returns the busiest CPU group within the 1922 * find_busiest_group finds and returns the busiest CPU group within the
1916 * domain. It calculates and returns the number of tasks which should be 1923 * domain. It calculates and returns the number of tasks which should be
1917 * moved to restore balance via the imbalance parameter. 1924 * moved to restore balance via the imbalance parameter.
1918 */ 1925 */
1919 static struct sched_group * 1926 static struct sched_group *
1920 find_busiest_group(struct sched_domain *sd, int this_cpu, 1927 find_busiest_group(struct sched_domain *sd, int this_cpu,
1921 unsigned long *imbalance, enum idle_type idle, int *sd_idle) 1928 unsigned long *imbalance, enum idle_type idle, int *sd_idle)
1922 { 1929 {
1923 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 1930 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1924 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 1931 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1925 unsigned long max_pull; 1932 unsigned long max_pull;
1926 int load_idx; 1933 int load_idx;
1927 1934
1928 max_load = this_load = total_load = total_pwr = 0; 1935 max_load = this_load = total_load = total_pwr = 0;
1929 if (idle == NOT_IDLE) 1936 if (idle == NOT_IDLE)
1930 load_idx = sd->busy_idx; 1937 load_idx = sd->busy_idx;
1931 else if (idle == NEWLY_IDLE) 1938 else if (idle == NEWLY_IDLE)
1932 load_idx = sd->newidle_idx; 1939 load_idx = sd->newidle_idx;
1933 else 1940 else
1934 load_idx = sd->idle_idx; 1941 load_idx = sd->idle_idx;
1935 1942
1936 do { 1943 do {
1937 unsigned long load; 1944 unsigned long load;
1938 int local_group; 1945 int local_group;
1939 int i; 1946 int i;
1940 1947
1941 local_group = cpu_isset(this_cpu, group->cpumask); 1948 local_group = cpu_isset(this_cpu, group->cpumask);
1942 1949
1943 /* Tally up the load of all CPUs in the group */ 1950 /* Tally up the load of all CPUs in the group */
1944 avg_load = 0; 1951 avg_load = 0;
1945 1952
1946 for_each_cpu_mask(i, group->cpumask) { 1953 for_each_cpu_mask(i, group->cpumask) {
1947 if (*sd_idle && !idle_cpu(i)) 1954 if (*sd_idle && !idle_cpu(i))
1948 *sd_idle = 0; 1955 *sd_idle = 0;
1949 1956
1950 /* Bias balancing toward cpus of our domain */ 1957 /* Bias balancing toward cpus of our domain */
1951 if (local_group) 1958 if (local_group)
1952 load = target_load(i, load_idx); 1959 load = target_load(i, load_idx);
1953 else 1960 else
1954 load = source_load(i, load_idx); 1961 load = source_load(i, load_idx);
1955 1962
1956 avg_load += load; 1963 avg_load += load;
1957 } 1964 }
1958 1965
1959 total_load += avg_load; 1966 total_load += avg_load;
1960 total_pwr += group->cpu_power; 1967 total_pwr += group->cpu_power;
1961 1968
1962 /* Adjust by relative CPU power of the group */ 1969 /* Adjust by relative CPU power of the group */
1963 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; 1970 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1964 1971
1965 if (local_group) { 1972 if (local_group) {
1966 this_load = avg_load; 1973 this_load = avg_load;
1967 this = group; 1974 this = group;
1968 } else if (avg_load > max_load) { 1975 } else if (avg_load > max_load) {
1969 max_load = avg_load; 1976 max_load = avg_load;
1970 busiest = group; 1977 busiest = group;
1971 } 1978 }
1972 group = group->next; 1979 group = group->next;
1973 } while (group != sd->groups); 1980 } while (group != sd->groups);
1974 1981
1975 if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE) 1982 if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE)
1976 goto out_balanced; 1983 goto out_balanced;
1977 1984
1978 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; 1985 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
1979 1986
1980 if (this_load >= avg_load || 1987 if (this_load >= avg_load ||
1981 100*max_load <= sd->imbalance_pct*this_load) 1988 100*max_load <= sd->imbalance_pct*this_load)
1982 goto out_balanced; 1989 goto out_balanced;
1983 1990
1984 /* 1991 /*
1985 * We're trying to get all the cpus to the average_load, so we don't 1992 * We're trying to get all the cpus to the average_load, so we don't
1986 * want to push ourselves above the average load, nor do we wish to 1993 * want to push ourselves above the average load, nor do we wish to
1987 * reduce the max loaded cpu below the average load, as either of these 1994 * reduce the max loaded cpu below the average load, as either of these
1988 * actions would just result in more rebalancing later, and ping-pong 1995 * actions would just result in more rebalancing later, and ping-pong
1989 * tasks around. Thus we look for the minimum possible imbalance. 1996 * tasks around. Thus we look for the minimum possible imbalance.
1990 * Negative imbalances (*we* are more loaded than anyone else) will 1997 * Negative imbalances (*we* are more loaded than anyone else) will
1991 * be counted as no imbalance for these purposes -- we can't fix that 1998 * be counted as no imbalance for these purposes -- we can't fix that
1992 * by pulling tasks to us. Be careful of negative numbers as they'll 1999 * by pulling tasks to us. Be careful of negative numbers as they'll
1993 * appear as very large values with unsigned longs. 2000 * appear as very large values with unsigned longs.
1994 */ 2001 */
1995 2002
1996 /* Don't want to pull so many tasks that a group would go idle */ 2003 /* Don't want to pull so many tasks that a group would go idle */
1997 max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE); 2004 max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE);
1998 2005
1999 /* How much load to actually move to equalise the imbalance */ 2006 /* How much load to actually move to equalise the imbalance */
2000 *imbalance = min(max_pull * busiest->cpu_power, 2007 *imbalance = min(max_pull * busiest->cpu_power,
2001 (avg_load - this_load) * this->cpu_power) 2008 (avg_load - this_load) * this->cpu_power)
2002 / SCHED_LOAD_SCALE; 2009 / SCHED_LOAD_SCALE;
2003 2010
2004 if (*imbalance < SCHED_LOAD_SCALE) { 2011 if (*imbalance < SCHED_LOAD_SCALE) {
2005 unsigned long pwr_now = 0, pwr_move = 0; 2012 unsigned long pwr_now = 0, pwr_move = 0;
2006 unsigned long tmp; 2013 unsigned long tmp;
2007 2014
2008 if (max_load - this_load >= SCHED_LOAD_SCALE*2) { 2015 if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
2009 *imbalance = 1; 2016 *imbalance = 1;
2010 return busiest; 2017 return busiest;
2011 } 2018 }
2012 2019
2013 /* 2020 /*
2014 * OK, we don't have enough imbalance to justify moving tasks, 2021 * OK, we don't have enough imbalance to justify moving tasks,
2015 * however we may be able to increase total CPU power used by 2022 * however we may be able to increase total CPU power used by
2016 * moving them. 2023 * moving them.
2017 */ 2024 */
2018 2025
2019 pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); 2026 pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
2020 pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); 2027 pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
2021 pwr_now /= SCHED_LOAD_SCALE; 2028 pwr_now /= SCHED_LOAD_SCALE;
2022 2029
2023 /* Amount of load we'd subtract */ 2030 /* Amount of load we'd subtract */
2024 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; 2031 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
2025 if (max_load > tmp) 2032 if (max_load > tmp)
2026 pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, 2033 pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
2027 max_load - tmp); 2034 max_load - tmp);
2028 2035
2029 /* Amount of load we'd add */ 2036 /* Amount of load we'd add */
2030 if (max_load*busiest->cpu_power < 2037 if (max_load*busiest->cpu_power <
2031 SCHED_LOAD_SCALE*SCHED_LOAD_SCALE) 2038 SCHED_LOAD_SCALE*SCHED_LOAD_SCALE)
2032 tmp = max_load*busiest->cpu_power/this->cpu_power; 2039 tmp = max_load*busiest->cpu_power/this->cpu_power;
2033 else 2040 else
2034 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; 2041 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
2035 pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); 2042 pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
2036 pwr_move /= SCHED_LOAD_SCALE; 2043 pwr_move /= SCHED_LOAD_SCALE;
2037 2044
2038 /* Move if we gain throughput */ 2045 /* Move if we gain throughput */
2039 if (pwr_move <= pwr_now) 2046 if (pwr_move <= pwr_now)
2040 goto out_balanced; 2047 goto out_balanced;
2041 2048
2042 *imbalance = 1; 2049 *imbalance = 1;
2043 return busiest; 2050 return busiest;
2044 } 2051 }
2045 2052
2046 /* Get rid of the scaling factor, rounding down as we divide */ 2053 /* Get rid of the scaling factor, rounding down as we divide */
2047 *imbalance = *imbalance / SCHED_LOAD_SCALE; 2054 *imbalance = *imbalance / SCHED_LOAD_SCALE;
2048 return busiest; 2055 return busiest;
2049 2056
2050 out_balanced: 2057 out_balanced:
2051 2058
2052 *imbalance = 0; 2059 *imbalance = 0;
2053 return NULL; 2060 return NULL;
2054 } 2061 }
2055 2062
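To make the imbalance arithmetic above concrete, the stand-alone sketch below (illustration only) works the formula through for two equal-power groups carrying 3 tasks and 1 task respectively. SCHED_LOAD_SCALE is assumed to be 128 and the group loads are made up; only the scaling and min() steps mirror the code above.

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL          /* assumed value for the example */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        /* made-up raw load and cpu_power for a "busiest" and a local group */
        unsigned long busiest_load = 3 * SCHED_LOAD_SCALE, busiest_pwr = SCHED_LOAD_SCALE;
        unsigned long local_load   = 1 * SCHED_LOAD_SCALE, local_pwr   = SCHED_LOAD_SCALE;
        unsigned long total_load = busiest_load + local_load;
        unsigned long total_pwr  = busiest_pwr + local_pwr;

        /* adjust by relative cpu power, as done per group above */
        unsigned long max_load  = busiest_load * SCHED_LOAD_SCALE / busiest_pwr;
        unsigned long this_load = local_load * SCHED_LOAD_SCALE / local_pwr;
        unsigned long avg_load  = SCHED_LOAD_SCALE * total_load / total_pwr;

        /* don't pull the busiest group below the average, or below one task */
        unsigned long max_pull = min_ul(max_load - avg_load,
                                        max_load - SCHED_LOAD_SCALE);

        unsigned long imbalance = min_ul(max_pull * busiest_pwr,
                                         (avg_load - this_load) * local_pwr)
                                        / SCHED_LOAD_SCALE;

        printf("avg_load=%lu max_pull=%lu imbalance=%lu\n",
               avg_load, max_pull, imbalance);
        printf("-> roughly %lu task(s) worth of load to move\n",
               imbalance / SCHED_LOAD_SCALE);
        return 0;
}

With 3 tasks against 1 on equal-power groups this comes out to one task's worth of load, which is exactly the move that evens the two queues out without over-pulling.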
2056 /* 2063 /*
2057 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2064 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2058 */ 2065 */
2059 static runqueue_t *find_busiest_queue(struct sched_group *group, 2066 static runqueue_t *find_busiest_queue(struct sched_group *group,
2060 enum idle_type idle) 2067 enum idle_type idle)
2061 { 2068 {
2062 unsigned long load, max_load = 0; 2069 unsigned long load, max_load = 0;
2063 runqueue_t *busiest = NULL; 2070 runqueue_t *busiest = NULL;
2064 int i; 2071 int i;
2065 2072
2066 for_each_cpu_mask(i, group->cpumask) { 2073 for_each_cpu_mask(i, group->cpumask) {
2067 load = source_load(i, 0); 2074 load = source_load(i, 0);
2068 2075
2069 if (load > max_load) { 2076 if (load > max_load) {
2070 max_load = load; 2077 max_load = load;
2071 busiest = cpu_rq(i); 2078 busiest = cpu_rq(i);
2072 } 2079 }
2073 } 2080 }
2074 2081
2075 return busiest; 2082 return busiest;
2076 } 2083 }
2077 2084
2078 /* 2085 /*
2079 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but 2086 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
2080 * it does not matter so long as it is large enough. 2087 * it does not matter so long as it is large enough.
2081 */ 2088 */
2082 #define MAX_PINNED_INTERVAL 512 2089 #define MAX_PINNED_INTERVAL 512
2083 2090
2084 /* 2091 /*
2085 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2092 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2086 * tasks if there is an imbalance. 2093 * tasks if there is an imbalance.
2087 * 2094 *
2088 * Called with this_rq unlocked. 2095 * Called with this_rq unlocked.
2089 */ 2096 */
2090 static int load_balance(int this_cpu, runqueue_t *this_rq, 2097 static int load_balance(int this_cpu, runqueue_t *this_rq,
2091 struct sched_domain *sd, enum idle_type idle) 2098 struct sched_domain *sd, enum idle_type idle)
2092 { 2099 {
2093 struct sched_group *group; 2100 struct sched_group *group;
2094 runqueue_t *busiest; 2101 runqueue_t *busiest;
2095 unsigned long imbalance; 2102 unsigned long imbalance;
2096 int nr_moved, all_pinned = 0; 2103 int nr_moved, all_pinned = 0;
2097 int active_balance = 0; 2104 int active_balance = 0;
2098 int sd_idle = 0; 2105 int sd_idle = 0;
2099 2106
2100 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER) 2107 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER)
2101 sd_idle = 1; 2108 sd_idle = 1;
2102 2109
2103 schedstat_inc(sd, lb_cnt[idle]); 2110 schedstat_inc(sd, lb_cnt[idle]);
2104 2111
2105 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle); 2112 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle);
2106 if (!group) { 2113 if (!group) {
2107 schedstat_inc(sd, lb_nobusyg[idle]); 2114 schedstat_inc(sd, lb_nobusyg[idle]);
2108 goto out_balanced; 2115 goto out_balanced;
2109 } 2116 }
2110 2117
2111 busiest = find_busiest_queue(group, idle); 2118 busiest = find_busiest_queue(group, idle);
2112 if (!busiest) { 2119 if (!busiest) {
2113 schedstat_inc(sd, lb_nobusyq[idle]); 2120 schedstat_inc(sd, lb_nobusyq[idle]);
2114 goto out_balanced; 2121 goto out_balanced;
2115 } 2122 }
2116 2123
2117 BUG_ON(busiest == this_rq); 2124 BUG_ON(busiest == this_rq);
2118 2125
2119 schedstat_add(sd, lb_imbalance[idle], imbalance); 2126 schedstat_add(sd, lb_imbalance[idle], imbalance);
2120 2127
2121 nr_moved = 0; 2128 nr_moved = 0;
2122 if (busiest->nr_running > 1) { 2129 if (busiest->nr_running > 1) {
2123 /* 2130 /*
2124 * Attempt to move tasks. If find_busiest_group has found 2131 * Attempt to move tasks. If find_busiest_group has found
2125 * an imbalance but busiest->nr_running <= 1, the group is 2132 * an imbalance but busiest->nr_running <= 1, the group is
2126 * still unbalanced. nr_moved simply stays zero, so it is 2133 * still unbalanced. nr_moved simply stays zero, so it is
2127 * correctly treated as an imbalance. 2134 * correctly treated as an imbalance.
2128 */ 2135 */
2129 double_rq_lock(this_rq, busiest); 2136 double_rq_lock(this_rq, busiest);
2130 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2137 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2131 imbalance, sd, idle, &all_pinned); 2138 imbalance, sd, idle, &all_pinned);
2132 double_rq_unlock(this_rq, busiest); 2139 double_rq_unlock(this_rq, busiest);
2133 2140
2134 /* All tasks on this runqueue were pinned by CPU affinity */ 2141 /* All tasks on this runqueue were pinned by CPU affinity */
2135 if (unlikely(all_pinned)) 2142 if (unlikely(all_pinned))
2136 goto out_balanced; 2143 goto out_balanced;
2137 } 2144 }
2138 2145
2139 if (!nr_moved) { 2146 if (!nr_moved) {
2140 schedstat_inc(sd, lb_failed[idle]); 2147 schedstat_inc(sd, lb_failed[idle]);
2141 sd->nr_balance_failed++; 2148 sd->nr_balance_failed++;
2142 2149
2143 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 2150 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2144 2151
2145 spin_lock(&busiest->lock); 2152 spin_lock(&busiest->lock);
2146 2153
2147 /* don't kick the migration_thread, if the curr 2154 /* don't kick the migration_thread, if the curr
2148 * task on busiest cpu can't be moved to this_cpu 2155 * task on busiest cpu can't be moved to this_cpu
2149 */ 2156 */
2150 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { 2157 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
2151 spin_unlock(&busiest->lock); 2158 spin_unlock(&busiest->lock);
2152 all_pinned = 1; 2159 all_pinned = 1;
2153 goto out_one_pinned; 2160 goto out_one_pinned;
2154 } 2161 }
2155 2162
2156 if (!busiest->active_balance) { 2163 if (!busiest->active_balance) {
2157 busiest->active_balance = 1; 2164 busiest->active_balance = 1;
2158 busiest->push_cpu = this_cpu; 2165 busiest->push_cpu = this_cpu;
2159 active_balance = 1; 2166 active_balance = 1;
2160 } 2167 }
2161 spin_unlock(&busiest->lock); 2168 spin_unlock(&busiest->lock);
2162 if (active_balance) 2169 if (active_balance)
2163 wake_up_process(busiest->migration_thread); 2170 wake_up_process(busiest->migration_thread);
2164 2171
2165 /* 2172 /*
2166 * We've kicked active balancing, reset the failure 2173 * We've kicked active balancing, reset the failure
2167 * counter. 2174 * counter.
2168 */ 2175 */
2169 sd->nr_balance_failed = sd->cache_nice_tries+1; 2176 sd->nr_balance_failed = sd->cache_nice_tries+1;
2170 } 2177 }
2171 } else 2178 } else
2172 sd->nr_balance_failed = 0; 2179 sd->nr_balance_failed = 0;
2173 2180
2174 if (likely(!active_balance)) { 2181 if (likely(!active_balance)) {
2175 /* We were unbalanced, so reset the balancing interval */ 2182 /* We were unbalanced, so reset the balancing interval */
2176 sd->balance_interval = sd->min_interval; 2183 sd->balance_interval = sd->min_interval;
2177 } else { 2184 } else {
2178 /* 2185 /*
2179 * If we've begun active balancing, start to back off. This 2186 * If we've begun active balancing, start to back off. This
2180 * case may not be covered by the all_pinned logic if there 2187 * case may not be covered by the all_pinned logic if there
2181 * is only 1 task on the busy runqueue (because we don't call 2188 * is only 1 task on the busy runqueue (because we don't call
2182 * move_tasks). 2189 * move_tasks).
2183 */ 2190 */
2184 if (sd->balance_interval < sd->max_interval) 2191 if (sd->balance_interval < sd->max_interval)
2185 sd->balance_interval *= 2; 2192 sd->balance_interval *= 2;
2186 } 2193 }
2187 2194
2188 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER) 2195 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER)
2189 return -1; 2196 return -1;
2190 return nr_moved; 2197 return nr_moved;
2191 2198
2192 out_balanced: 2199 out_balanced:
2193 schedstat_inc(sd, lb_balanced[idle]); 2200 schedstat_inc(sd, lb_balanced[idle]);
2194 2201
2195 sd->nr_balance_failed = 0; 2202 sd->nr_balance_failed = 0;
2196 2203
2197 out_one_pinned: 2204 out_one_pinned:
2198 /* tune up the balancing interval */ 2205 /* tune up the balancing interval */
2199 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || 2206 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2200 (sd->balance_interval < sd->max_interval)) 2207 (sd->balance_interval < sd->max_interval))
2201 sd->balance_interval *= 2; 2208 sd->balance_interval *= 2;
2202 2209
2203 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) 2210 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
2204 return -1; 2211 return -1;
2205 return 0; 2212 return 0;
2206 } 2213 }
2207 2214
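As a small illustration of the interval handling at the end of load_balance() above, the sketch below (not kernel code) doubles a made-up balance_interval after repeated failed attempts with every task pinned and shows it saturating at MAX_PINNED_INTERVAL; the min_interval and max_interval values are invented for the example.

#include <stdio.h>

#define MAX_PINNED_INTERVAL 512

int main(void)
{
        unsigned int balance_interval = 8;      /* ms, stand-in for sd->min_interval */
        unsigned int max_interval = 64;         /* ms, stand-in for sd->max_interval */
        int all_pinned = 1;
        int attempt;

        for (attempt = 1; attempt <= 8; attempt++) {
                /* same backoff condition as the out_one_pinned path above */
                if ((all_pinned && balance_interval < MAX_PINNED_INTERVAL) ||
                    (balance_interval < max_interval))
                        balance_interval *= 2;
                printf("after failed attempt %d: interval=%ums\n",
                       attempt, balance_interval);
        }
        return 0;
}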
2208 /* 2215 /*
2209 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2216 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2210 * tasks if there is an imbalance. 2217 * tasks if there is an imbalance.
2211 * 2218 *
2212 * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). 2219 * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
2213 * this_rq is locked. 2220 * this_rq is locked.
2214 */ 2221 */
2215 static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, 2222 static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2216 struct sched_domain *sd) 2223 struct sched_domain *sd)
2217 { 2224 {
2218 struct sched_group *group; 2225 struct sched_group *group;
2219 runqueue_t *busiest = NULL; 2226 runqueue_t *busiest = NULL;
2220 unsigned long imbalance; 2227 unsigned long imbalance;
2221 int nr_moved = 0; 2228 int nr_moved = 0;
2222 int sd_idle = 0; 2229 int sd_idle = 0;
2223 2230
2224 if (sd->flags & SD_SHARE_CPUPOWER) 2231 if (sd->flags & SD_SHARE_CPUPOWER)
2225 sd_idle = 1; 2232 sd_idle = 1;
2226 2233
2227 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2234 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2228 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle); 2235 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle);
2229 if (!group) { 2236 if (!group) {
2230 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2237 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2231 goto out_balanced; 2238 goto out_balanced;
2232 } 2239 }
2233 2240
2234 busiest = find_busiest_queue(group, NEWLY_IDLE); 2241 busiest = find_busiest_queue(group, NEWLY_IDLE);
2235 if (!busiest) { 2242 if (!busiest) {
2236 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); 2243 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2237 goto out_balanced; 2244 goto out_balanced;
2238 } 2245 }
2239 2246
2240 BUG_ON(busiest == this_rq); 2247 BUG_ON(busiest == this_rq);
2241 2248
2242 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); 2249 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
2243 2250
2244 nr_moved = 0; 2251 nr_moved = 0;
2245 if (busiest->nr_running > 1) { 2252 if (busiest->nr_running > 1) {
2246 /* Attempt to move tasks */ 2253 /* Attempt to move tasks */
2247 double_lock_balance(this_rq, busiest); 2254 double_lock_balance(this_rq, busiest);
2248 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2255 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2249 imbalance, sd, NEWLY_IDLE, NULL); 2256 imbalance, sd, NEWLY_IDLE, NULL);
2250 spin_unlock(&busiest->lock); 2257 spin_unlock(&busiest->lock);
2251 } 2258 }
2252 2259
2253 if (!nr_moved) { 2260 if (!nr_moved) {
2254 schedstat_inc(sd, lb_failed[NEWLY_IDLE]); 2261 schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
2255 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) 2262 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
2256 return -1; 2263 return -1;
2257 } else 2264 } else
2258 sd->nr_balance_failed = 0; 2265 sd->nr_balance_failed = 0;
2259 2266
2260 return nr_moved; 2267 return nr_moved;
2261 2268
2262 out_balanced: 2269 out_balanced:
2263 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); 2270 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2264 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) 2271 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
2265 return -1; 2272 return -1;
2266 sd->nr_balance_failed = 0; 2273 sd->nr_balance_failed = 0;
2267 return 0; 2274 return 0;
2268 } 2275 }
2269 2276
2270 /* 2277 /*
2271 * idle_balance is called by schedule() if this_cpu is about to become 2278 * idle_balance is called by schedule() if this_cpu is about to become
2272 * idle. Attempts to pull tasks from other CPUs. 2279 * idle. Attempts to pull tasks from other CPUs.
2273 */ 2280 */
2274 static void idle_balance(int this_cpu, runqueue_t *this_rq) 2281 static void idle_balance(int this_cpu, runqueue_t *this_rq)
2275 { 2282 {
2276 struct sched_domain *sd; 2283 struct sched_domain *sd;
2277 2284
2278 for_each_domain(this_cpu, sd) { 2285 for_each_domain(this_cpu, sd) {
2279 if (sd->flags & SD_BALANCE_NEWIDLE) { 2286 if (sd->flags & SD_BALANCE_NEWIDLE) {
2280 if (load_balance_newidle(this_cpu, this_rq, sd)) { 2287 if (load_balance_newidle(this_cpu, this_rq, sd)) {
2281 /* We've pulled tasks over so stop searching */ 2288 /* We've pulled tasks over so stop searching */
2282 break; 2289 break;
2283 } 2290 }
2284 } 2291 }
2285 } 2292 }
2286 } 2293 }
2287 2294
2288 /* 2295 /*
2289 * active_load_balance is run by migration threads. It pushes running tasks 2296 * active_load_balance is run by migration threads. It pushes running tasks
2290 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be 2297 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2291 * running on each physical CPU where possible, and avoids physical / 2298 * running on each physical CPU where possible, and avoids physical /
2292 * logical imbalances. 2299 * logical imbalances.
2293 * 2300 *
2294 * Called with busiest_rq locked. 2301 * Called with busiest_rq locked.
2295 */ 2302 */
2296 static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) 2303 static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
2297 { 2304 {
2298 struct sched_domain *sd; 2305 struct sched_domain *sd;
2299 runqueue_t *target_rq; 2306 runqueue_t *target_rq;
2300 int target_cpu = busiest_rq->push_cpu; 2307 int target_cpu = busiest_rq->push_cpu;
2301 2308
2302 if (busiest_rq->nr_running <= 1) 2309 if (busiest_rq->nr_running <= 1)
2303 /* no task to move */ 2310 /* no task to move */
2304 return; 2311 return;
2305 2312
2306 target_rq = cpu_rq(target_cpu); 2313 target_rq = cpu_rq(target_cpu);
2307 2314
2308 /* 2315 /*
2309 * This condition is "impossible"; if it occurs 2316 * This condition is "impossible"; if it occurs
2309 * This condition is "impossible"; if it occurs 2316 * This condition is "impossible"; if it occurs
2310 * we need to fix it. Originally reported by 2317 * we need to fix it. Originally reported by
2311 * Bjorn Helgaas on a 128-cpu setup. 2318 * Bjorn Helgaas on a 128-cpu setup.
2312 */ 2319 */
2313 BUG_ON(busiest_rq == target_rq); 2320 BUG_ON(busiest_rq == target_rq);
2314 2321
2315 /* move a task from busiest_rq to target_rq */ 2322 /* move a task from busiest_rq to target_rq */
2316 double_lock_balance(busiest_rq, target_rq); 2323 double_lock_balance(busiest_rq, target_rq);
2317 2324
2318 /* Search for an sd spanning us and the target CPU. */ 2325 /* Search for an sd spanning us and the target CPU. */
2319 for_each_domain(target_cpu, sd) 2326 for_each_domain(target_cpu, sd)
2320 if ((sd->flags & SD_LOAD_BALANCE) && 2327 if ((sd->flags & SD_LOAD_BALANCE) &&
2321 cpu_isset(busiest_cpu, sd->span)) 2328 cpu_isset(busiest_cpu, sd->span))
2322 break; 2329 break;
2323 2330
2324 if (unlikely(sd == NULL)) 2331 if (unlikely(sd == NULL))
2325 goto out; 2332 goto out;
2326 2333
2327 schedstat_inc(sd, alb_cnt); 2334 schedstat_inc(sd, alb_cnt);
2328 2335
2329 if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) 2336 if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
2330 schedstat_inc(sd, alb_pushed); 2337 schedstat_inc(sd, alb_pushed);
2331 else 2338 else
2332 schedstat_inc(sd, alb_failed); 2339 schedstat_inc(sd, alb_failed);
2333 out: 2340 out:
2334 spin_unlock(&target_rq->lock); 2341 spin_unlock(&target_rq->lock);
2335 } 2342 }
2336 2343
2337 /* 2344 /*
2338 * rebalance_tick will get called every timer tick, on every CPU. 2345 * rebalance_tick will get called every timer tick, on every CPU.
2339 * 2346 *
2340 * It checks each scheduling domain to see if it is due to be balanced, 2347 * It checks each scheduling domain to see if it is due to be balanced,
2341 * and initiates a balancing operation if so. 2348 * and initiates a balancing operation if so.
2342 * 2349 *
2343 * Balancing parameters are set up in arch_init_sched_domains. 2350 * Balancing parameters are set up in arch_init_sched_domains.
2344 */ 2351 */
2345 2352
2346 /* Don't have all balancing operations going off at once */ 2353 /* Don't have all balancing operations going off at once */
2347 #define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) 2354 #define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
2348 2355
2349 static void rebalance_tick(int this_cpu, runqueue_t *this_rq, 2356 static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2350 enum idle_type idle) 2357 enum idle_type idle)
2351 { 2358 {
2352 unsigned long old_load, this_load; 2359 unsigned long old_load, this_load;
2353 unsigned long j = jiffies + CPU_OFFSET(this_cpu); 2360 unsigned long j = jiffies + CPU_OFFSET(this_cpu);
2354 struct sched_domain *sd; 2361 struct sched_domain *sd;
2355 int i; 2362 int i;
2356 2363
2357 this_load = this_rq->nr_running * SCHED_LOAD_SCALE; 2364 this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
2358 /* Update our load */ 2365 /* Update our load */
2359 for (i = 0; i < 3; i++) { 2366 for (i = 0; i < 3; i++) {
2360 unsigned long new_load = this_load; 2367 unsigned long new_load = this_load;
2361 int scale = 1 << i; 2368 int scale = 1 << i;
2362 old_load = this_rq->cpu_load[i]; 2369 old_load = this_rq->cpu_load[i];
2363 /* 2370 /*
2364 * Round up the averaging division if load is increasing. This 2371 * Round up the averaging division if load is increasing. This
2365 * prevents us from getting stuck on 9 if the load is 10, for 2372 * prevents us from getting stuck on 9 if the load is 10, for
2366 * example. 2373 * example.
2367 */ 2374 */
2368 if (new_load > old_load) 2375 if (new_load > old_load)
2369 new_load += scale-1; 2376 new_load += scale-1;
2370 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; 2377 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
2371 } 2378 }
2372 2379
2373 for_each_domain(this_cpu, sd) { 2380 for_each_domain(this_cpu, sd) {
2374 unsigned long interval; 2381 unsigned long interval;
2375 2382
2376 if (!(sd->flags & SD_LOAD_BALANCE)) 2383 if (!(sd->flags & SD_LOAD_BALANCE))
2377 continue; 2384 continue;
2378 2385
2379 interval = sd->balance_interval; 2386 interval = sd->balance_interval;
2380 if (idle != SCHED_IDLE) 2387 if (idle != SCHED_IDLE)
2381 interval *= sd->busy_factor; 2388 interval *= sd->busy_factor;
2382 2389
2383 /* scale ms to jiffies */ 2390 /* scale ms to jiffies */
2384 interval = msecs_to_jiffies(interval); 2391 interval = msecs_to_jiffies(interval);
2385 if (unlikely(!interval)) 2392 if (unlikely(!interval))
2386 interval = 1; 2393 interval = 1;
2387 2394
2388 if (j - sd->last_balance >= interval) { 2395 if (j - sd->last_balance >= interval) {
2389 if (load_balance(this_cpu, this_rq, sd, idle)) { 2396 if (load_balance(this_cpu, this_rq, sd, idle)) {
2390 /* 2397 /*
2391 * We've pulled tasks over so either we're no 2398 * We've pulled tasks over so either we're no
2392 * longer idle, or one of our SMT siblings is 2399 * longer idle, or one of our SMT siblings is
2393 * not idle. 2400 * not idle.
2394 */ 2401 */
2395 idle = NOT_IDLE; 2402 idle = NOT_IDLE;
2396 } 2403 }
2397 sd->last_balance += interval; 2404 sd->last_balance += interval;
2398 } 2405 }
2399 } 2406 }
2400 } 2407 }
2401 #else 2408 #else
2402 /* 2409 /*
2403 * on UP we do not need to balance between CPUs: 2410 * on UP we do not need to balance between CPUs:
2404 */ 2411 */
2405 static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) 2412 static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
2406 { 2413 {
2407 } 2414 }
2408 static inline void idle_balance(int cpu, runqueue_t *rq) 2415 static inline void idle_balance(int cpu, runqueue_t *rq)
2409 { 2416 {
2410 } 2417 }
2411 #endif 2418 #endif
2412 2419
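The cpu_load[] update in rebalance_tick() above is a set of exponentially-weighted averages, one per index, each over a window of 2^i ticks; the scale-1 bump rounds up while load is rising so the average cannot get stuck just below the new value. The stand-alone sketch below (illustration only) replays that arithmetic on made-up load samples.

#include <stdio.h>

int main(void)
{
        unsigned long cpu_load[3] = { 0, 0, 0 };
        unsigned long samples[] = { 10, 10, 10, 0, 0, 0 };      /* made-up per-tick load */
        int t, i;

        for (t = 0; t < 6; t++) {
                unsigned long this_load = samples[t];

                for (i = 0; i < 3; i++) {
                        unsigned long old_load = cpu_load[i];
                        unsigned long new_load = this_load;
                        int scale = 1 << i;

                        if (new_load > old_load)
                                new_load += scale - 1;  /* round up on the way up */
                        cpu_load[i] = (old_load * (scale - 1) + new_load) / scale;
                }
                printf("tick %d: load[0]=%lu load[1]=%lu load[2]=%lu\n",
                       t, cpu_load[0], cpu_load[1], cpu_load[2]);
        }
        return 0;
}

Running it shows index 0 tracking the instantaneous load exactly while indices 1 and 2 lag behind, which is why the different load_idx values chosen earlier in find_busiest_group() give progressively more conservative views of the load.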
2413 static inline int wake_priority_sleeper(runqueue_t *rq) 2420 static inline int wake_priority_sleeper(runqueue_t *rq)
2414 { 2421 {
2415 int ret = 0; 2422 int ret = 0;
2416 #ifdef CONFIG_SCHED_SMT 2423 #ifdef CONFIG_SCHED_SMT
2417 spin_lock(&rq->lock); 2424 spin_lock(&rq->lock);
2418 /* 2425 /*
2419 * If an SMT sibling task has been put to sleep for priority 2426 * If an SMT sibling task has been put to sleep for priority
2420 * reasons reschedule the idle task to see if it can now run. 2427 * reasons reschedule the idle task to see if it can now run.
2421 */ 2428 */
2422 if (rq->nr_running) { 2429 if (rq->nr_running) {
2423 resched_task(rq->idle); 2430 resched_task(rq->idle);
2424 ret = 1; 2431 ret = 1;
2425 } 2432 }
2426 spin_unlock(&rq->lock); 2433 spin_unlock(&rq->lock);
2427 #endif 2434 #endif
2428 return ret; 2435 return ret;
2429 } 2436 }
2430 2437
2431 DEFINE_PER_CPU(struct kernel_stat, kstat); 2438 DEFINE_PER_CPU(struct kernel_stat, kstat);
2432 2439
2433 EXPORT_PER_CPU_SYMBOL(kstat); 2440 EXPORT_PER_CPU_SYMBOL(kstat);
2434 2441
2435 /* 2442 /*
2436 * This is called on clock ticks and on context switches. 2443 * This is called on clock ticks and on context switches.
2437 * Bank in p->sched_time the ns elapsed since the last tick or switch. 2444 * Bank in p->sched_time the ns elapsed since the last tick or switch.
2438 */ 2445 */
2439 static inline void update_cpu_clock(task_t *p, runqueue_t *rq, 2446 static inline void update_cpu_clock(task_t *p, runqueue_t *rq,
2440 unsigned long long now) 2447 unsigned long long now)
2441 { 2448 {
2442 unsigned long long last = max(p->timestamp, rq->timestamp_last_tick); 2449 unsigned long long last = max(p->timestamp, rq->timestamp_last_tick);
2443 p->sched_time += now - last; 2450 p->sched_time += now - last;
2444 } 2451 }
2445 2452
2446 /* 2453 /*
2447 * Return current->sched_time plus any more ns on the sched_clock 2454 * Return current->sched_time plus any more ns on the sched_clock
2448 * that have not yet been banked. 2455 * that have not yet been banked.
2449 */ 2456 */
2450 unsigned long long current_sched_time(const task_t *tsk) 2457 unsigned long long current_sched_time(const task_t *tsk)
2451 { 2458 {
2452 unsigned long long ns; 2459 unsigned long long ns;
2453 unsigned long flags; 2460 unsigned long flags;
2454 local_irq_save(flags); 2461 local_irq_save(flags);
2455 ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick); 2462 ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick);
2456 ns = tsk->sched_time + (sched_clock() - ns); 2463 ns = tsk->sched_time + (sched_clock() - ns);
2457 local_irq_restore(flags); 2464 local_irq_restore(flags);
2458 return ns; 2465 return ns;
2459 } 2466 }
2460 2467
2461 /* 2468 /*
2462 * We place interactive tasks back into the active array, if possible. 2469 * We place interactive tasks back into the active array, if possible.
2463 * 2470 *
2464 * To guarantee that this does not starve expired tasks we ignore the 2471 * To guarantee that this does not starve expired tasks we ignore the
2465 * interactivity of a task if the first expired task had to wait more 2472 * interactivity of a task if the first expired task had to wait more
2466 * than a 'reasonable' amount of time. This deadline timeout is 2473 * than a 'reasonable' amount of time. This deadline timeout is
2467 * load-dependent, as the frequency of array switches decreases with 2474 * load-dependent, as the frequency of array switches decreases with
2468 * increasing number of running tasks. We also ignore the interactivity 2475 * increasing number of running tasks. We also ignore the interactivity
2469 * if a better static_prio task has expired: 2476 * if a better static_prio task has expired:
2470 */ 2477 */
2471 #define EXPIRED_STARVING(rq) \ 2478 #define EXPIRED_STARVING(rq) \
2472 ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ 2479 ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
2473 (jiffies - (rq)->expired_timestamp >= \ 2480 (jiffies - (rq)->expired_timestamp >= \
2474 STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ 2481 STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
2475 ((rq)->curr->static_prio > (rq)->best_expired_prio)) 2482 ((rq)->curr->static_prio > (rq)->best_expired_prio))
2476 2483
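Ignoring the best_expired_prio clause, EXPIRED_STARVING() above trips once jiffies - expired_timestamp reaches STARVATION_LIMIT * nr_running + 1, so the tolerated wait scales with the number of runnable tasks. The sketch below just tabulates that deadline; the STARVATION_LIMIT of one second's worth of jiffies is an assumption for the example, not taken from this diff.

#include <stdio.h>

#define HZ 1000
#define STARVATION_LIMIT (1 * HZ)       /* assumed, in jiffies */

int main(void)
{
        unsigned long nr_running;

        for (nr_running = 1; nr_running <= 8; nr_running *= 2)
                printf("nr_running=%lu: array switch forced after %lu jiffies\n",
                       nr_running, STARVATION_LIMIT * nr_running + 1);
        return 0;
}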
2477 /* 2484 /*
2478 * Account user cpu time to a process. 2485 * Account user cpu time to a process.
2479 * @p: the process that the cpu time gets accounted to 2486 * @p: the process that the cpu time gets accounted to
2480 * @hardirq_offset: the offset to subtract from hardirq_count() 2487 * @hardirq_offset: the offset to subtract from hardirq_count()
2481 * @cputime: the cpu time spent in user space since the last update 2488 * @cputime: the cpu time spent in user space since the last update
2482 */ 2489 */
2483 void account_user_time(struct task_struct *p, cputime_t cputime) 2490 void account_user_time(struct task_struct *p, cputime_t cputime)
2484 { 2491 {
2485 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2492 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2486 cputime64_t tmp; 2493 cputime64_t tmp;
2487 2494
2488 p->utime = cputime_add(p->utime, cputime); 2495 p->utime = cputime_add(p->utime, cputime);
2489 2496
2490 /* Add user time to cpustat. */ 2497 /* Add user time to cpustat. */
2491 tmp = cputime_to_cputime64(cputime); 2498 tmp = cputime_to_cputime64(cputime);
2492 if (TASK_NICE(p) > 0) 2499 if (TASK_NICE(p) > 0)
2493 cpustat->nice = cputime64_add(cpustat->nice, tmp); 2500 cpustat->nice = cputime64_add(cpustat->nice, tmp);
2494 else 2501 else
2495 cpustat->user = cputime64_add(cpustat->user, tmp); 2502 cpustat->user = cputime64_add(cpustat->user, tmp);
2496 } 2503 }
2497 2504
2498 /* 2505 /*
2499 * Account system cpu time to a process. 2506 * Account system cpu time to a process.
2500 * @p: the process that the cpu time gets accounted to 2507 * @p: the process that the cpu time gets accounted to
2501 * @hardirq_offset: the offset to subtract from hardirq_count() 2508 * @hardirq_offset: the offset to subtract from hardirq_count()
2502 * @cputime: the cpu time spent in kernel space since the last update 2509 * @cputime: the cpu time spent in kernel space since the last update
2503 */ 2510 */
2504 void account_system_time(struct task_struct *p, int hardirq_offset, 2511 void account_system_time(struct task_struct *p, int hardirq_offset,
2505 cputime_t cputime) 2512 cputime_t cputime)
2506 { 2513 {
2507 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2514 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2508 runqueue_t *rq = this_rq(); 2515 runqueue_t *rq = this_rq();
2509 cputime64_t tmp; 2516 cputime64_t tmp;
2510 2517
2511 p->stime = cputime_add(p->stime, cputime); 2518 p->stime = cputime_add(p->stime, cputime);
2512 2519
2513 /* Add system time to cpustat. */ 2520 /* Add system time to cpustat. */
2514 tmp = cputime_to_cputime64(cputime); 2521 tmp = cputime_to_cputime64(cputime);
2515 if (hardirq_count() - hardirq_offset) 2522 if (hardirq_count() - hardirq_offset)
2516 cpustat->irq = cputime64_add(cpustat->irq, tmp); 2523 cpustat->irq = cputime64_add(cpustat->irq, tmp);
2517 else if (softirq_count()) 2524 else if (softirq_count())
2518 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 2525 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
2519 else if (p != rq->idle) 2526 else if (p != rq->idle)
2520 cpustat->system = cputime64_add(cpustat->system, tmp); 2527 cpustat->system = cputime64_add(cpustat->system, tmp);
2521 else if (atomic_read(&rq->nr_iowait) > 0) 2528 else if (atomic_read(&rq->nr_iowait) > 0)
2522 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 2529 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
2523 else 2530 else
2524 cpustat->idle = cputime64_add(cpustat->idle, tmp); 2531 cpustat->idle = cputime64_add(cpustat->idle, tmp);
2525 /* Account for system time used */ 2532 /* Account for system time used */
2526 acct_update_integrals(p); 2533 acct_update_integrals(p);
2527 } 2534 }
2528 2535
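The if/else chain in account_system_time() above is a strict precedence order: hardirq beats softirq, softirq beats ordinary system time, and only the idle task's time is split between iowait and idle. The sketch below (illustration only) reduces that to a pure decision function, with plain booleans standing in for the preempt-count and runqueue checks.

#include <stdio.h>

static const char *system_bucket(int in_hardirq, int in_softirq,
                                 int is_idle_task, int nr_iowait)
{
        if (in_hardirq)
                return "irq";
        if (in_softirq)
                return "softirq";
        if (!is_idle_task)
                return "system";
        if (nr_iowait > 0)
                return "iowait";
        return "idle";
}

int main(void)
{
        printf("%s\n", system_bucket(1, 0, 0, 0));      /* irq */
        printf("%s\n", system_bucket(0, 1, 0, 0));      /* softirq */
        printf("%s\n", system_bucket(0, 0, 0, 0));      /* system */
        printf("%s\n", system_bucket(0, 0, 1, 3));      /* iowait */
        printf("%s\n", system_bucket(0, 0, 1, 0));      /* idle */
        return 0;
}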
2529 /* 2536 /*
2530 * Account for involuntary wait time. 2537 * Account for involuntary wait time.
2531 * @p: the process from which the cpu time has been stolen 2538 * @p: the process from which the cpu time has been stolen
2532 * @steal: the cpu time spent in involuntary wait 2539 * @steal: the cpu time spent in involuntary wait
2533 */ 2540 */
2534 void account_steal_time(struct task_struct *p, cputime_t steal) 2541 void account_steal_time(struct task_struct *p, cputime_t steal)
2535 { 2542 {
2536 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2543 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2537 cputime64_t tmp = cputime_to_cputime64(steal); 2544 cputime64_t tmp = cputime_to_cputime64(steal);
2538 runqueue_t *rq = this_rq(); 2545 runqueue_t *rq = this_rq();
2539 2546
2540 if (p == rq->idle) { 2547 if (p == rq->idle) {
2541 p->stime = cputime_add(p->stime, steal); 2548 p->stime = cputime_add(p->stime, steal);
2542 if (atomic_read(&rq->nr_iowait) > 0) 2549 if (atomic_read(&rq->nr_iowait) > 0)
2543 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 2550 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
2544 else 2551 else
2545 cpustat->idle = cputime64_add(cpustat->idle, tmp); 2552 cpustat->idle = cputime64_add(cpustat->idle, tmp);
2546 } else 2553 } else
2547 cpustat->steal = cputime64_add(cpustat->steal, tmp); 2554 cpustat->steal = cputime64_add(cpustat->steal, tmp);
2548 } 2555 }
2549 2556
2550 /* 2557 /*
2551 * This function gets called by the timer code, with HZ frequency. 2558 * This function gets called by the timer code, with HZ frequency.
2552 * We call it with interrupts disabled. 2559 * We call it with interrupts disabled.
2553 * 2560 *
2554 * It also gets called by the fork code, when changing the parent's 2561 * It also gets called by the fork code, when changing the parent's
2555 * timeslices. 2562 * timeslices.
2556 */ 2563 */
2557 void scheduler_tick(void) 2564 void scheduler_tick(void)
2558 { 2565 {
2559 int cpu = smp_processor_id(); 2566 int cpu = smp_processor_id();
2560 runqueue_t *rq = this_rq(); 2567 runqueue_t *rq = this_rq();
2561 task_t *p = current; 2568 task_t *p = current;
2562 unsigned long long now = sched_clock(); 2569 unsigned long long now = sched_clock();
2563 2570
2564 update_cpu_clock(p, rq, now); 2571 update_cpu_clock(p, rq, now);
2565 2572
2566 rq->timestamp_last_tick = now; 2573 rq->timestamp_last_tick = now;
2567 2574
2568 if (p == rq->idle) { 2575 if (p == rq->idle) {
2569 if (wake_priority_sleeper(rq)) 2576 if (wake_priority_sleeper(rq))
2570 goto out; 2577 goto out;
2571 rebalance_tick(cpu, rq, SCHED_IDLE); 2578 rebalance_tick(cpu, rq, SCHED_IDLE);
2572 return; 2579 return;
2573 } 2580 }
2574 2581
2575 /* Task might have expired already, but not scheduled off yet */ 2582 /* Task might have expired already, but not scheduled off yet */
2576 if (p->array != rq->active) { 2583 if (p->array != rq->active) {
2577 set_tsk_need_resched(p); 2584 set_tsk_need_resched(p);
2578 goto out; 2585 goto out;
2579 } 2586 }
2580 spin_lock(&rq->lock); 2587 spin_lock(&rq->lock);
2581 /* 2588 /*
2582 * The task was running during this tick - update the 2589 * The task was running during this tick - update the
2583 * time slice counter. Note: we do not update a thread's 2590 * time slice counter. Note: we do not update a thread's
2584 * priority until it either goes to sleep or uses up its 2591 * priority until it either goes to sleep or uses up its
2585 * timeslice. This makes it possible for interactive tasks 2592 * timeslice. This makes it possible for interactive tasks
2586 * to use up their timeslices at their highest priority levels. 2593 * to use up their timeslices at their highest priority levels.
2587 */ 2594 */
2588 if (rt_task(p)) { 2595 if (rt_task(p)) {
2589 /* 2596 /*
2590 * RR tasks need a special form of timeslice management. 2597 * RR tasks need a special form of timeslice management.
2591 * FIFO tasks have no timeslices. 2598 * FIFO tasks have no timeslices.
2592 */ 2599 */
2593 if ((p->policy == SCHED_RR) && !--p->time_slice) { 2600 if ((p->policy == SCHED_RR) && !--p->time_slice) {
2594 p->time_slice = task_timeslice(p); 2601 p->time_slice = task_timeslice(p);
2595 p->first_time_slice = 0; 2602 p->first_time_slice = 0;
2596 set_tsk_need_resched(p); 2603 set_tsk_need_resched(p);
2597 2604
2598 /* put it at the end of the queue: */ 2605 /* put it at the end of the queue: */
2599 requeue_task(p, rq->active); 2606 requeue_task(p, rq->active);
2600 } 2607 }
2601 goto out_unlock; 2608 goto out_unlock;
2602 } 2609 }
2603 if (!--p->time_slice) { 2610 if (!--p->time_slice) {
2604 dequeue_task(p, rq->active); 2611 dequeue_task(p, rq->active);
2605 set_tsk_need_resched(p); 2612 set_tsk_need_resched(p);
2606 p->prio = effective_prio(p); 2613 p->prio = effective_prio(p);
2607 p->time_slice = task_timeslice(p); 2614 p->time_slice = task_timeslice(p);
2608 p->first_time_slice = 0; 2615 p->first_time_slice = 0;
2609 2616
2610 if (!rq->expired_timestamp) 2617 if (!rq->expired_timestamp)
2611 rq->expired_timestamp = jiffies; 2618 rq->expired_timestamp = jiffies;
2612 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { 2619 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
2613 enqueue_task(p, rq->expired); 2620 enqueue_task(p, rq->expired);
2614 if (p->static_prio < rq->best_expired_prio) 2621 if (p->static_prio < rq->best_expired_prio)
2615 rq->best_expired_prio = p->static_prio; 2622 rq->best_expired_prio = p->static_prio;
2616 } else 2623 } else
2617 enqueue_task(p, rq->active); 2624 enqueue_task(p, rq->active);
2618 } else { 2625 } else {
2619 /* 2626 /*
2620 * Prevent a too long timeslice allowing a task to monopolize 2627 * Prevent a too long timeslice allowing a task to monopolize
2621 * the CPU. We do this by splitting up the timeslice into 2628 * the CPU. We do this by splitting up the timeslice into
2622 * smaller pieces. 2629 * smaller pieces.
2623 * 2630 *
2624 * Note: this does not mean the task's timeslices expire or 2631 * Note: this does not mean the task's timeslices expire or
2625 * get lost in any way, they just might be preempted by 2632 * get lost in any way, they just might be preempted by
2626 * another task of equal priority. (one with higher 2633 * another task of equal priority. (one with higher
2627 * priority would have preempted this task already.) We 2634 * priority would have preempted this task already.) We
2628 * requeue this task to the end of the list on this priority 2635 * requeue this task to the end of the list on this priority
2629 * level, which is in essence a round-robin of tasks with 2636 * level, which is in essence a round-robin of tasks with
2630 * equal priority. 2637 * equal priority.
2631 * 2638 *
2632 * This only applies to tasks in the interactive 2639 * This only applies to tasks in the interactive
2633 * delta range with at least TIMESLICE_GRANULARITY to requeue. 2640 * delta range with at least TIMESLICE_GRANULARITY to requeue.
2634 */ 2641 */
2635 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - 2642 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
2636 p->time_slice) % TIMESLICE_GRANULARITY(p)) && 2643 p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
2637 (p->time_slice >= TIMESLICE_GRANULARITY(p)) && 2644 (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
2638 (p->array == rq->active)) { 2645 (p->array == rq->active)) {
2639 2646
2640 requeue_task(p, rq->active); 2647 requeue_task(p, rq->active);
2641 set_tsk_need_resched(p); 2648 set_tsk_need_resched(p);
2642 } 2649 }
2643 } 2650 }
2644 out_unlock: 2651 out_unlock:
2645 spin_unlock(&rq->lock); 2652 spin_unlock(&rq->lock);
2646 out: 2653 out:
2647 rebalance_tick(cpu, rq, NOT_IDLE); 2654 rebalance_tick(cpu, rq, NOT_IDLE);
2648 } 2655 }
2649 2656
2650 #ifdef CONFIG_SCHED_SMT 2657 #ifdef CONFIG_SCHED_SMT
2651 static inline void wakeup_busy_runqueue(runqueue_t *rq) 2658 static inline void wakeup_busy_runqueue(runqueue_t *rq)
2652 { 2659 {
2653 /* If an SMT runqueue is sleeping due to priority reasons wake it up */ 2660 /* If an SMT runqueue is sleeping due to priority reasons wake it up */
2654 if (rq->curr == rq->idle && rq->nr_running) 2661 if (rq->curr == rq->idle && rq->nr_running)
2655 resched_task(rq->idle); 2662 resched_task(rq->idle);
2656 } 2663 }
2657 2664
2658 static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 2665 static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2659 { 2666 {
2660 struct sched_domain *tmp, *sd = NULL; 2667 struct sched_domain *tmp, *sd = NULL;
2661 cpumask_t sibling_map; 2668 cpumask_t sibling_map;
2662 int i; 2669 int i;
2663 2670
2664 for_each_domain(this_cpu, tmp) 2671 for_each_domain(this_cpu, tmp)
2665 if (tmp->flags & SD_SHARE_CPUPOWER) 2672 if (tmp->flags & SD_SHARE_CPUPOWER)
2666 sd = tmp; 2673 sd = tmp;
2667 2674
2668 if (!sd) 2675 if (!sd)
2669 return; 2676 return;
2670 2677
2671 /* 2678 /*
2672 * Unlock the current runqueue because we have to lock in 2679 * Unlock the current runqueue because we have to lock in
2673 * CPU order to avoid deadlocks. Caller knows that we might 2680 * CPU order to avoid deadlocks. Caller knows that we might
2674 * unlock. We keep IRQs disabled. 2681 * unlock. We keep IRQs disabled.
2675 */ 2682 */
2676 spin_unlock(&this_rq->lock); 2683 spin_unlock(&this_rq->lock);
2677 2684
2678 sibling_map = sd->span; 2685 sibling_map = sd->span;
2679 2686
2680 for_each_cpu_mask(i, sibling_map) 2687 for_each_cpu_mask(i, sibling_map)
2681 spin_lock(&cpu_rq(i)->lock); 2688 spin_lock(&cpu_rq(i)->lock);
2682 /* 2689 /*
2683 * We clear this CPU from the mask. This both simplifies the 2690 * We clear this CPU from the mask. This both simplifies the
2684 * inner loop and keeps this_rq locked when we exit: 2691 * inner loop and keeps this_rq locked when we exit:
2685 */ 2692 */
2686 cpu_clear(this_cpu, sibling_map); 2693 cpu_clear(this_cpu, sibling_map);
2687 2694
2688 for_each_cpu_mask(i, sibling_map) { 2695 for_each_cpu_mask(i, sibling_map) {
2689 runqueue_t *smt_rq = cpu_rq(i); 2696 runqueue_t *smt_rq = cpu_rq(i);
2690 2697
2691 wakeup_busy_runqueue(smt_rq); 2698 wakeup_busy_runqueue(smt_rq);
2692 } 2699 }
2693 2700
2694 for_each_cpu_mask(i, sibling_map) 2701 for_each_cpu_mask(i, sibling_map)
2695 spin_unlock(&cpu_rq(i)->lock); 2702 spin_unlock(&cpu_rq(i)->lock);
2696 /* 2703 /*
2697 * We exit with this_cpu's rq still held and IRQs 2704 * We exit with this_cpu's rq still held and IRQs
2698 * still disabled: 2705 * still disabled:
2699 */ 2706 */
2700 } 2707 }
2701 2708
2702 /* 2709 /*
2703 * number of 'lost' timeslices this task won't be able to fully 2710 * number of 'lost' timeslices this task won't be able to fully
2704 * utilize, if another task runs on a sibling. This models the 2711 * utilize, if another task runs on a sibling. This models the
2705 * slowdown effect of other tasks running on siblings: 2712 * slowdown effect of other tasks running on siblings:
2706 */ 2713 */
2707 static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) 2714 static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
2708 { 2715 {
2709 return p->time_slice * (100 - sd->per_cpu_gain) / 100; 2716 return p->time_slice * (100 - sd->per_cpu_gain) / 100;
2710 } 2717 }
2711 2718
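smt_slice() above treats (100 - per_cpu_gain)% of a task's timeslice as lost to the sibling. Evaluated with an assumed per_cpu_gain of 25 (a plausible SMT setting, not taken from this diff), that is 75% of the slice; the sketch below tabulates a few made-up timeslice lengths.

#include <stdio.h>

static unsigned long smt_slice(unsigned long time_slice, int per_cpu_gain)
{
        return time_slice * (100 - per_cpu_gain) / 100;
}

int main(void)
{
        int per_cpu_gain = 25;                          /* assumed */
        unsigned long slices[] = { 25, 100, 200 };      /* made-up, in ticks */
        int i;

        for (i = 0; i < 3; i++)
                printf("time_slice=%lu -> smt_slice=%lu\n",
                       slices[i], smt_slice(slices[i], per_cpu_gain));
        return 0;
}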
2712 static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 2719 static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2713 { 2720 {
2714 struct sched_domain *tmp, *sd = NULL; 2721 struct sched_domain *tmp, *sd = NULL;
2715 cpumask_t sibling_map; 2722 cpumask_t sibling_map;
2716 prio_array_t *array; 2723 prio_array_t *array;
2717 int ret = 0, i; 2724 int ret = 0, i;
2718 task_t *p; 2725 task_t *p;
2719 2726
2720 for_each_domain(this_cpu, tmp) 2727 for_each_domain(this_cpu, tmp)
2721 if (tmp->flags & SD_SHARE_CPUPOWER) 2728 if (tmp->flags & SD_SHARE_CPUPOWER)
2722 sd = tmp; 2729 sd = tmp;
2723 2730
2724 if (!sd) 2731 if (!sd)
2725 return 0; 2732 return 0;
2726 2733
2727 /* 2734 /*
2728 * The same locking rules and details apply as for 2735 * The same locking rules and details apply as for
2729 * wake_sleeping_dependent(): 2736 * wake_sleeping_dependent():
2730 */ 2737 */
2731 spin_unlock(&this_rq->lock); 2738 spin_unlock(&this_rq->lock);
2732 sibling_map = sd->span; 2739 sibling_map = sd->span;
2733 for_each_cpu_mask(i, sibling_map) 2740 for_each_cpu_mask(i, sibling_map)
2734 spin_lock(&cpu_rq(i)->lock); 2741 spin_lock(&cpu_rq(i)->lock);
2735 cpu_clear(this_cpu, sibling_map); 2742 cpu_clear(this_cpu, sibling_map);
2736 2743
2737 /* 2744 /*
2738 * Establish next task to be run - it might have gone away because 2745 * Establish next task to be run - it might have gone away because
2739 * we released the runqueue lock above: 2746 * we released the runqueue lock above:
2740 */ 2747 */
2741 if (!this_rq->nr_running) 2748 if (!this_rq->nr_running)
2742 goto out_unlock; 2749 goto out_unlock;
2743 array = this_rq->active; 2750 array = this_rq->active;
2744 if (!array->nr_active) 2751 if (!array->nr_active)
2745 array = this_rq->expired; 2752 array = this_rq->expired;
2746 BUG_ON(!array->nr_active); 2753 BUG_ON(!array->nr_active);
2747 2754
2748 p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, 2755 p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next,
2749 task_t, run_list); 2756 task_t, run_list);
2750 2757
2751 for_each_cpu_mask(i, sibling_map) { 2758 for_each_cpu_mask(i, sibling_map) {
2752 runqueue_t *smt_rq = cpu_rq(i); 2759 runqueue_t *smt_rq = cpu_rq(i);
2753 task_t *smt_curr = smt_rq->curr; 2760 task_t *smt_curr = smt_rq->curr;
2754 2761
2755 /* Kernel threads do not participate in dependent sleeping */ 2762 /* Kernel threads do not participate in dependent sleeping */
2756 if (!p->mm || !smt_curr->mm || rt_task(p)) 2763 if (!p->mm || !smt_curr->mm || rt_task(p))
2757 goto check_smt_task; 2764 goto check_smt_task;
2758 2765
2759 /* 2766 /*
2760 * If a user task with lower static priority than the 2767 * If a user task with lower static priority than the
2761 * running task on the SMT sibling is trying to schedule, 2768 * running task on the SMT sibling is trying to schedule,
2762 * delay it till there is proportionately less timeslice 2769 * delay it till there is proportionately less timeslice
2763 * left of the sibling task to prevent a lower priority 2770 * left of the sibling task to prevent a lower priority
2764 * task from using an unfair proportion of the 2771 * task from using an unfair proportion of the
2765 * physical cpu's resources. -ck 2772 * physical cpu's resources. -ck
2766 */ 2773 */
2767 if (rt_task(smt_curr)) { 2774 if (rt_task(smt_curr)) {
2768 /* 2775 /*
2769 * With real time tasks we run non-rt tasks only 2776 * With real time tasks we run non-rt tasks only
2770 * per_cpu_gain% of the time. 2777 * per_cpu_gain% of the time.
2771 */ 2778 */
2772 if ((jiffies % DEF_TIMESLICE) > 2779 if ((jiffies % DEF_TIMESLICE) >
2773 (sd->per_cpu_gain * DEF_TIMESLICE / 100)) 2780 (sd->per_cpu_gain * DEF_TIMESLICE / 100))
2774 ret = 1; 2781 ret = 1;
2775 } else 2782 } else
2776 if (smt_curr->static_prio < p->static_prio && 2783 if (smt_curr->static_prio < p->static_prio &&
2777 !TASK_PREEMPTS_CURR(p, smt_rq) && 2784 !TASK_PREEMPTS_CURR(p, smt_rq) &&
2778 smt_slice(smt_curr, sd) > task_timeslice(p)) 2785 smt_slice(smt_curr, sd) > task_timeslice(p))
2779 ret = 1; 2786 ret = 1;
2780 2787
2781 check_smt_task: 2788 check_smt_task:
2782 if ((!smt_curr->mm && smt_curr != smt_rq->idle) || 2789 if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
2783 rt_task(smt_curr)) 2790 rt_task(smt_curr))
2784 continue; 2791 continue;
2785 if (!p->mm) { 2792 if (!p->mm) {
2786 wakeup_busy_runqueue(smt_rq); 2793 wakeup_busy_runqueue(smt_rq);
2787 continue; 2794 continue;
2788 } 2795 }
2789 2796
2790 /* 2797 /*
2791 * Reschedule a lower priority task on the SMT sibling for 2798 * Reschedule a lower priority task on the SMT sibling for
2792 * it to be put to sleep, or wake it up if it has been put to 2799 * it to be put to sleep, or wake it up if it has been put to
2793 * sleep for priority reasons to see if it should run now. 2800 * sleep for priority reasons to see if it should run now.
2794 */ 2801 */
2795 if (rt_task(p)) { 2802 if (rt_task(p)) {
2796 if ((jiffies % DEF_TIMESLICE) > 2803 if ((jiffies % DEF_TIMESLICE) >
2797 (sd->per_cpu_gain * DEF_TIMESLICE / 100)) 2804 (sd->per_cpu_gain * DEF_TIMESLICE / 100))
2798 resched_task(smt_curr); 2805 resched_task(smt_curr);
2799 } else { 2806 } else {
2800 if (TASK_PREEMPTS_CURR(p, smt_rq) && 2807 if (TASK_PREEMPTS_CURR(p, smt_rq) &&
2801 smt_slice(p, sd) > task_timeslice(smt_curr)) 2808 smt_slice(p, sd) > task_timeslice(smt_curr))
2802 resched_task(smt_curr); 2809 resched_task(smt_curr);
2803 else 2810 else
2804 wakeup_busy_runqueue(smt_rq); 2811 wakeup_busy_runqueue(smt_rq);
2805 } 2812 }
2806 } 2813 }
2807 out_unlock: 2814 out_unlock:
2808 for_each_cpu_mask(i, sibling_map) 2815 for_each_cpu_mask(i, sibling_map)
2809 spin_unlock(&cpu_rq(i)->lock); 2816 spin_unlock(&cpu_rq(i)->lock);
2810 return ret; 2817 return ret;
2811 } 2818 }
2812 #else 2819 #else
2813 static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 2820 static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2814 { 2821 {
2815 } 2822 }
2816 2823
2817 static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 2824 static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2818 { 2825 {
2819 return 0; 2826 return 0;
2820 } 2827 }
2821 #endif 2828 #endif
2822 2829
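Both uses of the (jiffies % DEF_TIMESLICE) test in dependent_sleeper() above implement a duty cycle: while an RT task owns the sibling, a non-RT task is only allowed to run during roughly the first per_cpu_gain% of each DEF_TIMESLICE window. The sketch below counts that window; the DEF_TIMESLICE of 100 jiffies and per_cpu_gain of 25 are assumptions for the example, not taken from this diff.

#include <stdio.h>

#define DEF_TIMESLICE 100       /* assumed, in jiffies */

int main(void)
{
        int per_cpu_gain = 25;  /* assumed */
        unsigned long jiffies;
        int allowed = 0;

        /* allowed whenever the kernel's "delay it" test above is false */
        for (jiffies = 0; jiffies < DEF_TIMESLICE; jiffies++)
                if (!((jiffies % DEF_TIMESLICE) >
                      (per_cpu_gain * DEF_TIMESLICE / 100)))
                        allowed++;

        printf("non-RT sibling may run %d of every %d jiffies (%d%%)\n",
               allowed, DEF_TIMESLICE, allowed * 100 / DEF_TIMESLICE);
        return 0;
}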
2823 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) 2830 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
2824 2831
2825 void fastcall add_preempt_count(int val) 2832 void fastcall add_preempt_count(int val)
2826 { 2833 {
2827 /* 2834 /*
2828 * Underflow? 2835 * Underflow?
2829 */ 2836 */
2830 BUG_ON((preempt_count() < 0)); 2837 BUG_ON((preempt_count() < 0));
2831 preempt_count() += val; 2838 preempt_count() += val;
2832 /* 2839 /*
2833 * Spinlock count overflowing soon? 2840 * Spinlock count overflowing soon?
2834 */ 2841 */
2835 BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); 2842 BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10);
2836 } 2843 }
2837 EXPORT_SYMBOL(add_preempt_count); 2844 EXPORT_SYMBOL(add_preempt_count);
2838 2845
2839 void fastcall sub_preempt_count(int val) 2846 void fastcall sub_preempt_count(int val)
2840 { 2847 {
2841 /* 2848 /*
2842 * Underflow? 2849 * Underflow?
2843 */ 2850 */
2844 BUG_ON(val > preempt_count()); 2851 BUG_ON(val > preempt_count());
2845 /* 2852 /*
2846 * Is the spinlock portion underflowing? 2853 * Is the spinlock portion underflowing?
2847 */ 2854 */
2848 BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); 2855 BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK));
2849 preempt_count() -= val; 2856 preempt_count() -= val;
2850 } 2857 }
2851 EXPORT_SYMBOL(sub_preempt_count); 2858 EXPORT_SYMBOL(sub_preempt_count);
2852 2859
2853 #endif 2860 #endif
2854 2861
2855 /* 2862 /*
2856 * schedule() is the main scheduler function. 2863 * schedule() is the main scheduler function.
2857 */ 2864 */
2858 asmlinkage void __sched schedule(void) 2865 asmlinkage void __sched schedule(void)
2859 { 2866 {
2860 long *switch_count; 2867 long *switch_count;
2861 task_t *prev, *next; 2868 task_t *prev, *next;
2862 runqueue_t *rq; 2869 runqueue_t *rq;
2863 prio_array_t *array; 2870 prio_array_t *array;
2864 struct list_head *queue; 2871 struct list_head *queue;
2865 unsigned long long now; 2872 unsigned long long now;
2866 unsigned long run_time; 2873 unsigned long run_time;
2867 int cpu, idx, new_prio; 2874 int cpu, idx, new_prio;
2868 2875
2869 /* 2876 /*
2870 * Test if we are atomic. Since do_exit() needs to call into 2877 * Test if we are atomic. Since do_exit() needs to call into
2871 * schedule() atomically, we ignore that path for now. 2878 * schedule() atomically, we ignore that path for now.
2872 * Otherwise, whine if we are scheduling when we should not be. 2879 * Otherwise, whine if we are scheduling when we should not be.
2873 */ 2880 */
2874 if (likely(!current->exit_state)) { 2881 if (likely(!current->exit_state)) {
2875 if (unlikely(in_atomic())) { 2882 if (unlikely(in_atomic())) {
2876 printk(KERN_ERR "BUG: scheduling while atomic: " 2883 printk(KERN_ERR "BUG: scheduling while atomic: "
2877 "%s/0x%08x/%d\n", 2884 "%s/0x%08x/%d\n",
2878 current->comm, preempt_count(), current->pid); 2885 current->comm, preempt_count(), current->pid);
2879 dump_stack(); 2886 dump_stack();
2880 } 2887 }
2881 } 2888 }
2882 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 2889 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
2883 2890
2884 need_resched: 2891 need_resched:
2885 preempt_disable(); 2892 preempt_disable();
2886 prev = current; 2893 prev = current;
2887 release_kernel_lock(prev); 2894 release_kernel_lock(prev);
2888 need_resched_nonpreemptible: 2895 need_resched_nonpreemptible:
2889 rq = this_rq(); 2896 rq = this_rq();
2890 2897
2891 /* 2898 /*
2892 * The idle thread is not allowed to schedule! 2899 * The idle thread is not allowed to schedule!
2893 * Remove this check after it has been exercised a bit. 2900 * Remove this check after it has been exercised a bit.
2894 */ 2901 */
2895 if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) { 2902 if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
2896 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 2903 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
2897 dump_stack(); 2904 dump_stack();
2898 } 2905 }
2899 2906
2900 schedstat_inc(rq, sched_cnt); 2907 schedstat_inc(rq, sched_cnt);
2901 now = sched_clock(); 2908 now = sched_clock();
2902 if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { 2909 if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
2903 run_time = now - prev->timestamp; 2910 run_time = now - prev->timestamp;
2904 if (unlikely((long long)(now - prev->timestamp) < 0)) 2911 if (unlikely((long long)(now - prev->timestamp) < 0))
2905 run_time = 0; 2912 run_time = 0;
2906 } else 2913 } else
2907 run_time = NS_MAX_SLEEP_AVG; 2914 run_time = NS_MAX_SLEEP_AVG;
2908 2915
2909 /* 2916 /*
2910 * Tasks are charged proportionately less run_time at high sleep_avg to 2917 * Tasks are charged proportionately less run_time at high sleep_avg to
2911 * delay them losing their interactive status 2918 * delay them losing their interactive status
2912 */ 2919 */
2913 run_time /= (CURRENT_BONUS(prev) ? : 1); 2920 run_time /= (CURRENT_BONUS(prev) ? : 1);
2914 2921
2915 spin_lock_irq(&rq->lock); 2922 spin_lock_irq(&rq->lock);
2916 2923
2917 if (unlikely(prev->flags & PF_DEAD)) 2924 if (unlikely(prev->flags & PF_DEAD))
2918 prev->state = EXIT_DEAD; 2925 prev->state = EXIT_DEAD;
2919 2926
2920 switch_count = &prev->nivcsw; 2927 switch_count = &prev->nivcsw;
2921 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 2928 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2922 switch_count = &prev->nvcsw; 2929 switch_count = &prev->nvcsw;
2923 if (unlikely((prev->state & TASK_INTERRUPTIBLE) && 2930 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
2924 unlikely(signal_pending(prev)))) 2931 unlikely(signal_pending(prev))))
2925 prev->state = TASK_RUNNING; 2932 prev->state = TASK_RUNNING;
2926 else { 2933 else {
2927 if (prev->state == TASK_UNINTERRUPTIBLE) 2934 if (prev->state == TASK_UNINTERRUPTIBLE)
2928 rq->nr_uninterruptible++; 2935 rq->nr_uninterruptible++;
2929 deactivate_task(prev, rq); 2936 deactivate_task(prev, rq);
2930 } 2937 }
2931 } 2938 }
2932 2939
2933 cpu = smp_processor_id(); 2940 cpu = smp_processor_id();
2934 if (unlikely(!rq->nr_running)) { 2941 if (unlikely(!rq->nr_running)) {
2935 go_idle: 2942 go_idle:
2936 idle_balance(cpu, rq); 2943 idle_balance(cpu, rq);
2937 if (!rq->nr_running) { 2944 if (!rq->nr_running) {
2938 next = rq->idle; 2945 next = rq->idle;
2939 rq->expired_timestamp = 0; 2946 rq->expired_timestamp = 0;
2940 wake_sleeping_dependent(cpu, rq); 2947 wake_sleeping_dependent(cpu, rq);
2941 /* 2948 /*
2942 * wake_sleeping_dependent() might have released 2949 * wake_sleeping_dependent() might have released
2943 * the runqueue, so break out if we got new 2950 * the runqueue, so break out if we got new
2944 * tasks meanwhile: 2951 * tasks meanwhile:
2945 */ 2952 */
2946 if (!rq->nr_running) 2953 if (!rq->nr_running)
2947 goto switch_tasks; 2954 goto switch_tasks;
2948 } 2955 }
2949 } else { 2956 } else {
2950 if (dependent_sleeper(cpu, rq)) { 2957 if (dependent_sleeper(cpu, rq)) {
2951 next = rq->idle; 2958 next = rq->idle;
2952 goto switch_tasks; 2959 goto switch_tasks;
2953 } 2960 }
2954 /* 2961 /*
2955 * dependent_sleeper() releases and reacquires the runqueue 2962 * dependent_sleeper() releases and reacquires the runqueue
2956 * lock, hence go into the idle loop if the rq went 2963 * lock, hence go into the idle loop if the rq went
2957 * empty meanwhile: 2964 * empty meanwhile:
2958 */ 2965 */
2959 if (unlikely(!rq->nr_running)) 2966 if (unlikely(!rq->nr_running))
2960 goto go_idle; 2967 goto go_idle;
2961 } 2968 }
2962 2969
2963 array = rq->active; 2970 array = rq->active;
2964 if (unlikely(!array->nr_active)) { 2971 if (unlikely(!array->nr_active)) {
2965 /* 2972 /*
2966 * Switch the active and expired arrays. 2973 * Switch the active and expired arrays.
2967 */ 2974 */
2968 schedstat_inc(rq, sched_switch); 2975 schedstat_inc(rq, sched_switch);
2969 rq->active = rq->expired; 2976 rq->active = rq->expired;
2970 rq->expired = array; 2977 rq->expired = array;
2971 array = rq->active; 2978 array = rq->active;
2972 rq->expired_timestamp = 0; 2979 rq->expired_timestamp = 0;
2973 rq->best_expired_prio = MAX_PRIO; 2980 rq->best_expired_prio = MAX_PRIO;
2974 } 2981 }
2975 2982
2976 idx = sched_find_first_bit(array->bitmap); 2983 idx = sched_find_first_bit(array->bitmap);
2977 queue = array->queue + idx; 2984 queue = array->queue + idx;
2978 next = list_entry(queue->next, task_t, run_list); 2985 next = list_entry(queue->next, task_t, run_list);
2979 2986
2980 if (!rt_task(next) && next->activated > 0) { 2987 if (!rt_task(next) && next->activated > 0) {
2981 unsigned long long delta = now - next->timestamp; 2988 unsigned long long delta = now - next->timestamp;
2982 if (unlikely((long long)(now - next->timestamp) < 0)) 2989 if (unlikely((long long)(now - next->timestamp) < 0))
2983 delta = 0; 2990 delta = 0;
2984 2991
2985 if (next->activated == 1) 2992 if (next->activated == 1)
2986 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; 2993 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
2987 2994
2988 array = next->array; 2995 array = next->array;
2989 new_prio = recalc_task_prio(next, next->timestamp + delta); 2996 new_prio = recalc_task_prio(next, next->timestamp + delta);
2990 2997
2991 if (unlikely(next->prio != new_prio)) { 2998 if (unlikely(next->prio != new_prio)) {
2992 dequeue_task(next, array); 2999 dequeue_task(next, array);
2993 next->prio = new_prio; 3000 next->prio = new_prio;
2994 enqueue_task(next, array); 3001 enqueue_task(next, array);
2995 } else 3002 } else
2996 requeue_task(next, array); 3003 requeue_task(next, array);
2997 } 3004 }
2998 next->activated = 0; 3005 next->activated = 0;
2999 switch_tasks: 3006 switch_tasks:
3000 if (next == rq->idle) 3007 if (next == rq->idle)
3001 schedstat_inc(rq, sched_goidle); 3008 schedstat_inc(rq, sched_goidle);
3002 prefetch(next); 3009 prefetch(next);
3003 prefetch_stack(next); 3010 prefetch_stack(next);
3004 clear_tsk_need_resched(prev); 3011 clear_tsk_need_resched(prev);
3005 rcu_qsctr_inc(task_cpu(prev)); 3012 rcu_qsctr_inc(task_cpu(prev));
3006 3013
3007 update_cpu_clock(prev, rq, now); 3014 update_cpu_clock(prev, rq, now);
3008 3015
3009 prev->sleep_avg -= run_time; 3016 prev->sleep_avg -= run_time;
3010 if ((long)prev->sleep_avg <= 0) 3017 if ((long)prev->sleep_avg <= 0)
3011 prev->sleep_avg = 0; 3018 prev->sleep_avg = 0;
3012 prev->timestamp = prev->last_ran = now; 3019 prev->timestamp = prev->last_ran = now;
3013 3020
3014 sched_info_switch(prev, next); 3021 sched_info_switch(prev, next);
3015 if (likely(prev != next)) { 3022 if (likely(prev != next)) {
3016 next->timestamp = now; 3023 next->timestamp = now;
3017 rq->nr_switches++; 3024 rq->nr_switches++;
3018 rq->curr = next; 3025 rq->curr = next;
3019 ++*switch_count; 3026 ++*switch_count;
3020 3027
3021 prepare_task_switch(rq, next); 3028 prepare_task_switch(rq, next);
3022 prev = context_switch(rq, prev, next); 3029 prev = context_switch(rq, prev, next);
3023 barrier(); 3030 barrier();
3024 /* 3031 /*
3025 * this_rq must be evaluated again because prev may have moved 3032 * this_rq must be evaluated again because prev may have moved
3026 * CPUs since it called schedule(), thus the 'rq' on its stack 3033 * CPUs since it called schedule(), thus the 'rq' on its stack
3027 * frame will be invalid. 3034 * frame will be invalid.
3028 */ 3035 */
3029 finish_task_switch(this_rq(), prev); 3036 finish_task_switch(this_rq(), prev);
3030 } else 3037 } else
3031 spin_unlock_irq(&rq->lock); 3038 spin_unlock_irq(&rq->lock);
3032 3039
3033 prev = current; 3040 prev = current;
3034 if (unlikely(reacquire_kernel_lock(prev) < 0)) 3041 if (unlikely(reacquire_kernel_lock(prev) < 0))
3035 goto need_resched_nonpreemptible; 3042 goto need_resched_nonpreemptible;
3036 preempt_enable_no_resched(); 3043 preempt_enable_no_resched();
3037 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3044 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3038 goto need_resched; 3045 goto need_resched;
3039 } 3046 }
3040 3047
3041 EXPORT_SYMBOL(schedule); 3048 EXPORT_SYMBOL(schedule);
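schedule() is normally reached from a wait loop rather than called in isolation. A minimal sketch of the canonical pattern, assuming a hypothetical driver condition my_event_ready and wait queue my_wq (neither name appears in this file):

    #include <linux/sched.h>
    #include <linux/wait.h>

    static DECLARE_WAIT_QUEUE_HEAD(my_wq);      /* hypothetical wait queue */
    static int my_event_ready;                  /* hypothetical condition  */

    static int wait_for_my_event(void)
    {
            DEFINE_WAIT(wait);

            for (;;) {
                    /* Register on the queue and set the task state before
                     * testing the condition, so a wakeup that races with
                     * the check is not lost. */
                    prepare_to_wait(&my_wq, &wait, TASK_INTERRUPTIBLE);
                    if (my_event_ready)
                            break;
                    if (signal_pending(current)) {
                            finish_wait(&my_wq, &wait);
                            return -ERESTARTSYS;
                    }
                    schedule();                 /* sleep until woken up */
            }
            finish_wait(&my_wq, &wait);
            return 0;
    }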
3042 3049
3043 #ifdef CONFIG_PREEMPT 3050 #ifdef CONFIG_PREEMPT
3044 /* 3051 /*
3045 * this is the entry point to schedule() from in-kernel preemption 3052 * this is the entry point to schedule() from in-kernel preemption
3046 * off of preempt_enable. Kernel preemptions off return from interrupt 3053 * off of preempt_enable. Kernel preemptions off return from interrupt
3047 * occur there and call schedule directly. 3054 * occur there and call schedule directly.
3048 */ 3055 */
3049 asmlinkage void __sched preempt_schedule(void) 3056 asmlinkage void __sched preempt_schedule(void)
3050 { 3057 {
3051 struct thread_info *ti = current_thread_info(); 3058 struct thread_info *ti = current_thread_info();
3052 #ifdef CONFIG_PREEMPT_BKL 3059 #ifdef CONFIG_PREEMPT_BKL
3053 struct task_struct *task = current; 3060 struct task_struct *task = current;
3054 int saved_lock_depth; 3061 int saved_lock_depth;
3055 #endif 3062 #endif
3056 /* 3063 /*
3057 * If there is a non-zero preempt_count or interrupts are disabled, 3064 * If there is a non-zero preempt_count or interrupts are disabled,
3058 * we do not want to preempt the current task. Just return.. 3065 * we do not want to preempt the current task. Just return..
3059 */ 3066 */
3060 if (unlikely(ti->preempt_count || irqs_disabled())) 3067 if (unlikely(ti->preempt_count || irqs_disabled()))
3061 return; 3068 return;
3062 3069
3063 need_resched: 3070 need_resched:
3064 add_preempt_count(PREEMPT_ACTIVE); 3071 add_preempt_count(PREEMPT_ACTIVE);
3065 /* 3072 /*
3066 * We keep the big kernel semaphore locked, but we 3073 * We keep the big kernel semaphore locked, but we
3067 * clear ->lock_depth so that schedule() doesn't 3074 * clear ->lock_depth so that schedule() doesn't
3068 * auto-release the semaphore: 3075 * auto-release the semaphore:
3069 */ 3076 */
3070 #ifdef CONFIG_PREEMPT_BKL 3077 #ifdef CONFIG_PREEMPT_BKL
3071 saved_lock_depth = task->lock_depth; 3078 saved_lock_depth = task->lock_depth;
3072 task->lock_depth = -1; 3079 task->lock_depth = -1;
3073 #endif 3080 #endif
3074 schedule(); 3081 schedule();
3075 #ifdef CONFIG_PREEMPT_BKL 3082 #ifdef CONFIG_PREEMPT_BKL
3076 task->lock_depth = saved_lock_depth; 3083 task->lock_depth = saved_lock_depth;
3077 #endif 3084 #endif
3078 sub_preempt_count(PREEMPT_ACTIVE); 3085 sub_preempt_count(PREEMPT_ACTIVE);
3079 3086
3080 /* we could miss a preemption opportunity between schedule and now */ 3087 /* we could miss a preemption opportunity between schedule and now */
3081 barrier(); 3088 barrier();
3082 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3089 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3083 goto need_resched; 3090 goto need_resched;
3084 } 3091 }
3085 3092
3086 EXPORT_SYMBOL(preempt_schedule); 3093 EXPORT_SYMBOL(preempt_schedule);
3087 3094
3088 /* 3095 /*
3089 * this is the entry point to schedule() from kernel preemption 3096 * this is the entry point to schedule() from kernel preemption
3090 * off of irq context. 3097 * off of irq context.
3091 * Note that this is called and returns with irqs disabled. This will 3098 * Note that this is called and returns with irqs disabled. This will
3092 * protect us against recursive calling from irq. 3099 * protect us against recursive calling from irq.
3093 */ 3100 */
3094 asmlinkage void __sched preempt_schedule_irq(void) 3101 asmlinkage void __sched preempt_schedule_irq(void)
3095 { 3102 {
3096 struct thread_info *ti = current_thread_info(); 3103 struct thread_info *ti = current_thread_info();
3097 #ifdef CONFIG_PREEMPT_BKL 3104 #ifdef CONFIG_PREEMPT_BKL
3098 struct task_struct *task = current; 3105 struct task_struct *task = current;
3099 int saved_lock_depth; 3106 int saved_lock_depth;
3100 #endif 3107 #endif
3101 /* Catch callers which need to be fixed */ 3108 /* Catch callers which need to be fixed */
3102 BUG_ON(ti->preempt_count || !irqs_disabled()); 3109 BUG_ON(ti->preempt_count || !irqs_disabled());
3103 3110
3104 need_resched: 3111 need_resched:
3105 add_preempt_count(PREEMPT_ACTIVE); 3112 add_preempt_count(PREEMPT_ACTIVE);
3106 /* 3113 /*
3107 * We keep the big kernel semaphore locked, but we 3114 * We keep the big kernel semaphore locked, but we
3108 * clear ->lock_depth so that schedule() doesn't 3115 * clear ->lock_depth so that schedule() doesn't
3109 * auto-release the semaphore: 3116 * auto-release the semaphore:
3110 */ 3117 */
3111 #ifdef CONFIG_PREEMPT_BKL 3118 #ifdef CONFIG_PREEMPT_BKL
3112 saved_lock_depth = task->lock_depth; 3119 saved_lock_depth = task->lock_depth;
3113 task->lock_depth = -1; 3120 task->lock_depth = -1;
3114 #endif 3121 #endif
3115 local_irq_enable(); 3122 local_irq_enable();
3116 schedule(); 3123 schedule();
3117 local_irq_disable(); 3124 local_irq_disable();
3118 #ifdef CONFIG_PREEMPT_BKL 3125 #ifdef CONFIG_PREEMPT_BKL
3119 task->lock_depth = saved_lock_depth; 3126 task->lock_depth = saved_lock_depth;
3120 #endif 3127 #endif
3121 sub_preempt_count(PREEMPT_ACTIVE); 3128 sub_preempt_count(PREEMPT_ACTIVE);
3122 3129
3123 /* we could miss a preemption opportunity between schedule and now */ 3130 /* we could miss a preemption opportunity between schedule and now */
3124 barrier(); 3131 barrier();
3125 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3132 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3126 goto need_resched; 3133 goto need_resched;
3127 } 3134 }
3128 3135
3129 #endif /* CONFIG_PREEMPT */ 3136 #endif /* CONFIG_PREEMPT */
3130 3137
3131 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, 3138 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3132 void *key) 3139 void *key)
3133 { 3140 {
3134 task_t *p = curr->private; 3141 task_t *p = curr->private;
3135 return try_to_wake_up(p, mode, sync); 3142 return try_to_wake_up(p, mode, sync);
3136 } 3143 }
3137 3144
3138 EXPORT_SYMBOL(default_wake_function); 3145 EXPORT_SYMBOL(default_wake_function);
3139 3146
3140 /* 3147 /*
3141 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just 3148 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3142 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve 3149 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3143 * number) then we wake all the non-exclusive tasks and one exclusive task. 3150 * number) then we wake all the non-exclusive tasks and one exclusive task.
3144 * 3151 *
3145 * There are circumstances in which we can try to wake a task which has already 3152 * There are circumstances in which we can try to wake a task which has already
3146 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 3153 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3147 * zero in this (rare) case, and we handle it by continuing to scan the queue. 3154 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3148 */ 3155 */
3149 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 3156 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3150 int nr_exclusive, int sync, void *key) 3157 int nr_exclusive, int sync, void *key)
3151 { 3158 {
3152 struct list_head *tmp, *next; 3159 struct list_head *tmp, *next;
3153 3160
3154 list_for_each_safe(tmp, next, &q->task_list) { 3161 list_for_each_safe(tmp, next, &q->task_list) {
3155 wait_queue_t *curr; 3162 wait_queue_t *curr;
3156 unsigned flags; 3163 unsigned flags;
3157 curr = list_entry(tmp, wait_queue_t, task_list); 3164 curr = list_entry(tmp, wait_queue_t, task_list);
3158 flags = curr->flags; 3165 flags = curr->flags;
3159 if (curr->func(curr, mode, sync, key) && 3166 if (curr->func(curr, mode, sync, key) &&
3160 (flags & WQ_FLAG_EXCLUSIVE) && 3167 (flags & WQ_FLAG_EXCLUSIVE) &&
3161 !--nr_exclusive) 3168 !--nr_exclusive)
3162 break; 3169 break;
3163 } 3170 }
3164 } 3171 }
3165 3172
3166 /** 3173 /**
3167 * __wake_up - wake up threads blocked on a waitqueue. 3174 * __wake_up - wake up threads blocked on a waitqueue.
3168 * @q: the waitqueue 3175 * @q: the waitqueue
3169 * @mode: which threads 3176 * @mode: which threads
3170 * @nr_exclusive: how many wake-one or wake-many threads to wake up 3177 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3171 * @key: is directly passed to the wakeup function 3178 * @key: is directly passed to the wakeup function
3172 */ 3179 */
3173 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, 3180 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
3174 int nr_exclusive, void *key) 3181 int nr_exclusive, void *key)
3175 { 3182 {
3176 unsigned long flags; 3183 unsigned long flags;
3177 3184
3178 spin_lock_irqsave(&q->lock, flags); 3185 spin_lock_irqsave(&q->lock, flags);
3179 __wake_up_common(q, mode, nr_exclusive, 0, key); 3186 __wake_up_common(q, mode, nr_exclusive, 0, key);
3180 spin_unlock_irqrestore(&q->lock, flags); 3187 spin_unlock_irqrestore(&q->lock, flags);
3181 } 3188 }
3182 3189
3183 EXPORT_SYMBOL(__wake_up); 3190 EXPORT_SYMBOL(__wake_up);
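The waker side typically sets the condition and then calls one of the wake_up*() wrappers, all of which funnel into __wake_up(). A minimal sketch, reusing the hypothetical my_wq/my_event_ready names from the sketch after schedule() above:

    static void signal_my_event(void)
    {
            my_event_ready = 1;
            /* Non-exclusive wakeup: every TASK_INTERRUPTIBLE waiter on the
             * queue is woken; nr_exclusive (1 here) only limits waiters that
             * set WQ_FLAG_EXCLUSIVE, and the waiter sketch registers none.
             * The wait-queue lock taken inside __wake_up() orders the store
             * to my_event_ready against the waiter's re-check. */
            wake_up_interruptible(&my_wq);
    }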
3184 3191
3185 /* 3192 /*
3186 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 3193 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3187 */ 3194 */
3188 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) 3195 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3189 { 3196 {
3190 __wake_up_common(q, mode, 1, 0, NULL); 3197 __wake_up_common(q, mode, 1, 0, NULL);
3191 } 3198 }
3192 3199
3193 /** 3200 /**
3194 * __wake_up_sync - wake up threads blocked on a waitqueue. 3201 * __wake_up_sync - wake up threads blocked on a waitqueue.
3195 * @q: the waitqueue 3202 * @q: the waitqueue
3196 * @mode: which threads 3203 * @mode: which threads
3197 * @nr_exclusive: how many wake-one or wake-many threads to wake up 3204 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3198 * 3205 *
3199 * The sync wakeup differs in that the waker knows that it will schedule 3206 * The sync wakeup differs in that the waker knows that it will schedule
3200 * away soon, so while the target thread will be woken up, it will not 3207 * away soon, so while the target thread will be woken up, it will not
3201 * be migrated to another CPU - ie. the two threads are 'synchronized' 3208 * be migrated to another CPU - ie. the two threads are 'synchronized'
3202 * with each other. This can prevent needless bouncing between CPUs. 3209 * with each other. This can prevent needless bouncing between CPUs.
3203 * 3210 *
3204 * On UP it can prevent extra preemption. 3211 * On UP it can prevent extra preemption.
3205 */ 3212 */
3206 void fastcall 3213 void fastcall
3207 __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 3214 __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3208 { 3215 {
3209 unsigned long flags; 3216 unsigned long flags;
3210 int sync = 1; 3217 int sync = 1;
3211 3218
3212 if (unlikely(!q)) 3219 if (unlikely(!q))
3213 return; 3220 return;
3214 3221
3215 if (unlikely(!nr_exclusive)) 3222 if (unlikely(!nr_exclusive))
3216 sync = 0; 3223 sync = 0;
3217 3224
3218 spin_lock_irqsave(&q->lock, flags); 3225 spin_lock_irqsave(&q->lock, flags);
3219 __wake_up_common(q, mode, nr_exclusive, sync, NULL); 3226 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3220 spin_unlock_irqrestore(&q->lock, flags); 3227 spin_unlock_irqrestore(&q->lock, flags);
3221 } 3228 }
3222 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 3229 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
3223 3230
3224 void fastcall complete(struct completion *x) 3231 void fastcall complete(struct completion *x)
3225 { 3232 {
3226 unsigned long flags; 3233 unsigned long flags;
3227 3234
3228 spin_lock_irqsave(&x->wait.lock, flags); 3235 spin_lock_irqsave(&x->wait.lock, flags);
3229 x->done++; 3236 x->done++;
3230 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 3237 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3231 1, 0, NULL); 3238 1, 0, NULL);
3232 spin_unlock_irqrestore(&x->wait.lock, flags); 3239 spin_unlock_irqrestore(&x->wait.lock, flags);
3233 } 3240 }
3234 EXPORT_SYMBOL(complete); 3241 EXPORT_SYMBOL(complete);
3235 3242
3236 void fastcall complete_all(struct completion *x) 3243 void fastcall complete_all(struct completion *x)
3237 { 3244 {
3238 unsigned long flags; 3245 unsigned long flags;
3239 3246
3240 spin_lock_irqsave(&x->wait.lock, flags); 3247 spin_lock_irqsave(&x->wait.lock, flags);
3241 x->done += UINT_MAX/2; 3248 x->done += UINT_MAX/2;
3242 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 3249 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3243 0, 0, NULL); 3250 0, 0, NULL);
3244 spin_unlock_irqrestore(&x->wait.lock, flags); 3251 spin_unlock_irqrestore(&x->wait.lock, flags);
3245 } 3252 }
3246 EXPORT_SYMBOL(complete_all); 3253 EXPORT_SYMBOL(complete_all);
3247 3254
3248 void fastcall __sched wait_for_completion(struct completion *x) 3255 void fastcall __sched wait_for_completion(struct completion *x)
3249 { 3256 {
3250 might_sleep(); 3257 might_sleep();
3251 spin_lock_irq(&x->wait.lock); 3258 spin_lock_irq(&x->wait.lock);
3252 if (!x->done) { 3259 if (!x->done) {
3253 DECLARE_WAITQUEUE(wait, current); 3260 DECLARE_WAITQUEUE(wait, current);
3254 3261
3255 wait.flags |= WQ_FLAG_EXCLUSIVE; 3262 wait.flags |= WQ_FLAG_EXCLUSIVE;
3256 __add_wait_queue_tail(&x->wait, &wait); 3263 __add_wait_queue_tail(&x->wait, &wait);
3257 do { 3264 do {
3258 __set_current_state(TASK_UNINTERRUPTIBLE); 3265 __set_current_state(TASK_UNINTERRUPTIBLE);
3259 spin_unlock_irq(&x->wait.lock); 3266 spin_unlock_irq(&x->wait.lock);
3260 schedule(); 3267 schedule();
3261 spin_lock_irq(&x->wait.lock); 3268 spin_lock_irq(&x->wait.lock);
3262 } while (!x->done); 3269 } while (!x->done);
3263 __remove_wait_queue(&x->wait, &wait); 3270 __remove_wait_queue(&x->wait, &wait);
3264 } 3271 }
3265 x->done--; 3272 x->done--;
3266 spin_unlock_irq(&x->wait.lock); 3273 spin_unlock_irq(&x->wait.lock);
3267 } 3274 }
3268 EXPORT_SYMBOL(wait_for_completion); 3275 EXPORT_SYMBOL(wait_for_completion);
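A minimal sketch of the completion API defined here, assuming a hypothetical worker started with kthread_run(); the names setup_done, worker_fn and start_and_wait are illustrative and not taken from this file:

    #include <linux/completion.h>
    #include <linux/kthread.h>
    #include <linux/err.h>

    static DECLARE_COMPLETION(setup_done);      /* hypothetical completion */

    static int worker_fn(void *data)
    {
            /* ... perform one-time initialisation ... */
            complete(&setup_done);              /* wake exactly one waiter */
            return 0;
    }

    static int start_and_wait(void)
    {
            struct task_struct *tsk = kthread_run(worker_fn, NULL, "worker");

            if (IS_ERR(tsk))
                    return PTR_ERR(tsk);
            wait_for_completion(&setup_done);   /* sleeps TASK_UNINTERRUPTIBLE */
            return 0;
    }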
3269 3276
3270 unsigned long fastcall __sched 3277 unsigned long fastcall __sched
3271 wait_for_completion_timeout(struct completion *x, unsigned long timeout) 3278 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3272 { 3279 {
3273 might_sleep(); 3280 might_sleep();
3274 3281
3275 spin_lock_irq(&x->wait.lock); 3282 spin_lock_irq(&x->wait.lock);
3276 if (!x->done) { 3283 if (!x->done) {
3277 DECLARE_WAITQUEUE(wait, current); 3284 DECLARE_WAITQUEUE(wait, current);
3278 3285
3279 wait.flags |= WQ_FLAG_EXCLUSIVE; 3286 wait.flags |= WQ_FLAG_EXCLUSIVE;
3280 __add_wait_queue_tail(&x->wait, &wait); 3287 __add_wait_queue_tail(&x->wait, &wait);
3281 do { 3288 do {
3282 __set_current_state(TASK_UNINTERRUPTIBLE); 3289 __set_current_state(TASK_UNINTERRUPTIBLE);
3283 spin_unlock_irq(&x->wait.lock); 3290 spin_unlock_irq(&x->wait.lock);
3284 timeout = schedule_timeout(timeout); 3291 timeout = schedule_timeout(timeout);
3285 spin_lock_irq(&x->wait.lock); 3292 spin_lock_irq(&x->wait.lock);
3286 if (!timeout) { 3293 if (!timeout) {
3287 __remove_wait_queue(&x->wait, &wait); 3294 __remove_wait_queue(&x->wait, &wait);
3288 goto out; 3295 goto out;
3289 } 3296 }
3290 } while (!x->done); 3297 } while (!x->done);
3291 __remove_wait_queue(&x->wait, &wait); 3298 __remove_wait_queue(&x->wait, &wait);
3292 } 3299 }
3293 x->done--; 3300 x->done--;
3294 out: 3301 out:
3295 spin_unlock_irq(&x->wait.lock); 3302 spin_unlock_irq(&x->wait.lock);
3296 return timeout; 3303 return timeout;
3297 } 3304 }
3298 EXPORT_SYMBOL(wait_for_completion_timeout); 3305 EXPORT_SYMBOL(wait_for_completion_timeout);
3299 3306
3300 int fastcall __sched wait_for_completion_interruptible(struct completion *x) 3307 int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3301 { 3308 {
3302 int ret = 0; 3309 int ret = 0;
3303 3310
3304 might_sleep(); 3311 might_sleep();
3305 3312
3306 spin_lock_irq(&x->wait.lock); 3313 spin_lock_irq(&x->wait.lock);
3307 if (!x->done) { 3314 if (!x->done) {
3308 DECLARE_WAITQUEUE(wait, current); 3315 DECLARE_WAITQUEUE(wait, current);
3309 3316
3310 wait.flags |= WQ_FLAG_EXCLUSIVE; 3317 wait.flags |= WQ_FLAG_EXCLUSIVE;
3311 __add_wait_queue_tail(&x->wait, &wait); 3318 __add_wait_queue_tail(&x->wait, &wait);
3312 do { 3319 do {
3313 if (signal_pending(current)) { 3320 if (signal_pending(current)) {
3314 ret = -ERESTARTSYS; 3321 ret = -ERESTARTSYS;
3315 __remove_wait_queue(&x->wait, &wait); 3322 __remove_wait_queue(&x->wait, &wait);
3316 goto out; 3323 goto out;
3317 } 3324 }
3318 __set_current_state(TASK_INTERRUPTIBLE); 3325 __set_current_state(TASK_INTERRUPTIBLE);
3319 spin_unlock_irq(&x->wait.lock); 3326 spin_unlock_irq(&x->wait.lock);
3320 schedule(); 3327 schedule();
3321 spin_lock_irq(&x->wait.lock); 3328 spin_lock_irq(&x->wait.lock);
3322 } while (!x->done); 3329 } while (!x->done);
3323 __remove_wait_queue(&x->wait, &wait); 3330 __remove_wait_queue(&x->wait, &wait);
3324 } 3331 }
3325 x->done--; 3332 x->done--;
3326 out: 3333 out:
3327 spin_unlock_irq(&x->wait.lock); 3334 spin_unlock_irq(&x->wait.lock);
3328 3335
3329 return ret; 3336 return ret;
3330 } 3337 }
3331 EXPORT_SYMBOL(wait_for_completion_interruptible); 3338 EXPORT_SYMBOL(wait_for_completion_interruptible);
3332 3339
3333 unsigned long fastcall __sched 3340 unsigned long fastcall __sched
3334 wait_for_completion_interruptible_timeout(struct completion *x, 3341 wait_for_completion_interruptible_timeout(struct completion *x,
3335 unsigned long timeout) 3342 unsigned long timeout)
3336 { 3343 {
3337 might_sleep(); 3344 might_sleep();
3338 3345
3339 spin_lock_irq(&x->wait.lock); 3346 spin_lock_irq(&x->wait.lock);
3340 if (!x->done) { 3347 if (!x->done) {
3341 DECLARE_WAITQUEUE(wait, current); 3348 DECLARE_WAITQUEUE(wait, current);
3342 3349
3343 wait.flags |= WQ_FLAG_EXCLUSIVE; 3350 wait.flags |= WQ_FLAG_EXCLUSIVE;
3344 __add_wait_queue_tail(&x->wait, &wait); 3351 __add_wait_queue_tail(&x->wait, &wait);
3345 do { 3352 do {
3346 if (signal_pending(current)) { 3353 if (signal_pending(current)) {
3347 timeout = -ERESTARTSYS; 3354 timeout = -ERESTARTSYS;
3348 __remove_wait_queue(&x->wait, &wait); 3355 __remove_wait_queue(&x->wait, &wait);
3349 goto out; 3356 goto out;
3350 } 3357 }
3351 __set_current_state(TASK_INTERRUPTIBLE); 3358 __set_current_state(TASK_INTERRUPTIBLE);
3352 spin_unlock_irq(&x->wait.lock); 3359 spin_unlock_irq(&x->wait.lock);
3353 timeout = schedule_timeout(timeout); 3360 timeout = schedule_timeout(timeout);
3354 spin_lock_irq(&x->wait.lock); 3361 spin_lock_irq(&x->wait.lock);
3355 if (!timeout) { 3362 if (!timeout) {
3356 __remove_wait_queue(&x->wait, &wait); 3363 __remove_wait_queue(&x->wait, &wait);
3357 goto out; 3364 goto out;
3358 } 3365 }
3359 } while (!x->done); 3366 } while (!x->done);
3360 __remove_wait_queue(&x->wait, &wait); 3367 __remove_wait_queue(&x->wait, &wait);
3361 } 3368 }
3362 x->done--; 3369 x->done--;
3363 out: 3370 out:
3364 spin_unlock_irq(&x->wait.lock); 3371 spin_unlock_irq(&x->wait.lock);
3365 return timeout; 3372 return timeout;
3366 } 3373 }
3367 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 3374 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
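The timeout/interruptible variants fold three outcomes into a single return value: 0 on timeout, a negative errno cast into the unsigned return on signal, and the remaining jiffies on completion. A minimal sketch of how a caller would typically distinguish them, reusing the hypothetical setup_done completion from the previous sketch and assuming msecs_to_jiffies() is available:

    static int wait_for_setup(void)
    {
            unsigned long ret;

            ret = wait_for_completion_interruptible_timeout(&setup_done,
                                                    msecs_to_jiffies(500));
            if (ret == 0)
                    return -ETIMEDOUT;          /* timed out, not completed   */
            if ((long)ret < 0)
                    return (long)ret;           /* -ERESTARTSYS: signal arrived */
            return 0;                           /* completed; ret = jiffies left */
    }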
3368 3375
3369 3376
3370 #define SLEEP_ON_VAR \ 3377 #define SLEEP_ON_VAR \
3371 unsigned long flags; \ 3378 unsigned long flags; \
3372 wait_queue_t wait; \ 3379 wait_queue_t wait; \
3373 init_waitqueue_entry(&wait, current); 3380 init_waitqueue_entry(&wait, current);
3374 3381
3375 #define SLEEP_ON_HEAD \ 3382 #define SLEEP_ON_HEAD \
3376 spin_lock_irqsave(&q->lock,flags); \ 3383 spin_lock_irqsave(&q->lock,flags); \
3377 __add_wait_queue(q, &wait); \ 3384 __add_wait_queue(q, &wait); \
3378 spin_unlock(&q->lock); 3385 spin_unlock(&q->lock);
3379 3386
3380 #define SLEEP_ON_TAIL \ 3387 #define SLEEP_ON_TAIL \
3381 spin_lock_irq(&q->lock); \ 3388 spin_lock_irq(&q->lock); \
3382 __remove_wait_queue(q, &wait); \ 3389 __remove_wait_queue(q, &wait); \
3383 spin_unlock_irqrestore(&q->lock, flags); 3390 spin_unlock_irqrestore(&q->lock, flags);
3384 3391
3385 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) 3392 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3386 { 3393 {
3387 SLEEP_ON_VAR 3394 SLEEP_ON_VAR
3388 3395
3389 current->state = TASK_INTERRUPTIBLE; 3396 current->state = TASK_INTERRUPTIBLE;
3390 3397
3391 SLEEP_ON_HEAD 3398 SLEEP_ON_HEAD
3392 schedule(); 3399 schedule();
3393 SLEEP_ON_TAIL 3400 SLEEP_ON_TAIL
3394 } 3401 }
3395 3402
3396 EXPORT_SYMBOL(interruptible_sleep_on); 3403 EXPORT_SYMBOL(interruptible_sleep_on);
3397 3404
3398 long fastcall __sched 3405 long fastcall __sched
3399 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 3406 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3400 { 3407 {
3401 SLEEP_ON_VAR 3408 SLEEP_ON_VAR
3402 3409
3403 current->state = TASK_INTERRUPTIBLE; 3410 current->state = TASK_INTERRUPTIBLE;
3404 3411
3405 SLEEP_ON_HEAD 3412 SLEEP_ON_HEAD
3406 timeout = schedule_timeout(timeout); 3413 timeout = schedule_timeout(timeout);
3407 SLEEP_ON_TAIL 3414 SLEEP_ON_TAIL
3408 3415
3409 return timeout; 3416 return timeout;
3410 } 3417 }
3411 3418
3412 EXPORT_SYMBOL(interruptible_sleep_on_timeout); 3419 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3413 3420
3414 void fastcall __sched sleep_on(wait_queue_head_t *q) 3421 void fastcall __sched sleep_on(wait_queue_head_t *q)
3415 { 3422 {
3416 SLEEP_ON_VAR 3423 SLEEP_ON_VAR
3417 3424
3418 current->state = TASK_UNINTERRUPTIBLE; 3425 current->state = TASK_UNINTERRUPTIBLE;
3419 3426
3420 SLEEP_ON_HEAD 3427 SLEEP_ON_HEAD
3421 schedule(); 3428 schedule();
3422 SLEEP_ON_TAIL 3429 SLEEP_ON_TAIL
3423 } 3430 }
3424 3431
3425 EXPORT_SYMBOL(sleep_on); 3432 EXPORT_SYMBOL(sleep_on);
3426 3433
3427 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 3434 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3428 { 3435 {
3429 SLEEP_ON_VAR 3436 SLEEP_ON_VAR
3430 3437
3431 current->state = TASK_UNINTERRUPTIBLE; 3438 current->state = TASK_UNINTERRUPTIBLE;
3432 3439
3433 SLEEP_ON_HEAD 3440 SLEEP_ON_HEAD
3434 timeout = schedule_timeout(timeout); 3441 timeout = schedule_timeout(timeout);
3435 SLEEP_ON_TAIL 3442 SLEEP_ON_TAIL
3436 3443
3437 return timeout; 3444 return timeout;
3438 } 3445 }
3439 3446
3440 EXPORT_SYMBOL(sleep_on_timeout); 3447 EXPORT_SYMBOL(sleep_on_timeout);
3441 3448
3442 void set_user_nice(task_t *p, long nice) 3449 void set_user_nice(task_t *p, long nice)
3443 { 3450 {
3444 unsigned long flags; 3451 unsigned long flags;
3445 prio_array_t *array; 3452 prio_array_t *array;
3446 runqueue_t *rq; 3453 runqueue_t *rq;
3447 int old_prio, new_prio, delta; 3454 int old_prio, new_prio, delta;
3448 3455
3449 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 3456 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3450 return; 3457 return;
3451 /* 3458 /*
3452 * We have to be careful, if called from sys_setpriority(), 3459 * We have to be careful, if called from sys_setpriority(),
3453 * the task might be in the middle of scheduling on another CPU. 3460 * the task might be in the middle of scheduling on another CPU.
3454 */ 3461 */
3455 rq = task_rq_lock(p, &flags); 3462 rq = task_rq_lock(p, &flags);
3456 /* 3463 /*
3457 * The RT priorities are set via sched_setscheduler(), but we still 3464 * The RT priorities are set via sched_setscheduler(), but we still
3458 * allow the 'normal' nice value to be set - but as expected 3465 * allow the 'normal' nice value to be set - but as expected
3459 * it won't have any effect on scheduling until the task is 3466 * it won't have any effect on scheduling until the task is
3460 * SCHED_NORMAL/SCHED_BATCH again: 3467 * SCHED_NORMAL/SCHED_BATCH again:
3461 */ 3468 */
3462 if (rt_task(p)) { 3469 if (rt_task(p)) {
3463 p->static_prio = NICE_TO_PRIO(nice); 3470 p->static_prio = NICE_TO_PRIO(nice);
3464 goto out_unlock; 3471 goto out_unlock;
3465 } 3472 }
3466 array = p->array; 3473 array = p->array;
3467 if (array) 3474 if (array)
3468 dequeue_task(p, array); 3475 dequeue_task(p, array);
3469 3476
3470 old_prio = p->prio; 3477 old_prio = p->prio;
3471 new_prio = NICE_TO_PRIO(nice); 3478 new_prio = NICE_TO_PRIO(nice);
3472 delta = new_prio - old_prio; 3479 delta = new_prio - old_prio;
3473 p->static_prio = NICE_TO_PRIO(nice); 3480 p->static_prio = NICE_TO_PRIO(nice);
3474 p->prio += delta; 3481 p->prio += delta;
3475 3482
3476 if (array) { 3483 if (array) {
3477 enqueue_task(p, array); 3484 enqueue_task(p, array);
3478 /* 3485 /*
3479 * If the task increased its priority or is running and 3486 * If the task increased its priority or is running and
3480 * lowered its priority, then reschedule its CPU: 3487 * lowered its priority, then reschedule its CPU:
3481 */ 3488 */
3482 if (delta < 0 || (delta > 0 && task_running(rq, p))) 3489 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3483 resched_task(rq->curr); 3490 resched_task(rq->curr);
3484 } 3491 }
3485 out_unlock: 3492 out_unlock:
3486 task_rq_unlock(rq, &flags); 3493 task_rq_unlock(rq, &flags);
3487 } 3494 }
3488 3495
3489 EXPORT_SYMBOL(set_user_nice); 3496 EXPORT_SYMBOL(set_user_nice);
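set_user_nice() is the in-kernel way to renice a task without going through sys_nice(). A minimal sketch, assuming a hypothetical housekeeping kthread that deprioritises itself to nice +10 (the function name and nice value are illustrative):

    #include <linux/sched.h>
    #include <linux/kthread.h>

    static int background_thread(void *unused)
    {
            set_user_nice(current, 10);         /* lower this kthread's priority */

            while (!kthread_should_stop()) {
                    /* ... low-priority housekeeping ... */
                    set_current_state(TASK_INTERRUPTIBLE);
                    schedule_timeout(HZ);       /* nap for about one second */
            }
            return 0;
    }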
3490 3497
3491 /* 3498 /*
3492 * can_nice - check if a task can reduce its nice value 3499 * can_nice - check if a task can reduce its nice value
3493 * @p: task 3500 * @p: task
3494 * @nice: nice value 3501 * @nice: nice value
3495 */ 3502 */
3496 int can_nice(const task_t *p, const int nice) 3503 int can_nice(const task_t *p, const int nice)
3497 { 3504 {
3498 /* convert nice value [19,-20] to rlimit style value [1,40] */ 3505 /* convert nice value [19,-20] to rlimit style value [1,40] */
3499 int nice_rlim = 20 - nice; 3506 int nice_rlim = 20 - nice;
3500 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || 3507 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3501 capable(CAP_SYS_NICE)); 3508 capable(CAP_SYS_NICE));
3502 } 3509 }
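The [19,-20] to [1,40] conversion in can_nice() is easier to see with concrete numbers. A short worked illustration (the rlim_cur value is an example, not from this diff):

    /* nice_rlim = 20 - nice
     *   nice  19  ->  nice_rlim  1   (least privileged request)
     *   nice   0  ->  nice_rlim 20
     *   nice -20  ->  nice_rlim 40   (most privileged request)
     * The request is allowed when nice_rlim <= RLIMIT_NICE's rlim_cur,
     * so e.g. rlim_cur == 30 permits any nice value down to -10. */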
3503 3510
3504 #ifdef __ARCH_WANT_SYS_NICE 3511 #ifdef __ARCH_WANT_SYS_NICE
3505 3512
3506 /* 3513 /*
3507 * sys_nice - change the priority of the current process. 3514 * sys_nice - change the priority of the current process.
3508 * @increment: priority increment 3515 * @increment: priority increment
3509 * 3516 *
3510 * sys_setpriority is a more generic, but much slower function that 3517 * sys_setpriority is a more generic, but much slower function that
3511 * does similar things. 3518 * does similar things.
3512 */ 3519 */
3513 asmlinkage long sys_nice(int increment) 3520 asmlinkage long sys_nice(int increment)
3514 { 3521 {
3515 int retval; 3522 int retval;
3516 long nice; 3523 long nice;
3517 3524
3518 /* 3525 /*
3519 * Setpriority might change our priority at the same moment. 3526 * Setpriority might change our priority at the same moment.
3520 * We don't have to worry. Conceptually one call occurs first 3527 * We don't have to worry. Conceptually one call occurs first
3521 * and we have a single winner. 3528 * and we have a single winner.
3522 */ 3529 */
3523 if (increment < -40) 3530 if (increment < -40)
3524 increment = -40; 3531 increment = -40;
3525 if (increment > 40) 3532 if (increment > 40)
3526 increment = 40; 3533 increment = 40;
3527 3534
3528 nice = PRIO_TO_NICE(current->static_prio) + increment; 3535 nice = PRIO_TO_NICE(current->static_prio) + increment;
3529 if (nice < -20) 3536 if (nice < -20)
3530 nice = -20; 3537 nice = -20;
3531 if (nice > 19) 3538 if (nice > 19)
3532 nice = 19; 3539 nice = 19;
3533 3540
3534 if (increment < 0 && !can_nice(current, nice)) 3541 if (increment < 0 && !can_nice(current, nice))
3535 return -EPERM; 3542 return -EPERM;
3536 3543
3537 retval = security_task_setnice(current, nice); 3544 retval = security_task_setnice(current, nice);
3538 if (retval) 3545 if (retval)
3539 return retval; 3546 return retval;
3540 3547
3541 set_user_nice(current, nice); 3548 set_user_nice(current, nice);
3542 return 0; 3549 return 0;
3543 } 3550 }
3544 3551
3545 #endif 3552 #endif
3546 3553
3547 /** 3554 /**
3548 * task_prio - return the priority value of a given task. 3555 * task_prio - return the priority value of a given task.
3549 * @p: the task in question. 3556 * @p: the task in question.
3550 * 3557 *
3551 * This is the priority value as seen by users in /proc. 3558 * This is the priority value as seen by users in /proc.
3552 * RT tasks are offset by -200. Normal tasks are centered 3559 * RT tasks are offset by -200. Normal tasks are centered
3553 * around 0, value goes from -16 to +15. 3560 * around 0, value goes from -16 to +15.
3554 */ 3561 */
3555 int task_prio(const task_t *p) 3562 int task_prio(const task_t *p)
3556 { 3563 {
3557 return p->prio - MAX_RT_PRIO; 3564 return p->prio - MAX_RT_PRIO;
3558 } 3565 }
3559 3566
3560 /** 3567 /**
3561 * task_nice - return the nice value of a given task. 3568 * task_nice - return the nice value of a given task.
3562 * @p: the task in question. 3569 * @p: the task in question.
3563 */ 3570 */
3564 int task_nice(const task_t *p) 3571 int task_nice(const task_t *p)
3565 { 3572 {
3566 return TASK_NICE(p); 3573 return TASK_NICE(p);
3567 } 3574 }
3568 EXPORT_SYMBOL_GPL(task_nice); 3575 EXPORT_SYMBOL_GPL(task_nice);
3569 3576
3570 /** 3577 /**
3571 * idle_cpu - is a given cpu idle currently? 3578 * idle_cpu - is a given cpu idle currently?
3572 * @cpu: the processor in question. 3579 * @cpu: the processor in question.
3573 */ 3580 */
3574 int idle_cpu(int cpu) 3581 int idle_cpu(int cpu)
3575 { 3582 {
3576 return cpu_curr(cpu) == cpu_rq(cpu)->idle; 3583 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
3577 } 3584 }
3578 3585
3579 /** 3586 /**
3580 * idle_task - return the idle task for a given cpu. 3587 * idle_task - return the idle task for a given cpu.
3581 * @cpu: the processor in question. 3588 * @cpu: the processor in question.
3582 */ 3589 */
3583 task_t *idle_task(int cpu) 3590 task_t *idle_task(int cpu)
3584 { 3591 {
3585 return cpu_rq(cpu)->idle; 3592 return cpu_rq(cpu)->idle;
3586 } 3593 }
3587 3594
3588 /** 3595 /**
3589 * find_process_by_pid - find a process with a matching PID value. 3596 * find_process_by_pid - find a process with a matching PID value.
3590 * @pid: the pid in question. 3597 * @pid: the pid in question.
3591 */ 3598 */
3592 static inline task_t *find_process_by_pid(pid_t pid) 3599 static inline task_t *find_process_by_pid(pid_t pid)
3593 { 3600 {
3594 return pid ? find_task_by_pid(pid) : current; 3601 return pid ? find_task_by_pid(pid) : current;
3595 } 3602 }
3596 3603
3597 /* Actually do priority change: must hold rq lock. */ 3604 /* Actually do priority change: must hold rq lock. */
3598 static void __setscheduler(struct task_struct *p, int policy, int prio) 3605 static void __setscheduler(struct task_struct *p, int policy, int prio)
3599 { 3606 {
3600 BUG_ON(p->array); 3607 BUG_ON(p->array);
3601 p->policy = policy; 3608 p->policy = policy;
3602 p->rt_priority = prio; 3609 p->rt_priority = prio;
3603 if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { 3610 if (policy != SCHED_NORMAL && policy != SCHED_BATCH) {
3604 p->prio = MAX_RT_PRIO-1 - p->rt_priority; 3611 p->prio = MAX_RT_PRIO-1 - p->rt_priority;
3605 } else { 3612 } else {
3606 p->prio = p->static_prio; 3613 p->prio = p->static_prio;
3607 /* 3614 /*
3608 * SCHED_BATCH tasks are treated as perpetual CPU hogs: 3615 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
3609 */ 3616 */
3610 if (policy == SCHED_BATCH) 3617 if (policy == SCHED_BATCH)
3611 p->sleep_avg = 0; 3618 p->sleep_avg = 0;
3612 } 3619 }
3613 } 3620 }
3614 3621
3615 /** 3622 /**
3616 * sched_setscheduler - change the scheduling policy and/or RT priority of 3623 * sched_setscheduler - change the scheduling policy and/or RT priority of
3617 * a thread. 3624 * a thread.
3618 * @p: the task in question. 3625 * @p: the task in question.
3619 * @policy: new policy. 3626 * @policy: new policy.
3620 * @param: structure containing the new RT priority. 3627 * @param: structure containing the new RT priority.
3621 */ 3628 */
3622 int sched_setscheduler(struct task_struct *p, int policy, 3629 int sched_setscheduler(struct task_struct *p, int policy,
3623 struct sched_param *param) 3630 struct sched_param *param)
3624 { 3631 {
3625 int retval; 3632 int retval;
3626 int oldprio, oldpolicy = -1; 3633 int oldprio, oldpolicy = -1;
3627 prio_array_t *array; 3634 prio_array_t *array;
3628 unsigned long flags; 3635 unsigned long flags;
3629 runqueue_t *rq; 3636 runqueue_t *rq;
3630 3637
3631 recheck: 3638 recheck:
3632 /* double check policy once rq lock held */ 3639 /* double check policy once rq lock held */
3633 if (policy < 0) 3640 if (policy < 0)
3634 policy = oldpolicy = p->policy; 3641 policy = oldpolicy = p->policy;
3635 else if (policy != SCHED_FIFO && policy != SCHED_RR && 3642 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
3636 policy != SCHED_NORMAL && policy != SCHED_BATCH) 3643 policy != SCHED_NORMAL && policy != SCHED_BATCH)
3637 return -EINVAL; 3644 return -EINVAL;
3638 /* 3645 /*
3639 * Valid priorities for SCHED_FIFO and SCHED_RR are 3646 * Valid priorities for SCHED_FIFO and SCHED_RR are
3640 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and 3647 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and
3641 * SCHED_BATCH is 0. 3648 * SCHED_BATCH is 0.
3642 */ 3649 */
3643 if (param->sched_priority < 0 || 3650 if (param->sched_priority < 0 ||
3644 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 3651 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
3645 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 3652 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
3646 return -EINVAL; 3653 return -EINVAL;
3647 if ((policy == SCHED_NORMAL || policy == SCHED_BATCH) 3654 if ((policy == SCHED_NORMAL || policy == SCHED_BATCH)
3648 != (param->sched_priority == 0)) 3655 != (param->sched_priority == 0))
3649 return -EINVAL; 3656 return -EINVAL;
3650 3657
3651 /* 3658 /*
3652 * Allow unprivileged RT tasks to decrease priority: 3659 * Allow unprivileged RT tasks to decrease priority:
3653 */ 3660 */
3654 if (!capable(CAP_SYS_NICE)) { 3661 if (!capable(CAP_SYS_NICE)) {
3655 /* 3662 /*
3656 * can't change policy, except between SCHED_NORMAL 3663 * can't change policy, except between SCHED_NORMAL
3657 * and SCHED_BATCH: 3664 * and SCHED_BATCH:
3658 */ 3665 */
3659 if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) && 3666 if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) &&
3660 (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) && 3667 (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) &&
3661 !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) 3668 !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
3662 return -EPERM; 3669 return -EPERM;
3663 /* can't increase priority */ 3670 /* can't increase priority */
3664 if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) && 3671 if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) &&
3665 param->sched_priority > p->rt_priority && 3672 param->sched_priority > p->rt_priority &&
3666 param->sched_priority > 3673 param->sched_priority >
3667 p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) 3674 p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
3668 return -EPERM; 3675 return -EPERM;
3669 /* can't change other user's priorities */ 3676 /* can't change other user's priorities */
3670 if ((current->euid != p->euid) && 3677 if ((current->euid != p->euid) &&
3671 (current->euid != p->uid)) 3678 (current->euid != p->uid))
3672 return -EPERM; 3679 return -EPERM;
3673 } 3680 }
3674 3681
3675 retval = security_task_setscheduler(p, policy, param); 3682 retval = security_task_setscheduler(p, policy, param);
3676 if (retval) 3683 if (retval)
3677 return retval; 3684 return retval;
3678 /* 3685 /*
3679 * To be able to change p->policy safely, the appropriate 3686 * To be able to change p->policy safely, the appropriate
3680 * runqueue lock must be held. 3687 * runqueue lock must be held.
3681 */ 3688 */
3682 rq = task_rq_lock(p, &flags); 3689 rq = task_rq_lock(p, &flags);
3683 /* recheck policy now with rq lock held */ 3690 /* recheck policy now with rq lock held */
3684 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 3691 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
3685 policy = oldpolicy = -1; 3692 policy = oldpolicy = -1;
3686 task_rq_unlock(rq, &flags); 3693 task_rq_unlock(rq, &flags);
3687 goto recheck; 3694 goto recheck;
3688 } 3695 }
3689 array = p->array; 3696 array = p->array;
3690 if (array) 3697 if (array)
3691 deactivate_task(p, rq); 3698 deactivate_task(p, rq);
3692 oldprio = p->prio; 3699 oldprio = p->prio;
3693 __setscheduler(p, policy, param->sched_priority); 3700 __setscheduler(p, policy, param->sched_priority);
3694 if (array) { 3701 if (array) {
3695 __activate_task(p, rq); 3702 __activate_task(p, rq);
3696 /* 3703 /*
3697 * Reschedule if we are currently running on this runqueue and 3704 * Reschedule if we are currently running on this runqueue and
3698 * our priority decreased, or if we are not currently running on 3705 * our priority decreased, or if we are not currently running on
3699 * this runqueue and our priority is higher than the current's 3706 * this runqueue and our priority is higher than the current's
3700 */ 3707 */
3701 if (task_running(rq, p)) { 3708 if (task_running(rq, p)) {
3702 if (p->prio > oldprio) 3709 if (p->prio > oldprio)
3703 resched_task(rq->curr); 3710 resched_task(rq->curr);
3704 } else if (TASK_PREEMPTS_CURR(p, rq)) 3711 } else if (TASK_PREEMPTS_CURR(p, rq))
3705 resched_task(rq->curr); 3712 resched_task(rq->curr);
3706 } 3713 }
3707 task_rq_unlock(rq, &flags); 3714 task_rq_unlock(rq, &flags);
3708 return 0; 3715 return 0;
3709 } 3716 }
3710 EXPORT_SYMBOL_GPL(sched_setscheduler); 3717 EXPORT_SYMBOL_GPL(sched_setscheduler);
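sched_setscheduler() is also used from inside the kernel to give a thread real-time priority. A minimal sketch, assuming a hypothetical watchdog kthread and an arbitrary SCHED_FIFO priority of 50 (both are assumptions):

    #include <linux/sched.h>

    static void make_watchdog_rt(struct task_struct *tsk)
    {
            struct sched_param param = { .sched_priority = 50 };  /* assumed value */

            /* Switch the thread to SCHED_FIFO; returns 0 on success,
             * -EINVAL or -EPERM on failure. */
            if (sched_setscheduler(tsk, SCHED_FIFO, &param))
                    printk(KERN_WARNING "could not make watchdog thread RT\n");
    }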
3711 3718
3712 static int 3719 static int
3713 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 3720 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3714 { 3721 {
3715 int retval; 3722 int retval;
3716 struct sched_param lparam; 3723 struct sched_param lparam;
3717 struct task_struct *p; 3724 struct task_struct *p;
3718 3725
3719 if (!param || pid < 0) 3726 if (!param || pid < 0)
3720 return -EINVAL; 3727 return -EINVAL;
3721 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 3728 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
3722 return -EFAULT; 3729 return -EFAULT;
3723 read_lock_irq(&tasklist_lock); 3730 read_lock_irq(&tasklist_lock);
3724 p = find_process_by_pid(pid); 3731 p = find_process_by_pid(pid);
3725 if (!p) { 3732 if (!p) {
3726 read_unlock_irq(&tasklist_lock); 3733 read_unlock_irq(&tasklist_lock);
3727 return -ESRCH; 3734 return -ESRCH;
3728 } 3735 }
3729 retval = sched_setscheduler(p, policy, &lparam); 3736 retval = sched_setscheduler(p, policy, &lparam);
3730 read_unlock_irq(&tasklist_lock); 3737 read_unlock_irq(&tasklist_lock);
3731 return retval; 3738 return retval;
3732 } 3739 }
3733 3740
3734 /** 3741 /**
3735 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 3742 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
3736 * @pid: the pid in question. 3743 * @pid: the pid in question.
3737 * @policy: new policy. 3744 * @policy: new policy.
3738 * @param: structure containing the new RT priority. 3745 * @param: structure containing the new RT priority.
3739 */ 3746 */
3740 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, 3747 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
3741 struct sched_param __user *param) 3748 struct sched_param __user *param)
3742 { 3749 {
3743 /* negative values for policy are not valid */ 3750 /* negative values for policy are not valid */
3744 if (policy < 0) 3751 if (policy < 0)
3745 return -EINVAL; 3752 return -EINVAL;
3746 3753
3747 return do_sched_setscheduler(pid, policy, param); 3754 return do_sched_setscheduler(pid, policy, param);
3748 } 3755 }
3749 3756
3750 /** 3757 /**
3751 * sys_sched_setparam - set/change the RT priority of a thread 3758 * sys_sched_setparam - set/change the RT priority of a thread
3752 * @pid: the pid in question. 3759 * @pid: the pid in question.
3753 * @param: structure containing the new RT priority. 3760 * @param: structure containing the new RT priority.
3754 */ 3761 */
3755 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) 3762 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
3756 { 3763 {
3757 return do_sched_setscheduler(pid, -1, param); 3764 return do_sched_setscheduler(pid, -1, param);
3758 } 3765 }
3759 3766
3760 /** 3767 /**
3761 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 3768 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
3762 * @pid: the pid in question. 3769 * @pid: the pid in question.
3763 */ 3770 */
3764 asmlinkage long sys_sched_getscheduler(pid_t pid) 3771 asmlinkage long sys_sched_getscheduler(pid_t pid)
3765 { 3772 {
3766 int retval = -EINVAL; 3773 int retval = -EINVAL;
3767 task_t *p; 3774 task_t *p;
3768 3775
3769 if (pid < 0) 3776 if (pid < 0)
3770 goto out_nounlock; 3777 goto out_nounlock;
3771 3778
3772 retval = -ESRCH; 3779 retval = -ESRCH;
3773 read_lock(&tasklist_lock); 3780 read_lock(&tasklist_lock);
3774 p = find_process_by_pid(pid); 3781 p = find_process_by_pid(pid);
3775 if (p) { 3782 if (p) {
3776 retval = security_task_getscheduler(p); 3783 retval = security_task_getscheduler(p);
3777 if (!retval) 3784 if (!retval)
3778 retval = p->policy; 3785 retval = p->policy;
3779 } 3786 }
3780 read_unlock(&tasklist_lock); 3787 read_unlock(&tasklist_lock);
3781 3788
3782 out_nounlock: 3789 out_nounlock:
3783 return retval; 3790 return retval;
3784 } 3791 }
3785 3792
3786 /** 3793 /**
3787 * sys_sched_getparam - get the RT priority of a thread 3794 * sys_sched_getparam - get the RT priority of a thread
3788 * @pid: the pid in question. 3795 * @pid: the pid in question.
3789 * @param: structure containing the RT priority. 3796 * @param: structure containing the RT priority.
3790 */ 3797 */
3791 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) 3798 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
3792 { 3799 {
3793 struct sched_param lp; 3800 struct sched_param lp;
3794 int retval = -EINVAL; 3801 int retval = -EINVAL;
3795 task_t *p; 3802 task_t *p;
3796 3803
3797 if (!param || pid < 0) 3804 if (!param || pid < 0)
3798 goto out_nounlock; 3805 goto out_nounlock;
3799 3806
3800 read_lock(&tasklist_lock); 3807 read_lock(&tasklist_lock);
3801 p = find_process_by_pid(pid); 3808 p = find_process_by_pid(pid);
3802 retval = -ESRCH; 3809 retval = -ESRCH;
3803 if (!p) 3810 if (!p)
3804 goto out_unlock; 3811 goto out_unlock;
3805 3812
3806 retval = security_task_getscheduler(p); 3813 retval = security_task_getscheduler(p);
3807 if (retval) 3814 if (retval)
3808 goto out_unlock; 3815 goto out_unlock;
3809 3816
3810 lp.sched_priority = p->rt_priority; 3817 lp.sched_priority = p->rt_priority;
3811 read_unlock(&tasklist_lock); 3818 read_unlock(&tasklist_lock);
3812 3819
3813 /* 3820 /*
3814 * This one might sleep, we cannot do it with a spinlock held ... 3821 * This one might sleep, we cannot do it with a spinlock held ...
3815 */ 3822 */
3816 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 3823 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
3817 3824
3818 out_nounlock: 3825 out_nounlock:
3819 return retval; 3826 return retval;
3820 3827
3821 out_unlock: 3828 out_unlock:
3822 read_unlock(&tasklist_lock); 3829 read_unlock(&tasklist_lock);
3823 return retval; 3830 return retval;
3824 } 3831 }
3825 3832
3826 long sched_setaffinity(pid_t pid, cpumask_t new_mask) 3833 long sched_setaffinity(pid_t pid, cpumask_t new_mask)
3827 { 3834 {
3828 task_t *p; 3835 task_t *p;
3829 int retval; 3836 int retval;
3830 cpumask_t cpus_allowed; 3837 cpumask_t cpus_allowed;
3831 3838
3832 lock_cpu_hotplug(); 3839 lock_cpu_hotplug();
3833 read_lock(&tasklist_lock); 3840 read_lock(&tasklist_lock);
3834 3841
3835 p = find_process_by_pid(pid); 3842 p = find_process_by_pid(pid);
3836 if (!p) { 3843 if (!p) {
3837 read_unlock(&tasklist_lock); 3844 read_unlock(&tasklist_lock);
3838 unlock_cpu_hotplug(); 3845 unlock_cpu_hotplug();
3839 return -ESRCH; 3846 return -ESRCH;
3840 } 3847 }
3841 3848
3842 /* 3849 /*
3843 * It is not safe to call set_cpus_allowed with the 3850 * It is not safe to call set_cpus_allowed with the
3844 * tasklist_lock held. We will bump the task_struct's 3851 * tasklist_lock held. We will bump the task_struct's
3845 * usage count and then drop tasklist_lock. 3852 * usage count and then drop tasklist_lock.
3846 */ 3853 */
3847 get_task_struct(p); 3854 get_task_struct(p);
3848 read_unlock(&tasklist_lock); 3855 read_unlock(&tasklist_lock);
3849 3856
3850 retval = -EPERM; 3857 retval = -EPERM;
3851 if ((current->euid != p->euid) && (current->euid != p->uid) && 3858 if ((current->euid != p->euid) && (current->euid != p->uid) &&
3852 !capable(CAP_SYS_NICE)) 3859 !capable(CAP_SYS_NICE))
3853 goto out_unlock; 3860 goto out_unlock;
3854 3861
3855 cpus_allowed = cpuset_cpus_allowed(p); 3862 cpus_allowed = cpuset_cpus_allowed(p);
3856 cpus_and(new_mask, new_mask, cpus_allowed); 3863 cpus_and(new_mask, new_mask, cpus_allowed);
3857 retval = set_cpus_allowed(p, new_mask); 3864 retval = set_cpus_allowed(p, new_mask);
3858 3865
3859 out_unlock: 3866 out_unlock:
3860 put_task_struct(p); 3867 put_task_struct(p);
3861 unlock_cpu_hotplug(); 3868 unlock_cpu_hotplug();
3862 return retval; 3869 return retval;
3863 } 3870 }
3864 3871
3865 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 3872 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
3866 cpumask_t *new_mask) 3873 cpumask_t *new_mask)
3867 { 3874 {
3868 if (len < sizeof(cpumask_t)) { 3875 if (len < sizeof(cpumask_t)) {
3869 memset(new_mask, 0, sizeof(cpumask_t)); 3876 memset(new_mask, 0, sizeof(cpumask_t));
3870 } else if (len > sizeof(cpumask_t)) { 3877 } else if (len > sizeof(cpumask_t)) {
3871 len = sizeof(cpumask_t); 3878 len = sizeof(cpumask_t);
3872 } 3879 }
3873 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 3880 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
3874 } 3881 }
3875 3882
3876 /** 3883 /**
3877 * sys_sched_setaffinity - set the cpu affinity of a process 3884 * sys_sched_setaffinity - set the cpu affinity of a process
3878 * @pid: pid of the process 3885 * @pid: pid of the process
3879 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 3886 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3880 * @user_mask_ptr: user-space pointer to the new cpu mask 3887 * @user_mask_ptr: user-space pointer to the new cpu mask
3881 */ 3888 */
3882 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, 3889 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
3883 unsigned long __user *user_mask_ptr) 3890 unsigned long __user *user_mask_ptr)
3884 { 3891 {
3885 cpumask_t new_mask; 3892 cpumask_t new_mask;
3886 int retval; 3893 int retval;
3887 3894
3888 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); 3895 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
3889 if (retval) 3896 if (retval)
3890 return retval; 3897 return retval;
3891 3898
3892 return sched_setaffinity(pid, new_mask); 3899 return sched_setaffinity(pid, new_mask);
3893 } 3900 }
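
A user-space caller does not usually build the raw bitmask by hand: the glibc wrapper passes a cpu_set_t plus its size, and get_user_cpu_mask() above zero-fills short masks and truncates anything longer than sizeof(cpumask_t). A minimal sketch under that assumption (pinning to CPU 0 is just an example):

#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>

int main(void)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(0, &mask);	/* restrict the calling process to CPU 0 */

	if (sched_setaffinity(0, sizeof(mask), &mask) == -1) {
		perror("sched_setaffinity");
		return 1;
	}
	return 0;
}
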
3894 3901
3895 /* 3902 /*
3896 * Represents all CPUs present in the system 3903 * Represents all CPUs present in the system
3897 * In systems capable of hotplug, this map could dynamically grow 3904 * In systems capable of hotplug, this map could dynamically grow
3898 * as new CPUs are detected in the system via any platform-specific 3905 * as new CPUs are detected in the system via any platform-specific
3899 * method, such as ACPI, for example. 3906 * method, such as ACPI, for example.
3900 */ 3907 */
3901 3908
3902 cpumask_t cpu_present_map __read_mostly; 3909 cpumask_t cpu_present_map __read_mostly;
3903 EXPORT_SYMBOL(cpu_present_map); 3910 EXPORT_SYMBOL(cpu_present_map);
3904 3911
3905 #ifndef CONFIG_SMP 3912 #ifndef CONFIG_SMP
3906 cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; 3913 cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
3907 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; 3914 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
3908 #endif 3915 #endif
3909 3916
3910 long sched_getaffinity(pid_t pid, cpumask_t *mask) 3917 long sched_getaffinity(pid_t pid, cpumask_t *mask)
3911 { 3918 {
3912 int retval; 3919 int retval;
3913 task_t *p; 3920 task_t *p;
3914 3921
3915 lock_cpu_hotplug(); 3922 lock_cpu_hotplug();
3916 read_lock(&tasklist_lock); 3923 read_lock(&tasklist_lock);
3917 3924
3918 retval = -ESRCH; 3925 retval = -ESRCH;
3919 p = find_process_by_pid(pid); 3926 p = find_process_by_pid(pid);
3920 if (!p) 3927 if (!p)
3921 goto out_unlock; 3928 goto out_unlock;
3922 3929
3923 retval = 0; 3930 retval = 0;
3924 cpus_and(*mask, p->cpus_allowed, cpu_online_map); 3931 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
3925 3932
3926 out_unlock: 3933 out_unlock:
3927 read_unlock(&tasklist_lock); 3934 read_unlock(&tasklist_lock);
3928 unlock_cpu_hotplug(); 3935 unlock_cpu_hotplug();
3929 if (retval) 3936 if (retval)
3930 return retval; 3937 return retval;
3931 3938
3932 return 0; 3939 return 0;
3933 } 3940 }
3934 3941
3935 /** 3942 /**
3936 * sys_sched_getaffinity - get the cpu affinity of a process 3943 * sys_sched_getaffinity - get the cpu affinity of a process
3937 * @pid: pid of the process 3944 * @pid: pid of the process
3938 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 3945 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3939 * @user_mask_ptr: user-space pointer to hold the current cpu mask 3946 * @user_mask_ptr: user-space pointer to hold the current cpu mask
3940 */ 3947 */
3941 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, 3948 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
3942 unsigned long __user *user_mask_ptr) 3949 unsigned long __user *user_mask_ptr)
3943 { 3950 {
3944 int ret; 3951 int ret;
3945 cpumask_t mask; 3952 cpumask_t mask;
3946 3953
3947 if (len < sizeof(cpumask_t)) 3954 if (len < sizeof(cpumask_t))
3948 return -EINVAL; 3955 return -EINVAL;
3949 3956
3950 ret = sched_getaffinity(pid, &mask); 3957 ret = sched_getaffinity(pid, &mask);
3951 if (ret < 0) 3958 if (ret < 0)
3952 return ret; 3959 return ret;
3953 3960
3954 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) 3961 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
3955 return -EFAULT; 3962 return -EFAULT;
3956 3963
3957 return sizeof(cpumask_t); 3964 return sizeof(cpumask_t);
3958 } 3965 }
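
Note that the syscall returns sizeof(cpumask_t) on success, while the glibc wrapper folds that into 0. A minimal user-space sketch, assuming the wrapper:

#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>

int main(void)
{
	cpu_set_t mask;
	int cpu;

	/* pid 0 reads the calling process's affinity mask. */
	if (sched_getaffinity(0, sizeof(mask), &mask) == -1) {
		perror("sched_getaffinity");
		return 1;
	}
	for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
		if (CPU_ISSET(cpu, &mask))
			printf("allowed on CPU %d\n", cpu);
	return 0;
}
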
3959 3966
3960 /** 3967 /**
3961 * sys_sched_yield - yield the current processor to other threads. 3968 * sys_sched_yield - yield the current processor to other threads.
3962 * 3969 *
3963 * this function yields the current CPU by moving the calling thread 3970 * this function yields the current CPU by moving the calling thread
3964 * to the expired array. If there are no other threads running on this 3971 * to the expired array. If there are no other threads running on this
3965 * CPU then this function will return. 3972 * CPU then this function will return.
3966 */ 3973 */
3967 asmlinkage long sys_sched_yield(void) 3974 asmlinkage long sys_sched_yield(void)
3968 { 3975 {
3969 runqueue_t *rq = this_rq_lock(); 3976 runqueue_t *rq = this_rq_lock();
3970 prio_array_t *array = current->array; 3977 prio_array_t *array = current->array;
3971 prio_array_t *target = rq->expired; 3978 prio_array_t *target = rq->expired;
3972 3979
3973 schedstat_inc(rq, yld_cnt); 3980 schedstat_inc(rq, yld_cnt);
3974 /* 3981 /*
3975 * We implement yielding by moving the task into the expired 3982 * We implement yielding by moving the task into the expired
3976 * queue. 3983 * queue.
3977 * 3984 *
3978 * (special rule: RT tasks will just round-robin in the active 3985 * (special rule: RT tasks will just round-robin in the active
3979 * array.) 3986 * array.)
3980 */ 3987 */
3981 if (rt_task(current)) 3988 if (rt_task(current))
3982 target = rq->active; 3989 target = rq->active;
3983 3990
3984 if (array->nr_active == 1) { 3991 if (array->nr_active == 1) {
3985 schedstat_inc(rq, yld_act_empty); 3992 schedstat_inc(rq, yld_act_empty);
3986 if (!rq->expired->nr_active) 3993 if (!rq->expired->nr_active)
3987 schedstat_inc(rq, yld_both_empty); 3994 schedstat_inc(rq, yld_both_empty);
3988 } else if (!rq->expired->nr_active) 3995 } else if (!rq->expired->nr_active)
3989 schedstat_inc(rq, yld_exp_empty); 3996 schedstat_inc(rq, yld_exp_empty);
3990 3997
3991 if (array != target) { 3998 if (array != target) {
3992 dequeue_task(current, array); 3999 dequeue_task(current, array);
3993 enqueue_task(current, target); 4000 enqueue_task(current, target);
3994 } else 4001 } else
3995 /* 4002 /*
3996 * requeue_task is cheaper so perform that if possible. 4003 * requeue_task is cheaper so perform that if possible.
3997 */ 4004 */
3998 requeue_task(current, array); 4005 requeue_task(current, array);
3999 4006
4000 /* 4007 /*
4001 * Since we are going to call schedule() anyway, there's 4008 * Since we are going to call schedule() anyway, there's
4002 * no need to preempt or enable interrupts: 4009 * no need to preempt or enable interrupts:
4003 */ 4010 */
4004 __release(rq->lock); 4011 __release(rq->lock);
4005 _raw_spin_unlock(&rq->lock); 4012 _raw_spin_unlock(&rq->lock);
4006 preempt_enable_no_resched(); 4013 preempt_enable_no_resched();
4007 4014
4008 schedule(); 4015 schedule();
4009 4016
4010 return 0; 4017 return 0;
4011 } 4018 }
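
From user space this is simply sched_yield(); a trivial, illustrative sketch:

#include <stdio.h>
#include <sched.h>

int main(void)
{
	/* Give other runnable tasks on this CPU a chance to run. */
	if (sched_yield() != 0)
		perror("sched_yield");
	return 0;
}
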
4012 4019
4013 static inline void __cond_resched(void) 4020 static inline void __cond_resched(void)
4014 { 4021 {
4015 /* 4022 /*
4016 * The BKS might be reacquired before we have dropped 4023 * The BKS might be reacquired before we have dropped
4017 * PREEMPT_ACTIVE, which could trigger a second 4024 * PREEMPT_ACTIVE, which could trigger a second
4018 * cond_resched() call. 4025 * cond_resched() call.
4019 */ 4026 */
4020 if (unlikely(preempt_count())) 4027 if (unlikely(preempt_count()))
4021 return; 4028 return;
4022 if (unlikely(system_state != SYSTEM_RUNNING)) 4029 if (unlikely(system_state != SYSTEM_RUNNING))
4023 return; 4030 return;
4024 do { 4031 do {
4025 add_preempt_count(PREEMPT_ACTIVE); 4032 add_preempt_count(PREEMPT_ACTIVE);
4026 schedule(); 4033 schedule();
4027 sub_preempt_count(PREEMPT_ACTIVE); 4034 sub_preempt_count(PREEMPT_ACTIVE);
4028 } while (need_resched()); 4035 } while (need_resched());
4029 } 4036 }
4030 4037
4031 int __sched cond_resched(void) 4038 int __sched cond_resched(void)
4032 { 4039 {
4033 if (need_resched()) { 4040 if (need_resched()) {
4034 __cond_resched(); 4041 __cond_resched();
4035 return 1; 4042 return 1;
4036 } 4043 }
4037 return 0; 4044 return 0;
4038 } 4045 }
4039 4046
4040 EXPORT_SYMBOL(cond_resched); 4047 EXPORT_SYMBOL(cond_resched);
4041 4048
4042 /* 4049 /*
4043 * cond_resched_lock() - if a reschedule is pending, drop the given lock, 4050 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
4044 * call schedule, and on return reacquire the lock. 4051 * call schedule, and on return reacquire the lock.
4045 * 4052 *
4046 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 4053 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4047 * operations here to prevent schedule() from being called twice (once via 4054 * operations here to prevent schedule() from being called twice (once via
4048 * spin_unlock(), once by hand). 4055 * spin_unlock(), once by hand).
4049 */ 4056 */
4050 int cond_resched_lock(spinlock_t *lock) 4057 int cond_resched_lock(spinlock_t *lock)
4051 { 4058 {
4052 int ret = 0; 4059 int ret = 0;
4053 4060
4054 if (need_lockbreak(lock)) { 4061 if (need_lockbreak(lock)) {
4055 spin_unlock(lock); 4062 spin_unlock(lock);
4056 cpu_relax(); 4063 cpu_relax();
4057 ret = 1; 4064 ret = 1;
4058 spin_lock(lock); 4065 spin_lock(lock);
4059 } 4066 }
4060 if (need_resched()) { 4067 if (need_resched()) {
4061 _raw_spin_unlock(lock); 4068 _raw_spin_unlock(lock);
4062 preempt_enable_no_resched(); 4069 preempt_enable_no_resched();
4063 __cond_resched(); 4070 __cond_resched();
4064 ret = 1; 4071 ret = 1;
4065 spin_lock(lock); 4072 spin_lock(lock);
4066 } 4073 }
4067 return ret; 4074 return ret;
4068 } 4075 }
4069 4076
4070 EXPORT_SYMBOL(cond_resched_lock); 4077 EXPORT_SYMBOL(cond_resched_lock);
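
The comment above is easiest to see in the usual caller pattern: a long walk over a lock-protected list that periodically lets the CPU (or a lock waiter) in. A hedged kernel-style sketch; the item type, list and lock names are hypothetical, not taken from this file:

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/sched.h>

struct my_item {
	struct list_head link;
};

static LIST_HEAD(my_list);
static DEFINE_SPINLOCK(my_lock);

static void drain_items(void)
{
	spin_lock(&my_lock);
	while (!list_empty(&my_list)) {
		struct my_item *item;

		item = list_entry(my_list.next, struct my_item, link);
		list_del(&item->link);
		/* ... process item; freeing is omitted in this sketch ... */

		/*
		 * If a reschedule or a lock waiter is pending,
		 * cond_resched_lock() drops my_lock, lets them run,
		 * and reacquires the lock before the next iteration.
		 * Re-reading the list head afterwards keeps this safe.
		 */
		cond_resched_lock(&my_lock);
	}
	spin_unlock(&my_lock);
}
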
4071 4078
4072 int __sched cond_resched_softirq(void) 4079 int __sched cond_resched_softirq(void)
4073 { 4080 {
4074 BUG_ON(!in_softirq()); 4081 BUG_ON(!in_softirq());
4075 4082
4076 if (need_resched()) { 4083 if (need_resched()) {
4077 __local_bh_enable(); 4084 __local_bh_enable();
4078 __cond_resched(); 4085 __cond_resched();
4079 local_bh_disable(); 4086 local_bh_disable();
4080 return 1; 4087 return 1;
4081 } 4088 }
4082 return 0; 4089 return 0;
4083 } 4090 }
4084 4091
4085 EXPORT_SYMBOL(cond_resched_softirq); 4092 EXPORT_SYMBOL(cond_resched_softirq);
4086 4093
4087 4094
4088 /** 4095 /**
4089 * yield - yield the current processor to other threads. 4096 * yield - yield the current processor to other threads.
4090 * 4097 *
4091 * this is a shortcut for kernel-space yielding - it marks the 4098 * this is a shortcut for kernel-space yielding - it marks the
4092 * thread runnable and calls sys_sched_yield(). 4099 * thread runnable and calls sys_sched_yield().
4093 */ 4100 */
4094 void __sched yield(void) 4101 void __sched yield(void)
4095 { 4102 {
4096 set_current_state(TASK_RUNNING); 4103 set_current_state(TASK_RUNNING);
4097 sys_sched_yield(); 4104 sys_sched_yield();
4098 } 4105 }
4099 4106
4100 EXPORT_SYMBOL(yield); 4107 EXPORT_SYMBOL(yield);
4101 4108
4102 /* 4109 /*
4103 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 4110 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4104 * that process accounting knows that this is a task in IO wait state. 4111 * that process accounting knows that this is a task in IO wait state.
4105 * 4112 *
4106 * But don't do that if it is a deliberate, throttling IO wait (this task 4113 * But don't do that if it is a deliberate, throttling IO wait (this task
4107 * has set its backing_dev_info: the queue against which it should throttle) 4114 * has set its backing_dev_info: the queue against which it should throttle)
4108 */ 4115 */
4109 void __sched io_schedule(void) 4116 void __sched io_schedule(void)
4110 { 4117 {
4111 struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); 4118 struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
4112 4119
4113 atomic_inc(&rq->nr_iowait); 4120 atomic_inc(&rq->nr_iowait);
4114 schedule(); 4121 schedule();
4115 atomic_dec(&rq->nr_iowait); 4122 atomic_dec(&rq->nr_iowait);
4116 } 4123 }
4117 4124
4118 EXPORT_SYMBOL(io_schedule); 4125 EXPORT_SYMBOL(io_schedule);
4119 4126
4120 long __sched io_schedule_timeout(long timeout) 4127 long __sched io_schedule_timeout(long timeout)
4121 { 4128 {
4122 struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); 4129 struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
4123 long ret; 4130 long ret;
4124 4131
4125 atomic_inc(&rq->nr_iowait); 4132 atomic_inc(&rq->nr_iowait);
4126 ret = schedule_timeout(timeout); 4133 ret = schedule_timeout(timeout);
4127 atomic_dec(&rq->nr_iowait); 4134 atomic_dec(&rq->nr_iowait);
4128 return ret; 4135 return ret;
4129 } 4136 }
4130 4137
4131 /** 4138 /**
4132 * sys_sched_get_priority_max - return maximum RT priority. 4139 * sys_sched_get_priority_max - return maximum RT priority.
4133 * @policy: scheduling class. 4140 * @policy: scheduling class.
4134 * 4141 *
4135 * this syscall returns the maximum rt_priority that can be used 4142 * this syscall returns the maximum rt_priority that can be used
4136 * by a given scheduling class. 4143 * by a given scheduling class.
4137 */ 4144 */
4138 asmlinkage long sys_sched_get_priority_max(int policy) 4145 asmlinkage long sys_sched_get_priority_max(int policy)
4139 { 4146 {
4140 int ret = -EINVAL; 4147 int ret = -EINVAL;
4141 4148
4142 switch (policy) { 4149 switch (policy) {
4143 case SCHED_FIFO: 4150 case SCHED_FIFO:
4144 case SCHED_RR: 4151 case SCHED_RR:
4145 ret = MAX_USER_RT_PRIO-1; 4152 ret = MAX_USER_RT_PRIO-1;
4146 break; 4153 break;
4147 case SCHED_NORMAL: 4154 case SCHED_NORMAL:
4148 case SCHED_BATCH: 4155 case SCHED_BATCH:
4149 ret = 0; 4156 ret = 0;
4150 break; 4157 break;
4151 } 4158 }
4152 return ret; 4159 return ret;
4153 } 4160 }
4154 4161
4155 /** 4162 /**
4156 * sys_sched_get_priority_min - return minimum RT priority. 4163 * sys_sched_get_priority_min - return minimum RT priority.
4157 * @policy: scheduling class. 4164 * @policy: scheduling class.
4158 * 4165 *
4159 * this syscall returns the minimum rt_priority that can be used 4166 * this syscall returns the minimum rt_priority that can be used
4160 * by a given scheduling class. 4167 * by a given scheduling class.
4161 */ 4168 */
4162 asmlinkage long sys_sched_get_priority_min(int policy) 4169 asmlinkage long sys_sched_get_priority_min(int policy)
4163 { 4170 {
4164 int ret = -EINVAL; 4171 int ret = -EINVAL;
4165 4172
4166 switch (policy) { 4173 switch (policy) {
4167 case SCHED_FIFO: 4174 case SCHED_FIFO:
4168 case SCHED_RR: 4175 case SCHED_RR:
4169 ret = 1; 4176 ret = 1;
4170 break; 4177 break;
4171 case SCHED_NORMAL: 4178 case SCHED_NORMAL:
4172 case SCHED_BATCH: 4179 case SCHED_BATCH:
4173 ret = 0; 4180 ret = 0;
4174 } 4181 }
4175 return ret; 4182 return ret;
4176 } 4183 }
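
Together these two syscalls let user space discover the valid priority range for a policy before calling sched_setscheduler(); a small sketch using the glibc wrappers:

#include <stdio.h>
#include <sched.h>

int main(void)
{
	printf("SCHED_FIFO priority range: %d..%d\n",
	       sched_get_priority_min(SCHED_FIFO),
	       sched_get_priority_max(SCHED_FIFO));
	printf("SCHED_RR   priority range: %d..%d\n",
	       sched_get_priority_min(SCHED_RR),
	       sched_get_priority_max(SCHED_RR));
	return 0;
}
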
4177 4184
4178 /** 4185 /**
4179 * sys_sched_rr_get_interval - return the default timeslice of a process. 4186 * sys_sched_rr_get_interval - return the default timeslice of a process.
4180 * @pid: pid of the process. 4187 * @pid: pid of the process.
4181 * @interval: userspace pointer to the timeslice value. 4188 * @interval: userspace pointer to the timeslice value.
4182 * 4189 *
4183 * this syscall writes the default timeslice value of a given process 4190 * this syscall writes the default timeslice value of a given process
4184 * into the user-space timespec buffer. A value of '0' means infinity. 4191 * into the user-space timespec buffer. A value of '0' means infinity.
4185 */ 4192 */
4186 asmlinkage 4193 asmlinkage
4187 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) 4194 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4188 { 4195 {
4189 int retval = -EINVAL; 4196 int retval = -EINVAL;
4190 struct timespec t; 4197 struct timespec t;
4191 task_t *p; 4198 task_t *p;
4192 4199
4193 if (pid < 0) 4200 if (pid < 0)
4194 goto out_nounlock; 4201 goto out_nounlock;
4195 4202
4196 retval = -ESRCH; 4203 retval = -ESRCH;
4197 read_lock(&tasklist_lock); 4204 read_lock(&tasklist_lock);
4198 p = find_process_by_pid(pid); 4205 p = find_process_by_pid(pid);
4199 if (!p) 4206 if (!p)
4200 goto out_unlock; 4207 goto out_unlock;
4201 4208
4202 retval = security_task_getscheduler(p); 4209 retval = security_task_getscheduler(p);
4203 if (retval) 4210 if (retval)
4204 goto out_unlock; 4211 goto out_unlock;
4205 4212
4206 jiffies_to_timespec(p->policy & SCHED_FIFO ? 4213 jiffies_to_timespec(p->policy & SCHED_FIFO ?
4207 0 : task_timeslice(p), &t); 4214 0 : task_timeslice(p), &t);
4208 read_unlock(&tasklist_lock); 4215 read_unlock(&tasklist_lock);
4209 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4216 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4210 out_nounlock: 4217 out_nounlock:
4211 return retval; 4218 return retval;
4212 out_unlock: 4219 out_unlock:
4213 read_unlock(&tasklist_lock); 4220 read_unlock(&tasklist_lock);
4214 return retval; 4221 return retval;
4215 } 4222 }
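
A minimal user-space sketch of the corresponding wrapper; pid 0 conventionally means the calling process, and per the comment above a zero timeslice means infinity (SCHED_FIFO):

#include <stdio.h>
#include <sched.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts) == -1) {
		perror("sched_rr_get_interval");
		return 1;
	}
	printf("timeslice: %ld.%09ld s (0 means infinite)\n",
	       (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}
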
4216 4223
4217 static inline struct task_struct *eldest_child(struct task_struct *p) 4224 static inline struct task_struct *eldest_child(struct task_struct *p)
4218 { 4225 {
4219 if (list_empty(&p->children)) return NULL; 4226 if (list_empty(&p->children)) return NULL;
4220 return list_entry(p->children.next,struct task_struct,sibling); 4227 return list_entry(p->children.next,struct task_struct,sibling);
4221 } 4228 }
4222 4229
4223 static inline struct task_struct *older_sibling(struct task_struct *p) 4230 static inline struct task_struct *older_sibling(struct task_struct *p)
4224 { 4231 {
4225 if (p->sibling.prev==&p->parent->children) return NULL; 4232 if (p->sibling.prev==&p->parent->children) return NULL;
4226 return list_entry(p->sibling.prev,struct task_struct,sibling); 4233 return list_entry(p->sibling.prev,struct task_struct,sibling);
4227 } 4234 }
4228 4235
4229 static inline struct task_struct *younger_sibling(struct task_struct *p) 4236 static inline struct task_struct *younger_sibling(struct task_struct *p)
4230 { 4237 {
4231 if (p->sibling.next==&p->parent->children) return NULL; 4238 if (p->sibling.next==&p->parent->children) return NULL;
4232 return list_entry(p->sibling.next,struct task_struct,sibling); 4239 return list_entry(p->sibling.next,struct task_struct,sibling);
4233 } 4240 }
4234 4241
4235 static void show_task(task_t *p) 4242 static void show_task(task_t *p)
4236 { 4243 {
4237 task_t *relative; 4244 task_t *relative;
4238 unsigned state; 4245 unsigned state;
4239 unsigned long free = 0; 4246 unsigned long free = 0;
4240 static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; 4247 static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" };
4241 4248
4242 printk("%-13.13s ", p->comm); 4249 printk("%-13.13s ", p->comm);
4243 state = p->state ? __ffs(p->state) + 1 : 0; 4250 state = p->state ? __ffs(p->state) + 1 : 0;
4244 if (state < ARRAY_SIZE(stat_nam)) 4251 if (state < ARRAY_SIZE(stat_nam))
4245 printk(stat_nam[state]); 4252 printk(stat_nam[state]);
4246 else 4253 else
4247 printk("?"); 4254 printk("?");
4248 #if (BITS_PER_LONG == 32) 4255 #if (BITS_PER_LONG == 32)
4249 if (state == TASK_RUNNING) 4256 if (state == TASK_RUNNING)
4250 printk(" running "); 4257 printk(" running ");
4251 else 4258 else
4252 printk(" %08lX ", thread_saved_pc(p)); 4259 printk(" %08lX ", thread_saved_pc(p));
4253 #else 4260 #else
4254 if (state == TASK_RUNNING) 4261 if (state == TASK_RUNNING)
4255 printk(" running task "); 4262 printk(" running task ");
4256 else 4263 else
4257 printk(" %016lx ", thread_saved_pc(p)); 4264 printk(" %016lx ", thread_saved_pc(p));
4258 #endif 4265 #endif
4259 #ifdef CONFIG_DEBUG_STACK_USAGE 4266 #ifdef CONFIG_DEBUG_STACK_USAGE
4260 { 4267 {
4261 unsigned long *n = end_of_stack(p); 4268 unsigned long *n = end_of_stack(p);
4262 while (!*n) 4269 while (!*n)
4263 n++; 4270 n++;
4264 free = (unsigned long)n - (unsigned long)end_of_stack(p); 4271 free = (unsigned long)n - (unsigned long)end_of_stack(p);
4265 } 4272 }
4266 #endif 4273 #endif
4267 printk("%5lu %5d %6d ", free, p->pid, p->parent->pid); 4274 printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
4268 if ((relative = eldest_child(p))) 4275 if ((relative = eldest_child(p)))
4269 printk("%5d ", relative->pid); 4276 printk("%5d ", relative->pid);
4270 else 4277 else
4271 printk(" "); 4278 printk(" ");
4272 if ((relative = younger_sibling(p))) 4279 if ((relative = younger_sibling(p)))
4273 printk("%7d", relative->pid); 4280 printk("%7d", relative->pid);
4274 else 4281 else
4275 printk(" "); 4282 printk(" ");
4276 if ((relative = older_sibling(p))) 4283 if ((relative = older_sibling(p)))
4277 printk(" %5d", relative->pid); 4284 printk(" %5d", relative->pid);
4278 else 4285 else
4279 printk(" "); 4286 printk(" ");
4280 if (!p->mm) 4287 if (!p->mm)
4281 printk(" (L-TLB)\n"); 4288 printk(" (L-TLB)\n");
4282 else 4289 else
4283 printk(" (NOTLB)\n"); 4290 printk(" (NOTLB)\n");
4284 4291
4285 if (state != TASK_RUNNING) 4292 if (state != TASK_RUNNING)
4286 show_stack(p, NULL); 4293 show_stack(p, NULL);
4287 } 4294 }
4288 4295
4289 void show_state(void) 4296 void show_state(void)
4290 { 4297 {
4291 task_t *g, *p; 4298 task_t *g, *p;
4292 4299
4293 #if (BITS_PER_LONG == 32) 4300 #if (BITS_PER_LONG == 32)
4294 printk("\n" 4301 printk("\n"
4295 " sibling\n"); 4302 " sibling\n");
4296 printk(" task PC pid father child younger older\n"); 4303 printk(" task PC pid father child younger older\n");
4297 #else 4304 #else
4298 printk("\n" 4305 printk("\n"
4299 " sibling\n"); 4306 " sibling\n");
4300 printk(" task PC pid father child younger older\n"); 4307 printk(" task PC pid father child younger older\n");
4301 #endif 4308 #endif
4302 read_lock(&tasklist_lock); 4309 read_lock(&tasklist_lock);
4303 do_each_thread(g, p) { 4310 do_each_thread(g, p) {
4304 /* 4311 /*
4305 * reset the NMI-timeout, listing all tasks on a slow 4312 * reset the NMI-timeout, listing all tasks on a slow
4306 * console might take a lot of time: 4313 * console might take a lot of time:
4307 */ 4314 */
4308 touch_nmi_watchdog(); 4315 touch_nmi_watchdog();
4309 show_task(p); 4316 show_task(p);
4310 } while_each_thread(g, p); 4317 } while_each_thread(g, p);
4311 4318
4312 read_unlock(&tasklist_lock); 4319 read_unlock(&tasklist_lock);
4313 mutex_debug_show_all_locks(); 4320 mutex_debug_show_all_locks();
4314 } 4321 }
4315 4322
4316 /** 4323 /**
4317 * init_idle - set up an idle thread for a given CPU 4324 * init_idle - set up an idle thread for a given CPU
4318 * @idle: task in question 4325 * @idle: task in question
4319 * @cpu: cpu the idle task belongs to 4326 * @cpu: cpu the idle task belongs to
4320 * 4327 *
4321 * NOTE: this function does not set the idle thread's NEED_RESCHED 4328 * NOTE: this function does not set the idle thread's NEED_RESCHED
4322 * flag, to make booting more robust. 4329 * flag, to make booting more robust.
4323 */ 4330 */
4324 void __devinit init_idle(task_t *idle, int cpu) 4331 void __devinit init_idle(task_t *idle, int cpu)
4325 { 4332 {
4326 runqueue_t *rq = cpu_rq(cpu); 4333 runqueue_t *rq = cpu_rq(cpu);
4327 unsigned long flags; 4334 unsigned long flags;
4328 4335
4329 idle->timestamp = sched_clock(); 4336 idle->timestamp = sched_clock();
4330 idle->sleep_avg = 0; 4337 idle->sleep_avg = 0;
4331 idle->array = NULL; 4338 idle->array = NULL;
4332 idle->prio = MAX_PRIO; 4339 idle->prio = MAX_PRIO;
4333 idle->state = TASK_RUNNING; 4340 idle->state = TASK_RUNNING;
4334 idle->cpus_allowed = cpumask_of_cpu(cpu); 4341 idle->cpus_allowed = cpumask_of_cpu(cpu);
4335 set_task_cpu(idle, cpu); 4342 set_task_cpu(idle, cpu);
4336 4343
4337 spin_lock_irqsave(&rq->lock, flags); 4344 spin_lock_irqsave(&rq->lock, flags);
4338 rq->curr = rq->idle = idle; 4345 rq->curr = rq->idle = idle;
4339 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 4346 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4340 idle->oncpu = 1; 4347 idle->oncpu = 1;
4341 #endif 4348 #endif
4342 spin_unlock_irqrestore(&rq->lock, flags); 4349 spin_unlock_irqrestore(&rq->lock, flags);
4343 4350
4344 /* Set the preempt count _outside_ the spinlocks! */ 4351 /* Set the preempt count _outside_ the spinlocks! */
4345 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) 4352 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4346 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); 4353 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
4347 #else 4354 #else
4348 task_thread_info(idle)->preempt_count = 0; 4355 task_thread_info(idle)->preempt_count = 0;
4349 #endif 4356 #endif
4350 } 4357 }
4351 4358
4352 /* 4359 /*
4353 * In a system that switches off the HZ timer, nohz_cpu_mask 4360 * In a system that switches off the HZ timer, nohz_cpu_mask
4354 * indicates which CPUs entered this state. This is used 4361 * indicates which CPUs entered this state. This is used
4355 * in the rcu update to wait only for active CPUs. For systems 4362 * in the rcu update to wait only for active CPUs. For systems
4356 * that do not switch off the HZ timer, nohz_cpu_mask should 4363 * that do not switch off the HZ timer, nohz_cpu_mask should
4357 * always be CPU_MASK_NONE. 4364 * always be CPU_MASK_NONE.
4358 */ 4365 */
4359 cpumask_t nohz_cpu_mask = CPU_MASK_NONE; 4366 cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4360 4367
4361 #ifdef CONFIG_SMP 4368 #ifdef CONFIG_SMP
4362 /* 4369 /*
4363 * This is how migration works: 4370 * This is how migration works:
4364 * 4371 *
4365 * 1) we queue a migration_req_t structure in the source CPU's 4372 * 1) we queue a migration_req_t structure in the source CPU's
4366 * runqueue and wake up that CPU's migration thread. 4373 * runqueue and wake up that CPU's migration thread.
4367 * 2) we wait on the request's completion => thread blocks. 4374 * 2) we wait on the request's completion => thread blocks.
4368 * 3) migration thread wakes up (implicitly it forces the migrated 4375 * 3) migration thread wakes up (implicitly it forces the migrated
4369 * thread off the CPU) 4376 * thread off the CPU)
4370 * 4) it gets the migration request and checks whether the migrated 4377 * 4) it gets the migration request and checks whether the migrated
4371 * task is still in the wrong runqueue. 4378 * task is still in the wrong runqueue.
4372 * 5) if it's in the wrong runqueue then the migration thread removes 4379 * 5) if it's in the wrong runqueue then the migration thread removes
4373 * it and puts it into the right queue. 4380 * it and puts it into the right queue.
4374 * 6) the migration thread completes the request. 4381 * 6) the migration thread completes the request.
4375 * 7) we wake up and the migration is done. 4382 * 7) we wake up and the migration is done.
4376 */ 4383 */
4377 4384
4378 /* 4385 /*
4379 * Change a given task's CPU affinity. Migrate the thread to a 4386 * Change a given task's CPU affinity. Migrate the thread to a
4380 * proper CPU and schedule it away if the CPU it's executing on 4387 * proper CPU and schedule it away if the CPU it's executing on
4381 * is removed from the allowed bitmask. 4388 * is removed from the allowed bitmask.
4382 * 4389 *
4383 * NOTE: the caller must have a valid reference to the task, the 4390 * NOTE: the caller must have a valid reference to the task, the
4384 * task must not exit() & deallocate itself prematurely. The 4391 * task must not exit() & deallocate itself prematurely. The
4385 * call is not atomic; no spinlocks may be held. 4392 * call is not atomic; no spinlocks may be held.
4386 */ 4393 */
4387 int set_cpus_allowed(task_t *p, cpumask_t new_mask) 4394 int set_cpus_allowed(task_t *p, cpumask_t new_mask)
4388 { 4395 {
4389 unsigned long flags; 4396 unsigned long flags;
4390 int ret = 0; 4397 int ret = 0;
4391 migration_req_t req; 4398 migration_req_t req;
4392 runqueue_t *rq; 4399 runqueue_t *rq;
4393 4400
4394 rq = task_rq_lock(p, &flags); 4401 rq = task_rq_lock(p, &flags);
4395 if (!cpus_intersects(new_mask, cpu_online_map)) { 4402 if (!cpus_intersects(new_mask, cpu_online_map)) {
4396 ret = -EINVAL; 4403 ret = -EINVAL;
4397 goto out; 4404 goto out;
4398 } 4405 }
4399 4406
4400 p->cpus_allowed = new_mask; 4407 p->cpus_allowed = new_mask;
4401 /* Can the task run on the task's current CPU? If so, we're done */ 4408 /* Can the task run on the task's current CPU? If so, we're done */
4402 if (cpu_isset(task_cpu(p), new_mask)) 4409 if (cpu_isset(task_cpu(p), new_mask))
4403 goto out; 4410 goto out;
4404 4411
4405 if (migrate_task(p, any_online_cpu(new_mask), &req)) { 4412 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4406 /* Need help from migration thread: drop lock and wait. */ 4413 /* Need help from migration thread: drop lock and wait. */
4407 task_rq_unlock(rq, &flags); 4414 task_rq_unlock(rq, &flags);
4408 wake_up_process(rq->migration_thread); 4415 wake_up_process(rq->migration_thread);
4409 wait_for_completion(&req.done); 4416 wait_for_completion(&req.done);
4410 tlb_migrate_finish(p->mm); 4417 tlb_migrate_finish(p->mm);
4411 return 0; 4418 return 0;
4412 } 4419 }
4413 out: 4420 out:
4414 task_rq_unlock(rq, &flags); 4421 task_rq_unlock(rq, &flags);
4415 return ret; 4422 return ret;
4416 } 4423 }
4417 4424
4418 EXPORT_SYMBOL_GPL(set_cpus_allowed); 4425 EXPORT_SYMBOL_GPL(set_cpus_allowed);
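
The NOTE above (hold a task reference, no spinlocks held) is the same pattern sched_setaffinity() follows earlier in this file. A hedged sketch of how another kernel-side caller might use this export; the helper name and the single-CPU pinning are hypothetical:

#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/errno.h>

static int pin_pid_to_cpu(pid_t pid, int cpu)
{
	task_t *p;
	int ret;

	read_lock(&tasklist_lock);
	p = find_task_by_pid(pid);
	if (p)
		get_task_struct(p);	/* keep it alive after dropping the lock */
	read_unlock(&tasklist_lock);

	if (!p)
		return -ESRCH;

	/* No spinlocks may be held across set_cpus_allowed(). */
	ret = set_cpus_allowed(p, cpumask_of_cpu(cpu));
	put_task_struct(p);
	return ret;
}
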
4419 4426
4420 /* 4427 /*
4421 * Move (not current) task off this cpu, onto dest cpu. We're doing 4428 * Move (not current) task off this cpu, onto dest cpu. We're doing
4422 * this because either it can't run here any more (set_cpus_allowed() 4429 * this because either it can't run here any more (set_cpus_allowed()
4423 * away from this CPU, or CPU going down), or because we're 4430 * away from this CPU, or CPU going down), or because we're
4424 * attempting to rebalance this task on exec (sched_exec). 4431 * attempting to rebalance this task on exec (sched_exec).
4425 * 4432 *
4426 * So we race with normal scheduler movements, but that's OK, as long 4433 * So we race with normal scheduler movements, but that's OK, as long
4427 * as the task is no longer on this CPU. 4434 * as the task is no longer on this CPU.
4428 */ 4435 */
4429 static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4436 static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4430 { 4437 {
4431 runqueue_t *rq_dest, *rq_src; 4438 runqueue_t *rq_dest, *rq_src;
4432 4439
4433 if (unlikely(cpu_is_offline(dest_cpu))) 4440 if (unlikely(cpu_is_offline(dest_cpu)))
4434 return; 4441 return;
4435 4442
4436 rq_src = cpu_rq(src_cpu); 4443 rq_src = cpu_rq(src_cpu);
4437 rq_dest = cpu_rq(dest_cpu); 4444 rq_dest = cpu_rq(dest_cpu);
4438 4445
4439 double_rq_lock(rq_src, rq_dest); 4446 double_rq_lock(rq_src, rq_dest);
4440 /* Already moved. */ 4447 /* Already moved. */
4441 if (task_cpu(p) != src_cpu) 4448 if (task_cpu(p) != src_cpu)
4442 goto out; 4449 goto out;
4443 /* Affinity changed (again). */ 4450 /* Affinity changed (again). */
4444 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 4451 if (!cpu_isset(dest_cpu, p->cpus_allowed))
4445 goto out; 4452 goto out;
4446 4453
4447 set_task_cpu(p, dest_cpu); 4454 set_task_cpu(p, dest_cpu);
4448 if (p->array) { 4455 if (p->array) {
4449 /* 4456 /*
4450 * Sync timestamp with rq_dest's before activating. 4457 * Sync timestamp with rq_dest's before activating.
4451 * The same thing could be achieved by doing this step 4458 * The same thing could be achieved by doing this step
4452 * afterwards, and pretending it was a local activate. 4459 * afterwards, and pretending it was a local activate.
4453 * This way is cleaner and logically correct. 4460 * This way is cleaner and logically correct.
4454 */ 4461 */
4455 p->timestamp = p->timestamp - rq_src->timestamp_last_tick 4462 p->timestamp = p->timestamp - rq_src->timestamp_last_tick
4456 + rq_dest->timestamp_last_tick; 4463 + rq_dest->timestamp_last_tick;
4457 deactivate_task(p, rq_src); 4464 deactivate_task(p, rq_src);
4458 activate_task(p, rq_dest, 0); 4465 activate_task(p, rq_dest, 0);
4459 if (TASK_PREEMPTS_CURR(p, rq_dest)) 4466 if (TASK_PREEMPTS_CURR(p, rq_dest))
4460 resched_task(rq_dest->curr); 4467 resched_task(rq_dest->curr);
4461 } 4468 }
4462 4469
4463 out: 4470 out:
4464 double_rq_unlock(rq_src, rq_dest); 4471 double_rq_unlock(rq_src, rq_dest);
4465 } 4472 }
4466 4473
4467 /* 4474 /*
4468 * migration_thread - this is a highprio system thread that performs 4475 * migration_thread - this is a highprio system thread that performs
4469 * thread migration by bumping thread off CPU then 'pushing' onto 4476 * thread migration by bumping thread off CPU then 'pushing' onto
4470 * another runqueue. 4477 * another runqueue.
4471 */ 4478 */
4472 static int migration_thread(void *data) 4479 static int migration_thread(void *data)
4473 { 4480 {
4474 runqueue_t *rq; 4481 runqueue_t *rq;
4475 int cpu = (long)data; 4482 int cpu = (long)data;
4476 4483
4477 rq = cpu_rq(cpu); 4484 rq = cpu_rq(cpu);
4478 BUG_ON(rq->migration_thread != current); 4485 BUG_ON(rq->migration_thread != current);
4479 4486
4480 set_current_state(TASK_INTERRUPTIBLE); 4487 set_current_state(TASK_INTERRUPTIBLE);
4481 while (!kthread_should_stop()) { 4488 while (!kthread_should_stop()) {
4482 struct list_head *head; 4489 struct list_head *head;
4483 migration_req_t *req; 4490 migration_req_t *req;
4484 4491
4485 try_to_freeze(); 4492 try_to_freeze();
4486 4493
4487 spin_lock_irq(&rq->lock); 4494 spin_lock_irq(&rq->lock);
4488 4495
4489 if (cpu_is_offline(cpu)) { 4496 if (cpu_is_offline(cpu)) {
4490 spin_unlock_irq(&rq->lock); 4497 spin_unlock_irq(&rq->lock);
4491 goto wait_to_die; 4498 goto wait_to_die;
4492 } 4499 }
4493 4500
4494 if (rq->active_balance) { 4501 if (rq->active_balance) {
4495 active_load_balance(rq, cpu); 4502 active_load_balance(rq, cpu);
4496 rq->active_balance = 0; 4503 rq->active_balance = 0;
4497 } 4504 }
4498 4505
4499 head = &rq->migration_queue; 4506 head = &rq->migration_queue;
4500 4507
4501 if (list_empty(head)) { 4508 if (list_empty(head)) {
4502 spin_unlock_irq(&rq->lock); 4509 spin_unlock_irq(&rq->lock);
4503 schedule(); 4510 schedule();
4504 set_current_state(TASK_INTERRUPTIBLE); 4511 set_current_state(TASK_INTERRUPTIBLE);
4505 continue; 4512 continue;
4506 } 4513 }
4507 req = list_entry(head->next, migration_req_t, list); 4514 req = list_entry(head->next, migration_req_t, list);
4508 list_del_init(head->next); 4515 list_del_init(head->next);
4509 4516
4510 spin_unlock(&rq->lock); 4517 spin_unlock(&rq->lock);
4511 __migrate_task(req->task, cpu, req->dest_cpu); 4518 __migrate_task(req->task, cpu, req->dest_cpu);
4512 local_irq_enable(); 4519 local_irq_enable();
4513 4520
4514 complete(&req->done); 4521 complete(&req->done);
4515 } 4522 }
4516 __set_current_state(TASK_RUNNING); 4523 __set_current_state(TASK_RUNNING);
4517 return 0; 4524 return 0;
4518 4525
4519 wait_to_die: 4526 wait_to_die:
4520 /* Wait for kthread_stop */ 4527 /* Wait for kthread_stop */
4521 set_current_state(TASK_INTERRUPTIBLE); 4528 set_current_state(TASK_INTERRUPTIBLE);
4522 while (!kthread_should_stop()) { 4529 while (!kthread_should_stop()) {
4523 schedule(); 4530 schedule();
4524 set_current_state(TASK_INTERRUPTIBLE); 4531 set_current_state(TASK_INTERRUPTIBLE);
4525 } 4532 }
4526 __set_current_state(TASK_RUNNING); 4533 __set_current_state(TASK_RUNNING);
4527 return 0; 4534 return 0;
4528 } 4535 }
4529 4536
4530 #ifdef CONFIG_HOTPLUG_CPU 4537 #ifdef CONFIG_HOTPLUG_CPU
4531 /* Figure out where task on dead CPU should go, use force if necessary. */ 4538 /* Figure out where task on dead CPU should go, use force if necessary. */
4532 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) 4539 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
4533 { 4540 {
4534 int dest_cpu; 4541 int dest_cpu;
4535 cpumask_t mask; 4542 cpumask_t mask;
4536 4543
4537 /* On same node? */ 4544 /* On same node? */
4538 mask = node_to_cpumask(cpu_to_node(dead_cpu)); 4545 mask = node_to_cpumask(cpu_to_node(dead_cpu));
4539 cpus_and(mask, mask, tsk->cpus_allowed); 4546 cpus_and(mask, mask, tsk->cpus_allowed);
4540 dest_cpu = any_online_cpu(mask); 4547 dest_cpu = any_online_cpu(mask);
4541 4548
4542 /* On any allowed CPU? */ 4549 /* On any allowed CPU? */
4543 if (dest_cpu == NR_CPUS) 4550 if (dest_cpu == NR_CPUS)
4544 dest_cpu = any_online_cpu(tsk->cpus_allowed); 4551 dest_cpu = any_online_cpu(tsk->cpus_allowed);
4545 4552
4546 /* No more Mr. Nice Guy. */ 4553 /* No more Mr. Nice Guy. */
4547 if (dest_cpu == NR_CPUS) { 4554 if (dest_cpu == NR_CPUS) {
4548 cpus_setall(tsk->cpus_allowed); 4555 cpus_setall(tsk->cpus_allowed);
4549 dest_cpu = any_online_cpu(tsk->cpus_allowed); 4556 dest_cpu = any_online_cpu(tsk->cpus_allowed);
4550 4557
4551 /* 4558 /*
4552 * Don't tell them about moving exiting tasks or 4559 * Don't tell them about moving exiting tasks or
4553 * kernel threads (both mm NULL), since they never 4560 * kernel threads (both mm NULL), since they never
4554 * leave the kernel. 4561 * leave the kernel.
4555 */ 4562 */
4556 if (tsk->mm && printk_ratelimit()) 4563 if (tsk->mm && printk_ratelimit())
4557 printk(KERN_INFO "process %d (%s) no " 4564 printk(KERN_INFO "process %d (%s) no "
4558 "longer affine to cpu%d\n", 4565 "longer affine to cpu%d\n",
4559 tsk->pid, tsk->comm, dead_cpu); 4566 tsk->pid, tsk->comm, dead_cpu);
4560 } 4567 }
4561 __migrate_task(tsk, dead_cpu, dest_cpu); 4568 __migrate_task(tsk, dead_cpu, dest_cpu);
4562 } 4569 }
4563 4570
4564 /* 4571 /*
4565 * While a dead CPU has no uninterruptible tasks queued at this point, 4572 * While a dead CPU has no uninterruptible tasks queued at this point,
4566 * it might still have a nonzero ->nr_uninterruptible counter, because 4573 * it might still have a nonzero ->nr_uninterruptible counter, because
4567 * for performance reasons the counter is not strictly tracking tasks to 4574 * for performance reasons the counter is not strictly tracking tasks to
4568 * their home CPUs. So we just add the counter to another CPU's counter, 4575 * their home CPUs. So we just add the counter to another CPU's counter,
4569 * to keep the global sum constant after CPU-down: 4576 * to keep the global sum constant after CPU-down:
4570 */ 4577 */
4571 static void migrate_nr_uninterruptible(runqueue_t *rq_src) 4578 static void migrate_nr_uninterruptible(runqueue_t *rq_src)
4572 { 4579 {
4573 runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); 4580 runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
4574 unsigned long flags; 4581 unsigned long flags;
4575 4582
4576 local_irq_save(flags); 4583 local_irq_save(flags);
4577 double_rq_lock(rq_src, rq_dest); 4584 double_rq_lock(rq_src, rq_dest);
4578 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 4585 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
4579 rq_src->nr_uninterruptible = 0; 4586 rq_src->nr_uninterruptible = 0;
4580 double_rq_unlock(rq_src, rq_dest); 4587 double_rq_unlock(rq_src, rq_dest);
4581 local_irq_restore(flags); 4588 local_irq_restore(flags);
4582 } 4589 }
4583 4590
4584 /* Run through task list and migrate tasks from the dead cpu. */ 4591 /* Run through task list and migrate tasks from the dead cpu. */
4585 static void migrate_live_tasks(int src_cpu) 4592 static void migrate_live_tasks(int src_cpu)
4586 { 4593 {
4587 struct task_struct *tsk, *t; 4594 struct task_struct *tsk, *t;
4588 4595
4589 write_lock_irq(&tasklist_lock); 4596 write_lock_irq(&tasklist_lock);
4590 4597
4591 do_each_thread(t, tsk) { 4598 do_each_thread(t, tsk) {
4592 if (tsk == current) 4599 if (tsk == current)
4593 continue; 4600 continue;
4594 4601
4595 if (task_cpu(tsk) == src_cpu) 4602 if (task_cpu(tsk) == src_cpu)
4596 move_task_off_dead_cpu(src_cpu, tsk); 4603 move_task_off_dead_cpu(src_cpu, tsk);
4597 } while_each_thread(t, tsk); 4604 } while_each_thread(t, tsk);
4598 4605
4599 write_unlock_irq(&tasklist_lock); 4606 write_unlock_irq(&tasklist_lock);
4600 } 4607 }
4601 4608
4602 /* Schedules idle task to be the next runnable task on current CPU. 4609 /* Schedules idle task to be the next runnable task on current CPU.
4603 * It does so by boosting its priority to highest possible and adding it to 4610 * It does so by boosting its priority to highest possible and adding it to
4604 * the _front_ of runqueue. Used by CPU offline code. 4611 * the _front_ of runqueue. Used by CPU offline code.
4605 */ 4612 */
4606 void sched_idle_next(void) 4613 void sched_idle_next(void)
4607 { 4614 {
4608 int cpu = smp_processor_id(); 4615 int cpu = smp_processor_id();
4609 runqueue_t *rq = this_rq(); 4616 runqueue_t *rq = this_rq();
4610 struct task_struct *p = rq->idle; 4617 struct task_struct *p = rq->idle;
4611 unsigned long flags; 4618 unsigned long flags;
4612 4619
4613 /* cpu has to be offline */ 4620 /* cpu has to be offline */
4614 BUG_ON(cpu_online(cpu)); 4621 BUG_ON(cpu_online(cpu));
4615 4622
4616 /* Strictly not necessary since rest of the CPUs are stopped by now 4623 /* Strictly not necessary since rest of the CPUs are stopped by now
4617 * and interrupts disabled on current cpu. 4624 * and interrupts disabled on current cpu.
4618 */ 4625 */
4619 spin_lock_irqsave(&rq->lock, flags); 4626 spin_lock_irqsave(&rq->lock, flags);
4620 4627
4621 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 4628 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
4622 /* Add idle task to _front_ of its priority queue */ 4629 /* Add idle task to _front_ of its priority queue */
4623 __activate_idle_task(p, rq); 4630 __activate_idle_task(p, rq);
4624 4631
4625 spin_unlock_irqrestore(&rq->lock, flags); 4632 spin_unlock_irqrestore(&rq->lock, flags);
4626 } 4633 }
4627 4634
4628 /* Ensures that the idle task is using init_mm right before its cpu goes 4635 /* Ensures that the idle task is using init_mm right before its cpu goes
4629 * offline. 4636 * offline.
4630 */ 4637 */
4631 void idle_task_exit(void) 4638 void idle_task_exit(void)
4632 { 4639 {
4633 struct mm_struct *mm = current->active_mm; 4640 struct mm_struct *mm = current->active_mm;
4634 4641
4635 BUG_ON(cpu_online(smp_processor_id())); 4642 BUG_ON(cpu_online(smp_processor_id()));
4636 4643
4637 if (mm != &init_mm) 4644 if (mm != &init_mm)
4638 switch_mm(mm, &init_mm, current); 4645 switch_mm(mm, &init_mm, current);
4639 mmdrop(mm); 4646 mmdrop(mm);
4640 } 4647 }
4641 4648
4642 static void migrate_dead(unsigned int dead_cpu, task_t *tsk) 4649 static void migrate_dead(unsigned int dead_cpu, task_t *tsk)
4643 { 4650 {
4644 struct runqueue *rq = cpu_rq(dead_cpu); 4651 struct runqueue *rq = cpu_rq(dead_cpu);
4645 4652
4646 /* Must be exiting, otherwise would be on tasklist. */ 4653 /* Must be exiting, otherwise would be on tasklist. */
4647 BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD); 4654 BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD);
4648 4655
4649 /* Cannot have done final schedule yet: would have vanished. */ 4656 /* Cannot have done final schedule yet: would have vanished. */
4650 BUG_ON(tsk->flags & PF_DEAD); 4657 BUG_ON(tsk->flags & PF_DEAD);
4651 4658
4652 get_task_struct(tsk); 4659 get_task_struct(tsk);
4653 4660
4654 /* 4661 /*
4655 * Drop lock around migration; if someone else moves it, 4662 * Drop lock around migration; if someone else moves it,
4656 * that's OK. No task can be added to this CPU, so iteration is 4663 * that's OK. No task can be added to this CPU, so iteration is
4657 * fine. 4664 * fine.
4658 */ 4665 */
4659 spin_unlock_irq(&rq->lock); 4666 spin_unlock_irq(&rq->lock);
4660 move_task_off_dead_cpu(dead_cpu, tsk); 4667 move_task_off_dead_cpu(dead_cpu, tsk);
4661 spin_lock_irq(&rq->lock); 4668 spin_lock_irq(&rq->lock);
4662 4669
4663 put_task_struct(tsk); 4670 put_task_struct(tsk);
4664 } 4671 }
4665 4672
4666 /* release_task() removes task from tasklist, so we won't find dead tasks. */ 4673 /* release_task() removes task from tasklist, so we won't find dead tasks. */
4667 static void migrate_dead_tasks(unsigned int dead_cpu) 4674 static void migrate_dead_tasks(unsigned int dead_cpu)
4668 { 4675 {
4669 unsigned arr, i; 4676 unsigned arr, i;
4670 struct runqueue *rq = cpu_rq(dead_cpu); 4677 struct runqueue *rq = cpu_rq(dead_cpu);
4671 4678
4672 for (arr = 0; arr < 2; arr++) { 4679 for (arr = 0; arr < 2; arr++) {
4673 for (i = 0; i < MAX_PRIO; i++) { 4680 for (i = 0; i < MAX_PRIO; i++) {
4674 struct list_head *list = &rq->arrays[arr].queue[i]; 4681 struct list_head *list = &rq->arrays[arr].queue[i];
4675 while (!list_empty(list)) 4682 while (!list_empty(list))
4676 migrate_dead(dead_cpu, 4683 migrate_dead(dead_cpu,
4677 list_entry(list->next, task_t, 4684 list_entry(list->next, task_t,
4678 run_list)); 4685 run_list));
4679 } 4686 }
4680 } 4687 }
4681 } 4688 }
4682 #endif /* CONFIG_HOTPLUG_CPU */ 4689 #endif /* CONFIG_HOTPLUG_CPU */
4683 4690
4684 /* 4691 /*
4685 * migration_call - callback that gets triggered when a CPU is added. 4692 * migration_call - callback that gets triggered when a CPU is added.
4686 * Here we can start up the necessary migration thread for the new CPU. 4693 * Here we can start up the necessary migration thread for the new CPU.
4687 */ 4694 */
4688 static int migration_call(struct notifier_block *nfb, unsigned long action, 4695 static int migration_call(struct notifier_block *nfb, unsigned long action,
4689 void *hcpu) 4696 void *hcpu)
4690 { 4697 {
4691 int cpu = (long)hcpu; 4698 int cpu = (long)hcpu;
4692 struct task_struct *p; 4699 struct task_struct *p;
4693 struct runqueue *rq; 4700 struct runqueue *rq;
4694 unsigned long flags; 4701 unsigned long flags;
4695 4702
4696 switch (action) { 4703 switch (action) {
4697 case CPU_UP_PREPARE: 4704 case CPU_UP_PREPARE:
4698 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); 4705 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
4699 if (IS_ERR(p)) 4706 if (IS_ERR(p))
4700 return NOTIFY_BAD; 4707 return NOTIFY_BAD;
4701 p->flags |= PF_NOFREEZE; 4708 p->flags |= PF_NOFREEZE;
4702 kthread_bind(p, cpu); 4709 kthread_bind(p, cpu);
4703 /* Must be high prio: stop_machine expects to yield to it. */ 4710 /* Must be high prio: stop_machine expects to yield to it. */
4704 rq = task_rq_lock(p, &flags); 4711 rq = task_rq_lock(p, &flags);
4705 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 4712 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
4706 task_rq_unlock(rq, &flags); 4713 task_rq_unlock(rq, &flags);
4707 cpu_rq(cpu)->migration_thread = p; 4714 cpu_rq(cpu)->migration_thread = p;
4708 break; 4715 break;
4709 case CPU_ONLINE: 4716 case CPU_ONLINE:
4710 /* Strictly unnecessary, as first user will wake it. */ 4717 /* Strictly unnecessary, as first user will wake it. */
4711 wake_up_process(cpu_rq(cpu)->migration_thread); 4718 wake_up_process(cpu_rq(cpu)->migration_thread);
4712 break; 4719 break;
4713 #ifdef CONFIG_HOTPLUG_CPU 4720 #ifdef CONFIG_HOTPLUG_CPU
4714 case CPU_UP_CANCELED: 4721 case CPU_UP_CANCELED:
4715 /* Unbind it from offline cpu so it can run. Fall thru. */ 4722 /* Unbind it from offline cpu so it can run. Fall thru. */
4716 kthread_bind(cpu_rq(cpu)->migration_thread, 4723 kthread_bind(cpu_rq(cpu)->migration_thread,
4717 any_online_cpu(cpu_online_map)); 4724 any_online_cpu(cpu_online_map));
4718 kthread_stop(cpu_rq(cpu)->migration_thread); 4725 kthread_stop(cpu_rq(cpu)->migration_thread);
4719 cpu_rq(cpu)->migration_thread = NULL; 4726 cpu_rq(cpu)->migration_thread = NULL;
4720 break; 4727 break;
4721 case CPU_DEAD: 4728 case CPU_DEAD:
4722 migrate_live_tasks(cpu); 4729 migrate_live_tasks(cpu);
4723 rq = cpu_rq(cpu); 4730 rq = cpu_rq(cpu);
4724 kthread_stop(rq->migration_thread); 4731 kthread_stop(rq->migration_thread);
4725 rq->migration_thread = NULL; 4732 rq->migration_thread = NULL;
4726 /* Idle task back to normal (off runqueue, low prio) */ 4733 /* Idle task back to normal (off runqueue, low prio) */
4727 rq = task_rq_lock(rq->idle, &flags); 4734 rq = task_rq_lock(rq->idle, &flags);
4728 deactivate_task(rq->idle, rq); 4735 deactivate_task(rq->idle, rq);
4729 rq->idle->static_prio = MAX_PRIO; 4736 rq->idle->static_prio = MAX_PRIO;
4730 __setscheduler(rq->idle, SCHED_NORMAL, 0); 4737 __setscheduler(rq->idle, SCHED_NORMAL, 0);
4731 migrate_dead_tasks(cpu); 4738 migrate_dead_tasks(cpu);
4732 task_rq_unlock(rq, &flags); 4739 task_rq_unlock(rq, &flags);
4733 migrate_nr_uninterruptible(rq); 4740 migrate_nr_uninterruptible(rq);
4734 BUG_ON(rq->nr_running != 0); 4741 BUG_ON(rq->nr_running != 0);
4735 4742
4736 /* No need to migrate the tasks: it was best-effort if 4743 /* No need to migrate the tasks: it was best-effort if
4737 * they didn't do lock_cpu_hotplug(). Just wake up 4744 * they didn't do lock_cpu_hotplug(). Just wake up
4738 * the requestors. */ 4745 * the requestors. */
4739 spin_lock_irq(&rq->lock); 4746 spin_lock_irq(&rq->lock);
4740 while (!list_empty(&rq->migration_queue)) { 4747 while (!list_empty(&rq->migration_queue)) {
4741 migration_req_t *req; 4748 migration_req_t *req;
4742 req = list_entry(rq->migration_queue.next, 4749 req = list_entry(rq->migration_queue.next,
4743 migration_req_t, list); 4750 migration_req_t, list);
4744 list_del_init(&req->list); 4751 list_del_init(&req->list);
4745 complete(&req->done); 4752 complete(&req->done);
4746 } 4753 }
4747 spin_unlock_irq(&rq->lock); 4754 spin_unlock_irq(&rq->lock);
4748 break; 4755 break;
4749 #endif 4756 #endif
4750 } 4757 }
4751 return NOTIFY_OK; 4758 return NOTIFY_OK;
4752 } 4759 }
4753 4760
4754 /* Register at highest priority so that task migration (migrate_all_tasks) 4761 /* Register at highest priority so that task migration (migrate_all_tasks)
4755 * happens before everything else. 4762 * happens before everything else.
4756 */ 4763 */
4757 static struct notifier_block __devinitdata migration_notifier = { 4764 static struct notifier_block __devinitdata migration_notifier = {
4758 .notifier_call = migration_call, 4765 .notifier_call = migration_call,
4759 .priority = 10 4766 .priority = 10
4760 }; 4767 };
4761 4768
4762 int __init migration_init(void) 4769 int __init migration_init(void)
4763 { 4770 {
4764 void *cpu = (void *)(long)smp_processor_id(); 4771 void *cpu = (void *)(long)smp_processor_id();
4765 /* Start one for boot CPU. */ 4772 /* Start one for boot CPU. */
4766 migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 4773 migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
4767 migration_call(&migration_notifier, CPU_ONLINE, cpu); 4774 migration_call(&migration_notifier, CPU_ONLINE, cpu);
4768 register_cpu_notifier(&migration_notifier); 4775 register_cpu_notifier(&migration_notifier);
4769 return 0; 4776 return 0;
4770 } 4777 }
4771 #endif 4778 #endif
4772 4779
4773 #ifdef CONFIG_SMP 4780 #ifdef CONFIG_SMP
4774 #undef SCHED_DOMAIN_DEBUG 4781 #undef SCHED_DOMAIN_DEBUG
4775 #ifdef SCHED_DOMAIN_DEBUG 4782 #ifdef SCHED_DOMAIN_DEBUG
4776 static void sched_domain_debug(struct sched_domain *sd, int cpu) 4783 static void sched_domain_debug(struct sched_domain *sd, int cpu)
4777 { 4784 {
4778 int level = 0; 4785 int level = 0;
4779 4786
4780 if (!sd) { 4787 if (!sd) {
4781 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 4788 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
4782 return; 4789 return;
4783 } 4790 }
4784 4791
4785 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 4792 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
4786 4793
4787 do { 4794 do {
4788 int i; 4795 int i;
4789 char str[NR_CPUS]; 4796 char str[NR_CPUS];
4790 struct sched_group *group = sd->groups; 4797 struct sched_group *group = sd->groups;
4791 cpumask_t groupmask; 4798 cpumask_t groupmask;
4792 4799
4793 cpumask_scnprintf(str, NR_CPUS, sd->span); 4800 cpumask_scnprintf(str, NR_CPUS, sd->span);
4794 cpus_clear(groupmask); 4801 cpus_clear(groupmask);
4795 4802
4796 printk(KERN_DEBUG); 4803 printk(KERN_DEBUG);
4797 for (i = 0; i < level + 1; i++) 4804 for (i = 0; i < level + 1; i++)
4798 printk(" "); 4805 printk(" ");
4799 printk("domain %d: ", level); 4806 printk("domain %d: ", level);
4800 4807
4801 if (!(sd->flags & SD_LOAD_BALANCE)) { 4808 if (!(sd->flags & SD_LOAD_BALANCE)) {
4802 printk("does not load-balance\n"); 4809 printk("does not load-balance\n");
4803 if (sd->parent) 4810 if (sd->parent)
4804 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); 4811 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
4805 break; 4812 break;
4806 } 4813 }
4807 4814
4808 printk("span %s\n", str); 4815 printk("span %s\n", str);
4809 4816
4810 if (!cpu_isset(cpu, sd->span)) 4817 if (!cpu_isset(cpu, sd->span))
4811 printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); 4818 printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
4812 if (!cpu_isset(cpu, group->cpumask)) 4819 if (!cpu_isset(cpu, group->cpumask))
4813 printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); 4820 printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
4814 4821
4815 printk(KERN_DEBUG); 4822 printk(KERN_DEBUG);
4816 for (i = 0; i < level + 2; i++) 4823 for (i = 0; i < level + 2; i++)
4817 printk(" "); 4824 printk(" ");
4818 printk("groups:"); 4825 printk("groups:");
4819 do { 4826 do {
4820 if (!group) { 4827 if (!group) {
4821 printk("\n"); 4828 printk("\n");
4822 printk(KERN_ERR "ERROR: group is NULL\n"); 4829 printk(KERN_ERR "ERROR: group is NULL\n");
4823 break; 4830 break;
4824 } 4831 }
4825 4832
4826 if (!group->cpu_power) { 4833 if (!group->cpu_power) {
4827 printk("\n"); 4834 printk("\n");
4828 printk(KERN_ERR "ERROR: domain->cpu_power not set\n"); 4835 printk(KERN_ERR "ERROR: domain->cpu_power not set\n");
4829 } 4836 }
4830 4837
4831 if (!cpus_weight(group->cpumask)) { 4838 if (!cpus_weight(group->cpumask)) {
4832 printk("\n"); 4839 printk("\n");
4833 printk(KERN_ERR "ERROR: empty group\n"); 4840 printk(KERN_ERR "ERROR: empty group\n");
4834 } 4841 }
4835 4842
4836 if (cpus_intersects(groupmask, group->cpumask)) { 4843 if (cpus_intersects(groupmask, group->cpumask)) {
4837 printk("\n"); 4844 printk("\n");
4838 printk(KERN_ERR "ERROR: repeated CPUs\n"); 4845 printk(KERN_ERR "ERROR: repeated CPUs\n");
4839 } 4846 }
4840 4847
4841 cpus_or(groupmask, groupmask, group->cpumask); 4848 cpus_or(groupmask, groupmask, group->cpumask);
4842 4849
4843 cpumask_scnprintf(str, NR_CPUS, group->cpumask); 4850 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
4844 printk(" %s", str); 4851 printk(" %s", str);
4845 4852
4846 group = group->next; 4853 group = group->next;
4847 } while (group != sd->groups); 4854 } while (group != sd->groups);
4848 printk("\n"); 4855 printk("\n");
4849 4856
4850 if (!cpus_equal(sd->span, groupmask)) 4857 if (!cpus_equal(sd->span, groupmask))
4851 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 4858 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
4852 4859
4853 level++; 4860 level++;
4854 sd = sd->parent; 4861 sd = sd->parent;
4855 4862
4856 if (sd) { 4863 if (sd) {
4857 if (!cpus_subset(groupmask, sd->span)) 4864 if (!cpus_subset(groupmask, sd->span))
4858 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); 4865 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
4859 } 4866 }
4860 4867
4861 } while (sd); 4868 } while (sd);
4862 } 4869 }
4863 #else 4870 #else
4864 #define sched_domain_debug(sd, cpu) {} 4871 #define sched_domain_debug(sd, cpu) {}
4865 #endif 4872 #endif
4866 4873
4867 static int sd_degenerate(struct sched_domain *sd) 4874 static int sd_degenerate(struct sched_domain *sd)
4868 { 4875 {
4869 if (cpus_weight(sd->span) == 1) 4876 if (cpus_weight(sd->span) == 1)
4870 return 1; 4877 return 1;
4871 4878
4872 /* Following flags need at least 2 groups */ 4879 /* Following flags need at least 2 groups */
4873 if (sd->flags & (SD_LOAD_BALANCE | 4880 if (sd->flags & (SD_LOAD_BALANCE |
4874 SD_BALANCE_NEWIDLE | 4881 SD_BALANCE_NEWIDLE |
4875 SD_BALANCE_FORK | 4882 SD_BALANCE_FORK |
4876 SD_BALANCE_EXEC)) { 4883 SD_BALANCE_EXEC)) {
4877 if (sd->groups != sd->groups->next) 4884 if (sd->groups != sd->groups->next)
4878 return 0; 4885 return 0;
4879 } 4886 }
4880 4887
4881 /* Following flags don't use groups */ 4888 /* Following flags don't use groups */
4882 if (sd->flags & (SD_WAKE_IDLE | 4889 if (sd->flags & (SD_WAKE_IDLE |
4883 SD_WAKE_AFFINE | 4890 SD_WAKE_AFFINE |
4884 SD_WAKE_BALANCE)) 4891 SD_WAKE_BALANCE))
4885 return 0; 4892 return 0;
4886 4893
4887 return 1; 4894 return 1;
4888 } 4895 }
4889 4896
4890 static int sd_parent_degenerate(struct sched_domain *sd, 4897 static int sd_parent_degenerate(struct sched_domain *sd,
4891 struct sched_domain *parent) 4898 struct sched_domain *parent)
4892 { 4899 {
4893 unsigned long cflags = sd->flags, pflags = parent->flags; 4900 unsigned long cflags = sd->flags, pflags = parent->flags;
4894 4901
4895 if (sd_degenerate(parent)) 4902 if (sd_degenerate(parent))
4896 return 1; 4903 return 1;
4897 4904
4898 if (!cpus_equal(sd->span, parent->span)) 4905 if (!cpus_equal(sd->span, parent->span))
4899 return 0; 4906 return 0;
4900 4907
4901 /* Does parent contain flags not in child? */ 4908 /* Does parent contain flags not in child? */
4902 /* WAKE_BALANCE is a subset of WAKE_AFFINE */ 4909 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
4903 if (cflags & SD_WAKE_AFFINE) 4910 if (cflags & SD_WAKE_AFFINE)
4904 pflags &= ~SD_WAKE_BALANCE; 4911 pflags &= ~SD_WAKE_BALANCE;
4905 /* Flags needing groups don't count if only 1 group in parent */ 4912 /* Flags needing groups don't count if only 1 group in parent */
4906 if (parent->groups == parent->groups->next) { 4913 if (parent->groups == parent->groups->next) {
4907 pflags &= ~(SD_LOAD_BALANCE | 4914 pflags &= ~(SD_LOAD_BALANCE |
4908 SD_BALANCE_NEWIDLE | 4915 SD_BALANCE_NEWIDLE |
4909 SD_BALANCE_FORK | 4916 SD_BALANCE_FORK |
4910 SD_BALANCE_EXEC); 4917 SD_BALANCE_EXEC);
4911 } 4918 }
4912 if (~cflags & pflags) 4919 if (~cflags & pflags)
4913 return 0; 4920 return 0;
4914 4921
4915 return 1; 4922 return 1;
4916 } 4923 }
4917 4924
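The ~cflags & pflags test in sd_parent_degenerate() is non-zero exactly when the parent domain carries a flag that the child lacks. A minimal stand-alone sketch of that subset check, using invented SD_FOO/SD_BAR flag values purely for illustration:

#include <stdio.h>

#define SD_FOO  0x01
#define SD_BAR  0x02

int main(void)
{
        unsigned long cflags = SD_FOO;                  /* child's flags */
        unsigned long pflags = SD_FOO | SD_BAR;         /* parent's flags */

        /* non-zero iff the parent has a flag the child does not */
        printf("extra parent flags: %#lx\n", ~cflags & pflags);
        return 0;
}

A zero result means the parent adds nothing over the child (given equal spans), which is why sd_parent_degenerate() then returns 1.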
4918 /* 4925 /*
4919 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 4926 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
4920 * hold the hotplug lock. 4927 * hold the hotplug lock.
4921 */ 4928 */
4922 static void cpu_attach_domain(struct sched_domain *sd, int cpu) 4929 static void cpu_attach_domain(struct sched_domain *sd, int cpu)
4923 { 4930 {
4924 runqueue_t *rq = cpu_rq(cpu); 4931 runqueue_t *rq = cpu_rq(cpu);
4925 struct sched_domain *tmp; 4932 struct sched_domain *tmp;
4926 4933
4927 /* Remove the sched domains which do not contribute to scheduling. */ 4934 /* Remove the sched domains which do not contribute to scheduling. */
4928 for (tmp = sd; tmp; tmp = tmp->parent) { 4935 for (tmp = sd; tmp; tmp = tmp->parent) {
4929 struct sched_domain *parent = tmp->parent; 4936 struct sched_domain *parent = tmp->parent;
4930 if (!parent) 4937 if (!parent)
4931 break; 4938 break;
4932 if (sd_parent_degenerate(tmp, parent)) 4939 if (sd_parent_degenerate(tmp, parent))
4933 tmp->parent = parent->parent; 4940 tmp->parent = parent->parent;
4934 } 4941 }
4935 4942
4936 if (sd && sd_degenerate(sd)) 4943 if (sd && sd_degenerate(sd))
4937 sd = sd->parent; 4944 sd = sd->parent;
4938 4945
4939 sched_domain_debug(sd, cpu); 4946 sched_domain_debug(sd, cpu);
4940 4947
4941 rcu_assign_pointer(rq->sd, sd); 4948 rcu_assign_pointer(rq->sd, sd);
4942 } 4949 }
4943 4950
4944 /* cpus with isolated domains */ 4951 /* cpus with isolated domains */
4945 static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; 4952 static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
4946 4953
4947 /* Setup the mask of cpus configured for isolated domains */ 4954 /* Setup the mask of cpus configured for isolated domains */
4948 static int __init isolated_cpu_setup(char *str) 4955 static int __init isolated_cpu_setup(char *str)
4949 { 4956 {
4950 int ints[NR_CPUS], i; 4957 int ints[NR_CPUS], i;
4951 4958
4952 str = get_options(str, ARRAY_SIZE(ints), ints); 4959 str = get_options(str, ARRAY_SIZE(ints), ints);
4953 cpus_clear(cpu_isolated_map); 4960 cpus_clear(cpu_isolated_map);
4954 for (i = 1; i <= ints[0]; i++) 4961 for (i = 1; i <= ints[0]; i++)
4955 if (ints[i] < NR_CPUS) 4962 if (ints[i] < NR_CPUS)
4956 cpu_set(ints[i], cpu_isolated_map); 4963 cpu_set(ints[i], cpu_isolated_map);
4957 return 1; 4964 return 1;
4958 } 4965 }
4959 4966
4960 __setup ("isolcpus=", isolated_cpu_setup); 4967 __setup ("isolcpus=", isolated_cpu_setup);
4961 4968
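Usage note: booting with, say, isolcpus=2,3 puts CPUs 2 and 3 into cpu_isolated_map; arch_init_sched_domains() further down then drops them from the default map via cpus_andnot(), so they are left out of the domain setup built by build_sched_domains().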
4962 /* 4969 /*
4963 * init_sched_build_groups takes an array of groups, the cpumask we wish 4970 * init_sched_build_groups takes an array of groups, the cpumask we wish
4964 * to span, and a pointer to a function which identifies what group a CPU 4971 * to span, and a pointer to a function which identifies what group a CPU
4965 * belongs to. The return value of group_fn must be a valid index into the 4972 * belongs to. The return value of group_fn must be a valid index into the
4966 * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we 4973 * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we
4967 * keep track of groups covered with a cpumask_t). 4974 * keep track of groups covered with a cpumask_t).
4968 * 4975 *
4969 * init_sched_build_groups will build a circular linked list of the groups 4976 * init_sched_build_groups will build a circular linked list of the groups
4970 * covered by the given span, and will set each group's ->cpumask correctly, 4977 * covered by the given span, and will set each group's ->cpumask correctly,
4971 * and ->cpu_power to 0. 4978 * and ->cpu_power to 0.
4972 */ 4979 */
4973 static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, 4980 static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
4974 int (*group_fn)(int cpu)) 4981 int (*group_fn)(int cpu))
4975 { 4982 {
4976 struct sched_group *first = NULL, *last = NULL; 4983 struct sched_group *first = NULL, *last = NULL;
4977 cpumask_t covered = CPU_MASK_NONE; 4984 cpumask_t covered = CPU_MASK_NONE;
4978 int i; 4985 int i;
4979 4986
4980 for_each_cpu_mask(i, span) { 4987 for_each_cpu_mask(i, span) {
4981 int group = group_fn(i); 4988 int group = group_fn(i);
4982 struct sched_group *sg = &groups[group]; 4989 struct sched_group *sg = &groups[group];
4983 int j; 4990 int j;
4984 4991
4985 if (cpu_isset(i, covered)) 4992 if (cpu_isset(i, covered))
4986 continue; 4993 continue;
4987 4994
4988 sg->cpumask = CPU_MASK_NONE; 4995 sg->cpumask = CPU_MASK_NONE;
4989 sg->cpu_power = 0; 4996 sg->cpu_power = 0;
4990 4997
4991 for_each_cpu_mask(j, span) { 4998 for_each_cpu_mask(j, span) {
4992 if (group_fn(j) != group) 4999 if (group_fn(j) != group)
4993 continue; 5000 continue;
4994 5001
4995 cpu_set(j, covered); 5002 cpu_set(j, covered);
4996 cpu_set(j, sg->cpumask); 5003 cpu_set(j, sg->cpumask);
4997 } 5004 }
4998 if (!first) 5005 if (!first)
4999 first = sg; 5006 first = sg;
5000 if (last) 5007 if (last)
5001 last->next = sg; 5008 last->next = sg;
5002 last = sg; 5009 last = sg;
5003 } 5010 }
5004 last->next = first; 5011 last->next = first;
5005 } 5012 }
5006 5013
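init_sched_build_groups() leaves the grouping policy entirely to group_fn; the only contract, per the comment above, is that the returned index is a valid groups[] slot in the range 0..NR_CPUS-1. The real callers below use cpu_to_cpu_group(), cpu_to_phys_group() and cpu_to_allnodes_group(); a purely hypothetical mapping that packs consecutive CPUs pairwise would look like this:

/* hypothetical: two consecutive CPUs per group */
static int cpu_to_pair_group(int cpu)
{
        return cpu / 2;         /* always >= 0 and < NR_CPUS */
}

With that mapping, init_sched_build_groups(groups, span, &cpu_to_pair_group) would link groups {0,1}, {2,3}, ... into the circular list and clear each group's cpu_power.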
5007 #define SD_NODES_PER_DOMAIN 16 5014 #define SD_NODES_PER_DOMAIN 16
5008 5015
5009 /* 5016 /*
5010 * Self-tuning task migration cost measurement between source and target CPUs. 5017 * Self-tuning task migration cost measurement between source and target CPUs.
5011 * 5018 *
5012 * This is done by measuring the cost of manipulating buffers of varying 5019 * This is done by measuring the cost of manipulating buffers of varying
5013 * sizes. For a given buffer-size here are the steps that are taken: 5020 * sizes. For a given buffer-size here are the steps that are taken:
5014 * 5021 *
5015 * 1) the source CPU reads+dirties a shared buffer 5022 * 1) the source CPU reads+dirties a shared buffer
5016 * 2) the target CPU reads+dirties the same shared buffer 5023 * 2) the target CPU reads+dirties the same shared buffer
5017 * 5024 *
5018 * We measure how long they take, in the following 4 scenarios: 5025 * We measure how long they take, in the following 4 scenarios:
5019 * 5026 *
5020 * - source: CPU1, target: CPU2 | cost1 5027 * - source: CPU1, target: CPU2 | cost1
5021 * - source: CPU2, target: CPU1 | cost2 5028 * - source: CPU2, target: CPU1 | cost2
5022 * - source: CPU1, target: CPU1 | cost3 5029 * - source: CPU1, target: CPU1 | cost3
5023 * - source: CPU2, target: CPU2 | cost4 5030 * - source: CPU2, target: CPU2 | cost4
5024 * 5031 *
5025 * We then calculate the cost1+cost2-cost3-cost4 difference - this is 5032 * We then calculate the cost1+cost2-cost3-cost4 difference - this is
5026 * the cost of migration. 5033 * the cost of migration.
5027 * 5034 *
5028 * We then start off from a small buffer-size and iterate up to larger 5035 * We then start off from a small buffer-size and iterate up to larger
5029 * buffer sizes, in 5% steps - measuring each buffer-size separately, and 5036 * buffer sizes, in 5% steps - measuring each buffer-size separately, and
5030 * doing a maximum search for the cost. (The maximum cost for a migration 5037 * doing a maximum search for the cost. (The maximum cost for a migration
5031 * normally occurs when the working set size is around the effective cache 5038 * normally occurs when the working set size is around the effective cache
5032 * size.) 5039 * size.)
5033 */ 5040 */
5034 #define SEARCH_SCOPE 2 5041 #define SEARCH_SCOPE 2
5035 #define MIN_CACHE_SIZE (64*1024U) 5042 #define MIN_CACHE_SIZE (64*1024U)
5036 #define DEFAULT_CACHE_SIZE (5*1024*1024U) 5043 #define DEFAULT_CACHE_SIZE (5*1024*1024U)
5037 #define ITERATIONS 1 5044 #define ITERATIONS 1
5038 #define SIZE_THRESH 130 5045 #define SIZE_THRESH 130
5039 #define COST_THRESH 130 5046 #define COST_THRESH 130
5040 5047
5041 /* 5048 /*
5042 * The migration cost is a function of 'domain distance'. Domain 5049 * The migration cost is a function of 'domain distance'. Domain
5043 * distance is the number of steps a CPU has to iterate down its 5050 * distance is the number of steps a CPU has to iterate down its
5044 * domain tree to share a domain with the other CPU. The farther 5051 * domain tree to share a domain with the other CPU. The farther
5045 * two CPUs are from each other, the larger the distance gets. 5052 * two CPUs are from each other, the larger the distance gets.
5046 * 5053 *
5047 * Note that we use the distance only to cache measurement results, 5054 * Note that we use the distance only to cache measurement results,
5048 * the distance value is not used numerically otherwise. When two 5055 * the distance value is not used numerically otherwise. When two
5049 * CPUs have the same distance it is assumed that the migration 5056 * CPUs have the same distance it is assumed that the migration
5050 * cost is the same. (this is a simplification but quite practical) 5057 * cost is the same. (this is a simplification but quite practical)
5051 */ 5058 */
5052 #define MAX_DOMAIN_DISTANCE 32 5059 #define MAX_DOMAIN_DISTANCE 32
5053 5060
5054 static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] = 5061 static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
5055 { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = 5062 { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] =
5056 /* 5063 /*
5057 * Architectures may override the migration cost and thus avoid 5064 * Architectures may override the migration cost and thus avoid
5058 * boot-time calibration. Unit is nanoseconds. Mostly useful for 5065 * boot-time calibration. Unit is nanoseconds. Mostly useful for
5059 * virtualized hardware: 5066 * virtualized hardware:
5060 */ 5067 */
5061 #ifdef CONFIG_DEFAULT_MIGRATION_COST 5068 #ifdef CONFIG_DEFAULT_MIGRATION_COST
5062 CONFIG_DEFAULT_MIGRATION_COST 5069 CONFIG_DEFAULT_MIGRATION_COST
5063 #else 5070 #else
5064 -1LL 5071 -1LL
5065 #endif 5072 #endif
5066 }; 5073 };
5067 5074
5068 /* 5075 /*
5069 * Allow override of migration cost - in units of microseconds. 5076 * Allow override of migration cost - in units of microseconds.
5070 * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost 5077 * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
5071 * of 1 msec, level-2 cost of 2 msecs and level-3 cost of 3 msecs: 5078 * of 1 msec, level-2 cost of 2 msecs and level-3 cost of 3 msecs:
5072 */ 5079 */
5073 static int __init migration_cost_setup(char *str) 5080 static int __init migration_cost_setup(char *str)
5074 { 5081 {
5075 int ints[MAX_DOMAIN_DISTANCE+1], i; 5082 int ints[MAX_DOMAIN_DISTANCE+1], i;
5076 5083
5077 str = get_options(str, ARRAY_SIZE(ints), ints); 5084 str = get_options(str, ARRAY_SIZE(ints), ints);
5078 5085
5079 printk("#ints: %d\n", ints[0]); 5086 printk("#ints: %d\n", ints[0]);
5080 for (i = 1; i <= ints[0]; i++) { 5087 for (i = 1; i <= ints[0]; i++) {
5081 migration_cost[i-1] = (unsigned long long)ints[i]*1000; 5088 migration_cost[i-1] = (unsigned long long)ints[i]*1000;
5082 printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]); 5089 printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
5083 } 5090 }
5084 return 1; 5091 return 1;
5085 } 5092 }
5086 5093
5087 __setup ("migration_cost=", migration_cost_setup); 5094 __setup ("migration_cost=", migration_cost_setup);
5088 5095
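Both isolated_cpu_setup() and migration_cost_setup() rely on the get_options() convention that ints[0] holds the number of values parsed and the values themselves start at ints[1], which is why their loops run from 1 to ints[0] inclusive. A small stand-alone illustration of that indexing (the array literal stands in for what parsing "1000,2000,3000" is expected to produce):

#include <stdio.h>

int main(void)
{
        int ints[] = { 3, 1000, 2000, 3000 };   /* count, then the values */
        int i;

        for (i = 1; i <= ints[0]; i++)
                printf("migration_cost[%d] = %d usecs\n", i - 1, ints[i]);
        return 0;
}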
5089 /* 5096 /*
5090 * Global multiplier (divisor) for migration-cutoff values, 5097 * Global multiplier (divisor) for migration-cutoff values,
5091 * in percent. E.g. use a value of 150 to get 1.5 times 5098 * in percent. E.g. use a value of 150 to get 1.5 times
5092 * longer cache-hot cutoff times. 5099 * longer cache-hot cutoff times.
5093 * 5100 *
5094 * (We scale it from 100 to 128 to make long long handling easier.) 5101 * (We scale it from 100 to 128 to make long long handling easier.)
5095 */ 5102 */
5096 5103
5097 #define MIGRATION_FACTOR_SCALE 128 5104 #define MIGRATION_FACTOR_SCALE 128
5098 5105
5099 static unsigned int migration_factor = MIGRATION_FACTOR_SCALE; 5106 static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
5100 5107
5101 static int __init setup_migration_factor(char *str) 5108 static int __init setup_migration_factor(char *str)
5102 { 5109 {
5103 get_option(&str, &migration_factor); 5110 get_option(&str, &migration_factor);
5104 migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100; 5111 migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
5105 return 1; 5112 return 1;
5106 } 5113 }
5107 5114
5108 __setup("migration_factor=", setup_migration_factor); 5115 __setup("migration_factor=", setup_migration_factor);
5109 5116
5110 /* 5117 /*
5111 * Estimated distance of two CPUs, measured via the number of domains 5118 * Estimated distance of two CPUs, measured via the number of domains
5112 * we have to pass for the two CPUs to be in the same span: 5119 * we have to pass for the two CPUs to be in the same span:
5113 */ 5120 */
5114 static unsigned long domain_distance(int cpu1, int cpu2) 5121 static unsigned long domain_distance(int cpu1, int cpu2)
5115 { 5122 {
5116 unsigned long distance = 0; 5123 unsigned long distance = 0;
5117 struct sched_domain *sd; 5124 struct sched_domain *sd;
5118 5125
5119 for_each_domain(cpu1, sd) { 5126 for_each_domain(cpu1, sd) {
5120 WARN_ON(!cpu_isset(cpu1, sd->span)); 5127 WARN_ON(!cpu_isset(cpu1, sd->span));
5121 if (cpu_isset(cpu2, sd->span)) 5128 if (cpu_isset(cpu2, sd->span))
5122 return distance; 5129 return distance;
5123 distance++; 5130 distance++;
5124 } 5131 }
5125 if (distance >= MAX_DOMAIN_DISTANCE) { 5132 if (distance >= MAX_DOMAIN_DISTANCE) {
5126 WARN_ON(1); 5133 WARN_ON(1);
5127 distance = MAX_DOMAIN_DISTANCE-1; 5134 distance = MAX_DOMAIN_DISTANCE-1;
5128 } 5135 }
5129 5136
5130 return distance; 5137 return distance;
5131 } 5138 }
5132 5139
5133 static unsigned int migration_debug; 5140 static unsigned int migration_debug;
5134 5141
5135 static int __init setup_migration_debug(char *str) 5142 static int __init setup_migration_debug(char *str)
5136 { 5143 {
5137 get_option(&str, &migration_debug); 5144 get_option(&str, &migration_debug);
5138 return 1; 5145 return 1;
5139 } 5146 }
5140 5147
5141 __setup("migration_debug=", setup_migration_debug); 5148 __setup("migration_debug=", setup_migration_debug);
5142 5149
5143 /* 5150 /*
5144 * Maximum cache-size that the scheduler should try to measure. 5151 * Maximum cache-size that the scheduler should try to measure.
5145 * Architectures with larger caches should tune this up during 5152 * Architectures with larger caches should tune this up during
5146 * bootup. Gets used in the domain-setup code (i.e. during SMP 5153 * bootup. Gets used in the domain-setup code (i.e. during SMP
5147 * bootup). 5154 * bootup).
5148 */ 5155 */
5149 unsigned int max_cache_size; 5156 unsigned int max_cache_size;
5150 5157
5151 static int __init setup_max_cache_size(char *str) 5158 static int __init setup_max_cache_size(char *str)
5152 { 5159 {
5153 get_option(&str, &max_cache_size); 5160 get_option(&str, &max_cache_size);
5154 return 1; 5161 return 1;
5155 } 5162 }
5156 5163
5157 __setup("max_cache_size=", setup_max_cache_size); 5164 __setup("max_cache_size=", setup_max_cache_size);
5158 5165
5159 /* 5166 /*
5160 * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This 5167 * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
5161 * is the operation that is timed, so we try to generate unpredictable 5168 * is the operation that is timed, so we try to generate unpredictable
5162 * cachemisses that still end up filling the L2 cache: 5169 * cachemisses that still end up filling the L2 cache:
5163 */ 5170 */
5164 static void touch_cache(void *__cache, unsigned long __size) 5171 static void touch_cache(void *__cache, unsigned long __size)
5165 { 5172 {
5166 unsigned long size = __size/sizeof(long), chunk1 = size/3, 5173 unsigned long size = __size/sizeof(long), chunk1 = size/3,
5167 chunk2 = 2*size/3; 5174 chunk2 = 2*size/3;
5168 unsigned long *cache = __cache; 5175 unsigned long *cache = __cache;
5169 int i; 5176 int i;
5170 5177
5171 for (i = 0; i < size/6; i += 8) { 5178 for (i = 0; i < size/6; i += 8) {
5172 switch (i % 6) { 5179 switch (i % 6) {
5173 case 0: cache[i]++; 5180 case 0: cache[i]++;
5174 case 1: cache[size-1-i]++; 5181 case 1: cache[size-1-i]++;
5175 case 2: cache[chunk1-i]++; 5182 case 2: cache[chunk1-i]++;
5176 case 3: cache[chunk1+i]++; 5183 case 3: cache[chunk1+i]++;
5177 case 4: cache[chunk2-i]++; 5184 case 4: cache[chunk2-i]++;
5178 case 5: cache[chunk2+i]++; 5185 case 5: cache[chunk2+i]++;
5179 } 5186 }
5180 } 5187 }
5181 } 5188 }
5182 5189
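Note that the switch in touch_cache() has no break statements: for a given i, every case from (i % 6) onward executes, so each iteration increments several widely separated words. The fall-through appears deliberate, matching the comment's goal of generating unpredictable accesses that still fill the L2 cache.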
5183 /* 5190 /*
5184 * Measure the cache-cost of one task migration. Returns in units of nsec. 5191 * Measure the cache-cost of one task migration. Returns in units of nsec.
5185 */ 5192 */
5186 static unsigned long long measure_one(void *cache, unsigned long size, 5193 static unsigned long long measure_one(void *cache, unsigned long size,
5187 int source, int target) 5194 int source, int target)
5188 { 5195 {
5189 cpumask_t mask, saved_mask; 5196 cpumask_t mask, saved_mask;
5190 unsigned long long t0, t1, t2, t3, cost; 5197 unsigned long long t0, t1, t2, t3, cost;
5191 5198
5192 saved_mask = current->cpus_allowed; 5199 saved_mask = current->cpus_allowed;
5193 5200
5194 /* 5201 /*
5195 * Flush source caches to RAM and invalidate them: 5202 * Flush source caches to RAM and invalidate them:
5196 */ 5203 */
5197 sched_cacheflush(); 5204 sched_cacheflush();
5198 5205
5199 /* 5206 /*
5200 * Migrate to the source CPU: 5207 * Migrate to the source CPU:
5201 */ 5208 */
5202 mask = cpumask_of_cpu(source); 5209 mask = cpumask_of_cpu(source);
5203 set_cpus_allowed(current, mask); 5210 set_cpus_allowed(current, mask);
5204 WARN_ON(smp_processor_id() != source); 5211 WARN_ON(smp_processor_id() != source);
5205 5212
5206 /* 5213 /*
5207 * Dirty the working set: 5214 * Dirty the working set:
5208 */ 5215 */
5209 t0 = sched_clock(); 5216 t0 = sched_clock();
5210 touch_cache(cache, size); 5217 touch_cache(cache, size);
5211 t1 = sched_clock(); 5218 t1 = sched_clock();
5212 5219
5213 /* 5220 /*
5214 * Migrate to the target CPU, dirty the L2 cache and access 5221 * Migrate to the target CPU, dirty the L2 cache and access
5215 * the shared buffer. (which represents the working set 5222 * the shared buffer. (which represents the working set
5216 * of a migrated task.) 5223 * of a migrated task.)
5217 */ 5224 */
5218 mask = cpumask_of_cpu(target); 5225 mask = cpumask_of_cpu(target);
5219 set_cpus_allowed(current, mask); 5226 set_cpus_allowed(current, mask);
5220 WARN_ON(smp_processor_id() != target); 5227 WARN_ON(smp_processor_id() != target);
5221 5228
5222 t2 = sched_clock(); 5229 t2 = sched_clock();
5223 touch_cache(cache, size); 5230 touch_cache(cache, size);
5224 t3 = sched_clock(); 5231 t3 = sched_clock();
5225 5232
5226 cost = t1-t0 + t3-t2; 5233 cost = t1-t0 + t3-t2;
5227 5234
5228 if (migration_debug >= 2) 5235 if (migration_debug >= 2)
5229 printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n", 5236 printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
5230 source, target, t1-t0, t1-t0, t3-t2, cost); 5237 source, target, t1-t0, t1-t0, t3-t2, cost);
5231 /* 5238 /*
5232 * Flush target caches to RAM and invalidate them: 5239 * Flush target caches to RAM and invalidate them:
5233 */ 5240 */
5234 sched_cacheflush(); 5241 sched_cacheflush();
5235 5242
5236 set_cpus_allowed(current, saved_mask); 5243 set_cpus_allowed(current, saved_mask);
5237 5244
5238 return cost; 5245 return cost;
5239 } 5246 }
5240 5247
5241 /* 5248 /*
5242 * Measure a series of task migrations and return the average 5249 * Measure a series of task migrations and return the average
5243 * result. Since this code runs early during bootup the system 5250 * result. Since this code runs early during bootup the system
5244 * is 'undisturbed' and the average latency makes sense. 5251 * is 'undisturbed' and the average latency makes sense.
5245 * 5252 *
5246 * The algorithm in essence auto-detects the relevant cache-size, 5253 * The algorithm in essence auto-detects the relevant cache-size,
5247 * so it will properly detect different cachesizes for different 5254 * so it will properly detect different cachesizes for different
5248 * cache-hierarchies, depending on how the CPUs are connected. 5255 * cache-hierarchies, depending on how the CPUs are connected.
5249 * 5256 *
5250 * Architectures can prime the upper limit of the search range via 5257 * Architectures can prime the upper limit of the search range via
5251 * max_cache_size, otherwise the search range defaults to 20MB...64K. 5258 * max_cache_size, otherwise the search range defaults to 20MB...64K.
5252 */ 5259 */
5253 static unsigned long long 5260 static unsigned long long
5254 measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) 5261 measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
5255 { 5262 {
5256 unsigned long long cost1, cost2; 5263 unsigned long long cost1, cost2;
5257 int i; 5264 int i;
5258 5265
5259 /* 5266 /*
5260 * Measure the migration cost of 'size' bytes, over an 5267 * Measure the migration cost of 'size' bytes, over an
5261 * average of 10 runs: 5268 * average of 10 runs:
5262 * 5269 *
5263 * (We perturb the cache size by a small (0..4k) 5270 * (We perturb the cache size by a small (0..4k)
5264 * value to compensate for size/alignment-related artifacts. 5271 * value to compensate for size/alignment-related artifacts.
5265 * We also subtract the cost of the operation done on 5272 * We also subtract the cost of the operation done on
5266 * the same CPU.) 5273 * the same CPU.)
5267 */ 5274 */
5268 cost1 = 0; 5275 cost1 = 0;
5269 5276
5270 /* 5277 /*
5271 * dry run, to make sure we start off cache-cold on cpu1, 5278 * dry run, to make sure we start off cache-cold on cpu1,
5272 * and to get any vmalloc pagefaults in advance: 5279 * and to get any vmalloc pagefaults in advance:
5273 */ 5280 */
5274 measure_one(cache, size, cpu1, cpu2); 5281 measure_one(cache, size, cpu1, cpu2);
5275 for (i = 0; i < ITERATIONS; i++) 5282 for (i = 0; i < ITERATIONS; i++)
5276 cost1 += measure_one(cache, size - i*1024, cpu1, cpu2); 5283 cost1 += measure_one(cache, size - i*1024, cpu1, cpu2);
5277 5284
5278 measure_one(cache, size, cpu2, cpu1); 5285 measure_one(cache, size, cpu2, cpu1);
5279 for (i = 0; i < ITERATIONS; i++) 5286 for (i = 0; i < ITERATIONS; i++)
5280 cost1 += measure_one(cache, size - i*1024, cpu2, cpu1); 5287 cost1 += measure_one(cache, size - i*1024, cpu2, cpu1);
5281 5288
5282 /* 5289 /*
5283 * (We measure the non-migrating [cached] cost on both 5290 * (We measure the non-migrating [cached] cost on both
5284 * cpu1 and cpu2, to handle CPUs with different speeds) 5291 * cpu1 and cpu2, to handle CPUs with different speeds)
5285 */ 5292 */
5286 cost2 = 0; 5293 cost2 = 0;
5287 5294
5288 measure_one(cache, size, cpu1, cpu1); 5295 measure_one(cache, size, cpu1, cpu1);
5289 for (i = 0; i < ITERATIONS; i++) 5296 for (i = 0; i < ITERATIONS; i++)
5290 cost2 += measure_one(cache, size - i*1024, cpu1, cpu1); 5297 cost2 += measure_one(cache, size - i*1024, cpu1, cpu1);
5291 5298
5292 measure_one(cache, size, cpu2, cpu2); 5299 measure_one(cache, size, cpu2, cpu2);
5293 for (i = 0; i < ITERATIONS; i++) 5300 for (i = 0; i < ITERATIONS; i++)
5294 cost2 += measure_one(cache, size - i*1024, cpu2, cpu2); 5301 cost2 += measure_one(cache, size - i*1024, cpu2, cpu2);
5295 5302
5296 /* 5303 /*
5297 * Get the per-iteration migration cost: 5304 * Get the per-iteration migration cost:
5298 */ 5305 */
5299 do_div(cost1, 2*ITERATIONS); 5306 do_div(cost1, 2*ITERATIONS);
5300 do_div(cost2, 2*ITERATIONS); 5307 do_div(cost2, 2*ITERATIONS);
5301 5308
5302 return cost1 - cost2; 5309 return cost1 - cost2;
5303 } 5310 }
5304 5311
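To make the averaging concrete, a worked example with invented timings (ITERATIONS is 1, as defined above), showing the value measure_cost() would return:

#include <stdio.h>

int main(void)
{
        unsigned long long cost1 = 400 + 420;   /* the two cross-CPU runs, usecs */
        unsigned long long cost2 = 150 + 160;   /* the two same-CPU runs, usecs */

        cost1 /= 2;     /* stands in for do_div(cost1, 2*ITERATIONS) */
        cost2 /= 2;

        printf("per-iteration migration cost: %llu usecs\n", cost1 - cost2);
        return 0;
}

This prints 255 usecs: the cross-CPU average of 410 minus the same-CPU average of 155.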
5305 static unsigned long long measure_migration_cost(int cpu1, int cpu2) 5312 static unsigned long long measure_migration_cost(int cpu1, int cpu2)
5306 { 5313 {
5307 unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0; 5314 unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
5308 unsigned int max_size, size, size_found = 0; 5315 unsigned int max_size, size, size_found = 0;
5309 long long cost = 0, prev_cost; 5316 long long cost = 0, prev_cost;
5310 void *cache; 5317 void *cache;
5311 5318
5312 /* 5319 /*
5313 * Search from max_cache_size*5 down to 64K - the real relevant 5320 * Search from max_cache_size*5 down to 64K - the real relevant
5314 * cachesize has to lie somewhere in between. 5321 * cachesize has to lie somewhere in between.
5315 */ 5322 */
5316 if (max_cache_size) { 5323 if (max_cache_size) {
5317 max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE); 5324 max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
5318 size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE); 5325 size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
5319 } else { 5326 } else {
5320 /* 5327 /*
5321 * Since we have no estimation about the relevant 5328 * Since we have no estimation about the relevant
5322 * search range 5329 * search range
5323 */ 5330 */
5324 max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE; 5331 max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
5325 size = MIN_CACHE_SIZE; 5332 size = MIN_CACHE_SIZE;
5326 } 5333 }
5327 5334
5328 if (!cpu_online(cpu1) || !cpu_online(cpu2)) { 5335 if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
5329 printk("cpu %d and %d not both online!\n", cpu1, cpu2); 5336 printk("cpu %d and %d not both online!\n", cpu1, cpu2);
5330 return 0; 5337 return 0;
5331 } 5338 }
5332 5339
5333 /* 5340 /*
5334 * Allocate the working set: 5341 * Allocate the working set:
5335 */ 5342 */
5336 cache = vmalloc(max_size); 5343 cache = vmalloc(max_size);
5337 if (!cache) { 5344 if (!cache) {
5338 printk("could not vmalloc %d bytes for cache!\n", 2*max_size); 5345 printk("could not vmalloc %d bytes for cache!\n", 2*max_size);
5339 return 1000000; // return 1 msec on very small boxen 5346 return 1000000; // return 1 msec on very small boxen
5340 } 5347 }
5341 5348
5342 while (size <= max_size) { 5349 while (size <= max_size) {
5343 prev_cost = cost; 5350 prev_cost = cost;
5344 cost = measure_cost(cpu1, cpu2, cache, size); 5351 cost = measure_cost(cpu1, cpu2, cache, size);
5345 5352
5346 /* 5353 /*
5347 * Update the max: 5354 * Update the max:
5348 */ 5355 */
5349 if (cost > 0) { 5356 if (cost > 0) {
5350 if (max_cost < cost) { 5357 if (max_cost < cost) {
5351 max_cost = cost; 5358 max_cost = cost;
5352 size_found = size; 5359 size_found = size;
5353 } 5360 }
5354 } 5361 }
5355 /* 5362 /*
5356 * Calculate average fluctuation, we use this to prevent 5363 * Calculate average fluctuation, we use this to prevent
5357 * noise from triggering an early break out of the loop: 5364 * noise from triggering an early break out of the loop:
5358 */ 5365 */
5359 fluct = abs(cost - prev_cost); 5366 fluct = abs(cost - prev_cost);
5360 avg_fluct = (avg_fluct + fluct)/2; 5367 avg_fluct = (avg_fluct + fluct)/2;
5361 5368
5362 if (migration_debug) 5369 if (migration_debug)
5363 printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n", 5370 printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n",
5364 cpu1, cpu2, size, 5371 cpu1, cpu2, size,
5365 (long)cost / 1000000, 5372 (long)cost / 1000000,
5366 ((long)cost / 100000) % 10, 5373 ((long)cost / 100000) % 10,
5367 (long)max_cost / 1000000, 5374 (long)max_cost / 1000000,
5368 ((long)max_cost / 100000) % 10, 5375 ((long)max_cost / 100000) % 10,
5369 domain_distance(cpu1, cpu2), 5376 domain_distance(cpu1, cpu2),
5370 cost, avg_fluct); 5377 cost, avg_fluct);
5371 5378
5372 /* 5379 /*
5373 * If we iterated at least 20% past the previous maximum, 5380 * If we iterated at least 20% past the previous maximum,
5374 * and the cost has dropped by more than 20% already, 5381 * and the cost has dropped by more than 20% already,
5375 * (taking fluctuations into account) then we assume to 5382 * (taking fluctuations into account) then we assume to
5376 * have found the maximum and break out of the loop early: 5383 * have found the maximum and break out of the loop early:
5377 */ 5384 */
5378 if (size_found && (size*100 > size_found*SIZE_THRESH)) 5385 if (size_found && (size*100 > size_found*SIZE_THRESH))
5379 if (cost+avg_fluct <= 0 || 5386 if (cost+avg_fluct <= 0 ||
5380 max_cost*100 > (cost+avg_fluct)*COST_THRESH) { 5387 max_cost*100 > (cost+avg_fluct)*COST_THRESH) {
5381 5388
5382 if (migration_debug) 5389 if (migration_debug)
5383 printk("-> found max.\n"); 5390 printk("-> found max.\n");
5384 break; 5391 break;
5385 } 5392 }
5386 /* 5393 /*
5387 * Increase the cachesize in 10% steps: 5394 * Increase the cachesize in 10% steps:
5388 */ 5395 */
5389 size = size * 10 / 9; 5396 size = size * 10 / 9;
5390 } 5397 }
5391 5398
5392 if (migration_debug) 5399 if (migration_debug)
5393 printk("[%d][%d] working set size found: %d, cost: %Ld\n", 5400 printk("[%d][%d] working set size found: %d, cost: %Ld\n",
5394 cpu1, cpu2, size_found, max_cost); 5401 cpu1, cpu2, size_found, max_cost);
5395 5402
5396 vfree(cache); 5403 vfree(cache);
5397 5404
5398 /* 5405 /*
5399 * A task is considered 'cache cold' if at least 2 times 5406 * A task is considered 'cache cold' if at least 2 times
5400 * the worst-case cost of migration has passed. 5407 * the worst-case cost of migration has passed.
5401 * 5408 *
5402 * (this limit is only listened to if the load-balancing 5409 * (this limit is only listened to if the load-balancing
5403 * situation is 'nice' - if there is a large imbalance we 5410 * situation is 'nice' - if there is a large imbalance we
5404 * ignore it for the sake of CPU utilization and 5411 * ignore it for the sake of CPU utilization and
5405 * processing fairness.) 5412 * processing fairness.)
5406 */ 5413 */
5407 return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE; 5414 return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
5408 } 5415 }
5409 5416
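Tying this back to setup_migration_factor() above: booting with migration_factor=150 stores 150 * 128 / 100 = 192, so the value returned here becomes 2 * max_cost * 192 / 128, i.e. a cache-hot cutoff of three times the measured worst-case cost instead of two.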
5410 static void calibrate_migration_costs(const cpumask_t *cpu_map) 5417 static void calibrate_migration_costs(const cpumask_t *cpu_map)
5411 { 5418 {
5412 int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id(); 5419 int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
5413 unsigned long j0, j1, distance, max_distance = 0; 5420 unsigned long j0, j1, distance, max_distance = 0;
5414 struct sched_domain *sd; 5421 struct sched_domain *sd;
5415 5422
5416 j0 = jiffies; 5423 j0 = jiffies;
5417 5424
5418 /* 5425 /*
5419 * First pass - calculate the cacheflush times: 5426 * First pass - calculate the cacheflush times:
5420 */ 5427 */
5421 for_each_cpu_mask(cpu1, *cpu_map) { 5428 for_each_cpu_mask(cpu1, *cpu_map) {
5422 for_each_cpu_mask(cpu2, *cpu_map) { 5429 for_each_cpu_mask(cpu2, *cpu_map) {
5423 if (cpu1 == cpu2) 5430 if (cpu1 == cpu2)
5424 continue; 5431 continue;
5425 distance = domain_distance(cpu1, cpu2); 5432 distance = domain_distance(cpu1, cpu2);
5426 max_distance = max(max_distance, distance); 5433 max_distance = max(max_distance, distance);
5427 /* 5434 /*
5428 * No result cached yet? 5435 * No result cached yet?
5429 */ 5436 */
5430 if (migration_cost[distance] == -1LL) 5437 if (migration_cost[distance] == -1LL)
5431 migration_cost[distance] = 5438 migration_cost[distance] =
5432 measure_migration_cost(cpu1, cpu2); 5439 measure_migration_cost(cpu1, cpu2);
5433 } 5440 }
5434 } 5441 }
5435 /* 5442 /*
5436 * Second pass - update the sched domain hierarchy with 5443 * Second pass - update the sched domain hierarchy with
5437 * the new cache-hot-time estimations: 5444 * the new cache-hot-time estimations:
5438 */ 5445 */
5439 for_each_cpu_mask(cpu, *cpu_map) { 5446 for_each_cpu_mask(cpu, *cpu_map) {
5440 distance = 0; 5447 distance = 0;
5441 for_each_domain(cpu, sd) { 5448 for_each_domain(cpu, sd) {
5442 sd->cache_hot_time = migration_cost[distance]; 5449 sd->cache_hot_time = migration_cost[distance];
5443 distance++; 5450 distance++;
5444 } 5451 }
5445 } 5452 }
5446 /* 5453 /*
5447 * Print the matrix: 5454 * Print the matrix:
5448 */ 5455 */
5449 if (migration_debug) 5456 if (migration_debug)
5450 printk("migration: max_cache_size: %d, cpu: %d MHz:\n", 5457 printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
5451 max_cache_size, 5458 max_cache_size,
5452 #ifdef CONFIG_X86 5459 #ifdef CONFIG_X86
5453 cpu_khz/1000 5460 cpu_khz/1000
5454 #else 5461 #else
5455 -1 5462 -1
5456 #endif 5463 #endif
5457 ); 5464 );
5458 if (system_state == SYSTEM_BOOTING) { 5465 if (system_state == SYSTEM_BOOTING) {
5459 printk("migration_cost="); 5466 printk("migration_cost=");
5460 for (distance = 0; distance <= max_distance; distance++) { 5467 for (distance = 0; distance <= max_distance; distance++) {
5461 if (distance) 5468 if (distance)
5462 printk(","); 5469 printk(",");
5463 printk("%ld", (long)migration_cost[distance] / 1000); 5470 printk("%ld", (long)migration_cost[distance] / 1000);
5464 } 5471 }
5465 printk("\n"); 5472 printk("\n");
5466 } 5473 }
5467 j1 = jiffies; 5474 j1 = jiffies;
5468 if (migration_debug) 5475 if (migration_debug)
5469 printk("migration: %ld seconds\n", (j1-j0)/HZ); 5476 printk("migration: %ld seconds\n", (j1-j0)/HZ);
5470 5477
5471 /* 5478 /*
5472 * Move back to the original CPU. NUMA-Q gets confused 5479 * Move back to the original CPU. NUMA-Q gets confused
5473 * if we migrate to another quad during bootup. 5480 * if we migrate to another quad during bootup.
5474 */ 5481 */
5475 if (raw_smp_processor_id() != orig_cpu) { 5482 if (raw_smp_processor_id() != orig_cpu) {
5476 cpumask_t mask = cpumask_of_cpu(orig_cpu), 5483 cpumask_t mask = cpumask_of_cpu(orig_cpu),
5477 saved_mask = current->cpus_allowed; 5484 saved_mask = current->cpus_allowed;
5478 5485
5479 set_cpus_allowed(current, mask); 5486 set_cpus_allowed(current, mask);
5480 set_cpus_allowed(current, saved_mask); 5487 set_cpus_allowed(current, saved_mask);
5481 } 5488 }
5482 } 5489 }
5483 5490
5484 #ifdef CONFIG_NUMA 5491 #ifdef CONFIG_NUMA
5485 5492
5486 /** 5493 /**
5487 * find_next_best_node - find the next node to include in a sched_domain 5494 * find_next_best_node - find the next node to include in a sched_domain
5488 * @node: node whose sched_domain we're building 5495 * @node: node whose sched_domain we're building
5489 * @used_nodes: nodes already in the sched_domain 5496 * @used_nodes: nodes already in the sched_domain
5490 * 5497 *
5491 * Find the next node to include in a given scheduling domain. Simply 5498 * Find the next node to include in a given scheduling domain. Simply
5492 * finds the closest node not already in the @used_nodes map. 5499 * finds the closest node not already in the @used_nodes map.
5493 * 5500 *
5494 * Should use nodemask_t. 5501 * Should use nodemask_t.
5495 */ 5502 */
5496 static int find_next_best_node(int node, unsigned long *used_nodes) 5503 static int find_next_best_node(int node, unsigned long *used_nodes)
5497 { 5504 {
5498 int i, n, val, min_val, best_node = 0; 5505 int i, n, val, min_val, best_node = 0;
5499 5506
5500 min_val = INT_MAX; 5507 min_val = INT_MAX;
5501 5508
5502 for (i = 0; i < MAX_NUMNODES; i++) { 5509 for (i = 0; i < MAX_NUMNODES; i++) {
5503 /* Start at @node */ 5510 /* Start at @node */
5504 n = (node + i) % MAX_NUMNODES; 5511 n = (node + i) % MAX_NUMNODES;
5505 5512
5506 if (!nr_cpus_node(n)) 5513 if (!nr_cpus_node(n))
5507 continue; 5514 continue;
5508 5515
5509 /* Skip already used nodes */ 5516 /* Skip already used nodes */
5510 if (test_bit(n, used_nodes)) 5517 if (test_bit(n, used_nodes))
5511 continue; 5518 continue;
5512 5519
5513 /* Simple min distance search */ 5520 /* Simple min distance search */
5514 val = node_distance(node, n); 5521 val = node_distance(node, n);
5515 5522
5516 if (val < min_val) { 5523 if (val < min_val) {
5517 min_val = val; 5524 min_val = val;
5518 best_node = n; 5525 best_node = n;
5519 } 5526 }
5520 } 5527 }
5521 5528
5522 set_bit(best_node, used_nodes); 5529 set_bit(best_node, used_nodes);
5523 return best_node; 5530 return best_node;
5524 } 5531 }
5525 5532
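find_next_best_node() is a greedy nearest-neighbour step: each call claims the closest node not yet in the span, and sched_domain_node_span() below invokes it SD_NODES_PER_DOMAIN-1 times to grow the span. A simplified user-space sketch of the same greedy selection over an invented 4-node distance table (it omits the kernel's wrap-around scan order and the empty-node check):

#include <stdio.h>
#include <limits.h>

#define NODES 4

/* invented node_distance() values, symmetric, 10 on the diagonal */
static const int dist[NODES][NODES] = {
        { 10, 20, 40, 30 },
        { 20, 10, 20, 40 },
        { 40, 20, 10, 20 },
        { 30, 40, 20, 10 },
};

static int next_best(int node, int used[NODES])
{
        int n, best = -1, min = INT_MAX;

        for (n = 0; n < NODES; n++) {
                if (used[n] || dist[node][n] >= min)
                        continue;
                min = dist[node][n];
                best = n;
        }
        if (best >= 0)
                used[best] = 1;         /* claim it, like set_bit() above */
        return best;
}

int main(void)
{
        int used[NODES] = { 1, 0, 0, 0 };       /* node 0 starts the span */
        int i;

        for (i = 0; i < 2; i++)
                printf("next node: %d\n", next_best(0, used));
        return 0;
}

Starting from node 0 this picks node 1 (distance 20) and then node 3 (distance 30).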
5526 /** 5533 /**
5527 * sched_domain_node_span - get a cpumask for a node's sched_domain 5534 * sched_domain_node_span - get a cpumask for a node's sched_domain
5528 * @node: node whose cpumask we're constructing 5535 * @node: node whose cpumask we're constructing
5529 * @size: number of nodes to include in this span 5536 * @size: number of nodes to include in this span
5530 * 5537 *
5531 * Given a node, construct a good cpumask for its sched_domain to span. It 5538 * Given a node, construct a good cpumask for its sched_domain to span. It
5532 * should be one that prevents unnecessary balancing, but also spreads tasks 5539 * should be one that prevents unnecessary balancing, but also spreads tasks
5533 * out optimally. 5540 * out optimally.
5534 */ 5541 */
5535 static cpumask_t sched_domain_node_span(int node) 5542 static cpumask_t sched_domain_node_span(int node)
5536 { 5543 {
5537 int i; 5544 int i;
5538 cpumask_t span, nodemask; 5545 cpumask_t span, nodemask;
5539 DECLARE_BITMAP(used_nodes, MAX_NUMNODES); 5546 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
5540 5547
5541 cpus_clear(span); 5548 cpus_clear(span);
5542 bitmap_zero(used_nodes, MAX_NUMNODES); 5549 bitmap_zero(used_nodes, MAX_NUMNODES);
5543 5550
5544 nodemask = node_to_cpumask(node); 5551 nodemask = node_to_cpumask(node);
5545 cpus_or(span, span, nodemask); 5552 cpus_or(span, span, nodemask);
5546 set_bit(node, used_nodes); 5553 set_bit(node, used_nodes);
5547 5554
5548 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 5555 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5549 int next_node = find_next_best_node(node, used_nodes); 5556 int next_node = find_next_best_node(node, used_nodes);
5550 nodemask = node_to_cpumask(next_node); 5557 nodemask = node_to_cpumask(next_node);
5551 cpus_or(span, span, nodemask); 5558 cpus_or(span, span, nodemask);
5552 } 5559 }
5553 5560
5554 return span; 5561 return span;
5555 } 5562 }
5556 #endif 5563 #endif
5557 5564
5558 /* 5565 /*
5559 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we 5566 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
5560 * can switch it on easily if needed. 5567 * can switch it on easily if needed.
5561 */ 5568 */
5562 #ifdef CONFIG_SCHED_SMT 5569 #ifdef CONFIG_SCHED_SMT
5563 static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 5570 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
5564 static struct sched_group sched_group_cpus[NR_CPUS]; 5571 static struct sched_group sched_group_cpus[NR_CPUS];
5565 static int cpu_to_cpu_group(int cpu) 5572 static int cpu_to_cpu_group(int cpu)
5566 { 5573 {
5567 return cpu; 5574 return cpu;
5568 } 5575 }
5569 #endif 5576 #endif
5570 5577
5571 static DEFINE_PER_CPU(struct sched_domain, phys_domains); 5578 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5572 static struct sched_group sched_group_phys[NR_CPUS]; 5579 static struct sched_group sched_group_phys[NR_CPUS];
5573 static int cpu_to_phys_group(int cpu) 5580 static int cpu_to_phys_group(int cpu)
5574 { 5581 {
5575 #ifdef CONFIG_SCHED_SMT 5582 #ifdef CONFIG_SCHED_SMT
5576 return first_cpu(cpu_sibling_map[cpu]); 5583 return first_cpu(cpu_sibling_map[cpu]);
5577 #else 5584 #else
5578 return cpu; 5585 return cpu;
5579 #endif 5586 #endif
5580 } 5587 }
5581 5588
5582 #ifdef CONFIG_NUMA 5589 #ifdef CONFIG_NUMA
5583 /* 5590 /*
5584 * The init_sched_build_groups can't handle what we want to do with node 5591 * The init_sched_build_groups can't handle what we want to do with node
5585 * groups, so roll our own. Now each node has its own list of groups which 5592 * groups, so roll our own. Now each node has its own list of groups which
5586 * gets dynamically allocated. 5593 * gets dynamically allocated.
5587 */ 5594 */
5588 static DEFINE_PER_CPU(struct sched_domain, node_domains); 5595 static DEFINE_PER_CPU(struct sched_domain, node_domains);
5589 static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; 5596 static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
5590 5597
5591 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); 5598 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
5592 static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; 5599 static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
5593 5600
5594 static int cpu_to_allnodes_group(int cpu) 5601 static int cpu_to_allnodes_group(int cpu)
5595 { 5602 {
5596 return cpu_to_node(cpu); 5603 return cpu_to_node(cpu);
5597 } 5604 }
5598 #endif 5605 #endif
5599 5606
5600 /* 5607 /*
5601 * Build sched domains for a given set of cpus and attach the sched domains 5608 * Build sched domains for a given set of cpus and attach the sched domains
5602 * to the individual cpus 5609 * to the individual cpus
5603 */ 5610 */
5604 void build_sched_domains(const cpumask_t *cpu_map) 5611 void build_sched_domains(const cpumask_t *cpu_map)
5605 { 5612 {
5606 int i; 5613 int i;
5607 #ifdef CONFIG_NUMA 5614 #ifdef CONFIG_NUMA
5608 struct sched_group **sched_group_nodes = NULL; 5615 struct sched_group **sched_group_nodes = NULL;
5609 struct sched_group *sched_group_allnodes = NULL; 5616 struct sched_group *sched_group_allnodes = NULL;
5610 5617
5611 /* 5618 /*
5612 * Allocate the per-node list of sched groups 5619 * Allocate the per-node list of sched groups
5613 */ 5620 */
5614 sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES, 5621 sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
5615 GFP_ATOMIC); 5622 GFP_ATOMIC);
5616 if (!sched_group_nodes) { 5623 if (!sched_group_nodes) {
5617 printk(KERN_WARNING "Can not alloc sched group node list\n"); 5624 printk(KERN_WARNING "Can not alloc sched group node list\n");
5618 return; 5625 return;
5619 } 5626 }
5620 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; 5627 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5621 #endif 5628 #endif
5622 5629
5623 /* 5630 /*
5624 * Set up domains for cpus specified by the cpu_map. 5631 * Set up domains for cpus specified by the cpu_map.
5625 */ 5632 */
5626 for_each_cpu_mask(i, *cpu_map) { 5633 for_each_cpu_mask(i, *cpu_map) {
5627 int group; 5634 int group;
5628 struct sched_domain *sd = NULL, *p; 5635 struct sched_domain *sd = NULL, *p;
5629 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); 5636 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
5630 5637
5631 cpus_and(nodemask, nodemask, *cpu_map); 5638 cpus_and(nodemask, nodemask, *cpu_map);
5632 5639
5633 #ifdef CONFIG_NUMA 5640 #ifdef CONFIG_NUMA
5634 if (cpus_weight(*cpu_map) 5641 if (cpus_weight(*cpu_map)
5635 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { 5642 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
5636 if (!sched_group_allnodes) { 5643 if (!sched_group_allnodes) {
5637 sched_group_allnodes 5644 sched_group_allnodes
5638 = kmalloc(sizeof(struct sched_group) 5645 = kmalloc(sizeof(struct sched_group)
5639 * MAX_NUMNODES, 5646 * MAX_NUMNODES,
5640 GFP_KERNEL); 5647 GFP_KERNEL);
5641 if (!sched_group_allnodes) { 5648 if (!sched_group_allnodes) {
5642 printk(KERN_WARNING 5649 printk(KERN_WARNING
5643 "Can not alloc allnodes sched group\n"); 5650 "Can not alloc allnodes sched group\n");
5644 break; 5651 break;
5645 } 5652 }
5646 sched_group_allnodes_bycpu[i] 5653 sched_group_allnodes_bycpu[i]
5647 = sched_group_allnodes; 5654 = sched_group_allnodes;
5648 } 5655 }
5649 sd = &per_cpu(allnodes_domains, i); 5656 sd = &per_cpu(allnodes_domains, i);
5650 *sd = SD_ALLNODES_INIT; 5657 *sd = SD_ALLNODES_INIT;
5651 sd->span = *cpu_map; 5658 sd->span = *cpu_map;
5652 group = cpu_to_allnodes_group(i); 5659 group = cpu_to_allnodes_group(i);
5653 sd->groups = &sched_group_allnodes[group]; 5660 sd->groups = &sched_group_allnodes[group];
5654 p = sd; 5661 p = sd;
5655 } else 5662 } else
5656 p = NULL; 5663 p = NULL;
5657 5664
5658 sd = &per_cpu(node_domains, i); 5665 sd = &per_cpu(node_domains, i);
5659 *sd = SD_NODE_INIT; 5666 *sd = SD_NODE_INIT;
5660 sd->span = sched_domain_node_span(cpu_to_node(i)); 5667 sd->span = sched_domain_node_span(cpu_to_node(i));
5661 sd->parent = p; 5668 sd->parent = p;
5662 cpus_and(sd->span, sd->span, *cpu_map); 5669 cpus_and(sd->span, sd->span, *cpu_map);
5663 #endif 5670 #endif
5664 5671
5665 p = sd; 5672 p = sd;
5666 sd = &per_cpu(phys_domains, i); 5673 sd = &per_cpu(phys_domains, i);
5667 group = cpu_to_phys_group(i); 5674 group = cpu_to_phys_group(i);
5668 *sd = SD_CPU_INIT; 5675 *sd = SD_CPU_INIT;
5669 sd->span = nodemask; 5676 sd->span = nodemask;
5670 sd->parent = p; 5677 sd->parent = p;
5671 sd->groups = &sched_group_phys[group]; 5678 sd->groups = &sched_group_phys[group];
5672 5679
5673 #ifdef CONFIG_SCHED_SMT 5680 #ifdef CONFIG_SCHED_SMT
5674 p = sd; 5681 p = sd;
5675 sd = &per_cpu(cpu_domains, i); 5682 sd = &per_cpu(cpu_domains, i);
5676 group = cpu_to_cpu_group(i); 5683 group = cpu_to_cpu_group(i);
5677 *sd = SD_SIBLING_INIT; 5684 *sd = SD_SIBLING_INIT;
5678 sd->span = cpu_sibling_map[i]; 5685 sd->span = cpu_sibling_map[i];
5679 cpus_and(sd->span, sd->span, *cpu_map); 5686 cpus_and(sd->span, sd->span, *cpu_map);
5680 sd->parent = p; 5687 sd->parent = p;
5681 sd->groups = &sched_group_cpus[group]; 5688 sd->groups = &sched_group_cpus[group];
5682 #endif 5689 #endif
5683 } 5690 }
5684 5691
5685 #ifdef CONFIG_SCHED_SMT 5692 #ifdef CONFIG_SCHED_SMT
5686 /* Set up CPU (sibling) groups */ 5693 /* Set up CPU (sibling) groups */
5687 for_each_cpu_mask(i, *cpu_map) { 5694 for_each_cpu_mask(i, *cpu_map) {
5688 cpumask_t this_sibling_map = cpu_sibling_map[i]; 5695 cpumask_t this_sibling_map = cpu_sibling_map[i];
5689 cpus_and(this_sibling_map, this_sibling_map, *cpu_map); 5696 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
5690 if (i != first_cpu(this_sibling_map)) 5697 if (i != first_cpu(this_sibling_map))
5691 continue; 5698 continue;
5692 5699
5693 init_sched_build_groups(sched_group_cpus, this_sibling_map, 5700 init_sched_build_groups(sched_group_cpus, this_sibling_map,
5694 &cpu_to_cpu_group); 5701 &cpu_to_cpu_group);
5695 } 5702 }
5696 #endif 5703 #endif
5697 5704
5698 /* Set up physical groups */ 5705 /* Set up physical groups */
5699 for (i = 0; i < MAX_NUMNODES; i++) { 5706 for (i = 0; i < MAX_NUMNODES; i++) {
5700 cpumask_t nodemask = node_to_cpumask(i); 5707 cpumask_t nodemask = node_to_cpumask(i);
5701 5708
5702 cpus_and(nodemask, nodemask, *cpu_map); 5709 cpus_and(nodemask, nodemask, *cpu_map);
5703 if (cpus_empty(nodemask)) 5710 if (cpus_empty(nodemask))
5704 continue; 5711 continue;
5705 5712
5706 init_sched_build_groups(sched_group_phys, nodemask, 5713 init_sched_build_groups(sched_group_phys, nodemask,
5707 &cpu_to_phys_group); 5714 &cpu_to_phys_group);
5708 } 5715 }
5709 5716
5710 #ifdef CONFIG_NUMA 5717 #ifdef CONFIG_NUMA
5711 /* Set up node groups */ 5718 /* Set up node groups */
5712 if (sched_group_allnodes) 5719 if (sched_group_allnodes)
5713 init_sched_build_groups(sched_group_allnodes, *cpu_map, 5720 init_sched_build_groups(sched_group_allnodes, *cpu_map,
5714 &cpu_to_allnodes_group); 5721 &cpu_to_allnodes_group);
5715 5722
5716 for (i = 0; i < MAX_NUMNODES; i++) { 5723 for (i = 0; i < MAX_NUMNODES; i++) {
5717 /* Set up node groups */ 5724 /* Set up node groups */
5718 struct sched_group *sg, *prev; 5725 struct sched_group *sg, *prev;
5719 cpumask_t nodemask = node_to_cpumask(i); 5726 cpumask_t nodemask = node_to_cpumask(i);
5720 cpumask_t domainspan; 5727 cpumask_t domainspan;
5721 cpumask_t covered = CPU_MASK_NONE; 5728 cpumask_t covered = CPU_MASK_NONE;
5722 int j; 5729 int j;
5723 5730
5724 cpus_and(nodemask, nodemask, *cpu_map); 5731 cpus_and(nodemask, nodemask, *cpu_map);
5725 if (cpus_empty(nodemask)) { 5732 if (cpus_empty(nodemask)) {
5726 sched_group_nodes[i] = NULL; 5733 sched_group_nodes[i] = NULL;
5727 continue; 5734 continue;
5728 } 5735 }
5729 5736
5730 domainspan = sched_domain_node_span(i); 5737 domainspan = sched_domain_node_span(i);
5731 cpus_and(domainspan, domainspan, *cpu_map); 5738 cpus_and(domainspan, domainspan, *cpu_map);
5732 5739
5733 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); 5740 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
5734 sched_group_nodes[i] = sg; 5741 sched_group_nodes[i] = sg;
5735 for_each_cpu_mask(j, nodemask) { 5742 for_each_cpu_mask(j, nodemask) {
5736 struct sched_domain *sd; 5743 struct sched_domain *sd;
5737 sd = &per_cpu(node_domains, j); 5744 sd = &per_cpu(node_domains, j);
5738 sd->groups = sg; 5745 sd->groups = sg;
5739 if (sd->groups == NULL) { 5746 if (sd->groups == NULL) {
5740 /* Turn off balancing if we have no groups */ 5747 /* Turn off balancing if we have no groups */
5741 sd->flags = 0; 5748 sd->flags = 0;
5742 } 5749 }
5743 } 5750 }
5744 if (!sg) { 5751 if (!sg) {
5745 printk(KERN_WARNING 5752 printk(KERN_WARNING
5746 "Can not alloc domain group for node %d\n", i); 5753 "Can not alloc domain group for node %d\n", i);
5747 continue; 5754 continue;
5748 } 5755 }
5749 sg->cpu_power = 0; 5756 sg->cpu_power = 0;
5750 sg->cpumask = nodemask; 5757 sg->cpumask = nodemask;
5751 cpus_or(covered, covered, nodemask); 5758 cpus_or(covered, covered, nodemask);
5752 prev = sg; 5759 prev = sg;
5753 5760
5754 for (j = 0; j < MAX_NUMNODES; j++) { 5761 for (j = 0; j < MAX_NUMNODES; j++) {
5755 cpumask_t tmp, notcovered; 5762 cpumask_t tmp, notcovered;
5756 int n = (i + j) % MAX_NUMNODES; 5763 int n = (i + j) % MAX_NUMNODES;
5757 5764
5758 cpus_complement(notcovered, covered); 5765 cpus_complement(notcovered, covered);
5759 cpus_and(tmp, notcovered, *cpu_map); 5766 cpus_and(tmp, notcovered, *cpu_map);
5760 cpus_and(tmp, tmp, domainspan); 5767 cpus_and(tmp, tmp, domainspan);
5761 if (cpus_empty(tmp)) 5768 if (cpus_empty(tmp))
5762 break; 5769 break;
5763 5770
5764 nodemask = node_to_cpumask(n); 5771 nodemask = node_to_cpumask(n);
5765 cpus_and(tmp, tmp, nodemask); 5772 cpus_and(tmp, tmp, nodemask);
5766 if (cpus_empty(tmp)) 5773 if (cpus_empty(tmp))
5767 continue; 5774 continue;
5768 5775
5769 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); 5776 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
5770 if (!sg) { 5777 if (!sg) {
5771 printk(KERN_WARNING 5778 printk(KERN_WARNING
5772 "Can not alloc domain group for node %d\n", j); 5779 "Can not alloc domain group for node %d\n", j);
5773 break; 5780 break;
5774 } 5781 }
5775 sg->cpu_power = 0; 5782 sg->cpu_power = 0;
5776 sg->cpumask = tmp; 5783 sg->cpumask = tmp;
5777 cpus_or(covered, covered, tmp); 5784 cpus_or(covered, covered, tmp);
5778 prev->next = sg; 5785 prev->next = sg;
5779 prev = sg; 5786 prev = sg;
5780 } 5787 }
5781 prev->next = sched_group_nodes[i]; 5788 prev->next = sched_group_nodes[i];
5782 } 5789 }
5783 #endif 5790 #endif
5784 5791
5785 /* Calculate CPU power for physical packages and nodes */ 5792 /* Calculate CPU power for physical packages and nodes */
5786 for_each_cpu_mask(i, *cpu_map) { 5793 for_each_cpu_mask(i, *cpu_map) {
5787 int power; 5794 int power;
5788 struct sched_domain *sd; 5795 struct sched_domain *sd;
5789 #ifdef CONFIG_SCHED_SMT 5796 #ifdef CONFIG_SCHED_SMT
5790 sd = &per_cpu(cpu_domains, i); 5797 sd = &per_cpu(cpu_domains, i);
5791 power = SCHED_LOAD_SCALE; 5798 power = SCHED_LOAD_SCALE;
5792 sd->groups->cpu_power = power; 5799 sd->groups->cpu_power = power;
5793 #endif 5800 #endif
5794 5801
5795 sd = &per_cpu(phys_domains, i); 5802 sd = &per_cpu(phys_domains, i);
5796 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * 5803 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5797 (cpus_weight(sd->groups->cpumask)-1) / 10; 5804 (cpus_weight(sd->groups->cpumask)-1) / 10;
5798 sd->groups->cpu_power = power; 5805 sd->groups->cpu_power = power;
5799 5806
5800 #ifdef CONFIG_NUMA 5807 #ifdef CONFIG_NUMA
5801 sd = &per_cpu(allnodes_domains, i); 5808 sd = &per_cpu(allnodes_domains, i);
5802 if (sd->groups) { 5809 if (sd->groups) {
5803 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * 5810 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5804 (cpus_weight(sd->groups->cpumask)-1) / 10; 5811 (cpus_weight(sd->groups->cpumask)-1) / 10;
5805 sd->groups->cpu_power = power; 5812 sd->groups->cpu_power = power;
5806 } 5813 }
5807 #endif 5814 #endif
5808 } 5815 }
5809 5816
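As a worked example of the cpu_power formula just applied (assuming SCHED_LOAD_SCALE is 128, its usual value in this kernel): a physical group spanning four CPUs gets 128 + 128 * (4 - 1) / 10 = 166, while each SMT sibling group stays at a flat 128.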
5810 #ifdef CONFIG_NUMA 5817 #ifdef CONFIG_NUMA
5811 for (i = 0; i < MAX_NUMNODES; i++) { 5818 for (i = 0; i < MAX_NUMNODES; i++) {
5812 struct sched_group *sg = sched_group_nodes[i]; 5819 struct sched_group *sg = sched_group_nodes[i];
5813 int j; 5820 int j;
5814 5821
5815 if (sg == NULL) 5822 if (sg == NULL)
5816 continue; 5823 continue;
5817 next_sg: 5824 next_sg:
5818 for_each_cpu_mask(j, sg->cpumask) { 5825 for_each_cpu_mask(j, sg->cpumask) {
5819 struct sched_domain *sd; 5826 struct sched_domain *sd;
5820 int power; 5827 int power;
5821 5828
5822 sd = &per_cpu(phys_domains, j); 5829 sd = &per_cpu(phys_domains, j);
5823 if (j != first_cpu(sd->groups->cpumask)) { 5830 if (j != first_cpu(sd->groups->cpumask)) {
5824 /* 5831 /*
5825 * Only add "power" once for each 5832 * Only add "power" once for each
5826 * physical package. 5833 * physical package.
5827 */ 5834 */
5828 continue; 5835 continue;
5829 } 5836 }
5830 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * 5837 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5831 (cpus_weight(sd->groups->cpumask)-1) / 10; 5838 (cpus_weight(sd->groups->cpumask)-1) / 10;
5832 5839
5833 sg->cpu_power += power; 5840 sg->cpu_power += power;
5834 } 5841 }
5835 sg = sg->next; 5842 sg = sg->next;
5836 if (sg != sched_group_nodes[i]) 5843 if (sg != sched_group_nodes[i])
5837 goto next_sg; 5844 goto next_sg;
5838 } 5845 }
5839 #endif 5846 #endif
5840 5847
5841 /* Attach the domains */ 5848 /* Attach the domains */
5842 for_each_cpu_mask(i, *cpu_map) { 5849 for_each_cpu_mask(i, *cpu_map) {
5843 struct sched_domain *sd; 5850 struct sched_domain *sd;
5844 #ifdef CONFIG_SCHED_SMT 5851 #ifdef CONFIG_SCHED_SMT
5845 sd = &per_cpu(cpu_domains, i); 5852 sd = &per_cpu(cpu_domains, i);
5846 #else 5853 #else
5847 sd = &per_cpu(phys_domains, i); 5854 sd = &per_cpu(phys_domains, i);
5848 #endif 5855 #endif
5849 cpu_attach_domain(sd, i); 5856 cpu_attach_domain(sd, i);
5850 } 5857 }
5851 /* 5858 /*
5852 * Tune cache-hot values: 5859 * Tune cache-hot values:
5853 */ 5860 */
5854 calibrate_migration_costs(cpu_map); 5861 calibrate_migration_costs(cpu_map);
5855 } 5862 }
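
Editor's aside (not part of the patch): the cpu_power assignments above give an SMT sibling domain exactly SCHED_LOAD_SCALE, while physical-package and allnodes groups get SCHED_LOAD_SCALE plus a 10% bonus for every additional CPU in the group. A minimal standalone C sketch of that arithmetic, taking SCHED_LOAD_SCALE as 128 purely for illustration (the real constant is defined in include/linux/sched.h):

#include <stdio.h>

/* Illustrative value only; the kernel defines the real constant. */
#define SCHED_LOAD_SCALE 128UL

/* Same arithmetic as the phys/allnodes branches above:
 * base scale plus a 10% bonus for every CPU beyond the first. */
static unsigned long group_cpu_power(unsigned int nr_cpus_in_group)
{
        return SCHED_LOAD_SCALE +
               SCHED_LOAD_SCALE * (nr_cpus_in_group - 1) / 10;
}

int main(void)
{
        unsigned int n;

        for (n = 1; n <= 4; n++)
                printf("%u CPUs in group -> cpu_power %lu\n",
                       n, group_cpu_power(n));
        return 0;
}

A two-CPU package thus ends up with about 110% of the base scale rather than 200%, which reflects the idea that two HT siblings are not worth two full CPUs to the load balancer.
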
5856 /* 5863 /*
5857 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 5864 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
5858 */ 5865 */
5859 static void arch_init_sched_domains(const cpumask_t *cpu_map) 5866 static void arch_init_sched_domains(const cpumask_t *cpu_map)
5860 { 5867 {
5861 cpumask_t cpu_default_map; 5868 cpumask_t cpu_default_map;
5862 5869
5863 /* 5870 /*
5864  * Set up mask for cpus without special case scheduling requirements. 5871  * Set up mask for cpus without special case scheduling requirements.
5865 * For now this just excludes isolated cpus, but could be used to 5872 * For now this just excludes isolated cpus, but could be used to
5866 * exclude other special cases in the future. 5873 * exclude other special cases in the future.
5867 */ 5874 */
5868 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); 5875 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
5869 5876
5870 build_sched_domains(&cpu_default_map); 5877 build_sched_domains(&cpu_default_map);
5871 } 5878 }
5872 5879
5873 static void arch_destroy_sched_domains(const cpumask_t *cpu_map) 5880 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
5874 { 5881 {
5875 #ifdef CONFIG_NUMA 5882 #ifdef CONFIG_NUMA
5876 int i; 5883 int i;
5877 int cpu; 5884 int cpu;
5878 5885
5879 for_each_cpu_mask(cpu, *cpu_map) { 5886 for_each_cpu_mask(cpu, *cpu_map) {
5880 struct sched_group *sched_group_allnodes 5887 struct sched_group *sched_group_allnodes
5881 = sched_group_allnodes_bycpu[cpu]; 5888 = sched_group_allnodes_bycpu[cpu];
5882 struct sched_group **sched_group_nodes 5889 struct sched_group **sched_group_nodes
5883 = sched_group_nodes_bycpu[cpu]; 5890 = sched_group_nodes_bycpu[cpu];
5884 5891
5885 if (sched_group_allnodes) { 5892 if (sched_group_allnodes) {
5886 kfree(sched_group_allnodes); 5893 kfree(sched_group_allnodes);
5887 sched_group_allnodes_bycpu[cpu] = NULL; 5894 sched_group_allnodes_bycpu[cpu] = NULL;
5888 } 5895 }
5889 5896
5890 if (!sched_group_nodes) 5897 if (!sched_group_nodes)
5891 continue; 5898 continue;
5892 5899
5893 for (i = 0; i < MAX_NUMNODES; i++) { 5900 for (i = 0; i < MAX_NUMNODES; i++) {
5894 cpumask_t nodemask = node_to_cpumask(i); 5901 cpumask_t nodemask = node_to_cpumask(i);
5895 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 5902 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5896 5903
5897 cpus_and(nodemask, nodemask, *cpu_map); 5904 cpus_and(nodemask, nodemask, *cpu_map);
5898 if (cpus_empty(nodemask)) 5905 if (cpus_empty(nodemask))
5899 continue; 5906 continue;
5900 5907
5901 if (sg == NULL) 5908 if (sg == NULL)
5902 continue; 5909 continue;
5903 sg = sg->next; 5910 sg = sg->next;
5904 next_sg: 5911 next_sg:
5905 oldsg = sg; 5912 oldsg = sg;
5906 sg = sg->next; 5913 sg = sg->next;
5907 kfree(oldsg); 5914 kfree(oldsg);
5908 if (oldsg != sched_group_nodes[i]) 5915 if (oldsg != sched_group_nodes[i])
5909 goto next_sg; 5916 goto next_sg;
5910 } 5917 }
5911 kfree(sched_group_nodes); 5918 kfree(sched_group_nodes);
5912 sched_group_nodes_bycpu[cpu] = NULL; 5919 sched_group_nodes_bycpu[cpu] = NULL;
5913 } 5920 }
5914 #endif 5921 #endif
5915 } 5922 }
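
Aside: the next_sg loop above tears down a circular, singly linked list of sched_group structures, freeing every node and stopping once the node just freed was the list head. A standalone C sketch of the same traversal, using a hypothetical node type in place of struct sched_group:

#include <stdlib.h>

/* Hypothetical stand-in for the kernel's struct sched_group. */
struct node {
        struct node *next;      /* circular: the last node points back to the head */
};

/* Free every node of a circular singly linked list, head included.
 * Mirrors the next_sg loop above: start at head->next, advance before
 * freeing, and stop once the node just freed was the head itself. */
static void free_circular(struct node *head)
{
        struct node *sg, *oldsg;

        if (head == NULL)
                return;
        sg = head->next;
        do {
                oldsg = sg;
                sg = sg->next;
                free(oldsg);
        } while (oldsg != head);
}

int main(void)
{
        /* Build a three-node circular list and tear it down again. */
        struct node *a = malloc(sizeof(*a));
        struct node *b = malloc(sizeof(*b));
        struct node *c = malloc(sizeof(*c));

        a->next = b;
        b->next = c;
        c->next = a;
        free_circular(a);
        return 0;
}
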
5916 5923
5917 /* 5924 /*
5918 * Detach sched domains from a group of cpus specified in cpu_map 5925 * Detach sched domains from a group of cpus specified in cpu_map
5919 * These cpus will now be attached to the NULL domain 5926 * These cpus will now be attached to the NULL domain
5920 */ 5927 */
5921 static void detach_destroy_domains(const cpumask_t *cpu_map) 5928 static void detach_destroy_domains(const cpumask_t *cpu_map)
5922 { 5929 {
5923 int i; 5930 int i;
5924 5931
5925 for_each_cpu_mask(i, *cpu_map) 5932 for_each_cpu_mask(i, *cpu_map)
5926 cpu_attach_domain(NULL, i); 5933 cpu_attach_domain(NULL, i);
5927 synchronize_sched(); 5934 synchronize_sched();
5928 arch_destroy_sched_domains(cpu_map); 5935 arch_destroy_sched_domains(cpu_map);
5929 } 5936 }
5930 5937
5931 /* 5938 /*
5932 * Partition sched domains as specified by the cpumasks below. 5939 * Partition sched domains as specified by the cpumasks below.
5933 * This attaches all cpus from the cpumasks to the NULL domain, 5940 * This attaches all cpus from the cpumasks to the NULL domain,
5934  * waits for an RCU quiescent period, recalculates sched 5941  * waits for an RCU quiescent period, recalculates sched
5935  * domain information and then attaches them back to the 5942  * domain information and then attaches them back to the
5936  * correct sched domains. 5943  * correct sched domains.
5937  * Call with hotplug lock held. 5944  * Call with hotplug lock held.
5938 */ 5945 */
5939 void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) 5946 void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
5940 { 5947 {
5941 cpumask_t change_map; 5948 cpumask_t change_map;
5942 5949
5943 cpus_and(*partition1, *partition1, cpu_online_map); 5950 cpus_and(*partition1, *partition1, cpu_online_map);
5944 cpus_and(*partition2, *partition2, cpu_online_map); 5951 cpus_and(*partition2, *partition2, cpu_online_map);
5945 cpus_or(change_map, *partition1, *partition2); 5952 cpus_or(change_map, *partition1, *partition2);
5946 5953
5947 /* Detach sched domains from all of the affected cpus */ 5954 /* Detach sched domains from all of the affected cpus */
5948 detach_destroy_domains(&change_map); 5955 detach_destroy_domains(&change_map);
5949 if (!cpus_empty(*partition1)) 5956 if (!cpus_empty(*partition1))
5950 build_sched_domains(partition1); 5957 build_sched_domains(partition1);
5951 if (!cpus_empty(*partition2)) 5958 if (!cpus_empty(*partition2))
5952 build_sched_domains(partition2); 5959 build_sched_domains(partition2);
5953 } 5960 }
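
Aside: partition_sched_domains() is essentially cpumask set algebra followed by rebuilds: clip both partitions to the online map, take their union as the set of CPUs to detach, then rebuild each non-empty partition. The sketch below mirrors that arithmetic (and the cpus_andnot() isolation filter used by arch_init_sched_domains()) on a plain 64-bit mask; it is an illustration, not the kernel cpumask API:

#include <stdint.h>
#include <stdio.h>

/* Toy 64-CPU masks standing in for cpumask_t; bit i means CPU i is a member. */
static uint64_t mask_and(uint64_t a, uint64_t b)    { return a & b; }
static uint64_t mask_or(uint64_t a, uint64_t b)     { return a | b; }
static uint64_t mask_andnot(uint64_t a, uint64_t b) { return a & ~b; }

int main(void)
{
        uint64_t cpu_online_map   = 0x0F;    /* CPUs 0-3 online */
        uint64_t cpu_isolated_map = 0x08;    /* CPU 3 isolated */
        uint64_t partition1       = 0x03;    /* CPUs 0-1 */
        uint64_t partition2       = 0x3C;    /* CPUs 2-5; 4 and 5 are offline */
        uint64_t change_map, default_map;

        /* Same clipping partition_sched_domains() applies against the online map. */
        partition1 = mask_and(partition1, cpu_online_map);
        partition2 = mask_and(partition2, cpu_online_map);
        change_map = mask_or(partition1, partition2);

        /* And the isolation filter applied by arch_init_sched_domains(). */
        default_map = mask_andnot(cpu_online_map, cpu_isolated_map);

        printf("change_map  = 0x%llx\n", (unsigned long long)change_map);
        printf("default_map = 0x%llx\n", (unsigned long long)default_map);
        return 0;
}
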
5954 5961
5955 #ifdef CONFIG_HOTPLUG_CPU 5962 #ifdef CONFIG_HOTPLUG_CPU
5956 /* 5963 /*
5957 * Force a reinitialization of the sched domains hierarchy. The domains 5964 * Force a reinitialization of the sched domains hierarchy. The domains
5958 * and groups cannot be updated in place without racing with the balancing 5965 * and groups cannot be updated in place without racing with the balancing
5959 * code, so we temporarily attach all running cpus to the NULL domain 5966 * code, so we temporarily attach all running cpus to the NULL domain
5960 * which will prevent rebalancing while the sched domains are recalculated. 5967 * which will prevent rebalancing while the sched domains are recalculated.
5961 */ 5968 */
5962 static int update_sched_domains(struct notifier_block *nfb, 5969 static int update_sched_domains(struct notifier_block *nfb,
5963 unsigned long action, void *hcpu) 5970 unsigned long action, void *hcpu)
5964 { 5971 {
5965 switch (action) { 5972 switch (action) {
5966 case CPU_UP_PREPARE: 5973 case CPU_UP_PREPARE:
5967 case CPU_DOWN_PREPARE: 5974 case CPU_DOWN_PREPARE:
5968 detach_destroy_domains(&cpu_online_map); 5975 detach_destroy_domains(&cpu_online_map);
5969 return NOTIFY_OK; 5976 return NOTIFY_OK;
5970 5977
5971 case CPU_UP_CANCELED: 5978 case CPU_UP_CANCELED:
5972 case CPU_DOWN_FAILED: 5979 case CPU_DOWN_FAILED:
5973 case CPU_ONLINE: 5980 case CPU_ONLINE:
5974 case CPU_DEAD: 5981 case CPU_DEAD:
5975 /* 5982 /*
5976 * Fall through and re-initialise the domains. 5983 * Fall through and re-initialise the domains.
5977 */ 5984 */
5978 break; 5985 break;
5979 default: 5986 default:
5980 return NOTIFY_DONE; 5987 return NOTIFY_DONE;
5981 } 5988 }
5982 5989
5983 /* The hotplug lock is already held by cpu_up/cpu_down */ 5990 /* The hotplug lock is already held by cpu_up/cpu_down */
5984 arch_init_sched_domains(&cpu_online_map); 5991 arch_init_sched_domains(&cpu_online_map);
5985 5992
5986 return NOTIFY_OK; 5993 return NOTIFY_OK;
5987 } 5994 }
5988 #endif 5995 #endif
5989 5996
5990 void __init sched_init_smp(void) 5997 void __init sched_init_smp(void)
5991 { 5998 {
5992 lock_cpu_hotplug(); 5999 lock_cpu_hotplug();
5993 arch_init_sched_domains(&cpu_online_map); 6000 arch_init_sched_domains(&cpu_online_map);
5994 unlock_cpu_hotplug(); 6001 unlock_cpu_hotplug();
5995 /* XXX: Theoretical race here - CPU may be hotplugged now */ 6002 /* XXX: Theoretical race here - CPU may be hotplugged now */
5996 hotcpu_notifier(update_sched_domains, 0); 6003 hotcpu_notifier(update_sched_domains, 0);
5997 } 6004 }
5998 #else 6005 #else
5999 void __init sched_init_smp(void) 6006 void __init sched_init_smp(void)
6000 { 6007 {
6001 } 6008 }
6002 #endif /* CONFIG_SMP */ 6009 #endif /* CONFIG_SMP */
6003 6010
6004 int in_sched_functions(unsigned long addr) 6011 int in_sched_functions(unsigned long addr)
6005 { 6012 {
6006 /* Linker adds these: start and end of __sched functions */ 6013 /* Linker adds these: start and end of __sched functions */
6007 extern char __sched_text_start[], __sched_text_end[]; 6014 extern char __sched_text_start[], __sched_text_end[];
6008 return in_lock_functions(addr) || 6015 return in_lock_functions(addr) ||
6009 (addr >= (unsigned long)__sched_text_start 6016 (addr >= (unsigned long)__sched_text_start
6010 && addr < (unsigned long)__sched_text_end); 6017 && addr < (unsigned long)__sched_text_end);
6011 } 6018 }
6012 6019
6013 void __init sched_init(void) 6020 void __init sched_init(void)
6014 { 6021 {
6015 runqueue_t *rq; 6022 runqueue_t *rq;
6016 int i, j, k; 6023 int i, j, k;
6017 6024
6018 for_each_cpu(i) { 6025 for_each_cpu(i) {
6019 prio_array_t *array; 6026 prio_array_t *array;
6020 6027
6021 rq = cpu_rq(i); 6028 rq = cpu_rq(i);
6022 spin_lock_init(&rq->lock); 6029 spin_lock_init(&rq->lock);
6023 rq->nr_running = 0; 6030 rq->nr_running = 0;
6024 rq->active = rq->arrays; 6031 rq->active = rq->arrays;
6025 rq->expired = rq->arrays + 1; 6032 rq->expired = rq->arrays + 1;
6026 rq->best_expired_prio = MAX_PRIO; 6033 rq->best_expired_prio = MAX_PRIO;
6027 6034
6028 #ifdef CONFIG_SMP 6035 #ifdef CONFIG_SMP
6029 rq->sd = NULL; 6036 rq->sd = NULL;
6030 for (j = 1; j < 3; j++) 6037 for (j = 1; j < 3; j++)
6031 rq->cpu_load[j] = 0; 6038 rq->cpu_load[j] = 0;
6032 rq->active_balance = 0; 6039 rq->active_balance = 0;
6033 rq->push_cpu = 0; 6040 rq->push_cpu = 0;
6034 rq->migration_thread = NULL; 6041 rq->migration_thread = NULL;
6035 INIT_LIST_HEAD(&rq->migration_queue); 6042 INIT_LIST_HEAD(&rq->migration_queue);
6036 rq->cpu = i; 6043 rq->cpu = i;
6037 #endif 6044 #endif
6038 atomic_set(&rq->nr_iowait, 0); 6045 atomic_set(&rq->nr_iowait, 0);
6039 6046
6040 for (j = 0; j < 2; j++) { 6047 for (j = 0; j < 2; j++) {
6041 array = rq->arrays + j; 6048 array = rq->arrays + j;
6042 for (k = 0; k < MAX_PRIO; k++) { 6049 for (k = 0; k < MAX_PRIO; k++) {
6043 INIT_LIST_HEAD(array->queue + k); 6050 INIT_LIST_HEAD(array->queue + k);
6044 __clear_bit(k, array->bitmap); 6051 __clear_bit(k, array->bitmap);
6045 } 6052 }
6046 // delimiter for bitsearch 6053 // delimiter for bitsearch
6047 __set_bit(MAX_PRIO, array->bitmap); 6054 __set_bit(MAX_PRIO, array->bitmap);
6048 } 6055 }
6049 } 6056 }
6050 6057
6051 /* 6058 /*
6052 * The boot idle thread does lazy MMU switching as well: 6059 * The boot idle thread does lazy MMU switching as well:
6053 */ 6060 */
6054 atomic_inc(&init_mm.mm_count); 6061 atomic_inc(&init_mm.mm_count);
6055 enter_lazy_tlb(&init_mm, current); 6062 enter_lazy_tlb(&init_mm, current);
6056 6063
6057 /* 6064 /*
6058 * Make us the idle thread. Technically, schedule() should not be 6065 * Make us the idle thread. Technically, schedule() should not be
6059  * called from this thread; however, somewhere below it might be, 6066  * called from this thread; however, somewhere below it might be,
6060 * but because we are the idle thread, we just pick up running again 6067 * but because we are the idle thread, we just pick up running again
6061 * when this runqueue becomes "idle". 6068 * when this runqueue becomes "idle".
6062 */ 6069 */
6063 init_idle(current, smp_processor_id()); 6070 init_idle(current, smp_processor_id());
6064 } 6071 }
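
Aside: the "delimiter for bitsearch" bit set above is a sentinel at index MAX_PRIO, so a find-first-bit scan over a runqueue's priority bitmap always terminates and an empty queue simply reports MAX_PRIO. A standalone C sketch of that trick, assuming MAX_PRIO is 140 as in the O(1) scheduler and using a naive scan where the kernel uses sched_find_first_bit():

#include <stdio.h>

#define MAX_PRIO 140                    /* assumed: MAX_RT_PRIO + 40 */
#define BITS_PER_LONG (8 * (int)sizeof(unsigned long))
#define BITMAP_LONGS ((MAX_PRIO + 1 + BITS_PER_LONG - 1) / BITS_PER_LONG)

static void set_bit_(int nr, unsigned long *map)
{
        map[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

/* Naive find-first-set over 'size' bits; returns 'size' if none is set. */
static int find_first_bit_(const unsigned long *map, int size)
{
        int i;

        for (i = 0; i < size; i++)
                if (map[i / BITS_PER_LONG] & (1UL << (i % BITS_PER_LONG)))
                        return i;
        return size;
}

int main(void)
{
        unsigned long bitmap[BITMAP_LONGS] = { 0 };

        /* Delimiter for the bit search, exactly as sched_init() sets it:
         * with no runnable tasks the scan stops at MAX_PRIO instead of
         * running off the end of the bitmap. */
        set_bit_(MAX_PRIO, bitmap);
        printf("empty runqueue   -> %d\n", find_first_bit_(bitmap, MAX_PRIO + 1));

        set_bit_(120, bitmap);  /* pretend a priority-120 task was enqueued */
        printf("task at prio 120 -> %d\n", find_first_bit_(bitmap, MAX_PRIO + 1));
        return 0;
}
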
6065 6072
6066 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6073 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
6067 void __might_sleep(char *file, int line) 6074 void __might_sleep(char *file, int line)
6068 { 6075 {
6069 #if defined(in_atomic) 6076 #if defined(in_atomic)
6070 static unsigned long prev_jiffy; /* ratelimiting */ 6077 static unsigned long prev_jiffy; /* ratelimiting */
6071 6078
6072 if ((in_atomic() || irqs_disabled()) && 6079 if ((in_atomic() || irqs_disabled()) &&
6073 system_state == SYSTEM_RUNNING && !oops_in_progress) { 6080 system_state == SYSTEM_RUNNING && !oops_in_progress) {
6074 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 6081 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6075 return; 6082 return;
6076 prev_jiffy = jiffies; 6083 prev_jiffy = jiffies;
6077 printk(KERN_ERR "BUG: sleeping function called from invalid" 6084 printk(KERN_ERR "BUG: sleeping function called from invalid"
6078 " context at %s:%d\n", file, line); 6085 " context at %s:%d\n", file, line);
6079 printk("in_atomic():%d, irqs_disabled():%d\n", 6086 printk("in_atomic():%d, irqs_disabled():%d\n",
6080 in_atomic(), irqs_disabled()); 6087 in_atomic(), irqs_disabled());
6081 dump_stack(); 6088 dump_stack();
6082 } 6089 }
6083 #endif 6090 #endif
6084 } 6091 }
6085 EXPORT_SYMBOL(__might_sleep); 6092 EXPORT_SYMBOL(__might_sleep);
6086 #endif 6093 #endif
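
Aside: __might_sleep() throttles its output with a single static timestamp: if the previous warning was issued less than HZ jiffies ago it returns early, otherwise it records the current time and prints. A userspace sketch of the same one-report-per-second pattern, with time() standing in for jiffies/HZ:

#include <stdio.h>
#include <time.h>

/* Print a warning at most once per second, in the style of the
 * prev_jiffy check above (time() stands in for jiffies and HZ). */
static void warn_ratelimited(const char *msg)
{
        static time_t prev;             /* 0 until the first warning */
        time_t now = time(NULL);

        if (prev && now - prev < 1)
                return;                 /* too soon since the last report */
        prev = now;
        fprintf(stderr, "BUG: %s\n", msg);
}

int main(void)
{
        int i;

        /* Only the first call in each one-second window actually prints. */
        for (i = 0; i < 5; i++)
                warn_ratelimited("sleeping function called from invalid context");
        return 0;
}
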
6087 6094
6088 #ifdef CONFIG_MAGIC_SYSRQ 6095 #ifdef CONFIG_MAGIC_SYSRQ
6089 void normalize_rt_tasks(void) 6096 void normalize_rt_tasks(void)
6090 { 6097 {
6091 struct task_struct *p; 6098 struct task_struct *p;
6092 prio_array_t *array; 6099 prio_array_t *array;
6093 unsigned long flags; 6100 unsigned long flags;
6094 runqueue_t *rq; 6101 runqueue_t *rq;
6095 6102
6096 read_lock_irq(&tasklist_lock); 6103 read_lock_irq(&tasklist_lock);
6097 for_each_process (p) { 6104 for_each_process (p) {
6098 if (!rt_task(p)) 6105 if (!rt_task(p))
6099 continue; 6106 continue;
6100 6107
6101 rq = task_rq_lock(p, &flags); 6108 rq = task_rq_lock(p, &flags);
6102 6109
6103 array = p->array; 6110 array = p->array;
6104 if (array) 6111 if (array)
6105 deactivate_task(p, task_rq(p)); 6112 deactivate_task(p, task_rq(p));
6106 __setscheduler(p, SCHED_NORMAL, 0); 6113 __setscheduler(p, SCHED_NORMAL, 0);
6107 if (array) { 6114 if (array) {
6108 __activate_task(p, task_rq(p)); 6115 __activate_task(p, task_rq(p));
6109 resched_task(rq->curr); 6116 resched_task(rq->curr);
6110 } 6117 }
6111 6118
6112 task_rq_unlock(rq, &flags); 6119 task_rq_unlock(rq, &flags);
6113 } 6120 }
6114 read_unlock_irq(&tasklist_lock); 6121 read_unlock_irq(&tasklist_lock);
6115 } 6122 }
6116 6123
6117 #endif /* CONFIG_MAGIC_SYSRQ */ 6124 #endif /* CONFIG_MAGIC_SYSRQ */
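
Aside: normalize_rt_tasks() deactivates a task before __setscheduler() and reactivates it afterwards because an enqueued task's runqueue position depends on its priority; changing the priority in place would leave it on the wrong list. A standalone C sketch of that dequeue/modify/requeue discipline on a priority-sorted list (hypothetical types, not the kernel's):

#include <stdio.h>

/* Hypothetical task entry kept on a list sorted by ascending priority. */
struct entry {
        int prio;
        struct entry *next;
};

static void insert_sorted(struct entry **head, struct entry *e)
{
        while (*head && (*head)->prio <= e->prio)
                head = &(*head)->next;
        e->next = *head;
        *head = e;
}

static void remove_entry(struct entry **head, struct entry *e)
{
        while (*head && *head != e)
                head = &(*head)->next;
        if (*head)
                *head = e->next;
}

/* Change a priority the way normalize_rt_tasks() does: dequeue first,
 * update the key, then requeue so the list stays correctly ordered. */
static void set_prio(struct entry **head, struct entry *e, int prio)
{
        remove_entry(head, e);
        e->prio = prio;
        insert_sorted(head, e);
}

int main(void)
{
        struct entry a = { .prio = 10 }, b = { .prio = 50 }, *head = NULL, *p;

        insert_sorted(&head, &a);
        insert_sorted(&head, &b);
        set_prio(&head, &b, 0);         /* "normalize" b down to priority 0 */
        for (p = head; p; p = p->next)
                printf("prio %d\n", p->prio);
        return 0;
}
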
6118 6125
6119 #ifdef CONFIG_IA64 6126 #ifdef CONFIG_IA64
6120 /* 6127 /*
6121 * These functions are only useful for the IA64 MCA handling. 6128 * These functions are only useful for the IA64 MCA handling.
6122 * 6129 *
6123 * They can only be called when the whole system has been 6130 * They can only be called when the whole system has been
6124 * stopped - every CPU needs to be quiescent, and no scheduling 6131 * stopped - every CPU needs to be quiescent, and no scheduling
6125 * activity can take place. Using them for anything else would 6132 * activity can take place. Using them for anything else would
6126 * be a serious bug, and as a result, they aren't even visible 6133 * be a serious bug, and as a result, they aren't even visible
6127 * under any other configuration. 6134 * under any other configuration.
6128 */ 6135 */
6129 6136
6130 /** 6137 /**
6131 * curr_task - return the current task for a given cpu. 6138 * curr_task - return the current task for a given cpu.
6132 * @cpu: the processor in question. 6139 * @cpu: the processor in question.
6133 * 6140 *
6134 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6141 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6135 */ 6142 */
6136 task_t *curr_task(int cpu) 6143 task_t *curr_task(int cpu)
6137 { 6144 {
6138 return cpu_curr(cpu); 6145 return cpu_curr(cpu);
6139 } 6146 }
6140 6147
6141 /** 6148 /**
6142 * set_curr_task - set the current task for a given cpu. 6149 * set_curr_task - set the current task for a given cpu.
6143 * @cpu: the processor in question. 6150 * @cpu: the processor in question.
6144 * @p: the task pointer to set. 6151 * @p: the task pointer to set.
6145 * 6152 *
6146 * Description: This function must only be used when non-maskable interrupts 6153 * Description: This function must only be used when non-maskable interrupts
6147 * are serviced on a separate stack. It allows the architecture to switch the 6154 * are serviced on a separate stack. It allows the architecture to switch the
6148 * notion of the current task on a cpu in a non-blocking manner. This function 6155 * notion of the current task on a cpu in a non-blocking manner. This function
6149  * must be called with all CPUs synchronized and interrupts disabled; the 6156  * must be called with all CPUs synchronized and interrupts disabled; the
6150  * caller must save the original value of the current task (see 6157  * caller must save the original value of the current task (see
6151 * curr_task() above) and restore that value before reenabling interrupts and 6158 * curr_task() above) and restore that value before reenabling interrupts and
6152 * re-starting the system. 6159 * re-starting the system.
6153 * 6160 *
6154 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6161 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6155 */ 6162 */
6156 void set_curr_task(int cpu, task_t *p) 6163 void set_curr_task(int cpu, task_t *p)
6157 { 6164 {
6158 cpu_curr(cpu) = p; 6165 cpu_curr(cpu) = p;
6159 } 6166 }
6160 6167
6161 #endif 6168 #endif
6162 6169