Commit c6fd91f0bdcd294a0ae0ba2b2a7f7456ef4b7144
Committed by: Linus Torvalds
1 parent: c9becf58d9
Exists in: master and 7 other branches
[PATCH] kretprobe instance recycled by parent process
When kretprobe probes the schedule() function, if the probed process exits then schedule() will never return, so some kretprobe instances will never be recycled. With this patch the parent process recycles the retprobe instances of the probed function, so kretprobe instances are no longer leaked.

Signed-off-by: bibo mao <bibo.mao@intel.com>
Cc: Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp>
Cc: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
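To make the failure mode concrete, below is a minimal sketch of a module that puts a kretprobe on schedule(). It uses the present-day kretprobe API (kp.symbol_name, register_kretprobe()), which differs in detail from the 2.6.16-era code this patch targets, and the handler body and message text are illustrative only. Each time a probed task enters schedule(), a kretprobe_instance is taken from the probe's free list and is only returned when the return handler fires; a task that exits while inside schedule() strands its instance unless something recycles it on the task's behalf, which is the leak this patch closes.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/kprobes.h>

/* Return handler: runs when the probed function returns.  For a task that
 * exits while inside schedule(), this never runs for its outstanding
 * instance, so the instance is never put back on the free list. */
static int sched_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	pr_info("schedule() returned in %s (pid %d)\n",
		current->comm, current->pid);
	return 0;
}

static struct kretprobe sched_rp = {
	.kp.symbol_name	= "schedule",
	.handler	= sched_ret_handler,
	/* Upper bound on concurrently outstanding instances; stranded
	 * instances eat into this budget and show up as misses. */
	.maxactive	= 32,
};

static int __init sched_rp_init(void)
{
	return register_kretprobe(&sched_rp);
}

static void __exit sched_rp_exit(void)
{
	unregister_kretprobe(&sched_rp);
	pr_info("missed probes: %d\n", sched_rp.nmissed);
}

module_init(sched_rp_init);
module_exit(sched_rp_exit);
MODULE_LICENSE("GPL");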
Showing 6 changed files with 14 additions and 32 deletions (inline diff)
arch/i386/kernel/process.c
1 | /* | 1 | /* |
2 | * linux/arch/i386/kernel/process.c | 2 | * linux/arch/i386/kernel/process.c |
3 | * | 3 | * |
4 | * Copyright (C) 1995 Linus Torvalds | 4 | * Copyright (C) 1995 Linus Torvalds |
5 | * | 5 | * |
6 | * Pentium III FXSR, SSE support | 6 | * Pentium III FXSR, SSE support |
7 | * Gareth Hughes <gareth@valinux.com>, May 2000 | 7 | * Gareth Hughes <gareth@valinux.com>, May 2000 |
8 | */ | 8 | */ |
9 | 9 | ||
10 | /* | 10 | /* |
11 | * This file handles the architecture-dependent parts of process handling.. | 11 | * This file handles the architecture-dependent parts of process handling.. |
12 | */ | 12 | */ |
13 | 13 | ||
14 | #include <stdarg.h> | 14 | #include <stdarg.h> |
15 | 15 | ||
16 | #include <linux/cpu.h> | 16 | #include <linux/cpu.h> |
17 | #include <linux/errno.h> | 17 | #include <linux/errno.h> |
18 | #include <linux/sched.h> | 18 | #include <linux/sched.h> |
19 | #include <linux/fs.h> | 19 | #include <linux/fs.h> |
20 | #include <linux/kernel.h> | 20 | #include <linux/kernel.h> |
21 | #include <linux/mm.h> | 21 | #include <linux/mm.h> |
22 | #include <linux/elfcore.h> | 22 | #include <linux/elfcore.h> |
23 | #include <linux/smp.h> | 23 | #include <linux/smp.h> |
24 | #include <linux/smp_lock.h> | 24 | #include <linux/smp_lock.h> |
25 | #include <linux/stddef.h> | 25 | #include <linux/stddef.h> |
26 | #include <linux/slab.h> | 26 | #include <linux/slab.h> |
27 | #include <linux/vmalloc.h> | 27 | #include <linux/vmalloc.h> |
28 | #include <linux/user.h> | 28 | #include <linux/user.h> |
29 | #include <linux/a.out.h> | 29 | #include <linux/a.out.h> |
30 | #include <linux/interrupt.h> | 30 | #include <linux/interrupt.h> |
31 | #include <linux/config.h> | 31 | #include <linux/config.h> |
32 | #include <linux/utsname.h> | 32 | #include <linux/utsname.h> |
33 | #include <linux/delay.h> | 33 | #include <linux/delay.h> |
34 | #include <linux/reboot.h> | 34 | #include <linux/reboot.h> |
35 | #include <linux/init.h> | 35 | #include <linux/init.h> |
36 | #include <linux/mc146818rtc.h> | 36 | #include <linux/mc146818rtc.h> |
37 | #include <linux/module.h> | 37 | #include <linux/module.h> |
38 | #include <linux/kallsyms.h> | 38 | #include <linux/kallsyms.h> |
39 | #include <linux/ptrace.h> | 39 | #include <linux/ptrace.h> |
40 | #include <linux/random.h> | 40 | #include <linux/random.h> |
41 | #include <linux/kprobes.h> | ||
42 | 41 | ||
43 | #include <asm/uaccess.h> | 42 | #include <asm/uaccess.h> |
44 | #include <asm/pgtable.h> | 43 | #include <asm/pgtable.h> |
45 | #include <asm/system.h> | 44 | #include <asm/system.h> |
46 | #include <asm/io.h> | 45 | #include <asm/io.h> |
47 | #include <asm/ldt.h> | 46 | #include <asm/ldt.h> |
48 | #include <asm/processor.h> | 47 | #include <asm/processor.h> |
49 | #include <asm/i387.h> | 48 | #include <asm/i387.h> |
50 | #include <asm/desc.h> | 49 | #include <asm/desc.h> |
51 | #include <asm/vm86.h> | 50 | #include <asm/vm86.h> |
52 | #ifdef CONFIG_MATH_EMULATION | 51 | #ifdef CONFIG_MATH_EMULATION |
53 | #include <asm/math_emu.h> | 52 | #include <asm/math_emu.h> |
54 | #endif | 53 | #endif |
55 | 54 | ||
56 | #include <linux/err.h> | 55 | #include <linux/err.h> |
57 | 56 | ||
58 | #include <asm/tlbflush.h> | 57 | #include <asm/tlbflush.h> |
59 | #include <asm/cpu.h> | 58 | #include <asm/cpu.h> |
60 | 59 | ||
61 | asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); | 60 | asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); |
62 | 61 | ||
63 | static int hlt_counter; | 62 | static int hlt_counter; |
64 | 63 | ||
65 | unsigned long boot_option_idle_override = 0; | 64 | unsigned long boot_option_idle_override = 0; |
66 | EXPORT_SYMBOL(boot_option_idle_override); | 65 | EXPORT_SYMBOL(boot_option_idle_override); |
67 | 66 | ||
68 | /* | 67 | /* |
69 | * Return saved PC of a blocked thread. | 68 | * Return saved PC of a blocked thread. |
70 | */ | 69 | */ |
71 | unsigned long thread_saved_pc(struct task_struct *tsk) | 70 | unsigned long thread_saved_pc(struct task_struct *tsk) |
72 | { | 71 | { |
73 | return ((unsigned long *)tsk->thread.esp)[3]; | 72 | return ((unsigned long *)tsk->thread.esp)[3]; |
74 | } | 73 | } |
75 | 74 | ||
76 | /* | 75 | /* |
77 | * Powermanagement idle function, if any.. | 76 | * Powermanagement idle function, if any.. |
78 | */ | 77 | */ |
79 | void (*pm_idle)(void); | 78 | void (*pm_idle)(void); |
80 | EXPORT_SYMBOL(pm_idle); | 79 | EXPORT_SYMBOL(pm_idle); |
81 | static DEFINE_PER_CPU(unsigned int, cpu_idle_state); | 80 | static DEFINE_PER_CPU(unsigned int, cpu_idle_state); |
82 | 81 | ||
83 | void disable_hlt(void) | 82 | void disable_hlt(void) |
84 | { | 83 | { |
85 | hlt_counter++; | 84 | hlt_counter++; |
86 | } | 85 | } |
87 | 86 | ||
88 | EXPORT_SYMBOL(disable_hlt); | 87 | EXPORT_SYMBOL(disable_hlt); |
89 | 88 | ||
90 | void enable_hlt(void) | 89 | void enable_hlt(void) |
91 | { | 90 | { |
92 | hlt_counter--; | 91 | hlt_counter--; |
93 | } | 92 | } |
94 | 93 | ||
95 | EXPORT_SYMBOL(enable_hlt); | 94 | EXPORT_SYMBOL(enable_hlt); |
96 | 95 | ||
97 | /* | 96 | /* |
98 | * We use this if we don't have any better | 97 | * We use this if we don't have any better |
99 | * idle routine.. | 98 | * idle routine.. |
100 | */ | 99 | */ |
101 | void default_idle(void) | 100 | void default_idle(void) |
102 | { | 101 | { |
103 | local_irq_enable(); | 102 | local_irq_enable(); |
104 | 103 | ||
105 | if (!hlt_counter && boot_cpu_data.hlt_works_ok) { | 104 | if (!hlt_counter && boot_cpu_data.hlt_works_ok) { |
106 | clear_thread_flag(TIF_POLLING_NRFLAG); | 105 | clear_thread_flag(TIF_POLLING_NRFLAG); |
107 | smp_mb__after_clear_bit(); | 106 | smp_mb__after_clear_bit(); |
108 | while (!need_resched()) { | 107 | while (!need_resched()) { |
109 | local_irq_disable(); | 108 | local_irq_disable(); |
110 | if (!need_resched()) | 109 | if (!need_resched()) |
111 | safe_halt(); | 110 | safe_halt(); |
112 | else | 111 | else |
113 | local_irq_enable(); | 112 | local_irq_enable(); |
114 | } | 113 | } |
115 | set_thread_flag(TIF_POLLING_NRFLAG); | 114 | set_thread_flag(TIF_POLLING_NRFLAG); |
116 | } else { | 115 | } else { |
117 | while (!need_resched()) | 116 | while (!need_resched()) |
118 | cpu_relax(); | 117 | cpu_relax(); |
119 | } | 118 | } |
120 | } | 119 | } |
121 | #ifdef CONFIG_APM_MODULE | 120 | #ifdef CONFIG_APM_MODULE |
122 | EXPORT_SYMBOL(default_idle); | 121 | EXPORT_SYMBOL(default_idle); |
123 | #endif | 122 | #endif |
124 | 123 | ||
125 | /* | 124 | /* |
126 | * On SMP it's slightly faster (but much more power-consuming!) | 125 | * On SMP it's slightly faster (but much more power-consuming!) |
127 | * to poll the ->work.need_resched flag instead of waiting for the | 126 | * to poll the ->work.need_resched flag instead of waiting for the |
128 | * cross-CPU IPI to arrive. Use this option with caution. | 127 | * cross-CPU IPI to arrive. Use this option with caution. |
129 | */ | 128 | */ |
130 | static void poll_idle (void) | 129 | static void poll_idle (void) |
131 | { | 130 | { |
132 | local_irq_enable(); | 131 | local_irq_enable(); |
133 | 132 | ||
134 | asm volatile( | 133 | asm volatile( |
135 | "2:" | 134 | "2:" |
136 | "testl %0, %1;" | 135 | "testl %0, %1;" |
137 | "rep; nop;" | 136 | "rep; nop;" |
138 | "je 2b;" | 137 | "je 2b;" |
139 | : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags)); | 138 | : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags)); |
140 | } | 139 | } |
141 | 140 | ||
142 | #ifdef CONFIG_HOTPLUG_CPU | 141 | #ifdef CONFIG_HOTPLUG_CPU |
143 | #include <asm/nmi.h> | 142 | #include <asm/nmi.h> |
144 | /* We don't actually take CPU down, just spin without interrupts. */ | 143 | /* We don't actually take CPU down, just spin without interrupts. */ |
145 | static inline void play_dead(void) | 144 | static inline void play_dead(void) |
146 | { | 145 | { |
147 | /* This must be done before dead CPU ack */ | 146 | /* This must be done before dead CPU ack */ |
148 | cpu_exit_clear(); | 147 | cpu_exit_clear(); |
149 | wbinvd(); | 148 | wbinvd(); |
150 | mb(); | 149 | mb(); |
151 | /* Ack it */ | 150 | /* Ack it */ |
152 | __get_cpu_var(cpu_state) = CPU_DEAD; | 151 | __get_cpu_var(cpu_state) = CPU_DEAD; |
153 | 152 | ||
154 | /* | 153 | /* |
155 | * With physical CPU hotplug, we should halt the cpu | 154 | * With physical CPU hotplug, we should halt the cpu |
156 | */ | 155 | */ |
157 | local_irq_disable(); | 156 | local_irq_disable(); |
158 | while (1) | 157 | while (1) |
159 | halt(); | 158 | halt(); |
160 | } | 159 | } |
161 | #else | 160 | #else |
162 | static inline void play_dead(void) | 161 | static inline void play_dead(void) |
163 | { | 162 | { |
164 | BUG(); | 163 | BUG(); |
165 | } | 164 | } |
166 | #endif /* CONFIG_HOTPLUG_CPU */ | 165 | #endif /* CONFIG_HOTPLUG_CPU */ |
167 | 166 | ||
168 | /* | 167 | /* |
169 | * The idle thread. There's no useful work to be | 168 | * The idle thread. There's no useful work to be |
170 | * done, so just try to conserve power and have a | 169 | * done, so just try to conserve power and have a |
171 | * low exit latency (ie sit in a loop waiting for | 170 | * low exit latency (ie sit in a loop waiting for |
172 | * somebody to say that they'd like to reschedule) | 171 | * somebody to say that they'd like to reschedule) |
173 | */ | 172 | */ |
174 | void cpu_idle(void) | 173 | void cpu_idle(void) |
175 | { | 174 | { |
176 | int cpu = smp_processor_id(); | 175 | int cpu = smp_processor_id(); |
177 | 176 | ||
178 | set_thread_flag(TIF_POLLING_NRFLAG); | 177 | set_thread_flag(TIF_POLLING_NRFLAG); |
179 | 178 | ||
180 | /* endless idle loop with no priority at all */ | 179 | /* endless idle loop with no priority at all */ |
181 | while (1) { | 180 | while (1) { |
182 | while (!need_resched()) { | 181 | while (!need_resched()) { |
183 | void (*idle)(void); | 182 | void (*idle)(void); |
184 | 183 | ||
185 | if (__get_cpu_var(cpu_idle_state)) | 184 | if (__get_cpu_var(cpu_idle_state)) |
186 | __get_cpu_var(cpu_idle_state) = 0; | 185 | __get_cpu_var(cpu_idle_state) = 0; |
187 | 186 | ||
188 | rmb(); | 187 | rmb(); |
189 | idle = pm_idle; | 188 | idle = pm_idle; |
190 | 189 | ||
191 | if (!idle) | 190 | if (!idle) |
192 | idle = default_idle; | 191 | idle = default_idle; |
193 | 192 | ||
194 | if (cpu_is_offline(cpu)) | 193 | if (cpu_is_offline(cpu)) |
195 | play_dead(); | 194 | play_dead(); |
196 | 195 | ||
197 | __get_cpu_var(irq_stat).idle_timestamp = jiffies; | 196 | __get_cpu_var(irq_stat).idle_timestamp = jiffies; |
198 | idle(); | 197 | idle(); |
199 | } | 198 | } |
200 | preempt_enable_no_resched(); | 199 | preempt_enable_no_resched(); |
201 | schedule(); | 200 | schedule(); |
202 | preempt_disable(); | 201 | preempt_disable(); |
203 | } | 202 | } |
204 | } | 203 | } |
205 | 204 | ||
206 | void cpu_idle_wait(void) | 205 | void cpu_idle_wait(void) |
207 | { | 206 | { |
208 | unsigned int cpu, this_cpu = get_cpu(); | 207 | unsigned int cpu, this_cpu = get_cpu(); |
209 | cpumask_t map; | 208 | cpumask_t map; |
210 | 209 | ||
211 | set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); | 210 | set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); |
212 | put_cpu(); | 211 | put_cpu(); |
213 | 212 | ||
214 | cpus_clear(map); | 213 | cpus_clear(map); |
215 | for_each_online_cpu(cpu) { | 214 | for_each_online_cpu(cpu) { |
216 | per_cpu(cpu_idle_state, cpu) = 1; | 215 | per_cpu(cpu_idle_state, cpu) = 1; |
217 | cpu_set(cpu, map); | 216 | cpu_set(cpu, map); |
218 | } | 217 | } |
219 | 218 | ||
220 | __get_cpu_var(cpu_idle_state) = 0; | 219 | __get_cpu_var(cpu_idle_state) = 0; |
221 | 220 | ||
222 | wmb(); | 221 | wmb(); |
223 | do { | 222 | do { |
224 | ssleep(1); | 223 | ssleep(1); |
225 | for_each_online_cpu(cpu) { | 224 | for_each_online_cpu(cpu) { |
226 | if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) | 225 | if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) |
227 | cpu_clear(cpu, map); | 226 | cpu_clear(cpu, map); |
228 | } | 227 | } |
229 | cpus_and(map, map, cpu_online_map); | 228 | cpus_and(map, map, cpu_online_map); |
230 | } while (!cpus_empty(map)); | 229 | } while (!cpus_empty(map)); |
231 | } | 230 | } |
232 | EXPORT_SYMBOL_GPL(cpu_idle_wait); | 231 | EXPORT_SYMBOL_GPL(cpu_idle_wait); |
233 | 232 | ||
234 | /* | 233 | /* |
235 | * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, | 234 | * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, |
236 | * which can obviate IPI to trigger checking of need_resched. | 235 | * which can obviate IPI to trigger checking of need_resched. |
237 | * We execute MONITOR against need_resched and enter optimized wait state | 236 | * We execute MONITOR against need_resched and enter optimized wait state |
238 | * through MWAIT. Whenever someone changes need_resched, we would be woken | 237 | * through MWAIT. Whenever someone changes need_resched, we would be woken |
239 | * up from MWAIT (without an IPI). | 238 | * up from MWAIT (without an IPI). |
240 | */ | 239 | */ |
241 | static void mwait_idle(void) | 240 | static void mwait_idle(void) |
242 | { | 241 | { |
243 | local_irq_enable(); | 242 | local_irq_enable(); |
244 | 243 | ||
245 | while (!need_resched()) { | 244 | while (!need_resched()) { |
246 | __monitor((void *)&current_thread_info()->flags, 0, 0); | 245 | __monitor((void *)&current_thread_info()->flags, 0, 0); |
247 | smp_mb(); | 246 | smp_mb(); |
248 | if (need_resched()) | 247 | if (need_resched()) |
249 | break; | 248 | break; |
250 | __mwait(0, 0); | 249 | __mwait(0, 0); |
251 | } | 250 | } |
252 | } | 251 | } |
253 | 252 | ||
254 | void __devinit select_idle_routine(const struct cpuinfo_x86 *c) | 253 | void __devinit select_idle_routine(const struct cpuinfo_x86 *c) |
255 | { | 254 | { |
256 | if (cpu_has(c, X86_FEATURE_MWAIT)) { | 255 | if (cpu_has(c, X86_FEATURE_MWAIT)) { |
257 | printk("monitor/mwait feature present.\n"); | 256 | printk("monitor/mwait feature present.\n"); |
258 | /* | 257 | /* |
259 | * Skip, if setup has overridden idle. | 258 | * Skip, if setup has overridden idle. |
260 | * One CPU supports mwait => All CPUs supports mwait | 259 | * One CPU supports mwait => All CPUs supports mwait |
261 | */ | 260 | */ |
262 | if (!pm_idle) { | 261 | if (!pm_idle) { |
263 | printk("using mwait in idle threads.\n"); | 262 | printk("using mwait in idle threads.\n"); |
264 | pm_idle = mwait_idle; | 263 | pm_idle = mwait_idle; |
265 | } | 264 | } |
266 | } | 265 | } |
267 | } | 266 | } |
268 | 267 | ||
269 | static int __init idle_setup (char *str) | 268 | static int __init idle_setup (char *str) |
270 | { | 269 | { |
271 | if (!strncmp(str, "poll", 4)) { | 270 | if (!strncmp(str, "poll", 4)) { |
272 | printk("using polling idle threads.\n"); | 271 | printk("using polling idle threads.\n"); |
273 | pm_idle = poll_idle; | 272 | pm_idle = poll_idle; |
274 | #ifdef CONFIG_X86_SMP | 273 | #ifdef CONFIG_X86_SMP |
275 | if (smp_num_siblings > 1) | 274 | if (smp_num_siblings > 1) |
276 | printk("WARNING: polling idle and HT enabled, performance may degrade.\n"); | 275 | printk("WARNING: polling idle and HT enabled, performance may degrade.\n"); |
277 | #endif | 276 | #endif |
278 | } else if (!strncmp(str, "halt", 4)) { | 277 | } else if (!strncmp(str, "halt", 4)) { |
279 | printk("using halt in idle threads.\n"); | 278 | printk("using halt in idle threads.\n"); |
280 | pm_idle = default_idle; | 279 | pm_idle = default_idle; |
281 | } | 280 | } |
282 | 281 | ||
283 | boot_option_idle_override = 1; | 282 | boot_option_idle_override = 1; |
284 | return 1; | 283 | return 1; |
285 | } | 284 | } |
286 | 285 | ||
287 | __setup("idle=", idle_setup); | 286 | __setup("idle=", idle_setup); |
288 | 287 | ||
289 | void show_regs(struct pt_regs * regs) | 288 | void show_regs(struct pt_regs * regs) |
290 | { | 289 | { |
291 | unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; | 290 | unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; |
292 | 291 | ||
293 | printk("\n"); | 292 | printk("\n"); |
294 | printk("Pid: %d, comm: %20s\n", current->pid, current->comm); | 293 | printk("Pid: %d, comm: %20s\n", current->pid, current->comm); |
295 | printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id()); | 294 | printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id()); |
296 | print_symbol("EIP is at %s\n", regs->eip); | 295 | print_symbol("EIP is at %s\n", regs->eip); |
297 | 296 | ||
298 | if (user_mode_vm(regs)) | 297 | if (user_mode_vm(regs)) |
299 | printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); | 298 | printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); |
300 | printk(" EFLAGS: %08lx %s (%s %.*s)\n", | 299 | printk(" EFLAGS: %08lx %s (%s %.*s)\n", |
301 | regs->eflags, print_tainted(), system_utsname.release, | 300 | regs->eflags, print_tainted(), system_utsname.release, |
302 | (int)strcspn(system_utsname.version, " "), | 301 | (int)strcspn(system_utsname.version, " "), |
303 | system_utsname.version); | 302 | system_utsname.version); |
304 | printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", | 303 | printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", |
305 | regs->eax,regs->ebx,regs->ecx,regs->edx); | 304 | regs->eax,regs->ebx,regs->ecx,regs->edx); |
306 | printk("ESI: %08lx EDI: %08lx EBP: %08lx", | 305 | printk("ESI: %08lx EDI: %08lx EBP: %08lx", |
307 | regs->esi, regs->edi, regs->ebp); | 306 | regs->esi, regs->edi, regs->ebp); |
308 | printk(" DS: %04x ES: %04x\n", | 307 | printk(" DS: %04x ES: %04x\n", |
309 | 0xffff & regs->xds,0xffff & regs->xes); | 308 | 0xffff & regs->xds,0xffff & regs->xes); |
310 | 309 | ||
311 | cr0 = read_cr0(); | 310 | cr0 = read_cr0(); |
312 | cr2 = read_cr2(); | 311 | cr2 = read_cr2(); |
313 | cr3 = read_cr3(); | 312 | cr3 = read_cr3(); |
314 | cr4 = read_cr4_safe(); | 313 | cr4 = read_cr4_safe(); |
315 | printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); | 314 | printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); |
316 | show_trace(NULL, &regs->esp); | 315 | show_trace(NULL, &regs->esp); |
317 | } | 316 | } |
318 | 317 | ||
319 | /* | 318 | /* |
320 | * This gets run with %ebx containing the | 319 | * This gets run with %ebx containing the |
321 | * function to call, and %edx containing | 320 | * function to call, and %edx containing |
322 | * the "args". | 321 | * the "args". |
323 | */ | 322 | */ |
324 | extern void kernel_thread_helper(void); | 323 | extern void kernel_thread_helper(void); |
325 | __asm__(".section .text\n" | 324 | __asm__(".section .text\n" |
326 | ".align 4\n" | 325 | ".align 4\n" |
327 | "kernel_thread_helper:\n\t" | 326 | "kernel_thread_helper:\n\t" |
328 | "movl %edx,%eax\n\t" | 327 | "movl %edx,%eax\n\t" |
329 | "pushl %edx\n\t" | 328 | "pushl %edx\n\t" |
330 | "call *%ebx\n\t" | 329 | "call *%ebx\n\t" |
331 | "pushl %eax\n\t" | 330 | "pushl %eax\n\t" |
332 | "call do_exit\n" | 331 | "call do_exit\n" |
333 | ".previous"); | 332 | ".previous"); |
334 | 333 | ||
335 | /* | 334 | /* |
336 | * Create a kernel thread | 335 | * Create a kernel thread |
337 | */ | 336 | */ |
338 | int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) | 337 | int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) |
339 | { | 338 | { |
340 | struct pt_regs regs; | 339 | struct pt_regs regs; |
341 | 340 | ||
342 | memset(&regs, 0, sizeof(regs)); | 341 | memset(&regs, 0, sizeof(regs)); |
343 | 342 | ||
344 | regs.ebx = (unsigned long) fn; | 343 | regs.ebx = (unsigned long) fn; |
345 | regs.edx = (unsigned long) arg; | 344 | regs.edx = (unsigned long) arg; |
346 | 345 | ||
347 | regs.xds = __USER_DS; | 346 | regs.xds = __USER_DS; |
348 | regs.xes = __USER_DS; | 347 | regs.xes = __USER_DS; |
349 | regs.orig_eax = -1; | 348 | regs.orig_eax = -1; |
350 | regs.eip = (unsigned long) kernel_thread_helper; | 349 | regs.eip = (unsigned long) kernel_thread_helper; |
351 | regs.xcs = __KERNEL_CS; | 350 | regs.xcs = __KERNEL_CS; |
352 | regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; | 351 | regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; |
353 | 352 | ||
354 | /* Ok, create the new process.. */ | 353 | /* Ok, create the new process.. */ |
355 | return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL); | 354 | return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL); |
356 | } | 355 | } |
357 | EXPORT_SYMBOL(kernel_thread); | 356 | EXPORT_SYMBOL(kernel_thread); |
358 | 357 | ||
359 | /* | 358 | /* |
360 | * Free current thread data structures etc.. | 359 | * Free current thread data structures etc.. |
361 | */ | 360 | */ |
362 | void exit_thread(void) | 361 | void exit_thread(void) |
363 | { | 362 | { |
364 | struct task_struct *tsk = current; | 363 | struct task_struct *tsk = current; |
365 | struct thread_struct *t = &tsk->thread; | 364 | struct thread_struct *t = &tsk->thread; |
366 | |||
367 | /* | ||
368 | * Remove function-return probe instances associated with this task | ||
369 | * and put them back on the free list. Do not insert an exit probe for | ||
370 | * this function, it will be disabled by kprobe_flush_task if you do. | ||
371 | */ | ||
372 | kprobe_flush_task(tsk); | ||
373 | 365 | ||
374 | /* The process may have allocated an io port bitmap... nuke it. */ | 366 | /* The process may have allocated an io port bitmap... nuke it. */ |
375 | if (unlikely(NULL != t->io_bitmap_ptr)) { | 367 | if (unlikely(NULL != t->io_bitmap_ptr)) { |
376 | int cpu = get_cpu(); | 368 | int cpu = get_cpu(); |
377 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | 369 | struct tss_struct *tss = &per_cpu(init_tss, cpu); |
378 | 370 | ||
379 | kfree(t->io_bitmap_ptr); | 371 | kfree(t->io_bitmap_ptr); |
380 | t->io_bitmap_ptr = NULL; | 372 | t->io_bitmap_ptr = NULL; |
381 | /* | 373 | /* |
382 | * Careful, clear this in the TSS too: | 374 | * Careful, clear this in the TSS too: |
383 | */ | 375 | */ |
384 | memset(tss->io_bitmap, 0xff, tss->io_bitmap_max); | 376 | memset(tss->io_bitmap, 0xff, tss->io_bitmap_max); |
385 | t->io_bitmap_max = 0; | 377 | t->io_bitmap_max = 0; |
386 | tss->io_bitmap_owner = NULL; | 378 | tss->io_bitmap_owner = NULL; |
387 | tss->io_bitmap_max = 0; | 379 | tss->io_bitmap_max = 0; |
388 | tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; | 380 | tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; |
389 | put_cpu(); | 381 | put_cpu(); |
390 | } | 382 | } |
391 | } | 383 | } |
392 | 384 | ||
393 | void flush_thread(void) | 385 | void flush_thread(void) |
394 | { | 386 | { |
395 | struct task_struct *tsk = current; | 387 | struct task_struct *tsk = current; |
396 | 388 | ||
397 | memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); | 389 | memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); |
398 | memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); | 390 | memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); |
399 | /* | 391 | /* |
400 | * Forget coprocessor state.. | 392 | * Forget coprocessor state.. |
401 | */ | 393 | */ |
402 | clear_fpu(tsk); | 394 | clear_fpu(tsk); |
403 | clear_used_math(); | 395 | clear_used_math(); |
404 | } | 396 | } |
405 | 397 | ||
406 | void release_thread(struct task_struct *dead_task) | 398 | void release_thread(struct task_struct *dead_task) |
407 | { | 399 | { |
408 | BUG_ON(dead_task->mm); | 400 | BUG_ON(dead_task->mm); |
409 | release_vm86_irqs(dead_task); | 401 | release_vm86_irqs(dead_task); |
410 | } | 402 | } |
411 | 403 | ||
412 | /* | 404 | /* |
413 | * This gets called before we allocate a new thread and copy | 405 | * This gets called before we allocate a new thread and copy |
414 | * the current task into it. | 406 | * the current task into it. |
415 | */ | 407 | */ |
416 | void prepare_to_copy(struct task_struct *tsk) | 408 | void prepare_to_copy(struct task_struct *tsk) |
417 | { | 409 | { |
418 | unlazy_fpu(tsk); | 410 | unlazy_fpu(tsk); |
419 | } | 411 | } |
420 | 412 | ||
421 | int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, | 413 | int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, |
422 | unsigned long unused, | 414 | unsigned long unused, |
423 | struct task_struct * p, struct pt_regs * regs) | 415 | struct task_struct * p, struct pt_regs * regs) |
424 | { | 416 | { |
425 | struct pt_regs * childregs; | 417 | struct pt_regs * childregs; |
426 | struct task_struct *tsk; | 418 | struct task_struct *tsk; |
427 | int err; | 419 | int err; |
428 | 420 | ||
429 | childregs = task_pt_regs(p); | 421 | childregs = task_pt_regs(p); |
430 | *childregs = *regs; | 422 | *childregs = *regs; |
431 | childregs->eax = 0; | 423 | childregs->eax = 0; |
432 | childregs->esp = esp; | 424 | childregs->esp = esp; |
433 | 425 | ||
434 | p->thread.esp = (unsigned long) childregs; | 426 | p->thread.esp = (unsigned long) childregs; |
435 | p->thread.esp0 = (unsigned long) (childregs+1); | 427 | p->thread.esp0 = (unsigned long) (childregs+1); |
436 | 428 | ||
437 | p->thread.eip = (unsigned long) ret_from_fork; | 429 | p->thread.eip = (unsigned long) ret_from_fork; |
438 | 430 | ||
439 | savesegment(fs,p->thread.fs); | 431 | savesegment(fs,p->thread.fs); |
440 | savesegment(gs,p->thread.gs); | 432 | savesegment(gs,p->thread.gs); |
441 | 433 | ||
442 | tsk = current; | 434 | tsk = current; |
443 | if (unlikely(NULL != tsk->thread.io_bitmap_ptr)) { | 435 | if (unlikely(NULL != tsk->thread.io_bitmap_ptr)) { |
444 | p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); | 436 | p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); |
445 | if (!p->thread.io_bitmap_ptr) { | 437 | if (!p->thread.io_bitmap_ptr) { |
446 | p->thread.io_bitmap_max = 0; | 438 | p->thread.io_bitmap_max = 0; |
447 | return -ENOMEM; | 439 | return -ENOMEM; |
448 | } | 440 | } |
449 | memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr, | 441 | memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr, |
450 | IO_BITMAP_BYTES); | 442 | IO_BITMAP_BYTES); |
451 | } | 443 | } |
452 | 444 | ||
453 | /* | 445 | /* |
454 | * Set a new TLS for the child thread? | 446 | * Set a new TLS for the child thread? |
455 | */ | 447 | */ |
456 | if (clone_flags & CLONE_SETTLS) { | 448 | if (clone_flags & CLONE_SETTLS) { |
457 | struct desc_struct *desc; | 449 | struct desc_struct *desc; |
458 | struct user_desc info; | 450 | struct user_desc info; |
459 | int idx; | 451 | int idx; |
460 | 452 | ||
461 | err = -EFAULT; | 453 | err = -EFAULT; |
462 | if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info))) | 454 | if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info))) |
463 | goto out; | 455 | goto out; |
464 | err = -EINVAL; | 456 | err = -EINVAL; |
465 | if (LDT_empty(&info)) | 457 | if (LDT_empty(&info)) |
466 | goto out; | 458 | goto out; |
467 | 459 | ||
468 | idx = info.entry_number; | 460 | idx = info.entry_number; |
469 | if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) | 461 | if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) |
470 | goto out; | 462 | goto out; |
471 | 463 | ||
472 | desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; | 464 | desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; |
473 | desc->a = LDT_entry_a(&info); | 465 | desc->a = LDT_entry_a(&info); |
474 | desc->b = LDT_entry_b(&info); | 466 | desc->b = LDT_entry_b(&info); |
475 | } | 467 | } |
476 | 468 | ||
477 | err = 0; | 469 | err = 0; |
478 | out: | 470 | out: |
479 | if (err && p->thread.io_bitmap_ptr) { | 471 | if (err && p->thread.io_bitmap_ptr) { |
480 | kfree(p->thread.io_bitmap_ptr); | 472 | kfree(p->thread.io_bitmap_ptr); |
481 | p->thread.io_bitmap_max = 0; | 473 | p->thread.io_bitmap_max = 0; |
482 | } | 474 | } |
483 | return err; | 475 | return err; |
484 | } | 476 | } |
485 | 477 | ||
486 | /* | 478 | /* |
487 | * fill in the user structure for a core dump.. | 479 | * fill in the user structure for a core dump.. |
488 | */ | 480 | */ |
489 | void dump_thread(struct pt_regs * regs, struct user * dump) | 481 | void dump_thread(struct pt_regs * regs, struct user * dump) |
490 | { | 482 | { |
491 | int i; | 483 | int i; |
492 | 484 | ||
493 | /* changed the size calculations - should hopefully work better. lbt */ | 485 | /* changed the size calculations - should hopefully work better. lbt */ |
494 | dump->magic = CMAGIC; | 486 | dump->magic = CMAGIC; |
495 | dump->start_code = 0; | 487 | dump->start_code = 0; |
496 | dump->start_stack = regs->esp & ~(PAGE_SIZE - 1); | 488 | dump->start_stack = regs->esp & ~(PAGE_SIZE - 1); |
497 | dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; | 489 | dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; |
498 | dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; | 490 | dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; |
499 | dump->u_dsize -= dump->u_tsize; | 491 | dump->u_dsize -= dump->u_tsize; |
500 | dump->u_ssize = 0; | 492 | dump->u_ssize = 0; |
501 | for (i = 0; i < 8; i++) | 493 | for (i = 0; i < 8; i++) |
502 | dump->u_debugreg[i] = current->thread.debugreg[i]; | 494 | dump->u_debugreg[i] = current->thread.debugreg[i]; |
503 | 495 | ||
504 | if (dump->start_stack < TASK_SIZE) | 496 | if (dump->start_stack < TASK_SIZE) |
505 | dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; | 497 | dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; |
506 | 498 | ||
507 | dump->regs.ebx = regs->ebx; | 499 | dump->regs.ebx = regs->ebx; |
508 | dump->regs.ecx = regs->ecx; | 500 | dump->regs.ecx = regs->ecx; |
509 | dump->regs.edx = regs->edx; | 501 | dump->regs.edx = regs->edx; |
510 | dump->regs.esi = regs->esi; | 502 | dump->regs.esi = regs->esi; |
511 | dump->regs.edi = regs->edi; | 503 | dump->regs.edi = regs->edi; |
512 | dump->regs.ebp = regs->ebp; | 504 | dump->regs.ebp = regs->ebp; |
513 | dump->regs.eax = regs->eax; | 505 | dump->regs.eax = regs->eax; |
514 | dump->regs.ds = regs->xds; | 506 | dump->regs.ds = regs->xds; |
515 | dump->regs.es = regs->xes; | 507 | dump->regs.es = regs->xes; |
516 | savesegment(fs,dump->regs.fs); | 508 | savesegment(fs,dump->regs.fs); |
517 | savesegment(gs,dump->regs.gs); | 509 | savesegment(gs,dump->regs.gs); |
518 | dump->regs.orig_eax = regs->orig_eax; | 510 | dump->regs.orig_eax = regs->orig_eax; |
519 | dump->regs.eip = regs->eip; | 511 | dump->regs.eip = regs->eip; |
520 | dump->regs.cs = regs->xcs; | 512 | dump->regs.cs = regs->xcs; |
521 | dump->regs.eflags = regs->eflags; | 513 | dump->regs.eflags = regs->eflags; |
522 | dump->regs.esp = regs->esp; | 514 | dump->regs.esp = regs->esp; |
523 | dump->regs.ss = regs->xss; | 515 | dump->regs.ss = regs->xss; |
524 | 516 | ||
525 | dump->u_fpvalid = dump_fpu (regs, &dump->i387); | 517 | dump->u_fpvalid = dump_fpu (regs, &dump->i387); |
526 | } | 518 | } |
527 | EXPORT_SYMBOL(dump_thread); | 519 | EXPORT_SYMBOL(dump_thread); |
528 | 520 | ||
529 | /* | 521 | /* |
530 | * Capture the user space registers if the task is not running (in user space) | 522 | * Capture the user space registers if the task is not running (in user space) |
531 | */ | 523 | */ |
532 | int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) | 524 | int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) |
533 | { | 525 | { |
534 | struct pt_regs ptregs = *task_pt_regs(tsk); | 526 | struct pt_regs ptregs = *task_pt_regs(tsk); |
535 | ptregs.xcs &= 0xffff; | 527 | ptregs.xcs &= 0xffff; |
536 | ptregs.xds &= 0xffff; | 528 | ptregs.xds &= 0xffff; |
537 | ptregs.xes &= 0xffff; | 529 | ptregs.xes &= 0xffff; |
538 | ptregs.xss &= 0xffff; | 530 | ptregs.xss &= 0xffff; |
539 | 531 | ||
540 | elf_core_copy_regs(regs, &ptregs); | 532 | elf_core_copy_regs(regs, &ptregs); |
541 | 533 | ||
542 | return 1; | 534 | return 1; |
543 | } | 535 | } |
544 | 536 | ||
545 | static inline void | 537 | static inline void |
546 | handle_io_bitmap(struct thread_struct *next, struct tss_struct *tss) | 538 | handle_io_bitmap(struct thread_struct *next, struct tss_struct *tss) |
547 | { | 539 | { |
548 | if (!next->io_bitmap_ptr) { | 540 | if (!next->io_bitmap_ptr) { |
549 | /* | 541 | /* |
550 | * Disable the bitmap via an invalid offset. We still cache | 542 | * Disable the bitmap via an invalid offset. We still cache |
551 | * the previous bitmap owner and the IO bitmap contents: | 543 | * the previous bitmap owner and the IO bitmap contents: |
552 | */ | 544 | */ |
553 | tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; | 545 | tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; |
554 | return; | 546 | return; |
555 | } | 547 | } |
556 | if (likely(next == tss->io_bitmap_owner)) { | 548 | if (likely(next == tss->io_bitmap_owner)) { |
557 | /* | 549 | /* |
558 | * Previous owner of the bitmap (hence the bitmap content) | 550 | * Previous owner of the bitmap (hence the bitmap content) |
559 | * matches the next task, we dont have to do anything but | 551 | * matches the next task, we dont have to do anything but |
560 | * to set a valid offset in the TSS: | 552 | * to set a valid offset in the TSS: |
561 | */ | 553 | */ |
562 | tss->io_bitmap_base = IO_BITMAP_OFFSET; | 554 | tss->io_bitmap_base = IO_BITMAP_OFFSET; |
563 | return; | 555 | return; |
564 | } | 556 | } |
565 | /* | 557 | /* |
566 | * Lazy TSS's I/O bitmap copy. We set an invalid offset here | 558 | * Lazy TSS's I/O bitmap copy. We set an invalid offset here |
567 | * and we let the task to get a GPF in case an I/O instruction | 559 | * and we let the task to get a GPF in case an I/O instruction |
568 | * is performed. The handler of the GPF will verify that the | 560 | * is performed. The handler of the GPF will verify that the |
569 | * faulting task has a valid I/O bitmap and, it true, does the | 561 | * faulting task has a valid I/O bitmap and, it true, does the |
570 | * real copy and restart the instruction. This will save us | 562 | * real copy and restart the instruction. This will save us |
571 | * redundant copies when the currently switched task does not | 563 | * redundant copies when the currently switched task does not |
572 | * perform any I/O during its timeslice. | 564 | * perform any I/O during its timeslice. |
573 | */ | 565 | */ |
574 | tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; | 566 | tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; |
575 | } | 567 | } |
576 | 568 | ||
577 | /* | 569 | /* |
578 | * This function selects if the context switch from prev to next | 570 | * This function selects if the context switch from prev to next |
579 | * has to tweak the TSC disable bit in the cr4. | 571 | * has to tweak the TSC disable bit in the cr4. |
580 | */ | 572 | */ |
581 | static inline void disable_tsc(struct task_struct *prev_p, | 573 | static inline void disable_tsc(struct task_struct *prev_p, |
582 | struct task_struct *next_p) | 574 | struct task_struct *next_p) |
583 | { | 575 | { |
584 | struct thread_info *prev, *next; | 576 | struct thread_info *prev, *next; |
585 | 577 | ||
586 | /* | 578 | /* |
587 | * gcc should eliminate the ->thread_info dereference if | 579 | * gcc should eliminate the ->thread_info dereference if |
588 | * has_secure_computing returns 0 at compile time (SECCOMP=n). | 580 | * has_secure_computing returns 0 at compile time (SECCOMP=n). |
589 | */ | 581 | */ |
590 | prev = task_thread_info(prev_p); | 582 | prev = task_thread_info(prev_p); |
591 | next = task_thread_info(next_p); | 583 | next = task_thread_info(next_p); |
592 | 584 | ||
593 | if (has_secure_computing(prev) || has_secure_computing(next)) { | 585 | if (has_secure_computing(prev) || has_secure_computing(next)) { |
594 | /* slow path here */ | 586 | /* slow path here */ |
595 | if (has_secure_computing(prev) && | 587 | if (has_secure_computing(prev) && |
596 | !has_secure_computing(next)) { | 588 | !has_secure_computing(next)) { |
597 | write_cr4(read_cr4() & ~X86_CR4_TSD); | 589 | write_cr4(read_cr4() & ~X86_CR4_TSD); |
598 | } else if (!has_secure_computing(prev) && | 590 | } else if (!has_secure_computing(prev) && |
599 | has_secure_computing(next)) | 591 | has_secure_computing(next)) |
600 | write_cr4(read_cr4() | X86_CR4_TSD); | 592 | write_cr4(read_cr4() | X86_CR4_TSD); |
601 | } | 593 | } |
602 | } | 594 | } |
603 | 595 | ||
604 | /* | 596 | /* |
605 | * switch_to(x,yn) should switch tasks from x to y. | 597 | * switch_to(x,yn) should switch tasks from x to y. |
606 | * | 598 | * |
607 | * We fsave/fwait so that an exception goes off at the right time | 599 | * We fsave/fwait so that an exception goes off at the right time |
608 | * (as a call from the fsave or fwait in effect) rather than to | 600 | * (as a call from the fsave or fwait in effect) rather than to |
609 | * the wrong process. Lazy FP saving no longer makes any sense | 601 | * the wrong process. Lazy FP saving no longer makes any sense |
610 | * with modern CPU's, and this simplifies a lot of things (SMP | 602 | * with modern CPU's, and this simplifies a lot of things (SMP |
611 | * and UP become the same). | 603 | * and UP become the same). |
612 | * | 604 | * |
613 | * NOTE! We used to use the x86 hardware context switching. The | 605 | * NOTE! We used to use the x86 hardware context switching. The |
614 | * reason for not using it any more becomes apparent when you | 606 | * reason for not using it any more becomes apparent when you |
615 | * try to recover gracefully from saved state that is no longer | 607 | * try to recover gracefully from saved state that is no longer |
616 | * valid (stale segment register values in particular). With the | 608 | * valid (stale segment register values in particular). With the |
617 | * hardware task-switch, there is no way to fix up bad state in | 609 | * hardware task-switch, there is no way to fix up bad state in |
618 | * a reasonable manner. | 610 | * a reasonable manner. |
619 | * | 611 | * |
620 | * The fact that Intel documents the hardware task-switching to | 612 | * The fact that Intel documents the hardware task-switching to |
621 | * be slow is a fairly red herring - this code is not noticeably | 613 | * be slow is a fairly red herring - this code is not noticeably |
622 | * faster. However, there _is_ some room for improvement here, | 614 | * faster. However, there _is_ some room for improvement here, |
623 | * so the performance issues may eventually be a valid point. | 615 | * so the performance issues may eventually be a valid point. |
624 | * More important, however, is the fact that this allows us much | 616 | * More important, however, is the fact that this allows us much |
625 | * more flexibility. | 617 | * more flexibility. |
626 | * | 618 | * |
627 | * The return value (in %eax) will be the "prev" task after | 619 | * The return value (in %eax) will be the "prev" task after |
628 | * the task-switch, and shows up in ret_from_fork in entry.S, | 620 | * the task-switch, and shows up in ret_from_fork in entry.S, |
629 | * for example. | 621 | * for example. |
630 | */ | 622 | */ |
631 | struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | 623 | struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) |
632 | { | 624 | { |
633 | struct thread_struct *prev = &prev_p->thread, | 625 | struct thread_struct *prev = &prev_p->thread, |
634 | *next = &next_p->thread; | 626 | *next = &next_p->thread; |
635 | int cpu = smp_processor_id(); | 627 | int cpu = smp_processor_id(); |
636 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | 628 | struct tss_struct *tss = &per_cpu(init_tss, cpu); |
637 | 629 | ||
638 | /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ | 630 | /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ |
639 | 631 | ||
640 | __unlazy_fpu(prev_p); | 632 | __unlazy_fpu(prev_p); |
641 | 633 | ||
642 | /* | 634 | /* |
643 | * Reload esp0. | 635 | * Reload esp0. |
644 | */ | 636 | */ |
645 | load_esp0(tss, next); | 637 | load_esp0(tss, next); |
646 | 638 | ||
647 | /* | 639 | /* |
648 | * Save away %fs and %gs. No need to save %es and %ds, as | 640 | * Save away %fs and %gs. No need to save %es and %ds, as |
649 | * those are always kernel segments while inside the kernel. | 641 | * those are always kernel segments while inside the kernel. |
650 | * Doing this before setting the new TLS descriptors avoids | 642 | * Doing this before setting the new TLS descriptors avoids |
651 | * the situation where we temporarily have non-reloadable | 643 | * the situation where we temporarily have non-reloadable |
652 | * segments in %fs and %gs. This could be an issue if the | 644 | * segments in %fs and %gs. This could be an issue if the |
653 | * NMI handler ever used %fs or %gs (it does not today), or | 645 | * NMI handler ever used %fs or %gs (it does not today), or |
654 | * if the kernel is running inside of a hypervisor layer. | 646 | * if the kernel is running inside of a hypervisor layer. |
655 | */ | 647 | */ |
656 | savesegment(fs, prev->fs); | 648 | savesegment(fs, prev->fs); |
657 | savesegment(gs, prev->gs); | 649 | savesegment(gs, prev->gs); |
658 | 650 | ||
659 | /* | 651 | /* |
660 | * Load the per-thread Thread-Local Storage descriptor. | 652 | * Load the per-thread Thread-Local Storage descriptor. |
661 | */ | 653 | */ |
662 | load_TLS(next, cpu); | 654 | load_TLS(next, cpu); |
663 | 655 | ||
664 | /* | 656 | /* |
665 | * Restore %fs and %gs if needed. | 657 | * Restore %fs and %gs if needed. |
666 | * | 658 | * |
667 | * Glibc normally makes %fs be zero, and %gs is one of | 659 | * Glibc normally makes %fs be zero, and %gs is one of |
668 | * the TLS segments. | 660 | * the TLS segments. |
669 | */ | 661 | */ |
670 | if (unlikely(prev->fs | next->fs)) | 662 | if (unlikely(prev->fs | next->fs)) |
671 | loadsegment(fs, next->fs); | 663 | loadsegment(fs, next->fs); |
672 | 664 | ||
673 | if (prev->gs | next->gs) | 665 | if (prev->gs | next->gs) |
674 | loadsegment(gs, next->gs); | 666 | loadsegment(gs, next->gs); |
675 | 667 | ||
676 | /* | 668 | /* |
677 | * Restore IOPL if needed. | 669 | * Restore IOPL if needed. |
678 | */ | 670 | */ |
679 | if (unlikely(prev->iopl != next->iopl)) | 671 | if (unlikely(prev->iopl != next->iopl)) |
680 | set_iopl_mask(next->iopl); | 672 | set_iopl_mask(next->iopl); |
681 | 673 | ||
682 | /* | 674 | /* |
683 | * Now maybe reload the debug registers | 675 | * Now maybe reload the debug registers |
684 | */ | 676 | */ |
685 | if (unlikely(next->debugreg[7])) { | 677 | if (unlikely(next->debugreg[7])) { |
686 | set_debugreg(next->debugreg[0], 0); | 678 | set_debugreg(next->debugreg[0], 0); |
687 | set_debugreg(next->debugreg[1], 1); | 679 | set_debugreg(next->debugreg[1], 1); |
688 | set_debugreg(next->debugreg[2], 2); | 680 | set_debugreg(next->debugreg[2], 2); |
689 | set_debugreg(next->debugreg[3], 3); | 681 | set_debugreg(next->debugreg[3], 3); |
690 | /* no 4 and 5 */ | 682 | /* no 4 and 5 */ |
691 | set_debugreg(next->debugreg[6], 6); | 683 | set_debugreg(next->debugreg[6], 6); |
692 | set_debugreg(next->debugreg[7], 7); | 684 | set_debugreg(next->debugreg[7], 7); |
693 | } | 685 | } |
694 | 686 | ||
695 | if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) | 687 | if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) |
696 | handle_io_bitmap(next, tss); | 688 | handle_io_bitmap(next, tss); |
697 | 689 | ||
698 | disable_tsc(prev_p, next_p); | 690 | disable_tsc(prev_p, next_p); |
699 | 691 | ||
700 | return prev_p; | 692 | return prev_p; |
701 | } | 693 | } |
702 | 694 | ||
703 | asmlinkage int sys_fork(struct pt_regs regs) | 695 | asmlinkage int sys_fork(struct pt_regs regs) |
704 | { | 696 | { |
705 | return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL); | 697 | return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL); |
706 | } | 698 | } |
707 | 699 | ||
708 | asmlinkage int sys_clone(struct pt_regs regs) | 700 | asmlinkage int sys_clone(struct pt_regs regs) |
709 | { | 701 | { |
710 | unsigned long clone_flags; | 702 | unsigned long clone_flags; |
711 | unsigned long newsp; | 703 | unsigned long newsp; |
712 | int __user *parent_tidptr, *child_tidptr; | 704 | int __user *parent_tidptr, *child_tidptr; |
713 | 705 | ||
714 | clone_flags = regs.ebx; | 706 | clone_flags = regs.ebx; |
715 | newsp = regs.ecx; | 707 | newsp = regs.ecx; |
716 | parent_tidptr = (int __user *)regs.edx; | 708 | parent_tidptr = (int __user *)regs.edx; |
717 | child_tidptr = (int __user *)regs.edi; | 709 | child_tidptr = (int __user *)regs.edi; |
718 | if (!newsp) | 710 | if (!newsp) |
719 | newsp = regs.esp; | 711 | newsp = regs.esp; |
720 | return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr); | 712 | return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr); |
721 | } | 713 | } |
722 | 714 | ||
723 | /* | 715 | /* |
724 | * This is trivial, and on the face of it looks like it | 716 | * This is trivial, and on the face of it looks like it |
725 | * could equally well be done in user mode. | 717 | * could equally well be done in user mode. |
726 | * | 718 | * |
727 | * Not so, for quite unobvious reasons - register pressure. | 719 | * Not so, for quite unobvious reasons - register pressure. |
728 | * In user mode vfork() cannot have a stack frame, and if | 720 | * In user mode vfork() cannot have a stack frame, and if |
729 | * done by calling the "clone()" system call directly, you | 721 | * done by calling the "clone()" system call directly, you |
730 | * do not have enough call-clobbered registers to hold all | 722 | * do not have enough call-clobbered registers to hold all |
731 | * the information you need. | 723 | * the information you need. |
732 | */ | 724 | */ |
733 | asmlinkage int sys_vfork(struct pt_regs regs) | 725 | asmlinkage int sys_vfork(struct pt_regs regs) |
734 | { | 726 | { |
735 | return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL); | 727 | return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL); |
736 | } | 728 | } |
737 | 729 | ||
738 | /* | 730 | /* |
739 | * sys_execve() executes a new program. | 731 | * sys_execve() executes a new program. |
740 | */ | 732 | */ |
741 | asmlinkage int sys_execve(struct pt_regs regs) | 733 | asmlinkage int sys_execve(struct pt_regs regs) |
742 | { | 734 | { |
743 | int error; | 735 | int error; |
744 | char * filename; | 736 | char * filename; |
745 | 737 | ||
746 | filename = getname((char __user *) regs.ebx); | 738 | filename = getname((char __user *) regs.ebx); |
747 | error = PTR_ERR(filename); | 739 | error = PTR_ERR(filename); |
748 | if (IS_ERR(filename)) | 740 | if (IS_ERR(filename)) |
749 | goto out; | 741 | goto out; |
750 | error = do_execve(filename, | 742 | error = do_execve(filename, |
751 | (char __user * __user *) regs.ecx, | 743 | (char __user * __user *) regs.ecx, |
752 | (char __user * __user *) regs.edx, | 744 | (char __user * __user *) regs.edx, |
753 | &regs); | 745 | &regs); |
754 | if (error == 0) { | 746 | if (error == 0) { |
755 | task_lock(current); | 747 | task_lock(current); |
756 | current->ptrace &= ~PT_DTRACE; | 748 | current->ptrace &= ~PT_DTRACE; |
757 | task_unlock(current); | 749 | task_unlock(current); |
758 | /* Make sure we don't return using sysenter.. */ | 750 | /* Make sure we don't return using sysenter.. */ |
759 | set_thread_flag(TIF_IRET); | 751 | set_thread_flag(TIF_IRET); |
760 | } | 752 | } |
761 | putname(filename); | 753 | putname(filename); |
762 | out: | 754 | out: |
763 | return error; | 755 | return error; |
764 | } | 756 | } |
765 | 757 | ||
766 | #define top_esp (THREAD_SIZE - sizeof(unsigned long)) | 758 | #define top_esp (THREAD_SIZE - sizeof(unsigned long)) |
767 | #define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) | 759 | #define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) |
768 | 760 | ||
769 | unsigned long get_wchan(struct task_struct *p) | 761 | unsigned long get_wchan(struct task_struct *p) |
770 | { | 762 | { |
771 | unsigned long ebp, esp, eip; | 763 | unsigned long ebp, esp, eip; |
772 | unsigned long stack_page; | 764 | unsigned long stack_page; |
773 | int count = 0; | 765 | int count = 0; |
774 | if (!p || p == current || p->state == TASK_RUNNING) | 766 | if (!p || p == current || p->state == TASK_RUNNING) |
775 | return 0; | 767 | return 0; |
776 | stack_page = (unsigned long)task_stack_page(p); | 768 | stack_page = (unsigned long)task_stack_page(p); |
777 | esp = p->thread.esp; | 769 | esp = p->thread.esp; |
778 | if (!stack_page || esp < stack_page || esp > top_esp+stack_page) | 770 | if (!stack_page || esp < stack_page || esp > top_esp+stack_page) |
779 | return 0; | 771 | return 0; |
780 | /* include/asm-i386/system.h:switch_to() pushes ebp last. */ | 772 | /* include/asm-i386/system.h:switch_to() pushes ebp last. */ |
781 | ebp = *(unsigned long *) esp; | 773 | ebp = *(unsigned long *) esp; |
782 | do { | 774 | do { |
783 | if (ebp < stack_page || ebp > top_ebp+stack_page) | 775 | if (ebp < stack_page || ebp > top_ebp+stack_page) |
784 | return 0; | 776 | return 0; |
785 | eip = *(unsigned long *) (ebp+4); | 777 | eip = *(unsigned long *) (ebp+4); |
786 | if (!in_sched_functions(eip)) | 778 | if (!in_sched_functions(eip)) |
787 | return eip; | 779 | return eip; |
788 | ebp = *(unsigned long *) ebp; | 780 | ebp = *(unsigned long *) ebp; |
789 | } while (count++ < 16); | 781 | } while (count++ < 16); |
790 | return 0; | 782 | return 0; |
791 | } | 783 | } |
792 | EXPORT_SYMBOL(get_wchan); | 784 | EXPORT_SYMBOL(get_wchan); |
793 | 785 | ||
794 | /* | 786 | /* |
795 | * sys_alloc_thread_area: get a yet unused TLS descriptor index. | 787 | * sys_alloc_thread_area: get a yet unused TLS descriptor index. |
796 | */ | 788 | */ |
797 | static int get_free_idx(void) | 789 | static int get_free_idx(void) |
798 | { | 790 | { |
799 | struct thread_struct *t = &current->thread; | 791 | struct thread_struct *t = &current->thread; |
800 | int idx; | 792 | int idx; |
801 | 793 | ||
802 | for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) | 794 | for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) |
803 | if (desc_empty(t->tls_array + idx)) | 795 | if (desc_empty(t->tls_array + idx)) |
804 | return idx + GDT_ENTRY_TLS_MIN; | 796 | return idx + GDT_ENTRY_TLS_MIN; |
805 | return -ESRCH; | 797 | return -ESRCH; |
806 | } | 798 | } |
807 | 799 | ||
808 | /* | 800 | /* |
809 | * Set a given TLS descriptor: | 801 | * Set a given TLS descriptor: |
810 | */ | 802 | */ |
811 | asmlinkage int sys_set_thread_area(struct user_desc __user *u_info) | 803 | asmlinkage int sys_set_thread_area(struct user_desc __user *u_info) |
812 | { | 804 | { |
813 | struct thread_struct *t = &current->thread; | 805 | struct thread_struct *t = &current->thread; |
814 | struct user_desc info; | 806 | struct user_desc info; |
815 | struct desc_struct *desc; | 807 | struct desc_struct *desc; |
816 | int cpu, idx; | 808 | int cpu, idx; |
817 | 809 | ||
818 | if (copy_from_user(&info, u_info, sizeof(info))) | 810 | if (copy_from_user(&info, u_info, sizeof(info))) |
819 | return -EFAULT; | 811 | return -EFAULT; |
820 | idx = info.entry_number; | 812 | idx = info.entry_number; |
821 | 813 | ||
822 | /* | 814 | /* |
823 | * index -1 means the kernel should try to find and | 815 | * index -1 means the kernel should try to find and |
824 | * allocate an empty descriptor: | 816 | * allocate an empty descriptor: |
825 | */ | 817 | */ |
826 | if (idx == -1) { | 818 | if (idx == -1) { |
827 | idx = get_free_idx(); | 819 | idx = get_free_idx(); |
828 | if (idx < 0) | 820 | if (idx < 0) |
829 | return idx; | 821 | return idx; |
830 | if (put_user(idx, &u_info->entry_number)) | 822 | if (put_user(idx, &u_info->entry_number)) |
831 | return -EFAULT; | 823 | return -EFAULT; |
832 | } | 824 | } |
833 | 825 | ||
834 | if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) | 826 | if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) |
835 | return -EINVAL; | 827 | return -EINVAL; |
836 | 828 | ||
837 | desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN; | 829 | desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN; |
838 | 830 | ||
839 | /* | 831 | /* |
840 | * We must not get preempted while modifying the TLS. | 832 | * We must not get preempted while modifying the TLS. |
841 | */ | 833 | */ |
842 | cpu = get_cpu(); | 834 | cpu = get_cpu(); |
843 | 835 | ||
844 | if (LDT_empty(&info)) { | 836 | if (LDT_empty(&info)) { |
845 | desc->a = 0; | 837 | desc->a = 0; |
846 | desc->b = 0; | 838 | desc->b = 0; |
847 | } else { | 839 | } else { |
848 | desc->a = LDT_entry_a(&info); | 840 | desc->a = LDT_entry_a(&info); |
849 | desc->b = LDT_entry_b(&info); | 841 | desc->b = LDT_entry_b(&info); |
850 | } | 842 | } |
851 | load_TLS(t, cpu); | 843 | load_TLS(t, cpu); |
852 | 844 | ||
853 | put_cpu(); | 845 | put_cpu(); |
854 | 846 | ||
855 | return 0; | 847 | return 0; |
856 | } | 848 | } |
857 | 849 | ||
858 | /* | 850 | /* |
859 | * Get the current Thread-Local Storage area: | 851 | * Get the current Thread-Local Storage area: |
860 | */ | 852 | */ |
861 | 853 | ||
862 | #define GET_BASE(desc) ( \ | 854 | #define GET_BASE(desc) ( \ |
863 | (((desc)->a >> 16) & 0x0000ffff) | \ | 855 | (((desc)->a >> 16) & 0x0000ffff) | \ |
864 | (((desc)->b << 16) & 0x00ff0000) | \ | 856 | (((desc)->b << 16) & 0x00ff0000) | \ |
865 | ( (desc)->b & 0xff000000) ) | 857 | ( (desc)->b & 0xff000000) ) |
866 | 858 | ||
867 | #define GET_LIMIT(desc) ( \ | 859 | #define GET_LIMIT(desc) ( \ |
868 | ((desc)->a & 0x0ffff) | \ | 860 | ((desc)->a & 0x0ffff) | \ |
869 | ((desc)->b & 0xf0000) ) | 861 | ((desc)->b & 0xf0000) ) |
870 | 862 | ||
871 | #define GET_32BIT(desc) (((desc)->b >> 22) & 1) | 863 | #define GET_32BIT(desc) (((desc)->b >> 22) & 1) |
872 | #define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) | 864 | #define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) |
873 | #define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) | 865 | #define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) |
874 | #define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) | 866 | #define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) |
875 | #define GET_PRESENT(desc) (((desc)->b >> 15) & 1) | 867 | #define GET_PRESENT(desc) (((desc)->b >> 15) & 1) |
876 | #define GET_USEABLE(desc) (((desc)->b >> 20) & 1) | 868 | #define GET_USEABLE(desc) (((desc)->b >> 20) & 1) |
877 | 869 | ||
878 | asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) | 870 | asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) |
879 | { | 871 | { |
880 | struct user_desc info; | 872 | struct user_desc info; |
881 | struct desc_struct *desc; | 873 | struct desc_struct *desc; |
882 | int idx; | 874 | int idx; |
883 | 875 | ||
884 | if (get_user(idx, &u_info->entry_number)) | 876 | if (get_user(idx, &u_info->entry_number)) |
885 | return -EFAULT; | 877 | return -EFAULT; |
886 | if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) | 878 | if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) |
887 | return -EINVAL; | 879 | return -EINVAL; |
888 | 880 | ||
889 | memset(&info, 0, sizeof(info)); | 881 | memset(&info, 0, sizeof(info)); |
890 | 882 | ||
891 | desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; | 883 | desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; |
892 | 884 | ||
893 | info.entry_number = idx; | 885 | info.entry_number = idx; |
894 | info.base_addr = GET_BASE(desc); | 886 | info.base_addr = GET_BASE(desc); |
895 | info.limit = GET_LIMIT(desc); | 887 | info.limit = GET_LIMIT(desc); |
896 | info.seg_32bit = GET_32BIT(desc); | 888 | info.seg_32bit = GET_32BIT(desc); |
897 | info.contents = GET_CONTENTS(desc); | 889 | info.contents = GET_CONTENTS(desc); |
898 | info.read_exec_only = !GET_WRITABLE(desc); | 890 | info.read_exec_only = !GET_WRITABLE(desc); |
899 | info.limit_in_pages = GET_LIMIT_PAGES(desc); | 891 | info.limit_in_pages = GET_LIMIT_PAGES(desc); |
900 | info.seg_not_present = !GET_PRESENT(desc); | 892 | info.seg_not_present = !GET_PRESENT(desc); |
901 | info.useable = GET_USEABLE(desc); | 893 | info.useable = GET_USEABLE(desc); |
902 | 894 | ||
903 | if (copy_to_user(u_info, &info, sizeof(info))) | 895 | if (copy_to_user(u_info, &info, sizeof(info))) |
904 | return -EFAULT; | 896 | return -EFAULT; |
905 | return 0; | 897 | return 0; |
906 | } | 898 | } |
907 | 899 | ||
908 | unsigned long arch_align_stack(unsigned long sp) | 900 | unsigned long arch_align_stack(unsigned long sp) |
909 | { | 901 | { |
910 | if (randomize_va_space) | 902 | if (randomize_va_space) |
911 | sp -= get_random_int() % 8192; | 903 | sp -= get_random_int() % 8192; |
912 | return sp & ~0xf; | 904 | return sp & ~0xf; |
913 | } | 905 | } |
914 | 906 |
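
The GET_BASE/GET_LIMIT/GET_* macros and sys_get_thread_area() above reassemble the base, limit and flag bits that a two-word segment descriptor scatters across its a and b words. The short userspace sketch below only illustrates that bit layout; struct tls_desc, the get_* helpers and the sample descriptor words are invented for the example and are not kernel definitions — only the shifts and masks mirror the macros shown above.

/*
 * Illustrative sketch of the descriptor decoding done by the
 * GET_BASE/GET_LIMIT macros above (assumed two-word layout, not a
 * kernel struct).
 */
#include <stdio.h>
#include <stdint.h>

struct tls_desc {                 /* two 32-bit words, like desc_struct's a/b */
	uint32_t a;
	uint32_t b;
};

static uint32_t get_base(const struct tls_desc *d)
{
	return ((d->a >> 16) & 0x0000ffff) |   /* base bits 15..0  */
	       ((d->b << 16) & 0x00ff0000) |   /* base bits 23..16 */
	       ( d->b        & 0xff000000);    /* base bits 31..24 */
}

static uint32_t get_limit(const struct tls_desc *d)
{
	return (d->a & 0x0ffff) | (d->b & 0xf0000);
}

static int get_32bit(const struct tls_desc *d)   { return (d->b >> 22) & 1; }
static int get_present(const struct tls_desc *d) { return (d->b >> 15) & 1; }

int main(void)
{
	/* hypothetical descriptor: base 0x12345678, limit 0xfffff, 32-bit, present */
	struct tls_desc d = { .a = 0x5678ffff, .b = 0x12cff334 };

	printf("base    = %#x\n", get_base(&d));
	printf("limit   = %#x\n", get_limit(&d));
	printf("32bit   = %d\n",  get_32bit(&d));
	printf("present = %d\n",  get_present(&d));
	return 0;
}

Compiled and run, the sketch prints the reassembled base 0x12345678 and limit 0xfffff for the sample descriptor, which is the same recombination sys_get_thread_area() performs before copying the result to user space.
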
arch/ia64/kernel/process.c
1 | /* | 1 | /* |
2 | * Architecture-specific setup. | 2 | * Architecture-specific setup. |
3 | * | 3 | * |
4 | * Copyright (C) 1998-2003 Hewlett-Packard Co | 4 | * Copyright (C) 1998-2003 Hewlett-Packard Co |
5 | * David Mosberger-Tang <davidm@hpl.hp.com> | 5 | * David Mosberger-Tang <davidm@hpl.hp.com> |
6 | * 04/11/17 Ashok Raj <ashok.raj@intel.com> Added CPU Hotplug Support | 6 | * 04/11/17 Ashok Raj <ashok.raj@intel.com> Added CPU Hotplug Support |
7 | * | 7 | * |
8 | * 2005-10-07 Keith Owens <kaos@sgi.com> | 8 | * 2005-10-07 Keith Owens <kaos@sgi.com> |
9 | * Add notify_die() hooks. | 9 | * Add notify_die() hooks. |
10 | */ | 10 | */ |
11 | #define __KERNEL_SYSCALLS__ /* see <asm/unistd.h> */ | 11 | #define __KERNEL_SYSCALLS__ /* see <asm/unistd.h> */ |
12 | #include <linux/config.h> | 12 | #include <linux/config.h> |
13 | 13 | ||
14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
15 | #include <linux/pm.h> | 15 | #include <linux/pm.h> |
16 | #include <linux/elf.h> | 16 | #include <linux/elf.h> |
17 | #include <linux/errno.h> | 17 | #include <linux/errno.h> |
18 | #include <linux/kallsyms.h> | 18 | #include <linux/kallsyms.h> |
19 | #include <linux/kernel.h> | 19 | #include <linux/kernel.h> |
20 | #include <linux/mm.h> | 20 | #include <linux/mm.h> |
21 | #include <linux/module.h> | 21 | #include <linux/module.h> |
22 | #include <linux/notifier.h> | 22 | #include <linux/notifier.h> |
23 | #include <linux/personality.h> | 23 | #include <linux/personality.h> |
24 | #include <linux/sched.h> | 24 | #include <linux/sched.h> |
25 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
26 | #include <linux/smp_lock.h> | 26 | #include <linux/smp_lock.h> |
27 | #include <linux/stddef.h> | 27 | #include <linux/stddef.h> |
28 | #include <linux/thread_info.h> | 28 | #include <linux/thread_info.h> |
29 | #include <linux/unistd.h> | 29 | #include <linux/unistd.h> |
30 | #include <linux/efi.h> | 30 | #include <linux/efi.h> |
31 | #include <linux/interrupt.h> | 31 | #include <linux/interrupt.h> |
32 | #include <linux/delay.h> | 32 | #include <linux/delay.h> |
33 | #include <linux/kprobes.h> | ||
34 | 33 | ||
35 | #include <asm/cpu.h> | 34 | #include <asm/cpu.h> |
36 | #include <asm/delay.h> | 35 | #include <asm/delay.h> |
37 | #include <asm/elf.h> | 36 | #include <asm/elf.h> |
38 | #include <asm/ia32.h> | 37 | #include <asm/ia32.h> |
39 | #include <asm/irq.h> | 38 | #include <asm/irq.h> |
40 | #include <asm/kdebug.h> | 39 | #include <asm/kdebug.h> |
41 | #include <asm/pgalloc.h> | 40 | #include <asm/pgalloc.h> |
42 | #include <asm/processor.h> | 41 | #include <asm/processor.h> |
43 | #include <asm/sal.h> | 42 | #include <asm/sal.h> |
44 | #include <asm/tlbflush.h> | 43 | #include <asm/tlbflush.h> |
45 | #include <asm/uaccess.h> | 44 | #include <asm/uaccess.h> |
46 | #include <asm/unwind.h> | 45 | #include <asm/unwind.h> |
47 | #include <asm/user.h> | 46 | #include <asm/user.h> |
48 | 47 | ||
49 | #include "entry.h" | 48 | #include "entry.h" |
50 | 49 | ||
51 | #ifdef CONFIG_PERFMON | 50 | #ifdef CONFIG_PERFMON |
52 | # include <asm/perfmon.h> | 51 | # include <asm/perfmon.h> |
53 | #endif | 52 | #endif |
54 | 53 | ||
55 | #include "sigframe.h" | 54 | #include "sigframe.h" |
56 | 55 | ||
57 | void (*ia64_mark_idle)(int); | 56 | void (*ia64_mark_idle)(int); |
58 | static DEFINE_PER_CPU(unsigned int, cpu_idle_state); | 57 | static DEFINE_PER_CPU(unsigned int, cpu_idle_state); |
59 | 58 | ||
60 | unsigned long boot_option_idle_override = 0; | 59 | unsigned long boot_option_idle_override = 0; |
61 | EXPORT_SYMBOL(boot_option_idle_override); | 60 | EXPORT_SYMBOL(boot_option_idle_override); |
62 | 61 | ||
63 | void | 62 | void |
64 | ia64_do_show_stack (struct unw_frame_info *info, void *arg) | 63 | ia64_do_show_stack (struct unw_frame_info *info, void *arg) |
65 | { | 64 | { |
66 | unsigned long ip, sp, bsp; | 65 | unsigned long ip, sp, bsp; |
67 | char buf[128]; /* don't make it so big that it overflows the stack! */ | 66 | char buf[128]; /* don't make it so big that it overflows the stack! */ |
68 | 67 | ||
69 | printk("\nCall Trace:\n"); | 68 | printk("\nCall Trace:\n"); |
70 | do { | 69 | do { |
71 | unw_get_ip(info, &ip); | 70 | unw_get_ip(info, &ip); |
72 | if (ip == 0) | 71 | if (ip == 0) |
73 | break; | 72 | break; |
74 | 73 | ||
75 | unw_get_sp(info, &sp); | 74 | unw_get_sp(info, &sp); |
76 | unw_get_bsp(info, &bsp); | 75 | unw_get_bsp(info, &bsp); |
77 | snprintf(buf, sizeof(buf), | 76 | snprintf(buf, sizeof(buf), |
78 | " [<%016lx>] %%s\n" | 77 | " [<%016lx>] %%s\n" |
79 | " sp=%016lx bsp=%016lx\n", | 78 | " sp=%016lx bsp=%016lx\n", |
80 | ip, sp, bsp); | 79 | ip, sp, bsp); |
81 | print_symbol(buf, ip); | 80 | print_symbol(buf, ip); |
82 | } while (unw_unwind(info) >= 0); | 81 | } while (unw_unwind(info) >= 0); |
83 | } | 82 | } |
84 | 83 | ||
85 | void | 84 | void |
86 | show_stack (struct task_struct *task, unsigned long *sp) | 85 | show_stack (struct task_struct *task, unsigned long *sp) |
87 | { | 86 | { |
88 | if (!task) | 87 | if (!task) |
89 | unw_init_running(ia64_do_show_stack, NULL); | 88 | unw_init_running(ia64_do_show_stack, NULL); |
90 | else { | 89 | else { |
91 | struct unw_frame_info info; | 90 | struct unw_frame_info info; |
92 | 91 | ||
93 | unw_init_from_blocked_task(&info, task); | 92 | unw_init_from_blocked_task(&info, task); |
94 | ia64_do_show_stack(&info, NULL); | 93 | ia64_do_show_stack(&info, NULL); |
95 | } | 94 | } |
96 | } | 95 | } |
97 | 96 | ||
98 | void | 97 | void |
99 | dump_stack (void) | 98 | dump_stack (void) |
100 | { | 99 | { |
101 | show_stack(NULL, NULL); | 100 | show_stack(NULL, NULL); |
102 | } | 101 | } |
103 | 102 | ||
104 | EXPORT_SYMBOL(dump_stack); | 103 | EXPORT_SYMBOL(dump_stack); |
105 | 104 | ||
106 | void | 105 | void |
107 | show_regs (struct pt_regs *regs) | 106 | show_regs (struct pt_regs *regs) |
108 | { | 107 | { |
109 | unsigned long ip = regs->cr_iip + ia64_psr(regs)->ri; | 108 | unsigned long ip = regs->cr_iip + ia64_psr(regs)->ri; |
110 | 109 | ||
111 | print_modules(); | 110 | print_modules(); |
112 | printk("\nPid: %d, CPU %d, comm: %20s\n", current->pid, smp_processor_id(), current->comm); | 111 | printk("\nPid: %d, CPU %d, comm: %20s\n", current->pid, smp_processor_id(), current->comm); |
113 | printk("psr : %016lx ifs : %016lx ip : [<%016lx>] %s\n", | 112 | printk("psr : %016lx ifs : %016lx ip : [<%016lx>] %s\n", |
114 | regs->cr_ipsr, regs->cr_ifs, ip, print_tainted()); | 113 | regs->cr_ipsr, regs->cr_ifs, ip, print_tainted()); |
115 | print_symbol("ip is at %s\n", ip); | 114 | print_symbol("ip is at %s\n", ip); |
116 | printk("unat: %016lx pfs : %016lx rsc : %016lx\n", | 115 | printk("unat: %016lx pfs : %016lx rsc : %016lx\n", |
117 | regs->ar_unat, regs->ar_pfs, regs->ar_rsc); | 116 | regs->ar_unat, regs->ar_pfs, regs->ar_rsc); |
118 | printk("rnat: %016lx bsps: %016lx pr : %016lx\n", | 117 | printk("rnat: %016lx bsps: %016lx pr : %016lx\n", |
119 | regs->ar_rnat, regs->ar_bspstore, regs->pr); | 118 | regs->ar_rnat, regs->ar_bspstore, regs->pr); |
120 | printk("ldrs: %016lx ccv : %016lx fpsr: %016lx\n", | 119 | printk("ldrs: %016lx ccv : %016lx fpsr: %016lx\n", |
121 | regs->loadrs, regs->ar_ccv, regs->ar_fpsr); | 120 | regs->loadrs, regs->ar_ccv, regs->ar_fpsr); |
122 | printk("csd : %016lx ssd : %016lx\n", regs->ar_csd, regs->ar_ssd); | 121 | printk("csd : %016lx ssd : %016lx\n", regs->ar_csd, regs->ar_ssd); |
123 | printk("b0 : %016lx b6 : %016lx b7 : %016lx\n", regs->b0, regs->b6, regs->b7); | 122 | printk("b0 : %016lx b6 : %016lx b7 : %016lx\n", regs->b0, regs->b6, regs->b7); |
124 | printk("f6 : %05lx%016lx f7 : %05lx%016lx\n", | 123 | printk("f6 : %05lx%016lx f7 : %05lx%016lx\n", |
125 | regs->f6.u.bits[1], regs->f6.u.bits[0], | 124 | regs->f6.u.bits[1], regs->f6.u.bits[0], |
126 | regs->f7.u.bits[1], regs->f7.u.bits[0]); | 125 | regs->f7.u.bits[1], regs->f7.u.bits[0]); |
127 | printk("f8 : %05lx%016lx f9 : %05lx%016lx\n", | 126 | printk("f8 : %05lx%016lx f9 : %05lx%016lx\n", |
128 | regs->f8.u.bits[1], regs->f8.u.bits[0], | 127 | regs->f8.u.bits[1], regs->f8.u.bits[0], |
129 | regs->f9.u.bits[1], regs->f9.u.bits[0]); | 128 | regs->f9.u.bits[1], regs->f9.u.bits[0]); |
130 | printk("f10 : %05lx%016lx f11 : %05lx%016lx\n", | 129 | printk("f10 : %05lx%016lx f11 : %05lx%016lx\n", |
131 | regs->f10.u.bits[1], regs->f10.u.bits[0], | 130 | regs->f10.u.bits[1], regs->f10.u.bits[0], |
132 | regs->f11.u.bits[1], regs->f11.u.bits[0]); | 131 | regs->f11.u.bits[1], regs->f11.u.bits[0]); |
133 | 132 | ||
134 | printk("r1 : %016lx r2 : %016lx r3 : %016lx\n", regs->r1, regs->r2, regs->r3); | 133 | printk("r1 : %016lx r2 : %016lx r3 : %016lx\n", regs->r1, regs->r2, regs->r3); |
135 | printk("r8 : %016lx r9 : %016lx r10 : %016lx\n", regs->r8, regs->r9, regs->r10); | 134 | printk("r8 : %016lx r9 : %016lx r10 : %016lx\n", regs->r8, regs->r9, regs->r10); |
136 | printk("r11 : %016lx r12 : %016lx r13 : %016lx\n", regs->r11, regs->r12, regs->r13); | 135 | printk("r11 : %016lx r12 : %016lx r13 : %016lx\n", regs->r11, regs->r12, regs->r13); |
137 | printk("r14 : %016lx r15 : %016lx r16 : %016lx\n", regs->r14, regs->r15, regs->r16); | 136 | printk("r14 : %016lx r15 : %016lx r16 : %016lx\n", regs->r14, regs->r15, regs->r16); |
138 | printk("r17 : %016lx r18 : %016lx r19 : %016lx\n", regs->r17, regs->r18, regs->r19); | 137 | printk("r17 : %016lx r18 : %016lx r19 : %016lx\n", regs->r17, regs->r18, regs->r19); |
139 | printk("r20 : %016lx r21 : %016lx r22 : %016lx\n", regs->r20, regs->r21, regs->r22); | 138 | printk("r20 : %016lx r21 : %016lx r22 : %016lx\n", regs->r20, regs->r21, regs->r22); |
140 | printk("r23 : %016lx r24 : %016lx r25 : %016lx\n", regs->r23, regs->r24, regs->r25); | 139 | printk("r23 : %016lx r24 : %016lx r25 : %016lx\n", regs->r23, regs->r24, regs->r25); |
141 | printk("r26 : %016lx r27 : %016lx r28 : %016lx\n", regs->r26, regs->r27, regs->r28); | 140 | printk("r26 : %016lx r27 : %016lx r28 : %016lx\n", regs->r26, regs->r27, regs->r28); |
142 | printk("r29 : %016lx r30 : %016lx r31 : %016lx\n", regs->r29, regs->r30, regs->r31); | 141 | printk("r29 : %016lx r30 : %016lx r31 : %016lx\n", regs->r29, regs->r30, regs->r31); |
143 | 142 | ||
144 | if (user_mode(regs)) { | 143 | if (user_mode(regs)) { |
145 | /* print the stacked registers */ | 144 | /* print the stacked registers */ |
146 | unsigned long val, *bsp, ndirty; | 145 | unsigned long val, *bsp, ndirty; |
147 | int i, sof, is_nat = 0; | 146 | int i, sof, is_nat = 0; |
148 | 147 | ||
149 | sof = regs->cr_ifs & 0x7f; /* size of frame */ | 148 | sof = regs->cr_ifs & 0x7f; /* size of frame */ |
150 | ndirty = (regs->loadrs >> 19); | 149 | ndirty = (regs->loadrs >> 19); |
151 | bsp = ia64_rse_skip_regs((unsigned long *) regs->ar_bspstore, ndirty); | 150 | bsp = ia64_rse_skip_regs((unsigned long *) regs->ar_bspstore, ndirty); |
152 | for (i = 0; i < sof; ++i) { | 151 | for (i = 0; i < sof; ++i) { |
153 | get_user(val, (unsigned long __user *) ia64_rse_skip_regs(bsp, i)); | 152 | get_user(val, (unsigned long __user *) ia64_rse_skip_regs(bsp, i)); |
154 | printk("r%-3u:%c%016lx%s", 32 + i, is_nat ? '*' : ' ', val, | 153 | printk("r%-3u:%c%016lx%s", 32 + i, is_nat ? '*' : ' ', val, |
155 | ((i == sof - 1) || (i % 3) == 2) ? "\n" : " "); | 154 | ((i == sof - 1) || (i % 3) == 2) ? "\n" : " "); |
156 | } | 155 | } |
157 | } else | 156 | } else |
158 | show_stack(NULL, NULL); | 157 | show_stack(NULL, NULL); |
159 | } | 158 | } |
160 | 159 | ||
161 | void | 160 | void |
162 | do_notify_resume_user (sigset_t *oldset, struct sigscratch *scr, long in_syscall) | 161 | do_notify_resume_user (sigset_t *oldset, struct sigscratch *scr, long in_syscall) |
163 | { | 162 | { |
164 | if (fsys_mode(current, &scr->pt)) { | 163 | if (fsys_mode(current, &scr->pt)) { |
165 | /* defer signal-handling etc. until we return to privilege-level 0. */ | 164 | /* defer signal-handling etc. until we return to privilege-level 0. */ |
166 | if (!ia64_psr(&scr->pt)->lp) | 165 | if (!ia64_psr(&scr->pt)->lp) |
167 | ia64_psr(&scr->pt)->lp = 1; | 166 | ia64_psr(&scr->pt)->lp = 1; |
168 | return; | 167 | return; |
169 | } | 168 | } |
170 | 169 | ||
171 | #ifdef CONFIG_PERFMON | 170 | #ifdef CONFIG_PERFMON |
172 | if (current->thread.pfm_needs_checking) | 171 | if (current->thread.pfm_needs_checking) |
173 | pfm_handle_work(); | 172 | pfm_handle_work(); |
174 | #endif | 173 | #endif |
175 | 174 | ||
176 | /* deal with pending signal delivery */ | 175 | /* deal with pending signal delivery */ |
177 | if (test_thread_flag(TIF_SIGPENDING)) | 176 | if (test_thread_flag(TIF_SIGPENDING)) |
178 | ia64_do_signal(oldset, scr, in_syscall); | 177 | ia64_do_signal(oldset, scr, in_syscall); |
179 | } | 178 | } |
180 | 179 | ||
181 | static int pal_halt = 1; | 180 | static int pal_halt = 1; |
182 | static int can_do_pal_halt = 1; | 181 | static int can_do_pal_halt = 1; |
183 | 182 | ||
184 | static int __init nohalt_setup(char * str) | 183 | static int __init nohalt_setup(char * str) |
185 | { | 184 | { |
186 | pal_halt = can_do_pal_halt = 0; | 185 | pal_halt = can_do_pal_halt = 0; |
187 | return 1; | 186 | return 1; |
188 | } | 187 | } |
189 | __setup("nohalt", nohalt_setup); | 188 | __setup("nohalt", nohalt_setup); |
190 | 189 | ||
191 | void | 190 | void |
192 | update_pal_halt_status(int status) | 191 | update_pal_halt_status(int status) |
193 | { | 192 | { |
194 | can_do_pal_halt = pal_halt && status; | 193 | can_do_pal_halt = pal_halt && status; |
195 | } | 194 | } |
196 | 195 | ||
197 | /* | 196 | /* |
198 | * We use this if we don't have any better idle routine.. | 197 | * We use this if we don't have any better idle routine.. |
199 | */ | 198 | */ |
200 | void | 199 | void |
201 | default_idle (void) | 200 | default_idle (void) |
202 | { | 201 | { |
203 | local_irq_enable(); | 202 | local_irq_enable(); |
204 | while (!need_resched()) { | 203 | while (!need_resched()) { |
205 | if (can_do_pal_halt) | 204 | if (can_do_pal_halt) |
206 | safe_halt(); | 205 | safe_halt(); |
207 | else | 206 | else |
208 | cpu_relax(); | 207 | cpu_relax(); |
209 | } | 208 | } |
210 | } | 209 | } |
211 | 210 | ||
212 | #ifdef CONFIG_HOTPLUG_CPU | 211 | #ifdef CONFIG_HOTPLUG_CPU |
213 | /* We don't actually take CPU down, just spin without interrupts. */ | 212 | /* We don't actually take CPU down, just spin without interrupts. */ |
214 | static inline void play_dead(void) | 213 | static inline void play_dead(void) |
215 | { | 214 | { |
216 | extern void ia64_cpu_local_tick (void); | 215 | extern void ia64_cpu_local_tick (void); |
217 | unsigned int this_cpu = smp_processor_id(); | 216 | unsigned int this_cpu = smp_processor_id(); |
218 | 217 | ||
219 | /* Ack it */ | 218 | /* Ack it */ |
220 | __get_cpu_var(cpu_state) = CPU_DEAD; | 219 | __get_cpu_var(cpu_state) = CPU_DEAD; |
221 | 220 | ||
222 | max_xtp(); | 221 | max_xtp(); |
223 | local_irq_disable(); | 222 | local_irq_disable(); |
224 | idle_task_exit(); | 223 | idle_task_exit(); |
225 | ia64_jump_to_sal(&sal_boot_rendez_state[this_cpu]); | 224 | ia64_jump_to_sal(&sal_boot_rendez_state[this_cpu]); |
226 | /* | 225 | /* |
227 | * The above is a point of no-return, the processor is | 226 | * The above is a point of no-return, the processor is |
228 | * expected to be in SAL loop now. | 227 | * expected to be in SAL loop now. |
229 | */ | 228 | */ |
230 | BUG(); | 229 | BUG(); |
231 | } | 230 | } |
232 | #else | 231 | #else |
233 | static inline void play_dead(void) | 232 | static inline void play_dead(void) |
234 | { | 233 | { |
235 | BUG(); | 234 | BUG(); |
236 | } | 235 | } |
237 | #endif /* CONFIG_HOTPLUG_CPU */ | 236 | #endif /* CONFIG_HOTPLUG_CPU */ |
238 | 237 | ||
239 | void cpu_idle_wait(void) | 238 | void cpu_idle_wait(void) |
240 | { | 239 | { |
241 | unsigned int cpu, this_cpu = get_cpu(); | 240 | unsigned int cpu, this_cpu = get_cpu(); |
242 | cpumask_t map; | 241 | cpumask_t map; |
243 | 242 | ||
244 | set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); | 243 | set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); |
245 | put_cpu(); | 244 | put_cpu(); |
246 | 245 | ||
247 | cpus_clear(map); | 246 | cpus_clear(map); |
248 | for_each_online_cpu(cpu) { | 247 | for_each_online_cpu(cpu) { |
249 | per_cpu(cpu_idle_state, cpu) = 1; | 248 | per_cpu(cpu_idle_state, cpu) = 1; |
250 | cpu_set(cpu, map); | 249 | cpu_set(cpu, map); |
251 | } | 250 | } |
252 | 251 | ||
253 | __get_cpu_var(cpu_idle_state) = 0; | 252 | __get_cpu_var(cpu_idle_state) = 0; |
254 | 253 | ||
255 | wmb(); | 254 | wmb(); |
256 | do { | 255 | do { |
257 | ssleep(1); | 256 | ssleep(1); |
258 | for_each_online_cpu(cpu) { | 257 | for_each_online_cpu(cpu) { |
259 | if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) | 258 | if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) |
260 | cpu_clear(cpu, map); | 259 | cpu_clear(cpu, map); |
261 | } | 260 | } |
262 | cpus_and(map, map, cpu_online_map); | 261 | cpus_and(map, map, cpu_online_map); |
263 | } while (!cpus_empty(map)); | 262 | } while (!cpus_empty(map)); |
264 | } | 263 | } |
265 | EXPORT_SYMBOL_GPL(cpu_idle_wait); | 264 | EXPORT_SYMBOL_GPL(cpu_idle_wait); |
266 | 265 | ||
267 | void __attribute__((noreturn)) | 266 | void __attribute__((noreturn)) |
268 | cpu_idle (void) | 267 | cpu_idle (void) |
269 | { | 268 | { |
270 | void (*mark_idle)(int) = ia64_mark_idle; | 269 | void (*mark_idle)(int) = ia64_mark_idle; |
271 | int cpu = smp_processor_id(); | 270 | int cpu = smp_processor_id(); |
272 | 271 | ||
273 | /* endless idle loop with no priority at all */ | 272 | /* endless idle loop with no priority at all */ |
274 | while (1) { | 273 | while (1) { |
275 | if (can_do_pal_halt) | 274 | if (can_do_pal_halt) |
276 | clear_thread_flag(TIF_POLLING_NRFLAG); | 275 | clear_thread_flag(TIF_POLLING_NRFLAG); |
277 | else | 276 | else |
278 | set_thread_flag(TIF_POLLING_NRFLAG); | 277 | set_thread_flag(TIF_POLLING_NRFLAG); |
279 | 278 | ||
280 | if (!need_resched()) { | 279 | if (!need_resched()) { |
281 | void (*idle)(void); | 280 | void (*idle)(void); |
282 | #ifdef CONFIG_SMP | 281 | #ifdef CONFIG_SMP |
283 | min_xtp(); | 282 | min_xtp(); |
284 | #endif | 283 | #endif |
285 | if (__get_cpu_var(cpu_idle_state)) | 284 | if (__get_cpu_var(cpu_idle_state)) |
286 | __get_cpu_var(cpu_idle_state) = 0; | 285 | __get_cpu_var(cpu_idle_state) = 0; |
287 | 286 | ||
288 | rmb(); | 287 | rmb(); |
289 | if (mark_idle) | 288 | if (mark_idle) |
290 | (*mark_idle)(1); | 289 | (*mark_idle)(1); |
291 | 290 | ||
292 | idle = pm_idle; | 291 | idle = pm_idle; |
293 | if (!idle) | 292 | if (!idle) |
294 | idle = default_idle; | 293 | idle = default_idle; |
295 | (*idle)(); | 294 | (*idle)(); |
296 | if (mark_idle) | 295 | if (mark_idle) |
297 | (*mark_idle)(0); | 296 | (*mark_idle)(0); |
298 | #ifdef CONFIG_SMP | 297 | #ifdef CONFIG_SMP |
299 | normal_xtp(); | 298 | normal_xtp(); |
300 | #endif | 299 | #endif |
301 | } | 300 | } |
302 | preempt_enable_no_resched(); | 301 | preempt_enable_no_resched(); |
303 | schedule(); | 302 | schedule(); |
304 | preempt_disable(); | 303 | preempt_disable(); |
305 | check_pgt_cache(); | 304 | check_pgt_cache(); |
306 | if (cpu_is_offline(cpu)) | 305 | if (cpu_is_offline(cpu)) |
307 | play_dead(); | 306 | play_dead(); |
308 | } | 307 | } |
309 | } | 308 | } |
310 | 309 | ||
311 | void | 310 | void |
312 | ia64_save_extra (struct task_struct *task) | 311 | ia64_save_extra (struct task_struct *task) |
313 | { | 312 | { |
314 | #ifdef CONFIG_PERFMON | 313 | #ifdef CONFIG_PERFMON |
315 | unsigned long info; | 314 | unsigned long info; |
316 | #endif | 315 | #endif |
317 | 316 | ||
318 | if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0) | 317 | if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0) |
319 | ia64_save_debug_regs(&task->thread.dbr[0]); | 318 | ia64_save_debug_regs(&task->thread.dbr[0]); |
320 | 319 | ||
321 | #ifdef CONFIG_PERFMON | 320 | #ifdef CONFIG_PERFMON |
322 | if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0) | 321 | if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0) |
323 | pfm_save_regs(task); | 322 | pfm_save_regs(task); |
324 | 323 | ||
325 | info = __get_cpu_var(pfm_syst_info); | 324 | info = __get_cpu_var(pfm_syst_info); |
326 | if (info & PFM_CPUINFO_SYST_WIDE) | 325 | if (info & PFM_CPUINFO_SYST_WIDE) |
327 | pfm_syst_wide_update_task(task, info, 0); | 326 | pfm_syst_wide_update_task(task, info, 0); |
328 | #endif | 327 | #endif |
329 | 328 | ||
330 | #ifdef CONFIG_IA32_SUPPORT | 329 | #ifdef CONFIG_IA32_SUPPORT |
331 | if (IS_IA32_PROCESS(task_pt_regs(task))) | 330 | if (IS_IA32_PROCESS(task_pt_regs(task))) |
332 | ia32_save_state(task); | 331 | ia32_save_state(task); |
333 | #endif | 332 | #endif |
334 | } | 333 | } |
335 | 334 | ||
336 | void | 335 | void |
337 | ia64_load_extra (struct task_struct *task) | 336 | ia64_load_extra (struct task_struct *task) |
338 | { | 337 | { |
339 | #ifdef CONFIG_PERFMON | 338 | #ifdef CONFIG_PERFMON |
340 | unsigned long info; | 339 | unsigned long info; |
341 | #endif | 340 | #endif |
342 | 341 | ||
343 | if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0) | 342 | if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0) |
344 | ia64_load_debug_regs(&task->thread.dbr[0]); | 343 | ia64_load_debug_regs(&task->thread.dbr[0]); |
345 | 344 | ||
346 | #ifdef CONFIG_PERFMON | 345 | #ifdef CONFIG_PERFMON |
347 | if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0) | 346 | if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0) |
348 | pfm_load_regs(task); | 347 | pfm_load_regs(task); |
349 | 348 | ||
350 | info = __get_cpu_var(pfm_syst_info); | 349 | info = __get_cpu_var(pfm_syst_info); |
351 | if (info & PFM_CPUINFO_SYST_WIDE) | 350 | if (info & PFM_CPUINFO_SYST_WIDE) |
352 | pfm_syst_wide_update_task(task, info, 1); | 351 | pfm_syst_wide_update_task(task, info, 1); |
353 | #endif | 352 | #endif |
354 | 353 | ||
355 | #ifdef CONFIG_IA32_SUPPORT | 354 | #ifdef CONFIG_IA32_SUPPORT |
356 | if (IS_IA32_PROCESS(task_pt_regs(task))) | 355 | if (IS_IA32_PROCESS(task_pt_regs(task))) |
357 | ia32_load_state(task); | 356 | ia32_load_state(task); |
358 | #endif | 357 | #endif |
359 | } | 358 | } |
360 | 359 | ||
361 | /* | 360 | /* |
362 | * Copy the state of an ia-64 thread. | 361 | * Copy the state of an ia-64 thread. |
363 | * | 362 | * |
364 | * We get here through the following call chain: | 363 | * We get here through the following call chain: |
365 | * | 364 | * |
366 | * from user-level: from kernel: | 365 | * from user-level: from kernel: |
367 | * | 366 | * |
368 | * <clone syscall> <some kernel call frames> | 367 | * <clone syscall> <some kernel call frames> |
369 | * sys_clone : | 368 | * sys_clone : |
370 | * do_fork do_fork | 369 | * do_fork do_fork |
371 | * copy_thread copy_thread | 370 | * copy_thread copy_thread |
372 | * | 371 | * |
373 | * This means that the stack layout is as follows: | 372 | * This means that the stack layout is as follows: |
374 | * | 373 | * |
375 | * +---------------------+ (highest addr) | 374 | * +---------------------+ (highest addr) |
376 | * | struct pt_regs | | 375 | * | struct pt_regs | |
377 | * +---------------------+ | 376 | * +---------------------+ |
378 | * | struct switch_stack | | 377 | * | struct switch_stack | |
379 | * +---------------------+ | 378 | * +---------------------+ |
380 | * | | | 379 | * | | |
381 | * | memory stack | | 380 | * | memory stack | |
382 | * | | <-- sp (lowest addr) | 381 | * | | <-- sp (lowest addr) |
383 | * +---------------------+ | 382 | * +---------------------+ |
384 | * | 383 | * |
385 | * Observe that we copy the unat values that are in pt_regs and switch_stack. Spilling an | 384 | * Observe that we copy the unat values that are in pt_regs and switch_stack. Spilling an |
386 | * integer to address X causes bit N in ar.unat to be set to the NaT bit of the register, | 385 | * integer to address X causes bit N in ar.unat to be set to the NaT bit of the register, |
387 | * with N=(X & 0x1ff)/8. Thus, copying the unat value preserves the NaT bits ONLY if the | 386 | * with N=(X & 0x1ff)/8. Thus, copying the unat value preserves the NaT bits ONLY if the |
388 | * pt_regs structure in the parent is congruent to that of the child, modulo 512. Since | 387 | * pt_regs structure in the parent is congruent to that of the child, modulo 512. Since |
389 | * the stack is page aligned and the page size is at least 4KB, this is always the case, | 388 | * the stack is page aligned and the page size is at least 4KB, this is always the case, |
390 | * so there is nothing to worry about. | 389 | * so there is nothing to worry about. |
391 | */ | 390 | */ |
392 | int | 391 | int |
393 | copy_thread (int nr, unsigned long clone_flags, | 392 | copy_thread (int nr, unsigned long clone_flags, |
394 | unsigned long user_stack_base, unsigned long user_stack_size, | 393 | unsigned long user_stack_base, unsigned long user_stack_size, |
395 | struct task_struct *p, struct pt_regs *regs) | 394 | struct task_struct *p, struct pt_regs *regs) |
396 | { | 395 | { |
397 | extern char ia64_ret_from_clone, ia32_ret_from_clone; | 396 | extern char ia64_ret_from_clone, ia32_ret_from_clone; |
398 | struct switch_stack *child_stack, *stack; | 397 | struct switch_stack *child_stack, *stack; |
399 | unsigned long rbs, child_rbs, rbs_size; | 398 | unsigned long rbs, child_rbs, rbs_size; |
400 | struct pt_regs *child_ptregs; | 399 | struct pt_regs *child_ptregs; |
401 | int retval = 0; | 400 | int retval = 0; |
402 | 401 | ||
403 | #ifdef CONFIG_SMP | 402 | #ifdef CONFIG_SMP |
404 | /* | 403 | /* |
405 | * For SMP idle threads, fork_by_hand() calls do_fork with | 404 | * For SMP idle threads, fork_by_hand() calls do_fork with |
406 | * NULL regs. | 405 | * NULL regs. |
407 | */ | 406 | */ |
408 | if (!regs) | 407 | if (!regs) |
409 | return 0; | 408 | return 0; |
410 | #endif | 409 | #endif |
411 | 410 | ||
412 | stack = ((struct switch_stack *) regs) - 1; | 411 | stack = ((struct switch_stack *) regs) - 1; |
413 | 412 | ||
414 | child_ptregs = (struct pt_regs *) ((unsigned long) p + IA64_STK_OFFSET) - 1; | 413 | child_ptregs = (struct pt_regs *) ((unsigned long) p + IA64_STK_OFFSET) - 1; |
415 | child_stack = (struct switch_stack *) child_ptregs - 1; | 414 | child_stack = (struct switch_stack *) child_ptregs - 1; |
416 | 415 | ||
417 | /* copy parent's switch_stack & pt_regs to child: */ | 416 | /* copy parent's switch_stack & pt_regs to child: */ |
418 | memcpy(child_stack, stack, sizeof(*child_ptregs) + sizeof(*child_stack)); | 417 | memcpy(child_stack, stack, sizeof(*child_ptregs) + sizeof(*child_stack)); |
419 | 418 | ||
420 | rbs = (unsigned long) current + IA64_RBS_OFFSET; | 419 | rbs = (unsigned long) current + IA64_RBS_OFFSET; |
421 | child_rbs = (unsigned long) p + IA64_RBS_OFFSET; | 420 | child_rbs = (unsigned long) p + IA64_RBS_OFFSET; |
422 | rbs_size = stack->ar_bspstore - rbs; | 421 | rbs_size = stack->ar_bspstore - rbs; |
423 | 422 | ||
424 | /* copy the parent's register backing store to the child: */ | 423 | /* copy the parent's register backing store to the child: */ |
425 | memcpy((void *) child_rbs, (void *) rbs, rbs_size); | 424 | memcpy((void *) child_rbs, (void *) rbs, rbs_size); |
426 | 425 | ||
427 | if (likely(user_mode(child_ptregs))) { | 426 | if (likely(user_mode(child_ptregs))) { |
428 | if ((clone_flags & CLONE_SETTLS) && !IS_IA32_PROCESS(regs)) | 427 | if ((clone_flags & CLONE_SETTLS) && !IS_IA32_PROCESS(regs)) |
429 | child_ptregs->r13 = regs->r16; /* see sys_clone2() in entry.S */ | 428 | child_ptregs->r13 = regs->r16; /* see sys_clone2() in entry.S */ |
430 | if (user_stack_base) { | 429 | if (user_stack_base) { |
431 | child_ptregs->r12 = user_stack_base + user_stack_size - 16; | 430 | child_ptregs->r12 = user_stack_base + user_stack_size - 16; |
432 | child_ptregs->ar_bspstore = user_stack_base; | 431 | child_ptregs->ar_bspstore = user_stack_base; |
433 | child_ptregs->ar_rnat = 0; | 432 | child_ptregs->ar_rnat = 0; |
434 | child_ptregs->loadrs = 0; | 433 | child_ptregs->loadrs = 0; |
435 | } | 434 | } |
436 | } else { | 435 | } else { |
437 | /* | 436 | /* |
438 | * Note: we simply preserve the relative position of | 437 | * Note: we simply preserve the relative position of |
439 | * the stack pointer here. There is no need to | 438 | * the stack pointer here. There is no need to |
440 | * allocate a scratch area here, since that will have | 439 | * allocate a scratch area here, since that will have |
441 | * been taken care of by the caller of sys_clone() | 440 | * been taken care of by the caller of sys_clone() |
442 | * already. | 441 | * already. |
443 | */ | 442 | */ |
444 | child_ptregs->r12 = (unsigned long) child_ptregs - 16; /* kernel sp */ | 443 | child_ptregs->r12 = (unsigned long) child_ptregs - 16; /* kernel sp */ |
445 | child_ptregs->r13 = (unsigned long) p; /* set `current' pointer */ | 444 | child_ptregs->r13 = (unsigned long) p; /* set `current' pointer */ |
446 | } | 445 | } |
447 | child_stack->ar_bspstore = child_rbs + rbs_size; | 446 | child_stack->ar_bspstore = child_rbs + rbs_size; |
448 | if (IS_IA32_PROCESS(regs)) | 447 | if (IS_IA32_PROCESS(regs)) |
449 | child_stack->b0 = (unsigned long) &ia32_ret_from_clone; | 448 | child_stack->b0 = (unsigned long) &ia32_ret_from_clone; |
450 | else | 449 | else |
451 | child_stack->b0 = (unsigned long) &ia64_ret_from_clone; | 450 | child_stack->b0 = (unsigned long) &ia64_ret_from_clone; |
452 | 451 | ||
453 | /* copy parts of thread_struct: */ | 452 | /* copy parts of thread_struct: */ |
454 | p->thread.ksp = (unsigned long) child_stack - 16; | 453 | p->thread.ksp = (unsigned long) child_stack - 16; |
455 | 454 | ||
456 | /* stop some PSR bits from being inherited. | 455 | /* stop some PSR bits from being inherited. |
457 | * the psr.up/psr.pp bits must be cleared on fork but inherited on execve() | 456 | * the psr.up/psr.pp bits must be cleared on fork but inherited on execve() |
458 | * therefore we must specify them explicitly here and not include them in | 457 | * therefore we must specify them explicitly here and not include them in |
459 | * IA64_PSR_BITS_TO_CLEAR. | 458 | * IA64_PSR_BITS_TO_CLEAR. |
460 | */ | 459 | */ |
461 | child_ptregs->cr_ipsr = ((child_ptregs->cr_ipsr | IA64_PSR_BITS_TO_SET) | 460 | child_ptregs->cr_ipsr = ((child_ptregs->cr_ipsr | IA64_PSR_BITS_TO_SET) |
462 | & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_PP | IA64_PSR_UP)); | 461 | & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_PP | IA64_PSR_UP)); |
463 | 462 | ||
464 | /* | 463 | /* |
465 | * NOTE: The calling convention considers all floating point | 464 | * NOTE: The calling convention considers all floating point |
466 | * registers in the high partition (fph) to be scratch. Since | 465 | * registers in the high partition (fph) to be scratch. Since |
467 | * the only way to get to this point is through a system call, | 466 | * the only way to get to this point is through a system call, |
468 | * we know that the values in fph are all dead. Hence, there | 467 | * we know that the values in fph are all dead. Hence, there |
469 | * is no need to inherit the fph state from the parent to the | 468 | * is no need to inherit the fph state from the parent to the |
470 | * child and all we have to do is to make sure that | 469 | * child and all we have to do is to make sure that |
471 | * IA64_THREAD_FPH_VALID is cleared in the child. | 470 | * IA64_THREAD_FPH_VALID is cleared in the child. |
472 | * | 471 | * |
473 | * XXX We could push this optimization a bit further by | 472 | * XXX We could push this optimization a bit further by |
474 | * clearing IA64_THREAD_FPH_VALID on ANY system call. | 473 | * clearing IA64_THREAD_FPH_VALID on ANY system call. |
475 | * However, it's not clear this is worth doing. Also, it | 474 | * However, it's not clear this is worth doing. Also, it |
476 | * would be a slight deviation from the normal Linux system | 475 | * would be a slight deviation from the normal Linux system |
477 | * call behavior where scratch registers are preserved across | 476 | * call behavior where scratch registers are preserved across |
478 | * system calls (unless used by the system call itself). | 477 | * system calls (unless used by the system call itself). |
479 | */ | 478 | */ |
480 | # define THREAD_FLAGS_TO_CLEAR (IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID \ | 479 | # define THREAD_FLAGS_TO_CLEAR (IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID \ |
481 | | IA64_THREAD_PM_VALID) | 480 | | IA64_THREAD_PM_VALID) |
482 | # define THREAD_FLAGS_TO_SET 0 | 481 | # define THREAD_FLAGS_TO_SET 0 |
483 | p->thread.flags = ((current->thread.flags & ~THREAD_FLAGS_TO_CLEAR) | 482 | p->thread.flags = ((current->thread.flags & ~THREAD_FLAGS_TO_CLEAR) |
484 | | THREAD_FLAGS_TO_SET); | 483 | | THREAD_FLAGS_TO_SET); |
485 | ia64_drop_fpu(p); /* don't pick up stale state from a CPU's fph */ | 484 | ia64_drop_fpu(p); /* don't pick up stale state from a CPU's fph */ |
486 | #ifdef CONFIG_IA32_SUPPORT | 485 | #ifdef CONFIG_IA32_SUPPORT |
487 | /* | 486 | /* |
488 | * If we're cloning an IA32 task then save the IA32 extra | 487 | * If we're cloning an IA32 task then save the IA32 extra |
489 | * state from the current task to the new task | 488 | * state from the current task to the new task |
490 | */ | 489 | */ |
491 | if (IS_IA32_PROCESS(task_pt_regs(current))) { | 490 | if (IS_IA32_PROCESS(task_pt_regs(current))) { |
492 | ia32_save_state(p); | 491 | ia32_save_state(p); |
493 | if (clone_flags & CLONE_SETTLS) | 492 | if (clone_flags & CLONE_SETTLS) |
494 | retval = ia32_clone_tls(p, child_ptregs); | 493 | retval = ia32_clone_tls(p, child_ptregs); |
495 | 494 | ||
496 | /* Copy partially mapped page list */ | 495 | /* Copy partially mapped page list */ |
497 | if (!retval) | 496 | if (!retval) |
498 | retval = ia32_copy_partial_page_list(p, clone_flags); | 497 | retval = ia32_copy_partial_page_list(p, clone_flags); |
499 | } | 498 | } |
500 | #endif | 499 | #endif |
501 | 500 | ||
502 | #ifdef CONFIG_PERFMON | 501 | #ifdef CONFIG_PERFMON |
503 | if (current->thread.pfm_context) | 502 | if (current->thread.pfm_context) |
504 | pfm_inherit(p, child_ptregs); | 503 | pfm_inherit(p, child_ptregs); |
505 | #endif | 504 | #endif |
506 | return retval; | 505 | return retval; |
507 | } | 506 | } |
508 | 507 | ||
509 | static void | 508 | static void |
510 | do_copy_task_regs (struct task_struct *task, struct unw_frame_info *info, void *arg) | 509 | do_copy_task_regs (struct task_struct *task, struct unw_frame_info *info, void *arg) |
511 | { | 510 | { |
512 | unsigned long mask, sp, nat_bits = 0, ip, ar_rnat, urbs_end, cfm; | 511 | unsigned long mask, sp, nat_bits = 0, ip, ar_rnat, urbs_end, cfm; |
513 | elf_greg_t *dst = arg; | 512 | elf_greg_t *dst = arg; |
514 | struct pt_regs *pt; | 513 | struct pt_regs *pt; |
515 | char nat; | 514 | char nat; |
516 | int i; | 515 | int i; |
517 | 516 | ||
518 | memset(dst, 0, sizeof(elf_gregset_t)); /* don't leak any kernel bits to user-level */ | 517 | memset(dst, 0, sizeof(elf_gregset_t)); /* don't leak any kernel bits to user-level */ |
519 | 518 | ||
520 | if (unw_unwind_to_user(info) < 0) | 519 | if (unw_unwind_to_user(info) < 0) |
521 | return; | 520 | return; |
522 | 521 | ||
523 | unw_get_sp(info, &sp); | 522 | unw_get_sp(info, &sp); |
524 | pt = (struct pt_regs *) (sp + 16); | 523 | pt = (struct pt_regs *) (sp + 16); |
525 | 524 | ||
526 | urbs_end = ia64_get_user_rbs_end(task, pt, &cfm); | 525 | urbs_end = ia64_get_user_rbs_end(task, pt, &cfm); |
527 | 526 | ||
528 | if (ia64_sync_user_rbs(task, info->sw, pt->ar_bspstore, urbs_end) < 0) | 527 | if (ia64_sync_user_rbs(task, info->sw, pt->ar_bspstore, urbs_end) < 0) |
529 | return; | 528 | return; |
530 | 529 | ||
531 | ia64_peek(task, info->sw, urbs_end, (long) ia64_rse_rnat_addr((long *) urbs_end), | 530 | ia64_peek(task, info->sw, urbs_end, (long) ia64_rse_rnat_addr((long *) urbs_end), |
532 | &ar_rnat); | 531 | &ar_rnat); |
533 | 532 | ||
534 | /* | 533 | /* |
535 | * coredump format: | 534 | * coredump format: |
536 | * r0-r31 | 535 | * r0-r31 |
537 | * NaT bits (for r0-r31; bit N == 1 iff rN is a NaT) | 536 | * NaT bits (for r0-r31; bit N == 1 iff rN is a NaT) |
538 | * predicate registers (p0-p63) | 537 | * predicate registers (p0-p63) |
539 | * b0-b7 | 538 | * b0-b7 |
540 | * ip cfm user-mask | 539 | * ip cfm user-mask |
541 | * ar.rsc ar.bsp ar.bspstore ar.rnat | 540 | * ar.rsc ar.bsp ar.bspstore ar.rnat |
542 | * ar.ccv ar.unat ar.fpsr ar.pfs ar.lc ar.ec | 541 | * ar.ccv ar.unat ar.fpsr ar.pfs ar.lc ar.ec |
543 | */ | 542 | */ |
544 | 543 | ||
545 | /* r0 is zero */ | 544 | /* r0 is zero */ |
546 | for (i = 1, mask = (1UL << i); i < 32; ++i) { | 545 | for (i = 1, mask = (1UL << i); i < 32; ++i) { |
547 | unw_get_gr(info, i, &dst[i], &nat); | 546 | unw_get_gr(info, i, &dst[i], &nat); |
548 | if (nat) | 547 | if (nat) |
549 | nat_bits |= mask; | 548 | nat_bits |= mask; |
550 | mask <<= 1; | 549 | mask <<= 1; |
551 | } | 550 | } |
552 | dst[32] = nat_bits; | 551 | dst[32] = nat_bits; |
553 | unw_get_pr(info, &dst[33]); | 552 | unw_get_pr(info, &dst[33]); |
554 | 553 | ||
555 | for (i = 0; i < 8; ++i) | 554 | for (i = 0; i < 8; ++i) |
556 | unw_get_br(info, i, &dst[34 + i]); | 555 | unw_get_br(info, i, &dst[34 + i]); |
557 | 556 | ||
558 | unw_get_rp(info, &ip); | 557 | unw_get_rp(info, &ip); |
559 | dst[42] = ip + ia64_psr(pt)->ri; | 558 | dst[42] = ip + ia64_psr(pt)->ri; |
560 | dst[43] = cfm; | 559 | dst[43] = cfm; |
561 | dst[44] = pt->cr_ipsr & IA64_PSR_UM; | 560 | dst[44] = pt->cr_ipsr & IA64_PSR_UM; |
562 | 561 | ||
563 | unw_get_ar(info, UNW_AR_RSC, &dst[45]); | 562 | unw_get_ar(info, UNW_AR_RSC, &dst[45]); |
564 | /* | 563 | /* |
565 | * For bsp and bspstore, unw_get_ar() would return the kernel | 564 | * For bsp and bspstore, unw_get_ar() would return the kernel |
566 | * addresses, but we need the user-level addresses instead: | 565 | * addresses, but we need the user-level addresses instead: |
567 | */ | 566 | */ |
568 | dst[46] = urbs_end; /* note: by convention PT_AR_BSP points to the end of the urbs! */ | 567 | dst[46] = urbs_end; /* note: by convention PT_AR_BSP points to the end of the urbs! */ |
569 | dst[47] = pt->ar_bspstore; | 568 | dst[47] = pt->ar_bspstore; |
570 | dst[48] = ar_rnat; | 569 | dst[48] = ar_rnat; |
571 | unw_get_ar(info, UNW_AR_CCV, &dst[49]); | 570 | unw_get_ar(info, UNW_AR_CCV, &dst[49]); |
572 | unw_get_ar(info, UNW_AR_UNAT, &dst[50]); | 571 | unw_get_ar(info, UNW_AR_UNAT, &dst[50]); |
573 | unw_get_ar(info, UNW_AR_FPSR, &dst[51]); | 572 | unw_get_ar(info, UNW_AR_FPSR, &dst[51]); |
574 | dst[52] = pt->ar_pfs; /* UNW_AR_PFS is == to pt->cr_ifs for interrupt frames */ | 573 | dst[52] = pt->ar_pfs; /* UNW_AR_PFS is == to pt->cr_ifs for interrupt frames */ |
575 | unw_get_ar(info, UNW_AR_LC, &dst[53]); | 574 | unw_get_ar(info, UNW_AR_LC, &dst[53]); |
576 | unw_get_ar(info, UNW_AR_EC, &dst[54]); | 575 | unw_get_ar(info, UNW_AR_EC, &dst[54]); |
577 | unw_get_ar(info, UNW_AR_CSD, &dst[55]); | 576 | unw_get_ar(info, UNW_AR_CSD, &dst[55]); |
578 | unw_get_ar(info, UNW_AR_SSD, &dst[56]); | 577 | unw_get_ar(info, UNW_AR_SSD, &dst[56]); |
579 | } | 578 | } |
580 | 579 | ||
581 | void | 580 | void |
582 | do_dump_task_fpu (struct task_struct *task, struct unw_frame_info *info, void *arg) | 581 | do_dump_task_fpu (struct task_struct *task, struct unw_frame_info *info, void *arg) |
583 | { | 582 | { |
584 | elf_fpreg_t *dst = arg; | 583 | elf_fpreg_t *dst = arg; |
585 | int i; | 584 | int i; |
586 | 585 | ||
587 | memset(dst, 0, sizeof(elf_fpregset_t)); /* don't leak any "random" bits */ | 586 | memset(dst, 0, sizeof(elf_fpregset_t)); /* don't leak any "random" bits */ |
588 | 587 | ||
589 | if (unw_unwind_to_user(info) < 0) | 588 | if (unw_unwind_to_user(info) < 0) |
590 | return; | 589 | return; |
591 | 590 | ||
592 | /* f0 is 0.0, f1 is 1.0 */ | 591 | /* f0 is 0.0, f1 is 1.0 */ |
593 | 592 | ||
594 | for (i = 2; i < 32; ++i) | 593 | for (i = 2; i < 32; ++i) |
595 | unw_get_fr(info, i, dst + i); | 594 | unw_get_fr(info, i, dst + i); |
596 | 595 | ||
597 | ia64_flush_fph(task); | 596 | ia64_flush_fph(task); |
598 | if ((task->thread.flags & IA64_THREAD_FPH_VALID) != 0) | 597 | if ((task->thread.flags & IA64_THREAD_FPH_VALID) != 0) |
599 | memcpy(dst + 32, task->thread.fph, 96*16); | 598 | memcpy(dst + 32, task->thread.fph, 96*16); |
600 | } | 599 | } |
601 | 600 | ||
602 | void | 601 | void |
603 | do_copy_regs (struct unw_frame_info *info, void *arg) | 602 | do_copy_regs (struct unw_frame_info *info, void *arg) |
604 | { | 603 | { |
605 | do_copy_task_regs(current, info, arg); | 604 | do_copy_task_regs(current, info, arg); |
606 | } | 605 | } |
607 | 606 | ||
608 | void | 607 | void |
609 | do_dump_fpu (struct unw_frame_info *info, void *arg) | 608 | do_dump_fpu (struct unw_frame_info *info, void *arg) |
610 | { | 609 | { |
611 | do_dump_task_fpu(current, info, arg); | 610 | do_dump_task_fpu(current, info, arg); |
612 | } | 611 | } |
613 | 612 | ||
614 | int | 613 | int |
615 | dump_task_regs(struct task_struct *task, elf_gregset_t *regs) | 614 | dump_task_regs(struct task_struct *task, elf_gregset_t *regs) |
616 | { | 615 | { |
617 | struct unw_frame_info tcore_info; | 616 | struct unw_frame_info tcore_info; |
618 | 617 | ||
619 | if (current == task) { | 618 | if (current == task) { |
620 | unw_init_running(do_copy_regs, regs); | 619 | unw_init_running(do_copy_regs, regs); |
621 | } else { | 620 | } else { |
622 | memset(&tcore_info, 0, sizeof(tcore_info)); | 621 | memset(&tcore_info, 0, sizeof(tcore_info)); |
623 | unw_init_from_blocked_task(&tcore_info, task); | 622 | unw_init_from_blocked_task(&tcore_info, task); |
624 | do_copy_task_regs(task, &tcore_info, regs); | 623 | do_copy_task_regs(task, &tcore_info, regs); |
625 | } | 624 | } |
626 | return 1; | 625 | return 1; |
627 | } | 626 | } |
628 | 627 | ||
629 | void | 628 | void |
630 | ia64_elf_core_copy_regs (struct pt_regs *pt, elf_gregset_t dst) | 629 | ia64_elf_core_copy_regs (struct pt_regs *pt, elf_gregset_t dst) |
631 | { | 630 | { |
632 | unw_init_running(do_copy_regs, dst); | 631 | unw_init_running(do_copy_regs, dst); |
633 | } | 632 | } |
634 | 633 | ||
635 | int | 634 | int |
636 | dump_task_fpu (struct task_struct *task, elf_fpregset_t *dst) | 635 | dump_task_fpu (struct task_struct *task, elf_fpregset_t *dst) |
637 | { | 636 | { |
638 | struct unw_frame_info tcore_info; | 637 | struct unw_frame_info tcore_info; |
639 | 638 | ||
640 | if (current == task) { | 639 | if (current == task) { |
641 | unw_init_running(do_dump_fpu, dst); | 640 | unw_init_running(do_dump_fpu, dst); |
642 | } else { | 641 | } else { |
643 | memset(&tcore_info, 0, sizeof(tcore_info)); | 642 | memset(&tcore_info, 0, sizeof(tcore_info)); |
644 | unw_init_from_blocked_task(&tcore_info, task); | 643 | unw_init_from_blocked_task(&tcore_info, task); |
645 | do_dump_task_fpu(task, &tcore_info, dst); | 644 | do_dump_task_fpu(task, &tcore_info, dst); |
646 | } | 645 | } |
647 | return 1; | 646 | return 1; |
648 | } | 647 | } |
649 | 648 | ||
650 | int | 649 | int |
651 | dump_fpu (struct pt_regs *pt, elf_fpregset_t dst) | 650 | dump_fpu (struct pt_regs *pt, elf_fpregset_t dst) |
652 | { | 651 | { |
653 | unw_init_running(do_dump_fpu, dst); | 652 | unw_init_running(do_dump_fpu, dst); |
654 | return 1; /* f0-f31 are always valid so we always return 1 */ | 653 | return 1; /* f0-f31 are always valid so we always return 1 */ |
655 | } | 654 | } |
656 | 655 | ||
657 | long | 656 | long |
658 | sys_execve (char __user *filename, char __user * __user *argv, char __user * __user *envp, | 657 | sys_execve (char __user *filename, char __user * __user *argv, char __user * __user *envp, |
659 | struct pt_regs *regs) | 658 | struct pt_regs *regs) |
660 | { | 659 | { |
661 | char *fname; | 660 | char *fname; |
662 | int error; | 661 | int error; |
663 | 662 | ||
664 | fname = getname(filename); | 663 | fname = getname(filename); |
665 | error = PTR_ERR(fname); | 664 | error = PTR_ERR(fname); |
666 | if (IS_ERR(fname)) | 665 | if (IS_ERR(fname)) |
667 | goto out; | 666 | goto out; |
668 | error = do_execve(fname, argv, envp, regs); | 667 | error = do_execve(fname, argv, envp, regs); |
669 | putname(fname); | 668 | putname(fname); |
670 | out: | 669 | out: |
671 | return error; | 670 | return error; |
672 | } | 671 | } |
673 | 672 | ||
674 | pid_t | 673 | pid_t |
675 | kernel_thread (int (*fn)(void *), void *arg, unsigned long flags) | 674 | kernel_thread (int (*fn)(void *), void *arg, unsigned long flags) |
676 | { | 675 | { |
677 | extern void start_kernel_thread (void); | 676 | extern void start_kernel_thread (void); |
678 | unsigned long *helper_fptr = (unsigned long *) &start_kernel_thread; | 677 | unsigned long *helper_fptr = (unsigned long *) &start_kernel_thread; |
679 | struct { | 678 | struct { |
680 | struct switch_stack sw; | 679 | struct switch_stack sw; |
681 | struct pt_regs pt; | 680 | struct pt_regs pt; |
682 | } regs; | 681 | } regs; |
683 | 682 | ||
684 | memset(®s, 0, sizeof(regs)); | 683 | memset(®s, 0, sizeof(regs)); |
685 | regs.pt.cr_iip = helper_fptr[0]; /* set entry point (IP) */ | 684 | regs.pt.cr_iip = helper_fptr[0]; /* set entry point (IP) */ |
686 | regs.pt.r1 = helper_fptr[1]; /* set GP */ | 685 | regs.pt.r1 = helper_fptr[1]; /* set GP */ |
687 | regs.pt.r9 = (unsigned long) fn; /* 1st argument */ | 686 | regs.pt.r9 = (unsigned long) fn; /* 1st argument */ |
688 | regs.pt.r11 = (unsigned long) arg; /* 2nd argument */ | 687 | regs.pt.r11 = (unsigned long) arg; /* 2nd argument */ |
689 | /* Preserve PSR bits, except for bits 32-34 and 37-45, which we can't read. */ | 688 | /* Preserve PSR bits, except for bits 32-34 and 37-45, which we can't read. */ |
690 | regs.pt.cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN; | 689 | regs.pt.cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN; |
691 | regs.pt.cr_ifs = 1UL << 63; /* mark as valid, empty frame */ | 690 | regs.pt.cr_ifs = 1UL << 63; /* mark as valid, empty frame */ |
692 | regs.sw.ar_fpsr = regs.pt.ar_fpsr = ia64_getreg(_IA64_REG_AR_FPSR); | 691 | regs.sw.ar_fpsr = regs.pt.ar_fpsr = ia64_getreg(_IA64_REG_AR_FPSR); |
693 | regs.sw.ar_bspstore = (unsigned long) current + IA64_RBS_OFFSET; | 692 | regs.sw.ar_bspstore = (unsigned long) current + IA64_RBS_OFFSET; |
694 | regs.sw.pr = (1 << PRED_KERNEL_STACK); | 693 | regs.sw.pr = (1 << PRED_KERNEL_STACK); |
695 | return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s.pt, 0, NULL, NULL); | 694 | return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s.pt, 0, NULL, NULL); |
696 | } | 695 | } |
697 | EXPORT_SYMBOL(kernel_thread); | 696 | EXPORT_SYMBOL(kernel_thread); |
698 | 697 | ||
699 | /* This gets called from kernel_thread() via ia64_invoke_thread_helper(). */ | 698 | /* This gets called from kernel_thread() via ia64_invoke_thread_helper(). */ |
700 | int | 699 | int |
701 | kernel_thread_helper (int (*fn)(void *), void *arg) | 700 | kernel_thread_helper (int (*fn)(void *), void *arg) |
702 | { | 701 | { |
703 | #ifdef CONFIG_IA32_SUPPORT | 702 | #ifdef CONFIG_IA32_SUPPORT |
704 | if (IS_IA32_PROCESS(task_pt_regs(current))) { | 703 | if (IS_IA32_PROCESS(task_pt_regs(current))) { |
705 | /* A kernel thread is always a 64-bit process. */ | 704 | /* A kernel thread is always a 64-bit process. */ |
706 | current->thread.map_base = DEFAULT_MAP_BASE; | 705 | current->thread.map_base = DEFAULT_MAP_BASE; |
707 | current->thread.task_size = DEFAULT_TASK_SIZE; | 706 | current->thread.task_size = DEFAULT_TASK_SIZE; |
708 | ia64_set_kr(IA64_KR_IO_BASE, current->thread.old_iob); | 707 | ia64_set_kr(IA64_KR_IO_BASE, current->thread.old_iob); |
709 | ia64_set_kr(IA64_KR_TSSD, current->thread.old_k1); | 708 | ia64_set_kr(IA64_KR_TSSD, current->thread.old_k1); |
710 | } | 709 | } |
711 | #endif | 710 | #endif |
712 | return (*fn)(arg); | 711 | return (*fn)(arg); |
713 | } | 712 | } |
714 | 713 | ||
715 | /* | 714 | /* |
716 | * Flush thread state. This is called when a thread does an execve(). | 715 | * Flush thread state. This is called when a thread does an execve(). |
717 | */ | 716 | */ |
718 | void | 717 | void |
719 | flush_thread (void) | 718 | flush_thread (void) |
720 | { | 719 | { |
721 | /* drop floating-point and debug-register state if it exists: */ | 720 | /* drop floating-point and debug-register state if it exists: */ |
722 | current->thread.flags &= ~(IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID); | 721 | current->thread.flags &= ~(IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID); |
723 | ia64_drop_fpu(current); | 722 | ia64_drop_fpu(current); |
724 | #ifdef CONFIG_IA32_SUPPORT | 723 | #ifdef CONFIG_IA32_SUPPORT |
725 | if (IS_IA32_PROCESS(task_pt_regs(current))) { | 724 | if (IS_IA32_PROCESS(task_pt_regs(current))) { |
726 | ia32_drop_partial_page_list(current); | 725 | ia32_drop_partial_page_list(current); |
727 | current->thread.task_size = IA32_PAGE_OFFSET; | 726 | current->thread.task_size = IA32_PAGE_OFFSET; |
728 | set_fs(USER_DS); | 727 | set_fs(USER_DS); |
729 | } | 728 | } |
730 | #endif | 729 | #endif |
731 | } | 730 | } |
732 | 731 | ||
733 | /* | 732 | /* |
734 | * Clean up state associated with current thread. This is called when | 733 | * Clean up state associated with current thread. This is called when |
735 | * the thread calls exit(). | 734 | * the thread calls exit(). |
736 | */ | 735 | */ |
737 | void | 736 | void |
738 | exit_thread (void) | 737 | exit_thread (void) |
739 | { | 738 | { |
740 | |||
741 | /* | ||
742 | * Remove function-return probe instances associated with this task | ||
743 | * and put them back on the free list. Do not insert an exit probe for | ||
744 | * this function, it will be disabled by kprobe_flush_task if you do. | ||
745 | */ | ||
746 | kprobe_flush_task(current); | ||
747 | 739 | ||
748 | ia64_drop_fpu(current); | 740 | ia64_drop_fpu(current); |
749 | #ifdef CONFIG_PERFMON | 741 | #ifdef CONFIG_PERFMON |
750 | /* if needed, stop monitoring and flush state to perfmon context */ | 742 | /* if needed, stop monitoring and flush state to perfmon context */ |
751 | if (current->thread.pfm_context) | 743 | if (current->thread.pfm_context) |
752 | pfm_exit_thread(current); | 744 | pfm_exit_thread(current); |
753 | 745 | ||
754 | /* free debug register resources */ | 746 | /* free debug register resources */ |
755 | if (current->thread.flags & IA64_THREAD_DBG_VALID) | 747 | if (current->thread.flags & IA64_THREAD_DBG_VALID) |
756 | pfm_release_debug_registers(current); | 748 | pfm_release_debug_registers(current); |
757 | #endif | 749 | #endif |
758 | if (IS_IA32_PROCESS(task_pt_regs(current))) | 750 | if (IS_IA32_PROCESS(task_pt_regs(current))) |
759 | ia32_drop_partial_page_list(current); | 751 | ia32_drop_partial_page_list(current); |
760 | } | 752 | } |
761 | 753 | ||
762 | unsigned long | 754 | unsigned long |
763 | get_wchan (struct task_struct *p) | 755 | get_wchan (struct task_struct *p) |
764 | { | 756 | { |
765 | struct unw_frame_info info; | 757 | struct unw_frame_info info; |
766 | unsigned long ip; | 758 | unsigned long ip; |
767 | int count = 0; | 759 | int count = 0; |
768 | 760 | ||
769 | /* | 761 | /* |
770 | * Note: p may not be a blocked task (it could be current or | 762 | * Note: p may not be a blocked task (it could be current or |
771 | * another process running on some other CPU). Rather than | 763 | * another process running on some other CPU). Rather than |
772 | * trying to determine if p is really blocked, we just assume | 764 | * trying to determine if p is really blocked, we just assume |
773 | * it's blocked and rely on the unwind routines to fail | 765 | * it's blocked and rely on the unwind routines to fail |
774 | * gracefully if the process wasn't really blocked after all. | 766 | * gracefully if the process wasn't really blocked after all. |
775 | * --davidm 99/12/15 | 767 | * --davidm 99/12/15 |
776 | */ | 768 | */ |
777 | unw_init_from_blocked_task(&info, p); | 769 | unw_init_from_blocked_task(&info, p); |
778 | do { | 770 | do { |
779 | if (unw_unwind(&info) < 0) | 771 | if (unw_unwind(&info) < 0) |
780 | return 0; | 772 | return 0; |
781 | unw_get_ip(&info, &ip); | 773 | unw_get_ip(&info, &ip); |
782 | if (!in_sched_functions(ip)) | 774 | if (!in_sched_functions(ip)) |
783 | return ip; | 775 | return ip; |
784 | } while (count++ < 16); | 776 | } while (count++ < 16); |
785 | return 0; | 777 | return 0; |
786 | } | 778 | } |
787 | 779 | ||
788 | void | 780 | void |
789 | cpu_halt (void) | 781 | cpu_halt (void) |
790 | { | 782 | { |
791 | pal_power_mgmt_info_u_t power_info[8]; | 783 | pal_power_mgmt_info_u_t power_info[8]; |
792 | unsigned long min_power; | 784 | unsigned long min_power; |
793 | int i, min_power_state; | 785 | int i, min_power_state; |
794 | 786 | ||
795 | if (ia64_pal_halt_info(power_info) != 0) | 787 | if (ia64_pal_halt_info(power_info) != 0) |
796 | return; | 788 | return; |
797 | 789 | ||
798 | min_power_state = 0; | 790 | min_power_state = 0; |
799 | min_power = power_info[0].pal_power_mgmt_info_s.power_consumption; | 791 | min_power = power_info[0].pal_power_mgmt_info_s.power_consumption; |
800 | for (i = 1; i < 8; ++i) | 792 | for (i = 1; i < 8; ++i) |
801 | if (power_info[i].pal_power_mgmt_info_s.im | 793 | if (power_info[i].pal_power_mgmt_info_s.im |
802 | && power_info[i].pal_power_mgmt_info_s.power_consumption < min_power) { | 794 | && power_info[i].pal_power_mgmt_info_s.power_consumption < min_power) { |
803 | min_power = power_info[i].pal_power_mgmt_info_s.power_consumption; | 795 | min_power = power_info[i].pal_power_mgmt_info_s.power_consumption; |
804 | min_power_state = i; | 796 | min_power_state = i; |
805 | } | 797 | } |
806 | 798 | ||
807 | while (1) | 799 | while (1) |
808 | ia64_pal_halt(min_power_state); | 800 | ia64_pal_halt(min_power_state); |
809 | } | 801 | } |
810 | 802 | ||
811 | void | 803 | void |
812 | machine_restart (char *restart_cmd) | 804 | machine_restart (char *restart_cmd) |
813 | { | 805 | { |
814 | (void) notify_die(DIE_MACHINE_RESTART, restart_cmd, NULL, 0, 0, 0); | 806 | (void) notify_die(DIE_MACHINE_RESTART, restart_cmd, NULL, 0, 0, 0); |
815 | (*efi.reset_system)(EFI_RESET_WARM, 0, 0, NULL); | 807 | (*efi.reset_system)(EFI_RESET_WARM, 0, 0, NULL); |
816 | } | 808 | } |
817 | 809 | ||
818 | void | 810 | void |
819 | machine_halt (void) | 811 | machine_halt (void) |
820 | { | 812 | { |
821 | (void) notify_die(DIE_MACHINE_HALT, "", NULL, 0, 0, 0); | 813 | (void) notify_die(DIE_MACHINE_HALT, "", NULL, 0, 0, 0); |
822 | cpu_halt(); | 814 | cpu_halt(); |
823 | } | 815 | } |
824 | 816 | ||
825 | void | 817 | void |
826 | machine_power_off (void) | 818 | machine_power_off (void) |
827 | { | 819 | { |
828 | if (pm_power_off) | 820 | if (pm_power_off) |
829 | pm_power_off(); | 821 | pm_power_off(); |
830 | machine_halt(); | 822 | machine_halt(); |
831 | } | 823 | } |
832 | 824 | ||
833 | 825 |
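
The hunk removed from exit_thread() above (the comment block and the kprobe_flush_task() call) described detaching the function-return probe instances of an exiting task and putting them back on the free list. The sketch below is a minimal, hypothetical userspace illustration of that free-list bookkeeping only; struct instance, used_list/free_list and recycle_task_instances() are invented for the example and are not the kernel's kretprobe structures, API or locking.

/*
 * Toy model of "put this task's instances back on the free list".
 * Everything here is hypothetical and single-threaded.
 */
#include <stdio.h>

struct instance {
	struct instance *next;
	int task_id;                  /* which task this instance is tracking */
};

static struct instance *free_list;    /* recycled instances */
static struct instance *used_list;    /* instances attached to live tasks */

/* Unlink every instance belonging to task_id and push it onto free_list. */
static void recycle_task_instances(int task_id)
{
	struct instance **pp = &used_list;

	while (*pp) {
		struct instance *ri = *pp;

		if (ri->task_id == task_id) {
			*pp = ri->next;        /* unlink from used_list */
			ri->next = free_list;  /* push onto free_list   */
			free_list = ri;
		} else {
			pp = &ri->next;
		}
	}
}

int main(void)
{
	/* two hypothetical instances attached to task 42, one to task 7 */
	struct instance a = { NULL, 42 }, b = { NULL, 7 }, c = { NULL, 42 };

	a.next = &b; b.next = &c; used_list = &a;

	recycle_task_instances(42);

	int n = 0;
	for (struct instance *ri = free_list; ri; ri = ri->next)
		n++;
	printf("recycled %d instances for task 42\n", n);   /* prints 2 */
	return 0;
}
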
arch/powerpc/kernel/process.c
1 | /* | 1 | /* |
2 | * Derived from "arch/i386/kernel/process.c" | 2 | * Derived from "arch/i386/kernel/process.c" |
3 | * Copyright (C) 1995 Linus Torvalds | 3 | * Copyright (C) 1995 Linus Torvalds |
4 | * | 4 | * |
5 | * Updated and modified by Cort Dougan (cort@cs.nmt.edu) and | 5 | * Updated and modified by Cort Dougan (cort@cs.nmt.edu) and |
6 | * Paul Mackerras (paulus@cs.anu.edu.au) | 6 | * Paul Mackerras (paulus@cs.anu.edu.au) |
7 | * | 7 | * |
8 | * PowerPC version | 8 | * PowerPC version |
9 | * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) | 9 | * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) |
10 | * | 10 | * |
11 | * This program is free software; you can redistribute it and/or | 11 | * This program is free software; you can redistribute it and/or |
12 | * modify it under the terms of the GNU General Public License | 12 | * modify it under the terms of the GNU General Public License |
13 | * as published by the Free Software Foundation; either version | 13 | * as published by the Free Software Foundation; either version |
14 | * 2 of the License, or (at your option) any later version. | 14 | * 2 of the License, or (at your option) any later version. |
15 | */ | 15 | */ |
16 | 16 | ||
17 | #include <linux/config.h> | 17 | #include <linux/config.h> |
18 | #include <linux/errno.h> | 18 | #include <linux/errno.h> |
19 | #include <linux/sched.h> | 19 | #include <linux/sched.h> |
20 | #include <linux/kernel.h> | 20 | #include <linux/kernel.h> |
21 | #include <linux/mm.h> | 21 | #include <linux/mm.h> |
22 | #include <linux/smp.h> | 22 | #include <linux/smp.h> |
23 | #include <linux/smp_lock.h> | 23 | #include <linux/smp_lock.h> |
24 | #include <linux/stddef.h> | 24 | #include <linux/stddef.h> |
25 | #include <linux/unistd.h> | 25 | #include <linux/unistd.h> |
26 | #include <linux/ptrace.h> | 26 | #include <linux/ptrace.h> |
27 | #include <linux/slab.h> | 27 | #include <linux/slab.h> |
28 | #include <linux/user.h> | 28 | #include <linux/user.h> |
29 | #include <linux/elf.h> | 29 | #include <linux/elf.h> |
30 | #include <linux/init.h> | 30 | #include <linux/init.h> |
31 | #include <linux/prctl.h> | 31 | #include <linux/prctl.h> |
32 | #include <linux/init_task.h> | 32 | #include <linux/init_task.h> |
33 | #include <linux/module.h> | 33 | #include <linux/module.h> |
34 | #include <linux/kallsyms.h> | 34 | #include <linux/kallsyms.h> |
35 | #include <linux/mqueue.h> | 35 | #include <linux/mqueue.h> |
36 | #include <linux/hardirq.h> | 36 | #include <linux/hardirq.h> |
37 | #include <linux/utsname.h> | 37 | #include <linux/utsname.h> |
38 | #include <linux/kprobes.h> | ||
39 | 38 | ||
40 | #include <asm/pgtable.h> | 39 | #include <asm/pgtable.h> |
41 | #include <asm/uaccess.h> | 40 | #include <asm/uaccess.h> |
42 | #include <asm/system.h> | 41 | #include <asm/system.h> |
43 | #include <asm/io.h> | 42 | #include <asm/io.h> |
44 | #include <asm/processor.h> | 43 | #include <asm/processor.h> |
45 | #include <asm/mmu.h> | 44 | #include <asm/mmu.h> |
46 | #include <asm/prom.h> | 45 | #include <asm/prom.h> |
47 | #include <asm/machdep.h> | 46 | #include <asm/machdep.h> |
48 | #include <asm/time.h> | 47 | #include <asm/time.h> |
49 | #ifdef CONFIG_PPC64 | 48 | #ifdef CONFIG_PPC64 |
50 | #include <asm/firmware.h> | 49 | #include <asm/firmware.h> |
51 | #endif | 50 | #endif |
52 | 51 | ||
53 | extern unsigned long _get_SP(void); | 52 | extern unsigned long _get_SP(void); |
54 | 53 | ||
55 | #ifndef CONFIG_SMP | 54 | #ifndef CONFIG_SMP |
56 | struct task_struct *last_task_used_math = NULL; | 55 | struct task_struct *last_task_used_math = NULL; |
57 | struct task_struct *last_task_used_altivec = NULL; | 56 | struct task_struct *last_task_used_altivec = NULL; |
58 | struct task_struct *last_task_used_spe = NULL; | 57 | struct task_struct *last_task_used_spe = NULL; |
59 | #endif | 58 | #endif |
60 | 59 | ||
61 | /* | 60 | /* |
62 | * Make sure the floating-point register state in the | 61 | * Make sure the floating-point register state in the |
63 | * thread_struct is up to date for task tsk. | 62 | * thread_struct is up to date for task tsk. |
64 | */ | 63 | */ |
65 | void flush_fp_to_thread(struct task_struct *tsk) | 64 | void flush_fp_to_thread(struct task_struct *tsk) |
66 | { | 65 | { |
67 | if (tsk->thread.regs) { | 66 | if (tsk->thread.regs) { |
68 | /* | 67 | /* |
69 | * We need to disable preemption here because if we didn't, | 68 | * We need to disable preemption here because if we didn't, |
70 | * another process could get scheduled after the regs->msr | 69 | * another process could get scheduled after the regs->msr |
71 | * test but before we have finished saving the FP registers | 70 | * test but before we have finished saving the FP registers |
72 | * to the thread_struct. That process could take over the | 71 | * to the thread_struct. That process could take over the |
73 | * FPU, and then when we get scheduled again we would store | 72 | * FPU, and then when we get scheduled again we would store |
74 | * bogus values for the remaining FP registers. | 73 | * bogus values for the remaining FP registers. |
75 | */ | 74 | */ |
76 | preempt_disable(); | 75 | preempt_disable(); |
77 | if (tsk->thread.regs->msr & MSR_FP) { | 76 | if (tsk->thread.regs->msr & MSR_FP) { |
78 | #ifdef CONFIG_SMP | 77 | #ifdef CONFIG_SMP |
79 | /* | 78 | /* |
80 | * This should only ever be called for current or | 79 | * This should only ever be called for current or |
81 | * for a stopped child process. Since we save away | 80 | * for a stopped child process. Since we save away |
82 | * the FP register state on context switch on SMP, | 81 | * the FP register state on context switch on SMP, |
83 | * there is something wrong if a stopped child appears | 82 | * there is something wrong if a stopped child appears |
84 | * to still have its FP state in the CPU registers. | 83 | * to still have its FP state in the CPU registers. |
85 | */ | 84 | */ |
86 | BUG_ON(tsk != current); | 85 | BUG_ON(tsk != current); |
87 | #endif | 86 | #endif |
88 | giveup_fpu(current); | 87 | giveup_fpu(current); |
89 | } | 88 | } |
90 | preempt_enable(); | 89 | preempt_enable(); |
91 | } | 90 | } |
92 | } | 91 | } |
93 | 92 | ||
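[Editor's note, not part of the diff] The comment inside flush_fp_to_thread() above describes the preemption window rather than showing it. As a hedged illustration only, the ordering the preempt_disable()/preempt_enable() pair guards against would look roughly like this:

	/* Illustration only -- the race flush_fp_to_thread() must avoid.
	 * Without preempt_disable(), a context switch can land between the
	 * MSR_FP test and giveup_fpu(); the next task may take over the FPU,
	 * so the state later saved into the thread_struct would be stale.
	 */
	if (tsk->thread.regs->msr & MSR_FP)	/* test ... */
		/* <-- preempted here: another task grabs the FPU */
		giveup_fpu(current);		/* ... save happens too late */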
94 | void enable_kernel_fp(void) | 93 | void enable_kernel_fp(void) |
95 | { | 94 | { |
96 | WARN_ON(preemptible()); | 95 | WARN_ON(preemptible()); |
97 | 96 | ||
98 | #ifdef CONFIG_SMP | 97 | #ifdef CONFIG_SMP |
99 | if (current->thread.regs && (current->thread.regs->msr & MSR_FP)) | 98 | if (current->thread.regs && (current->thread.regs->msr & MSR_FP)) |
100 | giveup_fpu(current); | 99 | giveup_fpu(current); |
101 | else | 100 | else |
102 | giveup_fpu(NULL); /* just enables FP for kernel */ | 101 | giveup_fpu(NULL); /* just enables FP for kernel */ |
103 | #else | 102 | #else |
104 | giveup_fpu(last_task_used_math); | 103 | giveup_fpu(last_task_used_math); |
105 | #endif /* CONFIG_SMP */ | 104 | #endif /* CONFIG_SMP */ |
106 | } | 105 | } |
107 | EXPORT_SYMBOL(enable_kernel_fp); | 106 | EXPORT_SYMBOL(enable_kernel_fp); |
108 | 107 | ||
109 | int dump_task_fpu(struct task_struct *tsk, elf_fpregset_t *fpregs) | 108 | int dump_task_fpu(struct task_struct *tsk, elf_fpregset_t *fpregs) |
110 | { | 109 | { |
111 | if (!tsk->thread.regs) | 110 | if (!tsk->thread.regs) |
112 | return 0; | 111 | return 0; |
113 | flush_fp_to_thread(current); | 112 | flush_fp_to_thread(current); |
114 | 113 | ||
115 | memcpy(fpregs, &tsk->thread.fpr[0], sizeof(*fpregs)); | 114 | memcpy(fpregs, &tsk->thread.fpr[0], sizeof(*fpregs)); |
116 | 115 | ||
117 | return 1; | 116 | return 1; |
118 | } | 117 | } |
119 | 118 | ||
120 | #ifdef CONFIG_ALTIVEC | 119 | #ifdef CONFIG_ALTIVEC |
121 | void enable_kernel_altivec(void) | 120 | void enable_kernel_altivec(void) |
122 | { | 121 | { |
123 | WARN_ON(preemptible()); | 122 | WARN_ON(preemptible()); |
124 | 123 | ||
125 | #ifdef CONFIG_SMP | 124 | #ifdef CONFIG_SMP |
126 | if (current->thread.regs && (current->thread.regs->msr & MSR_VEC)) | 125 | if (current->thread.regs && (current->thread.regs->msr & MSR_VEC)) |
127 | giveup_altivec(current); | 126 | giveup_altivec(current); |
128 | else | 127 | else |
129 | giveup_altivec(NULL); /* just enable AltiVec for kernel - force */ | 128 | giveup_altivec(NULL); /* just enable AltiVec for kernel - force */ |
130 | #else | 129 | #else |
131 | giveup_altivec(last_task_used_altivec); | 130 | giveup_altivec(last_task_used_altivec); |
132 | #endif /* CONFIG_SMP */ | 131 | #endif /* CONFIG_SMP */ |
133 | } | 132 | } |
134 | EXPORT_SYMBOL(enable_kernel_altivec); | 133 | EXPORT_SYMBOL(enable_kernel_altivec); |
135 | 134 | ||
136 | /* | 135 | /* |
137 | * Make sure the VMX/Altivec register state in the | 136 | * Make sure the VMX/Altivec register state in the |
138 | * thread_struct is up to date for task tsk. | 137 | * thread_struct is up to date for task tsk. |
139 | */ | 138 | */ |
140 | void flush_altivec_to_thread(struct task_struct *tsk) | 139 | void flush_altivec_to_thread(struct task_struct *tsk) |
141 | { | 140 | { |
142 | if (tsk->thread.regs) { | 141 | if (tsk->thread.regs) { |
143 | preempt_disable(); | 142 | preempt_disable(); |
144 | if (tsk->thread.regs->msr & MSR_VEC) { | 143 | if (tsk->thread.regs->msr & MSR_VEC) { |
145 | #ifdef CONFIG_SMP | 144 | #ifdef CONFIG_SMP |
146 | BUG_ON(tsk != current); | 145 | BUG_ON(tsk != current); |
147 | #endif | 146 | #endif |
148 | giveup_altivec(current); | 147 | giveup_altivec(current); |
149 | } | 148 | } |
150 | preempt_enable(); | 149 | preempt_enable(); |
151 | } | 150 | } |
152 | } | 151 | } |
153 | 152 | ||
154 | int dump_task_altivec(struct pt_regs *regs, elf_vrregset_t *vrregs) | 153 | int dump_task_altivec(struct pt_regs *regs, elf_vrregset_t *vrregs) |
155 | { | 154 | { |
156 | flush_altivec_to_thread(current); | 155 | flush_altivec_to_thread(current); |
157 | memcpy(vrregs, &current->thread.vr[0], sizeof(*vrregs)); | 156 | memcpy(vrregs, &current->thread.vr[0], sizeof(*vrregs)); |
158 | return 1; | 157 | return 1; |
159 | } | 158 | } |
160 | #endif /* CONFIG_ALTIVEC */ | 159 | #endif /* CONFIG_ALTIVEC */ |
161 | 160 | ||
162 | #ifdef CONFIG_SPE | 161 | #ifdef CONFIG_SPE |
163 | 162 | ||
164 | void enable_kernel_spe(void) | 163 | void enable_kernel_spe(void) |
165 | { | 164 | { |
166 | WARN_ON(preemptible()); | 165 | WARN_ON(preemptible()); |
167 | 166 | ||
168 | #ifdef CONFIG_SMP | 167 | #ifdef CONFIG_SMP |
169 | if (current->thread.regs && (current->thread.regs->msr & MSR_SPE)) | 168 | if (current->thread.regs && (current->thread.regs->msr & MSR_SPE)) |
170 | giveup_spe(current); | 169 | giveup_spe(current); |
171 | else | 170 | else |
172 | giveup_spe(NULL); /* just enable SPE for kernel - force */ | 171 | giveup_spe(NULL); /* just enable SPE for kernel - force */ |
173 | #else | 172 | #else |
174 | giveup_spe(last_task_used_spe); | 173 | giveup_spe(last_task_used_spe); |
175 | #endif /* CONFIG_SMP */ | 174 | #endif /* CONFIG_SMP */ |
176 | } | 175 | } |
177 | EXPORT_SYMBOL(enable_kernel_spe); | 176 | EXPORT_SYMBOL(enable_kernel_spe); |
178 | 177 | ||
179 | void flush_spe_to_thread(struct task_struct *tsk) | 178 | void flush_spe_to_thread(struct task_struct *tsk) |
180 | { | 179 | { |
181 | if (tsk->thread.regs) { | 180 | if (tsk->thread.regs) { |
182 | preempt_disable(); | 181 | preempt_disable(); |
183 | if (tsk->thread.regs->msr & MSR_SPE) { | 182 | if (tsk->thread.regs->msr & MSR_SPE) { |
184 | #ifdef CONFIG_SMP | 183 | #ifdef CONFIG_SMP |
185 | BUG_ON(tsk != current); | 184 | BUG_ON(tsk != current); |
186 | #endif | 185 | #endif |
187 | giveup_spe(current); | 186 | giveup_spe(current); |
188 | } | 187 | } |
189 | preempt_enable(); | 188 | preempt_enable(); |
190 | } | 189 | } |
191 | } | 190 | } |
192 | 191 | ||
193 | int dump_spe(struct pt_regs *regs, elf_vrregset_t *evrregs) | 192 | int dump_spe(struct pt_regs *regs, elf_vrregset_t *evrregs) |
194 | { | 193 | { |
195 | flush_spe_to_thread(current); | 194 | flush_spe_to_thread(current); |
196 | /* We copy u32 evr[32] + u64 acc + u32 spefscr -> 35 */ | 195 | /* We copy u32 evr[32] + u64 acc + u32 spefscr -> 35 */ |
197 | memcpy(evrregs, &current->thread.evr[0], sizeof(u32) * 35); | 196 | memcpy(evrregs, &current->thread.evr[0], sizeof(u32) * 35); |
198 | return 1; | 197 | return 1; |
199 | } | 198 | } |
200 | #endif /* CONFIG_SPE */ | 199 | #endif /* CONFIG_SPE */ |
201 | 200 | ||
202 | #ifndef CONFIG_SMP | 201 | #ifndef CONFIG_SMP |
203 | /* | 202 | /* |
204 | * If we are doing lazy switching of CPU state (FP, altivec or SPE), | 203 | * If we are doing lazy switching of CPU state (FP, altivec or SPE), |
205 | * and the current task has some state, discard it. | 204 | * and the current task has some state, discard it. |
206 | */ | 205 | */ |
207 | void discard_lazy_cpu_state(void) | 206 | void discard_lazy_cpu_state(void) |
208 | { | 207 | { |
209 | preempt_disable(); | 208 | preempt_disable(); |
210 | if (last_task_used_math == current) | 209 | if (last_task_used_math == current) |
211 | last_task_used_math = NULL; | 210 | last_task_used_math = NULL; |
212 | #ifdef CONFIG_ALTIVEC | 211 | #ifdef CONFIG_ALTIVEC |
213 | if (last_task_used_altivec == current) | 212 | if (last_task_used_altivec == current) |
214 | last_task_used_altivec = NULL; | 213 | last_task_used_altivec = NULL; |
215 | #endif /* CONFIG_ALTIVEC */ | 214 | #endif /* CONFIG_ALTIVEC */ |
216 | #ifdef CONFIG_SPE | 215 | #ifdef CONFIG_SPE |
217 | if (last_task_used_spe == current) | 216 | if (last_task_used_spe == current) |
218 | last_task_used_spe = NULL; | 217 | last_task_used_spe = NULL; |
219 | #endif | 218 | #endif |
220 | preempt_enable(); | 219 | preempt_enable(); |
221 | } | 220 | } |
222 | #endif /* CONFIG_SMP */ | 221 | #endif /* CONFIG_SMP */ |
223 | 222 | ||
224 | #ifdef CONFIG_PPC_MERGE /* XXX for now */ | 223 | #ifdef CONFIG_PPC_MERGE /* XXX for now */ |
225 | int set_dabr(unsigned long dabr) | 224 | int set_dabr(unsigned long dabr) |
226 | { | 225 | { |
227 | if (ppc_md.set_dabr) | 226 | if (ppc_md.set_dabr) |
228 | return ppc_md.set_dabr(dabr); | 227 | return ppc_md.set_dabr(dabr); |
229 | 228 | ||
230 | mtspr(SPRN_DABR, dabr); | 229 | mtspr(SPRN_DABR, dabr); |
231 | return 0; | 230 | return 0; |
232 | } | 231 | } |
233 | #endif | 232 | #endif |
234 | 233 | ||
235 | #ifdef CONFIG_PPC64 | 234 | #ifdef CONFIG_PPC64 |
236 | DEFINE_PER_CPU(struct cpu_usage, cpu_usage_array); | 235 | DEFINE_PER_CPU(struct cpu_usage, cpu_usage_array); |
237 | static DEFINE_PER_CPU(unsigned long, current_dabr); | 236 | static DEFINE_PER_CPU(unsigned long, current_dabr); |
238 | #endif | 237 | #endif |
239 | 238 | ||
240 | struct task_struct *__switch_to(struct task_struct *prev, | 239 | struct task_struct *__switch_to(struct task_struct *prev, |
241 | struct task_struct *new) | 240 | struct task_struct *new) |
242 | { | 241 | { |
243 | struct thread_struct *new_thread, *old_thread; | 242 | struct thread_struct *new_thread, *old_thread; |
244 | unsigned long flags; | 243 | unsigned long flags; |
245 | struct task_struct *last; | 244 | struct task_struct *last; |
246 | 245 | ||
247 | #ifdef CONFIG_SMP | 246 | #ifdef CONFIG_SMP |
248 | /* avoid complexity of lazy save/restore of fpu | 247 | /* avoid complexity of lazy save/restore of fpu |
249 | * by just saving it every time we switch out if | 248 | * by just saving it every time we switch out if |
250 | * this task used the fpu during the last quantum. | 249 | * this task used the fpu during the last quantum. |
251 | * | 250 | * |
252 | * If it tries to use the fpu again, it'll trap and | 251 | * If it tries to use the fpu again, it'll trap and |
253 | * reload its fp regs. So we don't have to do a restore | 252 | * reload its fp regs. So we don't have to do a restore |
254 | * every switch, just a save. | 253 | * every switch, just a save. |
255 | * -- Cort | 254 | * -- Cort |
256 | */ | 255 | */ |
257 | if (prev->thread.regs && (prev->thread.regs->msr & MSR_FP)) | 256 | if (prev->thread.regs && (prev->thread.regs->msr & MSR_FP)) |
258 | giveup_fpu(prev); | 257 | giveup_fpu(prev); |
259 | #ifdef CONFIG_ALTIVEC | 258 | #ifdef CONFIG_ALTIVEC |
260 | /* | 259 | /* |
261 | * If the previous thread used altivec in the last quantum | 260 | * If the previous thread used altivec in the last quantum |
262 | * (thus changing altivec regs) then save them. | 261 | * (thus changing altivec regs) then save them. |
263 | * We used to check the VRSAVE register but not all apps | 262 | * We used to check the VRSAVE register but not all apps |
264 | * set it, so we don't rely on it now (and in fact we need | 263 | * set it, so we don't rely on it now (and in fact we need |
265 | * to save & restore VSCR even if VRSAVE == 0). -- paulus | 264 | * to save & restore VSCR even if VRSAVE == 0). -- paulus |
266 | * | 265 | * |
267 | * On SMP we always save/restore altivec regs just to avoid the | 266 | * On SMP we always save/restore altivec regs just to avoid the |
268 | * complexity of changing processors. | 267 | * complexity of changing processors. |
269 | * -- Cort | 268 | * -- Cort |
270 | */ | 269 | */ |
271 | if (prev->thread.regs && (prev->thread.regs->msr & MSR_VEC)) | 270 | if (prev->thread.regs && (prev->thread.regs->msr & MSR_VEC)) |
272 | giveup_altivec(prev); | 271 | giveup_altivec(prev); |
273 | #endif /* CONFIG_ALTIVEC */ | 272 | #endif /* CONFIG_ALTIVEC */ |
274 | #ifdef CONFIG_SPE | 273 | #ifdef CONFIG_SPE |
275 | /* | 274 | /* |
276 | * If the previous thread used spe in the last quantum | 275 | * If the previous thread used spe in the last quantum |
277 | * (thus changing spe regs) then save them. | 276 | * (thus changing spe regs) then save them. |
278 | * | 277 | * |
279 | * On SMP we always save/restore spe regs just to avoid the | 278 | * On SMP we always save/restore spe regs just to avoid the |
280 | * complexity of changing processors. | 279 | * complexity of changing processors. |
281 | */ | 280 | */ |
282 | if ((prev->thread.regs && (prev->thread.regs->msr & MSR_SPE))) | 281 | if ((prev->thread.regs && (prev->thread.regs->msr & MSR_SPE))) |
283 | giveup_spe(prev); | 282 | giveup_spe(prev); |
284 | #endif /* CONFIG_SPE */ | 283 | #endif /* CONFIG_SPE */ |
285 | 284 | ||
286 | #else /* CONFIG_SMP */ | 285 | #else /* CONFIG_SMP */ |
287 | #ifdef CONFIG_ALTIVEC | 286 | #ifdef CONFIG_ALTIVEC |
288 | /* Avoid the trap. On smp this never happens since | 287 | /* Avoid the trap. On smp this never happens since |
289 | * we don't set last_task_used_altivec -- Cort | 288 | * we don't set last_task_used_altivec -- Cort |
290 | */ | 289 | */ |
291 | if (new->thread.regs && last_task_used_altivec == new) | 290 | if (new->thread.regs && last_task_used_altivec == new) |
292 | new->thread.regs->msr |= MSR_VEC; | 291 | new->thread.regs->msr |= MSR_VEC; |
293 | #endif /* CONFIG_ALTIVEC */ | 292 | #endif /* CONFIG_ALTIVEC */ |
294 | #ifdef CONFIG_SPE | 293 | #ifdef CONFIG_SPE |
295 | /* Avoid the trap. On smp this never happens since | 294 | /* Avoid the trap. On smp this never happens since |
296 | * we don't set last_task_used_spe | 295 | * we don't set last_task_used_spe |
297 | */ | 296 | */ |
298 | if (new->thread.regs && last_task_used_spe == new) | 297 | if (new->thread.regs && last_task_used_spe == new) |
299 | new->thread.regs->msr |= MSR_SPE; | 298 | new->thread.regs->msr |= MSR_SPE; |
300 | #endif /* CONFIG_SPE */ | 299 | #endif /* CONFIG_SPE */ |
301 | 300 | ||
302 | #endif /* CONFIG_SMP */ | 301 | #endif /* CONFIG_SMP */ |
303 | 302 | ||
304 | #ifdef CONFIG_PPC64 /* for now */ | 303 | #ifdef CONFIG_PPC64 /* for now */ |
305 | if (unlikely(__get_cpu_var(current_dabr) != new->thread.dabr)) { | 304 | if (unlikely(__get_cpu_var(current_dabr) != new->thread.dabr)) { |
306 | set_dabr(new->thread.dabr); | 305 | set_dabr(new->thread.dabr); |
307 | __get_cpu_var(current_dabr) = new->thread.dabr; | 306 | __get_cpu_var(current_dabr) = new->thread.dabr; |
308 | } | 307 | } |
309 | 308 | ||
310 | flush_tlb_pending(); | 309 | flush_tlb_pending(); |
311 | #endif | 310 | #endif |
312 | 311 | ||
313 | new_thread = &new->thread; | 312 | new_thread = &new->thread; |
314 | old_thread = &current->thread; | 313 | old_thread = &current->thread; |
315 | 314 | ||
316 | #ifdef CONFIG_PPC64 | 315 | #ifdef CONFIG_PPC64 |
317 | /* | 316 | /* |
318 | * Collect processor utilization data per process | 317 | * Collect processor utilization data per process |
319 | */ | 318 | */ |
320 | if (firmware_has_feature(FW_FEATURE_SPLPAR)) { | 319 | if (firmware_has_feature(FW_FEATURE_SPLPAR)) { |
321 | struct cpu_usage *cu = &__get_cpu_var(cpu_usage_array); | 320 | struct cpu_usage *cu = &__get_cpu_var(cpu_usage_array); |
322 | long unsigned start_tb, current_tb; | 321 | long unsigned start_tb, current_tb; |
323 | start_tb = old_thread->start_tb; | 322 | start_tb = old_thread->start_tb; |
324 | cu->current_tb = current_tb = mfspr(SPRN_PURR); | 323 | cu->current_tb = current_tb = mfspr(SPRN_PURR); |
325 | old_thread->accum_tb += (current_tb - start_tb); | 324 | old_thread->accum_tb += (current_tb - start_tb); |
326 | new_thread->start_tb = current_tb; | 325 | new_thread->start_tb = current_tb; |
327 | } | 326 | } |
328 | #endif | 327 | #endif |
329 | 328 | ||
330 | local_irq_save(flags); | 329 | local_irq_save(flags); |
331 | 330 | ||
332 | account_system_vtime(current); | 331 | account_system_vtime(current); |
333 | account_process_vtime(current); | 332 | account_process_vtime(current); |
334 | calculate_steal_time(); | 333 | calculate_steal_time(); |
335 | 334 | ||
336 | last = _switch(old_thread, new_thread); | 335 | last = _switch(old_thread, new_thread); |
337 | 336 | ||
338 | local_irq_restore(flags); | 337 | local_irq_restore(flags); |
339 | 338 | ||
340 | return last; | 339 | return last; |
341 | } | 340 | } |
342 | 341 | ||
343 | static int instructions_to_print = 16; | 342 | static int instructions_to_print = 16; |
344 | 343 | ||
345 | #ifdef CONFIG_PPC64 | 344 | #ifdef CONFIG_PPC64 |
346 | #define BAD_PC(pc) ((REGION_ID(pc) != KERNEL_REGION_ID) && \ | 345 | #define BAD_PC(pc) ((REGION_ID(pc) != KERNEL_REGION_ID) && \ |
347 | (REGION_ID(pc) != VMALLOC_REGION_ID)) | 346 | (REGION_ID(pc) != VMALLOC_REGION_ID)) |
348 | #else | 347 | #else |
349 | #define BAD_PC(pc) ((pc) < KERNELBASE) | 348 | #define BAD_PC(pc) ((pc) < KERNELBASE) |
350 | #endif | 349 | #endif |
351 | 350 | ||
352 | static void show_instructions(struct pt_regs *regs) | 351 | static void show_instructions(struct pt_regs *regs) |
353 | { | 352 | { |
354 | int i; | 353 | int i; |
355 | unsigned long pc = regs->nip - (instructions_to_print * 3 / 4 * | 354 | unsigned long pc = regs->nip - (instructions_to_print * 3 / 4 * |
356 | sizeof(int)); | 355 | sizeof(int)); |
357 | 356 | ||
358 | printk("Instruction dump:"); | 357 | printk("Instruction dump:"); |
359 | 358 | ||
360 | for (i = 0; i < instructions_to_print; i++) { | 359 | for (i = 0; i < instructions_to_print; i++) { |
361 | int instr; | 360 | int instr; |
362 | 361 | ||
363 | if (!(i % 8)) | 362 | if (!(i % 8)) |
364 | printk("\n"); | 363 | printk("\n"); |
365 | 364 | ||
366 | if (BAD_PC(pc) || __get_user(instr, (unsigned int *)pc)) { | 365 | if (BAD_PC(pc) || __get_user(instr, (unsigned int *)pc)) { |
367 | printk("XXXXXXXX "); | 366 | printk("XXXXXXXX "); |
368 | } else { | 367 | } else { |
369 | if (regs->nip == pc) | 368 | if (regs->nip == pc) |
370 | printk("<%08x> ", instr); | 369 | printk("<%08x> ", instr); |
371 | else | 370 | else |
372 | printk("%08x ", instr); | 371 | printk("%08x ", instr); |
373 | } | 372 | } |
374 | 373 | ||
375 | pc += sizeof(int); | 374 | pc += sizeof(int); |
376 | } | 375 | } |
377 | 376 | ||
378 | printk("\n"); | 377 | printk("\n"); |
379 | } | 378 | } |
380 | 379 | ||
381 | static struct regbit { | 380 | static struct regbit { |
382 | unsigned long bit; | 381 | unsigned long bit; |
383 | const char *name; | 382 | const char *name; |
384 | } msr_bits[] = { | 383 | } msr_bits[] = { |
385 | {MSR_EE, "EE"}, | 384 | {MSR_EE, "EE"}, |
386 | {MSR_PR, "PR"}, | 385 | {MSR_PR, "PR"}, |
387 | {MSR_FP, "FP"}, | 386 | {MSR_FP, "FP"}, |
388 | {MSR_ME, "ME"}, | 387 | {MSR_ME, "ME"}, |
389 | {MSR_IR, "IR"}, | 388 | {MSR_IR, "IR"}, |
390 | {MSR_DR, "DR"}, | 389 | {MSR_DR, "DR"}, |
391 | {0, NULL} | 390 | {0, NULL} |
392 | }; | 391 | }; |
393 | 392 | ||
394 | static void printbits(unsigned long val, struct regbit *bits) | 393 | static void printbits(unsigned long val, struct regbit *bits) |
395 | { | 394 | { |
396 | const char *sep = ""; | 395 | const char *sep = ""; |
397 | 396 | ||
398 | printk("<"); | 397 | printk("<"); |
399 | for (; bits->bit; ++bits) | 398 | for (; bits->bit; ++bits) |
400 | if (val & bits->bit) { | 399 | if (val & bits->bit) { |
401 | printk("%s%s", sep, bits->name); | 400 | printk("%s%s", sep, bits->name); |
402 | sep = ","; | 401 | sep = ","; |
403 | } | 402 | } |
404 | printk(">"); | 403 | printk(">"); |
405 | } | 404 | } |
406 | 405 | ||
407 | #ifdef CONFIG_PPC64 | 406 | #ifdef CONFIG_PPC64 |
408 | #define REG "%016lX" | 407 | #define REG "%016lX" |
409 | #define REGS_PER_LINE 4 | 408 | #define REGS_PER_LINE 4 |
410 | #define LAST_VOLATILE 13 | 409 | #define LAST_VOLATILE 13 |
411 | #else | 410 | #else |
412 | #define REG "%08lX" | 411 | #define REG "%08lX" |
413 | #define REGS_PER_LINE 8 | 412 | #define REGS_PER_LINE 8 |
414 | #define LAST_VOLATILE 12 | 413 | #define LAST_VOLATILE 12 |
415 | #endif | 414 | #endif |
416 | 415 | ||
417 | void show_regs(struct pt_regs * regs) | 416 | void show_regs(struct pt_regs * regs) |
418 | { | 417 | { |
419 | int i, trap; | 418 | int i, trap; |
420 | 419 | ||
421 | printk("NIP: "REG" LR: "REG" CTR: "REG"\n", | 420 | printk("NIP: "REG" LR: "REG" CTR: "REG"\n", |
422 | regs->nip, regs->link, regs->ctr); | 421 | regs->nip, regs->link, regs->ctr); |
423 | printk("REGS: %p TRAP: %04lx %s (%s)\n", | 422 | printk("REGS: %p TRAP: %04lx %s (%s)\n", |
424 | regs, regs->trap, print_tainted(), system_utsname.release); | 423 | regs, regs->trap, print_tainted(), system_utsname.release); |
425 | printk("MSR: "REG" ", regs->msr); | 424 | printk("MSR: "REG" ", regs->msr); |
426 | printbits(regs->msr, msr_bits); | 425 | printbits(regs->msr, msr_bits); |
427 | printk(" CR: %08lX XER: %08lX\n", regs->ccr, regs->xer); | 426 | printk(" CR: %08lX XER: %08lX\n", regs->ccr, regs->xer); |
428 | trap = TRAP(regs); | 427 | trap = TRAP(regs); |
429 | if (trap == 0x300 || trap == 0x600) | 428 | if (trap == 0x300 || trap == 0x600) |
430 | printk("DAR: "REG", DSISR: "REG"\n", regs->dar, regs->dsisr); | 429 | printk("DAR: "REG", DSISR: "REG"\n", regs->dar, regs->dsisr); |
431 | printk("TASK = %p[%d] '%s' THREAD: %p", | 430 | printk("TASK = %p[%d] '%s' THREAD: %p", |
432 | current, current->pid, current->comm, task_thread_info(current)); | 431 | current, current->pid, current->comm, task_thread_info(current)); |
433 | 432 | ||
434 | #ifdef CONFIG_SMP | 433 | #ifdef CONFIG_SMP |
435 | printk(" CPU: %d", smp_processor_id()); | 434 | printk(" CPU: %d", smp_processor_id()); |
436 | #endif /* CONFIG_SMP */ | 435 | #endif /* CONFIG_SMP */ |
437 | 436 | ||
438 | for (i = 0; i < 32; i++) { | 437 | for (i = 0; i < 32; i++) { |
439 | if ((i % REGS_PER_LINE) == 0) | 438 | if ((i % REGS_PER_LINE) == 0) |
440 | printk("\n" KERN_INFO "GPR%02d: ", i); | 439 | printk("\n" KERN_INFO "GPR%02d: ", i); |
441 | printk(REG " ", regs->gpr[i]); | 440 | printk(REG " ", regs->gpr[i]); |
442 | if (i == LAST_VOLATILE && !FULL_REGS(regs)) | 441 | if (i == LAST_VOLATILE && !FULL_REGS(regs)) |
443 | break; | 442 | break; |
444 | } | 443 | } |
445 | printk("\n"); | 444 | printk("\n"); |
446 | #ifdef CONFIG_KALLSYMS | 445 | #ifdef CONFIG_KALLSYMS |
447 | /* | 446 | /* |
448 | * Lookup NIP late so we have the best chance of getting the | 447 | * Lookup NIP late so we have the best chance of getting the |
449 | * above info out without failing | 448 | * above info out without failing |
450 | */ | 449 | */ |
451 | printk("NIP ["REG"] ", regs->nip); | 450 | printk("NIP ["REG"] ", regs->nip); |
452 | print_symbol("%s\n", regs->nip); | 451 | print_symbol("%s\n", regs->nip); |
453 | printk("LR ["REG"] ", regs->link); | 452 | printk("LR ["REG"] ", regs->link); |
454 | print_symbol("%s\n", regs->link); | 453 | print_symbol("%s\n", regs->link); |
455 | #endif | 454 | #endif |
456 | show_stack(current, (unsigned long *) regs->gpr[1]); | 455 | show_stack(current, (unsigned long *) regs->gpr[1]); |
457 | if (!user_mode(regs)) | 456 | if (!user_mode(regs)) |
458 | show_instructions(regs); | 457 | show_instructions(regs); |
459 | } | 458 | } |
460 | 459 | ||
461 | void exit_thread(void) | 460 | void exit_thread(void) |
462 | { | 461 | { |
463 | kprobe_flush_task(current); | ||
464 | discard_lazy_cpu_state(); | 462 | discard_lazy_cpu_state(); |
465 | } | 463 | } |
466 | 464 | ||
467 | void flush_thread(void) | 465 | void flush_thread(void) |
468 | { | 466 | { |
469 | #ifdef CONFIG_PPC64 | 467 | #ifdef CONFIG_PPC64 |
470 | struct thread_info *t = current_thread_info(); | 468 | struct thread_info *t = current_thread_info(); |
471 | 469 | ||
472 | if (t->flags & _TIF_ABI_PENDING) | 470 | if (t->flags & _TIF_ABI_PENDING) |
473 | t->flags ^= (_TIF_ABI_PENDING | _TIF_32BIT); | 471 | t->flags ^= (_TIF_ABI_PENDING | _TIF_32BIT); |
474 | #endif | 472 | #endif |
475 | 473 | ||
476 | discard_lazy_cpu_state(); | 474 | discard_lazy_cpu_state(); |
477 | 475 | ||
478 | #ifdef CONFIG_PPC64 /* for now */ | 476 | #ifdef CONFIG_PPC64 /* for now */ |
479 | if (current->thread.dabr) { | 477 | if (current->thread.dabr) { |
480 | current->thread.dabr = 0; | 478 | current->thread.dabr = 0; |
481 | set_dabr(0); | 479 | set_dabr(0); |
482 | } | 480 | } |
483 | #endif | 481 | #endif |
484 | } | 482 | } |
485 | 483 | ||
486 | void | 484 | void |
487 | release_thread(struct task_struct *t) | 485 | release_thread(struct task_struct *t) |
488 | { | 486 | { |
489 | } | 487 | } |
490 | 488 | ||
491 | /* | 489 | /* |
492 | * This gets called before we allocate a new thread and copy | 490 | * This gets called before we allocate a new thread and copy |
493 | * the current task into it. | 491 | * the current task into it. |
494 | */ | 492 | */ |
495 | void prepare_to_copy(struct task_struct *tsk) | 493 | void prepare_to_copy(struct task_struct *tsk) |
496 | { | 494 | { |
497 | flush_fp_to_thread(current); | 495 | flush_fp_to_thread(current); |
498 | flush_altivec_to_thread(current); | 496 | flush_altivec_to_thread(current); |
499 | flush_spe_to_thread(current); | 497 | flush_spe_to_thread(current); |
500 | } | 498 | } |
501 | 499 | ||
502 | /* | 500 | /* |
503 | * Copy a thread.. | 501 | * Copy a thread.. |
504 | */ | 502 | */ |
505 | int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, | 503 | int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, |
506 | unsigned long unused, struct task_struct *p, | 504 | unsigned long unused, struct task_struct *p, |
507 | struct pt_regs *regs) | 505 | struct pt_regs *regs) |
508 | { | 506 | { |
509 | struct pt_regs *childregs, *kregs; | 507 | struct pt_regs *childregs, *kregs; |
510 | extern void ret_from_fork(void); | 508 | extern void ret_from_fork(void); |
511 | unsigned long sp = (unsigned long)task_stack_page(p) + THREAD_SIZE; | 509 | unsigned long sp = (unsigned long)task_stack_page(p) + THREAD_SIZE; |
512 | 510 | ||
513 | CHECK_FULL_REGS(regs); | 511 | CHECK_FULL_REGS(regs); |
514 | /* Copy registers */ | 512 | /* Copy registers */ |
515 | sp -= sizeof(struct pt_regs); | 513 | sp -= sizeof(struct pt_regs); |
516 | childregs = (struct pt_regs *) sp; | 514 | childregs = (struct pt_regs *) sp; |
517 | *childregs = *regs; | 515 | *childregs = *regs; |
518 | if ((childregs->msr & MSR_PR) == 0) { | 516 | if ((childregs->msr & MSR_PR) == 0) { |
519 | /* for kernel thread, set `current' and stackptr in new task */ | 517 | /* for kernel thread, set `current' and stackptr in new task */ |
520 | childregs->gpr[1] = sp + sizeof(struct pt_regs); | 518 | childregs->gpr[1] = sp + sizeof(struct pt_regs); |
521 | #ifdef CONFIG_PPC32 | 519 | #ifdef CONFIG_PPC32 |
522 | childregs->gpr[2] = (unsigned long) p; | 520 | childregs->gpr[2] = (unsigned long) p; |
523 | #else | 521 | #else |
524 | clear_tsk_thread_flag(p, TIF_32BIT); | 522 | clear_tsk_thread_flag(p, TIF_32BIT); |
525 | #endif | 523 | #endif |
526 | p->thread.regs = NULL; /* no user register state */ | 524 | p->thread.regs = NULL; /* no user register state */ |
527 | } else { | 525 | } else { |
528 | childregs->gpr[1] = usp; | 526 | childregs->gpr[1] = usp; |
529 | p->thread.regs = childregs; | 527 | p->thread.regs = childregs; |
530 | if (clone_flags & CLONE_SETTLS) { | 528 | if (clone_flags & CLONE_SETTLS) { |
531 | #ifdef CONFIG_PPC64 | 529 | #ifdef CONFIG_PPC64 |
532 | if (!test_thread_flag(TIF_32BIT)) | 530 | if (!test_thread_flag(TIF_32BIT)) |
533 | childregs->gpr[13] = childregs->gpr[6]; | 531 | childregs->gpr[13] = childregs->gpr[6]; |
534 | else | 532 | else |
535 | #endif | 533 | #endif |
536 | childregs->gpr[2] = childregs->gpr[6]; | 534 | childregs->gpr[2] = childregs->gpr[6]; |
537 | } | 535 | } |
538 | } | 536 | } |
539 | childregs->gpr[3] = 0; /* Result from fork() */ | 537 | childregs->gpr[3] = 0; /* Result from fork() */ |
540 | sp -= STACK_FRAME_OVERHEAD; | 538 | sp -= STACK_FRAME_OVERHEAD; |
541 | 539 | ||
542 | /* | 540 | /* |
543 | * The way this works is that at some point in the future | 541 | * The way this works is that at some point in the future |
544 | * some task will call _switch to switch to the new task. | 542 | * some task will call _switch to switch to the new task. |
545 | * That will pop off the stack frame created below and start | 543 | * That will pop off the stack frame created below and start |
546 | * the new task running at ret_from_fork. The new task will | 544 | * the new task running at ret_from_fork. The new task will |
547 | * do some house keeping and then return from the fork or clone | 545 | * do some house keeping and then return from the fork or clone |
548 | * system call, using the stack frame created above. | 546 | * system call, using the stack frame created above. |
549 | */ | 547 | */ |
550 | sp -= sizeof(struct pt_regs); | 548 | sp -= sizeof(struct pt_regs); |
551 | kregs = (struct pt_regs *) sp; | 549 | kregs = (struct pt_regs *) sp; |
552 | sp -= STACK_FRAME_OVERHEAD; | 550 | sp -= STACK_FRAME_OVERHEAD; |
553 | p->thread.ksp = sp; | 551 | p->thread.ksp = sp; |
554 | 552 | ||
555 | #ifdef CONFIG_PPC64 | 553 | #ifdef CONFIG_PPC64 |
556 | if (cpu_has_feature(CPU_FTR_SLB)) { | 554 | if (cpu_has_feature(CPU_FTR_SLB)) { |
557 | unsigned long sp_vsid = get_kernel_vsid(sp); | 555 | unsigned long sp_vsid = get_kernel_vsid(sp); |
558 | unsigned long llp = mmu_psize_defs[mmu_linear_psize].sllp; | 556 | unsigned long llp = mmu_psize_defs[mmu_linear_psize].sllp; |
559 | 557 | ||
560 | sp_vsid <<= SLB_VSID_SHIFT; | 558 | sp_vsid <<= SLB_VSID_SHIFT; |
561 | sp_vsid |= SLB_VSID_KERNEL | llp; | 559 | sp_vsid |= SLB_VSID_KERNEL | llp; |
562 | p->thread.ksp_vsid = sp_vsid; | 560 | p->thread.ksp_vsid = sp_vsid; |
563 | } | 561 | } |
564 | 562 | ||
565 | /* | 563 | /* |
566 | * The PPC64 ABI makes use of a TOC to contain function | 564 | * The PPC64 ABI makes use of a TOC to contain function |
567 | * pointers. The function (ret_from_except) is actually a pointer | 565 | * pointers. The function (ret_from_except) is actually a pointer |
568 | * to the TOC entry. The first entry is a pointer to the actual | 566 | * to the TOC entry. The first entry is a pointer to the actual |
569 | * function. | 567 | * function. |
570 | */ | 568 | */ |
571 | kregs->nip = *((unsigned long *)ret_from_fork); | 569 | kregs->nip = *((unsigned long *)ret_from_fork); |
572 | #else | 570 | #else |
573 | kregs->nip = (unsigned long)ret_from_fork; | 571 | kregs->nip = (unsigned long)ret_from_fork; |
574 | p->thread.last_syscall = -1; | 572 | p->thread.last_syscall = -1; |
575 | #endif | 573 | #endif |
576 | 574 | ||
577 | return 0; | 575 | return 0; |
578 | } | 576 | } |
579 | 577 | ||
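[Editor's note, not part of the diff] In the CONFIG_PPC64 branch of copy_thread() above, kregs->nip is loaded with *((unsigned long *)ret_from_fork) because, under the 64-bit ELFv1 ABI, a "function pointer" refers to a function descriptor rather than to code. A minimal sketch of such a descriptor (illustrative layout and naming, not a structure defined in this diff):

	/* ELFv1 ppc64 function-pointer target: the entry address is the first
	 * word, which is what copy_thread() dereferences above.
	 */
	struct func_desc {
		unsigned long entry;	/* address of the function's first instruction */
		unsigned long toc;	/* TOC (r2) value the function expects */
		unsigned long env;	/* environment pointer, unused by C code */
	};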
580 | /* | 578 | /* |
581 | * Set up a thread for executing a new program | 579 | * Set up a thread for executing a new program |
582 | */ | 580 | */ |
583 | void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) | 581 | void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) |
584 | { | 582 | { |
585 | #ifdef CONFIG_PPC64 | 583 | #ifdef CONFIG_PPC64 |
586 | unsigned long load_addr = regs->gpr[2]; /* saved by ELF_PLAT_INIT */ | 584 | unsigned long load_addr = regs->gpr[2]; /* saved by ELF_PLAT_INIT */ |
587 | #endif | 585 | #endif |
588 | 586 | ||
589 | set_fs(USER_DS); | 587 | set_fs(USER_DS); |
590 | 588 | ||
591 | /* | 589 | /* |
592 | * If we exec out of a kernel thread then thread.regs will not be | 590 | * If we exec out of a kernel thread then thread.regs will not be |
593 | * set. Do it now. | 591 | * set. Do it now. |
594 | */ | 592 | */ |
595 | if (!current->thread.regs) { | 593 | if (!current->thread.regs) { |
596 | struct pt_regs *regs = task_stack_page(current) + THREAD_SIZE; | 594 | struct pt_regs *regs = task_stack_page(current) + THREAD_SIZE; |
597 | current->thread.regs = regs - 1; | 595 | current->thread.regs = regs - 1; |
598 | } | 596 | } |
599 | 597 | ||
600 | memset(regs->gpr, 0, sizeof(regs->gpr)); | 598 | memset(regs->gpr, 0, sizeof(regs->gpr)); |
601 | regs->ctr = 0; | 599 | regs->ctr = 0; |
602 | regs->link = 0; | 600 | regs->link = 0; |
603 | regs->xer = 0; | 601 | regs->xer = 0; |
604 | regs->ccr = 0; | 602 | regs->ccr = 0; |
605 | regs->gpr[1] = sp; | 603 | regs->gpr[1] = sp; |
606 | 604 | ||
607 | #ifdef CONFIG_PPC32 | 605 | #ifdef CONFIG_PPC32 |
608 | regs->mq = 0; | 606 | regs->mq = 0; |
609 | regs->nip = start; | 607 | regs->nip = start; |
610 | regs->msr = MSR_USER; | 608 | regs->msr = MSR_USER; |
611 | #else | 609 | #else |
612 | if (!test_thread_flag(TIF_32BIT)) { | 610 | if (!test_thread_flag(TIF_32BIT)) { |
613 | unsigned long entry, toc; | 611 | unsigned long entry, toc; |
614 | 612 | ||
615 | /* start is a relocated pointer to the function descriptor for | 613 | /* start is a relocated pointer to the function descriptor for |
616 | * the elf _start routine. The first entry in the function | 614 | * the elf _start routine. The first entry in the function |
617 | * descriptor is the entry address of _start and the second | 615 | * descriptor is the entry address of _start and the second |
618 | * entry is the TOC value we need to use. | 616 | * entry is the TOC value we need to use. |
619 | */ | 617 | */ |
620 | __get_user(entry, (unsigned long __user *)start); | 618 | __get_user(entry, (unsigned long __user *)start); |
621 | __get_user(toc, (unsigned long __user *)start+1); | 619 | __get_user(toc, (unsigned long __user *)start+1); |
622 | 620 | ||
623 | /* Check whether the e_entry function descriptor entries | 621 | /* Check whether the e_entry function descriptor entries |
624 | * need to be relocated before we can use them. | 622 | * need to be relocated before we can use them. |
625 | */ | 623 | */ |
626 | if (load_addr != 0) { | 624 | if (load_addr != 0) { |
627 | entry += load_addr; | 625 | entry += load_addr; |
628 | toc += load_addr; | 626 | toc += load_addr; |
629 | } | 627 | } |
630 | regs->nip = entry; | 628 | regs->nip = entry; |
631 | regs->gpr[2] = toc; | 629 | regs->gpr[2] = toc; |
632 | regs->msr = MSR_USER64; | 630 | regs->msr = MSR_USER64; |
633 | } else { | 631 | } else { |
634 | regs->nip = start; | 632 | regs->nip = start; |
635 | regs->gpr[2] = 0; | 633 | regs->gpr[2] = 0; |
636 | regs->msr = MSR_USER32; | 634 | regs->msr = MSR_USER32; |
637 | } | 635 | } |
638 | #endif | 636 | #endif |
639 | 637 | ||
640 | discard_lazy_cpu_state(); | 638 | discard_lazy_cpu_state(); |
641 | memset(current->thread.fpr, 0, sizeof(current->thread.fpr)); | 639 | memset(current->thread.fpr, 0, sizeof(current->thread.fpr)); |
642 | current->thread.fpscr.val = 0; | 640 | current->thread.fpscr.val = 0; |
643 | #ifdef CONFIG_ALTIVEC | 641 | #ifdef CONFIG_ALTIVEC |
644 | memset(current->thread.vr, 0, sizeof(current->thread.vr)); | 642 | memset(current->thread.vr, 0, sizeof(current->thread.vr)); |
645 | memset(&current->thread.vscr, 0, sizeof(current->thread.vscr)); | 643 | memset(&current->thread.vscr, 0, sizeof(current->thread.vscr)); |
646 | current->thread.vscr.u[3] = 0x00010000; /* Java mode disabled */ | 644 | current->thread.vscr.u[3] = 0x00010000; /* Java mode disabled */ |
647 | current->thread.vrsave = 0; | 645 | current->thread.vrsave = 0; |
648 | current->thread.used_vr = 0; | 646 | current->thread.used_vr = 0; |
649 | #endif /* CONFIG_ALTIVEC */ | 647 | #endif /* CONFIG_ALTIVEC */ |
650 | #ifdef CONFIG_SPE | 648 | #ifdef CONFIG_SPE |
651 | memset(current->thread.evr, 0, sizeof(current->thread.evr)); | 649 | memset(current->thread.evr, 0, sizeof(current->thread.evr)); |
652 | current->thread.acc = 0; | 650 | current->thread.acc = 0; |
653 | current->thread.spefscr = 0; | 651 | current->thread.spefscr = 0; |
654 | current->thread.used_spe = 0; | 652 | current->thread.used_spe = 0; |
655 | #endif /* CONFIG_SPE */ | 653 | #endif /* CONFIG_SPE */ |
656 | } | 654 | } |
657 | 655 | ||
658 | #define PR_FP_ALL_EXCEPT (PR_FP_EXC_DIV | PR_FP_EXC_OVF | PR_FP_EXC_UND \ | 656 | #define PR_FP_ALL_EXCEPT (PR_FP_EXC_DIV | PR_FP_EXC_OVF | PR_FP_EXC_UND \ |
659 | | PR_FP_EXC_RES | PR_FP_EXC_INV) | 657 | | PR_FP_EXC_RES | PR_FP_EXC_INV) |
660 | 658 | ||
661 | int set_fpexc_mode(struct task_struct *tsk, unsigned int val) | 659 | int set_fpexc_mode(struct task_struct *tsk, unsigned int val) |
662 | { | 660 | { |
663 | struct pt_regs *regs = tsk->thread.regs; | 661 | struct pt_regs *regs = tsk->thread.regs; |
664 | 662 | ||
665 | /* This is a bit hairy. If we are an SPE enabled processor | 663 | /* This is a bit hairy. If we are an SPE enabled processor |
666 | * (have embedded fp) we store the IEEE exception enable flags in | 664 | * (have embedded fp) we store the IEEE exception enable flags in |
667 | * fpexc_mode. fpexc_mode is also used for setting FP exception | 665 | * fpexc_mode. fpexc_mode is also used for setting FP exception |
668 | * mode (async, precise, disabled) for 'Classic' FP. */ | 666 | * mode (async, precise, disabled) for 'Classic' FP. */ |
669 | if (val & PR_FP_EXC_SW_ENABLE) { | 667 | if (val & PR_FP_EXC_SW_ENABLE) { |
670 | #ifdef CONFIG_SPE | 668 | #ifdef CONFIG_SPE |
671 | tsk->thread.fpexc_mode = val & | 669 | tsk->thread.fpexc_mode = val & |
672 | (PR_FP_EXC_SW_ENABLE | PR_FP_ALL_EXCEPT); | 670 | (PR_FP_EXC_SW_ENABLE | PR_FP_ALL_EXCEPT); |
673 | return 0; | 671 | return 0; |
674 | #else | 672 | #else |
675 | return -EINVAL; | 673 | return -EINVAL; |
676 | #endif | 674 | #endif |
677 | } | 675 | } |
678 | 676 | ||
679 | /* on a CONFIG_SPE this does not hurt us. The bits that | 677 | /* on a CONFIG_SPE this does not hurt us. The bits that |
680 | * __pack_fe01 use do not overlap with bits used for | 678 | * __pack_fe01 use do not overlap with bits used for |
681 | * PR_FP_EXC_SW_ENABLE. Additionally, the MSR[FE0,FE1] bits | 679 | * PR_FP_EXC_SW_ENABLE. Additionally, the MSR[FE0,FE1] bits |
682 | * on CONFIG_SPE implementations are reserved so writing to | 680 | * on CONFIG_SPE implementations are reserved so writing to |
683 | * them does not change anything */ | 681 | * them does not change anything */ |
684 | if (val > PR_FP_EXC_PRECISE) | 682 | if (val > PR_FP_EXC_PRECISE) |
685 | return -EINVAL; | 683 | return -EINVAL; |
686 | tsk->thread.fpexc_mode = __pack_fe01(val); | 684 | tsk->thread.fpexc_mode = __pack_fe01(val); |
687 | if (regs != NULL && (regs->msr & MSR_FP) != 0) | 685 | if (regs != NULL && (regs->msr & MSR_FP) != 0) |
688 | regs->msr = (regs->msr & ~(MSR_FE0|MSR_FE1)) | 686 | regs->msr = (regs->msr & ~(MSR_FE0|MSR_FE1)) |
689 | | tsk->thread.fpexc_mode; | 687 | | tsk->thread.fpexc_mode; |
690 | return 0; | 688 | return 0; |
691 | } | 689 | } |
692 | 690 | ||
693 | int get_fpexc_mode(struct task_struct *tsk, unsigned long adr) | 691 | int get_fpexc_mode(struct task_struct *tsk, unsigned long adr) |
694 | { | 692 | { |
695 | unsigned int val; | 693 | unsigned int val; |
696 | 694 | ||
697 | if (tsk->thread.fpexc_mode & PR_FP_EXC_SW_ENABLE) | 695 | if (tsk->thread.fpexc_mode & PR_FP_EXC_SW_ENABLE) |
698 | #ifdef CONFIG_SPE | 696 | #ifdef CONFIG_SPE |
699 | val = tsk->thread.fpexc_mode; | 697 | val = tsk->thread.fpexc_mode; |
700 | #else | 698 | #else |
701 | return -EINVAL; | 699 | return -EINVAL; |
702 | #endif | 700 | #endif |
703 | else | 701 | else |
704 | val = __unpack_fe01(tsk->thread.fpexc_mode); | 702 | val = __unpack_fe01(tsk->thread.fpexc_mode); |
705 | return put_user(val, (unsigned int __user *) adr); | 703 | return put_user(val, (unsigned int __user *) adr); |
706 | } | 704 | } |
707 | 705 | ||
708 | #define TRUNC_PTR(x) ((typeof(x))(((unsigned long)(x)) & 0xffffffff)) | 706 | #define TRUNC_PTR(x) ((typeof(x))(((unsigned long)(x)) & 0xffffffff)) |
709 | 707 | ||
710 | int sys_clone(unsigned long clone_flags, unsigned long usp, | 708 | int sys_clone(unsigned long clone_flags, unsigned long usp, |
711 | int __user *parent_tidp, void __user *child_threadptr, | 709 | int __user *parent_tidp, void __user *child_threadptr, |
712 | int __user *child_tidp, int p6, | 710 | int __user *child_tidp, int p6, |
713 | struct pt_regs *regs) | 711 | struct pt_regs *regs) |
714 | { | 712 | { |
715 | CHECK_FULL_REGS(regs); | 713 | CHECK_FULL_REGS(regs); |
716 | if (usp == 0) | 714 | if (usp == 0) |
717 | usp = regs->gpr[1]; /* stack pointer for child */ | 715 | usp = regs->gpr[1]; /* stack pointer for child */ |
718 | #ifdef CONFIG_PPC64 | 716 | #ifdef CONFIG_PPC64 |
719 | if (test_thread_flag(TIF_32BIT)) { | 717 | if (test_thread_flag(TIF_32BIT)) { |
720 | parent_tidp = TRUNC_PTR(parent_tidp); | 718 | parent_tidp = TRUNC_PTR(parent_tidp); |
721 | child_tidp = TRUNC_PTR(child_tidp); | 719 | child_tidp = TRUNC_PTR(child_tidp); |
722 | } | 720 | } |
723 | #endif | 721 | #endif |
724 | return do_fork(clone_flags, usp, regs, 0, parent_tidp, child_tidp); | 722 | return do_fork(clone_flags, usp, regs, 0, parent_tidp, child_tidp); |
725 | } | 723 | } |
726 | 724 | ||
727 | int sys_fork(unsigned long p1, unsigned long p2, unsigned long p3, | 725 | int sys_fork(unsigned long p1, unsigned long p2, unsigned long p3, |
728 | unsigned long p4, unsigned long p5, unsigned long p6, | 726 | unsigned long p4, unsigned long p5, unsigned long p6, |
729 | struct pt_regs *regs) | 727 | struct pt_regs *regs) |
730 | { | 728 | { |
731 | CHECK_FULL_REGS(regs); | 729 | CHECK_FULL_REGS(regs); |
732 | return do_fork(SIGCHLD, regs->gpr[1], regs, 0, NULL, NULL); | 730 | return do_fork(SIGCHLD, regs->gpr[1], regs, 0, NULL, NULL); |
733 | } | 731 | } |
734 | 732 | ||
735 | int sys_vfork(unsigned long p1, unsigned long p2, unsigned long p3, | 733 | int sys_vfork(unsigned long p1, unsigned long p2, unsigned long p3, |
736 | unsigned long p4, unsigned long p5, unsigned long p6, | 734 | unsigned long p4, unsigned long p5, unsigned long p6, |
737 | struct pt_regs *regs) | 735 | struct pt_regs *regs) |
738 | { | 736 | { |
739 | CHECK_FULL_REGS(regs); | 737 | CHECK_FULL_REGS(regs); |
740 | return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->gpr[1], | 738 | return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->gpr[1], |
741 | regs, 0, NULL, NULL); | 739 | regs, 0, NULL, NULL); |
742 | } | 740 | } |
743 | 741 | ||
744 | int sys_execve(unsigned long a0, unsigned long a1, unsigned long a2, | 742 | int sys_execve(unsigned long a0, unsigned long a1, unsigned long a2, |
745 | unsigned long a3, unsigned long a4, unsigned long a5, | 743 | unsigned long a3, unsigned long a4, unsigned long a5, |
746 | struct pt_regs *regs) | 744 | struct pt_regs *regs) |
747 | { | 745 | { |
748 | int error; | 746 | int error; |
749 | char *filename; | 747 | char *filename; |
750 | 748 | ||
751 | filename = getname((char __user *) a0); | 749 | filename = getname((char __user *) a0); |
752 | error = PTR_ERR(filename); | 750 | error = PTR_ERR(filename); |
753 | if (IS_ERR(filename)) | 751 | if (IS_ERR(filename)) |
754 | goto out; | 752 | goto out; |
755 | flush_fp_to_thread(current); | 753 | flush_fp_to_thread(current); |
756 | flush_altivec_to_thread(current); | 754 | flush_altivec_to_thread(current); |
757 | flush_spe_to_thread(current); | 755 | flush_spe_to_thread(current); |
758 | error = do_execve(filename, (char __user * __user *) a1, | 756 | error = do_execve(filename, (char __user * __user *) a1, |
759 | (char __user * __user *) a2, regs); | 757 | (char __user * __user *) a2, regs); |
760 | if (error == 0) { | 758 | if (error == 0) { |
761 | task_lock(current); | 759 | task_lock(current); |
762 | current->ptrace &= ~PT_DTRACE; | 760 | current->ptrace &= ~PT_DTRACE; |
763 | task_unlock(current); | 761 | task_unlock(current); |
764 | } | 762 | } |
765 | putname(filename); | 763 | putname(filename); |
766 | out: | 764 | out: |
767 | return error; | 765 | return error; |
768 | } | 766 | } |
769 | 767 | ||
770 | static int validate_sp(unsigned long sp, struct task_struct *p, | 768 | static int validate_sp(unsigned long sp, struct task_struct *p, |
771 | unsigned long nbytes) | 769 | unsigned long nbytes) |
772 | { | 770 | { |
773 | unsigned long stack_page = (unsigned long)task_stack_page(p); | 771 | unsigned long stack_page = (unsigned long)task_stack_page(p); |
774 | 772 | ||
775 | if (sp >= stack_page + sizeof(struct thread_struct) | 773 | if (sp >= stack_page + sizeof(struct thread_struct) |
776 | && sp <= stack_page + THREAD_SIZE - nbytes) | 774 | && sp <= stack_page + THREAD_SIZE - nbytes) |
777 | return 1; | 775 | return 1; |
778 | 776 | ||
779 | #ifdef CONFIG_IRQSTACKS | 777 | #ifdef CONFIG_IRQSTACKS |
780 | stack_page = (unsigned long) hardirq_ctx[task_cpu(p)]; | 778 | stack_page = (unsigned long) hardirq_ctx[task_cpu(p)]; |
781 | if (sp >= stack_page + sizeof(struct thread_struct) | 779 | if (sp >= stack_page + sizeof(struct thread_struct) |
782 | && sp <= stack_page + THREAD_SIZE - nbytes) | 780 | && sp <= stack_page + THREAD_SIZE - nbytes) |
783 | return 1; | 781 | return 1; |
784 | 782 | ||
785 | stack_page = (unsigned long) softirq_ctx[task_cpu(p)]; | 783 | stack_page = (unsigned long) softirq_ctx[task_cpu(p)]; |
786 | if (sp >= stack_page + sizeof(struct thread_struct) | 784 | if (sp >= stack_page + sizeof(struct thread_struct) |
787 | && sp <= stack_page + THREAD_SIZE - nbytes) | 785 | && sp <= stack_page + THREAD_SIZE - nbytes) |
788 | return 1; | 786 | return 1; |
789 | #endif | 787 | #endif |
790 | 788 | ||
791 | return 0; | 789 | return 0; |
792 | } | 790 | } |
793 | 791 | ||
794 | #ifdef CONFIG_PPC64 | 792 | #ifdef CONFIG_PPC64 |
795 | #define MIN_STACK_FRAME 112 /* same as STACK_FRAME_OVERHEAD, in fact */ | 793 | #define MIN_STACK_FRAME 112 /* same as STACK_FRAME_OVERHEAD, in fact */ |
796 | #define FRAME_LR_SAVE 2 | 794 | #define FRAME_LR_SAVE 2 |
797 | #define INT_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD + 288) | 795 | #define INT_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD + 288) |
798 | #define REGS_MARKER 0x7265677368657265ul | 796 | #define REGS_MARKER 0x7265677368657265ul |
799 | #define FRAME_MARKER 12 | 797 | #define FRAME_MARKER 12 |
800 | #else | 798 | #else |
801 | #define MIN_STACK_FRAME 16 | 799 | #define MIN_STACK_FRAME 16 |
802 | #define FRAME_LR_SAVE 1 | 800 | #define FRAME_LR_SAVE 1 |
803 | #define INT_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD) | 801 | #define INT_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD) |
804 | #define REGS_MARKER 0x72656773ul | 802 | #define REGS_MARKER 0x72656773ul |
805 | #define FRAME_MARKER 2 | 803 | #define FRAME_MARKER 2 |
806 | #endif | 804 | #endif |
807 | 805 | ||
808 | unsigned long get_wchan(struct task_struct *p) | 806 | unsigned long get_wchan(struct task_struct *p) |
809 | { | 807 | { |
810 | unsigned long ip, sp; | 808 | unsigned long ip, sp; |
811 | int count = 0; | 809 | int count = 0; |
812 | 810 | ||
813 | if (!p || p == current || p->state == TASK_RUNNING) | 811 | if (!p || p == current || p->state == TASK_RUNNING) |
814 | return 0; | 812 | return 0; |
815 | 813 | ||
816 | sp = p->thread.ksp; | 814 | sp = p->thread.ksp; |
817 | if (!validate_sp(sp, p, MIN_STACK_FRAME)) | 815 | if (!validate_sp(sp, p, MIN_STACK_FRAME)) |
818 | return 0; | 816 | return 0; |
819 | 817 | ||
820 | do { | 818 | do { |
821 | sp = *(unsigned long *)sp; | 819 | sp = *(unsigned long *)sp; |
822 | if (!validate_sp(sp, p, MIN_STACK_FRAME)) | 820 | if (!validate_sp(sp, p, MIN_STACK_FRAME)) |
823 | return 0; | 821 | return 0; |
824 | if (count > 0) { | 822 | if (count > 0) { |
825 | ip = ((unsigned long *)sp)[FRAME_LR_SAVE]; | 823 | ip = ((unsigned long *)sp)[FRAME_LR_SAVE]; |
826 | if (!in_sched_functions(ip)) | 824 | if (!in_sched_functions(ip)) |
827 | return ip; | 825 | return ip; |
828 | } | 826 | } |
829 | } while (count++ < 16); | 827 | } while (count++ < 16); |
830 | return 0; | 828 | return 0; |
831 | } | 829 | } |
832 | EXPORT_SYMBOL(get_wchan); | 830 | EXPORT_SYMBOL(get_wchan); |
833 | 831 | ||
834 | static int kstack_depth_to_print = 64; | 832 | static int kstack_depth_to_print = 64; |
835 | 833 | ||
836 | void show_stack(struct task_struct *tsk, unsigned long *stack) | 834 | void show_stack(struct task_struct *tsk, unsigned long *stack) |
837 | { | 835 | { |
838 | unsigned long sp, ip, lr, newsp; | 836 | unsigned long sp, ip, lr, newsp; |
839 | int count = 0; | 837 | int count = 0; |
840 | int firstframe = 1; | 838 | int firstframe = 1; |
841 | 839 | ||
842 | sp = (unsigned long) stack; | 840 | sp = (unsigned long) stack; |
843 | if (tsk == NULL) | 841 | if (tsk == NULL) |
844 | tsk = current; | 842 | tsk = current; |
845 | if (sp == 0) { | 843 | if (sp == 0) { |
846 | if (tsk == current) | 844 | if (tsk == current) |
847 | asm("mr %0,1" : "=r" (sp)); | 845 | asm("mr %0,1" : "=r" (sp)); |
848 | else | 846 | else |
849 | sp = tsk->thread.ksp; | 847 | sp = tsk->thread.ksp; |
850 | } | 848 | } |
851 | 849 | ||
852 | lr = 0; | 850 | lr = 0; |
853 | printk("Call Trace:\n"); | 851 | printk("Call Trace:\n"); |
854 | do { | 852 | do { |
855 | if (!validate_sp(sp, tsk, MIN_STACK_FRAME)) | 853 | if (!validate_sp(sp, tsk, MIN_STACK_FRAME)) |
856 | return; | 854 | return; |
857 | 855 | ||
858 | stack = (unsigned long *) sp; | 856 | stack = (unsigned long *) sp; |
859 | newsp = stack[0]; | 857 | newsp = stack[0]; |
860 | ip = stack[FRAME_LR_SAVE]; | 858 | ip = stack[FRAME_LR_SAVE]; |
861 | if (!firstframe || ip != lr) { | 859 | if (!firstframe || ip != lr) { |
862 | printk("["REG"] ["REG"] ", sp, ip); | 860 | printk("["REG"] ["REG"] ", sp, ip); |
863 | print_symbol("%s", ip); | 861 | print_symbol("%s", ip); |
864 | if (firstframe) | 862 | if (firstframe) |
865 | printk(" (unreliable)"); | 863 | printk(" (unreliable)"); |
866 | printk("\n"); | 864 | printk("\n"); |
867 | } | 865 | } |
868 | firstframe = 0; | 866 | firstframe = 0; |
869 | 867 | ||
870 | /* | 868 | /* |
871 | * See if this is an exception frame. | 869 | * See if this is an exception frame. |
872 | * We look for the "regshere" marker in the current frame. | 870 | * We look for the "regshere" marker in the current frame. |
873 | */ | 871 | */ |
874 | if (validate_sp(sp, tsk, INT_FRAME_SIZE) | 872 | if (validate_sp(sp, tsk, INT_FRAME_SIZE) |
875 | && stack[FRAME_MARKER] == REGS_MARKER) { | 873 | && stack[FRAME_MARKER] == REGS_MARKER) { |
876 | struct pt_regs *regs = (struct pt_regs *) | 874 | struct pt_regs *regs = (struct pt_regs *) |
877 | (sp + STACK_FRAME_OVERHEAD); | 875 | (sp + STACK_FRAME_OVERHEAD); |
878 | printk("--- Exception: %lx", regs->trap); | 876 | printk("--- Exception: %lx", regs->trap); |
879 | print_symbol(" at %s\n", regs->nip); | 877 | print_symbol(" at %s\n", regs->nip); |
880 | lr = regs->link; | 878 | lr = regs->link; |
881 | print_symbol(" LR = %s\n", lr); | 879 | print_symbol(" LR = %s\n", lr); |
882 | firstframe = 1; | 880 | firstframe = 1; |
883 | } | 881 | } |
884 | 882 | ||
885 | sp = newsp; | 883 | sp = newsp; |
886 | } while (count++ < kstack_depth_to_print); | 884 | } while (count++ < kstack_depth_to_print); |
887 | } | 885 | } |
888 | 886 | ||
889 | void dump_stack(void) | 887 | void dump_stack(void) |
890 | { | 888 | { |
891 | show_stack(current, NULL); | 889 | show_stack(current, NULL); |
892 | } | 890 | } |
893 | EXPORT_SYMBOL(dump_stack); | 891 | EXPORT_SYMBOL(dump_stack); |
894 | 892 | ||
895 | #ifdef CONFIG_PPC64 | 893 | #ifdef CONFIG_PPC64 |
896 | void ppc64_runlatch_on(void) | 894 | void ppc64_runlatch_on(void) |
897 | { | 895 | { |
898 | unsigned long ctrl; | 896 | unsigned long ctrl; |
899 | 897 | ||
900 | if (cpu_has_feature(CPU_FTR_CTRL) && !test_thread_flag(TIF_RUNLATCH)) { | 898 | if (cpu_has_feature(CPU_FTR_CTRL) && !test_thread_flag(TIF_RUNLATCH)) { |
901 | HMT_medium(); | 899 | HMT_medium(); |
902 | 900 | ||
903 | ctrl = mfspr(SPRN_CTRLF); | 901 | ctrl = mfspr(SPRN_CTRLF); |
904 | ctrl |= CTRL_RUNLATCH; | 902 | ctrl |= CTRL_RUNLATCH; |
905 | mtspr(SPRN_CTRLT, ctrl); | 903 | mtspr(SPRN_CTRLT, ctrl); |
906 | 904 | ||
907 | set_thread_flag(TIF_RUNLATCH); | 905 | set_thread_flag(TIF_RUNLATCH); |
908 | } | 906 | } |
909 | } | 907 | } |
910 | 908 | ||
911 | void ppc64_runlatch_off(void) | 909 | void ppc64_runlatch_off(void) |
912 | { | 910 | { |
913 | unsigned long ctrl; | 911 | unsigned long ctrl; |
914 | 912 | ||
915 | if (cpu_has_feature(CPU_FTR_CTRL) && test_thread_flag(TIF_RUNLATCH)) { | 913 | if (cpu_has_feature(CPU_FTR_CTRL) && test_thread_flag(TIF_RUNLATCH)) { |
916 | HMT_medium(); | 914 | HMT_medium(); |
917 | 915 | ||
918 | clear_thread_flag(TIF_RUNLATCH); | 916 | clear_thread_flag(TIF_RUNLATCH); |
919 | 917 | ||
920 | ctrl = mfspr(SPRN_CTRLF); | 918 | ctrl = mfspr(SPRN_CTRLF); |
921 | ctrl &= ~CTRL_RUNLATCH; | 919 | ctrl &= ~CTRL_RUNLATCH; |
922 | mtspr(SPRN_CTRLT, ctrl); | 920 | mtspr(SPRN_CTRLT, ctrl); |
923 | } | 921 | } |
924 | } | 922 | } |
925 | #endif | 923 | #endif |
926 | 924 |
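[Editor's note, not part of the diff] The only functional change in this powerpc hunk is the removal of the <linux/kprobes.h> include and of the kprobe_flush_task(current) call from exit_thread(); recycling of a dying task's kretprobe instances is no longer driven from the architecture exit path. For orientation only, a simplified sketch of the kind of per-task flush that call performed is shown below; the helper names (kretprobe_inst_table_head, recycle_rp_inst, kretprobe_lock) are assumed from the kprobes code of this era and do not appear in the hunks shown here.

	/* Sketch only: walk the dead task's kretprobe instances and hand them
	 * back to their kretprobe's free list so they can be reused.
	 */
	static void flush_task_retprobes(struct task_struct *tk)
	{
		struct kretprobe_instance *ri;
		struct hlist_node *node, *tmp;
		unsigned long flags;

		spin_lock_irqsave(&kretprobe_lock, flags);
		hlist_for_each_entry_safe(ri, node, tmp,
					  kretprobe_inst_table_head(tk), hlist) {
			if (ri->task == tk)
				recycle_rp_inst(ri);
		}
		spin_unlock_irqrestore(&kretprobe_lock, flags);
	}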
arch/x86_64/kernel/process.c
1 | /* | 1 | /* |
2 | * linux/arch/x86-64/kernel/process.c | 2 | * linux/arch/x86-64/kernel/process.c |
3 | * | 3 | * |
4 | * Copyright (C) 1995 Linus Torvalds | 4 | * Copyright (C) 1995 Linus Torvalds |
5 | * | 5 | * |
6 | * Pentium III FXSR, SSE support | 6 | * Pentium III FXSR, SSE support |
7 | * Gareth Hughes <gareth@valinux.com>, May 2000 | 7 | * Gareth Hughes <gareth@valinux.com>, May 2000 |
8 | * | 8 | * |
9 | * X86-64 port | 9 | * X86-64 port |
10 | * Andi Kleen. | 10 | * Andi Kleen. |
11 | * | 11 | * |
12 | * CPU hotplug support - ashok.raj@intel.com | 12 | * CPU hotplug support - ashok.raj@intel.com |
13 | * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $ | 13 | * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $ |
14 | */ | 14 | */ |
15 | 15 | ||
16 | /* | 16 | /* |
17 | * This file handles the architecture-dependent parts of process handling.. | 17 | * This file handles the architecture-dependent parts of process handling.. |
18 | */ | 18 | */ |
19 | 19 | ||
20 | #include <stdarg.h> | 20 | #include <stdarg.h> |
21 | 21 | ||
22 | #include <linux/cpu.h> | 22 | #include <linux/cpu.h> |
23 | #include <linux/errno.h> | 23 | #include <linux/errno.h> |
24 | #include <linux/sched.h> | 24 | #include <linux/sched.h> |
25 | #include <linux/kernel.h> | 25 | #include <linux/kernel.h> |
26 | #include <linux/mm.h> | 26 | #include <linux/mm.h> |
27 | #include <linux/elfcore.h> | 27 | #include <linux/elfcore.h> |
28 | #include <linux/smp.h> | 28 | #include <linux/smp.h> |
29 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
30 | #include <linux/user.h> | 30 | #include <linux/user.h> |
31 | #include <linux/module.h> | 31 | #include <linux/module.h> |
32 | #include <linux/a.out.h> | 32 | #include <linux/a.out.h> |
33 | #include <linux/interrupt.h> | 33 | #include <linux/interrupt.h> |
34 | #include <linux/delay.h> | 34 | #include <linux/delay.h> |
35 | #include <linux/ptrace.h> | 35 | #include <linux/ptrace.h> |
36 | #include <linux/utsname.h> | 36 | #include <linux/utsname.h> |
37 | #include <linux/random.h> | 37 | #include <linux/random.h> |
38 | #include <linux/kprobes.h> | ||
39 | #include <linux/notifier.h> | 38 | #include <linux/notifier.h> |
39 | #include <linux/kprobes.h> | ||
40 | 40 | ||
41 | #include <asm/uaccess.h> | 41 | #include <asm/uaccess.h> |
42 | #include <asm/pgtable.h> | 42 | #include <asm/pgtable.h> |
43 | #include <asm/system.h> | 43 | #include <asm/system.h> |
44 | #include <asm/io.h> | 44 | #include <asm/io.h> |
45 | #include <asm/processor.h> | 45 | #include <asm/processor.h> |
46 | #include <asm/i387.h> | 46 | #include <asm/i387.h> |
47 | #include <asm/mmu_context.h> | 47 | #include <asm/mmu_context.h> |
48 | #include <asm/pda.h> | 48 | #include <asm/pda.h> |
49 | #include <asm/prctl.h> | 49 | #include <asm/prctl.h> |
50 | #include <asm/kdebug.h> | 50 | #include <asm/kdebug.h> |
51 | #include <asm/desc.h> | 51 | #include <asm/desc.h> |
52 | #include <asm/proto.h> | 52 | #include <asm/proto.h> |
53 | #include <asm/ia32.h> | 53 | #include <asm/ia32.h> |
54 | #include <asm/idle.h> | 54 | #include <asm/idle.h> |
55 | 55 | ||
56 | asmlinkage extern void ret_from_fork(void); | 56 | asmlinkage extern void ret_from_fork(void); |
57 | 57 | ||
58 | unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; | 58 | unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; |
59 | 59 | ||
60 | unsigned long boot_option_idle_override = 0; | 60 | unsigned long boot_option_idle_override = 0; |
61 | EXPORT_SYMBOL(boot_option_idle_override); | 61 | EXPORT_SYMBOL(boot_option_idle_override); |
62 | 62 | ||
63 | /* | 63 | /* |
64 | * Powermanagement idle function, if any.. | 64 | * Powermanagement idle function, if any.. |
65 | */ | 65 | */ |
66 | void (*pm_idle)(void); | 66 | void (*pm_idle)(void); |
67 | static DEFINE_PER_CPU(unsigned int, cpu_idle_state); | 67 | static DEFINE_PER_CPU(unsigned int, cpu_idle_state); |
68 | 68 | ||
69 | static struct notifier_block *idle_notifier; | 69 | static struct notifier_block *idle_notifier; |
70 | static DEFINE_SPINLOCK(idle_notifier_lock); | 70 | static DEFINE_SPINLOCK(idle_notifier_lock); |
71 | 71 | ||
72 | void idle_notifier_register(struct notifier_block *n) | 72 | void idle_notifier_register(struct notifier_block *n) |
73 | { | 73 | { |
74 | unsigned long flags; | 74 | unsigned long flags; |
75 | spin_lock_irqsave(&idle_notifier_lock, flags); | 75 | spin_lock_irqsave(&idle_notifier_lock, flags); |
76 | notifier_chain_register(&idle_notifier, n); | 76 | notifier_chain_register(&idle_notifier, n); |
77 | spin_unlock_irqrestore(&idle_notifier_lock, flags); | 77 | spin_unlock_irqrestore(&idle_notifier_lock, flags); |
78 | } | 78 | } |
79 | EXPORT_SYMBOL_GPL(idle_notifier_register); | 79 | EXPORT_SYMBOL_GPL(idle_notifier_register); |
80 | 80 | ||
81 | void idle_notifier_unregister(struct notifier_block *n) | 81 | void idle_notifier_unregister(struct notifier_block *n) |
82 | { | 82 | { |
83 | unsigned long flags; | 83 | unsigned long flags; |
84 | spin_lock_irqsave(&idle_notifier_lock, flags); | 84 | spin_lock_irqsave(&idle_notifier_lock, flags); |
85 | notifier_chain_unregister(&idle_notifier, n); | 85 | notifier_chain_unregister(&idle_notifier, n); |
86 | spin_unlock_irqrestore(&idle_notifier_lock, flags); | 86 | spin_unlock_irqrestore(&idle_notifier_lock, flags); |
87 | } | 87 | } |
88 | EXPORT_SYMBOL(idle_notifier_unregister); | 88 | EXPORT_SYMBOL(idle_notifier_unregister); |
89 | 89 | ||
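The idle_notifier_register()/idle_notifier_unregister() pair above, together with the IDLE_START/IDLE_END events raised from enter_idle() and exit_idle() below, form a small notification API for code that wants to know when a CPU enters or leaves its idle routine. A rough illustration follows; it is not part of this patch, the module and callback names are invented, and the declarations are assumed to live in asm/idle.h as included at the top of this file.

/* Illustrative module, not part of this patch. */
#include <linux/module.h>
#include <linux/notifier.h>
#include <asm/idle.h>	/* assumed home of IDLE_START/IDLE_END and the register calls */

static unsigned long idle_entries;	/* counts how often any CPU went idle */

static int my_idle_event(struct notifier_block *nb, unsigned long action, void *data)
{
	if (action == IDLE_START)	/* raised from enter_idle() */
		idle_entries++;
	/* IDLE_END is raised from exit_idle()/__exit_idle() when idle is left */
	return NOTIFY_OK;
}

static struct notifier_block my_idle_nb = {
	.notifier_call = my_idle_event,
};

static int __init my_idle_watch_init(void)
{
	idle_notifier_register(&my_idle_nb);
	return 0;
}

static void __exit my_idle_watch_exit(void)
{
	idle_notifier_unregister(&my_idle_nb);
}

module_init(my_idle_watch_init);
module_exit(my_idle_watch_exit);
MODULE_LICENSE("GPL");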
90 | enum idle_state { CPU_IDLE, CPU_NOT_IDLE }; | 90 | enum idle_state { CPU_IDLE, CPU_NOT_IDLE }; |
91 | static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE; | 91 | static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE; |
92 | 92 | ||
93 | void enter_idle(void) | 93 | void enter_idle(void) |
94 | { | 94 | { |
95 | __get_cpu_var(idle_state) = CPU_IDLE; | 95 | __get_cpu_var(idle_state) = CPU_IDLE; |
96 | notifier_call_chain(&idle_notifier, IDLE_START, NULL); | 96 | notifier_call_chain(&idle_notifier, IDLE_START, NULL); |
97 | } | 97 | } |
98 | 98 | ||
99 | static void __exit_idle(void) | 99 | static void __exit_idle(void) |
100 | { | 100 | { |
101 | __get_cpu_var(idle_state) = CPU_NOT_IDLE; | 101 | __get_cpu_var(idle_state) = CPU_NOT_IDLE; |
102 | notifier_call_chain(&idle_notifier, IDLE_END, NULL); | 102 | notifier_call_chain(&idle_notifier, IDLE_END, NULL); |
103 | } | 103 | } |
104 | 104 | ||
105 | /* Called from interrupts to signify idle end */ | 105 | /* Called from interrupts to signify idle end */ |
106 | void exit_idle(void) | 106 | void exit_idle(void) |
107 | { | 107 | { |
108 | if (current->pid | read_pda(irqcount)) | 108 | if (current->pid | read_pda(irqcount)) |
109 | return; | 109 | return; |
110 | __exit_idle(); | 110 | __exit_idle(); |
111 | } | 111 | } |
112 | 112 | ||
113 | /* | 113 | /* |
114 | * We use this if we don't have any better | 114 | * We use this if we don't have any better |
115 | * idle routine.. | 115 | * idle routine.. |
116 | */ | 116 | */ |
117 | static void default_idle(void) | 117 | static void default_idle(void) |
118 | { | 118 | { |
119 | local_irq_enable(); | 119 | local_irq_enable(); |
120 | 120 | ||
121 | clear_thread_flag(TIF_POLLING_NRFLAG); | 121 | clear_thread_flag(TIF_POLLING_NRFLAG); |
122 | smp_mb__after_clear_bit(); | 122 | smp_mb__after_clear_bit(); |
123 | while (!need_resched()) { | 123 | while (!need_resched()) { |
124 | local_irq_disable(); | 124 | local_irq_disable(); |
125 | if (!need_resched()) | 125 | if (!need_resched()) |
126 | safe_halt(); | 126 | safe_halt(); |
127 | else | 127 | else |
128 | local_irq_enable(); | 128 | local_irq_enable(); |
129 | } | 129 | } |
130 | set_thread_flag(TIF_POLLING_NRFLAG); | 130 | set_thread_flag(TIF_POLLING_NRFLAG); |
131 | } | 131 | } |
132 | 132 | ||
133 | /* | 133 | /* |
134 | * On SMP it's slightly faster (but much more power-consuming!) | 134 | * On SMP it's slightly faster (but much more power-consuming!) |
135 | * to poll the ->need_resched flag instead of waiting for the | 135 | * to poll the ->need_resched flag instead of waiting for the |
136 | * cross-CPU IPI to arrive. Use this option with caution. | 136 | * cross-CPU IPI to arrive. Use this option with caution. |
137 | */ | 137 | */ |
138 | static void poll_idle (void) | 138 | static void poll_idle (void) |
139 | { | 139 | { |
140 | local_irq_enable(); | 140 | local_irq_enable(); |
141 | 141 | ||
142 | asm volatile( | 142 | asm volatile( |
143 | "2:" | 143 | "2:" |
144 | "testl %0,%1;" | 144 | "testl %0,%1;" |
145 | "rep; nop;" | 145 | "rep; nop;" |
146 | "je 2b;" | 146 | "je 2b;" |
147 | : : | 147 | : : |
148 | "i" (_TIF_NEED_RESCHED), | 148 | "i" (_TIF_NEED_RESCHED), |
149 | "m" (current_thread_info()->flags)); | 149 | "m" (current_thread_info()->flags)); |
150 | } | 150 | } |
151 | 151 | ||
152 | void cpu_idle_wait(void) | 152 | void cpu_idle_wait(void) |
153 | { | 153 | { |
154 | unsigned int cpu, this_cpu = get_cpu(); | 154 | unsigned int cpu, this_cpu = get_cpu(); |
155 | cpumask_t map; | 155 | cpumask_t map; |
156 | 156 | ||
157 | set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); | 157 | set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); |
158 | put_cpu(); | 158 | put_cpu(); |
159 | 159 | ||
160 | cpus_clear(map); | 160 | cpus_clear(map); |
161 | for_each_online_cpu(cpu) { | 161 | for_each_online_cpu(cpu) { |
162 | per_cpu(cpu_idle_state, cpu) = 1; | 162 | per_cpu(cpu_idle_state, cpu) = 1; |
163 | cpu_set(cpu, map); | 163 | cpu_set(cpu, map); |
164 | } | 164 | } |
165 | 165 | ||
166 | __get_cpu_var(cpu_idle_state) = 0; | 166 | __get_cpu_var(cpu_idle_state) = 0; |
167 | 167 | ||
168 | wmb(); | 168 | wmb(); |
169 | do { | 169 | do { |
170 | ssleep(1); | 170 | ssleep(1); |
171 | for_each_online_cpu(cpu) { | 171 | for_each_online_cpu(cpu) { |
172 | if (cpu_isset(cpu, map) && | 172 | if (cpu_isset(cpu, map) && |
173 | !per_cpu(cpu_idle_state, cpu)) | 173 | !per_cpu(cpu_idle_state, cpu)) |
174 | cpu_clear(cpu, map); | 174 | cpu_clear(cpu, map); |
175 | } | 175 | } |
176 | cpus_and(map, map, cpu_online_map); | 176 | cpus_and(map, map, cpu_online_map); |
177 | } while (!cpus_empty(map)); | 177 | } while (!cpus_empty(map)); |
178 | } | 178 | } |
179 | EXPORT_SYMBOL_GPL(cpu_idle_wait); | 179 | EXPORT_SYMBOL_GPL(cpu_idle_wait); |
180 | 180 | ||
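cpu_idle_wait() above lets code that changes pm_idle at run time make sure every online CPU has left the old idle routine before it goes away: it marks each CPU's cpu_idle_state and then polls until the idle loop in cpu_idle() below has cleared the flag on every CPU. A hedged sketch of that usage pattern; the replacement routine and its caller are hypothetical.

/* Sketch only: installing a replacement idle routine safely. */
#include <asm/system.h>			/* safe_halt() */

extern void (*pm_idle)(void);		/* defined earlier in this file */
extern void cpu_idle_wait(void);

static void my_idle(void)		/* hypothetical replacement idle routine */
{
	safe_halt();			/* sti; hlt -- sleeps until the next interrupt */
}

static void install_my_idle(void)
{
	pm_idle = my_idle;	/* cpu_idle() re-reads pm_idle on every pass (note the rmb()) */
	/*
	 * Wait until every online CPU has been through its idle loop once,
	 * so none of them can still be executing the previous routine.
	 */
	cpu_idle_wait();
}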
181 | #ifdef CONFIG_HOTPLUG_CPU | 181 | #ifdef CONFIG_HOTPLUG_CPU |
182 | DECLARE_PER_CPU(int, cpu_state); | 182 | DECLARE_PER_CPU(int, cpu_state); |
183 | 183 | ||
184 | #include <asm/nmi.h> | 184 | #include <asm/nmi.h> |
185 | /* We halt the CPU with physical CPU hotplug */ | 185 | /* We halt the CPU with physical CPU hotplug */ |
186 | static inline void play_dead(void) | 186 | static inline void play_dead(void) |
187 | { | 187 | { |
188 | idle_task_exit(); | 188 | idle_task_exit(); |
189 | wbinvd(); | 189 | wbinvd(); |
190 | mb(); | 190 | mb(); |
191 | /* Ack it */ | 191 | /* Ack it */ |
192 | __get_cpu_var(cpu_state) = CPU_DEAD; | 192 | __get_cpu_var(cpu_state) = CPU_DEAD; |
193 | 193 | ||
194 | local_irq_disable(); | 194 | local_irq_disable(); |
195 | while (1) | 195 | while (1) |
196 | halt(); | 196 | halt(); |
197 | } | 197 | } |
198 | #else | 198 | #else |
199 | static inline void play_dead(void) | 199 | static inline void play_dead(void) |
200 | { | 200 | { |
201 | BUG(); | 201 | BUG(); |
202 | } | 202 | } |
203 | #endif /* CONFIG_HOTPLUG_CPU */ | 203 | #endif /* CONFIG_HOTPLUG_CPU */ |
204 | 204 | ||
205 | /* | 205 | /* |
206 | * The idle thread. There's no useful work to be | 206 | * The idle thread. There's no useful work to be |
207 | * done, so just try to conserve power and have a | 207 | * done, so just try to conserve power and have a |
208 | * low exit latency (ie sit in a loop waiting for | 208 | * low exit latency (ie sit in a loop waiting for |
209 | * somebody to say that they'd like to reschedule) | 209 | * somebody to say that they'd like to reschedule) |
210 | */ | 210 | */ |
211 | void cpu_idle (void) | 211 | void cpu_idle (void) |
212 | { | 212 | { |
213 | set_thread_flag(TIF_POLLING_NRFLAG); | 213 | set_thread_flag(TIF_POLLING_NRFLAG); |
214 | 214 | ||
215 | /* endless idle loop with no priority at all */ | 215 | /* endless idle loop with no priority at all */ |
216 | while (1) { | 216 | while (1) { |
217 | while (!need_resched()) { | 217 | while (!need_resched()) { |
218 | void (*idle)(void); | 218 | void (*idle)(void); |
219 | 219 | ||
220 | if (__get_cpu_var(cpu_idle_state)) | 220 | if (__get_cpu_var(cpu_idle_state)) |
221 | __get_cpu_var(cpu_idle_state) = 0; | 221 | __get_cpu_var(cpu_idle_state) = 0; |
222 | 222 | ||
223 | rmb(); | 223 | rmb(); |
224 | idle = pm_idle; | 224 | idle = pm_idle; |
225 | if (!idle) | 225 | if (!idle) |
226 | idle = default_idle; | 226 | idle = default_idle; |
227 | if (cpu_is_offline(smp_processor_id())) | 227 | if (cpu_is_offline(smp_processor_id())) |
228 | play_dead(); | 228 | play_dead(); |
229 | enter_idle(); | 229 | enter_idle(); |
230 | idle(); | 230 | idle(); |
231 | __exit_idle(); | 231 | __exit_idle(); |
232 | } | 232 | } |
233 | 233 | ||
234 | preempt_enable_no_resched(); | 234 | preempt_enable_no_resched(); |
235 | schedule(); | 235 | schedule(); |
236 | preempt_disable(); | 236 | preempt_disable(); |
237 | } | 237 | } |
238 | } | 238 | } |
239 | 239 | ||
240 | /* | 240 | /* |
241 | * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, | 241 | * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, |
242 | * which can obviate IPI to trigger checking of need_resched. | 242 | * which can obviate IPI to trigger checking of need_resched. |
243 | * We execute MONITOR against need_resched and enter optimized wait state | 243 | * We execute MONITOR against need_resched and enter optimized wait state |
244 | * through MWAIT. Whenever someone changes need_resched, we would be woken | 244 | * through MWAIT. Whenever someone changes need_resched, we would be woken |
245 | * up from MWAIT (without an IPI). | 245 | * up from MWAIT (without an IPI). |
246 | */ | 246 | */ |
247 | static void mwait_idle(void) | 247 | static void mwait_idle(void) |
248 | { | 248 | { |
249 | local_irq_enable(); | 249 | local_irq_enable(); |
250 | 250 | ||
251 | while (!need_resched()) { | 251 | while (!need_resched()) { |
252 | __monitor((void *)&current_thread_info()->flags, 0, 0); | 252 | __monitor((void *)&current_thread_info()->flags, 0, 0); |
253 | smp_mb(); | 253 | smp_mb(); |
254 | if (need_resched()) | 254 | if (need_resched()) |
255 | break; | 255 | break; |
256 | __mwait(0, 0); | 256 | __mwait(0, 0); |
257 | } | 257 | } |
258 | } | 258 | } |
259 | 259 | ||
260 | void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) | 260 | void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) |
261 | { | 261 | { |
262 | static int printed; | 262 | static int printed; |
263 | if (cpu_has(c, X86_FEATURE_MWAIT)) { | 263 | if (cpu_has(c, X86_FEATURE_MWAIT)) { |
264 | /* | 264 | /* |
265 | * Skip, if setup has overridden idle. | 265 | * Skip, if setup has overridden idle. |
266 | * One CPU supports mwait => All CPUs supports mwait | 266 | * One CPU supports mwait => All CPUs supports mwait |
267 | */ | 267 | */ |
268 | if (!pm_idle) { | 268 | if (!pm_idle) { |
269 | if (!printed) { | 269 | if (!printed) { |
270 | printk("using mwait in idle threads.\n"); | 270 | printk("using mwait in idle threads.\n"); |
271 | printed = 1; | 271 | printed = 1; |
272 | } | 272 | } |
273 | pm_idle = mwait_idle; | 273 | pm_idle = mwait_idle; |
274 | } | 274 | } |
275 | } | 275 | } |
276 | } | 276 | } |
277 | 277 | ||
278 | static int __init idle_setup (char *str) | 278 | static int __init idle_setup (char *str) |
279 | { | 279 | { |
280 | if (!strncmp(str, "poll", 4)) { | 280 | if (!strncmp(str, "poll", 4)) { |
281 | printk("using polling idle threads.\n"); | 281 | printk("using polling idle threads.\n"); |
282 | pm_idle = poll_idle; | 282 | pm_idle = poll_idle; |
283 | } | 283 | } |
284 | 284 | ||
285 | boot_option_idle_override = 1; | 285 | boot_option_idle_override = 1; |
286 | return 1; | 286 | return 1; |
287 | } | 287 | } |
288 | 288 | ||
289 | __setup("idle=", idle_setup); | 289 | __setup("idle=", idle_setup); |
290 | 290 | ||
291 | /* Prints also some state that isn't saved in the pt_regs */ | 291 | /* Prints also some state that isn't saved in the pt_regs */ |
292 | void __show_regs(struct pt_regs * regs) | 292 | void __show_regs(struct pt_regs * regs) |
293 | { | 293 | { |
294 | unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; | 294 | unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; |
295 | unsigned int fsindex,gsindex; | 295 | unsigned int fsindex,gsindex; |
296 | unsigned int ds,cs,es; | 296 | unsigned int ds,cs,es; |
297 | 297 | ||
298 | printk("\n"); | 298 | printk("\n"); |
299 | print_modules(); | 299 | print_modules(); |
300 | printk("Pid: %d, comm: %.20s %s %s %.*s\n", | 300 | printk("Pid: %d, comm: %.20s %s %s %.*s\n", |
301 | current->pid, current->comm, print_tainted(), | 301 | current->pid, current->comm, print_tainted(), |
302 | system_utsname.release, | 302 | system_utsname.release, |
303 | (int)strcspn(system_utsname.version, " "), | 303 | (int)strcspn(system_utsname.version, " "), |
304 | system_utsname.version); | 304 | system_utsname.version); |
305 | printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); | 305 | printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); |
306 | printk_address(regs->rip); | 306 | printk_address(regs->rip); |
307 | printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, | 307 | printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, |
308 | regs->eflags); | 308 | regs->eflags); |
309 | printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", | 309 | printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", |
310 | regs->rax, regs->rbx, regs->rcx); | 310 | regs->rax, regs->rbx, regs->rcx); |
311 | printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", | 311 | printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", |
312 | regs->rdx, regs->rsi, regs->rdi); | 312 | regs->rdx, regs->rsi, regs->rdi); |
313 | printk("RBP: %016lx R08: %016lx R09: %016lx\n", | 313 | printk("RBP: %016lx R08: %016lx R09: %016lx\n", |
314 | regs->rbp, regs->r8, regs->r9); | 314 | regs->rbp, regs->r8, regs->r9); |
315 | printk("R10: %016lx R11: %016lx R12: %016lx\n", | 315 | printk("R10: %016lx R11: %016lx R12: %016lx\n", |
316 | regs->r10, regs->r11, regs->r12); | 316 | regs->r10, regs->r11, regs->r12); |
317 | printk("R13: %016lx R14: %016lx R15: %016lx\n", | 317 | printk("R13: %016lx R14: %016lx R15: %016lx\n", |
318 | regs->r13, regs->r14, regs->r15); | 318 | regs->r13, regs->r14, regs->r15); |
319 | 319 | ||
320 | asm("movl %%ds,%0" : "=r" (ds)); | 320 | asm("movl %%ds,%0" : "=r" (ds)); |
321 | asm("movl %%cs,%0" : "=r" (cs)); | 321 | asm("movl %%cs,%0" : "=r" (cs)); |
322 | asm("movl %%es,%0" : "=r" (es)); | 322 | asm("movl %%es,%0" : "=r" (es)); |
323 | asm("movl %%fs,%0" : "=r" (fsindex)); | 323 | asm("movl %%fs,%0" : "=r" (fsindex)); |
324 | asm("movl %%gs,%0" : "=r" (gsindex)); | 324 | asm("movl %%gs,%0" : "=r" (gsindex)); |
325 | 325 | ||
326 | rdmsrl(MSR_FS_BASE, fs); | 326 | rdmsrl(MSR_FS_BASE, fs); |
327 | rdmsrl(MSR_GS_BASE, gs); | 327 | rdmsrl(MSR_GS_BASE, gs); |
328 | rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); | 328 | rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); |
329 | 329 | ||
330 | asm("movq %%cr0, %0": "=r" (cr0)); | 330 | asm("movq %%cr0, %0": "=r" (cr0)); |
331 | asm("movq %%cr2, %0": "=r" (cr2)); | 331 | asm("movq %%cr2, %0": "=r" (cr2)); |
332 | asm("movq %%cr3, %0": "=r" (cr3)); | 332 | asm("movq %%cr3, %0": "=r" (cr3)); |
333 | asm("movq %%cr4, %0": "=r" (cr4)); | 333 | asm("movq %%cr4, %0": "=r" (cr4)); |
334 | 334 | ||
335 | printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", | 335 | printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", |
336 | fs,fsindex,gs,gsindex,shadowgs); | 336 | fs,fsindex,gs,gsindex,shadowgs); |
337 | printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); | 337 | printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); |
338 | printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); | 338 | printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); |
339 | } | 339 | } |
340 | 340 | ||
341 | void show_regs(struct pt_regs *regs) | 341 | void show_regs(struct pt_regs *regs) |
342 | { | 342 | { |
343 | printk("CPU %d:", smp_processor_id()); | 343 | printk("CPU %d:", smp_processor_id()); |
344 | __show_regs(regs); | 344 | __show_regs(regs); |
345 | show_trace(&regs->rsp); | 345 | show_trace(&regs->rsp); |
346 | } | 346 | } |
347 | 347 | ||
348 | /* | 348 | /* |
349 | * Free current thread data structures etc.. | 349 | * Free current thread data structures etc.. |
350 | */ | 350 | */ |
351 | void exit_thread(void) | 351 | void exit_thread(void) |
352 | { | 352 | { |
353 | struct task_struct *me = current; | 353 | struct task_struct *me = current; |
354 | struct thread_struct *t = &me->thread; | 354 | struct thread_struct *t = &me->thread; |
355 | |||
356 | /* | ||
357 | * Remove function-return probe instances associated with this task | ||
358 | * and put them back on the free list. Do not insert an exit probe for | ||
359 | * this function, it will be disabled by kprobe_flush_task if you do. | ||
360 | */ | ||
361 | kprobe_flush_task(me); | ||
362 | 355 | ||
363 | if (me->thread.io_bitmap_ptr) { | 356 | if (me->thread.io_bitmap_ptr) { |
364 | struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); | 357 | struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); |
365 | 358 | ||
366 | kfree(t->io_bitmap_ptr); | 359 | kfree(t->io_bitmap_ptr); |
367 | t->io_bitmap_ptr = NULL; | 360 | t->io_bitmap_ptr = NULL; |
368 | /* | 361 | /* |
369 | * Careful, clear this in the TSS too: | 362 | * Careful, clear this in the TSS too: |
370 | */ | 363 | */ |
371 | memset(tss->io_bitmap, 0xff, t->io_bitmap_max); | 364 | memset(tss->io_bitmap, 0xff, t->io_bitmap_max); |
372 | t->io_bitmap_max = 0; | 365 | t->io_bitmap_max = 0; |
373 | put_cpu(); | 366 | put_cpu(); |
374 | } | 367 | } |
375 | } | 368 | } |
376 | 369 | ||
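The hunk above drops the kprobe_flush_task() call, and the comment explaining it, from the x86-64 exit_thread(); after this patch exit_thread() only has to release the I/O bitmap. For reference, a simplified sketch of what kprobe_flush_task() in kernel/kprobes.c does with the kretprobe_inst_table, kretprobe_lock and KPROBE_HASH_BITS declared further down in this diff — the helper and field names are reconstructed from memory and may differ slightly from the real source.

/*
 * Simplified sketch of kprobe_flush_task() (kernel/kprobes.c of this era).
 * Relies on kretprobe_inst_table[], kretprobe_lock and KPROBE_HASH_BITS
 * shown below, plus hash_ptr() from linux/hash.h.
 */
static struct hlist_head *inst_table_head_sketch(struct task_struct *tsk)
{
	return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
}

void kprobe_flush_task_sketch(struct task_struct *tk)
{
	struct kretprobe_instance *ri;
	struct hlist_head *head;
	struct hlist_node *node, *tmp;
	unsigned long flags;

	spin_lock_irqsave(&kretprobe_lock, flags);
	head = inst_table_head_sketch(tk);
	/*
	 * Put every still-outstanding return-probe instance of this task
	 * back on its kretprobe's free list.
	 */
	hlist_for_each_entry_safe(ri, node, tmp, head, hlist)
		if (ri->task == tk)
			recycle_rp_inst(ri);
	spin_unlock_irqrestore(&kretprobe_lock, flags);
}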
377 | void flush_thread(void) | 370 | void flush_thread(void) |
378 | { | 371 | { |
379 | struct task_struct *tsk = current; | 372 | struct task_struct *tsk = current; |
380 | struct thread_info *t = current_thread_info(); | 373 | struct thread_info *t = current_thread_info(); |
381 | 374 | ||
382 | if (t->flags & _TIF_ABI_PENDING) | 375 | if (t->flags & _TIF_ABI_PENDING) |
383 | t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32); | 376 | t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32); |
384 | 377 | ||
385 | tsk->thread.debugreg0 = 0; | 378 | tsk->thread.debugreg0 = 0; |
386 | tsk->thread.debugreg1 = 0; | 379 | tsk->thread.debugreg1 = 0; |
387 | tsk->thread.debugreg2 = 0; | 380 | tsk->thread.debugreg2 = 0; |
388 | tsk->thread.debugreg3 = 0; | 381 | tsk->thread.debugreg3 = 0; |
389 | tsk->thread.debugreg6 = 0; | 382 | tsk->thread.debugreg6 = 0; |
390 | tsk->thread.debugreg7 = 0; | 383 | tsk->thread.debugreg7 = 0; |
391 | memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); | 384 | memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); |
392 | /* | 385 | /* |
393 | * Forget coprocessor state.. | 386 | * Forget coprocessor state.. |
394 | */ | 387 | */ |
395 | clear_fpu(tsk); | 388 | clear_fpu(tsk); |
396 | clear_used_math(); | 389 | clear_used_math(); |
397 | } | 390 | } |
398 | 391 | ||
399 | void release_thread(struct task_struct *dead_task) | 392 | void release_thread(struct task_struct *dead_task) |
400 | { | 393 | { |
401 | if (dead_task->mm) { | 394 | if (dead_task->mm) { |
402 | if (dead_task->mm->context.size) { | 395 | if (dead_task->mm->context.size) { |
403 | printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", | 396 | printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", |
404 | dead_task->comm, | 397 | dead_task->comm, |
405 | dead_task->mm->context.ldt, | 398 | dead_task->mm->context.ldt, |
406 | dead_task->mm->context.size); | 399 | dead_task->mm->context.size); |
407 | BUG(); | 400 | BUG(); |
408 | } | 401 | } |
409 | } | 402 | } |
410 | } | 403 | } |
411 | 404 | ||
412 | static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) | 405 | static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) |
413 | { | 406 | { |
414 | struct user_desc ud = { | 407 | struct user_desc ud = { |
415 | .base_addr = addr, | 408 | .base_addr = addr, |
416 | .limit = 0xfffff, | 409 | .limit = 0xfffff, |
417 | .seg_32bit = 1, | 410 | .seg_32bit = 1, |
418 | .limit_in_pages = 1, | 411 | .limit_in_pages = 1, |
419 | .useable = 1, | 412 | .useable = 1, |
420 | }; | 413 | }; |
421 | struct n_desc_struct *desc = (void *)t->thread.tls_array; | 414 | struct n_desc_struct *desc = (void *)t->thread.tls_array; |
422 | desc += tls; | 415 | desc += tls; |
423 | desc->a = LDT_entry_a(&ud); | 416 | desc->a = LDT_entry_a(&ud); |
424 | desc->b = LDT_entry_b(&ud); | 417 | desc->b = LDT_entry_b(&ud); |
425 | } | 418 | } |
426 | 419 | ||
427 | static inline u32 read_32bit_tls(struct task_struct *t, int tls) | 420 | static inline u32 read_32bit_tls(struct task_struct *t, int tls) |
428 | { | 421 | { |
429 | struct desc_struct *desc = (void *)t->thread.tls_array; | 422 | struct desc_struct *desc = (void *)t->thread.tls_array; |
430 | desc += tls; | 423 | desc += tls; |
431 | return desc->base0 | | 424 | return desc->base0 | |
432 | (((u32)desc->base1) << 16) | | 425 | (((u32)desc->base1) << 16) | |
433 | (((u32)desc->base2) << 24); | 426 | (((u32)desc->base2) << 24); |
434 | } | 427 | } |
435 | 428 | ||
436 | /* | 429 | /* |
437 | * This gets called before we allocate a new thread and copy | 430 | * This gets called before we allocate a new thread and copy |
438 | * the current task into it. | 431 | * the current task into it. |
439 | */ | 432 | */ |
440 | void prepare_to_copy(struct task_struct *tsk) | 433 | void prepare_to_copy(struct task_struct *tsk) |
441 | { | 434 | { |
442 | unlazy_fpu(tsk); | 435 | unlazy_fpu(tsk); |
443 | } | 436 | } |
444 | 437 | ||
445 | int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, | 438 | int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, |
446 | unsigned long unused, | 439 | unsigned long unused, |
447 | struct task_struct * p, struct pt_regs * regs) | 440 | struct task_struct * p, struct pt_regs * regs) |
448 | { | 441 | { |
449 | int err; | 442 | int err; |
450 | struct pt_regs * childregs; | 443 | struct pt_regs * childregs; |
451 | struct task_struct *me = current; | 444 | struct task_struct *me = current; |
452 | 445 | ||
453 | childregs = ((struct pt_regs *) | 446 | childregs = ((struct pt_regs *) |
454 | (THREAD_SIZE + task_stack_page(p))) - 1; | 447 | (THREAD_SIZE + task_stack_page(p))) - 1; |
455 | *childregs = *regs; | 448 | *childregs = *regs; |
456 | 449 | ||
457 | childregs->rax = 0; | 450 | childregs->rax = 0; |
458 | childregs->rsp = rsp; | 451 | childregs->rsp = rsp; |
459 | if (rsp == ~0UL) | 452 | if (rsp == ~0UL) |
460 | childregs->rsp = (unsigned long)childregs; | 453 | childregs->rsp = (unsigned long)childregs; |
461 | 454 | ||
462 | p->thread.rsp = (unsigned long) childregs; | 455 | p->thread.rsp = (unsigned long) childregs; |
463 | p->thread.rsp0 = (unsigned long) (childregs+1); | 456 | p->thread.rsp0 = (unsigned long) (childregs+1); |
464 | p->thread.userrsp = me->thread.userrsp; | 457 | p->thread.userrsp = me->thread.userrsp; |
465 | 458 | ||
466 | set_tsk_thread_flag(p, TIF_FORK); | 459 | set_tsk_thread_flag(p, TIF_FORK); |
467 | 460 | ||
468 | p->thread.fs = me->thread.fs; | 461 | p->thread.fs = me->thread.fs; |
469 | p->thread.gs = me->thread.gs; | 462 | p->thread.gs = me->thread.gs; |
470 | 463 | ||
471 | asm("mov %%gs,%0" : "=m" (p->thread.gsindex)); | 464 | asm("mov %%gs,%0" : "=m" (p->thread.gsindex)); |
472 | asm("mov %%fs,%0" : "=m" (p->thread.fsindex)); | 465 | asm("mov %%fs,%0" : "=m" (p->thread.fsindex)); |
473 | asm("mov %%es,%0" : "=m" (p->thread.es)); | 466 | asm("mov %%es,%0" : "=m" (p->thread.es)); |
474 | asm("mov %%ds,%0" : "=m" (p->thread.ds)); | 467 | asm("mov %%ds,%0" : "=m" (p->thread.ds)); |
475 | 468 | ||
476 | if (unlikely(me->thread.io_bitmap_ptr != NULL)) { | 469 | if (unlikely(me->thread.io_bitmap_ptr != NULL)) { |
477 | p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); | 470 | p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); |
478 | if (!p->thread.io_bitmap_ptr) { | 471 | if (!p->thread.io_bitmap_ptr) { |
479 | p->thread.io_bitmap_max = 0; | 472 | p->thread.io_bitmap_max = 0; |
480 | return -ENOMEM; | 473 | return -ENOMEM; |
481 | } | 474 | } |
482 | memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, | 475 | memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, |
483 | IO_BITMAP_BYTES); | 476 | IO_BITMAP_BYTES); |
484 | } | 477 | } |
485 | 478 | ||
486 | /* | 479 | /* |
487 | * Set a new TLS for the child thread? | 480 | * Set a new TLS for the child thread? |
488 | */ | 481 | */ |
489 | if (clone_flags & CLONE_SETTLS) { | 482 | if (clone_flags & CLONE_SETTLS) { |
490 | #ifdef CONFIG_IA32_EMULATION | 483 | #ifdef CONFIG_IA32_EMULATION |
491 | if (test_thread_flag(TIF_IA32)) | 484 | if (test_thread_flag(TIF_IA32)) |
492 | err = ia32_child_tls(p, childregs); | 485 | err = ia32_child_tls(p, childregs); |
493 | else | 486 | else |
494 | #endif | 487 | #endif |
495 | err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); | 488 | err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); |
496 | if (err) | 489 | if (err) |
497 | goto out; | 490 | goto out; |
498 | } | 491 | } |
499 | err = 0; | 492 | err = 0; |
500 | out: | 493 | out: |
501 | if (err && p->thread.io_bitmap_ptr) { | 494 | if (err && p->thread.io_bitmap_ptr) { |
502 | kfree(p->thread.io_bitmap_ptr); | 495 | kfree(p->thread.io_bitmap_ptr); |
503 | p->thread.io_bitmap_max = 0; | 496 | p->thread.io_bitmap_max = 0; |
504 | } | 497 | } |
505 | return err; | 498 | return err; |
506 | } | 499 | } |
507 | 500 | ||
508 | /* | 501 | /* |
509 | * This special macro can be used to load a debugging register | 502 | * This special macro can be used to load a debugging register |
510 | */ | 503 | */ |
511 | #define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r) | 504 | #define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r) |
512 | 505 | ||
513 | /* | 506 | /* |
514 | * switch_to(x,y) should switch tasks from x to y. | 507 | * switch_to(x,y) should switch tasks from x to y. |
515 | * | 508 | * |
516 | * This could still be optimized: | 509 | * This could still be optimized: |
517 | * - fold all the options into a flag word and test it with a single test. | 510 | * - fold all the options into a flag word and test it with a single test. |
518 | * - could test fs/gs bitsliced | 511 | * - could test fs/gs bitsliced |
519 | * | 512 | * |
520 | * Kprobes not supported here. Set the probe on schedule instead. | 513 | * Kprobes not supported here. Set the probe on schedule instead. |
521 | */ | 514 | */ |
522 | __kprobes struct task_struct * | 515 | __kprobes struct task_struct * |
523 | __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | 516 | __switch_to(struct task_struct *prev_p, struct task_struct *next_p) |
524 | { | 517 | { |
525 | struct thread_struct *prev = &prev_p->thread, | 518 | struct thread_struct *prev = &prev_p->thread, |
526 | *next = &next_p->thread; | 519 | *next = &next_p->thread; |
527 | int cpu = smp_processor_id(); | 520 | int cpu = smp_processor_id(); |
528 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | 521 | struct tss_struct *tss = &per_cpu(init_tss, cpu); |
529 | 522 | ||
530 | /* | 523 | /* |
531 | * Reload esp0, LDT and the page table pointer: | 524 | * Reload esp0, LDT and the page table pointer: |
532 | */ | 525 | */ |
533 | tss->rsp0 = next->rsp0; | 526 | tss->rsp0 = next->rsp0; |
534 | 527 | ||
535 | /* | 528 | /* |
536 | * Switch DS and ES. | 529 | * Switch DS and ES. |
537 | * This won't pick up thread selector changes, but I guess that is ok. | 530 | * This won't pick up thread selector changes, but I guess that is ok. |
538 | */ | 531 | */ |
539 | asm volatile("mov %%es,%0" : "=m" (prev->es)); | 532 | asm volatile("mov %%es,%0" : "=m" (prev->es)); |
540 | if (unlikely(next->es | prev->es)) | 533 | if (unlikely(next->es | prev->es)) |
541 | loadsegment(es, next->es); | 534 | loadsegment(es, next->es); |
542 | 535 | ||
543 | asm volatile ("mov %%ds,%0" : "=m" (prev->ds)); | 536 | asm volatile ("mov %%ds,%0" : "=m" (prev->ds)); |
544 | if (unlikely(next->ds | prev->ds)) | 537 | if (unlikely(next->ds | prev->ds)) |
545 | loadsegment(ds, next->ds); | 538 | loadsegment(ds, next->ds); |
546 | 539 | ||
547 | load_TLS(next, cpu); | 540 | load_TLS(next, cpu); |
548 | 541 | ||
549 | /* | 542 | /* |
550 | * Switch FS and GS. | 543 | * Switch FS and GS. |
551 | */ | 544 | */ |
552 | { | 545 | { |
553 | unsigned fsindex; | 546 | unsigned fsindex; |
554 | asm volatile("movl %%fs,%0" : "=r" (fsindex)); | 547 | asm volatile("movl %%fs,%0" : "=r" (fsindex)); |
555 | /* segment register != 0 always requires a reload. | 548 | /* segment register != 0 always requires a reload. |
556 | also reload when it has changed. | 549 | also reload when it has changed. |
557 | when prev process used 64bit base always reload | 550 | when prev process used 64bit base always reload |
558 | to avoid an information leak. */ | 551 | to avoid an information leak. */ |
559 | if (unlikely(fsindex | next->fsindex | prev->fs)) { | 552 | if (unlikely(fsindex | next->fsindex | prev->fs)) { |
560 | loadsegment(fs, next->fsindex); | 553 | loadsegment(fs, next->fsindex); |
561 | /* check if the user used a selector != 0 | 554 | /* check if the user used a selector != 0 |
562 | * if yes clear 64bit base, since overloaded base | 555 | * if yes clear 64bit base, since overloaded base |
563 | * is always mapped to the Null selector | 556 | * is always mapped to the Null selector |
564 | */ | 557 | */ |
565 | if (fsindex) | 558 | if (fsindex) |
566 | prev->fs = 0; | 559 | prev->fs = 0; |
567 | } | 560 | } |
568 | /* when next process has a 64bit base use it */ | 561 | /* when next process has a 64bit base use it */ |
569 | if (next->fs) | 562 | if (next->fs) |
570 | wrmsrl(MSR_FS_BASE, next->fs); | 563 | wrmsrl(MSR_FS_BASE, next->fs); |
571 | prev->fsindex = fsindex; | 564 | prev->fsindex = fsindex; |
572 | } | 565 | } |
573 | { | 566 | { |
574 | unsigned gsindex; | 567 | unsigned gsindex; |
575 | asm volatile("movl %%gs,%0" : "=r" (gsindex)); | 568 | asm volatile("movl %%gs,%0" : "=r" (gsindex)); |
576 | if (unlikely(gsindex | next->gsindex | prev->gs)) { | 569 | if (unlikely(gsindex | next->gsindex | prev->gs)) { |
577 | load_gs_index(next->gsindex); | 570 | load_gs_index(next->gsindex); |
578 | if (gsindex) | 571 | if (gsindex) |
579 | prev->gs = 0; | 572 | prev->gs = 0; |
580 | } | 573 | } |
581 | if (next->gs) | 574 | if (next->gs) |
582 | wrmsrl(MSR_KERNEL_GS_BASE, next->gs); | 575 | wrmsrl(MSR_KERNEL_GS_BASE, next->gs); |
583 | prev->gsindex = gsindex; | 576 | prev->gsindex = gsindex; |
584 | } | 577 | } |
585 | 578 | ||
586 | /* | 579 | /* |
587 | * Switch the PDA and FPU contexts. | 580 | * Switch the PDA and FPU contexts. |
588 | */ | 581 | */ |
589 | prev->userrsp = read_pda(oldrsp); | 582 | prev->userrsp = read_pda(oldrsp); |
590 | write_pda(oldrsp, next->userrsp); | 583 | write_pda(oldrsp, next->userrsp); |
591 | write_pda(pcurrent, next_p); | 584 | write_pda(pcurrent, next_p); |
592 | /* This must be here to ensure both math_state_restore() and | 585 | /* This must be here to ensure both math_state_restore() and |
593 | kernel_fpu_begin() work consistently. */ | 586 | kernel_fpu_begin() work consistently. */ |
594 | unlazy_fpu(prev_p); | 587 | unlazy_fpu(prev_p); |
595 | write_pda(kernelstack, | 588 | write_pda(kernelstack, |
596 | task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); | 589 | task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); |
597 | 590 | ||
598 | /* | 591 | /* |
599 | * Now maybe reload the debug registers | 592 | * Now maybe reload the debug registers |
600 | */ | 593 | */ |
601 | if (unlikely(next->debugreg7)) { | 594 | if (unlikely(next->debugreg7)) { |
602 | loaddebug(next, 0); | 595 | loaddebug(next, 0); |
603 | loaddebug(next, 1); | 596 | loaddebug(next, 1); |
604 | loaddebug(next, 2); | 597 | loaddebug(next, 2); |
605 | loaddebug(next, 3); | 598 | loaddebug(next, 3); |
606 | /* no 4 and 5 */ | 599 | /* no 4 and 5 */ |
607 | loaddebug(next, 6); | 600 | loaddebug(next, 6); |
608 | loaddebug(next, 7); | 601 | loaddebug(next, 7); |
609 | } | 602 | } |
610 | 603 | ||
611 | 604 | ||
612 | /* | 605 | /* |
613 | * Handle the IO bitmap | 606 | * Handle the IO bitmap |
614 | */ | 607 | */ |
615 | if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { | 608 | if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { |
616 | if (next->io_bitmap_ptr) | 609 | if (next->io_bitmap_ptr) |
617 | /* | 610 | /* |
618 | * Copy the relevant range of the IO bitmap. | 611 | * Copy the relevant range of the IO bitmap. |
619 | * Normally this is 128 bytes or less: | 612 | * Normally this is 128 bytes or less: |
620 | */ | 613 | */ |
621 | memcpy(tss->io_bitmap, next->io_bitmap_ptr, | 614 | memcpy(tss->io_bitmap, next->io_bitmap_ptr, |
622 | max(prev->io_bitmap_max, next->io_bitmap_max)); | 615 | max(prev->io_bitmap_max, next->io_bitmap_max)); |
623 | else { | 616 | else { |
624 | /* | 617 | /* |
625 | * Clear any possible leftover bits: | 618 | * Clear any possible leftover bits: |
626 | */ | 619 | */ |
627 | memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); | 620 | memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); |
628 | } | 621 | } |
629 | } | 622 | } |
630 | 623 | ||
631 | return prev_p; | 624 | return prev_p; |
632 | } | 625 | } |
633 | 626 | ||
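The comment above __switch_to() notes that this function itself cannot be kprobed and points probe users at schedule() instead. As a hedged sketch, a module that registers a return probe on schedule() might look like the following; the struct kretprobe field names are taken from linux/kprobes.h of this era and should be treated as approximate.

/* Sketch: a module placing a kretprobe on schedule(). */
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/sched.h>

static int schedule_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	/* Runs each time a probed schedule() invocation returns. */
	return 0;
}

static struct kretprobe schedule_rp = {
	.kp.addr	= (kprobe_opcode_t *) schedule,
	.handler	= schedule_ret_handler,
	.maxactive	= 20,	/* how many probed calls may be outstanding at once */
};

static int __init rp_init(void)
{
	return register_kretprobe(&schedule_rp);
}

static void __exit rp_exit(void)
{
	unregister_kretprobe(&schedule_rp);
}

module_init(rp_init);
module_exit(rp_exit);
MODULE_LICENSE("GPL");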
634 | /* | 627 | /* |
635 | * sys_execve() executes a new program. | 628 | * sys_execve() executes a new program. |
636 | */ | 629 | */ |
637 | asmlinkage | 630 | asmlinkage |
638 | long sys_execve(char __user *name, char __user * __user *argv, | 631 | long sys_execve(char __user *name, char __user * __user *argv, |
639 | char __user * __user *envp, struct pt_regs regs) | 632 | char __user * __user *envp, struct pt_regs regs) |
640 | { | 633 | { |
641 | long error; | 634 | long error; |
642 | char * filename; | 635 | char * filename; |
643 | 636 | ||
644 | filename = getname(name); | 637 | filename = getname(name); |
645 | error = PTR_ERR(filename); | 638 | error = PTR_ERR(filename); |
646 | if (IS_ERR(filename)) | 639 | if (IS_ERR(filename)) |
647 | return error; | 640 | return error; |
648 | error = do_execve(filename, argv, envp, &regs); | 641 | error = do_execve(filename, argv, envp, &regs); |
649 | if (error == 0) { | 642 | if (error == 0) { |
650 | task_lock(current); | 643 | task_lock(current); |
651 | current->ptrace &= ~PT_DTRACE; | 644 | current->ptrace &= ~PT_DTRACE; |
652 | task_unlock(current); | 645 | task_unlock(current); |
653 | } | 646 | } |
654 | putname(filename); | 647 | putname(filename); |
655 | return error; | 648 | return error; |
656 | } | 649 | } |
657 | 650 | ||
658 | void set_personality_64bit(void) | 651 | void set_personality_64bit(void) |
659 | { | 652 | { |
660 | /* inherit personality from parent */ | 653 | /* inherit personality from parent */ |
661 | 654 | ||
662 | /* Make sure to be in 64bit mode */ | 655 | /* Make sure to be in 64bit mode */ |
663 | clear_thread_flag(TIF_IA32); | 656 | clear_thread_flag(TIF_IA32); |
664 | 657 | ||
665 | /* TBD: overwrites user setup. Should have two bits. | 658 | /* TBD: overwrites user setup. Should have two bits. |
666 | But 64bit processes have always behaved this way, | 659 | But 64bit processes have always behaved this way, |
667 | so it's not too bad. The main problem is just that | 660 | so it's not too bad. The main problem is just that |
668 | 32bit childs are affected again. */ | 661 | 32bit childs are affected again. */ |
669 | current->personality &= ~READ_IMPLIES_EXEC; | 662 | current->personality &= ~READ_IMPLIES_EXEC; |
670 | } | 663 | } |
671 | 664 | ||
672 | asmlinkage long sys_fork(struct pt_regs *regs) | 665 | asmlinkage long sys_fork(struct pt_regs *regs) |
673 | { | 666 | { |
674 | return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL); | 667 | return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL); |
675 | } | 668 | } |
676 | 669 | ||
677 | asmlinkage long | 670 | asmlinkage long |
678 | sys_clone(unsigned long clone_flags, unsigned long newsp, | 671 | sys_clone(unsigned long clone_flags, unsigned long newsp, |
679 | void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) | 672 | void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) |
680 | { | 673 | { |
681 | if (!newsp) | 674 | if (!newsp) |
682 | newsp = regs->rsp; | 675 | newsp = regs->rsp; |
683 | return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); | 676 | return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); |
684 | } | 677 | } |
685 | 678 | ||
686 | /* | 679 | /* |
687 | * This is trivial, and on the face of it looks like it | 680 | * This is trivial, and on the face of it looks like it |
688 | * could equally well be done in user mode. | 681 | * could equally well be done in user mode. |
689 | * | 682 | * |
690 | * Not so, for quite unobvious reasons - register pressure. | 683 | * Not so, for quite unobvious reasons - register pressure. |
691 | * In user mode vfork() cannot have a stack frame, and if | 684 | * In user mode vfork() cannot have a stack frame, and if |
692 | * done by calling the "clone()" system call directly, you | 685 | * done by calling the "clone()" system call directly, you |
693 | * do not have enough call-clobbered registers to hold all | 686 | * do not have enough call-clobbered registers to hold all |
694 | * the information you need. | 687 | * the information you need. |
695 | */ | 688 | */ |
696 | asmlinkage long sys_vfork(struct pt_regs *regs) | 689 | asmlinkage long sys_vfork(struct pt_regs *regs) |
697 | { | 690 | { |
698 | return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0, | 691 | return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0, |
699 | NULL, NULL); | 692 | NULL, NULL); |
700 | } | 693 | } |
701 | 694 | ||
702 | unsigned long get_wchan(struct task_struct *p) | 695 | unsigned long get_wchan(struct task_struct *p) |
703 | { | 696 | { |
704 | unsigned long stack; | 697 | unsigned long stack; |
705 | u64 fp,rip; | 698 | u64 fp,rip; |
706 | int count = 0; | 699 | int count = 0; |
707 | 700 | ||
708 | if (!p || p == current || p->state==TASK_RUNNING) | 701 | if (!p || p == current || p->state==TASK_RUNNING) |
709 | return 0; | 702 | return 0; |
710 | stack = (unsigned long)task_stack_page(p); | 703 | stack = (unsigned long)task_stack_page(p); |
711 | if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE) | 704 | if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE) |
712 | return 0; | 705 | return 0; |
713 | fp = *(u64 *)(p->thread.rsp); | 706 | fp = *(u64 *)(p->thread.rsp); |
714 | do { | 707 | do { |
715 | if (fp < (unsigned long)stack || | 708 | if (fp < (unsigned long)stack || |
716 | fp > (unsigned long)stack+THREAD_SIZE) | 709 | fp > (unsigned long)stack+THREAD_SIZE) |
717 | return 0; | 710 | return 0; |
718 | rip = *(u64 *)(fp+8); | 711 | rip = *(u64 *)(fp+8); |
719 | if (!in_sched_functions(rip)) | 712 | if (!in_sched_functions(rip)) |
720 | return rip; | 713 | return rip; |
721 | fp = *(u64 *)fp; | 714 | fp = *(u64 *)fp; |
722 | } while (count++ < 16); | 715 | } while (count++ < 16); |
723 | return 0; | 716 | return 0; |
724 | } | 717 | } |
725 | 718 | ||
726 | long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) | 719 | long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) |
727 | { | 720 | { |
728 | int ret = 0; | 721 | int ret = 0; |
729 | int doit = task == current; | 722 | int doit = task == current; |
730 | int cpu; | 723 | int cpu; |
731 | 724 | ||
732 | switch (code) { | 725 | switch (code) { |
733 | case ARCH_SET_GS: | 726 | case ARCH_SET_GS: |
734 | if (addr >= TASK_SIZE_OF(task)) | 727 | if (addr >= TASK_SIZE_OF(task)) |
735 | return -EPERM; | 728 | return -EPERM; |
736 | cpu = get_cpu(); | 729 | cpu = get_cpu(); |
737 | /* handle small bases via the GDT because that's faster to | 730 | /* handle small bases via the GDT because that's faster to |
738 | switch. */ | 731 | switch. */ |
739 | if (addr <= 0xffffffff) { | 732 | if (addr <= 0xffffffff) { |
740 | set_32bit_tls(task, GS_TLS, addr); | 733 | set_32bit_tls(task, GS_TLS, addr); |
741 | if (doit) { | 734 | if (doit) { |
742 | load_TLS(&task->thread, cpu); | 735 | load_TLS(&task->thread, cpu); |
743 | load_gs_index(GS_TLS_SEL); | 736 | load_gs_index(GS_TLS_SEL); |
744 | } | 737 | } |
745 | task->thread.gsindex = GS_TLS_SEL; | 738 | task->thread.gsindex = GS_TLS_SEL; |
746 | task->thread.gs = 0; | 739 | task->thread.gs = 0; |
747 | } else { | 740 | } else { |
748 | task->thread.gsindex = 0; | 741 | task->thread.gsindex = 0; |
749 | task->thread.gs = addr; | 742 | task->thread.gs = addr; |
750 | if (doit) { | 743 | if (doit) { |
751 | load_gs_index(0); | 744 | load_gs_index(0); |
752 | ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); | 745 | ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); |
753 | } | 746 | } |
754 | } | 747 | } |
755 | put_cpu(); | 748 | put_cpu(); |
756 | break; | 749 | break; |
757 | case ARCH_SET_FS: | 750 | case ARCH_SET_FS: |
758 | /* Not strictly needed for fs, but do it for symmetry | 751 | /* Not strictly needed for fs, but do it for symmetry |
759 | with gs */ | 752 | with gs */ |
760 | if (addr >= TASK_SIZE_OF(task)) | 753 | if (addr >= TASK_SIZE_OF(task)) |
761 | return -EPERM; | 754 | return -EPERM; |
762 | cpu = get_cpu(); | 755 | cpu = get_cpu(); |
763 | /* handle small bases via the GDT because that's faster to | 756 | /* handle small bases via the GDT because that's faster to |
764 | switch. */ | 757 | switch. */ |
765 | if (addr <= 0xffffffff) { | 758 | if (addr <= 0xffffffff) { |
766 | set_32bit_tls(task, FS_TLS, addr); | 759 | set_32bit_tls(task, FS_TLS, addr); |
767 | if (doit) { | 760 | if (doit) { |
768 | load_TLS(&task->thread, cpu); | 761 | load_TLS(&task->thread, cpu); |
769 | asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL)); | 762 | asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL)); |
770 | } | 763 | } |
771 | task->thread.fsindex = FS_TLS_SEL; | 764 | task->thread.fsindex = FS_TLS_SEL; |
772 | task->thread.fs = 0; | 765 | task->thread.fs = 0; |
773 | } else { | 766 | } else { |
774 | task->thread.fsindex = 0; | 767 | task->thread.fsindex = 0; |
775 | task->thread.fs = addr; | 768 | task->thread.fs = addr; |
776 | if (doit) { | 769 | if (doit) { |
777 | /* set the selector to 0 to not confuse | 770 | /* set the selector to 0 to not confuse |
778 | __switch_to */ | 771 | __switch_to */ |
779 | asm volatile("movl %0,%%fs" :: "r" (0)); | 772 | asm volatile("movl %0,%%fs" :: "r" (0)); |
780 | ret = checking_wrmsrl(MSR_FS_BASE, addr); | 773 | ret = checking_wrmsrl(MSR_FS_BASE, addr); |
781 | } | 774 | } |
782 | } | 775 | } |
783 | put_cpu(); | 776 | put_cpu(); |
784 | break; | 777 | break; |
785 | case ARCH_GET_FS: { | 778 | case ARCH_GET_FS: { |
786 | unsigned long base; | 779 | unsigned long base; |
787 | if (task->thread.fsindex == FS_TLS_SEL) | 780 | if (task->thread.fsindex == FS_TLS_SEL) |
788 | base = read_32bit_tls(task, FS_TLS); | 781 | base = read_32bit_tls(task, FS_TLS); |
789 | else if (doit) | 782 | else if (doit) |
790 | rdmsrl(MSR_FS_BASE, base); | 783 | rdmsrl(MSR_FS_BASE, base); |
791 | else | 784 | else |
792 | base = task->thread.fs; | 785 | base = task->thread.fs; |
793 | ret = put_user(base, (unsigned long __user *)addr); | 786 | ret = put_user(base, (unsigned long __user *)addr); |
794 | break; | 787 | break; |
795 | } | 788 | } |
796 | case ARCH_GET_GS: { | 789 | case ARCH_GET_GS: { |
797 | unsigned long base; | 790 | unsigned long base; |
798 | if (task->thread.gsindex == GS_TLS_SEL) | 791 | if (task->thread.gsindex == GS_TLS_SEL) |
799 | base = read_32bit_tls(task, GS_TLS); | 792 | base = read_32bit_tls(task, GS_TLS); |
800 | else if (doit) | 793 | else if (doit) |
801 | rdmsrl(MSR_KERNEL_GS_BASE, base); | 794 | rdmsrl(MSR_KERNEL_GS_BASE, base); |
802 | else | 795 | else |
803 | base = task->thread.gs; | 796 | base = task->thread.gs; |
804 | ret = put_user(base, (unsigned long __user *)addr); | 797 | ret = put_user(base, (unsigned long __user *)addr); |
805 | break; | 798 | break; |
806 | } | 799 | } |
807 | 800 | ||
808 | default: | 801 | default: |
809 | ret = -EINVAL; | 802 | ret = -EINVAL; |
810 | break; | 803 | break; |
811 | } | 804 | } |
812 | 805 | ||
813 | return ret; | 806 | return ret; |
814 | } | 807 | } |
815 | 808 | ||
816 | long sys_arch_prctl(int code, unsigned long addr) | 809 | long sys_arch_prctl(int code, unsigned long addr) |
817 | { | 810 | { |
818 | return do_arch_prctl(current, code, addr); | 811 | return do_arch_prctl(current, code, addr); |
819 | } | 812 | } |
820 | 813 | ||
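sys_arch_prctl() above is the entry point for the x86-64 arch_prctl() call implemented by do_arch_prctl(). Below is a user-space sketch of driving it through syscall(); the ARCH_* constants come from asm/prctl.h, and the SYS_arch_prctl macro name is assumed to be provided by the libc headers.

/* User-space sketch: querying the FS base via arch_prctl(ARCH_GET_FS, ...). */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/prctl.h>		/* ARCH_SET_FS, ARCH_GET_FS, ARCH_SET_GS, ARCH_GET_GS */

int main(void)
{
	unsigned long fs_base = 0;

	/* ARCH_GET_FS stores the current FS base through the addr argument. */
	if (syscall(SYS_arch_prctl, ARCH_GET_FS, &fs_base) != 0) {
		perror("arch_prctl(ARCH_GET_FS)");
		return 1;
	}
	printf("FS base: %#lx\n", fs_base);
	return 0;
}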
821 | /* | 814 | /* |
822 | * Capture the user space registers if the task is not running (in user space) | 815 | * Capture the user space registers if the task is not running (in user space) |
823 | */ | 816 | */ |
824 | int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) | 817 | int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) |
825 | { | 818 | { |
826 | struct pt_regs *pp, ptregs; | 819 | struct pt_regs *pp, ptregs; |
827 | 820 | ||
828 | pp = task_pt_regs(tsk); | 821 | pp = task_pt_regs(tsk); |
829 | 822 | ||
830 | ptregs = *pp; | 823 | ptregs = *pp; |
831 | ptregs.cs &= 0xffff; | 824 | ptregs.cs &= 0xffff; |
832 | ptregs.ss &= 0xffff; | 825 | ptregs.ss &= 0xffff; |
833 | 826 | ||
834 | elf_core_copy_regs(regs, &ptregs); | 827 | elf_core_copy_regs(regs, &ptregs); |
835 | 828 | ||
836 | return 1; | 829 | return 1; |
837 | } | 830 | } |
838 | 831 | ||
839 | unsigned long arch_align_stack(unsigned long sp) | 832 | unsigned long arch_align_stack(unsigned long sp) |
840 | { | 833 | { |
841 | if (randomize_va_space) | 834 | if (randomize_va_space) |
842 | sp -= get_random_int() % 8192; | 835 | sp -= get_random_int() % 8192; |
843 | return sp & ~0xf; | 836 | return sp & ~0xf; |
844 | } | 837 | } |
kernel/kprobes.c
1 | /* | 1 | /* |
2 | * Kernel Probes (KProbes) | 2 | * Kernel Probes (KProbes) |
3 | * kernel/kprobes.c | 3 | * kernel/kprobes.c |
4 | * | 4 | * |
5 | * This program is free software; you can redistribute it and/or modify | 5 | * This program is free software; you can redistribute it and/or modify |
6 | * it under the terms of the GNU General Public License as published by | 6 | * it under the terms of the GNU General Public License as published by |
7 | * the Free Software Foundation; either version 2 of the License, or | 7 | * the Free Software Foundation; either version 2 of the License, or |
8 | * (at your option) any later version. | 8 | * (at your option) any later version. |
9 | * | 9 | * |
10 | * This program is distributed in the hope that it will be useful, | 10 | * This program is distributed in the hope that it will be useful, |
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | * GNU General Public License for more details. | 13 | * GNU General Public License for more details. |
14 | * | 14 | * |
15 | * You should have received a copy of the GNU General Public License | 15 | * You should have received a copy of the GNU General Public License |
16 | * along with this program; if not, write to the Free Software | 16 | * along with this program; if not, write to the Free Software |
17 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | 17 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
18 | * | 18 | * |
19 | * Copyright (C) IBM Corporation, 2002, 2004 | 19 | * Copyright (C) IBM Corporation, 2002, 2004 |
20 | * | 20 | * |
21 | * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel | 21 | * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel |
22 | * Probes initial implementation (includes suggestions from | 22 | * Probes initial implementation (includes suggestions from |
23 | * Rusty Russell). | 23 | * Rusty Russell). |
24 | * 2004-Aug Updated by Prasanna S Panchamukhi <prasanna@in.ibm.com> with | 24 | * 2004-Aug Updated by Prasanna S Panchamukhi <prasanna@in.ibm.com> with |
25 | * hlists and exceptions notifier as suggested by Andi Kleen. | 25 | * hlists and exceptions notifier as suggested by Andi Kleen. |
26 | * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes | 26 | * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes |
27 | * interface to access function arguments. | 27 | * interface to access function arguments. |
28 | * 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes | 28 | * 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes |
29 | * exceptions notifier to be first on the priority list. | 29 | * exceptions notifier to be first on the priority list. |
30 | * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston | 30 | * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston |
31 | * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi | 31 | * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi |
32 | * <prasanna@in.ibm.com> added function-return probes. | 32 | * <prasanna@in.ibm.com> added function-return probes. |
33 | */ | 33 | */ |
34 | #include <linux/kprobes.h> | 34 | #include <linux/kprobes.h> |
35 | #include <linux/hash.h> | 35 | #include <linux/hash.h> |
36 | #include <linux/init.h> | 36 | #include <linux/init.h> |
37 | #include <linux/slab.h> | 37 | #include <linux/slab.h> |
38 | #include <linux/module.h> | 38 | #include <linux/module.h> |
39 | #include <linux/moduleloader.h> | 39 | #include <linux/moduleloader.h> |
40 | #include <asm-generic/sections.h> | 40 | #include <asm-generic/sections.h> |
41 | #include <asm/cacheflush.h> | 41 | #include <asm/cacheflush.h> |
42 | #include <asm/errno.h> | 42 | #include <asm/errno.h> |
43 | #include <asm/kdebug.h> | 43 | #include <asm/kdebug.h> |
44 | 44 | ||
45 | #define KPROBE_HASH_BITS 6 | 45 | #define KPROBE_HASH_BITS 6 |
46 | #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) | 46 | #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) |
47 | 47 | ||
48 | static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; | 48 | static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; |
49 | static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; | 49 | static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; |
50 | 50 | ||
51 | DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ | 51 | DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ |
52 | DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ | 52 | DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ |
53 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; | 53 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; |
54 | 54 | ||
55 | #ifdef __ARCH_WANT_KPROBES_INSN_SLOT | 55 | #ifdef __ARCH_WANT_KPROBES_INSN_SLOT |
56 | /* | 56 | /* |
57 | * kprobe->ainsn.insn points to the copy of the instruction to be | 57 | * kprobe->ainsn.insn points to the copy of the instruction to be |
58 | * single-stepped. x86_64, POWER4 and above have no-exec support and | 58 | * single-stepped. x86_64, POWER4 and above have no-exec support and |
59 | * stepping on the instruction on a vmalloced/kmalloced/data page | 59 | * stepping on the instruction on a vmalloced/kmalloced/data page |
60 | * is a recipe for disaster | 60 | * is a recipe for disaster |
61 | */ | 61 | */ |
62 | #define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) | 62 | #define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) |
63 | 63 | ||
64 | struct kprobe_insn_page { | 64 | struct kprobe_insn_page { |
65 | struct hlist_node hlist; | 65 | struct hlist_node hlist; |
66 | kprobe_opcode_t *insns; /* Page of instruction slots */ | 66 | kprobe_opcode_t *insns; /* Page of instruction slots */ |
67 | char slot_used[INSNS_PER_PAGE]; | 67 | char slot_used[INSNS_PER_PAGE]; |
68 | int nused; | 68 | int nused; |
69 | }; | 69 | }; |
70 | 70 | ||
71 | static struct hlist_head kprobe_insn_pages; | 71 | static struct hlist_head kprobe_insn_pages; |
72 | 72 | ||
73 | /** | 73 | /** |
74 | * get_insn_slot() - Find a slot on an executable page for an instruction. | 74 | * get_insn_slot() - Find a slot on an executable page for an instruction. |
75 | * We allocate an executable page if there's no room on existing ones. | 75 | * We allocate an executable page if there's no room on existing ones. |
76 | */ | 76 | */ |
77 | kprobe_opcode_t __kprobes *get_insn_slot(void) | 77 | kprobe_opcode_t __kprobes *get_insn_slot(void) |
78 | { | 78 | { |
79 | struct kprobe_insn_page *kip; | 79 | struct kprobe_insn_page *kip; |
80 | struct hlist_node *pos; | 80 | struct hlist_node *pos; |
81 | 81 | ||
82 | hlist_for_each(pos, &kprobe_insn_pages) { | 82 | hlist_for_each(pos, &kprobe_insn_pages) { |
83 | kip = hlist_entry(pos, struct kprobe_insn_page, hlist); | 83 | kip = hlist_entry(pos, struct kprobe_insn_page, hlist); |
84 | if (kip->nused < INSNS_PER_PAGE) { | 84 | if (kip->nused < INSNS_PER_PAGE) { |
85 | int i; | 85 | int i; |
86 | for (i = 0; i < INSNS_PER_PAGE; i++) { | 86 | for (i = 0; i < INSNS_PER_PAGE; i++) { |
87 | if (!kip->slot_used[i]) { | 87 | if (!kip->slot_used[i]) { |
88 | kip->slot_used[i] = 1; | 88 | kip->slot_used[i] = 1; |
89 | kip->nused++; | 89 | kip->nused++; |
90 | return kip->insns + (i * MAX_INSN_SIZE); | 90 | return kip->insns + (i * MAX_INSN_SIZE); |
91 | } | 91 | } |
92 | } | 92 | } |
93 | /* Surprise! No unused slots. Fix kip->nused. */ | 93 | /* Surprise! No unused slots. Fix kip->nused. */ |
94 | kip->nused = INSNS_PER_PAGE; | 94 | kip->nused = INSNS_PER_PAGE; |
95 | } | 95 | } |
96 | } | 96 | } |
97 | 97 | ||
98 | /* All out of space. Need to allocate a new page. Use slot 0.*/ | 98 | /* All out of space. Need to allocate a new page. Use slot 0.*/ |
99 | kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); | 99 | kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); |
100 | if (!kip) { | 100 | if (!kip) { |
101 | return NULL; | 101 | return NULL; |
102 | } | 102 | } |
103 | 103 | ||
104 | /* | 104 | /* |
105 | * Use module_alloc so this page is within +/- 2GB of where the | 105 | * Use module_alloc so this page is within +/- 2GB of where the |
106 | * kernel image and loaded module images reside. This is required | 106 | * kernel image and loaded module images reside. This is required |
107 | * so x86_64 can correctly handle the %rip-relative fixups. | 107 | * so x86_64 can correctly handle the %rip-relative fixups. |
108 | */ | 108 | */ |
109 | kip->insns = module_alloc(PAGE_SIZE); | 109 | kip->insns = module_alloc(PAGE_SIZE); |
110 | if (!kip->insns) { | 110 | if (!kip->insns) { |
111 | kfree(kip); | 111 | kfree(kip); |
112 | return NULL; | 112 | return NULL; |
113 | } | 113 | } |
114 | INIT_HLIST_NODE(&kip->hlist); | 114 | INIT_HLIST_NODE(&kip->hlist); |
115 | hlist_add_head(&kip->hlist, &kprobe_insn_pages); | 115 | hlist_add_head(&kip->hlist, &kprobe_insn_pages); |
116 | memset(kip->slot_used, 0, INSNS_PER_PAGE); | 116 | memset(kip->slot_used, 0, INSNS_PER_PAGE); |
117 | kip->slot_used[0] = 1; | 117 | kip->slot_used[0] = 1; |
118 | kip->nused = 1; | 118 | kip->nused = 1; |
119 | return kip->insns; | 119 | return kip->insns; |
120 | } | 120 | } |
121 | 121 | ||
122 | void __kprobes free_insn_slot(kprobe_opcode_t *slot) | 122 | void __kprobes free_insn_slot(kprobe_opcode_t *slot) |
123 | { | 123 | { |
124 | struct kprobe_insn_page *kip; | 124 | struct kprobe_insn_page *kip; |
125 | struct hlist_node *pos; | 125 | struct hlist_node *pos; |
126 | 126 | ||
127 | hlist_for_each(pos, &kprobe_insn_pages) { | 127 | hlist_for_each(pos, &kprobe_insn_pages) { |
128 | kip = hlist_entry(pos, struct kprobe_insn_page, hlist); | 128 | kip = hlist_entry(pos, struct kprobe_insn_page, hlist); |
129 | if (kip->insns <= slot && | 129 | if (kip->insns <= slot && |
130 | slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { | 130 | slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { |
131 | int i = (slot - kip->insns) / MAX_INSN_SIZE; | 131 | int i = (slot - kip->insns) / MAX_INSN_SIZE; |
132 | kip->slot_used[i] = 0; | 132 | kip->slot_used[i] = 0; |
133 | kip->nused--; | 133 | kip->nused--; |
134 | if (kip->nused == 0) { | 134 | if (kip->nused == 0) { |
135 | /* | 135 | /* |
136 | * Page is no longer in use. Free it unless | 136 | * Page is no longer in use. Free it unless |
137 | * it's the last one. We keep the last one | 137 | * it's the last one. We keep the last one |
138 | * so as not to have to set it up again the | 138 | * so as not to have to set it up again the |
139 | * next time somebody inserts a probe. | 139 | * next time somebody inserts a probe. |
140 | */ | 140 | */ |
141 | hlist_del(&kip->hlist); | 141 | hlist_del(&kip->hlist); |
142 | if (hlist_empty(&kprobe_insn_pages)) { | 142 | if (hlist_empty(&kprobe_insn_pages)) { |
143 | INIT_HLIST_NODE(&kip->hlist); | 143 | INIT_HLIST_NODE(&kip->hlist); |
144 | hlist_add_head(&kip->hlist, | 144 | hlist_add_head(&kip->hlist, |
145 | &kprobe_insn_pages); | 145 | &kprobe_insn_pages); |
146 | } else { | 146 | } else { |
147 | module_free(NULL, kip->insns); | 147 | module_free(NULL, kip->insns); |
148 | kfree(kip); | 148 | kfree(kip); |
149 | } | 149 | } |
150 | } | 150 | } |
151 | return; | 151 | return; |
152 | } | 152 | } |
153 | } | 153 | } |
154 | } | 154 | } |
155 | #endif | 155 | #endif |
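The two helpers above are the whole slot allocator: architectures that must single-step a copied instruction reserve a slot on an executable, module_alloc()ed page and hand it back when the probe goes away. As a rough sketch only of how an architecture port might pair them (the function names below are hypothetical, not the real arch code, which lives under arch/xxx/kernel/kprobes.c):

/* Hypothetical sketch -- not the actual arch code */
static int example_arch_prepare_kprobe(struct kprobe *p)
{
	p->ainsn.insn = get_insn_slot();	/* reserve one slot */
	if (!p->ainsn.insn)
		return -ENOMEM;
	/* copy the original instruction into the slot for single-stepping */
	memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
	return 0;
}

static void example_arch_remove_kprobe(struct kprobe *p)
{
	/* hand the slot back; the page itself is freed once it is empty */
	free_insn_slot(p->ainsn.insn);
}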
156 | 156 | ||
157 | /* We have preemption disabled, so it is safe to use the __ versions */ | 157 | /* We have preemption disabled, so it is safe to use the __ versions */ |
158 | static inline void set_kprobe_instance(struct kprobe *kp) | 158 | static inline void set_kprobe_instance(struct kprobe *kp) |
159 | { | 159 | { |
160 | __get_cpu_var(kprobe_instance) = kp; | 160 | __get_cpu_var(kprobe_instance) = kp; |
161 | } | 161 | } |
162 | 162 | ||
163 | static inline void reset_kprobe_instance(void) | 163 | static inline void reset_kprobe_instance(void) |
164 | { | 164 | { |
165 | __get_cpu_var(kprobe_instance) = NULL; | 165 | __get_cpu_var(kprobe_instance) = NULL; |
166 | } | 166 | } |
167 | 167 | ||
168 | /* | 168 | /* |
169 | * This routine is called either: | 169 | * This routine is called either: |
170 | * - under the kprobe_mutex - during kprobe_[un]register() | 170 | * - under the kprobe_mutex - during kprobe_[un]register() |
171 | * OR | 171 | * OR |
172 | * - with preemption disabled - from arch/xxx/kernel/kprobes.c | 172 | * - with preemption disabled - from arch/xxx/kernel/kprobes.c |
173 | */ | 173 | */ |
174 | struct kprobe __kprobes *get_kprobe(void *addr) | 174 | struct kprobe __kprobes *get_kprobe(void *addr) |
175 | { | 175 | { |
176 | struct hlist_head *head; | 176 | struct hlist_head *head; |
177 | struct hlist_node *node; | 177 | struct hlist_node *node; |
178 | struct kprobe *p; | 178 | struct kprobe *p; |
179 | 179 | ||
180 | head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; | 180 | head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; |
181 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 181 | hlist_for_each_entry_rcu(p, node, head, hlist) { |
182 | if (p->addr == addr) | 182 | if (p->addr == addr) |
183 | return p; | 183 | return p; |
184 | } | 184 | } |
185 | return NULL; | 185 | return NULL; |
186 | } | 186 | } |
187 | 187 | ||
188 | /* | 188 | /* |
189 | * Aggregate handlers for multiple kprobes support - these handlers | 189 | * Aggregate handlers for multiple kprobes support - these handlers |
190 | * take care of invoking the individual kprobe handlers on p->list | 190 | * take care of invoking the individual kprobe handlers on p->list |
191 | */ | 191 | */ |
192 | static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) | 192 | static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) |
193 | { | 193 | { |
194 | struct kprobe *kp; | 194 | struct kprobe *kp; |
195 | 195 | ||
196 | list_for_each_entry_rcu(kp, &p->list, list) { | 196 | list_for_each_entry_rcu(kp, &p->list, list) { |
197 | if (kp->pre_handler) { | 197 | if (kp->pre_handler) { |
198 | set_kprobe_instance(kp); | 198 | set_kprobe_instance(kp); |
199 | if (kp->pre_handler(kp, regs)) | 199 | if (kp->pre_handler(kp, regs)) |
200 | return 1; | 200 | return 1; |
201 | } | 201 | } |
202 | reset_kprobe_instance(); | 202 | reset_kprobe_instance(); |
203 | } | 203 | } |
204 | return 0; | 204 | return 0; |
205 | } | 205 | } |
206 | 206 | ||
207 | static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, | 207 | static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, |
208 | unsigned long flags) | 208 | unsigned long flags) |
209 | { | 209 | { |
210 | struct kprobe *kp; | 210 | struct kprobe *kp; |
211 | 211 | ||
212 | list_for_each_entry_rcu(kp, &p->list, list) { | 212 | list_for_each_entry_rcu(kp, &p->list, list) { |
213 | if (kp->post_handler) { | 213 | if (kp->post_handler) { |
214 | set_kprobe_instance(kp); | 214 | set_kprobe_instance(kp); |
215 | kp->post_handler(kp, regs, flags); | 215 | kp->post_handler(kp, regs, flags); |
216 | reset_kprobe_instance(); | 216 | reset_kprobe_instance(); |
217 | } | 217 | } |
218 | } | 218 | } |
219 | return; | 219 | return; |
220 | } | 220 | } |
221 | 221 | ||
222 | static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, | 222 | static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, |
223 | int trapnr) | 223 | int trapnr) |
224 | { | 224 | { |
225 | struct kprobe *cur = __get_cpu_var(kprobe_instance); | 225 | struct kprobe *cur = __get_cpu_var(kprobe_instance); |
226 | 226 | ||
227 | /* | 227 | /* |
228 | * If we faulted "during" the execution of a user-specified | 228 | * If we faulted "during" the execution of a user-specified |
229 | * probe handler, invoke just that probe's fault handler. | 229 | * probe handler, invoke just that probe's fault handler. |
230 | */ | 230 | */ |
231 | if (cur && cur->fault_handler) { | 231 | if (cur && cur->fault_handler) { |
232 | if (cur->fault_handler(cur, regs, trapnr)) | 232 | if (cur->fault_handler(cur, regs, trapnr)) |
233 | return 1; | 233 | return 1; |
234 | } | 234 | } |
235 | return 0; | 235 | return 0; |
236 | } | 236 | } |
237 | 237 | ||
238 | static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) | 238 | static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) |
239 | { | 239 | { |
240 | struct kprobe *cur = __get_cpu_var(kprobe_instance); | 240 | struct kprobe *cur = __get_cpu_var(kprobe_instance); |
241 | int ret = 0; | 241 | int ret = 0; |
242 | 242 | ||
243 | if (cur && cur->break_handler) { | 243 | if (cur && cur->break_handler) { |
244 | if (cur->break_handler(cur, regs)) | 244 | if (cur->break_handler(cur, regs)) |
245 | ret = 1; | 245 | ret = 1; |
246 | } | 246 | } |
247 | reset_kprobe_instance(); | 247 | reset_kprobe_instance(); |
248 | return ret; | 248 | return ret; |
249 | } | 249 | } |
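These aggr_* handlers only matter once two or more kprobes share the same address. A hypothetical usage sketch (the placeholder probed_addr and the module boilerplate are assumptions, not taken from this file): the second register_kprobe() call below ends up in register_aggr_kprobe() further down, after which aggr_pre_handler() walks p->list and invokes both pre-handlers.

/* Hypothetical example module code */
static kprobe_opcode_t *probed_addr;	/* placeholder: resolved by the user */
static int count_a, count_b;

static int pre_a(struct kprobe *p, struct pt_regs *regs) { count_a++; return 0; }
static int pre_b(struct kprobe *p, struct pt_regs *regs) { count_b++; return 0; }

static struct kprobe kp_a = { .pre_handler = pre_a };
static struct kprobe kp_b = { .pre_handler = pre_b };

static int __init aggr_demo_init(void)
{
	kp_a.addr = kp_b.addr = probed_addr;
	if (register_kprobe(&kp_a))
		return -EINVAL;
	return register_kprobe(&kp_b);	/* second probe -> aggregate path */
}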
250 | 250 | ||
251 | /* Walks the list and increments nmissed count for multiprobe case */ | 251 | /* Walks the list and increments nmissed count for multiprobe case */ |
252 | void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) | 252 | void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) |
253 | { | 253 | { |
254 | struct kprobe *kp; | 254 | struct kprobe *kp; |
255 | if (p->pre_handler != aggr_pre_handler) { | 255 | if (p->pre_handler != aggr_pre_handler) { |
256 | p->nmissed++; | 256 | p->nmissed++; |
257 | } else { | 257 | } else { |
258 | list_for_each_entry_rcu(kp, &p->list, list) | 258 | list_for_each_entry_rcu(kp, &p->list, list) |
259 | kp->nmissed++; | 259 | kp->nmissed++; |
260 | } | 260 | } |
261 | return; | 261 | return; |
262 | } | 262 | } |
263 | 263 | ||
264 | /* Called with kretprobe_lock held */ | 264 | /* Called with kretprobe_lock held */ |
265 | struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp) | 265 | struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp) |
266 | { | 266 | { |
267 | struct hlist_node *node; | 267 | struct hlist_node *node; |
268 | struct kretprobe_instance *ri; | 268 | struct kretprobe_instance *ri; |
269 | hlist_for_each_entry(ri, node, &rp->free_instances, uflist) | 269 | hlist_for_each_entry(ri, node, &rp->free_instances, uflist) |
270 | return ri; | 270 | return ri; |
271 | return NULL; | 271 | return NULL; |
272 | } | 272 | } |
273 | 273 | ||
274 | /* Called with kretprobe_lock held */ | 274 | /* Called with kretprobe_lock held */ |
275 | static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe | 275 | static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe |
276 | *rp) | 276 | *rp) |
277 | { | 277 | { |
278 | struct hlist_node *node; | 278 | struct hlist_node *node; |
279 | struct kretprobe_instance *ri; | 279 | struct kretprobe_instance *ri; |
280 | hlist_for_each_entry(ri, node, &rp->used_instances, uflist) | 280 | hlist_for_each_entry(ri, node, &rp->used_instances, uflist) |
281 | return ri; | 281 | return ri; |
282 | return NULL; | 282 | return NULL; |
283 | } | 283 | } |
284 | 284 | ||
285 | /* Called with kretprobe_lock held */ | 285 | /* Called with kretprobe_lock held */ |
286 | void __kprobes add_rp_inst(struct kretprobe_instance *ri) | 286 | void __kprobes add_rp_inst(struct kretprobe_instance *ri) |
287 | { | 287 | { |
288 | /* | 288 | /* |
289 | * Remove rp inst off the free list - | 289 | * Remove rp inst off the free list - |
290 | * Add it back when probed function returns | 290 | * Add it back when probed function returns |
291 | */ | 291 | */ |
292 | hlist_del(&ri->uflist); | 292 | hlist_del(&ri->uflist); |
293 | 293 | ||
294 | /* Add rp inst onto table */ | 294 | /* Add rp inst onto table */ |
295 | INIT_HLIST_NODE(&ri->hlist); | 295 | INIT_HLIST_NODE(&ri->hlist); |
296 | hlist_add_head(&ri->hlist, | 296 | hlist_add_head(&ri->hlist, |
297 | &kretprobe_inst_table[hash_ptr(ri->task, KPROBE_HASH_BITS)]); | 297 | &kretprobe_inst_table[hash_ptr(ri->task, KPROBE_HASH_BITS)]); |
298 | 298 | ||
299 | /* Also add this rp inst to the used list. */ | 299 | /* Also add this rp inst to the used list. */ |
300 | INIT_HLIST_NODE(&ri->uflist); | 300 | INIT_HLIST_NODE(&ri->uflist); |
301 | hlist_add_head(&ri->uflist, &ri->rp->used_instances); | 301 | hlist_add_head(&ri->uflist, &ri->rp->used_instances); |
302 | } | 302 | } |
303 | 303 | ||
304 | /* Called with kretprobe_lock held */ | 304 | /* Called with kretprobe_lock held */ |
305 | void __kprobes recycle_rp_inst(struct kretprobe_instance *ri) | 305 | void __kprobes recycle_rp_inst(struct kretprobe_instance *ri) |
306 | { | 306 | { |
307 | /* remove rp inst off the kretprobe_inst_table */ | 307 | /* remove rp inst off the kretprobe_inst_table */ |
308 | hlist_del(&ri->hlist); | 308 | hlist_del(&ri->hlist); |
309 | if (ri->rp) { | 309 | if (ri->rp) { |
310 | /* remove rp inst off the used list */ | 310 | /* remove rp inst off the used list */ |
311 | hlist_del(&ri->uflist); | 311 | hlist_del(&ri->uflist); |
312 | /* put rp inst back onto the free list */ | 312 | /* put rp inst back onto the free list */ |
313 | INIT_HLIST_NODE(&ri->uflist); | 313 | INIT_HLIST_NODE(&ri->uflist); |
314 | hlist_add_head(&ri->uflist, &ri->rp->free_instances); | 314 | hlist_add_head(&ri->uflist, &ri->rp->free_instances); |
315 | } else | 315 | } else |
316 | /* Unregistering */ | 316 | /* Unregistering */ |
317 | kfree(ri); | 317 | kfree(ri); |
318 | } | 318 | } |
319 | 319 | ||
320 | struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) | 320 | struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) |
321 | { | 321 | { |
322 | return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; | 322 | return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; |
323 | } | 323 | } |
324 | 324 | ||
325 | /* | 325 | /* |
326 | * This function is called from exit_thread or flush_thread when task tk's | 326 | * This function is called from finish_task_switch when task tk becomes dead, |
327 | * stack is being recycled so that we can recycle any function-return probe | 327 | * so that we can recycle any function-return probe instances associated |
328 | * instances associated with this task. These left over instances represent | 328 | * with this task. These left over instances represent probed functions |
329 | * probed functions that have been called but will never return. | 329 | * that have been called but will never return. |
330 | */ | 330 | */ |
331 | void __kprobes kprobe_flush_task(struct task_struct *tk) | 331 | void __kprobes kprobe_flush_task(struct task_struct *tk) |
332 | { | 332 | { |
333 | struct kretprobe_instance *ri; | 333 | struct kretprobe_instance *ri; |
334 | struct hlist_head *head; | 334 | struct hlist_head *head; |
335 | struct hlist_node *node, *tmp; | 335 | struct hlist_node *node, *tmp; |
336 | unsigned long flags = 0; | 336 | unsigned long flags = 0; |
337 | 337 | ||
338 | spin_lock_irqsave(&kretprobe_lock, flags); | 338 | spin_lock_irqsave(&kretprobe_lock, flags); |
339 | head = kretprobe_inst_table_head(current); | 339 | head = kretprobe_inst_table_head(tk); |
340 | hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { | 340 | hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { |
341 | if (ri->task == tk) | 341 | if (ri->task == tk) |
342 | recycle_rp_inst(ri); | 342 | recycle_rp_inst(ri); |
343 | } | 343 | } |
344 | spin_unlock_irqrestore(&kretprobe_lock, flags); | 344 | spin_unlock_irqrestore(&kretprobe_lock, flags); |
345 | } | 345 | } |
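This is the heart of the patch: per the updated comment, kprobe_flush_task() is no longer run by the exiting task from exit_thread/flush_thread but from finish_task_switch() once the previous task is known to be dead, so a task that exits inside schedule() and never returns still has its kretprobe instances recycled on its behalf. A simplified, hypothetical stand-in for that call site (prev_task_flags, PF_DEAD and put_task_struct() are assumptions about the surrounding scheduler code, which is not part of this hunk):

/* Hypothetical, simplified stand-in for the tail of finish_task_switch();
 * prev_task_flags is assumed to have been sampled before the switch. */
static void example_finish_task_switch_tail(task_t *prev,
					    unsigned long prev_task_flags)
{
	if (unlikely(prev_task_flags & PF_DEAD)) {
		/* prev exited inside schedule(); recycle its return-probe
		 * instances here, since prev itself never will */
		kprobe_flush_task(prev);
		put_task_struct(prev);
	}
}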
346 | 346 | ||
347 | static inline void free_rp_inst(struct kretprobe *rp) | 347 | static inline void free_rp_inst(struct kretprobe *rp) |
348 | { | 348 | { |
349 | struct kretprobe_instance *ri; | 349 | struct kretprobe_instance *ri; |
350 | while ((ri = get_free_rp_inst(rp)) != NULL) { | 350 | while ((ri = get_free_rp_inst(rp)) != NULL) { |
351 | hlist_del(&ri->uflist); | 351 | hlist_del(&ri->uflist); |
352 | kfree(ri); | 352 | kfree(ri); |
353 | } | 353 | } |
354 | } | 354 | } |
355 | 355 | ||
356 | /* | 356 | /* |
357 | * Keep all fields in the kprobe consistent | 357 | * Keep all fields in the kprobe consistent |
358 | */ | 358 | */ |
359 | static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) | 359 | static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) |
360 | { | 360 | { |
361 | memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); | 361 | memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); |
362 | memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); | 362 | memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); |
363 | } | 363 | } |
364 | 364 | ||
365 | /* | 365 | /* |
366 | * Add the new probe to old_p->list. Fail if this is the | 366 | * Add the new probe to old_p->list. Fail if this is the |
367 | * second jprobe at the address - two jprobes can't coexist | 367 | * second jprobe at the address - two jprobes can't coexist |
368 | */ | 368 | */ |
369 | static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) | 369 | static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) |
370 | { | 370 | { |
371 | struct kprobe *kp; | 371 | struct kprobe *kp; |
372 | 372 | ||
373 | if (p->break_handler) { | 373 | if (p->break_handler) { |
374 | list_for_each_entry_rcu(kp, &old_p->list, list) { | 374 | list_for_each_entry_rcu(kp, &old_p->list, list) { |
375 | if (kp->break_handler) | 375 | if (kp->break_handler) |
376 | return -EEXIST; | 376 | return -EEXIST; |
377 | } | 377 | } |
378 | list_add_tail_rcu(&p->list, &old_p->list); | 378 | list_add_tail_rcu(&p->list, &old_p->list); |
379 | } else | 379 | } else |
380 | list_add_rcu(&p->list, &old_p->list); | 380 | list_add_rcu(&p->list, &old_p->list); |
381 | return 0; | 381 | return 0; |
382 | } | 382 | } |
383 | 383 | ||
384 | /* | 384 | /* |
385 | * Fill in the required fields of the "manager kprobe". Replace the | 385 | * Fill in the required fields of the "manager kprobe". Replace the |
386 | * earlier kprobe in the hlist with the manager kprobe | 386 | * earlier kprobe in the hlist with the manager kprobe |
387 | */ | 387 | */ |
388 | static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) | 388 | static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) |
389 | { | 389 | { |
390 | copy_kprobe(p, ap); | 390 | copy_kprobe(p, ap); |
391 | ap->addr = p->addr; | 391 | ap->addr = p->addr; |
392 | ap->pre_handler = aggr_pre_handler; | 392 | ap->pre_handler = aggr_pre_handler; |
393 | ap->post_handler = aggr_post_handler; | 393 | ap->post_handler = aggr_post_handler; |
394 | ap->fault_handler = aggr_fault_handler; | 394 | ap->fault_handler = aggr_fault_handler; |
395 | ap->break_handler = aggr_break_handler; | 395 | ap->break_handler = aggr_break_handler; |
396 | 396 | ||
397 | INIT_LIST_HEAD(&ap->list); | 397 | INIT_LIST_HEAD(&ap->list); |
398 | list_add_rcu(&p->list, &ap->list); | 398 | list_add_rcu(&p->list, &ap->list); |
399 | 399 | ||
400 | hlist_replace_rcu(&p->hlist, &ap->hlist); | 400 | hlist_replace_rcu(&p->hlist, &ap->hlist); |
401 | } | 401 | } |
402 | 402 | ||
403 | /* | 403 | /* |
404 | * This is the second or subsequent kprobe at the address - handle | 404 | * This is the second or subsequent kprobe at the address - handle |
405 | * the intricacies | 405 | * the intricacies |
406 | */ | 406 | */ |
407 | static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | 407 | static int __kprobes register_aggr_kprobe(struct kprobe *old_p, |
408 | struct kprobe *p) | 408 | struct kprobe *p) |
409 | { | 409 | { |
410 | int ret = 0; | 410 | int ret = 0; |
411 | struct kprobe *ap; | 411 | struct kprobe *ap; |
412 | 412 | ||
413 | if (old_p->pre_handler == aggr_pre_handler) { | 413 | if (old_p->pre_handler == aggr_pre_handler) { |
414 | copy_kprobe(old_p, p); | 414 | copy_kprobe(old_p, p); |
415 | ret = add_new_kprobe(old_p, p); | 415 | ret = add_new_kprobe(old_p, p); |
416 | } else { | 416 | } else { |
417 | ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); | 417 | ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); |
418 | if (!ap) | 418 | if (!ap) |
419 | return -ENOMEM; | 419 | return -ENOMEM; |
420 | add_aggr_kprobe(ap, old_p); | 420 | add_aggr_kprobe(ap, old_p); |
421 | copy_kprobe(ap, p); | 421 | copy_kprobe(ap, p); |
422 | ret = add_new_kprobe(ap, p); | 422 | ret = add_new_kprobe(ap, p); |
423 | } | 423 | } |
424 | return ret; | 424 | return ret; |
425 | } | 425 | } |
426 | 426 | ||
427 | static int __kprobes in_kprobes_functions(unsigned long addr) | 427 | static int __kprobes in_kprobes_functions(unsigned long addr) |
428 | { | 428 | { |
429 | if (addr >= (unsigned long)__kprobes_text_start | 429 | if (addr >= (unsigned long)__kprobes_text_start |
430 | && addr < (unsigned long)__kprobes_text_end) | 430 | && addr < (unsigned long)__kprobes_text_end) |
431 | return -EINVAL; | 431 | return -EINVAL; |
432 | return 0; | 432 | return 0; |
433 | } | 433 | } |
434 | 434 | ||
435 | static int __kprobes __register_kprobe(struct kprobe *p, | 435 | static int __kprobes __register_kprobe(struct kprobe *p, |
436 | unsigned long called_from) | 436 | unsigned long called_from) |
437 | { | 437 | { |
438 | int ret = 0; | 438 | int ret = 0; |
439 | struct kprobe *old_p; | 439 | struct kprobe *old_p; |
440 | struct module *probed_mod; | 440 | struct module *probed_mod; |
441 | 441 | ||
442 | if ((!kernel_text_address((unsigned long) p->addr)) || | 442 | if ((!kernel_text_address((unsigned long) p->addr)) || |
443 | in_kprobes_functions((unsigned long) p->addr)) | 443 | in_kprobes_functions((unsigned long) p->addr)) |
444 | return -EINVAL; | 444 | return -EINVAL; |
445 | 445 | ||
446 | p->mod_refcounted = 0; | 446 | p->mod_refcounted = 0; |
447 | /* Check whether we are probing a module */ | 447 | /* Check whether we are probing a module */ |
448 | if ((probed_mod = module_text_address((unsigned long) p->addr))) { | 448 | if ((probed_mod = module_text_address((unsigned long) p->addr))) { |
449 | struct module *calling_mod = module_text_address(called_from); | 449 | struct module *calling_mod = module_text_address(called_from); |
450 | /* We must allow modules to probe themselves and | 450 | /* We must allow modules to probe themselves and |
451 | * in this case avoid incrementing the module refcount, | 451 | * in this case avoid incrementing the module refcount, |
452 | * so as to allow unloading of self-probing modules. | 452 | * so as to allow unloading of self-probing modules. |
453 | */ | 453 | */ |
454 | if (calling_mod && (calling_mod != probed_mod)) { | 454 | if (calling_mod && (calling_mod != probed_mod)) { |
455 | if (unlikely(!try_module_get(probed_mod))) | 455 | if (unlikely(!try_module_get(probed_mod))) |
456 | return -EINVAL; | 456 | return -EINVAL; |
457 | p->mod_refcounted = 1; | 457 | p->mod_refcounted = 1; |
458 | } else | 458 | } else |
459 | probed_mod = NULL; | 459 | probed_mod = NULL; |
460 | } | 460 | } |
461 | 461 | ||
462 | p->nmissed = 0; | 462 | p->nmissed = 0; |
463 | mutex_lock(&kprobe_mutex); | 463 | mutex_lock(&kprobe_mutex); |
464 | old_p = get_kprobe(p->addr); | 464 | old_p = get_kprobe(p->addr); |
465 | if (old_p) { | 465 | if (old_p) { |
466 | ret = register_aggr_kprobe(old_p, p); | 466 | ret = register_aggr_kprobe(old_p, p); |
467 | goto out; | 467 | goto out; |
468 | } | 468 | } |
469 | 469 | ||
470 | if ((ret = arch_prepare_kprobe(p)) != 0) | 470 | if ((ret = arch_prepare_kprobe(p)) != 0) |
471 | goto out; | 471 | goto out; |
472 | 472 | ||
473 | INIT_HLIST_NODE(&p->hlist); | 473 | INIT_HLIST_NODE(&p->hlist); |
474 | hlist_add_head_rcu(&p->hlist, | 474 | hlist_add_head_rcu(&p->hlist, |
475 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); | 475 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); |
476 | 476 | ||
477 | arch_arm_kprobe(p); | 477 | arch_arm_kprobe(p); |
478 | 478 | ||
479 | out: | 479 | out: |
480 | mutex_unlock(&kprobe_mutex); | 480 | mutex_unlock(&kprobe_mutex); |
481 | 481 | ||
482 | if (ret && probed_mod) | 482 | if (ret && probed_mod) |
483 | module_put(probed_mod); | 483 | module_put(probed_mod); |
484 | return ret; | 484 | return ret; |
485 | } | 485 | } |
486 | 486 | ||
487 | int __kprobes register_kprobe(struct kprobe *p) | 487 | int __kprobes register_kprobe(struct kprobe *p) |
488 | { | 488 | { |
489 | return __register_kprobe(p, | 489 | return __register_kprobe(p, |
490 | (unsigned long)__builtin_return_address(0)); | 490 | (unsigned long)__builtin_return_address(0)); |
491 | } | 491 | } |
492 | 492 | ||
493 | void __kprobes unregister_kprobe(struct kprobe *p) | 493 | void __kprobes unregister_kprobe(struct kprobe *p) |
494 | { | 494 | { |
495 | struct module *mod; | 495 | struct module *mod; |
496 | struct kprobe *old_p, *list_p; | 496 | struct kprobe *old_p, *list_p; |
497 | int cleanup_p; | 497 | int cleanup_p; |
498 | 498 | ||
499 | mutex_lock(&kprobe_mutex); | 499 | mutex_lock(&kprobe_mutex); |
500 | old_p = get_kprobe(p->addr); | 500 | old_p = get_kprobe(p->addr); |
501 | if (unlikely(!old_p)) { | 501 | if (unlikely(!old_p)) { |
502 | mutex_unlock(&kprobe_mutex); | 502 | mutex_unlock(&kprobe_mutex); |
503 | return; | 503 | return; |
504 | } | 504 | } |
505 | if (p != old_p) { | 505 | if (p != old_p) { |
506 | list_for_each_entry_rcu(list_p, &old_p->list, list) | 506 | list_for_each_entry_rcu(list_p, &old_p->list, list) |
507 | if (list_p == p) | 507 | if (list_p == p) |
508 | /* kprobe p is a valid probe */ | 508 | /* kprobe p is a valid probe */ |
509 | goto valid_p; | 509 | goto valid_p; |
510 | mutex_unlock(&kprobe_mutex); | 510 | mutex_unlock(&kprobe_mutex); |
511 | return; | 511 | return; |
512 | } | 512 | } |
513 | valid_p: | 513 | valid_p: |
514 | if ((old_p == p) || ((old_p->pre_handler == aggr_pre_handler) && | 514 | if ((old_p == p) || ((old_p->pre_handler == aggr_pre_handler) && |
515 | (p->list.next == &old_p->list) && | 515 | (p->list.next == &old_p->list) && |
516 | (p->list.prev == &old_p->list))) { | 516 | (p->list.prev == &old_p->list))) { |
517 | /* Only probe on the hash list */ | 517 | /* Only probe on the hash list */ |
518 | arch_disarm_kprobe(p); | 518 | arch_disarm_kprobe(p); |
519 | hlist_del_rcu(&old_p->hlist); | 519 | hlist_del_rcu(&old_p->hlist); |
520 | cleanup_p = 1; | 520 | cleanup_p = 1; |
521 | } else { | 521 | } else { |
522 | list_del_rcu(&p->list); | 522 | list_del_rcu(&p->list); |
523 | cleanup_p = 0; | 523 | cleanup_p = 0; |
524 | } | 524 | } |
525 | 525 | ||
526 | mutex_unlock(&kprobe_mutex); | 526 | mutex_unlock(&kprobe_mutex); |
527 | 527 | ||
528 | synchronize_sched(); | 528 | synchronize_sched(); |
529 | if (p->mod_refcounted && | 529 | if (p->mod_refcounted && |
530 | (mod = module_text_address((unsigned long)p->addr))) | 530 | (mod = module_text_address((unsigned long)p->addr))) |
531 | module_put(mod); | 531 | module_put(mod); |
532 | 532 | ||
533 | if (cleanup_p) { | 533 | if (cleanup_p) { |
534 | if (p != old_p) { | 534 | if (p != old_p) { |
535 | list_del_rcu(&p->list); | 535 | list_del_rcu(&p->list); |
536 | kfree(old_p); | 536 | kfree(old_p); |
537 | } | 537 | } |
538 | arch_remove_kprobe(p); | 538 | arch_remove_kprobe(p); |
539 | } | 539 | } |
540 | } | 540 | } |
541 | 541 | ||
542 | static struct notifier_block kprobe_exceptions_nb = { | 542 | static struct notifier_block kprobe_exceptions_nb = { |
543 | .notifier_call = kprobe_exceptions_notify, | 543 | .notifier_call = kprobe_exceptions_notify, |
544 | .priority = 0x7fffffff /* we need to be notified first */ | 544 | .priority = 0x7fffffff /* we need to be notified first */ |
545 | }; | 545 | }; |
546 | 546 | ||
547 | int __kprobes register_jprobe(struct jprobe *jp) | 547 | int __kprobes register_jprobe(struct jprobe *jp) |
548 | { | 548 | { |
549 | /* Todo: Verify probepoint is a function entry point */ | 549 | /* Todo: Verify probepoint is a function entry point */ |
550 | jp->kp.pre_handler = setjmp_pre_handler; | 550 | jp->kp.pre_handler = setjmp_pre_handler; |
551 | jp->kp.break_handler = longjmp_break_handler; | 551 | jp->kp.break_handler = longjmp_break_handler; |
552 | 552 | ||
553 | return __register_kprobe(&jp->kp, | 553 | return __register_kprobe(&jp->kp, |
554 | (unsigned long)__builtin_return_address(0)); | 554 | (unsigned long)__builtin_return_address(0)); |
555 | } | 555 | } |
556 | 556 | ||
557 | void __kprobes unregister_jprobe(struct jprobe *jp) | 557 | void __kprobes unregister_jprobe(struct jprobe *jp) |
558 | { | 558 | { |
559 | unregister_kprobe(&jp->kp); | 559 | unregister_kprobe(&jp->kp); |
560 | } | 560 | } |
561 | 561 | ||
562 | #ifdef ARCH_SUPPORTS_KRETPROBES | 562 | #ifdef ARCH_SUPPORTS_KRETPROBES |
563 | 563 | ||
564 | /* | 564 | /* |
565 | * This kprobe pre_handler is registered with every kretprobe. When the probe | 565 | * This kprobe pre_handler is registered with every kretprobe. When the probe |
566 | * hits, it sets up the return probe. | 566 | * hits, it sets up the return probe. |
567 | */ | 567 | */ |
568 | static int __kprobes pre_handler_kretprobe(struct kprobe *p, | 568 | static int __kprobes pre_handler_kretprobe(struct kprobe *p, |
569 | struct pt_regs *regs) | 569 | struct pt_regs *regs) |
570 | { | 570 | { |
571 | struct kretprobe *rp = container_of(p, struct kretprobe, kp); | 571 | struct kretprobe *rp = container_of(p, struct kretprobe, kp); |
572 | unsigned long flags = 0; | 572 | unsigned long flags = 0; |
573 | 573 | ||
574 | /* TODO: consider swapping the RA only after the last pre_handler has fired */ | 574 | /* TODO: consider swapping the RA only after the last pre_handler has fired */ |
575 | spin_lock_irqsave(&kretprobe_lock, flags); | 575 | spin_lock_irqsave(&kretprobe_lock, flags); |
576 | arch_prepare_kretprobe(rp, regs); | 576 | arch_prepare_kretprobe(rp, regs); |
577 | spin_unlock_irqrestore(&kretprobe_lock, flags); | 577 | spin_unlock_irqrestore(&kretprobe_lock, flags); |
578 | return 0; | 578 | return 0; |
579 | } | 579 | } |
580 | 580 | ||
581 | int __kprobes register_kretprobe(struct kretprobe *rp) | 581 | int __kprobes register_kretprobe(struct kretprobe *rp) |
582 | { | 582 | { |
583 | int ret = 0; | 583 | int ret = 0; |
584 | struct kretprobe_instance *inst; | 584 | struct kretprobe_instance *inst; |
585 | int i; | 585 | int i; |
586 | 586 | ||
587 | rp->kp.pre_handler = pre_handler_kretprobe; | 587 | rp->kp.pre_handler = pre_handler_kretprobe; |
588 | 588 | ||
589 | /* Pre-allocate memory for max kretprobe instances */ | 589 | /* Pre-allocate memory for max kretprobe instances */ |
590 | if (rp->maxactive <= 0) { | 590 | if (rp->maxactive <= 0) { |
591 | #ifdef CONFIG_PREEMPT | 591 | #ifdef CONFIG_PREEMPT |
592 | rp->maxactive = max(10, 2 * NR_CPUS); | 592 | rp->maxactive = max(10, 2 * NR_CPUS); |
593 | #else | 593 | #else |
594 | rp->maxactive = NR_CPUS; | 594 | rp->maxactive = NR_CPUS; |
595 | #endif | 595 | #endif |
596 | } | 596 | } |
597 | INIT_HLIST_HEAD(&rp->used_instances); | 597 | INIT_HLIST_HEAD(&rp->used_instances); |
598 | INIT_HLIST_HEAD(&rp->free_instances); | 598 | INIT_HLIST_HEAD(&rp->free_instances); |
599 | for (i = 0; i < rp->maxactive; i++) { | 599 | for (i = 0; i < rp->maxactive; i++) { |
600 | inst = kmalloc(sizeof(struct kretprobe_instance), GFP_KERNEL); | 600 | inst = kmalloc(sizeof(struct kretprobe_instance), GFP_KERNEL); |
601 | if (inst == NULL) { | 601 | if (inst == NULL) { |
602 | free_rp_inst(rp); | 602 | free_rp_inst(rp); |
603 | return -ENOMEM; | 603 | return -ENOMEM; |
604 | } | 604 | } |
605 | INIT_HLIST_NODE(&inst->uflist); | 605 | INIT_HLIST_NODE(&inst->uflist); |
606 | hlist_add_head(&inst->uflist, &rp->free_instances); | 606 | hlist_add_head(&inst->uflist, &rp->free_instances); |
607 | } | 607 | } |
608 | 608 | ||
609 | rp->nmissed = 0; | 609 | rp->nmissed = 0; |
610 | /* Establish function entry probe point */ | 610 | /* Establish function entry probe point */ |
611 | if ((ret = __register_kprobe(&rp->kp, | 611 | if ((ret = __register_kprobe(&rp->kp, |
612 | (unsigned long)__builtin_return_address(0))) != 0) | 612 | (unsigned long)__builtin_return_address(0))) != 0) |
613 | free_rp_inst(rp); | 613 | free_rp_inst(rp); |
614 | return ret; | 614 | return ret; |
615 | } | 615 | } |
616 | 616 | ||
617 | #else /* ARCH_SUPPORTS_KRETPROBES */ | 617 | #else /* ARCH_SUPPORTS_KRETPROBES */ |
618 | 618 | ||
619 | int __kprobes register_kretprobe(struct kretprobe *rp) | 619 | int __kprobes register_kretprobe(struct kretprobe *rp) |
620 | { | 620 | { |
621 | return -ENOSYS; | 621 | return -ENOSYS; |
622 | } | 622 | } |
623 | 623 | ||
624 | #endif /* ARCH_SUPPORTS_KRETPROBES */ | 624 | #endif /* ARCH_SUPPORTS_KRETPROBES */ |
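For context, a minimal kretprobe user looks roughly like the sketch below; the .handler field, the handler signature and the placeholder probed_addr reflect the kretprobe API of this era as best understood rather than anything shown in this hunk, so treat it as an assumption-laden illustration (module boilerplate omitted).

/* Hypothetical example: run a handler on every return from a probed function */
static kprobe_opcode_t *probed_addr;	/* placeholder: resolved by the user */

static int demo_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	/* ri->task identifies the task that called the probed function */
	return 0;
}

static struct kretprobe demo_rp = {
	.handler   = demo_ret_handler,
	.maxactive = 20,		/* pre-allocated instances, see above */
};

static int __init demo_init(void)
{
	demo_rp.kp.addr = probed_addr;
	return register_kretprobe(&demo_rp);
}

static void __exit demo_exit(void)
{
	unregister_kretprobe(&demo_rp);
}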
625 | 625 | ||
626 | void __kprobes unregister_kretprobe(struct kretprobe *rp) | 626 | void __kprobes unregister_kretprobe(struct kretprobe *rp) |
627 | { | 627 | { |
628 | unsigned long flags; | 628 | unsigned long flags; |
629 | struct kretprobe_instance *ri; | 629 | struct kretprobe_instance *ri; |
630 | 630 | ||
631 | unregister_kprobe(&rp->kp); | 631 | unregister_kprobe(&rp->kp); |
632 | /* No race here */ | 632 | /* No race here */ |
633 | spin_lock_irqsave(&kretprobe_lock, flags); | 633 | spin_lock_irqsave(&kretprobe_lock, flags); |
634 | while ((ri = get_used_rp_inst(rp)) != NULL) { | 634 | while ((ri = get_used_rp_inst(rp)) != NULL) { |
635 | ri->rp = NULL; | 635 | ri->rp = NULL; |
636 | hlist_del(&ri->uflist); | 636 | hlist_del(&ri->uflist); |
637 | } | 637 | } |
638 | spin_unlock_irqrestore(&kretprobe_lock, flags); | 638 | spin_unlock_irqrestore(&kretprobe_lock, flags); |
639 | free_rp_inst(rp); | 639 | free_rp_inst(rp); |
640 | } | 640 | } |
641 | 641 | ||
642 | static int __init init_kprobes(void) | 642 | static int __init init_kprobes(void) |
643 | { | 643 | { |
644 | int i, err = 0; | 644 | int i, err = 0; |
645 | 645 | ||
646 | /* FIXME allocate the probe table, currently defined statically */ | 646 | /* FIXME allocate the probe table, currently defined statically */ |
647 | /* initialize all list heads */ | 647 | /* initialize all list heads */ |
648 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 648 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
649 | INIT_HLIST_HEAD(&kprobe_table[i]); | 649 | INIT_HLIST_HEAD(&kprobe_table[i]); |
650 | INIT_HLIST_HEAD(&kretprobe_inst_table[i]); | 650 | INIT_HLIST_HEAD(&kretprobe_inst_table[i]); |
651 | } | 651 | } |
652 | 652 | ||
653 | err = arch_init_kprobes(); | 653 | err = arch_init_kprobes(); |
654 | if (!err) | 654 | if (!err) |
655 | err = register_die_notifier(&kprobe_exceptions_nb); | 655 | err = register_die_notifier(&kprobe_exceptions_nb); |
656 | 656 | ||
657 | return err; | 657 | return err; |
658 | } | 658 | } |
659 | 659 | ||
660 | __initcall(init_kprobes); | 660 | __initcall(init_kprobes); |
661 | 661 | ||
662 | EXPORT_SYMBOL_GPL(register_kprobe); | 662 | EXPORT_SYMBOL_GPL(register_kprobe); |
663 | EXPORT_SYMBOL_GPL(unregister_kprobe); | 663 | EXPORT_SYMBOL_GPL(unregister_kprobe); |
664 | EXPORT_SYMBOL_GPL(register_jprobe); | 664 | EXPORT_SYMBOL_GPL(register_jprobe); |
665 | EXPORT_SYMBOL_GPL(unregister_jprobe); | 665 | EXPORT_SYMBOL_GPL(unregister_jprobe); |
666 | EXPORT_SYMBOL_GPL(jprobe_return); | 666 | EXPORT_SYMBOL_GPL(jprobe_return); |
667 | EXPORT_SYMBOL_GPL(register_kretprobe); | 667 | EXPORT_SYMBOL_GPL(register_kretprobe); |
668 | EXPORT_SYMBOL_GPL(unregister_kretprobe); | 668 | EXPORT_SYMBOL_GPL(unregister_kretprobe); |
669 | 669 | ||
670 | 670 |
kernel/sched.c
1 | /* | 1 | /* |
2 | * kernel/sched.c | 2 | * kernel/sched.c |
3 | * | 3 | * |
4 | * Kernel scheduler and related syscalls | 4 | * Kernel scheduler and related syscalls |
5 | * | 5 | * |
6 | * Copyright (C) 1991-2002 Linus Torvalds | 6 | * Copyright (C) 1991-2002 Linus Torvalds |
7 | * | 7 | * |
8 | * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and | 8 | * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and |
9 | * make semaphores SMP safe | 9 | * make semaphores SMP safe |
10 | * 1998-11-19 Implemented schedule_timeout() and related stuff | 10 | * 1998-11-19 Implemented schedule_timeout() and related stuff |
11 | * by Andrea Arcangeli | 11 | * by Andrea Arcangeli |
12 | * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: | 12 | * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: |
13 | * hybrid priority-list and round-robin design with | 13 | * hybrid priority-list and round-robin design with |
14 | * an array-switch method of distributing timeslices | 14 | * an array-switch method of distributing timeslices |
15 | * and per-CPU runqueues. Cleanups and useful suggestions | 15 | * and per-CPU runqueues. Cleanups and useful suggestions |
16 | * by Davide Libenzi, preemptible kernel bits by Robert Love. | 16 | * by Davide Libenzi, preemptible kernel bits by Robert Love. |
17 | * 2003-09-03 Interactivity tuning by Con Kolivas. | 17 | * 2003-09-03 Interactivity tuning by Con Kolivas. |
18 | * 2004-04-02 Scheduler domains code by Nick Piggin | 18 | * 2004-04-02 Scheduler domains code by Nick Piggin |
19 | */ | 19 | */ |
20 | 20 | ||
21 | #include <linux/mm.h> | 21 | #include <linux/mm.h> |
22 | #include <linux/module.h> | 22 | #include <linux/module.h> |
23 | #include <linux/nmi.h> | 23 | #include <linux/nmi.h> |
24 | #include <linux/init.h> | 24 | #include <linux/init.h> |
25 | #include <asm/uaccess.h> | 25 | #include <asm/uaccess.h> |
26 | #include <linux/highmem.h> | 26 | #include <linux/highmem.h> |
27 | #include <linux/smp_lock.h> | 27 | #include <linux/smp_lock.h> |
28 | #include <asm/mmu_context.h> | 28 | #include <asm/mmu_context.h> |
29 | #include <linux/interrupt.h> | 29 | #include <linux/interrupt.h> |
30 | #include <linux/capability.h> | 30 | #include <linux/capability.h> |
31 | #include <linux/completion.h> | 31 | #include <linux/completion.h> |
32 | #include <linux/kernel_stat.h> | 32 | #include <linux/kernel_stat.h> |
33 | #include <linux/security.h> | 33 | #include <linux/security.h> |
34 | #include <linux/notifier.h> | 34 | #include <linux/notifier.h> |
35 | #include <linux/profile.h> | 35 | #include <linux/profile.h> |
36 | #include <linux/suspend.h> | 36 | #include <linux/suspend.h> |
37 | #include <linux/vmalloc.h> | 37 | #include <linux/vmalloc.h> |
38 | #include <linux/blkdev.h> | 38 | #include <linux/blkdev.h> |
39 | #include <linux/delay.h> | 39 | #include <linux/delay.h> |
40 | #include <linux/smp.h> | 40 | #include <linux/smp.h> |
41 | #include <linux/threads.h> | 41 | #include <linux/threads.h> |
42 | #include <linux/timer.h> | 42 | #include <linux/timer.h> |
43 | #include <linux/rcupdate.h> | 43 | #include <linux/rcupdate.h> |
44 | #include <linux/cpu.h> | 44 | #include <linux/cpu.h> |
45 | #include <linux/cpuset.h> | 45 | #include <linux/cpuset.h> |
46 | #include <linux/percpu.h> | 46 | #include <linux/percpu.h> |
47 | #include <linux/kthread.h> | 47 | #include <linux/kthread.h> |
48 | #include <linux/seq_file.h> | 48 | #include <linux/seq_file.h> |
49 | #include <linux/syscalls.h> | 49 | #include <linux/syscalls.h> |
50 | #include <linux/times.h> | 50 | #include <linux/times.h> |
51 | #include <linux/acct.h> | 51 | #include <linux/acct.h> |
52 | #include <linux/kprobes.h> | ||
52 | #include <asm/tlb.h> | 53 | #include <asm/tlb.h> |
53 | 54 | ||
54 | #include <asm/unistd.h> | 55 | #include <asm/unistd.h> |
55 | 56 | ||
56 | /* | 57 | /* |
57 | * Convert user-nice values [ -20 ... 0 ... 19 ] | 58 | * Convert user-nice values [ -20 ... 0 ... 19 ] |
58 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], | 59 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], |
59 | * and back. | 60 | * and back. |
60 | */ | 61 | */ |
61 | #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) | 62 | #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) |
62 | #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) | 63 | #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) |
63 | #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) | 64 | #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) |
64 | 65 | ||
65 | /* | 66 | /* |
66 | * 'User priority' is the nice value converted to something we | 67 | * 'User priority' is the nice value converted to something we |
67 | * can work with better when scaling various scheduler parameters, | 68 | * can work with better when scaling various scheduler parameters, |
68 | * it's a [ 0 ... 39 ] range. | 69 | * it's a [ 0 ... 39 ] range. |
69 | */ | 70 | */ |
70 | #define USER_PRIO(p) ((p)-MAX_RT_PRIO) | 71 | #define USER_PRIO(p) ((p)-MAX_RT_PRIO) |
71 | #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) | 72 | #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) |
72 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) | 73 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) |
73 | 74 | ||
74 | /* | 75 | /* |
75 | * Some helpers for converting nanosecond timing to jiffy resolution | 76 | * Some helpers for converting nanosecond timing to jiffy resolution |
76 | */ | 77 | */ |
77 | #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) | 78 | #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) |
78 | #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) | 79 | #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) |
79 | 80 | ||
80 | /* | 81 | /* |
81 | * These are the 'tuning knobs' of the scheduler: | 82 | * These are the 'tuning knobs' of the scheduler: |
82 | * | 83 | * |
83 | * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), | 84 | * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), |
84 | * default timeslice is 100 msecs, maximum timeslice is 800 msecs. | 85 | * default timeslice is 100 msecs, maximum timeslice is 800 msecs. |
85 | * Timeslices get refilled after they expire. | 86 | * Timeslices get refilled after they expire. |
86 | */ | 87 | */ |
87 | #define MIN_TIMESLICE max(5 * HZ / 1000, 1) | 88 | #define MIN_TIMESLICE max(5 * HZ / 1000, 1) |
88 | #define DEF_TIMESLICE (100 * HZ / 1000) | 89 | #define DEF_TIMESLICE (100 * HZ / 1000) |
89 | #define ON_RUNQUEUE_WEIGHT 30 | 90 | #define ON_RUNQUEUE_WEIGHT 30 |
90 | #define CHILD_PENALTY 95 | 91 | #define CHILD_PENALTY 95 |
91 | #define PARENT_PENALTY 100 | 92 | #define PARENT_PENALTY 100 |
92 | #define EXIT_WEIGHT 3 | 93 | #define EXIT_WEIGHT 3 |
93 | #define PRIO_BONUS_RATIO 25 | 94 | #define PRIO_BONUS_RATIO 25 |
94 | #define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) | 95 | #define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) |
95 | #define INTERACTIVE_DELTA 2 | 96 | #define INTERACTIVE_DELTA 2 |
96 | #define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) | 97 | #define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) |
97 | #define STARVATION_LIMIT (MAX_SLEEP_AVG) | 98 | #define STARVATION_LIMIT (MAX_SLEEP_AVG) |
98 | #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) | 99 | #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) |
99 | 100 | ||
100 | /* | 101 | /* |
101 | * If a task is 'interactive' then we reinsert it in the active | 102 | * If a task is 'interactive' then we reinsert it in the active |
102 | * array after it has expired its current timeslice. (it will not | 103 | * array after it has expired its current timeslice. (it will not |
103 | * continue to run immediately, it will still roundrobin with | 104 | * continue to run immediately, it will still roundrobin with |
104 | * other interactive tasks.) | 105 | * other interactive tasks.) |
105 | * | 106 | * |
106 | * This part scales the interactivity limit depending on niceness. | 107 | * This part scales the interactivity limit depending on niceness. |
107 | * | 108 | * |
108 | * We scale it linearly, offset by the INTERACTIVE_DELTA delta. | 109 | * We scale it linearly, offset by the INTERACTIVE_DELTA delta. |
109 | * Here are a few examples of different nice levels: | 110 | * Here are a few examples of different nice levels: |
110 | * | 111 | * |
111 | * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] | 112 | * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] |
112 | * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] | 113 | * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] |
113 | * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] | 114 | * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] |
114 | * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] | 115 | * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] |
115 | * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] | 116 | * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] |
116 | * | 117 | * |
117 | * (the X axis represents the possible -5 ... 0 ... +5 dynamic | 118 | * (the X axis represents the possible -5 ... 0 ... +5 dynamic |
118 | * priority range a task can explore, a value of '1' means the | 119 | * priority range a task can explore, a value of '1' means the |
119 | * task is rated interactive.) | 120 | * task is rated interactive.) |
120 | * | 121 | * |
121 | * Ie. nice +19 tasks can never get 'interactive' enough to be | 122 | * Ie. nice +19 tasks can never get 'interactive' enough to be |
122 | * reinserted into the active array. And only heavily CPU-hog nice -20 | 123 | * reinserted into the active array. And only heavily CPU-hog nice -20 |
123 | * tasks will be expired. Default nice 0 tasks are somewhere between, | 124 | * tasks will be expired. Default nice 0 tasks are somewhere between, |
124 | * it takes some effort for them to get interactive, but it's not | 125 | * it takes some effort for them to get interactive, but it's not |
125 | * too hard. | 126 | * too hard. |
126 | */ | 127 | */ |
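A worked example for the nice-0 row of the table above, using the macros defined just below: MAX_BONUS = MAX_USER_PRIO * PRIO_BONUS_RATIO / 100 = 40 * 25 / 100 = 10, so DELTA(p) = SCALE(0, 40, 10) + INTERACTIVE_DELTA = 0 + 2 = 2 for a nice-0 task. TASK_INTERACTIVE() then requires prio <= static_prio - 2, i.e. a dynamic-priority bonus of at least 2 below the static priority, which is why only the -5..-2 slots of the nice-0 row read '1'.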
127 | 128 | ||
128 | #define CURRENT_BONUS(p) \ | 129 | #define CURRENT_BONUS(p) \ |
129 | (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ | 130 | (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ |
130 | MAX_SLEEP_AVG) | 131 | MAX_SLEEP_AVG) |
131 | 132 | ||
132 | #define GRANULARITY (10 * HZ / 1000 ? : 1) | 133 | #define GRANULARITY (10 * HZ / 1000 ? : 1) |
133 | 134 | ||
134 | #ifdef CONFIG_SMP | 135 | #ifdef CONFIG_SMP |
135 | #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ | 136 | #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ |
136 | (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ | 137 | (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ |
137 | num_online_cpus()) | 138 | num_online_cpus()) |
138 | #else | 139 | #else |
139 | #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ | 140 | #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ |
140 | (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) | 141 | (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) |
141 | #endif | 142 | #endif |
142 | 143 | ||
143 | #define SCALE(v1,v1_max,v2_max) \ | 144 | #define SCALE(v1,v1_max,v2_max) \ |
144 | (v1) * (v2_max) / (v1_max) | 145 | (v1) * (v2_max) / (v1_max) |
145 | 146 | ||
146 | #define DELTA(p) \ | 147 | #define DELTA(p) \ |
147 | (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) | 148 | (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) |
148 | 149 | ||
149 | #define TASK_INTERACTIVE(p) \ | 150 | #define TASK_INTERACTIVE(p) \ |
150 | ((p)->prio <= (p)->static_prio - DELTA(p)) | 151 | ((p)->prio <= (p)->static_prio - DELTA(p)) |
151 | 152 | ||
152 | #define INTERACTIVE_SLEEP(p) \ | 153 | #define INTERACTIVE_SLEEP(p) \ |
153 | (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ | 154 | (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ |
154 | (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) | 155 | (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) |
155 | 156 | ||
156 | #define TASK_PREEMPTS_CURR(p, rq) \ | 157 | #define TASK_PREEMPTS_CURR(p, rq) \ |
157 | ((p)->prio < (rq)->curr->prio) | 158 | ((p)->prio < (rq)->curr->prio) |
158 | 159 | ||
159 | /* | 160 | /* |
160 | * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] | 161 | * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] |
161 | * to time slice values: [800ms ... 100ms ... 5ms] | 162 | * to time slice values: [800ms ... 100ms ... 5ms] |
162 | * | 163 | * |
163 | * The higher a thread's priority, the bigger timeslices | 164 | * The higher a thread's priority, the bigger timeslices |
164 | * it gets during one round of execution. But even the lowest | 165 | * it gets during one round of execution. But even the lowest |
165 | * priority thread gets MIN_TIMESLICE worth of execution time. | 166 | * priority thread gets MIN_TIMESLICE worth of execution time. |
166 | */ | 167 | */ |
167 | 168 | ||
168 | #define SCALE_PRIO(x, prio) \ | 169 | #define SCALE_PRIO(x, prio) \ |
169 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) | 170 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) |
170 | 171 | ||
171 | static unsigned int task_timeslice(task_t *p) | 172 | static unsigned int task_timeslice(task_t *p) |
172 | { | 173 | { |
173 | if (p->static_prio < NICE_TO_PRIO(0)) | 174 | if (p->static_prio < NICE_TO_PRIO(0)) |
174 | return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); | 175 | return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); |
175 | else | 176 | else |
176 | return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); | 177 | return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); |
177 | } | 178 | } |
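Plugging in the usual constants (MAX_RT_PRIO = 100 and MAX_PRIO = 140 come from sched.h and are assumptions here, not shown in this hunk, giving MAX_USER_PRIO/2 = 20): a nice-0 task has static_prio 120, so task_timeslice() returns SCALE_PRIO(DEF_TIMESLICE, 120) = 100ms * (140 - 120) / 20 = 100 ms; a nice -20 task has static_prio 100 and gets SCALE_PRIO(DEF_TIMESLICE * 4, 100) = 400ms * 40 / 20 = 800 ms; a nice +19 task has static_prio 139 and gets max(100ms * 1 / 20, MIN_TIMESLICE) = 5 ms, matching the [800ms ... 100ms ... 5ms] range quoted in the comment above (the values are really in jiffies, DEF_TIMESLICE being 100 * HZ / 1000).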
178 | #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ | 179 | #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ |
179 | < (long long) (sd)->cache_hot_time) | 180 | < (long long) (sd)->cache_hot_time) |
180 | 181 | ||
181 | /* | 182 | /* |
182 | * These are the runqueue data structures: | 183 | * These are the runqueue data structures: |
183 | */ | 184 | */ |
184 | 185 | ||
185 | #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) | 186 | #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) |
186 | 187 | ||
187 | typedef struct runqueue runqueue_t; | 188 | typedef struct runqueue runqueue_t; |
188 | 189 | ||
189 | struct prio_array { | 190 | struct prio_array { |
190 | unsigned int nr_active; | 191 | unsigned int nr_active; |
191 | unsigned long bitmap[BITMAP_SIZE]; | 192 | unsigned long bitmap[BITMAP_SIZE]; |
192 | struct list_head queue[MAX_PRIO]; | 193 | struct list_head queue[MAX_PRIO]; |
193 | }; | 194 | }; |
194 | 195 | ||
195 | /* | 196 | /* |
196 | * This is the main, per-CPU runqueue data structure. | 197 | * This is the main, per-CPU runqueue data structure. |
197 | * | 198 | * |
198 | * Locking rule: in places that need to lock multiple runqueues | 199 | * Locking rule: in places that need to lock multiple runqueues |
199 | * (such as the load balancing or the thread migration code), the lock | 200 | * (such as the load balancing or the thread migration code), the lock |
200 | * acquisitions must be ordered by ascending &runqueue address. | 201 | * acquisitions must be ordered by ascending &runqueue address. |
201 | */ | 202 | */ |
202 | struct runqueue { | 203 | struct runqueue { |
203 | spinlock_t lock; | 204 | spinlock_t lock; |
204 | 205 | ||
205 | /* | 206 | /* |
206 | * nr_running and cpu_load should be in the same cacheline because | 207 | * nr_running and cpu_load should be in the same cacheline because |
207 | * remote CPUs use both these fields when doing load calculation. | 208 | * remote CPUs use both these fields when doing load calculation. |
208 | */ | 209 | */ |
209 | unsigned long nr_running; | 210 | unsigned long nr_running; |
210 | #ifdef CONFIG_SMP | 211 | #ifdef CONFIG_SMP |
211 | unsigned long cpu_load[3]; | 212 | unsigned long cpu_load[3]; |
212 | #endif | 213 | #endif |
213 | unsigned long long nr_switches; | 214 | unsigned long long nr_switches; |
214 | 215 | ||
215 | /* | 216 | /* |
216 | * This is part of a global counter where only the total sum | 217 | * This is part of a global counter where only the total sum |
217 | * over all CPUs matters. A task can increase this counter on | 218 | * over all CPUs matters. A task can increase this counter on |
218 | * one CPU and if it got migrated afterwards it may decrease | 219 | * one CPU and if it got migrated afterwards it may decrease |
219 | * it on another CPU. Always updated under the runqueue lock: | 220 | * it on another CPU. Always updated under the runqueue lock: |
220 | */ | 221 | */ |
221 | unsigned long nr_uninterruptible; | 222 | unsigned long nr_uninterruptible; |
222 | 223 | ||
223 | unsigned long expired_timestamp; | 224 | unsigned long expired_timestamp; |
224 | unsigned long long timestamp_last_tick; | 225 | unsigned long long timestamp_last_tick; |
225 | task_t *curr, *idle; | 226 | task_t *curr, *idle; |
226 | struct mm_struct *prev_mm; | 227 | struct mm_struct *prev_mm; |
227 | prio_array_t *active, *expired, arrays[2]; | 228 | prio_array_t *active, *expired, arrays[2]; |
228 | int best_expired_prio; | 229 | int best_expired_prio; |
229 | atomic_t nr_iowait; | 230 | atomic_t nr_iowait; |
230 | 231 | ||
231 | #ifdef CONFIG_SMP | 232 | #ifdef CONFIG_SMP |
232 | struct sched_domain *sd; | 233 | struct sched_domain *sd; |
233 | 234 | ||
234 | /* For active balancing */ | 235 | /* For active balancing */ |
235 | int active_balance; | 236 | int active_balance; |
236 | int push_cpu; | 237 | int push_cpu; |
237 | 238 | ||
238 | task_t *migration_thread; | 239 | task_t *migration_thread; |
239 | struct list_head migration_queue; | 240 | struct list_head migration_queue; |
240 | int cpu; | 241 | int cpu; |
241 | #endif | 242 | #endif |
242 | 243 | ||
243 | #ifdef CONFIG_SCHEDSTATS | 244 | #ifdef CONFIG_SCHEDSTATS |
244 | /* latency stats */ | 245 | /* latency stats */ |
245 | struct sched_info rq_sched_info; | 246 | struct sched_info rq_sched_info; |
246 | 247 | ||
247 | /* sys_sched_yield() stats */ | 248 | /* sys_sched_yield() stats */ |
248 | unsigned long yld_exp_empty; | 249 | unsigned long yld_exp_empty; |
249 | unsigned long yld_act_empty; | 250 | unsigned long yld_act_empty; |
250 | unsigned long yld_both_empty; | 251 | unsigned long yld_both_empty; |
251 | unsigned long yld_cnt; | 252 | unsigned long yld_cnt; |
252 | 253 | ||
253 | /* schedule() stats */ | 254 | /* schedule() stats */ |
254 | unsigned long sched_switch; | 255 | unsigned long sched_switch; |
255 | unsigned long sched_cnt; | 256 | unsigned long sched_cnt; |
256 | unsigned long sched_goidle; | 257 | unsigned long sched_goidle; |
257 | 258 | ||
258 | /* try_to_wake_up() stats */ | 259 | /* try_to_wake_up() stats */ |
259 | unsigned long ttwu_cnt; | 260 | unsigned long ttwu_cnt; |
260 | unsigned long ttwu_local; | 261 | unsigned long ttwu_local; |
261 | #endif | 262 | #endif |
262 | }; | 263 | }; |
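The locking rule stated above can be made concrete with a small, hypothetical helper (the function name is an illustration only; the scheduler's real double-lock helpers follow the same ascending-address ordering and live elsewhere in this file):

/* Hypothetical sketch of the ascending-&runqueue lock ordering rule */
static void example_lock_two_runqueues(runqueue_t *rq1, runqueue_t *rq2)
{
	if (rq1 == rq2) {
		spin_lock(&rq1->lock);
	} else if (rq1 < rq2) {		/* lower address first */
		spin_lock(&rq1->lock);
		spin_lock(&rq2->lock);
	} else {
		spin_lock(&rq2->lock);
		spin_lock(&rq1->lock);
	}
}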
263 | 264 | ||
264 | static DEFINE_PER_CPU(struct runqueue, runqueues); | 265 | static DEFINE_PER_CPU(struct runqueue, runqueues); |
265 | 266 | ||
266 | /* | 267 | /* |
267 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. | 268 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. |
268 | * See detach_destroy_domains: synchronize_sched for details. | 269 | * See detach_destroy_domains: synchronize_sched for details. |
269 | * | 270 | * |
270 | * The domain tree of any CPU may only be accessed from within | 271 | * The domain tree of any CPU may only be accessed from within |
271 | * preempt-disabled sections. | 272 | * preempt-disabled sections. |
272 | */ | 273 | */ |
273 | #define for_each_domain(cpu, domain) \ | 274 | #define for_each_domain(cpu, domain) \ |
274 | for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent) | 275 | for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent) |
275 | 276 | ||
276 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | 277 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) |
277 | #define this_rq() (&__get_cpu_var(runqueues)) | 278 | #define this_rq() (&__get_cpu_var(runqueues)) |
278 | #define task_rq(p) cpu_rq(task_cpu(p)) | 279 | #define task_rq(p) cpu_rq(task_cpu(p)) |
279 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 280 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
280 | 281 | ||
281 | #ifndef prepare_arch_switch | 282 | #ifndef prepare_arch_switch |
282 | # define prepare_arch_switch(next) do { } while (0) | 283 | # define prepare_arch_switch(next) do { } while (0) |
283 | #endif | 284 | #endif |
284 | #ifndef finish_arch_switch | 285 | #ifndef finish_arch_switch |
285 | # define finish_arch_switch(prev) do { } while (0) | 286 | # define finish_arch_switch(prev) do { } while (0) |
286 | #endif | 287 | #endif |
287 | 288 | ||
288 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | 289 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW |
289 | static inline int task_running(runqueue_t *rq, task_t *p) | 290 | static inline int task_running(runqueue_t *rq, task_t *p) |
290 | { | 291 | { |
291 | return rq->curr == p; | 292 | return rq->curr == p; |
292 | } | 293 | } |
293 | 294 | ||
294 | static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) | 295 | static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) |
295 | { | 296 | { |
296 | } | 297 | } |
297 | 298 | ||
298 | static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) | 299 | static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) |
299 | { | 300 | { |
300 | #ifdef CONFIG_DEBUG_SPINLOCK | 301 | #ifdef CONFIG_DEBUG_SPINLOCK |
301 | /* this is a valid case when another task releases the spinlock */ | 302 | /* this is a valid case when another task releases the spinlock */ |
302 | rq->lock.owner = current; | 303 | rq->lock.owner = current; |
303 | #endif | 304 | #endif |
304 | spin_unlock_irq(&rq->lock); | 305 | spin_unlock_irq(&rq->lock); |
305 | } | 306 | } |
306 | 307 | ||
307 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | 308 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ |
308 | static inline int task_running(runqueue_t *rq, task_t *p) | 309 | static inline int task_running(runqueue_t *rq, task_t *p) |
309 | { | 310 | { |
310 | #ifdef CONFIG_SMP | 311 | #ifdef CONFIG_SMP |
311 | return p->oncpu; | 312 | return p->oncpu; |
312 | #else | 313 | #else |
313 | return rq->curr == p; | 314 | return rq->curr == p; |
314 | #endif | 315 | #endif |
315 | } | 316 | } |
316 | 317 | ||
317 | static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) | 318 | static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) |
318 | { | 319 | { |
319 | #ifdef CONFIG_SMP | 320 | #ifdef CONFIG_SMP |
320 | /* | 321 | /* |
321 | * We can optimise this out completely for !SMP, because the | 322 | * We can optimise this out completely for !SMP, because the |
322 | * SMP rebalancing from interrupt is the only thing that cares | 323 | * SMP rebalancing from interrupt is the only thing that cares |
323 | * here. | 324 | * here. |
324 | */ | 325 | */ |
325 | next->oncpu = 1; | 326 | next->oncpu = 1; |
326 | #endif | 327 | #endif |
327 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 328 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
328 | spin_unlock_irq(&rq->lock); | 329 | spin_unlock_irq(&rq->lock); |
329 | #else | 330 | #else |
330 | spin_unlock(&rq->lock); | 331 | spin_unlock(&rq->lock); |
331 | #endif | 332 | #endif |
332 | } | 333 | } |
333 | 334 | ||
334 | static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) | 335 | static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) |
335 | { | 336 | { |
336 | #ifdef CONFIG_SMP | 337 | #ifdef CONFIG_SMP |
337 | /* | 338 | /* |
338 | * After ->oncpu is cleared, the task can be moved to a different CPU. | 339 | * After ->oncpu is cleared, the task can be moved to a different CPU. |
339 | * We must ensure this doesn't happen until the switch is completely | 340 | * We must ensure this doesn't happen until the switch is completely |
340 | * finished. | 341 | * finished. |
341 | */ | 342 | */ |
342 | smp_wmb(); | 343 | smp_wmb(); |
343 | prev->oncpu = 0; | 344 | prev->oncpu = 0; |
344 | #endif | 345 | #endif |
345 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 346 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
346 | local_irq_enable(); | 347 | local_irq_enable(); |
347 | #endif | 348 | #endif |
348 | } | 349 | } |
349 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | 350 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
350 | 351 | ||
351 | /* | 352 | /* |
352 | * task_rq_lock - lock the runqueue a given task resides on and disable | 353 | * task_rq_lock - lock the runqueue a given task resides on and disable |
353 | * interrupts. Note the ordering: we can safely lookup the task_rq without | 354 | * interrupts. Note the ordering: we can safely lookup the task_rq without |
354 | * explicitly disabling preemption. | 355 | * explicitly disabling preemption. |
355 | */ | 356 | */ |
356 | static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) | 357 | static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) |
357 | __acquires(rq->lock) | 358 | __acquires(rq->lock) |
358 | { | 359 | { |
359 | struct runqueue *rq; | 360 | struct runqueue *rq; |
360 | 361 | ||
361 | repeat_lock_task: | 362 | repeat_lock_task: |
362 | local_irq_save(*flags); | 363 | local_irq_save(*flags); |
363 | rq = task_rq(p); | 364 | rq = task_rq(p); |
364 | spin_lock(&rq->lock); | 365 | spin_lock(&rq->lock); |
365 | if (unlikely(rq != task_rq(p))) { | 366 | if (unlikely(rq != task_rq(p))) { |
366 | spin_unlock_irqrestore(&rq->lock, *flags); | 367 | spin_unlock_irqrestore(&rq->lock, *flags); |
367 | goto repeat_lock_task; | 368 | goto repeat_lock_task; |
368 | } | 369 | } |
369 | return rq; | 370 | return rq; |
370 | } | 371 | } |
371 | 372 | ||
372 | static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) | 373 | static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) |
373 | __releases(rq->lock) | 374 | __releases(rq->lock) |
374 | { | 375 | { |
375 | spin_unlock_irqrestore(&rq->lock, *flags); | 376 | spin_unlock_irqrestore(&rq->lock, *flags); |
376 | } | 377 | } |
377 | 378 | ||
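
The retry loop above is what lets callers pin a task's runqueue without disabling preemption first: if the task migrates between the task_rq() lookup and the spin_lock(), the recheck catches it and the lookup is retried. A hypothetical caller, assuming this file's context, looks like:

    /* Hypothetical caller sketch, assuming this file's context. */
    static void demo_with_task_rq(task_t *p)
    {
            unsigned long flags;
            runqueue_t *rq;

            rq = task_rq_lock(p, &flags);
            /* while rq->lock is held, p cannot move to another runqueue */
            task_rq_unlock(rq, &flags);
    }
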
378 | #ifdef CONFIG_SCHEDSTATS | 379 | #ifdef CONFIG_SCHEDSTATS |
379 | /* | 380 | /* |
380 | * bump this up when changing the output format or the meaning of an existing | 381 | * bump this up when changing the output format or the meaning of an existing |
381 | * format, so that tools can adapt (or abort) | 382 | * format, so that tools can adapt (or abort) |
382 | */ | 383 | */ |
383 | #define SCHEDSTAT_VERSION 12 | 384 | #define SCHEDSTAT_VERSION 12 |
384 | 385 | ||
385 | static int show_schedstat(struct seq_file *seq, void *v) | 386 | static int show_schedstat(struct seq_file *seq, void *v) |
386 | { | 387 | { |
387 | int cpu; | 388 | int cpu; |
388 | 389 | ||
389 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); | 390 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); |
390 | seq_printf(seq, "timestamp %lu\n", jiffies); | 391 | seq_printf(seq, "timestamp %lu\n", jiffies); |
391 | for_each_online_cpu(cpu) { | 392 | for_each_online_cpu(cpu) { |
392 | runqueue_t *rq = cpu_rq(cpu); | 393 | runqueue_t *rq = cpu_rq(cpu); |
393 | #ifdef CONFIG_SMP | 394 | #ifdef CONFIG_SMP |
394 | struct sched_domain *sd; | 395 | struct sched_domain *sd; |
395 | int dcnt = 0; | 396 | int dcnt = 0; |
396 | #endif | 397 | #endif |
397 | 398 | ||
398 | /* runqueue-specific stats */ | 399 | /* runqueue-specific stats */ |
399 | seq_printf(seq, | 400 | seq_printf(seq, |
400 | "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", | 401 | "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", |
401 | cpu, rq->yld_both_empty, | 402 | cpu, rq->yld_both_empty, |
402 | rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, | 403 | rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, |
403 | rq->sched_switch, rq->sched_cnt, rq->sched_goidle, | 404 | rq->sched_switch, rq->sched_cnt, rq->sched_goidle, |
404 | rq->ttwu_cnt, rq->ttwu_local, | 405 | rq->ttwu_cnt, rq->ttwu_local, |
405 | rq->rq_sched_info.cpu_time, | 406 | rq->rq_sched_info.cpu_time, |
406 | rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); | 407 | rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); |
407 | 408 | ||
408 | seq_printf(seq, "\n"); | 409 | seq_printf(seq, "\n"); |
409 | 410 | ||
410 | #ifdef CONFIG_SMP | 411 | #ifdef CONFIG_SMP |
411 | /* domain-specific stats */ | 412 | /* domain-specific stats */ |
412 | preempt_disable(); | 413 | preempt_disable(); |
413 | for_each_domain(cpu, sd) { | 414 | for_each_domain(cpu, sd) { |
414 | enum idle_type itype; | 415 | enum idle_type itype; |
415 | char mask_str[NR_CPUS]; | 416 | char mask_str[NR_CPUS]; |
416 | 417 | ||
417 | cpumask_scnprintf(mask_str, NR_CPUS, sd->span); | 418 | cpumask_scnprintf(mask_str, NR_CPUS, sd->span); |
418 | seq_printf(seq, "domain%d %s", dcnt++, mask_str); | 419 | seq_printf(seq, "domain%d %s", dcnt++, mask_str); |
419 | for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; | 420 | for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; |
420 | itype++) { | 421 | itype++) { |
421 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", | 422 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", |
422 | sd->lb_cnt[itype], | 423 | sd->lb_cnt[itype], |
423 | sd->lb_balanced[itype], | 424 | sd->lb_balanced[itype], |
424 | sd->lb_failed[itype], | 425 | sd->lb_failed[itype], |
425 | sd->lb_imbalance[itype], | 426 | sd->lb_imbalance[itype], |
426 | sd->lb_gained[itype], | 427 | sd->lb_gained[itype], |
427 | sd->lb_hot_gained[itype], | 428 | sd->lb_hot_gained[itype], |
428 | sd->lb_nobusyq[itype], | 429 | sd->lb_nobusyq[itype], |
429 | sd->lb_nobusyg[itype]); | 430 | sd->lb_nobusyg[itype]); |
430 | } | 431 | } |
431 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", | 432 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", |
432 | sd->alb_cnt, sd->alb_failed, sd->alb_pushed, | 433 | sd->alb_cnt, sd->alb_failed, sd->alb_pushed, |
433 | sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, | 434 | sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, |
434 | sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, | 435 | sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, |
435 | sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); | 436 | sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); |
436 | } | 437 | } |
437 | preempt_enable(); | 438 | preempt_enable(); |
438 | #endif | 439 | #endif |
439 | } | 440 | } |
440 | return 0; | 441 | return 0; |
441 | } | 442 | } |
442 | 443 | ||
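
The per-cpu line printed above fixes the version-12 field order: cpuN, then yld_both_empty, yld_act_empty, yld_exp_empty, yld_cnt, sched_switch, sched_cnt, sched_goidle, ttwu_cnt, ttwu_local, cpu_time, run_delay, pcnt. A minimal user-space sketch that relies only on that order (the usual /proc/schedstat path is assumed; it is registered elsewhere, and the field choice is illustrative):

    #include <stdio.h>

    /* Print each cpu's ttwu_cnt (8th numeric field), following the
     * field order of show_schedstat() above.  Illustrative only. */
    int main(void)
    {
            char line[512];
            FILE *f = fopen("/proc/schedstat", "r");

            if (!f)
                    return 1;
            while (fgets(line, sizeof(line), f)) {
                    unsigned long v[12];
                    int cpu;

                    if (sscanf(line, "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
                               &cpu, &v[0], &v[1], &v[2], &v[3], &v[4], &v[5],
                               &v[6], &v[7], &v[8], &v[9], &v[10], &v[11]) == 13)
                            printf("cpu%d ttwu_cnt=%lu\n", cpu, v[7]);
            }
            fclose(f);
            return 0;
    }
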
443 | static int schedstat_open(struct inode *inode, struct file *file) | 444 | static int schedstat_open(struct inode *inode, struct file *file) |
444 | { | 445 | { |
445 | unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); | 446 | unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); |
446 | char *buf = kmalloc(size, GFP_KERNEL); | 447 | char *buf = kmalloc(size, GFP_KERNEL); |
447 | struct seq_file *m; | 448 | struct seq_file *m; |
448 | int res; | 449 | int res; |
449 | 450 | ||
450 | if (!buf) | 451 | if (!buf) |
451 | return -ENOMEM; | 452 | return -ENOMEM; |
452 | res = single_open(file, show_schedstat, NULL); | 453 | res = single_open(file, show_schedstat, NULL); |
453 | if (!res) { | 454 | if (!res) { |
454 | m = file->private_data; | 455 | m = file->private_data; |
455 | m->buf = buf; | 456 | m->buf = buf; |
456 | m->size = size; | 457 | m->size = size; |
457 | } else | 458 | } else |
458 | kfree(buf); | 459 | kfree(buf); |
459 | return res; | 460 | return res; |
460 | } | 461 | } |
461 | 462 | ||
462 | struct file_operations proc_schedstat_operations = { | 463 | struct file_operations proc_schedstat_operations = { |
463 | .open = schedstat_open, | 464 | .open = schedstat_open, |
464 | .read = seq_read, | 465 | .read = seq_read, |
465 | .llseek = seq_lseek, | 466 | .llseek = seq_lseek, |
466 | .release = single_release, | 467 | .release = single_release, |
467 | }; | 468 | }; |
468 | 469 | ||
469 | # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) | 470 | # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) |
470 | # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) | 471 | # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) |
471 | #else /* !CONFIG_SCHEDSTATS */ | 472 | #else /* !CONFIG_SCHEDSTATS */ |
472 | # define schedstat_inc(rq, field) do { } while (0) | 473 | # define schedstat_inc(rq, field) do { } while (0) |
473 | # define schedstat_add(rq, field, amt) do { } while (0) | 474 | # define schedstat_add(rq, field, amt) do { } while (0) |
474 | #endif | 475 | #endif |
475 | 476 | ||
476 | /* | 477 | /* |
477 | * this_rq_lock - lock this CPU's runqueue and disable interrupts. | 478 | * this_rq_lock - lock this CPU's runqueue and disable interrupts. |

478 | */ | 479 | */ |
479 | static inline runqueue_t *this_rq_lock(void) | 480 | static inline runqueue_t *this_rq_lock(void) |
480 | __acquires(rq->lock) | 481 | __acquires(rq->lock) |
481 | { | 482 | { |
482 | runqueue_t *rq; | 483 | runqueue_t *rq; |
483 | 484 | ||
484 | local_irq_disable(); | 485 | local_irq_disable(); |
485 | rq = this_rq(); | 486 | rq = this_rq(); |
486 | spin_lock(&rq->lock); | 487 | spin_lock(&rq->lock); |
487 | 488 | ||
488 | return rq; | 489 | return rq; |
489 | } | 490 | } |
490 | 491 | ||
491 | #ifdef CONFIG_SCHEDSTATS | 492 | #ifdef CONFIG_SCHEDSTATS |
492 | /* | 493 | /* |
493 | * Called when a process is dequeued from the active array and given | 494 | * Called when a process is dequeued from the active array and given |
494 | * the cpu. We should note that with the exception of interactive | 495 | * the cpu. We should note that with the exception of interactive |
495 | * tasks, the expired queue will become the active queue after the active | 496 | * tasks, the expired queue will become the active queue after the active |
496 | * queue is empty, without explicitly dequeuing and requeuing tasks in the | 497 | * queue is empty, without explicitly dequeuing and requeuing tasks in the |
497 | * expired queue. (Interactive tasks may be requeued directly to the | 498 | * expired queue. (Interactive tasks may be requeued directly to the |
498 | * active queue, thus delaying tasks in the expired queue from running; | 499 | * active queue, thus delaying tasks in the expired queue from running; |
499 | * see scheduler_tick()). | 500 | * see scheduler_tick()). |
500 | * | 501 | * |
501 | * This function is only called from sched_info_arrive(), rather than | 502 | * This function is only called from sched_info_arrive(), rather than |
502 | * dequeue_task(). Even though a task may be queued and dequeued multiple | 503 | * dequeue_task(). Even though a task may be queued and dequeued multiple |
503 | * times as it is shuffled about, we're really interested in knowing how | 504 | * times as it is shuffled about, we're really interested in knowing how |
504 | * long it was from the *first* time it was queued to the time that it | 505 | * long it was from the *first* time it was queued to the time that it |
505 | * finally hit a cpu. | 506 | * finally hit a cpu. |
506 | */ | 507 | */ |
507 | static inline void sched_info_dequeued(task_t *t) | 508 | static inline void sched_info_dequeued(task_t *t) |
508 | { | 509 | { |
509 | t->sched_info.last_queued = 0; | 510 | t->sched_info.last_queued = 0; |
510 | } | 511 | } |
511 | 512 | ||
512 | /* | 513 | /* |
513 | * Called when a task finally hits the cpu. We can now calculate how | 514 | * Called when a task finally hits the cpu. We can now calculate how |
514 | * long it was waiting to run. We also note when it began so that we | 515 | * long it was waiting to run. We also note when it began so that we |
515 | * can keep stats on how long its timeslice is. | 516 | * can keep stats on how long its timeslice is. |
516 | */ | 517 | */ |
517 | static void sched_info_arrive(task_t *t) | 518 | static void sched_info_arrive(task_t *t) |
518 | { | 519 | { |
519 | unsigned long now = jiffies, diff = 0; | 520 | unsigned long now = jiffies, diff = 0; |
520 | struct runqueue *rq = task_rq(t); | 521 | struct runqueue *rq = task_rq(t); |
521 | 522 | ||
522 | if (t->sched_info.last_queued) | 523 | if (t->sched_info.last_queued) |
523 | diff = now - t->sched_info.last_queued; | 524 | diff = now - t->sched_info.last_queued; |
524 | sched_info_dequeued(t); | 525 | sched_info_dequeued(t); |
525 | t->sched_info.run_delay += diff; | 526 | t->sched_info.run_delay += diff; |
526 | t->sched_info.last_arrival = now; | 527 | t->sched_info.last_arrival = now; |
527 | t->sched_info.pcnt++; | 528 | t->sched_info.pcnt++; |
528 | 529 | ||
529 | if (!rq) | 530 | if (!rq) |
530 | return; | 531 | return; |
531 | 532 | ||
532 | rq->rq_sched_info.run_delay += diff; | 533 | rq->rq_sched_info.run_delay += diff; |
533 | rq->rq_sched_info.pcnt++; | 534 | rq->rq_sched_info.pcnt++; |
534 | } | 535 | } |
535 | 536 | ||
536 | /* | 537 | /* |
537 | * Called when a process is queued into either the active or expired | 538 | * Called when a process is queued into either the active or expired |
538 | * array. The time is noted and later used to determine how long the | 539 | * array. The time is noted and later used to determine how long the |
539 | * task had to wait to reach the cpu. Since the expired queue will | 540 | * task had to wait to reach the cpu. Since the expired queue will |
540 | * become the active queue after active queue is empty, without dequeuing | 541 | * become the active queue after active queue is empty, without dequeuing |
541 | * and requeuing any tasks, we are interested in queuing to either. It | 542 | * and requeuing any tasks, we are interested in queuing to either. It |
542 | * is unusual but not impossible for tasks to be dequeued and immediately | 543 | * is unusual but not impossible for tasks to be dequeued and immediately |
543 | * requeued in the same or another array: this can happen in sched_yield(), | 544 | * requeued in the same or another array: this can happen in sched_yield(), |
544 | * set_user_nice(), and even load_balance() as it moves tasks from runqueue | 545 | * set_user_nice(), and even load_balance() as it moves tasks from runqueue |
545 | * to runqueue. | 546 | * to runqueue. |
546 | * | 547 | * |
547 | * This function is only called from enqueue_task(), but also only updates | 548 | * This function is only called from enqueue_task(), but also only updates |
548 | * the timestamp if it is not already set. It's assumed that | 549 | * the timestamp if it is not already set. It's assumed that |
549 | * sched_info_dequeued() will clear that stamp when appropriate. | 550 | * sched_info_dequeued() will clear that stamp when appropriate. |
550 | */ | 551 | */ |
551 | static inline void sched_info_queued(task_t *t) | 552 | static inline void sched_info_queued(task_t *t) |
552 | { | 553 | { |
553 | if (!t->sched_info.last_queued) | 554 | if (!t->sched_info.last_queued) |
554 | t->sched_info.last_queued = jiffies; | 555 | t->sched_info.last_queued = jiffies; |
555 | } | 556 | } |
556 | 557 | ||
557 | /* | 558 | /* |
558 | * Called when a process ceases being the active-running process, either | 559 | * Called when a process ceases being the active-running process, either |
559 | * voluntarily or involuntarily. Now we can calculate how long we ran. | 560 | * voluntarily or involuntarily. Now we can calculate how long we ran. |
560 | */ | 561 | */ |
561 | static inline void sched_info_depart(task_t *t) | 562 | static inline void sched_info_depart(task_t *t) |
562 | { | 563 | { |
563 | struct runqueue *rq = task_rq(t); | 564 | struct runqueue *rq = task_rq(t); |
564 | unsigned long diff = jiffies - t->sched_info.last_arrival; | 565 | unsigned long diff = jiffies - t->sched_info.last_arrival; |
565 | 566 | ||
566 | t->sched_info.cpu_time += diff; | 567 | t->sched_info.cpu_time += diff; |
567 | 568 | ||
568 | if (rq) | 569 | if (rq) |
569 | rq->rq_sched_info.cpu_time += diff; | 570 | rq->rq_sched_info.cpu_time += diff; |
570 | } | 571 | } |
571 | 572 | ||
572 | /* | 573 | /* |
573 | * Called when tasks are switched involuntarily, typically due to expiring | 574 | * Called when tasks are switched involuntarily, typically due to expiring |
574 | * their time slice. (This may also be called when switching to or from | 575 | * their time slice. (This may also be called when switching to or from |
575 | * the idle task.) We are only called when prev != next. | 576 | * the idle task.) We are only called when prev != next. |
576 | */ | 577 | */ |
577 | static inline void sched_info_switch(task_t *prev, task_t *next) | 578 | static inline void sched_info_switch(task_t *prev, task_t *next) |
578 | { | 579 | { |
579 | struct runqueue *rq = task_rq(prev); | 580 | struct runqueue *rq = task_rq(prev); |
580 | 581 | ||
581 | /* | 582 | /* |
582 | * prev now departs the cpu. It's not interesting to record | 583 | * prev now departs the cpu. It's not interesting to record |
583 | * stats about how efficient we were at scheduling the idle | 584 | * stats about how efficient we were at scheduling the idle |
584 | * process, however. | 585 | * process, however. |
585 | */ | 586 | */ |
586 | if (prev != rq->idle) | 587 | if (prev != rq->idle) |
587 | sched_info_depart(prev); | 588 | sched_info_depart(prev); |
588 | 589 | ||
589 | if (next != rq->idle) | 590 | if (next != rq->idle) |
590 | sched_info_arrive(next); | 591 | sched_info_arrive(next); |
591 | } | 592 | } |
592 | #else | 593 | #else |
593 | #define sched_info_queued(t) do { } while (0) | 594 | #define sched_info_queued(t) do { } while (0) |
594 | #define sched_info_switch(t, next) do { } while (0) | 595 | #define sched_info_switch(t, next) do { } while (0) |
595 | #endif /* CONFIG_SCHEDSTATS */ | 596 | #endif /* CONFIG_SCHEDSTATS */ |
596 | 597 | ||
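
The three hooks above implement a simple last_queued/last_arrival bookkeeping: run_delay accumulates time spent waiting on a runqueue, cpu_time accumulates time spent running, and pcnt counts arrivals. A small self-contained user-space model of that bookkeeping (names invented, an abstract tick counter instead of jiffies, the per-runqueue totals omitted):

    #include <stdio.h>

    /* Userspace model of the per-task bookkeeping above (illustrative only). */
    struct demo_sched_info {
            unsigned long last_queued;   /* when first enqueued, 0 if not queued */
            unsigned long last_arrival;  /* when it last got the cpu */
            unsigned long run_delay;     /* total time spent waiting to run */
            unsigned long cpu_time;      /* total time spent on the cpu */
            unsigned long pcnt;          /* number of times it got the cpu */
    };

    static void demo_queued(struct demo_sched_info *t, unsigned long now)
    {
            if (!t->last_queued)
                    t->last_queued = now;
    }

    static void demo_arrive(struct demo_sched_info *t, unsigned long now)
    {
            if (t->last_queued)
                    t->run_delay += now - t->last_queued;
            t->last_queued = 0;
            t->last_arrival = now;
            t->pcnt++;
    }

    static void demo_depart(struct demo_sched_info *t, unsigned long now)
    {
            t->cpu_time += now - t->last_arrival;
    }

    int main(void)
    {
            struct demo_sched_info t = { 0 };

            demo_queued(&t, 100);  /* enqueued at t=100 */
            demo_arrive(&t, 112);  /* got the cpu at t=112: run_delay += 12 */
            demo_depart(&t, 150);  /* left the cpu at t=150: cpu_time += 38 */
            printf("run_delay=%lu cpu_time=%lu pcnt=%lu\n",
                   t.run_delay, t.cpu_time, t.pcnt);
            return 0;
    }
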
597 | /* | 598 | /* |
598 | * Adding/removing a task to/from a priority array: | 599 | * Adding/removing a task to/from a priority array: |
599 | */ | 600 | */ |
600 | static void dequeue_task(struct task_struct *p, prio_array_t *array) | 601 | static void dequeue_task(struct task_struct *p, prio_array_t *array) |
601 | { | 602 | { |
602 | array->nr_active--; | 603 | array->nr_active--; |
603 | list_del(&p->run_list); | 604 | list_del(&p->run_list); |
604 | if (list_empty(array->queue + p->prio)) | 605 | if (list_empty(array->queue + p->prio)) |
605 | __clear_bit(p->prio, array->bitmap); | 606 | __clear_bit(p->prio, array->bitmap); |
606 | } | 607 | } |
607 | 608 | ||
608 | static void enqueue_task(struct task_struct *p, prio_array_t *array) | 609 | static void enqueue_task(struct task_struct *p, prio_array_t *array) |
609 | { | 610 | { |
610 | sched_info_queued(p); | 611 | sched_info_queued(p); |
611 | list_add_tail(&p->run_list, array->queue + p->prio); | 612 | list_add_tail(&p->run_list, array->queue + p->prio); |
612 | __set_bit(p->prio, array->bitmap); | 613 | __set_bit(p->prio, array->bitmap); |
613 | array->nr_active++; | 614 | array->nr_active++; |
614 | p->array = array; | 615 | p->array = array; |
615 | } | 616 | } |
616 | 617 | ||
617 | /* | 618 | /* |
618 | * Put task to the end of the run list without the overhead of dequeue | 619 | * Put task to the end of the run list without the overhead of dequeue |
619 | * followed by enqueue. | 620 | * followed by enqueue. |
620 | */ | 621 | */ |
621 | static void requeue_task(struct task_struct *p, prio_array_t *array) | 622 | static void requeue_task(struct task_struct *p, prio_array_t *array) |
622 | { | 623 | { |
623 | list_move_tail(&p->run_list, array->queue + p->prio); | 624 | list_move_tail(&p->run_list, array->queue + p->prio); |
624 | } | 625 | } |
625 | 626 | ||
626 | static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) | 627 | static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) |
627 | { | 628 | { |
628 | list_add(&p->run_list, array->queue + p->prio); | 629 | list_add(&p->run_list, array->queue + p->prio); |
629 | __set_bit(p->prio, array->bitmap); | 630 | __set_bit(p->prio, array->bitmap); |
630 | array->nr_active++; | 631 | array->nr_active++; |
631 | p->array = array; | 632 | p->array = array; |
632 | } | 633 | } |
633 | 634 | ||
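
enqueue_task()/dequeue_task() above keep one list per priority plus a bitmap of non-empty lists, so picking the next task reduces to "find the first set bit, take the head of that list". A compact user-space model of the idea (sizes and names are illustrative, counters stand in for the per-priority lists):

    #include <stdio.h>
    #include <string.h>

    #define NR_PRIOS 8      /* illustrative; the real array is much larger */

    struct demo_array {
            unsigned int bitmap;          /* bit p set => queue[p] non-empty */
            int nr_queued[NR_PRIOS];      /* stand-in for the per-prio lists */
    };

    static void demo_enqueue(struct demo_array *a, int prio)
    {
            a->nr_queued[prio]++;
            a->bitmap |= 1u << prio;
    }

    static void demo_dequeue(struct demo_array *a, int prio)
    {
            if (--a->nr_queued[prio] == 0)
                    a->bitmap &= ~(1u << prio);
    }

    /* The scheduler's "pick next": lowest set bit = highest-priority queue. */
    static int demo_best_prio(const struct demo_array *a)
    {
            int p;

            for (p = 0; p < NR_PRIOS; p++)
                    if (a->bitmap & (1u << p))
                            return p;
            return -1;      /* empty */
    }

    int main(void)
    {
            struct demo_array a;

            memset(&a, 0, sizeof(a));
            demo_enqueue(&a, 5);
            demo_enqueue(&a, 2);
            printf("best prio = %d\n", demo_best_prio(&a));  /* 2 */
            demo_dequeue(&a, 2);
            printf("best prio = %d\n", demo_best_prio(&a));  /* 5 */
            return 0;
    }
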
634 | /* | 635 | /* |
635 | * effective_prio - return the priority that is based on the static | 636 | * effective_prio - return the priority that is based on the static |
636 | * priority but is modified by bonuses/penalties. | 637 | * priority but is modified by bonuses/penalties. |
637 | * | 638 | * |
638 | * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] | 639 | * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] |
639 | * into the -5 ... 0 ... +5 bonus/penalty range. | 640 | * into the -5 ... 0 ... +5 bonus/penalty range. |
640 | * | 641 | * |
641 | * We use 25% of the full 0...39 priority range so that: | 642 | * We use 25% of the full 0...39 priority range so that: |
642 | * | 643 | * |
643 | * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. | 644 | * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. |
644 | * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. | 645 | * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. |
645 | * | 646 | * |
646 | * Both properties are important to certain workloads. | 647 | * Both properties are important to certain workloads. |
647 | */ | 648 | */ |
648 | static int effective_prio(task_t *p) | 649 | static int effective_prio(task_t *p) |
649 | { | 650 | { |
650 | int bonus, prio; | 651 | int bonus, prio; |
651 | 652 | ||
652 | if (rt_task(p)) | 653 | if (rt_task(p)) |
653 | return p->prio; | 654 | return p->prio; |
654 | 655 | ||
655 | bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; | 656 | bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; |
656 | 657 | ||
657 | prio = p->static_prio - bonus; | 658 | prio = p->static_prio - bonus; |
658 | if (prio < MAX_RT_PRIO) | 659 | if (prio < MAX_RT_PRIO) |
659 | prio = MAX_RT_PRIO; | 660 | prio = MAX_RT_PRIO; |
660 | if (prio > MAX_PRIO-1) | 661 | if (prio > MAX_PRIO-1) |
661 | prio = MAX_PRIO-1; | 662 | prio = MAX_PRIO-1; |
662 | return prio; | 663 | return prio; |
663 | } | 664 | } |
664 | 665 | ||
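
To make the -5..+5 bonus arithmetic concrete, here is a user-space mirror of the non-RT path of effective_prio(). The constants are assumed values from the 2.6-era <linux/sched.h> and are not part of this diff; in that scheme nice 0 corresponds to static_prio 120.

    #include <stdio.h>

    /* Assumed constants, used only to make the arithmetic concrete. */
    #define MAX_RT_PRIO     100
    #define MAX_PRIO        140
    #define MAX_BONUS       10

    /* Mirror of effective_prio() for a non-RT task: bonus ranges -5..+5. */
    static int demo_effective_prio(int static_prio, int current_bonus)
    {
            int prio = static_prio - (current_bonus - MAX_BONUS / 2);

            if (prio < MAX_RT_PRIO)
                    prio = MAX_RT_PRIO;
            if (prio > MAX_PRIO - 1)
                    prio = MAX_PRIO - 1;
            return prio;
    }

    int main(void)
    {
            printf("%d\n", demo_effective_prio(120, 10)); /* fully interactive: 115 */
            printf("%d\n", demo_effective_prio(120, 0));  /* cpu hog: 125 */
            printf("%d\n", demo_effective_prio(120, 5));  /* neutral: 120 */
            return 0;
    }
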
665 | /* | 666 | /* |
666 | * __activate_task - move a task to the runqueue. | 667 | * __activate_task - move a task to the runqueue. |
667 | */ | 668 | */ |
668 | static inline void __activate_task(task_t *p, runqueue_t *rq) | 669 | static inline void __activate_task(task_t *p, runqueue_t *rq) |
669 | { | 670 | { |
670 | enqueue_task(p, rq->active); | 671 | enqueue_task(p, rq->active); |
671 | rq->nr_running++; | 672 | rq->nr_running++; |
672 | } | 673 | } |
673 | 674 | ||
674 | /* | 675 | /* |
675 | * __activate_idle_task - move idle task to the _front_ of runqueue. | 676 | * __activate_idle_task - move idle task to the _front_ of runqueue. |
676 | */ | 677 | */ |
677 | static inline void __activate_idle_task(task_t *p, runqueue_t *rq) | 678 | static inline void __activate_idle_task(task_t *p, runqueue_t *rq) |
678 | { | 679 | { |
679 | enqueue_task_head(p, rq->active); | 680 | enqueue_task_head(p, rq->active); |
680 | rq->nr_running++; | 681 | rq->nr_running++; |
681 | } | 682 | } |
682 | 683 | ||
683 | static int recalc_task_prio(task_t *p, unsigned long long now) | 684 | static int recalc_task_prio(task_t *p, unsigned long long now) |
684 | { | 685 | { |
685 | /* Caller must always ensure 'now >= p->timestamp' */ | 686 | /* Caller must always ensure 'now >= p->timestamp' */ |
686 | unsigned long long __sleep_time = now - p->timestamp; | 687 | unsigned long long __sleep_time = now - p->timestamp; |
687 | unsigned long sleep_time; | 688 | unsigned long sleep_time; |
688 | 689 | ||
689 | if (unlikely(p->policy == SCHED_BATCH)) | 690 | if (unlikely(p->policy == SCHED_BATCH)) |
690 | sleep_time = 0; | 691 | sleep_time = 0; |
691 | else { | 692 | else { |
692 | if (__sleep_time > NS_MAX_SLEEP_AVG) | 693 | if (__sleep_time > NS_MAX_SLEEP_AVG) |
693 | sleep_time = NS_MAX_SLEEP_AVG; | 694 | sleep_time = NS_MAX_SLEEP_AVG; |
694 | else | 695 | else |
695 | sleep_time = (unsigned long)__sleep_time; | 696 | sleep_time = (unsigned long)__sleep_time; |
696 | } | 697 | } |
697 | 698 | ||
698 | if (likely(sleep_time > 0)) { | 699 | if (likely(sleep_time > 0)) { |
699 | /* | 700 | /* |
700 | * User tasks that sleep a long time are categorised as | 701 | * User tasks that sleep a long time are categorised as |
701 | * idle and are given just enough interactive status to stay active and | 702 | * idle and are given just enough interactive status to stay active and |
702 | * prevent them from suddenly becoming cpu hogs and starving | 703 | * prevent them from suddenly becoming cpu hogs and starving |
703 | * other processes. | 704 | * other processes. |
704 | */ | 705 | */ |
705 | if (p->mm && p->activated != -1 && | 706 | if (p->mm && p->activated != -1 && |
706 | sleep_time > INTERACTIVE_SLEEP(p)) { | 707 | sleep_time > INTERACTIVE_SLEEP(p)) { |
707 | p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - | 708 | p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - |
708 | DEF_TIMESLICE); | 709 | DEF_TIMESLICE); |
709 | } else { | 710 | } else { |
710 | /* | 711 | /* |
711 | * Tasks waking from uninterruptible sleep are | 712 | * Tasks waking from uninterruptible sleep are |
712 | * limited in their sleep_avg rise as they | 713 | * limited in their sleep_avg rise as they |
713 | * are likely to be waiting on I/O | 714 | * are likely to be waiting on I/O |
714 | */ | 715 | */ |
715 | if (p->activated == -1 && p->mm) { | 716 | if (p->activated == -1 && p->mm) { |
716 | if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) | 717 | if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) |
717 | sleep_time = 0; | 718 | sleep_time = 0; |
718 | else if (p->sleep_avg + sleep_time >= | 719 | else if (p->sleep_avg + sleep_time >= |
719 | INTERACTIVE_SLEEP(p)) { | 720 | INTERACTIVE_SLEEP(p)) { |
720 | p->sleep_avg = INTERACTIVE_SLEEP(p); | 721 | p->sleep_avg = INTERACTIVE_SLEEP(p); |
721 | sleep_time = 0; | 722 | sleep_time = 0; |
722 | } | 723 | } |
723 | } | 724 | } |
724 | 725 | ||
725 | /* | 726 | /* |
726 | * This code gives a bonus to interactive tasks. | 727 | * This code gives a bonus to interactive tasks. |
727 | * | 728 | * |
728 | * The boost works by updating the 'average sleep time' | 729 | * The boost works by updating the 'average sleep time' |
729 | * value here, based on ->timestamp. The more time a | 730 | * value here, based on ->timestamp. The more time a |
730 | * task spends sleeping, the higher the average gets - | 731 | * task spends sleeping, the higher the average gets - |
731 | * and the higher the priority boost gets as well. | 732 | * and the higher the priority boost gets as well. |
732 | */ | 733 | */ |
733 | p->sleep_avg += sleep_time; | 734 | p->sleep_avg += sleep_time; |
734 | 735 | ||
735 | if (p->sleep_avg > NS_MAX_SLEEP_AVG) | 736 | if (p->sleep_avg > NS_MAX_SLEEP_AVG) |
736 | p->sleep_avg = NS_MAX_SLEEP_AVG; | 737 | p->sleep_avg = NS_MAX_SLEEP_AVG; |
737 | } | 738 | } |
738 | } | 739 | } |
739 | 740 | ||
740 | return effective_prio(p); | 741 | return effective_prio(p); |
741 | } | 742 | } |
742 | 743 | ||
743 | /* | 744 | /* |
744 | * activate_task - move a task to the runqueue and do priority recalculation | 745 | * activate_task - move a task to the runqueue and do priority recalculation |
745 | * | 746 | * |
746 | * Update all the scheduling statistics stuff. (sleep average | 747 | * Update all the scheduling statistics stuff. (sleep average |
747 | * calculation, priority modifiers, etc.) | 748 | * calculation, priority modifiers, etc.) |
748 | */ | 749 | */ |
749 | static void activate_task(task_t *p, runqueue_t *rq, int local) | 750 | static void activate_task(task_t *p, runqueue_t *rq, int local) |
750 | { | 751 | { |
751 | unsigned long long now; | 752 | unsigned long long now; |
752 | 753 | ||
753 | now = sched_clock(); | 754 | now = sched_clock(); |
754 | #ifdef CONFIG_SMP | 755 | #ifdef CONFIG_SMP |
755 | if (!local) { | 756 | if (!local) { |
756 | /* Compensate for drifting sched_clock */ | 757 | /* Compensate for drifting sched_clock */ |
757 | runqueue_t *this_rq = this_rq(); | 758 | runqueue_t *this_rq = this_rq(); |
758 | now = (now - this_rq->timestamp_last_tick) | 759 | now = (now - this_rq->timestamp_last_tick) |
759 | + rq->timestamp_last_tick; | 760 | + rq->timestamp_last_tick; |
760 | } | 761 | } |
761 | #endif | 762 | #endif |
762 | 763 | ||
763 | if (!rt_task(p)) | 764 | if (!rt_task(p)) |
764 | p->prio = recalc_task_prio(p, now); | 765 | p->prio = recalc_task_prio(p, now); |
765 | 766 | ||
766 | /* | 767 | /* |
767 | * This checks to make sure it's not an uninterruptible task | 768 | * This checks to make sure it's not an uninterruptible task |
768 | * that is now waking up. | 769 | * that is now waking up. |
769 | */ | 770 | */ |
770 | if (!p->activated) { | 771 | if (!p->activated) { |
771 | /* | 772 | /* |
772 | * Tasks which were woken up by interrupts (ie. hw events) | 773 | * Tasks which were woken up by interrupts (ie. hw events) |
773 | * are most likely of interactive nature. So we give them | 774 | * are most likely of interactive nature. So we give them |
774 | * the credit of extending their sleep time to the period | 775 | * the credit of extending their sleep time to the period |
775 | * of time they spend on the runqueue, waiting for execution | 776 | * of time they spend on the runqueue, waiting for execution |
776 | * on a CPU, first time around: | 777 | * on a CPU, first time around: |
777 | */ | 778 | */ |
778 | if (in_interrupt()) | 779 | if (in_interrupt()) |
779 | p->activated = 2; | 780 | p->activated = 2; |
780 | else { | 781 | else { |
781 | /* | 782 | /* |
782 | * Normal first-time wakeups get a credit too for | 783 | * Normal first-time wakeups get a credit too for |
783 | * on-runqueue time, but it will be weighted down: | 784 | * on-runqueue time, but it will be weighted down: |
784 | */ | 785 | */ |
785 | p->activated = 1; | 786 | p->activated = 1; |
786 | } | 787 | } |
787 | } | 788 | } |
788 | p->timestamp = now; | 789 | p->timestamp = now; |
789 | 790 | ||
790 | __activate_task(p, rq); | 791 | __activate_task(p, rq); |
791 | } | 792 | } |
792 | 793 | ||
793 | /* | 794 | /* |
794 | * deactivate_task - remove a task from the runqueue. | 795 | * deactivate_task - remove a task from the runqueue. |
795 | */ | 796 | */ |
796 | static void deactivate_task(struct task_struct *p, runqueue_t *rq) | 797 | static void deactivate_task(struct task_struct *p, runqueue_t *rq) |
797 | { | 798 | { |
798 | rq->nr_running--; | 799 | rq->nr_running--; |
799 | dequeue_task(p, p->array); | 800 | dequeue_task(p, p->array); |
800 | p->array = NULL; | 801 | p->array = NULL; |
801 | } | 802 | } |
802 | 803 | ||
803 | /* | 804 | /* |
804 | * resched_task - mark a task 'to be rescheduled now'. | 805 | * resched_task - mark a task 'to be rescheduled now'. |
805 | * | 806 | * |
806 | * On UP this means the setting of the need_resched flag, on SMP it | 807 | * On UP this means the setting of the need_resched flag, on SMP it |
807 | * might also involve a cross-CPU call to trigger the scheduler on | 808 | * might also involve a cross-CPU call to trigger the scheduler on |
808 | * the target CPU. | 809 | * the target CPU. |
809 | */ | 810 | */ |
810 | #ifdef CONFIG_SMP | 811 | #ifdef CONFIG_SMP |
811 | static void resched_task(task_t *p) | 812 | static void resched_task(task_t *p) |
812 | { | 813 | { |
813 | int cpu; | 814 | int cpu; |
814 | 815 | ||
815 | assert_spin_locked(&task_rq(p)->lock); | 816 | assert_spin_locked(&task_rq(p)->lock); |
816 | 817 | ||
817 | if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) | 818 | if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) |
818 | return; | 819 | return; |
819 | 820 | ||
820 | set_tsk_thread_flag(p, TIF_NEED_RESCHED); | 821 | set_tsk_thread_flag(p, TIF_NEED_RESCHED); |
821 | 822 | ||
822 | cpu = task_cpu(p); | 823 | cpu = task_cpu(p); |
823 | if (cpu == smp_processor_id()) | 824 | if (cpu == smp_processor_id()) |
824 | return; | 825 | return; |
825 | 826 | ||
826 | /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */ | 827 | /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */ |
827 | smp_mb(); | 828 | smp_mb(); |
828 | if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG)) | 829 | if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG)) |
829 | smp_send_reschedule(cpu); | 830 | smp_send_reschedule(cpu); |
830 | } | 831 | } |
831 | #else | 832 | #else |
832 | static inline void resched_task(task_t *p) | 833 | static inline void resched_task(task_t *p) |
833 | { | 834 | { |
834 | assert_spin_locked(&task_rq(p)->lock); | 835 | assert_spin_locked(&task_rq(p)->lock); |
835 | set_tsk_need_resched(p); | 836 | set_tsk_need_resched(p); |
836 | } | 837 | } |
837 | #endif | 838 | #endif |
838 | 839 | ||
839 | /** | 840 | /** |
840 | * task_curr - is this task currently executing on a CPU? | 841 | * task_curr - is this task currently executing on a CPU? |
841 | * @p: the task in question. | 842 | * @p: the task in question. |
842 | */ | 843 | */ |
843 | inline int task_curr(const task_t *p) | 844 | inline int task_curr(const task_t *p) |
844 | { | 845 | { |
845 | return cpu_curr(task_cpu(p)) == p; | 846 | return cpu_curr(task_cpu(p)) == p; |
846 | } | 847 | } |
847 | 848 | ||
848 | #ifdef CONFIG_SMP | 849 | #ifdef CONFIG_SMP |
849 | typedef struct { | 850 | typedef struct { |
850 | struct list_head list; | 851 | struct list_head list; |
851 | 852 | ||
852 | task_t *task; | 853 | task_t *task; |
853 | int dest_cpu; | 854 | int dest_cpu; |
854 | 855 | ||
855 | struct completion done; | 856 | struct completion done; |
856 | } migration_req_t; | 857 | } migration_req_t; |
857 | 858 | ||
858 | /* | 859 | /* |
859 | * The task's runqueue lock must be held. | 860 | * The task's runqueue lock must be held. |
860 | * Returns true if you have to wait for migration thread. | 861 | * Returns true if you have to wait for migration thread. |
861 | */ | 862 | */ |
862 | static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) | 863 | static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) |
863 | { | 864 | { |
864 | runqueue_t *rq = task_rq(p); | 865 | runqueue_t *rq = task_rq(p); |
865 | 866 | ||
866 | /* | 867 | /* |
867 | * If the task is not on a runqueue (and not running), then | 868 | * If the task is not on a runqueue (and not running), then |
868 | * it is sufficient to simply update the task's cpu field. | 869 | * it is sufficient to simply update the task's cpu field. |
869 | */ | 870 | */ |
870 | if (!p->array && !task_running(rq, p)) { | 871 | if (!p->array && !task_running(rq, p)) { |
871 | set_task_cpu(p, dest_cpu); | 872 | set_task_cpu(p, dest_cpu); |
872 | return 0; | 873 | return 0; |
873 | } | 874 | } |
874 | 875 | ||
875 | init_completion(&req->done); | 876 | init_completion(&req->done); |
876 | req->task = p; | 877 | req->task = p; |
877 | req->dest_cpu = dest_cpu; | 878 | req->dest_cpu = dest_cpu; |
878 | list_add(&req->list, &rq->migration_queue); | 879 | list_add(&req->list, &rq->migration_queue); |
879 | return 1; | 880 | return 1; |
880 | } | 881 | } |
881 | 882 | ||
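
Per the contract stated above ("Returns true if you have to wait for migration thread"), a caller that wants the move to actually happen has to drop the runqueue lock, wake the per-runqueue migration thread and wait on the completion embedded in the request. A hypothetical caller sketch, assuming this file's context (the function name is invented):

    /*
     * Hypothetical caller sketch: honour the contract above -- if
     * migrate_task() returns 1, drop the runqueue lock, kick the
     * migration thread and wait for the request to complete.
     */
    static void demo_move_task(task_t *p, int dest_cpu)
    {
            migration_req_t req;
            unsigned long flags;
            runqueue_t *rq;

            rq = task_rq_lock(p, &flags);
            if (migrate_task(p, dest_cpu, &req)) {
                    task_t *mt = rq->migration_thread;

                    task_rq_unlock(rq, &flags);
                    wake_up_process(mt);
                    wait_for_completion(&req.done);
                    return;
            }
            task_rq_unlock(rq, &flags);
    }
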
882 | /* | 883 | /* |
883 | * wait_task_inactive - wait for a thread to unschedule. | 884 | * wait_task_inactive - wait for a thread to unschedule. |
884 | * | 885 | * |
885 | * The caller must ensure that the task *will* unschedule sometime soon, | 886 | * The caller must ensure that the task *will* unschedule sometime soon, |
886 | * else this function might spin for a *long* time. This function can't | 887 | * else this function might spin for a *long* time. This function can't |
887 | * be called with interrupts off, or it may introduce deadlock with | 888 | * be called with interrupts off, or it may introduce deadlock with |
888 | * smp_call_function() if an IPI is sent by the same process we are | 889 | * smp_call_function() if an IPI is sent by the same process we are |
889 | * waiting to become inactive. | 890 | * waiting to become inactive. |
890 | */ | 891 | */ |
891 | void wait_task_inactive(task_t *p) | 892 | void wait_task_inactive(task_t *p) |
892 | { | 893 | { |
893 | unsigned long flags; | 894 | unsigned long flags; |
894 | runqueue_t *rq; | 895 | runqueue_t *rq; |
895 | int preempted; | 896 | int preempted; |
896 | 897 | ||
897 | repeat: | 898 | repeat: |
898 | rq = task_rq_lock(p, &flags); | 899 | rq = task_rq_lock(p, &flags); |
899 | /* Must be off runqueue entirely, not preempted. */ | 900 | /* Must be off runqueue entirely, not preempted. */ |
900 | if (unlikely(p->array || task_running(rq, p))) { | 901 | if (unlikely(p->array || task_running(rq, p))) { |
901 | /* If it's preempted, we yield. It could be a while. */ | 902 | /* If it's preempted, we yield. It could be a while. */ |
902 | preempted = !task_running(rq, p); | 903 | preempted = !task_running(rq, p); |
903 | task_rq_unlock(rq, &flags); | 904 | task_rq_unlock(rq, &flags); |
904 | cpu_relax(); | 905 | cpu_relax(); |
905 | if (preempted) | 906 | if (preempted) |
906 | yield(); | 907 | yield(); |
907 | goto repeat; | 908 | goto repeat; |
908 | } | 909 | } |
909 | task_rq_unlock(rq, &flags); | 910 | task_rq_unlock(rq, &flags); |
910 | } | 911 | } |
911 | 912 | ||
912 | /*** | 913 | /*** |
913 | * kick_process - kick a running thread to enter/exit the kernel | 914 | * kick_process - kick a running thread to enter/exit the kernel |
914 | * @p: the to-be-kicked thread | 915 | * @p: the to-be-kicked thread |
915 | * | 916 | * |
916 | * Cause a process which is running on another CPU to enter | 917 | * Cause a process which is running on another CPU to enter |
917 | * kernel-mode, without any delay. (to get signals handled.) | 918 | * kernel-mode, without any delay. (to get signals handled.) |
918 | * | 919 | * |
919 | * NOTE: this function doesn't have to take the runqueue lock, | 920 | * NOTE: this function doesn't have to take the runqueue lock, |
920 | * because all it wants to ensure is that the remote task enters | 921 | * because all it wants to ensure is that the remote task enters |
921 | * the kernel. If the IPI races and the task has been migrated | 922 | * the kernel. If the IPI races and the task has been migrated |
922 | * to another CPU then no harm is done and the purpose has been | 923 | * to another CPU then no harm is done and the purpose has been |
923 | * achieved as well. | 924 | * achieved as well. |
924 | */ | 925 | */ |
925 | void kick_process(task_t *p) | 926 | void kick_process(task_t *p) |
926 | { | 927 | { |
927 | int cpu; | 928 | int cpu; |
928 | 929 | ||
929 | preempt_disable(); | 930 | preempt_disable(); |
930 | cpu = task_cpu(p); | 931 | cpu = task_cpu(p); |
931 | if ((cpu != smp_processor_id()) && task_curr(p)) | 932 | if ((cpu != smp_processor_id()) && task_curr(p)) |
932 | smp_send_reschedule(cpu); | 933 | smp_send_reschedule(cpu); |
933 | preempt_enable(); | 934 | preempt_enable(); |
934 | } | 935 | } |
935 | 936 | ||
936 | /* | 937 | /* |
937 | * Return a low guess at the load of a migration-source cpu. | 938 | * Return a low guess at the load of a migration-source cpu. |
938 | * | 939 | * |
939 | * We want to under-estimate the load of migration sources, to | 940 | * We want to under-estimate the load of migration sources, to |
940 | * balance conservatively. | 941 | * balance conservatively. |
941 | */ | 942 | */ |
942 | static inline unsigned long source_load(int cpu, int type) | 943 | static inline unsigned long source_load(int cpu, int type) |
943 | { | 944 | { |
944 | runqueue_t *rq = cpu_rq(cpu); | 945 | runqueue_t *rq = cpu_rq(cpu); |
945 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; | 946 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; |
946 | if (type == 0) | 947 | if (type == 0) |
947 | return load_now; | 948 | return load_now; |
948 | 949 | ||
949 | return min(rq->cpu_load[type-1], load_now); | 950 | return min(rq->cpu_load[type-1], load_now); |
950 | } | 951 | } |
951 | 952 | ||
952 | /* | 953 | /* |
953 | * Return a high guess at the load of a migration-target cpu | 954 | * Return a high guess at the load of a migration-target cpu |
954 | */ | 955 | */ |
955 | static inline unsigned long target_load(int cpu, int type) | 956 | static inline unsigned long target_load(int cpu, int type) |
956 | { | 957 | { |
957 | runqueue_t *rq = cpu_rq(cpu); | 958 | runqueue_t *rq = cpu_rq(cpu); |
958 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; | 959 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; |
959 | if (type == 0) | 960 | if (type == 0) |
960 | return load_now; | 961 | return load_now; |
961 | 962 | ||
962 | return max(rq->cpu_load[type-1], load_now); | 963 | return max(rq->cpu_load[type-1], load_now); |
963 | } | 964 | } |
964 | 965 | ||
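
The min/max asymmetry above is easiest to see with numbers. A user-space sketch of the type != 0 case (SCHED_LOAD_SCALE is assumed to be 128 here; it is defined elsewhere):

    #include <stdio.h>

    #define DEMO_LOAD_SCALE 128     /* stand-in for SCHED_LOAD_SCALE */

    static unsigned long demo_source_load(unsigned long cpu_load,
                                          unsigned long nr_running)
    {
            unsigned long now = nr_running * DEMO_LOAD_SCALE;
            return cpu_load < now ? cpu_load : now; /* min: under-estimate */
    }

    static unsigned long demo_target_load(unsigned long cpu_load,
                                          unsigned long nr_running)
    {
            unsigned long now = nr_running * DEMO_LOAD_SCALE;
            return cpu_load > now ? cpu_load : now; /* max: over-estimate */
    }

    int main(void)
    {
            /* decaying average says 300, instantaneous load is 2*128 = 256 */
            printf("source=%lu target=%lu\n",
                   demo_source_load(300, 2), demo_target_load(300, 2));
            /* prints source=256 target=300 */
            return 0;
    }
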
965 | /* | 966 | /* |
966 | * find_idlest_group finds and returns the least busy CPU group within the | 967 | * find_idlest_group finds and returns the least busy CPU group within the |
967 | * domain. | 968 | * domain. |
968 | */ | 969 | */ |
969 | static struct sched_group * | 970 | static struct sched_group * |
970 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | 971 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) |
971 | { | 972 | { |
972 | struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; | 973 | struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; |
973 | unsigned long min_load = ULONG_MAX, this_load = 0; | 974 | unsigned long min_load = ULONG_MAX, this_load = 0; |
974 | int load_idx = sd->forkexec_idx; | 975 | int load_idx = sd->forkexec_idx; |
975 | int imbalance = 100 + (sd->imbalance_pct-100)/2; | 976 | int imbalance = 100 + (sd->imbalance_pct-100)/2; |
976 | 977 | ||
977 | do { | 978 | do { |
978 | unsigned long load, avg_load; | 979 | unsigned long load, avg_load; |
979 | int local_group; | 980 | int local_group; |
980 | int i; | 981 | int i; |
981 | 982 | ||
982 | /* Skip over this group if it has no CPUs allowed */ | 983 | /* Skip over this group if it has no CPUs allowed */ |
983 | if (!cpus_intersects(group->cpumask, p->cpus_allowed)) | 984 | if (!cpus_intersects(group->cpumask, p->cpus_allowed)) |
984 | goto nextgroup; | 985 | goto nextgroup; |
985 | 986 | ||
986 | local_group = cpu_isset(this_cpu, group->cpumask); | 987 | local_group = cpu_isset(this_cpu, group->cpumask); |
987 | 988 | ||
988 | /* Tally up the load of all CPUs in the group */ | 989 | /* Tally up the load of all CPUs in the group */ |
989 | avg_load = 0; | 990 | avg_load = 0; |
990 | 991 | ||
991 | for_each_cpu_mask(i, group->cpumask) { | 992 | for_each_cpu_mask(i, group->cpumask) { |
992 | /* Bias balancing toward cpus of our domain */ | 993 | /* Bias balancing toward cpus of our domain */ |
993 | if (local_group) | 994 | if (local_group) |
994 | load = source_load(i, load_idx); | 995 | load = source_load(i, load_idx); |
995 | else | 996 | else |
996 | load = target_load(i, load_idx); | 997 | load = target_load(i, load_idx); |
997 | 998 | ||
998 | avg_load += load; | 999 | avg_load += load; |
999 | } | 1000 | } |
1000 | 1001 | ||
1001 | /* Adjust by relative CPU power of the group */ | 1002 | /* Adjust by relative CPU power of the group */ |
1002 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; | 1003 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; |
1003 | 1004 | ||
1004 | if (local_group) { | 1005 | if (local_group) { |
1005 | this_load = avg_load; | 1006 | this_load = avg_load; |
1006 | this = group; | 1007 | this = group; |
1007 | } else if (avg_load < min_load) { | 1008 | } else if (avg_load < min_load) { |
1008 | min_load = avg_load; | 1009 | min_load = avg_load; |
1009 | idlest = group; | 1010 | idlest = group; |
1010 | } | 1011 | } |
1011 | nextgroup: | 1012 | nextgroup: |
1012 | group = group->next; | 1013 | group = group->next; |
1013 | } while (group != sd->groups); | 1014 | } while (group != sd->groups); |
1014 | 1015 | ||
1015 | if (!idlest || 100*this_load < imbalance*min_load) | 1016 | if (!idlest || 100*this_load < imbalance*min_load) |
1016 | return NULL; | 1017 | return NULL; |
1017 | return idlest; | 1018 | return idlest; |
1018 | } | 1019 | } |
1019 | 1020 | ||
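
The final test above, 100*this_load < imbalance*min_load, means the local group is only abandoned when a remote group is lighter by more than the imbalance margin. A small user-space worked example (imbalance_pct = 125 is an assumed, typical value; the loads are made up):

    #include <stdio.h>

    /* Worked example of the threshold in find_idlest_group() above. */
    static const char *demo_choice(unsigned long this_load, unsigned long min_load)
    {
            unsigned long imbalance = 100 + (125 - 100) / 2;        /* 112 */

            /* mirrors: if (100*this_load < imbalance*min_load) return NULL; */
            if (100 * this_load < imbalance * min_load)
                    return "stay in local group";
            return "move to idlest group";
    }

    int main(void)
    {
            /* remote group only ~10% lighter: not worth it */
            printf("this=400 min=360: %s\n", demo_choice(400, 360));
            /* remote group ~14% lighter: beats the ~12% margin */
            printf("this=400 min=350: %s\n", demo_choice(400, 350));
            return 0;
    }
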
1020 | /* | 1021 | /* |
1021 | * find_idlest_cpu - find the idlest runqueue among the cpus in group. | 1022 | * find_idlest_cpu - find the idlest runqueue among the cpus in group. |
1022 | */ | 1023 | */ |
1023 | static int | 1024 | static int |
1024 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | 1025 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) |
1025 | { | 1026 | { |
1026 | cpumask_t tmp; | 1027 | cpumask_t tmp; |
1027 | unsigned long load, min_load = ULONG_MAX; | 1028 | unsigned long load, min_load = ULONG_MAX; |
1028 | int idlest = -1; | 1029 | int idlest = -1; |
1029 | int i; | 1030 | int i; |
1030 | 1031 | ||
1031 | /* Traverse only the allowed CPUs */ | 1032 | /* Traverse only the allowed CPUs */ |
1032 | cpus_and(tmp, group->cpumask, p->cpus_allowed); | 1033 | cpus_and(tmp, group->cpumask, p->cpus_allowed); |
1033 | 1034 | ||
1034 | for_each_cpu_mask(i, tmp) { | 1035 | for_each_cpu_mask(i, tmp) { |
1035 | load = source_load(i, 0); | 1036 | load = source_load(i, 0); |
1036 | 1037 | ||
1037 | if (load < min_load || (load == min_load && i == this_cpu)) { | 1038 | if (load < min_load || (load == min_load && i == this_cpu)) { |
1038 | min_load = load; | 1039 | min_load = load; |
1039 | idlest = i; | 1040 | idlest = i; |
1040 | } | 1041 | } |
1041 | } | 1042 | } |
1042 | 1043 | ||
1043 | return idlest; | 1044 | return idlest; |
1044 | } | 1045 | } |
1045 | 1046 | ||
1046 | /* | 1047 | /* |
1047 | * sched_balance_self: balance the current task (running on cpu) in domains | 1048 | * sched_balance_self: balance the current task (running on cpu) in domains |
1048 | * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and | 1049 | * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and |
1049 | * SD_BALANCE_EXEC. | 1050 | * SD_BALANCE_EXEC. |
1050 | * | 1051 | * |
1051 | * Balance, ie. select the least loaded group. | 1052 | * Balance, ie. select the least loaded group. |
1052 | * | 1053 | * |
1053 | * Returns the target CPU number, or the same CPU if no balancing is needed. | 1054 | * Returns the target CPU number, or the same CPU if no balancing is needed. |
1054 | * | 1055 | * |
1055 | * preempt must be disabled. | 1056 | * preempt must be disabled. |
1056 | */ | 1057 | */ |
1057 | static int sched_balance_self(int cpu, int flag) | 1058 | static int sched_balance_self(int cpu, int flag) |
1058 | { | 1059 | { |
1059 | struct task_struct *t = current; | 1060 | struct task_struct *t = current; |
1060 | struct sched_domain *tmp, *sd = NULL; | 1061 | struct sched_domain *tmp, *sd = NULL; |
1061 | 1062 | ||
1062 | for_each_domain(cpu, tmp) | 1063 | for_each_domain(cpu, tmp) |
1063 | if (tmp->flags & flag) | 1064 | if (tmp->flags & flag) |
1064 | sd = tmp; | 1065 | sd = tmp; |
1065 | 1066 | ||
1066 | while (sd) { | 1067 | while (sd) { |
1067 | cpumask_t span; | 1068 | cpumask_t span; |
1068 | struct sched_group *group; | 1069 | struct sched_group *group; |
1069 | int new_cpu; | 1070 | int new_cpu; |
1070 | int weight; | 1071 | int weight; |
1071 | 1072 | ||
1072 | span = sd->span; | 1073 | span = sd->span; |
1073 | group = find_idlest_group(sd, t, cpu); | 1074 | group = find_idlest_group(sd, t, cpu); |
1074 | if (!group) | 1075 | if (!group) |
1075 | goto nextlevel; | 1076 | goto nextlevel; |
1076 | 1077 | ||
1077 | new_cpu = find_idlest_cpu(group, t, cpu); | 1078 | new_cpu = find_idlest_cpu(group, t, cpu); |
1078 | if (new_cpu == -1 || new_cpu == cpu) | 1079 | if (new_cpu == -1 || new_cpu == cpu) |
1079 | goto nextlevel; | 1080 | goto nextlevel; |
1080 | 1081 | ||
1081 | /* Now try balancing at a lower domain level */ | 1082 | /* Now try balancing at a lower domain level */ |
1082 | cpu = new_cpu; | 1083 | cpu = new_cpu; |
1083 | nextlevel: | 1084 | nextlevel: |
1084 | sd = NULL; | 1085 | sd = NULL; |
1085 | weight = cpus_weight(span); | 1086 | weight = cpus_weight(span); |
1086 | for_each_domain(cpu, tmp) { | 1087 | for_each_domain(cpu, tmp) { |
1087 | if (weight <= cpus_weight(tmp->span)) | 1088 | if (weight <= cpus_weight(tmp->span)) |
1088 | break; | 1089 | break; |
1089 | if (tmp->flags & flag) | 1090 | if (tmp->flags & flag) |
1090 | sd = tmp; | 1091 | sd = tmp; |
1091 | } | 1092 | } |
1092 | /* while loop will break here if sd == NULL */ | 1093 | /* while loop will break here if sd == NULL */ |
1093 | } | 1094 | } |
1094 | 1095 | ||
1095 | return cpu; | 1096 | return cpu; |
1096 | } | 1097 | } |
1097 | 1098 | ||
1098 | #endif /* CONFIG_SMP */ | 1099 | #endif /* CONFIG_SMP */ |
1099 | 1100 | ||
1100 | /* | 1101 | /* |
1101 | * wake_idle() will wake a task on an idle cpu if task->cpu is | 1102 | * wake_idle() will wake a task on an idle cpu if task->cpu is |
1102 | * not idle and an idle cpu is available. The span of cpus to | 1103 | * not idle and an idle cpu is available. The span of cpus to |
1103 | * search starts with the closest cpus and works outward as needed, | 1104 | * search starts with the closest cpus and works outward as needed, |
1104 | * so we always favor a closer, idle cpu. | 1105 | * so we always favor a closer, idle cpu. |
1105 | * | 1106 | * |
1106 | * Returns the CPU we should wake onto. | 1107 | * Returns the CPU we should wake onto. |
1107 | */ | 1108 | */ |
1108 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) | 1109 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) |
1109 | static int wake_idle(int cpu, task_t *p) | 1110 | static int wake_idle(int cpu, task_t *p) |
1110 | { | 1111 | { |
1111 | cpumask_t tmp; | 1112 | cpumask_t tmp; |
1112 | struct sched_domain *sd; | 1113 | struct sched_domain *sd; |
1113 | int i; | 1114 | int i; |
1114 | 1115 | ||
1115 | if (idle_cpu(cpu)) | 1116 | if (idle_cpu(cpu)) |
1116 | return cpu; | 1117 | return cpu; |
1117 | 1118 | ||
1118 | for_each_domain(cpu, sd) { | 1119 | for_each_domain(cpu, sd) { |
1119 | if (sd->flags & SD_WAKE_IDLE) { | 1120 | if (sd->flags & SD_WAKE_IDLE) { |
1120 | cpus_and(tmp, sd->span, p->cpus_allowed); | 1121 | cpus_and(tmp, sd->span, p->cpus_allowed); |
1121 | for_each_cpu_mask(i, tmp) { | 1122 | for_each_cpu_mask(i, tmp) { |
1122 | if (idle_cpu(i)) | 1123 | if (idle_cpu(i)) |
1123 | return i; | 1124 | return i; |
1124 | } | 1125 | } |
1125 | } | 1126 | } |
1126 | else | 1127 | else |
1127 | break; | 1128 | break; |
1128 | } | 1129 | } |
1129 | return cpu; | 1130 | return cpu; |
1130 | } | 1131 | } |
1131 | #else | 1132 | #else |
1132 | static inline int wake_idle(int cpu, task_t *p) | 1133 | static inline int wake_idle(int cpu, task_t *p) |
1133 | { | 1134 | { |
1134 | return cpu; | 1135 | return cpu; |
1135 | } | 1136 | } |
1136 | #endif | 1137 | #endif |
1137 | 1138 | ||
1138 | /*** | 1139 | /*** |
1139 | * try_to_wake_up - wake up a thread | 1140 | * try_to_wake_up - wake up a thread |
1140 | * @p: the to-be-woken-up thread | 1141 | * @p: the to-be-woken-up thread |
1141 | * @state: the mask of task states that can be woken | 1142 | * @state: the mask of task states that can be woken |
1142 | * @sync: do a synchronous wakeup? | 1143 | * @sync: do a synchronous wakeup? |
1143 | * | 1144 | * |
1144 | * Put it on the run-queue if it's not already there. The "current" | 1145 | * Put it on the run-queue if it's not already there. The "current" |
1145 | * thread is always on the run-queue (except when the actual | 1146 | * thread is always on the run-queue (except when the actual |
1146 | * re-schedule is in progress), and as such you're allowed to do | 1147 | * re-schedule is in progress), and as such you're allowed to do |
1147 | * the simpler "current->state = TASK_RUNNING" to mark yourself | 1148 | * the simpler "current->state = TASK_RUNNING" to mark yourself |
1148 | * runnable without the overhead of this. | 1149 | * runnable without the overhead of this. |
1149 | * | 1150 | * |
1150 | * returns failure only if the task is already active. | 1151 | * returns failure only if the task is already active. |
1151 | */ | 1152 | */ |
1152 | static int try_to_wake_up(task_t *p, unsigned int state, int sync) | 1153 | static int try_to_wake_up(task_t *p, unsigned int state, int sync) |
1153 | { | 1154 | { |
1154 | int cpu, this_cpu, success = 0; | 1155 | int cpu, this_cpu, success = 0; |
1155 | unsigned long flags; | 1156 | unsigned long flags; |
1156 | long old_state; | 1157 | long old_state; |
1157 | runqueue_t *rq; | 1158 | runqueue_t *rq; |
1158 | #ifdef CONFIG_SMP | 1159 | #ifdef CONFIG_SMP |
1159 | unsigned long load, this_load; | 1160 | unsigned long load, this_load; |
1160 | struct sched_domain *sd, *this_sd = NULL; | 1161 | struct sched_domain *sd, *this_sd = NULL; |
1161 | int new_cpu; | 1162 | int new_cpu; |
1162 | #endif | 1163 | #endif |
1163 | 1164 | ||
1164 | rq = task_rq_lock(p, &flags); | 1165 | rq = task_rq_lock(p, &flags); |
1165 | old_state = p->state; | 1166 | old_state = p->state; |
1166 | if (!(old_state & state)) | 1167 | if (!(old_state & state)) |
1167 | goto out; | 1168 | goto out; |
1168 | 1169 | ||
1169 | if (p->array) | 1170 | if (p->array) |
1170 | goto out_running; | 1171 | goto out_running; |
1171 | 1172 | ||
1172 | cpu = task_cpu(p); | 1173 | cpu = task_cpu(p); |
1173 | this_cpu = smp_processor_id(); | 1174 | this_cpu = smp_processor_id(); |
1174 | 1175 | ||
1175 | #ifdef CONFIG_SMP | 1176 | #ifdef CONFIG_SMP |
1176 | if (unlikely(task_running(rq, p))) | 1177 | if (unlikely(task_running(rq, p))) |
1177 | goto out_activate; | 1178 | goto out_activate; |
1178 | 1179 | ||
1179 | new_cpu = cpu; | 1180 | new_cpu = cpu; |
1180 | 1181 | ||
1181 | schedstat_inc(rq, ttwu_cnt); | 1182 | schedstat_inc(rq, ttwu_cnt); |
1182 | if (cpu == this_cpu) { | 1183 | if (cpu == this_cpu) { |
1183 | schedstat_inc(rq, ttwu_local); | 1184 | schedstat_inc(rq, ttwu_local); |
1184 | goto out_set_cpu; | 1185 | goto out_set_cpu; |
1185 | } | 1186 | } |
1186 | 1187 | ||
1187 | for_each_domain(this_cpu, sd) { | 1188 | for_each_domain(this_cpu, sd) { |
1188 | if (cpu_isset(cpu, sd->span)) { | 1189 | if (cpu_isset(cpu, sd->span)) { |
1189 | schedstat_inc(sd, ttwu_wake_remote); | 1190 | schedstat_inc(sd, ttwu_wake_remote); |
1190 | this_sd = sd; | 1191 | this_sd = sd; |
1191 | break; | 1192 | break; |
1192 | } | 1193 | } |
1193 | } | 1194 | } |
1194 | 1195 | ||
1195 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) | 1196 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) |
1196 | goto out_set_cpu; | 1197 | goto out_set_cpu; |
1197 | 1198 | ||
1198 | /* | 1199 | /* |
1199 | * Check for affine wakeup and passive balancing possibilities. | 1200 | * Check for affine wakeup and passive balancing possibilities. |
1200 | */ | 1201 | */ |
1201 | if (this_sd) { | 1202 | if (this_sd) { |
1202 | int idx = this_sd->wake_idx; | 1203 | int idx = this_sd->wake_idx; |
1203 | unsigned int imbalance; | 1204 | unsigned int imbalance; |
1204 | 1205 | ||
1205 | imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; | 1206 | imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; |
1206 | 1207 | ||
1207 | load = source_load(cpu, idx); | 1208 | load = source_load(cpu, idx); |
1208 | this_load = target_load(this_cpu, idx); | 1209 | this_load = target_load(this_cpu, idx); |
1209 | 1210 | ||
1210 | new_cpu = this_cpu; /* Wake to this CPU if we can */ | 1211 | new_cpu = this_cpu; /* Wake to this CPU if we can */ |
1211 | 1212 | ||
1212 | if (this_sd->flags & SD_WAKE_AFFINE) { | 1213 | if (this_sd->flags & SD_WAKE_AFFINE) { |
1213 | unsigned long tl = this_load; | 1214 | unsigned long tl = this_load; |
1214 | /* | 1215 | /* |
1215 | * If sync wakeup then subtract the (maximum possible) | 1216 | * If sync wakeup then subtract the (maximum possible) |
1216 | * effect of the currently running task from the load | 1217 | * effect of the currently running task from the load |
1217 | * of the current CPU: | 1218 | * of the current CPU: |
1218 | */ | 1219 | */ |
1219 | if (sync) | 1220 | if (sync) |
1220 | tl -= SCHED_LOAD_SCALE; | 1221 | tl -= SCHED_LOAD_SCALE; |
1221 | 1222 | ||
1222 | if ((tl <= load && | 1223 | if ((tl <= load && |
1223 | tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || | 1224 | tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || |
1224 | 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { | 1225 | 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { |
1225 | /* | 1226 | /* |
1226 | * This domain has SD_WAKE_AFFINE and | 1227 | * This domain has SD_WAKE_AFFINE and |
1227 | * p is cache cold in this domain, and | 1228 | * p is cache cold in this domain, and |
1228 | * there is no bad imbalance. | 1229 | * there is no bad imbalance. |
1229 | */ | 1230 | */ |
1230 | schedstat_inc(this_sd, ttwu_move_affine); | 1231 | schedstat_inc(this_sd, ttwu_move_affine); |
1231 | goto out_set_cpu; | 1232 | goto out_set_cpu; |
1232 | } | 1233 | } |
1233 | } | 1234 | } |
1234 | 1235 | ||
1235 | /* | 1236 | /* |
1236 | * Start passive balancing when half the imbalance_pct | 1237 | * Start passive balancing when half the imbalance_pct |
1237 | * limit is reached. | 1238 | * limit is reached. |
1238 | */ | 1239 | */ |
1239 | if (this_sd->flags & SD_WAKE_BALANCE) { | 1240 | if (this_sd->flags & SD_WAKE_BALANCE) { |
1240 | if (imbalance*this_load <= 100*load) { | 1241 | if (imbalance*this_load <= 100*load) { |
1241 | schedstat_inc(this_sd, ttwu_move_balance); | 1242 | schedstat_inc(this_sd, ttwu_move_balance); |
1242 | goto out_set_cpu; | 1243 | goto out_set_cpu; |
1243 | } | 1244 | } |
1244 | } | 1245 | } |
1245 | } | 1246 | } |
1246 | 1247 | ||
1247 | new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ | 1248 | new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ |
1248 | out_set_cpu: | 1249 | out_set_cpu: |
1249 | new_cpu = wake_idle(new_cpu, p); | 1250 | new_cpu = wake_idle(new_cpu, p); |
1250 | if (new_cpu != cpu) { | 1251 | if (new_cpu != cpu) { |
1251 | set_task_cpu(p, new_cpu); | 1252 | set_task_cpu(p, new_cpu); |
1252 | task_rq_unlock(rq, &flags); | 1253 | task_rq_unlock(rq, &flags); |
1253 | /* might preempt at this point */ | 1254 | /* might preempt at this point */ |
1254 | rq = task_rq_lock(p, &flags); | 1255 | rq = task_rq_lock(p, &flags); |
1255 | old_state = p->state; | 1256 | old_state = p->state; |
1256 | if (!(old_state & state)) | 1257 | if (!(old_state & state)) |
1257 | goto out; | 1258 | goto out; |
1258 | if (p->array) | 1259 | if (p->array) |
1259 | goto out_running; | 1260 | goto out_running; |
1260 | 1261 | ||
1261 | this_cpu = smp_processor_id(); | 1262 | this_cpu = smp_processor_id(); |
1262 | cpu = task_cpu(p); | 1263 | cpu = task_cpu(p); |
1263 | } | 1264 | } |
1264 | 1265 | ||
1265 | out_activate: | 1266 | out_activate: |
1266 | #endif /* CONFIG_SMP */ | 1267 | #endif /* CONFIG_SMP */ |
1267 | if (old_state == TASK_UNINTERRUPTIBLE) { | 1268 | if (old_state == TASK_UNINTERRUPTIBLE) { |
1268 | rq->nr_uninterruptible--; | 1269 | rq->nr_uninterruptible--; |
1269 | /* | 1270 | /* |
1270 | * Tasks on involuntary sleep don't earn | 1271 | * Tasks on involuntary sleep don't earn |
1271 | * sleep_avg beyond just interactive state. | 1272 | * sleep_avg beyond just interactive state. |
1272 | */ | 1273 | */ |
1273 | p->activated = -1; | 1274 | p->activated = -1; |
1274 | } | 1275 | } |
1275 | 1276 | ||
1276 | /* | 1277 | /* |
1277 | * Tasks that have marked their sleep as noninteractive get | 1278 | * Tasks that have marked their sleep as noninteractive get |
1278 | * woken up without updating their sleep average. (i.e. their | 1279 | * woken up without updating their sleep average. (i.e. their |
1279 | * sleep is handled in a priority-neutral manner, no priority | 1280 | * sleep is handled in a priority-neutral manner, no priority |
1280 | * boost and no penalty.) | 1281 | * boost and no penalty.) |
1281 | */ | 1282 | */ |
1282 | if (old_state & TASK_NONINTERACTIVE) | 1283 | if (old_state & TASK_NONINTERACTIVE) |
1283 | __activate_task(p, rq); | 1284 | __activate_task(p, rq); |
1284 | else | 1285 | else |
1285 | activate_task(p, rq, cpu == this_cpu); | 1286 | activate_task(p, rq, cpu == this_cpu); |
1286 | /* | 1287 | /* |
1287 | * Sync wakeups (i.e. those types of wakeups where the waker | 1288 | * Sync wakeups (i.e. those types of wakeups where the waker |
1288 | * has indicated that it will leave the CPU in short order) | 1289 | * has indicated that it will leave the CPU in short order) |
1289 | * don't trigger a preemption, if the woken up task will run on | 1290 | * don't trigger a preemption, if the woken up task will run on |
1290 | * this cpu. (in this case the 'I will reschedule' promise of | 1291 | * this cpu. (in this case the 'I will reschedule' promise of |
1291 | * the waker guarantees that the freshly woken up task is going | 1292 | * the waker guarantees that the freshly woken up task is going |
1292 | * to be considered on this CPU.) | 1293 | * to be considered on this CPU.) |
1293 | */ | 1294 | */ |
1294 | if (!sync || cpu != this_cpu) { | 1295 | if (!sync || cpu != this_cpu) { |
1295 | if (TASK_PREEMPTS_CURR(p, rq)) | 1296 | if (TASK_PREEMPTS_CURR(p, rq)) |
1296 | resched_task(rq->curr); | 1297 | resched_task(rq->curr); |
1297 | } | 1298 | } |
1298 | success = 1; | 1299 | success = 1; |
1299 | 1300 | ||
1300 | out_running: | 1301 | out_running: |
1301 | p->state = TASK_RUNNING; | 1302 | p->state = TASK_RUNNING; |
1302 | out: | 1303 | out: |
1303 | task_rq_unlock(rq, &flags); | 1304 | task_rq_unlock(rq, &flags); |
1304 | 1305 | ||
1305 | return success; | 1306 | return success; |
1306 | } | 1307 | } |
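A worked example of the affine/passive-balance thresholds inside try_to_wake_up() above: the local `imbalance` is the domain's imbalance_pct halved around 100, so assuming an imbalance_pct of 125 (a typical CPU-domain value of this era; the real figure is per-domain) it evaluates to 100 + (125 - 100)/2 = 112. The SD_WAKE_BALANCE test `imbalance*this_load <= 100*load` then fires only when the waking task's old CPU carries roughly 12% more load than this CPU, which is what the "half the imbalance_pct limit" comment is describing; the SD_WAKE_AFFINE branch uses the same 112 threshold against the sync-adjusted local load.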
1307 | 1308 | ||
1308 | int fastcall wake_up_process(task_t *p) | 1309 | int fastcall wake_up_process(task_t *p) |
1309 | { | 1310 | { |
1310 | return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | | 1311 | return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | |
1311 | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); | 1312 | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); |
1312 | } | 1313 | } |
1313 | 1314 | ||
1314 | EXPORT_SYMBOL(wake_up_process); | 1315 | EXPORT_SYMBOL(wake_up_process); |
1315 | 1316 | ||
1316 | int fastcall wake_up_state(task_t *p, unsigned int state) | 1317 | int fastcall wake_up_state(task_t *p, unsigned int state) |
1317 | { | 1318 | { |
1318 | return try_to_wake_up(p, state, 0); | 1319 | return try_to_wake_up(p, state, 0); |
1319 | } | 1320 | } |
1320 | 1321 | ||
1321 | /* | 1322 | /* |
1322 | * Perform scheduler related setup for a newly forked process p. | 1323 | * Perform scheduler related setup for a newly forked process p. |
1323 | * p is forked by current. | 1324 | * p is forked by current. |
1324 | */ | 1325 | */ |
1325 | void fastcall sched_fork(task_t *p, int clone_flags) | 1326 | void fastcall sched_fork(task_t *p, int clone_flags) |
1326 | { | 1327 | { |
1327 | int cpu = get_cpu(); | 1328 | int cpu = get_cpu(); |
1328 | 1329 | ||
1329 | #ifdef CONFIG_SMP | 1330 | #ifdef CONFIG_SMP |
1330 | cpu = sched_balance_self(cpu, SD_BALANCE_FORK); | 1331 | cpu = sched_balance_self(cpu, SD_BALANCE_FORK); |
1331 | #endif | 1332 | #endif |
1332 | set_task_cpu(p, cpu); | 1333 | set_task_cpu(p, cpu); |
1333 | 1334 | ||
1334 | /* | 1335 | /* |
1335 | * We mark the process as running here, but have not actually | 1336 | * We mark the process as running here, but have not actually |
1336 | * inserted it onto the runqueue yet. This guarantees that | 1337 | * inserted it onto the runqueue yet. This guarantees that |
1337 | * nobody will actually run it, and a signal or other external | 1338 | * nobody will actually run it, and a signal or other external |
1338 | * event cannot wake it up and insert it on the runqueue either. | 1339 | * event cannot wake it up and insert it on the runqueue either. |
1339 | */ | 1340 | */ |
1340 | p->state = TASK_RUNNING; | 1341 | p->state = TASK_RUNNING; |
1341 | INIT_LIST_HEAD(&p->run_list); | 1342 | INIT_LIST_HEAD(&p->run_list); |
1342 | p->array = NULL; | 1343 | p->array = NULL; |
1343 | #ifdef CONFIG_SCHEDSTATS | 1344 | #ifdef CONFIG_SCHEDSTATS |
1344 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 1345 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
1345 | #endif | 1346 | #endif |
1346 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 1347 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) |
1347 | p->oncpu = 0; | 1348 | p->oncpu = 0; |
1348 | #endif | 1349 | #endif |
1349 | #ifdef CONFIG_PREEMPT | 1350 | #ifdef CONFIG_PREEMPT |
1350 | /* Want to start with kernel preemption disabled. */ | 1351 | /* Want to start with kernel preemption disabled. */ |
1351 | task_thread_info(p)->preempt_count = 1; | 1352 | task_thread_info(p)->preempt_count = 1; |
1352 | #endif | 1353 | #endif |
1353 | /* | 1354 | /* |
1354 | * Share the timeslice between parent and child, thus the | 1355 | * Share the timeslice between parent and child, thus the |
1355 | * total amount of pending timeslices in the system doesn't change, | 1356 | * total amount of pending timeslices in the system doesn't change, |
1356 | * resulting in more scheduling fairness. | 1357 | * resulting in more scheduling fairness. |
1357 | */ | 1358 | */ |
1358 | local_irq_disable(); | 1359 | local_irq_disable(); |
1359 | p->time_slice = (current->time_slice + 1) >> 1; | 1360 | p->time_slice = (current->time_slice + 1) >> 1; |
1360 | /* | 1361 | /* |
1361 | * The remainder of the first timeslice might be recovered by | 1362 | * The remainder of the first timeslice might be recovered by |
1362 | * the parent if the child exits early enough. | 1363 | * the parent if the child exits early enough. |
1363 | */ | 1364 | */ |
1364 | p->first_time_slice = 1; | 1365 | p->first_time_slice = 1; |
1365 | current->time_slice >>= 1; | 1366 | current->time_slice >>= 1; |
1366 | p->timestamp = sched_clock(); | 1367 | p->timestamp = sched_clock(); |
1367 | if (unlikely(!current->time_slice)) { | 1368 | if (unlikely(!current->time_slice)) { |
1368 | /* | 1369 | /* |
1369 | * This case is rare, it happens when the parent has only | 1370 | * This case is rare, it happens when the parent has only |
1370 | * a single jiffy left from its timeslice. Taking the | 1371 | * a single jiffy left from its timeslice. Taking the |
1371 | * runqueue lock is not a problem. | 1372 | * runqueue lock is not a problem. |
1372 | */ | 1373 | */ |
1373 | current->time_slice = 1; | 1374 | current->time_slice = 1; |
1374 | scheduler_tick(); | 1375 | scheduler_tick(); |
1375 | } | 1376 | } |
1376 | local_irq_enable(); | 1377 | local_irq_enable(); |
1377 | put_cpu(); | 1378 | put_cpu(); |
1378 | } | 1379 | } |
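To make the timeslice-sharing arithmetic in sched_fork() concrete: a parent entering fork with time_slice = 7 jiffies hands the child (7 + 1) >> 1 = 4 and keeps 7 >> 1 = 3, so the combined total of 7 pending jiffies is preserved. In the rare case where the parent has a single jiffy left, the child takes it, the parent's slice reaches zero and is topped back up to 1 so that scheduler_tick() can expire and requeue it cleanly (the comment notes that taking the runqueue lock here is not a problem).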
1379 | 1380 | ||
1380 | /* | 1381 | /* |
1381 | * wake_up_new_task - wake up a newly created task for the first time. | 1382 | * wake_up_new_task - wake up a newly created task for the first time. |
1382 | * | 1383 | * |
1383 | * This function will do some initial scheduler statistics housekeeping | 1384 | * This function will do some initial scheduler statistics housekeeping |
1384 | * that must be done for every newly created context, then puts the task | 1385 | * that must be done for every newly created context, then puts the task |
1385 | * on the runqueue and wakes it. | 1386 | * on the runqueue and wakes it. |
1386 | */ | 1387 | */ |
1387 | void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) | 1388 | void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) |
1388 | { | 1389 | { |
1389 | unsigned long flags; | 1390 | unsigned long flags; |
1390 | int this_cpu, cpu; | 1391 | int this_cpu, cpu; |
1391 | runqueue_t *rq, *this_rq; | 1392 | runqueue_t *rq, *this_rq; |
1392 | 1393 | ||
1393 | rq = task_rq_lock(p, &flags); | 1394 | rq = task_rq_lock(p, &flags); |
1394 | BUG_ON(p->state != TASK_RUNNING); | 1395 | BUG_ON(p->state != TASK_RUNNING); |
1395 | this_cpu = smp_processor_id(); | 1396 | this_cpu = smp_processor_id(); |
1396 | cpu = task_cpu(p); | 1397 | cpu = task_cpu(p); |
1397 | 1398 | ||
1398 | /* | 1399 | /* |
1399 | * We decrease the sleep average of forking parents | 1400 | * We decrease the sleep average of forking parents |
1400 | * and children as well, to keep max-interactive tasks | 1401 | * and children as well, to keep max-interactive tasks |
1401 | * from forking tasks that are max-interactive. The parent | 1402 | * from forking tasks that are max-interactive. The parent |
1402 | * (current) is done further down, under its lock. | 1403 | * (current) is done further down, under its lock. |
1403 | */ | 1404 | */ |
1404 | p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * | 1405 | p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * |
1405 | CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); | 1406 | CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); |
1406 | 1407 | ||
1407 | p->prio = effective_prio(p); | 1408 | p->prio = effective_prio(p); |
1408 | 1409 | ||
1409 | if (likely(cpu == this_cpu)) { | 1410 | if (likely(cpu == this_cpu)) { |
1410 | if (!(clone_flags & CLONE_VM)) { | 1411 | if (!(clone_flags & CLONE_VM)) { |
1411 | /* | 1412 | /* |
1412 | * The VM isn't cloned, so we're in a good position to | 1413 | * The VM isn't cloned, so we're in a good position to |
1413 | * do child-runs-first in anticipation of an exec. This | 1414 | * do child-runs-first in anticipation of an exec. This |
1414 | * usually avoids a lot of COW overhead. | 1415 | * usually avoids a lot of COW overhead. |
1415 | */ | 1416 | */ |
1416 | if (unlikely(!current->array)) | 1417 | if (unlikely(!current->array)) |
1417 | __activate_task(p, rq); | 1418 | __activate_task(p, rq); |
1418 | else { | 1419 | else { |
1419 | p->prio = current->prio; | 1420 | p->prio = current->prio; |
1420 | list_add_tail(&p->run_list, ¤t->run_list); | 1421 | list_add_tail(&p->run_list, ¤t->run_list); |
1421 | p->array = current->array; | 1422 | p->array = current->array; |
1422 | p->array->nr_active++; | 1423 | p->array->nr_active++; |
1423 | rq->nr_running++; | 1424 | rq->nr_running++; |
1424 | } | 1425 | } |
1425 | set_need_resched(); | 1426 | set_need_resched(); |
1426 | } else | 1427 | } else |
1427 | /* Run child last */ | 1428 | /* Run child last */ |
1428 | __activate_task(p, rq); | 1429 | __activate_task(p, rq); |
1429 | /* | 1430 | /* |
1430 | * We skip the following code due to cpu == this_cpu | 1431 | * We skip the following code due to cpu == this_cpu |
1431 | * | 1432 | * |
1432 | * task_rq_unlock(rq, &flags); | 1433 | * task_rq_unlock(rq, &flags); |
1433 | * this_rq = task_rq_lock(current, &flags); | 1434 | * this_rq = task_rq_lock(current, &flags); |
1434 | */ | 1435 | */ |
1435 | this_rq = rq; | 1436 | this_rq = rq; |
1436 | } else { | 1437 | } else { |
1437 | this_rq = cpu_rq(this_cpu); | 1438 | this_rq = cpu_rq(this_cpu); |
1438 | 1439 | ||
1439 | /* | 1440 | /* |
1440 | * Not the local CPU - must adjust timestamp. This should | 1441 | * Not the local CPU - must adjust timestamp. This should |
1441 | * get optimised away in the !CONFIG_SMP case. | 1442 | * get optimised away in the !CONFIG_SMP case. |
1442 | */ | 1443 | */ |
1443 | p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) | 1444 | p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) |
1444 | + rq->timestamp_last_tick; | 1445 | + rq->timestamp_last_tick; |
1445 | __activate_task(p, rq); | 1446 | __activate_task(p, rq); |
1446 | if (TASK_PREEMPTS_CURR(p, rq)) | 1447 | if (TASK_PREEMPTS_CURR(p, rq)) |
1447 | resched_task(rq->curr); | 1448 | resched_task(rq->curr); |
1448 | 1449 | ||
1449 | /* | 1450 | /* |
1450 | * Parent and child are on different CPUs, now get the | 1451 | * Parent and child are on different CPUs, now get the |
1451 | * parent runqueue to update the parent's ->sleep_avg: | 1452 | * parent runqueue to update the parent's ->sleep_avg: |
1452 | */ | 1453 | */ |
1453 | task_rq_unlock(rq, &flags); | 1454 | task_rq_unlock(rq, &flags); |
1454 | this_rq = task_rq_lock(current, &flags); | 1455 | this_rq = task_rq_lock(current, &flags); |
1455 | } | 1456 | } |
1456 | current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * | 1457 | current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * |
1457 | PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); | 1458 | PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); |
1458 | task_rq_unlock(this_rq, &flags); | 1459 | task_rq_unlock(this_rq, &flags); |
1459 | } | 1460 | } |
1460 | 1461 | ||
1461 | /* | 1462 | /* |
1462 | * Potentially available exiting-child timeslices are | 1463 | * Potentially available exiting-child timeslices are |
1463 | * retrieved here - this way the parent does not get | 1464 | * retrieved here - this way the parent does not get |
1464 | * penalized for creating too many threads. | 1465 | * penalized for creating too many threads. |
1465 | * | 1466 | * |
1466 | * (this cannot be used to 'generate' timeslices | 1467 | * (this cannot be used to 'generate' timeslices |
1467 | * artificially, because any timeslice recovered here | 1468 | * artificially, because any timeslice recovered here |
1468 | * was given away by the parent in the first place.) | 1469 | * was given away by the parent in the first place.) |
1469 | */ | 1470 | */ |
1470 | void fastcall sched_exit(task_t *p) | 1471 | void fastcall sched_exit(task_t *p) |
1471 | { | 1472 | { |
1472 | unsigned long flags; | 1473 | unsigned long flags; |
1473 | runqueue_t *rq; | 1474 | runqueue_t *rq; |
1474 | 1475 | ||
1475 | /* | 1476 | /* |
1476 | * If the child was a (relative-) CPU hog then decrease | 1477 | * If the child was a (relative-) CPU hog then decrease |
1477 | * the sleep_avg of the parent as well. | 1478 | * the sleep_avg of the parent as well. |
1478 | */ | 1479 | */ |
1479 | rq = task_rq_lock(p->parent, &flags); | 1480 | rq = task_rq_lock(p->parent, &flags); |
1480 | if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { | 1481 | if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { |
1481 | p->parent->time_slice += p->time_slice; | 1482 | p->parent->time_slice += p->time_slice; |
1482 | if (unlikely(p->parent->time_slice > task_timeslice(p))) | 1483 | if (unlikely(p->parent->time_slice > task_timeslice(p))) |
1483 | p->parent->time_slice = task_timeslice(p); | 1484 | p->parent->time_slice = task_timeslice(p); |
1484 | } | 1485 | } |
1485 | if (p->sleep_avg < p->parent->sleep_avg) | 1486 | if (p->sleep_avg < p->parent->sleep_avg) |
1486 | p->parent->sleep_avg = p->parent->sleep_avg / | 1487 | p->parent->sleep_avg = p->parent->sleep_avg / |
1487 | (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / | 1488 | (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / |
1488 | (EXIT_WEIGHT + 1); | 1489 | (EXIT_WEIGHT + 1); |
1489 | task_rq_unlock(rq, &flags); | 1490 | task_rq_unlock(rq, &flags); |
1490 | } | 1491 | } |
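sched_exit() is the other half of that fork-time split: if the child exits while still on its first timeslice and on the parent's CPU, whatever remains of the borrowed slice is returned to the parent, capped at task_timeslice(p) so no time is manufactured. The sleep_avg adjustment is, rounding aside, an (EXIT_WEIGHT : 1) weighted average of the parent's and child's values, applied only when the child's is lower, so a CPU-hog child drags the parent's interactivity bonus down rather than up.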
1491 | 1492 | ||
1492 | /** | 1493 | /** |
1493 | * prepare_task_switch - prepare to switch tasks | 1494 | * prepare_task_switch - prepare to switch tasks |
1494 | * @rq: the runqueue preparing to switch | 1495 | * @rq: the runqueue preparing to switch |
1495 | * @next: the task we are going to switch to. | 1496 | * @next: the task we are going to switch to. |
1496 | * | 1497 | * |
1497 | * This is called with the rq lock held and interrupts off. It must | 1498 | * This is called with the rq lock held and interrupts off. It must |
1498 | * be paired with a subsequent finish_task_switch after the context | 1499 | * be paired with a subsequent finish_task_switch after the context |
1499 | * switch. | 1500 | * switch. |
1500 | * | 1501 | * |
1501 | * prepare_task_switch sets up locking and calls architecture specific | 1502 | * prepare_task_switch sets up locking and calls architecture specific |
1502 | * hooks. | 1503 | * hooks. |
1503 | */ | 1504 | */ |
1504 | static inline void prepare_task_switch(runqueue_t *rq, task_t *next) | 1505 | static inline void prepare_task_switch(runqueue_t *rq, task_t *next) |
1505 | { | 1506 | { |
1506 | prepare_lock_switch(rq, next); | 1507 | prepare_lock_switch(rq, next); |
1507 | prepare_arch_switch(next); | 1508 | prepare_arch_switch(next); |
1508 | } | 1509 | } |
1509 | 1510 | ||
1510 | /** | 1511 | /** |
1511 | * finish_task_switch - clean up after a task-switch | 1512 | * finish_task_switch - clean up after a task-switch |
1512 | * @rq: runqueue associated with task-switch | 1513 | * @rq: runqueue associated with task-switch |
1513 | * @prev: the thread we just switched away from. | 1514 | * @prev: the thread we just switched away from. |
1514 | * | 1515 | * |
1515 | * finish_task_switch must be called after the context switch, paired | 1516 | * finish_task_switch must be called after the context switch, paired |
1516 | * with a prepare_task_switch call before the context switch. | 1517 | * with a prepare_task_switch call before the context switch. |
1517 | * finish_task_switch will reconcile locking set up by prepare_task_switch, | 1518 | * finish_task_switch will reconcile locking set up by prepare_task_switch, |
1518 | * and do any other architecture-specific cleanup actions. | 1519 | * and do any other architecture-specific cleanup actions. |
1519 | * | 1520 | * |
1520 | * Note that we may have delayed dropping an mm in context_switch(). If | 1521 | * Note that we may have delayed dropping an mm in context_switch(). If |
1521 | * so, we finish that here outside of the runqueue lock. (Doing it | 1522 | * so, we finish that here outside of the runqueue lock. (Doing it |
1522 | * with the lock held can cause deadlocks; see schedule() for | 1523 | * with the lock held can cause deadlocks; see schedule() for |
1523 | * details.) | 1524 | * details.) |
1524 | */ | 1525 | */ |
1525 | static inline void finish_task_switch(runqueue_t *rq, task_t *prev) | 1526 | static inline void finish_task_switch(runqueue_t *rq, task_t *prev) |
1526 | __releases(rq->lock) | 1527 | __releases(rq->lock) |
1527 | { | 1528 | { |
1528 | struct mm_struct *mm = rq->prev_mm; | 1529 | struct mm_struct *mm = rq->prev_mm; |
1529 | unsigned long prev_task_flags; | 1530 | unsigned long prev_task_flags; |
1530 | 1531 | ||
1531 | rq->prev_mm = NULL; | 1532 | rq->prev_mm = NULL; |
1532 | 1533 | ||
1533 | /* | 1534 | /* |
1534 | * A task struct has one reference for the use as "current". | 1535 | * A task struct has one reference for the use as "current". |
1535 | * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and | 1536 | * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and |
1536 | * calls schedule one last time. The schedule call will never return, | 1537 | * calls schedule one last time. The schedule call will never return, |
1537 | * and the scheduled task must drop that reference. | 1538 | * and the scheduled task must drop that reference. |
1538 | * The test for EXIT_ZOMBIE must occur while the runqueue locks are | 1539 | * The test for EXIT_ZOMBIE must occur while the runqueue locks are |
1539 | * still held, otherwise prev could be scheduled on another cpu, die | 1540 | * still held, otherwise prev could be scheduled on another cpu, die |
1540 | * there before we look at prev->state, and then the reference would | 1541 | * there before we look at prev->state, and then the reference would |
1541 | * be dropped twice. | 1542 | * be dropped twice. |
1542 | * Manfred Spraul <manfred@colorfullife.com> | 1543 | * Manfred Spraul <manfred@colorfullife.com> |
1543 | */ | 1544 | */ |
1544 | prev_task_flags = prev->flags; | 1545 | prev_task_flags = prev->flags; |
1545 | finish_arch_switch(prev); | 1546 | finish_arch_switch(prev); |
1546 | finish_lock_switch(rq, prev); | 1547 | finish_lock_switch(rq, prev); |
1547 | if (mm) | 1548 | if (mm) |
1548 | mmdrop(mm); | 1549 | mmdrop(mm); |
1549 | if (unlikely(prev_task_flags & PF_DEAD)) | 1550 | if (unlikely(prev_task_flags & PF_DEAD)) { |
1551 | /* | ||
1552 | * Remove function-return probe instances associated with this | ||
1553 | * task and put them back on the free list. | ||
1554 | */ | ||
1555 | kprobe_flush_task(prev); | ||
1550 | put_task_struct(prev); | 1556 | put_task_struct(prev); |
1557 | } | ||
1551 | } | 1558 | } |
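The new PF_DEAD branch above is where leaked function-return probe instances get recovered: a dying task calls schedule() one last time and never returns (see the Manfred Spraul comment earlier in this function), so any kretprobe instances still attached to it can only be recycled by the task that switches in afterwards, which now calls kprobe_flush_task(prev) before dropping the dead task's final reference. As a rough sketch of what that flush routine does (the lock, hash-table helper and recycle helper below are reconstructed from the 2.6-era kprobes code for illustration, not quoted from this commit's diff):

        void kprobe_flush_task(struct task_struct *tk)
        {
                struct kretprobe_instance *ri;
                struct hlist_head *head;
                struct hlist_node *node, *tmp;
                unsigned long flags = 0;

                /* Names approximate kernel/kprobes.c of this era.  All pending
                 * return-probe instances for a task hash to a single bucket. */
                spin_lock_irqsave(&kretprobe_lock, flags);
                head = kretprobe_inst_table_head(tk);
                hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
                        if (ri->task == tk)
                                recycle_rp_inst(ri);    /* back onto the owning kretprobe's free list */
                }
                spin_unlock_irqrestore(&kretprobe_lock, flags);
        }

The important property is that the flush runs while the dead task's task_struct is still pinned, so the ri->task comparison is safe, and every instance goes back to its kretprobe's free list instead of leaking.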
1552 | 1559 | ||
1553 | /** | 1560 | /** |
1554 | * schedule_tail - first thing a freshly forked thread must call. | 1561 | * schedule_tail - first thing a freshly forked thread must call. |
1555 | * @prev: the thread we just switched away from. | 1562 | * @prev: the thread we just switched away from. |
1556 | */ | 1563 | */ |
1557 | asmlinkage void schedule_tail(task_t *prev) | 1564 | asmlinkage void schedule_tail(task_t *prev) |
1558 | __releases(rq->lock) | 1565 | __releases(rq->lock) |
1559 | { | 1566 | { |
1560 | runqueue_t *rq = this_rq(); | 1567 | runqueue_t *rq = this_rq(); |
1561 | finish_task_switch(rq, prev); | 1568 | finish_task_switch(rq, prev); |
1562 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW | 1569 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW |
1563 | /* In this case, finish_task_switch does not reenable preemption */ | 1570 | /* In this case, finish_task_switch does not reenable preemption */ |
1564 | preempt_enable(); | 1571 | preempt_enable(); |
1565 | #endif | 1572 | #endif |
1566 | if (current->set_child_tid) | 1573 | if (current->set_child_tid) |
1567 | put_user(current->pid, current->set_child_tid); | 1574 | put_user(current->pid, current->set_child_tid); |
1568 | } | 1575 | } |
1569 | 1576 | ||
1570 | /* | 1577 | /* |
1571 | * context_switch - switch to the new MM and the new | 1578 | * context_switch - switch to the new MM and the new |
1572 | * thread's register state. | 1579 | * thread's register state. |
1573 | */ | 1580 | */ |
1574 | static inline | 1581 | static inline |
1575 | task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) | 1582 | task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) |
1576 | { | 1583 | { |
1577 | struct mm_struct *mm = next->mm; | 1584 | struct mm_struct *mm = next->mm; |
1578 | struct mm_struct *oldmm = prev->active_mm; | 1585 | struct mm_struct *oldmm = prev->active_mm; |
1579 | 1586 | ||
1580 | if (unlikely(!mm)) { | 1587 | if (unlikely(!mm)) { |
1581 | next->active_mm = oldmm; | 1588 | next->active_mm = oldmm; |
1582 | atomic_inc(&oldmm->mm_count); | 1589 | atomic_inc(&oldmm->mm_count); |
1583 | enter_lazy_tlb(oldmm, next); | 1590 | enter_lazy_tlb(oldmm, next); |
1584 | } else | 1591 | } else |
1585 | switch_mm(oldmm, mm, next); | 1592 | switch_mm(oldmm, mm, next); |
1586 | 1593 | ||
1587 | if (unlikely(!prev->mm)) { | 1594 | if (unlikely(!prev->mm)) { |
1588 | prev->active_mm = NULL; | 1595 | prev->active_mm = NULL; |
1589 | WARN_ON(rq->prev_mm); | 1596 | WARN_ON(rq->prev_mm); |
1590 | rq->prev_mm = oldmm; | 1597 | rq->prev_mm = oldmm; |
1591 | } | 1598 | } |
1592 | 1599 | ||
1593 | /* Here we just switch the register state and the stack. */ | 1600 | /* Here we just switch the register state and the stack. */ |
1594 | switch_to(prev, next, prev); | 1601 | switch_to(prev, next, prev); |
1595 | 1602 | ||
1596 | return prev; | 1603 | return prev; |
1597 | } | 1604 | } |
1598 | 1605 | ||
1599 | /* | 1606 | /* |
1600 | * nr_running, nr_uninterruptible and nr_context_switches: | 1607 | * nr_running, nr_uninterruptible and nr_context_switches: |
1601 | * | 1608 | * |
1602 | * externally visible scheduler statistics: current number of runnable | 1609 | * externally visible scheduler statistics: current number of runnable |
1603 | * threads, current number of uninterruptible-sleeping threads, total | 1610 | * threads, current number of uninterruptible-sleeping threads, total |
1604 | * number of context switches performed since bootup. | 1611 | * number of context switches performed since bootup. |
1605 | */ | 1612 | */ |
1606 | unsigned long nr_running(void) | 1613 | unsigned long nr_running(void) |
1607 | { | 1614 | { |
1608 | unsigned long i, sum = 0; | 1615 | unsigned long i, sum = 0; |
1609 | 1616 | ||
1610 | for_each_online_cpu(i) | 1617 | for_each_online_cpu(i) |
1611 | sum += cpu_rq(i)->nr_running; | 1618 | sum += cpu_rq(i)->nr_running; |
1612 | 1619 | ||
1613 | return sum; | 1620 | return sum; |
1614 | } | 1621 | } |
1615 | 1622 | ||
1616 | unsigned long nr_uninterruptible(void) | 1623 | unsigned long nr_uninterruptible(void) |
1617 | { | 1624 | { |
1618 | unsigned long i, sum = 0; | 1625 | unsigned long i, sum = 0; |
1619 | 1626 | ||
1620 | for_each_cpu(i) | 1627 | for_each_cpu(i) |
1621 | sum += cpu_rq(i)->nr_uninterruptible; | 1628 | sum += cpu_rq(i)->nr_uninterruptible; |
1622 | 1629 | ||
1623 | /* | 1630 | /* |
1624 | * Since we read the counters lockless, it might be slightly | 1631 | * Since we read the counters lockless, it might be slightly |
1625 | * inaccurate. Do not allow it to go below zero though: | 1632 | * inaccurate. Do not allow it to go below zero though: |
1626 | */ | 1633 | */ |
1627 | if (unlikely((long)sum < 0)) | 1634 | if (unlikely((long)sum < 0)) |
1628 | sum = 0; | 1635 | sum = 0; |
1629 | 1636 | ||
1630 | return sum; | 1637 | return sum; |
1631 | } | 1638 | } |
1632 | 1639 | ||
1633 | unsigned long long nr_context_switches(void) | 1640 | unsigned long long nr_context_switches(void) |
1634 | { | 1641 | { |
1635 | unsigned long long i, sum = 0; | 1642 | unsigned long long i, sum = 0; |
1636 | 1643 | ||
1637 | for_each_cpu(i) | 1644 | for_each_cpu(i) |
1638 | sum += cpu_rq(i)->nr_switches; | 1645 | sum += cpu_rq(i)->nr_switches; |
1639 | 1646 | ||
1640 | return sum; | 1647 | return sum; |
1641 | } | 1648 | } |
1642 | 1649 | ||
1643 | unsigned long nr_iowait(void) | 1650 | unsigned long nr_iowait(void) |
1644 | { | 1651 | { |
1645 | unsigned long i, sum = 0; | 1652 | unsigned long i, sum = 0; |
1646 | 1653 | ||
1647 | for_each_cpu(i) | 1654 | for_each_cpu(i) |
1648 | sum += atomic_read(&cpu_rq(i)->nr_iowait); | 1655 | sum += atomic_read(&cpu_rq(i)->nr_iowait); |
1649 | 1656 | ||
1650 | return sum; | 1657 | return sum; |
1651 | } | 1658 | } |
1652 | 1659 | ||
1653 | #ifdef CONFIG_SMP | 1660 | #ifdef CONFIG_SMP |
1654 | 1661 | ||
1655 | /* | 1662 | /* |
1656 | * double_rq_lock - safely lock two runqueues | 1663 | * double_rq_lock - safely lock two runqueues |
1657 | * | 1664 | * |
1658 | * We must take them in cpu order to match code in | 1665 | * We must take them in cpu order to match code in |
1659 | * dependent_sleeper and wake_dependent_sleeper. | 1666 | * dependent_sleeper and wake_dependent_sleeper. |
1660 | * | 1667 | * |
1661 | * Note this does not disable interrupts like task_rq_lock, | 1668 | * Note this does not disable interrupts like task_rq_lock, |
1662 | * you need to do so manually before calling. | 1669 | * you need to do so manually before calling. |
1663 | */ | 1670 | */ |
1664 | static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) | 1671 | static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) |
1665 | __acquires(rq1->lock) | 1672 | __acquires(rq1->lock) |
1666 | __acquires(rq2->lock) | 1673 | __acquires(rq2->lock) |
1667 | { | 1674 | { |
1668 | if (rq1 == rq2) { | 1675 | if (rq1 == rq2) { |
1669 | spin_lock(&rq1->lock); | 1676 | spin_lock(&rq1->lock); |
1670 | __acquire(rq2->lock); /* Fake it out ;) */ | 1677 | __acquire(rq2->lock); /* Fake it out ;) */ |
1671 | } else { | 1678 | } else { |
1672 | if (rq1->cpu < rq2->cpu) { | 1679 | if (rq1->cpu < rq2->cpu) { |
1673 | spin_lock(&rq1->lock); | 1680 | spin_lock(&rq1->lock); |
1674 | spin_lock(&rq2->lock); | 1681 | spin_lock(&rq2->lock); |
1675 | } else { | 1682 | } else { |
1676 | spin_lock(&rq2->lock); | 1683 | spin_lock(&rq2->lock); |
1677 | spin_lock(&rq1->lock); | 1684 | spin_lock(&rq1->lock); |
1678 | } | 1685 | } |
1679 | } | 1686 | } |
1680 | } | 1687 | } |
1681 | 1688 | ||
1682 | /* | 1689 | /* |
1683 | * double_rq_unlock - safely unlock two runqueues | 1690 | * double_rq_unlock - safely unlock two runqueues |
1684 | * | 1691 | * |
1685 | * Note this does not restore interrupts like task_rq_unlock, | 1692 | * Note this does not restore interrupts like task_rq_unlock, |
1686 | * you need to do so manually after calling. | 1693 | * you need to do so manually after calling. |
1687 | */ | 1694 | */ |
1688 | static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) | 1695 | static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) |
1689 | __releases(rq1->lock) | 1696 | __releases(rq1->lock) |
1690 | __releases(rq2->lock) | 1697 | __releases(rq2->lock) |
1691 | { | 1698 | { |
1692 | spin_unlock(&rq1->lock); | 1699 | spin_unlock(&rq1->lock); |
1693 | if (rq1 != rq2) | 1700 | if (rq1 != rq2) |
1694 | spin_unlock(&rq2->lock); | 1701 | spin_unlock(&rq2->lock); |
1695 | else | 1702 | else |
1696 | __release(rq2->lock); | 1703 | __release(rq2->lock); |
1697 | } | 1704 | } |
1698 | 1705 | ||
1699 | /* | 1706 | /* |
1700 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. | 1707 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. |
1701 | */ | 1708 | */ |
1702 | static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) | 1709 | static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) |
1703 | __releases(this_rq->lock) | 1710 | __releases(this_rq->lock) |
1704 | __acquires(busiest->lock) | 1711 | __acquires(busiest->lock) |
1705 | __acquires(this_rq->lock) | 1712 | __acquires(this_rq->lock) |
1706 | { | 1713 | { |
1707 | if (unlikely(!spin_trylock(&busiest->lock))) { | 1714 | if (unlikely(!spin_trylock(&busiest->lock))) { |
1708 | if (busiest->cpu < this_rq->cpu) { | 1715 | if (busiest->cpu < this_rq->cpu) { |
1709 | spin_unlock(&this_rq->lock); | 1716 | spin_unlock(&this_rq->lock); |
1710 | spin_lock(&busiest->lock); | 1717 | spin_lock(&busiest->lock); |
1711 | spin_lock(&this_rq->lock); | 1718 | spin_lock(&this_rq->lock); |
1712 | } else | 1719 | } else |
1713 | spin_lock(&busiest->lock); | 1720 | spin_lock(&busiest->lock); |
1714 | } | 1721 | } |
1715 | } | 1722 | } |
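Both runqueue-pair helpers above enforce the same deadlock-avoidance rule: the runqueue of the lower-numbered CPU is always locked first. double_rq_lock() can simply compare rq1->cpu and rq2->cpu up front; double_lock_balance() already holds this_rq->lock, so when the trylock on busiest fails and busiest sits on the lower-numbered CPU it must drop this_rq->lock, take busiest->lock, then re-take this_rq->lock, which means the caller's runqueue was briefly unlocked. For example, whichever CPU initiates a balance between CPU 2 and CPU 5, rq(2)->lock is taken before rq(5)->lock.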
1716 | 1723 | ||
1717 | /* | 1724 | /* |
1718 | * If dest_cpu is allowed for this process, migrate the task to it. | 1725 | * If dest_cpu is allowed for this process, migrate the task to it. |
1719 | * This is accomplished by forcing the cpu_allowed mask to only | 1726 | * This is accomplished by forcing the cpu_allowed mask to only |
1720 | * allow dest_cpu, which will force the cpu onto dest_cpu. Then | 1727 | * allow dest_cpu, which will force the cpu onto dest_cpu. Then |
1721 | * the cpu_allowed mask is restored. | 1728 | * the cpu_allowed mask is restored. |
1722 | */ | 1729 | */ |
1723 | static void sched_migrate_task(task_t *p, int dest_cpu) | 1730 | static void sched_migrate_task(task_t *p, int dest_cpu) |
1724 | { | 1731 | { |
1725 | migration_req_t req; | 1732 | migration_req_t req; |
1726 | runqueue_t *rq; | 1733 | runqueue_t *rq; |
1727 | unsigned long flags; | 1734 | unsigned long flags; |
1728 | 1735 | ||
1729 | rq = task_rq_lock(p, &flags); | 1736 | rq = task_rq_lock(p, &flags); |
1730 | if (!cpu_isset(dest_cpu, p->cpus_allowed) | 1737 | if (!cpu_isset(dest_cpu, p->cpus_allowed) |
1731 | || unlikely(cpu_is_offline(dest_cpu))) | 1738 | || unlikely(cpu_is_offline(dest_cpu))) |
1732 | goto out; | 1739 | goto out; |
1733 | 1740 | ||
1734 | /* force the process onto the specified CPU */ | 1741 | /* force the process onto the specified CPU */ |
1735 | if (migrate_task(p, dest_cpu, &req)) { | 1742 | if (migrate_task(p, dest_cpu, &req)) { |
1736 | /* Need to wait for migration thread (might exit: take ref). */ | 1743 | /* Need to wait for migration thread (might exit: take ref). */ |
1737 | struct task_struct *mt = rq->migration_thread; | 1744 | struct task_struct *mt = rq->migration_thread; |
1738 | get_task_struct(mt); | 1745 | get_task_struct(mt); |
1739 | task_rq_unlock(rq, &flags); | 1746 | task_rq_unlock(rq, &flags); |
1740 | wake_up_process(mt); | 1747 | wake_up_process(mt); |
1741 | put_task_struct(mt); | 1748 | put_task_struct(mt); |
1742 | wait_for_completion(&req.done); | 1749 | wait_for_completion(&req.done); |
1743 | return; | 1750 | return; |
1744 | } | 1751 | } |
1745 | out: | 1752 | out: |
1746 | task_rq_unlock(rq, &flags); | 1753 | task_rq_unlock(rq, &flags); |
1747 | } | 1754 | } |
1748 | 1755 | ||
1749 | /* | 1756 | /* |
1750 | * sched_exec - execve() is a valuable balancing opportunity, because at | 1757 | * sched_exec - execve() is a valuable balancing opportunity, because at |
1751 | * this point the task has the smallest effective memory and cache footprint. | 1758 | * this point the task has the smallest effective memory and cache footprint. |
1752 | */ | 1759 | */ |
1753 | void sched_exec(void) | 1760 | void sched_exec(void) |
1754 | { | 1761 | { |
1755 | int new_cpu, this_cpu = get_cpu(); | 1762 | int new_cpu, this_cpu = get_cpu(); |
1756 | new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); | 1763 | new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); |
1757 | put_cpu(); | 1764 | put_cpu(); |
1758 | if (new_cpu != this_cpu) | 1765 | if (new_cpu != this_cpu) |
1759 | sched_migrate_task(current, new_cpu); | 1766 | sched_migrate_task(current, new_cpu); |
1760 | } | 1767 | } |
1761 | 1768 | ||
1762 | /* | 1769 | /* |
1763 | * pull_task - move a task from a remote runqueue to the local runqueue. | 1770 | * pull_task - move a task from a remote runqueue to the local runqueue. |
1764 | * Both runqueues must be locked. | 1771 | * Both runqueues must be locked. |
1765 | */ | 1772 | */ |
1766 | static | 1773 | static |
1767 | void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, | 1774 | void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, |
1768 | runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) | 1775 | runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) |
1769 | { | 1776 | { |
1770 | dequeue_task(p, src_array); | 1777 | dequeue_task(p, src_array); |
1771 | src_rq->nr_running--; | 1778 | src_rq->nr_running--; |
1772 | set_task_cpu(p, this_cpu); | 1779 | set_task_cpu(p, this_cpu); |
1773 | this_rq->nr_running++; | 1780 | this_rq->nr_running++; |
1774 | enqueue_task(p, this_array); | 1781 | enqueue_task(p, this_array); |
1775 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) | 1782 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) |
1776 | + this_rq->timestamp_last_tick; | 1783 | + this_rq->timestamp_last_tick; |
1777 | /* | 1784 | /* |
1778 | * Note that idle threads have a prio of MAX_PRIO, for this test | 1785 | * Note that idle threads have a prio of MAX_PRIO, for this test |
1779 | * to be always true for them. | 1786 | * to be always true for them. |
1780 | */ | 1787 | */ |
1781 | if (TASK_PREEMPTS_CURR(p, this_rq)) | 1788 | if (TASK_PREEMPTS_CURR(p, this_rq)) |
1782 | resched_task(this_rq->curr); | 1789 | resched_task(this_rq->curr); |
1783 | } | 1790 | } |
1784 | 1791 | ||
1785 | /* | 1792 | /* |
1786 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? | 1793 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? |
1787 | */ | 1794 | */ |
1788 | static | 1795 | static |
1789 | int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, | 1796 | int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, |
1790 | struct sched_domain *sd, enum idle_type idle, | 1797 | struct sched_domain *sd, enum idle_type idle, |
1791 | int *all_pinned) | 1798 | int *all_pinned) |
1792 | { | 1799 | { |
1793 | /* | 1800 | /* |
1794 | * We do not migrate tasks that are: | 1801 | * We do not migrate tasks that are: |
1795 | * 1) running (obviously), or | 1802 | * 1) running (obviously), or |
1796 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | 1803 | * 2) cannot be migrated to this CPU due to cpus_allowed, or |
1797 | * 3) are cache-hot on their current CPU. | 1804 | * 3) are cache-hot on their current CPU. |
1798 | */ | 1805 | */ |
1799 | if (!cpu_isset(this_cpu, p->cpus_allowed)) | 1806 | if (!cpu_isset(this_cpu, p->cpus_allowed)) |
1800 | return 0; | 1807 | return 0; |
1801 | *all_pinned = 0; | 1808 | *all_pinned = 0; |
1802 | 1809 | ||
1803 | if (task_running(rq, p)) | 1810 | if (task_running(rq, p)) |
1804 | return 0; | 1811 | return 0; |
1805 | 1812 | ||
1806 | /* | 1813 | /* |
1807 | * Aggressive migration if: | 1814 | * Aggressive migration if: |
1808 | * 1) task is cache cold, or | 1815 | * 1) task is cache cold, or |
1809 | * 2) too many balance attempts have failed. | 1816 | * 2) too many balance attempts have failed. |
1810 | */ | 1817 | */ |
1811 | 1818 | ||
1812 | if (sd->nr_balance_failed > sd->cache_nice_tries) | 1819 | if (sd->nr_balance_failed > sd->cache_nice_tries) |
1813 | return 1; | 1820 | return 1; |
1814 | 1821 | ||
1815 | if (task_hot(p, rq->timestamp_last_tick, sd)) | 1822 | if (task_hot(p, rq->timestamp_last_tick, sd)) |
1816 | return 0; | 1823 | return 0; |
1817 | return 1; | 1824 | return 1; |
1818 | } | 1825 | } |
1819 | 1826 | ||
1820 | /* | 1827 | /* |
1821 | * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, | 1828 | * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, |
1822 | * as part of a balancing operation within "domain". Returns the number of | 1829 | * as part of a balancing operation within "domain". Returns the number of |
1823 | * tasks moved. | 1830 | * tasks moved. |
1824 | * | 1831 | * |
1825 | * Called with both runqueues locked. | 1832 | * Called with both runqueues locked. |
1826 | */ | 1833 | */ |
1827 | static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, | 1834 | static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, |
1828 | unsigned long max_nr_move, struct sched_domain *sd, | 1835 | unsigned long max_nr_move, struct sched_domain *sd, |
1829 | enum idle_type idle, int *all_pinned) | 1836 | enum idle_type idle, int *all_pinned) |
1830 | { | 1837 | { |
1831 | prio_array_t *array, *dst_array; | 1838 | prio_array_t *array, *dst_array; |
1832 | struct list_head *head, *curr; | 1839 | struct list_head *head, *curr; |
1833 | int idx, pulled = 0, pinned = 0; | 1840 | int idx, pulled = 0, pinned = 0; |
1834 | task_t *tmp; | 1841 | task_t *tmp; |
1835 | 1842 | ||
1836 | if (max_nr_move == 0) | 1843 | if (max_nr_move == 0) |
1837 | goto out; | 1844 | goto out; |
1838 | 1845 | ||
1839 | pinned = 1; | 1846 | pinned = 1; |
1840 | 1847 | ||
1841 | /* | 1848 | /* |
1842 | * We first consider expired tasks. Those will likely not be | 1849 | * We first consider expired tasks. Those will likely not be |
1843 | * executed in the near future, and they are most likely to | 1850 | * executed in the near future, and they are most likely to |
1844 | * be cache-cold, thus switching CPUs has the least effect | 1851 | * be cache-cold, thus switching CPUs has the least effect |
1845 | * on them. | 1852 | * on them. |
1846 | */ | 1853 | */ |
1847 | if (busiest->expired->nr_active) { | 1854 | if (busiest->expired->nr_active) { |
1848 | array = busiest->expired; | 1855 | array = busiest->expired; |
1849 | dst_array = this_rq->expired; | 1856 | dst_array = this_rq->expired; |
1850 | } else { | 1857 | } else { |
1851 | array = busiest->active; | 1858 | array = busiest->active; |
1852 | dst_array = this_rq->active; | 1859 | dst_array = this_rq->active; |
1853 | } | 1860 | } |
1854 | 1861 | ||
1855 | new_array: | 1862 | new_array: |
1856 | /* Start searching at priority 0: */ | 1863 | /* Start searching at priority 0: */ |
1857 | idx = 0; | 1864 | idx = 0; |
1858 | skip_bitmap: | 1865 | skip_bitmap: |
1859 | if (!idx) | 1866 | if (!idx) |
1860 | idx = sched_find_first_bit(array->bitmap); | 1867 | idx = sched_find_first_bit(array->bitmap); |
1861 | else | 1868 | else |
1862 | idx = find_next_bit(array->bitmap, MAX_PRIO, idx); | 1869 | idx = find_next_bit(array->bitmap, MAX_PRIO, idx); |
1863 | if (idx >= MAX_PRIO) { | 1870 | if (idx >= MAX_PRIO) { |
1864 | if (array == busiest->expired && busiest->active->nr_active) { | 1871 | if (array == busiest->expired && busiest->active->nr_active) { |
1865 | array = busiest->active; | 1872 | array = busiest->active; |
1866 | dst_array = this_rq->active; | 1873 | dst_array = this_rq->active; |
1867 | goto new_array; | 1874 | goto new_array; |
1868 | } | 1875 | } |
1869 | goto out; | 1876 | goto out; |
1870 | } | 1877 | } |
1871 | 1878 | ||
1872 | head = array->queue + idx; | 1879 | head = array->queue + idx; |
1873 | curr = head->prev; | 1880 | curr = head->prev; |
1874 | skip_queue: | 1881 | skip_queue: |
1875 | tmp = list_entry(curr, task_t, run_list); | 1882 | tmp = list_entry(curr, task_t, run_list); |
1876 | 1883 | ||
1877 | curr = curr->prev; | 1884 | curr = curr->prev; |
1878 | 1885 | ||
1879 | if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { | 1886 | if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { |
1880 | if (curr != head) | 1887 | if (curr != head) |
1881 | goto skip_queue; | 1888 | goto skip_queue; |
1882 | idx++; | 1889 | idx++; |
1883 | goto skip_bitmap; | 1890 | goto skip_bitmap; |
1884 | } | 1891 | } |
1885 | 1892 | ||
1886 | #ifdef CONFIG_SCHEDSTATS | 1893 | #ifdef CONFIG_SCHEDSTATS |
1887 | if (task_hot(tmp, busiest->timestamp_last_tick, sd)) | 1894 | if (task_hot(tmp, busiest->timestamp_last_tick, sd)) |
1888 | schedstat_inc(sd, lb_hot_gained[idle]); | 1895 | schedstat_inc(sd, lb_hot_gained[idle]); |
1889 | #endif | 1896 | #endif |
1890 | 1897 | ||
1891 | pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); | 1898 | pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); |
1892 | pulled++; | 1899 | pulled++; |
1893 | 1900 | ||
1894 | /* We only want to steal up to the prescribed number of tasks. */ | 1901 | /* We only want to steal up to the prescribed number of tasks. */ |
1895 | if (pulled < max_nr_move) { | 1902 | if (pulled < max_nr_move) { |
1896 | if (curr != head) | 1903 | if (curr != head) |
1897 | goto skip_queue; | 1904 | goto skip_queue; |
1898 | idx++; | 1905 | idx++; |
1899 | goto skip_bitmap; | 1906 | goto skip_bitmap; |
1900 | } | 1907 | } |
1901 | out: | 1908 | out: |
1902 | /* | 1909 | /* |
1903 | * Right now, this is the only place pull_task() is called, | 1910 | * Right now, this is the only place pull_task() is called, |
1904 | * so we can safely collect pull_task() stats here rather than | 1911 | * so we can safely collect pull_task() stats here rather than |
1905 | * inside pull_task(). | 1912 | * inside pull_task(). |
1906 | */ | 1913 | */ |
1907 | schedstat_add(sd, lb_gained[idle], pulled); | 1914 | schedstat_add(sd, lb_gained[idle], pulled); |
1908 | 1915 | ||
1909 | if (all_pinned) | 1916 | if (all_pinned) |
1910 | *all_pinned = pinned; | 1917 | *all_pinned = pinned; |
1911 | return pulled; | 1918 | return pulled; |
1912 | } | 1919 | } |
1913 | 1920 | ||
1914 | /* | 1921 | /* |
1915 | * find_busiest_group finds and returns the busiest CPU group within the | 1922 | * find_busiest_group finds and returns the busiest CPU group within the |
1916 | * domain. It calculates and returns the number of tasks which should be | 1923 | * domain. It calculates and returns the number of tasks which should be |
1917 | * moved to restore balance via the imbalance parameter. | 1924 | * moved to restore balance via the imbalance parameter. |
1918 | */ | 1925 | */ |
1919 | static struct sched_group * | 1926 | static struct sched_group * |
1920 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 1927 | find_busiest_group(struct sched_domain *sd, int this_cpu, |
1921 | unsigned long *imbalance, enum idle_type idle, int *sd_idle) | 1928 | unsigned long *imbalance, enum idle_type idle, int *sd_idle) |
1922 | { | 1929 | { |
1923 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; | 1930 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; |
1924 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; | 1931 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; |
1925 | unsigned long max_pull; | 1932 | unsigned long max_pull; |
1926 | int load_idx; | 1933 | int load_idx; |
1927 | 1934 | ||
1928 | max_load = this_load = total_load = total_pwr = 0; | 1935 | max_load = this_load = total_load = total_pwr = 0; |
1929 | if (idle == NOT_IDLE) | 1936 | if (idle == NOT_IDLE) |
1930 | load_idx = sd->busy_idx; | 1937 | load_idx = sd->busy_idx; |
1931 | else if (idle == NEWLY_IDLE) | 1938 | else if (idle == NEWLY_IDLE) |
1932 | load_idx = sd->newidle_idx; | 1939 | load_idx = sd->newidle_idx; |
1933 | else | 1940 | else |
1934 | load_idx = sd->idle_idx; | 1941 | load_idx = sd->idle_idx; |
1935 | 1942 | ||
1936 | do { | 1943 | do { |
1937 | unsigned long load; | 1944 | unsigned long load; |
1938 | int local_group; | 1945 | int local_group; |
1939 | int i; | 1946 | int i; |
1940 | 1947 | ||
1941 | local_group = cpu_isset(this_cpu, group->cpumask); | 1948 | local_group = cpu_isset(this_cpu, group->cpumask); |
1942 | 1949 | ||
1943 | /* Tally up the load of all CPUs in the group */ | 1950 | /* Tally up the load of all CPUs in the group */ |
1944 | avg_load = 0; | 1951 | avg_load = 0; |
1945 | 1952 | ||
1946 | for_each_cpu_mask(i, group->cpumask) { | 1953 | for_each_cpu_mask(i, group->cpumask) { |
1947 | if (*sd_idle && !idle_cpu(i)) | 1954 | if (*sd_idle && !idle_cpu(i)) |
1948 | *sd_idle = 0; | 1955 | *sd_idle = 0; |
1949 | 1956 | ||
1950 | /* Bias balancing toward cpus of our domain */ | 1957 | /* Bias balancing toward cpus of our domain */ |
1951 | if (local_group) | 1958 | if (local_group) |
1952 | load = target_load(i, load_idx); | 1959 | load = target_load(i, load_idx); |
1953 | else | 1960 | else |
1954 | load = source_load(i, load_idx); | 1961 | load = source_load(i, load_idx); |
1955 | 1962 | ||
1956 | avg_load += load; | 1963 | avg_load += load; |
1957 | } | 1964 | } |
1958 | 1965 | ||
1959 | total_load += avg_load; | 1966 | total_load += avg_load; |
1960 | total_pwr += group->cpu_power; | 1967 | total_pwr += group->cpu_power; |
1961 | 1968 | ||
1962 | /* Adjust by relative CPU power of the group */ | 1969 | /* Adjust by relative CPU power of the group */ |
1963 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; | 1970 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; |
1964 | 1971 | ||
1965 | if (local_group) { | 1972 | if (local_group) { |
1966 | this_load = avg_load; | 1973 | this_load = avg_load; |
1967 | this = group; | 1974 | this = group; |
1968 | } else if (avg_load > max_load) { | 1975 | } else if (avg_load > max_load) { |
1969 | max_load = avg_load; | 1976 | max_load = avg_load; |
1970 | busiest = group; | 1977 | busiest = group; |
1971 | } | 1978 | } |
1972 | group = group->next; | 1979 | group = group->next; |
1973 | } while (group != sd->groups); | 1980 | } while (group != sd->groups); |
1974 | 1981 | ||
1975 | if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE) | 1982 | if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE) |
1976 | goto out_balanced; | 1983 | goto out_balanced; |
1977 | 1984 | ||
1978 | avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; | 1985 | avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; |
1979 | 1986 | ||
1980 | if (this_load >= avg_load || | 1987 | if (this_load >= avg_load || |
1981 | 100*max_load <= sd->imbalance_pct*this_load) | 1988 | 100*max_load <= sd->imbalance_pct*this_load) |
1982 | goto out_balanced; | 1989 | goto out_balanced; |
1983 | 1990 | ||
1984 | /* | 1991 | /* |
1985 | * We're trying to get all the cpus to the average_load, so we don't | 1992 | * We're trying to get all the cpus to the average_load, so we don't |
1986 | * want to push ourselves above the average load, nor do we wish to | 1993 | * want to push ourselves above the average load, nor do we wish to |
1987 | * reduce the max loaded cpu below the average load, as either of these | 1994 | * reduce the max loaded cpu below the average load, as either of these |
1988 | * actions would just result in more rebalancing later, and ping-pong | 1995 | * actions would just result in more rebalancing later, and ping-pong |
1989 | * tasks around. Thus we look for the minimum possible imbalance. | 1996 | * tasks around. Thus we look for the minimum possible imbalance. |
1990 | * Negative imbalances (*we* are more loaded than anyone else) will | 1997 | * Negative imbalances (*we* are more loaded than anyone else) will |
1991 | * be counted as no imbalance for these purposes -- we can't fix that | 1998 | * be counted as no imbalance for these purposes -- we can't fix that |
1992 | * by pulling tasks to us. Be careful of negative numbers as they'll | 1999 | * by pulling tasks to us. Be careful of negative numbers as they'll |
1993 | * appear as very large values with unsigned longs. | 2000 | * appear as very large values with unsigned longs. |
1994 | */ | 2001 | */ |
1995 | 2002 | ||
1996 | /* Don't want to pull so many tasks that a group would go idle */ | 2003 | /* Don't want to pull so many tasks that a group would go idle */ |
1997 | max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE); | 2004 | max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE); |
1998 | 2005 | ||
1999 | /* How much load to actually move to equalise the imbalance */ | 2006 | /* How much load to actually move to equalise the imbalance */ |
2000 | *imbalance = min(max_pull * busiest->cpu_power, | 2007 | *imbalance = min(max_pull * busiest->cpu_power, |
2001 | (avg_load - this_load) * this->cpu_power) | 2008 | (avg_load - this_load) * this->cpu_power) |
2002 | / SCHED_LOAD_SCALE; | 2009 | / SCHED_LOAD_SCALE; |
2003 | 2010 | ||
2004 | if (*imbalance < SCHED_LOAD_SCALE) { | 2011 | if (*imbalance < SCHED_LOAD_SCALE) { |
2005 | unsigned long pwr_now = 0, pwr_move = 0; | 2012 | unsigned long pwr_now = 0, pwr_move = 0; |
2006 | unsigned long tmp; | 2013 | unsigned long tmp; |
2007 | 2014 | ||
2008 | if (max_load - this_load >= SCHED_LOAD_SCALE*2) { | 2015 | if (max_load - this_load >= SCHED_LOAD_SCALE*2) { |
2009 | *imbalance = 1; | 2016 | *imbalance = 1; |
2010 | return busiest; | 2017 | return busiest; |
2011 | } | 2018 | } |
2012 | 2019 | ||
2013 | /* | 2020 | /* |
2014 | * OK, we don't have enough imbalance to justify moving tasks, | 2021 | * OK, we don't have enough imbalance to justify moving tasks, |
2015 | * however we may be able to increase total CPU power used by | 2022 | * however we may be able to increase total CPU power used by |
2016 | * moving them. | 2023 | * moving them. |
2017 | */ | 2024 | */ |
2018 | 2025 | ||
2019 | pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); | 2026 | pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); |
2020 | pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); | 2027 | pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); |
2021 | pwr_now /= SCHED_LOAD_SCALE; | 2028 | pwr_now /= SCHED_LOAD_SCALE; |
2022 | 2029 | ||
2023 | /* Amount of load we'd subtract */ | 2030 | /* Amount of load we'd subtract */ |
2024 | tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; | 2031 | tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; |
2025 | if (max_load > tmp) | 2032 | if (max_load > tmp) |
2026 | pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, | 2033 | pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, |
2027 | max_load - tmp); | 2034 | max_load - tmp); |
2028 | 2035 | ||
2029 | /* Amount of load we'd add */ | 2036 | /* Amount of load we'd add */ |
2030 | if (max_load*busiest->cpu_power < | 2037 | if (max_load*busiest->cpu_power < |
2031 | SCHED_LOAD_SCALE*SCHED_LOAD_SCALE) | 2038 | SCHED_LOAD_SCALE*SCHED_LOAD_SCALE) |
2032 | tmp = max_load*busiest->cpu_power/this->cpu_power; | 2039 | tmp = max_load*busiest->cpu_power/this->cpu_power; |
2033 | else | 2040 | else |
2034 | tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; | 2041 | tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; |
2035 | pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); | 2042 | pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); |
2036 | pwr_move /= SCHED_LOAD_SCALE; | 2043 | pwr_move /= SCHED_LOAD_SCALE; |
2037 | 2044 | ||
2038 | /* Move if we gain throughput */ | 2045 | /* Move if we gain throughput */ |
2039 | if (pwr_move <= pwr_now) | 2046 | if (pwr_move <= pwr_now) |
2040 | goto out_balanced; | 2047 | goto out_balanced; |
2041 | 2048 | ||
2042 | *imbalance = 1; | 2049 | *imbalance = 1; |
2043 | return busiest; | 2050 | return busiest; |
2044 | } | 2051 | } |
2045 | 2052 | ||
2046 | /* Get rid of the scaling factor, rounding down as we divide */ | 2053 | /* Get rid of the scaling factor, rounding down as we divide */ |
2047 | *imbalance = *imbalance / SCHED_LOAD_SCALE; | 2054 | *imbalance = *imbalance / SCHED_LOAD_SCALE; |
2048 | return busiest; | 2055 | return busiest; |
2049 | 2056 | ||
2050 | out_balanced: | 2057 | out_balanced: |
2051 | 2058 | ||
2052 | *imbalance = 0; | 2059 | *imbalance = 0; |
2053 | return NULL; | 2060 | return NULL; |
2054 | } | 2061 | } |
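The imbalance arithmetic above reduces to a couple of min() operations once the group loads are known. Below is a minimal user-space sketch of the same computation (not kernel code; SCHED_LOAD_SCALE is assumed to be 128 here, cpu_power is taken as a single unit, and all load values are made up for illustration):

        #include <stdio.h>

        #define SCHED_LOAD_SCALE 128UL  /* assumed value, for illustration only */

        static unsigned long min_ul(unsigned long a, unsigned long b)
        {
                return a < b ? a : b;
        }

        int main(void)
        {
                /* hypothetical per-group loads, already scaled by SCHED_LOAD_SCALE */
                unsigned long this_load = 1 * SCHED_LOAD_SCALE; /* this group: 1 task */
                unsigned long max_load  = 4 * SCHED_LOAD_SCALE; /* busiest: 4 tasks   */
                unsigned long avg_load  = 2 * SCHED_LOAD_SCALE; /* domain average     */
                unsigned long cpu_power = SCHED_LOAD_SCALE;     /* unit cpu_power     */

                /* don't pull the busiest group below the average, or below one task */
                unsigned long max_pull = min_ul(max_load - avg_load,
                                                max_load - SCHED_LOAD_SCALE);

                /* same expression as the *imbalance computation above */
                unsigned long imbalance = min_ul(max_pull * cpu_power,
                                                 (avg_load - this_load) * cpu_power)
                                                / SCHED_LOAD_SCALE;

                /* drop the scaling factor, as the function does before returning */
                printf("imbalance = %lu task(s) worth of load\n",
                       imbalance / SCHED_LOAD_SCALE);
                return 0;
        }

With one local task, four on the busiest group and a domain average of two, the sketch prints an imbalance of one task's worth of load, which is roughly what load_balance() would then ask move_tasks() to pull.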
2055 | 2062 | ||
2056 | /* | 2063 | /* |
2057 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 2064 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
2058 | */ | 2065 | */ |
2059 | static runqueue_t *find_busiest_queue(struct sched_group *group, | 2066 | static runqueue_t *find_busiest_queue(struct sched_group *group, |
2060 | enum idle_type idle) | 2067 | enum idle_type idle) |
2061 | { | 2068 | { |
2062 | unsigned long load, max_load = 0; | 2069 | unsigned long load, max_load = 0; |
2063 | runqueue_t *busiest = NULL; | 2070 | runqueue_t *busiest = NULL; |
2064 | int i; | 2071 | int i; |
2065 | 2072 | ||
2066 | for_each_cpu_mask(i, group->cpumask) { | 2073 | for_each_cpu_mask(i, group->cpumask) { |
2067 | load = source_load(i, 0); | 2074 | load = source_load(i, 0); |
2068 | 2075 | ||
2069 | if (load > max_load) { | 2076 | if (load > max_load) { |
2070 | max_load = load; | 2077 | max_load = load; |
2071 | busiest = cpu_rq(i); | 2078 | busiest = cpu_rq(i); |
2072 | } | 2079 | } |
2073 | } | 2080 | } |
2074 | 2081 | ||
2075 | return busiest; | 2082 | return busiest; |
2076 | } | 2083 | } |
2077 | 2084 | ||
2078 | /* | 2085 | /* |
2079 | * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but | 2086 | * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but |
2080 | * it just needs to be large enough. | 2087 | * it just needs to be large enough. |
2081 | */ | 2088 | */ |
2082 | #define MAX_PINNED_INTERVAL 512 | 2089 | #define MAX_PINNED_INTERVAL 512 |
2083 | 2090 | ||
2084 | /* | 2091 | /* |
2085 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 2092 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
2086 | * tasks if there is an imbalance. | 2093 | * tasks if there is an imbalance. |
2087 | * | 2094 | * |
2088 | * Called with this_rq unlocked. | 2095 | * Called with this_rq unlocked. |
2089 | */ | 2096 | */ |
2090 | static int load_balance(int this_cpu, runqueue_t *this_rq, | 2097 | static int load_balance(int this_cpu, runqueue_t *this_rq, |
2091 | struct sched_domain *sd, enum idle_type idle) | 2098 | struct sched_domain *sd, enum idle_type idle) |
2092 | { | 2099 | { |
2093 | struct sched_group *group; | 2100 | struct sched_group *group; |
2094 | runqueue_t *busiest; | 2101 | runqueue_t *busiest; |
2095 | unsigned long imbalance; | 2102 | unsigned long imbalance; |
2096 | int nr_moved, all_pinned = 0; | 2103 | int nr_moved, all_pinned = 0; |
2097 | int active_balance = 0; | 2104 | int active_balance = 0; |
2098 | int sd_idle = 0; | 2105 | int sd_idle = 0; |
2099 | 2106 | ||
2100 | if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER) | 2107 | if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER) |
2101 | sd_idle = 1; | 2108 | sd_idle = 1; |
2102 | 2109 | ||
2103 | schedstat_inc(sd, lb_cnt[idle]); | 2110 | schedstat_inc(sd, lb_cnt[idle]); |
2104 | 2111 | ||
2105 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle); | 2112 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle); |
2106 | if (!group) { | 2113 | if (!group) { |
2107 | schedstat_inc(sd, lb_nobusyg[idle]); | 2114 | schedstat_inc(sd, lb_nobusyg[idle]); |
2108 | goto out_balanced; | 2115 | goto out_balanced; |
2109 | } | 2116 | } |
2110 | 2117 | ||
2111 | busiest = find_busiest_queue(group, idle); | 2118 | busiest = find_busiest_queue(group, idle); |
2112 | if (!busiest) { | 2119 | if (!busiest) { |
2113 | schedstat_inc(sd, lb_nobusyq[idle]); | 2120 | schedstat_inc(sd, lb_nobusyq[idle]); |
2114 | goto out_balanced; | 2121 | goto out_balanced; |
2115 | } | 2122 | } |
2116 | 2123 | ||
2117 | BUG_ON(busiest == this_rq); | 2124 | BUG_ON(busiest == this_rq); |
2118 | 2125 | ||
2119 | schedstat_add(sd, lb_imbalance[idle], imbalance); | 2126 | schedstat_add(sd, lb_imbalance[idle], imbalance); |
2120 | 2127 | ||
2121 | nr_moved = 0; | 2128 | nr_moved = 0; |
2122 | if (busiest->nr_running > 1) { | 2129 | if (busiest->nr_running > 1) { |
2123 | /* | 2130 | /* |
2124 | * Attempt to move tasks. If find_busiest_group has found | 2131 | * Attempt to move tasks. If find_busiest_group has found |
2125 | * an imbalance but busiest->nr_running <= 1, the group is | 2132 | * an imbalance but busiest->nr_running <= 1, the group is |
2126 | * still unbalanced. nr_moved simply stays zero, so it is | 2133 | * still unbalanced. nr_moved simply stays zero, so it is |
2127 | * correctly treated as an imbalance. | 2134 | * correctly treated as an imbalance. |
2128 | */ | 2135 | */ |
2129 | double_rq_lock(this_rq, busiest); | 2136 | double_rq_lock(this_rq, busiest); |
2130 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2137 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
2131 | imbalance, sd, idle, &all_pinned); | 2138 | imbalance, sd, idle, &all_pinned); |
2132 | double_rq_unlock(this_rq, busiest); | 2139 | double_rq_unlock(this_rq, busiest); |
2133 | 2140 | ||
2134 | /* All tasks on this runqueue were pinned by CPU affinity */ | 2141 | /* All tasks on this runqueue were pinned by CPU affinity */ |
2135 | if (unlikely(all_pinned)) | 2142 | if (unlikely(all_pinned)) |
2136 | goto out_balanced; | 2143 | goto out_balanced; |
2137 | } | 2144 | } |
2138 | 2145 | ||
2139 | if (!nr_moved) { | 2146 | if (!nr_moved) { |
2140 | schedstat_inc(sd, lb_failed[idle]); | 2147 | schedstat_inc(sd, lb_failed[idle]); |
2141 | sd->nr_balance_failed++; | 2148 | sd->nr_balance_failed++; |
2142 | 2149 | ||
2143 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { | 2150 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { |
2144 | 2151 | ||
2145 | spin_lock(&busiest->lock); | 2152 | spin_lock(&busiest->lock); |
2146 | 2153 | ||
2147 | /* don't kick the migration_thread, if the curr | 2154 | /* don't kick the migration_thread, if the curr |
2148 | * task on busiest cpu can't be moved to this_cpu | 2155 | * task on busiest cpu can't be moved to this_cpu |
2149 | */ | 2156 | */ |
2150 | if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { | 2157 | if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { |
2151 | spin_unlock(&busiest->lock); | 2158 | spin_unlock(&busiest->lock); |
2152 | all_pinned = 1; | 2159 | all_pinned = 1; |
2153 | goto out_one_pinned; | 2160 | goto out_one_pinned; |
2154 | } | 2161 | } |
2155 | 2162 | ||
2156 | if (!busiest->active_balance) { | 2163 | if (!busiest->active_balance) { |
2157 | busiest->active_balance = 1; | 2164 | busiest->active_balance = 1; |
2158 | busiest->push_cpu = this_cpu; | 2165 | busiest->push_cpu = this_cpu; |
2159 | active_balance = 1; | 2166 | active_balance = 1; |
2160 | } | 2167 | } |
2161 | spin_unlock(&busiest->lock); | 2168 | spin_unlock(&busiest->lock); |
2162 | if (active_balance) | 2169 | if (active_balance) |
2163 | wake_up_process(busiest->migration_thread); | 2170 | wake_up_process(busiest->migration_thread); |
2164 | 2171 | ||
2165 | /* | 2172 | /* |
2166 | * We've kicked active balancing, reset the failure | 2173 | * We've kicked active balancing, reset the failure |
2167 | * counter. | 2174 | * counter. |
2168 | */ | 2175 | */ |
2169 | sd->nr_balance_failed = sd->cache_nice_tries+1; | 2176 | sd->nr_balance_failed = sd->cache_nice_tries+1; |
2170 | } | 2177 | } |
2171 | } else | 2178 | } else |
2172 | sd->nr_balance_failed = 0; | 2179 | sd->nr_balance_failed = 0; |
2173 | 2180 | ||
2174 | if (likely(!active_balance)) { | 2181 | if (likely(!active_balance)) { |
2175 | /* We were unbalanced, so reset the balancing interval */ | 2182 | /* We were unbalanced, so reset the balancing interval */ |
2176 | sd->balance_interval = sd->min_interval; | 2183 | sd->balance_interval = sd->min_interval; |
2177 | } else { | 2184 | } else { |
2178 | /* | 2185 | /* |
2179 | * If we've begun active balancing, start to back off. This | 2186 | * If we've begun active balancing, start to back off. This |
2180 | * case may not be covered by the all_pinned logic if there | 2187 | * case may not be covered by the all_pinned logic if there |
2181 | * is only 1 task on the busy runqueue (because we don't call | 2188 | * is only 1 task on the busy runqueue (because we don't call |
2182 | * move_tasks). | 2189 | * move_tasks). |
2183 | */ | 2190 | */ |
2184 | if (sd->balance_interval < sd->max_interval) | 2191 | if (sd->balance_interval < sd->max_interval) |
2185 | sd->balance_interval *= 2; | 2192 | sd->balance_interval *= 2; |
2186 | } | 2193 | } |
2187 | 2194 | ||
2188 | if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER) | 2195 | if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER) |
2189 | return -1; | 2196 | return -1; |
2190 | return nr_moved; | 2197 | return nr_moved; |
2191 | 2198 | ||
2192 | out_balanced: | 2199 | out_balanced: |
2193 | schedstat_inc(sd, lb_balanced[idle]); | 2200 | schedstat_inc(sd, lb_balanced[idle]); |
2194 | 2201 | ||
2195 | sd->nr_balance_failed = 0; | 2202 | sd->nr_balance_failed = 0; |
2196 | 2203 | ||
2197 | out_one_pinned: | 2204 | out_one_pinned: |
2198 | /* tune up the balancing interval */ | 2205 | /* tune up the balancing interval */ |
2199 | if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || | 2206 | if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || |
2200 | (sd->balance_interval < sd->max_interval)) | 2207 | (sd->balance_interval < sd->max_interval)) |
2201 | sd->balance_interval *= 2; | 2208 | sd->balance_interval *= 2; |
2202 | 2209 | ||
2203 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) | 2210 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) |
2204 | return -1; | 2211 | return -1; |
2205 | return 0; | 2212 | return 0; |
2206 | } | 2213 | } |
2207 | 2214 | ||
2208 | /* | 2215 | /* |
2209 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 2216 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
2210 | * tasks if there is an imbalance. | 2217 | * tasks if there is an imbalance. |
2211 | * | 2218 | * |
2212 | * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). | 2219 | * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). |
2213 | * this_rq is locked. | 2220 | * this_rq is locked. |
2214 | */ | 2221 | */ |
2215 | static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | 2222 | static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, |
2216 | struct sched_domain *sd) | 2223 | struct sched_domain *sd) |
2217 | { | 2224 | { |
2218 | struct sched_group *group; | 2225 | struct sched_group *group; |
2219 | runqueue_t *busiest = NULL; | 2226 | runqueue_t *busiest = NULL; |
2220 | unsigned long imbalance; | 2227 | unsigned long imbalance; |
2221 | int nr_moved = 0; | 2228 | int nr_moved = 0; |
2222 | int sd_idle = 0; | 2229 | int sd_idle = 0; |
2223 | 2230 | ||
2224 | if (sd->flags & SD_SHARE_CPUPOWER) | 2231 | if (sd->flags & SD_SHARE_CPUPOWER) |
2225 | sd_idle = 1; | 2232 | sd_idle = 1; |
2226 | 2233 | ||
2227 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); | 2234 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); |
2228 | group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle); | 2235 | group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle); |
2229 | if (!group) { | 2236 | if (!group) { |
2230 | schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); | 2237 | schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); |
2231 | goto out_balanced; | 2238 | goto out_balanced; |
2232 | } | 2239 | } |
2233 | 2240 | ||
2234 | busiest = find_busiest_queue(group, NEWLY_IDLE); | 2241 | busiest = find_busiest_queue(group, NEWLY_IDLE); |
2235 | if (!busiest) { | 2242 | if (!busiest) { |
2236 | schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); | 2243 | schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); |
2237 | goto out_balanced; | 2244 | goto out_balanced; |
2238 | } | 2245 | } |
2239 | 2246 | ||
2240 | BUG_ON(busiest == this_rq); | 2247 | BUG_ON(busiest == this_rq); |
2241 | 2248 | ||
2242 | schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); | 2249 | schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); |
2243 | 2250 | ||
2244 | nr_moved = 0; | 2251 | nr_moved = 0; |
2245 | if (busiest->nr_running > 1) { | 2252 | if (busiest->nr_running > 1) { |
2246 | /* Attempt to move tasks */ | 2253 | /* Attempt to move tasks */ |
2247 | double_lock_balance(this_rq, busiest); | 2254 | double_lock_balance(this_rq, busiest); |
2248 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2255 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
2249 | imbalance, sd, NEWLY_IDLE, NULL); | 2256 | imbalance, sd, NEWLY_IDLE, NULL); |
2250 | spin_unlock(&busiest->lock); | 2257 | spin_unlock(&busiest->lock); |
2251 | } | 2258 | } |
2252 | 2259 | ||
2253 | if (!nr_moved) { | 2260 | if (!nr_moved) { |
2254 | schedstat_inc(sd, lb_failed[NEWLY_IDLE]); | 2261 | schedstat_inc(sd, lb_failed[NEWLY_IDLE]); |
2255 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) | 2262 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) |
2256 | return -1; | 2263 | return -1; |
2257 | } else | 2264 | } else |
2258 | sd->nr_balance_failed = 0; | 2265 | sd->nr_balance_failed = 0; |
2259 | 2266 | ||
2260 | return nr_moved; | 2267 | return nr_moved; |
2261 | 2268 | ||
2262 | out_balanced: | 2269 | out_balanced: |
2263 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); | 2270 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); |
2264 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) | 2271 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) |
2265 | return -1; | 2272 | return -1; |
2266 | sd->nr_balance_failed = 0; | 2273 | sd->nr_balance_failed = 0; |
2267 | return 0; | 2274 | return 0; |
2268 | } | 2275 | } |
2269 | 2276 | ||
2270 | /* | 2277 | /* |
2271 | * idle_balance is called by schedule() if this_cpu is about to become | 2278 | * idle_balance is called by schedule() if this_cpu is about to become |
2272 | * idle. Attempts to pull tasks from other CPUs. | 2279 | * idle. Attempts to pull tasks from other CPUs. |
2273 | */ | 2280 | */ |
2274 | static void idle_balance(int this_cpu, runqueue_t *this_rq) | 2281 | static void idle_balance(int this_cpu, runqueue_t *this_rq) |
2275 | { | 2282 | { |
2276 | struct sched_domain *sd; | 2283 | struct sched_domain *sd; |
2277 | 2284 | ||
2278 | for_each_domain(this_cpu, sd) { | 2285 | for_each_domain(this_cpu, sd) { |
2279 | if (sd->flags & SD_BALANCE_NEWIDLE) { | 2286 | if (sd->flags & SD_BALANCE_NEWIDLE) { |
2280 | if (load_balance_newidle(this_cpu, this_rq, sd)) { | 2287 | if (load_balance_newidle(this_cpu, this_rq, sd)) { |
2281 | /* We've pulled tasks over so stop searching */ | 2288 | /* We've pulled tasks over so stop searching */ |
2282 | break; | 2289 | break; |
2283 | } | 2290 | } |
2284 | } | 2291 | } |
2285 | } | 2292 | } |
2286 | } | 2293 | } |
2287 | 2294 | ||
2288 | /* | 2295 | /* |
2289 | * active_load_balance is run by migration threads. It pushes running tasks | 2296 | * active_load_balance is run by migration threads. It pushes running tasks |
2290 | * off the busiest CPU onto idle CPUs. It requires at least 1 task to be | 2297 | * off the busiest CPU onto idle CPUs. It requires at least 1 task to be |
2291 | * running on each physical CPU where possible, and avoids physical / | 2298 | * running on each physical CPU where possible, and avoids physical / |
2292 | * logical imbalances. | 2299 | * logical imbalances. |
2293 | * | 2300 | * |
2294 | * Called with busiest_rq locked. | 2301 | * Called with busiest_rq locked. |
2295 | */ | 2302 | */ |
2296 | static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) | 2303 | static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) |
2297 | { | 2304 | { |
2298 | struct sched_domain *sd; | 2305 | struct sched_domain *sd; |
2299 | runqueue_t *target_rq; | 2306 | runqueue_t *target_rq; |
2300 | int target_cpu = busiest_rq->push_cpu; | 2307 | int target_cpu = busiest_rq->push_cpu; |
2301 | 2308 | ||
2302 | if (busiest_rq->nr_running <= 1) | 2309 | if (busiest_rq->nr_running <= 1) |
2303 | /* no task to move */ | 2310 | /* no task to move */ |
2304 | return; | 2311 | return; |
2305 | 2312 | ||
2306 | target_rq = cpu_rq(target_cpu); | 2313 | target_rq = cpu_rq(target_cpu); |
2307 | 2314 | ||
2308 | /* | 2315 | /* |
2309 | * This condition is "impossible", if it occurs | 2316 | * This condition is "impossible", if it occurs |
2310 | * we need to fix it. Originally reported by | 2317 | * we need to fix it. Originally reported by |
2311 | * Bjorn Helgaas on a 128-cpu setup. | 2318 | * Bjorn Helgaas on a 128-cpu setup. |
2312 | */ | 2319 | */ |
2313 | BUG_ON(busiest_rq == target_rq); | 2320 | BUG_ON(busiest_rq == target_rq); |
2314 | 2321 | ||
2315 | /* move a task from busiest_rq to target_rq */ | 2322 | /* move a task from busiest_rq to target_rq */ |
2316 | double_lock_balance(busiest_rq, target_rq); | 2323 | double_lock_balance(busiest_rq, target_rq); |
2317 | 2324 | ||
2318 | /* Search for an sd spanning us and the target CPU. */ | 2325 | /* Search for an sd spanning us and the target CPU. */ |
2319 | for_each_domain(target_cpu, sd) | 2326 | for_each_domain(target_cpu, sd) |
2320 | if ((sd->flags & SD_LOAD_BALANCE) && | 2327 | if ((sd->flags & SD_LOAD_BALANCE) && |
2321 | cpu_isset(busiest_cpu, sd->span)) | 2328 | cpu_isset(busiest_cpu, sd->span)) |
2322 | break; | 2329 | break; |
2323 | 2330 | ||
2324 | if (unlikely(sd == NULL)) | 2331 | if (unlikely(sd == NULL)) |
2325 | goto out; | 2332 | goto out; |
2326 | 2333 | ||
2327 | schedstat_inc(sd, alb_cnt); | 2334 | schedstat_inc(sd, alb_cnt); |
2328 | 2335 | ||
2329 | if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) | 2336 | if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) |
2330 | schedstat_inc(sd, alb_pushed); | 2337 | schedstat_inc(sd, alb_pushed); |
2331 | else | 2338 | else |
2332 | schedstat_inc(sd, alb_failed); | 2339 | schedstat_inc(sd, alb_failed); |
2333 | out: | 2340 | out: |
2334 | spin_unlock(&target_rq->lock); | 2341 | spin_unlock(&target_rq->lock); |
2335 | } | 2342 | } |
2336 | 2343 | ||
2337 | /* | 2344 | /* |
2338 | * rebalance_tick will get called every timer tick, on every CPU. | 2345 | * rebalance_tick will get called every timer tick, on every CPU. |
2339 | * | 2346 | * |
2340 | * It checks each scheduling domain to see if it is due to be balanced, | 2347 | * It checks each scheduling domain to see if it is due to be balanced, |
2341 | * and initiates a balancing operation if so. | 2348 | * and initiates a balancing operation if so. |
2342 | * | 2349 | * |
2343 | * Balancing parameters are set up in arch_init_sched_domains. | 2350 | * Balancing parameters are set up in arch_init_sched_domains. |
2344 | */ | 2351 | */ |
2345 | 2352 | ||
2346 | /* Don't have all balancing operations going off at once */ | 2353 | /* Don't have all balancing operations going off at once */ |
2347 | #define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) | 2354 | #define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) |
2348 | 2355 | ||
2349 | static void rebalance_tick(int this_cpu, runqueue_t *this_rq, | 2356 | static void rebalance_tick(int this_cpu, runqueue_t *this_rq, |
2350 | enum idle_type idle) | 2357 | enum idle_type idle) |
2351 | { | 2358 | { |
2352 | unsigned long old_load, this_load; | 2359 | unsigned long old_load, this_load; |
2353 | unsigned long j = jiffies + CPU_OFFSET(this_cpu); | 2360 | unsigned long j = jiffies + CPU_OFFSET(this_cpu); |
2354 | struct sched_domain *sd; | 2361 | struct sched_domain *sd; |
2355 | int i; | 2362 | int i; |
2356 | 2363 | ||
2357 | this_load = this_rq->nr_running * SCHED_LOAD_SCALE; | 2364 | this_load = this_rq->nr_running * SCHED_LOAD_SCALE; |
2358 | /* Update our load */ | 2365 | /* Update our load */ |
2359 | for (i = 0; i < 3; i++) { | 2366 | for (i = 0; i < 3; i++) { |
2360 | unsigned long new_load = this_load; | 2367 | unsigned long new_load = this_load; |
2361 | int scale = 1 << i; | 2368 | int scale = 1 << i; |
2362 | old_load = this_rq->cpu_load[i]; | 2369 | old_load = this_rq->cpu_load[i]; |
2363 | /* | 2370 | /* |
2364 | * Round up the averaging division if load is increasing. This | 2371 | * Round up the averaging division if load is increasing. This |
2365 | * prevents us from getting stuck on 9 if the load is 10, for | 2372 | * prevents us from getting stuck on 9 if the load is 10, for |
2366 | * example. | 2373 | * example. |
2367 | */ | 2374 | */ |
2368 | if (new_load > old_load) | 2375 | if (new_load > old_load) |
2369 | new_load += scale-1; | 2376 | new_load += scale-1; |
2370 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; | 2377 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; |
2371 | } | 2378 | } |
2372 | 2379 | ||
2373 | for_each_domain(this_cpu, sd) { | 2380 | for_each_domain(this_cpu, sd) { |
2374 | unsigned long interval; | 2381 | unsigned long interval; |
2375 | 2382 | ||
2376 | if (!(sd->flags & SD_LOAD_BALANCE)) | 2383 | if (!(sd->flags & SD_LOAD_BALANCE)) |
2377 | continue; | 2384 | continue; |
2378 | 2385 | ||
2379 | interval = sd->balance_interval; | 2386 | interval = sd->balance_interval; |
2380 | if (idle != SCHED_IDLE) | 2387 | if (idle != SCHED_IDLE) |
2381 | interval *= sd->busy_factor; | 2388 | interval *= sd->busy_factor; |
2382 | 2389 | ||
2383 | /* scale ms to jiffies */ | 2390 | /* scale ms to jiffies */ |
2384 | interval = msecs_to_jiffies(interval); | 2391 | interval = msecs_to_jiffies(interval); |
2385 | if (unlikely(!interval)) | 2392 | if (unlikely(!interval)) |
2386 | interval = 1; | 2393 | interval = 1; |
2387 | 2394 | ||
2388 | if (j - sd->last_balance >= interval) { | 2395 | if (j - sd->last_balance >= interval) { |
2389 | if (load_balance(this_cpu, this_rq, sd, idle)) { | 2396 | if (load_balance(this_cpu, this_rq, sd, idle)) { |
2390 | /* | 2397 | /* |
2391 | * We've pulled tasks over so either we're no | 2398 | * We've pulled tasks over so either we're no |
2392 | * longer idle, or one of our SMT siblings is | 2399 | * longer idle, or one of our SMT siblings is |
2393 | * not idle. | 2400 | * not idle. |
2394 | */ | 2401 | */ |
2395 | idle = NOT_IDLE; | 2402 | idle = NOT_IDLE; |
2396 | } | 2403 | } |
2397 | sd->last_balance += interval; | 2404 | sd->last_balance += interval; |
2398 | } | 2405 | } |
2399 | } | 2406 | } |
2400 | } | 2407 | } |
2401 | #else | 2408 | #else |
2402 | /* | 2409 | /* |
2403 | * on UP we do not need to balance between CPUs: | 2410 | * on UP we do not need to balance between CPUs: |
2404 | */ | 2411 | */ |
2405 | static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) | 2412 | static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) |
2406 | { | 2413 | { |
2407 | } | 2414 | } |
2408 | static inline void idle_balance(int cpu, runqueue_t *rq) | 2415 | static inline void idle_balance(int cpu, runqueue_t *rq) |
2409 | { | 2416 | { |
2410 | } | 2417 | } |
2411 | #endif | 2418 | #endif |
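The cpu_load[] update inside rebalance_tick() above is a simple decaying average, with the division rounded up while load is rising. A user-space sketch with made-up numbers shows why the round-up matters: without it, a constant load of 10 fed into the slowest average would stall at 9.

        #include <stdio.h>

        static unsigned long decay(unsigned long old_load, unsigned long this_load, int i)
        {
                unsigned long new_load = this_load;
                int scale = 1 << i;

                /* round up while the load is rising, as in rebalance_tick() */
                if (new_load > old_load)
                        new_load += scale - 1;
                return (old_load * (scale - 1) + new_load) / scale;
        }

        int main(void)
        {
                unsigned long load = 0;
                int tick;

                /* feed a constant load of 10 into the slowest average (i == 2) */
                for (tick = 1; tick <= 8; tick++) {
                        load = decay(load, 10, 2);
                        printf("tick %d: cpu_load[2] = %lu\n", tick, load);
                }
                return 0;
        }

The sketch climbs 3, 5, 7, 8, 9 and then reaches 10; dropping the round-up leaves it stuck one below the true load.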
2412 | 2419 | ||
2413 | static inline int wake_priority_sleeper(runqueue_t *rq) | 2420 | static inline int wake_priority_sleeper(runqueue_t *rq) |
2414 | { | 2421 | { |
2415 | int ret = 0; | 2422 | int ret = 0; |
2416 | #ifdef CONFIG_SCHED_SMT | 2423 | #ifdef CONFIG_SCHED_SMT |
2417 | spin_lock(&rq->lock); | 2424 | spin_lock(&rq->lock); |
2418 | /* | 2425 | /* |
2419 | * If an SMT sibling task has been put to sleep for priority | 2426 | * If an SMT sibling task has been put to sleep for priority |
2420 | * reasons reschedule the idle task to see if it can now run. | 2427 | * reasons reschedule the idle task to see if it can now run. |
2421 | */ | 2428 | */ |
2422 | if (rq->nr_running) { | 2429 | if (rq->nr_running) { |
2423 | resched_task(rq->idle); | 2430 | resched_task(rq->idle); |
2424 | ret = 1; | 2431 | ret = 1; |
2425 | } | 2432 | } |
2426 | spin_unlock(&rq->lock); | 2433 | spin_unlock(&rq->lock); |
2427 | #endif | 2434 | #endif |
2428 | return ret; | 2435 | return ret; |
2429 | } | 2436 | } |
2430 | 2437 | ||
2431 | DEFINE_PER_CPU(struct kernel_stat, kstat); | 2438 | DEFINE_PER_CPU(struct kernel_stat, kstat); |
2432 | 2439 | ||
2433 | EXPORT_PER_CPU_SYMBOL(kstat); | 2440 | EXPORT_PER_CPU_SYMBOL(kstat); |
2434 | 2441 | ||
2435 | /* | 2442 | /* |
2436 | * This is called on clock ticks and on context switches. | 2443 | * This is called on clock ticks and on context switches. |
2437 | * Bank in p->sched_time the ns elapsed since the last tick or switch. | 2444 | * Bank in p->sched_time the ns elapsed since the last tick or switch. |
2438 | */ | 2445 | */ |
2439 | static inline void update_cpu_clock(task_t *p, runqueue_t *rq, | 2446 | static inline void update_cpu_clock(task_t *p, runqueue_t *rq, |
2440 | unsigned long long now) | 2447 | unsigned long long now) |
2441 | { | 2448 | { |
2442 | unsigned long long last = max(p->timestamp, rq->timestamp_last_tick); | 2449 | unsigned long long last = max(p->timestamp, rq->timestamp_last_tick); |
2443 | p->sched_time += now - last; | 2450 | p->sched_time += now - last; |
2444 | } | 2451 | } |
2445 | 2452 | ||
2446 | /* | 2453 | /* |
2447 | * Return current->sched_time plus any more ns on the sched_clock | 2454 | * Return current->sched_time plus any more ns on the sched_clock |
2448 | * that have not yet been banked. | 2455 | * that have not yet been banked. |
2449 | */ | 2456 | */ |
2450 | unsigned long long current_sched_time(const task_t *tsk) | 2457 | unsigned long long current_sched_time(const task_t *tsk) |
2451 | { | 2458 | { |
2452 | unsigned long long ns; | 2459 | unsigned long long ns; |
2453 | unsigned long flags; | 2460 | unsigned long flags; |
2454 | local_irq_save(flags); | 2461 | local_irq_save(flags); |
2455 | ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick); | 2462 | ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick); |
2456 | ns = tsk->sched_time + (sched_clock() - ns); | 2463 | ns = tsk->sched_time + (sched_clock() - ns); |
2457 | local_irq_restore(flags); | 2464 | local_irq_restore(flags); |
2458 | return ns; | 2465 | return ns; |
2459 | } | 2466 | } |
2460 | 2467 | ||
2461 | /* | 2468 | /* |
2462 | * We place interactive tasks back into the active array, if possible. | 2469 | * We place interactive tasks back into the active array, if possible. |
2463 | * | 2470 | * |
2464 | * To guarantee that this does not starve expired tasks we ignore the | 2471 | * To guarantee that this does not starve expired tasks we ignore the |
2465 | * interactivity of a task if the first expired task had to wait more | 2472 | * interactivity of a task if the first expired task had to wait more |
2466 | * than a 'reasonable' amount of time. This deadline timeout is | 2473 | * than a 'reasonable' amount of time. This deadline timeout is |
2467 | * load-dependent, as the frequency of array switches decreases with | 2474 | * load-dependent, as the frequency of array switches decreases with |
2468 | * increasing number of running tasks. We also ignore the interactivity | 2475 | * increasing number of running tasks. We also ignore the interactivity |
2469 | * if a better static_prio task has expired: | 2476 | * if a better static_prio task has expired: |
2470 | */ | 2477 | */ |
2471 | #define EXPIRED_STARVING(rq) \ | 2478 | #define EXPIRED_STARVING(rq) \ |
2472 | ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ | 2479 | ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ |
2473 | (jiffies - (rq)->expired_timestamp >= \ | 2480 | (jiffies - (rq)->expired_timestamp >= \ |
2474 | STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ | 2481 | STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ |
2475 | ((rq)->curr->static_prio > (rq)->best_expired_prio)) | 2482 | ((rq)->curr->static_prio > (rq)->best_expired_prio)) |
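As a rough illustration of the deadline in EXPIRED_STARVING(), here is a user-space sketch that keeps only the timestamp clause (the best_expired_prio comparison is dropped for brevity). The HZ and STARVATION_LIMIT values are assumptions for the example, not the kernel's definitions:

        #include <stdio.h>

        #define HZ                1000UL
        #define STARVATION_LIMIT  HZ    /* assumed: roughly one second of jiffies */

        static int expired_starving(unsigned long now, unsigned long expired_timestamp,
                                    unsigned long nr_running)
        {
                if (!STARVATION_LIMIT || !expired_timestamp)
                        return 0;
                /* deadline grows with the number of runnable tasks */
                return now - expired_timestamp >= STARVATION_LIMIT * nr_running + 1;
        }

        int main(void)
        {
                /* waited 4500 jiffies with 4 runnable tasks: 4500 >= 4001 -> starving */
                printf("%d\n", expired_starving(5000, 500, 4));
                return 0;
        }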
2476 | 2483 | ||
2477 | /* | 2484 | /* |
2478 | * Account user cpu time to a process. | 2485 | * Account user cpu time to a process. |
2479 | * @p: the process that the cpu time gets accounted to | 2486 | * @p: the process that the cpu time gets accounted to |
2480 | * @hardirq_offset: the offset to subtract from hardirq_count() | 2487 | * @hardirq_offset: the offset to subtract from hardirq_count() |
2481 | * @cputime: the cpu time spent in user space since the last update | 2488 | * @cputime: the cpu time spent in user space since the last update |
2482 | */ | 2489 | */ |
2483 | void account_user_time(struct task_struct *p, cputime_t cputime) | 2490 | void account_user_time(struct task_struct *p, cputime_t cputime) |
2484 | { | 2491 | { |
2485 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2492 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
2486 | cputime64_t tmp; | 2493 | cputime64_t tmp; |
2487 | 2494 | ||
2488 | p->utime = cputime_add(p->utime, cputime); | 2495 | p->utime = cputime_add(p->utime, cputime); |
2489 | 2496 | ||
2490 | /* Add user time to cpustat. */ | 2497 | /* Add user time to cpustat. */ |
2491 | tmp = cputime_to_cputime64(cputime); | 2498 | tmp = cputime_to_cputime64(cputime); |
2492 | if (TASK_NICE(p) > 0) | 2499 | if (TASK_NICE(p) > 0) |
2493 | cpustat->nice = cputime64_add(cpustat->nice, tmp); | 2500 | cpustat->nice = cputime64_add(cpustat->nice, tmp); |
2494 | else | 2501 | else |
2495 | cpustat->user = cputime64_add(cpustat->user, tmp); | 2502 | cpustat->user = cputime64_add(cpustat->user, tmp); |
2496 | } | 2503 | } |
2497 | 2504 | ||
2498 | /* | 2505 | /* |
2499 | * Account system cpu time to a process. | 2506 | * Account system cpu time to a process. |
2500 | * @p: the process that the cpu time gets accounted to | 2507 | * @p: the process that the cpu time gets accounted to |
2501 | * @hardirq_offset: the offset to subtract from hardirq_count() | 2508 | * @hardirq_offset: the offset to subtract from hardirq_count() |
2502 | * @cputime: the cpu time spent in kernel space since the last update | 2509 | * @cputime: the cpu time spent in kernel space since the last update |
2503 | */ | 2510 | */ |
2504 | void account_system_time(struct task_struct *p, int hardirq_offset, | 2511 | void account_system_time(struct task_struct *p, int hardirq_offset, |
2505 | cputime_t cputime) | 2512 | cputime_t cputime) |
2506 | { | 2513 | { |
2507 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2514 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
2508 | runqueue_t *rq = this_rq(); | 2515 | runqueue_t *rq = this_rq(); |
2509 | cputime64_t tmp; | 2516 | cputime64_t tmp; |
2510 | 2517 | ||
2511 | p->stime = cputime_add(p->stime, cputime); | 2518 | p->stime = cputime_add(p->stime, cputime); |
2512 | 2519 | ||
2513 | /* Add system time to cpustat. */ | 2520 | /* Add system time to cpustat. */ |
2514 | tmp = cputime_to_cputime64(cputime); | 2521 | tmp = cputime_to_cputime64(cputime); |
2515 | if (hardirq_count() - hardirq_offset) | 2522 | if (hardirq_count() - hardirq_offset) |
2516 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | 2523 | cpustat->irq = cputime64_add(cpustat->irq, tmp); |
2517 | else if (softirq_count()) | 2524 | else if (softirq_count()) |
2518 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); | 2525 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); |
2519 | else if (p != rq->idle) | 2526 | else if (p != rq->idle) |
2520 | cpustat->system = cputime64_add(cpustat->system, tmp); | 2527 | cpustat->system = cputime64_add(cpustat->system, tmp); |
2521 | else if (atomic_read(&rq->nr_iowait) > 0) | 2528 | else if (atomic_read(&rq->nr_iowait) > 0) |
2522 | cpustat->iowait = cputime64_add(cpustat->iowait, tmp); | 2529 | cpustat->iowait = cputime64_add(cpustat->iowait, tmp); |
2523 | else | 2530 | else |
2524 | cpustat->idle = cputime64_add(cpustat->idle, tmp); | 2531 | cpustat->idle = cputime64_add(cpustat->idle, tmp); |
2525 | /* Account for system time used */ | 2532 | /* Account for system time used */ |
2526 | acct_update_integrals(p); | 2533 | acct_update_integrals(p); |
2527 | } | 2534 | } |
2528 | 2535 | ||
2529 | /* | 2536 | /* |
2530 | * Account for involuntary wait time. | 2537 | * Account for involuntary wait time. |
2531 | * @p: the process from which the cpu time has been stolen | 2538 | * @p: the process from which the cpu time has been stolen |
2532 | * @steal: the cpu time spent in involuntary wait | 2539 | * @steal: the cpu time spent in involuntary wait |
2533 | */ | 2540 | */ |
2534 | void account_steal_time(struct task_struct *p, cputime_t steal) | 2541 | void account_steal_time(struct task_struct *p, cputime_t steal) |
2535 | { | 2542 | { |
2536 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2543 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
2537 | cputime64_t tmp = cputime_to_cputime64(steal); | 2544 | cputime64_t tmp = cputime_to_cputime64(steal); |
2538 | runqueue_t *rq = this_rq(); | 2545 | runqueue_t *rq = this_rq(); |
2539 | 2546 | ||
2540 | if (p == rq->idle) { | 2547 | if (p == rq->idle) { |
2541 | p->stime = cputime_add(p->stime, steal); | 2548 | p->stime = cputime_add(p->stime, steal); |
2542 | if (atomic_read(&rq->nr_iowait) > 0) | 2549 | if (atomic_read(&rq->nr_iowait) > 0) |
2543 | cpustat->iowait = cputime64_add(cpustat->iowait, tmp); | 2550 | cpustat->iowait = cputime64_add(cpustat->iowait, tmp); |
2544 | else | 2551 | else |
2545 | cpustat->idle = cputime64_add(cpustat->idle, tmp); | 2552 | cpustat->idle = cputime64_add(cpustat->idle, tmp); |
2546 | } else | 2553 | } else |
2547 | cpustat->steal = cputime64_add(cpustat->steal, tmp); | 2554 | cpustat->steal = cputime64_add(cpustat->steal, tmp); |
2548 | } | 2555 | } |
2549 | 2556 | ||
2550 | /* | 2557 | /* |
2551 | * This function gets called by the timer code, with HZ frequency. | 2558 | * This function gets called by the timer code, with HZ frequency. |
2552 | * We call it with interrupts disabled. | 2559 | * We call it with interrupts disabled. |
2553 | * | 2560 | * |
2554 | * It also gets called by the fork code, when changing the parent's | 2561 | * It also gets called by the fork code, when changing the parent's |
2555 | * timeslices. | 2562 | * timeslices. |
2556 | */ | 2563 | */ |
2557 | void scheduler_tick(void) | 2564 | void scheduler_tick(void) |
2558 | { | 2565 | { |
2559 | int cpu = smp_processor_id(); | 2566 | int cpu = smp_processor_id(); |
2560 | runqueue_t *rq = this_rq(); | 2567 | runqueue_t *rq = this_rq(); |
2561 | task_t *p = current; | 2568 | task_t *p = current; |
2562 | unsigned long long now = sched_clock(); | 2569 | unsigned long long now = sched_clock(); |
2563 | 2570 | ||
2564 | update_cpu_clock(p, rq, now); | 2571 | update_cpu_clock(p, rq, now); |
2565 | 2572 | ||
2566 | rq->timestamp_last_tick = now; | 2573 | rq->timestamp_last_tick = now; |
2567 | 2574 | ||
2568 | if (p == rq->idle) { | 2575 | if (p == rq->idle) { |
2569 | if (wake_priority_sleeper(rq)) | 2576 | if (wake_priority_sleeper(rq)) |
2570 | goto out; | 2577 | goto out; |
2571 | rebalance_tick(cpu, rq, SCHED_IDLE); | 2578 | rebalance_tick(cpu, rq, SCHED_IDLE); |
2572 | return; | 2579 | return; |
2573 | } | 2580 | } |
2574 | 2581 | ||
2575 | /* Task might have expired already, but not scheduled off yet */ | 2582 | /* Task might have expired already, but not scheduled off yet */ |
2576 | if (p->array != rq->active) { | 2583 | if (p->array != rq->active) { |
2577 | set_tsk_need_resched(p); | 2584 | set_tsk_need_resched(p); |
2578 | goto out; | 2585 | goto out; |
2579 | } | 2586 | } |
2580 | spin_lock(&rq->lock); | 2587 | spin_lock(&rq->lock); |
2581 | /* | 2588 | /* |
2582 | * The task was running during this tick - update the | 2589 | * The task was running during this tick - update the |
2583 | * time slice counter. Note: we do not update a thread's | 2590 | * time slice counter. Note: we do not update a thread's |
2584 | * priority until it either goes to sleep or uses up its | 2591 | * priority until it either goes to sleep or uses up its |
2585 | * timeslice. This makes it possible for interactive tasks | 2592 | * timeslice. This makes it possible for interactive tasks |
2586 | * to use up their timeslices at their highest priority levels. | 2593 | * to use up their timeslices at their highest priority levels. |
2587 | */ | 2594 | */ |
2588 | if (rt_task(p)) { | 2595 | if (rt_task(p)) { |
2589 | /* | 2596 | /* |
2590 | * RR tasks need a special form of timeslice management. | 2597 | * RR tasks need a special form of timeslice management. |
2591 | * FIFO tasks have no timeslices. | 2598 | * FIFO tasks have no timeslices. |
2592 | */ | 2599 | */ |
2593 | if ((p->policy == SCHED_RR) && !--p->time_slice) { | 2600 | if ((p->policy == SCHED_RR) && !--p->time_slice) { |
2594 | p->time_slice = task_timeslice(p); | 2601 | p->time_slice = task_timeslice(p); |
2595 | p->first_time_slice = 0; | 2602 | p->first_time_slice = 0; |
2596 | set_tsk_need_resched(p); | 2603 | set_tsk_need_resched(p); |
2597 | 2604 | ||
2598 | /* put it at the end of the queue: */ | 2605 | /* put it at the end of the queue: */ |
2599 | requeue_task(p, rq->active); | 2606 | requeue_task(p, rq->active); |
2600 | } | 2607 | } |
2601 | goto out_unlock; | 2608 | goto out_unlock; |
2602 | } | 2609 | } |
2603 | if (!--p->time_slice) { | 2610 | if (!--p->time_slice) { |
2604 | dequeue_task(p, rq->active); | 2611 | dequeue_task(p, rq->active); |
2605 | set_tsk_need_resched(p); | 2612 | set_tsk_need_resched(p); |
2606 | p->prio = effective_prio(p); | 2613 | p->prio = effective_prio(p); |
2607 | p->time_slice = task_timeslice(p); | 2614 | p->time_slice = task_timeslice(p); |
2608 | p->first_time_slice = 0; | 2615 | p->first_time_slice = 0; |
2609 | 2616 | ||
2610 | if (!rq->expired_timestamp) | 2617 | if (!rq->expired_timestamp) |
2611 | rq->expired_timestamp = jiffies; | 2618 | rq->expired_timestamp = jiffies; |
2612 | if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { | 2619 | if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { |
2613 | enqueue_task(p, rq->expired); | 2620 | enqueue_task(p, rq->expired); |
2614 | if (p->static_prio < rq->best_expired_prio) | 2621 | if (p->static_prio < rq->best_expired_prio) |
2615 | rq->best_expired_prio = p->static_prio; | 2622 | rq->best_expired_prio = p->static_prio; |
2616 | } else | 2623 | } else |
2617 | enqueue_task(p, rq->active); | 2624 | enqueue_task(p, rq->active); |
2618 | } else { | 2625 | } else { |
2619 | /* | 2626 | /* |
2620 | * Prevent a too long timeslice allowing a task to monopolize | 2627 | * Prevent a too long timeslice allowing a task to monopolize |
2621 | * the CPU. We do this by splitting up the timeslice into | 2628 | * the CPU. We do this by splitting up the timeslice into |
2622 | * smaller pieces. | 2629 | * smaller pieces. |
2623 | * | 2630 | * |
2624 | * Note: this does not mean the task's timeslices expire or | 2631 | * Note: this does not mean the task's timeslices expire or |
2625 | * get lost in any way, they just might be preempted by | 2632 | * get lost in any way, they just might be preempted by |
2626 | * another task of equal priority. (one with higher | 2633 | * another task of equal priority. (one with higher |
2627 | * priority would have preempted this task already.) We | 2634 | * priority would have preempted this task already.) We |
2628 | * requeue this task to the end of the list on this priority | 2635 | * requeue this task to the end of the list on this priority |
2629 | * level, which is in essence a round-robin of tasks with | 2636 | * level, which is in essence a round-robin of tasks with |
2630 | * equal priority. | 2637 | * equal priority. |
2631 | * | 2638 | * |
2632 | * This only applies to tasks in the interactive | 2639 | * This only applies to tasks in the interactive |
2633 | * delta range with at least TIMESLICE_GRANULARITY to requeue. | 2640 | * delta range with at least TIMESLICE_GRANULARITY to requeue. |
2634 | */ | 2641 | */ |
2635 | if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - | 2642 | if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - |
2636 | p->time_slice) % TIMESLICE_GRANULARITY(p)) && | 2643 | p->time_slice) % TIMESLICE_GRANULARITY(p)) && |
2637 | (p->time_slice >= TIMESLICE_GRANULARITY(p)) && | 2644 | (p->time_slice >= TIMESLICE_GRANULARITY(p)) && |
2638 | (p->array == rq->active)) { | 2645 | (p->array == rq->active)) { |
2639 | 2646 | ||
2640 | requeue_task(p, rq->active); | 2647 | requeue_task(p, rq->active); |
2641 | set_tsk_need_resched(p); | 2648 | set_tsk_need_resched(p); |
2642 | } | 2649 | } |
2643 | } | 2650 | } |
2644 | out_unlock: | 2651 | out_unlock: |
2645 | spin_unlock(&rq->lock); | 2652 | spin_unlock(&rq->lock); |
2646 | out: | 2653 | out: |
2647 | rebalance_tick(cpu, rq, NOT_IDLE); | 2654 | rebalance_tick(cpu, rq, NOT_IDLE); |
2648 | } | 2655 | } |
2649 | 2656 | ||
2650 | #ifdef CONFIG_SCHED_SMT | 2657 | #ifdef CONFIG_SCHED_SMT |
2651 | static inline void wakeup_busy_runqueue(runqueue_t *rq) | 2658 | static inline void wakeup_busy_runqueue(runqueue_t *rq) |
2652 | { | 2659 | { |
2653 | /* If an SMT runqueue is sleeping due to priority reasons wake it up */ | 2660 | /* If an SMT runqueue is sleeping due to priority reasons wake it up */ |
2654 | if (rq->curr == rq->idle && rq->nr_running) | 2661 | if (rq->curr == rq->idle && rq->nr_running) |
2655 | resched_task(rq->idle); | 2662 | resched_task(rq->idle); |
2656 | } | 2663 | } |
2657 | 2664 | ||
2658 | static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | 2665 | static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) |
2659 | { | 2666 | { |
2660 | struct sched_domain *tmp, *sd = NULL; | 2667 | struct sched_domain *tmp, *sd = NULL; |
2661 | cpumask_t sibling_map; | 2668 | cpumask_t sibling_map; |
2662 | int i; | 2669 | int i; |
2663 | 2670 | ||
2664 | for_each_domain(this_cpu, tmp) | 2671 | for_each_domain(this_cpu, tmp) |
2665 | if (tmp->flags & SD_SHARE_CPUPOWER) | 2672 | if (tmp->flags & SD_SHARE_CPUPOWER) |
2666 | sd = tmp; | 2673 | sd = tmp; |
2667 | 2674 | ||
2668 | if (!sd) | 2675 | if (!sd) |
2669 | return; | 2676 | return; |
2670 | 2677 | ||
2671 | /* | 2678 | /* |
2672 | * Unlock the current runqueue because we have to lock in | 2679 | * Unlock the current runqueue because we have to lock in |
2673 | * CPU order to avoid deadlocks. Caller knows that we might | 2680 | * CPU order to avoid deadlocks. Caller knows that we might |
2674 | * unlock. We keep IRQs disabled. | 2681 | * unlock. We keep IRQs disabled. |
2675 | */ | 2682 | */ |
2676 | spin_unlock(&this_rq->lock); | 2683 | spin_unlock(&this_rq->lock); |
2677 | 2684 | ||
2678 | sibling_map = sd->span; | 2685 | sibling_map = sd->span; |
2679 | 2686 | ||
2680 | for_each_cpu_mask(i, sibling_map) | 2687 | for_each_cpu_mask(i, sibling_map) |
2681 | spin_lock(&cpu_rq(i)->lock); | 2688 | spin_lock(&cpu_rq(i)->lock); |
2682 | /* | 2689 | /* |
2683 | * We clear this CPU from the mask. This both simplifies the | 2690 | * We clear this CPU from the mask. This both simplifies the |
2684 | * inner loop and keeps this_rq locked when we exit: | 2691 | * inner loop and keeps this_rq locked when we exit: |
2685 | */ | 2692 | */ |
2686 | cpu_clear(this_cpu, sibling_map); | 2693 | cpu_clear(this_cpu, sibling_map); |
2687 | 2694 | ||
2688 | for_each_cpu_mask(i, sibling_map) { | 2695 | for_each_cpu_mask(i, sibling_map) { |
2689 | runqueue_t *smt_rq = cpu_rq(i); | 2696 | runqueue_t *smt_rq = cpu_rq(i); |
2690 | 2697 | ||
2691 | wakeup_busy_runqueue(smt_rq); | 2698 | wakeup_busy_runqueue(smt_rq); |
2692 | } | 2699 | } |
2693 | 2700 | ||
2694 | for_each_cpu_mask(i, sibling_map) | 2701 | for_each_cpu_mask(i, sibling_map) |
2695 | spin_unlock(&cpu_rq(i)->lock); | 2702 | spin_unlock(&cpu_rq(i)->lock); |
2696 | /* | 2703 | /* |
2697 | * We exit with this_cpu's rq still held and IRQs | 2704 | * We exit with this_cpu's rq still held and IRQs |
2698 | * still disabled: | 2705 | * still disabled: |
2699 | */ | 2706 | */ |
2700 | } | 2707 | } |
2701 | 2708 | ||
2702 | /* | 2709 | /* |
2703 | * number of 'lost' timeslices this task won't be able to fully | 2710 | * number of 'lost' timeslices this task won't be able to fully |
2704 | * utilize, if another task runs on a sibling. This models the | 2711 | * utilize, if another task runs on a sibling. This models the |
2705 | * slowdown effect of other tasks running on siblings: | 2712 | * slowdown effect of other tasks running on siblings: |
2706 | */ | 2713 | */ |
2707 | static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) | 2714 | static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) |
2708 | { | 2715 | { |
2709 | return p->time_slice * (100 - sd->per_cpu_gain) / 100; | 2716 | return p->time_slice * (100 - sd->per_cpu_gain) / 100; |
2710 | } | 2717 | } |
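smt_slice() above is just a percentage cut of a task's timeslice. A tiny user-space sketch with assumed numbers (per_cpu_gain of 25 is only an example value):

        #include <stdio.h>

        /* same arithmetic as smt_slice() above, with example inputs */
        static unsigned long smt_slice(unsigned long time_slice,
                                       unsigned long per_cpu_gain)
        {
                return time_slice * (100 - per_cpu_gain) / 100;
        }

        int main(void)
        {
                /* a 100 ms timeslice next to a busy sibling, per_cpu_gain = 25 */
                printf("effective slice: %lu ms\n", smt_slice(100, 25));  /* 75 ms */
                return 0;
        }

dependent_sleeper() below compares this shrunken slice of the sibling's current task against task_timeslice() of the task that wants to run, and delays the lower-priority one while the sibling still holds the larger share.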
2711 | 2718 | ||
2712 | static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | 2719 | static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) |
2713 | { | 2720 | { |
2714 | struct sched_domain *tmp, *sd = NULL; | 2721 | struct sched_domain *tmp, *sd = NULL; |
2715 | cpumask_t sibling_map; | 2722 | cpumask_t sibling_map; |
2716 | prio_array_t *array; | 2723 | prio_array_t *array; |
2717 | int ret = 0, i; | 2724 | int ret = 0, i; |
2718 | task_t *p; | 2725 | task_t *p; |
2719 | 2726 | ||
2720 | for_each_domain(this_cpu, tmp) | 2727 | for_each_domain(this_cpu, tmp) |
2721 | if (tmp->flags & SD_SHARE_CPUPOWER) | 2728 | if (tmp->flags & SD_SHARE_CPUPOWER) |
2722 | sd = tmp; | 2729 | sd = tmp; |
2723 | 2730 | ||
2724 | if (!sd) | 2731 | if (!sd) |
2725 | return 0; | 2732 | return 0; |
2726 | 2733 | ||
2727 | /* | 2734 | /* |
2728 | * The same locking rules and details apply as for | 2735 | * The same locking rules and details apply as for |
2729 | * wake_sleeping_dependent(): | 2736 | * wake_sleeping_dependent(): |
2730 | */ | 2737 | */ |
2731 | spin_unlock(&this_rq->lock); | 2738 | spin_unlock(&this_rq->lock); |
2732 | sibling_map = sd->span; | 2739 | sibling_map = sd->span; |
2733 | for_each_cpu_mask(i, sibling_map) | 2740 | for_each_cpu_mask(i, sibling_map) |
2734 | spin_lock(&cpu_rq(i)->lock); | 2741 | spin_lock(&cpu_rq(i)->lock); |
2735 | cpu_clear(this_cpu, sibling_map); | 2742 | cpu_clear(this_cpu, sibling_map); |
2736 | 2743 | ||
2737 | /* | 2744 | /* |
2738 | * Establish next task to be run - it might have gone away because | 2745 | * Establish next task to be run - it might have gone away because |
2739 | * we released the runqueue lock above: | 2746 | * we released the runqueue lock above: |
2740 | */ | 2747 | */ |
2741 | if (!this_rq->nr_running) | 2748 | if (!this_rq->nr_running) |
2742 | goto out_unlock; | 2749 | goto out_unlock; |
2743 | array = this_rq->active; | 2750 | array = this_rq->active; |
2744 | if (!array->nr_active) | 2751 | if (!array->nr_active) |
2745 | array = this_rq->expired; | 2752 | array = this_rq->expired; |
2746 | BUG_ON(!array->nr_active); | 2753 | BUG_ON(!array->nr_active); |
2747 | 2754 | ||
2748 | p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, | 2755 | p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, |
2749 | task_t, run_list); | 2756 | task_t, run_list); |
2750 | 2757 | ||
2751 | for_each_cpu_mask(i, sibling_map) { | 2758 | for_each_cpu_mask(i, sibling_map) { |
2752 | runqueue_t *smt_rq = cpu_rq(i); | 2759 | runqueue_t *smt_rq = cpu_rq(i); |
2753 | task_t *smt_curr = smt_rq->curr; | 2760 | task_t *smt_curr = smt_rq->curr; |
2754 | 2761 | ||
2755 | /* Kernel threads do not participate in dependent sleeping */ | 2762 | /* Kernel threads do not participate in dependent sleeping */ |
2756 | if (!p->mm || !smt_curr->mm || rt_task(p)) | 2763 | if (!p->mm || !smt_curr->mm || rt_task(p)) |
2757 | goto check_smt_task; | 2764 | goto check_smt_task; |
2758 | 2765 | ||
2759 | /* | 2766 | /* |
2760 | * If a user task with lower static priority than the | 2767 | * If a user task with lower static priority than the |
2761 | * running task on the SMT sibling is trying to schedule, | 2768 | * running task on the SMT sibling is trying to schedule, |
2762 | * delay it till there is proportionately less timeslice | 2769 | * delay it till there is proportionately less timeslice |
2763 | * left of the sibling task to prevent a lower priority | 2770 | * left of the sibling task to prevent a lower priority |
2764 | * task from using an unfair proportion of the | 2771 | * task from using an unfair proportion of the |
2765 | * physical cpu's resources. -ck | 2772 | * physical cpu's resources. -ck |
2766 | */ | 2773 | */ |
2767 | if (rt_task(smt_curr)) { | 2774 | if (rt_task(smt_curr)) { |
2768 | /* | 2775 | /* |
2769 | * With real time tasks we run non-rt tasks only | 2776 | * With real time tasks we run non-rt tasks only |
2770 | * per_cpu_gain% of the time. | 2777 | * per_cpu_gain% of the time. |
2771 | */ | 2778 | */ |
2772 | if ((jiffies % DEF_TIMESLICE) > | 2779 | if ((jiffies % DEF_TIMESLICE) > |
2773 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) | 2780 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) |
2774 | ret = 1; | 2781 | ret = 1; |
2775 | } else | 2782 | } else |
2776 | if (smt_curr->static_prio < p->static_prio && | 2783 | if (smt_curr->static_prio < p->static_prio && |
2777 | !TASK_PREEMPTS_CURR(p, smt_rq) && | 2784 | !TASK_PREEMPTS_CURR(p, smt_rq) && |
2778 | smt_slice(smt_curr, sd) > task_timeslice(p)) | 2785 | smt_slice(smt_curr, sd) > task_timeslice(p)) |
2779 | ret = 1; | 2786 | ret = 1; |
2780 | 2787 | ||
2781 | check_smt_task: | 2788 | check_smt_task: |
2782 | if ((!smt_curr->mm && smt_curr != smt_rq->idle) || | 2789 | if ((!smt_curr->mm && smt_curr != smt_rq->idle) || |
2783 | rt_task(smt_curr)) | 2790 | rt_task(smt_curr)) |
2784 | continue; | 2791 | continue; |
2785 | if (!p->mm) { | 2792 | if (!p->mm) { |
2786 | wakeup_busy_runqueue(smt_rq); | 2793 | wakeup_busy_runqueue(smt_rq); |
2787 | continue; | 2794 | continue; |
2788 | } | 2795 | } |
2789 | 2796 | ||
2790 | /* | 2797 | /* |
2791 | * Reschedule a lower priority task on the SMT sibling for | 2798 | * Reschedule a lower priority task on the SMT sibling for |
2792 | * it to be put to sleep, or wake it up if it has been put to | 2799 | * it to be put to sleep, or wake it up if it has been put to |
2793 | * sleep for priority reasons to see if it should run now. | 2800 | * sleep for priority reasons to see if it should run now. |
2794 | */ | 2801 | */ |
2795 | if (rt_task(p)) { | 2802 | if (rt_task(p)) { |
2796 | if ((jiffies % DEF_TIMESLICE) > | 2803 | if ((jiffies % DEF_TIMESLICE) > |
2797 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) | 2804 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) |
2798 | resched_task(smt_curr); | 2805 | resched_task(smt_curr); |
2799 | } else { | 2806 | } else { |
2800 | if (TASK_PREEMPTS_CURR(p, smt_rq) && | 2807 | if (TASK_PREEMPTS_CURR(p, smt_rq) && |
2801 | smt_slice(p, sd) > task_timeslice(smt_curr)) | 2808 | smt_slice(p, sd) > task_timeslice(smt_curr)) |
2802 | resched_task(smt_curr); | 2809 | resched_task(smt_curr); |
2803 | else | 2810 | else |
2804 | wakeup_busy_runqueue(smt_rq); | 2811 | wakeup_busy_runqueue(smt_rq); |
2805 | } | 2812 | } |
2806 | } | 2813 | } |
2807 | out_unlock: | 2814 | out_unlock: |
2808 | for_each_cpu_mask(i, sibling_map) | 2815 | for_each_cpu_mask(i, sibling_map) |
2809 | spin_unlock(&cpu_rq(i)->lock); | 2816 | spin_unlock(&cpu_rq(i)->lock); |
2810 | return ret; | 2817 | return ret; |
2811 | } | 2818 | } |
2812 | #else | 2819 | #else |
2813 | static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | 2820 | static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) |
2814 | { | 2821 | { |
2815 | } | 2822 | } |
2816 | 2823 | ||
2817 | static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | 2824 | static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) |
2818 | { | 2825 | { |
2819 | return 0; | 2826 | return 0; |
2820 | } | 2827 | } |
2821 | #endif | 2828 | #endif |
2822 | 2829 | ||
2823 | #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) | 2830 | #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) |
2824 | 2831 | ||
2825 | void fastcall add_preempt_count(int val) | 2832 | void fastcall add_preempt_count(int val) |
2826 | { | 2833 | { |
2827 | /* | 2834 | /* |
2828 | * Underflow? | 2835 | * Underflow? |
2829 | */ | 2836 | */ |
2830 | BUG_ON((preempt_count() < 0)); | 2837 | BUG_ON((preempt_count() < 0)); |
2831 | preempt_count() += val; | 2838 | preempt_count() += val; |
2832 | /* | 2839 | /* |
2833 | * Spinlock count overflowing soon? | 2840 | * Spinlock count overflowing soon? |
2834 | */ | 2841 | */ |
2835 | BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); | 2842 | BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); |
2836 | } | 2843 | } |
2837 | EXPORT_SYMBOL(add_preempt_count); | 2844 | EXPORT_SYMBOL(add_preempt_count); |
2838 | 2845 | ||
2839 | void fastcall sub_preempt_count(int val) | 2846 | void fastcall sub_preempt_count(int val) |
2840 | { | 2847 | { |
2841 | /* | 2848 | /* |
2842 | * Underflow? | 2849 | * Underflow? |
2843 | */ | 2850 | */ |
2844 | BUG_ON(val > preempt_count()); | 2851 | BUG_ON(val > preempt_count()); |
2845 | /* | 2852 | /* |
2846 | * Is the spinlock portion underflowing? | 2853 | * Is the spinlock portion underflowing? |
2847 | */ | 2854 | */ |
2848 | BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); | 2855 | BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); |
2849 | preempt_count() -= val; | 2856 | preempt_count() -= val; |
2850 | } | 2857 | } |
2851 | EXPORT_SYMBOL(sub_preempt_count); | 2858 | EXPORT_SYMBOL(sub_preempt_count); |
2852 | 2859 | ||
2853 | #endif | 2860 | #endif |
2854 | 2861 | ||
2855 | /* | 2862 | /* |
2856 | * schedule() is the main scheduler function. | 2863 | * schedule() is the main scheduler function. |
2857 | */ | 2864 | */ |
2858 | asmlinkage void __sched schedule(void) | 2865 | asmlinkage void __sched schedule(void) |
2859 | { | 2866 | { |
2860 | long *switch_count; | 2867 | long *switch_count; |
2861 | task_t *prev, *next; | 2868 | task_t *prev, *next; |
2862 | runqueue_t *rq; | 2869 | runqueue_t *rq; |
2863 | prio_array_t *array; | 2870 | prio_array_t *array; |
2864 | struct list_head *queue; | 2871 | struct list_head *queue; |
2865 | unsigned long long now; | 2872 | unsigned long long now; |
2866 | unsigned long run_time; | 2873 | unsigned long run_time; |
2867 | int cpu, idx, new_prio; | 2874 | int cpu, idx, new_prio; |
2868 | 2875 | ||
2869 | /* | 2876 | /* |
2870 | * Test if we are atomic. Since do_exit() needs to call into | 2877 | * Test if we are atomic. Since do_exit() needs to call into |
2871 | * schedule() atomically, we ignore that path for now. | 2878 | * schedule() atomically, we ignore that path for now. |
2872 | * Otherwise, whine if we are scheduling when we should not be. | 2879 | * Otherwise, whine if we are scheduling when we should not be. |
2873 | */ | 2880 | */ |
2874 | if (likely(!current->exit_state)) { | 2881 | if (likely(!current->exit_state)) { |
2875 | if (unlikely(in_atomic())) { | 2882 | if (unlikely(in_atomic())) { |
2876 | printk(KERN_ERR "BUG: scheduling while atomic: " | 2883 | printk(KERN_ERR "BUG: scheduling while atomic: " |
2877 | "%s/0x%08x/%d\n", | 2884 | "%s/0x%08x/%d\n", |
2878 | current->comm, preempt_count(), current->pid); | 2885 | current->comm, preempt_count(), current->pid); |
2879 | dump_stack(); | 2886 | dump_stack(); |
2880 | } | 2887 | } |
2881 | } | 2888 | } |
2882 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 2889 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
2883 | 2890 | ||
2884 | need_resched: | 2891 | need_resched: |
2885 | preempt_disable(); | 2892 | preempt_disable(); |
2886 | prev = current; | 2893 | prev = current; |
2887 | release_kernel_lock(prev); | 2894 | release_kernel_lock(prev); |
2888 | need_resched_nonpreemptible: | 2895 | need_resched_nonpreemptible: |
2889 | rq = this_rq(); | 2896 | rq = this_rq(); |
2890 | 2897 | ||
2891 | /* | 2898 | /* |
2892 | * The idle thread is not allowed to schedule! | 2899 | * The idle thread is not allowed to schedule! |
2893 | * Remove this check after it has been exercised a bit. | 2900 | * Remove this check after it has been exercised a bit. |
2894 | */ | 2901 | */ |
2895 | if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) { | 2902 | if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) { |
2896 | printk(KERN_ERR "bad: scheduling from the idle thread!\n"); | 2903 | printk(KERN_ERR "bad: scheduling from the idle thread!\n"); |
2897 | dump_stack(); | 2904 | dump_stack(); |
2898 | } | 2905 | } |
2899 | 2906 | ||
2900 | schedstat_inc(rq, sched_cnt); | 2907 | schedstat_inc(rq, sched_cnt); |
2901 | now = sched_clock(); | 2908 | now = sched_clock(); |
2902 | if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { | 2909 | if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { |
2903 | run_time = now - prev->timestamp; | 2910 | run_time = now - prev->timestamp; |
2904 | if (unlikely((long long)(now - prev->timestamp) < 0)) | 2911 | if (unlikely((long long)(now - prev->timestamp) < 0)) |
2905 | run_time = 0; | 2912 | run_time = 0; |
2906 | } else | 2913 | } else |
2907 | run_time = NS_MAX_SLEEP_AVG; | 2914 | run_time = NS_MAX_SLEEP_AVG; |
2908 | 2915 | ||
2909 | /* | 2916 | /* |
2910 | * Tasks are charged proportionately less run_time at high sleep_avg to | 2917 | * Tasks are charged proportionately less run_time at high sleep_avg to |
2911 | * delay the loss of their interactive status | 2918 | * delay the loss of their interactive status |
2912 | */ | 2919 | */ |
2913 | run_time /= (CURRENT_BONUS(prev) ? : 1); | 2920 | run_time /= (CURRENT_BONUS(prev) ? : 1); |
2914 | 2921 | ||
2915 | spin_lock_irq(&rq->lock); | 2922 | spin_lock_irq(&rq->lock); |
2916 | 2923 | ||
2917 | if (unlikely(prev->flags & PF_DEAD)) | 2924 | if (unlikely(prev->flags & PF_DEAD)) |
2918 | prev->state = EXIT_DEAD; | 2925 | prev->state = EXIT_DEAD; |
2919 | 2926 | ||
2920 | switch_count = &prev->nivcsw; | 2927 | switch_count = &prev->nivcsw; |
2921 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 2928 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
2922 | switch_count = &prev->nvcsw; | 2929 | switch_count = &prev->nvcsw; |
2923 | if (unlikely((prev->state & TASK_INTERRUPTIBLE) && | 2930 | if (unlikely((prev->state & TASK_INTERRUPTIBLE) && |
2924 | unlikely(signal_pending(prev)))) | 2931 | unlikely(signal_pending(prev)))) |
2925 | prev->state = TASK_RUNNING; | 2932 | prev->state = TASK_RUNNING; |
2926 | else { | 2933 | else { |
2927 | if (prev->state == TASK_UNINTERRUPTIBLE) | 2934 | if (prev->state == TASK_UNINTERRUPTIBLE) |
2928 | rq->nr_uninterruptible++; | 2935 | rq->nr_uninterruptible++; |
2929 | deactivate_task(prev, rq); | 2936 | deactivate_task(prev, rq); |
2930 | } | 2937 | } |
2931 | } | 2938 | } |
2932 | 2939 | ||
2933 | cpu = smp_processor_id(); | 2940 | cpu = smp_processor_id(); |
2934 | if (unlikely(!rq->nr_running)) { | 2941 | if (unlikely(!rq->nr_running)) { |
2935 | go_idle: | 2942 | go_idle: |
2936 | idle_balance(cpu, rq); | 2943 | idle_balance(cpu, rq); |
2937 | if (!rq->nr_running) { | 2944 | if (!rq->nr_running) { |
2938 | next = rq->idle; | 2945 | next = rq->idle; |
2939 | rq->expired_timestamp = 0; | 2946 | rq->expired_timestamp = 0; |
2940 | wake_sleeping_dependent(cpu, rq); | 2947 | wake_sleeping_dependent(cpu, rq); |
2941 | /* | 2948 | /* |
2942 | * wake_sleeping_dependent() might have released | 2949 | * wake_sleeping_dependent() might have released |
2943 | * the runqueue, so break out if we got new | 2950 | * the runqueue, so break out if we got new |
2944 | * tasks meanwhile: | 2951 | * tasks meanwhile: |
2945 | */ | 2952 | */ |
2946 | if (!rq->nr_running) | 2953 | if (!rq->nr_running) |
2947 | goto switch_tasks; | 2954 | goto switch_tasks; |
2948 | } | 2955 | } |
2949 | } else { | 2956 | } else { |
2950 | if (dependent_sleeper(cpu, rq)) { | 2957 | if (dependent_sleeper(cpu, rq)) { |
2951 | next = rq->idle; | 2958 | next = rq->idle; |
2952 | goto switch_tasks; | 2959 | goto switch_tasks; |
2953 | } | 2960 | } |
2954 | /* | 2961 | /* |
2955 | * dependent_sleeper() releases and reacquires the runqueue | 2962 | * dependent_sleeper() releases and reacquires the runqueue |
2956 | * lock, hence go into the idle loop if the rq went | 2963 | * lock, hence go into the idle loop if the rq went |
2957 | * empty meanwhile: | 2964 | * empty meanwhile: |
2958 | */ | 2965 | */ |
2959 | if (unlikely(!rq->nr_running)) | 2966 | if (unlikely(!rq->nr_running)) |
2960 | goto go_idle; | 2967 | goto go_idle; |
2961 | } | 2968 | } |
2962 | 2969 | ||
2963 | array = rq->active; | 2970 | array = rq->active; |
2964 | if (unlikely(!array->nr_active)) { | 2971 | if (unlikely(!array->nr_active)) { |
2965 | /* | 2972 | /* |
2966 | * Switch the active and expired arrays. | 2973 | * Switch the active and expired arrays. |
2967 | */ | 2974 | */ |
2968 | schedstat_inc(rq, sched_switch); | 2975 | schedstat_inc(rq, sched_switch); |
2969 | rq->active = rq->expired; | 2976 | rq->active = rq->expired; |
2970 | rq->expired = array; | 2977 | rq->expired = array; |
2971 | array = rq->active; | 2978 | array = rq->active; |
2972 | rq->expired_timestamp = 0; | 2979 | rq->expired_timestamp = 0; |
2973 | rq->best_expired_prio = MAX_PRIO; | 2980 | rq->best_expired_prio = MAX_PRIO; |
2974 | } | 2981 | } |
2975 | 2982 | ||
2976 | idx = sched_find_first_bit(array->bitmap); | 2983 | idx = sched_find_first_bit(array->bitmap); |
2977 | queue = array->queue + idx; | 2984 | queue = array->queue + idx; |
2978 | next = list_entry(queue->next, task_t, run_list); | 2985 | next = list_entry(queue->next, task_t, run_list); |
2979 | 2986 | ||
2980 | if (!rt_task(next) && next->activated > 0) { | 2987 | if (!rt_task(next) && next->activated > 0) { |
2981 | unsigned long long delta = now - next->timestamp; | 2988 | unsigned long long delta = now - next->timestamp; |
2982 | if (unlikely((long long)(now - next->timestamp) < 0)) | 2989 | if (unlikely((long long)(now - next->timestamp) < 0)) |
2983 | delta = 0; | 2990 | delta = 0; |
2984 | 2991 | ||
2985 | if (next->activated == 1) | 2992 | if (next->activated == 1) |
2986 | delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; | 2993 | delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; |
2987 | 2994 | ||
2988 | array = next->array; | 2995 | array = next->array; |
2989 | new_prio = recalc_task_prio(next, next->timestamp + delta); | 2996 | new_prio = recalc_task_prio(next, next->timestamp + delta); |
2990 | 2997 | ||
2991 | if (unlikely(next->prio != new_prio)) { | 2998 | if (unlikely(next->prio != new_prio)) { |
2992 | dequeue_task(next, array); | 2999 | dequeue_task(next, array); |
2993 | next->prio = new_prio; | 3000 | next->prio = new_prio; |
2994 | enqueue_task(next, array); | 3001 | enqueue_task(next, array); |
2995 | } else | 3002 | } else |
2996 | requeue_task(next, array); | 3003 | requeue_task(next, array); |
2997 | } | 3004 | } |
2998 | next->activated = 0; | 3005 | next->activated = 0; |
2999 | switch_tasks: | 3006 | switch_tasks: |
3000 | if (next == rq->idle) | 3007 | if (next == rq->idle) |
3001 | schedstat_inc(rq, sched_goidle); | 3008 | schedstat_inc(rq, sched_goidle); |
3002 | prefetch(next); | 3009 | prefetch(next); |
3003 | prefetch_stack(next); | 3010 | prefetch_stack(next); |
3004 | clear_tsk_need_resched(prev); | 3011 | clear_tsk_need_resched(prev); |
3005 | rcu_qsctr_inc(task_cpu(prev)); | 3012 | rcu_qsctr_inc(task_cpu(prev)); |
3006 | 3013 | ||
3007 | update_cpu_clock(prev, rq, now); | 3014 | update_cpu_clock(prev, rq, now); |
3008 | 3015 | ||
3009 | prev->sleep_avg -= run_time; | 3016 | prev->sleep_avg -= run_time; |
3010 | if ((long)prev->sleep_avg <= 0) | 3017 | if ((long)prev->sleep_avg <= 0) |
3011 | prev->sleep_avg = 0; | 3018 | prev->sleep_avg = 0; |
3012 | prev->timestamp = prev->last_ran = now; | 3019 | prev->timestamp = prev->last_ran = now; |
3013 | 3020 | ||
3014 | sched_info_switch(prev, next); | 3021 | sched_info_switch(prev, next); |
3015 | if (likely(prev != next)) { | 3022 | if (likely(prev != next)) { |
3016 | next->timestamp = now; | 3023 | next->timestamp = now; |
3017 | rq->nr_switches++; | 3024 | rq->nr_switches++; |
3018 | rq->curr = next; | 3025 | rq->curr = next; |
3019 | ++*switch_count; | 3026 | ++*switch_count; |
3020 | 3027 | ||
3021 | prepare_task_switch(rq, next); | 3028 | prepare_task_switch(rq, next); |
3022 | prev = context_switch(rq, prev, next); | 3029 | prev = context_switch(rq, prev, next); |
3023 | barrier(); | 3030 | barrier(); |
3024 | /* | 3031 | /* |
3025 | * this_rq must be evaluated again because prev may have moved | 3032 | * this_rq must be evaluated again because prev may have moved |
3026 | * CPUs since it called schedule(), thus the 'rq' on its stack | 3033 | * CPUs since it called schedule(), thus the 'rq' on its stack |
3027 | * frame will be invalid. | 3034 | * frame will be invalid. |
3028 | */ | 3035 | */ |
3029 | finish_task_switch(this_rq(), prev); | 3036 | finish_task_switch(this_rq(), prev); |
3030 | } else | 3037 | } else |
3031 | spin_unlock_irq(&rq->lock); | 3038 | spin_unlock_irq(&rq->lock); |
3032 | 3039 | ||
3033 | prev = current; | 3040 | prev = current; |
3034 | if (unlikely(reacquire_kernel_lock(prev) < 0)) | 3041 | if (unlikely(reacquire_kernel_lock(prev) < 0)) |
3035 | goto need_resched_nonpreemptible; | 3042 | goto need_resched_nonpreemptible; |
3036 | preempt_enable_no_resched(); | 3043 | preempt_enable_no_resched(); |
3037 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3044 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) |
3038 | goto need_resched; | 3045 | goto need_resched; |
3039 | } | 3046 | } |
3040 | 3047 | ||
3041 | EXPORT_SYMBOL(schedule); | 3048 | EXPORT_SYMBOL(schedule); |
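
The heart of the pick in schedule() is the pair "idx = sched_find_first_bit(array->bitmap); queue = array->queue + idx;": one run-list per priority level plus a bitmap of non-empty levels makes selecting the next task a constant-time find-first-set, independent of how many tasks are runnable. A standalone sketch of that lookup (MAX_PRIO matches this kernel; the helper below is illustrative and is not the kernel's sched_find_first_bit()):

#include <stdio.h>
#include <string.h>

#define MAX_PRIO 140
#define BITS_PER_LONG (8 * sizeof(unsigned long))
#define BITMAP_WORDS ((MAX_PRIO + BITS_PER_LONG - 1) / BITS_PER_LONG)

/* Find the lowest set bit, i.e. the highest-priority non-empty queue. */
static int find_first_set(const unsigned long *bitmap)
{
	for (size_t w = 0; w < BITMAP_WORDS; w++)
		if (bitmap[w])
			return w * BITS_PER_LONG + __builtin_ctzl(bitmap[w]);
	return MAX_PRIO;	/* nothing runnable */
}

int main(void)
{
	unsigned long bitmap[BITMAP_WORDS];

	memset(bitmap, 0, sizeof(bitmap));
	/* Mark priority levels 130 and 115 as having runnable tasks. */
	bitmap[130 / BITS_PER_LONG] |= 1UL << (130 % BITS_PER_LONG);
	bitmap[115 / BITS_PER_LONG] |= 1UL << (115 % BITS_PER_LONG);

	/* Lower index means higher priority, so level 115 wins. */
	printf("next priority level: %d\n", find_first_set(bitmap));
	return 0;
}
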
3042 | 3049 | ||
3043 | #ifdef CONFIG_PREEMPT | 3050 | #ifdef CONFIG_PREEMPT |
3044 | /* | 3051 | /* |
3045 | * this is the entry point to schedule() from in-kernel preemption | 3052 | * this is the entry point to schedule() from in-kernel preemption |
3046 | * off of preempt_enable. Kernel preemptions off a return from interrupt | 3053 | * off of preempt_enable. Kernel preemptions off a return from interrupt |
3047 | * occur there and call schedule() directly. | 3054 | * occur there and call schedule() directly. |
3048 | */ | 3055 | */ |
3049 | asmlinkage void __sched preempt_schedule(void) | 3056 | asmlinkage void __sched preempt_schedule(void) |
3050 | { | 3057 | { |
3051 | struct thread_info *ti = current_thread_info(); | 3058 | struct thread_info *ti = current_thread_info(); |
3052 | #ifdef CONFIG_PREEMPT_BKL | 3059 | #ifdef CONFIG_PREEMPT_BKL |
3053 | struct task_struct *task = current; | 3060 | struct task_struct *task = current; |
3054 | int saved_lock_depth; | 3061 | int saved_lock_depth; |
3055 | #endif | 3062 | #endif |
3056 | /* | 3063 | /* |
3057 | * If there is a non-zero preempt_count or interrupts are disabled, | 3064 | * If there is a non-zero preempt_count or interrupts are disabled, |
3058 | * we do not want to preempt the current task. Just return.. | 3065 | * we do not want to preempt the current task. Just return.. |
3059 | */ | 3066 | */ |
3060 | if (unlikely(ti->preempt_count || irqs_disabled())) | 3067 | if (unlikely(ti->preempt_count || irqs_disabled())) |
3061 | return; | 3068 | return; |
3062 | 3069 | ||
3063 | need_resched: | 3070 | need_resched: |
3064 | add_preempt_count(PREEMPT_ACTIVE); | 3071 | add_preempt_count(PREEMPT_ACTIVE); |
3065 | /* | 3072 | /* |
3066 | * We keep the big kernel semaphore locked, but we | 3073 | * We keep the big kernel semaphore locked, but we |
3067 | * clear ->lock_depth so that schedule() doesn't | 3074 | * clear ->lock_depth so that schedule() doesn't |
3068 | * auto-release the semaphore: | 3075 | * auto-release the semaphore: |
3069 | */ | 3076 | */ |
3070 | #ifdef CONFIG_PREEMPT_BKL | 3077 | #ifdef CONFIG_PREEMPT_BKL |
3071 | saved_lock_depth = task->lock_depth; | 3078 | saved_lock_depth = task->lock_depth; |
3072 | task->lock_depth = -1; | 3079 | task->lock_depth = -1; |
3073 | #endif | 3080 | #endif |
3074 | schedule(); | 3081 | schedule(); |
3075 | #ifdef CONFIG_PREEMPT_BKL | 3082 | #ifdef CONFIG_PREEMPT_BKL |
3076 | task->lock_depth = saved_lock_depth; | 3083 | task->lock_depth = saved_lock_depth; |
3077 | #endif | 3084 | #endif |
3078 | sub_preempt_count(PREEMPT_ACTIVE); | 3085 | sub_preempt_count(PREEMPT_ACTIVE); |
3079 | 3086 | ||
3080 | /* we could miss a preemption opportunity between schedule and now */ | 3087 | /* we could miss a preemption opportunity between schedule and now */ |
3081 | barrier(); | 3088 | barrier(); |
3082 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3089 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) |
3083 | goto need_resched; | 3090 | goto need_resched; |
3084 | } | 3091 | } |
3085 | 3092 | ||
3086 | EXPORT_SYMBOL(preempt_schedule); | 3093 | EXPORT_SYMBOL(preempt_schedule); |
3087 | 3094 | ||
3088 | /* | 3095 | /* |
3089 | * this is the entry point to schedule() from kernel preemption | 3096 | * this is the entry point to schedule() from kernel preemption |
3090 | * off of irq context. | 3097 | * off of irq context. |
3091 | * Note that this is called and returns with irqs disabled. This | 3098 | * Note that this is called and returns with irqs disabled. This |
3092 | * protects us against recursive calls from irq context. | 3099 | * protects us against recursive calls from irq context. |
3093 | */ | 3100 | */ |
3094 | asmlinkage void __sched preempt_schedule_irq(void) | 3101 | asmlinkage void __sched preempt_schedule_irq(void) |
3095 | { | 3102 | { |
3096 | struct thread_info *ti = current_thread_info(); | 3103 | struct thread_info *ti = current_thread_info(); |
3097 | #ifdef CONFIG_PREEMPT_BKL | 3104 | #ifdef CONFIG_PREEMPT_BKL |
3098 | struct task_struct *task = current; | 3105 | struct task_struct *task = current; |
3099 | int saved_lock_depth; | 3106 | int saved_lock_depth; |
3100 | #endif | 3107 | #endif |
3101 | /* Catch callers which need to be fixed */ | 3108 | /* Catch callers which need to be fixed */ |
3102 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 3109 | BUG_ON(ti->preempt_count || !irqs_disabled()); |
3103 | 3110 | ||
3104 | need_resched: | 3111 | need_resched: |
3105 | add_preempt_count(PREEMPT_ACTIVE); | 3112 | add_preempt_count(PREEMPT_ACTIVE); |
3106 | /* | 3113 | /* |
3107 | * We keep the big kernel semaphore locked, but we | 3114 | * We keep the big kernel semaphore locked, but we |
3108 | * clear ->lock_depth so that schedule() doesn't | 3115 | * clear ->lock_depth so that schedule() doesn't |
3109 | * auto-release the semaphore: | 3116 | * auto-release the semaphore: |
3110 | */ | 3117 | */ |
3111 | #ifdef CONFIG_PREEMPT_BKL | 3118 | #ifdef CONFIG_PREEMPT_BKL |
3112 | saved_lock_depth = task->lock_depth; | 3119 | saved_lock_depth = task->lock_depth; |
3113 | task->lock_depth = -1; | 3120 | task->lock_depth = -1; |
3114 | #endif | 3121 | #endif |
3115 | local_irq_enable(); | 3122 | local_irq_enable(); |
3116 | schedule(); | 3123 | schedule(); |
3117 | local_irq_disable(); | 3124 | local_irq_disable(); |
3118 | #ifdef CONFIG_PREEMPT_BKL | 3125 | #ifdef CONFIG_PREEMPT_BKL |
3119 | task->lock_depth = saved_lock_depth; | 3126 | task->lock_depth = saved_lock_depth; |
3120 | #endif | 3127 | #endif |
3121 | sub_preempt_count(PREEMPT_ACTIVE); | 3128 | sub_preempt_count(PREEMPT_ACTIVE); |
3122 | 3129 | ||
3123 | /* we could miss a preemption opportunity between schedule and now */ | 3130 | /* we could miss a preemption opportunity between schedule and now */ |
3124 | barrier(); | 3131 | barrier(); |
3125 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3132 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) |
3126 | goto need_resched; | 3133 | goto need_resched; |
3127 | } | 3134 | } |
3128 | 3135 | ||
3129 | #endif /* CONFIG_PREEMPT */ | 3136 | #endif /* CONFIG_PREEMPT */ |
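
preempt_schedule() is rarely called by name: it is reached from preempt_enable(), which drops the counter and then checks TIF_NEED_RESCHED. The standalone model below mirrors that control flow; the *_model names and simplified bodies are mine, only the structure follows the functions above and include/linux/preempt.h of this era.

#include <stdbool.h>
#include <stdio.h>

#define PREEMPT_ACTIVE_MODEL 0x10000000   /* stands in for PREEMPT_ACTIVE   */

static int  preempt_count_model;          /* stands in for preempt_count()  */
static bool need_resched_model;           /* stands in for TIF_NEED_RESCHED */

static void schedule_model(void)          /* stand-in for schedule() */
{
	need_resched_model = false;
	printf("schedule()\n");
}

static void preempt_schedule_model(void)
{
	if (preempt_count_model)          /* still atomic: just return */
		return;
	preempt_count_model += PREEMPT_ACTIVE_MODEL;
	schedule_model();
	preempt_count_model -= PREEMPT_ACTIVE_MODEL;
}

static void preempt_enable_model(void)
{
	--preempt_count_model;            /* preempt_enable_no_resched() */
	if (need_resched_model)           /* preempt_check_resched()     */
		preempt_schedule_model();
}

int main(void)
{
	preempt_count_model = 1;          /* inside a preempt_disable() section */
	need_resched_model  = true;       /* a wakeup has requested a resched   */
	preempt_enable_model();           /* count hits 0 -> schedule() runs    */
	return 0;
}
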
3130 | 3137 | ||
3131 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, | 3138 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, |
3132 | void *key) | 3139 | void *key) |
3133 | { | 3140 | { |
3134 | task_t *p = curr->private; | 3141 | task_t *p = curr->private; |
3135 | return try_to_wake_up(p, mode, sync); | 3142 | return try_to_wake_up(p, mode, sync); |
3136 | } | 3143 | } |
3137 | 3144 | ||
3138 | EXPORT_SYMBOL(default_wake_function); | 3145 | EXPORT_SYMBOL(default_wake_function); |
3139 | 3146 | ||
3140 | /* | 3147 | /* |
3141 | * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just | 3148 | * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just |
3142 | * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve | 3149 | * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve |
3143 | * number) then we wake all the non-exclusive tasks and one exclusive task. | 3150 | * number) then we wake all the non-exclusive tasks and one exclusive task. |
3144 | * | 3151 | * |
3145 | * There are circumstances in which we can try to wake a task which has already | 3152 | * There are circumstances in which we can try to wake a task which has already |
3146 | * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns | 3153 | * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns |
3147 | * zero in this (rare) case, and we handle it by continuing to scan the queue. | 3154 | * zero in this (rare) case, and we handle it by continuing to scan the queue. |
3148 | */ | 3155 | */ |
3149 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | 3156 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, |
3150 | int nr_exclusive, int sync, void *key) | 3157 | int nr_exclusive, int sync, void *key) |
3151 | { | 3158 | { |
3152 | struct list_head *tmp, *next; | 3159 | struct list_head *tmp, *next; |
3153 | 3160 | ||
3154 | list_for_each_safe(tmp, next, &q->task_list) { | 3161 | list_for_each_safe(tmp, next, &q->task_list) { |
3155 | wait_queue_t *curr; | 3162 | wait_queue_t *curr; |
3156 | unsigned flags; | 3163 | unsigned flags; |
3157 | curr = list_entry(tmp, wait_queue_t, task_list); | 3164 | curr = list_entry(tmp, wait_queue_t, task_list); |
3158 | flags = curr->flags; | 3165 | flags = curr->flags; |
3159 | if (curr->func(curr, mode, sync, key) && | 3166 | if (curr->func(curr, mode, sync, key) && |
3160 | (flags & WQ_FLAG_EXCLUSIVE) && | 3167 | (flags & WQ_FLAG_EXCLUSIVE) && |
3161 | !--nr_exclusive) | 3168 | !--nr_exclusive) |
3162 | break; | 3169 | break; |
3163 | } | 3170 | } |
3164 | } | 3171 | } |
3165 | 3172 | ||
3166 | /** | 3173 | /** |
3167 | * __wake_up - wake up threads blocked on a waitqueue. | 3174 | * __wake_up - wake up threads blocked on a waitqueue. |
3168 | * @q: the waitqueue | 3175 | * @q: the waitqueue |
3169 | * @mode: which threads | 3176 | * @mode: which threads |
3170 | * @nr_exclusive: how many wake-one or wake-many threads to wake up | 3177 | * @nr_exclusive: how many wake-one or wake-many threads to wake up |
3171 | * @key: is directly passed to the wakeup function | 3178 | * @key: is directly passed to the wakeup function |
3172 | */ | 3179 | */ |
3173 | void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, | 3180 | void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, |
3174 | int nr_exclusive, void *key) | 3181 | int nr_exclusive, void *key) |
3175 | { | 3182 | { |
3176 | unsigned long flags; | 3183 | unsigned long flags; |
3177 | 3184 | ||
3178 | spin_lock_irqsave(&q->lock, flags); | 3185 | spin_lock_irqsave(&q->lock, flags); |
3179 | __wake_up_common(q, mode, nr_exclusive, 0, key); | 3186 | __wake_up_common(q, mode, nr_exclusive, 0, key); |
3180 | spin_unlock_irqrestore(&q->lock, flags); | 3187 | spin_unlock_irqrestore(&q->lock, flags); |
3181 | } | 3188 | } |
3182 | 3189 | ||
3183 | EXPORT_SYMBOL(__wake_up); | 3190 | EXPORT_SYMBOL(__wake_up); |
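
The usual way code reaches __wake_up() is through the wake_up*() macros, paired with wait_event*() on the sleeping side. A minimal producer/consumer sketch in the kernel style of this era (the queue, flag and function names are made up for illustration):

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(my_waitq);   /* hypothetical wait queue */
static int my_data_ready;                   /* hypothetical condition  */

/* Consumer: sleeps until the condition is true or a signal arrives. */
static int my_consumer(void)
{
	return wait_event_interruptible(my_waitq, my_data_ready);
}

/* Producer: makes the condition true, then wakes the sleepers; the
 * wake_up() macro ends up in __wake_up() above with nr_exclusive == 1. */
static void my_producer(void)
{
	my_data_ready = 1;
	wake_up(&my_waitq);
}
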
3184 | 3191 | ||
3185 | /* | 3192 | /* |
3186 | * Same as __wake_up but called with the spinlock in wait_queue_head_t held. | 3193 | * Same as __wake_up but called with the spinlock in wait_queue_head_t held. |
3187 | */ | 3194 | */ |
3188 | void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) | 3195 | void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) |
3189 | { | 3196 | { |
3190 | __wake_up_common(q, mode, 1, 0, NULL); | 3197 | __wake_up_common(q, mode, 1, 0, NULL); |
3191 | } | 3198 | } |
3192 | 3199 | ||
3193 | /** | 3200 | /** |
3194 | * __wake_up_sync - wake up threads blocked on a waitqueue. | 3201 | * __wake_up_sync - wake up threads blocked on a waitqueue. |
3195 | * @q: the waitqueue | 3202 | * @q: the waitqueue |
3196 | * @mode: which threads | 3203 | * @mode: which threads |
3197 | * @nr_exclusive: how many wake-one or wake-many threads to wake up | 3204 | * @nr_exclusive: how many wake-one or wake-many threads to wake up |
3198 | * | 3205 | * |
3199 | * The sync wakeup differs in that the waker knows that it will schedule | 3206 | * The sync wakeup differs in that the waker knows that it will schedule |
3200 | * away soon, so while the target thread will be woken up, it will not | 3207 | * away soon, so while the target thread will be woken up, it will not |
3201 | * be migrated to another CPU - ie. the two threads are 'synchronized' | 3208 | * be migrated to another CPU - ie. the two threads are 'synchronized' |
3202 | * with each other. This can prevent needless bouncing between CPUs. | 3209 | * with each other. This can prevent needless bouncing between CPUs. |
3203 | * | 3210 | * |
3204 | * On UP it can prevent extra preemption. | 3211 | * On UP it can prevent extra preemption. |
3205 | */ | 3212 | */ |
3206 | void fastcall | 3213 | void fastcall |
3207 | __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) | 3214 | __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) |
3208 | { | 3215 | { |
3209 | unsigned long flags; | 3216 | unsigned long flags; |
3210 | int sync = 1; | 3217 | int sync = 1; |
3211 | 3218 | ||
3212 | if (unlikely(!q)) | 3219 | if (unlikely(!q)) |
3213 | return; | 3220 | return; |
3214 | 3221 | ||
3215 | if (unlikely(!nr_exclusive)) | 3222 | if (unlikely(!nr_exclusive)) |
3216 | sync = 0; | 3223 | sync = 0; |
3217 | 3224 | ||
3218 | spin_lock_irqsave(&q->lock, flags); | 3225 | spin_lock_irqsave(&q->lock, flags); |
3219 | __wake_up_common(q, mode, nr_exclusive, sync, NULL); | 3226 | __wake_up_common(q, mode, nr_exclusive, sync, NULL); |
3220 | spin_unlock_irqrestore(&q->lock, flags); | 3227 | spin_unlock_irqrestore(&q->lock, flags); |
3221 | } | 3228 | } |
3222 | EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ | 3229 | EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ |
3223 | 3230 | ||
3224 | void fastcall complete(struct completion *x) | 3231 | void fastcall complete(struct completion *x) |
3225 | { | 3232 | { |
3226 | unsigned long flags; | 3233 | unsigned long flags; |
3227 | 3234 | ||
3228 | spin_lock_irqsave(&x->wait.lock, flags); | 3235 | spin_lock_irqsave(&x->wait.lock, flags); |
3229 | x->done++; | 3236 | x->done++; |
3230 | __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, | 3237 | __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, |
3231 | 1, 0, NULL); | 3238 | 1, 0, NULL); |
3232 | spin_unlock_irqrestore(&x->wait.lock, flags); | 3239 | spin_unlock_irqrestore(&x->wait.lock, flags); |
3233 | } | 3240 | } |
3234 | EXPORT_SYMBOL(complete); | 3241 | EXPORT_SYMBOL(complete); |
3235 | 3242 | ||
3236 | void fastcall complete_all(struct completion *x) | 3243 | void fastcall complete_all(struct completion *x) |
3237 | { | 3244 | { |
3238 | unsigned long flags; | 3245 | unsigned long flags; |
3239 | 3246 | ||
3240 | spin_lock_irqsave(&x->wait.lock, flags); | 3247 | spin_lock_irqsave(&x->wait.lock, flags); |
3241 | x->done += UINT_MAX/2; | 3248 | x->done += UINT_MAX/2; |
3242 | __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, | 3249 | __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, |
3243 | 0, 0, NULL); | 3250 | 0, 0, NULL); |
3244 | spin_unlock_irqrestore(&x->wait.lock, flags); | 3251 | spin_unlock_irqrestore(&x->wait.lock, flags); |
3245 | } | 3252 | } |
3246 | EXPORT_SYMBOL(complete_all); | 3253 | EXPORT_SYMBOL(complete_all); |
3247 | 3254 | ||
3248 | void fastcall __sched wait_for_completion(struct completion *x) | 3255 | void fastcall __sched wait_for_completion(struct completion *x) |
3249 | { | 3256 | { |
3250 | might_sleep(); | 3257 | might_sleep(); |
3251 | spin_lock_irq(&x->wait.lock); | 3258 | spin_lock_irq(&x->wait.lock); |
3252 | if (!x->done) { | 3259 | if (!x->done) { |
3253 | DECLARE_WAITQUEUE(wait, current); | 3260 | DECLARE_WAITQUEUE(wait, current); |
3254 | 3261 | ||
3255 | wait.flags |= WQ_FLAG_EXCLUSIVE; | 3262 | wait.flags |= WQ_FLAG_EXCLUSIVE; |
3256 | __add_wait_queue_tail(&x->wait, &wait); | 3263 | __add_wait_queue_tail(&x->wait, &wait); |
3257 | do { | 3264 | do { |
3258 | __set_current_state(TASK_UNINTERRUPTIBLE); | 3265 | __set_current_state(TASK_UNINTERRUPTIBLE); |
3259 | spin_unlock_irq(&x->wait.lock); | 3266 | spin_unlock_irq(&x->wait.lock); |
3260 | schedule(); | 3267 | schedule(); |
3261 | spin_lock_irq(&x->wait.lock); | 3268 | spin_lock_irq(&x->wait.lock); |
3262 | } while (!x->done); | 3269 | } while (!x->done); |
3263 | __remove_wait_queue(&x->wait, &wait); | 3270 | __remove_wait_queue(&x->wait, &wait); |
3264 | } | 3271 | } |
3265 | x->done--; | 3272 | x->done--; |
3266 | spin_unlock_irq(&x->wait.lock); | 3273 | spin_unlock_irq(&x->wait.lock); |
3267 | } | 3274 | } |
3268 | EXPORT_SYMBOL(wait_for_completion); | 3275 | EXPORT_SYMBOL(wait_for_completion); |
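
Completions wrap the wait-queue machinery above into a one-shot "wait for this event" primitive. A typical pattern, with illustrative names (the functions below are placeholders, not kernel API):

#include <linux/completion.h>
#include <linux/sched.h>

static DECLARE_COMPLETION(my_setup_done);   /* hypothetical event */

/* Runs in some other context (a kernel thread, an interrupt handler)
 * once whatever it stands for has finished.                          */
static void my_setup_finished(void)
{
	complete(&my_setup_done);           /* x->done++, wake one waiter */
}

/* Blocks TASK_UNINTERRUPTIBLE until complete() has run at least once;
 * each wait_for_completion() consumes one ->done count.              */
static void my_wait_for_setup(void)
{
	wait_for_completion(&my_setup_done);
}
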
3269 | 3276 | ||
3270 | unsigned long fastcall __sched | 3277 | unsigned long fastcall __sched |
3271 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) | 3278 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) |
3272 | { | 3279 | { |
3273 | might_sleep(); | 3280 | might_sleep(); |
3274 | 3281 | ||
3275 | spin_lock_irq(&x->wait.lock); | 3282 | spin_lock_irq(&x->wait.lock); |
3276 | if (!x->done) { | 3283 | if (!x->done) { |
3277 | DECLARE_WAITQUEUE(wait, current); | 3284 | DECLARE_WAITQUEUE(wait, current); |
3278 | 3285 | ||
3279 | wait.flags |= WQ_FLAG_EXCLUSIVE; | 3286 | wait.flags |= WQ_FLAG_EXCLUSIVE; |
3280 | __add_wait_queue_tail(&x->wait, &wait); | 3287 | __add_wait_queue_tail(&x->wait, &wait); |
3281 | do { | 3288 | do { |
3282 | __set_current_state(TASK_UNINTERRUPTIBLE); | 3289 | __set_current_state(TASK_UNINTERRUPTIBLE); |
3283 | spin_unlock_irq(&x->wait.lock); | 3290 | spin_unlock_irq(&x->wait.lock); |
3284 | timeout = schedule_timeout(timeout); | 3291 | timeout = schedule_timeout(timeout); |
3285 | spin_lock_irq(&x->wait.lock); | 3292 | spin_lock_irq(&x->wait.lock); |
3286 | if (!timeout) { | 3293 | if (!timeout) { |
3287 | __remove_wait_queue(&x->wait, &wait); | 3294 | __remove_wait_queue(&x->wait, &wait); |
3288 | goto out; | 3295 | goto out; |
3289 | } | 3296 | } |
3290 | } while (!x->done); | 3297 | } while (!x->done); |
3291 | __remove_wait_queue(&x->wait, &wait); | 3298 | __remove_wait_queue(&x->wait, &wait); |
3292 | } | 3299 | } |
3293 | x->done--; | 3300 | x->done--; |
3294 | out: | 3301 | out: |
3295 | spin_unlock_irq(&x->wait.lock); | 3302 | spin_unlock_irq(&x->wait.lock); |
3296 | return timeout; | 3303 | return timeout; |
3297 | } | 3304 | } |
3298 | EXPORT_SYMBOL(wait_for_completion_timeout); | 3305 | EXPORT_SYMBOL(wait_for_completion_timeout); |
3299 | 3306 | ||
3300 | int fastcall __sched wait_for_completion_interruptible(struct completion *x) | 3307 | int fastcall __sched wait_for_completion_interruptible(struct completion *x) |
3301 | { | 3308 | { |
3302 | int ret = 0; | 3309 | int ret = 0; |
3303 | 3310 | ||
3304 | might_sleep(); | 3311 | might_sleep(); |
3305 | 3312 | ||
3306 | spin_lock_irq(&x->wait.lock); | 3313 | spin_lock_irq(&x->wait.lock); |
3307 | if (!x->done) { | 3314 | if (!x->done) { |
3308 | DECLARE_WAITQUEUE(wait, current); | 3315 | DECLARE_WAITQUEUE(wait, current); |
3309 | 3316 | ||
3310 | wait.flags |= WQ_FLAG_EXCLUSIVE; | 3317 | wait.flags |= WQ_FLAG_EXCLUSIVE; |
3311 | __add_wait_queue_tail(&x->wait, &wait); | 3318 | __add_wait_queue_tail(&x->wait, &wait); |
3312 | do { | 3319 | do { |
3313 | if (signal_pending(current)) { | 3320 | if (signal_pending(current)) { |
3314 | ret = -ERESTARTSYS; | 3321 | ret = -ERESTARTSYS; |
3315 | __remove_wait_queue(&x->wait, &wait); | 3322 | __remove_wait_queue(&x->wait, &wait); |
3316 | goto out; | 3323 | goto out; |
3317 | } | 3324 | } |
3318 | __set_current_state(TASK_INTERRUPTIBLE); | 3325 | __set_current_state(TASK_INTERRUPTIBLE); |
3319 | spin_unlock_irq(&x->wait.lock); | 3326 | spin_unlock_irq(&x->wait.lock); |
3320 | schedule(); | 3327 | schedule(); |
3321 | spin_lock_irq(&x->wait.lock); | 3328 | spin_lock_irq(&x->wait.lock); |
3322 | } while (!x->done); | 3329 | } while (!x->done); |
3323 | __remove_wait_queue(&x->wait, &wait); | 3330 | __remove_wait_queue(&x->wait, &wait); |
3324 | } | 3331 | } |
3325 | x->done--; | 3332 | x->done--; |
3326 | out: | 3333 | out: |
3327 | spin_unlock_irq(&x->wait.lock); | 3334 | spin_unlock_irq(&x->wait.lock); |
3328 | 3335 | ||
3329 | return ret; | 3336 | return ret; |
3330 | } | 3337 | } |
3331 | EXPORT_SYMBOL(wait_for_completion_interruptible); | 3338 | EXPORT_SYMBOL(wait_for_completion_interruptible); |
3332 | 3339 | ||
3333 | unsigned long fastcall __sched | 3340 | unsigned long fastcall __sched |
3334 | wait_for_completion_interruptible_timeout(struct completion *x, | 3341 | wait_for_completion_interruptible_timeout(struct completion *x, |
3335 | unsigned long timeout) | 3342 | unsigned long timeout) |
3336 | { | 3343 | { |
3337 | might_sleep(); | 3344 | might_sleep(); |
3338 | 3345 | ||
3339 | spin_lock_irq(&x->wait.lock); | 3346 | spin_lock_irq(&x->wait.lock); |
3340 | if (!x->done) { | 3347 | if (!x->done) { |
3341 | DECLARE_WAITQUEUE(wait, current); | 3348 | DECLARE_WAITQUEUE(wait, current); |
3342 | 3349 | ||
3343 | wait.flags |= WQ_FLAG_EXCLUSIVE; | 3350 | wait.flags |= WQ_FLAG_EXCLUSIVE; |
3344 | __add_wait_queue_tail(&x->wait, &wait); | 3351 | __add_wait_queue_tail(&x->wait, &wait); |
3345 | do { | 3352 | do { |
3346 | if (signal_pending(current)) { | 3353 | if (signal_pending(current)) { |
3347 | timeout = -ERESTARTSYS; | 3354 | timeout = -ERESTARTSYS; |
3348 | __remove_wait_queue(&x->wait, &wait); | 3355 | __remove_wait_queue(&x->wait, &wait); |
3349 | goto out; | 3356 | goto out; |
3350 | } | 3357 | } |
3351 | __set_current_state(TASK_INTERRUPTIBLE); | 3358 | __set_current_state(TASK_INTERRUPTIBLE); |
3352 | spin_unlock_irq(&x->wait.lock); | 3359 | spin_unlock_irq(&x->wait.lock); |
3353 | timeout = schedule_timeout(timeout); | 3360 | timeout = schedule_timeout(timeout); |
3354 | spin_lock_irq(&x->wait.lock); | 3361 | spin_lock_irq(&x->wait.lock); |
3355 | if (!timeout) { | 3362 | if (!timeout) { |
3356 | __remove_wait_queue(&x->wait, &wait); | 3363 | __remove_wait_queue(&x->wait, &wait); |
3357 | goto out; | 3364 | goto out; |
3358 | } | 3365 | } |
3359 | } while (!x->done); | 3366 | } while (!x->done); |
3360 | __remove_wait_queue(&x->wait, &wait); | 3367 | __remove_wait_queue(&x->wait, &wait); |
3361 | } | 3368 | } |
3362 | x->done--; | 3369 | x->done--; |
3363 | out: | 3370 | out: |
3364 | spin_unlock_irq(&x->wait.lock); | 3371 | spin_unlock_irq(&x->wait.lock); |
3365 | return timeout; | 3372 | return timeout; |
3366 | } | 3373 | } |
3367 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | 3374 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); |
3368 | 3375 | ||
3369 | 3376 | ||
3370 | #define SLEEP_ON_VAR \ | 3377 | #define SLEEP_ON_VAR \ |
3371 | unsigned long flags; \ | 3378 | unsigned long flags; \ |
3372 | wait_queue_t wait; \ | 3379 | wait_queue_t wait; \ |
3373 | init_waitqueue_entry(&wait, current); | 3380 | init_waitqueue_entry(&wait, current); |
3374 | 3381 | ||
3375 | #define SLEEP_ON_HEAD \ | 3382 | #define SLEEP_ON_HEAD \ |
3376 | spin_lock_irqsave(&q->lock,flags); \ | 3383 | spin_lock_irqsave(&q->lock,flags); \ |
3377 | __add_wait_queue(q, &wait); \ | 3384 | __add_wait_queue(q, &wait); \ |
3378 | spin_unlock(&q->lock); | 3385 | spin_unlock(&q->lock); |
3379 | 3386 | ||
3380 | #define SLEEP_ON_TAIL \ | 3387 | #define SLEEP_ON_TAIL \ |
3381 | spin_lock_irq(&q->lock); \ | 3388 | spin_lock_irq(&q->lock); \ |
3382 | __remove_wait_queue(q, &wait); \ | 3389 | __remove_wait_queue(q, &wait); \ |
3383 | spin_unlock_irqrestore(&q->lock, flags); | 3390 | spin_unlock_irqrestore(&q->lock, flags); |
3384 | 3391 | ||
3385 | void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) | 3392 | void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) |
3386 | { | 3393 | { |
3387 | SLEEP_ON_VAR | 3394 | SLEEP_ON_VAR |
3388 | 3395 | ||
3389 | current->state = TASK_INTERRUPTIBLE; | 3396 | current->state = TASK_INTERRUPTIBLE; |
3390 | 3397 | ||
3391 | SLEEP_ON_HEAD | 3398 | SLEEP_ON_HEAD |
3392 | schedule(); | 3399 | schedule(); |
3393 | SLEEP_ON_TAIL | 3400 | SLEEP_ON_TAIL |
3394 | } | 3401 | } |
3395 | 3402 | ||
3396 | EXPORT_SYMBOL(interruptible_sleep_on); | 3403 | EXPORT_SYMBOL(interruptible_sleep_on); |
3397 | 3404 | ||
3398 | long fastcall __sched | 3405 | long fastcall __sched |
3399 | interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) | 3406 | interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) |
3400 | { | 3407 | { |
3401 | SLEEP_ON_VAR | 3408 | SLEEP_ON_VAR |
3402 | 3409 | ||
3403 | current->state = TASK_INTERRUPTIBLE; | 3410 | current->state = TASK_INTERRUPTIBLE; |
3404 | 3411 | ||
3405 | SLEEP_ON_HEAD | 3412 | SLEEP_ON_HEAD |
3406 | timeout = schedule_timeout(timeout); | 3413 | timeout = schedule_timeout(timeout); |
3407 | SLEEP_ON_TAIL | 3414 | SLEEP_ON_TAIL |
3408 | 3415 | ||
3409 | return timeout; | 3416 | return timeout; |
3410 | } | 3417 | } |
3411 | 3418 | ||
3412 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); | 3419 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); |
3413 | 3420 | ||
3414 | void fastcall __sched sleep_on(wait_queue_head_t *q) | 3421 | void fastcall __sched sleep_on(wait_queue_head_t *q) |
3415 | { | 3422 | { |
3416 | SLEEP_ON_VAR | 3423 | SLEEP_ON_VAR |
3417 | 3424 | ||
3418 | current->state = TASK_UNINTERRUPTIBLE; | 3425 | current->state = TASK_UNINTERRUPTIBLE; |
3419 | 3426 | ||
3420 | SLEEP_ON_HEAD | 3427 | SLEEP_ON_HEAD |
3421 | schedule(); | 3428 | schedule(); |
3422 | SLEEP_ON_TAIL | 3429 | SLEEP_ON_TAIL |
3423 | } | 3430 | } |
3424 | 3431 | ||
3425 | EXPORT_SYMBOL(sleep_on); | 3432 | EXPORT_SYMBOL(sleep_on); |
3426 | 3433 | ||
3427 | long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) | 3434 | long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) |
3428 | { | 3435 | { |
3429 | SLEEP_ON_VAR | 3436 | SLEEP_ON_VAR |
3430 | 3437 | ||
3431 | current->state = TASK_UNINTERRUPTIBLE; | 3438 | current->state = TASK_UNINTERRUPTIBLE; |
3432 | 3439 | ||
3433 | SLEEP_ON_HEAD | 3440 | SLEEP_ON_HEAD |
3434 | timeout = schedule_timeout(timeout); | 3441 | timeout = schedule_timeout(timeout); |
3435 | SLEEP_ON_TAIL | 3442 | SLEEP_ON_TAIL |
3436 | 3443 | ||
3437 | return timeout; | 3444 | return timeout; |
3438 | } | 3445 | } |
3439 | 3446 | ||
3440 | EXPORT_SYMBOL(sleep_on_timeout); | 3447 | EXPORT_SYMBOL(sleep_on_timeout); |
3441 | 3448 | ||
3442 | void set_user_nice(task_t *p, long nice) | 3449 | void set_user_nice(task_t *p, long nice) |
3443 | { | 3450 | { |
3444 | unsigned long flags; | 3451 | unsigned long flags; |
3445 | prio_array_t *array; | 3452 | prio_array_t *array; |
3446 | runqueue_t *rq; | 3453 | runqueue_t *rq; |
3447 | int old_prio, new_prio, delta; | 3454 | int old_prio, new_prio, delta; |
3448 | 3455 | ||
3449 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) | 3456 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) |
3450 | return; | 3457 | return; |
3451 | /* | 3458 | /* |
3452 | * We have to be careful, if called from sys_setpriority(), | 3459 | * We have to be careful, if called from sys_setpriority(), |
3453 | * the task might be in the middle of scheduling on another CPU. | 3460 | * the task might be in the middle of scheduling on another CPU. |
3454 | */ | 3461 | */ |
3455 | rq = task_rq_lock(p, &flags); | 3462 | rq = task_rq_lock(p, &flags); |
3456 | /* | 3463 | /* |
3457 | * The RT priorities are set via sched_setscheduler(), but we still | 3464 | * The RT priorities are set via sched_setscheduler(), but we still |
3458 | * allow the 'normal' nice value to be set - but as expected | 3465 | * allow the 'normal' nice value to be set - but as expected |
3459 | * it won't have any effect on scheduling until the task returns | 3466 | * it won't have any effect on scheduling until the task returns |
3460 | * to SCHED_NORMAL/SCHED_BATCH: | 3467 | * to SCHED_NORMAL/SCHED_BATCH: |
3461 | */ | 3468 | */ |
3462 | if (rt_task(p)) { | 3469 | if (rt_task(p)) { |
3463 | p->static_prio = NICE_TO_PRIO(nice); | 3470 | p->static_prio = NICE_TO_PRIO(nice); |
3464 | goto out_unlock; | 3471 | goto out_unlock; |
3465 | } | 3472 | } |
3466 | array = p->array; | 3473 | array = p->array; |
3467 | if (array) | 3474 | if (array) |
3468 | dequeue_task(p, array); | 3475 | dequeue_task(p, array); |
3469 | 3476 | ||
3470 | old_prio = p->prio; | 3477 | old_prio = p->prio; |
3471 | new_prio = NICE_TO_PRIO(nice); | 3478 | new_prio = NICE_TO_PRIO(nice); |
3472 | delta = new_prio - old_prio; | 3479 | delta = new_prio - old_prio; |
3473 | p->static_prio = NICE_TO_PRIO(nice); | 3480 | p->static_prio = NICE_TO_PRIO(nice); |
3474 | p->prio += delta; | 3481 | p->prio += delta; |
3475 | 3482 | ||
3476 | if (array) { | 3483 | if (array) { |
3477 | enqueue_task(p, array); | 3484 | enqueue_task(p, array); |
3478 | /* | 3485 | /* |
3479 | * If the task increased its priority or is running and | 3486 | * If the task increased its priority or is running and |
3480 | * lowered its priority, then reschedule its CPU: | 3487 | * lowered its priority, then reschedule its CPU: |
3481 | */ | 3488 | */ |
3482 | if (delta < 0 || (delta > 0 && task_running(rq, p))) | 3489 | if (delta < 0 || (delta > 0 && task_running(rq, p))) |
3483 | resched_task(rq->curr); | 3490 | resched_task(rq->curr); |
3484 | } | 3491 | } |
3485 | out_unlock: | 3492 | out_unlock: |
3486 | task_rq_unlock(rq, &flags); | 3493 | task_rq_unlock(rq, &flags); |
3487 | } | 3494 | } |
3488 | 3495 | ||
3489 | EXPORT_SYMBOL(set_user_nice); | 3496 | EXPORT_SYMBOL(set_user_nice); |
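
The delta arithmetic in set_user_nice() relies on the fixed nice-to-priority mapping from the NICE_TO_PRIO()/PRIO_TO_NICE() macros defined earlier in kernel/sched.c: with MAX_RT_PRIO == 100, the nice range -20..19 maps onto static priorities 100..139. A couple of worked values as a standalone check (mirroring those macros, not kernel code):

#include <stdio.h>

#define MAX_RT_PRIO 100
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)

int main(void)
{
	/* Renicing from 0 to 10 moves static_prio from 120 to 130, so
	 * delta == +10 and p->prio above is shifted by the same amount. */
	printf("nice   0 -> prio %d\n", NICE_TO_PRIO(0));    /* 120 */
	printf("nice  10 -> prio %d\n", NICE_TO_PRIO(10));   /* 130 */
	printf("prio 139 -> nice %d\n", PRIO_TO_NICE(139));  /*  19 */
	return 0;
}
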
3490 | 3497 | ||
3491 | /* | 3498 | /* |
3492 | * can_nice - check if a task can reduce its nice value | 3499 | * can_nice - check if a task can reduce its nice value |
3493 | * @p: task | 3500 | * @p: task |
3494 | * @nice: nice value | 3501 | * @nice: nice value |
3495 | */ | 3502 | */ |
3496 | int can_nice(const task_t *p, const int nice) | 3503 | int can_nice(const task_t *p, const int nice) |
3497 | { | 3504 | { |
3498 | /* convert nice value [19,-20] to rlimit style value [1,40] */ | 3505 | /* convert nice value [19,-20] to rlimit style value [1,40] */ |
3499 | int nice_rlim = 20 - nice; | 3506 | int nice_rlim = 20 - nice; |
3500 | return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || | 3507 | return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || |
3501 | capable(CAP_SYS_NICE)); | 3508 | capable(CAP_SYS_NICE)); |
3502 | } | 3509 | } |
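
In other words, RLIMIT_NICE is expressed on the 1..40 scale, and a soft limit of L lets an unprivileged task lower its nice value down to 20 - L. A quick standalone check of that arithmetic (the helper is illustrative, not kernel code):

#include <stdio.h>

/* Lowest nice value an unprivileged task may request, given its
 * RLIMIT_NICE soft limit on the 1..40 scale used by can_nice(). */
static int lowest_allowed_nice(unsigned long rlim_cur)
{
	return 20 - (int)rlim_cur;
}

int main(void)
{
	printf("%d %d %d\n",
	       lowest_allowed_nice(1),    /*  19: may only stay at the bottom */
	       lowest_allowed_nice(20),   /*   0: may go back to the default  */
	       lowest_allowed_nice(40));  /* -20: effectively unrestricted    */
	return 0;
}
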
3503 | 3510 | ||
3504 | #ifdef __ARCH_WANT_SYS_NICE | 3511 | #ifdef __ARCH_WANT_SYS_NICE |
3505 | 3512 | ||
3506 | /* | 3513 | /* |
3507 | * sys_nice - change the priority of the current process. | 3514 | * sys_nice - change the priority of the current process. |
3508 | * @increment: priority increment | 3515 | * @increment: priority increment |
3509 | * | 3516 | * |
3510 | * sys_setpriority is a more generic, but much slower function that | 3517 | * sys_setpriority is a more generic, but much slower function that |
3511 | * does similar things. | 3518 | * does similar things. |
3512 | */ | 3519 | */ |
3513 | asmlinkage long sys_nice(int increment) | 3520 | asmlinkage long sys_nice(int increment) |
3514 | { | 3521 | { |
3515 | int retval; | 3522 | int retval; |
3516 | long nice; | 3523 | long nice; |
3517 | 3524 | ||
3518 | /* | 3525 | /* |
3519 | * Setpriority might change our priority at the same moment. | 3526 | * Setpriority might change our priority at the same moment. |
3520 | * We don't have to worry. Conceptually one call occurs first | 3527 | * We don't have to worry. Conceptually one call occurs first |
3521 | * and we have a single winner. | 3528 | * and we have a single winner. |
3522 | */ | 3529 | */ |
3523 | if (increment < -40) | 3530 | if (increment < -40) |
3524 | increment = -40; | 3531 | increment = -40; |
3525 | if (increment > 40) | 3532 | if (increment > 40) |
3526 | increment = 40; | 3533 | increment = 40; |
3527 | 3534 | ||
3528 | nice = PRIO_TO_NICE(current->static_prio) + increment; | 3535 | nice = PRIO_TO_NICE(current->static_prio) + increment; |
3529 | if (nice < -20) | 3536 | if (nice < -20) |
3530 | nice = -20; | 3537 | nice = -20; |
3531 | if (nice > 19) | 3538 | if (nice > 19) |
3532 | nice = 19; | 3539 | nice = 19; |
3533 | 3540 | ||
3534 | if (increment < 0 && !can_nice(current, nice)) | 3541 | if (increment < 0 && !can_nice(current, nice)) |
3535 | return -EPERM; | 3542 | return -EPERM; |
3536 | 3543 | ||
3537 | retval = security_task_setnice(current, nice); | 3544 | retval = security_task_setnice(current, nice); |
3538 | if (retval) | 3545 | if (retval) |
3539 | return retval; | 3546 | return retval; |
3540 | 3547 | ||
3541 | set_user_nice(current, nice); | 3548 | set_user_nice(current, nice); |
3542 | return 0; | 3549 | return 0; |
3543 | } | 3550 | } |
3544 | 3551 | ||
3545 | #endif | 3552 | #endif |
3546 | 3553 | ||
3547 | /** | 3554 | /** |
3548 | * task_prio - return the priority value of a given task. | 3555 | * task_prio - return the priority value of a given task. |
3549 | * @p: the task in question. | 3556 | * @p: the task in question. |
3550 | * | 3557 | * |
3551 | * This is the priority value as seen by users in /proc. | 3558 | * This is the priority value as seen by users in /proc. |
3552 | * RT tasks are offset by -200. Normal tasks are centered | 3559 | * RT tasks are offset by -200. Normal tasks are centered |
3553 | * around 0, value goes from -16 to +15. | 3560 | * around 0, value goes from -16 to +15. |
3554 | */ | 3561 | */ |
3555 | int task_prio(const task_t *p) | 3562 | int task_prio(const task_t *p) |
3556 | { | 3563 | { |
3557 | return p->prio - MAX_RT_PRIO; | 3564 | return p->prio - MAX_RT_PRIO; |
3558 | } | 3565 | } |
3559 | 3566 | ||
3560 | /** | 3567 | /** |
3561 | * task_nice - return the nice value of a given task. | 3568 | * task_nice - return the nice value of a given task. |
3562 | * @p: the task in question. | 3569 | * @p: the task in question. |
3563 | */ | 3570 | */ |
3564 | int task_nice(const task_t *p) | 3571 | int task_nice(const task_t *p) |
3565 | { | 3572 | { |
3566 | return TASK_NICE(p); | 3573 | return TASK_NICE(p); |
3567 | } | 3574 | } |
3568 | EXPORT_SYMBOL_GPL(task_nice); | 3575 | EXPORT_SYMBOL_GPL(task_nice); |
3569 | 3576 | ||
3570 | /** | 3577 | /** |
3571 | * idle_cpu - is a given cpu idle currently? | 3578 | * idle_cpu - is a given cpu idle currently? |
3572 | * @cpu: the processor in question. | 3579 | * @cpu: the processor in question. |
3573 | */ | 3580 | */ |
3574 | int idle_cpu(int cpu) | 3581 | int idle_cpu(int cpu) |
3575 | { | 3582 | { |
3576 | return cpu_curr(cpu) == cpu_rq(cpu)->idle; | 3583 | return cpu_curr(cpu) == cpu_rq(cpu)->idle; |
3577 | } | 3584 | } |
3578 | 3585 | ||
3579 | /** | 3586 | /** |
3580 | * idle_task - return the idle task for a given cpu. | 3587 | * idle_task - return the idle task for a given cpu. |
3581 | * @cpu: the processor in question. | 3588 | * @cpu: the processor in question. |
3582 | */ | 3589 | */ |
3583 | task_t *idle_task(int cpu) | 3590 | task_t *idle_task(int cpu) |
3584 | { | 3591 | { |
3585 | return cpu_rq(cpu)->idle; | 3592 | return cpu_rq(cpu)->idle; |
3586 | } | 3593 | } |
3587 | 3594 | ||
3588 | /** | 3595 | /** |
3589 | * find_process_by_pid - find a process with a matching PID value. | 3596 | * find_process_by_pid - find a process with a matching PID value. |
3590 | * @pid: the pid in question. | 3597 | * @pid: the pid in question. |
3591 | */ | 3598 | */ |
3592 | static inline task_t *find_process_by_pid(pid_t pid) | 3599 | static inline task_t *find_process_by_pid(pid_t pid) |
3593 | { | 3600 | { |
3594 | return pid ? find_task_by_pid(pid) : current; | 3601 | return pid ? find_task_by_pid(pid) : current; |
3595 | } | 3602 | } |
3596 | 3603 | ||
3597 | /* Actually do priority change: must hold rq lock. */ | 3604 | /* Actually do priority change: must hold rq lock. */ |
3598 | static void __setscheduler(struct task_struct *p, int policy, int prio) | 3605 | static void __setscheduler(struct task_struct *p, int policy, int prio) |
3599 | { | 3606 | { |
3600 | BUG_ON(p->array); | 3607 | BUG_ON(p->array); |
3601 | p->policy = policy; | 3608 | p->policy = policy; |
3602 | p->rt_priority = prio; | 3609 | p->rt_priority = prio; |
3603 | if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { | 3610 | if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { |
3604 | p->prio = MAX_RT_PRIO-1 - p->rt_priority; | 3611 | p->prio = MAX_RT_PRIO-1 - p->rt_priority; |
3605 | } else { | 3612 | } else { |
3606 | p->prio = p->static_prio; | 3613 | p->prio = p->static_prio; |
3607 | /* | 3614 | /* |
3608 | * SCHED_BATCH tasks are treated as perpetual CPU hogs: | 3615 | * SCHED_BATCH tasks are treated as perpetual CPU hogs: |
3609 | */ | 3616 | */ |
3610 | if (policy == SCHED_BATCH) | 3617 | if (policy == SCHED_BATCH) |
3611 | p->sleep_avg = 0; | 3618 | p->sleep_avg = 0; |
3612 | } | 3619 | } |
3613 | } | 3620 | } |
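
Note the inversion in the RT branch of __setscheduler(): user-visible rt_priority grows upward (1 is lowest, MAX_USER_RT_PRIO-1 == 99 is highest), while the internal p->prio grows downward so that a lower prio always runs first. A quick check of that mapping (standalone; MAX_RT_PRIO taken from this kernel's headers):

#include <stdio.h>

#define MAX_RT_PRIO 100

static int rt_internal_prio(int rt_priority)
{
	return MAX_RT_PRIO - 1 - rt_priority;   /* same formula as above */
}

int main(void)
{
	printf("rt_priority  1 -> prio %d\n", rt_internal_prio(1));   /* 98 */
	printf("rt_priority 99 -> prio %d\n", rt_internal_prio(99));  /*  0 */
	return 0;
}
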
3614 | 3621 | ||
3615 | /** | 3622 | /** |
3616 | * sched_setscheduler - change the scheduling policy and/or RT priority of | 3623 | * sched_setscheduler - change the scheduling policy and/or RT priority of |
3617 | * a thread. | 3624 | * a thread. |
3618 | * @p: the task in question. | 3625 | * @p: the task in question. |
3619 | * @policy: new policy. | 3626 | * @policy: new policy. |
3620 | * @param: structure containing the new RT priority. | 3627 | * @param: structure containing the new RT priority. |
3621 | */ | 3628 | */ |
3622 | int sched_setscheduler(struct task_struct *p, int policy, | 3629 | int sched_setscheduler(struct task_struct *p, int policy, |
3623 | struct sched_param *param) | 3630 | struct sched_param *param) |
3624 | { | 3631 | { |
3625 | int retval; | 3632 | int retval; |
3626 | int oldprio, oldpolicy = -1; | 3633 | int oldprio, oldpolicy = -1; |
3627 | prio_array_t *array; | 3634 | prio_array_t *array; |
3628 | unsigned long flags; | 3635 | unsigned long flags; |
3629 | runqueue_t *rq; | 3636 | runqueue_t *rq; |
3630 | 3637 | ||
3631 | recheck: | 3638 | recheck: |
3632 | /* double check policy once rq lock held */ | 3639 | /* double check policy once rq lock held */ |
3633 | if (policy < 0) | 3640 | if (policy < 0) |
3634 | policy = oldpolicy = p->policy; | 3641 | policy = oldpolicy = p->policy; |
3635 | else if (policy != SCHED_FIFO && policy != SCHED_RR && | 3642 | else if (policy != SCHED_FIFO && policy != SCHED_RR && |
3636 | policy != SCHED_NORMAL && policy != SCHED_BATCH) | 3643 | policy != SCHED_NORMAL && policy != SCHED_BATCH) |
3637 | return -EINVAL; | 3644 | return -EINVAL; |
3638 | /* | 3645 | /* |
3639 | * Valid priorities for SCHED_FIFO and SCHED_RR are | 3646 | * Valid priorities for SCHED_FIFO and SCHED_RR are |
3640 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and | 3647 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and |
3641 | * SCHED_BATCH is 0. | 3648 | * SCHED_BATCH is 0. |
3642 | */ | 3649 | */ |
3643 | if (param->sched_priority < 0 || | 3650 | if (param->sched_priority < 0 || |
3644 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || | 3651 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || |
3645 | (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) | 3652 | (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) |
3646 | return -EINVAL; | 3653 | return -EINVAL; |
3647 | if ((policy == SCHED_NORMAL || policy == SCHED_BATCH) | 3654 | if ((policy == SCHED_NORMAL || policy == SCHED_BATCH) |
3648 | != (param->sched_priority == 0)) | 3655 | != (param->sched_priority == 0)) |
3649 | return -EINVAL; | 3656 | return -EINVAL; |
3650 | 3657 | ||
3651 | /* | 3658 | /* |
3652 | * Allow unprivileged RT tasks to decrease priority: | 3659 | * Allow unprivileged RT tasks to decrease priority: |
3653 | */ | 3660 | */ |
3654 | if (!capable(CAP_SYS_NICE)) { | 3661 | if (!capable(CAP_SYS_NICE)) { |
3655 | /* | 3662 | /* |
3656 | * can't change policy, except between SCHED_NORMAL | 3663 | * can't change policy, except between SCHED_NORMAL |
3657 | * and SCHED_BATCH: | 3664 | * and SCHED_BATCH: |
3658 | */ | 3665 | */ |
3659 | if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) && | 3666 | if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) && |
3660 | (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) && | 3667 | (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) && |
3661 | !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) | 3668 | !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) |
3662 | return -EPERM; | 3669 | return -EPERM; |
3663 | /* can't increase priority */ | 3670 | /* can't increase priority */ |
3664 | if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) && | 3671 | if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) && |
3665 | param->sched_priority > p->rt_priority && | 3672 | param->sched_priority > p->rt_priority && |
3666 | param->sched_priority > | 3673 | param->sched_priority > |
3667 | p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) | 3674 | p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) |
3668 | return -EPERM; | 3675 | return -EPERM; |
3669 | /* can't change other user's priorities */ | 3676 | /* can't change other user's priorities */ |
3670 | if ((current->euid != p->euid) && | 3677 | if ((current->euid != p->euid) && |
3671 | (current->euid != p->uid)) | 3678 | (current->euid != p->uid)) |
3672 | return -EPERM; | 3679 | return -EPERM; |
3673 | } | 3680 | } |
3674 | 3681 | ||
3675 | retval = security_task_setscheduler(p, policy, param); | 3682 | retval = security_task_setscheduler(p, policy, param); |
3676 | if (retval) | 3683 | if (retval) |
3677 | return retval; | 3684 | return retval; |
3678 | /* | 3685 | /* |
3679 | * To be able to change p->policy safely, the appropriate | 3686 | * To be able to change p->policy safely, the appropriate |
3680 | * runqueue lock must be held. | 3687 | * runqueue lock must be held. |
3681 | */ | 3688 | */ |
3682 | rq = task_rq_lock(p, &flags); | 3689 | rq = task_rq_lock(p, &flags); |
3683 | /* recheck policy now with rq lock held */ | 3690 | /* recheck policy now with rq lock held */ |
3684 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { | 3691 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
3685 | policy = oldpolicy = -1; | 3692 | policy = oldpolicy = -1; |
3686 | task_rq_unlock(rq, &flags); | 3693 | task_rq_unlock(rq, &flags); |
3687 | goto recheck; | 3694 | goto recheck; |
3688 | } | 3695 | } |
3689 | array = p->array; | 3696 | array = p->array; |
3690 | if (array) | 3697 | if (array) |
3691 | deactivate_task(p, rq); | 3698 | deactivate_task(p, rq); |
3692 | oldprio = p->prio; | 3699 | oldprio = p->prio; |
3693 | __setscheduler(p, policy, param->sched_priority); | 3700 | __setscheduler(p, policy, param->sched_priority); |
3694 | if (array) { | 3701 | if (array) { |
3695 | __activate_task(p, rq); | 3702 | __activate_task(p, rq); |
3696 | /* | 3703 | /* |
3697 | * Reschedule if we are currently running on this runqueue and | 3704 | * Reschedule if we are currently running on this runqueue and |
3698 | * our priority decreased, or if we are not currently running on | 3705 | * our priority decreased, or if we are not currently running on |
3699 | * this runqueue and our priority is higher than the current's | 3706 | * this runqueue and our priority is higher than the current's |
3700 | */ | 3707 | */ |
3701 | if (task_running(rq, p)) { | 3708 | if (task_running(rq, p)) { |
3702 | if (p->prio > oldprio) | 3709 | if (p->prio > oldprio) |
3703 | resched_task(rq->curr); | 3710 | resched_task(rq->curr); |
3704 | } else if (TASK_PREEMPTS_CURR(p, rq)) | 3711 | } else if (TASK_PREEMPTS_CURR(p, rq)) |
3705 | resched_task(rq->curr); | 3712 | resched_task(rq->curr); |
3706 | } | 3713 | } |
3707 | task_rq_unlock(rq, &flags); | 3714 | task_rq_unlock(rq, &flags); |
3708 | return 0; | 3715 | return 0; |
3709 | } | 3716 | } |
3710 | EXPORT_SYMBOL_GPL(sched_setscheduler); | 3717 | EXPORT_SYMBOL_GPL(sched_setscheduler); |
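
The exported helper above is what in-kernel users (the migration thread, watchdog threads, drivers with latency-critical kthreads) call to move a task into the RT classes. A minimal, hypothetical example of such a caller:

#include <linux/sched.h>

/* Promote a kernel thread to SCHED_FIFO at a mid-range RT priority.
 * 'my_latency_kthread' is a placeholder for the caller's task pointer. */
static int my_make_rt(struct task_struct *my_latency_kthread)
{
	struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };

	return sched_setscheduler(my_latency_kthread, SCHED_FIFO, &param);
}
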
3711 | 3718 | ||
3712 | static int | 3719 | static int |
3713 | do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | 3720 | do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) |
3714 | { | 3721 | { |
3715 | int retval; | 3722 | int retval; |
3716 | struct sched_param lparam; | 3723 | struct sched_param lparam; |
3717 | struct task_struct *p; | 3724 | struct task_struct *p; |
3718 | 3725 | ||
3719 | if (!param || pid < 0) | 3726 | if (!param || pid < 0) |
3720 | return -EINVAL; | 3727 | return -EINVAL; |
3721 | if (copy_from_user(&lparam, param, sizeof(struct sched_param))) | 3728 | if (copy_from_user(&lparam, param, sizeof(struct sched_param))) |
3722 | return -EFAULT; | 3729 | return -EFAULT; |
3723 | read_lock_irq(&tasklist_lock); | 3730 | read_lock_irq(&tasklist_lock); |
3724 | p = find_process_by_pid(pid); | 3731 | p = find_process_by_pid(pid); |
3725 | if (!p) { | 3732 | if (!p) { |
3726 | read_unlock_irq(&tasklist_lock); | 3733 | read_unlock_irq(&tasklist_lock); |
3727 | return -ESRCH; | 3734 | return -ESRCH; |
3728 | } | 3735 | } |
3729 | retval = sched_setscheduler(p, policy, &lparam); | 3736 | retval = sched_setscheduler(p, policy, &lparam); |
3730 | read_unlock_irq(&tasklist_lock); | 3737 | read_unlock_irq(&tasklist_lock); |
3731 | return retval; | 3738 | return retval; |
3732 | } | 3739 | } |
3733 | 3740 | ||
3734 | /** | 3741 | /** |
3735 | * sys_sched_setscheduler - set/change the scheduler policy and RT priority | 3742 | * sys_sched_setscheduler - set/change the scheduler policy and RT priority |
3736 | * @pid: the pid in question. | 3743 | * @pid: the pid in question. |
3737 | * @policy: new policy. | 3744 | * @policy: new policy. |
3738 | * @param: structure containing the new RT priority. | 3745 | * @param: structure containing the new RT priority. |
3739 | */ | 3746 | */ |
3740 | asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, | 3747 | asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, |
3741 | struct sched_param __user *param) | 3748 | struct sched_param __user *param) |
3742 | { | 3749 | { |
3743 | /* negative values for policy are not valid */ | 3750 | /* negative values for policy are not valid */ |
3744 | if (policy < 0) | 3751 | if (policy < 0) |
3745 | return -EINVAL; | 3752 | return -EINVAL; |
3746 | 3753 | ||
3747 | return do_sched_setscheduler(pid, policy, param); | 3754 | return do_sched_setscheduler(pid, policy, param); |
3748 | } | 3755 | } |
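For context, user space normally reaches this syscall through the glibc sched_setscheduler() wrapper. A minimal sketch, assuming root/CAP_SYS_NICE (illustration only, not part of this commit):

    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            /* pid 0 means "the calling process" */
            struct sched_param sp = { .sched_priority = 10 };

            if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
                    perror("sched_setscheduler");
                    return 1;
            }
            printf("policy is now %d\n", sched_getscheduler(0));
            return 0;
    }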
3749 | 3756 | ||
3750 | /** | 3757 | /** |
3751 | * sys_sched_setparam - set/change the RT priority of a thread | 3758 | * sys_sched_setparam - set/change the RT priority of a thread |
3752 | * @pid: the pid in question. | 3759 | * @pid: the pid in question. |
3753 | * @param: structure containing the new RT priority. | 3760 | * @param: structure containing the new RT priority. |
3754 | */ | 3761 | */ |
3755 | asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) | 3762 | asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) |
3756 | { | 3763 | { |
3757 | return do_sched_setscheduler(pid, -1, param); | 3764 | return do_sched_setscheduler(pid, -1, param); |
3758 | } | 3765 | } |
3759 | 3766 | ||
3760 | /** | 3767 | /** |
3761 | * sys_sched_getscheduler - get the policy (scheduling class) of a thread | 3768 | * sys_sched_getscheduler - get the policy (scheduling class) of a thread |
3762 | * @pid: the pid in question. | 3769 | * @pid: the pid in question. |
3763 | */ | 3770 | */ |
3764 | asmlinkage long sys_sched_getscheduler(pid_t pid) | 3771 | asmlinkage long sys_sched_getscheduler(pid_t pid) |
3765 | { | 3772 | { |
3766 | int retval = -EINVAL; | 3773 | int retval = -EINVAL; |
3767 | task_t *p; | 3774 | task_t *p; |
3768 | 3775 | ||
3769 | if (pid < 0) | 3776 | if (pid < 0) |
3770 | goto out_nounlock; | 3777 | goto out_nounlock; |
3771 | 3778 | ||
3772 | retval = -ESRCH; | 3779 | retval = -ESRCH; |
3773 | read_lock(&tasklist_lock); | 3780 | read_lock(&tasklist_lock); |
3774 | p = find_process_by_pid(pid); | 3781 | p = find_process_by_pid(pid); |
3775 | if (p) { | 3782 | if (p) { |
3776 | retval = security_task_getscheduler(p); | 3783 | retval = security_task_getscheduler(p); |
3777 | if (!retval) | 3784 | if (!retval) |
3778 | retval = p->policy; | 3785 | retval = p->policy; |
3779 | } | 3786 | } |
3780 | read_unlock(&tasklist_lock); | 3787 | read_unlock(&tasklist_lock); |
3781 | 3788 | ||
3782 | out_nounlock: | 3789 | out_nounlock: |
3783 | return retval; | 3790 | return retval; |
3784 | } | 3791 | } |
3785 | 3792 | ||
3786 | /** | 3793 | /** |
3787 | * sys_sched_getparam - get the RT priority of a thread | 3794 | * sys_sched_getparam - get the RT priority of a thread |
3788 | * @pid: the pid in question. | 3795 | * @pid: the pid in question. |
3789 | * @param: structure containing the RT priority. | 3796 | * @param: structure containing the RT priority. |
3790 | */ | 3797 | */ |
3791 | asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) | 3798 | asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) |
3792 | { | 3799 | { |
3793 | struct sched_param lp; | 3800 | struct sched_param lp; |
3794 | int retval = -EINVAL; | 3801 | int retval = -EINVAL; |
3795 | task_t *p; | 3802 | task_t *p; |
3796 | 3803 | ||
3797 | if (!param || pid < 0) | 3804 | if (!param || pid < 0) |
3798 | goto out_nounlock; | 3805 | goto out_nounlock; |
3799 | 3806 | ||
3800 | read_lock(&tasklist_lock); | 3807 | read_lock(&tasklist_lock); |
3801 | p = find_process_by_pid(pid); | 3808 | p = find_process_by_pid(pid); |
3802 | retval = -ESRCH; | 3809 | retval = -ESRCH; |
3803 | if (!p) | 3810 | if (!p) |
3804 | goto out_unlock; | 3811 | goto out_unlock; |
3805 | 3812 | ||
3806 | retval = security_task_getscheduler(p); | 3813 | retval = security_task_getscheduler(p); |
3807 | if (retval) | 3814 | if (retval) |
3808 | goto out_unlock; | 3815 | goto out_unlock; |
3809 | 3816 | ||
3810 | lp.sched_priority = p->rt_priority; | 3817 | lp.sched_priority = p->rt_priority; |
3811 | read_unlock(&tasklist_lock); | 3818 | read_unlock(&tasklist_lock); |
3812 | 3819 | ||
3813 | /* | 3820 | /* |
3814 | * This one might sleep, we cannot do it with a spinlock held ... | 3821 | * This one might sleep, we cannot do it with a spinlock held ... |
3815 | */ | 3822 | */ |
3816 | retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; | 3823 | retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; |
3817 | 3824 | ||
3818 | out_nounlock: | 3825 | out_nounlock: |
3819 | return retval; | 3826 | return retval; |
3820 | 3827 | ||
3821 | out_unlock: | 3828 | out_unlock: |
3822 | read_unlock(&tasklist_lock); | 3829 | read_unlock(&tasklist_lock); |
3823 | return retval; | 3830 | return retval; |
3824 | } | 3831 | } |
3825 | 3832 | ||
3826 | long sched_setaffinity(pid_t pid, cpumask_t new_mask) | 3833 | long sched_setaffinity(pid_t pid, cpumask_t new_mask) |
3827 | { | 3834 | { |
3828 | task_t *p; | 3835 | task_t *p; |
3829 | int retval; | 3836 | int retval; |
3830 | cpumask_t cpus_allowed; | 3837 | cpumask_t cpus_allowed; |
3831 | 3838 | ||
3832 | lock_cpu_hotplug(); | 3839 | lock_cpu_hotplug(); |
3833 | read_lock(&tasklist_lock); | 3840 | read_lock(&tasklist_lock); |
3834 | 3841 | ||
3835 | p = find_process_by_pid(pid); | 3842 | p = find_process_by_pid(pid); |
3836 | if (!p) { | 3843 | if (!p) { |
3837 | read_unlock(&tasklist_lock); | 3844 | read_unlock(&tasklist_lock); |
3838 | unlock_cpu_hotplug(); | 3845 | unlock_cpu_hotplug(); |
3839 | return -ESRCH; | 3846 | return -ESRCH; |
3840 | } | 3847 | } |
3841 | 3848 | ||
3842 | /* | 3849 | /* |
3843 | * It is not safe to call set_cpus_allowed with the | 3850 | * It is not safe to call set_cpus_allowed with the |
3844 | * tasklist_lock held. We will bump the task_struct's | 3851 | * tasklist_lock held. We will bump the task_struct's |
3845 | * usage count and then drop tasklist_lock. | 3852 | * usage count and then drop tasklist_lock. |
3846 | */ | 3853 | */ |
3847 | get_task_struct(p); | 3854 | get_task_struct(p); |
3848 | read_unlock(&tasklist_lock); | 3855 | read_unlock(&tasklist_lock); |
3849 | 3856 | ||
3850 | retval = -EPERM; | 3857 | retval = -EPERM; |
3851 | if ((current->euid != p->euid) && (current->euid != p->uid) && | 3858 | if ((current->euid != p->euid) && (current->euid != p->uid) && |
3852 | !capable(CAP_SYS_NICE)) | 3859 | !capable(CAP_SYS_NICE)) |
3853 | goto out_unlock; | 3860 | goto out_unlock; |
3854 | 3861 | ||
3855 | cpus_allowed = cpuset_cpus_allowed(p); | 3862 | cpus_allowed = cpuset_cpus_allowed(p); |
3856 | cpus_and(new_mask, new_mask, cpus_allowed); | 3863 | cpus_and(new_mask, new_mask, cpus_allowed); |
3857 | retval = set_cpus_allowed(p, new_mask); | 3864 | retval = set_cpus_allowed(p, new_mask); |
3858 | 3865 | ||
3859 | out_unlock: | 3866 | out_unlock: |
3860 | put_task_struct(p); | 3867 | put_task_struct(p); |
3861 | unlock_cpu_hotplug(); | 3868 | unlock_cpu_hotplug(); |
3862 | return retval; | 3869 | return retval; |
3863 | } | 3870 | } |
3864 | 3871 | ||
3865 | static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, | 3872 | static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, |
3866 | cpumask_t *new_mask) | 3873 | cpumask_t *new_mask) |
3867 | { | 3874 | { |
3868 | if (len < sizeof(cpumask_t)) { | 3875 | if (len < sizeof(cpumask_t)) { |
3869 | memset(new_mask, 0, sizeof(cpumask_t)); | 3876 | memset(new_mask, 0, sizeof(cpumask_t)); |
3870 | } else if (len > sizeof(cpumask_t)) { | 3877 | } else if (len > sizeof(cpumask_t)) { |
3871 | len = sizeof(cpumask_t); | 3878 | len = sizeof(cpumask_t); |
3872 | } | 3879 | } |
3873 | return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; | 3880 | return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; |
3874 | } | 3881 | } |
3875 | 3882 | ||
3876 | /** | 3883 | /** |
3877 | * sys_sched_setaffinity - set the cpu affinity of a process | 3884 | * sys_sched_setaffinity - set the cpu affinity of a process |
3878 | * @pid: pid of the process | 3885 | * @pid: pid of the process |
3879 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr | 3886 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr |
3880 | * @user_mask_ptr: user-space pointer to the new cpu mask | 3887 | * @user_mask_ptr: user-space pointer to the new cpu mask |
3881 | */ | 3888 | */ |
3882 | asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, | 3889 | asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, |
3883 | unsigned long __user *user_mask_ptr) | 3890 | unsigned long __user *user_mask_ptr) |
3884 | { | 3891 | { |
3885 | cpumask_t new_mask; | 3892 | cpumask_t new_mask; |
3886 | int retval; | 3893 | int retval; |
3887 | 3894 | ||
3888 | retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); | 3895 | retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); |
3889 | if (retval) | 3896 | if (retval) |
3890 | return retval; | 3897 | return retval; |
3891 | 3898 | ||
3892 | return sched_setaffinity(pid, new_mask); | 3899 | return sched_setaffinity(pid, new_mask); |
3893 | } | 3900 | } |
3894 | 3901 | ||
3895 | /* | 3902 | /* |
3896 | * Represents all CPUs present in the system | 3903 | * Represents all CPUs present in the system |
3897 | * In systems capable of hotplug, this map could dynamically grow | 3904 | * In systems capable of hotplug, this map could dynamically grow |
3898 | * as new CPUs are detected in the system via any platform-specific | 3905 | * as new CPUs are detected in the system via any platform-specific |
3899 | * method, such as ACPI, for example. | 3906 | * method, such as ACPI, for example. |
3900 | */ | 3907 | */ |
3901 | 3908 | ||
3902 | cpumask_t cpu_present_map __read_mostly; | 3909 | cpumask_t cpu_present_map __read_mostly; |
3903 | EXPORT_SYMBOL(cpu_present_map); | 3910 | EXPORT_SYMBOL(cpu_present_map); |
3904 | 3911 | ||
3905 | #ifndef CONFIG_SMP | 3912 | #ifndef CONFIG_SMP |
3906 | cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; | 3913 | cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; |
3907 | cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; | 3914 | cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; |
3908 | #endif | 3915 | #endif |
3909 | 3916 | ||
3910 | long sched_getaffinity(pid_t pid, cpumask_t *mask) | 3917 | long sched_getaffinity(pid_t pid, cpumask_t *mask) |
3911 | { | 3918 | { |
3912 | int retval; | 3919 | int retval; |
3913 | task_t *p; | 3920 | task_t *p; |
3914 | 3921 | ||
3915 | lock_cpu_hotplug(); | 3922 | lock_cpu_hotplug(); |
3916 | read_lock(&tasklist_lock); | 3923 | read_lock(&tasklist_lock); |
3917 | 3924 | ||
3918 | retval = -ESRCH; | 3925 | retval = -ESRCH; |
3919 | p = find_process_by_pid(pid); | 3926 | p = find_process_by_pid(pid); |
3920 | if (!p) | 3927 | if (!p) |
3921 | goto out_unlock; | 3928 | goto out_unlock; |
3922 | 3929 | ||
3923 | retval = 0; | 3930 | retval = 0; |
3924 | cpus_and(*mask, p->cpus_allowed, cpu_online_map); | 3931 | cpus_and(*mask, p->cpus_allowed, cpu_online_map); |
3925 | 3932 | ||
3926 | out_unlock: | 3933 | out_unlock: |
3927 | read_unlock(&tasklist_lock); | 3934 | read_unlock(&tasklist_lock); |
3928 | unlock_cpu_hotplug(); | 3935 | unlock_cpu_hotplug(); |
3929 | if (retval) | 3936 | if (retval) |
3930 | return retval; | 3937 | return retval; |
3931 | 3938 | ||
3932 | return 0; | 3939 | return 0; |
3933 | } | 3940 | } |
3934 | 3941 | ||
3935 | /** | 3942 | /** |
3936 | * sys_sched_getaffinity - get the cpu affinity of a process | 3943 | * sys_sched_getaffinity - get the cpu affinity of a process |
3937 | * @pid: pid of the process | 3944 | * @pid: pid of the process |
3938 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr | 3945 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr |
3939 | * @user_mask_ptr: user-space pointer to hold the current cpu mask | 3946 | * @user_mask_ptr: user-space pointer to hold the current cpu mask |
3940 | */ | 3947 | */ |
3941 | asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, | 3948 | asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, |
3942 | unsigned long __user *user_mask_ptr) | 3949 | unsigned long __user *user_mask_ptr) |
3943 | { | 3950 | { |
3944 | int ret; | 3951 | int ret; |
3945 | cpumask_t mask; | 3952 | cpumask_t mask; |
3946 | 3953 | ||
3947 | if (len < sizeof(cpumask_t)) | 3954 | if (len < sizeof(cpumask_t)) |
3948 | return -EINVAL; | 3955 | return -EINVAL; |
3949 | 3956 | ||
3950 | ret = sched_getaffinity(pid, &mask); | 3957 | ret = sched_getaffinity(pid, &mask); |
3951 | if (ret < 0) | 3958 | if (ret < 0) |
3952 | return ret; | 3959 | return ret; |
3953 | 3960 | ||
3954 | if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) | 3961 | if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) |
3955 | return -EFAULT; | 3962 | return -EFAULT; |
3956 | 3963 | ||
3957 | return sizeof(cpumask_t); | 3964 | return sizeof(cpumask_t); |
3958 | } | 3965 | } |
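As an illustration of the two affinity syscalls above (hypothetical user-space example, not from this file), the glibc wrappers pass a cpu_set_t whose size becomes the len argument validated here:

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            cpu_set_t set;

            CPU_ZERO(&set);
            CPU_SET(0, &set);                       /* allow CPU 0 only */
            if (sched_setaffinity(0, sizeof(set), &set) == -1)
                    perror("sched_setaffinity");

            if (sched_getaffinity(0, sizeof(set), &set) == 0)
                    printf("bound to cpu0: %d\n", CPU_ISSET(0, &set));
            return 0;
    }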
3959 | 3966 | ||
3960 | /** | 3967 | /** |
3961 | * sys_sched_yield - yield the current processor to other threads. | 3968 | * sys_sched_yield - yield the current processor to other threads. |
3962 | * | 3969 | * |
3963 | * this function yields the current CPU by moving the calling thread | 3970 | * this function yields the current CPU by moving the calling thread |
3964 | * to the expired array. If there are no other threads running on this | 3971 | * to the expired array. If there are no other threads running on this |
3965 | * CPU then this function will return. | 3972 | * CPU then this function will return. |
3966 | */ | 3973 | */ |
3967 | asmlinkage long sys_sched_yield(void) | 3974 | asmlinkage long sys_sched_yield(void) |
3968 | { | 3975 | { |
3969 | runqueue_t *rq = this_rq_lock(); | 3976 | runqueue_t *rq = this_rq_lock(); |
3970 | prio_array_t *array = current->array; | 3977 | prio_array_t *array = current->array; |
3971 | prio_array_t *target = rq->expired; | 3978 | prio_array_t *target = rq->expired; |
3972 | 3979 | ||
3973 | schedstat_inc(rq, yld_cnt); | 3980 | schedstat_inc(rq, yld_cnt); |
3974 | /* | 3981 | /* |
3975 | * We implement yielding by moving the task into the expired | 3982 | * We implement yielding by moving the task into the expired |
3976 | * queue. | 3983 | * queue. |
3977 | * | 3984 | * |
3978 | * (special rule: RT tasks will just roundrobin in the active | 3985 | * (special rule: RT tasks will just roundrobin in the active |
3979 | * array.) | 3986 | * array.) |
3980 | */ | 3987 | */ |
3981 | if (rt_task(current)) | 3988 | if (rt_task(current)) |
3982 | target = rq->active; | 3989 | target = rq->active; |
3983 | 3990 | ||
3984 | if (array->nr_active == 1) { | 3991 | if (array->nr_active == 1) { |
3985 | schedstat_inc(rq, yld_act_empty); | 3992 | schedstat_inc(rq, yld_act_empty); |
3986 | if (!rq->expired->nr_active) | 3993 | if (!rq->expired->nr_active) |
3987 | schedstat_inc(rq, yld_both_empty); | 3994 | schedstat_inc(rq, yld_both_empty); |
3988 | } else if (!rq->expired->nr_active) | 3995 | } else if (!rq->expired->nr_active) |
3989 | schedstat_inc(rq, yld_exp_empty); | 3996 | schedstat_inc(rq, yld_exp_empty); |
3990 | 3997 | ||
3991 | if (array != target) { | 3998 | if (array != target) { |
3992 | dequeue_task(current, array); | 3999 | dequeue_task(current, array); |
3993 | enqueue_task(current, target); | 4000 | enqueue_task(current, target); |
3994 | } else | 4001 | } else |
3995 | /* | 4002 | /* |
3996 | * requeue_task is cheaper so perform that if possible. | 4003 | * requeue_task is cheaper so perform that if possible. |
3997 | */ | 4004 | */ |
3998 | requeue_task(current, array); | 4005 | requeue_task(current, array); |
3999 | 4006 | ||
4000 | /* | 4007 | /* |
4001 | * Since we are going to call schedule() anyway, there's | 4008 | * Since we are going to call schedule() anyway, there's |
4002 | * no need to preempt or enable interrupts: | 4009 | * no need to preempt or enable interrupts: |
4003 | */ | 4010 | */ |
4004 | __release(rq->lock); | 4011 | __release(rq->lock); |
4005 | _raw_spin_unlock(&rq->lock); | 4012 | _raw_spin_unlock(&rq->lock); |
4006 | preempt_enable_no_resched(); | 4013 | preempt_enable_no_resched(); |
4007 | 4014 | ||
4008 | schedule(); | 4015 | schedule(); |
4009 | 4016 | ||
4010 | return 0; | 4017 | return 0; |
4011 | } | 4018 | } |
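A rough user-space illustration (the 'ready' flag is hypothetical): sched_yield() ends up in sys_sched_yield() above, so a polling loop that calls it is requeued behind other runnable tasks instead of spinning hot:

    #include <sched.h>
    #include <stdatomic.h>

    static atomic_int ready;

    static void wait_for_ready(void)
    {
            while (!atomic_load(&ready))
                    sched_yield();  /* requeue and let other tasks run */
    }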
4012 | 4019 | ||
4013 | static inline void __cond_resched(void) | 4020 | static inline void __cond_resched(void) |
4014 | { | 4021 | { |
4015 | /* | 4022 | /* |
4016 | * The BKS might be reacquired before we have dropped | 4023 | * The BKS might be reacquired before we have dropped |
4017 | * PREEMPT_ACTIVE, which could trigger a second | 4024 | * PREEMPT_ACTIVE, which could trigger a second |
4018 | * cond_resched() call. | 4025 | * cond_resched() call. |
4019 | */ | 4026 | */ |
4020 | if (unlikely(preempt_count())) | 4027 | if (unlikely(preempt_count())) |
4021 | return; | 4028 | return; |
4022 | if (unlikely(system_state != SYSTEM_RUNNING)) | 4029 | if (unlikely(system_state != SYSTEM_RUNNING)) |
4023 | return; | 4030 | return; |
4024 | do { | 4031 | do { |
4025 | add_preempt_count(PREEMPT_ACTIVE); | 4032 | add_preempt_count(PREEMPT_ACTIVE); |
4026 | schedule(); | 4033 | schedule(); |
4027 | sub_preempt_count(PREEMPT_ACTIVE); | 4034 | sub_preempt_count(PREEMPT_ACTIVE); |
4028 | } while (need_resched()); | 4035 | } while (need_resched()); |
4029 | } | 4036 | } |
4030 | 4037 | ||
4031 | int __sched cond_resched(void) | 4038 | int __sched cond_resched(void) |
4032 | { | 4039 | { |
4033 | if (need_resched()) { | 4040 | if (need_resched()) { |
4034 | __cond_resched(); | 4041 | __cond_resched(); |
4035 | return 1; | 4042 | return 1; |
4036 | } | 4043 | } |
4037 | return 0; | 4044 | return 0; |
4038 | } | 4045 | } |
4039 | 4046 | ||
4040 | EXPORT_SYMBOL(cond_resched); | 4047 | EXPORT_SYMBOL(cond_resched); |
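A minimal in-kernel sketch of the intended calling pattern (work() and nr_items are hypothetical): long loops in process context call cond_resched() so that, even without CONFIG_PREEMPT, they give up the CPU when something else needs to run:

    /* long-running loop, process context, no locks held */
    for (i = 0; i < nr_items; i++) {
            work(i);
            cond_resched();         /* schedules only if need_resched() */
    }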
4041 | 4048 | ||
4042 | /* | 4049 | /* |
4043 | * cond_resched_lock() - if a reschedule is pending, drop the given lock, | 4050 | * cond_resched_lock() - if a reschedule is pending, drop the given lock, |
4044 | * call schedule, and on return reacquire the lock. | 4051 | * call schedule, and on return reacquire the lock. |
4045 | * | 4052 | * |
4046 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level | 4053 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level |
4047 | * operations here to prevent schedule() from being called twice (once via | 4054 | * operations here to prevent schedule() from being called twice (once via |
4048 | * spin_unlock(), once by hand). | 4055 | * spin_unlock(), once by hand). |
4049 | */ | 4056 | */ |
4050 | int cond_resched_lock(spinlock_t *lock) | 4057 | int cond_resched_lock(spinlock_t *lock) |
4051 | { | 4058 | { |
4052 | int ret = 0; | 4059 | int ret = 0; |
4053 | 4060 | ||
4054 | if (need_lockbreak(lock)) { | 4061 | if (need_lockbreak(lock)) { |
4055 | spin_unlock(lock); | 4062 | spin_unlock(lock); |
4056 | cpu_relax(); | 4063 | cpu_relax(); |
4057 | ret = 1; | 4064 | ret = 1; |
4058 | spin_lock(lock); | 4065 | spin_lock(lock); |
4059 | } | 4066 | } |
4060 | if (need_resched()) { | 4067 | if (need_resched()) { |
4061 | _raw_spin_unlock(lock); | 4068 | _raw_spin_unlock(lock); |
4062 | preempt_enable_no_resched(); | 4069 | preempt_enable_no_resched(); |
4063 | __cond_resched(); | 4070 | __cond_resched(); |
4064 | ret = 1; | 4071 | ret = 1; |
4065 | spin_lock(lock); | 4072 | spin_lock(lock); |
4066 | } | 4073 | } |
4067 | return ret; | 4074 | return ret; |
4068 | } | 4075 | } |
4069 | 4076 | ||
4070 | EXPORT_SYMBOL(cond_resched_lock); | 4077 | EXPORT_SYMBOL(cond_resched_lock); |
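A sketch of the usual cond_resched_lock() pattern (my_lock, my_list, struct my_item and handle() are hypothetical): drain a list under a spinlock but offer the lock and the CPU between items; re-reading the list head each iteration keeps the walk valid even when the lock was dropped:

    spin_lock(&my_lock);
    while (!list_empty(&my_list)) {
            struct my_item *it =
                    list_entry(my_list.next, struct my_item, node);

            list_del(&it->node);
            handle(it);
            cond_resched_lock(&my_lock);    /* may drop and re-take it */
    }
    spin_unlock(&my_lock);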
4071 | 4078 | ||
4072 | int __sched cond_resched_softirq(void) | 4079 | int __sched cond_resched_softirq(void) |
4073 | { | 4080 | { |
4074 | BUG_ON(!in_softirq()); | 4081 | BUG_ON(!in_softirq()); |
4075 | 4082 | ||
4076 | if (need_resched()) { | 4083 | if (need_resched()) { |
4077 | __local_bh_enable(); | 4084 | __local_bh_enable(); |
4078 | __cond_resched(); | 4085 | __cond_resched(); |
4079 | local_bh_disable(); | 4086 | local_bh_disable(); |
4080 | return 1; | 4087 | return 1; |
4081 | } | 4088 | } |
4082 | return 0; | 4089 | return 0; |
4083 | } | 4090 | } |
4084 | 4091 | ||
4085 | EXPORT_SYMBOL(cond_resched_softirq); | 4092 | EXPORT_SYMBOL(cond_resched_softirq); |
4086 | 4093 | ||
4087 | 4094 | ||
4088 | /** | 4095 | /** |
4089 | * yield - yield the current processor to other threads. | 4096 | * yield - yield the current processor to other threads. |
4090 | * | 4097 | * |
4091 | * this is a shortcut for kernel-space yielding - it marks the | 4098 | * this is a shortcut for kernel-space yielding - it marks the |
4092 | * thread runnable and calls sys_sched_yield(). | 4099 | * thread runnable and calls sys_sched_yield(). |
4093 | */ | 4100 | */ |
4094 | void __sched yield(void) | 4101 | void __sched yield(void) |
4095 | { | 4102 | { |
4096 | set_current_state(TASK_RUNNING); | 4103 | set_current_state(TASK_RUNNING); |
4097 | sys_sched_yield(); | 4104 | sys_sched_yield(); |
4098 | } | 4105 | } |
4099 | 4106 | ||
4100 | EXPORT_SYMBOL(yield); | 4107 | EXPORT_SYMBOL(yield); |
4101 | 4108 | ||
4102 | /* | 4109 | /* |
4103 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so | 4110 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so |
4104 | * that process accounting knows that this is a task in IO wait state. | 4111 | * that process accounting knows that this is a task in IO wait state. |
4105 | * | 4112 | * |
4106 | * But don't do that if it is a deliberate, throttling IO wait (this task | 4113 | * But don't do that if it is a deliberate, throttling IO wait (this task |
4107 | * has set its backing_dev_info: the queue against which it should throttle) | 4114 | * has set its backing_dev_info: the queue against which it should throttle) |
4108 | */ | 4115 | */ |
4109 | void __sched io_schedule(void) | 4116 | void __sched io_schedule(void) |
4110 | { | 4117 | { |
4111 | struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); | 4118 | struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); |
4112 | 4119 | ||
4113 | atomic_inc(&rq->nr_iowait); | 4120 | atomic_inc(&rq->nr_iowait); |
4114 | schedule(); | 4121 | schedule(); |
4115 | atomic_dec(&rq->nr_iowait); | 4122 | atomic_dec(&rq->nr_iowait); |
4116 | } | 4123 | } |
4117 | 4124 | ||
4118 | EXPORT_SYMBOL(io_schedule); | 4125 | EXPORT_SYMBOL(io_schedule); |
4119 | 4126 | ||
4120 | long __sched io_schedule_timeout(long timeout) | 4127 | long __sched io_schedule_timeout(long timeout) |
4121 | { | 4128 | { |
4122 | struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); | 4129 | struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); |
4123 | long ret; | 4130 | long ret; |
4124 | 4131 | ||
4125 | atomic_inc(&rq->nr_iowait); | 4132 | atomic_inc(&rq->nr_iowait); |
4126 | ret = schedule_timeout(timeout); | 4133 | ret = schedule_timeout(timeout); |
4127 | atomic_dec(&rq->nr_iowait); | 4134 | atomic_dec(&rq->nr_iowait); |
4128 | return ret; | 4135 | return ret; |
4129 | } | 4136 | } |
4130 | 4137 | ||
4131 | /** | 4138 | /** |
4132 | * sys_sched_get_priority_max - return maximum RT priority. | 4139 | * sys_sched_get_priority_max - return maximum RT priority. |
4133 | * @policy: scheduling class. | 4140 | * @policy: scheduling class. |
4134 | * | 4141 | * |
4135 | * this syscall returns the maximum rt_priority that can be used | 4142 | * this syscall returns the maximum rt_priority that can be used |
4136 | * by a given scheduling class. | 4143 | * by a given scheduling class. |
4137 | */ | 4144 | */ |
4138 | asmlinkage long sys_sched_get_priority_max(int policy) | 4145 | asmlinkage long sys_sched_get_priority_max(int policy) |
4139 | { | 4146 | { |
4140 | int ret = -EINVAL; | 4147 | int ret = -EINVAL; |
4141 | 4148 | ||
4142 | switch (policy) { | 4149 | switch (policy) { |
4143 | case SCHED_FIFO: | 4150 | case SCHED_FIFO: |
4144 | case SCHED_RR: | 4151 | case SCHED_RR: |
4145 | ret = MAX_USER_RT_PRIO-1; | 4152 | ret = MAX_USER_RT_PRIO-1; |
4146 | break; | 4153 | break; |
4147 | case SCHED_NORMAL: | 4154 | case SCHED_NORMAL: |
4148 | case SCHED_BATCH: | 4155 | case SCHED_BATCH: |
4149 | ret = 0; | 4156 | ret = 0; |
4150 | break; | 4157 | break; |
4151 | } | 4158 | } |
4152 | return ret; | 4159 | return ret; |
4153 | } | 4160 | } |
4154 | 4161 | ||
4155 | /** | 4162 | /** |
4156 | * sys_sched_get_priority_min - return minimum RT priority. | 4163 | * sys_sched_get_priority_min - return minimum RT priority. |
4157 | * @policy: scheduling class. | 4164 | * @policy: scheduling class. |
4158 | * | 4165 | * |
4159 | * this syscall returns the minimum rt_priority that can be used | 4166 | * this syscall returns the minimum rt_priority that can be used |
4160 | * by a given scheduling class. | 4167 | * by a given scheduling class. |
4161 | */ | 4168 | */ |
4162 | asmlinkage long sys_sched_get_priority_min(int policy) | 4169 | asmlinkage long sys_sched_get_priority_min(int policy) |
4163 | { | 4170 | { |
4164 | int ret = -EINVAL; | 4171 | int ret = -EINVAL; |
4165 | 4172 | ||
4166 | switch (policy) { | 4173 | switch (policy) { |
4167 | case SCHED_FIFO: | 4174 | case SCHED_FIFO: |
4168 | case SCHED_RR: | 4175 | case SCHED_RR: |
4169 | ret = 1; | 4176 | ret = 1; |
4170 | break; | 4177 | break; |
4171 | case SCHED_NORMAL: | 4178 | case SCHED_NORMAL: |
4172 | case SCHED_BATCH: | 4179 | case SCHED_BATCH: |
4173 | ret = 0; | 4180 | ret = 0; |
4174 | } | 4181 | } |
4175 | return ret; | 4182 | return ret; |
4176 | } | 4183 | } |
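User-space sketch (not from this file): portable code queries these two syscalls rather than hard-coding the 1..99 range implied by MAX_USER_RT_PRIO:

    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            int lo = sched_get_priority_min(SCHED_FIFO);
            int hi = sched_get_priority_max(SCHED_FIFO);

            printf("SCHED_FIFO priority range: %d..%d\n", lo, hi);
            return 0;
    }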
4177 | 4184 | ||
4178 | /** | 4185 | /** |
4179 | * sys_sched_rr_get_interval - return the default timeslice of a process. | 4186 | * sys_sched_rr_get_interval - return the default timeslice of a process. |
4180 | * @pid: pid of the process. | 4187 | * @pid: pid of the process. |
4181 | * @interval: userspace pointer to the timeslice value. | 4188 | * @interval: userspace pointer to the timeslice value. |
4182 | * | 4189 | * |
4183 | * this syscall writes the default timeslice value of a given process | 4190 | * this syscall writes the default timeslice value of a given process |
4184 | * into the user-space timespec buffer. A value of '0' means infinity. | 4191 | * into the user-space timespec buffer. A value of '0' means infinity. |
4185 | */ | 4192 | */ |
4186 | asmlinkage | 4193 | asmlinkage |
4187 | long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) | 4194 | long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) |
4188 | { | 4195 | { |
4189 | int retval = -EINVAL; | 4196 | int retval = -EINVAL; |
4190 | struct timespec t; | 4197 | struct timespec t; |
4191 | task_t *p; | 4198 | task_t *p; |
4192 | 4199 | ||
4193 | if (pid < 0) | 4200 | if (pid < 0) |
4194 | goto out_nounlock; | 4201 | goto out_nounlock; |
4195 | 4202 | ||
4196 | retval = -ESRCH; | 4203 | retval = -ESRCH; |
4197 | read_lock(&tasklist_lock); | 4204 | read_lock(&tasklist_lock); |
4198 | p = find_process_by_pid(pid); | 4205 | p = find_process_by_pid(pid); |
4199 | if (!p) | 4206 | if (!p) |
4200 | goto out_unlock; | 4207 | goto out_unlock; |
4201 | 4208 | ||
4202 | retval = security_task_getscheduler(p); | 4209 | retval = security_task_getscheduler(p); |
4203 | if (retval) | 4210 | if (retval) |
4204 | goto out_unlock; | 4211 | goto out_unlock; |
4205 | 4212 | ||
4206 | jiffies_to_timespec(p->policy & SCHED_FIFO ? | 4213 | jiffies_to_timespec(p->policy & SCHED_FIFO ? |
4207 | 0 : task_timeslice(p), &t); | 4214 | 0 : task_timeslice(p), &t); |
4208 | read_unlock(&tasklist_lock); | 4215 | read_unlock(&tasklist_lock); |
4209 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; | 4216 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; |
4210 | out_nounlock: | 4217 | out_nounlock: |
4211 | return retval; | 4218 | return retval; |
4212 | out_unlock: | 4219 | out_unlock: |
4213 | read_unlock(&tasklist_lock); | 4220 | read_unlock(&tasklist_lock); |
4214 | return retval; | 4221 | return retval; |
4215 | } | 4222 | } |
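A user-space sketch of the matching wrapper (illustration only): pid 0 queries the calling process, and an all-zero timespec means "no timeslice", as the comment above notes:

    #include <sched.h>
    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            struct timespec ts;

            if (sched_rr_get_interval(0, &ts) == 0)
                    printf("timeslice: %ld.%09ld s\n",
                           (long)ts.tv_sec, ts.tv_nsec);
            return 0;
    }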
4216 | 4223 | ||
4217 | static inline struct task_struct *eldest_child(struct task_struct *p) | 4224 | static inline struct task_struct *eldest_child(struct task_struct *p) |
4218 | { | 4225 | { |
4219 | if (list_empty(&p->children)) return NULL; | 4226 | if (list_empty(&p->children)) return NULL; |
4220 | return list_entry(p->children.next,struct task_struct,sibling); | 4227 | return list_entry(p->children.next,struct task_struct,sibling); |
4221 | } | 4228 | } |
4222 | 4229 | ||
4223 | static inline struct task_struct *older_sibling(struct task_struct *p) | 4230 | static inline struct task_struct *older_sibling(struct task_struct *p) |
4224 | { | 4231 | { |
4225 | if (p->sibling.prev==&p->parent->children) return NULL; | 4232 | if (p->sibling.prev==&p->parent->children) return NULL; |
4226 | return list_entry(p->sibling.prev,struct task_struct,sibling); | 4233 | return list_entry(p->sibling.prev,struct task_struct,sibling); |
4227 | } | 4234 | } |
4228 | 4235 | ||
4229 | static inline struct task_struct *younger_sibling(struct task_struct *p) | 4236 | static inline struct task_struct *younger_sibling(struct task_struct *p) |
4230 | { | 4237 | { |
4231 | if (p->sibling.next==&p->parent->children) return NULL; | 4238 | if (p->sibling.next==&p->parent->children) return NULL; |
4232 | return list_entry(p->sibling.next,struct task_struct,sibling); | 4239 | return list_entry(p->sibling.next,struct task_struct,sibling); |
4233 | } | 4240 | } |
4234 | 4241 | ||
4235 | static void show_task(task_t *p) | 4242 | static void show_task(task_t *p) |
4236 | { | 4243 | { |
4237 | task_t *relative; | 4244 | task_t *relative; |
4238 | unsigned state; | 4245 | unsigned state; |
4239 | unsigned long free = 0; | 4246 | unsigned long free = 0; |
4240 | static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; | 4247 | static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; |
4241 | 4248 | ||
4242 | printk("%-13.13s ", p->comm); | 4249 | printk("%-13.13s ", p->comm); |
4243 | state = p->state ? __ffs(p->state) + 1 : 0; | 4250 | state = p->state ? __ffs(p->state) + 1 : 0; |
4244 | if (state < ARRAY_SIZE(stat_nam)) | 4251 | if (state < ARRAY_SIZE(stat_nam)) |
4245 | printk(stat_nam[state]); | 4252 | printk(stat_nam[state]); |
4246 | else | 4253 | else |
4247 | printk("?"); | 4254 | printk("?"); |
4248 | #if (BITS_PER_LONG == 32) | 4255 | #if (BITS_PER_LONG == 32) |
4249 | if (state == TASK_RUNNING) | 4256 | if (state == TASK_RUNNING) |
4250 | printk(" running "); | 4257 | printk(" running "); |
4251 | else | 4258 | else |
4252 | printk(" %08lX ", thread_saved_pc(p)); | 4259 | printk(" %08lX ", thread_saved_pc(p)); |
4253 | #else | 4260 | #else |
4254 | if (state == TASK_RUNNING) | 4261 | if (state == TASK_RUNNING) |
4255 | printk(" running task "); | 4262 | printk(" running task "); |
4256 | else | 4263 | else |
4257 | printk(" %016lx ", thread_saved_pc(p)); | 4264 | printk(" %016lx ", thread_saved_pc(p)); |
4258 | #endif | 4265 | #endif |
4259 | #ifdef CONFIG_DEBUG_STACK_USAGE | 4266 | #ifdef CONFIG_DEBUG_STACK_USAGE |
4260 | { | 4267 | { |
4261 | unsigned long *n = end_of_stack(p); | 4268 | unsigned long *n = end_of_stack(p); |
4262 | while (!*n) | 4269 | while (!*n) |
4263 | n++; | 4270 | n++; |
4264 | free = (unsigned long)n - (unsigned long)end_of_stack(p); | 4271 | free = (unsigned long)n - (unsigned long)end_of_stack(p); |
4265 | } | 4272 | } |
4266 | #endif | 4273 | #endif |
4267 | printk("%5lu %5d %6d ", free, p->pid, p->parent->pid); | 4274 | printk("%5lu %5d %6d ", free, p->pid, p->parent->pid); |
4268 | if ((relative = eldest_child(p))) | 4275 | if ((relative = eldest_child(p))) |
4269 | printk("%5d ", relative->pid); | 4276 | printk("%5d ", relative->pid); |
4270 | else | 4277 | else |
4271 | printk(" "); | 4278 | printk(" "); |
4272 | if ((relative = younger_sibling(p))) | 4279 | if ((relative = younger_sibling(p))) |
4273 | printk("%7d", relative->pid); | 4280 | printk("%7d", relative->pid); |
4274 | else | 4281 | else |
4275 | printk(" "); | 4282 | printk(" "); |
4276 | if ((relative = older_sibling(p))) | 4283 | if ((relative = older_sibling(p))) |
4277 | printk(" %5d", relative->pid); | 4284 | printk(" %5d", relative->pid); |
4278 | else | 4285 | else |
4279 | printk(" "); | 4286 | printk(" "); |
4280 | if (!p->mm) | 4287 | if (!p->mm) |
4281 | printk(" (L-TLB)\n"); | 4288 | printk(" (L-TLB)\n"); |
4282 | else | 4289 | else |
4283 | printk(" (NOTLB)\n"); | 4290 | printk(" (NOTLB)\n"); |
4284 | 4291 | ||
4285 | if (state != TASK_RUNNING) | 4292 | if (state != TASK_RUNNING) |
4286 | show_stack(p, NULL); | 4293 | show_stack(p, NULL); |
4287 | } | 4294 | } |
4288 | 4295 | ||
4289 | void show_state(void) | 4296 | void show_state(void) |
4290 | { | 4297 | { |
4291 | task_t *g, *p; | 4298 | task_t *g, *p; |
4292 | 4299 | ||
4293 | #if (BITS_PER_LONG == 32) | 4300 | #if (BITS_PER_LONG == 32) |
4294 | printk("\n" | 4301 | printk("\n" |
4295 | " sibling\n"); | 4302 | " sibling\n"); |
4296 | printk(" task PC pid father child younger older\n"); | 4303 | printk(" task PC pid father child younger older\n"); |
4297 | #else | 4304 | #else |
4298 | printk("\n" | 4305 | printk("\n" |
4299 | " sibling\n"); | 4306 | " sibling\n"); |
4300 | printk(" task PC pid father child younger older\n"); | 4307 | printk(" task PC pid father child younger older\n"); |
4301 | #endif | 4308 | #endif |
4302 | read_lock(&tasklist_lock); | 4309 | read_lock(&tasklist_lock); |
4303 | do_each_thread(g, p) { | 4310 | do_each_thread(g, p) { |
4304 | /* | 4311 | /* |
4305 | * reset the NMI-timeout, listing all files on a slow | 4312 | * reset the NMI-timeout, listing all files on a slow |
4306 | * console might take a lot of time: | 4313 | * console might take a lot of time: |
4307 | */ | 4314 | */ |
4308 | touch_nmi_watchdog(); | 4315 | touch_nmi_watchdog(); |
4309 | show_task(p); | 4316 | show_task(p); |
4310 | } while_each_thread(g, p); | 4317 | } while_each_thread(g, p); |
4311 | 4318 | ||
4312 | read_unlock(&tasklist_lock); | 4319 | read_unlock(&tasklist_lock); |
4313 | mutex_debug_show_all_locks(); | 4320 | mutex_debug_show_all_locks(); |
4314 | } | 4321 | } |
4315 | 4322 | ||
4316 | /** | 4323 | /** |
4317 | * init_idle - set up an idle thread for a given CPU | 4324 | * init_idle - set up an idle thread for a given CPU |
4318 | * @idle: task in question | 4325 | * @idle: task in question |
4319 | * @cpu: cpu the idle task belongs to | 4326 | * @cpu: cpu the idle task belongs to |
4320 | * | 4327 | * |
4321 | * NOTE: this function does not set the idle thread's NEED_RESCHED | 4328 | * NOTE: this function does not set the idle thread's NEED_RESCHED |
4322 | * flag, to make booting more robust. | 4329 | * flag, to make booting more robust. |
4323 | */ | 4330 | */ |
4324 | void __devinit init_idle(task_t *idle, int cpu) | 4331 | void __devinit init_idle(task_t *idle, int cpu) |
4325 | { | 4332 | { |
4326 | runqueue_t *rq = cpu_rq(cpu); | 4333 | runqueue_t *rq = cpu_rq(cpu); |
4327 | unsigned long flags; | 4334 | unsigned long flags; |
4328 | 4335 | ||
4329 | idle->timestamp = sched_clock(); | 4336 | idle->timestamp = sched_clock(); |
4330 | idle->sleep_avg = 0; | 4337 | idle->sleep_avg = 0; |
4331 | idle->array = NULL; | 4338 | idle->array = NULL; |
4332 | idle->prio = MAX_PRIO; | 4339 | idle->prio = MAX_PRIO; |
4333 | idle->state = TASK_RUNNING; | 4340 | idle->state = TASK_RUNNING; |
4334 | idle->cpus_allowed = cpumask_of_cpu(cpu); | 4341 | idle->cpus_allowed = cpumask_of_cpu(cpu); |
4335 | set_task_cpu(idle, cpu); | 4342 | set_task_cpu(idle, cpu); |
4336 | 4343 | ||
4337 | spin_lock_irqsave(&rq->lock, flags); | 4344 | spin_lock_irqsave(&rq->lock, flags); |
4338 | rq->curr = rq->idle = idle; | 4345 | rq->curr = rq->idle = idle; |
4339 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 4346 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) |
4340 | idle->oncpu = 1; | 4347 | idle->oncpu = 1; |
4341 | #endif | 4348 | #endif |
4342 | spin_unlock_irqrestore(&rq->lock, flags); | 4349 | spin_unlock_irqrestore(&rq->lock, flags); |
4343 | 4350 | ||
4344 | /* Set the preempt count _outside_ the spinlocks! */ | 4351 | /* Set the preempt count _outside_ the spinlocks! */ |
4345 | #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) | 4352 | #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) |
4346 | task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); | 4353 | task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); |
4347 | #else | 4354 | #else |
4348 | task_thread_info(idle)->preempt_count = 0; | 4355 | task_thread_info(idle)->preempt_count = 0; |
4349 | #endif | 4356 | #endif |
4350 | } | 4357 | } |
4351 | 4358 | ||
4352 | /* | 4359 | /* |
4353 | * In a system that switches off the HZ timer nohz_cpu_mask | 4360 | * In a system that switches off the HZ timer nohz_cpu_mask |
4354 | * indicates which cpus entered this state. This is used | 4361 | * indicates which cpus entered this state. This is used |
4355 | * in the rcu update to wait only for active cpus. For systems | 4362 | * in the rcu update to wait only for active cpus. For systems |
4356 | * which do not switch off the HZ timer nohz_cpu_mask should | 4363 | * which do not switch off the HZ timer nohz_cpu_mask should |
4357 | * always be CPU_MASK_NONE. | 4364 | * always be CPU_MASK_NONE. |
4358 | */ | 4365 | */ |
4359 | cpumask_t nohz_cpu_mask = CPU_MASK_NONE; | 4366 | cpumask_t nohz_cpu_mask = CPU_MASK_NONE; |
4360 | 4367 | ||
4361 | #ifdef CONFIG_SMP | 4368 | #ifdef CONFIG_SMP |
4362 | /* | 4369 | /* |
4363 | * This is how migration works: | 4370 | * This is how migration works: |
4364 | * | 4371 | * |
4365 | * 1) we queue a migration_req_t structure in the source CPU's | 4372 | * 1) we queue a migration_req_t structure in the source CPU's |
4366 | * runqueue and wake up that CPU's migration thread. | 4373 | * runqueue and wake up that CPU's migration thread. |
4367 | * 2) we down() the locked semaphore => thread blocks. | 4374 | * 2) we down() the locked semaphore => thread blocks. |
4368 | * 3) migration thread wakes up (implicitly it forces the migrated | 4375 | * 3) migration thread wakes up (implicitly it forces the migrated |
4369 | * thread off the CPU) | 4376 | * thread off the CPU) |
4370 | * 4) it gets the migration request and checks whether the migrated | 4377 | * 4) it gets the migration request and checks whether the migrated |
4371 | * task is still in the wrong runqueue. | 4378 | * task is still in the wrong runqueue. |
4372 | * 5) if it's in the wrong runqueue then the migration thread removes | 4379 | * 5) if it's in the wrong runqueue then the migration thread removes |
4373 | * it and puts it into the right queue. | 4380 | * it and puts it into the right queue. |
4374 | * 6) migration thread up()s the semaphore. | 4381 | * 6) migration thread up()s the semaphore. |
4375 | * 7) we wake up and the migration is done. | 4382 | * 7) we wake up and the migration is done. |
4376 | */ | 4383 | */ |
4377 | 4384 | ||
4378 | /* | 4385 | /* |
4379 | * Change a given task's CPU affinity. Migrate the thread to a | 4386 | * Change a given task's CPU affinity. Migrate the thread to a |
4380 | * proper CPU and schedule it away if the CPU it's executing on | 4387 | * proper CPU and schedule it away if the CPU it's executing on |
4381 | * is removed from the allowed bitmask. | 4388 | * is removed from the allowed bitmask. |
4382 | * | 4389 | * |
4383 | * NOTE: the caller must have a valid reference to the task, the | 4390 | * NOTE: the caller must have a valid reference to the task, the |
4384 | * task must not exit() & deallocate itself prematurely. The | 4391 | * task must not exit() & deallocate itself prematurely. The |
4385 | * call is not atomic; no spinlocks may be held. | 4392 | * call is not atomic; no spinlocks may be held. |
4386 | */ | 4393 | */ |
4387 | int set_cpus_allowed(task_t *p, cpumask_t new_mask) | 4394 | int set_cpus_allowed(task_t *p, cpumask_t new_mask) |
4388 | { | 4395 | { |
4389 | unsigned long flags; | 4396 | unsigned long flags; |
4390 | int ret = 0; | 4397 | int ret = 0; |
4391 | migration_req_t req; | 4398 | migration_req_t req; |
4392 | runqueue_t *rq; | 4399 | runqueue_t *rq; |
4393 | 4400 | ||
4394 | rq = task_rq_lock(p, &flags); | 4401 | rq = task_rq_lock(p, &flags); |
4395 | if (!cpus_intersects(new_mask, cpu_online_map)) { | 4402 | if (!cpus_intersects(new_mask, cpu_online_map)) { |
4396 | ret = -EINVAL; | 4403 | ret = -EINVAL; |
4397 | goto out; | 4404 | goto out; |
4398 | } | 4405 | } |
4399 | 4406 | ||
4400 | p->cpus_allowed = new_mask; | 4407 | p->cpus_allowed = new_mask; |
4401 | /* Can the task run on the task's current CPU? If so, we're done */ | 4408 | /* Can the task run on the task's current CPU? If so, we're done */ |
4402 | if (cpu_isset(task_cpu(p), new_mask)) | 4409 | if (cpu_isset(task_cpu(p), new_mask)) |
4403 | goto out; | 4410 | goto out; |
4404 | 4411 | ||
4405 | if (migrate_task(p, any_online_cpu(new_mask), &req)) { | 4412 | if (migrate_task(p, any_online_cpu(new_mask), &req)) { |
4406 | /* Need help from migration thread: drop lock and wait. */ | 4413 | /* Need help from migration thread: drop lock and wait. */ |
4407 | task_rq_unlock(rq, &flags); | 4414 | task_rq_unlock(rq, &flags); |
4408 | wake_up_process(rq->migration_thread); | 4415 | wake_up_process(rq->migration_thread); |
4409 | wait_for_completion(&req.done); | 4416 | wait_for_completion(&req.done); |
4410 | tlb_migrate_finish(p->mm); | 4417 | tlb_migrate_finish(p->mm); |
4411 | return 0; | 4418 | return 0; |
4412 | } | 4419 | } |
4413 | out: | 4420 | out: |
4414 | task_rq_unlock(rq, &flags); | 4421 | task_rq_unlock(rq, &flags); |
4415 | return ret; | 4422 | return ret; |
4416 | } | 4423 | } |
4417 | 4424 | ||
4418 | EXPORT_SYMBOL_GPL(set_cpus_allowed); | 4425 | EXPORT_SYMBOL_GPL(set_cpus_allowed); |
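An in-kernel sketch of a typical caller (my_fn and the thread name are hypothetical): pin a freshly created kernel thread to one CPU before waking it, letting the migration machinery described above move it if it started out elsewhere:

    struct task_struct *tsk;

    tsk = kthread_create(my_fn, NULL, "my_worker");
    if (!IS_ERR(tsk)) {
            set_cpus_allowed(tsk, cpumask_of_cpu(1));       /* CPU 1 only */
            wake_up_process(tsk);
    }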
4419 | 4426 | ||
4420 | /* | 4427 | /* |
4421 | * Move (not current) task off this cpu, onto dest cpu. We're doing | 4428 | * Move (not current) task off this cpu, onto dest cpu. We're doing |
4422 | * this because either it can't run here any more (set_cpus_allowed() | 4429 | * this because either it can't run here any more (set_cpus_allowed() |
4423 | * away from this CPU, or CPU going down), or because we're | 4430 | * away from this CPU, or CPU going down), or because we're |
4424 | * attempting to rebalance this task on exec (sched_exec). | 4431 | * attempting to rebalance this task on exec (sched_exec). |
4425 | * | 4432 | * |
4426 | * So we race with normal scheduler movements, but that's OK, as long | 4433 | * So we race with normal scheduler movements, but that's OK, as long |
4427 | * as the task is no longer on this CPU. | 4434 | * as the task is no longer on this CPU. |
4428 | */ | 4435 | */ |
4429 | static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | 4436 | static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) |
4430 | { | 4437 | { |
4431 | runqueue_t *rq_dest, *rq_src; | 4438 | runqueue_t *rq_dest, *rq_src; |
4432 | 4439 | ||
4433 | if (unlikely(cpu_is_offline(dest_cpu))) | 4440 | if (unlikely(cpu_is_offline(dest_cpu))) |
4434 | return; | 4441 | return; |
4435 | 4442 | ||
4436 | rq_src = cpu_rq(src_cpu); | 4443 | rq_src = cpu_rq(src_cpu); |
4437 | rq_dest = cpu_rq(dest_cpu); | 4444 | rq_dest = cpu_rq(dest_cpu); |
4438 | 4445 | ||
4439 | double_rq_lock(rq_src, rq_dest); | 4446 | double_rq_lock(rq_src, rq_dest); |
4440 | /* Already moved. */ | 4447 | /* Already moved. */ |
4441 | if (task_cpu(p) != src_cpu) | 4448 | if (task_cpu(p) != src_cpu) |
4442 | goto out; | 4449 | goto out; |
4443 | /* Affinity changed (again). */ | 4450 | /* Affinity changed (again). */ |
4444 | if (!cpu_isset(dest_cpu, p->cpus_allowed)) | 4451 | if (!cpu_isset(dest_cpu, p->cpus_allowed)) |
4445 | goto out; | 4452 | goto out; |
4446 | 4453 | ||
4447 | set_task_cpu(p, dest_cpu); | 4454 | set_task_cpu(p, dest_cpu); |
4448 | if (p->array) { | 4455 | if (p->array) { |
4449 | /* | 4456 | /* |
4450 | * Sync timestamp with rq_dest's before activating. | 4457 | * Sync timestamp with rq_dest's before activating. |
4451 | * The same thing could be achieved by doing this step | 4458 | * The same thing could be achieved by doing this step |
4452 | * afterwards, and pretending it was a local activate. | 4459 | * afterwards, and pretending it was a local activate. |
4453 | * This way is cleaner and logically correct. | 4460 | * This way is cleaner and logically correct. |
4454 | */ | 4461 | */ |
4455 | p->timestamp = p->timestamp - rq_src->timestamp_last_tick | 4462 | p->timestamp = p->timestamp - rq_src->timestamp_last_tick |
4456 | + rq_dest->timestamp_last_tick; | 4463 | + rq_dest->timestamp_last_tick; |
4457 | deactivate_task(p, rq_src); | 4464 | deactivate_task(p, rq_src); |
4458 | activate_task(p, rq_dest, 0); | 4465 | activate_task(p, rq_dest, 0); |
4459 | if (TASK_PREEMPTS_CURR(p, rq_dest)) | 4466 | if (TASK_PREEMPTS_CURR(p, rq_dest)) |
4460 | resched_task(rq_dest->curr); | 4467 | resched_task(rq_dest->curr); |
4461 | } | 4468 | } |
4462 | 4469 | ||
4463 | out: | 4470 | out: |
4464 | double_rq_unlock(rq_src, rq_dest); | 4471 | double_rq_unlock(rq_src, rq_dest); |
4465 | } | 4472 | } |
4466 | 4473 | ||
4467 | /* | 4474 | /* |
4468 | * migration_thread - this is a highprio system thread that performs | 4475 | * migration_thread - this is a highprio system thread that performs |
4469 | * thread migration by bumping thread off CPU then 'pushing' onto | 4476 | * thread migration by bumping thread off CPU then 'pushing' onto |
4470 | * another runqueue. | 4477 | * another runqueue. |
4471 | */ | 4478 | */ |
4472 | static int migration_thread(void *data) | 4479 | static int migration_thread(void *data) |
4473 | { | 4480 | { |
4474 | runqueue_t *rq; | 4481 | runqueue_t *rq; |
4475 | int cpu = (long)data; | 4482 | int cpu = (long)data; |
4476 | 4483 | ||
4477 | rq = cpu_rq(cpu); | 4484 | rq = cpu_rq(cpu); |
4478 | BUG_ON(rq->migration_thread != current); | 4485 | BUG_ON(rq->migration_thread != current); |
4479 | 4486 | ||
4480 | set_current_state(TASK_INTERRUPTIBLE); | 4487 | set_current_state(TASK_INTERRUPTIBLE); |
4481 | while (!kthread_should_stop()) { | 4488 | while (!kthread_should_stop()) { |
4482 | struct list_head *head; | 4489 | struct list_head *head; |
4483 | migration_req_t *req; | 4490 | migration_req_t *req; |
4484 | 4491 | ||
4485 | try_to_freeze(); | 4492 | try_to_freeze(); |
4486 | 4493 | ||
4487 | spin_lock_irq(&rq->lock); | 4494 | spin_lock_irq(&rq->lock); |
4488 | 4495 | ||
4489 | if (cpu_is_offline(cpu)) { | 4496 | if (cpu_is_offline(cpu)) { |
4490 | spin_unlock_irq(&rq->lock); | 4497 | spin_unlock_irq(&rq->lock); |
4491 | goto wait_to_die; | 4498 | goto wait_to_die; |
4492 | } | 4499 | } |
4493 | 4500 | ||
4494 | if (rq->active_balance) { | 4501 | if (rq->active_balance) { |
4495 | active_load_balance(rq, cpu); | 4502 | active_load_balance(rq, cpu); |
4496 | rq->active_balance = 0; | 4503 | rq->active_balance = 0; |
4497 | } | 4504 | } |
4498 | 4505 | ||
4499 | head = &rq->migration_queue; | 4506 | head = &rq->migration_queue; |
4500 | 4507 | ||
4501 | if (list_empty(head)) { | 4508 | if (list_empty(head)) { |
4502 | spin_unlock_irq(&rq->lock); | 4509 | spin_unlock_irq(&rq->lock); |
4503 | schedule(); | 4510 | schedule(); |
4504 | set_current_state(TASK_INTERRUPTIBLE); | 4511 | set_current_state(TASK_INTERRUPTIBLE); |
4505 | continue; | 4512 | continue; |
4506 | } | 4513 | } |
4507 | req = list_entry(head->next, migration_req_t, list); | 4514 | req = list_entry(head->next, migration_req_t, list); |
4508 | list_del_init(head->next); | 4515 | list_del_init(head->next); |
4509 | 4516 | ||
4510 | spin_unlock(&rq->lock); | 4517 | spin_unlock(&rq->lock); |
4511 | __migrate_task(req->task, cpu, req->dest_cpu); | 4518 | __migrate_task(req->task, cpu, req->dest_cpu); |
4512 | local_irq_enable(); | 4519 | local_irq_enable(); |
4513 | 4520 | ||
4514 | complete(&req->done); | 4521 | complete(&req->done); |
4515 | } | 4522 | } |
4516 | __set_current_state(TASK_RUNNING); | 4523 | __set_current_state(TASK_RUNNING); |
4517 | return 0; | 4524 | return 0; |
4518 | 4525 | ||
4519 | wait_to_die: | 4526 | wait_to_die: |
4520 | /* Wait for kthread_stop */ | 4527 | /* Wait for kthread_stop */ |
4521 | set_current_state(TASK_INTERRUPTIBLE); | 4528 | set_current_state(TASK_INTERRUPTIBLE); |
4522 | while (!kthread_should_stop()) { | 4529 | while (!kthread_should_stop()) { |
4523 | schedule(); | 4530 | schedule(); |
4524 | set_current_state(TASK_INTERRUPTIBLE); | 4531 | set_current_state(TASK_INTERRUPTIBLE); |
4525 | } | 4532 | } |
4526 | __set_current_state(TASK_RUNNING); | 4533 | __set_current_state(TASK_RUNNING); |
4527 | return 0; | 4534 | return 0; |
4528 | } | 4535 | } |
4529 | 4536 | ||
4530 | #ifdef CONFIG_HOTPLUG_CPU | 4537 | #ifdef CONFIG_HOTPLUG_CPU |
4531 | /* Figure out where task on dead CPU should go, use force if necessary. */ | 4538 | /* Figure out where task on dead CPU should go, use force if necessary. */ |
4532 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) | 4539 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) |
4533 | { | 4540 | { |
4534 | int dest_cpu; | 4541 | int dest_cpu; |
4535 | cpumask_t mask; | 4542 | cpumask_t mask; |
4536 | 4543 | ||
4537 | /* On same node? */ | 4544 | /* On same node? */ |
4538 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); | 4545 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); |
4539 | cpus_and(mask, mask, tsk->cpus_allowed); | 4546 | cpus_and(mask, mask, tsk->cpus_allowed); |
4540 | dest_cpu = any_online_cpu(mask); | 4547 | dest_cpu = any_online_cpu(mask); |
4541 | 4548 | ||
4542 | /* On any allowed CPU? */ | 4549 | /* On any allowed CPU? */ |
4543 | if (dest_cpu == NR_CPUS) | 4550 | if (dest_cpu == NR_CPUS) |
4544 | dest_cpu = any_online_cpu(tsk->cpus_allowed); | 4551 | dest_cpu = any_online_cpu(tsk->cpus_allowed); |
4545 | 4552 | ||
4546 | /* No more Mr. Nice Guy. */ | 4553 | /* No more Mr. Nice Guy. */ |
4547 | if (dest_cpu == NR_CPUS) { | 4554 | if (dest_cpu == NR_CPUS) { |
4548 | cpus_setall(tsk->cpus_allowed); | 4555 | cpus_setall(tsk->cpus_allowed); |
4549 | dest_cpu = any_online_cpu(tsk->cpus_allowed); | 4556 | dest_cpu = any_online_cpu(tsk->cpus_allowed); |
4550 | 4557 | ||
4551 | /* | 4558 | /* |
4552 | * Don't tell them about moving exiting tasks or | 4559 | * Don't tell them about moving exiting tasks or |
4553 | * kernel threads (both mm NULL), since they never | 4560 | * kernel threads (both mm NULL), since they never |
4554 | * leave kernel. | 4561 | * leave kernel. |
4555 | */ | 4562 | */ |
4556 | if (tsk->mm && printk_ratelimit()) | 4563 | if (tsk->mm && printk_ratelimit()) |
4557 | printk(KERN_INFO "process %d (%s) no " | 4564 | printk(KERN_INFO "process %d (%s) no " |
4558 | "longer affine to cpu%d\n", | 4565 | "longer affine to cpu%d\n", |
4559 | tsk->pid, tsk->comm, dead_cpu); | 4566 | tsk->pid, tsk->comm, dead_cpu); |
4560 | } | 4567 | } |
4561 | __migrate_task(tsk, dead_cpu, dest_cpu); | 4568 | __migrate_task(tsk, dead_cpu, dest_cpu); |
4562 | } | 4569 | } |
4563 | 4570 | ||
4564 | /* | 4571 | /* |
4565 | * While a dead CPU has no uninterruptible tasks queued at this point, | 4572 | * While a dead CPU has no uninterruptible tasks queued at this point, |
4566 | * it might still have a nonzero ->nr_uninterruptible counter, because | 4573 | * it might still have a nonzero ->nr_uninterruptible counter, because |
4567 | * for performance reasons the counter is not strictly tracking tasks to | 4574 | * for performance reasons the counter is not strictly tracking tasks to |
4568 | * their home CPUs. So we just add the counter to another CPU's counter, | 4575 | * their home CPUs. So we just add the counter to another CPU's counter, |
4569 | * to keep the global sum constant after CPU-down: | 4576 | * to keep the global sum constant after CPU-down: |
4570 | */ | 4577 | */ |
4571 | static void migrate_nr_uninterruptible(runqueue_t *rq_src) | 4578 | static void migrate_nr_uninterruptible(runqueue_t *rq_src) |
4572 | { | 4579 | { |
4573 | runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); | 4580 | runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); |
4574 | unsigned long flags; | 4581 | unsigned long flags; |
4575 | 4582 | ||
4576 | local_irq_save(flags); | 4583 | local_irq_save(flags); |
4577 | double_rq_lock(rq_src, rq_dest); | 4584 | double_rq_lock(rq_src, rq_dest); |
4578 | rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; | 4585 | rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; |
4579 | rq_src->nr_uninterruptible = 0; | 4586 | rq_src->nr_uninterruptible = 0; |
4580 | double_rq_unlock(rq_src, rq_dest); | 4587 | double_rq_unlock(rq_src, rq_dest); |
4581 | local_irq_restore(flags); | 4588 | local_irq_restore(flags); |
4582 | } | 4589 | } |
4583 | 4590 | ||
4584 | /* Run through task list and migrate tasks from the dead cpu. */ | 4591 | /* Run through task list and migrate tasks from the dead cpu. */ |
4585 | static void migrate_live_tasks(int src_cpu) | 4592 | static void migrate_live_tasks(int src_cpu) |
4586 | { | 4593 | { |
4587 | struct task_struct *tsk, *t; | 4594 | struct task_struct *tsk, *t; |
4588 | 4595 | ||
4589 | write_lock_irq(&tasklist_lock); | 4596 | write_lock_irq(&tasklist_lock); |
4590 | 4597 | ||
4591 | do_each_thread(t, tsk) { | 4598 | do_each_thread(t, tsk) { |
4592 | if (tsk == current) | 4599 | if (tsk == current) |
4593 | continue; | 4600 | continue; |
4594 | 4601 | ||
4595 | if (task_cpu(tsk) == src_cpu) | 4602 | if (task_cpu(tsk) == src_cpu) |
4596 | move_task_off_dead_cpu(src_cpu, tsk); | 4603 | move_task_off_dead_cpu(src_cpu, tsk); |
4597 | } while_each_thread(t, tsk); | 4604 | } while_each_thread(t, tsk); |
4598 | 4605 | ||
4599 | write_unlock_irq(&tasklist_lock); | 4606 | write_unlock_irq(&tasklist_lock); |
4600 | } | 4607 | } |
4601 | 4608 | ||
4602 | /* Schedules idle task to be the next runnable task on current CPU. | 4609 | /* Schedules idle task to be the next runnable task on current CPU. |
4603 | * It does so by boosting its priority to highest possible and adding it to | 4610 | * It does so by boosting its priority to highest possible and adding it to |
4604 | * the _front_ of runqueue. Used by CPU offline code. | 4611 | * the _front_ of runqueue. Used by CPU offline code. |
4605 | */ | 4612 | */ |
4606 | void sched_idle_next(void) | 4613 | void sched_idle_next(void) |
4607 | { | 4614 | { |
4608 | int cpu = smp_processor_id(); | 4615 | int cpu = smp_processor_id(); |
4609 | runqueue_t *rq = this_rq(); | 4616 | runqueue_t *rq = this_rq(); |
4610 | struct task_struct *p = rq->idle; | 4617 | struct task_struct *p = rq->idle; |
4611 | unsigned long flags; | 4618 | unsigned long flags; |
4612 | 4619 | ||
4613 | /* cpu has to be offline */ | 4620 | /* cpu has to be offline */ |
4614 | BUG_ON(cpu_online(cpu)); | 4621 | BUG_ON(cpu_online(cpu)); |
4615 | 4622 | ||
4616 | /* Strictly not necessary since rest of the CPUs are stopped by now | 4623 | /* Strictly not necessary since rest of the CPUs are stopped by now |
4617 | * and interrupts disabled on current cpu. | 4624 | * and interrupts disabled on current cpu. |
4618 | */ | 4625 | */ |
4619 | spin_lock_irqsave(&rq->lock, flags); | 4626 | spin_lock_irqsave(&rq->lock, flags); |
4620 | 4627 | ||
4621 | __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); | 4628 | __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); |
4622 | /* Add idle task to _front_ of its priority queue | 4629 | /* Add idle task to _front_ of its priority queue |
4623 | __activate_idle_task(p, rq); | 4630 | __activate_idle_task(p, rq); |
4624 | 4631 | ||
4625 | spin_unlock_irqrestore(&rq->lock, flags); | 4632 | spin_unlock_irqrestore(&rq->lock, flags); |
4626 | } | 4633 | } |
4627 | 4634 | ||
4628 | /* Ensures that the idle task is using init_mm right before its cpu goes | 4635 | /* Ensures that the idle task is using init_mm right before its cpu goes |
4629 | * offline. | 4636 | * offline. |
4630 | */ | 4637 | */ |
4631 | void idle_task_exit(void) | 4638 | void idle_task_exit(void) |
4632 | { | 4639 | { |
4633 | struct mm_struct *mm = current->active_mm; | 4640 | struct mm_struct *mm = current->active_mm; |
4634 | 4641 | ||
4635 | BUG_ON(cpu_online(smp_processor_id())); | 4642 | BUG_ON(cpu_online(smp_processor_id())); |
4636 | 4643 | ||
4637 | if (mm != &init_mm) | 4644 | if (mm != &init_mm) |
4638 | switch_mm(mm, &init_mm, current); | 4645 | switch_mm(mm, &init_mm, current); |
4639 | mmdrop(mm); | 4646 | mmdrop(mm); |
4640 | } | 4647 | } |
4641 | 4648 | ||
4642 | static void migrate_dead(unsigned int dead_cpu, task_t *tsk) | 4649 | static void migrate_dead(unsigned int dead_cpu, task_t *tsk) |
4643 | { | 4650 | { |
4644 | struct runqueue *rq = cpu_rq(dead_cpu); | 4651 | struct runqueue *rq = cpu_rq(dead_cpu); |
4645 | 4652 | ||
4646 | /* Must be exiting, otherwise would be on tasklist. */ | 4653 | /* Must be exiting, otherwise would be on tasklist. */ |
4647 | BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD); | 4654 | BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD); |
4648 | 4655 | ||
4649 | /* Cannot have done final schedule yet: would have vanished. */ | 4656 | /* Cannot have done final schedule yet: would have vanished. */ |
4650 | BUG_ON(tsk->flags & PF_DEAD); | 4657 | BUG_ON(tsk->flags & PF_DEAD); |
4651 | 4658 | ||
4652 | get_task_struct(tsk); | 4659 | get_task_struct(tsk); |
4653 | 4660 | ||
4654 | /* | 4661 | /* |
4655 | * Drop lock around migration; if someone else moves it, | 4662 | * Drop lock around migration; if someone else moves it, |
4656 | * that's OK. No task can be added to this CPU, so iteration is | 4663 | * that's OK. No task can be added to this CPU, so iteration is |
4657 | * fine. | 4664 | * fine. |
4658 | */ | 4665 | */ |
4659 | spin_unlock_irq(&rq->lock); | 4666 | spin_unlock_irq(&rq->lock); |
4660 | move_task_off_dead_cpu(dead_cpu, tsk); | 4667 | move_task_off_dead_cpu(dead_cpu, tsk); |
4661 | spin_lock_irq(&rq->lock); | 4668 | spin_lock_irq(&rq->lock); |
4662 | 4669 | ||
4663 | put_task_struct(tsk); | 4670 | put_task_struct(tsk); |
4664 | } | 4671 | } |
4665 | 4672 | ||
4666 | /* release_task() removes task from tasklist, so we won't find dead tasks. */ | 4673 | /* release_task() removes task from tasklist, so we won't find dead tasks. */ |
4667 | static void migrate_dead_tasks(unsigned int dead_cpu) | 4674 | static void migrate_dead_tasks(unsigned int dead_cpu) |
4668 | { | 4675 | { |
4669 | unsigned arr, i; | 4676 | unsigned arr, i; |
4670 | struct runqueue *rq = cpu_rq(dead_cpu); | 4677 | struct runqueue *rq = cpu_rq(dead_cpu); |
4671 | 4678 | ||
4672 | for (arr = 0; arr < 2; arr++) { | 4679 | for (arr = 0; arr < 2; arr++) { |
4673 | for (i = 0; i < MAX_PRIO; i++) { | 4680 | for (i = 0; i < MAX_PRIO; i++) { |
4674 | struct list_head *list = &rq->arrays[arr].queue[i]; | 4681 | struct list_head *list = &rq->arrays[arr].queue[i]; |
4675 | while (!list_empty(list)) | 4682 | while (!list_empty(list)) |
4676 | migrate_dead(dead_cpu, | 4683 | migrate_dead(dead_cpu, |
4677 | list_entry(list->next, task_t, | 4684 | list_entry(list->next, task_t, |
4678 | run_list)); | 4685 | run_list)); |
4679 | } | 4686 | } |
4680 | } | 4687 | } |
4681 | } | 4688 | } |
4682 | #endif /* CONFIG_HOTPLUG_CPU */ | 4689 | #endif /* CONFIG_HOTPLUG_CPU */ |
4683 | 4690 | ||
4684 | /* | 4691 | /* |
4685 | * migration_call - callback that gets triggered when a CPU is added. | 4692 | * migration_call - callback that gets triggered when a CPU is added. |
4686 | * Here we can start up the necessary migration thread for the new CPU. | 4693 | * Here we can start up the necessary migration thread for the new CPU. |
4687 | */ | 4694 | */ |
4688 | static int migration_call(struct notifier_block *nfb, unsigned long action, | 4695 | static int migration_call(struct notifier_block *nfb, unsigned long action, |
4689 | void *hcpu) | 4696 | void *hcpu) |
4690 | { | 4697 | { |
4691 | int cpu = (long)hcpu; | 4698 | int cpu = (long)hcpu; |
4692 | struct task_struct *p; | 4699 | struct task_struct *p; |
4693 | struct runqueue *rq; | 4700 | struct runqueue *rq; |
4694 | unsigned long flags; | 4701 | unsigned long flags; |
4695 | 4702 | ||
4696 | switch (action) { | 4703 | switch (action) { |
4697 | case CPU_UP_PREPARE: | 4704 | case CPU_UP_PREPARE: |
4698 | p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); | 4705 | p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); |
4699 | if (IS_ERR(p)) | 4706 | if (IS_ERR(p)) |
4700 | return NOTIFY_BAD; | 4707 | return NOTIFY_BAD; |
4701 | p->flags |= PF_NOFREEZE; | 4708 | p->flags |= PF_NOFREEZE; |
4702 | kthread_bind(p, cpu); | 4709 | kthread_bind(p, cpu); |
4703 | /* Must be high prio: stop_machine expects to yield to it. */ | 4710 | /* Must be high prio: stop_machine expects to yield to it. */ |
4704 | rq = task_rq_lock(p, &flags); | 4711 | rq = task_rq_lock(p, &flags); |
4705 | __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); | 4712 | __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); |
4706 | task_rq_unlock(rq, &flags); | 4713 | task_rq_unlock(rq, &flags); |
4707 | cpu_rq(cpu)->migration_thread = p; | 4714 | cpu_rq(cpu)->migration_thread = p; |
4708 | break; | 4715 | break; |
4709 | case CPU_ONLINE: | 4716 | case CPU_ONLINE: |
4710 | /* Strictly unnecessary, as first user will wake it. */ | 4717 | /* Strictly unnecessary, as first user will wake it. */ |
4711 | wake_up_process(cpu_rq(cpu)->migration_thread); | 4718 | wake_up_process(cpu_rq(cpu)->migration_thread); |
4712 | break; | 4719 | break; |
4713 | #ifdef CONFIG_HOTPLUG_CPU | 4720 | #ifdef CONFIG_HOTPLUG_CPU |
4714 | case CPU_UP_CANCELED: | 4721 | case CPU_UP_CANCELED: |
4715 | /* Unbind it from offline cpu so it can run. Fall thru. */ | 4722 | /* Unbind it from offline cpu so it can run. Fall thru. */ |
4716 | kthread_bind(cpu_rq(cpu)->migration_thread, | 4723 | kthread_bind(cpu_rq(cpu)->migration_thread, |
4717 | any_online_cpu(cpu_online_map)); | 4724 | any_online_cpu(cpu_online_map)); |
4718 | kthread_stop(cpu_rq(cpu)->migration_thread); | 4725 | kthread_stop(cpu_rq(cpu)->migration_thread); |
4719 | cpu_rq(cpu)->migration_thread = NULL; | 4726 | cpu_rq(cpu)->migration_thread = NULL; |
4720 | break; | 4727 | break; |
4721 | case CPU_DEAD: | 4728 | case CPU_DEAD: |
4722 | migrate_live_tasks(cpu); | 4729 | migrate_live_tasks(cpu); |
4723 | rq = cpu_rq(cpu); | 4730 | rq = cpu_rq(cpu); |
4724 | kthread_stop(rq->migration_thread); | 4731 | kthread_stop(rq->migration_thread); |
4725 | rq->migration_thread = NULL; | 4732 | rq->migration_thread = NULL; |
4726 | /* Idle task back to normal (off runqueue, low prio) */ | 4733 | /* Idle task back to normal (off runqueue, low prio) */ |
4727 | rq = task_rq_lock(rq->idle, &flags); | 4734 | rq = task_rq_lock(rq->idle, &flags); |
4728 | deactivate_task(rq->idle, rq); | 4735 | deactivate_task(rq->idle, rq); |
4729 | rq->idle->static_prio = MAX_PRIO; | 4736 | rq->idle->static_prio = MAX_PRIO; |
4730 | __setscheduler(rq->idle, SCHED_NORMAL, 0); | 4737 | __setscheduler(rq->idle, SCHED_NORMAL, 0); |
4731 | migrate_dead_tasks(cpu); | 4738 | migrate_dead_tasks(cpu); |
4732 | task_rq_unlock(rq, &flags); | 4739 | task_rq_unlock(rq, &flags); |
4733 | migrate_nr_uninterruptible(rq); | 4740 | migrate_nr_uninterruptible(rq); |
4734 | BUG_ON(rq->nr_running != 0); | 4741 | BUG_ON(rq->nr_running != 0); |
4735 | 4742 | ||
4736 | /* No need to migrate the tasks: it was best-effort if | 4743 | /* No need to migrate the tasks: it was best-effort if |
4737 | * they didn't do lock_cpu_hotplug(). Just wake up | 4744 | * they didn't do lock_cpu_hotplug(). Just wake up |
4738 | * the requestors. */ | 4745 | * the requestors. */ |
4739 | spin_lock_irq(&rq->lock); | 4746 | spin_lock_irq(&rq->lock); |
4740 | while (!list_empty(&rq->migration_queue)) { | 4747 | while (!list_empty(&rq->migration_queue)) { |
4741 | migration_req_t *req; | 4748 | migration_req_t *req; |
4742 | req = list_entry(rq->migration_queue.next, | 4749 | req = list_entry(rq->migration_queue.next, |
4743 | migration_req_t, list); | 4750 | migration_req_t, list); |
4744 | list_del_init(&req->list); | 4751 | list_del_init(&req->list); |
4745 | complete(&req->done); | 4752 | complete(&req->done); |
4746 | } | 4753 | } |
4747 | spin_unlock_irq(&rq->lock); | 4754 | spin_unlock_irq(&rq->lock); |
4748 | break; | 4755 | break; |
4749 | #endif | 4756 | #endif |
4750 | } | 4757 | } |
4751 | return NOTIFY_OK; | 4758 | return NOTIFY_OK; |
4752 | } | 4759 | } |
4753 | 4760 | ||
4754 | /* Register at highest priority so that task migration (migrate_all_tasks) | 4761 | /* Register at highest priority so that task migration (migrate_all_tasks) |
4755 | * happens before everything else. | 4762 | * happens before everything else. |
4756 | */ | 4763 | */ |
4757 | static struct notifier_block __devinitdata migration_notifier = { | 4764 | static struct notifier_block __devinitdata migration_notifier = { |
4758 | .notifier_call = migration_call, | 4765 | .notifier_call = migration_call, |
4759 | .priority = 10 | 4766 | .priority = 10 |
4760 | }; | 4767 | }; |
4761 | 4768 | ||
4762 | int __init migration_init(void) | 4769 | int __init migration_init(void) |
4763 | { | 4770 | { |
4764 | void *cpu = (void *)(long)smp_processor_id(); | 4771 | void *cpu = (void *)(long)smp_processor_id(); |
4765 | /* Start one for boot CPU. */ | 4772 | /* Start one for boot CPU. */ |
4766 | migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); | 4773 | migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); |
4767 | migration_call(&migration_notifier, CPU_ONLINE, cpu); | 4774 | migration_call(&migration_notifier, CPU_ONLINE, cpu); |
4768 | register_cpu_notifier(&migration_notifier); | 4775 | register_cpu_notifier(&migration_notifier); |
4769 | return 0; | 4776 | return 0; |
4770 | } | 4777 | } |
4771 | #endif | 4778 | #endif |
4772 | 4779 | ||
4773 | #ifdef CONFIG_SMP | 4780 | #ifdef CONFIG_SMP |
4774 | #undef SCHED_DOMAIN_DEBUG | 4781 | #undef SCHED_DOMAIN_DEBUG |
4775 | #ifdef SCHED_DOMAIN_DEBUG | 4782 | #ifdef SCHED_DOMAIN_DEBUG |
4776 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | 4783 | static void sched_domain_debug(struct sched_domain *sd, int cpu) |
4777 | { | 4784 | { |
4778 | int level = 0; | 4785 | int level = 0; |
4779 | 4786 | ||
4780 | if (!sd) { | 4787 | if (!sd) { |
4781 | printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); | 4788 | printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); |
4782 | return; | 4789 | return; |
4783 | } | 4790 | } |
4784 | 4791 | ||
4785 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); | 4792 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); |
4786 | 4793 | ||
4787 | do { | 4794 | do { |
4788 | int i; | 4795 | int i; |
4789 | char str[NR_CPUS]; | 4796 | char str[NR_CPUS]; |
4790 | struct sched_group *group = sd->groups; | 4797 | struct sched_group *group = sd->groups; |
4791 | cpumask_t groupmask; | 4798 | cpumask_t groupmask; |
4792 | 4799 | ||
4793 | cpumask_scnprintf(str, NR_CPUS, sd->span); | 4800 | cpumask_scnprintf(str, NR_CPUS, sd->span); |
4794 | cpus_clear(groupmask); | 4801 | cpus_clear(groupmask); |
4795 | 4802 | ||
4796 | printk(KERN_DEBUG); | 4803 | printk(KERN_DEBUG); |
4797 | for (i = 0; i < level + 1; i++) | 4804 | for (i = 0; i < level + 1; i++) |
4798 | printk(" "); | 4805 | printk(" "); |
4799 | printk("domain %d: ", level); | 4806 | printk("domain %d: ", level); |
4800 | 4807 | ||
4801 | if (!(sd->flags & SD_LOAD_BALANCE)) { | 4808 | if (!(sd->flags & SD_LOAD_BALANCE)) { |
4802 | printk("does not load-balance\n"); | 4809 | printk("does not load-balance\n"); |
4803 | if (sd->parent) | 4810 | if (sd->parent) |
4804 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); | 4811 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); |
4805 | break; | 4812 | break; |
4806 | } | 4813 | } |
4807 | 4814 | ||
4808 | printk("span %s\n", str); | 4815 | printk("span %s\n", str); |
4809 | 4816 | ||
4810 | if (!cpu_isset(cpu, sd->span)) | 4817 | if (!cpu_isset(cpu, sd->span)) |
4811 | printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); | 4818 | printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); |
4812 | if (!cpu_isset(cpu, group->cpumask)) | 4819 | if (!cpu_isset(cpu, group->cpumask)) |
4813 | printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); | 4820 | printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); |
4814 | 4821 | ||
4815 | printk(KERN_DEBUG); | 4822 | printk(KERN_DEBUG); |
4816 | for (i = 0; i < level + 2; i++) | 4823 | for (i = 0; i < level + 2; i++) |
4817 | printk(" "); | 4824 | printk(" "); |
4818 | printk("groups:"); | 4825 | printk("groups:"); |
4819 | do { | 4826 | do { |
4820 | if (!group) { | 4827 | if (!group) { |
4821 | printk("\n"); | 4828 | printk("\n"); |
4822 | printk(KERN_ERR "ERROR: group is NULL\n"); | 4829 | printk(KERN_ERR "ERROR: group is NULL\n"); |
4823 | break; | 4830 | break; |
4824 | } | 4831 | } |
4825 | 4832 | ||
4826 | if (!group->cpu_power) { | 4833 | if (!group->cpu_power) { |
4827 | printk("\n"); | 4834 | printk("\n"); |
4828 | printk(KERN_ERR "ERROR: domain->cpu_power not set\n"); | 4835 | printk(KERN_ERR "ERROR: domain->cpu_power not set\n"); |
4829 | } | 4836 | } |
4830 | 4837 | ||
4831 | if (!cpus_weight(group->cpumask)) { | 4838 | if (!cpus_weight(group->cpumask)) { |
4832 | printk("\n"); | 4839 | printk("\n"); |
4833 | printk(KERN_ERR "ERROR: empty group\n"); | 4840 | printk(KERN_ERR "ERROR: empty group\n"); |
4834 | } | 4841 | } |
4835 | 4842 | ||
4836 | if (cpus_intersects(groupmask, group->cpumask)) { | 4843 | if (cpus_intersects(groupmask, group->cpumask)) { |
4837 | printk("\n"); | 4844 | printk("\n"); |
4838 | printk(KERN_ERR "ERROR: repeated CPUs\n"); | 4845 | printk(KERN_ERR "ERROR: repeated CPUs\n"); |
4839 | } | 4846 | } |
4840 | 4847 | ||
4841 | cpus_or(groupmask, groupmask, group->cpumask); | 4848 | cpus_or(groupmask, groupmask, group->cpumask); |
4842 | 4849 | ||
4843 | cpumask_scnprintf(str, NR_CPUS, group->cpumask); | 4850 | cpumask_scnprintf(str, NR_CPUS, group->cpumask); |
4844 | printk(" %s", str); | 4851 | printk(" %s", str); |
4845 | 4852 | ||
4846 | group = group->next; | 4853 | group = group->next; |
4847 | } while (group != sd->groups); | 4854 | } while (group != sd->groups); |
4848 | printk("\n"); | 4855 | printk("\n"); |
4849 | 4856 | ||
4850 | if (!cpus_equal(sd->span, groupmask)) | 4857 | if (!cpus_equal(sd->span, groupmask)) |
4851 | printk(KERN_ERR "ERROR: groups don't span domain->span\n"); | 4858 | printk(KERN_ERR "ERROR: groups don't span domain->span\n"); |
4852 | 4859 | ||
4853 | level++; | 4860 | level++; |
4854 | sd = sd->parent; | 4861 | sd = sd->parent; |
4855 | 4862 | ||
4856 | if (sd) { | 4863 | if (sd) { |
4857 | if (!cpus_subset(groupmask, sd->span)) | 4864 | if (!cpus_subset(groupmask, sd->span)) |
4858 | printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); | 4865 | printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); |
4859 | } | 4866 | } |
4860 | 4867 | ||
4861 | } while (sd); | 4868 | } while (sd); |
4862 | } | 4869 | } |
4863 | #else | 4870 | #else |
4864 | #define sched_domain_debug(sd, cpu) {} | 4871 | #define sched_domain_debug(sd, cpu) {} |
4865 | #endif | 4872 | #endif |
4866 | 4873 | ||
4867 | static int sd_degenerate(struct sched_domain *sd) | 4874 | static int sd_degenerate(struct sched_domain *sd) |
4868 | { | 4875 | { |
4869 | if (cpus_weight(sd->span) == 1) | 4876 | if (cpus_weight(sd->span) == 1) |
4870 | return 1; | 4877 | return 1; |
4871 | 4878 | ||
4872 | /* Following flags need at least 2 groups */ | 4879 | /* Following flags need at least 2 groups */ |
4873 | if (sd->flags & (SD_LOAD_BALANCE | | 4880 | if (sd->flags & (SD_LOAD_BALANCE | |
4874 | SD_BALANCE_NEWIDLE | | 4881 | SD_BALANCE_NEWIDLE | |
4875 | SD_BALANCE_FORK | | 4882 | SD_BALANCE_FORK | |
4876 | SD_BALANCE_EXEC)) { | 4883 | SD_BALANCE_EXEC)) { |
4877 | if (sd->groups != sd->groups->next) | 4884 | if (sd->groups != sd->groups->next) |
4878 | return 0; | 4885 | return 0; |
4879 | } | 4886 | } |
4880 | 4887 | ||
4881 | /* Following flags don't use groups */ | 4888 | /* Following flags don't use groups */ |
4882 | if (sd->flags & (SD_WAKE_IDLE | | 4889 | if (sd->flags & (SD_WAKE_IDLE | |
4883 | SD_WAKE_AFFINE | | 4890 | SD_WAKE_AFFINE | |
4884 | SD_WAKE_BALANCE)) | 4891 | SD_WAKE_BALANCE)) |
4885 | return 0; | 4892 | return 0; |
4886 | 4893 | ||
4887 | return 1; | 4894 | return 1; |
4888 | } | 4895 | } |
4889 | 4896 | ||
4890 | static int sd_parent_degenerate(struct sched_domain *sd, | 4897 | static int sd_parent_degenerate(struct sched_domain *sd, |
4891 | struct sched_domain *parent) | 4898 | struct sched_domain *parent) |
4892 | { | 4899 | { |
4893 | unsigned long cflags = sd->flags, pflags = parent->flags; | 4900 | unsigned long cflags = sd->flags, pflags = parent->flags; |
4894 | 4901 | ||
4895 | if (sd_degenerate(parent)) | 4902 | if (sd_degenerate(parent)) |
4896 | return 1; | 4903 | return 1; |
4897 | 4904 | ||
4898 | if (!cpus_equal(sd->span, parent->span)) | 4905 | if (!cpus_equal(sd->span, parent->span)) |
4899 | return 0; | 4906 | return 0; |
4900 | 4907 | ||
4901 | /* Does parent contain flags not in child? */ | 4908 | /* Does parent contain flags not in child? */ |
4902 | /* WAKE_BALANCE is a subset of WAKE_AFFINE */ | 4909 | /* WAKE_BALANCE is a subset of WAKE_AFFINE */ |
4903 | if (cflags & SD_WAKE_AFFINE) | 4910 | if (cflags & SD_WAKE_AFFINE) |
4904 | pflags &= ~SD_WAKE_BALANCE; | 4911 | pflags &= ~SD_WAKE_BALANCE; |
4905 | /* Flags needing groups don't count if only 1 group in parent */ | 4912 | /* Flags needing groups don't count if only 1 group in parent */ |
4906 | if (parent->groups == parent->groups->next) { | 4913 | if (parent->groups == parent->groups->next) { |
4907 | pflags &= ~(SD_LOAD_BALANCE | | 4914 | pflags &= ~(SD_LOAD_BALANCE | |
4908 | SD_BALANCE_NEWIDLE | | 4915 | SD_BALANCE_NEWIDLE | |
4909 | SD_BALANCE_FORK | | 4916 | SD_BALANCE_FORK | |
4910 | SD_BALANCE_EXEC); | 4917 | SD_BALANCE_EXEC); |
4911 | } | 4918 | } |
4912 | if (~cflags & pflags) | 4919 | if (~cflags & pflags) |
4913 | return 0; | 4920 | return 0; |
4914 | 4921 | ||
4915 | return 1; | 4922 | return 1; |
4916 | } | 4923 | } |
4917 | 4924 | ||
4918 | /* | 4925 | /* |
4919 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | 4926 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must |
4920 | * hold the hotplug lock. | 4927 | * hold the hotplug lock. |
4921 | */ | 4928 | */ |
4922 | static void cpu_attach_domain(struct sched_domain *sd, int cpu) | 4929 | static void cpu_attach_domain(struct sched_domain *sd, int cpu) |
4923 | { | 4930 | { |
4924 | runqueue_t *rq = cpu_rq(cpu); | 4931 | runqueue_t *rq = cpu_rq(cpu); |
4925 | struct sched_domain *tmp; | 4932 | struct sched_domain *tmp; |
4926 | 4933 | ||
4927 | /* Remove the sched domains which do not contribute to scheduling. */ | 4934 | /* Remove the sched domains which do not contribute to scheduling. */ |
4928 | for (tmp = sd; tmp; tmp = tmp->parent) { | 4935 | for (tmp = sd; tmp; tmp = tmp->parent) { |
4929 | struct sched_domain *parent = tmp->parent; | 4936 | struct sched_domain *parent = tmp->parent; |
4930 | if (!parent) | 4937 | if (!parent) |
4931 | break; | 4938 | break; |
4932 | if (sd_parent_degenerate(tmp, parent)) | 4939 | if (sd_parent_degenerate(tmp, parent)) |
4933 | tmp->parent = parent->parent; | 4940 | tmp->parent = parent->parent; |
4934 | } | 4941 | } |
4935 | 4942 | ||
4936 | if (sd && sd_degenerate(sd)) | 4943 | if (sd && sd_degenerate(sd)) |
4937 | sd = sd->parent; | 4944 | sd = sd->parent; |
4938 | 4945 | ||
4939 | sched_domain_debug(sd, cpu); | 4946 | sched_domain_debug(sd, cpu); |
4940 | 4947 | ||
4941 | rcu_assign_pointer(rq->sd, sd); | 4948 | rcu_assign_pointer(rq->sd, sd); |
4942 | } | 4949 | } |
4943 | 4950 | ||
4944 | /* cpus with isolated domains */ | 4951 | /* cpus with isolated domains */ |
4945 | static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; | 4952 | static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; |
4946 | 4953 | ||
4947 | /* Setup the mask of cpus configured for isolated domains */ | 4954 | /* Setup the mask of cpus configured for isolated domains */ |
4948 | static int __init isolated_cpu_setup(char *str) | 4955 | static int __init isolated_cpu_setup(char *str) |
4949 | { | 4956 | { |
4950 | int ints[NR_CPUS], i; | 4957 | int ints[NR_CPUS], i; |
4951 | 4958 | ||
4952 | str = get_options(str, ARRAY_SIZE(ints), ints); | 4959 | str = get_options(str, ARRAY_SIZE(ints), ints); |
4953 | cpus_clear(cpu_isolated_map); | 4960 | cpus_clear(cpu_isolated_map); |
4954 | for (i = 1; i <= ints[0]; i++) | 4961 | for (i = 1; i <= ints[0]; i++) |
4955 | if (ints[i] < NR_CPUS) | 4962 | if (ints[i] < NR_CPUS) |
4956 | cpu_set(ints[i], cpu_isolated_map); | 4963 | cpu_set(ints[i], cpu_isolated_map); |
4957 | return 1; | 4964 | return 1; |
4958 | } | 4965 | } |
4959 | 4966 | ||
4960 | __setup ("isolcpus=", isolated_cpu_setup); | 4967 | __setup ("isolcpus=", isolated_cpu_setup); |
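As an illustration (not part of this patch): booting with "isolcpus=2,3" makes get_options() fill ints[] with {2, 2, 3}, so CPUs 2 and 3 end up set in cpu_isolated_map; the domain-setup code later leaves those CPUs out of the regular load-balancing domains, so only explicit CPU affinity can place tasks on them.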
4961 | 4968 | ||
4962 | /* | 4969 | /* |
4963 | * init_sched_build_groups takes an array of groups, the cpumask we wish | 4970 | * init_sched_build_groups takes an array of groups, the cpumask we wish |
4964 | * to span, and a pointer to a function which identifies what group a CPU | 4971 | * to span, and a pointer to a function which identifies what group a CPU |
4965 | * belongs to. The return value of group_fn must be a valid index into the | 4972 | * belongs to. The return value of group_fn must be a valid index into the |
4966 | * groups[] array, and must be >= 0 and < NR_CPUS (because we keep track | 4973 | * groups[] array, and must be >= 0 and < NR_CPUS (because we keep track |
4967 | * of groups covered with a cpumask_t). | 4974 | * of groups covered with a cpumask_t). |
4968 | * | 4975 | * |
4969 | * init_sched_build_groups will build a circular linked list of the groups | 4976 | * init_sched_build_groups will build a circular linked list of the groups |
4970 | * covered by the given span, and will set each group's ->cpumask correctly, | 4977 | * covered by the given span, and will set each group's ->cpumask correctly, |
4971 | * and ->cpu_power to 0. | 4978 | * and ->cpu_power to 0. |
4972 | */ | 4979 | */ |
4973 | static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, | 4980 | static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, |
4974 | int (*group_fn)(int cpu)) | 4981 | int (*group_fn)(int cpu)) |
4975 | { | 4982 | { |
4976 | struct sched_group *first = NULL, *last = NULL; | 4983 | struct sched_group *first = NULL, *last = NULL; |
4977 | cpumask_t covered = CPU_MASK_NONE; | 4984 | cpumask_t covered = CPU_MASK_NONE; |
4978 | int i; | 4985 | int i; |
4979 | 4986 | ||
4980 | for_each_cpu_mask(i, span) { | 4987 | for_each_cpu_mask(i, span) { |
4981 | int group = group_fn(i); | 4988 | int group = group_fn(i); |
4982 | struct sched_group *sg = &groups[group]; | 4989 | struct sched_group *sg = &groups[group]; |
4983 | int j; | 4990 | int j; |
4984 | 4991 | ||
4985 | if (cpu_isset(i, covered)) | 4992 | if (cpu_isset(i, covered)) |
4986 | continue; | 4993 | continue; |
4987 | 4994 | ||
4988 | sg->cpumask = CPU_MASK_NONE; | 4995 | sg->cpumask = CPU_MASK_NONE; |
4989 | sg->cpu_power = 0; | 4996 | sg->cpu_power = 0; |
4990 | 4997 | ||
4991 | for_each_cpu_mask(j, span) { | 4998 | for_each_cpu_mask(j, span) { |
4992 | if (group_fn(j) != group) | 4999 | if (group_fn(j) != group) |
4993 | continue; | 5000 | continue; |
4994 | 5001 | ||
4995 | cpu_set(j, covered); | 5002 | cpu_set(j, covered); |
4996 | cpu_set(j, sg->cpumask); | 5003 | cpu_set(j, sg->cpumask); |
4997 | } | 5004 | } |
4998 | if (!first) | 5005 | if (!first) |
4999 | first = sg; | 5006 | first = sg; |
5000 | if (last) | 5007 | if (last) |
5001 | last->next = sg; | 5008 | last->next = sg; |
5002 | last = sg; | 5009 | last = sg; |
5003 | } | 5010 | } |
5004 | last->next = first; | 5011 | last->next = first; |
5005 | } | 5012 | } |
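To illustrate the group_fn contract described above (an illustrative sketch, not code from this patch): a callback that puts every CPU into its own group, similar in shape to the per-CPU group helpers used elsewhere in this file, could look like:

	/*
	 * Hypothetical example: map each CPU to its own entry in a
	 * groups[NR_CPUS] array; the returned index is always < NR_CPUS.
	 */
	static int example_cpu_to_group(int cpu)
	{
		return cpu;
	}

Passing this to init_sched_build_groups() over some span builds a circular list of single-CPU groups, each with ->cpumask containing just that CPU and ->cpu_power initialised to 0 for the caller to fill in.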
5006 | 5013 | ||
5007 | #define SD_NODES_PER_DOMAIN 16 | 5014 | #define SD_NODES_PER_DOMAIN 16 |
5008 | 5015 | ||
5009 | /* | 5016 | /* |
5010 | * Self-tuning task migration cost measurement between source and target CPUs. | 5017 | * Self-tuning task migration cost measurement between source and target CPUs. |
5011 | * | 5018 | * |
5012 | * This is done by measuring the cost of manipulating buffers of varying | 5019 | * This is done by measuring the cost of manipulating buffers of varying |
5013 | * sizes. For a given buffer-size here are the steps that are taken: | 5020 | * sizes. For a given buffer-size here are the steps that are taken: |
5014 | * | 5021 | * |
5015 | * 1) the source CPU reads+dirties a shared buffer | 5022 | * 1) the source CPU reads+dirties a shared buffer |
5016 | * 2) the target CPU reads+dirties the same shared buffer | 5023 | * 2) the target CPU reads+dirties the same shared buffer |
5017 | * | 5024 | * |
5018 | * We measure how long they take, in the following 4 scenarios: | 5025 | * We measure how long they take, in the following 4 scenarios: |
5019 | * | 5026 | * |
5020 | * - source: CPU1, target: CPU2 | cost1 | 5027 | * - source: CPU1, target: CPU2 | cost1 |
5021 | * - source: CPU2, target: CPU1 | cost2 | 5028 | * - source: CPU2, target: CPU1 | cost2 |
5022 | * - source: CPU1, target: CPU1 | cost3 | 5029 | * - source: CPU1, target: CPU1 | cost3 |
5023 | * - source: CPU2, target: CPU2 | cost4 | 5030 | * - source: CPU2, target: CPU2 | cost4 |
5024 | * | 5031 | * |
5025 | * We then calculate the (cost1+cost2) - (cost3+cost4) difference - this is | 5032 | * We then calculate the (cost1+cost2) - (cost3+cost4) difference - this is |
5026 | * the cost of migration. | 5033 | * the cost of migration. |
5027 | * | 5034 | * |
5028 | * We then start off from a small buffer-size and iterate up to larger | 5035 | * We then start off from a small buffer-size and iterate up to larger |
5029 | * buffer sizes, in 5% steps - measuring each buffer-size separately, and | 5036 | * buffer sizes, in 5% steps - measuring each buffer-size separately, and |
5030 | * doing a maximum search for the cost. (The maximum cost for a migration | 5037 | * doing a maximum search for the cost. (The maximum cost for a migration |
5031 | * normally occurs when the working set size is around the effective cache | 5038 | * normally occurs when the working set size is around the effective cache |
5032 | * size.) | 5039 | * size.) |
5033 | */ | 5040 | */ |
5034 | #define SEARCH_SCOPE 2 | 5041 | #define SEARCH_SCOPE 2 |
5035 | #define MIN_CACHE_SIZE (64*1024U) | 5042 | #define MIN_CACHE_SIZE (64*1024U) |
5036 | #define DEFAULT_CACHE_SIZE (5*1024*1024U) | 5043 | #define DEFAULT_CACHE_SIZE (5*1024*1024U) |
5037 | #define ITERATIONS 1 | 5044 | #define ITERATIONS 1 |
5038 | #define SIZE_THRESH 130 | 5045 | #define SIZE_THRESH 130 |
5039 | #define COST_THRESH 130 | 5046 | #define COST_THRESH 130 |
5040 | 5047 | ||
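To make the four-measurement scheme above concrete (numbers invented purely for illustration): if for one buffer size the two cross-CPU passes cost cost1 = 900 us and cost2 = 950 us while the two same-CPU passes cost cost3 = 400 us and cost4 = 420 us, the estimated migration cost for that buffer size is (900 + 950) - (400 + 420) = 1030 us, i.e. roughly 1 ms of extra cache-refill work caused by hopping between the two CPUs. measure_cost() below accumulates exactly this cross-minus-local difference over ITERATIONS runs in each direction.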
5041 | /* | 5048 | /* |
5042 | * The migration cost is a function of 'domain distance'. Domain | 5049 | * The migration cost is a function of 'domain distance'. Domain |
5043 | * distance is the number of steps a CPU has to iterate down its | 5050 | * distance is the number of steps a CPU has to iterate down its |
5044 | * domain tree to share a domain with the other CPU. The farther | 5051 | * domain tree to share a domain with the other CPU. The farther |
5045 | * two CPUs are from each other, the larger the distance gets. | 5052 | * two CPUs are from each other, the larger the distance gets. |
5046 | * | 5053 | * |
5047 | * Note that we use the distance only to cache measurement results; | 5054 | * Note that we use the distance only to cache measurement results; |
5048 | * the distance value is not used numerically otherwise. When two | 5055 | * the distance value is not used numerically otherwise. When two |
5049 | * CPUs have the same distance it is assumed that the migration | 5056 | * CPUs have the same distance it is assumed that the migration |
5050 | * cost is the same. (this is a simplification but quite practical) | 5057 | * cost is the same. (this is a simplification but quite practical) |
5051 | */ | 5058 | */ |
5052 | #define MAX_DOMAIN_DISTANCE 32 | 5059 | #define MAX_DOMAIN_DISTANCE 32 |
5053 | 5060 | ||
5054 | static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] = | 5061 | static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] = |
5055 | { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = | 5062 | { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = |
5056 | /* | 5063 | /* |
5057 | * Architectures may override the migration cost and thus avoid | 5064 | * Architectures may override the migration cost and thus avoid |
5058 | * boot-time calibration. Unit is nanoseconds. Mostly useful for | 5065 | * boot-time calibration. Unit is nanoseconds. Mostly useful for |
5059 | * virtualized hardware: | 5066 | * virtualized hardware: |
5060 | */ | 5067 | */ |
5061 | #ifdef CONFIG_DEFAULT_MIGRATION_COST | 5068 | #ifdef CONFIG_DEFAULT_MIGRATION_COST |
5062 | CONFIG_DEFAULT_MIGRATION_COST | 5069 | CONFIG_DEFAULT_MIGRATION_COST |
5063 | #else | 5070 | #else |
5064 | -1LL | 5071 | -1LL |
5065 | #endif | 5072 | #endif |
5066 | }; | 5073 | }; |
5067 | 5074 | ||
5068 | /* | 5075 | /* |
5069 | * Allow override of migration cost - in units of microseconds. | 5076 | * Allow override of migration cost - in units of microseconds. |
5070 | * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost | 5077 | * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost |
5071 | * of 1 msec, level-2 cost of 2 msecs and level-3 cost of 3 msecs: | 5078 | * of 1 msec, level-2 cost of 2 msecs and level-3 cost of 3 msecs: |
5072 | */ | 5079 | */ |
5073 | static int __init migration_cost_setup(char *str) | 5080 | static int __init migration_cost_setup(char *str) |
5074 | { | 5081 | { |
5075 | int ints[MAX_DOMAIN_DISTANCE+1], i; | 5082 | int ints[MAX_DOMAIN_DISTANCE+1], i; |
5076 | 5083 | ||
5077 | str = get_options(str, ARRAY_SIZE(ints), ints); | 5084 | str = get_options(str, ARRAY_SIZE(ints), ints); |
5078 | 5085 | ||
5079 | printk("#ints: %d\n", ints[0]); | 5086 | printk("#ints: %d\n", ints[0]); |
5080 | for (i = 1; i <= ints[0]; i++) { | 5087 | for (i = 1; i <= ints[0]; i++) { |
5081 | migration_cost[i-1] = (unsigned long long)ints[i]*1000; | 5088 | migration_cost[i-1] = (unsigned long long)ints[i]*1000; |
5082 | printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]); | 5089 | printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]); |
5083 | } | 5090 | } |
5084 | return 1; | 5091 | return 1; |
5085 | } | 5092 | } |
5086 | 5093 | ||
5087 | __setup ("migration_cost=", migration_cost_setup); | 5094 | __setup ("migration_cost=", migration_cost_setup); |
5088 | 5095 | ||
5089 | /* | 5096 | /* |
5090 | * Global multiplier (divisor) for migration-cutoff values, | 5097 | * Global multiplier (divisor) for migration-cutoff values, |
5091 | * in percent. E.g. use a value of 150 to get 1.5 times | 5098 | * in percent. E.g. use a value of 150 to get 1.5 times |
5092 | * longer cache-hot cutoff times. | 5099 | * longer cache-hot cutoff times. |
5093 | * | 5100 | * |
5094 | * (We scale it from 100 to 128 to make long long handling easier.) | 5101 | * (We scale it from 100 to 128 to make long long handling easier.) |
5095 | */ | 5102 | */ |
5096 | 5103 | ||
5097 | #define MIGRATION_FACTOR_SCALE 128 | 5104 | #define MIGRATION_FACTOR_SCALE 128 |
5098 | 5105 | ||
5099 | static unsigned int migration_factor = MIGRATION_FACTOR_SCALE; | 5106 | static unsigned int migration_factor = MIGRATION_FACTOR_SCALE; |
5100 | 5107 | ||
5101 | static int __init setup_migration_factor(char *str) | 5108 | static int __init setup_migration_factor(char *str) |
5102 | { | 5109 | { |
5103 | get_option(&str, &migration_factor); | 5110 | get_option(&str, &migration_factor); |
5104 | migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100; | 5111 | migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100; |
5105 | return 1; | 5112 | return 1; |
5106 | } | 5113 | } |
5107 | 5114 | ||
5108 | __setup("migration_factor=", setup_migration_factor); | 5115 | __setup("migration_factor=", setup_migration_factor); |
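Worked example (values chosen for illustration): booting with migration_factor=150 stores 150 * 128 / 100 = 192, so every cache-hot cutoff later computed in measure_migration_cost() is multiplied by 192/128 = 1.5; the default of 100 maps to exactly 128 and leaves the measured cost unchanged. Using the power-of-two scale 128 instead of 100 presumably lets the 64-bit divide at the end of measure_migration_cost() be a cheap shift rather than a full division.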
5109 | 5116 | ||
5110 | /* | 5117 | /* |
5111 | * Estimated distance of two CPUs, measured via the number of domains | 5118 | * Estimated distance of two CPUs, measured via the number of domains |
5112 | * we have to pass for the two CPUs to be in the same span: | 5119 | * we have to pass for the two CPUs to be in the same span: |
5113 | */ | 5120 | */ |
5114 | static unsigned long domain_distance(int cpu1, int cpu2) | 5121 | static unsigned long domain_distance(int cpu1, int cpu2) |
5115 | { | 5122 | { |
5116 | unsigned long distance = 0; | 5123 | unsigned long distance = 0; |
5117 | struct sched_domain *sd; | 5124 | struct sched_domain *sd; |
5118 | 5125 | ||
5119 | for_each_domain(cpu1, sd) { | 5126 | for_each_domain(cpu1, sd) { |
5120 | WARN_ON(!cpu_isset(cpu1, sd->span)); | 5127 | WARN_ON(!cpu_isset(cpu1, sd->span)); |
5121 | if (cpu_isset(cpu2, sd->span)) | 5128 | if (cpu_isset(cpu2, sd->span)) |
5122 | return distance; | 5129 | return distance; |
5123 | distance++; | 5130 | distance++; |
5124 | } | 5131 | } |
5125 | if (distance >= MAX_DOMAIN_DISTANCE) { | 5132 | if (distance >= MAX_DOMAIN_DISTANCE) { |
5126 | WARN_ON(1); | 5133 | WARN_ON(1); |
5127 | distance = MAX_DOMAIN_DISTANCE-1; | 5134 | distance = MAX_DOMAIN_DISTANCE-1; |
5128 | } | 5135 | } |
5129 | 5136 | ||
5130 | return distance; | 5137 | return distance; |
5131 | } | 5138 | } |
5132 | 5139 | ||
5133 | static unsigned int migration_debug; | 5140 | static unsigned int migration_debug; |
5134 | 5141 | ||
5135 | static int __init setup_migration_debug(char *str) | 5142 | static int __init setup_migration_debug(char *str) |
5136 | { | 5143 | { |
5137 | get_option(&str, &migration_debug); | 5144 | get_option(&str, &migration_debug); |
5138 | return 1; | 5145 | return 1; |
5139 | } | 5146 | } |
5140 | 5147 | ||
5141 | __setup("migration_debug=", setup_migration_debug); | 5148 | __setup("migration_debug=", setup_migration_debug); |
5142 | 5149 | ||
5143 | /* | 5150 | /* |
5144 | * Maximum cache-size that the scheduler should try to measure. | 5151 | * Maximum cache-size that the scheduler should try to measure. |
5145 | * Architectures with larger caches should tune this up during | 5152 | * Architectures with larger caches should tune this up during |
5146 | * bootup. Gets used in the domain-setup code (i.e. during SMP | 5153 | * bootup. Gets used in the domain-setup code (i.e. during SMP |
5147 | * bootup). | 5154 | * bootup). |
5148 | */ | 5155 | */ |
5149 | unsigned int max_cache_size; | 5156 | unsigned int max_cache_size; |
5150 | 5157 | ||
5151 | static int __init setup_max_cache_size(char *str) | 5158 | static int __init setup_max_cache_size(char *str) |
5152 | { | 5159 | { |
5153 | get_option(&str, &max_cache_size); | 5160 | get_option(&str, &max_cache_size); |
5154 | return 1; | 5161 | return 1; |
5155 | } | 5162 | } |
5156 | 5163 | ||
5157 | __setup("max_cache_size=", setup_max_cache_size); | 5164 | __setup("max_cache_size=", setup_max_cache_size); |
5158 | 5165 | ||
5159 | /* | 5166 | /* |
5160 | * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This | 5167 | * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This |
5161 | * is the operation that is timed, so we try to generate unpredictable | 5168 | * is the operation that is timed, so we try to generate unpredictable |
5162 | * cachemisses that still end up filling the L2 cache: | 5169 | * cachemisses that still end up filling the L2 cache: |
5163 | */ | 5170 | */ |
5164 | static void touch_cache(void *__cache, unsigned long __size) | 5171 | static void touch_cache(void *__cache, unsigned long __size) |
5165 | { | 5172 | { |
5166 | unsigned long size = __size/sizeof(long), chunk1 = size/3, | 5173 | unsigned long size = __size/sizeof(long), chunk1 = size/3, |
5167 | chunk2 = 2*size/3; | 5174 | chunk2 = 2*size/3; |
5168 | unsigned long *cache = __cache; | 5175 | unsigned long *cache = __cache; |
5169 | int i; | 5176 | int i; |
5170 | 5177 | ||
5171 | for (i = 0; i < size/6; i += 8) { | 5178 | for (i = 0; i < size/6; i += 8) { |
5172 | switch (i % 6) { | 5179 | switch (i % 6) { |
5173 | case 0: cache[i]++; | 5180 | case 0: cache[i]++; |
5174 | case 1: cache[size-1-i]++; | 5181 | case 1: cache[size-1-i]++; |
5175 | case 2: cache[chunk1-i]++; | 5182 | case 2: cache[chunk1-i]++; |
5176 | case 3: cache[chunk1+i]++; | 5183 | case 3: cache[chunk1+i]++; |
5177 | case 4: cache[chunk2-i]++; | 5184 | case 4: cache[chunk2-i]++; |
5178 | case 5: cache[chunk2+i]++; | 5185 | case 5: cache[chunk2+i]++; |
5179 | } | 5186 | } |
5180 | } | 5187 | } |
5181 | } | 5188 | } |
5182 | 5189 | ||
5183 | /* | 5190 | /* |
5184 | * Measure the cache-cost of one task migration. Returns in units of nsec. | 5191 | * Measure the cache-cost of one task migration. Returns in units of nsec. |
5185 | */ | 5192 | */ |
5186 | static unsigned long long measure_one(void *cache, unsigned long size, | 5193 | static unsigned long long measure_one(void *cache, unsigned long size, |
5187 | int source, int target) | 5194 | int source, int target) |
5188 | { | 5195 | { |
5189 | cpumask_t mask, saved_mask; | 5196 | cpumask_t mask, saved_mask; |
5190 | unsigned long long t0, t1, t2, t3, cost; | 5197 | unsigned long long t0, t1, t2, t3, cost; |
5191 | 5198 | ||
5192 | saved_mask = current->cpus_allowed; | 5199 | saved_mask = current->cpus_allowed; |
5193 | 5200 | ||
5194 | /* | 5201 | /* |
5195 | * Flush source caches to RAM and invalidate them: | 5202 | * Flush source caches to RAM and invalidate them: |
5196 | */ | 5203 | */ |
5197 | sched_cacheflush(); | 5204 | sched_cacheflush(); |
5198 | 5205 | ||
5199 | /* | 5206 | /* |
5200 | * Migrate to the source CPU: | 5207 | * Migrate to the source CPU: |
5201 | */ | 5208 | */ |
5202 | mask = cpumask_of_cpu(source); | 5209 | mask = cpumask_of_cpu(source); |
5203 | set_cpus_allowed(current, mask); | 5210 | set_cpus_allowed(current, mask); |
5204 | WARN_ON(smp_processor_id() != source); | 5211 | WARN_ON(smp_processor_id() != source); |
5205 | 5212 | ||
5206 | /* | 5213 | /* |
5207 | * Dirty the working set: | 5214 | * Dirty the working set: |
5208 | */ | 5215 | */ |
5209 | t0 = sched_clock(); | 5216 | t0 = sched_clock(); |
5210 | touch_cache(cache, size); | 5217 | touch_cache(cache, size); |
5211 | t1 = sched_clock(); | 5218 | t1 = sched_clock(); |
5212 | 5219 | ||
5213 | /* | 5220 | /* |
5214 | * Migrate to the target CPU, dirty the L2 cache and access | 5221 | * Migrate to the target CPU, dirty the L2 cache and access |
5215 | * the shared buffer. (which represents the working set | 5222 | * the shared buffer. (which represents the working set |
5216 | * of a migrated task.) | 5223 | * of a migrated task.) |
5217 | */ | 5224 | */ |
5218 | mask = cpumask_of_cpu(target); | 5225 | mask = cpumask_of_cpu(target); |
5219 | set_cpus_allowed(current, mask); | 5226 | set_cpus_allowed(current, mask); |
5220 | WARN_ON(smp_processor_id() != target); | 5227 | WARN_ON(smp_processor_id() != target); |
5221 | 5228 | ||
5222 | t2 = sched_clock(); | 5229 | t2 = sched_clock(); |
5223 | touch_cache(cache, size); | 5230 | touch_cache(cache, size); |
5224 | t3 = sched_clock(); | 5231 | t3 = sched_clock(); |
5225 | 5232 | ||
5226 | cost = t1-t0 + t3-t2; | 5233 | cost = t1-t0 + t3-t2; |
5227 | 5234 | ||
5228 | if (migration_debug >= 2) | 5235 | if (migration_debug >= 2) |
5229 | printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n", | 5236 | printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n", |
5230 | source, target, t1-t0, t1-t0, t3-t2, cost); | 5237 | source, target, t1-t0, t1-t0, t3-t2, cost); |
5231 | /* | 5238 | /* |
5232 | * Flush target caches to RAM and invalidate them: | 5239 | * Flush target caches to RAM and invalidate them: |
5233 | */ | 5240 | */ |
5234 | sched_cacheflush(); | 5241 | sched_cacheflush(); |
5235 | 5242 | ||
5236 | set_cpus_allowed(current, saved_mask); | 5243 | set_cpus_allowed(current, saved_mask); |
5237 | 5244 | ||
5238 | return cost; | 5245 | return cost; |
5239 | } | 5246 | } |
5240 | 5247 | ||
5241 | /* | 5248 | /* |
5242 | * Measure a series of task migrations and return the average | 5249 | * Measure a series of task migrations and return the average |
5243 | * result. Since this code runs early during bootup the system | 5250 | * result. Since this code runs early during bootup the system |
5244 | * is 'undisturbed' and the average latency makes sense. | 5251 | * is 'undisturbed' and the average latency makes sense. |
5245 | * | 5252 | * |
5246 | * The algorithm in essence auto-detects the relevant cache-size, | 5253 | * The algorithm in essence auto-detects the relevant cache-size, |
5247 | * so it will properly detect different cachesizes for different | 5254 | * so it will properly detect different cachesizes for different |
5248 | * cache-hierarchies, depending on how the CPUs are connected. | 5255 | * cache-hierarchies, depending on how the CPUs are connected. |
5249 | * | 5256 | * |
5250 | * Architectures can prime the upper limit of the search range via | 5257 | * Architectures can prime the upper limit of the search range via |
5251 | * max_cache_size, otherwise the search range defaults to 10MB...64K. | 5258 | * max_cache_size, otherwise the search range defaults to 10MB...64K. |
5252 | */ | 5259 | */ |
5253 | static unsigned long long | 5260 | static unsigned long long |
5254 | measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) | 5261 | measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) |
5255 | { | 5262 | { |
5256 | unsigned long long cost1, cost2; | 5263 | unsigned long long cost1, cost2; |
5257 | int i; | 5264 | int i; |
5258 | 5265 | ||
5259 | /* | 5266 | /* |
5260 | * Measure the migration cost of 'size' bytes, over an | 5267 | * Measure the migration cost of 'size' bytes, over an |
5261 | * average of 10 runs: | 5268 | * average of 10 runs: |
5262 | * | 5269 | * |
5263 | * (We perturb the cache size by a small (0..4k) | 5270 | * (We perturb the cache size by a small (0..4k) |
5264 | * value to compensate for size/alignment related artifacts. | 5271 | * value to compensate for size/alignment related artifacts. |
5265 | * We also subtract the cost of the operation done on | 5272 | * We also subtract the cost of the operation done on |
5266 | * the same CPU.) | 5273 | * the same CPU.) |
5267 | */ | 5274 | */ |
5268 | cost1 = 0; | 5275 | cost1 = 0; |
5269 | 5276 | ||
5270 | /* | 5277 | /* |
5271 | * dry run, to make sure we start off cache-cold on cpu1, | 5278 | * dry run, to make sure we start off cache-cold on cpu1, |
5272 | * and to get any vmalloc pagefaults in advance: | 5279 | * and to get any vmalloc pagefaults in advance: |
5273 | */ | 5280 | */ |
5274 | measure_one(cache, size, cpu1, cpu2); | 5281 | measure_one(cache, size, cpu1, cpu2); |
5275 | for (i = 0; i < ITERATIONS; i++) | 5282 | for (i = 0; i < ITERATIONS; i++) |
5276 | cost1 += measure_one(cache, size - i*1024, cpu1, cpu2); | 5283 | cost1 += measure_one(cache, size - i*1024, cpu1, cpu2); |
5277 | 5284 | ||
5278 | measure_one(cache, size, cpu2, cpu1); | 5285 | measure_one(cache, size, cpu2, cpu1); |
5279 | for (i = 0; i < ITERATIONS; i++) | 5286 | for (i = 0; i < ITERATIONS; i++) |
5280 | cost1 += measure_one(cache, size - i*1024, cpu2, cpu1); | 5287 | cost1 += measure_one(cache, size - i*1024, cpu2, cpu1); |
5281 | 5288 | ||
5282 | /* | 5289 | /* |
5283 | * (We measure the non-migrating [cached] cost on both | 5290 | * (We measure the non-migrating [cached] cost on both |
5284 | * cpu1 and cpu2, to handle CPUs with different speeds) | 5291 | * cpu1 and cpu2, to handle CPUs with different speeds) |
5285 | */ | 5292 | */ |
5286 | cost2 = 0; | 5293 | cost2 = 0; |
5287 | 5294 | ||
5288 | measure_one(cache, size, cpu1, cpu1); | 5295 | measure_one(cache, size, cpu1, cpu1); |
5289 | for (i = 0; i < ITERATIONS; i++) | 5296 | for (i = 0; i < ITERATIONS; i++) |
5290 | cost2 += measure_one(cache, size - i*1024, cpu1, cpu1); | 5297 | cost2 += measure_one(cache, size - i*1024, cpu1, cpu1); |
5291 | 5298 | ||
5292 | measure_one(cache, size, cpu2, cpu2); | 5299 | measure_one(cache, size, cpu2, cpu2); |
5293 | for (i = 0; i < ITERATIONS; i++) | 5300 | for (i = 0; i < ITERATIONS; i++) |
5294 | cost2 += measure_one(cache, size - i*1024, cpu2, cpu2); | 5301 | cost2 += measure_one(cache, size - i*1024, cpu2, cpu2); |
5295 | 5302 | ||
5296 | /* | 5303 | /* |
5297 | * Get the per-iteration migration cost: | 5304 | * Get the per-iteration migration cost: |
5298 | */ | 5305 | */ |
5299 | do_div(cost1, 2*ITERATIONS); | 5306 | do_div(cost1, 2*ITERATIONS); |
5300 | do_div(cost2, 2*ITERATIONS); | 5307 | do_div(cost2, 2*ITERATIONS); |
5301 | 5308 | ||
5302 | return cost1 - cost2; | 5309 | return cost1 - cost2; |
5303 | } | 5310 | } |
5304 | 5311 | ||
5305 | static unsigned long long measure_migration_cost(int cpu1, int cpu2) | 5312 | static unsigned long long measure_migration_cost(int cpu1, int cpu2) |
5306 | { | 5313 | { |
5307 | unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0; | 5314 | unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0; |
5308 | unsigned int max_size, size, size_found = 0; | 5315 | unsigned int max_size, size, size_found = 0; |
5309 | long long cost = 0, prev_cost; | 5316 | long long cost = 0, prev_cost; |
5310 | void *cache; | 5317 | void *cache; |
5311 | 5318 | ||
5312 | /* | 5319 | /* |
5313 | * Search from max_cache_size*5 down to 64K - the real relevant | 5320 | * Search from max_cache_size*5 down to 64K - the real relevant |
5314 | * cachesize has to lie somewhere in between. | 5321 | * cachesize has to lie somewhere in between. |
5315 | */ | 5322 | */ |
5316 | if (max_cache_size) { | 5323 | if (max_cache_size) { |
5317 | max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE); | 5324 | max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE); |
5318 | size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE); | 5325 | size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE); |
5319 | } else { | 5326 | } else { |
5320 | /* | 5327 | /* |
5321 | * Since we have no estimate of the relevant | 5328 | * Since we have no estimate of the relevant |
5322 | * search range, fall back to the defaults: | 5329 | * search range, fall back to the defaults: |
5323 | */ | 5330 | */ |
5324 | max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE; | 5331 | max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE; |
5325 | size = MIN_CACHE_SIZE; | 5332 | size = MIN_CACHE_SIZE; |
5326 | } | 5333 | } |
5327 | 5334 | ||
5328 | if (!cpu_online(cpu1) || !cpu_online(cpu2)) { | 5335 | if (!cpu_online(cpu1) || !cpu_online(cpu2)) { |
5329 | printk("cpu %d and %d not both online!\n", cpu1, cpu2); | 5336 | printk("cpu %d and %d not both online!\n", cpu1, cpu2); |
5330 | return 0; | 5337 | return 0; |
5331 | } | 5338 | } |
5332 | 5339 | ||
5333 | /* | 5340 | /* |
5334 | * Allocate the working set: | 5341 | * Allocate the working set: |
5335 | */ | 5342 | */ |
5336 | cache = vmalloc(max_size); | 5343 | cache = vmalloc(max_size); |
5337 | if (!cache) { | 5344 | if (!cache) { |
5338 | printk("could not vmalloc %d bytes for cache!\n", 2*max_size); | 5345 | printk("could not vmalloc %d bytes for cache!\n", 2*max_size); |
5339 | return 1000000; // return 1 msec on very small boxen | 5346 | return 1000000; // return 1 msec on very small boxen |
5340 | } | 5347 | } |
5341 | 5348 | ||
5342 | while (size <= max_size) { | 5349 | while (size <= max_size) { |
5343 | prev_cost = cost; | 5350 | prev_cost = cost; |
5344 | cost = measure_cost(cpu1, cpu2, cache, size); | 5351 | cost = measure_cost(cpu1, cpu2, cache, size); |
5345 | 5352 | ||
5346 | /* | 5353 | /* |
5347 | * Update the max: | 5354 | * Update the max: |
5348 | */ | 5355 | */ |
5349 | if (cost > 0) { | 5356 | if (cost > 0) { |
5350 | if (max_cost < cost) { | 5357 | if (max_cost < cost) { |
5351 | max_cost = cost; | 5358 | max_cost = cost; |
5352 | size_found = size; | 5359 | size_found = size; |
5353 | } | 5360 | } |
5354 | } | 5361 | } |
5355 | /* | 5362 | /* |
5356 | * Calculate average fluctuation, we use this to prevent | 5363 | * Calculate average fluctuation, we use this to prevent |
5357 | * noise from triggering an early break out of the loop: | 5364 | * noise from triggering an early break out of the loop: |
5358 | */ | 5365 | */ |
5359 | fluct = abs(cost - prev_cost); | 5366 | fluct = abs(cost - prev_cost); |
5360 | avg_fluct = (avg_fluct + fluct)/2; | 5367 | avg_fluct = (avg_fluct + fluct)/2; |
5361 | 5368 | ||
5362 | if (migration_debug) | 5369 | if (migration_debug) |
5363 | printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n", | 5370 | printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n", |
5364 | cpu1, cpu2, size, | 5371 | cpu1, cpu2, size, |
5365 | (long)cost / 1000000, | 5372 | (long)cost / 1000000, |
5366 | ((long)cost / 100000) % 10, | 5373 | ((long)cost / 100000) % 10, |
5367 | (long)max_cost / 1000000, | 5374 | (long)max_cost / 1000000, |
5368 | ((long)max_cost / 100000) % 10, | 5375 | ((long)max_cost / 100000) % 10, |
5369 | domain_distance(cpu1, cpu2), | 5376 | domain_distance(cpu1, cpu2), |
5370 | cost, avg_fluct); | 5377 | cost, avg_fluct); |
5371 | 5378 | ||
5372 | /* | 5379 | /* |
5373 | * If we iterated at least 20% past the previous maximum, | 5380 | * If we iterated at least 20% past the previous maximum, |
5374 | * and the cost has dropped by more than 20% already, | 5381 | * and the cost has dropped by more than 20% already, |
5375 | * (taking fluctuations into account) then we assume to | 5382 | * (taking fluctuations into account) then we assume to |
5376 | * have found the maximum and break out of the loop early: | 5383 | * have found the maximum and break out of the loop early: |
5377 | */ | 5384 | */ |
5378 | if (size_found && (size*100 > size_found*SIZE_THRESH)) | 5385 | if (size_found && (size*100 > size_found*SIZE_THRESH)) |
5379 | if (cost+avg_fluct <= 0 || | 5386 | if (cost+avg_fluct <= 0 || |
5380 | max_cost*100 > (cost+avg_fluct)*COST_THRESH) { | 5387 | max_cost*100 > (cost+avg_fluct)*COST_THRESH) { |
5381 | 5388 | ||
5382 | if (migration_debug) | 5389 | if (migration_debug) |
5383 | printk("-> found max.\n"); | 5390 | printk("-> found max.\n"); |
5384 | break; | 5391 | break; |
5385 | } | 5392 | } |
5386 | /* | 5393 | /* |
5387 | * Increase the cachesize in 10% steps: | 5394 | * Increase the cachesize in 10% steps: |
5388 | */ | 5395 | */ |
5389 | size = size * 10 / 9; | 5396 | size = size * 10 / 9; |
5390 | } | 5397 | } |
5391 | 5398 | ||
5392 | if (migration_debug) | 5399 | if (migration_debug) |
5393 | printk("[%d][%d] working set size found: %d, cost: %Ld\n", | 5400 | printk("[%d][%d] working set size found: %d, cost: %Ld\n", |
5394 | cpu1, cpu2, size_found, max_cost); | 5401 | cpu1, cpu2, size_found, max_cost); |
5395 | 5402 | ||
5396 | vfree(cache); | 5403 | vfree(cache); |
5397 | 5404 | ||
5398 | /* | 5405 | /* |
5399 | * A task is considered 'cache cold' if at least 2 times | 5406 | * A task is considered 'cache cold' if at least 2 times |
5400 | * the worst-case cost of migration has passed. | 5407 | * the worst-case cost of migration has passed. |
5401 | * | 5408 | * |
5402 | * (this limit is only listened to if the load-balancing | 5409 | * (this limit is only listened to if the load-balancing |
5403 | * situation is 'nice' - if there is a large imbalance we | 5410 | * situation is 'nice' - if there is a large imbalance we |
5404 | * ignore it for the sake of CPU utilization and | 5411 | * ignore it for the sake of CPU utilization and |
5405 | * processing fairness.) | 5412 | * processing fairness.) |
5406 | */ | 5413 | */ |
5407 | return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE; | 5414 | return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE; |
5408 | } | 5415 | } |
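Continuing the illustrative numbers from above: if the maximum per-migration cost found by the search was max_cost = 1,030,000 ns and migration_factor is at its default (stored as 128), the function returns 2 * 1,030,000 * 128 / 128 = 2,060,000 ns, so a task is treated as cache-hot for about 2 ms after it last ran on a CPU; with migration_factor=150 (stored as 192) the same measurement yields roughly 3.1 ms. calibrate_migration_costs() below then copies this value into sd->cache_hot_time for every domain at the matching distance.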
5409 | 5416 | ||
5410 | static void calibrate_migration_costs(const cpumask_t *cpu_map) | 5417 | static void calibrate_migration_costs(const cpumask_t *cpu_map) |
5411 | { | 5418 | { |
5412 | int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id(); | 5419 | int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id(); |
5413 | unsigned long j0, j1, distance, max_distance = 0; | 5420 | unsigned long j0, j1, distance, max_distance = 0; |
5414 | struct sched_domain *sd; | 5421 | struct sched_domain *sd; |
5415 | 5422 | ||
5416 | j0 = jiffies; | 5423 | j0 = jiffies; |
5417 | 5424 | ||
5418 | /* | 5425 | /* |
5419 | * First pass - calculate the cacheflush times: | 5426 | * First pass - calculate the cacheflush times: |
5420 | */ | 5427 | */ |
5421 | for_each_cpu_mask(cpu1, *cpu_map) { | 5428 | for_each_cpu_mask(cpu1, *cpu_map) { |
5422 | for_each_cpu_mask(cpu2, *cpu_map) { | 5429 | for_each_cpu_mask(cpu2, *cpu_map) { |
5423 | if (cpu1 == cpu2) | 5430 | if (cpu1 == cpu2) |
5424 | continue; | 5431 | continue; |
5425 | distance = domain_distance(cpu1, cpu2); | 5432 | distance = domain_distance(cpu1, cpu2); |
5426 | max_distance = max(max_distance, distance); | 5433 | max_distance = max(max_distance, distance); |
5427 | /* | 5434 | /* |
5428 | * No result cached yet? | 5435 | * No result cached yet? |
5429 | */ | 5436 | */ |
5430 | if (migration_cost[distance] == -1LL) | 5437 | if (migration_cost[distance] == -1LL) |
5431 | migration_cost[distance] = | 5438 | migration_cost[distance] = |
5432 | measure_migration_cost(cpu1, cpu2); | 5439 | measure_migration_cost(cpu1, cpu2); |
5433 | } | 5440 | } |
5434 | } | 5441 | } |
5435 | /* | 5442 | /* |
5436 | * Second pass - update the sched domain hierarchy with | 5443 | * Second pass - update the sched domain hierarchy with |
5437 | * the new cache-hot-time estimations: | 5444 | * the new cache-hot-time estimations: |
5438 | */ | 5445 | */ |
5439 | for_each_cpu_mask(cpu, *cpu_map) { | 5446 | for_each_cpu_mask(cpu, *cpu_map) { |
5440 | distance = 0; | 5447 | distance = 0; |
5441 | for_each_domain(cpu, sd) { | 5448 | for_each_domain(cpu, sd) { |
5442 | sd->cache_hot_time = migration_cost[distance]; | 5449 | sd->cache_hot_time = migration_cost[distance]; |
5443 | distance++; | 5450 | distance++; |
5444 | } | 5451 | } |
5445 | } | 5452 | } |
5446 | /* | 5453 | /* |
5447 | * Print the matrix: | 5454 | * Print the matrix: |
5448 | */ | 5455 | */ |
5449 | if (migration_debug) | 5456 | if (migration_debug) |
5450 | printk("migration: max_cache_size: %d, cpu: %d MHz:\n", | 5457 | printk("migration: max_cache_size: %d, cpu: %d MHz:\n", |
5451 | max_cache_size, | 5458 | max_cache_size, |
5452 | #ifdef CONFIG_X86 | 5459 | #ifdef CONFIG_X86 |
5453 | cpu_khz/1000 | 5460 | cpu_khz/1000 |
5454 | #else | 5461 | #else |
5455 | -1 | 5462 | -1 |
5456 | #endif | 5463 | #endif |
5457 | ); | 5464 | ); |
5458 | if (system_state == SYSTEM_BOOTING) { | 5465 | if (system_state == SYSTEM_BOOTING) { |
5459 | printk("migration_cost="); | 5466 | printk("migration_cost="); |
5460 | for (distance = 0; distance <= max_distance; distance++) { | 5467 | for (distance = 0; distance <= max_distance; distance++) { |
5461 | if (distance) | 5468 | if (distance) |
5462 | printk(","); | 5469 | printk(","); |
5463 | printk("%ld", (long)migration_cost[distance] / 1000); | 5470 | printk("%ld", (long)migration_cost[distance] / 1000); |
5464 | } | 5471 | } |
5465 | printk("\n"); | 5472 | printk("\n"); |
5466 | } | 5473 | } |
5467 | j1 = jiffies; | 5474 | j1 = jiffies; |
5468 | if (migration_debug) | 5475 | if (migration_debug) |
5469 | printk("migration: %ld seconds\n", (j1-j0)/HZ); | 5476 | printk("migration: %ld seconds\n", (j1-j0)/HZ); |
5470 | 5477 | ||
5471 | /* | 5478 | /* |
5472 | * Move back to the original CPU. NUMA-Q gets confused | 5479 | * Move back to the original CPU. NUMA-Q gets confused |
5473 | * if we migrate to another quad during bootup. | 5480 | * if we migrate to another quad during bootup. |
5474 | */ | 5481 | */ |
5475 | if (raw_smp_processor_id() != orig_cpu) { | 5482 | if (raw_smp_processor_id() != orig_cpu) { |
5476 | cpumask_t mask = cpumask_of_cpu(orig_cpu), | 5483 | cpumask_t mask = cpumask_of_cpu(orig_cpu), |
5477 | saved_mask = current->cpus_allowed; | 5484 | saved_mask = current->cpus_allowed; |
5478 | 5485 | ||
5479 | set_cpus_allowed(current, mask); | 5486 | set_cpus_allowed(current, mask); |
5480 | set_cpus_allowed(current, saved_mask); | 5487 | set_cpus_allowed(current, saved_mask); |
5481 | } | 5488 | } |
5482 | } | 5489 | } |
5483 | 5490 | ||
5484 | #ifdef CONFIG_NUMA | 5491 | #ifdef CONFIG_NUMA |
5485 | 5492 | ||
5486 | /** | 5493 | /** |
5487 | * find_next_best_node - find the next node to include in a sched_domain | 5494 | * find_next_best_node - find the next node to include in a sched_domain |
5488 | * @node: node whose sched_domain we're building | 5495 | * @node: node whose sched_domain we're building |
5489 | * @used_nodes: nodes already in the sched_domain | 5496 | * @used_nodes: nodes already in the sched_domain |
5490 | * | 5497 | * |
5491 | * Find the next node to include in a given scheduling domain. Simply | 5498 | * Find the next node to include in a given scheduling domain. Simply |
5492 | * finds the closest node not already in the @used_nodes map. | 5499 | * finds the closest node not already in the @used_nodes map. |
5493 | * | 5500 | * |
5494 | * Should use nodemask_t. | 5501 | * Should use nodemask_t. |
5495 | */ | 5502 | */ |
5496 | static int find_next_best_node(int node, unsigned long *used_nodes) | 5503 | static int find_next_best_node(int node, unsigned long *used_nodes) |
5497 | { | 5504 | { |
5498 | int i, n, val, min_val, best_node = 0; | 5505 | int i, n, val, min_val, best_node = 0; |
5499 | 5506 | ||
5500 | min_val = INT_MAX; | 5507 | min_val = INT_MAX; |
5501 | 5508 | ||
5502 | for (i = 0; i < MAX_NUMNODES; i++) { | 5509 | for (i = 0; i < MAX_NUMNODES; i++) { |
5503 | /* Start at @node */ | 5510 | /* Start at @node */ |
5504 | n = (node + i) % MAX_NUMNODES; | 5511 | n = (node + i) % MAX_NUMNODES; |
5505 | 5512 | ||
5506 | if (!nr_cpus_node(n)) | 5513 | if (!nr_cpus_node(n)) |
5507 | continue; | 5514 | continue; |
5508 | 5515 | ||
5509 | /* Skip already used nodes */ | 5516 | /* Skip already used nodes */ |
5510 | if (test_bit(n, used_nodes)) | 5517 | if (test_bit(n, used_nodes)) |
5511 | continue; | 5518 | continue; |
5512 | 5519 | ||
5513 | /* Simple min distance search */ | 5520 | /* Simple min distance search */ |
5514 | val = node_distance(node, n); | 5521 | val = node_distance(node, n); |
5515 | 5522 | ||
5516 | if (val < min_val) { | 5523 | if (val < min_val) { |
5517 | min_val = val; | 5524 | min_val = val; |
5518 | best_node = n; | 5525 | best_node = n; |
5519 | } | 5526 | } |
5520 | } | 5527 | } |
5521 | 5528 | ||
5522 | set_bit(best_node, used_nodes); | 5529 | set_bit(best_node, used_nodes); |
5523 | return best_node; | 5530 | return best_node; |
5524 | } | 5531 | } |
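For illustration, here is a minimal userspace sketch of the greedy choice find_next_best_node() makes: scan the nodes starting at @node, skip any already in the used set, and keep the one with the smallest distance. The 4-node distance table is invented and the nr_cpus_node() check is dropped; the real code reads distances through node_distance().

#include <stdio.h>
#include <limits.h>

#define NR_NODES 4

/* hypothetical SLIT-style distance table standing in for node_distance() */
static const int dist[NR_NODES][NR_NODES] = {
        { 10, 20, 40, 40 },
        { 20, 10, 40, 40 },
        { 40, 40, 10, 20 },
        { 40, 40, 20, 10 },
};

/* same shape as find_next_best_node(): closest node not yet in *used */
static int next_best_node(int node, unsigned int *used)
{
        int i, n, min_val = INT_MAX, best_node = 0;

        for (i = 0; i < NR_NODES; i++) {
                n = (node + i) % NR_NODES;      /* start at @node */
                if (*used & (1u << n))          /* skip already used nodes */
                        continue;
                if (dist[node][n] < min_val) {  /* simple min distance search */
                        min_val = dist[node][n];
                        best_node = n;
                }
        }
        *used |= 1u << best_node;
        return best_node;
}

int main(void)
{
        unsigned int used = 1u << 0;    /* node 0 is the domain's home node */
        int i;

        for (i = 1; i < NR_NODES; i++)
                printf("next best node: %d\n", next_best_node(0, &used));
        return 0;
}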
5525 | 5532 | ||
5526 | /** | 5533 | /** |
5527 | * sched_domain_node_span - get a cpumask for a node's sched_domain | 5534 | * sched_domain_node_span - get a cpumask for a node's sched_domain |
5528 | * @node: node whose cpumask we're constructing | 5535 | * @node: node whose cpumask we're constructing |
5529 | * @size: number of nodes to include in this span | 5536 | * @size: number of nodes to include in this span |
5530 | * | 5537 | * |
5531 | * Given a node, construct a good cpumask for its sched_domain to span. It | 5538 | * Given a node, construct a good cpumask for its sched_domain to span. It |
5532 | * should be one that prevents unnecessary balancing, but also spreads tasks | 5539 | * should be one that prevents unnecessary balancing, but also spreads tasks |
5533 | * out optimally. | 5540 | * out optimally. |
5534 | */ | 5541 | */ |
5535 | static cpumask_t sched_domain_node_span(int node) | 5542 | static cpumask_t sched_domain_node_span(int node) |
5536 | { | 5543 | { |
5537 | int i; | 5544 | int i; |
5538 | cpumask_t span, nodemask; | 5545 | cpumask_t span, nodemask; |
5539 | DECLARE_BITMAP(used_nodes, MAX_NUMNODES); | 5546 | DECLARE_BITMAP(used_nodes, MAX_NUMNODES); |
5540 | 5547 | ||
5541 | cpus_clear(span); | 5548 | cpus_clear(span); |
5542 | bitmap_zero(used_nodes, MAX_NUMNODES); | 5549 | bitmap_zero(used_nodes, MAX_NUMNODES); |
5543 | 5550 | ||
5544 | nodemask = node_to_cpumask(node); | 5551 | nodemask = node_to_cpumask(node); |
5545 | cpus_or(span, span, nodemask); | 5552 | cpus_or(span, span, nodemask); |
5546 | set_bit(node, used_nodes); | 5553 | set_bit(node, used_nodes); |
5547 | 5554 | ||
5548 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | 5555 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { |
5549 | int next_node = find_next_best_node(node, used_nodes); | 5556 | int next_node = find_next_best_node(node, used_nodes); |
5550 | nodemask = node_to_cpumask(next_node); | 5557 | nodemask = node_to_cpumask(next_node); |
5551 | cpus_or(span, span, nodemask); | 5558 | cpus_or(span, span, nodemask); |
5552 | } | 5559 | } |
5553 | 5560 | ||
5554 | return span; | 5561 | return span; |
5555 | } | 5562 | } |
5556 | #endif | 5563 | #endif |
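In the same toy setting, sched_domain_node_span() just ORs the CPU mask of the home node and of each successive best node into the span. The sketch below swaps the distance search for a plain "lowest unused node" pick and uses a made-up node-to-CPU mapping with a span of 2 nodes rather than SD_NODES_PER_DOMAIN, so it shows the shape of the loop rather than the real policy.

#include <stdio.h>

#define NR_NODES 4
#define NODES_PER_SPAN 2        /* stand-in for SD_NODES_PER_DOMAIN */

/* hypothetical node -> CPU bitmask mapping: two CPUs per node */
static const unsigned int node_cpumask[NR_NODES] = { 0x03, 0x0c, 0x30, 0xc0 };

/* crude stand-in for find_next_best_node(): lowest node not yet used */
static int next_unused_node(unsigned int *used)
{
        int n;

        for (n = 0; n < NR_NODES; n++) {
                if (*used & (1u << n))
                        continue;
                *used |= 1u << n;
                return n;
        }
        return 0;
}

/* same shape as sched_domain_node_span(): OR node masks into the span */
static unsigned int node_span(int node)
{
        unsigned int span = 0, used = 0;
        int i;

        used |= 1u << node;
        span |= node_cpumask[node];
        for (i = 1; i < NODES_PER_SPAN; i++)
                span |= node_cpumask[next_unused_node(&used)];
        return span;
}

int main(void)
{
        printf("span for node 2: 0x%02x\n", node_span(2));
        return 0;
}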
5557 | 5564 | ||
5558 | /* | 5565 | /* |
5559 | * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we | 5566 | * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we |
5560 | * can switch it on easily if needed. | 5567 | * can switch it on easily if needed. |
5561 | */ | 5568 | */ |
5562 | #ifdef CONFIG_SCHED_SMT | 5569 | #ifdef CONFIG_SCHED_SMT |
5563 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); | 5570 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); |
5564 | static struct sched_group sched_group_cpus[NR_CPUS]; | 5571 | static struct sched_group sched_group_cpus[NR_CPUS]; |
5565 | static int cpu_to_cpu_group(int cpu) | 5572 | static int cpu_to_cpu_group(int cpu) |
5566 | { | 5573 | { |
5567 | return cpu; | 5574 | return cpu; |
5568 | } | 5575 | } |
5569 | #endif | 5576 | #endif |
5570 | 5577 | ||
5571 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); | 5578 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); |
5572 | static struct sched_group sched_group_phys[NR_CPUS]; | 5579 | static struct sched_group sched_group_phys[NR_CPUS]; |
5573 | static int cpu_to_phys_group(int cpu) | 5580 | static int cpu_to_phys_group(int cpu) |
5574 | { | 5581 | { |
5575 | #ifdef CONFIG_SCHED_SMT | 5582 | #ifdef CONFIG_SCHED_SMT |
5576 | return first_cpu(cpu_sibling_map[cpu]); | 5583 | return first_cpu(cpu_sibling_map[cpu]); |
5577 | #else | 5584 | #else |
5578 | return cpu; | 5585 | return cpu; |
5579 | #endif | 5586 | #endif |
5580 | } | 5587 | } |
5581 | 5588 | ||
5582 | #ifdef CONFIG_NUMA | 5589 | #ifdef CONFIG_NUMA |
5583 | /* | 5590 | /* |
5584 | * The init_sched_build_groups can't handle what we want to do with node | 5591 | * The init_sched_build_groups can't handle what we want to do with node |
5585 | * groups, so roll our own. Now each node has its own list of groups which | 5592 | * groups, so roll our own. Now each node has its own list of groups which |
5586 | * gets dynamically allocated. | 5593 | * gets dynamically allocated. |
5587 | */ | 5594 | */ |
5588 | static DEFINE_PER_CPU(struct sched_domain, node_domains); | 5595 | static DEFINE_PER_CPU(struct sched_domain, node_domains); |
5589 | static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; | 5596 | static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; |
5590 | 5597 | ||
5591 | static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); | 5598 | static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); |
5592 | static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; | 5599 | static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; |
5593 | 5600 | ||
5594 | static int cpu_to_allnodes_group(int cpu) | 5601 | static int cpu_to_allnodes_group(int cpu) |
5595 | { | 5602 | { |
5596 | return cpu_to_node(cpu); | 5603 | return cpu_to_node(cpu); |
5597 | } | 5604 | } |
5598 | #endif | 5605 | #endif |
5599 | 5606 | ||
5600 | /* | 5607 | /* |
5601 | * Build sched domains for a given set of cpus and attach the sched domains | 5608 | * Build sched domains for a given set of cpus and attach the sched domains |
5602 | * to the individual cpus | 5609 | * to the individual cpus |
5603 | */ | 5610 | */ |
5604 | void build_sched_domains(const cpumask_t *cpu_map) | 5611 | void build_sched_domains(const cpumask_t *cpu_map) |
5605 | { | 5612 | { |
5606 | int i; | 5613 | int i; |
5607 | #ifdef CONFIG_NUMA | 5614 | #ifdef CONFIG_NUMA |
5608 | struct sched_group **sched_group_nodes = NULL; | 5615 | struct sched_group **sched_group_nodes = NULL; |
5609 | struct sched_group *sched_group_allnodes = NULL; | 5616 | struct sched_group *sched_group_allnodes = NULL; |
5610 | 5617 | ||
5611 | /* | 5618 | /* |
5612 | * Allocate the per-node list of sched groups | 5619 | * Allocate the per-node list of sched groups |
5613 | */ | 5620 | */ |
5614 | sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES, | 5621 | sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES, |
5615 | GFP_ATOMIC); | 5622 | GFP_ATOMIC); |
5616 | if (!sched_group_nodes) { | 5623 | if (!sched_group_nodes) { |
5617 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 5624 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
5618 | return; | 5625 | return; |
5619 | } | 5626 | } |
5620 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; | 5627 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; |
5621 | #endif | 5628 | #endif |
5622 | 5629 | ||
5623 | /* | 5630 | /* |
5624 | * Set up domains for cpus specified by the cpu_map. | 5631 | * Set up domains for cpus specified by the cpu_map. |
5625 | */ | 5632 | */ |
5626 | for_each_cpu_mask(i, *cpu_map) { | 5633 | for_each_cpu_mask(i, *cpu_map) { |
5627 | int group; | 5634 | int group; |
5628 | struct sched_domain *sd = NULL, *p; | 5635 | struct sched_domain *sd = NULL, *p; |
5629 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); | 5636 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); |
5630 | 5637 | ||
5631 | cpus_and(nodemask, nodemask, *cpu_map); | 5638 | cpus_and(nodemask, nodemask, *cpu_map); |
5632 | 5639 | ||
5633 | #ifdef CONFIG_NUMA | 5640 | #ifdef CONFIG_NUMA |
5634 | if (cpus_weight(*cpu_map) | 5641 | if (cpus_weight(*cpu_map) |
5635 | > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { | 5642 | > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { |
5636 | if (!sched_group_allnodes) { | 5643 | if (!sched_group_allnodes) { |
5637 | sched_group_allnodes | 5644 | sched_group_allnodes |
5638 | = kmalloc(sizeof(struct sched_group) | 5645 | = kmalloc(sizeof(struct sched_group) |
5639 | * MAX_NUMNODES, | 5646 | * MAX_NUMNODES, |
5640 | GFP_KERNEL); | 5647 | GFP_KERNEL); |
5641 | if (!sched_group_allnodes) { | 5648 | if (!sched_group_allnodes) { |
5642 | printk(KERN_WARNING | 5649 | printk(KERN_WARNING |
5643 | "Can not alloc allnodes sched group\n"); | 5650 | "Can not alloc allnodes sched group\n"); |
5644 | break; | 5651 | break; |
5645 | } | 5652 | } |
5646 | sched_group_allnodes_bycpu[i] | 5653 | sched_group_allnodes_bycpu[i] |
5647 | = sched_group_allnodes; | 5654 | = sched_group_allnodes; |
5648 | } | 5655 | } |
5649 | sd = &per_cpu(allnodes_domains, i); | 5656 | sd = &per_cpu(allnodes_domains, i); |
5650 | *sd = SD_ALLNODES_INIT; | 5657 | *sd = SD_ALLNODES_INIT; |
5651 | sd->span = *cpu_map; | 5658 | sd->span = *cpu_map; |
5652 | group = cpu_to_allnodes_group(i); | 5659 | group = cpu_to_allnodes_group(i); |
5653 | sd->groups = &sched_group_allnodes[group]; | 5660 | sd->groups = &sched_group_allnodes[group]; |
5654 | p = sd; | 5661 | p = sd; |
5655 | } else | 5662 | } else |
5656 | p = NULL; | 5663 | p = NULL; |
5657 | 5664 | ||
5658 | sd = &per_cpu(node_domains, i); | 5665 | sd = &per_cpu(node_domains, i); |
5659 | *sd = SD_NODE_INIT; | 5666 | *sd = SD_NODE_INIT; |
5660 | sd->span = sched_domain_node_span(cpu_to_node(i)); | 5667 | sd->span = sched_domain_node_span(cpu_to_node(i)); |
5661 | sd->parent = p; | 5668 | sd->parent = p; |
5662 | cpus_and(sd->span, sd->span, *cpu_map); | 5669 | cpus_and(sd->span, sd->span, *cpu_map); |
5663 | #endif | 5670 | #endif |
5664 | 5671 | ||
5665 | p = sd; | 5672 | p = sd; |
5666 | sd = &per_cpu(phys_domains, i); | 5673 | sd = &per_cpu(phys_domains, i); |
5667 | group = cpu_to_phys_group(i); | 5674 | group = cpu_to_phys_group(i); |
5668 | *sd = SD_CPU_INIT; | 5675 | *sd = SD_CPU_INIT; |
5669 | sd->span = nodemask; | 5676 | sd->span = nodemask; |
5670 | sd->parent = p; | 5677 | sd->parent = p; |
5671 | sd->groups = &sched_group_phys[group]; | 5678 | sd->groups = &sched_group_phys[group]; |
5672 | 5679 | ||
5673 | #ifdef CONFIG_SCHED_SMT | 5680 | #ifdef CONFIG_SCHED_SMT |
5674 | p = sd; | 5681 | p = sd; |
5675 | sd = &per_cpu(cpu_domains, i); | 5682 | sd = &per_cpu(cpu_domains, i); |
5676 | group = cpu_to_cpu_group(i); | 5683 | group = cpu_to_cpu_group(i); |
5677 | *sd = SD_SIBLING_INIT; | 5684 | *sd = SD_SIBLING_INIT; |
5678 | sd->span = cpu_sibling_map[i]; | 5685 | sd->span = cpu_sibling_map[i]; |
5679 | cpus_and(sd->span, sd->span, *cpu_map); | 5686 | cpus_and(sd->span, sd->span, *cpu_map); |
5680 | sd->parent = p; | 5687 | sd->parent = p; |
5681 | sd->groups = &sched_group_cpus[group]; | 5688 | sd->groups = &sched_group_cpus[group]; |
5682 | #endif | 5689 | #endif |
5683 | } | 5690 | } |
5684 | 5691 | ||
5685 | #ifdef CONFIG_SCHED_SMT | 5692 | #ifdef CONFIG_SCHED_SMT |
5686 | /* Set up CPU (sibling) groups */ | 5693 | /* Set up CPU (sibling) groups */ |
5687 | for_each_cpu_mask(i, *cpu_map) { | 5694 | for_each_cpu_mask(i, *cpu_map) { |
5688 | cpumask_t this_sibling_map = cpu_sibling_map[i]; | 5695 | cpumask_t this_sibling_map = cpu_sibling_map[i]; |
5689 | cpus_and(this_sibling_map, this_sibling_map, *cpu_map); | 5696 | cpus_and(this_sibling_map, this_sibling_map, *cpu_map); |
5690 | if (i != first_cpu(this_sibling_map)) | 5697 | if (i != first_cpu(this_sibling_map)) |
5691 | continue; | 5698 | continue; |
5692 | 5699 | ||
5693 | init_sched_build_groups(sched_group_cpus, this_sibling_map, | 5700 | init_sched_build_groups(sched_group_cpus, this_sibling_map, |
5694 | &cpu_to_cpu_group); | 5701 | &cpu_to_cpu_group); |
5695 | } | 5702 | } |
5696 | #endif | 5703 | #endif |
5697 | 5704 | ||
5698 | /* Set up physical groups */ | 5705 | /* Set up physical groups */ |
5699 | for (i = 0; i < MAX_NUMNODES; i++) { | 5706 | for (i = 0; i < MAX_NUMNODES; i++) { |
5700 | cpumask_t nodemask = node_to_cpumask(i); | 5707 | cpumask_t nodemask = node_to_cpumask(i); |
5701 | 5708 | ||
5702 | cpus_and(nodemask, nodemask, *cpu_map); | 5709 | cpus_and(nodemask, nodemask, *cpu_map); |
5703 | if (cpus_empty(nodemask)) | 5710 | if (cpus_empty(nodemask)) |
5704 | continue; | 5711 | continue; |
5705 | 5712 | ||
5706 | init_sched_build_groups(sched_group_phys, nodemask, | 5713 | init_sched_build_groups(sched_group_phys, nodemask, |
5707 | &cpu_to_phys_group); | 5714 | &cpu_to_phys_group); |
5708 | } | 5715 | } |
5709 | 5716 | ||
5710 | #ifdef CONFIG_NUMA | 5717 | #ifdef CONFIG_NUMA |
5711 | /* Set up node groups */ | 5718 | /* Set up node groups */ |
5712 | if (sched_group_allnodes) | 5719 | if (sched_group_allnodes) |
5713 | init_sched_build_groups(sched_group_allnodes, *cpu_map, | 5720 | init_sched_build_groups(sched_group_allnodes, *cpu_map, |
5714 | &cpu_to_allnodes_group); | 5721 | &cpu_to_allnodes_group); |
5715 | 5722 | ||
5716 | for (i = 0; i < MAX_NUMNODES; i++) { | 5723 | for (i = 0; i < MAX_NUMNODES; i++) { |
5717 | /* Set up node groups */ | 5724 | /* Set up node groups */ |
5718 | struct sched_group *sg, *prev; | 5725 | struct sched_group *sg, *prev; |
5719 | cpumask_t nodemask = node_to_cpumask(i); | 5726 | cpumask_t nodemask = node_to_cpumask(i); |
5720 | cpumask_t domainspan; | 5727 | cpumask_t domainspan; |
5721 | cpumask_t covered = CPU_MASK_NONE; | 5728 | cpumask_t covered = CPU_MASK_NONE; |
5722 | int j; | 5729 | int j; |
5723 | 5730 | ||
5724 | cpus_and(nodemask, nodemask, *cpu_map); | 5731 | cpus_and(nodemask, nodemask, *cpu_map); |
5725 | if (cpus_empty(nodemask)) { | 5732 | if (cpus_empty(nodemask)) { |
5726 | sched_group_nodes[i] = NULL; | 5733 | sched_group_nodes[i] = NULL; |
5727 | continue; | 5734 | continue; |
5728 | } | 5735 | } |
5729 | 5736 | ||
5730 | domainspan = sched_domain_node_span(i); | 5737 | domainspan = sched_domain_node_span(i); |
5731 | cpus_and(domainspan, domainspan, *cpu_map); | 5738 | cpus_and(domainspan, domainspan, *cpu_map); |
5732 | 5739 | ||
5733 | sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); | 5740 | sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); |
5734 | sched_group_nodes[i] = sg; | 5741 | sched_group_nodes[i] = sg; |
5735 | for_each_cpu_mask(j, nodemask) { | 5742 | for_each_cpu_mask(j, nodemask) { |
5736 | struct sched_domain *sd; | 5743 | struct sched_domain *sd; |
5737 | sd = &per_cpu(node_domains, j); | 5744 | sd = &per_cpu(node_domains, j); |
5738 | sd->groups = sg; | 5745 | sd->groups = sg; |
5739 | if (sd->groups == NULL) { | 5746 | if (sd->groups == NULL) { |
5740 | /* Turn off balancing if we have no groups */ | 5747 | /* Turn off balancing if we have no groups */ |
5741 | sd->flags = 0; | 5748 | sd->flags = 0; |
5742 | } | 5749 | } |
5743 | } | 5750 | } |
5744 | if (!sg) { | 5751 | if (!sg) { |
5745 | printk(KERN_WARNING | 5752 | printk(KERN_WARNING |
5746 | "Can not alloc domain group for node %d\n", i); | 5753 | "Can not alloc domain group for node %d\n", i); |
5747 | continue; | 5754 | continue; |
5748 | } | 5755 | } |
5749 | sg->cpu_power = 0; | 5756 | sg->cpu_power = 0; |
5750 | sg->cpumask = nodemask; | 5757 | sg->cpumask = nodemask; |
5751 | cpus_or(covered, covered, nodemask); | 5758 | cpus_or(covered, covered, nodemask); |
5752 | prev = sg; | 5759 | prev = sg; |
5753 | 5760 | ||
5754 | for (j = 0; j < MAX_NUMNODES; j++) { | 5761 | for (j = 0; j < MAX_NUMNODES; j++) { |
5755 | cpumask_t tmp, notcovered; | 5762 | cpumask_t tmp, notcovered; |
5756 | int n = (i + j) % MAX_NUMNODES; | 5763 | int n = (i + j) % MAX_NUMNODES; |
5757 | 5764 | ||
5758 | cpus_complement(notcovered, covered); | 5765 | cpus_complement(notcovered, covered); |
5759 | cpus_and(tmp, notcovered, *cpu_map); | 5766 | cpus_and(tmp, notcovered, *cpu_map); |
5760 | cpus_and(tmp, tmp, domainspan); | 5767 | cpus_and(tmp, tmp, domainspan); |
5761 | if (cpus_empty(tmp)) | 5768 | if (cpus_empty(tmp)) |
5762 | break; | 5769 | break; |
5763 | 5770 | ||
5764 | nodemask = node_to_cpumask(n); | 5771 | nodemask = node_to_cpumask(n); |
5765 | cpus_and(tmp, tmp, nodemask); | 5772 | cpus_and(tmp, tmp, nodemask); |
5766 | if (cpus_empty(tmp)) | 5773 | if (cpus_empty(tmp)) |
5767 | continue; | 5774 | continue; |
5768 | 5775 | ||
5769 | sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); | 5776 | sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); |
5770 | if (!sg) { | 5777 | if (!sg) { |
5771 | printk(KERN_WARNING | 5778 | printk(KERN_WARNING |
5772 | "Can not alloc domain group for node %d\n", j); | 5779 | "Can not alloc domain group for node %d\n", j); |
5773 | break; | 5780 | break; |
5774 | } | 5781 | } |
5775 | sg->cpu_power = 0; | 5782 | sg->cpu_power = 0; |
5776 | sg->cpumask = tmp; | 5783 | sg->cpumask = tmp; |
5777 | cpus_or(covered, covered, tmp); | 5784 | cpus_or(covered, covered, tmp); |
5778 | prev->next = sg; | 5785 | prev->next = sg; |
5779 | prev = sg; | 5786 | prev = sg; |
5780 | } | 5787 | } |
5781 | prev->next = sched_group_nodes[i]; | 5788 | prev->next = sched_group_nodes[i]; |
5782 | } | 5789 | } |
5783 | #endif | 5790 | #endif |
5784 | 5791 | ||
5785 | /* Calculate CPU power for physical packages and nodes */ | 5792 | /* Calculate CPU power for physical packages and nodes */ |
5786 | for_each_cpu_mask(i, *cpu_map) { | 5793 | for_each_cpu_mask(i, *cpu_map) { |
5787 | int power; | 5794 | int power; |
5788 | struct sched_domain *sd; | 5795 | struct sched_domain *sd; |
5789 | #ifdef CONFIG_SCHED_SMT | 5796 | #ifdef CONFIG_SCHED_SMT |
5790 | sd = &per_cpu(cpu_domains, i); | 5797 | sd = &per_cpu(cpu_domains, i); |
5791 | power = SCHED_LOAD_SCALE; | 5798 | power = SCHED_LOAD_SCALE; |
5792 | sd->groups->cpu_power = power; | 5799 | sd->groups->cpu_power = power; |
5793 | #endif | 5800 | #endif |
5794 | 5801 | ||
5795 | sd = &per_cpu(phys_domains, i); | 5802 | sd = &per_cpu(phys_domains, i); |
5796 | power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * | 5803 | power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * |
5797 | (cpus_weight(sd->groups->cpumask)-1) / 10; | 5804 | (cpus_weight(sd->groups->cpumask)-1) / 10; |
5798 | sd->groups->cpu_power = power; | 5805 | sd->groups->cpu_power = power; |
5799 | 5806 | ||
5800 | #ifdef CONFIG_NUMA | 5807 | #ifdef CONFIG_NUMA |
5801 | sd = &per_cpu(allnodes_domains, i); | 5808 | sd = &per_cpu(allnodes_domains, i); |
5802 | if (sd->groups) { | 5809 | if (sd->groups) { |
5803 | power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * | 5810 | power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * |
5804 | (cpus_weight(sd->groups->cpumask)-1) / 10; | 5811 | (cpus_weight(sd->groups->cpumask)-1) / 10; |
5805 | sd->groups->cpu_power = power; | 5812 | sd->groups->cpu_power = power; |
5806 | } | 5813 | } |
5807 | #endif | 5814 | #endif |
5808 | } | 5815 | } |
5809 | 5816 | ||
5810 | #ifdef CONFIG_NUMA | 5817 | #ifdef CONFIG_NUMA |
5811 | for (i = 0; i < MAX_NUMNODES; i++) { | 5818 | for (i = 0; i < MAX_NUMNODES; i++) { |
5812 | struct sched_group *sg = sched_group_nodes[i]; | 5819 | struct sched_group *sg = sched_group_nodes[i]; |
5813 | int j; | 5820 | int j; |
5814 | 5821 | ||
5815 | if (sg == NULL) | 5822 | if (sg == NULL) |
5816 | continue; | 5823 | continue; |
5817 | next_sg: | 5824 | next_sg: |
5818 | for_each_cpu_mask(j, sg->cpumask) { | 5825 | for_each_cpu_mask(j, sg->cpumask) { |
5819 | struct sched_domain *sd; | 5826 | struct sched_domain *sd; |
5820 | int power; | 5827 | int power; |
5821 | 5828 | ||
5822 | sd = &per_cpu(phys_domains, j); | 5829 | sd = &per_cpu(phys_domains, j); |
5823 | if (j != first_cpu(sd->groups->cpumask)) { | 5830 | if (j != first_cpu(sd->groups->cpumask)) { |
5824 | /* | 5831 | /* |
5825 | * Only add "power" once for each | 5832 | * Only add "power" once for each |
5826 | * physical package. | 5833 | * physical package. |
5827 | */ | 5834 | */ |
5828 | continue; | 5835 | continue; |
5829 | } | 5836 | } |
5830 | power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * | 5837 | power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * |
5831 | (cpus_weight(sd->groups->cpumask)-1) / 10; | 5838 | (cpus_weight(sd->groups->cpumask)-1) / 10; |
5832 | 5839 | ||
5833 | sg->cpu_power += power; | 5840 | sg->cpu_power += power; |
5834 | } | 5841 | } |
5835 | sg = sg->next; | 5842 | sg = sg->next; |
5836 | if (sg != sched_group_nodes[i]) | 5843 | if (sg != sched_group_nodes[i]) |
5837 | goto next_sg; | 5844 | goto next_sg; |
5838 | } | 5845 | } |
5839 | #endif | 5846 | #endif |
5840 | 5847 | ||
5841 | /* Attach the domains */ | 5848 | /* Attach the domains */ |
5842 | for_each_cpu_mask(i, *cpu_map) { | 5849 | for_each_cpu_mask(i, *cpu_map) { |
5843 | struct sched_domain *sd; | 5850 | struct sched_domain *sd; |
5844 | #ifdef CONFIG_SCHED_SMT | 5851 | #ifdef CONFIG_SCHED_SMT |
5845 | sd = &per_cpu(cpu_domains, i); | 5852 | sd = &per_cpu(cpu_domains, i); |
5846 | #else | 5853 | #else |
5847 | sd = &per_cpu(phys_domains, i); | 5854 | sd = &per_cpu(phys_domains, i); |
5848 | #endif | 5855 | #endif |
5849 | cpu_attach_domain(sd, i); | 5856 | cpu_attach_domain(sd, i); |
5850 | } | 5857 | } |
5851 | /* | 5858 | /* |
5852 | * Tune cache-hot values: | 5859 | * Tune cache-hot values: |
5853 | */ | 5860 | */ |
5854 | calibrate_migration_costs(cpu_map); | 5861 | calibrate_migration_costs(cpu_map); |
5855 | } | 5862 | } |
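Every cpu_power assignment in build_sched_domains() uses the same expression: one SCHED_LOAD_SCALE for the first CPU of a group plus a tenth of SCHED_LOAD_SCALE for each additional CPU. The snippet below only evaluates that formula for a few group sizes; SCHED_LOAD_SCALE is hard-coded to 128 purely for the demonstration and the group sizes are made up.

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL  /* demonstration value only */

/* cpu_power of a sched_group with nr_cpus CPUs, as computed above */
static unsigned long group_cpu_power(unsigned int nr_cpus)
{
        return SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * (nr_cpus - 1) / 10;
}

int main(void)
{
        unsigned int n;

        for (n = 1; n <= 8; n++)
                printf("%u-cpu group -> cpu_power %lu\n", n, group_cpu_power(n));
        return 0;
}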
5856 | /* | 5863 | /* |
5857 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 5864 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. |
5858 | */ | 5865 | */ |
5859 | static void arch_init_sched_domains(const cpumask_t *cpu_map) | 5866 | static void arch_init_sched_domains(const cpumask_t *cpu_map) |
5860 | { | 5867 | { |
5861 | cpumask_t cpu_default_map; | 5868 | cpumask_t cpu_default_map; |
5862 | 5869 | ||
5863 | /* | 5870 | /* |
5864 | * Setup mask for cpus without special case scheduling requirements. | 5871 | * Setup mask for cpus without special case scheduling requirements. |
5865 | * For now this just excludes isolated cpus, but could be used to | 5872 | * For now this just excludes isolated cpus, but could be used to |
5866 | * exclude other special cases in the future. | 5873 | * exclude other special cases in the future. |
5867 | */ | 5874 | */ |
5868 | cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); | 5875 | cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); |
5869 | 5876 | ||
5870 | build_sched_domains(&cpu_default_map); | 5877 | build_sched_domains(&cpu_default_map); |
5871 | } | 5878 | } |
5872 | 5879 | ||
5873 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) | 5880 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) |
5874 | { | 5881 | { |
5875 | #ifdef CONFIG_NUMA | 5882 | #ifdef CONFIG_NUMA |
5876 | int i; | 5883 | int i; |
5877 | int cpu; | 5884 | int cpu; |
5878 | 5885 | ||
5879 | for_each_cpu_mask(cpu, *cpu_map) { | 5886 | for_each_cpu_mask(cpu, *cpu_map) { |
5880 | struct sched_group *sched_group_allnodes | 5887 | struct sched_group *sched_group_allnodes |
5881 | = sched_group_allnodes_bycpu[cpu]; | 5888 | = sched_group_allnodes_bycpu[cpu]; |
5882 | struct sched_group **sched_group_nodes | 5889 | struct sched_group **sched_group_nodes |
5883 | = sched_group_nodes_bycpu[cpu]; | 5890 | = sched_group_nodes_bycpu[cpu]; |
5884 | 5891 | ||
5885 | if (sched_group_allnodes) { | 5892 | if (sched_group_allnodes) { |
5886 | kfree(sched_group_allnodes); | 5893 | kfree(sched_group_allnodes); |
5887 | sched_group_allnodes_bycpu[cpu] = NULL; | 5894 | sched_group_allnodes_bycpu[cpu] = NULL; |
5888 | } | 5895 | } |
5889 | 5896 | ||
5890 | if (!sched_group_nodes) | 5897 | if (!sched_group_nodes) |
5891 | continue; | 5898 | continue; |
5892 | 5899 | ||
5893 | for (i = 0; i < MAX_NUMNODES; i++) { | 5900 | for (i = 0; i < MAX_NUMNODES; i++) { |
5894 | cpumask_t nodemask = node_to_cpumask(i); | 5901 | cpumask_t nodemask = node_to_cpumask(i); |
5895 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | 5902 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; |
5896 | 5903 | ||
5897 | cpus_and(nodemask, nodemask, *cpu_map); | 5904 | cpus_and(nodemask, nodemask, *cpu_map); |
5898 | if (cpus_empty(nodemask)) | 5905 | if (cpus_empty(nodemask)) |
5899 | continue; | 5906 | continue; |
5900 | 5907 | ||
5901 | if (sg == NULL) | 5908 | if (sg == NULL) |
5902 | continue; | 5909 | continue; |
5903 | sg = sg->next; | 5910 | sg = sg->next; |
5904 | next_sg: | 5911 | next_sg: |
5905 | oldsg = sg; | 5912 | oldsg = sg; |
5906 | sg = sg->next; | 5913 | sg = sg->next; |
5907 | kfree(oldsg); | 5914 | kfree(oldsg); |
5908 | if (oldsg != sched_group_nodes[i]) | 5915 | if (oldsg != sched_group_nodes[i]) |
5909 | goto next_sg; | 5916 | goto next_sg; |
5910 | } | 5917 | } |
5911 | kfree(sched_group_nodes); | 5918 | kfree(sched_group_nodes); |
5912 | sched_group_nodes_bycpu[cpu] = NULL; | 5919 | sched_group_nodes_bycpu[cpu] = NULL; |
5913 | } | 5920 | } |
5914 | #endif | 5921 | #endif |
5915 | } | 5922 | } |
5916 | 5923 | ||
5917 | /* | 5924 | /* |
5918 | * Detach sched domains from a group of cpus specified in cpu_map | 5925 | * Detach sched domains from a group of cpus specified in cpu_map |
5919 | * These cpus will now be attached to the NULL domain | 5926 | * These cpus will now be attached to the NULL domain |
5920 | */ | 5927 | */ |
5921 | static void detach_destroy_domains(const cpumask_t *cpu_map) | 5928 | static void detach_destroy_domains(const cpumask_t *cpu_map) |
5922 | { | 5929 | { |
5923 | int i; | 5930 | int i; |
5924 | 5931 | ||
5925 | for_each_cpu_mask(i, *cpu_map) | 5932 | for_each_cpu_mask(i, *cpu_map) |
5926 | cpu_attach_domain(NULL, i); | 5933 | cpu_attach_domain(NULL, i); |
5927 | synchronize_sched(); | 5934 | synchronize_sched(); |
5928 | arch_destroy_sched_domains(cpu_map); | 5935 | arch_destroy_sched_domains(cpu_map); |
5929 | } | 5936 | } |
5930 | 5937 | ||
5931 | /* | 5938 | /* |
5932 | * Partition sched domains as specified by the cpumasks below. | 5939 | * Partition sched domains as specified by the cpumasks below. |
5933 | * This attaches all cpus from the cpumasks to the NULL domain, | 5940 | * This attaches all cpus from the cpumasks to the NULL domain, |
5934 | * waits for an RCU quiescent period, recalculates sched | 5941 | * waits for an RCU quiescent period, recalculates sched |
5935 | * domain information and then attaches them back to the | 5942 | * domain information and then attaches them back to the |
5936 | * correct sched domains | 5943 | * correct sched domains |
5937 | * Call with hotplug lock held | 5944 | * Call with hotplug lock held |
5938 | */ | 5945 | */ |
5939 | void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) | 5946 | void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) |
5940 | { | 5947 | { |
5941 | cpumask_t change_map; | 5948 | cpumask_t change_map; |
5942 | 5949 | ||
5943 | cpus_and(*partition1, *partition1, cpu_online_map); | 5950 | cpus_and(*partition1, *partition1, cpu_online_map); |
5944 | cpus_and(*partition2, *partition2, cpu_online_map); | 5951 | cpus_and(*partition2, *partition2, cpu_online_map); |
5945 | cpus_or(change_map, *partition1, *partition2); | 5952 | cpus_or(change_map, *partition1, *partition2); |
5946 | 5953 | ||
5947 | /* Detach sched domains from all of the affected cpus */ | 5954 | /* Detach sched domains from all of the affected cpus */ |
5948 | detach_destroy_domains(&change_map); | 5955 | detach_destroy_domains(&change_map); |
5949 | if (!cpus_empty(*partition1)) | 5956 | if (!cpus_empty(*partition1)) |
5950 | build_sched_domains(partition1); | 5957 | build_sched_domains(partition1); |
5951 | if (!cpus_empty(*partition2)) | 5958 | if (!cpus_empty(*partition2)) |
5952 | build_sched_domains(partition2); | 5959 | build_sched_domains(partition2); |
5953 | } | 5960 | } |
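partition_sched_domains() is mostly cpumask bookkeeping: clamp both partitions to the online map, detach every CPU in their union, then rebuild each non-empty partition. The toy sketch below walks through just that flow, with plain unsigned ints in place of cpumask_t and printf() in place of the real detach/build work; all the masks are invented.

#include <stdio.h>

static void detach_domains(unsigned int map)
{
        printf("detach 0x%02x (attach these CPUs to the NULL domain)\n", map);
}

static void build_domains(unsigned int map)
{
        printf("rebuild sched domains for 0x%02x\n", map);
}

int main(void)
{
        unsigned int online = 0x0f;                     /* CPUs 0-3 online */
        unsigned int part1 = 0x03, part2 = 0x1c;        /* hypothetical split */
        unsigned int change_map;

        part1 &= online;                /* cpus_and(..., cpu_online_map) */
        part2 &= online;
        change_map = part1 | part2;     /* cpus_or(change_map, part1, part2) */

        detach_domains(change_map);
        if (part1)
                build_domains(part1);
        if (part2)
                build_domains(part2);
        return 0;
}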
5954 | 5961 | ||
5955 | #ifdef CONFIG_HOTPLUG_CPU | 5962 | #ifdef CONFIG_HOTPLUG_CPU |
5956 | /* | 5963 | /* |
5957 | * Force a reinitialization of the sched domains hierarchy. The domains | 5964 | * Force a reinitialization of the sched domains hierarchy. The domains |
5958 | * and groups cannot be updated in place without racing with the balancing | 5965 | * and groups cannot be updated in place without racing with the balancing |
5959 | * code, so we temporarily attach all running cpus to the NULL domain | 5966 | * code, so we temporarily attach all running cpus to the NULL domain |
5960 | * which will prevent rebalancing while the sched domains are recalculated. | 5967 | * which will prevent rebalancing while the sched domains are recalculated. |
5961 | */ | 5968 | */ |
5962 | static int update_sched_domains(struct notifier_block *nfb, | 5969 | static int update_sched_domains(struct notifier_block *nfb, |
5963 | unsigned long action, void *hcpu) | 5970 | unsigned long action, void *hcpu) |
5964 | { | 5971 | { |
5965 | switch (action) { | 5972 | switch (action) { |
5966 | case CPU_UP_PREPARE: | 5973 | case CPU_UP_PREPARE: |
5967 | case CPU_DOWN_PREPARE: | 5974 | case CPU_DOWN_PREPARE: |
5968 | detach_destroy_domains(&cpu_online_map); | 5975 | detach_destroy_domains(&cpu_online_map); |
5969 | return NOTIFY_OK; | 5976 | return NOTIFY_OK; |
5970 | 5977 | ||
5971 | case CPU_UP_CANCELED: | 5978 | case CPU_UP_CANCELED: |
5972 | case CPU_DOWN_FAILED: | 5979 | case CPU_DOWN_FAILED: |
5973 | case CPU_ONLINE: | 5980 | case CPU_ONLINE: |
5974 | case CPU_DEAD: | 5981 | case CPU_DEAD: |
5975 | /* | 5982 | /* |
5976 | * Fall through and re-initialise the domains. | 5983 | * Fall through and re-initialise the domains. |
5977 | */ | 5984 | */ |
5978 | break; | 5985 | break; |
5979 | default: | 5986 | default: |
5980 | return NOTIFY_DONE; | 5987 | return NOTIFY_DONE; |
5981 | } | 5988 | } |
5982 | 5989 | ||
5983 | /* The hotplug lock is already held by cpu_up/cpu_down */ | 5990 | /* The hotplug lock is already held by cpu_up/cpu_down */ |
5984 | arch_init_sched_domains(&cpu_online_map); | 5991 | arch_init_sched_domains(&cpu_online_map); |
5985 | 5992 | ||
5986 | return NOTIFY_OK; | 5993 | return NOTIFY_OK; |
5987 | } | 5994 | } |
5988 | #endif | 5995 | #endif |
5989 | 5996 | ||
5990 | void __init sched_init_smp(void) | 5997 | void __init sched_init_smp(void) |
5991 | { | 5998 | { |
5992 | lock_cpu_hotplug(); | 5999 | lock_cpu_hotplug(); |
5993 | arch_init_sched_domains(&cpu_online_map); | 6000 | arch_init_sched_domains(&cpu_online_map); |
5994 | unlock_cpu_hotplug(); | 6001 | unlock_cpu_hotplug(); |
5995 | /* XXX: Theoretical race here - CPU may be hotplugged now */ | 6002 | /* XXX: Theoretical race here - CPU may be hotplugged now */ |
5996 | hotcpu_notifier(update_sched_domains, 0); | 6003 | hotcpu_notifier(update_sched_domains, 0); |
5997 | } | 6004 | } |
5998 | #else | 6005 | #else |
5999 | void __init sched_init_smp(void) | 6006 | void __init sched_init_smp(void) |
6000 | { | 6007 | { |
6001 | } | 6008 | } |
6002 | #endif /* CONFIG_SMP */ | 6009 | #endif /* CONFIG_SMP */ |
6003 | 6010 | ||
6004 | int in_sched_functions(unsigned long addr) | 6011 | int in_sched_functions(unsigned long addr) |
6005 | { | 6012 | { |
6006 | /* Linker adds these: start and end of __sched functions */ | 6013 | /* Linker adds these: start and end of __sched functions */ |
6007 | extern char __sched_text_start[], __sched_text_end[]; | 6014 | extern char __sched_text_start[], __sched_text_end[]; |
6008 | return in_lock_functions(addr) || | 6015 | return in_lock_functions(addr) || |
6009 | (addr >= (unsigned long)__sched_text_start | 6016 | (addr >= (unsigned long)__sched_text_start |
6010 | && addr < (unsigned long)__sched_text_end); | 6017 | && addr < (unsigned long)__sched_text_end); |
6011 | } | 6018 | } |
6012 | 6019 | ||
6013 | void __init sched_init(void) | 6020 | void __init sched_init(void) |
6014 | { | 6021 | { |
6015 | runqueue_t *rq; | 6022 | runqueue_t *rq; |
6016 | int i, j, k; | 6023 | int i, j, k; |
6017 | 6024 | ||
6018 | for_each_cpu(i) { | 6025 | for_each_cpu(i) { |
6019 | prio_array_t *array; | 6026 | prio_array_t *array; |
6020 | 6027 | ||
6021 | rq = cpu_rq(i); | 6028 | rq = cpu_rq(i); |
6022 | spin_lock_init(&rq->lock); | 6029 | spin_lock_init(&rq->lock); |
6023 | rq->nr_running = 0; | 6030 | rq->nr_running = 0; |
6024 | rq->active = rq->arrays; | 6031 | rq->active = rq->arrays; |
6025 | rq->expired = rq->arrays + 1; | 6032 | rq->expired = rq->arrays + 1; |
6026 | rq->best_expired_prio = MAX_PRIO; | 6033 | rq->best_expired_prio = MAX_PRIO; |
6027 | 6034 | ||
6028 | #ifdef CONFIG_SMP | 6035 | #ifdef CONFIG_SMP |
6029 | rq->sd = NULL; | 6036 | rq->sd = NULL; |
6030 | for (j = 1; j < 3; j++) | 6037 | for (j = 1; j < 3; j++) |
6031 | rq->cpu_load[j] = 0; | 6038 | rq->cpu_load[j] = 0; |
6032 | rq->active_balance = 0; | 6039 | rq->active_balance = 0; |
6033 | rq->push_cpu = 0; | 6040 | rq->push_cpu = 0; |
6034 | rq->migration_thread = NULL; | 6041 | rq->migration_thread = NULL; |
6035 | INIT_LIST_HEAD(&rq->migration_queue); | 6042 | INIT_LIST_HEAD(&rq->migration_queue); |
6036 | rq->cpu = i; | 6043 | rq->cpu = i; |
6037 | #endif | 6044 | #endif |
6038 | atomic_set(&rq->nr_iowait, 0); | 6045 | atomic_set(&rq->nr_iowait, 0); |
6039 | 6046 | ||
6040 | for (j = 0; j < 2; j++) { | 6047 | for (j = 0; j < 2; j++) { |
6041 | array = rq->arrays + j; | 6048 | array = rq->arrays + j; |
6042 | for (k = 0; k < MAX_PRIO; k++) { | 6049 | for (k = 0; k < MAX_PRIO; k++) { |
6043 | INIT_LIST_HEAD(array->queue + k); | 6050 | INIT_LIST_HEAD(array->queue + k); |
6044 | __clear_bit(k, array->bitmap); | 6051 | __clear_bit(k, array->bitmap); |
6045 | } | 6052 | } |
6046 | // delimiter for bitsearch | 6053 | // delimiter for bitsearch |
6047 | __set_bit(MAX_PRIO, array->bitmap); | 6054 | __set_bit(MAX_PRIO, array->bitmap); |
6048 | } | 6055 | } |
6049 | } | 6056 | } |
6050 | 6057 | ||
6051 | /* | 6058 | /* |
6052 | * The boot idle thread does lazy MMU switching as well: | 6059 | * The boot idle thread does lazy MMU switching as well: |
6053 | */ | 6060 | */ |
6054 | atomic_inc(&init_mm.mm_count); | 6061 | atomic_inc(&init_mm.mm_count); |
6055 | enter_lazy_tlb(&init_mm, current); | 6062 | enter_lazy_tlb(&init_mm, current); |
6056 | 6063 | ||
6057 | /* | 6064 | /* |
6058 | * Make us the idle thread. Technically, schedule() should not be | 6065 | * Make us the idle thread. Technically, schedule() should not be |
6059 | * called from this thread, however somewhere below it might be, | 6066 | * called from this thread, however somewhere below it might be, |
6060 | * but because we are the idle thread, we just pick up running again | 6067 | * but because we are the idle thread, we just pick up running again |
6061 | * when this runqueue becomes "idle". | 6068 | * when this runqueue becomes "idle". |
6062 | */ | 6069 | */ |
6063 | init_idle(current, smp_processor_id()); | 6070 | init_idle(current, smp_processor_id()); |
6064 | } | 6071 | } |
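sched_init() clears both priority arrays and then sets bit MAX_PRIO as a sentinel, so a find-first-bit scan over an empty array stops at MAX_PRIO instead of running off the end of the bitmap. Below is a userspace sketch of that delimiter trick; MAX_PRIO is taken as 140 and the scan is a naive loop rather than sched_find_first_bit().

#include <stdio.h>
#include <string.h>

#define MAX_PRIO        140
#define BITS_PER_LONG   (8 * (int)sizeof(unsigned long))
#define BITMAP_LONGS    ((MAX_PRIO + 1 + BITS_PER_LONG - 1) / BITS_PER_LONG)

static unsigned long bitmap[BITMAP_LONGS];

static void set_prio_bit(int nr)
{
        bitmap[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

static int first_set_bit(void)
{
        int i;

        for (i = 0; i <= MAX_PRIO; i++)
                if (bitmap[i / BITS_PER_LONG] & (1UL << (i % BITS_PER_LONG)))
                        return i;
        return MAX_PRIO;        /* never reached thanks to the delimiter */
}

int main(void)
{
        memset(bitmap, 0, sizeof(bitmap));
        set_prio_bit(MAX_PRIO);         /* delimiter for bitsearch */

        printf("empty runqueue -> first bit %d (== MAX_PRIO)\n", first_set_bit());

        set_prio_bit(35);               /* pretend a prio-35 task was queued */
        printf("after queueing prio 35 -> first bit %d\n", first_set_bit());
        return 0;
}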
6065 | 6072 | ||
6066 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 6073 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
6067 | void __might_sleep(char *file, int line) | 6074 | void __might_sleep(char *file, int line) |
6068 | { | 6075 | { |
6069 | #if defined(in_atomic) | 6076 | #if defined(in_atomic) |
6070 | static unsigned long prev_jiffy; /* ratelimiting */ | 6077 | static unsigned long prev_jiffy; /* ratelimiting */ |
6071 | 6078 | ||
6072 | if ((in_atomic() || irqs_disabled()) && | 6079 | if ((in_atomic() || irqs_disabled()) && |
6073 | system_state == SYSTEM_RUNNING && !oops_in_progress) { | 6080 | system_state == SYSTEM_RUNNING && !oops_in_progress) { |
6074 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) | 6081 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) |
6075 | return; | 6082 | return; |
6076 | prev_jiffy = jiffies; | 6083 | prev_jiffy = jiffies; |
6077 | printk(KERN_ERR "BUG: sleeping function called from invalid" | 6084 | printk(KERN_ERR "BUG: sleeping function called from invalid" |
6078 | " context at %s:%d\n", file, line); | 6085 | " context at %s:%d\n", file, line); |
6079 | printk("in_atomic():%d, irqs_disabled():%d\n", | 6086 | printk("in_atomic():%d, irqs_disabled():%d\n", |
6080 | in_atomic(), irqs_disabled()); | 6087 | in_atomic(), irqs_disabled()); |
6081 | dump_stack(); | 6088 | dump_stack(); |
6082 | } | 6089 | } |
6083 | #endif | 6090 | #endif |
6084 | } | 6091 | } |
6085 | EXPORT_SYMBOL(__might_sleep); | 6092 | EXPORT_SYMBOL(__might_sleep); |
6086 | #endif | 6093 | #endif |
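__might_sleep() rate-limits its warning by remembering the jiffies value at which it last fired and staying quiet for the next HZ ticks. The same pattern in a userspace sketch, with time() and a one-second window standing in for jiffies and HZ:

#include <stdio.h>
#include <time.h>

/* Emit the warning at most once per second, like the prev_jiffy + HZ test. */
static void warn_ratelimited(const char *msg)
{
        static time_t prev;
        time_t now = time(NULL);

        if (prev && now < prev + 1)
                return;                 /* still inside the quiet window */
        prev = now;
        fprintf(stderr, "BUG: %s\n", msg);
}

int main(void)
{
        int i;

        for (i = 0; i < 5; i++)
                warn_ratelimited("sleeping function called from invalid context");
        return 0;
}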
6087 | 6094 | ||
6088 | #ifdef CONFIG_MAGIC_SYSRQ | 6095 | #ifdef CONFIG_MAGIC_SYSRQ |
6089 | void normalize_rt_tasks(void) | 6096 | void normalize_rt_tasks(void) |
6090 | { | 6097 | { |
6091 | struct task_struct *p; | 6098 | struct task_struct *p; |
6092 | prio_array_t *array; | 6099 | prio_array_t *array; |
6093 | unsigned long flags; | 6100 | unsigned long flags; |
6094 | runqueue_t *rq; | 6101 | runqueue_t *rq; |
6095 | 6102 | ||
6096 | read_lock_irq(&tasklist_lock); | 6103 | read_lock_irq(&tasklist_lock); |
6097 | for_each_process (p) { | 6104 | for_each_process (p) { |
6098 | if (!rt_task(p)) | 6105 | if (!rt_task(p)) |
6099 | continue; | 6106 | continue; |
6100 | 6107 | ||
6101 | rq = task_rq_lock(p, &flags); | 6108 | rq = task_rq_lock(p, &flags); |
6102 | 6109 | ||
6103 | array = p->array; | 6110 | array = p->array; |
6104 | if (array) | 6111 | if (array) |
6105 | deactivate_task(p, task_rq(p)); | 6112 | deactivate_task(p, task_rq(p)); |
6106 | __setscheduler(p, SCHED_NORMAL, 0); | 6113 | __setscheduler(p, SCHED_NORMAL, 0); |
6107 | if (array) { | 6114 | if (array) { |
6108 | __activate_task(p, task_rq(p)); | 6115 | __activate_task(p, task_rq(p)); |
6109 | resched_task(rq->curr); | 6116 | resched_task(rq->curr); |
6110 | } | 6117 | } |
6111 | 6118 | ||
6112 | task_rq_unlock(rq, &flags); | 6119 | task_rq_unlock(rq, &flags); |
6113 | } | 6120 | } |
6114 | read_unlock_irq(&tasklist_lock); | 6121 | read_unlock_irq(&tasklist_lock); |
6115 | } | 6122 | } |
6116 | 6123 | ||
6117 | #endif /* CONFIG_MAGIC_SYSRQ */ | 6124 | #endif /* CONFIG_MAGIC_SYSRQ */ |
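normalize_rt_tasks() pushes every realtime task back to SCHED_NORMAL by dequeueing it, resetting its policy, and requeueing it under the runqueue lock. A single-process userspace analogue is simply dropping the caller to the normal policy with sched_setscheduler(); no locking or requeueing is involved there.

#include <sched.h>
#include <stdio.h>

int main(void)
{
        struct sched_param sp = { .sched_priority = 0 };

        /* Drop back to the normal time-sharing policy (a no-op if we
         * were not running as a realtime task to begin with). */
        if (sched_setscheduler(0, SCHED_OTHER, &sp)) {
                perror("sched_setscheduler");
                return 1;
        }
        printf("policy is now %d (SCHED_OTHER)\n", sched_getscheduler(0));
        return 0;
}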
6118 | 6125 | ||
6119 | #ifdef CONFIG_IA64 | 6126 | #ifdef CONFIG_IA64 |
6120 | /* | 6127 | /* |
6121 | * These functions are only useful for the IA64 MCA handling. | 6128 | * These functions are only useful for the IA64 MCA handling. |
6122 | * | 6129 | * |
6123 | * They can only be called when the whole system has been | 6130 | * They can only be called when the whole system has been |
6124 | * stopped - every CPU needs to be quiescent, and no scheduling | 6131 | * stopped - every CPU needs to be quiescent, and no scheduling |
6125 | * activity can take place. Using them for anything else would | 6132 | * activity can take place. Using them for anything else would |
6126 | * be a serious bug, and as a result, they aren't even visible | 6133 | * be a serious bug, and as a result, they aren't even visible |
6127 | * under any other configuration. | 6134 | * under any other configuration. |
6128 | */ | 6135 | */ |
6129 | 6136 | ||
6130 | /** | 6137 | /** |
6131 | * curr_task - return the current task for a given cpu. | 6138 | * curr_task - return the current task for a given cpu. |
6132 | * @cpu: the processor in question. | 6139 | * @cpu: the processor in question. |
6133 | * | 6140 | * |
6134 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | 6141 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! |
6135 | */ | 6142 | */ |
6136 | task_t *curr_task(int cpu) | 6143 | task_t *curr_task(int cpu) |
6137 | { | 6144 | { |
6138 | return cpu_curr(cpu); | 6145 | return cpu_curr(cpu); |
6139 | } | 6146 | } |
6140 | 6147 | ||
6141 | /** | 6148 | /** |
6142 | * set_curr_task - set the current task for a given cpu. | 6149 | * set_curr_task - set the current task for a given cpu. |
6143 | * @cpu: the processor in question. | 6150 | * @cpu: the processor in question. |
6144 | * @p: the task pointer to set. | 6151 | * @p: the task pointer to set. |
6145 | * | 6152 | * |
6146 | * Description: This function must only be used when non-maskable interrupts | 6153 | * Description: This function must only be used when non-maskable interrupts |
6147 | * are serviced on a separate stack. It allows the architecture to switch the | 6154 | * are serviced on a separate stack. It allows the architecture to switch the |
6148 | * notion of the current task on a cpu in a non-blocking manner. This function | 6155 | * notion of the current task on a cpu in a non-blocking manner. This function |
6149 | * must be called with all CPUs synchronized, and interrupts disabled, and | 6156 | * must be called with all CPUs synchronized, and interrupts disabled, and |
6150 | * the caller must save the original value of the current task (see | 6157 | * the caller must save the original value of the current task (see |
6151 | * curr_task() above) and restore that value before reenabling interrupts and | 6158 | * curr_task() above) and restore that value before reenabling interrupts and |
6152 | * re-starting the system. | 6159 | * re-starting the system. |
6153 | * | 6160 | * |
6154 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | 6161 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! |
6155 | */ | 6162 | */ |
6156 | void set_curr_task(int cpu, task_t *p) | 6163 | void set_curr_task(int cpu, task_t *p) |
6157 | { | 6164 | { |
6158 | cpu_curr(cpu) = p; | 6165 | cpu_curr(cpu) = p; |
6159 | } | 6166 | } |
6160 | 6167 | ||
6161 | #endif | 6168 | #endif |
6162 | 6169 |